sha256_sse4.cpp (Bitcoin ABC 0.28.12)
1 // Copyright (c) 2017 The Bitcoin Core developers
2 // Distributed under the MIT software license, see the accompanying
3 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
4 //
5 // This is a translation to GCC extended asm syntax from YASM code by Intel
6 // (available at the bottom of this file).
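As a small illustrative aside (not part of this file): the translation mentioned above maps YASM register operands onto numbered GCC extended-asm operands with explicit constraint and clobber lists. The helper below is made up for the example and only mirrors the first two instructions of the real Transform, which turn the block count into an end-of-input address; in the real asm the operand numbers differ because many more operands are bound.

// Hypothetical miniature of the pattern used below: the compiler picks the
// registers, the asm refers to them as %0/%1, and the flags register ("cc")
// is declared clobbered.
static inline uint64_t BlocksToEndOffset(const uint8_t *chunk, uint64_t blocks) {
    __asm__("shl $0x6,%0;" // blocks *= 64: one SHA-256 block is 64 bytes
            "add %1,%0;"   // add the input pointer to form the end-of-data address
            : "+r"(blocks)
            : "r"(chunk)
            : "cc");
    return blocks; // chunk + 64 * (original blocks), as Lloop0 below expects
}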
7 
8 #include <cstdint>
9 #include <cstdlib>
10 
11 #if defined(__x86_64__) || defined(__amd64__)
12 
13 namespace sha256_sse4 {
14 void Transform(uint32_t *s, const uint8_t *chunk, size_t blocks) {
15  static const uint32_t K256 alignas(16)[] = {
16  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
17  0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
18  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
19  0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
20  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
21  0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
22  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
23  0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
24  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
25  0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
26  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
27  };
28  static const uint32_t FLIP_MASK alignas(16)[] = {0x00010203, 0x04050607,
29  0x08090a0b, 0x0c0d0e0f};
30  static const uint32_t SHUF_00BA alignas(16)[] = {0x03020100, 0x0b0a0908,
31  0xffffffff, 0xffffffff};
32  static const uint32_t SHUF_DC00 alignas(16)[] = {0xffffffff, 0xffffffff,
33  0x03020100, 0x0b0a0908};
34  uint32_t a, b, c, d, f, g, h, y0, y1, y2;
35  uint64_t tbl;
36  uint64_t inp_end, inp;
37  uint32_t xfer alignas(16)[4];
38 
39  __asm__ __volatile__(
40  "shl $0x6,%2;"
41  "je Ldone_hash_%=;"
42  "add %1,%2;"
43  "mov %2,%14;"
44  "mov (%0),%3;"
45  "mov 0x4(%0),%4;"
46  "mov 0x8(%0),%5;"
47  "mov 0xc(%0),%6;"
48  "mov 0x10(%0),%k2;"
49  "mov 0x14(%0),%7;"
50  "mov 0x18(%0),%8;"
51  "mov 0x1c(%0),%9;"
52  "movdqa %18,%%xmm12;"
53  "movdqa %19,%%xmm10;"
54  "movdqa %20,%%xmm11;"
55 
56  "Lloop0_%=:"
57  "lea %17,%13;"
58  "movdqu (%1),%%xmm4;"
59  "pshufb %%xmm12,%%xmm4;"
60  "movdqu 0x10(%1),%%xmm5;"
61  "pshufb %%xmm12,%%xmm5;"
62  "movdqu 0x20(%1),%%xmm6;"
63  "pshufb %%xmm12,%%xmm6;"
64  "movdqu 0x30(%1),%%xmm7;"
65  "pshufb %%xmm12,%%xmm7;"
66  "mov %1,%15;"
67  "mov $3,%1;"
68 
69  "Lloop1_%=:"
70  "movdqa 0x0(%13),%%xmm9;"
71  "paddd %%xmm4,%%xmm9;"
72  "movdqa %%xmm9,%16;"
73  "movdqa %%xmm7,%%xmm0;"
74  "mov %k2,%10;"
75  "ror $0xe,%10;"
76  "mov %3,%11;"
77  "palignr $0x4,%%xmm6,%%xmm0;"
78  "ror $0x9,%11;"
79  "xor %k2,%10;"
80  "mov %7,%12;"
81  "ror $0x5,%10;"
82  "movdqa %%xmm5,%%xmm1;"
83  "xor %3,%11;"
84  "xor %8,%12;"
85  "paddd %%xmm4,%%xmm0;"
86  "xor %k2,%10;"
87  "and %k2,%12;"
88  "ror $0xb,%11;"
89  "palignr $0x4,%%xmm4,%%xmm1;"
90  "xor %3,%11;"
91  "ror $0x6,%10;"
92  "xor %8,%12;"
93  "movdqa %%xmm1,%%xmm2;"
94  "ror $0x2,%11;"
95  "add %10,%12;"
96  "add %16,%12;"
97  "movdqa %%xmm1,%%xmm3;"
98  "mov %3,%10;"
99  "add %12,%9;"
100  "mov %3,%12;"
101  "pslld $0x19,%%xmm1;"
102  "or %5,%10;"
103  "add %9,%6;"
104  "and %5,%12;"
105  "psrld $0x7,%%xmm2;"
106  "and %4,%10;"
107  "add %11,%9;"
108  "por %%xmm2,%%xmm1;"
109  "or %12,%10;"
110  "add %10,%9;"
111  "movdqa %%xmm3,%%xmm2;"
112  "mov %6,%10;"
113  "mov %9,%11;"
114  "movdqa %%xmm3,%%xmm8;"
115  "ror $0xe,%10;"
116  "xor %6,%10;"
117  "mov %k2,%12;"
118  "ror $0x9,%11;"
119  "pslld $0xe,%%xmm3;"
120  "xor %9,%11;"
121  "ror $0x5,%10;"
122  "xor %7,%12;"
123  "psrld $0x12,%%xmm2;"
124  "ror $0xb,%11;"
125  "xor %6,%10;"
126  "and %6,%12;"
127  "ror $0x6,%10;"
128  "pxor %%xmm3,%%xmm1;"
129  "xor %9,%11;"
130  "xor %7,%12;"
131  "psrld $0x3,%%xmm8;"
132  "add %10,%12;"
133  "add 4+%16,%12;"
134  "ror $0x2,%11;"
135  "pxor %%xmm2,%%xmm1;"
136  "mov %9,%10;"
137  "add %12,%8;"
138  "mov %9,%12;"
139  "pxor %%xmm8,%%xmm1;"
140  "or %4,%10;"
141  "add %8,%5;"
142  "and %4,%12;"
143  "pshufd $0xfa,%%xmm7,%%xmm2;"
144  "and %3,%10;"
145  "add %11,%8;"
146  "paddd %%xmm1,%%xmm0;"
147  "or %12,%10;"
148  "add %10,%8;"
149  "movdqa %%xmm2,%%xmm3;"
150  "mov %5,%10;"
151  "mov %8,%11;"
152  "ror $0xe,%10;"
153  "movdqa %%xmm2,%%xmm8;"
154  "xor %5,%10;"
155  "ror $0x9,%11;"
156  "mov %6,%12;"
157  "xor %8,%11;"
158  "ror $0x5,%10;"
159  "psrlq $0x11,%%xmm2;"
160  "xor %k2,%12;"
161  "psrlq $0x13,%%xmm3;"
162  "xor %5,%10;"
163  "and %5,%12;"
164  "psrld $0xa,%%xmm8;"
165  "ror $0xb,%11;"
166  "xor %8,%11;"
167  "xor %k2,%12;"
168  "ror $0x6,%10;"
169  "pxor %%xmm3,%%xmm2;"
170  "add %10,%12;"
171  "ror $0x2,%11;"
172  "add 8+%16,%12;"
173  "pxor %%xmm2,%%xmm8;"
174  "mov %8,%10;"
175  "add %12,%7;"
176  "mov %8,%12;"
177  "pshufb %%xmm10,%%xmm8;"
178  "or %3,%10;"
179  "add %7,%4;"
180  "and %3,%12;"
181  "paddd %%xmm8,%%xmm0;"
182  "and %9,%10;"
183  "add %11,%7;"
184  "pshufd $0x50,%%xmm0,%%xmm2;"
185  "or %12,%10;"
186  "add %10,%7;"
187  "movdqa %%xmm2,%%xmm3;"
188  "mov %4,%10;"
189  "ror $0xe,%10;"
190  "mov %7,%11;"
191  "movdqa %%xmm2,%%xmm4;"
192  "ror $0x9,%11;"
193  "xor %4,%10;"
194  "mov %5,%12;"
195  "ror $0x5,%10;"
196  "psrlq $0x11,%%xmm2;"
197  "xor %7,%11;"
198  "xor %6,%12;"
199  "psrlq $0x13,%%xmm3;"
200  "xor %4,%10;"
201  "and %4,%12;"
202  "ror $0xb,%11;"
203  "psrld $0xa,%%xmm4;"
204  "xor %7,%11;"
205  "ror $0x6,%10;"
206  "xor %6,%12;"
207  "pxor %%xmm3,%%xmm2;"
208  "ror $0x2,%11;"
209  "add %10,%12;"
210  "add 12+%16,%12;"
211  "pxor %%xmm2,%%xmm4;"
212  "mov %7,%10;"
213  "add %12,%k2;"
214  "mov %7,%12;"
215  "pshufb %%xmm11,%%xmm4;"
216  "or %9,%10;"
217  "add %k2,%3;"
218  "and %9,%12;"
219  "paddd %%xmm0,%%xmm4;"
220  "and %8,%10;"
221  "add %11,%k2;"
222  "or %12,%10;"
223  "add %10,%k2;"
224  "movdqa 0x10(%13),%%xmm9;"
225  "paddd %%xmm5,%%xmm9;"
226  "movdqa %%xmm9,%16;"
227  "movdqa %%xmm4,%%xmm0;"
228  "mov %3,%10;"
229  "ror $0xe,%10;"
230  "mov %k2,%11;"
231  "palignr $0x4,%%xmm7,%%xmm0;"
232  "ror $0x9,%11;"
233  "xor %3,%10;"
234  "mov %4,%12;"
235  "ror $0x5,%10;"
236  "movdqa %%xmm6,%%xmm1;"
237  "xor %k2,%11;"
238  "xor %5,%12;"
239  "paddd %%xmm5,%%xmm0;"
240  "xor %3,%10;"
241  "and %3,%12;"
242  "ror $0xb,%11;"
243  "palignr $0x4,%%xmm5,%%xmm1;"
244  "xor %k2,%11;"
245  "ror $0x6,%10;"
246  "xor %5,%12;"
247  "movdqa %%xmm1,%%xmm2;"
248  "ror $0x2,%11;"
249  "add %10,%12;"
250  "add %16,%12;"
251  "movdqa %%xmm1,%%xmm3;"
252  "mov %k2,%10;"
253  "add %12,%6;"
254  "mov %k2,%12;"
255  "pslld $0x19,%%xmm1;"
256  "or %8,%10;"
257  "add %6,%9;"
258  "and %8,%12;"
259  "psrld $0x7,%%xmm2;"
260  "and %7,%10;"
261  "add %11,%6;"
262  "por %%xmm2,%%xmm1;"
263  "or %12,%10;"
264  "add %10,%6;"
265  "movdqa %%xmm3,%%xmm2;"
266  "mov %9,%10;"
267  "mov %6,%11;"
268  "movdqa %%xmm3,%%xmm8;"
269  "ror $0xe,%10;"
270  "xor %9,%10;"
271  "mov %3,%12;"
272  "ror $0x9,%11;"
273  "pslld $0xe,%%xmm3;"
274  "xor %6,%11;"
275  "ror $0x5,%10;"
276  "xor %4,%12;"
277  "psrld $0x12,%%xmm2;"
278  "ror $0xb,%11;"
279  "xor %9,%10;"
280  "and %9,%12;"
281  "ror $0x6,%10;"
282  "pxor %%xmm3,%%xmm1;"
283  "xor %6,%11;"
284  "xor %4,%12;"
285  "psrld $0x3,%%xmm8;"
286  "add %10,%12;"
287  "add 4+%16,%12;"
288  "ror $0x2,%11;"
289  "pxor %%xmm2,%%xmm1;"
290  "mov %6,%10;"
291  "add %12,%5;"
292  "mov %6,%12;"
293  "pxor %%xmm8,%%xmm1;"
294  "or %7,%10;"
295  "add %5,%8;"
296  "and %7,%12;"
297  "pshufd $0xfa,%%xmm4,%%xmm2;"
298  "and %k2,%10;"
299  "add %11,%5;"
300  "paddd %%xmm1,%%xmm0;"
301  "or %12,%10;"
302  "add %10,%5;"
303  "movdqa %%xmm2,%%xmm3;"
304  "mov %8,%10;"
305  "mov %5,%11;"
306  "ror $0xe,%10;"
307  "movdqa %%xmm2,%%xmm8;"
308  "xor %8,%10;"
309  "ror $0x9,%11;"
310  "mov %9,%12;"
311  "xor %5,%11;"
312  "ror $0x5,%10;"
313  "psrlq $0x11,%%xmm2;"
314  "xor %3,%12;"
315  "psrlq $0x13,%%xmm3;"
316  "xor %8,%10;"
317  "and %8,%12;"
318  "psrld $0xa,%%xmm8;"
319  "ror $0xb,%11;"
320  "xor %5,%11;"
321  "xor %3,%12;"
322  "ror $0x6,%10;"
323  "pxor %%xmm3,%%xmm2;"
324  "add %10,%12;"
325  "ror $0x2,%11;"
326  "add 8+%16,%12;"
327  "pxor %%xmm2,%%xmm8;"
328  "mov %5,%10;"
329  "add %12,%4;"
330  "mov %5,%12;"
331  "pshufb %%xmm10,%%xmm8;"
332  "or %k2,%10;"
333  "add %4,%7;"
334  "and %k2,%12;"
335  "paddd %%xmm8,%%xmm0;"
336  "and %6,%10;"
337  "add %11,%4;"
338  "pshufd $0x50,%%xmm0,%%xmm2;"
339  "or %12,%10;"
340  "add %10,%4;"
341  "movdqa %%xmm2,%%xmm3;"
342  "mov %7,%10;"
343  "ror $0xe,%10;"
344  "mov %4,%11;"
345  "movdqa %%xmm2,%%xmm5;"
346  "ror $0x9,%11;"
347  "xor %7,%10;"
348  "mov %8,%12;"
349  "ror $0x5,%10;"
350  "psrlq $0x11,%%xmm2;"
351  "xor %4,%11;"
352  "xor %9,%12;"
353  "psrlq $0x13,%%xmm3;"
354  "xor %7,%10;"
355  "and %7,%12;"
356  "ror $0xb,%11;"
357  "psrld $0xa,%%xmm5;"
358  "xor %4,%11;"
359  "ror $0x6,%10;"
360  "xor %9,%12;"
361  "pxor %%xmm3,%%xmm2;"
362  "ror $0x2,%11;"
363  "add %10,%12;"
364  "add 12+%16,%12;"
365  "pxor %%xmm2,%%xmm5;"
366  "mov %4,%10;"
367  "add %12,%3;"
368  "mov %4,%12;"
369  "pshufb %%xmm11,%%xmm5;"
370  "or %6,%10;"
371  "add %3,%k2;"
372  "and %6,%12;"
373  "paddd %%xmm0,%%xmm5;"
374  "and %5,%10;"
375  "add %11,%3;"
376  "or %12,%10;"
377  "add %10,%3;"
378  "movdqa 0x20(%13),%%xmm9;"
379  "paddd %%xmm6,%%xmm9;"
380  "movdqa %%xmm9,%16;"
381  "movdqa %%xmm5,%%xmm0;"
382  "mov %k2,%10;"
383  "ror $0xe,%10;"
384  "mov %3,%11;"
385  "palignr $0x4,%%xmm4,%%xmm0;"
386  "ror $0x9,%11;"
387  "xor %k2,%10;"
388  "mov %7,%12;"
389  "ror $0x5,%10;"
390  "movdqa %%xmm7,%%xmm1;"
391  "xor %3,%11;"
392  "xor %8,%12;"
393  "paddd %%xmm6,%%xmm0;"
394  "xor %k2,%10;"
395  "and %k2,%12;"
396  "ror $0xb,%11;"
397  "palignr $0x4,%%xmm6,%%xmm1;"
398  "xor %3,%11;"
399  "ror $0x6,%10;"
400  "xor %8,%12;"
401  "movdqa %%xmm1,%%xmm2;"
402  "ror $0x2,%11;"
403  "add %10,%12;"
404  "add %16,%12;"
405  "movdqa %%xmm1,%%xmm3;"
406  "mov %3,%10;"
407  "add %12,%9;"
408  "mov %3,%12;"
409  "pslld $0x19,%%xmm1;"
410  "or %5,%10;"
411  "add %9,%6;"
412  "and %5,%12;"
413  "psrld $0x7,%%xmm2;"
414  "and %4,%10;"
415  "add %11,%9;"
416  "por %%xmm2,%%xmm1;"
417  "or %12,%10;"
418  "add %10,%9;"
419  "movdqa %%xmm3,%%xmm2;"
420  "mov %6,%10;"
421  "mov %9,%11;"
422  "movdqa %%xmm3,%%xmm8;"
423  "ror $0xe,%10;"
424  "xor %6,%10;"
425  "mov %k2,%12;"
426  "ror $0x9,%11;"
427  "pslld $0xe,%%xmm3;"
428  "xor %9,%11;"
429  "ror $0x5,%10;"
430  "xor %7,%12;"
431  "psrld $0x12,%%xmm2;"
432  "ror $0xb,%11;"
433  "xor %6,%10;"
434  "and %6,%12;"
435  "ror $0x6,%10;"
436  "pxor %%xmm3,%%xmm1;"
437  "xor %9,%11;"
438  "xor %7,%12;"
439  "psrld $0x3,%%xmm8;"
440  "add %10,%12;"
441  "add 4+%16,%12;"
442  "ror $0x2,%11;"
443  "pxor %%xmm2,%%xmm1;"
444  "mov %9,%10;"
445  "add %12,%8;"
446  "mov %9,%12;"
447  "pxor %%xmm8,%%xmm1;"
448  "or %4,%10;"
449  "add %8,%5;"
450  "and %4,%12;"
451  "pshufd $0xfa,%%xmm5,%%xmm2;"
452  "and %3,%10;"
453  "add %11,%8;"
454  "paddd %%xmm1,%%xmm0;"
455  "or %12,%10;"
456  "add %10,%8;"
457  "movdqa %%xmm2,%%xmm3;"
458  "mov %5,%10;"
459  "mov %8,%11;"
460  "ror $0xe,%10;"
461  "movdqa %%xmm2,%%xmm8;"
462  "xor %5,%10;"
463  "ror $0x9,%11;"
464  "mov %6,%12;"
465  "xor %8,%11;"
466  "ror $0x5,%10;"
467  "psrlq $0x11,%%xmm2;"
468  "xor %k2,%12;"
469  "psrlq $0x13,%%xmm3;"
470  "xor %5,%10;"
471  "and %5,%12;"
472  "psrld $0xa,%%xmm8;"
473  "ror $0xb,%11;"
474  "xor %8,%11;"
475  "xor %k2,%12;"
476  "ror $0x6,%10;"
477  "pxor %%xmm3,%%xmm2;"
478  "add %10,%12;"
479  "ror $0x2,%11;"
480  "add 8+%16,%12;"
481  "pxor %%xmm2,%%xmm8;"
482  "mov %8,%10;"
483  "add %12,%7;"
484  "mov %8,%12;"
485  "pshufb %%xmm10,%%xmm8;"
486  "or %3,%10;"
487  "add %7,%4;"
488  "and %3,%12;"
489  "paddd %%xmm8,%%xmm0;"
490  "and %9,%10;"
491  "add %11,%7;"
492  "pshufd $0x50,%%xmm0,%%xmm2;"
493  "or %12,%10;"
494  "add %10,%7;"
495  "movdqa %%xmm2,%%xmm3;"
496  "mov %4,%10;"
497  "ror $0xe,%10;"
498  "mov %7,%11;"
499  "movdqa %%xmm2,%%xmm6;"
500  "ror $0x9,%11;"
501  "xor %4,%10;"
502  "mov %5,%12;"
503  "ror $0x5,%10;"
504  "psrlq $0x11,%%xmm2;"
505  "xor %7,%11;"
506  "xor %6,%12;"
507  "psrlq $0x13,%%xmm3;"
508  "xor %4,%10;"
509  "and %4,%12;"
510  "ror $0xb,%11;"
511  "psrld $0xa,%%xmm6;"
512  "xor %7,%11;"
513  "ror $0x6,%10;"
514  "xor %6,%12;"
515  "pxor %%xmm3,%%xmm2;"
516  "ror $0x2,%11;"
517  "add %10,%12;"
518  "add 12+%16,%12;"
519  "pxor %%xmm2,%%xmm6;"
520  "mov %7,%10;"
521  "add %12,%k2;"
522  "mov %7,%12;"
523  "pshufb %%xmm11,%%xmm6;"
524  "or %9,%10;"
525  "add %k2,%3;"
526  "and %9,%12;"
527  "paddd %%xmm0,%%xmm6;"
528  "and %8,%10;"
529  "add %11,%k2;"
530  "or %12,%10;"
531  "add %10,%k2;"
532  "movdqa 0x30(%13),%%xmm9;"
533  "paddd %%xmm7,%%xmm9;"
534  "movdqa %%xmm9,%16;"
535  "add $0x40,%13;"
536  "movdqa %%xmm6,%%xmm0;"
537  "mov %3,%10;"
538  "ror $0xe,%10;"
539  "mov %k2,%11;"
540  "palignr $0x4,%%xmm5,%%xmm0;"
541  "ror $0x9,%11;"
542  "xor %3,%10;"
543  "mov %4,%12;"
544  "ror $0x5,%10;"
545  "movdqa %%xmm4,%%xmm1;"
546  "xor %k2,%11;"
547  "xor %5,%12;"
548  "paddd %%xmm7,%%xmm0;"
549  "xor %3,%10;"
550  "and %3,%12;"
551  "ror $0xb,%11;"
552  "palignr $0x4,%%xmm7,%%xmm1;"
553  "xor %k2,%11;"
554  "ror $0x6,%10;"
555  "xor %5,%12;"
556  "movdqa %%xmm1,%%xmm2;"
557  "ror $0x2,%11;"
558  "add %10,%12;"
559  "add %16,%12;"
560  "movdqa %%xmm1,%%xmm3;"
561  "mov %k2,%10;"
562  "add %12,%6;"
563  "mov %k2,%12;"
564  "pslld $0x19,%%xmm1;"
565  "or %8,%10;"
566  "add %6,%9;"
567  "and %8,%12;"
568  "psrld $0x7,%%xmm2;"
569  "and %7,%10;"
570  "add %11,%6;"
571  "por %%xmm2,%%xmm1;"
572  "or %12,%10;"
573  "add %10,%6;"
574  "movdqa %%xmm3,%%xmm2;"
575  "mov %9,%10;"
576  "mov %6,%11;"
577  "movdqa %%xmm3,%%xmm8;"
578  "ror $0xe,%10;"
579  "xor %9,%10;"
580  "mov %3,%12;"
581  "ror $0x9,%11;"
582  "pslld $0xe,%%xmm3;"
583  "xor %6,%11;"
584  "ror $0x5,%10;"
585  "xor %4,%12;"
586  "psrld $0x12,%%xmm2;"
587  "ror $0xb,%11;"
588  "xor %9,%10;"
589  "and %9,%12;"
590  "ror $0x6,%10;"
591  "pxor %%xmm3,%%xmm1;"
592  "xor %6,%11;"
593  "xor %4,%12;"
594  "psrld $0x3,%%xmm8;"
595  "add %10,%12;"
596  "add 4+%16,%12;"
597  "ror $0x2,%11;"
598  "pxor %%xmm2,%%xmm1;"
599  "mov %6,%10;"
600  "add %12,%5;"
601  "mov %6,%12;"
602  "pxor %%xmm8,%%xmm1;"
603  "or %7,%10;"
604  "add %5,%8;"
605  "and %7,%12;"
606  "pshufd $0xfa,%%xmm6,%%xmm2;"
607  "and %k2,%10;"
608  "add %11,%5;"
609  "paddd %%xmm1,%%xmm0;"
610  "or %12,%10;"
611  "add %10,%5;"
612  "movdqa %%xmm2,%%xmm3;"
613  "mov %8,%10;"
614  "mov %5,%11;"
615  "ror $0xe,%10;"
616  "movdqa %%xmm2,%%xmm8;"
617  "xor %8,%10;"
618  "ror $0x9,%11;"
619  "mov %9,%12;"
620  "xor %5,%11;"
621  "ror $0x5,%10;"
622  "psrlq $0x11,%%xmm2;"
623  "xor %3,%12;"
624  "psrlq $0x13,%%xmm3;"
625  "xor %8,%10;"
626  "and %8,%12;"
627  "psrld $0xa,%%xmm8;"
628  "ror $0xb,%11;"
629  "xor %5,%11;"
630  "xor %3,%12;"
631  "ror $0x6,%10;"
632  "pxor %%xmm3,%%xmm2;"
633  "add %10,%12;"
634  "ror $0x2,%11;"
635  "add 8+%16,%12;"
636  "pxor %%xmm2,%%xmm8;"
637  "mov %5,%10;"
638  "add %12,%4;"
639  "mov %5,%12;"
640  "pshufb %%xmm10,%%xmm8;"
641  "or %k2,%10;"
642  "add %4,%7;"
643  "and %k2,%12;"
644  "paddd %%xmm8,%%xmm0;"
645  "and %6,%10;"
646  "add %11,%4;"
647  "pshufd $0x50,%%xmm0,%%xmm2;"
648  "or %12,%10;"
649  "add %10,%4;"
650  "movdqa %%xmm2,%%xmm3;"
651  "mov %7,%10;"
652  "ror $0xe,%10;"
653  "mov %4,%11;"
654  "movdqa %%xmm2,%%xmm7;"
655  "ror $0x9,%11;"
656  "xor %7,%10;"
657  "mov %8,%12;"
658  "ror $0x5,%10;"
659  "psrlq $0x11,%%xmm2;"
660  "xor %4,%11;"
661  "xor %9,%12;"
662  "psrlq $0x13,%%xmm3;"
663  "xor %7,%10;"
664  "and %7,%12;"
665  "ror $0xb,%11;"
666  "psrld $0xa,%%xmm7;"
667  "xor %4,%11;"
668  "ror $0x6,%10;"
669  "xor %9,%12;"
670  "pxor %%xmm3,%%xmm2;"
671  "ror $0x2,%11;"
672  "add %10,%12;"
673  "add 12+%16,%12;"
674  "pxor %%xmm2,%%xmm7;"
675  "mov %4,%10;"
676  "add %12,%3;"
677  "mov %4,%12;"
678  "pshufb %%xmm11,%%xmm7;"
679  "or %6,%10;"
680  "add %3,%k2;"
681  "and %6,%12;"
682  "paddd %%xmm0,%%xmm7;"
683  "and %5,%10;"
684  "add %11,%3;"
685  "or %12,%10;"
686  "add %10,%3;"
687  "sub $0x1,%1;"
688  "jne Lloop1_%=;"
689  "mov $0x2,%1;"
690 
691  "Lloop2_%=:"
692  "paddd 0x0(%13),%%xmm4;"
693  "movdqa %%xmm4,%16;"
694  "mov %k2,%10;"
695  "ror $0xe,%10;"
696  "mov %3,%11;"
697  "xor %k2,%10;"
698  "ror $0x9,%11;"
699  "mov %7,%12;"
700  "xor %3,%11;"
701  "ror $0x5,%10;"
702  "xor %8,%12;"
703  "xor %k2,%10;"
704  "ror $0xb,%11;"
705  "and %k2,%12;"
706  "xor %3,%11;"
707  "ror $0x6,%10;"
708  "xor %8,%12;"
709  "add %10,%12;"
710  "ror $0x2,%11;"
711  "add %16,%12;"
712  "mov %3,%10;"
713  "add %12,%9;"
714  "mov %3,%12;"
715  "or %5,%10;"
716  "add %9,%6;"
717  "and %5,%12;"
718  "and %4,%10;"
719  "add %11,%9;"
720  "or %12,%10;"
721  "add %10,%9;"
722  "mov %6,%10;"
723  "ror $0xe,%10;"
724  "mov %9,%11;"
725  "xor %6,%10;"
726  "ror $0x9,%11;"
727  "mov %k2,%12;"
728  "xor %9,%11;"
729  "ror $0x5,%10;"
730  "xor %7,%12;"
731  "xor %6,%10;"
732  "ror $0xb,%11;"
733  "and %6,%12;"
734  "xor %9,%11;"
735  "ror $0x6,%10;"
736  "xor %7,%12;"
737  "add %10,%12;"
738  "ror $0x2,%11;"
739  "add 4+%16,%12;"
740  "mov %9,%10;"
741  "add %12,%8;"
742  "mov %9,%12;"
743  "or %4,%10;"
744  "add %8,%5;"
745  "and %4,%12;"
746  "and %3,%10;"
747  "add %11,%8;"
748  "or %12,%10;"
749  "add %10,%8;"
750  "mov %5,%10;"
751  "ror $0xe,%10;"
752  "mov %8,%11;"
753  "xor %5,%10;"
754  "ror $0x9,%11;"
755  "mov %6,%12;"
756  "xor %8,%11;"
757  "ror $0x5,%10;"
758  "xor %k2,%12;"
759  "xor %5,%10;"
760  "ror $0xb,%11;"
761  "and %5,%12;"
762  "xor %8,%11;"
763  "ror $0x6,%10;"
764  "xor %k2,%12;"
765  "add %10,%12;"
766  "ror $0x2,%11;"
767  "add 8+%16,%12;"
768  "mov %8,%10;"
769  "add %12,%7;"
770  "mov %8,%12;"
771  "or %3,%10;"
772  "add %7,%4;"
773  "and %3,%12;"
774  "and %9,%10;"
775  "add %11,%7;"
776  "or %12,%10;"
777  "add %10,%7;"
778  "mov %4,%10;"
779  "ror $0xe,%10;"
780  "mov %7,%11;"
781  "xor %4,%10;"
782  "ror $0x9,%11;"
783  "mov %5,%12;"
784  "xor %7,%11;"
785  "ror $0x5,%10;"
786  "xor %6,%12;"
787  "xor %4,%10;"
788  "ror $0xb,%11;"
789  "and %4,%12;"
790  "xor %7,%11;"
791  "ror $0x6,%10;"
792  "xor %6,%12;"
793  "add %10,%12;"
794  "ror $0x2,%11;"
795  "add 12+%16,%12;"
796  "mov %7,%10;"
797  "add %12,%k2;"
798  "mov %7,%12;"
799  "or %9,%10;"
800  "add %k2,%3;"
801  "and %9,%12;"
802  "and %8,%10;"
803  "add %11,%k2;"
804  "or %12,%10;"
805  "add %10,%k2;"
806  "paddd 0x10(%13),%%xmm5;"
807  "movdqa %%xmm5,%16;"
808  "add $0x20,%13;"
809  "mov %3,%10;"
810  "ror $0xe,%10;"
811  "mov %k2,%11;"
812  "xor %3,%10;"
813  "ror $0x9,%11;"
814  "mov %4,%12;"
815  "xor %k2,%11;"
816  "ror $0x5,%10;"
817  "xor %5,%12;"
818  "xor %3,%10;"
819  "ror $0xb,%11;"
820  "and %3,%12;"
821  "xor %k2,%11;"
822  "ror $0x6,%10;"
823  "xor %5,%12;"
824  "add %10,%12;"
825  "ror $0x2,%11;"
826  "add %16,%12;"
827  "mov %k2,%10;"
828  "add %12,%6;"
829  "mov %k2,%12;"
830  "or %8,%10;"
831  "add %6,%9;"
832  "and %8,%12;"
833  "and %7,%10;"
834  "add %11,%6;"
835  "or %12,%10;"
836  "add %10,%6;"
837  "mov %9,%10;"
838  "ror $0xe,%10;"
839  "mov %6,%11;"
840  "xor %9,%10;"
841  "ror $0x9,%11;"
842  "mov %3,%12;"
843  "xor %6,%11;"
844  "ror $0x5,%10;"
845  "xor %4,%12;"
846  "xor %9,%10;"
847  "ror $0xb,%11;"
848  "and %9,%12;"
849  "xor %6,%11;"
850  "ror $0x6,%10;"
851  "xor %4,%12;"
852  "add %10,%12;"
853  "ror $0x2,%11;"
854  "add 4+%16,%12;"
855  "mov %6,%10;"
856  "add %12,%5;"
857  "mov %6,%12;"
858  "or %7,%10;"
859  "add %5,%8;"
860  "and %7,%12;"
861  "and %k2,%10;"
862  "add %11,%5;"
863  "or %12,%10;"
864  "add %10,%5;"
865  "mov %8,%10;"
866  "ror $0xe,%10;"
867  "mov %5,%11;"
868  "xor %8,%10;"
869  "ror $0x9,%11;"
870  "mov %9,%12;"
871  "xor %5,%11;"
872  "ror $0x5,%10;"
873  "xor %3,%12;"
874  "xor %8,%10;"
875  "ror $0xb,%11;"
876  "and %8,%12;"
877  "xor %5,%11;"
878  "ror $0x6,%10;"
879  "xor %3,%12;"
880  "add %10,%12;"
881  "ror $0x2,%11;"
882  "add 8+%16,%12;"
883  "mov %5,%10;"
884  "add %12,%4;"
885  "mov %5,%12;"
886  "or %k2,%10;"
887  "add %4,%7;"
888  "and %k2,%12;"
889  "and %6,%10;"
890  "add %11,%4;"
891  "or %12,%10;"
892  "add %10,%4;"
893  "mov %7,%10;"
894  "ror $0xe,%10;"
895  "mov %4,%11;"
896  "xor %7,%10;"
897  "ror $0x9,%11;"
898  "mov %8,%12;"
899  "xor %4,%11;"
900  "ror $0x5,%10;"
901  "xor %9,%12;"
902  "xor %7,%10;"
903  "ror $0xb,%11;"
904  "and %7,%12;"
905  "xor %4,%11;"
906  "ror $0x6,%10;"
907  "xor %9,%12;"
908  "add %10,%12;"
909  "ror $0x2,%11;"
910  "add 12+%16,%12;"
911  "mov %4,%10;"
912  "add %12,%3;"
913  "mov %4,%12;"
914  "or %6,%10;"
915  "add %3,%k2;"
916  "and %6,%12;"
917  "and %5,%10;"
918  "add %11,%3;"
919  "or %12,%10;"
920  "add %10,%3;"
921  "movdqa %%xmm6,%%xmm4;"
922  "movdqa %%xmm7,%%xmm5;"
923  "sub $0x1,%1;"
924  "jne Lloop2_%=;"
925  "add (%0),%3;"
926  "mov %3,(%0);"
927  "add 0x4(%0),%4;"
928  "mov %4,0x4(%0);"
929  "add 0x8(%0),%5;"
930  "mov %5,0x8(%0);"
931  "add 0xc(%0),%6;"
932  "mov %6,0xc(%0);"
933  "add 0x10(%0),%k2;"
934  "mov %k2,0x10(%0);"
935  "add 0x14(%0),%7;"
936  "mov %7,0x14(%0);"
937  "add 0x18(%0),%8;"
938  "mov %8,0x18(%0);"
939  "add 0x1c(%0),%9;"
940  "mov %9,0x1c(%0);"
941  "mov %15,%1;"
942  "add $0x40,%1;"
943  "cmp %14,%1;"
944  "jne Lloop0_%=;"
945 
946  "Ldone_hash_%=:"
947 
948  : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c),
949  "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0),
950  "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
951  : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
952  : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
953  "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12");
954 }
955 } // namespace sha256_sse4
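For orientation, here is a minimal, hypothetical driver showing how Transform is meant to be called: s points at the eight 32-bit state words, chunk at one or more already-padded 64-byte message blocks, and blocks is the block count; the state is updated in place. The wrapper name and the hard-coded empty-message padding are illustrative assumptions, not code from this file; in Bitcoin ABC the real callers sit in the generic SHA256 code, which performs padding and block splitting before dispatching here.

// Illustrative sketch only (relies on the <cstdint> include above).
static void ExampleOneBlock() {
    // Standard SHA-256 initial state words (FIPS 180-4).
    uint32_t state[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                         0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
    // A single, already-padded 64-byte block: the padding of the empty
    // message (0x80, then zeros, ending with a 64-bit length of 0 bits).
    uint8_t block[64] = {0x80};
    // Compress one block; afterwards state[] holds the words of SHA256("").
    sha256_sse4::Transform(state, block, 1);
}

Each digest word is the big-endian serialization of the corresponding state[] entry; that final serialization, like the padding, is handled by the caller rather than by Transform.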
956 
957 /*
958 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
959 ; Copyright (c) 2012, Intel Corporation
960 ;
961 ; All rights reserved.
962 ;
963 ; Redistribution and use in source and binary forms, with or without
964 ; modification, are permitted provided that the following conditions are
965 ; met:
966 ;
967 ; * Redistributions of source code must retain the above copyright
968 ; notice, this list of conditions and the following disclaimer.
969 ;
970 ; * Redistributions in binary form must reproduce the above copyright
971 ; notice, this list of conditions and the following disclaimer in the
972 ; documentation and/or other materials provided with the
973 ; distribution.
974 ;
975 ; * Neither the name of the Intel Corporation nor the names of its
976 ; contributors may be used to endorse or promote products derived from
977 ; this software without specific prior written permission.
978 ;
979 ;
980 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
981 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
982 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
983 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
984 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
985 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
986 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
987 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
988 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
989 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
990 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
991 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
992 ;
993 ; Example YASM command lines:
994 ; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8
995 sha256_sse4.asm
996 ; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o
997 sha256_sse4.asm
998 ;
999 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1000 ;
1001 ; This code is described in an Intel White-Paper:
1002 ; "Fast SHA-256 Implementations on Intel Architecture Processors"
1003 ;
1004 ; To find it, surf to http://www.intel.com/p/en_US/embedded
1005 ; and search for that title.
1006 ; The paper is expected to be released roughly at the end of April, 2012
1007 ;
1008 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1009 ; This code schedules 1 block at a time, with 4 lanes per block
1010 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1011 
1012 %define MOVDQ movdqu ;; assume buffers not aligned
1013 
1014 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
1015 
1016 ; addm [mem], reg
1017 ; Add reg to mem using reg-mem add and store
1018 %macro addm 2
1019  add %2, %1
1020  mov %1, %2
1021 %endm
1022 
1023 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1024 
1025 ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
1026 ; Load xmm with mem and byte swap each dword
1027 %macro COPY_XMM_AND_BSWAP 3
1028  MOVDQ %1, %2
1029  pshufb %1, %3
1030 %endmacro
1031 
1032 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1033 
1034 %define X0 xmm4
1035 %define X1 xmm5
1036 %define X2 xmm6
1037 %define X3 xmm7
1038 
1039 %define XTMP0 xmm0
1040 %define XTMP1 xmm1
1041 %define XTMP2 xmm2
1042 %define XTMP3 xmm3
1043 %define XTMP4 xmm8
1044 %define XFER xmm9
1045 
1046 %define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
1047 %define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
1048 %define BYTE_FLIP_MASK xmm12
1049 
1050 %ifdef LINUX
1051 %define NUM_BLKS rdx ; 3rd arg
1052 %define CTX rsi ; 2nd arg
1053 %define INP rdi ; 1st arg
1054 
1055 %define SRND rdi ; clobbers INP
1056 %define c ecx
1057 %define d r8d
1058 %define e edx
1059 %else
1060 %define NUM_BLKS r8 ; 3rd arg
1061 %define CTX rdx ; 2nd arg
1062 %define INP rcx ; 1st arg
1063 
1064 %define SRND rcx ; clobbers INP
1065 %define c edi
1066 %define d esi
1067 %define e r8d
1068 
1069 %endif
1070 %define TBL rbp
1071 %define a eax
1072 %define b ebx
1073 
1074 %define f r9d
1075 %define g r10d
1076 %define h r11d
1077 
1078 %define y0 r13d
1079 %define y1 r14d
1080 %define y2 r15d
1081 
1082 
1083 
1084 _INP_END_SIZE equ 8
1085 _INP_SIZE equ 8
1086 _XFER_SIZE equ 8
1087 %ifdef LINUX
1088 _XMM_SAVE_SIZE equ 0
1089 %else
1090 _XMM_SAVE_SIZE equ 7*16
1091 %endif
1092 ; STACK_SIZE plus pushes must be an odd multiple of 8
1093 _ALIGN_SIZE equ 8
1094 
1095 _INP_END equ 0
1096 _INP equ _INP_END + _INP_END_SIZE
1097 _XFER equ _INP + _INP_SIZE
1098 _XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
1099 STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
1100 
1101 ; rotate_Xs
1102 ; Rotate values of symbols X0...X3
1103 %macro rotate_Xs 0
1104 %xdefine X_ X0
1105 %xdefine X0 X1
1106 %xdefine X1 X2
1107 %xdefine X2 X3
1108 %xdefine X3 X_
1109 %endm
1110 
1111 ; ROTATE_ARGS
1112 ; Rotate values of symbols a...h
1113 %macro ROTATE_ARGS 0
1114 %xdefine TMP_ h
1115 %xdefine h g
1116 %xdefine g f
1117 %xdefine f e
1118 %xdefine e d
1119 %xdefine d c
1120 %xdefine c b
1121 %xdefine b a
1122 %xdefine a TMP_
1123 %endm
1124 
1125 %macro FOUR_ROUNDS_AND_SCHED 0
1126  ;; compute s0 four at a time and s1 two at a time
1127  ;; compute W[-16] + W[-7] 4 at a time
1128  movdqa XTMP0, X3
1129  mov y0, e ; y0 = e
1130  ror y0, (25-11) ; y0 = e >> (25-11)
1131  mov y1, a ; y1 = a
1132  palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
1133  ror y1, (22-13) ; y1 = a >> (22-13)
1134  xor y0, e ; y0 = e ^ (e >> (25-11))
1135  mov y2, f ; y2 = f
1136  ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1137  movdqa XTMP1, X1
1138  xor y1, a ; y1 = a ^ (a >> (22-13)
1139  xor y2, g ; y2 = f^g
1140  paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
1141  xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1142  and y2, e ; y2 = (f^g)&e
1143  ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1144  ;; compute s0
1145  palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
1146  xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1147  ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1148  xor y2, g ; y2 = CH = ((f^g)&e)^g
1149  movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
1150  ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1151  add y2, y0 ; y2 = S1 + CH
1152  add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
1153  movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
1154  mov y0, a ; y0 = a
1155  add h, y2 ; h = h + S1 + CH + k + w
1156  mov y2, a ; y2 = a
1157  pslld XTMP1, (32-7)
1158  or y0, c ; y0 = a|c
1159  add d, h ; d = d + h + S1 + CH + k + w
1160  and y2, c ; y2 = a&c
1161  psrld XTMP2, 7
1162  and y0, b ; y0 = (a|c)&b
1163  add h, y1 ; h = h + S1 + CH + k + w + S0
1164  por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
1165  or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1166  add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1167 
1168 ROTATE_ARGS
1169  movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
1170  mov y0, e ; y0 = e
1171  mov y1, a ; y1 = a
1172  movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
1173  ror y0, (25-11) ; y0 = e >> (25-11)
1174  xor y0, e ; y0 = e ^ (e >> (25-11))
1175  mov y2, f ; y2 = f
1176  ror y1, (22-13) ; y1 = a >> (22-13)
1177  pslld XTMP3, (32-18)
1178  xor y1, a ; y1 = a ^ (a >> (22-13)
1179  ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1180  xor y2, g ; y2 = f^g
1181  psrld XTMP2, 18
1182  ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1183  xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1184  and y2, e ; y2 = (f^g)&e
1185  ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1186  pxor XTMP1, XTMP3
1187  xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1188  xor y2, g ; y2 = CH = ((f^g)&e)^g
1189  psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
1190  add y2, y0 ; y2 = S1 + CH
1191  add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
1192  ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1193  pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
1194  mov y0, a ; y0 = a
1195  add h, y2 ; h = h + S1 + CH + k + w
1196  mov y2, a ; y2 = a
1197  pxor XTMP1, XTMP4 ; XTMP1 = s0
1198  or y0, c ; y0 = a|c
1199  add d, h ; d = d + h + S1 + CH + k + w
1200  and y2, c ; y2 = a&c
1201  ;; compute low s1
1202  pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
1203  and y0, b ; y0 = (a|c)&b
1204  add h, y1 ; h = h + S1 + CH + k + w + S0
1205  paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
1206  or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1207  add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1208 
1209 ROTATE_ARGS
1210  movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
1211  mov y0, e ; y0 = e
1212  mov y1, a ; y1 = a
1213  ror y0, (25-11) ; y0 = e >> (25-11)
1214  movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
1215  xor y0, e ; y0 = e ^ (e >> (25-11))
1216  ror y1, (22-13) ; y1 = a >> (22-13)
1217  mov y2, f ; y2 = f
1218  xor y1, a ; y1 = a ^ (a >> (22-13)
1219  ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1220  psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
1221  xor y2, g ; y2 = f^g
1222  psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
1223  xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1224  and y2, e ; y2 = (f^g)&e
1225  psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
1226  ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1227  xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1228  xor y2, g ; y2 = CH = ((f^g)&e)^g
1229  ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1230  pxor XTMP2, XTMP3
1231  add y2, y0 ; y2 = S1 + CH
1232  ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1233  add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
1234  pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
1235  mov y0, a ; y0 = a
1236  add h, y2 ; h = h + S1 + CH + k + w
1237  mov y2, a ; y2 = a
1238  pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
1239  or y0, c ; y0 = a|c
1240  add d, h ; d = d + h + S1 + CH + k + w
1241  and y2, c ; y2 = a&c
1242  paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
1243  and y0, b ; y0 = (a|c)&b
1244  add h, y1 ; h = h + S1 + CH + k + w + S0
1245  ;; compute high s1
1246  pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
1247  or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1248  add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1249 
1250 ROTATE_ARGS
1251  movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
1252  mov y0, e ; y0 = e
1253  ror y0, (25-11) ; y0 = e >> (25-11)
1254  mov y1, a ; y1 = a
1255  movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
1256  ror y1, (22-13) ; y1 = a >> (22-13)
1257  xor y0, e ; y0 = e ^ (e >> (25-11))
1258  mov y2, f ; y2 = f
1259  ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1260  psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
1261  xor y1, a ; y1 = a ^ (a >> (22-13)
1262  xor y2, g ; y2 = f^g
1263  psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
1264  xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1265  and y2, e ; y2 = (f^g)&e
1266  ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1267  psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
1268  xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1269  ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1270  xor y2, g ; y2 = CH = ((f^g)&e)^g
1271  pxor XTMP2, XTMP3
1272  ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1273  add y2, y0 ; y2 = S1 + CH
1274  add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
1275  pxor X0, XTMP2 ; X0 = s1 {xDxC}
1276  mov y0, a ; y0 = a
1277  add h, y2 ; h = h + S1 + CH + k + w
1278  mov y2, a ; y2 = a
1279  pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
1280  or y0, c ; y0 = a|c
1281  add d, h ; d = d + h + S1 + CH + k + w
1282  and y2, c ; y2 = a&c
1283  paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
1284  and y0, b ; y0 = (a|c)&b
1285  add h, y1 ; h = h + S1 + CH + k + w + S0
1286  or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1287  add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1288 
1289 ROTATE_ARGS
1290 rotate_Xs
1291 %endm
1292 
1293 ;; input is [rsp + _XFER + %1 * 4]
1294 %macro DO_ROUND 1
1295  mov y0, e ; y0 = e
1296  ror y0, (25-11) ; y0 = e >> (25-11)
1297  mov y1, a ; y1 = a
1298  xor y0, e ; y0 = e ^ (e >> (25-11))
1299  ror y1, (22-13) ; y1 = a >> (22-13)
1300  mov y2, f ; y2 = f
1301  xor y1, a ; y1 = a ^ (a >> (22-13)
1302  ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1303  xor y2, g ; y2 = f^g
1304  xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1305  ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1306  and y2, e ; y2 = (f^g)&e
1307  xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1308  ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1309  xor y2, g ; y2 = CH = ((f^g)&e)^g
1310  add y2, y0 ; y2 = S1 + CH
1311  ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1312  add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
1313  mov y0, a ; y0 = a
1314  add h, y2 ; h = h + S1 + CH + k + w
1315  mov y2, a ; y2 = a
1316  or y0, c ; y0 = a|c
1317  add d, h ; d = d + h + S1 + CH + k + w
1318  and y2, c ; y2 = a&c
1319  and y0, b ; y0 = (a|c)&b
1320  add h, y1 ; h = h + S1 + CH + k + w + S0
1321  or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1322  add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1323  ROTATE_ARGS
1324 %endm
1325 
1326 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1327 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1328 ;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
1329 ;; arg 1 : pointer to input data
1330 ;; arg 2 : pointer to digest
1331 ;; arg 3 : Num blocks
1332 section .text
1333 global sha256_sse4
1334 align 32
1335 sha256_sse4:
1336  push rbx
1337 %ifndef LINUX
1338  push rsi
1339  push rdi
1340 %endif
1341  push rbp
1342  push r13
1343  push r14
1344  push r15
1345 
1346  sub rsp,STACK_SIZE
1347 %ifndef LINUX
1348  movdqa [rsp + _XMM_SAVE + 0*16],xmm6
1349  movdqa [rsp + _XMM_SAVE + 1*16],xmm7
1350  movdqa [rsp + _XMM_SAVE + 2*16],xmm8
1351  movdqa [rsp + _XMM_SAVE + 3*16],xmm9
1352  movdqa [rsp + _XMM_SAVE + 4*16],xmm10
1353  movdqa [rsp + _XMM_SAVE + 5*16],xmm11
1354  movdqa [rsp + _XMM_SAVE + 6*16],xmm12
1355 %endif
1356 
1357  shl NUM_BLKS, 6 ; convert to bytes
1358  jz done_hash
1359  add NUM_BLKS, INP ; pointer to end of data
1360  mov [rsp + _INP_END], NUM_BLKS
1361 
1362  ;; load initial digest
1363  mov a,[4*0 + CTX]
1364  mov b,[4*1 + CTX]
1365  mov c,[4*2 + CTX]
1366  mov d,[4*3 + CTX]
1367  mov e,[4*4 + CTX]
1368  mov f,[4*5 + CTX]
1369  mov g,[4*6 + CTX]
1370  mov h,[4*7 + CTX]
1371 
1372  movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1373  movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
1374  movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
1375 
1376 loop0:
1377  lea TBL,[K256 wrt rip]
1378 
1379  ;; byte swap first 16 dwords
1380  COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
1381  COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
1382  COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
1383  COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
1384 
1385  mov [rsp + _INP], INP
1386 
1387  ;; schedule 48 input dwords, by doing 3 rounds of 16 each
1388  mov SRND, 3
1389 align 16
1390 loop1:
1391  movdqa XFER, [TBL + 0*16]
1392  paddd XFER, X0
1393  movdqa [rsp + _XFER], XFER
1394  FOUR_ROUNDS_AND_SCHED
1395 
1396  movdqa XFER, [TBL + 1*16]
1397  paddd XFER, X0
1398  movdqa [rsp + _XFER], XFER
1399  FOUR_ROUNDS_AND_SCHED
1400 
1401  movdqa XFER, [TBL + 2*16]
1402  paddd XFER, X0
1403  movdqa [rsp + _XFER], XFER
1404  FOUR_ROUNDS_AND_SCHED
1405 
1406  movdqa XFER, [TBL + 3*16]
1407  paddd XFER, X0
1408  movdqa [rsp + _XFER], XFER
1409  add TBL, 4*16
1410  FOUR_ROUNDS_AND_SCHED
1411 
1412  sub SRND, 1
1413  jne loop1
1414 
1415  mov SRND, 2
1416 loop2:
1417  paddd X0, [TBL + 0*16]
1418  movdqa [rsp + _XFER], X0
1419  DO_ROUND 0
1420  DO_ROUND 1
1421  DO_ROUND 2
1422  DO_ROUND 3
1423  paddd X1, [TBL + 1*16]
1424  movdqa [rsp + _XFER], X1
1425  add TBL, 2*16
1426  DO_ROUND 0
1427  DO_ROUND 1
1428  DO_ROUND 2
1429  DO_ROUND 3
1430 
1431  movdqa X0, X2
1432  movdqa X1, X3
1433 
1434  sub SRND, 1
1435  jne loop2
1436 
1437  addm [4*0 + CTX],a
1438  addm [4*1 + CTX],b
1439  addm [4*2 + CTX],c
1440  addm [4*3 + CTX],d
1441  addm [4*4 + CTX],e
1442  addm [4*5 + CTX],f
1443  addm [4*6 + CTX],g
1444  addm [4*7 + CTX],h
1445 
1446  mov INP, [rsp + _INP]
1447  add INP, 64
1448  cmp INP, [rsp + _INP_END]
1449  jne loop0
1450 
1451 done_hash:
1452 %ifndef LINUX
1453  movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
1454  movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
1455  movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
1456  movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
1457  movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
1458  movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
1459  movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
1460 %endif
1461 
1462  add rsp, STACK_SIZE
1463 
1464  pop r15
1465  pop r14
1466  pop r13
1467  pop rbp
1468 %ifndef LINUX
1469  pop rdi
1470  pop rsi
1471 %endif
1472  pop rbx
1473 
1474  ret
1475 
1476 
1477 section .data
1478 align 64
1479 K256:
1480  dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1481  dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1482  dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1483  dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1484  dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1485  dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1486  dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1487  dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1488  dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1489  dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1490  dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1491  dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1492  dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1493  dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1494  dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1495  dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1496 
1497 PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
1498 
1499 ; shuffle xBxA -> 00BA
1500 _SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
1501 
1502 ; shuffle xDxC -> DC00
1503 _SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
1504 */
1505 
1506 #endif