#if defined(__x86_64__) || defined(__amd64__)

namespace sha256_sse4 {
void Transform(uint32_t *s, const uint8_t *chunk, size_t blocks) {
    static const uint32_t K256 alignas(16)[] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
        0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
        0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
        0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
        0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
        0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};
    static const uint32_t FLIP_MASK alignas(16)[] = {0x00010203, 0x04050607,
                                                      0x08090a0b, 0x0c0d0e0f};
    static const uint32_t SHUF_00BA alignas(16)[] = {0x03020100, 0x0b0a0908,
                                                      0xffffffff, 0xffffffff};
    static const uint32_t SHUF_DC00 alignas(16)[] = {0xffffffff, 0xffffffff,
                                                      0x03020100, 0x0b0a0908};
    uint32_t a, b, c, d, f, g, h, y0, y1, y2;
    uint64_t tbl;
    uint64_t inp_end, inp;
    uint32_t xfer alignas(16)[4];
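    // Operand mapping for the inline assembly below, as given by the
    // constraint lists: %0 = s, %1 = chunk, %2 = blocks, %3..%12 = the scalar
    // working variables, %13 = tbl (used as the pointer to the K256
    // round-constant table), %14/%15 = inp_end/inp, %16 = xfer, and the
    // memory inputs %17..%20 are K256, FLIP_MASK, SHUF_00BA and SHUF_DC00.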
59 "pshufb %%xmm12,%%xmm4;" 60 "movdqu 0x10(%1),%%xmm5;" 61 "pshufb %%xmm12,%%xmm5;" 62 "movdqu 0x20(%1),%%xmm6;" 63 "pshufb %%xmm12,%%xmm6;" 64 "movdqu 0x30(%1),%%xmm7;" 65 "pshufb %%xmm12,%%xmm7;" 70 "movdqa 0x0(%13),%%xmm9;" 71 "paddd %%xmm4,%%xmm9;" 73 "movdqa %%xmm7,%%xmm0;" 77 "palignr $0x4,%%xmm6,%%xmm0;" 82 "movdqa %%xmm5,%%xmm1;" 85 "paddd %%xmm4,%%xmm0;" 89 "palignr $0x4,%%xmm4,%%xmm1;" 93 "movdqa %%xmm1,%%xmm2;" 97 "movdqa %%xmm1,%%xmm3;" 101 "pslld $0x19,%%xmm1;" 111 "movdqa %%xmm3,%%xmm2;" 114 "movdqa %%xmm3,%%xmm8;" 123 "psrld $0x12,%%xmm2;" 128 "pxor %%xmm3,%%xmm1;" 135 "pxor %%xmm2,%%xmm1;" 139 "pxor %%xmm8,%%xmm1;" 143 "pshufd $0xfa,%%xmm7,%%xmm2;" 146 "paddd %%xmm1,%%xmm0;" 149 "movdqa %%xmm2,%%xmm3;" 153 "movdqa %%xmm2,%%xmm8;" 159 "psrlq $0x11,%%xmm2;" 161 "psrlq $0x13,%%xmm3;" 169 "pxor %%xmm3,%%xmm2;" 173 "pxor %%xmm2,%%xmm8;" 177 "pshufb %%xmm10,%%xmm8;" 181 "paddd %%xmm8,%%xmm0;" 184 "pshufd $0x50,%%xmm0,%%xmm2;" 187 "movdqa %%xmm2,%%xmm3;" 191 "movdqa %%xmm2,%%xmm4;" 196 "psrlq $0x11,%%xmm2;" 199 "psrlq $0x13,%%xmm3;" 207 "pxor %%xmm3,%%xmm2;" 211 "pxor %%xmm2,%%xmm4;" 215 "pshufb %%xmm11,%%xmm4;" 219 "paddd %%xmm0,%%xmm4;" 224 "movdqa 0x10(%13),%%xmm9;" 225 "paddd %%xmm5,%%xmm9;" 227 "movdqa %%xmm4,%%xmm0;" 231 "palignr $0x4,%%xmm7,%%xmm0;" 236 "movdqa %%xmm6,%%xmm1;" 239 "paddd %%xmm5,%%xmm0;" 243 "palignr $0x4,%%xmm5,%%xmm1;" 247 "movdqa %%xmm1,%%xmm2;" 251 "movdqa %%xmm1,%%xmm3;" 255 "pslld $0x19,%%xmm1;" 265 "movdqa %%xmm3,%%xmm2;" 268 "movdqa %%xmm3,%%xmm8;" 277 "psrld $0x12,%%xmm2;" 282 "pxor %%xmm3,%%xmm1;" 289 "pxor %%xmm2,%%xmm1;" 293 "pxor %%xmm8,%%xmm1;" 297 "pshufd $0xfa,%%xmm4,%%xmm2;" 300 "paddd %%xmm1,%%xmm0;" 303 "movdqa %%xmm2,%%xmm3;" 307 "movdqa %%xmm2,%%xmm8;" 313 "psrlq $0x11,%%xmm2;" 315 "psrlq $0x13,%%xmm3;" 323 "pxor %%xmm3,%%xmm2;" 327 "pxor %%xmm2,%%xmm8;" 331 "pshufb %%xmm10,%%xmm8;" 335 "paddd %%xmm8,%%xmm0;" 338 "pshufd $0x50,%%xmm0,%%xmm2;" 341 "movdqa %%xmm2,%%xmm3;" 345 "movdqa %%xmm2,%%xmm5;" 350 "psrlq $0x11,%%xmm2;" 353 "psrlq $0x13,%%xmm3;" 361 "pxor %%xmm3,%%xmm2;" 365 "pxor %%xmm2,%%xmm5;" 369 "pshufb %%xmm11,%%xmm5;" 373 "paddd %%xmm0,%%xmm5;" 378 "movdqa 0x20(%13),%%xmm9;" 379 "paddd %%xmm6,%%xmm9;" 381 "movdqa %%xmm5,%%xmm0;" 385 "palignr $0x4,%%xmm4,%%xmm0;" 390 "movdqa %%xmm7,%%xmm1;" 393 "paddd %%xmm6,%%xmm0;" 397 "palignr $0x4,%%xmm6,%%xmm1;" 401 "movdqa %%xmm1,%%xmm2;" 405 "movdqa %%xmm1,%%xmm3;" 409 "pslld $0x19,%%xmm1;" 419 "movdqa %%xmm3,%%xmm2;" 422 "movdqa %%xmm3,%%xmm8;" 431 "psrld $0x12,%%xmm2;" 436 "pxor %%xmm3,%%xmm1;" 443 "pxor %%xmm2,%%xmm1;" 447 "pxor %%xmm8,%%xmm1;" 451 "pshufd $0xfa,%%xmm5,%%xmm2;" 454 "paddd %%xmm1,%%xmm0;" 457 "movdqa %%xmm2,%%xmm3;" 461 "movdqa %%xmm2,%%xmm8;" 467 "psrlq $0x11,%%xmm2;" 469 "psrlq $0x13,%%xmm3;" 477 "pxor %%xmm3,%%xmm2;" 481 "pxor %%xmm2,%%xmm8;" 485 "pshufb %%xmm10,%%xmm8;" 489 "paddd %%xmm8,%%xmm0;" 492 "pshufd $0x50,%%xmm0,%%xmm2;" 495 "movdqa %%xmm2,%%xmm3;" 499 "movdqa %%xmm2,%%xmm6;" 504 "psrlq $0x11,%%xmm2;" 507 "psrlq $0x13,%%xmm3;" 515 "pxor %%xmm3,%%xmm2;" 519 "pxor %%xmm2,%%xmm6;" 523 "pshufb %%xmm11,%%xmm6;" 527 "paddd %%xmm0,%%xmm6;" 532 "movdqa 0x30(%13),%%xmm9;" 533 "paddd %%xmm7,%%xmm9;" 536 "movdqa %%xmm6,%%xmm0;" 540 "palignr $0x4,%%xmm5,%%xmm0;" 545 "movdqa %%xmm4,%%xmm1;" 548 "paddd %%xmm7,%%xmm0;" 552 "palignr $0x4,%%xmm7,%%xmm1;" 556 "movdqa %%xmm1,%%xmm2;" 560 "movdqa %%xmm1,%%xmm3;" 564 "pslld $0x19,%%xmm1;" 574 "movdqa %%xmm3,%%xmm2;" 577 "movdqa %%xmm3,%%xmm8;" 586 "psrld $0x12,%%xmm2;" 591 "pxor %%xmm3,%%xmm1;" 598 "pxor %%xmm2,%%xmm1;" 602 
"pxor %%xmm8,%%xmm1;" 606 "pshufd $0xfa,%%xmm6,%%xmm2;" 609 "paddd %%xmm1,%%xmm0;" 612 "movdqa %%xmm2,%%xmm3;" 616 "movdqa %%xmm2,%%xmm8;" 622 "psrlq $0x11,%%xmm2;" 624 "psrlq $0x13,%%xmm3;" 632 "pxor %%xmm3,%%xmm2;" 636 "pxor %%xmm2,%%xmm8;" 640 "pshufb %%xmm10,%%xmm8;" 644 "paddd %%xmm8,%%xmm0;" 647 "pshufd $0x50,%%xmm0,%%xmm2;" 650 "movdqa %%xmm2,%%xmm3;" 654 "movdqa %%xmm2,%%xmm7;" 659 "psrlq $0x11,%%xmm2;" 662 "psrlq $0x13,%%xmm3;" 670 "pxor %%xmm3,%%xmm2;" 674 "pxor %%xmm2,%%xmm7;" 678 "pshufb %%xmm11,%%xmm7;" 682 "paddd %%xmm0,%%xmm7;" 692 "paddd 0x0(%13),%%xmm4;" 806 "paddd 0x10(%13),%%xmm5;" 921 "movdqa %%xmm6,%%xmm4;" 922 "movdqa %%xmm7,%%xmm5;" 948 :
"+r"(s),
"+r"(chunk),
"+r"(blocks),
"=r"(a),
"=r"(b),
"=r"(c),
949 "=r"(d),
"=r"(f),
"=r"(g),
"=r"(h),
"=r"(y0),
950 "=r"(y1),
"=r"(y2),
"=r"(tbl),
"+m"(inp_end),
"+m"(inp),
"+m"(xfer)
951 :
"m"(K256),
"m"(FLIP_MASK),
"m"(SHUF_00BA),
"m"(SHUF_DC00)
952 :
"cc",
"memory",
"xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
953 "xmm6",
"xmm7",
"xmm8",
"xmm9",
"xmm10",
"xmm11",
"xmm12");
}
}

#endif
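// Minimal usage sketch (illustrative only; the state array and block buffer
// below are assumptions, not part of this file). Transform expects `s` to
// hold the eight 32-bit SHA-256 chaining values and `chunk` to point at
// `blocks` consecutive, already-padded 64-byte message blocks; the caller is
// also responsible for verifying that the CPU supports the instruction set
// used here. The initial values shown are the standard SHA-256 IV.
//
//     uint32_t state[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
//                          0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
//     uint8_t block[64] = {0};  // one padded 64-byte message block
//     sha256_sse4::Transform(state, block, 1);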