diff options
| author | Denys Vlasenko <vda.linux@googlemail.com> | 2022-02-07 02:06:18 +0100 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2022-02-07 02:34:04 +0100 |
| commit | c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb (patch) | |
| tree | b75c366622b3146a4fdd3f7739b6eaf9d3bc1ac9 /libbb | |
| parent | 987be932ed3cbea56b68bbe85649191c13b66015 (diff) | |
| download | busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.tar.gz busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.tar.bz2 busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.zip | |
libbb/sha1: shrink and speed up unrolled x86-64 code
```
function                                             old     new   delta
sha1_process_block64                                3514    3482     -32
```
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb')
| -rw-r--r-- | libbb/hash_md5_sha256_x86-32_shaNI.S | 8 | ||||
| -rw-r--r-- | libbb/hash_md5_sha256_x86-64_shaNI.S | 8 | ||||
| -rw-r--r-- | libbb/hash_md5_sha_x86-32_shaNI.S | 4 | ||||
| -rw-r--r-- | libbb/hash_md5_sha_x86-64.S | 144 | ||||
| -rwxr-xr-x | libbb/hash_md5_sha_x86-64.S.sh | 9 | ||||
| -rw-r--r-- | libbb/hash_md5_sha_x86-64_shaNI.S | 4 |
6 files changed, 131 insertions, 46 deletions
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 417da37d8..39e2baf41 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S | |||
| @@ -257,8 +257,8 @@ sha256_process_block64_shaNI: | |||
| 257 | ret | 257 | ret |
| 258 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI | 258 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI |
| 259 | 259 | ||
| 260 | .section .rodata.cst256.K256, "aM", @progbits, 256 | 260 | .section .rodata.cst256.K256, "aM", @progbits, 256 |
| 261 | .balign 16 | 261 | .balign 16 |
| 262 | K256: | 262 | K256: |
| 263 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | 263 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 |
| 264 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | 264 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 |
| @@ -277,8 +277,8 @@ K256: | |||
| 277 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | 277 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 |
| 278 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | 278 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 |
| 279 | 279 | ||
| 280 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 | 280 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 |
| 281 | .balign 16 | 281 | .balign 16 |
| 282 | PSHUFFLE_BSWAP32_FLIP_MASK: | 282 | PSHUFFLE_BSWAP32_FLIP_MASK: |
| 283 | .octa 0x0c0d0e0f08090a0b0405060700010203 | 283 | .octa 0x0c0d0e0f08090a0b0405060700010203 |
| 284 | 284 | ||
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index dbf391135..c6c931341 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S | |||
| @@ -253,8 +253,8 @@ sha256_process_block64_shaNI: | |||
| 253 | ret | 253 | ret |
| 254 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI | 254 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI |
| 255 | 255 | ||
| 256 | .section .rodata.cst256.K256, "aM", @progbits, 256 | 256 | .section .rodata.cst256.K256, "aM", @progbits, 256 |
| 257 | .balign 16 | 257 | .balign 16 |
| 258 | K256: | 258 | K256: |
| 259 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | 259 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 |
| 260 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | 260 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 |
| @@ -273,8 +273,8 @@ K256: | |||
| 273 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | 273 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 |
| 274 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | 274 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 |
| 275 | 275 | ||
| 276 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 | 276 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 |
| 277 | .balign 16 | 277 | .balign 16 |
| 278 | PSHUFFLE_BSWAP32_FLIP_MASK: | 278 | PSHUFFLE_BSWAP32_FLIP_MASK: |
| 279 | .octa 0x0c0d0e0f08090a0b0405060700010203 | 279 | .octa 0x0c0d0e0f08090a0b0405060700010203 |
| 280 | 280 | ||
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index 11b855e26..5d082ebfb 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S | |||
| @@ -223,8 +223,8 @@ sha1_process_block64_shaNI: | |||
| 223 | ret | 223 | ret |
| 224 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | 224 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI |
| 225 | 225 | ||
| 226 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | 226 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 |
| 227 | .balign 16 | 227 | .balign 16 |
| 228 | PSHUFFLE_BYTE_FLIP_MASK: | 228 | PSHUFFLE_BYTE_FLIP_MASK: |
| 229 | .octa 0x000102030405060708090a0b0c0d0e0f | 229 | .octa 0x000102030405060708090a0b0c0d0e0f |
| 230 | 230 | ||
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 47ace60de..e26c46f25 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S | |||
| @@ -180,8 +180,13 @@ sha1_process_block64: | |||
| 180 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | 180 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) |
| 181 | movaps %xmm3, %xmm4 | 181 | movaps %xmm3, %xmm4 |
| 182 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 182 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 183 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 183 | # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 184 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 184 | # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 185 | # same result as above, but shorter and faster: | ||
| 186 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 187 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 188 | movaps %xmm0, %xmm5 | ||
| 189 | shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 185 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 190 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 186 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 191 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 187 | xorps %xmm5, %xmm0 # ^ | 192 | xorps %xmm5, %xmm0 # ^ |
| @@ -252,8 +257,13 @@ sha1_process_block64: | |||
| 252 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | 257 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) |
| 253 | movaps %xmm0, %xmm4 | 258 | movaps %xmm0, %xmm4 |
| 254 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 259 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 255 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 260 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 256 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 261 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 262 | # same result as above, but shorter and faster: | ||
| 263 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 264 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 265 | movaps %xmm1, %xmm5 | ||
| 266 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 257 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 267 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 258 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 268 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 259 | xorps %xmm5, %xmm1 # ^ | 269 | xorps %xmm5, %xmm1 # ^ |
| @@ -323,8 +333,13 @@ sha1_process_block64: | |||
| 323 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | 333 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) |
| 324 | movaps %xmm1, %xmm4 | 334 | movaps %xmm1, %xmm4 |
| 325 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 335 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 326 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 336 | # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 327 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 337 | # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 338 | # same result as above, but shorter and faster: | ||
| 339 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 340 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 341 | movaps %xmm2, %xmm5 | ||
| 342 | shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 328 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 343 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 329 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 344 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 330 | xorps %xmm5, %xmm2 # ^ | 345 | xorps %xmm5, %xmm2 # ^ |
| @@ -392,8 +407,13 @@ sha1_process_block64: | |||
| 392 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | 407 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) |
| 393 | movaps %xmm2, %xmm4 | 408 | movaps %xmm2, %xmm4 |
| 394 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 409 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 395 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 410 | # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 396 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 411 | # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 412 | # same result as above, but shorter and faster: | ||
| 413 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 414 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 415 | movaps %xmm3, %xmm5 | ||
| 416 | shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 397 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 417 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 398 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 418 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 399 | xorps %xmm5, %xmm3 # ^ | 419 | xorps %xmm5, %xmm3 # ^ |
| @@ -457,8 +477,13 @@ sha1_process_block64: | |||
| 457 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | 477 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) |
| 458 | movaps %xmm3, %xmm4 | 478 | movaps %xmm3, %xmm4 |
| 459 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 479 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 460 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 480 | # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 461 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 481 | # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 482 | # same result as above, but shorter and faster: | ||
| 483 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 484 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 485 | movaps %xmm0, %xmm5 | ||
| 486 | shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 462 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 487 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 463 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 488 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 464 | xorps %xmm5, %xmm0 # ^ | 489 | xorps %xmm5, %xmm0 # ^ |
| @@ -522,8 +547,13 @@ sha1_process_block64: | |||
| 522 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | 547 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) |
| 523 | movaps %xmm0, %xmm4 | 548 | movaps %xmm0, %xmm4 |
| 524 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 549 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 525 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 550 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 526 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 551 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 552 | # same result as above, but shorter and faster: | ||
| 553 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 554 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 555 | movaps %xmm1, %xmm5 | ||
| 556 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 527 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 557 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 528 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 558 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 529 | xorps %xmm5, %xmm1 # ^ | 559 | xorps %xmm5, %xmm1 # ^ |
| @@ -588,8 +618,13 @@ sha1_process_block64: | |||
| 588 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | 618 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) |
| 589 | movaps %xmm1, %xmm4 | 619 | movaps %xmm1, %xmm4 |
| 590 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 620 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 591 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 621 | # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 592 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 622 | # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 623 | # same result as above, but shorter and faster: | ||
| 624 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 625 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 626 | movaps %xmm2, %xmm5 | ||
| 627 | shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 593 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 628 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 594 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 629 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 595 | xorps %xmm5, %xmm2 # ^ | 630 | xorps %xmm5, %xmm2 # ^ |
| @@ -653,8 +688,13 @@ sha1_process_block64: | |||
| 653 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | 688 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) |
| 654 | movaps %xmm2, %xmm4 | 689 | movaps %xmm2, %xmm4 |
| 655 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 690 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 656 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 691 | # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 657 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 692 | # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 693 | # same result as above, but shorter and faster: | ||
| 694 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 695 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 696 | movaps %xmm3, %xmm5 | ||
| 697 | shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 658 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 698 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 659 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 699 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 660 | xorps %xmm5, %xmm3 # ^ | 700 | xorps %xmm5, %xmm3 # ^ |
| @@ -718,8 +758,13 @@ sha1_process_block64: | |||
| 718 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | 758 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) |
| 719 | movaps %xmm3, %xmm4 | 759 | movaps %xmm3, %xmm4 |
| 720 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 760 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 721 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 761 | # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 722 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 762 | # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 763 | # same result as above, but shorter and faster: | ||
| 764 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 765 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 766 | movaps %xmm0, %xmm5 | ||
| 767 | shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 723 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 768 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 724 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 769 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 725 | xorps %xmm5, %xmm0 # ^ | 770 | xorps %xmm5, %xmm0 # ^ |
| @@ -795,8 +840,13 @@ sha1_process_block64: | |||
| 795 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | 840 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) |
| 796 | movaps %xmm0, %xmm4 | 841 | movaps %xmm0, %xmm4 |
| 797 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 842 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 798 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 843 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 799 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 844 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 845 | # same result as above, but shorter and faster: | ||
| 846 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 847 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 848 | movaps %xmm1, %xmm5 | ||
| 849 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 800 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 850 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 801 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 851 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 802 | xorps %xmm5, %xmm1 # ^ | 852 | xorps %xmm5, %xmm1 # ^ |
| @@ -872,8 +922,13 @@ sha1_process_block64: | |||
| 872 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | 922 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) |
| 873 | movaps %xmm1, %xmm4 | 923 | movaps %xmm1, %xmm4 |
| 874 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 924 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 875 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 925 | # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 876 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 926 | # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 927 | # same result as above, but shorter and faster: | ||
| 928 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 929 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 930 | movaps %xmm2, %xmm5 | ||
| 931 | shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 877 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 932 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 878 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 933 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 879 | xorps %xmm5, %xmm2 # ^ | 934 | xorps %xmm5, %xmm2 # ^ |
| @@ -950,8 +1005,13 @@ sha1_process_block64: | |||
| 950 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | 1005 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) |
| 951 | movaps %xmm2, %xmm4 | 1006 | movaps %xmm2, %xmm4 |
| 952 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 1007 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 953 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 1008 | # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 954 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 1009 | # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 1010 | # same result as above, but shorter and faster: | ||
| 1011 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 1012 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 1013 | movaps %xmm3, %xmm5 | ||
| 1014 | shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 955 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 1015 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 956 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 1016 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 957 | xorps %xmm5, %xmm3 # ^ | 1017 | xorps %xmm5, %xmm3 # ^ |
| @@ -1027,8 +1087,13 @@ sha1_process_block64: | |||
| 1027 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | 1087 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) |
| 1028 | movaps %xmm3, %xmm4 | 1088 | movaps %xmm3, %xmm4 |
| 1029 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 1089 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 1030 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 1090 | # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 1031 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 1091 | # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 1092 | # same result as above, but shorter and faster: | ||
| 1093 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 1094 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 1095 | movaps %xmm0, %xmm5 | ||
| 1096 | shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 1032 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 1097 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 1033 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 1098 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 1034 | xorps %xmm5, %xmm0 # ^ | 1099 | xorps %xmm5, %xmm0 # ^ |
| @@ -1104,8 +1169,13 @@ sha1_process_block64: | |||
| 1104 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | 1169 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) |
| 1105 | movaps %xmm0, %xmm4 | 1170 | movaps %xmm0, %xmm4 |
| 1106 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 1171 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 1107 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 1172 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 1108 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 1173 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 1174 | # same result as above, but shorter and faster: | ||
| 1175 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 1176 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 1177 | movaps %xmm1, %xmm5 | ||
| 1178 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 1109 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 1179 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 1110 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 1180 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 1111 | xorps %xmm5, %xmm1 # ^ | 1181 | xorps %xmm5, %xmm1 # ^ |
| @@ -1169,8 +1239,13 @@ sha1_process_block64: | |||
| 1169 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | 1239 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) |
| 1170 | movaps %xmm1, %xmm4 | 1240 | movaps %xmm1, %xmm4 |
| 1171 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 1241 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 1172 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 1242 | # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 1173 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 1243 | # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 1244 | # same result as above, but shorter and faster: | ||
| 1245 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 1246 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 1247 | movaps %xmm2, %xmm5 | ||
| 1248 | shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 1174 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 1249 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 1175 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 1250 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 1176 | xorps %xmm5, %xmm2 # ^ | 1251 | xorps %xmm5, %xmm2 # ^ |
| @@ -1234,8 +1309,13 @@ sha1_process_block64: | |||
| 1234 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | 1309 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) |
| 1235 | movaps %xmm2, %xmm4 | 1310 | movaps %xmm2, %xmm4 |
| 1236 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 1311 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 1237 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 1312 | # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 1238 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 1313 | # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 1314 | # same result as above, but shorter and faster: | ||
| 1315 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 1316 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 1317 | movaps %xmm3, %xmm5 | ||
| 1318 | shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 1239 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 1319 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 1240 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 1320 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 1241 | xorps %xmm5, %xmm3 # ^ | 1321 | xorps %xmm5, %xmm3 # ^ |
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 656fb5414..fb1e4b57e 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh | |||
| @@ -203,8 +203,13 @@ echo "# PREP $@ | |||
| 203 | movaps $xmmW12, $xmmT1 | 203 | movaps $xmmW12, $xmmT1 |
| 204 | psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 204 | psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
| 205 | 205 | ||
| 206 | pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 206 | # pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 207 | punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 207 | # punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 208 | # same result as above, but shorter and faster: | ||
| 209 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
| 210 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
| 211 | movaps $xmmW0, $xmmT2 | ||
| 212 | shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
| 208 | 213 | ||
| 209 | xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 214 | xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 210 | xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 215 | xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index ba92f09df..8ddec87ce 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S | |||
| @@ -217,8 +217,8 @@ sha1_process_block64_shaNI: | |||
| 217 | ret | 217 | ret |
| 218 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | 218 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI |
| 219 | 219 | ||
| 220 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | 220 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 |
| 221 | .balign 16 | 221 | .balign 16 |
| 222 | PSHUFFLE_BYTE_FLIP_MASK: | 222 | PSHUFFLE_BYTE_FLIP_MASK: |
| 223 | .octa 0x000102030405060708090a0b0c0d0e0f | 223 | .octa 0x000102030405060708090a0b0c0d0e0f |
| 224 | 224 | ||
