about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Denys Vlasenko <vda.linux@googlemail.com> 2022-02-07 02:06:18 +0100
committer Denys Vlasenko <vda.linux@googlemail.com> 2022-02-07 02:34:04 +0100
commit c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb (patch)
tree b75c366622b3146a4fdd3f7739b6eaf9d3bc1ac9
parent 987be932ed3cbea56b68bbe85649191c13b66015 (diff)
download busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.tar.gz
busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.tar.bz2
busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.zip
libbb/sha1: shrink and speed up unrolled x86-64 code
function                                             old     new   delta
sha1_process_block64                                3514    3482     -32

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- libbb/hash_md5_sha256_x86-32_shaNI.S 8
-rw-r--r-- libbb/hash_md5_sha256_x86-64_shaNI.S 8
-rw-r--r-- libbb/hash_md5_sha_x86-32_shaNI.S 4
-rw-r--r-- libbb/hash_md5_sha_x86-64.S 144
-rwxr-xr-x libbb/hash_md5_sha_x86-64.S.sh 9
-rw-r--r-- libbb/hash_md5_sha_x86-64_shaNI.S 4
6 files changed, 131 insertions, 46 deletions
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
index 417da37d8..39e2baf41 100644
--- a/libbb/hash_md5_sha256_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -257,8 +257,8 @@ sha256_process_block64_shaNI:
257 ret 257 ret
258 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI 258 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
259 259
260.section .rodata.cst256.K256, "aM", @progbits, 256 260 .section .rodata.cst256.K256, "aM", @progbits, 256
261.balign 16 261 .balign 16
262K256: 262K256:
263 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 263 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
264 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 264 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
@@ -277,8 +277,8 @@ K256:
277 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 277 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
278 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 278 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
279 279
280.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 280 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
281.balign 16 281 .balign 16
282PSHUFFLE_BSWAP32_FLIP_MASK: 282PSHUFFLE_BSWAP32_FLIP_MASK:
283 .octa 0x0c0d0e0f08090a0b0405060700010203 283 .octa 0x0c0d0e0f08090a0b0405060700010203
284 284
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S
index dbf391135..c6c931341 100644
--- a/libbb/hash_md5_sha256_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-64_shaNI.S
@@ -253,8 +253,8 @@ sha256_process_block64_shaNI:
253 ret 253 ret
254 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI 254 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
255 255
256.section .rodata.cst256.K256, "aM", @progbits, 256 256 .section .rodata.cst256.K256, "aM", @progbits, 256
257.balign 16 257 .balign 16
258K256: 258K256:
259 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 259 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
260 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 260 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
@@ -273,8 +273,8 @@ K256:
273 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 273 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
274 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 274 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
275 275
276.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 276 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
277.balign 16 277 .balign 16
278PSHUFFLE_BSWAP32_FLIP_MASK: 278PSHUFFLE_BSWAP32_FLIP_MASK:
279 .octa 0x0c0d0e0f08090a0b0405060700010203 279 .octa 0x0c0d0e0f08090a0b0405060700010203
280 280
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S
index 11b855e26..5d082ebfb 100644
--- a/libbb/hash_md5_sha_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@@ -223,8 +223,8 @@ sha1_process_block64_shaNI:
223 ret 223 ret
224 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI 224 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
225 225
226.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 226 .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
227.balign 16 227 .balign 16
228PSHUFFLE_BYTE_FLIP_MASK: 228PSHUFFLE_BYTE_FLIP_MASK:
229 .octa 0x000102030405060708090a0b0c0d0e0f 229 .octa 0x000102030405060708090a0b0c0d0e0f
230 230
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 47ace60de..e26c46f25 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -180,8 +180,13 @@ sha1_process_block64:
180# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) 180# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
181 movaps %xmm3, %xmm4 181 movaps %xmm3, %xmm4
182 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 182 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
183 pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 183# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
184 punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 184# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
185# same result as above, but shorter and faster:
186# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
187# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
188 movaps %xmm0, %xmm5
189 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
185 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 190 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
186 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 191 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
187 xorps %xmm5, %xmm0 # ^ 192 xorps %xmm5, %xmm0 # ^
@@ -252,8 +257,13 @@ sha1_process_block64:
252# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) 257# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
253 movaps %xmm0, %xmm4 258 movaps %xmm0, %xmm4
254 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 259 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
255 pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 260# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
256 punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 261# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
262# same result as above, but shorter and faster:
263# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
264# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
265 movaps %xmm1, %xmm5
266 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
257 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 267 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
258 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 268 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
259 xorps %xmm5, %xmm1 # ^ 269 xorps %xmm5, %xmm1 # ^
@@ -323,8 +333,13 @@ sha1_process_block64:
323# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) 333# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
324 movaps %xmm1, %xmm4 334 movaps %xmm1, %xmm4
325 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 335 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
326 pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 336# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
327 punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 337# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
338# same result as above, but shorter and faster:
339# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
340# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
341 movaps %xmm2, %xmm5
342 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
328 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 343 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
329 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 344 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
330 xorps %xmm5, %xmm2 # ^ 345 xorps %xmm5, %xmm2 # ^
@@ -392,8 +407,13 @@ sha1_process_block64:
392# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) 407# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
393 movaps %xmm2, %xmm4 408 movaps %xmm2, %xmm4
394 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 409 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
395 pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 410# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
396 punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 411# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
412# same result as above, but shorter and faster:
413# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
414# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
415 movaps %xmm3, %xmm5
416 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
397 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 417 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
398 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 418 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
399 xorps %xmm5, %xmm3 # ^ 419 xorps %xmm5, %xmm3 # ^
@@ -457,8 +477,13 @@ sha1_process_block64:
457# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) 477# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
458 movaps %xmm3, %xmm4 478 movaps %xmm3, %xmm4
459 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 479 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
460 pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 480# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
461 punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 481# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
482# same result as above, but shorter and faster:
483# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
484# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
485 movaps %xmm0, %xmm5
486 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
462 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 487 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
463 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 488 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
464 xorps %xmm5, %xmm0 # ^ 489 xorps %xmm5, %xmm0 # ^
@@ -522,8 +547,13 @@ sha1_process_block64:
522# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) 547# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
523 movaps %xmm0, %xmm4 548 movaps %xmm0, %xmm4
524 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 549 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
525 pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 550# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
526 punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 551# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
552# same result as above, but shorter and faster:
553# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
554# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
555 movaps %xmm1, %xmm5
556 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
527 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 557 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
528 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 558 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
529 xorps %xmm5, %xmm1 # ^ 559 xorps %xmm5, %xmm1 # ^
@@ -588,8 +618,13 @@ sha1_process_block64:
588# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) 618# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
589 movaps %xmm1, %xmm4 619 movaps %xmm1, %xmm4
590 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 620 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
591 pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 621# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
592 punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 622# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
623# same result as above, but shorter and faster:
624# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
625# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
626 movaps %xmm2, %xmm5
627 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
593 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 628 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
594 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 629 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
595 xorps %xmm5, %xmm2 # ^ 630 xorps %xmm5, %xmm2 # ^
@@ -653,8 +688,13 @@ sha1_process_block64:
653# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) 688# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
654 movaps %xmm2, %xmm4 689 movaps %xmm2, %xmm4
655 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 690 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
656 pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 691# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
657 punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 692# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
693# same result as above, but shorter and faster:
694# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
695# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
696 movaps %xmm3, %xmm5
697 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
658 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 698 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
659 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 699 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
660 xorps %xmm5, %xmm3 # ^ 700 xorps %xmm5, %xmm3 # ^
@@ -718,8 +758,13 @@ sha1_process_block64:
718# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) 758# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
719 movaps %xmm3, %xmm4 759 movaps %xmm3, %xmm4
720 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 760 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
721 pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 761# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
722 punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 762# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
763# same result as above, but shorter and faster:
764# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
765# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
766 movaps %xmm0, %xmm5
767 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
723 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 768 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
724 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 769 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
725 xorps %xmm5, %xmm0 # ^ 770 xorps %xmm5, %xmm0 # ^
@@ -795,8 +840,13 @@ sha1_process_block64:
795# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) 840# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
796 movaps %xmm0, %xmm4 841 movaps %xmm0, %xmm4
797 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 842 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
798 pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 843# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
799 punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 844# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
845# same result as above, but shorter and faster:
846# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
847# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
848 movaps %xmm1, %xmm5
849 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
800 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 850 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
801 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 851 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
802 xorps %xmm5, %xmm1 # ^ 852 xorps %xmm5, %xmm1 # ^
@@ -872,8 +922,13 @@ sha1_process_block64:
872# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) 922# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
873 movaps %xmm1, %xmm4 923 movaps %xmm1, %xmm4
874 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 924 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
875 pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 925# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
876 punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 926# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
927# same result as above, but shorter and faster:
928# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
929# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
930 movaps %xmm2, %xmm5
931 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
877 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 932 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
878 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 933 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
879 xorps %xmm5, %xmm2 # ^ 934 xorps %xmm5, %xmm2 # ^
@@ -950,8 +1005,13 @@ sha1_process_block64:
950# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) 1005# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
951 movaps %xmm2, %xmm4 1006 movaps %xmm2, %xmm4
952 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 1007 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
953 pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 1008# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
954 punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 1009# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1010# same result as above, but shorter and faster:
1011# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1012# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1013 movaps %xmm3, %xmm5
1014 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
955 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 1015 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
956 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 1016 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
957 xorps %xmm5, %xmm3 # ^ 1017 xorps %xmm5, %xmm3 # ^
@@ -1027,8 +1087,13 @@ sha1_process_block64:
1027# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) 1087# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
1028 movaps %xmm3, %xmm4 1088 movaps %xmm3, %xmm4
1029 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 1089 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1030 pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 1090# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1031 punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 1091# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1092# same result as above, but shorter and faster:
1093# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1094# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1095 movaps %xmm0, %xmm5
1096 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1032 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 1097 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1033 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 1098 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1034 xorps %xmm5, %xmm0 # ^ 1099 xorps %xmm5, %xmm0 # ^
@@ -1104,8 +1169,13 @@ sha1_process_block64:
1104# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) 1169# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
1105 movaps %xmm0, %xmm4 1170 movaps %xmm0, %xmm4
1106 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 1171 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1107 pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 1172# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1108 punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 1173# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1174# same result as above, but shorter and faster:
1175# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1176# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1177 movaps %xmm1, %xmm5
1178 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1109 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 1179 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1110 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 1180 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1111 xorps %xmm5, %xmm1 # ^ 1181 xorps %xmm5, %xmm1 # ^
@@ -1169,8 +1239,13 @@ sha1_process_block64:
1169# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) 1239# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
1170 movaps %xmm1, %xmm4 1240 movaps %xmm1, %xmm4
1171 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 1241 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1172 pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 1242# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1173 punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 1243# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1244# same result as above, but shorter and faster:
1245# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1246# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1247 movaps %xmm2, %xmm5
1248 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1174 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 1249 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1175 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 1250 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1176 xorps %xmm5, %xmm2 # ^ 1251 xorps %xmm5, %xmm2 # ^
@@ -1234,8 +1309,13 @@ sha1_process_block64:
1234# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) 1309# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
1235 movaps %xmm2, %xmm4 1310 movaps %xmm2, %xmm4
1236 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 1311 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1237 pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 1312# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1238 punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 1313# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1314# same result as above, but shorter and faster:
1315# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1316# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1317 movaps %xmm3, %xmm5
1318 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1239 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 1319 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1240 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 1320 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1241 xorps %xmm5, %xmm3 # ^ 1321 xorps %xmm5, %xmm3 # ^
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 656fb5414..fb1e4b57e 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -203,8 +203,13 @@ echo "# PREP $@
203 movaps $xmmW12, $xmmT1 203 movaps $xmmW12, $xmmT1
204 psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) 204 psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
205 205
206 pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) 206# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
207 punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) 207# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
208# same result as above, but shorter and faster:
209# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
210# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
211 movaps $xmmW0, $xmmT2
212 shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
208 213
209 xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) 214 xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
210 xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) 215 xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
index ba92f09df..8ddec87ce 100644
--- a/libbb/hash_md5_sha_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -217,8 +217,8 @@ sha1_process_block64_shaNI:
217 ret 217 ret
218 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI 218 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
219 219
220.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 220 .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
221.balign 16 221 .balign 16
222PSHUFFLE_BYTE_FLIP_MASK: 222PSHUFFLE_BYTE_FLIP_MASK:
223 .octa 0x000102030405060708090a0b0c0d0e0f 223 .octa 0x000102030405060708090a0b0c0d0e0f
224 224