diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2022-02-07 02:06:18 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2022-02-07 02:34:04 +0100 |
commit | c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb (patch) | |
tree | b75c366622b3146a4fdd3f7739b6eaf9d3bc1ac9 | |
parent | 987be932ed3cbea56b68bbe85649191c13b66015 (diff) | |
download | busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.tar.gz busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.tar.bz2 busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.zip |
libbb/sha1: shrink and speed up unrolled x86-64 code
function                                             old     new   delta
sha1_process_block64                                3514    3482     -32
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | libbb/hash_md5_sha256_x86-32_shaNI.S | 8 | ||||
-rw-r--r-- | libbb/hash_md5_sha256_x86-64_shaNI.S | 8 | ||||
-rw-r--r-- | libbb/hash_md5_sha_x86-32_shaNI.S | 4 | ||||
-rw-r--r-- | libbb/hash_md5_sha_x86-64.S | 144 | ||||
-rwxr-xr-x | libbb/hash_md5_sha_x86-64.S.sh | 9 | ||||
-rw-r--r-- | libbb/hash_md5_sha_x86-64_shaNI.S | 4 |
6 files changed, 131 insertions, 46 deletions
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 417da37d8..39e2baf41 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S | |||
@@ -257,8 +257,8 @@ sha256_process_block64_shaNI: | |||
257 | ret | 257 | ret |
258 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI | 258 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI |
259 | 259 | ||
260 | .section .rodata.cst256.K256, "aM", @progbits, 256 | 260 | .section .rodata.cst256.K256, "aM", @progbits, 256 |
261 | .balign 16 | 261 | .balign 16 |
262 | K256: | 262 | K256: |
263 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | 263 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 |
264 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | 264 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 |
@@ -277,8 +277,8 @@ K256: | |||
277 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | 277 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 |
278 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | 278 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 |
279 | 279 | ||
280 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 | 280 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 |
281 | .balign 16 | 281 | .balign 16 |
282 | PSHUFFLE_BSWAP32_FLIP_MASK: | 282 | PSHUFFLE_BSWAP32_FLIP_MASK: |
283 | .octa 0x0c0d0e0f08090a0b0405060700010203 | 283 | .octa 0x0c0d0e0f08090a0b0405060700010203 |
284 | 284 | ||
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index dbf391135..c6c931341 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S | |||
@@ -253,8 +253,8 @@ sha256_process_block64_shaNI: | |||
253 | ret | 253 | ret |
254 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI | 254 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI |
255 | 255 | ||
256 | .section .rodata.cst256.K256, "aM", @progbits, 256 | 256 | .section .rodata.cst256.K256, "aM", @progbits, 256 |
257 | .balign 16 | 257 | .balign 16 |
258 | K256: | 258 | K256: |
259 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | 259 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 |
260 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | 260 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 |
@@ -273,8 +273,8 @@ K256: | |||
273 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | 273 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 |
274 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | 274 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 |
275 | 275 | ||
276 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 | 276 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 |
277 | .balign 16 | 277 | .balign 16 |
278 | PSHUFFLE_BSWAP32_FLIP_MASK: | 278 | PSHUFFLE_BSWAP32_FLIP_MASK: |
279 | .octa 0x0c0d0e0f08090a0b0405060700010203 | 279 | .octa 0x0c0d0e0f08090a0b0405060700010203 |
280 | 280 | ||
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index 11b855e26..5d082ebfb 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S | |||
@@ -223,8 +223,8 @@ sha1_process_block64_shaNI: | |||
223 | ret | 223 | ret |
224 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | 224 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI |
225 | 225 | ||
226 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | 226 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 |
227 | .balign 16 | 227 | .balign 16 |
228 | PSHUFFLE_BYTE_FLIP_MASK: | 228 | PSHUFFLE_BYTE_FLIP_MASK: |
229 | .octa 0x000102030405060708090a0b0c0d0e0f | 229 | .octa 0x000102030405060708090a0b0c0d0e0f |
230 | 230 | ||
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 47ace60de..e26c46f25 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S | |||
@@ -180,8 +180,13 @@ sha1_process_block64: | |||
180 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | 180 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) |
181 | movaps %xmm3, %xmm4 | 181 | movaps %xmm3, %xmm4 |
182 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 182 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
183 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 183 | # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
184 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 184 | # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
185 | # same result as above, but shorter and faster: | ||
186 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
187 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
188 | movaps %xmm0, %xmm5 | ||
189 | shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
185 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 190 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
186 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 191 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
187 | xorps %xmm5, %xmm0 # ^ | 192 | xorps %xmm5, %xmm0 # ^ |
@@ -252,8 +257,13 @@ sha1_process_block64: | |||
252 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | 257 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) |
253 | movaps %xmm0, %xmm4 | 258 | movaps %xmm0, %xmm4 |
254 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 259 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
255 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 260 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
256 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 261 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
262 | # same result as above, but shorter and faster: | ||
263 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
264 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
265 | movaps %xmm1, %xmm5 | ||
266 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
257 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 267 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
258 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 268 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
259 | xorps %xmm5, %xmm1 # ^ | 269 | xorps %xmm5, %xmm1 # ^ |
@@ -323,8 +333,13 @@ sha1_process_block64: | |||
323 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | 333 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) |
324 | movaps %xmm1, %xmm4 | 334 | movaps %xmm1, %xmm4 |
325 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 335 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
326 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 336 | # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
327 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 337 | # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
338 | # same result as above, but shorter and faster: | ||
339 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
340 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
341 | movaps %xmm2, %xmm5 | ||
342 | shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
328 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 343 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
329 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 344 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
330 | xorps %xmm5, %xmm2 # ^ | 345 | xorps %xmm5, %xmm2 # ^ |
@@ -392,8 +407,13 @@ sha1_process_block64: | |||
392 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | 407 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) |
393 | movaps %xmm2, %xmm4 | 408 | movaps %xmm2, %xmm4 |
394 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 409 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
395 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 410 | # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
396 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 411 | # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
412 | # same result as above, but shorter and faster: | ||
413 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
414 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
415 | movaps %xmm3, %xmm5 | ||
416 | shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
397 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 417 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
398 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 418 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
399 | xorps %xmm5, %xmm3 # ^ | 419 | xorps %xmm5, %xmm3 # ^ |
@@ -457,8 +477,13 @@ sha1_process_block64: | |||
457 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | 477 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) |
458 | movaps %xmm3, %xmm4 | 478 | movaps %xmm3, %xmm4 |
459 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 479 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
460 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 480 | # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
461 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 481 | # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
482 | # same result as above, but shorter and faster: | ||
483 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
484 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
485 | movaps %xmm0, %xmm5 | ||
486 | shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
462 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 487 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
463 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 488 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
464 | xorps %xmm5, %xmm0 # ^ | 489 | xorps %xmm5, %xmm0 # ^ |
@@ -522,8 +547,13 @@ sha1_process_block64: | |||
522 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | 547 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) |
523 | movaps %xmm0, %xmm4 | 548 | movaps %xmm0, %xmm4 |
524 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 549 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
525 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 550 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
526 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 551 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
552 | # same result as above, but shorter and faster: | ||
553 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
554 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
555 | movaps %xmm1, %xmm5 | ||
556 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
527 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 557 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
528 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 558 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
529 | xorps %xmm5, %xmm1 # ^ | 559 | xorps %xmm5, %xmm1 # ^ |
@@ -588,8 +618,13 @@ sha1_process_block64: | |||
588 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | 618 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) |
589 | movaps %xmm1, %xmm4 | 619 | movaps %xmm1, %xmm4 |
590 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 620 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
591 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 621 | # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
592 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 622 | # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
623 | # same result as above, but shorter and faster: | ||
624 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
625 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
626 | movaps %xmm2, %xmm5 | ||
627 | shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
593 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 628 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
594 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 629 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
595 | xorps %xmm5, %xmm2 # ^ | 630 | xorps %xmm5, %xmm2 # ^ |
@@ -653,8 +688,13 @@ sha1_process_block64: | |||
653 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | 688 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) |
654 | movaps %xmm2, %xmm4 | 689 | movaps %xmm2, %xmm4 |
655 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 690 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
656 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 691 | # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
657 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 692 | # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
693 | # same result as above, but shorter and faster: | ||
694 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
695 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
696 | movaps %xmm3, %xmm5 | ||
697 | shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
658 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 698 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
659 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 699 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
660 | xorps %xmm5, %xmm3 # ^ | 700 | xorps %xmm5, %xmm3 # ^ |
@@ -718,8 +758,13 @@ sha1_process_block64: | |||
718 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | 758 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) |
719 | movaps %xmm3, %xmm4 | 759 | movaps %xmm3, %xmm4 |
720 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 760 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
721 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 761 | # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
722 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 762 | # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
763 | # same result as above, but shorter and faster: | ||
764 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
765 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
766 | movaps %xmm0, %xmm5 | ||
767 | shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
723 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 768 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
724 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 769 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
725 | xorps %xmm5, %xmm0 # ^ | 770 | xorps %xmm5, %xmm0 # ^ |
@@ -795,8 +840,13 @@ sha1_process_block64: | |||
795 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | 840 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) |
796 | movaps %xmm0, %xmm4 | 841 | movaps %xmm0, %xmm4 |
797 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 842 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
798 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 843 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
799 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 844 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
845 | # same result as above, but shorter and faster: | ||
846 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
847 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
848 | movaps %xmm1, %xmm5 | ||
849 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
800 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 850 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
801 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 851 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
802 | xorps %xmm5, %xmm1 # ^ | 852 | xorps %xmm5, %xmm1 # ^ |
@@ -872,8 +922,13 @@ sha1_process_block64: | |||
872 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | 922 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) |
873 | movaps %xmm1, %xmm4 | 923 | movaps %xmm1, %xmm4 |
874 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 924 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
875 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 925 | # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
876 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 926 | # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
927 | # same result as above, but shorter and faster: | ||
928 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
929 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
930 | movaps %xmm2, %xmm5 | ||
931 | shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
877 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 932 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
878 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 933 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
879 | xorps %xmm5, %xmm2 # ^ | 934 | xorps %xmm5, %xmm2 # ^ |
@@ -950,8 +1005,13 @@ sha1_process_block64: | |||
950 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | 1005 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) |
951 | movaps %xmm2, %xmm4 | 1006 | movaps %xmm2, %xmm4 |
952 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 1007 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
953 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 1008 | # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
954 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 1009 | # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
1010 | # same result as above, but shorter and faster: | ||
1011 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
1012 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
1013 | movaps %xmm3, %xmm5 | ||
1014 | shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
955 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 1015 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
956 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 1016 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
957 | xorps %xmm5, %xmm3 # ^ | 1017 | xorps %xmm5, %xmm3 # ^ |
@@ -1027,8 +1087,13 @@ sha1_process_block64: | |||
1027 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | 1087 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) |
1028 | movaps %xmm3, %xmm4 | 1088 | movaps %xmm3, %xmm4 |
1029 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 1089 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
1030 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 1090 | # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
1031 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 1091 | # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
1092 | # same result as above, but shorter and faster: | ||
1093 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
1094 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
1095 | movaps %xmm0, %xmm5 | ||
1096 | shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
1032 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 1097 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
1033 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 1098 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
1034 | xorps %xmm5, %xmm0 # ^ | 1099 | xorps %xmm5, %xmm0 # ^ |
@@ -1104,8 +1169,13 @@ sha1_process_block64: | |||
1104 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | 1169 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) |
1105 | movaps %xmm0, %xmm4 | 1170 | movaps %xmm0, %xmm4 |
1106 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 1171 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
1107 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 1172 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
1108 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 1173 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
1174 | # same result as above, but shorter and faster: | ||
1175 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
1176 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
1177 | movaps %xmm1, %xmm5 | ||
1178 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
1109 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 1179 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
1110 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 1180 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
1111 | xorps %xmm5, %xmm1 # ^ | 1181 | xorps %xmm5, %xmm1 # ^ |
@@ -1169,8 +1239,13 @@ sha1_process_block64: | |||
1169 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | 1239 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) |
1170 | movaps %xmm1, %xmm4 | 1240 | movaps %xmm1, %xmm4 |
1171 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 1241 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
1172 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 1242 | # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
1173 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 1243 | # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
1244 | # same result as above, but shorter and faster: | ||
1245 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
1246 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
1247 | movaps %xmm2, %xmm5 | ||
1248 | shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
1174 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 1249 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
1175 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 1250 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
1176 | xorps %xmm5, %xmm2 # ^ | 1251 | xorps %xmm5, %xmm2 # ^ |
@@ -1234,8 +1309,13 @@ sha1_process_block64: | |||
1234 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | 1309 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) |
1235 | movaps %xmm2, %xmm4 | 1310 | movaps %xmm2, %xmm4 |
1236 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 1311 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
1237 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 1312 | # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
1238 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 1313 | # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
1314 | # same result as above, but shorter and faster: | ||
1315 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
1316 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
1317 | movaps %xmm3, %xmm5 | ||
1318 | shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
1239 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 1319 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
1240 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 1320 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
1241 | xorps %xmm5, %xmm3 # ^ | 1321 | xorps %xmm5, %xmm3 # ^ |
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 656fb5414..fb1e4b57e 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh | |||
@@ -203,8 +203,13 @@ echo "# PREP $@ | |||
203 | movaps $xmmW12, $xmmT1 | 203 | movaps $xmmW12, $xmmT1 |
204 | psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | 204 | psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
205 | 205 | ||
206 | pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | 206 | # pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
207 | punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | 207 | # punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
208 | # same result as above, but shorter and faster: | ||
209 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
210 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
211 | movaps $xmmW0, $xmmT2 | ||
212 | shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
208 | 213 | ||
209 | xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | 214 | xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
210 | xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | 215 | xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index ba92f09df..8ddec87ce 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S | |||
@@ -217,8 +217,8 @@ sha1_process_block64_shaNI: | |||
217 | ret | 217 | ret |
218 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | 218 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI |
219 | 219 | ||
220 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | 220 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 |
221 | .balign 16 | 221 | .balign 16 |
222 | PSHUFFLE_BYTE_FLIP_MASK: | 222 | PSHUFFLE_BYTE_FLIP_MASK: |
223 | .octa 0x000102030405060708090a0b0c0d0e0f | 223 | .octa 0x000102030405060708090a0b0c0d0e0f |
224 | 224 | ||