libbb/sha1: shrink and speed up unrolled x86-64 code

function                                             old     new   delta
sha1_process_block64                                3514    3482     -32

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
author: Denys Vlasenko <vda.linux@googlemail.com> 2022-02-07 02:06:18 +0100
committer: Denys Vlasenko <vda.linux@googlemail.com> 2022-02-07 02:34:04 +0100
commit: c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb (patch)
tree: b75c366622b3146a4fdd3f7739b6eaf9d3bc1ac9
parent: 987be932ed3cbea56b68bbe85649191c13b66015 (diff)
download: busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.tar.gz
busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.tar.bz2
busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.zip
6 files changed, 131 insertions, 46 deletions
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
index 417da37d8..39e2baf41 100644
--- a/libbb/hash_md5_sha256_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -257,8 +257,8 @@ sha256_process_block64_shaNI:
        ret
        .size   sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
-.section        .rodata.cst256.K256, "aM", @progbits, 256
+        .section        .rodata.cst256.K256, "aM", @progbits, 256
-.balign 16
+        .balign 16
 K256:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
@@ -277,8 +277,8 @@ K256:
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.section        .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
+        .section        .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
-.balign 16
+        .balign 16
 PSHUFFLE_BSWAP32_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S
index dbf391135..c6c931341 100644
--- a/libbb/hash_md5_sha256_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-64_shaNI.S
@@ -253,8 +253,8 @@ sha256_process_block64_shaNI:
        ret
        .size   sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
-.section        .rodata.cst256.K256, "aM", @progbits, 256
+        .section        .rodata.cst256.K256, "aM", @progbits, 256
-.balign 16
+        .balign 16
 K256:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
@@ -273,8 +273,8 @@ K256:
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.section        .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
+        .section        .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
-.balign 16
+        .balign 16
 PSHUFFLE_BSWAP32_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S
index 11b855e26..5d082ebfb 100644
--- a/libbb/hash_md5_sha_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@@ -223,8 +223,8 @@ sha1_process_block64_shaNI:
        ret
        .size   sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
-.section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+        .section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.balign 16
+        .balign 16
 PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x000102030405060708090a0b0c0d0e0f
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 47ace60de..e26c46f25 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -180,8 +180,13 @@ sha1_process_block64:
 # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
        movaps  %xmm3, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm0, %xmm5
+        shufps  $0x4e, %xmm1, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm2, %xmm0    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm0    # ^
@@ -252,8 +257,13 @@ sha1_process_block64:
 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
        movaps  %xmm0, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm1, %xmm5
+        shufps  $0x4e, %xmm2, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm3, %xmm1    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm1    # ^
@@ -323,8 +333,13 @@ sha1_process_block64:
 # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
        movaps  %xmm1, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm2, %xmm5
+        shufps  $0x4e, %xmm3, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm0, %xmm2    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm2    # ^
@@ -392,8 +407,13 @@ sha1_process_block64:
 # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
        movaps  %xmm2, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm3, %xmm5
+        shufps  $0x4e, %xmm0, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm1, %xmm3    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm3    # ^
@@ -457,8 +477,13 @@ sha1_process_block64:
 # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
        movaps  %xmm3, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm0, %xmm5
+        shufps  $0x4e, %xmm1, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm2, %xmm0    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm0    # ^
@@ -522,8 +547,13 @@ sha1_process_block64:
 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
        movaps  %xmm0, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm1, %xmm5
+        shufps  $0x4e, %xmm2, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm3, %xmm1    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm1    # ^
@@ -588,8 +618,13 @@ sha1_process_block64:
 # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
        movaps  %xmm1, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm2, %xmm5
+        shufps  $0x4e, %xmm3, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm0, %xmm2    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm2    # ^
@@ -653,8 +688,13 @@ sha1_process_block64:
 # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
        movaps  %xmm2, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm3, %xmm5
+        shufps  $0x4e, %xmm0, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm1, %xmm3    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm3    # ^
@@ -718,8 +758,13 @@ sha1_process_block64:
 # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
        movaps  %xmm3, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm0, %xmm5
+        shufps  $0x4e, %xmm1, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm2, %xmm0    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm0    # ^
@@ -795,8 +840,13 @@ sha1_process_block64:
 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
        movaps  %xmm0, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm1, %xmm5
+        shufps  $0x4e, %xmm2, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm3, %xmm1    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm1    # ^
@@ -872,8 +922,13 @@ sha1_process_block64:
 # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
        movaps  %xmm1, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm2, %xmm5
+        shufps  $0x4e, %xmm3, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm0, %xmm2    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm2    # ^
@@ -950,8 +1005,13 @@ sha1_process_block64:
 # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
        movaps  %xmm2, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm3, %xmm5
+        shufps  $0x4e, %xmm0, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm1, %xmm3    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm3    # ^
@@ -1027,8 +1087,13 @@ sha1_process_block64:
 # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
        movaps  %xmm3, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm0, %xmm5
+        shufps  $0x4e, %xmm1, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm2, %xmm0    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm0    # ^
@@ -1104,8 +1169,13 @@ sha1_process_block64:
 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
        movaps  %xmm0, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm1, %xmm5
+        shufps  $0x4e, %xmm2, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm3, %xmm1    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm1    # ^
@@ -1169,8 +1239,13 @@ sha1_process_block64:
 # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
        movaps  %xmm1, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm2, %xmm5
+        shufps  $0x4e, %xmm3, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm0, %xmm2    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm2    # ^
@@ -1234,8 +1309,13 @@ sha1_process_block64:
 # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
        movaps  %xmm2, %xmm4
        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm3, %xmm5
+        shufps  $0x4e, %xmm0, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   %xmm1, %xmm3    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps   %xmm5, %xmm3    # ^
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 656fb5414..fb1e4b57e 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -203,8 +203,13 @@ echo "# PREP $@
        movaps  $xmmW12, $xmmT1
        psrldq  \$4, $xmmT1     # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-        pshufd  \$0x4e, $xmmW0, $xmmT2  # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       pshufd  \$0x4e, $xmmW0, $xmmT2  # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-        punpcklqdq $xmmW4, $xmmT2       # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+#       punpcklqdq $xmmW4, $xmmT2       # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  $xmmW0, $xmmT2
+        shufps  \$0x4e, $xmmW4, $xmmT2  # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps   $xmmW8, $xmmW0  # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps   $xmmT1, $xmmT2  # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
index ba92f09df..8ddec87ce 100644
--- a/libbb/hash_md5_sha_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -217,8 +217,8 @@ sha1_process_block64_shaNI:
        ret
        .size   sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
-.section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+        .section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.balign 16
+        .balign 16
 PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x000102030405060708090a0b0c0d0e0f
author	Denys Vlasenko <vda.linux@googlemail.com>	2022-02-07 02:06:18 +0100
committer	Denys Vlasenko <vda.linux@googlemail.com>	2022-02-07 02:34:04 +0100
commit	c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb (patch)
tree	b75c366622b3146a4fdd3f7739b6eaf9d3bc1ac9
parent	987be932ed3cbea56b68bbe85649191c13b66015 (diff)
download	busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.tar.gz busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.tar.bz2 busybox-w32-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.zip