author     Denys Vlasenko <vda.linux@googlemail.com>   2022-02-08 03:29:16 +0100
committer  Denys Vlasenko <vda.linux@googlemail.com>   2022-02-08 03:29:16 +0100
commit     4923f74e5873b25b8205a4059964cff75ee731a8 (patch)
tree       303d731fc684080fb6438657a235cd7b002d6702
parent     c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb (diff)
download   busybox-w32-4923f74e5873b25b8205a4059964cff75ee731a8.tar.gz
           busybox-w32-4923f74e5873b25b8205a4059964cff75ee731a8.tar.bz2
           busybox-w32-4923f74e5873b25b8205a4059964cff75ee731a8.zip
libbb/sha1: shrink unrolled x86-64 code
function                                             old     new   delta
sha1_process_block64                                3482    3481      -1
.rodata                                           108460  108412     -48
------------------------------------------------------------------------------
(add/remove: 1/4 grow/shrink: 0/2 up/down: 0/-49)             Total: -49 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--   libbb/hash_md5_sha_x86-64.S     | 33
-rwxr-xr-x   libbb/hash_md5_sha_x86-64.S.sh  | 34
2 files changed, 21 insertions, 46 deletions
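The idea of the change: instead of keeping four pre-broadcast 16-byte constant tables (rconst0x5A827999 .. rconst0xCA62C1D6) in .rodata, keep a single 16-byte row holding K1..K4 and broadcast the needed dword into %xmm6 with PSHUFD; the immediates 0x00/0x55/0xaa/0xff replicate lane 0/1/2/3 into all four lanes. Below is a minimal standalone C sketch of the same trick, using SSE2 intrinsics and names of our own choosing (not code from the commit).

/* Illustrative sketch only: sha1consts[] and dump() are hypothetical names;
 * the lane immediates mirror the commit's pshufd usage. Requires SSE2. */
#include <emmintrin.h>   /* _mm_loadu_si128, _mm_shuffle_epi32 (PSHUFD), _mm_storeu_si128 */
#include <stdint.h>
#include <stdio.h>

/* One 16-byte table holding K1..K4, replacing four 16-byte broadcast tables. */
static const uint32_t sha1consts[4] = {
	0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
};

static void dump(const char *tag, __m128i v)
{
	uint32_t out[4];
	_mm_storeu_si128((__m128i *)out, v);
	printf("%s: %08x %08x %08x %08x\n", tag,
	       (unsigned)out[0], (unsigned)out[1], (unsigned)out[2], (unsigned)out[3]);
}

int main(void)
{
	/* the commit's table is 16-byte aligned and loaded with movaps;
	 * an unaligned load keeps this sketch compiler-agnostic */
	__m128i all = _mm_loadu_si128((const __m128i *)sha1consts); /* movaps sha1const(%rip), %xmm7 */

	/* pshufd $0x00 / $0x55 / $0xaa / $0xff, %xmm7, %xmm6 */
	dump("K1", _mm_shuffle_epi32(all, 0x00));
	dump("K2", _mm_shuffle_epi32(all, 0x55));
	dump("K3", _mm_shuffle_epi32(all, 0xaa));
	dump("K4", _mm_shuffle_epi32(all, 0xff));
	return 0;
}

Collapsing four 16-byte tables (64 bytes) into one 16-byte row is what accounts for the 48-byte .rodata shrink reported above.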
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index e26c46f25..287cfe547 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -24,6 +24,7 @@ sha1_process_block64:
 # xmm0..xmm3: W[]
 # xmm4,xmm5: temps
 # xmm6: current round constant
+# xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units

 movl 80(%rdi), %eax # a = ctx->hash[0]
@@ -32,16 +33,17 @@ sha1_process_block64:
 movl 92(%rdi), %edx # d = ctx->hash[3]
 movl 96(%rdi), %ebp # e = ctx->hash[4]

-movaps rconst0x5A827999(%rip), %xmm6
+movaps sha1const(%rip), %xmm7
+pshufd $0x00, %xmm7, %xmm6

 # Load W[] to xmm registers, byteswapping on the fly.
 #
 # For iterations 0..15, we pass W[] in rsi,r8..r14
-# for use in RD1A's instead of spilling them to stack.
+# for use in RD1As instead of spilling them to stack.
 # We lose parallelized addition of RCONST, but LEA
-# can do two additions at once, so it's probably a wash.
+# can do two additions at once, so it is probably a wash.
 # (We use rsi instead of rN because this makes two
-# LEAs in two first RD1A's shorter by one byte).
+# LEAs in two first RD1As shorter by one byte).
 movq 4*0(%rdi), %rsi
 movq 4*2(%rdi), %r8
 bswapq %rsi
@@ -253,7 +255,7 @@ sha1_process_block64:
 roll $5, %edi # rotl32(a,5)
 addl %edi, %edx # e += rotl32(a,5)
 rorl $2, %eax # b = rotl32(b,30)
-movaps rconst0x6ED9EBA1(%rip), %xmm6
+pshufd $0x55, %xmm7, %xmm6
 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
 movaps %xmm0, %xmm4
 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
@@ -614,7 +616,7 @@ sha1_process_block64:
 roll $5, %esi # rotl32(a,5)
 addl %esi, %edx # e += rotl32(a,5)
 rorl $2, %eax # b = rotl32(b,30)
-movaps rconst0x8F1BBCDC(%rip), %xmm6
+pshufd $0xaa, %xmm7, %xmm6
 # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
 movaps %xmm1, %xmm4
 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
@@ -1001,7 +1003,7 @@ sha1_process_block64:
 roll $5, %esi # rotl32(a,5)
 addl %esi, %edx # e += rotl32(a,5)
 rorl $2, %eax # b = rotl32(b,30)
-movaps rconst0xCA62C1D6(%rip), %xmm6
+pshufd $0xff, %xmm7, %xmm6
 # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
 movaps %xmm2, %xmm4
 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
@@ -1475,25 +1477,10 @@ sha1_process_block64:

 .section .rodata.cst16.sha1const, "aM", @progbits, 16
 .balign 16
-rconst0x5A827999:
+sha1const:
 .long 0x5A827999
-.long 0x5A827999
-.long 0x5A827999
-.long 0x5A827999
-rconst0x6ED9EBA1:
-.long 0x6ED9EBA1
-.long 0x6ED9EBA1
-.long 0x6ED9EBA1
 .long 0x6ED9EBA1
-rconst0x8F1BBCDC:
 .long 0x8F1BBCDC
-.long 0x8F1BBCDC
-.long 0x8F1BBCDC
-.long 0x8F1BBCDC
-rconst0xCA62C1D6:
-.long 0xCA62C1D6
-.long 0xCA62C1D6
-.long 0xCA62C1D6
 .long 0xCA62C1D6

 #endif
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index fb1e4b57e..a10ac411d 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -34,6 +34,7 @@ exec >hash_md5_sha_x86-64.S
 xmmT1="%xmm4"
 xmmT2="%xmm5"
 xmmRCONST="%xmm6"
+xmmALLRCONST="%xmm7"
 T=`printf '\t'`

 # SSE instructions are longer than 4 bytes on average.
@@ -125,6 +126,7 @@ sha1_process_block64:
 # xmm0..xmm3: W[]
 # xmm4,xmm5: temps
 # xmm6: current round constant
+# xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units

 movl 80(%rdi), %eax # a = ctx->hash[0]
@@ -133,16 +135,17 @@ sha1_process_block64:
 movl 92(%rdi), %edx # d = ctx->hash[3]
 movl 96(%rdi), %ebp # e = ctx->hash[4]

-movaps rconst0x5A827999(%rip), $xmmRCONST
+movaps sha1const(%rip), $xmmALLRCONST
+pshufd \$0x00, $xmmALLRCONST, $xmmRCONST

 # Load W[] to xmm registers, byteswapping on the fly.
 #
 # For iterations 0..15, we pass W[] in rsi,r8..r14
-# for use in RD1A's instead of spilling them to stack.
+# for use in RD1As instead of spilling them to stack.
 # We lose parallelized addition of RCONST, but LEA
-# can do two additions at once, so it's probably a wash.
+# can do two additions at once, so it is probably a wash.
 # (We use rsi instead of rN because this makes two
-# LEAs in two first RD1A's shorter by one byte).
+# LEAs in two first RD1As shorter by one byte).
 movq 4*0(%rdi), %rsi
 movq 4*2(%rdi), %r8
 bswapq %rsi
@@ -359,7 +362,7 @@ RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx
 a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
 b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;`
 INTERLEAVE "$a" "$b"
-a=`echo " movaps rconst0x6ED9EBA1(%rip), $xmmRCONST"
+a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST"
 PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
 b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;`
 INTERLEAVE "$a" "$b"
@@ -378,7 +381,7 @@ INTERLEAVE "$a" "$b"
 a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
 b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;`
 INTERLEAVE "$a" "$b"
-a=`echo " movaps rconst0x8F1BBCDC(%rip), $xmmRCONST"
+a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST"
 PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
 b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;`
 INTERLEAVE "$a" "$b"
@@ -397,7 +400,7 @@ INTERLEAVE "$a" "$b"
 a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
 b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;`
 INTERLEAVE "$a" "$b"
-a=`echo " movaps rconst0xCA62C1D6(%rip), $xmmRCONST"
+a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST"
 PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
 b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;`
 INTERLEAVE "$a" "$b"
@@ -439,25 +442,10 @@ echo "

 .section .rodata.cst16.sha1const, \"aM\", @progbits, 16
 .balign 16
-rconst0x5A827999:
+sha1const:
 .long 0x5A827999
-.long 0x5A827999
-.long 0x5A827999
-.long 0x5A827999
-rconst0x6ED9EBA1:
-.long 0x6ED9EBA1
-.long 0x6ED9EBA1
-.long 0x6ED9EBA1
 .long 0x6ED9EBA1
-rconst0x8F1BBCDC:
 .long 0x8F1BBCDC
-.long 0x8F1BBCDC
-.long 0x8F1BBCDC
-.long 0x8F1BBCDC
-rconst0xCA62C1D6:
-.long 0xCA62C1D6
-.long 0xCA62C1D6
-.long 0xCA62C1D6
 .long 0xCA62C1D6

 #endif"
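A side note on the comment kept unchanged in both files ("LEA can do two additions at once"): an x86-64 LEA computes base + index + 32-bit displacement in a single instruction, so the scalar RD1A rounds can fold RCONST and W[i] into the update of e without a separate add. A rough C sketch of one such round-1 step follows, with hypothetical names not taken from the generator script.

/* Rough sketch of one first-round SHA-1 step in the style of the unrolled
 * listing above; variable and function names here are illustrative. */
#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* K1 = 0x5A827999; F1(b,c,d) = (b & c) | (~b & d) = d ^ (b & (c ^ d)). */
static inline void rd1a_step(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                             uint32_t *e, uint32_t w)
{
	*e += 0x5A827999 + w;         /* RCONST + W[i]: foldable into one LEA */
	*e += d ^ (*b & (c ^ d));     /* F1(b,c,d) */
	*e += rotl32(a, 5);           /* roll $5 / addl in the listing */
	*b = rotl32(*b, 30);          /* rorl $2 is the same rotation */
}

The first addition can be emitted as a single LEA with the constant as displacement and the two registers as base and index, which is the two-additions-at-once form the comment refers to.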