libbb/sha1: x86_64 version: reorder prologue/epilogue insns

It is not clear exactly why, but this increases hashing speed on Skylake from 454 MB/s to 464 MB/s.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
author: Denys Vlasenko <vda.linux@googlemail.com> 2022-01-04 01:45:13 +0100
committer: Denys Vlasenko <vda.linux@googlemail.com> 2022-01-04 01:45:52 +0100
commit: c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7 (patch)
tree: fc2c1bd26b585b8da0ba8cbe21b9b9ab745ef42c
parent: 1fc520ed286f815cae1da1e9f8014cb18a256744 (diff)
download: busybox-w32-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.tar.gz
busybox-w32-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.tar.bz2
busybox-w32-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.zip
2 files changed, 67 insertions, 60 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 95b85d80a..ff78fc049 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -6,14 +6,14 @@
        .hidden sha1_process_block64
        .type   sha1_process_block64, @function
-        .balign 8       # allow decoders to fetch at least 4 first insns
+        .balign 8       # allow decoders to fetch at least 5 first insns
 sha1_process_block64:
-        pushq   %r15    #
+        pushq   %rbp    # 1 byte insn
-        pushq   %r14    #
+        pushq   %rbx    # 1 byte insn
-        pushq   %r13    #
+        pushq   %r15    # 2 byte insn
-        pushq   %r12    #
+        pushq   %r14    # 2 byte insn
-        pushq   %rbp    #
+        pushq   %r13    # 2 byte insn
-        pushq   %rbx    #
+        pushq   %r12    # 2 byte insn
        pushq   %rdi    # we need ctx at the end
 #Register and stack use:
@@ -22,24 +22,6 @@ sha1_process_block64:
 # esi,edi: temps
 # -32+4*n(%rsp),r8...r15: W[0..7,8..15]
 # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
-        movq    4*8(%rdi), %r8
-        bswapq  %r8
-        movl    %r8d, %r9d
-        shrq    $32, %r8
-        movq    4*10(%rdi), %r10
-        bswapq  %r10
-        movl    %r10d, %r11d
-        shrq    $32, %r10
-        movq    4*12(%rdi), %r12
-        bswapq  %r12
-        movl    %r12d, %r13d
-        shrq    $32, %r12
-        movq    4*14(%rdi), %r14
-        bswapq  %r14
-        movl    %r14d, %r15d
-        shrq    $32, %r14
        movl    $3, %eax
 1:
        movq    (%rdi,%rax,8), %rsi
@@ -48,12 +30,30 @@ sha1_process_block64:
        movq    %rsi, -32(%rsp,%rax,8)
        decl    %eax
        jns     1b
        movl    80(%rdi), %eax          # a = ctx->hash[0]
        movl    84(%rdi), %ebx          # b = ctx->hash[1]
        movl    88(%rdi), %ecx          # c = ctx->hash[2]
        movl    92(%rdi), %edx          # d = ctx->hash[3]
        movl    96(%rdi), %ebp          # e = ctx->hash[4]
+        movq    4*8(%rdi), %r8
+        movq    4*10(%rdi), %r10
+        bswapq  %r8
+        bswapq  %r10
+        movq    4*12(%rdi), %r12
+        movq    4*14(%rdi), %r14
+        bswapq  %r12
+        bswapq  %r14
+        movl    %r8d, %r9d
+        shrq    $32, %r8
+        movl    %r10d, %r11d
+        shrq    $32, %r10
+        movl    %r12d, %r13d
+        shrq    $32, %r12
+        movl    %r14d, %r15d
+        shrq    $32, %r14
 # 0
        # W[0], already in %esi
        movl    %ecx, %edi              # c
@@ -1272,17 +1272,17 @@ sha1_process_block64:
        rorl    $2, %ecx                # b = rotl32(b,30)
        popq    %rdi            #
+        popq    %r12            #
        addl    %eax, 80(%rdi)  # ctx->hash[0] += a
+        popq    %r13            #
        addl    %ebx, 84(%rdi)  # ctx->hash[1] += b
+        popq    %r14            #
        addl    %ecx, 88(%rdi)  # ctx->hash[2] += c
+        popq    %r15            #
        addl    %edx, 92(%rdi)  # ctx->hash[3] += d
-        addl    %ebp, 96(%rdi)  # ctx->hash[4] += e
        popq    %rbx            #
+        addl    %ebp, 96(%rdi)  # ctx->hash[4] += e
        popq    %rbp            #
-        popq    %r12            #
-        popq    %r13            #
-        popq    %r14            #
-        popq    %r15            #
        ret
        .size   sha1_process_block64, .-sha1_process_block64
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index c5f0ef504..7e50b64fb 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -15,14 +15,14 @@ echo \
        .hidden sha1_process_block64
        .type   sha1_process_block64, @function
-        .balign 8       # allow decoders to fetch at least 4 first insns
+        .balign 8       # allow decoders to fetch at least 5 first insns
 sha1_process_block64:
-        pushq   %r15    #
+        pushq   %rbp    # 1 byte insn
-        pushq   %r14    #
+        pushq   %rbx    # 1 byte insn
-        pushq   %r13    #
+        pushq   %r15    # 2 byte insn
-        pushq   %r12    #
+        pushq   %r14    # 2 byte insn
-        pushq   %rbp    #
+        pushq   %r13    # 2 byte insn
-        pushq   %rbx    #
+        pushq   %r12    # 2 byte insn
        pushq   %rdi    # we need ctx at the end
 #Register and stack use:
@@ -31,24 +31,6 @@ sha1_process_block64:
 # esi,edi: temps
 # -32+4*n(%rsp),r8...r15: W[0..7,8..15]
 # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
-        movq    4*8(%rdi), %r8
-        bswapq  %r8
-        movl    %r8d, %r9d
-        shrq    $32, %r8
-        movq    4*10(%rdi), %r10
-        bswapq  %r10
-        movl    %r10d, %r11d
-        shrq    $32, %r10
-        movq    4*12(%rdi), %r12
-        bswapq  %r12
-        movl    %r12d, %r13d
-        shrq    $32, %r12
-        movq    4*14(%rdi), %r14
-        bswapq  %r14
-        movl    %r14d, %r15d
-        shrq    $32, %r14
        movl    $3, %eax
 1:
        movq    (%rdi,%rax,8), %rsi
@@ -57,11 +39,29 @@ sha1_process_block64:
        movq    %rsi, -32(%rsp,%rax,8)
        decl    %eax
        jns     1b
        movl    80(%rdi), %eax          # a = ctx->hash[0]
        movl    84(%rdi), %ebx          # b = ctx->hash[1]
        movl    88(%rdi), %ecx          # c = ctx->hash[2]
        movl    92(%rdi), %edx          # d = ctx->hash[3]
        movl    96(%rdi), %ebp          # e = ctx->hash[4]
+        movq    4*8(%rdi), %r8
+        movq    4*10(%rdi), %r10
+        bswapq  %r8
+        bswapq  %r10
+        movq    4*12(%rdi), %r12
+        movq    4*14(%rdi), %r14
+        bswapq  %r12
+        bswapq  %r14
+        movl    %r8d, %r9d
+        shrq    $32, %r8
+        movl    %r10d, %r11d
+        shrq    $32, %r10
+        movl    %r12d, %r13d
+        shrq    $32, %r12
+        movl    %r14d, %r15d
+        shrq    $32, %r14
 '
 W32() {
 test "$1" || exit 1
@@ -71,6 +71,13 @@ test "$1" -lt 8 && echo "-32+4*$1(%rsp)"
 test "$1" -ge 8 && echo "%r${1}d"
 }
+# It's possible to interleave insns in rounds to mostly eliminate
+# dependency chains, but this is likely to only help old Pentium-based
+# CPUs (ones without OOO, which can only simultaneously execute a pair
+# of _adjacent_ insns).
+# Testing on an old-ish Silvermont CPU (which has an OOO window of only
+# about 8 insns) shows a very small (~1%) speedup.
 RD1A() {
 local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
 local n=$(($6))
@@ -257,17 +264,17 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b
 echo "
        popq    %rdi            #
+        popq    %r12            #
        addl    %eax, 80(%rdi)  # ctx->hash[0] += a
+        popq    %r13            #
        addl    %ebx, 84(%rdi)  # ctx->hash[1] += b
+        popq    %r14            #
        addl    %ecx, 88(%rdi)  # ctx->hash[2] += c
+        popq    %r15            #
        addl    %edx, 92(%rdi)  # ctx->hash[3] += d
-        addl    %ebp, 96(%rdi)  # ctx->hash[4] += e
        popq    %rbx            #
+        addl    %ebp, 96(%rdi)  # ctx->hash[4] += e
        popq    %rbp            #
-        popq    %r12            #
-        popq    %r13            #
-        popq    %r14            #
-        popq    %r15            #
        ret
        .size   sha1_process_block64, .-sha1_process_block64
author	Denys Vlasenko <vda.linux@googlemail.com>	2022-01-04 01:45:13 +0100
committer	Denys Vlasenko <vda.linux@googlemail.com>	2022-01-04 01:45:52 +0100
commit	c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7 (patch)
tree	fc2c1bd26b585b8da0ba8cbe21b9b9ab745ef42c
parent	1fc520ed286f815cae1da1e9f8014cb18a256744 (diff)
download	busybox-w32-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.tar.gz busybox-w32-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.tar.bz2 busybox-w32-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.zip