aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2022-01-04 01:45:13 +0100
committerDenys Vlasenko <vda.linux@googlemail.com>2022-01-04 01:45:52 +0100
commitc3cfcc92422f6e525073226cdbfdcb00ab1e7dc7 (patch)
treefc2c1bd26b585b8da0ba8cbe21b9b9ab745ef42c
parent1fc520ed286f815cae1da1e9f8014cb18a256744 (diff)
downloadbusybox-w32-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.tar.gz
busybox-w32-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.tar.bz2
busybox-w32-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.zip
libbb/sha1: x86_64 version: reorder prologue/epilogue insns
Not clear exactly why, but this increases hashing speed on Skylake from 454 MB/s to 464 MB/s. Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--libbb/hash_md5_sha_x86-64.S60
-rwxr-xr-xlibbb/hash_md5_sha_x86-64.S.sh67
2 files changed, 67 insertions, 60 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 95b85d80a..ff78fc049 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -6,14 +6,14 @@
6 .hidden sha1_process_block64 6 .hidden sha1_process_block64
7 .type sha1_process_block64, @function 7 .type sha1_process_block64, @function
8 8
9 .balign 8 # allow decoders to fetch at least 4 first insns 9 .balign 8 # allow decoders to fetch at least 5 first insns
10sha1_process_block64: 10sha1_process_block64:
11 pushq %r15 # 11 pushq %rbp # 1 byte insn
12 pushq %r14 # 12 pushq %rbx # 1 byte insn
13 pushq %r13 # 13 pushq %r15 # 2 byte insn
14 pushq %r12 # 14 pushq %r14 # 2 byte insn
15 pushq %rbp # 15 pushq %r13 # 2 byte insn
16 pushq %rbx # 16 pushq %r12 # 2 byte insn
17 pushq %rdi # we need ctx at the end 17 pushq %rdi # we need ctx at the end
18 18
19#Register and stack use: 19#Register and stack use:
@@ -22,24 +22,6 @@ sha1_process_block64:
22# esi,edi: temps 22# esi,edi: temps
23# -32+4*n(%rsp),r8...r15: W[0..7,8..15] 23# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
24# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) 24# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
25
26 movq 4*8(%rdi), %r8
27 bswapq %r8
28 movl %r8d, %r9d
29 shrq $32, %r8
30 movq 4*10(%rdi), %r10
31 bswapq %r10
32 movl %r10d, %r11d
33 shrq $32, %r10
34 movq 4*12(%rdi), %r12
35 bswapq %r12
36 movl %r12d, %r13d
37 shrq $32, %r12
38 movq 4*14(%rdi), %r14
39 bswapq %r14
40 movl %r14d, %r15d
41 shrq $32, %r14
42
43 movl $3, %eax 25 movl $3, %eax
441: 261:
45 movq (%rdi,%rax,8), %rsi 27 movq (%rdi,%rax,8), %rsi
@@ -48,12 +30,30 @@ sha1_process_block64:
48 movq %rsi, -32(%rsp,%rax,8) 30 movq %rsi, -32(%rsp,%rax,8)
49 decl %eax 31 decl %eax
50 jns 1b 32 jns 1b
33
51 movl 80(%rdi), %eax # a = ctx->hash[0] 34 movl 80(%rdi), %eax # a = ctx->hash[0]
52 movl 84(%rdi), %ebx # b = ctx->hash[1] 35 movl 84(%rdi), %ebx # b = ctx->hash[1]
53 movl 88(%rdi), %ecx # c = ctx->hash[2] 36 movl 88(%rdi), %ecx # c = ctx->hash[2]
54 movl 92(%rdi), %edx # d = ctx->hash[3] 37 movl 92(%rdi), %edx # d = ctx->hash[3]
55 movl 96(%rdi), %ebp # e = ctx->hash[4] 38 movl 96(%rdi), %ebp # e = ctx->hash[4]
56 39
40 movq 4*8(%rdi), %r8
41 movq 4*10(%rdi), %r10
42 bswapq %r8
43 bswapq %r10
44 movq 4*12(%rdi), %r12
45 movq 4*14(%rdi), %r14
46 bswapq %r12
47 bswapq %r14
48 movl %r8d, %r9d
49 shrq $32, %r8
50 movl %r10d, %r11d
51 shrq $32, %r10
52 movl %r12d, %r13d
53 shrq $32, %r12
54 movl %r14d, %r15d
55 shrq $32, %r14
56
57# 0 57# 0
58 # W[0], already in %esi 58 # W[0], already in %esi
59 movl %ecx, %edi # c 59 movl %ecx, %edi # c
@@ -1272,17 +1272,17 @@ sha1_process_block64:
1272 rorl $2, %ecx # b = rotl32(b,30) 1272 rorl $2, %ecx # b = rotl32(b,30)
1273 1273
1274 popq %rdi # 1274 popq %rdi #
1275 popq %r12 #
1275 addl %eax, 80(%rdi) # ctx->hash[0] += a 1276 addl %eax, 80(%rdi) # ctx->hash[0] += a
1277 popq %r13 #
1276 addl %ebx, 84(%rdi) # ctx->hash[1] += b 1278 addl %ebx, 84(%rdi) # ctx->hash[1] += b
1279 popq %r14 #
1277 addl %ecx, 88(%rdi) # ctx->hash[2] += c 1280 addl %ecx, 88(%rdi) # ctx->hash[2] += c
1281 popq %r15 #
1278 addl %edx, 92(%rdi) # ctx->hash[3] += d 1282 addl %edx, 92(%rdi) # ctx->hash[3] += d
1279 addl %ebp, 96(%rdi) # ctx->hash[4] += e
1280 popq %rbx # 1283 popq %rbx #
1284 addl %ebp, 96(%rdi) # ctx->hash[4] += e
1281 popq %rbp # 1285 popq %rbp #
1282 popq %r12 #
1283 popq %r13 #
1284 popq %r14 #
1285 popq %r15 #
1286 1286
1287 ret 1287 ret
1288 .size sha1_process_block64, .-sha1_process_block64 1288 .size sha1_process_block64, .-sha1_process_block64
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index c5f0ef504..7e50b64fb 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -15,14 +15,14 @@ echo \
15 .hidden sha1_process_block64 15 .hidden sha1_process_block64
16 .type sha1_process_block64, @function 16 .type sha1_process_block64, @function
17 17
18 .balign 8 # allow decoders to fetch at least 4 first insns 18 .balign 8 # allow decoders to fetch at least 5 first insns
19sha1_process_block64: 19sha1_process_block64:
20 pushq %r15 # 20 pushq %rbp # 1 byte insn
21 pushq %r14 # 21 pushq %rbx # 1 byte insn
22 pushq %r13 # 22 pushq %r15 # 2 byte insn
23 pushq %r12 # 23 pushq %r14 # 2 byte insn
24 pushq %rbp # 24 pushq %r13 # 2 byte insn
25 pushq %rbx # 25 pushq %r12 # 2 byte insn
26 pushq %rdi # we need ctx at the end 26 pushq %rdi # we need ctx at the end
27 27
28#Register and stack use: 28#Register and stack use:
@@ -31,24 +31,6 @@ sha1_process_block64:
31# esi,edi: temps 31# esi,edi: temps
32# -32+4*n(%rsp),r8...r15: W[0..7,8..15] 32# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
33# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) 33# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
34
35 movq 4*8(%rdi), %r8
36 bswapq %r8
37 movl %r8d, %r9d
38 shrq $32, %r8
39 movq 4*10(%rdi), %r10
40 bswapq %r10
41 movl %r10d, %r11d
42 shrq $32, %r10
43 movq 4*12(%rdi), %r12
44 bswapq %r12
45 movl %r12d, %r13d
46 shrq $32, %r12
47 movq 4*14(%rdi), %r14
48 bswapq %r14
49 movl %r14d, %r15d
50 shrq $32, %r14
51
52 movl $3, %eax 34 movl $3, %eax
531: 351:
54 movq (%rdi,%rax,8), %rsi 36 movq (%rdi,%rax,8), %rsi
@@ -57,11 +39,29 @@ sha1_process_block64:
57 movq %rsi, -32(%rsp,%rax,8) 39 movq %rsi, -32(%rsp,%rax,8)
58 decl %eax 40 decl %eax
59 jns 1b 41 jns 1b
42
60 movl 80(%rdi), %eax # a = ctx->hash[0] 43 movl 80(%rdi), %eax # a = ctx->hash[0]
61 movl 84(%rdi), %ebx # b = ctx->hash[1] 44 movl 84(%rdi), %ebx # b = ctx->hash[1]
62 movl 88(%rdi), %ecx # c = ctx->hash[2] 45 movl 88(%rdi), %ecx # c = ctx->hash[2]
63 movl 92(%rdi), %edx # d = ctx->hash[3] 46 movl 92(%rdi), %edx # d = ctx->hash[3]
64 movl 96(%rdi), %ebp # e = ctx->hash[4] 47 movl 96(%rdi), %ebp # e = ctx->hash[4]
48
49 movq 4*8(%rdi), %r8
50 movq 4*10(%rdi), %r10
51 bswapq %r8
52 bswapq %r10
53 movq 4*12(%rdi), %r12
54 movq 4*14(%rdi), %r14
55 bswapq %r12
56 bswapq %r14
57 movl %r8d, %r9d
58 shrq $32, %r8
59 movl %r10d, %r11d
60 shrq $32, %r10
61 movl %r12d, %r13d
62 shrq $32, %r12
63 movl %r14d, %r15d
64 shrq $32, %r14
65' 65'
66W32() { 66W32() {
67test "$1" || exit 1 67test "$1" || exit 1
@@ -71,6 +71,13 @@ test "$1" -lt 8 && echo "-32+4*$1(%rsp)"
71test "$1" -ge 8 && echo "%r${1}d" 71test "$1" -ge 8 && echo "%r${1}d"
72} 72}
73 73
74# It's possible to interleave insns in rounds to mostly eliminate
75# dependency chains, but this is likely to help only old Pentium-based
76# CPUs (ones without OOO, which can only simultaneously execute a pair
77# of _adjacent_ insns).
78# Testing on an old-ish Silvermont CPU (which has an OOO window of only
79# about 8 insns) shows a very small (~1%) speedup.
80
74RD1A() { 81RD1A() {
75local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 82local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
76local n=$(($6)) 83local n=$(($6))
@@ -257,17 +264,17 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b
257 264
258echo " 265echo "
259 popq %rdi # 266 popq %rdi #
267 popq %r12 #
260 addl %eax, 80(%rdi) # ctx->hash[0] += a 268 addl %eax, 80(%rdi) # ctx->hash[0] += a
269 popq %r13 #
261 addl %ebx, 84(%rdi) # ctx->hash[1] += b 270 addl %ebx, 84(%rdi) # ctx->hash[1] += b
271 popq %r14 #
262 addl %ecx, 88(%rdi) # ctx->hash[2] += c 272 addl %ecx, 88(%rdi) # ctx->hash[2] += c
273 popq %r15 #
263 addl %edx, 92(%rdi) # ctx->hash[3] += d 274 addl %edx, 92(%rdi) # ctx->hash[3] += d
264 addl %ebp, 96(%rdi) # ctx->hash[4] += e
265 popq %rbx # 275 popq %rbx #
276 addl %ebp, 96(%rdi) # ctx->hash[4] += e
266 popq %rbp # 277 popq %rbp #
267 popq %r12 #
268 popq %r13 #
269 popq %r14 #
270 popq %r15 #
271 278
272 ret 279 ret
273 .size sha1_process_block64, .-sha1_process_block64 280 .size sha1_process_block64, .-sha1_process_block64