diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-04 01:45:13 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-04 01:45:52 +0100 |
commit | c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7 (patch) | |
tree | fc2c1bd26b585b8da0ba8cbe21b9b9ab745ef42c | |
parent | 1fc520ed286f815cae1da1e9f8014cb18a256744 (diff) | |
download | busybox-w32-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.tar.gz busybox-w32-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.tar.bz2 busybox-w32-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.zip |
libbb/sha1: x86_64 version: reorder prologue/epilogue insns
Not clear exactly why, but this increases hashing speed
on Skylake from 454 MB/s to 464 MB/s.
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | libbb/hash_md5_sha_x86-64.S | 60 | ||||
-rwxr-xr-x | libbb/hash_md5_sha_x86-64.S.sh | 67 |
2 files changed, 67 insertions, 60 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 95b85d80a..ff78fc049 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S | |||
@@ -6,14 +6,14 @@ | |||
6 | .hidden sha1_process_block64 | 6 | .hidden sha1_process_block64 |
7 | .type sha1_process_block64, @function | 7 | .type sha1_process_block64, @function |
8 | 8 | ||
9 | .balign 8 # allow decoders to fetch at least 4 first insns | 9 | .balign 8 # allow decoders to fetch at least 5 first insns |
10 | sha1_process_block64: | 10 | sha1_process_block64: |
11 | pushq %r15 # | 11 | pushq %rbp # 1 byte insn |
12 | pushq %r14 # | 12 | pushq %rbx # 1 byte insn |
13 | pushq %r13 # | 13 | pushq %r15 # 2 byte insn |
14 | pushq %r12 # | 14 | pushq %r14 # 2 byte insn |
15 | pushq %rbp # | 15 | pushq %r13 # 2 byte insn |
16 | pushq %rbx # | 16 | pushq %r12 # 2 byte insn |
17 | pushq %rdi # we need ctx at the end | 17 | pushq %rdi # we need ctx at the end |
18 | 18 | ||
19 | #Register and stack use: | 19 | #Register and stack use: |
@@ -22,24 +22,6 @@ sha1_process_block64: | |||
22 | # esi,edi: temps | 22 | # esi,edi: temps |
23 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] | 23 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] |
24 | # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) | 24 | # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) |
25 | |||
26 | movq 4*8(%rdi), %r8 | ||
27 | bswapq %r8 | ||
28 | movl %r8d, %r9d | ||
29 | shrq $32, %r8 | ||
30 | movq 4*10(%rdi), %r10 | ||
31 | bswapq %r10 | ||
32 | movl %r10d, %r11d | ||
33 | shrq $32, %r10 | ||
34 | movq 4*12(%rdi), %r12 | ||
35 | bswapq %r12 | ||
36 | movl %r12d, %r13d | ||
37 | shrq $32, %r12 | ||
38 | movq 4*14(%rdi), %r14 | ||
39 | bswapq %r14 | ||
40 | movl %r14d, %r15d | ||
41 | shrq $32, %r14 | ||
42 | |||
43 | movl $3, %eax | 25 | movl $3, %eax |
44 | 1: | 26 | 1: |
45 | movq (%rdi,%rax,8), %rsi | 27 | movq (%rdi,%rax,8), %rsi |
@@ -48,12 +30,30 @@ sha1_process_block64: | |||
48 | movq %rsi, -32(%rsp,%rax,8) | 30 | movq %rsi, -32(%rsp,%rax,8) |
49 | decl %eax | 31 | decl %eax |
50 | jns 1b | 32 | jns 1b |
33 | |||
51 | movl 80(%rdi), %eax # a = ctx->hash[0] | 34 | movl 80(%rdi), %eax # a = ctx->hash[0] |
52 | movl 84(%rdi), %ebx # b = ctx->hash[1] | 35 | movl 84(%rdi), %ebx # b = ctx->hash[1] |
53 | movl 88(%rdi), %ecx # c = ctx->hash[2] | 36 | movl 88(%rdi), %ecx # c = ctx->hash[2] |
54 | movl 92(%rdi), %edx # d = ctx->hash[3] | 37 | movl 92(%rdi), %edx # d = ctx->hash[3] |
55 | movl 96(%rdi), %ebp # e = ctx->hash[4] | 38 | movl 96(%rdi), %ebp # e = ctx->hash[4] |
56 | 39 | ||
40 | movq 4*8(%rdi), %r8 | ||
41 | movq 4*10(%rdi), %r10 | ||
42 | bswapq %r8 | ||
43 | bswapq %r10 | ||
44 | movq 4*12(%rdi), %r12 | ||
45 | movq 4*14(%rdi), %r14 | ||
46 | bswapq %r12 | ||
47 | bswapq %r14 | ||
48 | movl %r8d, %r9d | ||
49 | shrq $32, %r8 | ||
50 | movl %r10d, %r11d | ||
51 | shrq $32, %r10 | ||
52 | movl %r12d, %r13d | ||
53 | shrq $32, %r12 | ||
54 | movl %r14d, %r15d | ||
55 | shrq $32, %r14 | ||
56 | |||
57 | # 0 | 57 | # 0 |
58 | # W[0], already in %esi | 58 | # W[0], already in %esi |
59 | movl %ecx, %edi # c | 59 | movl %ecx, %edi # c |
@@ -1272,17 +1272,17 @@ sha1_process_block64: | |||
1272 | rorl $2, %ecx # b = rotl32(b,30) | 1272 | rorl $2, %ecx # b = rotl32(b,30) |
1273 | 1273 | ||
1274 | popq %rdi # | 1274 | popq %rdi # |
1275 | popq %r12 # | ||
1275 | addl %eax, 80(%rdi) # ctx->hash[0] += a | 1276 | addl %eax, 80(%rdi) # ctx->hash[0] += a |
1277 | popq %r13 # | ||
1276 | addl %ebx, 84(%rdi) # ctx->hash[1] += b | 1278 | addl %ebx, 84(%rdi) # ctx->hash[1] += b |
1279 | popq %r14 # | ||
1277 | addl %ecx, 88(%rdi) # ctx->hash[2] += c | 1280 | addl %ecx, 88(%rdi) # ctx->hash[2] += c |
1281 | popq %r15 # | ||
1278 | addl %edx, 92(%rdi) # ctx->hash[3] += d | 1282 | addl %edx, 92(%rdi) # ctx->hash[3] += d |
1279 | addl %ebp, 96(%rdi) # ctx->hash[4] += e | ||
1280 | popq %rbx # | 1283 | popq %rbx # |
1284 | addl %ebp, 96(%rdi) # ctx->hash[4] += e | ||
1281 | popq %rbp # | 1285 | popq %rbp # |
1282 | popq %r12 # | ||
1283 | popq %r13 # | ||
1284 | popq %r14 # | ||
1285 | popq %r15 # | ||
1286 | 1286 | ||
1287 | ret | 1287 | ret |
1288 | .size sha1_process_block64, .-sha1_process_block64 | 1288 | .size sha1_process_block64, .-sha1_process_block64 |
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index c5f0ef504..7e50b64fb 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh | |||
@@ -15,14 +15,14 @@ echo \ | |||
15 | .hidden sha1_process_block64 | 15 | .hidden sha1_process_block64 |
16 | .type sha1_process_block64, @function | 16 | .type sha1_process_block64, @function |
17 | 17 | ||
18 | .balign 8 # allow decoders to fetch at least 4 first insns | 18 | .balign 8 # allow decoders to fetch at least 5 first insns |
19 | sha1_process_block64: | 19 | sha1_process_block64: |
20 | pushq %r15 # | 20 | pushq %rbp # 1 byte insn |
21 | pushq %r14 # | 21 | pushq %rbx # 1 byte insn |
22 | pushq %r13 # | 22 | pushq %r15 # 2 byte insn |
23 | pushq %r12 # | 23 | pushq %r14 # 2 byte insn |
24 | pushq %rbp # | 24 | pushq %r13 # 2 byte insn |
25 | pushq %rbx # | 25 | pushq %r12 # 2 byte insn |
26 | pushq %rdi # we need ctx at the end | 26 | pushq %rdi # we need ctx at the end |
27 | 27 | ||
28 | #Register and stack use: | 28 | #Register and stack use: |
@@ -31,24 +31,6 @@ sha1_process_block64: | |||
31 | # esi,edi: temps | 31 | # esi,edi: temps |
32 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] | 32 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] |
33 | # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) | 33 | # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) |
34 | |||
35 | movq 4*8(%rdi), %r8 | ||
36 | bswapq %r8 | ||
37 | movl %r8d, %r9d | ||
38 | shrq $32, %r8 | ||
39 | movq 4*10(%rdi), %r10 | ||
40 | bswapq %r10 | ||
41 | movl %r10d, %r11d | ||
42 | shrq $32, %r10 | ||
43 | movq 4*12(%rdi), %r12 | ||
44 | bswapq %r12 | ||
45 | movl %r12d, %r13d | ||
46 | shrq $32, %r12 | ||
47 | movq 4*14(%rdi), %r14 | ||
48 | bswapq %r14 | ||
49 | movl %r14d, %r15d | ||
50 | shrq $32, %r14 | ||
51 | |||
52 | movl $3, %eax | 34 | movl $3, %eax |
53 | 1: | 35 | 1: |
54 | movq (%rdi,%rax,8), %rsi | 36 | movq (%rdi,%rax,8), %rsi |
@@ -57,11 +39,29 @@ sha1_process_block64: | |||
57 | movq %rsi, -32(%rsp,%rax,8) | 39 | movq %rsi, -32(%rsp,%rax,8) |
58 | decl %eax | 40 | decl %eax |
59 | jns 1b | 41 | jns 1b |
42 | |||
60 | movl 80(%rdi), %eax # a = ctx->hash[0] | 43 | movl 80(%rdi), %eax # a = ctx->hash[0] |
61 | movl 84(%rdi), %ebx # b = ctx->hash[1] | 44 | movl 84(%rdi), %ebx # b = ctx->hash[1] |
62 | movl 88(%rdi), %ecx # c = ctx->hash[2] | 45 | movl 88(%rdi), %ecx # c = ctx->hash[2] |
63 | movl 92(%rdi), %edx # d = ctx->hash[3] | 46 | movl 92(%rdi), %edx # d = ctx->hash[3] |
64 | movl 96(%rdi), %ebp # e = ctx->hash[4] | 47 | movl 96(%rdi), %ebp # e = ctx->hash[4] |
48 | |||
49 | movq 4*8(%rdi), %r8 | ||
50 | movq 4*10(%rdi), %r10 | ||
51 | bswapq %r8 | ||
52 | bswapq %r10 | ||
53 | movq 4*12(%rdi), %r12 | ||
54 | movq 4*14(%rdi), %r14 | ||
55 | bswapq %r12 | ||
56 | bswapq %r14 | ||
57 | movl %r8d, %r9d | ||
58 | shrq $32, %r8 | ||
59 | movl %r10d, %r11d | ||
60 | shrq $32, %r10 | ||
61 | movl %r12d, %r13d | ||
62 | shrq $32, %r12 | ||
63 | movl %r14d, %r15d | ||
64 | shrq $32, %r14 | ||
65 | ' | 65 | ' |
66 | W32() { | 66 | W32() { |
67 | test "$1" || exit 1 | 67 | test "$1" || exit 1 |
@@ -71,6 +71,13 @@ test "$1" -lt 8 && echo "-32+4*$1(%rsp)" | |||
71 | test "$1" -ge 8 && echo "%r${1}d" | 71 | test "$1" -ge 8 && echo "%r${1}d" |
72 | } | 72 | } |
73 | 73 | ||
74 | # It's possible to interleave insns in rounds to mostly eliminate | ||
75 | # dependency chains, but this is likely to only help old Pentium-based | |
76 | # CPUs (ones without OOO, which can only simultaneously execute a pair | ||
77 | # of _adjacent_ insns). | ||
78 | # Testing on an old-ish Silvermont CPU (which has an OOO window of only | |
79 | # ~8 insns) shows a very small (~1%) speedup. | |
80 | |||
74 | RD1A() { | 81 | RD1A() { |
75 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 | 82 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
76 | local n=$(($6)) | 83 | local n=$(($6)) |
@@ -257,17 +264,17 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b | |||
257 | 264 | ||
258 | echo " | 265 | echo " |
259 | popq %rdi # | 266 | popq %rdi # |
267 | popq %r12 # | ||
260 | addl %eax, 80(%rdi) # ctx->hash[0] += a | 268 | addl %eax, 80(%rdi) # ctx->hash[0] += a |
269 | popq %r13 # | ||
261 | addl %ebx, 84(%rdi) # ctx->hash[1] += b | 270 | addl %ebx, 84(%rdi) # ctx->hash[1] += b |
271 | popq %r14 # | ||
262 | addl %ecx, 88(%rdi) # ctx->hash[2] += c | 272 | addl %ecx, 88(%rdi) # ctx->hash[2] += c |
273 | popq %r15 # | ||
263 | addl %edx, 92(%rdi) # ctx->hash[3] += d | 274 | addl %edx, 92(%rdi) # ctx->hash[3] += d |
264 | addl %ebp, 96(%rdi) # ctx->hash[4] += e | ||
265 | popq %rbx # | 275 | popq %rbx # |
276 | addl %ebp, 96(%rdi) # ctx->hash[4] += e | ||
266 | popq %rbp # | 277 | popq %rbp # |
267 | popq %r12 # | ||
268 | popq %r13 # | ||
269 | popq %r14 # | ||
270 | popq %r15 # | ||
271 | 278 | ||
272 | ret | 279 | ret |
273 | .size sha1_process_block64, .-sha1_process_block64 | 280 | .size sha1_process_block64, .-sha1_process_block64 |