author    Denys Vlasenko <vda.linux@googlemail.com>  2022-02-11 14:53:26 +0100
committer Denys Vlasenko <vda.linux@googlemail.com>  2022-02-11 14:53:26 +0100
commit    dda77e83762861b52d62f0f161e2b4bf8092eacf (patch)
tree      9404e64d3dd15be73d9e089058b6b12e0317ca05
parent    8154146be491bc66ab34d5d5f2a2466ddbdcff52 (diff)
libbb/sha1: revert last commit: pshufb is a SSSE3 insn, can't use it
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--  libbb/hash_md5_sha256_x86-32_shaNI.S  |   4
-rw-r--r--  libbb/hash_md5_sha256_x86-64_shaNI.S  |   4
-rw-r--r--  libbb/hash_md5_sha_x86-32_shaNI.S     |   5
-rw-r--r--  libbb/hash_md5_sha_x86-64.S           | 127
-rwxr-xr-x  libbb/hash_md5_sha_x86-64.S.sh        | 133
-rw-r--r--  libbb/hash_md5_sha_x86-64_shaNI.S     |   5
6 files changed, 163 insertions, 115 deletions
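Context for the revert: the plain sha1_process_block64 in hash_md5_sha_x86-64.S is the fallback path that only assumes SSE2, so it must not contain pshufb, while the *_shaNI routines are reached only on CPUs whose SHA extensions also imply SSSE3 and SSE4.1 — which is why the comments added below merely document that assumption instead of testing for it. As an illustration only (a standalone probe, not busybox's dispatch code; the helper usage assumes GCC/clang's <cpuid.h> and the bit positions are the architectural CPUID feature bits), such a check could look like:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned a, b, c, d;
            int ssse3 = 0, sse41 = 0, sha = 0;

            if (__get_cpuid(1, &a, &b, &c, &d)) {
                    ssse3 = !!(c & (1u << 9));   /* CPUID.1:ECX.SSSE3  */
                    sse41 = !!(c & (1u << 19));  /* CPUID.1:ECX.SSE4.1 */
            }
            if (__get_cpuid_count(7, 0, &a, &b, &c, &d))
                    sha = !!(b & (1u << 29));    /* CPUID.7.0:EBX.SHA  */

            printf("ssse3=%d sse4.1=%d sha=%d\n", ssse3, sse41, sha);
            /* The shaNI files run only when sha=1, and SHA-capable CPUs
             * also have SSSE3/SSE4.1, so no separate check is made there.
             * The generic sha1_process_block64 runs on plain SSE2 hardware,
             * hence it may not use pshufb. */
            return 0;
    }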
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
index 4b33449d4..c059fb18d 100644
--- a/libbb/hash_md5_sha256_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -15,6 +15,10 @@
 //#define shuf128_32	pshufd
 #define shuf128_32	shufps
 
+// pshufb and palignr are SSSE3 insns.
+// We do not check SSSE3 in cpuid,
+// all SHA-capable CPUs support it as well.
+
 	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
 	.globl	sha256_process_block64_shaNI
 	.hidden	sha256_process_block64_shaNI
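For reference, the byte reversal that pshufb performs in these shaNI routines corresponds to SSSE3's _mm_shuffle_epi8 with a per-lane byte-reversal mask — the same lane order as the bswap32_mask constant removed from the non-shaNI file later in this patch. A minimal sketch, not busybox code (the function name is invented, and the unit must be built with SSSE3 enabled, e.g. -mssse3):

    #include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 (pshufb) */

    /* Byte-swap each of the four 32-bit lanes of v. */
    static __m128i bswap32x4(__m128i v)
    {
            /* control bytes 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
             * (_mm_set_epi8 lists them from lane 15 down to lane 0) */
            const __m128i mask = _mm_set_epi8(12, 13, 14, 15,  8,  9, 10, 11,
                                               4,  5,  6,  7,  0,  1,  2,  3);
            return _mm_shuffle_epi8(v, mask);  /* pshufb: SSSE3, not SSE2 */
    }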
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S
index 5ed80c2ef..9578441f8 100644
--- a/libbb/hash_md5_sha256_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-64_shaNI.S
@@ -15,6 +15,10 @@
 //#define shuf128_32	pshufd
 #define shuf128_32	shufps
 
+// pshufb and palignr are SSSE3 insns.
+// We do not check SSSE3 in cpuid,
+// all SHA-capable CPUs support it as well.
+
 	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
 	.globl	sha256_process_block64_shaNI
 	.hidden	sha256_process_block64_shaNI
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S
index c7fb243ce..2366b046a 100644
--- a/libbb/hash_md5_sha_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@@ -20,6 +20,11 @@
 #define extr128_32	pextrd
 //#define extr128_32	extractps	# not shorter
 
+// pshufb is a SSSE3 insn.
+// pinsrd, pextrd, extractps are SSE4.1 insns.
+// We do not check SSSE3/SSE4.1 in cpuid,
+// all SHA-capable CPUs support them as well.
+
 	.section	.text.sha1_process_block64_shaNI, "ax", @progbits
 	.globl	sha1_process_block64_shaNI
 	.hidden	sha1_process_block64_shaNI
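pextrd has no single-instruction SSE2 equivalent; on SSE2-only hardware one would go through pshufd plus movd instead, which is why code that may assume SSE4.1 (as the shaNI path can) simply uses pextrd. A small illustrative comparison, not part of the patch (hypothetical helpers; the first variant needs -msse4.1):

    #include <smmintrin.h>  /* SSE4.1: _mm_extract_epi32 (pextrd) */
    #include <stdint.h>

    /* SSE4.1 form: one pextrd. */
    static uint32_t lane2_sse41(__m128i v)
    {
            return (uint32_t)_mm_extract_epi32(v, 2);
    }

    /* SSE2-only fallback: pshufd the lane down, then movd. */
    static uint32_t lane2_sse2(__m128i v)
    {
            return (uint32_t)_mm_cvtsi128_si32(
                    _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 2, 2, 2)));
    }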
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 51fde082a..f0daa30f6 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -27,60 +27,68 @@ sha1_process_block64:
 # xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units
 
+	movl	80(%rdi), %eax		# a = ctx->hash[0]
+	movl	84(%rdi), %ebx		# b = ctx->hash[1]
+	movl	88(%rdi), %ecx		# c = ctx->hash[2]
+	movl	92(%rdi), %edx		# d = ctx->hash[3]
+	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+
 	movaps	sha1const(%rip), %xmm7
-	movaps	bswap32_mask(%rip), %xmm4
 	pshufd	$0x00, %xmm7, %xmm6
 
 	# Load W[] to xmm0..3, byteswapping on the fly.
 	#
-	# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
+	# For iterations 0..15, we pass W[] in rsi,r8..r14
 	# for use in RD1As instead of spilling them to stack.
+	# We lose parallelized addition of RCONST, but LEA
+	# can do two additions at once, so it is probably a wash.
 	# (We use rsi instead of rN because this makes two
-	# ADDs in two first RD1As shorter by one byte).
-	movups	16*0(%rdi), %xmm0
-	pshufb	%xmm4, %xmm0
-	movaps	%xmm0, %xmm5
-	paddd	%xmm6, %xmm5
-	movq	%xmm5, %rsi
-#	pextrq	$1, %xmm5, %r8	#SSE4.1 insn
-#	movhpd	%xmm5, %r8	#can only move to mem, not to reg
-	shufps	$0x0e, %xmm5, %xmm5
-	movq	%xmm5, %r8
-
-	movups	16*1(%rdi), %xmm1
-	pshufb	%xmm4, %xmm1
-	movaps	%xmm1, %xmm5
-	paddd	%xmm6, %xmm5
-	movq	%xmm5, %r9
-	shufps	$0x0e, %xmm5, %xmm5
-	movq	%xmm5, %r10
+	# LEAs in two first RD1As shorter by one byte).
+	movq	4*0(%rdi), %rsi
+	movq	4*2(%rdi), %r8
+	bswapq	%rsi
+	bswapq	%r8
+	rolq	$32, %rsi		# rsi = W[1]:W[0]
+	rolq	$32, %r8		# r8  = W[3]:W[2]
+	movq	%rsi, %xmm0
+	movq	%r8, %xmm4
+	punpcklqdq %xmm4, %xmm0		# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
+#	movaps	%xmm0, %xmm4		# add RCONST, spill to stack
+#	paddd	%xmm6, %xmm4
+#	movups	%xmm4, -64+16*0(%rsp)
 
-	movups	16*2(%rdi), %xmm2
-	pshufb	%xmm4, %xmm2
-	movaps	%xmm2, %xmm5
-	paddd	%xmm6, %xmm5
-	movq	%xmm5, %r11
-	shufps	$0x0e, %xmm5, %xmm5
-	movq	%xmm5, %r12
+	movq	4*4(%rdi), %r9
+	movq	4*6(%rdi), %r10
+	bswapq	%r9
+	bswapq	%r10
+	rolq	$32, %r9		# r9  = W[5]:W[4]
+	rolq	$32, %r10		# r10 = W[7]:W[6]
+	movq	%r9, %xmm1
+	movq	%r10, %xmm4
+	punpcklqdq %xmm4, %xmm1		# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
 
-	movups	16*3(%rdi), %xmm3
-	pshufb	%xmm4, %xmm3
-	movaps	%xmm3, %xmm5
-	paddd	%xmm6, %xmm5
-	movq	%xmm5, %r13
-	shufps	$0x0e, %xmm5, %xmm5
-	movq	%xmm5, %r14
+	movq	4*8(%rdi), %r11
+	movq	4*10(%rdi), %r12
+	bswapq	%r11
+	bswapq	%r12
+	rolq	$32, %r11		# r11 = W[9]:W[8]
+	rolq	$32, %r12		# r12 = W[11]:W[10]
+	movq	%r11, %xmm2
+	movq	%r12, %xmm4
+	punpcklqdq %xmm4, %xmm2		# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
 
-	# MOVQs to GPRs (above) have somewhat high latency.
-	# Load hash[] while they are completing:
-	movl	80(%rdi), %eax		# a = ctx->hash[0]
-	movl	84(%rdi), %ebx		# b = ctx->hash[1]
-	movl	88(%rdi), %ecx		# c = ctx->hash[2]
-	movl	92(%rdi), %edx		# d = ctx->hash[3]
-	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+	movq	4*12(%rdi), %r13
+	movq	4*14(%rdi), %r14
+	bswapq	%r13
+	bswapq	%r14
+	rolq	$32, %r13		# r13 = W[13]:W[12]
+	rolq	$32, %r14		# r14 = W[15]:W[14]
+	movq	%r13, %xmm3
+	movq	%r14, %xmm4
+	punpcklqdq %xmm4, %xmm3		# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
 
 # 0
-	addl	%esi, %ebp		# e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%rsi), %ebp	# e += RCONST + W[n]
 	shrq	$32, %rsi
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
@@ -92,7 +100,7 @@ sha1_process_block64:
 	addl	%edi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 1
-	addl	%esi, %edx		# e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%rsi), %edx	# e += RCONST + W[n]
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	andl	%eax, %edi		# &b
@@ -103,7 +111,7 @@ sha1_process_block64:
 	addl	%edi, %edx		# e += rotl32(a,5)
 	rorl	$2, %eax		# b = rotl32(b,30)
 # 2
-	addl	%r8d, %ecx		# e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%r8), %ecx	# e += RCONST + W[n]
 	shrq	$32, %r8
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
@@ -115,7 +123,7 @@ sha1_process_block64:
 	addl	%edi, %ecx		# e += rotl32(a,5)
 	rorl	$2, %ebp		# b = rotl32(b,30)
 # 3
-	addl	%r8d, %ebx		# e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%r8), %ebx	# e += RCONST + W[n]
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	andl	%edx, %edi		# &b
@@ -126,7 +134,7 @@ sha1_process_block64:
 	addl	%edi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 4
-	addl	%r9d, %eax		# e += RCONST + W[n]
+	leal	0x5A827999(%rax,%r9), %eax	# e += RCONST + W[n]
 	shrq	$32, %r9
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
@@ -138,7 +146,7 @@ sha1_process_block64:
 	addl	%edi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 5
-	addl	%r9d, %ebp		# e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%r9), %ebp	# e += RCONST + W[n]
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	andl	%ebx, %edi		# &b
@@ -149,7 +157,7 @@ sha1_process_block64:
 	addl	%edi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 6
-	addl	%r10d, %edx		# e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%r10), %edx	# e += RCONST + W[n]
 	shrq	$32, %r10
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
@@ -161,7 +169,7 @@ sha1_process_block64:
 	addl	%edi, %edx		# e += rotl32(a,5)
 	rorl	$2, %eax		# b = rotl32(b,30)
 # 7
-	addl	%r10d, %ecx		# e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%r10), %ecx	# e += RCONST + W[n]
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
 	andl	%ebp, %edi		# &b
@@ -202,7 +210,7 @@ sha1_process_block64:
 	paddd	%xmm6, %xmm5
 	movups	%xmm5, -64+16*0(%rsp)
 # 8
-	addl	%r11d, %ebx		# e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%r11), %ebx	# e += RCONST + W[n]
 	shrq	$32, %r11
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
@@ -214,7 +222,7 @@ sha1_process_block64:
 	addl	%edi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 9
-	addl	%r11d, %eax		# e += RCONST + W[n]
+	leal	0x5A827999(%rax,%r11), %eax	# e += RCONST + W[n]
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
 	andl	%ecx, %edi		# &b
@@ -225,7 +233,7 @@ sha1_process_block64:
 	addl	%edi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 10
-	addl	%r12d, %ebp		# e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%r12), %ebp	# e += RCONST + W[n]
 	shrq	$32, %r12
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
@@ -237,7 +245,7 @@ sha1_process_block64:
 	addl	%edi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 11
-	addl	%r12d, %edx		# e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%r12), %edx	# e += RCONST + W[n]
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	andl	%eax, %edi		# &b
@@ -279,7 +287,7 @@ sha1_process_block64:
 	paddd	%xmm6, %xmm5
 	movups	%xmm5, -64+16*1(%rsp)
 # 12
-	addl	%r13d, %ecx		# e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%r13), %ecx	# e += RCONST + W[n]
 	shrq	$32, %r13
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
@@ -291,7 +299,7 @@ sha1_process_block64:
 	addl	%edi, %ecx		# e += rotl32(a,5)
 	rorl	$2, %ebp		# b = rotl32(b,30)
 # 13
-	addl	%r13d, %ebx		# e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%r13), %ebx	# e += RCONST + W[n]
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	andl	%edx, %edi		# &b
@@ -302,7 +310,7 @@ sha1_process_block64:
 	addl	%edi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 14
-	addl	%r14d, %eax		# e += RCONST + W[n]
+	leal	0x5A827999(%rax,%r14), %eax	# e += RCONST + W[n]
 	shrq	$32, %r14
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
@@ -314,7 +322,7 @@ sha1_process_block64:
 	addl	%edi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 15
-	addl	%r14d, %ebp		# e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%r14), %ebp	# e += RCONST + W[n]
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	andl	%ebx, %edi		# &b
@@ -1467,11 +1475,6 @@ sha1_process_block64:
 	ret
 	.size	sha1_process_block64, .-sha1_process_block64
 
-	.section	.rodata.cst16.bswap32_mask, "aM", @progbits, 16
-	.balign	16
-bswap32_mask:
-	.octa	0x0c0d0e0f08090a0b0405060700010203
-
 	.section	.rodata.cst16.sha1const, "aM", @progbits, 16
 	.balign	16
 sha1const:
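The replacement prologue above loads the big-endian message words with plain SSE2/GPR instructions instead of pshufb: each movq/bswapq/rolq triple yields a 64-bit value whose low half is W[n] and whose high half is W[n+1]; the round code consumes the low half (the leal also folds in RCONST) and shrq $32 then exposes W[n+1]. A sketch of the equivalent computation in C — the helper name is invented, this is not busybox source:

    #include <stdint.h>
    #include <string.h>

    /* Equivalent of: movq (8 block bytes) -> reg; bswapq; rolq $32.
     * Returns W[n+1]:W[n] -- W[n] in the low 32 bits, W[n+1] in the high 32. */
    static uint64_t load_w_pair(const unsigned char *p)
    {
            uint64_t q;
            memcpy(&q, p, 8);                /* unaligned 64-bit load (movq)   */
            q = __builtin_bswap64(q);        /* bswapq: W[n]:W[n+1], host order */
            return (q << 32) | (q >> 32);    /* rolq $32: swap halves           */
    }

    /* Round 0 then does, in effect:
     *   e += 0x5A827999 + (uint32_t)q;   q >>= 32;   ... use q for round 1 ... */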
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index f34e6e6fa..57e77b118 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -99,6 +99,30 @@ INTERLEAVE() {
 )
 }
 
+# movaps	bswap32_mask(%rip), $xmmT1
+# Load W[] to xmm0..3, byteswapping on the fly.
+# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
+# for use in RD1As instead of spilling them to stack.
+# (We use rsi instead of rN because this makes two
+# ADDs in two first RD1As shorter by one byte).
+#	movups	16*0(%rdi), %xmm0
+#	pshufb	$xmmT1, %xmm0		#SSSE3 insn
+#	movaps	%xmm0, $xmmT2
+#	paddd	$xmmRCONST, $xmmT2
+#	movq	$xmmT2, %rsi
+#	#pextrq	\$1, $xmmT2, %r8	#SSE4.1 insn
+#	#movhpd	$xmmT2, %r8		#can only move to mem, not to reg
+#	shufps	\$0x0e, $xmmT2, $xmmT2	# have to use two-insn sequence
+#	movq	$xmmT2, %r8		# instead
+#	...
+#	<repeat for xmm1,2,3>
+#	...
+#-	leal	$RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
+#+	addl	%esi, %e$e		# e += RCONST + W[n]
+#	^^^^^^^^^^^^^^^^^^^^^^^^
+# The above is -97 bytes of code...
+# ...but pshufb is a SSSE3 insn. Can't use it.
+
 echo \
 "### Generated by hash_md5_sha_x86-64.S.sh ###
 
@@ -129,57 +153,65 @@ sha1_process_block64:
 # xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units
 
+	movl	80(%rdi), %eax		# a = ctx->hash[0]
+	movl	84(%rdi), %ebx		# b = ctx->hash[1]
+	movl	88(%rdi), %ecx		# c = ctx->hash[2]
+	movl	92(%rdi), %edx		# d = ctx->hash[3]
+	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+
 	movaps	sha1const(%rip), $xmmALLRCONST
-	movaps	bswap32_mask(%rip), $xmmT1
 	pshufd	\$0x00, $xmmALLRCONST, $xmmRCONST
 
 	# Load W[] to xmm0..3, byteswapping on the fly.
 	#
-	# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
+	# For iterations 0..15, we pass W[] in rsi,r8..r14
 	# for use in RD1As instead of spilling them to stack.
+	# We lose parallelized addition of RCONST, but LEA
+	# can do two additions at once, so it is probably a wash.
 	# (We use rsi instead of rN because this makes two
-	# ADDs in two first RD1As shorter by one byte).
-	movups	16*0(%rdi), %xmm0
-	pshufb	$xmmT1, %xmm0
-	movaps	%xmm0, $xmmT2
-	paddd	$xmmRCONST, $xmmT2
-	movq	$xmmT2, %rsi
-#	pextrq	\$1, $xmmT2, %r8	#SSE4.1 insn
-#	movhpd	$xmmT2, %r8		#can only move to mem, not to reg
-	shufps	\$0x0e, $xmmT2, $xmmT2
-	movq	$xmmT2, %r8
-
-	movups	16*1(%rdi), %xmm1
-	pshufb	$xmmT1, %xmm1
-	movaps	%xmm1, $xmmT2
-	paddd	$xmmRCONST, $xmmT2
-	movq	$xmmT2, %r9
-	shufps	\$0x0e, $xmmT2, $xmmT2
-	movq	$xmmT2, %r10
-
-	movups	16*2(%rdi), %xmm2
-	pshufb	$xmmT1, %xmm2
-	movaps	%xmm2, $xmmT2
-	paddd	$xmmRCONST, $xmmT2
-	movq	$xmmT2, %r11
-	shufps	\$0x0e, $xmmT2, $xmmT2
-	movq	$xmmT2, %r12
-
-	movups	16*3(%rdi), %xmm3
-	pshufb	$xmmT1, %xmm3
-	movaps	%xmm3, $xmmT2
-	paddd	$xmmRCONST, $xmmT2
-	movq	$xmmT2, %r13
-	shufps	\$0x0e, $xmmT2, $xmmT2
-	movq	$xmmT2, %r14
-
-	# MOVQs to GPRs (above) have somewhat high latency.
-	# Load hash[] while they are completing:
-	movl	80(%rdi), %eax		# a = ctx->hash[0]
-	movl	84(%rdi), %ebx		# b = ctx->hash[1]
-	movl	88(%rdi), %ecx		# c = ctx->hash[2]
-	movl	92(%rdi), %edx		# d = ctx->hash[3]
-	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+	# LEAs in two first RD1As shorter by one byte).
+	movq	4*0(%rdi), %rsi
+	movq	4*2(%rdi), %r8
+	bswapq	%rsi
+	bswapq	%r8
+	rolq	\$32, %rsi		# rsi = W[1]:W[0]
+	rolq	\$32, %r8		# r8  = W[3]:W[2]
+	movq	%rsi, %xmm0
+	movq	%r8, $xmmT1
+	punpcklqdq $xmmT1, %xmm0	# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
+#	movaps	%xmm0, $xmmT1		# add RCONST, spill to stack
+#	paddd	$xmmRCONST, $xmmT1
+#	movups	$xmmT1, -64+16*0(%rsp)
+
+	movq	4*4(%rdi), %r9
+	movq	4*6(%rdi), %r10
+	bswapq	%r9
+	bswapq	%r10
+	rolq	\$32, %r9		# r9  = W[5]:W[4]
+	rolq	\$32, %r10		# r10 = W[7]:W[6]
+	movq	%r9, %xmm1
+	movq	%r10, $xmmT1
+	punpcklqdq $xmmT1, %xmm1	# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
+
+	movq	4*8(%rdi), %r11
+	movq	4*10(%rdi), %r12
+	bswapq	%r11
+	bswapq	%r12
+	rolq	\$32, %r11		# r11 = W[9]:W[8]
+	rolq	\$32, %r12		# r12 = W[11]:W[10]
+	movq	%r11, %xmm2
+	movq	%r12, $xmmT1
+	punpcklqdq $xmmT1, %xmm2	# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
+
+	movq	4*12(%rdi), %r13
+	movq	4*14(%rdi), %r14
+	bswapq	%r13
+	bswapq	%r14
+	rolq	\$32, %r13		# r13 = W[13]:W[12]
+	rolq	\$32, %r14		# r14 = W[15]:W[14]
+	movq	%r13, %xmm3
+	movq	%r14, $xmmT1
+	punpcklqdq $xmmT1, %xmm3	# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
 "
 
 PREP() {
@@ -258,15 +290,15 @@ local rN=$((7+n0/2))
 echo "
 # $n
 ";test $n0 = 0 && echo "
-	addl	%esi, %e$e		# e += RCONST + W[n]
+	leal	$RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
 	shrq	\$32, %rsi
 ";test $n0 = 1 && echo "
-	addl	%esi, %e$e		# e += RCONST + W[n]
+	leal	$RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
 ";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
-	addl	%r${rN}d, %e$e		# e += RCONST + W[n]
+	leal	$RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
 	shrq	\$32, %r$rN
 ";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
-	addl	%r${rN}d, %e$e		# e += RCONST + W[n]
+	leal	$RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
 ";echo "
 	movl	%e$c, %edi		# c
 	xorl	%e$d, %edi		# ^d
@@ -432,11 +464,6 @@ echo "
 	ret
 	.size	sha1_process_block64, .-sha1_process_block64
 
-	.section	.rodata.cst16.bswap32_mask, \"aM\", @progbits, 16
-	.balign	16
-bswap32_mask:
-	.octa	0x0c0d0e0f08090a0b0405060700010203
-
 	.section	.rodata.cst16.sha1const, \"aM\", @progbits, 16
 	.balign	16
 sha1const:
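For reference, what the generator's RD1A rounds (iterations 0..15) compute, written out in C. This is a hedged rendition, not busybox's C implementation; the function and parameter names are invented. The new leal folds the round constant and the message word into one instruction; the rest is the standard SHA-1 round visible in the surrounding context lines:

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, unsigned n)
    {
            return (x << n) | (x >> (32 - n));
    }

    /* One RD1A step, matching the emitted leal/movl/xorl/andl/xorl/addl/rorl. */
    static void rd1a(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                     uint32_t *e, uint32_t wn)
    {
            uint32_t f = ((c ^ d) & *b) ^ d;   /* F1(b,c,d) = (b&c) | (~b&d)      */
            *e += 0x5A827999 + wn;             /* leal RCONST(%rE,%rN), %eE       */
            *e += f + rotl32(a, 5);            /* e += F1 + rotl32(a,5)           */
            *b = rotl32(*b, 30);               /* rorl $2, %eB: b = rotl32(b,30)  */
    }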
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
index c13cdec07..794e97040 100644
--- a/libbb/hash_md5_sha_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -20,6 +20,11 @@
 #define extr128_32	pextrd
 //#define extr128_32	extractps	# not shorter
 
+// pshufb is a SSSE3 insn.
+// pinsrd, pextrd, extractps are SSE4.1 insns.
+// We do not check SSSE3/SSE4.1 in cpuid,
+// all SHA-capable CPUs support them as well.
+
 	.section	.text.sha1_process_block64_shaNI, "ax", @progbits
 	.globl	sha1_process_block64_shaNI
 	.hidden	sha1_process_block64_shaNI