author    Denys Vlasenko <vda.linux@googlemail.com>  2022-02-11 06:08:27 +0100
committer Denys Vlasenko <vda.linux@googlemail.com>  2022-02-11 06:08:27 +0100
commit    8154146be491bc66ab34d5d5f2a2466ddbdcff52 (patch)
tree      4e890a52e047c466ca05c8263230a6a110884e52
parent    6f56fa17131b3cbb84e887c6c5fb202f2492169e (diff)
libbb/sha1: shrink unrolled x86-64 code
function                old     new   delta
sha1_process_block64   3481    3384     -97

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--  libbb/hash_md5_sha_x86-64.S     | 129
-rwxr-xr-x  libbb/hash_md5_sha_x86-64.S.sh  | 111
2 files changed, 117 insertions, 123 deletions
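
Background for the diff below: a rough C sketch, not part of the patch (the helper names rotl32 and rd1a are made up here), of one unrolled "RD1A" round, showing what the recurring comment "e += RCONST + W[n]" computes. The point of the change is that RCONST is now folded into W[n] by the vector unit (paddd), so each scalar round only needs a plain addl where it previously needed a leal with the 0x5A827999 constant embedded.

/* Illustrative only; compile with: cc -O2 sha1_rd1a_sketch.c */
#include <stdint.h>
#include <stdio.h>

static uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* One of SHA-1 rounds 0..19; kw already holds RCONST (0x5A827999) + W[n]. */
static void rd1a(uint32_t s[5], uint32_t kw)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4];

	e += kw;			/* addl %esi, %ebp  (K+W pre-summed)   */
	e += ((c ^ d) & b) ^ d;		/* F1(b,c,d): the xorl/andl/xorl trio  */
	e += rotl32(a, 5);		/* roll $5 + addl                      */
	b = rotl32(b, 30);		/* rorl $2                             */

	s[0] = e; s[1] = a; s[2] = b; s[3] = c; s[4] = d;
}

int main(void)
{
	uint32_t s[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
	rd1a(s, 0x5A827999 + 0x00000000);	/* W[0] = 0, just for the demo */
	printf("%08x %08x %08x %08x %08x\n",
		(unsigned)s[0], (unsigned)s[1], (unsigned)s[2],
		(unsigned)s[3], (unsigned)s[4]);
	return 0;
}
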
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 287cfe547..51fde082a 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -27,68 +27,60 @@ sha1_process_block64:
 # xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units
 
-	movl	80(%rdi), %eax		# a = ctx->hash[0]
-	movl	84(%rdi), %ebx		# b = ctx->hash[1]
-	movl	88(%rdi), %ecx		# c = ctx->hash[2]
-	movl	92(%rdi), %edx		# d = ctx->hash[3]
-	movl	96(%rdi), %ebp		# e = ctx->hash[4]
-
 	movaps	sha1const(%rip), %xmm7
+	movaps	bswap32_mask(%rip), %xmm4
 	pshufd	$0x00, %xmm7, %xmm6
 
-	# Load W[] to xmm registers, byteswapping on the fly.
+	# Load W[] to xmm0..3, byteswapping on the fly.
 	#
-	# For iterations 0..15, we pass W[] in rsi,r8..r14
+	# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
 	# for use in RD1As instead of spilling them to stack.
-	# We lose parallelized addition of RCONST, but LEA
-	# can do two additions at once, so it is probably a wash.
 	# (We use rsi instead of rN because this makes two
-	# LEAs in two first RD1As shorter by one byte).
-	movq	4*0(%rdi), %rsi
-	movq	4*2(%rdi), %r8
-	bswapq	%rsi
-	bswapq	%r8
-	rolq	$32, %rsi		# rsi = W[1]:W[0]
-	rolq	$32, %r8		# r8 = W[3]:W[2]
-	movq	%rsi, %xmm0
-	movq	%r8, %xmm4
-	punpcklqdq %xmm4, %xmm0		# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
-#	movaps	%xmm0, %xmm4		# add RCONST, spill to stack
-#	paddd	%xmm6, %xmm4
-#	movups	%xmm4, -64+16*0(%rsp)
+	# ADDs in two first RD1As shorter by one byte).
+	movups	16*0(%rdi), %xmm0
+	pshufb	%xmm4, %xmm0
+	movaps	%xmm0, %xmm5
+	paddd	%xmm6, %xmm5
+	movq	%xmm5, %rsi
+#	pextrq	$1, %xmm5, %r8	#SSE4.1 insn
+#	movhpd	%xmm5, %r8	#can only move to mem, not to reg
+	shufps	$0x0e, %xmm5, %xmm5
+	movq	%xmm5, %r8
+
+	movups	16*1(%rdi), %xmm1
+	pshufb	%xmm4, %xmm1
+	movaps	%xmm1, %xmm5
+	paddd	%xmm6, %xmm5
+	movq	%xmm5, %r9
+	shufps	$0x0e, %xmm5, %xmm5
+	movq	%xmm5, %r10
 
-	movq	4*4(%rdi), %r9
-	movq	4*6(%rdi), %r10
-	bswapq	%r9
-	bswapq	%r10
-	rolq	$32, %r9		# r9 = W[5]:W[4]
-	rolq	$32, %r10		# r10 = W[7]:W[6]
-	movq	%r9, %xmm1
-	movq	%r10, %xmm4
-	punpcklqdq %xmm4, %xmm1		# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
+	movups	16*2(%rdi), %xmm2
+	pshufb	%xmm4, %xmm2
+	movaps	%xmm2, %xmm5
+	paddd	%xmm6, %xmm5
+	movq	%xmm5, %r11
+	shufps	$0x0e, %xmm5, %xmm5
+	movq	%xmm5, %r12
 
-	movq	4*8(%rdi), %r11
-	movq	4*10(%rdi), %r12
-	bswapq	%r11
-	bswapq	%r12
-	rolq	$32, %r11		# r11 = W[9]:W[8]
-	rolq	$32, %r12		# r12 = W[11]:W[10]
-	movq	%r11, %xmm2
-	movq	%r12, %xmm4
-	punpcklqdq %xmm4, %xmm2		# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
+	movups	16*3(%rdi), %xmm3
+	pshufb	%xmm4, %xmm3
+	movaps	%xmm3, %xmm5
+	paddd	%xmm6, %xmm5
+	movq	%xmm5, %r13
+	shufps	$0x0e, %xmm5, %xmm5
+	movq	%xmm5, %r14
 
-	movq	4*12(%rdi), %r13
-	movq	4*14(%rdi), %r14
-	bswapq	%r13
-	bswapq	%r14
-	rolq	$32, %r13		# r13 = W[13]:W[12]
-	rolq	$32, %r14		# r14 = W[15]:W[14]
-	movq	%r13, %xmm3
-	movq	%r14, %xmm4
-	punpcklqdq %xmm4, %xmm3		# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
+	# MOVQs to GPRs (above) have somewhat high latency.
+	# Load hash[] while they are completing:
+	movl	80(%rdi), %eax		# a = ctx->hash[0]
+	movl	84(%rdi), %ebx		# b = ctx->hash[1]
+	movl	88(%rdi), %ecx		# c = ctx->hash[2]
+	movl	92(%rdi), %edx		# d = ctx->hash[3]
+	movl	96(%rdi), %ebp		# e = ctx->hash[4]
 
 # 0
-	leal	0x5A827999(%rbp,%rsi), %ebp	# e += RCONST + W[n]
+	addl	%esi, %ebp		# e += RCONST + W[n]
 	shrq	$32, %rsi
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
@@ -100,7 +92,7 @@ sha1_process_block64:
 	addl	%edi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx	# b = rotl32(b,30)
 # 1
-	leal	0x5A827999(%rdx,%rsi), %edx	# e += RCONST + W[n]
+	addl	%esi, %edx		# e += RCONST + W[n]
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	andl	%eax, %edi		# &b
@@ -111,7 +103,7 @@ sha1_process_block64:
 	addl	%edi, %edx		# e += rotl32(a,5)
 	rorl	$2, %eax	# b = rotl32(b,30)
 # 2
-	leal	0x5A827999(%rcx,%r8), %ecx	# e += RCONST + W[n]
+	addl	%r8d, %ecx		# e += RCONST + W[n]
 	shrq	$32, %r8
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
@@ -123,7 +115,7 @@ sha1_process_block64:
 	addl	%edi, %ecx		# e += rotl32(a,5)
 	rorl	$2, %ebp	# b = rotl32(b,30)
 # 3
-	leal	0x5A827999(%rbx,%r8), %ebx	# e += RCONST + W[n]
+	addl	%r8d, %ebx		# e += RCONST + W[n]
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	andl	%edx, %edi		# &b
@@ -134,7 +126,7 @@ sha1_process_block64:
 	addl	%edi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx	# b = rotl32(b,30)
 # 4
-	leal	0x5A827999(%rax,%r9), %eax	# e += RCONST + W[n]
+	addl	%r9d, %eax		# e += RCONST + W[n]
 	shrq	$32, %r9
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
@@ -146,7 +138,7 @@ sha1_process_block64:
 	addl	%edi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx	# b = rotl32(b,30)
 # 5
-	leal	0x5A827999(%rbp,%r9), %ebp	# e += RCONST + W[n]
+	addl	%r9d, %ebp		# e += RCONST + W[n]
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	andl	%ebx, %edi		# &b
@@ -157,7 +149,7 @@ sha1_process_block64:
 	addl	%edi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx	# b = rotl32(b,30)
 # 6
-	leal	0x5A827999(%rdx,%r10), %edx	# e += RCONST + W[n]
+	addl	%r10d, %edx		# e += RCONST + W[n]
 	shrq	$32, %r10
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
@@ -169,7 +161,7 @@ sha1_process_block64:
 	addl	%edi, %edx		# e += rotl32(a,5)
 	rorl	$2, %eax	# b = rotl32(b,30)
 # 7
-	leal	0x5A827999(%rcx,%r10), %ecx	# e += RCONST + W[n]
+	addl	%r10d, %ecx		# e += RCONST + W[n]
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
 	andl	%ebp, %edi		# &b
@@ -210,7 +202,7 @@ sha1_process_block64:
 	paddd	%xmm6, %xmm5
 	movups	%xmm5, -64+16*0(%rsp)
 # 8
-	leal	0x5A827999(%rbx,%r11), %ebx	# e += RCONST + W[n]
+	addl	%r11d, %ebx		# e += RCONST + W[n]
 	shrq	$32, %r11
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
@@ -222,7 +214,7 @@ sha1_process_block64:
 	addl	%edi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx	# b = rotl32(b,30)
 # 9
-	leal	0x5A827999(%rax,%r11), %eax	# e += RCONST + W[n]
+	addl	%r11d, %eax		# e += RCONST + W[n]
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
 	andl	%ecx, %edi		# &b
@@ -233,7 +225,7 @@ sha1_process_block64:
 	addl	%edi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx	# b = rotl32(b,30)
 # 10
-	leal	0x5A827999(%rbp,%r12), %ebp	# e += RCONST + W[n]
+	addl	%r12d, %ebp		# e += RCONST + W[n]
 	shrq	$32, %r12
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
@@ -245,7 +237,7 @@ sha1_process_block64:
 	addl	%edi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx	# b = rotl32(b,30)
 # 11
-	leal	0x5A827999(%rdx,%r12), %edx	# e += RCONST + W[n]
+	addl	%r12d, %edx		# e += RCONST + W[n]
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	andl	%eax, %edi		# &b
@@ -287,7 +279,7 @@ sha1_process_block64:
 	paddd	%xmm6, %xmm5
 	movups	%xmm5, -64+16*1(%rsp)
 # 12
-	leal	0x5A827999(%rcx,%r13), %ecx	# e += RCONST + W[n]
+	addl	%r13d, %ecx		# e += RCONST + W[n]
 	shrq	$32, %r13
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
@@ -299,7 +291,7 @@ sha1_process_block64:
 	addl	%edi, %ecx		# e += rotl32(a,5)
 	rorl	$2, %ebp	# b = rotl32(b,30)
 # 13
-	leal	0x5A827999(%rbx,%r13), %ebx	# e += RCONST + W[n]
+	addl	%r13d, %ebx		# e += RCONST + W[n]
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	andl	%edx, %edi		# &b
@@ -310,7 +302,7 @@ sha1_process_block64:
 	addl	%edi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx	# b = rotl32(b,30)
 # 14
-	leal	0x5A827999(%rax,%r14), %eax	# e += RCONST + W[n]
+	addl	%r14d, %eax		# e += RCONST + W[n]
 	shrq	$32, %r14
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
@@ -322,7 +314,7 @@ sha1_process_block64:
 	addl	%edi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx	# b = rotl32(b,30)
 # 15
-	leal	0x5A827999(%rbp,%r14), %ebp	# e += RCONST + W[n]
+	addl	%r14d, %ebp		# e += RCONST + W[n]
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	andl	%ebx, %edi		# &b
@@ -1475,6 +1467,11 @@ sha1_process_block64:
 	ret
 	.size	sha1_process_block64, .-sha1_process_block64
 
+	.section	.rodata.cst16.bswap32_mask, "aM", @progbits, 16
+	.balign	16
+bswap32_mask:
+	.octa	0x0c0d0e0f08090a0b0405060700010203
+
 	.section	.rodata.cst16.sha1const, "aM", @progbits, 16
 	.balign	16
 sha1const:
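
For reference, a small intrinsics sketch (not part of the repository; compile with -mssse3, variable names are illustrative) of what the new bswap32_mask constant above does: pshufb with this control byte-reverses each 32-bit lane, so one instruction byteswaps four W[] words at once and replaces the old bswapq/rolq/punpcklqdq sequence.

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>	/* SSSE3: _mm_shuffle_epi8 */

int main(void)
{
	/* Same byte pattern as ".octa 0x0c0d0e0f08090a0b0405060700010203",
	 * written out in little-endian memory order. */
	const __m128i bswap32_mask = _mm_setr_epi8(
		3, 2, 1, 0,  7, 6, 5, 4,  11, 10, 9, 8,  15, 14, 13, 12);

	uint32_t w[4] = { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
	__m128i v = _mm_loadu_si128((const __m128i *)w);	/* movups  */
	v = _mm_shuffle_epi8(v, bswap32_mask);			/* pshufb  */
	_mm_storeu_si128((__m128i *)w, v);

	/* Each word is now byte-reversed:
	 * 03020100 07060504 0b0a0908 0f0e0d0c */
	printf("%08x %08x %08x %08x\n",
		(unsigned)w[0], (unsigned)w[1], (unsigned)w[2], (unsigned)w[3]);
	return 0;
}
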
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index a10ac411d..f34e6e6fa 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -129,65 +129,57 @@ sha1_process_block64:
 # xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units
 
-	movl	80(%rdi), %eax		# a = ctx->hash[0]
-	movl	84(%rdi), %ebx		# b = ctx->hash[1]
-	movl	88(%rdi), %ecx		# c = ctx->hash[2]
-	movl	92(%rdi), %edx		# d = ctx->hash[3]
-	movl	96(%rdi), %ebp		# e = ctx->hash[4]
-
 	movaps	sha1const(%rip), $xmmALLRCONST
+	movaps	bswap32_mask(%rip), $xmmT1
 	pshufd	\$0x00, $xmmALLRCONST, $xmmRCONST
 
-	# Load W[] to xmm registers, byteswapping on the fly.
+	# Load W[] to xmm0..3, byteswapping on the fly.
 	#
-	# For iterations 0..15, we pass W[] in rsi,r8..r14
+	# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
 	# for use in RD1As instead of spilling them to stack.
-	# We lose parallelized addition of RCONST, but LEA
-	# can do two additions at once, so it is probably a wash.
 	# (We use rsi instead of rN because this makes two
-	# LEAs in two first RD1As shorter by one byte).
-	movq	4*0(%rdi), %rsi
-	movq	4*2(%rdi), %r8
-	bswapq	%rsi
-	bswapq	%r8
-	rolq	\$32, %rsi		# rsi = W[1]:W[0]
-	rolq	\$32, %r8		# r8 = W[3]:W[2]
-	movq	%rsi, %xmm0
-	movq	%r8, $xmmT1
-	punpcklqdq $xmmT1, %xmm0	# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
-#	movaps	%xmm0, $xmmT1		# add RCONST, spill to stack
-#	paddd	$xmmRCONST, $xmmT1
-#	movups	$xmmT1, -64+16*0(%rsp)
-
-	movq	4*4(%rdi), %r9
-	movq	4*6(%rdi), %r10
-	bswapq	%r9
-	bswapq	%r10
-	rolq	\$32, %r9		# r9 = W[5]:W[4]
-	rolq	\$32, %r10		# r10 = W[7]:W[6]
-	movq	%r9, %xmm1
-	movq	%r10, $xmmT1
-	punpcklqdq $xmmT1, %xmm1	# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
-
-	movq	4*8(%rdi), %r11
-	movq	4*10(%rdi), %r12
-	bswapq	%r11
-	bswapq	%r12
-	rolq	\$32, %r11		# r11 = W[9]:W[8]
-	rolq	\$32, %r12		# r12 = W[11]:W[10]
-	movq	%r11, %xmm2
-	movq	%r12, $xmmT1
-	punpcklqdq $xmmT1, %xmm2	# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
-
-	movq	4*12(%rdi), %r13
-	movq	4*14(%rdi), %r14
-	bswapq	%r13
-	bswapq	%r14
-	rolq	\$32, %r13		# r13 = W[13]:W[12]
-	rolq	\$32, %r14		# r14 = W[15]:W[14]
-	movq	%r13, %xmm3
-	movq	%r14, $xmmT1
-	punpcklqdq $xmmT1, %xmm3	# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
+	# ADDs in two first RD1As shorter by one byte).
+	movups	16*0(%rdi), %xmm0
+	pshufb	$xmmT1, %xmm0
+	movaps	%xmm0, $xmmT2
+	paddd	$xmmRCONST, $xmmT2
+	movq	$xmmT2, %rsi
+#	pextrq	\$1, $xmmT2, %r8	#SSE4.1 insn
+#	movhpd	$xmmT2, %r8	#can only move to mem, not to reg
+	shufps	\$0x0e, $xmmT2, $xmmT2
+	movq	$xmmT2, %r8
+
+	movups	16*1(%rdi), %xmm1
+	pshufb	$xmmT1, %xmm1
+	movaps	%xmm1, $xmmT2
+	paddd	$xmmRCONST, $xmmT2
+	movq	$xmmT2, %r9
+	shufps	\$0x0e, $xmmT2, $xmmT2
+	movq	$xmmT2, %r10
+
+	movups	16*2(%rdi), %xmm2
+	pshufb	$xmmT1, %xmm2
+	movaps	%xmm2, $xmmT2
+	paddd	$xmmRCONST, $xmmT2
+	movq	$xmmT2, %r11
+	shufps	\$0x0e, $xmmT2, $xmmT2
+	movq	$xmmT2, %r12
+
+	movups	16*3(%rdi), %xmm3
+	pshufb	$xmmT1, %xmm3
+	movaps	%xmm3, $xmmT2
+	paddd	$xmmRCONST, $xmmT2
+	movq	$xmmT2, %r13
+	shufps	\$0x0e, $xmmT2, $xmmT2
+	movq	$xmmT2, %r14
+
+	# MOVQs to GPRs (above) have somewhat high latency.
+	# Load hash[] while they are completing:
+	movl	80(%rdi), %eax		# a = ctx->hash[0]
+	movl	84(%rdi), %ebx		# b = ctx->hash[1]
+	movl	88(%rdi), %ecx		# c = ctx->hash[2]
+	movl	92(%rdi), %edx		# d = ctx->hash[3]
+	movl	96(%rdi), %ebp		# e = ctx->hash[4]
 "
 
 PREP() {
@@ -266,15 +258,15 @@ local rN=$((7+n0/2))
 echo "
 # $n
 ";test $n0 = 0 && echo "
-	leal	$RCONST(%r$e,%rsi), %e$e	# e += RCONST + W[n]
+	addl	%esi, %e$e		# e += RCONST + W[n]
 	shrq	\$32, %rsi
 ";test $n0 = 1 && echo "
-	leal	$RCONST(%r$e,%rsi), %e$e	# e += RCONST + W[n]
+	addl	%esi, %e$e		# e += RCONST + W[n]
 ";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
-	leal	$RCONST(%r$e,%r$rN), %e$e	# e += RCONST + W[n]
+	addl	%r${rN}d, %e$e		# e += RCONST + W[n]
 	shrq	\$32, %r$rN
 ";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
-	leal	$RCONST(%r$e,%r$rN), %e$e	# e += RCONST + W[n]
+	addl	%r${rN}d, %e$e		# e += RCONST + W[n]
 ";echo "
 	movl	%e$c, %edi		# c
 	xorl	%e$d, %edi		# ^d
@@ -440,6 +432,11 @@ echo "
 	ret
 	.size	sha1_process_block64, .-sha1_process_block64
 
+	.section	.rodata.cst16.bswap32_mask, \"aM\", @progbits, 16
+	.balign	16
+bswap32_mask:
+	.octa	0x0c0d0e0f08090a0b0405060700010203
+
 	.section	.rodata.cst16.sha1const, \"aM\", @progbits, 16
 	.balign	16
 sha1const:
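
Finally, a rough C fragment (not from the patch; the function name add_kw_pair is invented) of how the rounds emitted by the generator above consume one 64-bit register that now carries two precomputed RCONST+W[] words: the even-numbered round adds the low half, the shrq exposes the high half, and the following odd-numbered round adds that, matching the addl/shrq pairs in the hunk above.

#include <stdint.h>
#include <stdio.h>

static void add_kw_pair(uint64_t kw_pair, uint32_t *e_even, uint32_t *e_odd)
{
	*e_even += (uint32_t)kw_pair;	/* addl %r{N}d, %e$e   (n0 even) */
	kw_pair >>= 32;			/* shrq $32, %r$rN               */
	*e_odd  += (uint32_t)kw_pair;	/* addl %r{N}d, %e$e   (n0 odd)  */
}

int main(void)
{
	uint32_t e0 = 0, e1 = 0;
	/* After the movq/shufps sequence: low word = K+W[2k], high = K+W[2k+1]. */
	add_kw_pair(0x1111111100000002ull, &e0, &e1);
	printf("%08x %08x\n", (unsigned)e0, (unsigned)e1);
	return 0;
}
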