-rw-r--r--  libbb/hash_md5_sha_x86-64.S    | 310
-rwxr-xr-x  libbb/hash_md5_sha_x86-64.S.sh | 109
2 files changed, 214 insertions(+), 205 deletions(-)
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 069a18719..743269d98 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -1,7 +1,7 @@
1### Generated by hash_md5_sha_x86-64.S.sh ### 1### Generated by hash_md5_sha_x86-64.S.sh ###
2 2
3#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) 3#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
4 .section .text.sha1_process_block64,"ax",@progbits 4 .section .text.sha1_process_block64, "ax", @progbits
5 .globl sha1_process_block64 5 .globl sha1_process_block64
6 .hidden sha1_process_block64 6 .hidden sha1_process_block64
7 .type sha1_process_block64, @function 7 .type sha1_process_block64, @function
@@ -10,7 +10,7 @@
10sha1_process_block64: 10sha1_process_block64:
11 pushq %rbp # 1 byte insn 11 pushq %rbp # 1 byte insn
12 pushq %rbx # 1 byte insn 12 pushq %rbx # 1 byte insn
13 pushq %r15 # 2 byte insn 13# pushq %r15 # 2 byte insn
14 pushq %r14 # 2 byte insn 14 pushq %r14 # 2 byte insn
15 pushq %r13 # 2 byte insn 15 pushq %r13 # 2 byte insn
16 pushq %r12 # 2 byte insn 16 pushq %r12 # 2 byte insn
@@ -19,7 +19,8 @@ sha1_process_block64:
19#Register and stack use: 19#Register and stack use:
20# eax..edx: a..d 20# eax..edx: a..d
21# ebp: e 21# ebp: e
22# esi,edi: temps 22# esi,edi,r8..r14: temps
23# r15: unused
23# xmm0..xmm3: W[] 24# xmm0..xmm3: W[]
24# xmm4,xmm5: temps 25# xmm4,xmm5: temps
25# xmm6: current round constant 26# xmm6: current round constant
@@ -33,147 +34,148 @@ sha1_process_block64:
33 34
34 movaps rconst0x5A827999(%rip), %xmm6 35 movaps rconst0x5A827999(%rip), %xmm6
35 36
36 # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 37 # Load W[] to xmm registers, byteswapping on the fly.
37 # instead of spilling them to stack. 38 #
38 # (We lose parallelized addition of RCONST, but LEA 39 # For iterations 0..15, we pass W[] in rsi,r8..r14
39 # can do two additions at once, so...) 40 # for use in RD1A's instead of spilling them to stack.
41 # We lose parallelized addition of RCONST, but LEA
42 # can do two additions at once, so it's probably a wash.
43 # (We use rsi instead of rN because this makes two
44 # LEAs in two first RD1A's shorter by one byte).
40 movq 4*0(%rdi), %rsi 45 movq 4*0(%rdi), %rsi
41 movq 4*2(%rdi), %r10 46 movq 4*2(%rdi), %r8
42 bswapq %rsi 47 bswapq %rsi
43 bswapq %r10 48 bswapq %r8
44 rolq $32, %rsi # rsi = W[1]:W[0] 49 rolq $32, %rsi # rsi = W[1]:W[0]
45 rolq $32, %r10 50 rolq $32, %r8 # r8 = W[3]:W[2]
46 movq %rsi, %xmm0 51 movq %rsi, %xmm0
47 movq %r10, %xmm4 52 movq %r8, %xmm4
48 punpcklqdq %xmm4, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) 53 punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
49 movaps %xmm0, %xmm4 54# movaps %xmm0, %xmm4 # add RCONST, spill to stack
50 paddd %xmm6, %xmm4 55# paddd %xmm6, %xmm4
51 movups %xmm4, -64+4*0(%rsp) 56# movups %xmm4, -64+16*0(%rsp)
52 57
53 movq 4*4(%rdi), %r8 58 movq 4*4(%rdi), %r9
54 movq 4*6(%rdi), %r10 59 movq 4*6(%rdi), %r10
55 bswapq %r8 60 bswapq %r9
56 bswapq %r10 61 bswapq %r10
57 rolq $32, %r8 62 rolq $32, %r9 # r9 = W[5]:W[4]
58 rolq $32, %r10 63 rolq $32, %r10 # r10 = W[7]:W[6]
59 movq %r8, %xmm1 64 movq %r9, %xmm1
60 movq %r10, %xmm4 65 movq %r10, %xmm4
61 punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) 66 punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
62 movaps %xmm1, %xmm4
63 paddd %xmm6, %xmm4
64 movups %xmm4, -64+4*4(%rsp)
65 67
66 movq 4*8(%rdi), %r8 68 movq 4*8(%rdi), %r11
67 movq 4*10(%rdi), %r10 69 movq 4*10(%rdi), %r12
68 bswapq %r8 70 bswapq %r11
69 bswapq %r10 71 bswapq %r12
70 movl %r8d, %r9d # r9d = W[9] 72 rolq $32, %r11 # r11 = W[9]:W[8]
71 rolq $32, %r8 # r8 = W[9]:W[8] 73 rolq $32, %r12 # r12 = W[11]:W[10]
72 movl %r10d, %r11d # r11d = W[11] 74 movq %r11, %xmm2
73 rolq $32, %r10 # r10 = W[11]:W[10] 75 movq %r12, %xmm4
74 movq %r8, %xmm2 76 punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
75 movq %r10, %xmm4
76 punpcklqdq %xmm4, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11])
77 77
78 movq 4*12(%rdi), %r12 78 movq 4*12(%rdi), %r13
79 movq 4*14(%rdi), %r14 79 movq 4*14(%rdi), %r14
80 bswapq %r12 80 bswapq %r13
81 bswapq %r14 81 bswapq %r14
82 movl %r12d, %r13d # r13d = W[13] 82 rolq $32, %r13 # r13 = W[13]:W[12]
83 rolq $32, %r12 # r12 = W[13]:W[12]
84 movl %r14d, %r15d # r15d = W[15]
85 rolq $32, %r14 # r14 = W[15]:W[14] 83 rolq $32, %r14 # r14 = W[15]:W[14]
86 movq %r12, %xmm3 84 movq %r13, %xmm3
87 movq %r14, %xmm4 85 movq %r14, %xmm4
88 punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) 86 punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
89 87
90# 0 88# 0
91 leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] 89 leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
90 shrq $32, %rsi
92 movl %ecx, %edi # c 91 movl %ecx, %edi # c
93 xorl %edx, %edi # ^d 92 xorl %edx, %edi # ^d
94 andl %ebx, %edi # &b 93 andl %ebx, %edi # &b
95 xorl %edx, %edi # (((c ^ d) & b) ^ d) 94 xorl %edx, %edi # (((c ^ d) & b) ^ d)
96 addl %edi, %ebp # e += (((c ^ d) & b) ^ d) 95 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
97 movl %eax, %esi # 96 movl %eax, %edi #
98 roll $5, %esi # rotl32(a,5) 97 roll $5, %edi # rotl32(a,5)
99 addl %esi, %ebp # e += rotl32(a,5) 98 addl %edi, %ebp # e += rotl32(a,5)
100 rorl $2, %ebx # b = rotl32(b,30) 99 rorl $2, %ebx # b = rotl32(b,30)
101# 1 100# 1
102 addl -64+4*1(%rsp), %edx # e += RCONST + W[n] 101 leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
103 movl %ebx, %edi # c 102 movl %ebx, %edi # c
104 xorl %ecx, %edi # ^d 103 xorl %ecx, %edi # ^d
105 andl %eax, %edi # &b 104 andl %eax, %edi # &b
106 xorl %ecx, %edi # (((c ^ d) & b) ^ d) 105 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
107 addl %edi, %edx # e += (((c ^ d) & b) ^ d) 106 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
108 movl %ebp, %esi # 107 movl %ebp, %edi #
109 roll $5, %esi # rotl32(a,5) 108 roll $5, %edi # rotl32(a,5)
110 addl %esi, %edx # e += rotl32(a,5) 109 addl %edi, %edx # e += rotl32(a,5)
111 rorl $2, %eax # b = rotl32(b,30) 110 rorl $2, %eax # b = rotl32(b,30)
112# 2 111# 2
113 addl -64+4*2(%rsp), %ecx # e += RCONST + W[n] 112 leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
113 shrq $32, %r8
114 movl %eax, %edi # c 114 movl %eax, %edi # c
115 xorl %ebx, %edi # ^d 115 xorl %ebx, %edi # ^d
116 andl %ebp, %edi # &b 116 andl %ebp, %edi # &b
117 xorl %ebx, %edi # (((c ^ d) & b) ^ d) 117 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
118 addl %edi, %ecx # e += (((c ^ d) & b) ^ d) 118 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
119 movl %edx, %esi # 119 movl %edx, %edi #
120 roll $5, %esi # rotl32(a,5) 120 roll $5, %edi # rotl32(a,5)
121 addl %esi, %ecx # e += rotl32(a,5) 121 addl %edi, %ecx # e += rotl32(a,5)
122 rorl $2, %ebp # b = rotl32(b,30) 122 rorl $2, %ebp # b = rotl32(b,30)
123# 3 123# 3
124 addl -64+4*3(%rsp), %ebx # e += RCONST + W[n] 124 leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
125 movl %ebp, %edi # c 125 movl %ebp, %edi # c
126 xorl %eax, %edi # ^d 126 xorl %eax, %edi # ^d
127 andl %edx, %edi # &b 127 andl %edx, %edi # &b
128 xorl %eax, %edi # (((c ^ d) & b) ^ d) 128 xorl %eax, %edi # (((c ^ d) & b) ^ d)
129 addl %edi, %ebx # e += (((c ^ d) & b) ^ d) 129 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
130 movl %ecx, %esi # 130 movl %ecx, %edi #
131 roll $5, %esi # rotl32(a,5) 131 roll $5, %edi # rotl32(a,5)
132 addl %esi, %ebx # e += rotl32(a,5) 132 addl %edi, %ebx # e += rotl32(a,5)
133 rorl $2, %edx # b = rotl32(b,30) 133 rorl $2, %edx # b = rotl32(b,30)
134# 4 134# 4
135 addl -64+4*4(%rsp), %eax # e += RCONST + W[n] 135 leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
136 shrq $32, %r9
136 movl %edx, %edi # c 137 movl %edx, %edi # c
137 xorl %ebp, %edi # ^d 138 xorl %ebp, %edi # ^d
138 andl %ecx, %edi # &b 139 andl %ecx, %edi # &b
139 xorl %ebp, %edi # (((c ^ d) & b) ^ d) 140 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
140 addl %edi, %eax # e += (((c ^ d) & b) ^ d) 141 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
141 movl %ebx, %esi # 142 movl %ebx, %edi #
142 roll $5, %esi # rotl32(a,5) 143 roll $5, %edi # rotl32(a,5)
143 addl %esi, %eax # e += rotl32(a,5) 144 addl %edi, %eax # e += rotl32(a,5)
144 rorl $2, %ecx # b = rotl32(b,30) 145 rorl $2, %ecx # b = rotl32(b,30)
145# 5 146# 5
146 addl -64+4*5(%rsp), %ebp # e += RCONST + W[n] 147 leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
147 movl %ecx, %edi # c 148 movl %ecx, %edi # c
148 xorl %edx, %edi # ^d 149 xorl %edx, %edi # ^d
149 andl %ebx, %edi # &b 150 andl %ebx, %edi # &b
150 xorl %edx, %edi # (((c ^ d) & b) ^ d) 151 xorl %edx, %edi # (((c ^ d) & b) ^ d)
151 addl %edi, %ebp # e += (((c ^ d) & b) ^ d) 152 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
152 movl %eax, %esi # 153 movl %eax, %edi #
153 roll $5, %esi # rotl32(a,5) 154 roll $5, %edi # rotl32(a,5)
154 addl %esi, %ebp # e += rotl32(a,5) 155 addl %edi, %ebp # e += rotl32(a,5)
155 rorl $2, %ebx # b = rotl32(b,30) 156 rorl $2, %ebx # b = rotl32(b,30)
156# 6 157# 6
157 addl -64+4*6(%rsp), %edx # e += RCONST + W[n] 158 leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
159 shrq $32, %r10
158 movl %ebx, %edi # c 160 movl %ebx, %edi # c
159 xorl %ecx, %edi # ^d 161 xorl %ecx, %edi # ^d
160 andl %eax, %edi # &b 162 andl %eax, %edi # &b
161 xorl %ecx, %edi # (((c ^ d) & b) ^ d) 163 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
162 addl %edi, %edx # e += (((c ^ d) & b) ^ d) 164 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
163 movl %ebp, %esi # 165 movl %ebp, %edi #
164 roll $5, %esi # rotl32(a,5) 166 roll $5, %edi # rotl32(a,5)
165 addl %esi, %edx # e += rotl32(a,5) 167 addl %edi, %edx # e += rotl32(a,5)
166 rorl $2, %eax # b = rotl32(b,30) 168 rorl $2, %eax # b = rotl32(b,30)
167# 7 169# 7
168 addl -64+4*7(%rsp), %ecx # e += RCONST + W[n] 170 leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
169 movl %eax, %edi # c 171 movl %eax, %edi # c
170 xorl %ebx, %edi # ^d 172 xorl %ebx, %edi # ^d
171 andl %ebp, %edi # &b 173 andl %ebp, %edi # &b
172 xorl %ebx, %edi # (((c ^ d) & b) ^ d) 174 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
173 addl %edi, %ecx # e += (((c ^ d) & b) ^ d) 175 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
174 movl %edx, %esi # 176 movl %edx, %edi #
175 roll $5, %esi # rotl32(a,5) 177 roll $5, %edi # rotl32(a,5)
176 addl %esi, %ecx # e += rotl32(a,5) 178 addl %edi, %ecx # e += rotl32(a,5)
177 rorl $2, %ebp # b = rotl32(b,30) 179 rorl $2, %ebp # b = rotl32(b,30)
178# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) 180# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
179 movaps %xmm3, %xmm4 181 movaps %xmm3, %xmm4
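[Editor's note: the rounds in the hunk above all repeat the same RD1A pattern, now fed from rsi,r8..r14 instead of stack slots. The following is a minimal C sketch of one such first-round step; the helper names (rotl32, sha1_rd1_step, Wn) are illustrative and not taken from the busybox sources.]

```c
#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* One RD1A-style step (SHA-1 rounds 0..19), matching the scalar pattern above:
 *   e += RCONST + W[n] + (((c ^ d) & b) ^ d) + rotl32(a, 5);
 *   b  = rotl32(b, 30);
 */
static void sha1_rd1_step(uint32_t *a, uint32_t *b, uint32_t *c,
			  uint32_t *d, uint32_t *e, uint32_t Wn)
{
	*e += 0x5A827999 + Wn + (((*c ^ *d) & *b) ^ *d) + rotl32(*a, 5);
	*b = rotl32(*b, 30);
}
```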
@@ -186,9 +188,9 @@ sha1_process_block64:
186 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 188 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
187 movaps %xmm0, %xmm5 189 movaps %xmm0, %xmm5
188 xorps %xmm4, %xmm4 # rol(W0,1): 190 xorps %xmm4, %xmm4 # rol(W0,1):
189 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 191 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
190 paddd %xmm0, %xmm0 # shift left by 1 192 paddd %xmm0, %xmm0 # shift left by 1
191 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 193 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
192 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 194 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
193 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 195 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
194 movaps %xmm5, %xmm4 196 movaps %xmm5, %xmm4
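[Editor's note: the pcmpgtd/paddd/psubd sequence in the PREP hunk above is a branch-free way to rotate four 32-bit lanes left by one without a variable SIMD rotate. A minimal sketch with SSE2 intrinsics follows; the intrinsics are standard, the wrapper name is ours.]

```c
#include <emmintrin.h>

/* Rotate each 32-bit lane of w left by 1, as in the PREP blocks:
 *   mask = (lane < 0) ? 0xffffffff : 0   (pcmpgtd against zero)
 *   w    = w + w                         (paddd: shift left by 1)
 *   w    = w - mask                      (psubd: add 1 where the msb was set)
 */
static __m128i rol1_epi32(__m128i w)
{
	__m128i mask = _mm_cmpgt_epi32(_mm_setzero_si128(), w);
	w = _mm_add_epi32(w, w);
	return _mm_sub_epi32(w, mask);
}
```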
@@ -201,48 +203,50 @@ sha1_process_block64:
201 paddd %xmm6, %xmm5 203 paddd %xmm6, %xmm5
202 movups %xmm5, -64+16*0(%rsp) 204 movups %xmm5, -64+16*0(%rsp)
203# 8 205# 8
204 leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] 206 leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
207 shrq $32, %r11
205 movl %ebp, %edi # c 208 movl %ebp, %edi # c
206 xorl %eax, %edi # ^d 209 xorl %eax, %edi # ^d
207 andl %edx, %edi # &b 210 andl %edx, %edi # &b
208 xorl %eax, %edi # (((c ^ d) & b) ^ d) 211 xorl %eax, %edi # (((c ^ d) & b) ^ d)
209 addl %edi, %ebx # e += (((c ^ d) & b) ^ d) 212 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
210 movl %ecx, %esi # 213 movl %ecx, %edi #
211 roll $5, %esi # rotl32(a,5) 214 roll $5, %edi # rotl32(a,5)
212 addl %esi, %ebx # e += rotl32(a,5) 215 addl %edi, %ebx # e += rotl32(a,5)
213 rorl $2, %edx # b = rotl32(b,30) 216 rorl $2, %edx # b = rotl32(b,30)
214# 9 217# 9
215 leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] 218 leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
216 movl %edx, %edi # c 219 movl %edx, %edi # c
217 xorl %ebp, %edi # ^d 220 xorl %ebp, %edi # ^d
218 andl %ecx, %edi # &b 221 andl %ecx, %edi # &b
219 xorl %ebp, %edi # (((c ^ d) & b) ^ d) 222 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
220 addl %edi, %eax # e += (((c ^ d) & b) ^ d) 223 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
221 movl %ebx, %esi # 224 movl %ebx, %edi #
222 roll $5, %esi # rotl32(a,5) 225 roll $5, %edi # rotl32(a,5)
223 addl %esi, %eax # e += rotl32(a,5) 226 addl %edi, %eax # e += rotl32(a,5)
224 rorl $2, %ecx # b = rotl32(b,30) 227 rorl $2, %ecx # b = rotl32(b,30)
225# 10 228# 10
226 leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] 229 leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
230 shrq $32, %r12
227 movl %ecx, %edi # c 231 movl %ecx, %edi # c
228 xorl %edx, %edi # ^d 232 xorl %edx, %edi # ^d
229 andl %ebx, %edi # &b 233 andl %ebx, %edi # &b
230 xorl %edx, %edi # (((c ^ d) & b) ^ d) 234 xorl %edx, %edi # (((c ^ d) & b) ^ d)
231 addl %edi, %ebp # e += (((c ^ d) & b) ^ d) 235 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
232 movl %eax, %esi # 236 movl %eax, %edi #
233 roll $5, %esi # rotl32(a,5) 237 roll $5, %edi # rotl32(a,5)
234 addl %esi, %ebp # e += rotl32(a,5) 238 addl %edi, %ebp # e += rotl32(a,5)
235 rorl $2, %ebx # b = rotl32(b,30) 239 rorl $2, %ebx # b = rotl32(b,30)
236# 11 240# 11
237 leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] 241 leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
238 movl %ebx, %edi # c 242 movl %ebx, %edi # c
239 xorl %ecx, %edi # ^d 243 xorl %ecx, %edi # ^d
240 andl %eax, %edi # &b 244 andl %eax, %edi # &b
241 xorl %ecx, %edi # (((c ^ d) & b) ^ d) 245 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
242 addl %edi, %edx # e += (((c ^ d) & b) ^ d) 246 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
243 movl %ebp, %esi # 247 movl %ebp, %edi #
244 roll $5, %esi # rotl32(a,5) 248 roll $5, %edi # rotl32(a,5)
245 addl %esi, %edx # e += rotl32(a,5) 249 addl %edi, %edx # e += rotl32(a,5)
246 rorl $2, %eax # b = rotl32(b,30) 250 rorl $2, %eax # b = rotl32(b,30)
247 movaps rconst0x6ED9EBA1(%rip), %xmm6 251 movaps rconst0x6ED9EBA1(%rip), %xmm6
248# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) 252# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
@@ -256,9 +260,9 @@ sha1_process_block64:
256 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 260 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
257 movaps %xmm1, %xmm5 261 movaps %xmm1, %xmm5
258 xorps %xmm4, %xmm4 # rol(W0,1): 262 xorps %xmm4, %xmm4 # rol(W0,1):
259 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 263 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
260 paddd %xmm1, %xmm1 # shift left by 1 264 paddd %xmm1, %xmm1 # shift left by 1
261 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 265 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
262 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 266 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
263 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 267 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
264 movaps %xmm5, %xmm4 268 movaps %xmm5, %xmm4
@@ -271,15 +275,16 @@ sha1_process_block64:
271 paddd %xmm6, %xmm5 275 paddd %xmm6, %xmm5
272 movups %xmm5, -64+16*1(%rsp) 276 movups %xmm5, -64+16*1(%rsp)
273# 12 277# 12
274 leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] 278 leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
279 shrq $32, %r13
275 movl %eax, %edi # c 280 movl %eax, %edi # c
276 xorl %ebx, %edi # ^d 281 xorl %ebx, %edi # ^d
277 andl %ebp, %edi # &b 282 andl %ebp, %edi # &b
278 xorl %ebx, %edi # (((c ^ d) & b) ^ d) 283 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
279 addl %edi, %ecx # e += (((c ^ d) & b) ^ d) 284 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
280 movl %edx, %esi # 285 movl %edx, %edi #
281 roll $5, %esi # rotl32(a,5) 286 roll $5, %edi # rotl32(a,5)
282 addl %esi, %ecx # e += rotl32(a,5) 287 addl %edi, %ecx # e += rotl32(a,5)
283 rorl $2, %ebp # b = rotl32(b,30) 288 rorl $2, %ebp # b = rotl32(b,30)
284# 13 289# 13
285 leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] 290 leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
@@ -288,31 +293,32 @@ sha1_process_block64:
288 andl %edx, %edi # &b 293 andl %edx, %edi # &b
289 xorl %eax, %edi # (((c ^ d) & b) ^ d) 294 xorl %eax, %edi # (((c ^ d) & b) ^ d)
290 addl %edi, %ebx # e += (((c ^ d) & b) ^ d) 295 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
291 movl %ecx, %esi # 296 movl %ecx, %edi #
292 roll $5, %esi # rotl32(a,5) 297 roll $5, %edi # rotl32(a,5)
293 addl %esi, %ebx # e += rotl32(a,5) 298 addl %edi, %ebx # e += rotl32(a,5)
294 rorl $2, %edx # b = rotl32(b,30) 299 rorl $2, %edx # b = rotl32(b,30)
295# 14 300# 14
296 leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] 301 leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
302 shrq $32, %r14
297 movl %edx, %edi # c 303 movl %edx, %edi # c
298 xorl %ebp, %edi # ^d 304 xorl %ebp, %edi # ^d
299 andl %ecx, %edi # &b 305 andl %ecx, %edi # &b
300 xorl %ebp, %edi # (((c ^ d) & b) ^ d) 306 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
301 addl %edi, %eax # e += (((c ^ d) & b) ^ d) 307 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
302 movl %ebx, %esi # 308 movl %ebx, %edi #
303 roll $5, %esi # rotl32(a,5) 309 roll $5, %edi # rotl32(a,5)
304 addl %esi, %eax # e += rotl32(a,5) 310 addl %edi, %eax # e += rotl32(a,5)
305 rorl $2, %ecx # b = rotl32(b,30) 311 rorl $2, %ecx # b = rotl32(b,30)
306# 15 312# 15
307 leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] 313 leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
308 movl %ecx, %edi # c 314 movl %ecx, %edi # c
309 xorl %edx, %edi # ^d 315 xorl %edx, %edi # ^d
310 andl %ebx, %edi # &b 316 andl %ebx, %edi # &b
311 xorl %edx, %edi # (((c ^ d) & b) ^ d) 317 xorl %edx, %edi # (((c ^ d) & b) ^ d)
312 addl %edi, %ebp # e += (((c ^ d) & b) ^ d) 318 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
313 movl %eax, %esi # 319 movl %eax, %edi #
314 roll $5, %esi # rotl32(a,5) 320 roll $5, %edi # rotl32(a,5)
315 addl %esi, %ebp # e += rotl32(a,5) 321 addl %edi, %ebp # e += rotl32(a,5)
316 rorl $2, %ebx # b = rotl32(b,30) 322 rorl $2, %ebx # b = rotl32(b,30)
317# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) 323# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
318 movaps %xmm1, %xmm4 324 movaps %xmm1, %xmm4
@@ -325,9 +331,9 @@ sha1_process_block64:
325 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 331 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
326 movaps %xmm2, %xmm5 332 movaps %xmm2, %xmm5
327 xorps %xmm4, %xmm4 # rol(W0,1): 333 xorps %xmm4, %xmm4 # rol(W0,1):
328 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 334 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
329 paddd %xmm2, %xmm2 # shift left by 1 335 paddd %xmm2, %xmm2 # shift left by 1
330 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 336 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
331 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 337 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
332 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 338 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
333 movaps %xmm5, %xmm4 339 movaps %xmm5, %xmm4
@@ -394,9 +400,9 @@ sha1_process_block64:
394 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 400 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
395 movaps %xmm3, %xmm5 401 movaps %xmm3, %xmm5
396 xorps %xmm4, %xmm4 # rol(W0,1): 402 xorps %xmm4, %xmm4 # rol(W0,1):
397 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 403 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
398 paddd %xmm3, %xmm3 # shift left by 1 404 paddd %xmm3, %xmm3 # shift left by 1
399 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 405 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
400 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 406 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
401 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 407 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
402 movaps %xmm5, %xmm4 408 movaps %xmm5, %xmm4
@@ -459,9 +465,9 @@ sha1_process_block64:
459 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 465 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
460 movaps %xmm0, %xmm5 466 movaps %xmm0, %xmm5
461 xorps %xmm4, %xmm4 # rol(W0,1): 467 xorps %xmm4, %xmm4 # rol(W0,1):
462 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 468 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
463 paddd %xmm0, %xmm0 # shift left by 1 469 paddd %xmm0, %xmm0 # shift left by 1
464 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 470 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
465 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 471 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
466 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 472 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
467 movaps %xmm5, %xmm4 473 movaps %xmm5, %xmm4
@@ -524,9 +530,9 @@ sha1_process_block64:
524 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 530 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
525 movaps %xmm1, %xmm5 531 movaps %xmm1, %xmm5
526 xorps %xmm4, %xmm4 # rol(W0,1): 532 xorps %xmm4, %xmm4 # rol(W0,1):
527 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 533 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
528 paddd %xmm1, %xmm1 # shift left by 1 534 paddd %xmm1, %xmm1 # shift left by 1
529 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 535 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
530 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 536 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
531 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 537 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
532 movaps %xmm5, %xmm4 538 movaps %xmm5, %xmm4
@@ -590,9 +596,9 @@ sha1_process_block64:
590 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 596 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
591 movaps %xmm2, %xmm5 597 movaps %xmm2, %xmm5
592 xorps %xmm4, %xmm4 # rol(W0,1): 598 xorps %xmm4, %xmm4 # rol(W0,1):
593 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 599 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
594 paddd %xmm2, %xmm2 # shift left by 1 600 paddd %xmm2, %xmm2 # shift left by 1
595 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 601 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
596 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 602 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
597 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 603 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
598 movaps %xmm5, %xmm4 604 movaps %xmm5, %xmm4
@@ -655,9 +661,9 @@ sha1_process_block64:
655 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 661 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
656 movaps %xmm3, %xmm5 662 movaps %xmm3, %xmm5
657 xorps %xmm4, %xmm4 # rol(W0,1): 663 xorps %xmm4, %xmm4 # rol(W0,1):
658 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 664 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
659 paddd %xmm3, %xmm3 # shift left by 1 665 paddd %xmm3, %xmm3 # shift left by 1
660 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 666 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
661 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 667 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
662 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 668 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
663 movaps %xmm5, %xmm4 669 movaps %xmm5, %xmm4
@@ -720,9 +726,9 @@ sha1_process_block64:
720 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 726 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
721 movaps %xmm0, %xmm5 727 movaps %xmm0, %xmm5
722 xorps %xmm4, %xmm4 # rol(W0,1): 728 xorps %xmm4, %xmm4 # rol(W0,1):
723 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 729 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
724 paddd %xmm0, %xmm0 # shift left by 1 730 paddd %xmm0, %xmm0 # shift left by 1
725 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 731 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
726 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 732 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
727 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 733 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
728 movaps %xmm5, %xmm4 734 movaps %xmm5, %xmm4
@@ -797,9 +803,9 @@ sha1_process_block64:
797 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 803 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
798 movaps %xmm1, %xmm5 804 movaps %xmm1, %xmm5
799 xorps %xmm4, %xmm4 # rol(W0,1): 805 xorps %xmm4, %xmm4 # rol(W0,1):
800 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 806 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
801 paddd %xmm1, %xmm1 # shift left by 1 807 paddd %xmm1, %xmm1 # shift left by 1
802 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 808 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
803 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 809 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
804 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 810 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
805 movaps %xmm5, %xmm4 811 movaps %xmm5, %xmm4
@@ -874,9 +880,9 @@ sha1_process_block64:
874 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 880 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
875 movaps %xmm2, %xmm5 881 movaps %xmm2, %xmm5
876 xorps %xmm4, %xmm4 # rol(W0,1): 882 xorps %xmm4, %xmm4 # rol(W0,1):
877 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 883 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
878 paddd %xmm2, %xmm2 # shift left by 1 884 paddd %xmm2, %xmm2 # shift left by 1
879 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 885 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
880 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 886 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
881 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 887 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
882 movaps %xmm5, %xmm4 888 movaps %xmm5, %xmm4
@@ -952,9 +958,9 @@ sha1_process_block64:
952 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 958 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
953 movaps %xmm3, %xmm5 959 movaps %xmm3, %xmm5
954 xorps %xmm4, %xmm4 # rol(W0,1): 960 xorps %xmm4, %xmm4 # rol(W0,1):
955 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 961 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
956 paddd %xmm3, %xmm3 # shift left by 1 962 paddd %xmm3, %xmm3 # shift left by 1
957 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 963 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
958 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 964 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
959 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 965 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
960 movaps %xmm5, %xmm4 966 movaps %xmm5, %xmm4
@@ -1029,9 +1035,9 @@ sha1_process_block64:
1029 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 1035 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1030 movaps %xmm0, %xmm5 1036 movaps %xmm0, %xmm5
1031 xorps %xmm4, %xmm4 # rol(W0,1): 1037 xorps %xmm4, %xmm4 # rol(W0,1):
1032 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 1038 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1033 paddd %xmm0, %xmm0 # shift left by 1 1039 paddd %xmm0, %xmm0 # shift left by 1
1034 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 1040 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
1035 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 1041 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1036 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 1042 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1037 movaps %xmm5, %xmm4 1043 movaps %xmm5, %xmm4
@@ -1106,9 +1112,9 @@ sha1_process_block64:
1106 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 1112 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1107 movaps %xmm1, %xmm5 1113 movaps %xmm1, %xmm5
1108 xorps %xmm4, %xmm4 # rol(W0,1): 1114 xorps %xmm4, %xmm4 # rol(W0,1):
1109 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 1115 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1110 paddd %xmm1, %xmm1 # shift left by 1 1116 paddd %xmm1, %xmm1 # shift left by 1
1111 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 1117 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
1112 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 1118 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1113 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 1119 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1114 movaps %xmm5, %xmm4 1120 movaps %xmm5, %xmm4
@@ -1171,9 +1177,9 @@ sha1_process_block64:
1171 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 1177 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1172 movaps %xmm2, %xmm5 1178 movaps %xmm2, %xmm5
1173 xorps %xmm4, %xmm4 # rol(W0,1): 1179 xorps %xmm4, %xmm4 # rol(W0,1):
1174 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 1180 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1175 paddd %xmm2, %xmm2 # shift left by 1 1181 paddd %xmm2, %xmm2 # shift left by 1
1176 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 1182 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
1177 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 1183 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1178 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 1184 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1179 movaps %xmm5, %xmm4 1185 movaps %xmm5, %xmm4
@@ -1236,9 +1242,9 @@ sha1_process_block64:
1236 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup 1242 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1237 movaps %xmm3, %xmm5 1243 movaps %xmm3, %xmm5
1238 xorps %xmm4, %xmm4 # rol(W0,1): 1244 xorps %xmm4, %xmm4 # rol(W0,1):
1239 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) 1245 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1240 paddd %xmm3, %xmm3 # shift left by 1 1246 paddd %xmm3, %xmm3 # shift left by 1
1241 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 1247 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
1242 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 1248 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1243 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 1249 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1244 movaps %xmm5, %xmm4 1250 movaps %xmm5, %xmm4
@@ -1378,7 +1384,7 @@ sha1_process_block64:
1378 addl %ebx, 84(%rdi) # ctx->hash[1] += b 1384 addl %ebx, 84(%rdi) # ctx->hash[1] += b
1379 popq %r14 # 1385 popq %r14 #
1380 addl %ecx, 88(%rdi) # ctx->hash[2] += c 1386 addl %ecx, 88(%rdi) # ctx->hash[2] += c
1381 popq %r15 # 1387# popq %r15 #
1382 addl %edx, 92(%rdi) # ctx->hash[3] += d 1388 addl %edx, 92(%rdi) # ctx->hash[3] += d
1383 popq %rbx # 1389 popq %rbx #
1384 addl %ebp, 96(%rdi) # ctx->hash[4] += e 1390 addl %ebp, 96(%rdi) # ctx->hash[4] += e
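[Editor's note: before the generator-script half of the patch, here is a hedged C sketch of the "Load W[] to xmm registers, byteswapping on the fly" idea above: load two big-endian 32-bit words as one 64-bit value, byteswap, rotate by 32 so the low half holds the even word, and reuse the same value for both the scalar LEAs and the xmm packing. It assumes a little-endian x86-64 host and GCC-style builtins, as the .S file itself does; the helper names are illustrative.]

```c
#include <stdint.h>
#include <emmintrin.h>

/* Load W[2i] and W[2i+1] (stored big-endian in the input block) into one
 * 64-bit value laid out as W[2i+1]:W[2i], like the bswapq + rolq $32 pair
 * above.  The low 32 bits feed the first scalar round, shr $32 exposes the
 * next word, and the same value is packed into xmm0..xmm3.
 */
static uint64_t load_w_pair(const uint8_t *p)
{
	uint64_t v;
	__builtin_memcpy(&v, p, 8);
	v = __builtin_bswap64(v);          /* bswapq */
	return (v << 32) | (v >> 32);      /* rolq $32 */
}

static __m128i pack_w4(uint64_t lo_pair, uint64_t hi_pair)
{
	/* punpcklqdq: result = hi_pair:lo_pair = (W[n],W[n+1],W[n+2],W[n+3]) */
	return _mm_unpacklo_epi64(_mm_cvtsi64_si128((long long)lo_pair),
				  _mm_cvtsi64_si128((long long)hi_pair));
}
```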
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 87c2d0800..47c40af0d 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -102,7 +102,7 @@ echo \
102"### Generated by hash_md5_sha_x86-64.S.sh ### 102"### Generated by hash_md5_sha_x86-64.S.sh ###
103 103
104#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) 104#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
105 .section .text.sha1_process_block64,\"ax\",@progbits 105 .section .text.sha1_process_block64, \"ax\", @progbits
106 .globl sha1_process_block64 106 .globl sha1_process_block64
107 .hidden sha1_process_block64 107 .hidden sha1_process_block64
108 .type sha1_process_block64, @function 108 .type sha1_process_block64, @function
@@ -111,7 +111,7 @@ echo \
111sha1_process_block64: 111sha1_process_block64:
112 pushq %rbp # 1 byte insn 112 pushq %rbp # 1 byte insn
113 pushq %rbx # 1 byte insn 113 pushq %rbx # 1 byte insn
114 pushq %r15 # 2 byte insn 114# pushq %r15 # 2 byte insn
115 pushq %r14 # 2 byte insn 115 pushq %r14 # 2 byte insn
116 pushq %r13 # 2 byte insn 116 pushq %r13 # 2 byte insn
117 pushq %r12 # 2 byte insn 117 pushq %r12 # 2 byte insn
@@ -120,7 +120,8 @@ sha1_process_block64:
120#Register and stack use: 120#Register and stack use:
121# eax..edx: a..d 121# eax..edx: a..d
122# ebp: e 122# ebp: e
123# esi,edi: temps 123# esi,edi,r8..r14: temps
124# r15: unused
124# xmm0..xmm3: W[] 125# xmm0..xmm3: W[]
125# xmm4,xmm5: temps 126# xmm4,xmm5: temps
126# xmm6: current round constant 127# xmm6: current round constant
@@ -134,59 +135,56 @@ sha1_process_block64:
134 135
135 movaps rconst0x5A827999(%rip), $xmmRCONST 136 movaps rconst0x5A827999(%rip), $xmmRCONST
136 137
137 # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 138 # Load W[] to xmm registers, byteswapping on the fly.
138 # instead of spilling them to stack. 139 #
139 # (We lose parallelized addition of RCONST, but LEA 140 # For iterations 0..15, we pass W[] in rsi,r8..r14
140 # can do two additions at once, so...) 141 # for use in RD1A's instead of spilling them to stack.
142 # We lose parallelized addition of RCONST, but LEA
143 # can do two additions at once, so it's probably a wash.
144 # (We use rsi instead of rN because this makes two
145 # LEAs in two first RD1A's shorter by one byte).
141 movq 4*0(%rdi), %rsi 146 movq 4*0(%rdi), %rsi
142 movq 4*2(%rdi), %r10 147 movq 4*2(%rdi), %r8
143 bswapq %rsi 148 bswapq %rsi
144 bswapq %r10 149 bswapq %r8
145 rolq \$32, %rsi # rsi = W[1]:W[0] 150 rolq \$32, %rsi # rsi = W[1]:W[0]
146 rolq \$32, %r10 151 rolq \$32, %r8 # r8 = W[3]:W[2]
147 movq %rsi, %xmm0 152 movq %rsi, %xmm0
148 movq %r10, $xmmT1 153 movq %r8, $xmmT1
149 punpcklqdq $xmmT1, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) 154 punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
150 movaps %xmm0, $xmmT1 155# movaps %xmm0, $xmmT1 # add RCONST, spill to stack
151 paddd $xmmRCONST, $xmmT1 156# paddd $xmmRCONST, $xmmT1
152 movups $xmmT1, -64+4*0(%rsp) 157# movups $xmmT1, -64+16*0(%rsp)
153 158
154 movq 4*4(%rdi), %r8 159 movq 4*4(%rdi), %r9
155 movq 4*6(%rdi), %r10 160 movq 4*6(%rdi), %r10
156 bswapq %r8 161 bswapq %r9
157 bswapq %r10 162 bswapq %r10
158 rolq \$32, %r8 163 rolq \$32, %r9 # r9 = W[5]:W[4]
159 rolq \$32, %r10 164 rolq \$32, %r10 # r10 = W[7]:W[6]
160 movq %r8, %xmm1 165 movq %r9, %xmm1
161 movq %r10, $xmmT1 166 movq %r10, $xmmT1
162 punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) 167 punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
163 movaps %xmm1, $xmmT1
164 paddd $xmmRCONST, $xmmT1
165 movups $xmmT1, -64+4*4(%rsp)
166 168
167 movq 4*8(%rdi), %r8 169 movq 4*8(%rdi), %r11
168 movq 4*10(%rdi), %r10 170 movq 4*10(%rdi), %r12
169 bswapq %r8 171 bswapq %r11
170 bswapq %r10 172 bswapq %r12
171 movl %r8d, %r9d # r9d = W[9] 173 rolq \$32, %r11 # r11 = W[9]:W[8]
172 rolq \$32, %r8 # r8 = W[9]:W[8] 174 rolq \$32, %r12 # r12 = W[11]:W[10]
173 movl %r10d, %r11d # r11d = W[11] 175 movq %r11, %xmm2
174 rolq \$32, %r10 # r10 = W[11]:W[10] 176 movq %r12, $xmmT1
175 movq %r8, %xmm2 177 punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
176 movq %r10, $xmmT1
177 punpcklqdq $xmmT1, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11])
178 178
179 movq 4*12(%rdi), %r12 179 movq 4*12(%rdi), %r13
180 movq 4*14(%rdi), %r14 180 movq 4*14(%rdi), %r14
181 bswapq %r12 181 bswapq %r13
182 bswapq %r14 182 bswapq %r14
183 movl %r12d, %r13d # r13d = W[13] 183 rolq \$32, %r13 # r13 = W[13]:W[12]
184 rolq \$32, %r12 # r12 = W[13]:W[12]
185 movl %r14d, %r15d # r15d = W[15]
186 rolq \$32, %r14 # r14 = W[15]:W[14] 184 rolq \$32, %r14 # r14 = W[15]:W[14]
187 movq %r12, %xmm3 185 movq %r13, %xmm3
188 movq %r14, $xmmT1 186 movq %r14, $xmmT1
189 punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) 187 punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
190" 188"
191 189
192PREP() { 190PREP() {
@@ -215,9 +213,9 @@ echo "# PREP $@
215 movaps $xmmW0, $xmmT2 213 movaps $xmmW0, $xmmT2
216 214
217 xorps $xmmT1, $xmmT1 # rol(W0,1): 215 xorps $xmmT1, $xmmT1 # rol(W0,1):
218 pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) 216 pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1)
219 paddd $xmmW0, $xmmW0 # shift left by 1 217 paddd $xmmW0, $xmmW0 # shift left by 1
220 psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 218 psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1
221 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup 219 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
222 220
223 pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 221 pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
@@ -256,23 +254,28 @@ RD1A() {
256local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 254local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
257local n=$(($6)) 255local n=$(($6))
258local n0=$(((n+0) & 15)) 256local n0=$(((n+0) & 15))
257local rN=$((7+n0/2))
259echo " 258echo "
260# $n 259# $n
261";test $n0 = 0 && echo " 260";test $n0 = 0 && echo "
262 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] 261 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
263";test $n0 != 0 && test $n0 -lt 8 && echo " 262 shrq \$32, %rsi
264 addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n] 263";test $n0 = 1 && echo "
265";test $n0 -ge 8 && echo " 264 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
266 leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] 265";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
266 leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
267 shrq \$32, %r$rN
268";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
269 leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
267";echo " 270";echo "
268 movl %e$c, %edi # c 271 movl %e$c, %edi # c
269 xorl %e$d, %edi # ^d 272 xorl %e$d, %edi # ^d
270 andl %e$b, %edi # &b 273 andl %e$b, %edi # &b
271 xorl %e$d, %edi # (((c ^ d) & b) ^ d) 274 xorl %e$d, %edi # (((c ^ d) & b) ^ d)
272 addl %edi, %e$e # e += (((c ^ d) & b) ^ d) 275 addl %edi, %e$e # e += (((c ^ d) & b) ^ d)
273 movl %e$a, %esi # 276 movl %e$a, %edi #
274 roll \$5, %esi # rotl32(a,5) 277 roll \$5, %edi # rotl32(a,5)
275 addl %esi, %e$e # e += rotl32(a,5) 278 addl %edi, %e$e # e += rotl32(a,5)
276 rorl \$2, %e$b # b = rotl32(b,30) 279 rorl \$2, %e$b # b = rotl32(b,30)
277" 280"
278} 281}
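[Editor's note: the reworked RD1A() above keeps W[0..15] in rsi,r8..r14, two 32-bit words per 64-bit register, and emits a shrq $32 after each even-indexed use so the odd word is ready for the next step. A small C sketch of that index-to-register mapping, mirroring rN = 7 + n0/2; register names are plain strings, purely illustrative.]

```c
#include <stdio.h>

/* Which 64-bit register holds W[n] for n = 0..15, and whether the use must
 * be followed by shrq $32 (even n: consume the low word, expose the odd one).
 */
static const char *w_register(int n, int *needs_shr)
{
	n &= 15;
	*needs_shr = (n & 1) == 0;
	return (n < 2) ? "rsi"
	               : (const char *[]){"r8", "r9", "r10", "r11",
	                                  "r12", "r13", "r14"}[n / 2 - 1];
}

int main(void)
{
	for (int n = 0; n < 16; n++) {
		int shr;
		const char *r = w_register(n, &shr);
		printf("W[%2d] -> %%%s%s\n", n, r, shr ? " (then shrq $32)" : "");
	}
	return 0;
}
```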
@@ -420,7 +423,7 @@ echo "
420 addl %ebx, 84(%rdi) # ctx->hash[1] += b 423 addl %ebx, 84(%rdi) # ctx->hash[1] += b
421 popq %r14 # 424 popq %r14 #
422 addl %ecx, 88(%rdi) # ctx->hash[2] += c 425 addl %ecx, 88(%rdi) # ctx->hash[2] += c
423 popq %r15 # 426# popq %r15 #
424 addl %edx, 92(%rdi) # ctx->hash[3] += d 427 addl %edx, 92(%rdi) # ctx->hash[3] += d
425 popq %rbx # 428 popq %rbx #
426 addl %ebp, 96(%rdi) # ctx->hash[4] += e 429 addl %ebp, 96(%rdi) # ctx->hash[4] += e