Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S.sh')
-rwxr-xr-x	libbb/hash_md5_sha_x86-64.S.sh	109
1 file changed, 56 insertions, 53 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 87c2d0800..47c40af0d 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -102,7 +102,7 @@ echo \
 "### Generated by hash_md5_sha_x86-64.S.sh ###
 
 #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
-	.section	.text.sha1_process_block64,\"ax\",@progbits
+	.section	.text.sha1_process_block64, \"ax\", @progbits
 	.globl	sha1_process_block64
 	.hidden	sha1_process_block64
 	.type	sha1_process_block64, @function
@@ -111,7 +111,7 @@ echo \
 sha1_process_block64:
 	pushq	%rbp	# 1 byte insn
 	pushq	%rbx	# 1 byte insn
-	pushq	%r15	# 2 byte insn
+#	pushq	%r15	# 2 byte insn
 	pushq	%r14	# 2 byte insn
 	pushq	%r13	# 2 byte insn
 	pushq	%r12	# 2 byte insn
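
The "1 byte insn" / "2 byte insn" comments refer to instruction encoding: a push of one of the eight legacy registers is a single opcode byte, while r8..r15 need a REX.B prefix, so dropping the %r15 push/pop saves four code bytes in total. A small C illustration of the encodings (standard x86-64 opcodes, shown only to make the size comments concrete):

    #include <stdio.h>

    int main(void)
    {
        /* push r64 is 0x50+reg for rax..rdi; r8..r15 take the same
         * opcode behind a REX.B (0x41) prefix, costing one extra byte. */
        unsigned char push_rbp[] = { 0x55 };        /* pushq %rbp: 1 byte  */
        unsigned char push_r15[] = { 0x41, 0x57 };  /* pushq %r15: 2 bytes */
        printf("pushq %%rbp: %zu byte(s), pushq %%r15: %zu byte(s)\n",
               sizeof(push_rbp), sizeof(push_r15));
        return 0;
    }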
@@ -120,7 +120,8 @@ sha1_process_block64:
 #Register and stack use:
 # eax..edx: a..d
 # ebp: e
-# esi,edi: temps
+# esi,edi,r8..r14: temps
+# r15: unused
 # xmm0..xmm3: W[]
 # xmm4,xmm5: temps
 # xmm6: current round constant
@@ -134,59 +135,56 @@ sha1_process_block64:
 
 	movaps	rconst0x5A827999(%rip), $xmmRCONST
 
-# For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15
-# instead of spilling them to stack.
-# (We lose parallelized addition of RCONST, but LEA
-# can do two additions at once, so...)
+# Load W[] to xmm registers, byteswapping on the fly.
+#
+# For iterations 0..15, we pass W[] in rsi,r8..r14
+# for use in RD1A's instead of spilling them to stack.
+# We lose parallelized addition of RCONST, but LEA
+# can do two additions at once, so it's probably a wash.
+# (We use rsi instead of rN because this makes two
+# LEAs in two first RD1A's shorter by one byte).
 	movq	4*0(%rdi), %rsi
-	movq	4*2(%rdi), %r10
+	movq	4*2(%rdi), %r8
 	bswapq	%rsi
-	bswapq	%r10
+	bswapq	%r8
 	rolq	\$32, %rsi		# rsi = W[1]:W[0]
-	rolq	\$32, %r10
+	rolq	\$32, %r8		# r8 = W[3]:W[2]
 	movq	%rsi, %xmm0
-	movq	%r10, $xmmT1
-	punpcklqdq $xmmT1, %xmm0	# xmm0 = r10:rsi = (W[0],W[1],W[2],W[3])
-	movaps	%xmm0, $xmmT1
-	paddd	$xmmRCONST, $xmmT1
-	movups	$xmmT1, -64+4*0(%rsp)
+	movq	%r8, $xmmT1
+	punpcklqdq $xmmT1, %xmm0	# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
+#	movaps	%xmm0, $xmmT1		# add RCONST, spill to stack
+#	paddd	$xmmRCONST, $xmmT1
+#	movups	$xmmT1, -64+16*0(%rsp)
 
-	movq	4*4(%rdi), %r8
+	movq	4*4(%rdi), %r9
 	movq	4*6(%rdi), %r10
-	bswapq	%r8
+	bswapq	%r9
 	bswapq	%r10
-	rolq	\$32, %r8
-	rolq	\$32, %r10
-	movq	%r8, %xmm1
+	rolq	\$32, %r9		# r9 = W[5]:W[4]
+	rolq	\$32, %r10		# r10 = W[7]:W[6]
+	movq	%r9, %xmm1
 	movq	%r10, $xmmT1
-	punpcklqdq $xmmT1, %xmm1	# xmm1 = r10:r8 = (W[4],W[5],W[6],W[7])
-	movaps	%xmm1, $xmmT1
-	paddd	$xmmRCONST, $xmmT1
-	movups	$xmmT1, -64+4*4(%rsp)
+	punpcklqdq $xmmT1, %xmm1	# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
 
-	movq	4*8(%rdi), %r8
-	movq	4*10(%rdi), %r10
-	bswapq	%r8
-	bswapq	%r10
-	movl	%r8d, %r9d		# r9d = W[9]
-	rolq	\$32, %r8		# r8 = W[9]:W[8]
-	movl	%r10d, %r11d		# r11d = W[11]
-	rolq	\$32, %r10		# r10 = W[11]:W[10]
-	movq	%r8, %xmm2
-	movq	%r10, $xmmT1
-	punpcklqdq $xmmT1, %xmm2	# xmm2 = r10:r8 = (W[8],W[9],W[10],W[11])
+	movq	4*8(%rdi), %r11
+	movq	4*10(%rdi), %r12
+	bswapq	%r11
+	bswapq	%r12
+	rolq	\$32, %r11		# r11 = W[9]:W[8]
+	rolq	\$32, %r12		# r12 = W[11]:W[10]
+	movq	%r11, %xmm2
+	movq	%r12, $xmmT1
+	punpcklqdq $xmmT1, %xmm2	# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
 
-	movq	4*12(%rdi), %r12
+	movq	4*12(%rdi), %r13
 	movq	4*14(%rdi), %r14
-	bswapq	%r12
+	bswapq	%r13
 	bswapq	%r14
-	movl	%r12d, %r13d		# r13d = W[13]
-	rolq	\$32, %r12		# r12 = W[13]:W[12]
-	movl	%r14d, %r15d		# r15d = W[15]
+	rolq	\$32, %r13		# r13 = W[13]:W[12]
 	rolq	\$32, %r14		# r14 = W[15]:W[14]
-	movq	%r12, %xmm3
+	movq	%r13, %xmm3
 	movq	%r14, $xmmT1
-	punpcklqdq $xmmT1, %xmm3	# xmm3 = r14:r12 = (W[12],W[13],W[14],W[15])
+	punpcklqdq $xmmT1, %xmm3	# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
 "
 
 PREP() {
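
The point of the bswapq + rolq \$32 pairs above: one 64-bit load fetches two big-endian W words at once, and after the byteswap plus half-swap each qword register holds W[n+1]:W[n] with W[n] in the low 32 bits, exactly the form RD1A's leal wants. Eight qwords (rsi, r8..r14) therefore carry all of W[0..15], which is why r15 falls out of use. A minimal C sketch of the equivalent transform (the helper name is mine, not busybox's):

    #include <stdint.h>

    /* Pack W[2n] and W[2n+1] (stored big-endian in the 64-byte block)
     * into one little-endian qword as W[2n+1]:W[2n], W[2n] in the low half. */
    static uint64_t load_w_pair(const unsigned char *block, unsigned n)
    {
        uint64_t v;
        __builtin_memcpy(&v, block + 8 * n, 8); /* movq 4*(2n)(%rdi), %reg   */
        v = __builtin_bswap64(v);               /* bswapq: now W[2n]:W[2n+1] */
        return (v << 32) | (v >> 32);           /* rolq $32: W[2n+1]:W[2n]   */
    }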
@@ -215,9 +213,9 @@ echo "# PREP $@
 	movaps	$xmmW0, $xmmT2
 
 	xorps	$xmmT1, $xmmT1	# rol(W0,1):
 	pcmpgtd	$xmmW0, $xmmT1	# ffffffff for elements <0 (ones with msb bit 1)
 	paddd	$xmmW0, $xmmW0	# shift left by 1
 	psubd	$xmmT1, $xmmW0	# add 1 to those who had msb bit 1
 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 
 	pslldq	\$12, $xmmT2	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
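
The PREP context above is the generator's SSE2 rotate-left-by-1: SSE2 has no packed-rotate instruction, so rol(x,1) is built as 2*x plus the bit carried out of the msb. pcmpgtd against zero produces an all-ones lane exactly where the msb is set, and psubd of that mask adds the 1 back. An intrinsics sketch of the same trick (the function name is mine):

    #include <emmintrin.h> /* SSE2 */

    /* rol32(x,1) per 32-bit lane: (x << 1) | (x >> 31) */
    static __m128i rol1_epi32(__m128i w)
    {
        /* 0xffffffff in lanes where w's msb is set, else 0 (pcmpgtd) */
        __m128i msb = _mm_cmpgt_epi32(_mm_setzero_si128(), w);
        w = _mm_add_epi32(w, w);      /* paddd: shift left by 1       */
        return _mm_sub_epi32(w, msb); /* psubd: subtract -1, i.e. +1  */
    }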
@@ -256,23 +254,28 @@ RD1A() {
 	local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
 	local n=$(($6))
 	local n0=$(((n+0) & 15))
+	local rN=$((7+n0/2))
 echo "
 # $n
 ";test $n0 = 0 && echo "
 	leal	$RCONST(%r$e,%rsi), %e$e	# e += RCONST + W[n]
-";test $n0 != 0 && test $n0 -lt 8 && echo "
-	addl	-64+4*$n0(%rsp), %e$e		# e += RCONST + W[n]
-";test $n0 -ge 8 && echo "
-	leal	$RCONST(%r$e,%r$n0), %e$e	# e += RCONST + W[n]
+	shrq	\$32, %rsi
+";test $n0 = 1 && echo "
+	leal	$RCONST(%r$e,%rsi), %e$e	# e += RCONST + W[n]
+";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
+	leal	$RCONST(%r$e,%r$rN), %e$e	# e += RCONST + W[n]
+	shrq	\$32, %r$rN
+";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
+	leal	$RCONST(%r$e,%r$rN), %e$e	# e += RCONST + W[n]
 ";echo "
 	movl	%e$c, %edi		# c
 	xorl	%e$d, %edi		# ^d
 	andl	%e$b, %edi		# &b
 	xorl	%e$d, %edi		# (((c ^ d) & b) ^ d)
 	addl	%edi, %e$e		# e += (((c ^ d) & b) ^ d)
-	movl	%e$a, %esi		#
-	roll	\$5, %esi		# rotl32(a,5)
-	addl	%esi, %e$e		# e += rotl32(a,5)
+	movl	%e$a, %edi		#
+	roll	\$5, %edi		# rotl32(a,5)
+	addl	%edi, %e$e		# e += rotl32(a,5)
 	rorl	\$2, %e$b		# b = rotl32(b,30)
 "
 }
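
To see what the reworked RD1A emits: for step n, W[n] now lives in the low half of a qword register, and the mapping rN = 7 + n0/2 picks r8 for W[2..3] up through r14 for W[14..15], while W[0..1] travel in rsi (per the new comment, rsi needs no REX prefix as an index register, so the first two LEAs are a byte shorter). Even steps consume the low half and shrq \$32 then exposes the odd partner for the following step; esi is also freed as a temp, which is why the rotl32(a,5) scratch moves to edi. The round itself, as a C sketch with RCONST specialized to the rounds-0..19 constant 0x5A827999:

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, unsigned k)
    {
        return (x << k) | (x >> (32 - k));
    }

    /* One RD1A step; the caller rotates a..e through the argument slots. */
    static void rd1a(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                     uint32_t *e, uint32_t wn)
    {
        *e += 0x5A827999 + wn;    /* leal RCONST(%re,%rN), %ee      */
        *e += ((c ^ d) & *b) ^ d; /* F1: same as (b & c) | (~b & d) */
        *e += rotl32(a, 5);       /* movl/roll/addl via edi         */
        *b = rotl32(*b, 30);      /* rorl $2                        */
    }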
@@ -420,7 +423,7 @@ echo "
 	addl	%ebx, 84(%rdi)	# ctx->hash[1] += b
 	popq	%r14		#
 	addl	%ecx, 88(%rdi)	# ctx->hash[2] += c
-	popq	%r15		#
+#	popq	%r15		#
 	addl	%edx, 92(%rdi)	# ctx->hash[3] += d
 	popq	%rbx		#
 	addl	%ebp, 96(%rdi)	# ctx->hash[4] += e