path: root/libbb/hash_md5_sha_x86-64.S.sh
Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S.sh')
-rwxr-xr-x  libbb/hash_md5_sha_x86-64.S.sh  109
1 files changed, 56 insertions, 53 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 87c2d0800..47c40af0d 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -102,7 +102,7 @@ echo \
 "### Generated by hash_md5_sha_x86-64.S.sh ###
 
 #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
-	.section	.text.sha1_process_block64,\"ax\",@progbits
+	.section	.text.sha1_process_block64, \"ax\", @progbits
 	.globl	sha1_process_block64
 	.hidden	sha1_process_block64
 	.type	sha1_process_block64, @function
@@ -111,7 +111,7 @@ echo \
 sha1_process_block64:
 	pushq	%rbp	# 1 byte insn
 	pushq	%rbx	# 1 byte insn
-	pushq	%r15	# 2 byte insn
+#	pushq	%r15	# 2 byte insn
 	pushq	%r14	# 2 byte insn
 	pushq	%r13	# 2 byte insn
 	pushq	%r12	# 2 byte insn
@@ -120,7 +120,8 @@ sha1_process_block64:
 #Register and stack use:
 # eax..edx: a..d
 # ebp: e
-# esi,edi: temps
+# esi,edi,r8..r14: temps
+# r15: unused
 # xmm0..xmm3: W[]
 # xmm4,xmm5: temps
 # xmm6: current round constant
@@ -134,59 +135,56 @@ sha1_process_block64:
 
 	movaps	rconst0x5A827999(%rip), $xmmRCONST
 
-	# For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15
-	# instead of spilling them to stack.
-	# (We lose parallelized addition of RCONST, but LEA
-	# can do two additions at once, so...)
+	# Load W[] to xmm registers, byteswapping on the fly.
+	#
+	# For iterations 0..15, we pass W[] in rsi,r8..r14
+	# for use in RD1A's instead of spilling them to stack.
+	# We lose parallelized addition of RCONST, but LEA
+	# can do two additions at once, so it's probably a wash.
+	# (We use rsi instead of rN because this makes two
+	# LEAs in two first RD1A's shorter by one byte).
 	movq	4*0(%rdi), %rsi
-	movq	4*2(%rdi), %r10
+	movq	4*2(%rdi), %r8
 	bswapq	%rsi
-	bswapq	%r10
+	bswapq	%r8
 	rolq	\$32, %rsi		# rsi = W[1]:W[0]
-	rolq	\$32, %r10
+	rolq	\$32, %r8		# r8 = W[3]:W[2]
 	movq	%rsi, %xmm0
-	movq	%r10, $xmmT1
-	punpcklqdq $xmmT1, %xmm0	# xmm0 = r10:rsi = (W[0],W[1],W[2],W[3])
-	movaps	%xmm0, $xmmT1
-	paddd	$xmmRCONST, $xmmT1
-	movups	$xmmT1, -64+4*0(%rsp)
+	movq	%r8, $xmmT1
+	punpcklqdq $xmmT1, %xmm0	# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
+#	movaps	%xmm0, $xmmT1		# add RCONST, spill to stack
+#	paddd	$xmmRCONST, $xmmT1
+#	movups	$xmmT1, -64+16*0(%rsp)
 
-	movq	4*4(%rdi), %r8
+	movq	4*4(%rdi), %r9
 	movq	4*6(%rdi), %r10
-	bswapq	%r8
+	bswapq	%r9
 	bswapq	%r10
-	rolq	\$32, %r8
-	rolq	\$32, %r10
-	movq	%r8, %xmm1
+	rolq	\$32, %r9		# r9 = W[5]:W[4]
+	rolq	\$32, %r10		# r10 = W[7]:W[6]
+	movq	%r9, %xmm1
 	movq	%r10, $xmmT1
-	punpcklqdq $xmmT1, %xmm1	# xmm1 = r10:r8 = (W[4],W[5],W[6],W[7])
-	movaps	%xmm1, $xmmT1
-	paddd	$xmmRCONST, $xmmT1
-	movups	$xmmT1, -64+4*4(%rsp)
+	punpcklqdq $xmmT1, %xmm1	# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
 
-	movq	4*8(%rdi), %r8
-	movq	4*10(%rdi), %r10
-	bswapq	%r8
-	bswapq	%r10
-	movl	%r8d, %r9d		# r9d = W[9]
-	rolq	\$32, %r8		# r8 = W[9]:W[8]
-	movl	%r10d, %r11d		# r11d = W[11]
-	rolq	\$32, %r10		# r10 = W[11]:W[10]
-	movq	%r8, %xmm2
-	movq	%r10, $xmmT1
-	punpcklqdq $xmmT1, %xmm2	# xmm2 = r10:r8 = (W[8],W[9],W[10],W[11])
+	movq	4*8(%rdi), %r11
+	movq	4*10(%rdi), %r12
+	bswapq	%r11
+	bswapq	%r12
+	rolq	\$32, %r11		# r11 = W[9]:W[8]
+	rolq	\$32, %r12		# r12 = W[11]:W[10]
+	movq	%r11, %xmm2
+	movq	%r12, $xmmT1
+	punpcklqdq $xmmT1, %xmm2	# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
 
-	movq	4*12(%rdi), %r12
+	movq	4*12(%rdi), %r13
 	movq	4*14(%rdi), %r14
-	bswapq	%r12
+	bswapq	%r13
 	bswapq	%r14
-	movl	%r12d, %r13d		# r13d = W[13]
-	rolq	\$32, %r12		# r12 = W[13]:W[12]
-	movl	%r14d, %r15d		# r15d = W[15]
+	rolq	\$32, %r13		# r13 = W[13]:W[12]
 	rolq	\$32, %r14		# r14 = W[15]:W[14]
-	movq	%r12, %xmm3
+	movq	%r13, %xmm3
 	movq	%r14, $xmmT1
-	punpcklqdq $xmmT1, %xmm3	# xmm3 = r14:r12 = (W[12],W[13],W[14],W[15])
+	punpcklqdq $xmmT1, %xmm3	# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
 "
 
 PREP() {
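For readers following the new W[] load path above: each movq pulls two consecutive big-endian message words into one 64-bit register, bswapq reverses all eight bytes, and rolq \$32 swaps the halves back so the register ends up as W[odd]:W[even] with each 32-bit word individually byteswapped. A rough plain-C sketch of one such triple (my paraphrase, not part of the script; load_w_pair is a made-up name):

#include <stdint.h>
#include <string.h>

/* Mirrors one "movq / bswapq / rolq $32" triple on little-endian x86:
 * returns a 64-bit value whose low half is the byteswapped W[2i]
 * and whose high half is the byteswapped W[2i+1]. */
static uint64_t load_w_pair(const uint8_t *p /* &data[8*i] */)
{
	uint64_t v;
	memcpy(&v, p, 8);              /* movq  4*(2i)(%rdi), %reg   */
	v = __builtin_bswap64(v);      /* bswapq: now W[2i]:W[2i+1]  */
	return (v << 32) | (v >> 32);  /* rolq $32: now W[2i+1]:W[2i] */
}

The low halves feed the first-round LEAs directly, and a later shrq \$32 exposes the odd-indexed word; that is why the RCONST-add-and-spill stores above could be commented out.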
@@ -215,9 +213,9 @@ echo "# PREP $@
 	movaps	$xmmW0, $xmmT2
 
 	xorps	$xmmT1, $xmmT1	# rol(W0,1):
-	pcmpgtd	$xmmW0, $xmmT1	# ffffffff for elements <0 (ones with msb bit 1)
-	paddd	$xmmW0, $xmmW0	# shift left by 1
-	psubd	$xmmT1, $xmmW0	# add 1 to those who had msb bit 1
+	pcmpgtd	$xmmW0, $xmmT1	#  ffffffff for elements <0 (ones with msb bit 1)
+	paddd	$xmmW0, $xmmW0	#  shift left by 1
+	psubd	$xmmT1, $xmmW0	#  add 1 to those who had msb bit 1
 	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 
 	pslldq	\$12, $xmmT2	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
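The pcmpgtd/paddd/psubd sequence exists because SSE2 has no packed 32-bit rotate: it rebuilds rol(x,1) per lane from a compare against zero, an add-to-self, and a subtract of the resulting sign mask. A minimal intrinsics sketch of the same idea (illustrative only; the function name is invented):

#include <emmintrin.h>

/* Rotate each 32-bit lane of w left by 1 using only SSE2. */
static __m128i rol1_epi32(__m128i w)
{
	/* pcmpgtd: 0xffffffff in lanes whose msb is set (0 > w) */
	__m128i sign = _mm_cmpgt_epi32(_mm_setzero_si128(), w);
	w = _mm_add_epi32(w, w);        /* paddd: shift left by 1              */
	return _mm_sub_epi32(w, sign);  /* psubd: add 1 back where msb was set */
}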
@@ -256,23 +254,28 @@ RD1A() {
 local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
 local n=$(($6))
 local n0=$(((n+0) & 15))
+local rN=$((7+n0/2))
 echo "
 # $n
 ";test $n0 = 0 && echo "
 	leal	$RCONST(%r$e,%rsi), %e$e	# e += RCONST + W[n]
-";test $n0 != 0 && test $n0 -lt 8 && echo "
-	addl	-64+4*$n0(%rsp), %e$e	# e += RCONST + W[n]
-";test $n0 -ge 8 && echo "
-	leal	$RCONST(%r$e,%r$n0), %e$e	# e += RCONST + W[n]
+	shrq	\$32, %rsi
+";test $n0 = 1 && echo "
+	leal	$RCONST(%r$e,%rsi), %e$e	# e += RCONST + W[n]
+";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
+	leal	$RCONST(%r$e,%r$rN), %e$e	# e += RCONST + W[n]
+	shrq	\$32, %r$rN
+";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
+	leal	$RCONST(%r$e,%r$rN), %e$e	# e += RCONST + W[n]
 ";echo "
 	movl	%e$c, %edi		# c
 	xorl	%e$d, %edi		# ^d
 	andl	%e$b, %edi		# &b
 	xorl	%e$d, %edi		# (((c ^ d) & b) ^ d)
 	addl	%edi, %e$e		# e += (((c ^ d) & b) ^ d)
-	movl	%e$a, %esi		#
-	roll	\$5, %esi		# rotl32(a,5)
-	addl	%esi, %e$e		# e += rotl32(a,5)
+	movl	%e$a, %edi		#
+	roll	\$5, %edi		# rotl32(a,5)
+	addl	%edi, %e$e		# e += rotl32(a,5)
 	rorl	\$2, %e$b		# b = rotl32(b,30)
 "
 }
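RD1A still emits the ordinary SHA-1 round-1 step; the change above only alters where W[n] comes from (the low 32 bits of rsi/r8..r14, with shrq \$32 exposing the odd-indexed word after each even step). In plain C the generated step is equivalent to the following sketch (my own paraphrase; rotl32/rd1a are invented names, and 0x5A827999 is the round-1 RCONST the code loads into xmmRCONST):

#include <stdint.h>

static uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* One RD1A step: wn is W[n]. */
static void rd1a(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
		 uint32_t *e, uint32_t wn)
{
	*e += 0x5A827999 + wn;     /* the leal: e += RCONST + W[n]  */
	*e += ((c ^ d) & *b) ^ d;  /* (((c ^ d) & b) ^ d)           */
	*e += rotl32(a, 5);        /* e += rotl32(a,5)              */
	*b = rotl32(*b, 30);       /* b = rotl32(b,30)              */
}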
@@ -420,7 +423,7 @@ echo "
 	addl	%ebx, 84(%rdi)	# ctx->hash[1] += b
 	popq	%r14		#
 	addl	%ecx, 88(%rdi)	# ctx->hash[2] += c
-	popq	%r15		#
+#	popq	%r15		#
 	addl	%edx, 92(%rdi)	# ctx->hash[3] += d
 	popq	%rbx		#
 	addl	%ebp, 96(%rdi)	# ctx->hash[4] += e