aboutsummaryrefslogtreecommitdiff
path: root/libbb/hash_md5_sha_x86-64.S.sh
diff options
context:
space:
mode:
Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S.sh')
-rwxr-xr-x libbb/hash_md5_sha_x86-64.S.sh 133
1 files changed, 80 insertions, 53 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index f34e6e6fa..57e77b118 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -99,6 +99,30 @@ INTERLEAVE() {
99 ) 99 )
100} 100}
101 101
102# movaps bswap32_mask(%rip), $xmmT1
103# Load W[] to xmm0..3, byteswapping on the fly.
104# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
105# for use in RD1As instead of spilling them to stack.
106# (We use rsi instead of rN because this makes two
107# ADDs in the first two RD1As shorter by one byte).
108# movups 16*0(%rdi), %xmm0
109# pshufb $xmmT1, %xmm0 #SSSE3 insn
110# movaps %xmm0, $xmmT2
111# paddd $xmmRCONST, $xmmT2
112# movq $xmmT2, %rsi
113# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn
114# #movhpd $xmmT2, %r8 #can only move to mem, not to reg
115# shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence
116# movq $xmmT2, %r8 # instead
117# ...
118# <repeat for xmm1,2,3>
119# ...
120#- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
121#+ addl %esi, %e$e # e += RCONST + W[n]
122# ^^^^^^^^^^^^^^^^^^^^^^^^
123# The above is -97 bytes of code...
124# ...but pshufb is an SSSE3 insn, which is not part of the baseline x86-64 instruction set. Can't use it.
125
102echo \ 126echo \
103"### Generated by hash_md5_sha_x86-64.S.sh ### 127"### Generated by hash_md5_sha_x86-64.S.sh ###
104 128
@@ -129,57 +153,65 @@ sha1_process_block64:
129# xmm7: all round constants 153# xmm7: all round constants
130# -64(%rsp): area for passing RCONST + W[] from vector to integer units 154# -64(%rsp): area for passing RCONST + W[] from vector to integer units
131 155
156 movl 80(%rdi), %eax # a = ctx->hash[0]
157 movl 84(%rdi), %ebx # b = ctx->hash[1]
158 movl 88(%rdi), %ecx # c = ctx->hash[2]
159 movl 92(%rdi), %edx # d = ctx->hash[3]
160 movl 96(%rdi), %ebp # e = ctx->hash[4]
161
132 movaps sha1const(%rip), $xmmALLRCONST 162 movaps sha1const(%rip), $xmmALLRCONST
133 movaps bswap32_mask(%rip), $xmmT1
134 pshufd \$0x00, $xmmALLRCONST, $xmmRCONST 163 pshufd \$0x00, $xmmALLRCONST, $xmmRCONST
135 164
136 # Load W[] to xmm0..3, byteswapping on the fly. 165 # Load W[] to xmm0..3, byteswapping on the fly.
137 # 166 #
138 # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 167 # For iterations 0..15, we pass W[] in rsi,r8..r14
139 # for use in RD1As instead of spilling them to stack. 168 # for use in RD1As instead of spilling them to stack.
169 # We lose parallelized addition of RCONST, but LEA
170 # can do two additions at once, so it is probably a wash.
140 # (We use rsi instead of rN because this makes two 171 # (We use rsi instead of rN because this makes two
141 # ADDs in the first two RD1As shorter by one byte). 172 # LEAs in the first two RD1As shorter by one byte).
142 movups 16*0(%rdi), %xmm0 173 movq 4*0(%rdi), %rsi
143 pshufb $xmmT1, %xmm0 174 movq 4*2(%rdi), %r8
144 movaps %xmm0, $xmmT2 175 bswapq %rsi
145 paddd $xmmRCONST, $xmmT2 176 bswapq %r8
146 movq $xmmT2, %rsi 177 rolq \$32, %rsi # rsi = W[1]:W[0]
147# pextrq \$1, $xmmT2, %r8 #SSE4.1 insn 178 rolq \$32, %r8 # r8 = W[3]:W[2]
148# movhpd $xmmT2, %r8 #can only move to mem, not to reg 179 movq %rsi, %xmm0
149 shufps \$0x0e, $xmmT2, $xmmT2 180 movq %r8, $xmmT1
150 movq $xmmT2, %r8 181 punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
151 182# movaps %xmm0, $xmmT1 # add RCONST, spill to stack
152 movups 16*1(%rdi), %xmm1 183# paddd $xmmRCONST, $xmmT1
153 pshufb $xmmT1, %xmm1 184# movups $xmmT1, -64+16*0(%rsp)
154 movaps %xmm1, $xmmT2 185
155 paddd $xmmRCONST, $xmmT2 186 movq 4*4(%rdi), %r9
156 movq $xmmT2, %r9 187 movq 4*6(%rdi), %r10
157 shufps \$0x0e, $xmmT2, $xmmT2 188 bswapq %r9
158 movq $xmmT2, %r10 189 bswapq %r10
159 190 rolq \$32, %r9 # r9 = W[5]:W[4]
160 movups 16*2(%rdi), %xmm2 191 rolq \$32, %r10 # r10 = W[7]:W[6]
161 pshufb $xmmT1, %xmm2 192 movq %r9, %xmm1
162 movaps %xmm2, $xmmT2 193 movq %r10, $xmmT1
163 paddd $xmmRCONST, $xmmT2 194 punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
164 movq $xmmT2, %r11 195
165 shufps \$0x0e, $xmmT2, $xmmT2 196 movq 4*8(%rdi), %r11
166 movq $xmmT2, %r12 197 movq 4*10(%rdi), %r12
167 198 bswapq %r11
168 movups 16*3(%rdi), %xmm3 199 bswapq %r12
169 pshufb $xmmT1, %xmm3 200 rolq \$32, %r11 # r11 = W[9]:W[8]
170 movaps %xmm3, $xmmT2 201 rolq \$32, %r12 # r12 = W[11]:W[10]
171 paddd $xmmRCONST, $xmmT2 202 movq %r11, %xmm2
172 movq $xmmT2, %r13 203 movq %r12, $xmmT1
173 shufps \$0x0e, $xmmT2, $xmmT2 204 punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
174 movq $xmmT2, %r14 205
175 206 movq 4*12(%rdi), %r13
176 # MOVQs to GPRs (above) have somewhat high latency. 207 movq 4*14(%rdi), %r14
177 # Load hash[] while they are completing: 208 bswapq %r13
178 movl 80(%rdi), %eax # a = ctx->hash[0] 209 bswapq %r14
179 movl 84(%rdi), %ebx # b = ctx->hash[1] 210 rolq \$32, %r13 # r13 = W[13]:W[12]
180 movl 88(%rdi), %ecx # c = ctx->hash[2] 211 rolq \$32, %r14 # r14 = W[15]:W[14]
181 movl 92(%rdi), %edx # d = ctx->hash[3] 212 movq %r13, %xmm3
182 movl 96(%rdi), %ebp # e = ctx->hash[4] 213 movq %r14, $xmmT1
214 punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
183" 215"
184 216
185PREP() { 217PREP() {
@@ -258,15 +290,15 @@ local rN=$((7+n0/2))
258echo " 290echo "
259# $n 291# $n
260";test $n0 = 0 && echo " 292";test $n0 = 0 && echo "
261 addl %esi, %e$e # e += RCONST + W[n] 293 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
262 shrq \$32, %rsi 294 shrq \$32, %rsi
263";test $n0 = 1 && echo " 295";test $n0 = 1 && echo "
264 addl %esi, %e$e # e += RCONST + W[n] 296 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
265";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " 297";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
266 addl %r${rN}d, %e$e # e += RCONST + W[n] 298 leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
267 shrq \$32, %r$rN 299 shrq \$32, %r$rN
268";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " 300";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
269 addl %r${rN}d, %e$e # e += RCONST + W[n] 301 leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
270";echo " 302";echo "
271 movl %e$c, %edi # c 303 movl %e$c, %edi # c
272 xorl %e$d, %edi # ^d 304 xorl %e$d, %edi # ^d
@@ -432,11 +464,6 @@ echo "
432 ret 464 ret
433 .size sha1_process_block64, .-sha1_process_block64 465 .size sha1_process_block64, .-sha1_process_block64
434 466
435 .section .rodata.cst16.bswap32_mask, \"aM\", @progbits, 16
436 .balign 16
437bswap32_mask:
438 .octa 0x0c0d0e0f08090a0b0405060700010203
439
440 .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 467 .section .rodata.cst16.sha1const, \"aM\", @progbits, 16
441 .balign 16 468 .balign 16
442sha1const: 469sha1const: