Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S')
-rw-r--r--   libbb/hash_md5_sha_x86-64.S   127
1 file changed, 65 insertions, 62 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 51fde082a..f0daa30f6 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -27,60 +27,68 @@ sha1_process_block64:
 # xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units
 
+	movl	80(%rdi), %eax		# a = ctx->hash[0]
+	movl	84(%rdi), %ebx		# b = ctx->hash[1]
+	movl	88(%rdi), %ecx		# c = ctx->hash[2]
+	movl	92(%rdi), %edx		# d = ctx->hash[3]
+	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+
 	movaps	sha1const(%rip), %xmm7
-	movaps	bswap32_mask(%rip), %xmm4
 	pshufd	$0x00, %xmm7, %xmm6
 
 	# Load W[] to xmm0..3, byteswapping on the fly.
 	#
-	# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
+	# For iterations 0..15, we pass W[] in rsi,r8..r14
 	# for use in RD1As instead of spilling them to stack.
+	# We lose parallelized addition of RCONST, but LEA
+	# can do two additions at once, so it is probably a wash.
 	# (We use rsi instead of rN because this makes two
-	# ADDs in two first RD1As shorter by one byte).
-	movups	16*0(%rdi), %xmm0
-	pshufb	%xmm4, %xmm0
-	movaps	%xmm0, %xmm5
-	paddd	%xmm6, %xmm5
-	movq	%xmm5, %rsi
-#	pextrq	$1, %xmm5, %r8	#SSE4.1 insn
-#	movhpd	%xmm5, %r8	#can only move to mem, not to reg
-	shufps	$0x0e, %xmm5, %xmm5
-	movq	%xmm5, %r8
-
-	movups	16*1(%rdi), %xmm1
-	pshufb	%xmm4, %xmm1
-	movaps	%xmm1, %xmm5
-	paddd	%xmm6, %xmm5
-	movq	%xmm5, %r9
-	shufps	$0x0e, %xmm5, %xmm5
-	movq	%xmm5, %r10
+	# LEAs in two first RD1As shorter by one byte).
+	movq	4*0(%rdi), %rsi
+	movq	4*2(%rdi), %r8
+	bswapq	%rsi
+	bswapq	%r8
+	rolq	$32, %rsi		# rsi = W[1]:W[0]
+	rolq	$32, %r8		# r8  = W[3]:W[2]
+	movq	%rsi, %xmm0
+	movq	%r8, %xmm4
+	punpcklqdq %xmm4, %xmm0		# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
+#	movaps	%xmm0, %xmm4		# add RCONST, spill to stack
+#	paddd	%xmm6, %xmm4
+#	movups	%xmm4, -64+16*0(%rsp)
 
-	movups	16*2(%rdi), %xmm2
-	pshufb	%xmm4, %xmm2
-	movaps	%xmm2, %xmm5
-	paddd	%xmm6, %xmm5
-	movq	%xmm5, %r11
-	shufps	$0x0e, %xmm5, %xmm5
-	movq	%xmm5, %r12
+	movq	4*4(%rdi), %r9
+	movq	4*6(%rdi), %r10
+	bswapq	%r9
+	bswapq	%r10
+	rolq	$32, %r9		# r9  = W[5]:W[4]
+	rolq	$32, %r10		# r10 = W[7]:W[6]
+	movq	%r9, %xmm1
+	movq	%r10, %xmm4
+	punpcklqdq %xmm4, %xmm1		# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
 
-	movups	16*3(%rdi), %xmm3
-	pshufb	%xmm4, %xmm3
-	movaps	%xmm3, %xmm5
-	paddd	%xmm6, %xmm5
-	movq	%xmm5, %r13
-	shufps	$0x0e, %xmm5, %xmm5
-	movq	%xmm5, %r14
+	movq	4*8(%rdi), %r11
+	movq	4*10(%rdi), %r12
+	bswapq	%r11
+	bswapq	%r12
+	rolq	$32, %r11		# r11 = W[9]:W[8]
+	rolq	$32, %r12		# r12 = W[11]:W[10]
+	movq	%r11, %xmm2
+	movq	%r12, %xmm4
+	punpcklqdq %xmm4, %xmm2		# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
 
-	# MOVQs to GPRs (above) have somewhat high latency.
-	# Load hash[] while they are completing:
-	movl	80(%rdi), %eax		# a = ctx->hash[0]
-	movl	84(%rdi), %ebx		# b = ctx->hash[1]
-	movl	88(%rdi), %ecx		# c = ctx->hash[2]
-	movl	92(%rdi), %edx		# d = ctx->hash[3]
-	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+	movq	4*12(%rdi), %r13
+	movq	4*14(%rdi), %r14
+	bswapq	%r13
+	bswapq	%r14
+	rolq	$32, %r13		# r13 = W[13]:W[12]
+	rolq	$32, %r14		# r14 = W[15]:W[14]
+	movq	%r13, %xmm3
+	movq	%r14, %xmm4
+	punpcklqdq %xmm4, %xmm3		# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
 
 # 0
-	addl	%esi, %ebp		# e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
 	shrq	$32, %rsi
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
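The new W[] setup above can be read as follows: each 64-bit load picks up two consecutive big-endian message words, bswapq puts them in host order as W[2n]:W[2n+1], and the rotate by 32 swaps the halves so the low dword is W[2n] (consumed first by the rounds, which then shrq $32 to expose W[2n+1]). A minimal C sketch of that computation (illustrative only, not busybox code; load_w_pair is a hypothetical helper, and the GCC/Clang __builtin_bswap64 builtin is assumed):

#include <stdint.h>
#include <string.h>

/* Illustrative helper (not part of busybox): what one movq/bswapq/rolq
 * sequence computes for a pair of big-endian message words. */
static uint64_t load_w_pair(const uint8_t *p)   /* p = &block[4*(2*n)] */
{
	uint64_t v;
	memcpy(&v, p, 8);               /* movq  4*(2*n)(%rdi), %reg  (little-endian load) */
	v = __builtin_bswap64(v);       /* bswapq %reg: value is now W[2n]:W[2n+1] (hi:lo)  */
	return (v << 32) | (v >> 32);   /* rolq $32, %reg: now W[2n+1]:W[2n] (hi:lo)        */
}

Two such pairs are then packed into one XMM register with movq + punpcklqdq, reproducing the (W[4n],W[4n+1],W[4n+2],W[4n+3]) lane layout that the movups/pshufb path produced before.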
@@ -92,7 +100,7 @@ sha1_process_block64:
 	addl	%edi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 1
-	addl	%esi, %edx		# e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	andl	%eax, %edi		# &b
@@ -103,7 +111,7 @@ sha1_process_block64:
 	addl	%edi, %edx		# e += rotl32(a,5)
 	rorl	$2, %eax		# b = rotl32(b,30)
 # 2
-	addl	%r8d, %ecx		# e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
 	shrq	$32, %r8
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
@@ -115,7 +123,7 @@ sha1_process_block64:
 	addl	%edi, %ecx		# e += rotl32(a,5)
 	rorl	$2, %ebp		# b = rotl32(b,30)
 # 3
-	addl	%r8d, %ebx		# e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	andl	%edx, %edi		# &b
@@ -126,7 +134,7 @@ sha1_process_block64:
 	addl	%edi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 4
-	addl	%r9d, %eax		# e += RCONST + W[n]
+	leal	0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
 	shrq	$32, %r9
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
@@ -138,7 +146,7 @@ sha1_process_block64:
 	addl	%edi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 5
-	addl	%r9d, %ebp		# e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	andl	%ebx, %edi		# &b
@@ -149,7 +157,7 @@ sha1_process_block64:
 	addl	%edi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 6
-	addl	%r10d, %edx		# e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
 	shrq	$32, %r10
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
@@ -161,7 +169,7 @@ sha1_process_block64:
 	addl	%edi, %edx		# e += rotl32(a,5)
 	rorl	$2, %eax		# b = rotl32(b,30)
 # 7
-	addl	%r10d, %ecx		# e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
 	andl	%ebp, %edi		# &b
@@ -202,7 +210,7 @@ sha1_process_block64:
 	paddd	%xmm6, %xmm5
 	movups	%xmm5, -64+16*0(%rsp)
 # 8
-	addl	%r11d, %ebx		# e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
 	shrq	$32, %r11
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
@@ -214,7 +222,7 @@ sha1_process_block64:
 	addl	%edi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 9
-	addl	%r11d, %eax		# e += RCONST + W[n]
+	leal	0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
 	andl	%ecx, %edi		# &b
@@ -225,7 +233,7 @@ sha1_process_block64:
 	addl	%edi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 10
-	addl	%r12d, %ebp		# e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
 	shrq	$32, %r12
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
@@ -237,7 +245,7 @@ sha1_process_block64:
 	addl	%edi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 11
-	addl	%r12d, %edx		# e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	andl	%eax, %edi		# &b
@@ -279,7 +287,7 @@ sha1_process_block64:
 	paddd	%xmm6, %xmm5
 	movups	%xmm5, -64+16*1(%rsp)
 # 12
-	addl	%r13d, %ecx		# e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
 	shrq	$32, %r13
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
@@ -291,7 +299,7 @@ sha1_process_block64:
 	addl	%edi, %ecx		# e += rotl32(a,5)
 	rorl	$2, %ebp		# b = rotl32(b,30)
 # 13
-	addl	%r13d, %ebx		# e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	andl	%edx, %edi		# &b
@@ -302,7 +310,7 @@ sha1_process_block64:
 	addl	%edi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 14
-	addl	%r14d, %eax		# e += RCONST + W[n]
+	leal	0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
 	shrq	$32, %r14
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
@@ -314,7 +322,7 @@ sha1_process_block64:
 	addl	%edi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 15
-	addl	%r14d, %ebp		# e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	andl	%ebx, %edi		# &b
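All sixteen RD1A rounds above follow the same pattern; the only change in this patch is that the round constant is now folded into the same addition as W[n] via LEA. A rough C rendering of one such round (a hedged sketch: the names a..e, Wn and the helper rd1a are illustrative, not busybox functions):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* One SHA-1 round for iterations 0..15, with RCONST folded into the
 * W[n] addition -- the job done by "leal 0x5A827999(%reg_e,%reg_w), %reg_e". */
static void rd1a(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                 uint32_t *e, uint32_t Wn)
{
	*e += 0x5A827999 + Wn;        /* leal RCONST(e,w), e : e += RCONST + W[n] */
	*e += ((c ^ d) & *b) ^ d;     /* movl/xorl/andl/xorl : F1(b,c,d)          */
	*e += rotl32(a, 5);           /* movl/roll $5/addl   : e += rotl32(a,5)   */
	*b = rotl32(*b, 30);          /* rorl $2             : b = rotl32(b,30)   */
}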
@@ -1467,11 +1475,6 @@ sha1_process_block64:
 	ret
 	.size	sha1_process_block64, .-sha1_process_block64
 
-	.section	.rodata.cst16.bswap32_mask, "aM", @progbits, 16
-	.balign	16
-bswap32_mask:
-	.octa	0x0c0d0e0f08090a0b0405060700010203
-
 	.section	.rodata.cst16.sha1const, "aM", @progbits, 16
 	.balign	16
 sha1const:
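The dropped bswap32_mask constant was the PSHUFB control that byte-reversed each 32-bit lane of an XMM register; with the byteswap now done by bswapq on general-purpose registers, neither the constant nor the xmm4 copy of it is needed during the load phase. For reference, a sketch of what that shuffle computed (illustrative only; the GCC/Clang __builtin_bswap32 builtin is assumed):

#include <stdint.h>

/* Effect of "pshufb bswap32_mask, %xmmN": reverse the bytes inside each
 * of the four 32-bit lanes (big-endian message words -> host order). */
static void bswap32x4(uint32_t w[4])
{
	for (int i = 0; i < 4; i++)
		w[i] = __builtin_bswap32(w[i]);
}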