diff options
Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S.sh')
-rwxr-xr-x | libbb/hash_md5_sha_x86-64.S.sh | 440 |
1 files changed, 296 insertions, 144 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 901896e6e..87c2d0800 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh | |||
@@ -6,33 +6,103 @@ | |||
6 | # also contains the diff of the generated file. | 6 | # also contains the diff of the generated file. |
7 | exec >hash_md5_sha_x86-64.S | 7 | exec >hash_md5_sha_x86-64.S |
8 | 8 | ||
9 | # There is a way to use XMM registers (which always exist for x86-64!) for W[] | 9 | # Based on http://arctic.org/~dean/crypto/sha1.html. |
10 | # For example, if we load W as follows: | 10 | # ("This SHA1 implementation is public domain.") |
11 | # %xmm0: w[0x0] w[0x1] w[0x2] w[0x3] | 11 | # |
12 | # %xmm4: w[0x4] w[0x5] w[0x6] w[0x7] | 12 | # x86-64 has at least SSE2 vector insns always available. |
13 | # %xmm8: w[0x8] w[0x9] w[0xa] w[0xb] | 13 | # We can use them without any CPUID checks (and without a need |
14 | # %xmm12: w[0xc] w[0xd] w[0xe] w[0xf] | 14 | # for fallback code if the needed insns are not available). |
15 | # then the xor'ing operation to generate next W[0..3] is: | 15 | # This code uses them to calculate W[] ahead of time. |
16 | # movaps %xmm0, %xmmT2 | 16 | # |
17 | # palignr $0x8, %xmm4, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5]) | 17 | # Unfortunately, results are passed from vector unit to |
18 | # # Right-shifts xmm4:xmmT2 by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn. | 18 | # integer ALUs on the stack. MOVD/Q insns to move them directly |
19 | # movaps %xmm0, %xmmT13 | 19 | # from vector to integer registers are slower than store-to-load |
20 | # palignr $0x4,%xmm0,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0]) | 20 | # forwarding in LSU (on Skylake at least). |
21 | # xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13 | 21 | # |
22 | # xmm0 = rol32(xmm0,1) # no such insn, have to use pslld+psrld+or | 22 | # The win against purely integer code is small on Skylake, |
23 | # and then results can be extracted for use: | 23 | # only about 7-8%. We offload about 1/3 of our operations to the vector unit. |
24 | # movd %xmm0, %esi # new W[0] | 24 | # It can do 4 ops at once in one 128-bit register, |
25 | # pextrd $1, %xmm0, %esi # new W[1] | 25 | # but we have to use x2 of them because of W[0] complication, |
26 | # # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1) | 26 | # SSE2 has no "rotate each word by N bits" insns, |
27 | # pextrd $2, %xmm0, %esi # new W[2] | 27 | # moving data to/from vector unit is clunky, and Skylake |
28 | # pextrd $3, %xmm0, %esi # new W[3] | 28 | # has four integer ALUs unified with three vector ALUs, |
29 | # ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64. | 29 | # which makes pure integer code rather fast, and makes |
30 | # vector ops compete with integer ones. | ||
31 | # | ||
32 | # Zen3, with its separate vector ALUs, wins more, about 12%. | ||
33 | |||
34 | xmmT1="%xmm4" | ||
35 | xmmT2="%xmm5" | ||
36 | xmmRCONST="%xmm6" | ||
37 | T=`printf '\t'` | ||
38 | |||
39 | # SSE instructions are longer than 4 bytes on average. | ||
40 | # Intel CPUs (up to Tiger Lake at least) can't decode | ||
41 | # more than 16 bytes of code in one cycle. | ||
42 | # By interleaving SSE code and integer code | ||
43 | # we mostly achieve a situation where 16-byte decode fetch window | ||
44 | # contains 4 (or more) insns. | ||
45 | # | ||
46 | # However, on Skylake there was no observed difference, | ||
47 | # but on Zen3, non-interleaved code is ~3% faster | ||
48 | # (822 Mb/s versus 795 Mb/s hashing speed). | ||
49 | # Off for now: | ||
50 | interleave=false | ||
51 | |||
52 | INTERLEAVE() { | ||
53 | $interleave || \ | ||
54 | { | ||
55 | # Generate non-interleaved code | ||
56 | # (it should work correctly too) | ||
57 | echo "$1" | ||
58 | echo "$2" | ||
59 | return | ||
60 | } | ||
61 | ( | ||
62 | echo "$1" | grep -v '^$' >"$0.temp1" | ||
63 | echo "$2" | grep -v '^$' >"$0.temp2" | ||
64 | exec 3<"$0.temp1" | ||
65 | exec 4<"$0.temp2" | ||
66 | IFS='' | ||
67 | while :; do | ||
68 | line1='' | ||
69 | line2='' | ||
70 | while :; do | ||
71 | read -r line1 <&3 | ||
72 | if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then | ||
73 | break | ||
74 | fi | ||
75 | echo "$line1" | ||
76 | done | ||
77 | while :; do | ||
78 | read -r line2 <&4 | ||
79 | if test "${line2:0:4}" = "${T}lea"; then | ||
80 | # We use 7-8 byte long forms of LEA. | ||
81 | # Do not interleave them with SSE insns | ||
82 | # which are also long. | ||
83 | echo "$line2" | ||
84 | read -r line2 <&4 | ||
85 | echo "$line2" | ||
86 | continue | ||
87 | fi | ||
88 | if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then | ||
89 | break | ||
90 | fi | ||
91 | echo "$line2" | ||
92 | done | ||
93 | test "$line1$line2" || break | ||
94 | echo "$line1" | ||
95 | echo "$line2" | ||
96 | done | ||
97 | rm "$0.temp1" "$0.temp2" | ||
98 | ) | ||
99 | } | ||
30 | 100 | ||
31 | echo \ | 101 | echo \ |
32 | '### Generated by hash_md5_sha_x86-64.S.sh ### | 102 | "### Generated by hash_md5_sha_x86-64.S.sh ### |
33 | 103 | ||
34 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) | 104 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) |
35 | .section .text.sha1_process_block64,"ax",@progbits | 105 | .section .text.sha1_process_block64,\"ax\",@progbits |
36 | .globl sha1_process_block64 | 106 | .globl sha1_process_block64 |
37 | .hidden sha1_process_block64 | 107 | .hidden sha1_process_block64 |
38 | .type sha1_process_block64, @function | 108 | .type sha1_process_block64, @function |
@@ -51,16 +121,10 @@ sha1_process_block64: | |||
51 | # eax..edx: a..d | 121 | # eax..edx: a..d |
52 | # ebp: e | 122 | # ebp: e |
53 | # esi,edi: temps | 123 | # esi,edi: temps |
54 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] | 124 | # xmm0..xmm3: W[] |
55 | # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) | 125 | # xmm4,xmm5: temps |
56 | movl $3, %eax | 126 | # xmm6: current round constant |
57 | 1: | 127 | # -64(%rsp): area for passing RCONST + W[] from vector to integer units |
58 | movq (%rdi,%rax,8), %rsi | ||
59 | bswapq %rsi | ||
60 | rolq $32, %rsi | ||
61 | movq %rsi, -32(%rsp,%rax,8) | ||
62 | decl %eax | ||
63 | jns 1b | ||
64 | 128 | ||
65 | movl 80(%rdi), %eax # a = ctx->hash[0] | 129 | movl 80(%rdi), %eax # a = ctx->hash[0] |
66 | movl 84(%rdi), %ebx # b = ctx->hash[1] | 130 | movl 84(%rdi), %ebx # b = ctx->hash[1] |
@@ -68,32 +132,120 @@ sha1_process_block64: | |||
68 | movl 92(%rdi), %edx # d = ctx->hash[3] | 132 | movl 92(%rdi), %edx # d = ctx->hash[3] |
69 | movl 96(%rdi), %ebp # e = ctx->hash[4] | 133 | movl 96(%rdi), %ebp # e = ctx->hash[4] |
70 | 134 | ||
135 | movaps rconst0x5A827999(%rip), $xmmRCONST | ||
136 | |||
137 | # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 | ||
138 | # instead of spilling them to stack. | ||
139 | # (We lose parallelized addition of RCONST, but LEA | ||
140 | # can do two additions at once, so...) | ||
141 | movq 4*0(%rdi), %rsi | ||
142 | movq 4*2(%rdi), %r10 | ||
143 | bswapq %rsi | ||
144 | bswapq %r10 | ||
145 | rolq \$32, %rsi # rsi = W[1]:W[0] | ||
146 | rolq \$32, %r10 | ||
147 | movq %rsi, %xmm0 | ||
148 | movq %r10, $xmmT1 | ||
149 | punpcklqdq $xmmT1, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) | ||
150 | movaps %xmm0, $xmmT1 | ||
151 | paddd $xmmRCONST, $xmmT1 | ||
152 | movups $xmmT1, -64+4*0(%rsp) | ||
153 | |||
154 | movq 4*4(%rdi), %r8 | ||
155 | movq 4*6(%rdi), %r10 | ||
156 | bswapq %r8 | ||
157 | bswapq %r10 | ||
158 | rolq \$32, %r8 | ||
159 | rolq \$32, %r10 | ||
160 | movq %r8, %xmm1 | ||
161 | movq %r10, $xmmT1 | ||
162 | punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) | ||
163 | movaps %xmm1, $xmmT1 | ||
164 | paddd $xmmRCONST, $xmmT1 | ||
165 | movups $xmmT1, -64+4*4(%rsp) | ||
166 | |||
71 | movq 4*8(%rdi), %r8 | 167 | movq 4*8(%rdi), %r8 |
72 | movq 4*10(%rdi), %r10 | 168 | movq 4*10(%rdi), %r10 |
73 | bswapq %r8 | 169 | bswapq %r8 |
74 | bswapq %r10 | 170 | bswapq %r10 |
171 | movl %r8d, %r9d # r9d = W[9] | ||
172 | rolq \$32, %r8 # r8 = W[9]:W[8] | ||
173 | movl %r10d, %r11d # r11d = W[11] | ||
174 | rolq \$32, %r10 # r10 = W[11]:W[10] | ||
175 | movq %r8, %xmm2 | ||
176 | movq %r10, $xmmT1 | ||
177 | punpcklqdq $xmmT1, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) | ||
178 | |||
75 | movq 4*12(%rdi), %r12 | 179 | movq 4*12(%rdi), %r12 |
76 | movq 4*14(%rdi), %r14 | 180 | movq 4*14(%rdi), %r14 |
77 | bswapq %r12 | 181 | bswapq %r12 |
78 | bswapq %r14 | 182 | bswapq %r14 |
79 | movl %r8d, %r9d | 183 | movl %r12d, %r13d # r13d = W[13] |
80 | shrq $32, %r8 | 184 | rolq \$32, %r12 # r12 = W[13]:W[12] |
81 | movl %r10d, %r11d | 185 | movl %r14d, %r15d # r15d = W[15] |
82 | shrq $32, %r10 | 186 | rolq \$32, %r14 # r14 = W[15]:W[14] |
83 | movl %r12d, %r13d | 187 | movq %r12, %xmm3 |
84 | shrq $32, %r12 | 188 | movq %r14, $xmmT1 |
85 | movl %r14d, %r15d | 189 | punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) |
86 | shrq $32, %r14 | 190 | " |
87 | ' | 191 | |
88 | W32() { | 192 | PREP() { |
89 | test "$1" || exit 1 | 193 | local xmmW0=$1 |
90 | test "$1" -lt 0 && exit 1 | 194 | local xmmW4=$2 |
91 | test "$1" -gt 15 && exit 1 | 195 | local xmmW8=$3 |
92 | test "$1" -lt 8 && echo "-32+4*$1(%rsp)" | 196 | local xmmW12=$4 |
93 | test "$1" -ge 8 && echo "%r${1}d" | 197 | # the above must be %xmm0..3 in some permutation |
198 | local dstmem=$5 | ||
199 | #W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); | ||
200 | #W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); | ||
201 | #W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); | ||
202 | #W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); | ||
203 | #W[3] ^= rol(W[0], 1); | ||
204 | echo "# PREP $@ | ||
205 | movaps $xmmW12, $xmmT1 | ||
206 | psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
207 | |||
208 | pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
209 | punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
210 | |||
211 | xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
212 | xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
213 | xorps $xmmT2, $xmmW0 # ^ | ||
214 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
215 | movaps $xmmW0, $xmmT2 | ||
216 | |||
217 | xorps $xmmT1, $xmmT1 # rol(W0,1): | ||
218 | pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) | ||
219 | paddd $xmmW0, $xmmW0 # shift left by 1 | ||
220 | psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 | ||
221 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
222 | |||
223 | pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
224 | movaps $xmmT2, $xmmT1 | ||
225 | pslld \$2, $xmmT2 | ||
226 | psrld \$30, $xmmT1 | ||
227 | # xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) | ||
228 | xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 | ||
229 | |||
230 | xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
231 | " | ||
232 | # movq $xmmW0, %r8 # high latency (~6 cycles) | ||
233 | # movaps $xmmW0, $xmmT1 | ||
234 | # psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower | ||
235 | # movq $xmmT1, %r10 # high latency | ||
236 | # movq %r8, %r9 | ||
237 | # movq %r10, %r11 | ||
238 | # shrq \$32, %r9 | ||
239 | # shrq \$32, %r11 | ||
240 | # ^^^ slower than passing the results on stack (!!!) | ||
241 | echo " | ||
242 | movaps $xmmW0, $xmmT2 | ||
243 | paddd $xmmRCONST, $xmmT2 | ||
244 | movups $xmmT2, $dstmem | ||
245 | " | ||
94 | } | 246 | } |
95 | 247 | ||
96 | # It's possible to interleave insns in rounds to mostly eliminate | 248 | # It's possible to interleave integer insns in rounds to mostly eliminate |
97 | # dependency chains, but this is likely to only help old Pentium-based | 249 | # dependency chains, but this is likely to only help old Pentium-based |
98 | # CPUs (ones without OOO, which can only simultaneously execute a pair | 250 | # CPUs (ones without OOO, which can only simultaneously execute a pair |
99 | # of _adjacent_ insns). | 251 | # of _adjacent_ insns). |
@@ -107,21 +259,16 @@ local n0=$(((n+0) & 15)) | |||
107 | echo " | 259 | echo " |
108 | # $n | 260 | # $n |
109 | ";test $n0 = 0 && echo " | 261 | ";test $n0 = 0 && echo " |
110 | # W[0], already in %esi | 262 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] |
111 | ";test $n0 != 0 && test $n0 -lt 8 && echo " | 263 | ";test $n0 != 0 && test $n0 -lt 8 && echo " |
112 | movl `W32 $n0`, %esi # W[n] | 264 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n] |
113 | ";test $n0 -ge 8 && echo " | 265 | ";test $n0 -ge 8 && echo " |
114 | # W[n], in %r$n0 | 266 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] |
115 | ";echo " | 267 | ";echo " |
116 | movl %e$c, %edi # c | 268 | movl %e$c, %edi # c |
117 | xorl %e$d, %edi # ^d | 269 | xorl %e$d, %edi # ^d |
118 | andl %e$b, %edi # &b | 270 | andl %e$b, %edi # &b |
119 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) | 271 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) |
120 | ";test $n0 -lt 8 && echo " | ||
121 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] | ||
122 | ";test $n0 -ge 8 && echo " | ||
123 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] | ||
124 | ";echo " | ||
125 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) | 272 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) |
126 | movl %e$a, %esi # | 273 | movl %e$a, %esi # |
127 | roll \$5, %esi # rotl32(a,5) | 274 | roll \$5, %esi # rotl32(a,5) |
@@ -138,28 +285,11 @@ local n2=$(((n+2) & 15)) | |||
138 | local n0=$(((n+0) & 15)) | 285 | local n0=$(((n+0) & 15)) |
139 | echo " | 286 | echo " |
140 | # $n | 287 | # $n |
141 | ";test $n0 -lt 8 && echo " | ||
142 | movl `W32 $n13`, %esi # W[(n+13) & 15] | ||
143 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] | ||
144 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] | ||
145 | xorl `W32 $n0`, %esi # ^W[n & 15] | ||
146 | roll %esi # | ||
147 | movl %esi, `W32 $n0` # store to W[n & 15] | ||
148 | ";test $n0 -ge 8 && echo " | ||
149 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] | ||
150 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] | ||
151 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] | ||
152 | roll `W32 $n0` # | ||
153 | ";echo " | ||
154 | movl %e$c, %edi # c | 288 | movl %e$c, %edi # c |
155 | xorl %e$d, %edi # ^d | 289 | xorl %e$d, %edi # ^d |
156 | andl %e$b, %edi # &b | 290 | andl %e$b, %edi # &b |
157 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) | 291 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) |
158 | ";test $n0 -lt 8 && echo " | 292 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] |
159 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] | ||
160 | ";test $n0 -ge 8 && echo " | ||
161 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] | ||
162 | ";echo " | ||
163 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) | 293 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) |
164 | movl %e$a, %esi # | 294 | movl %e$a, %esi # |
165 | roll \$5, %esi # rotl32(a,5) | 295 | roll \$5, %esi # rotl32(a,5) |
@@ -167,13 +297,6 @@ echo " | |||
167 | rorl \$2, %e$b # b = rotl32(b,30) | 297 | rorl \$2, %e$b # b = rotl32(b,30) |
168 | " | 298 | " |
169 | } | 299 | } |
170 | { | ||
171 | RCONST=0x5A827999 | ||
172 | RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; RD1A bx cx dx bp ax 4 | ||
173 | RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9 | ||
174 | RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14 | ||
175 | RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19 | ||
176 | } | grep -v '^$' | ||
177 | 300 | ||
178 | RD2() { | 301 | RD2() { |
179 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 | 302 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
@@ -184,27 +307,10 @@ local n2=$(((n+2) & 15)) | |||
184 | local n0=$(((n+0) & 15)) | 307 | local n0=$(((n+0) & 15)) |
185 | echo " | 308 | echo " |
186 | # $n | 309 | # $n |
187 | ";test $n0 -lt 8 && echo " | ||
188 | movl `W32 $n13`, %esi # W[(n+13) & 15] | ||
189 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] | ||
190 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] | ||
191 | xorl `W32 $n0`, %esi # ^W[n & 15] | ||
192 | roll %esi # | ||
193 | movl %esi, `W32 $n0` # store to W[n & 15] | ||
194 | ";test $n0 -ge 8 && echo " | ||
195 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] | ||
196 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] | ||
197 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] | ||
198 | roll `W32 $n0` # | ||
199 | ";echo " | ||
200 | movl %e$c, %edi # c | 310 | movl %e$c, %edi # c |
201 | xorl %e$d, %edi # ^d | 311 | xorl %e$d, %edi # ^d |
202 | xorl %e$b, %edi # ^b | 312 | xorl %e$b, %edi # ^b |
203 | ";test $n0 -lt 8 && echo " | 313 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] |
204 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] | ||
205 | ";test $n0 -ge 8 && echo " | ||
206 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] | ||
207 | ";echo " | ||
208 | addl %edi, %e$e # e += (c ^ d ^ b) | 314 | addl %edi, %e$e # e += (c ^ d ^ b) |
209 | movl %e$a, %esi # | 315 | movl %e$a, %esi # |
210 | roll \$5, %esi # rotl32(a,5) | 316 | roll \$5, %esi # rotl32(a,5) |
@@ -212,13 +318,6 @@ echo " | |||
212 | rorl \$2, %e$b # b = rotl32(b,30) | 318 | rorl \$2, %e$b # b = rotl32(b,30) |
213 | " | 319 | " |
214 | } | 320 | } |
215 | { | ||
216 | RCONST=0x6ED9EBA1 | ||
217 | RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24 | ||
218 | RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29 | ||
219 | RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34 | ||
220 | RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39 | ||
221 | } | grep -v '^$' | ||
222 | 321 | ||
223 | RD3() { | 322 | RD3() { |
224 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 | 323 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
@@ -235,53 +334,82 @@ echo " | |||
235 | andl %e$c, %esi # si: b & c | 334 | andl %e$c, %esi # si: b & c |
236 | andl %e$d, %edi # di: (b | c) & d | 335 | andl %e$d, %edi # di: (b | c) & d |
237 | orl %esi, %edi # ((b | c) & d) | (b & c) | 336 | orl %esi, %edi # ((b | c) & d) | (b & c) |
238 | ";test $n0 -lt 8 && echo " | ||
239 | movl `W32 $n13`, %esi # W[(n+13) & 15] | ||
240 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] | ||
241 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] | ||
242 | xorl `W32 $n0`, %esi # ^W[n & 15] | ||
243 | roll %esi # | ||
244 | movl %esi, `W32 $n0` # store to W[n & 15] | ||
245 | ";test $n0 -ge 8 && echo " | ||
246 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] | ||
247 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] | ||
248 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] | ||
249 | roll `W32 $n0` # | ||
250 | ";echo " | ||
251 | addl %edi, %e$e # += ((b | c) & d) | (b & c) | 337 | addl %edi, %e$e # += ((b | c) & d) | (b & c) |
252 | ";test $n0 -lt 8 && echo " | 338 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] |
253 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] | ||
254 | ";test $n0 -ge 8 && echo " | ||
255 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] | ||
256 | ";echo " | ||
257 | movl %e$a, %esi # | 339 | movl %e$a, %esi # |
258 | roll \$5, %esi # rotl32(a,5) | 340 | roll \$5, %esi # rotl32(a,5) |
259 | addl %esi, %e$e # e += rotl32(a,5) | 341 | addl %esi, %e$e # e += rotl32(a,5) |
260 | rorl \$2, %e$b # b = rotl32(b,30) | 342 | rorl \$2, %e$b # b = rotl32(b,30) |
261 | " | 343 | " |
262 | } | 344 | } |
345 | |||
263 | { | 346 | { |
264 | #RCONST=0x8F1BBCDC "out of range for signed 32bit displacement" | 347 | # Round 1 |
265 | RCONST=-0x70E44324 | 348 | RCONST=0x5A827999 |
266 | RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44 | 349 | RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; |
267 | RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49 | 350 | RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; |
268 | RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54 | 351 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` |
269 | RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59 | 352 | b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` |
270 | } | grep -v '^$' | 353 | INTERLEAVE "$a" "$b" |
354 | a=`echo " movaps rconst0x6ED9EBA1(%rip), $xmmRCONST" | ||
355 | PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` | ||
356 | b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;` | ||
357 | INTERLEAVE "$a" "$b" | ||
358 | a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` | ||
359 | b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` | ||
360 | INTERLEAVE "$a" "$b" | ||
361 | |||
362 | # Round 2 | ||
363 | RCONST=0x6ED9EBA1 | ||
364 | a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` | ||
365 | b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` | ||
366 | INTERLEAVE "$a" "$b" | ||
367 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` | ||
368 | b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` | ||
369 | INTERLEAVE "$a" "$b" | ||
370 | a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` | ||
371 | b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` | ||
372 | INTERLEAVE "$a" "$b" | ||
373 | a=`echo " movaps rconst0x8F1BBCDC(%rip), $xmmRCONST" | ||
374 | PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` | ||
375 | b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` | ||
376 | INTERLEAVE "$a" "$b" | ||
377 | a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` | ||
378 | b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` | ||
379 | INTERLEAVE "$a" "$b" | ||
380 | |||
381 | # Round 3 | ||
382 | RCONST=0x8F1BBCDC | ||
383 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` | ||
384 | b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` | ||
385 | INTERLEAVE "$a" "$b" | ||
386 | a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` | ||
387 | b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` | ||
388 | INTERLEAVE "$a" "$b" | ||
389 | a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` | ||
390 | b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` | ||
391 | INTERLEAVE "$a" "$b" | ||
392 | a=`echo " movaps rconst0xCA62C1D6(%rip), $xmmRCONST" | ||
393 | PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` | ||
394 | b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` | ||
395 | INTERLEAVE "$a" "$b" | ||
396 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` | ||
397 | b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` | ||
398 | INTERLEAVE "$a" "$b" | ||
271 | 399 | ||
272 | # Round 4 has the same logic as round 2, only n and RCONST are different | 400 | # Round 4 has the same logic as round 2, only n and RCONST are different |
273 | { | 401 | RCONST=0xCA62C1D6 |
274 | #RCONST=0xCA62C1D6 "out of range for signed 32bit displacement" | 402 | a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` |
275 | RCONST=-0x359D3E2A | 403 | b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` |
276 | RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64 | 404 | INTERLEAVE "$a" "$b" |
277 | RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69 | 405 | a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` |
278 | RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74 | 406 | b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` |
279 | RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79 | 407 | INTERLEAVE "$a" "$b" |
280 | # Note: new W[n&15] values generated in last 3 iterations | 408 | a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` |
281 | # (W[13,14,15]) are unused after each of these iterations. | 409 | b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` |
282 | # Since we use r8..r15 for W[8..15], this does not matter. | 410 | INTERLEAVE "$a" "$b" |
283 | # If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15] | 411 | RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; |
284 | # (the "movl %esi, `W32 $n0`" insn) is a dead store and can be removed. | 412 | RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; |
285 | } | grep -v '^$' | 413 | } | grep -v '^$' |
286 | 414 | ||
287 | echo " | 415 | echo " |
@@ -300,4 +428,28 @@ echo " | |||
300 | 428 | ||
301 | ret | 429 | ret |
302 | .size sha1_process_block64, .-sha1_process_block64 | 430 | .size sha1_process_block64, .-sha1_process_block64 |
431 | |||
432 | .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 | ||
433 | .align 16 | ||
434 | rconst0x5A827999: | ||
435 | .long 0x5A827999 | ||
436 | .long 0x5A827999 | ||
437 | .long 0x5A827999 | ||
438 | .long 0x5A827999 | ||
439 | rconst0x6ED9EBA1: | ||
440 | .long 0x6ED9EBA1 | ||
441 | .long 0x6ED9EBA1 | ||
442 | .long 0x6ED9EBA1 | ||
443 | .long 0x6ED9EBA1 | ||
444 | rconst0x8F1BBCDC: | ||
445 | .long 0x8F1BBCDC | ||
446 | .long 0x8F1BBCDC | ||
447 | .long 0x8F1BBCDC | ||
448 | .long 0x8F1BBCDC | ||
449 | rconst0xCA62C1D6: | ||
450 | .long 0xCA62C1D6 | ||
451 | .long 0xCA62C1D6 | ||
452 | .long 0xCA62C1D6 | ||
453 | .long 0xCA62C1D6 | ||
454 | |||
303 | #endif" | 455 | #endif" |