author     Denys Vlasenko <vda.linux@googlemail.com>  2022-01-23 09:27:30 +0100
committer  Denys Vlasenko <vda.linux@googlemail.com>  2022-01-23 12:57:27 +0100
commit     39369ff460f3e2dbfec7f6be181b2fb98f3c1867 (patch)
tree       1a67a6376490c729fb58944002cdcabb262b2f50
parent     1e825acf8d715fe49af040cb02f9e96c26955832 (diff)
libbb/sha1: use SSE2 in unrolled x86-64 code. ~10% faster
function                                             old     new   delta
.rodata                                           108241  108305     +64
sha1_process_block64                                3502    3495      -7
------------------------------------------------------------------------------
(add/remove: 5/0 grow/shrink: 1/1 up/down: 64/-7)              Total: 57 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
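
Note on reading the unrolled rounds in the diff below: every step follows the pattern spelled out in the assembly comments — e += RCONST + W[n]; e += F(b,c,d); e += rotl32(a,5); b = rotl32(b,30) — with the five registers rotating roles from one step to the next rather than values being shifted between registers. A minimal C sketch of one step of rounds 0..19, under that reading (helper names are illustrative, not taken from the busybox sources):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* One unrolled step of rounds 0..19, following the assembly comments:
 * F = (((c ^ d) & b) ^ d), RCONST = 0x5A827999.  Rounds 20..39 and 60..79
 * use F = (b ^ c ^ d), rounds 40..59 use F = ((b | c) & d) | (b & c),
 * each with its own round constant. */
static void sha1_step_0_19(uint32_t *a, uint32_t *b, uint32_t *c,
                           uint32_t *d, uint32_t *e, uint32_t w)
{
	*e += 0x5A827999 + w;              /* e += RCONST + W[n]        */
	*e += (((*c ^ *d) & *b) ^ *d);     /* e += (((c ^ d) & b) ^ d)  */
	*e += rotl32(*a, 5);               /* e += rotl32(a,5)          */
	*b = rotl32(*b, 30);               /* b = rotl32(b,30)          */
}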
-rw-r--r--  libbb/hash_md5_sha_x86-64.S     992
-rwxr-xr-x  libbb/hash_md5_sha_x86-64.S.sh  440
2 files changed, 854 insertions, 578 deletions
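
The PREP blocks added below compute four message-schedule words at a time, as their own comments describe: W[t..t+3] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1), where lane 3 at first lacks the not-yet-computed W[t] term and is patched afterwards by XORing in rol(unrotated lane 0, 2). A rough SSE2-intrinsics sketch of that dataflow (the function name and argument layout are assumptions for illustration, not code from this commit):

#include <emmintrin.h>	/* SSE2 intrinsics */

/* Compute (W[t],W[t+1],W[t+2],W[t+3]) from the previous 16 words:
 * w0 = W[t-16..t-13], w1 = W[t-12..t-9], w2 = W[t-8..t-5], w3 = W[t-4..t-1],
 * lowest index in the lowest 32-bit lane, mirroring the PREP blocks. */
static __m128i sha1_sched4(__m128i w0, __m128i w1, __m128i w2, __m128i w3)
{
	__m128i t1, t2, unrot, msb;

	t1 = _mm_srli_si128(w3, 4);             /* (W[t-3],W[t-2],W[t-1],0)          */
	t2 = _mm_shuffle_epi32(w0, 0x4e);       /* (W[t-14],W[t-13],x,x)             */
	t2 = _mm_unpacklo_epi64(t2, w1);        /* (W[t-14],W[t-13],W[t-12],W[t-11]) */
	w0 = _mm_xor_si128(w0, w2);             /* W[t-16..] ^ W[t-8..]              */
	t2 = _mm_xor_si128(t2, t1);             /* W[t-14..] ^ (W[t-3..],0)          */
	w0 = _mm_xor_si128(w0, t2);             /* unrotated; lane 3 lacks W[t]      */

	unrot = w0;
	msb = _mm_cmpgt_epi32(_mm_setzero_si128(), w0);  /* -1 where sign bit set    */
	w0 = _mm_add_epi32(w0, w0);             /* x << 1                            */
	w0 = _mm_sub_epi32(w0, msb);            /* rol(x,1) = (x << 1) | (x >> 31)   */

	unrot = _mm_slli_si128(unrot, 12);      /* (0,0,0, unrotated lane 0)         */
	w0 = _mm_xor_si128(w0, _mm_srli_epi32(unrot, 30));
	w0 = _mm_xor_si128(w0, _mm_slli_epi32(unrot, 2));  /* lane 3 ^= rol(W[t],1)  */
	return w0;                              /* = (W[t],W[t+1],W[t+2],W[t+3])     */
}

Each PREP block also pre-adds the current round constant (paddd %xmm6) before spilling the words with movups, which is why the integer rounds that consume them need only a single addl -64+4*n(%rsp), where the old code loaded W[n] and then used leal with an immediate constant.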
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 87fb616a1..069a18719 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -20,16 +20,10 @@ sha1_process_block64:
20 | # eax..edx: a..d | 20 | # eax..edx: a..d |
21 | # ebp: e | 21 | # ebp: e |
22 | # esi,edi: temps | 22 | # esi,edi: temps |
23 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] | 23 | # xmm0..xmm3: W[] |
24 | # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) | 24 | # xmm4,xmm5: temps |
25 | movl $3, %eax | 25 | # xmm6: current round constant |
26 | 1: | 26 | # -64(%rsp): area for passing RCONST + W[] from vector to integer units |
27 | movq (%rdi,%rax,8), %rsi | ||
28 | bswapq %rsi | ||
29 | rolq $32, %rsi | ||
30 | movq %rsi, -32(%rsp,%rax,8) | ||
31 | decl %eax | ||
32 | jns 1b | ||
33 | 27 | ||
34 | movl 80(%rdi), %eax # a = ctx->hash[0] | 28 | movl 80(%rdi), %eax # a = ctx->hash[0] |
35 | movl 84(%rdi), %ebx # b = ctx->hash[1] | 29 | movl 84(%rdi), %ebx # b = ctx->hash[1] |
@@ -37,587 +31,709 @@ sha1_process_block64:
37 | movl 92(%rdi), %edx # d = ctx->hash[3] | 31 | movl 92(%rdi), %edx # d = ctx->hash[3] |
38 | movl 96(%rdi), %ebp # e = ctx->hash[4] | 32 | movl 96(%rdi), %ebp # e = ctx->hash[4] |
39 | 33 | ||
34 | movaps rconst0x5A827999(%rip), %xmm6 | ||
35 | |||
36 | # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 | ||
37 | # instead of spilling them to stack. | ||
38 | # (We lose parallelized addition of RCONST, but LEA | ||
39 | # can do two additions at once, so...) | ||
40 | movq 4*0(%rdi), %rsi | ||
41 | movq 4*2(%rdi), %r10 | ||
42 | bswapq %rsi | ||
43 | bswapq %r10 | ||
44 | rolq $32, %rsi # rsi = W[1]:W[0] | ||
45 | rolq $32, %r10 | ||
46 | movq %rsi, %xmm0 | ||
47 | movq %r10, %xmm4 | ||
48 | punpcklqdq %xmm4, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) | ||
49 | movaps %xmm0, %xmm4 | ||
50 | paddd %xmm6, %xmm4 | ||
51 | movups %xmm4, -64+4*0(%rsp) | ||
52 | |||
53 | movq 4*4(%rdi), %r8 | ||
54 | movq 4*6(%rdi), %r10 | ||
55 | bswapq %r8 | ||
56 | bswapq %r10 | ||
57 | rolq $32, %r8 | ||
58 | rolq $32, %r10 | ||
59 | movq %r8, %xmm1 | ||
60 | movq %r10, %xmm4 | ||
61 | punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) | ||
62 | movaps %xmm1, %xmm4 | ||
63 | paddd %xmm6, %xmm4 | ||
64 | movups %xmm4, -64+4*4(%rsp) | ||
65 | |||
40 | movq 4*8(%rdi), %r8 | 66 | movq 4*8(%rdi), %r8 |
41 | movq 4*10(%rdi), %r10 | 67 | movq 4*10(%rdi), %r10 |
42 | bswapq %r8 | 68 | bswapq %r8 |
43 | bswapq %r10 | 69 | bswapq %r10 |
70 | movl %r8d, %r9d # r9d = W[9] | ||
71 | rolq $32, %r8 # r8 = W[9]:W[8] | ||
72 | movl %r10d, %r11d # r11d = W[11] | ||
73 | rolq $32, %r10 # r10 = W[11]:W[10] | ||
74 | movq %r8, %xmm2 | ||
75 | movq %r10, %xmm4 | ||
76 | punpcklqdq %xmm4, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) | ||
77 | |||
44 | movq 4*12(%rdi), %r12 | 78 | movq 4*12(%rdi), %r12 |
45 | movq 4*14(%rdi), %r14 | 79 | movq 4*14(%rdi), %r14 |
46 | bswapq %r12 | 80 | bswapq %r12 |
47 | bswapq %r14 | 81 | bswapq %r14 |
48 | movl %r8d, %r9d | 82 | movl %r12d, %r13d # r13d = W[13] |
49 | shrq $32, %r8 | 83 | rolq $32, %r12 # r12 = W[13]:W[12] |
50 | movl %r10d, %r11d | 84 | movl %r14d, %r15d # r15d = W[15] |
51 | shrq $32, %r10 | 85 | rolq $32, %r14 # r14 = W[15]:W[14] |
52 | movl %r12d, %r13d | 86 | movq %r12, %xmm3 |
53 | shrq $32, %r12 | 87 | movq %r14, %xmm4 |
54 | movl %r14d, %r15d | 88 | punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) |
55 | shrq $32, %r14 | ||
56 | 89 | ||
57 | # 0 | 90 | # 0 |
58 | # W[0], already in %esi | 91 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] |
59 | movl %ecx, %edi # c | 92 | movl %ecx, %edi # c |
60 | xorl %edx, %edi # ^d | 93 | xorl %edx, %edi # ^d |
61 | andl %ebx, %edi # &b | 94 | andl %ebx, %edi # &b |
62 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 95 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
63 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] | ||
64 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 96 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
65 | movl %eax, %esi # | 97 | movl %eax, %esi # |
66 | roll $5, %esi # rotl32(a,5) | 98 | roll $5, %esi # rotl32(a,5) |
67 | addl %esi, %ebp # e += rotl32(a,5) | 99 | addl %esi, %ebp # e += rotl32(a,5) |
68 | rorl $2, %ebx # b = rotl32(b,30) | 100 | rorl $2, %ebx # b = rotl32(b,30) |
69 | # 1 | 101 | # 1 |
70 | movl -32+4*1(%rsp), %esi # W[n] | 102 | addl -64+4*1(%rsp), %edx # e += RCONST + W[n] |
71 | movl %ebx, %edi # c | 103 | movl %ebx, %edi # c |
72 | xorl %ecx, %edi # ^d | 104 | xorl %ecx, %edi # ^d |
73 | andl %eax, %edi # &b | 105 | andl %eax, %edi # &b |
74 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 106 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
75 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] | ||
76 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 107 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
77 | movl %ebp, %esi # | 108 | movl %ebp, %esi # |
78 | roll $5, %esi # rotl32(a,5) | 109 | roll $5, %esi # rotl32(a,5) |
79 | addl %esi, %edx # e += rotl32(a,5) | 110 | addl %esi, %edx # e += rotl32(a,5) |
80 | rorl $2, %eax # b = rotl32(b,30) | 111 | rorl $2, %eax # b = rotl32(b,30) |
81 | # 2 | 112 | # 2 |
82 | movl -32+4*2(%rsp), %esi # W[n] | 113 | addl -64+4*2(%rsp), %ecx # e += RCONST + W[n] |
83 | movl %eax, %edi # c | 114 | movl %eax, %edi # c |
84 | xorl %ebx, %edi # ^d | 115 | xorl %ebx, %edi # ^d |
85 | andl %ebp, %edi # &b | 116 | andl %ebp, %edi # &b |
86 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 117 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
87 | leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] | ||
88 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 118 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
89 | movl %edx, %esi # | 119 | movl %edx, %esi # |
90 | roll $5, %esi # rotl32(a,5) | 120 | roll $5, %esi # rotl32(a,5) |
91 | addl %esi, %ecx # e += rotl32(a,5) | 121 | addl %esi, %ecx # e += rotl32(a,5) |
92 | rorl $2, %ebp # b = rotl32(b,30) | 122 | rorl $2, %ebp # b = rotl32(b,30) |
93 | # 3 | 123 | # 3 |
94 | movl -32+4*3(%rsp), %esi # W[n] | 124 | addl -64+4*3(%rsp), %ebx # e += RCONST + W[n] |
95 | movl %ebp, %edi # c | 125 | movl %ebp, %edi # c |
96 | xorl %eax, %edi # ^d | 126 | xorl %eax, %edi # ^d |
97 | andl %edx, %edi # &b | 127 | andl %edx, %edi # &b |
98 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 128 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
99 | leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n] | ||
100 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 129 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
101 | movl %ecx, %esi # | 130 | movl %ecx, %esi # |
102 | roll $5, %esi # rotl32(a,5) | 131 | roll $5, %esi # rotl32(a,5) |
103 | addl %esi, %ebx # e += rotl32(a,5) | 132 | addl %esi, %ebx # e += rotl32(a,5) |
104 | rorl $2, %edx # b = rotl32(b,30) | 133 | rorl $2, %edx # b = rotl32(b,30) |
105 | # 4 | 134 | # 4 |
106 | movl -32+4*4(%rsp), %esi # W[n] | 135 | addl -64+4*4(%rsp), %eax # e += RCONST + W[n] |
107 | movl %edx, %edi # c | 136 | movl %edx, %edi # c |
108 | xorl %ebp, %edi # ^d | 137 | xorl %ebp, %edi # ^d |
109 | andl %ecx, %edi # &b | 138 | andl %ecx, %edi # &b |
110 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 139 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
111 | leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n] | ||
112 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 140 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
113 | movl %ebx, %esi # | 141 | movl %ebx, %esi # |
114 | roll $5, %esi # rotl32(a,5) | 142 | roll $5, %esi # rotl32(a,5) |
115 | addl %esi, %eax # e += rotl32(a,5) | 143 | addl %esi, %eax # e += rotl32(a,5) |
116 | rorl $2, %ecx # b = rotl32(b,30) | 144 | rorl $2, %ecx # b = rotl32(b,30) |
117 | # 5 | 145 | # 5 |
118 | movl -32+4*5(%rsp), %esi # W[n] | 146 | addl -64+4*5(%rsp), %ebp # e += RCONST + W[n] |
119 | movl %ecx, %edi # c | 147 | movl %ecx, %edi # c |
120 | xorl %edx, %edi # ^d | 148 | xorl %edx, %edi # ^d |
121 | andl %ebx, %edi # &b | 149 | andl %ebx, %edi # &b |
122 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 150 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
123 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] | ||
124 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 151 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
125 | movl %eax, %esi # | 152 | movl %eax, %esi # |
126 | roll $5, %esi # rotl32(a,5) | 153 | roll $5, %esi # rotl32(a,5) |
127 | addl %esi, %ebp # e += rotl32(a,5) | 154 | addl %esi, %ebp # e += rotl32(a,5) |
128 | rorl $2, %ebx # b = rotl32(b,30) | 155 | rorl $2, %ebx # b = rotl32(b,30) |
129 | # 6 | 156 | # 6 |
130 | movl -32+4*6(%rsp), %esi # W[n] | 157 | addl -64+4*6(%rsp), %edx # e += RCONST + W[n] |
131 | movl %ebx, %edi # c | 158 | movl %ebx, %edi # c |
132 | xorl %ecx, %edi # ^d | 159 | xorl %ecx, %edi # ^d |
133 | andl %eax, %edi # &b | 160 | andl %eax, %edi # &b |
134 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 161 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
135 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] | ||
136 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 162 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
137 | movl %ebp, %esi # | 163 | movl %ebp, %esi # |
138 | roll $5, %esi # rotl32(a,5) | 164 | roll $5, %esi # rotl32(a,5) |
139 | addl %esi, %edx # e += rotl32(a,5) | 165 | addl %esi, %edx # e += rotl32(a,5) |
140 | rorl $2, %eax # b = rotl32(b,30) | 166 | rorl $2, %eax # b = rotl32(b,30) |
141 | # 7 | 167 | # 7 |
142 | movl -32+4*7(%rsp), %esi # W[n] | 168 | addl -64+4*7(%rsp), %ecx # e += RCONST + W[n] |
143 | movl %eax, %edi # c | 169 | movl %eax, %edi # c |
144 | xorl %ebx, %edi # ^d | 170 | xorl %ebx, %edi # ^d |
145 | andl %ebp, %edi # &b | 171 | andl %ebp, %edi # &b |
146 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 172 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
147 | leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] | ||
148 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 173 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
149 | movl %edx, %esi # | 174 | movl %edx, %esi # |
150 | roll $5, %esi # rotl32(a,5) | 175 | roll $5, %esi # rotl32(a,5) |
151 | addl %esi, %ecx # e += rotl32(a,5) | 176 | addl %esi, %ecx # e += rotl32(a,5) |
152 | rorl $2, %ebp # b = rotl32(b,30) | 177 | rorl $2, %ebp # b = rotl32(b,30) |
178 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
179 | movaps %xmm3, %xmm4 | ||
180 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
181 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
182 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
183 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
184 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
185 | xorps %xmm5, %xmm0 # ^ | ||
186 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
187 | movaps %xmm0, %xmm5 | ||
188 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
189 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
190 | paddd %xmm0, %xmm0 # shift left by 1 | ||
191 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
192 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
193 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
194 | movaps %xmm5, %xmm4 | ||
195 | pslld $2, %xmm5 | ||
196 | psrld $30, %xmm4 | ||
197 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
198 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
199 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
200 | movaps %xmm0, %xmm5 | ||
201 | paddd %xmm6, %xmm5 | ||
202 | movups %xmm5, -64+16*0(%rsp) | ||
153 | # 8 | 203 | # 8 |
154 | # W[n], in %r8 | 204 | leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] |
155 | movl %ebp, %edi # c | 205 | movl %ebp, %edi # c |
156 | xorl %eax, %edi # ^d | 206 | xorl %eax, %edi # ^d |
157 | andl %edx, %edi # &b | 207 | andl %edx, %edi # &b |
158 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 208 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
159 | leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] | ||
160 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 209 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
161 | movl %ecx, %esi # | 210 | movl %ecx, %esi # |
162 | roll $5, %esi # rotl32(a,5) | 211 | roll $5, %esi # rotl32(a,5) |
163 | addl %esi, %ebx # e += rotl32(a,5) | 212 | addl %esi, %ebx # e += rotl32(a,5) |
164 | rorl $2, %edx # b = rotl32(b,30) | 213 | rorl $2, %edx # b = rotl32(b,30) |
165 | # 9 | 214 | # 9 |
166 | # W[n], in %r9 | 215 | leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] |
167 | movl %edx, %edi # c | 216 | movl %edx, %edi # c |
168 | xorl %ebp, %edi # ^d | 217 | xorl %ebp, %edi # ^d |
169 | andl %ecx, %edi # &b | 218 | andl %ecx, %edi # &b |
170 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 219 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
171 | leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] | ||
172 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 220 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
173 | movl %ebx, %esi # | 221 | movl %ebx, %esi # |
174 | roll $5, %esi # rotl32(a,5) | 222 | roll $5, %esi # rotl32(a,5) |
175 | addl %esi, %eax # e += rotl32(a,5) | 223 | addl %esi, %eax # e += rotl32(a,5) |
176 | rorl $2, %ecx # b = rotl32(b,30) | 224 | rorl $2, %ecx # b = rotl32(b,30) |
177 | # 10 | 225 | # 10 |
178 | # W[n], in %r10 | 226 | leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] |
179 | movl %ecx, %edi # c | 227 | movl %ecx, %edi # c |
180 | xorl %edx, %edi # ^d | 228 | xorl %edx, %edi # ^d |
181 | andl %ebx, %edi # &b | 229 | andl %ebx, %edi # &b |
182 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 230 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
183 | leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] | ||
184 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 231 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
185 | movl %eax, %esi # | 232 | movl %eax, %esi # |
186 | roll $5, %esi # rotl32(a,5) | 233 | roll $5, %esi # rotl32(a,5) |
187 | addl %esi, %ebp # e += rotl32(a,5) | 234 | addl %esi, %ebp # e += rotl32(a,5) |
188 | rorl $2, %ebx # b = rotl32(b,30) | 235 | rorl $2, %ebx # b = rotl32(b,30) |
189 | # 11 | 236 | # 11 |
190 | # W[n], in %r11 | 237 | leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] |
191 | movl %ebx, %edi # c | 238 | movl %ebx, %edi # c |
192 | xorl %ecx, %edi # ^d | 239 | xorl %ecx, %edi # ^d |
193 | andl %eax, %edi # &b | 240 | andl %eax, %edi # &b |
194 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 241 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
195 | leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] | ||
196 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 242 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
197 | movl %ebp, %esi # | 243 | movl %ebp, %esi # |
198 | roll $5, %esi # rotl32(a,5) | 244 | roll $5, %esi # rotl32(a,5) |
199 | addl %esi, %edx # e += rotl32(a,5) | 245 | addl %esi, %edx # e += rotl32(a,5) |
200 | rorl $2, %eax # b = rotl32(b,30) | 246 | rorl $2, %eax # b = rotl32(b,30) |
247 | movaps rconst0x6ED9EBA1(%rip), %xmm6 | ||
248 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
249 | movaps %xmm0, %xmm4 | ||
250 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
251 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
252 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
253 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
254 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
255 | xorps %xmm5, %xmm1 # ^ | ||
256 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
257 | movaps %xmm1, %xmm5 | ||
258 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
259 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
260 | paddd %xmm1, %xmm1 # shift left by 1 | ||
261 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
262 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
263 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
264 | movaps %xmm5, %xmm4 | ||
265 | pslld $2, %xmm5 | ||
266 | psrld $30, %xmm4 | ||
267 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
268 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
269 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
270 | movaps %xmm1, %xmm5 | ||
271 | paddd %xmm6, %xmm5 | ||
272 | movups %xmm5, -64+16*1(%rsp) | ||
201 | # 12 | 273 | # 12 |
202 | # W[n], in %r12 | 274 | leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] |
203 | movl %eax, %edi # c | 275 | movl %eax, %edi # c |
204 | xorl %ebx, %edi # ^d | 276 | xorl %ebx, %edi # ^d |
205 | andl %ebp, %edi # &b | 277 | andl %ebp, %edi # &b |
206 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 278 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
207 | leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] | ||
208 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 279 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
209 | movl %edx, %esi # | 280 | movl %edx, %esi # |
210 | roll $5, %esi # rotl32(a,5) | 281 | roll $5, %esi # rotl32(a,5) |
211 | addl %esi, %ecx # e += rotl32(a,5) | 282 | addl %esi, %ecx # e += rotl32(a,5) |
212 | rorl $2, %ebp # b = rotl32(b,30) | 283 | rorl $2, %ebp # b = rotl32(b,30) |
213 | # 13 | 284 | # 13 |
214 | # W[n], in %r13 | 285 | leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] |
215 | movl %ebp, %edi # c | 286 | movl %ebp, %edi # c |
216 | xorl %eax, %edi # ^d | 287 | xorl %eax, %edi # ^d |
217 | andl %edx, %edi # &b | 288 | andl %edx, %edi # &b |
218 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 289 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
219 | leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] | ||
220 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 290 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
221 | movl %ecx, %esi # | 291 | movl %ecx, %esi # |
222 | roll $5, %esi # rotl32(a,5) | 292 | roll $5, %esi # rotl32(a,5) |
223 | addl %esi, %ebx # e += rotl32(a,5) | 293 | addl %esi, %ebx # e += rotl32(a,5) |
224 | rorl $2, %edx # b = rotl32(b,30) | 294 | rorl $2, %edx # b = rotl32(b,30) |
225 | # 14 | 295 | # 14 |
226 | # W[n], in %r14 | 296 | leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] |
227 | movl %edx, %edi # c | 297 | movl %edx, %edi # c |
228 | xorl %ebp, %edi # ^d | 298 | xorl %ebp, %edi # ^d |
229 | andl %ecx, %edi # &b | 299 | andl %ecx, %edi # &b |
230 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 300 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
231 | leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] | ||
232 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 301 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
233 | movl %ebx, %esi # | 302 | movl %ebx, %esi # |
234 | roll $5, %esi # rotl32(a,5) | 303 | roll $5, %esi # rotl32(a,5) |
235 | addl %esi, %eax # e += rotl32(a,5) | 304 | addl %esi, %eax # e += rotl32(a,5) |
236 | rorl $2, %ecx # b = rotl32(b,30) | 305 | rorl $2, %ecx # b = rotl32(b,30) |
237 | # 15 | 306 | # 15 |
238 | # W[n], in %r15 | 307 | leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] |
239 | movl %ecx, %edi # c | 308 | movl %ecx, %edi # c |
240 | xorl %edx, %edi # ^d | 309 | xorl %edx, %edi # ^d |
241 | andl %ebx, %edi # &b | 310 | andl %ebx, %edi # &b |
242 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 311 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
243 | leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] | ||
244 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 312 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
245 | movl %eax, %esi # | 313 | movl %eax, %esi # |
246 | roll $5, %esi # rotl32(a,5) | 314 | roll $5, %esi # rotl32(a,5) |
247 | addl %esi, %ebp # e += rotl32(a,5) | 315 | addl %esi, %ebp # e += rotl32(a,5) |
248 | rorl $2, %ebx # b = rotl32(b,30) | 316 | rorl $2, %ebx # b = rotl32(b,30) |
317 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
318 | movaps %xmm1, %xmm4 | ||
319 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
320 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
321 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
322 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
323 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
324 | xorps %xmm5, %xmm2 # ^ | ||
325 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
326 | movaps %xmm2, %xmm5 | ||
327 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
328 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
329 | paddd %xmm2, %xmm2 # shift left by 1 | ||
330 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
331 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
332 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
333 | movaps %xmm5, %xmm4 | ||
334 | pslld $2, %xmm5 | ||
335 | psrld $30, %xmm4 | ||
336 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
337 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
338 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
339 | movaps %xmm2, %xmm5 | ||
340 | paddd %xmm6, %xmm5 | ||
341 | movups %xmm5, -64+16*2(%rsp) | ||
249 | # 16 | 342 | # 16 |
250 | movl %r13d, %esi # W[(n+13) & 15] | ||
251 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
252 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
253 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
254 | roll %esi # | ||
255 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
256 | movl %ebx, %edi # c | 343 | movl %ebx, %edi # c |
257 | xorl %ecx, %edi # ^d | 344 | xorl %ecx, %edi # ^d |
258 | andl %eax, %edi # &b | 345 | andl %eax, %edi # &b |
259 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 346 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
260 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 347 | addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] |
261 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 348 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
262 | movl %ebp, %esi # | 349 | movl %ebp, %esi # |
263 | roll $5, %esi # rotl32(a,5) | 350 | roll $5, %esi # rotl32(a,5) |
264 | addl %esi, %edx # e += rotl32(a,5) | 351 | addl %esi, %edx # e += rotl32(a,5) |
265 | rorl $2, %eax # b = rotl32(b,30) | 352 | rorl $2, %eax # b = rotl32(b,30) |
266 | # 17 | 353 | # 17 |
267 | movl %r14d, %esi # W[(n+13) & 15] | ||
268 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
269 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
270 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
271 | roll %esi # | ||
272 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
273 | movl %eax, %edi # c | 354 | movl %eax, %edi # c |
274 | xorl %ebx, %edi # ^d | 355 | xorl %ebx, %edi # ^d |
275 | andl %ebp, %edi # &b | 356 | andl %ebp, %edi # &b |
276 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 357 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
277 | leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 358 | addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] |
278 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 359 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
279 | movl %edx, %esi # | 360 | movl %edx, %esi # |
280 | roll $5, %esi # rotl32(a,5) | 361 | roll $5, %esi # rotl32(a,5) |
281 | addl %esi, %ecx # e += rotl32(a,5) | 362 | addl %esi, %ecx # e += rotl32(a,5) |
282 | rorl $2, %ebp # b = rotl32(b,30) | 363 | rorl $2, %ebp # b = rotl32(b,30) |
283 | # 18 | 364 | # 18 |
284 | movl %r15d, %esi # W[(n+13) & 15] | ||
285 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
286 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
287 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
288 | roll %esi # | ||
289 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
290 | movl %ebp, %edi # c | 365 | movl %ebp, %edi # c |
291 | xorl %eax, %edi # ^d | 366 | xorl %eax, %edi # ^d |
292 | andl %edx, %edi # &b | 367 | andl %edx, %edi # &b |
293 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 368 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
294 | leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 369 | addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] |
295 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 370 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
296 | movl %ecx, %esi # | 371 | movl %ecx, %esi # |
297 | roll $5, %esi # rotl32(a,5) | 372 | roll $5, %esi # rotl32(a,5) |
298 | addl %esi, %ebx # e += rotl32(a,5) | 373 | addl %esi, %ebx # e += rotl32(a,5) |
299 | rorl $2, %edx # b = rotl32(b,30) | 374 | rorl $2, %edx # b = rotl32(b,30) |
300 | # 19 | 375 | # 19 |
301 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
302 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
303 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
304 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
305 | roll %esi # | ||
306 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
307 | movl %edx, %edi # c | 376 | movl %edx, %edi # c |
308 | xorl %ebp, %edi # ^d | 377 | xorl %ebp, %edi # ^d |
309 | andl %ecx, %edi # &b | 378 | andl %ecx, %edi # &b |
310 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 379 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
311 | leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 380 | addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] |
312 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 381 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
313 | movl %ebx, %esi # | 382 | movl %ebx, %esi # |
314 | roll $5, %esi # rotl32(a,5) | 383 | roll $5, %esi # rotl32(a,5) |
315 | addl %esi, %eax # e += rotl32(a,5) | 384 | addl %esi, %eax # e += rotl32(a,5) |
316 | rorl $2, %ecx # b = rotl32(b,30) | 385 | rorl $2, %ecx # b = rotl32(b,30) |
386 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
387 | movaps %xmm2, %xmm4 | ||
388 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
389 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
390 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
391 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
392 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
393 | xorps %xmm5, %xmm3 # ^ | ||
394 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
395 | movaps %xmm3, %xmm5 | ||
396 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
397 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
398 | paddd %xmm3, %xmm3 # shift left by 1 | ||
399 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
400 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
401 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
402 | movaps %xmm5, %xmm4 | ||
403 | pslld $2, %xmm5 | ||
404 | psrld $30, %xmm4 | ||
405 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
406 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
407 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
408 | movaps %xmm3, %xmm5 | ||
409 | paddd %xmm6, %xmm5 | ||
410 | movups %xmm5, -64+16*3(%rsp) | ||
317 | # 20 | 411 | # 20 |
318 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
319 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
320 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
321 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
322 | roll %esi # | ||
323 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
324 | movl %ecx, %edi # c | 412 | movl %ecx, %edi # c |
325 | xorl %edx, %edi # ^d | 413 | xorl %edx, %edi # ^d |
326 | xorl %ebx, %edi # ^b | 414 | xorl %ebx, %edi # ^b |
327 | leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 415 | addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] |
328 | addl %edi, %ebp # e += (c ^ d ^ b) | 416 | addl %edi, %ebp # e += (c ^ d ^ b) |
329 | movl %eax, %esi # | 417 | movl %eax, %esi # |
330 | roll $5, %esi # rotl32(a,5) | 418 | roll $5, %esi # rotl32(a,5) |
331 | addl %esi, %ebp # e += rotl32(a,5) | 419 | addl %esi, %ebp # e += rotl32(a,5) |
332 | rorl $2, %ebx # b = rotl32(b,30) | 420 | rorl $2, %ebx # b = rotl32(b,30) |
333 | # 21 | 421 | # 21 |
334 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
335 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
336 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
337 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
338 | roll %esi # | ||
339 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
340 | movl %ebx, %edi # c | 422 | movl %ebx, %edi # c |
341 | xorl %ecx, %edi # ^d | 423 | xorl %ecx, %edi # ^d |
342 | xorl %eax, %edi # ^b | 424 | xorl %eax, %edi # ^b |
343 | leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 425 | addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] |
344 | addl %edi, %edx # e += (c ^ d ^ b) | 426 | addl %edi, %edx # e += (c ^ d ^ b) |
345 | movl %ebp, %esi # | 427 | movl %ebp, %esi # |
346 | roll $5, %esi # rotl32(a,5) | 428 | roll $5, %esi # rotl32(a,5) |
347 | addl %esi, %edx # e += rotl32(a,5) | 429 | addl %esi, %edx # e += rotl32(a,5) |
348 | rorl $2, %eax # b = rotl32(b,30) | 430 | rorl $2, %eax # b = rotl32(b,30) |
349 | # 22 | 431 | # 22 |
350 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
351 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
352 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
353 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
354 | roll %esi # | ||
355 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
356 | movl %eax, %edi # c | 432 | movl %eax, %edi # c |
357 | xorl %ebx, %edi # ^d | 433 | xorl %ebx, %edi # ^d |
358 | xorl %ebp, %edi # ^b | 434 | xorl %ebp, %edi # ^b |
359 | leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 435 | addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] |
360 | addl %edi, %ecx # e += (c ^ d ^ b) | 436 | addl %edi, %ecx # e += (c ^ d ^ b) |
361 | movl %edx, %esi # | 437 | movl %edx, %esi # |
362 | roll $5, %esi # rotl32(a,5) | 438 | roll $5, %esi # rotl32(a,5) |
363 | addl %esi, %ecx # e += rotl32(a,5) | 439 | addl %esi, %ecx # e += rotl32(a,5) |
364 | rorl $2, %ebp # b = rotl32(b,30) | 440 | rorl $2, %ebp # b = rotl32(b,30) |
365 | # 23 | 441 | # 23 |
366 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
367 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
368 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
369 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
370 | roll %esi # | ||
371 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
372 | movl %ebp, %edi # c | 442 | movl %ebp, %edi # c |
373 | xorl %eax, %edi # ^d | 443 | xorl %eax, %edi # ^d |
374 | xorl %edx, %edi # ^b | 444 | xorl %edx, %edi # ^b |
375 | leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 445 | addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] |
376 | addl %edi, %ebx # e += (c ^ d ^ b) | 446 | addl %edi, %ebx # e += (c ^ d ^ b) |
377 | movl %ecx, %esi # | 447 | movl %ecx, %esi # |
378 | roll $5, %esi # rotl32(a,5) | 448 | roll $5, %esi # rotl32(a,5) |
379 | addl %esi, %ebx # e += rotl32(a,5) | 449 | addl %esi, %ebx # e += rotl32(a,5) |
380 | rorl $2, %edx # b = rotl32(b,30) | 450 | rorl $2, %edx # b = rotl32(b,30) |
451 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
452 | movaps %xmm3, %xmm4 | ||
453 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
454 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
455 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
456 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
457 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
458 | xorps %xmm5, %xmm0 # ^ | ||
459 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
460 | movaps %xmm0, %xmm5 | ||
461 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
462 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
463 | paddd %xmm0, %xmm0 # shift left by 1 | ||
464 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
465 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
466 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
467 | movaps %xmm5, %xmm4 | ||
468 | pslld $2, %xmm5 | ||
469 | psrld $30, %xmm4 | ||
470 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
471 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
472 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
473 | movaps %xmm0, %xmm5 | ||
474 | paddd %xmm6, %xmm5 | ||
475 | movups %xmm5, -64+16*0(%rsp) | ||
381 | # 24 | 476 | # 24 |
382 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
383 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
384 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
385 | roll %r8d # | ||
386 | movl %edx, %edi # c | 477 | movl %edx, %edi # c |
387 | xorl %ebp, %edi # ^d | 478 | xorl %ebp, %edi # ^d |
388 | xorl %ecx, %edi # ^b | 479 | xorl %ecx, %edi # ^b |
389 | leal 0x6ED9EBA1(%rax,%r8), %eax # e += RCONST + W[n & 15] | 480 | addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] |
390 | addl %edi, %eax # e += (c ^ d ^ b) | 481 | addl %edi, %eax # e += (c ^ d ^ b) |
391 | movl %ebx, %esi # | 482 | movl %ebx, %esi # |
392 | roll $5, %esi # rotl32(a,5) | 483 | roll $5, %esi # rotl32(a,5) |
393 | addl %esi, %eax # e += rotl32(a,5) | 484 | addl %esi, %eax # e += rotl32(a,5) |
394 | rorl $2, %ecx # b = rotl32(b,30) | 485 | rorl $2, %ecx # b = rotl32(b,30) |
395 | # 25 | 486 | # 25 |
396 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
397 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
398 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
399 | roll %r9d # | ||
400 | movl %ecx, %edi # c | 487 | movl %ecx, %edi # c |
401 | xorl %edx, %edi # ^d | 488 | xorl %edx, %edi # ^d |
402 | xorl %ebx, %edi # ^b | 489 | xorl %ebx, %edi # ^b |
403 | leal 0x6ED9EBA1(%rbp,%r9), %ebp # e += RCONST + W[n & 15] | 490 | addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] |
404 | addl %edi, %ebp # e += (c ^ d ^ b) | 491 | addl %edi, %ebp # e += (c ^ d ^ b) |
405 | movl %eax, %esi # | 492 | movl %eax, %esi # |
406 | roll $5, %esi # rotl32(a,5) | 493 | roll $5, %esi # rotl32(a,5) |
407 | addl %esi, %ebp # e += rotl32(a,5) | 494 | addl %esi, %ebp # e += rotl32(a,5) |
408 | rorl $2, %ebx # b = rotl32(b,30) | 495 | rorl $2, %ebx # b = rotl32(b,30) |
409 | # 26 | 496 | # 26 |
410 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
411 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
412 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
413 | roll %r10d # | ||
414 | movl %ebx, %edi # c | 497 | movl %ebx, %edi # c |
415 | xorl %ecx, %edi # ^d | 498 | xorl %ecx, %edi # ^d |
416 | xorl %eax, %edi # ^b | 499 | xorl %eax, %edi # ^b |
417 | leal 0x6ED9EBA1(%rdx,%r10), %edx # e += RCONST + W[n & 15] | 500 | addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] |
418 | addl %edi, %edx # e += (c ^ d ^ b) | 501 | addl %edi, %edx # e += (c ^ d ^ b) |
419 | movl %ebp, %esi # | 502 | movl %ebp, %esi # |
420 | roll $5, %esi # rotl32(a,5) | 503 | roll $5, %esi # rotl32(a,5) |
421 | addl %esi, %edx # e += rotl32(a,5) | 504 | addl %esi, %edx # e += rotl32(a,5) |
422 | rorl $2, %eax # b = rotl32(b,30) | 505 | rorl $2, %eax # b = rotl32(b,30) |
423 | # 27 | 506 | # 27 |
424 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
425 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
426 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
427 | roll %r11d # | ||
428 | movl %eax, %edi # c | 507 | movl %eax, %edi # c |
429 | xorl %ebx, %edi # ^d | 508 | xorl %ebx, %edi # ^d |
430 | xorl %ebp, %edi # ^b | 509 | xorl %ebp, %edi # ^b |
431 | leal 0x6ED9EBA1(%rcx,%r11), %ecx # e += RCONST + W[n & 15] | 510 | addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] |
432 | addl %edi, %ecx # e += (c ^ d ^ b) | 511 | addl %edi, %ecx # e += (c ^ d ^ b) |
433 | movl %edx, %esi # | 512 | movl %edx, %esi # |
434 | roll $5, %esi # rotl32(a,5) | 513 | roll $5, %esi # rotl32(a,5) |
435 | addl %esi, %ecx # e += rotl32(a,5) | 514 | addl %esi, %ecx # e += rotl32(a,5) |
436 | rorl $2, %ebp # b = rotl32(b,30) | 515 | rorl $2, %ebp # b = rotl32(b,30) |
516 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
517 | movaps %xmm0, %xmm4 | ||
518 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
519 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
520 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
521 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
522 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
523 | xorps %xmm5, %xmm1 # ^ | ||
524 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
525 | movaps %xmm1, %xmm5 | ||
526 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
527 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
528 | paddd %xmm1, %xmm1 # shift left by 1 | ||
529 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
530 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
531 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
532 | movaps %xmm5, %xmm4 | ||
533 | pslld $2, %xmm5 | ||
534 | psrld $30, %xmm4 | ||
535 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
536 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
537 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
538 | movaps %xmm1, %xmm5 | ||
539 | paddd %xmm6, %xmm5 | ||
540 | movups %xmm5, -64+16*1(%rsp) | ||
437 | # 28 | 541 | # 28 |
438 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
439 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
440 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
441 | roll %r12d # | ||
442 | movl %ebp, %edi # c | 542 | movl %ebp, %edi # c |
443 | xorl %eax, %edi # ^d | 543 | xorl %eax, %edi # ^d |
444 | xorl %edx, %edi # ^b | 544 | xorl %edx, %edi # ^b |
445 | leal 0x6ED9EBA1(%rbx,%r12), %ebx # e += RCONST + W[n & 15] | 545 | addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] |
446 | addl %edi, %ebx # e += (c ^ d ^ b) | 546 | addl %edi, %ebx # e += (c ^ d ^ b) |
447 | movl %ecx, %esi # | 547 | movl %ecx, %esi # |
448 | roll $5, %esi # rotl32(a,5) | 548 | roll $5, %esi # rotl32(a,5) |
449 | addl %esi, %ebx # e += rotl32(a,5) | 549 | addl %esi, %ebx # e += rotl32(a,5) |
450 | rorl $2, %edx # b = rotl32(b,30) | 550 | rorl $2, %edx # b = rotl32(b,30) |
451 | # 29 | 551 | # 29 |
452 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
453 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
454 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
455 | roll %r13d # | ||
456 | movl %edx, %edi # c | 552 | movl %edx, %edi # c |
457 | xorl %ebp, %edi # ^d | 553 | xorl %ebp, %edi # ^d |
458 | xorl %ecx, %edi # ^b | 554 | xorl %ecx, %edi # ^b |
459 | leal 0x6ED9EBA1(%rax,%r13), %eax # e += RCONST + W[n & 15] | 555 | addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] |
460 | addl %edi, %eax # e += (c ^ d ^ b) | 556 | addl %edi, %eax # e += (c ^ d ^ b) |
461 | movl %ebx, %esi # | 557 | movl %ebx, %esi # |
462 | roll $5, %esi # rotl32(a,5) | 558 | roll $5, %esi # rotl32(a,5) |
463 | addl %esi, %eax # e += rotl32(a,5) | 559 | addl %esi, %eax # e += rotl32(a,5) |
464 | rorl $2, %ecx # b = rotl32(b,30) | 560 | rorl $2, %ecx # b = rotl32(b,30) |
465 | # 30 | 561 | # 30 |
466 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
467 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
468 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
469 | roll %r14d # | ||
470 | movl %ecx, %edi # c | 562 | movl %ecx, %edi # c |
471 | xorl %edx, %edi # ^d | 563 | xorl %edx, %edi # ^d |
472 | xorl %ebx, %edi # ^b | 564 | xorl %ebx, %edi # ^b |
473 | leal 0x6ED9EBA1(%rbp,%r14), %ebp # e += RCONST + W[n & 15] | 565 | addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] |
474 | addl %edi, %ebp # e += (c ^ d ^ b) | 566 | addl %edi, %ebp # e += (c ^ d ^ b) |
475 | movl %eax, %esi # | 567 | movl %eax, %esi # |
476 | roll $5, %esi # rotl32(a,5) | 568 | roll $5, %esi # rotl32(a,5) |
477 | addl %esi, %ebp # e += rotl32(a,5) | 569 | addl %esi, %ebp # e += rotl32(a,5) |
478 | rorl $2, %ebx # b = rotl32(b,30) | 570 | rorl $2, %ebx # b = rotl32(b,30) |
479 | # 31 | 571 | # 31 |
480 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
481 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
482 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
483 | roll %r15d # | ||
484 | movl %ebx, %edi # c | 572 | movl %ebx, %edi # c |
485 | xorl %ecx, %edi # ^d | 573 | xorl %ecx, %edi # ^d |
486 | xorl %eax, %edi # ^b | 574 | xorl %eax, %edi # ^b |
487 | leal 0x6ED9EBA1(%rdx,%r15), %edx # e += RCONST + W[n & 15] | 575 | addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] |
488 | addl %edi, %edx # e += (c ^ d ^ b) | 576 | addl %edi, %edx # e += (c ^ d ^ b) |
489 | movl %ebp, %esi # | 577 | movl %ebp, %esi # |
490 | roll $5, %esi # rotl32(a,5) | 578 | roll $5, %esi # rotl32(a,5) |
491 | addl %esi, %edx # e += rotl32(a,5) | 579 | addl %esi, %edx # e += rotl32(a,5) |
492 | rorl $2, %eax # b = rotl32(b,30) | 580 | rorl $2, %eax # b = rotl32(b,30) |
581 | movaps rconst0x8F1BBCDC(%rip), %xmm6 | ||
582 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
583 | movaps %xmm1, %xmm4 | ||
584 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
585 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
586 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
587 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
588 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
589 | xorps %xmm5, %xmm2 # ^ | ||
590 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
591 | movaps %xmm2, %xmm5 | ||
592 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
593 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
594 | paddd %xmm2, %xmm2 # shift left by 1 | ||
595 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
596 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
597 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
598 | movaps %xmm5, %xmm4 | ||
599 | pslld $2, %xmm5 | ||
600 | psrld $30, %xmm4 | ||
601 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
602 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
603 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
604 | movaps %xmm2, %xmm5 | ||
605 | paddd %xmm6, %xmm5 | ||
606 | movups %xmm5, -64+16*2(%rsp) | ||
493 | # 32 | 607 | # 32 |
494 | movl %r13d, %esi # W[(n+13) & 15] | ||
495 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
496 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
497 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
498 | roll %esi # | ||
499 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
500 | movl %eax, %edi # c | 608 | movl %eax, %edi # c |
501 | xorl %ebx, %edi # ^d | 609 | xorl %ebx, %edi # ^d |
502 | xorl %ebp, %edi # ^b | 610 | xorl %ebp, %edi # ^b |
503 | leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 611 | addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] |
504 | addl %edi, %ecx # e += (c ^ d ^ b) | 612 | addl %edi, %ecx # e += (c ^ d ^ b) |
505 | movl %edx, %esi # | 613 | movl %edx, %esi # |
506 | roll $5, %esi # rotl32(a,5) | 614 | roll $5, %esi # rotl32(a,5) |
507 | addl %esi, %ecx # e += rotl32(a,5) | 615 | addl %esi, %ecx # e += rotl32(a,5) |
508 | rorl $2, %ebp # b = rotl32(b,30) | 616 | rorl $2, %ebp # b = rotl32(b,30) |
509 | # 33 | 617 | # 33 |
510 | movl %r14d, %esi # W[(n+13) & 15] | ||
511 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
512 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
513 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
514 | roll %esi # | ||
515 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
516 | movl %ebp, %edi # c | 618 | movl %ebp, %edi # c |
517 | xorl %eax, %edi # ^d | 619 | xorl %eax, %edi # ^d |
518 | xorl %edx, %edi # ^b | 620 | xorl %edx, %edi # ^b |
519 | leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 621 | addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] |
520 | addl %edi, %ebx # e += (c ^ d ^ b) | 622 | addl %edi, %ebx # e += (c ^ d ^ b) |
521 | movl %ecx, %esi # | 623 | movl %ecx, %esi # |
522 | roll $5, %esi # rotl32(a,5) | 624 | roll $5, %esi # rotl32(a,5) |
523 | addl %esi, %ebx # e += rotl32(a,5) | 625 | addl %esi, %ebx # e += rotl32(a,5) |
524 | rorl $2, %edx # b = rotl32(b,30) | 626 | rorl $2, %edx # b = rotl32(b,30) |
525 | # 34 | 627 | # 34 |
526 | movl %r15d, %esi # W[(n+13) & 15] | ||
527 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
528 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
529 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
530 | roll %esi # | ||
531 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
532 | movl %edx, %edi # c | 628 | movl %edx, %edi # c |
533 | xorl %ebp, %edi # ^d | 629 | xorl %ebp, %edi # ^d |
534 | xorl %ecx, %edi # ^b | 630 | xorl %ecx, %edi # ^b |
535 | leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 631 | addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] |
536 | addl %edi, %eax # e += (c ^ d ^ b) | 632 | addl %edi, %eax # e += (c ^ d ^ b) |
537 | movl %ebx, %esi # | 633 | movl %ebx, %esi # |
538 | roll $5, %esi # rotl32(a,5) | 634 | roll $5, %esi # rotl32(a,5) |
539 | addl %esi, %eax # e += rotl32(a,5) | 635 | addl %esi, %eax # e += rotl32(a,5) |
540 | rorl $2, %ecx # b = rotl32(b,30) | 636 | rorl $2, %ecx # b = rotl32(b,30) |
541 | # 35 | 637 | # 35 |
542 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
543 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
544 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
545 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
546 | roll %esi # | ||
547 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
548 | movl %ecx, %edi # c | 638 | movl %ecx, %edi # c |
549 | xorl %edx, %edi # ^d | 639 | xorl %edx, %edi # ^d |
550 | xorl %ebx, %edi # ^b | 640 | xorl %ebx, %edi # ^b |
551 | leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 641 | addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] |
552 | addl %edi, %ebp # e += (c ^ d ^ b) | 642 | addl %edi, %ebp # e += (c ^ d ^ b) |
553 | movl %eax, %esi # | 643 | movl %eax, %esi # |
554 | roll $5, %esi # rotl32(a,5) | 644 | roll $5, %esi # rotl32(a,5) |
555 | addl %esi, %ebp # e += rotl32(a,5) | 645 | addl %esi, %ebp # e += rotl32(a,5) |
556 | rorl $2, %ebx # b = rotl32(b,30) | 646 | rorl $2, %ebx # b = rotl32(b,30) |
647 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
648 | movaps %xmm2, %xmm4 | ||
649 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
650 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
651 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
652 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
653 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
654 | xorps %xmm5, %xmm3 # ^ | ||
655 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
656 | movaps %xmm3, %xmm5 | ||
657 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
658 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
659 | paddd %xmm3, %xmm3 # shift left by 1 | ||
660 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
661 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
662 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
663 | movaps %xmm5, %xmm4 | ||
664 | pslld $2, %xmm5 | ||
665 | psrld $30, %xmm4 | ||
666 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
667 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
668 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
669 | movaps %xmm3, %xmm5 | ||
670 | paddd %xmm6, %xmm5 | ||
671 | movups %xmm5, -64+16*3(%rsp) | ||
557 | # 36 | 672 | # 36 |
558 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
559 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
560 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
561 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
562 | roll %esi # | ||
563 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
564 | movl %ebx, %edi # c | 673 | movl %ebx, %edi # c |
565 | xorl %ecx, %edi # ^d | 674 | xorl %ecx, %edi # ^d |
566 | xorl %eax, %edi # ^b | 675 | xorl %eax, %edi # ^b |
567 | leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 676 | addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] |
568 | addl %edi, %edx # e += (c ^ d ^ b) | 677 | addl %edi, %edx # e += (c ^ d ^ b) |
569 | movl %ebp, %esi # | 678 | movl %ebp, %esi # |
570 | roll $5, %esi # rotl32(a,5) | 679 | roll $5, %esi # rotl32(a,5) |
571 | addl %esi, %edx # e += rotl32(a,5) | 680 | addl %esi, %edx # e += rotl32(a,5) |
572 | rorl $2, %eax # b = rotl32(b,30) | 681 | rorl $2, %eax # b = rotl32(b,30) |
573 | # 37 | 682 | # 37 |
574 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
575 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
576 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
577 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
578 | roll %esi # | ||
579 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
580 | movl %eax, %edi # c | 683 | movl %eax, %edi # c |
581 | xorl %ebx, %edi # ^d | 684 | xorl %ebx, %edi # ^d |
582 | xorl %ebp, %edi # ^b | 685 | xorl %ebp, %edi # ^b |
583 | leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 686 | addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] |
584 | addl %edi, %ecx # e += (c ^ d ^ b) | 687 | addl %edi, %ecx # e += (c ^ d ^ b) |
585 | movl %edx, %esi # | 688 | movl %edx, %esi # |
586 | roll $5, %esi # rotl32(a,5) | 689 | roll $5, %esi # rotl32(a,5) |
587 | addl %esi, %ecx # e += rotl32(a,5) | 690 | addl %esi, %ecx # e += rotl32(a,5) |
588 | rorl $2, %ebp # b = rotl32(b,30) | 691 | rorl $2, %ebp # b = rotl32(b,30) |
589 | # 38 | 692 | # 38 |
590 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
591 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
592 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
593 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
594 | roll %esi # | ||
595 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
596 | movl %ebp, %edi # c | 693 | movl %ebp, %edi # c |
597 | xorl %eax, %edi # ^d | 694 | xorl %eax, %edi # ^d |
598 | xorl %edx, %edi # ^b | 695 | xorl %edx, %edi # ^b |
599 | leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 696 | addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] |
600 | addl %edi, %ebx # e += (c ^ d ^ b) | 697 | addl %edi, %ebx # e += (c ^ d ^ b) |
601 | movl %ecx, %esi # | 698 | movl %ecx, %esi # |
602 | roll $5, %esi # rotl32(a,5) | 699 | roll $5, %esi # rotl32(a,5) |
603 | addl %esi, %ebx # e += rotl32(a,5) | 700 | addl %esi, %ebx # e += rotl32(a,5) |
604 | rorl $2, %edx # b = rotl32(b,30) | 701 | rorl $2, %edx # b = rotl32(b,30) |
605 | # 39 | 702 | # 39 |
606 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
607 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
608 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
609 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
610 | roll %esi # | ||
611 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
612 | movl %edx, %edi # c | 703 | movl %edx, %edi # c |
613 | xorl %ebp, %edi # ^d | 704 | xorl %ebp, %edi # ^d |
614 | xorl %ecx, %edi # ^b | 705 | xorl %ecx, %edi # ^b |
615 | leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 706 | addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] |
616 | addl %edi, %eax # e += (c ^ d ^ b) | 707 | addl %edi, %eax # e += (c ^ d ^ b) |
617 | movl %ebx, %esi # | 708 | movl %ebx, %esi # |
618 | roll $5, %esi # rotl32(a,5) | 709 | roll $5, %esi # rotl32(a,5) |
619 | addl %esi, %eax # e += rotl32(a,5) | 710 | addl %esi, %eax # e += rotl32(a,5) |
620 | rorl $2, %ecx # b = rotl32(b,30) | 711 | rorl $2, %ecx # b = rotl32(b,30) |
712 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
713 | movaps %xmm3, %xmm4 | ||
714 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
715 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
716 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
717 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
718 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
719 | xorps %xmm5, %xmm0 # ^ | ||
720 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
721 | movaps %xmm0, %xmm5 | ||
722 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
723 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
724 | paddd %xmm0, %xmm0 # shift left by 1 | ||
725 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
726 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
727 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
728 | movaps %xmm5, %xmm4 | ||
729 | pslld $2, %xmm5 | ||
730 | psrld $30, %xmm4 | ||
731 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
732 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
733 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
734 | movaps %xmm0, %xmm5 | ||
735 | paddd %xmm6, %xmm5 | ||
736 | movups %xmm5, -64+16*0(%rsp) | ||
621 | # 40 | 737 | # 40 |
622 | movl %ebx, %edi # di: b | 738 | movl %ebx, %edi # di: b |
623 | movl %ebx, %esi # si: b | 739 | movl %ebx, %esi # si: b |
@@ -625,12 +741,8 @@ sha1_process_block64:
625 | andl %ecx, %esi # si: b & c | 741 | andl %ecx, %esi # si: b & c |
626 | andl %edx, %edi # di: (b | c) & d | 742 | andl %edx, %edi # di: (b | c) & d |
627 | orl %esi, %edi # ((b | c) & d) | (b & c) | 743 | orl %esi, %edi # ((b | c) & d) | (b & c) |
628 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
629 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
630 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
631 | roll %r8d # | ||
632 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 744 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
633 | leal -0x70E44324(%rbp,%r8), %ebp # e += RCONST + W[n & 15] | 745 | addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] |
634 | movl %eax, %esi # | 746 | movl %eax, %esi # |
635 | roll $5, %esi # rotl32(a,5) | 747 | roll $5, %esi # rotl32(a,5) |
636 | addl %esi, %ebp # e += rotl32(a,5) | 748 | addl %esi, %ebp # e += rotl32(a,5) |
@@ -642,12 +754,8 @@ sha1_process_block64:
642 | andl %ebx, %esi # si: b & c | 754 | andl %ebx, %esi # si: b & c |
643 | andl %ecx, %edi # di: (b | c) & d | 755 | andl %ecx, %edi # di: (b | c) & d |
644 | orl %esi, %edi # ((b | c) & d) | (b & c) | 756 | orl %esi, %edi # ((b | c) & d) | (b & c) |
645 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
646 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
647 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
648 | roll %r9d # | ||
649 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 757 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
650 | leal -0x70E44324(%rdx,%r9), %edx # e += RCONST + W[n & 15] | 758 | addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] |
651 | movl %ebp, %esi # | 759 | movl %ebp, %esi # |
652 | roll $5, %esi # rotl32(a,5) | 760 | roll $5, %esi # rotl32(a,5) |
653 | addl %esi, %edx # e += rotl32(a,5) | 761 | addl %esi, %edx # e += rotl32(a,5) |
@@ -659,12 +767,8 @@ sha1_process_block64:
659 | andl %eax, %esi # si: b & c | 767 | andl %eax, %esi # si: b & c |
660 | andl %ebx, %edi # di: (b | c) & d | 768 | andl %ebx, %edi # di: (b | c) & d |
661 | orl %esi, %edi # ((b | c) & d) | (b & c) | 769 | orl %esi, %edi # ((b | c) & d) | (b & c) |
662 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
663 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
664 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
665 | roll %r10d # | ||
666 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 770 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
667 | leal -0x70E44324(%rcx,%r10), %ecx # e += RCONST + W[n & 15] | 771 | addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] |
668 | movl %edx, %esi # | 772 | movl %edx, %esi # |
669 | roll $5, %esi # rotl32(a,5) | 773 | roll $5, %esi # rotl32(a,5) |
670 | addl %esi, %ecx # e += rotl32(a,5) | 774 | addl %esi, %ecx # e += rotl32(a,5) |
@@ -676,16 +780,37 @@ sha1_process_block64:
676 | andl %ebp, %esi # si: b & c | 780 | andl %ebp, %esi # si: b & c |
677 | andl %eax, %edi # di: (b | c) & d | 781 | andl %eax, %edi # di: (b | c) & d |
678 | orl %esi, %edi # ((b | c) & d) | (b & c) | 782 | orl %esi, %edi # ((b | c) & d) | (b & c) |
679 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
680 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
681 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
682 | roll %r11d # | ||
683 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 783 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
684 | leal -0x70E44324(%rbx,%r11), %ebx # e += RCONST + W[n & 15] | 784 | addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] |
685 | movl %ecx, %esi # | 785 | movl %ecx, %esi # |
686 | roll $5, %esi # rotl32(a,5) | 786 | roll $5, %esi # rotl32(a,5) |
687 | addl %esi, %ebx # e += rotl32(a,5) | 787 | addl %esi, %ebx # e += rotl32(a,5) |
688 | rorl $2, %edx # b = rotl32(b,30) | 788 | rorl $2, %edx # b = rotl32(b,30) |
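For reference, the F function computed by the integer rounds 40..59 above, ((b | c) & d) | (b & c), is the textbook SHA-1 majority Maj(b,c,d) = (b & c) | (b & d) | (c & d), just written with one less operation. A minimal C check of that identity (illustrative only, not from the commit):

#include <assert.h>
#include <stdint.h>

static uint32_t maj_as_coded(uint32_t b, uint32_t c, uint32_t d)
{
        return ((b | c) & d) | (b & c);     /* form used in the rounds above */
}

static uint32_t maj_textbook(uint32_t b, uint32_t c, uint32_t d)
{
        return (b & c) | (b & d) | (c & d);
}

int main(void)
{
        /* bitwise identity, so checking a single bit of each input covers all cases */
        for (unsigned b = 0; b <= 1; b++)
                for (unsigned c = 0; c <= 1; c++)
                        for (unsigned d = 0; d <= 1; d++)
                                assert(maj_as_coded(b, c, d) == maj_textbook(b, c, d));
        return 0;
}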
789 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
790 | movaps %xmm0, %xmm4 | ||
791 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
792 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
793 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
794 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
795 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
796 | xorps %xmm5, %xmm1 # ^ | ||
797 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
798 | movaps %xmm1, %xmm5 | ||
799 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
800 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
801 | paddd %xmm1, %xmm1 # shift left by 1 | ||
802 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
803 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
804 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
805 | movaps %xmm5, %xmm4 | ||
806 | pslld $2, %xmm5 | ||
807 | psrld $30, %xmm4 | ||
808 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
809 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
810 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
811 | movaps %xmm1, %xmm5 | ||
812 | paddd %xmm6, %xmm5 | ||
813 | movups %xmm5, -64+16*1(%rsp) | ||
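The pcmpgtd/paddd/psubd triple inside each PREP block is a per-lane rotate-left-by-1: doubling a lane shifts it left one bit, and subtracting the all-ones comparison mask adds 1 back exactly in the lanes whose sign bit was set, i.e. re-inserts the bit that was shifted out. The scalar equivalent (a sketch, not from the commit):

#include <stdint.h>

static uint32_t rol1_plain(uint32_t x)
{
        return (x << 1) | (x >> 31);
}

static uint32_t rol1_sse2_style(uint32_t x)
{
        uint32_t msb_mask = ((int32_t)x < 0) ? 0xffffffff : 0;  /* what pcmpgtd yields per lane */
        return (x + x) - msb_mask;    /* paddd doubles, psubd of -1 adds the carried-out bit */
}

Both functions return the same value for every x.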
689 | # 44 | 814 | # 44 |
690 | movl %ecx, %edi # di: b | 815 | movl %ecx, %edi # di: b |
691 | movl %ecx, %esi # si: b | 816 | movl %ecx, %esi # si: b |
@@ -693,12 +818,8 @@ sha1_process_block64: | |||
693 | andl %edx, %esi # si: b & c | 818 | andl %edx, %esi # si: b & c |
694 | andl %ebp, %edi # di: (b | c) & d | 819 | andl %ebp, %edi # di: (b | c) & d |
695 | orl %esi, %edi # ((b | c) & d) | (b & c) | 820 | orl %esi, %edi # ((b | c) & d) | (b & c) |
696 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
697 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
698 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
699 | roll %r12d # | ||
700 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 821 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
701 | leal -0x70E44324(%rax,%r12), %eax # e += RCONST + W[n & 15] | 822 | addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] |
702 | movl %ebx, %esi # | 823 | movl %ebx, %esi # |
703 | roll $5, %esi # rotl32(a,5) | 824 | roll $5, %esi # rotl32(a,5) |
704 | addl %esi, %eax # e += rotl32(a,5) | 825 | addl %esi, %eax # e += rotl32(a,5) |
@@ -710,12 +831,8 @@ sha1_process_block64: | |||
710 | andl %ecx, %esi # si: b & c | 831 | andl %ecx, %esi # si: b & c |
711 | andl %edx, %edi # di: (b | c) & d | 832 | andl %edx, %edi # di: (b | c) & d |
712 | orl %esi, %edi # ((b | c) & d) | (b & c) | 833 | orl %esi, %edi # ((b | c) & d) | (b & c) |
713 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
714 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
715 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
716 | roll %r13d # | ||
717 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 834 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
718 | leal -0x70E44324(%rbp,%r13), %ebp # e += RCONST + W[n & 15] | 835 | addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] |
719 | movl %eax, %esi # | 836 | movl %eax, %esi # |
720 | roll $5, %esi # rotl32(a,5) | 837 | roll $5, %esi # rotl32(a,5) |
721 | addl %esi, %ebp # e += rotl32(a,5) | 838 | addl %esi, %ebp # e += rotl32(a,5) |
@@ -727,12 +844,8 @@ sha1_process_block64: | |||
727 | andl %ebx, %esi # si: b & c | 844 | andl %ebx, %esi # si: b & c |
728 | andl %ecx, %edi # di: (b | c) & d | 845 | andl %ecx, %edi # di: (b | c) & d |
729 | orl %esi, %edi # ((b | c) & d) | (b & c) | 846 | orl %esi, %edi # ((b | c) & d) | (b & c) |
730 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
731 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
732 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
733 | roll %r14d # | ||
734 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 847 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
735 | leal -0x70E44324(%rdx,%r14), %edx # e += RCONST + W[n & 15] | 848 | addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] |
736 | movl %ebp, %esi # | 849 | movl %ebp, %esi # |
737 | roll $5, %esi # rotl32(a,5) | 850 | roll $5, %esi # rotl32(a,5) |
738 | addl %esi, %edx # e += rotl32(a,5) | 851 | addl %esi, %edx # e += rotl32(a,5) |
@@ -744,16 +857,37 @@ sha1_process_block64: | |||
744 | andl %eax, %esi # si: b & c | 857 | andl %eax, %esi # si: b & c |
745 | andl %ebx, %edi # di: (b | c) & d | 858 | andl %ebx, %edi # di: (b | c) & d |
746 | orl %esi, %edi # ((b | c) & d) | (b & c) | 859 | orl %esi, %edi # ((b | c) & d) | (b & c) |
747 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
748 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
749 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
750 | roll %r15d # | ||
751 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 860 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
752 | leal -0x70E44324(%rcx,%r15), %ecx # e += RCONST + W[n & 15] | 861 | addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] |
753 | movl %edx, %esi # | 862 | movl %edx, %esi # |
754 | roll $5, %esi # rotl32(a,5) | 863 | roll $5, %esi # rotl32(a,5) |
755 | addl %esi, %ecx # e += rotl32(a,5) | 864 | addl %esi, %ecx # e += rotl32(a,5) |
756 | rorl $2, %ebp # b = rotl32(b,30) | 865 | rorl $2, %ebp # b = rotl32(b,30) |
866 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
867 | movaps %xmm1, %xmm4 | ||
868 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
869 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
870 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
871 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
872 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
873 | xorps %xmm5, %xmm2 # ^ | ||
874 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
875 | movaps %xmm2, %xmm5 | ||
876 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
877 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
878 | paddd %xmm2, %xmm2 # shift left by 1 | ||
879 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
880 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
881 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
882 | movaps %xmm5, %xmm4 | ||
883 | pslld $2, %xmm5 | ||
884 | psrld $30, %xmm4 | ||
885 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
886 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
887 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
888 | movaps %xmm2, %xmm5 | ||
889 | paddd %xmm6, %xmm5 | ||
890 | movups %xmm5, -64+16*2(%rsp) | ||
757 | # 48 | 891 | # 48 |
758 | movl %edx, %edi # di: b | 892 | movl %edx, %edi # di: b |
759 | movl %edx, %esi # si: b | 893 | movl %edx, %esi # si: b |
@@ -761,14 +895,8 @@ sha1_process_block64: | |||
761 | andl %ebp, %esi # si: b & c | 895 | andl %ebp, %esi # si: b & c |
762 | andl %eax, %edi # di: (b | c) & d | 896 | andl %eax, %edi # di: (b | c) & d |
763 | orl %esi, %edi # ((b | c) & d) | (b & c) | 897 | orl %esi, %edi # ((b | c) & d) | (b & c) |
764 | movl %r13d, %esi # W[(n+13) & 15] | ||
765 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
766 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
767 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
768 | roll %esi # | ||
769 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
770 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 898 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
771 | leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 899 | addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] |
772 | movl %ecx, %esi # | 900 | movl %ecx, %esi # |
773 | roll $5, %esi # rotl32(a,5) | 901 | roll $5, %esi # rotl32(a,5) |
774 | addl %esi, %ebx # e += rotl32(a,5) | 902 | addl %esi, %ebx # e += rotl32(a,5) |
@@ -780,14 +908,8 @@ sha1_process_block64: | |||
780 | andl %edx, %esi # si: b & c | 908 | andl %edx, %esi # si: b & c |
781 | andl %ebp, %edi # di: (b | c) & d | 909 | andl %ebp, %edi # di: (b | c) & d |
782 | orl %esi, %edi # ((b | c) & d) | (b & c) | 910 | orl %esi, %edi # ((b | c) & d) | (b & c) |
783 | movl %r14d, %esi # W[(n+13) & 15] | ||
784 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
785 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
786 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
787 | roll %esi # | ||
788 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
789 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 911 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
790 | leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 912 | addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] |
791 | movl %ebx, %esi # | 913 | movl %ebx, %esi # |
792 | roll $5, %esi # rotl32(a,5) | 914 | roll $5, %esi # rotl32(a,5) |
793 | addl %esi, %eax # e += rotl32(a,5) | 915 | addl %esi, %eax # e += rotl32(a,5) |
@@ -799,14 +921,8 @@ sha1_process_block64: | |||
799 | andl %ecx, %esi # si: b & c | 921 | andl %ecx, %esi # si: b & c |
800 | andl %edx, %edi # di: (b | c) & d | 922 | andl %edx, %edi # di: (b | c) & d |
801 | orl %esi, %edi # ((b | c) & d) | (b & c) | 923 | orl %esi, %edi # ((b | c) & d) | (b & c) |
802 | movl %r15d, %esi # W[(n+13) & 15] | ||
803 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
804 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
805 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
806 | roll %esi # | ||
807 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
808 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 924 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
809 | leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 925 | addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] |
810 | movl %eax, %esi # | 926 | movl %eax, %esi # |
811 | roll $5, %esi # rotl32(a,5) | 927 | roll $5, %esi # rotl32(a,5) |
812 | addl %esi, %ebp # e += rotl32(a,5) | 928 | addl %esi, %ebp # e += rotl32(a,5) |
@@ -818,18 +934,38 @@ sha1_process_block64: | |||
818 | andl %ebx, %esi # si: b & c | 934 | andl %ebx, %esi # si: b & c |
819 | andl %ecx, %edi # di: (b | c) & d | 935 | andl %ecx, %edi # di: (b | c) & d |
820 | orl %esi, %edi # ((b | c) & d) | (b & c) | 936 | orl %esi, %edi # ((b | c) & d) | (b & c) |
821 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
822 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
823 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
824 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
825 | roll %esi # | ||
826 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
827 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 937 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
828 | leal -0x70E44324(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 938 | addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] |
829 | movl %ebp, %esi # | 939 | movl %ebp, %esi # |
830 | roll $5, %esi # rotl32(a,5) | 940 | roll $5, %esi # rotl32(a,5) |
831 | addl %esi, %edx # e += rotl32(a,5) | 941 | addl %esi, %edx # e += rotl32(a,5) |
832 | rorl $2, %eax # b = rotl32(b,30) | 942 | rorl $2, %eax # b = rotl32(b,30) |
943 | movaps rconst0xCA62C1D6(%rip), %xmm6 | ||
944 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
945 | movaps %xmm2, %xmm4 | ||
946 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
947 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
948 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
949 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
950 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
951 | xorps %xmm5, %xmm3 # ^ | ||
952 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
953 | movaps %xmm3, %xmm5 | ||
954 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
955 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
956 | paddd %xmm3, %xmm3 # shift left by 1 | ||
957 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
958 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
959 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
960 | movaps %xmm5, %xmm4 | ||
961 | pslld $2, %xmm5 | ||
962 | psrld $30, %xmm4 | ||
963 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
964 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
965 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
966 | movaps %xmm3, %xmm5 | ||
967 | paddd %xmm6, %xmm5 | ||
968 | movups %xmm5, -64+16*3(%rsp) | ||
833 | # 52 | 969 | # 52 |
834 | movl %ebp, %edi # di: b | 970 | movl %ebp, %edi # di: b |
835 | movl %ebp, %esi # si: b | 971 | movl %ebp, %esi # si: b |
@@ -837,14 +973,8 @@ sha1_process_block64: | |||
837 | andl %eax, %esi # si: b & c | 973 | andl %eax, %esi # si: b & c |
838 | andl %ebx, %edi # di: (b | c) & d | 974 | andl %ebx, %edi # di: (b | c) & d |
839 | orl %esi, %edi # ((b | c) & d) | (b & c) | 975 | orl %esi, %edi # ((b | c) & d) | (b & c) |
840 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
841 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
842 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
843 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
844 | roll %esi # | ||
845 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
846 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 976 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
847 | leal -0x70E44324(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 977 | addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] |
848 | movl %edx, %esi # | 978 | movl %edx, %esi # |
849 | roll $5, %esi # rotl32(a,5) | 979 | roll $5, %esi # rotl32(a,5) |
850 | addl %esi, %ecx # e += rotl32(a,5) | 980 | addl %esi, %ecx # e += rotl32(a,5) |
@@ -856,14 +986,8 @@ sha1_process_block64: | |||
856 | andl %ebp, %esi # si: b & c | 986 | andl %ebp, %esi # si: b & c |
857 | andl %eax, %edi # di: (b | c) & d | 987 | andl %eax, %edi # di: (b | c) & d |
858 | orl %esi, %edi # ((b | c) & d) | (b & c) | 988 | orl %esi, %edi # ((b | c) & d) | (b & c) |
859 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
860 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
861 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
862 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
863 | roll %esi # | ||
864 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
865 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 989 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
866 | leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 990 | addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] |
867 | movl %ecx, %esi # | 991 | movl %ecx, %esi # |
868 | roll $5, %esi # rotl32(a,5) | 992 | roll $5, %esi # rotl32(a,5) |
869 | addl %esi, %ebx # e += rotl32(a,5) | 993 | addl %esi, %ebx # e += rotl32(a,5) |
@@ -875,14 +999,8 @@ sha1_process_block64: | |||
875 | andl %edx, %esi # si: b & c | 999 | andl %edx, %esi # si: b & c |
876 | andl %ebp, %edi # di: (b | c) & d | 1000 | andl %ebp, %edi # di: (b | c) & d |
877 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1001 | orl %esi, %edi # ((b | c) & d) | (b & c) |
878 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
879 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
880 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
881 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
882 | roll %esi # | ||
883 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
884 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 1002 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
885 | leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 1003 | addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] |
886 | movl %ebx, %esi # | 1004 | movl %ebx, %esi # |
887 | roll $5, %esi # rotl32(a,5) | 1005 | roll $5, %esi # rotl32(a,5) |
888 | addl %esi, %eax # e += rotl32(a,5) | 1006 | addl %esi, %eax # e += rotl32(a,5) |
@@ -894,18 +1012,37 @@ sha1_process_block64: | |||
894 | andl %ecx, %esi # si: b & c | 1012 | andl %ecx, %esi # si: b & c |
895 | andl %edx, %edi # di: (b | c) & d | 1013 | andl %edx, %edi # di: (b | c) & d |
896 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1014 | orl %esi, %edi # ((b | c) & d) | (b & c) |
897 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
898 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
899 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
900 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
901 | roll %esi # | ||
902 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
903 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 1015 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
904 | leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 1016 | addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] |
905 | movl %eax, %esi # | 1017 | movl %eax, %esi # |
906 | roll $5, %esi # rotl32(a,5) | 1018 | roll $5, %esi # rotl32(a,5) |
907 | addl %esi, %ebp # e += rotl32(a,5) | 1019 | addl %esi, %ebp # e += rotl32(a,5) |
908 | rorl $2, %ebx # b = rotl32(b,30) | 1020 | rorl $2, %ebx # b = rotl32(b,30) |
1021 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
1022 | movaps %xmm3, %xmm4 | ||
1023 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1024 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1025 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1026 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1027 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1028 | xorps %xmm5, %xmm0 # ^ | ||
1029 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1030 | movaps %xmm0, %xmm5 | ||
1031 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1032 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1033 | paddd %xmm0, %xmm0 # shift left by 1 | ||
1034 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
1035 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1036 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1037 | movaps %xmm5, %xmm4 | ||
1038 | pslld $2, %xmm5 | ||
1039 | psrld $30, %xmm4 | ||
1040 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1041 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
1042 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1043 | movaps %xmm0, %xmm5 | ||
1044 | paddd %xmm6, %xmm5 | ||
1045 | movups %xmm5, -64+16*0(%rsp) | ||
909 | # 56 | 1046 | # 56 |
910 | movl %eax, %edi # di: b | 1047 | movl %eax, %edi # di: b |
911 | movl %eax, %esi # si: b | 1048 | movl %eax, %esi # si: b |
@@ -913,12 +1050,8 @@ sha1_process_block64: | |||
913 | andl %ebx, %esi # si: b & c | 1050 | andl %ebx, %esi # si: b & c |
914 | andl %ecx, %edi # di: (b | c) & d | 1051 | andl %ecx, %edi # di: (b | c) & d |
915 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1052 | orl %esi, %edi # ((b | c) & d) | (b & c) |
916 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
917 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
918 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
919 | roll %r8d # | ||
920 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 1053 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
921 | leal -0x70E44324(%rdx,%r8), %edx # e += RCONST + W[n & 15] | 1054 | addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] |
922 | movl %ebp, %esi # | 1055 | movl %ebp, %esi # |
923 | roll $5, %esi # rotl32(a,5) | 1056 | roll $5, %esi # rotl32(a,5) |
924 | addl %esi, %edx # e += rotl32(a,5) | 1057 | addl %esi, %edx # e += rotl32(a,5) |
@@ -930,12 +1063,8 @@ sha1_process_block64: | |||
930 | andl %eax, %esi # si: b & c | 1063 | andl %eax, %esi # si: b & c |
931 | andl %ebx, %edi # di: (b | c) & d | 1064 | andl %ebx, %edi # di: (b | c) & d |
932 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1065 | orl %esi, %edi # ((b | c) & d) | (b & c) |
933 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
934 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
935 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
936 | roll %r9d # | ||
937 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 1066 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
938 | leal -0x70E44324(%rcx,%r9), %ecx # e += RCONST + W[n & 15] | 1067 | addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] |
939 | movl %edx, %esi # | 1068 | movl %edx, %esi # |
940 | roll $5, %esi # rotl32(a,5) | 1069 | roll $5, %esi # rotl32(a,5) |
941 | addl %esi, %ecx # e += rotl32(a,5) | 1070 | addl %esi, %ecx # e += rotl32(a,5) |
@@ -947,12 +1076,8 @@ sha1_process_block64: | |||
947 | andl %ebp, %esi # si: b & c | 1076 | andl %ebp, %esi # si: b & c |
948 | andl %eax, %edi # di: (b | c) & d | 1077 | andl %eax, %edi # di: (b | c) & d |
949 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1078 | orl %esi, %edi # ((b | c) & d) | (b & c) |
950 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
951 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
952 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
953 | roll %r10d # | ||
954 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 1079 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
955 | leal -0x70E44324(%rbx,%r10), %ebx # e += RCONST + W[n & 15] | 1080 | addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] |
956 | movl %ecx, %esi # | 1081 | movl %ecx, %esi # |
957 | roll $5, %esi # rotl32(a,5) | 1082 | roll $5, %esi # rotl32(a,5) |
958 | addl %esi, %ebx # e += rotl32(a,5) | 1083 | addl %esi, %ebx # e += rotl32(a,5) |
@@ -964,307 +1089,282 @@ sha1_process_block64: | |||
964 | andl %edx, %esi # si: b & c | 1089 | andl %edx, %esi # si: b & c |
965 | andl %ebp, %edi # di: (b | c) & d | 1090 | andl %ebp, %edi # di: (b | c) & d |
966 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1091 | orl %esi, %edi # ((b | c) & d) | (b & c) |
967 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
968 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
969 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
970 | roll %r11d # | ||
971 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 1092 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
972 | leal -0x70E44324(%rax,%r11), %eax # e += RCONST + W[n & 15] | 1093 | addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] |
973 | movl %ebx, %esi # | 1094 | movl %ebx, %esi # |
974 | roll $5, %esi # rotl32(a,5) | 1095 | roll $5, %esi # rotl32(a,5) |
975 | addl %esi, %eax # e += rotl32(a,5) | 1096 | addl %esi, %eax # e += rotl32(a,5) |
976 | rorl $2, %ecx # b = rotl32(b,30) | 1097 | rorl $2, %ecx # b = rotl32(b,30) |
1098 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
1099 | movaps %xmm0, %xmm4 | ||
1100 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1101 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1102 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1103 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1104 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1105 | xorps %xmm5, %xmm1 # ^ | ||
1106 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1107 | movaps %xmm1, %xmm5 | ||
1108 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1109 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1110 | paddd %xmm1, %xmm1 # shift left by 1 | ||
1111 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
1112 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1113 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1114 | movaps %xmm5, %xmm4 | ||
1115 | pslld $2, %xmm5 | ||
1116 | psrld $30, %xmm4 | ||
1117 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1118 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
1119 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1120 | movaps %xmm1, %xmm5 | ||
1121 | paddd %xmm6, %xmm5 | ||
1122 | movups %xmm5, -64+16*1(%rsp) | ||
977 | # 60 | 1123 | # 60 |
978 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
979 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
980 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
981 | roll %r12d # | ||
982 | movl %ecx, %edi # c | 1124 | movl %ecx, %edi # c |
983 | xorl %edx, %edi # ^d | 1125 | xorl %edx, %edi # ^d |
984 | xorl %ebx, %edi # ^b | 1126 | xorl %ebx, %edi # ^b |
985 | leal -0x359D3E2A(%rbp,%r12), %ebp # e += RCONST + W[n & 15] | 1127 | addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] |
986 | addl %edi, %ebp # e += (c ^ d ^ b) | 1128 | addl %edi, %ebp # e += (c ^ d ^ b) |
987 | movl %eax, %esi # | 1129 | movl %eax, %esi # |
988 | roll $5, %esi # rotl32(a,5) | 1130 | roll $5, %esi # rotl32(a,5) |
989 | addl %esi, %ebp # e += rotl32(a,5) | 1131 | addl %esi, %ebp # e += rotl32(a,5) |
990 | rorl $2, %ebx # b = rotl32(b,30) | 1132 | rorl $2, %ebx # b = rotl32(b,30) |
991 | # 61 | 1133 | # 61 |
992 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
993 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
994 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
995 | roll %r13d # | ||
996 | movl %ebx, %edi # c | 1134 | movl %ebx, %edi # c |
997 | xorl %ecx, %edi # ^d | 1135 | xorl %ecx, %edi # ^d |
998 | xorl %eax, %edi # ^b | 1136 | xorl %eax, %edi # ^b |
999 | leal -0x359D3E2A(%rdx,%r13), %edx # e += RCONST + W[n & 15] | 1137 | addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] |
1000 | addl %edi, %edx # e += (c ^ d ^ b) | 1138 | addl %edi, %edx # e += (c ^ d ^ b) |
1001 | movl %ebp, %esi # | 1139 | movl %ebp, %esi # |
1002 | roll $5, %esi # rotl32(a,5) | 1140 | roll $5, %esi # rotl32(a,5) |
1003 | addl %esi, %edx # e += rotl32(a,5) | 1141 | addl %esi, %edx # e += rotl32(a,5) |
1004 | rorl $2, %eax # b = rotl32(b,30) | 1142 | rorl $2, %eax # b = rotl32(b,30) |
1005 | # 62 | 1143 | # 62 |
1006 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
1007 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
1008 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
1009 | roll %r14d # | ||
1010 | movl %eax, %edi # c | 1144 | movl %eax, %edi # c |
1011 | xorl %ebx, %edi # ^d | 1145 | xorl %ebx, %edi # ^d |
1012 | xorl %ebp, %edi # ^b | 1146 | xorl %ebp, %edi # ^b |
1013 | leal -0x359D3E2A(%rcx,%r14), %ecx # e += RCONST + W[n & 15] | 1147 | addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] |
1014 | addl %edi, %ecx # e += (c ^ d ^ b) | 1148 | addl %edi, %ecx # e += (c ^ d ^ b) |
1015 | movl %edx, %esi # | 1149 | movl %edx, %esi # |
1016 | roll $5, %esi # rotl32(a,5) | 1150 | roll $5, %esi # rotl32(a,5) |
1017 | addl %esi, %ecx # e += rotl32(a,5) | 1151 | addl %esi, %ecx # e += rotl32(a,5) |
1018 | rorl $2, %ebp # b = rotl32(b,30) | 1152 | rorl $2, %ebp # b = rotl32(b,30) |
1019 | # 63 | 1153 | # 63 |
1020 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
1021 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
1022 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
1023 | roll %r15d # | ||
1024 | movl %ebp, %edi # c | 1154 | movl %ebp, %edi # c |
1025 | xorl %eax, %edi # ^d | 1155 | xorl %eax, %edi # ^d |
1026 | xorl %edx, %edi # ^b | 1156 | xorl %edx, %edi # ^b |
1027 | leal -0x359D3E2A(%rbx,%r15), %ebx # e += RCONST + W[n & 15] | 1157 | addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] |
1028 | addl %edi, %ebx # e += (c ^ d ^ b) | 1158 | addl %edi, %ebx # e += (c ^ d ^ b) |
1029 | movl %ecx, %esi # | 1159 | movl %ecx, %esi # |
1030 | roll $5, %esi # rotl32(a,5) | 1160 | roll $5, %esi # rotl32(a,5) |
1031 | addl %esi, %ebx # e += rotl32(a,5) | 1161 | addl %esi, %ebx # e += rotl32(a,5) |
1032 | rorl $2, %edx # b = rotl32(b,30) | 1162 | rorl $2, %edx # b = rotl32(b,30) |
1163 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
1164 | movaps %xmm1, %xmm4 | ||
1165 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1166 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1167 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1168 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1169 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1170 | xorps %xmm5, %xmm2 # ^ | ||
1171 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1172 | movaps %xmm2, %xmm5 | ||
1173 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1174 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1175 | paddd %xmm2, %xmm2 # shift left by 1 | ||
1176 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
1177 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1178 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1179 | movaps %xmm5, %xmm4 | ||
1180 | pslld $2, %xmm5 | ||
1181 | psrld $30, %xmm4 | ||
1182 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1183 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
1184 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1185 | movaps %xmm2, %xmm5 | ||
1186 | paddd %xmm6, %xmm5 | ||
1187 | movups %xmm5, -64+16*2(%rsp) | ||
1033 | # 64 | 1188 | # 64 |
1034 | movl %r13d, %esi # W[(n+13) & 15] | ||
1035 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
1036 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
1037 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
1038 | roll %esi # | ||
1039 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
1040 | movl %edx, %edi # c | 1189 | movl %edx, %edi # c |
1041 | xorl %ebp, %edi # ^d | 1190 | xorl %ebp, %edi # ^d |
1042 | xorl %ecx, %edi # ^b | 1191 | xorl %ecx, %edi # ^b |
1043 | leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 1192 | addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] |
1044 | addl %edi, %eax # e += (c ^ d ^ b) | 1193 | addl %edi, %eax # e += (c ^ d ^ b) |
1045 | movl %ebx, %esi # | 1194 | movl %ebx, %esi # |
1046 | roll $5, %esi # rotl32(a,5) | 1195 | roll $5, %esi # rotl32(a,5) |
1047 | addl %esi, %eax # e += rotl32(a,5) | 1196 | addl %esi, %eax # e += rotl32(a,5) |
1048 | rorl $2, %ecx # b = rotl32(b,30) | 1197 | rorl $2, %ecx # b = rotl32(b,30) |
1049 | # 65 | 1198 | # 65 |
1050 | movl %r14d, %esi # W[(n+13) & 15] | ||
1051 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
1052 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
1053 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
1054 | roll %esi # | ||
1055 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
1056 | movl %ecx, %edi # c | 1199 | movl %ecx, %edi # c |
1057 | xorl %edx, %edi # ^d | 1200 | xorl %edx, %edi # ^d |
1058 | xorl %ebx, %edi # ^b | 1201 | xorl %ebx, %edi # ^b |
1059 | leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 1202 | addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] |
1060 | addl %edi, %ebp # e += (c ^ d ^ b) | 1203 | addl %edi, %ebp # e += (c ^ d ^ b) |
1061 | movl %eax, %esi # | 1204 | movl %eax, %esi # |
1062 | roll $5, %esi # rotl32(a,5) | 1205 | roll $5, %esi # rotl32(a,5) |
1063 | addl %esi, %ebp # e += rotl32(a,5) | 1206 | addl %esi, %ebp # e += rotl32(a,5) |
1064 | rorl $2, %ebx # b = rotl32(b,30) | 1207 | rorl $2, %ebx # b = rotl32(b,30) |
1065 | # 66 | 1208 | # 66 |
1066 | movl %r15d, %esi # W[(n+13) & 15] | ||
1067 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
1068 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
1069 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
1070 | roll %esi # | ||
1071 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
1072 | movl %ebx, %edi # c | 1209 | movl %ebx, %edi # c |
1073 | xorl %ecx, %edi # ^d | 1210 | xorl %ecx, %edi # ^d |
1074 | xorl %eax, %edi # ^b | 1211 | xorl %eax, %edi # ^b |
1075 | leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 1212 | addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] |
1076 | addl %edi, %edx # e += (c ^ d ^ b) | 1213 | addl %edi, %edx # e += (c ^ d ^ b) |
1077 | movl %ebp, %esi # | 1214 | movl %ebp, %esi # |
1078 | roll $5, %esi # rotl32(a,5) | 1215 | roll $5, %esi # rotl32(a,5) |
1079 | addl %esi, %edx # e += rotl32(a,5) | 1216 | addl %esi, %edx # e += rotl32(a,5) |
1080 | rorl $2, %eax # b = rotl32(b,30) | 1217 | rorl $2, %eax # b = rotl32(b,30) |
1081 | # 67 | 1218 | # 67 |
1082 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
1083 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
1084 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
1085 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
1086 | roll %esi # | ||
1087 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
1088 | movl %eax, %edi # c | 1219 | movl %eax, %edi # c |
1089 | xorl %ebx, %edi # ^d | 1220 | xorl %ebx, %edi # ^d |
1090 | xorl %ebp, %edi # ^b | 1221 | xorl %ebp, %edi # ^b |
1091 | leal -0x359D3E2A(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 1222 | addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] |
1092 | addl %edi, %ecx # e += (c ^ d ^ b) | 1223 | addl %edi, %ecx # e += (c ^ d ^ b) |
1093 | movl %edx, %esi # | 1224 | movl %edx, %esi # |
1094 | roll $5, %esi # rotl32(a,5) | 1225 | roll $5, %esi # rotl32(a,5) |
1095 | addl %esi, %ecx # e += rotl32(a,5) | 1226 | addl %esi, %ecx # e += rotl32(a,5) |
1096 | rorl $2, %ebp # b = rotl32(b,30) | 1227 | rorl $2, %ebp # b = rotl32(b,30) |
1228 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
1229 | movaps %xmm2, %xmm4 | ||
1230 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1231 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1232 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1233 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1234 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1235 | xorps %xmm5, %xmm3 # ^ | ||
1236 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1237 | movaps %xmm3, %xmm5 | ||
1238 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1239 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1240 | paddd %xmm3, %xmm3 # shift left by 1 | ||
1241 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
1242 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1243 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1244 | movaps %xmm5, %xmm4 | ||
1245 | pslld $2, %xmm5 | ||
1246 | psrld $30, %xmm4 | ||
1247 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1248 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
1249 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1250 | movaps %xmm3, %xmm5 | ||
1251 | paddd %xmm6, %xmm5 | ||
1252 | movups %xmm5, -64+16*3(%rsp) | ||
1097 | # 68 | 1253 | # 68 |
1098 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
1099 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
1100 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
1101 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
1102 | roll %esi # | ||
1103 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
1104 | movl %ebp, %edi # c | 1254 | movl %ebp, %edi # c |
1105 | xorl %eax, %edi # ^d | 1255 | xorl %eax, %edi # ^d |
1106 | xorl %edx, %edi # ^b | 1256 | xorl %edx, %edi # ^b |
1107 | leal -0x359D3E2A(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 1257 | addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] |
1108 | addl %edi, %ebx # e += (c ^ d ^ b) | 1258 | addl %edi, %ebx # e += (c ^ d ^ b) |
1109 | movl %ecx, %esi # | 1259 | movl %ecx, %esi # |
1110 | roll $5, %esi # rotl32(a,5) | 1260 | roll $5, %esi # rotl32(a,5) |
1111 | addl %esi, %ebx # e += rotl32(a,5) | 1261 | addl %esi, %ebx # e += rotl32(a,5) |
1112 | rorl $2, %edx # b = rotl32(b,30) | 1262 | rorl $2, %edx # b = rotl32(b,30) |
1113 | # 69 | 1263 | # 69 |
1114 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
1115 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
1116 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
1117 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
1118 | roll %esi # | ||
1119 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
1120 | movl %edx, %edi # c | 1264 | movl %edx, %edi # c |
1121 | xorl %ebp, %edi # ^d | 1265 | xorl %ebp, %edi # ^d |
1122 | xorl %ecx, %edi # ^b | 1266 | xorl %ecx, %edi # ^b |
1123 | leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 1267 | addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] |
1124 | addl %edi, %eax # e += (c ^ d ^ b) | 1268 | addl %edi, %eax # e += (c ^ d ^ b) |
1125 | movl %ebx, %esi # | 1269 | movl %ebx, %esi # |
1126 | roll $5, %esi # rotl32(a,5) | 1270 | roll $5, %esi # rotl32(a,5) |
1127 | addl %esi, %eax # e += rotl32(a,5) | 1271 | addl %esi, %eax # e += rotl32(a,5) |
1128 | rorl $2, %ecx # b = rotl32(b,30) | 1272 | rorl $2, %ecx # b = rotl32(b,30) |
1129 | # 70 | 1273 | # 70 |
1130 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
1131 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
1132 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
1133 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
1134 | roll %esi # | ||
1135 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
1136 | movl %ecx, %edi # c | 1274 | movl %ecx, %edi # c |
1137 | xorl %edx, %edi # ^d | 1275 | xorl %edx, %edi # ^d |
1138 | xorl %ebx, %edi # ^b | 1276 | xorl %ebx, %edi # ^b |
1139 | leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 1277 | addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] |
1140 | addl %edi, %ebp # e += (c ^ d ^ b) | 1278 | addl %edi, %ebp # e += (c ^ d ^ b) |
1141 | movl %eax, %esi # | 1279 | movl %eax, %esi # |
1142 | roll $5, %esi # rotl32(a,5) | 1280 | roll $5, %esi # rotl32(a,5) |
1143 | addl %esi, %ebp # e += rotl32(a,5) | 1281 | addl %esi, %ebp # e += rotl32(a,5) |
1144 | rorl $2, %ebx # b = rotl32(b,30) | 1282 | rorl $2, %ebx # b = rotl32(b,30) |
1145 | # 71 | 1283 | # 71 |
1146 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
1147 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
1148 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
1149 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
1150 | roll %esi # | ||
1151 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
1152 | movl %ebx, %edi # c | 1284 | movl %ebx, %edi # c |
1153 | xorl %ecx, %edi # ^d | 1285 | xorl %ecx, %edi # ^d |
1154 | xorl %eax, %edi # ^b | 1286 | xorl %eax, %edi # ^b |
1155 | leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 1287 | addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] |
1156 | addl %edi, %edx # e += (c ^ d ^ b) | 1288 | addl %edi, %edx # e += (c ^ d ^ b) |
1157 | movl %ebp, %esi # | 1289 | movl %ebp, %esi # |
1158 | roll $5, %esi # rotl32(a,5) | 1290 | roll $5, %esi # rotl32(a,5) |
1159 | addl %esi, %edx # e += rotl32(a,5) | 1291 | addl %esi, %edx # e += rotl32(a,5) |
1160 | rorl $2, %eax # b = rotl32(b,30) | 1292 | rorl $2, %eax # b = rotl32(b,30) |
1161 | # 72 | 1293 | # 72 |
1162 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
1163 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
1164 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
1165 | roll %r8d # | ||
1166 | movl %eax, %edi # c | 1294 | movl %eax, %edi # c |
1167 | xorl %ebx, %edi # ^d | 1295 | xorl %ebx, %edi # ^d |
1168 | xorl %ebp, %edi # ^b | 1296 | xorl %ebp, %edi # ^b |
1169 | leal -0x359D3E2A(%rcx,%r8), %ecx # e += RCONST + W[n & 15] | 1297 | addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] |
1170 | addl %edi, %ecx # e += (c ^ d ^ b) | 1298 | addl %edi, %ecx # e += (c ^ d ^ b) |
1171 | movl %edx, %esi # | 1299 | movl %edx, %esi # |
1172 | roll $5, %esi # rotl32(a,5) | 1300 | roll $5, %esi # rotl32(a,5) |
1173 | addl %esi, %ecx # e += rotl32(a,5) | 1301 | addl %esi, %ecx # e += rotl32(a,5) |
1174 | rorl $2, %ebp # b = rotl32(b,30) | 1302 | rorl $2, %ebp # b = rotl32(b,30) |
1175 | # 73 | 1303 | # 73 |
1176 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
1177 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
1178 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
1179 | roll %r9d # | ||
1180 | movl %ebp, %edi # c | 1304 | movl %ebp, %edi # c |
1181 | xorl %eax, %edi # ^d | 1305 | xorl %eax, %edi # ^d |
1182 | xorl %edx, %edi # ^b | 1306 | xorl %edx, %edi # ^b |
1183 | leal -0x359D3E2A(%rbx,%r9), %ebx # e += RCONST + W[n & 15] | 1307 | addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] |
1184 | addl %edi, %ebx # e += (c ^ d ^ b) | 1308 | addl %edi, %ebx # e += (c ^ d ^ b) |
1185 | movl %ecx, %esi # | 1309 | movl %ecx, %esi # |
1186 | roll $5, %esi # rotl32(a,5) | 1310 | roll $5, %esi # rotl32(a,5) |
1187 | addl %esi, %ebx # e += rotl32(a,5) | 1311 | addl %esi, %ebx # e += rotl32(a,5) |
1188 | rorl $2, %edx # b = rotl32(b,30) | 1312 | rorl $2, %edx # b = rotl32(b,30) |
1189 | # 74 | 1313 | # 74 |
1190 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
1191 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
1192 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
1193 | roll %r10d # | ||
1194 | movl %edx, %edi # c | 1314 | movl %edx, %edi # c |
1195 | xorl %ebp, %edi # ^d | 1315 | xorl %ebp, %edi # ^d |
1196 | xorl %ecx, %edi # ^b | 1316 | xorl %ecx, %edi # ^b |
1197 | leal -0x359D3E2A(%rax,%r10), %eax # e += RCONST + W[n & 15] | 1317 | addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] |
1198 | addl %edi, %eax # e += (c ^ d ^ b) | 1318 | addl %edi, %eax # e += (c ^ d ^ b) |
1199 | movl %ebx, %esi # | 1319 | movl %ebx, %esi # |
1200 | roll $5, %esi # rotl32(a,5) | 1320 | roll $5, %esi # rotl32(a,5) |
1201 | addl %esi, %eax # e += rotl32(a,5) | 1321 | addl %esi, %eax # e += rotl32(a,5) |
1202 | rorl $2, %ecx # b = rotl32(b,30) | 1322 | rorl $2, %ecx # b = rotl32(b,30) |
1203 | # 75 | 1323 | # 75 |
1204 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
1205 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
1206 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
1207 | roll %r11d # | ||
1208 | movl %ecx, %edi # c | 1324 | movl %ecx, %edi # c |
1209 | xorl %edx, %edi # ^d | 1325 | xorl %edx, %edi # ^d |
1210 | xorl %ebx, %edi # ^b | 1326 | xorl %ebx, %edi # ^b |
1211 | leal -0x359D3E2A(%rbp,%r11), %ebp # e += RCONST + W[n & 15] | 1327 | addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] |
1212 | addl %edi, %ebp # e += (c ^ d ^ b) | 1328 | addl %edi, %ebp # e += (c ^ d ^ b) |
1213 | movl %eax, %esi # | 1329 | movl %eax, %esi # |
1214 | roll $5, %esi # rotl32(a,5) | 1330 | roll $5, %esi # rotl32(a,5) |
1215 | addl %esi, %ebp # e += rotl32(a,5) | 1331 | addl %esi, %ebp # e += rotl32(a,5) |
1216 | rorl $2, %ebx # b = rotl32(b,30) | 1332 | rorl $2, %ebx # b = rotl32(b,30) |
1217 | # 76 | 1333 | # 76 |
1218 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
1219 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
1220 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
1221 | roll %r12d # | ||
1222 | movl %ebx, %edi # c | 1334 | movl %ebx, %edi # c |
1223 | xorl %ecx, %edi # ^d | 1335 | xorl %ecx, %edi # ^d |
1224 | xorl %eax, %edi # ^b | 1336 | xorl %eax, %edi # ^b |
1225 | leal -0x359D3E2A(%rdx,%r12), %edx # e += RCONST + W[n & 15] | 1337 | addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] |
1226 | addl %edi, %edx # e += (c ^ d ^ b) | 1338 | addl %edi, %edx # e += (c ^ d ^ b) |
1227 | movl %ebp, %esi # | 1339 | movl %ebp, %esi # |
1228 | roll $5, %esi # rotl32(a,5) | 1340 | roll $5, %esi # rotl32(a,5) |
1229 | addl %esi, %edx # e += rotl32(a,5) | 1341 | addl %esi, %edx # e += rotl32(a,5) |
1230 | rorl $2, %eax # b = rotl32(b,30) | 1342 | rorl $2, %eax # b = rotl32(b,30) |
1231 | # 77 | 1343 | # 77 |
1232 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
1233 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
1234 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
1235 | roll %r13d # | ||
1236 | movl %eax, %edi # c | 1344 | movl %eax, %edi # c |
1237 | xorl %ebx, %edi # ^d | 1345 | xorl %ebx, %edi # ^d |
1238 | xorl %ebp, %edi # ^b | 1346 | xorl %ebp, %edi # ^b |
1239 | leal -0x359D3E2A(%rcx,%r13), %ecx # e += RCONST + W[n & 15] | 1347 | addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] |
1240 | addl %edi, %ecx # e += (c ^ d ^ b) | 1348 | addl %edi, %ecx # e += (c ^ d ^ b) |
1241 | movl %edx, %esi # | 1349 | movl %edx, %esi # |
1242 | roll $5, %esi # rotl32(a,5) | 1350 | roll $5, %esi # rotl32(a,5) |
1243 | addl %esi, %ecx # e += rotl32(a,5) | 1351 | addl %esi, %ecx # e += rotl32(a,5) |
1244 | rorl $2, %ebp # b = rotl32(b,30) | 1352 | rorl $2, %ebp # b = rotl32(b,30) |
1245 | # 78 | 1353 | # 78 |
1246 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
1247 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
1248 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
1249 | roll %r14d # | ||
1250 | movl %ebp, %edi # c | 1354 | movl %ebp, %edi # c |
1251 | xorl %eax, %edi # ^d | 1355 | xorl %eax, %edi # ^d |
1252 | xorl %edx, %edi # ^b | 1356 | xorl %edx, %edi # ^b |
1253 | leal -0x359D3E2A(%rbx,%r14), %ebx # e += RCONST + W[n & 15] | 1357 | addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] |
1254 | addl %edi, %ebx # e += (c ^ d ^ b) | 1358 | addl %edi, %ebx # e += (c ^ d ^ b) |
1255 | movl %ecx, %esi # | 1359 | movl %ecx, %esi # |
1256 | roll $5, %esi # rotl32(a,5) | 1360 | roll $5, %esi # rotl32(a,5) |
1257 | addl %esi, %ebx # e += rotl32(a,5) | 1361 | addl %esi, %ebx # e += rotl32(a,5) |
1258 | rorl $2, %edx # b = rotl32(b,30) | 1362 | rorl $2, %edx # b = rotl32(b,30) |
1259 | # 79 | 1363 | # 79 |
1260 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
1261 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
1262 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
1263 | roll %r15d # | ||
1264 | movl %edx, %edi # c | 1364 | movl %edx, %edi # c |
1265 | xorl %ebp, %edi # ^d | 1365 | xorl %ebp, %edi # ^d |
1266 | xorl %ecx, %edi # ^b | 1366 | xorl %ecx, %edi # ^b |
1267 | leal -0x359D3E2A(%rax,%r15), %eax # e += RCONST + W[n & 15] | 1367 | addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] |
1268 | addl %edi, %eax # e += (c ^ d ^ b) | 1368 | addl %edi, %eax # e += (c ^ d ^ b) |
1269 | movl %ebx, %esi # | 1369 | movl %ebx, %esi # |
1270 | roll $5, %esi # rotl32(a,5) | 1370 | roll $5, %esi # rotl32(a,5) |
@@ -1286,4 +1386,28 @@ sha1_process_block64: | |||
1286 | 1386 | ||
1287 | ret | 1387 | ret |
1288 | .size sha1_process_block64, .-sha1_process_block64 | 1388 | .size sha1_process_block64, .-sha1_process_block64 |
1389 | |||
1390 | .section .rodata.cst16.sha1const, "aM", @progbits, 16 | ||
1391 | .align 16 | ||
1392 | rconst0x5A827999: | ||
1393 | .long 0x5A827999 | ||
1394 | .long 0x5A827999 | ||
1395 | .long 0x5A827999 | ||
1396 | .long 0x5A827999 | ||
1397 | rconst0x6ED9EBA1: | ||
1398 | .long 0x6ED9EBA1 | ||
1399 | .long 0x6ED9EBA1 | ||
1400 | .long 0x6ED9EBA1 | ||
1401 | .long 0x6ED9EBA1 | ||
1402 | rconst0x8F1BBCDC: | ||
1403 | .long 0x8F1BBCDC | ||
1404 | .long 0x8F1BBCDC | ||
1405 | .long 0x8F1BBCDC | ||
1406 | .long 0x8F1BBCDC | ||
1407 | rconst0xCA62C1D6: | ||
1408 | .long 0xCA62C1D6 | ||
1409 | .long 0xCA62C1D6 | ||
1410 | .long 0xCA62C1D6 | ||
1411 | .long 0xCA62C1D6 | ||
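These are the standard SHA-1 round constants, one per 20-round group; the negative displacements in the leal forms elsewhere in this file are the same values read as signed 32-bit offsets: (uint32_t)-0x70E44324 == 0x8F1BBCDC and (uint32_t)-0x359D3E2A == 0xCA62C1D6. In C terms (illustrative only):

#include <stdint.h>

static const uint32_t sha1_k[4] = {
        0x5A827999,   /* rounds  0..19 */
        0x6ED9EBA1,   /* rounds 20..39 */
        0x8F1BBCDC,   /* rounds 40..59, leal form uses -0x70E44324 */
        0xCA62C1D6,   /* rounds 60..79, leal form uses -0x359D3E2A */
};

static uint32_t sha1_rconst(unsigned t)         /* t = 0..79 */
{
        return sha1_k[t / 20];
}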
1412 | |||
1289 | #endif | 1413 | #endif |
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 901896e6e..87c2d0800 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh | |||
@@ -6,33 +6,103 @@ | |||
6 | # also contains the diff of the generated file. | 6 | # also contains the diff of the generated file. |
7 | exec >hash_md5_sha_x86-64.S | 7 | exec >hash_md5_sha_x86-64.S |
8 | 8 | ||
9 | # There is a way to use XMM registers (which always exist for x86-64!) for W[] | 9 | # Based on http://arctic.org/~dean/crypto/sha1.html. |
10 | # For example, if we load W as follows: | 10 | # ("This SHA1 implementation is public domain.") |
11 | # %xmm0: w[0x0] w[0x1] w[0x2] w[0x3] | 11 | # |
12 | # %xmm4: w[0x4] w[0x5] w[0x6] w[0x7] | 12 | # x86-64 has at least SSE2 vector insns always available. |
13 | # %xmm8: w[0x8] w[0x9] w[0xa] w[0xb] | 13 | # We can use them without any CPUID checks (and without a need |
14 | # %xmm12: w[0xc] w[0xd] w[0xe] w[0xf] | 14 | # for a fallback code if needed insns are not available). |
15 | # then the xor'ing operation to generate next W[0..3] is: | 15 | # This code uses them to calculate W[] ahead of time. |
16 | # movaps %xmm0, %xmmT2 | 16 | # |
17 | # palignr $0x8, %xmm4, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5]) | 17 | # Unfortunately, results are passed from vector unit to |
18 | # # Right-shifts xmm4:xmmT2 by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn. | 18 | # integer ALUs on the stack. MOVD/Q insns to move them directly |
19 | # movaps %xmm0, %xmmT13 | 19 | # from vector to integer registers are slower than store-to-load |
20 | # palignr $0x4,%xmm0,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0]) | 20 | # forwarding in LSU (on Skylake at least). |
21 | # xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13 | 21 | # |
22 | # xmm0 = rol32(xmm0,1) # no such insn, have to use pslld+psrld+or | 22 | # The win against a purely integer code is small on Skylake, |
23 | # and then results can be extracted for use: | 23 | # only about 7-8%. We offload about 1/3 of our operations to the vector unit. |
24 | # movd %xmm0, %esi # new W[0] | 24 | # It can do 4 ops at once in one 128-bit register, |
25 | # pextrd $1, %xmm0, %esi # new W[1] | 25 | # but we have to use two of them because of the W[0] complication; |
26 | # # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1) | 26 | # SSE2 has no "rotate each word by N bits" insns; |
27 | # pextrd $2, %xmm0, %esi # new W[2] | 27 | # moving data to/from the vector unit is clunky; and Skylake |
28 | # pextrd $3, %xmm0, %esi # new W[3] | 28 | # has four integer ALUs unified with three vector ALUs, |
29 | # ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64. | 29 | # which makes pure integer code rather fast, and makes |
30 | # vector ops compete with integer ones. | ||
31 | # | ||
32 | # Zen3, with its separate vector ALUs, wins more, about 12%. | ||
33 | |||
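The handoff the comment describes, where the vector unit precomputes RCONST + W[], spills 16 bytes to the stack, and each scalar round then needs only one addl from that slot, can be modeled with SSE2 intrinsics roughly as follows (a sketch with illustrative names, not the generator's output):

#include <emmintrin.h>
#include <stdint.h>

/* Vector side: add the round constant to four schedule words and spill them
 * (paddd + movups in the generated code). */
static void spill_w_plus_k(__m128i w, __m128i k, uint32_t staging[4])
{
        _mm_storeu_si128((__m128i *)staging, _mm_add_epi32(w, k));
}

/* Scalar side: one add per round from the staging area
 * (addl -64+4*n(%rsp), %e<reg>); store-to-load forwarding covers the
 * latency instead of a movd/movq register transfer. */
static uint32_t e_plus_w_plus_k(uint32_t e, const uint32_t staging[4], unsigned n)
{
        return e + staging[n & 3];
}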
34 | xmmT1="%xmm4" | ||
35 | xmmT2="%xmm5" | ||
36 | xmmRCONST="%xmm6" | ||
37 | T=`printf '\t'` | ||
38 | |||
39 | # SSE instructions are longer than 4 bytes on average. | ||
40 | # Intel CPUs (up to Tiger Lake at least) can't decode | ||
41 | # more than 16 bytes of code in one cycle. | ||
42 | # By interleaving SSE code and integer code | ||
43 | # we mostly achieve a situation where a 16-byte decode fetch window | ||
44 | # contains 4 (or more) insns. | ||
45 | # | ||
46 | # However, on Skylake there was no observed difference, | ||
47 | # but on Zen3, non-interleaved code is ~3% faster | ||
48 | # (822 Mb/s versus 795 Mb/s hashing speed). | ||
49 | # Off for now: | ||
50 | interleave=false | ||
51 | |||
52 | INTERLEAVE() { | ||
53 | $interleave || \ | ||
54 | { | ||
55 | # Generate non-interleaved code | ||
56 | # (it should work correctly too) | ||
57 | echo "$1" | ||
58 | echo "$2" | ||
59 | return | ||
60 | } | ||
61 | ( | ||
62 | echo "$1" | grep -v '^$' >"$0.temp1" | ||
63 | echo "$2" | grep -v '^$' >"$0.temp2" | ||
64 | exec 3<"$0.temp1" | ||
65 | exec 4<"$0.temp2" | ||
66 | IFS='' | ||
67 | while :; do | ||
68 | line1='' | ||
69 | line2='' | ||
70 | while :; do | ||
71 | read -r line1 <&3 | ||
72 | if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then | ||
73 | break | ||
74 | fi | ||
75 | echo "$line1" | ||
76 | done | ||
77 | while :; do | ||
78 | read -r line2 <&4 | ||
79 | if test "${line2:0:4}" = "${T}lea"; then | ||
80 | # We use 7-8 byte long forms of LEA. | ||
81 | # Do not interleave them with SSE insns | ||
82 | # which are also long. | ||
83 | echo "$line2" | ||
84 | read -r line2 <&4 | ||
85 | echo "$line2" | ||
86 | continue | ||
87 | fi | ||
88 | if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then | ||
89 | break | ||
90 | fi | ||
91 | echo "$line2" | ||
92 | done | ||
93 | test "$line1$line2" || break | ||
94 | echo "$line1" | ||
95 | echo "$line2" | ||
96 | done | ||
97 | rm "$0.temp1" "$0.temp2" | ||
98 | ) | ||
99 | } | ||
30 | 100 | ||
31 | echo \ | 101 | echo \ |
32 | '### Generated by hash_md5_sha_x86-64.S.sh ### | 102 | "### Generated by hash_md5_sha_x86-64.S.sh ### |
33 | 103 | ||
34 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) | 104 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) |
35 | .section .text.sha1_process_block64,"ax",@progbits | 105 | .section .text.sha1_process_block64,\"ax\",@progbits |
36 | .globl sha1_process_block64 | 106 | .globl sha1_process_block64 |
37 | .hidden sha1_process_block64 | 107 | .hidden sha1_process_block64 |
38 | .type sha1_process_block64, @function | 108 | .type sha1_process_block64, @function |
@@ -51,16 +121,10 @@ sha1_process_block64: | |||
51 | # eax..edx: a..d | 121 | # eax..edx: a..d |
52 | # ebp: e | 122 | # ebp: e |
53 | # esi,edi: temps | 123 | # esi,edi: temps |
54 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] | 124 | # xmm0..xmm3: W[] |
55 | # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) | 125 | # xmm4,xmm5: temps |
56 | movl $3, %eax | 126 | # xmm6: current round constant |
57 | 1: | 127 | # -64(%rsp): area for passing RCONST + W[] from vector to integer units |
58 | movq (%rdi,%rax,8), %rsi | ||
59 | bswapq %rsi | ||
60 | rolq $32, %rsi | ||
61 | movq %rsi, -32(%rsp,%rax,8) | ||
62 | decl %eax | ||
63 | jns 1b | ||
64 | 128 | ||
65 | movl 80(%rdi), %eax # a = ctx->hash[0] | 129 | movl 80(%rdi), %eax # a = ctx->hash[0] |
66 | movl 84(%rdi), %ebx # b = ctx->hash[1] | 130 | movl 84(%rdi), %ebx # b = ctx->hash[1] |
@@ -68,32 +132,120 @@ sha1_process_block64: | |||
68 | movl 92(%rdi), %edx # d = ctx->hash[3] | 132 | movl 92(%rdi), %edx # d = ctx->hash[3] |
69 | movl 96(%rdi), %ebp # e = ctx->hash[4] | 133 | movl 96(%rdi), %ebp # e = ctx->hash[4] |
70 | 134 | ||
135 | movaps rconst0x5A827999(%rip), $xmmRCONST | ||
136 | |||
137 | # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 | ||
138 | # instead of spilling them to stack. | ||
139 | # (We lose parallelized addition of RCONST, but LEA | ||
140 | # can do two additions at once, so...) | ||
141 | movq 4*0(%rdi), %rsi | ||
142 | movq 4*2(%rdi), %r10 | ||
143 | bswapq %rsi | ||
144 | bswapq %r10 | ||
145 | rolq \$32, %rsi # rsi = W[1]:W[0] | ||
146 | rolq \$32, %r10 | ||
147 | movq %rsi, %xmm0 | ||
148 | movq %r10, $xmmT1 | ||
149 | punpcklqdq $xmmT1, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) | ||
150 | movaps %xmm0, $xmmT1 | ||
151 | paddd $xmmRCONST, $xmmT1 | ||
152 | movups $xmmT1, -64+4*0(%rsp) | ||
153 | |||
154 | movq 4*4(%rdi), %r8 | ||
155 | movq 4*6(%rdi), %r10 | ||
156 | bswapq %r8 | ||
157 | bswapq %r10 | ||
158 | rolq \$32, %r8 | ||
159 | rolq \$32, %r10 | ||
160 | movq %r8, %xmm1 | ||
161 | movq %r10, $xmmT1 | ||
162 | punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) | ||
163 | movaps %xmm1, $xmmT1 | ||
164 | paddd $xmmRCONST, $xmmT1 | ||
165 | movups $xmmT1, -64+4*4(%rsp) | ||
166 | |||
71 | movq 4*8(%rdi), %r8 | 167 | movq 4*8(%rdi), %r8 |
72 | movq 4*10(%rdi), %r10 | 168 | movq 4*10(%rdi), %r10 |
73 | bswapq %r8 | 169 | bswapq %r8 |
74 | bswapq %r10 | 170 | bswapq %r10 |
171 | movl %r8d, %r9d # r9d = W[9] | ||
172 | rolq \$32, %r8 # r8 = W[9]:W[8] | ||
173 | movl %r10d, %r11d # r11d = W[11] | ||
174 | rolq \$32, %r10 # r10 = W[11]:W[10] | ||
175 | movq %r8, %xmm2 | ||
176 | movq %r10, $xmmT1 | ||
177 | punpcklqdq $xmmT1, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) | ||
178 | |||
75 | movq 4*12(%rdi), %r12 | 179 | movq 4*12(%rdi), %r12 |
76 | movq 4*14(%rdi), %r14 | 180 | movq 4*14(%rdi), %r14 |
77 | bswapq %r12 | 181 | bswapq %r12 |
78 | bswapq %r14 | 182 | bswapq %r14 |
79 | movl %r8d, %r9d | 183 | movl %r12d, %r13d # r13d = W[13] |
80 | shrq $32, %r8 | 184 | rolq \$32, %r12 # r12 = W[13]:W[12] |
81 | movl %r10d, %r11d | 185 | movl %r14d, %r15d # r15d = W[15] |
82 | shrq $32, %r10 | 186 | rolq \$32, %r14 # r14 = W[15]:W[14] |
83 | movl %r12d, %r13d | 187 | movq %r12, %xmm3 |
84 | shrq $32, %r12 | 188 | movq %r14, $xmmT1 |
85 | movl %r14d, %r15d | 189 | punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) |
86 | shrq $32, %r14 | 190 | " |
87 | ' | 191 | |
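The bswapq/rolq/punpcklqdq sequences above simply load the sixteen big-endian words of the 64-byte block two at a time: all of W[0..15] end up packed in %xmm0..%xmm3, and W[0] plus W[8..15] are also kept in %esi and %r8d..%r15d for round 1. The same loading in C (a sketch; __builtin_bswap32 assumes gcc/clang, which the surrounding #if already requires):

#include <stdint.h>
#include <string.h>

static void sha1_load_w(const unsigned char block[64], uint32_t W[16])
{
        for (int i = 0; i < 16; i++) {
                uint32_t w;
                memcpy(&w, block + 4 * i, 4);
                W[i] = __builtin_bswap32(w);    /* message words are big-endian */
        }
}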
88 | W32() { | 192 | PREP() { |
89 | test "$1" || exit 1 | 193 | local xmmW0=$1 |
90 | test "$1" -lt 0 && exit 1 | 194 | local xmmW4=$2 |
91 | test "$1" -gt 15 && exit 1 | 195 | local xmmW8=$3 |
92 | test "$1" -lt 8 && echo "-32+4*$1(%rsp)" | 196 | local xmmW12=$4 |
93 | test "$1" -ge 8 && echo "%r${1}d" | 197 | # the above must be %xmm0..3 in some permutation |
198 | local dstmem=$5 | ||
199 | #W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); | ||
200 | #W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); | ||
201 | #W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); | ||
202 | #W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); | ||
203 | #W[3] ^= rol(W[0], 1); | ||
204 | echo "# PREP $@ | ||
205 | movaps $xmmW12, $xmmT1 | ||
206 | psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
207 | |||
208 | pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
209 | punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
210 | |||
211 | xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
212 | xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
213 | xorps $xmmT2, $xmmW0 # ^ | ||
214 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
215 | movaps $xmmW0, $xmmT2 | ||
216 | |||
217 | xorps $xmmT1, $xmmT1 # rol(W0,1): | ||
218 | pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) | ||
219 | paddd $xmmW0, $xmmW0 # shift left by 1 | ||
220 | psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 | ||
221 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
222 | |||
223 | pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
224 | movaps $xmmT2, $xmmT1 | ||
225 | pslld \$2, $xmmT2 | ||
226 | psrld \$30, $xmmT1 | ||
227 | # xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) | ||
228 | xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 | ||
229 | |||
230 | xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
231 | " | ||
232 | # movq $xmmW0, %r8 # high latency (~6 cycles) | ||
233 | # movaps $xmmW0, $xmmT1 | ||
234 | # psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower | ||
235 | # movq $xmmT1, %r10 # high latency | ||
236 | # movq %r8, %r9 | ||
237 | # movq %r10, %r11 | ||
238 | # shrq \$32, %r9 | ||
239 | # shrq \$32, %r11 | ||
240 | # ^^^ slower than passing the results on stack (!!!) | ||
241 | echo " | ||
242 | movaps $xmmW0, $xmmT2 | ||
243 | paddd $xmmRCONST, $xmmT2 | ||
244 | movups $xmmT2, $dstmem | ||
245 | " | ||
94 | } | 246 | } |
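The comment block at the top of PREP() states the recurrence it implements; a scalar C sketch of the same four-lane step (hypothetical helper names; n is a multiple of 4) may make the W[3] fixup easier to follow. The fixup works because rol distributes over XOR, so the rol(W[n+0],1) term that lane 3 is missing can simply be XORed in after the vector rotate:

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, unsigned k)
    {
        return (x << k) | (x >> (32 - k));
    }

    /* Scalar equivalent of one PREP() call: update four consecutive schedule
     * words in the 16-word circular buffer.  Lane 3 needs the value lane 0
     * just produced, so it is computed with a 0 placeholder first and then
     * fixed up, exactly as the vector code does. */
    static void prep_sketch(uint32_t W[16], unsigned n)    /* n % 4 == 0 */
    {
        W[(n + 0) & 15] = rol32(W[(n + 13) & 15] ^ W[(n + 8) & 15] ^ W[(n + 2) & 15] ^ W[(n + 0) & 15], 1);
        W[(n + 1) & 15] = rol32(W[(n + 14) & 15] ^ W[(n + 9) & 15] ^ W[(n + 3) & 15] ^ W[(n + 1) & 15], 1);
        W[(n + 2) & 15] = rol32(W[(n + 15) & 15] ^ W[(n + 10) & 15] ^ W[(n + 4) & 15] ^ W[(n + 2) & 15], 1);
        W[(n + 3) & 15] = rol32(0               ^ W[(n + 11) & 15] ^ W[(n + 5) & 15] ^ W[(n + 3) & 15], 1);
        W[(n + 3) & 15] ^= rol32(W[(n + 0) & 15], 1);      /* fixup with the fresh W[n+0] */
    }

After the fixup, the asm adds the current round constant to all four words (paddd) and spills them with movups, so each integer round later needs only a single addl; the commented-out movq-to-GPR alternative inside PREP() was found slower than this store/reload through the stack.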
95 | 247 | ||
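SSE2 has no per-lane rotate instruction, so PREP() synthesizes the rol-by-1 with pcmpgtd/paddd/psubd. An intrinsics sketch of just that trick (illustrative only, not part of the generated file):

    #include <emmintrin.h>

    /* Rotate every 32-bit lane left by 1 using only SSE2: x+x shifts left,
     * and subtracting the pcmpgtd mask (-1 where the sign bit was set)
     * re-inserts the bit that was shifted out. */
    static __m128i rol1_epi32(__m128i x)
    {
        __m128i msb = _mm_cmpgt_epi32(_mm_setzero_si128(), x);  /* 0xFFFFFFFF where x < 0 */
        x = _mm_add_epi32(x, x);                                /* each lane << 1 */
        return _mm_sub_epi32(x, msb);                           /* x - (-1) == +1 where MSB was set */
    }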
96 | # It's possible to interleave insns in rounds to mostly eliminate | 248 | # It's possible to interleave integer insns in rounds to mostly eliminate |
97 | # dependency chains, but this is likely to only help old Pentium-based | 249 | # dependency chains, but this is likely to only help old Pentium-based |
98 | # CPUs (ones without OOO, which can only simultaneously execute a pair | 250 | # CPUs (ones without OOO, which can only simultaneously execute a pair |
99 | # of _adjacent_ insns). | 251 | # of _adjacent_ insns). |
@@ -107,21 +259,16 @@ local n0=$(((n+0) & 15)) | |||
107 | echo " | 259 | echo " |
108 | # $n | 260 | # $n |
109 | ";test $n0 = 0 && echo " | 261 | ";test $n0 = 0 && echo " |
110 | # W[0], already in %esi | 262 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] |
111 | ";test $n0 != 0 && test $n0 -lt 8 && echo " | 263 | ";test $n0 != 0 && test $n0 -lt 8 && echo " |
112 | movl `W32 $n0`, %esi # W[n] | 264 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n] |
113 | ";test $n0 -ge 8 && echo " | 265 | ";test $n0 -ge 8 && echo " |
114 | # W[n], in %r$n0 | 266 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] |
115 | ";echo " | 267 | ";echo " |
116 | movl %e$c, %edi # c | 268 | movl %e$c, %edi # c |
117 | xorl %e$d, %edi # ^d | 269 | xorl %e$d, %edi # ^d |
118 | andl %e$b, %edi # &b | 270 | andl %e$b, %edi # &b |
119 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) | 271 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) |
120 | ";test $n0 -lt 8 && echo " | ||
121 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] | ||
122 | ";test $n0 -ge 8 && echo " | ||
123 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] | ||
124 | ";echo " | ||
125 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) | 272 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) |
126 | movl %e$a, %esi # | 273 | movl %e$a, %esi # |
127 | roll \$5, %esi # rotl32(a,5) | 274 | roll \$5, %esi # rotl32(a,5) |
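With RCONST already folded into the stack copy of W[], each round-1 step above needs only one extra add: an addl from -64+4*n(%rsp), or a leal when W[n] still lives in a register. A scalar sketch of a single RD1A/RD1B step, with wk standing for the pre-added RCONST+W[n] (helper names are illustrative):

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, unsigned k)
    {
        return (x << k) | (x >> (32 - k));
    }

    /* One round-1 step: f = ((c ^ d) & b) ^ d is the "choice" function,
     * wk is RCONST + W[n].  The generator rotates which register plays
     * a..e between steps instead of moving the values around. */
    static void rd1_step(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                         uint32_t *e, uint32_t wk)
    {
        *e += wk;                      /* addl -64+4*n(%rsp) or leal RCONST(%re,%rn) */
        *e += ((c ^ d) & *b) ^ d;      /* choice(b,c,d) */
        *e += rol32(a, 5);
        *b = rol32(*b, 30);            /* rorl $2 */
    }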
@@ -138,28 +285,11 @@ local n2=$(((n+2) & 15)) | |||
138 | local n0=$(((n+0) & 15)) | 285 | local n0=$(((n+0) & 15)) |
139 | echo " | 286 | echo " |
140 | # $n | 287 | # $n |
141 | ";test $n0 -lt 8 && echo " | ||
142 | movl `W32 $n13`, %esi # W[(n+13) & 15] | ||
143 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] | ||
144 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] | ||
145 | xorl `W32 $n0`, %esi # ^W[n & 15] | ||
146 | roll %esi # | ||
147 | movl %esi, `W32 $n0` # store to W[n & 15] | ||
148 | ";test $n0 -ge 8 && echo " | ||
149 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] | ||
150 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] | ||
151 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] | ||
152 | roll `W32 $n0` # | ||
153 | ";echo " | ||
154 | movl %e$c, %edi # c | 288 | movl %e$c, %edi # c |
155 | xorl %e$d, %edi # ^d | 289 | xorl %e$d, %edi # ^d |
156 | andl %e$b, %edi # &b | 290 | andl %e$b, %edi # &b |
157 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) | 291 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) |
158 | ";test $n0 -lt 8 && echo " | 292 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] |
159 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] | ||
160 | ";test $n0 -ge 8 && echo " | ||
161 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] | ||
162 | ";echo " | ||
163 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) | 293 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) |
164 | movl %e$a, %esi # | 294 | movl %e$a, %esi # |
165 | roll \$5, %esi # rotl32(a,5) | 295 | roll \$5, %esi # rotl32(a,5) |
@@ -167,13 +297,6 @@ echo " | |||
167 | rorl \$2, %e$b # b = rotl32(b,30) | 297 | rorl \$2, %e$b # b = rotl32(b,30) |
168 | " | 298 | " |
169 | } | 299 | } |
170 | { | ||
171 | RCONST=0x5A827999 | ||
172 | RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; RD1A bx cx dx bp ax 4 | ||
173 | RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9 | ||
174 | RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14 | ||
175 | RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19 | ||
176 | } | grep -v '^$' | ||
177 | 300 | ||
178 | RD2() { | 301 | RD2() { |
179 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 | 302 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
@@ -184,27 +307,10 @@ local n2=$(((n+2) & 15)) | |||
184 | local n0=$(((n+0) & 15)) | 307 | local n0=$(((n+0) & 15)) |
185 | echo " | 308 | echo " |
186 | # $n | 309 | # $n |
187 | ";test $n0 -lt 8 && echo " | ||
188 | movl `W32 $n13`, %esi # W[(n+13) & 15] | ||
189 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] | ||
190 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] | ||
191 | xorl `W32 $n0`, %esi # ^W[n & 15] | ||
192 | roll %esi # | ||
193 | movl %esi, `W32 $n0` # store to W[n & 15] | ||
194 | ";test $n0 -ge 8 && echo " | ||
195 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] | ||
196 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] | ||
197 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] | ||
198 | roll `W32 $n0` # | ||
199 | ";echo " | ||
200 | movl %e$c, %edi # c | 310 | movl %e$c, %edi # c |
201 | xorl %e$d, %edi # ^d | 311 | xorl %e$d, %edi # ^d |
202 | xorl %e$b, %edi # ^b | 312 | xorl %e$b, %edi # ^b |
203 | ";test $n0 -lt 8 && echo " | 313 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] |
204 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] | ||
205 | ";test $n0 -ge 8 && echo " | ||
206 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] | ||
207 | ";echo " | ||
208 | addl %edi, %e$e # e += (c ^ d ^ b) | 314 | addl %edi, %e$e # e += (c ^ d ^ b) |
209 | movl %e$a, %esi # | 315 | movl %e$a, %esi # |
210 | roll \$5, %esi # rotl32(a,5) | 316 | roll \$5, %esi # rotl32(a,5) |
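RD2 differs from the round-1 step only in the boolean function: rounds 2 and 4 use the parity of b, c, d. A matching sketch, same conventions as the rd1_step sketch above (illustrative names):

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, unsigned k)
    {
        return (x << k) | (x >> (32 - k));
    }

    /* Rounds 20..39 and 60..79: f = b ^ c ^ d; wk is the pre-added RCONST + W[n & 15]. */
    static void rd2_step(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                         uint32_t *e, uint32_t wk)
    {
        *e += wk + (*b ^ c ^ d) + rol32(a, 5);
        *b = rol32(*b, 30);
    }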
@@ -212,13 +318,6 @@ echo " | |||
212 | rorl \$2, %e$b # b = rotl32(b,30) | 318 | rorl \$2, %e$b # b = rotl32(b,30) |
213 | " | 319 | " |
214 | } | 320 | } |
215 | { | ||
216 | RCONST=0x6ED9EBA1 | ||
217 | RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24 | ||
218 | RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29 | ||
219 | RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34 | ||
220 | RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39 | ||
221 | } | grep -v '^$' | ||
222 | 321 | ||
223 | RD3() { | 322 | RD3() { |
224 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 | 323 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
@@ -235,53 +334,82 @@ echo " | |||
235 | andl %e$c, %esi # si: b & c | 334 | andl %e$c, %esi # si: b & c |
236 | andl %e$d, %edi # di: (b | c) & d | 335 | andl %e$d, %edi # di: (b | c) & d |
237 | orl %esi, %edi # ((b | c) & d) | (b & c) | 336 | orl %esi, %edi # ((b | c) & d) | (b & c) |
238 | ";test $n0 -lt 8 && echo " | ||
239 | movl `W32 $n13`, %esi # W[(n+13) & 15] | ||
240 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] | ||
241 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] | ||
242 | xorl `W32 $n0`, %esi # ^W[n & 15] | ||
243 | roll %esi # | ||
244 | movl %esi, `W32 $n0` # store to W[n & 15] | ||
245 | ";test $n0 -ge 8 && echo " | ||
246 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] | ||
247 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] | ||
248 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] | ||
249 | roll `W32 $n0` # | ||
250 | ";echo " | ||
251 | addl %edi, %e$e # += ((b | c) & d) | (b & c) | 337 | addl %edi, %e$e # += ((b | c) & d) | (b & c) |
252 | ";test $n0 -lt 8 && echo " | 338 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] |
253 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] | ||
254 | ";test $n0 -ge 8 && echo " | ||
255 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] | ||
256 | ";echo " | ||
257 | movl %e$a, %esi # | 339 | movl %e$a, %esi # |
258 | roll \$5, %esi # rotl32(a,5) | 340 | roll \$5, %esi # rotl32(a,5) |
259 | addl %esi, %e$e # e += rotl32(a,5) | 341 | addl %esi, %e$e # e += rotl32(a,5) |
260 | rorl \$2, %e$b # b = rotl32(b,30) | 342 | rorl \$2, %e$b # b = rotl32(b,30) |
261 | " | 343 | " |
262 | } | 344 | } |
345 | |||
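RD3 (rounds 40..59) uses the majority function, written here as ((b | c) & d) | (b & c), which equals the textbook (b & c) | (b & d) | (c & d) with one operation saved. Note also that the driver below can spell RCONST as the positive 0x8F1BBCDC again, presumably because the constant now reaches these rounds through the pre-added stack values rather than a signed leal displacement. A sketch in the same style as the earlier round-step sketches (illustrative names):

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, unsigned k)
    {
        return (x << k) | (x >> (32 - k));
    }

    /* Rounds 40..59: f = MAJ(b,c,d); wk is the pre-added RCONST + W[n & 15]. */
    static void rd3_step(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                         uint32_t *e, uint32_t wk)
    {
        uint32_t maj = ((*b | c) & d) | (*b & c);
        *e += wk + maj + rol32(a, 5);
        *b = rol32(*b, 30);
    }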
263 | { | 346 | { |
264 | #RCONST=0x8F1BBCDC "out of range for signed 32bit displacement" | 347 | # Round 1 |
265 | RCONST=-0x70E44324 | 348 | RCONST=0x5A827999 |
266 | RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44 | 349 | RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; |
267 | RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49 | 350 | RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; |
268 | RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54 | 351 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` |
269 | RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59 | 352 | b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` |
270 | } | grep -v '^$' | 353 | INTERLEAVE "$a" "$b" |
354 | a=`echo " movaps rconst0x6ED9EBA1(%rip), $xmmRCONST" | ||
355 | PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` | ||
356 | b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;` | ||
357 | INTERLEAVE "$a" "$b" | ||
358 | a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` | ||
359 | b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` | ||
360 | INTERLEAVE "$a" "$b" | ||
361 | |||
362 | # Round 2 | ||
363 | RCONST=0x6ED9EBA1 | ||
364 | a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` | ||
365 | b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` | ||
366 | INTERLEAVE "$a" "$b" | ||
367 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` | ||
368 | b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` | ||
369 | INTERLEAVE "$a" "$b" | ||
370 | a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` | ||
371 | b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` | ||
372 | INTERLEAVE "$a" "$b" | ||
373 | a=`echo " movaps rconst0x8F1BBCDC(%rip), $xmmRCONST" | ||
374 | PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` | ||
375 | b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` | ||
376 | INTERLEAVE "$a" "$b" | ||
377 | a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` | ||
378 | b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` | ||
379 | INTERLEAVE "$a" "$b" | ||
380 | |||
381 | # Round 3 | ||
382 | RCONST=0x8F1BBCDC | ||
383 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` | ||
384 | b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` | ||
385 | INTERLEAVE "$a" "$b" | ||
386 | a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` | ||
387 | b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` | ||
388 | INTERLEAVE "$a" "$b" | ||
389 | a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` | ||
390 | b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` | ||
391 | INTERLEAVE "$a" "$b" | ||
392 | a=`echo " movaps rconst0xCA62C1D6(%rip), $xmmRCONST" | ||
393 | PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` | ||
394 | b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` | ||
395 | INTERLEAVE "$a" "$b" | ||
396 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` | ||
397 | b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` | ||
398 | INTERLEAVE "$a" "$b" | ||
271 | 399 | ||
272 | # Round 4 has the same logic as round 2, only n and RCONST are different | 400 | # Round 4 has the same logic as round 2, only n and RCONST are different |
273 | { | 401 | RCONST=0xCA62C1D6 |
274 | #RCONST=0xCA62C1D6 "out of range for signed 32bit displacement" | 402 | a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` |
275 | RCONST=-0x359D3E2A | 403 | b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` |
276 | RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64 | 404 | INTERLEAVE "$a" "$b" |
277 | RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69 | 405 | a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` |
278 | RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74 | 406 | b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` |
279 | RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79 | 407 | INTERLEAVE "$a" "$b" |
280 | # Note: new W[n&15] values generated in last 3 iterations | 408 | a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` |
281 | # (W[13,14,15]) are unused after each of these iterations. | 409 | b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` |
282 | # Since we use r8..r15 for W[8..15], this does not matter. | 410 | INTERLEAVE "$a" "$b" |
283 | # If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15] | 411 | RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; |
284 | # (the "movl %esi, `W32 $n0`" insn) is a dead store and can be removed. | 412 | RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; |
285 | } | grep -v '^$' | 413 | } | grep -v '^$' |
286 | 414 | ||
287 | echo " | 415 | echo " |
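The driver block above fully unrolls the 80 steps, rotating which register holds a..e, and pairs each PREP with four integer rounds; INTERLEAVE (defined elsewhere in this script) presumably merges the two instruction streams so the vector schedule work and the scalar rounds overlap. For comparison, a compact scalar reference of the whole computation (standard SHA-1, not code from the patch):

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, unsigned k)
    {
        return (x << k) | (x >> (32 - k));
    }

    /* Reference form of what the generated sha1_process_block64 computes:
     * 80 rounds over a 16-word circular schedule, then add into the state.
     * W[] must already contain the byte-swapped 64-byte block. */
    static void sha1_block_ref(uint32_t h[5], uint32_t W[16])
    {
        uint32_t a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];
        for (unsigned n = 0; n < 80; n++) {
            uint32_t f, K;
            if (n < 20)      { f = ((c ^ d) & b) ^ d;       K = 0x5A827999; }
            else if (n < 40) { f = b ^ c ^ d;               K = 0x6ED9EBA1; }
            else if (n < 60) { f = ((b | c) & d) | (b & c); K = 0x8F1BBCDC; }
            else             { f = b ^ c ^ d;               K = 0xCA62C1D6; }
            if (n >= 16)     /* done four lanes at a time by PREP() in the asm */
                W[n & 15] = rol32(W[(n + 13) & 15] ^ W[(n + 8) & 15]
                                ^ W[(n + 2) & 15] ^ W[n & 15], 1);
            uint32_t tmp = rol32(a, 5) + f + e + K + W[n & 15];
            e = d; d = c; c = rol32(b, 30); b = a; a = tmp;
        }
        h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
    }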
@@ -300,4 +428,28 @@ echo " | |||
300 | 428 | ||
301 | ret | 429 | ret |
302 | .size sha1_process_block64, .-sha1_process_block64 | 430 | .size sha1_process_block64, .-sha1_process_block64 |
431 | |||
432 | .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 | ||
433 | .align 16 | ||
434 | rconst0x5A827999: | ||
435 | .long 0x5A827999 | ||
436 | .long 0x5A827999 | ||
437 | .long 0x5A827999 | ||
438 | .long 0x5A827999 | ||
439 | rconst0x6ED9EBA1: | ||
440 | .long 0x6ED9EBA1 | ||
441 | .long 0x6ED9EBA1 | ||
442 | .long 0x6ED9EBA1 | ||
443 | .long 0x6ED9EBA1 | ||
444 | rconst0x8F1BBCDC: | ||
445 | .long 0x8F1BBCDC | ||
446 | .long 0x8F1BBCDC | ||
447 | .long 0x8F1BBCDC | ||
448 | .long 0x8F1BBCDC | ||
449 | rconst0xCA62C1D6: | ||
450 | .long 0xCA62C1D6 | ||
451 | .long 0xCA62C1D6 | ||
452 | .long 0xCA62C1D6 | ||
453 | .long 0xCA62C1D6 | ||
454 | |||
303 | #endif" | 455 | #endif" |