Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S')
 libbb/hash_md5_sha_x86-64.S | 992
 1 file changed, 558 insertions(+), 434 deletions(-)
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 87fb616a1..069a18719 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -20,16 +20,10 @@ sha1_process_block64:
20 | # eax..edx: a..d | 20 | # eax..edx: a..d |
21 | # ebp: e | 21 | # ebp: e |
22 | # esi,edi: temps | 22 | # esi,edi: temps |
23 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] | 23 | # xmm0..xmm3: W[] |
24 | # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) | 24 | # xmm4,xmm5: temps |
25 | movl $3, %eax | 25 | # xmm6: current round constant |
26 | 1: | 26 | # -64(%rsp): area for passing RCONST + W[] from vector to integer units |
27 | movq (%rdi,%rax,8), %rsi | ||
28 | bswapq %rsi | ||
29 | rolq $32, %rsi | ||
30 | movq %rsi, -32(%rsp,%rax,8) | ||
31 | decl %eax | ||
32 | jns 1b | ||
33 | 27 | ||
34 | movl 80(%rdi), %eax # a = ctx->hash[0] | 28 | movl 80(%rdi), %eax # a = ctx->hash[0] |
35 | movl 84(%rdi), %ebx # b = ctx->hash[1] | 29 | movl 84(%rdi), %ebx # b = ctx->hash[1] |
@@ -37,587 +31,709 @@ sha1_process_block64:
37 | movl 92(%rdi), %edx # d = ctx->hash[3] | 31 | movl 92(%rdi), %edx # d = ctx->hash[3] |
38 | movl 96(%rdi), %ebp # e = ctx->hash[4] | 32 | movl 96(%rdi), %ebp # e = ctx->hash[4] |
39 | 33 | ||
34 | movaps rconst0x5A827999(%rip), %xmm6 | ||
35 | |||
36 | # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 | ||
37 | # instead of spilling them to stack. | ||
38 | # (We lose parallelized addition of RCONST, but LEA | ||
39 | # can do two additions at once, so...) | ||
40 | movq 4*0(%rdi), %rsi | ||
41 | movq 4*2(%rdi), %r10 | ||
42 | bswapq %rsi | ||
43 | bswapq %r10 | ||
44 | rolq $32, %rsi # rsi = W[1]:W[0] | ||
45 | rolq $32, %r10 | ||
46 | movq %rsi, %xmm0 | ||
47 | movq %r10, %xmm4 | ||
48 | punpcklqdq %xmm4, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) | ||
49 | movaps %xmm0, %xmm4 | ||
50 | paddd %xmm6, %xmm4 | ||
51 | movups %xmm4, -64+4*0(%rsp) | ||
52 | |||
53 | movq 4*4(%rdi), %r8 | ||
54 | movq 4*6(%rdi), %r10 | ||
55 | bswapq %r8 | ||
56 | bswapq %r10 | ||
57 | rolq $32, %r8 | ||
58 | rolq $32, %r10 | ||
59 | movq %r8, %xmm1 | ||
60 | movq %r10, %xmm4 | ||
61 | punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) | ||
62 | movaps %xmm1, %xmm4 | ||
63 | paddd %xmm6, %xmm4 | ||
64 | movups %xmm4, -64+4*4(%rsp) | ||
65 | |||
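(Annotation, not part of the patch: the load blocks above convert two big-endian W[] words at a time with one 64-bit bswap. A minimal C sketch of that trick, assuming a GCC-style __builtin_bswap64; load_w_pair is a hypothetical helper name:

#include <stdint.h>
#include <string.h>

static uint64_t load_w_pair(const uint8_t *p)   /* hypothetical helper */
{
        uint64_t v;
        memcpy(&v, p, 8);              /* two consecutive big-endian 32-bit words */
        v = __builtin_bswap64(v);      /* bswapq: reverses all 8 bytes, which also
                                          exchanges the two 32-bit halves */
        return (v << 32) | (v >> 32);  /* rolq $32: W[n] back in the low half,
                                          W[n+1] in the high half */
}
)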
40 | movq 4*8(%rdi), %r8 | 66 | movq 4*8(%rdi), %r8 |
41 | movq 4*10(%rdi), %r10 | 67 | movq 4*10(%rdi), %r10 |
42 | bswapq %r8 | 68 | bswapq %r8 |
43 | bswapq %r10 | 69 | bswapq %r10 |
70 | movl %r8d, %r9d # r9d = W[9] | ||
71 | rolq $32, %r8 # r8 = W[9]:W[8] | ||
72 | movl %r10d, %r11d # r11d = W[11] | ||
73 | rolq $32, %r10 # r10 = W[11]:W[10] | ||
74 | movq %r8, %xmm2 | ||
75 | movq %r10, %xmm4 | ||
76 | punpcklqdq %xmm4, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) | ||
77 | |||
44 | movq 4*12(%rdi), %r12 | 78 | movq 4*12(%rdi), %r12 |
45 | movq 4*14(%rdi), %r14 | 79 | movq 4*14(%rdi), %r14 |
46 | bswapq %r12 | 80 | bswapq %r12 |
47 | bswapq %r14 | 81 | bswapq %r14 |
48 | movl %r8d, %r9d | 82 | movl %r12d, %r13d # r13d = W[13] |
49 | shrq $32, %r8 | 83 | rolq $32, %r12 # r12 = W[13]:W[12] |
50 | movl %r10d, %r11d | 84 | movl %r14d, %r15d # r15d = W[15] |
51 | shrq $32, %r10 | 85 | rolq $32, %r14 # r14 = W[15]:W[14] |
52 | movl %r12d, %r13d | 86 | movq %r12, %xmm3 |
53 | shrq $32, %r12 | 87 | movq %r14, %xmm4 |
54 | movl %r14d, %r15d | 88 | punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) |
55 | shrq $32, %r14 | ||
56 | 89 | ||
57 | # 0 | 90 | # 0 |
58 | # W[0], already in %esi | 91 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] |
59 | movl %ecx, %edi # c | 92 | movl %ecx, %edi # c |
60 | xorl %edx, %edi # ^d | 93 | xorl %edx, %edi # ^d |
61 | andl %ebx, %edi # &b | 94 | andl %ebx, %edi # &b |
62 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 95 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
63 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] | ||
64 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 96 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
65 | movl %eax, %esi # | 97 | movl %eax, %esi # |
66 | roll $5, %esi # rotl32(a,5) | 98 | roll $5, %esi # rotl32(a,5) |
67 | addl %esi, %ebp # e += rotl32(a,5) | 99 | addl %esi, %ebp # e += rotl32(a,5) |
68 | rorl $2, %ebx # b = rotl32(b,30) | 100 | rorl $2, %ebx # b = rotl32(b,30) |
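(Annotation, not part of the patch: each of rounds 0..19 above performs the step sketched below; rnd_ch is a hypothetical name. It shows that (((c ^ d) & b) ^ d) is the branchless "choice" function and that rorl $2 is rotl32(b,30):

#include <stdint.h>
#define ROTL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))

static void rnd_ch(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                   uint32_t *e, uint32_t rconst_plus_w)   /* hypothetical */
{
        *e += rconst_plus_w;            /* leal K(e,W) / addl -64+4*n(%rsp) */
        *e += (((c ^ d) & *b) ^ d);     /* F(b,c,d): b ? c : d, branchless  */
        *e += ROTL32(a, 5);
        *b = ROTL32(*b, 30);            /* rorl $2 == rotl 30               */
}
)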
69 | # 1 | 101 | # 1 |
70 | movl -32+4*1(%rsp), %esi # W[n] | 102 | addl -64+4*1(%rsp), %edx # e += RCONST + W[n] |
71 | movl %ebx, %edi # c | 103 | movl %ebx, %edi # c |
72 | xorl %ecx, %edi # ^d | 104 | xorl %ecx, %edi # ^d |
73 | andl %eax, %edi # &b | 105 | andl %eax, %edi # &b |
74 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 106 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
75 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] | ||
76 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 107 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
77 | movl %ebp, %esi # | 108 | movl %ebp, %esi # |
78 | roll $5, %esi # rotl32(a,5) | 109 | roll $5, %esi # rotl32(a,5) |
79 | addl %esi, %edx # e += rotl32(a,5) | 110 | addl %esi, %edx # e += rotl32(a,5) |
80 | rorl $2, %eax # b = rotl32(b,30) | 111 | rorl $2, %eax # b = rotl32(b,30) |
81 | # 2 | 112 | # 2 |
82 | movl -32+4*2(%rsp), %esi # W[n] | 113 | addl -64+4*2(%rsp), %ecx # e += RCONST + W[n] |
83 | movl %eax, %edi # c | 114 | movl %eax, %edi # c |
84 | xorl %ebx, %edi # ^d | 115 | xorl %ebx, %edi # ^d |
85 | andl %ebp, %edi # &b | 116 | andl %ebp, %edi # &b |
86 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 117 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
87 | leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] | ||
88 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 118 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
89 | movl %edx, %esi # | 119 | movl %edx, %esi # |
90 | roll $5, %esi # rotl32(a,5) | 120 | roll $5, %esi # rotl32(a,5) |
91 | addl %esi, %ecx # e += rotl32(a,5) | 121 | addl %esi, %ecx # e += rotl32(a,5) |
92 | rorl $2, %ebp # b = rotl32(b,30) | 122 | rorl $2, %ebp # b = rotl32(b,30) |
93 | # 3 | 123 | # 3 |
94 | movl -32+4*3(%rsp), %esi # W[n] | 124 | addl -64+4*3(%rsp), %ebx # e += RCONST + W[n] |
95 | movl %ebp, %edi # c | 125 | movl %ebp, %edi # c |
96 | xorl %eax, %edi # ^d | 126 | xorl %eax, %edi # ^d |
97 | andl %edx, %edi # &b | 127 | andl %edx, %edi # &b |
98 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 128 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
99 | leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n] | ||
100 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 129 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
101 | movl %ecx, %esi # | 130 | movl %ecx, %esi # |
102 | roll $5, %esi # rotl32(a,5) | 131 | roll $5, %esi # rotl32(a,5) |
103 | addl %esi, %ebx # e += rotl32(a,5) | 132 | addl %esi, %ebx # e += rotl32(a,5) |
104 | rorl $2, %edx # b = rotl32(b,30) | 133 | rorl $2, %edx # b = rotl32(b,30) |
105 | # 4 | 134 | # 4 |
106 | movl -32+4*4(%rsp), %esi # W[n] | 135 | addl -64+4*4(%rsp), %eax # e += RCONST + W[n] |
107 | movl %edx, %edi # c | 136 | movl %edx, %edi # c |
108 | xorl %ebp, %edi # ^d | 137 | xorl %ebp, %edi # ^d |
109 | andl %ecx, %edi # &b | 138 | andl %ecx, %edi # &b |
110 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 139 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
111 | leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n] | ||
112 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 140 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
113 | movl %ebx, %esi # | 141 | movl %ebx, %esi # |
114 | roll $5, %esi # rotl32(a,5) | 142 | roll $5, %esi # rotl32(a,5) |
115 | addl %esi, %eax # e += rotl32(a,5) | 143 | addl %esi, %eax # e += rotl32(a,5) |
116 | rorl $2, %ecx # b = rotl32(b,30) | 144 | rorl $2, %ecx # b = rotl32(b,30) |
117 | # 5 | 145 | # 5 |
118 | movl -32+4*5(%rsp), %esi # W[n] | 146 | addl -64+4*5(%rsp), %ebp # e += RCONST + W[n] |
119 | movl %ecx, %edi # c | 147 | movl %ecx, %edi # c |
120 | xorl %edx, %edi # ^d | 148 | xorl %edx, %edi # ^d |
121 | andl %ebx, %edi # &b | 149 | andl %ebx, %edi # &b |
122 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 150 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
123 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] | ||
124 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 151 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
125 | movl %eax, %esi # | 152 | movl %eax, %esi # |
126 | roll $5, %esi # rotl32(a,5) | 153 | roll $5, %esi # rotl32(a,5) |
127 | addl %esi, %ebp # e += rotl32(a,5) | 154 | addl %esi, %ebp # e += rotl32(a,5) |
128 | rorl $2, %ebx # b = rotl32(b,30) | 155 | rorl $2, %ebx # b = rotl32(b,30) |
129 | # 6 | 156 | # 6 |
130 | movl -32+4*6(%rsp), %esi # W[n] | 157 | addl -64+4*6(%rsp), %edx # e += RCONST + W[n] |
131 | movl %ebx, %edi # c | 158 | movl %ebx, %edi # c |
132 | xorl %ecx, %edi # ^d | 159 | xorl %ecx, %edi # ^d |
133 | andl %eax, %edi # &b | 160 | andl %eax, %edi # &b |
134 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 161 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
135 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] | ||
136 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 162 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
137 | movl %ebp, %esi # | 163 | movl %ebp, %esi # |
138 | roll $5, %esi # rotl32(a,5) | 164 | roll $5, %esi # rotl32(a,5) |
139 | addl %esi, %edx # e += rotl32(a,5) | 165 | addl %esi, %edx # e += rotl32(a,5) |
140 | rorl $2, %eax # b = rotl32(b,30) | 166 | rorl $2, %eax # b = rotl32(b,30) |
141 | # 7 | 167 | # 7 |
142 | movl -32+4*7(%rsp), %esi # W[n] | 168 | addl -64+4*7(%rsp), %ecx # e += RCONST + W[n] |
143 | movl %eax, %edi # c | 169 | movl %eax, %edi # c |
144 | xorl %ebx, %edi # ^d | 170 | xorl %ebx, %edi # ^d |
145 | andl %ebp, %edi # &b | 171 | andl %ebp, %edi # &b |
146 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 172 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
147 | leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] | ||
148 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 173 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
149 | movl %edx, %esi # | 174 | movl %edx, %esi # |
150 | roll $5, %esi # rotl32(a,5) | 175 | roll $5, %esi # rotl32(a,5) |
151 | addl %esi, %ecx # e += rotl32(a,5) | 176 | addl %esi, %ecx # e += rotl32(a,5) |
152 | rorl $2, %ebp # b = rotl32(b,30) | 177 | rorl $2, %ebp # b = rotl32(b,30) |
178 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
179 | movaps %xmm3, %xmm4 | ||
180 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
181 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
182 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
183 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
184 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
185 | xorps %xmm5, %xmm0 # ^ | ||
186 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
187 | movaps %xmm0, %xmm5 | ||
188 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
189 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
190 | paddd %xmm0, %xmm0 # shift left by 1 | ||
191 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
192 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
193 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
194 | movaps %xmm5, %xmm4 | ||
195 | pslld $2, %xmm5 | ||
196 | psrld $30, %xmm4 | ||
197 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
198 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
199 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
200 | movaps %xmm0, %xmm5 | ||
201 | paddd %xmm6, %xmm5 | ||
202 | movups %xmm5, -64+16*0(%rsp) | ||
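(Annotation, not part of the patch: each PREP block above produces four schedule words at once, W[t..t+3] = rol(W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3], 1), with lane 3 fixed up afterwards because W[t+3] depends on the W[t] just produced in lane 0, and with the round constant pre-added (paddd %xmm6) before the store to the -64(%rsp) staging area. A scalar sketch of the same computation; prep4 is a hypothetical name and ROTL32 is the macro from the earlier sketch:

static void prep4(uint32_t W[16], unsigned t, uint32_t rconst,
                  uint32_t out[4])   /* hypothetical */
{
        for (unsigned i = 0; i < 4; i++) {
                uint32_t x = W[(t+i-16) & 15] ^ W[(t+i-14) & 15]
                           ^ W[(t+i-8) & 15]  ^ W[(t+i-3) & 15];
                /* the asm does this rol by 1 as x+x plus the sign bit,
                 * via pcmpgtd/paddd/psubd: SSE2 has no vector rotate */
                W[(t+i) & 15] = ROTL32(x, 1);  /* i==3 reads the W[t] just written */
                out[i] = W[(t+i) & 15] + rconst;
        }
}
)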
153 | # 8 | 203 | # 8 |
154 | # W[n], in %r8 | 204 | leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] |
155 | movl %ebp, %edi # c | 205 | movl %ebp, %edi # c |
156 | xorl %eax, %edi # ^d | 206 | xorl %eax, %edi # ^d |
157 | andl %edx, %edi # &b | 207 | andl %edx, %edi # &b |
158 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 208 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
159 | leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] | ||
160 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 209 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
161 | movl %ecx, %esi # | 210 | movl %ecx, %esi # |
162 | roll $5, %esi # rotl32(a,5) | 211 | roll $5, %esi # rotl32(a,5) |
163 | addl %esi, %ebx # e += rotl32(a,5) | 212 | addl %esi, %ebx # e += rotl32(a,5) |
164 | rorl $2, %edx # b = rotl32(b,30) | 213 | rorl $2, %edx # b = rotl32(b,30) |
165 | # 9 | 214 | # 9 |
166 | # W[n], in %r9 | 215 | leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] |
167 | movl %edx, %edi # c | 216 | movl %edx, %edi # c |
168 | xorl %ebp, %edi # ^d | 217 | xorl %ebp, %edi # ^d |
169 | andl %ecx, %edi # &b | 218 | andl %ecx, %edi # &b |
170 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 219 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
171 | leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] | ||
172 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 220 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
173 | movl %ebx, %esi # | 221 | movl %ebx, %esi # |
174 | roll $5, %esi # rotl32(a,5) | 222 | roll $5, %esi # rotl32(a,5) |
175 | addl %esi, %eax # e += rotl32(a,5) | 223 | addl %esi, %eax # e += rotl32(a,5) |
176 | rorl $2, %ecx # b = rotl32(b,30) | 224 | rorl $2, %ecx # b = rotl32(b,30) |
177 | # 10 | 225 | # 10 |
178 | # W[n], in %r10 | 226 | leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] |
179 | movl %ecx, %edi # c | 227 | movl %ecx, %edi # c |
180 | xorl %edx, %edi # ^d | 228 | xorl %edx, %edi # ^d |
181 | andl %ebx, %edi # &b | 229 | andl %ebx, %edi # &b |
182 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 230 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
183 | leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] | ||
184 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 231 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
185 | movl %eax, %esi # | 232 | movl %eax, %esi # |
186 | roll $5, %esi # rotl32(a,5) | 233 | roll $5, %esi # rotl32(a,5) |
187 | addl %esi, %ebp # e += rotl32(a,5) | 234 | addl %esi, %ebp # e += rotl32(a,5) |
188 | rorl $2, %ebx # b = rotl32(b,30) | 235 | rorl $2, %ebx # b = rotl32(b,30) |
189 | # 11 | 236 | # 11 |
190 | # W[n], in %r11 | 237 | leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] |
191 | movl %ebx, %edi # c | 238 | movl %ebx, %edi # c |
192 | xorl %ecx, %edi # ^d | 239 | xorl %ecx, %edi # ^d |
193 | andl %eax, %edi # &b | 240 | andl %eax, %edi # &b |
194 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 241 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
195 | leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] | ||
196 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 242 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
197 | movl %ebp, %esi # | 243 | movl %ebp, %esi # |
198 | roll $5, %esi # rotl32(a,5) | 244 | roll $5, %esi # rotl32(a,5) |
199 | addl %esi, %edx # e += rotl32(a,5) | 245 | addl %esi, %edx # e += rotl32(a,5) |
200 | rorl $2, %eax # b = rotl32(b,30) | 246 | rorl $2, %eax # b = rotl32(b,30) |
247 | movaps rconst0x6ED9EBA1(%rip), %xmm6 | ||

248 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
249 | movaps %xmm0, %xmm4 | ||
250 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
251 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
252 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
253 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
254 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
255 | xorps %xmm5, %xmm1 # ^ | ||
256 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
257 | movaps %xmm1, %xmm5 | ||
258 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
259 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
260 | paddd %xmm1, %xmm1 # shift left by 1 | ||
261 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
262 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
263 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
264 | movaps %xmm5, %xmm4 | ||
265 | pslld $2, %xmm5 | ||
266 | psrld $30, %xmm4 | ||
267 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
268 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
269 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
270 | movaps %xmm1, %xmm5 | ||
271 | paddd %xmm6, %xmm5 | ||
272 | movups %xmm5, -64+16*1(%rsp) | ||
201 | # 12 | 273 | # 12 |
202 | # W[n], in %r12 | 274 | leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] |
203 | movl %eax, %edi # c | 275 | movl %eax, %edi # c |
204 | xorl %ebx, %edi # ^d | 276 | xorl %ebx, %edi # ^d |
205 | andl %ebp, %edi # &b | 277 | andl %ebp, %edi # &b |
206 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 278 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
207 | leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] | ||
208 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 279 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
209 | movl %edx, %esi # | 280 | movl %edx, %esi # |
210 | roll $5, %esi # rotl32(a,5) | 281 | roll $5, %esi # rotl32(a,5) |
211 | addl %esi, %ecx # e += rotl32(a,5) | 282 | addl %esi, %ecx # e += rotl32(a,5) |
212 | rorl $2, %ebp # b = rotl32(b,30) | 283 | rorl $2, %ebp # b = rotl32(b,30) |
213 | # 13 | 284 | # 13 |
214 | # W[n], in %r13 | 285 | leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] |
215 | movl %ebp, %edi # c | 286 | movl %ebp, %edi # c |
216 | xorl %eax, %edi # ^d | 287 | xorl %eax, %edi # ^d |
217 | andl %edx, %edi # &b | 288 | andl %edx, %edi # &b |
218 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 289 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
219 | leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] | ||
220 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 290 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
221 | movl %ecx, %esi # | 291 | movl %ecx, %esi # |
222 | roll $5, %esi # rotl32(a,5) | 292 | roll $5, %esi # rotl32(a,5) |
223 | addl %esi, %ebx # e += rotl32(a,5) | 293 | addl %esi, %ebx # e += rotl32(a,5) |
224 | rorl $2, %edx # b = rotl32(b,30) | 294 | rorl $2, %edx # b = rotl32(b,30) |
225 | # 14 | 295 | # 14 |
226 | # W[n], in %r14 | 296 | leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] |
227 | movl %edx, %edi # c | 297 | movl %edx, %edi # c |
228 | xorl %ebp, %edi # ^d | 298 | xorl %ebp, %edi # ^d |
229 | andl %ecx, %edi # &b | 299 | andl %ecx, %edi # &b |
230 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 300 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
231 | leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] | ||
232 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 301 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
233 | movl %ebx, %esi # | 302 | movl %ebx, %esi # |
234 | roll $5, %esi # rotl32(a,5) | 303 | roll $5, %esi # rotl32(a,5) |
235 | addl %esi, %eax # e += rotl32(a,5) | 304 | addl %esi, %eax # e += rotl32(a,5) |
236 | rorl $2, %ecx # b = rotl32(b,30) | 305 | rorl $2, %ecx # b = rotl32(b,30) |
237 | # 15 | 306 | # 15 |
238 | # W[n], in %r15 | 307 | leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] |
239 | movl %ecx, %edi # c | 308 | movl %ecx, %edi # c |
240 | xorl %edx, %edi # ^d | 309 | xorl %edx, %edi # ^d |
241 | andl %ebx, %edi # &b | 310 | andl %ebx, %edi # &b |
242 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 311 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
243 | leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] | ||
244 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 312 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
245 | movl %eax, %esi # | 313 | movl %eax, %esi # |
246 | roll $5, %esi # rotl32(a,5) | 314 | roll $5, %esi # rotl32(a,5) |
247 | addl %esi, %ebp # e += rotl32(a,5) | 315 | addl %esi, %ebp # e += rotl32(a,5) |
248 | rorl $2, %ebx # b = rotl32(b,30) | 316 | rorl $2, %ebx # b = rotl32(b,30) |
317 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
318 | movaps %xmm1, %xmm4 | ||
319 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
320 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
321 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
322 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
323 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
324 | xorps %xmm5, %xmm2 # ^ | ||
325 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
326 | movaps %xmm2, %xmm5 | ||
327 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
328 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
329 | paddd %xmm2, %xmm2 # shift left by 1 | ||
330 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
331 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
332 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
333 | movaps %xmm5, %xmm4 | ||
334 | pslld $2, %xmm5 | ||
335 | psrld $30, %xmm4 | ||
336 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
337 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
338 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
339 | movaps %xmm2, %xmm5 | ||
340 | paddd %xmm6, %xmm5 | ||
341 | movups %xmm5, -64+16*2(%rsp) | ||
249 | # 16 | 342 | # 16 |
250 | movl %r13d, %esi # W[(n+13) & 15] | ||
251 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
252 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
253 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
254 | roll %esi # | ||
255 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
256 | movl %ebx, %edi # c | 343 | movl %ebx, %edi # c |
257 | xorl %ecx, %edi # ^d | 344 | xorl %ecx, %edi # ^d |
258 | andl %eax, %edi # &b | 345 | andl %eax, %edi # &b |
259 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 346 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
260 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 347 | addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] |
261 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 348 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
262 | movl %ebp, %esi # | 349 | movl %ebp, %esi # |
263 | roll $5, %esi # rotl32(a,5) | 350 | roll $5, %esi # rotl32(a,5) |
264 | addl %esi, %edx # e += rotl32(a,5) | 351 | addl %esi, %edx # e += rotl32(a,5) |
265 | rorl $2, %eax # b = rotl32(b,30) | 352 | rorl $2, %eax # b = rotl32(b,30) |
266 | # 17 | 353 | # 17 |
267 | movl %r14d, %esi # W[(n+13) & 15] | ||
268 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
269 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
270 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
271 | roll %esi # | ||
272 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
273 | movl %eax, %edi # c | 354 | movl %eax, %edi # c |
274 | xorl %ebx, %edi # ^d | 355 | xorl %ebx, %edi # ^d |
275 | andl %ebp, %edi # &b | 356 | andl %ebp, %edi # &b |
276 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 357 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
277 | leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 358 | addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] |
278 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 359 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
279 | movl %edx, %esi # | 360 | movl %edx, %esi # |
280 | roll $5, %esi # rotl32(a,5) | 361 | roll $5, %esi # rotl32(a,5) |
281 | addl %esi, %ecx # e += rotl32(a,5) | 362 | addl %esi, %ecx # e += rotl32(a,5) |
282 | rorl $2, %ebp # b = rotl32(b,30) | 363 | rorl $2, %ebp # b = rotl32(b,30) |
283 | # 18 | 364 | # 18 |
284 | movl %r15d, %esi # W[(n+13) & 15] | ||
285 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
286 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
287 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
288 | roll %esi # | ||
289 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
290 | movl %ebp, %edi # c | 365 | movl %ebp, %edi # c |
291 | xorl %eax, %edi # ^d | 366 | xorl %eax, %edi # ^d |
292 | andl %edx, %edi # &b | 367 | andl %edx, %edi # &b |
293 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 368 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
294 | leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 369 | addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] |
295 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 370 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
296 | movl %ecx, %esi # | 371 | movl %ecx, %esi # |
297 | roll $5, %esi # rotl32(a,5) | 372 | roll $5, %esi # rotl32(a,5) |
298 | addl %esi, %ebx # e += rotl32(a,5) | 373 | addl %esi, %ebx # e += rotl32(a,5) |
299 | rorl $2, %edx # b = rotl32(b,30) | 374 | rorl $2, %edx # b = rotl32(b,30) |
300 | # 19 | 375 | # 19 |
301 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
302 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
303 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
304 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
305 | roll %esi # | ||
306 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
307 | movl %edx, %edi # c | 376 | movl %edx, %edi # c |
308 | xorl %ebp, %edi # ^d | 377 | xorl %ebp, %edi # ^d |
309 | andl %ecx, %edi # &b | 378 | andl %ecx, %edi # &b |
310 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 379 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
311 | leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 380 | addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] |
312 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 381 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
313 | movl %ebx, %esi # | 382 | movl %ebx, %esi # |
314 | roll $5, %esi # rotl32(a,5) | 383 | roll $5, %esi # rotl32(a,5) |
315 | addl %esi, %eax # e += rotl32(a,5) | 384 | addl %esi, %eax # e += rotl32(a,5) |
316 | rorl $2, %ecx # b = rotl32(b,30) | 385 | rorl $2, %ecx # b = rotl32(b,30) |
386 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
387 | movaps %xmm2, %xmm4 | ||
388 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
389 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
390 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
391 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
392 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
393 | xorps %xmm5, %xmm3 # ^ | ||
394 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
395 | movaps %xmm3, %xmm5 | ||
396 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
397 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
398 | paddd %xmm3, %xmm3 # shift left by 1 | ||
399 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
400 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
401 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
402 | movaps %xmm5, %xmm4 | ||
403 | pslld $2, %xmm5 | ||
404 | psrld $30, %xmm4 | ||
405 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
406 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
407 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
408 | movaps %xmm3, %xmm5 | ||
409 | paddd %xmm6, %xmm5 | ||
410 | movups %xmm5, -64+16*3(%rsp) | ||
317 | # 20 | 411 | # 20 |
318 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
319 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
320 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
321 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
322 | roll %esi # | ||
323 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
324 | movl %ecx, %edi # c | 412 | movl %ecx, %edi # c |
325 | xorl %edx, %edi # ^d | 413 | xorl %edx, %edi # ^d |
326 | xorl %ebx, %edi # ^b | 414 | xorl %ebx, %edi # ^b |
327 | leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 415 | addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] |
328 | addl %edi, %ebp # e += (c ^ d ^ b) | 416 | addl %edi, %ebp # e += (c ^ d ^ b) |
329 | movl %eax, %esi # | 417 | movl %eax, %esi # |
330 | roll $5, %esi # rotl32(a,5) | 418 | roll $5, %esi # rotl32(a,5) |
331 | addl %esi, %ebp # e += rotl32(a,5) | 419 | addl %esi, %ebp # e += rotl32(a,5) |
332 | rorl $2, %ebx # b = rotl32(b,30) | 420 | rorl $2, %ebx # b = rotl32(b,30) |
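(Annotation, not part of the patch: rounds 20..39 keep the same structure but use the parity function and the 0x6ED9EBA1 constant; a sketch of one step, rnd_par being a hypothetical name:

static void rnd_par(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                    uint32_t *e, uint32_t rconst_plus_w)   /* hypothetical */
{
        *e += (*b ^ c ^ d);             /* F(b,c,d) = parity      */
        *e += rconst_plus_w;            /* K = 0x6ED9EBA1 here    */
        *e += ROTL32(a, 5);
        *b = ROTL32(*b, 30);
}
)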
333 | # 21 | 421 | # 21 |
334 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
335 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
336 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
337 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
338 | roll %esi # | ||
339 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
340 | movl %ebx, %edi # c | 422 | movl %ebx, %edi # c |
341 | xorl %ecx, %edi # ^d | 423 | xorl %ecx, %edi # ^d |
342 | xorl %eax, %edi # ^b | 424 | xorl %eax, %edi # ^b |
343 | leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 425 | addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] |
344 | addl %edi, %edx # e += (c ^ d ^ b) | 426 | addl %edi, %edx # e += (c ^ d ^ b) |
345 | movl %ebp, %esi # | 427 | movl %ebp, %esi # |
346 | roll $5, %esi # rotl32(a,5) | 428 | roll $5, %esi # rotl32(a,5) |
347 | addl %esi, %edx # e += rotl32(a,5) | 429 | addl %esi, %edx # e += rotl32(a,5) |
348 | rorl $2, %eax # b = rotl32(b,30) | 430 | rorl $2, %eax # b = rotl32(b,30) |
349 | # 22 | 431 | # 22 |
350 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
351 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
352 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
353 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
354 | roll %esi # | ||
355 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
356 | movl %eax, %edi # c | 432 | movl %eax, %edi # c |
357 | xorl %ebx, %edi # ^d | 433 | xorl %ebx, %edi # ^d |
358 | xorl %ebp, %edi # ^b | 434 | xorl %ebp, %edi # ^b |
359 | leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 435 | addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] |
360 | addl %edi, %ecx # e += (c ^ d ^ b) | 436 | addl %edi, %ecx # e += (c ^ d ^ b) |
361 | movl %edx, %esi # | 437 | movl %edx, %esi # |
362 | roll $5, %esi # rotl32(a,5) | 438 | roll $5, %esi # rotl32(a,5) |
363 | addl %esi, %ecx # e += rotl32(a,5) | 439 | addl %esi, %ecx # e += rotl32(a,5) |
364 | rorl $2, %ebp # b = rotl32(b,30) | 440 | rorl $2, %ebp # b = rotl32(b,30) |
365 | # 23 | 441 | # 23 |
366 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
367 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
368 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
369 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
370 | roll %esi # | ||
371 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
372 | movl %ebp, %edi # c | 442 | movl %ebp, %edi # c |
373 | xorl %eax, %edi # ^d | 443 | xorl %eax, %edi # ^d |
374 | xorl %edx, %edi # ^b | 444 | xorl %edx, %edi # ^b |
375 | leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 445 | addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] |
376 | addl %edi, %ebx # e += (c ^ d ^ b) | 446 | addl %edi, %ebx # e += (c ^ d ^ b) |
377 | movl %ecx, %esi # | 447 | movl %ecx, %esi # |
378 | roll $5, %esi # rotl32(a,5) | 448 | roll $5, %esi # rotl32(a,5) |
379 | addl %esi, %ebx # e += rotl32(a,5) | 449 | addl %esi, %ebx # e += rotl32(a,5) |
380 | rorl $2, %edx # b = rotl32(b,30) | 450 | rorl $2, %edx # b = rotl32(b,30) |
451 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
452 | movaps %xmm3, %xmm4 | ||
453 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
454 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
455 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
456 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
457 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
458 | xorps %xmm5, %xmm0 # ^ | ||
459 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
460 | movaps %xmm0, %xmm5 | ||
461 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
462 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
463 | paddd %xmm0, %xmm0 # shift left by 1 | ||
464 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
465 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
466 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
467 | movaps %xmm5, %xmm4 | ||
468 | pslld $2, %xmm5 | ||
469 | psrld $30, %xmm4 | ||
470 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
471 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
472 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
473 | movaps %xmm0, %xmm5 | ||
474 | paddd %xmm6, %xmm5 | ||
475 | movups %xmm5, -64+16*0(%rsp) | ||
381 | # 24 | 476 | # 24 |
382 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
383 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
384 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
385 | roll %r8d # | ||
386 | movl %edx, %edi # c | 477 | movl %edx, %edi # c |
387 | xorl %ebp, %edi # ^d | 478 | xorl %ebp, %edi # ^d |
388 | xorl %ecx, %edi # ^b | 479 | xorl %ecx, %edi # ^b |
389 | leal 0x6ED9EBA1(%rax,%r8), %eax # e += RCONST + W[n & 15] | 480 | addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] |
390 | addl %edi, %eax # e += (c ^ d ^ b) | 481 | addl %edi, %eax # e += (c ^ d ^ b) |
391 | movl %ebx, %esi # | 482 | movl %ebx, %esi # |
392 | roll $5, %esi # rotl32(a,5) | 483 | roll $5, %esi # rotl32(a,5) |
393 | addl %esi, %eax # e += rotl32(a,5) | 484 | addl %esi, %eax # e += rotl32(a,5) |
394 | rorl $2, %ecx # b = rotl32(b,30) | 485 | rorl $2, %ecx # b = rotl32(b,30) |
395 | # 25 | 486 | # 25 |
396 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
397 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
398 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
399 | roll %r9d # | ||
400 | movl %ecx, %edi # c | 487 | movl %ecx, %edi # c |
401 | xorl %edx, %edi # ^d | 488 | xorl %edx, %edi # ^d |
402 | xorl %ebx, %edi # ^b | 489 | xorl %ebx, %edi # ^b |
403 | leal 0x6ED9EBA1(%rbp,%r9), %ebp # e += RCONST + W[n & 15] | 490 | addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] |
404 | addl %edi, %ebp # e += (c ^ d ^ b) | 491 | addl %edi, %ebp # e += (c ^ d ^ b) |
405 | movl %eax, %esi # | 492 | movl %eax, %esi # |
406 | roll $5, %esi # rotl32(a,5) | 493 | roll $5, %esi # rotl32(a,5) |
407 | addl %esi, %ebp # e += rotl32(a,5) | 494 | addl %esi, %ebp # e += rotl32(a,5) |
408 | rorl $2, %ebx # b = rotl32(b,30) | 495 | rorl $2, %ebx # b = rotl32(b,30) |
409 | # 26 | 496 | # 26 |
410 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
411 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
412 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
413 | roll %r10d # | ||
414 | movl %ebx, %edi # c | 497 | movl %ebx, %edi # c |
415 | xorl %ecx, %edi # ^d | 498 | xorl %ecx, %edi # ^d |
416 | xorl %eax, %edi # ^b | 499 | xorl %eax, %edi # ^b |
417 | leal 0x6ED9EBA1(%rdx,%r10), %edx # e += RCONST + W[n & 15] | 500 | addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] |
418 | addl %edi, %edx # e += (c ^ d ^ b) | 501 | addl %edi, %edx # e += (c ^ d ^ b) |
419 | movl %ebp, %esi # | 502 | movl %ebp, %esi # |
420 | roll $5, %esi # rotl32(a,5) | 503 | roll $5, %esi # rotl32(a,5) |
421 | addl %esi, %edx # e += rotl32(a,5) | 504 | addl %esi, %edx # e += rotl32(a,5) |
422 | rorl $2, %eax # b = rotl32(b,30) | 505 | rorl $2, %eax # b = rotl32(b,30) |
423 | # 27 | 506 | # 27 |
424 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
425 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
426 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
427 | roll %r11d # | ||
428 | movl %eax, %edi # c | 507 | movl %eax, %edi # c |
429 | xorl %ebx, %edi # ^d | 508 | xorl %ebx, %edi # ^d |
430 | xorl %ebp, %edi # ^b | 509 | xorl %ebp, %edi # ^b |
431 | leal 0x6ED9EBA1(%rcx,%r11), %ecx # e += RCONST + W[n & 15] | 510 | addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] |
432 | addl %edi, %ecx # e += (c ^ d ^ b) | 511 | addl %edi, %ecx # e += (c ^ d ^ b) |
433 | movl %edx, %esi # | 512 | movl %edx, %esi # |
434 | roll $5, %esi # rotl32(a,5) | 513 | roll $5, %esi # rotl32(a,5) |
435 | addl %esi, %ecx # e += rotl32(a,5) | 514 | addl %esi, %ecx # e += rotl32(a,5) |
436 | rorl $2, %ebp # b = rotl32(b,30) | 515 | rorl $2, %ebp # b = rotl32(b,30) |
516 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
517 | movaps %xmm0, %xmm4 | ||
518 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
519 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
520 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
521 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
522 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
523 | xorps %xmm5, %xmm1 # ^ | ||
524 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
525 | movaps %xmm1, %xmm5 | ||
526 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
527 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
528 | paddd %xmm1, %xmm1 # shift left by 1 | ||
529 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
530 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
531 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
532 | movaps %xmm5, %xmm4 | ||
533 | pslld $2, %xmm5 | ||
534 | psrld $30, %xmm4 | ||
535 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
536 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
537 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
538 | movaps %xmm1, %xmm5 | ||
539 | paddd %xmm6, %xmm5 | ||
540 | movups %xmm5, -64+16*1(%rsp) | ||
437 | # 28 | 541 | # 28 |
438 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
439 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
440 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
441 | roll %r12d # | ||
442 | movl %ebp, %edi # c | 542 | movl %ebp, %edi # c |
443 | xorl %eax, %edi # ^d | 543 | xorl %eax, %edi # ^d |
444 | xorl %edx, %edi # ^b | 544 | xorl %edx, %edi # ^b |
445 | leal 0x6ED9EBA1(%rbx,%r12), %ebx # e += RCONST + W[n & 15] | 545 | addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] |
446 | addl %edi, %ebx # e += (c ^ d ^ b) | 546 | addl %edi, %ebx # e += (c ^ d ^ b) |
447 | movl %ecx, %esi # | 547 | movl %ecx, %esi # |
448 | roll $5, %esi # rotl32(a,5) | 548 | roll $5, %esi # rotl32(a,5) |
449 | addl %esi, %ebx # e += rotl32(a,5) | 549 | addl %esi, %ebx # e += rotl32(a,5) |
450 | rorl $2, %edx # b = rotl32(b,30) | 550 | rorl $2, %edx # b = rotl32(b,30) |
451 | # 29 | 551 | # 29 |
452 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
453 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
454 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
455 | roll %r13d # | ||
456 | movl %edx, %edi # c | 552 | movl %edx, %edi # c |
457 | xorl %ebp, %edi # ^d | 553 | xorl %ebp, %edi # ^d |
458 | xorl %ecx, %edi # ^b | 554 | xorl %ecx, %edi # ^b |
459 | leal 0x6ED9EBA1(%rax,%r13), %eax # e += RCONST + W[n & 15] | 555 | addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] |
460 | addl %edi, %eax # e += (c ^ d ^ b) | 556 | addl %edi, %eax # e += (c ^ d ^ b) |
461 | movl %ebx, %esi # | 557 | movl %ebx, %esi # |
462 | roll $5, %esi # rotl32(a,5) | 558 | roll $5, %esi # rotl32(a,5) |
463 | addl %esi, %eax # e += rotl32(a,5) | 559 | addl %esi, %eax # e += rotl32(a,5) |
464 | rorl $2, %ecx # b = rotl32(b,30) | 560 | rorl $2, %ecx # b = rotl32(b,30) |
465 | # 30 | 561 | # 30 |
466 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
467 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
468 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
469 | roll %r14d # | ||
470 | movl %ecx, %edi # c | 562 | movl %ecx, %edi # c |
471 | xorl %edx, %edi # ^d | 563 | xorl %edx, %edi # ^d |
472 | xorl %ebx, %edi # ^b | 564 | xorl %ebx, %edi # ^b |
473 | leal 0x6ED9EBA1(%rbp,%r14), %ebp # e += RCONST + W[n & 15] | 565 | addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] |
474 | addl %edi, %ebp # e += (c ^ d ^ b) | 566 | addl %edi, %ebp # e += (c ^ d ^ b) |
475 | movl %eax, %esi # | 567 | movl %eax, %esi # |
476 | roll $5, %esi # rotl32(a,5) | 568 | roll $5, %esi # rotl32(a,5) |
477 | addl %esi, %ebp # e += rotl32(a,5) | 569 | addl %esi, %ebp # e += rotl32(a,5) |
478 | rorl $2, %ebx # b = rotl32(b,30) | 570 | rorl $2, %ebx # b = rotl32(b,30) |
479 | # 31 | 571 | # 31 |
480 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
481 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
482 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
483 | roll %r15d # | ||
484 | movl %ebx, %edi # c | 572 | movl %ebx, %edi # c |
485 | xorl %ecx, %edi # ^d | 573 | xorl %ecx, %edi # ^d |
486 | xorl %eax, %edi # ^b | 574 | xorl %eax, %edi # ^b |
487 | leal 0x6ED9EBA1(%rdx,%r15), %edx # e += RCONST + W[n & 15] | 575 | addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] |
488 | addl %edi, %edx # e += (c ^ d ^ b) | 576 | addl %edi, %edx # e += (c ^ d ^ b) |
489 | movl %ebp, %esi # | 577 | movl %ebp, %esi # |
490 | roll $5, %esi # rotl32(a,5) | 578 | roll $5, %esi # rotl32(a,5) |
491 | addl %esi, %edx # e += rotl32(a,5) | 579 | addl %esi, %edx # e += rotl32(a,5) |
492 | rorl $2, %eax # b = rotl32(b,30) | 580 | rorl $2, %eax # b = rotl32(b,30) |
581 | movaps rconst0x8F1BBCDC(%rip), %xmm6 | ||
582 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
583 | movaps %xmm1, %xmm4 | ||
584 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
585 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
586 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
587 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
588 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
589 | xorps %xmm5, %xmm2 # ^ | ||
590 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
591 | movaps %xmm2, %xmm5 | ||
592 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
593 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
594 | paddd %xmm2, %xmm2 # shift left by 1 | ||
595 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
596 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
597 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
598 | movaps %xmm5, %xmm4 | ||
599 | pslld $2, %xmm5 | ||
600 | psrld $30, %xmm4 | ||
601 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
602 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
603 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
604 | movaps %xmm2, %xmm5 | ||
605 | paddd %xmm6, %xmm5 | ||
606 | movups %xmm5, -64+16*2(%rsp) | ||
493 | # 32 | 607 | # 32 |
494 | movl %r13d, %esi # W[(n+13) & 15] | ||
495 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
496 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
497 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
498 | roll %esi # | ||
499 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
500 | movl %eax, %edi # c | 608 | movl %eax, %edi # c |
501 | xorl %ebx, %edi # ^d | 609 | xorl %ebx, %edi # ^d |
502 | xorl %ebp, %edi # ^b | 610 | xorl %ebp, %edi # ^b |
503 | leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 611 | addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] |
504 | addl %edi, %ecx # e += (c ^ d ^ b) | 612 | addl %edi, %ecx # e += (c ^ d ^ b) |
505 | movl %edx, %esi # | 613 | movl %edx, %esi # |
506 | roll $5, %esi # rotl32(a,5) | 614 | roll $5, %esi # rotl32(a,5) |
507 | addl %esi, %ecx # e += rotl32(a,5) | 615 | addl %esi, %ecx # e += rotl32(a,5) |
508 | rorl $2, %ebp # b = rotl32(b,30) | 616 | rorl $2, %ebp # b = rotl32(b,30) |
509 | # 33 | 617 | # 33 |
510 | movl %r14d, %esi # W[(n+13) & 15] | ||
511 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
512 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
513 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
514 | roll %esi # | ||
515 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
516 | movl %ebp, %edi # c | 618 | movl %ebp, %edi # c |
517 | xorl %eax, %edi # ^d | 619 | xorl %eax, %edi # ^d |
518 | xorl %edx, %edi # ^b | 620 | xorl %edx, %edi # ^b |
519 | leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 621 | addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] |
520 | addl %edi, %ebx # e += (c ^ d ^ b) | 622 | addl %edi, %ebx # e += (c ^ d ^ b) |
521 | movl %ecx, %esi # | 623 | movl %ecx, %esi # |
522 | roll $5, %esi # rotl32(a,5) | 624 | roll $5, %esi # rotl32(a,5) |
523 | addl %esi, %ebx # e += rotl32(a,5) | 625 | addl %esi, %ebx # e += rotl32(a,5) |
524 | rorl $2, %edx # b = rotl32(b,30) | 626 | rorl $2, %edx # b = rotl32(b,30) |
525 | # 34 | 627 | # 34 |
526 | movl %r15d, %esi # W[(n+13) & 15] | ||
527 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
528 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
529 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
530 | roll %esi # | ||
531 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
532 | movl %edx, %edi # c | 628 | movl %edx, %edi # c |
533 | xorl %ebp, %edi # ^d | 629 | xorl %ebp, %edi # ^d |
534 | xorl %ecx, %edi # ^b | 630 | xorl %ecx, %edi # ^b |
535 | leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 631 | addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] |
536 | addl %edi, %eax # e += (c ^ d ^ b) | 632 | addl %edi, %eax # e += (c ^ d ^ b) |
537 | movl %ebx, %esi # | 633 | movl %ebx, %esi # |
538 | roll $5, %esi # rotl32(a,5) | 634 | roll $5, %esi # rotl32(a,5) |
539 | addl %esi, %eax # e += rotl32(a,5) | 635 | addl %esi, %eax # e += rotl32(a,5) |
540 | rorl $2, %ecx # b = rotl32(b,30) | 636 | rorl $2, %ecx # b = rotl32(b,30) |
541 | # 35 | 637 | # 35 |
542 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
543 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
544 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
545 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
546 | roll %esi # | ||
547 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
548 | movl %ecx, %edi # c | 638 | movl %ecx, %edi # c |
549 | xorl %edx, %edi # ^d | 639 | xorl %edx, %edi # ^d |
550 | xorl %ebx, %edi # ^b | 640 | xorl %ebx, %edi # ^b |
551 | leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 641 | addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] |
552 | addl %edi, %ebp # e += (c ^ d ^ b) | 642 | addl %edi, %ebp # e += (c ^ d ^ b) |
553 | movl %eax, %esi # | 643 | movl %eax, %esi # |
554 | roll $5, %esi # rotl32(a,5) | 644 | roll $5, %esi # rotl32(a,5) |
555 | addl %esi, %ebp # e += rotl32(a,5) | 645 | addl %esi, %ebp # e += rotl32(a,5) |
556 | rorl $2, %ebx # b = rotl32(b,30) | 646 | rorl $2, %ebx # b = rotl32(b,30) |
647 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
648 | movaps %xmm2, %xmm4 | ||
649 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
650 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
651 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
652 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
653 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
654 | xorps %xmm5, %xmm3 # ^ | ||
655 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
656 | movaps %xmm3, %xmm5 | ||
657 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
658 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
659 | paddd %xmm3, %xmm3 # shift left by 1 | ||
660 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
661 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
662 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
663 | movaps %xmm5, %xmm4 | ||
664 | pslld $2, %xmm5 | ||
665 | psrld $30, %xmm4 | ||
666 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
667 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
668 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
669 | movaps %xmm3, %xmm5 | ||
670 | paddd %xmm6, %xmm5 | ||
671 | movups %xmm5, -64+16*3(%rsp) | ||
557 | # 36 | 672 | # 36 |
558 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
559 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
560 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
561 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
562 | roll %esi # | ||
563 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
564 | movl %ebx, %edi # c | 673 | movl %ebx, %edi # c |
565 | xorl %ecx, %edi # ^d | 674 | xorl %ecx, %edi # ^d |
566 | xorl %eax, %edi # ^b | 675 | xorl %eax, %edi # ^b |
567 | leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 676 | addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] |
568 | addl %edi, %edx # e += (c ^ d ^ b) | 677 | addl %edi, %edx # e += (c ^ d ^ b) |
569 | movl %ebp, %esi # | 678 | movl %ebp, %esi # |
570 | roll $5, %esi # rotl32(a,5) | 679 | roll $5, %esi # rotl32(a,5) |
571 | addl %esi, %edx # e += rotl32(a,5) | 680 | addl %esi, %edx # e += rotl32(a,5) |
572 | rorl $2, %eax # b = rotl32(b,30) | 681 | rorl $2, %eax # b = rotl32(b,30) |
573 | # 37 | 682 | # 37 |
574 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
575 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
576 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
577 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
578 | roll %esi # | ||
579 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
580 | movl %eax, %edi # c | 683 | movl %eax, %edi # c |
581 | xorl %ebx, %edi # ^d | 684 | xorl %ebx, %edi # ^d |
582 | xorl %ebp, %edi # ^b | 685 | xorl %ebp, %edi # ^b |
583 | leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 686 | addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] |
584 | addl %edi, %ecx # e += (c ^ d ^ b) | 687 | addl %edi, %ecx # e += (c ^ d ^ b) |
585 | movl %edx, %esi # | 688 | movl %edx, %esi # |
586 | roll $5, %esi # rotl32(a,5) | 689 | roll $5, %esi # rotl32(a,5) |
587 | addl %esi, %ecx # e += rotl32(a,5) | 690 | addl %esi, %ecx # e += rotl32(a,5) |
588 | rorl $2, %ebp # b = rotl32(b,30) | 691 | rorl $2, %ebp # b = rotl32(b,30) |
589 | # 38 | 692 | # 38 |
590 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
591 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
592 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
593 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
594 | roll %esi # | ||
595 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
596 | movl %ebp, %edi # c | 693 | movl %ebp, %edi # c |
597 | xorl %eax, %edi # ^d | 694 | xorl %eax, %edi # ^d |
598 | xorl %edx, %edi # ^b | 695 | xorl %edx, %edi # ^b |
599 | leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 696 | addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] |
600 | addl %edi, %ebx # e += (c ^ d ^ b) | 697 | addl %edi, %ebx # e += (c ^ d ^ b) |
601 | movl %ecx, %esi # | 698 | movl %ecx, %esi # |
602 | roll $5, %esi # rotl32(a,5) | 699 | roll $5, %esi # rotl32(a,5) |
603 | addl %esi, %ebx # e += rotl32(a,5) | 700 | addl %esi, %ebx # e += rotl32(a,5) |
604 | rorl $2, %edx # b = rotl32(b,30) | 701 | rorl $2, %edx # b = rotl32(b,30) |
605 | # 39 | 702 | # 39 |
606 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
607 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
608 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
609 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
610 | roll %esi # | ||
611 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
612 | movl %edx, %edi # c | 703 | movl %edx, %edi # c |
613 | xorl %ebp, %edi # ^d | 704 | xorl %ebp, %edi # ^d |
614 | xorl %ecx, %edi # ^b | 705 | xorl %ecx, %edi # ^b |
615 | leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 706 | addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] |
616 | addl %edi, %eax # e += (c ^ d ^ b) | 707 | addl %edi, %eax # e += (c ^ d ^ b) |
617 | movl %ebx, %esi # | 708 | movl %ebx, %esi # |
618 | roll $5, %esi # rotl32(a,5) | 709 | roll $5, %esi # rotl32(a,5) |
619 | addl %esi, %eax # e += rotl32(a,5) | 710 | addl %esi, %eax # e += rotl32(a,5) |
620 | rorl $2, %ecx # b = rotl32(b,30) | 711 | rorl $2, %ecx # b = rotl32(b,30) |
712 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
713 | movaps %xmm3, %xmm4 | ||
714 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
715 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
716 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
717 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
718 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
719 | xorps %xmm5, %xmm0 # ^ | ||
720 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
721 | movaps %xmm0, %xmm5 | ||
722 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
723 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
724 | paddd %xmm0, %xmm0 # shift left by 1 | ||
725 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
726 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
727 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
728 | movaps %xmm5, %xmm4 | ||
729 | pslld $2, %xmm5 | ||
730 | psrld $30, %xmm4 | ||
731 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
732 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
733 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
734 | movaps %xmm0, %xmm5 | ||
735 | paddd %xmm6, %xmm5 | ||
736 | movups %xmm5, -64+16*0(%rsp) | ||
621 | # 40 | 737 | # 40 |
622 | movl %ebx, %edi # di: b | 738 | movl %ebx, %edi # di: b |
623 | movl %ebx, %esi # si: b | 739 | movl %ebx, %esi # si: b |
@@ -625,12 +741,8 @@ sha1_process_block64:
625 | andl %ecx, %esi # si: b & c | 741 | andl %ecx, %esi # si: b & c |
626 | andl %edx, %edi # di: (b | c) & d | 742 | andl %edx, %edi # di: (b | c) & d |
627 | orl %esi, %edi # ((b | c) & d) | (b & c) | 743 | orl %esi, %edi # ((b | c) & d) | (b & c) |
628 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
629 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
630 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
631 | roll %r8d # | ||
632 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 744 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
633 | leal -0x70E44324(%rbp,%r8), %ebp # e += RCONST + W[n & 15] | 745 | addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] |
634 | movl %eax, %esi # | 746 | movl %eax, %esi # |
635 | roll $5, %esi # rotl32(a,5) | 747 | roll $5, %esi # rotl32(a,5) |
636 | addl %esi, %ebp # e += rotl32(a,5) | 748 | addl %esi, %ebp # e += rotl32(a,5) |
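(Annotation, not part of the patch: from round 40 the code switches F to the majority function, computed above as ((b | c) & d) | (b & c), and the constant to 0x8F1BBCDC; the old code's leal wrote the same 32-bit value as -0x70E44324. A sketch of one step, rnd_maj being a hypothetical name:

static void rnd_maj(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                    uint32_t *e, uint32_t rconst_plus_w)   /* hypothetical */
{
        *e += ((*b | c) & d) | (*b & c);  /* majority(b,c,d)        */
        *e += rconst_plus_w;              /* K = 0x8F1BBCDC here    */
        *e += ROTL32(a, 5);
        *b = ROTL32(*b, 30);
}
)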
@@ -642,12 +754,8 @@ sha1_process_block64:
642 | andl %ebx, %esi # si: b & c | 754 | andl %ebx, %esi # si: b & c |
643 | andl %ecx, %edi # di: (b | c) & d | 755 | andl %ecx, %edi # di: (b | c) & d |
644 | orl %esi, %edi # ((b | c) & d) | (b & c) | 756 | orl %esi, %edi # ((b | c) & d) | (b & c) |
645 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
646 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
647 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
648 | roll %r9d # | ||
649 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 757 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
650 | leal -0x70E44324(%rdx,%r9), %edx # e += RCONST + W[n & 15] | 758 | addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] |
651 | movl %ebp, %esi # | 759 | movl %ebp, %esi # |
652 | roll $5, %esi # rotl32(a,5) | 760 | roll $5, %esi # rotl32(a,5) |
653 | addl %esi, %edx # e += rotl32(a,5) | 761 | addl %esi, %edx # e += rotl32(a,5) |
@@ -659,12 +767,8 @@ sha1_process_block64:
659 | andl %eax, %esi # si: b & c | 767 | andl %eax, %esi # si: b & c |
660 | andl %ebx, %edi # di: (b | c) & d | 768 | andl %ebx, %edi # di: (b | c) & d |
661 | orl %esi, %edi # ((b | c) & d) | (b & c) | 769 | orl %esi, %edi # ((b | c) & d) | (b & c) |
662 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
663 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
664 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
665 | roll %r10d # | ||
666 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 770 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
667 | leal -0x70E44324(%rcx,%r10), %ecx # e += RCONST + W[n & 15] | 771 | addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] |
668 | movl %edx, %esi # | 772 | movl %edx, %esi # |
669 | roll $5, %esi # rotl32(a,5) | 773 | roll $5, %esi # rotl32(a,5) |
670 | addl %esi, %ecx # e += rotl32(a,5) | 774 | addl %esi, %ecx # e += rotl32(a,5) |
@@ -676,16 +780,37 @@ sha1_process_block64:
676 | andl %ebp, %esi # si: b & c | 780 | andl %ebp, %esi # si: b & c |
677 | andl %eax, %edi # di: (b | c) & d | 781 | andl %eax, %edi # di: (b | c) & d |
678 | orl %esi, %edi # ((b | c) & d) | (b & c) | 782 | orl %esi, %edi # ((b | c) & d) | (b & c) |
679 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
680 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
681 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
682 | roll %r11d # | ||
683 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 783 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
684 | leal -0x70E44324(%rbx,%r11), %ebx # e += RCONST + W[n & 15] | 784 | addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] |
685 | movl %ecx, %esi # | 785 | movl %ecx, %esi # |
686 | roll $5, %esi # rotl32(a,5) | 786 | roll $5, %esi # rotl32(a,5) |
687 | addl %esi, %ebx # e += rotl32(a,5) | 787 | addl %esi, %ebx # e += rotl32(a,5) |
688 | rorl $2, %edx # b = rotl32(b,30) | 788 | rorl $2, %edx # b = rotl32(b,30) |
789 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
790 | movaps %xmm0, %xmm4 | ||
791 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
792 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
793 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
794 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
795 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
796 | xorps %xmm5, %xmm1 # ^ | ||
797 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
798 | movaps %xmm1, %xmm5 | ||
799 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
800 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
801 | paddd %xmm1, %xmm1 # shift left by 1 | ||
802 | psubd %xmm4, %xmm1 # add 1 to elements that had msb set | ||
803 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
804 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
805 | movaps %xmm5, %xmm4 | ||
806 | pslld $2, %xmm5 | ||
807 | psrld $30, %xmm4 | ||
808 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
809 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
810 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
811 | movaps %xmm1, %xmm5 | ||
812 | paddd %xmm6, %xmm5 | ||
813 | movups %xmm5, -64+16*1(%rsp) | ||
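Editor's note on the PREP blocks: each one computes four message-schedule words W[t..t+3] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1) at once and stores W[] + RCONST for the scalar rounds. Lane 3 initially lacks its W[t-3] term (that word, W[t], is being produced in the same step), so it is patched afterwards using the fact that rol is linear over XOR: the missing contribution is rol(W[t],1), i.e. rol by 2 of the unrotated lane 0. A rough C/SSE2-intrinsics sketch of the same computation (the function name sha1_prep and the slot argument are illustrative, not busybox code):

#include <emmintrin.h>  /* SSE2 intrinsics */

/* Given W[t-16..t-13] (w0), W[t-12..t-9] (w4), W[t-8..t-5] (w8) and
 * W[t-4..t-1] (w12), return W[t..t+3] and store W[t..t+3] + K to the
 * stack slot that the scalar rounds read with addl. */
static __m128i sha1_prep(__m128i w0, __m128i w4, __m128i w8,
                         __m128i w12, __m128i k, __m128i *slot)
{
	__m128i t1, t2, x, msb, save;

	t1 = _mm_srli_si128(w12, 4);                   /* psrldq: (W[t-3],W[t-2],W[t-1],0) */
	t2 = _mm_shuffle_epi32(w0, 0x4e);              /* pshufd: swap 64-bit halves */
	t2 = _mm_unpacklo_epi64(t2, w4);               /* punpcklqdq: (W[t-14..t-11]) */
	x  = _mm_xor_si128(w0, w8);                    /* W[t-16] ^ W[t-8] */
	x  = _mm_xor_si128(x, _mm_xor_si128(t1, t2));  /* ^ W[t-3] ^ W[t-14]; lane 3 lacks W[t] */
	save = x;                                      /* unrotated copy, lane 0 = pre-rol W[t] */
	msb = _mm_cmpgt_epi32(_mm_setzero_si128(), x); /* pcmpgtd: -1 where sign bit set */
	x  = _mm_add_epi32(x, x);                      /* paddd: shift left by 1 */
	x  = _mm_sub_epi32(x, msb);                    /* psubd: +1 where msb was set => rol 1 */
	save = _mm_slli_si128(save, 12);               /* pslldq: (0,0,0, unrotated lane 0) */
	x  = _mm_xor_si128(x, _mm_slli_epi32(save, 2));
	x  = _mm_xor_si128(x, _mm_srli_epi32(save, 30)); /* together: lane 3 ^= rol(W[t],1) */
	_mm_storeu_si128(slot, _mm_add_epi32(x, k));   /* movups of W[] + RCONST */
	return x;
}

As in the assembly, the two shifted copies are XORed straight into the result instead of being combined first, which is what the commented-out xorps line refers to: same value, but a shorter dependency chain.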
689 | # 44 | 814 | # 44 |
690 | movl %ecx, %edi # di: b | 815 | movl %ecx, %edi # di: b |
691 | movl %ecx, %esi # si: b | 816 | movl %ecx, %esi # si: b |
@@ -693,12 +818,8 @@ sha1_process_block64: | |||
693 | andl %edx, %esi # si: b & c | 818 | andl %edx, %esi # si: b & c |
694 | andl %ebp, %edi # di: (b | c) & d | 819 | andl %ebp, %edi # di: (b | c) & d |
695 | orl %esi, %edi # ((b | c) & d) | (b & c) | 820 | orl %esi, %edi # ((b | c) & d) | (b & c) |
696 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
697 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
698 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
699 | roll %r12d # | ||
700 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 821 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
701 | leal -0x70E44324(%rax,%r12), %eax # e += RCONST + W[n & 15] | 822 | addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] |
702 | movl %ebx, %esi # | 823 | movl %ebx, %esi # |
703 | roll $5, %esi # rotl32(a,5) | 824 | roll $5, %esi # rotl32(a,5) |
704 | addl %esi, %eax # e += rotl32(a,5) | 825 | addl %esi, %eax # e += rotl32(a,5) |
@@ -710,12 +831,8 @@ sha1_process_block64: | |||
710 | andl %ecx, %esi # si: b & c | 831 | andl %ecx, %esi # si: b & c |
711 | andl %edx, %edi # di: (b | c) & d | 832 | andl %edx, %edi # di: (b | c) & d |
712 | orl %esi, %edi # ((b | c) & d) | (b & c) | 833 | orl %esi, %edi # ((b | c) & d) | (b & c) |
713 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
714 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
715 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
716 | roll %r13d # | ||
717 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 834 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
718 | leal -0x70E44324(%rbp,%r13), %ebp # e += RCONST + W[n & 15] | 835 | addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] |
719 | movl %eax, %esi # | 836 | movl %eax, %esi # |
720 | roll $5, %esi # rotl32(a,5) | 837 | roll $5, %esi # rotl32(a,5) |
721 | addl %esi, %ebp # e += rotl32(a,5) | 838 | addl %esi, %ebp # e += rotl32(a,5) |
@@ -727,12 +844,8 @@ sha1_process_block64: | |||
727 | andl %ebx, %esi # si: b & c | 844 | andl %ebx, %esi # si: b & c |
728 | andl %ecx, %edi # di: (b | c) & d | 845 | andl %ecx, %edi # di: (b | c) & d |
729 | orl %esi, %edi # ((b | c) & d) | (b & c) | 846 | orl %esi, %edi # ((b | c) & d) | (b & c) |
730 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
731 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
732 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
733 | roll %r14d # | ||
734 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 847 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
735 | leal -0x70E44324(%rdx,%r14), %edx # e += RCONST + W[n & 15] | 848 | addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] |
736 | movl %ebp, %esi # | 849 | movl %ebp, %esi # |
737 | roll $5, %esi # rotl32(a,5) | 850 | roll $5, %esi # rotl32(a,5) |
738 | addl %esi, %edx # e += rotl32(a,5) | 851 | addl %esi, %edx # e += rotl32(a,5) |
@@ -744,16 +857,37 @@ sha1_process_block64: | |||
744 | andl %eax, %esi # si: b & c | 857 | andl %eax, %esi # si: b & c |
745 | andl %ebx, %edi # di: (b | c) & d | 858 | andl %ebx, %edi # di: (b | c) & d |
746 | orl %esi, %edi # ((b | c) & d) | (b & c) | 859 | orl %esi, %edi # ((b | c) & d) | (b & c) |
747 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
748 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
749 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
750 | roll %r15d # | ||
751 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 860 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
752 | leal -0x70E44324(%rcx,%r15), %ecx # e += RCONST + W[n & 15] | 861 | addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] |
753 | movl %edx, %esi # | 862 | movl %edx, %esi # |
754 | roll $5, %esi # rotl32(a,5) | 863 | roll $5, %esi # rotl32(a,5) |
755 | addl %esi, %ecx # e += rotl32(a,5) | 864 | addl %esi, %ecx # e += rotl32(a,5) |
756 | rorl $2, %ebp # b = rotl32(b,30) | 865 | rorl $2, %ebp # b = rotl32(b,30) |
866 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
867 | movaps %xmm1, %xmm4 | ||
868 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
869 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
870 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
871 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
872 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
873 | xorps %xmm5, %xmm2 # ^ | ||
874 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
875 | movaps %xmm2, %xmm5 | ||
876 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
877 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
878 | paddd %xmm2, %xmm2 # shift left by 1 | ||
879 | psubd %xmm4, %xmm2 # add 1 to elements that had msb set | ||
880 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
881 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
882 | movaps %xmm5, %xmm4 | ||
883 | pslld $2, %xmm5 | ||
884 | psrld $30, %xmm4 | ||
885 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
886 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
887 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
888 | movaps %xmm2, %xmm5 | ||
889 | paddd %xmm6, %xmm5 | ||
890 | movups %xmm5, -64+16*2(%rsp) | ||
757 | # 48 | 891 | # 48 |
758 | movl %edx, %edi # di: b | 892 | movl %edx, %edi # di: b |
759 | movl %edx, %esi # si: b | 893 | movl %edx, %esi # si: b |
@@ -761,14 +895,8 @@ sha1_process_block64: | |||
761 | andl %ebp, %esi # si: b & c | 895 | andl %ebp, %esi # si: b & c |
762 | andl %eax, %edi # di: (b | c) & d | 896 | andl %eax, %edi # di: (b | c) & d |
763 | orl %esi, %edi # ((b | c) & d) | (b & c) | 897 | orl %esi, %edi # ((b | c) & d) | (b & c) |
764 | movl %r13d, %esi # W[(n+13) & 15] | ||
765 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
766 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
767 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
768 | roll %esi # | ||
769 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
770 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 898 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
771 | leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 899 | addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] |
772 | movl %ecx, %esi # | 900 | movl %ecx, %esi # |
773 | roll $5, %esi # rotl32(a,5) | 901 | roll $5, %esi # rotl32(a,5) |
774 | addl %esi, %ebx # e += rotl32(a,5) | 902 | addl %esi, %ebx # e += rotl32(a,5) |
@@ -780,14 +908,8 @@ sha1_process_block64: | |||
780 | andl %edx, %esi # si: b & c | 908 | andl %edx, %esi # si: b & c |
781 | andl %ebp, %edi # di: (b | c) & d | 909 | andl %ebp, %edi # di: (b | c) & d |
782 | orl %esi, %edi # ((b | c) & d) | (b & c) | 910 | orl %esi, %edi # ((b | c) & d) | (b & c) |
783 | movl %r14d, %esi # W[(n+13) & 15] | ||
784 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
785 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
786 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
787 | roll %esi # | ||
788 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
789 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 911 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
790 | leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 912 | addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] |
791 | movl %ebx, %esi # | 913 | movl %ebx, %esi # |
792 | roll $5, %esi # rotl32(a,5) | 914 | roll $5, %esi # rotl32(a,5) |
793 | addl %esi, %eax # e += rotl32(a,5) | 915 | addl %esi, %eax # e += rotl32(a,5) |
@@ -799,14 +921,8 @@ sha1_process_block64: | |||
799 | andl %ecx, %esi # si: b & c | 921 | andl %ecx, %esi # si: b & c |
800 | andl %edx, %edi # di: (b | c) & d | 922 | andl %edx, %edi # di: (b | c) & d |
801 | orl %esi, %edi # ((b | c) & d) | (b & c) | 923 | orl %esi, %edi # ((b | c) & d) | (b & c) |
802 | movl %r15d, %esi # W[(n+13) & 15] | ||
803 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
804 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
805 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
806 | roll %esi # | ||
807 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
808 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 924 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
809 | leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 925 | addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] |
810 | movl %eax, %esi # | 926 | movl %eax, %esi # |
811 | roll $5, %esi # rotl32(a,5) | 927 | roll $5, %esi # rotl32(a,5) |
812 | addl %esi, %ebp # e += rotl32(a,5) | 928 | addl %esi, %ebp # e += rotl32(a,5) |
@@ -818,18 +934,38 @@ sha1_process_block64: | |||
818 | andl %ebx, %esi # si: b & c | 934 | andl %ebx, %esi # si: b & c |
819 | andl %ecx, %edi # di: (b | c) & d | 935 | andl %ecx, %edi # di: (b | c) & d |
820 | orl %esi, %edi # ((b | c) & d) | (b & c) | 936 | orl %esi, %edi # ((b | c) & d) | (b & c) |
821 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
822 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
823 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
824 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
825 | roll %esi # | ||
826 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
827 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 937 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
828 | leal -0x70E44324(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 938 | addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] |
829 | movl %ebp, %esi # | 939 | movl %ebp, %esi # |
830 | roll $5, %esi # rotl32(a,5) | 940 | roll $5, %esi # rotl32(a,5) |
831 | addl %esi, %edx # e += rotl32(a,5) | 941 | addl %esi, %edx # e += rotl32(a,5) |
832 | rorl $2, %eax # b = rotl32(b,30) | 942 | rorl $2, %eax # b = rotl32(b,30) |
943 | movaps rconst0xCA62C1D6(%rip), %xmm6 | ||
944 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
945 | movaps %xmm2, %xmm4 | ||
946 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
947 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
948 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
949 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
950 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
951 | xorps %xmm5, %xmm3 # ^ | ||
952 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
953 | movaps %xmm3, %xmm5 | ||
954 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
955 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
956 | paddd %xmm3, %xmm3 # shift left by 1 | ||
957 | psubd %xmm4, %xmm3 # add 1 to elements that had msb set | ||
958 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
959 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
960 | movaps %xmm5, %xmm4 | ||
961 | pslld $2, %xmm5 | ||
962 | psrld $30, %xmm4 | ||
963 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
964 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
965 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
966 | movaps %xmm3, %xmm5 | ||
967 | paddd %xmm6, %xmm5 | ||
968 | movups %xmm5, -64+16*3(%rsp) | ||
833 | # 52 | 969 | # 52 |
834 | movl %ebp, %edi # di: b | 970 | movl %ebp, %edi # di: b |
835 | movl %ebp, %esi # si: b | 971 | movl %ebp, %esi # si: b |
@@ -837,14 +973,8 @@ sha1_process_block64: | |||
837 | andl %eax, %esi # si: b & c | 973 | andl %eax, %esi # si: b & c |
838 | andl %ebx, %edi # di: (b | c) & d | 974 | andl %ebx, %edi # di: (b | c) & d |
839 | orl %esi, %edi # ((b | c) & d) | (b & c) | 975 | orl %esi, %edi # ((b | c) & d) | (b & c) |
840 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
841 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
842 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
843 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
844 | roll %esi # | ||
845 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
846 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 976 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
847 | leal -0x70E44324(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 977 | addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] |
848 | movl %edx, %esi # | 978 | movl %edx, %esi # |
849 | roll $5, %esi # rotl32(a,5) | 979 | roll $5, %esi # rotl32(a,5) |
850 | addl %esi, %ecx # e += rotl32(a,5) | 980 | addl %esi, %ecx # e += rotl32(a,5) |
@@ -856,14 +986,8 @@ sha1_process_block64: | |||
856 | andl %ebp, %esi # si: b & c | 986 | andl %ebp, %esi # si: b & c |
857 | andl %eax, %edi # di: (b | c) & d | 987 | andl %eax, %edi # di: (b | c) & d |
858 | orl %esi, %edi # ((b | c) & d) | (b & c) | 988 | orl %esi, %edi # ((b | c) & d) | (b & c) |
859 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
860 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
861 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
862 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
863 | roll %esi # | ||
864 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
865 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 989 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
866 | leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 990 | addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] |
867 | movl %ecx, %esi # | 991 | movl %ecx, %esi # |
868 | roll $5, %esi # rotl32(a,5) | 992 | roll $5, %esi # rotl32(a,5) |
869 | addl %esi, %ebx # e += rotl32(a,5) | 993 | addl %esi, %ebx # e += rotl32(a,5) |
@@ -875,14 +999,8 @@ sha1_process_block64: | |||
875 | andl %edx, %esi # si: b & c | 999 | andl %edx, %esi # si: b & c |
876 | andl %ebp, %edi # di: (b | c) & d | 1000 | andl %ebp, %edi # di: (b | c) & d |
877 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1001 | orl %esi, %edi # ((b | c) & d) | (b & c) |
878 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
879 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
880 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
881 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
882 | roll %esi # | ||
883 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
884 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 1002 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
885 | leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 1003 | addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] |
886 | movl %ebx, %esi # | 1004 | movl %ebx, %esi # |
887 | roll $5, %esi # rotl32(a,5) | 1005 | roll $5, %esi # rotl32(a,5) |
888 | addl %esi, %eax # e += rotl32(a,5) | 1006 | addl %esi, %eax # e += rotl32(a,5) |
@@ -894,18 +1012,37 @@ sha1_process_block64: | |||
894 | andl %ecx, %esi # si: b & c | 1012 | andl %ecx, %esi # si: b & c |
895 | andl %edx, %edi # di: (b | c) & d | 1013 | andl %edx, %edi # di: (b | c) & d |
896 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1014 | orl %esi, %edi # ((b | c) & d) | (b & c) |
897 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
898 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
899 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
900 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
901 | roll %esi # | ||
902 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
903 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 1015 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
904 | leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 1016 | addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] |
905 | movl %eax, %esi # | 1017 | movl %eax, %esi # |
906 | roll $5, %esi # rotl32(a,5) | 1018 | roll $5, %esi # rotl32(a,5) |
907 | addl %esi, %ebp # e += rotl32(a,5) | 1019 | addl %esi, %ebp # e += rotl32(a,5) |
908 | rorl $2, %ebx # b = rotl32(b,30) | 1020 | rorl $2, %ebx # b = rotl32(b,30) |
1021 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
1022 | movaps %xmm3, %xmm4 | ||
1023 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1024 | pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1025 | punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1026 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1027 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1028 | xorps %xmm5, %xmm0 # ^ | ||
1029 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1030 | movaps %xmm0, %xmm5 | ||
1031 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1032 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1033 | paddd %xmm0, %xmm0 # shift left by 1 | ||
1034 | psubd %xmm4, %xmm0 # add 1 to elements that had msb set | ||
1035 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1036 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1037 | movaps %xmm5, %xmm4 | ||
1038 | pslld $2, %xmm5 | ||
1039 | psrld $30, %xmm4 | ||
1040 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1041 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
1042 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1043 | movaps %xmm0, %xmm5 | ||
1044 | paddd %xmm6, %xmm5 | ||
1045 | movups %xmm5, -64+16*0(%rsp) | ||
909 | # 56 | 1046 | # 56 |
910 | movl %eax, %edi # di: b | 1047 | movl %eax, %edi # di: b |
911 | movl %eax, %esi # si: b | 1048 | movl %eax, %esi # si: b |
@@ -913,12 +1050,8 @@ sha1_process_block64: | |||
913 | andl %ebx, %esi # si: b & c | 1050 | andl %ebx, %esi # si: b & c |
914 | andl %ecx, %edi # di: (b | c) & d | 1051 | andl %ecx, %edi # di: (b | c) & d |
915 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1052 | orl %esi, %edi # ((b | c) & d) | (b & c) |
916 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
917 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
918 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
919 | roll %r8d # | ||
920 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 1053 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
921 | leal -0x70E44324(%rdx,%r8), %edx # e += RCONST + W[n & 15] | 1054 | addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] |
922 | movl %ebp, %esi # | 1055 | movl %ebp, %esi # |
923 | roll $5, %esi # rotl32(a,5) | 1056 | roll $5, %esi # rotl32(a,5) |
924 | addl %esi, %edx # e += rotl32(a,5) | 1057 | addl %esi, %edx # e += rotl32(a,5) |
@@ -930,12 +1063,8 @@ sha1_process_block64: | |||
930 | andl %eax, %esi # si: b & c | 1063 | andl %eax, %esi # si: b & c |
931 | andl %ebx, %edi # di: (b | c) & d | 1064 | andl %ebx, %edi # di: (b | c) & d |
932 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1065 | orl %esi, %edi # ((b | c) & d) | (b & c) |
933 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
934 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
935 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
936 | roll %r9d # | ||
937 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 1066 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
938 | leal -0x70E44324(%rcx,%r9), %ecx # e += RCONST + W[n & 15] | 1067 | addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] |
939 | movl %edx, %esi # | 1068 | movl %edx, %esi # |
940 | roll $5, %esi # rotl32(a,5) | 1069 | roll $5, %esi # rotl32(a,5) |
941 | addl %esi, %ecx # e += rotl32(a,5) | 1070 | addl %esi, %ecx # e += rotl32(a,5) |
@@ -947,12 +1076,8 @@ sha1_process_block64: | |||
947 | andl %ebp, %esi # si: b & c | 1076 | andl %ebp, %esi # si: b & c |
948 | andl %eax, %edi # di: (b | c) & d | 1077 | andl %eax, %edi # di: (b | c) & d |
949 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1078 | orl %esi, %edi # ((b | c) & d) | (b & c) |
950 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
951 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
952 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
953 | roll %r10d # | ||
954 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 1079 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
955 | leal -0x70E44324(%rbx,%r10), %ebx # e += RCONST + W[n & 15] | 1080 | addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] |
956 | movl %ecx, %esi # | 1081 | movl %ecx, %esi # |
957 | roll $5, %esi # rotl32(a,5) | 1082 | roll $5, %esi # rotl32(a,5) |
958 | addl %esi, %ebx # e += rotl32(a,5) | 1083 | addl %esi, %ebx # e += rotl32(a,5) |
@@ -964,307 +1089,282 @@ sha1_process_block64: | |||
964 | andl %edx, %esi # si: b & c | 1089 | andl %edx, %esi # si: b & c |
965 | andl %ebp, %edi # di: (b | c) & d | 1090 | andl %ebp, %edi # di: (b | c) & d |
966 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1091 | orl %esi, %edi # ((b | c) & d) | (b & c) |
967 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
968 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
969 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
970 | roll %r11d # | ||
971 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 1092 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
972 | leal -0x70E44324(%rax,%r11), %eax # e += RCONST + W[n & 15] | 1093 | addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] |
973 | movl %ebx, %esi # | 1094 | movl %ebx, %esi # |
974 | roll $5, %esi # rotl32(a,5) | 1095 | roll $5, %esi # rotl32(a,5) |
975 | addl %esi, %eax # e += rotl32(a,5) | 1096 | addl %esi, %eax # e += rotl32(a,5) |
976 | rorl $2, %ecx # b = rotl32(b,30) | 1097 | rorl $2, %ecx # b = rotl32(b,30) |
1098 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
1099 | movaps %xmm0, %xmm4 | ||
1100 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1101 | pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1102 | punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1103 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1104 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1105 | xorps %xmm5, %xmm1 # ^ | ||
1106 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1107 | movaps %xmm1, %xmm5 | ||
1108 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1109 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1110 | paddd %xmm1, %xmm1 # shift left by 1 | ||
1111 | psubd %xmm4, %xmm1 # add 1 to elements that had msb set | ||
1112 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1113 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1114 | movaps %xmm5, %xmm4 | ||
1115 | pslld $2, %xmm5 | ||
1116 | psrld $30, %xmm4 | ||
1117 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1118 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
1119 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1120 | movaps %xmm1, %xmm5 | ||
1121 | paddd %xmm6, %xmm5 | ||
1122 | movups %xmm5, -64+16*1(%rsp) | ||
977 | # 60 | 1123 | # 60 |
978 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
979 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
980 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
981 | roll %r12d # | ||
982 | movl %ecx, %edi # c | 1124 | movl %ecx, %edi # c |
983 | xorl %edx, %edi # ^d | 1125 | xorl %edx, %edi # ^d |
984 | xorl %ebx, %edi # ^b | 1126 | xorl %ebx, %edi # ^b |
985 | leal -0x359D3E2A(%rbp,%r12), %ebp # e += RCONST + W[n & 15] | 1127 | addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] |
986 | addl %edi, %ebp # e += (c ^ d ^ b) | 1128 | addl %edi, %ebp # e += (c ^ d ^ b) |
987 | movl %eax, %esi # | 1129 | movl %eax, %esi # |
988 | roll $5, %esi # rotl32(a,5) | 1130 | roll $5, %esi # rotl32(a,5) |
989 | addl %esi, %ebp # e += rotl32(a,5) | 1131 | addl %esi, %ebp # e += rotl32(a,5) |
990 | rorl $2, %ebx # b = rotl32(b,30) | 1132 | rorl $2, %ebx # b = rotl32(b,30) |
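Editor's note on rounds 60..79: only the mixing function changes, from majority to plain parity (the c, ^d, ^b chain in the comments above), and the stack slots now carry W[] + 0xCA62C1D6 in place of the old leal -0x359D3E2A form. A sketch in the same style as before (names illustrative, not from busybox):

#include <stdint.h>

#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

/* One SHA-1 round for t = 60..79; wk is W[t] + 0xCA62C1D6 read from
 * the stack slot filled by the vector code. */
static void sha1_round60_79(uint32_t a, uint32_t *b, uint32_t c,
                            uint32_t d, uint32_t *e, uint32_t wk)
{
	uint32_t f = *b ^ c ^ d;                  /* parity instead of majority */
	*e += f + wk + ROTL32(a, 5);
	*b = ROTL32(*b, 30);
}

Everything else, the rotl32(a,5), the rorl $2 of b, and the single addl of the precomputed slot, is identical to the earlier rounds.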
991 | # 61 | 1133 | # 61 |
992 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
993 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
994 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
995 | roll %r13d # | ||
996 | movl %ebx, %edi # c | 1134 | movl %ebx, %edi # c |
997 | xorl %ecx, %edi # ^d | 1135 | xorl %ecx, %edi # ^d |
998 | xorl %eax, %edi # ^b | 1136 | xorl %eax, %edi # ^b |
999 | leal -0x359D3E2A(%rdx,%r13), %edx # e += RCONST + W[n & 15] | 1137 | addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] |
1000 | addl %edi, %edx # e += (c ^ d ^ b) | 1138 | addl %edi, %edx # e += (c ^ d ^ b) |
1001 | movl %ebp, %esi # | 1139 | movl %ebp, %esi # |
1002 | roll $5, %esi # rotl32(a,5) | 1140 | roll $5, %esi # rotl32(a,5) |
1003 | addl %esi, %edx # e += rotl32(a,5) | 1141 | addl %esi, %edx # e += rotl32(a,5) |
1004 | rorl $2, %eax # b = rotl32(b,30) | 1142 | rorl $2, %eax # b = rotl32(b,30) |
1005 | # 62 | 1143 | # 62 |
1006 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
1007 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
1008 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
1009 | roll %r14d # | ||
1010 | movl %eax, %edi # c | 1144 | movl %eax, %edi # c |
1011 | xorl %ebx, %edi # ^d | 1145 | xorl %ebx, %edi # ^d |
1012 | xorl %ebp, %edi # ^b | 1146 | xorl %ebp, %edi # ^b |
1013 | leal -0x359D3E2A(%rcx,%r14), %ecx # e += RCONST + W[n & 15] | 1147 | addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] |
1014 | addl %edi, %ecx # e += (c ^ d ^ b) | 1148 | addl %edi, %ecx # e += (c ^ d ^ b) |
1015 | movl %edx, %esi # | 1149 | movl %edx, %esi # |
1016 | roll $5, %esi # rotl32(a,5) | 1150 | roll $5, %esi # rotl32(a,5) |
1017 | addl %esi, %ecx # e += rotl32(a,5) | 1151 | addl %esi, %ecx # e += rotl32(a,5) |
1018 | rorl $2, %ebp # b = rotl32(b,30) | 1152 | rorl $2, %ebp # b = rotl32(b,30) |
1019 | # 63 | 1153 | # 63 |
1020 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
1021 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
1022 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
1023 | roll %r15d # | ||
1024 | movl %ebp, %edi # c | 1154 | movl %ebp, %edi # c |
1025 | xorl %eax, %edi # ^d | 1155 | xorl %eax, %edi # ^d |
1026 | xorl %edx, %edi # ^b | 1156 | xorl %edx, %edi # ^b |
1027 | leal -0x359D3E2A(%rbx,%r15), %ebx # e += RCONST + W[n & 15] | 1157 | addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] |
1028 | addl %edi, %ebx # e += (c ^ d ^ b) | 1158 | addl %edi, %ebx # e += (c ^ d ^ b) |
1029 | movl %ecx, %esi # | 1159 | movl %ecx, %esi # |
1030 | roll $5, %esi # rotl32(a,5) | 1160 | roll $5, %esi # rotl32(a,5) |
1031 | addl %esi, %ebx # e += rotl32(a,5) | 1161 | addl %esi, %ebx # e += rotl32(a,5) |
1032 | rorl $2, %edx # b = rotl32(b,30) | 1162 | rorl $2, %edx # b = rotl32(b,30) |
1163 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
1164 | movaps %xmm1, %xmm4 | ||
1165 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1166 | pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1167 | punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1168 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1169 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1170 | xorps %xmm5, %xmm2 # ^ | ||
1171 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1172 | movaps %xmm2, %xmm5 | ||
1173 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1174 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1175 | paddd %xmm2, %xmm2 # shift left by 1 | ||
1176 | psubd %xmm4, %xmm2 # add 1 to elements that had msb set | ||
1177 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1178 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1179 | movaps %xmm5, %xmm4 | ||
1180 | pslld $2, %xmm5 | ||
1181 | psrld $30, %xmm4 | ||
1182 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1183 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
1184 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1185 | movaps %xmm2, %xmm5 | ||
1186 | paddd %xmm6, %xmm5 | ||
1187 | movups %xmm5, -64+16*2(%rsp) | ||
1033 | # 64 | 1188 | # 64 |
1034 | movl %r13d, %esi # W[(n+13) & 15] | ||
1035 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
1036 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
1037 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
1038 | roll %esi # | ||
1039 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
1040 | movl %edx, %edi # c | 1189 | movl %edx, %edi # c |
1041 | xorl %ebp, %edi # ^d | 1190 | xorl %ebp, %edi # ^d |
1042 | xorl %ecx, %edi # ^b | 1191 | xorl %ecx, %edi # ^b |
1043 | leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 1192 | addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] |
1044 | addl %edi, %eax # e += (c ^ d ^ b) | 1193 | addl %edi, %eax # e += (c ^ d ^ b) |
1045 | movl %ebx, %esi # | 1194 | movl %ebx, %esi # |
1046 | roll $5, %esi # rotl32(a,5) | 1195 | roll $5, %esi # rotl32(a,5) |
1047 | addl %esi, %eax # e += rotl32(a,5) | 1196 | addl %esi, %eax # e += rotl32(a,5) |
1048 | rorl $2, %ecx # b = rotl32(b,30) | 1197 | rorl $2, %ecx # b = rotl32(b,30) |
1049 | # 65 | 1198 | # 65 |
1050 | movl %r14d, %esi # W[(n+13) & 15] | ||
1051 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
1052 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
1053 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
1054 | roll %esi # | ||
1055 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
1056 | movl %ecx, %edi # c | 1199 | movl %ecx, %edi # c |
1057 | xorl %edx, %edi # ^d | 1200 | xorl %edx, %edi # ^d |
1058 | xorl %ebx, %edi # ^b | 1201 | xorl %ebx, %edi # ^b |
1059 | leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 1202 | addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] |
1060 | addl %edi, %ebp # e += (c ^ d ^ b) | 1203 | addl %edi, %ebp # e += (c ^ d ^ b) |
1061 | movl %eax, %esi # | 1204 | movl %eax, %esi # |
1062 | roll $5, %esi # rotl32(a,5) | 1205 | roll $5, %esi # rotl32(a,5) |
1063 | addl %esi, %ebp # e += rotl32(a,5) | 1206 | addl %esi, %ebp # e += rotl32(a,5) |
1064 | rorl $2, %ebx # b = rotl32(b,30) | 1207 | rorl $2, %ebx # b = rotl32(b,30) |
1065 | # 66 | 1208 | # 66 |
1066 | movl %r15d, %esi # W[(n+13) & 15] | ||
1067 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
1068 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
1069 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
1070 | roll %esi # | ||
1071 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
1072 | movl %ebx, %edi # c | 1209 | movl %ebx, %edi # c |
1073 | xorl %ecx, %edi # ^d | 1210 | xorl %ecx, %edi # ^d |
1074 | xorl %eax, %edi # ^b | 1211 | xorl %eax, %edi # ^b |
1075 | leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 1212 | addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] |
1076 | addl %edi, %edx # e += (c ^ d ^ b) | 1213 | addl %edi, %edx # e += (c ^ d ^ b) |
1077 | movl %ebp, %esi # | 1214 | movl %ebp, %esi # |
1078 | roll $5, %esi # rotl32(a,5) | 1215 | roll $5, %esi # rotl32(a,5) |
1079 | addl %esi, %edx # e += rotl32(a,5) | 1216 | addl %esi, %edx # e += rotl32(a,5) |
1080 | rorl $2, %eax # b = rotl32(b,30) | 1217 | rorl $2, %eax # b = rotl32(b,30) |
1081 | # 67 | 1218 | # 67 |
1082 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
1083 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
1084 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
1085 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
1086 | roll %esi # | ||
1087 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
1088 | movl %eax, %edi # c | 1219 | movl %eax, %edi # c |
1089 | xorl %ebx, %edi # ^d | 1220 | xorl %ebx, %edi # ^d |
1090 | xorl %ebp, %edi # ^b | 1221 | xorl %ebp, %edi # ^b |
1091 | leal -0x359D3E2A(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 1222 | addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] |
1092 | addl %edi, %ecx # e += (c ^ d ^ b) | 1223 | addl %edi, %ecx # e += (c ^ d ^ b) |
1093 | movl %edx, %esi # | 1224 | movl %edx, %esi # |
1094 | roll $5, %esi # rotl32(a,5) | 1225 | roll $5, %esi # rotl32(a,5) |
1095 | addl %esi, %ecx # e += rotl32(a,5) | 1226 | addl %esi, %ecx # e += rotl32(a,5) |
1096 | rorl $2, %ebp # b = rotl32(b,30) | 1227 | rorl $2, %ebp # b = rotl32(b,30) |
1228 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
1229 | movaps %xmm2, %xmm4 | ||
1230 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1231 | pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1232 | punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1233 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1234 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1235 | xorps %xmm5, %xmm3 # ^ | ||
1236 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1237 | movaps %xmm3, %xmm5 | ||
1238 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1239 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1240 | paddd %xmm3, %xmm3 # shift left by 1 | ||
1241 | psubd %xmm4, %xmm3 # add 1 to elements that had msb set | ||
1242 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1243 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1244 | movaps %xmm5, %xmm4 | ||
1245 | pslld $2, %xmm5 | ||
1246 | psrld $30, %xmm4 | ||
1247 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1248 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
1249 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1250 | movaps %xmm3, %xmm5 | ||
1251 | paddd %xmm6, %xmm5 | ||
1252 | movups %xmm5, -64+16*3(%rsp) | ||
1097 | # 68 | 1253 | # 68 |
1098 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
1099 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
1100 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
1101 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
1102 | roll %esi # | ||
1103 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
1104 | movl %ebp, %edi # c | 1254 | movl %ebp, %edi # c |
1105 | xorl %eax, %edi # ^d | 1255 | xorl %eax, %edi # ^d |
1106 | xorl %edx, %edi # ^b | 1256 | xorl %edx, %edi # ^b |
1107 | leal -0x359D3E2A(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 1257 | addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] |
1108 | addl %edi, %ebx # e += (c ^ d ^ b) | 1258 | addl %edi, %ebx # e += (c ^ d ^ b) |
1109 | movl %ecx, %esi # | 1259 | movl %ecx, %esi # |
1110 | roll $5, %esi # rotl32(a,5) | 1260 | roll $5, %esi # rotl32(a,5) |
1111 | addl %esi, %ebx # e += rotl32(a,5) | 1261 | addl %esi, %ebx # e += rotl32(a,5) |
1112 | rorl $2, %edx # b = rotl32(b,30) | 1262 | rorl $2, %edx # b = rotl32(b,30) |
1113 | # 69 | 1263 | # 69 |
1114 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
1115 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
1116 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
1117 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
1118 | roll %esi # | ||
1119 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
1120 | movl %edx, %edi # c | 1264 | movl %edx, %edi # c |
1121 | xorl %ebp, %edi # ^d | 1265 | xorl %ebp, %edi # ^d |
1122 | xorl %ecx, %edi # ^b | 1266 | xorl %ecx, %edi # ^b |
1123 | leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 1267 | addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] |
1124 | addl %edi, %eax # e += (c ^ d ^ b) | 1268 | addl %edi, %eax # e += (c ^ d ^ b) |
1125 | movl %ebx, %esi # | 1269 | movl %ebx, %esi # |
1126 | roll $5, %esi # rotl32(a,5) | 1270 | roll $5, %esi # rotl32(a,5) |
1127 | addl %esi, %eax # e += rotl32(a,5) | 1271 | addl %esi, %eax # e += rotl32(a,5) |
1128 | rorl $2, %ecx # b = rotl32(b,30) | 1272 | rorl $2, %ecx # b = rotl32(b,30) |
1129 | # 70 | 1273 | # 70 |
1130 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
1131 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
1132 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
1133 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
1134 | roll %esi # | ||
1135 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
1136 | movl %ecx, %edi # c | 1274 | movl %ecx, %edi # c |
1137 | xorl %edx, %edi # ^d | 1275 | xorl %edx, %edi # ^d |
1138 | xorl %ebx, %edi # ^b | 1276 | xorl %ebx, %edi # ^b |
1139 | leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 1277 | addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] |
1140 | addl %edi, %ebp # e += (c ^ d ^ b) | 1278 | addl %edi, %ebp # e += (c ^ d ^ b) |
1141 | movl %eax, %esi # | 1279 | movl %eax, %esi # |
1142 | roll $5, %esi # rotl32(a,5) | 1280 | roll $5, %esi # rotl32(a,5) |
1143 | addl %esi, %ebp # e += rotl32(a,5) | 1281 | addl %esi, %ebp # e += rotl32(a,5) |
1144 | rorl $2, %ebx # b = rotl32(b,30) | 1282 | rorl $2, %ebx # b = rotl32(b,30) |
1145 | # 71 | 1283 | # 71 |
1146 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
1147 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
1148 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
1149 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
1150 | roll %esi # | ||
1151 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
1152 | movl %ebx, %edi # c | 1284 | movl %ebx, %edi # c |
1153 | xorl %ecx, %edi # ^d | 1285 | xorl %ecx, %edi # ^d |
1154 | xorl %eax, %edi # ^b | 1286 | xorl %eax, %edi # ^b |
1155 | leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 1287 | addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] |
1156 | addl %edi, %edx # e += (c ^ d ^ b) | 1288 | addl %edi, %edx # e += (c ^ d ^ b) |
1157 | movl %ebp, %esi # | 1289 | movl %ebp, %esi # |
1158 | roll $5, %esi # rotl32(a,5) | 1290 | roll $5, %esi # rotl32(a,5) |
1159 | addl %esi, %edx # e += rotl32(a,5) | 1291 | addl %esi, %edx # e += rotl32(a,5) |
1160 | rorl $2, %eax # b = rotl32(b,30) | 1292 | rorl $2, %eax # b = rotl32(b,30) |
1161 | # 72 | 1293 | # 72 |
1162 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
1163 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
1164 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
1165 | roll %r8d # | ||
1166 | movl %eax, %edi # c | 1294 | movl %eax, %edi # c |
1167 | xorl %ebx, %edi # ^d | 1295 | xorl %ebx, %edi # ^d |
1168 | xorl %ebp, %edi # ^b | 1296 | xorl %ebp, %edi # ^b |
1169 | leal -0x359D3E2A(%rcx,%r8), %ecx # e += RCONST + W[n & 15] | 1297 | addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] |
1170 | addl %edi, %ecx # e += (c ^ d ^ b) | 1298 | addl %edi, %ecx # e += (c ^ d ^ b) |
1171 | movl %edx, %esi # | 1299 | movl %edx, %esi # |
1172 | roll $5, %esi # rotl32(a,5) | 1300 | roll $5, %esi # rotl32(a,5) |
1173 | addl %esi, %ecx # e += rotl32(a,5) | 1301 | addl %esi, %ecx # e += rotl32(a,5) |
1174 | rorl $2, %ebp # b = rotl32(b,30) | 1302 | rorl $2, %ebp # b = rotl32(b,30) |
1175 | # 73 | 1303 | # 73 |
1176 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
1177 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
1178 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
1179 | roll %r9d # | ||
1180 | movl %ebp, %edi # c | 1304 | movl %ebp, %edi # c |
1181 | xorl %eax, %edi # ^d | 1305 | xorl %eax, %edi # ^d |
1182 | xorl %edx, %edi # ^b | 1306 | xorl %edx, %edi # ^b |
1183 | leal -0x359D3E2A(%rbx,%r9), %ebx # e += RCONST + W[n & 15] | 1307 | addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] |
1184 | addl %edi, %ebx # e += (c ^ d ^ b) | 1308 | addl %edi, %ebx # e += (c ^ d ^ b) |
1185 | movl %ecx, %esi # | 1309 | movl %ecx, %esi # |
1186 | roll $5, %esi # rotl32(a,5) | 1310 | roll $5, %esi # rotl32(a,5) |
1187 | addl %esi, %ebx # e += rotl32(a,5) | 1311 | addl %esi, %ebx # e += rotl32(a,5) |
1188 | rorl $2, %edx # b = rotl32(b,30) | 1312 | rorl $2, %edx # b = rotl32(b,30) |
1189 | # 74 | 1313 | # 74 |
1190 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
1191 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
1192 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
1193 | roll %r10d # | ||
1194 | movl %edx, %edi # c | 1314 | movl %edx, %edi # c |
1195 | xorl %ebp, %edi # ^d | 1315 | xorl %ebp, %edi # ^d |
1196 | xorl %ecx, %edi # ^b | 1316 | xorl %ecx, %edi # ^b |
1197 | leal -0x359D3E2A(%rax,%r10), %eax # e += RCONST + W[n & 15] | 1317 | addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] |
1198 | addl %edi, %eax # e += (c ^ d ^ b) | 1318 | addl %edi, %eax # e += (c ^ d ^ b) |
1199 | movl %ebx, %esi # | 1319 | movl %ebx, %esi # |
1200 | roll $5, %esi # rotl32(a,5) | 1320 | roll $5, %esi # rotl32(a,5) |
1201 | addl %esi, %eax # e += rotl32(a,5) | 1321 | addl %esi, %eax # e += rotl32(a,5) |
1202 | rorl $2, %ecx # b = rotl32(b,30) | 1322 | rorl $2, %ecx # b = rotl32(b,30) |
1203 | # 75 | 1323 | # 75 |
1204 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
1205 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
1206 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
1207 | roll %r11d # | ||
1208 | movl %ecx, %edi # c | 1324 | movl %ecx, %edi # c |
1209 | xorl %edx, %edi # ^d | 1325 | xorl %edx, %edi # ^d |
1210 | xorl %ebx, %edi # ^b | 1326 | xorl %ebx, %edi # ^b |
1211 | leal -0x359D3E2A(%rbp,%r11), %ebp # e += RCONST + W[n & 15] | 1327 | addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] |
1212 | addl %edi, %ebp # e += (c ^ d ^ b) | 1328 | addl %edi, %ebp # e += (c ^ d ^ b) |
1213 | movl %eax, %esi # | 1329 | movl %eax, %esi # |
1214 | roll $5, %esi # rotl32(a,5) | 1330 | roll $5, %esi # rotl32(a,5) |
1215 | addl %esi, %ebp # e += rotl32(a,5) | 1331 | addl %esi, %ebp # e += rotl32(a,5) |
1216 | rorl $2, %ebx # b = rotl32(b,30) | 1332 | rorl $2, %ebx # b = rotl32(b,30) |
1217 | # 76 | 1333 | # 76 |
1218 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
1219 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
1220 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
1221 | roll %r12d # | ||
1222 | movl %ebx, %edi # c | 1334 | movl %ebx, %edi # c |
1223 | xorl %ecx, %edi # ^d | 1335 | xorl %ecx, %edi # ^d |
1224 | xorl %eax, %edi # ^b | 1336 | xorl %eax, %edi # ^b |
1225 | leal -0x359D3E2A(%rdx,%r12), %edx # e += RCONST + W[n & 15] | 1337 | addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] |
1226 | addl %edi, %edx # e += (c ^ d ^ b) | 1338 | addl %edi, %edx # e += (c ^ d ^ b) |
1227 | movl %ebp, %esi # | 1339 | movl %ebp, %esi # |
1228 | roll $5, %esi # rotl32(a,5) | 1340 | roll $5, %esi # rotl32(a,5) |
1229 | addl %esi, %edx # e += rotl32(a,5) | 1341 | addl %esi, %edx # e += rotl32(a,5) |
1230 | rorl $2, %eax # b = rotl32(b,30) | 1342 | rorl $2, %eax # b = rotl32(b,30) |
1231 | # 77 | 1343 | # 77 |
1232 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
1233 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
1234 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
1235 | roll %r13d # | ||
1236 | movl %eax, %edi # c | 1344 | movl %eax, %edi # c |
1237 | xorl %ebx, %edi # ^d | 1345 | xorl %ebx, %edi # ^d |
1238 | xorl %ebp, %edi # ^b | 1346 | xorl %ebp, %edi # ^b |
1239 | leal -0x359D3E2A(%rcx,%r13), %ecx # e += RCONST + W[n & 15] | 1347 | addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] |
1240 | addl %edi, %ecx # e += (c ^ d ^ b) | 1348 | addl %edi, %ecx # e += (c ^ d ^ b) |
1241 | movl %edx, %esi # | 1349 | movl %edx, %esi # |
1242 | roll $5, %esi # rotl32(a,5) | 1350 | roll $5, %esi # rotl32(a,5) |
1243 | addl %esi, %ecx # e += rotl32(a,5) | 1351 | addl %esi, %ecx # e += rotl32(a,5) |
1244 | rorl $2, %ebp # b = rotl32(b,30) | 1352 | rorl $2, %ebp # b = rotl32(b,30) |
1245 | # 78 | 1353 | # 78 |
1246 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
1247 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
1248 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
1249 | roll %r14d # | ||
1250 | movl %ebp, %edi # c | 1354 | movl %ebp, %edi # c |
1251 | xorl %eax, %edi # ^d | 1355 | xorl %eax, %edi # ^d |
1252 | xorl %edx, %edi # ^b | 1356 | xorl %edx, %edi # ^b |
1253 | leal -0x359D3E2A(%rbx,%r14), %ebx # e += RCONST + W[n & 15] | 1357 | addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] |
1254 | addl %edi, %ebx # e += (c ^ d ^ b) | 1358 | addl %edi, %ebx # e += (c ^ d ^ b) |
1255 | movl %ecx, %esi # | 1359 | movl %ecx, %esi # |
1256 | roll $5, %esi # rotl32(a,5) | 1360 | roll $5, %esi # rotl32(a,5) |
1257 | addl %esi, %ebx # e += rotl32(a,5) | 1361 | addl %esi, %ebx # e += rotl32(a,5) |
1258 | rorl $2, %edx # b = rotl32(b,30) | 1362 | rorl $2, %edx # b = rotl32(b,30) |
1259 | # 79 | 1363 | # 79 |
1260 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
1261 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
1262 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
1263 | roll %r15d # | ||
1264 | movl %edx, %edi # c | 1364 | movl %edx, %edi # c |
1265 | xorl %ebp, %edi # ^d | 1365 | xorl %ebp, %edi # ^d |
1266 | xorl %ecx, %edi # ^b | 1366 | xorl %ecx, %edi # ^b |
1267 | leal -0x359D3E2A(%rax,%r15), %eax # e += RCONST + W[n & 15] | 1367 | addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] |
1268 | addl %edi, %eax # e += (c ^ d ^ b) | 1368 | addl %edi, %eax # e += (c ^ d ^ b) |
1269 | movl %ebx, %esi # | 1369 | movl %ebx, %esi # |
1270 | roll $5, %esi # rotl32(a,5) | 1370 | roll $5, %esi # rotl32(a,5) |
@@ -1286,4 +1386,28 @@ sha1_process_block64: | |||
1286 | 1386 | ||
1287 | ret | 1387 | ret |
1288 | .size sha1_process_block64, .-sha1_process_block64 | 1388 | .size sha1_process_block64, .-sha1_process_block64 |
1389 | |||
1390 | .section .rodata.cst16.sha1const, "aM", @progbits, 16 | ||
1391 | .align 16 | ||
1392 | rconst0x5A827999: | ||
1393 | .long 0x5A827999 | ||
1394 | .long 0x5A827999 | ||
1395 | .long 0x5A827999 | ||
1396 | .long 0x5A827999 | ||
1397 | rconst0x6ED9EBA1: | ||
1398 | .long 0x6ED9EBA1 | ||
1399 | .long 0x6ED9EBA1 | ||
1400 | .long 0x6ED9EBA1 | ||
1401 | .long 0x6ED9EBA1 | ||
1402 | rconst0x8F1BBCDC: | ||
1403 | .long 0x8F1BBCDC | ||
1404 | .long 0x8F1BBCDC | ||
1405 | .long 0x8F1BBCDC | ||
1406 | .long 0x8F1BBCDC | ||
1407 | rconst0xCA62C1D6: | ||
1408 | .long 0xCA62C1D6 | ||
1409 | .long 0xCA62C1D6 | ||
1410 | .long 0xCA62C1D6 | ||
1411 | .long 0xCA62C1D6 | ||
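Editor's note on the constant tables: each one replicates a per-range SHA-1 round constant across all four 32-bit lanes so that a single paddd %xmm6 adds K to four W[] words at once, and the "aM" section flags mark the data allocatable and mergeable with a 16-byte entity size. The last two constants are exactly the values the old scalar code encoded as negative leal displacements, and the 0xCA62C1D6 constant is loaded into %xmm6 one PREP block early (right after round 51 above) because the W[]+K slots written by that PREP are only consumed from round 60 on. A small self-checking C note on the constants (purely illustrative):

#include <assert.h>
#include <stdint.h>

static const uint32_t sha1_K[4] = {
	0x5A827999,  /* rounds  0..19 */
	0x6ED9EBA1,  /* rounds 20..39 */
	0x8F1BBCDC,  /* rounds 40..59 */
	0xCA62C1D6,  /* rounds 60..79 */
};

int main(void)
{
	/* the old leal displacements are the same constants, sign-extended */
	assert(sha1_K[2] == (uint32_t)-0x70E44324);
	assert(sha1_K[3] == (uint32_t)-0x359D3E2A);
	return 0;
}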
1412 | |||
1289 | #endif | 1413 | #endif |