-rw-r--r--   libbb/hash_md5_sha_x86-64.S    | 310
-rwxr-xr-x   libbb/hash_md5_sha_x86-64.S.sh | 109
2 files changed, 214 insertions, 205 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 069a18719..743269d98 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -1,7 +1,7 @@ | |||
1 | ### Generated by hash_md5_sha_x86-64.S.sh ### | 1 | ### Generated by hash_md5_sha_x86-64.S.sh ### |
2 | 2 | ||
3 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) | 3 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) |
4 | .section .text.sha1_process_block64,"ax",@progbits | 4 | .section .text.sha1_process_block64, "ax", @progbits |
5 | .globl sha1_process_block64 | 5 | .globl sha1_process_block64 |
6 | .hidden sha1_process_block64 | 6 | .hidden sha1_process_block64 |
7 | .type sha1_process_block64, @function | 7 | .type sha1_process_block64, @function |
@@ -10,7 +10,7 @@ | |||
10 | sha1_process_block64: | 10 | sha1_process_block64: |
11 | pushq %rbp # 1 byte insn | 11 | pushq %rbp # 1 byte insn |
12 | pushq %rbx # 1 byte insn | 12 | pushq %rbx # 1 byte insn |
13 | pushq %r15 # 2 byte insn | 13 | # pushq %r15 # 2 byte insn |
14 | pushq %r14 # 2 byte insn | 14 | pushq %r14 # 2 byte insn |
15 | pushq %r13 # 2 byte insn | 15 | pushq %r13 # 2 byte insn |
16 | pushq %r12 # 2 byte insn | 16 | pushq %r12 # 2 byte insn |
@@ -19,7 +19,8 @@ sha1_process_block64: | |||
19 | #Register and stack use: | 19 | #Register and stack use: |
20 | # eax..edx: a..d | 20 | # eax..edx: a..d |
21 | # ebp: e | 21 | # ebp: e |
22 | # esi,edi: temps | 22 | # esi,edi,r8..r14: temps |
23 | # r15: unused | ||
23 | # xmm0..xmm3: W[] | 24 | # xmm0..xmm3: W[] |
24 | # xmm4,xmm5: temps | 25 | # xmm4,xmm5: temps |
25 | # xmm6: current round constant | 26 | # xmm6: current round constant |
@@ -33,147 +34,148 @@ sha1_process_block64: | |||
33 | 34 | ||
34 | movaps rconst0x5A827999(%rip), %xmm6 | 35 | movaps rconst0x5A827999(%rip), %xmm6 |
35 | 36 | ||
36 | # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 | 37 | # Load W[] to xmm registers, byteswapping on the fly. |
37 | # instead of spilling them to stack. | 38 | # |
38 | # (We lose parallelized addition of RCONST, but LEA | 39 | # For iterations 0..15, we pass W[] in rsi,r8..r14 |
39 | # can do two additions at once, so...) | 40 | # for use in RD1A's instead of spilling them to stack. |
41 | # We lose parallelized addition of RCONST, but LEA | ||
42 | # can do two additions at once, so it's probably a wash. | ||
43 | # (We use rsi instead of rN because this makes two | ||
44 | # LEAs in two first RD1A's shorter by one byte). | ||
40 | movq 4*0(%rdi), %rsi | 45 | movq 4*0(%rdi), %rsi |
41 | movq 4*2(%rdi), %r10 | 46 | movq 4*2(%rdi), %r8 |
42 | bswapq %rsi | 47 | bswapq %rsi |
43 | bswapq %r10 | 48 | bswapq %r8 |
44 | rolq $32, %rsi # rsi = W[1]:W[0] | 49 | rolq $32, %rsi # rsi = W[1]:W[0] |
45 | rolq $32, %r10 | 50 | rolq $32, %r8 # r8 = W[3]:W[2] |
46 | movq %rsi, %xmm0 | 51 | movq %rsi, %xmm0 |
47 | movq %r10, %xmm4 | 52 | movq %r8, %xmm4 |
48 | punpcklqdq %xmm4, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) | 53 | punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) |
49 | movaps %xmm0, %xmm4 | 54 | # movaps %xmm0, %xmm4 # add RCONST, spill to stack |
50 | paddd %xmm6, %xmm4 | 55 | # paddd %xmm6, %xmm4 |
51 | movups %xmm4, -64+4*0(%rsp) | 56 | # movups %xmm4, -64+16*0(%rsp) |
52 | 57 | ||
53 | movq 4*4(%rdi), %r8 | 58 | movq 4*4(%rdi), %r9 |
54 | movq 4*6(%rdi), %r10 | 59 | movq 4*6(%rdi), %r10 |
55 | bswapq %r8 | 60 | bswapq %r9 |
56 | bswapq %r10 | 61 | bswapq %r10 |
57 | rolq $32, %r8 | 62 | rolq $32, %r9 # r9 = W[5]:W[4] |
58 | rolq $32, %r10 | 63 | rolq $32, %r10 # r10 = W[7]:W[6] |
59 | movq %r8, %xmm1 | 64 | movq %r9, %xmm1 |
60 | movq %r10, %xmm4 | 65 | movq %r10, %xmm4 |
61 | punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) | 66 | punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) |
62 | movaps %xmm1, %xmm4 | ||
63 | paddd %xmm6, %xmm4 | ||
64 | movups %xmm4, -64+4*4(%rsp) | ||
65 | 67 | ||
66 | movq 4*8(%rdi), %r8 | 68 | movq 4*8(%rdi), %r11 |
67 | movq 4*10(%rdi), %r10 | 69 | movq 4*10(%rdi), %r12 |
68 | bswapq %r8 | 70 | bswapq %r11 |
69 | bswapq %r10 | 71 | bswapq %r12 |
70 | movl %r8d, %r9d # r9d = W[9] | 72 | rolq $32, %r11 # r11 = W[9]:W[8] |
71 | rolq $32, %r8 # r8 = W[9]:W[8] | 73 | rolq $32, %r12 # r12 = W[11]:W[10] |
72 | movl %r10d, %r11d # r11d = W[11] | 74 | movq %r11, %xmm2 |
73 | rolq $32, %r10 # r10 = W[11]:W[10] | 75 | movq %r12, %xmm4 |
74 | movq %r8, %xmm2 | 76 | punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) |
75 | movq %r10, %xmm4 | ||
76 | punpcklqdq %xmm4, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) | ||
77 | 77 | ||
78 | movq 4*12(%rdi), %r12 | 78 | movq 4*12(%rdi), %r13 |
79 | movq 4*14(%rdi), %r14 | 79 | movq 4*14(%rdi), %r14 |
80 | bswapq %r12 | 80 | bswapq %r13 |
81 | bswapq %r14 | 81 | bswapq %r14 |
82 | movl %r12d, %r13d # r13d = W[13] | 82 | rolq $32, %r13 # r13 = W[13]:W[12] |
83 | rolq $32, %r12 # r12 = W[13]:W[12] | ||
84 | movl %r14d, %r15d # r15d = W[15] | ||
85 | rolq $32, %r14 # r14 = W[15]:W[14] | 83 | rolq $32, %r14 # r14 = W[15]:W[14] |
86 | movq %r12, %xmm3 | 84 | movq %r13, %xmm3 |
87 | movq %r14, %xmm4 | 85 | movq %r14, %xmm4 |
88 | punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) | 86 | punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) |
89 | 87 | ||
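The loader above reads two big-endian 32-bit message words per 64-bit load, byteswaps them, and rotates by 32 so the even-indexed word lands in the low half (rsi = W[1]:W[0], r8 = W[3]:W[2], and so on), then packs pairs of quadwords into xmm0..xmm3 with punpcklqdq. A minimal C sketch of the scalar part of that transformation, assuming GCC/Clang's __builtin_bswap64 (load_w_pair is a hypothetical name):

    #include <stdint.h>
    #include <string.h>

    /* Mirror of "movq; bswapq; rolq $32": returns W[2i+1]:W[2i]
     * (odd word in the high 32 bits, even word in the low 32 bits). */
    static uint64_t load_w_pair(const uint8_t *p /* &block[8*i] */)
    {
        uint64_t v;
        memcpy(&v, p, 8);                  /* movq  8*i(%rdi), %reg     */
        v = __builtin_bswap64(v);          /* bswapq: W[2i]:W[2i+1]     */
        return (v << 32) | (v >> 32);      /* rolq $32: W[2i+1]:W[2i]   */
    }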
90 | # 0 | 88 | # 0 |
91 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] | 89 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] |
90 | shrq $32, %rsi | ||
92 | movl %ecx, %edi # c | 91 | movl %ecx, %edi # c |
93 | xorl %edx, %edi # ^d | 92 | xorl %edx, %edi # ^d |
94 | andl %ebx, %edi # &b | 93 | andl %ebx, %edi # &b |
95 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 94 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
96 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 95 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
97 | movl %eax, %esi # | 96 | movl %eax, %edi # |
98 | roll $5, %esi # rotl32(a,5) | 97 | roll $5, %edi # rotl32(a,5) |
99 | addl %esi, %ebp # e += rotl32(a,5) | 98 | addl %edi, %ebp # e += rotl32(a,5) |
100 | rorl $2, %ebx # b = rotl32(b,30) | 99 | rorl $2, %ebx # b = rotl32(b,30) |
101 | # 1 | 100 | # 1 |
102 | addl -64+4*1(%rsp), %edx # e += RCONST + W[n] | 101 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] |
103 | movl %ebx, %edi # c | 102 | movl %ebx, %edi # c |
104 | xorl %ecx, %edi # ^d | 103 | xorl %ecx, %edi # ^d |
105 | andl %eax, %edi # &b | 104 | andl %eax, %edi # &b |
106 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 105 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
107 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 106 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
108 | movl %ebp, %esi # | 107 | movl %ebp, %edi # |
109 | roll $5, %esi # rotl32(a,5) | 108 | roll $5, %edi # rotl32(a,5) |
110 | addl %esi, %edx # e += rotl32(a,5) | 109 | addl %edi, %edx # e += rotl32(a,5) |
111 | rorl $2, %eax # b = rotl32(b,30) | 110 | rorl $2, %eax # b = rotl32(b,30) |
112 | # 2 | 111 | # 2 |
113 | addl -64+4*2(%rsp), %ecx # e += RCONST + W[n] | 112 | leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] |
113 | shrq $32, %r8 | ||
114 | movl %eax, %edi # c | 114 | movl %eax, %edi # c |
115 | xorl %ebx, %edi # ^d | 115 | xorl %ebx, %edi # ^d |
116 | andl %ebp, %edi # &b | 116 | andl %ebp, %edi # &b |
117 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 117 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
118 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 118 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
119 | movl %edx, %esi # | 119 | movl %edx, %edi # |
120 | roll $5, %esi # rotl32(a,5) | 120 | roll $5, %edi # rotl32(a,5) |
121 | addl %esi, %ecx # e += rotl32(a,5) | 121 | addl %edi, %ecx # e += rotl32(a,5) |
122 | rorl $2, %ebp # b = rotl32(b,30) | 122 | rorl $2, %ebp # b = rotl32(b,30) |
123 | # 3 | 123 | # 3 |
124 | addl -64+4*3(%rsp), %ebx # e += RCONST + W[n] | 124 | leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] |
125 | movl %ebp, %edi # c | 125 | movl %ebp, %edi # c |
126 | xorl %eax, %edi # ^d | 126 | xorl %eax, %edi # ^d |
127 | andl %edx, %edi # &b | 127 | andl %edx, %edi # &b |
128 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 128 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
129 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 129 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
130 | movl %ecx, %esi # | 130 | movl %ecx, %edi # |
131 | roll $5, %esi # rotl32(a,5) | 131 | roll $5, %edi # rotl32(a,5) |
132 | addl %esi, %ebx # e += rotl32(a,5) | 132 | addl %edi, %ebx # e += rotl32(a,5) |
133 | rorl $2, %edx # b = rotl32(b,30) | 133 | rorl $2, %edx # b = rotl32(b,30) |
134 | # 4 | 134 | # 4 |
135 | addl -64+4*4(%rsp), %eax # e += RCONST + W[n] | 135 | leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] |
136 | shrq $32, %r9 | ||
136 | movl %edx, %edi # c | 137 | movl %edx, %edi # c |
137 | xorl %ebp, %edi # ^d | 138 | xorl %ebp, %edi # ^d |
138 | andl %ecx, %edi # &b | 139 | andl %ecx, %edi # &b |
139 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 140 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
140 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 141 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
141 | movl %ebx, %esi # | 142 | movl %ebx, %edi # |
142 | roll $5, %esi # rotl32(a,5) | 143 | roll $5, %edi # rotl32(a,5) |
143 | addl %esi, %eax # e += rotl32(a,5) | 144 | addl %edi, %eax # e += rotl32(a,5) |
144 | rorl $2, %ecx # b = rotl32(b,30) | 145 | rorl $2, %ecx # b = rotl32(b,30) |
145 | # 5 | 146 | # 5 |
146 | addl -64+4*5(%rsp), %ebp # e += RCONST + W[n] | 147 | leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] |
147 | movl %ecx, %edi # c | 148 | movl %ecx, %edi # c |
148 | xorl %edx, %edi # ^d | 149 | xorl %edx, %edi # ^d |
149 | andl %ebx, %edi # &b | 150 | andl %ebx, %edi # &b |
150 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 151 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
151 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 152 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
152 | movl %eax, %esi # | 153 | movl %eax, %edi # |
153 | roll $5, %esi # rotl32(a,5) | 154 | roll $5, %edi # rotl32(a,5) |
154 | addl %esi, %ebp # e += rotl32(a,5) | 155 | addl %edi, %ebp # e += rotl32(a,5) |
155 | rorl $2, %ebx # b = rotl32(b,30) | 156 | rorl $2, %ebx # b = rotl32(b,30) |
156 | # 6 | 157 | # 6 |
157 | addl -64+4*6(%rsp), %edx # e += RCONST + W[n] | 158 | leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] |
159 | shrq $32, %r10 | ||
158 | movl %ebx, %edi # c | 160 | movl %ebx, %edi # c |
159 | xorl %ecx, %edi # ^d | 161 | xorl %ecx, %edi # ^d |
160 | andl %eax, %edi # &b | 162 | andl %eax, %edi # &b |
161 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 163 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
162 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 164 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
163 | movl %ebp, %esi # | 165 | movl %ebp, %edi # |
164 | roll $5, %esi # rotl32(a,5) | 166 | roll $5, %edi # rotl32(a,5) |
165 | addl %esi, %edx # e += rotl32(a,5) | 167 | addl %edi, %edx # e += rotl32(a,5) |
166 | rorl $2, %eax # b = rotl32(b,30) | 168 | rorl $2, %eax # b = rotl32(b,30) |
167 | # 7 | 169 | # 7 |
168 | addl -64+4*7(%rsp), %ecx # e += RCONST + W[n] | 170 | leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] |
169 | movl %eax, %edi # c | 171 | movl %eax, %edi # c |
170 | xorl %ebx, %edi # ^d | 172 | xorl %ebx, %edi # ^d |
171 | andl %ebp, %edi # &b | 173 | andl %ebp, %edi # &b |
172 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 174 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
173 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 175 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
174 | movl %edx, %esi # | 176 | movl %edx, %edi # |
175 | roll $5, %esi # rotl32(a,5) | 177 | roll $5, %edi # rotl32(a,5) |
176 | addl %esi, %ecx # e += rotl32(a,5) | 178 | addl %edi, %ecx # e += rotl32(a,5) |
177 | rorl $2, %ebp # b = rotl32(b,30) | 179 | rorl $2, %ebp # b = rotl32(b,30) |
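Each numbered block above is one of the first 16 SHA-1 rounds (the generator's RD1A); the change in this hunk is that W[n] now arrives in a register and is folded in via LEA instead of an add from the stack, and esi is freed up for W[] by using edi for both temporaries. In C terms one such round computes roughly the following (rd1a is a hypothetical name; the rotation of the a..e roles between rounds is done by the unrolled code itself):

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, unsigned n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* One round-1 step, matching the per-line comments above. */
    static void rd1a(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                     uint32_t *e, uint32_t w)
    {
        *e += 0x5A827999 + w;          /* e += RCONST + W[n]        */
        *e += ((c ^ d) & *b) ^ d;      /* e += (((c ^ d) & b) ^ d)  */
        *e += rotl32(a, 5);            /* e += rotl32(a,5)          */
        *b = rotl32(*b, 30);           /* b = rotl32(b,30)          */
    }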
178 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | 180 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) |
179 | movaps %xmm3, %xmm4 | 181 | movaps %xmm3, %xmm4 |
@@ -186,9 +188,9 @@ sha1_process_block64: | |||
186 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 188 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
187 | movaps %xmm0, %xmm5 | 189 | movaps %xmm0, %xmm5 |
188 | xorps %xmm4, %xmm4 # rol(W0,1): | 190 | xorps %xmm4, %xmm4 # rol(W0,1): |
189 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 191 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
190 | paddd %xmm0, %xmm0 # shift left by 1 | 192 | paddd %xmm0, %xmm0 # shift left by 1 |
191 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | 193 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 |
192 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 194 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
193 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 195 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
194 | movaps %xmm5, %xmm4 | 196 | movaps %xmm5, %xmm4 |
@@ -201,48 +203,50 @@ sha1_process_block64: | |||
201 | paddd %xmm6, %xmm5 | 203 | paddd %xmm6, %xmm5 |
202 | movups %xmm5, -64+16*0(%rsp) | 204 | movups %xmm5, -64+16*0(%rsp) |
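The PREP block computes the next four W[] values with SSE2 and stores W+RCONST to the stack for later rounds; the pcmpgtd/paddd/psubd triple is a branch-free per-lane rotate-left-by-1, since SSE2 has no packed rotate instruction. A sketch of just that trick with intrinsics (rol1_epi32 is a hypothetical name):

    #include <emmintrin.h>   /* SSE2 */

    /* Rotate every 32-bit lane of x left by one bit:
     *   msb = 0xffffffff in lanes whose sign bit is set  (pcmpgtd)
     *   x  += x                                          (paddd)
     *   x  -= msb, i.e. add 1 where the msb was set      (psubd)  */
    static __m128i rol1_epi32(__m128i x)
    {
        __m128i msb = _mm_cmpgt_epi32(_mm_setzero_si128(), x);
        x = _mm_add_epi32(x, x);
        return _mm_sub_epi32(x, msb);
    }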
203 | # 8 | 205 | # 8 |
204 | leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] | 206 | leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] |
207 | shrq $32, %r11 | ||
205 | movl %ebp, %edi # c | 208 | movl %ebp, %edi # c |
206 | xorl %eax, %edi # ^d | 209 | xorl %eax, %edi # ^d |
207 | andl %edx, %edi # &b | 210 | andl %edx, %edi # &b |
208 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 211 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
209 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 212 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
210 | movl %ecx, %esi # | 213 | movl %ecx, %edi # |
211 | roll $5, %esi # rotl32(a,5) | 214 | roll $5, %edi # rotl32(a,5) |
212 | addl %esi, %ebx # e += rotl32(a,5) | 215 | addl %edi, %ebx # e += rotl32(a,5) |
213 | rorl $2, %edx # b = rotl32(b,30) | 216 | rorl $2, %edx # b = rotl32(b,30) |
214 | # 9 | 217 | # 9 |
215 | leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] | 218 | leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] |
216 | movl %edx, %edi # c | 219 | movl %edx, %edi # c |
217 | xorl %ebp, %edi # ^d | 220 | xorl %ebp, %edi # ^d |
218 | andl %ecx, %edi # &b | 221 | andl %ecx, %edi # &b |
219 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 222 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
220 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 223 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
221 | movl %ebx, %esi # | 224 | movl %ebx, %edi # |
222 | roll $5, %esi # rotl32(a,5) | 225 | roll $5, %edi # rotl32(a,5) |
223 | addl %esi, %eax # e += rotl32(a,5) | 226 | addl %edi, %eax # e += rotl32(a,5) |
224 | rorl $2, %ecx # b = rotl32(b,30) | 227 | rorl $2, %ecx # b = rotl32(b,30) |
225 | # 10 | 228 | # 10 |
226 | leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] | 229 | leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] |
230 | shrq $32, %r12 | ||
227 | movl %ecx, %edi # c | 231 | movl %ecx, %edi # c |
228 | xorl %edx, %edi # ^d | 232 | xorl %edx, %edi # ^d |
229 | andl %ebx, %edi # &b | 233 | andl %ebx, %edi # &b |
230 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 234 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
231 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 235 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
232 | movl %eax, %esi # | 236 | movl %eax, %edi # |
233 | roll $5, %esi # rotl32(a,5) | 237 | roll $5, %edi # rotl32(a,5) |
234 | addl %esi, %ebp # e += rotl32(a,5) | 238 | addl %edi, %ebp # e += rotl32(a,5) |
235 | rorl $2, %ebx # b = rotl32(b,30) | 239 | rorl $2, %ebx # b = rotl32(b,30) |
236 | # 11 | 240 | # 11 |
237 | leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] | 241 | leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] |
238 | movl %ebx, %edi # c | 242 | movl %ebx, %edi # c |
239 | xorl %ecx, %edi # ^d | 243 | xorl %ecx, %edi # ^d |
240 | andl %eax, %edi # &b | 244 | andl %eax, %edi # &b |
241 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 245 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
242 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 246 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
243 | movl %ebp, %esi # | 247 | movl %ebp, %edi # |
244 | roll $5, %esi # rotl32(a,5) | 248 | roll $5, %edi # rotl32(a,5) |
245 | addl %esi, %edx # e += rotl32(a,5) | 249 | addl %edi, %edx # e += rotl32(a,5) |
246 | rorl $2, %eax # b = rotl32(b,30) | 250 | rorl $2, %eax # b = rotl32(b,30) |
247 | movaps rconst0x6ED9EBA1(%rip), %xmm6 | 251 | movaps rconst0x6ED9EBA1(%rip), %xmm6 |
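xmm6 holds the current round constant (PREP adds it to the W[] values it spills), and it is reloaded here ahead of the rounds that need the next constant. For reference, the four SHA-1 constants and the rounds they cover; only the first two appear in the visible part of this diff:

    #include <stdint.h>

    static const uint32_t sha1_k[4] = {
        0x5A827999,  /* rounds  0..19 */
        0x6ED9EBA1,  /* rounds 20..39 */
        0x8F1BBCDC,  /* rounds 40..59 */
        0xCA62C1D6,  /* rounds 60..79 */
    };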
248 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | 252 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) |
@@ -256,9 +260,9 @@ sha1_process_block64: | |||
256 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 260 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
257 | movaps %xmm1, %xmm5 | 261 | movaps %xmm1, %xmm5 |
258 | xorps %xmm4, %xmm4 # rol(W0,1): | 262 | xorps %xmm4, %xmm4 # rol(W0,1): |
259 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 263 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
260 | paddd %xmm1, %xmm1 # shift left by 1 | 264 | paddd %xmm1, %xmm1 # shift left by 1 |
261 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | 265 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 |
262 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 266 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
263 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 267 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
264 | movaps %xmm5, %xmm4 | 268 | movaps %xmm5, %xmm4 |
@@ -271,15 +275,16 @@ sha1_process_block64: | |||
271 | paddd %xmm6, %xmm5 | 275 | paddd %xmm6, %xmm5 |
272 | movups %xmm5, -64+16*1(%rsp) | 276 | movups %xmm5, -64+16*1(%rsp) |
273 | # 12 | 277 | # 12 |
274 | leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] | 278 | leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] |
279 | shrq $32, %r13 | ||
275 | movl %eax, %edi # c | 280 | movl %eax, %edi # c |
276 | xorl %ebx, %edi # ^d | 281 | xorl %ebx, %edi # ^d |
277 | andl %ebp, %edi # &b | 282 | andl %ebp, %edi # &b |
278 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 283 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
279 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 284 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
280 | movl %edx, %esi # | 285 | movl %edx, %edi # |
281 | roll $5, %esi # rotl32(a,5) | 286 | roll $5, %edi # rotl32(a,5) |
282 | addl %esi, %ecx # e += rotl32(a,5) | 287 | addl %edi, %ecx # e += rotl32(a,5) |
283 | rorl $2, %ebp # b = rotl32(b,30) | 288 | rorl $2, %ebp # b = rotl32(b,30) |
284 | # 13 | 289 | # 13 |
285 | leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] | 290 | leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] |
@@ -288,31 +293,32 @@ sha1_process_block64: | |||
288 | andl %edx, %edi # &b | 293 | andl %edx, %edi # &b |
289 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 294 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
290 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 295 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
291 | movl %ecx, %esi # | 296 | movl %ecx, %edi # |
292 | roll $5, %esi # rotl32(a,5) | 297 | roll $5, %edi # rotl32(a,5) |
293 | addl %esi, %ebx # e += rotl32(a,5) | 298 | addl %edi, %ebx # e += rotl32(a,5) |
294 | rorl $2, %edx # b = rotl32(b,30) | 299 | rorl $2, %edx # b = rotl32(b,30) |
295 | # 14 | 300 | # 14 |
296 | leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] | 301 | leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] |
302 | shrq $32, %r14 | ||
297 | movl %edx, %edi # c | 303 | movl %edx, %edi # c |
298 | xorl %ebp, %edi # ^d | 304 | xorl %ebp, %edi # ^d |
299 | andl %ecx, %edi # &b | 305 | andl %ecx, %edi # &b |
300 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 306 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
301 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 307 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
302 | movl %ebx, %esi # | 308 | movl %ebx, %edi # |
303 | roll $5, %esi # rotl32(a,5) | 309 | roll $5, %edi # rotl32(a,5) |
304 | addl %esi, %eax # e += rotl32(a,5) | 310 | addl %edi, %eax # e += rotl32(a,5) |
305 | rorl $2, %ecx # b = rotl32(b,30) | 311 | rorl $2, %ecx # b = rotl32(b,30) |
306 | # 15 | 312 | # 15 |
307 | leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] | 313 | leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] |
308 | movl %ecx, %edi # c | 314 | movl %ecx, %edi # c |
309 | xorl %edx, %edi # ^d | 315 | xorl %edx, %edi # ^d |
310 | andl %ebx, %edi # &b | 316 | andl %ebx, %edi # &b |
311 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 317 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
312 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 318 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
313 | movl %eax, %esi # | 319 | movl %eax, %edi # |
314 | roll $5, %esi # rotl32(a,5) | 320 | roll $5, %edi # rotl32(a,5) |
315 | addl %esi, %ebp # e += rotl32(a,5) | 321 | addl %edi, %ebp # e += rotl32(a,5) |
316 | rorl $2, %ebx # b = rotl32(b,30) | 322 | rorl $2, %ebx # b = rotl32(b,30) |
317 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | 323 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) |
318 | movaps %xmm1, %xmm4 | 324 | movaps %xmm1, %xmm4 |
@@ -325,9 +331,9 @@ sha1_process_block64: | |||
325 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 331 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
326 | movaps %xmm2, %xmm5 | 332 | movaps %xmm2, %xmm5 |
327 | xorps %xmm4, %xmm4 # rol(W0,1): | 333 | xorps %xmm4, %xmm4 # rol(W0,1): |
328 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 334 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
329 | paddd %xmm2, %xmm2 # shift left by 1 | 335 | paddd %xmm2, %xmm2 # shift left by 1 |
330 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | 336 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 |
331 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 337 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
332 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 338 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
333 | movaps %xmm5, %xmm4 | 339 | movaps %xmm5, %xmm4 |
@@ -394,9 +400,9 @@ sha1_process_block64: | |||
394 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 400 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
395 | movaps %xmm3, %xmm5 | 401 | movaps %xmm3, %xmm5 |
396 | xorps %xmm4, %xmm4 # rol(W0,1): | 402 | xorps %xmm4, %xmm4 # rol(W0,1): |
397 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 403 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
398 | paddd %xmm3, %xmm3 # shift left by 1 | 404 | paddd %xmm3, %xmm3 # shift left by 1 |
399 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | 405 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 |
400 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 406 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
401 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 407 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
402 | movaps %xmm5, %xmm4 | 408 | movaps %xmm5, %xmm4 |
@@ -459,9 +465,9 @@ sha1_process_block64: | |||
459 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 465 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
460 | movaps %xmm0, %xmm5 | 466 | movaps %xmm0, %xmm5 |
461 | xorps %xmm4, %xmm4 # rol(W0,1): | 467 | xorps %xmm4, %xmm4 # rol(W0,1): |
462 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 468 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
463 | paddd %xmm0, %xmm0 # shift left by 1 | 469 | paddd %xmm0, %xmm0 # shift left by 1 |
464 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | 470 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 |
465 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 471 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
466 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 472 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
467 | movaps %xmm5, %xmm4 | 473 | movaps %xmm5, %xmm4 |
@@ -524,9 +530,9 @@ sha1_process_block64: | |||
524 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 530 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
525 | movaps %xmm1, %xmm5 | 531 | movaps %xmm1, %xmm5 |
526 | xorps %xmm4, %xmm4 # rol(W0,1): | 532 | xorps %xmm4, %xmm4 # rol(W0,1): |
527 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 533 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
528 | paddd %xmm1, %xmm1 # shift left by 1 | 534 | paddd %xmm1, %xmm1 # shift left by 1 |
529 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | 535 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 |
530 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 536 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
531 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 537 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
532 | movaps %xmm5, %xmm4 | 538 | movaps %xmm5, %xmm4 |
@@ -590,9 +596,9 @@ sha1_process_block64: | |||
590 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 596 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
591 | movaps %xmm2, %xmm5 | 597 | movaps %xmm2, %xmm5 |
592 | xorps %xmm4, %xmm4 # rol(W0,1): | 598 | xorps %xmm4, %xmm4 # rol(W0,1): |
593 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 599 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
594 | paddd %xmm2, %xmm2 # shift left by 1 | 600 | paddd %xmm2, %xmm2 # shift left by 1 |
595 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | 601 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 |
596 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 602 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
597 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 603 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
598 | movaps %xmm5, %xmm4 | 604 | movaps %xmm5, %xmm4 |
@@ -655,9 +661,9 @@ sha1_process_block64: | |||
655 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 661 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
656 | movaps %xmm3, %xmm5 | 662 | movaps %xmm3, %xmm5 |
657 | xorps %xmm4, %xmm4 # rol(W0,1): | 663 | xorps %xmm4, %xmm4 # rol(W0,1): |
658 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 664 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
659 | paddd %xmm3, %xmm3 # shift left by 1 | 665 | paddd %xmm3, %xmm3 # shift left by 1 |
660 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | 666 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 |
661 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 667 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
662 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 668 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
663 | movaps %xmm5, %xmm4 | 669 | movaps %xmm5, %xmm4 |
@@ -720,9 +726,9 @@ sha1_process_block64: | |||
720 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 726 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
721 | movaps %xmm0, %xmm5 | 727 | movaps %xmm0, %xmm5 |
722 | xorps %xmm4, %xmm4 # rol(W0,1): | 728 | xorps %xmm4, %xmm4 # rol(W0,1): |
723 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 729 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
724 | paddd %xmm0, %xmm0 # shift left by 1 | 730 | paddd %xmm0, %xmm0 # shift left by 1 |
725 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | 731 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 |
726 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 732 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
727 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 733 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
728 | movaps %xmm5, %xmm4 | 734 | movaps %xmm5, %xmm4 |
@@ -797,9 +803,9 @@ sha1_process_block64: | |||
797 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 803 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
798 | movaps %xmm1, %xmm5 | 804 | movaps %xmm1, %xmm5 |
799 | xorps %xmm4, %xmm4 # rol(W0,1): | 805 | xorps %xmm4, %xmm4 # rol(W0,1): |
800 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 806 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
801 | paddd %xmm1, %xmm1 # shift left by 1 | 807 | paddd %xmm1, %xmm1 # shift left by 1 |
802 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | 808 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 |
803 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 809 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
804 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 810 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
805 | movaps %xmm5, %xmm4 | 811 | movaps %xmm5, %xmm4 |
@@ -874,9 +880,9 @@ sha1_process_block64: | |||
874 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 880 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
875 | movaps %xmm2, %xmm5 | 881 | movaps %xmm2, %xmm5 |
876 | xorps %xmm4, %xmm4 # rol(W0,1): | 882 | xorps %xmm4, %xmm4 # rol(W0,1): |
877 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 883 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
878 | paddd %xmm2, %xmm2 # shift left by 1 | 884 | paddd %xmm2, %xmm2 # shift left by 1 |
879 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | 885 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 |
880 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 886 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
881 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 887 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
882 | movaps %xmm5, %xmm4 | 888 | movaps %xmm5, %xmm4 |
@@ -952,9 +958,9 @@ sha1_process_block64: | |||
952 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 958 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
953 | movaps %xmm3, %xmm5 | 959 | movaps %xmm3, %xmm5 |
954 | xorps %xmm4, %xmm4 # rol(W0,1): | 960 | xorps %xmm4, %xmm4 # rol(W0,1): |
955 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 961 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
956 | paddd %xmm3, %xmm3 # shift left by 1 | 962 | paddd %xmm3, %xmm3 # shift left by 1 |
957 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | 963 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 |
958 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 964 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
959 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 965 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
960 | movaps %xmm5, %xmm4 | 966 | movaps %xmm5, %xmm4 |
@@ -1029,9 +1035,9 @@ sha1_process_block64: | |||
1029 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 1035 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
1030 | movaps %xmm0, %xmm5 | 1036 | movaps %xmm0, %xmm5 |
1031 | xorps %xmm4, %xmm4 # rol(W0,1): | 1037 | xorps %xmm4, %xmm4 # rol(W0,1): |
1032 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 1038 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
1033 | paddd %xmm0, %xmm0 # shift left by 1 | 1039 | paddd %xmm0, %xmm0 # shift left by 1 |
1034 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | 1040 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 |
1035 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 1041 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
1036 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 1042 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
1037 | movaps %xmm5, %xmm4 | 1043 | movaps %xmm5, %xmm4 |
@@ -1106,9 +1112,9 @@ sha1_process_block64: | |||
1106 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 1112 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
1107 | movaps %xmm1, %xmm5 | 1113 | movaps %xmm1, %xmm5 |
1108 | xorps %xmm4, %xmm4 # rol(W0,1): | 1114 | xorps %xmm4, %xmm4 # rol(W0,1): |
1109 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 1115 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
1110 | paddd %xmm1, %xmm1 # shift left by 1 | 1116 | paddd %xmm1, %xmm1 # shift left by 1 |
1111 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | 1117 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 |
1112 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 1118 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
1113 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 1119 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
1114 | movaps %xmm5, %xmm4 | 1120 | movaps %xmm5, %xmm4 |
@@ -1171,9 +1177,9 @@ sha1_process_block64: | |||
1171 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 1177 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
1172 | movaps %xmm2, %xmm5 | 1178 | movaps %xmm2, %xmm5 |
1173 | xorps %xmm4, %xmm4 # rol(W0,1): | 1179 | xorps %xmm4, %xmm4 # rol(W0,1): |
1174 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 1180 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
1175 | paddd %xmm2, %xmm2 # shift left by 1 | 1181 | paddd %xmm2, %xmm2 # shift left by 1 |
1176 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | 1182 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 |
1177 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 1183 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
1178 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 1184 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
1179 | movaps %xmm5, %xmm4 | 1185 | movaps %xmm5, %xmm4 |
@@ -1236,9 +1242,9 @@ sha1_process_block64: | |||
1236 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | 1242 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
1237 | movaps %xmm3, %xmm5 | 1243 | movaps %xmm3, %xmm5 |
1238 | xorps %xmm4, %xmm4 # rol(W0,1): | 1244 | xorps %xmm4, %xmm4 # rol(W0,1): |
1239 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | 1245 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
1240 | paddd %xmm3, %xmm3 # shift left by 1 | 1246 | paddd %xmm3, %xmm3 # shift left by 1 |
1241 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | 1247 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 |
1242 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 1248 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
1243 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 1249 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
1244 | movaps %xmm5, %xmm4 | 1250 | movaps %xmm5, %xmm4 |
@@ -1378,7 +1384,7 @@ sha1_process_block64: | |||
1378 | addl %ebx, 84(%rdi) # ctx->hash[1] += b | 1384 | addl %ebx, 84(%rdi) # ctx->hash[1] += b |
1379 | popq %r14 # | 1385 | popq %r14 # |
1380 | addl %ecx, 88(%rdi) # ctx->hash[2] += c | 1386 | addl %ecx, 88(%rdi) # ctx->hash[2] += c |
1381 | popq %r15 # | 1387 | # popq %r15 # |
1382 | addl %edx, 92(%rdi) # ctx->hash[3] += d | 1388 | addl %edx, 92(%rdi) # ctx->hash[3] += d |
1383 | popq %rbx # | 1389 | popq %rbx # |
1384 | addl %ebp, 96(%rdi) # ctx->hash[4] += e | 1390 | addl %ebp, 96(%rdi) # ctx->hash[4] += e |
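The epilogue folds the working variables back into the context (ctx->hash[1..4] at offsets 84..96 are visible here; hash[0] is presumably updated the same way just above this hunk) while popping the callee-saved registers, and the %r15 pop is dropped to match the removed push. In C terms the visible part is simply (function name and struct offsets assumed from the comments):

    #include <stdint.h>

    void sha1_fold_state(uint32_t hash[5], uint32_t a, uint32_t b,
                         uint32_t c, uint32_t d, uint32_t e)
    {
        hash[0] += a;   /* presumably addl %eax, 80(%rdi), above this hunk */
        hash[1] += b;   /* addl %ebx, 84(%rdi) */
        hash[2] += c;   /* addl %ecx, 88(%rdi) */
        hash[3] += d;   /* addl %edx, 92(%rdi) */
        hash[4] += e;   /* addl %ebp, 96(%rdi) */
    }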
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 87c2d0800..47c40af0d 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -102,7 +102,7 @@ echo \ | |||
102 | "### Generated by hash_md5_sha_x86-64.S.sh ### | 102 | "### Generated by hash_md5_sha_x86-64.S.sh ### |
103 | 103 | ||
104 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) | 104 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) |
105 | .section .text.sha1_process_block64,\"ax\",@progbits | 105 | .section .text.sha1_process_block64, \"ax\", @progbits |
106 | .globl sha1_process_block64 | 106 | .globl sha1_process_block64 |
107 | .hidden sha1_process_block64 | 107 | .hidden sha1_process_block64 |
108 | .type sha1_process_block64, @function | 108 | .type sha1_process_block64, @function |
@@ -111,7 +111,7 @@ echo \ | |||
111 | sha1_process_block64: | 111 | sha1_process_block64: |
112 | pushq %rbp # 1 byte insn | 112 | pushq %rbp # 1 byte insn |
113 | pushq %rbx # 1 byte insn | 113 | pushq %rbx # 1 byte insn |
114 | pushq %r15 # 2 byte insn | 114 | # pushq %r15 # 2 byte insn |
115 | pushq %r14 # 2 byte insn | 115 | pushq %r14 # 2 byte insn |
116 | pushq %r13 # 2 byte insn | 116 | pushq %r13 # 2 byte insn |
117 | pushq %r12 # 2 byte insn | 117 | pushq %r12 # 2 byte insn |
@@ -120,7 +120,8 @@ sha1_process_block64: | |||
120 | #Register and stack use: | 120 | #Register and stack use: |
121 | # eax..edx: a..d | 121 | # eax..edx: a..d |
122 | # ebp: e | 122 | # ebp: e |
123 | # esi,edi: temps | 123 | # esi,edi,r8..r14: temps |
124 | # r15: unused | ||
124 | # xmm0..xmm3: W[] | 125 | # xmm0..xmm3: W[] |
125 | # xmm4,xmm5: temps | 126 | # xmm4,xmm5: temps |
126 | # xmm6: current round constant | 127 | # xmm6: current round constant |
@@ -134,59 +135,56 @@ sha1_process_block64: | |||
134 | 135 | ||
135 | movaps rconst0x5A827999(%rip), $xmmRCONST | 136 | movaps rconst0x5A827999(%rip), $xmmRCONST |
136 | 137 | ||
137 | # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 | 138 | # Load W[] to xmm registers, byteswapping on the fly. |
138 | # instead of spilling them to stack. | 139 | # |
139 | # (We lose parallelized addition of RCONST, but LEA | 140 | # For iterations 0..15, we pass W[] in rsi,r8..r14 |
140 | # can do two additions at once, so...) | 141 | # for use in RD1A's instead of spilling them to stack. |
142 | # We lose parallelized addition of RCONST, but LEA | ||
143 | # can do two additions at once, so it's probably a wash. | ||
144 | # (We use rsi instead of rN because this makes two | ||
145 | # LEAs in two first RD1A's shorter by one byte). | ||
141 | movq 4*0(%rdi), %rsi | 146 | movq 4*0(%rdi), %rsi |
142 | movq 4*2(%rdi), %r10 | 147 | movq 4*2(%rdi), %r8 |
143 | bswapq %rsi | 148 | bswapq %rsi |
144 | bswapq %r10 | 149 | bswapq %r8 |
145 | rolq \$32, %rsi # rsi = W[1]:W[0] | 150 | rolq \$32, %rsi # rsi = W[1]:W[0] |
146 | rolq \$32, %r10 | 151 | rolq \$32, %r8 # r8 = W[3]:W[2] |
147 | movq %rsi, %xmm0 | 152 | movq %rsi, %xmm0 |
148 | movq %r10, $xmmT1 | 153 | movq %r8, $xmmT1 |
149 | punpcklqdq $xmmT1, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) | 154 | punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) |
150 | movaps %xmm0, $xmmT1 | 155 | # movaps %xmm0, $xmmT1 # add RCONST, spill to stack |
151 | paddd $xmmRCONST, $xmmT1 | 156 | # paddd $xmmRCONST, $xmmT1 |
152 | movups $xmmT1, -64+4*0(%rsp) | 157 | # movups $xmmT1, -64+16*0(%rsp) |
153 | 158 | ||
154 | movq 4*4(%rdi), %r8 | 159 | movq 4*4(%rdi), %r9 |
155 | movq 4*6(%rdi), %r10 | 160 | movq 4*6(%rdi), %r10 |
156 | bswapq %r8 | 161 | bswapq %r9 |
157 | bswapq %r10 | 162 | bswapq %r10 |
158 | rolq \$32, %r8 | 163 | rolq \$32, %r9 # r9 = W[5]:W[4] |
159 | rolq \$32, %r10 | 164 | rolq \$32, %r10 # r10 = W[7]:W[6] |
160 | movq %r8, %xmm1 | 165 | movq %r9, %xmm1 |
161 | movq %r10, $xmmT1 | 166 | movq %r10, $xmmT1 |
162 | punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) | 167 | punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) |
163 | movaps %xmm1, $xmmT1 | ||
164 | paddd $xmmRCONST, $xmmT1 | ||
165 | movups $xmmT1, -64+4*4(%rsp) | ||
166 | 168 | ||
167 | movq 4*8(%rdi), %r8 | 169 | movq 4*8(%rdi), %r11 |
168 | movq 4*10(%rdi), %r10 | 170 | movq 4*10(%rdi), %r12 |
169 | bswapq %r8 | 171 | bswapq %r11 |
170 | bswapq %r10 | 172 | bswapq %r12 |
171 | movl %r8d, %r9d # r9d = W[9] | 173 | rolq \$32, %r11 # r11 = W[9]:W[8] |
172 | rolq \$32, %r8 # r8 = W[9]:W[8] | 174 | rolq \$32, %r12 # r12 = W[11]:W[10] |
173 | movl %r10d, %r11d # r11d = W[11] | 175 | movq %r11, %xmm2 |
174 | rolq \$32, %r10 # r10 = W[11]:W[10] | 176 | movq %r12, $xmmT1 |
175 | movq %r8, %xmm2 | 177 | punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) |
176 | movq %r10, $xmmT1 | ||
177 | punpcklqdq $xmmT1, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) | ||
178 | 178 | ||
179 | movq 4*12(%rdi), %r12 | 179 | movq 4*12(%rdi), %r13 |
180 | movq 4*14(%rdi), %r14 | 180 | movq 4*14(%rdi), %r14 |
181 | bswapq %r12 | 181 | bswapq %r13 |
182 | bswapq %r14 | 182 | bswapq %r14 |
183 | movl %r12d, %r13d # r13d = W[13] | 183 | rolq \$32, %r13 # r13 = W[13]:W[12] |
184 | rolq \$32, %r12 # r12 = W[13]:W[12] | ||
185 | movl %r14d, %r15d # r15d = W[15] | ||
186 | rolq \$32, %r14 # r14 = W[15]:W[14] | 184 | rolq \$32, %r14 # r14 = W[15]:W[14] |
187 | movq %r12, %xmm3 | 185 | movq %r13, %xmm3 |
188 | movq %r14, $xmmT1 | 186 | movq %r14, $xmmT1 |
189 | punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) | 187 | punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) |
190 | " | 188 | " |
191 | 189 | ||
192 | PREP() { | 190 | PREP() { |
@@ -215,9 +213,9 @@ echo "# PREP $@ | |||
215 | movaps $xmmW0, $xmmT2 | 213 | movaps $xmmW0, $xmmT2 |
216 | 214 | ||
217 | xorps $xmmT1, $xmmT1 # rol(W0,1): | 215 | xorps $xmmT1, $xmmT1 # rol(W0,1): |
218 | pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) | 216 | pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) |
219 | paddd $xmmW0, $xmmW0 # shift left by 1 | 217 | paddd $xmmW0, $xmmW0 # shift left by 1 |
220 | psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 | 218 | psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 |
221 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | 219 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
222 | 220 | ||
223 | pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | 221 | pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
@@ -256,23 +254,28 @@ RD1A() { | |||
256 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 | 254 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
257 | local n=$(($6)) | 255 | local n=$(($6)) |
258 | local n0=$(((n+0) & 15)) | 256 | local n0=$(((n+0) & 15)) |
257 | local rN=$((7+n0/2)) | ||
259 | echo " | 258 | echo " |
260 | # $n | 259 | # $n |
261 | ";test $n0 = 0 && echo " | 260 | ";test $n0 = 0 && echo " |
262 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] | 261 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] |
263 | ";test $n0 != 0 && test $n0 -lt 8 && echo " | 262 | shrq \$32, %rsi |
264 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n] | 263 | ";test $n0 = 1 && echo " |
265 | ";test $n0 -ge 8 && echo " | 264 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] |
266 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] | 265 | ";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " |
266 | leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] | ||
267 | shrq \$32, %r$rN | ||
268 | ";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " | ||
269 | leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] | ||
267 | ";echo " | 270 | ";echo " |
268 | movl %e$c, %edi # c | 271 | movl %e$c, %edi # c |
269 | xorl %e$d, %edi # ^d | 272 | xorl %e$d, %edi # ^d |
270 | andl %e$b, %edi # &b | 273 | andl %e$b, %edi # &b |
271 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) | 274 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) |
272 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) | 275 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) |
273 | movl %e$a, %esi # | 276 | movl %e$a, %edi # |
274 | roll \$5, %esi # rotl32(a,5) | 277 | roll \$5, %edi # rotl32(a,5) |
275 | addl %esi, %e$e # e += rotl32(a,5) | 278 | addl %edi, %e$e # e += rotl32(a,5) |
276 | rorl \$2, %e$b # b = rotl32(b,30) | 279 | rorl \$2, %e$b # b = rotl32(b,30) |
277 | " | 280 | " |
278 | } | 281 | } |
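The updated RD1A() derives the register holding W[n] as rN = 7 + n/2, so W[2] and W[3] share %r8, W[4] and W[5] share %r9, up to W[14] and W[15] in %r14, with W[0] and W[1] special-cased in %rsi. Even-numbered rounds consume the low 32 bits and then shift the register right by 32 so the following odd round finds W[n+1] in place. A small C sketch of that consumption pattern (take_w and wreg[] are hypothetical; wreg[0] stands for rsi, wreg[k] for r(7+k)):

    #include <stdint.h>

    static uint32_t take_w(uint64_t wreg[8], int n /* 0..15 */)
    {
        uint64_t *r = &wreg[n / 2];     /* %rsi for n<2, %r(7 + n/2) otherwise */
        uint32_t w = (uint32_t)*r;      /* low half = W[n], folded in via leal */
        if ((n & 1) == 0)
            *r >>= 32;                  /* shrq $32: expose W[n+1]             */
        return w;
    }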
@@ -420,7 +423,7 @@ echo " | |||
420 | addl %ebx, 84(%rdi) # ctx->hash[1] += b | 423 | addl %ebx, 84(%rdi) # ctx->hash[1] += b |
421 | popq %r14 # | 424 | popq %r14 # |
422 | addl %ecx, 88(%rdi) # ctx->hash[2] += c | 425 | addl %ecx, 88(%rdi) # ctx->hash[2] += c |
423 | popq %r15 # | 426 | # popq %r15 # |
424 | addl %edx, 92(%rdi) # ctx->hash[3] += d | 427 | addl %edx, 92(%rdi) # ctx->hash[3] += d |
425 | popq %rbx # | 428 | popq %rbx # |
426 | addl %ebp, 96(%rdi) # ctx->hash[4] += e | 429 | addl %ebp, 96(%rdi) # ctx->hash[4] += e |