diff options
Diffstat (limited to 'libbb/hash_md5_sha256_x86-64_shaNI.S')
-rw-r--r-- | libbb/hash_md5_sha256_x86-64_shaNI.S | 105 |
1 files changed, 54 insertions, 51 deletions
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index 4663f750a..082ceafe4 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S | |||
@@ -4,7 +4,7 @@ | |||
4 | // We use shorter insns, even though they are for "wrong" | 4 | // We use shorter insns, even though they are for "wrong" |
5 | // data type (fp, not int). | 5 | // data type (fp, not int). |
6 | // For Intel, there is no penalty for doing it at all | 6 | // For Intel, there is no penalty for doing it at all |
7 | // (CPUs which do have such penalty do not support SHA1 insns). | 7 | // (CPUs which do have such penalty do not support SHA insns). |
8 | // For AMD, the penalty is one extra cycle | 8 | // For AMD, the penalty is one extra cycle |
9 | // (allegedly: I failed to find measurable difference). | 9 | // (allegedly: I failed to find measurable difference). |
10 | 10 | ||
@@ -15,6 +15,10 @@ | |||
15 | //#define shuf128_32 pshufd | 15 | //#define shuf128_32 pshufd |
16 | #define shuf128_32 shufps | 16 | #define shuf128_32 shufps |
17 | 17 | ||
18 | // pshufb and palignr are SSSE3 insns. | ||
19 | // We do not check SSSE3 in cpuid, | ||
20 | // all SHA-capable CPUs support it as well. | ||
21 | |||
18 | .section .text.sha256_process_block64_shaNI, "ax", @progbits | 22 | .section .text.sha256_process_block64_shaNI, "ax", @progbits |
19 | .globl sha256_process_block64_shaNI | 23 | .globl sha256_process_block64_shaNI |
20 | .hidden sha256_process_block64_shaNI | 24 | .hidden sha256_process_block64_shaNI |
@@ -34,46 +38,47 @@ | |||
34 | 38 | ||
35 | #define XMMTMP %xmm7 | 39 | #define XMMTMP %xmm7 |
36 | 40 | ||
37 | #define ABEF_SAVE %xmm9 | 41 | #define SAVE0 %xmm8 |
38 | #define CDGH_SAVE %xmm10 | 42 | #define SAVE1 %xmm9 |
39 | 43 | ||
40 | #define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) | 44 | #define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) |
41 | 45 | ||
42 | .balign 8 # allow decoders to fetch at least 2 first insns | 46 | .balign 8 # allow decoders to fetch at least 2 first insns |
43 | sha256_process_block64_shaNI: | 47 | sha256_process_block64_shaNI: |
44 | 48 | ||
45 | movu128 80+0*16(%rdi), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ | 49 | movu128 80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */ |
46 | movu128 80+1*16(%rdi), STATE1 /* HGFE */ | 50 | movu128 80+1*16(%rdi), STATE1 /* EFGH */ |
47 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ | 51 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ |
48 | mova128 STATE1, STATE0 | 52 | mova128 STATE1, STATE0 |
49 | shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ | 53 | /* --- -------------- ABCD -- EFGH */ |
50 | shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ | 54 | shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ |
55 | shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ | ||
51 | 56 | ||
52 | /* XMMTMP holds flip mask from here... */ | 57 | /* XMMTMP holds flip mask from here... */ |
53 | mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP | 58 | mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP |
54 | leaq K256+8*16(%rip), SHA256CONSTANTS | 59 | leaq K256+8*16(%rip), SHA256CONSTANTS |
55 | 60 | ||
56 | /* Save hash values for addition after rounds */ | 61 | /* Save hash values for addition after rounds */ |
57 | mova128 STATE0, ABEF_SAVE | 62 | mova128 STATE0, SAVE0 |
58 | mova128 STATE1, CDGH_SAVE | 63 | mova128 STATE1, SAVE1 |
59 | 64 | ||
60 | /* Rounds 0-3 */ | 65 | /* Rounds 0-3 */ |
61 | movu128 0*16(DATA_PTR), MSG | 66 | movu128 0*16(DATA_PTR), MSG |
62 | pshufb XMMTMP, MSG | 67 | pshufb XMMTMP, MSG |
63 | mova128 MSG, MSGTMP0 | 68 | mova128 MSG, MSGTMP0 |
64 | paddd 0*16-8*16(SHA256CONSTANTS), MSG | 69 | paddd 0*16-8*16(SHA256CONSTANTS), MSG |
65 | sha256rnds2 STATE0, STATE1 | 70 | sha256rnds2 MSG, STATE0, STATE1 |
66 | shuf128_32 $0x0E, MSG, MSG | 71 | shuf128_32 $0x0E, MSG, MSG |
67 | sha256rnds2 STATE1, STATE0 | 72 | sha256rnds2 MSG, STATE1, STATE0 |
68 | 73 | ||
69 | /* Rounds 4-7 */ | 74 | /* Rounds 4-7 */ |
70 | movu128 1*16(DATA_PTR), MSG | 75 | movu128 1*16(DATA_PTR), MSG |
71 | pshufb XMMTMP, MSG | 76 | pshufb XMMTMP, MSG |
72 | mova128 MSG, MSGTMP1 | 77 | mova128 MSG, MSGTMP1 |
73 | paddd 1*16-8*16(SHA256CONSTANTS), MSG | 78 | paddd 1*16-8*16(SHA256CONSTANTS), MSG |
74 | sha256rnds2 STATE0, STATE1 | 79 | sha256rnds2 MSG, STATE0, STATE1 |
75 | shuf128_32 $0x0E, MSG, MSG | 80 | shuf128_32 $0x0E, MSG, MSG |
76 | sha256rnds2 STATE1, STATE0 | 81 | sha256rnds2 MSG, STATE1, STATE0 |
77 | sha256msg1 MSGTMP1, MSGTMP0 | 82 | sha256msg1 MSGTMP1, MSGTMP0 |
78 | 83 | ||
79 | /* Rounds 8-11 */ | 84 | /* Rounds 8-11 */ |
@@ -81,9 +86,9 @@ sha256_process_block64_shaNI: | |||
81 | pshufb XMMTMP, MSG | 86 | pshufb XMMTMP, MSG |
82 | mova128 MSG, MSGTMP2 | 87 | mova128 MSG, MSGTMP2 |
83 | paddd 2*16-8*16(SHA256CONSTANTS), MSG | 88 | paddd 2*16-8*16(SHA256CONSTANTS), MSG |
84 | sha256rnds2 STATE0, STATE1 | 89 | sha256rnds2 MSG, STATE0, STATE1 |
85 | shuf128_32 $0x0E, MSG, MSG | 90 | shuf128_32 $0x0E, MSG, MSG |
86 | sha256rnds2 STATE1, STATE0 | 91 | sha256rnds2 MSG, STATE1, STATE0 |
87 | sha256msg1 MSGTMP2, MSGTMP1 | 92 | sha256msg1 MSGTMP2, MSGTMP1 |
88 | 93 | ||
89 | /* Rounds 12-15 */ | 94 | /* Rounds 12-15 */ |
@@ -92,164 +97,162 @@ sha256_process_block64_shaNI: | |||
92 | /* ...to here */ | 97 | /* ...to here */ |
93 | mova128 MSG, MSGTMP3 | 98 | mova128 MSG, MSGTMP3 |
94 | paddd 3*16-8*16(SHA256CONSTANTS), MSG | 99 | paddd 3*16-8*16(SHA256CONSTANTS), MSG |
95 | sha256rnds2 STATE0, STATE1 | 100 | sha256rnds2 MSG, STATE0, STATE1 |
96 | mova128 MSGTMP3, XMMTMP | 101 | mova128 MSGTMP3, XMMTMP |
97 | palignr $4, MSGTMP2, XMMTMP | 102 | palignr $4, MSGTMP2, XMMTMP |
98 | paddd XMMTMP, MSGTMP0 | 103 | paddd XMMTMP, MSGTMP0 |
99 | sha256msg2 MSGTMP3, MSGTMP0 | 104 | sha256msg2 MSGTMP3, MSGTMP0 |
100 | shuf128_32 $0x0E, MSG, MSG | 105 | shuf128_32 $0x0E, MSG, MSG |
101 | sha256rnds2 STATE1, STATE0 | 106 | sha256rnds2 MSG, STATE1, STATE0 |
102 | sha256msg1 MSGTMP3, MSGTMP2 | 107 | sha256msg1 MSGTMP3, MSGTMP2 |
103 | 108 | ||
104 | /* Rounds 16-19 */ | 109 | /* Rounds 16-19 */ |
105 | mova128 MSGTMP0, MSG | 110 | mova128 MSGTMP0, MSG |
106 | paddd 4*16-8*16(SHA256CONSTANTS), MSG | 111 | paddd 4*16-8*16(SHA256CONSTANTS), MSG |
107 | sha256rnds2 STATE0, STATE1 | 112 | sha256rnds2 MSG, STATE0, STATE1 |
108 | mova128 MSGTMP0, XMMTMP | 113 | mova128 MSGTMP0, XMMTMP |
109 | palignr $4, MSGTMP3, XMMTMP | 114 | palignr $4, MSGTMP3, XMMTMP |
110 | paddd XMMTMP, MSGTMP1 | 115 | paddd XMMTMP, MSGTMP1 |
111 | sha256msg2 MSGTMP0, MSGTMP1 | 116 | sha256msg2 MSGTMP0, MSGTMP1 |
112 | shuf128_32 $0x0E, MSG, MSG | 117 | shuf128_32 $0x0E, MSG, MSG |
113 | sha256rnds2 STATE1, STATE0 | 118 | sha256rnds2 MSG, STATE1, STATE0 |
114 | sha256msg1 MSGTMP0, MSGTMP3 | 119 | sha256msg1 MSGTMP0, MSGTMP3 |
115 | 120 | ||
116 | /* Rounds 20-23 */ | 121 | /* Rounds 20-23 */ |
117 | mova128 MSGTMP1, MSG | 122 | mova128 MSGTMP1, MSG |
118 | paddd 5*16-8*16(SHA256CONSTANTS), MSG | 123 | paddd 5*16-8*16(SHA256CONSTANTS), MSG |
119 | sha256rnds2 STATE0, STATE1 | 124 | sha256rnds2 MSG, STATE0, STATE1 |
120 | mova128 MSGTMP1, XMMTMP | 125 | mova128 MSGTMP1, XMMTMP |
121 | palignr $4, MSGTMP0, XMMTMP | 126 | palignr $4, MSGTMP0, XMMTMP |
122 | paddd XMMTMP, MSGTMP2 | 127 | paddd XMMTMP, MSGTMP2 |
123 | sha256msg2 MSGTMP1, MSGTMP2 | 128 | sha256msg2 MSGTMP1, MSGTMP2 |
124 | shuf128_32 $0x0E, MSG, MSG | 129 | shuf128_32 $0x0E, MSG, MSG |
125 | sha256rnds2 STATE1, STATE0 | 130 | sha256rnds2 MSG, STATE1, STATE0 |
126 | sha256msg1 MSGTMP1, MSGTMP0 | 131 | sha256msg1 MSGTMP1, MSGTMP0 |
127 | 132 | ||
128 | /* Rounds 24-27 */ | 133 | /* Rounds 24-27 */ |
129 | mova128 MSGTMP2, MSG | 134 | mova128 MSGTMP2, MSG |
130 | paddd 6*16-8*16(SHA256CONSTANTS), MSG | 135 | paddd 6*16-8*16(SHA256CONSTANTS), MSG |
131 | sha256rnds2 STATE0, STATE1 | 136 | sha256rnds2 MSG, STATE0, STATE1 |
132 | mova128 MSGTMP2, XMMTMP | 137 | mova128 MSGTMP2, XMMTMP |
133 | palignr $4, MSGTMP1, XMMTMP | 138 | palignr $4, MSGTMP1, XMMTMP |
134 | paddd XMMTMP, MSGTMP3 | 139 | paddd XMMTMP, MSGTMP3 |
135 | sha256msg2 MSGTMP2, MSGTMP3 | 140 | sha256msg2 MSGTMP2, MSGTMP3 |
136 | shuf128_32 $0x0E, MSG, MSG | 141 | shuf128_32 $0x0E, MSG, MSG |
137 | sha256rnds2 STATE1, STATE0 | 142 | sha256rnds2 MSG, STATE1, STATE0 |
138 | sha256msg1 MSGTMP2, MSGTMP1 | 143 | sha256msg1 MSGTMP2, MSGTMP1 |
139 | 144 | ||
140 | /* Rounds 28-31 */ | 145 | /* Rounds 28-31 */ |
141 | mova128 MSGTMP3, MSG | 146 | mova128 MSGTMP3, MSG |
142 | paddd 7*16-8*16(SHA256CONSTANTS), MSG | 147 | paddd 7*16-8*16(SHA256CONSTANTS), MSG |
143 | sha256rnds2 STATE0, STATE1 | 148 | sha256rnds2 MSG, STATE0, STATE1 |
144 | mova128 MSGTMP3, XMMTMP | 149 | mova128 MSGTMP3, XMMTMP |
145 | palignr $4, MSGTMP2, XMMTMP | 150 | palignr $4, MSGTMP2, XMMTMP |
146 | paddd XMMTMP, MSGTMP0 | 151 | paddd XMMTMP, MSGTMP0 |
147 | sha256msg2 MSGTMP3, MSGTMP0 | 152 | sha256msg2 MSGTMP3, MSGTMP0 |
148 | shuf128_32 $0x0E, MSG, MSG | 153 | shuf128_32 $0x0E, MSG, MSG |
149 | sha256rnds2 STATE1, STATE0 | 154 | sha256rnds2 MSG, STATE1, STATE0 |
150 | sha256msg1 MSGTMP3, MSGTMP2 | 155 | sha256msg1 MSGTMP3, MSGTMP2 |
151 | 156 | ||
152 | /* Rounds 32-35 */ | 157 | /* Rounds 32-35 */ |
153 | mova128 MSGTMP0, MSG | 158 | mova128 MSGTMP0, MSG |
154 | paddd 8*16-8*16(SHA256CONSTANTS), MSG | 159 | paddd 8*16-8*16(SHA256CONSTANTS), MSG |
155 | sha256rnds2 STATE0, STATE1 | 160 | sha256rnds2 MSG, STATE0, STATE1 |
156 | mova128 MSGTMP0, XMMTMP | 161 | mova128 MSGTMP0, XMMTMP |
157 | palignr $4, MSGTMP3, XMMTMP | 162 | palignr $4, MSGTMP3, XMMTMP |
158 | paddd XMMTMP, MSGTMP1 | 163 | paddd XMMTMP, MSGTMP1 |
159 | sha256msg2 MSGTMP0, MSGTMP1 | 164 | sha256msg2 MSGTMP0, MSGTMP1 |
160 | shuf128_32 $0x0E, MSG, MSG | 165 | shuf128_32 $0x0E, MSG, MSG |
161 | sha256rnds2 STATE1, STATE0 | 166 | sha256rnds2 MSG, STATE1, STATE0 |
162 | sha256msg1 MSGTMP0, MSGTMP3 | 167 | sha256msg1 MSGTMP0, MSGTMP3 |
163 | 168 | ||
164 | /* Rounds 36-39 */ | 169 | /* Rounds 36-39 */ |
165 | mova128 MSGTMP1, MSG | 170 | mova128 MSGTMP1, MSG |
166 | paddd 9*16-8*16(SHA256CONSTANTS), MSG | 171 | paddd 9*16-8*16(SHA256CONSTANTS), MSG |
167 | sha256rnds2 STATE0, STATE1 | 172 | sha256rnds2 MSG, STATE0, STATE1 |
168 | mova128 MSGTMP1, XMMTMP | 173 | mova128 MSGTMP1, XMMTMP |
169 | palignr $4, MSGTMP0, XMMTMP | 174 | palignr $4, MSGTMP0, XMMTMP |
170 | paddd XMMTMP, MSGTMP2 | 175 | paddd XMMTMP, MSGTMP2 |
171 | sha256msg2 MSGTMP1, MSGTMP2 | 176 | sha256msg2 MSGTMP1, MSGTMP2 |
172 | shuf128_32 $0x0E, MSG, MSG | 177 | shuf128_32 $0x0E, MSG, MSG |
173 | sha256rnds2 STATE1, STATE0 | 178 | sha256rnds2 MSG, STATE1, STATE0 |
174 | sha256msg1 MSGTMP1, MSGTMP0 | 179 | sha256msg1 MSGTMP1, MSGTMP0 |
175 | 180 | ||
176 | /* Rounds 40-43 */ | 181 | /* Rounds 40-43 */ |
177 | mova128 MSGTMP2, MSG | 182 | mova128 MSGTMP2, MSG |
178 | paddd 10*16-8*16(SHA256CONSTANTS), MSG | 183 | paddd 10*16-8*16(SHA256CONSTANTS), MSG |
179 | sha256rnds2 STATE0, STATE1 | 184 | sha256rnds2 MSG, STATE0, STATE1 |
180 | mova128 MSGTMP2, XMMTMP | 185 | mova128 MSGTMP2, XMMTMP |
181 | palignr $4, MSGTMP1, XMMTMP | 186 | palignr $4, MSGTMP1, XMMTMP |
182 | paddd XMMTMP, MSGTMP3 | 187 | paddd XMMTMP, MSGTMP3 |
183 | sha256msg2 MSGTMP2, MSGTMP3 | 188 | sha256msg2 MSGTMP2, MSGTMP3 |
184 | shuf128_32 $0x0E, MSG, MSG | 189 | shuf128_32 $0x0E, MSG, MSG |
185 | sha256rnds2 STATE1, STATE0 | 190 | sha256rnds2 MSG, STATE1, STATE0 |
186 | sha256msg1 MSGTMP2, MSGTMP1 | 191 | sha256msg1 MSGTMP2, MSGTMP1 |
187 | 192 | ||
188 | /* Rounds 44-47 */ | 193 | /* Rounds 44-47 */ |
189 | mova128 MSGTMP3, MSG | 194 | mova128 MSGTMP3, MSG |
190 | paddd 11*16-8*16(SHA256CONSTANTS), MSG | 195 | paddd 11*16-8*16(SHA256CONSTANTS), MSG |
191 | sha256rnds2 STATE0, STATE1 | 196 | sha256rnds2 MSG, STATE0, STATE1 |
192 | mova128 MSGTMP3, XMMTMP | 197 | mova128 MSGTMP3, XMMTMP |
193 | palignr $4, MSGTMP2, XMMTMP | 198 | palignr $4, MSGTMP2, XMMTMP |
194 | paddd XMMTMP, MSGTMP0 | 199 | paddd XMMTMP, MSGTMP0 |
195 | sha256msg2 MSGTMP3, MSGTMP0 | 200 | sha256msg2 MSGTMP3, MSGTMP0 |
196 | shuf128_32 $0x0E, MSG, MSG | 201 | shuf128_32 $0x0E, MSG, MSG |
197 | sha256rnds2 STATE1, STATE0 | 202 | sha256rnds2 MSG, STATE1, STATE0 |
198 | sha256msg1 MSGTMP3, MSGTMP2 | 203 | sha256msg1 MSGTMP3, MSGTMP2 |
199 | 204 | ||
200 | /* Rounds 48-51 */ | 205 | /* Rounds 48-51 */ |
201 | mova128 MSGTMP0, MSG | 206 | mova128 MSGTMP0, MSG |
202 | paddd 12*16-8*16(SHA256CONSTANTS), MSG | 207 | paddd 12*16-8*16(SHA256CONSTANTS), MSG |
203 | sha256rnds2 STATE0, STATE1 | 208 | sha256rnds2 MSG, STATE0, STATE1 |
204 | mova128 MSGTMP0, XMMTMP | 209 | mova128 MSGTMP0, XMMTMP |
205 | palignr $4, MSGTMP3, XMMTMP | 210 | palignr $4, MSGTMP3, XMMTMP |
206 | paddd XMMTMP, MSGTMP1 | 211 | paddd XMMTMP, MSGTMP1 |
207 | sha256msg2 MSGTMP0, MSGTMP1 | 212 | sha256msg2 MSGTMP0, MSGTMP1 |
208 | shuf128_32 $0x0E, MSG, MSG | 213 | shuf128_32 $0x0E, MSG, MSG |
209 | sha256rnds2 STATE1, STATE0 | 214 | sha256rnds2 MSG, STATE1, STATE0 |
210 | sha256msg1 MSGTMP0, MSGTMP3 | 215 | sha256msg1 MSGTMP0, MSGTMP3 |
211 | 216 | ||
212 | /* Rounds 52-55 */ | 217 | /* Rounds 52-55 */ |
213 | mova128 MSGTMP1, MSG | 218 | mova128 MSGTMP1, MSG |
214 | paddd 13*16-8*16(SHA256CONSTANTS), MSG | 219 | paddd 13*16-8*16(SHA256CONSTANTS), MSG |
215 | sha256rnds2 STATE0, STATE1 | 220 | sha256rnds2 MSG, STATE0, STATE1 |
216 | mova128 MSGTMP1, XMMTMP | 221 | mova128 MSGTMP1, XMMTMP |
217 | palignr $4, MSGTMP0, XMMTMP | 222 | palignr $4, MSGTMP0, XMMTMP |
218 | paddd XMMTMP, MSGTMP2 | 223 | paddd XMMTMP, MSGTMP2 |
219 | sha256msg2 MSGTMP1, MSGTMP2 | 224 | sha256msg2 MSGTMP1, MSGTMP2 |
220 | shuf128_32 $0x0E, MSG, MSG | 225 | shuf128_32 $0x0E, MSG, MSG |
221 | sha256rnds2 STATE1, STATE0 | 226 | sha256rnds2 MSG, STATE1, STATE0 |
222 | 227 | ||
223 | /* Rounds 56-59 */ | 228 | /* Rounds 56-59 */ |
224 | mova128 MSGTMP2, MSG | 229 | mova128 MSGTMP2, MSG |
225 | paddd 14*16-8*16(SHA256CONSTANTS), MSG | 230 | paddd 14*16-8*16(SHA256CONSTANTS), MSG |
226 | sha256rnds2 STATE0, STATE1 | 231 | sha256rnds2 MSG, STATE0, STATE1 |
227 | mova128 MSGTMP2, XMMTMP | 232 | mova128 MSGTMP2, XMMTMP |
228 | palignr $4, MSGTMP1, XMMTMP | 233 | palignr $4, MSGTMP1, XMMTMP |
229 | paddd XMMTMP, MSGTMP3 | 234 | paddd XMMTMP, MSGTMP3 |
230 | sha256msg2 MSGTMP2, MSGTMP3 | 235 | sha256msg2 MSGTMP2, MSGTMP3 |
231 | shuf128_32 $0x0E, MSG, MSG | 236 | shuf128_32 $0x0E, MSG, MSG |
232 | sha256rnds2 STATE1, STATE0 | 237 | sha256rnds2 MSG, STATE1, STATE0 |
233 | 238 | ||
234 | /* Rounds 60-63 */ | 239 | /* Rounds 60-63 */ |
235 | mova128 MSGTMP3, MSG | 240 | mova128 MSGTMP3, MSG |
236 | paddd 15*16-8*16(SHA256CONSTANTS), MSG | 241 | paddd 15*16-8*16(SHA256CONSTANTS), MSG |
237 | sha256rnds2 STATE0, STATE1 | 242 | sha256rnds2 MSG, STATE0, STATE1 |
238 | shuf128_32 $0x0E, MSG, MSG | 243 | shuf128_32 $0x0E, MSG, MSG |
239 | sha256rnds2 STATE1, STATE0 | 244 | sha256rnds2 MSG, STATE1, STATE0 |
240 | 245 | ||
241 | /* Add current hash values with previously saved */ | 246 | /* Add current hash values with previously saved */ |
242 | paddd ABEF_SAVE, STATE0 | 247 | paddd SAVE0, STATE0 |
243 | paddd CDGH_SAVE, STATE1 | 248 | paddd SAVE1, STATE1 |
244 | 249 | ||
245 | /* Write hash values back in the correct order */ | 250 | /* Write hash values back in the correct order */ |
246 | /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ | ||
247 | /* STATE1: CDGH */ | ||
248 | mova128 STATE0, XMMTMP | 251 | mova128 STATE0, XMMTMP |
249 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ | 252 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ |
250 | shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ | 253 | /* --- -------------- HGDC -- FEBA */ |
251 | shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ | 254 | shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ |
252 | 255 | shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ | |
253 | movu128 STATE0, 80+0*16(%rdi) | 256 | movu128 STATE0, 80+0*16(%rdi) |
254 | movu128 XMMTMP, 80+1*16(%rdi) | 257 | movu128 XMMTMP, 80+1*16(%rdi) |
255 | 258 | ||
@@ -257,7 +260,7 @@ sha256_process_block64_shaNI: | |||
257 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI | 260 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI |
258 | 261 | ||
259 | .section .rodata.cst256.K256, "aM", @progbits, 256 | 262 | .section .rodata.cst256.K256, "aM", @progbits, 256 |
260 | .balign 16 | 263 | .balign 16 |
261 | K256: | 264 | K256: |
262 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | 265 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 |
263 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | 266 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 |
@@ -277,8 +280,8 @@ K256: | |||
277 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | 280 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 |
278 | 281 | ||
279 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 | 282 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 |
280 | .balign 16 | 283 | .balign 16 |
281 | PSHUFFLE_BSWAP32_FLIP_MASK: | 284 | PSHUFFLE_BSWAP32_FLIP_MASK: |
282 | .octa 0x0c0d0e0f08090a0b0405060700010203 | 285 | .octa 0x0c0d0e0f08090a0b0405060700010203 |
283 | 286 | ||
284 | #endif | 287 | #endif |