aboutsummaryrefslogtreecommitdiff
path: root/libbb/hash_md5_sha256_x86-64_shaNI.S
diff options
context:
space:
mode:
Diffstat (limited to 'libbb/hash_md5_sha256_x86-64_shaNI.S')
-rw-r--r--libbb/hash_md5_sha256_x86-64_shaNI.S284
1 files changed, 284 insertions, 0 deletions
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S
new file mode 100644
index 000000000..4663f750a
--- /dev/null
+++ b/libbb/hash_md5_sha256_x86-64_shaNI.S
@@ -0,0 +1,284 @@
1#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
2/* The code is adapted from Linux kernel's source */
3
4// We use shorter insns, even though they are for the "wrong"
5// data type (fp, not int): movaps/movups/shufps have no mandatory
// prefix byte, so each encodes shorter than movdqa/movdqu/pshufd.
6// For Intel, there is no penalty for doing it at all
7// (CPUs which do have such penalty do not support SHA insns).
8// For AMD, the penalty is one extra cycle
9// (allegedly: I failed to find measurable difference).
10
/* 128-bit aligned register/memory move. */
11//#define mova128 movdqa
12#define mova128 movaps
/* 128-bit unaligned register/memory move. */
13//#define movu128 movdqu
14#define movu128 movups
/*
 * 32-bit lane shuffle. shufps is not a general replacement for pshufd:
 * it takes result dwords 0,1 from its destination and dwords 2,3 from
 * its source. This file only ever uses it as
 * "shuf128_32 $0x0E, MSG, MSG" (source == destination), where both
 * instructions give the same result.
 */
15//#define shuf128_32 pshufd
16#define shuf128_32 shufps
17
/*
 * Mark this object as not needing an executable stack. Hand-written
 * assembly gets no .note.GNU-stack section automatically, and GNU ld
 * treats an input object without one as requesting an executable stack
 * (newer binutils also warn about it). The note is an empty section, so
 * it adds nothing to the final image.
 */
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack, "", @progbits
#endif
/*
 * The function lives in its own allocatable+executable text section and
 * is a global symbol with hidden ELF visibility, typed as a function.
 */
18 .section .text.sha256_process_block64_shaNI, "ax", @progbits
19 .globl sha256_process_block64_shaNI
20 .hidden sha256_process_block64_shaNI
21 .type sha256_process_block64_shaNI, @function
22
/*
 * Register roles for sha256_process_block64_shaNI.
 */
/* Base pointer the function reads its input from and writes its result to
 * (presumably the first integer argument under the SysV AMD64 ABI - confirm
 * against the C prototype). */
23#define DATA_PTR %rdi
24
/* Base address used to index the K256 constant table. */
25#define SHA256CONSTANTS %rax
26
/* Message words plus round constants for the current rounds. This must be
 * %xmm0: sha256rnds2 takes it as an implicit operand. */
27#define MSG %xmm0
/* Working state, split the way sha256rnds2 expects: STATE0 = ABEF and
 * STATE1 = CDGH (msb-to-lsb dword order). */
28#define STATE0 %xmm1
29#define STATE1 %xmm2
/* Rolling window of four 4-dword groups of message schedule words. */
30#define MSGTMP0 %xmm3
31#define MSGTMP1 %xmm4
32#define MSGTMP2 %xmm5
33#define MSGTMP3 %xmm6
34
/* Scratch: holds the initial state half, then the byte-swap mask while the
 * message block is loaded (rounds 0-15), then palignr temporaries. */
35#define XMMTMP %xmm7
36
/* Copies of the incoming state, added back after the last round. */
37#define ABEF_SAVE %xmm9
38#define CDGH_SAVE %xmm10
39
/* Build a shufps immediate: a and b select the dwords placed in result
 * lanes 0 and 1, c and d those placed in lanes 2 and 3. The arguments are
 * not parenthesized, so pass plain numbers only. */
40#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))
41
/*
 * sha256_process_block64_shaNI: run the 64 SHA-256 rounds over one 64-byte
 * block using the x86 SHA extensions (sha256rnds2, sha256msg1, sha256msg2)
 * and add the result into the running hash state.
 *
 * In:    %rdi = base pointer. Bytes 0..63 are read as the message block;
 *        bytes 80..111 are read and then rewritten as the eight 32-bit
 *        state words A..H. (Presumably this is the hashing context of the
 *        C caller - TODO confirm both offsets against its declaration.)
 * Out:   nothing in registers; the updated state is stored at 80(%rdi).
 * Uses:  %rax, %xmm0-%xmm7, %xmm9, %xmm10 are overwritten. No stack space
 *        is allocated and no other function is called.
 *
 * sha256rnds2 performs two rounds and reads "message word + round constant"
 * for them from the low two dwords of %xmm0 (MSG). Each four-round group
 * therefore adds four constants to four message words, runs sha256rnds2,
 * moves dwords 2,3 of MSG down to dwords 0,1 and runs sha256rnds2 again
 * with the two state registers swapped.
 */
42 .balign 8 # allow decoders to fetch at least 2 first insns
43sha256_process_block64_shaNI:
44
/* Load the eight state words and regroup them into ABEF / CDGH. */
45 movu128 80+0*16(%rdi), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */
46 movu128 80+1*16(%rdi), STATE1 /* HGFE */
47/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
48 mova128 STATE1, STATE0
49 shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */
50 shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */
51
52/* XMMTMP holds flip mask from here... */
53 mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP
/* The table base is biased by 8*16 bytes; every access below subtracts
 * 8*16 again, so the displacements run from -128 to +112 and each fits
 * in a signed byte. */
54 leaq K256+8*16(%rip), SHA256CONSTANTS
55
56 /* Save hash values for addition after rounds */
57 mova128 STATE0, ABEF_SAVE
58 mova128 STATE1, CDGH_SAVE
59
60 /* Rounds 0-3 */
/* Load 16 message bytes, reverse the bytes of each dword, and keep a copy
 * in MSGTMP0 as input for the message schedule. */
61 movu128 0*16(DATA_PTR), MSG
62 pshufb XMMTMP, MSG
63 mova128 MSG, MSGTMP0
64 paddd 0*16-8*16(SHA256CONSTANTS), MSG
65 sha256rnds2 STATE0, STATE1
66 shuf128_32 $0x0E, MSG, MSG
67 sha256rnds2 STATE1, STATE0
68
69 /* Rounds 4-7 */
70 movu128 1*16(DATA_PTR), MSG
71 pshufb XMMTMP, MSG
72 mova128 MSG, MSGTMP1
73 paddd 1*16-8*16(SHA256CONSTANTS), MSG
74 sha256rnds2 STATE0, STATE1
75 shuf128_32 $0x0E, MSG, MSG
76 sha256rnds2 STATE1, STATE0
/* First schedule step for the words consumed in rounds 16-19; it is
 * completed by the palignr/paddd/sha256msg2 sequence in rounds 12-15. */
77 sha256msg1 MSGTMP1, MSGTMP0
78
79 /* Rounds 8-11 */
80 movu128 2*16(DATA_PTR), MSG
81 pshufb XMMTMP, MSG
82 mova128 MSG, MSGTMP2
83 paddd 2*16-8*16(SHA256CONSTANTS), MSG
84 sha256rnds2 STATE0, STATE1
85 shuf128_32 $0x0E, MSG, MSG
86 sha256rnds2 STATE1, STATE0
87 sha256msg1 MSGTMP2, MSGTMP1
88
89 /* Rounds 12-15 */
90 movu128 3*16(DATA_PTR), MSG
91 pshufb XMMTMP, MSG
92/* ...to here */
93 mova128 MSG, MSGTMP3
94 paddd 3*16-8*16(SHA256CONSTANTS), MSG
95 sha256rnds2 STATE0, STATE1
/* Finish the four schedule words for rounds 16-19 in MSGTMP0: add the
 * four dwords that straddle MSGTMP2/MSGTMP3 (dwords 1..3 of MSGTMP2
 * followed by dword 0 of MSGTMP3), then apply sha256msg2. */
96 mova128 MSGTMP3, XMMTMP
97 palignr $4, MSGTMP2, XMMTMP
98 paddd XMMTMP, MSGTMP0
99 sha256msg2 MSGTMP3, MSGTMP0
100 shuf128_32 $0x0E, MSG, MSG
101 sha256rnds2 STATE1, STATE0
102 sha256msg1 MSGTMP3, MSGTMP2
103
/* Rounds 16-51 repeat one pattern, with MSGTMP0..3 rotating roles:
 * take the schedule words finished in the previous group, add the
 * constants and run four rounds, and meanwhile finish the next group's
 * words (palignr + paddd + sha256msg2) and start a later group's words
 * (sha256msg1). */
104 /* Rounds 16-19 */
105 mova128 MSGTMP0, MSG
106 paddd 4*16-8*16(SHA256CONSTANTS), MSG
107 sha256rnds2 STATE0, STATE1
108 mova128 MSGTMP0, XMMTMP
109 palignr $4, MSGTMP3, XMMTMP
110 paddd XMMTMP, MSGTMP1
111 sha256msg2 MSGTMP0, MSGTMP1
112 shuf128_32 $0x0E, MSG, MSG
113 sha256rnds2 STATE1, STATE0
114 sha256msg1 MSGTMP0, MSGTMP3
115
116 /* Rounds 20-23 */
117 mova128 MSGTMP1, MSG
118 paddd 5*16-8*16(SHA256CONSTANTS), MSG
119 sha256rnds2 STATE0, STATE1
120 mova128 MSGTMP1, XMMTMP
121 palignr $4, MSGTMP0, XMMTMP
122 paddd XMMTMP, MSGTMP2
123 sha256msg2 MSGTMP1, MSGTMP2
124 shuf128_32 $0x0E, MSG, MSG
125 sha256rnds2 STATE1, STATE0
126 sha256msg1 MSGTMP1, MSGTMP0
127
128 /* Rounds 24-27 */
129 mova128 MSGTMP2, MSG
130 paddd 6*16-8*16(SHA256CONSTANTS), MSG
131 sha256rnds2 STATE0, STATE1
132 mova128 MSGTMP2, XMMTMP
133 palignr $4, MSGTMP1, XMMTMP
134 paddd XMMTMP, MSGTMP3
135 sha256msg2 MSGTMP2, MSGTMP3
136 shuf128_32 $0x0E, MSG, MSG
137 sha256rnds2 STATE1, STATE0
138 sha256msg1 MSGTMP2, MSGTMP1
139
140 /* Rounds 28-31 */
141 mova128 MSGTMP3, MSG
142 paddd 7*16-8*16(SHA256CONSTANTS), MSG
143 sha256rnds2 STATE0, STATE1
144 mova128 MSGTMP3, XMMTMP
145 palignr $4, MSGTMP2, XMMTMP
146 paddd XMMTMP, MSGTMP0
147 sha256msg2 MSGTMP3, MSGTMP0
148 shuf128_32 $0x0E, MSG, MSG
149 sha256rnds2 STATE1, STATE0
150 sha256msg1 MSGTMP3, MSGTMP2
151
152 /* Rounds 32-35 */
153 mova128 MSGTMP0, MSG
154 paddd 8*16-8*16(SHA256CONSTANTS), MSG
155 sha256rnds2 STATE0, STATE1
156 mova128 MSGTMP0, XMMTMP
157 palignr $4, MSGTMP3, XMMTMP
158 paddd XMMTMP, MSGTMP1
159 sha256msg2 MSGTMP0, MSGTMP1
160 shuf128_32 $0x0E, MSG, MSG
161 sha256rnds2 STATE1, STATE0
162 sha256msg1 MSGTMP0, MSGTMP3
163
164 /* Rounds 36-39 */
165 mova128 MSGTMP1, MSG
166 paddd 9*16-8*16(SHA256CONSTANTS), MSG
167 sha256rnds2 STATE0, STATE1
168 mova128 MSGTMP1, XMMTMP
169 palignr $4, MSGTMP0, XMMTMP
170 paddd XMMTMP, MSGTMP2
171 sha256msg2 MSGTMP1, MSGTMP2
172 shuf128_32 $0x0E, MSG, MSG
173 sha256rnds2 STATE1, STATE0
174 sha256msg1 MSGTMP1, MSGTMP0
175
176 /* Rounds 40-43 */
177 mova128 MSGTMP2, MSG
178 paddd 10*16-8*16(SHA256CONSTANTS), MSG
179 sha256rnds2 STATE0, STATE1
180 mova128 MSGTMP2, XMMTMP
181 palignr $4, MSGTMP1, XMMTMP
182 paddd XMMTMP, MSGTMP3
183 sha256msg2 MSGTMP2, MSGTMP3
184 shuf128_32 $0x0E, MSG, MSG
185 sha256rnds2 STATE1, STATE0
186 sha256msg1 MSGTMP2, MSGTMP1
187
188 /* Rounds 44-47 */
189 mova128 MSGTMP3, MSG
190 paddd 11*16-8*16(SHA256CONSTANTS), MSG
191 sha256rnds2 STATE0, STATE1
192 mova128 MSGTMP3, XMMTMP
193 palignr $4, MSGTMP2, XMMTMP
194 paddd XMMTMP, MSGTMP0
195 sha256msg2 MSGTMP3, MSGTMP0
196 shuf128_32 $0x0E, MSG, MSG
197 sha256rnds2 STATE1, STATE0
198 sha256msg1 MSGTMP3, MSGTMP2
199
200 /* Rounds 48-51 */
201 mova128 MSGTMP0, MSG
202 paddd 12*16-8*16(SHA256CONSTANTS), MSG
203 sha256rnds2 STATE0, STATE1
204 mova128 MSGTMP0, XMMTMP
205 palignr $4, MSGTMP3, XMMTMP
206 paddd XMMTMP, MSGTMP1
207 sha256msg2 MSGTMP0, MSGTMP1
208 shuf128_32 $0x0E, MSG, MSG
209 sha256rnds2 STATE1, STATE0
210 sha256msg1 MSGTMP0, MSGTMP3
211
/* From here on no new sha256msg1 is issued: this group and the next only
 * finish the two schedule groups that were already started. */
212 /* Rounds 52-55 */
213 mova128 MSGTMP1, MSG
214 paddd 13*16-8*16(SHA256CONSTANTS), MSG
215 sha256rnds2 STATE0, STATE1
216 mova128 MSGTMP1, XMMTMP
217 palignr $4, MSGTMP0, XMMTMP
218 paddd XMMTMP, MSGTMP2
219 sha256msg2 MSGTMP1, MSGTMP2
220 shuf128_32 $0x0E, MSG, MSG
221 sha256rnds2 STATE1, STATE0
222
223 /* Rounds 56-59 */
224 mova128 MSGTMP2, MSG
225 paddd 14*16-8*16(SHA256CONSTANTS), MSG
226 sha256rnds2 STATE0, STATE1
227 mova128 MSGTMP2, XMMTMP
228 palignr $4, MSGTMP1, XMMTMP
229 paddd XMMTMP, MSGTMP3
230 sha256msg2 MSGTMP2, MSGTMP3
231 shuf128_32 $0x0E, MSG, MSG
232 sha256rnds2 STATE1, STATE0
233
/* Last group: MSGTMP3 was completed in rounds 56-59, so only the four
 * rounds themselves remain. */
234 /* Rounds 60-63 */
235 mova128 MSGTMP3, MSG
236 paddd 15*16-8*16(SHA256CONSTANTS), MSG
237 sha256rnds2 STATE0, STATE1
238 shuf128_32 $0x0E, MSG, MSG
239 sha256rnds2 STATE1, STATE0
240
241 /* Add current hash values with previously saved */
242 paddd ABEF_SAVE, STATE0
243 paddd CDGH_SAVE, STATE1
244
245 /* Write hash values back in the correct order */
246 /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */
247 /* STATE1: CDGH */
/* Inverse of the regrouping done on entry: rebuild the DCBA / HGFE memory
 * layout from the ABEF / CDGH register layout. */
248 mova128 STATE0, XMMTMP
249/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
250 shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */
251 shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */
252
253 movu128 STATE0, 80+0*16(%rdi)
254 movu128 XMMTMP, 80+1*16(%rdi)
255
256 ret
257 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
258
/*
 * K256: the 64 SHA-256 round constants as 32-bit words, four per line, so
 * each line is the 16-byte vector added to the message words of one
 * four-round group. The table is exactly 256 bytes and sits in a mergeable
 * ("M") read-only section whose entity size is 256. It is 16-byte aligned
 * because sha256_process_block64_shaNI reads it through paddd memory
 * operands, which require an aligned address. That function addresses the
 * table relative to K256+8*16.
 */
259 .section .rodata.cst256.K256, "aM", @progbits, 256
260 .balign 16
261K256:
262 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
263 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
264 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
265 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
266 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
267 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
268 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
269 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
270 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
271 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
272 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
273 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
274 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
275 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
276 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
277 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
278
/*
 * pshufb control mask. In memory order its bytes are
 * 03 02 01 00 07 06 05 04 0b 0a 09 08 0f 0e 0d 0c, so result byte i is
 * taken from source byte (i ^ 3): the four bytes inside every 32-bit dword
 * are reversed while the dwords keep their positions.
 * sha256_process_block64_shaNI applies it to each 16-byte piece of the
 * message block. It is loaded with mova128, hence the 16-byte alignment;
 * the section is mergeable ("M") with entity size 16.
 */
279 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
280 .balign 16
281PSHUFFLE_BSWAP32_FLIP_MASK:
282 .octa 0x0c0d0e0f08090a0b0405060700010203
283
284#endif