aboutsummaryrefslogtreecommitdiff
path: root/libbb/hash_md5_sha256_x86-32_shaNI.S
diff options
context:
space:
mode:
Diffstat (limited to 'libbb/hash_md5_sha256_x86-32_shaNI.S')
-rw-r--r--libbb/hash_md5_sha256_x86-32_shaNI.S277
1 files changed, 277 insertions, 0 deletions
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
new file mode 100644
index 000000000..aa68193bd
--- /dev/null
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -0,0 +1,277 @@
#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
/* The code is adapted from Linux kernel's source */

/*
 * Syntax: GNU as, AT&T operand order (source first, destination last).
 * The file is run through the C preprocessor, hence the #define aliases.
 */

// Mnemonic aliases for 128-bit moves and the dword shuffle.
//
// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing it at all
// (CPUs which do have such penalty do not support SHA insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).
//
// NB: shufps is a drop-in replacement for pshufd only when both register
// operands are the same register, because shufps takes result dwords 0,1
// from the destination and dwords 2,3 from the source.
// Use shuf128_32 only in the "shuf128_32 $imm, REG, REG" form.

//#define mova128 movdqa
#define mova128 movaps
//#define movu128 movdqu
#define movu128 movups
//#define shuf128_32 pshufd
#define shuf128_32 shufps
17
/*
 * sha256_process_block64_shaNI
 *
 * Runs the 64 SHA-256 rounds over one 64-byte message block using the
 * x86 SHA extension insns (sha256rnds2, sha256msg1, sha256msg2) and adds
 * the result to the running hash.
 *
 * In:    %eax = base pointer. Relative to it the code reads
 *                 bytes 0..63                 message block
 *                 bytes HASH_OFS..HASH_OFS+31 eight 32-bit hash words A..H
 *               and writes the updated hash words back to the same place.
 *               The argument is taken from a register, not from the stack,
 *               so the C-side declaration must use a register-argument
 *               convention (TODO confirm against the prototype).
 * Out:   no register result; hash words are updated in memory.
 * Clobb: %ecx, %xmm0-%xmm7. No stack is used, flags are not modified.
 * Needs: SSSE3 (pshufb, palignr) and the SHA extensions. No CPU feature
 *        check is done here; presumably the caller selects this routine
 *        only on capable CPUs.
 * Align: the message block and hash words are accessed with unaligned
 *        moves; K256 and the byte-swap mask must be 16-byte aligned
 *        (they are used as aligned memory operands).
 */
	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
	.globl	sha256_process_block64_shaNI
	.hidden	sha256_process_block64_shaNI
	.type	sha256_process_block64_shaNI, @function

/* Base pointer: message block at offset 0, hash words at HASH_OFS */
#define DATA_PTR	%eax
/* Byte offset of the eight hash words relative to DATA_PTR */
#define HASH_OFS	76

/*
 * SHA256CONSTANTS points K_BIAS bytes into K256, so that all 16 rows are
 * reachable with displacements -128..+112 (each fits in one signed byte).
 * KROW(n) is the memory operand for row n (rounds 4n..4n+3).
 */
#define SHA256CONSTANTS	%ecx
#define K_BIAS		8*16
#define KROW(n)		n*16-K_BIAS(SHA256CONSTANTS)

/* MSG must be %xmm0: sha256rnds2 takes its W+K input from %xmm0 */
#define MSG		%xmm0
#define STATE0		%xmm1
#define STATE1		%xmm2
/* Rolling window of the four most recent message-schedule vectors */
#define MSGTMP0		%xmm3
#define MSGTMP1		%xmm4
#define MSGTMP2		%xmm5
#define MSGTMP3		%xmm6

/* Byte-swap mask during rounds 0-15, scratch register afterwards */
#define XMMTMP		%xmm7

/* Builds a shuffle immediate; a selects result dword 0, d result dword 3 */
#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))

	.balign	8	/* allow decoders to fetch at least 2 first insns */
sha256_process_block64_shaNI:

	/* Load the hash words and regroup them the way sha256rnds2 wants */
	movu128		HASH_OFS+0*16(DATA_PTR), XMMTMP	/* DCBA (msb-to-lsb: 3,2,1,0) */
	movu128		HASH_OFS+1*16(DATA_PTR), STATE1	/* HGFE */
	/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	mova128		STATE1, STATE0
	shufps		SHUF(1,0,1,0), XMMTMP, STATE0	/* ABEF */
	shufps		SHUF(3,2,3,2), XMMTMP, STATE1	/* CDGH */

	/* XMMTMP holds flip mask from here... */
	mova128		PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
	movl		$K256+K_BIAS, SHA256CONSTANTS

	/*
	 * Every group below handles four rounds:
	 *   MSG = W[4n..4n+3] + K[4n..4n+3]
	 *   sha256rnds2 consumes the two low dwords of MSG (2 rounds),
	 *   the shuffle moves the two high dwords down, and the second
	 *   sha256rnds2 does the other 2 rounds.
	 * STATE0/STATE1 swap roles between the two sha256rnds2 insns.
	 * Rounds 0-15 take W from the byte-swapped message block.
	 */

	/* Rounds 0-3 */
	movu128		0*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP0
	paddd		KROW(0), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 4-7 */
	movu128		1*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP1
	paddd		KROW(1), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 8-11 */
	movu128		2*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP2
	paddd		KROW(2), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/*
	 * From here on each group also advances the message schedule:
	 *   XMMTMP = palignr(current, previous) is added to the vector that
	 *   sha256msg1 prepared earlier, sha256msg2 completes it (it becomes
	 *   the W input four rounds later), and sha256msg1 starts preparing
	 *   the previous vector for its next use.
	 */

	/* Rounds 12-15 */
	movu128		3*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	/* ...to here. XMMTMP is scratch from now on */
	mova128		MSG, MSGTMP3
	paddd		KROW(3), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 16-19 */
	mova128		MSGTMP0, MSG
	paddd		KROW(4), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 20-23 */
	mova128		MSGTMP1, MSG
	paddd		KROW(5), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 24-27 */
	mova128		MSGTMP2, MSG
	paddd		KROW(6), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 28-31 */
	mova128		MSGTMP3, MSG
	paddd		KROW(7), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 32-35 */
	mova128		MSGTMP0, MSG
	paddd		KROW(8), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 36-39 */
	mova128		MSGTMP1, MSG
	paddd		KROW(9), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 40-43 */
	mova128		MSGTMP2, MSG
	paddd		KROW(10), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 44-47 */
	mova128		MSGTMP3, MSG
	paddd		KROW(11), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 48-51 */
	mova128		MSGTMP0, MSG
	paddd		KROW(12), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* The last three groups need no further sha256msg1 */

	/* Rounds 52-55 */
	mova128		MSGTMP1, MSG
	paddd		KROW(13), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 56-59 */
	mova128		MSGTMP2, MSG
	paddd		KROW(14), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 60-63 */
	mova128		MSGTMP3, MSG
	paddd		KROW(15), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Write hash values back in the correct order */
	/* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */
	/* STATE1: CDGH */
	mova128		STATE0, XMMTMP
	/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	shufps		SHUF(3,2,3,2), STATE1, STATE0	/* DCBA */
	shufps		SHUF(1,0,1,0), STATE1, XMMTMP	/* HGFE */
	/* add current hash values to previous ones */
	movu128		HASH_OFS+1*16(DATA_PTR), STATE1
	paddd		XMMTMP, STATE1
	movu128		STATE1, HASH_OFS+1*16(DATA_PTR)
	movu128		HASH_OFS+0*16(DATA_PTR), XMMTMP
	paddd		XMMTMP, STATE0
	movu128		STATE0, HASH_OFS+0*16(DATA_PTR)

	ret
	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
251
/*
 * K256: the 64 SHA-256 round constants, stored as 16 rows of four dwords.
 * Row n is added to the message words of rounds 4n..4n+3.
 * The table is used as an aligned 128-bit memory operand (paddd), so it
 * must stay 16-byte aligned.
 * It lives in a mergeable read-only section with a 256-byte entity size,
 * which equals the table size.
 */
	.section	.rodata.cst256.K256, "aM", @progbits, 256
	.balign	16
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
271
/*
 * pshufb control mask that reverses the byte order inside each 32-bit
 * lane (byte indexes, lowest first: 3,2,1,0, 7,6,5,4, 11,10,9,8,
 * 15,14,13,12). It converts the big-endian message words to host order.
 * Loaded with an aligned move, so it must stay 16-byte aligned.
 */
	.section	.rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BSWAP32_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203

#endif /* ENABLE_SHA256_HWACCEL && __GNUC__ && __i386__ */