aboutsummaryrefslogtreecommitdiff
path: root/libbb/hash_md5_sha256_x86-64_shaNI.S
diff options
context:
space:
mode:
Diffstat (limited to 'libbb/hash_md5_sha256_x86-64_shaNI.S')
-rw-r--r--libbb/hash_md5_sha256_x86-64_shaNI.S281
1 files changed, 281 insertions, 0 deletions
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S
new file mode 100644
index 000000000..1c2b75af3
--- /dev/null
+++ b/libbb/hash_md5_sha256_x86-64_shaNI.S
@@ -0,0 +1,281 @@
1#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
2/* The code is adapted from Linux kernel's source */
3
4// We use shorter insns, even though they are for "wrong"
5// data type (fp, not int).
6// For Intel, there is no penalty for doing it at all
7// (CPUs which do have such penalty do not support SHA insns).
8// For AMD, the penalty is one extra cycle
9// (allegedly: I failed to find measurable difference).
10
11//#define mova128 movdqa
12#define mova128 movaps /* 128-bit move; a memory operand must be 16-byte aligned */
13//#define movu128 movdqu
14#define movu128 movups /* 128-bit move with no alignment requirement on memory operands */
15//#define shuf128_32 pshufd
16#define shuf128_32 shufps /* NB: shufps takes its two low result dwords from the destination and the two high ones from the source, so it matches pshufd only when both register operands are the same - true for every use below */
17
18 .section .text.sha256_process_block64_shaNI, "ax", @progbits
19 .globl sha256_process_block64_shaNI
20 .hidden sha256_process_block64_shaNI
21 .type sha256_process_block64_shaNI, @function
22
/*
 * sha256_process_block64_shaNI: run the 64 SHA-256 rounds over one 64-byte
 * block using the x86 SHA extension insns (sha256rnds2/sha256msg1/sha256msg2)
 * and add the result into the hash state.
 *
 * In:   %rdi = base pointer.
 *         0..63(%rdi)   the message block (read only)
 *         80..111(%rdi) eight 32-bit hash words (read, then written back)
 *       NOTE(review): presumably %rdi is the C-side hash context, with the
 *       block buffer at offset 0 and the hash array at offset 80. Confirm
 *       against the C struct layout, since both offsets are hard-coded here.
 * Out:  nothing in registers; the updated hash words are stored to memory.
 * Clobbers: %rax, %xmm0-%xmm10.
 * Stack: not used. No calls are made.
 * Alignment: every access through %rdi uses movu128, so this code places no
 *       alignment requirement on that pointer. K256 and the shuffle mask are
 *       accessed with aligned operands and must stay 16-byte aligned.
 *
 * Message schedule, for each group of four words W[t..t+3], t >= 16:
 *   sha256msg1   forms W[t-16] + sigma0(W[t-15])
 *   palignr $4   extracts W[t-7..t-4] from two neighbouring groups,
 *   paddd        adds it in
 *   sha256msg2   adds the sigma1(W[t-2]) terms, completing the group
 * MSGTMP0..MSGTMP3 are reused in rotation to hold the four most recent groups.
 */
23#define DATA_PTR %rdi /* base of the 64-byte message block */
24
25#define SHA256CONSTANTS %rax /* set to the address of K256 below */
26
27#define MSG %xmm0 /* must be %xmm0: sha256rnds2 reads its W+K input implicitly from %xmm0 */
28#define STATE0 %xmm1 /* holds A,B,E,F at the start of every 4-round group */
29#define STATE1 %xmm2 /* holds C,D,G,H at the start of every 4-round group */
30#define MSGTMP0 %xmm3 /* MSGTMP0..3: rolling window of message schedule groups */
31#define MSGTMP1 %xmm4
32#define MSGTMP2 %xmm5
33#define MSGTMP3 %xmm6
34#define MSGTMP4 %xmm7 /* scratch */
35
36#define SHUF_MASK %xmm8 /* pshufb control: byte-swap each 32-bit word */
37
38#define ABEF_SAVE %xmm9 /* copy of the incoming state, added back after round 63 */
39#define CDGH_SAVE %xmm10
40
41 .balign 8 # allow decoders to fetch at least 2 first insns
42sha256_process_block64_shaNI:
43 movu128 80+0*16(%rdi), STATE0 /* hash words 0-3: D,C,B,A (high to low) */
44 movu128 80+1*16(%rdi), STATE1 /* hash words 4-7: H,G,F,E (high to low) */
45
 /* Regroup the state into the ABEF / CDGH layout that sha256rnds2 operates on */
46 shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */
47 shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */
48 mova128 STATE0, MSGTMP4
49 palignr $8, STATE1, STATE0 /* ABEF */
50 pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
51
52 mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), SHUF_MASK /* aligned load: the mask is .balign 16 */
53 lea K256(%rip), SHA256CONSTANTS
54
55 /* Save hash values for addition after rounds */
56 mova128 STATE0, ABEF_SAVE
57 mova128 STATE1, CDGH_SAVE
58
59 /* Rounds 0-3: load W[0..3] from the block and byte-swap each word */
60 movu128 0*16(DATA_PTR), MSG
61 pshufb SHUF_MASK, MSG
62 mova128 MSG, MSGTMP0 /* keep W[0..3] for the message schedule */
63 paddd 0*16(SHA256CONSTANTS), MSG /* MSG = W + K for these four rounds */
64 sha256rnds2 STATE0, STATE1 /* two rounds; STATE1 now holds the new ABEF and STATE0 serves as CDGH */
65 shuf128_32 $0x0E, MSG, MSG /* move the upper two W+K dwords into the low half */
66 sha256rnds2 STATE1, STATE0 /* two more rounds; STATE0 = ABEF, STATE1 = CDGH again */
67
68 /* Rounds 4-7: load W[4..7]; begin W[16..19] in MSGTMP0 */
69 movu128 1*16(DATA_PTR), MSG
70 pshufb SHUF_MASK, MSG
71 mova128 MSG, MSGTMP1
72 paddd 1*16(SHA256CONSTANTS), MSG
73 sha256rnds2 STATE0, STATE1
74 shuf128_32 $0x0E, MSG, MSG
75 sha256rnds2 STATE1, STATE0
76 sha256msg1 MSGTMP1, MSGTMP0
77
78 /* Rounds 8-11: load W[8..11]; begin W[20..23] in MSGTMP1 */
79 movu128 2*16(DATA_PTR), MSG
80 pshufb SHUF_MASK, MSG
81 mova128 MSG, MSGTMP2
82 paddd 2*16(SHA256CONSTANTS), MSG
83 sha256rnds2 STATE0, STATE1
84 shuf128_32 $0x0E, MSG, MSG
85 sha256rnds2 STATE1, STATE0
86 sha256msg1 MSGTMP2, MSGTMP1
87
88 /* Rounds 12-15: load W[12..15]; complete W[16..19] in MSGTMP0; begin W[24..27] in MSGTMP2 */
89 movu128 3*16(DATA_PTR), MSG
90 pshufb SHUF_MASK, MSG
91 mova128 MSG, MSGTMP3
92 paddd 3*16(SHA256CONSTANTS), MSG
93 sha256rnds2 STATE0, STATE1
94 mova128 MSGTMP3, MSGTMP4
95 palignr $4, MSGTMP2, MSGTMP4 /* MSGTMP4 = W[9..12], the W[t-7] terms */
96 paddd MSGTMP4, MSGTMP0
97 sha256msg2 MSGTMP3, MSGTMP0
98 shuf128_32 $0x0E, MSG, MSG
99 sha256rnds2 STATE1, STATE0
100 sha256msg1 MSGTMP3, MSGTMP2
101
102 /* Rounds 16-19: use W[16..19]; complete W[20..23] in MSGTMP1; begin W[28..31] in MSGTMP3 */
103 mova128 MSGTMP0, MSG
104 paddd 4*16(SHA256CONSTANTS), MSG
105 sha256rnds2 STATE0, STATE1
106 mova128 MSGTMP0, MSGTMP4
107 palignr $4, MSGTMP3, MSGTMP4
108 paddd MSGTMP4, MSGTMP1
109 sha256msg2 MSGTMP0, MSGTMP1
110 shuf128_32 $0x0E, MSG, MSG
111 sha256rnds2 STATE1, STATE0
112 sha256msg1 MSGTMP0, MSGTMP3
113
114 /* Rounds 20-23: use W[20..23]; complete W[24..27] in MSGTMP2; begin W[32..35] in MSGTMP0 */
115 mova128 MSGTMP1, MSG
116 paddd 5*16(SHA256CONSTANTS), MSG
117 sha256rnds2 STATE0, STATE1
118 mova128 MSGTMP1, MSGTMP4
119 palignr $4, MSGTMP0, MSGTMP4
120 paddd MSGTMP4, MSGTMP2
121 sha256msg2 MSGTMP1, MSGTMP2
122 shuf128_32 $0x0E, MSG, MSG
123 sha256rnds2 STATE1, STATE0
124 sha256msg1 MSGTMP1, MSGTMP0
125
126 /* Rounds 24-27: use W[24..27]; complete W[28..31] in MSGTMP3; begin W[36..39] in MSGTMP1 */
127 mova128 MSGTMP2, MSG
128 paddd 6*16(SHA256CONSTANTS), MSG
129 sha256rnds2 STATE0, STATE1
130 mova128 MSGTMP2, MSGTMP4
131 palignr $4, MSGTMP1, MSGTMP4
132 paddd MSGTMP4, MSGTMP3
133 sha256msg2 MSGTMP2, MSGTMP3
134 shuf128_32 $0x0E, MSG, MSG
135 sha256rnds2 STATE1, STATE0
136 sha256msg1 MSGTMP2, MSGTMP1
137
138 /* Rounds 28-31: use W[28..31]; complete W[32..35] in MSGTMP0; begin W[40..43] in MSGTMP2 */
139 mova128 MSGTMP3, MSG
140 paddd 7*16(SHA256CONSTANTS), MSG
141 sha256rnds2 STATE0, STATE1
142 mova128 MSGTMP3, MSGTMP4
143 palignr $4, MSGTMP2, MSGTMP4
144 paddd MSGTMP4, MSGTMP0
145 sha256msg2 MSGTMP3, MSGTMP0
146 shuf128_32 $0x0E, MSG, MSG
147 sha256rnds2 STATE1, STATE0
148 sha256msg1 MSGTMP3, MSGTMP2
149
150 /* Rounds 32-35: use W[32..35]; complete W[36..39] in MSGTMP1; begin W[44..47] in MSGTMP3 */
151 mova128 MSGTMP0, MSG
152 paddd 8*16(SHA256CONSTANTS), MSG
153 sha256rnds2 STATE0, STATE1
154 mova128 MSGTMP0, MSGTMP4
155 palignr $4, MSGTMP3, MSGTMP4
156 paddd MSGTMP4, MSGTMP1
157 sha256msg2 MSGTMP0, MSGTMP1
158 shuf128_32 $0x0E, MSG, MSG
159 sha256rnds2 STATE1, STATE0
160 sha256msg1 MSGTMP0, MSGTMP3
161
162 /* Rounds 36-39: use W[36..39]; complete W[40..43] in MSGTMP2; begin W[48..51] in MSGTMP0 */
163 mova128 MSGTMP1, MSG
164 paddd 9*16(SHA256CONSTANTS), MSG
165 sha256rnds2 STATE0, STATE1
166 mova128 MSGTMP1, MSGTMP4
167 palignr $4, MSGTMP0, MSGTMP4
168 paddd MSGTMP4, MSGTMP2
169 sha256msg2 MSGTMP1, MSGTMP2
170 shuf128_32 $0x0E, MSG, MSG
171 sha256rnds2 STATE1, STATE0
172 sha256msg1 MSGTMP1, MSGTMP0
173
174 /* Rounds 40-43: use W[40..43]; complete W[44..47] in MSGTMP3; begin W[52..55] in MSGTMP1 */
175 mova128 MSGTMP2, MSG
176 paddd 10*16(SHA256CONSTANTS), MSG
177 sha256rnds2 STATE0, STATE1
178 mova128 MSGTMP2, MSGTMP4
179 palignr $4, MSGTMP1, MSGTMP4
180 paddd MSGTMP4, MSGTMP3
181 sha256msg2 MSGTMP2, MSGTMP3
182 shuf128_32 $0x0E, MSG, MSG
183 sha256rnds2 STATE1, STATE0
184 sha256msg1 MSGTMP2, MSGTMP1
185
186 /* Rounds 44-47: use W[44..47]; complete W[48..51] in MSGTMP0; begin W[56..59] in MSGTMP2 */
187 mova128 MSGTMP3, MSG
188 paddd 11*16(SHA256CONSTANTS), MSG
189 sha256rnds2 STATE0, STATE1
190 mova128 MSGTMP3, MSGTMP4
191 palignr $4, MSGTMP2, MSGTMP4
192 paddd MSGTMP4, MSGTMP0
193 sha256msg2 MSGTMP3, MSGTMP0
194 shuf128_32 $0x0E, MSG, MSG
195 sha256rnds2 STATE1, STATE0
196 sha256msg1 MSGTMP3, MSGTMP2
197
198 /* Rounds 48-51: use W[48..51]; complete W[52..55] in MSGTMP1; begin W[60..63] in MSGTMP3 */
199 mova128 MSGTMP0, MSG
200 paddd 12*16(SHA256CONSTANTS), MSG
201 sha256rnds2 STATE0, STATE1
202 mova128 MSGTMP0, MSGTMP4
203 palignr $4, MSGTMP3, MSGTMP4
204 paddd MSGTMP4, MSGTMP1
205 sha256msg2 MSGTMP0, MSGTMP1
206 shuf128_32 $0x0E, MSG, MSG
207 sha256rnds2 STATE1, STATE0
208 sha256msg1 MSGTMP0, MSGTMP3
209
210 /* Rounds 52-55: use W[52..55]; complete W[56..59] in MSGTMP2 (no sha256msg1 from here on: nothing beyond W[63] is needed) */
211 mova128 MSGTMP1, MSG
212 paddd 13*16(SHA256CONSTANTS), MSG
213 sha256rnds2 STATE0, STATE1
214 mova128 MSGTMP1, MSGTMP4
215 palignr $4, MSGTMP0, MSGTMP4
216 paddd MSGTMP4, MSGTMP2
217 sha256msg2 MSGTMP1, MSGTMP2
218 shuf128_32 $0x0E, MSG, MSG
219 sha256rnds2 STATE1, STATE0
220
221 /* Rounds 56-59: use W[56..59]; complete W[60..63] in MSGTMP3 */
222 mova128 MSGTMP2, MSG
223 paddd 14*16(SHA256CONSTANTS), MSG
224 sha256rnds2 STATE0, STATE1
225 mova128 MSGTMP2, MSGTMP4
226 palignr $4, MSGTMP1, MSGTMP4
227 paddd MSGTMP4, MSGTMP3
228 sha256msg2 MSGTMP2, MSGTMP3
229 shuf128_32 $0x0E, MSG, MSG
230 sha256rnds2 STATE1, STATE0
231
232 /* Rounds 60-63: use W[60..63]; the schedule is complete */
233 mova128 MSGTMP3, MSG
234 paddd 15*16(SHA256CONSTANTS), MSG
235 sha256rnds2 STATE0, STATE1
236 shuf128_32 $0x0E, MSG, MSG
237 sha256rnds2 STATE1, STATE0
238
239 /* Add current hash values with previously saved */
240 paddd ABEF_SAVE, STATE0
241 paddd CDGH_SAVE, STATE1
242
243 /* Write hash values back in the correct order */
244 shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */
245 shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */
246 mova128 STATE0, MSGTMP4
247 pblendw $0xF0, STATE1, STATE0 /* DCBA */
248 palignr $8, MSGTMP4, STATE1 /* HGFE */
249
250 movu128 STATE0, 80+0*16(%rdi) /* store hash words 0-3 */
251 movu128 STATE1, 80+1*16(%rdi) /* store hash words 4-7 */
252
253 ret
254 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
255
256.section .rodata.cst256.K256, "aM", @progbits, 256
257.balign 16 /* required: the round code uses this table as an aligned paddd memory operand */
/*
 * K256: the 64 SHA-256 round constants K[0..63] (FIPS 180-4), four 32-bit
 * words per row. Row i is the operand for rounds 4*i..4*i+3 and is addressed
 * as i*16 from the table base.
 * The section is mergeable ("M") with an entry size of 256 bytes, i.e. the
 * whole table is a single entry.
 */
258K256:
259 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
260 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
261 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
262 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
263 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
264 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
265 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
266 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
267 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
268 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
269 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
270 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
271 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
272 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
273 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
274 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
275
276.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
277.balign 16 /* required: the mask is loaded with mova128 (movaps), which needs a 16-byte aligned operand */
/*
 * pshufb control mask. Its bytes in memory order are 3,2,1,0, 7,6,5,4,
 * 11,10,9,8, 15,14,13,12, so each 32-bit word of the shuffled register has
 * its four bytes reversed (a per-word byte swap of the message block).
 */
278PSHUFFLE_BSWAP32_FLIP_MASK:
279 .octa 0x0c0d0e0f08090a0b0405060700010203
280
281#endif