diff options
author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2021-12-27 00:00:00 +0000 |
---|---|---|
committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2022-03-18 15:35:13 +0500 |
commit | f19f813537c7aea1c20749c914e756b54a9c3cf5 (patch) | |
tree | 816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /C/AesOpt.c | |
parent | 98e06a519b63b81986abe76d28887f6984a7732b (diff) | |
download | 7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.tar.gz 7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.tar.bz2 7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.zip |
tag: 21.07
Diffstat (limited to 'C/AesOpt.c')
-rw-r--r-- | C/AesOpt.c | 776 |
1 file changed, 776 insertions, 0 deletions
diff --git a/C/AesOpt.c b/C/AesOpt.c new file mode 100644 index 0000000..8be8ff6 --- /dev/null +++ b/C/AesOpt.c | |||
@@ -0,0 +1,776 @@ | |||
1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions | ||
2 | 2021-04-01 : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #include "Precomp.h" | ||
5 | |||
6 | #include "CpuArch.h" | ||
7 | |||
#ifdef MY_CPU_X86_OR_AMD64

/*
  Compiler capability detection for the x86/x64 path.
  USE_INTEL_AES  - compiler supports the AES-NI intrinsics (<wmmintrin.h>)
  USE_INTEL_VAES - compiler additionally supports VAES + AVX2 256-bit intrinsics
  ATTRIB_AES / ATTRIB_VAES - per-function __target__ attributes so the
  intrinsics compile without passing -maes / -mvaes for the whole file
  (runtime CPU dispatch is assumed to be done by the caller).
*/
#if defined(__clang__)
  /* clang >= 3.8 handles the aes target attribute; vaes intrinsics need clang >= 8 */
  #if __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 8)
    #define USE_INTEL_AES
    #define ATTRIB_AES __attribute__((__target__("aes")))
    #if (__clang_major__ >= 8)
      #define USE_INTEL_VAES
      #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2")))
    #endif
  #endif
#elif defined(__GNUC__)
  /* gcc >= 4.4 supports AES-NI intrinsics */
  #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
    #define USE_INTEL_AES
    /* no attribute needed if the whole file is already compiled with -maes */
    #ifndef __AES__
      #define ATTRIB_AES __attribute__((__target__("aes")))
    #endif
    #if (__GNUC__ >= 8)
      #define USE_INTEL_VAES
      #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2")))
    #endif
  #endif
#elif defined(__INTEL_COMPILER)
  #if (__INTEL_COMPILER >= 1110)
    #define USE_INTEL_AES
    #if (__INTEL_COMPILER >= 1900)
      #define USE_INTEL_VAES
    #endif
  #endif
#elif defined(_MSC_VER)
  /* MSVC: AES intrinsics since VS2008 SP1 (15.00.30729); VAES since VS2017 15.3 */
  #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
    #define USE_INTEL_AES
    #if (_MSC_VER >= 1910)
      #define USE_INTEL_VAES
    #endif
  #endif
#endif

/* the attribute macros become no-ops for compilers that don't need (or
   don't support) per-function target attributes */
#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif
#ifndef ATTRIB_VAES
  #define ATTRIB_VAES
#endif
53 | |||
#ifdef USE_INTEL_AES

#include <wmmintrin.h>

/* public stub types used by VAES_COMPAT_STUB at the end of the x86 section;
   only needed when the real VAES implementations are not compiled */
#ifndef USE_INTEL_VAES
#define AES_TYPE_keys __m128i
#define AES_TYPE_data __m128i
#endif

/* common signature of all HW codec functions:
   p    - key-schedule block: p[0] holds IV/counter state, the UInt32 at (p+1)
          encodes the round count (see uses below), round keys start at p[2]
   data - in/out array of 16-byte blocks
   numBlocks - number of 16-byte blocks to process */
#define AES_FUNC_START(name) \
    void MY_FAST_CALL name(__m128i *p, __m128i *data, size_t numBlocks)

/* declaration + attributed definition header in one macro */
#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

/* apply a two-operand intrinsic in place: dest = op(dest, src) */
#define MM_OP(op, dest, src) dest = op(dest, src);
/* same, always targeting the local variable named 'm' */
#define MM_OP_m(op, src) MM_OP(op, m, src);

#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src);
#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src);
77 | |||
/*
  CBC encryption with AES-NI.
  Processes blocks strictly sequentially: each ciphertext block is chained
  into the next, so no multi-block parallelism is possible here.
*/
AES_FUNC_START2 (AesCbc_Encode_HW)
{
  __m128i m = *p;           /* running CBC state: IV, then previous ciphertext */
  const __m128i k0 = p[2];  /* round key 0 (whitening key), kept in a register */
  const __m128i k1 = p[3];  /* round key 1 */
  /* loop count for the 2-rounds-per-iteration body below;
     the UInt32 at (p+1) presumably stores numRounds/2 — derived from the
     key-setup code elsewhere in the project, not visible here */
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
  for (; numBlocks != 0; numBlocks--, data++)
  {
    UInt32 r = numRounds2;
    const __m128i *w = p + 4;   /* remaining round keys (from key 2 on) */
    __m128i temp = *data;
    /* fold the CBC xor and round-0 AddRoundKey together:
       m = (m ^ plaintext) ^ k0 */
    MM_XOR (temp, k0);
    MM_XOR (m, temp);
    MM_OP_m (_mm_aesenc_si128, k1);
    do
    {
      /* two middle rounds per iteration */
      MM_OP_m (_mm_aesenc_si128, w[0]);
      MM_OP_m (_mm_aesenc_si128, w[1]);
      w += 2;
    }
    while (--r);
    MM_OP_m (_mm_aesenclast_si128, w[0]);  /* final round */
    *data = m;   /* write ciphertext in place */
  }
  *p = m;   /* persist chaining value for a subsequent call */
}
104 | |||
105 | |||
/*
  Multi-way unrolling machinery.
  WOP_n(op) expands op(m1,1); ... op(m{n-1},{n-1}); and WOP(op) prepends
  op(m0,0); — so WOP applies 'op' to NUM_WAYS register variables m0..m{N-1},
  each paired with its block index. x64 uses 8 ways (16 xmm registers
  available); 32-bit x86 uses 4 (only 8 xmm registers).
*/
#define WOP_1(op)
#define WOP_2(op) WOP_1 (op) op (m1, 1);
#define WOP_3(op) WOP_2 (op) op (m2, 2);
#define WOP_4(op) WOP_3 (op) op (m3, 3);
#ifdef MY_CPU_AMD64
#define WOP_5(op) WOP_4 (op) op (m4, 4);
#define WOP_6(op) WOP_5 (op) op (m5, 5);
#define WOP_7(op) WOP_6 (op) op (m6, 6);
#define WOP_8(op) WOP_7 (op) op (m7, 7);
#endif
/* wider unrolling kept for reference, currently unused:
#define WOP_9(op) WOP_8 (op) op (m8, 8);
#define WOP_10(op) WOP_9 (op) op (m9, 9);
#define WOP_11(op) WOP_10(op) op (m10, 10);
#define WOP_12(op) WOP_11(op) op (m11, 11);
#define WOP_13(op) WOP_12(op) op (m12, 12);
#define WOP_14(op) WOP_13(op) op (m13, 13);
*/

#ifdef MY_CPU_AMD64
  #define NUM_WAYS 8
  #define WOP_M1 WOP_8
#else
  #define NUM_WAYS 4
  #define WOP_M1 WOP_4
#endif

#define WOP(op) op (m0, 0); WOP_M1(op)

/* per-way operations over __m128i block registers */
#define DECLARE_VAR(reg, ii) __m128i reg
#define LOAD_data( reg, ii) reg = data[ii];
#define STORE_data( reg, ii) data[ii] = reg;
#if (NUM_WAYS > 1)
/* xor with the PREVIOUS input block — CBC-decrypt chaining (skips m0) */
#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]);
#endif

/* AVX2 variants: each register holds TWO 16-byte blocks, so 'data' is
   reinterpreted as an array of __m256i */
#define AVX__DECLARE_VAR(reg, ii) __m256i reg
#define AVX__LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
#define AVX__STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
/* (data - 1) shifts the 256-bit view back one 128-bit block for CBC chaining */
#define AVX__XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]));

/* apply an intrinsic with the round key named 'key' (bound by WOP_KEY below) */
#define MM_OP_key(op, reg) MM_OP(op, reg, key);

#define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg)
#define AES_DEC_LAST( reg, ii) MM_OP_key (_mm_aesdeclast_si128, reg)
#define AES_ENC( reg, ii) MM_OP_key (_mm_aesenc_si128, reg)
#define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg)
#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg)

#define AVX__AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
#define AVX__AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX__AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
#define AVX__AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX__AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)

/* CTR mode: bump the 64-bit counter in 'ctr' and capture it per way */
#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one); reg = ctr;
#define CTR_END( reg, ii) MM_XOR (data[ii], reg);

/* AVX CTR: 'ctr2' holds two consecutive counters, advanced by 'two';
   the round-0 AddRoundKey ('key' = broadcast key 0) is fused in here */
#define AVX__CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two); reg = _mm256_xor_si256(ctr2, key);
#define AVX__CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg);

/* bind round key w[n] as 'key' and apply 'op' across all ways */
#define WOP_KEY(op, n) { \
    const __m128i key = w[n]; \
    WOP(op); }

#define AVX__WOP_KEY(op, n) { \
    const __m256i key = w[n]; \
    WOP(op); }

/* main loop skeleton: process NUM_WAYS blocks per iteration while enough
   input remains; leftover blocks are handled by SINGLE_LOOP afterwards.
   Expects locals 'data', 'numBlocks', 'dataEnd' in the enclosing function. */
#define WIDE_LOOP_START \
    dataEnd = data + numBlocks; \
    if (numBlocks >= NUM_WAYS) \
    { dataEnd -= NUM_WAYS; do { \

#define WIDE_LOOP_END \
    data += NUM_WAYS; \
    } while (data <= dataEnd); \
    dataEnd += NUM_WAYS; } \

#define SINGLE_LOOP \
    for (; data < dataEnd; data++)

/* AES-256 needs 14 rounds -> 15 round keys, the maximum for any AES variant */
#define NUM_AES_KEYS_MAX 15

/* AVX wide loop: NUM_WAYS 256-bit registers = NUM_WAYS*2 blocks per pass.
   Each 128-bit round key is broadcast to both lanes of a __m256i first.
   OP is an extra statement hook (used for CTR counter setup). */
#define WIDE_LOOP_START_AVX(OP) \
    dataEnd = data + numBlocks; \
    if (numBlocks >= NUM_WAYS * 2) \
    { __m256i keys[NUM_AES_KEYS_MAX]; \
    UInt32 ii; \
    OP \
    for (ii = 0; ii < numRounds; ii++) \
    keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \
    dataEnd -= NUM_WAYS * 2; do { \

#define WIDE_LOOP_END_AVX(OP) \
    data += NUM_WAYS * 2; \
    } while (data <= dataEnd); \
    dataEnd += NUM_WAYS * 2; \
    OP \
    _mm256_zeroupper(); \
    } \

/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
   MSVC still can insert vzeroupper instruction. */
218 | |||
/*
  CBC decryption with AES-NI.
  Unlike encryption, CBC decryption is parallel (plaintext_i = D(c_i) ^ c_{i-1}),
  so the wide loop decrypts NUM_WAYS blocks at once; the decryption key
  schedule is walked from the end (wStart) back toward p.
*/
AES_FUNC_START2 (AesCbc_Decode_HW)
{
  __m128i iv = *p;   /* chaining value: IV, then previous ciphertext block */
  /* last round-key pair of the schedule; layout derived from the round
     count stored at (p+1) — see the key-setup code elsewhere in the project */
  const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
  const __m128i *dataEnd;
  p += 2;   /* p now points at round key 0 */

  WIDE_LOOP_START
  {
    const __m128i *w = wStart;

    WOP (DECLARE_VAR)
    WOP (LOAD_data);
    WOP_KEY (AES_XOR, 1)   /* initial AddRoundKey with the last schedule key */

    do
    {
      WOP_KEY (AES_DEC, 0)
      w--;
    }
    while (w != p);
    WOP_KEY (AES_DEC_LAST, 0)

    /* undo CBC chaining: m0 gets the saved iv, m1..m7 get the preceding
       ciphertext blocks (still untouched in data[]) */
    MM_XOR (m0, iv);
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];   /* save last ciphertext before overwriting */
    WOP (STORE_data);
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    /* scalar tail: two aesdec per iteration, schedule walked backwards */
    const __m128i *w = wStart - 1;
    __m128i m = _mm_xor_si128 (w[2], *data);
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1]);
      MM_OP_m (_mm_aesdec_si128, w[0]);
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128, w[1]);
    MM_OP_m (_mm_aesdeclast_si128, w[0]);

    MM_XOR (m, iv);
    iv = *data;   /* current ciphertext becomes next block's chain value */
    *data = m;
  }

  p[-2] = iv;   /* persist chaining value (p was advanced by 2 above) */
}
270 | |||
271 | |||
/*
  CTR mode with AES-NI (encrypt and decrypt are the same operation).
  The 128-bit counter in p[0] is incremented as a 64-bit integer (low lane)
  per block, encrypted, and xored into the data. Parallel across blocks.
*/
AES_FUNC_START2 (AesCtr_Code_HW)
{
  __m128i ctr = *p;   /* running counter state */
  /* evaluates to (stored_value * 2 - 1); with stored_value = numRounds/2 this
     is numRounds - 1, i.e. the number of full aesenc rounds after the initial
     AddRoundKey. NOTE(review): name says "Minus2" but the arithmetic gives
     numRounds - 1 — kept as-is, verify against the key-setup code. */
  UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);   /* 64-bit increment in the low lane */

  p += 2;   /* p now points at round key 0 */

  WIDE_LOOP_START
  {
    const __m128i *w = p;
    UInt32 r = numRoundsMinus2;
    WOP (DECLARE_VAR)
    WOP (CTR_START);          /* m_i = ++ctr for each way */
    WOP_KEY (AES_XOR, 0)      /* round 0: AddRoundKey */
    w += 1;
    do
    {
      WOP_KEY (AES_ENC, 0)    /* one middle round per iteration, all ways */
      w += 1;
    }
    while (--r);
    WOP_KEY (AES_ENC_LAST, 0)

    WOP (CTR_END);            /* data[i] ^= keystream */
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    /* scalar tail; (p - 2 + 1) re-reads the round-count word before the keys */
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one);
    m = _mm_xor_si128 (ctr, p[0]);   /* counter ^ round key 0 */
    w += 1;
    do
    {
      /* two middle rounds per iteration */
      MM_OP_m (_mm_aesenc_si128, w[0]);
      MM_OP_m (_mm_aesenc_si128, w[1]);
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128, w[0]);
    MM_OP_m (_mm_aesenclast_si128, w[1]);
    MM_XOR (*data, m);
  }

  p[-2] = ctr;   /* persist counter for a subsequent call */
}
323 | |||
324 | |||
325 | |||
#ifdef USE_INTEL_VAES

/* clang-cl (clang with MSVC driver) does not predefine the ISA feature
   macros unless /arch is given, so <immintrin.h> would hide the VAES/AVX
   intrinsics. Defining the (reserved) feature macros here is a deliberate
   workaround to expose them; the per-function target attributes make the
   generated code legal. */
#if defined(__clang__) && defined(_MSC_VER)
#define __SSE4_2__
#define __AES__
#define __AVX__
#define __AVX2__
#define __VAES__
#define __AVX512F__
#define __AVX512VL__
#endif

#include <immintrin.h>

/* declaration + VAES-attributed definition header (same 128-bit pointer
   signature as the plain AES functions; the 256-bit work is internal) */
#define VAES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_VAES \
AES_FUNC_START (name)
344 | |||
/*
  CBC decryption using VAES 256-bit instructions: each __m256i register
  holds two blocks, so the wide loop handles NUM_WAYS * 2 blocks per pass.
  Round keys are broadcast into the 'keys' array by WIDE_LOOP_START_AVX.
*/
VAES_FUNC_START2 (AesCbc_Decode_HW_256)
{
  __m128i iv = *p;   /* chaining value (single 128-bit block) */
  const __m128i *dataEnd;
  /* total number of round keys (used by WIDE_LOOP_START_AVX to fill keys[]);
     presumably rounds/2 * 2 + 1 = numRounds + 1 keys — verify against key setup */
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  p += 2;

  WIDE_LOOP_START_AVX(;)
  {
    const __m256i *w = keys + numRounds - 2;   /* walk schedule backwards */

    WOP (AVX__DECLARE_VAR)
    WOP (AVX__LOAD_data);
    AVX__WOP_KEY (AVX__AES_XOR, 1)   /* AddRoundKey with last schedule key */

    do
    {
      AVX__WOP_KEY (AVX__AES_DEC, 0)
      w--;
    }
    while (w != keys);
    AVX__WOP_KEY (AVX__AES_DEC_LAST, 0)

    /* CBC chaining for the first 256-bit register: low lane gets the saved
       iv, high lane gets ciphertext block 0; remaining registers xor with
       the preceding (still-unmodified) ciphertext via the shifted view */
    AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]));
    WOP_M1 (AVX__XOR_data_M1)
    iv = data[NUM_WAYS * 2 - 1];   /* save last ciphertext block */
    WOP (AVX__STORE_data);
  }
  WIDE_LOOP_END_AVX(;)

  SINGLE_LOOP
  {
    /* scalar 128-bit tail — identical scheme to AesCbc_Decode_HW;
       (p + 1 - 2) re-reads the round-count word before the keys */
    const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
    __m128i m = _mm_xor_si128 (w[2], *data);
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1]);
      MM_OP_m (_mm_aesdec_si128, w[0]);
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128, w[1]);
    MM_OP_m (_mm_aesdeclast_si128, w[0]);

    MM_XOR (m, iv);
    iv = *data;
    *data = m;
  }

  p[-2] = iv;   /* persist chaining value */
}
396 | |||
397 | |||
/*
  Minimum ISA level required by each intrinsic used in the VAES CTR path:
SSE2: _mm_cvtsi32_si128 : movd
AVX: _mm256_setr_m128i : vinsertf128
AVX2: _mm256_add_epi64 : vpaddq ymm, ymm, ymm
  _mm256_extracti128_si256 : vextracti128
  _mm256_broadcastsi128_si256 : vbroadcasti128
*/

/* set up the doubled counter state before the 256-bit loop:
   ctr2 = { ctr - 1, ctr } (lane 0 is pre-decremented because AVX__CTR_START
   adds 'two' before first use), and two = {2, 0, 2, 0} as 64-bit lanes */
#define AVX__CTR_LOOP_START \
    ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
    two = _mm256_setr_m128i(one, one); \
    two = _mm256_add_epi64(two, two); \

// two = _mm256_setr_epi64x(2, 0, 2, 0);

/* after the 256-bit loop: fold the doubled counter back into the scalar
   'ctr' (high lane = most recently used counter) for the scalar tail */
#define AVX__CTR_LOOP_ENC \
    ctr = _mm256_extracti128_si256 (ctr2, 1); \
/*
  CTR mode using VAES 256-bit instructions: NUM_WAYS * 2 blocks per pass,
  two counters per __m256i register. Round-0 AddRoundKey is fused into
  AVX__CTR_START (it xors with the broadcast key 0 bound by AVX__WOP_KEY).
*/
VAES_FUNC_START2 (AesCtr_Code_HW_256)
{
  __m128i ctr = *p;   /* running counter state */
  /* total round-key count for WIDE_LOOP_START_AVX's broadcast loop;
     presumably numRounds + 1 keys — verify against key setup */
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);
  __m256i ctr2, two;   /* initialized by AVX__CTR_LOOP_START */
  p += 2;

  WIDE_LOOP_START_AVX (AVX__CTR_LOOP_START)
  {
    const __m256i *w = keys;
    UInt32 r = numRounds - 2;   /* middle rounds */
    WOP (AVX__DECLARE_VAR)
    AVX__WOP_KEY (AVX__CTR_START, 0);   /* advance counters + round 0 xor */

    w += 1;
    do
    {
      AVX__WOP_KEY (AVX__AES_ENC, 0)
      w += 1;
    }
    while (--r);
    AVX__WOP_KEY (AVX__AES_ENC_LAST, 0)

    WOP (AVX__CTR_END);   /* xor keystream into data */
  }
  WIDE_LOOP_END_AVX (AVX__CTR_LOOP_ENC)

  SINGLE_LOOP
  {
    /* scalar 128-bit tail — identical scheme to AesCtr_Code_HW */
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one);
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0]);
      MM_OP_m (_mm_aesenc_si128, w[1]);
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128, w[0]);
    MM_OP_m (_mm_aesenclast_si128, w[1]);
    MM_XOR (*data, m);
  }

  p[-2] = ctr;   /* persist counter for a subsequent call */
}
467 | |||
#endif // USE_INTEL_VAES

#else // USE_INTEL_AES

/* no USE_INTEL_AES */

/* Compiler too old for AES-NI intrinsics: export the *_HW entry points as
   thin wrappers over the portable software implementations (declared via
   AES_FUNC_START, defined elsewhere in the project). */
#pragma message("AES HW_SW stub was used")

#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte

#define AES_FUNC_START(name) \
    void MY_FAST_CALL name(UInt32 *p, Byte *data, size_t numBlocks) \

/* declares the software function and defines name_HW forwarding to it */
#define AES_COMPAT_STUB(name) \
    AES_FUNC_START(name); \
    AES_FUNC_START(name ## _HW) \
    { name(p, data, numBlocks); }

AES_COMPAT_STUB (AesCbc_Encode)
AES_COMPAT_STUB (AesCbc_Decode)
AES_COMPAT_STUB (AesCtr_Code)

#endif // USE_INTEL_AES


#ifndef USE_INTEL_VAES

/* No VAES support: export the *_256 entry points as wrappers over the
   best available non-VAES implementation. AES_TYPE_keys/AES_TYPE_data are
   __m128i when USE_INTEL_AES is set, UInt32/Byte otherwise (see above). */
#pragma message("VAES HW_SW stub was used")

#define VAES_COMPAT_STUB(name) \
    void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
    void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
    { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }

VAES_COMPAT_STUB (AesCbc_Decode_HW)
VAES_COMPAT_STUB (AesCtr_Code_HW)

#endif // ! USE_INTEL_VAES
507 | |||
508 | |||
#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)

/* Compiler detection for ARMv8 Crypto Extension intrinsics (little-endian
   ARM/ARM64 only). The "fix that check" notes are upstream reminders that
   these version thresholds are conservative guesses. */
#if defined(__clang__)
  #if (__clang_major__ >= 8) // fix that check
    #define USE_HW_AES
  #endif
#elif defined(__GNUC__)
  #if (__GNUC__ >= 6) // fix that check
    #define USE_HW_AES
  #endif
#elif defined(_MSC_VER)
  #if _MSC_VER >= 1910
    #define USE_HW_AES
  #endif
#endif

#ifdef USE_HW_AES

// #pragma message("=== AES HW === ")

#if defined(__clang__) || defined(__GNUC__)
  /* enable the crypto extension per-function instead of per-file */
  #ifdef MY_CPU_ARM64
    #define ATTRIB_AES __attribute__((__target__("+crypto")))
  #else
    #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
  #endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

/* no-op when no attribute is needed (MSVC) */
#ifndef ATTRIB_AES
#define ATTRIB_AES
#endif

#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
550 | |||
/* one 128-bit AES block / round key as a NEON byte vector */
typedef uint8x16_t v128;

/* common signature of the ARM HW codec functions (same layout convention
   as the x86 path: p[0] = IV/counter, round count at (p+1), keys from p[2]) */
#define AES_FUNC_START(name) \
    void MY_FAST_CALL name(v128 *p, v128 *data, size_t numBlocks)

/* declaration + crypto-attributed definition header */
#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

/* apply a two-operand intrinsic in place: dest = op(dest, src) */
#define MM_OP(op, dest, src) dest = op(dest, src);
/* same, targeting the local variable named 'm' */
#define MM_OP_m(op, src) MM_OP(op, m, src);
/* one-operand version: m = op(m) */
#define MM_OP1_m(op) m = op(m);

#define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src);
#define MM_XOR_m( src) MM_XOR(m, src);

/* AESE = AddRoundKey + SubBytes + ShiftRows; AESMC = MixColumns.
   Note: unlike x86 aesenc, ARM AESE xors the key BEFORE the substitution. */
#define AES_E_m(k) MM_OP_m (vaeseq_u8, k);
#define AES_E_MC_m(k) AES_E_m (k); MM_OP1_m(vaesmcq_u8);
570 | |||
571 | |||
/*
  CBC encryption with ARMv8 Crypto instructions. Sequential (CBC chaining).
  The first ten round keys are hoisted into locals so they stay in NEON
  registers across the whole loop; the extra AES-192/256 keys are read from
  memory in the conditional branches below.
*/
AES_FUNC_START2 (AesCbc_Encode_HW)
{
  v128 m = *p;   /* running CBC state: IV, then previous ciphertext */
  const v128 k0 = p[2];
  const v128 k1 = p[3];
  const v128 k2 = p[4];
  const v128 k3 = p[5];
  const v128 k4 = p[6];
  const v128 k5 = p[7];
  const v128 k6 = p[8];
  const v128 k7 = p[9];
  const v128 k8 = p[10];
  const v128 k9 = p[11];
  /* presumably numRounds/2 (5/6/7 for AES-128/192/256) — matches the
     branches below; verify against the key-setup code */
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  const v128 *w = p + ((size_t)numRounds2 * 2);   /* end of key schedule */
  const v128 k_z1 = w[1];   /* next-to-last round key (AESE without MC) */
  const v128 k_z0 = w[2];   /* final whitening key */
  for (; numBlocks != 0; numBlocks--, data++)
  {
    MM_XOR_m (*data);   /* CBC xor; round-key xor happens inside AESE */
    AES_E_MC_m (k0)
    AES_E_MC_m (k1)
    AES_E_MC_m (k2)
    AES_E_MC_m (k3)
    AES_E_MC_m (k4)
    AES_E_MC_m (k5)
    AES_E_MC_m (k6)
    AES_E_MC_m (k7)
    AES_E_MC_m (k8)
    if (numRounds2 >= 6)
    {
      /* AES-192 adds two rounds, AES-256 two more */
      AES_E_MC_m (k9)
      AES_E_MC_m (p[12])
      if (numRounds2 != 6)
      {
        AES_E_MC_m (p[13])
        AES_E_MC_m (p[14])
      }
    }
    AES_E_m (k_z1);    /* last full round: no MixColumns */
    MM_XOR_m (k_z0);   /* final AddRoundKey */
    *data = m;
  }
  *p = m;   /* persist chaining value for a subsequent call */
}
617 | |||
618 | |||
/*
  ARM variant of the multi-way unrolling machinery (see the x86 section):
  WOP(op) applies 'op' to m0..m7 with their block indices. Always 8 ways —
  AArch64 has 32 NEON registers.
*/
#define WOP_1(op)
#define WOP_2(op) WOP_1 (op) op (m1, 1);
#define WOP_3(op) WOP_2 (op) op (m2, 2);
#define WOP_4(op) WOP_3 (op) op (m3, 3);
#define WOP_5(op) WOP_4 (op) op (m4, 4);
#define WOP_6(op) WOP_5 (op) op (m5, 5);
#define WOP_7(op) WOP_6 (op) op (m6, 6);
#define WOP_8(op) WOP_7 (op) op (m7, 7);

#define NUM_WAYS 8
#define WOP_M1 WOP_8

#define WOP(op) op (m0, 0); WOP_M1(op)

/* per-way operations over v128 block registers */
#define DECLARE_VAR(reg, ii) v128 reg
#define LOAD_data( reg, ii) reg = data[ii];
#define STORE_data( reg, ii) data[ii] = reg;
#if (NUM_WAYS > 1)
/* xor with the PREVIOUS input block — CBC-decrypt chaining (skips m0) */
#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]);
#endif

/* apply an intrinsic with the round key named 'key' (bound by WOP_KEY) */
#define MM_OP_key(op, reg) MM_OP (op, reg, key);

/* AESD = AddRoundKey + InvSubBytes + InvShiftRows; AESIMC = InvMixColumns */
#define AES_D_m(k) MM_OP_m (vaesdq_u8, k);
#define AES_D_IMC_m(k) AES_D_m (k); MM_OP1_m (vaesimcq_u8);

#define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg)
#define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg)
#define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg)

#define AES_D_IMC( reg, ii) AES_D (reg, ii); reg = vaesimcq_u8(reg)
#define AES_E_MC( reg, ii) AES_E (reg, ii); reg = vaesmcq_u8(reg)

/* CTR mode: bump the 64-bit counter and reinterpret it as a byte block */
#define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one); reg = vreinterpretq_u8_u64(ctr);
#define CTR_END( reg, ii) MM_XOR (data[ii], reg);

/* bind round key w[n] as 'key' and apply 'op' across all ways */
#define WOP_KEY(op, n) { \
    const v128 key = w[n]; \
    WOP(op); }

/* wide/tail loop skeleton — same pattern as the x86 section */
#define WIDE_LOOP_START \
    dataEnd = data + numBlocks; \
    if (numBlocks >= NUM_WAYS) \
    { dataEnd -= NUM_WAYS; do { \

#define WIDE_LOOP_END \
    data += NUM_WAYS; \
    } while (data <= dataEnd); \
    dataEnd += NUM_WAYS; } \

#define SINGLE_LOOP \
    for (; data < dataEnd; data++)
672 | |||
/*
  CBC decryption with ARMv8 Crypto instructions. Parallel 8-way wide loop
  plus a scalar tail; the decryption key schedule is walked from the end
  (wStart) back toward p.
*/
AES_FUNC_START2 (AesCbc_Decode_HW)
{
  v128 iv = *p;   /* chaining value: IV, then previous ciphertext block */
  /* end of the key schedule (round count stored as UInt32 at (p+1)) */
  const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  p += 2;   /* p now points at round key 0 */

  WIDE_LOOP_START
  {
    const v128 *w = wStart;
    WOP (DECLARE_VAR)
    WOP (LOAD_data);
    WOP_KEY (AES_D_IMC, 2)   /* first decryption round, all ways */
    do
    {
      /* two rounds per iteration, schedule walked backwards */
      WOP_KEY (AES_D_IMC, 1)
      WOP_KEY (AES_D_IMC, 0)
      w -= 2;
    }
    while (w != p);
    WOP_KEY (AES_D, 1)    /* last AESD: no InvMixColumns */
    WOP_KEY (AES_XOR, 0)  /* final AddRoundKey */
    /* undo CBC chaining: m0 gets the saved iv, m1..m7 the preceding
       ciphertext blocks still untouched in data[] */
    MM_XOR (m0, iv);
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];   /* save last ciphertext before overwriting */
    WOP (STORE_data);
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    /* scalar tail — same round structure as the wide loop */
    const v128 *w = wStart;
    v128 m = *data;
    AES_D_IMC_m (w[2])
    do
    {
      AES_D_IMC_m (w[1]);
      AES_D_IMC_m (w[0]);
      w -= 2;
    }
    while (w != p);
    AES_D_m (w[1]);
    MM_XOR_m (w[0]);
    MM_XOR_m (iv);
    iv = *data;   /* current ciphertext becomes next chain value */
    *data = m;
  }

  p[-2] = iv;   /* persist chaining value (p was advanced by 2 above) */
}
723 | |||
724 | |||
/*
  CTR mode with ARMv8 Crypto instructions (encrypt == decrypt).
  The counter lives in a uint64x2_t so vaddq_u64 can increment its low
  64-bit lane; it is reinterpreted as bytes for the AES rounds.
*/
AES_FUNC_START2 (AesCtr_Code_HW)
{
  uint64x2_t ctr = vreinterpretq_u64_u8(*p);   /* running counter state */
  /* end of the key schedule (round count stored as UInt32 at (p+1)) */
  const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  /* one = {1, 0}: increments only the low 64-bit lane of the counter */
  uint64x2_t one = vdupq_n_u64(0);
  one = vsetq_lane_u64(1, one, 0);
  p += 2;   /* p now points at round key 0 */

  WIDE_LOOP_START
  {
    const v128 *w = p;
    WOP (DECLARE_VAR)
    WOP (CTR_START);   /* m_i = bytes(++ctr) for each way */
    do
    {
      /* two full rounds per iteration (key xor is inside AESE) */
      WOP_KEY (AES_E_MC, 0)
      WOP_KEY (AES_E_MC, 1)
      w += 2;
    }
    while (w != wEnd);
    WOP_KEY (AES_E_MC, 0)
    WOP_KEY (AES_E, 1)    /* last AESE: no MixColumns */
    WOP_KEY (AES_XOR, 2)  /* final AddRoundKey */
    WOP (CTR_END);        /* data[i] ^= keystream */
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    /* scalar tail — same round structure as the wide loop */
    const v128 *w = p;
    v128 m;
    CTR_START (m, 0);
    do
    {
      AES_E_MC_m (w[0]);
      AES_E_MC_m (w[1]);
      w += 2;
    }
    while (w != wEnd);
    AES_E_MC_m (w[0]);
    AES_E_m (w[1]);
    MM_XOR_m (w[2]);
    CTR_END (m, 0);
  }

  p[-2] = vreinterpretq_u8_u64(ctr);   /* persist counter state */
}

#endif // USE_HW_AES

#endif // MY_CPU_ARM_OR_ARM64