aboutsummaryrefslogtreecommitdiff
path: root/C/AesOpt.c
diff options
context:
space:
mode:
authorIgor Pavlov <87184205+ip7z@users.noreply.github.com>2021-12-27 00:00:00 +0000
committerIgor Pavlov <87184205+ip7z@users.noreply.github.com>2022-03-18 15:35:13 +0500
commitf19f813537c7aea1c20749c914e756b54a9c3cf5 (patch)
tree816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /C/AesOpt.c
parent98e06a519b63b81986abe76d28887f6984a7732b (diff)
download7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.tar.gz
7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.tar.bz2
7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.zip
'21.07'21.07
Diffstat (limited to 'C/AesOpt.c')
-rw-r--r--C/AesOpt.c776
1 files changed, 776 insertions, 0 deletions
diff --git a/C/AesOpt.c b/C/AesOpt.c
new file mode 100644
index 0000000..8be8ff6
--- /dev/null
+++ b/C/AesOpt.c
@@ -0,0 +1,776 @@
1/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
22021-04-01 : Igor Pavlov : Public domain */
3
4#include "Precomp.h"
5
6#include "CpuArch.h"
7
#ifdef MY_CPU_X86_OR_AMD64

  /* Detect compiler support for x86 AES-NI intrinsics (USE_INTEL_AES) and
     for 256-bit VAES+AVX2 intrinsics (USE_INTEL_VAES).
     ATTRIB_AES / ATTRIB_VAES are per-function __target__ attributes so the
     AES intrinsics can be used without enabling AES code generation for
     the whole translation unit. */
  #if defined(__clang__)
    #if __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 8)
      #define USE_INTEL_AES
      #define ATTRIB_AES __attribute__((__target__("aes")))
      #if (__clang_major__ >= 8)
        #define USE_INTEL_VAES
        #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2")))
      #endif
    #endif
  #elif defined(__GNUC__)
    #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
      #define USE_INTEL_AES
      /* the attribute is only needed when the TU is not already built
         with -maes (__AES__ would then be predefined) */
      #ifndef __AES__
        #define ATTRIB_AES __attribute__((__target__("aes")))
      #endif
      #if (__GNUC__ >= 8)
        #define USE_INTEL_VAES
        #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2")))
      #endif
    #endif
  #elif defined(__INTEL_COMPILER)
    #if (__INTEL_COMPILER >= 1110)
      #define USE_INTEL_AES
      #if (__INTEL_COMPILER >= 1900)
        #define USE_INTEL_VAES
      #endif
    #endif
  #elif defined(_MSC_VER)
    #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
      #define USE_INTEL_AES
      #if (_MSC_VER >= 1910)
        #define USE_INTEL_VAES
      #endif
    #endif
  #endif

/* MSVC / ICC need no target attribute, so the macros expand to nothing
   when not defined above. */
#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif
#ifndef ATTRIB_VAES
  #define ATTRIB_VAES
#endif
52
53
#ifdef USE_INTEL_AES

#include <wmmintrin.h>

/* Fallback type names for the VAES stub section at the end of the file,
   used only when the 256-bit VAES path is not compiled. */
#ifndef USE_INTEL_VAES
#define AES_TYPE_keys __m128i
#define AES_TYPE_data __m128i
#endif

/* Common signature of every AES transform here:
   p    - expanded key schedule; p[0] is IV/counter state, p[1] holds a
          UInt32 round count (see the *(const UInt32 *)(p + 1) reads below),
          round keys follow from p[2].
          NOTE(review): exact schedule layout is produced elsewhere (Aes.c)
          - confirm against the key-expansion code.
   data - numBlocks 16-byte blocks, transformed in place. */
#define AES_FUNC_START(name) \
    void MY_FAST_CALL name(__m128i *p, __m128i *data, size_t numBlocks)

/* Emit a prototype first, then the attributed definition. */
#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

/* dest = op(dest, src); MM_OP_m uses the conventional working register "m". */
#define MM_OP(op, dest, src) dest = op(dest, src);
#define MM_OP_m(op, src) MM_OP(op, m, src);

#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src);
#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src);
76
77
/* CBC encryption: each block chains on the previous ciphertext, so it is
   inherently serial and processed one 16-byte block at a time.
   On return, *p holds the last ciphertext block (the next IV). */
AES_FUNC_START2 (AesCbc_Encode_HW)
{
  __m128i m = *p;                       /* running CBC state: IV, then ciphertext */
  const __m128i k0 = p[2];              /* first (whitening) round key */
  const __m128i k1 = p[3];
  /* number of inner key PAIRS to process; p[1] stores the count as UInt32 */
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
  for (; numBlocks != 0; numBlocks--, data++)
  {
    UInt32 r = numRounds2;
    const __m128i *w = p + 4;           /* keys after k0/k1 */
    __m128i temp = *data;
    MM_XOR (temp, k0);                  /* plaintext ^ whitening key */
    MM_XOR (m, temp);                   /* CBC chaining: state ^= (pt ^ k0) */
    MM_OP_m (_mm_aesenc_si128, k1);
    do
    {
      /* two rounds per iteration */
      MM_OP_m (_mm_aesenc_si128, w[0]);
      MM_OP_m (_mm_aesenc_si128, w[1]);
      w += 2;
    }
    while (--r);
    MM_OP_m (_mm_aesenclast_si128, w[0]);
    *data = m;                          /* ciphertext out, also next chain value */
  }
  *p = m;                               /* persist last ciphertext as IV */
}
104
105
/* Multi-way macro family: WOP_n(op) expands "op" for registers m1..m(n-1);
   WOP(op) adds m0, so WOP expands op over NUM_WAYS independent blocks.
   8-way only on x64, where 16 XMM registers are available. */
#define WOP_1(op)
#define WOP_2(op) WOP_1 (op) op (m1, 1);
#define WOP_3(op) WOP_2 (op) op (m2, 2);
#define WOP_4(op) WOP_3 (op) op (m3, 3);
#ifdef MY_CPU_AMD64
#define WOP_5(op) WOP_4 (op) op (m4, 4);
#define WOP_6(op) WOP_5 (op) op (m5, 5);
#define WOP_7(op) WOP_6 (op) op (m6, 6);
#define WOP_8(op) WOP_7 (op) op (m7, 7);
#endif
/*
#define WOP_9(op) WOP_8 (op) op (m8, 8);
#define WOP_10(op) WOP_9 (op) op (m9, 9);
#define WOP_11(op) WOP_10(op) op (m10, 10);
#define WOP_12(op) WOP_11(op) op (m11, 11);
#define WOP_13(op) WOP_12(op) op (m12, 12);
#define WOP_14(op) WOP_13(op) op (m13, 13);
*/

#ifdef MY_CPU_AMD64
  #define NUM_WAYS 8
  #define WOP_M1 WOP_8
#else
  #define NUM_WAYS 4
  #define WOP_M1 WOP_4
#endif

/* full expansion: op(m0) plus op(m1)..op(m[NUM_WAYS-1]) */
#define WOP(op) op (m0, 0); WOP_M1(op)
134
135
/* Per-way operations plugged into WOP(...): each takes (register, index).
   The SSE variants work on __m128i blocks; the AVX__ variants on __m256i
   pairs of blocks (the data pointer is reinterpreted as __m256i*). */
#define DECLARE_VAR(reg, ii) __m128i reg
#define LOAD_data( reg, ii) reg = data[ii];
#define STORE_data( reg, ii) data[ii] = reg;
#if (NUM_WAYS > 1)
/* CBC decode: XOR with the previous ciphertext block */
#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]);
#endif

#define AVX__DECLARE_VAR(reg, ii) __m256i reg
#define AVX__LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
#define AVX__STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
/* (data - 1) shifts the 256-bit view back one 128-bit block, giving the
   previous-ciphertext pair for CBC decode chaining */
#define AVX__XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]));

/* reg = op(reg, key) - "key" is bound by WOP_KEY below */
#define MM_OP_key(op, reg) MM_OP(op, reg, key);

#define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg)
#define AES_DEC_LAST( reg, ii) MM_OP_key (_mm_aesdeclast_si128, reg)
#define AES_ENC( reg, ii) MM_OP_key (_mm_aesenc_si128, reg)
#define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg)
#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg)


#define AVX__AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
#define AVX__AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX__AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
#define AVX__AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX__AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)

/* CTR mode: bump the 64-bit counter, snapshot it into the way register;
   at the end XOR the keystream into the data block */
#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one); reg = ctr;
#define CTR_END( reg, ii) MM_XOR (data[ii], reg);

/* AVX CTR: ctr2 holds two consecutive counters; "key" here is the
   broadcast whitening key folded in at counter setup */
#define AVX__CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two); reg = _mm256_xor_si256(ctr2, key);
#define AVX__CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg);

/* bind round key w[n] into "key" and apply op to all ways */
#define WOP_KEY(op, n) { \
    const __m128i key = w[n]; \
    WOP(op); }

#define AVX__WOP_KEY(op, n) { \
    const __m256i key = w[n]; \
    WOP(op); }
176
177
/* Loop scaffolding: WIDE_LOOP_START/END run the NUM_WAYS-at-a-time body
   while at least NUM_WAYS blocks remain; SINGLE_LOOP then finishes the
   tail one block at a time. Requires locals "data", "numBlocks", "dataEnd". */
#define WIDE_LOOP_START \
    dataEnd = data + numBlocks; \
    if (numBlocks >= NUM_WAYS) \
    { dataEnd -= NUM_WAYS; do { \


#define WIDE_LOOP_END \
    data += NUM_WAYS; \
    } while (data <= dataEnd); \
    dataEnd += NUM_WAYS; } \


#define SINGLE_LOOP \
    for (; data < dataEnd; data++)


/* AES-256 has the largest schedule: 15 round keys */
#define NUM_AES_KEYS_MAX 15

/* AVX variant: processes NUM_WAYS * 2 blocks per pass (two blocks per ymm).
   Round keys are broadcast to 256-bit "keys[]" once before the loop;
   OP is extra per-call setup (e.g. counter init). */
#define WIDE_LOOP_START_AVX(OP) \
    dataEnd = data + numBlocks; \
    if (numBlocks >= NUM_WAYS * 2) \
    { __m256i keys[NUM_AES_KEYS_MAX]; \
    UInt32 ii; \
    OP \
    for (ii = 0; ii < numRounds; ii++) \
    keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \
    dataEnd -= NUM_WAYS * 2; do { \


#define WIDE_LOOP_END_AVX(OP) \
    data += NUM_WAYS * 2; \
    } while (data <= dataEnd); \
    dataEnd += NUM_WAYS * 2; \
    OP \
    _mm256_zeroupper(); \
    } \

/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
   MSVC still can insert vzeroupper instruction. */
217
218
/* CBC decryption: parallelizable, so NUM_WAYS blocks are decrypted at once
   in the wide loop, then each is XORed with the preceding ciphertext.
   Decryption keys are walked from the END of the schedule backwards. */
AES_FUNC_START2 (AesCbc_Decode_HW)
{
  __m128i iv = *p;
  /* wStart points near the end of the key schedule (last decode keys);
     p[1] holds the UInt32 round-pair count */
  const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
  const __m128i *dataEnd;
  p += 2;                               /* p now points at the first key */

  WIDE_LOOP_START
  {
    const __m128i *w = wStart;

    WOP (DECLARE_VAR)
    WOP (LOAD_data);
    WOP_KEY (AES_XOR, 1)                /* initial whitening with w[1] */

    do
    {
      WOP_KEY (AES_DEC, 0)
      w--;                              /* walk keys backwards, one per round */
    }
    while (w != p);
    WOP_KEY (AES_DEC_LAST, 0)

    /* CBC unchain: first way uses the carried iv, the rest use data[ii-1] */
    MM_XOR (m0, iv);
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];            /* last ciphertext becomes next iv */
    WOP (STORE_data);
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const __m128i *w = wStart - 1;
    __m128i m = _mm_xor_si128 (w[2], *data);
    do
    {
      /* two decode rounds per iteration, keys walked backwards */
      MM_OP_m (_mm_aesdec_si128, w[1]);
      MM_OP_m (_mm_aesdec_si128, w[0]);
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128, w[1]);
    MM_OP_m (_mm_aesdeclast_si128, w[0]);

    MM_XOR (m, iv);                     /* CBC unchain */
    iv = *data;                         /* save ciphertext before overwrite */
    *data = m;
  }

  p[-2] = iv;                           /* store back iv (p was advanced by 2) */
}
270
271
/* CTR mode (encrypt == decrypt): generates keystream blocks from an
   incrementing 64-bit counter and XORs them into the data.
   Fully parallel, so the wide loop handles NUM_WAYS blocks per pass. */
AES_FUNC_START2 (AesCtr_Code_HW)
{
  __m128i ctr = *p;                     /* counter state, carried across calls */
  /* per-round key count for the wide loop; p[1] holds UInt32 round pairs */
  UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);   /* low 64-bit lane = 1, increment step */

  p += 2;                               /* p now points at the first key */

  WIDE_LOOP_START
  {
    const __m128i *w = p;
    UInt32 r = numRoundsMinus2;
    WOP (DECLARE_VAR)
    WOP (CTR_START);                    /* assign incremented counters to ways */
    WOP_KEY (AES_XOR, 0)                /* whitening key */
    w += 1;
    do
    {
      WOP_KEY (AES_ENC, 0)              /* one round per iteration across ways */
      w += 1;
    }
    while (--r);
    WOP_KEY (AES_ENC_LAST, 0)

    WOP (CTR_END);                      /* data[ii] ^= keystream */
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    /* reread the round-pair count from p[-1] (p was advanced by 2) */
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one);
    m = _mm_xor_si128 (ctr, p[0]);      /* counter ^ whitening key */
    w += 1;
    do
    {
      /* two rounds per iteration */
      MM_OP_m (_mm_aesenc_si128, w[0]);
      MM_OP_m (_mm_aesenc_si128, w[1]);
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128, w[0]);
    MM_OP_m (_mm_aesenclast_si128, w[1]);
    MM_XOR (*data, m);                  /* XOR keystream into the block */
  }

  p[-2] = ctr;                          /* persist counter for the next call */
}
323
324
325
#ifdef USE_INTEL_VAES

/* clang-cl (clang with MSVC driver) does not predefine these feature
   macros from the target attribute, but <immintrin.h> gates the VAES
   intrinsics on them - define them manually so the header exposes the
   intrinsics. */
#if defined(__clang__) && defined(_MSC_VER)
#define __SSE4_2__
#define __AES__
#define __AVX__
#define __AVX2__
#define __VAES__
#define __AVX512F__
#define __AVX512VL__
#endif

#include <immintrin.h>

/* Same prototype+definition pattern as AES_FUNC_START2, but with the
   VAES/AVX2 target attribute. */
#define VAES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_VAES \
AES_FUNC_START (name)
344
/* 256-bit VAES CBC decryption: like AesCbc_Decode_HW, but each ymm register
   decrypts TWO blocks, so the wide loop handles NUM_WAYS * 2 blocks per pass
   using round keys broadcast into keys[] by WIDE_LOOP_START_AVX. */
VAES_FUNC_START2 (AesCbc_Decode_HW_256)
{
  __m128i iv = *p;
  const __m128i *dataEnd;
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;  /* total key count */
  p += 2;                               /* p now points at the first key */

  WIDE_LOOP_START_AVX(;)
  {
    const __m256i *w = keys + numRounds - 2;  /* walk broadcast keys backwards */

    WOP (AVX__DECLARE_VAR)
    WOP (AVX__LOAD_data);
    AVX__WOP_KEY (AVX__AES_XOR, 1)      /* initial whitening */

    do
    {
      AVX__WOP_KEY (AVX__AES_DEC, 0)
      w--;
    }
    while (w != keys);
    AVX__WOP_KEY (AVX__AES_DEC_LAST, 0)

    /* CBC unchain: lane 0 of the first ymm needs the carried iv, lane 1
       needs data[0]; the remaining ways use the previous-block pairs */
    AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]));
    WOP_M1 (AVX__XOR_data_M1)
    iv = data[NUM_WAYS * 2 - 1];        /* last ciphertext becomes next iv */
    WOP (AVX__STORE_data);
  }
  WIDE_LOOP_END_AVX(;)

  SINGLE_LOOP
  {
    /* scalar 128-bit tail, same as the SSE single loop; w starts at the
       end of the schedule (count reread from p[-1] since p was advanced) */
    const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
    __m128i m = _mm_xor_si128 (w[2], *data);
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1]);
      MM_OP_m (_mm_aesdec_si128, w[0]);
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128, w[1]);
    MM_OP_m (_mm_aesdeclast_si128, w[0]);

    MM_XOR (m, iv);                     /* CBC unchain */
    iv = *data;
    *data = m;
  }

  p[-2] = iv;                           /* store back iv */
}
396
397
/*
SSE2: _mm_cvtsi32_si128 : movd
AVX: _mm256_setr_m128i : vinsertf128
AVX2: _mm256_add_epi64 : vpaddq ymm, ymm, ymm
 _mm256_extracti128_si256 : vextracti128
 _mm256_broadcastsi128_si256 : vbroadcasti128
*/

/* Setup for the AVX CTR loop: ctr2 packs (ctr-1, ctr) so the first
   AVX__CTR_START increment yields (ctr, ctr+1); "two" is the per-lane
   increment step (+2 in each 128-bit lane's low 64 bits). */
#define AVX__CTR_LOOP_START \
    ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
    two = _mm256_setr_m128i(one, one); \
    two = _mm256_add_epi64(two, two); \

// two = _mm256_setr_epi64x(2, 0, 2, 0);

/* After the wide loop: pull the high 128-bit lane (the last counter used)
   back into the scalar ctr for the single-block tail loop. */
#define AVX__CTR_LOOP_ENC \
    ctr = _mm256_extracti128_si256 (ctr2, 1); \
415
/* 256-bit VAES CTR mode: two keystream blocks per ymm register, so the wide
   loop covers NUM_WAYS * 2 blocks per pass. Counter pairs live in ctr2. */
VAES_FUNC_START2 (AesCtr_Code_HW_256)
{
  __m128i ctr = *p;                     /* counter state, carried across calls */
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;  /* total key count */
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);   /* 64-bit increment step */
  __m256i ctr2, two;                    /* initialized by AVX__CTR_LOOP_START */
  p += 2;                               /* p now points at the first key */

  WIDE_LOOP_START_AVX (AVX__CTR_LOOP_START)
  {
    const __m256i *w = keys;            /* broadcast keys from the AVX prologue */
    UInt32 r = numRounds - 2;
    WOP (AVX__DECLARE_VAR)
    /* advance counters and fold in the whitening key (keys[0]) per way */
    AVX__WOP_KEY (AVX__CTR_START, 0);

    w += 1;
    do
    {
      AVX__WOP_KEY (AVX__AES_ENC, 0)    /* one round per iteration */
      w += 1;
    }
    while (--r);
    AVX__WOP_KEY (AVX__AES_ENC_LAST, 0)

    WOP (AVX__CTR_END);                 /* data ^= keystream */
  }
  WIDE_LOOP_END_AVX (AVX__CTR_LOOP_ENC)

  SINGLE_LOOP
  {
    /* scalar 128-bit tail, identical to AesCtr_Code_HW's single loop;
       round-pair count reread from p[-1] (p was advanced by 2) */
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one);
    m = _mm_xor_si128 (ctr, p[0]);      /* counter ^ whitening key */
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0]);
      MM_OP_m (_mm_aesenc_si128, w[1]);
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128, w[0]);
    MM_OP_m (_mm_aesenclast_si128, w[1]);
    MM_XOR (*data, m);
  }

  p[-2] = ctr;                          /* persist counter for the next call */
}
467
#endif // USE_INTEL_VAES

#else // USE_INTEL_AES

/* no USE_INTEL_AES */

#pragma message("AES HW_SW stub was used")

/* Software fallback: the _HW entry points must still exist so callers can
   link; they simply forward to the portable software implementations
   (declared here, defined in Aes.c - NOTE(review): confirm location). */
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte

#define AES_FUNC_START(name) \
    void MY_FAST_CALL name(UInt32 *p, Byte *data, size_t numBlocks) \

#define AES_COMPAT_STUB(name) \
    AES_FUNC_START(name); \
    AES_FUNC_START(name ## _HW) \
    { name(p, data, numBlocks); }

AES_COMPAT_STUB (AesCbc_Encode)
AES_COMPAT_STUB (AesCbc_Decode)
AES_COMPAT_STUB (AesCtr_Code)

#endif // USE_INTEL_AES


#ifndef USE_INTEL_VAES

#pragma message("VAES HW_SW stub was used")

/* When the 256-bit VAES path is not compiled, the *_256 entry points
   forward to the 128-bit (or software) *_HW implementations; AES_TYPE_*
   match whichever signature AES_FUNC_START currently has. */
#define VAES_COMPAT_STUB(name) \
    void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
    void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
    { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }

VAES_COMPAT_STUB (AesCbc_Decode_HW)
VAES_COMPAT_STUB (AesCtr_Code_HW)

#endif // ! USE_INTEL_VAES
507
508
#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)

  /* Detect compiler support for the ARMv8 Crypto Extension intrinsics.
     The version thresholds are conservative (see the original "fix that
     check" notes). */
  #if defined(__clang__)
    #if (__clang_major__ >= 8) // fix that check
      #define USE_HW_AES
    #endif
  #elif defined(__GNUC__)
    #if (__GNUC__ >= 6) // fix that check
      #define USE_HW_AES
    #endif
  #elif defined(_MSC_VER)
    #if _MSC_VER >= 1910
      #define USE_HW_AES
    #endif
  #endif

#ifdef USE_HW_AES

// #pragma message("=== AES HW === ")

#if defined(__clang__) || defined(__GNUC__)
  /* per-function target attribute enabling the crypto instructions */
  #ifdef MY_CPU_ARM64
    #define ATTRIB_AES __attribute__((__target__("+crypto")))
  #else
    #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
  #endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif

#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

/* 128-bit vector of bytes: the working unit for all AES state and keys */
typedef uint8x16_t v128;

/* Same calling convention as the x86 path: p is the key schedule with
   IV/counter at p[0] and a UInt32 round count at p[1]; data is numBlocks
   16-byte blocks transformed in place. */
#define AES_FUNC_START(name) \
    void MY_FAST_CALL name(v128 *p, v128 *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

/* dest = op(dest, src); MM_OP1_m is for unary ops on the working reg "m" */
#define MM_OP(op, dest, src) dest = op(dest, src);
#define MM_OP_m(op, src) MM_OP(op, m, src);
#define MM_OP1_m(op) m = op(m);

#define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src);
#define MM_XOR_m( src) MM_XOR(m, src);

/* vaeseq_u8 = AddRoundKey + SubBytes + ShiftRows; vaesmcq_u8 = MixColumns,
   so AES_E_MC_m performs one full middle encryption round */
#define AES_E_m(k) MM_OP_m (vaeseq_u8, k);
#define AES_E_MC_m(k) AES_E_m (k); MM_OP1_m(vaesmcq_u8);
570
571
/* ARM CBC encryption: serial (CBC chains on previous ciphertext), one block
   at a time. The first 10 round keys are hoisted into locals; the branches
   on numRounds2 select 10/12/14-round variants (NOTE(review): numRounds2
   appears to be rounds/2, i.e. 5/6/7 for AES-128/192/256 - confirm against
   the key-expansion code). */
AES_FUNC_START2 (AesCbc_Encode_HW)
{
  v128 m = *p;                          /* running CBC state (IV, then ct) */
  const v128 k0 = p[2];
  const v128 k1 = p[3];
  const v128 k2 = p[4];
  const v128 k3 = p[5];
  const v128 k4 = p[6];
  const v128 k5 = p[7];
  const v128 k6 = p[8];
  const v128 k7 = p[9];
  const v128 k8 = p[10];
  const v128 k9 = p[11];
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  const v128 *w = p + ((size_t)numRounds2 * 2);
  const v128 k_z1 = w[1];               /* last vaeseq key */
  const v128 k_z0 = w[2];               /* final XOR key */
  for (; numBlocks != 0; numBlocks--, data++)
  {
    MM_XOR_m (*data);                   /* CBC chain: state ^= plaintext */
    AES_E_MC_m (k0)
    AES_E_MC_m (k1)
    AES_E_MC_m (k2)
    AES_E_MC_m (k3)
    AES_E_MC_m (k4)
    AES_E_MC_m (k5)
    AES_E_MC_m (k6)
    AES_E_MC_m (k7)
    AES_E_MC_m (k8)
    if (numRounds2 >= 6)
    {
      /* extra rounds for 192/256-bit keys */
      AES_E_MC_m (k9)
      AES_E_MC_m (p[12])
      if (numRounds2 != 6)
      {
        /* two more rounds for 256-bit keys */
        AES_E_MC_m (p[13])
        AES_E_MC_m (p[14])
      }
    }
    AES_E_m (k_z1);                     /* final round: no MixColumns */
    MM_XOR_m (k_z0);
    *data = m;
  }
  *p = m;                               /* persist last ciphertext as IV */
}
617
618
/* Multi-way macro family (ARM variant): always 8-way, since AArch64 has
   32 NEON registers. Same structure as the x86 WOP family above. */
#define WOP_1(op)
#define WOP_2(op) WOP_1 (op) op (m1, 1);
#define WOP_3(op) WOP_2 (op) op (m2, 2);
#define WOP_4(op) WOP_3 (op) op (m3, 3);
#define WOP_5(op) WOP_4 (op) op (m4, 4);
#define WOP_6(op) WOP_5 (op) op (m5, 5);
#define WOP_7(op) WOP_6 (op) op (m6, 6);
#define WOP_8(op) WOP_7 (op) op (m7, 7);

  #define NUM_WAYS 8
  #define WOP_M1 WOP_8

#define WOP(op) op (m0, 0); WOP_M1(op)

/* Per-way operations plugged into WOP(...) */
#define DECLARE_VAR(reg, ii) v128 reg
#define LOAD_data( reg, ii) reg = data[ii];
#define STORE_data( reg, ii) data[ii] = reg;
#if (NUM_WAYS > 1)
/* CBC decode: XOR with the previous ciphertext block */
#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]);
#endif

/* reg = op(reg, key) - "key" is bound by WOP_KEY below */
#define MM_OP_key(op, reg) MM_OP (op, reg, key);

/* vaesdq_u8 = AddRoundKey + InvSubBytes + InvShiftRows;
   vaesimcq_u8 = InvMixColumns - together one middle decryption round */
#define AES_D_m(k) MM_OP_m (vaesdq_u8, k);
#define AES_D_IMC_m(k) AES_D_m (k); MM_OP1_m (vaesimcq_u8);

#define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg)
#define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg)
#define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg)

#define AES_D_IMC( reg, ii) AES_D (reg, ii); reg = vaesimcq_u8(reg)
#define AES_E_MC( reg, ii) AES_E (reg, ii); reg = vaesmcq_u8(reg)

/* CTR mode: bump the 64-bit counter, snapshot it (as bytes) into the way */
#define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one); reg = vreinterpretq_u8_u64(ctr);
#define CTR_END( reg, ii) MM_XOR (data[ii], reg);

/* bind round key w[n] into "key" and apply op to all ways */
#define WOP_KEY(op, n) { \
    const v128 key = w[n]; \
    WOP(op); }

/* Wide/tail loop scaffolding, identical in shape to the x86 versions */
#define WIDE_LOOP_START \
    dataEnd = data + numBlocks; \
    if (numBlocks >= NUM_WAYS) \
    { dataEnd -= NUM_WAYS; do { \

#define WIDE_LOOP_END \
    data += NUM_WAYS; \
    } while (data <= dataEnd); \
    dataEnd += NUM_WAYS; } \

#define SINGLE_LOOP \
    for (; data < dataEnd; data++)
671
672
/* ARM CBC decryption: parallel 8-way wide loop plus scalar tail.
   Decryption keys are walked from the end of the schedule backwards,
   two per loop iteration. */
AES_FUNC_START2 (AesCbc_Decode_HW)
{
  v128 iv = *p;
  /* wStart points at the end of the key schedule (p[1] = round-pair count) */
  const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  p += 2;                               /* p now points at the first key */

  WIDE_LOOP_START
  {
    const v128 *w = wStart;
    WOP (DECLARE_VAR)
    WOP (LOAD_data);
    WOP_KEY (AES_D_IMC, 2)              /* first round uses w[2] */
    do
    {
      /* two rounds per iteration, keys walked backwards */
      WOP_KEY (AES_D_IMC, 1)
      WOP_KEY (AES_D_IMC, 0)
      w -= 2;
    }
    while (w != p);
    WOP_KEY (AES_D, 1)                  /* final round: no InvMixColumns */
    WOP_KEY (AES_XOR, 0)                /* final key XOR */
    /* CBC unchain: first way uses carried iv, others use data[ii-1] */
    MM_XOR (m0, iv);
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];            /* last ciphertext becomes next iv */
    WOP (STORE_data);
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = wStart;
    v128 m = *data;
    AES_D_IMC_m (w[2])
    do
    {
      AES_D_IMC_m (w[1]);
      AES_D_IMC_m (w[0]);
      w -= 2;
    }
    while (w != p);
    AES_D_m (w[1]);                     /* final round: no InvMixColumns */
    MM_XOR_m (w[0]);
    MM_XOR_m (iv);                      /* CBC unchain */
    iv = *data;                         /* save ciphertext before overwrite */
    *data = m;
  }

  p[-2] = iv;                           /* store back iv (p was advanced by 2) */
}
723
724
/* ARM CTR mode (encrypt == decrypt): keystream from an incrementing 64-bit
   counter, XORed into the data; fully parallel 8-way wide loop plus tail. */
AES_FUNC_START2 (AesCtr_Code_HW)
{
  uint64x2_t ctr = vreinterpretq_u64_u8(*p);  /* counter carried across calls */
  /* wEnd points at the end of the key schedule (p[1] = round-pair count) */
  const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  uint64x2_t one = vdupq_n_u64(0);
  one = vsetq_lane_u64(1, one, 0);      /* increment only the low 64-bit lane */
  p += 2;                               /* p now points at the first key */

  WIDE_LOOP_START
  {
    const v128 *w = p;
    WOP (DECLARE_VAR)
    WOP (CTR_START);                    /* assign incremented counters to ways */
    do
    {
      /* two full rounds per iteration */
      WOP_KEY (AES_E_MC, 0)
      WOP_KEY (AES_E_MC, 1)
      w += 2;
    }
    while (w != wEnd);
    WOP_KEY (AES_E_MC, 0)
    WOP_KEY (AES_E, 1)                  /* final round: no MixColumns */
    WOP_KEY (AES_XOR, 2)                /* final key XOR */
    WOP (CTR_END);                      /* data[ii] ^= keystream */
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = p;
    v128 m;
    CTR_START (m, 0);
    do
    {
      AES_E_MC_m (w[0]);
      AES_E_MC_m (w[1]);
      w += 2;
    }
    while (w != wEnd);
    AES_E_MC_m (w[0]);
    AES_E_m (w[1]);                     /* final round: no MixColumns */
    MM_XOR_m (w[2]);                    /* final key XOR */
    CTR_END (m, 0);
  }

  p[-2] = vreinterpretq_u8_u64(ctr);    /* persist counter for the next call */
}

#endif // USE_HW_AES

#endif // MY_CPU_ARM_OR_ARM64