author    | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2024-05-14 00:00:00 +0000
committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2024-05-15 23:55:04 +0500
commit    | fc662341e6f85da78ada0e443f6116b978f79f22 (patch)
tree      | 1be1cc402a7a9cbc18d4eeea6b141354c2d559e3 /C/AesOpt.c
parent    | 5b39dc76f1bc82f941d5c800ab9f34407a06b53a (diff)
download  | 7zip-fc662341e6f85da78ada0e443f6116b978f79f22.tar.gz .tar.bz2 .zip
24.05
Diffstat (limited to 'C/AesOpt.c')
-rw-r--r-- | C/AesOpt.c | 225
1 file changed, 171 insertions, 54 deletions
@@ -1,5 +1,5 @@ | |||
1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions | 1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -15,8 +15,8 @@ | |||
15 | #define USE_INTEL_VAES | 15 | #define USE_INTEL_VAES |
16 | #endif | 16 | #endif |
17 | #endif | 17 | #endif |
18 | #elif defined(__clang__) && (__clang_major__ > 3 || __clang_major__ == 3 && __clang_minor__ >= 8) \ | 18 | #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ |
19 | || defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4) | 19 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400) |
20 | #define USE_INTEL_AES | 20 | #define USE_INTEL_AES |
21 | #if !defined(__AES__) | 21 | #if !defined(__AES__) |
22 | #define ATTRIB_AES __attribute__((__target__("aes"))) | 22 | #define ATTRIB_AES __attribute__((__target__("aes"))) |
@@ -35,27 +35,37 @@ | |||
35 | #define USE_INTEL_VAES | 35 | #define USE_INTEL_VAES |
36 | #endif | 36 | #endif |
37 | #endif | 37 | #endif |
38 | #ifndef USE_INTEL_AES | ||
39 | #define Z7_USE_AES_HW_STUB | ||
40 | #endif | ||
41 | #ifndef USE_INTEL_VAES | ||
42 | #define Z7_USE_VAES_HW_STUB | ||
43 | #endif | ||
38 | #endif | 44 | #endif |
39 | 45 | ||
40 | #ifndef ATTRIB_AES | 46 | #ifndef USE_INTEL_AES |
41 | #define ATTRIB_AES | 47 | // #define Z7_USE_AES_HW_STUB // for debug |
42 | #endif | 48 | #endif |
43 | #ifndef ATTRIB_VAES | 49 | #ifndef USE_INTEL_VAES |
44 | #define ATTRIB_VAES | 50 | // #define Z7_USE_VAES_HW_STUB // for debug |
45 | #endif | 51 | #endif |
46 | 52 | ||
47 | 53 | ||
48 | #ifdef USE_INTEL_AES | 54 | #ifdef USE_INTEL_AES |
49 | 55 | ||
50 | #include <wmmintrin.h> | 56 | #include <wmmintrin.h> |
51 | 57 | ||
52 | #ifndef USE_INTEL_VAES | 58 | #if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB) |
53 | #define AES_TYPE_keys UInt32 | 59 | #define AES_TYPE_keys UInt32 |
54 | #define AES_TYPE_data Byte | 60 | #define AES_TYPE_data Byte |
55 | // #define AES_TYPE_keys __m128i | 61 | // #define AES_TYPE_keys __m128i |
56 | // #define AES_TYPE_data __m128i | 62 | // #define AES_TYPE_data __m128i |
57 | #endif | 63 | #endif |
58 | 64 | ||
65 | #ifndef ATTRIB_AES | ||
66 | #define ATTRIB_AES | ||
67 | #endif | ||
68 | |||
59 | #define AES_FUNC_START(name) \ | 69 | #define AES_FUNC_START(name) \ |
60 | void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) | 70 | void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) |
61 | // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks) | 71 | // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks) |
@@ -69,8 +79,6 @@ AES_FUNC_START (name) | |||
69 | #define MM_OP_m(op, src) MM_OP(op, m, src) | 79 | #define MM_OP_m(op, src) MM_OP(op, m, src) |
70 | 80 | ||
71 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) | 81 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) |
72 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) | ||
73 | |||
74 | 82 | ||
75 | AES_FUNC_START2 (AesCbc_Encode_HW) | 83 | AES_FUNC_START2 (AesCbc_Encode_HW) |
76 | { | 84 | { |
@@ -139,11 +147,6 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
139 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) | 147 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) |
140 | #endif | 148 | #endif |
141 | 149 | ||
142 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; | ||
143 | #define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; | ||
144 | #define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; | ||
145 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) | ||
146 | |||
147 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); | 150 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); |
148 | 151 | ||
149 | #define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg) | 152 | #define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg) |
@@ -152,27 +155,13 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
152 | #define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg) | 155 | #define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg) |
153 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) | 156 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) |
154 | 157 | ||
155 | |||
156 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) | ||
157 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) | ||
158 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) | ||
159 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) | ||
160 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) | ||
161 | |||
162 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; | 158 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; |
163 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg) | 159 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg) |
164 | 160 | ||
165 | #define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); | ||
166 | #define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) | ||
167 | |||
168 | #define WOP_KEY(op, n) { \ | 161 | #define WOP_KEY(op, n) { \ |
169 | const __m128i key = w[n]; \ | 162 | const __m128i key = w[n]; \ |
170 | WOP(op); } | 163 | WOP(op); } |
171 | 164 | ||
172 | #define AVX_WOP_KEY(op, n) { \ | ||
173 | const __m256i key = w[n]; \ | ||
174 | WOP(op); } | ||
175 | |||
176 | 165 | ||
177 | #define WIDE_LOOP_START \ | 166 | #define WIDE_LOOP_START \ |
178 | dataEnd = data + numBlocks; \ | 167 | dataEnd = data + numBlocks; \ |
@@ -190,6 +179,40 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
190 | for (; data < dataEnd; data++) | 179 | for (; data < dataEnd; data++) |
191 | 180 | ||
192 | 181 | ||
182 | |||
183 | #ifdef USE_INTEL_VAES | ||
184 | |||
185 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) | ||
186 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; | ||
187 | #define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; | ||
188 | #define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; | ||
189 | /* | ||
190 | AVX_XOR_data_M1() needs unaligned memory load | ||
191 | if (we don't use _mm256_loadu_si256() here) | ||
192 | { | ||
193 | Most compilers with enabled optimizations generate fused AVX (LOAD + OP) | ||
194 | instruction that can load unaligned data. | ||
195 | But GCC and CLANG without -O2 or -O1 optimizations can generate separated | ||
196 | LOAD-ALIGNED (vmovdqa) instruction that will fail on execution. | ||
197 | } | ||
198 | Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here. | ||
199 | v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler. | ||
200 | */ | ||
201 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii]))) | ||
202 | // for debug only: the following code will fail on execution, if compiled by some compilers: | ||
203 | // #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) | ||
204 | |||
205 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) | ||
206 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) | ||
207 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) | ||
208 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) | ||
209 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) | ||
210 | #define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); | ||
211 | #define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) | ||
212 | #define AVX_WOP_KEY(op, n) { \ | ||
213 | const __m256i key = w[n]; \ | ||
214 | WOP(op); } | ||
215 | |||
193 | #define NUM_AES_KEYS_MAX 15 | 216 | #define NUM_AES_KEYS_MAX 15 |
194 | 217 | ||
195 | #define WIDE_LOOP_START_AVX(OP) \ | 218 | #define WIDE_LOOP_START_AVX(OP) \ |
@@ -214,6 +237,9 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
214 | /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, | 237 | /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, |
215 | MSVC still can insert vzeroupper instruction. */ | 238 | MSVC still can insert vzeroupper instruction. */ |
216 | 239 | ||
240 | #endif | ||
241 | |||
242 | |||
217 | 243 | ||
218 | AES_FUNC_START2 (AesCbc_Decode_HW) | 244 | AES_FUNC_START2 (AesCbc_Decode_HW) |
219 | { | 245 | { |
@@ -380,6 +406,9 @@ required that <immintrin.h> must be included before <avxintrin.h>. | |||
380 | #endif | 406 | #endif |
381 | #endif // __clang__ && _MSC_VER | 407 | #endif // __clang__ && _MSC_VER |
382 | 408 | ||
409 | #ifndef ATTRIB_VAES | ||
410 | #define ATTRIB_VAES | ||
411 | #endif | ||
383 | 412 | ||
384 | #define VAES_FUNC_START2(name) \ | 413 | #define VAES_FUNC_START2(name) \ |
385 | AES_FUNC_START (name); \ | 414 | AES_FUNC_START (name); \ |
@@ -519,10 +548,18 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
519 | 548 | ||
520 | /* no USE_INTEL_AES */ | 549 | /* no USE_INTEL_AES */ |
521 | 550 | ||
551 | #if defined(Z7_USE_AES_HW_STUB) | ||
552 | // We can compile this file with another C compiler, | ||
553 | // or we can compile asm version. | ||
554 | // So we can generate real code instead of this stub function. | ||
555 | // #if defined(_MSC_VER) | ||
522 | #pragma message("AES HW_SW stub was used") | 556 | #pragma message("AES HW_SW stub was used") |
557 | // #endif | ||
523 | 558 | ||
559 | #if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB) | ||
524 | #define AES_TYPE_keys UInt32 | 560 | #define AES_TYPE_keys UInt32 |
525 | #define AES_TYPE_data Byte | 561 | #define AES_TYPE_data Byte |
562 | #endif | ||
526 | 563 | ||
527 | #define AES_FUNC_START(name) \ | 564 | #define AES_FUNC_START(name) \ |
528 | void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \ | 565 | void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \ |
@@ -535,13 +572,16 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
535 | AES_COMPAT_STUB (AesCbc_Encode) | 572 | AES_COMPAT_STUB (AesCbc_Encode) |
536 | AES_COMPAT_STUB (AesCbc_Decode) | 573 | AES_COMPAT_STUB (AesCbc_Decode) |
537 | AES_COMPAT_STUB (AesCtr_Code) | 574 | AES_COMPAT_STUB (AesCtr_Code) |
575 | #endif // Z7_USE_AES_HW_STUB | ||
538 | 576 | ||
539 | #endif // USE_INTEL_AES | 577 | #endif // USE_INTEL_AES |
540 | 578 | ||
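The stub comment added above spells out the fallback: if this file is built by a compiler without the x86 AES intrinsics, it still has to provide the _HW entry points. A hedged sketch of what the AES_COMPAT_STUB pattern amounts to, assuming (as the macro name and the AES_FUNC_START signature suggest) that each _HW symbol simply forwards to the portable routine from Aes.c:

```c
/* Illustrative expansion only -- the real AES_COMPAT_STUB macro is in the
   unchanged part of AesOpt.c. Shown for AES_COMPAT_STUB(AesCbc_Encode). */
#include "7zTypes.h"   /* UInt32, Byte, Z7_FASTCALL */

void Z7_FASTCALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks);  /* software version (Aes.c) */

void Z7_FASTCALL AesCbc_Encode_HW(UInt32 *p, Byte *data, size_t numBlocks)
{
  /* no AES hardware path from this compiler: fall back to software */
  AesCbc_Encode(p, data, numBlocks);
}
```

With Z7_USE_AES_HW_STUB now optional, a build can instead omit the stubs and link a real _HW implementation produced by another compiler or by the asm version, which is exactly what the new comment describes.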
541 | 579 | ||
542 | #ifndef USE_INTEL_VAES | 580 | #ifndef USE_INTEL_VAES |
543 | 581 | #if defined(Z7_USE_VAES_HW_STUB) | |
582 | // #if defined(_MSC_VER) | ||
544 | #pragma message("VAES HW_SW stub was used") | 583 | #pragma message("VAES HW_SW stub was used") |
584 | // #endif | ||
545 | 585 | ||
546 | #define VAES_COMPAT_STUB(name) \ | 586 | #define VAES_COMPAT_STUB(name) \ |
547 | void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ | 587 | void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ |
@@ -550,36 +590,59 @@ AES_COMPAT_STUB (AesCtr_Code) | |||
550 | 590 | ||
551 | VAES_COMPAT_STUB (AesCbc_Decode_HW) | 591 | VAES_COMPAT_STUB (AesCbc_Decode_HW) |
552 | VAES_COMPAT_STUB (AesCtr_Code_HW) | 592 | VAES_COMPAT_STUB (AesCtr_Code_HW) |
553 | 593 | #endif | |
554 | #endif // ! USE_INTEL_VAES | 594 | #endif // ! USE_INTEL_VAES |
555 | 595 | ||
556 | 596 | ||
597 | |||
598 | |||
557 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) | 599 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) |
558 | 600 | ||
559 | #if defined(__clang__) | 601 | #if defined(__ARM_FEATURE_AES) \ |
560 | #if (__clang_major__ >= 8) // fix that check | 602 | || defined(__ARM_FEATURE_CRYPTO) |
603 | #define USE_HW_AES | ||
604 | #else | ||
605 | #if defined(MY_CPU_ARM64) \ | ||
606 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
607 | || defined(Z7_MSC_VER_ORIGINAL) | ||
608 | #if defined(__ARM_FP) && \ | ||
609 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
610 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
611 | ) \ | ||
612 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
613 | #if defined(MY_CPU_ARM64) \ | ||
614 | || !defined(Z7_CLANG_VERSION) \ | ||
615 | || defined(__ARM_NEON) && \ | ||
616 | (Z7_CLANG_VERSION < 170000 || \ | ||
617 | Z7_CLANG_VERSION > 170001) | ||
561 | #define USE_HW_AES | 618 | #define USE_HW_AES |
562 | #endif | 619 | #endif |
563 | #elif defined(__GNUC__) | ||
564 | #if (__GNUC__ >= 6) // fix that check | ||
565 | #define USE_HW_AES | ||
566 | #endif | 620 | #endif |
567 | #elif defined(_MSC_VER) | ||
568 | #if _MSC_VER >= 1910 | ||
569 | #define USE_HW_AES | ||
570 | #endif | 621 | #endif |
571 | #endif | 622 | #endif |
572 | 623 | ||
573 | #ifdef USE_HW_AES | 624 | #ifdef USE_HW_AES |
574 | 625 | ||
575 | // #pragma message("=== AES HW === ") | 626 | // #pragma message("=== AES HW === ") |
627 | // __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_AES | ||
576 | 628 | ||
577 | #if defined(__clang__) || defined(__GNUC__) | 629 | #if defined(__clang__) || defined(__GNUC__) |
630 | #if !defined(__ARM_FEATURE_AES) && \ | ||
631 | !defined(__ARM_FEATURE_CRYPTO) | ||
578 | #ifdef MY_CPU_ARM64 | 632 | #ifdef MY_CPU_ARM64 |
633 | #if defined(__clang__) | ||
634 | #define ATTRIB_AES __attribute__((__target__("crypto"))) | ||
635 | #else | ||
579 | #define ATTRIB_AES __attribute__((__target__("+crypto"))) | 636 | #define ATTRIB_AES __attribute__((__target__("+crypto"))) |
637 | #endif | ||
580 | #else | 638 | #else |
639 | #if defined(__clang__) | ||
640 | #define ATTRIB_AES __attribute__((__target__("armv8-a,aes"))) | ||
641 | #else | ||
581 | #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | 642 | #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) |
643 | #endif | ||
582 | #endif | 644 | #endif |
645 | #endif | ||
583 | #else | 646 | #else |
584 | // _MSC_VER | 647 | // _MSC_VER |
585 | // for arm32 | 648 | // for arm32 |
@@ -590,12 +653,60 @@ VAES_COMPAT_STUB (AesCtr_Code_HW) | |||
590 | #define ATTRIB_AES | 653 | #define ATTRIB_AES |
591 | #endif | 654 | #endif |
592 | 655 | ||
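The reworked ATTRIB_AES definitions let each AES function carry a per-function target attribute instead of requiring crypto support in the global -march flags. A small sketch of the pattern with a recent GCC or clang on arm64 (OneRound is a made-up helper, not a function from this file), using the same AESE+AESMC pairing that the AES_E_MC_m macro applies per round key:

```c
/* Hypothetical standalone example, not code from this commit. */
#include <arm_neon.h>

#if defined(__clang__)
  #define ATTRIB_AES __attribute__((__target__("crypto")))   /* clang, arm64, as in this diff */
#else
  #define ATTRIB_AES __attribute__((__target__("+crypto")))  /* GCC, arm64, as in this diff */
#endif

ATTRIB_AES
static uint8x16_t OneRound(uint8x16_t m, uint8x16_t key)
{
  /* AESE = AddRoundKey + SubBytes + ShiftRows, AESMC = MixColumns */
  return vaesmcq_u8(vaeseq_u8(m, key));
}
```

Older clang versions still need the __ARM_FEATURE_CRYPTO workaround that this hunk adds just below, before <arm_neon.h> is included.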
593 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | 656 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) |
594 | #include <arm64_neon.h> | 657 | #include <arm64_neon.h> |
595 | #else | 658 | #else |
659 | /* | ||
660 | clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese | ||
661 | clang | ||
662 | 3.8.1 : __ARM_NEON : defined(__ARM_FEATURE_CRYPTO) | ||
663 | 7.0.1 : __ARM_NEON : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) | ||
664 | 11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) | ||
665 | 13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES) | ||
666 | 16 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 | ||
667 | */ | ||
668 | #if defined(__clang__) && __clang_major__ < 16 | ||
669 | #if !defined(__ARM_FEATURE_AES) && \ | ||
670 | !defined(__ARM_FEATURE_CRYPTO) | ||
671 | // #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") | ||
672 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
673 | #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 | ||
674 | // #if defined(__clang__) && __clang_major__ < 13 | ||
675 | #define __ARM_FEATURE_CRYPTO 1 | ||
676 | // #else | ||
677 | #define __ARM_FEATURE_AES 1 | ||
678 | // #endif | ||
679 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
680 | #endif | ||
681 | #endif // clang | ||
682 | |||
683 | #if defined(__clang__) | ||
684 | |||
685 | #if defined(__ARM_ARCH) && __ARM_ARCH < 8 | ||
686 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
687 | // #pragma message("#define __ARM_ARCH 8") | ||
688 | #undef __ARM_ARCH | ||
689 | #define __ARM_ARCH 8 | ||
690 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
691 | #endif | ||
692 | |||
693 | #endif // clang | ||
694 | |||
596 | #include <arm_neon.h> | 695 | #include <arm_neon.h> |
696 | |||
697 | #if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ | ||
698 | defined(__ARM_FEATURE_CRYPTO) && \ | ||
699 | defined(__ARM_FEATURE_AES) | ||
700 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
701 | #undef __ARM_FEATURE_CRYPTO | ||
702 | #undef __ARM_FEATURE_AES | ||
703 | #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET | ||
704 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
705 | // #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") | ||
597 | #endif | 706 | #endif |
598 | 707 | ||
708 | #endif // Z7_MSC_VER_ORIGINAL | ||
709 | |||
599 | typedef uint8x16_t v128; | 710 | typedef uint8x16_t v128; |
600 | 711 | ||
601 | #define AES_FUNC_START(name) \ | 712 | #define AES_FUNC_START(name) \ |
@@ -620,7 +731,7 @@ AES_FUNC_START (name) | |||
620 | 731 | ||
621 | AES_FUNC_START2 (AesCbc_Encode_HW) | 732 | AES_FUNC_START2 (AesCbc_Encode_HW) |
622 | { | 733 | { |
623 | v128 *p = (v128*)(void*)ivAes; | 734 | v128 * const p = (v128*)(void*)ivAes; |
624 | v128 *data = (v128*)(void*)data8; | 735 | v128 *data = (v128*)(void*)data8; |
625 | v128 m = *p; | 736 | v128 m = *p; |
626 | const v128 k0 = p[2]; | 737 | const v128 k0 = p[2]; |
@@ -639,7 +750,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
639 | const v128 k_z0 = w[2]; | 750 | const v128 k_z0 = w[2]; |
640 | for (; numBlocks != 0; numBlocks--, data++) | 751 | for (; numBlocks != 0; numBlocks--, data++) |
641 | { | 752 | { |
642 | MM_XOR_m (*data); | 753 | MM_XOR_m (*data) |
643 | AES_E_MC_m (k0) | 754 | AES_E_MC_m (k0) |
644 | AES_E_MC_m (k1) | 755 | AES_E_MC_m (k1) |
645 | AES_E_MC_m (k2) | 756 | AES_E_MC_m (k2) |
@@ -660,7 +771,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
660 | } | 771 | } |
661 | } | 772 | } |
662 | AES_E_m (k_z1) | 773 | AES_E_m (k_z1) |
663 | MM_XOR_m (k_z0); | 774 | MM_XOR_m (k_z0) |
664 | *data = m; | 775 | *data = m; |
665 | } | 776 | } |
666 | *p = m; | 777 | *p = m; |
@@ -745,7 +856,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
745 | while (w != p); | 856 | while (w != p); |
746 | WOP_KEY (AES_D, 1) | 857 | WOP_KEY (AES_D, 1) |
747 | WOP_KEY (AES_XOR, 0) | 858 | WOP_KEY (AES_XOR, 0) |
748 | MM_XOR (m0, iv); | 859 | MM_XOR (m0, iv) |
749 | WOP_M1 (XOR_data_M1) | 860 | WOP_M1 (XOR_data_M1) |
750 | iv = data[NUM_WAYS - 1]; | 861 | iv = data[NUM_WAYS - 1]; |
751 | WOP (STORE_data) | 862 | WOP (STORE_data) |
@@ -759,14 +870,14 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
759 | AES_D_IMC_m (w[2]) | 870 | AES_D_IMC_m (w[2]) |
760 | do | 871 | do |
761 | { | 872 | { |
762 | AES_D_IMC_m (w[1]); | 873 | AES_D_IMC_m (w[1]) |
763 | AES_D_IMC_m (w[0]); | 874 | AES_D_IMC_m (w[0]) |
764 | w -= 2; | 875 | w -= 2; |
765 | } | 876 | } |
766 | while (w != p); | 877 | while (w != p); |
767 | AES_D_m (w[1]); | 878 | AES_D_m (w[1]) |
768 | MM_XOR_m (w[0]); | 879 | MM_XOR_m (w[0]) |
769 | MM_XOR_m (iv); | 880 | MM_XOR_m (iv) |
770 | iv = *data; | 881 | iv = *data; |
771 | *data = m; | 882 | *data = m; |
772 | } | 883 | } |
@@ -783,6 +894,12 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
783 | const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; | 894 | const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; |
784 | const v128 *dataEnd; | 895 | const v128 *dataEnd; |
785 | uint64x2_t one = vdupq_n_u64(0); | 896 | uint64x2_t one = vdupq_n_u64(0); |
897 | |||
898 | // the bug in clang: | ||
899 | // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); | ||
900 | #if defined(__clang__) && (__clang_major__ <= 9) | ||
901 | #pragma GCC diagnostic ignored "-Wvector-conversion" | ||
902 | #endif | ||
786 | one = vsetq_lane_u64(1, one, 0); | 903 | one = vsetq_lane_u64(1, one, 0); |
787 | p += 2; | 904 | p += 2; |
788 | 905 | ||
@@ -809,11 +926,11 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
809 | { | 926 | { |
810 | const v128 *w = p; | 927 | const v128 *w = p; |
811 | v128 m; | 928 | v128 m; |
812 | CTR_START (m, 0); | 929 | CTR_START (m, 0) |
813 | do | 930 | do |
814 | { | 931 | { |
815 | AES_E_MC_m (w[0]); | 932 | AES_E_MC_m (w[0]) |
816 | AES_E_MC_m (w[1]); | 933 | AES_E_MC_m (w[1]) |
817 | w += 2; | 934 | w += 2; |
818 | } | 935 | } |
819 | while (w != wEnd); | 936 | while (w != wEnd); |