path: root/C/AesOpt.c
author     Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2024-05-14 00:00:00 +0000
committer  Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2024-05-15 23:55:04 +0500
commit     fc662341e6f85da78ada0e443f6116b978f79f22 (patch)
tree       1be1cc402a7a9cbc18d4eeea6b141354c2d559e3 /C/AesOpt.c
parent     5b39dc76f1bc82f941d5c800ab9f34407a06b53a (diff)
Diffstat (limited to 'C/AesOpt.c')
-rw-r--r--  C/AesOpt.c | 225
1 file changed, 171 insertions(+), 54 deletions(-)
diff --git a/C/AesOpt.c b/C/AesOpt.c
index cfa6413..58769ea 100644
--- a/C/AesOpt.c
+++ b/C/AesOpt.c
@@ -1,5 +1,5 @@
 /* AesOpt.c -- AES optimized code for x86 AES hardware instructions
-2023-04-02 : Igor Pavlov : Public domain */
+2024-03-01 : Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -15,8 +15,8 @@
 #define USE_INTEL_VAES
 #endif
 #endif
-#elif defined(__clang__) && (__clang_major__ > 3 || __clang_major__ == 3 && __clang_minor__ >= 8) \
-    || defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4)
+#elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
+    || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400)
 #define USE_INTEL_AES
 #if !defined(__AES__)
 #define ATTRIB_AES __attribute__((__target__("aes")))
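
The compiler checks above now compare a single numeric version macro instead of separate major/minor tests. The 30800 / 40400 thresholds are consistent with a major*10000 + minor*100 + patch encoding; a minimal sketch of such macros follows (the real definitions live in 7-Zip's common headers, not in this file, so treat the exact form as an assumption):

/* sketch only: assumed encoding major*10000 + minor*100 + patch,
   so "clang >= 3.8" becomes Z7_CLANG_VERSION >= 30800 */
#if defined(__clang__) && !defined(Z7_CLANG_VERSION)
#define Z7_CLANG_VERSION \
    (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
#endif
#if defined(__GNUC__) && !defined(Z7_GCC_VERSION)
#define Z7_GCC_VERSION \
    (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#endif
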
@@ -35,27 +35,37 @@
 #define USE_INTEL_VAES
 #endif
 #endif
+#ifndef USE_INTEL_AES
+#define Z7_USE_AES_HW_STUB
+#endif
+#ifndef USE_INTEL_VAES
+#define Z7_USE_VAES_HW_STUB
+#endif
 #endif
 
-#ifndef ATTRIB_AES
-  #define ATTRIB_AES
-#endif
-#ifndef ATTRIB_VAES
-  #define ATTRIB_VAES
-#endif
+#ifndef USE_INTEL_AES
+// #define Z7_USE_AES_HW_STUB // for debug
+#endif
+#ifndef USE_INTEL_VAES
+// #define Z7_USE_VAES_HW_STUB // for debug
+#endif
 
 
 #ifdef USE_INTEL_AES
 
 #include <wmmintrin.h>
 
-#ifndef USE_INTEL_VAES
+#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
 #define AES_TYPE_keys UInt32
 #define AES_TYPE_data Byte
 // #define AES_TYPE_keys __m128i
 // #define AES_TYPE_data __m128i
 #endif
 
+#ifndef ATTRIB_AES
+  #define ATTRIB_AES
+#endif
+
 #define AES_FUNC_START(name) \
     void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
 // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)
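
For reference, the AES_FUNC_START macro kept above gives every hardware entry point the same flat signature, for example:

/* AES_FUNC_START(AesCbc_Encode_HW) expands to the declaration: */
void Z7_FASTCALL AesCbc_Encode_HW(UInt32 *ivAes, Byte *data8, size_t numBlocks);

which lets the caller select an implementation at run time without changing call sites.
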
@@ -69,8 +79,6 @@ AES_FUNC_START (name)
 #define MM_OP_m(op, src) MM_OP(op, m, src)
 
 #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src)
-#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src)
-
 
 AES_FUNC_START2 (AesCbc_Encode_HW)
 {
@@ -139,11 +147,6 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1])
 #endif
 
-#define AVX_DECLARE_VAR(reg, ii) __m256i reg;
-#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
-#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
-#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))
-
 #define MM_OP_key(op, reg) MM_OP(op, reg, key);
 
 #define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg)
@@ -152,27 +155,13 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 #define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg)
 #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg)
 
-
-#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
-#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
-#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
-#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
-#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)
-
 #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr;
 #define CTR_END( reg, ii) MM_XOR (data[ii], reg)
 
-#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key);
-#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg)
-
 #define WOP_KEY(op, n) { \
     const __m128i key = w[n]; \
     WOP(op); }
 
-#define AVX_WOP_KEY(op, n) { \
-    const __m256i key = w[n]; \
-    WOP(op); }
-
 
 #define WIDE_LOOP_START \
     dataEnd = data + numBlocks; \
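
The CTR_START / CTR_END macros shown above capture the whole per-block pattern: bump the 128-bit counter with _mm_add_epi64, encrypt the counter, and XOR the keystream into the data block. A standalone single-block sketch of that pattern follows (not the 7-Zip code path; the round-key layout and round count are assumptions):

#include <emmintrin.h>   /* SSE2: _mm_add_epi64, _mm_xor_si128 */
#include <wmmintrin.h>   /* AES-NI: _mm_aesenc_si128 etc. (compile with -maes) */

/* One CTR block: w[0..nr] are assumed round keys, nr is the round count. */
static void Ctr_OneBlock_Sketch(__m128i *ctr, __m128i *block,
    const __m128i *w, unsigned nr)
{
  const __m128i one = _mm_set_epi64x(0, 1);   /* increment the low 64-bit lane */
  __m128i m;
  unsigned i;
  *ctr = _mm_add_epi64(*ctr, one);            /* CTR_START */
  m = _mm_xor_si128(*ctr, w[0]);              /* initial AddRoundKey */
  for (i = 1; i < nr; i++)
    m = _mm_aesenc_si128(m, w[i]);
  m = _mm_aesenclast_si128(m, w[nr]);
  *block = _mm_xor_si128(*block, m);          /* CTR_END */
}
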
@@ -190,6 +179,40 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 for (; data < dataEnd; data++)
 
 
+
+#ifdef USE_INTEL_VAES
+
+#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src)
+#define AVX_DECLARE_VAR(reg, ii) __m256i reg;
+#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
+#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
+/*
+AVX_XOR_data_M1() needs unaligned memory load
+if (we don't use _mm256_loadu_si256() here)
+{
+  Most compilers with enabled optimizations generate fused AVX (LOAD + OP)
+  instruction that can load unaligned data.
+  But GCC and CLANG without -O2 or -O1 optimizations can generate separated
+  LOAD-ALIGNED (vmovdqa) instruction that will fail on execution.
+}
+Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here.
+v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler.
+*/
+#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii])))
+// for debug only: the following code will fail on execution, if compiled by some compilers:
+// #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))
+
+#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
+#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
+#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
+#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
+#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)
+#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key);
+#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg)
+#define AVX_WOP_KEY(op, n) { \
+    const __m256i key = w[n]; \
+    WOP(op); }
+
 #define NUM_AES_KEYS_MAX 15
 
 #define WIDE_LOOP_START_AVX(OP) \
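
The comment block added above is the heart of this hunk: a plain dereference of a (const __m256i *) is an aligned access, so unoptimized builds may emit vmovdqa and fault on data that is only 16-byte aligned, while _mm256_loadu_si256() is always safe. A minimal standalone illustration of that distinction (assumes AVX is enabled for the translation unit):

#include <immintrin.h>

/* compile with -mavx (or equivalent); p may be only 16-byte aligned */
static __m256i Load256_Sketch(const void *p)
{
  /* return *(const __m256i *)p;  -- aligned load: may compile to vmovdqa
     at -O0 and fault on unaligned data */
  return _mm256_loadu_si256((const __m256i *)p);  /* unaligned-safe load */
}
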
@@ -214,6 +237,9 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
    MSVC still can insert vzeroupper instruction. */
 
+#endif
+
+
 
 AES_FUNC_START2 (AesCbc_Decode_HW)
 {
@@ -380,6 +406,9 @@ required that <immintrin.h> must be included before <avxintrin.h>.
 #endif
 #endif // __clang__ && _MSC_VER
 
+#ifndef ATTRIB_VAES
+  #define ATTRIB_VAES
+#endif
 
 #define VAES_FUNC_START2(name) \
 AES_FUNC_START (name); \
@@ -519,10 +548,18 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
 
 /* no USE_INTEL_AES */
 
+#if defined(Z7_USE_AES_HW_STUB)
+// We can compile this file with another C compiler,
+// or we can compile asm version.
+// So we can generate real code instead of this stub function.
+// #if defined(_MSC_VER)
 #pragma message("AES HW_SW stub was used")
+// #endif
 
+#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
 #define AES_TYPE_keys UInt32
 #define AES_TYPE_data Byte
+#endif
 
 #define AES_FUNC_START(name) \
     void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \
@@ -535,13 +572,16 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
 AES_COMPAT_STUB (AesCbc_Encode)
 AES_COMPAT_STUB (AesCbc_Decode)
 AES_COMPAT_STUB (AesCtr_Code)
+#endif // Z7_USE_AES_HW_STUB
 
 #endif // USE_INTEL_AES
 
 
 #ifndef USE_INTEL_VAES
-
+#if defined(Z7_USE_VAES_HW_STUB)
+// #if defined(_MSC_VER)
 #pragma message("VAES HW_SW stub was used")
+// #endif
 
 #define VAES_COMPAT_STUB(name) \
     void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
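
The stub path above exists so that this translation unit still exports the _HW / _256 entry points when it is built without AES-NI or VAES support (for example by an older compiler), and real code can come from another object file or an asm build instead. A plausible expansion of one such stub, shown only as a sketch because the full macro body lies outside this hunk's context:

/* assumed shape of one AES_COMPAT_STUB(AesCbc_Encode) expansion */
void Z7_FASTCALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks);  /* portable C version */

void Z7_FASTCALL AesCbc_Encode_HW(UInt32 *p, Byte *data, size_t numBlocks)
{
  AesCbc_Encode(p, data, numBlocks);   /* forward to the software path */
}
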
@@ -550,36 +590,59 @@ AES_COMPAT_STUB (AesCtr_Code)
 
 VAES_COMPAT_STUB (AesCbc_Decode_HW)
 VAES_COMPAT_STUB (AesCtr_Code_HW)
-
+#endif
 #endif // ! USE_INTEL_VAES
 
 
+
+
 #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
 
-  #if defined(__clang__)
-    #if (__clang_major__ >= 8) // fix that check
+  #if defined(__ARM_FEATURE_AES) \
+   || defined(__ARM_FEATURE_CRYPTO)
+    #define USE_HW_AES
+  #else
+  #if defined(MY_CPU_ARM64) \
+   || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
+   || defined(Z7_MSC_VER_ORIGINAL)
+  #if defined(__ARM_FP) && \
+        ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
+       || defined(__GNUC__) && (__GNUC__ >= 6) \
+        ) \
+     || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
+  #if defined(MY_CPU_ARM64) \
+   || !defined(Z7_CLANG_VERSION) \
+   || defined(__ARM_NEON) && \
+        (Z7_CLANG_VERSION < 170000 || \
+         Z7_CLANG_VERSION > 170001)
       #define USE_HW_AES
     #endif
-  #elif defined(__GNUC__)
-    #if (__GNUC__ >= 6) // fix that check
-      #define USE_HW_AES
     #endif
-  #elif defined(_MSC_VER)
-    #if _MSC_VER >= 1910
-      #define USE_HW_AES
     #endif
   #endif
 
 #ifdef USE_HW_AES
 
 // #pragma message("=== AES HW === ")
+// __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_AES
 
 #if defined(__clang__) || defined(__GNUC__)
+#if !defined(__ARM_FEATURE_AES) && \
+    !defined(__ARM_FEATURE_CRYPTO)
   #ifdef MY_CPU_ARM64
+#if defined(__clang__)
+    #define ATTRIB_AES __attribute__((__target__("crypto")))
+#else
     #define ATTRIB_AES __attribute__((__target__("+crypto")))
+#endif
   #else
+#if defined(__clang__)
+    #define ATTRIB_AES __attribute__((__target__("armv8-a,aes")))
+#else
     #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
+#endif
   #endif
+#endif
 #else
   // _MSC_VER
   // for arm32
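
The rewritten #if chain prefers the feature macros (__ARM_FEATURE_AES, or the older __ARM_FEATURE_CRYPTO) and only falls back to compiler-version and __ARM_FP checks when neither is defined. A quick standalone probe for seeing which branch a given toolchain would take (illustrative only, not part of AesOpt.c):

#if defined(__ARM_FEATURE_AES)
  #pragma message("AES feature macro: __ARM_FEATURE_AES")
#elif defined(__ARM_FEATURE_CRYPTO)
  #pragma message("AES feature macro: __ARM_FEATURE_CRYPTO (older naming)")
#else
  #pragma message("no AES feature macro: compiler-version / __ARM_FP fallback applies")
#endif
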
@@ -590,12 +653,60 @@ VAES_COMPAT_STUB (AesCtr_Code_HW)
   #define ATTRIB_AES
 #endif
 
-#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
+#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
 #include <arm64_neon.h>
 #else
+/*
+  clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese
+  clang
+    3.8.1 : __ARM_NEON             : defined(__ARM_FEATURE_CRYPTO)
+    7.0.1 : __ARM_NEON             : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
+   11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
+   13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES)
+   16     : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8
+*/
+#if defined(__clang__) && __clang_major__ < 16
+#if !defined(__ARM_FEATURE_AES) && \
+    !defined(__ARM_FEATURE_CRYPTO)
+// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
+  Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
+  #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
+// #if defined(__clang__) && __clang_major__ < 13
+  #define __ARM_FEATURE_CRYPTO 1
+// #else
+  #define __ARM_FEATURE_AES 1
+// #endif
+  Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
+#endif
+#endif // clang
+
+#if defined(__clang__)
+
+#if defined(__ARM_ARCH) && __ARM_ARCH < 8
+  Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
+// #pragma message("#define __ARM_ARCH 8")
+  #undef __ARM_ARCH
+  #define __ARM_ARCH 8
+  Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
+#endif
+
+#endif // clang
+
 #include <arm_neon.h>
+
+#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
+    defined(__ARM_FEATURE_CRYPTO) && \
+    defined(__ARM_FEATURE_AES)
+Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
+  #undef __ARM_FEATURE_CRYPTO
+  #undef __ARM_FEATURE_AES
+  #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
+Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
+// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
 #endif
 
+#endif // Z7_MSC_VER_ORIGINAL
+
 typedef uint8x16_t v128;
 
 #define AES_FUNC_START(name) \
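
With the headers and feature macros sorted out above, the NEON AES intrinsics become usable inside functions marked with ATTRIB_AES. A minimal illustration of one encryption round using those intrinsics (the function name is illustrative, not part of AesOpt.c; ATTRIB_AES is the macro defined earlier in this file):

#include <arm_neon.h>

ATTRIB_AES
static uint8x16_t AesRound_Sketch(uint8x16_t state, uint8x16_t roundKey)
{
  /* AESE = AddRoundKey + SubBytes + ShiftRows, AESMC = MixColumns */
  return vaesmcq_u8(vaeseq_u8(state, roundKey));
}
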
@@ -620,7 +731,7 @@ AES_FUNC_START (name)
 
 AES_FUNC_START2 (AesCbc_Encode_HW)
 {
-  v128 *p = (v128*)(void*)ivAes;
+  v128 * const p = (v128*)(void*)ivAes;
   v128 *data = (v128*)(void*)data8;
   v128 m = *p;
   const v128 k0 = p[2];
@@ -639,7 +750,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
   const v128 k_z0 = w[2];
   for (; numBlocks != 0; numBlocks--, data++)
   {
-    MM_XOR_m (*data);
+    MM_XOR_m (*data)
     AES_E_MC_m (k0)
     AES_E_MC_m (k1)
     AES_E_MC_m (k2)
@@ -660,7 +771,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
       }
     }
     AES_E_m (k_z1)
-    MM_XOR_m (k_z0);
+    MM_XOR_m (k_z0)
     *data = m;
   }
   *p = m;
@@ -745,7 +856,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
     while (w != p);
     WOP_KEY (AES_D, 1)
     WOP_KEY (AES_XOR, 0)
-    MM_XOR (m0, iv);
+    MM_XOR (m0, iv)
     WOP_M1 (XOR_data_M1)
     iv = data[NUM_WAYS - 1];
     WOP (STORE_data)
@@ -759,14 +870,14 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
     AES_D_IMC_m (w[2])
     do
     {
-      AES_D_IMC_m (w[1]);
-      AES_D_IMC_m (w[0]);
+      AES_D_IMC_m (w[1])
+      AES_D_IMC_m (w[0])
       w -= 2;
     }
     while (w != p);
-    AES_D_m (w[1]);
-    MM_XOR_m (w[0]);
-    MM_XOR_m (iv);
+    AES_D_m (w[1])
+    MM_XOR_m (w[0])
+    MM_XOR_m (iv)
     iv = *data;
     *data = m;
   }
@@ -783,6 +894,12 @@ AES_FUNC_START2 (AesCtr_Code_HW)
   const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
   const v128 *dataEnd;
   uint64x2_t one = vdupq_n_u64(0);
+
+// the bug in clang:
+// __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
+#if defined(__clang__) && (__clang_major__ <= 9)
+#pragma GCC diagnostic ignored "-Wvector-conversion"
+#endif
   one = vsetq_lane_u64(1, one, 0);
   p += 2;
 
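
The pragma added above only silences a warning from old clang NEON headers; the vsetq_lane_u64() call itself stays. An equivalent way to build the {1, 0} counter increment without the lane intrinsic, shown here only as a sketch (lane 0 is the low half, as the code above assumes):

#include <arm_neon.h>

static uint64x2_t CtrOne_Sketch(void)
{
  /* same value as: one = vdupq_n_u64(0); one = vsetq_lane_u64(1, one, 0); */
  return vcombine_u64(vcreate_u64(1), vcreate_u64(0));
}
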
@@ -809,11 +926,11 @@ AES_FUNC_START2 (AesCtr_Code_HW)
   {
     const v128 *w = p;
     v128 m;
-    CTR_START (m, 0);
+    CTR_START (m, 0)
     do
     {
-      AES_E_MC_m (w[0]);
-      AES_E_MC_m (w[1]);
+      AES_E_MC_m (w[0])
+      AES_E_MC_m (w[1])
       w += 2;
     }
     while (w != wEnd);