author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2023-06-21 00:00:00 +0000 |
---|---|---|
committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2023-12-17 14:59:19 +0500 |
commit | 5b39dc76f1bc82f941d5c800ab9f34407a06b53a (patch) | |
tree | fe5e17420300b715021a76328444088d32047963 /C/AesOpt.c | |
parent | 93be7d4abfd4233228f58ee1fbbcd76d91be66a4 (diff) | |
23.01
Diffstat (limited to 'C/AesOpt.c')
-rw-r--r-- | C/AesOpt.c | 348 |
1 file changed, 206 insertions, 142 deletions
@@ -1,39 +1,33 @@ | |||
1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions | 1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions |
2 | 2021-04-01 : Igor Pavlov : Public domain */ | 2 | 2023-04-02 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
6 | #include "Aes.h" | ||
6 | #include "CpuArch.h" | 7 | #include "CpuArch.h" |
7 | 8 | ||
8 | #ifdef MY_CPU_X86_OR_AMD64 | 9 | #ifdef MY_CPU_X86_OR_AMD64 |
9 | 10 | ||
10 | #if defined(__clang__) | 11 | #if defined(__INTEL_COMPILER) |
11 | #if __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 8) | ||
12 | #define USE_INTEL_AES | ||
13 | #define ATTRIB_AES __attribute__((__target__("aes"))) | ||
14 | #if (__clang_major__ >= 8) | ||
15 | #define USE_INTEL_VAES | ||
16 | #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2"))) | ||
17 | #endif | ||
18 | #endif | ||
19 | #elif defined(__GNUC__) | ||
20 | #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) | ||
21 | #define USE_INTEL_AES | ||
22 | #ifndef __AES__ | ||
23 | #define ATTRIB_AES __attribute__((__target__("aes"))) | ||
24 | #endif | ||
25 | #if (__GNUC__ >= 8) | ||
26 | #define USE_INTEL_VAES | ||
27 | #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2"))) | ||
28 | #endif | ||
29 | #endif | ||
30 | #elif defined(__INTEL_COMPILER) | ||
31 | #if (__INTEL_COMPILER >= 1110) | 12 | #if (__INTEL_COMPILER >= 1110) |
32 | #define USE_INTEL_AES | 13 | #define USE_INTEL_AES |
33 | #if (__INTEL_COMPILER >= 1900) | 14 | #if (__INTEL_COMPILER >= 1900) |
34 | #define USE_INTEL_VAES | 15 | #define USE_INTEL_VAES |
35 | #endif | 16 | #endif |
36 | #endif | 17 | #endif |
18 | #elif defined(__clang__) && (__clang_major__ > 3 || __clang_major__ == 3 && __clang_minor__ >= 8) \ | ||
19 | || defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4) | ||
20 | #define USE_INTEL_AES | ||
21 | #if !defined(__AES__) | ||
22 | #define ATTRIB_AES __attribute__((__target__("aes"))) | ||
23 | #endif | ||
24 | #if defined(__clang__) && (__clang_major__ >= 8) \ | ||
25 | || defined(__GNUC__) && (__GNUC__ >= 8) | ||
26 | #define USE_INTEL_VAES | ||
27 | #if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__) | ||
28 | #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2"))) | ||
29 | #endif | ||
30 | #endif | ||
37 | #elif defined(_MSC_VER) | 31 | #elif defined(_MSC_VER) |
38 | #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729) | 32 | #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729) |
39 | #define USE_INTEL_AES | 33 | #define USE_INTEL_AES |
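The merged clang/GCC branch above turns on USE_INTEL_AES for clang 3.8+ and GCC 4.4+, and defines the ATTRIB_AES / ATTRIB_VAES target attributes only when the corresponding __AES__ / __VAES__ / __AVX__ / __AVX2__ macros are not already set by the global compile flags. A minimal sketch of the idea (assuming GCC or clang built without -maes; encrypt_block is a hypothetical helper, not part of AesOpt.c):

    #include <wmmintrin.h>   // AES-NI intrinsics

    #if !defined(__AES__)
    #define ATTRIB_AES  __attribute__((__target__("aes")))
    #else
    #define ATTRIB_AES
    #endif

    // The per-function target attribute lets this one function use AES-NI while
    // the rest of the translation unit is built for the baseline ISA; the caller
    // is expected to verify CPU support at run time before dispatching here.
    ATTRIB_AES
    static void encrypt_block(__m128i *block, const __m128i *rk, unsigned numRounds)
    {
      unsigned i;
      __m128i m = _mm_xor_si128(*block, rk[0]);          // initial AddRoundKey
      for (i = 1; i < numRounds; i++)
        m = _mm_aesenc_si128(m, rk[i]);                  // middle rounds
      *block = _mm_aesenclast_si128(m, rk[numRounds]);   // last round (no MixColumns)
    }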
@@ -56,12 +50,15 @@ | |||
56 | #include <wmmintrin.h> | 50 | #include <wmmintrin.h> |
57 | 51 | ||
58 | #ifndef USE_INTEL_VAES | 52 | #ifndef USE_INTEL_VAES |
59 | #define AES_TYPE_keys __m128i | 53 | #define AES_TYPE_keys UInt32 |
60 | #define AES_TYPE_data __m128i | 54 | #define AES_TYPE_data Byte |
55 | // #define AES_TYPE_keys __m128i | ||
56 | // #define AES_TYPE_data __m128i | ||
61 | #endif | 57 | #endif |
62 | 58 | ||
63 | #define AES_FUNC_START(name) \ | 59 | #define AES_FUNC_START(name) \ |
64 | void MY_FAST_CALL name(__m128i *p, __m128i *data, size_t numBlocks) | 60 | void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) |
61 | // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks) | ||
65 | 62 | ||
66 | #define AES_FUNC_START2(name) \ | 63 | #define AES_FUNC_START2(name) \ |
67 | AES_FUNC_START (name); \ | 64 | AES_FUNC_START (name); \ |
@@ -69,14 +66,16 @@ ATTRIB_AES \ | |||
69 | AES_FUNC_START (name) | 66 | AES_FUNC_START (name) |
70 | 67 | ||
71 | #define MM_OP(op, dest, src) dest = op(dest, src); | 68 | #define MM_OP(op, dest, src) dest = op(dest, src); |
72 | #define MM_OP_m(op, src) MM_OP(op, m, src); | 69 | #define MM_OP_m(op, src) MM_OP(op, m, src) |
73 | 70 | ||
74 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src); | 71 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) |
75 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src); | 72 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) |
76 | 73 | ||
77 | 74 | ||
78 | AES_FUNC_START2 (AesCbc_Encode_HW) | 75 | AES_FUNC_START2 (AesCbc_Encode_HW) |
79 | { | 76 | { |
77 | __m128i *p = (__m128i *)(void *)ivAes; | ||
78 | __m128i *data = (__m128i *)(void *)data8; | ||
80 | __m128i m = *p; | 79 | __m128i m = *p; |
81 | const __m128i k0 = p[2]; | 80 | const __m128i k0 = p[2]; |
82 | const __m128i k1 = p[3]; | 81 | const __m128i k1 = p[3]; |
@@ -86,17 +85,17 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
86 | UInt32 r = numRounds2; | 85 | UInt32 r = numRounds2; |
87 | const __m128i *w = p + 4; | 86 | const __m128i *w = p + 4; |
88 | __m128i temp = *data; | 87 | __m128i temp = *data; |
89 | MM_XOR (temp, k0); | 88 | MM_XOR (temp, k0) |
90 | MM_XOR (m, temp); | 89 | MM_XOR (m, temp) |
91 | MM_OP_m (_mm_aesenc_si128, k1); | 90 | MM_OP_m (_mm_aesenc_si128, k1) |
92 | do | 91 | do |
93 | { | 92 | { |
94 | MM_OP_m (_mm_aesenc_si128, w[0]); | 93 | MM_OP_m (_mm_aesenc_si128, w[0]) |
95 | MM_OP_m (_mm_aesenc_si128, w[1]); | 94 | MM_OP_m (_mm_aesenc_si128, w[1]) |
96 | w += 2; | 95 | w += 2; |
97 | } | 96 | } |
98 | while (--r); | 97 | while (--r); |
99 | MM_OP_m (_mm_aesenclast_si128, w[0]); | 98 | MM_OP_m (_mm_aesenclast_si128, w[0]) |
100 | *data = m; | 99 | *data = m; |
101 | } | 100 | } |
102 | *p = m; | 101 | *p = m; |
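AesCbc_Encode_HW above handles one block per loop iteration instead of using the NUM_WAYS-wide unrolling that the decode and CTR paths use further down: in CBC encryption each ciphertext block feeds the next one (c[i] = E_k(p[i] xor c[i-1])), so the blocks cannot be processed in parallel. Roughly, reusing the hypothetical encrypt_block helper from the sketch above:

    __m128i m = iv;                          // c[-1] = IV
    size_t i;
    for (i = 0; i < numBlocks; i++)
    {
      m = _mm_xor_si128(m, data[i]);         // p[i] xor c[i-1]
      encrypt_block(&m, rk, numRounds);      // hypothetical helper (see sketch above)
      data[i] = m;                           // store c[i], which chains into block i+1
    }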
@@ -104,14 +103,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
104 | 103 | ||
105 | 104 | ||
106 | #define WOP_1(op) | 105 | #define WOP_1(op) |
107 | #define WOP_2(op) WOP_1 (op) op (m1, 1); | 106 | #define WOP_2(op) WOP_1 (op) op (m1, 1) |
108 | #define WOP_3(op) WOP_2 (op) op (m2, 2); | 107 | #define WOP_3(op) WOP_2 (op) op (m2, 2) |
109 | #define WOP_4(op) WOP_3 (op) op (m3, 3); | 108 | #define WOP_4(op) WOP_3 (op) op (m3, 3) |
110 | #ifdef MY_CPU_AMD64 | 109 | #ifdef MY_CPU_AMD64 |
111 | #define WOP_5(op) WOP_4 (op) op (m4, 4); | 110 | #define WOP_5(op) WOP_4 (op) op (m4, 4) |
112 | #define WOP_6(op) WOP_5 (op) op (m5, 5); | 111 | #define WOP_6(op) WOP_5 (op) op (m5, 5) |
113 | #define WOP_7(op) WOP_6 (op) op (m6, 6); | 112 | #define WOP_7(op) WOP_6 (op) op (m6, 6) |
114 | #define WOP_8(op) WOP_7 (op) op (m7, 7); | 113 | #define WOP_8(op) WOP_7 (op) op (m7, 7) |
115 | #endif | 114 | #endif |
116 | /* | 115 | /* |
117 | #define WOP_9(op) WOP_8 (op) op (m8, 8); | 116 | #define WOP_9(op) WOP_8 (op) op (m8, 8); |
@@ -130,20 +129,20 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
130 | #define WOP_M1 WOP_4 | 129 | #define WOP_M1 WOP_4 |
131 | #endif | 130 | #endif |
132 | 131 | ||
133 | #define WOP(op) op (m0, 0); WOP_M1(op) | 132 | #define WOP(op) op (m0, 0) WOP_M1(op) |
134 | 133 | ||
135 | 134 | ||
136 | #define DECLARE_VAR(reg, ii) __m128i reg | 135 | #define DECLARE_VAR(reg, ii) __m128i reg; |
137 | #define LOAD_data( reg, ii) reg = data[ii]; | 136 | #define LOAD_data( reg, ii) reg = data[ii]; |
138 | #define STORE_data( reg, ii) data[ii] = reg; | 137 | #define STORE_data( reg, ii) data[ii] = reg; |
139 | #if (NUM_WAYS > 1) | 138 | #if (NUM_WAYS > 1) |
140 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]); | 139 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) |
141 | #endif | 140 | #endif |
142 | 141 | ||
143 | #define AVX__DECLARE_VAR(reg, ii) __m256i reg | 142 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; |
144 | #define AVX__LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; | 143 | #define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; |
145 | #define AVX__STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; | 144 | #define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; |
146 | #define AVX__XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])); | 145 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) |
147 | 146 | ||
148 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); | 147 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); |
149 | 148 | ||
@@ -154,23 +153,23 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
154 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) | 153 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) |
155 | 154 | ||
156 | 155 | ||
157 | #define AVX__AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) | 156 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) |
158 | #define AVX__AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) | 157 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) |
159 | #define AVX__AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) | 158 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) |
160 | #define AVX__AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) | 159 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) |
161 | #define AVX__AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) | 160 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) |
162 | 161 | ||
163 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one); reg = ctr; | 162 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; |
164 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg); | 163 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg) |
165 | 164 | ||
166 | #define AVX__CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two); reg = _mm256_xor_si256(ctr2, key); | 165 | #define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); |
167 | #define AVX__CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg); | 166 | #define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) |
168 | 167 | ||
169 | #define WOP_KEY(op, n) { \ | 168 | #define WOP_KEY(op, n) { \ |
170 | const __m128i key = w[n]; \ | 169 | const __m128i key = w[n]; \ |
171 | WOP(op); } | 170 | WOP(op); } |
172 | 171 | ||
173 | #define AVX__WOP_KEY(op, n) { \ | 172 | #define AVX_WOP_KEY(op, n) { \ |
174 | const __m256i key = w[n]; \ | 173 | const __m256i key = w[n]; \ |
175 | WOP(op); } | 174 | WOP(op); } |
176 | 175 | ||
@@ -218,6 +217,8 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
218 | 217 | ||
219 | AES_FUNC_START2 (AesCbc_Decode_HW) | 218 | AES_FUNC_START2 (AesCbc_Decode_HW) |
220 | { | 219 | { |
220 | __m128i *p = (__m128i *)(void *)ivAes; | ||
221 | __m128i *data = (__m128i *)(void *)data8; | ||
221 | __m128i iv = *p; | 222 | __m128i iv = *p; |
222 | const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1; | 223 | const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1; |
223 | const __m128i *dataEnd; | 224 | const __m128i *dataEnd; |
@@ -228,7 +229,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
228 | const __m128i *w = wStart; | 229 | const __m128i *w = wStart; |
229 | 230 | ||
230 | WOP (DECLARE_VAR) | 231 | WOP (DECLARE_VAR) |
231 | WOP (LOAD_data); | 232 | WOP (LOAD_data) |
232 | WOP_KEY (AES_XOR, 1) | 233 | WOP_KEY (AES_XOR, 1) |
233 | 234 | ||
234 | do | 235 | do |
@@ -239,10 +240,10 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
239 | while (w != p); | 240 | while (w != p); |
240 | WOP_KEY (AES_DEC_LAST, 0) | 241 | WOP_KEY (AES_DEC_LAST, 0) |
241 | 242 | ||
242 | MM_XOR (m0, iv); | 243 | MM_XOR (m0, iv) |
243 | WOP_M1 (XOR_data_M1) | 244 | WOP_M1 (XOR_data_M1) |
244 | iv = data[NUM_WAYS - 1]; | 245 | iv = data[NUM_WAYS - 1]; |
245 | WOP (STORE_data); | 246 | WOP (STORE_data) |
246 | } | 247 | } |
247 | WIDE_LOOP_END | 248 | WIDE_LOOP_END |
248 | 249 | ||
@@ -252,15 +253,15 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
252 | __m128i m = _mm_xor_si128 (w[2], *data); | 253 | __m128i m = _mm_xor_si128 (w[2], *data); |
253 | do | 254 | do |
254 | { | 255 | { |
255 | MM_OP_m (_mm_aesdec_si128, w[1]); | 256 | MM_OP_m (_mm_aesdec_si128, w[1]) |
256 | MM_OP_m (_mm_aesdec_si128, w[0]); | 257 | MM_OP_m (_mm_aesdec_si128, w[0]) |
257 | w -= 2; | 258 | w -= 2; |
258 | } | 259 | } |
259 | while (w != p); | 260 | while (w != p); |
260 | MM_OP_m (_mm_aesdec_si128, w[1]); | 261 | MM_OP_m (_mm_aesdec_si128, w[1]) |
261 | MM_OP_m (_mm_aesdeclast_si128, w[0]); | 262 | MM_OP_m (_mm_aesdeclast_si128, w[0]) |
262 | 263 | ||
263 | MM_XOR (m, iv); | 264 | MM_XOR (m, iv) |
264 | iv = *data; | 265 | iv = *data; |
265 | *data = m; | 266 | *data = m; |
266 | } | 267 | } |
@@ -271,6 +272,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
271 | 272 | ||
272 | AES_FUNC_START2 (AesCtr_Code_HW) | 273 | AES_FUNC_START2 (AesCtr_Code_HW) |
273 | { | 274 | { |
275 | __m128i *p = (__m128i *)(void *)ivAes; | ||
276 | __m128i *data = (__m128i *)(void *)data8; | ||
274 | __m128i ctr = *p; | 277 | __m128i ctr = *p; |
275 | UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; | 278 | UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; |
276 | const __m128i *dataEnd; | 279 | const __m128i *dataEnd; |
@@ -283,7 +286,7 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
283 | const __m128i *w = p; | 286 | const __m128i *w = p; |
284 | UInt32 r = numRoundsMinus2; | 287 | UInt32 r = numRoundsMinus2; |
285 | WOP (DECLARE_VAR) | 288 | WOP (DECLARE_VAR) |
286 | WOP (CTR_START); | 289 | WOP (CTR_START) |
287 | WOP_KEY (AES_XOR, 0) | 290 | WOP_KEY (AES_XOR, 0) |
288 | w += 1; | 291 | w += 1; |
289 | do | 292 | do |
@@ -294,7 +297,7 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
294 | while (--r); | 297 | while (--r); |
295 | WOP_KEY (AES_ENC_LAST, 0) | 298 | WOP_KEY (AES_ENC_LAST, 0) |
296 | 299 | ||
297 | WOP (CTR_END); | 300 | WOP (CTR_END) |
298 | } | 301 | } |
299 | WIDE_LOOP_END | 302 | WIDE_LOOP_END |
300 | 303 | ||
@@ -303,19 +306,19 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
303 | UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; | 306 | UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; |
304 | const __m128i *w = p; | 307 | const __m128i *w = p; |
305 | __m128i m; | 308 | __m128i m; |
306 | MM_OP (_mm_add_epi64, ctr, one); | 309 | MM_OP (_mm_add_epi64, ctr, one) |
307 | m = _mm_xor_si128 (ctr, p[0]); | 310 | m = _mm_xor_si128 (ctr, p[0]); |
308 | w += 1; | 311 | w += 1; |
309 | do | 312 | do |
310 | { | 313 | { |
311 | MM_OP_m (_mm_aesenc_si128, w[0]); | 314 | MM_OP_m (_mm_aesenc_si128, w[0]) |
312 | MM_OP_m (_mm_aesenc_si128, w[1]); | 315 | MM_OP_m (_mm_aesenc_si128, w[1]) |
313 | w += 2; | 316 | w += 2; |
314 | } | 317 | } |
315 | while (--numRounds2); | 318 | while (--numRounds2); |
316 | MM_OP_m (_mm_aesenc_si128, w[0]); | 319 | MM_OP_m (_mm_aesenc_si128, w[0]) |
317 | MM_OP_m (_mm_aesenclast_si128, w[1]); | 320 | MM_OP_m (_mm_aesenclast_si128, w[1]) |
318 | MM_XOR (*data, m); | 321 | MM_XOR (*data, m) |
319 | } | 322 | } |
320 | 323 | ||
321 | p[-2] = ctr; | 324 | p[-2] = ctr; |
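The per-block work in the CTR path is: advance the counter in its low 64-bit lane, run the counter (not the data) through the AES round sequence, and XOR the resulting keystream into the data block in place, which makes encryption and decryption the same operation. Compressed into a sketch that reuses the hypothetical encrypt_block helper from the first sketch (with 'one' holding 1 in the low 64-bit lane, as in the surrounding code):

    ctr = _mm_add_epi64(ctr, one);           // next counter value
    __m128i ks = ctr;
    encrypt_block(&ks, rk, numRounds);       // hypothetical helper (see first sketch)
    *data = _mm_xor_si128(*data, ks);        // keystream xor data: encrypt == decrypt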
@@ -325,17 +328,58 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
325 | 328 | ||
326 | #ifdef USE_INTEL_VAES | 329 | #ifdef USE_INTEL_VAES |
327 | 330 | ||
331 | /* | ||
332 | GCC before 2013-Jun: | ||
333 | <immintrin.h>: | ||
334 | #ifdef __AVX__ | ||
335 | #include <avxintrin.h> | ||
336 | #endif | ||
337 | GCC after 2013-Jun: | ||
338 | <immintrin.h>: | ||
339 | #include <avxintrin.h> | ||
340 | CLANG 3.8+: | ||
341 | { | ||
342 | <immintrin.h>: | ||
343 | #if !defined(_MSC_VER) || defined(__AVX__) | ||
344 | #include <avxintrin.h> | ||
345 | #endif | ||
346 | |||
347 | if (the compiler is clang for Windows and if global arch is not set for __AVX__) | ||
348 | [ if (defined(_MSC_VER) && !defined(__AVX__)) ] | ||
349 | { | ||
350 | <immintrin.h> doesn't include <avxintrin.h> | ||
351 | and we have 2 ways to fix it: | ||
352 | 1) we can define required __AVX__ before <immintrin.h> | ||
353 | or | ||
354 | 2) we can include <avxintrin.h> after <immintrin.h> | ||
355 | } | ||
356 | } | ||
357 | |||
358 | If we include <avxintrin.h> manually for GCC/CLANG, it's | ||
359 | required that <immintrin.h> must be included before <avxintrin.h>. | ||
360 | */ | ||
361 | |||
362 | /* | ||
328 | #if defined(__clang__) && defined(_MSC_VER) | 363 | #if defined(__clang__) && defined(_MSC_VER) |
329 | #define __SSE4_2__ | ||
330 | #define __AES__ | ||
331 | #define __AVX__ | 364 | #define __AVX__ |
332 | #define __AVX2__ | 365 | #define __AVX2__ |
333 | #define __VAES__ | 366 | #define __VAES__ |
334 | #define __AVX512F__ | ||
335 | #define __AVX512VL__ | ||
336 | #endif | 367 | #endif |
368 | */ | ||
337 | 369 | ||
338 | #include <immintrin.h> | 370 | #include <immintrin.h> |
371 | #if defined(__clang__) && defined(_MSC_VER) | ||
372 | #if !defined(__AVX__) | ||
373 | #include <avxintrin.h> | ||
374 | #endif | ||
375 | #if !defined(__AVX2__) | ||
376 | #include <avx2intrin.h> | ||
377 | #endif | ||
378 | #if !defined(__VAES__) | ||
379 | #include <vaesintrin.h> | ||
380 | #endif | ||
381 | #endif // __clang__ && _MSC_VER | ||
382 | |||
339 | 383 | ||
340 | #define VAES_FUNC_START2(name) \ | 384 | #define VAES_FUNC_START2(name) \ |
341 | AES_FUNC_START (name); \ | 385 | AES_FUNC_START (name); \ |
@@ -344,6 +388,8 @@ AES_FUNC_START (name) | |||
344 | 388 | ||
345 | VAES_FUNC_START2 (AesCbc_Decode_HW_256) | 389 | VAES_FUNC_START2 (AesCbc_Decode_HW_256) |
346 | { | 390 | { |
391 | __m128i *p = (__m128i *)(void *)ivAes; | ||
392 | __m128i *data = (__m128i *)(void *)data8; | ||
347 | __m128i iv = *p; | 393 | __m128i iv = *p; |
348 | const __m128i *dataEnd; | 394 | const __m128i *dataEnd; |
349 | UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; | 395 | UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; |
@@ -353,22 +399,22 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) | |||
353 | { | 399 | { |
354 | const __m256i *w = keys + numRounds - 2; | 400 | const __m256i *w = keys + numRounds - 2; |
355 | 401 | ||
356 | WOP (AVX__DECLARE_VAR) | 402 | WOP (AVX_DECLARE_VAR) |
357 | WOP (AVX__LOAD_data); | 403 | WOP (AVX_LOAD_data) |
358 | AVX__WOP_KEY (AVX__AES_XOR, 1) | 404 | AVX_WOP_KEY (AVX_AES_XOR, 1) |
359 | 405 | ||
360 | do | 406 | do |
361 | { | 407 | { |
362 | AVX__WOP_KEY (AVX__AES_DEC, 0) | 408 | AVX_WOP_KEY (AVX_AES_DEC, 0) |
363 | w--; | 409 | w--; |
364 | } | 410 | } |
365 | while (w != keys); | 411 | while (w != keys); |
366 | AVX__WOP_KEY (AVX__AES_DEC_LAST, 0) | 412 | AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) |
367 | 413 | ||
368 | AVX_XOR (m0, _mm256_setr_m128i(iv, data[0])); | 414 | AVX_XOR (m0, _mm256_setr_m128i(iv, data[0])) |
369 | WOP_M1 (AVX__XOR_data_M1) | 415 | WOP_M1 (AVX_XOR_data_M1) |
370 | iv = data[NUM_WAYS * 2 - 1]; | 416 | iv = data[NUM_WAYS * 2 - 1]; |
371 | WOP (AVX__STORE_data); | 417 | WOP (AVX_STORE_data) |
372 | } | 418 | } |
373 | WIDE_LOOP_END_AVX(;) | 419 | WIDE_LOOP_END_AVX(;) |
374 | 420 | ||
@@ -378,15 +424,15 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) | |||
378 | __m128i m = _mm_xor_si128 (w[2], *data); | 424 | __m128i m = _mm_xor_si128 (w[2], *data); |
379 | do | 425 | do |
380 | { | 426 | { |
381 | MM_OP_m (_mm_aesdec_si128, w[1]); | 427 | MM_OP_m (_mm_aesdec_si128, w[1]) |
382 | MM_OP_m (_mm_aesdec_si128, w[0]); | 428 | MM_OP_m (_mm_aesdec_si128, w[0]) |
383 | w -= 2; | 429 | w -= 2; |
384 | } | 430 | } |
385 | while (w != p); | 431 | while (w != p); |
386 | MM_OP_m (_mm_aesdec_si128, w[1]); | 432 | MM_OP_m (_mm_aesdec_si128, w[1]) |
387 | MM_OP_m (_mm_aesdeclast_si128, w[0]); | 433 | MM_OP_m (_mm_aesdeclast_si128, w[0]) |
388 | 434 | ||
389 | MM_XOR (m, iv); | 435 | MM_XOR (m, iv) |
390 | iv = *data; | 436 | iv = *data; |
391 | *data = m; | 437 | *data = m; |
392 | } | 438 | } |
@@ -403,18 +449,20 @@ AVX2: _mm256_add_epi64 : vpaddq ymm, ymm, ymm | |||
403 | _mm256_broadcastsi128_si256 : vbroadcasti128 | 449 | _mm256_broadcastsi128_si256 : vbroadcasti128 |
404 | */ | 450 | */ |
405 | 451 | ||
406 | #define AVX__CTR_LOOP_START \ | 452 | #define AVX_CTR_LOOP_START \ |
407 | ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \ | 453 | ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \ |
408 | two = _mm256_setr_m128i(one, one); \ | 454 | two = _mm256_setr_m128i(one, one); \ |
409 | two = _mm256_add_epi64(two, two); \ | 455 | two = _mm256_add_epi64(two, two); \ |
410 | 456 | ||
411 | // two = _mm256_setr_epi64x(2, 0, 2, 0); | 457 | // two = _mm256_setr_epi64x(2, 0, 2, 0); |
412 | 458 | ||
413 | #define AVX__CTR_LOOP_ENC \ | 459 | #define AVX_CTR_LOOP_ENC \ |
414 | ctr = _mm256_extracti128_si256 (ctr2, 1); \ | 460 | ctr = _mm256_extracti128_si256 (ctr2, 1); \ |
415 | 461 | ||
416 | VAES_FUNC_START2 (AesCtr_Code_HW_256) | 462 | VAES_FUNC_START2 (AesCtr_Code_HW_256) |
417 | { | 463 | { |
464 | __m128i *p = (__m128i *)(void *)ivAes; | ||
465 | __m128i *data = (__m128i *)(void *)data8; | ||
418 | __m128i ctr = *p; | 466 | __m128i ctr = *p; |
419 | UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; | 467 | UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; |
420 | const __m128i *dataEnd; | 468 | const __m128i *dataEnd; |
@@ -422,44 +470,44 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
422 | __m256i ctr2, two; | 470 | __m256i ctr2, two; |
423 | p += 2; | 471 | p += 2; |
424 | 472 | ||
425 | WIDE_LOOP_START_AVX (AVX__CTR_LOOP_START) | 473 | WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START) |
426 | { | 474 | { |
427 | const __m256i *w = keys; | 475 | const __m256i *w = keys; |
428 | UInt32 r = numRounds - 2; | 476 | UInt32 r = numRounds - 2; |
429 | WOP (AVX__DECLARE_VAR) | 477 | WOP (AVX_DECLARE_VAR) |
430 | AVX__WOP_KEY (AVX__CTR_START, 0); | 478 | AVX_WOP_KEY (AVX_CTR_START, 0) |
431 | 479 | ||
432 | w += 1; | 480 | w += 1; |
433 | do | 481 | do |
434 | { | 482 | { |
435 | AVX__WOP_KEY (AVX__AES_ENC, 0) | 483 | AVX_WOP_KEY (AVX_AES_ENC, 0) |
436 | w += 1; | 484 | w += 1; |
437 | } | 485 | } |
438 | while (--r); | 486 | while (--r); |
439 | AVX__WOP_KEY (AVX__AES_ENC_LAST, 0) | 487 | AVX_WOP_KEY (AVX_AES_ENC_LAST, 0) |
440 | 488 | ||
441 | WOP (AVX__CTR_END); | 489 | WOP (AVX_CTR_END) |
442 | } | 490 | } |
443 | WIDE_LOOP_END_AVX (AVX__CTR_LOOP_ENC) | 491 | WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC) |
444 | 492 | ||
445 | SINGLE_LOOP | 493 | SINGLE_LOOP |
446 | { | 494 | { |
447 | UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; | 495 | UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; |
448 | const __m128i *w = p; | 496 | const __m128i *w = p; |
449 | __m128i m; | 497 | __m128i m; |
450 | MM_OP (_mm_add_epi64, ctr, one); | 498 | MM_OP (_mm_add_epi64, ctr, one) |
451 | m = _mm_xor_si128 (ctr, p[0]); | 499 | m = _mm_xor_si128 (ctr, p[0]); |
452 | w += 1; | 500 | w += 1; |
453 | do | 501 | do |
454 | { | 502 | { |
455 | MM_OP_m (_mm_aesenc_si128, w[0]); | 503 | MM_OP_m (_mm_aesenc_si128, w[0]) |
456 | MM_OP_m (_mm_aesenc_si128, w[1]); | 504 | MM_OP_m (_mm_aesenc_si128, w[1]) |
457 | w += 2; | 505 | w += 2; |
458 | } | 506 | } |
459 | while (--numRounds2); | 507 | while (--numRounds2); |
460 | MM_OP_m (_mm_aesenc_si128, w[0]); | 508 | MM_OP_m (_mm_aesenc_si128, w[0]) |
461 | MM_OP_m (_mm_aesenclast_si128, w[1]); | 509 | MM_OP_m (_mm_aesenclast_si128, w[1]) |
462 | MM_XOR (*data, m); | 510 | MM_XOR (*data, m) |
463 | } | 511 | } |
464 | 512 | ||
465 | p[-2] = ctr; | 513 | p[-2] = ctr; |
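In the VAES variants each __m256i register carries two consecutive 16-byte blocks, so _mm256_aesenc_epi128 / _mm256_aesdec_epi128 advance two AES pipelines per instruction. For CTR, AVX_CTR_LOOP_START seeds the 256-bit counter pair as (ctr - 1, ctr) and builds a per-half step of 2, so a single 64-bit add moves both halves forward. A sketch of the lane arithmetic, assuming 'one' is the same __m128i increment used by the 128-bit path (1 in its low 64-bit lane):

    ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr);  // halves: ctr-1, ctr
    two  = _mm256_setr_m128i(one, one);
    two  = _mm256_add_epi64(two, two);                        // 2 in each half
    ctr2 = _mm256_add_epi64(ctr2, two);                       // halves: ctr+1, ctr+2 (first step)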
@@ -477,7 +525,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
477 | #define AES_TYPE_data Byte | 525 | #define AES_TYPE_data Byte |
478 | 526 | ||
479 | #define AES_FUNC_START(name) \ | 527 | #define AES_FUNC_START(name) \ |
480 | void MY_FAST_CALL name(UInt32 *p, Byte *data, size_t numBlocks) \ | 528 | void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \ |
481 | 529 | ||
482 | #define AES_COMPAT_STUB(name) \ | 530 | #define AES_COMPAT_STUB(name) \ |
483 | AES_FUNC_START(name); \ | 531 | AES_FUNC_START(name); \ |
@@ -496,8 +544,8 @@ AES_COMPAT_STUB (AesCtr_Code) | |||
496 | #pragma message("VAES HW_SW stub was used") | 544 | #pragma message("VAES HW_SW stub was used") |
497 | 545 | ||
498 | #define VAES_COMPAT_STUB(name) \ | 546 | #define VAES_COMPAT_STUB(name) \ |
499 | void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ | 547 | void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ |
500 | void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \ | 548 | void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \ |
501 | { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); } | 549 | { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); } |
502 | 550 | ||
503 | VAES_COMPAT_STUB (AesCbc_Decode_HW) | 551 | VAES_COMPAT_STUB (AesCbc_Decode_HW) |
@@ -551,7 +599,8 @@ VAES_COMPAT_STUB (AesCtr_Code_HW) | |||
551 | typedef uint8x16_t v128; | 599 | typedef uint8x16_t v128; |
552 | 600 | ||
553 | #define AES_FUNC_START(name) \ | 601 | #define AES_FUNC_START(name) \ |
554 | void MY_FAST_CALL name(v128 *p, v128 *data, size_t numBlocks) | 602 | void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) |
603 | // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks) | ||
555 | 604 | ||
556 | #define AES_FUNC_START2(name) \ | 605 | #define AES_FUNC_START2(name) \ |
557 | AES_FUNC_START (name); \ | 606 | AES_FUNC_START (name); \ |
@@ -559,18 +608,20 @@ ATTRIB_AES \ | |||
559 | AES_FUNC_START (name) | 608 | AES_FUNC_START (name) |
560 | 609 | ||
561 | #define MM_OP(op, dest, src) dest = op(dest, src); | 610 | #define MM_OP(op, dest, src) dest = op(dest, src); |
562 | #define MM_OP_m(op, src) MM_OP(op, m, src); | 611 | #define MM_OP_m(op, src) MM_OP(op, m, src) |
563 | #define MM_OP1_m(op) m = op(m); | 612 | #define MM_OP1_m(op) m = op(m); |
564 | 613 | ||
565 | #define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src); | 614 | #define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src) |
566 | #define MM_XOR_m( src) MM_XOR(m, src); | 615 | #define MM_XOR_m( src) MM_XOR(m, src) |
567 | 616 | ||
568 | #define AES_E_m(k) MM_OP_m (vaeseq_u8, k); | 617 | #define AES_E_m(k) MM_OP_m (vaeseq_u8, k) |
569 | #define AES_E_MC_m(k) AES_E_m (k); MM_OP1_m(vaesmcq_u8); | 618 | #define AES_E_MC_m(k) AES_E_m (k) MM_OP1_m(vaesmcq_u8) |
570 | 619 | ||
571 | 620 | ||
572 | AES_FUNC_START2 (AesCbc_Encode_HW) | 621 | AES_FUNC_START2 (AesCbc_Encode_HW) |
573 | { | 622 | { |
623 | v128 *p = (v128*)(void*)ivAes; | ||
624 | v128 *data = (v128*)(void*)data8; | ||
574 | v128 m = *p; | 625 | v128 m = *p; |
575 | const v128 k0 = p[2]; | 626 | const v128 k0 = p[2]; |
576 | const v128 k1 = p[3]; | 627 | const v128 k1 = p[3]; |
@@ -608,7 +659,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
608 | AES_E_MC_m (p[14]) | 659 | AES_E_MC_m (p[14]) |
609 | } | 660 | } |
610 | } | 661 | } |
611 | AES_E_m (k_z1); | 662 | AES_E_m (k_z1) |
612 | MM_XOR_m (k_z0); | 663 | MM_XOR_m (k_z0); |
613 | *data = m; | 664 | *data = m; |
614 | } | 665 | } |
@@ -617,44 +668,44 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
617 | 668 | ||
618 | 669 | ||
619 | #define WOP_1(op) | 670 | #define WOP_1(op) |
620 | #define WOP_2(op) WOP_1 (op) op (m1, 1); | 671 | #define WOP_2(op) WOP_1 (op) op (m1, 1) |
621 | #define WOP_3(op) WOP_2 (op) op (m2, 2); | 672 | #define WOP_3(op) WOP_2 (op) op (m2, 2) |
622 | #define WOP_4(op) WOP_3 (op) op (m3, 3); | 673 | #define WOP_4(op) WOP_3 (op) op (m3, 3) |
623 | #define WOP_5(op) WOP_4 (op) op (m4, 4); | 674 | #define WOP_5(op) WOP_4 (op) op (m4, 4) |
624 | #define WOP_6(op) WOP_5 (op) op (m5, 5); | 675 | #define WOP_6(op) WOP_5 (op) op (m5, 5) |
625 | #define WOP_7(op) WOP_6 (op) op (m6, 6); | 676 | #define WOP_7(op) WOP_6 (op) op (m6, 6) |
626 | #define WOP_8(op) WOP_7 (op) op (m7, 7); | 677 | #define WOP_8(op) WOP_7 (op) op (m7, 7) |
627 | 678 | ||
628 | #define NUM_WAYS 8 | 679 | #define NUM_WAYS 8 |
629 | #define WOP_M1 WOP_8 | 680 | #define WOP_M1 WOP_8 |
630 | 681 | ||
631 | #define WOP(op) op (m0, 0); WOP_M1(op) | 682 | #define WOP(op) op (m0, 0) WOP_M1(op) |
632 | 683 | ||
633 | #define DECLARE_VAR(reg, ii) v128 reg | 684 | #define DECLARE_VAR(reg, ii) v128 reg; |
634 | #define LOAD_data( reg, ii) reg = data[ii]; | 685 | #define LOAD_data( reg, ii) reg = data[ii]; |
635 | #define STORE_data( reg, ii) data[ii] = reg; | 686 | #define STORE_data( reg, ii) data[ii] = reg; |
636 | #if (NUM_WAYS > 1) | 687 | #if (NUM_WAYS > 1) |
637 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]); | 688 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) |
638 | #endif | 689 | #endif |
639 | 690 | ||
640 | #define MM_OP_key(op, reg) MM_OP (op, reg, key); | 691 | #define MM_OP_key(op, reg) MM_OP (op, reg, key) |
641 | 692 | ||
642 | #define AES_D_m(k) MM_OP_m (vaesdq_u8, k); | 693 | #define AES_D_m(k) MM_OP_m (vaesdq_u8, k) |
643 | #define AES_D_IMC_m(k) AES_D_m (k); MM_OP1_m (vaesimcq_u8); | 694 | #define AES_D_IMC_m(k) AES_D_m (k) MM_OP1_m (vaesimcq_u8) |
644 | 695 | ||
645 | #define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg) | 696 | #define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg) |
646 | #define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg) | 697 | #define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg) |
647 | #define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg) | 698 | #define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg) |
648 | 699 | ||
649 | #define AES_D_IMC( reg, ii) AES_D (reg, ii); reg = vaesimcq_u8(reg) | 700 | #define AES_D_IMC( reg, ii) AES_D (reg, ii) reg = vaesimcq_u8(reg); |
650 | #define AES_E_MC( reg, ii) AES_E (reg, ii); reg = vaesmcq_u8(reg) | 701 | #define AES_E_MC( reg, ii) AES_E (reg, ii) reg = vaesmcq_u8(reg); |
651 | 702 | ||
652 | #define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one); reg = vreinterpretq_u8_u64(ctr); | 703 | #define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one) reg = vreinterpretq_u8_u64(ctr); |
653 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg); | 704 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg) |
654 | 705 | ||
655 | #define WOP_KEY(op, n) { \ | 706 | #define WOP_KEY(op, n) { \ |
656 | const v128 key = w[n]; \ | 707 | const v128 key = w[n]; \ |
657 | WOP(op); } | 708 | WOP(op) } |
658 | 709 | ||
659 | #define WIDE_LOOP_START \ | 710 | #define WIDE_LOOP_START \ |
660 | dataEnd = data + numBlocks; \ | 711 | dataEnd = data + numBlocks; \ |
@@ -672,6 +723,8 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
672 | 723 | ||
673 | AES_FUNC_START2 (AesCbc_Decode_HW) | 724 | AES_FUNC_START2 (AesCbc_Decode_HW) |
674 | { | 725 | { |
726 | v128 *p = (v128*)(void*)ivAes; | ||
727 | v128 *data = (v128*)(void*)data8; | ||
675 | v128 iv = *p; | 728 | v128 iv = *p; |
676 | const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; | 729 | const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; |
677 | const v128 *dataEnd; | 730 | const v128 *dataEnd; |
@@ -681,7 +734,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
681 | { | 734 | { |
682 | const v128 *w = wStart; | 735 | const v128 *w = wStart; |
683 | WOP (DECLARE_VAR) | 736 | WOP (DECLARE_VAR) |
684 | WOP (LOAD_data); | 737 | WOP (LOAD_data) |
685 | WOP_KEY (AES_D_IMC, 2) | 738 | WOP_KEY (AES_D_IMC, 2) |
686 | do | 739 | do |
687 | { | 740 | { |
@@ -695,7 +748,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
695 | MM_XOR (m0, iv); | 748 | MM_XOR (m0, iv); |
696 | WOP_M1 (XOR_data_M1) | 749 | WOP_M1 (XOR_data_M1) |
697 | iv = data[NUM_WAYS - 1]; | 750 | iv = data[NUM_WAYS - 1]; |
698 | WOP (STORE_data); | 751 | WOP (STORE_data) |
699 | } | 752 | } |
700 | WIDE_LOOP_END | 753 | WIDE_LOOP_END |
701 | 754 | ||
@@ -724,6 +777,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
724 | 777 | ||
725 | AES_FUNC_START2 (AesCtr_Code_HW) | 778 | AES_FUNC_START2 (AesCtr_Code_HW) |
726 | { | 779 | { |
780 | v128 *p = (v128*)(void*)ivAes; | ||
781 | v128 *data = (v128*)(void*)data8; | ||
727 | uint64x2_t ctr = vreinterpretq_u64_u8(*p); | 782 | uint64x2_t ctr = vreinterpretq_u64_u8(*p); |
728 | const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; | 783 | const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; |
729 | const v128 *dataEnd; | 784 | const v128 *dataEnd; |
@@ -735,7 +790,7 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
735 | { | 790 | { |
736 | const v128 *w = p; | 791 | const v128 *w = p; |
737 | WOP (DECLARE_VAR) | 792 | WOP (DECLARE_VAR) |
738 | WOP (CTR_START); | 793 | WOP (CTR_START) |
739 | do | 794 | do |
740 | { | 795 | { |
741 | WOP_KEY (AES_E_MC, 0) | 796 | WOP_KEY (AES_E_MC, 0) |
@@ -746,7 +801,7 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
746 | WOP_KEY (AES_E_MC, 0) | 801 | WOP_KEY (AES_E_MC, 0) |
747 | WOP_KEY (AES_E, 1) | 802 | WOP_KEY (AES_E, 1) |
748 | WOP_KEY (AES_XOR, 2) | 803 | WOP_KEY (AES_XOR, 2) |
749 | WOP (CTR_END); | 804 | WOP (CTR_END) |
750 | } | 805 | } |
751 | WIDE_LOOP_END | 806 | WIDE_LOOP_END |
752 | 807 | ||
@@ -762,10 +817,10 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
762 | w += 2; | 817 | w += 2; |
763 | } | 818 | } |
764 | while (w != wEnd); | 819 | while (w != wEnd); |
765 | AES_E_MC_m (w[0]); | 820 | AES_E_MC_m (w[0]) |
766 | AES_E_m (w[1]); | 821 | AES_E_m (w[1]) |
767 | MM_XOR_m (w[2]); | 822 | MM_XOR_m (w[2]) |
768 | CTR_END (m, 0); | 823 | CTR_END (m, 0) |
769 | } | 824 | } |
770 | 825 | ||
771 | p[-2] = vreinterpretq_u8_u64(ctr); | 826 | p[-2] = vreinterpretq_u8_u64(ctr); |
@@ -774,3 +829,12 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
774 | #endif // USE_HW_AES | 829 | #endif // USE_HW_AES |
775 | 830 | ||
776 | #endif // MY_CPU_ARM_OR_ARM64 | 831 | #endif // MY_CPU_ARM_OR_ARM64 |
832 | |||
833 | #undef NUM_WAYS | ||
834 | #undef WOP_M1 | ||
835 | #undef WOP | ||
836 | #undef DECLARE_VAR | ||
837 | #undef LOAD_data | ||
838 | #undef STORE_data | ||
839 | #undef USE_INTEL_AES | ||
840 | #undef USE_HW_AES | ||