aboutsummaryrefslogtreecommitdiff
path: root/C/AesOpt.c
diff options
context:
space:
mode:
authorIgor Pavlov <87184205+ip7z@users.noreply.github.com>2023-06-21 00:00:00 +0000
committerIgor Pavlov <87184205+ip7z@users.noreply.github.com>2023-12-17 14:59:19 +0500
commit5b39dc76f1bc82f941d5c800ab9f34407a06b53a (patch)
treefe5e17420300b715021a76328444088d32047963 /C/AesOpt.c
parent93be7d4abfd4233228f58ee1fbbcd76d91be66a4 (diff)
download7zip-5b39dc76f1bc82f941d5c800ab9f34407a06b53a.tar.gz
7zip-5b39dc76f1bc82f941d5c800ab9f34407a06b53a.tar.bz2
7zip-5b39dc76f1bc82f941d5c800ab9f34407a06b53a.zip
23.0123.01
Diffstat (limited to 'C/AesOpt.c')
-rw-r--r--C/AesOpt.c348
1 files changed, 206 insertions, 142 deletions
diff --git a/C/AesOpt.c b/C/AesOpt.c
index 8be8ff6..cfa6413 100644
--- a/C/AesOpt.c
+++ b/C/AesOpt.c
@@ -1,39 +1,33 @@
1/* AesOpt.c -- AES optimized code for x86 AES hardware instructions 1/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
22021-04-01 : Igor Pavlov : Public domain */ 22023-04-02 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
6#include "Aes.h"
6#include "CpuArch.h" 7#include "CpuArch.h"
7 8
8#ifdef MY_CPU_X86_OR_AMD64 9#ifdef MY_CPU_X86_OR_AMD64
9 10
10 #if defined(__clang__) 11 #if defined(__INTEL_COMPILER)
11 #if __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 8)
12 #define USE_INTEL_AES
13 #define ATTRIB_AES __attribute__((__target__("aes")))
14 #if (__clang_major__ >= 8)
15 #define USE_INTEL_VAES
16 #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2")))
17 #endif
18 #endif
19 #elif defined(__GNUC__)
20 #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
21 #define USE_INTEL_AES
22 #ifndef __AES__
23 #define ATTRIB_AES __attribute__((__target__("aes")))
24 #endif
25 #if (__GNUC__ >= 8)
26 #define USE_INTEL_VAES
27 #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2")))
28 #endif
29 #endif
30 #elif defined(__INTEL_COMPILER)
31 #if (__INTEL_COMPILER >= 1110) 12 #if (__INTEL_COMPILER >= 1110)
32 #define USE_INTEL_AES 13 #define USE_INTEL_AES
33 #if (__INTEL_COMPILER >= 1900) 14 #if (__INTEL_COMPILER >= 1900)
34 #define USE_INTEL_VAES 15 #define USE_INTEL_VAES
35 #endif 16 #endif
36 #endif 17 #endif
18 #elif defined(__clang__) && (__clang_major__ > 3 || __clang_major__ == 3 && __clang_minor__ >= 8) \
19 || defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4)
20 #define USE_INTEL_AES
21 #if !defined(__AES__)
22 #define ATTRIB_AES __attribute__((__target__("aes")))
23 #endif
24 #if defined(__clang__) && (__clang_major__ >= 8) \
25 || defined(__GNUC__) && (__GNUC__ >= 8)
26 #define USE_INTEL_VAES
27 #if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__)
28 #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2")))
29 #endif
30 #endif
37 #elif defined(_MSC_VER) 31 #elif defined(_MSC_VER)
38 #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729) 32 #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
39 #define USE_INTEL_AES 33 #define USE_INTEL_AES
@@ -56,12 +50,15 @@
56#include <wmmintrin.h> 50#include <wmmintrin.h>
57 51
58#ifndef USE_INTEL_VAES 52#ifndef USE_INTEL_VAES
59#define AES_TYPE_keys __m128i 53#define AES_TYPE_keys UInt32
60#define AES_TYPE_data __m128i 54#define AES_TYPE_data Byte
55// #define AES_TYPE_keys __m128i
56// #define AES_TYPE_data __m128i
61#endif 57#endif
62 58
63#define AES_FUNC_START(name) \ 59#define AES_FUNC_START(name) \
64 void MY_FAST_CALL name(__m128i *p, __m128i *data, size_t numBlocks) 60 void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
61 // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)
65 62
66#define AES_FUNC_START2(name) \ 63#define AES_FUNC_START2(name) \
67AES_FUNC_START (name); \ 64AES_FUNC_START (name); \
@@ -69,14 +66,16 @@ ATTRIB_AES \
69AES_FUNC_START (name) 66AES_FUNC_START (name)
70 67
71#define MM_OP(op, dest, src) dest = op(dest, src); 68#define MM_OP(op, dest, src) dest = op(dest, src);
72#define MM_OP_m(op, src) MM_OP(op, m, src); 69#define MM_OP_m(op, src) MM_OP(op, m, src)
73 70
74#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src); 71#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src)
75#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src); 72#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src)
76 73
77 74
78AES_FUNC_START2 (AesCbc_Encode_HW) 75AES_FUNC_START2 (AesCbc_Encode_HW)
79{ 76{
77 __m128i *p = (__m128i *)(void *)ivAes;
78 __m128i *data = (__m128i *)(void *)data8;
80 __m128i m = *p; 79 __m128i m = *p;
81 const __m128i k0 = p[2]; 80 const __m128i k0 = p[2];
82 const __m128i k1 = p[3]; 81 const __m128i k1 = p[3];
@@ -86,17 +85,17 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
86 UInt32 r = numRounds2; 85 UInt32 r = numRounds2;
87 const __m128i *w = p + 4; 86 const __m128i *w = p + 4;
88 __m128i temp = *data; 87 __m128i temp = *data;
89 MM_XOR (temp, k0); 88 MM_XOR (temp, k0)
90 MM_XOR (m, temp); 89 MM_XOR (m, temp)
91 MM_OP_m (_mm_aesenc_si128, k1); 90 MM_OP_m (_mm_aesenc_si128, k1)
92 do 91 do
93 { 92 {
94 MM_OP_m (_mm_aesenc_si128, w[0]); 93 MM_OP_m (_mm_aesenc_si128, w[0])
95 MM_OP_m (_mm_aesenc_si128, w[1]); 94 MM_OP_m (_mm_aesenc_si128, w[1])
96 w += 2; 95 w += 2;
97 } 96 }
98 while (--r); 97 while (--r);
99 MM_OP_m (_mm_aesenclast_si128, w[0]); 98 MM_OP_m (_mm_aesenclast_si128, w[0])
100 *data = m; 99 *data = m;
101 } 100 }
102 *p = m; 101 *p = m;
@@ -104,14 +103,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
104 103
105 104
106#define WOP_1(op) 105#define WOP_1(op)
107#define WOP_2(op) WOP_1 (op) op (m1, 1); 106#define WOP_2(op) WOP_1 (op) op (m1, 1)
108#define WOP_3(op) WOP_2 (op) op (m2, 2); 107#define WOP_3(op) WOP_2 (op) op (m2, 2)
109#define WOP_4(op) WOP_3 (op) op (m3, 3); 108#define WOP_4(op) WOP_3 (op) op (m3, 3)
110#ifdef MY_CPU_AMD64 109#ifdef MY_CPU_AMD64
111#define WOP_5(op) WOP_4 (op) op (m4, 4); 110#define WOP_5(op) WOP_4 (op) op (m4, 4)
112#define WOP_6(op) WOP_5 (op) op (m5, 5); 111#define WOP_6(op) WOP_5 (op) op (m5, 5)
113#define WOP_7(op) WOP_6 (op) op (m6, 6); 112#define WOP_7(op) WOP_6 (op) op (m6, 6)
114#define WOP_8(op) WOP_7 (op) op (m7, 7); 113#define WOP_8(op) WOP_7 (op) op (m7, 7)
115#endif 114#endif
116/* 115/*
117#define WOP_9(op) WOP_8 (op) op (m8, 8); 116#define WOP_9(op) WOP_8 (op) op (m8, 8);
@@ -130,20 +129,20 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
130 #define WOP_M1 WOP_4 129 #define WOP_M1 WOP_4
131#endif 130#endif
132 131
133#define WOP(op) op (m0, 0); WOP_M1(op) 132#define WOP(op) op (m0, 0) WOP_M1(op)
134 133
135 134
136#define DECLARE_VAR(reg, ii) __m128i reg 135#define DECLARE_VAR(reg, ii) __m128i reg;
137#define LOAD_data( reg, ii) reg = data[ii]; 136#define LOAD_data( reg, ii) reg = data[ii];
138#define STORE_data( reg, ii) data[ii] = reg; 137#define STORE_data( reg, ii) data[ii] = reg;
139#if (NUM_WAYS > 1) 138#if (NUM_WAYS > 1)
140#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]); 139#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1])
141#endif 140#endif
142 141
143#define AVX__DECLARE_VAR(reg, ii) __m256i reg 142#define AVX_DECLARE_VAR(reg, ii) __m256i reg;
144#define AVX__LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; 143#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
145#define AVX__STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; 144#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
146#define AVX__XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])); 145#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))
147 146
148#define MM_OP_key(op, reg) MM_OP(op, reg, key); 147#define MM_OP_key(op, reg) MM_OP(op, reg, key);
149 148
@@ -154,23 +153,23 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
154#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) 153#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg)
155 154
156 155
157#define AVX__AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) 156#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
158#define AVX__AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) 157#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
159#define AVX__AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) 158#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
160#define AVX__AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) 159#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
161#define AVX__AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) 160#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)
162 161
163#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one); reg = ctr; 162#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr;
164#define CTR_END( reg, ii) MM_XOR (data[ii], reg); 163#define CTR_END( reg, ii) MM_XOR (data[ii], reg)
165 164
166#define AVX__CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two); reg = _mm256_xor_si256(ctr2, key); 165#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key);
167#define AVX__CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg); 166#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg)
168 167
169#define WOP_KEY(op, n) { \ 168#define WOP_KEY(op, n) { \
170 const __m128i key = w[n]; \ 169 const __m128i key = w[n]; \
171 WOP(op); } 170 WOP(op); }
172 171
173#define AVX__WOP_KEY(op, n) { \ 172#define AVX_WOP_KEY(op, n) { \
174 const __m256i key = w[n]; \ 173 const __m256i key = w[n]; \
175 WOP(op); } 174 WOP(op); }
176 175
@@ -218,6 +217,8 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
218 217
219AES_FUNC_START2 (AesCbc_Decode_HW) 218AES_FUNC_START2 (AesCbc_Decode_HW)
220{ 219{
220 __m128i *p = (__m128i *)(void *)ivAes;
221 __m128i *data = (__m128i *)(void *)data8;
221 __m128i iv = *p; 222 __m128i iv = *p;
222 const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1; 223 const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
223 const __m128i *dataEnd; 224 const __m128i *dataEnd;
@@ -228,7 +229,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
228 const __m128i *w = wStart; 229 const __m128i *w = wStart;
229 230
230 WOP (DECLARE_VAR) 231 WOP (DECLARE_VAR)
231 WOP (LOAD_data); 232 WOP (LOAD_data)
232 WOP_KEY (AES_XOR, 1) 233 WOP_KEY (AES_XOR, 1)
233 234
234 do 235 do
@@ -239,10 +240,10 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
239 while (w != p); 240 while (w != p);
240 WOP_KEY (AES_DEC_LAST, 0) 241 WOP_KEY (AES_DEC_LAST, 0)
241 242
242 MM_XOR (m0, iv); 243 MM_XOR (m0, iv)
243 WOP_M1 (XOR_data_M1) 244 WOP_M1 (XOR_data_M1)
244 iv = data[NUM_WAYS - 1]; 245 iv = data[NUM_WAYS - 1];
245 WOP (STORE_data); 246 WOP (STORE_data)
246 } 247 }
247 WIDE_LOOP_END 248 WIDE_LOOP_END
248 249
@@ -252,15 +253,15 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
252 __m128i m = _mm_xor_si128 (w[2], *data); 253 __m128i m = _mm_xor_si128 (w[2], *data);
253 do 254 do
254 { 255 {
255 MM_OP_m (_mm_aesdec_si128, w[1]); 256 MM_OP_m (_mm_aesdec_si128, w[1])
256 MM_OP_m (_mm_aesdec_si128, w[0]); 257 MM_OP_m (_mm_aesdec_si128, w[0])
257 w -= 2; 258 w -= 2;
258 } 259 }
259 while (w != p); 260 while (w != p);
260 MM_OP_m (_mm_aesdec_si128, w[1]); 261 MM_OP_m (_mm_aesdec_si128, w[1])
261 MM_OP_m (_mm_aesdeclast_si128, w[0]); 262 MM_OP_m (_mm_aesdeclast_si128, w[0])
262 263
263 MM_XOR (m, iv); 264 MM_XOR (m, iv)
264 iv = *data; 265 iv = *data;
265 *data = m; 266 *data = m;
266 } 267 }
@@ -271,6 +272,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
271 272
272AES_FUNC_START2 (AesCtr_Code_HW) 273AES_FUNC_START2 (AesCtr_Code_HW)
273{ 274{
275 __m128i *p = (__m128i *)(void *)ivAes;
276 __m128i *data = (__m128i *)(void *)data8;
274 __m128i ctr = *p; 277 __m128i ctr = *p;
275 UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; 278 UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
276 const __m128i *dataEnd; 279 const __m128i *dataEnd;
@@ -283,7 +286,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
283 const __m128i *w = p; 286 const __m128i *w = p;
284 UInt32 r = numRoundsMinus2; 287 UInt32 r = numRoundsMinus2;
285 WOP (DECLARE_VAR) 288 WOP (DECLARE_VAR)
286 WOP (CTR_START); 289 WOP (CTR_START)
287 WOP_KEY (AES_XOR, 0) 290 WOP_KEY (AES_XOR, 0)
288 w += 1; 291 w += 1;
289 do 292 do
@@ -294,7 +297,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
294 while (--r); 297 while (--r);
295 WOP_KEY (AES_ENC_LAST, 0) 298 WOP_KEY (AES_ENC_LAST, 0)
296 299
297 WOP (CTR_END); 300 WOP (CTR_END)
298 } 301 }
299 WIDE_LOOP_END 302 WIDE_LOOP_END
300 303
@@ -303,19 +306,19 @@ AES_FUNC_START2 (AesCtr_Code_HW)
303 UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; 306 UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
304 const __m128i *w = p; 307 const __m128i *w = p;
305 __m128i m; 308 __m128i m;
306 MM_OP (_mm_add_epi64, ctr, one); 309 MM_OP (_mm_add_epi64, ctr, one)
307 m = _mm_xor_si128 (ctr, p[0]); 310 m = _mm_xor_si128 (ctr, p[0]);
308 w += 1; 311 w += 1;
309 do 312 do
310 { 313 {
311 MM_OP_m (_mm_aesenc_si128, w[0]); 314 MM_OP_m (_mm_aesenc_si128, w[0])
312 MM_OP_m (_mm_aesenc_si128, w[1]); 315 MM_OP_m (_mm_aesenc_si128, w[1])
313 w += 2; 316 w += 2;
314 } 317 }
315 while (--numRounds2); 318 while (--numRounds2);
316 MM_OP_m (_mm_aesenc_si128, w[0]); 319 MM_OP_m (_mm_aesenc_si128, w[0])
317 MM_OP_m (_mm_aesenclast_si128, w[1]); 320 MM_OP_m (_mm_aesenclast_si128, w[1])
318 MM_XOR (*data, m); 321 MM_XOR (*data, m)
319 } 322 }
320 323
321 p[-2] = ctr; 324 p[-2] = ctr;
@@ -325,17 +328,58 @@ AES_FUNC_START2 (AesCtr_Code_HW)
325 328
326#ifdef USE_INTEL_VAES 329#ifdef USE_INTEL_VAES
327 330
331/*
332GCC before 2013-Jun:
333 <immintrin.h>:
334 #ifdef __AVX__
335 #include <avxintrin.h>
336 #endif
337GCC after 2013-Jun:
338 <immintrin.h>:
339 #include <avxintrin.h>
340CLANG 3.8+:
341{
342 <immintrin.h>:
343 #if !defined(_MSC_VER) || defined(__AVX__)
344 #include <avxintrin.h>
345 #endif
346
347 if (the compiler is clang for Windows and if global arch is not set for __AVX__)
348 [ if (defined(_MSC_VER) && !defined(__AVX__)) ]
349 {
350 <immintrin.h> doesn't include <avxintrin.h>
351 and we have 2 ways to fix it:
352 1) we can define required __AVX__ before <immintrin.h>
353 or
354 2) we can include <avxintrin.h> after <immintrin.h>
355 }
356}
357
358If we include <avxintrin.h> manually for GCC/CLANG, it's
359required that <immintrin.h> must be included before <avxintrin.h>.
360*/
361
362/*
328#if defined(__clang__) && defined(_MSC_VER) 363#if defined(__clang__) && defined(_MSC_VER)
329#define __SSE4_2__
330#define __AES__
331#define __AVX__ 364#define __AVX__
332#define __AVX2__ 365#define __AVX2__
333#define __VAES__ 366#define __VAES__
334#define __AVX512F__
335#define __AVX512VL__
336#endif 367#endif
368*/
337 369
338#include <immintrin.h> 370#include <immintrin.h>
371#if defined(__clang__) && defined(_MSC_VER)
372 #if !defined(__AVX__)
373 #include <avxintrin.h>
374 #endif
375 #if !defined(__AVX2__)
376 #include <avx2intrin.h>
377 #endif
378 #if !defined(__VAES__)
379 #include <vaesintrin.h>
380 #endif
381#endif // __clang__ && _MSC_VER
382
339 383
340#define VAES_FUNC_START2(name) \ 384#define VAES_FUNC_START2(name) \
341AES_FUNC_START (name); \ 385AES_FUNC_START (name); \
@@ -344,6 +388,8 @@ AES_FUNC_START (name)
344 388
345VAES_FUNC_START2 (AesCbc_Decode_HW_256) 389VAES_FUNC_START2 (AesCbc_Decode_HW_256)
346{ 390{
391 __m128i *p = (__m128i *)(void *)ivAes;
392 __m128i *data = (__m128i *)(void *)data8;
347 __m128i iv = *p; 393 __m128i iv = *p;
348 const __m128i *dataEnd; 394 const __m128i *dataEnd;
349 UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; 395 UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
@@ -353,22 +399,22 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
353 { 399 {
354 const __m256i *w = keys + numRounds - 2; 400 const __m256i *w = keys + numRounds - 2;
355 401
356 WOP (AVX__DECLARE_VAR) 402 WOP (AVX_DECLARE_VAR)
357 WOP (AVX__LOAD_data); 403 WOP (AVX_LOAD_data)
358 AVX__WOP_KEY (AVX__AES_XOR, 1) 404 AVX_WOP_KEY (AVX_AES_XOR, 1)
359 405
360 do 406 do
361 { 407 {
362 AVX__WOP_KEY (AVX__AES_DEC, 0) 408 AVX_WOP_KEY (AVX_AES_DEC, 0)
363 w--; 409 w--;
364 } 410 }
365 while (w != keys); 411 while (w != keys);
366 AVX__WOP_KEY (AVX__AES_DEC_LAST, 0) 412 AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)
367 413
368 AVX_XOR (m0, _mm256_setr_m128i(iv, data[0])); 414 AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]))
369 WOP_M1 (AVX__XOR_data_M1) 415 WOP_M1 (AVX_XOR_data_M1)
370 iv = data[NUM_WAYS * 2 - 1]; 416 iv = data[NUM_WAYS * 2 - 1];
371 WOP (AVX__STORE_data); 417 WOP (AVX_STORE_data)
372 } 418 }
373 WIDE_LOOP_END_AVX(;) 419 WIDE_LOOP_END_AVX(;)
374 420
@@ -378,15 +424,15 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
378 __m128i m = _mm_xor_si128 (w[2], *data); 424 __m128i m = _mm_xor_si128 (w[2], *data);
379 do 425 do
380 { 426 {
381 MM_OP_m (_mm_aesdec_si128, w[1]); 427 MM_OP_m (_mm_aesdec_si128, w[1])
382 MM_OP_m (_mm_aesdec_si128, w[0]); 428 MM_OP_m (_mm_aesdec_si128, w[0])
383 w -= 2; 429 w -= 2;
384 } 430 }
385 while (w != p); 431 while (w != p);
386 MM_OP_m (_mm_aesdec_si128, w[1]); 432 MM_OP_m (_mm_aesdec_si128, w[1])
387 MM_OP_m (_mm_aesdeclast_si128, w[0]); 433 MM_OP_m (_mm_aesdeclast_si128, w[0])
388 434
389 MM_XOR (m, iv); 435 MM_XOR (m, iv)
390 iv = *data; 436 iv = *data;
391 *data = m; 437 *data = m;
392 } 438 }
@@ -403,18 +449,20 @@ AVX2: _mm256_add_epi64 : vpaddq ymm, ymm, ymm
403 _mm256_broadcastsi128_si256 : vbroadcasti128 449 _mm256_broadcastsi128_si256 : vbroadcasti128
404*/ 450*/
405 451
406#define AVX__CTR_LOOP_START \ 452#define AVX_CTR_LOOP_START \
407 ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \ 453 ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
408 two = _mm256_setr_m128i(one, one); \ 454 two = _mm256_setr_m128i(one, one); \
409 two = _mm256_add_epi64(two, two); \ 455 two = _mm256_add_epi64(two, two); \
410 456
411// two = _mm256_setr_epi64x(2, 0, 2, 0); 457// two = _mm256_setr_epi64x(2, 0, 2, 0);
412 458
413#define AVX__CTR_LOOP_ENC \ 459#define AVX_CTR_LOOP_ENC \
414 ctr = _mm256_extracti128_si256 (ctr2, 1); \ 460 ctr = _mm256_extracti128_si256 (ctr2, 1); \
415 461
416VAES_FUNC_START2 (AesCtr_Code_HW_256) 462VAES_FUNC_START2 (AesCtr_Code_HW_256)
417{ 463{
464 __m128i *p = (__m128i *)(void *)ivAes;
465 __m128i *data = (__m128i *)(void *)data8;
418 __m128i ctr = *p; 466 __m128i ctr = *p;
419 UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; 467 UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
420 const __m128i *dataEnd; 468 const __m128i *dataEnd;
@@ -422,44 +470,44 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
422 __m256i ctr2, two; 470 __m256i ctr2, two;
423 p += 2; 471 p += 2;
424 472
425 WIDE_LOOP_START_AVX (AVX__CTR_LOOP_START) 473 WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START)
426 { 474 {
427 const __m256i *w = keys; 475 const __m256i *w = keys;
428 UInt32 r = numRounds - 2; 476 UInt32 r = numRounds - 2;
429 WOP (AVX__DECLARE_VAR) 477 WOP (AVX_DECLARE_VAR)
430 AVX__WOP_KEY (AVX__CTR_START, 0); 478 AVX_WOP_KEY (AVX_CTR_START, 0)
431 479
432 w += 1; 480 w += 1;
433 do 481 do
434 { 482 {
435 AVX__WOP_KEY (AVX__AES_ENC, 0) 483 AVX_WOP_KEY (AVX_AES_ENC, 0)
436 w += 1; 484 w += 1;
437 } 485 }
438 while (--r); 486 while (--r);
439 AVX__WOP_KEY (AVX__AES_ENC_LAST, 0) 487 AVX_WOP_KEY (AVX_AES_ENC_LAST, 0)
440 488
441 WOP (AVX__CTR_END); 489 WOP (AVX_CTR_END)
442 } 490 }
443 WIDE_LOOP_END_AVX (AVX__CTR_LOOP_ENC) 491 WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC)
444 492
445 SINGLE_LOOP 493 SINGLE_LOOP
446 { 494 {
447 UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; 495 UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
448 const __m128i *w = p; 496 const __m128i *w = p;
449 __m128i m; 497 __m128i m;
450 MM_OP (_mm_add_epi64, ctr, one); 498 MM_OP (_mm_add_epi64, ctr, one)
451 m = _mm_xor_si128 (ctr, p[0]); 499 m = _mm_xor_si128 (ctr, p[0]);
452 w += 1; 500 w += 1;
453 do 501 do
454 { 502 {
455 MM_OP_m (_mm_aesenc_si128, w[0]); 503 MM_OP_m (_mm_aesenc_si128, w[0])
456 MM_OP_m (_mm_aesenc_si128, w[1]); 504 MM_OP_m (_mm_aesenc_si128, w[1])
457 w += 2; 505 w += 2;
458 } 506 }
459 while (--numRounds2); 507 while (--numRounds2);
460 MM_OP_m (_mm_aesenc_si128, w[0]); 508 MM_OP_m (_mm_aesenc_si128, w[0])
461 MM_OP_m (_mm_aesenclast_si128, w[1]); 509 MM_OP_m (_mm_aesenclast_si128, w[1])
462 MM_XOR (*data, m); 510 MM_XOR (*data, m)
463 } 511 }
464 512
465 p[-2] = ctr; 513 p[-2] = ctr;
@@ -477,7 +525,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
477#define AES_TYPE_data Byte 525#define AES_TYPE_data Byte
478 526
479#define AES_FUNC_START(name) \ 527#define AES_FUNC_START(name) \
480 void MY_FAST_CALL name(UInt32 *p, Byte *data, size_t numBlocks) \ 528 void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \
481 529
482#define AES_COMPAT_STUB(name) \ 530#define AES_COMPAT_STUB(name) \
483 AES_FUNC_START(name); \ 531 AES_FUNC_START(name); \
@@ -496,8 +544,8 @@ AES_COMPAT_STUB (AesCtr_Code)
496#pragma message("VAES HW_SW stub was used") 544#pragma message("VAES HW_SW stub was used")
497 545
498#define VAES_COMPAT_STUB(name) \ 546#define VAES_COMPAT_STUB(name) \
499 void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ 547 void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
500 void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \ 548 void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
501 { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); } 549 { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }
502 550
503VAES_COMPAT_STUB (AesCbc_Decode_HW) 551VAES_COMPAT_STUB (AesCbc_Decode_HW)
@@ -551,7 +599,8 @@ VAES_COMPAT_STUB (AesCtr_Code_HW)
551typedef uint8x16_t v128; 599typedef uint8x16_t v128;
552 600
553#define AES_FUNC_START(name) \ 601#define AES_FUNC_START(name) \
554 void MY_FAST_CALL name(v128 *p, v128 *data, size_t numBlocks) 602 void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
603 // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks)
555 604
556#define AES_FUNC_START2(name) \ 605#define AES_FUNC_START2(name) \
557AES_FUNC_START (name); \ 606AES_FUNC_START (name); \
@@ -559,18 +608,20 @@ ATTRIB_AES \
559AES_FUNC_START (name) 608AES_FUNC_START (name)
560 609
561#define MM_OP(op, dest, src) dest = op(dest, src); 610#define MM_OP(op, dest, src) dest = op(dest, src);
562#define MM_OP_m(op, src) MM_OP(op, m, src); 611#define MM_OP_m(op, src) MM_OP(op, m, src)
563#define MM_OP1_m(op) m = op(m); 612#define MM_OP1_m(op) m = op(m);
564 613
565#define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src); 614#define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src)
566#define MM_XOR_m( src) MM_XOR(m, src); 615#define MM_XOR_m( src) MM_XOR(m, src)
567 616
568#define AES_E_m(k) MM_OP_m (vaeseq_u8, k); 617#define AES_E_m(k) MM_OP_m (vaeseq_u8, k)
569#define AES_E_MC_m(k) AES_E_m (k); MM_OP1_m(vaesmcq_u8); 618#define AES_E_MC_m(k) AES_E_m (k) MM_OP1_m(vaesmcq_u8)
570 619
571 620
572AES_FUNC_START2 (AesCbc_Encode_HW) 621AES_FUNC_START2 (AesCbc_Encode_HW)
573{ 622{
623 v128 *p = (v128*)(void*)ivAes;
624 v128 *data = (v128*)(void*)data8;
574 v128 m = *p; 625 v128 m = *p;
575 const v128 k0 = p[2]; 626 const v128 k0 = p[2];
576 const v128 k1 = p[3]; 627 const v128 k1 = p[3];
@@ -608,7 +659,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
608 AES_E_MC_m (p[14]) 659 AES_E_MC_m (p[14])
609 } 660 }
610 } 661 }
611 AES_E_m (k_z1); 662 AES_E_m (k_z1)
612 MM_XOR_m (k_z0); 663 MM_XOR_m (k_z0);
613 *data = m; 664 *data = m;
614 } 665 }
@@ -617,44 +668,44 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
617 668
618 669
619#define WOP_1(op) 670#define WOP_1(op)
620#define WOP_2(op) WOP_1 (op) op (m1, 1); 671#define WOP_2(op) WOP_1 (op) op (m1, 1)
621#define WOP_3(op) WOP_2 (op) op (m2, 2); 672#define WOP_3(op) WOP_2 (op) op (m2, 2)
622#define WOP_4(op) WOP_3 (op) op (m3, 3); 673#define WOP_4(op) WOP_3 (op) op (m3, 3)
623#define WOP_5(op) WOP_4 (op) op (m4, 4); 674#define WOP_5(op) WOP_4 (op) op (m4, 4)
624#define WOP_6(op) WOP_5 (op) op (m5, 5); 675#define WOP_6(op) WOP_5 (op) op (m5, 5)
625#define WOP_7(op) WOP_6 (op) op (m6, 6); 676#define WOP_7(op) WOP_6 (op) op (m6, 6)
626#define WOP_8(op) WOP_7 (op) op (m7, 7); 677#define WOP_8(op) WOP_7 (op) op (m7, 7)
627 678
628 #define NUM_WAYS 8 679 #define NUM_WAYS 8
629 #define WOP_M1 WOP_8 680 #define WOP_M1 WOP_8
630 681
631#define WOP(op) op (m0, 0); WOP_M1(op) 682#define WOP(op) op (m0, 0) WOP_M1(op)
632 683
633#define DECLARE_VAR(reg, ii) v128 reg 684#define DECLARE_VAR(reg, ii) v128 reg;
634#define LOAD_data( reg, ii) reg = data[ii]; 685#define LOAD_data( reg, ii) reg = data[ii];
635#define STORE_data( reg, ii) data[ii] = reg; 686#define STORE_data( reg, ii) data[ii] = reg;
636#if (NUM_WAYS > 1) 687#if (NUM_WAYS > 1)
637#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]); 688#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1])
638#endif 689#endif
639 690
640#define MM_OP_key(op, reg) MM_OP (op, reg, key); 691#define MM_OP_key(op, reg) MM_OP (op, reg, key)
641 692
642#define AES_D_m(k) MM_OP_m (vaesdq_u8, k); 693#define AES_D_m(k) MM_OP_m (vaesdq_u8, k)
643#define AES_D_IMC_m(k) AES_D_m (k); MM_OP1_m (vaesimcq_u8); 694#define AES_D_IMC_m(k) AES_D_m (k) MM_OP1_m (vaesimcq_u8)
644 695
645#define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg) 696#define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg)
646#define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg) 697#define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg)
647#define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg) 698#define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg)
648 699
649#define AES_D_IMC( reg, ii) AES_D (reg, ii); reg = vaesimcq_u8(reg) 700#define AES_D_IMC( reg, ii) AES_D (reg, ii) reg = vaesimcq_u8(reg);
650#define AES_E_MC( reg, ii) AES_E (reg, ii); reg = vaesmcq_u8(reg) 701#define AES_E_MC( reg, ii) AES_E (reg, ii) reg = vaesmcq_u8(reg);
651 702
652#define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one); reg = vreinterpretq_u8_u64(ctr); 703#define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one) reg = vreinterpretq_u8_u64(ctr);
653#define CTR_END( reg, ii) MM_XOR (data[ii], reg); 704#define CTR_END( reg, ii) MM_XOR (data[ii], reg)
654 705
655#define WOP_KEY(op, n) { \ 706#define WOP_KEY(op, n) { \
656 const v128 key = w[n]; \ 707 const v128 key = w[n]; \
657 WOP(op); } 708 WOP(op) }
658 709
659#define WIDE_LOOP_START \ 710#define WIDE_LOOP_START \
660 dataEnd = data + numBlocks; \ 711 dataEnd = data + numBlocks; \
@@ -672,6 +723,8 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
672 723
673AES_FUNC_START2 (AesCbc_Decode_HW) 724AES_FUNC_START2 (AesCbc_Decode_HW)
674{ 725{
726 v128 *p = (v128*)(void*)ivAes;
727 v128 *data = (v128*)(void*)data8;
675 v128 iv = *p; 728 v128 iv = *p;
676 const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; 729 const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
677 const v128 *dataEnd; 730 const v128 *dataEnd;
@@ -681,7 +734,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
681 { 734 {
682 const v128 *w = wStart; 735 const v128 *w = wStart;
683 WOP (DECLARE_VAR) 736 WOP (DECLARE_VAR)
684 WOP (LOAD_data); 737 WOP (LOAD_data)
685 WOP_KEY (AES_D_IMC, 2) 738 WOP_KEY (AES_D_IMC, 2)
686 do 739 do
687 { 740 {
@@ -695,7 +748,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
695 MM_XOR (m0, iv); 748 MM_XOR (m0, iv);
696 WOP_M1 (XOR_data_M1) 749 WOP_M1 (XOR_data_M1)
697 iv = data[NUM_WAYS - 1]; 750 iv = data[NUM_WAYS - 1];
698 WOP (STORE_data); 751 WOP (STORE_data)
699 } 752 }
700 WIDE_LOOP_END 753 WIDE_LOOP_END
701 754
@@ -724,6 +777,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
724 777
725AES_FUNC_START2 (AesCtr_Code_HW) 778AES_FUNC_START2 (AesCtr_Code_HW)
726{ 779{
780 v128 *p = (v128*)(void*)ivAes;
781 v128 *data = (v128*)(void*)data8;
727 uint64x2_t ctr = vreinterpretq_u64_u8(*p); 782 uint64x2_t ctr = vreinterpretq_u64_u8(*p);
728 const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; 783 const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
729 const v128 *dataEnd; 784 const v128 *dataEnd;
@@ -735,7 +790,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
735 { 790 {
736 const v128 *w = p; 791 const v128 *w = p;
737 WOP (DECLARE_VAR) 792 WOP (DECLARE_VAR)
738 WOP (CTR_START); 793 WOP (CTR_START)
739 do 794 do
740 { 795 {
741 WOP_KEY (AES_E_MC, 0) 796 WOP_KEY (AES_E_MC, 0)
@@ -746,7 +801,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
746 WOP_KEY (AES_E_MC, 0) 801 WOP_KEY (AES_E_MC, 0)
747 WOP_KEY (AES_E, 1) 802 WOP_KEY (AES_E, 1)
748 WOP_KEY (AES_XOR, 2) 803 WOP_KEY (AES_XOR, 2)
749 WOP (CTR_END); 804 WOP (CTR_END)
750 } 805 }
751 WIDE_LOOP_END 806 WIDE_LOOP_END
752 807
@@ -762,10 +817,10 @@ AES_FUNC_START2 (AesCtr_Code_HW)
762 w += 2; 817 w += 2;
763 } 818 }
764 while (w != wEnd); 819 while (w != wEnd);
765 AES_E_MC_m (w[0]); 820 AES_E_MC_m (w[0])
766 AES_E_m (w[1]); 821 AES_E_m (w[1])
767 MM_XOR_m (w[2]); 822 MM_XOR_m (w[2])
768 CTR_END (m, 0); 823 CTR_END (m, 0)
769 } 824 }
770 825
771 p[-2] = vreinterpretq_u8_u64(ctr); 826 p[-2] = vreinterpretq_u8_u64(ctr);
@@ -774,3 +829,12 @@ AES_FUNC_START2 (AesCtr_Code_HW)
774#endif // USE_HW_AES 829#endif // USE_HW_AES
775 830
776#endif // MY_CPU_ARM_OR_ARM64 831#endif // MY_CPU_ARM_OR_ARM64
832
833#undef NUM_WAYS
834#undef WOP_M1
835#undef WOP
836#undef DECLARE_VAR
837#undef LOAD_data
838#undef STORE_data
839#undef USE_INTEL_AES
840#undef USE_HW_AES