path: root/C/AesOpt.c
Diffstat (limited to 'C/AesOpt.c')
-rw-r--r--  C/AesOpt.c | 233
1 file changed, 139 insertions(+), 94 deletions(-)
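The core of this patch is that every data access now goes through explicit LOAD_128/STORE_128 (SSE) and AVX_LOAD/AVX_STORE (AVX) macros instead of raw *data dereferences, so the aligned/unaligned choice is made in one place. The standalone sketch below is not part of the commit; the file name, function names, and build flags are illustrative assumptions. It contrasts the three 128-bit access styles discussed in the patch comments, plus the unaligned 256-bit read that AVX_XOR_data_M1 needs when a 32-byte load straddles two 16-byte blocks.

/* aes_load_sketch.c -- illustrative only, not part of the commit.
   Assumed build: gcc -O2 -maes -mavx2 -c aes_load_sketch.c */
#include <immintrin.h>

/* 1) Aligned dereference, as the patch's LOAD_128(pp) does: the compiler may
      fuse the load into the AES/XOR instruction (e.g. "xorps xmm0, [rdx]"),
      but pp must really be 16-byte aligned. */
static __m128i enc_round_deref(const __m128i *pp, __m128i key)
{
  return _mm_aesenc_si128(*pp, key);
}

/* 2) Explicit aligned intrinsic (the commented-out alternative in the patch);
      same 16-byte alignment requirement. */
static __m128i enc_round_load(const __m128i *pp, __m128i key)
{
  return _mm_aesenc_si128(_mm_load_si128(pp), key);
}

/* 3) Unaligned intrinsic (movdqu), the patch's #else branch:
      safe for any address, possibly slower on old CPUs. */
static __m128i enc_round_loadu(const void *pp, __m128i key)
{
  return _mm_aesenc_si128(_mm_loadu_si128((const __m128i *)pp), key);
}

/* 4) The AVX_XOR_data_M1 situation: a 32-byte read starting at (data - 1)
      crosses the 16-byte alignment of data, so only an unaligned
      256-bit load is correct here. */
static __m256i xor_prev_pair(const __m128i *data, __m256i m)
{
  return _mm256_xor_si256(m,
      _mm256_loadu_si256((const __m256i *)(const void *)(data - 1)));
}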
diff --git a/C/AesOpt.c b/C/AesOpt.c
index 58769ea..b281807 100644
--- a/C/AesOpt.c
+++ b/C/AesOpt.c
@@ -1,5 +1,5 @@
 /* AesOpt.c -- AES optimized code for x86 AES hardware instructions
-2024-03-01 : Igor Pavlov : Public domain */
+Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -80,19 +80,39 @@ AES_FUNC_START (name)
 
 #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src)
 
+#if 1
+// use aligned SSE load/store for data.
+// It is required for our Aes functions, that data is aligned for 16-bytes.
+// So we can use this branch of code.
+// and compiler can use fused load-op SSE instructions:
+// xorps xmm0, XMMWORD PTR [rdx]
+#define LOAD_128(pp) (*(__m128i *)(void *)(pp))
+#define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v
+// use aligned SSE load/store for data. Alternative code with direct access
+// #define LOAD_128(pp) _mm_load_si128(pp)
+// #define STORE_128(pp, _v) _mm_store_si128(pp, _v)
+#else
+// use unaligned load/store for data: movdqu XMMWORD PTR [rdx]
+#define LOAD_128(pp) _mm_loadu_si128(pp)
+#define STORE_128(pp, _v) _mm_storeu_si128(pp, _v)
+#endif
+
 AES_FUNC_START2 (AesCbc_Encode_HW)
 {
+  if (numBlocks == 0)
+    return;
+  {
   __m128i *p = (__m128i *)(void *)ivAes;
   __m128i *data = (__m128i *)(void *)data8;
   __m128i m = *p;
   const __m128i k0 = p[2];
   const __m128i k1 = p[3];
   const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
-  for (; numBlocks != 0; numBlocks--, data++)
+  do
   {
     UInt32 r = numRounds2;
     const __m128i *w = p + 4;
-    __m128i temp = *data;
+    __m128i temp = LOAD_128(data);
     MM_XOR (temp, k0)
     MM_XOR (m, temp)
     MM_OP_m (_mm_aesenc_si128, k1)
@@ -104,9 +124,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
     }
     while (--r);
     MM_OP_m (_mm_aesenclast_si128, w[0])
-    *data = m;
+    STORE_128(data, m);
+    data++;
   }
+  while (--numBlocks);
   *p = m;
+  }
 }
 
 
@@ -139,12 +162,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 
 #define WOP(op) op (m0, 0) WOP_M1(op)
 
-
 #define DECLARE_VAR(reg, ii) __m128i reg;
-#define LOAD_data( reg, ii) reg = data[ii];
-#define STORE_data( reg, ii) data[ii] = reg;
+#define LOAD_data_ii(ii) LOAD_128(data + (ii))
+#define LOAD_data( reg, ii) reg = LOAD_data_ii(ii);
+#define STORE_data( reg, ii) STORE_128(data + (ii), reg);
 #if (NUM_WAYS > 1)
-#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1])
+#define XOR_data_M1(reg, ii) MM_XOR (reg, LOAD_128(data + (ii- 1)))
 #endif
 
 #define MM_OP_key(op, reg) MM_OP(op, reg, key);
@@ -156,25 +179,22 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg)
 
 #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr;
-#define CTR_END( reg, ii) MM_XOR (data[ii], reg)
-
+#define CTR_END( reg, ii) STORE_128(data + (ii), _mm_xor_si128(reg, \
+    LOAD_128 (data + (ii))));
 #define WOP_KEY(op, n) { \
     const __m128i key = w[n]; \
-    WOP(op); }
-
+    WOP(op) }
 
 #define WIDE_LOOP_START \
     dataEnd = data + numBlocks; \
     if (numBlocks >= NUM_WAYS) \
     { dataEnd -= NUM_WAYS; do { \
 
-
 #define WIDE_LOOP_END \
     data += NUM_WAYS; \
     } while (data <= dataEnd); \
     dataEnd += NUM_WAYS; } \
 
-
 #define SINGLE_LOOP \
     for (; data < dataEnd; data++)
 
@@ -184,54 +204,73 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 
 #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src)
 #define AVX_DECLARE_VAR(reg, ii) __m256i reg;
-#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
-#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
+
+#if 1
+// use unaligned AVX load/store for data.
+// It is required for our Aes functions, that data is aligned for 16-bytes.
+// But we need 32-bytes reading.
+// So we use intrinsics for unaligned AVX load/store.
+// notes for _mm256_storeu_si256:
+// msvc2022: uses vmovdqu and keeps the order of instruction sequence.
+// new gcc11 uses vmovdqu
+// old gcc9 could use pair of instructions:
+// vmovups %xmm7, -224(%rax)
+// vextracti128 $0x1, %ymm7, -208(%rax)
+#define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p))
+#define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v);
+#else
+// use aligned AVX load/store for data.
+// for debug: we can use this branch, if we are sure that data is aligned for 32-bytes.
+// msvc2022 uses vmovdqu still
+// gcc uses vmovdqa (that requires 32-bytes alignment)
+#define AVX_LOAD(p) (*(const __m256i *)(const void *)(p))
+#define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v;
+#endif
+
+#define AVX_LOAD_data( reg, ii) reg = AVX_LOAD((const __m256i *)(const void *)data + (ii));
+#define AVX_STORE_data( reg, ii) AVX_STORE((__m256i *)(void *)data + (ii), reg)
 /*
-AVX_XOR_data_M1() needs unaligned memory load
-if (we don't use _mm256_loadu_si256() here)
-{
-  Most compilers with enabled optimizations generate fused AVX (LOAD + OP)
-  instruction that can load unaligned data.
-  But GCC and CLANG without -O2 or -O1 optimizations can generate separated
-  LOAD-ALIGNED (vmovdqa) instruction that will fail on execution.
-}
-Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here.
-v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler.
+AVX_XOR_data_M1() needs unaligned memory load, even if (data)
+is aligned for 256-bits, because we read 32-bytes chunk that
+crosses (data) position: from (data - 16bytes) to (data + 16bytes).
 */
-#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii])))
-// for debug only: the following code will fail on execution, if compiled by some compilers:
-// #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))
+#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii)))
 
 #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
 #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
 #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
 #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
 #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)
-#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key);
-#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg)
+#define AVX_CTR_START(reg, ii) \
+    MM_OP (_mm256_add_epi64, ctr2, two) \
+    reg = _mm256_xor_si256(ctr2, key);
+
+#define AVX_CTR_END(reg, ii) \
+    AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \
+    AVX_LOAD ((__m256i *)(void *)data + (ii))));
+
 #define AVX_WOP_KEY(op, n) { \
     const __m256i key = w[n]; \
-    WOP(op); }
+    WOP(op) }
 
 #define NUM_AES_KEYS_MAX 15
 
 #define WIDE_LOOP_START_AVX(OP) \
     dataEnd = data + numBlocks; \
     if (numBlocks >= NUM_WAYS * 2) \
     { __m256i keys[NUM_AES_KEYS_MAX]; \
-    UInt32 ii; \
-    OP \
-    for (ii = 0; ii < numRounds; ii++) \
-      keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \
-    dataEnd -= NUM_WAYS * 2; do { \
-
+    OP \
+    { UInt32 ii; for (ii = 0; ii < numRounds; ii++) \
+      keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \
+    dataEnd -= NUM_WAYS * 2; \
+    do { \
 
 #define WIDE_LOOP_END_AVX(OP) \
     data += NUM_WAYS * 2; \
     } while (data <= dataEnd); \
     dataEnd += NUM_WAYS * 2; \
     OP \
     _mm256_zeroupper(); \
     } \
 
 /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
@@ -246,21 +285,20 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
   __m128i *p = (__m128i *)(void *)ivAes;
   __m128i *data = (__m128i *)(void *)data8;
   __m128i iv = *p;
-  const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
+  const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1;
   const __m128i *dataEnd;
   p += 2;
 
   WIDE_LOOP_START
   {
     const __m128i *w = wStart;
-
     WOP (DECLARE_VAR)
     WOP (LOAD_data)
     WOP_KEY (AES_XOR, 1)
-
     do
     {
       WOP_KEY (AES_DEC, 0)
+
       w--;
     }
     while (w != p);
@@ -268,7 +306,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
 
     MM_XOR (m0, iv)
     WOP_M1 (XOR_data_M1)
-    iv = data[NUM_WAYS - 1];
+    LOAD_data(iv, NUM_WAYS - 1)
     WOP (STORE_data)
   }
   WIDE_LOOP_END
@@ -276,7 +314,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
   SINGLE_LOOP
   {
     const __m128i *w = wStart - 1;
-    __m128i m = _mm_xor_si128 (w[2], *data);
+    __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
+
     do
     {
       MM_OP_m (_mm_aesdec_si128, w[1])
@@ -286,10 +325,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
     while (w != p);
     MM_OP_m (_mm_aesdec_si128, w[1])
     MM_OP_m (_mm_aesdeclast_si128, w[0])
-
     MM_XOR (m, iv)
-    iv = *data;
-    *data = m;
+    LOAD_data(iv, 0)
+    STORE_data(m, 0)
   }
 
   p[-2] = iv;
@@ -301,9 +339,9 @@ AES_FUNC_START2 (AesCtr_Code_HW)
   __m128i *p = (__m128i *)(void *)ivAes;
   __m128i *data = (__m128i *)(void *)data8;
   __m128i ctr = *p;
-  UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
+  const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
   const __m128i *dataEnd;
-  __m128i one = _mm_cvtsi32_si128(1);
+  const __m128i one = _mm_cvtsi32_si128(1);
 
   p += 2;
 
@@ -322,7 +360,6 @@ AES_FUNC_START2 (AesCtr_Code_HW)
     }
     while (--r);
     WOP_KEY (AES_ENC_LAST, 0)
-
     WOP (CTR_END)
   }
   WIDE_LOOP_END
@@ -344,7 +381,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
     while (--numRounds2);
     MM_OP_m (_mm_aesenc_si128, w[0])
     MM_OP_m (_mm_aesenclast_si128, w[1])
-    MM_XOR (*data, m)
+    CTR_END (m, 0)
   }
 
   p[-2] = ctr;
@@ -421,7 +458,7 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
   __m128i *data = (__m128i *)(void *)data8;
   __m128i iv = *p;
   const __m128i *dataEnd;
-  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
+  const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
   p += 2;
 
   WIDE_LOOP_START_AVX(;)
@@ -440,17 +477,17 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
     while (w != keys);
     AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)
 
-    AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]))
+    AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0)))
     WOP_M1 (AVX_XOR_data_M1)
-    iv = data[NUM_WAYS * 2 - 1];
+    LOAD_data (iv, NUM_WAYS * 2 - 1)
     WOP (AVX_STORE_data)
   }
   WIDE_LOOP_END_AVX(;)
 
   SINGLE_LOOP
   {
-    const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
-    __m128i m = _mm_xor_si128 (w[2], *data);
+    const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2;
+    __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
     do
     {
       MM_OP_m (_mm_aesdec_si128, w[1])
@@ -462,8 +499,8 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
     MM_OP_m (_mm_aesdeclast_si128, w[0])
 
     MM_XOR (m, iv)
-    iv = *data;
-    *data = m;
+    LOAD_data(iv, 0)
+    STORE_data(m, 0)
   }
 
   p[-2] = iv;
@@ -493,9 +530,9 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
   __m128i *p = (__m128i *)(void *)ivAes;
   __m128i *data = (__m128i *)(void *)data8;
   __m128i ctr = *p;
-  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
+  const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
   const __m128i *dataEnd;
-  __m128i one = _mm_cvtsi32_si128(1);
+  const __m128i one = _mm_cvtsi32_si128(1);
   __m256i ctr2, two;
   p += 2;
 
@@ -536,7 +573,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
     while (--numRounds2);
     MM_OP_m (_mm_aesenc_si128, w[0])
     MM_OP_m (_mm_aesenclast_si128, w[1])
-    MM_XOR (*data, m)
+    CTR_END (m, 0)
   }
 
   p[-2] = ctr;
@@ -731,9 +768,14 @@ AES_FUNC_START (name)
 
 AES_FUNC_START2 (AesCbc_Encode_HW)
 {
-  v128 * const p = (v128*)(void*)ivAes;
-  v128 *data = (v128*)(void*)data8;
+  if (numBlocks == 0)
+    return;
+  {
+  v128 * const p = (v128 *)(void *)ivAes;
+  v128 *data = (v128 *)(void *)data8;
   v128 m = *p;
+  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
+  const v128 *w = p + (size_t)numRounds2 * 2;
   const v128 k0 = p[2];
   const v128 k1 = p[3];
   const v128 k2 = p[4];
@@ -744,11 +786,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
   const v128 k7 = p[9];
   const v128 k8 = p[10];
   const v128 k9 = p[11];
-  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
-  const v128 *w = p + ((size_t)numRounds2 * 2);
+  const v128 k_z4 = w[-2];
+  const v128 k_z3 = w[-1];
+  const v128 k_z2 = w[0];
   const v128 k_z1 = w[1];
   const v128 k_z0 = w[2];
-  for (; numBlocks != 0; numBlocks--, data++)
+  // we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle,
+  // because gcc/clang compilers are not good for that optimization.
+  do
   {
     MM_XOR_m (*data)
     AES_E_MC_m (k0)
@@ -757,24 +802,26 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
     AES_E_MC_m (k3)
     AES_E_MC_m (k4)
     AES_E_MC_m (k5)
-    AES_E_MC_m (k6)
-    AES_E_MC_m (k7)
-    AES_E_MC_m (k8)
     if (numRounds2 >= 6)
     {
-      AES_E_MC_m (k9)
-      AES_E_MC_m (p[12])
+      AES_E_MC_m (k6)
+      AES_E_MC_m (k7)
       if (numRounds2 != 6)
       {
-        AES_E_MC_m (p[13])
-        AES_E_MC_m (p[14])
+        AES_E_MC_m (k8)
+        AES_E_MC_m (k9)
       }
     }
-    AES_E_m (k_z1)
-    MM_XOR_m (k_z0)
-    *data = m;
+    AES_E_MC_m (k_z4)
+    AES_E_MC_m (k_z3)
+    AES_E_MC_m (k_z2)
+    AES_E_m (k_z1)
+    MM_XOR_m (k_z0)
+    *data++ = m;
   }
+  while (--numBlocks);
   *p = m;
+  }
 }
 
 
@@ -834,10 +881,10 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 
 AES_FUNC_START2 (AesCbc_Decode_HW)
 {
-  v128 *p = (v128*)(void*)ivAes;
-  v128 *data = (v128*)(void*)data8;
+  v128 *p = (v128 *)(void *)ivAes;
+  v128 *data = (v128 *)(void *)data8;
   v128 iv = *p;
-  const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
+  const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2;
   const v128 *dataEnd;
   p += 2;
 
@@ -858,7 +905,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
     WOP_KEY (AES_XOR, 0)
     MM_XOR (m0, iv)
     WOP_M1 (XOR_data_M1)
-    iv = data[NUM_WAYS - 1];
+    LOAD_data(iv, NUM_WAYS - 1)
     WOP (STORE_data)
   }
   WIDE_LOOP_END
@@ -866,7 +913,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
   SINGLE_LOOP
   {
     const v128 *w = wStart;
-    v128 m = *data;
+    v128 m; LOAD_data(m, 0)
     AES_D_IMC_m (w[2])
     do
     {
@@ -878,8 +925,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
     AES_D_m (w[1])
     MM_XOR_m (w[0])
     MM_XOR_m (iv)
-    iv = *data;
-    *data = m;
+    LOAD_data(iv, 0)
+    STORE_data(m, 0)
   }
 
   p[-2] = iv;
@@ -888,19 +935,17 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
 
 AES_FUNC_START2 (AesCtr_Code_HW)
 {
-  v128 *p = (v128*)(void*)ivAes;
-  v128 *data = (v128*)(void*)data8;
+  v128 *p = (v128 *)(void *)ivAes;
+  v128 *data = (v128 *)(void *)data8;
   uint64x2_t ctr = vreinterpretq_u64_u8(*p);
-  const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
+  const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2;
   const v128 *dataEnd;
-  uint64x2_t one = vdupq_n_u64(0);
-
 // the bug in clang:
 // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
 #if defined(__clang__) && (__clang_major__ <= 9)
 #pragma GCC diagnostic ignored "-Wvector-conversion"
 #endif
-  one = vsetq_lane_u64(1, one, 0);
+  const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0);
   p += 2;
 
   WIDE_LOOP_START