author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2024-11-29 00:00:00 +0000 |
committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2024-11-30 15:27:15 +0500 |
commit | e5431fa6f5505e385c6f9367260717e9c47dc2ee (patch) | |
tree | 4cd2c2c3b225b48c8e7053432c41d7b6b6a3d5f8 /C/AesOpt.c | |
parent | e008ce3976c087bfd21344af8f00a23cf69d4174 (diff) | |
download | 7zip-main.tar.gz 7zip-main.tar.bz2 7zip-main.zip |
Diffstat (limited to '')
-rw-r--r-- | C/AesOpt.c | 233 |
1 file changed, 139 insertions, 94 deletions
@@ -1,5 +1,5 @@ | |||
1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions | 1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -80,19 +80,39 @@ AES_FUNC_START (name) | |||
80 | 80 | ||
81 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) | 81 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) |
82 | 82 | ||
83 | #if 1 | ||
84 | // use aligned SSE load/store for data. | ||
85 | // Our Aes functions require the data to be 16-byte aligned, | ||
86 | // so we can use this branch of code, | ||
87 | // and the compiler can use fused load-op SSE instructions: | ||
88 | // xorps xmm0, XMMWORD PTR [rdx] | ||
89 | #define LOAD_128(pp) (*(__m128i *)(void *)(pp)) | ||
90 | #define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v | ||
91 | // alternative code: aligned SSE load/store intrinsics instead of direct access | ||
92 | // #define LOAD_128(pp) _mm_load_si128(pp) | ||
93 | // #define STORE_128(pp, _v) _mm_store_si128(pp, _v) | ||
94 | #else | ||
95 | // use unaligned load/store for data: movdqu XMMWORD PTR [rdx] | ||
96 | #define LOAD_128(pp) _mm_loadu_si128(pp) | ||
97 | #define STORE_128(pp, _v) _mm_storeu_si128(pp, _v) | ||
98 | #endif | ||
99 | |||
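Note (editorial, not part of the commit): a minimal sketch of the codegen difference the comment above describes, assuming a generic SSE2 compiler; the helper names are illustrative only.

    #include <emmintrin.h>   /* SSE2 */

    /* Aligned dereference: the compiler may fold the load into the using
       instruction (e.g. the fused "xorps xmm0, XMMWORD PTR [rdx]" mentioned
       above), which requires the 16-byte alignment the AES layer guarantees. */
    static __m128i xor_block_aligned(const __m128i *p, __m128i v)
    {
      return _mm_xor_si128(v, *p);
    }

    /* Unaligned intrinsic: the compiler emits a separate movdqu load. */
    static __m128i xor_block_unaligned(const void *p, __m128i v)
    {
      return _mm_xor_si128(v, _mm_loadu_si128((const __m128i *)p));
    }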
83 | AES_FUNC_START2 (AesCbc_Encode_HW) | 100 | AES_FUNC_START2 (AesCbc_Encode_HW) |
84 | { | 101 | { |
102 | if (numBlocks == 0) | ||
103 | return; | ||
104 | { | ||
85 | __m128i *p = (__m128i *)(void *)ivAes; | 105 | __m128i *p = (__m128i *)(void *)ivAes; |
86 | __m128i *data = (__m128i *)(void *)data8; | 106 | __m128i *data = (__m128i *)(void *)data8; |
87 | __m128i m = *p; | 107 | __m128i m = *p; |
88 | const __m128i k0 = p[2]; | 108 | const __m128i k0 = p[2]; |
89 | const __m128i k1 = p[3]; | 109 | const __m128i k1 = p[3]; |
90 | const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; | 110 | const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; |
91 | for (; numBlocks != 0; numBlocks--, data++) | 111 | do |
92 | { | 112 | { |
93 | UInt32 r = numRounds2; | 113 | UInt32 r = numRounds2; |
94 | const __m128i *w = p + 4; | 114 | const __m128i *w = p + 4; |
95 | __m128i temp = *data; | 115 | __m128i temp = LOAD_128(data); |
96 | MM_XOR (temp, k0) | 116 | MM_XOR (temp, k0) |
97 | MM_XOR (m, temp) | 117 | MM_XOR (m, temp) |
98 | MM_OP_m (_mm_aesenc_si128, k1) | 118 | MM_OP_m (_mm_aesenc_si128, k1) |
@@ -104,9 +124,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
104 | } | 124 | } |
105 | while (--r); | 125 | while (--r); |
106 | MM_OP_m (_mm_aesenclast_si128, w[0]) | 126 | MM_OP_m (_mm_aesenclast_si128, w[0]) |
107 | *data = m; | 127 | STORE_128(data, m); |
128 | data++; | ||
108 | } | 129 | } |
130 | while (--numBlocks); | ||
109 | *p = m; | 131 | *p = m; |
132 | } | ||
110 | } | 133 | } |
111 | 134 | ||
112 | 135 | ||
@@ -139,12 +162,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
139 | 162 | ||
140 | #define WOP(op) op (m0, 0) WOP_M1(op) | 163 | #define WOP(op) op (m0, 0) WOP_M1(op) |
141 | 164 | ||
142 | |||
143 | #define DECLARE_VAR(reg, ii) __m128i reg; | 165 | #define DECLARE_VAR(reg, ii) __m128i reg; |
144 | #define LOAD_data( reg, ii) reg = data[ii]; | 166 | #define LOAD_data_ii(ii) LOAD_128(data + (ii)) |
145 | #define STORE_data( reg, ii) data[ii] = reg; | 167 | #define LOAD_data( reg, ii) reg = LOAD_data_ii(ii); |
168 | #define STORE_data( reg, ii) STORE_128(data + (ii), reg); | ||
146 | #if (NUM_WAYS > 1) | 169 | #if (NUM_WAYS > 1) |
147 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) | 170 | #define XOR_data_M1(reg, ii) MM_XOR (reg, LOAD_128(data + (ii- 1))) |
148 | #endif | 171 | #endif |
149 | 172 | ||
150 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); | 173 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); |
@@ -156,25 +179,22 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
156 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) | 179 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) |
157 | 180 | ||
158 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; | 181 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; |
159 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg) | 182 | #define CTR_END( reg, ii) STORE_128(data + (ii), _mm_xor_si128(reg, \ |
160 | 183 | LOAD_128 (data + (ii)))); | |
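Note (editorial, not part of the commit): the reworked CTR_END spells out the load / XOR / store through the LOAD_128 and STORE_128 macros. A stand-alone sketch of the same step, using the unaligned intrinsics only so it compiles on its own; ctr_xor_block is an illustrative name.

    #include <emmintrin.h>   /* SSE2 */

    /* XOR the already-encrypted counter (the keystream block) into one
       16-byte data block in place, as CTR_END does. */
    static void ctr_xor_block(__m128i *data, __m128i keystream)
    {
      _mm_storeu_si128(data, _mm_xor_si128(keystream, _mm_loadu_si128(data)));
    }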
161 | #define WOP_KEY(op, n) { \ | 184 | #define WOP_KEY(op, n) { \ |
162 | const __m128i key = w[n]; \ | 185 | const __m128i key = w[n]; \ |
163 | WOP(op); } | 186 | WOP(op) } |
164 | |||
165 | 187 | ||
166 | #define WIDE_LOOP_START \ | 188 | #define WIDE_LOOP_START \ |
167 | dataEnd = data + numBlocks; \ | 189 | dataEnd = data + numBlocks; \ |
168 | if (numBlocks >= NUM_WAYS) \ | 190 | if (numBlocks >= NUM_WAYS) \ |
169 | { dataEnd -= NUM_WAYS; do { \ | 191 | { dataEnd -= NUM_WAYS; do { \ |
170 | 192 | ||
171 | |||
172 | #define WIDE_LOOP_END \ | 193 | #define WIDE_LOOP_END \ |
173 | data += NUM_WAYS; \ | 194 | data += NUM_WAYS; \ |
174 | } while (data <= dataEnd); \ | 195 | } while (data <= dataEnd); \ |
175 | dataEnd += NUM_WAYS; } \ | 196 | dataEnd += NUM_WAYS; } \ |
176 | 197 | ||
177 | |||
178 | #define SINGLE_LOOP \ | 198 | #define SINGLE_LOOP \ |
179 | for (; data < dataEnd; data++) | 199 | for (; data < dataEnd; data++) |
180 | 200 | ||
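Note (editorial, not part of the commit): WIDE_LOOP_START / WIDE_LOOP_END / SINGLE_LOOP together form a wide-then-tail loop. A sketch of the expanded control flow for an assumed NUM_WAYS of 4 (names are illustrative):

    #include <stddef.h>
    #include <emmintrin.h>

    static void process_blocks_sketch(__m128i *data, size_t numBlocks)
    {
      const __m128i *dataEnd = data + numBlocks;
      if (numBlocks >= 4)                 /* WIDE_LOOP_START */
      {
        dataEnd -= 4;
        do
        {
          /* process data[0..3] in parallel here */
          data += 4;                      /* WIDE_LOOP_END */
        }
        while (data <= dataEnd);
        dataEnd += 4;
      }
      for (; data < dataEnd; data++)      /* SINGLE_LOOP: remaining tail blocks */
      {
        /* process one block here */
      }
    }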
@@ -184,54 +204,73 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
184 | 204 | ||
185 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) | 205 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) |
186 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; | 206 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; |
187 | #define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; | 207 | |
188 | #define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; | 208 | #if 1 |
209 | // use unaligned AVX load/store for data. | ||
210 | // Our Aes functions only require the data to be 16-byte aligned, | ||
211 | // but here we read 32 bytes at a time, | ||
212 | // so we use intrinsics for unaligned AVX load/store. | ||
213 | // notes for _mm256_storeu_si256: | ||
214 | // msvc2022: uses vmovdqu and keeps the instruction order. | ||
215 | // gcc 11 uses vmovdqu | ||
216 | // older gcc 9 could use a pair of instructions: | ||
217 | // vmovups %xmm7, -224(%rax) | ||
218 | // vextracti128 $0x1, %ymm7, -208(%rax) | ||
219 | #define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p)) | ||
220 | #define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v); | ||
221 | #else | ||
222 | // use aligned AVX load/store for data. | ||
223 | // for debug: this branch can be used if we are sure the data is 32-byte aligned. | ||
224 | // msvc2022 still uses vmovdqu | ||
225 | // gcc uses vmovdqa (which requires 32-byte alignment) | ||
226 | #define AVX_LOAD(p) (*(const __m256i *)(const void *)(p)) | ||
227 | #define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v; | ||
228 | #endif | ||
229 | |||
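Note (editorial, not part of the commit): a minimal sketch of why the active branch uses the unaligned AVX intrinsic, assuming only the 16-byte alignment that the AES layer guarantees; load_two_blocks is an illustrative name.

    #include <immintrin.h>   /* AVX */

    /* The data is only guaranteed to be 16-byte aligned, so a 32-byte AVX
       access may start at an address that is 16 mod 32.  vmovdqu (unaligned)
       is always safe there; a vmovdqa (aligned) load would fault. */
    static __m256i load_two_blocks(const unsigned char *data8 /* 16-byte aligned */)
    {
      return _mm256_loadu_si256((const __m256i *)(const void *)data8);
    }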
230 | #define AVX_LOAD_data( reg, ii) reg = AVX_LOAD((const __m256i *)(const void *)data + (ii)); | ||
231 | #define AVX_STORE_data( reg, ii) AVX_STORE((__m256i *)(void *)data + (ii), reg) | ||
189 | /* | 232 | /* |
190 | AVX_XOR_data_M1() needs unaligned memory load | 233 | AVX_XOR_data_M1() needs unaligned memory load, even if (data) |
191 | if (we don't use _mm256_loadu_si256() here) | 234 | is 256-bit aligned, because we read a 32-byte chunk that |
192 | { | 235 | crosses the (data) position: from (data - 16 bytes) to (data + 16 bytes). |
193 | Most compilers with enabled optimizations generate fused AVX (LOAD + OP) | ||
194 | instruction that can load unaligned data. | ||
195 | But GCC and CLANG without -O2 or -O1 optimizations can generate separated | ||
196 | LOAD-ALIGNED (vmovdqa) instruction that will fail on execution. | ||
197 | } | ||
198 | Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here. | ||
199 | v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler. | ||
200 | */ | 236 | */ |
201 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii]))) | 237 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii))) |
202 | // for debug only: the following code will fail on execution, if compiled by some compilers: | ||
203 | // #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) | ||
204 | 238 | ||
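Note (editorial, not part of the commit): the address arithmetic behind the comment above, written out as a sketch; prev_and_current is an illustrative name.

    #include <immintrin.h>   /* AVX */

    /* For ii = 0, AVX_XOR_data_M1 loads from (data - 1) viewed as __m256i *,
       i.e. from (char *)data - 16: the 32-byte read spans
       [data - 16, data + 16) and straddles the 16-byte-aligned data pointer,
       so only an unaligned load (_mm256_loadu_si256) is safe here. */
    static __m256i prev_and_current(const __m128i *data /* 16-byte aligned */)
    {
      return _mm256_loadu_si256((const __m256i *)(const void *)(data - 1));
    }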
205 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) | 239 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) |
206 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) | 240 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) |
207 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) | 241 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) |
208 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) | 242 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) |
209 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) | 243 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) |
210 | #define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); | 244 | #define AVX_CTR_START(reg, ii) \ |
211 | #define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) | 245 | MM_OP (_mm256_add_epi64, ctr2, two) \ |
246 | reg = _mm256_xor_si256(ctr2, key); | ||
247 | |||
248 | #define AVX_CTR_END(reg, ii) \ | ||
249 | AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \ | ||
250 | AVX_LOAD ((__m256i *)(void *)data + (ii)))); | ||
251 | |||
212 | #define AVX_WOP_KEY(op, n) { \ | 252 | #define AVX_WOP_KEY(op, n) { \ |
213 | const __m256i key = w[n]; \ | 253 | const __m256i key = w[n]; \ |
214 | WOP(op); } | 254 | WOP(op) } |
215 | 255 | ||
216 | #define NUM_AES_KEYS_MAX 15 | 256 | #define NUM_AES_KEYS_MAX 15 |
217 | 257 | ||
218 | #define WIDE_LOOP_START_AVX(OP) \ | 258 | #define WIDE_LOOP_START_AVX(OP) \ |
219 | dataEnd = data + numBlocks; \ | 259 | dataEnd = data + numBlocks; \ |
220 | if (numBlocks >= NUM_WAYS * 2) \ | 260 | if (numBlocks >= NUM_WAYS * 2) \ |
221 | { __m256i keys[NUM_AES_KEYS_MAX]; \ | 261 | { __m256i keys[NUM_AES_KEYS_MAX]; \ |
222 | UInt32 ii; \ | 262 | OP \ |
223 | OP \ | 263 | { UInt32 ii; for (ii = 0; ii < numRounds; ii++) \ |
224 | for (ii = 0; ii < numRounds; ii++) \ | 264 | keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \ |
225 | keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \ | 265 | dataEnd -= NUM_WAYS * 2; \ |
226 | dataEnd -= NUM_WAYS * 2; do { \ | 266 | do { \ |
227 | |||
228 | 267 | ||
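Note (editorial, not part of the commit): the round-key broadcast performed inside WIDE_LOOP_START_AVX, shown as a stand-alone sketch (broadcast_keys is an illustrative name; requires AVX2):

    #include <immintrin.h>   /* AVX2: _mm256_broadcastsi128_si256 */

    /* Duplicate each 128-bit round key into both 128-bit lanes of a 256-bit
       register, so one VAES instruction can process two data blocks at once. */
    static void broadcast_keys(const __m128i *p, __m256i *keys, unsigned numRounds)
    {
      unsigned ii;
      for (ii = 0; ii < numRounds; ii++)
        keys[ii] = _mm256_broadcastsi128_si256(p[ii]);
    }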
229 | #define WIDE_LOOP_END_AVX(OP) \ | 268 | #define WIDE_LOOP_END_AVX(OP) \ |
230 | data += NUM_WAYS * 2; \ | 269 | data += NUM_WAYS * 2; \ |
231 | } while (data <= dataEnd); \ | 270 | } while (data <= dataEnd); \ |
232 | dataEnd += NUM_WAYS * 2; \ | 271 | dataEnd += NUM_WAYS * 2; \ |
233 | OP \ | 272 | OP \ |
234 | _mm256_zeroupper(); \ | 273 | _mm256_zeroupper(); \ |
235 | } \ | 274 | } \ |
236 | 275 | ||
237 | /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, | 276 | /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, |
@@ -246,21 +285,20 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
246 | __m128i *p = (__m128i *)(void *)ivAes; | 285 | __m128i *p = (__m128i *)(void *)ivAes; |
247 | __m128i *data = (__m128i *)(void *)data8; | 286 | __m128i *data = (__m128i *)(void *)data8; |
248 | __m128i iv = *p; | 287 | __m128i iv = *p; |
249 | const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1; | 288 | const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1; |
250 | const __m128i *dataEnd; | 289 | const __m128i *dataEnd; |
251 | p += 2; | 290 | p += 2; |
252 | 291 | ||
253 | WIDE_LOOP_START | 292 | WIDE_LOOP_START |
254 | { | 293 | { |
255 | const __m128i *w = wStart; | 294 | const __m128i *w = wStart; |
256 | |||
257 | WOP (DECLARE_VAR) | 295 | WOP (DECLARE_VAR) |
258 | WOP (LOAD_data) | 296 | WOP (LOAD_data) |
259 | WOP_KEY (AES_XOR, 1) | 297 | WOP_KEY (AES_XOR, 1) |
260 | |||
261 | do | 298 | do |
262 | { | 299 | { |
263 | WOP_KEY (AES_DEC, 0) | 300 | WOP_KEY (AES_DEC, 0) |
301 | |||
264 | w--; | 302 | w--; |
265 | } | 303 | } |
266 | while (w != p); | 304 | while (w != p); |
@@ -268,7 +306,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
268 | 306 | ||
269 | MM_XOR (m0, iv) | 307 | MM_XOR (m0, iv) |
270 | WOP_M1 (XOR_data_M1) | 308 | WOP_M1 (XOR_data_M1) |
271 | iv = data[NUM_WAYS - 1]; | 309 | LOAD_data(iv, NUM_WAYS - 1) |
272 | WOP (STORE_data) | 310 | WOP (STORE_data) |
273 | } | 311 | } |
274 | WIDE_LOOP_END | 312 | WIDE_LOOP_END |
@@ -276,7 +314,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
276 | SINGLE_LOOP | 314 | SINGLE_LOOP |
277 | { | 315 | { |
278 | const __m128i *w = wStart - 1; | 316 | const __m128i *w = wStart - 1; |
279 | __m128i m = _mm_xor_si128 (w[2], *data); | 317 | __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); |
318 | |||
280 | do | 319 | do |
281 | { | 320 | { |
282 | MM_OP_m (_mm_aesdec_si128, w[1]) | 321 | MM_OP_m (_mm_aesdec_si128, w[1]) |
@@ -286,10 +325,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
286 | while (w != p); | 325 | while (w != p); |
287 | MM_OP_m (_mm_aesdec_si128, w[1]) | 326 | MM_OP_m (_mm_aesdec_si128, w[1]) |
288 | MM_OP_m (_mm_aesdeclast_si128, w[0]) | 327 | MM_OP_m (_mm_aesdeclast_si128, w[0]) |
289 | |||
290 | MM_XOR (m, iv) | 328 | MM_XOR (m, iv) |
291 | iv = *data; | 329 | LOAD_data(iv, 0) |
292 | *data = m; | 330 | STORE_data(m, 0) |
293 | } | 331 | } |
294 | 332 | ||
295 | p[-2] = iv; | 333 | p[-2] = iv; |
@@ -301,9 +339,9 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
301 | __m128i *p = (__m128i *)(void *)ivAes; | 339 | __m128i *p = (__m128i *)(void *)ivAes; |
302 | __m128i *data = (__m128i *)(void *)data8; | 340 | __m128i *data = (__m128i *)(void *)data8; |
303 | __m128i ctr = *p; | 341 | __m128i ctr = *p; |
304 | UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; | 342 | const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; |
305 | const __m128i *dataEnd; | 343 | const __m128i *dataEnd; |
306 | __m128i one = _mm_cvtsi32_si128(1); | 344 | const __m128i one = _mm_cvtsi32_si128(1); |
307 | 345 | ||
308 | p += 2; | 346 | p += 2; |
309 | 347 | ||
@@ -322,7 +360,6 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
322 | } | 360 | } |
323 | while (--r); | 361 | while (--r); |
324 | WOP_KEY (AES_ENC_LAST, 0) | 362 | WOP_KEY (AES_ENC_LAST, 0) |
325 | |||
326 | WOP (CTR_END) | 363 | WOP (CTR_END) |
327 | } | 364 | } |
328 | WIDE_LOOP_END | 365 | WIDE_LOOP_END |
@@ -344,7 +381,7 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
344 | while (--numRounds2); | 381 | while (--numRounds2); |
345 | MM_OP_m (_mm_aesenc_si128, w[0]) | 382 | MM_OP_m (_mm_aesenc_si128, w[0]) |
346 | MM_OP_m (_mm_aesenclast_si128, w[1]) | 383 | MM_OP_m (_mm_aesenclast_si128, w[1]) |
347 | MM_XOR (*data, m) | 384 | CTR_END (m, 0) |
348 | } | 385 | } |
349 | 386 | ||
350 | p[-2] = ctr; | 387 | p[-2] = ctr; |
@@ -421,7 +458,7 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) | |||
421 | __m128i *data = (__m128i *)(void *)data8; | 458 | __m128i *data = (__m128i *)(void *)data8; |
422 | __m128i iv = *p; | 459 | __m128i iv = *p; |
423 | const __m128i *dataEnd; | 460 | const __m128i *dataEnd; |
424 | UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; | 461 | const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; |
425 | p += 2; | 462 | p += 2; |
426 | 463 | ||
427 | WIDE_LOOP_START_AVX(;) | 464 | WIDE_LOOP_START_AVX(;) |
@@ -440,17 +477,17 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) | |||
440 | while (w != keys); | 477 | while (w != keys); |
441 | AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) | 478 | AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) |
442 | 479 | ||
443 | AVX_XOR (m0, _mm256_setr_m128i(iv, data[0])) | 480 | AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0))) |
444 | WOP_M1 (AVX_XOR_data_M1) | 481 | WOP_M1 (AVX_XOR_data_M1) |
445 | iv = data[NUM_WAYS * 2 - 1]; | 482 | LOAD_data (iv, NUM_WAYS * 2 - 1) |
446 | WOP (AVX_STORE_data) | 483 | WOP (AVX_STORE_data) |
447 | } | 484 | } |
448 | WIDE_LOOP_END_AVX(;) | 485 | WIDE_LOOP_END_AVX(;) |
449 | 486 | ||
450 | SINGLE_LOOP | 487 | SINGLE_LOOP |
451 | { | 488 | { |
452 | const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3; | 489 | const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2; |
453 | __m128i m = _mm_xor_si128 (w[2], *data); | 490 | __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); |
454 | do | 491 | do |
455 | { | 492 | { |
456 | MM_OP_m (_mm_aesdec_si128, w[1]) | 493 | MM_OP_m (_mm_aesdec_si128, w[1]) |
@@ -462,8 +499,8 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) | |||
462 | MM_OP_m (_mm_aesdeclast_si128, w[0]) | 499 | MM_OP_m (_mm_aesdeclast_si128, w[0]) |
463 | 500 | ||
464 | MM_XOR (m, iv) | 501 | MM_XOR (m, iv) |
465 | iv = *data; | 502 | LOAD_data(iv, 0) |
466 | *data = m; | 503 | STORE_data(m, 0) |
467 | } | 504 | } |
468 | 505 | ||
469 | p[-2] = iv; | 506 | p[-2] = iv; |
@@ -493,9 +530,9 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
493 | __m128i *p = (__m128i *)(void *)ivAes; | 530 | __m128i *p = (__m128i *)(void *)ivAes; |
494 | __m128i *data = (__m128i *)(void *)data8; | 531 | __m128i *data = (__m128i *)(void *)data8; |
495 | __m128i ctr = *p; | 532 | __m128i ctr = *p; |
496 | UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; | 533 | const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; |
497 | const __m128i *dataEnd; | 534 | const __m128i *dataEnd; |
498 | __m128i one = _mm_cvtsi32_si128(1); | 535 | const __m128i one = _mm_cvtsi32_si128(1); |
499 | __m256i ctr2, two; | 536 | __m256i ctr2, two; |
500 | p += 2; | 537 | p += 2; |
501 | 538 | ||
@@ -536,7 +573,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
536 | while (--numRounds2); | 573 | while (--numRounds2); |
537 | MM_OP_m (_mm_aesenc_si128, w[0]) | 574 | MM_OP_m (_mm_aesenc_si128, w[0]) |
538 | MM_OP_m (_mm_aesenclast_si128, w[1]) | 575 | MM_OP_m (_mm_aesenclast_si128, w[1]) |
539 | MM_XOR (*data, m) | 576 | CTR_END (m, 0) |
540 | } | 577 | } |
541 | 578 | ||
542 | p[-2] = ctr; | 579 | p[-2] = ctr; |
@@ -731,9 +768,14 @@ AES_FUNC_START (name) | |||
731 | 768 | ||
732 | AES_FUNC_START2 (AesCbc_Encode_HW) | 769 | AES_FUNC_START2 (AesCbc_Encode_HW) |
733 | { | 770 | { |
734 | v128 * const p = (v128*)(void*)ivAes; | 771 | if (numBlocks == 0) |
735 | v128 *data = (v128*)(void*)data8; | 772 | return; |
773 | { | ||
774 | v128 * const p = (v128 *)(void *)ivAes; | ||
775 | v128 *data = (v128 *)(void *)data8; | ||
736 | v128 m = *p; | 776 | v128 m = *p; |
777 | const UInt32 numRounds2 = *(const UInt32 *)(p + 1); | ||
778 | const v128 *w = p + (size_t)numRounds2 * 2; | ||
737 | const v128 k0 = p[2]; | 779 | const v128 k0 = p[2]; |
738 | const v128 k1 = p[3]; | 780 | const v128 k1 = p[3]; |
739 | const v128 k2 = p[4]; | 781 | const v128 k2 = p[4]; |
@@ -744,11 +786,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
744 | const v128 k7 = p[9]; | 786 | const v128 k7 = p[9]; |
745 | const v128 k8 = p[10]; | 787 | const v128 k8 = p[10]; |
746 | const v128 k9 = p[11]; | 788 | const v128 k9 = p[11]; |
747 | const UInt32 numRounds2 = *(const UInt32 *)(p + 1); | 789 | const v128 k_z4 = w[-2]; |
748 | const v128 *w = p + ((size_t)numRounds2 * 2); | 790 | const v128 k_z3 = w[-1]; |
791 | const v128 k_z2 = w[0]; | ||
749 | const v128 k_z1 = w[1]; | 792 | const v128 k_z1 = w[1]; |
750 | const v128 k_z0 = w[2]; | 793 | const v128 k_z0 = w[2]; |
751 | for (; numBlocks != 0; numBlocks--, data++) | 794 | // we don't use the optimization veorq_u8(*data, k_z0), which could save one cycle, |
795 | // because gcc/clang compilers don't handle that optimization well. | ||
796 | do | ||
752 | { | 797 | { |
753 | MM_XOR_m (*data) | 798 | MM_XOR_m (*data) |
754 | AES_E_MC_m (k0) | 799 | AES_E_MC_m (k0) |
@@ -757,24 +802,26 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
757 | AES_E_MC_m (k3) | 802 | AES_E_MC_m (k3) |
758 | AES_E_MC_m (k4) | 803 | AES_E_MC_m (k4) |
759 | AES_E_MC_m (k5) | 804 | AES_E_MC_m (k5) |
760 | AES_E_MC_m (k6) | ||
761 | AES_E_MC_m (k7) | ||
762 | AES_E_MC_m (k8) | ||
763 | if (numRounds2 >= 6) | 805 | if (numRounds2 >= 6) |
764 | { | 806 | { |
765 | AES_E_MC_m (k9) | 807 | AES_E_MC_m (k6) |
766 | AES_E_MC_m (p[12]) | 808 | AES_E_MC_m (k7) |
767 | if (numRounds2 != 6) | 809 | if (numRounds2 != 6) |
768 | { | 810 | { |
769 | AES_E_MC_m (p[13]) | 811 | AES_E_MC_m (k8) |
770 | AES_E_MC_m (p[14]) | 812 | AES_E_MC_m (k9) |
771 | } | 813 | } |
772 | } | 814 | } |
773 | AES_E_m (k_z1) | 815 | AES_E_MC_m (k_z4) |
774 | MM_XOR_m (k_z0) | 816 | AES_E_MC_m (k_z3) |
775 | *data = m; | 817 | AES_E_MC_m (k_z2) |
818 | AES_E_m (k_z1) | ||
819 | MM_XOR_m (k_z0) | ||
820 | *data++ = m; | ||
776 | } | 821 | } |
822 | while (--numBlocks); | ||
777 | *p = m; | 823 | *p = m; |
824 | } | ||
778 | } | 825 | } |
779 | 826 | ||
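Note (editorial, not part of the commit): the restructured NEON encoder keys the extra rounds off numRounds2 (number of AES rounds / 2, as stored in the key schedule). A quick consistency check of the resulting round counts, assuming the standard 10/12/14 rounds for AES-128/192/256:

    #include <assert.h>

    /* Count the AESE+AESMC rounds issued before the final AESE / XOR,
       mirroring the branch structure of the new code
       (the inner "numRounds2 != 6" test is equivalent to ">= 7" here). */
    static int emc_rounds(unsigned numRounds2)
    {
      int n = 6;                     /* k0 .. k5, unconditional */
      if (numRounds2 >= 6) n += 2;   /* k6, k7 */
      if (numRounds2 >= 7) n += 2;   /* k8, k9 */
      return n + 3;                  /* k_z4, k_z3, k_z2 */
    }

    static void check_round_counts(void)
    {
      assert(emc_rounds(5) + 1 == 10);   /* AES-128 */
      assert(emc_rounds(6) + 1 == 12);   /* AES-192 */
      assert(emc_rounds(7) + 1 == 14);   /* AES-256 */
    }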
780 | 827 | ||
@@ -834,10 +881,10 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
834 | 881 | ||
835 | AES_FUNC_START2 (AesCbc_Decode_HW) | 882 | AES_FUNC_START2 (AesCbc_Decode_HW) |
836 | { | 883 | { |
837 | v128 *p = (v128*)(void*)ivAes; | 884 | v128 *p = (v128 *)(void *)ivAes; |
838 | v128 *data = (v128*)(void*)data8; | 885 | v128 *data = (v128 *)(void *)data8; |
839 | v128 iv = *p; | 886 | v128 iv = *p; |
840 | const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; | 887 | const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2; |
841 | const v128 *dataEnd; | 888 | const v128 *dataEnd; |
842 | p += 2; | 889 | p += 2; |
843 | 890 | ||
@@ -858,7 +905,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
858 | WOP_KEY (AES_XOR, 0) | 905 | WOP_KEY (AES_XOR, 0) |
859 | MM_XOR (m0, iv) | 906 | MM_XOR (m0, iv) |
860 | WOP_M1 (XOR_data_M1) | 907 | WOP_M1 (XOR_data_M1) |
861 | iv = data[NUM_WAYS - 1]; | 908 | LOAD_data(iv, NUM_WAYS - 1) |
862 | WOP (STORE_data) | 909 | WOP (STORE_data) |
863 | } | 910 | } |
864 | WIDE_LOOP_END | 911 | WIDE_LOOP_END |
@@ -866,7 +913,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
866 | SINGLE_LOOP | 913 | SINGLE_LOOP |
867 | { | 914 | { |
868 | const v128 *w = wStart; | 915 | const v128 *w = wStart; |
869 | v128 m = *data; | 916 | v128 m; LOAD_data(m, 0) |
870 | AES_D_IMC_m (w[2]) | 917 | AES_D_IMC_m (w[2]) |
871 | do | 918 | do |
872 | { | 919 | { |
@@ -878,8 +925,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
878 | AES_D_m (w[1]) | 925 | AES_D_m (w[1]) |
879 | MM_XOR_m (w[0]) | 926 | MM_XOR_m (w[0]) |
880 | MM_XOR_m (iv) | 927 | MM_XOR_m (iv) |
881 | iv = *data; | 928 | LOAD_data(iv, 0) |
882 | *data = m; | 929 | STORE_data(m, 0) |
883 | } | 930 | } |
884 | 931 | ||
885 | p[-2] = iv; | 932 | p[-2] = iv; |
@@ -888,19 +935,17 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
888 | 935 | ||
889 | AES_FUNC_START2 (AesCtr_Code_HW) | 936 | AES_FUNC_START2 (AesCtr_Code_HW) |
890 | { | 937 | { |
891 | v128 *p = (v128*)(void*)ivAes; | 938 | v128 *p = (v128 *)(void *)ivAes; |
892 | v128 *data = (v128*)(void*)data8; | 939 | v128 *data = (v128 *)(void *)data8; |
893 | uint64x2_t ctr = vreinterpretq_u64_u8(*p); | 940 | uint64x2_t ctr = vreinterpretq_u64_u8(*p); |
894 | const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; | 941 | const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2; |
895 | const v128 *dataEnd; | 942 | const v128 *dataEnd; |
896 | uint64x2_t one = vdupq_n_u64(0); | ||
897 | |||
898 | // the bug in clang: | 943 | // the bug in clang: |
899 | // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); | 944 | // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); |
900 | #if defined(__clang__) && (__clang_major__ <= 9) | 945 | #if defined(__clang__) && (__clang_major__ <= 9) |
901 | #pragma GCC diagnostic ignored "-Wvector-conversion" | 946 | #pragma GCC diagnostic ignored "-Wvector-conversion" |
902 | #endif | 947 | #endif |
903 | one = vsetq_lane_u64(1, one, 0); | 948 | const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0); |
904 | p += 2; | 949 | p += 2; |
905 | 950 | ||
906 | WIDE_LOOP_START | 951 | WIDE_LOOP_START |