diff options
Diffstat (limited to 'Asm/x86/AesOpt.asm')
-rw-r--r-- | Asm/x86/AesOpt.asm | 742 |
1 files changed, 742 insertions, 0 deletions
diff --git a/Asm/x86/AesOpt.asm b/Asm/x86/AesOpt.asm new file mode 100644 index 0000000..84bf897 --- /dev/null +++ b/Asm/x86/AesOpt.asm | |||
@@ -0,0 +1,742 @@ | |||
1 | ; AesOpt.asm -- AES optimized code for x86 AES hardware instructions | ||
2 | ; 2021-12-25 : Igor Pavlov : Public domain | ||
3 | |||
4 | include 7zAsm.asm | ||
5 | |||
; Assembly-time switch for the 256-bit (YMM) VAES code paths.
; use_vaes_256 becomes defined when __ASMC__ is defined, or otherwise when the
; symbol ymm0 is defined (presumably: the assembler recognises YMM registers -
; TODO confirm). The ECHO lines only report the decision while assembling.
6 | ifdef __ASMC__ | |
7 | use_vaes_256 equ 1 | |
8 | else | |
9 | ifdef ymm0 | |
10 | use_vaes_256 equ 1 | |
11 | endif | |
12 | endif | |
13 | |||
14 | |||
15 | ifdef use_vaes_256 | |
16 | ECHO "++ VAES 256" | |
17 | else | |
18 | ECHO "-- NO VAES 256" | |
19 | endif | |
20 | |||
; Assembly-time diagnostics: report the target (x86-64 or x86), and for x86
; whether the CDECL or FASTCALL parameter convention is selected, then whether
; the LINUX or WINDOWS ABI is selected. x64, IS_CDECL and IS_LINUX are not
; defined in this file (presumably they come from 7zAsm.asm - confirm).
21 | ifdef x64 | |
22 | ECHO "x86-64" | |
23 | else | |
24 | ECHO "x86" | |
25 | if (IS_CDECL gt 0) | |
26 | ECHO "ABI : CDECL" | |
27 | else | |
28 | ECHO "ABI : no CDECL : FASTCALL" | |
29 | endif | |
30 | endif | |
31 | |||
32 | if (IS_LINUX gt 0) | |
33 | ECHO "ABI : LINUX" | |
34 | else | |
35 | ECHO "ABI : WINDOWS" | |
36 | endif | |
37 | |||
; MY_ASM_START is an external macro (not defined in this file).
38 | MY_ASM_START | |
39 | |||
; For 32-bit builds only: select the .686 instruction set and enable XMM
; instructions; these directives are skipped when x64 is defined.
40 | ifndef x64 | |
41 | .686 | |
42 | .xmm | |
43 | endif | |
44 | |||
45 | |||
; Segment alignment placeholders. MY_ALIGN is currently an empty equate (the
; ALIGN(64) variant is commented out); SEG_ALIGN forwards to it and is only
; referenced by the commented-out SEGMENT line inside MY_SEG_PROC.
46 | ; MY_ALIGN EQU ALIGN(64) | |
47 | MY_ALIGN EQU | |
48 | |||
49 | SEG_ALIGN EQU MY_ALIGN | |
50 | |||
; MY_SEG_PROC name, numParams
;   Opens a procedure by forwarding both arguments to MY_PROC (an external
;   macro, not defined in this file). The per-procedure segment setup is
;   commented out, so at present this is a plain alias for MY_PROC.
51 | MY_SEG_PROC macro name:req, numParams:req | |
52 | ; seg_name equ @CatStr(_TEXT$, name) | |
53 | ; seg_name SEGMENT SEG_ALIGN 'CODE' | |
54 | MY_PROC name, numParams | |
55 | endm | |
56 | |||
; MY_SEG_ENDP
;   Counterpart of MY_SEG_PROC. Its only statement (closing the segment) is
;   commented out, so the macro currently expands to nothing.
57 | MY_SEG_ENDP macro | |
58 | ; seg_name ENDS | |
59 | endm | |
60 | |||
61 | |||
; NUM_AES_KEYS_MAX is used further down to size the on-stack tables of
; broadcast round keys (AVX_STACK_SUB) in the 256-bit procedures.
62 | NUM_AES_KEYS_MAX equ 15 | |
63 | |||
64 | ; the number of push operators in function PROLOG | |
; MY_PROLOG pushes r6 and r7 under exactly this condition, so the first stack
; parameter sits above the return address plus num_regs_push saved registers.
65 | if (IS_LINUX eq 0) or (IS_X64 eq 0) | |
66 | num_regs_push equ 2 | |
67 | stack_param_offset equ (REG_SIZE * (1 + num_regs_push)) | |
68 | endif | |
69 | |||
; num_param: where the third argument is read from in MY_PROLOG.
;   x64           : the register REG_ABI_PARAM_2
;   x86 CDECL     : all three arguments are on the stack (aes, data, size in
;                   ascending address order), so size is at size_OFFS
;   x86 otherwise : the first stack slot after the saved registers
70 | ifdef x64 | |
71 | num_param equ REG_ABI_PARAM_2 | |
72 | else | |
73 | if (IS_CDECL gt 0) | |
74 | ; size_t size | |
75 | ; void * data | |
76 | ; UInt32 * aes | |
77 | ; ret-ip <- (r4) | |
78 | aes_OFFS equ (stack_param_offset) | |
79 | data_OFFS equ (REG_SIZE + aes_OFFS) | |
80 | size_OFFS equ (REG_SIZE + data_OFFS) | |
81 | num_param equ [r4 + size_OFFS] | |
82 | else | |
83 | num_param equ [r4 + stack_param_offset] | |
84 | endif | |
85 | endif | |
86 | |||
; Register roles shared by every procedure in this file.
;   keys : pointer into the aes buffer (IV first, round keys from offset 32)
;   rD   : pointer to the data being processed in place
;   rN   : remaining block count (rD advances 16 bytes per unit of rN)
87 | keys equ REG_PARAM_0 ; r1 | |
88 | rD equ REG_PARAM_1 ; r2 | |
89 | rN equ r0 | |
90 | |||
; koffs : running byte offset of the current round key
91 | koffs_x equ x7 | |
92 | koffs_r equ r7 | |
93 | |||
; ksize : byte-size value derived from the dword at [keys + 16] (see MY_PROLOG)
94 | ksize_x equ x6 | |
95 | ksize_r equ r6 | |
96 | |||
; keys2 : base of the on-stack table of broadcast keys (256-bit paths only)
97 | keys2 equ r3 | |
98 | |||
; NOTE: state and key are the same register (xmm0). The single-block loops use
; it as the cipher state; the multi-way loops use it as the round-key scratch.
99 | state equ xmm0 | |
100 | key equ xmm0 | |
101 | key_ymm equ ymm0 | |
102 | key_ymm_n equ 0 | |
103 | |||
; ways = number of blocks (128-bit paths) or block pairs (256-bit paths)
; processed per iteration of the wide loops.
104 | ifdef x64 | |
105 | ways = 11 | |
106 | else | |
107 | ways = 4 | |
108 | endif | |
109 | |||
; The working registers are xmm/ymm(ways_start_reg) .. (ways_start_reg + ways - 1).
110 | ways_start_reg equ 1 | |
111 | |||
; iv occupies the first register after the working set.
112 | iv equ @CatStr(xmm, %(ways_start_reg + ways)) | |
113 | iv_ymm equ @CatStr(ymm, %(ways_start_reg + ways)) | |
115 | |||
; WOP op, op2
;   Emits "op xmmK, op2" once for each of the `ways` working registers,
;   K = ways_start_reg .. ways_start_reg + ways - 1.
;   op may be an instruction or a macro. The assembly-time counter i is global,
;   so op2 text or an op macro can refer to i (e.g. [rD + i * 16]) and gets the
;   index of the current way. After expansion i equals ways.
116 | WOP macro op, op2 | |
117 | i = 0 | |
118 | rept ways | |
119 | op @CatStr(xmm, %(ways_start_reg + i)), op2 | |
120 | i = i + 1 | |
121 | endm | |
122 | endm | |
123 | |||
124 | |||
; SAVE_XMM / RESTORE_XMM exist only for x64 builds where ABI_LINUX is not
; defined (the WIN64 case according to the comments below).
125 | ifndef ABI_LINUX | |
126 | ifdef x64 | |
127 | |||
128 | ; we use 32 bytes of home space in stack in WIN64-x64 | |
129 | NUM_HOME_MM_REGS equ (32 / 16) | |
130 | ; we preserve xmm registers starting from xmm6 in WIN64-x64 | |
131 | MM_START_SAVE_REG equ 6 | |
132 | |||
; SAVE_XMM num_used_mm_regs
;   Saves xmm6 .. xmm(num_used_mm_regs - 1). Nothing is emitted when
;   num_used_mm_regs <= 6.
;   The first NUM_HOME_MM_REGS registers are stored above the saved registers
;   and return address (at r4 + stack_param_offset); for any further ones r4 is
;   lowered by stack_offset and they are stored at the new r4.
;   Side effect: sets the assembly-time globals num_save_mm_regs,
;   num_save_mm_regs2 and stack_offset, which RESTORE_XMM reads later.
133 | SAVE_XMM macro num_used_mm_regs:req | |
134 | num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG | |
135 | if num_save_mm_regs GT 0 | |
136 | num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS | |
137 | ; RSP is (16*x + 8) after entering the function in WIN64-x64 | |
; The (stack_param_offset mod 16) term is added to the allocation so that the
; movdqa stores below use 16-byte-aligned addresses.
138 | stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16) | |
139 | |||
140 | i = 0 | |
141 | rept num_save_mm_regs | |
142 | |||
; Allocate the extra stack area just before the first register that does not
; fit into the home space.
143 | if i eq NUM_HOME_MM_REGS | |
144 | sub r4, stack_offset | |
145 | endif | |
146 | |||
147 | if i lt NUM_HOME_MM_REGS | |
148 | movdqa [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i)) | |
149 | else | |
150 | movdqa [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i)) | |
151 | endif | |
152 | |||
153 | i = i + 1 | |
154 | endm | |
155 | endif | |
156 | endm | |
157 | |||
; RESTORE_XMM num_used_mm_regs
;   Undoes the most recent SAVE_XMM expansion: reloads the registers kept in
;   the extra stack area, releases that area, then reloads the registers kept
;   in the home space.
;   NOTE: the macro argument is not referenced in the body; everything is
;   driven by the globals left behind by SAVE_XMM.
158 | RESTORE_XMM macro num_used_mm_regs:req | |
159 | if num_save_mm_regs GT 0 | |
160 | i = 0 | |
161 | if num_save_mm_regs2 GT 0 | |
162 | rept num_save_mm_regs2 | |
163 | movdqa @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16] | |
164 | i = i + 1 | |
165 | endm | |
166 | add r4, stack_offset | |
167 | endif | |
168 | |||
; i is the number of registers restored above (0 when the extra area was not
; used), so num_low_regs is the count that lives in the home space.
169 | num_low_regs = num_save_mm_regs - i | |
170 | i = 0 | |
171 | rept num_low_regs | |
172 | movdqa @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16] | |
173 | i = i + 1 | |
174 | endm | |
175 | endif | |
176 | endm | |
177 | |||
178 | endif ; x64 | |
179 | endif ; ABI_LINUX | |
180 | |||
181 | |||
; MY_PROLOG num_used_mm_regs
;   Common entry sequence of every procedure in this file.
;   On exit: keys, rD and rN hold the three arguments, and
;   ksize_x = (dword at [keys + 16]) * 32.
;   num_used_mm_regs is forwarded to SAVE_XMM (WIN64 builds only).
182 | MY_PROLOG macro num_used_mm_regs:req | |
183 | ; num_regs_push: must be equal to the number of push operators | |
184 | ; push r3 | |
185 | ; push r5 | |
; r6 / r7 are saved except when building for Linux x64.
186 | if (IS_LINUX eq 0) or (IS_X64 eq 0) | |
187 | push r6 | |
188 | push r7 | |
189 | endif | |
190 | |||
191 | mov rN, num_param ; don't move it; num_param can use stack pointer (r4) | |
192 | |||
; x86 CDECL: fetch the two remaining arguments from the stack.
; x64 Linux: MY_ABI_LINUX_TO_WIN_2 is an external macro (presumably it moves
; the first two arguments into the registers used by this code - confirm).
193 | if (IS_X64 eq 0) | |
194 | if (IS_CDECL gt 0) | |
195 | mov rD, [r4 + data_OFFS] | |
196 | mov keys, [r4 + aes_OFFS] | |
197 | endif | |
198 | elseif (IS_LINUX gt 0) | |
199 | MY_ABI_LINUX_TO_WIN_2 | |
200 | endif | |
201 | |||
202 | |||
203 | ifndef ABI_LINUX | |
204 | ifdef x64 | |
205 | SAVE_XMM num_used_mm_regs | |
206 | endif | |
207 | endif | |
208 | |||
; ksize = 32 * (32-bit value stored 16 bytes into the aes buffer). The
; procedures use it as a byte offset into the 16-byte round-key array, so the
; stored value presumably counts pairs of round keys - TODO confirm with the
; C code that fills the buffer.
209 | mov ksize_x, [keys + 16] | |
210 | shl ksize_x, 5 | |
211 | endm | |
212 | |||
213 | |||
; MY_EPILOG
;   Common exit sequence; mirrors MY_PROLOG in reverse order: restore the XMM
;   registers saved by SAVE_XMM (WIN64 builds only), pop r7 and r6 when they
;   were pushed, then invoke MY_ENDP (external macro; presumably it emits the
;   return and closes the procedure - confirm).
214 | MY_EPILOG macro | |
215 | ifndef ABI_LINUX | |
216 | ifdef x64 | |
; The argument is ignored by RESTORE_XMM, which reads the globals directly.
217 | RESTORE_XMM num_save_mm_regs | |
218 | endif | |
219 | endif | |
220 | |||
221 | if (IS_LINUX eq 0) or (IS_X64 eq 0) | |
222 | pop r7 | |
223 | pop r6 | |
224 | endif | |
225 | ; pop r5 | |
226 | ; pop r3 | |
227 | MY_ENDP | |
228 | endm | |
229 | |||
230 | |||
; OP_KEY op, offs
;   Single-block helper: applies op to the state register (xmm0) with the
;   round key read directly from memory at [keys + offs].
231 | OP_KEY macro op:req, offs:req | |
232 | op state, [keys + offs] | |
233 | endm | |
234 | |||
235 | |||
; WOP_KEY op, offs
;   Multi-way helper: loads the round key at [keys + offs] into key (xmm0)
;   once, then applies op with that key to every working register via WOP.
;   Overwrites xmm0, which is also the `state` register.
236 | WOP_KEY macro op:req, offs:req | |
237 | movdqa key, [keys + offs] | |
238 | WOP op, key | |
239 | endm | |
240 | |||
241 | |||
242 | ; ---------- AES-CBC Decode ---------- | ||
243 | |||
244 | |||
; XOR_WITH_DATA reg, _ppp_
;   Per-way callback for WOP: reg ^= the 16-byte block at [rD + i * 16], where
;   i is the way index maintained by WOP. _ppp_ only absorbs WOP's second
;   argument and is not used.
245 | XOR_WITH_DATA macro reg, _ppp_ | |
246 | pxor reg, [rD + i * 16] | |
247 | endm | |
248 | |||
; WRITE_TO_DATA reg, _ppp_
;   Per-way callback for WOP: stores reg to the 16-byte block at
;   [rD + i * 16] (aligned store), where i is the way index maintained by WOP.
;   _ppp_ only absorbs WOP's second argument and is not used.
249 | WRITE_TO_DATA macro reg, _ppp_ | |
250 | movdqa [rD + i * 16], reg | |
251 | endm | |
252 | |||
253 | |||
; Extra registers for CBC decoding, placed directly after iv:
;   key0     : key XORed into the data before the aesdec rounds
;   key_last : key used by aesdeclast
; key_last_ymm_n is the bare register number, needed by the hand-encoded VAES
; macros. NUM_CBC_REGS is the total number of XMM registers used and is what
; gets passed to MY_PROLOG.
254 | ; state0 equ @CatStr(xmm, %(ways_start_reg)) | |
255 | |||
256 | key0 equ @CatStr(xmm, %(ways_start_reg + ways + 1)) | |
257 | key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1)) | |
258 | |||
259 | key_last equ @CatStr(xmm, %(ways_start_reg + ways + 2)) | |
260 | key_last_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2)) | |
261 | key_last_ymm_n equ (ways_start_reg + ways + 2) | |
262 | |||
263 | NUM_CBC_REGS equ (ways_start_reg + ways + 3) | |
264 | |||
265 | |||
; AesCbc_Decode_HW (3 parameters: aes buffer, data, block count)
;   AES-CBC decryption in place using aesdec / aesdeclast.
;   Reads the IV from [keys], processes `ways` blocks per iteration while at
;   least `ways` blocks remain, then finishes one block at a time, and finally
;   writes the updated IV back to the start of the aes buffer.
;   The global labels *_start, *_start_2 and *_start_3 are entry points used by
;   AesCbc_Decode_HW_256.
266 | MY_SEG_PROC AesCbc_Decode_HW, 3 | |
267 | |||
268 | AesCbc_Decode_HW_start:: | |
269 | MY_PROLOG NUM_CBC_REGS | |
270 | |||
; Entry for callers that have already executed MY_PROLOG.
271 | AesCbc_Decode_HW_start_2:: | |
272 | movdqa iv, [keys] | |
; Skip the 32-byte header; keys now points at the round-key array.
273 | add keys, 32 | |
274 | |||
; key0 is the key at byte offset ksize, key_last the key at offset 0.
275 | movdqa key0, [keys + 1 * ksize_r] | |
276 | movdqa key_last, [keys] | |
; ksize now names the first key consumed by the aesdec loop below.
277 | sub ksize_x, 16 | |
278 | |||
279 | jmp check2 | |
280 | align 16 | |
; ---- wide loop: `ways` blocks per iteration ----
281 | nextBlocks2: | |
282 | WOP movdqa, [rD + i * 16] | |
283 | mov koffs_x, ksize_x | |
284 | ; WOP_KEY pxor, ksize_r + 16 | |
285 | WOP pxor, key0 | |
286 | ; align 16 | |
; One aesdec round per pass, walking the keys downwards until offset 0.
287 | @@: | |
288 | WOP_KEY aesdec, 1 * koffs_r | |
289 | sub koffs_r, 16 | |
290 | jnz @B | |
291 | ; WOP_KEY aesdeclast, 0 | |
292 | WOP aesdeclast, key_last | |
293 | |||
; CBC chaining: the first block is XORed with iv, every other block with the
; ciphertext block that precedes it (still unmodified in memory).
294 | pxor @CatStr(xmm, %(ways_start_reg)), iv | |
295 | i = 1 | |
296 | rept ways - 1 | |
297 | pxor @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16] | |
298 | i = i + 1 | |
299 | endm | |
; The last ciphertext block becomes the next iv; it must be read before the
; plaintext is written over it.
300 | movdqa iv, [rD + ways * 16 - 16] | |
301 | WOP WRITE_TO_DATA | |
302 | |||
303 | add rD, ways * 16 | |
; Entry used by the 256-bit procedure to finish its remaining blocks; expects
; keys, ksize, iv, key0 and key_last to be set up as at this point.
304 | AesCbc_Decode_HW_start_3:: | |
305 | check2: | |
; Continue while no borrow occurs, i.e. while at least `ways` blocks remained.
306 | sub rN, ways | |
307 | jnc nextBlocks2 | |
308 | add rN, ways | |
309 | |||
; The single-block loop handles two keys per pass, so its start offset is
; another 16 bytes lower.
310 | sub ksize_x, 16 | |
311 | |||
312 | jmp check | |
; ---- tail loop: one block per iteration ----
313 | nextBlock: | |
314 | movdqa state, [rD] | |
315 | mov koffs_x, ksize_x | |
316 | ; OP_KEY pxor, 1 * ksize_r + 32 | |
317 | pxor state, key0 | |
318 | ; movdqa state0, [rD] | |
319 | ; movdqa state, key0 | |
320 | ; pxor state, state0 | |
321 | @@: | |
322 | OP_KEY aesdec, 1 * koffs_r + 16 | |
323 | OP_KEY aesdec, 1 * koffs_r | |
324 | sub koffs_r, 32 | |
325 | jnz @B | |
; The key at offset 16 is applied outside the loop, then the final round.
326 | OP_KEY aesdec, 16 | |
327 | ; OP_KEY aesdeclast, 0 | |
328 | aesdeclast state, key_last | |
329 | |||
330 | pxor state, iv | |
331 | movdqa iv, [rD] | |
332 | ; movdqa iv, state0 | |
333 | movdqa [rD], state | |
334 | |||
335 | add rD, 16 | |
336 | check: | |
337 | sub rN, 1 | |
338 | jnc nextBlock | |
339 | |||
; keys was advanced by 32 above, so this stores iv where it was loaded from.
340 | movdqa [keys - 32], iv | |
341 | MY_EPILOG | |
342 | |||
343 | |||
344 | |||
345 | |||
346 | ; ---------- AVX ---------- | ||
347 | |||
348 | |||
; AVX__WOP_n op
;   Invokes op once per way, passing the working register's NUMBER
;   (ways_start_reg + i) rather than its name. Used with the hand-encoded VAES
;   macros, which build instruction bytes from register numbers.
349 | AVX__WOP_n macro op | |
350 | i = 0 | |
351 | rept ways | |
352 | op (ways_start_reg + i) | |
353 | i = i + 1 | |
354 | endm | |
355 | endm | |
356 | |||
; AVX__WOP op
;   Invokes op once per way, passing the working register's YMM name
;   (ymm<ways_start_reg + i>). The op macro can read the global counter i to
;   learn which way it is expanding.
357 | AVX__WOP macro op | |
358 | i = 0 | |
359 | rept ways | |
360 | op @CatStr(ymm, %(ways_start_reg + i)) | |
361 | i = i + 1 | |
362 | endm | |
363 | endm | |
364 | |||
365 | |||
; AVX__WOP_KEY op, offs
;   Loads the 32-byte broadcast round key at [keys2 + offs] into key_ymm (ymm0)
;   with an aligned load, then runs op on every working register by number.
;   The op macros used with it reference ymm0 implicitly through key_ymm_n.
366 | AVX__WOP_KEY macro op:req, offs:req | |
367 | vmovdqa key_ymm, ymmword ptr [keys2 + offs] | |
368 | AVX__WOP_n op | |
369 | endm | |
370 | |||
371 | |||
; AVX__CBC_START reg
;   Per-way callback for AVX__WOP: reg = key0_ymm XOR the 32 bytes (two blocks)
;   at [rD + 32 * i]. Load and first key addition are fused in one vpxor.
372 | AVX__CBC_START macro reg | |
373 | ; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i] | |
374 | vpxor reg, key0_ymm, ymmword ptr [rD + 32 * i] | |
375 | endm | |
376 | |||
; AVX__CBC_END reg
;   Per-way callback for AVX__WOP performing the CBC chaining XOR.
;   Way 0 is XORed with iv_ymm; every other way is XORed with the 32 bytes that
;   start 16 bytes before its own data, i.e. the ciphertext blocks preceding
;   each of its two blocks. That memory operand is not 32-byte aligned.
377 | AVX__CBC_END macro reg | |
378 | if i eq 0 | |
379 | vpxor reg, reg, iv_ymm | |
380 | else | |
381 | vpxor reg, reg, ymmword ptr [rD + i * 32 - 16] | |
382 | endif | |
383 | endm | |
384 | |||
385 | |||
; AVX__WRITE_TO_DATA reg
;   Per-way callback for AVX__WOP: stores reg to the 32 bytes at [rD + 32 * i]
;   using an unaligned store.
386 | AVX__WRITE_TO_DATA macro reg | |
387 | vmovdqu ymmword ptr [rD + 32 * i], reg | |
388 | endm | |
389 | |||
; AVX__XOR_WITH_DATA reg
;   Per-way callback for AVX__WOP: reg ^= the 32 bytes at [rD + 32 * i].
390 | AVX__XOR_WITH_DATA macro reg | |
391 | vpxor reg, reg, ymmword ptr [rD + 32 * i] | |
392 | endm | |
393 | |||
; AVX__CTR_START reg
;   Per-way callback for AVX__WOP in CTR mode: advances both counters held in
;   iv_ymm by one_ymm (64-bit lane addition), then sets reg = iv_ymm XOR
;   key0_ymm. iv_ymm is modified, so the ways must be expanded in order.
394 | AVX__CTR_START macro reg | |
395 | vpaddq iv_ymm, iv_ymm, one_ymm | |
396 | ; vpxor reg, iv_ymm, key_ymm | |
397 | vpxor reg, iv_ymm, key0_ymm | |
398 | endm | |
399 | |||
400 | |||
; MY_VAES_INSTR_2 cmd, dest, a1, a2
;   Emits a 256-bit VAES instruction as five raw bytes instead of a mnemonic
;   (presumably so the file also builds with assemblers that lack the VAES
;   mnemonics - confirm). dest, a1 and a2 are YMM register NUMBERS:
;   dest = destination, a1 = first source, a2 = second source (register form).
;     byte 0: 0C4h, the three-byte VEX prefix
;     byte 1: opcode-map field 2, bit 6 set, bit 5 clear only when a2 >= 8,
;             bit 7 clear only when dest >= 8 (inverted register-extension bits)
;     byte 2: constant 5 (vector-length bit set, pp = 01) plus the inverted
;             4-bit a1 in bits 3..6
;     byte 3: cmd, the opcode byte
;     byte 4: ModRM with mod = 11b, reg = dest low 3 bits, rm = a2 low 3 bits
401 | MY_VAES_INSTR_2 macro cmd, dest, a1, a2 | |
402 | db 0c4H | |
403 | db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8) | |
404 | db 5 + 8 * ((not (a1)) and 15) | |
405 | db cmd | |
406 | db 0c0H + 8 * ((dest) and 7) + ((a2) and 7) | |
407 | endm | |
408 | |||
; MY_VAES_INSTR cmd, dest, a
;   Two-operand convenience form: dest = cmd(dest, a).
409 | MY_VAES_INSTR macro cmd, dest, a | |
410 | MY_VAES_INSTR_2 cmd, dest, dest, a | |
411 | endm | |
412 | |||
; Mnemonic-like wrappers; each one fixes the opcode byte (0DCh .. 0DFh).
413 | MY_vaesenc macro dest, a | |
414 | MY_VAES_INSTR 0dcH, dest, a | |
415 | endm | |
416 | MY_vaesenclast macro dest, a | |
417 | MY_VAES_INSTR 0ddH, dest, a | |
418 | endm | |
419 | MY_vaesdec macro dest, a | |
420 | MY_VAES_INSTR 0deH, dest, a | |
421 | endm | |
422 | MY_vaesdeclast macro dest, a | |
423 | MY_VAES_INSTR 0dfH, dest, a | |
424 | endm | |
425 | |||
426 | |||
; Per-way callbacks for AVX__WOP_n / AVX__WOP_KEY. reg is a YMM register
; NUMBER. All of them take the round key from ymm0 (key_ymm_n), except
; AVX__VAES_DEC_LAST_key_last, which uses the dedicated key_last register.
427 | AVX__VAES_DEC macro reg | |
428 | MY_vaesdec reg, key_ymm_n | |
429 | endm | |
430 | |||
431 | AVX__VAES_DEC_LAST_key_last macro reg | |
432 | ; MY_vaesdeclast reg, key_ymm_n | |
433 | MY_vaesdeclast reg, key_last_ymm_n | |
434 | endm | |
435 | |||
436 | AVX__VAES_ENC macro reg | |
437 | MY_vaesenc reg, key_ymm_n | |
438 | endm | |
439 | |||
440 | AVX__VAES_ENC_LAST macro reg | |
441 | MY_vaesenclast reg, key_ymm_n | |
442 | endm | |
443 | |||
; AVX__vinserti128_TO_HIGH dest, src
;   Replaces the upper 128 bits of dest with src; the lower 128 bits of dest
;   are kept.
444 | AVX__vinserti128_TO_HIGH macro dest, src | |
445 | vinserti128 dest, dest, src, 1 | |
446 | endm | |
447 | |||
448 | |||
; AesCbc_Decode_HW_256 (3 parameters: aes buffer, data, block count)
;   AES-CBC decryption with 256-bit VAES: each YMM working register carries two
;   consecutive blocks, so one loop iteration handles ways * 2 blocks.
;   Inputs smaller than ways * 2 blocks, and the leftover blocks after the wide
;   loop, are handed to the 128-bit code in AesCbc_Decode_HW.
;   Without use_vaes_256 the procedure is just a jump to AesCbc_Decode_HW.
449 | MY_PROC AesCbc_Decode_HW_256, 3 | |
450 | ifdef use_vaes_256 | |
451 | MY_PROLOG NUM_CBC_REGS | |
452 | |||
; Too few blocks for one wide iteration: continue in the 128-bit procedure,
; after its prolog (the prolog has already run here).
453 | cmp rN, ways * 2 | |
454 | jb AesCbc_Decode_HW_start_2 | |
455 | |||
456 | vmovdqa iv, xmmword ptr [keys] | |
457 | add keys, 32 | |
458 | |||
; Both 128-bit lanes of key0_ymm / key_last_ymm receive the same round key.
459 | vbroadcasti128 key0_ymm, xmmword ptr [keys + 1 * ksize_r] | |
460 | vbroadcasti128 key_last_ymm, xmmword ptr [keys] | |
; koffs walks the 16-byte source keys; ksize is doubled because every key
; occupies 32 bytes in the broadcast table.
461 | sub ksize_x, 16 | |
462 | mov koffs_x, ksize_x | |
463 | add ksize_x, ksize_x | |
464 | |||
; Reserve stack space for the broadcast key table and save keys2.
465 | AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32) | |
466 | push keys2 | |
467 | sub r4, AVX_STACK_SUB | |
468 | ; sub r4, 32 | |
469 | ; sub r4, ksize_r | |
470 | ; lea keys2, [r4 + 32] | |
; keys2 = r4 rounded down to a 32-byte boundary. It can therefore lie below
; r4, but the lowest slot written is keys2 + 32, which is above r4.
471 | mov keys2, r4 | |
472 | and keys2, -32 | |
; Fill the table: the key at source offset koffs goes to table offset
; koffs * 2, for koffs from its initial value down to 16.
473 | broad: | |
474 | vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r] | |
475 | vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm | |
476 | sub koffs_r, 16 | |
477 | ; jnc broad | |
478 | jnz broad | |
479 | |||
; Pre-subtract one iteration; the loop below continues while no borrow occurs.
480 | sub rN, ways * 2 | |
481 | |||
482 | align 16 | |
483 | avx_cbcdec_nextBlock2: | |
484 | mov koffs_x, ksize_x | |
485 | ; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32 | |
486 | AVX__WOP AVX__CBC_START | |
; One vaesdec round per pass, stepping 32 bytes down through the table.
487 | @@: | |
488 | AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r | |
489 | sub koffs_r, 32 | |
490 | jnz @B | |
491 | ; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0 | |
492 | AVX__WOP_n AVX__VAES_DEC_LAST_key_last | |
493 | |||
; iv_ymm = { low: iv, high: first ciphertext block }, which is the chaining
; input for the two blocks of way 0.
494 | AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD] | |
495 | AVX__WOP AVX__CBC_END | |
496 | |||
; Keep the last ciphertext block as the next iv before it is overwritten.
497 | vmovdqa iv, xmmword ptr [rD + ways * 32 - 16] | |
498 | AVX__WOP AVX__WRITE_TO_DATA | |
499 | |||
500 | add rD, ways * 32 | |
501 | sub rN, ways * 2 | |
502 | jnc avx_cbcdec_nextBlock2 | |
503 | add rN, ways * 2 | |
504 | |||
; Undo the doubling so ksize matches what the 128-bit code expects at
; AesCbc_Decode_HW_start_3.
505 | shr ksize_x, 1 | |
506 | |||
507 | ; lea r4, [r4 + 1 * ksize_r + 32] | |
508 | add r4, AVX_STACK_SUB | |
509 | pop keys2 | |
510 | |||
; Clear the upper YMM halves before continuing with legacy SSE instructions;
; the XMM parts of iv, key0 and key_last stay intact.
511 | vzeroupper | |
512 | jmp AesCbc_Decode_HW_start_3 | |
513 | else | |
514 | jmp AesCbc_Decode_HW_start | |
515 | endif | |
516 | MY_ENDP | |
517 | MY_SEG_ENDP | |
518 | |||
519 | |||
520 | |||
521 | |||
522 | ; ---------- AES-CBC Encode ---------- | ||
523 | |||
; CBC-encode register plan: e0 (xmm1) receives the input block, and
; CENC_NUM_REG_KEYS round keys are kept resident in XMM registers starting at
; xmm<CENC_START_KEY>.
524 | e0 equ xmm1 | |
525 | |||
526 | CENC_START_KEY equ 2 | |
527 | CENC_NUM_REG_KEYS equ (3 * 2) | |
528 | ; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS)) | ||
529 | |||
; AesCbc_Encode_HW (3 parameters: aes buffer, data, block count)
;   AES-CBC encryption in place, one block at a time (each block depends on the
;   previous ciphertext). The chaining value lives in `state` (xmm0): it is
;   loaded from [keys] on entry and written back there on exit.
;   The first CENC_NUM_REG_KEYS round keys are cached in registers; the
;   remaining ones are read from memory inside the loop.
530 | MY_SEG_PROC AesCbc_Encode_HW, 3 | |
531 | MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0) | |
532 | |||
533 | movdqa state, [keys] | |
534 | add keys, 32 | |
535 | |||
; Cache the first CENC_NUM_REG_KEYS round keys in xmm2 and upwards.
536 | i = 0 | |
537 | rept CENC_NUM_REG_KEYS | |
538 | movdqa @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16] | |
539 | i = i + 1 | |
540 | endm | |
541 | |||
; Point keys at the last round key and turn ksize into the negative offset of
; the first key that is not cached, so the loop can count up towards zero.
542 | add keys, ksize_r | |
543 | neg ksize_r | |
544 | add ksize_r, (16 * CENC_NUM_REG_KEYS) | |
545 | ; movdqa last_key, [keys] | |
546 | jmp check_e | |
547 | |||
548 | align 16 | |
549 | nextBlock_e: | |
550 | movdqa e0, [rD] | |
551 | mov koffs_r, ksize_r | |
; XOR the first round key into the plaintext before combining it with the
; chaining value.
552 | pxor e0, @CatStr(xmm, %(CENC_START_KEY)) | |
553 | pxor state, e0 | |
554 | |||
; Rounds that use the register-resident keys.
555 | i = 1 | |
556 | rept (CENC_NUM_REG_KEYS - 1) | |
557 | aesenc state, @CatStr(xmm, %(CENC_START_KEY + i)) | |
558 | i = i + 1 | |
559 | endm | |
560 | |||
; Remaining rounds, two per pass, with keys read from memory.
561 | @@: | |
562 | OP_KEY aesenc, 1 * koffs_r | |
563 | OP_KEY aesenc, 1 * koffs_r + 16 | |
564 | add koffs_r, 32 | |
565 | jnz @B | |
566 | OP_KEY aesenclast, 0 | |
567 | ; aesenclast state, last_key | |
568 | |||
; state is both the ciphertext just produced and the next chaining value.
569 | movdqa [rD], state | |
570 | add rD, 16 | |
571 | check_e: | |
572 | sub rN, 1 | |
573 | jnc nextBlock_e | |
574 | |||
575 | ; movdqa [keys - 32], state | |
; keys and ksize were both modified above; this expression cancels those
; adjustments and resolves to the address state was loaded from.
576 | movdqa [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state | |
577 | MY_EPILOG | |
578 | MY_SEG_ENDP | |
579 | |||
580 | |||
581 | |||
582 | ; ---------- AES-CTR ---------- | ||
583 | |||
584 | ifdef x64 | ||
585 | ; ways = 11 | ||
586 | endif | ||
587 | |||
588 | |||
; CTR register plan, placed directly after iv:
;   one  : counter increment
;   key0 : key XORed with the counter before the aesenc rounds
; NOTE: key0 / key0_ymm are re-equated here to a different register than in
; the CBC section, so from this point on they name xmm/ymm(ways + 3).
589 | one equ @CatStr(xmm, %(ways_start_reg + ways + 1)) | |
590 | one_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1)) | |
591 | key0 equ @CatStr(xmm, %(ways_start_reg + ways + 2)) | |
592 | key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2)) | |
593 | NUM_CTR_REGS equ (ways_start_reg + ways + 3) | |
594 | |||
; INIT_CTR reg, _ppp_
;   Per-way callback for WOP: advances the counter in iv by `one` (64-bit lane
;   addition) and copies the new counter value into reg. _ppp_ only absorbs
;   WOP's second argument and is not used.
595 | INIT_CTR macro reg, _ppp_ | |
596 | paddq iv, one | |
597 | movdqa reg, iv | |
598 | endm | |
599 | |||
600 | |||
; AesCtr_Code_HW (3 parameters: aes buffer, data, block count)
;   AES-CTR in place: for each block the counter in iv is incremented,
;   encrypted, and XORed into the data. Processes `ways` blocks per iteration
;   while at least `ways` remain, then one block at a time, and finally writes
;   the counter back to the start of the aes buffer.
;   The global labels Ctr_start, Ctr_start_2 and Ctr_start_3 are entry points
;   used by AesCtr_Code_HW_256.
601 | MY_SEG_PROC AesCtr_Code_HW, 3 | |
602 | Ctr_start:: | |
603 | MY_PROLOG NUM_CTR_REGS | |
604 | |||
; Entry for callers that have already executed MY_PROLOG.
605 | Ctr_start_2:: | |
606 | movdqa iv, [keys] | |
607 | add keys, 32 | |
608 | movdqa key0, [keys] | |
609 | |||
; Point keys at the last round key; ksize becomes the negative offset of the
; first key applied with aesenc, so the round loop counts up towards zero.
610 | add keys, ksize_r | |
611 | neg ksize_r | |
612 | add ksize_r, 16 | |
613 | |||
; Entry used by the 256-bit procedure to finish its remaining blocks; expects
; keys, ksize, iv and key0 to be set up as at this point.
614 | Ctr_start_3:: | |
; one = 1 in the lowest dword, all other bits zero (koffs is only a scratch
; register here).
615 | mov koffs_x, 1 | |
616 | movd one, koffs_x | |
617 | jmp check2_c | |
618 | |||
619 | align 16 | |
; ---- wide loop: `ways` blocks per iteration ----
620 | nextBlocks2_c: | |
; Each working register receives the next counter value (op2 "0" is unused).
621 | WOP INIT_CTR, 0 | |
622 | mov koffs_r, ksize_r | |
623 | ; WOP_KEY pxor, 1 * koffs_r -16 | |
624 | WOP pxor, key0 | |
625 | @@: | |
626 | WOP_KEY aesenc, 1 * koffs_r | |
627 | add koffs_r, 16 | |
628 | jnz @B | |
629 | WOP_KEY aesenclast, 0 | |
630 | |||
; Combine the keystream with the data and store the result in place.
631 | WOP XOR_WITH_DATA | |
632 | WOP WRITE_TO_DATA | |
633 | add rD, ways * 16 | |
634 | check2_c: | |
; Continue while no borrow occurs, i.e. while at least `ways` blocks remained.
635 | sub rN, ways | |
636 | jnc nextBlocks2_c | |
637 | add rN, ways | |
638 | |||
; The single-block loop handles two keys per pass; shift the base and the
; start offset by one key so that its loop still terminates exactly at zero.
639 | sub keys, 16 | |
640 | add ksize_r, 16 | |
641 | |||
642 | jmp check_c | |
643 | |||
644 | ; align 16 | |
; ---- tail loop: one block per iteration ----
645 | nextBlock_c: | |
646 | paddq iv, one | |
647 | ; movdqa state, [keys + 1 * koffs_r - 16] | |
648 | movdqa state, key0 | |
649 | mov koffs_r, ksize_r | |
650 | pxor state, iv | |
651 | |||
652 | @@: | |
653 | OP_KEY aesenc, 1 * koffs_r | |
654 | OP_KEY aesenc, 1 * koffs_r + 16 | |
655 | add koffs_r, 32 | |
656 | jnz @B | |
; With keys moved back by 16, offsets 0 and 16 are the last two round keys.
657 | OP_KEY aesenc, 0 | |
658 | OP_KEY aesenclast, 16 | |
659 | |||
660 | pxor state, [rD] | |
661 | movdqa [rD], state | |
662 | add rD, 16 | |
663 | check_c: | |
664 | sub rN, 1 | |
665 | jnc nextBlock_c | |
666 | |||
667 | ; movdqa [keys - 32], iv | |
; keys and ksize were both modified above; this expression cancels those
; adjustments and resolves to the address iv was loaded from.
668 | movdqa [keys + 1 * ksize_r - 16 - 32], iv | |
669 | MY_EPILOG | |
670 | |||
671 | |||
; AesCtr_Code_HW_256 (3 parameters: aes buffer, data, block count)
;   AES-CTR with 256-bit VAES: each YMM working register carries two
;   consecutive counter blocks, so one loop iteration handles ways * 2 blocks.
;   Inputs smaller than ways * 2 blocks, and the leftover blocks after the wide
;   loop, are handed to the 128-bit code in AesCtr_Code_HW.
;   Without use_vaes_256 the procedure is just a jump to AesCtr_Code_HW.
672 | MY_PROC AesCtr_Code_HW_256, 3 | |
673 | ifdef use_vaes_256 | |
674 | MY_PROLOG NUM_CTR_REGS | |
675 | |||
; Too few blocks for one wide iteration: continue in the 128-bit procedure,
; after its prolog (the prolog has already run here).
676 | cmp rN, ways * 2 | |
677 | jb Ctr_start_2 | |
678 | |||
679 | vbroadcasti128 iv_ymm, xmmword ptr [keys] | |
680 | add keys, 32 | |
681 | vbroadcasti128 key0_ymm, xmmword ptr [keys] | |
; Counter setup: subtracting 1 from the low lane only gives
; iv_ymm = { low: iv - 1, high: iv }. one_ymm is then rebuilt as { 2, 2 }, so
; each AVX__CTR_START yields the next two consecutive counter values.
682 | mov koffs_x, 1 | |
683 | vmovd one, koffs_x | |
684 | vpsubq iv_ymm, iv_ymm, one_ymm | |
685 | vpaddq one, one, one | |
686 | AVX__vinserti128_TO_HIGH one_ymm, one | |
687 | |||
; Point keys at the last round key. koffs is the negative offset for walking
; the 16-byte source keys; ksize is doubled because every key occupies
; 32 bytes in the broadcast table.
688 | add keys, ksize_r | |
689 | sub ksize_x, 16 | |
690 | neg ksize_r | |
691 | mov koffs_r, ksize_r | |
692 | add ksize_r, ksize_r | |
693 | |||
; Reserve stack space for the broadcast key table and save keys2. The table is
; indexed with negative offsets, so keys2 points at its highest slot: 32 bytes
; below the pushed value, rounded down to a 32-byte boundary.
694 | AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32) | |
695 | push keys2 | |
696 | lea keys2, [r4 - 32] | |
697 | sub r4, AVX_STACK_SUB | |
698 | and keys2, -32 | |
; The last round key goes to table offset 0 ...
699 | vbroadcasti128 key_ymm, xmmword ptr [keys] | |
700 | vmovdqa ymmword ptr [keys2], key_ymm | |
; ... and the key at source offset koffs goes to table offset koffs * 2.
701 | @@: | |
702 | vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r] | |
703 | vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm | |
704 | add koffs_r, 16 | |
705 | jnz @B | |
706 | |||
; Pre-subtract one iteration; the loop below continues while no borrow occurs.
707 | sub rN, ways * 2 | |
708 | |||
709 | align 16 | |
710 | avx_ctr_nextBlock2: | |
711 | mov koffs_r, ksize_r | |
712 | AVX__WOP AVX__CTR_START | |
713 | ; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32 | |
; One vaesenc round per pass, stepping 32 bytes up through the table.
714 | @@: | |
715 | AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r | |
716 | add koffs_r, 32 | |
717 | jnz @B | |
718 | AVX__WOP_KEY AVX__VAES_ENC_LAST, 0 | |
719 | |||
; Combine the keystream with the data and store the result in place.
720 | AVX__WOP AVX__XOR_WITH_DATA | |
721 | AVX__WOP AVX__WRITE_TO_DATA | |
722 | |||
723 | add rD, ways * 32 | |
724 | sub rN, ways * 2 | |
725 | jnc avx_ctr_nextBlock2 | |
726 | add rN, ways * 2 | |
727 | |||
; The high lane holds the most recently used counter; it becomes the 128-bit
; iv for the tail code. Halving ksize (arithmetic shift, it is negative)
; restores the value expected at Ctr_start_3.
728 | vextracti128 iv, iv_ymm, 1 | |
729 | sar ksize_r, 1 | |
730 | |||
731 | add r4, AVX_STACK_SUB | |
732 | pop keys2 | |
733 | |||
; Clear the upper YMM halves before continuing with legacy SSE instructions;
; Ctr_start_3 reloads `one`, and the XMM parts of iv and key0 stay intact.
734 | vzeroupper | |
735 | jmp Ctr_start_3 | |
736 | else | |
737 | jmp Ctr_start | |
738 | endif | |
739 | MY_ENDP | |
740 | MY_SEG_ENDP | |
741 | |||
742 | end | ||