about summary refs log tree commit diff
path: root/Asm/x86/AesOpt.asm
diff options
context:
space:
mode:
author: Igor Pavlov <87184205+ip7z@users.noreply.github.com> 2021-12-27 00:00:00 +0000
committer: Igor Pavlov <87184205+ip7z@users.noreply.github.com> 2022-03-18 15:35:13 +0500
commitf19f813537c7aea1c20749c914e756b54a9c3cf5 (patch)
tree816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /Asm/x86/AesOpt.asm
parent98e06a519b63b81986abe76d28887f6984a7732b (diff)
download7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.tar.gz
7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.tar.bz2
7zip-f19f813537c7aea1c20749c914e756b54a9c3cf5.zip
'21.07'21.07
Diffstat (limited to 'Asm/x86/AesOpt.asm')
-rw-r--r--  Asm/x86/AesOpt.asm  742
1 files changed, 742 insertions, 0 deletions
diff --git a/Asm/x86/AesOpt.asm b/Asm/x86/AesOpt.asm
new file mode 100644
index 0000000..84bf897
--- /dev/null
+++ b/Asm/x86/AesOpt.asm
@@ -0,0 +1,742 @@
1; AesOpt.asm -- AES optimized code for x86 AES hardware instructions
2; 2021-12-25 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
6ifdef __ASMC__
7 use_vaes_256 equ 1
8else
9ifdef ymm0
10 use_vaes_256 equ 1
11endif
12endif
13
14
15ifdef use_vaes_256
16 ECHO "++ VAES 256"
17else
18 ECHO "-- NO VAES 256"
19endif
20
21ifdef x64
22 ECHO "x86-64"
23else
24 ECHO "x86"
25if (IS_CDECL gt 0)
26 ECHO "ABI : CDECL"
27else
28 ECHO "ABI : no CDECL : FASTCALL"
29endif
30endif
31
32if (IS_LINUX gt 0)
33 ECHO "ABI : LINUX"
34else
35 ECHO "ABI : WINDOWS"
36endif
37
38MY_ASM_START
39
40ifndef x64
41 .686
42 .xmm
43endif
44
45
46; MY_ALIGN EQU ALIGN(64)
47MY_ALIGN EQU
48
49SEG_ALIGN EQU MY_ALIGN
50
51MY_SEG_PROC macro name:req, numParams:req
52 ; seg_name equ @CatStr(_TEXT$, name)
53 ; seg_name SEGMENT SEG_ALIGN 'CODE'
54 MY_PROC name, numParams
55endm
56
57MY_SEG_ENDP macro
58 ; seg_name ENDS
59endm
60
61
62NUM_AES_KEYS_MAX equ 15
63
64; the number of push operators in function PROLOG
65if (IS_LINUX eq 0) or (IS_X64 eq 0)
66num_regs_push equ 2
67stack_param_offset equ (REG_SIZE * (1 + num_regs_push))
68endif
69
70ifdef x64
71 num_param equ REG_ABI_PARAM_2
72else
73 if (IS_CDECL gt 0)
74 ; size_t size
75 ; void * data
76 ; UInt32 * aes
77 ; ret-ip <- (r4)
78 aes_OFFS equ (stack_param_offset)
79 data_OFFS equ (REG_SIZE + aes_OFFS)
80 size_OFFS equ (REG_SIZE + data_OFFS)
81 num_param equ [r4 + size_OFFS]
82 else
83 num_param equ [r4 + stack_param_offset]
84 endif
85endif
86
87keys equ REG_PARAM_0 ; r1
88rD equ REG_PARAM_1 ; r2
89rN equ r0
90
91koffs_x equ x7
92koffs_r equ r7
93
94ksize_x equ x6
95ksize_r equ r6
96
97keys2 equ r3
98
99state equ xmm0
100key equ xmm0
101key_ymm equ ymm0
102key_ymm_n equ 0
103
104ifdef x64
105 ways = 11
106else
107 ways = 4
108endif
109
110ways_start_reg equ 1
111
112iv equ @CatStr(xmm, %(ways_start_reg + ways))
113iv_ymm equ @CatStr(ymm, %(ways_start_reg + ways))
114
115
116WOP macro op, op2
117 i = 0
118 rept ways
119 op @CatStr(xmm, %(ways_start_reg + i)), op2
120 i = i + 1
121 endm
122endm
123
124
125ifndef ABI_LINUX
126ifdef x64
127
128; we use 32 bytes of home space in stack in WIN64-x64
129NUM_HOME_MM_REGS equ (32 / 16)
130; we preserve xmm registers starting from xmm6 in WIN64-x64
131MM_START_SAVE_REG equ 6
132
133SAVE_XMM macro num_used_mm_regs:req
134 num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG
135 if num_save_mm_regs GT 0
136 num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS
137 ; RSP is (16*x + 8) after entering the function in WIN64-x64
138 stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)
139
140 i = 0
141 rept num_save_mm_regs
142
143 if i eq NUM_HOME_MM_REGS
144 sub r4, stack_offset
145 endif
146
147 if i lt NUM_HOME_MM_REGS
148 movdqa [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
149 else
150 movdqa [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
151 endif
152
153 i = i + 1
154 endm
155 endif
156endm
157
158RESTORE_XMM macro num_used_mm_regs:req
159 if num_save_mm_regs GT 0
160 i = 0
161 if num_save_mm_regs2 GT 0
162 rept num_save_mm_regs2
163 movdqa @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]
164 i = i + 1
165 endm
166 add r4, stack_offset
167 endif
168
169 num_low_regs = num_save_mm_regs - i
170 i = 0
171 rept num_low_regs
172 movdqa @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]
173 i = i + 1
174 endm
175 endif
176endm
177
178endif ; x64
179endif ; ABI_LINUX
180
181
182MY_PROLOG macro num_used_mm_regs:req
183 ; num_regs_push: must be equal to the number of push operators
184 ; push r3
185 ; push r5
186 if (IS_LINUX eq 0) or (IS_X64 eq 0)
187 push r6
188 push r7
189 endif
190
191 mov rN, num_param ; don't move it; num_param can use stack pointer (r4)
192
193 if (IS_X64 eq 0)
194 if (IS_CDECL gt 0)
195 mov rD, [r4 + data_OFFS]
196 mov keys, [r4 + aes_OFFS]
197 endif
198 elseif (IS_LINUX gt 0)
199 MY_ABI_LINUX_TO_WIN_2
200 endif
201
202
203 ifndef ABI_LINUX
204 ifdef x64
205 SAVE_XMM num_used_mm_regs
206 endif
207 endif
208
209 mov ksize_x, [keys + 16]
210 shl ksize_x, 5
211endm
212
213
214MY_EPILOG macro
215 ifndef ABI_LINUX
216 ifdef x64
217 RESTORE_XMM num_save_mm_regs
218 endif
219 endif
220
221 if (IS_LINUX eq 0) or (IS_X64 eq 0)
222 pop r7
223 pop r6
224 endif
225 ; pop r5
226 ; pop r3
227 MY_ENDP
228endm
229
230
231OP_KEY macro op:req, offs:req
232 op state, [keys + offs]
233endm
234
235
236WOP_KEY macro op:req, offs:req
237 movdqa key, [keys + offs]
238 WOP op, key
239endm
240
241
242; ---------- AES-CBC Decode ----------
243
244
245XOR_WITH_DATA macro reg, _ppp_
246 pxor reg, [rD + i * 16]
247endm
248
249WRITE_TO_DATA macro reg, _ppp_
250 movdqa [rD + i * 16], reg
251endm
252
253
254; state0 equ @CatStr(xmm, %(ways_start_reg))
255
256key0 equ @CatStr(xmm, %(ways_start_reg + ways + 1))
257key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
258
259key_last equ @CatStr(xmm, %(ways_start_reg + ways + 2))
260key_last_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
261key_last_ymm_n equ (ways_start_reg + ways + 2)
262
263NUM_CBC_REGS equ (ways_start_reg + ways + 3)
264
265
266MY_SEG_PROC AesCbc_Decode_HW, 3
267
268 AesCbc_Decode_HW_start::
269 MY_PROLOG NUM_CBC_REGS
270
271 AesCbc_Decode_HW_start_2::
272 movdqa iv, [keys]
273 add keys, 32
274
275 movdqa key0, [keys + 1 * ksize_r]
276 movdqa key_last, [keys]
277 sub ksize_x, 16
278
279 jmp check2
280 align 16
281 nextBlocks2:
282 WOP movdqa, [rD + i * 16]
283 mov koffs_x, ksize_x
284 ; WOP_KEY pxor, ksize_r + 16
285 WOP pxor, key0
286 ; align 16
287 @@:
288 WOP_KEY aesdec, 1 * koffs_r
289 sub koffs_r, 16
290 jnz @B
291 ; WOP_KEY aesdeclast, 0
292 WOP aesdeclast, key_last
293
294 pxor @CatStr(xmm, %(ways_start_reg)), iv
295 i = 1
296 rept ways - 1
297 pxor @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]
298 i = i + 1
299 endm
300 movdqa iv, [rD + ways * 16 - 16]
301 WOP WRITE_TO_DATA
302
303 add rD, ways * 16
304 AesCbc_Decode_HW_start_3::
305 check2:
306 sub rN, ways
307 jnc nextBlocks2
308 add rN, ways
309
310 sub ksize_x, 16
311
312 jmp check
313 nextBlock:
314 movdqa state, [rD]
315 mov koffs_x, ksize_x
316 ; OP_KEY pxor, 1 * ksize_r + 32
317 pxor state, key0
318 ; movdqa state0, [rD]
319 ; movdqa state, key0
320 ; pxor state, state0
321 @@:
322 OP_KEY aesdec, 1 * koffs_r + 16
323 OP_KEY aesdec, 1 * koffs_r
324 sub koffs_r, 32
325 jnz @B
326 OP_KEY aesdec, 16
327 ; OP_KEY aesdeclast, 0
328 aesdeclast state, key_last
329
330 pxor state, iv
331 movdqa iv, [rD]
332 ; movdqa iv, state0
333 movdqa [rD], state
334
335 add rD, 16
336 check:
337 sub rN, 1
338 jnc nextBlock
339
340 movdqa [keys - 32], iv
341MY_EPILOG
342
343
344
345
346; ---------- AVX ----------
347
348
349AVX__WOP_n macro op
350 i = 0
351 rept ways
352 op (ways_start_reg + i)
353 i = i + 1
354 endm
355endm
356
357AVX__WOP macro op
358 i = 0
359 rept ways
360 op @CatStr(ymm, %(ways_start_reg + i))
361 i = i + 1
362 endm
363endm
364
365
366AVX__WOP_KEY macro op:req, offs:req
367 vmovdqa key_ymm, ymmword ptr [keys2 + offs]
368 AVX__WOP_n op
369endm
370
371
372AVX__CBC_START macro reg
373 ; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i]
374 vpxor reg, key0_ymm, ymmword ptr [rD + 32 * i]
375endm
376
377AVX__CBC_END macro reg
378 if i eq 0
379 vpxor reg, reg, iv_ymm
380 else
381 vpxor reg, reg, ymmword ptr [rD + i * 32 - 16]
382 endif
383endm
384
385
386AVX__WRITE_TO_DATA macro reg
387 vmovdqu ymmword ptr [rD + 32 * i], reg
388endm
389
390AVX__XOR_WITH_DATA macro reg
391 vpxor reg, reg, ymmword ptr [rD + 32 * i]
392endm
393
394AVX__CTR_START macro reg
395 vpaddq iv_ymm, iv_ymm, one_ymm
396 ; vpxor reg, iv_ymm, key_ymm
397 vpxor reg, iv_ymm, key0_ymm
398endm
399
400
401MY_VAES_INSTR_2 macro cmd, dest, a1, a2
402 db 0c4H
403 db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)
404 db 5 + 8 * ((not (a1)) and 15)
405 db cmd
406 db 0c0H + 8 * ((dest) and 7) + ((a2) and 7)
407endm
408
409MY_VAES_INSTR macro cmd, dest, a
410 MY_VAES_INSTR_2 cmd, dest, dest, a
411endm
412
413MY_vaesenc macro dest, a
414 MY_VAES_INSTR 0dcH, dest, a
415endm
416MY_vaesenclast macro dest, a
417 MY_VAES_INSTR 0ddH, dest, a
418endm
419MY_vaesdec macro dest, a
420 MY_VAES_INSTR 0deH, dest, a
421endm
422MY_vaesdeclast macro dest, a
423 MY_VAES_INSTR 0dfH, dest, a
424endm
425
426
427AVX__VAES_DEC macro reg
428 MY_vaesdec reg, key_ymm_n
429endm
430
431AVX__VAES_DEC_LAST_key_last macro reg
432 ; MY_vaesdeclast reg, key_ymm_n
433 MY_vaesdeclast reg, key_last_ymm_n
434endm
435
436AVX__VAES_ENC macro reg
437 MY_vaesenc reg, key_ymm_n
438endm
439
440AVX__VAES_ENC_LAST macro reg
441 MY_vaesenclast reg, key_ymm_n
442endm
443
444AVX__vinserti128_TO_HIGH macro dest, src
445 vinserti128 dest, dest, src, 1
446endm
447
448
449MY_PROC AesCbc_Decode_HW_256, 3
450 ifdef use_vaes_256
451 MY_PROLOG NUM_CBC_REGS
452
453 cmp rN, ways * 2
454 jb AesCbc_Decode_HW_start_2
455
456 vmovdqa iv, xmmword ptr [keys]
457 add keys, 32
458
459 vbroadcasti128 key0_ymm, xmmword ptr [keys + 1 * ksize_r]
460 vbroadcasti128 key_last_ymm, xmmword ptr [keys]
461 sub ksize_x, 16
462 mov koffs_x, ksize_x
463 add ksize_x, ksize_x
464
465 AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)
466 push keys2
467 sub r4, AVX_STACK_SUB
468 ; sub r4, 32
469 ; sub r4, ksize_r
470 ; lea keys2, [r4 + 32]
471 mov keys2, r4
472 and keys2, -32
473 broad:
474 vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
475 vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
476 sub koffs_r, 16
477 ; jnc broad
478 jnz broad
479
480 sub rN, ways * 2
481
482 align 16
483 avx_cbcdec_nextBlock2:
484 mov koffs_x, ksize_x
485 ; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32
486 AVX__WOP AVX__CBC_START
487 @@:
488 AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r
489 sub koffs_r, 32
490 jnz @B
491 ; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0
492 AVX__WOP_n AVX__VAES_DEC_LAST_key_last
493
494 AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD]
495 AVX__WOP AVX__CBC_END
496
497 vmovdqa iv, xmmword ptr [rD + ways * 32 - 16]
498 AVX__WOP AVX__WRITE_TO_DATA
499
500 add rD, ways * 32
501 sub rN, ways * 2
502 jnc avx_cbcdec_nextBlock2
503 add rN, ways * 2
504
505 shr ksize_x, 1
506
507 ; lea r4, [r4 + 1 * ksize_r + 32]
508 add r4, AVX_STACK_SUB
509 pop keys2
510
511 vzeroupper
512 jmp AesCbc_Decode_HW_start_3
513 else
514 jmp AesCbc_Decode_HW_start
515 endif
516MY_ENDP
517MY_SEG_ENDP
518
519
520
521
522; ---------- AES-CBC Encode ----------
523
524e0 equ xmm1
525
526CENC_START_KEY equ 2
527CENC_NUM_REG_KEYS equ (3 * 2)
528; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))
529
530MY_SEG_PROC AesCbc_Encode_HW, 3
531 MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)
532
533 movdqa state, [keys]
534 add keys, 32
535
536 i = 0
537 rept CENC_NUM_REG_KEYS
538 movdqa @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]
539 i = i + 1
540 endm
541
542 add keys, ksize_r
543 neg ksize_r
544 add ksize_r, (16 * CENC_NUM_REG_KEYS)
545 ; movdqa last_key, [keys]
546 jmp check_e
547
548 align 16
549 nextBlock_e:
550 movdqa e0, [rD]
551 mov koffs_r, ksize_r
552 pxor e0, @CatStr(xmm, %(CENC_START_KEY))
553 pxor state, e0
554
555 i = 1
556 rept (CENC_NUM_REG_KEYS - 1)
557 aesenc state, @CatStr(xmm, %(CENC_START_KEY + i))
558 i = i + 1
559 endm
560
561 @@:
562 OP_KEY aesenc, 1 * koffs_r
563 OP_KEY aesenc, 1 * koffs_r + 16
564 add koffs_r, 32
565 jnz @B
566 OP_KEY aesenclast, 0
567 ; aesenclast state, last_key
568
569 movdqa [rD], state
570 add rD, 16
571 check_e:
572 sub rN, 1
573 jnc nextBlock_e
574
575 ; movdqa [keys - 32], state
576 movdqa [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state
577MY_EPILOG
578MY_SEG_ENDP
579
580
581
582; ---------- AES-CTR ----------
583
584ifdef x64
585 ; ways = 11
586endif
587
588
589one equ @CatStr(xmm, %(ways_start_reg + ways + 1))
590one_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
591key0 equ @CatStr(xmm, %(ways_start_reg + ways + 2))
592key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
593NUM_CTR_REGS equ (ways_start_reg + ways + 3)
594
595INIT_CTR macro reg, _ppp_
596 paddq iv, one
597 movdqa reg, iv
598endm
599
600
601MY_SEG_PROC AesCtr_Code_HW, 3
602 Ctr_start::
603 MY_PROLOG NUM_CTR_REGS
604
605 Ctr_start_2::
606 movdqa iv, [keys]
607 add keys, 32
608 movdqa key0, [keys]
609
610 add keys, ksize_r
611 neg ksize_r
612 add ksize_r, 16
613
614 Ctr_start_3::
615 mov koffs_x, 1
616 movd one, koffs_x
617 jmp check2_c
618
619 align 16
620 nextBlocks2_c:
621 WOP INIT_CTR, 0
622 mov koffs_r, ksize_r
623 ; WOP_KEY pxor, 1 * koffs_r -16
624 WOP pxor, key0
625 @@:
626 WOP_KEY aesenc, 1 * koffs_r
627 add koffs_r, 16
628 jnz @B
629 WOP_KEY aesenclast, 0
630
631 WOP XOR_WITH_DATA
632 WOP WRITE_TO_DATA
633 add rD, ways * 16
634 check2_c:
635 sub rN, ways
636 jnc nextBlocks2_c
637 add rN, ways
638
639 sub keys, 16
640 add ksize_r, 16
641
642 jmp check_c
643
644 ; align 16
645 nextBlock_c:
646 paddq iv, one
647 ; movdqa state, [keys + 1 * koffs_r - 16]
648 movdqa state, key0
649 mov koffs_r, ksize_r
650 pxor state, iv
651
652 @@:
653 OP_KEY aesenc, 1 * koffs_r
654 OP_KEY aesenc, 1 * koffs_r + 16
655 add koffs_r, 32
656 jnz @B
657 OP_KEY aesenc, 0
658 OP_KEY aesenclast, 16
659
660 pxor state, [rD]
661 movdqa [rD], state
662 add rD, 16
663 check_c:
664 sub rN, 1
665 jnc nextBlock_c
666
667 ; movdqa [keys - 32], iv
668 movdqa [keys + 1 * ksize_r - 16 - 32], iv
669MY_EPILOG
670
671
672MY_PROC AesCtr_Code_HW_256, 3
673 ifdef use_vaes_256
674 MY_PROLOG NUM_CTR_REGS
675
676 cmp rN, ways * 2
677 jb Ctr_start_2
678
679 vbroadcasti128 iv_ymm, xmmword ptr [keys]
680 add keys, 32
681 vbroadcasti128 key0_ymm, xmmword ptr [keys]
682 mov koffs_x, 1
683 vmovd one, koffs_x
684 vpsubq iv_ymm, iv_ymm, one_ymm
685 vpaddq one, one, one
686 AVX__vinserti128_TO_HIGH one_ymm, one
687
688 add keys, ksize_r
689 sub ksize_x, 16
690 neg ksize_r
691 mov koffs_r, ksize_r
692 add ksize_r, ksize_r
693
694 AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)
695 push keys2
696 lea keys2, [r4 - 32]
697 sub r4, AVX_STACK_SUB
698 and keys2, -32
699 vbroadcasti128 key_ymm, xmmword ptr [keys]
700 vmovdqa ymmword ptr [keys2], key_ymm
701 @@:
702 vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
703 vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
704 add koffs_r, 16
705 jnz @B
706
707 sub rN, ways * 2
708
709 align 16
710 avx_ctr_nextBlock2:
711 mov koffs_r, ksize_r
712 AVX__WOP AVX__CTR_START
713 ; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32
714 @@:
715 AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r
716 add koffs_r, 32
717 jnz @B
718 AVX__WOP_KEY AVX__VAES_ENC_LAST, 0
719
720 AVX__WOP AVX__XOR_WITH_DATA
721 AVX__WOP AVX__WRITE_TO_DATA
722
723 add rD, ways * 32
724 sub rN, ways * 2
725 jnc avx_ctr_nextBlock2
726 add rN, ways * 2
727
728 vextracti128 iv, iv_ymm, 1
729 sar ksize_r, 1
730
731 add r4, AVX_STACK_SUB
732 pop keys2
733
734 vzeroupper
735 jmp Ctr_start_3
736 else
737 jmp Ctr_start
738 endif
739MY_ENDP
740MY_SEG_ENDP
741
742end