summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/aes
diff options
context:
space:
mode:
authordjm <>2010-10-01 22:54:21 +0000
committerdjm <>2010-10-01 22:54:21 +0000
commit2ea67f4aa254b09ded62e6e14fc893bbe6381579 (patch)
treebb3923b81f2ce34b1ad62684afdf1a94d904c185 /src/lib/libcrypto/aes
parent6ddfb710ab14b10183ff3a6a32f643554c80065e (diff)
parent829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2 (diff)
downloadopenbsd-2ea67f4aa254b09ded62e6e14fc893bbe6381579.tar.gz
openbsd-2ea67f4aa254b09ded62e6e14fc893bbe6381579.tar.bz2
openbsd-2ea67f4aa254b09ded62e6e14fc893bbe6381579.zip
This commit was generated by cvs2git to track changes on a CVS vendor
branch.
Diffstat (limited to 'src/lib/libcrypto/aes')
-rw-r--r--src/lib/libcrypto/aes/aes_ige.c12
-rw-r--r--src/lib/libcrypto/aes/asm/aes-armv4.pl1
-rw-r--r--src/lib/libcrypto/aes/asm/aes-ppc.pl269
-rw-r--r--src/lib/libcrypto/aes/asm/aes-s390x.pl6
-rwxr-xr-xsrc/lib/libcrypto/aes/asm/aes-x86_64.pl2012
5 files changed, 1775 insertions, 525 deletions
diff --git a/src/lib/libcrypto/aes/aes_ige.c b/src/lib/libcrypto/aes/aes_ige.c
index 45d7096181..c161351e65 100644
--- a/src/lib/libcrypto/aes/aes_ige.c
+++ b/src/lib/libcrypto/aes/aes_ige.c
@@ -77,11 +77,11 @@ typedef struct {
77/* N.B. The IV for this mode is _twice_ the block size */ 77/* N.B. The IV for this mode is _twice_ the block size */
78 78
79void AES_ige_encrypt(const unsigned char *in, unsigned char *out, 79void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
80 const unsigned long length, const AES_KEY *key, 80 size_t length, const AES_KEY *key,
81 unsigned char *ivec, const int enc) 81 unsigned char *ivec, const int enc)
82 { 82 {
83 unsigned long n; 83 size_t n;
84 unsigned long len; 84 size_t len = length;
85 85
86 OPENSSL_assert(in && out && key && ivec); 86 OPENSSL_assert(in && out && key && ivec);
87 OPENSSL_assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc)); 87 OPENSSL_assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc));
@@ -211,12 +211,12 @@ void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
211/* N.B. The IV for this mode is _four times_ the block size */ 211/* N.B. The IV for this mode is _four times_ the block size */
212 212
213void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out, 213void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out,
214 const unsigned long length, const AES_KEY *key, 214 size_t length, const AES_KEY *key,
215 const AES_KEY *key2, const unsigned char *ivec, 215 const AES_KEY *key2, const unsigned char *ivec,
216 const int enc) 216 const int enc)
217 { 217 {
218 unsigned long n; 218 size_t n;
219 unsigned long len = length; 219 size_t len = length;
220 unsigned char tmp[AES_BLOCK_SIZE]; 220 unsigned char tmp[AES_BLOCK_SIZE];
221 unsigned char tmp2[AES_BLOCK_SIZE]; 221 unsigned char tmp2[AES_BLOCK_SIZE];
222 unsigned char tmp3[AES_BLOCK_SIZE]; 222 unsigned char tmp3[AES_BLOCK_SIZE];
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl
index 15742c1ec5..690244111a 100644
--- a/src/lib/libcrypto/aes/asm/aes-armv4.pl
+++ b/src/lib/libcrypto/aes/asm/aes-armv4.pl
@@ -1024,6 +1024,7 @@ _armv4_AES_decrypt:
1024 mov pc,lr @ return 1024 mov pc,lr @ return
1025.size _armv4_AES_decrypt,.-_armv4_AES_decrypt 1025.size _armv4_AES_decrypt,.-_armv4_AES_decrypt
1026.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" 1026.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
1027.align 2
1027___ 1028___
1028 1029
1029$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 1030$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl
index ce427655ef..f82c5e1814 100644
--- a/src/lib/libcrypto/aes/asm/aes-ppc.pl
+++ b/src/lib/libcrypto/aes/asm/aes-ppc.pl
@@ -16,6 +16,19 @@
16# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - 16# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact -
17# at 1/3 of ppc_AES_decrypt. 17# at 1/3 of ppc_AES_decrypt.
18 18
19# February 2010
20#
21# Rescheduling instructions to favour Power6 pipeline gives 10%
22# performance improvement on the platform in question (and marginal
23# improvement even on others). It should be noted that Power6 fails
24# to process byte in 18 cycles, only in 23, because it fails to issue
25# 4 load instructions in two cycles, only in 3. As a result, non-compact
26# block subroutines are 25% slower than one would expect. Compact
27# functions scale better, because they have pure computational part,
28# which scales perfectly with clock frequency. To be specific
29# ppc_AES_encrypt_compact operates at 42 cycles per byte, while
30# ppc_AES_decrypt_compact - at 55 (in 64-bit build).
31
19$flavour = shift; 32$flavour = shift;
20 33
21if ($flavour =~ /64/) { 34if ($flavour =~ /64/) {
@@ -376,7 +389,7 @@ $code.=<<___;
376 addi $sp,$sp,$FRAME 389 addi $sp,$sp,$FRAME
377 blr 390 blr
378 391
379.align 4 392.align 5
380Lppc_AES_encrypt: 393Lppc_AES_encrypt:
381 lwz $acc00,240($key) 394 lwz $acc00,240($key)
382 lwz $t0,0($key) 395 lwz $t0,0($key)
@@ -397,46 +410,46 @@ Lppc_AES_encrypt:
397Lenc_loop: 410Lenc_loop:
398 rlwinm $acc00,$s0,`32-24+3`,21,28 411 rlwinm $acc00,$s0,`32-24+3`,21,28
399 rlwinm $acc01,$s1,`32-24+3`,21,28 412 rlwinm $acc01,$s1,`32-24+3`,21,28
400 lwz $t0,0($key)
401 lwz $t1,4($key)
402 rlwinm $acc02,$s2,`32-24+3`,21,28 413 rlwinm $acc02,$s2,`32-24+3`,21,28
403 rlwinm $acc03,$s3,`32-24+3`,21,28 414 rlwinm $acc03,$s3,`32-24+3`,21,28
404 lwz $t2,8($key) 415 lwz $t0,0($key)
405 lwz $t3,12($key) 416 lwz $t1,4($key)
406 rlwinm $acc04,$s1,`32-16+3`,21,28 417 rlwinm $acc04,$s1,`32-16+3`,21,28
407 rlwinm $acc05,$s2,`32-16+3`,21,28 418 rlwinm $acc05,$s2,`32-16+3`,21,28
408 lwzx $acc00,$Tbl0,$acc00 419 lwz $t2,8($key)
409 lwzx $acc01,$Tbl0,$acc01 420 lwz $t3,12($key)
410 rlwinm $acc06,$s3,`32-16+3`,21,28 421 rlwinm $acc06,$s3,`32-16+3`,21,28
411 rlwinm $acc07,$s0,`32-16+3`,21,28 422 rlwinm $acc07,$s0,`32-16+3`,21,28
412 lwzx $acc02,$Tbl0,$acc02 423 lwzx $acc00,$Tbl0,$acc00
413 lwzx $acc03,$Tbl0,$acc03 424 lwzx $acc01,$Tbl0,$acc01
414 rlwinm $acc08,$s2,`32-8+3`,21,28 425 rlwinm $acc08,$s2,`32-8+3`,21,28
415 rlwinm $acc09,$s3,`32-8+3`,21,28 426 rlwinm $acc09,$s3,`32-8+3`,21,28
416 lwzx $acc04,$Tbl1,$acc04 427 lwzx $acc02,$Tbl0,$acc02
417 lwzx $acc05,$Tbl1,$acc05 428 lwzx $acc03,$Tbl0,$acc03
418 rlwinm $acc10,$s0,`32-8+3`,21,28 429 rlwinm $acc10,$s0,`32-8+3`,21,28
419 rlwinm $acc11,$s1,`32-8+3`,21,28 430 rlwinm $acc11,$s1,`32-8+3`,21,28
420 lwzx $acc06,$Tbl1,$acc06 431 lwzx $acc04,$Tbl1,$acc04
421 lwzx $acc07,$Tbl1,$acc07 432 lwzx $acc05,$Tbl1,$acc05
422 rlwinm $acc12,$s3,`0+3`,21,28 433 rlwinm $acc12,$s3,`0+3`,21,28
423 rlwinm $acc13,$s0,`0+3`,21,28 434 rlwinm $acc13,$s0,`0+3`,21,28
424 lwzx $acc08,$Tbl2,$acc08 435 lwzx $acc06,$Tbl1,$acc06
425 lwzx $acc09,$Tbl2,$acc09 436 lwzx $acc07,$Tbl1,$acc07
426 rlwinm $acc14,$s1,`0+3`,21,28 437 rlwinm $acc14,$s1,`0+3`,21,28
427 rlwinm $acc15,$s2,`0+3`,21,28 438 rlwinm $acc15,$s2,`0+3`,21,28
428 lwzx $acc10,$Tbl2,$acc10 439 lwzx $acc08,$Tbl2,$acc08
429 lwzx $acc11,$Tbl2,$acc11 440 lwzx $acc09,$Tbl2,$acc09
430 xor $t0,$t0,$acc00 441 xor $t0,$t0,$acc00
431 xor $t1,$t1,$acc01 442 xor $t1,$t1,$acc01
432 lwzx $acc12,$Tbl3,$acc12 443 lwzx $acc10,$Tbl2,$acc10
433 lwzx $acc13,$Tbl3,$acc13 444 lwzx $acc11,$Tbl2,$acc11
434 xor $t2,$t2,$acc02 445 xor $t2,$t2,$acc02
435 xor $t3,$t3,$acc03 446 xor $t3,$t3,$acc03
436 lwzx $acc14,$Tbl3,$acc14 447 lwzx $acc12,$Tbl3,$acc12
437 lwzx $acc15,$Tbl3,$acc15 448 lwzx $acc13,$Tbl3,$acc13
438 xor $t0,$t0,$acc04 449 xor $t0,$t0,$acc04
439 xor $t1,$t1,$acc05 450 xor $t1,$t1,$acc05
451 lwzx $acc14,$Tbl3,$acc14
452 lwzx $acc15,$Tbl3,$acc15
440 xor $t2,$t2,$acc06 453 xor $t2,$t2,$acc06
441 xor $t3,$t3,$acc07 454 xor $t3,$t3,$acc07
442 xor $t0,$t0,$acc08 455 xor $t0,$t0,$acc08
@@ -452,60 +465,60 @@ Lenc_loop:
452 465
453 addi $Tbl2,$Tbl0,2048 466 addi $Tbl2,$Tbl0,2048
454 nop 467 nop
455 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
456 lwz $acc09,`2048+32`($Tbl0)
457 lwz $acc10,`2048+64`($Tbl0)
458 lwz $acc11,`2048+96`($Tbl0)
459 lwz $acc08,`2048+128`($Tbl0)
460 lwz $acc09,`2048+160`($Tbl0)
461 lwz $acc10,`2048+192`($Tbl0)
462 lwz $acc11,`2048+224`($Tbl0)
463 rlwinm $acc00,$s0,`32-24`,24,31
464 rlwinm $acc01,$s1,`32-24`,24,31
465 lwz $t0,0($key) 468 lwz $t0,0($key)
466 lwz $t1,4($key) 469 lwz $t1,4($key)
467 rlwinm $acc02,$s2,`32-24`,24,31 470 rlwinm $acc00,$s0,`32-24`,24,31
468 rlwinm $acc03,$s3,`32-24`,24,31 471 rlwinm $acc01,$s1,`32-24`,24,31
469 lwz $t2,8($key) 472 lwz $t2,8($key)
470 lwz $t3,12($key) 473 lwz $t3,12($key)
474 rlwinm $acc02,$s2,`32-24`,24,31
475 rlwinm $acc03,$s3,`32-24`,24,31
476 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
477 lwz $acc09,`2048+32`($Tbl0)
471 rlwinm $acc04,$s1,`32-16`,24,31 478 rlwinm $acc04,$s1,`32-16`,24,31
472 rlwinm $acc05,$s2,`32-16`,24,31 479 rlwinm $acc05,$s2,`32-16`,24,31
473 lbzx $acc00,$Tbl2,$acc00 480 lwz $acc10,`2048+64`($Tbl0)
474 lbzx $acc01,$Tbl2,$acc01 481 lwz $acc11,`2048+96`($Tbl0)
475 rlwinm $acc06,$s3,`32-16`,24,31 482 rlwinm $acc06,$s3,`32-16`,24,31
476 rlwinm $acc07,$s0,`32-16`,24,31 483 rlwinm $acc07,$s0,`32-16`,24,31
477 lbzx $acc02,$Tbl2,$acc02 484 lwz $acc12,`2048+128`($Tbl0)
478 lbzx $acc03,$Tbl2,$acc03 485 lwz $acc13,`2048+160`($Tbl0)
479 rlwinm $acc08,$s2,`32-8`,24,31 486 rlwinm $acc08,$s2,`32-8`,24,31
480 rlwinm $acc09,$s3,`32-8`,24,31 487 rlwinm $acc09,$s3,`32-8`,24,31
481 lbzx $acc04,$Tbl2,$acc04 488 lwz $acc14,`2048+192`($Tbl0)
482 lbzx $acc05,$Tbl2,$acc05 489 lwz $acc15,`2048+224`($Tbl0)
483 rlwinm $acc10,$s0,`32-8`,24,31 490 rlwinm $acc10,$s0,`32-8`,24,31
484 rlwinm $acc11,$s1,`32-8`,24,31 491 rlwinm $acc11,$s1,`32-8`,24,31
485 lbzx $acc06,$Tbl2,$acc06 492 lbzx $acc00,$Tbl2,$acc00
486 lbzx $acc07,$Tbl2,$acc07 493 lbzx $acc01,$Tbl2,$acc01
487 rlwinm $acc12,$s3,`0`,24,31 494 rlwinm $acc12,$s3,`0`,24,31
488 rlwinm $acc13,$s0,`0`,24,31 495 rlwinm $acc13,$s0,`0`,24,31
489 lbzx $acc08,$Tbl2,$acc08 496 lbzx $acc02,$Tbl2,$acc02
490 lbzx $acc09,$Tbl2,$acc09 497 lbzx $acc03,$Tbl2,$acc03
491 rlwinm $acc14,$s1,`0`,24,31 498 rlwinm $acc14,$s1,`0`,24,31
492 rlwinm $acc15,$s2,`0`,24,31 499 rlwinm $acc15,$s2,`0`,24,31
493 lbzx $acc10,$Tbl2,$acc10 500 lbzx $acc04,$Tbl2,$acc04
494 lbzx $acc11,$Tbl2,$acc11 501 lbzx $acc05,$Tbl2,$acc05
495 rlwinm $s0,$acc00,24,0,7 502 rlwinm $s0,$acc00,24,0,7
496 rlwinm $s1,$acc01,24,0,7 503 rlwinm $s1,$acc01,24,0,7
497 lbzx $acc12,$Tbl2,$acc12 504 lbzx $acc06,$Tbl2,$acc06
498 lbzx $acc13,$Tbl2,$acc13 505 lbzx $acc07,$Tbl2,$acc07
499 rlwinm $s2,$acc02,24,0,7 506 rlwinm $s2,$acc02,24,0,7
500 rlwinm $s3,$acc03,24,0,7 507 rlwinm $s3,$acc03,24,0,7
501 lbzx $acc14,$Tbl2,$acc14 508 lbzx $acc08,$Tbl2,$acc08
502 lbzx $acc15,$Tbl2,$acc15 509 lbzx $acc09,$Tbl2,$acc09
503 rlwimi $s0,$acc04,16,8,15 510 rlwimi $s0,$acc04,16,8,15
504 rlwimi $s1,$acc05,16,8,15 511 rlwimi $s1,$acc05,16,8,15
512 lbzx $acc10,$Tbl2,$acc10
513 lbzx $acc11,$Tbl2,$acc11
505 rlwimi $s2,$acc06,16,8,15 514 rlwimi $s2,$acc06,16,8,15
506 rlwimi $s3,$acc07,16,8,15 515 rlwimi $s3,$acc07,16,8,15
516 lbzx $acc12,$Tbl2,$acc12
517 lbzx $acc13,$Tbl2,$acc13
507 rlwimi $s0,$acc08,8,16,23 518 rlwimi $s0,$acc08,8,16,23
508 rlwimi $s1,$acc09,8,16,23 519 rlwimi $s1,$acc09,8,16,23
520 lbzx $acc14,$Tbl2,$acc14
521 lbzx $acc15,$Tbl2,$acc15
509 rlwimi $s2,$acc10,8,16,23 522 rlwimi $s2,$acc10,8,16,23
510 rlwimi $s3,$acc11,8,16,23 523 rlwimi $s3,$acc11,8,16,23
511 or $s0,$s0,$acc12 524 or $s0,$s0,$acc12
@@ -542,40 +555,40 @@ Lenc_compact_loop:
542 rlwinm $acc01,$s1,`32-24`,24,31 555 rlwinm $acc01,$s1,`32-24`,24,31
543 rlwinm $acc02,$s2,`32-24`,24,31 556 rlwinm $acc02,$s2,`32-24`,24,31
544 rlwinm $acc03,$s3,`32-24`,24,31 557 rlwinm $acc03,$s3,`32-24`,24,31
545 lbzx $acc00,$Tbl1,$acc00
546 lbzx $acc01,$Tbl1,$acc01
547 rlwinm $acc04,$s1,`32-16`,24,31 558 rlwinm $acc04,$s1,`32-16`,24,31
548 rlwinm $acc05,$s2,`32-16`,24,31 559 rlwinm $acc05,$s2,`32-16`,24,31
549 lbzx $acc02,$Tbl1,$acc02
550 lbzx $acc03,$Tbl1,$acc03
551 rlwinm $acc06,$s3,`32-16`,24,31 560 rlwinm $acc06,$s3,`32-16`,24,31
552 rlwinm $acc07,$s0,`32-16`,24,31 561 rlwinm $acc07,$s0,`32-16`,24,31
553 lbzx $acc04,$Tbl1,$acc04 562 lbzx $acc00,$Tbl1,$acc00
554 lbzx $acc05,$Tbl1,$acc05 563 lbzx $acc01,$Tbl1,$acc01
555 rlwinm $acc08,$s2,`32-8`,24,31 564 rlwinm $acc08,$s2,`32-8`,24,31
556 rlwinm $acc09,$s3,`32-8`,24,31 565 rlwinm $acc09,$s3,`32-8`,24,31
557 lbzx $acc06,$Tbl1,$acc06 566 lbzx $acc02,$Tbl1,$acc02
558 lbzx $acc07,$Tbl1,$acc07 567 lbzx $acc03,$Tbl1,$acc03
559 rlwinm $acc10,$s0,`32-8`,24,31 568 rlwinm $acc10,$s0,`32-8`,24,31
560 rlwinm $acc11,$s1,`32-8`,24,31 569 rlwinm $acc11,$s1,`32-8`,24,31
561 lbzx $acc08,$Tbl1,$acc08 570 lbzx $acc04,$Tbl1,$acc04
562 lbzx $acc09,$Tbl1,$acc09 571 lbzx $acc05,$Tbl1,$acc05
563 rlwinm $acc12,$s3,`0`,24,31 572 rlwinm $acc12,$s3,`0`,24,31
564 rlwinm $acc13,$s0,`0`,24,31 573 rlwinm $acc13,$s0,`0`,24,31
565 lbzx $acc10,$Tbl1,$acc10 574 lbzx $acc06,$Tbl1,$acc06
566 lbzx $acc11,$Tbl1,$acc11 575 lbzx $acc07,$Tbl1,$acc07
567 rlwinm $acc14,$s1,`0`,24,31 576 rlwinm $acc14,$s1,`0`,24,31
568 rlwinm $acc15,$s2,`0`,24,31 577 rlwinm $acc15,$s2,`0`,24,31
569 lbzx $acc12,$Tbl1,$acc12 578 lbzx $acc08,$Tbl1,$acc08
570 lbzx $acc13,$Tbl1,$acc13 579 lbzx $acc09,$Tbl1,$acc09
571 rlwinm $s0,$acc00,24,0,7 580 rlwinm $s0,$acc00,24,0,7
572 rlwinm $s1,$acc01,24,0,7 581 rlwinm $s1,$acc01,24,0,7
573 lbzx $acc14,$Tbl1,$acc14 582 lbzx $acc10,$Tbl1,$acc10
574 lbzx $acc15,$Tbl1,$acc15 583 lbzx $acc11,$Tbl1,$acc11
575 rlwinm $s2,$acc02,24,0,7 584 rlwinm $s2,$acc02,24,0,7
576 rlwinm $s3,$acc03,24,0,7 585 rlwinm $s3,$acc03,24,0,7
586 lbzx $acc12,$Tbl1,$acc12
587 lbzx $acc13,$Tbl1,$acc13
577 rlwimi $s0,$acc04,16,8,15 588 rlwimi $s0,$acc04,16,8,15
578 rlwimi $s1,$acc05,16,8,15 589 rlwimi $s1,$acc05,16,8,15
590 lbzx $acc14,$Tbl1,$acc14
591 lbzx $acc15,$Tbl1,$acc15
579 rlwimi $s2,$acc06,16,8,15 592 rlwimi $s2,$acc06,16,8,15
580 rlwimi $s3,$acc07,16,8,15 593 rlwimi $s3,$acc07,16,8,15
581 rlwimi $s0,$acc08,8,16,23 594 rlwimi $s0,$acc08,8,16,23
@@ -725,7 +738,7 @@ Lenc_compact_done:
725 addi $sp,$sp,$FRAME 738 addi $sp,$sp,$FRAME
726 blr 739 blr
727 740
728.align 4 741.align 5
729Lppc_AES_decrypt: 742Lppc_AES_decrypt:
730 lwz $acc00,240($key) 743 lwz $acc00,240($key)
731 lwz $t0,0($key) 744 lwz $t0,0($key)
@@ -746,46 +759,46 @@ Lppc_AES_decrypt:
746Ldec_loop: 759Ldec_loop:
747 rlwinm $acc00,$s0,`32-24+3`,21,28 760 rlwinm $acc00,$s0,`32-24+3`,21,28
748 rlwinm $acc01,$s1,`32-24+3`,21,28 761 rlwinm $acc01,$s1,`32-24+3`,21,28
749 lwz $t0,0($key)
750 lwz $t1,4($key)
751 rlwinm $acc02,$s2,`32-24+3`,21,28 762 rlwinm $acc02,$s2,`32-24+3`,21,28
752 rlwinm $acc03,$s3,`32-24+3`,21,28 763 rlwinm $acc03,$s3,`32-24+3`,21,28
753 lwz $t2,8($key) 764 lwz $t0,0($key)
754 lwz $t3,12($key) 765 lwz $t1,4($key)
755 rlwinm $acc04,$s3,`32-16+3`,21,28 766 rlwinm $acc04,$s3,`32-16+3`,21,28
756 rlwinm $acc05,$s0,`32-16+3`,21,28 767 rlwinm $acc05,$s0,`32-16+3`,21,28
757 lwzx $acc00,$Tbl0,$acc00 768 lwz $t2,8($key)
758 lwzx $acc01,$Tbl0,$acc01 769 lwz $t3,12($key)
759 rlwinm $acc06,$s1,`32-16+3`,21,28 770 rlwinm $acc06,$s1,`32-16+3`,21,28
760 rlwinm $acc07,$s2,`32-16+3`,21,28 771 rlwinm $acc07,$s2,`32-16+3`,21,28
761 lwzx $acc02,$Tbl0,$acc02 772 lwzx $acc00,$Tbl0,$acc00
762 lwzx $acc03,$Tbl0,$acc03 773 lwzx $acc01,$Tbl0,$acc01
763 rlwinm $acc08,$s2,`32-8+3`,21,28 774 rlwinm $acc08,$s2,`32-8+3`,21,28
764 rlwinm $acc09,$s3,`32-8+3`,21,28 775 rlwinm $acc09,$s3,`32-8+3`,21,28
765 lwzx $acc04,$Tbl1,$acc04 776 lwzx $acc02,$Tbl0,$acc02
766 lwzx $acc05,$Tbl1,$acc05 777 lwzx $acc03,$Tbl0,$acc03
767 rlwinm $acc10,$s0,`32-8+3`,21,28 778 rlwinm $acc10,$s0,`32-8+3`,21,28
768 rlwinm $acc11,$s1,`32-8+3`,21,28 779 rlwinm $acc11,$s1,`32-8+3`,21,28
769 lwzx $acc06,$Tbl1,$acc06 780 lwzx $acc04,$Tbl1,$acc04
770 lwzx $acc07,$Tbl1,$acc07 781 lwzx $acc05,$Tbl1,$acc05
771 rlwinm $acc12,$s1,`0+3`,21,28 782 rlwinm $acc12,$s1,`0+3`,21,28
772 rlwinm $acc13,$s2,`0+3`,21,28 783 rlwinm $acc13,$s2,`0+3`,21,28
773 lwzx $acc08,$Tbl2,$acc08 784 lwzx $acc06,$Tbl1,$acc06
774 lwzx $acc09,$Tbl2,$acc09 785 lwzx $acc07,$Tbl1,$acc07
775 rlwinm $acc14,$s3,`0+3`,21,28 786 rlwinm $acc14,$s3,`0+3`,21,28
776 rlwinm $acc15,$s0,`0+3`,21,28 787 rlwinm $acc15,$s0,`0+3`,21,28
777 lwzx $acc10,$Tbl2,$acc10 788 lwzx $acc08,$Tbl2,$acc08
778 lwzx $acc11,$Tbl2,$acc11 789 lwzx $acc09,$Tbl2,$acc09
779 xor $t0,$t0,$acc00 790 xor $t0,$t0,$acc00
780 xor $t1,$t1,$acc01 791 xor $t1,$t1,$acc01
781 lwzx $acc12,$Tbl3,$acc12 792 lwzx $acc10,$Tbl2,$acc10
782 lwzx $acc13,$Tbl3,$acc13 793 lwzx $acc11,$Tbl2,$acc11
783 xor $t2,$t2,$acc02 794 xor $t2,$t2,$acc02
784 xor $t3,$t3,$acc03 795 xor $t3,$t3,$acc03
785 lwzx $acc14,$Tbl3,$acc14 796 lwzx $acc12,$Tbl3,$acc12
786 lwzx $acc15,$Tbl3,$acc15 797 lwzx $acc13,$Tbl3,$acc13
787 xor $t0,$t0,$acc04 798 xor $t0,$t0,$acc04
788 xor $t1,$t1,$acc05 799 xor $t1,$t1,$acc05
800 lwzx $acc14,$Tbl3,$acc14
801 lwzx $acc15,$Tbl3,$acc15
789 xor $t2,$t2,$acc06 802 xor $t2,$t2,$acc06
790 xor $t3,$t3,$acc07 803 xor $t3,$t3,$acc07
791 xor $t0,$t0,$acc08 804 xor $t0,$t0,$acc08
@@ -801,56 +814,56 @@ Ldec_loop:
801 814
802 addi $Tbl2,$Tbl0,2048 815 addi $Tbl2,$Tbl0,2048
803 nop 816 nop
804 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
805 lwz $acc09,`2048+32`($Tbl0)
806 lwz $acc10,`2048+64`($Tbl0)
807 lwz $acc11,`2048+96`($Tbl0)
808 lwz $acc08,`2048+128`($Tbl0)
809 lwz $acc09,`2048+160`($Tbl0)
810 lwz $acc10,`2048+192`($Tbl0)
811 lwz $acc11,`2048+224`($Tbl0)
812 rlwinm $acc00,$s0,`32-24`,24,31
813 rlwinm $acc01,$s1,`32-24`,24,31
814 lwz $t0,0($key) 817 lwz $t0,0($key)
815 lwz $t1,4($key) 818 lwz $t1,4($key)
816 rlwinm $acc02,$s2,`32-24`,24,31 819 rlwinm $acc00,$s0,`32-24`,24,31
817 rlwinm $acc03,$s3,`32-24`,24,31 820 rlwinm $acc01,$s1,`32-24`,24,31
818 lwz $t2,8($key) 821 lwz $t2,8($key)
819 lwz $t3,12($key) 822 lwz $t3,12($key)
823 rlwinm $acc02,$s2,`32-24`,24,31
824 rlwinm $acc03,$s3,`32-24`,24,31
825 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
826 lwz $acc09,`2048+32`($Tbl0)
820 rlwinm $acc04,$s3,`32-16`,24,31 827 rlwinm $acc04,$s3,`32-16`,24,31
821 rlwinm $acc05,$s0,`32-16`,24,31 828 rlwinm $acc05,$s0,`32-16`,24,31
829 lwz $acc10,`2048+64`($Tbl0)
830 lwz $acc11,`2048+96`($Tbl0)
822 lbzx $acc00,$Tbl2,$acc00 831 lbzx $acc00,$Tbl2,$acc00
823 lbzx $acc01,$Tbl2,$acc01 832 lbzx $acc01,$Tbl2,$acc01
833 lwz $acc12,`2048+128`($Tbl0)
834 lwz $acc13,`2048+160`($Tbl0)
824 rlwinm $acc06,$s1,`32-16`,24,31 835 rlwinm $acc06,$s1,`32-16`,24,31
825 rlwinm $acc07,$s2,`32-16`,24,31 836 rlwinm $acc07,$s2,`32-16`,24,31
826 lbzx $acc02,$Tbl2,$acc02 837 lwz $acc14,`2048+192`($Tbl0)
827 lbzx $acc03,$Tbl2,$acc03 838 lwz $acc15,`2048+224`($Tbl0)
828 rlwinm $acc08,$s2,`32-8`,24,31 839 rlwinm $acc08,$s2,`32-8`,24,31
829 rlwinm $acc09,$s3,`32-8`,24,31 840 rlwinm $acc09,$s3,`32-8`,24,31
830 lbzx $acc04,$Tbl2,$acc04 841 lbzx $acc02,$Tbl2,$acc02
831 lbzx $acc05,$Tbl2,$acc05 842 lbzx $acc03,$Tbl2,$acc03
832 rlwinm $acc10,$s0,`32-8`,24,31 843 rlwinm $acc10,$s0,`32-8`,24,31
833 rlwinm $acc11,$s1,`32-8`,24,31 844 rlwinm $acc11,$s1,`32-8`,24,31
834 lbzx $acc06,$Tbl2,$acc06 845 lbzx $acc04,$Tbl2,$acc04
835 lbzx $acc07,$Tbl2,$acc07 846 lbzx $acc05,$Tbl2,$acc05
836 rlwinm $acc12,$s1,`0`,24,31 847 rlwinm $acc12,$s1,`0`,24,31
837 rlwinm $acc13,$s2,`0`,24,31 848 rlwinm $acc13,$s2,`0`,24,31
838 lbzx $acc08,$Tbl2,$acc08 849 lbzx $acc06,$Tbl2,$acc06
839 lbzx $acc09,$Tbl2,$acc09 850 lbzx $acc07,$Tbl2,$acc07
840 rlwinm $acc14,$s3,`0`,24,31 851 rlwinm $acc14,$s3,`0`,24,31
841 rlwinm $acc15,$s0,`0`,24,31 852 rlwinm $acc15,$s0,`0`,24,31
842 lbzx $acc10,$Tbl2,$acc10 853 lbzx $acc08,$Tbl2,$acc08
843 lbzx $acc11,$Tbl2,$acc11 854 lbzx $acc09,$Tbl2,$acc09
844 rlwinm $s0,$acc00,24,0,7 855 rlwinm $s0,$acc00,24,0,7
845 rlwinm $s1,$acc01,24,0,7 856 rlwinm $s1,$acc01,24,0,7
846 lbzx $acc12,$Tbl2,$acc12 857 lbzx $acc10,$Tbl2,$acc10
847 lbzx $acc13,$Tbl2,$acc13 858 lbzx $acc11,$Tbl2,$acc11
848 rlwinm $s2,$acc02,24,0,7 859 rlwinm $s2,$acc02,24,0,7
849 rlwinm $s3,$acc03,24,0,7 860 rlwinm $s3,$acc03,24,0,7
850 lbzx $acc14,$Tbl2,$acc14 861 lbzx $acc12,$Tbl2,$acc12
851 lbzx $acc15,$Tbl2,$acc15 862 lbzx $acc13,$Tbl2,$acc13
852 rlwimi $s0,$acc04,16,8,15 863 rlwimi $s0,$acc04,16,8,15
853 rlwimi $s1,$acc05,16,8,15 864 rlwimi $s1,$acc05,16,8,15
865 lbzx $acc14,$Tbl2,$acc14
866 lbzx $acc15,$Tbl2,$acc15
854 rlwimi $s2,$acc06,16,8,15 867 rlwimi $s2,$acc06,16,8,15
855 rlwimi $s3,$acc07,16,8,15 868 rlwimi $s3,$acc07,16,8,15
856 rlwimi $s0,$acc08,8,16,23 869 rlwimi $s0,$acc08,8,16,23
@@ -897,40 +910,40 @@ Ldec_compact_loop:
897 rlwinm $acc01,$s1,`32-24`,24,31 910 rlwinm $acc01,$s1,`32-24`,24,31
898 rlwinm $acc02,$s2,`32-24`,24,31 911 rlwinm $acc02,$s2,`32-24`,24,31
899 rlwinm $acc03,$s3,`32-24`,24,31 912 rlwinm $acc03,$s3,`32-24`,24,31
900 lbzx $acc00,$Tbl1,$acc00
901 lbzx $acc01,$Tbl1,$acc01
902 rlwinm $acc04,$s3,`32-16`,24,31 913 rlwinm $acc04,$s3,`32-16`,24,31
903 rlwinm $acc05,$s0,`32-16`,24,31 914 rlwinm $acc05,$s0,`32-16`,24,31
904 lbzx $acc02,$Tbl1,$acc02
905 lbzx $acc03,$Tbl1,$acc03
906 rlwinm $acc06,$s1,`32-16`,24,31 915 rlwinm $acc06,$s1,`32-16`,24,31
907 rlwinm $acc07,$s2,`32-16`,24,31 916 rlwinm $acc07,$s2,`32-16`,24,31
908 lbzx $acc04,$Tbl1,$acc04 917 lbzx $acc00,$Tbl1,$acc00
909 lbzx $acc05,$Tbl1,$acc05 918 lbzx $acc01,$Tbl1,$acc01
910 rlwinm $acc08,$s2,`32-8`,24,31 919 rlwinm $acc08,$s2,`32-8`,24,31
911 rlwinm $acc09,$s3,`32-8`,24,31 920 rlwinm $acc09,$s3,`32-8`,24,31
912 lbzx $acc06,$Tbl1,$acc06 921 lbzx $acc02,$Tbl1,$acc02
913 lbzx $acc07,$Tbl1,$acc07 922 lbzx $acc03,$Tbl1,$acc03
914 rlwinm $acc10,$s0,`32-8`,24,31 923 rlwinm $acc10,$s0,`32-8`,24,31
915 rlwinm $acc11,$s1,`32-8`,24,31 924 rlwinm $acc11,$s1,`32-8`,24,31
916 lbzx $acc08,$Tbl1,$acc08 925 lbzx $acc04,$Tbl1,$acc04
917 lbzx $acc09,$Tbl1,$acc09 926 lbzx $acc05,$Tbl1,$acc05
918 rlwinm $acc12,$s1,`0`,24,31 927 rlwinm $acc12,$s1,`0`,24,31
919 rlwinm $acc13,$s2,`0`,24,31 928 rlwinm $acc13,$s2,`0`,24,31
920 lbzx $acc10,$Tbl1,$acc10 929 lbzx $acc06,$Tbl1,$acc06
921 lbzx $acc11,$Tbl1,$acc11 930 lbzx $acc07,$Tbl1,$acc07
922 rlwinm $acc14,$s3,`0`,24,31 931 rlwinm $acc14,$s3,`0`,24,31
923 rlwinm $acc15,$s0,`0`,24,31 932 rlwinm $acc15,$s0,`0`,24,31
924 lbzx $acc12,$Tbl1,$acc12 933 lbzx $acc08,$Tbl1,$acc08
925 lbzx $acc13,$Tbl1,$acc13 934 lbzx $acc09,$Tbl1,$acc09
926 rlwinm $s0,$acc00,24,0,7 935 rlwinm $s0,$acc00,24,0,7
927 rlwinm $s1,$acc01,24,0,7 936 rlwinm $s1,$acc01,24,0,7
928 lbzx $acc14,$Tbl1,$acc14 937 lbzx $acc10,$Tbl1,$acc10
929 lbzx $acc15,$Tbl1,$acc15 938 lbzx $acc11,$Tbl1,$acc11
930 rlwinm $s2,$acc02,24,0,7 939 rlwinm $s2,$acc02,24,0,7
931 rlwinm $s3,$acc03,24,0,7 940 rlwinm $s3,$acc03,24,0,7
941 lbzx $acc12,$Tbl1,$acc12
942 lbzx $acc13,$Tbl1,$acc13
932 rlwimi $s0,$acc04,16,8,15 943 rlwimi $s0,$acc04,16,8,15
933 rlwimi $s1,$acc05,16,8,15 944 rlwimi $s1,$acc05,16,8,15
945 lbzx $acc14,$Tbl1,$acc14
946 lbzx $acc15,$Tbl1,$acc15
934 rlwimi $s2,$acc06,16,8,15 947 rlwimi $s2,$acc06,16,8,15
935 rlwimi $s3,$acc07,16,8,15 948 rlwimi $s3,$acc07,16,8,15
936 rlwimi $s0,$acc08,8,16,23 949 rlwimi $s0,$acc08,8,16,23
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl
index 4b27afd92f..7e01889298 100644
--- a/src/lib/libcrypto/aes/asm/aes-s390x.pl
+++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl
@@ -765,6 +765,11 @@ $code.=<<___ if (!$softonly);
765 srl %r5,6 765 srl %r5,6
766 ar %r5,%r0 766 ar %r5,%r0
767 767
768 larl %r1,OPENSSL_s390xcap_P
769 lg %r0,0(%r1)
770 tmhl %r0,0x4000 # check for message-security assist
771 jz .Lekey_internal
772
768 lghi %r0,0 # query capability vector 773 lghi %r0,0 # query capability vector
769 la %r1,16($sp) 774 la %r1,16($sp)
770 .long 0xb92f0042 # kmc %r4,%r2 775 .long 0xb92f0042 # kmc %r4,%r2
@@ -1323,6 +1328,7 @@ $code.=<<___;
13234: ex $len,0($s1) 13284: ex $len,0($s1)
1324 j .Lcbc_dec_exit 1329 j .Lcbc_dec_exit
1325.size AES_cbc_encrypt,.-AES_cbc_encrypt 1330.size AES_cbc_encrypt,.-AES_cbc_encrypt
1331.comm OPENSSL_s390xcap_P,8,8
1326___ 1332___
1327} 1333}
1328$code.=<<___; 1334$code.=<<___;
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
index f616f1751f..a545e892ae 100755
--- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl
+++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
@@ -2,11 +2,12 @@
2# 2#
3# ==================================================================== 3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary 5# project. The module is, however, dual licensed under OpenSSL and
6# forms are granted according to the OpenSSL license. 6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
7# ==================================================================== 8# ====================================================================
8# 9#
9# Version 1.2. 10# Version 2.1.
10# 11#
11# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on 12# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
12# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version 13# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
@@ -17,17 +18,29 @@
17# 18#
18# Performance in number of cycles per processed byte for 128-bit key: 19# Performance in number of cycles per processed byte for 128-bit key:
19# 20#
20# ECB CBC encrypt 21# ECB encrypt ECB decrypt CBC large chunk
21# AMD64 13.7 13.0(*) 22# AMD64 33 41 13.0
22# EM64T 20.2 18.6(*) 23# EM64T 38 59 18.6(*)
24# Core 2 30 43 14.5(*)
23# 25#
24# (*) CBC benchmarks are better than ECB thanks to custom ABI used 26# (*) with hyper-threading off
25# by the private block encryption function. 27
28$flavour = shift;
29$output = shift;
30if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
31
32$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
33
34$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
35( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
36( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
37die "can't locate x86_64-xlate.pl";
38
39open STDOUT,"| $^X $xlate $flavour $output";
26 40
27$verticalspin=1; # unlike 32-bit version $verticalspin performs 41$verticalspin=1; # unlike 32-bit version $verticalspin performs
28 # ~15% better on both AMD and Intel cores 42 # ~15% better on both AMD and Intel cores
29$output=shift; 43$speed_limit=512; # see aes-586.pl for details
30open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
31 44
32$code=".text\n"; 45$code=".text\n";
33 46
@@ -35,9 +48,9 @@ $s0="%eax";
35$s1="%ebx"; 48$s1="%ebx";
36$s2="%ecx"; 49$s2="%ecx";
37$s3="%edx"; 50$s3="%edx";
38$acc0="%esi"; 51$acc0="%esi"; $mask80="%rsi";
39$acc1="%edi"; 52$acc1="%edi"; $maskfe="%rdi";
40$acc2="%ebp"; 53$acc2="%ebp"; $mask1b="%rbp";
41$inp="%r8"; 54$inp="%r8";
42$out="%r9"; 55$out="%r9";
43$t0="%r10d"; 56$t0="%r10d";
@@ -51,6 +64,8 @@ sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }
51sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; 64sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
52 $r =~ s/%[er]([sd]i)/%\1l/; 65 $r =~ s/%[er]([sd]i)/%\1l/;
53 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } 66 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
67sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
68 $r =~ s/%r([0-9]+)/%r\1d/; $r; }
54sub _data_word() 69sub _data_word()
55{ my $i; 70{ my $i;
56 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } 71 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -138,22 +153,17 @@ $code.=<<___;
138 movzb `&lo("$s0")`,$acc0 153 movzb `&lo("$s0")`,$acc0
139 movzb `&lo("$s1")`,$acc1 154 movzb `&lo("$s1")`,$acc1
140 movzb `&lo("$s2")`,$acc2 155 movzb `&lo("$s2")`,$acc2
141 mov 2($sbox,$acc0,8),$t0 156 movzb 2($sbox,$acc0,8),$t0
142 mov 2($sbox,$acc1,8),$t1 157 movzb 2($sbox,$acc1,8),$t1
143 mov 2($sbox,$acc2,8),$t2 158 movzb 2($sbox,$acc2,8),$t2
144
145 and \$0x000000ff,$t0
146 and \$0x000000ff,$t1
147 and \$0x000000ff,$t2
148 159
149 movzb `&lo("$s3")`,$acc0 160 movzb `&lo("$s3")`,$acc0
150 movzb `&hi("$s1")`,$acc1 161 movzb `&hi("$s1")`,$acc1
151 movzb `&hi("$s2")`,$acc2 162 movzb `&hi("$s2")`,$acc2
152 mov 2($sbox,$acc0,8),$t3 163 movzb 2($sbox,$acc0,8),$t3
153 mov 0($sbox,$acc1,8),$acc1 #$t0 164 mov 0($sbox,$acc1,8),$acc1 #$t0
154 mov 0($sbox,$acc2,8),$acc2 #$t1 165 mov 0($sbox,$acc2,8),$acc2 #$t1
155 166
156 and \$0x000000ff,$t3
157 and \$0x0000ff00,$acc1 167 and \$0x0000ff00,$acc1
158 and \$0x0000ff00,$acc2 168 and \$0x0000ff00,$acc2
159 169
@@ -345,6 +355,234 @@ $code.=<<___;
345.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt 355.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
346___ 356___
347 357
358# it's possible to implement this by shifting tN by 8, filling least
359# significant byte with byte load and finally bswap-ing at the end,
360# but such partial register load kills Core 2...
361sub enccompactvert()
362{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
363
364$code.=<<___;
365 movzb `&lo("$s0")`,$t0
366 movzb `&lo("$s1")`,$t1
367 movzb `&lo("$s2")`,$t2
368 movzb ($sbox,$t0,1),$t0
369 movzb ($sbox,$t1,1),$t1
370 movzb ($sbox,$t2,1),$t2
371
372 movzb `&lo("$s3")`,$t3
373 movzb `&hi("$s1")`,$acc0
374 movzb `&hi("$s2")`,$acc1
375 movzb ($sbox,$t3,1),$t3
376 movzb ($sbox,$acc0,1),$t4 #$t0
377 movzb ($sbox,$acc1,1),$t5 #$t1
378
379 movzb `&hi("$s3")`,$acc2
380 movzb `&hi("$s0")`,$acc0
381 shr \$16,$s2
382 movzb ($sbox,$acc2,1),$acc2 #$t2
383 movzb ($sbox,$acc0,1),$acc0 #$t3
384 shr \$16,$s3
385
386 movzb `&lo("$s2")`,$acc1
387 shl \$8,$t4
388 shl \$8,$t5
389 movzb ($sbox,$acc1,1),$acc1 #$t0
390 xor $t4,$t0
391 xor $t5,$t1
392
393 movzb `&lo("$s3")`,$t4
394 shr \$16,$s0
395 shr \$16,$s1
396 movzb `&lo("$s0")`,$t5
397 shl \$8,$acc2
398 shl \$8,$acc0
399 movzb ($sbox,$t4,1),$t4 #$t1
400 movzb ($sbox,$t5,1),$t5 #$t2
401 xor $acc2,$t2
402 xor $acc0,$t3
403
404 movzb `&lo("$s1")`,$acc2
405 movzb `&hi("$s3")`,$acc0
406 shl \$16,$acc1
407 movzb ($sbox,$acc2,1),$acc2 #$t3
408 movzb ($sbox,$acc0,1),$acc0 #$t0
409 xor $acc1,$t0
410
411 movzb `&hi("$s0")`,$acc1
412 shr \$8,$s2
413 shr \$8,$s1
414 movzb ($sbox,$acc1,1),$acc1 #$t1
415 movzb ($sbox,$s2,1),$s3 #$t3
416 movzb ($sbox,$s1,1),$s2 #$t2
417 shl \$16,$t4
418 shl \$16,$t5
419 shl \$16,$acc2
420 xor $t4,$t1
421 xor $t5,$t2
422 xor $acc2,$t3
423
424 shl \$24,$acc0
425 shl \$24,$acc1
426 shl \$24,$s3
427 xor $acc0,$t0
428 shl \$24,$s2
429 xor $acc1,$t1
430 mov $t0,$s0
431 mov $t1,$s1
432 xor $t2,$s2
433 xor $t3,$s3
434___
435}
436
437sub enctransform_ref()
438{ my $sn = shift;
439 my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
440
441$code.=<<___;
442 mov $sn,$acc
443 and \$0x80808080,$acc
444 mov $acc,$tmp
445 shr \$7,$tmp
446 lea ($sn,$sn),$r2
447 sub $tmp,$acc
448 and \$0xfefefefe,$r2
449 and \$0x1b1b1b1b,$acc
450 mov $sn,$tmp
451 xor $acc,$r2
452
453 xor $r2,$sn
454 rol \$24,$sn
455 xor $r2,$sn
456 ror \$16,$tmp
457 xor $tmp,$sn
458 ror \$8,$tmp
459 xor $tmp,$sn
460___
461}
462
463# unlike decrypt case it does not pay off to parallelize enctransform
464sub enctransform()
465{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
466
467$code.=<<___;
468 mov $s0,$acc0
469 mov $s1,$acc1
470 and \$0x80808080,$acc0
471 and \$0x80808080,$acc1
472 mov $acc0,$t0
473 mov $acc1,$t1
474 shr \$7,$t0
475 lea ($s0,$s0),$r20
476 shr \$7,$t1
477 lea ($s1,$s1),$r21
478 sub $t0,$acc0
479 sub $t1,$acc1
480 and \$0xfefefefe,$r20
481 and \$0xfefefefe,$r21
482 and \$0x1b1b1b1b,$acc0
483 and \$0x1b1b1b1b,$acc1
484 mov $s0,$t0
485 mov $s1,$t1
486 xor $acc0,$r20
487 xor $acc1,$r21
488
489 xor $r20,$s0
490 xor $r21,$s1
491 mov $s2,$acc0
492 mov $s3,$acc1
493 rol \$24,$s0
494 rol \$24,$s1
495 and \$0x80808080,$acc0
496 and \$0x80808080,$acc1
497 xor $r20,$s0
498 xor $r21,$s1
499 mov $acc0,$t2
500 mov $acc1,$t3
501 ror \$16,$t0
502 ror \$16,$t1
503 shr \$7,$t2
504 lea ($s2,$s2),$r20
505 xor $t0,$s0
506 xor $t1,$s1
507 shr \$7,$t3
508 lea ($s3,$s3),$r21
509 ror \$8,$t0
510 ror \$8,$t1
511 sub $t2,$acc0
512 sub $t3,$acc1
513 xor $t0,$s0
514 xor $t1,$s1
515
516 and \$0xfefefefe,$r20
517 and \$0xfefefefe,$r21
518 and \$0x1b1b1b1b,$acc0
519 and \$0x1b1b1b1b,$acc1
520 mov $s2,$t2
521 mov $s3,$t3
522 xor $acc0,$r20
523 xor $acc1,$r21
524
525 xor $r20,$s2
526 xor $r21,$s3
527 rol \$24,$s2
528 rol \$24,$s3
529 xor $r20,$s2
530 xor $r21,$s3
531 mov 0($sbox),$acc0 # prefetch Te4
532 ror \$16,$t2
533 ror \$16,$t3
534 mov 64($sbox),$acc1
535 xor $t2,$s2
536 xor $t3,$s3
537 mov 128($sbox),$r20
538 ror \$8,$t2
539 ror \$8,$t3
540 mov 192($sbox),$r21
541 xor $t2,$s2
542 xor $t3,$s3
543___
544}
545
546$code.=<<___;
547.type _x86_64_AES_encrypt_compact,\@abi-omnipotent
548.align 16
549_x86_64_AES_encrypt_compact:
550 lea 128($sbox),$inp # size optimization
551 mov 0-128($inp),$acc1 # prefetch Te4
552 mov 32-128($inp),$acc2
553 mov 64-128($inp),$t0
554 mov 96-128($inp),$t1
555 mov 128-128($inp),$acc1
556 mov 160-128($inp),$acc2
557 mov 192-128($inp),$t0
558 mov 224-128($inp),$t1
559 jmp .Lenc_loop_compact
560.align 16
561.Lenc_loop_compact:
562 xor 0($key),$s0 # xor with key
563 xor 4($key),$s1
564 xor 8($key),$s2
565 xor 12($key),$s3
566 lea 16($key),$key
567___
568 &enccompactvert();
569$code.=<<___;
570 cmp 16(%rsp),$key
571 je .Lenc_compact_done
572___
573 &enctransform();
574$code.=<<___;
575 jmp .Lenc_loop_compact
576.align 16
577.Lenc_compact_done:
578 xor 0($key),$s0
579 xor 4($key),$s1
580 xor 8($key),$s2
581 xor 12($key),$s3
582 .byte 0xf3,0xc3 # rep ret
583.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
584___
585
348# void AES_encrypt (const void *inp,void *out,const AES_KEY *key); 586# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
349$code.=<<___; 587$code.=<<___;
350.globl AES_encrypt 588.globl AES_encrypt
@@ -358,31 +596,57 @@ AES_encrypt:
358 push %r14 596 push %r14
359 push %r15 597 push %r15
360 598
361 mov %rdx,$key 599 # allocate frame "above" key schedule
362 mov %rdi,$inp 600 mov %rsp,%r10
363 mov %rsi,$out 601 lea -63(%rdx),%rcx # %rdx is key argument
364 602 and \$-64,%rsp
365 .picmeup $sbox 603 sub %rsp,%rcx
366 lea AES_Te-.($sbox),$sbox 604 neg %rcx
367 605 and \$0x3c0,%rcx
368 mov 0($inp),$s0 606 sub %rcx,%rsp
369 mov 4($inp),$s1 607 sub \$32,%rsp
370 mov 8($inp),$s2
371 mov 12($inp),$s3
372 608
373 call _x86_64_AES_encrypt 609 mov %rsi,16(%rsp) # save out
610 mov %r10,24(%rsp) # save real stack pointer
611.Lenc_prologue:
374 612
375 mov $s0,0($out) 613 mov %rdx,$key
614 mov 240($key),$rnds # load rounds
615
616 mov 0(%rdi),$s0 # load input vector
617 mov 4(%rdi),$s1
618 mov 8(%rdi),$s2
619 mov 12(%rdi),$s3
620
621 shl \$4,$rnds
622 lea ($key,$rnds),%rbp
623 mov $key,(%rsp) # key schedule
624 mov %rbp,8(%rsp) # end of key schedule
625
626 # pick Te4 copy which can't "overlap" with stack frame or key schedule
627 lea .LAES_Te+2048(%rip),$sbox
628 lea 768(%rsp),%rbp
629 sub $sbox,%rbp
630 and \$0x300,%rbp
631 lea ($sbox,%rbp),$sbox
632
633 call _x86_64_AES_encrypt_compact
634
635 mov 16(%rsp),$out # restore out
636 mov 24(%rsp),%rsi # restore saved stack pointer
637 mov $s0,0($out) # write output vector
376 mov $s1,4($out) 638 mov $s1,4($out)
377 mov $s2,8($out) 639 mov $s2,8($out)
378 mov $s3,12($out) 640 mov $s3,12($out)
379 641
380 pop %r15 642 mov (%rsi),%r15
381 pop %r14 643 mov 8(%rsi),%r14
382 pop %r13 644 mov 16(%rsi),%r13
383 pop %r12 645 mov 24(%rsi),%r12
384 pop %rbp 646 mov 32(%rsi),%rbp
385 pop %rbx 647 mov 40(%rsi),%rbx
648 lea 48(%rsi),%rsp
649.Lenc_epilogue:
386 ret 650 ret
387.size AES_encrypt,.-AES_encrypt 651.size AES_encrypt,.-AES_encrypt
388___ 652___
@@ -453,19 +717,20 @@ sub declastvert()
453{ my $t3="%r8d"; # zaps $inp! 717{ my $t3="%r8d"; # zaps $inp!
454 718
455$code.=<<___; 719$code.=<<___;
720 lea 2048($sbox),$sbox # size optimization
456 movzb `&lo("$s0")`,$acc0 721 movzb `&lo("$s0")`,$acc0
457 movzb `&lo("$s1")`,$acc1 722 movzb `&lo("$s1")`,$acc1
458 movzb `&lo("$s2")`,$acc2 723 movzb `&lo("$s2")`,$acc2
459 movzb 2048($sbox,$acc0,1),$t0 724 movzb ($sbox,$acc0,1),$t0
460 movzb 2048($sbox,$acc1,1),$t1 725 movzb ($sbox,$acc1,1),$t1
461 movzb 2048($sbox,$acc2,1),$t2 726 movzb ($sbox,$acc2,1),$t2
462 727
463 movzb `&lo("$s3")`,$acc0 728 movzb `&lo("$s3")`,$acc0
464 movzb `&hi("$s3")`,$acc1 729 movzb `&hi("$s3")`,$acc1
465 movzb `&hi("$s0")`,$acc2 730 movzb `&hi("$s0")`,$acc2
466 movzb 2048($sbox,$acc0,1),$t3 731 movzb ($sbox,$acc0,1),$t3
467 movzb 2048($sbox,$acc1,1),$acc1 #$t0 732 movzb ($sbox,$acc1,1),$acc1 #$t0
468 movzb 2048($sbox,$acc2,1),$acc2 #$t1 733 movzb ($sbox,$acc2,1),$acc2 #$t1
469 734
470 shl \$8,$acc1 735 shl \$8,$acc1
471 shl \$8,$acc2 736 shl \$8,$acc2
@@ -477,8 +742,8 @@ $code.=<<___;
477 movzb `&hi("$s1")`,$acc0 742 movzb `&hi("$s1")`,$acc0
478 movzb `&hi("$s2")`,$acc1 743 movzb `&hi("$s2")`,$acc1
479 shr \$16,$s0 744 shr \$16,$s0
480 movzb 2048($sbox,$acc0,1),$acc0 #$t2 745 movzb ($sbox,$acc0,1),$acc0 #$t2
481 movzb 2048($sbox,$acc1,1),$acc1 #$t3 746 movzb ($sbox,$acc1,1),$acc1 #$t3
482 747
483 shl \$8,$acc0 748 shl \$8,$acc0
484 shl \$8,$acc1 749 shl \$8,$acc1
@@ -490,9 +755,9 @@ $code.=<<___;
490 movzb `&lo("$s2")`,$acc0 755 movzb `&lo("$s2")`,$acc0
491 movzb `&lo("$s3")`,$acc1 756 movzb `&lo("$s3")`,$acc1
492 movzb `&lo("$s0")`,$acc2 757 movzb `&lo("$s0")`,$acc2
493 movzb 2048($sbox,$acc0,1),$acc0 #$t0 758 movzb ($sbox,$acc0,1),$acc0 #$t0
494 movzb 2048($sbox,$acc1,1),$acc1 #$t1 759 movzb ($sbox,$acc1,1),$acc1 #$t1
495 movzb 2048($sbox,$acc2,1),$acc2 #$t2 760 movzb ($sbox,$acc2,1),$acc2 #$t2
496 761
497 shl \$16,$acc0 762 shl \$16,$acc0
498 shl \$16,$acc1 763 shl \$16,$acc1
@@ -505,9 +770,9 @@ $code.=<<___;
505 movzb `&lo("$s1")`,$acc0 770 movzb `&lo("$s1")`,$acc0
506 movzb `&hi("$s1")`,$acc1 771 movzb `&hi("$s1")`,$acc1
507 movzb `&hi("$s2")`,$acc2 772 movzb `&hi("$s2")`,$acc2
508 movzb 2048($sbox,$acc0,1),$acc0 #$t3 773 movzb ($sbox,$acc0,1),$acc0 #$t3
509 movzb 2048($sbox,$acc1,1),$acc1 #$t0 774 movzb ($sbox,$acc1,1),$acc1 #$t0
510 movzb 2048($sbox,$acc2,1),$acc2 #$t1 775 movzb ($sbox,$acc2,1),$acc2 #$t1
511 776
512 shl \$16,$acc0 777 shl \$16,$acc0
513 shl \$24,$acc1 778 shl \$24,$acc1
@@ -520,8 +785,8 @@ $code.=<<___;
520 movzb `&hi("$s3")`,$acc0 785 movzb `&hi("$s3")`,$acc0
521 movzb `&hi("$s0")`,$acc1 786 movzb `&hi("$s0")`,$acc1
522 mov 16+12($key),$s3 787 mov 16+12($key),$s3
523 movzb 2048($sbox,$acc0,1),$acc0 #$t2 788 movzb ($sbox,$acc0,1),$acc0 #$t2
524 movzb 2048($sbox,$acc1,1),$acc1 #$t3 789 movzb ($sbox,$acc1,1),$acc1 #$t3
525 mov 16+0($key),$s0 790 mov 16+0($key),$s0
526 791
527 shl \$24,$acc0 792 shl \$24,$acc0
@@ -532,6 +797,7 @@ $code.=<<___;
532 797
533 mov 16+4($key),$s1 798 mov 16+4($key),$s1
534 mov 16+8($key),$s2 799 mov 16+8($key),$s2
800 lea -2048($sbox),$sbox
535 xor $t0,$s0 801 xor $t0,$s0
536 xor $t1,$s1 802 xor $t1,$s1
537 xor $t2,$s2 803 xor $t2,$s2
@@ -659,6 +925,260 @@ $code.=<<___;
659.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt 925.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
660___ 926___
661 927
928sub deccompactvert() # emits one pass of byte-wise ($sbox) lookups over all 16 state bytes; result is left in $s0-$s3
929{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
930
931$code.=<<___;
932 movzb `&lo("$s0")`,$t0 # low byte of input word i becomes byte 0 of output word i (after lookup)
933 movzb `&lo("$s1")`,$t1
934 movzb `&lo("$s2")`,$t2
935 movzb ($sbox,$t0,1),$t0
936 movzb ($sbox,$t1,1),$t1
937 movzb ($sbox,$t2,1),$t2
938
939 movzb `&lo("$s3")`,$t3
940 movzb `&hi("$s3")`,$acc0 # byte 1: output word i takes it from input word i-1 (mod 4)
941 movzb `&hi("$s0")`,$acc1
942 movzb ($sbox,$t3,1),$t3
943 movzb ($sbox,$acc0,1),$t4 #$t0
944 movzb ($sbox,$acc1,1),$t5 #$t1
945
946 movzb `&hi("$s1")`,$acc2
947 movzb `&hi("$s2")`,$acc0
948 shr \$16,$s2 # expose the upper two bytes of the word
949 movzb ($sbox,$acc2,1),$acc2 #$t2
950 movzb ($sbox,$acc0,1),$acc0 #$t3
951 shr \$16,$s3
952
953 movzb `&lo("$s2")`,$acc1 # byte 2: output word i takes it from input word i+2 (mod 4)
954 shl \$8,$t4
955 shl \$8,$t5
956 movzb ($sbox,$acc1,1),$acc1 #$t0
957 xor $t4,$t0
958 xor $t5,$t1
959
960 movzb `&lo("$s3")`,$t4
961 shr \$16,$s0
962 shr \$16,$s1
963 movzb `&lo("$s0")`,$t5
964 shl \$8,$acc2
965 shl \$8,$acc0
966 movzb ($sbox,$t4,1),$t4 #$t1
967 movzb ($sbox,$t5,1),$t5 #$t2
968 xor $acc2,$t2
969 xor $acc0,$t3
970
971 movzb `&lo("$s1")`,$acc2
972 movzb `&hi("$s1")`,$acc0 # byte 3: output word i takes it from input word i+1 (mod 4)
973 shl \$16,$acc1
974 movzb ($sbox,$acc2,1),$acc2 #$t3
975 movzb ($sbox,$acc0,1),$acc0 #$t0
976 xor $acc1,$t0
977
978 movzb `&hi("$s2")`,$acc1
979 shl \$16,$t4
980 shl \$16,$t5
981 movzb ($sbox,$acc1,1),$s1 #$t1
982 xor $t4,$t1
983 xor $t5,$t2
984
985 movzb `&hi("$s3")`,$acc1
986 shr \$8,$s0 # already shifted by 16 above, so only the original top byte remains as index
987 shl \$16,$acc2
988 movzb ($sbox,$acc1,1),$s2 #$t2
989 movzb ($sbox,$s0,1),$s3 #$t3
990 xor $acc2,$t3
991
992 shl \$24,$acc0
993 shl \$24,$s1
994 shl \$24,$s2
995 xor $acc0,$t0
996 shl \$24,$s3
997 xor $t1,$s1 # merge the lower three bytes into the top-byte registers
998 mov $t0,$s0
999 xor $t2,$s2
1000 xor $t3,$s3
1001___
1002}
1003
1004# parallelized version! input is pair of 64-bit values: %rax=s1.s0
1005# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
1006# %ecx=s2 and %edx=s3. A true argument additionally interleaves loads from $sbox (prefetch).
1007sub dectransform()
1008{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
1009 my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
1010 my $prefetch = shift;
1011
1012$code.=<<___;
1013 mov $tp10,$acc0 # tp2 from tp1: mask80 bits become a mask1b-filtered correction for the maskfe-masked doubled value
1014 mov $tp18,$acc8
1015 and $mask80,$acc0
1016 and $mask80,$acc8
1017 mov $acc0,$tp40
1018 mov $acc8,$tp48
1019 shr \$7,$tp40
1020 lea ($tp10,$tp10),$tp20
1021 shr \$7,$tp48
1022 lea ($tp18,$tp18),$tp28
1023 sub $tp40,$acc0
1024 sub $tp48,$acc8
1025 and $maskfe,$tp20
1026 and $maskfe,$tp28
1027 and $mask1b,$acc0
1028 and $mask1b,$acc8
1029 xor $tp20,$acc0
1030 xor $tp28,$acc8
1031 mov $acc0,$tp20
1032 mov $acc8,$tp28
1033
1034 and $mask80,$acc0 # tp4 from tp2 by the same sequence
1035 and $mask80,$acc8
1036 mov $acc0,$tp80
1037 mov $acc8,$tp88
1038 shr \$7,$tp80
1039 lea ($tp20,$tp20),$tp40
1040 shr \$7,$tp88
1041 lea ($tp28,$tp28),$tp48
1042 sub $tp80,$acc0
1043 sub $tp88,$acc8
1044 and $maskfe,$tp40
1045 and $maskfe,$tp48
1046 and $mask1b,$acc0
1047 and $mask1b,$acc8
1048 xor $tp40,$acc0
1049 xor $tp48,$acc8
1050 mov $acc0,$tp40
1051 mov $acc8,$tp48
1052
1053 and $mask80,$acc0 # tp8 from tp4 by the same sequence, result left in the tp8 registers
1054 and $mask80,$acc8
1055 mov $acc0,$tp80
1056 mov $acc8,$tp88
1057 shr \$7,$tp80
1058 xor $tp10,$tp20 # tp2^=tp1
1059 shr \$7,$tp88
1060 xor $tp18,$tp28 # tp2^=tp1
1061 sub $tp80,$acc0
1062 sub $tp88,$acc8
1063 lea ($tp40,$tp40),$tp80
1064 lea ($tp48,$tp48),$tp88
1065 xor $tp10,$tp40 # tp4^=tp1
1066 xor $tp18,$tp48 # tp4^=tp1
1067 and $maskfe,$tp80
1068 and $maskfe,$tp88
1069 and $mask1b,$acc0
1070 and $mask1b,$acc8
1071 xor $acc0,$tp80
1072 xor $acc8,$tp88
1073
1074 xor $tp80,$tp10 # tp1^=tp8
1075 xor $tp88,$tp18 # tp1^=tp8
1076 xor $tp80,$tp20 # tp2^tp1^=tp8
1077 xor $tp88,$tp28 # tp2^tp1^=tp8
1078 mov $tp10,$acc0
1079 mov $tp18,$acc8
1080 xor $tp80,$tp40 # tp4^tp1^=tp8
1081 xor $tp88,$tp48 # tp4^tp1^=tp8
1082 shr \$32,$acc0 # from here the acc registers carry the upper 32-bit halves of the inputs
1083 shr \$32,$acc8
1084 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
1085 xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
1086 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
1087 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
1088 xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1089 xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1090
1091 rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
1092 rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
1093 xor `&LO("$tp80")`,`&LO("$tp10")`
1094 xor `&LO("$tp88")`,`&LO("$tp18")`
1095 shr \$32,$tp80
1096 shr \$32,$tp88
1097 xor `&LO("$tp80")`,`&LO("$acc0")`
1098 xor `&LO("$tp88")`,`&LO("$acc8")`
1099
1100 mov $tp20,$tp80
1101 mov $tp28,$tp88
1102 shr \$32,$tp80
1103 shr \$32,$tp88
1104 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
1105 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
1106 rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
1107 rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
1108 xor `&LO("$tp20")`,`&LO("$tp10")`
1109 xor `&LO("$tp28")`,`&LO("$tp18")`
1110 mov $tp40,$tp20
1111 mov $tp48,$tp28
1112 xor `&LO("$tp80")`,`&LO("$acc0")`
1113 xor `&LO("$tp88")`,`&LO("$acc8")`
1114
1115 `"mov 0($sbox),$mask80" if ($prefetch)`
1116 shr \$32,$tp20
1117 shr \$32,$tp28
1118 `"mov 64($sbox),$maskfe" if ($prefetch)`
1119 rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
1120 rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
1121 `"mov 128($sbox),$mask1b" if ($prefetch)`
1122 rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
1123 rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
1124 `"mov 192($sbox),$tp80" if ($prefetch)`
1125 xor `&LO("$tp40")`,`&LO("$tp10")`
1126 xor `&LO("$tp48")`,`&LO("$tp18")`
1127 `"mov 256($sbox),$tp88" if ($prefetch)`
1128 xor `&LO("$tp20")`,`&LO("$acc0")` # final words sit in %eax, %ebx, %ecx, %edx (low halves of tp10, acc0, tp18, acc8)
1129 xor `&LO("$tp28")`,`&LO("$acc8")`
1130___
1131}
1132
1133$code.=<<___; # compact decrypt core: repeats key-xor, deccompactvert, dectransform(1) until $key hits the stacked end pointer
1134.type _x86_64_AES_decrypt_compact,\@abi-omnipotent
1135.align 16
1136_x86_64_AES_decrypt_compact:
1137 lea 128($sbox),$inp # size optimization: bias by 128 so the displacements below span -128..96
1138 mov 0-128($inp),$acc1 # prefetch Td4: touch the table every 32 bytes, offsets 0..224
1139 mov 32-128($inp),$acc2
1140 mov 64-128($inp),$t0
1141 mov 96-128($inp),$t1
1142 mov 128-128($inp),$acc1
1143 mov 160-128($inp),$acc2
1144 mov 192-128($inp),$t0
1145 mov 224-128($inp),$t1
1146 jmp .Ldec_loop_compact
1147
1148.align 16
1149.Ldec_loop_compact:
1150 xor 0($key),$s0 # xor with key
1151 xor 4($key),$s1
1152 xor 8($key),$s2
1153 xor 12($key),$s3
1154 lea 16($key),$key # step to the next 16-byte round key
1155___
1156 &deccompactvert();
1157$code.=<<___;
1158 cmp 16(%rsp),$key # end pointer kept on the stack (AES_decrypt stores it at its 8(%rsp); +8 here for the return address)
1159 je .Ldec_compact_done # end reached: skip dectransform
1160
1161 mov 256+0($sbox),$mask80 # masks are read from just past the 256-byte table; reloaded each pass because dectransform(1) prefetches into these registers
1162 shl \$32,%rbx # pack the four words as %rax=s1.s0 and %rcx=s3.s2, the input form dectransform expects
1163 shl \$32,%rdx
1164 mov 256+8($sbox),$maskfe
1165 or %rbx,%rax
1166 or %rdx,%rcx
1167 mov 256+16($sbox),$mask1b
1168___
1169 &dectransform(1);
1170$code.=<<___;
1171 jmp .Ldec_loop_compact
1172.align 16
1173.Ldec_compact_done:
1174 xor 0($key),$s0 # final xor with the key at $key
1175 xor 4($key),$s1
1176 xor 8($key),$s2
1177 xor 12($key),$s3
1178 .byte 0xf3,0xc3 # rep ret
1179.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
1180___
1181
662# void AES_decrypt (const void *inp,void *out,const AES_KEY *key); 1182# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
663$code.=<<___; 1183$code.=<<___;
664.globl AES_decrypt 1184.globl AES_decrypt
@@ -672,43 +1192,59 @@ AES_decrypt:
672 push %r14 1192 push %r14
673 push %r15 1193 push %r15
674 1194
675 mov %rdx,$key 1195 # allocate frame "above" key schedule
676 mov %rdi,$inp 1196 mov %rsp,%r10
677 mov %rsi,$out 1197 lea -63(%rdx),%rcx # %rdx is key argument
1198 and \$-64,%rsp
1199 sub %rsp,%rcx
1200 neg %rcx
1201 and \$0x3c0,%rcx
1202 sub %rcx,%rsp
1203 sub \$32,%rsp
1204
1205 mov %rsi,16(%rsp) # save out
1206 mov %r10,24(%rsp) # save real stack pointer
1207.Ldec_prologue:
678 1208
679 .picmeup $sbox 1209 mov %rdx,$key
680 lea AES_Td-.($sbox),$sbox 1210 mov 240($key),$rnds # load rounds
681 1211
682 # prefetch Td4 1212 mov 0(%rdi),$s0 # load input vector
683 lea 2048+128($sbox),$sbox; 1213 mov 4(%rdi),$s1
684 mov 0-128($sbox),$s0 1214 mov 8(%rdi),$s2
685 mov 32-128($sbox),$s1 1215 mov 12(%rdi),$s3
686 mov 64-128($sbox),$s2 1216
687 mov 96-128($sbox),$s3 1217 shl \$4,$rnds
688 mov 128-128($sbox),$s0 1218 lea ($key,$rnds),%rbp
689 mov 160-128($sbox),$s1 1219 mov $key,(%rsp) # key schedule
690 mov 192-128($sbox),$s2 1220 mov %rbp,8(%rsp) # end of key schedule
691 mov 224-128($sbox),$s3 1221
692 lea -2048-128($sbox),$sbox; 1222 # pick Td4 copy which can't "overlap" with stack frame or key schedule
693 1223 lea .LAES_Td+2048(%rip),$sbox
694 mov 0($inp),$s0 1224 lea 768(%rsp),%rbp
695 mov 4($inp),$s1 1225 sub $sbox,%rbp
696 mov 8($inp),$s2 1226 and \$0x300,%rbp
697 mov 12($inp),$s3 1227 lea ($sbox,%rbp),$sbox
698 1228 shr \$3,%rbp # recall "magic" constants!
699 call _x86_64_AES_decrypt 1229 add %rbp,$sbox
700 1230
701 mov $s0,0($out) 1231 call _x86_64_AES_decrypt_compact
1232
1233 mov 16(%rsp),$out # restore out
1234 mov 24(%rsp),%rsi # restore saved stack pointer
1235 mov $s0,0($out) # write output vector
702 mov $s1,4($out) 1236 mov $s1,4($out)
703 mov $s2,8($out) 1237 mov $s2,8($out)
704 mov $s3,12($out) 1238 mov $s3,12($out)
705 1239
706 pop %r15 1240 mov (%rsi),%r15
707 pop %r14 1241 mov 8(%rsi),%r14
708 pop %r13 1242 mov 16(%rsi),%r13
709 pop %r12 1243 mov 24(%rsi),%r12
710 pop %rbp 1244 mov 32(%rsi),%rbp
711 pop %rbx 1245 mov 40(%rsi),%rbx
1246 lea 48(%rsi),%rsp
1247.Ldec_epilogue:
712 ret 1248 ret
713.size AES_decrypt,.-AES_decrypt 1249.size AES_decrypt,.-AES_decrypt
714___ 1250___
@@ -718,27 +1254,26 @@ sub enckey()
718{ 1254{
719$code.=<<___; 1255$code.=<<___;
720 movz %dl,%esi # rk[i]>>0 1256 movz %dl,%esi # rk[i]>>0
721 mov 2(%rbp,%rsi,8),%ebx 1257 movzb -128(%rbp,%rsi),%ebx
722 movz %dh,%esi # rk[i]>>8 1258 movz %dh,%esi # rk[i]>>8
723 and \$0xFF000000,%ebx 1259 shl \$24,%ebx
724 xor %ebx,%eax 1260 xor %ebx,%eax
725 1261
726 mov 2(%rbp,%rsi,8),%ebx 1262 movzb -128(%rbp,%rsi),%ebx
727 shr \$16,%edx 1263 shr \$16,%edx
728 and \$0x000000FF,%ebx
729 movz %dl,%esi # rk[i]>>16 1264 movz %dl,%esi # rk[i]>>16
730 xor %ebx,%eax 1265 xor %ebx,%eax
731 1266
732 mov 0(%rbp,%rsi,8),%ebx 1267 movzb -128(%rbp,%rsi),%ebx
733 movz %dh,%esi # rk[i]>>24 1268 movz %dh,%esi # rk[i]>>24
734 and \$0x0000FF00,%ebx 1269 shl \$8,%ebx
735 xor %ebx,%eax 1270 xor %ebx,%eax
736 1271
737 mov 0(%rbp,%rsi,8),%ebx 1272 movzb -128(%rbp,%rsi),%ebx
738 and \$0x00FF0000,%ebx 1273 shl \$16,%ebx
739 xor %ebx,%eax 1274 xor %ebx,%eax
740 1275
741 xor 2048(%rbp,%rcx,4),%eax # rcon 1276 xor 1024-128(%rbp,%rcx,4),%eax # rcon
742___ 1277___
743} 1278}
744 1279
@@ -751,7 +1286,29 @@ $code.=<<___;
751AES_set_encrypt_key: 1286AES_set_encrypt_key:
752 push %rbx 1287 push %rbx
753 push %rbp 1288 push %rbp
1289 push %r12 # redundant, but allows to share
1290 push %r13 # exception handler...
1291 push %r14
1292 push %r15
1293 sub \$8,%rsp
1294.Lenc_key_prologue:
1295
1296 call _x86_64_AES_set_encrypt_key
1297
1298 mov 8(%rsp),%r15
1299 mov 16(%rsp),%r14
1300 mov 24(%rsp),%r13
1301 mov 32(%rsp),%r12
1302 mov 40(%rsp),%rbp
1303 mov 48(%rsp),%rbx
1304 add \$56,%rsp
1305.Lenc_key_epilogue:
1306 ret
1307.size AES_set_encrypt_key,.-AES_set_encrypt_key
754 1308
1309.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
1310.align 16
1311_x86_64_AES_set_encrypt_key:
755 mov %esi,%ecx # %ecx=bits 1312 mov %esi,%ecx # %ecx=bits
756 mov %rdi,%rsi # %rsi=userKey 1313 mov %rdi,%rsi # %rsi=userKey
757 mov %rdx,%rdi # %rdi=key 1314 mov %rdx,%rdi # %rdi=key
@@ -761,8 +1318,18 @@ AES_set_encrypt_key:
761 test \$-1,%rdi 1318 test \$-1,%rdi
762 jz .Lbadpointer 1319 jz .Lbadpointer
763 1320
764 .picmeup %rbp 1321 lea .LAES_Te(%rip),%rbp
765 lea AES_Te-.(%rbp),%rbp 1322 lea 2048+128(%rbp),%rbp
1323
1324 # prefetch Te4
1325 mov 0-128(%rbp),%eax
1326 mov 32-128(%rbp),%ebx
1327 mov 64-128(%rbp),%r8d
1328 mov 96-128(%rbp),%edx
1329 mov 128-128(%rbp),%eax
1330 mov 160-128(%rbp),%ebx
1331 mov 192-128(%rbp),%r8d
1332 mov 224-128(%rbp),%edx
766 1333
767 cmp \$128,%ecx 1334 cmp \$128,%ecx
768 je .L10rounds 1335 je .L10rounds
@@ -774,15 +1341,12 @@ AES_set_encrypt_key:
774 jmp .Lexit 1341 jmp .Lexit
775 1342
776.L10rounds: 1343.L10rounds:
777 mov 0(%rsi),%eax # copy first 4 dwords 1344 mov 0(%rsi),%rax # copy first 4 dwords
778 mov 4(%rsi),%ebx 1345 mov 8(%rsi),%rdx
779 mov 8(%rsi),%ecx 1346 mov %rax,0(%rdi)
780 mov 12(%rsi),%edx 1347 mov %rdx,8(%rdi)
781 mov %eax,0(%rdi)
782 mov %ebx,4(%rdi)
783 mov %ecx,8(%rdi)
784 mov %edx,12(%rdi)
785 1348
1349 shr \$32,%rdx
786 xor %ecx,%ecx 1350 xor %ecx,%ecx
787 jmp .L10shortcut 1351 jmp .L10shortcut
788.align 4 1352.align 4
@@ -810,19 +1374,14 @@ $code.=<<___;
810 jmp .Lexit 1374 jmp .Lexit
811 1375
812.L12rounds: 1376.L12rounds:
813 mov 0(%rsi),%eax # copy first 6 dwords 1377 mov 0(%rsi),%rax # copy first 6 dwords
814 mov 4(%rsi),%ebx 1378 mov 8(%rsi),%rbx
815 mov 8(%rsi),%ecx 1379 mov 16(%rsi),%rdx
816 mov 12(%rsi),%edx 1380 mov %rax,0(%rdi)
817 mov %eax,0(%rdi) 1381 mov %rbx,8(%rdi)
818 mov %ebx,4(%rdi) 1382 mov %rdx,16(%rdi)
819 mov %ecx,8(%rdi) 1383
820 mov %edx,12(%rdi) 1384 shr \$32,%rdx
821 mov 16(%rsi),%ecx
822 mov 20(%rsi),%edx
823 mov %ecx,16(%rdi)
824 mov %edx,20(%rdi)
825
826 xor %ecx,%ecx 1385 xor %ecx,%ecx
827 jmp .L12shortcut 1386 jmp .L12shortcut
828.align 4 1387.align 4
@@ -858,30 +1417,23 @@ $code.=<<___;
858 jmp .Lexit 1417 jmp .Lexit
859 1418
860.L14rounds: 1419.L14rounds:
861 mov 0(%rsi),%eax # copy first 8 dwords 1420 mov 0(%rsi),%rax # copy first 8 dwords
862 mov 4(%rsi),%ebx 1421 mov 8(%rsi),%rbx
863 mov 8(%rsi),%ecx 1422 mov 16(%rsi),%rcx
864 mov 12(%rsi),%edx 1423 mov 24(%rsi),%rdx
865 mov %eax,0(%rdi) 1424 mov %rax,0(%rdi)
866 mov %ebx,4(%rdi) 1425 mov %rbx,8(%rdi)
867 mov %ecx,8(%rdi) 1426 mov %rcx,16(%rdi)
868 mov %edx,12(%rdi) 1427 mov %rdx,24(%rdi)
869 mov 16(%rsi),%eax 1428
870 mov 20(%rsi),%ebx 1429 shr \$32,%rdx
871 mov 24(%rsi),%ecx
872 mov 28(%rsi),%edx
873 mov %eax,16(%rdi)
874 mov %ebx,20(%rdi)
875 mov %ecx,24(%rdi)
876 mov %edx,28(%rdi)
877
878 xor %ecx,%ecx 1430 xor %ecx,%ecx
879 jmp .L14shortcut 1431 jmp .L14shortcut
880.align 4 1432.align 4
881.L14loop: 1433.L14loop:
1434 mov 0(%rdi),%eax # rk[0]
882 mov 28(%rdi),%edx # rk[7] 1435 mov 28(%rdi),%edx # rk[7]
883.L14shortcut: 1436.L14shortcut:
884 mov 0(%rdi),%eax # rk[0]
885___ 1437___
886 &enckey (); 1438 &enckey ();
887$code.=<<___; 1439$code.=<<___;
@@ -900,24 +1452,23 @@ $code.=<<___;
900 mov %eax,%edx 1452 mov %eax,%edx
901 mov 16(%rdi),%eax # rk[4] 1453 mov 16(%rdi),%eax # rk[4]
902 movz %dl,%esi # rk[11]>>0 1454 movz %dl,%esi # rk[11]>>0
903 mov 2(%rbp,%rsi,8),%ebx 1455 movzb -128(%rbp,%rsi),%ebx
904 movz %dh,%esi # rk[11]>>8 1456 movz %dh,%esi # rk[11]>>8
905 and \$0x000000FF,%ebx
906 xor %ebx,%eax 1457 xor %ebx,%eax
907 1458
908 mov 0(%rbp,%rsi,8),%ebx 1459 movzb -128(%rbp,%rsi),%ebx
909 shr \$16,%edx 1460 shr \$16,%edx
910 and \$0x0000FF00,%ebx 1461 shl \$8,%ebx
911 movz %dl,%esi # rk[11]>>16 1462 movz %dl,%esi # rk[11]>>16
912 xor %ebx,%eax 1463 xor %ebx,%eax
913 1464
914 mov 0(%rbp,%rsi,8),%ebx 1465 movzb -128(%rbp,%rsi),%ebx
915 movz %dh,%esi # rk[11]>>24 1466 movz %dh,%esi # rk[11]>>24
916 and \$0x00FF0000,%ebx 1467 shl \$16,%ebx
917 xor %ebx,%eax 1468 xor %ebx,%eax
918 1469
919 mov 2(%rbp,%rsi,8),%ebx 1470 movzb -128(%rbp,%rsi),%ebx
920 and \$0xFF000000,%ebx 1471 shl \$24,%ebx
921 xor %ebx,%eax 1472 xor %ebx,%eax
922 1473
923 mov %eax,48(%rdi) # rk[12] 1474 mov %eax,48(%rdi) # rk[12]
@@ -938,31 +1489,61 @@ $code.=<<___;
938.Lbadpointer: 1489.Lbadpointer:
939 mov \$-1,%rax 1490 mov \$-1,%rax
940.Lexit: 1491.Lexit:
941 pop %rbp 1492 .byte 0xf3,0xc3 # rep ret
942 pop %rbx 1493.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
943 ret
944.size AES_set_encrypt_key,.-AES_set_encrypt_key
945___ 1494___
946 1495
947sub deckey() 1496sub deckey_ref()
948{ my ($i,$ptr,$te,$td) = @_; 1497{ my ($i,$ptr,$te,$td) = @_;
1498 my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
949$code.=<<___; 1499$code.=<<___;
950 mov $i($ptr),%eax 1500 mov $i($ptr),$tp1
951 mov %eax,%edx 1501 mov $tp1,$acc
952 movz %ah,%ebx 1502 and \$0x80808080,$acc
953 shr \$16,%edx 1503 mov $acc,$tp4
954 and \$0xFF,%eax 1504 shr \$7,$tp4
955 movzb 2($te,%rax,8),%rax 1505 lea 0($tp1,$tp1),$tp2
956 movzb 2($te,%rbx,8),%rbx 1506 sub $tp4,$acc
957 mov 0($td,%rax,8),%eax 1507 and \$0xfefefefe,$tp2
958 xor 3($td,%rbx,8),%eax 1508 and \$0x1b1b1b1b,$acc
959 movzb %dh,%ebx 1509 xor $tp2,$acc
960 and \$0xFF,%edx 1510 mov $acc,$tp2
961 movzb 2($te,%rdx,8),%rdx 1511
962 movzb 2($te,%rbx,8),%rbx 1512 and \$0x80808080,$acc
963 xor 2($td,%rdx,8),%eax 1513 mov $acc,$tp8
964 xor 1($td,%rbx,8),%eax 1514 shr \$7,$tp8
965 mov %eax,$i($ptr) 1515 lea 0($tp2,$tp2),$tp4
1516 sub $tp8,$acc
1517 and \$0xfefefefe,$tp4
1518 and \$0x1b1b1b1b,$acc
1519 xor $tp1,$tp2 # tp2^tp1
1520 xor $tp4,$acc
1521 mov $acc,$tp4
1522
1523 and \$0x80808080,$acc
1524 mov $acc,$tp8
1525 shr \$7,$tp8
1526 sub $tp8,$acc
1527 lea 0($tp4,$tp4),$tp8
1528 xor $tp1,$tp4 # tp4^tp1
1529 and \$0xfefefefe,$tp8
1530 and \$0x1b1b1b1b,$acc
1531 xor $acc,$tp8
1532
1533 xor $tp8,$tp1 # tp1^tp8
1534 rol \$8,$tp1 # ROTATE(tp1^tp8,8)
1535 xor $tp8,$tp2 # tp2^tp1^tp8
1536 xor $tp8,$tp4 # tp4^tp1^tp8
1537 xor $tp2,$tp8
1538 xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
1539
1540 xor $tp8,$tp1
1541 rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24)
1542 xor $tp2,$tp1
1543 rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16)
1544 xor $tp4,$tp1
1545
1546 mov $tp1,$i($ptr)
966___ 1547___
967} 1548}
968 1549
@@ -973,19 +1554,23 @@ $code.=<<___;
973.type AES_set_decrypt_key,\@function,3 1554.type AES_set_decrypt_key,\@function,3
974.align 16 1555.align 16
975AES_set_decrypt_key: 1556AES_set_decrypt_key:
976 push %rdx 1557 push %rbx
977 call AES_set_encrypt_key 1558 push %rbp
978 cmp \$0,%eax 1559 push %r12
979 je .Lproceed 1560 push %r13
980 lea 24(%rsp),%rsp 1561 push %r14
981 ret 1562 push %r15
982.Lproceed: 1563 push %rdx # save key schedule
1564.Ldec_key_prologue:
1565
1566 call _x86_64_AES_set_encrypt_key
983 mov (%rsp),%r8 # restore key schedule 1567 mov (%rsp),%r8 # restore key schedule
984 mov %rbx,(%rsp) 1568 cmp \$0,%eax
1569 jne .Labort
985 1570
986 mov 240(%r8),%ecx # pull number of rounds 1571 mov 240(%r8),%r14d # pull number of rounds
987 xor %rdi,%rdi 1572 xor %rdi,%rdi
988 lea (%rdi,%rcx,4),%rcx 1573 lea (%rdi,%r14d,4),%rcx
989 mov %r8,%rsi 1574 mov %r8,%rsi
990 lea (%r8,%rcx,4),%rdi # pointer to last chunk 1575 lea (%r8,%rcx,4),%rdi # pointer to last chunk
991.align 4 1576.align 4
@@ -1003,27 +1588,39 @@ AES_set_decrypt_key:
1003 cmp %rsi,%rdi 1588 cmp %rsi,%rdi
1004 jne .Linvert 1589 jne .Linvert
1005 1590
1006 .picmeup %r9 1591 lea .LAES_Te+2048+1024(%rip),%rax # rcon
1007 lea AES_Td-.(%r9),%rdi
1008 lea AES_Te-AES_Td(%rdi),%r9
1009 1592
1010 mov %r8,%rsi 1593 mov 40(%rax),$mask80
1011 mov 240(%r8),%ecx # pull number of rounds 1594 mov 48(%rax),$maskfe
1012 sub \$1,%ecx 1595 mov 56(%rax),$mask1b
1596
1597 mov %r8,$key
1598 sub \$1,%r14d
1013.align 4 1599.align 4
1014.Lpermute: 1600.Lpermute:
1015 lea 16(%rsi),%rsi 1601 lea 16($key),$key
1602 mov 0($key),%rax
1603 mov 8($key),%rcx
1016___ 1604___
1017 &deckey (0,"%rsi","%r9","%rdi"); 1605 &dectransform ();
1018 &deckey (4,"%rsi","%r9","%rdi");
1019 &deckey (8,"%rsi","%r9","%rdi");
1020 &deckey (12,"%rsi","%r9","%rdi");
1021$code.=<<___; 1606$code.=<<___;
1022 sub \$1,%ecx 1607 mov %eax,0($key)
1608 mov %ebx,4($key)
1609 mov %ecx,8($key)
1610 mov %edx,12($key)
1611 sub \$1,%r14d
1023 jnz .Lpermute 1612 jnz .Lpermute
1024 1613
1025 xor %rax,%rax 1614 xor %rax,%rax
1026 pop %rbx 1615.Labort:
1616 mov 8(%rsp),%r15
1617 mov 16(%rsp),%r14
1618 mov 24(%rsp),%r13
1619 mov 32(%rsp),%r12
1620 mov 40(%rsp),%rbp
1621 mov 48(%rsp),%rbx
1622 add \$56,%rsp
1623.Ldec_key_epilogue:
1027 ret 1624 ret
1028.size AES_set_decrypt_key,.-AES_set_decrypt_key 1625.size AES_set_decrypt_key,.-AES_set_decrypt_key
1029___ 1626___
@@ -1034,47 +1631,59 @@ ___
1034{ 1631{
1035# stack frame layout 1632# stack frame layout
1036# -8(%rsp) return address 1633# -8(%rsp) return address
1037my $_rsp="0(%rsp)"; # saved %rsp 1634my $keyp="0(%rsp)"; # one to pass as $key
1038my $_len="8(%rsp)"; # copy of 3rd parameter, length 1635my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds])
1039my $_key="16(%rsp)"; # copy of 4th parameter, key 1636my $_rsp="16(%rsp)"; # saved %rsp
1040my $_ivp="24(%rsp)"; # copy of 5th parameter, ivp 1637my $_inp="24(%rsp)"; # copy of 1st parameter, inp
1041my $keyp="32(%rsp)"; # one to pass as $key 1638my $_out="32(%rsp)"; # copy of 2nd parameter, out
1042my $ivec="40(%rsp)"; # ivec[16] 1639my $_len="40(%rsp)"; # copy of 3rd parameter, length
1043my $aes_key="56(%rsp)"; # copy of aes_key 1640my $_key="48(%rsp)"; # copy of 4th parameter, key
1044my $mark="56+240(%rsp)"; # copy of aes_key->rounds 1641my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp
1642my $ivec="64(%rsp)"; # ivec[16]
1643my $aes_key="80(%rsp)"; # copy of aes_key
1644my $mark="80+240(%rsp)"; # copy of aes_key->rounds
1045 1645
1046$code.=<<___; 1646$code.=<<___;
1047.globl AES_cbc_encrypt 1647.globl AES_cbc_encrypt
1048.type AES_cbc_encrypt,\@function,6 1648.type AES_cbc_encrypt,\@function,6
1049.align 16 1649.align 16
1650.extern OPENSSL_ia32cap_P
1050AES_cbc_encrypt: 1651AES_cbc_encrypt:
1051 cmp \$0,%rdx # check length 1652 cmp \$0,%rdx # check length
1052 je .Lcbc_just_ret 1653 je .Lcbc_epilogue
1654 pushfq
1053 push %rbx 1655 push %rbx
1054 push %rbp 1656 push %rbp
1055 push %r12 1657 push %r12
1056 push %r13 1658 push %r13
1057 push %r14 1659 push %r14
1058 push %r15 1660 push %r15
1059 pushfq 1661.Lcbc_prologue:
1662
1060 cld 1663 cld
1061 mov %r9d,%r9d # clear upper half of enc 1664 mov %r9d,%r9d # clear upper half of enc
1062 1665
1063 .picmeup $sbox 1666 lea .LAES_Te(%rip),$sbox
1064.Lcbc_pic_point:
1065
1066 cmp \$0,%r9 1667 cmp \$0,%r9
1067 je .LDECRYPT 1668 jne .Lcbc_picked_te
1068 1669 lea .LAES_Td(%rip),$sbox
1069 lea AES_Te-.Lcbc_pic_point($sbox),$sbox 1670.Lcbc_picked_te:
1671
1672 mov OPENSSL_ia32cap_P(%rip),%r10d
1673 cmp \$$speed_limit,%rdx
1674 jb .Lcbc_slow_prologue
1675 test \$15,%rdx
1676 jnz .Lcbc_slow_prologue
1677 bt \$28,%r10d
1678 jc .Lcbc_slow_prologue
1070 1679
1071 # allocate aligned stack frame... 1680 # allocate aligned stack frame...
1072 lea -64-248(%rsp),$key 1681 lea -88-248(%rsp),$key
1073 and \$-64,$key 1682 and \$-64,$key
1074 1683
1075 # ... and make it doesn't alias with AES_Te modulo 4096 1684 # ... and make sure it doesn't alias with AES_T[ed] modulo 4096
1076 mov $sbox,%r10 1685 mov $sbox,%r10
1077 lea 2048($sbox),%r11 1686 lea 2304($sbox),%r11
1078 mov $key,%r12 1687 mov $key,%r12
1079 and \$0xFFF,%r10 # s = $sbox&0xfff 1688 and \$0xFFF,%r10 # s = $sbox&0xfff
1080 and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff 1689 and \$0xFFF,%r11 # e = ($sbox+2304)&0xfff
@@ -1094,22 +1703,27 @@ AES_cbc_encrypt:
1094.Lcbc_te_ok: 1703.Lcbc_te_ok:
1095 1704
1096 xchg %rsp,$key 1705 xchg %rsp,$key
1097 add \$8,%rsp # reserve for return address! 1706 #add \$8,%rsp # reserve for return address!
1098 mov $key,$_rsp # save %rsp 1707 mov $key,$_rsp # save %rsp
1708.Lcbc_fast_body:
1709 mov %rdi,$_inp # save copy of inp
1710 mov %rsi,$_out # save copy of out
1099 mov %rdx,$_len # save copy of len 1711 mov %rdx,$_len # save copy of len
1100 mov %rcx,$_key # save copy of key 1712 mov %rcx,$_key # save copy of key
1101 mov %r8,$_ivp # save copy of ivp 1713 mov %r8,$_ivp # save copy of ivp
1102 movl \$0,$mark # copy of aes_key->rounds = 0; 1714 movl \$0,$mark # copy of aes_key->rounds = 0;
1103 mov %r8,%rbp # rearrange input arguments 1715 mov %r8,%rbp # rearrange input arguments
1716 mov %r9,%rbx
1104 mov %rsi,$out 1717 mov %rsi,$out
1105 mov %rdi,$inp 1718 mov %rdi,$inp
1106 mov %rcx,$key 1719 mov %rcx,$key
1107 1720
1721 mov 240($key),%eax # key->rounds
1108 # do we copy key schedule to stack? 1722 # do we copy key schedule to stack?
1109 mov $key,%r10 1723 mov $key,%r10
1110 sub $sbox,%r10 1724 sub $sbox,%r10
1111 and \$0xfff,%r10 1725 and \$0xfff,%r10
1112 cmp \$2048,%r10 1726 cmp \$2304,%r10
1113 jb .Lcbc_do_ecopy 1727 jb .Lcbc_do_ecopy
1114 cmp \$4096-248,%r10 1728 cmp \$4096-248,%r10
1115 jb .Lcbc_skip_ecopy 1729 jb .Lcbc_skip_ecopy
@@ -1120,12 +1734,11 @@ AES_cbc_encrypt:
1120 lea $aes_key,$key 1734 lea $aes_key,$key
1121 mov \$240/8,%ecx 1735 mov \$240/8,%ecx
1122 .long 0x90A548F3 # rep movsq 1736 .long 0x90A548F3 # rep movsq
1123 mov (%rsi),%eax # copy aes_key->rounds 1737 mov %eax,(%rdi) # copy aes_key->rounds
1124 mov %eax,(%rdi)
1125.Lcbc_skip_ecopy: 1738.Lcbc_skip_ecopy:
1126 mov $key,$keyp # save key pointer 1739 mov $key,$keyp # save key pointer
1127 1740
1128 mov \$16,%ecx 1741 mov \$18,%ecx
1129.align 4 1742.align 4
1130.Lcbc_prefetch_te: 1743.Lcbc_prefetch_te:
1131 mov 0($sbox),%r10 1744 mov 0($sbox),%r10
@@ -1135,184 +1748,77 @@ AES_cbc_encrypt:
1135 lea 128($sbox),$sbox 1748 lea 128($sbox),$sbox
1136 sub \$1,%ecx 1749 sub \$1,%ecx
1137 jnz .Lcbc_prefetch_te 1750 jnz .Lcbc_prefetch_te
1138 sub \$2048,$sbox 1751 lea -2304($sbox),$sbox
1139 1752
1140 test \$-16,%rdx # check upon length 1753 cmp \$0,%rbx
1141 mov %rdx,%r10 1754 je .LFAST_DECRYPT
1755
1756#----------------------------- ENCRYPT -----------------------------#
1142 mov 0(%rbp),$s0 # load iv 1757 mov 0(%rbp),$s0 # load iv
1143 mov 4(%rbp),$s1 1758 mov 4(%rbp),$s1
1144 mov 8(%rbp),$s2 1759 mov 8(%rbp),$s2
1145 mov 12(%rbp),$s3 1760 mov 12(%rbp),$s3
1146 jz .Lcbc_enc_tail # short input...
1147 1761
1148.align 4 1762.align 4
1149.Lcbc_enc_loop: 1763.Lcbc_fast_enc_loop:
1150 xor 0($inp),$s0 1764 xor 0($inp),$s0
1151 xor 4($inp),$s1 1765 xor 4($inp),$s1
1152 xor 8($inp),$s2 1766 xor 8($inp),$s2
1153 xor 12($inp),$s3 1767 xor 12($inp),$s3
1154 mov $inp,$ivec # if ($verticalspin) save inp
1155
1156 mov $keyp,$key # restore key 1768 mov $keyp,$key # restore key
1769 mov $inp,$_inp # if ($verticalspin) save inp
1770
1157 call _x86_64_AES_encrypt 1771 call _x86_64_AES_encrypt
1158 1772
1159 mov $ivec,$inp # if ($verticalspin) restore inp 1773 mov $_inp,$inp # if ($verticalspin) restore inp
1774 mov $_len,%r10
1160 mov $s0,0($out) 1775 mov $s0,0($out)
1161 mov $s1,4($out) 1776 mov $s1,4($out)
1162 mov $s2,8($out) 1777 mov $s2,8($out)
1163 mov $s3,12($out) 1778 mov $s3,12($out)
1164 1779
1165 mov $_len,%r10
1166 lea 16($inp),$inp 1780 lea 16($inp),$inp
1167 lea 16($out),$out 1781 lea 16($out),$out
1168 sub \$16,%r10 1782 sub \$16,%r10
1169 test \$-16,%r10 1783 test \$-16,%r10
1170 mov %r10,$_len 1784 mov %r10,$_len
1171 jnz .Lcbc_enc_loop 1785 jnz .Lcbc_fast_enc_loop
1172 test \$15,%r10
1173 jnz .Lcbc_enc_tail
1174 mov $_ivp,%rbp # restore ivp 1786 mov $_ivp,%rbp # restore ivp
1175 mov $s0,0(%rbp) # save ivec 1787 mov $s0,0(%rbp) # save ivec
1176 mov $s1,4(%rbp) 1788 mov $s1,4(%rbp)
1177 mov $s2,8(%rbp) 1789 mov $s2,8(%rbp)
1178 mov $s3,12(%rbp) 1790 mov $s3,12(%rbp)
1179 1791
1180.align 4 1792 jmp .Lcbc_fast_cleanup
1181.Lcbc_cleanup: 1793
1182 cmpl \$0,$mark # was the key schedule copied?
1183 lea $aes_key,%rdi
1184 mov $_rsp,%rsp
1185 je .Lcbc_exit
1186 mov \$240/8,%ecx
1187 xor %rax,%rax
1188 .long 0x90AB48F3 # rep stosq
1189.Lcbc_exit:
1190 popfq
1191 pop %r15
1192 pop %r14
1193 pop %r13
1194 pop %r12
1195 pop %rbp
1196 pop %rbx
1197.Lcbc_just_ret:
1198 ret
1199.align 4
1200.Lcbc_enc_tail:
1201 mov %rax,%r11
1202 mov %rcx,%r12
1203 mov %r10,%rcx
1204 mov $inp,%rsi
1205 mov $out,%rdi
1206 .long 0xF689A4F3 # rep movsb
1207 mov \$16,%rcx # zero tail
1208 sub %r10,%rcx
1209 xor %rax,%rax
1210 .long 0xF689AAF3 # rep stosb
1211 mov $out,$inp # this is not a mistake!
1212 movq \$16,$_len # len=16
1213 mov %r11,%rax
1214 mov %r12,%rcx
1215 jmp .Lcbc_enc_loop # one more spin...
1216#----------------------------- DECRYPT -----------------------------# 1794#----------------------------- DECRYPT -----------------------------#
1217.align 16 1795.align 16
1218.LDECRYPT: 1796.LFAST_DECRYPT:
1219 lea AES_Td-.Lcbc_pic_point($sbox),$sbox
1220
1221 # allocate aligned stack frame...
1222 lea -64-248(%rsp),$key
1223 and \$-64,$key
1224
1225 # ... and make sure it doesn't alias with AES_Td modulo 4096
1226 mov $sbox,%r10
1227 lea 2304($sbox),%r11
1228 mov $key,%r12
1229 and \$0xFFF,%r10 # s = $sbox&0xfff
1230 and \$0xFFF,%r11 # e = ($sbox+2048+256)&0xfff
1231 and \$0xFFF,%r12 # p = %rsp&0xfff
1232
1233 cmp %r11,%r12 # if (p>=e) %rsp -= (p-e);
1234 jb .Lcbc_td_break_out
1235 sub %r11,%r12
1236 sub %r12,$key
1237 jmp .Lcbc_td_ok
1238.Lcbc_td_break_out: # else %rsp -= (p-s)&0xfff + framesz
1239 sub %r10,%r12
1240 and \$0xFFF,%r12
1241 add \$320,%r12
1242 sub %r12,$key
1243.align 4
1244.Lcbc_td_ok:
1245
1246 xchg %rsp,$key
1247 add \$8,%rsp # reserve for return address!
1248 mov $key,$_rsp # save %rsp
1249 mov %rdx,$_len # save copy of len
1250 mov %rcx,$_key # save copy of key
1251 mov %r8,$_ivp # save copy of ivp
1252 movl \$0,$mark # copy of aes_key->rounds = 0;
1253 mov %r8,%rbp # rearrange input arguments
1254 mov %rsi,$out
1255 mov %rdi,$inp
1256 mov %rcx,$key
1257
1258 # do we copy key schedule to stack?
1259 mov $key,%r10
1260 sub $sbox,%r10
1261 and \$0xfff,%r10
1262 cmp \$2304,%r10
1263 jb .Lcbc_do_dcopy
1264 cmp \$4096-248,%r10
1265 jb .Lcbc_skip_dcopy
1266.align 4
1267.Lcbc_do_dcopy:
1268 mov $key,%rsi
1269 lea $aes_key,%rdi
1270 lea $aes_key,$key
1271 mov \$240/8,%ecx
1272 .long 0x90A548F3 # rep movsq
1273 mov (%rsi),%eax # copy aes_key->rounds
1274 mov %eax,(%rdi)
1275.Lcbc_skip_dcopy:
1276 mov $key,$keyp # save key pointer
1277
1278 mov \$18,%ecx
1279.align 4
1280.Lcbc_prefetch_td:
1281 mov 0($sbox),%r10
1282 mov 32($sbox),%r11
1283 mov 64($sbox),%r12
1284 mov 96($sbox),%r13
1285 lea 128($sbox),$sbox
1286 sub \$1,%ecx
1287 jnz .Lcbc_prefetch_td
1288 sub \$2304,$sbox
1289
1290 cmp $inp,$out 1797 cmp $inp,$out
1291 je .Lcbc_dec_in_place 1798 je .Lcbc_fast_dec_in_place
1292 1799
1293 mov %rbp,$ivec 1800 mov %rbp,$ivec
1294.align 4 1801.align 4
1295.Lcbc_dec_loop: 1802.Lcbc_fast_dec_loop:
1296 mov 0($inp),$s0 # read input 1803 mov 0($inp),$s0 # read input
1297 mov 4($inp),$s1 1804 mov 4($inp),$s1
1298 mov 8($inp),$s2 1805 mov 8($inp),$s2
1299 mov 12($inp),$s3 1806 mov 12($inp),$s3
1300 mov $inp,8+$ivec # if ($verticalspin) save inp
1301
1302 mov $keyp,$key # restore key 1807 mov $keyp,$key # restore key
1808 mov $inp,$_inp # if ($verticalspin) save inp
1809
1303 call _x86_64_AES_decrypt 1810 call _x86_64_AES_decrypt
1304 1811
1305 mov $ivec,%rbp # load ivp 1812 mov $ivec,%rbp # load ivp
1306 mov 8+$ivec,$inp # if ($verticalspin) restore inp 1813 mov $_inp,$inp # if ($verticalspin) restore inp
1814 mov $_len,%r10 # load len
1307 xor 0(%rbp),$s0 # xor iv 1815 xor 0(%rbp),$s0 # xor iv
1308 xor 4(%rbp),$s1 1816 xor 4(%rbp),$s1
1309 xor 8(%rbp),$s2 1817 xor 8(%rbp),$s2
1310 xor 12(%rbp),$s3 1818 xor 12(%rbp),$s3
1311 mov $inp,%rbp # current input, next iv 1819 mov $inp,%rbp # current input, next iv
1312 1820
1313 mov $_len,%r10 # load len
1314 sub \$16,%r10 1821 sub \$16,%r10
1315 jc .Lcbc_dec_partial
1316 mov %r10,$_len # update len 1822 mov %r10,$_len # update len
1317 mov %rbp,$ivec # update ivp 1823 mov %rbp,$ivec # update ivp
1318 1824
@@ -1323,81 +1829,281 @@ AES_cbc_encrypt:
1323 1829
1324 lea 16($inp),$inp 1830 lea 16($inp),$inp
1325 lea 16($out),$out 1831 lea 16($out),$out
1326 jnz .Lcbc_dec_loop 1832 jnz .Lcbc_fast_dec_loop
1327.Lcbc_dec_end:
1328 mov $_ivp,%r12 # load user ivp 1833 mov $_ivp,%r12 # load user ivp
1329 mov 0(%rbp),%r10 # load iv 1834 mov 0(%rbp),%r10 # load iv
1330 mov 8(%rbp),%r11 1835 mov 8(%rbp),%r11
1331 mov %r10,0(%r12) # copy back to user 1836 mov %r10,0(%r12) # copy back to user
1332 mov %r11,8(%r12) 1837 mov %r11,8(%r12)
1333 jmp .Lcbc_cleanup 1838 jmp .Lcbc_fast_cleanup
1334
1335.align 4
1336.Lcbc_dec_partial:
1337 mov $s0,0+$ivec # dump output to stack
1338 mov $s1,4+$ivec
1339 mov $s2,8+$ivec
1340 mov $s3,12+$ivec
1341 mov $out,%rdi
1342 lea $ivec,%rsi
1343 mov \$16,%rcx
1344 add %r10,%rcx # number of bytes to copy
1345 .long 0xF689A4F3 # rep movsb
1346 jmp .Lcbc_dec_end
1347 1839
1348.align 16 1840.align 16
1349.Lcbc_dec_in_place: 1841.Lcbc_fast_dec_in_place:
1842 mov 0(%rbp),%r10 # copy iv to stack
1843 mov 8(%rbp),%r11
1844 mov %r10,0+$ivec
1845 mov %r11,8+$ivec
1846.align 4
1847.Lcbc_fast_dec_in_place_loop:
1350 mov 0($inp),$s0 # load input 1848 mov 0($inp),$s0 # load input
1351 mov 4($inp),$s1 1849 mov 4($inp),$s1
1352 mov 8($inp),$s2 1850 mov 8($inp),$s2
1353 mov 12($inp),$s3 1851 mov 12($inp),$s3
1852 mov $keyp,$key # restore key
1853 mov $inp,$_inp # if ($verticalspin) save inp
1354 1854
1355 mov $inp,$ivec # if ($verticalspin) save inp
1356 mov $keyp,$key
1357 call _x86_64_AES_decrypt 1855 call _x86_64_AES_decrypt
1358 1856
1359 mov $ivec,$inp # if ($verticalspin) restore inp 1857 mov $_inp,$inp # if ($verticalspin) restore inp
1360 mov $_ivp,%rbp 1858 mov $_len,%r10
1361 xor 0(%rbp),$s0 1859 xor 0+$ivec,$s0
1362 xor 4(%rbp),$s1 1860 xor 4+$ivec,$s1
1363 xor 8(%rbp),$s2 1861 xor 8+$ivec,$s2
1364 xor 12(%rbp),$s3 1862 xor 12+$ivec,$s3
1863
1864 mov 0($inp),%r11 # load input
1865 mov 8($inp),%r12
1866 sub \$16,%r10
1867 jz .Lcbc_fast_dec_in_place_done
1365 1868
1366 mov 0($inp),%r10 # copy input to iv 1869 mov %r11,0+$ivec # copy input to iv
1367 mov 8($inp),%r11 1870 mov %r12,8+$ivec
1368 mov %r10,0(%rbp)
1369 mov %r11,8(%rbp)
1370 1871
1371 mov $s0,0($out) # save output [zaps input] 1872 mov $s0,0($out) # save output [zaps input]
1372 mov $s1,4($out) 1873 mov $s1,4($out)
1373 mov $s2,8($out) 1874 mov $s2,8($out)
1374 mov $s3,12($out) 1875 mov $s3,12($out)
1375 1876
1376 mov $_len,%rcx
1377 lea 16($inp),$inp 1877 lea 16($inp),$inp
1378 lea 16($out),$out 1878 lea 16($out),$out
1379 sub \$16,%rcx 1879 mov %r10,$_len
1380 jc .Lcbc_dec_in_place_partial 1880 jmp .Lcbc_fast_dec_in_place_loop
1381 mov %rcx,$_len 1881.Lcbc_fast_dec_in_place_done:
1382 jnz .Lcbc_dec_in_place 1882 mov $_ivp,%rdi
1383 jmp .Lcbc_cleanup 1883 mov %r11,0(%rdi) # copy iv back to user
1884 mov %r12,8(%rdi)
1885
1886 mov $s0,0($out) # save output [zaps input]
1887 mov $s1,4($out)
1888 mov $s2,8($out)
1889 mov $s3,12($out)
1384 1890
1385.align 4 1891.align 4
1386.Lcbc_dec_in_place_partial: 1892.Lcbc_fast_cleanup:
1387 # one can argue if this is actually required 1893 cmpl \$0,$mark # was the key schedule copied?
1388 lea ($out,%rcx),%rdi 1894 lea $aes_key,%rdi
1389 lea (%rbp,%rcx),%rsi 1895 je .Lcbc_exit
1390 neg %rcx 1896 mov \$240/8,%ecx
1391 .long 0xF689A4F3 # rep movsb # restore tail 1897 xor %rax,%rax
1392 jmp .Lcbc_cleanup 1898 .long 0x90AB48F3 # rep stosq
1899
1900 jmp .Lcbc_exit
1901
1902#--------------------------- SLOW ROUTINE ---------------------------#
1903.align 16
1904.Lcbc_slow_prologue:
1905 # allocate aligned stack frame...
1906 lea -88(%rsp),%rbp
1907 and \$-64,%rbp
1908 # ... just "above" key schedule
1909 lea -88-63(%rcx),%r10
1910 sub %rbp,%r10
1911 neg %r10
1912 and \$0x3c0,%r10
1913 sub %r10,%rbp
1914
1915 xchg %rsp,%rbp
1916 #add \$8,%rsp # reserve for return address!
1917 mov %rbp,$_rsp # save %rsp
1918.Lcbc_slow_body:
1919 #mov %rdi,$_inp # save copy of inp
1920 #mov %rsi,$_out # save copy of out
1921 #mov %rdx,$_len # save copy of len
1922 #mov %rcx,$_key # save copy of key
1923 mov %r8,$_ivp # save copy of ivp
1924 mov %r8,%rbp # rearrange input arguments
1925 mov %r9,%rbx
1926 mov %rsi,$out
1927 mov %rdi,$inp
1928 mov %rcx,$key
1929 mov %rdx,%r10
1930
1931 mov 240($key),%eax
1932 mov $key,$keyp # save key pointer
1933 shl \$4,%eax
1934 lea ($key,%rax),%rax
1935 mov %rax,$keyend
1936
1937 # pick Te4 copy which can't "overlap" with stack frame or key schedule
1938 lea 2048($sbox),$sbox
1939 lea 768-8(%rsp),%rax
1940 sub $sbox,%rax
1941 and \$0x300,%rax
1942 lea ($sbox,%rax),$sbox
1943
1944 cmp \$0,%rbx
1945 je .LSLOW_DECRYPT
1946
1947#--------------------------- SLOW ENCRYPT ---------------------------#
1948 test \$-16,%r10 # check upon length
1949 mov 0(%rbp),$s0 # load iv
1950 mov 4(%rbp),$s1
1951 mov 8(%rbp),$s2
1952 mov 12(%rbp),$s3
1953 jz .Lcbc_slow_enc_tail # short input...
1954
1955.align 4
1956.Lcbc_slow_enc_loop:
1957 xor 0($inp),$s0
1958 xor 4($inp),$s1
1959 xor 8($inp),$s2
1960 xor 12($inp),$s3
1961 mov $keyp,$key # restore key
1962 mov $inp,$_inp # save inp
1963 mov $out,$_out # save out
1964 mov %r10,$_len # save len
1965
1966 call _x86_64_AES_encrypt_compact
1967
1968 mov $_inp,$inp # restore inp
1969 mov $_out,$out # restore out
1970 mov $_len,%r10 # restore len
1971 mov $s0,0($out)
1972 mov $s1,4($out)
1973 mov $s2,8($out)
1974 mov $s3,12($out)
1975
1976 lea 16($inp),$inp
1977 lea 16($out),$out
1978 sub \$16,%r10
1979 test \$-16,%r10
1980 jnz .Lcbc_slow_enc_loop
1981 test \$15,%r10
1982 jnz .Lcbc_slow_enc_tail
1983 mov $_ivp,%rbp # restore ivp
1984 mov $s0,0(%rbp) # save ivec
1985 mov $s1,4(%rbp)
1986 mov $s2,8(%rbp)
1987 mov $s3,12(%rbp)
1988
1989 jmp .Lcbc_exit
1990
1991.align 4
1992.Lcbc_slow_enc_tail:
1993 mov %rax,%r11
1994 mov %rcx,%r12
1995 mov %r10,%rcx
1996 mov $inp,%rsi
1997 mov $out,%rdi
1998 .long 0x9066A4F3 # rep movsb
1999 mov \$16,%rcx # zero tail
2000 sub %r10,%rcx
2001 xor %rax,%rax
2002 .long 0x9066AAF3 # rep stosb
2003 mov $out,$inp # this is not a mistake!
2004 mov \$16,%r10 # len=16
2005 mov %r11,%rax
2006 mov %r12,%rcx
2007 jmp .Lcbc_slow_enc_loop # one more spin...
2008#--------------------------- SLOW DECRYPT ---------------------------#
2009.align 16
2010.LSLOW_DECRYPT:
2011 shr \$3,%rax
2012 add %rax,$sbox # recall "magic" constants!
2013
2014 mov 0(%rbp),%r11 # copy iv to stack
2015 mov 8(%rbp),%r12
2016 mov %r11,0+$ivec
2017 mov %r12,8+$ivec
2018
2019.align 4
2020.Lcbc_slow_dec_loop:
2021 mov 0($inp),$s0 # load input
2022 mov 4($inp),$s1
2023 mov 8($inp),$s2
2024 mov 12($inp),$s3
2025 mov $keyp,$key # restore key
2026 mov $inp,$_inp # save inp
2027 mov $out,$_out # save out
2028 mov %r10,$_len # save len
2029
2030 call _x86_64_AES_decrypt_compact
2031
2032 mov $_inp,$inp # restore inp
2033 mov $_out,$out # restore out
2034 mov $_len,%r10
2035 xor 0+$ivec,$s0
2036 xor 4+$ivec,$s1
2037 xor 8+$ivec,$s2
2038 xor 12+$ivec,$s3
2039
2040 mov 0($inp),%r11 # load input
2041 mov 8($inp),%r12
2042 sub \$16,%r10
2043 jc .Lcbc_slow_dec_partial
2044 jz .Lcbc_slow_dec_done
2045
2046 mov %r11,0+$ivec # copy input to iv
2047 mov %r12,8+$ivec
2048
2049 mov $s0,0($out) # save output [can zap input]
2050 mov $s1,4($out)
2051 mov $s2,8($out)
2052 mov $s3,12($out)
2053
2054 lea 16($inp),$inp
2055 lea 16($out),$out
2056 jmp .Lcbc_slow_dec_loop
2057.Lcbc_slow_dec_done:
2058 mov $_ivp,%rdi
2059 mov %r11,0(%rdi) # copy iv back to user
2060 mov %r12,8(%rdi)
2061
2062 mov $s0,0($out) # save output [can zap input]
2063 mov $s1,4($out)
2064 mov $s2,8($out)
2065 mov $s3,12($out)
2066
2067 jmp .Lcbc_exit
2068
2069.align 4
2070.Lcbc_slow_dec_partial:
2071 mov $_ivp,%rdi
2072 mov %r11,0(%rdi) # copy iv back to user
2073 mov %r12,8(%rdi)
2074
2075 mov $s0,0+$ivec # save output to stack
2076 mov $s1,4+$ivec
2077 mov $s2,8+$ivec
2078 mov $s3,12+$ivec
2079
2080 mov $out,%rdi
2081 lea $ivec,%rsi
2082 lea 16(%r10),%rcx
2083 .long 0x9066A4F3 # rep movsb
2084 jmp .Lcbc_exit
2085
2086.align 16
2087.Lcbc_exit:
2088 mov $_rsp,%rsi
2089 mov (%rsi),%r15
2090 mov 8(%rsi),%r14
2091 mov 16(%rsi),%r13
2092 mov 24(%rsi),%r12
2093 mov 32(%rsi),%rbp
2094 mov 40(%rsi),%rbx
2095 lea 48(%rsi),%rsp
2096.Lcbc_popfq:
2097 popfq
2098.Lcbc_epilogue:
2099 ret
1393.size AES_cbc_encrypt,.-AES_cbc_encrypt 2100.size AES_cbc_encrypt,.-AES_cbc_encrypt
1394___ 2101___
1395} 2102}
1396 2103
1397$code.=<<___; 2104$code.=<<___;
1398.globl AES_Te
1399.align 64 2105.align 64
1400AES_Te: 2106.LAES_Te:
1401___ 2107___
1402 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); 2108 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
1403 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); 2109 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
@@ -1463,16 +2169,149 @@ ___
1463 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); 2169 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
1464 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); 2170 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
1465 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); 2171 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
2172
2173#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
2174 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2175 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2176 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2177 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2178 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2179 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2180 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2181 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2182 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2183 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2184 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2185 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2186 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2187 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2188 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2189 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2190 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2191 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2192 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2193 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2194 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2195 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2196 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2197 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2198 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2199 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2200 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2201 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2202 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2203 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2204 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2205 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2206
2207 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2208 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2209 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2210 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2211 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2212 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2213 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2214 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2215 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2216 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2217 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2218 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2219 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2220 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2221 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2222 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2223 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2224 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2225 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2226 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2227 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2228 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2229 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2230 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2231 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2232 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2233 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2234 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2235 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2236 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2237 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2238 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2239
2240 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2241 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2242 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2243 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2244 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2245 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2246 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2247 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2248 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2249 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2250 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2251 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2252 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2253 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2254 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2255 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2256 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2257 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2258 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2259 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2260 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2261 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2262 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2263 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2264 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2265 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2266 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2267 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2268 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2269 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2270 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2271 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2272
2273 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2274 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2275 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2276 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2277 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2278 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2279 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2280 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2281 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2282 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2283 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2284 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2285 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2286 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2287 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2288 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2289 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2290 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2291 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2292 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2293 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2294 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2295 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2296 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2297 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2298 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2299 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2300 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2301 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2302 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2303 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2304 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1466#rcon: 2305#rcon:
1467$code.=<<___; 2306$code.=<<___;
1468 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 2307 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008
1469 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 2308 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080
1470 .long 0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0 2309 .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
2310 .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
1471___ 2311___
1472$code.=<<___; 2312$code.=<<___;
1473.globl AES_Td
1474.align 64 2313.align 64
1475AES_Td: 2314.LAES_Td:
1476___ 2315___
1477 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); 2316 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
1478 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); 2317 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
@@ -1538,7 +2377,116 @@ ___
1538 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); 2377 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
1539 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); 2378 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
1540 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); 2379 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
1541#Td4: 2380
2381#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
2382 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2383 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2384 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2385 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2386 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2387 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2388 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2389 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2390 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2391 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2392 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2393 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2394 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2395 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2396 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2397 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2398 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2399 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2400 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2401 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2402 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2403 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2404 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2405 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2406 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2407 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2408 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2409 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2410 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2411 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2412 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2413 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2414$code.=<<___;
2415 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2416 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2417___
2418 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2419 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2420 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2421 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2422 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2423 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2424 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2425 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2426 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2427 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2428 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2429 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2430 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2431 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2432 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2433 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2434 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2435 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2436 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2437 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2438 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2439 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2440 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2441 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2442 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2443 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2444 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2445 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2446 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2447 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2448 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2449 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2450$code.=<<___;
2451 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2452 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2453___
2454 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2455 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2456 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2457 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2458 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2459 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2460 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2461 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2462 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2463 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2464 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2465 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2466 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2467 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2468 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2469 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2470 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2471 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2472 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2473 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2474 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2475 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2476 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2477 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2478 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2479 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2480 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2481 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2482 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2483 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2484 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2485 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2486$code.=<<___;
2487 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2488 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2489___
1542 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); 2490 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1543 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); 2491 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1544 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); 2492 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
@@ -1571,6 +2519,288 @@ ___
1571 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); 2519 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1572 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); 2520 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1573 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); 2521 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
# Append the same two rows of constant words that follow the earlier copy of
# this byte table, then the module identification string and a 64-byte
# alignment directive.  The backslash before the at-sign in the .asciz text
# keeps Perl from interpolating it as an array inside the heredoc.
# NOTE(review): the constants look like per-byte GF(2^8) doubling masks; the
# code that consumes them is outside this chunk - confirm.
2522$code.=<<___;
2523 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2524 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2525.asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2526.align 64
2527___
2528
2529# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2530# CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Everything inside this conditional is emitted only when $win64 is set.
# The four variables below name the registers that carry the handler's
# arguments (rcx, rdx, r8, r9 - the first four integer argument registers
# of the Microsoft x64 calling convention) so the assembly text can refer
# to them as $rec, $frame, $context and $disp.
2531if ($win64) {
2532$rec="%rcx";
2533$frame="%rdx";
2534$context="%r8";
2535$disp="%r9";
2536
2537$code.=<<___;
2538.extern __imp_RtlVirtualUnwind
# block_se_handler - Win64 exception handler referenced from the .xdata
# records of AES_encrypt and AES_decrypt.
#   In: rcx = rec, rdx = frame, r8 = context, r9 = disp
# The handler compares context->Rip with two image-relative labels taken
# from HandlerData[] (prologue and epilogue) to decide how much of the
# interrupted routine's frame exists, writes the recovered Rsp, Rsi and Rdi
# (and, when Rip is inside the body, Rbx, Rbp and R12-R15) back into the
# CONTEXT record, and then jumps to .Lcommon_seh_exit.  That shared tail is
# located inside cbc_se_handler; it undoes the pushes made here and returns.
2539.type block_se_handler,\@abi-omnipotent
2540.align 16
2541block_se_handler:
# Save the registers this handler uses; the matching pops are in
# .Lcommon_seh_exit.
2542 push %rsi
2543 push %rdi
2544 push %rbx
2545 push %rbp
2546 push %r12
2547 push %r13
2548 push %r14
2549 push %r15
2550 pushfq
# Scratch area that .Lcommon_seh_exit fills at offsets 32..56 with the
# stack-passed arguments of its call.
2551 sub \$64,%rsp
2552
# rax starts as context->Rax and serves as the frame base when Rip is still
# ahead of the prologue label.
# NOTE(review): this presumes the routine's entry code keeps its entry-time
# stack pointer in rax until the prologue label - confirm against the
# function prologue, which is outside this chunk.
2553 mov 120($context),%rax # pull context->Rax
2554 mov 248($context),%rbx # pull context->Rip
2555
2556 mov 8($disp),%rsi # disp->ImageBase
2557 mov 56($disp),%r11 # disp->HandlerData
2558
# HandlerData[] entries are 32-bit offsets; adding ImageBase turns them
# into absolute addresses comparable with Rip.
2559 mov 0(%r11),%r10d # HandlerData[0]
2560 lea (%rsi,%r10),%r10 # prologue label
2561 cmp %r10,%rbx # context->Rip<prologue label
2562 jb .Lin_block_prologue
2563
# From the prologue label onwards the frame base is taken from Rsp instead.
2564 mov 152($context),%rax # pull context->Rsp
2565
# At or beyond the epilogue label no callee-saved registers are recovered;
# Rsp itself is used as the frame base.
2566 mov 4(%r11),%r10d # HandlerData[1]
2567 lea (%rsi,%r10),%r10 # epilogue label
2568 cmp %r10,%rbx # context->Rip>=epilogue label
2569 jae .Lin_block_prologue
2570
# Inside the body: fetch the stack pointer stored at 24(Rsp), then move rax
# 48 bytes up so that the six 8-byte register slots lie at -8 .. -48.
2571 mov 24(%rax),%rax # pull saved real stack pointer
2572 lea 48(%rax),%rax # adjust...
2573
2574 mov -8(%rax),%rbx
2575 mov -16(%rax),%rbp
2576 mov -24(%rax),%r12
2577 mov -32(%rax),%r13
2578 mov -40(%rax),%r14
2579 mov -48(%rax),%r15
2580 mov %rbx,144($context) # restore context->Rbx
2581 mov %rbp,160($context) # restore context->Rbp
2582 mov %r12,216($context) # restore context->R12
2583 mov %r13,224($context) # restore context->R13
2584 mov %r14,232($context) # restore context->R14
2585 mov %r15,240($context) # restore context->R15
2586
# Common to all three cases: rax is the recovered stack pointer.  The
# values for Rdi and Rsi are read from the slots 8 and 16 bytes above it.
# NOTE(review): presumably the routine's entry code stored the caller's
# rdi/rsi in those slots - confirm against the generated prologue.
2587.Lin_block_prologue:
2588 mov 8(%rax),%rdi
2589 mov 16(%rax),%rsi
2590 mov %rax,152($context) # restore context->Rsp
2591 mov %rsi,168($context) # restore context->Rsi
2592 mov %rdi,176($context) # restore context->Rdi
2593
2594 jmp .Lcommon_seh_exit
2595.size block_se_handler,.-block_se_handler
2596
# key_se_handler - Win64 exception handler referenced from the .xdata
# records of AES_set_encrypt_key and AES_set_decrypt_key.
#   In: rcx = rec, rdx = frame, r8 = context, r9 = disp
# Same scheme as block_se_handler: context->Rip is compared with the
# prologue and epilogue labels from HandlerData[], the recovered register
# values are written into the CONTEXT record, and control passes to
# .Lcommon_seh_exit (inside cbc_se_handler).  The only difference is how
# the frame base is derived when Rip lies inside the body: here it is a
# fixed 56 bytes above Rsp rather than a pointer loaded from the stack.
2597.type key_se_handler,\@abi-omnipotent
2598.align 16
2599key_se_handler:
# Save the registers this handler uses; the matching pops are in
# .Lcommon_seh_exit.
2600 push %rsi
2601 push %rdi
2602 push %rbx
2603 push %rbp
2604 push %r12
2605 push %r13
2606 push %r14
2607 push %r15
2608 pushfq
# Scratch area that .Lcommon_seh_exit fills at offsets 32..56 with the
# stack-passed arguments of its call.
2609 sub \$64,%rsp
2610
# rax starts as context->Rax and is the frame base while Rip is still ahead
# of the prologue label.
# NOTE(review): presumes the routine's entry code keeps its entry-time
# stack pointer in rax - confirm against the prologue outside this chunk.
2611 mov 120($context),%rax # pull context->Rax
2612 mov 248($context),%rbx # pull context->Rip
2613
2614 mov 8($disp),%rsi # disp->ImageBase
2615 mov 56($disp),%r11 # disp->HandlerData
2616
# HandlerData[] entries are 32-bit offsets relative to ImageBase.
2617 mov 0(%r11),%r10d # HandlerData[0]
2618 lea (%rsi,%r10),%r10 # prologue label
2619 cmp %r10,%rbx # context->Rip<prologue label
2620 jb .Lin_key_prologue
2621
2622 mov 152($context),%rax # pull context->Rsp
2623
# At or beyond the epilogue label Rsp itself is the frame base and no
# callee-saved registers are recovered.
2624 mov 4(%r11),%r10d # HandlerData[1]
2625 lea (%rsi,%r10),%r10 # epilogue label
2626 cmp %r10,%rbx # context->Rip>=epilogue label
2627 jae .Lin_key_prologue
2628
# Inside the body: the frame base is Rsp+56, which places the six register
# slots read below at Rsp+8 .. Rsp+48.
# NOTE(review): this implies one further 8-byte slot at 0(Rsp) beneath the
# saved registers - confirm against the key-setup prologue.
2629 lea 56(%rax),%rax
2630
2631 mov -8(%rax),%rbx
2632 mov -16(%rax),%rbp
2633 mov -24(%rax),%r12
2634 mov -32(%rax),%r13
2635 mov -40(%rax),%r14
2636 mov -48(%rax),%r15
2637 mov %rbx,144($context) # restore context->Rbx
2638 mov %rbp,160($context) # restore context->Rbp
2639 mov %r12,216($context) # restore context->R12
2640 mov %r13,224($context) # restore context->R13
2641 mov %r14,232($context) # restore context->R14
2642 mov %r15,240($context) # restore context->R15
2643
# Common to all three cases: rax is the recovered stack pointer; the values
# for Rdi and Rsi come from the slots 8 and 16 bytes above it.
2644.Lin_key_prologue:
2645 mov 8(%rax),%rdi
2646 mov 16(%rax),%rsi
2647 mov %rax,152($context) # restore context->Rsp
2648 mov %rsi,168($context) # restore context->Rsi
2649 mov %rdi,176($context) # restore context->Rdi
2650
2651 jmp .Lcommon_seh_exit
2652.size key_se_handler,.-key_se_handler
2653
# cbc_se_handler - Win64 exception handler referenced from the .xdata
# record of AES_cbc_encrypt.  It also contains .Lcommon_seh_exit, the tail
# shared with block_se_handler and key_se_handler.
#   In: rcx = rec, rdx = frame, r8 = context, r9 = disp
# Unlike the other two handlers it does not read HandlerData[]; it compares
# context->Rip directly with fixed labels of the CBC routine:
#   Rip <  .Lcbc_prologue       -> .Lin_cbc_prologue    (base = context->Rax)
#   Rip <  .Lcbc_fast_body      -> .Lin_cbc_frame_setup (base = context->Rax)
#   Rip <  .Lcbc_slow_prologue  -> .Lin_cbc_body
#   Rip <  .Lcbc_slow_body      -> .Lin_cbc_frame_setup (base = context->Rax)
#   otherwise                   -> .Lin_cbc_body
# NOTE(review): the two paths that use context->Rax as the base presume the
# routine keeps its entry-time stack pointer in rax during frame setup -
# confirm against AES_cbc_encrypt, which is outside this chunk.
2654.type cbc_se_handler,\@abi-omnipotent
2655.align 16
2656cbc_se_handler:
# Save the registers this handler uses; they are popped again at the end of
# .Lcommon_seh_exit.
2657 push %rsi
2658 push %rdi
2659 push %rbx
2660 push %rbp
2661 push %r12
2662 push %r13
2663 push %r14
2664 push %r15
2665 pushfq
# 64 bytes of scratch: offsets 32..56 receive the stack-passed arguments of
# the call made in .Lcommon_seh_exit.
2666 sub \$64,%rsp
2667
2668 mov 120($context),%rax # pull context->Rax
2669 mov 248($context),%rbx # pull context->Rip
2670
2671 lea .Lcbc_prologue(%rip),%r10
2672 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
2673 jb .Lin_cbc_prologue
2674
2675 lea .Lcbc_fast_body(%rip),%r10
2676 cmp %r10,%rbx # context->Rip<.Lcbc_fast_body
2677 jb .Lin_cbc_frame_setup
2678
2679 lea .Lcbc_slow_prologue(%rip),%r10
2680 cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue
2681 jb .Lin_cbc_body
2682
2683 lea .Lcbc_slow_body(%rip),%r10
2684 cmp %r10,%rbx # context->Rip<.Lcbc_slow_body
2685 jb .Lin_cbc_frame_setup
2686
# Rip is inside one of the two bodies (or beyond): work from Rsp.
2687.Lin_cbc_body:
2688 mov 152($context),%rax # pull context->Rsp
2689
# At or beyond .Lcbc_epilogue: Rsp itself is the frame base.
2690 lea .Lcbc_epilogue(%rip),%r10
2691 cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue
2692 jae .Lin_cbc_prologue
2693
# Between .Lcbc_popfq and .Lcbc_epilogue the frame base is Rsp+8.
# NOTE(review): presumably one 8-byte item (the saved flags) is still on
# the stack at that point - confirm against the routine's epilogue.
2694 lea 8(%rax),%rax
2695
2696 lea .Lcbc_popfq(%rip),%r10
2697 cmp %r10,%rbx # context->Rip>=.Lcbc_popfq
2698 jae .Lin_cbc_prologue
2699
# Otherwise load the stack pointer the routine stored at 16(Rsp) (the
# expression below evaluates to 8 because rax was already advanced by 8)
# and add 56 to obtain the frame base.
2700 mov `16-8`(%rax),%rax # biased $_rsp
2701 lea 56(%rax),%rax
2702
# rax is the frame base; the six register slots are read from -16 .. -56.
# NOTE(review): the slot at -8 is skipped - presumably it holds the flags
# pushed by the routine before the registers; confirm against its prologue.
2703.Lin_cbc_frame_setup:
2704 mov -16(%rax),%rbx
2705 mov -24(%rax),%rbp
2706 mov -32(%rax),%r12
2707 mov -40(%rax),%r13
2708 mov -48(%rax),%r14
2709 mov -56(%rax),%r15
2710 mov %rbx,144($context) # restore context->Rbx
2711 mov %rbp,160($context) # restore context->Rbp
2712 mov %r12,216($context) # restore context->R12
2713 mov %r13,224($context) # restore context->R13
2714 mov %r14,232($context) # restore context->R14
2715 mov %r15,240($context) # restore context->R15
2716
# rax is the recovered stack pointer; the values for Rdi and Rsi come from
# the slots 8 and 16 bytes above it.
2717.Lin_cbc_prologue:
2718 mov 8(%rax),%rdi
2719 mov 16(%rax),%rsi
2720 mov %rax,152($context) # restore context->Rsp
2721 mov %rsi,168($context) # restore context->Rsi
2722 mov %rdi,176($context) # restore context->Rdi
2723
# Shared tail, also reached by jmp from block_se_handler and
# key_se_handler.  Expects r8 = patched CONTEXT, r9 = disp, and the stack
# laid out by the nine pushes plus the 64-byte reservation made on entry.
2724.Lcommon_seh_exit:
2725
# Copy the patched CONTEXT over disp->ContextRecord, 154 quadwords
# (1232 bytes).  The two instructions are emitted as raw bytes by the
# .long directive.
2726 mov 40($disp),%rdi # disp->ContextRecord
2727 mov $context,%rsi # context
2728 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2729 .long 0xa548f3fc # cld; rep movsq
2730
# Call RtlVirtualUnwind through its import pointer: arguments 1-4 go in
# rcx, rdx, r8, r9 and arguments 5-8 in the stack slots at 32..56(%rsp).
2731 mov $disp,%rsi
2732 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2733 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2734 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2735 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2736 mov 40(%rsi),%r10 # disp->ContextRecord
2737 lea 56(%rsi),%r11 # &disp->HandlerData
2738 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2739 mov %r10,32(%rsp) # arg5
2740 mov %r11,40(%rsp) # arg6
2741 mov %r12,48(%rsp) # arg7
2742 mov %rcx,56(%rsp) # arg8, (NULL)
2743 call *__imp_RtlVirtualUnwind(%rip)
2744
# Return value 1, then release the scratch area and restore the registers
# in the reverse order of the entry pushes.
2745 mov \$1,%eax # ExceptionContinueSearch
2746 add \$64,%rsp
2747 popfq
2748 pop %r15
2749 pop %r14
2750 pop %r13
2751 pop %r12
2752 pop %rbp
2753 pop %rbx
2754 pop %rdi
2755 pop %rsi
2756 ret
2757.size cbc_se_handler,.-cbc_se_handler
2758
# Function table: one record of three image-relative addresses per routine
# (start label, end label, label of the matching .xdata record below).
2759.section .pdata
2760.align 4
2761 .rva .LSEH_begin_AES_encrypt
2762 .rva .LSEH_end_AES_encrypt
2763 .rva .LSEH_info_AES_encrypt
2764
2765 .rva .LSEH_begin_AES_decrypt
2766 .rva .LSEH_end_AES_decrypt
2767 .rva .LSEH_info_AES_decrypt
2768
2769 .rva .LSEH_begin_AES_set_encrypt_key
2770 .rva .LSEH_end_AES_set_encrypt_key
2771 .rva .LSEH_info_AES_set_encrypt_key
2772
2773 .rva .LSEH_begin_AES_set_decrypt_key
2774 .rva .LSEH_end_AES_set_decrypt_key
2775 .rva .LSEH_info_AES_set_decrypt_key
2776
2777 .rva .LSEH_begin_AES_cbc_encrypt
2778 .rva .LSEH_end_AES_cbc_encrypt
2779 .rva .LSEH_info_AES_cbc_encrypt
2780
# Unwind records.  Each starts with the four header bytes 9,0,0,0; in the
# Windows x64 UNWIND_INFO layout the first byte packs version 1 with the
# "has exception handler" flag and the remaining bytes declare no prologue
# and no unwind codes.  The handler address follows, then the handler's
# private data.  block_se_handler and key_se_handler read that data as
# HandlerData[0] (prologue label) and HandlerData[1] (epilogue label).
# The AES_cbc_encrypt record carries no such data because cbc_se_handler
# compares Rip against fixed labels instead.
2781.section .xdata
2782.align 8
2783.LSEH_info_AES_encrypt:
2784 .byte 9,0,0,0
2785 .rva block_se_handler
2786 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
2787.LSEH_info_AES_decrypt:
2788 .byte 9,0,0,0
2789 .rva block_se_handler
2790 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
2791.LSEH_info_AES_set_encrypt_key:
2792 .byte 9,0,0,0
2793 .rva key_se_handler
2794 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
2795.LSEH_info_AES_set_decrypt_key:
2796 .byte 9,0,0,0
2797 .rva key_se_handler
2798 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
2799.LSEH_info_AES_cbc_encrypt:
2800 .byte 9,0,0,0
2801 .rva cbc_se_handler
2802___
2803}
1574 2804
# Replace every backquoted expression in the accumulated assembly text with
# the result of evaluating it as Perl (the /e flag), across the whole
# multi-line string (/g and /m).  This is what turns arithmetic written
# inline in the assembly, such as offsets and sizes, into plain numbers.
1575$code =~ s/\`([^\`]*)\`/eval($1)/gem; 2805$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1576 2806