diff options
author | djm <> | 2010-10-01 22:54:21 +0000 |
---|---|---|
committer | djm <> | 2010-10-01 22:54:21 +0000 |
commit | 2ea67f4aa254b09ded62e6e14fc893bbe6381579 (patch) | |
tree | bb3923b81f2ce34b1ad62684afdf1a94d904c185 /src/lib/libcrypto/aes | |
parent | 6ddfb710ab14b10183ff3a6a32f643554c80065e (diff) | |
parent | 829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2 (diff) | |
download | openbsd-2ea67f4aa254b09ded62e6e14fc893bbe6381579.tar.gz openbsd-2ea67f4aa254b09ded62e6e14fc893bbe6381579.tar.bz2 openbsd-2ea67f4aa254b09ded62e6e14fc893bbe6381579.zip |
This commit was generated by cvs2git to track changes on a CVS vendor
branch.
Diffstat (limited to 'src/lib/libcrypto/aes')
-rw-r--r-- | src/lib/libcrypto/aes/aes_ige.c | 12 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/asm/aes-armv4.pl | 1 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/asm/aes-ppc.pl | 269 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/asm/aes-s390x.pl | 6 | ||||
-rwxr-xr-x | src/lib/libcrypto/aes/asm/aes-x86_64.pl | 2012 |
5 files changed, 1775 insertions, 525 deletions
diff --git a/src/lib/libcrypto/aes/aes_ige.c b/src/lib/libcrypto/aes/aes_ige.c index 45d7096181..c161351e65 100644 --- a/src/lib/libcrypto/aes/aes_ige.c +++ b/src/lib/libcrypto/aes/aes_ige.c | |||
@@ -77,11 +77,11 @@ typedef struct { | |||
77 | /* N.B. The IV for this mode is _twice_ the block size */ | 77 | /* N.B. The IV for this mode is _twice_ the block size */ |
78 | 78 | ||
79 | void AES_ige_encrypt(const unsigned char *in, unsigned char *out, | 79 | void AES_ige_encrypt(const unsigned char *in, unsigned char *out, |
80 | const unsigned long length, const AES_KEY *key, | 80 | size_t length, const AES_KEY *key, |
81 | unsigned char *ivec, const int enc) | 81 | unsigned char *ivec, const int enc) |
82 | { | 82 | { |
83 | unsigned long n; | 83 | size_t n; |
84 | unsigned long len; | 84 | size_t len = length; |
85 | 85 | ||
86 | OPENSSL_assert(in && out && key && ivec); | 86 | OPENSSL_assert(in && out && key && ivec); |
87 | OPENSSL_assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc)); | 87 | OPENSSL_assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc)); |
@@ -211,12 +211,12 @@ void AES_ige_encrypt(const unsigned char *in, unsigned char *out, | |||
211 | /* N.B. The IV for this mode is _four times_ the block size */ | 211 | /* N.B. The IV for this mode is _four times_ the block size */ |
212 | 212 | ||
213 | void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out, | 213 | void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out, |
214 | const unsigned long length, const AES_KEY *key, | 214 | size_t length, const AES_KEY *key, |
215 | const AES_KEY *key2, const unsigned char *ivec, | 215 | const AES_KEY *key2, const unsigned char *ivec, |
216 | const int enc) | 216 | const int enc) |
217 | { | 217 | { |
218 | unsigned long n; | 218 | size_t n; |
219 | unsigned long len = length; | 219 | size_t len = length; |
220 | unsigned char tmp[AES_BLOCK_SIZE]; | 220 | unsigned char tmp[AES_BLOCK_SIZE]; |
221 | unsigned char tmp2[AES_BLOCK_SIZE]; | 221 | unsigned char tmp2[AES_BLOCK_SIZE]; |
222 | unsigned char tmp3[AES_BLOCK_SIZE]; | 222 | unsigned char tmp3[AES_BLOCK_SIZE]; |
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl index 15742c1ec5..690244111a 100644 --- a/src/lib/libcrypto/aes/asm/aes-armv4.pl +++ b/src/lib/libcrypto/aes/asm/aes-armv4.pl | |||
@@ -1024,6 +1024,7 @@ _armv4_AES_decrypt: | |||
1024 | mov pc,lr @ return | 1024 | mov pc,lr @ return |
1025 | .size _armv4_AES_decrypt,.-_armv4_AES_decrypt | 1025 | .size _armv4_AES_decrypt,.-_armv4_AES_decrypt |
1026 | .asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | 1026 | .asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" |
1027 | .align 2 | ||
1027 | ___ | 1028 | ___ |
1028 | 1029 | ||
1029 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | 1030 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 |
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl index ce427655ef..f82c5e1814 100644 --- a/src/lib/libcrypto/aes/asm/aes-ppc.pl +++ b/src/lib/libcrypto/aes/asm/aes-ppc.pl | |||
@@ -16,6 +16,19 @@ | |||
16 | # at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - | 16 | # at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - |
17 | # at 1/3 of ppc_AES_decrypt. | 17 | # at 1/3 of ppc_AES_decrypt. |
18 | 18 | ||
19 | # February 2010 | ||
20 | # | ||
21 | # Rescheduling instructions to favour Power6 pipeline gives 10% | ||
22 | # performance improvement on the platform in question (and marginal | ||
23 | # improvement even on others). It should be noted that Power6 fails | ||
24 | # to process byte in 18 cycles, only in 23, because it fails to issue | ||
25 | # 4 load instructions in two cycles, only in 3. As a result, non-compact | ||
26 | # block subroutines are 25% slower than one would expect. Compact | ||
27 | # functions scale better, because they have pure computational part, | ||
28 | # which scales perfectly with clock frequency. To be specific | ||
29 | # ppc_AES_encrypt_compact operates at 42 cycles per byte, while | ||
30 | # ppc_AES_decrypt_compact - at 55 (in 64-bit build). | ||
31 | |||
19 | $flavour = shift; | 32 | $flavour = shift; |
20 | 33 | ||
21 | if ($flavour =~ /64/) { | 34 | if ($flavour =~ /64/) { |
@@ -376,7 +389,7 @@ $code.=<<___; | |||
376 | addi $sp,$sp,$FRAME | 389 | addi $sp,$sp,$FRAME |
377 | blr | 390 | blr |
378 | 391 | ||
379 | .align 4 | 392 | .align 5 |
380 | Lppc_AES_encrypt: | 393 | Lppc_AES_encrypt: |
381 | lwz $acc00,240($key) | 394 | lwz $acc00,240($key) |
382 | lwz $t0,0($key) | 395 | lwz $t0,0($key) |
@@ -397,46 +410,46 @@ Lppc_AES_encrypt: | |||
397 | Lenc_loop: | 410 | Lenc_loop: |
398 | rlwinm $acc00,$s0,`32-24+3`,21,28 | 411 | rlwinm $acc00,$s0,`32-24+3`,21,28 |
399 | rlwinm $acc01,$s1,`32-24+3`,21,28 | 412 | rlwinm $acc01,$s1,`32-24+3`,21,28 |
400 | lwz $t0,0($key) | ||
401 | lwz $t1,4($key) | ||
402 | rlwinm $acc02,$s2,`32-24+3`,21,28 | 413 | rlwinm $acc02,$s2,`32-24+3`,21,28 |
403 | rlwinm $acc03,$s3,`32-24+3`,21,28 | 414 | rlwinm $acc03,$s3,`32-24+3`,21,28 |
404 | lwz $t2,8($key) | 415 | lwz $t0,0($key) |
405 | lwz $t3,12($key) | 416 | lwz $t1,4($key) |
406 | rlwinm $acc04,$s1,`32-16+3`,21,28 | 417 | rlwinm $acc04,$s1,`32-16+3`,21,28 |
407 | rlwinm $acc05,$s2,`32-16+3`,21,28 | 418 | rlwinm $acc05,$s2,`32-16+3`,21,28 |
408 | lwzx $acc00,$Tbl0,$acc00 | 419 | lwz $t2,8($key) |
409 | lwzx $acc01,$Tbl0,$acc01 | 420 | lwz $t3,12($key) |
410 | rlwinm $acc06,$s3,`32-16+3`,21,28 | 421 | rlwinm $acc06,$s3,`32-16+3`,21,28 |
411 | rlwinm $acc07,$s0,`32-16+3`,21,28 | 422 | rlwinm $acc07,$s0,`32-16+3`,21,28 |
412 | lwzx $acc02,$Tbl0,$acc02 | 423 | lwzx $acc00,$Tbl0,$acc00 |
413 | lwzx $acc03,$Tbl0,$acc03 | 424 | lwzx $acc01,$Tbl0,$acc01 |
414 | rlwinm $acc08,$s2,`32-8+3`,21,28 | 425 | rlwinm $acc08,$s2,`32-8+3`,21,28 |
415 | rlwinm $acc09,$s3,`32-8+3`,21,28 | 426 | rlwinm $acc09,$s3,`32-8+3`,21,28 |
416 | lwzx $acc04,$Tbl1,$acc04 | 427 | lwzx $acc02,$Tbl0,$acc02 |
417 | lwzx $acc05,$Tbl1,$acc05 | 428 | lwzx $acc03,$Tbl0,$acc03 |
418 | rlwinm $acc10,$s0,`32-8+3`,21,28 | 429 | rlwinm $acc10,$s0,`32-8+3`,21,28 |
419 | rlwinm $acc11,$s1,`32-8+3`,21,28 | 430 | rlwinm $acc11,$s1,`32-8+3`,21,28 |
420 | lwzx $acc06,$Tbl1,$acc06 | 431 | lwzx $acc04,$Tbl1,$acc04 |
421 | lwzx $acc07,$Tbl1,$acc07 | 432 | lwzx $acc05,$Tbl1,$acc05 |
422 | rlwinm $acc12,$s3,`0+3`,21,28 | 433 | rlwinm $acc12,$s3,`0+3`,21,28 |
423 | rlwinm $acc13,$s0,`0+3`,21,28 | 434 | rlwinm $acc13,$s0,`0+3`,21,28 |
424 | lwzx $acc08,$Tbl2,$acc08 | 435 | lwzx $acc06,$Tbl1,$acc06 |
425 | lwzx $acc09,$Tbl2,$acc09 | 436 | lwzx $acc07,$Tbl1,$acc07 |
426 | rlwinm $acc14,$s1,`0+3`,21,28 | 437 | rlwinm $acc14,$s1,`0+3`,21,28 |
427 | rlwinm $acc15,$s2,`0+3`,21,28 | 438 | rlwinm $acc15,$s2,`0+3`,21,28 |
428 | lwzx $acc10,$Tbl2,$acc10 | 439 | lwzx $acc08,$Tbl2,$acc08 |
429 | lwzx $acc11,$Tbl2,$acc11 | 440 | lwzx $acc09,$Tbl2,$acc09 |
430 | xor $t0,$t0,$acc00 | 441 | xor $t0,$t0,$acc00 |
431 | xor $t1,$t1,$acc01 | 442 | xor $t1,$t1,$acc01 |
432 | lwzx $acc12,$Tbl3,$acc12 | 443 | lwzx $acc10,$Tbl2,$acc10 |
433 | lwzx $acc13,$Tbl3,$acc13 | 444 | lwzx $acc11,$Tbl2,$acc11 |
434 | xor $t2,$t2,$acc02 | 445 | xor $t2,$t2,$acc02 |
435 | xor $t3,$t3,$acc03 | 446 | xor $t3,$t3,$acc03 |
436 | lwzx $acc14,$Tbl3,$acc14 | 447 | lwzx $acc12,$Tbl3,$acc12 |
437 | lwzx $acc15,$Tbl3,$acc15 | 448 | lwzx $acc13,$Tbl3,$acc13 |
438 | xor $t0,$t0,$acc04 | 449 | xor $t0,$t0,$acc04 |
439 | xor $t1,$t1,$acc05 | 450 | xor $t1,$t1,$acc05 |
451 | lwzx $acc14,$Tbl3,$acc14 | ||
452 | lwzx $acc15,$Tbl3,$acc15 | ||
440 | xor $t2,$t2,$acc06 | 453 | xor $t2,$t2,$acc06 |
441 | xor $t3,$t3,$acc07 | 454 | xor $t3,$t3,$acc07 |
442 | xor $t0,$t0,$acc08 | 455 | xor $t0,$t0,$acc08 |
@@ -452,60 +465,60 @@ Lenc_loop: | |||
452 | 465 | ||
453 | addi $Tbl2,$Tbl0,2048 | 466 | addi $Tbl2,$Tbl0,2048 |
454 | nop | 467 | nop |
455 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 | ||
456 | lwz $acc09,`2048+32`($Tbl0) | ||
457 | lwz $acc10,`2048+64`($Tbl0) | ||
458 | lwz $acc11,`2048+96`($Tbl0) | ||
459 | lwz $acc08,`2048+128`($Tbl0) | ||
460 | lwz $acc09,`2048+160`($Tbl0) | ||
461 | lwz $acc10,`2048+192`($Tbl0) | ||
462 | lwz $acc11,`2048+224`($Tbl0) | ||
463 | rlwinm $acc00,$s0,`32-24`,24,31 | ||
464 | rlwinm $acc01,$s1,`32-24`,24,31 | ||
465 | lwz $t0,0($key) | 468 | lwz $t0,0($key) |
466 | lwz $t1,4($key) | 469 | lwz $t1,4($key) |
467 | rlwinm $acc02,$s2,`32-24`,24,31 | 470 | rlwinm $acc00,$s0,`32-24`,24,31 |
468 | rlwinm $acc03,$s3,`32-24`,24,31 | 471 | rlwinm $acc01,$s1,`32-24`,24,31 |
469 | lwz $t2,8($key) | 472 | lwz $t2,8($key) |
470 | lwz $t3,12($key) | 473 | lwz $t3,12($key) |
474 | rlwinm $acc02,$s2,`32-24`,24,31 | ||
475 | rlwinm $acc03,$s3,`32-24`,24,31 | ||
476 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 | ||
477 | lwz $acc09,`2048+32`($Tbl0) | ||
471 | rlwinm $acc04,$s1,`32-16`,24,31 | 478 | rlwinm $acc04,$s1,`32-16`,24,31 |
472 | rlwinm $acc05,$s2,`32-16`,24,31 | 479 | rlwinm $acc05,$s2,`32-16`,24,31 |
473 | lbzx $acc00,$Tbl2,$acc00 | 480 | lwz $acc10,`2048+64`($Tbl0) |
474 | lbzx $acc01,$Tbl2,$acc01 | 481 | lwz $acc11,`2048+96`($Tbl0) |
475 | rlwinm $acc06,$s3,`32-16`,24,31 | 482 | rlwinm $acc06,$s3,`32-16`,24,31 |
476 | rlwinm $acc07,$s0,`32-16`,24,31 | 483 | rlwinm $acc07,$s0,`32-16`,24,31 |
477 | lbzx $acc02,$Tbl2,$acc02 | 484 | lwz $acc12,`2048+128`($Tbl0) |
478 | lbzx $acc03,$Tbl2,$acc03 | 485 | lwz $acc13,`2048+160`($Tbl0) |
479 | rlwinm $acc08,$s2,`32-8`,24,31 | 486 | rlwinm $acc08,$s2,`32-8`,24,31 |
480 | rlwinm $acc09,$s3,`32-8`,24,31 | 487 | rlwinm $acc09,$s3,`32-8`,24,31 |
481 | lbzx $acc04,$Tbl2,$acc04 | 488 | lwz $acc14,`2048+192`($Tbl0) |
482 | lbzx $acc05,$Tbl2,$acc05 | 489 | lwz $acc15,`2048+224`($Tbl0) |
483 | rlwinm $acc10,$s0,`32-8`,24,31 | 490 | rlwinm $acc10,$s0,`32-8`,24,31 |
484 | rlwinm $acc11,$s1,`32-8`,24,31 | 491 | rlwinm $acc11,$s1,`32-8`,24,31 |
485 | lbzx $acc06,$Tbl2,$acc06 | 492 | lbzx $acc00,$Tbl2,$acc00 |
486 | lbzx $acc07,$Tbl2,$acc07 | 493 | lbzx $acc01,$Tbl2,$acc01 |
487 | rlwinm $acc12,$s3,`0`,24,31 | 494 | rlwinm $acc12,$s3,`0`,24,31 |
488 | rlwinm $acc13,$s0,`0`,24,31 | 495 | rlwinm $acc13,$s0,`0`,24,31 |
489 | lbzx $acc08,$Tbl2,$acc08 | 496 | lbzx $acc02,$Tbl2,$acc02 |
490 | lbzx $acc09,$Tbl2,$acc09 | 497 | lbzx $acc03,$Tbl2,$acc03 |
491 | rlwinm $acc14,$s1,`0`,24,31 | 498 | rlwinm $acc14,$s1,`0`,24,31 |
492 | rlwinm $acc15,$s2,`0`,24,31 | 499 | rlwinm $acc15,$s2,`0`,24,31 |
493 | lbzx $acc10,$Tbl2,$acc10 | 500 | lbzx $acc04,$Tbl2,$acc04 |
494 | lbzx $acc11,$Tbl2,$acc11 | 501 | lbzx $acc05,$Tbl2,$acc05 |
495 | rlwinm $s0,$acc00,24,0,7 | 502 | rlwinm $s0,$acc00,24,0,7 |
496 | rlwinm $s1,$acc01,24,0,7 | 503 | rlwinm $s1,$acc01,24,0,7 |
497 | lbzx $acc12,$Tbl2,$acc12 | 504 | lbzx $acc06,$Tbl2,$acc06 |
498 | lbzx $acc13,$Tbl2,$acc13 | 505 | lbzx $acc07,$Tbl2,$acc07 |
499 | rlwinm $s2,$acc02,24,0,7 | 506 | rlwinm $s2,$acc02,24,0,7 |
500 | rlwinm $s3,$acc03,24,0,7 | 507 | rlwinm $s3,$acc03,24,0,7 |
501 | lbzx $acc14,$Tbl2,$acc14 | 508 | lbzx $acc08,$Tbl2,$acc08 |
502 | lbzx $acc15,$Tbl2,$acc15 | 509 | lbzx $acc09,$Tbl2,$acc09 |
503 | rlwimi $s0,$acc04,16,8,15 | 510 | rlwimi $s0,$acc04,16,8,15 |
504 | rlwimi $s1,$acc05,16,8,15 | 511 | rlwimi $s1,$acc05,16,8,15 |
512 | lbzx $acc10,$Tbl2,$acc10 | ||
513 | lbzx $acc11,$Tbl2,$acc11 | ||
505 | rlwimi $s2,$acc06,16,8,15 | 514 | rlwimi $s2,$acc06,16,8,15 |
506 | rlwimi $s3,$acc07,16,8,15 | 515 | rlwimi $s3,$acc07,16,8,15 |
516 | lbzx $acc12,$Tbl2,$acc12 | ||
517 | lbzx $acc13,$Tbl2,$acc13 | ||
507 | rlwimi $s0,$acc08,8,16,23 | 518 | rlwimi $s0,$acc08,8,16,23 |
508 | rlwimi $s1,$acc09,8,16,23 | 519 | rlwimi $s1,$acc09,8,16,23 |
520 | lbzx $acc14,$Tbl2,$acc14 | ||
521 | lbzx $acc15,$Tbl2,$acc15 | ||
509 | rlwimi $s2,$acc10,8,16,23 | 522 | rlwimi $s2,$acc10,8,16,23 |
510 | rlwimi $s3,$acc11,8,16,23 | 523 | rlwimi $s3,$acc11,8,16,23 |
511 | or $s0,$s0,$acc12 | 524 | or $s0,$s0,$acc12 |
@@ -542,40 +555,40 @@ Lenc_compact_loop: | |||
542 | rlwinm $acc01,$s1,`32-24`,24,31 | 555 | rlwinm $acc01,$s1,`32-24`,24,31 |
543 | rlwinm $acc02,$s2,`32-24`,24,31 | 556 | rlwinm $acc02,$s2,`32-24`,24,31 |
544 | rlwinm $acc03,$s3,`32-24`,24,31 | 557 | rlwinm $acc03,$s3,`32-24`,24,31 |
545 | lbzx $acc00,$Tbl1,$acc00 | ||
546 | lbzx $acc01,$Tbl1,$acc01 | ||
547 | rlwinm $acc04,$s1,`32-16`,24,31 | 558 | rlwinm $acc04,$s1,`32-16`,24,31 |
548 | rlwinm $acc05,$s2,`32-16`,24,31 | 559 | rlwinm $acc05,$s2,`32-16`,24,31 |
549 | lbzx $acc02,$Tbl1,$acc02 | ||
550 | lbzx $acc03,$Tbl1,$acc03 | ||
551 | rlwinm $acc06,$s3,`32-16`,24,31 | 560 | rlwinm $acc06,$s3,`32-16`,24,31 |
552 | rlwinm $acc07,$s0,`32-16`,24,31 | 561 | rlwinm $acc07,$s0,`32-16`,24,31 |
553 | lbzx $acc04,$Tbl1,$acc04 | 562 | lbzx $acc00,$Tbl1,$acc00 |
554 | lbzx $acc05,$Tbl1,$acc05 | 563 | lbzx $acc01,$Tbl1,$acc01 |
555 | rlwinm $acc08,$s2,`32-8`,24,31 | 564 | rlwinm $acc08,$s2,`32-8`,24,31 |
556 | rlwinm $acc09,$s3,`32-8`,24,31 | 565 | rlwinm $acc09,$s3,`32-8`,24,31 |
557 | lbzx $acc06,$Tbl1,$acc06 | 566 | lbzx $acc02,$Tbl1,$acc02 |
558 | lbzx $acc07,$Tbl1,$acc07 | 567 | lbzx $acc03,$Tbl1,$acc03 |
559 | rlwinm $acc10,$s0,`32-8`,24,31 | 568 | rlwinm $acc10,$s0,`32-8`,24,31 |
560 | rlwinm $acc11,$s1,`32-8`,24,31 | 569 | rlwinm $acc11,$s1,`32-8`,24,31 |
561 | lbzx $acc08,$Tbl1,$acc08 | 570 | lbzx $acc04,$Tbl1,$acc04 |
562 | lbzx $acc09,$Tbl1,$acc09 | 571 | lbzx $acc05,$Tbl1,$acc05 |
563 | rlwinm $acc12,$s3,`0`,24,31 | 572 | rlwinm $acc12,$s3,`0`,24,31 |
564 | rlwinm $acc13,$s0,`0`,24,31 | 573 | rlwinm $acc13,$s0,`0`,24,31 |
565 | lbzx $acc10,$Tbl1,$acc10 | 574 | lbzx $acc06,$Tbl1,$acc06 |
566 | lbzx $acc11,$Tbl1,$acc11 | 575 | lbzx $acc07,$Tbl1,$acc07 |
567 | rlwinm $acc14,$s1,`0`,24,31 | 576 | rlwinm $acc14,$s1,`0`,24,31 |
568 | rlwinm $acc15,$s2,`0`,24,31 | 577 | rlwinm $acc15,$s2,`0`,24,31 |
569 | lbzx $acc12,$Tbl1,$acc12 | 578 | lbzx $acc08,$Tbl1,$acc08 |
570 | lbzx $acc13,$Tbl1,$acc13 | 579 | lbzx $acc09,$Tbl1,$acc09 |
571 | rlwinm $s0,$acc00,24,0,7 | 580 | rlwinm $s0,$acc00,24,0,7 |
572 | rlwinm $s1,$acc01,24,0,7 | 581 | rlwinm $s1,$acc01,24,0,7 |
573 | lbzx $acc14,$Tbl1,$acc14 | 582 | lbzx $acc10,$Tbl1,$acc10 |
574 | lbzx $acc15,$Tbl1,$acc15 | 583 | lbzx $acc11,$Tbl1,$acc11 |
575 | rlwinm $s2,$acc02,24,0,7 | 584 | rlwinm $s2,$acc02,24,0,7 |
576 | rlwinm $s3,$acc03,24,0,7 | 585 | rlwinm $s3,$acc03,24,0,7 |
586 | lbzx $acc12,$Tbl1,$acc12 | ||
587 | lbzx $acc13,$Tbl1,$acc13 | ||
577 | rlwimi $s0,$acc04,16,8,15 | 588 | rlwimi $s0,$acc04,16,8,15 |
578 | rlwimi $s1,$acc05,16,8,15 | 589 | rlwimi $s1,$acc05,16,8,15 |
590 | lbzx $acc14,$Tbl1,$acc14 | ||
591 | lbzx $acc15,$Tbl1,$acc15 | ||
579 | rlwimi $s2,$acc06,16,8,15 | 592 | rlwimi $s2,$acc06,16,8,15 |
580 | rlwimi $s3,$acc07,16,8,15 | 593 | rlwimi $s3,$acc07,16,8,15 |
581 | rlwimi $s0,$acc08,8,16,23 | 594 | rlwimi $s0,$acc08,8,16,23 |
@@ -725,7 +738,7 @@ Lenc_compact_done: | |||
725 | addi $sp,$sp,$FRAME | 738 | addi $sp,$sp,$FRAME |
726 | blr | 739 | blr |
727 | 740 | ||
728 | .align 4 | 741 | .align 5 |
729 | Lppc_AES_decrypt: | 742 | Lppc_AES_decrypt: |
730 | lwz $acc00,240($key) | 743 | lwz $acc00,240($key) |
731 | lwz $t0,0($key) | 744 | lwz $t0,0($key) |
@@ -746,46 +759,46 @@ Lppc_AES_decrypt: | |||
746 | Ldec_loop: | 759 | Ldec_loop: |
747 | rlwinm $acc00,$s0,`32-24+3`,21,28 | 760 | rlwinm $acc00,$s0,`32-24+3`,21,28 |
748 | rlwinm $acc01,$s1,`32-24+3`,21,28 | 761 | rlwinm $acc01,$s1,`32-24+3`,21,28 |
749 | lwz $t0,0($key) | ||
750 | lwz $t1,4($key) | ||
751 | rlwinm $acc02,$s2,`32-24+3`,21,28 | 762 | rlwinm $acc02,$s2,`32-24+3`,21,28 |
752 | rlwinm $acc03,$s3,`32-24+3`,21,28 | 763 | rlwinm $acc03,$s3,`32-24+3`,21,28 |
753 | lwz $t2,8($key) | 764 | lwz $t0,0($key) |
754 | lwz $t3,12($key) | 765 | lwz $t1,4($key) |
755 | rlwinm $acc04,$s3,`32-16+3`,21,28 | 766 | rlwinm $acc04,$s3,`32-16+3`,21,28 |
756 | rlwinm $acc05,$s0,`32-16+3`,21,28 | 767 | rlwinm $acc05,$s0,`32-16+3`,21,28 |
757 | lwzx $acc00,$Tbl0,$acc00 | 768 | lwz $t2,8($key) |
758 | lwzx $acc01,$Tbl0,$acc01 | 769 | lwz $t3,12($key) |
759 | rlwinm $acc06,$s1,`32-16+3`,21,28 | 770 | rlwinm $acc06,$s1,`32-16+3`,21,28 |
760 | rlwinm $acc07,$s2,`32-16+3`,21,28 | 771 | rlwinm $acc07,$s2,`32-16+3`,21,28 |
761 | lwzx $acc02,$Tbl0,$acc02 | 772 | lwzx $acc00,$Tbl0,$acc00 |
762 | lwzx $acc03,$Tbl0,$acc03 | 773 | lwzx $acc01,$Tbl0,$acc01 |
763 | rlwinm $acc08,$s2,`32-8+3`,21,28 | 774 | rlwinm $acc08,$s2,`32-8+3`,21,28 |
764 | rlwinm $acc09,$s3,`32-8+3`,21,28 | 775 | rlwinm $acc09,$s3,`32-8+3`,21,28 |
765 | lwzx $acc04,$Tbl1,$acc04 | 776 | lwzx $acc02,$Tbl0,$acc02 |
766 | lwzx $acc05,$Tbl1,$acc05 | 777 | lwzx $acc03,$Tbl0,$acc03 |
767 | rlwinm $acc10,$s0,`32-8+3`,21,28 | 778 | rlwinm $acc10,$s0,`32-8+3`,21,28 |
768 | rlwinm $acc11,$s1,`32-8+3`,21,28 | 779 | rlwinm $acc11,$s1,`32-8+3`,21,28 |
769 | lwzx $acc06,$Tbl1,$acc06 | 780 | lwzx $acc04,$Tbl1,$acc04 |
770 | lwzx $acc07,$Tbl1,$acc07 | 781 | lwzx $acc05,$Tbl1,$acc05 |
771 | rlwinm $acc12,$s1,`0+3`,21,28 | 782 | rlwinm $acc12,$s1,`0+3`,21,28 |
772 | rlwinm $acc13,$s2,`0+3`,21,28 | 783 | rlwinm $acc13,$s2,`0+3`,21,28 |
773 | lwzx $acc08,$Tbl2,$acc08 | 784 | lwzx $acc06,$Tbl1,$acc06 |
774 | lwzx $acc09,$Tbl2,$acc09 | 785 | lwzx $acc07,$Tbl1,$acc07 |
775 | rlwinm $acc14,$s3,`0+3`,21,28 | 786 | rlwinm $acc14,$s3,`0+3`,21,28 |
776 | rlwinm $acc15,$s0,`0+3`,21,28 | 787 | rlwinm $acc15,$s0,`0+3`,21,28 |
777 | lwzx $acc10,$Tbl2,$acc10 | 788 | lwzx $acc08,$Tbl2,$acc08 |
778 | lwzx $acc11,$Tbl2,$acc11 | 789 | lwzx $acc09,$Tbl2,$acc09 |
779 | xor $t0,$t0,$acc00 | 790 | xor $t0,$t0,$acc00 |
780 | xor $t1,$t1,$acc01 | 791 | xor $t1,$t1,$acc01 |
781 | lwzx $acc12,$Tbl3,$acc12 | 792 | lwzx $acc10,$Tbl2,$acc10 |
782 | lwzx $acc13,$Tbl3,$acc13 | 793 | lwzx $acc11,$Tbl2,$acc11 |
783 | xor $t2,$t2,$acc02 | 794 | xor $t2,$t2,$acc02 |
784 | xor $t3,$t3,$acc03 | 795 | xor $t3,$t3,$acc03 |
785 | lwzx $acc14,$Tbl3,$acc14 | 796 | lwzx $acc12,$Tbl3,$acc12 |
786 | lwzx $acc15,$Tbl3,$acc15 | 797 | lwzx $acc13,$Tbl3,$acc13 |
787 | xor $t0,$t0,$acc04 | 798 | xor $t0,$t0,$acc04 |
788 | xor $t1,$t1,$acc05 | 799 | xor $t1,$t1,$acc05 |
800 | lwzx $acc14,$Tbl3,$acc14 | ||
801 | lwzx $acc15,$Tbl3,$acc15 | ||
789 | xor $t2,$t2,$acc06 | 802 | xor $t2,$t2,$acc06 |
790 | xor $t3,$t3,$acc07 | 803 | xor $t3,$t3,$acc07 |
791 | xor $t0,$t0,$acc08 | 804 | xor $t0,$t0,$acc08 |
@@ -801,56 +814,56 @@ Ldec_loop: | |||
801 | 814 | ||
802 | addi $Tbl2,$Tbl0,2048 | 815 | addi $Tbl2,$Tbl0,2048 |
803 | nop | 816 | nop |
804 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 | ||
805 | lwz $acc09,`2048+32`($Tbl0) | ||
806 | lwz $acc10,`2048+64`($Tbl0) | ||
807 | lwz $acc11,`2048+96`($Tbl0) | ||
808 | lwz $acc08,`2048+128`($Tbl0) | ||
809 | lwz $acc09,`2048+160`($Tbl0) | ||
810 | lwz $acc10,`2048+192`($Tbl0) | ||
811 | lwz $acc11,`2048+224`($Tbl0) | ||
812 | rlwinm $acc00,$s0,`32-24`,24,31 | ||
813 | rlwinm $acc01,$s1,`32-24`,24,31 | ||
814 | lwz $t0,0($key) | 817 | lwz $t0,0($key) |
815 | lwz $t1,4($key) | 818 | lwz $t1,4($key) |
816 | rlwinm $acc02,$s2,`32-24`,24,31 | 819 | rlwinm $acc00,$s0,`32-24`,24,31 |
817 | rlwinm $acc03,$s3,`32-24`,24,31 | 820 | rlwinm $acc01,$s1,`32-24`,24,31 |
818 | lwz $t2,8($key) | 821 | lwz $t2,8($key) |
819 | lwz $t3,12($key) | 822 | lwz $t3,12($key) |
823 | rlwinm $acc02,$s2,`32-24`,24,31 | ||
824 | rlwinm $acc03,$s3,`32-24`,24,31 | ||
825 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 | ||
826 | lwz $acc09,`2048+32`($Tbl0) | ||
820 | rlwinm $acc04,$s3,`32-16`,24,31 | 827 | rlwinm $acc04,$s3,`32-16`,24,31 |
821 | rlwinm $acc05,$s0,`32-16`,24,31 | 828 | rlwinm $acc05,$s0,`32-16`,24,31 |
829 | lwz $acc10,`2048+64`($Tbl0) | ||
830 | lwz $acc11,`2048+96`($Tbl0) | ||
822 | lbzx $acc00,$Tbl2,$acc00 | 831 | lbzx $acc00,$Tbl2,$acc00 |
823 | lbzx $acc01,$Tbl2,$acc01 | 832 | lbzx $acc01,$Tbl2,$acc01 |
833 | lwz $acc12,`2048+128`($Tbl0) | ||
834 | lwz $acc13,`2048+160`($Tbl0) | ||
824 | rlwinm $acc06,$s1,`32-16`,24,31 | 835 | rlwinm $acc06,$s1,`32-16`,24,31 |
825 | rlwinm $acc07,$s2,`32-16`,24,31 | 836 | rlwinm $acc07,$s2,`32-16`,24,31 |
826 | lbzx $acc02,$Tbl2,$acc02 | 837 | lwz $acc14,`2048+192`($Tbl0) |
827 | lbzx $acc03,$Tbl2,$acc03 | 838 | lwz $acc15,`2048+224`($Tbl0) |
828 | rlwinm $acc08,$s2,`32-8`,24,31 | 839 | rlwinm $acc08,$s2,`32-8`,24,31 |
829 | rlwinm $acc09,$s3,`32-8`,24,31 | 840 | rlwinm $acc09,$s3,`32-8`,24,31 |
830 | lbzx $acc04,$Tbl2,$acc04 | 841 | lbzx $acc02,$Tbl2,$acc02 |
831 | lbzx $acc05,$Tbl2,$acc05 | 842 | lbzx $acc03,$Tbl2,$acc03 |
832 | rlwinm $acc10,$s0,`32-8`,24,31 | 843 | rlwinm $acc10,$s0,`32-8`,24,31 |
833 | rlwinm $acc11,$s1,`32-8`,24,31 | 844 | rlwinm $acc11,$s1,`32-8`,24,31 |
834 | lbzx $acc06,$Tbl2,$acc06 | 845 | lbzx $acc04,$Tbl2,$acc04 |
835 | lbzx $acc07,$Tbl2,$acc07 | 846 | lbzx $acc05,$Tbl2,$acc05 |
836 | rlwinm $acc12,$s1,`0`,24,31 | 847 | rlwinm $acc12,$s1,`0`,24,31 |
837 | rlwinm $acc13,$s2,`0`,24,31 | 848 | rlwinm $acc13,$s2,`0`,24,31 |
838 | lbzx $acc08,$Tbl2,$acc08 | 849 | lbzx $acc06,$Tbl2,$acc06 |
839 | lbzx $acc09,$Tbl2,$acc09 | 850 | lbzx $acc07,$Tbl2,$acc07 |
840 | rlwinm $acc14,$s3,`0`,24,31 | 851 | rlwinm $acc14,$s3,`0`,24,31 |
841 | rlwinm $acc15,$s0,`0`,24,31 | 852 | rlwinm $acc15,$s0,`0`,24,31 |
842 | lbzx $acc10,$Tbl2,$acc10 | 853 | lbzx $acc08,$Tbl2,$acc08 |
843 | lbzx $acc11,$Tbl2,$acc11 | 854 | lbzx $acc09,$Tbl2,$acc09 |
844 | rlwinm $s0,$acc00,24,0,7 | 855 | rlwinm $s0,$acc00,24,0,7 |
845 | rlwinm $s1,$acc01,24,0,7 | 856 | rlwinm $s1,$acc01,24,0,7 |
846 | lbzx $acc12,$Tbl2,$acc12 | 857 | lbzx $acc10,$Tbl2,$acc10 |
847 | lbzx $acc13,$Tbl2,$acc13 | 858 | lbzx $acc11,$Tbl2,$acc11 |
848 | rlwinm $s2,$acc02,24,0,7 | 859 | rlwinm $s2,$acc02,24,0,7 |
849 | rlwinm $s3,$acc03,24,0,7 | 860 | rlwinm $s3,$acc03,24,0,7 |
850 | lbzx $acc14,$Tbl2,$acc14 | 861 | lbzx $acc12,$Tbl2,$acc12 |
851 | lbzx $acc15,$Tbl2,$acc15 | 862 | lbzx $acc13,$Tbl2,$acc13 |
852 | rlwimi $s0,$acc04,16,8,15 | 863 | rlwimi $s0,$acc04,16,8,15 |
853 | rlwimi $s1,$acc05,16,8,15 | 864 | rlwimi $s1,$acc05,16,8,15 |
865 | lbzx $acc14,$Tbl2,$acc14 | ||
866 | lbzx $acc15,$Tbl2,$acc15 | ||
854 | rlwimi $s2,$acc06,16,8,15 | 867 | rlwimi $s2,$acc06,16,8,15 |
855 | rlwimi $s3,$acc07,16,8,15 | 868 | rlwimi $s3,$acc07,16,8,15 |
856 | rlwimi $s0,$acc08,8,16,23 | 869 | rlwimi $s0,$acc08,8,16,23 |
@@ -897,40 +910,40 @@ Ldec_compact_loop: | |||
897 | rlwinm $acc01,$s1,`32-24`,24,31 | 910 | rlwinm $acc01,$s1,`32-24`,24,31 |
898 | rlwinm $acc02,$s2,`32-24`,24,31 | 911 | rlwinm $acc02,$s2,`32-24`,24,31 |
899 | rlwinm $acc03,$s3,`32-24`,24,31 | 912 | rlwinm $acc03,$s3,`32-24`,24,31 |
900 | lbzx $acc00,$Tbl1,$acc00 | ||
901 | lbzx $acc01,$Tbl1,$acc01 | ||
902 | rlwinm $acc04,$s3,`32-16`,24,31 | 913 | rlwinm $acc04,$s3,`32-16`,24,31 |
903 | rlwinm $acc05,$s0,`32-16`,24,31 | 914 | rlwinm $acc05,$s0,`32-16`,24,31 |
904 | lbzx $acc02,$Tbl1,$acc02 | ||
905 | lbzx $acc03,$Tbl1,$acc03 | ||
906 | rlwinm $acc06,$s1,`32-16`,24,31 | 915 | rlwinm $acc06,$s1,`32-16`,24,31 |
907 | rlwinm $acc07,$s2,`32-16`,24,31 | 916 | rlwinm $acc07,$s2,`32-16`,24,31 |
908 | lbzx $acc04,$Tbl1,$acc04 | 917 | lbzx $acc00,$Tbl1,$acc00 |
909 | lbzx $acc05,$Tbl1,$acc05 | 918 | lbzx $acc01,$Tbl1,$acc01 |
910 | rlwinm $acc08,$s2,`32-8`,24,31 | 919 | rlwinm $acc08,$s2,`32-8`,24,31 |
911 | rlwinm $acc09,$s3,`32-8`,24,31 | 920 | rlwinm $acc09,$s3,`32-8`,24,31 |
912 | lbzx $acc06,$Tbl1,$acc06 | 921 | lbzx $acc02,$Tbl1,$acc02 |
913 | lbzx $acc07,$Tbl1,$acc07 | 922 | lbzx $acc03,$Tbl1,$acc03 |
914 | rlwinm $acc10,$s0,`32-8`,24,31 | 923 | rlwinm $acc10,$s0,`32-8`,24,31 |
915 | rlwinm $acc11,$s1,`32-8`,24,31 | 924 | rlwinm $acc11,$s1,`32-8`,24,31 |
916 | lbzx $acc08,$Tbl1,$acc08 | 925 | lbzx $acc04,$Tbl1,$acc04 |
917 | lbzx $acc09,$Tbl1,$acc09 | 926 | lbzx $acc05,$Tbl1,$acc05 |
918 | rlwinm $acc12,$s1,`0`,24,31 | 927 | rlwinm $acc12,$s1,`0`,24,31 |
919 | rlwinm $acc13,$s2,`0`,24,31 | 928 | rlwinm $acc13,$s2,`0`,24,31 |
920 | lbzx $acc10,$Tbl1,$acc10 | 929 | lbzx $acc06,$Tbl1,$acc06 |
921 | lbzx $acc11,$Tbl1,$acc11 | 930 | lbzx $acc07,$Tbl1,$acc07 |
922 | rlwinm $acc14,$s3,`0`,24,31 | 931 | rlwinm $acc14,$s3,`0`,24,31 |
923 | rlwinm $acc15,$s0,`0`,24,31 | 932 | rlwinm $acc15,$s0,`0`,24,31 |
924 | lbzx $acc12,$Tbl1,$acc12 | 933 | lbzx $acc08,$Tbl1,$acc08 |
925 | lbzx $acc13,$Tbl1,$acc13 | 934 | lbzx $acc09,$Tbl1,$acc09 |
926 | rlwinm $s0,$acc00,24,0,7 | 935 | rlwinm $s0,$acc00,24,0,7 |
927 | rlwinm $s1,$acc01,24,0,7 | 936 | rlwinm $s1,$acc01,24,0,7 |
928 | lbzx $acc14,$Tbl1,$acc14 | 937 | lbzx $acc10,$Tbl1,$acc10 |
929 | lbzx $acc15,$Tbl1,$acc15 | 938 | lbzx $acc11,$Tbl1,$acc11 |
930 | rlwinm $s2,$acc02,24,0,7 | 939 | rlwinm $s2,$acc02,24,0,7 |
931 | rlwinm $s3,$acc03,24,0,7 | 940 | rlwinm $s3,$acc03,24,0,7 |
941 | lbzx $acc12,$Tbl1,$acc12 | ||
942 | lbzx $acc13,$Tbl1,$acc13 | ||
932 | rlwimi $s0,$acc04,16,8,15 | 943 | rlwimi $s0,$acc04,16,8,15 |
933 | rlwimi $s1,$acc05,16,8,15 | 944 | rlwimi $s1,$acc05,16,8,15 |
945 | lbzx $acc14,$Tbl1,$acc14 | ||
946 | lbzx $acc15,$Tbl1,$acc15 | ||
934 | rlwimi $s2,$acc06,16,8,15 | 947 | rlwimi $s2,$acc06,16,8,15 |
935 | rlwimi $s3,$acc07,16,8,15 | 948 | rlwimi $s3,$acc07,16,8,15 |
936 | rlwimi $s0,$acc08,8,16,23 | 949 | rlwimi $s0,$acc08,8,16,23 |
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl index 4b27afd92f..7e01889298 100644 --- a/src/lib/libcrypto/aes/asm/aes-s390x.pl +++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl | |||
@@ -765,6 +765,11 @@ $code.=<<___ if (!$softonly); | |||
765 | srl %r5,6 | 765 | srl %r5,6 |
766 | ar %r5,%r0 | 766 | ar %r5,%r0 |
767 | 767 | ||
768 | larl %r1,OPENSSL_s390xcap_P | ||
769 | lg %r0,0(%r1) | ||
770 | tmhl %r0,0x4000 # check for message-security assist | ||
771 | jz .Lekey_internal | ||
772 | |||
768 | lghi %r0,0 # query capability vector | 773 | lghi %r0,0 # query capability vector |
769 | la %r1,16($sp) | 774 | la %r1,16($sp) |
770 | .long 0xb92f0042 # kmc %r4,%r2 | 775 | .long 0xb92f0042 # kmc %r4,%r2 |
@@ -1323,6 +1328,7 @@ $code.=<<___; | |||
1323 | 4: ex $len,0($s1) | 1328 | 4: ex $len,0($s1) |
1324 | j .Lcbc_dec_exit | 1329 | j .Lcbc_dec_exit |
1325 | .size AES_cbc_encrypt,.-AES_cbc_encrypt | 1330 | .size AES_cbc_encrypt,.-AES_cbc_encrypt |
1331 | .comm OPENSSL_s390xcap_P,8,8 | ||
1326 | ___ | 1332 | ___ |
1327 | } | 1333 | } |
1328 | $code.=<<___; | 1334 | $code.=<<___; |
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl index f616f1751f..a545e892ae 100755 --- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl +++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl | |||
@@ -2,11 +2,12 @@ | |||
2 | # | 2 | # |
3 | # ==================================================================== | 3 | # ==================================================================== |
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
5 | # project. Rights for redistribution and usage in source and binary | 5 | # project. The module is, however, dual licensed under OpenSSL and |
6 | # forms are granted according to the OpenSSL license. | 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
7 | # ==================================================================== | 8 | # ==================================================================== |
8 | # | 9 | # |
9 | # Version 1.2. | 10 | # Version 2.1. |
10 | # | 11 | # |
11 | # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on | 12 | # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on |
12 | # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version | 13 | # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version |
@@ -17,17 +18,29 @@ | |||
17 | # | 18 | # |
18 | # Performance in number of cycles per processed byte for 128-bit key: | 19 | # Performance in number of cycles per processed byte for 128-bit key: |
19 | # | 20 | # |
20 | # ECB CBC encrypt | 21 | # ECB encrypt ECB decrypt CBC large chunk |
21 | # AMD64 13.7 13.0(*) | 22 | # AMD64 33 41 13.0 |
22 | # EM64T 20.2 18.6(*) | 23 | # EM64T 38 59 18.6(*) |
24 | # Core 2 30 43 14.5(*) | ||
23 | # | 25 | # |
24 | # (*) CBC benchmarks are better than ECB thanks to custom ABI used | 26 | # (*) with hyper-threading off |
25 | # by the private block encryption function. | 27 | |
28 | $flavour = shift; | ||
29 | $output = shift; | ||
30 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
31 | |||
32 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
33 | |||
34 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
35 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
36 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
37 | die "can't locate x86_64-xlate.pl"; | ||
38 | |||
39 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
26 | 40 | ||
27 | $verticalspin=1; # unlike 32-bit version $verticalspin performs | 41 | $verticalspin=1; # unlike 32-bit version $verticalspin performs |
28 | # ~15% better on both AMD and Intel cores | 42 | # ~15% better on both AMD and Intel cores |
29 | $output=shift; | 43 | $speed_limit=512; # see aes-586.pl for details |
30 | open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"; | ||
31 | 44 | ||
32 | $code=".text\n"; | 45 | $code=".text\n"; |
33 | 46 | ||
@@ -35,9 +48,9 @@ $s0="%eax"; | |||
35 | $s1="%ebx"; | 48 | $s1="%ebx"; |
36 | $s2="%ecx"; | 49 | $s2="%ecx"; |
37 | $s3="%edx"; | 50 | $s3="%edx"; |
38 | $acc0="%esi"; | 51 | $acc0="%esi"; $mask80="%rsi"; |
39 | $acc1="%edi"; | 52 | $acc1="%edi"; $maskfe="%rdi"; |
40 | $acc2="%ebp"; | 53 | $acc2="%ebp"; $mask1b="%rbp"; |
41 | $inp="%r8"; | 54 | $inp="%r8"; |
42 | $out="%r9"; | 55 | $out="%r9"; |
43 | $t0="%r10d"; | 56 | $t0="%r10d"; |
@@ -51,6 +64,8 @@ sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; } | |||
51 | sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; | 64 | sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; |
52 | $r =~ s/%[er]([sd]i)/%\1l/; | 65 | $r =~ s/%[er]([sd]i)/%\1l/; |
53 | $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } | 66 | $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } |
67 | sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/; | ||
68 | $r =~ s/%r([0-9]+)/%r\1d/; $r; } | ||
54 | sub _data_word() | 69 | sub _data_word() |
55 | { my $i; | 70 | { my $i; |
56 | while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } | 71 | while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } |
@@ -138,22 +153,17 @@ $code.=<<___; | |||
138 | movzb `&lo("$s0")`,$acc0 | 153 | movzb `&lo("$s0")`,$acc0 |
139 | movzb `&lo("$s1")`,$acc1 | 154 | movzb `&lo("$s1")`,$acc1 |
140 | movzb `&lo("$s2")`,$acc2 | 155 | movzb `&lo("$s2")`,$acc2 |
141 | mov 2($sbox,$acc0,8),$t0 | 156 | movzb 2($sbox,$acc0,8),$t0 |
142 | mov 2($sbox,$acc1,8),$t1 | 157 | movzb 2($sbox,$acc1,8),$t1 |
143 | mov 2($sbox,$acc2,8),$t2 | 158 | movzb 2($sbox,$acc2,8),$t2 |
144 | |||
145 | and \$0x000000ff,$t0 | ||
146 | and \$0x000000ff,$t1 | ||
147 | and \$0x000000ff,$t2 | ||
148 | 159 | ||
149 | movzb `&lo("$s3")`,$acc0 | 160 | movzb `&lo("$s3")`,$acc0 |
150 | movzb `&hi("$s1")`,$acc1 | 161 | movzb `&hi("$s1")`,$acc1 |
151 | movzb `&hi("$s2")`,$acc2 | 162 | movzb `&hi("$s2")`,$acc2 |
152 | mov 2($sbox,$acc0,8),$t3 | 163 | movzb 2($sbox,$acc0,8),$t3 |
153 | mov 0($sbox,$acc1,8),$acc1 #$t0 | 164 | mov 0($sbox,$acc1,8),$acc1 #$t0 |
154 | mov 0($sbox,$acc2,8),$acc2 #$t1 | 165 | mov 0($sbox,$acc2,8),$acc2 #$t1 |
155 | 166 | ||
156 | and \$0x000000ff,$t3 | ||
157 | and \$0x0000ff00,$acc1 | 167 | and \$0x0000ff00,$acc1 |
158 | and \$0x0000ff00,$acc2 | 168 | and \$0x0000ff00,$acc2 |
159 | 169 | ||
@@ -345,6 +355,234 @@ $code.=<<___; | |||
345 | .size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt | 355 | .size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt |
346 | ___ | 356 | ___ |
347 | 357 | ||
358 | # it's possible to implement this by shifting tN by 8, filling least | ||
359 | # significant byte with byte load and finally bswap-ing at the end, | ||
360 | # but such partial register load kills Core 2... | ||
361 | sub enccompactvert() | ||
362 | { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); | ||
363 | |||
364 | $code.=<<___; | ||
365 | movzb `&lo("$s0")`,$t0 | ||
366 | movzb `&lo("$s1")`,$t1 | ||
367 | movzb `&lo("$s2")`,$t2 | ||
368 | movzb ($sbox,$t0,1),$t0 | ||
369 | movzb ($sbox,$t1,1),$t1 | ||
370 | movzb ($sbox,$t2,1),$t2 | ||
371 | |||
372 | movzb `&lo("$s3")`,$t3 | ||
373 | movzb `&hi("$s1")`,$acc0 | ||
374 | movzb `&hi("$s2")`,$acc1 | ||
375 | movzb ($sbox,$t3,1),$t3 | ||
376 | movzb ($sbox,$acc0,1),$t4 #$t0 | ||
377 | movzb ($sbox,$acc1,1),$t5 #$t1 | ||
378 | |||
379 | movzb `&hi("$s3")`,$acc2 | ||
380 | movzb `&hi("$s0")`,$acc0 | ||
381 | shr \$16,$s2 | ||
382 | movzb ($sbox,$acc2,1),$acc2 #$t2 | ||
383 | movzb ($sbox,$acc0,1),$acc0 #$t3 | ||
384 | shr \$16,$s3 | ||
385 | |||
386 | movzb `&lo("$s2")`,$acc1 | ||
387 | shl \$8,$t4 | ||
388 | shl \$8,$t5 | ||
389 | movzb ($sbox,$acc1,1),$acc1 #$t0 | ||
390 | xor $t4,$t0 | ||
391 | xor $t5,$t1 | ||
392 | |||
393 | movzb `&lo("$s3")`,$t4 | ||
394 | shr \$16,$s0 | ||
395 | shr \$16,$s1 | ||
396 | movzb `&lo("$s0")`,$t5 | ||
397 | shl \$8,$acc2 | ||
398 | shl \$8,$acc0 | ||
399 | movzb ($sbox,$t4,1),$t4 #$t1 | ||
400 | movzb ($sbox,$t5,1),$t5 #$t2 | ||
401 | xor $acc2,$t2 | ||
402 | xor $acc0,$t3 | ||
403 | |||
404 | movzb `&lo("$s1")`,$acc2 | ||
405 | movzb `&hi("$s3")`,$acc0 | ||
406 | shl \$16,$acc1 | ||
407 | movzb ($sbox,$acc2,1),$acc2 #$t3 | ||
408 | movzb ($sbox,$acc0,1),$acc0 #$t0 | ||
409 | xor $acc1,$t0 | ||
410 | |||
411 | movzb `&hi("$s0")`,$acc1 | ||
412 | shr \$8,$s2 | ||
413 | shr \$8,$s1 | ||
414 | movzb ($sbox,$acc1,1),$acc1 #$t1 | ||
415 | movzb ($sbox,$s2,1),$s3 #$t3 | ||
416 | movzb ($sbox,$s1,1),$s2 #$t2 | ||
417 | shl \$16,$t4 | ||
418 | shl \$16,$t5 | ||
419 | shl \$16,$acc2 | ||
420 | xor $t4,$t1 | ||
421 | xor $t5,$t2 | ||
422 | xor $acc2,$t3 | ||
423 | |||
424 | shl \$24,$acc0 | ||
425 | shl \$24,$acc1 | ||
426 | shl \$24,$s3 | ||
427 | xor $acc0,$t0 | ||
428 | shl \$24,$s2 | ||
429 | xor $acc1,$t1 | ||
430 | mov $t0,$s0 | ||
431 | mov $t1,$s1 | ||
432 | xor $t2,$s2 | ||
433 | xor $t3,$s3 | ||
434 | ___ | ||
435 | } | ||
436 | |||
437 | sub enctransform_ref() | ||
438 | { my $sn = shift; | ||
439 | my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d"); | ||
440 | |||
441 | $code.=<<___; | ||
442 | mov $sn,$acc | ||
443 | and \$0x80808080,$acc | ||
444 | mov $acc,$tmp | ||
445 | shr \$7,$tmp | ||
446 | lea ($sn,$sn),$r2 | ||
447 | sub $tmp,$acc | ||
448 | and \$0xfefefefe,$r2 | ||
449 | and \$0x1b1b1b1b,$acc | ||
450 | mov $sn,$tmp | ||
451 | xor $acc,$r2 | ||
452 | |||
453 | xor $r2,$sn | ||
454 | rol \$24,$sn | ||
455 | xor $r2,$sn | ||
456 | ror \$16,$tmp | ||
457 | xor $tmp,$sn | ||
458 | ror \$8,$tmp | ||
459 | xor $tmp,$sn | ||
460 | ___ | ||
461 | } | ||
462 | |||
463 | # unlike decrypt case it does not pay off to parallelize enctransform | ||
464 | sub enctransform() | ||
465 | { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d"); | ||
466 | |||
467 | $code.=<<___; | ||
468 | mov $s0,$acc0 | ||
469 | mov $s1,$acc1 | ||
470 | and \$0x80808080,$acc0 | ||
471 | and \$0x80808080,$acc1 | ||
472 | mov $acc0,$t0 | ||
473 | mov $acc1,$t1 | ||
474 | shr \$7,$t0 | ||
475 | lea ($s0,$s0),$r20 | ||
476 | shr \$7,$t1 | ||
477 | lea ($s1,$s1),$r21 | ||
478 | sub $t0,$acc0 | ||
479 | sub $t1,$acc1 | ||
480 | and \$0xfefefefe,$r20 | ||
481 | and \$0xfefefefe,$r21 | ||
482 | and \$0x1b1b1b1b,$acc0 | ||
483 | and \$0x1b1b1b1b,$acc1 | ||
484 | mov $s0,$t0 | ||
485 | mov $s1,$t1 | ||
486 | xor $acc0,$r20 | ||
487 | xor $acc1,$r21 | ||
488 | |||
489 | xor $r20,$s0 | ||
490 | xor $r21,$s1 | ||
491 | mov $s2,$acc0 | ||
492 | mov $s3,$acc1 | ||
493 | rol \$24,$s0 | ||
494 | rol \$24,$s1 | ||
495 | and \$0x80808080,$acc0 | ||
496 | and \$0x80808080,$acc1 | ||
497 | xor $r20,$s0 | ||
498 | xor $r21,$s1 | ||
499 | mov $acc0,$t2 | ||
500 | mov $acc1,$t3 | ||
501 | ror \$16,$t0 | ||
502 | ror \$16,$t1 | ||
503 | shr \$7,$t2 | ||
504 | lea ($s2,$s2),$r20 | ||
505 | xor $t0,$s0 | ||
506 | xor $t1,$s1 | ||
507 | shr \$7,$t3 | ||
508 | lea ($s3,$s3),$r21 | ||
509 | ror \$8,$t0 | ||
510 | ror \$8,$t1 | ||
511 | sub $t2,$acc0 | ||
512 | sub $t3,$acc1 | ||
513 | xor $t0,$s0 | ||
514 | xor $t1,$s1 | ||
515 | |||
516 | and \$0xfefefefe,$r20 | ||
517 | and \$0xfefefefe,$r21 | ||
518 | and \$0x1b1b1b1b,$acc0 | ||
519 | and \$0x1b1b1b1b,$acc1 | ||
520 | mov $s2,$t2 | ||
521 | mov $s3,$t3 | ||
522 | xor $acc0,$r20 | ||
523 | xor $acc1,$r21 | ||
524 | |||
525 | xor $r20,$s2 | ||
526 | xor $r21,$s3 | ||
527 | rol \$24,$s2 | ||
528 | rol \$24,$s3 | ||
529 | xor $r20,$s2 | ||
530 | xor $r21,$s3 | ||
531 | mov 0($sbox),$acc0 # prefetch Te4 | ||
532 | ror \$16,$t2 | ||
533 | ror \$16,$t3 | ||
534 | mov 64($sbox),$acc1 | ||
535 | xor $t2,$s2 | ||
536 | xor $t3,$s3 | ||
537 | mov 128($sbox),$r20 | ||
538 | ror \$8,$t2 | ||
539 | ror \$8,$t3 | ||
540 | mov 192($sbox),$r21 | ||
541 | xor $t2,$s2 | ||
542 | xor $t3,$s3 | ||
543 | ___ | ||
544 | } | ||
545 | |||
546 | $code.=<<___; | ||
547 | .type _x86_64_AES_encrypt_compact,\@abi-omnipotent | ||
548 | .align 16 | ||
549 | _x86_64_AES_encrypt_compact: | ||
550 | lea 128($sbox),$inp # size optimization | ||
551 | mov 0-128($inp),$acc1 # prefetch Te4 | ||
552 | mov 32-128($inp),$acc2 | ||
553 | mov 64-128($inp),$t0 | ||
554 | mov 96-128($inp),$t1 | ||
555 | mov 128-128($inp),$acc1 | ||
556 | mov 160-128($inp),$acc2 | ||
557 | mov 192-128($inp),$t0 | ||
558 | mov 224-128($inp),$t1 | ||
559 | jmp .Lenc_loop_compact | ||
560 | .align 16 | ||
561 | .Lenc_loop_compact: | ||
562 | xor 0($key),$s0 # xor with key | ||
563 | xor 4($key),$s1 | ||
564 | xor 8($key),$s2 | ||
565 | xor 12($key),$s3 | ||
566 | lea 16($key),$key | ||
567 | ___ | ||
568 | &enccompactvert(); | ||
569 | $code.=<<___; | ||
570 | cmp 16(%rsp),$key | ||
571 | je .Lenc_compact_done | ||
572 | ___ | ||
573 | &enctransform(); | ||
574 | $code.=<<___; | ||
575 | jmp .Lenc_loop_compact | ||
576 | .align 16 | ||
577 | .Lenc_compact_done: | ||
578 | xor 0($key),$s0 | ||
579 | xor 4($key),$s1 | ||
580 | xor 8($key),$s2 | ||
581 | xor 12($key),$s3 | ||
582 | .byte 0xf3,0xc3 # rep ret | ||
583 | .size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact | ||
584 | ___ | ||
585 | |||
348 | # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); | 586 | # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); |
349 | $code.=<<___; | 587 | $code.=<<___; |
350 | .globl AES_encrypt | 588 | .globl AES_encrypt |
@@ -358,31 +596,57 @@ AES_encrypt: | |||
358 | push %r14 | 596 | push %r14 |
359 | push %r15 | 597 | push %r15 |
360 | 598 | ||
361 | mov %rdx,$key | 599 | # allocate frame "above" key schedule |
362 | mov %rdi,$inp | 600 | mov %rsp,%r10 |
363 | mov %rsi,$out | 601 | lea -63(%rdx),%rcx # %rdx is key argument |
364 | 602 | and \$-64,%rsp | |
365 | .picmeup $sbox | 603 | sub %rsp,%rcx |
366 | lea AES_Te-.($sbox),$sbox | 604 | neg %rcx |
367 | 605 | and \$0x3c0,%rcx | |
368 | mov 0($inp),$s0 | 606 | sub %rcx,%rsp |
369 | mov 4($inp),$s1 | 607 | sub \$32,%rsp |
370 | mov 8($inp),$s2 | ||
371 | mov 12($inp),$s3 | ||
372 | 608 | ||
373 | call _x86_64_AES_encrypt | 609 | mov %rsi,16(%rsp) # save out |
610 | mov %r10,24(%rsp) # save real stack pointer | ||
611 | .Lenc_prologue: | ||
374 | 612 | ||
375 | mov $s0,0($out) | 613 | mov %rdx,$key |
614 | mov 240($key),$rnds # load rounds | ||
615 | |||
616 | mov 0(%rdi),$s0 # load input vector | ||
617 | mov 4(%rdi),$s1 | ||
618 | mov 8(%rdi),$s2 | ||
619 | mov 12(%rdi),$s3 | ||
620 | |||
621 | shl \$4,$rnds | ||
622 | lea ($key,$rnds),%rbp | ||
623 | mov $key,(%rsp) # key schedule | ||
624 | mov %rbp,8(%rsp) # end of key schedule | ||
625 | |||
626 | # pick Te4 copy which can't "overlap" with stack frame or key schedule | ||
627 | lea .LAES_Te+2048(%rip),$sbox | ||
628 | lea 768(%rsp),%rbp | ||
629 | sub $sbox,%rbp | ||
630 | and \$0x300,%rbp | ||
631 | lea ($sbox,%rbp),$sbox | ||
632 | |||
633 | call _x86_64_AES_encrypt_compact | ||
634 | |||
635 | mov 16(%rsp),$out # restore out | ||
636 | mov 24(%rsp),%rsi # restore saved stack pointer | ||
637 | mov $s0,0($out) # write output vector | ||
376 | mov $s1,4($out) | 638 | mov $s1,4($out) |
377 | mov $s2,8($out) | 639 | mov $s2,8($out) |
378 | mov $s3,12($out) | 640 | mov $s3,12($out) |
379 | 641 | ||
380 | pop %r15 | 642 | mov (%rsi),%r15 |
381 | pop %r14 | 643 | mov 8(%rsi),%r14 |
382 | pop %r13 | 644 | mov 16(%rsi),%r13 |
383 | pop %r12 | 645 | mov 24(%rsi),%r12 |
384 | pop %rbp | 646 | mov 32(%rsi),%rbp |
385 | pop %rbx | 647 | mov 40(%rsi),%rbx |
648 | lea 48(%rsi),%rsp | ||
649 | .Lenc_epilogue: | ||
386 | ret | 650 | ret |
387 | .size AES_encrypt,.-AES_encrypt | 651 | .size AES_encrypt,.-AES_encrypt |
388 | ___ | 652 | ___ |
@@ -453,19 +717,20 @@ sub declastvert() | |||
453 | { my $t3="%r8d"; # zaps $inp! | 717 | { my $t3="%r8d"; # zaps $inp! |
454 | 718 | ||
455 | $code.=<<___; | 719 | $code.=<<___; |
720 | lea 2048($sbox),$sbox # size optimization | ||
456 | movzb `&lo("$s0")`,$acc0 | 721 | movzb `&lo("$s0")`,$acc0 |
457 | movzb `&lo("$s1")`,$acc1 | 722 | movzb `&lo("$s1")`,$acc1 |
458 | movzb `&lo("$s2")`,$acc2 | 723 | movzb `&lo("$s2")`,$acc2 |
459 | movzb 2048($sbox,$acc0,1),$t0 | 724 | movzb ($sbox,$acc0,1),$t0 |
460 | movzb 2048($sbox,$acc1,1),$t1 | 725 | movzb ($sbox,$acc1,1),$t1 |
461 | movzb 2048($sbox,$acc2,1),$t2 | 726 | movzb ($sbox,$acc2,1),$t2 |
462 | 727 | ||
463 | movzb `&lo("$s3")`,$acc0 | 728 | movzb `&lo("$s3")`,$acc0 |
464 | movzb `&hi("$s3")`,$acc1 | 729 | movzb `&hi("$s3")`,$acc1 |
465 | movzb `&hi("$s0")`,$acc2 | 730 | movzb `&hi("$s0")`,$acc2 |
466 | movzb 2048($sbox,$acc0,1),$t3 | 731 | movzb ($sbox,$acc0,1),$t3 |
467 | movzb 2048($sbox,$acc1,1),$acc1 #$t0 | 732 | movzb ($sbox,$acc1,1),$acc1 #$t0 |
468 | movzb 2048($sbox,$acc2,1),$acc2 #$t1 | 733 | movzb ($sbox,$acc2,1),$acc2 #$t1 |
469 | 734 | ||
470 | shl \$8,$acc1 | 735 | shl \$8,$acc1 |
471 | shl \$8,$acc2 | 736 | shl \$8,$acc2 |
@@ -477,8 +742,8 @@ $code.=<<___; | |||
477 | movzb `&hi("$s1")`,$acc0 | 742 | movzb `&hi("$s1")`,$acc0 |
478 | movzb `&hi("$s2")`,$acc1 | 743 | movzb `&hi("$s2")`,$acc1 |
479 | shr \$16,$s0 | 744 | shr \$16,$s0 |
480 | movzb 2048($sbox,$acc0,1),$acc0 #$t2 | 745 | movzb ($sbox,$acc0,1),$acc0 #$t2 |
481 | movzb 2048($sbox,$acc1,1),$acc1 #$t3 | 746 | movzb ($sbox,$acc1,1),$acc1 #$t3 |
482 | 747 | ||
483 | shl \$8,$acc0 | 748 | shl \$8,$acc0 |
484 | shl \$8,$acc1 | 749 | shl \$8,$acc1 |
@@ -490,9 +755,9 @@ $code.=<<___; | |||
490 | movzb `&lo("$s2")`,$acc0 | 755 | movzb `&lo("$s2")`,$acc0 |
491 | movzb `&lo("$s3")`,$acc1 | 756 | movzb `&lo("$s3")`,$acc1 |
492 | movzb `&lo("$s0")`,$acc2 | 757 | movzb `&lo("$s0")`,$acc2 |
493 | movzb 2048($sbox,$acc0,1),$acc0 #$t0 | 758 | movzb ($sbox,$acc0,1),$acc0 #$t0 |
494 | movzb 2048($sbox,$acc1,1),$acc1 #$t1 | 759 | movzb ($sbox,$acc1,1),$acc1 #$t1 |
495 | movzb 2048($sbox,$acc2,1),$acc2 #$t2 | 760 | movzb ($sbox,$acc2,1),$acc2 #$t2 |
496 | 761 | ||
497 | shl \$16,$acc0 | 762 | shl \$16,$acc0 |
498 | shl \$16,$acc1 | 763 | shl \$16,$acc1 |
@@ -505,9 +770,9 @@ $code.=<<___; | |||
505 | movzb `&lo("$s1")`,$acc0 | 770 | movzb `&lo("$s1")`,$acc0 |
506 | movzb `&hi("$s1")`,$acc1 | 771 | movzb `&hi("$s1")`,$acc1 |
507 | movzb `&hi("$s2")`,$acc2 | 772 | movzb `&hi("$s2")`,$acc2 |
508 | movzb 2048($sbox,$acc0,1),$acc0 #$t3 | 773 | movzb ($sbox,$acc0,1),$acc0 #$t3 |
509 | movzb 2048($sbox,$acc1,1),$acc1 #$t0 | 774 | movzb ($sbox,$acc1,1),$acc1 #$t0 |
510 | movzb 2048($sbox,$acc2,1),$acc2 #$t1 | 775 | movzb ($sbox,$acc2,1),$acc2 #$t1 |
511 | 776 | ||
512 | shl \$16,$acc0 | 777 | shl \$16,$acc0 |
513 | shl \$24,$acc1 | 778 | shl \$24,$acc1 |
@@ -520,8 +785,8 @@ $code.=<<___; | |||
520 | movzb `&hi("$s3")`,$acc0 | 785 | movzb `&hi("$s3")`,$acc0 |
521 | movzb `&hi("$s0")`,$acc1 | 786 | movzb `&hi("$s0")`,$acc1 |
522 | mov 16+12($key),$s3 | 787 | mov 16+12($key),$s3 |
523 | movzb 2048($sbox,$acc0,1),$acc0 #$t2 | 788 | movzb ($sbox,$acc0,1),$acc0 #$t2 |
524 | movzb 2048($sbox,$acc1,1),$acc1 #$t3 | 789 | movzb ($sbox,$acc1,1),$acc1 #$t3 |
525 | mov 16+0($key),$s0 | 790 | mov 16+0($key),$s0 |
526 | 791 | ||
527 | shl \$24,$acc0 | 792 | shl \$24,$acc0 |
@@ -532,6 +797,7 @@ $code.=<<___; | |||
532 | 797 | ||
533 | mov 16+4($key),$s1 | 798 | mov 16+4($key),$s1 |
534 | mov 16+8($key),$s2 | 799 | mov 16+8($key),$s2 |
800 | lea -2048($sbox),$sbox | ||
535 | xor $t0,$s0 | 801 | xor $t0,$s0 |
536 | xor $t1,$s1 | 802 | xor $t1,$s1 |
537 | xor $t2,$s2 | 803 | xor $t2,$s2 |
@@ -659,6 +925,260 @@ $code.=<<___; | |||
659 | .size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt | 925 | .size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt |
660 | ___ | 926 | ___ |
661 | 927 | ||
928 | sub deccompactvert() | ||
929 | { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); | ||
930 | |||
931 | $code.=<<___; | ||
932 | movzb `&lo("$s0")`,$t0 | ||
933 | movzb `&lo("$s1")`,$t1 | ||
934 | movzb `&lo("$s2")`,$t2 | ||
935 | movzb ($sbox,$t0,1),$t0 | ||
936 | movzb ($sbox,$t1,1),$t1 | ||
937 | movzb ($sbox,$t2,1),$t2 | ||
938 | |||
939 | movzb `&lo("$s3")`,$t3 | ||
940 | movzb `&hi("$s3")`,$acc0 | ||
941 | movzb `&hi("$s0")`,$acc1 | ||
942 | movzb ($sbox,$t3,1),$t3 | ||
943 | movzb ($sbox,$acc0,1),$t4 #$t0 | ||
944 | movzb ($sbox,$acc1,1),$t5 #$t1 | ||
945 | |||
946 | movzb `&hi("$s1")`,$acc2 | ||
947 | movzb `&hi("$s2")`,$acc0 | ||
948 | shr \$16,$s2 | ||
949 | movzb ($sbox,$acc2,1),$acc2 #$t2 | ||
950 | movzb ($sbox,$acc0,1),$acc0 #$t3 | ||
951 | shr \$16,$s3 | ||
952 | |||
953 | movzb `&lo("$s2")`,$acc1 | ||
954 | shl \$8,$t4 | ||
955 | shl \$8,$t5 | ||
956 | movzb ($sbox,$acc1,1),$acc1 #$t0 | ||
957 | xor $t4,$t0 | ||
958 | xor $t5,$t1 | ||
959 | |||
960 | movzb `&lo("$s3")`,$t4 | ||
961 | shr \$16,$s0 | ||
962 | shr \$16,$s1 | ||
963 | movzb `&lo("$s0")`,$t5 | ||
964 | shl \$8,$acc2 | ||
965 | shl \$8,$acc0 | ||
966 | movzb ($sbox,$t4,1),$t4 #$t1 | ||
967 | movzb ($sbox,$t5,1),$t5 #$t2 | ||
968 | xor $acc2,$t2 | ||
969 | xor $acc0,$t3 | ||
970 | |||
971 | movzb `&lo("$s1")`,$acc2 | ||
972 | movzb `&hi("$s1")`,$acc0 | ||
973 | shl \$16,$acc1 | ||
974 | movzb ($sbox,$acc2,1),$acc2 #$t3 | ||
975 | movzb ($sbox,$acc0,1),$acc0 #$t0 | ||
976 | xor $acc1,$t0 | ||
977 | |||
978 | movzb `&hi("$s2")`,$acc1 | ||
979 | shl \$16,$t4 | ||
980 | shl \$16,$t5 | ||
981 | movzb ($sbox,$acc1,1),$s1 #$t1 | ||
982 | xor $t4,$t1 | ||
983 | xor $t5,$t2 | ||
984 | |||
985 | movzb `&hi("$s3")`,$acc1 | ||
986 | shr \$8,$s0 | ||
987 | shl \$16,$acc2 | ||
988 | movzb ($sbox,$acc1,1),$s2 #$t2 | ||
989 | movzb ($sbox,$s0,1),$s3 #$t3 | ||
990 | xor $acc2,$t3 | ||
991 | |||
992 | shl \$24,$acc0 | ||
993 | shl \$24,$s1 | ||
994 | shl \$24,$s2 | ||
995 | xor $acc0,$t0 | ||
996 | shl \$24,$s3 | ||
997 | xor $t1,$s1 | ||
998 | mov $t0,$s0 | ||
999 | xor $t2,$s2 | ||
1000 | xor $t3,$s3 | ||
1001 | ___ | ||
1002 | } | ||
1003 | |||
1004 | # parallelized version! input is pair of 64-bit values: %rax=s1.s0 | ||
1005 | # and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1, | ||
1006 | # %ecx=s2 and %edx=s3. | ||
1007 | sub dectransform() | ||
1008 | { my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx"); | ||
1009 | my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx"); | ||
1010 | my $prefetch = shift; | ||
1011 | |||
1012 | $code.=<<___; | ||
1013 | mov $tp10,$acc0 | ||
1014 | mov $tp18,$acc8 | ||
1015 | and $mask80,$acc0 | ||
1016 | and $mask80,$acc8 | ||
1017 | mov $acc0,$tp40 | ||
1018 | mov $acc8,$tp48 | ||
1019 | shr \$7,$tp40 | ||
1020 | lea ($tp10,$tp10),$tp20 | ||
1021 | shr \$7,$tp48 | ||
1022 | lea ($tp18,$tp18),$tp28 | ||
1023 | sub $tp40,$acc0 | ||
1024 | sub $tp48,$acc8 | ||
1025 | and $maskfe,$tp20 | ||
1026 | and $maskfe,$tp28 | ||
1027 | and $mask1b,$acc0 | ||
1028 | and $mask1b,$acc8 | ||
1029 | xor $tp20,$acc0 | ||
1030 | xor $tp28,$acc8 | ||
1031 | mov $acc0,$tp20 | ||
1032 | mov $acc8,$tp28 | ||
1033 | |||
1034 | and $mask80,$acc0 | ||
1035 | and $mask80,$acc8 | ||
1036 | mov $acc0,$tp80 | ||
1037 | mov $acc8,$tp88 | ||
1038 | shr \$7,$tp80 | ||
1039 | lea ($tp20,$tp20),$tp40 | ||
1040 | shr \$7,$tp88 | ||
1041 | lea ($tp28,$tp28),$tp48 | ||
1042 | sub $tp80,$acc0 | ||
1043 | sub $tp88,$acc8 | ||
1044 | and $maskfe,$tp40 | ||
1045 | and $maskfe,$tp48 | ||
1046 | and $mask1b,$acc0 | ||
1047 | and $mask1b,$acc8 | ||
1048 | xor $tp40,$acc0 | ||
1049 | xor $tp48,$acc8 | ||
1050 | mov $acc0,$tp40 | ||
1051 | mov $acc8,$tp48 | ||
1052 | |||
1053 | and $mask80,$acc0 | ||
1054 | and $mask80,$acc8 | ||
1055 | mov $acc0,$tp80 | ||
1056 | mov $acc8,$tp88 | ||
1057 | shr \$7,$tp80 | ||
1058 | xor $tp10,$tp20 # tp2^=tp1 | ||
1059 | shr \$7,$tp88 | ||
1060 | xor $tp18,$tp28 # tp2^=tp1 | ||
1061 | sub $tp80,$acc0 | ||
1062 | sub $tp88,$acc8 | ||
1063 | lea ($tp40,$tp40),$tp80 | ||
1064 | lea ($tp48,$tp48),$tp88 | ||
1065 | xor $tp10,$tp40 # tp4^=tp1 | ||
1066 | xor $tp18,$tp48 # tp4^=tp1 | ||
1067 | and $maskfe,$tp80 | ||
1068 | and $maskfe,$tp88 | ||
1069 | and $mask1b,$acc0 | ||
1070 | and $mask1b,$acc8 | ||
1071 | xor $acc0,$tp80 | ||
1072 | xor $acc8,$tp88 | ||
1073 | |||
1074 | xor $tp80,$tp10 # tp1^=tp8 | ||
1075 | xor $tp88,$tp18 # tp1^=tp8 | ||
1076 | xor $tp80,$tp20 # tp2^tp1^=tp8 | ||
1077 | xor $tp88,$tp28 # tp2^tp1^=tp8 | ||
1078 | mov $tp10,$acc0 | ||
1079 | mov $tp18,$acc8 | ||
1080 | xor $tp80,$tp40 # tp4^tp1^=tp8 | ||
1081 | xor $tp88,$tp48 # tp4^tp1^=tp8 | ||
1082 | shr \$32,$acc0 | ||
1083 | shr \$32,$acc8 | ||
1084 | xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1 | ||
1085 | xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 | ||
1086 | rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8) | ||
1087 | rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8) | ||
1088 | xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 | ||
1089 | xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 | ||
1090 | |||
1091 | rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) | ||
1092 | rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8) | ||
1093 | xor `&LO("$tp80")`,`&LO("$tp10")` | ||
1094 | xor `&LO("$tp88")`,`&LO("$tp18")` | ||
1095 | shr \$32,$tp80 | ||
1096 | shr \$32,$tp88 | ||
1097 | xor `&LO("$tp80")`,`&LO("$acc0")` | ||
1098 | xor `&LO("$tp88")`,`&LO("$acc8")` | ||
1099 | |||
1100 | mov $tp20,$tp80 | ||
1101 | mov $tp28,$tp88 | ||
1102 | shr \$32,$tp80 | ||
1103 | shr \$32,$tp88 | ||
1104 | rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24) | ||
1105 | rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24) | ||
1106 | rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) | ||
1107 | rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) | ||
1108 | xor `&LO("$tp20")`,`&LO("$tp10")` | ||
1109 | xor `&LO("$tp28")`,`&LO("$tp18")` | ||
1110 | mov $tp40,$tp20 | ||
1111 | mov $tp48,$tp28 | ||
1112 | xor `&LO("$tp80")`,`&LO("$acc0")` | ||
1113 | xor `&LO("$tp88")`,`&LO("$acc8")` | ||
1114 | |||
1115 | `"mov 0($sbox),$mask80" if ($prefetch)` | ||
1116 | shr \$32,$tp20 | ||
1117 | shr \$32,$tp28 | ||
1118 | `"mov 64($sbox),$maskfe" if ($prefetch)` | ||
1119 | rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16) | ||
1120 | rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16) | ||
1121 | `"mov 128($sbox),$mask1b" if ($prefetch)` | ||
1122 | rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16) | ||
1123 | rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) | ||
1124 | `"mov 192($sbox),$tp80" if ($prefetch)` | ||
1125 | xor `&LO("$tp40")`,`&LO("$tp10")` | ||
1126 | xor `&LO("$tp48")`,`&LO("$tp18")` | ||
1127 | `"mov 256($sbox),$tp88" if ($prefetch)` | ||
1128 | xor `&LO("$tp20")`,`&LO("$acc0")` | ||
1129 | xor `&LO("$tp28")`,`&LO("$acc8")` | ||
1130 | ___ | ||
1131 | } | ||
1132 | |||
1133 | $code.=<<___; | ||
1134 | .type _x86_64_AES_decrypt_compact,\@abi-omnipotent | ||
1135 | .align 16 | ||
1136 | _x86_64_AES_decrypt_compact: | ||
1137 | lea 128($sbox),$inp # size optimization | ||
1138 | mov 0-128($inp),$acc1 # prefetch Td4 | ||
1139 | mov 32-128($inp),$acc2 | ||
1140 | mov 64-128($inp),$t0 | ||
1141 | mov 96-128($inp),$t1 | ||
1142 | mov 128-128($inp),$acc1 | ||
1143 | mov 160-128($inp),$acc2 | ||
1144 | mov 192-128($inp),$t0 | ||
1145 | mov 224-128($inp),$t1 | ||
1146 | jmp .Ldec_loop_compact | ||
1147 | |||
1148 | .align 16 | ||
1149 | .Ldec_loop_compact: | ||
1150 | xor 0($key),$s0 # xor with key | ||
1151 | xor 4($key),$s1 | ||
1152 | xor 8($key),$s2 | ||
1153 | xor 12($key),$s3 | ||
1154 | lea 16($key),$key | ||
1155 | ___ | ||
1156 | &deccompactvert(); | ||
1157 | $code.=<<___; | ||
1158 | cmp 16(%rsp),$key | ||
1159 | je .Ldec_compact_done | ||
1160 | |||
1161 | mov 256+0($sbox),$mask80 | ||
1162 | shl \$32,%rbx | ||
1163 | shl \$32,%rdx | ||
1164 | mov 256+8($sbox),$maskfe | ||
1165 | or %rbx,%rax | ||
1166 | or %rdx,%rcx | ||
1167 | mov 256+16($sbox),$mask1b | ||
1168 | ___ | ||
1169 | &dectransform(1); | ||
1170 | $code.=<<___; | ||
1171 | jmp .Ldec_loop_compact | ||
1172 | .align 16 | ||
1173 | .Ldec_compact_done: | ||
1174 | xor 0($key),$s0 | ||
1175 | xor 4($key),$s1 | ||
1176 | xor 8($key),$s2 | ||
1177 | xor 12($key),$s3 | ||
1178 | .byte 0xf3,0xc3 # rep ret | ||
1179 | .size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact | ||
1180 | ___ | ||
1181 | |||
662 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); | 1182 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); |
663 | $code.=<<___; | 1183 | $code.=<<___; |
664 | .globl AES_decrypt | 1184 | .globl AES_decrypt |
@@ -672,43 +1192,59 @@ AES_decrypt: | |||
672 | push %r14 | 1192 | push %r14 |
673 | push %r15 | 1193 | push %r15 |
674 | 1194 | ||
675 | mov %rdx,$key | 1195 | # allocate frame "above" key schedule |
676 | mov %rdi,$inp | 1196 | mov %rsp,%r10 |
677 | mov %rsi,$out | 1197 | lea -63(%rdx),%rcx # %rdx is key argument |
1198 | and \$-64,%rsp | ||
1199 | sub %rsp,%rcx | ||
1200 | neg %rcx | ||
1201 | and \$0x3c0,%rcx | ||
1202 | sub %rcx,%rsp | ||
1203 | sub \$32,%rsp | ||
1204 | |||
1205 | mov %rsi,16(%rsp) # save out | ||
1206 | mov %r10,24(%rsp) # save real stack pointer | ||
1207 | .Ldec_prologue: | ||
678 | 1208 | ||
679 | .picmeup $sbox | 1209 | mov %rdx,$key |
680 | lea AES_Td-.($sbox),$sbox | 1210 | mov 240($key),$rnds # load rounds |
681 | 1211 | ||
682 | # prefetch Td4 | 1212 | mov 0(%rdi),$s0 # load input vector |
683 | lea 2048+128($sbox),$sbox; | 1213 | mov 4(%rdi),$s1 |
684 | mov 0-128($sbox),$s0 | 1214 | mov 8(%rdi),$s2 |
685 | mov 32-128($sbox),$s1 | 1215 | mov 12(%rdi),$s3 |
686 | mov 64-128($sbox),$s2 | 1216 | |
687 | mov 96-128($sbox),$s3 | 1217 | shl \$4,$rnds |
688 | mov 128-128($sbox),$s0 | 1218 | lea ($key,$rnds),%rbp |
689 | mov 160-128($sbox),$s1 | 1219 | mov $key,(%rsp) # key schedule |
690 | mov 192-128($sbox),$s2 | 1220 | mov %rbp,8(%rsp) # end of key schedule |
691 | mov 224-128($sbox),$s3 | 1221 | |
692 | lea -2048-128($sbox),$sbox; | 1222 | # pick Td4 copy which can't "overlap" with stack frame or key schedule |
693 | 1223 | lea .LAES_Td+2048(%rip),$sbox | |
694 | mov 0($inp),$s0 | 1224 | lea 768(%rsp),%rbp |
695 | mov 4($inp),$s1 | 1225 | sub $sbox,%rbp |
696 | mov 8($inp),$s2 | 1226 | and \$0x300,%rbp |
697 | mov 12($inp),$s3 | 1227 | lea ($sbox,%rbp),$sbox |
698 | 1228 | shr \$3,%rbp # recall "magic" constants! | |
699 | call _x86_64_AES_decrypt | 1229 | add %rbp,$sbox |
700 | 1230 | ||
701 | mov $s0,0($out) | 1231 | call _x86_64_AES_decrypt_compact |
1232 | |||
1233 | mov 16(%rsp),$out # restore out | ||
1234 | mov 24(%rsp),%rsi # restore saved stack pointer | ||
1235 | mov $s0,0($out) # write output vector | ||
702 | mov $s1,4($out) | 1236 | mov $s1,4($out) |
703 | mov $s2,8($out) | 1237 | mov $s2,8($out) |
704 | mov $s3,12($out) | 1238 | mov $s3,12($out) |
705 | 1239 | ||
706 | pop %r15 | 1240 | mov (%rsi),%r15 |
707 | pop %r14 | 1241 | mov 8(%rsi),%r14 |
708 | pop %r13 | 1242 | mov 16(%rsi),%r13 |
709 | pop %r12 | 1243 | mov 24(%rsi),%r12 |
710 | pop %rbp | 1244 | mov 32(%rsi),%rbp |
711 | pop %rbx | 1245 | mov 40(%rsi),%rbx |
1246 | lea 48(%rsi),%rsp | ||
1247 | .Ldec_epilogue: | ||
712 | ret | 1248 | ret |
713 | .size AES_decrypt,.-AES_decrypt | 1249 | .size AES_decrypt,.-AES_decrypt |
714 | ___ | 1250 | ___ |
@@ -718,27 +1254,26 @@ sub enckey() | |||
718 | { | 1254 | { |
719 | $code.=<<___; | 1255 | $code.=<<___; |
720 | movz %dl,%esi # rk[i]>>0 | 1256 | movz %dl,%esi # rk[i]>>0 |
721 | mov 2(%rbp,%rsi,8),%ebx | 1257 | movzb -128(%rbp,%rsi),%ebx |
722 | movz %dh,%esi # rk[i]>>8 | 1258 | movz %dh,%esi # rk[i]>>8 |
723 | and \$0xFF000000,%ebx | 1259 | shl \$24,%ebx |
724 | xor %ebx,%eax | 1260 | xor %ebx,%eax |
725 | 1261 | ||
726 | mov 2(%rbp,%rsi,8),%ebx | 1262 | movzb -128(%rbp,%rsi),%ebx |
727 | shr \$16,%edx | 1263 | shr \$16,%edx |
728 | and \$0x000000FF,%ebx | ||
729 | movz %dl,%esi # rk[i]>>16 | 1264 | movz %dl,%esi # rk[i]>>16 |
730 | xor %ebx,%eax | 1265 | xor %ebx,%eax |
731 | 1266 | ||
732 | mov 0(%rbp,%rsi,8),%ebx | 1267 | movzb -128(%rbp,%rsi),%ebx |
733 | movz %dh,%esi # rk[i]>>24 | 1268 | movz %dh,%esi # rk[i]>>24 |
734 | and \$0x0000FF00,%ebx | 1269 | shl \$8,%ebx |
735 | xor %ebx,%eax | 1270 | xor %ebx,%eax |
736 | 1271 | ||
737 | mov 0(%rbp,%rsi,8),%ebx | 1272 | movzb -128(%rbp,%rsi),%ebx |
738 | and \$0x00FF0000,%ebx | 1273 | shl \$16,%ebx |
739 | xor %ebx,%eax | 1274 | xor %ebx,%eax |
740 | 1275 | ||
741 | xor 2048(%rbp,%rcx,4),%eax # rcon | 1276 | xor 1024-128(%rbp,%rcx,4),%eax # rcon |
742 | ___ | 1277 | ___ |
743 | } | 1278 | } |
744 | 1279 | ||
@@ -751,7 +1286,29 @@ $code.=<<___; | |||
751 | AES_set_encrypt_key: | 1286 | AES_set_encrypt_key: |
752 | push %rbx | 1287 | push %rbx |
753 | push %rbp | 1288 | push %rbp |
1289 | push %r12 # redundant, but allows to share | ||
1290 | push %r13 # exception handler... | ||
1291 | push %r14 | ||
1292 | push %r15 | ||
1293 | sub \$8,%rsp | ||
1294 | .Lenc_key_prologue: | ||
1295 | |||
1296 | call _x86_64_AES_set_encrypt_key | ||
1297 | |||
1298 | mov 8(%rsp),%r15 | ||
1299 | mov 16(%rsp),%r14 | ||
1300 | mov 24(%rsp),%r13 | ||
1301 | mov 32(%rsp),%r12 | ||
1302 | mov 40(%rsp),%rbp | ||
1303 | mov 48(%rsp),%rbx | ||
1304 | add \$56,%rsp | ||
1305 | .Lenc_key_epilogue: | ||
1306 | ret | ||
1307 | .size AES_set_encrypt_key,.-AES_set_encrypt_key | ||
754 | 1308 | ||
1309 | .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent | ||
1310 | .align 16 | ||
1311 | _x86_64_AES_set_encrypt_key: | ||
755 | mov %esi,%ecx # %ecx=bits | 1312 | mov %esi,%ecx # %ecx=bits |
756 | mov %rdi,%rsi # %rsi=userKey | 1313 | mov %rdi,%rsi # %rsi=userKey |
757 | mov %rdx,%rdi # %rdi=key | 1314 | mov %rdx,%rdi # %rdi=key |
@@ -761,8 +1318,18 @@ AES_set_encrypt_key: | |||
761 | test \$-1,%rdi | 1318 | test \$-1,%rdi |
762 | jz .Lbadpointer | 1319 | jz .Lbadpointer |
763 | 1320 | ||
764 | .picmeup %rbp | 1321 | lea .LAES_Te(%rip),%rbp |
765 | lea AES_Te-.(%rbp),%rbp | 1322 | lea 2048+128(%rbp),%rbp |
1323 | |||
1324 | # prefetch Te4 | ||
1325 | mov 0-128(%rbp),%eax | ||
1326 | mov 32-128(%rbp),%ebx | ||
1327 | mov 64-128(%rbp),%r8d | ||
1328 | mov 96-128(%rbp),%edx | ||
1329 | mov 128-128(%rbp),%eax | ||
1330 | mov 160-128(%rbp),%ebx | ||
1331 | mov 192-128(%rbp),%r8d | ||
1332 | mov 224-128(%rbp),%edx | ||
766 | 1333 | ||
767 | cmp \$128,%ecx | 1334 | cmp \$128,%ecx |
768 | je .L10rounds | 1335 | je .L10rounds |
@@ -774,15 +1341,12 @@ AES_set_encrypt_key: | |||
774 | jmp .Lexit | 1341 | jmp .Lexit |
775 | 1342 | ||
776 | .L10rounds: | 1343 | .L10rounds: |
777 | mov 0(%rsi),%eax # copy first 4 dwords | 1344 | mov 0(%rsi),%rax # copy first 4 dwords |
778 | mov 4(%rsi),%ebx | 1345 | mov 8(%rsi),%rdx |
779 | mov 8(%rsi),%ecx | 1346 | mov %rax,0(%rdi) |
780 | mov 12(%rsi),%edx | 1347 | mov %rdx,8(%rdi) |
781 | mov %eax,0(%rdi) | ||
782 | mov %ebx,4(%rdi) | ||
783 | mov %ecx,8(%rdi) | ||
784 | mov %edx,12(%rdi) | ||
785 | 1348 | ||
1349 | shr \$32,%rdx | ||
786 | xor %ecx,%ecx | 1350 | xor %ecx,%ecx |
787 | jmp .L10shortcut | 1351 | jmp .L10shortcut |
788 | .align 4 | 1352 | .align 4 |
@@ -810,19 +1374,14 @@ $code.=<<___; | |||
810 | jmp .Lexit | 1374 | jmp .Lexit |
811 | 1375 | ||
812 | .L12rounds: | 1376 | .L12rounds: |
813 | mov 0(%rsi),%eax # copy first 6 dwords | 1377 | mov 0(%rsi),%rax # copy first 6 dwords |
814 | mov 4(%rsi),%ebx | 1378 | mov 8(%rsi),%rbx |
815 | mov 8(%rsi),%ecx | 1379 | mov 16(%rsi),%rdx |
816 | mov 12(%rsi),%edx | 1380 | mov %rax,0(%rdi) |
817 | mov %eax,0(%rdi) | 1381 | mov %rbx,8(%rdi) |
818 | mov %ebx,4(%rdi) | 1382 | mov %rdx,16(%rdi) |
819 | mov %ecx,8(%rdi) | 1383 | |
820 | mov %edx,12(%rdi) | 1384 | shr \$32,%rdx |
821 | mov 16(%rsi),%ecx | ||
822 | mov 20(%rsi),%edx | ||
823 | mov %ecx,16(%rdi) | ||
824 | mov %edx,20(%rdi) | ||
825 | |||
826 | xor %ecx,%ecx | 1385 | xor %ecx,%ecx |
827 | jmp .L12shortcut | 1386 | jmp .L12shortcut |
828 | .align 4 | 1387 | .align 4 |
@@ -858,30 +1417,23 @@ $code.=<<___; | |||
858 | jmp .Lexit | 1417 | jmp .Lexit |
859 | 1418 | ||
860 | .L14rounds: | 1419 | .L14rounds: |
861 | mov 0(%rsi),%eax # copy first 8 dwords | 1420 | mov 0(%rsi),%rax # copy first 8 dwords |
862 | mov 4(%rsi),%ebx | 1421 | mov 8(%rsi),%rbx |
863 | mov 8(%rsi),%ecx | 1422 | mov 16(%rsi),%rcx |
864 | mov 12(%rsi),%edx | 1423 | mov 24(%rsi),%rdx |
865 | mov %eax,0(%rdi) | 1424 | mov %rax,0(%rdi) |
866 | mov %ebx,4(%rdi) | 1425 | mov %rbx,8(%rdi) |
867 | mov %ecx,8(%rdi) | 1426 | mov %rcx,16(%rdi) |
868 | mov %edx,12(%rdi) | 1427 | mov %rdx,24(%rdi) |
869 | mov 16(%rsi),%eax | 1428 | |
870 | mov 20(%rsi),%ebx | 1429 | shr \$32,%rdx |
871 | mov 24(%rsi),%ecx | ||
872 | mov 28(%rsi),%edx | ||
873 | mov %eax,16(%rdi) | ||
874 | mov %ebx,20(%rdi) | ||
875 | mov %ecx,24(%rdi) | ||
876 | mov %edx,28(%rdi) | ||
877 | |||
878 | xor %ecx,%ecx | 1430 | xor %ecx,%ecx |
879 | jmp .L14shortcut | 1431 | jmp .L14shortcut |
880 | .align 4 | 1432 | .align 4 |
881 | .L14loop: | 1433 | .L14loop: |
1434 | mov 0(%rdi),%eax # rk[0] | ||
882 | mov 28(%rdi),%edx # rk[4] | 1435 | mov 28(%rdi),%edx # rk[4] |
883 | .L14shortcut: | 1436 | .L14shortcut: |
884 | mov 0(%rdi),%eax # rk[0] | ||
885 | ___ | 1437 | ___ |
886 | &enckey (); | 1438 | &enckey (); |
887 | $code.=<<___; | 1439 | $code.=<<___; |
@@ -900,24 +1452,23 @@ $code.=<<___; | |||
900 | mov %eax,%edx | 1452 | mov %eax,%edx |
901 | mov 16(%rdi),%eax # rk[4] | 1453 | mov 16(%rdi),%eax # rk[4] |
902 | movz %dl,%esi # rk[11]>>0 | 1454 | movz %dl,%esi # rk[11]>>0 |
903 | mov 2(%rbp,%rsi,8),%ebx | 1455 | movzb -128(%rbp,%rsi),%ebx |
904 | movz %dh,%esi # rk[11]>>8 | 1456 | movz %dh,%esi # rk[11]>>8 |
905 | and \$0x000000FF,%ebx | ||
906 | xor %ebx,%eax | 1457 | xor %ebx,%eax |
907 | 1458 | ||
908 | mov 0(%rbp,%rsi,8),%ebx | 1459 | movzb -128(%rbp,%rsi),%ebx |
909 | shr \$16,%edx | 1460 | shr \$16,%edx |
910 | and \$0x0000FF00,%ebx | 1461 | shl \$8,%ebx |
911 | movz %dl,%esi # rk[11]>>16 | 1462 | movz %dl,%esi # rk[11]>>16 |
912 | xor %ebx,%eax | 1463 | xor %ebx,%eax |
913 | 1464 | ||
914 | mov 0(%rbp,%rsi,8),%ebx | 1465 | movzb -128(%rbp,%rsi),%ebx |
915 | movz %dh,%esi # rk[11]>>24 | 1466 | movz %dh,%esi # rk[11]>>24 |
916 | and \$0x00FF0000,%ebx | 1467 | shl \$16,%ebx |
917 | xor %ebx,%eax | 1468 | xor %ebx,%eax |
918 | 1469 | ||
919 | mov 2(%rbp,%rsi,8),%ebx | 1470 | movzb -128(%rbp,%rsi),%ebx |
920 | and \$0xFF000000,%ebx | 1471 | shl \$24,%ebx |
921 | xor %ebx,%eax | 1472 | xor %ebx,%eax |
922 | 1473 | ||
923 | mov %eax,48(%rdi) # rk[12] | 1474 | mov %eax,48(%rdi) # rk[12] |
@@ -938,31 +1489,61 @@ $code.=<<___; | |||
938 | .Lbadpointer: | 1489 | .Lbadpointer: |
939 | mov \$-1,%rax | 1490 | mov \$-1,%rax |
940 | .Lexit: | 1491 | .Lexit: |
941 | pop %rbp | 1492 | .byte 0xf3,0xc3 # rep ret |
942 | pop %rbx | 1493 | .size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key |
943 | ret | ||
944 | .size AES_set_encrypt_key,.-AES_set_encrypt_key | ||
945 | ___ | 1494 | ___ |
946 | 1495 | ||
947 | sub deckey() | 1496 | sub deckey_ref() |
948 | { my ($i,$ptr,$te,$td) = @_; | 1497 | { my ($i,$ptr,$te,$td) = @_; |
1498 | my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d"); | ||
949 | $code.=<<___; | 1499 | $code.=<<___; |
950 | mov $i($ptr),%eax | 1500 | mov $i($ptr),$tp1 |
951 | mov %eax,%edx | 1501 | mov $tp1,$acc |
952 | movz %ah,%ebx | 1502 | and \$0x80808080,$acc |
953 | shr \$16,%edx | 1503 | mov $acc,$tp4 |
954 | and \$0xFF,%eax | 1504 | shr \$7,$tp4 |
955 | movzb 2($te,%rax,8),%rax | 1505 | lea 0($tp1,$tp1),$tp2 |
956 | movzb 2($te,%rbx,8),%rbx | 1506 | sub $tp4,$acc |
957 | mov 0($td,%rax,8),%eax | 1507 | and \$0xfefefefe,$tp2 |
958 | xor 3($td,%rbx,8),%eax | 1508 | and \$0x1b1b1b1b,$acc |
959 | movzb %dh,%ebx | 1509 | xor $tp2,$acc |
960 | and \$0xFF,%edx | 1510 | mov $acc,$tp2 |
961 | movzb 2($te,%rdx,8),%rdx | 1511 | |
962 | movzb 2($te,%rbx,8),%rbx | 1512 | and \$0x80808080,$acc |
963 | xor 2($td,%rdx,8),%eax | 1513 | mov $acc,$tp8 |
964 | xor 1($td,%rbx,8),%eax | 1514 | shr \$7,$tp8 |
965 | mov %eax,$i($ptr) | 1515 | lea 0($tp2,$tp2),$tp4 |
1516 | sub $tp8,$acc | ||
1517 | and \$0xfefefefe,$tp4 | ||
1518 | and \$0x1b1b1b1b,$acc | ||
1519 | xor $tp1,$tp2 # tp2^tp1 | ||
1520 | xor $tp4,$acc | ||
1521 | mov $acc,$tp4 | ||
1522 | |||
1523 | and \$0x80808080,$acc | ||
1524 | mov $acc,$tp8 | ||
1525 | shr \$7,$tp8 | ||
1526 | sub $tp8,$acc | ||
1527 | lea 0($tp4,$tp4),$tp8 | ||
1528 | xor $tp1,$tp4 # tp4^tp1 | ||
1529 | and \$0xfefefefe,$tp8 | ||
1530 | and \$0x1b1b1b1b,$acc | ||
1531 | xor $acc,$tp8 | ||
1532 | |||
1533 | xor $tp8,$tp1 # tp1^tp8 | ||
1534 | rol \$8,$tp1 # ROTATE(tp1^tp8,8) | ||
1535 | xor $tp8,$tp2 # tp2^tp1^tp8 | ||
1536 | xor $tp8,$tp4 # tp4^tp1^tp8 | ||
1537 | xor $tp2,$tp8 | ||
1538 | xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2 | ||
1539 | |||
1540 | xor $tp8,$tp1 | ||
1541 | rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24) | ||
1542 | xor $tp2,$tp1 | ||
1543 | rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16) | ||
1544 | xor $tp4,$tp1 | ||
1545 | |||
1546 | mov $tp1,$i($ptr) | ||
966 | ___ | 1547 | ___ |
967 | } | 1548 | } |
968 | 1549 | ||
@@ -973,19 +1554,23 @@ $code.=<<___; | |||
973 | .type AES_set_decrypt_key,\@function,3 | 1554 | .type AES_set_decrypt_key,\@function,3 |
974 | .align 16 | 1555 | .align 16 |
975 | AES_set_decrypt_key: | 1556 | AES_set_decrypt_key: |
976 | push %rdx | 1557 | push %rbx |
977 | call AES_set_encrypt_key | 1558 | push %rbp |
978 | cmp \$0,%eax | 1559 | push %r12 |
979 | je .Lproceed | 1560 | push %r13 |
980 | lea 24(%rsp),%rsp | 1561 | push %r14 |
981 | ret | 1562 | push %r15 |
982 | .Lproceed: | 1563 | push %rdx # save key schedule |
1564 | .Ldec_key_prologue: | ||
1565 | |||
1566 | call _x86_64_AES_set_encrypt_key | ||
983 | mov (%rsp),%r8 # restore key schedule | 1567 | mov (%rsp),%r8 # restore key schedule |
984 | mov %rbx,(%rsp) | 1568 | cmp \$0,%eax |
1569 | jne .Labort | ||
985 | 1570 | ||
986 | mov 240(%r8),%ecx # pull number of rounds | 1571 | mov 240(%r8),%r14d # pull number of rounds |
987 | xor %rdi,%rdi | 1572 | xor %rdi,%rdi |
988 | lea (%rdi,%rcx,4),%rcx | 1573 | lea (%rdi,%r14d,4),%rcx |
989 | mov %r8,%rsi | 1574 | mov %r8,%rsi |
990 | lea (%r8,%rcx,4),%rdi # pointer to last chunk | 1575 | lea (%r8,%rcx,4),%rdi # pointer to last chunk |
991 | .align 4 | 1576 | .align 4 |
@@ -1003,27 +1588,39 @@ AES_set_decrypt_key: | |||
1003 | cmp %rsi,%rdi | 1588 | cmp %rsi,%rdi |
1004 | jne .Linvert | 1589 | jne .Linvert |
1005 | 1590 | ||
1006 | .picmeup %r9 | 1591 | lea .LAES_Te+2048+1024(%rip),%rax # rcon |
1007 | lea AES_Td-.(%r9),%rdi | ||
1008 | lea AES_Te-AES_Td(%rdi),%r9 | ||
1009 | 1592 | ||
1010 | mov %r8,%rsi | 1593 | mov 40(%rax),$mask80 |
1011 | mov 240(%r8),%ecx # pull number of rounds | 1594 | mov 48(%rax),$maskfe |
1012 | sub \$1,%ecx | 1595 | mov 56(%rax),$mask1b |
1596 | |||
1597 | mov %r8,$key | ||
1598 | sub \$1,%r14d | ||
1013 | .align 4 | 1599 | .align 4 |
1014 | .Lpermute: | 1600 | .Lpermute: |
1015 | lea 16(%rsi),%rsi | 1601 | lea 16($key),$key |
1602 | mov 0($key),%rax | ||
1603 | mov 8($key),%rcx | ||
1016 | ___ | 1604 | ___ |
1017 | &deckey (0,"%rsi","%r9","%rdi"); | 1605 | &dectransform (); |
1018 | &deckey (4,"%rsi","%r9","%rdi"); | ||
1019 | &deckey (8,"%rsi","%r9","%rdi"); | ||
1020 | &deckey (12,"%rsi","%r9","%rdi"); | ||
1021 | $code.=<<___; | 1606 | $code.=<<___; |
1022 | sub \$1,%ecx | 1607 | mov %eax,0($key) |
1608 | mov %ebx,4($key) | ||
1609 | mov %ecx,8($key) | ||
1610 | mov %edx,12($key) | ||
1611 | sub \$1,%r14d | ||
1023 | jnz .Lpermute | 1612 | jnz .Lpermute |
1024 | 1613 | ||
1025 | xor %rax,%rax | 1614 | xor %rax,%rax |
1026 | pop %rbx | 1615 | .Labort: |
1616 | mov 8(%rsp),%r15 | ||
1617 | mov 16(%rsp),%r14 | ||
1618 | mov 24(%rsp),%r13 | ||
1619 | mov 32(%rsp),%r12 | ||
1620 | mov 40(%rsp),%rbp | ||
1621 | mov 48(%rsp),%rbx | ||
1622 | add \$56,%rsp | ||
1623 | .Ldec_key_epilogue: | ||
1027 | ret | 1624 | ret |
1028 | .size AES_set_decrypt_key,.-AES_set_decrypt_key | 1625 | .size AES_set_decrypt_key,.-AES_set_decrypt_key |
1029 | ___ | 1626 | ___ |
@@ -1034,47 +1631,59 @@ ___ | |||
1034 | { | 1631 | { |
1035 | # stack frame layout | 1632 | # stack frame layout |
1036 | # -8(%rsp) return address | 1633 | # -8(%rsp) return address |
1037 | my $_rsp="0(%rsp)"; # saved %rsp | 1634 | my $keyp="0(%rsp)"; # one to pass as $key |
1038 | my $_len="8(%rsp)"; # copy of 3rd parameter, length | 1635 | my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds]) |
1039 | my $_key="16(%rsp)"; # copy of 4th parameter, key | 1636 | my $_rsp="16(%rsp)"; # saved %rsp |
1040 | my $_ivp="24(%rsp)"; # copy of 5th parameter, ivp | 1637 | my $_inp="24(%rsp)"; # copy of 1st parameter, inp |
1041 | my $keyp="32(%rsp)"; # one to pass as $key | 1638 | my $_out="32(%rsp)"; # copy of 2nd parameter, out |
1042 | my $ivec="40(%rsp)"; # ivec[16] | 1639 | my $_len="40(%rsp)"; # copy of 3rd parameter, length |
1043 | my $aes_key="56(%rsp)"; # copy of aes_key | 1640 | my $_key="48(%rsp)"; # copy of 4th parameter, key |
1044 | my $mark="56+240(%rsp)"; # copy of aes_key->rounds | 1641 | my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp |
1642 | my $ivec="64(%rsp)"; # ivec[16] | ||
1643 | my $aes_key="80(%rsp)"; # copy of aes_key | ||
1644 | my $mark="80+240(%rsp)"; # copy of aes_key->rounds | ||
1045 | 1645 | ||
1046 | $code.=<<___; | 1646 | $code.=<<___; |
1047 | .globl AES_cbc_encrypt | 1647 | .globl AES_cbc_encrypt |
1048 | .type AES_cbc_encrypt,\@function,6 | 1648 | .type AES_cbc_encrypt,\@function,6 |
1049 | .align 16 | 1649 | .align 16 |
1650 | .extern OPENSSL_ia32cap_P | ||
1050 | AES_cbc_encrypt: | 1651 | AES_cbc_encrypt: |
1051 | cmp \$0,%rdx # check length | 1652 | cmp \$0,%rdx # check length |
1052 | je .Lcbc_just_ret | 1653 | je .Lcbc_epilogue |
1654 | pushfq | ||
1053 | push %rbx | 1655 | push %rbx |
1054 | push %rbp | 1656 | push %rbp |
1055 | push %r12 | 1657 | push %r12 |
1056 | push %r13 | 1658 | push %r13 |
1057 | push %r14 | 1659 | push %r14 |
1058 | push %r15 | 1660 | push %r15 |
1059 | pushfq | 1661 | .Lcbc_prologue: |
1662 | |||
1060 | cld | 1663 | cld |
1061 | mov %r9d,%r9d # clear upper half of enc | 1664 | mov %r9d,%r9d # clear upper half of enc |
1062 | 1665 | ||
1063 | .picmeup $sbox | 1666 | lea .LAES_Te(%rip),$sbox |
1064 | .Lcbc_pic_point: | ||
1065 | |||
1066 | cmp \$0,%r9 | 1667 | cmp \$0,%r9 |
1067 | je .LDECRYPT | 1668 | jne .Lcbc_picked_te |
1068 | 1669 | lea .LAES_Td(%rip),$sbox | |
1069 | lea AES_Te-.Lcbc_pic_point($sbox),$sbox | 1670 | .Lcbc_picked_te: |
1671 | |||
1672 | mov OPENSSL_ia32cap_P(%rip),%r10d | ||
1673 | cmp \$$speed_limit,%rdx | ||
1674 | jb .Lcbc_slow_prologue | ||
1675 | test \$15,%rdx | ||
1676 | jnz .Lcbc_slow_prologue | ||
1677 | bt \$28,%r10d | ||
1678 | jc .Lcbc_slow_prologue | ||
1070 | 1679 | ||
1071 | # allocate aligned stack frame... | 1680 | # allocate aligned stack frame... |
1072 | lea -64-248(%rsp),$key | 1681 | lea -88-248(%rsp),$key |
1073 | and \$-64,$key | 1682 | and \$-64,$key |
1074 | 1683 | ||
1075 | # ... and make it doesn't alias with AES_Te modulo 4096 | 1684 | # ... and make sure it doesn't alias with AES_T[ed] modulo 4096 |
1076 | mov $sbox,%r10 | 1685 | mov $sbox,%r10 |
1077 | lea 2048($sbox),%r11 | 1686 | lea 2304($sbox),%r11 |
1078 | mov $key,%r12 | 1687 | mov $key,%r12 |
1079 | and \$0xFFF,%r10 # s = $sbox&0xfff | 1688 | and \$0xFFF,%r10 # s = $sbox&0xfff |
1080 | and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff | 1689 | and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff |
@@ -1094,22 +1703,27 @@ AES_cbc_encrypt: | |||
1094 | .Lcbc_te_ok: | 1703 | .Lcbc_te_ok: |
1095 | 1704 | ||
1096 | xchg %rsp,$key | 1705 | xchg %rsp,$key |
1097 | add \$8,%rsp # reserve for return address! | 1706 | #add \$8,%rsp # reserve for return address! |
1098 | mov $key,$_rsp # save %rsp | 1707 | mov $key,$_rsp # save %rsp |
1708 | .Lcbc_fast_body: | ||
1709 | mov %rdi,$_inp # save copy of inp | ||
1710 | mov %rsi,$_out # save copy of out | ||
1099 | mov %rdx,$_len # save copy of len | 1711 | mov %rdx,$_len # save copy of len |
1100 | mov %rcx,$_key # save copy of key | 1712 | mov %rcx,$_key # save copy of key |
1101 | mov %r8,$_ivp # save copy of ivp | 1713 | mov %r8,$_ivp # save copy of ivp |
1102 | movl \$0,$mark # copy of aes_key->rounds = 0; | 1714 | movl \$0,$mark # copy of aes_key->rounds = 0; |
1103 | mov %r8,%rbp # rearrange input arguments | 1715 | mov %r8,%rbp # rearrange input arguments |
1716 | mov %r9,%rbx | ||
1104 | mov %rsi,$out | 1717 | mov %rsi,$out |
1105 | mov %rdi,$inp | 1718 | mov %rdi,$inp |
1106 | mov %rcx,$key | 1719 | mov %rcx,$key |
1107 | 1720 | ||
1721 | mov 240($key),%eax # key->rounds | ||
1108 | # do we copy key schedule to stack? | 1722 | # do we copy key schedule to stack? |
1109 | mov $key,%r10 | 1723 | mov $key,%r10 |
1110 | sub $sbox,%r10 | 1724 | sub $sbox,%r10 |
1111 | and \$0xfff,%r10 | 1725 | and \$0xfff,%r10 |
1112 | cmp \$2048,%r10 | 1726 | cmp \$2304,%r10 |
1113 | jb .Lcbc_do_ecopy | 1727 | jb .Lcbc_do_ecopy |
1114 | cmp \$4096-248,%r10 | 1728 | cmp \$4096-248,%r10 |
1115 | jb .Lcbc_skip_ecopy | 1729 | jb .Lcbc_skip_ecopy |
@@ -1120,12 +1734,11 @@ AES_cbc_encrypt: | |||
1120 | lea $aes_key,$key | 1734 | lea $aes_key,$key |
1121 | mov \$240/8,%ecx | 1735 | mov \$240/8,%ecx |
1122 | .long 0x90A548F3 # rep movsq | 1736 | .long 0x90A548F3 # rep movsq |
1123 | mov (%rsi),%eax # copy aes_key->rounds | 1737 | mov %eax,(%rdi) # copy aes_key->rounds |
1124 | mov %eax,(%rdi) | ||
1125 | .Lcbc_skip_ecopy: | 1738 | .Lcbc_skip_ecopy: |
1126 | mov $key,$keyp # save key pointer | 1739 | mov $key,$keyp # save key pointer |
1127 | 1740 | ||
1128 | mov \$16,%ecx | 1741 | mov \$18,%ecx |
1129 | .align 4 | 1742 | .align 4 |
1130 | .Lcbc_prefetch_te: | 1743 | .Lcbc_prefetch_te: |
1131 | mov 0($sbox),%r10 | 1744 | mov 0($sbox),%r10 |
@@ -1135,184 +1748,77 @@ AES_cbc_encrypt: | |||
1135 | lea 128($sbox),$sbox | 1748 | lea 128($sbox),$sbox |
1136 | sub \$1,%ecx | 1749 | sub \$1,%ecx |
1137 | jnz .Lcbc_prefetch_te | 1750 | jnz .Lcbc_prefetch_te |
1138 | sub \$2048,$sbox | 1751 | lea -2304($sbox),$sbox |
1139 | 1752 | ||
1140 | test \$-16,%rdx # check upon length | 1753 | cmp \$0,%rbx |
1141 | mov %rdx,%r10 | 1754 | je .LFAST_DECRYPT |
1755 | |||
1756 | #----------------------------- ENCRYPT -----------------------------# | ||
1142 | mov 0(%rbp),$s0 # load iv | 1757 | mov 0(%rbp),$s0 # load iv |
1143 | mov 4(%rbp),$s1 | 1758 | mov 4(%rbp),$s1 |
1144 | mov 8(%rbp),$s2 | 1759 | mov 8(%rbp),$s2 |
1145 | mov 12(%rbp),$s3 | 1760 | mov 12(%rbp),$s3 |
1146 | jz .Lcbc_enc_tail # short input... | ||
1147 | 1761 | ||
1148 | .align 4 | 1762 | .align 4 |
1149 | .Lcbc_enc_loop: | 1763 | .Lcbc_fast_enc_loop: |
1150 | xor 0($inp),$s0 | 1764 | xor 0($inp),$s0 |
1151 | xor 4($inp),$s1 | 1765 | xor 4($inp),$s1 |
1152 | xor 8($inp),$s2 | 1766 | xor 8($inp),$s2 |
1153 | xor 12($inp),$s3 | 1767 | xor 12($inp),$s3 |
1154 | mov $inp,$ivec # if ($verticalspin) save inp | ||
1155 | |||
1156 | mov $keyp,$key # restore key | 1768 | mov $keyp,$key # restore key |
1769 | mov $inp,$_inp # if ($verticalspin) save inp | ||
1770 | |||
1157 | call _x86_64_AES_encrypt | 1771 | call _x86_64_AES_encrypt |
1158 | 1772 | ||
1159 | mov $ivec,$inp # if ($verticalspin) restore inp | 1773 | mov $_inp,$inp # if ($verticalspin) restore inp |
1774 | mov $_len,%r10 | ||
1160 | mov $s0,0($out) | 1775 | mov $s0,0($out) |
1161 | mov $s1,4($out) | 1776 | mov $s1,4($out) |
1162 | mov $s2,8($out) | 1777 | mov $s2,8($out) |
1163 | mov $s3,12($out) | 1778 | mov $s3,12($out) |
1164 | 1779 | ||
1165 | mov $_len,%r10 | ||
1166 | lea 16($inp),$inp | 1780 | lea 16($inp),$inp |
1167 | lea 16($out),$out | 1781 | lea 16($out),$out |
1168 | sub \$16,%r10 | 1782 | sub \$16,%r10 |
1169 | test \$-16,%r10 | 1783 | test \$-16,%r10 |
1170 | mov %r10,$_len | 1784 | mov %r10,$_len |
1171 | jnz .Lcbc_enc_loop | 1785 | jnz .Lcbc_fast_enc_loop |
1172 | test \$15,%r10 | ||
1173 | jnz .Lcbc_enc_tail | ||
1174 | mov $_ivp,%rbp # restore ivp | 1786 | mov $_ivp,%rbp # restore ivp |
1175 | mov $s0,0(%rbp) # save ivec | 1787 | mov $s0,0(%rbp) # save ivec |
1176 | mov $s1,4(%rbp) | 1788 | mov $s1,4(%rbp) |
1177 | mov $s2,8(%rbp) | 1789 | mov $s2,8(%rbp) |
1178 | mov $s3,12(%rbp) | 1790 | mov $s3,12(%rbp) |
1179 | 1791 | ||
1180 | .align 4 | 1792 | jmp .Lcbc_fast_cleanup |
1181 | .Lcbc_cleanup: | 1793 | |
1182 | cmpl \$0,$mark # was the key schedule copied? | ||
1183 | lea $aes_key,%rdi | ||
1184 | mov $_rsp,%rsp | ||
1185 | je .Lcbc_exit | ||
1186 | mov \$240/8,%ecx | ||
1187 | xor %rax,%rax | ||
1188 | .long 0x90AB48F3 # rep stosq | ||
1189 | .Lcbc_exit: | ||
1190 | popfq | ||
1191 | pop %r15 | ||
1192 | pop %r14 | ||
1193 | pop %r13 | ||
1194 | pop %r12 | ||
1195 | pop %rbp | ||
1196 | pop %rbx | ||
1197 | .Lcbc_just_ret: | ||
1198 | ret | ||
1199 | .align 4 | ||
1200 | .Lcbc_enc_tail: | ||
1201 | mov %rax,%r11 | ||
1202 | mov %rcx,%r12 | ||
1203 | mov %r10,%rcx | ||
1204 | mov $inp,%rsi | ||
1205 | mov $out,%rdi | ||
1206 | .long 0xF689A4F3 # rep movsb | ||
1207 | mov \$16,%rcx # zero tail | ||
1208 | sub %r10,%rcx | ||
1209 | xor %rax,%rax | ||
1210 | .long 0xF689AAF3 # rep stosb | ||
1211 | mov $out,$inp # this is not a mistake! | ||
1212 | movq \$16,$_len # len=16 | ||
1213 | mov %r11,%rax | ||
1214 | mov %r12,%rcx | ||
1215 | jmp .Lcbc_enc_loop # one more spin... | ||
1216 | #----------------------------- DECRYPT -----------------------------# | 1794 | #----------------------------- DECRYPT -----------------------------# |
1217 | .align 16 | 1795 | .align 16 |
1218 | .LDECRYPT: | 1796 | .LFAST_DECRYPT: |
1219 | lea AES_Td-.Lcbc_pic_point($sbox),$sbox | ||
1220 | |||
1221 | # allocate aligned stack frame... | ||
1222 | lea -64-248(%rsp),$key | ||
1223 | and \$-64,$key | ||
1224 | |||
1225 | # ... and make it doesn't alias with AES_Td modulo 4096 | ||
1226 | mov $sbox,%r10 | ||
1227 | lea 2304($sbox),%r11 | ||
1228 | mov $key,%r12 | ||
1229 | and \$0xFFF,%r10 # s = $sbox&0xfff | ||
1230 | and \$0xFFF,%r11 # e = ($sbox+2048+256)&0xfff | ||
1231 | and \$0xFFF,%r12 # p = %rsp&0xfff | ||
1232 | |||
1233 | cmp %r11,%r12 # if (p=>e) %rsp =- (p-e); | ||
1234 | jb .Lcbc_td_break_out | ||
1235 | sub %r11,%r12 | ||
1236 | sub %r12,$key | ||
1237 | jmp .Lcbc_td_ok | ||
1238 | .Lcbc_td_break_out: # else %rsp -= (p-s)&0xfff + framesz | ||
1239 | sub %r10,%r12 | ||
1240 | and \$0xFFF,%r12 | ||
1241 | add \$320,%r12 | ||
1242 | sub %r12,$key | ||
1243 | .align 4 | ||
1244 | .Lcbc_td_ok: | ||
1245 | |||
1246 | xchg %rsp,$key | ||
1247 | add \$8,%rsp # reserve for return address! | ||
1248 | mov $key,$_rsp # save %rsp | ||
1249 | mov %rdx,$_len # save copy of len | ||
1250 | mov %rcx,$_key # save copy of key | ||
1251 | mov %r8,$_ivp # save copy of ivp | ||
1252 | movl \$0,$mark # copy of aes_key->rounds = 0; | ||
1253 | mov %r8,%rbp # rearrange input arguments | ||
1254 | mov %rsi,$out | ||
1255 | mov %rdi,$inp | ||
1256 | mov %rcx,$key | ||
1257 | |||
1258 | # do we copy key schedule to stack? | ||
1259 | mov $key,%r10 | ||
1260 | sub $sbox,%r10 | ||
1261 | and \$0xfff,%r10 | ||
1262 | cmp \$2304,%r10 | ||
1263 | jb .Lcbc_do_dcopy | ||
1264 | cmp \$4096-248,%r10 | ||
1265 | jb .Lcbc_skip_dcopy | ||
1266 | .align 4 | ||
1267 | .Lcbc_do_dcopy: | ||
1268 | mov $key,%rsi | ||
1269 | lea $aes_key,%rdi | ||
1270 | lea $aes_key,$key | ||
1271 | mov \$240/8,%ecx | ||
1272 | .long 0x90A548F3 # rep movsq | ||
1273 | mov (%rsi),%eax # copy aes_key->rounds | ||
1274 | mov %eax,(%rdi) | ||
1275 | .Lcbc_skip_dcopy: | ||
1276 | mov $key,$keyp # save key pointer | ||
1277 | |||
1278 | mov \$18,%ecx | ||
1279 | .align 4 | ||
1280 | .Lcbc_prefetch_td: | ||
1281 | mov 0($sbox),%r10 | ||
1282 | mov 32($sbox),%r11 | ||
1283 | mov 64($sbox),%r12 | ||
1284 | mov 96($sbox),%r13 | ||
1285 | lea 128($sbox),$sbox | ||
1286 | sub \$1,%ecx | ||
1287 | jnz .Lcbc_prefetch_td | ||
1288 | sub \$2304,$sbox | ||
1289 | |||
1290 | cmp $inp,$out | 1797 | cmp $inp,$out |
1291 | je .Lcbc_dec_in_place | 1798 | je .Lcbc_fast_dec_in_place |
1292 | 1799 | ||
1293 | mov %rbp,$ivec | 1800 | mov %rbp,$ivec |
1294 | .align 4 | 1801 | .align 4 |
1295 | .Lcbc_dec_loop: | 1802 | .Lcbc_fast_dec_loop: |
1296 | mov 0($inp),$s0 # read input | 1803 | mov 0($inp),$s0 # read input |
1297 | mov 4($inp),$s1 | 1804 | mov 4($inp),$s1 |
1298 | mov 8($inp),$s2 | 1805 | mov 8($inp),$s2 |
1299 | mov 12($inp),$s3 | 1806 | mov 12($inp),$s3 |
1300 | mov $inp,8+$ivec # if ($verticalspin) save inp | ||
1301 | |||
1302 | mov $keyp,$key # restore key | 1807 | mov $keyp,$key # restore key |
1808 | mov $inp,$_inp # if ($verticalspin) save inp | ||
1809 | |||
1303 | call _x86_64_AES_decrypt | 1810 | call _x86_64_AES_decrypt |
1304 | 1811 | ||
1305 | mov $ivec,%rbp # load ivp | 1812 | mov $ivec,%rbp # load ivp |
1306 | mov 8+$ivec,$inp # if ($verticalspin) restore inp | 1813 | mov $_inp,$inp # if ($verticalspin) restore inp |
1814 | mov $_len,%r10 # load len | ||
1307 | xor 0(%rbp),$s0 # xor iv | 1815 | xor 0(%rbp),$s0 # xor iv |
1308 | xor 4(%rbp),$s1 | 1816 | xor 4(%rbp),$s1 |
1309 | xor 8(%rbp),$s2 | 1817 | xor 8(%rbp),$s2 |
1310 | xor 12(%rbp),$s3 | 1818 | xor 12(%rbp),$s3 |
1311 | mov $inp,%rbp # current input, next iv | 1819 | mov $inp,%rbp # current input, next iv |
1312 | 1820 | ||
1313 | mov $_len,%r10 # load len | ||
1314 | sub \$16,%r10 | 1821 | sub \$16,%r10 |
1315 | jc .Lcbc_dec_partial | ||
1316 | mov %r10,$_len # update len | 1822 | mov %r10,$_len # update len |
1317 | mov %rbp,$ivec # update ivp | 1823 | mov %rbp,$ivec # update ivp |
1318 | 1824 | ||
@@ -1323,81 +1829,281 @@ AES_cbc_encrypt: | |||
1323 | 1829 | ||
1324 | lea 16($inp),$inp | 1830 | lea 16($inp),$inp |
1325 | lea 16($out),$out | 1831 | lea 16($out),$out |
1326 | jnz .Lcbc_dec_loop | 1832 | jnz .Lcbc_fast_dec_loop |
1327 | .Lcbc_dec_end: | ||
1328 | mov $_ivp,%r12 # load user ivp | 1833 | mov $_ivp,%r12 # load user ivp |
1329 | mov 0(%rbp),%r10 # load iv | 1834 | mov 0(%rbp),%r10 # load iv |
1330 | mov 8(%rbp),%r11 | 1835 | mov 8(%rbp),%r11 |
1331 | mov %r10,0(%r12) # copy back to user | 1836 | mov %r10,0(%r12) # copy back to user |
1332 | mov %r11,8(%r12) | 1837 | mov %r11,8(%r12) |
1333 | jmp .Lcbc_cleanup | 1838 | jmp .Lcbc_fast_cleanup |
1334 | |||
1335 | .align 4 | ||
1336 | .Lcbc_dec_partial: | ||
1337 | mov $s0,0+$ivec # dump output to stack | ||
1338 | mov $s1,4+$ivec | ||
1339 | mov $s2,8+$ivec | ||
1340 | mov $s3,12+$ivec | ||
1341 | mov $out,%rdi | ||
1342 | lea $ivec,%rsi | ||
1343 | mov \$16,%rcx | ||
1344 | add %r10,%rcx # number of bytes to copy | ||
1345 | .long 0xF689A4F3 # rep movsb | ||
1346 | jmp .Lcbc_dec_end | ||
1347 | 1839 | ||
1348 | .align 16 | 1840 | .align 16 |
1349 | .Lcbc_dec_in_place: | 1841 | .Lcbc_fast_dec_in_place: |
1842 | mov 0(%rbp),%r10 # copy iv to stack | ||
1843 | mov 8(%rbp),%r11 | ||
1844 | mov %r10,0+$ivec | ||
1845 | mov %r11,8+$ivec | ||
1846 | .align 4 | ||
1847 | .Lcbc_fast_dec_in_place_loop: | ||
1350 | mov 0($inp),$s0 # load input | 1848 | mov 0($inp),$s0 # load input |
1351 | mov 4($inp),$s1 | 1849 | mov 4($inp),$s1 |
1352 | mov 8($inp),$s2 | 1850 | mov 8($inp),$s2 |
1353 | mov 12($inp),$s3 | 1851 | mov 12($inp),$s3 |
1852 | mov $keyp,$key # restore key | ||
1853 | mov $inp,$_inp # if ($verticalspin) save inp | ||
1354 | 1854 | ||
1355 | mov $inp,$ivec # if ($verticalspin) save inp | ||
1356 | mov $keyp,$key | ||
1357 | call _x86_64_AES_decrypt | 1855 | call _x86_64_AES_decrypt |
1358 | 1856 | ||
1359 | mov $ivec,$inp # if ($verticalspin) restore inp | 1857 | mov $_inp,$inp # if ($verticalspin) restore inp |
1360 | mov $_ivp,%rbp | 1858 | mov $_len,%r10 |
1361 | xor 0(%rbp),$s0 | 1859 | xor 0+$ivec,$s0 |
1362 | xor 4(%rbp),$s1 | 1860 | xor 4+$ivec,$s1 |
1363 | xor 8(%rbp),$s2 | 1861 | xor 8+$ivec,$s2 |
1364 | xor 12(%rbp),$s3 | 1862 | xor 12+$ivec,$s3 |
1863 | |||
1864 | mov 0($inp),%r11 # load input | ||
1865 | mov 8($inp),%r12 | ||
1866 | sub \$16,%r10 | ||
1867 | jz .Lcbc_fast_dec_in_place_done | ||
1365 | 1868 | ||
1366 | mov 0($inp),%r10 # copy input to iv | 1869 | mov %r11,0+$ivec # copy input to iv |
1367 | mov 8($inp),%r11 | 1870 | mov %r12,8+$ivec |
1368 | mov %r10,0(%rbp) | ||
1369 | mov %r11,8(%rbp) | ||
1370 | 1871 | ||
1371 | mov $s0,0($out) # save output [zaps input] | 1872 | mov $s0,0($out) # save output [zaps input] |
1372 | mov $s1,4($out) | 1873 | mov $s1,4($out) |
1373 | mov $s2,8($out) | 1874 | mov $s2,8($out) |
1374 | mov $s3,12($out) | 1875 | mov $s3,12($out) |
1375 | 1876 | ||
1376 | mov $_len,%rcx | ||
1377 | lea 16($inp),$inp | 1877 | lea 16($inp),$inp |
1378 | lea 16($out),$out | 1878 | lea 16($out),$out |
1379 | sub \$16,%rcx | 1879 | mov %r10,$_len |
1380 | jc .Lcbc_dec_in_place_partial | 1880 | jmp .Lcbc_fast_dec_in_place_loop |
1381 | mov %rcx,$_len | 1881 | .Lcbc_fast_dec_in_place_done: |
1382 | jnz .Lcbc_dec_in_place | 1882 | mov $_ivp,%rdi |
1383 | jmp .Lcbc_cleanup | 1883 | mov %r11,0(%rdi) # copy iv back to user |
1884 | mov %r12,8(%rdi) | ||
1885 | |||
1886 | mov $s0,0($out) # save output [zaps input] | ||
1887 | mov $s1,4($out) | ||
1888 | mov $s2,8($out) | ||
1889 | mov $s3,12($out) | ||
1384 | 1890 | ||
1385 | .align 4 | 1891 | .align 4 |
1386 | .Lcbc_dec_in_place_partial: | 1892 | .Lcbc_fast_cleanup: |
1387 | # one can argue if this is actually required | 1893 | cmpl \$0,$mark # was the key schedule copied? |
1388 | lea ($out,%rcx),%rdi | 1894 | lea $aes_key,%rdi |
1389 | lea (%rbp,%rcx),%rsi | 1895 | je .Lcbc_exit |
1390 | neg %rcx | 1896 | mov \$240/8,%ecx |
1391 | .long 0xF689A4F3 # rep movsb # restore tail | 1897 | xor %rax,%rax |
1392 | jmp .Lcbc_cleanup | 1898 | .long 0x90AB48F3 # rep stosq |
1899 | |||
1900 | jmp .Lcbc_exit | ||
1901 | |||
1902 | #--------------------------- SLOW ROUTINE ---------------------------# | ||
1903 | .align 16 | ||
1904 | .Lcbc_slow_prologue: | ||
1905 | # allocate aligned stack frame... | ||
1906 | lea -88(%rsp),%rbp | ||
1907 | and \$-64,%rbp | ||
1908 | # ... just "above" key schedule | ||
1909 | lea -88-63(%rcx),%r10 | ||
1910 | sub %rbp,%r10 | ||
1911 | neg %r10 | ||
1912 | and \$0x3c0,%r10 | ||
1913 | sub %r10,%rbp | ||
1914 | |||
1915 | xchg %rsp,%rbp | ||
1916 | #add \$8,%rsp # reserve for return address! | ||
1917 | mov %rbp,$_rsp # save %rsp | ||
1918 | .Lcbc_slow_body: | ||
1919 | #mov %rdi,$_inp # save copy of inp | ||
1920 | #mov %rsi,$_out # save copy of out | ||
1921 | #mov %rdx,$_len # save copy of len | ||
1922 | #mov %rcx,$_key # save copy of key | ||
1923 | mov %r8,$_ivp # save copy of ivp | ||
1924 | mov %r8,%rbp # rearrange input arguments | ||
1925 | mov %r9,%rbx | ||
1926 | mov %rsi,$out | ||
1927 | mov %rdi,$inp | ||
1928 | mov %rcx,$key | ||
1929 | mov %rdx,%r10 | ||
1930 | |||
1931 | mov 240($key),%eax | ||
1932 | mov $key,$keyp # save key pointer | ||
1933 | shl \$4,%eax | ||
1934 | lea ($key,%rax),%rax | ||
1935 | mov %rax,$keyend | ||
1936 | |||
1937 | # pick Te4 copy which can't "overlap" with stack frame or key schedule | ||
1938 | lea 2048($sbox),$sbox | ||
1939 | lea 768-8(%rsp),%rax | ||
1940 | sub $sbox,%rax | ||
1941 | and \$0x300,%rax | ||
1942 | lea ($sbox,%rax),$sbox | ||
1943 | |||
1944 | cmp \$0,%rbx | ||
1945 | je .LSLOW_DECRYPT | ||
1946 | |||
1947 | #--------------------------- SLOW ENCRYPT ---------------------------# | ||
1948 | test \$-16,%r10 # check upon length | ||
1949 | mov 0(%rbp),$s0 # load iv | ||
1950 | mov 4(%rbp),$s1 | ||
1951 | mov 8(%rbp),$s2 | ||
1952 | mov 12(%rbp),$s3 | ||
1953 | jz .Lcbc_slow_enc_tail # short input... | ||
1954 | |||
1955 | .align 4 | ||
1956 | .Lcbc_slow_enc_loop: | ||
1957 | xor 0($inp),$s0 | ||
1958 | xor 4($inp),$s1 | ||
1959 | xor 8($inp),$s2 | ||
1960 | xor 12($inp),$s3 | ||
1961 | mov $keyp,$key # restore key | ||
1962 | mov $inp,$_inp # save inp | ||
1963 | mov $out,$_out # save out | ||
1964 | mov %r10,$_len # save len | ||
1965 | |||
1966 | call _x86_64_AES_encrypt_compact | ||
1967 | |||
1968 | mov $_inp,$inp # restore inp | ||
1969 | mov $_out,$out # restore out | ||
1970 | mov $_len,%r10 # restore len | ||
1971 | mov $s0,0($out) | ||
1972 | mov $s1,4($out) | ||
1973 | mov $s2,8($out) | ||
1974 | mov $s3,12($out) | ||
1975 | |||
1976 | lea 16($inp),$inp | ||
1977 | lea 16($out),$out | ||
1978 | sub \$16,%r10 | ||
1979 | test \$-16,%r10 | ||
1980 | jnz .Lcbc_slow_enc_loop | ||
1981 | test \$15,%r10 | ||
1982 | jnz .Lcbc_slow_enc_tail | ||
1983 | mov $_ivp,%rbp # restore ivp | ||
1984 | mov $s0,0(%rbp) # save ivec | ||
1985 | mov $s1,4(%rbp) | ||
1986 | mov $s2,8(%rbp) | ||
1987 | mov $s3,12(%rbp) | ||
1988 | |||
1989 | jmp .Lcbc_exit | ||
1990 | |||
1991 | .align 4 | ||
1992 | .Lcbc_slow_enc_tail: | ||
1993 | mov %rax,%r11 | ||
1994 | mov %rcx,%r12 | ||
1995 | mov %r10,%rcx | ||
1996 | mov $inp,%rsi | ||
1997 | mov $out,%rdi | ||
1998 | .long 0x9066A4F3 # rep movsb | ||
1999 | mov \$16,%rcx # zero tail | ||
2000 | sub %r10,%rcx | ||
2001 | xor %rax,%rax | ||
2002 | .long 0x9066AAF3 # rep stosb | ||
2003 | mov $out,$inp # this is not a mistake! | ||
2004 | mov \$16,%r10 # len=16 | ||
2005 | mov %r11,%rax | ||
2006 | mov %r12,%rcx | ||
2007 | jmp .Lcbc_slow_enc_loop # one more spin... | ||
2008 | #--------------------------- SLOW DECRYPT ---------------------------# | ||
2009 | .align 16 | ||
2010 | .LSLOW_DECRYPT: | ||
2011 | shr \$3,%rax | ||
2012 | add %rax,$sbox # recall "magic" constants! | ||
2013 | |||
2014 | mov 0(%rbp),%r11 # copy iv to stack | ||
2015 | mov 8(%rbp),%r12 | ||
2016 | mov %r11,0+$ivec | ||
2017 | mov %r12,8+$ivec | ||
2018 | |||
2019 | .align 4 | ||
2020 | .Lcbc_slow_dec_loop: | ||
2021 | mov 0($inp),$s0 # load input | ||
2022 | mov 4($inp),$s1 | ||
2023 | mov 8($inp),$s2 | ||
2024 | mov 12($inp),$s3 | ||
2025 | mov $keyp,$key # restore key | ||
2026 | mov $inp,$_inp # save inp | ||
2027 | mov $out,$_out # save out | ||
2028 | mov %r10,$_len # save len | ||
2029 | |||
2030 | call _x86_64_AES_decrypt_compact | ||
2031 | |||
2032 | mov $_inp,$inp # restore inp | ||
2033 | mov $_out,$out # restore out | ||
2034 | mov $_len,%r10 | ||
2035 | xor 0+$ivec,$s0 | ||
2036 | xor 4+$ivec,$s1 | ||
2037 | xor 8+$ivec,$s2 | ||
2038 | xor 12+$ivec,$s3 | ||
2039 | |||
2040 | mov 0($inp),%r11 # load input | ||
2041 | mov 8($inp),%r12 | ||
2042 | sub \$16,%r10 | ||
2043 | jc .Lcbc_slow_dec_partial | ||
2044 | jz .Lcbc_slow_dec_done | ||
2045 | |||
2046 | mov %r11,0+$ivec # copy input to iv | ||
2047 | mov %r12,8+$ivec | ||
2048 | |||
2049 | mov $s0,0($out) # save output [can zap input] | ||
2050 | mov $s1,4($out) | ||
2051 | mov $s2,8($out) | ||
2052 | mov $s3,12($out) | ||
2053 | |||
2054 | lea 16($inp),$inp | ||
2055 | lea 16($out),$out | ||
2056 | jmp .Lcbc_slow_dec_loop | ||
2057 | .Lcbc_slow_dec_done: | ||
2058 | mov $_ivp,%rdi | ||
2059 | mov %r11,0(%rdi) # copy iv back to user | ||
2060 | mov %r12,8(%rdi) | ||
2061 | |||
2062 | mov $s0,0($out) # save output [can zap input] | ||
2063 | mov $s1,4($out) | ||
2064 | mov $s2,8($out) | ||
2065 | mov $s3,12($out) | ||
2066 | |||
2067 | jmp .Lcbc_exit | ||
2068 | |||
2069 | .align 4 | ||
2070 | .Lcbc_slow_dec_partial: | ||
2071 | mov $_ivp,%rdi | ||
2072 | mov %r11,0(%rdi) # copy iv back to user | ||
2073 | mov %r12,8(%rdi) | ||
2074 | |||
2075 | mov $s0,0+$ivec # save output to stack | ||
2076 | mov $s1,4+$ivec | ||
2077 | mov $s2,8+$ivec | ||
2078 | mov $s3,12+$ivec | ||
2079 | |||
2080 | mov $out,%rdi | ||
2081 | lea $ivec,%rsi | ||
2082 | lea 16(%r10),%rcx | ||
2083 | .long 0x9066A4F3 # rep movsb | ||
2084 | jmp .Lcbc_exit | ||
2085 | |||
2086 | .align 16 | ||
2087 | .Lcbc_exit: | ||
2088 | mov $_rsp,%rsi | ||
2089 | mov (%rsi),%r15 | ||
2090 | mov 8(%rsi),%r14 | ||
2091 | mov 16(%rsi),%r13 | ||
2092 | mov 24(%rsi),%r12 | ||
2093 | mov 32(%rsi),%rbp | ||
2094 | mov 40(%rsi),%rbx | ||
2095 | lea 48(%rsi),%rsp | ||
2096 | .Lcbc_popfq: | ||
2097 | popfq | ||
2098 | .Lcbc_epilogue: | ||
2099 | ret | ||
1393 | .size AES_cbc_encrypt,.-AES_cbc_encrypt | 2100 | .size AES_cbc_encrypt,.-AES_cbc_encrypt |
1394 | ___ | 2101 | ___ |
1395 | } | 2102 | } |
1396 | 2103 | ||
1397 | $code.=<<___; | 2104 | $code.=<<___; |
1398 | .globl AES_Te | ||
1399 | .align 64 | 2105 | .align 64 |
1400 | AES_Te: | 2106 | .LAES_Te: |
1401 | ___ | 2107 | ___ |
1402 | &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); | 2108 | &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); |
1403 | &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); | 2109 | &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); |
@@ -1463,16 +2169,149 @@ ___ | |||
1463 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); | 2169 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); |
1464 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); | 2170 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); |
1465 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); | 2171 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); |
2172 | |||
2173 | #Te4 # four copies of Te4 to choose from to avoid L1 aliasing | ||
2174 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
2175 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
2176 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
2177 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
2178 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
2179 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
2180 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
2181 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
2182 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
2183 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
2184 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
2185 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
2186 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
2187 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
2188 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
2189 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
2190 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
2191 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
2192 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
2193 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
2194 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
2195 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
2196 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
2197 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
2198 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
2199 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
2200 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
2201 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
2202 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
2203 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
2204 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
2205 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
2206 | |||
2207 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
2208 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
2209 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
2210 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
2211 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
2212 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
2213 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
2214 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
2215 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
2216 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
2217 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
2218 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
2219 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
2220 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
2221 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
2222 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
2223 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
2224 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
2225 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
2226 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
2227 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
2228 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
2229 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
2230 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
2231 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
2232 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
2233 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
2234 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
2235 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
2236 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
2237 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
2238 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
2239 | |||
2240 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
2241 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
2242 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
2243 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
2244 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
2245 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
2246 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
2247 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
2248 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
2249 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
2250 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
2251 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
2252 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
2253 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
2254 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
2255 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
2256 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
2257 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
2258 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
2259 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
2260 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
2261 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
2262 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
2263 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
2264 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
2265 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
2266 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
2267 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
2268 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
2269 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
2270 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
2271 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
2272 | |||
2273 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
2274 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
2275 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
2276 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
2277 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
2278 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
2279 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
2280 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
2281 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
2282 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
2283 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
2284 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
2285 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
2286 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
2287 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
2288 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
2289 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
2290 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
2291 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
2292 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
2293 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
2294 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
2295 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
2296 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
2297 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
2298 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
2299 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
2300 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
2301 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
2302 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
2303 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
2304 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
1466 | #rcon: | 2305 | #rcon: |
1467 | $code.=<<___; | 2306 | $code.=<<___; |
1468 | .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 | 2307 | .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 |
1469 | .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 | 2308 | .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 |
1470 | .long 0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0 | 2309 | .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080 |
2310 | .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b | ||
1471 | ___ | 2311 | ___ |
1472 | $code.=<<___; | 2312 | $code.=<<___; |
1473 | .globl AES_Td | ||
1474 | .align 64 | 2313 | .align 64 |
1475 | AES_Td: | 2314 | .LAES_Td: |
1476 | ___ | 2315 | ___ |
1477 | &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); | 2316 | &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); |
1478 | &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); | 2317 | &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); |
@@ -1538,7 +2377,116 @@ ___ | |||
1538 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); | 2377 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); |
1539 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); | 2378 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); |
1540 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); | 2379 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); |
1541 | #Td4: | 2380 | |
2381 | #Td4: # four copies of Td4 to choose from to avoid L1 aliasing | ||
2382 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
2383 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
2384 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
2385 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
2386 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
2387 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
2388 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
2389 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
2390 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
2391 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
2392 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
2393 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
2394 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
2395 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
2396 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
2397 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
2398 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
2399 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
2400 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
2401 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
2402 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
2403 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
2404 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
2405 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
2406 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
2407 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
2408 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
2409 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
2410 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
2411 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
2412 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
2413 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
2414 | $code.=<<___; | ||
2415 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
2416 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
2417 | ___ | ||
2418 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
2419 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
2420 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
2421 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
2422 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
2423 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
2424 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
2425 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
2426 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
2427 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
2428 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
2429 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
2430 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
2431 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
2432 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
2433 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
2434 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
2435 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
2436 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
2437 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
2438 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
2439 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
2440 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
2441 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
2442 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
2443 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
2444 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
2445 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
2446 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
2447 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
2448 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
2449 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
2450 | $code.=<<___; | ||
2451 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
2452 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
2453 | ___ | ||
2454 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
2455 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
2456 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
2457 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
2458 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
2459 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
2460 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
2461 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
2462 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
2463 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
2464 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
2465 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
2466 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
2467 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
2468 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
2469 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
2470 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
2471 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
2472 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
2473 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
2474 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
2475 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
2476 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
2477 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
2478 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
2479 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
2480 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
2481 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
2482 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
2483 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
2484 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
2485 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
2486 | $code.=<<___; | ||
2487 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
2488 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
2489 | ___ | ||
1542 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | 2490 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
1543 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | 2491 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
1544 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | 2492 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
@@ -1571,6 +2519,288 @@ ___ | |||
1571 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | 2519 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); |
1572 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | 2520 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); |
1573 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | 2521 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); |
2522 | $code.=<<___; | ||
2523 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
2524 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
2525 | .asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
2526 | .align 64 | ||
2527 | ___ | ||
2528 | |||
2529 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
2530 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
2531 | if ($win64) { | ||
2532 | $rec="%rcx"; | ||
2533 | $frame="%rdx"; | ||
2534 | $context="%r8"; | ||
2535 | $disp="%r9"; | ||
2536 | |||
2537 | $code.=<<___; | ||
2538 | .extern __imp_RtlVirtualUnwind | ||
2539 | .type block_se_handler,\@abi-omnipotent | ||
2540 | .align 16 | ||
2541 | block_se_handler: | ||
2542 | push %rsi | ||
2543 | push %rdi | ||
2544 | push %rbx | ||
2545 | push %rbp | ||
2546 | push %r12 | ||
2547 | push %r13 | ||
2548 | push %r14 | ||
2549 | push %r15 | ||
2550 | pushfq | ||
2551 | sub \$64,%rsp | ||
2552 | |||
2553 | mov 120($context),%rax # pull context->Rax | ||
2554 | mov 248($context),%rbx # pull context->Rip | ||
2555 | |||
2556 | mov 8($disp),%rsi # disp->ImageBase | ||
2557 | mov 56($disp),%r11 # disp->HandlerData | ||
2558 | |||
2559 | mov 0(%r11),%r10d # HandlerData[0] | ||
2560 | lea (%rsi,%r10),%r10 # prologue label | ||
2561 | cmp %r10,%rbx # context->Rip<prologue label | ||
2562 | jb .Lin_block_prologue | ||
2563 | |||
2564 | mov 152($context),%rax # pull context->Rsp | ||
2565 | |||
2566 | mov 4(%r11),%r10d # HandlerData[1] | ||
2567 | lea (%rsi,%r10),%r10 # epilogue label | ||
2568 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2569 | jae .Lin_block_prologue | ||
2570 | |||
2571 | mov 24(%rax),%rax # pull saved real stack pointer | ||
2572 | lea 48(%rax),%rax # adjust... | ||
2573 | |||
2574 | mov -8(%rax),%rbx | ||
2575 | mov -16(%rax),%rbp | ||
2576 | mov -24(%rax),%r12 | ||
2577 | mov -32(%rax),%r13 | ||
2578 | mov -40(%rax),%r14 | ||
2579 | mov -48(%rax),%r15 | ||
2580 | mov %rbx,144($context) # restore context->Rbx | ||
2581 | mov %rbp,160($context) # restore context->Rbp | ||
2582 | mov %r12,216($context) # restore context->R12 | ||
2583 | mov %r13,224($context) # restore context->R13 | ||
2584 | mov %r14,232($context) # restore context->R14 | ||
2585 | mov %r15,240($context) # restore context->R15 | ||
2586 | |||
2587 | .Lin_block_prologue: | ||
2588 | mov 8(%rax),%rdi | ||
2589 | mov 16(%rax),%rsi | ||
2590 | mov %rax,152($context) # restore context->Rsp | ||
2591 | mov %rsi,168($context) # restore context->Rsi | ||
2592 | mov %rdi,176($context) # restore context->Rdi | ||
2593 | |||
2594 | jmp .Lcommon_seh_exit | ||
2595 | .size block_se_handler,.-block_se_handler | ||
2596 | |||
2597 | .type key_se_handler,\@abi-omnipotent | ||
2598 | .align 16 | ||
2599 | key_se_handler: | ||
2600 | push %rsi | ||
2601 | push %rdi | ||
2602 | push %rbx | ||
2603 | push %rbp | ||
2604 | push %r12 | ||
2605 | push %r13 | ||
2606 | push %r14 | ||
2607 | push %r15 | ||
2608 | pushfq | ||
2609 | sub \$64,%rsp | ||
2610 | |||
2611 | mov 120($context),%rax # pull context->Rax | ||
2612 | mov 248($context),%rbx # pull context->Rip | ||
2613 | |||
2614 | mov 8($disp),%rsi # disp->ImageBase | ||
2615 | mov 56($disp),%r11 # disp->HandlerData | ||
2616 | |||
2617 | mov 0(%r11),%r10d # HandlerData[0] | ||
2618 | lea (%rsi,%r10),%r10 # prologue label | ||
2619 | cmp %r10,%rbx # context->Rip<prologue label | ||
2620 | jb .Lin_key_prologue | ||
2621 | |||
2622 | mov 152($context),%rax # pull context->Rsp | ||
2623 | |||
2624 | mov 4(%r11),%r10d # HandlerData[1] | ||
2625 | lea (%rsi,%r10),%r10 # epilogue label | ||
2626 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2627 | jae .Lin_key_prologue | ||
2628 | |||
2629 | lea 56(%rax),%rax | ||
2630 | |||
2631 | mov -8(%rax),%rbx | ||
2632 | mov -16(%rax),%rbp | ||
2633 | mov -24(%rax),%r12 | ||
2634 | mov -32(%rax),%r13 | ||
2635 | mov -40(%rax),%r14 | ||
2636 | mov -48(%rax),%r15 | ||
2637 | mov %rbx,144($context) # restore context->Rbx | ||
2638 | mov %rbp,160($context) # restore context->Rbp | ||
2639 | mov %r12,216($context) # restore context->R12 | ||
2640 | mov %r13,224($context) # restore context->R13 | ||
2641 | mov %r14,232($context) # restore context->R14 | ||
2642 | mov %r15,240($context) # restore context->R15 | ||
2643 | |||
2644 | .Lin_key_prologue: | ||
2645 | mov 8(%rax),%rdi | ||
2646 | mov 16(%rax),%rsi | ||
2647 | mov %rax,152($context) # restore context->Rsp | ||
2648 | mov %rsi,168($context) # restore context->Rsi | ||
2649 | mov %rdi,176($context) # restore context->Rdi | ||
2650 | |||
2651 | jmp .Lcommon_seh_exit | ||
2652 | .size key_se_handler,.-key_se_handler | ||
2653 | |||
2654 | .type cbc_se_handler,\@abi-omnipotent | ||
2655 | .align 16 | ||
2656 | cbc_se_handler: | ||
2657 | push %rsi | ||
2658 | push %rdi | ||
2659 | push %rbx | ||
2660 | push %rbp | ||
2661 | push %r12 | ||
2662 | push %r13 | ||
2663 | push %r14 | ||
2664 | push %r15 | ||
2665 | pushfq | ||
2666 | sub \$64,%rsp | ||
2667 | |||
2668 | mov 120($context),%rax # pull context->Rax | ||
2669 | mov 248($context),%rbx # pull context->Rip | ||
2670 | |||
2671 | lea .Lcbc_prologue(%rip),%r10 | ||
2672 | cmp %r10,%rbx # context->Rip<.Lcbc_prologue | ||
2673 | jb .Lin_cbc_prologue | ||
2674 | |||
2675 | lea .Lcbc_fast_body(%rip),%r10 | ||
2676 | cmp %r10,%rbx # context->Rip<.Lcbc_fast_body | ||
2677 | jb .Lin_cbc_frame_setup | ||
2678 | |||
2679 | lea .Lcbc_slow_prologue(%rip),%r10 | ||
2680 | cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue | ||
2681 | jb .Lin_cbc_body | ||
2682 | |||
2683 | lea .Lcbc_slow_body(%rip),%r10 | ||
2684 | cmp %r10,%rbx # context->Rip<.Lcbc_slow_body | ||
2685 | jb .Lin_cbc_frame_setup | ||
2686 | |||
2687 | .Lin_cbc_body: | ||
2688 | mov 152($context),%rax # pull context->Rsp | ||
2689 | |||
2690 | lea .Lcbc_epilogue(%rip),%r10 | ||
2691 | cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue | ||
2692 | jae .Lin_cbc_prologue | ||
2693 | |||
2694 | lea 8(%rax),%rax | ||
2695 | |||
2696 | lea .Lcbc_popfq(%rip),%r10 | ||
2697 | cmp %r10,%rbx # context->Rip>=.Lcbc_popfq | ||
2698 | jae .Lin_cbc_prologue | ||
2699 | |||
2700 | mov `16-8`(%rax),%rax # biased $_rsp | ||
2701 | lea 56(%rax),%rax | ||
2702 | |||
2703 | .Lin_cbc_frame_setup: | ||
2704 | mov -16(%rax),%rbx | ||
2705 | mov -24(%rax),%rbp | ||
2706 | mov -32(%rax),%r12 | ||
2707 | mov -40(%rax),%r13 | ||
2708 | mov -48(%rax),%r14 | ||
2709 | mov -56(%rax),%r15 | ||
2710 | mov %rbx,144($context) # restore context->Rbx | ||
2711 | mov %rbp,160($context) # restore context->Rbp | ||
2712 | mov %r12,216($context) # restore context->R12 | ||
2713 | mov %r13,224($context) # restore context->R13 | ||
2714 | mov %r14,232($context) # restore context->R14 | ||
2715 | mov %r15,240($context) # restore context->R15 | ||
2716 | |||
2717 | .Lin_cbc_prologue: | ||
2718 | mov 8(%rax),%rdi | ||
2719 | mov 16(%rax),%rsi | ||
2720 | mov %rax,152($context) # restore context->Rsp | ||
2721 | mov %rsi,168($context) # restore context->Rsi | ||
2722 | mov %rdi,176($context) # restore context->Rdi | ||
2723 | |||
2724 | .Lcommon_seh_exit: | ||
2725 | |||
2726 | mov 40($disp),%rdi # disp->ContextRecord | ||
2727 | mov $context,%rsi # context | ||
2728 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
2729 | .long 0xa548f3fc # cld; rep movsq | ||
2730 | |||
2731 | mov $disp,%rsi | ||
2732 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
2733 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
2734 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
2735 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
2736 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
2737 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
2738 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
2739 | mov %r10,32(%rsp) # arg5 | ||
2740 | mov %r11,40(%rsp) # arg6 | ||
2741 | mov %r12,48(%rsp) # arg7 | ||
2742 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
2743 | call *__imp_RtlVirtualUnwind(%rip) | ||
2744 | |||
2745 | mov \$1,%eax # ExceptionContinueSearch | ||
2746 | add \$64,%rsp | ||
2747 | popfq | ||
2748 | pop %r15 | ||
2749 | pop %r14 | ||
2750 | pop %r13 | ||
2751 | pop %r12 | ||
2752 | pop %rbp | ||
2753 | pop %rbx | ||
2754 | pop %rdi | ||
2755 | pop %rsi | ||
2756 | ret | ||
2757 | .size cbc_se_handler,.-cbc_se_handler | ||
2758 | |||
2759 | .section .pdata | ||
2760 | .align 4 | ||
2761 | .rva .LSEH_begin_AES_encrypt | ||
2762 | .rva .LSEH_end_AES_encrypt | ||
2763 | .rva .LSEH_info_AES_encrypt | ||
2764 | |||
2765 | .rva .LSEH_begin_AES_decrypt | ||
2766 | .rva .LSEH_end_AES_decrypt | ||
2767 | .rva .LSEH_info_AES_decrypt | ||
2768 | |||
2769 | .rva .LSEH_begin_AES_set_encrypt_key | ||
2770 | .rva .LSEH_end_AES_set_encrypt_key | ||
2771 | .rva .LSEH_info_AES_set_encrypt_key | ||
2772 | |||
2773 | .rva .LSEH_begin_AES_set_decrypt_key | ||
2774 | .rva .LSEH_end_AES_set_decrypt_key | ||
2775 | .rva .LSEH_info_AES_set_decrypt_key | ||
2776 | |||
2777 | .rva .LSEH_begin_AES_cbc_encrypt | ||
2778 | .rva .LSEH_end_AES_cbc_encrypt | ||
2779 | .rva .LSEH_info_AES_cbc_encrypt | ||
2780 | |||
2781 | .section .xdata | ||
2782 | .align 8 | ||
2783 | .LSEH_info_AES_encrypt: | ||
2784 | .byte 9,0,0,0 | ||
2785 | .rva block_se_handler | ||
2786 | .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[] | ||
2787 | .LSEH_info_AES_decrypt: | ||
2788 | .byte 9,0,0,0 | ||
2789 | .rva block_se_handler | ||
2790 | .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] | ||
2791 | .LSEH_info_AES_set_encrypt_key: | ||
2792 | .byte 9,0,0,0 | ||
2793 | .rva key_se_handler | ||
2794 | .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] | ||
2795 | .LSEH_info_AES_set_decrypt_key: | ||
2796 | .byte 9,0,0,0 | ||
2797 | .rva key_se_handler | ||
2798 | .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] | ||
2799 | .LSEH_info_AES_cbc_encrypt: | ||
2800 | .byte 9,0,0,0 | ||
2801 | .rva cbc_se_handler | ||
2802 | ___ | ||
2803 | } | ||
1574 | 2804 | ||
1575 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | 2805 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; |
1576 | 2806 | ||