diff options
author | djm <> | 2012-10-13 21:25:14 +0000 |
---|---|---|
committer | djm <> | 2012-10-13 21:25:14 +0000 |
commit | 93723b50b639d8dc717bc1bf463fd46e1b321239 (patch) | |
tree | 281e0a29ae8f87a8c47fbd4deaa1f3d48b8cc5c1 /src/lib/libcrypto/aes | |
parent | 65e72ac55a6405783db7a12d7e35a7561d46005b (diff) | |
download | openbsd-93723b50b639d8dc717bc1bf463fd46e1b321239.tar.gz openbsd-93723b50b639d8dc717bc1bf463fd46e1b321239.tar.bz2 openbsd-93723b50b639d8dc717bc1bf463fd46e1b321239.zip |
resolve conflicts
Diffstat (limited to 'src/lib/libcrypto/aes')
-rw-r--r-- | src/lib/libcrypto/aes/aes.h | 5 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/aes_core.c | 12 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/aes_misc.c | 21 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/asm/aes-586.pl | 14 | ||||
-rwxr-xr-x | src/lib/libcrypto/aes/asm/aes-x86_64.pl | 45 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/asm/aesni-x86_64.pl | 2478 |
6 files changed, 2343 insertions, 232 deletions
diff --git a/src/lib/libcrypto/aes/aes.h b/src/lib/libcrypto/aes/aes.h index d2c99730fe..031abf01b5 100644 --- a/src/lib/libcrypto/aes/aes.h +++ b/src/lib/libcrypto/aes/aes.h | |||
@@ -90,6 +90,11 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | |||
90 | int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | 90 | int AES_set_decrypt_key(const unsigned char *userKey, const int bits, |
91 | AES_KEY *key); | 91 | AES_KEY *key); |
92 | 92 | ||
93 | int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, | ||
94 | AES_KEY *key); | ||
95 | int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, | ||
96 | AES_KEY *key); | ||
97 | |||
93 | void AES_encrypt(const unsigned char *in, unsigned char *out, | 98 | void AES_encrypt(const unsigned char *in, unsigned char *out, |
94 | const AES_KEY *key); | 99 | const AES_KEY *key); |
95 | void AES_decrypt(const unsigned char *in, unsigned char *out, | 100 | void AES_decrypt(const unsigned char *in, unsigned char *out, |
diff --git a/src/lib/libcrypto/aes/aes_core.c b/src/lib/libcrypto/aes/aes_core.c index a7ec54f4da..8f5210ac70 100644 --- a/src/lib/libcrypto/aes/aes_core.c +++ b/src/lib/libcrypto/aes/aes_core.c | |||
@@ -625,7 +625,7 @@ static const u32 rcon[] = { | |||
625 | /** | 625 | /** |
626 | * Expand the cipher key into the encryption key schedule. | 626 | * Expand the cipher key into the encryption key schedule. |
627 | */ | 627 | */ |
628 | int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | 628 | int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, |
629 | AES_KEY *key) { | 629 | AES_KEY *key) { |
630 | 630 | ||
631 | u32 *rk; | 631 | u32 *rk; |
@@ -726,7 +726,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | |||
726 | /** | 726 | /** |
727 | * Expand the cipher key into the decryption key schedule. | 727 | * Expand the cipher key into the decryption key schedule. |
728 | */ | 728 | */ |
729 | int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | 729 | int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, |
730 | AES_KEY *key) { | 730 | AES_KEY *key) { |
731 | 731 | ||
732 | u32 *rk; | 732 | u32 *rk; |
@@ -734,7 +734,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | |||
734 | u32 temp; | 734 | u32 temp; |
735 | 735 | ||
736 | /* first, start with an encryption schedule */ | 736 | /* first, start with an encryption schedule */ |
737 | status = AES_set_encrypt_key(userKey, bits, key); | 737 | status = private_AES_set_encrypt_key(userKey, bits, key); |
738 | if (status < 0) | 738 | if (status < 0) |
739 | return status; | 739 | return status; |
740 | 740 | ||
@@ -1201,7 +1201,7 @@ static const u32 rcon[] = { | |||
1201 | /** | 1201 | /** |
1202 | * Expand the cipher key into the encryption key schedule. | 1202 | * Expand the cipher key into the encryption key schedule. |
1203 | */ | 1203 | */ |
1204 | int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | 1204 | int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, |
1205 | AES_KEY *key) { | 1205 | AES_KEY *key) { |
1206 | u32 *rk; | 1206 | u32 *rk; |
1207 | int i = 0; | 1207 | int i = 0; |
@@ -1301,7 +1301,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | |||
1301 | /** | 1301 | /** |
1302 | * Expand the cipher key into the decryption key schedule. | 1302 | * Expand the cipher key into the decryption key schedule. |
1303 | */ | 1303 | */ |
1304 | int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | 1304 | int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, |
1305 | AES_KEY *key) { | 1305 | AES_KEY *key) { |
1306 | 1306 | ||
1307 | u32 *rk; | 1307 | u32 *rk; |
@@ -1309,7 +1309,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | |||
1309 | u32 temp; | 1309 | u32 temp; |
1310 | 1310 | ||
1311 | /* first, start with an encryption schedule */ | 1311 | /* first, start with an encryption schedule */ |
1312 | status = AES_set_encrypt_key(userKey, bits, key); | 1312 | status = private_AES_set_encrypt_key(userKey, bits, key); |
1313 | if (status < 0) | 1313 | if (status < 0) |
1314 | return status; | 1314 | return status; |
1315 | 1315 | ||
diff --git a/src/lib/libcrypto/aes/aes_misc.c b/src/lib/libcrypto/aes/aes_misc.c index 4fead1b4c7..f083488ecb 100644 --- a/src/lib/libcrypto/aes/aes_misc.c +++ b/src/lib/libcrypto/aes/aes_misc.c | |||
@@ -50,6 +50,7 @@ | |||
50 | */ | 50 | */ |
51 | 51 | ||
52 | #include <openssl/opensslv.h> | 52 | #include <openssl/opensslv.h> |
53 | #include <openssl/crypto.h> | ||
53 | #include <openssl/aes.h> | 54 | #include <openssl/aes.h> |
54 | #include "aes_locl.h" | 55 | #include "aes_locl.h" |
55 | 56 | ||
@@ -62,3 +63,23 @@ const char *AES_options(void) { | |||
62 | return "aes(partial)"; | 63 | return "aes(partial)"; |
63 | #endif | 64 | #endif |
64 | } | 65 | } |
66 | |||
67 | /* FIPS wrapper functions to block low level AES calls in FIPS mode */ | ||
68 | |||
69 | int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | ||
70 | AES_KEY *key) | ||
71 | { | ||
72 | #ifdef OPENSSL_FIPS | ||
73 | fips_cipher_abort(AES); | ||
74 | #endif | ||
75 | return private_AES_set_encrypt_key(userKey, bits, key); | ||
76 | } | ||
77 | |||
78 | int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | ||
79 | AES_KEY *key) | ||
80 | { | ||
81 | #ifdef OPENSSL_FIPS | ||
82 | fips_cipher_abort(AES); | ||
83 | #endif | ||
84 | return private_AES_set_decrypt_key(userKey, bits, key); | ||
85 | } | ||
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl index aab40e6f1c..687ed811be 100644 --- a/src/lib/libcrypto/aes/asm/aes-586.pl +++ b/src/lib/libcrypto/aes/asm/aes-586.pl | |||
@@ -39,7 +39,7 @@ | |||
39 | # but exhibits up to 10% improvement on other cores. | 39 | # but exhibits up to 10% improvement on other cores. |
40 | # | 40 | # |
41 | # Second version is "monolithic" replacement for aes_core.c, which in | 41 | # Second version is "monolithic" replacement for aes_core.c, which in |
42 | # addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key. | 42 | # addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key. |
43 | # This made it possible to implement little-endian variant of the | 43 | # This made it possible to implement little-endian variant of the |
44 | # algorithm without modifying the base C code. Motivating factor for | 44 | # algorithm without modifying the base C code. Motivating factor for |
45 | # the undertaken effort was that it appeared that in tight IA-32 | 45 | # the undertaken effort was that it appeared that in tight IA-32 |
@@ -2854,12 +2854,12 @@ sub enckey() | |||
2854 | &set_label("exit"); | 2854 | &set_label("exit"); |
2855 | &function_end("_x86_AES_set_encrypt_key"); | 2855 | &function_end("_x86_AES_set_encrypt_key"); |
2856 | 2856 | ||
2857 | # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | 2857 | # int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, |
2858 | # AES_KEY *key) | 2858 | # AES_KEY *key) |
2859 | &function_begin_B("AES_set_encrypt_key"); | 2859 | &function_begin_B("private_AES_set_encrypt_key"); |
2860 | &call ("_x86_AES_set_encrypt_key"); | 2860 | &call ("_x86_AES_set_encrypt_key"); |
2861 | &ret (); | 2861 | &ret (); |
2862 | &function_end_B("AES_set_encrypt_key"); | 2862 | &function_end_B("private_AES_set_encrypt_key"); |
2863 | 2863 | ||
2864 | sub deckey() | 2864 | sub deckey() |
2865 | { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; | 2865 | { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; |
@@ -2916,9 +2916,9 @@ sub deckey() | |||
2916 | &mov (&DWP(4*$i,$key),$tp1); | 2916 | &mov (&DWP(4*$i,$key),$tp1); |
2917 | } | 2917 | } |
2918 | 2918 | ||
2919 | # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | 2919 | # int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, |
2920 | # AES_KEY *key) | 2920 | # AES_KEY *key) |
2921 | &function_begin_B("AES_set_decrypt_key"); | 2921 | &function_begin_B("private_AES_set_decrypt_key"); |
2922 | &call ("_x86_AES_set_encrypt_key"); | 2922 | &call ("_x86_AES_set_encrypt_key"); |
2923 | &cmp ("eax",0); | 2923 | &cmp ("eax",0); |
2924 | &je (&label("proceed")); | 2924 | &je (&label("proceed")); |
@@ -2974,7 +2974,7 @@ sub deckey() | |||
2974 | &jb (&label("permute")); | 2974 | &jb (&label("permute")); |
2975 | 2975 | ||
2976 | &xor ("eax","eax"); # return success | 2976 | &xor ("eax","eax"); # return success |
2977 | &function_end("AES_set_decrypt_key"); | 2977 | &function_end("private_AES_set_decrypt_key"); |
2978 | &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>"); | 2978 | &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>"); |
2979 | 2979 | ||
2980 | &asm_finish(); | 2980 | &asm_finish(); |
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl index 53e4ef85fd..027b4ae2e5 100755 --- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl +++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl | |||
@@ -588,6 +588,9 @@ $code.=<<___; | |||
588 | .globl AES_encrypt | 588 | .globl AES_encrypt |
589 | .type AES_encrypt,\@function,3 | 589 | .type AES_encrypt,\@function,3 |
590 | .align 16 | 590 | .align 16 |
591 | .globl asm_AES_encrypt | ||
592 | .hidden asm_AES_encrypt | ||
593 | asm_AES_encrypt: | ||
591 | AES_encrypt: | 594 | AES_encrypt: |
592 | push %rbx | 595 | push %rbx |
593 | push %rbp | 596 | push %rbp |
@@ -1184,6 +1187,9 @@ $code.=<<___; | |||
1184 | .globl AES_decrypt | 1187 | .globl AES_decrypt |
1185 | .type AES_decrypt,\@function,3 | 1188 | .type AES_decrypt,\@function,3 |
1186 | .align 16 | 1189 | .align 16 |
1190 | .globl asm_AES_decrypt | ||
1191 | .hidden asm_AES_decrypt | ||
1192 | asm_AES_decrypt: | ||
1187 | AES_decrypt: | 1193 | AES_decrypt: |
1188 | push %rbx | 1194 | push %rbx |
1189 | push %rbp | 1195 | push %rbp |
@@ -1277,13 +1283,13 @@ $code.=<<___; | |||
1277 | ___ | 1283 | ___ |
1278 | } | 1284 | } |
1279 | 1285 | ||
1280 | # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | 1286 | # int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, |
1281 | # AES_KEY *key) | 1287 | # AES_KEY *key) |
1282 | $code.=<<___; | 1288 | $code.=<<___; |
1283 | .globl AES_set_encrypt_key | 1289 | .globl private_AES_set_encrypt_key |
1284 | .type AES_set_encrypt_key,\@function,3 | 1290 | .type private_AES_set_encrypt_key,\@function,3 |
1285 | .align 16 | 1291 | .align 16 |
1286 | AES_set_encrypt_key: | 1292 | private_AES_set_encrypt_key: |
1287 | push %rbx | 1293 | push %rbx |
1288 | push %rbp | 1294 | push %rbp |
1289 | push %r12 # redundant, but allows to share | 1295 | push %r12 # redundant, but allows to share |
@@ -1304,7 +1310,7 @@ AES_set_encrypt_key: | |||
1304 | add \$56,%rsp | 1310 | add \$56,%rsp |
1305 | .Lenc_key_epilogue: | 1311 | .Lenc_key_epilogue: |
1306 | ret | 1312 | ret |
1307 | .size AES_set_encrypt_key,.-AES_set_encrypt_key | 1313 | .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key |
1308 | 1314 | ||
1309 | .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent | 1315 | .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent |
1310 | .align 16 | 1316 | .align 16 |
@@ -1547,13 +1553,13 @@ $code.=<<___; | |||
1547 | ___ | 1553 | ___ |
1548 | } | 1554 | } |
1549 | 1555 | ||
1550 | # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | 1556 | # int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, |
1551 | # AES_KEY *key) | 1557 | # AES_KEY *key) |
1552 | $code.=<<___; | 1558 | $code.=<<___; |
1553 | .globl AES_set_decrypt_key | 1559 | .globl private_AES_set_decrypt_key |
1554 | .type AES_set_decrypt_key,\@function,3 | 1560 | .type private_AES_set_decrypt_key,\@function,3 |
1555 | .align 16 | 1561 | .align 16 |
1556 | AES_set_decrypt_key: | 1562 | private_AES_set_decrypt_key: |
1557 | push %rbx | 1563 | push %rbx |
1558 | push %rbp | 1564 | push %rbp |
1559 | push %r12 | 1565 | push %r12 |
@@ -1622,7 +1628,7 @@ $code.=<<___; | |||
1622 | add \$56,%rsp | 1628 | add \$56,%rsp |
1623 | .Ldec_key_epilogue: | 1629 | .Ldec_key_epilogue: |
1624 | ret | 1630 | ret |
1625 | .size AES_set_decrypt_key,.-AES_set_decrypt_key | 1631 | .size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key |
1626 | ___ | 1632 | ___ |
1627 | 1633 | ||
1628 | # void AES_cbc_encrypt (const void char *inp, unsigned char *out, | 1634 | # void AES_cbc_encrypt (const void char *inp, unsigned char *out, |
@@ -1648,6 +1654,9 @@ $code.=<<___; | |||
1648 | .type AES_cbc_encrypt,\@function,6 | 1654 | .type AES_cbc_encrypt,\@function,6 |
1649 | .align 16 | 1655 | .align 16 |
1650 | .extern OPENSSL_ia32cap_P | 1656 | .extern OPENSSL_ia32cap_P |
1657 | .globl asm_AES_cbc_encrypt | ||
1658 | .hidden asm_AES_cbc_encrypt | ||
1659 | asm_AES_cbc_encrypt: | ||
1651 | AES_cbc_encrypt: | 1660 | AES_cbc_encrypt: |
1652 | cmp \$0,%rdx # check length | 1661 | cmp \$0,%rdx # check length |
1653 | je .Lcbc_epilogue | 1662 | je .Lcbc_epilogue |
@@ -2766,13 +2775,13 @@ cbc_se_handler: | |||
2766 | .rva .LSEH_end_AES_decrypt | 2775 | .rva .LSEH_end_AES_decrypt |
2767 | .rva .LSEH_info_AES_decrypt | 2776 | .rva .LSEH_info_AES_decrypt |
2768 | 2777 | ||
2769 | .rva .LSEH_begin_AES_set_encrypt_key | 2778 | .rva .LSEH_begin_private_AES_set_encrypt_key |
2770 | .rva .LSEH_end_AES_set_encrypt_key | 2779 | .rva .LSEH_end_private_AES_set_encrypt_key |
2771 | .rva .LSEH_info_AES_set_encrypt_key | 2780 | .rva .LSEH_info_private_AES_set_encrypt_key |
2772 | 2781 | ||
2773 | .rva .LSEH_begin_AES_set_decrypt_key | 2782 | .rva .LSEH_begin_private_AES_set_decrypt_key |
2774 | .rva .LSEH_end_AES_set_decrypt_key | 2783 | .rva .LSEH_end_private_AES_set_decrypt_key |
2775 | .rva .LSEH_info_AES_set_decrypt_key | 2784 | .rva .LSEH_info_private_AES_set_decrypt_key |
2776 | 2785 | ||
2777 | .rva .LSEH_begin_AES_cbc_encrypt | 2786 | .rva .LSEH_begin_AES_cbc_encrypt |
2778 | .rva .LSEH_end_AES_cbc_encrypt | 2787 | .rva .LSEH_end_AES_cbc_encrypt |
@@ -2788,11 +2797,11 @@ cbc_se_handler: | |||
2788 | .byte 9,0,0,0 | 2797 | .byte 9,0,0,0 |
2789 | .rva block_se_handler | 2798 | .rva block_se_handler |
2790 | .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] | 2799 | .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] |
2791 | .LSEH_info_AES_set_encrypt_key: | 2800 | .LSEH_info_private_AES_set_encrypt_key: |
2792 | .byte 9,0,0,0 | 2801 | .byte 9,0,0,0 |
2793 | .rva key_se_handler | 2802 | .rva key_se_handler |
2794 | .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] | 2803 | .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] |
2795 | .LSEH_info_AES_set_decrypt_key: | 2804 | .LSEH_info_private_AES_set_decrypt_key: |
2796 | .byte 9,0,0,0 | 2805 | .byte 9,0,0,0 |
2797 | .rva key_se_handler | 2806 | .rva key_se_handler |
2798 | .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] | 2807 | .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] |
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl index 49e0f4b351..499f3b3f42 100644 --- a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl +++ b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl | |||
@@ -11,6 +11,151 @@ | |||
11 | # OpenSSL context it's used with Intel engine, but can also be used as | 11 | # OpenSSL context it's used with Intel engine, but can also be used as |
12 | # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for | 12 | # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for |
13 | # details]. | 13 | # details]. |
14 | # | ||
15 | # Performance. | ||
16 | # | ||
17 | # Given aes(enc|dec) instructions' latency asymptotic performance for | ||
18 | # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte | ||
19 | # processed with 128-bit key. And given their throughput asymptotic | ||
20 | # performance for parallelizable modes is 1.25 cycles per byte. Being | ||
21 | # asymptotic limit it's not something you commonly achieve in reality, | ||
22 | # but how close does one get? Below are results collected for | ||
23 | # different modes and block sizes. Pairs of numbers are for en-/ | ||
24 | # decryption. | ||
25 | # | ||
26 | # 16-byte 64-byte 256-byte 1-KB 8-KB | ||
27 | # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 | ||
28 | # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 | ||
29 | # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 | ||
30 | # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 | ||
31 | # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 | ||
32 | # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 | ||
33 | # | ||
34 | # ECB, CTR, CBC and CCM results are free from EVP overhead. This means | ||
35 | # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni | ||
36 | # [-decrypt]' will exhibit 10-15% worse results for smaller blocks. | ||
37 | # The results were collected with specially crafted speed.c benchmark | ||
38 | # in order to compare them with results reported in "Intel Advanced | ||
39 | # Encryption Standard (AES) New Instruction Set" White Paper Revision | ||
40 | # 3.0 dated May 2010. All above results are consistently better. This | ||
41 | # module also provides better performance for block sizes smaller than | ||
42 | # 128 bytes in points *not* represented in the above table. | ||
43 | # | ||
44 | # Looking at the results for 8-KB buffer. | ||
45 | # | ||
46 | # CFB and OFB results are far from the limit, because implementation | ||
47 | # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on | ||
48 | # single-block aesni_encrypt, which is not the most optimal way to go. | ||
49 | # CBC encrypt result is unexpectedly high and there is no documented | ||
50 | # explanation for it. Seemingly there is a small penalty for feeding | ||
51 | # the result back to AES unit the way it's done in CBC mode. There is | ||
52 | # nothing one can do and the result appears optimal. CCM result is | ||
53 | # identical to CBC, because CBC-MAC is essentially CBC encrypt without | ||
54 | # saving output. CCM CTR "stays invisible," because it's neatly | ||
55 | # interleaved with CBC-MAC. This provides ~30% improvement over | ||
56 | # "straightforward" CCM implementation with CTR and CBC-MAC performed | ||
57 | # disjointly. Parallelizable modes practically achieve the theoretical | ||
58 | # limit. | ||
59 | # | ||
60 | # Looking at how results vary with buffer size. | ||
61 | # | ||
62 | # Curves are practically saturated at 1-KB buffer size. In most cases | ||
63 | # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. | ||
64 | # CTR curve doesn't follow this pattern and is "slowest" changing one | ||
65 | # with "256-byte" result being 87% of "8-KB." This is because overhead | ||
66 | # in CTR mode is most computationally intensive. Small-block CCM | ||
67 | # decrypt is slower than encrypt, because first CTR and last CBC-MAC | ||
68 | # iterations can't be interleaved. | ||
69 | # | ||
70 | # Results for 192- and 256-bit keys. | ||
71 | # | ||
72 | # EVP-free results were observed to scale perfectly with number of | ||
73 | # rounds for larger block sizes, i.e. 192-bit result being 10/12 times | ||
74 | # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences | ||
75 | # are a tad smaller, because the above mentioned penalty biases all | ||
76 | # results by same constant value. In similar way function call | ||
77 | # overhead affects small-block performance, as well as OFB and CFB | ||
78 | # results. Differences are not large, most common coefficients are | ||
79 | # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one | ||
80 | # can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... | ||
81 | |||
82 | # January 2011 | ||
83 | # | ||
84 | # While Westmere processor features 6 cycles latency for aes[enc|dec] | ||
85 | # instructions, which can be scheduled every second cycle, Sandy | ||
86 | # Bridge spends 8 cycles per instruction, but it can schedule them | ||
87 | # every cycle. This means that code targeting Westmere would perform | ||
88 | # suboptimally on Sandy Bridge. Therefore this update. | ||
89 | # | ||
90 | # In addition, non-parallelizable CBC encrypt (as well as CCM) is | ||
91 | # optimized. Relative improvement might appear modest, 8% on Westmere, | ||
92 | # but in absolute terms it's 3.77 cycles per byte encrypted with | ||
93 | # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers | ||
94 | # should be compared to asymptotic limits of 3.75 for Westmere and | ||
95 | # 5.00 for Sandy Bridge. Actually, the fact that they get this close | ||
96 | # to asymptotic limits is quite amazing. Indeed, the limit is | ||
97 | # calculated as latency times number of rounds, 10 for 128-bit key, | ||
98 | # and divided by 16, the number of bytes in block, or in other words | ||
99 | # it accounts *solely* for aesenc instructions. But there are extra | ||
100 | # instructions, and numbers so close to the asymptotic limits mean | ||
101 | # that it's as if it takes as little as *one* additional cycle to | ||
102 | # execute all of them. How is it possible? It is possible thanks to | ||
103 | # out-of-order execution logic, which manages to overlap post- | ||
104 | # processing of previous block, things like saving the output, with | ||
105 | # actual encryption of current block, as well as pre-processing of | ||
106 | # current block, things like fetching input and xor-ing it with | ||
107 | # 0-round element of the key schedule, with actual encryption of | ||
108 | # previous block. Keep this in mind... | ||
109 | # | ||
110 | # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher | ||
111 | # performance is achieved by interleaving instructions working on | ||
112 | # independent blocks. In which case asymptotic limit for such modes | ||
113 | # can be obtained by dividing above mentioned numbers by AES | ||
114 | # instructions' interleave factor. Westmere can execute at most 3 | ||
115 | # instructions at a time, meaning that optimal interleave factor is 3, | ||
116 | # and that's where the "magic" number of 1.25 comes from. "Optimal | ||
117 | # interleave factor" means that increase of interleave factor does | ||
118 | # not improve performance. The formula has proven to reflect reality | ||
119 | # pretty well on Westmere... Sandy Bridge on the other hand can | ||
120 | # execute up to 8 AES instructions at a time, so how does varying | ||
121 | # interleave factor affect the performance? Here is table for ECB | ||
122 | # (numbers are cycles per byte processed with 128-bit key): | ||
123 | # | ||
124 | # instruction interleave factor 3x 6x 8x | ||
125 | # theoretical asymptotic limit 1.67 0.83 0.625 | ||
126 | # measured performance for 8KB block 1.05 0.86 0.84 | ||
127 | # | ||
128 | # "as if" interleave factor 4.7x 5.8x 6.0x | ||
129 | # | ||
130 | # Further data for other parallelizable modes: | ||
131 | # | ||
132 | # CBC decrypt 1.16 0.93 0.93 | ||
133 | # CTR 1.14 0.91 n/a | ||
134 | # | ||
135 | # Well, given 3x column it's probably inappropriate to call the limit | ||
136 | # asymptotic, if it can be surpassed, isn't it? What happens there? | ||
137 | # Rewind to CBC paragraph for the answer. Yes, out-of-order execution | ||
138 | # magic is responsible for this. Processor overlaps not only the | ||
139 | # additional instructions with AES ones, but even AES instructions | ||
140 | # processing adjacent triplets of independent blocks. In the 6x case | ||
141 | # additional instructions still claim disproportionally small amount | ||
142 | # of additional cycles, but in 8x case number of instructions must be | ||
143 | # a tad too high for out-of-order logic to cope with, and AES unit | ||
144 | # remains underutilized... As you can see 8x interleave is hardly | ||
145 | # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl | ||
146 | # utilizes 6x interleave because of limited register bank capacity. | ||
147 | # | ||
148 | # Higher interleave factors do have negative impact on Westmere | ||
149 | # performance. While for ECB mode it's negligible ~1.5%, other | ||
150 | # parallelizables perform ~5% worse, which is outweighed by ~25% | ||
151 | # improvement on Sandy Bridge. To balance regression on Westmere | ||
152 | # CTR mode was implemented with 6x aesenc interleave factor. | ||
153 | |||
154 | # April 2011 | ||
155 | # | ||
156 | # Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing | ||
157 | # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like | ||
158 | # in CTR mode AES instruction interleave factor was chosen to be 6x. | ||
14 | 159 | ||
15 | $PREFIX="aesni"; # if $PREFIX is set to "AES", the script | 160 | $PREFIX="aesni"; # if $PREFIX is set to "AES", the script |
16 | # generates drop-in replacement for | 161 | # generates drop-in replacement for |
@@ -29,7 +174,7 @@ die "can't locate x86_64-xlate.pl"; | |||
29 | 174 | ||
30 | open STDOUT,"| $^X $xlate $flavour $output"; | 175 | open STDOUT,"| $^X $xlate $flavour $output"; |
31 | 176 | ||
32 | $movkey = $PREFIX eq "aesni" ? "movaps" : "movups"; | 177 | $movkey = $PREFIX eq "aesni" ? "movups" : "movups"; |
33 | @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order | 178 | @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order |
34 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | 179 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order |
35 | 180 | ||
@@ -41,18 +186,20 @@ $inp="%rdi"; | |||
41 | $out="%rsi"; | 186 | $out="%rsi"; |
42 | $len="%rdx"; | 187 | $len="%rdx"; |
43 | $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! | 188 | $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! |
44 | $ivp="%r8"; # cbc | 189 | $ivp="%r8"; # cbc, ctr, ... |
45 | 190 | ||
46 | $rnds_="%r10d"; # backup copy for $rounds | 191 | $rnds_="%r10d"; # backup copy for $rounds |
47 | $key_="%r11"; # backup copy for $key | 192 | $key_="%r11"; # backup copy for $key |
48 | 193 | ||
49 | # %xmm register layout | 194 | # %xmm register layout |
50 | $inout0="%xmm0"; $inout1="%xmm1"; | 195 | $rndkey0="%xmm0"; $rndkey1="%xmm1"; |
51 | $inout2="%xmm2"; $inout3="%xmm3"; | 196 | $inout0="%xmm2"; $inout1="%xmm3"; |
52 | $rndkey0="%xmm4"; $rndkey1="%xmm5"; | 197 | $inout2="%xmm4"; $inout3="%xmm5"; |
53 | 198 | $inout4="%xmm6"; $inout5="%xmm7"; | |
54 | $iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt | 199 | $inout6="%xmm8"; $inout7="%xmm9"; |
55 | $in1="%xmm8"; $in2="%xmm9"; | 200 | |
201 | $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... | ||
202 | $in0="%xmm8"; $iv="%xmm9"; | ||
56 | 203 | ||
57 | # Inline version of internal aesni_[en|de]crypt1. | 204 | # Inline version of internal aesni_[en|de]crypt1. |
58 | # | 205 | # |
@@ -60,20 +207,29 @@ $in1="%xmm8"; $in2="%xmm9"; | |||
60 | # cycles which take care of loop variables... | 207 | # cycles which take care of loop variables... |
61 | { my $sn; | 208 | { my $sn; |
62 | sub aesni_generate1 { | 209 | sub aesni_generate1 { |
63 | my ($p,$key,$rounds)=@_; | 210 | my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); |
64 | ++$sn; | 211 | ++$sn; |
65 | $code.=<<___; | 212 | $code.=<<___; |
66 | $movkey ($key),$rndkey0 | 213 | $movkey ($key),$rndkey0 |
67 | $movkey 16($key),$rndkey1 | 214 | $movkey 16($key),$rndkey1 |
215 | ___ | ||
216 | $code.=<<___ if (defined($ivec)); | ||
217 | xorps $rndkey0,$ivec | ||
218 | lea 32($key),$key | ||
219 | xorps $ivec,$inout | ||
220 | ___ | ||
221 | $code.=<<___ if (!defined($ivec)); | ||
68 | lea 32($key),$key | 222 | lea 32($key),$key |
69 | pxor $rndkey0,$inout0 | 223 | xorps $rndkey0,$inout |
224 | ___ | ||
225 | $code.=<<___; | ||
70 | .Loop_${p}1_$sn: | 226 | .Loop_${p}1_$sn: |
71 | aes${p} $rndkey1,$inout0 | 227 | aes${p} $rndkey1,$inout |
72 | dec $rounds | 228 | dec $rounds |
73 | $movkey ($key),$rndkey1 | 229 | $movkey ($key),$rndkey1 |
74 | lea 16($key),$key | 230 | lea 16($key),$key |
75 | jnz .Loop_${p}1_$sn # loop body is 16 bytes | 231 | jnz .Loop_${p}1_$sn # loop body is 16 bytes |
76 | aes${p}last $rndkey1,$inout0 | 232 | aes${p}last $rndkey1,$inout |
77 | ___ | 233 | ___ |
78 | }} | 234 | }} |
79 | # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); | 235 | # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); |
@@ -86,7 +242,7 @@ $code.=<<___; | |||
86 | .align 16 | 242 | .align 16 |
87 | ${PREFIX}_encrypt: | 243 | ${PREFIX}_encrypt: |
88 | movups ($inp),$inout0 # load input | 244 | movups ($inp),$inout0 # load input |
89 | mov 240($key),$rounds # pull $rounds | 245 | mov 240($key),$rounds # key->rounds |
90 | ___ | 246 | ___ |
91 | &aesni_generate1("enc",$key,$rounds); | 247 | &aesni_generate1("enc",$key,$rounds); |
92 | $code.=<<___; | 248 | $code.=<<___; |
@@ -99,7 +255,7 @@ $code.=<<___; | |||
99 | .align 16 | 255 | .align 16 |
100 | ${PREFIX}_decrypt: | 256 | ${PREFIX}_decrypt: |
101 | movups ($inp),$inout0 # load input | 257 | movups ($inp),$inout0 # load input |
102 | mov 240($key),$rounds # pull $rounds | 258 | mov 240($key),$rounds # key->rounds |
103 | ___ | 259 | ___ |
104 | &aesni_generate1("dec",$key,$rounds); | 260 | &aesni_generate1("dec",$key,$rounds); |
105 | $code.=<<___; | 261 | $code.=<<___; |
@@ -109,16 +265,16 @@ $code.=<<___; | |||
109 | ___ | 265 | ___ |
110 | } | 266 | } |
111 | 267 | ||
112 | # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave | 268 | # _aesni_[en|de]cryptN are private interfaces, N denotes interleave |
113 | # factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] | 269 | # factor. Why 3x subroutine were originally used in loops? Even though |
114 | # latency is 6, it turned out that it can be scheduled only every | 270 | # aes[enc|dec] latency was originally 6, it could be scheduled only |
115 | # *second* cycle. Thus 3x interleave is the one providing optimal | 271 | # every *2nd* cycle. Thus 3x interleave was the one providing optimal |
116 | # utilization, i.e. when subroutine's throughput is virtually same as | 272 | # utilization, i.e. when subroutine's throughput is virtually same as |
117 | # of non-interleaved subroutine [for number of input blocks up to 3]. | 273 | # of non-interleaved subroutine [for number of input blocks up to 3]. |
118 | # This is why it makes no sense to implement 2x subroutine. As soon | 274 | # This is why it makes no sense to implement 2x subroutine. |
119 | # as/if Intel improves throughput by making it possible to schedule | 275 | # aes[enc|dec] latency in next processor generation is 8, but the |
120 | # the instructions in question *every* cycles I would have to | 276 | # instructions can be scheduled every cycle. Optimal interleave for |
121 | # implement 6x interleave and use it in loop... | 277 | # new processor is therefore 8x... |
122 | sub aesni_generate3 { | 278 | sub aesni_generate3 { |
123 | my $dir=shift; | 279 | my $dir=shift; |
124 | # As already mentioned it takes in $key and $rounds, which are *not* | 280 | # As already mentioned it takes in $key and $rounds, which are *not* |
@@ -131,25 +287,25 @@ _aesni_${dir}rypt3: | |||
131 | shr \$1,$rounds | 287 | shr \$1,$rounds |
132 | $movkey 16($key),$rndkey1 | 288 | $movkey 16($key),$rndkey1 |
133 | lea 32($key),$key | 289 | lea 32($key),$key |
134 | pxor $rndkey0,$inout0 | 290 | xorps $rndkey0,$inout0 |
135 | pxor $rndkey0,$inout1 | 291 | xorps $rndkey0,$inout1 |
136 | pxor $rndkey0,$inout2 | 292 | xorps $rndkey0,$inout2 |
293 | $movkey ($key),$rndkey0 | ||
137 | 294 | ||
138 | .L${dir}_loop3: | 295 | .L${dir}_loop3: |
139 | aes${dir} $rndkey1,$inout0 | 296 | aes${dir} $rndkey1,$inout0 |
140 | $movkey ($key),$rndkey0 | ||
141 | aes${dir} $rndkey1,$inout1 | 297 | aes${dir} $rndkey1,$inout1 |
142 | dec $rounds | 298 | dec $rounds |
143 | aes${dir} $rndkey1,$inout2 | 299 | aes${dir} $rndkey1,$inout2 |
144 | aes${dir} $rndkey0,$inout0 | ||
145 | $movkey 16($key),$rndkey1 | 300 | $movkey 16($key),$rndkey1 |
301 | aes${dir} $rndkey0,$inout0 | ||
146 | aes${dir} $rndkey0,$inout1 | 302 | aes${dir} $rndkey0,$inout1 |
147 | lea 32($key),$key | 303 | lea 32($key),$key |
148 | aes${dir} $rndkey0,$inout2 | 304 | aes${dir} $rndkey0,$inout2 |
305 | $movkey ($key),$rndkey0 | ||
149 | jnz .L${dir}_loop3 | 306 | jnz .L${dir}_loop3 |
150 | 307 | ||
151 | aes${dir} $rndkey1,$inout0 | 308 | aes${dir} $rndkey1,$inout0 |
152 | $movkey ($key),$rndkey0 | ||
153 | aes${dir} $rndkey1,$inout1 | 309 | aes${dir} $rndkey1,$inout1 |
154 | aes${dir} $rndkey1,$inout2 | 310 | aes${dir} $rndkey1,$inout2 |
155 | aes${dir}last $rndkey0,$inout0 | 311 | aes${dir}last $rndkey0,$inout0 |
@@ -175,28 +331,28 @@ _aesni_${dir}rypt4: | |||
175 | shr \$1,$rounds | 331 | shr \$1,$rounds |
176 | $movkey 16($key),$rndkey1 | 332 | $movkey 16($key),$rndkey1 |
177 | lea 32($key),$key | 333 | lea 32($key),$key |
178 | pxor $rndkey0,$inout0 | 334 | xorps $rndkey0,$inout0 |
179 | pxor $rndkey0,$inout1 | 335 | xorps $rndkey0,$inout1 |
180 | pxor $rndkey0,$inout2 | 336 | xorps $rndkey0,$inout2 |
181 | pxor $rndkey0,$inout3 | 337 | xorps $rndkey0,$inout3 |
338 | $movkey ($key),$rndkey0 | ||
182 | 339 | ||
183 | .L${dir}_loop4: | 340 | .L${dir}_loop4: |
184 | aes${dir} $rndkey1,$inout0 | 341 | aes${dir} $rndkey1,$inout0 |
185 | $movkey ($key),$rndkey0 | ||
186 | aes${dir} $rndkey1,$inout1 | 342 | aes${dir} $rndkey1,$inout1 |
187 | dec $rounds | 343 | dec $rounds |
188 | aes${dir} $rndkey1,$inout2 | 344 | aes${dir} $rndkey1,$inout2 |
189 | aes${dir} $rndkey1,$inout3 | 345 | aes${dir} $rndkey1,$inout3 |
190 | aes${dir} $rndkey0,$inout0 | ||
191 | $movkey 16($key),$rndkey1 | 346 | $movkey 16($key),$rndkey1 |
347 | aes${dir} $rndkey0,$inout0 | ||
192 | aes${dir} $rndkey0,$inout1 | 348 | aes${dir} $rndkey0,$inout1 |
193 | lea 32($key),$key | 349 | lea 32($key),$key |
194 | aes${dir} $rndkey0,$inout2 | 350 | aes${dir} $rndkey0,$inout2 |
195 | aes${dir} $rndkey0,$inout3 | 351 | aes${dir} $rndkey0,$inout3 |
352 | $movkey ($key),$rndkey0 | ||
196 | jnz .L${dir}_loop4 | 353 | jnz .L${dir}_loop4 |
197 | 354 | ||
198 | aes${dir} $rndkey1,$inout0 | 355 | aes${dir} $rndkey1,$inout0 |
199 | $movkey ($key),$rndkey0 | ||
200 | aes${dir} $rndkey1,$inout1 | 356 | aes${dir} $rndkey1,$inout1 |
201 | aes${dir} $rndkey1,$inout2 | 357 | aes${dir} $rndkey1,$inout2 |
202 | aes${dir} $rndkey1,$inout3 | 358 | aes${dir} $rndkey1,$inout3 |
@@ -208,12 +364,158 @@ _aesni_${dir}rypt4: | |||
208 | .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 | 364 | .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 |
209 | ___ | 365 | ___ |
210 | } | 366 | } |
# [review note] aesni_generate6(dir) appends the private routine
# _aesni_${dir}rypt6 to $code; the calls further down pass "enc" and "dec".
# The routine pushes six blocks ($inout0..$inout5) through aes${dir} in
# lock-step. $key is advanced through the round-key schedule 32 bytes at a
# time and $rounds is halved and counted down, so the caller gets neither
# of them back unchanged.
367 | sub aesni_generate6 { | ||
368 | my $dir=shift; | ||
369 | # As already mentioned it takes in $key and $rounds, which are *not* | ||
370 | # preserved. $inout[0-5] is cipher/clear text... | ||
# [review note] Shape of the code emitted below:
#  * prologue: load the first two round keys, halve $rounds, XOR $rndkey0
#    into every block and issue the first $rndkey1 round on each block
#    right after that block's XOR;
#  * the prologue has thereby done the $rndkey1 half of a loop trip, so it
#    decrements $rounds itself and jumps to .L${dir}_loop6_enter, i.e. into
#    the $rndkey0 half of the loop;
#  * each full loop trip applies one $rndkey1 round and one $rndkey0 round
#    and reloads both keys; the closing jnz tests the flags left by the
#    preceding "dec $rounds", so nothing placed between them may touch ZF;
#  * epilogue: one more $rndkey1 round, then aes${dir}last with $rndkey0.
371 | $code.=<<___; | ||
372 | .type _aesni_${dir}rypt6,\@abi-omnipotent | ||
373 | .align 16 | ||
374 | _aesni_${dir}rypt6: | ||
375 | $movkey ($key),$rndkey0 | ||
376 | shr \$1,$rounds | ||
377 | $movkey 16($key),$rndkey1 | ||
378 | lea 32($key),$key | ||
379 | xorps $rndkey0,$inout0 | ||
380 | pxor $rndkey0,$inout1 | ||
381 | aes${dir} $rndkey1,$inout0 | ||
382 | pxor $rndkey0,$inout2 | ||
383 | aes${dir} $rndkey1,$inout1 | ||
384 | pxor $rndkey0,$inout3 | ||
385 | aes${dir} $rndkey1,$inout2 | ||
386 | pxor $rndkey0,$inout4 | ||
387 | aes${dir} $rndkey1,$inout3 | ||
388 | pxor $rndkey0,$inout5 | ||
389 | dec $rounds | ||
390 | aes${dir} $rndkey1,$inout4 | ||
391 | $movkey ($key),$rndkey0 | ||
392 | aes${dir} $rndkey1,$inout5 | ||
393 | jmp .L${dir}_loop6_enter | ||
394 | .align 16 | ||
395 | .L${dir}_loop6: | ||
396 | aes${dir} $rndkey1,$inout0 | ||
397 | aes${dir} $rndkey1,$inout1 | ||
398 | dec $rounds | ||
399 | aes${dir} $rndkey1,$inout2 | ||
400 | aes${dir} $rndkey1,$inout3 | ||
401 | aes${dir} $rndkey1,$inout4 | ||
402 | aes${dir} $rndkey1,$inout5 | ||
403 | .L${dir}_loop6_enter: # happens to be 16-byte aligned | ||
404 | $movkey 16($key),$rndkey1 | ||
405 | aes${dir} $rndkey0,$inout0 | ||
406 | aes${dir} $rndkey0,$inout1 | ||
407 | lea 32($key),$key | ||
408 | aes${dir} $rndkey0,$inout2 | ||
409 | aes${dir} $rndkey0,$inout3 | ||
410 | aes${dir} $rndkey0,$inout4 | ||
411 | aes${dir} $rndkey0,$inout5 | ||
412 | $movkey ($key),$rndkey0 | ||
413 | jnz .L${dir}_loop6 | ||
414 | |||
415 | aes${dir} $rndkey1,$inout0 | ||
416 | aes${dir} $rndkey1,$inout1 | ||
417 | aes${dir} $rndkey1,$inout2 | ||
418 | aes${dir} $rndkey1,$inout3 | ||
419 | aes${dir} $rndkey1,$inout4 | ||
420 | aes${dir} $rndkey1,$inout5 | ||
421 | aes${dir}last $rndkey0,$inout0 | ||
422 | aes${dir}last $rndkey0,$inout1 | ||
423 | aes${dir}last $rndkey0,$inout2 | ||
424 | aes${dir}last $rndkey0,$inout3 | ||
425 | aes${dir}last $rndkey0,$inout4 | ||
426 | aes${dir}last $rndkey0,$inout5 | ||
427 | ret | ||
428 | .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 | ||
429 | ___ | ||
430 | } | ||
# [review note] aesni_generate8(dir) appends the private routine
# _aesni_${dir}rypt8 to $code; the calls further down pass "enc" and "dec".
# It is the eight-block counterpart of the six-block generator: blocks live
# in $inout0..$inout7, $key is advanced 32 bytes per loop trip and $rounds
# is halved and counted down, so neither is preserved for the caller.
431 | sub aesni_generate8 { | ||
432 | my $dir=shift; | ||
433 | # As already mentioned it takes in $key and $rounds, which are *not* | ||
434 | # preserved. $inout[0-7] is cipher/clear text... | ||
# [review note] Shape of the code emitted below:
#  * prologue: load the first two round keys, halve $rounds, XOR $rndkey0
#    into every block and issue the first $rndkey1 round on each block
#    after that block's XOR; "dec $rounds" is done here as well because the
#    prologue stands in for the first half of a loop trip;
#  * unlike the six-block routine, the next $rndkey1 is fetched *before*
#    the .L${dir}_loop8_enter label, once at the end of the prologue and
#    once at the end of the loop's $rndkey1 half, so both paths arrive at
#    the label with $rndkey1 already loaded;
#  * the second half of the loop applies the $rndkey0 round, advances $key,
#    reloads $rndkey0 and branches on the flags left by "dec $rounds";
#  * epilogue: one more $rndkey1 round, then aes${dir}last with $rndkey0.
435 | $code.=<<___; | ||
436 | .type _aesni_${dir}rypt8,\@abi-omnipotent | ||
437 | .align 16 | ||
438 | _aesni_${dir}rypt8: | ||
439 | $movkey ($key),$rndkey0 | ||
440 | shr \$1,$rounds | ||
441 | $movkey 16($key),$rndkey1 | ||
442 | lea 32($key),$key | ||
443 | xorps $rndkey0,$inout0 | ||
444 | xorps $rndkey0,$inout1 | ||
445 | aes${dir} $rndkey1,$inout0 | ||
446 | pxor $rndkey0,$inout2 | ||
447 | aes${dir} $rndkey1,$inout1 | ||
448 | pxor $rndkey0,$inout3 | ||
449 | aes${dir} $rndkey1,$inout2 | ||
450 | pxor $rndkey0,$inout4 | ||
451 | aes${dir} $rndkey1,$inout3 | ||
452 | pxor $rndkey0,$inout5 | ||
453 | dec $rounds | ||
454 | aes${dir} $rndkey1,$inout4 | ||
455 | pxor $rndkey0,$inout6 | ||
456 | aes${dir} $rndkey1,$inout5 | ||
457 | pxor $rndkey0,$inout7 | ||
458 | $movkey ($key),$rndkey0 | ||
459 | aes${dir} $rndkey1,$inout6 | ||
460 | aes${dir} $rndkey1,$inout7 | ||
461 | $movkey 16($key),$rndkey1 | ||
462 | jmp .L${dir}_loop8_enter | ||
463 | .align 16 | ||
464 | .L${dir}_loop8: | ||
465 | aes${dir} $rndkey1,$inout0 | ||
466 | aes${dir} $rndkey1,$inout1 | ||
467 | dec $rounds | ||
468 | aes${dir} $rndkey1,$inout2 | ||
469 | aes${dir} $rndkey1,$inout3 | ||
470 | aes${dir} $rndkey1,$inout4 | ||
471 | aes${dir} $rndkey1,$inout5 | ||
472 | aes${dir} $rndkey1,$inout6 | ||
473 | aes${dir} $rndkey1,$inout7 | ||
474 | $movkey 16($key),$rndkey1 | ||
475 | .L${dir}_loop8_enter: # happens to be 16-byte aligned | ||
476 | aes${dir} $rndkey0,$inout0 | ||
477 | aes${dir} $rndkey0,$inout1 | ||
478 | lea 32($key),$key | ||
479 | aes${dir} $rndkey0,$inout2 | ||
480 | aes${dir} $rndkey0,$inout3 | ||
481 | aes${dir} $rndkey0,$inout4 | ||
482 | aes${dir} $rndkey0,$inout5 | ||
483 | aes${dir} $rndkey0,$inout6 | ||
484 | aes${dir} $rndkey0,$inout7 | ||
485 | $movkey ($key),$rndkey0 | ||
486 | jnz .L${dir}_loop8 | ||
487 | |||
488 | aes${dir} $rndkey1,$inout0 | ||
489 | aes${dir} $rndkey1,$inout1 | ||
490 | aes${dir} $rndkey1,$inout2 | ||
491 | aes${dir} $rndkey1,$inout3 | ||
492 | aes${dir} $rndkey1,$inout4 | ||
493 | aes${dir} $rndkey1,$inout5 | ||
494 | aes${dir} $rndkey1,$inout6 | ||
495 | aes${dir} $rndkey1,$inout7 | ||
496 | aes${dir}last $rndkey0,$inout0 | ||
497 | aes${dir}last $rndkey0,$inout1 | ||
498 | aes${dir}last $rndkey0,$inout2 | ||
499 | aes${dir}last $rndkey0,$inout3 | ||
500 | aes${dir}last $rndkey0,$inout4 | ||
501 | aes${dir}last $rndkey0,$inout5 | ||
502 | aes${dir}last $rndkey0,$inout6 | ||
503 | aes${dir}last $rndkey0,$inout7 | ||
504 | ret | ||
505 | .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 | ||
506 | ___ | ||
507 | } | ||
# [review note] Emit the 3x/4x/6x/8x interleaved helper routines. The "enc"
# variants are generated only when $PREFIX is "aesni"; the "dec" variants
# are generated unconditionally.
211 | &aesni_generate3("enc") if ($PREFIX eq "aesni"); | 508 | &aesni_generate3("enc") if ($PREFIX eq "aesni"); |
212 | &aesni_generate3("dec"); | 509 | &aesni_generate3("dec"); |
213 | &aesni_generate4("enc") if ($PREFIX eq "aesni"); | 510 | &aesni_generate4("enc") if ($PREFIX eq "aesni"); |
214 | &aesni_generate4("dec"); | 511 | &aesni_generate4("dec"); |
512 | &aesni_generate6("enc") if ($PREFIX eq "aesni"); | ||
513 | &aesni_generate6("dec"); | ||
514 | &aesni_generate8("enc") if ($PREFIX eq "aesni"); | ||
515 | &aesni_generate8("dec"); | ||
215 | 516 | ||
216 | if ($PREFIX eq "aesni") { | 517 | if ($PREFIX eq "aesni") { |
518 | ######################################################################## | ||
217 | # void aesni_ecb_encrypt (const void *in, void *out, | 519 | # void aesni_ecb_encrypt (const void *in, void *out, |
218 | # size_t length, const AES_KEY *key, | 520 | # size_t length, const AES_KEY *key, |
219 | # int enc); | 521 | # int enc); |
@@ -222,54 +524,98 @@ $code.=<<___; | |||
222 | .type aesni_ecb_encrypt,\@function,5 | 524 | .type aesni_ecb_encrypt,\@function,5 |
223 | .align 16 | 525 | .align 16 |
224 | aesni_ecb_encrypt: | 526 | aesni_ecb_encrypt: |
225 | cmp \$16,$len # check length | ||
226 | jb .Lecb_ret | ||
227 | |||
228 | mov 240($key),$rounds # pull $rounds | ||
229 | and \$-16,$len | 527 | and \$-16,$len |
528 | jz .Lecb_ret | ||
529 | |||
530 | mov 240($key),$rounds # key->rounds | ||
531 | $movkey ($key),$rndkey0 | ||
230 | mov $key,$key_ # backup $key | 532 | mov $key,$key_ # backup $key |
231 | test %r8d,%r8d # 5th argument | ||
232 | mov $rounds,$rnds_ # backup $rounds | 533 | mov $rounds,$rnds_ # backup $rounds |
534 | test %r8d,%r8d # 5th argument | ||
233 | jz .Lecb_decrypt | 535 | jz .Lecb_decrypt |
234 | #--------------------------- ECB ENCRYPT ------------------------------# | 536 | #--------------------------- ECB ENCRYPT ------------------------------# |
235 | sub \$0x40,$len | 537 | cmp \$0x80,$len |
236 | jbe .Lecb_enc_tail | 538 | jb .Lecb_enc_tail |
237 | jmp .Lecb_enc_loop3 | 539 | |
540 | movdqu ($inp),$inout0 | ||
541 | movdqu 0x10($inp),$inout1 | ||
542 | movdqu 0x20($inp),$inout2 | ||
543 | movdqu 0x30($inp),$inout3 | ||
544 | movdqu 0x40($inp),$inout4 | ||
545 | movdqu 0x50($inp),$inout5 | ||
546 | movdqu 0x60($inp),$inout6 | ||
547 | movdqu 0x70($inp),$inout7 | ||
548 | lea 0x80($inp),$inp | ||
549 | sub \$0x80,$len | ||
550 | jmp .Lecb_enc_loop8_enter | ||
238 | .align 16 | 551 | .align 16 |
239 | .Lecb_enc_loop3: | 552 | .Lecb_enc_loop8: |
240 | movups ($inp),$inout0 | 553 | movups $inout0,($out) |
241 | movups 0x10($inp),$inout1 | ||
242 | movups 0x20($inp),$inout2 | ||
243 | call _aesni_encrypt3 | ||
244 | sub \$0x30,$len | ||
245 | lea 0x30($inp),$inp | ||
246 | lea 0x30($out),$out | ||
247 | movups $inout0,-0x30($out) | ||
248 | mov $rnds_,$rounds # restore $rounds | ||
249 | movups $inout1,-0x20($out) | ||
250 | mov $key_,$key # restore $key | 554 | mov $key_,$key # restore $key |
251 | movups $inout2,-0x10($out) | 555 | movdqu ($inp),$inout0 |
252 | ja .Lecb_enc_loop3 | 556 | mov $rnds_,$rounds # restore $rounds |
557 | movups $inout1,0x10($out) | ||
558 | movdqu 0x10($inp),$inout1 | ||
559 | movups $inout2,0x20($out) | ||
560 | movdqu 0x20($inp),$inout2 | ||
561 | movups $inout3,0x30($out) | ||
562 | movdqu 0x30($inp),$inout3 | ||
563 | movups $inout4,0x40($out) | ||
564 | movdqu 0x40($inp),$inout4 | ||
565 | movups $inout5,0x50($out) | ||
566 | movdqu 0x50($inp),$inout5 | ||
567 | movups $inout6,0x60($out) | ||
568 | movdqu 0x60($inp),$inout6 | ||
569 | movups $inout7,0x70($out) | ||
570 | lea 0x80($out),$out | ||
571 | movdqu 0x70($inp),$inout7 | ||
572 | lea 0x80($inp),$inp | ||
573 | .Lecb_enc_loop8_enter: | ||
574 | |||
575 | call _aesni_encrypt8 | ||
576 | |||
577 | sub \$0x80,$len | ||
578 | jnc .Lecb_enc_loop8 | ||
253 | 579 | ||
254 | .Lecb_enc_tail: | 580 | movups $inout0,($out) |
255 | add \$0x40,$len | 581 | mov $key_,$key # restore $key |
582 | movups $inout1,0x10($out) | ||
583 | mov $rnds_,$rounds # restore $rounds | ||
584 | movups $inout2,0x20($out) | ||
585 | movups $inout3,0x30($out) | ||
586 | movups $inout4,0x40($out) | ||
587 | movups $inout5,0x50($out) | ||
588 | movups $inout6,0x60($out) | ||
589 | movups $inout7,0x70($out) | ||
590 | lea 0x80($out),$out | ||
591 | add \$0x80,$len | ||
256 | jz .Lecb_ret | 592 | jz .Lecb_ret |
257 | 593 | ||
258 | cmp \$0x10,$len | 594 | .Lecb_enc_tail: |
259 | movups ($inp),$inout0 | 595 | movups ($inp),$inout0 |
260 | je .Lecb_enc_one | ||
261 | cmp \$0x20,$len | 596 | cmp \$0x20,$len |
597 | jb .Lecb_enc_one | ||
262 | movups 0x10($inp),$inout1 | 598 | movups 0x10($inp),$inout1 |
263 | je .Lecb_enc_two | 599 | je .Lecb_enc_two |
264 | cmp \$0x30,$len | ||
265 | movups 0x20($inp),$inout2 | 600 | movups 0x20($inp),$inout2 |
266 | je .Lecb_enc_three | 601 | cmp \$0x40,$len |
602 | jb .Lecb_enc_three | ||
267 | movups 0x30($inp),$inout3 | 603 | movups 0x30($inp),$inout3 |
268 | call _aesni_encrypt4 | 604 | je .Lecb_enc_four |
605 | movups 0x40($inp),$inout4 | ||
606 | cmp \$0x60,$len | ||
607 | jb .Lecb_enc_five | ||
608 | movups 0x50($inp),$inout5 | ||
609 | je .Lecb_enc_six | ||
610 | movdqu 0x60($inp),$inout6 | ||
611 | call _aesni_encrypt8 | ||
269 | movups $inout0,($out) | 612 | movups $inout0,($out) |
270 | movups $inout1,0x10($out) | 613 | movups $inout1,0x10($out) |
271 | movups $inout2,0x20($out) | 614 | movups $inout2,0x20($out) |
272 | movups $inout3,0x30($out) | 615 | movups $inout3,0x30($out) |
616 | movups $inout4,0x40($out) | ||
617 | movups $inout5,0x50($out) | ||
618 | movups $inout6,0x60($out) | ||
273 | jmp .Lecb_ret | 619 | jmp .Lecb_ret |
274 | .align 16 | 620 | .align 16 |
275 | .Lecb_enc_one: | 621 | .Lecb_enc_one: |
@@ -280,6 +626,7 @@ $code.=<<___; | |||
280 | jmp .Lecb_ret | 626 | jmp .Lecb_ret |
281 | .align 16 | 627 | .align 16 |
282 | .Lecb_enc_two: | 628 | .Lecb_enc_two: |
629 | xorps $inout2,$inout2 | ||
283 | call _aesni_encrypt3 | 630 | call _aesni_encrypt3 |
284 | movups $inout0,($out) | 631 | movups $inout0,($out) |
285 | movups $inout1,0x10($out) | 632 | movups $inout1,0x10($out) |
@@ -291,47 +638,121 @@ $code.=<<___; | |||
291 | movups $inout1,0x10($out) | 638 | movups $inout1,0x10($out) |
292 | movups $inout2,0x20($out) | 639 | movups $inout2,0x20($out) |
293 | jmp .Lecb_ret | 640 | jmp .Lecb_ret |
641 | .align 16 | ||
642 | .Lecb_enc_four: | ||
643 | call _aesni_encrypt4 | ||
644 | movups $inout0,($out) | ||
645 | movups $inout1,0x10($out) | ||
646 | movups $inout2,0x20($out) | ||
647 | movups $inout3,0x30($out) | ||
648 | jmp .Lecb_ret | ||
649 | .align 16 | ||
650 | .Lecb_enc_five: | ||
651 | xorps $inout5,$inout5 | ||
652 | call _aesni_encrypt6 | ||
653 | movups $inout0,($out) | ||
654 | movups $inout1,0x10($out) | ||
655 | movups $inout2,0x20($out) | ||
656 | movups $inout3,0x30($out) | ||
657 | movups $inout4,0x40($out) | ||
658 | jmp .Lecb_ret | ||
659 | .align 16 | ||
660 | .Lecb_enc_six: | ||
661 | call _aesni_encrypt6 | ||
662 | movups $inout0,($out) | ||
663 | movups $inout1,0x10($out) | ||
664 | movups $inout2,0x20($out) | ||
665 | movups $inout3,0x30($out) | ||
666 | movups $inout4,0x40($out) | ||
667 | movups $inout5,0x50($out) | ||
668 | jmp .Lecb_ret | ||
294 | #--------------------------- ECB DECRYPT ------------------------------# | 669 | #--------------------------- ECB DECRYPT ------------------------------# |
295 | .align 16 | 670 | .align 16 |
296 | .Lecb_decrypt: | 671 | .Lecb_decrypt: |
297 | sub \$0x40,$len | 672 | cmp \$0x80,$len |
298 | jbe .Lecb_dec_tail | 673 | jb .Lecb_dec_tail |
299 | jmp .Lecb_dec_loop3 | 674 | |
675 | movdqu ($inp),$inout0 | ||
676 | movdqu 0x10($inp),$inout1 | ||
677 | movdqu 0x20($inp),$inout2 | ||
678 | movdqu 0x30($inp),$inout3 | ||
679 | movdqu 0x40($inp),$inout4 | ||
680 | movdqu 0x50($inp),$inout5 | ||
681 | movdqu 0x60($inp),$inout6 | ||
682 | movdqu 0x70($inp),$inout7 | ||
683 | lea 0x80($inp),$inp | ||
684 | sub \$0x80,$len | ||
685 | jmp .Lecb_dec_loop8_enter | ||
300 | .align 16 | 686 | .align 16 |
301 | .Lecb_dec_loop3: | 687 | .Lecb_dec_loop8: |
302 | movups ($inp),$inout0 | 688 | movups $inout0,($out) |
303 | movups 0x10($inp),$inout1 | ||
304 | movups 0x20($inp),$inout2 | ||
305 | call _aesni_decrypt3 | ||
306 | sub \$0x30,$len | ||
307 | lea 0x30($inp),$inp | ||
308 | lea 0x30($out),$out | ||
309 | movups $inout0,-0x30($out) | ||
310 | mov $rnds_,$rounds # restore $rounds | ||
311 | movups $inout1,-0x20($out) | ||
312 | mov $key_,$key # restore $key | 689 | mov $key_,$key # restore $key |
313 | movups $inout2,-0x10($out) | 690 | movdqu ($inp),$inout0 |
314 | ja .Lecb_dec_loop3 | 691 | mov $rnds_,$rounds # restore $rounds |
692 | movups $inout1,0x10($out) | ||
693 | movdqu 0x10($inp),$inout1 | ||
694 | movups $inout2,0x20($out) | ||
695 | movdqu 0x20($inp),$inout2 | ||
696 | movups $inout3,0x30($out) | ||
697 | movdqu 0x30($inp),$inout3 | ||
698 | movups $inout4,0x40($out) | ||
699 | movdqu 0x40($inp),$inout4 | ||
700 | movups $inout5,0x50($out) | ||
701 | movdqu 0x50($inp),$inout5 | ||
702 | movups $inout6,0x60($out) | ||
703 | movdqu 0x60($inp),$inout6 | ||
704 | movups $inout7,0x70($out) | ||
705 | lea 0x80($out),$out | ||
706 | movdqu 0x70($inp),$inout7 | ||
707 | lea 0x80($inp),$inp | ||
708 | .Lecb_dec_loop8_enter: | ||
709 | |||
710 | call _aesni_decrypt8 | ||
711 | |||
712 | $movkey ($key_),$rndkey0 | ||
713 | sub \$0x80,$len | ||
714 | jnc .Lecb_dec_loop8 | ||
315 | 715 | ||
316 | .Lecb_dec_tail: | 716 | movups $inout0,($out) |
317 | add \$0x40,$len | 717 | mov $key_,$key # restore $key |
718 | movups $inout1,0x10($out) | ||
719 | mov $rnds_,$rounds # restore $rounds | ||
720 | movups $inout2,0x20($out) | ||
721 | movups $inout3,0x30($out) | ||
722 | movups $inout4,0x40($out) | ||
723 | movups $inout5,0x50($out) | ||
724 | movups $inout6,0x60($out) | ||
725 | movups $inout7,0x70($out) | ||
726 | lea 0x80($out),$out | ||
727 | add \$0x80,$len | ||
318 | jz .Lecb_ret | 728 | jz .Lecb_ret |
319 | 729 | ||
320 | cmp \$0x10,$len | 730 | .Lecb_dec_tail: |
321 | movups ($inp),$inout0 | 731 | movups ($inp),$inout0 |
322 | je .Lecb_dec_one | ||
323 | cmp \$0x20,$len | 732 | cmp \$0x20,$len |
733 | jb .Lecb_dec_one | ||
324 | movups 0x10($inp),$inout1 | 734 | movups 0x10($inp),$inout1 |
325 | je .Lecb_dec_two | 735 | je .Lecb_dec_two |
326 | cmp \$0x30,$len | ||
327 | movups 0x20($inp),$inout2 | 736 | movups 0x20($inp),$inout2 |
328 | je .Lecb_dec_three | 737 | cmp \$0x40,$len |
738 | jb .Lecb_dec_three | ||
329 | movups 0x30($inp),$inout3 | 739 | movups 0x30($inp),$inout3 |
330 | call _aesni_decrypt4 | 740 | je .Lecb_dec_four |
741 | movups 0x40($inp),$inout4 | ||
742 | cmp \$0x60,$len | ||
743 | jb .Lecb_dec_five | ||
744 | movups 0x50($inp),$inout5 | ||
745 | je .Lecb_dec_six | ||
746 | movups 0x60($inp),$inout6 | ||
747 | $movkey ($key),$rndkey0 | ||
748 | call _aesni_decrypt8 | ||
331 | movups $inout0,($out) | 749 | movups $inout0,($out) |
332 | movups $inout1,0x10($out) | 750 | movups $inout1,0x10($out) |
333 | movups $inout2,0x20($out) | 751 | movups $inout2,0x20($out) |
334 | movups $inout3,0x30($out) | 752 | movups $inout3,0x30($out) |
753 | movups $inout4,0x40($out) | ||
754 | movups $inout5,0x50($out) | ||
755 | movups $inout6,0x60($out) | ||
335 | jmp .Lecb_ret | 756 | jmp .Lecb_ret |
336 | .align 16 | 757 | .align 16 |
337 | .Lecb_dec_one: | 758 | .Lecb_dec_one: |
@@ -342,6 +763,7 @@ $code.=<<___; | |||
342 | jmp .Lecb_ret | 763 | jmp .Lecb_ret |
343 | .align 16 | 764 | .align 16 |
344 | .Lecb_dec_two: | 765 | .Lecb_dec_two: |
766 | xorps $inout2,$inout2 | ||
345 | call _aesni_decrypt3 | 767 | call _aesni_decrypt3 |
346 | movups $inout0,($out) | 768 | movups $inout0,($out) |
347 | movups $inout1,0x10($out) | 769 | movups $inout1,0x10($out) |
@@ -352,17 +774,1353 @@ $code.=<<___; | |||
352 | movups $inout0,($out) | 774 | movups $inout0,($out) |
353 | movups $inout1,0x10($out) | 775 | movups $inout1,0x10($out) |
354 | movups $inout2,0x20($out) | 776 | movups $inout2,0x20($out) |
777 | jmp .Lecb_ret | ||
778 | .align 16 | ||
779 | .Lecb_dec_four: | ||
780 | call _aesni_decrypt4 | ||
781 | movups $inout0,($out) | ||
782 | movups $inout1,0x10($out) | ||
783 | movups $inout2,0x20($out) | ||
784 | movups $inout3,0x30($out) | ||
785 | jmp .Lecb_ret | ||
786 | .align 16 | ||
787 | .Lecb_dec_five: | ||
788 | xorps $inout5,$inout5 | ||
789 | call _aesni_decrypt6 | ||
790 | movups $inout0,($out) | ||
791 | movups $inout1,0x10($out) | ||
792 | movups $inout2,0x20($out) | ||
793 | movups $inout3,0x30($out) | ||
794 | movups $inout4,0x40($out) | ||
795 | jmp .Lecb_ret | ||
796 | .align 16 | ||
797 | .Lecb_dec_six: | ||
798 | call _aesni_decrypt6 | ||
799 | movups $inout0,($out) | ||
800 | movups $inout1,0x10($out) | ||
801 | movups $inout2,0x20($out) | ||
802 | movups $inout3,0x30($out) | ||
803 | movups $inout4,0x40($out) | ||
804 | movups $inout5,0x50($out) | ||
355 | 805 | ||
356 | .Lecb_ret: | 806 | .Lecb_ret: |
357 | ret | 807 | ret |
358 | .size aesni_ecb_encrypt,.-aesni_ecb_encrypt | 808 | .size aesni_ecb_encrypt,.-aesni_ecb_encrypt |
359 | ___ | 809 | ___ |
810 | |||
811 | { | ||
812 | ###################################################################### | ||
813 | # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, | ||
814 | # size_t blocks, const AES_KEY *key, | ||
815 | # const char *ivec,char *cmac); | ||
816 | # | ||
817 | # Handles only complete blocks, operates on 64-bit counter and | ||
818 | # does not update *ivec! Nor does it finalize CMAC value | ||
819 | # (see engine/eng_aesni.c for details) | ||
820 | # | ||
821 | { | ||
# [review note] Register aliases private to the two CCM64 routines in this
# brace block. $cmac is the sixth argument (%r9); %xmm6 and %xmm7 receive the
# constants loaded from .Lincrement64 and .Lbswap_mask respectively.
822 | my $cmac="%r9"; # 6th argument | ||
823 | |||
824 | my $increment="%xmm6"; | ||
825 | my $bswap_mask="%xmm7"; | ||
826 | |||
# [review note] aesni_ccm64_encrypt_blocks, as emitted by the pieces below:
#  * Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9; they are
#    restored just before "ret";
#  * setup: $rounds is loaded from 240($key) and halved, $key_ and $rnds_
#    keep copies of $key and of the halved count, $inout1 is loaded from
#    ($cmac), $inout0 receives the IV exactly as read from ($ivp), and $iv
#    itself is then shuffled with $bswap_mask before paddq steps it;
#  * per block (.Lccm64_enc_outer): $rndkey0 is XORed into the counter
#    block, and $in0^$rndkey0 is XORed into the CMAC state, so the input XOR
#    and the first-round-key XOR of the CMAC lane share one instruction; the
#    counter block and the CMAC state then go through the aesenc rounds
#    two-way interleaved;
#  * afterwards $in0 ^= E(counter) is stored to ($out), the next counter
#    block is rebuilt from the incremented $iv, and the loop repeats until
#    "dec $len" reaches zero (the closing jnz consumes its flags);
#  * finally the CMAC state is written back to ($cmac).
# NOTE(review): nothing here tests $len before the first "dec", so callers
# presumably never pass a zero block count -- confirm.
827 | $code.=<<___; | ||
828 | .globl aesni_ccm64_encrypt_blocks | ||
829 | .type aesni_ccm64_encrypt_blocks,\@function,6 | ||
830 | .align 16 | ||
831 | aesni_ccm64_encrypt_blocks: | ||
832 | ___ | ||
833 | $code.=<<___ if ($win64); | ||
834 | lea -0x58(%rsp),%rsp | ||
835 | movaps %xmm6,(%rsp) | ||
836 | movaps %xmm7,0x10(%rsp) | ||
837 | movaps %xmm8,0x20(%rsp) | ||
838 | movaps %xmm9,0x30(%rsp) | ||
839 | .Lccm64_enc_body: | ||
840 | ___ | ||
841 | $code.=<<___; | ||
842 | mov 240($key),$rounds # key->rounds | ||
843 | movdqu ($ivp),$iv | ||
844 | movdqa .Lincrement64(%rip),$increment | ||
845 | movdqa .Lbswap_mask(%rip),$bswap_mask | ||
846 | |||
847 | shr \$1,$rounds | ||
848 | lea 0($key),$key_ | ||
849 | movdqu ($cmac),$inout1 | ||
850 | movdqa $iv,$inout0 | ||
851 | mov $rounds,$rnds_ | ||
852 | pshufb $bswap_mask,$iv | ||
853 | jmp .Lccm64_enc_outer | ||
854 | .align 16 | ||
855 | .Lccm64_enc_outer: | ||
856 | $movkey ($key_),$rndkey0 | ||
857 | mov $rnds_,$rounds | ||
858 | movups ($inp),$in0 # load inp | ||
859 | |||
860 | xorps $rndkey0,$inout0 # counter | ||
861 | $movkey 16($key_),$rndkey1 | ||
862 | xorps $in0,$rndkey0 | ||
863 | lea 32($key_),$key | ||
864 | xorps $rndkey0,$inout1 # cmac^=inp | ||
865 | $movkey ($key),$rndkey0 | ||
866 | |||
867 | .Lccm64_enc2_loop: | ||
868 | aesenc $rndkey1,$inout0 | ||
869 | dec $rounds | ||
870 | aesenc $rndkey1,$inout1 | ||
871 | $movkey 16($key),$rndkey1 | ||
872 | aesenc $rndkey0,$inout0 | ||
873 | lea 32($key),$key | ||
874 | aesenc $rndkey0,$inout1 | ||
875 | $movkey 0($key),$rndkey0 | ||
876 | jnz .Lccm64_enc2_loop | ||
877 | aesenc $rndkey1,$inout0 | ||
878 | aesenc $rndkey1,$inout1 | ||
879 | paddq $increment,$iv | ||
880 | aesenclast $rndkey0,$inout0 | ||
881 | aesenclast $rndkey0,$inout1 | ||
882 | |||
883 | dec $len | ||
884 | lea 16($inp),$inp | ||
885 | xorps $inout0,$in0 # inp ^= E(iv) | ||
886 | movdqa $iv,$inout0 | ||
887 | movups $in0,($out) # save output | ||
888 | lea 16($out),$out | ||
889 | pshufb $bswap_mask,$inout0 | ||
890 | jnz .Lccm64_enc_outer | ||
891 | |||
892 | movups $inout1,($cmac) | ||
893 | ___ | ||
894 | $code.=<<___ if ($win64); | ||
895 | movaps (%rsp),%xmm6 | ||
896 | movaps 0x10(%rsp),%xmm7 | ||
897 | movaps 0x20(%rsp),%xmm8 | ||
898 | movaps 0x30(%rsp),%xmm9 | ||
899 | lea 0x58(%rsp),%rsp | ||
900 | .Lccm64_enc_ret: | ||
901 | ___ | ||
902 | $code.=<<___; | ||
903 | ret | ||
904 | .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks | ||
905 | ___ | ||
906 | ###################################################################### | ||
# [review note] aesni_ccm64_decrypt_blocks. It uses the same Win64 frame and
# the same two constants as the encrypt routine, but the loop is rotated:
#  * the first counter block is handled on its own by the code that
#    aesni_generate1("enc",$key,$rounds) emits (helper defined outside this
#    excerpt; presumably a single-block encryption of $inout0 -- confirm);
#  * .Lccm64_dec_outer turns the pending input block into output with
#    $in0 ^= E(counter), stores it, rebuilds the next counter block and
#    leaves through .Lccm64_dec_break once "sub \$1,$len" hits zero;
#  * otherwise the block just produced is folded into the CMAC state
#    ($in0^$rndkey0 XORed into $inout1), and the next counter block and the
#    CMAC state go through the two-way interleaved aesenc rounds, with the
#    next input block loaded before the final rounds;
#  * here $rnds_ holds the unhalved round count, so every pass reloads
#    $rounds from it and applies "shr \$1" again.
907 | $code.=<<___; | ||
908 | .globl aesni_ccm64_decrypt_blocks | ||
909 | .type aesni_ccm64_decrypt_blocks,\@function,6 | ||
910 | .align 16 | ||
911 | aesni_ccm64_decrypt_blocks: | ||
912 | ___ | ||
913 | $code.=<<___ if ($win64); | ||
914 | lea -0x58(%rsp),%rsp | ||
915 | movaps %xmm6,(%rsp) | ||
916 | movaps %xmm7,0x10(%rsp) | ||
917 | movaps %xmm8,0x20(%rsp) | ||
918 | movaps %xmm9,0x30(%rsp) | ||
919 | .Lccm64_dec_body: | ||
920 | ___ | ||
921 | $code.=<<___; | ||
922 | mov 240($key),$rounds # key->rounds | ||
923 | movups ($ivp),$iv | ||
924 | movdqu ($cmac),$inout1 | ||
925 | movdqa .Lincrement64(%rip),$increment | ||
926 | movdqa .Lbswap_mask(%rip),$bswap_mask | ||
927 | |||
928 | movaps $iv,$inout0 | ||
929 | mov $rounds,$rnds_ | ||
930 | mov $key,$key_ | ||
931 | pshufb $bswap_mask,$iv | ||
932 | ___ | ||
933 | &aesni_generate1("enc",$key,$rounds); | ||
934 | $code.=<<___; | ||
935 | movups ($inp),$in0 # load inp | ||
936 | paddq $increment,$iv | ||
937 | lea 16($inp),$inp | ||
938 | jmp .Lccm64_dec_outer | ||
939 | .align 16 | ||
940 | .Lccm64_dec_outer: | ||
941 | xorps $inout0,$in0 # inp ^= E(iv) | ||
942 | movdqa $iv,$inout0 | ||
943 | mov $rnds_,$rounds | ||
944 | movups $in0,($out) # save output | ||
945 | lea 16($out),$out | ||
946 | pshufb $bswap_mask,$inout0 | ||
947 | |||
948 | sub \$1,$len | ||
949 | jz .Lccm64_dec_break | ||
950 | |||
951 | $movkey ($key_),$rndkey0 | ||
952 | shr \$1,$rounds | ||
953 | $movkey 16($key_),$rndkey1 | ||
954 | xorps $rndkey0,$in0 | ||
955 | lea 32($key_),$key | ||
956 | xorps $rndkey0,$inout0 | ||
957 | xorps $in0,$inout1 # cmac^=out | ||
958 | $movkey ($key),$rndkey0 | ||
959 | |||
960 | .Lccm64_dec2_loop: | ||
961 | aesenc $rndkey1,$inout0 | ||
962 | dec $rounds | ||
963 | aesenc $rndkey1,$inout1 | ||
964 | $movkey 16($key),$rndkey1 | ||
965 | aesenc $rndkey0,$inout0 | ||
966 | lea 32($key),$key | ||
967 | aesenc $rndkey0,$inout1 | ||
968 | $movkey 0($key),$rndkey0 | ||
969 | jnz .Lccm64_dec2_loop | ||
970 | movups ($inp),$in0 # load inp | ||
971 | paddq $increment,$iv | ||
972 | aesenc $rndkey1,$inout0 | ||
973 | aesenc $rndkey1,$inout1 | ||
974 | lea 16($inp),$inp | ||
975 | aesenclast $rndkey0,$inout0 | ||
976 | aesenclast $rndkey0,$inout1 | ||
977 | jmp .Lccm64_dec_outer | ||
978 | |||
979 | .align 16 | ||
980 | .Lccm64_dec_break: | ||
981 | #xorps $in0,$inout1 # cmac^=out | ||
982 | ___ | ||
# [review note] Last block: the CMAC update is delegated to aesni_generate1
# with $inout1 and $in0 passed explicitly. The commented-out xorps above
# suggests the helper performs that XOR itself -- confirm in its definition.
983 | &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); | ||
984 | $code.=<<___; | ||
985 | movups $inout1,($cmac) | ||
986 | ___ | ||
987 | $code.=<<___ if ($win64); | ||
988 | movaps (%rsp),%xmm6 | ||
989 | movaps 0x10(%rsp),%xmm7 | ||
990 | movaps 0x20(%rsp),%xmm8 | ||
991 | movaps 0x30(%rsp),%xmm9 | ||
992 | lea 0x58(%rsp),%rsp | ||
993 | .Lccm64_dec_ret: | ||
994 | ___ | ||
995 | $code.=<<___; | ||
996 | ret | ||
997 | .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks | ||
998 | ___ | ||
999 | } | ||
1000 | ###################################################################### | ||
1001 | # void aesni_ctr32_encrypt_blocks (const void *in, void *out, | ||
1002 | # size_t blocks, const AES_KEY *key, | ||
1003 | # const char *ivec); | ||
1004 | # | ||
1005 | # Handles only complete blocks, operates on 32-bit counter and | ||
1006 | # does not update *ivec! (see engine/eng_aesni.c for details) | ||
1007 | # | ||
1008 | { | ||
1009 | my $reserved = $win64?0:-0x28; | ||
1010 | my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11)); | ||
1011 | my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14"); | ||
1012 | my $bswap_mask="%xmm15"; | ||
1013 | |||
1014 | $code.=<<___; | ||
1015 | .globl aesni_ctr32_encrypt_blocks | ||
1016 | .type aesni_ctr32_encrypt_blocks,\@function,5 | ||
1017 | .align 16 | ||
1018 | aesni_ctr32_encrypt_blocks: | ||
1019 | ___ | ||
1020 | $code.=<<___ if ($win64); | ||
1021 | lea -0xc8(%rsp),%rsp | ||
1022 | movaps %xmm6,0x20(%rsp) | ||
1023 | movaps %xmm7,0x30(%rsp) | ||
1024 | movaps %xmm8,0x40(%rsp) | ||
1025 | movaps %xmm9,0x50(%rsp) | ||
1026 | movaps %xmm10,0x60(%rsp) | ||
1027 | movaps %xmm11,0x70(%rsp) | ||
1028 | movaps %xmm12,0x80(%rsp) | ||
1029 | movaps %xmm13,0x90(%rsp) | ||
1030 | movaps %xmm14,0xa0(%rsp) | ||
1031 | movaps %xmm15,0xb0(%rsp) | ||
1032 | .Lctr32_body: | ||
1033 | ___ | ||
1034 | $code.=<<___; | ||
1035 | cmp \$1,$len | ||
1036 | je .Lctr32_one_shortcut | ||
1037 | |||
1038 | movdqu ($ivp),$ivec | ||
1039 | movdqa .Lbswap_mask(%rip),$bswap_mask | ||
1040 | xor $rounds,$rounds | ||
1041 | pextrd \$3,$ivec,$rnds_ # pull 32-bit counter | ||
1042 | pinsrd \$3,$rounds,$ivec # wipe 32-bit counter | ||
1043 | |||
1044 | mov 240($key),$rounds # key->rounds | ||
1045 | bswap $rnds_ | ||
1046 | pxor $iv0,$iv0 # vector of 3 32-bit counters | ||
1047 | pxor $iv1,$iv1 # vector of 3 32-bit counters | ||
1048 | pinsrd \$0,$rnds_,$iv0 | ||
1049 | lea 3($rnds_),$key_ | ||
1050 | pinsrd \$0,$key_,$iv1 | ||
1051 | inc $rnds_ | ||
1052 | pinsrd \$1,$rnds_,$iv0 | ||
1053 | inc $key_ | ||
1054 | pinsrd \$1,$key_,$iv1 | ||
1055 | inc $rnds_ | ||
1056 | pinsrd \$2,$rnds_,$iv0 | ||
1057 | inc $key_ | ||
1058 | pinsrd \$2,$key_,$iv1 | ||
1059 | movdqa $iv0,$reserved(%rsp) | ||
1060 | pshufb $bswap_mask,$iv0 | ||
1061 | movdqa $iv1,`$reserved+0x10`(%rsp) | ||
1062 | pshufb $bswap_mask,$iv1 | ||
1063 | |||
1064 | pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword | ||
1065 | pshufd \$`2<<6`,$iv0,$inout1 | ||
1066 | pshufd \$`1<<6`,$iv0,$inout2 | ||
1067 | cmp \$6,$len | ||
1068 | jb .Lctr32_tail | ||
1069 | shr \$1,$rounds | ||
1070 | mov $key,$key_ # backup $key | ||
1071 | mov $rounds,$rnds_ # backup $rounds | ||
1072 | sub \$6,$len | ||
1073 | jmp .Lctr32_loop6 | ||
1074 | |||
1075 | .align 16 | ||
1076 | .Lctr32_loop6: | ||
1077 | pshufd \$`3<<6`,$iv1,$inout3 | ||
1078 | por $ivec,$inout0 # merge counter-less ivec | ||
1079 | $movkey ($key_),$rndkey0 | ||
1080 | pshufd \$`2<<6`,$iv1,$inout4 | ||
1081 | por $ivec,$inout1 | ||
1082 | $movkey 16($key_),$rndkey1 | ||
1083 | pshufd \$`1<<6`,$iv1,$inout5 | ||
1084 | por $ivec,$inout2 | ||
1085 | por $ivec,$inout3 | ||
1086 | xorps $rndkey0,$inout0 | ||
1087 | por $ivec,$inout4 | ||
1088 | por $ivec,$inout5 | ||
1089 | |||
1090 | # inline _aesni_encrypt6 and interleave last rounds | ||
1091 | # with own code... | ||
1092 | |||
1093 | pxor $rndkey0,$inout1 | ||
1094 | aesenc $rndkey1,$inout0 | ||
1095 | lea 32($key_),$key | ||
1096 | pxor $rndkey0,$inout2 | ||
1097 | aesenc $rndkey1,$inout1 | ||
1098 | movdqa .Lincrement32(%rip),$iv1 | ||
1099 | pxor $rndkey0,$inout3 | ||
1100 | aesenc $rndkey1,$inout2 | ||
1101 | movdqa $reserved(%rsp),$iv0 | ||
1102 | pxor $rndkey0,$inout4 | ||
1103 | aesenc $rndkey1,$inout3 | ||
1104 | pxor $rndkey0,$inout5 | ||
1105 | $movkey ($key),$rndkey0 | ||
1106 | dec $rounds | ||
1107 | aesenc $rndkey1,$inout4 | ||
1108 | aesenc $rndkey1,$inout5 | ||
1109 | jmp .Lctr32_enc_loop6_enter | ||
1110 | .align 16 | ||
1111 | .Lctr32_enc_loop6: | ||
1112 | aesenc $rndkey1,$inout0 | ||
1113 | aesenc $rndkey1,$inout1 | ||
1114 | dec $rounds | ||
1115 | aesenc $rndkey1,$inout2 | ||
1116 | aesenc $rndkey1,$inout3 | ||
1117 | aesenc $rndkey1,$inout4 | ||
1118 | aesenc $rndkey1,$inout5 | ||
1119 | .Lctr32_enc_loop6_enter: | ||
1120 | $movkey 16($key),$rndkey1 | ||
1121 | aesenc $rndkey0,$inout0 | ||
1122 | aesenc $rndkey0,$inout1 | ||
1123 | lea 32($key),$key | ||
1124 | aesenc $rndkey0,$inout2 | ||
1125 | aesenc $rndkey0,$inout3 | ||
1126 | aesenc $rndkey0,$inout4 | ||
1127 | aesenc $rndkey0,$inout5 | ||
1128 | $movkey ($key),$rndkey0 | ||
1129 | jnz .Lctr32_enc_loop6 | ||
1130 | |||
1131 | aesenc $rndkey1,$inout0 | ||
1132 | paddd $iv1,$iv0 # increment counter vector | ||
1133 | aesenc $rndkey1,$inout1 | ||
1134 | paddd `$reserved+0x10`(%rsp),$iv1 | ||
1135 | aesenc $rndkey1,$inout2 | ||
1136 | movdqa $iv0,$reserved(%rsp) # save counter vector | ||
1137 | aesenc $rndkey1,$inout3 | ||
1138 | movdqa $iv1,`$reserved+0x10`(%rsp) | ||
1139 | aesenc $rndkey1,$inout4 | ||
1140 | pshufb $bswap_mask,$iv0 # byte swap | ||
1141 | aesenc $rndkey1,$inout5 | ||
1142 | pshufb $bswap_mask,$iv1 | ||
1143 | |||
1144 | aesenclast $rndkey0,$inout0 | ||
1145 | movups ($inp),$in0 # load input | ||
1146 | aesenclast $rndkey0,$inout1 | ||
1147 | movups 0x10($inp),$in1 | ||
1148 | aesenclast $rndkey0,$inout2 | ||
1149 | movups 0x20($inp),$in2 | ||
1150 | aesenclast $rndkey0,$inout3 | ||
1151 | movups 0x30($inp),$in3 | ||
1152 | aesenclast $rndkey0,$inout4 | ||
1153 | movups 0x40($inp),$rndkey1 | ||
1154 | aesenclast $rndkey0,$inout5 | ||
1155 | movups 0x50($inp),$rndkey0 | ||
1156 | lea 0x60($inp),$inp | ||
1157 | |||
1158 | xorps $inout0,$in0 # xor | ||
1159 | pshufd \$`3<<6`,$iv0,$inout0 | ||
1160 | xorps $inout1,$in1 | ||
1161 | pshufd \$`2<<6`,$iv0,$inout1 | ||
1162 | movups $in0,($out) # store output | ||
1163 | xorps $inout2,$in2 | ||
1164 | pshufd \$`1<<6`,$iv0,$inout2 | ||
1165 | movups $in1,0x10($out) | ||
1166 | xorps $inout3,$in3 | ||
1167 | movups $in2,0x20($out) | ||
1168 | xorps $inout4,$rndkey1 | ||
1169 | movups $in3,0x30($out) | ||
1170 | xorps $inout5,$rndkey0 | ||
1171 | movups $rndkey1,0x40($out) | ||
1172 | movups $rndkey0,0x50($out) | ||
1173 | lea 0x60($out),$out | ||
1174 | mov $rnds_,$rounds | ||
1175 | sub \$6,$len | ||
1176 | jnc .Lctr32_loop6 | ||
1177 | |||
1178 | add \$6,$len | ||
1179 | jz .Lctr32_done | ||
1180 | mov $key_,$key # restore $key | ||
1181 | lea 1($rounds,$rounds),$rounds # restore original value | ||
1182 | |||
1183 | .Lctr32_tail: | ||
1184 | por $ivec,$inout0 | ||
1185 | movups ($inp),$in0 | ||
1186 | cmp \$2,$len | ||
1187 | jb .Lctr32_one | ||
1188 | |||
1189 | por $ivec,$inout1 | ||
1190 | movups 0x10($inp),$in1 | ||
1191 | je .Lctr32_two | ||
1192 | |||
1193 | pshufd \$`3<<6`,$iv1,$inout3 | ||
1194 | por $ivec,$inout2 | ||
1195 | movups 0x20($inp),$in2 | ||
1196 | cmp \$4,$len | ||
1197 | jb .Lctr32_three | ||
1198 | |||
1199 | pshufd \$`2<<6`,$iv1,$inout4 | ||
1200 | por $ivec,$inout3 | ||
1201 | movups 0x30($inp),$in3 | ||
1202 | je .Lctr32_four | ||
1203 | |||
1204 | por $ivec,$inout4 | ||
1205 | xorps $inout5,$inout5 | ||
1206 | |||
1207 | call _aesni_encrypt6 | ||
1208 | |||
1209 | movups 0x40($inp),$rndkey1 | ||
1210 | xorps $inout0,$in0 | ||
1211 | xorps $inout1,$in1 | ||
1212 | movups $in0,($out) | ||
1213 | xorps $inout2,$in2 | ||
1214 | movups $in1,0x10($out) | ||
1215 | xorps $inout3,$in3 | ||
1216 | movups $in2,0x20($out) | ||
1217 | xorps $inout4,$rndkey1 | ||
1218 | movups $in3,0x30($out) | ||
1219 | movups $rndkey1,0x40($out) | ||
1220 | jmp .Lctr32_done | ||
1221 | |||
1222 | .align 16 | ||
1223 | .Lctr32_one_shortcut: | ||
1224 | movups ($ivp),$inout0 | ||
1225 | movups ($inp),$in0 | ||
1226 | mov 240($key),$rounds # key->rounds | ||
1227 | .Lctr32_one: | ||
1228 | ___ | ||
1229 | &aesni_generate1("enc",$key,$rounds); | ||
1230 | $code.=<<___; | ||
1231 | xorps $inout0,$in0 | ||
1232 | movups $in0,($out) | ||
1233 | jmp .Lctr32_done | ||
1234 | |||
1235 | .align 16 | ||
1236 | .Lctr32_two: | ||
1237 | xorps $inout2,$inout2 | ||
1238 | call _aesni_encrypt3 | ||
1239 | xorps $inout0,$in0 | ||
1240 | xorps $inout1,$in1 | ||
1241 | movups $in0,($out) | ||
1242 | movups $in1,0x10($out) | ||
1243 | jmp .Lctr32_done | ||
1244 | |||
1245 | .align 16 | ||
1246 | .Lctr32_three: | ||
1247 | call _aesni_encrypt3 | ||
1248 | xorps $inout0,$in0 | ||
1249 | xorps $inout1,$in1 | ||
1250 | movups $in0,($out) | ||
1251 | xorps $inout2,$in2 | ||
1252 | movups $in1,0x10($out) | ||
1253 | movups $in2,0x20($out) | ||
1254 | jmp .Lctr32_done | ||
1255 | |||
1256 | .align 16 | ||
1257 | .Lctr32_four: | ||
1258 | call _aesni_encrypt4 | ||
1259 | xorps $inout0,$in0 | ||
1260 | xorps $inout1,$in1 | ||
1261 | movups $in0,($out) | ||
1262 | xorps $inout2,$in2 | ||
1263 | movups $in1,0x10($out) | ||
1264 | xorps $inout3,$in3 | ||
1265 | movups $in2,0x20($out) | ||
1266 | movups $in3,0x30($out) | ||
1267 | |||
1268 | .Lctr32_done: | ||
1269 | ___ | ||
1270 | $code.=<<___ if ($win64); | ||
1271 | movaps 0x20(%rsp),%xmm6 | ||
1272 | movaps 0x30(%rsp),%xmm7 | ||
1273 | movaps 0x40(%rsp),%xmm8 | ||
1274 | movaps 0x50(%rsp),%xmm9 | ||
1275 | movaps 0x60(%rsp),%xmm10 | ||
1276 | movaps 0x70(%rsp),%xmm11 | ||
1277 | movaps 0x80(%rsp),%xmm12 | ||
1278 | movaps 0x90(%rsp),%xmm13 | ||
1279 | movaps 0xa0(%rsp),%xmm14 | ||
1280 | movaps 0xb0(%rsp),%xmm15 | ||
1281 | lea 0xc8(%rsp),%rsp | ||
1282 | .Lctr32_ret: | ||
1283 | ___ | ||
1284 | $code.=<<___; | ||
1285 | ret | ||
1286 | .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks | ||
1287 | ___ | ||
360 | } | 1288 | } |
361 | 1289 | ||
1290 | ###################################################################### | ||
1291 | # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
1292 | # const AES_KEY *key1, const AES_KEY *key2, | ||
1293 | # const unsigned char iv[16]); | ||
1294 | # | ||
1295 | { | ||
1296 | my @tweak=map("%xmm$_",(10..15)); | ||
1297 | my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); | ||
1298 | my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); | ||
1299 | my $frame_size = 0x68 + ($win64?160:0); | ||
1300 | |||
1301 | $code.=<<___; | ||
1302 | .globl aesni_xts_encrypt | ||
1303 | .type aesni_xts_encrypt,\@function,6 | ||
1304 | .align 16 | ||
1305 | aesni_xts_encrypt: | ||
1306 | lea -$frame_size(%rsp),%rsp | ||
1307 | ___ | ||
1308 | $code.=<<___ if ($win64); | ||
1309 | movaps %xmm6,0x60(%rsp) | ||
1310 | movaps %xmm7,0x70(%rsp) | ||
1311 | movaps %xmm8,0x80(%rsp) | ||
1312 | movaps %xmm9,0x90(%rsp) | ||
1313 | movaps %xmm10,0xa0(%rsp) | ||
1314 | movaps %xmm11,0xb0(%rsp) | ||
1315 | movaps %xmm12,0xc0(%rsp) | ||
1316 | movaps %xmm13,0xd0(%rsp) | ||
1317 | movaps %xmm14,0xe0(%rsp) | ||
1318 | movaps %xmm15,0xf0(%rsp) | ||
1319 | .Lxts_enc_body: | ||
1320 | ___ | ||
1321 | $code.=<<___; | ||
1322 | movups ($ivp),@tweak[5] # load clear-text tweak | ||
1323 | mov 240(%r8),$rounds # key2->rounds | ||
1324 | mov 240($key),$rnds_ # key1->rounds | ||
1325 | ___ | ||
1326 | # generate the tweak | ||
1327 | &aesni_generate1("enc",$key2,$rounds,@tweak[5]); | ||
1328 | $code.=<<___; | ||
1329 | mov $key,$key_ # backup $key | ||
1330 | mov $rnds_,$rounds # backup $rounds | ||
1331 | mov $len,$len_ # backup $len | ||
1332 | and \$-16,$len | ||
1333 | |||
1334 | movdqa .Lxts_magic(%rip),$twmask | ||
1335 | pxor $twtmp,$twtmp | ||
1336 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1337 | ___ | ||
1338 | for ($i=0;$i<4;$i++) { | ||
1339 | $code.=<<___; | ||
1340 | pshufd \$0x13,$twtmp,$twres | ||
1341 | pxor $twtmp,$twtmp | ||
1342 | movdqa @tweak[5],@tweak[$i] | ||
1343 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1344 | pand $twmask,$twres # isolate carry and residue | ||
1345 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1346 | pxor $twres,@tweak[5] | ||
1347 | ___ | ||
1348 | } | ||
1349 | $code.=<<___; | ||
1350 | sub \$16*6,$len | ||
1351 | jc .Lxts_enc_short | ||
1352 | |||
1353 | shr \$1,$rounds | ||
1354 | sub \$1,$rounds | ||
1355 | mov $rounds,$rnds_ | ||
1356 | jmp .Lxts_enc_grandloop | ||
1357 | |||
1358 | .align 16 | ||
1359 | .Lxts_enc_grandloop: | ||
1360 | pshufd \$0x13,$twtmp,$twres | ||
1361 | movdqa @tweak[5],@tweak[4] | ||
1362 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1363 | movdqu `16*0`($inp),$inout0 # load input | ||
1364 | pand $twmask,$twres # isolate carry and residue | ||
1365 | movdqu `16*1`($inp),$inout1 | ||
1366 | pxor $twres,@tweak[5] | ||
1367 | |||
1368 | movdqu `16*2`($inp),$inout2 | ||
1369 | pxor @tweak[0],$inout0 # input^=tweak | ||
1370 | movdqu `16*3`($inp),$inout3 | ||
1371 | pxor @tweak[1],$inout1 | ||
1372 | movdqu `16*4`($inp),$inout4 | ||
1373 | pxor @tweak[2],$inout2 | ||
1374 | movdqu `16*5`($inp),$inout5 | ||
1375 | lea `16*6`($inp),$inp | ||
1376 | pxor @tweak[3],$inout3 | ||
1377 | $movkey ($key_),$rndkey0 | ||
1378 | pxor @tweak[4],$inout4 | ||
1379 | pxor @tweak[5],$inout5 | ||
1380 | |||
1381 | # inline _aesni_encrypt6 and interleave first and last rounds | ||
1382 | # with own code... | ||
1383 | $movkey 16($key_),$rndkey1 | ||
1384 | pxor $rndkey0,$inout0 | ||
1385 | pxor $rndkey0,$inout1 | ||
1386 | movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks | ||
1387 | aesenc $rndkey1,$inout0 | ||
1388 | lea 32($key_),$key | ||
1389 | pxor $rndkey0,$inout2 | ||
1390 | movdqa @tweak[1],`16*1`(%rsp) | ||
1391 | aesenc $rndkey1,$inout1 | ||
1392 | pxor $rndkey0,$inout3 | ||
1393 | movdqa @tweak[2],`16*2`(%rsp) | ||
1394 | aesenc $rndkey1,$inout2 | ||
1395 | pxor $rndkey0,$inout4 | ||
1396 | movdqa @tweak[3],`16*3`(%rsp) | ||
1397 | aesenc $rndkey1,$inout3 | ||
1398 | pxor $rndkey0,$inout5 | ||
1399 | $movkey ($key),$rndkey0 | ||
1400 | dec $rounds | ||
1401 | movdqa @tweak[4],`16*4`(%rsp) | ||
1402 | aesenc $rndkey1,$inout4 | ||
1403 | movdqa @tweak[5],`16*5`(%rsp) | ||
1404 | aesenc $rndkey1,$inout5 | ||
1405 | pxor $twtmp,$twtmp | ||
1406 | pcmpgtd @tweak[5],$twtmp | ||
1407 | jmp .Lxts_enc_loop6_enter | ||
1408 | |||
1409 | .align 16 | ||
1410 | .Lxts_enc_loop6: | ||
1411 | aesenc $rndkey1,$inout0 | ||
1412 | aesenc $rndkey1,$inout1 | ||
1413 | dec $rounds | ||
1414 | aesenc $rndkey1,$inout2 | ||
1415 | aesenc $rndkey1,$inout3 | ||
1416 | aesenc $rndkey1,$inout4 | ||
1417 | aesenc $rndkey1,$inout5 | ||
1418 | .Lxts_enc_loop6_enter: | ||
1419 | $movkey 16($key),$rndkey1 | ||
1420 | aesenc $rndkey0,$inout0 | ||
1421 | aesenc $rndkey0,$inout1 | ||
1422 | lea 32($key),$key | ||
1423 | aesenc $rndkey0,$inout2 | ||
1424 | aesenc $rndkey0,$inout3 | ||
1425 | aesenc $rndkey0,$inout4 | ||
1426 | aesenc $rndkey0,$inout5 | ||
1427 | $movkey ($key),$rndkey0 | ||
1428 | jnz .Lxts_enc_loop6 | ||
1429 | |||
1430 | pshufd \$0x13,$twtmp,$twres | ||
1431 | pxor $twtmp,$twtmp | ||
1432 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1433 | aesenc $rndkey1,$inout0 | ||
1434 | pand $twmask,$twres # isolate carry and residue | ||
1435 | aesenc $rndkey1,$inout1 | ||
1436 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1437 | aesenc $rndkey1,$inout2 | ||
1438 | pxor $twres,@tweak[5] | ||
1439 | aesenc $rndkey1,$inout3 | ||
1440 | aesenc $rndkey1,$inout4 | ||
1441 | aesenc $rndkey1,$inout5 | ||
1442 | $movkey 16($key),$rndkey1 | ||
1443 | |||
1444 | pshufd \$0x13,$twtmp,$twres | ||
1445 | pxor $twtmp,$twtmp | ||
1446 | movdqa @tweak[5],@tweak[0] | ||
1447 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1448 | aesenc $rndkey0,$inout0 | ||
1449 | pand $twmask,$twres # isolate carry and residue | ||
1450 | aesenc $rndkey0,$inout1 | ||
1451 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1452 | aesenc $rndkey0,$inout2 | ||
1453 | pxor $twres,@tweak[5] | ||
1454 | aesenc $rndkey0,$inout3 | ||
1455 | aesenc $rndkey0,$inout4 | ||
1456 | aesenc $rndkey0,$inout5 | ||
1457 | $movkey 32($key),$rndkey0 | ||
1458 | |||
1459 | pshufd \$0x13,$twtmp,$twres | ||
1460 | pxor $twtmp,$twtmp | ||
1461 | movdqa @tweak[5],@tweak[1] | ||
1462 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1463 | aesenc $rndkey1,$inout0 | ||
1464 | pand $twmask,$twres # isolate carry and residue | ||
1465 | aesenc $rndkey1,$inout1 | ||
1466 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1467 | aesenc $rndkey1,$inout2 | ||
1468 | pxor $twres,@tweak[5] | ||
1469 | aesenc $rndkey1,$inout3 | ||
1470 | aesenc $rndkey1,$inout4 | ||
1471 | aesenc $rndkey1,$inout5 | ||
1472 | |||
1473 | pshufd \$0x13,$twtmp,$twres | ||
1474 | pxor $twtmp,$twtmp | ||
1475 | movdqa @tweak[5],@tweak[2] | ||
1476 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1477 | aesenclast $rndkey0,$inout0 | ||
1478 | pand $twmask,$twres # isolate carry and residue | ||
1479 | aesenclast $rndkey0,$inout1 | ||
1480 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1481 | aesenclast $rndkey0,$inout2 | ||
1482 | pxor $twres,@tweak[5] | ||
1483 | aesenclast $rndkey0,$inout3 | ||
1484 | aesenclast $rndkey0,$inout4 | ||
1485 | aesenclast $rndkey0,$inout5 | ||
1486 | |||
1487 | pshufd \$0x13,$twtmp,$twres | ||
1488 | pxor $twtmp,$twtmp | ||
1489 | movdqa @tweak[5],@tweak[3] | ||
1490 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1491 | xorps `16*0`(%rsp),$inout0 # output^=tweak | ||
1492 | pand $twmask,$twres # isolate carry and residue | ||
1493 | xorps `16*1`(%rsp),$inout1 | ||
1494 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1495 | pxor $twres,@tweak[5] | ||
1496 | |||
1497 | xorps `16*2`(%rsp),$inout2 | ||
1498 | movups $inout0,`16*0`($out) # write output | ||
1499 | xorps `16*3`(%rsp),$inout3 | ||
1500 | movups $inout1,`16*1`($out) | ||
1501 | xorps `16*4`(%rsp),$inout4 | ||
1502 | movups $inout2,`16*2`($out) | ||
1503 | xorps `16*5`(%rsp),$inout5 | ||
1504 | movups $inout3,`16*3`($out) | ||
1505 | mov $rnds_,$rounds # restore $rounds | ||
1506 | movups $inout4,`16*4`($out) | ||
1507 | movups $inout5,`16*5`($out) | ||
1508 | lea `16*6`($out),$out | ||
1509 | sub \$16*6,$len | ||
1510 | jnc .Lxts_enc_grandloop | ||
1511 | |||
1512 | lea 3($rounds,$rounds),$rounds # restore original value | ||
1513 | mov $key_,$key # restore $key | ||
1514 | mov $rounds,$rnds_ # backup $rounds | ||
1515 | |||
1516 | .Lxts_enc_short: | ||
1517 | add \$16*6,$len | ||
1518 | jz .Lxts_enc_done | ||
1519 | |||
1520 | cmp \$0x20,$len | ||
1521 | jb .Lxts_enc_one | ||
1522 | je .Lxts_enc_two | ||
1523 | |||
1524 | cmp \$0x40,$len | ||
1525 | jb .Lxts_enc_three | ||
1526 | je .Lxts_enc_four | ||
1527 | |||
1528 | pshufd \$0x13,$twtmp,$twres | ||
1529 | movdqa @tweak[5],@tweak[4] | ||
1530 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1531 | movdqu ($inp),$inout0 | ||
1532 | pand $twmask,$twres # isolate carry and residue | ||
1533 | movdqu 16*1($inp),$inout1 | ||
1534 | pxor $twres,@tweak[5] | ||
1535 | |||
1536 | movdqu 16*2($inp),$inout2 | ||
1537 | pxor @tweak[0],$inout0 | ||
1538 | movdqu 16*3($inp),$inout3 | ||
1539 | pxor @tweak[1],$inout1 | ||
1540 | movdqu 16*4($inp),$inout4 | ||
1541 | lea 16*5($inp),$inp | ||
1542 | pxor @tweak[2],$inout2 | ||
1543 | pxor @tweak[3],$inout3 | ||
1544 | pxor @tweak[4],$inout4 | ||
1545 | |||
1546 | call _aesni_encrypt6 | ||
1547 | |||
1548 | xorps @tweak[0],$inout0 | ||
1549 | movdqa @tweak[5],@tweak[0] | ||
1550 | xorps @tweak[1],$inout1 | ||
1551 | xorps @tweak[2],$inout2 | ||
1552 | movdqu $inout0,($out) | ||
1553 | xorps @tweak[3],$inout3 | ||
1554 | movdqu $inout1,16*1($out) | ||
1555 | xorps @tweak[4],$inout4 | ||
1556 | movdqu $inout2,16*2($out) | ||
1557 | movdqu $inout3,16*3($out) | ||
1558 | movdqu $inout4,16*4($out) | ||
1559 | lea 16*5($out),$out | ||
1560 | jmp .Lxts_enc_done | ||
1561 | |||
1562 | .align 16 | ||
1563 | .Lxts_enc_one: | ||
1564 | movups ($inp),$inout0 | ||
1565 | lea 16*1($inp),$inp | ||
1566 | xorps @tweak[0],$inout0 | ||
1567 | ___ | ||
1568 | &aesni_generate1("enc",$key,$rounds); | ||
1569 | $code.=<<___; | ||
1570 | xorps @tweak[0],$inout0 | ||
1571 | movdqa @tweak[1],@tweak[0] | ||
1572 | movups $inout0,($out) | ||
1573 | lea 16*1($out),$out | ||
1574 | jmp .Lxts_enc_done | ||
1575 | |||
1576 | .align 16 | ||
1577 | .Lxts_enc_two: | ||
1578 | movups ($inp),$inout0 | ||
1579 | movups 16($inp),$inout1 | ||
1580 | lea 32($inp),$inp | ||
1581 | xorps @tweak[0],$inout0 | ||
1582 | xorps @tweak[1],$inout1 | ||
1583 | |||
1584 | call _aesni_encrypt3 | ||
1585 | |||
1586 | xorps @tweak[0],$inout0 | ||
1587 | movdqa @tweak[2],@tweak[0] | ||
1588 | xorps @tweak[1],$inout1 | ||
1589 | movups $inout0,($out) | ||
1590 | movups $inout1,16*1($out) | ||
1591 | lea 16*2($out),$out | ||
1592 | jmp .Lxts_enc_done | ||
1593 | |||
1594 | .align 16 | ||
1595 | .Lxts_enc_three: | ||
1596 | movups ($inp),$inout0 | ||
1597 | movups 16*1($inp),$inout1 | ||
1598 | movups 16*2($inp),$inout2 | ||
1599 | lea 16*3($inp),$inp | ||
1600 | xorps @tweak[0],$inout0 | ||
1601 | xorps @tweak[1],$inout1 | ||
1602 | xorps @tweak[2],$inout2 | ||
1603 | |||
1604 | call _aesni_encrypt3 | ||
1605 | |||
1606 | xorps @tweak[0],$inout0 | ||
1607 | movdqa @tweak[3],@tweak[0] | ||
1608 | xorps @tweak[1],$inout1 | ||
1609 | xorps @tweak[2],$inout2 | ||
1610 | movups $inout0,($out) | ||
1611 | movups $inout1,16*1($out) | ||
1612 | movups $inout2,16*2($out) | ||
1613 | lea 16*3($out),$out | ||
1614 | jmp .Lxts_enc_done | ||
1615 | |||
1616 | .align 16 | ||
1617 | .Lxts_enc_four: | ||
1618 | movups ($inp),$inout0 | ||
1619 | movups 16*1($inp),$inout1 | ||
1620 | movups 16*2($inp),$inout2 | ||
1621 | xorps @tweak[0],$inout0 | ||
1622 | movups 16*3($inp),$inout3 | ||
1623 | lea 16*4($inp),$inp | ||
1624 | xorps @tweak[1],$inout1 | ||
1625 | xorps @tweak[2],$inout2 | ||
1626 | xorps @tweak[3],$inout3 | ||
1627 | |||
1628 | call _aesni_encrypt4 | ||
1629 | |||
1630 | xorps @tweak[0],$inout0 | ||
1631 | movdqa @tweak[5],@tweak[0] | ||
1632 | xorps @tweak[1],$inout1 | ||
1633 | xorps @tweak[2],$inout2 | ||
1634 | movups $inout0,($out) | ||
1635 | xorps @tweak[3],$inout3 | ||
1636 | movups $inout1,16*1($out) | ||
1637 | movups $inout2,16*2($out) | ||
1638 | movups $inout3,16*3($out) | ||
1639 | lea 16*4($out),$out | ||
1640 | jmp .Lxts_enc_done | ||
1641 | |||
1642 | .align 16 | ||
1643 | .Lxts_enc_done: | ||
1644 | and \$15,$len_ | ||
1645 | jz .Lxts_enc_ret | ||
1646 | mov $len_,$len | ||
1647 | |||
1648 | .Lxts_enc_steal: | ||
1649 | movzb ($inp),%eax # borrow $rounds ... | ||
1650 | movzb -16($out),%ecx # ... and $key | ||
1651 | lea 1($inp),$inp | ||
1652 | mov %al,-16($out) | ||
1653 | mov %cl,0($out) | ||
1654 | lea 1($out),$out | ||
1655 | sub \$1,$len | ||
1656 | jnz .Lxts_enc_steal | ||
1657 | |||
1658 | sub $len_,$out # rewind $out | ||
1659 | mov $key_,$key # restore $key | ||
1660 | mov $rnds_,$rounds # restore $rounds | ||
1661 | |||
1662 | movups -16($out),$inout0 | ||
1663 | xorps @tweak[0],$inout0 | ||
1664 | ___ | ||
1665 | &aesni_generate1("enc",$key,$rounds); | ||
1666 | $code.=<<___; | ||
1667 | xorps @tweak[0],$inout0 | ||
1668 | movups $inout0,-16($out) | ||
1669 | |||
1670 | .Lxts_enc_ret: | ||
1671 | ___ | ||
1672 | $code.=<<___ if ($win64); | ||
1673 | movaps 0x60(%rsp),%xmm6 | ||
1674 | movaps 0x70(%rsp),%xmm7 | ||
1675 | movaps 0x80(%rsp),%xmm8 | ||
1676 | movaps 0x90(%rsp),%xmm9 | ||
1677 | movaps 0xa0(%rsp),%xmm10 | ||
1678 | movaps 0xb0(%rsp),%xmm11 | ||
1679 | movaps 0xc0(%rsp),%xmm12 | ||
1680 | movaps 0xd0(%rsp),%xmm13 | ||
1681 | movaps 0xe0(%rsp),%xmm14 | ||
1682 | movaps 0xf0(%rsp),%xmm15 | ||
1683 | ___ | ||
1684 | $code.=<<___; | ||
1685 | lea $frame_size(%rsp),%rsp | ||
1686 | .Lxts_enc_epilogue: | ||
1687 | ret | ||
1688 | .size aesni_xts_encrypt,.-aesni_xts_encrypt | ||
1689 | ___ | ||
1690 | |||
1691 | $code.=<<___; | ||
1692 | .globl aesni_xts_decrypt | ||
1693 | .type aesni_xts_decrypt,\@function,6 | ||
1694 | .align 16 | ||
1695 | aesni_xts_decrypt: | ||
1696 | lea -$frame_size(%rsp),%rsp | ||
1697 | ___ | ||
1698 | $code.=<<___ if ($win64); | ||
1699 | movaps %xmm6,0x60(%rsp) | ||
1700 | movaps %xmm7,0x70(%rsp) | ||
1701 | movaps %xmm8,0x80(%rsp) | ||
1702 | movaps %xmm9,0x90(%rsp) | ||
1703 | movaps %xmm10,0xa0(%rsp) | ||
1704 | movaps %xmm11,0xb0(%rsp) | ||
1705 | movaps %xmm12,0xc0(%rsp) | ||
1706 | movaps %xmm13,0xd0(%rsp) | ||
1707 | movaps %xmm14,0xe0(%rsp) | ||
1708 | movaps %xmm15,0xf0(%rsp) | ||
1709 | .Lxts_dec_body: | ||
1710 | ___ | ||
1711 | $code.=<<___; | ||
1712 | movups ($ivp),@tweak[5] # load clear-text tweak | ||
1713 | mov 240($key2),$rounds # key2->rounds | ||
1714 | mov 240($key),$rnds_ # key1->rounds | ||
1715 | ___ | ||
1716 | # generate the tweak | ||
1717 | &aesni_generate1("enc",$key2,$rounds,@tweak[5]); | ||
1718 | $code.=<<___; | ||
1719 | xor %eax,%eax # if ($len%16) len-=16; | ||
1720 | test \$15,$len | ||
1721 | setnz %al | ||
1722 | shl \$4,%rax | ||
1723 | sub %rax,$len | ||
1724 | |||
1725 | mov $key,$key_ # backup $key | ||
1726 | mov $rnds_,$rounds # backup $rounds | ||
1727 | mov $len,$len_ # backup $len | ||
1728 | and \$-16,$len | ||
1729 | |||
1730 | movdqa .Lxts_magic(%rip),$twmask | ||
1731 | pxor $twtmp,$twtmp | ||
1732 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1733 | ___ | ||
1734 | for ($i=0;$i<4;$i++) { | ||
1735 | $code.=<<___; | ||
1736 | pshufd \$0x13,$twtmp,$twres | ||
1737 | pxor $twtmp,$twtmp | ||
1738 | movdqa @tweak[5],@tweak[$i] | ||
1739 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1740 | pand $twmask,$twres # isolate carry and residue | ||
1741 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1742 | pxor $twres,@tweak[5] | ||
1743 | ___ | ||
1744 | } | ||
1745 | $code.=<<___; | ||
1746 | sub \$16*6,$len | ||
1747 | jc .Lxts_dec_short | ||
1748 | |||
1749 | shr \$1,$rounds | ||
1750 | sub \$1,$rounds | ||
1751 | mov $rounds,$rnds_ | ||
1752 | jmp .Lxts_dec_grandloop | ||
1753 | |||
1754 | .align 16 | ||
1755 | .Lxts_dec_grandloop: | ||
1756 | pshufd \$0x13,$twtmp,$twres | ||
1757 | movdqa @tweak[5],@tweak[4] | ||
1758 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1759 | movdqu `16*0`($inp),$inout0 # load input | ||
1760 | pand $twmask,$twres # isolate carry and residue | ||
1761 | movdqu `16*1`($inp),$inout1 | ||
1762 | pxor $twres,@tweak[5] | ||
1763 | |||
1764 | movdqu `16*2`($inp),$inout2 | ||
1765 | pxor @tweak[0],$inout0 # input^=tweak | ||
1766 | movdqu `16*3`($inp),$inout3 | ||
1767 | pxor @tweak[1],$inout1 | ||
1768 | movdqu `16*4`($inp),$inout4 | ||
1769 | pxor @tweak[2],$inout2 | ||
1770 | movdqu `16*5`($inp),$inout5 | ||
1771 | lea `16*6`($inp),$inp | ||
1772 | pxor @tweak[3],$inout3 | ||
1773 | $movkey ($key_),$rndkey0 | ||
1774 | pxor @tweak[4],$inout4 | ||
1775 | pxor @tweak[5],$inout5 | ||
1776 | |||
1777 | # inline _aesni_decrypt6 and interleave first and last rounds | ||
1778 | # with own code... | ||
1779 | $movkey 16($key_),$rndkey1 | ||
1780 | pxor $rndkey0,$inout0 | ||
1781 | pxor $rndkey0,$inout1 | ||
1782 | movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks | ||
1783 | aesdec $rndkey1,$inout0 | ||
1784 | lea 32($key_),$key | ||
1785 | pxor $rndkey0,$inout2 | ||
1786 | movdqa @tweak[1],`16*1`(%rsp) | ||
1787 | aesdec $rndkey1,$inout1 | ||
1788 | pxor $rndkey0,$inout3 | ||
1789 | movdqa @tweak[2],`16*2`(%rsp) | ||
1790 | aesdec $rndkey1,$inout2 | ||
1791 | pxor $rndkey0,$inout4 | ||
1792 | movdqa @tweak[3],`16*3`(%rsp) | ||
1793 | aesdec $rndkey1,$inout3 | ||
1794 | pxor $rndkey0,$inout5 | ||
1795 | $movkey ($key),$rndkey0 | ||
1796 | dec $rounds | ||
1797 | movdqa @tweak[4],`16*4`(%rsp) | ||
1798 | aesdec $rndkey1,$inout4 | ||
1799 | movdqa @tweak[5],`16*5`(%rsp) | ||
1800 | aesdec $rndkey1,$inout5 | ||
1801 | pxor $twtmp,$twtmp | ||
1802 | pcmpgtd @tweak[5],$twtmp | ||
1803 | jmp .Lxts_dec_loop6_enter | ||
1804 | |||
1805 | .align 16 | ||
1806 | .Lxts_dec_loop6: | ||
1807 | aesdec $rndkey1,$inout0 | ||
1808 | aesdec $rndkey1,$inout1 | ||
1809 | dec $rounds | ||
1810 | aesdec $rndkey1,$inout2 | ||
1811 | aesdec $rndkey1,$inout3 | ||
1812 | aesdec $rndkey1,$inout4 | ||
1813 | aesdec $rndkey1,$inout5 | ||
1814 | .Lxts_dec_loop6_enter: | ||
1815 | $movkey 16($key),$rndkey1 | ||
1816 | aesdec $rndkey0,$inout0 | ||
1817 | aesdec $rndkey0,$inout1 | ||
1818 | lea 32($key),$key | ||
1819 | aesdec $rndkey0,$inout2 | ||
1820 | aesdec $rndkey0,$inout3 | ||
1821 | aesdec $rndkey0,$inout4 | ||
1822 | aesdec $rndkey0,$inout5 | ||
1823 | $movkey ($key),$rndkey0 | ||
1824 | jnz .Lxts_dec_loop6 | ||
1825 | |||
1826 | pshufd \$0x13,$twtmp,$twres | ||
1827 | pxor $twtmp,$twtmp | ||
1828 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1829 | aesdec $rndkey1,$inout0 | ||
1830 | pand $twmask,$twres # isolate carry and residue | ||
1831 | aesdec $rndkey1,$inout1 | ||
1832 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1833 | aesdec $rndkey1,$inout2 | ||
1834 | pxor $twres,@tweak[5] | ||
1835 | aesdec $rndkey1,$inout3 | ||
1836 | aesdec $rndkey1,$inout4 | ||
1837 | aesdec $rndkey1,$inout5 | ||
1838 | $movkey 16($key),$rndkey1 | ||
1839 | |||
1840 | pshufd \$0x13,$twtmp,$twres | ||
1841 | pxor $twtmp,$twtmp | ||
1842 | movdqa @tweak[5],@tweak[0] | ||
1843 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1844 | aesdec $rndkey0,$inout0 | ||
1845 | pand $twmask,$twres # isolate carry and residue | ||
1846 | aesdec $rndkey0,$inout1 | ||
1847 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1848 | aesdec $rndkey0,$inout2 | ||
1849 | pxor $twres,@tweak[5] | ||
1850 | aesdec $rndkey0,$inout3 | ||
1851 | aesdec $rndkey0,$inout4 | ||
1852 | aesdec $rndkey0,$inout5 | ||
1853 | $movkey 32($key),$rndkey0 | ||
1854 | |||
1855 | pshufd \$0x13,$twtmp,$twres | ||
1856 | pxor $twtmp,$twtmp | ||
1857 | movdqa @tweak[5],@tweak[1] | ||
1858 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1859 | aesdec $rndkey1,$inout0 | ||
1860 | pand $twmask,$twres # isolate carry and residue | ||
1861 | aesdec $rndkey1,$inout1 | ||
1862 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1863 | aesdec $rndkey1,$inout2 | ||
1864 | pxor $twres,@tweak[5] | ||
1865 | aesdec $rndkey1,$inout3 | ||
1866 | aesdec $rndkey1,$inout4 | ||
1867 | aesdec $rndkey1,$inout5 | ||
1868 | |||
1869 | pshufd \$0x13,$twtmp,$twres | ||
1870 | pxor $twtmp,$twtmp | ||
1871 | movdqa @tweak[5],@tweak[2] | ||
1872 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1873 | aesdeclast $rndkey0,$inout0 | ||
1874 | pand $twmask,$twres # isolate carry and residue | ||
1875 | aesdeclast $rndkey0,$inout1 | ||
1876 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1877 | aesdeclast $rndkey0,$inout2 | ||
1878 | pxor $twres,@tweak[5] | ||
1879 | aesdeclast $rndkey0,$inout3 | ||
1880 | aesdeclast $rndkey0,$inout4 | ||
1881 | aesdeclast $rndkey0,$inout5 | ||
1882 | |||
1883 | pshufd \$0x13,$twtmp,$twres | ||
1884 | pxor $twtmp,$twtmp | ||
1885 | movdqa @tweak[5],@tweak[3] | ||
1886 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1887 | xorps `16*0`(%rsp),$inout0 # output^=tweak | ||
1888 | pand $twmask,$twres # isolate carry and residue | ||
1889 | xorps `16*1`(%rsp),$inout1 | ||
1890 | pcmpgtd @tweak[5],$twtmp # broadcast upper bits | ||
1891 | pxor $twres,@tweak[5] | ||
1892 | |||
1893 | xorps `16*2`(%rsp),$inout2 | ||
1894 | movups $inout0,`16*0`($out) # write output | ||
1895 | xorps `16*3`(%rsp),$inout3 | ||
1896 | movups $inout1,`16*1`($out) | ||
1897 | xorps `16*4`(%rsp),$inout4 | ||
1898 | movups $inout2,`16*2`($out) | ||
1899 | xorps `16*5`(%rsp),$inout5 | ||
1900 | movups $inout3,`16*3`($out) | ||
1901 | mov $rnds_,$rounds # restore $rounds | ||
1902 | movups $inout4,`16*4`($out) | ||
1903 | movups $inout5,`16*5`($out) | ||
1904 | lea `16*6`($out),$out | ||
1905 | sub \$16*6,$len | ||
1906 | jnc .Lxts_dec_grandloop | ||
1907 | |||
1908 | lea 3($rounds,$rounds),$rounds # restore original value | ||
1909 | mov $key_,$key # restore $key | ||
1910 | mov $rounds,$rnds_ # backup $rounds | ||
1911 | |||
1912 | .Lxts_dec_short: | ||
1913 | add \$16*6,$len | ||
1914 | jz .Lxts_dec_done | ||
1915 | |||
1916 | cmp \$0x20,$len | ||
1917 | jb .Lxts_dec_one | ||
1918 | je .Lxts_dec_two | ||
1919 | |||
1920 | cmp \$0x40,$len | ||
1921 | jb .Lxts_dec_three | ||
1922 | je .Lxts_dec_four | ||
1923 | |||
1924 | pshufd \$0x13,$twtmp,$twres | ||
1925 | movdqa @tweak[5],@tweak[4] | ||
1926 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1927 | movdqu ($inp),$inout0 | ||
1928 | pand $twmask,$twres # isolate carry and residue | ||
1929 | movdqu 16*1($inp),$inout1 | ||
1930 | pxor $twres,@tweak[5] | ||
1931 | |||
1932 | movdqu 16*2($inp),$inout2 | ||
1933 | pxor @tweak[0],$inout0 | ||
1934 | movdqu 16*3($inp),$inout3 | ||
1935 | pxor @tweak[1],$inout1 | ||
1936 | movdqu 16*4($inp),$inout4 | ||
1937 | lea 16*5($inp),$inp | ||
1938 | pxor @tweak[2],$inout2 | ||
1939 | pxor @tweak[3],$inout3 | ||
1940 | pxor @tweak[4],$inout4 | ||
1941 | |||
1942 | call _aesni_decrypt6 | ||
1943 | |||
1944 | xorps @tweak[0],$inout0 | ||
1945 | xorps @tweak[1],$inout1 | ||
1946 | xorps @tweak[2],$inout2 | ||
1947 | movdqu $inout0,($out) | ||
1948 | xorps @tweak[3],$inout3 | ||
1949 | movdqu $inout1,16*1($out) | ||
1950 | xorps @tweak[4],$inout4 | ||
1951 | movdqu $inout2,16*2($out) | ||
1952 | pxor $twtmp,$twtmp | ||
1953 | movdqu $inout3,16*3($out) | ||
1954 | pcmpgtd @tweak[5],$twtmp | ||
1955 | movdqu $inout4,16*4($out) | ||
1956 | lea 16*5($out),$out | ||
1957 | pshufd \$0x13,$twtmp,@tweak[1] # $twres | ||
1958 | and \$15,$len_ | ||
1959 | jz .Lxts_dec_ret | ||
1960 | |||
1961 | movdqa @tweak[5],@tweak[0] | ||
1962 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
1963 | pand $twmask,@tweak[1] # isolate carry and residue | ||
1964 | pxor @tweak[5],@tweak[1] | ||
1965 | jmp .Lxts_dec_done2 | ||
1966 | |||
1967 | .align 16 | ||
1968 | .Lxts_dec_one: | ||
1969 | movups ($inp),$inout0 | ||
1970 | lea 16*1($inp),$inp | ||
1971 | xorps @tweak[0],$inout0 | ||
1972 | ___ | ||
1973 | &aesni_generate1("dec",$key,$rounds); | ||
1974 | $code.=<<___; | ||
1975 | xorps @tweak[0],$inout0 | ||
1976 | movdqa @tweak[1],@tweak[0] | ||
1977 | movups $inout0,($out) | ||
1978 | movdqa @tweak[2],@tweak[1] | ||
1979 | lea 16*1($out),$out | ||
1980 | jmp .Lxts_dec_done | ||
1981 | |||
1982 | .align 16 | ||
1983 | .Lxts_dec_two: | ||
1984 | movups ($inp),$inout0 | ||
1985 | movups 16($inp),$inout1 | ||
1986 | lea 32($inp),$inp | ||
1987 | xorps @tweak[0],$inout0 | ||
1988 | xorps @tweak[1],$inout1 | ||
1989 | |||
1990 | call _aesni_decrypt3 | ||
1991 | |||
1992 | xorps @tweak[0],$inout0 | ||
1993 | movdqa @tweak[2],@tweak[0] | ||
1994 | xorps @tweak[1],$inout1 | ||
1995 | movdqa @tweak[3],@tweak[1] | ||
1996 | movups $inout0,($out) | ||
1997 | movups $inout1,16*1($out) | ||
1998 | lea 16*2($out),$out | ||
1999 | jmp .Lxts_dec_done | ||
2000 | |||
2001 | .align 16 | ||
2002 | .Lxts_dec_three: | ||
2003 | movups ($inp),$inout0 | ||
2004 | movups 16*1($inp),$inout1 | ||
2005 | movups 16*2($inp),$inout2 | ||
2006 | lea 16*3($inp),$inp | ||
2007 | xorps @tweak[0],$inout0 | ||
2008 | xorps @tweak[1],$inout1 | ||
2009 | xorps @tweak[2],$inout2 | ||
2010 | |||
2011 | call _aesni_decrypt3 | ||
2012 | |||
2013 | xorps @tweak[0],$inout0 | ||
2014 | movdqa @tweak[3],@tweak[0] | ||
2015 | xorps @tweak[1],$inout1 | ||
2016 | movdqa @tweak[5],@tweak[1] | ||
2017 | xorps @tweak[2],$inout2 | ||
2018 | movups $inout0,($out) | ||
2019 | movups $inout1,16*1($out) | ||
2020 | movups $inout2,16*2($out) | ||
2021 | lea 16*3($out),$out | ||
2022 | jmp .Lxts_dec_done | ||
2023 | |||
2024 | .align 16 | ||
2025 | .Lxts_dec_four: | ||
2026 | pshufd \$0x13,$twtmp,$twres | ||
2027 | movdqa @tweak[5],@tweak[4] | ||
2028 | paddq @tweak[5],@tweak[5] # psllq 1,$tweak | ||
2029 | movups ($inp),$inout0 | ||
2030 | pand $twmask,$twres # isolate carry and residue | ||
2031 | movups 16*1($inp),$inout1 | ||
2032 | pxor $twres,@tweak[5] | ||
2033 | |||
2034 | movups 16*2($inp),$inout2 | ||
2035 | xorps @tweak[0],$inout0 | ||
2036 | movups 16*3($inp),$inout3 | ||
2037 | lea 16*4($inp),$inp | ||
2038 | xorps @tweak[1],$inout1 | ||
2039 | xorps @tweak[2],$inout2 | ||
2040 | xorps @tweak[3],$inout3 | ||
2041 | |||
2042 | call _aesni_decrypt4 | ||
2043 | |||
2044 | xorps @tweak[0],$inout0 | ||
2045 | movdqa @tweak[4],@tweak[0] | ||
2046 | xorps @tweak[1],$inout1 | ||
2047 | movdqa @tweak[5],@tweak[1] | ||
2048 | xorps @tweak[2],$inout2 | ||
2049 | movups $inout0,($out) | ||
2050 | xorps @tweak[3],$inout3 | ||
2051 | movups $inout1,16*1($out) | ||
2052 | movups $inout2,16*2($out) | ||
2053 | movups $inout3,16*3($out) | ||
2054 | lea 16*4($out),$out | ||
2055 | jmp .Lxts_dec_done | ||
2056 | |||
2057 | .align 16 | ||
2058 | .Lxts_dec_done: | ||
2059 | and \$15,$len_ | ||
2060 | jz .Lxts_dec_ret | ||
2061 | .Lxts_dec_done2: | ||
2062 | mov $len_,$len | ||
2063 | mov $key_,$key # restore $key | ||
2064 | mov $rnds_,$rounds # restore $rounds | ||
2065 | |||
2066 | movups ($inp),$inout0 | ||
2067 | xorps @tweak[1],$inout0 | ||
2068 | ___ | ||
2069 | &aesni_generate1("dec",$key,$rounds); | ||
2070 | $code.=<<___; | ||
2071 | xorps @tweak[1],$inout0 | ||
2072 | movups $inout0,($out) | ||
2073 | |||
2074 | .Lxts_dec_steal: | ||
2075 | movzb 16($inp),%eax # borrow $rounds ... | ||
2076 | movzb ($out),%ecx # ... and $key | ||
2077 | lea 1($inp),$inp | ||
2078 | mov %al,($out) | ||
2079 | mov %cl,16($out) | ||
2080 | lea 1($out),$out | ||
2081 | sub \$1,$len | ||
2082 | jnz .Lxts_dec_steal | ||
2083 | |||
2084 | sub $len_,$out # rewind $out | ||
2085 | mov $key_,$key # restore $key | ||
2086 | mov $rnds_,$rounds # restore $rounds | ||
2087 | |||
2088 | movups ($out),$inout0 | ||
2089 | xorps @tweak[0],$inout0 | ||
2090 | ___ | ||
2091 | &aesni_generate1("dec",$key,$rounds); | ||
2092 | $code.=<<___; | ||
2093 | xorps @tweak[0],$inout0 | ||
2094 | movups $inout0,($out) | ||
2095 | |||
2096 | .Lxts_dec_ret: | ||
2097 | ___ | ||
2098 | $code.=<<___ if ($win64); | ||
2099 | movaps 0x60(%rsp),%xmm6 | ||
2100 | movaps 0x70(%rsp),%xmm7 | ||
2101 | movaps 0x80(%rsp),%xmm8 | ||
2102 | movaps 0x90(%rsp),%xmm9 | ||
2103 | movaps 0xa0(%rsp),%xmm10 | ||
2104 | movaps 0xb0(%rsp),%xmm11 | ||
2105 | movaps 0xc0(%rsp),%xmm12 | ||
2106 | movaps 0xd0(%rsp),%xmm13 | ||
2107 | movaps 0xe0(%rsp),%xmm14 | ||
2108 | movaps 0xf0(%rsp),%xmm15 | ||
2109 | ___ | ||
2110 | $code.=<<___; | ||
2111 | lea $frame_size(%rsp),%rsp | ||
2112 | .Lxts_dec_epilogue: | ||
2113 | ret | ||
2114 | .size aesni_xts_decrypt,.-aesni_xts_decrypt | ||
2115 | ___ | ||
2116 | } }} | ||
2117 | |||
2118 | ######################################################################## | ||
362 | # void $PREFIX_cbc_encrypt (const void *inp, void *out, | 2119 | # void $PREFIX_cbc_encrypt (const void *inp, void *out, |
363 | # size_t length, const AES_KEY *key, | 2120 | # size_t length, const AES_KEY *key, |
364 | # unsigned char *ivp,const int enc); | 2121 | # unsigned char *ivp,const int enc); |
365 | $reserved = $win64?0x40:-0x18; # used in decrypt | 2122 | { |
2123 | my $reserved = $win64?0x40:-0x18; # used in decrypt | ||
366 | $code.=<<___; | 2124 | $code.=<<___; |
367 | .globl ${PREFIX}_cbc_encrypt | 2125 | .globl ${PREFIX}_cbc_encrypt |
368 | .type ${PREFIX}_cbc_encrypt,\@function,6 | 2126 | .type ${PREFIX}_cbc_encrypt,\@function,6 |
@@ -371,30 +2129,30 @@ ${PREFIX}_cbc_encrypt: | |||
371 | test $len,$len # check length | 2129 | test $len,$len # check length |
372 | jz .Lcbc_ret | 2130 | jz .Lcbc_ret |
373 | 2131 | ||
374 | mov 240($key),$rnds_ # pull $rounds | 2132 | mov 240($key),$rnds_ # key->rounds |
375 | mov $key,$key_ # backup $key | 2133 | mov $key,$key_ # backup $key |
376 | test %r9d,%r9d # 6th argument | 2134 | test %r9d,%r9d # 6th argument |
377 | jz .Lcbc_decrypt | 2135 | jz .Lcbc_decrypt |
378 | #--------------------------- CBC ENCRYPT ------------------------------# | 2136 | #--------------------------- CBC ENCRYPT ------------------------------# |
379 | movups ($ivp),$inout0 # load iv as initial state | 2137 | movups ($ivp),$inout0 # load iv as initial state |
380 | cmp \$16,$len | ||
381 | mov $rnds_,$rounds | 2138 | mov $rnds_,$rounds |
2139 | cmp \$16,$len | ||
382 | jb .Lcbc_enc_tail | 2140 | jb .Lcbc_enc_tail |
383 | sub \$16,$len | 2141 | sub \$16,$len |
384 | jmp .Lcbc_enc_loop | 2142 | jmp .Lcbc_enc_loop |
385 | .align 16 | 2143 | .align 16 |
386 | .Lcbc_enc_loop: | 2144 | .Lcbc_enc_loop: |
387 | movups ($inp),$inout1 # load input | 2145 | movups ($inp),$inout1 # load input |
388 | lea 16($inp),$inp | 2146 | lea 16($inp),$inp |
389 | pxor $inout1,$inout0 | 2147 | #xorps $inout1,$inout0 |
390 | ___ | 2148 | ___ |
391 | &aesni_generate1("enc",$key,$rounds); | 2149 | &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); |
392 | $code.=<<___; | 2150 | $code.=<<___; |
393 | sub \$16,$len | ||
394 | lea 16($out),$out | ||
395 | mov $rnds_,$rounds # restore $rounds | 2151 | mov $rnds_,$rounds # restore $rounds |
396 | mov $key_,$key # restore $key | 2152 | mov $key_,$key # restore $key |
397 | movups $inout0,-16($out) # store output | 2153 | movups $inout0,0($out) # store output |
2154 | lea 16($out),$out | ||
2155 | sub \$16,$len | ||
398 | jnc .Lcbc_enc_loop | 2156 | jnc .Lcbc_enc_loop |
399 | add \$16,$len | 2157 | add \$16,$len |
400 | jnz .Lcbc_enc_tail | 2158 | jnz .Lcbc_enc_tail |
@@ -429,92 +2187,238 @@ $code.=<<___ if ($win64); | |||
429 | ___ | 2187 | ___ |
430 | $code.=<<___; | 2188 | $code.=<<___; |
431 | movups ($ivp),$iv | 2189 | movups ($ivp),$iv |
432 | sub \$0x40,$len | ||
433 | mov $rnds_,$rounds | 2190 | mov $rnds_,$rounds |
2191 | cmp \$0x70,$len | ||
434 | jbe .Lcbc_dec_tail | 2192 | jbe .Lcbc_dec_tail |
435 | jmp .Lcbc_dec_loop3 | 2193 | shr \$1,$rnds_ |
436 | .align 16 | 2194 | sub \$0x70,$len |
437 | .Lcbc_dec_loop3: | 2195 | mov $rnds_,$rounds |
438 | movups ($inp),$inout0 | 2196 | movaps $iv,$reserved(%rsp) |
2197 | jmp .Lcbc_dec_loop8_enter | ||
2198 | .align 16 | ||
2199 | .Lcbc_dec_loop8: | ||
2200 | movaps $rndkey0,$reserved(%rsp) # save IV | ||
2201 | movups $inout7,($out) | ||
2202 | lea 0x10($out),$out | ||
2203 | .Lcbc_dec_loop8_enter: | ||
2204 | $movkey ($key),$rndkey0 | ||
2205 | movups ($inp),$inout0 # load input | ||
439 | movups 0x10($inp),$inout1 | 2206 | movups 0x10($inp),$inout1 |
440 | movups 0x20($inp),$inout2 | 2207 | $movkey 16($key),$rndkey1 |
441 | movaps $inout0,$in0 | ||
442 | movaps $inout1,$in1 | ||
443 | movaps $inout2,$in2 | ||
444 | call _aesni_decrypt3 | ||
445 | sub \$0x30,$len | ||
446 | lea 0x30($inp),$inp | ||
447 | lea 0x30($out),$out | ||
448 | pxor $iv,$inout0 | ||
449 | pxor $in0,$inout1 | ||
450 | movaps $in2,$iv | ||
451 | pxor $in1,$inout2 | ||
452 | movups $inout0,-0x30($out) | ||
453 | mov $rnds_,$rounds # restore $rounds | ||
454 | movups $inout1,-0x20($out) | ||
455 | mov $key_,$key # restore $key | ||
456 | movups $inout2,-0x10($out) | ||
457 | ja .Lcbc_dec_loop3 | ||
458 | 2208 | ||
459 | .Lcbc_dec_tail: | 2209 | lea 32($key),$key |
460 | add \$0x40,$len | 2210 | movdqu 0x20($inp),$inout2 |
461 | movups $iv,($ivp) | 2211 | xorps $rndkey0,$inout0 |
462 | jz .Lcbc_dec_ret | 2212 | movdqu 0x30($inp),$inout3 |
2213 | xorps $rndkey0,$inout1 | ||
2214 | movdqu 0x40($inp),$inout4 | ||
2215 | aesdec $rndkey1,$inout0 | ||
2216 | pxor $rndkey0,$inout2 | ||
2217 | movdqu 0x50($inp),$inout5 | ||
2218 | aesdec $rndkey1,$inout1 | ||
2219 | pxor $rndkey0,$inout3 | ||
2220 | movdqu 0x60($inp),$inout6 | ||
2221 | aesdec $rndkey1,$inout2 | ||
2222 | pxor $rndkey0,$inout4 | ||
2223 | movdqu 0x70($inp),$inout7 | ||
2224 | aesdec $rndkey1,$inout3 | ||
2225 | pxor $rndkey0,$inout5 | ||
2226 | dec $rounds | ||
2227 | aesdec $rndkey1,$inout4 | ||
2228 | pxor $rndkey0,$inout6 | ||
2229 | aesdec $rndkey1,$inout5 | ||
2230 | pxor $rndkey0,$inout7 | ||
2231 | $movkey ($key),$rndkey0 | ||
2232 | aesdec $rndkey1,$inout6 | ||
2233 | aesdec $rndkey1,$inout7 | ||
2234 | $movkey 16($key),$rndkey1 | ||
463 | 2235 | ||
2236 | call .Ldec_loop8_enter | ||
2237 | |||
2238 | movups ($inp),$rndkey1 # re-load input | ||
2239 | movups 0x10($inp),$rndkey0 | ||
2240 | xorps $reserved(%rsp),$inout0 # ^= IV | ||
2241 | xorps $rndkey1,$inout1 | ||
2242 | movups 0x20($inp),$rndkey1 | ||
2243 | xorps $rndkey0,$inout2 | ||
2244 | movups 0x30($inp),$rndkey0 | ||
2245 | xorps $rndkey1,$inout3 | ||
2246 | movups 0x40($inp),$rndkey1 | ||
2247 | xorps $rndkey0,$inout4 | ||
2248 | movups 0x50($inp),$rndkey0 | ||
2249 | xorps $rndkey1,$inout5 | ||
2250 | movups 0x60($inp),$rndkey1 | ||
2251 | xorps $rndkey0,$inout6 | ||
2252 | movups 0x70($inp),$rndkey0 # IV | ||
2253 | xorps $rndkey1,$inout7 | ||
2254 | movups $inout0,($out) | ||
2255 | movups $inout1,0x10($out) | ||
2256 | movups $inout2,0x20($out) | ||
2257 | movups $inout3,0x30($out) | ||
2258 | mov $rnds_,$rounds # restore $rounds | ||
2259 | movups $inout4,0x40($out) | ||
2260 | mov $key_,$key # restore $key | ||
2261 | movups $inout5,0x50($out) | ||
2262 | lea 0x80($inp),$inp | ||
2263 | movups $inout6,0x60($out) | ||
2264 | lea 0x70($out),$out | ||
2265 | sub \$0x80,$len | ||
2266 | ja .Lcbc_dec_loop8 | ||
2267 | |||
2268 | movaps $inout7,$inout0 | ||
2269 | movaps $rndkey0,$iv | ||
2270 | add \$0x70,$len | ||
2271 | jle .Lcbc_dec_tail_collected | ||
2272 | movups $inout0,($out) | ||
2273 | lea 1($rnds_,$rnds_),$rounds | ||
2274 | lea 0x10($out),$out | ||
2275 | .Lcbc_dec_tail: | ||
464 | movups ($inp),$inout0 | 2276 | movups ($inp),$inout0 |
465 | cmp \$0x10,$len | ||
466 | movaps $inout0,$in0 | 2277 | movaps $inout0,$in0 |
2278 | cmp \$0x10,$len | ||
467 | jbe .Lcbc_dec_one | 2279 | jbe .Lcbc_dec_one |
2280 | |||
468 | movups 0x10($inp),$inout1 | 2281 | movups 0x10($inp),$inout1 |
469 | cmp \$0x20,$len | ||
470 | movaps $inout1,$in1 | 2282 | movaps $inout1,$in1 |
2283 | cmp \$0x20,$len | ||
471 | jbe .Lcbc_dec_two | 2284 | jbe .Lcbc_dec_two |
2285 | |||
472 | movups 0x20($inp),$inout2 | 2286 | movups 0x20($inp),$inout2 |
473 | cmp \$0x30,$len | ||
474 | movaps $inout2,$in2 | 2287 | movaps $inout2,$in2 |
2288 | cmp \$0x30,$len | ||
475 | jbe .Lcbc_dec_three | 2289 | jbe .Lcbc_dec_three |
2290 | |||
476 | movups 0x30($inp),$inout3 | 2291 | movups 0x30($inp),$inout3 |
477 | call _aesni_decrypt4 | 2292 | cmp \$0x40,$len |
478 | pxor $iv,$inout0 | 2293 | jbe .Lcbc_dec_four |
479 | movups 0x30($inp),$iv | 2294 | |
480 | pxor $in0,$inout1 | 2295 | movups 0x40($inp),$inout4 |
2296 | cmp \$0x50,$len | ||
2297 | jbe .Lcbc_dec_five | ||
2298 | |||
2299 | movups 0x50($inp),$inout5 | ||
2300 | cmp \$0x60,$len | ||
2301 | jbe .Lcbc_dec_six | ||
2302 | |||
2303 | movups 0x60($inp),$inout6 | ||
2304 | movaps $iv,$reserved(%rsp) # save IV | ||
2305 | call _aesni_decrypt8 | ||
2306 | movups ($inp),$rndkey1 | ||
2307 | movups 0x10($inp),$rndkey0 | ||
2308 | xorps $reserved(%rsp),$inout0 # ^= IV | ||
2309 | xorps $rndkey1,$inout1 | ||
2310 | movups 0x20($inp),$rndkey1 | ||
2311 | xorps $rndkey0,$inout2 | ||
2312 | movups 0x30($inp),$rndkey0 | ||
2313 | xorps $rndkey1,$inout3 | ||
2314 | movups 0x40($inp),$rndkey1 | ||
2315 | xorps $rndkey0,$inout4 | ||
2316 | movups 0x50($inp),$rndkey0 | ||
2317 | xorps $rndkey1,$inout5 | ||
2318 | movups 0x60($inp),$iv # IV | ||
2319 | xorps $rndkey0,$inout6 | ||
481 | movups $inout0,($out) | 2320 | movups $inout0,($out) |
482 | pxor $in1,$inout2 | ||
483 | movups $inout1,0x10($out) | 2321 | movups $inout1,0x10($out) |
484 | pxor $in2,$inout3 | ||
485 | movups $inout2,0x20($out) | 2322 | movups $inout2,0x20($out) |
486 | movaps $inout3,$inout0 | 2323 | movups $inout3,0x30($out) |
487 | lea 0x30($out),$out | 2324 | movups $inout4,0x40($out) |
2325 | movups $inout5,0x50($out) | ||
2326 | lea 0x60($out),$out | ||
2327 | movaps $inout6,$inout0 | ||
2328 | sub \$0x70,$len | ||
488 | jmp .Lcbc_dec_tail_collected | 2329 | jmp .Lcbc_dec_tail_collected |
489 | .align 16 | 2330 | .align 16 |
490 | .Lcbc_dec_one: | 2331 | .Lcbc_dec_one: |
491 | ___ | 2332 | ___ |
492 | &aesni_generate1("dec",$key,$rounds); | 2333 | &aesni_generate1("dec",$key,$rounds); |
493 | $code.=<<___; | 2334 | $code.=<<___; |
494 | pxor $iv,$inout0 | 2335 | xorps $iv,$inout0 |
495 | movaps $in0,$iv | 2336 | movaps $in0,$iv |
2337 | sub \$0x10,$len | ||
496 | jmp .Lcbc_dec_tail_collected | 2338 | jmp .Lcbc_dec_tail_collected |
497 | .align 16 | 2339 | .align 16 |
498 | .Lcbc_dec_two: | 2340 | .Lcbc_dec_two: |
2341 | xorps $inout2,$inout2 | ||
499 | call _aesni_decrypt3 | 2342 | call _aesni_decrypt3 |
500 | pxor $iv,$inout0 | 2343 | xorps $iv,$inout0 |
501 | pxor $in0,$inout1 | 2344 | xorps $in0,$inout1 |
502 | movups $inout0,($out) | 2345 | movups $inout0,($out) |
503 | movaps $in1,$iv | 2346 | movaps $in1,$iv |
504 | movaps $inout1,$inout0 | 2347 | movaps $inout1,$inout0 |
505 | lea 0x10($out),$out | 2348 | lea 0x10($out),$out |
2349 | sub \$0x20,$len | ||
506 | jmp .Lcbc_dec_tail_collected | 2350 | jmp .Lcbc_dec_tail_collected |
507 | .align 16 | 2351 | .align 16 |
508 | .Lcbc_dec_three: | 2352 | .Lcbc_dec_three: |
509 | call _aesni_decrypt3 | 2353 | call _aesni_decrypt3 |
510 | pxor $iv,$inout0 | 2354 | xorps $iv,$inout0 |
511 | pxor $in0,$inout1 | 2355 | xorps $in0,$inout1 |
512 | movups $inout0,($out) | 2356 | movups $inout0,($out) |
513 | pxor $in1,$inout2 | 2357 | xorps $in1,$inout2 |
514 | movups $inout1,0x10($out) | 2358 | movups $inout1,0x10($out) |
515 | movaps $in2,$iv | 2359 | movaps $in2,$iv |
516 | movaps $inout2,$inout0 | 2360 | movaps $inout2,$inout0 |
517 | lea 0x20($out),$out | 2361 | lea 0x20($out),$out |
2362 | sub \$0x30,$len | ||
2363 | jmp .Lcbc_dec_tail_collected | ||
2364 | .align 16 | ||
2365 | .Lcbc_dec_four: | ||
2366 | call _aesni_decrypt4 | ||
2367 | xorps $iv,$inout0 | ||
2368 | movups 0x30($inp),$iv | ||
2369 | xorps $in0,$inout1 | ||
2370 | movups $inout0,($out) | ||
2371 | xorps $in1,$inout2 | ||
2372 | movups $inout1,0x10($out) | ||
2373 | xorps $in2,$inout3 | ||
2374 | movups $inout2,0x20($out) | ||
2375 | movaps $inout3,$inout0 | ||
2376 | lea 0x30($out),$out | ||
2377 | sub \$0x40,$len | ||
2378 | jmp .Lcbc_dec_tail_collected | ||
2379 | .align 16 | ||
2380 | .Lcbc_dec_five: | ||
2381 | xorps $inout5,$inout5 | ||
2382 | call _aesni_decrypt6 | ||
2383 | movups 0x10($inp),$rndkey1 | ||
2384 | movups 0x20($inp),$rndkey0 | ||
2385 | xorps $iv,$inout0 | ||
2386 | xorps $in0,$inout1 | ||
2387 | xorps $rndkey1,$inout2 | ||
2388 | movups 0x30($inp),$rndkey1 | ||
2389 | xorps $rndkey0,$inout3 | ||
2390 | movups 0x40($inp),$iv | ||
2391 | xorps $rndkey1,$inout4 | ||
2392 | movups $inout0,($out) | ||
2393 | movups $inout1,0x10($out) | ||
2394 | movups $inout2,0x20($out) | ||
2395 | movups $inout3,0x30($out) | ||
2396 | lea 0x40($out),$out | ||
2397 | movaps $inout4,$inout0 | ||
2398 | sub \$0x50,$len | ||
2399 | jmp .Lcbc_dec_tail_collected | ||
2400 | .align 16 | ||
2401 | .Lcbc_dec_six: | ||
2402 | call _aesni_decrypt6 | ||
2403 | movups 0x10($inp),$rndkey1 | ||
2404 | movups 0x20($inp),$rndkey0 | ||
2405 | xorps $iv,$inout0 | ||
2406 | xorps $in0,$inout1 | ||
2407 | xorps $rndkey1,$inout2 | ||
2408 | movups 0x30($inp),$rndkey1 | ||
2409 | xorps $rndkey0,$inout3 | ||
2410 | movups 0x40($inp),$rndkey0 | ||
2411 | xorps $rndkey1,$inout4 | ||
2412 | movups 0x50($inp),$iv | ||
2413 | xorps $rndkey0,$inout5 | ||
2414 | movups $inout0,($out) | ||
2415 | movups $inout1,0x10($out) | ||
2416 | movups $inout2,0x20($out) | ||
2417 | movups $inout3,0x30($out) | ||
2418 | movups $inout4,0x40($out) | ||
2419 | lea 0x50($out),$out | ||
2420 | movaps $inout5,$inout0 | ||
2421 | sub \$0x60,$len | ||
518 | jmp .Lcbc_dec_tail_collected | 2422 | jmp .Lcbc_dec_tail_collected |
519 | .align 16 | 2423 | .align 16 |
520 | .Lcbc_dec_tail_collected: | 2424 | .Lcbc_dec_tail_collected: |
@@ -523,10 +2427,12 @@ $code.=<<___; | |||
523 | jnz .Lcbc_dec_tail_partial | 2427 | jnz .Lcbc_dec_tail_partial |
524 | movups $inout0,($out) | 2428 | movups $inout0,($out) |
525 | jmp .Lcbc_dec_ret | 2429 | jmp .Lcbc_dec_ret |
2430 | .align 16 | ||
526 | .Lcbc_dec_tail_partial: | 2431 | .Lcbc_dec_tail_partial: |
527 | movaps $inout0,$reserved(%rsp) | 2432 | movaps $inout0,$reserved(%rsp) |
2433 | mov \$16,%rcx | ||
528 | mov $out,%rdi | 2434 | mov $out,%rdi |
529 | mov $len,%rcx | 2435 | sub $len,%rcx |
530 | lea $reserved(%rsp),%rsi | 2436 | lea $reserved(%rsp),%rsi |
531 | .long 0x9066A4F3 # rep movsb | 2437 | .long 0x9066A4F3 # rep movsb |
532 | 2438 | ||
@@ -544,7 +2450,7 @@ $code.=<<___; | |||
544 | ret | 2450 | ret |
545 | .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt | 2451 | .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt |
546 | ___ | 2452 | ___ |
547 | 2453 | } | |
548 | # int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, | 2454 | # int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, |
549 | # int bits, AES_KEY *key) | 2455 | # int bits, AES_KEY *key) |
550 | { my ($inp,$bits,$key) = @_4args; | 2456 | { my ($inp,$bits,$key) = @_4args; |
@@ -556,7 +2462,7 @@ $code.=<<___; | |||
556 | .align 16 | 2462 | .align 16 |
557 | ${PREFIX}_set_decrypt_key: | 2463 | ${PREFIX}_set_decrypt_key: |
558 | .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 | 2464 | .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 |
559 | call _aesni_set_encrypt_key | 2465 | call __aesni_set_encrypt_key |
560 | shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key | 2466 | shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key |
561 | test %eax,%eax | 2467 | test %eax,%eax |
562 | jnz .Ldec_key_ret | 2468 | jnz .Ldec_key_ret |
@@ -576,9 +2482,9 @@ ${PREFIX}_set_decrypt_key: | |||
576 | aesimc %xmm1,%xmm1 | 2482 | aesimc %xmm1,%xmm1 |
577 | lea 16($key),$key | 2483 | lea 16($key),$key |
578 | lea -16($inp),$inp | 2484 | lea -16($inp),$inp |
579 | cmp $key,$inp | ||
580 | $movkey %xmm0,16($inp) | 2485 | $movkey %xmm0,16($inp) |
581 | $movkey %xmm1,-16($key) | 2486 | $movkey %xmm1,-16($key) |
2487 | cmp $key,$inp | ||
582 | ja .Ldec_key_inverse | 2488 | ja .Ldec_key_inverse |
583 | 2489 | ||
584 | $movkey ($key),%xmm0 # inverse middle | 2490 | $movkey ($key),%xmm0 # inverse middle |
@@ -605,16 +2511,16 @@ $code.=<<___; | |||
605 | .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent | 2511 | .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent |
606 | .align 16 | 2512 | .align 16 |
607 | ${PREFIX}_set_encrypt_key: | 2513 | ${PREFIX}_set_encrypt_key: |
608 | _aesni_set_encrypt_key: | 2514 | __aesni_set_encrypt_key: |
609 | .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 | 2515 | .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 |
610 | test $inp,$inp | ||
611 | mov \$-1,%rax | 2516 | mov \$-1,%rax |
2517 | test $inp,$inp | ||
612 | jz .Lenc_key_ret | 2518 | jz .Lenc_key_ret |
613 | test $key,$key | 2519 | test $key,$key |
614 | jz .Lenc_key_ret | 2520 | jz .Lenc_key_ret |
615 | 2521 | ||
616 | movups ($inp),%xmm0 # pull first 128 bits of *userKey | 2522 | movups ($inp),%xmm0 # pull first 128 bits of *userKey |
617 | pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0 | 2523 | xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 |
618 | lea 16($key),%rax | 2524 | lea 16($key),%rax |
619 | cmp \$256,$bits | 2525 | cmp \$256,$bits |
620 | je .L14rounds | 2526 | je .L14rounds |
@@ -729,11 +2635,11 @@ _aesni_set_encrypt_key: | |||
729 | lea 16(%rax),%rax | 2635 | lea 16(%rax),%rax |
730 | .Lkey_expansion_128_cold: | 2636 | .Lkey_expansion_128_cold: |
731 | shufps \$0b00010000,%xmm0,%xmm4 | 2637 | shufps \$0b00010000,%xmm0,%xmm4 |
732 | pxor %xmm4, %xmm0 | 2638 | xorps %xmm4, %xmm0 |
733 | shufps \$0b10001100,%xmm0,%xmm4 | 2639 | shufps \$0b10001100,%xmm0,%xmm4 |
734 | pxor %xmm4, %xmm0 | 2640 | xorps %xmm4, %xmm0 |
735 | pshufd \$0b11111111,%xmm1,%xmm1 # critical path | 2641 | shufps \$0b11111111,%xmm1,%xmm1 # critical path |
736 | pxor %xmm1,%xmm0 | 2642 | xorps %xmm1,%xmm0 |
737 | ret | 2643 | ret |
738 | 2644 | ||
739 | .align 16 | 2645 | .align 16 |
@@ -744,11 +2650,11 @@ _aesni_set_encrypt_key: | |||
744 | movaps %xmm2, %xmm5 | 2650 | movaps %xmm2, %xmm5 |
745 | .Lkey_expansion_192b_warm: | 2651 | .Lkey_expansion_192b_warm: |
746 | shufps \$0b00010000,%xmm0,%xmm4 | 2652 | shufps \$0b00010000,%xmm0,%xmm4 |
747 | movaps %xmm2,%xmm3 | 2653 | movdqa %xmm2,%xmm3 |
748 | pxor %xmm4,%xmm0 | 2654 | xorps %xmm4,%xmm0 |
749 | shufps \$0b10001100,%xmm0,%xmm4 | 2655 | shufps \$0b10001100,%xmm0,%xmm4 |
750 | pslldq \$4,%xmm3 | 2656 | pslldq \$4,%xmm3 |
751 | pxor %xmm4,%xmm0 | 2657 | xorps %xmm4,%xmm0 |
752 | pshufd \$0b01010101,%xmm1,%xmm1 # critical path | 2658 | pshufd \$0b01010101,%xmm1,%xmm1 # critical path |
753 | pxor %xmm3,%xmm2 | 2659 | pxor %xmm3,%xmm2 |
754 | pxor %xmm1,%xmm0 | 2660 | pxor %xmm1,%xmm0 |
@@ -772,11 +2678,11 @@ _aesni_set_encrypt_key: | |||
772 | lea 16(%rax),%rax | 2678 | lea 16(%rax),%rax |
773 | .Lkey_expansion_256a_cold: | 2679 | .Lkey_expansion_256a_cold: |
774 | shufps \$0b00010000,%xmm0,%xmm4 | 2680 | shufps \$0b00010000,%xmm0,%xmm4 |
775 | pxor %xmm4,%xmm0 | 2681 | xorps %xmm4,%xmm0 |
776 | shufps \$0b10001100,%xmm0,%xmm4 | 2682 | shufps \$0b10001100,%xmm0,%xmm4 |
777 | pxor %xmm4,%xmm0 | 2683 | xorps %xmm4,%xmm0 |
778 | pshufd \$0b11111111,%xmm1,%xmm1 # critical path | 2684 | shufps \$0b11111111,%xmm1,%xmm1 # critical path |
779 | pxor %xmm1,%xmm0 | 2685 | xorps %xmm1,%xmm0 |
780 | ret | 2686 | ret |
781 | 2687 | ||
782 | .align 16 | 2688 | .align 16 |
@@ -785,17 +2691,28 @@ _aesni_set_encrypt_key: | |||
785 | lea 16(%rax),%rax | 2691 | lea 16(%rax),%rax |
786 | 2692 | ||
787 | shufps \$0b00010000,%xmm2,%xmm4 | 2693 | shufps \$0b00010000,%xmm2,%xmm4 |
788 | pxor %xmm4,%xmm2 | 2694 | xorps %xmm4,%xmm2 |
789 | shufps \$0b10001100,%xmm2,%xmm4 | 2695 | shufps \$0b10001100,%xmm2,%xmm4 |
790 | pxor %xmm4,%xmm2 | 2696 | xorps %xmm4,%xmm2 |
791 | pshufd \$0b10101010,%xmm1,%xmm1 # critical path | 2697 | shufps \$0b10101010,%xmm1,%xmm1 # critical path |
792 | pxor %xmm1,%xmm2 | 2698 | xorps %xmm1,%xmm2 |
793 | ret | 2699 | ret |
794 | .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key | 2700 | .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key |
2701 | .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key | ||
795 | ___ | 2702 | ___ |
796 | } | 2703 | } |
797 | 2704 | ||
798 | $code.=<<___; | 2705 | $code.=<<___; |
2706 | .align 64 | ||
2707 | .Lbswap_mask: | ||
2708 | .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 | ||
2709 | .Lincrement32: | ||
2710 | .long 6,6,6,0 | ||
2711 | .Lincrement64: | ||
2712 | .long 1,0,0,0 | ||
2713 | .Lxts_magic: | ||
2714 | .long 0x87,0,1,0 | ||
2715 | |||
799 | .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" | 2716 | .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" |
800 | .align 64 | 2717 | .align 64 |
801 | ___ | 2718 | ___ |
@@ -810,9 +2727,11 @@ $disp="%r9"; | |||
810 | 2727 | ||
811 | $code.=<<___; | 2728 | $code.=<<___; |
812 | .extern __imp_RtlVirtualUnwind | 2729 | .extern __imp_RtlVirtualUnwind |
813 | .type cbc_se_handler,\@abi-omnipotent | 2730 | ___ |
2731 | $code.=<<___ if ($PREFIX eq "aesni"); | ||
2732 | .type ecb_se_handler,\@abi-omnipotent | ||
814 | .align 16 | 2733 | .align 16 |
815 | cbc_se_handler: | 2734 | ecb_se_handler: |
816 | push %rsi | 2735 | push %rsi |
817 | push %rdi | 2736 | push %rdi |
818 | push %rbx | 2737 | push %rbx |
@@ -825,42 +2744,132 @@ cbc_se_handler: | |||
825 | sub \$64,%rsp | 2744 | sub \$64,%rsp |
826 | 2745 | ||
827 | mov 152($context),%rax # pull context->Rsp | 2746 | mov 152($context),%rax # pull context->Rsp |
2747 | |||
2748 | jmp .Lcommon_seh_tail | ||
2749 | .size ecb_se_handler,.-ecb_se_handler | ||
2750 | |||
2751 | .type ccm64_se_handler,\@abi-omnipotent | ||
2752 | .align 16 | ||
2753 | ccm64_se_handler: | ||
2754 | push %rsi | ||
2755 | push %rdi | ||
2756 | push %rbx | ||
2757 | push %rbp | ||
2758 | push %r12 | ||
2759 | push %r13 | ||
2760 | push %r14 | ||
2761 | push %r15 | ||
2762 | pushfq | ||
2763 | sub \$64,%rsp | ||
2764 | |||
2765 | mov 120($context),%rax # pull context->Rax | ||
828 | mov 248($context),%rbx # pull context->Rip | 2766 | mov 248($context),%rbx # pull context->Rip |
829 | 2767 | ||
830 | lea .Lcbc_decrypt(%rip),%r10 | 2768 | mov 8($disp),%rsi # disp->ImageBase |
831 | cmp %r10,%rbx # context->Rip<"prologue" label | 2769 | mov 56($disp),%r11 # disp->HandlerData |
832 | jb .Lin_prologue | ||
833 | 2770 | ||
834 | lea .Lcbc_decrypt_body(%rip),%r10 | 2771 | mov 0(%r11),%r10d # HandlerData[0] |
835 | cmp %r10,%rbx # context->Rip<cbc_decrypt_body | 2772 | lea (%rsi,%r10),%r10 # prologue label |
836 | jb .Lrestore_rax | 2773 | cmp %r10,%rbx # context->Rip<prologue label |
2774 | jb .Lcommon_seh_tail | ||
837 | 2775 | ||
838 | lea .Lcbc_ret(%rip),%r10 | 2776 | mov 152($context),%rax # pull context->Rsp |
839 | cmp %r10,%rbx # context->Rip>="epilogue" label | ||
840 | jae .Lin_prologue | ||
841 | 2777 | ||
842 | lea 0(%rax),%rsi # top of stack | 2778 | mov 4(%r11),%r10d # HandlerData[1] |
2779 | lea (%rsi,%r10),%r10 # epilogue label | ||
2780 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2781 | jae .Lcommon_seh_tail | ||
2782 | |||
2783 | lea 0(%rax),%rsi # %xmm save area | ||
843 | lea 512($context),%rdi # &context.Xmm6 | 2784 | lea 512($context),%rdi # &context.Xmm6 |
844 | mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) | 2785 | mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) |
845 | .long 0xa548f3fc # cld; rep movsq | 2786 | .long 0xa548f3fc # cld; rep movsq |
846 | lea 0x58(%rax),%rax # adjust stack pointer | 2787 | lea 0x58(%rax),%rax # adjust stack pointer |
847 | jmp .Lin_prologue | ||
848 | 2788 | ||
849 | .Lrestore_rax: | 2789 | jmp .Lcommon_seh_tail |
850 | mov 120($context),%rax | 2790 | .size ccm64_se_handler,.-ccm64_se_handler |
851 | .Lin_prologue: | ||
852 | mov 8(%rax),%rdi | ||
853 | mov 16(%rax),%rsi | ||
854 | mov %rax,152($context) # restore context->Rsp | ||
855 | mov %rsi,168($context) # restore context->Rsi | ||
856 | mov %rdi,176($context) # restore context->Rdi | ||
857 | 2791 | ||
858 | jmp .Lcommon_seh_exit | 2792 | .type ctr32_se_handler,\@abi-omnipotent |
859 | .size cbc_se_handler,.-cbc_se_handler | 2793 | .align 16 |
2794 | ctr32_se_handler: | ||
2795 | push %rsi | ||
2796 | push %rdi | ||
2797 | push %rbx | ||
2798 | push %rbp | ||
2799 | push %r12 | ||
2800 | push %r13 | ||
2801 | push %r14 | ||
2802 | push %r15 | ||
2803 | pushfq | ||
2804 | sub \$64,%rsp | ||
860 | 2805 | ||
861 | .type ecb_se_handler,\@abi-omnipotent | 2806 | mov 120($context),%rax # pull context->Rax |
2807 | mov 248($context),%rbx # pull context->Rip | ||
2808 | |||
2809 | lea .Lctr32_body(%rip),%r10 | ||
2810 | cmp %r10,%rbx # context->Rip<"prologue" label | ||
2811 | jb .Lcommon_seh_tail | ||
2812 | |||
2813 | mov 152($context),%rax # pull context->Rsp | ||
2814 | |||
2815 | lea .Lctr32_ret(%rip),%r10 | ||
2816 | cmp %r10,%rbx | ||
2817 | jae .Lcommon_seh_tail | ||
2818 | |||
2819 | lea 0x20(%rax),%rsi # %xmm save area | ||
2820 | lea 512($context),%rdi # &context.Xmm6 | ||
2821 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
2822 | .long 0xa548f3fc # cld; rep movsq | ||
2823 | lea 0xc8(%rax),%rax # adjust stack pointer | ||
2824 | |||
2825 | jmp .Lcommon_seh_tail | ||
2826 | .size ctr32_se_handler,.-ctr32_se_handler | ||
2827 | |||
2828 | .type xts_se_handler,\@abi-omnipotent | ||
862 | .align 16 | 2829 | .align 16 |
863 | ecb_se_handler: | 2830 | xts_se_handler: |
2831 | push %rsi | ||
2832 | push %rdi | ||
2833 | push %rbx | ||
2834 | push %rbp | ||
2835 | push %r12 | ||
2836 | push %r13 | ||
2837 | push %r14 | ||
2838 | push %r15 | ||
2839 | pushfq | ||
2840 | sub \$64,%rsp | ||
2841 | |||
2842 | mov 120($context),%rax # pull context->Rax | ||
2843 | mov 248($context),%rbx # pull context->Rip | ||
2844 | |||
2845 | mov 8($disp),%rsi # disp->ImageBase | ||
2846 | mov 56($disp),%r11 # disp->HandlerData | ||
2847 | |||
2848 | mov 0(%r11),%r10d # HandlerData[0] | ||
2849 | lea (%rsi,%r10),%r10 # prologue label | ||
2850 | cmp %r10,%rbx # context->Rip<prologue label | ||
2851 | jb .Lcommon_seh_tail | ||
2852 | |||
2853 | mov 152($context),%rax # pull context->Rsp | ||
2854 | |||
2855 | mov 4(%r11),%r10d # HandlerData[1] | ||
2856 | lea (%rsi,%r10),%r10 # epilogue label | ||
2857 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2858 | jae .Lcommon_seh_tail | ||
2859 | |||
2860 | lea 0x60(%rax),%rsi # %xmm save area | ||
2861 | lea 512($context),%rdi # &context.Xmm6 | ||
2862 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
2863 | .long 0xa548f3fc # cld; rep movsq | ||
2864 | lea 0x68+160(%rax),%rax # adjust stack pointer | ||
2865 | |||
2866 | jmp .Lcommon_seh_tail | ||
2867 | .size xts_se_handler,.-xts_se_handler | ||
2868 | ___ | ||
2869 | $code.=<<___; | ||
2870 | .type cbc_se_handler,\@abi-omnipotent | ||
2871 | .align 16 | ||
2872 | cbc_se_handler: | ||
864 | push %rsi | 2873 | push %rsi |
865 | push %rdi | 2874 | push %rdi |
866 | push %rbx | 2875 | push %rbx |
@@ -873,13 +2882,37 @@ ecb_se_handler: | |||
873 | sub \$64,%rsp | 2882 | sub \$64,%rsp |
874 | 2883 | ||
875 | mov 152($context),%rax # pull context->Rsp | 2884 | mov 152($context),%rax # pull context->Rsp |
2885 | mov 248($context),%rbx # pull context->Rip | ||
2886 | |||
2887 | lea .Lcbc_decrypt(%rip),%r10 | ||
2888 | cmp %r10,%rbx # context->Rip<"prologue" label | ||
2889 | jb .Lcommon_seh_tail | ||
2890 | |||
2891 | lea .Lcbc_decrypt_body(%rip),%r10 | ||
2892 | cmp %r10,%rbx # context->Rip<cbc_decrypt_body | ||
2893 | jb .Lrestore_cbc_rax | ||
2894 | |||
2895 | lea .Lcbc_ret(%rip),%r10 | ||
2896 | cmp %r10,%rbx # context->Rip>="epilogue" label | ||
2897 | jae .Lcommon_seh_tail | ||
2898 | |||
2899 | lea 0(%rax),%rsi # top of stack | ||
2900 | lea 512($context),%rdi # &context.Xmm6 | ||
2901 | mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) | ||
2902 | .long 0xa548f3fc # cld; rep movsq | ||
2903 | lea 0x58(%rax),%rax # adjust stack pointer | ||
2904 | jmp .Lcommon_seh_tail | ||
2905 | |||
2906 | .Lrestore_cbc_rax: | ||
2907 | mov 120($context),%rax | ||
2908 | |||
2909 | .Lcommon_seh_tail: | ||
876 | mov 8(%rax),%rdi | 2910 | mov 8(%rax),%rdi |
877 | mov 16(%rax),%rsi | 2911 | mov 16(%rax),%rsi |
2912 | mov %rax,152($context) # restore context->Rsp | ||
878 | mov %rsi,168($context) # restore context->Rsi | 2913 | mov %rsi,168($context) # restore context->Rsi |
879 | mov %rdi,176($context) # restore context->Rdi | 2914 | mov %rdi,176($context) # restore context->Rdi |
880 | 2915 | ||
881 | .Lcommon_seh_exit: | ||
882 | |||
883 | mov 40($disp),%rdi # disp->ContextRecord | 2916 | mov 40($disp),%rdi # disp->ContextRecord |
884 | mov $context,%rsi # context | 2917 | mov $context,%rsi # context |
885 | mov \$154,%ecx # sizeof(CONTEXT) | 2918 | mov \$154,%ecx # sizeof(CONTEXT) |
@@ -915,10 +2948,33 @@ ecb_se_handler: | |||
915 | 2948 | ||
916 | .section .pdata | 2949 | .section .pdata |
917 | .align 4 | 2950 | .align 4 |
918 | .rva .LSEH_begin_${PREFIX}_ecb_encrypt | 2951 | ___ |
919 | .rva .LSEH_end_${PREFIX}_ecb_encrypt | 2952 | $code.=<<___ if ($PREFIX eq "aesni"); |
2953 | .rva .LSEH_begin_aesni_ecb_encrypt | ||
2954 | .rva .LSEH_end_aesni_ecb_encrypt | ||
920 | .rva .LSEH_info_ecb | 2955 | .rva .LSEH_info_ecb |
921 | 2956 | ||
2957 | .rva .LSEH_begin_aesni_ccm64_encrypt_blocks | ||
2958 | .rva .LSEH_end_aesni_ccm64_encrypt_blocks | ||
2959 | .rva .LSEH_info_ccm64_enc | ||
2960 | |||
2961 | .rva .LSEH_begin_aesni_ccm64_decrypt_blocks | ||
2962 | .rva .LSEH_end_aesni_ccm64_decrypt_blocks | ||
2963 | .rva .LSEH_info_ccm64_dec | ||
2964 | |||
2965 | .rva .LSEH_begin_aesni_ctr32_encrypt_blocks | ||
2966 | .rva .LSEH_end_aesni_ctr32_encrypt_blocks | ||
2967 | .rva .LSEH_info_ctr32 | ||
2968 | |||
2969 | .rva .LSEH_begin_aesni_xts_encrypt | ||
2970 | .rva .LSEH_end_aesni_xts_encrypt | ||
2971 | .rva .LSEH_info_xts_enc | ||
2972 | |||
2973 | .rva .LSEH_begin_aesni_xts_decrypt | ||
2974 | .rva .LSEH_end_aesni_xts_decrypt | ||
2975 | .rva .LSEH_info_xts_dec | ||
2976 | ___ | ||
2977 | $code.=<<___; | ||
922 | .rva .LSEH_begin_${PREFIX}_cbc_encrypt | 2978 | .rva .LSEH_begin_${PREFIX}_cbc_encrypt |
923 | .rva .LSEH_end_${PREFIX}_cbc_encrypt | 2979 | .rva .LSEH_end_${PREFIX}_cbc_encrypt |
924 | .rva .LSEH_info_cbc | 2980 | .rva .LSEH_info_cbc |
@@ -932,28 +2988,49 @@ ecb_se_handler: | |||
932 | .rva .LSEH_info_key | 2988 | .rva .LSEH_info_key |
933 | .section .xdata | 2989 | .section .xdata |
934 | .align 8 | 2990 | .align 8 |
2991 | ___ | ||
2992 | $code.=<<___ if ($PREFIX eq "aesni"); | ||
935 | .LSEH_info_ecb: | 2993 | .LSEH_info_ecb: |
936 | .byte 9,0,0,0 | 2994 | .byte 9,0,0,0 |
937 | .rva ecb_se_handler | 2995 | .rva ecb_se_handler |
2996 | .LSEH_info_ccm64_enc: | ||
2997 | .byte 9,0,0,0 | ||
2998 | .rva ccm64_se_handler | ||
2999 | .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] | ||
3000 | .LSEH_info_ccm64_dec: | ||
3001 | .byte 9,0,0,0 | ||
3002 | .rva ccm64_se_handler | ||
3003 | .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] | ||
3004 | .LSEH_info_ctr32: | ||
3005 | .byte 9,0,0,0 | ||
3006 | .rva ctr32_se_handler | ||
3007 | .LSEH_info_xts_enc: | ||
3008 | .byte 9,0,0,0 | ||
3009 | .rva xts_se_handler | ||
3010 | .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] | ||
3011 | .LSEH_info_xts_dec: | ||
3012 | .byte 9,0,0,0 | ||
3013 | .rva xts_se_handler | ||
3014 | .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] | ||
3015 | ___ | ||
3016 | $code.=<<___; | ||
938 | .LSEH_info_cbc: | 3017 | .LSEH_info_cbc: |
939 | .byte 9,0,0,0 | 3018 | .byte 9,0,0,0 |
940 | .rva cbc_se_handler | 3019 | .rva cbc_se_handler |
941 | .LSEH_info_key: | 3020 | .LSEH_info_key: |
942 | .byte 0x01,0x04,0x01,0x00 | 3021 | .byte 0x01,0x04,0x01,0x00 |
943 | .byte 0x04,0x02,0x00,0x00 | 3022 | .byte 0x04,0x02,0x00,0x00 # sub rsp,8 |
944 | ___ | 3023 | ___ |
945 | } | 3024 | } |
946 | 3025 | ||
947 | sub rex { | 3026 | sub rex { |
948 | local *opcode=shift; | 3027 | local *opcode=shift; |
949 | my ($dst,$src)=@_; | 3028 | my ($dst,$src)=@_; |
950 | 3029 | my $rex=0; | |
951 | if ($dst>=8 || $src>=8) { | 3030 | |
952 | $rex=0x40; | 3031 | $rex|=0x04 if($dst>=8); |
953 | $rex|=0x04 if($dst>=8); | 3032 | $rex|=0x01 if($src>=8); |
954 | $rex|=0x01 if($src>=8); | 3033 | push @opcode,$rex|0x40 if($rex); |
955 | push @opcode,$rex; | ||
956 | } | ||
957 | } | 3034 | } |
958 | 3035 | ||
959 | sub aesni { | 3036 | sub aesni { |
@@ -989,4 +3066,3 @@ $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; | |||
989 | print $code; | 3066 | print $code; |
990 | 3067 | ||
991 | close STDOUT; | 3068 | close STDOUT; |
992 | |||