diff options
Diffstat (limited to 'src/lib/libcrypto/aes/asm')
-rw-r--r-- | src/lib/libcrypto/aes/asm/aes-586.pl | 30 | ||||
-rwxr-xr-x | src/lib/libcrypto/aes/asm/aes-x86_64.pl | 90 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/asm/bsaes-x86_64.pl | 3123 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/asm/vpaes-x86.pl | 911 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/asm/vpaes-x86_64.pl | 1222 |
5 files changed, 60 insertions, 5316 deletions
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl index 364099d4d3..402a1a3c46 100644 --- a/src/lib/libcrypto/aes/asm/aes-586.pl +++ b/src/lib/libcrypto/aes/asm/aes-586.pl | |||
@@ -1158,8 +1158,8 @@ sub enclast() | |||
1158 | &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000); | 1158 | &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000); |
1159 | &previous(); | 1159 | &previous(); |
1160 | 1160 | ||
1161 | # void aes_encrypt_internal(const void *inp, void *out, const AES_KEY *key); | 1161 | # void aes_encrypt_generic(const void *inp, void *out, const AES_KEY *key); |
1162 | &function_begin("aes_encrypt_internal"); | 1162 | &function_begin("aes_encrypt_generic"); |
1163 | &mov ($acc,&wparam(0)); # load inp | 1163 | &mov ($acc,&wparam(0)); # load inp |
1164 | &mov ($key,&wparam(2)); # load key | 1164 | &mov ($key,&wparam(2)); # load key |
1165 | 1165 | ||
@@ -1213,7 +1213,7 @@ sub enclast() | |||
1213 | &mov (&DWP(4,$acc),$s1); | 1213 | &mov (&DWP(4,$acc),$s1); |
1214 | &mov (&DWP(8,$acc),$s2); | 1214 | &mov (&DWP(8,$acc),$s2); |
1215 | &mov (&DWP(12,$acc),$s3); | 1215 | &mov (&DWP(12,$acc),$s3); |
1216 | &function_end("aes_encrypt_internal"); | 1216 | &function_end("aes_encrypt_generic"); |
1217 | 1217 | ||
1218 | #--------------------------------------------------------------------# | 1218 | #--------------------------------------------------------------------# |
1219 | 1219 | ||
@@ -1947,8 +1947,8 @@ sub declast() | |||
1947 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | 1947 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); |
1948 | &previous(); | 1948 | &previous(); |
1949 | 1949 | ||
1950 | # void aes_decrypt_internal(const void *inp, void *out, const AES_KEY *key); | 1950 | # void aes_decrypt_generic(const void *inp, void *out, const AES_KEY *key); |
1951 | &function_begin("aes_decrypt_internal"); | 1951 | &function_begin("aes_decrypt_generic"); |
1952 | &mov ($acc,&wparam(0)); # load inp | 1952 | &mov ($acc,&wparam(0)); # load inp |
1953 | &mov ($key,&wparam(2)); # load key | 1953 | &mov ($key,&wparam(2)); # load key |
1954 | 1954 | ||
@@ -2002,9 +2002,9 @@ sub declast() | |||
2002 | &mov (&DWP(4,$acc),$s1); | 2002 | &mov (&DWP(4,$acc),$s1); |
2003 | &mov (&DWP(8,$acc),$s2); | 2003 | &mov (&DWP(8,$acc),$s2); |
2004 | &mov (&DWP(12,$acc),$s3); | 2004 | &mov (&DWP(12,$acc),$s3); |
2005 | &function_end("aes_decrypt_internal"); | 2005 | &function_end("aes_decrypt_generic"); |
2006 | 2006 | ||
2007 | # void aes_cbc_encrypt_internal(const void char *inp, unsigned char *out, | 2007 | # void aes_cbc_encrypt_generic(const void char *inp, unsigned char *out, |
2008 | # size_t length, const AES_KEY *key, unsigned char *ivp,const int enc); | 2008 | # size_t length, const AES_KEY *key, unsigned char *ivp,const int enc); |
2009 | { | 2009 | { |
2010 | # stack frame layout | 2010 | # stack frame layout |
@@ -2028,7 +2028,7 @@ my $ivec=&DWP(60,"esp"); # ivec[16] | |||
2028 | my $aes_key=&DWP(76,"esp"); # copy of aes_key | 2028 | my $aes_key=&DWP(76,"esp"); # copy of aes_key |
2029 | my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds | 2029 | my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds |
2030 | 2030 | ||
2031 | &function_begin("aes_cbc_encrypt_internal"); | 2031 | &function_begin("aes_cbc_encrypt_generic"); |
2032 | &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len | 2032 | &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len |
2033 | &cmp ($s2,0); | 2033 | &cmp ($s2,0); |
2034 | &je (&label("drop_out")); | 2034 | &je (&label("drop_out")); |
@@ -2616,7 +2616,7 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds | |||
2616 | 2616 | ||
2617 | &mov ("esp",$_esp); | 2617 | &mov ("esp",$_esp); |
2618 | &popf (); | 2618 | &popf (); |
2619 | &function_end("aes_cbc_encrypt_internal"); | 2619 | &function_end("aes_cbc_encrypt_generic"); |
2620 | } | 2620 | } |
2621 | 2621 | ||
2622 | #------------------------------------------------------------------# | 2622 | #------------------------------------------------------------------# |
@@ -2849,12 +2849,12 @@ sub enckey() | |||
2849 | &set_label("exit"); | 2849 | &set_label("exit"); |
2850 | &function_end("_x86_AES_set_encrypt_key"); | 2850 | &function_end("_x86_AES_set_encrypt_key"); |
2851 | 2851 | ||
2852 | # int aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits, | 2852 | # int aes_set_encrypt_key_generic(const unsigned char *userKey, const int bits, |
2853 | # AES_KEY *key) | 2853 | # AES_KEY *key) |
2854 | &function_begin_B("aes_set_encrypt_key_internal"); | 2854 | &function_begin_B("aes_set_encrypt_key_generic"); |
2855 | &call ("_x86_AES_set_encrypt_key"); | 2855 | &call ("_x86_AES_set_encrypt_key"); |
2856 | &ret (); | 2856 | &ret (); |
2857 | &function_end_B("aes_set_encrypt_key_internal"); | 2857 | &function_end_B("aes_set_encrypt_key_generic"); |
2858 | 2858 | ||
2859 | sub deckey() | 2859 | sub deckey() |
2860 | { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; | 2860 | { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; |
@@ -2911,9 +2911,9 @@ sub deckey() | |||
2911 | &mov (&DWP(4*$i,$key),$tp1); | 2911 | &mov (&DWP(4*$i,$key),$tp1); |
2912 | } | 2912 | } |
2913 | 2913 | ||
2914 | # int aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits, | 2914 | # int aes_set_decrypt_key_generic(const unsigned char *userKey, const int bits, |
2915 | # AES_KEY *key) | 2915 | # AES_KEY *key) |
2916 | &function_begin_B("aes_set_decrypt_key_internal"); | 2916 | &function_begin_B("aes_set_decrypt_key_generic"); |
2917 | &call ("_x86_AES_set_encrypt_key"); | 2917 | &call ("_x86_AES_set_encrypt_key"); |
2918 | &cmp ("eax",0); | 2918 | &cmp ("eax",0); |
2919 | &je (&label("proceed")); | 2919 | &je (&label("proceed")); |
@@ -2969,6 +2969,6 @@ sub deckey() | |||
2969 | &jb (&label("permute")); | 2969 | &jb (&label("permute")); |
2970 | 2970 | ||
2971 | &xor ("eax","eax"); # return success | 2971 | &xor ("eax","eax"); # return success |
2972 | &function_end("aes_set_decrypt_key_internal"); | 2972 | &function_end("aes_set_decrypt_key_generic"); |
2973 | 2973 | ||
2974 | &asm_finish(); | 2974 | &asm_finish(); |
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl index 324c4a2be2..2c73627546 100755 --- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl +++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl | |||
@@ -586,15 +586,15 @@ $code.=<<___; | |||
586 | .size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact | 586 | .size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact |
587 | ___ | 587 | ___ |
588 | 588 | ||
589 | # void aes_encrypt_internal(const void *inp, void *out, const AES_KEY *key); | 589 | # void aes_encrypt_generic(const void *inp, void *out, const AES_KEY *key); |
590 | $code.=<<___; | 590 | $code.=<<___; |
591 | .globl aes_encrypt_internal | 591 | .globl aes_encrypt_generic |
592 | .type aes_encrypt_internal,\@function,3 | 592 | .type aes_encrypt_generic,\@function,3 |
593 | .align 16 | 593 | .align 16 |
594 | .globl asm_AES_encrypt | 594 | .globl asm_AES_encrypt |
595 | .hidden asm_AES_encrypt | 595 | .hidden asm_AES_encrypt |
596 | asm_AES_encrypt: | 596 | asm_AES_encrypt: |
597 | aes_encrypt_internal: | 597 | aes_encrypt_generic: |
598 | _CET_ENDBR | 598 | _CET_ENDBR |
599 | push %rbx | 599 | push %rbx |
600 | push %rbp | 600 | push %rbp |
@@ -655,7 +655,7 @@ aes_encrypt_internal: | |||
655 | lea 48(%rsi),%rsp | 655 | lea 48(%rsi),%rsp |
656 | .Lenc_epilogue: | 656 | .Lenc_epilogue: |
657 | ret | 657 | ret |
658 | .size aes_encrypt_internal,.-aes_encrypt_internal | 658 | .size aes_encrypt_generic,.-aes_encrypt_generic |
659 | ___ | 659 | ___ |
660 | 660 | ||
661 | #------------------------------------------------------------------# | 661 | #------------------------------------------------------------------# |
@@ -1188,15 +1188,15 @@ $code.=<<___; | |||
1188 | .size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact | 1188 | .size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact |
1189 | ___ | 1189 | ___ |
1190 | 1190 | ||
1191 | # void aes_decrypt_internal(const void *inp, void *out, const AES_KEY *key); | 1191 | # void aes_decrypt_generic(const void *inp, void *out, const AES_KEY *key); |
1192 | $code.=<<___; | 1192 | $code.=<<___; |
1193 | .globl aes_decrypt_internal | 1193 | .globl aes_decrypt_generic |
1194 | .type aes_decrypt_internal,\@function,3 | 1194 | .type aes_decrypt_generic,\@function,3 |
1195 | .align 16 | 1195 | .align 16 |
1196 | .globl asm_AES_decrypt | 1196 | .globl asm_AES_decrypt |
1197 | .hidden asm_AES_decrypt | 1197 | .hidden asm_AES_decrypt |
1198 | asm_AES_decrypt: | 1198 | asm_AES_decrypt: |
1199 | aes_decrypt_internal: | 1199 | aes_decrypt_generic: |
1200 | _CET_ENDBR | 1200 | _CET_ENDBR |
1201 | push %rbx | 1201 | push %rbx |
1202 | push %rbp | 1202 | push %rbp |
@@ -1259,7 +1259,7 @@ aes_decrypt_internal: | |||
1259 | lea 48(%rsi),%rsp | 1259 | lea 48(%rsi),%rsp |
1260 | .Ldec_epilogue: | 1260 | .Ldec_epilogue: |
1261 | ret | 1261 | ret |
1262 | .size aes_decrypt_internal,.-aes_decrypt_internal | 1262 | .size aes_decrypt_generic,.-aes_decrypt_generic |
1263 | ___ | 1263 | ___ |
1264 | #------------------------------------------------------------------# | 1264 | #------------------------------------------------------------------# |
1265 | 1265 | ||
@@ -1290,13 +1290,13 @@ $code.=<<___; | |||
1290 | ___ | 1290 | ___ |
1291 | } | 1291 | } |
1292 | 1292 | ||
1293 | # int aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits, | 1293 | # int aes_set_encrypt_key_generic(const unsigned char *userKey, const int bits, |
1294 | # AES_KEY *key) | 1294 | # AES_KEY *key) |
1295 | $code.=<<___; | 1295 | $code.=<<___; |
1296 | .globl aes_set_encrypt_key_internal | 1296 | .globl aes_set_encrypt_key_generic |
1297 | .type aes_set_encrypt_key_internal,\@function,3 | 1297 | .type aes_set_encrypt_key_generic,\@function,3 |
1298 | .align 16 | 1298 | .align 16 |
1299 | aes_set_encrypt_key_internal: | 1299 | aes_set_encrypt_key_generic: |
1300 | _CET_ENDBR | 1300 | _CET_ENDBR |
1301 | push %rbx | 1301 | push %rbx |
1302 | push %rbp | 1302 | push %rbp |
@@ -1318,7 +1318,7 @@ aes_set_encrypt_key_internal: | |||
1318 | add \$56,%rsp | 1318 | add \$56,%rsp |
1319 | .Lenc_key_epilogue: | 1319 | .Lenc_key_epilogue: |
1320 | ret | 1320 | ret |
1321 | .size aes_set_encrypt_key_internal,.-aes_set_encrypt_key_internal | 1321 | .size aes_set_encrypt_key_generic,.-aes_set_encrypt_key_generic |
1322 | 1322 | ||
1323 | .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent | 1323 | .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent |
1324 | .align 16 | 1324 | .align 16 |
@@ -1562,13 +1562,13 @@ $code.=<<___; | |||
1562 | ___ | 1562 | ___ |
1563 | } | 1563 | } |
1564 | 1564 | ||
1565 | # int aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits, | 1565 | # int aes_set_decrypt_key_generic(const unsigned char *userKey, const int bits, |
1566 | # AES_KEY *key) | 1566 | # AES_KEY *key) |
1567 | $code.=<<___; | 1567 | $code.=<<___; |
1568 | .globl aes_set_decrypt_key_internal | 1568 | .globl aes_set_decrypt_key_generic |
1569 | .type aes_set_decrypt_key_internal,\@function,3 | 1569 | .type aes_set_decrypt_key_generic,\@function,3 |
1570 | .align 16 | 1570 | .align 16 |
1571 | aes_set_decrypt_key_internal: | 1571 | aes_set_decrypt_key_generic: |
1572 | _CET_ENDBR | 1572 | _CET_ENDBR |
1573 | push %rbx | 1573 | push %rbx |
1574 | push %rbp | 1574 | push %rbp |
@@ -1638,10 +1638,10 @@ $code.=<<___; | |||
1638 | add \$56,%rsp | 1638 | add \$56,%rsp |
1639 | .Ldec_key_epilogue: | 1639 | .Ldec_key_epilogue: |
1640 | ret | 1640 | ret |
1641 | .size aes_set_decrypt_key_internal,.-aes_set_decrypt_key_internal | 1641 | .size aes_set_decrypt_key_generic,.-aes_set_decrypt_key_generic |
1642 | ___ | 1642 | ___ |
1643 | 1643 | ||
1644 | # void aes_cbc_encrypt_internal(const void char *inp, unsigned char *out, | 1644 | # void aes_cbc_encrypt_generic(const void char *inp, unsigned char *out, |
1645 | # size_t length, const AES_KEY *key, unsigned char *ivp,const int enc); | 1645 | # size_t length, const AES_KEY *key, unsigned char *ivp,const int enc); |
1646 | { | 1646 | { |
1647 | # stack frame layout | 1647 | # stack frame layout |
@@ -1659,15 +1659,15 @@ my $aes_key="80(%rsp)"; # copy of aes_key | |||
1659 | my $mark="80+240(%rsp)"; # copy of aes_key->rounds | 1659 | my $mark="80+240(%rsp)"; # copy of aes_key->rounds |
1660 | 1660 | ||
1661 | $code.=<<___; | 1661 | $code.=<<___; |
1662 | .globl aes_cbc_encrypt_internal | 1662 | .globl aes_cbc_encrypt_generic |
1663 | .type aes_cbc_encrypt_internal,\@function,6 | 1663 | .type aes_cbc_encrypt_generic,\@function,6 |
1664 | .align 16 | 1664 | .align 16 |
1665 | .extern OPENSSL_ia32cap_P | 1665 | .extern OPENSSL_ia32cap_P |
1666 | .hidden OPENSSL_ia32cap_P | 1666 | .hidden OPENSSL_ia32cap_P |
1667 | .globl asm_AES_cbc_encrypt | 1667 | .globl asm_AES_cbc_encrypt |
1668 | .hidden asm_AES_cbc_encrypt | 1668 | .hidden asm_AES_cbc_encrypt |
1669 | asm_AES_cbc_encrypt: | 1669 | asm_AES_cbc_encrypt: |
1670 | aes_cbc_encrypt_internal: | 1670 | aes_cbc_encrypt_generic: |
1671 | _CET_ENDBR | 1671 | _CET_ENDBR |
1672 | cmp \$0,%rdx # check length | 1672 | cmp \$0,%rdx # check length |
1673 | je .Lcbc_epilogue | 1673 | je .Lcbc_epilogue |
@@ -2117,7 +2117,7 @@ aes_cbc_encrypt_internal: | |||
2117 | popfq | 2117 | popfq |
2118 | .Lcbc_epilogue: | 2118 | .Lcbc_epilogue: |
2119 | ret | 2119 | ret |
2120 | .size aes_cbc_encrypt_internal,.-aes_cbc_encrypt_internal | 2120 | .size aes_cbc_encrypt_generic,.-aes_cbc_encrypt_generic |
2121 | ___ | 2121 | ___ |
2122 | } | 2122 | } |
2123 | 2123 | ||
@@ -2782,45 +2782,45 @@ cbc_se_handler: | |||
2782 | 2782 | ||
2783 | .section .pdata | 2783 | .section .pdata |
2784 | .align 4 | 2784 | .align 4 |
2785 | .rva .LSEH_begin_aes_encrypt_internal | 2785 | .rva .LSEH_begin_aes_encrypt_generic |
2786 | .rva .LSEH_end_aes_encrypt_internal | 2786 | .rva .LSEH_end_aes_encrypt_generic |
2787 | .rva .LSEH_info_aes_encrypt_internal | 2787 | .rva .LSEH_info_aes_encrypt_generic |
2788 | 2788 | ||
2789 | .rva .LSEH_begin_aes_decrypt_internal | 2789 | .rva .LSEH_begin_aes_decrypt_generic |
2790 | .rva .LSEH_end_aes_decrypt_internal | 2790 | .rva .LSEH_end_aes_decrypt_generic |
2791 | .rva .LSEH_info_aes_decrypt_internal | 2791 | .rva .LSEH_info_aes_decrypt_generic |
2792 | 2792 | ||
2793 | .rva .LSEH_begin_aes_set_encrypt_key_internal | 2793 | .rva .LSEH_begin_aes_set_encrypt_key_generic |
2794 | .rva .LSEH_end_aes_set_encrypt_key_internal | 2794 | .rva .LSEH_end_aes_set_encrypt_key_generic |
2795 | .rva .LSEH_info_aes_set_encrypt_key_internal | 2795 | .rva .LSEH_info_aes_set_encrypt_key_generic |
2796 | 2796 | ||
2797 | .rva .LSEH_begin_aes_set_decrypt_key_internal | 2797 | .rva .LSEH_begin_aes_set_decrypt_key_generic |
2798 | .rva .LSEH_end_aes_set_decrypt_key_internal | 2798 | .rva .LSEH_end_aes_set_decrypt_key_generic |
2799 | .rva .LSEH_info_aes_set_decrypt_key_internal | 2799 | .rva .LSEH_info_aes_set_decrypt_key_generic |
2800 | 2800 | ||
2801 | .rva .LSEH_begin_aes_cbc_encrypt_internal | 2801 | .rva .LSEH_begin_aes_cbc_encrypt_generic |
2802 | .rva .LSEH_end_aes_cbc_encrypt_internal | 2802 | .rva .LSEH_end_aes_cbc_encrypt_generic |
2803 | .rva .LSEH_info_aes_cbc_encrypt_internal | 2803 | .rva .LSEH_info_aes_cbc_encrypt_generic |
2804 | 2804 | ||
2805 | .section .xdata | 2805 | .section .xdata |
2806 | .align 8 | 2806 | .align 8 |
2807 | .LSEH_info_aes_encrypt_internal: | 2807 | .LSEH_info_aes_encrypt_generic: |
2808 | .byte 9,0,0,0 | 2808 | .byte 9,0,0,0 |
2809 | .rva block_se_handler | 2809 | .rva block_se_handler |
2810 | .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[] | 2810 | .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[] |
2811 | .LSEH_info_aes_decrypt_internal: | 2811 | .LSEH_info_aes_decrypt_generic: |
2812 | .byte 9,0,0,0 | 2812 | .byte 9,0,0,0 |
2813 | .rva block_se_handler | 2813 | .rva block_se_handler |
2814 | .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] | 2814 | .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] |
2815 | .LSEH_info_aes_set_encrypt_key_internal: | 2815 | .LSEH_info_aes_set_encrypt_key_generic: |
2816 | .byte 9,0,0,0 | 2816 | .byte 9,0,0,0 |
2817 | .rva key_se_handler | 2817 | .rva key_se_handler |
2818 | .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] | 2818 | .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] |
2819 | .LSEH_info_aes_set_decrypt_key_internal: | 2819 | .LSEH_info_aes_set_decrypt_key_generic: |
2820 | .byte 9,0,0,0 | 2820 | .byte 9,0,0,0 |
2821 | .rva key_se_handler | 2821 | .rva key_se_handler |
2822 | .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] | 2822 | .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] |
2823 | .LSEH_info_aes_cbc_encrypt_internal: | 2823 | .LSEH_info_aes_cbc_encrypt_generic: |
2824 | .byte 9,0,0,0 | 2824 | .byte 9,0,0,0 |
2825 | .rva cbc_se_handler | 2825 | .rva cbc_se_handler |
2826 | ___ | 2826 | ___ |
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl deleted file mode 100644 index c44a338114..0000000000 --- a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl +++ /dev/null | |||
@@ -1,3123 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | ################################################################### | ||
4 | ### AES-128 [originally in CTR mode] ### | ||
5 | ### bitsliced implementation for Intel Core 2 processors ### | ||
6 | ### requires support of SSE extensions up to SSSE3 ### | ||
7 | ### Author: Emilia Käsper and Peter Schwabe ### | ||
8 | ### Date: 2009-03-19 ### | ||
9 | ### Public domain ### | ||
10 | ### ### | ||
11 | ### See http://homes.esat.kuleuven.be/~ekasper/#software for ### | ||
12 | ### further information. ### | ||
13 | ################################################################### | ||
14 | # | ||
15 | # September 2011. | ||
16 | # | ||
17 | # Started as transliteration to "perlasm" the original code has | ||
18 | # undergone following changes: | ||
19 | # | ||
20 | # - code was made position-independent; | ||
21 | # - rounds were folded into a loop resulting in >5x size reduction | ||
22 | # from 12.5KB to 2.2KB; | ||
23 | # - above was possible thanks to mixcolumns() modification that | ||
24 | # allowed to feed its output back to aesenc[last], this was | ||
25 | # achieved at cost of two additional inter-registers moves; | ||
26 | # - some instruction reordering and interleaving; | ||
27 | # - this module doesn't implement key setup subroutine, instead it | ||
28 | # relies on conversion of "conventional" key schedule as returned | ||
29 | # by AES_set_encrypt_key (see discussion below); | ||
30 | # - first and last round keys are treated differently, which allowed | ||
31 | # to skip one shiftrows(), reduce bit-sliced key schedule and | ||
32 | # speed-up conversion by 22%; | ||
33 | # - support for 192- and 256-bit keys was added; | ||
34 | # | ||
35 | # Resulting performance in CPU cycles spent to encrypt one byte out | ||
36 | # of 4096-byte buffer with 128-bit key is: | ||
37 | # | ||
38 | # Emilia's this(*) difference | ||
39 | # | ||
40 | # Core 2 9.30 8.69 +7% | ||
41 | # Nehalem(**) 7.63 6.98 +9% | ||
42 | # Atom 17.1 17.4 -2%(***) | ||
43 | # | ||
44 | # (*) Comparison is not completely fair, because "this" is ECB, | ||
45 | # i.e. no extra processing such as counter values calculation | ||
46 | # and xor-ing input as in Emilia's CTR implementation is | ||
47 | # performed. However, the CTR calculations stand for not more | ||
48 | # than 1% of total time, so comparison is *rather* fair. | ||
49 | # | ||
50 | # (**) Results were collected on Westmere, which is considered to | ||
51 | # be equivalent to Nehalem for this code. | ||
52 | # | ||
53 | # (***) Slowdown on Atom is rather strange per se, because original | ||
54 | # implementation has a number of 9+-bytes instructions, which | ||
55 | # are bad for Atom front-end, and which I eliminated completely. | ||
56 | # In attempt to address deterioration sbox() was tested in FP | ||
57 | # SIMD "domain" (movaps instead of movdqa, xorps instead of | ||
58 | # pxor, etc.). While it resulted in nominal 4% improvement on | ||
59 | # Atom, it hurted Westmere by more than 2x factor. | ||
60 | # | ||
61 | # As for key schedule conversion subroutine. Interface to OpenSSL | ||
62 | # relies on per-invocation on-the-fly conversion. This naturally | ||
63 | # has impact on performance, especially for short inputs. Conversion | ||
64 | # time in CPU cycles and its ratio to CPU cycles spent in 8x block | ||
65 | # function is: | ||
66 | # | ||
67 | # conversion conversion/8x block | ||
68 | # Core 2 240 0.22 | ||
69 | # Nehalem 180 0.20 | ||
70 | # Atom 430 0.19 | ||
71 | # | ||
72 | # The ratio values mean that 128-byte blocks will be processed | ||
73 | # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, | ||
74 | # etc. Then keep in mind that input sizes not divisible by 128 are | ||
75 | # *effectively* slower, especially shortest ones, e.g. consecutive | ||
76 | # 144-byte blocks are processed 44% slower than one would expect, | ||
77 | # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" | ||
78 | # it's still faster than ["hyper-threading-safe" code path in] | ||
79 | # aes-x86_64.pl on all lengths above 64 bytes... | ||
80 | # | ||
81 | # October 2011. | ||
82 | # | ||
83 | # Add decryption procedure. Performance in CPU cycles spent to decrypt | ||
84 | # one byte out of 4096-byte buffer with 128-bit key is: | ||
85 | # | ||
86 | # Core 2 9.83 | ||
87 | # Nehalem 7.74 | ||
88 | # Atom 19.0 | ||
89 | # | ||
90 | # November 2011. | ||
91 | # | ||
92 | # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is | ||
93 | # suboptimal, but XTS is meant to be used with larger blocks... | ||
94 | # | ||
95 | # <appro@openssl.org> | ||
96 | |||
97 | $flavour = shift; | ||
98 | $output = shift; | ||
99 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
100 | |||
101 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
102 | |||
103 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
104 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
105 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
106 | die "can't locate x86_64-xlate.pl"; | ||
107 | |||
108 | open OUT,"| \"$^X\" $xlate $flavour $output"; | ||
109 | *STDOUT=*OUT; | ||
110 | |||
111 | my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); | ||
112 | my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) | ||
113 | my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... | ||
114 | |||
115 | { | ||
116 | my ($key,$rounds,$const)=("%rax","%r10d","%r11"); | ||
117 | |||
118 | sub Sbox { | ||
119 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
120 | # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb | ||
121 | my @b=@_[0..7]; | ||
122 | my @t=@_[8..11]; | ||
123 | my @s=@_[12..15]; | ||
124 | &InBasisChange (@b); | ||
125 | &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); | ||
126 | &OutBasisChange (@b[7,1,4,2,6,5,0,3]); | ||
127 | } | ||
128 | |||
129 | sub InBasisChange { | ||
130 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
131 | # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb | ||
132 | my @b=@_[0..7]; | ||
133 | $code.=<<___; | ||
134 | pxor @b[6], @b[5] | ||
135 | pxor @b[1], @b[2] | ||
136 | pxor @b[0], @b[3] | ||
137 | pxor @b[2], @b[6] | ||
138 | pxor @b[0], @b[5] | ||
139 | |||
140 | pxor @b[3], @b[6] | ||
141 | pxor @b[7], @b[3] | ||
142 | pxor @b[5], @b[7] | ||
143 | pxor @b[4], @b[3] | ||
144 | pxor @b[5], @b[4] | ||
145 | pxor @b[1], @b[3] | ||
146 | |||
147 | pxor @b[7], @b[2] | ||
148 | pxor @b[5], @b[1] | ||
149 | ___ | ||
150 | } | ||
151 | |||
152 | sub OutBasisChange { | ||
153 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
154 | # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb | ||
155 | my @b=@_[0..7]; | ||
156 | $code.=<<___; | ||
157 | pxor @b[6], @b[0] | ||
158 | pxor @b[4], @b[1] | ||
159 | pxor @b[0], @b[2] | ||
160 | pxor @b[6], @b[4] | ||
161 | pxor @b[1], @b[6] | ||
162 | |||
163 | pxor @b[5], @b[1] | ||
164 | pxor @b[3], @b[5] | ||
165 | pxor @b[7], @b[3] | ||
166 | pxor @b[5], @b[7] | ||
167 | pxor @b[5], @b[2] | ||
168 | |||
169 | pxor @b[7], @b[4] | ||
170 | ___ | ||
171 | } | ||
172 | |||
173 | sub InvSbox { | ||
174 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
175 | # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb | ||
176 | my @b=@_[0..7]; | ||
177 | my @t=@_[8..11]; | ||
178 | my @s=@_[12..15]; | ||
179 | &InvInBasisChange (@b); | ||
180 | &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); | ||
181 | &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); | ||
182 | } | ||
183 | |||
184 | sub InvInBasisChange { # OutBasisChange in reverse | ||
185 | my @b=@_[5,1,2,6,3,7,0,4]; | ||
186 | $code.=<<___ | ||
187 | pxor @b[7], @b[4] | ||
188 | |||
189 | pxor @b[5], @b[7] | ||
190 | pxor @b[5], @b[2] | ||
191 | pxor @b[7], @b[3] | ||
192 | pxor @b[3], @b[5] | ||
193 | pxor @b[5], @b[1] | ||
194 | |||
195 | pxor @b[1], @b[6] | ||
196 | pxor @b[0], @b[2] | ||
197 | pxor @b[6], @b[4] | ||
198 | pxor @b[6], @b[0] | ||
199 | pxor @b[4], @b[1] | ||
200 | ___ | ||
201 | } | ||
202 | |||
203 | sub InvOutBasisChange { # InBasisChange in reverse | ||
204 | my @b=@_[2,5,7,3,6,1,0,4]; | ||
205 | $code.=<<___; | ||
206 | pxor @b[5], @b[1] | ||
207 | pxor @b[7], @b[2] | ||
208 | |||
209 | pxor @b[1], @b[3] | ||
210 | pxor @b[5], @b[4] | ||
211 | pxor @b[5], @b[7] | ||
212 | pxor @b[4], @b[3] | ||
213 | pxor @b[0], @b[5] | ||
214 | pxor @b[7], @b[3] | ||
215 | pxor @b[2], @b[6] | ||
216 | pxor @b[1], @b[2] | ||
217 | pxor @b[3], @b[6] | ||
218 | |||
219 | pxor @b[0], @b[3] | ||
220 | pxor @b[6], @b[5] | ||
221 | ___ | ||
222 | } | ||
223 | |||
224 | sub Mul_GF4 { | ||
225 | #;************************************************************* | ||
226 | #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * | ||
227 | #;************************************************************* | ||
228 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
229 | $code.=<<___; | ||
230 | movdqa $y0, $t0 | ||
231 | pxor $y1, $t0 | ||
232 | pand $x0, $t0 | ||
233 | pxor $x1, $x0 | ||
234 | pand $y0, $x1 | ||
235 | pand $y1, $x0 | ||
236 | pxor $x1, $x0 | ||
237 | pxor $t0, $x1 | ||
238 | ___ | ||
239 | } | ||
240 | |||
241 | sub Mul_GF4_N { # not used, see next subroutine | ||
242 | # multiply and scale by N | ||
243 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
244 | $code.=<<___; | ||
245 | movdqa $y0, $t0 | ||
246 | pxor $y1, $t0 | ||
247 | pand $x0, $t0 | ||
248 | pxor $x1, $x0 | ||
249 | pand $y0, $x1 | ||
250 | pand $y1, $x0 | ||
251 | pxor $x0, $x1 | ||
252 | pxor $t0, $x0 | ||
253 | ___ | ||
254 | } | ||
255 | |||
256 | sub Mul_GF4_N_GF4 { | ||
257 | # interleaved Mul_GF4_N and Mul_GF4 | ||
258 | my ($x0,$x1,$y0,$y1,$t0, | ||
259 | $x2,$x3,$y2,$y3,$t1)=@_; | ||
260 | $code.=<<___; | ||
261 | movdqa $y0, $t0 | ||
262 | movdqa $y2, $t1 | ||
263 | pxor $y1, $t0 | ||
264 | pxor $y3, $t1 | ||
265 | pand $x0, $t0 | ||
266 | pand $x2, $t1 | ||
267 | pxor $x1, $x0 | ||
268 | pxor $x3, $x2 | ||
269 | pand $y0, $x1 | ||
270 | pand $y2, $x3 | ||
271 | pand $y1, $x0 | ||
272 | pand $y3, $x2 | ||
273 | pxor $x0, $x1 | ||
274 | pxor $x3, $x2 | ||
275 | pxor $t0, $x0 | ||
276 | pxor $t1, $x3 | ||
277 | ___ | ||
278 | } | ||
279 | sub Mul_GF16_2 { | ||
280 | my @x=@_[0..7]; | ||
281 | my @y=@_[8..11]; | ||
282 | my @t=@_[12..15]; | ||
283 | $code.=<<___; | ||
284 | movdqa @x[0], @t[0] | ||
285 | movdqa @x[1], @t[1] | ||
286 | ___ | ||
287 | &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); | ||
288 | $code.=<<___; | ||
289 | pxor @x[2], @t[0] | ||
290 | pxor @x[3], @t[1] | ||
291 | pxor @y[2], @y[0] | ||
292 | pxor @y[3], @y[1] | ||
293 | ___ | ||
294 | Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
295 | @x[2], @x[3], @y[2], @y[3], @t[2]); | ||
296 | $code.=<<___; | ||
297 | pxor @t[0], @x[0] | ||
298 | pxor @t[0], @x[2] | ||
299 | pxor @t[1], @x[1] | ||
300 | pxor @t[1], @x[3] | ||
301 | |||
302 | movdqa @x[4], @t[0] | ||
303 | movdqa @x[5], @t[1] | ||
304 | pxor @x[6], @t[0] | ||
305 | pxor @x[7], @t[1] | ||
306 | ___ | ||
307 | &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
308 | @x[6], @x[7], @y[2], @y[3], @t[2]); | ||
309 | $code.=<<___; | ||
310 | pxor @y[2], @y[0] | ||
311 | pxor @y[3], @y[1] | ||
312 | ___ | ||
313 | &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); | ||
314 | $code.=<<___; | ||
315 | pxor @t[0], @x[4] | ||
316 | pxor @t[0], @x[6] | ||
317 | pxor @t[1], @x[5] | ||
318 | pxor @t[1], @x[7] | ||
319 | ___ | ||
320 | } | ||
321 | sub Inv_GF256 { | ||
322 | #;******************************************************************** | ||
323 | #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * | ||
324 | #;******************************************************************** | ||
325 | my @x=@_[0..7]; | ||
326 | my @t=@_[8..11]; | ||
327 | my @s=@_[12..15]; | ||
328 | # direct optimizations from hardware | ||
329 | $code.=<<___; | ||
330 | movdqa @x[4], @t[3] | ||
331 | movdqa @x[5], @t[2] | ||
332 | movdqa @x[1], @t[1] | ||
333 | movdqa @x[7], @s[1] | ||
334 | movdqa @x[0], @s[0] | ||
335 | |||
336 | pxor @x[6], @t[3] | ||
337 | pxor @x[7], @t[2] | ||
338 | pxor @x[3], @t[1] | ||
339 | movdqa @t[3], @s[2] | ||
340 | pxor @x[6], @s[1] | ||
341 | movdqa @t[2], @t[0] | ||
342 | pxor @x[2], @s[0] | ||
343 | movdqa @t[3], @s[3] | ||
344 | |||
345 | por @t[1], @t[2] | ||
346 | por @s[0], @t[3] | ||
347 | pxor @t[0], @s[3] | ||
348 | pand @s[0], @s[2] | ||
349 | pxor @t[1], @s[0] | ||
350 | pand @t[1], @t[0] | ||
351 | pand @s[0], @s[3] | ||
352 | movdqa @x[3], @s[0] | ||
353 | pxor @x[2], @s[0] | ||
354 | pand @s[0], @s[1] | ||
355 | pxor @s[1], @t[3] | ||
356 | pxor @s[1], @t[2] | ||
357 | movdqa @x[4], @s[1] | ||
358 | movdqa @x[1], @s[0] | ||
359 | pxor @x[5], @s[1] | ||
360 | pxor @x[0], @s[0] | ||
361 | movdqa @s[1], @t[1] | ||
362 | pand @s[0], @s[1] | ||
363 | por @s[0], @t[1] | ||
364 | pxor @s[1], @t[0] | ||
365 | pxor @s[3], @t[3] | ||
366 | pxor @s[2], @t[2] | ||
367 | pxor @s[3], @t[1] | ||
368 | movdqa @x[7], @s[0] | ||
369 | pxor @s[2], @t[0] | ||
370 | movdqa @x[6], @s[1] | ||
371 | pxor @s[2], @t[1] | ||
372 | movdqa @x[5], @s[2] | ||
373 | pand @x[3], @s[0] | ||
374 | movdqa @x[4], @s[3] | ||
375 | pand @x[2], @s[1] | ||
376 | pand @x[1], @s[2] | ||
377 | por @x[0], @s[3] | ||
378 | pxor @s[0], @t[3] | ||
379 | pxor @s[1], @t[2] | ||
380 | pxor @s[2], @t[1] | ||
381 | pxor @s[3], @t[0] | ||
382 | |||
383 | #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 | ||
384 | |||
385 | # new smaller inversion | ||
386 | |||
387 | movdqa @t[3], @s[0] | ||
388 | pand @t[1], @t[3] | ||
389 | pxor @t[2], @s[0] | ||
390 | |||
391 | movdqa @t[0], @s[2] | ||
392 | movdqa @s[0], @s[3] | ||
393 | pxor @t[3], @s[2] | ||
394 | pand @s[2], @s[3] | ||
395 | |||
396 | movdqa @t[1], @s[1] | ||
397 | pxor @t[2], @s[3] | ||
398 | pxor @t[0], @s[1] | ||
399 | |||
400 | pxor @t[2], @t[3] | ||
401 | |||
402 | pand @t[3], @s[1] | ||
403 | |||
404 | movdqa @s[2], @t[2] | ||
405 | pxor @t[0], @s[1] | ||
406 | |||
407 | pxor @s[1], @t[2] | ||
408 | pxor @s[1], @t[1] | ||
409 | |||
410 | pand @t[0], @t[2] | ||
411 | |||
412 | pxor @t[2], @s[2] | ||
413 | pxor @t[2], @t[1] | ||
414 | |||
415 | pand @s[3], @s[2] | ||
416 | |||
417 | pxor @s[0], @s[2] | ||
418 | ___ | ||
419 | # output in s3, s2, s1, t1 | ||
420 | |||
421 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 | ||
422 | |||
423 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 | ||
424 | &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); | ||
425 | |||
426 | ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb | ||
427 | } | ||
428 | |||
429 | # AES linear components | ||
430 | |||
sub ShiftRows {
my @row=@_[0..7];
my $m=pop;			# pshufb mask register (.LSR/.LSRM0/...)
# Emit AddRoundKey for the eight bit-sliced key planes followed by the
# ShiftRows byte permutation (one pshufb per plane).  The pxor and
# pshufb streams are interleaved exactly as in the original schedule,
# and $key is advanced past the 0x80 bytes of this round's key.
	for my $i (0..7) {
		$code.=sprintf "\tpxor\t0x%x0(%s),%s\n",$i,$key,$row[$i];
		$code.="\tpshufb\t$m,$row[$i-1]\n"	if ($i>0);
	}
	$code.="\tlea\t0x80($key),$key\n";
	$code.="\tpshufb\t$m,$row[7]\n";
}
454 | |||
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
# MixColumns over the bit-sliced state: within each 128-bit lane,
# pshufd \$0x93 rotates the four dwords by one position ("<<< 32" per
# the inline comments) and \$0x4E by two ("<<< 64"); each output plane
# is then a XOR combination of rotated input planes.
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	pxor	@t[6], @x[6]
	pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
# Forward (encrypt) tail: results are moved into the register order
# the enc round loop consumes (see the sub's header comment).
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	movdqa	@t[1], @x[7]
___
# Inverse tail: same combination, different destination shuffle --
# used by InvMixColumns below, which passes $inv=1.
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	movdqa	@t[6], @x[2]
	movdqa	@t[1], @x[7]
	movdqa	@x[6], @x[4]
	movdqa	@t[3], @x[6]
___
}
522 | |||
sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

# Direct InvMixColumns on the bit-sliced state: the four GF(2^8)
# multiplications (0x0e, 0x0b, 0x0d, 0x09) are computed explicitly as
# XOR networks, clobbering and restoring @t registers along the way
# (the inline "clobber"/"restore" comments track this).  Presumably
# superseded by the cheaper InvMixColumns below -- kept for reference;
# verify against callers before removing.
$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0 [1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6 [2]
	pxor	@x[3], @x[1]		# 125 3 [4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016 [3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6 [0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375 [6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5 [7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4 [5]
	pshufd	\$0x93, @t[5], @t[5]
___
# Re-alias the state registers for the remaining multiplications
# (output order of the 0x0e stage, per the bracketed notes above).
my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}
676 | |||
sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
#
# i.e. the InvMixColumns matrix factors into a 0x05-0x00-0x04-0x00
# multiply followed by the forward MixColumns matrix, so the second
# step below reuses MixColumns with its $inv flag set.

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	pxor	@t[6], @x[0]
	pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	pxor	@t[7], @x[1]
	pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	pxor	@t[7], @x[2]
	pxor	@t[6], @x[3]
	pxor	@t[6], @x[4]
	pxor	@t[3], @x[5]
	pxor	@t[4], @x[6]
	pxor	@t[7], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}
724 | |||
sub aesenc {	# not used
my @state=@_[0..7];
my @tmp=@_[8..15];
# One full AES round on the bit-sliced state: load the .LSR row-shift
# mask, then AddRoundKey+ShiftRows, SubBytes, MixColumns.
	$code.="\tmovdqa\t0x30($const),$tmp[0]\t# .LSR\n";
	&ShiftRows	(@state,$tmp[0]);
	&Sbox		(@state,@tmp);
	&MixColumns	(@state[0,1,4,6,3,7,2,5],@tmp);
}
735 | |||
sub aesenclast {	# not used
my @state=@_[0..7];
my @tmp=@_[8..15];
# Final AES round: .LSRM0 mask, AddRoundKey+ShiftRows, SubBytes, then
# the last AddRoundKey (no MixColumns).  The pxor destinations follow
# the bitsliced output register order (0,1,4,6,3,7,2,5).
	$code.="\tmovdqa\t0x40($const),$tmp[0]\t# .LSRM0\n";
	&ShiftRows	(@state,$tmp[0]);
	&Sbox		(@state,@tmp);
	my @order=(0,1,4,6,3,7,2,5);
	for my $i (0..7) {
		$code.=sprintf "\tpxor\t0x%x0(%s),%s\n",$i,$key,$state[$order[$i]];
	}
}
755 | |||
sub swapmove {
my ($r0,$r1,$shift,$msk,$scratch)=@_;
# Masked bit-group exchange between $r0 and $r1: the groups selected
# by $msk in $r0 are swapped with the groups $shift bits higher in
# $r1 (delta = (r0 ^ (r1 >> n)) & mask; r0 ^= delta; r1 ^= delta << n).
$code.=<<___;
	movdqa	$r1,$scratch
	psrlq	\$$shift,$r1
	pxor	$r0,$r1
	pand	$msk,$r1
	pxor	$r1,$r0
	psllq	\$$shift,$r1
	pxor	$scratch,$r1
___
}
sub swapmove2x {
my ($p0,$q0,$p1,$q1,$sh,$msk,$s0,$s1)=@_;
# Two independent swapmove exchanges -- ($p0,$q0) and ($p1,$q1) --
# emitted with their instruction streams interleaved for scheduling.
$code.=<<___;
	movdqa	$q0,$s0
	psrlq	\$$sh,$q0
	movdqa	$q1,$s1
	psrlq	\$$sh,$q1
	pxor	$p0,$q0
	pxor	$p1,$q1
	pand	$msk,$q0
	pand	$msk,$q1
	pxor	$q0,$p0
	psllq	\$$sh,$q0
	pxor	$q1,$p1
	psllq	\$$sh,$q1
	pxor	$s0,$q0
	pxor	$s1,$q1
___
}
787 | |||
sub bitslice {
my @v=reverse(@_[0..7]);
my ($m0,$m1,$s0,$s1)=@_[8..11];
# Transpose eight 128-bit registers into bit-sliced form via three
# rounds of masked swaps at strides 1, 2 and 4, using the .LBS0/1/2
# mask constants.  Also its own inverse, hence used on output too.
$code.="\tmovdqa\t0x00($const),$m0\t# .LBS0\n";
$code.="\tmovdqa\t0x10($const),$m1\t# .LBS1\n";
	&swapmove2x(@v[0,1,2,3],1,$m0,$s0,$s1);
	&swapmove2x(@v[4,5,6,7],1,$m0,$s0,$s1);
$code.="\tmovdqa\t0x20($const),$m0\t# .LBS2\n";
	&swapmove2x(@v[0,2,1,3],2,$m1,$s0,$s1);
	&swapmove2x(@v[4,6,5,7],2,$m1,$s0,$s1);

	&swapmove2x(@v[0,4,1,5],4,$m0,$s0,$s1);
	&swapmove2x(@v[2,6,3,7],4,$m0,$s0,$s1);
}
806 | |||
# Emit the two core primitives: _bsaes_encrypt8/_bsaes_decrypt8 each
# process eight 16-byte blocks (in %xmm0-%xmm7) with the bit-sliced
# key schedule at ($key) and round count in $rounds.
$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	_CET_ENDBR
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
# Transpose the eight blocks into bit-sliced representation.
&bitslice	(@XMM[0..7, 8..11]);
# Round loop: the first iteration enters at .Lenc_sbox (round-0 key
# XOR and .LM0SR shuffle were already applied above), subsequent
# iterations go through ShiftRows first.
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
# Pick .LSR for inner rounds, .LSRM0 for the last round's ShiftRows.
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
# Un-bitslice (bitslice is its own inverse) and apply the last key.
&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	_CET_ENDBR
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
___
&bitslice	(@XMM[0..7, 8..11]);
# Decrypt round loop mirrors the encrypt one: InvSbox/InvMixColumns
# and the inverse shift-row constants .LISR/.LISRM0.
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
941 | } | ||
942 | { | ||
943 | my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); | ||
944 | |||
sub bitslice_key {
my @k=reverse(@_[0..7]);
my ($b0,$b1,$b2,$u0,$u1)=@_[8..12];

# Bit-slice a round key.  Only two distinct source registers exist
# (the key is replicated), so several of the stride-1/2 swaps of the
# general bitslice degenerate into plain movdqa copies.
	&swapmove	(@k[0,1],1,$b0,$u0,$u1);
$code.=<<___;
	#&swapmove(@k[2,3],1,$t0,$u0,$u1);
	movdqa	@k[0], @k[2]
	movdqa	@k[1], @k[3]
___
	#&swapmove2x(@k[4,5,6,7],1,$t0,$u0,$u1);

	&swapmove2x	(@k[0,2,1,3],2,$b1,$u0,$u1);
$code.=<<___;
	#&swapmove2x(@k[4,6,5,7],2,$t1,$u0,$u1);
	movdqa	@k[0], @k[4]
	movdqa	@k[2], @k[6]
	movdqa	@k[1], @k[5]
	movdqa	@k[3], @k[7]
___
	&swapmove2x	(@k[0,4,1,5],4,$b2,$u0,$u1);
	&swapmove2x	(@k[2,6,3,7],4,$b2,$u0,$u1);
}
968 | |||
# Emit _bsaes_key_convert: expand the standard key schedule at ($inp)
# ($rounds rounds) into bit-sliced form at ($out).  Each 16-byte round
# key becomes eight 128-bit planes built with pand/pcmpeqb against the
# 0x01..0x80 bit masks from .Lmasks; planes 0,1,5,6 are complemented
# via pxor with all-ones (%xmm5, the "pnot" trick).  On return the
# last round key is left in %xmm6 and .L63 in %xmm7 -- callers combine
# the two to fix up the final round key themselves.
$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	_CET_ENDBR
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0,	%xmm8
	movdqa	%xmm1,	%xmm9

	pand	%xmm6, %xmm8
	pand	%xmm6, %xmm9
	movdqa	%xmm2, %xmm10
	pcmpeqb	%xmm0, %xmm8
	psllq	\$4, %xmm0		# 0x10...
	movdqa	%xmm3, %xmm11
	pcmpeqb	%xmm1, %xmm9
	psllq	\$4, %xmm1		# 0x20...

	pand	%xmm6, %xmm10
	pand	%xmm6, %xmm11
	movdqa	%xmm0, %xmm12
	pcmpeqb	%xmm2, %xmm10
	psllq	\$4, %xmm2		# 0x40...
	movdqa	%xmm1, %xmm13
	pcmpeqb	%xmm3, %xmm11
	psllq	\$4, %xmm3		# 0x80...

	movdqa	%xmm2, %xmm14
	movdqa	%xmm3, %xmm15
	pxor	%xmm5, %xmm8		# "pnot"
	pxor	%xmm5, %xmm9

	pand	%xmm6, %xmm12
	pand	%xmm6, %xmm13
	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0, %xmm12
	psrlq	\$4, %xmm0		# 0x01...
	movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1, %xmm13
	psrlq	\$4, %xmm1		# 0x02...
	lea	0x10($inp), $inp

	pand	%xmm6, %xmm14
	pand	%xmm6, %xmm15
	movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2, %xmm14
	psrlq	\$4, %xmm2		# 0x04...
	movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3, %xmm15
	psrlq	\$4, %xmm3		# 0x08...
	movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
1054 | } | ||
1055 | |||
if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
# Dead code by construction ("0 &&"): raw key-convert and fixed
# 128-bit ECB entry points, retained only for benchmarking.
# NOTE(review): in bsaes_encrypt_128 the .Lenc128_loop label precedes
# _CET_ENDBR while bsaes_decrypt_128 places .Ldec128_loop after it --
# harmless for these direct jumps, but inconsistent.
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	_CET_ENDBR
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	_CET_ENDBR
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	_CET_ENDBR
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
	_CET_ENDBR
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
1157 | { | ||
1158 | ###################################################################### | ||
1159 | # | ||
1160 | # OpenSSL interface | ||
1161 | # | ||
1162 | my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") | ||
1163 | : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); | ||
1164 | my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); | ||
1165 | |||
1166 | if ($ecb) { | ||
# Emit bsaes_ecb_encrypt_blocks(in, out, blocks, key): batches of 8
# blocks go through _bsaes_encrypt8 with an on-stack bit-sliced key
# schedule; a short count falls back to asm_AES_encrypt per block.
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	_CET_ENDBR
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
# Win64 ABI: %xmm6-%xmm15 are callee-saved and must be preserved.
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
# Epilogue for encrypt, then the prologue of the decrypt twin.
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	_CET_ENDBR
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
1399 | $code.=<<___; | ||
1400 | mov %rsp,%rbp # backup %rsp | ||
1401 | mov 240($arg4),%eax # rounds | ||
1402 | mov $arg1,$inp # backup arguments | ||
1403 | mov $arg2,$out | ||
1404 | mov $arg3,$len | ||
1405 | mov $arg4,$key | ||
1406 | cmp \$8,$arg3 | ||
1407 | jb .Lecb_dec_short | ||
1408 | |||
1409 | mov %eax,%ebx # backup rounds | ||
1410 | shl \$7,%rax # 128 bytes per inner round key | ||
1411 | sub \$`128-32`,%rax # size of bit-sliced key schedule | ||
1412 | sub %rax,%rsp | ||
1413 | mov %rsp,%rax # pass key schedule | ||
1414 | mov $key,%rcx # pass key | ||
1415 | mov %ebx,%r10d # pass rounds | ||
1416 | call _bsaes_key_convert | ||
1417 | pxor (%rsp),%xmm7 # fix up 0 round key | ||
1418 | movdqa %xmm6,(%rax) # save last round key | ||
1419 | movdqa %xmm7,(%rsp) | ||
1420 | |||
1421 | sub \$8,$len | ||
1422 | .Lecb_dec_loop: | ||
1423 | movdqu 0x00($inp), @XMM[0] # load input | ||
1424 | movdqu 0x10($inp), @XMM[1] | ||
1425 | movdqu 0x20($inp), @XMM[2] | ||
1426 | movdqu 0x30($inp), @XMM[3] | ||
1427 | movdqu 0x40($inp), @XMM[4] | ||
1428 | movdqu 0x50($inp), @XMM[5] | ||
1429 | mov %rsp, %rax # pass key schedule | ||
1430 | movdqu 0x60($inp), @XMM[6] | ||
1431 | mov %ebx,%r10d # pass rounds | ||
1432 | movdqu 0x70($inp), @XMM[7] | ||
1433 | lea 0x80($inp), $inp | ||
1434 | |||
1435 | call _bsaes_decrypt8 | ||
1436 | |||
1437 | movdqu @XMM[0], 0x00($out) # write output | ||
1438 | movdqu @XMM[1], 0x10($out) | ||
1439 | movdqu @XMM[6], 0x20($out) | ||
1440 | movdqu @XMM[4], 0x30($out) | ||
1441 | movdqu @XMM[2], 0x40($out) | ||
1442 | movdqu @XMM[7], 0x50($out) | ||
1443 | movdqu @XMM[3], 0x60($out) | ||
1444 | movdqu @XMM[5], 0x70($out) | ||
1445 | lea 0x80($out), $out | ||
1446 | sub \$8,$len | ||
1447 | jnc .Lecb_dec_loop | ||
1448 | |||
1449 | add \$8,$len | ||
1450 | jz .Lecb_dec_done | ||
1451 | |||
1452 | movdqu 0x00($inp), @XMM[0] # load input | ||
1453 | mov %rsp, %rax # pass key schedule | ||
1454 | mov %ebx,%r10d # pass rounds | ||
1455 | cmp \$2,$len | ||
1456 | jb .Lecb_dec_one | ||
1457 | movdqu 0x10($inp), @XMM[1] | ||
1458 | je .Lecb_dec_two | ||
1459 | movdqu 0x20($inp), @XMM[2] | ||
1460 | cmp \$4,$len | ||
1461 | jb .Lecb_dec_three | ||
1462 | movdqu 0x30($inp), @XMM[3] | ||
1463 | je .Lecb_dec_four | ||
1464 | movdqu 0x40($inp), @XMM[4] | ||
1465 | cmp \$6,$len | ||
1466 | jb .Lecb_dec_five | ||
1467 | movdqu 0x50($inp), @XMM[5] | ||
1468 | je .Lecb_dec_six | ||
1469 | movdqu 0x60($inp), @XMM[6] | ||
1470 | call _bsaes_decrypt8 | ||
1471 | movdqu @XMM[0], 0x00($out) # write output | ||
1472 | movdqu @XMM[1], 0x10($out) | ||
1473 | movdqu @XMM[6], 0x20($out) | ||
1474 | movdqu @XMM[4], 0x30($out) | ||
1475 | movdqu @XMM[2], 0x40($out) | ||
1476 | movdqu @XMM[7], 0x50($out) | ||
1477 | movdqu @XMM[3], 0x60($out) | ||
1478 | jmp .Lecb_dec_done | ||
1479 | .align 16 | ||
1480 | .Lecb_dec_six: | ||
1481 | call _bsaes_decrypt8 | ||
1482 | movdqu @XMM[0], 0x00($out) # write output | ||
1483 | movdqu @XMM[1], 0x10($out) | ||
1484 | movdqu @XMM[6], 0x20($out) | ||
1485 | movdqu @XMM[4], 0x30($out) | ||
1486 | movdqu @XMM[2], 0x40($out) | ||
1487 | movdqu @XMM[7], 0x50($out) | ||
1488 | jmp .Lecb_dec_done | ||
1489 | .align 16 | ||
1490 | .Lecb_dec_five: | ||
1491 | call _bsaes_decrypt8 | ||
1492 | movdqu @XMM[0], 0x00($out) # write output | ||
1493 | movdqu @XMM[1], 0x10($out) | ||
1494 | movdqu @XMM[6], 0x20($out) | ||
1495 | movdqu @XMM[4], 0x30($out) | ||
1496 | movdqu @XMM[2], 0x40($out) | ||
1497 | jmp .Lecb_dec_done | ||
1498 | .align 16 | ||
1499 | .Lecb_dec_four: | ||
1500 | call _bsaes_decrypt8 | ||
1501 | movdqu @XMM[0], 0x00($out) # write output | ||
1502 | movdqu @XMM[1], 0x10($out) | ||
1503 | movdqu @XMM[6], 0x20($out) | ||
1504 | movdqu @XMM[4], 0x30($out) | ||
1505 | jmp .Lecb_dec_done | ||
1506 | .align 16 | ||
1507 | .Lecb_dec_three: | ||
1508 | call _bsaes_decrypt8 | ||
1509 | movdqu @XMM[0], 0x00($out) # write output | ||
1510 | movdqu @XMM[1], 0x10($out) | ||
1511 | movdqu @XMM[6], 0x20($out) | ||
1512 | jmp .Lecb_dec_done | ||
1513 | .align 16 | ||
1514 | .Lecb_dec_two: | ||
1515 | call _bsaes_decrypt8 | ||
1516 | movdqu @XMM[0], 0x00($out) # write output | ||
1517 | movdqu @XMM[1], 0x10($out) | ||
1518 | jmp .Lecb_dec_done | ||
1519 | .align 16 | ||
1520 | .Lecb_dec_one: | ||
1521 | call _bsaes_decrypt8 | ||
1522 | movdqu @XMM[0], 0x00($out) # write output | ||
1523 | jmp .Lecb_dec_done | ||
1524 | .align 16 | ||
1525 | .Lecb_dec_short: | ||
1526 | lea ($inp), $arg1 | ||
1527 | lea ($out), $arg2 | ||
1528 | lea ($key), $arg3 | ||
1529 | call asm_AES_decrypt | ||
1530 | lea 16($inp), $inp | ||
1531 | lea 16($out), $out | ||
1532 | dec $len | ||
1533 | jnz .Lecb_dec_short | ||
1534 | |||
1535 | .Lecb_dec_done: | ||
1536 | lea (%rsp),%rax | ||
1537 | pxor %xmm0, %xmm0 | ||
1538 | .Lecb_dec_bzero: # wipe key schedule [if any] | ||
1539 | movdqa %xmm0, 0x00(%rax) | ||
1540 | movdqa %xmm0, 0x10(%rax) | ||
1541 | lea 0x20(%rax), %rax | ||
1542 | cmp %rax, %rbp | ||
1543 | jb .Lecb_dec_bzero | ||
1544 | |||
1545 | lea (%rbp),%rsp # restore %rsp | ||
1546 | ___ | ||
1547 | $code.=<<___ if ($win64); | ||
1548 | movaps 0x40(%rbp), %xmm6 | ||
1549 | movaps 0x50(%rbp), %xmm7 | ||
1550 | movaps 0x60(%rbp), %xmm8 | ||
1551 | movaps 0x70(%rbp), %xmm9 | ||
1552 | movaps 0x80(%rbp), %xmm10 | ||
1553 | movaps 0x90(%rbp), %xmm11 | ||
1554 | movaps 0xa0(%rbp), %xmm12 | ||
1555 | movaps 0xb0(%rbp), %xmm13 | ||
1556 | movaps 0xc0(%rbp), %xmm14 | ||
1557 | movaps 0xd0(%rbp), %xmm15 | ||
1558 | lea 0xa0(%rbp), %rsp | ||
1559 | ___ | ||
1560 | $code.=<<___; | ||
1561 | mov 0x48(%rsp), %r15 | ||
1562 | mov 0x50(%rsp), %r14 | ||
1563 | mov 0x58(%rsp), %r13 | ||
1564 | mov 0x60(%rsp), %r12 | ||
1565 | mov 0x68(%rsp), %rbx | ||
1566 | mov 0x70(%rsp), %rax | ||
1567 | lea 0x78(%rsp), %rsp | ||
1568 | mov %rax, %rbp | ||
1569 | .Lecb_dec_epilogue: | ||
1570 | ret | ||
1571 | .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks | ||
1572 | ___ | ||
1573 | } | ||
1574 | $code.=<<___; | ||
1575 | .extern asm_AES_cbc_encrypt | ||
1576 | .globl bsaes_cbc_encrypt | ||
1577 | .type bsaes_cbc_encrypt,\@abi-omnipotent | ||
1578 | .align 16 | ||
1579 | bsaes_cbc_encrypt: | ||
1580 | _CET_ENDBR | ||
1581 | ___ | ||
1582 | $code.=<<___ if ($win64); | ||
1583 | mov 48(%rsp),$arg6 # pull direction flag | ||
1584 | ___ | ||
1585 | $code.=<<___; | ||
1586 | cmp \$0,$arg6 | ||
1587 | jne asm_AES_cbc_encrypt | ||
1588 | cmp \$128,$arg3 | ||
1589 | jb asm_AES_cbc_encrypt | ||
1590 | |||
1591 | mov %rsp, %rax | ||
1592 | .Lcbc_dec_prologue: | ||
1593 | push %rbp | ||
1594 | push %rbx | ||
1595 | push %r12 | ||
1596 | push %r13 | ||
1597 | push %r14 | ||
1598 | push %r15 | ||
1599 | lea -0x48(%rsp), %rsp | ||
1600 | ___ | ||
1601 | $code.=<<___ if ($win64); | ||
1602 | mov 0xa0(%rsp),$arg5 # pull ivp | ||
1603 | lea -0xa0(%rsp), %rsp | ||
1604 | movaps %xmm6, 0x40(%rsp) | ||
1605 | movaps %xmm7, 0x50(%rsp) | ||
1606 | movaps %xmm8, 0x60(%rsp) | ||
1607 | movaps %xmm9, 0x70(%rsp) | ||
1608 | movaps %xmm10, 0x80(%rsp) | ||
1609 | movaps %xmm11, 0x90(%rsp) | ||
1610 | movaps %xmm12, 0xa0(%rsp) | ||
1611 | movaps %xmm13, 0xb0(%rsp) | ||
1612 | movaps %xmm14, 0xc0(%rsp) | ||
1613 | movaps %xmm15, 0xd0(%rsp) | ||
1614 | .Lcbc_dec_body: | ||
1615 | ___ | ||
1616 | $code.=<<___; | ||
1617 | mov %rsp, %rbp # backup %rsp | ||
1618 | mov 240($arg4), %eax # rounds | ||
1619 | mov $arg1, $inp # backup arguments | ||
1620 | mov $arg2, $out | ||
1621 | mov $arg3, $len | ||
1622 | mov $arg4, $key | ||
1623 | mov $arg5, %rbx | ||
1624 | shr \$4, $len # bytes to blocks | ||
1625 | |||
1626 | mov %eax, %edx # rounds | ||
1627 | shl \$7, %rax # 128 bytes per inner round key | ||
1628 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
1629 | sub %rax, %rsp | ||
1630 | |||
1631 | mov %rsp, %rax # pass key schedule | ||
1632 | mov $key, %rcx # pass key | ||
1633 | mov %edx, %r10d # pass rounds | ||
1634 | call _bsaes_key_convert | ||
1635 | pxor (%rsp),%xmm7 # fix up 0 round key | ||
1636 | movdqa %xmm6,(%rax) # save last round key | ||
1637 | movdqa %xmm7,(%rsp) | ||
1638 | |||
1639 | movdqu (%rbx), @XMM[15] # load IV | ||
1640 | sub \$8,$len | ||
1641 | .Lcbc_dec_loop: | ||
1642 | movdqu 0x00($inp), @XMM[0] # load input | ||
1643 | movdqu 0x10($inp), @XMM[1] | ||
1644 | movdqu 0x20($inp), @XMM[2] | ||
1645 | movdqu 0x30($inp), @XMM[3] | ||
1646 | movdqu 0x40($inp), @XMM[4] | ||
1647 | movdqu 0x50($inp), @XMM[5] | ||
1648 | mov %rsp, %rax # pass key schedule | ||
1649 | movdqu 0x60($inp), @XMM[6] | ||
1650 | mov %edx,%r10d # pass rounds | ||
1651 | movdqu 0x70($inp), @XMM[7] | ||
1652 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1653 | |||
1654 | call _bsaes_decrypt8 | ||
1655 | |||
1656 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1657 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1658 | movdqu 0x10($inp), @XMM[9] | ||
1659 | pxor @XMM[8], @XMM[1] | ||
1660 | movdqu 0x20($inp), @XMM[10] | ||
1661 | pxor @XMM[9], @XMM[6] | ||
1662 | movdqu 0x30($inp), @XMM[11] | ||
1663 | pxor @XMM[10], @XMM[4] | ||
1664 | movdqu 0x40($inp), @XMM[12] | ||
1665 | pxor @XMM[11], @XMM[2] | ||
1666 | movdqu 0x50($inp), @XMM[13] | ||
1667 | pxor @XMM[12], @XMM[7] | ||
1668 | movdqu 0x60($inp), @XMM[14] | ||
1669 | pxor @XMM[13], @XMM[3] | ||
1670 | movdqu 0x70($inp), @XMM[15] # IV | ||
1671 | pxor @XMM[14], @XMM[5] | ||
1672 | movdqu @XMM[0], 0x00($out) # write output | ||
1673 | lea 0x80($inp), $inp | ||
1674 | movdqu @XMM[1], 0x10($out) | ||
1675 | movdqu @XMM[6], 0x20($out) | ||
1676 | movdqu @XMM[4], 0x30($out) | ||
1677 | movdqu @XMM[2], 0x40($out) | ||
1678 | movdqu @XMM[7], 0x50($out) | ||
1679 | movdqu @XMM[3], 0x60($out) | ||
1680 | movdqu @XMM[5], 0x70($out) | ||
1681 | lea 0x80($out), $out | ||
1682 | sub \$8,$len | ||
1683 | jnc .Lcbc_dec_loop | ||
1684 | |||
1685 | add \$8,$len | ||
1686 | jz .Lcbc_dec_done | ||
1687 | |||
1688 | movdqu 0x00($inp), @XMM[0] # load input | ||
1689 | mov %rsp, %rax # pass key schedule | ||
1690 | mov %edx, %r10d # pass rounds | ||
1691 | cmp \$2,$len | ||
1692 | jb .Lcbc_dec_one | ||
1693 | movdqu 0x10($inp), @XMM[1] | ||
1694 | je .Lcbc_dec_two | ||
1695 | movdqu 0x20($inp), @XMM[2] | ||
1696 | cmp \$4,$len | ||
1697 | jb .Lcbc_dec_three | ||
1698 | movdqu 0x30($inp), @XMM[3] | ||
1699 | je .Lcbc_dec_four | ||
1700 | movdqu 0x40($inp), @XMM[4] | ||
1701 | cmp \$6,$len | ||
1702 | jb .Lcbc_dec_five | ||
1703 | movdqu 0x50($inp), @XMM[5] | ||
1704 | je .Lcbc_dec_six | ||
1705 | movdqu 0x60($inp), @XMM[6] | ||
1706 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1707 | call _bsaes_decrypt8 | ||
1708 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1709 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1710 | movdqu 0x10($inp), @XMM[9] | ||
1711 | pxor @XMM[8], @XMM[1] | ||
1712 | movdqu 0x20($inp), @XMM[10] | ||
1713 | pxor @XMM[9], @XMM[6] | ||
1714 | movdqu 0x30($inp), @XMM[11] | ||
1715 | pxor @XMM[10], @XMM[4] | ||
1716 | movdqu 0x40($inp), @XMM[12] | ||
1717 | pxor @XMM[11], @XMM[2] | ||
1718 | movdqu 0x50($inp), @XMM[13] | ||
1719 | pxor @XMM[12], @XMM[7] | ||
1720 | movdqu 0x60($inp), @XMM[15] # IV | ||
1721 | pxor @XMM[13], @XMM[3] | ||
1722 | movdqu @XMM[0], 0x00($out) # write output | ||
1723 | movdqu @XMM[1], 0x10($out) | ||
1724 | movdqu @XMM[6], 0x20($out) | ||
1725 | movdqu @XMM[4], 0x30($out) | ||
1726 | movdqu @XMM[2], 0x40($out) | ||
1727 | movdqu @XMM[7], 0x50($out) | ||
1728 | movdqu @XMM[3], 0x60($out) | ||
1729 | jmp .Lcbc_dec_done | ||
1730 | .align 16 | ||
1731 | .Lcbc_dec_six: | ||
1732 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1733 | call _bsaes_decrypt8 | ||
1734 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1735 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1736 | movdqu 0x10($inp), @XMM[9] | ||
1737 | pxor @XMM[8], @XMM[1] | ||
1738 | movdqu 0x20($inp), @XMM[10] | ||
1739 | pxor @XMM[9], @XMM[6] | ||
1740 | movdqu 0x30($inp), @XMM[11] | ||
1741 | pxor @XMM[10], @XMM[4] | ||
1742 | movdqu 0x40($inp), @XMM[12] | ||
1743 | pxor @XMM[11], @XMM[2] | ||
1744 | movdqu 0x50($inp), @XMM[15] # IV | ||
1745 | pxor @XMM[12], @XMM[7] | ||
1746 | movdqu @XMM[0], 0x00($out) # write output | ||
1747 | movdqu @XMM[1], 0x10($out) | ||
1748 | movdqu @XMM[6], 0x20($out) | ||
1749 | movdqu @XMM[4], 0x30($out) | ||
1750 | movdqu @XMM[2], 0x40($out) | ||
1751 | movdqu @XMM[7], 0x50($out) | ||
1752 | jmp .Lcbc_dec_done | ||
1753 | .align 16 | ||
1754 | .Lcbc_dec_five: | ||
1755 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1756 | call _bsaes_decrypt8 | ||
1757 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1758 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1759 | movdqu 0x10($inp), @XMM[9] | ||
1760 | pxor @XMM[8], @XMM[1] | ||
1761 | movdqu 0x20($inp), @XMM[10] | ||
1762 | pxor @XMM[9], @XMM[6] | ||
1763 | movdqu 0x30($inp), @XMM[11] | ||
1764 | pxor @XMM[10], @XMM[4] | ||
1765 | movdqu 0x40($inp), @XMM[15] # IV | ||
1766 | pxor @XMM[11], @XMM[2] | ||
1767 | movdqu @XMM[0], 0x00($out) # write output | ||
1768 | movdqu @XMM[1], 0x10($out) | ||
1769 | movdqu @XMM[6], 0x20($out) | ||
1770 | movdqu @XMM[4], 0x30($out) | ||
1771 | movdqu @XMM[2], 0x40($out) | ||
1772 | jmp .Lcbc_dec_done | ||
1773 | .align 16 | ||
1774 | .Lcbc_dec_four: | ||
1775 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1776 | call _bsaes_decrypt8 | ||
1777 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1778 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1779 | movdqu 0x10($inp), @XMM[9] | ||
1780 | pxor @XMM[8], @XMM[1] | ||
1781 | movdqu 0x20($inp), @XMM[10] | ||
1782 | pxor @XMM[9], @XMM[6] | ||
1783 | movdqu 0x30($inp), @XMM[15] # IV | ||
1784 | pxor @XMM[10], @XMM[4] | ||
1785 | movdqu @XMM[0], 0x00($out) # write output | ||
1786 | movdqu @XMM[1], 0x10($out) | ||
1787 | movdqu @XMM[6], 0x20($out) | ||
1788 | movdqu @XMM[4], 0x30($out) | ||
1789 | jmp .Lcbc_dec_done | ||
1790 | .align 16 | ||
1791 | .Lcbc_dec_three: | ||
1792 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1793 | call _bsaes_decrypt8 | ||
1794 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1795 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1796 | movdqu 0x10($inp), @XMM[9] | ||
1797 | pxor @XMM[8], @XMM[1] | ||
1798 | movdqu 0x20($inp), @XMM[15] # IV | ||
1799 | pxor @XMM[9], @XMM[6] | ||
1800 | movdqu @XMM[0], 0x00($out) # write output | ||
1801 | movdqu @XMM[1], 0x10($out) | ||
1802 | movdqu @XMM[6], 0x20($out) | ||
1803 | jmp .Lcbc_dec_done | ||
1804 | .align 16 | ||
1805 | .Lcbc_dec_two: | ||
1806 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
1807 | call _bsaes_decrypt8 | ||
1808 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
1809 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
1810 | movdqu 0x10($inp), @XMM[15] # IV | ||
1811 | pxor @XMM[8], @XMM[1] | ||
1812 | movdqu @XMM[0], 0x00($out) # write output | ||
1813 | movdqu @XMM[1], 0x10($out) | ||
1814 | jmp .Lcbc_dec_done | ||
1815 | .align 16 | ||
1816 | .Lcbc_dec_one: | ||
1817 | lea ($inp), $arg1 | ||
1818 | lea 0x20(%rbp), $arg2 # buffer output | ||
1819 | lea ($key), $arg3 | ||
1820 | call asm_AES_decrypt # doesn't touch %xmm | ||
1821 | pxor 0x20(%rbp), @XMM[15] # ^= IV | ||
1822 | movdqu @XMM[15], ($out) # write output | ||
1823 | movdqa @XMM[0], @XMM[15] # IV | ||
1824 | |||
1825 | .Lcbc_dec_done: | ||
1826 | movdqu @XMM[15], (%rbx) # return IV | ||
1827 | lea (%rsp), %rax | ||
1828 | pxor %xmm0, %xmm0 | ||
1829 | .Lcbc_dec_bzero: # wipe key schedule [if any] | ||
1830 | movdqa %xmm0, 0x00(%rax) | ||
1831 | movdqa %xmm0, 0x10(%rax) | ||
1832 | lea 0x20(%rax), %rax | ||
1833 | cmp %rax, %rbp | ||
1834 | ja .Lcbc_dec_bzero | ||
1835 | |||
1836 | lea (%rbp),%rsp # restore %rsp | ||
1837 | ___ | ||
1838 | $code.=<<___ if ($win64); | ||
1839 | movaps 0x40(%rbp), %xmm6 | ||
1840 | movaps 0x50(%rbp), %xmm7 | ||
1841 | movaps 0x60(%rbp), %xmm8 | ||
1842 | movaps 0x70(%rbp), %xmm9 | ||
1843 | movaps 0x80(%rbp), %xmm10 | ||
1844 | movaps 0x90(%rbp), %xmm11 | ||
1845 | movaps 0xa0(%rbp), %xmm12 | ||
1846 | movaps 0xb0(%rbp), %xmm13 | ||
1847 | movaps 0xc0(%rbp), %xmm14 | ||
1848 | movaps 0xd0(%rbp), %xmm15 | ||
1849 | lea 0xa0(%rbp), %rsp | ||
1850 | ___ | ||
1851 | $code.=<<___; | ||
1852 | mov 0x48(%rsp), %r15 | ||
1853 | mov 0x50(%rsp), %r14 | ||
1854 | mov 0x58(%rsp), %r13 | ||
1855 | mov 0x60(%rsp), %r12 | ||
1856 | mov 0x68(%rsp), %rbx | ||
1857 | mov 0x70(%rsp), %rax | ||
1858 | lea 0x78(%rsp), %rsp | ||
1859 | mov %rax, %rbp | ||
1860 | .Lcbc_dec_epilogue: | ||
1861 | ret | ||
1862 | .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt | ||
1863 | |||
1864 | .globl bsaes_ctr32_encrypt_blocks | ||
1865 | .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent | ||
1866 | .align 16 | ||
1867 | bsaes_ctr32_encrypt_blocks: | ||
1868 | _CET_ENDBR | ||
1869 | mov %rsp, %rax | ||
1870 | .Lctr_enc_prologue: | ||
1871 | push %rbp | ||
1872 | push %rbx | ||
1873 | push %r12 | ||
1874 | push %r13 | ||
1875 | push %r14 | ||
1876 | push %r15 | ||
1877 | lea -0x48(%rsp), %rsp | ||
1878 | ___ | ||
1879 | $code.=<<___ if ($win64); | ||
1880 | mov 0xa0(%rsp),$arg5 # pull ivp | ||
1881 | lea -0xa0(%rsp), %rsp | ||
1882 | movaps %xmm6, 0x40(%rsp) | ||
1883 | movaps %xmm7, 0x50(%rsp) | ||
1884 | movaps %xmm8, 0x60(%rsp) | ||
1885 | movaps %xmm9, 0x70(%rsp) | ||
1886 | movaps %xmm10, 0x80(%rsp) | ||
1887 | movaps %xmm11, 0x90(%rsp) | ||
1888 | movaps %xmm12, 0xa0(%rsp) | ||
1889 | movaps %xmm13, 0xb0(%rsp) | ||
1890 | movaps %xmm14, 0xc0(%rsp) | ||
1891 | movaps %xmm15, 0xd0(%rsp) | ||
1892 | .Lctr_enc_body: | ||
1893 | ___ | ||
1894 | $code.=<<___; | ||
1895 | mov %rsp, %rbp # backup %rsp | ||
1896 | movdqu ($arg5), %xmm0 # load counter | ||
1897 | mov 240($arg4), %eax # rounds | ||
1898 | mov $arg1, $inp # backup arguments | ||
1899 | mov $arg2, $out | ||
1900 | mov $arg3, $len | ||
1901 | mov $arg4, $key | ||
1902 | movdqa %xmm0, 0x20(%rbp) # copy counter | ||
1903 | cmp \$8, $arg3 | ||
1904 | jb .Lctr_enc_short | ||
1905 | |||
1906 | mov %eax, %ebx # rounds | ||
1907 | shl \$7, %rax # 128 bytes per inner round key | ||
1908 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
1909 | sub %rax, %rsp | ||
1910 | |||
1911 | mov %rsp, %rax # pass key schedule | ||
1912 | mov $key, %rcx # pass key | ||
1913 | mov %ebx, %r10d # pass rounds | ||
1914 | call _bsaes_key_convert | ||
1915 | pxor %xmm6,%xmm7 # fix up last round key | ||
1916 | movdqa %xmm7,(%rax) # save last round key | ||
1917 | |||
1918 | movdqa (%rsp), @XMM[9] # load round0 key | ||
1919 | lea .LADD1(%rip), %r11 | ||
1920 | movdqa 0x20(%rbp), @XMM[0] # counter copy | ||
1921 | movdqa -0x20(%r11), @XMM[8] # .LSWPUP | ||
1922 | pshufb @XMM[8], @XMM[9] # byte swap upper part | ||
1923 | pshufb @XMM[8], @XMM[0] | ||
1924 | movdqa @XMM[9], (%rsp) # save adjusted round0 key | ||
1925 | jmp .Lctr_enc_loop | ||
1926 | .align 16 | ||
1927 | .Lctr_enc_loop: | ||
1928 | movdqa @XMM[0], 0x20(%rbp) # save counter | ||
1929 | movdqa @XMM[0], @XMM[1] # prepare 8 counter values | ||
1930 | movdqa @XMM[0], @XMM[2] | ||
1931 | paddd 0x00(%r11), @XMM[1] # .LADD1 | ||
1932 | movdqa @XMM[0], @XMM[3] | ||
1933 | paddd 0x10(%r11), @XMM[2] # .LADD2 | ||
1934 | movdqa @XMM[0], @XMM[4] | ||
1935 | paddd 0x20(%r11), @XMM[3] # .LADD3 | ||
1936 | movdqa @XMM[0], @XMM[5] | ||
1937 | paddd 0x30(%r11), @XMM[4] # .LADD4 | ||
1938 | movdqa @XMM[0], @XMM[6] | ||
1939 | paddd 0x40(%r11), @XMM[5] # .LADD5 | ||
1940 | movdqa @XMM[0], @XMM[7] | ||
1941 | paddd 0x50(%r11), @XMM[6] # .LADD6 | ||
1942 | paddd 0x60(%r11), @XMM[7] # .LADD7 | ||
1943 | |||
1944 | # Borrow prologue from _bsaes_encrypt8 to use the opportunity | ||
1945 | # to flip byte order in 32-bit counter | ||
1946 | movdqa (%rsp), @XMM[9] # round 0 key | ||
1947 | lea 0x10(%rsp), %rax # pass key schedule | ||
1948 | movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR | ||
1949 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
1950 | pxor @XMM[9], @XMM[1] | ||
1951 | pshufb @XMM[8], @XMM[0] | ||
1952 | pxor @XMM[9], @XMM[2] | ||
1953 | pshufb @XMM[8], @XMM[1] | ||
1954 | pxor @XMM[9], @XMM[3] | ||
1955 | pshufb @XMM[8], @XMM[2] | ||
1956 | pxor @XMM[9], @XMM[4] | ||
1957 | pshufb @XMM[8], @XMM[3] | ||
1958 | pxor @XMM[9], @XMM[5] | ||
1959 | pshufb @XMM[8], @XMM[4] | ||
1960 | pxor @XMM[9], @XMM[6] | ||
1961 | pshufb @XMM[8], @XMM[5] | ||
1962 | pxor @XMM[9], @XMM[7] | ||
1963 | pshufb @XMM[8], @XMM[6] | ||
1964 | lea .LBS0(%rip), %r11 # constants table | ||
1965 | pshufb @XMM[8], @XMM[7] | ||
1966 | mov %ebx,%r10d # pass rounds | ||
1967 | |||
1968 | call _bsaes_encrypt8_bitslice | ||
1969 | |||
1970 | sub \$8,$len | ||
1971 | jc .Lctr_enc_loop_done | ||
1972 | |||
1973 | movdqu 0x00($inp), @XMM[8] # load input | ||
1974 | movdqu 0x10($inp), @XMM[9] | ||
1975 | movdqu 0x20($inp), @XMM[10] | ||
1976 | movdqu 0x30($inp), @XMM[11] | ||
1977 | movdqu 0x40($inp), @XMM[12] | ||
1978 | movdqu 0x50($inp), @XMM[13] | ||
1979 | movdqu 0x60($inp), @XMM[14] | ||
1980 | movdqu 0x70($inp), @XMM[15] | ||
1981 | lea 0x80($inp),$inp | ||
1982 | pxor @XMM[0], @XMM[8] | ||
1983 | movdqa 0x20(%rbp), @XMM[0] # load counter | ||
1984 | pxor @XMM[9], @XMM[1] | ||
1985 | movdqu @XMM[8], 0x00($out) # write output | ||
1986 | pxor @XMM[10], @XMM[4] | ||
1987 | movdqu @XMM[1], 0x10($out) | ||
1988 | pxor @XMM[11], @XMM[6] | ||
1989 | movdqu @XMM[4], 0x20($out) | ||
1990 | pxor @XMM[12], @XMM[3] | ||
1991 | movdqu @XMM[6], 0x30($out) | ||
1992 | pxor @XMM[13], @XMM[7] | ||
1993 | movdqu @XMM[3], 0x40($out) | ||
1994 | pxor @XMM[14], @XMM[2] | ||
1995 | movdqu @XMM[7], 0x50($out) | ||
1996 | pxor @XMM[15], @XMM[5] | ||
1997 | movdqu @XMM[2], 0x60($out) | ||
1998 | lea .LADD1(%rip), %r11 | ||
1999 | movdqu @XMM[5], 0x70($out) | ||
2000 | lea 0x80($out), $out | ||
2001 | paddd 0x70(%r11), @XMM[0] # .LADD8 | ||
2002 | jnz .Lctr_enc_loop | ||
2003 | |||
2004 | jmp .Lctr_enc_done | ||
2005 | .align 16 | ||
2006 | .Lctr_enc_loop_done: | ||
2007 | add \$8, $len | ||
2008 | movdqu 0x00($inp), @XMM[8] # load input | ||
2009 | pxor @XMM[8], @XMM[0] | ||
2010 | movdqu @XMM[0], 0x00($out) # write output | ||
2011 | cmp \$2,$len | ||
2012 | jb .Lctr_enc_done | ||
2013 | movdqu 0x10($inp), @XMM[9] | ||
2014 | pxor @XMM[9], @XMM[1] | ||
2015 | movdqu @XMM[1], 0x10($out) | ||
2016 | je .Lctr_enc_done | ||
2017 | movdqu 0x20($inp), @XMM[10] | ||
2018 | pxor @XMM[10], @XMM[4] | ||
2019 | movdqu @XMM[4], 0x20($out) | ||
2020 | cmp \$4,$len | ||
2021 | jb .Lctr_enc_done | ||
2022 | movdqu 0x30($inp), @XMM[11] | ||
2023 | pxor @XMM[11], @XMM[6] | ||
2024 | movdqu @XMM[6], 0x30($out) | ||
2025 | je .Lctr_enc_done | ||
2026 | movdqu 0x40($inp), @XMM[12] | ||
2027 | pxor @XMM[12], @XMM[3] | ||
2028 | movdqu @XMM[3], 0x40($out) | ||
2029 | cmp \$6,$len | ||
2030 | jb .Lctr_enc_done | ||
2031 | movdqu 0x50($inp), @XMM[13] | ||
2032 | pxor @XMM[13], @XMM[7] | ||
2033 | movdqu @XMM[7], 0x50($out) | ||
2034 | je .Lctr_enc_done | ||
2035 | movdqu 0x60($inp), @XMM[14] | ||
2036 | pxor @XMM[14], @XMM[2] | ||
2037 | movdqu @XMM[2], 0x60($out) | ||
2038 | jmp .Lctr_enc_done | ||
2039 | |||
2040 | .align 16 | ||
2041 | .Lctr_enc_short: | ||
2042 | lea 0x20(%rbp), $arg1 | ||
2043 | lea 0x30(%rbp), $arg2 | ||
2044 | lea ($key), $arg3 | ||
2045 | call asm_AES_encrypt | ||
2046 | movdqu ($inp), @XMM[1] | ||
2047 | lea 16($inp), $inp | ||
2048 | mov 0x2c(%rbp), %eax # load 32-bit counter | ||
2049 | bswap %eax | ||
2050 | pxor 0x30(%rbp), @XMM[1] | ||
2051 | inc %eax # increment | ||
2052 | movdqu @XMM[1], ($out) | ||
2053 | bswap %eax | ||
2054 | lea 16($out), $out | ||
2055 | mov %eax, 0x2c(%rsp) # save 32-bit counter | ||
2056 | dec $len | ||
2057 | jnz .Lctr_enc_short | ||
2058 | |||
2059 | .Lctr_enc_done: | ||
2060 | lea (%rsp), %rax | ||
2061 | pxor %xmm0, %xmm0 | ||
2062 | .Lctr_enc_bzero: # wipe key schedule [if any] | ||
2063 | movdqa %xmm0, 0x00(%rax) | ||
2064 | movdqa %xmm0, 0x10(%rax) | ||
2065 | lea 0x20(%rax), %rax | ||
2066 | cmp %rax, %rbp | ||
2067 | ja .Lctr_enc_bzero | ||
2068 | |||
2069 | lea (%rbp),%rsp # restore %rsp | ||
2070 | ___ | ||
2071 | $code.=<<___ if ($win64); | ||
2072 | movaps 0x40(%rbp), %xmm6 | ||
2073 | movaps 0x50(%rbp), %xmm7 | ||
2074 | movaps 0x60(%rbp), %xmm8 | ||
2075 | movaps 0x70(%rbp), %xmm9 | ||
2076 | movaps 0x80(%rbp), %xmm10 | ||
2077 | movaps 0x90(%rbp), %xmm11 | ||
2078 | movaps 0xa0(%rbp), %xmm12 | ||
2079 | movaps 0xb0(%rbp), %xmm13 | ||
2080 | movaps 0xc0(%rbp), %xmm14 | ||
2081 | movaps 0xd0(%rbp), %xmm15 | ||
2082 | lea 0xa0(%rbp), %rsp | ||
2083 | ___ | ||
2084 | $code.=<<___; | ||
2085 | mov 0x48(%rsp), %r15 | ||
2086 | mov 0x50(%rsp), %r14 | ||
2087 | mov 0x58(%rsp), %r13 | ||
2088 | mov 0x60(%rsp), %r12 | ||
2089 | mov 0x68(%rsp), %rbx | ||
2090 | mov 0x70(%rsp), %rax | ||
2091 | lea 0x78(%rsp), %rsp | ||
2092 | mov %rax, %rbp | ||
2093 | .Lctr_enc_epilogue: | ||
2094 | ret | ||
2095 | .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks | ||
2096 | ___ | ||
2097 | ###################################################################### | ||
2098 | # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
2099 | # const AES_KEY *key1, const AES_KEY *key2, | ||
2100 | # const unsigned char iv[16]); | ||
2101 | # | ||
2102 | my ($twmask,$twres,$twtmp)=@XMM[13..15]; | ||
2103 | $arg6=~s/d$//; | ||
2104 | |||
2105 | $code.=<<___; | ||
2106 | .globl bsaes_xts_encrypt | ||
2107 | .type bsaes_xts_encrypt,\@abi-omnipotent | ||
2108 | .align 16 | ||
2109 | bsaes_xts_encrypt: | ||
2110 | _CET_ENDBR | ||
2111 | mov %rsp, %rax | ||
2112 | .Lxts_enc_prologue: | ||
2113 | push %rbp | ||
2114 | push %rbx | ||
2115 | push %r12 | ||
2116 | push %r13 | ||
2117 | push %r14 | ||
2118 | push %r15 | ||
2119 | lea -0x48(%rsp), %rsp | ||
2120 | ___ | ||
2121 | $code.=<<___ if ($win64); | ||
2122 | mov 0xa0(%rsp),$arg5 # pull key2 | ||
2123 | mov 0xa8(%rsp),$arg6 # pull ivp | ||
2124 | lea -0xa0(%rsp), %rsp | ||
2125 | movaps %xmm6, 0x40(%rsp) | ||
2126 | movaps %xmm7, 0x50(%rsp) | ||
2127 | movaps %xmm8, 0x60(%rsp) | ||
2128 | movaps %xmm9, 0x70(%rsp) | ||
2129 | movaps %xmm10, 0x80(%rsp) | ||
2130 | movaps %xmm11, 0x90(%rsp) | ||
2131 | movaps %xmm12, 0xa0(%rsp) | ||
2132 | movaps %xmm13, 0xb0(%rsp) | ||
2133 | movaps %xmm14, 0xc0(%rsp) | ||
2134 | movaps %xmm15, 0xd0(%rsp) | ||
2135 | .Lxts_enc_body: | ||
2136 | ___ | ||
2137 | $code.=<<___; | ||
2138 | mov %rsp, %rbp # backup %rsp | ||
2139 | mov $arg1, $inp # backup arguments | ||
2140 | mov $arg2, $out | ||
2141 | mov $arg3, $len | ||
2142 | mov $arg4, $key | ||
2143 | |||
2144 | lea ($arg6), $arg1 | ||
2145 | lea 0x20(%rbp), $arg2 | ||
2146 | lea ($arg5), $arg3 | ||
2147 | call asm_AES_encrypt # generate initial tweak | ||
2148 | |||
2149 | mov 240($key), %eax # rounds | ||
2150 | mov $len, %rbx # backup $len | ||
2151 | |||
2152 | mov %eax, %edx # rounds | ||
2153 | shl \$7, %rax # 128 bytes per inner round key | ||
2154 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
2155 | sub %rax, %rsp | ||
2156 | |||
2157 | mov %rsp, %rax # pass key schedule | ||
2158 | mov $key, %rcx # pass key | ||
2159 | mov %edx, %r10d # pass rounds | ||
2160 | call _bsaes_key_convert | ||
2161 | pxor %xmm6, %xmm7 # fix up last round key | ||
2162 | movdqa %xmm7, (%rax) # save last round key | ||
2163 | |||
2164 | and \$-16, $len | ||
2165 | sub \$0x80, %rsp # place for tweak[8] | ||
2166 | movdqa 0x20(%rbp), @XMM[7] # initial tweak | ||
2167 | |||
2168 | pxor $twtmp, $twtmp | ||
2169 | movdqa .Lxts_magic(%rip), $twmask | ||
2170 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2171 | |||
2172 | sub \$0x80, $len | ||
2173 | jc .Lxts_enc_short | ||
2174 | jmp .Lxts_enc_loop | ||
2175 | |||
2176 | .align 16 | ||
2177 | .Lxts_enc_loop: | ||
2178 | ___ | ||
2179 | for ($i=0;$i<7;$i++) { | ||
2180 | $code.=<<___; | ||
2181 | pshufd \$0x13, $twtmp, $twres | ||
2182 | pxor $twtmp, $twtmp | ||
2183 | movdqa @XMM[7], @XMM[$i] | ||
2184 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
2185 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2186 | pand $twmask, $twres # isolate carry and residue | ||
2187 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2188 | pxor $twres, @XMM[7] | ||
2189 | ___ | ||
2190 | $code.=<<___ if ($i>=1); | ||
2191 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
2192 | ___ | ||
2193 | $code.=<<___ if ($i>=2); | ||
2194 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
2195 | ___ | ||
2196 | } | ||
2197 | $code.=<<___; | ||
2198 | movdqu 0x60($inp), @XMM[8+6] | ||
2199 | pxor @XMM[8+5], @XMM[5] | ||
2200 | movdqu 0x70($inp), @XMM[8+7] | ||
2201 | lea 0x80($inp), $inp | ||
2202 | movdqa @XMM[7], 0x70(%rsp) | ||
2203 | pxor @XMM[8+6], @XMM[6] | ||
2204 | lea 0x80(%rsp), %rax # pass key schedule | ||
2205 | pxor @XMM[8+7], @XMM[7] | ||
2206 | mov %edx, %r10d # pass rounds | ||
2207 | |||
2208 | call _bsaes_encrypt8 | ||
2209 | |||
2210 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2211 | pxor 0x10(%rsp), @XMM[1] | ||
2212 | movdqu @XMM[0], 0x00($out) # write output | ||
2213 | pxor 0x20(%rsp), @XMM[4] | ||
2214 | movdqu @XMM[1], 0x10($out) | ||
2215 | pxor 0x30(%rsp), @XMM[6] | ||
2216 | movdqu @XMM[4], 0x20($out) | ||
2217 | pxor 0x40(%rsp), @XMM[3] | ||
2218 | movdqu @XMM[6], 0x30($out) | ||
2219 | pxor 0x50(%rsp), @XMM[7] | ||
2220 | movdqu @XMM[3], 0x40($out) | ||
2221 | pxor 0x60(%rsp), @XMM[2] | ||
2222 | movdqu @XMM[7], 0x50($out) | ||
2223 | pxor 0x70(%rsp), @XMM[5] | ||
2224 | movdqu @XMM[2], 0x60($out) | ||
2225 | movdqu @XMM[5], 0x70($out) | ||
2226 | lea 0x80($out), $out | ||
2227 | |||
2228 | movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak | ||
2229 | pxor $twtmp, $twtmp | ||
2230 | movdqa .Lxts_magic(%rip), $twmask | ||
2231 | pcmpgtd @XMM[7], $twtmp | ||
2232 | pshufd \$0x13, $twtmp, $twres | ||
2233 | pxor $twtmp, $twtmp | ||
2234 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2235 | pand $twmask, $twres # isolate carry and residue | ||
2236 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2237 | pxor $twres, @XMM[7] | ||
2238 | |||
2239 | sub \$0x80,$len | ||
2240 | jnc .Lxts_enc_loop | ||
2241 | |||
2242 | .Lxts_enc_short: | ||
2243 | add \$0x80, $len | ||
2244 | jz .Lxts_enc_done | ||
2245 | ___ | ||
2246 | for ($i=0;$i<7;$i++) { | ||
2247 | $code.=<<___; | ||
2248 | pshufd \$0x13, $twtmp, $twres | ||
2249 | pxor $twtmp, $twtmp | ||
2250 | movdqa @XMM[7], @XMM[$i] | ||
2251 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
2252 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2253 | pand $twmask, $twres # isolate carry and residue | ||
2254 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2255 | pxor $twres, @XMM[7] | ||
2256 | ___ | ||
2257 | $code.=<<___ if ($i>=1); | ||
2258 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
2259 | cmp \$`0x10*$i`,$len | ||
2260 | je .Lxts_enc_$i | ||
2261 | ___ | ||
2262 | $code.=<<___ if ($i>=2); | ||
2263 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
2264 | ___ | ||
2265 | } | ||
2266 | $code.=<<___; | ||
2267 | movdqu 0x60($inp), @XMM[8+6] | ||
2268 | pxor @XMM[8+5], @XMM[5] | ||
2269 | movdqa @XMM[7], 0x70(%rsp) | ||
2270 | lea 0x70($inp), $inp | ||
2271 | pxor @XMM[8+6], @XMM[6] | ||
2272 | lea 0x80(%rsp), %rax # pass key schedule | ||
2273 | mov %edx, %r10d # pass rounds | ||
2274 | |||
2275 | call _bsaes_encrypt8 | ||
2276 | |||
2277 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2278 | pxor 0x10(%rsp), @XMM[1] | ||
2279 | movdqu @XMM[0], 0x00($out) # write output | ||
2280 | pxor 0x20(%rsp), @XMM[4] | ||
2281 | movdqu @XMM[1], 0x10($out) | ||
2282 | pxor 0x30(%rsp), @XMM[6] | ||
2283 | movdqu @XMM[4], 0x20($out) | ||
2284 | pxor 0x40(%rsp), @XMM[3] | ||
2285 | movdqu @XMM[6], 0x30($out) | ||
2286 | pxor 0x50(%rsp), @XMM[7] | ||
2287 | movdqu @XMM[3], 0x40($out) | ||
2288 | pxor 0x60(%rsp), @XMM[2] | ||
2289 | movdqu @XMM[7], 0x50($out) | ||
2290 | movdqu @XMM[2], 0x60($out) | ||
2291 | lea 0x70($out), $out | ||
2292 | |||
2293 | movdqa 0x70(%rsp), @XMM[7] # next iteration tweak | ||
2294 | jmp .Lxts_enc_done | ||
2295 | .align 16 | ||
2296 | .Lxts_enc_6: | ||
2297 | pxor @XMM[8+4], @XMM[4] | ||
2298 | lea 0x60($inp), $inp | ||
2299 | pxor @XMM[8+5], @XMM[5] | ||
2300 | lea 0x80(%rsp), %rax # pass key schedule | ||
2301 | mov %edx, %r10d # pass rounds | ||
2302 | |||
2303 | call _bsaes_encrypt8 | ||
2304 | |||
2305 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2306 | pxor 0x10(%rsp), @XMM[1] | ||
2307 | movdqu @XMM[0], 0x00($out) # write output | ||
2308 | pxor 0x20(%rsp), @XMM[4] | ||
2309 | movdqu @XMM[1], 0x10($out) | ||
2310 | pxor 0x30(%rsp), @XMM[6] | ||
2311 | movdqu @XMM[4], 0x20($out) | ||
2312 | pxor 0x40(%rsp), @XMM[3] | ||
2313 | movdqu @XMM[6], 0x30($out) | ||
2314 | pxor 0x50(%rsp), @XMM[7] | ||
2315 | movdqu @XMM[3], 0x40($out) | ||
2316 | movdqu @XMM[7], 0x50($out) | ||
2317 | lea 0x60($out), $out | ||
2318 | |||
2319 | movdqa 0x60(%rsp), @XMM[7] # next iteration tweak | ||
2320 | jmp .Lxts_enc_done | ||
2321 | .align 16 | ||
2322 | .Lxts_enc_5: | ||
2323 | pxor @XMM[8+3], @XMM[3] | ||
2324 | lea 0x50($inp), $inp | ||
2325 | pxor @XMM[8+4], @XMM[4] | ||
2326 | lea 0x80(%rsp), %rax # pass key schedule | ||
2327 | mov %edx, %r10d # pass rounds | ||
2328 | |||
2329 | call _bsaes_encrypt8 | ||
2330 | |||
2331 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2332 | pxor 0x10(%rsp), @XMM[1] | ||
2333 | movdqu @XMM[0], 0x00($out) # write output | ||
2334 | pxor 0x20(%rsp), @XMM[4] | ||
2335 | movdqu @XMM[1], 0x10($out) | ||
2336 | pxor 0x30(%rsp), @XMM[6] | ||
2337 | movdqu @XMM[4], 0x20($out) | ||
2338 | pxor 0x40(%rsp), @XMM[3] | ||
2339 | movdqu @XMM[6], 0x30($out) | ||
2340 | movdqu @XMM[3], 0x40($out) | ||
2341 | lea 0x50($out), $out | ||
2342 | |||
2343 | movdqa 0x50(%rsp), @XMM[7] # next iteration tweak | ||
2344 | jmp .Lxts_enc_done | ||
2345 | .align 16 | ||
2346 | .Lxts_enc_4: | ||
2347 | pxor @XMM[8+2], @XMM[2] | ||
2348 | lea 0x40($inp), $inp | ||
2349 | pxor @XMM[8+3], @XMM[3] | ||
2350 | lea 0x80(%rsp), %rax # pass key schedule | ||
2351 | mov %edx, %r10d # pass rounds | ||
2352 | |||
2353 | call _bsaes_encrypt8 | ||
2354 | |||
2355 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2356 | pxor 0x10(%rsp), @XMM[1] | ||
2357 | movdqu @XMM[0], 0x00($out) # write output | ||
2358 | pxor 0x20(%rsp), @XMM[4] | ||
2359 | movdqu @XMM[1], 0x10($out) | ||
2360 | pxor 0x30(%rsp), @XMM[6] | ||
2361 | movdqu @XMM[4], 0x20($out) | ||
2362 | movdqu @XMM[6], 0x30($out) | ||
2363 | lea 0x40($out), $out | ||
2364 | |||
2365 | movdqa 0x40(%rsp), @XMM[7] # next iteration tweak | ||
2366 | jmp .Lxts_enc_done | ||
2367 | .align 16 | ||
2368 | .Lxts_enc_3: | ||
2369 | pxor @XMM[8+1], @XMM[1] | ||
2370 | lea 0x30($inp), $inp | ||
2371 | pxor @XMM[8+2], @XMM[2] | ||
2372 | lea 0x80(%rsp), %rax # pass key schedule | ||
2373 | mov %edx, %r10d # pass rounds | ||
2374 | |||
2375 | call _bsaes_encrypt8 | ||
2376 | |||
2377 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2378 | pxor 0x10(%rsp), @XMM[1] | ||
2379 | movdqu @XMM[0], 0x00($out) # write output | ||
2380 | pxor 0x20(%rsp), @XMM[4] | ||
2381 | movdqu @XMM[1], 0x10($out) | ||
2382 | movdqu @XMM[4], 0x20($out) | ||
2383 | lea 0x30($out), $out | ||
2384 | |||
2385 | movdqa 0x30(%rsp), @XMM[7] # next iteration tweak | ||
2386 | jmp .Lxts_enc_done | ||
2387 | .align 16 | ||
2388 | .Lxts_enc_2: | ||
2389 | pxor @XMM[8+0], @XMM[0] | ||
2390 | lea 0x20($inp), $inp | ||
2391 | pxor @XMM[8+1], @XMM[1] | ||
2392 | lea 0x80(%rsp), %rax # pass key schedule | ||
2393 | mov %edx, %r10d # pass rounds | ||
2394 | |||
2395 | call _bsaes_encrypt8 | ||
2396 | |||
2397 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2398 | pxor 0x10(%rsp), @XMM[1] | ||
2399 | movdqu @XMM[0], 0x00($out) # write output | ||
2400 | movdqu @XMM[1], 0x10($out) | ||
2401 | lea 0x20($out), $out | ||
2402 | |||
2403 | movdqa 0x20(%rsp), @XMM[7] # next iteration tweak | ||
2404 | jmp .Lxts_enc_done | ||
2405 | .align 16 | ||
2406 | .Lxts_enc_1: | ||
2407 | pxor @XMM[0], @XMM[8] | ||
2408 | lea 0x10($inp), $inp | ||
2409 | movdqa @XMM[8], 0x20(%rbp) | ||
2410 | lea 0x20(%rbp), $arg1 | ||
2411 | lea 0x20(%rbp), $arg2 | ||
2412 | lea ($key), $arg3 | ||
2413 | call asm_AES_encrypt # doesn't touch %xmm | ||
2414 | pxor 0x20(%rbp), @XMM[0] # ^= tweak[] | ||
2415 | #pxor @XMM[8], @XMM[0] | ||
2416 | #lea 0x80(%rsp), %rax # pass key schedule | ||
2417 | #mov %edx, %r10d # pass rounds | ||
2418 | #call _bsaes_encrypt8 | ||
2419 | #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2420 | movdqu @XMM[0], 0x00($out) # write output | ||
2421 | lea 0x10($out), $out | ||
2422 | |||
2423 | movdqa 0x10(%rsp), @XMM[7] # next iteration tweak | ||
2424 | |||
2425 | .Lxts_enc_done: | ||
2426 | and \$15, %ebx | ||
2427 | jz .Lxts_enc_ret | ||
2428 | mov $out, %rdx | ||
2429 | |||
2430 | .Lxts_enc_steal: | ||
2431 | movzb ($inp), %eax | ||
2432 | movzb -16(%rdx), %ecx | ||
2433 | lea 1($inp), $inp | ||
2434 | mov %al, -16(%rdx) | ||
2435 | mov %cl, 0(%rdx) | ||
2436 | lea 1(%rdx), %rdx | ||
2437 | sub \$1,%ebx | ||
2438 | jnz .Lxts_enc_steal | ||
2439 | |||
2440 | movdqu -16($out), @XMM[0] | ||
2441 | lea 0x20(%rbp), $arg1 | ||
2442 | pxor @XMM[7], @XMM[0] | ||
2443 | lea 0x20(%rbp), $arg2 | ||
2444 | movdqa @XMM[0], 0x20(%rbp) | ||
2445 | lea ($key), $arg3 | ||
2446 | call asm_AES_encrypt # doesn't touch %xmm | ||
2447 | pxor 0x20(%rbp), @XMM[7] | ||
2448 | movdqu @XMM[7], -16($out) | ||
2449 | |||
2450 | .Lxts_enc_ret: | ||
2451 | lea (%rsp), %rax | ||
2452 | pxor %xmm0, %xmm0 | ||
2453 | .Lxts_enc_bzero: # wipe key schedule [if any] | ||
2454 | movdqa %xmm0, 0x00(%rax) | ||
2455 | movdqa %xmm0, 0x10(%rax) | ||
2456 | lea 0x20(%rax), %rax | ||
2457 | cmp %rax, %rbp | ||
2458 | ja .Lxts_enc_bzero | ||
2459 | |||
2460 | lea (%rbp),%rsp # restore %rsp | ||
2461 | ___ | ||
2462 | $code.=<<___ if ($win64); | ||
2463 | movaps 0x40(%rbp), %xmm6 | ||
2464 | movaps 0x50(%rbp), %xmm7 | ||
2465 | movaps 0x60(%rbp), %xmm8 | ||
2466 | movaps 0x70(%rbp), %xmm9 | ||
2467 | movaps 0x80(%rbp), %xmm10 | ||
2468 | movaps 0x90(%rbp), %xmm11 | ||
2469 | movaps 0xa0(%rbp), %xmm12 | ||
2470 | movaps 0xb0(%rbp), %xmm13 | ||
2471 | movaps 0xc0(%rbp), %xmm14 | ||
2472 | movaps 0xd0(%rbp), %xmm15 | ||
2473 | lea 0xa0(%rbp), %rsp | ||
2474 | ___ | ||
2475 | $code.=<<___; | ||
2476 | mov 0x48(%rsp), %r15 | ||
2477 | mov 0x50(%rsp), %r14 | ||
2478 | mov 0x58(%rsp), %r13 | ||
2479 | mov 0x60(%rsp), %r12 | ||
2480 | mov 0x68(%rsp), %rbx | ||
2481 | mov 0x70(%rsp), %rax | ||
2482 | lea 0x78(%rsp), %rsp | ||
2483 | mov %rax, %rbp | ||
2484 | .Lxts_enc_epilogue: | ||
2485 | ret | ||
2486 | .size bsaes_xts_encrypt,.-bsaes_xts_encrypt | ||
2487 | |||
2488 | .globl bsaes_xts_decrypt | ||
2489 | .type bsaes_xts_decrypt,\@abi-omnipotent | ||
2490 | .align 16 | ||
2491 | bsaes_xts_decrypt: | ||
2492 | _CET_ENDBR | ||
2493 | mov %rsp, %rax | ||
2494 | .Lxts_dec_prologue: | ||
2495 | push %rbp | ||
2496 | push %rbx | ||
2497 | push %r12 | ||
2498 | push %r13 | ||
2499 | push %r14 | ||
2500 | push %r15 | ||
2501 | lea -0x48(%rsp), %rsp | ||
2502 | ___ | ||
2503 | $code.=<<___ if ($win64); | ||
2504 | mov 0xa0(%rsp),$arg5 # pull key2 | ||
2505 | mov 0xa8(%rsp),$arg6 # pull ivp | ||
2506 | lea -0xa0(%rsp), %rsp | ||
2507 | movaps %xmm6, 0x40(%rsp) | ||
2508 | movaps %xmm7, 0x50(%rsp) | ||
2509 | movaps %xmm8, 0x60(%rsp) | ||
2510 | movaps %xmm9, 0x70(%rsp) | ||
2511 | movaps %xmm10, 0x80(%rsp) | ||
2512 | movaps %xmm11, 0x90(%rsp) | ||
2513 | movaps %xmm12, 0xa0(%rsp) | ||
2514 | movaps %xmm13, 0xb0(%rsp) | ||
2515 | movaps %xmm14, 0xc0(%rsp) | ||
2516 | movaps %xmm15, 0xd0(%rsp) | ||
2517 | .Lxts_dec_body: | ||
2518 | ___ | ||
2519 | $code.=<<___; | ||
2520 | mov %rsp, %rbp # backup %rsp | ||
2521 | mov $arg1, $inp # backup arguments | ||
2522 | mov $arg2, $out | ||
2523 | mov $arg3, $len | ||
2524 | mov $arg4, $key | ||
2525 | |||
2526 | lea ($arg6), $arg1 | ||
2527 | lea 0x20(%rbp), $arg2 | ||
2528 | lea ($arg5), $arg3 | ||
2529 | call asm_AES_encrypt # generate initial tweak | ||
2530 | |||
2531 | mov 240($key), %eax # rounds | ||
2532 | mov $len, %rbx # backup $len | ||
2533 | |||
2534 | mov %eax, %edx # rounds | ||
2535 | shl \$7, %rax # 128 bytes per inner round key | ||
2536 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
2537 | sub %rax, %rsp | ||
2538 | |||
2539 | mov %rsp, %rax # pass key schedule | ||
2540 | mov $key, %rcx # pass key | ||
2541 | mov %edx, %r10d # pass rounds | ||
2542 | call _bsaes_key_convert | ||
2543 | pxor (%rsp), %xmm7 # fix up round 0 key | ||
2544 | movdqa %xmm6, (%rax) # save last round key | ||
2545 | movdqa %xmm7, (%rsp) | ||
2546 | |||
2547 | xor %eax, %eax # if ($len%16) len-=16; | ||
2548 | and \$-16, $len | ||
2549 | test \$15, %ebx | ||
2550 | setnz %al | ||
2551 | shl \$4, %rax | ||
2552 | sub %rax, $len | ||
2553 | |||
2554 | sub \$0x80, %rsp # place for tweak[8] | ||
2555 | movdqa 0x20(%rbp), @XMM[7] # initial tweak | ||
2556 | |||
2557 | pxor $twtmp, $twtmp | ||
2558 | movdqa .Lxts_magic(%rip), $twmask | ||
2559 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2560 | |||
2561 | sub \$0x80, $len | ||
2562 | jc .Lxts_dec_short | ||
2563 | jmp .Lxts_dec_loop | ||
2564 | |||
2565 | .align 16 | ||
2566 | .Lxts_dec_loop: | ||
2567 | ___ | ||
2568 | for ($i=0;$i<7;$i++) { | ||
2569 | $code.=<<___; | ||
2570 | pshufd \$0x13, $twtmp, $twres | ||
2571 | pxor $twtmp, $twtmp | ||
2572 | movdqa @XMM[7], @XMM[$i] | ||
2573 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
2574 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2575 | pand $twmask, $twres # isolate carry and residue | ||
2576 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2577 | pxor $twres, @XMM[7] | ||
2578 | ___ | ||
2579 | $code.=<<___ if ($i>=1); | ||
2580 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
2581 | ___ | ||
2582 | $code.=<<___ if ($i>=2); | ||
2583 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
2584 | ___ | ||
2585 | } | ||
2586 | $code.=<<___; | ||
2587 | movdqu 0x60($inp), @XMM[8+6] | ||
2588 | pxor @XMM[8+5], @XMM[5] | ||
2589 | movdqu 0x70($inp), @XMM[8+7] | ||
2590 | lea 0x80($inp), $inp | ||
2591 | movdqa @XMM[7], 0x70(%rsp) | ||
2592 | pxor @XMM[8+6], @XMM[6] | ||
2593 | lea 0x80(%rsp), %rax # pass key schedule | ||
2594 | pxor @XMM[8+7], @XMM[7] | ||
2595 | mov %edx, %r10d # pass rounds | ||
2596 | |||
2597 | call _bsaes_decrypt8 | ||
2598 | |||
2599 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2600 | pxor 0x10(%rsp), @XMM[1] | ||
2601 | movdqu @XMM[0], 0x00($out) # write output | ||
2602 | pxor 0x20(%rsp), @XMM[6] | ||
2603 | movdqu @XMM[1], 0x10($out) | ||
2604 | pxor 0x30(%rsp), @XMM[4] | ||
2605 | movdqu @XMM[6], 0x20($out) | ||
2606 | pxor 0x40(%rsp), @XMM[2] | ||
2607 | movdqu @XMM[4], 0x30($out) | ||
2608 | pxor 0x50(%rsp), @XMM[7] | ||
2609 | movdqu @XMM[2], 0x40($out) | ||
2610 | pxor 0x60(%rsp), @XMM[3] | ||
2611 | movdqu @XMM[7], 0x50($out) | ||
2612 | pxor 0x70(%rsp), @XMM[5] | ||
2613 | movdqu @XMM[3], 0x60($out) | ||
2614 | movdqu @XMM[5], 0x70($out) | ||
2615 | lea 0x80($out), $out | ||
2616 | |||
2617 | movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak | ||
2618 | pxor $twtmp, $twtmp | ||
2619 | movdqa .Lxts_magic(%rip), $twmask | ||
2620 | pcmpgtd @XMM[7], $twtmp | ||
2621 | pshufd \$0x13, $twtmp, $twres | ||
2622 | pxor $twtmp, $twtmp | ||
2623 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2624 | pand $twmask, $twres # isolate carry and residue | ||
2625 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2626 | pxor $twres, @XMM[7] | ||
2627 | |||
2628 | sub \$0x80,$len | ||
2629 | jnc .Lxts_dec_loop | ||
2630 | |||
2631 | .Lxts_dec_short: | ||
2632 | add \$0x80, $len | ||
2633 | jz .Lxts_dec_done | ||
2634 | ___ | ||
2635 | for ($i=0;$i<7;$i++) { | ||
2636 | $code.=<<___; | ||
2637 | pshufd \$0x13, $twtmp, $twres | ||
2638 | pxor $twtmp, $twtmp | ||
2639 | movdqa @XMM[7], @XMM[$i] | ||
2640 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
2641 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2642 | pand $twmask, $twres # isolate carry and residue | ||
2643 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2644 | pxor $twres, @XMM[7] | ||
2645 | ___ | ||
2646 | $code.=<<___ if ($i>=1); | ||
2647 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
2648 | cmp \$`0x10*$i`,$len | ||
2649 | je .Lxts_dec_$i | ||
2650 | ___ | ||
2651 | $code.=<<___ if ($i>=2); | ||
2652 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
2653 | ___ | ||
2654 | } | ||
2655 | $code.=<<___; | ||
2656 | movdqu 0x60($inp), @XMM[8+6] | ||
2657 | pxor @XMM[8+5], @XMM[5] | ||
2658 | movdqa @XMM[7], 0x70(%rsp) | ||
2659 | lea 0x70($inp), $inp | ||
2660 | pxor @XMM[8+6], @XMM[6] | ||
2661 | lea 0x80(%rsp), %rax # pass key schedule | ||
2662 | mov %edx, %r10d # pass rounds | ||
2663 | |||
2664 | call _bsaes_decrypt8 | ||
2665 | |||
2666 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2667 | pxor 0x10(%rsp), @XMM[1] | ||
2668 | movdqu @XMM[0], 0x00($out) # write output | ||
2669 | pxor 0x20(%rsp), @XMM[6] | ||
2670 | movdqu @XMM[1], 0x10($out) | ||
2671 | pxor 0x30(%rsp), @XMM[4] | ||
2672 | movdqu @XMM[6], 0x20($out) | ||
2673 | pxor 0x40(%rsp), @XMM[2] | ||
2674 | movdqu @XMM[4], 0x30($out) | ||
2675 | pxor 0x50(%rsp), @XMM[7] | ||
2676 | movdqu @XMM[2], 0x40($out) | ||
2677 | pxor 0x60(%rsp), @XMM[3] | ||
2678 | movdqu @XMM[7], 0x50($out) | ||
2679 | movdqu @XMM[3], 0x60($out) | ||
2680 | lea 0x70($out), $out | ||
2681 | |||
2682 | movdqa 0x70(%rsp), @XMM[7] # next iteration tweak | ||
2683 | jmp .Lxts_dec_done | ||
2684 | .align 16 | ||
2685 | .Lxts_dec_6: | ||
2686 | pxor @XMM[8+4], @XMM[4] | ||
2687 | lea 0x60($inp), $inp | ||
2688 | pxor @XMM[8+5], @XMM[5] | ||
2689 | lea 0x80(%rsp), %rax # pass key schedule | ||
2690 | mov %edx, %r10d # pass rounds | ||
2691 | |||
2692 | call _bsaes_decrypt8 | ||
2693 | |||
2694 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2695 | pxor 0x10(%rsp), @XMM[1] | ||
2696 | movdqu @XMM[0], 0x00($out) # write output | ||
2697 | pxor 0x20(%rsp), @XMM[6] | ||
2698 | movdqu @XMM[1], 0x10($out) | ||
2699 | pxor 0x30(%rsp), @XMM[4] | ||
2700 | movdqu @XMM[6], 0x20($out) | ||
2701 | pxor 0x40(%rsp), @XMM[2] | ||
2702 | movdqu @XMM[4], 0x30($out) | ||
2703 | pxor 0x50(%rsp), @XMM[7] | ||
2704 | movdqu @XMM[2], 0x40($out) | ||
2705 | movdqu @XMM[7], 0x50($out) | ||
2706 | lea 0x60($out), $out | ||
2707 | |||
2708 | movdqa 0x60(%rsp), @XMM[7] # next iteration tweak | ||
2709 | jmp .Lxts_dec_done | ||
2710 | .align 16 | ||
2711 | .Lxts_dec_5: | ||
2712 | pxor @XMM[8+3], @XMM[3] | ||
2713 | lea 0x50($inp), $inp | ||
2714 | pxor @XMM[8+4], @XMM[4] | ||
2715 | lea 0x80(%rsp), %rax # pass key schedule | ||
2716 | mov %edx, %r10d # pass rounds | ||
2717 | |||
2718 | call _bsaes_decrypt8 | ||
2719 | |||
2720 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2721 | pxor 0x10(%rsp), @XMM[1] | ||
2722 | movdqu @XMM[0], 0x00($out) # write output | ||
2723 | pxor 0x20(%rsp), @XMM[6] | ||
2724 | movdqu @XMM[1], 0x10($out) | ||
2725 | pxor 0x30(%rsp), @XMM[4] | ||
2726 | movdqu @XMM[6], 0x20($out) | ||
2727 | pxor 0x40(%rsp), @XMM[2] | ||
2728 | movdqu @XMM[4], 0x30($out) | ||
2729 | movdqu @XMM[2], 0x40($out) | ||
2730 | lea 0x50($out), $out | ||
2731 | |||
2732 | movdqa 0x50(%rsp), @XMM[7] # next iteration tweak | ||
2733 | jmp .Lxts_dec_done | ||
2734 | .align 16 | ||
2735 | .Lxts_dec_4: | ||
2736 | pxor @XMM[8+2], @XMM[2] | ||
2737 | lea 0x40($inp), $inp | ||
2738 | pxor @XMM[8+3], @XMM[3] | ||
2739 | lea 0x80(%rsp), %rax # pass key schedule | ||
2740 | mov %edx, %r10d # pass rounds | ||
2741 | |||
2742 | call _bsaes_decrypt8 | ||
2743 | |||
2744 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2745 | pxor 0x10(%rsp), @XMM[1] | ||
2746 | movdqu @XMM[0], 0x00($out) # write output | ||
2747 | pxor 0x20(%rsp), @XMM[6] | ||
2748 | movdqu @XMM[1], 0x10($out) | ||
2749 | pxor 0x30(%rsp), @XMM[4] | ||
2750 | movdqu @XMM[6], 0x20($out) | ||
2751 | movdqu @XMM[4], 0x30($out) | ||
2752 | lea 0x40($out), $out | ||
2753 | |||
2754 | movdqa 0x40(%rsp), @XMM[7] # next iteration tweak | ||
2755 | jmp .Lxts_dec_done | ||
2756 | .align 16 | ||
2757 | .Lxts_dec_3: | ||
2758 | pxor @XMM[8+1], @XMM[1] | ||
2759 | lea 0x30($inp), $inp | ||
2760 | pxor @XMM[8+2], @XMM[2] | ||
2761 | lea 0x80(%rsp), %rax # pass key schedule | ||
2762 | mov %edx, %r10d # pass rounds | ||
2763 | |||
2764 | call _bsaes_decrypt8 | ||
2765 | |||
2766 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2767 | pxor 0x10(%rsp), @XMM[1] | ||
2768 | movdqu @XMM[0], 0x00($out) # write output | ||
2769 | pxor 0x20(%rsp), @XMM[6] | ||
2770 | movdqu @XMM[1], 0x10($out) | ||
2771 | movdqu @XMM[6], 0x20($out) | ||
2772 | lea 0x30($out), $out | ||
2773 | |||
2774 | movdqa 0x30(%rsp), @XMM[7] # next iteration tweak | ||
2775 | jmp .Lxts_dec_done | ||
2776 | .align 16 | ||
2777 | .Lxts_dec_2: | ||
2778 | pxor @XMM[8+0], @XMM[0] | ||
2779 | lea 0x20($inp), $inp | ||
2780 | pxor @XMM[8+1], @XMM[1] | ||
2781 | lea 0x80(%rsp), %rax # pass key schedule | ||
2782 | mov %edx, %r10d # pass rounds | ||
2783 | |||
2784 | call _bsaes_decrypt8 | ||
2785 | |||
2786 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2787 | pxor 0x10(%rsp), @XMM[1] | ||
2788 | movdqu @XMM[0], 0x00($out) # write output | ||
2789 | movdqu @XMM[1], 0x10($out) | ||
2790 | lea 0x20($out), $out | ||
2791 | |||
2792 | movdqa 0x20(%rsp), @XMM[7] # next iteration tweak | ||
2793 | jmp .Lxts_dec_done | ||
2794 | .align 16 | ||
2795 | .Lxts_dec_1: | ||
2796 | pxor @XMM[0], @XMM[8] | ||
2797 | lea 0x10($inp), $inp | ||
2798 | movdqa @XMM[8], 0x20(%rbp) | ||
2799 | lea 0x20(%rbp), $arg1 | ||
2800 | lea 0x20(%rbp), $arg2 | ||
2801 | lea ($key), $arg3 | ||
2802 | call asm_AES_decrypt # doesn't touch %xmm | ||
2803 | pxor 0x20(%rbp), @XMM[0] # ^= tweak[] | ||
2804 | #pxor @XMM[8], @XMM[0] | ||
2805 | #lea 0x80(%rsp), %rax # pass key schedule | ||
2806 | #mov %edx, %r10d # pass rounds | ||
2807 | #call _bsaes_decrypt8 | ||
2808 | #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2809 | movdqu @XMM[0], 0x00($out) # write output | ||
2810 | lea 0x10($out), $out | ||
2811 | |||
2812 | movdqa 0x10(%rsp), @XMM[7] # next iteration tweak | ||
2813 | |||
2814 | .Lxts_dec_done: | ||
2815 | and \$15, %ebx | ||
2816 | jz .Lxts_dec_ret | ||
2817 | |||
2818 | pxor $twtmp, $twtmp | ||
2819 | movdqa .Lxts_magic(%rip), $twmask | ||
2820 | pcmpgtd @XMM[7], $twtmp | ||
2821 | pshufd \$0x13, $twtmp, $twres | ||
2822 | movdqa @XMM[7], @XMM[6] | ||
2823 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2824 | pand $twmask, $twres # isolate carry and residue | ||
2825 | movdqu ($inp), @XMM[0] | ||
2826 | pxor $twres, @XMM[7] | ||
2827 | |||
2828 | lea 0x20(%rbp), $arg1 | ||
2829 | pxor @XMM[7], @XMM[0] | ||
2830 | lea 0x20(%rbp), $arg2 | ||
2831 | movdqa @XMM[0], 0x20(%rbp) | ||
2832 | lea ($key), $arg3 | ||
2833 | call asm_AES_decrypt # doesn't touch %xmm | ||
2834 | pxor 0x20(%rbp), @XMM[7] | ||
2835 | mov $out, %rdx | ||
2836 | movdqu @XMM[7], ($out) | ||
2837 | |||
2838 | .Lxts_dec_steal: | ||
2839 | movzb 16($inp), %eax | ||
2840 | movzb (%rdx), %ecx | ||
2841 | lea 1($inp), $inp | ||
2842 | mov %al, (%rdx) | ||
2843 | mov %cl, 16(%rdx) | ||
2844 | lea 1(%rdx), %rdx | ||
2845 | sub \$1,%ebx | ||
2846 | jnz .Lxts_dec_steal | ||
2847 | |||
2848 | movdqu ($out), @XMM[0] | ||
2849 | lea 0x20(%rbp), $arg1 | ||
2850 | pxor @XMM[6], @XMM[0] | ||
2851 | lea 0x20(%rbp), $arg2 | ||
2852 | movdqa @XMM[0], 0x20(%rbp) | ||
2853 | lea ($key), $arg3 | ||
2854 | call asm_AES_decrypt # doesn't touch %xmm | ||
2855 | pxor 0x20(%rbp), @XMM[6] | ||
2856 | movdqu @XMM[6], ($out) | ||
2857 | |||
2858 | .Lxts_dec_ret: | ||
2859 | lea (%rsp), %rax | ||
2860 | pxor %xmm0, %xmm0 | ||
2861 | .Lxts_dec_bzero: # wipe key schedule [if any] | ||
2862 | movdqa %xmm0, 0x00(%rax) | ||
2863 | movdqa %xmm0, 0x10(%rax) | ||
2864 | lea 0x20(%rax), %rax | ||
2865 | cmp %rax, %rbp | ||
2866 | ja .Lxts_dec_bzero | ||
2867 | |||
2868 | lea (%rbp),%rsp # restore %rsp | ||
2869 | ___ | ||
2870 | $code.=<<___ if ($win64); | ||
2871 | movaps 0x40(%rbp), %xmm6 | ||
2872 | movaps 0x50(%rbp), %xmm7 | ||
2873 | movaps 0x60(%rbp), %xmm8 | ||
2874 | movaps 0x70(%rbp), %xmm9 | ||
2875 | movaps 0x80(%rbp), %xmm10 | ||
2876 | movaps 0x90(%rbp), %xmm11 | ||
2877 | movaps 0xa0(%rbp), %xmm12 | ||
2878 | movaps 0xb0(%rbp), %xmm13 | ||
2879 | movaps 0xc0(%rbp), %xmm14 | ||
2880 | movaps 0xd0(%rbp), %xmm15 | ||
2881 | lea 0xa0(%rbp), %rsp | ||
2882 | ___ | ||
2883 | $code.=<<___; | ||
2884 | mov 0x48(%rsp), %r15 | ||
2885 | mov 0x50(%rsp), %r14 | ||
2886 | mov 0x58(%rsp), %r13 | ||
2887 | mov 0x60(%rsp), %r12 | ||
2888 | mov 0x68(%rsp), %rbx | ||
2889 | mov 0x70(%rsp), %rax | ||
2890 | lea 0x78(%rsp), %rsp | ||
2891 | mov %rax, %rbp | ||
2892 | .Lxts_dec_epilogue: | ||
2893 | ret | ||
2894 | .size bsaes_xts_decrypt,.-bsaes_xts_decrypt | ||
2895 | ___ | ||
2896 | } | ||
2897 | $code.=<<___; | ||
2898 | .section .rodata | ||
2899 | .type _bsaes_const,\@object | ||
2900 | .align 64 | ||
2901 | _bsaes_const: | ||
2902 | .LM0ISR: # InvShiftRows constants | ||
2903 | .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 | ||
2904 | .LISRM0: | ||
2905 | .quad 0x01040b0e0205080f, 0x0306090c00070a0d | ||
2906 | .LISR: | ||
2907 | .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 | ||
2908 | .LBS0: # bit-slice constants | ||
2909 | .quad 0x5555555555555555, 0x5555555555555555 | ||
2910 | .LBS1: | ||
2911 | .quad 0x3333333333333333, 0x3333333333333333 | ||
2912 | .LBS2: | ||
2913 | .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f | ||
2914 | .LSR: # shiftrows constants | ||
2915 | .quad 0x0504070600030201, 0x0f0e0d0c0a09080b | ||
2916 | .LSRM0: | ||
2917 | .quad 0x0304090e00050a0f, 0x01060b0c0207080d | ||
2918 | .LM0SR: | ||
2919 | .quad 0x0a0e02060f03070b, 0x0004080c05090d01 | ||
2920 | .LSWPUP: # byte-swap upper dword | ||
2921 | .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 | ||
2922 | .LSWPUPM0SR: | ||
2923 | .quad 0x0a0d02060c03070b, 0x0004080f05090e01 | ||
2924 | .LADD1: # counter increment constants | ||
2925 | .quad 0x0000000000000000, 0x0000000100000000 | ||
2926 | .LADD2: | ||
2927 | .quad 0x0000000000000000, 0x0000000200000000 | ||
2928 | .LADD3: | ||
2929 | .quad 0x0000000000000000, 0x0000000300000000 | ||
2930 | .LADD4: | ||
2931 | .quad 0x0000000000000000, 0x0000000400000000 | ||
2932 | .LADD5: | ||
2933 | .quad 0x0000000000000000, 0x0000000500000000 | ||
2934 | .LADD6: | ||
2935 | .quad 0x0000000000000000, 0x0000000600000000 | ||
2936 | .LADD7: | ||
2937 | .quad 0x0000000000000000, 0x0000000700000000 | ||
2938 | .LADD8: | ||
2939 | .quad 0x0000000000000000, 0x0000000800000000 | ||
2940 | .Lxts_magic: | ||
2941 | .long 0x87,0,1,0 | ||
2942 | .Lmasks: | ||
2943 | .quad 0x0101010101010101, 0x0101010101010101 | ||
2944 | .quad 0x0202020202020202, 0x0202020202020202 | ||
2945 | .quad 0x0404040404040404, 0x0404040404040404 | ||
2946 | .quad 0x0808080808080808, 0x0808080808080808 | ||
2947 | .LM0: | ||
2948 | .quad 0x02060a0e03070b0f, 0x0004080c0105090d | ||
2949 | .L63: | ||
2950 | .quad 0x6363636363636363, 0x6363636363636363 | ||
2951 | .align 64 | ||
2952 | .size _bsaes_const,.-_bsaes_const | ||
2953 | .text | ||
2954 | ___ | ||
2955 | |||
2956 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
2957 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
2958 | if ($win64) { | ||
2959 | $rec="%rcx"; | ||
2960 | $frame="%rdx"; | ||
2961 | $context="%r8"; | ||
2962 | $disp="%r9"; | ||
2963 | |||
2964 | $code.=<<___; | ||
2965 | .extern __imp_RtlVirtualUnwind | ||
2966 | .type se_handler,\@abi-omnipotent | ||
2967 | .align 16 | ||
2968 | se_handler: | ||
2969 | _CET_ENDBR | ||
2970 | push %rsi | ||
2971 | push %rdi | ||
2972 | push %rbx | ||
2973 | push %rbp | ||
2974 | push %r12 | ||
2975 | push %r13 | ||
2976 | push %r14 | ||
2977 | push %r15 | ||
2978 | pushfq | ||
2979 | sub \$64,%rsp | ||
2980 | |||
2981 | mov 120($context),%rax # pull context->Rax | ||
2982 | mov 248($context),%rbx # pull context->Rip | ||
2983 | |||
2984 | mov 8($disp),%rsi # disp->ImageBase | ||
2985 | mov 56($disp),%r11 # disp->HandlerData | ||
2986 | |||
2987 | mov 0(%r11),%r10d # HandlerData[0] | ||
2988 | lea (%rsi,%r10),%r10 # prologue label | ||
2989 | cmp %r10,%rbx # context->Rip<prologue label | ||
2990 | jb .Lin_prologue | ||
2991 | |||
2992 | mov 152($context),%rax # pull context->Rsp | ||
2993 | |||
2994 | mov 4(%r11),%r10d # HandlerData[1] | ||
2995 | lea (%rsi,%r10),%r10 # epilogue label | ||
2996 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2997 | jae .Lin_prologue | ||
2998 | |||
2999 | mov 160($context),%rax # pull context->Rbp | ||
3000 | |||
3001 | lea 0x40(%rax),%rsi # %xmm save area | ||
3002 | lea 512($context),%rdi # &context.Xmm6 | ||
3003 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
3004 | .long 0xa548f3fc # cld; rep movsq | ||
3005 | lea 0xa0(%rax),%rax # adjust stack pointer | ||
3006 | |||
3007 | mov 0x70(%rax),%rbp | ||
3008 | mov 0x68(%rax),%rbx | ||
3009 | mov 0x60(%rax),%r12 | ||
3010 | mov 0x58(%rax),%r13 | ||
3011 | mov 0x50(%rax),%r14 | ||
3012 | mov 0x48(%rax),%r15 | ||
3013 | lea 0x78(%rax),%rax # adjust stack pointer | ||
3014 | mov %rbx,144($context) # restore context->Rbx | ||
3015 | mov %rbp,160($context) # restore context->Rbp | ||
3016 | mov %r12,216($context) # restore context->R12 | ||
3017 | mov %r13,224($context) # restore context->R13 | ||
3018 | mov %r14,232($context) # restore context->R14 | ||
3019 | mov %r15,240($context) # restore context->R15 | ||
3020 | |||
3021 | .Lin_prologue: | ||
3022 | mov %rax,152($context) # restore context->Rsp | ||
3023 | |||
3024 | mov 40($disp),%rdi # disp->ContextRecord | ||
3025 | mov $context,%rsi # context | ||
3026 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
3027 | .long 0xa548f3fc # cld; rep movsq | ||
3028 | |||
3029 | mov $disp,%rsi | ||
3030 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
3031 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
3032 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
3033 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
3034 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
3035 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
3036 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
3037 | mov %r10,32(%rsp) # arg5 | ||
3038 | mov %r11,40(%rsp) # arg6 | ||
3039 | mov %r12,48(%rsp) # arg7 | ||
3040 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
3041 | call *__imp_RtlVirtualUnwind(%rip) | ||
3042 | |||
3043 | mov \$1,%eax # ExceptionContinueSearch | ||
3044 | add \$64,%rsp | ||
3045 | popfq | ||
3046 | pop %r15 | ||
3047 | pop %r14 | ||
3048 | pop %r13 | ||
3049 | pop %r12 | ||
3050 | pop %rbp | ||
3051 | pop %rbx | ||
3052 | pop %rdi | ||
3053 | pop %rsi | ||
3054 | ret | ||
3055 | .size se_handler,.-se_handler | ||
3056 | |||
3057 | .section .pdata | ||
3058 | .align 4 | ||
3059 | ___ | ||
3060 | $code.=<<___ if ($ecb); | ||
3061 | .rva .Lecb_enc_prologue | ||
3062 | .rva .Lecb_enc_epilogue | ||
3063 | .rva .Lecb_enc_info | ||
3064 | |||
3065 | .rva .Lecb_dec_prologue | ||
3066 | .rva .Lecb_dec_epilogue | ||
3067 | .rva .Lecb_dec_info | ||
3068 | ___ | ||
3069 | $code.=<<___; | ||
3070 | .rva .Lcbc_dec_prologue | ||
3071 | .rva .Lcbc_dec_epilogue | ||
3072 | .rva .Lcbc_dec_info | ||
3073 | |||
3074 | .rva .Lctr_enc_prologue | ||
3075 | .rva .Lctr_enc_epilogue | ||
3076 | .rva .Lctr_enc_info | ||
3077 | |||
3078 | .rva .Lxts_enc_prologue | ||
3079 | .rva .Lxts_enc_epilogue | ||
3080 | .rva .Lxts_enc_info | ||
3081 | |||
3082 | .rva .Lxts_dec_prologue | ||
3083 | .rva .Lxts_dec_epilogue | ||
3084 | .rva .Lxts_dec_info | ||
3085 | |||
3086 | .section .xdata | ||
3087 | .align 8 | ||
3088 | ___ | ||
3089 | $code.=<<___ if ($ecb); | ||
3090 | .Lecb_enc_info: | ||
3091 | .byte 9,0,0,0 | ||
3092 | .rva se_handler | ||
3093 | .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] | ||
3094 | .Lecb_dec_info: | ||
3095 | .byte 9,0,0,0 | ||
3096 | .rva se_handler | ||
3097 | .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] | ||
3098 | ___ | ||
3099 | $code.=<<___; | ||
3100 | .Lcbc_dec_info: | ||
3101 | .byte 9,0,0,0 | ||
3102 | .rva se_handler | ||
3103 | .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] | ||
3104 | .Lctr_enc_info: | ||
3105 | .byte 9,0,0,0 | ||
3106 | .rva se_handler | ||
3107 | .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] | ||
3108 | .Lxts_enc_info: | ||
3109 | .byte 9,0,0,0 | ||
3110 | .rva se_handler | ||
3111 | .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] | ||
3112 | .Lxts_dec_info: | ||
3113 | .byte 9,0,0,0 | ||
3114 | .rva se_handler | ||
3115 | .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] | ||
3116 | ___ | ||
3117 | } | ||
3118 | |||
3119 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
3120 | |||
3121 | print $code; | ||
3122 | |||
3123 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl deleted file mode 100644 index 6e7bd36d05..0000000000 --- a/src/lib/libcrypto/aes/asm/vpaes-x86.pl +++ /dev/null | |||
@@ -1,911 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | ###################################################################### | ||
4 | ## Constant-time SSSE3 AES core implementation. | ||
5 | ## version 0.1 | ||
6 | ## | ||
7 | ## By Mike Hamburg (Stanford University), 2009 | ||
8 | ## Public domain. | ||
9 | ## | ||
10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
11 | ## http://crypto.stanford.edu/vpaes/. | ||
12 | |||
13 | ###################################################################### | ||
14 | # September 2011. | ||
15 | # | ||
16 | # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for | ||
17 | # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
18 | # doesn't handle partial vectors (doesn't have to if called from | ||
19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
20 | # schedule structure with the original nor does it make assumption | ||
21 | # about its alignment... | ||
22 | # | ||
23 | # Performance summary. aes-586.pl column lists large-block CBC | ||
24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
25 | # byte processed with 128-bit key, and vpaes-x86.pl column - [also | ||
26 | # large-block CBC] encrypt/decrypt. | ||
27 | # | ||
28 | # aes-586.pl vpaes-x86.pl | ||
29 | # | ||
30 | # Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) | ||
31 | # Nehalem 27.9/40.4/18.1 10.3/12.0 | ||
32 | # Atom 102./119./60.1 64.5/85.3(***) | ||
33 | # | ||
34 | # (*) "Hyper-threading" in the context refers rather to cache shared | ||
35 | # among multiple cores, than to specifically Intel HTT. As vast | ||
36 | # majority of contemporary cores share cache, slower code path | ||
37 | # is common place. In other words "with-hyper-threading-off" | ||
38 | # results are presented mostly for reference purposes. | ||
39 | # | ||
40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
41 | # | ||
42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
43 | # pshufb, yet it's respectable +32%/65% improvement on Core 2 | ||
44 | # and +58%/40% on Atom (as implied, over "hyper-threading-safe" | ||
45 | # code path). | ||
46 | # | ||
47 | # <appro@openssl.org> | ||
48 | |||
49 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
50 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
51 | require "x86asm.pl"; | ||
52 | |||
53 | &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); | ||
54 | |||
55 | $PREFIX="vpaes"; | ||
56 | |||
57 | my ($round, $base, $magic, $key, $const, $inp, $out)= | ||
58 | ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); | ||
59 | |||
60 | &rodataseg(); | ||
61 | &static_label("_vpaes_consts"); | ||
62 | &static_label("_vpaes_schedule_low_round"); | ||
63 | |||
64 | &set_label("_vpaes_consts",64); | ||
65 | $k_inv=-0x30; # inv, inva | ||
66 | &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); | ||
67 | &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); | ||
68 | |||
69 | $k_s0F=-0x10; # s0F | ||
70 | &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); | ||
71 | |||
72 | $k_ipt=0x00; # input transform (lo, hi) | ||
73 | &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); | ||
74 | &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); | ||
75 | |||
76 | $k_sb1=0x20; # sb1u, sb1t | ||
77 | &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); | ||
78 | &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); | ||
79 | $k_sb2=0x40; # sb2u, sb2t | ||
80 | &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); | ||
81 | &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); | ||
82 | $k_sbo=0x60; # sbou, sbot | ||
83 | &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); | ||
84 | &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); | ||
85 | |||
86 | $k_mc_forward=0x80; # mc_forward | ||
87 | &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); | ||
88 | &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); | ||
89 | &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); | ||
90 | &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); | ||
91 | |||
92 | $k_mc_backward=0xc0; # mc_backward | ||
93 | &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); | ||
94 | &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); | ||
95 | &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); | ||
96 | &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); | ||
97 | |||
98 | $k_sr=0x100; # sr | ||
99 | &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); | ||
100 | &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); | ||
101 | &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); | ||
102 | &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); | ||
103 | |||
104 | $k_rcon=0x140; # rcon | ||
105 | &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); | ||
106 | |||
107 | $k_s63=0x150; # s63: all equal to 0x63 transformed | ||
108 | &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); | ||
109 | |||
110 | $k_opt=0x160; # output transform | ||
111 | &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); | ||
112 | &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); | ||
113 | |||
114 | $k_deskew=0x180; # deskew tables: inverts the sbox's "skew" | ||
115 | &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); | ||
116 | &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); | ||
117 | ## | ||
118 | ## Decryption stuff | ||
119 | ## Key schedule constants | ||
120 | ## | ||
121 | $k_dksd=0x1a0; # decryption key schedule: invskew x*D | ||
122 | &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4); | ||
123 | &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA); | ||
124 | $k_dksb=0x1c0; # decryption key schedule: invskew x*B | ||
125 | &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386); | ||
126 | &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F); | ||
127 | $k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63 | ||
128 | &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C); | ||
129 | &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A); | ||
130 | $k_dks9=0x200; # decryption key schedule: invskew x*9 | ||
131 | &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334); | ||
132 | &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC); | ||
133 | |||
134 | ## | ||
135 | ## Decryption stuff | ||
136 | ## Round function constants | ||
137 | ## | ||
138 | $k_dipt=0x220; # decryption input transform | ||
139 | &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E); | ||
140 | &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772); | ||
141 | |||
142 | $k_dsb9=0x240; # decryption sbox output *9*u, *9*t | ||
143 | &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50); | ||
144 | &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E); | ||
145 | $k_dsbd=0x260; # decryption sbox output *D*u, *D*t | ||
146 | &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13); | ||
147 | &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D); | ||
148 | $k_dsbb=0x280; # decryption sbox output *B*u, *B*t | ||
149 | &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6); | ||
150 | &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E); | ||
151 | $k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t | ||
152 | &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004); | ||
153 | &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B); | ||
154 | $k_dsbo=0x2c0; # decryption sbox final output | ||
155 | &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9); | ||
156 | &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159); | ||
157 | &previous(); | ||
158 | |||
159 | &function_begin_B("_vpaes_preheat"); | ||
160 | &movdqa ("xmm7",&QWP($k_inv,$const)); | ||
161 | &movdqa ("xmm6",&QWP($k_s0F,$const)); | ||
162 | &ret (); | ||
163 | &function_end_B("_vpaes_preheat"); | ||
164 | |||
165 | ## | ||
166 | ## _aes_encrypt_core | ||
167 | ## | ||
168 | ## AES-encrypt %xmm0. | ||
169 | ## | ||
170 | ## Inputs: | ||
171 | ## %xmm0 = input | ||
172 | ## %xmm6-%xmm7 as in _vpaes_preheat | ||
173 | ## (%edx) = scheduled keys | ||
174 | ## | ||
175 | ## Output in %xmm0 | ||
176 | ## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx | ||
177 | ## | ||
178 | ## | ||
179 | &function_begin_B("_vpaes_encrypt_core"); | ||
180 | &mov ($magic,16); | ||
181 | &mov ($round,&DWP(240,$key)); | ||
182 | &movdqa ("xmm1","xmm6") | ||
183 | &movdqa ("xmm2",&QWP($k_ipt,$const)); | ||
184 | &pandn ("xmm1","xmm0"); | ||
185 | &movdqu ("xmm5",&QWP(0,$key)); | ||
186 | &psrld ("xmm1",4); | ||
187 | &pand ("xmm0","xmm6"); | ||
188 | &pshufb ("xmm2","xmm0"); | ||
189 | &movdqa ("xmm0",&QWP($k_ipt+16,$const)); | ||
190 | &pshufb ("xmm0","xmm1"); | ||
191 | &pxor ("xmm2","xmm5"); | ||
192 | &pxor ("xmm0","xmm2"); | ||
193 | &add ($key,16); | ||
194 | &lea ($base,&DWP($k_mc_backward,$const)); | ||
195 | &jmp (&label("enc_entry")); | ||
196 | |||
197 | |||
198 | &set_label("enc_loop",16); | ||
199 | # middle of middle round | ||
200 | &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u | ||
201 | &pshufb ("xmm4","xmm2"); # 4 = sb1u | ||
202 | &pxor ("xmm4","xmm5"); # 4 = sb1u + k | ||
203 | &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t | ||
204 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
205 | &pxor ("xmm0","xmm4"); # 0 = A | ||
206 | &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u | ||
207 | &pshufb ("xmm5","xmm2"); # 4 = sb2u | ||
208 | &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] | ||
209 | &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t | ||
210 | &pshufb ("xmm2","xmm3"); # 2 = sb2t | ||
211 | &pxor ("xmm2","xmm5"); # 2 = 2A | ||
212 | &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] | ||
213 | &movdqa ("xmm3","xmm0"); # 3 = A | ||
214 | &pshufb ("xmm0","xmm1"); # 0 = B | ||
215 | &add ($key,16); # next key | ||
216 | &pxor ("xmm0","xmm2"); # 0 = 2A+B | ||
217 | &pshufb ("xmm3","xmm4"); # 3 = D | ||
218 | &add ($magic,16); # next mc | ||
219 | &pxor ("xmm3","xmm0"); # 3 = 2A+B+D | ||
220 | &pshufb ("xmm0","xmm1"); # 0 = 2B+C | ||
221 | &and ($magic,0x30); # ... mod 4 | ||
222 | &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D | ||
223 | &sub ($round,1); # nr-- | ||
224 | |||
225 | &set_label("enc_entry"); | ||
226 | # top of round | ||
227 | &movdqa ("xmm1","xmm6"); # 1 : i | ||
228 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
229 | &psrld ("xmm1",4); # 1 = i | ||
230 | &pand ("xmm0","xmm6"); # 0 = k | ||
231 | &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k | ||
232 | &pshufb ("xmm5","xmm0"); # 2 = a/k | ||
233 | &pxor ("xmm0","xmm1"); # 0 = j | ||
234 | &movdqa ("xmm3","xmm7"); # 3 : 1/i | ||
235 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
236 | &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k | ||
237 | &movdqa ("xmm4","xmm7"); # 4 : 1/j | ||
238 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
239 | &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k | ||
240 | &movdqa ("xmm2","xmm7"); # 2 : 1/iak | ||
241 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
242 | &pxor ("xmm2","xmm0"); # 2 = io | ||
243 | &movdqa ("xmm3","xmm7"); # 3 : 1/jak | ||
244 | &movdqu ("xmm5",&QWP(0,$key)); | ||
245 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
246 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
247 | &jnz (&label("enc_loop")); | ||
248 | |||
249 | # middle of last round | ||
250 | &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo | ||
251 | &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16 | ||
252 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
253 | &pxor ("xmm4","xmm5"); # 4 = sb1u + k | ||
254 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
255 | &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] | ||
256 | &pxor ("xmm0","xmm4"); # 0 = A | ||
257 | &pshufb ("xmm0","xmm1"); | ||
258 | &ret (); | ||
259 | &function_end_B("_vpaes_encrypt_core"); | ||
260 | |||
261 | ## | ||
262 | ## Decryption core | ||
263 | ## | ||
264 | ## Same API as encryption core. | ||
265 | ## | ||
266 | &function_begin_B("_vpaes_decrypt_core"); | ||
267 | &mov ($round,&DWP(240,$key)); | ||
268 | &lea ($base,&DWP($k_dsbd,$const)); | ||
269 | &movdqa ("xmm1","xmm6"); | ||
270 | &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); | ||
271 | &pandn ("xmm1","xmm0"); | ||
272 | &mov ($magic,$round); | ||
273 | &psrld ("xmm1",4) | ||
274 | &movdqu ("xmm5",&QWP(0,$key)); | ||
275 | &shl ($magic,4); | ||
276 | &pand ("xmm0","xmm6"); | ||
277 | &pshufb ("xmm2","xmm0"); | ||
278 | &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base)); | ||
279 | &xor ($magic,0x30); | ||
280 | &pshufb ("xmm0","xmm1"); | ||
281 | &and ($magic,0x30); | ||
282 | &pxor ("xmm2","xmm5"); | ||
283 | &movdqa ("xmm5",&QWP($k_mc_forward+48,$const)); | ||
284 | &pxor ("xmm0","xmm2"); | ||
285 | &add ($key,16); | ||
286 | &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic)); | ||
287 | &jmp (&label("dec_entry")); | ||
288 | |||
289 | &set_label("dec_loop",16); | ||
290 | ## | ||
291 | ## Inverse mix columns | ||
292 | ## | ||
293 | &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u | ||
294 | &pshufb ("xmm4","xmm2"); # 4 = sb9u | ||
295 | &pxor ("xmm4","xmm0"); | ||
296 | &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t | ||
297 | &pshufb ("xmm0","xmm3"); # 0 = sb9t | ||
298 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
299 | &add ($key,16); # next round key | ||
300 | |||
301 | &pshufb ("xmm0","xmm5"); # MC ch | ||
302 | &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu | ||
303 | &pshufb ("xmm4","xmm2"); # 4 = sbdu | ||
304 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
305 | &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt | ||
306 | &pshufb ("xmm0","xmm3"); # 0 = sbdt | ||
307 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
308 | &sub ($round,1); # nr-- | ||
309 | |||
310 | &pshufb ("xmm0","xmm5"); # MC ch | ||
311 | &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu | ||
312 | &pshufb ("xmm4","xmm2"); # 4 = sbbu | ||
313 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
314 | &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt | ||
315 | &pshufb ("xmm0","xmm3"); # 0 = sbbt | ||
316 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
317 | |||
318 | &pshufb ("xmm0","xmm5"); # MC ch | ||
319 | &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu | ||
320 | &pshufb ("xmm4","xmm2"); # 4 = sbeu | ||
321 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
322 | &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet | ||
323 | &pshufb ("xmm0","xmm3"); # 0 = sbet | ||
324 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
325 | |||
326 | &palignr("xmm5","xmm5",12); | ||
327 | |||
328 | &set_label("dec_entry"); | ||
329 | # top of round | ||
330 | &movdqa ("xmm1","xmm6"); # 1 : i | ||
331 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
332 | &psrld ("xmm1",4); # 1 = i | ||
333 | &pand ("xmm0","xmm6"); # 0 = k | ||
334 | &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k | ||
335 | &pshufb ("xmm2","xmm0"); # 2 = a/k | ||
336 | &pxor ("xmm0","xmm1"); # 0 = j | ||
337 | &movdqa ("xmm3","xmm7"); # 3 : 1/i | ||
338 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
339 | &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k | ||
340 | &movdqa ("xmm4","xmm7"); # 4 : 1/j | ||
341 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
342 | &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k | ||
343 | &movdqa ("xmm2","xmm7"); # 2 : 1/iak | ||
344 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
345 | &pxor ("xmm2","xmm0"); # 2 = io | ||
346 | &movdqa ("xmm3","xmm7"); # 3 : 1/jak | ||
347 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
348 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
349 | &movdqu ("xmm0",&QWP(0,$key)); | ||
350 | &jnz (&label("dec_loop")); | ||
351 | |||
352 | # middle of last round | ||
353 | &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou | ||
354 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
355 | &pxor ("xmm4","xmm0"); # 4 = sb1u + k | ||
356 | &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot | ||
357 | &movdqa ("xmm2",&QWP(0,$magic)); | ||
358 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
359 | &pxor ("xmm0","xmm4"); # 0 = A | ||
360 | &pshufb ("xmm0","xmm2"); | ||
361 | &ret (); | ||
362 | &function_end_B("_vpaes_decrypt_core"); | ||
363 | |||
364 | ######################################################## | ||
365 | ## ## | ||
366 | ## AES key schedule ## | ||
367 | ## ## | ||
368 | ######################################################## | ||
369 | &function_begin_B("_vpaes_schedule_core"); | ||
370 | &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned) | ||
371 | &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon | ||
372 | |||
373 | # input transform | ||
374 | &movdqa ("xmm3","xmm0"); | ||
375 | &lea ($base,&DWP($k_ipt,$const)); | ||
376 | &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8 | ||
377 | &call ("_vpaes_schedule_transform"); | ||
378 | &movdqa ("xmm7","xmm0"); | ||
379 | |||
380 | &test ($out,$out); | ||
381 | &jnz (&label("schedule_am_decrypting")); | ||
382 | |||
383 | # encrypting, output zeroth round key after transform | ||
384 | &movdqu (&QWP(0,$key),"xmm0"); | ||
385 | &jmp (&label("schedule_go")); | ||
386 | |||
387 | &set_label("schedule_am_decrypting"); | ||
388 | # decrypting, output zeroth round key after shiftrows | ||
389 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
390 | &pshufb ("xmm3","xmm1"); | ||
391 | &movdqu (&QWP(0,$key),"xmm3"); | ||
392 | &xor ($magic,0x30); | ||
393 | |||
394 | &set_label("schedule_go"); | ||
395 | &cmp ($round,192); | ||
396 | &ja (&label("schedule_256")); | ||
397 | &je (&label("schedule_192")); | ||
398 | # 128: fall though | ||
399 | |||
400 | ## | ||
401 | ## .schedule_128 | ||
402 | ## | ||
403 | ## 128-bit specific part of key schedule. | ||
404 | ## | ||
405 | ## This schedule is really simple, because all its parts | ||
406 | ## are accomplished by the subroutines. | ||
407 | ## | ||
408 | &set_label("schedule_128"); | ||
409 | &mov ($round,10); | ||
410 | |||
411 | &set_label("loop_schedule_128"); | ||
412 | &call ("_vpaes_schedule_round"); | ||
413 | &dec ($round); | ||
414 | &jz (&label("schedule_mangle_last")); | ||
415 | &call ("_vpaes_schedule_mangle"); # write output | ||
416 | &jmp (&label("loop_schedule_128")); | ||
417 | |||
418 | ## | ||
419 | ## .aes_schedule_192 | ||
420 | ## | ||
421 | ## 192-bit specific part of key schedule. | ||
422 | ## | ||
423 | ## The main body of this schedule is the same as the 128-bit | ||
424 | ## schedule, but with more smearing. The long, high side is | ||
425 | ## stored in %xmm7 as before, and the short, low side is in | ||
426 | ## the high bits of %xmm6. | ||
427 | ## | ||
428 | ## This schedule is somewhat nastier, however, because each | ||
429 | ## round produces 192 bits of key material, or 1.5 round keys. | ||
430 | ## Therefore, on each cycle we do 2 rounds and produce 3 round | ||
431 | ## keys. | ||
432 | ## | ||
433 | &set_label("schedule_192",16); | ||
434 | &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned) | ||
435 | &call ("_vpaes_schedule_transform"); # input transform | ||
436 | &movdqa ("xmm6","xmm0"); # save short part | ||
437 | &pxor ("xmm4","xmm4"); # clear 4 | ||
438 | &movhlps("xmm6","xmm4"); # clobber low side with zeros | ||
439 | &mov ($round,4); | ||
440 | |||
441 | &set_label("loop_schedule_192"); | ||
442 | &call ("_vpaes_schedule_round"); | ||
443 | &palignr("xmm0","xmm6",8); | ||
444 | &call ("_vpaes_schedule_mangle"); # save key n | ||
445 | &call ("_vpaes_schedule_192_smear"); | ||
446 | &call ("_vpaes_schedule_mangle"); # save key n+1 | ||
447 | &call ("_vpaes_schedule_round"); | ||
448 | &dec ($round); | ||
449 | &jz (&label("schedule_mangle_last")); | ||
450 | &call ("_vpaes_schedule_mangle"); # save key n+2 | ||
451 | &call ("_vpaes_schedule_192_smear"); | ||
452 | &jmp (&label("loop_schedule_192")); | ||
453 | |||
454 | ## | ||
455 | ## .aes_schedule_256 | ||
456 | ## | ||
457 | ## 256-bit specific part of key schedule. | ||
458 | ## | ||
459 | ## The structure here is very similar to the 128-bit | ||
460 | ## schedule, but with an additional "low side" in | ||
461 | ## %xmm6. The low side's rounds are the same as the | ||
462 | ## high side's, except no rcon and no rotation. | ||
463 | ## | ||
464 | &set_label("schedule_256",16); | ||
465 | &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) | ||
466 | &call ("_vpaes_schedule_transform"); # input transform | ||
467 | &mov ($round,7); | ||
468 | |||
469 | &set_label("loop_schedule_256"); | ||
470 | &call ("_vpaes_schedule_mangle"); # output low result | ||
471 | &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 | ||
472 | |||
473 | # high round | ||
474 | &call ("_vpaes_schedule_round"); | ||
475 | &dec ($round); | ||
476 | &jz (&label("schedule_mangle_last")); | ||
477 | &call ("_vpaes_schedule_mangle"); | ||
478 | |||
479 | # low round. swap xmm7 and xmm6 | ||
480 | &pshufd ("xmm0","xmm0",0xFF); | ||
481 | &movdqa (&QWP(20,"esp"),"xmm7"); | ||
482 | &movdqa ("xmm7","xmm6"); | ||
483 | &call ("_vpaes_schedule_low_round"); | ||
484 | &movdqa ("xmm7",&QWP(20,"esp")); | ||
485 | |||
486 | &jmp (&label("loop_schedule_256")); | ||
487 | |||
488 | ## | ||
489 | ## .aes_schedule_mangle_last | ||
490 | ## | ||
491 | ## Mangler for last round of key schedule | ||
492 | ## Mangles %xmm0 | ||
493 | ## when encrypting, outputs out(%xmm0) ^ 63 | ||
494 | ## when decrypting, outputs unskew(%xmm0) | ||
495 | ## | ||
496 | ## Always called right before return... jumps to cleanup and exits | ||
497 | ## | ||
498 | &set_label("schedule_mangle_last",16); | ||
499 | # schedule last round key from xmm0 | ||
500 | &lea ($base,&DWP($k_deskew,$const)); | ||
501 | &test ($out,$out); | ||
502 | &jnz (&label("schedule_mangle_last_dec")); | ||
503 | |||
504 | # encrypting | ||
505 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
506 | &pshufb ("xmm0","xmm1"); # output permute | ||
507 | &lea ($base,&DWP($k_opt,$const)); # prepare to output transform | ||
508 | &add ($key,32); | ||
509 | |||
510 | &set_label("schedule_mangle_last_dec"); | ||
511 | &add ($key,-16); | ||
512 | &pxor ("xmm0",&QWP($k_s63,$const)); | ||
513 | &call ("_vpaes_schedule_transform"); # output transform | ||
514 | &movdqu (&QWP(0,$key),"xmm0"); # save last key | ||
515 | |||
516 | # cleanup | ||
517 | &pxor ("xmm0","xmm0"); | ||
518 | &pxor ("xmm1","xmm1"); | ||
519 | &pxor ("xmm2","xmm2"); | ||
520 | &pxor ("xmm3","xmm3"); | ||
521 | &pxor ("xmm4","xmm4"); | ||
522 | &pxor ("xmm5","xmm5"); | ||
523 | &pxor ("xmm6","xmm6"); | ||
524 | &pxor ("xmm7","xmm7"); | ||
525 | &ret (); | ||
526 | &function_end_B("_vpaes_schedule_core"); | ||
527 | |||
528 | ## | ||
529 | ## .aes_schedule_192_smear | ||
530 | ## | ||
531 | ## Smear the short, low side in the 192-bit key schedule. | ||
532 | ## | ||
533 | ## Inputs: | ||
534 | ## %xmm7: high side, b a x y | ||
535 | ## %xmm6: low side, d c 0 0 | ||
536 | ## %xmm13: 0 | ||
537 | ## | ||
538 | ## Outputs: | ||
539 | ## %xmm6: b+c+d b+c 0 0 | ||
540 | ## %xmm0: b+c+d b+c b a | ||
541 | ## | ||
542 | &function_begin_B("_vpaes_schedule_192_smear"); | ||
543 | &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0 | ||
544 | &pxor ("xmm6","xmm0"); # -> c+d c 0 0 | ||
545 | &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a | ||
546 | &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a | ||
547 | &movdqa ("xmm0","xmm6"); | ||
548 | &pxor ("xmm1","xmm1"); | ||
549 | &movhlps("xmm6","xmm1"); # clobber low side with zeros | ||
550 | &ret (); | ||
551 | &function_end_B("_vpaes_schedule_192_smear"); | ||
552 | |||
553 | ## | ||
554 | ## .aes_schedule_round | ||
555 | ## | ||
556 | ## Runs one main round of the key schedule on %xmm0, %xmm7 | ||
557 | ## | ||
558 | ## Specifically, runs subbytes on the high dword of %xmm0 | ||
559 | ## then rotates it by one byte and xors into the low dword of | ||
560 | ## %xmm7. | ||
561 | ## | ||
562 | ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for | ||
563 | ## next rcon. | ||
564 | ## | ||
565 | ## Smears the dwords of %xmm7 by xoring the low into the | ||
566 | ## second low, result into third, result into highest. | ||
567 | ## | ||
568 | ## Returns results in %xmm7 = %xmm0. | ||
569 | ## Clobbers %xmm1-%xmm5. | ||
570 | ## | ||
571 | &function_begin_B("_vpaes_schedule_round"); | ||
572 | # extract rcon from xmm8 | ||
573 | &movdqa ("xmm2",&QWP(8,"esp")); # xmm8 | ||
574 | &pxor ("xmm1","xmm1"); | ||
575 | &palignr("xmm1","xmm2",15); | ||
576 | &palignr("xmm2","xmm2",15); | ||
577 | &pxor ("xmm7","xmm1"); | ||
578 | |||
579 | # rotate | ||
580 | &pshufd ("xmm0","xmm0",0xFF); | ||
581 | &palignr("xmm0","xmm0",1); | ||
582 | |||
583 | # fall through... | ||
584 | &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8 | ||
585 | |||
586 | # low round: same as high round, but no rotation and no rcon. | ||
587 | &set_label("_vpaes_schedule_low_round"); | ||
588 | # smear xmm7 | ||
589 | &movdqa ("xmm1","xmm7"); | ||
590 | &pslldq ("xmm7",4); | ||
591 | &pxor ("xmm7","xmm1"); | ||
592 | &movdqa ("xmm1","xmm7"); | ||
593 | &pslldq ("xmm7",8); | ||
594 | &pxor ("xmm7","xmm1"); | ||
595 | &pxor ("xmm7",&QWP($k_s63,$const)); | ||
596 | |||
597 | # subbyte | ||
598 | &movdqa ("xmm4",&QWP($k_s0F,$const)); | ||
599 | &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j | ||
600 | &movdqa ("xmm1","xmm4"); | ||
601 | &pandn ("xmm1","xmm0"); | ||
602 | &psrld ("xmm1",4); # 1 = i | ||
603 | &pand ("xmm0","xmm4"); # 0 = k | ||
604 | &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k | ||
605 | &pshufb ("xmm2","xmm0"); # 2 = a/k | ||
606 | &pxor ("xmm0","xmm1"); # 0 = j | ||
607 | &movdqa ("xmm3","xmm5"); # 3 : 1/i | ||
608 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
609 | &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k | ||
610 | &movdqa ("xmm4","xmm5"); # 4 : 1/j | ||
611 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
612 | &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k | ||
613 | &movdqa ("xmm2","xmm5"); # 2 : 1/iak | ||
614 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
615 | &pxor ("xmm2","xmm0"); # 2 = io | ||
616 | &movdqa ("xmm3","xmm5"); # 3 : 1/jak | ||
617 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
618 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
619 | &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou | ||
620 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
621 | &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot | ||
622 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
623 | &pxor ("xmm0","xmm4"); # 0 = sbox output | ||
624 | |||
625 | # add in smeared stuff | ||
626 | &pxor ("xmm0","xmm7"); | ||
627 | &movdqa ("xmm7","xmm0"); | ||
628 | &ret (); | ||
629 | &function_end_B("_vpaes_schedule_round"); | ||
630 | |||
631 | ## | ||
632 | ## .aes_schedule_transform | ||
633 | ## | ||
634 | ## Linear-transform %xmm0 according to tables at (%ebx) | ||
635 | ## | ||
636 | ## Output in %xmm0 | ||
637 | ## Clobbers %xmm1, %xmm2 | ||
638 | ## | ||
639 | &function_begin_B("_vpaes_schedule_transform"); | ||
640 | &movdqa ("xmm2",&QWP($k_s0F,$const)); | ||
641 | &movdqa ("xmm1","xmm2"); | ||
642 | &pandn ("xmm1","xmm0"); | ||
643 | &psrld ("xmm1",4); | ||
644 | &pand ("xmm0","xmm2"); | ||
645 | &movdqa ("xmm2",&QWP(0,$base)); | ||
646 | &pshufb ("xmm2","xmm0"); | ||
647 | &movdqa ("xmm0",&QWP(16,$base)); | ||
648 | &pshufb ("xmm0","xmm1"); | ||
649 | &pxor ("xmm0","xmm2"); | ||
650 | &ret (); | ||
651 | &function_end_B("_vpaes_schedule_transform"); | ||
652 | |||
653 | ## | ||
654 | ## .aes_schedule_mangle | ||
655 | ## | ||
656 | ## Mangle xmm0 from (basis-transformed) standard version | ||
657 | ## to our version. | ||
658 | ## | ||
659 | ## On encrypt, | ||
660 | ## xor with 0x63 | ||
661 | ## multiply by circulant 0,1,1,1 | ||
662 | ## apply shiftrows transform | ||
663 | ## | ||
664 | ## On decrypt, | ||
665 | ## xor with 0x63 | ||
666 | ## multiply by "inverse mixcolumns" circulant E,B,D,9 | ||
667 | ## deskew | ||
668 | ## apply shiftrows transform | ||
669 | ## | ||
670 | ## | ||
671 | ## Writes out to (%edx), and increments or decrements it | ||
672 | ## Keeps track of round number mod 4 in %ecx | ||
673 | ## Preserves xmm0 | ||
674 | ## Clobbers xmm1-xmm5 | ||
675 | ## | ||
676 | &function_begin_B("_vpaes_schedule_mangle"); | ||
677 | &movdqa ("xmm4","xmm0"); # save xmm0 for later | ||
678 | &movdqa ("xmm5",&QWP($k_mc_forward,$const)); | ||
679 | &test ($out,$out); | ||
680 | &jnz (&label("schedule_mangle_dec")); | ||
681 | |||
682 | # encrypting | ||
683 | &add ($key,16); | ||
684 | &pxor ("xmm4",&QWP($k_s63,$const)); | ||
685 | &pshufb ("xmm4","xmm5"); | ||
686 | &movdqa ("xmm3","xmm4"); | ||
687 | &pshufb ("xmm4","xmm5"); | ||
688 | &pxor ("xmm3","xmm4"); | ||
689 | &pshufb ("xmm4","xmm5"); | ||
690 | &pxor ("xmm3","xmm4"); | ||
691 | |||
692 | &jmp (&label("schedule_mangle_both")); | ||
693 | |||
694 | &set_label("schedule_mangle_dec",16); | ||
695 | # inverse mix columns | ||
696 | &movdqa ("xmm2",&QWP($k_s0F,$const)); | ||
697 | &lea ($inp,&DWP($k_dksd,$const)); | ||
698 | &movdqa ("xmm1","xmm2"); | ||
699 | &pandn ("xmm1","xmm4"); | ||
700 | &psrld ("xmm1",4); # 1 = hi | ||
701 | &pand ("xmm4","xmm2"); # 4 = lo | ||
702 | |||
703 | &movdqa ("xmm2",&QWP(0,$inp)); | ||
704 | &pshufb ("xmm2","xmm4"); | ||
705 | &movdqa ("xmm3",&QWP(0x10,$inp)); | ||
706 | &pshufb ("xmm3","xmm1"); | ||
707 | &pxor ("xmm3","xmm2"); | ||
708 | &pshufb ("xmm3","xmm5"); | ||
709 | |||
710 | &movdqa ("xmm2",&QWP(0x20,$inp)); | ||
711 | &pshufb ("xmm2","xmm4"); | ||
712 | &pxor ("xmm2","xmm3"); | ||
713 | &movdqa ("xmm3",&QWP(0x30,$inp)); | ||
714 | &pshufb ("xmm3","xmm1"); | ||
715 | &pxor ("xmm3","xmm2"); | ||
716 | &pshufb ("xmm3","xmm5"); | ||
717 | |||
718 | &movdqa ("xmm2",&QWP(0x40,$inp)); | ||
719 | &pshufb ("xmm2","xmm4"); | ||
720 | &pxor ("xmm2","xmm3"); | ||
721 | &movdqa ("xmm3",&QWP(0x50,$inp)); | ||
722 | &pshufb ("xmm3","xmm1"); | ||
723 | &pxor ("xmm3","xmm2"); | ||
724 | &pshufb ("xmm3","xmm5"); | ||
725 | |||
726 | &movdqa ("xmm2",&QWP(0x60,$inp)); | ||
727 | &pshufb ("xmm2","xmm4"); | ||
728 | &pxor ("xmm2","xmm3"); | ||
729 | &movdqa ("xmm3",&QWP(0x70,$inp)); | ||
730 | &pshufb ("xmm3","xmm1"); | ||
731 | &pxor ("xmm3","xmm2"); | ||
732 | |||
733 | &add ($key,-16); | ||
734 | |||
735 | &set_label("schedule_mangle_both"); | ||
736 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
737 | &pshufb ("xmm3","xmm1"); | ||
738 | &add ($magic,-16); | ||
739 | &and ($magic,0x30); | ||
740 | &movdqu (&QWP(0,$key),"xmm3"); | ||
741 | &ret (); | ||
742 | &function_end_B("_vpaes_schedule_mangle"); | ||
743 | |||
744 | # | ||
745 | # Interface to OpenSSL | ||
746 | # | ||
747 | &function_begin("${PREFIX}_set_encrypt_key"); | ||
748 | &mov ($inp,&wparam(0)); # inp | ||
749 | &lea ($base,&DWP(-56,"esp")); | ||
750 | &mov ($round,&wparam(1)); # bits | ||
751 | &and ($base,-16); | ||
752 | &mov ($key,&wparam(2)); # key | ||
753 | &xchg ($base,"esp"); # alloca | ||
754 | &mov (&DWP(48,"esp"),$base); | ||
755 | |||
756 | &mov ($base,$round); | ||
757 | &shr ($base,5); | ||
758 | &add ($base,5); | ||
759 | &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; | ||
760 | &mov ($magic,0x30); | ||
761 | &mov ($out,0); | ||
762 | |||
763 | &picsetup($const); | ||
764 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
765 | &lea ($const,&DWP(0x30,$const)) | ||
766 | |||
767 | &call ("_vpaes_schedule_core"); | ||
768 | |||
769 | &mov ("esp",&DWP(48,"esp")); | ||
770 | &xor ("eax","eax"); | ||
771 | &function_end("${PREFIX}_set_encrypt_key"); | ||
772 | |||
773 | &function_begin("${PREFIX}_set_decrypt_key"); | ||
774 | &mov ($inp,&wparam(0)); # inp | ||
775 | &lea ($base,&DWP(-56,"esp")); | ||
776 | &mov ($round,&wparam(1)); # bits | ||
777 | &and ($base,-16); | ||
778 | &mov ($key,&wparam(2)); # key | ||
779 | &xchg ($base,"esp"); # alloca | ||
780 | &mov (&DWP(48,"esp"),$base); | ||
781 | |||
782 | &mov ($base,$round); | ||
783 | &shr ($base,5); | ||
784 | &add ($base,5); | ||
785 | &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; | ||
786 | &shl ($base,4); | ||
787 | &lea ($key,&DWP(16,$key,$base)); | ||
788 | |||
789 | &mov ($out,1); | ||
790 | &mov ($magic,$round); | ||
791 | &shr ($magic,1); | ||
792 | &and ($magic,32); | ||
793 | &xor ($magic,32); # nbist==192?0:32; | ||
794 | |||
795 | &picsetup($const); | ||
796 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
797 | &lea ($const,&DWP(0x30,$const)) | ||
798 | |||
799 | &call ("_vpaes_schedule_core"); | ||
800 | |||
801 | &mov ("esp",&DWP(48,"esp")); | ||
802 | &xor ("eax","eax"); | ||
803 | &function_end("${PREFIX}_set_decrypt_key"); | ||
804 | |||
805 | &function_begin("${PREFIX}_encrypt"); | ||
806 | &picsetup($const); | ||
807 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
808 | &lea ($const,&DWP(0x30,$const)) | ||
809 | |||
810 | &call ("_vpaes_preheat"); | ||
811 | &mov ($inp,&wparam(0)); # inp | ||
812 | &lea ($base,&DWP(-56,"esp")); | ||
813 | &mov ($out,&wparam(1)); # out | ||
814 | &and ($base,-16); | ||
815 | &mov ($key,&wparam(2)); # key | ||
816 | &xchg ($base,"esp"); # alloca | ||
817 | &mov (&DWP(48,"esp"),$base); | ||
818 | |||
819 | &movdqu ("xmm0",&QWP(0,$inp)); | ||
820 | &call ("_vpaes_encrypt_core"); | ||
821 | &movdqu (&QWP(0,$out),"xmm0"); | ||
822 | |||
823 | &mov ("esp",&DWP(48,"esp")); | ||
824 | &function_end("${PREFIX}_encrypt"); | ||
825 | |||
826 | &function_begin("${PREFIX}_decrypt"); | ||
827 | &picsetup($const); | ||
828 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
829 | &lea ($const,&DWP(0x30,$const)) | ||
830 | |||
831 | &call ("_vpaes_preheat"); | ||
832 | &mov ($inp,&wparam(0)); # inp | ||
833 | &lea ($base,&DWP(-56,"esp")); | ||
834 | &mov ($out,&wparam(1)); # out | ||
835 | &and ($base,-16); | ||
836 | &mov ($key,&wparam(2)); # key | ||
837 | &xchg ($base,"esp"); # alloca | ||
838 | &mov (&DWP(48,"esp"),$base); | ||
839 | |||
840 | &movdqu ("xmm0",&QWP(0,$inp)); | ||
841 | &call ("_vpaes_decrypt_core"); | ||
842 | &movdqu (&QWP(0,$out),"xmm0"); | ||
843 | |||
844 | &mov ("esp",&DWP(48,"esp")); | ||
845 | &function_end("${PREFIX}_decrypt"); | ||
846 | |||
847 | &function_begin("${PREFIX}_cbc_encrypt"); | ||
848 | &mov ($inp,&wparam(0)); # inp | ||
849 | &mov ($out,&wparam(1)); # out | ||
850 | &mov ($round,&wparam(2)); # len | ||
851 | &mov ($key,&wparam(3)); # key | ||
852 | &sub ($round,16); | ||
853 | &jc (&label("cbc_abort")); | ||
854 | &lea ($base,&DWP(-56,"esp")); | ||
855 | &mov ($const,&wparam(4)); # ivp | ||
856 | &and ($base,-16); | ||
857 | &mov ($magic,&wparam(5)); # enc | ||
858 | &xchg ($base,"esp"); # alloca | ||
859 | &movdqu ("xmm1",&QWP(0,$const)); # load IV | ||
860 | &sub ($out,$inp); | ||
861 | &mov (&DWP(48,"esp"),$base); | ||
862 | |||
863 | &mov (&DWP(0,"esp"),$out); # save out | ||
864 | &mov (&DWP(4,"esp"),$key) # save key | ||
865 | &mov (&DWP(8,"esp"),$const); # save ivp | ||
866 | &mov ($out,$round); # $out works as $len | ||
867 | |||
868 | &picsetup($const); | ||
869 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
870 | &lea ($const,&DWP(0x30,$const)) | ||
871 | |||
872 | &call ("_vpaes_preheat"); | ||
873 | &cmp ($magic,0); | ||
874 | &je (&label("cbc_dec_loop")); | ||
875 | &jmp (&label("cbc_enc_loop")); | ||
876 | |||
877 | &set_label("cbc_enc_loop",16); | ||
878 | &movdqu ("xmm0",&QWP(0,$inp)); # load input | ||
879 | &pxor ("xmm0","xmm1"); # inp^=iv | ||
880 | &call ("_vpaes_encrypt_core"); | ||
881 | &mov ($base,&DWP(0,"esp")); # restore out | ||
882 | &mov ($key,&DWP(4,"esp")); # restore key | ||
883 | &movdqa ("xmm1","xmm0"); | ||
884 | &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output | ||
885 | &lea ($inp,&DWP(16,$inp)); | ||
886 | &sub ($out,16); | ||
887 | &jnc (&label("cbc_enc_loop")); | ||
888 | &jmp (&label("cbc_done")); | ||
889 | |||
890 | &set_label("cbc_dec_loop",16); | ||
891 | &movdqu ("xmm0",&QWP(0,$inp)); # load input | ||
892 | &movdqa (&QWP(16,"esp"),"xmm1"); # save IV | ||
893 | &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV | ||
894 | &call ("_vpaes_decrypt_core"); | ||
895 | &mov ($base,&DWP(0,"esp")); # restore out | ||
896 | &mov ($key,&DWP(4,"esp")); # restore key | ||
897 | &pxor ("xmm0",&QWP(16,"esp")); # out^=iv | ||
898 | &movdqa ("xmm1",&QWP(32,"esp")); # load next IV | ||
899 | &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output | ||
900 | &lea ($inp,&DWP(16,$inp)); | ||
901 | &sub ($out,16); | ||
902 | &jnc (&label("cbc_dec_loop")); | ||
903 | |||
904 | &set_label("cbc_done"); | ||
905 | &mov ($base,&DWP(8,"esp")); # restore ivp | ||
906 | &mov ("esp",&DWP(48,"esp")); | ||
907 | &movdqu (&QWP(0,$base),"xmm1"); # write IV | ||
908 | &set_label("cbc_abort"); | ||
909 | &function_end("${PREFIX}_cbc_encrypt"); | ||
910 | |||
911 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl deleted file mode 100644 index 7d92e8d8ca..0000000000 --- a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl +++ /dev/null | |||
@@ -1,1222 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | ###################################################################### | ||
4 | ## Constant-time SSSE3 AES core implementation. | ||
5 | ## version 0.1 | ||
6 | ## | ||
7 | ## By Mike Hamburg (Stanford University), 2009 | ||
8 | ## Public domain. | ||
9 | ## | ||
10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
11 | ## http://crypto.stanford.edu/vpaes/. | ||
12 | |||
13 | ###################################################################### | ||
14 | # September 2011. | ||
15 | # | ||
16 | # Interface to OpenSSL as "almost" drop-in replacement for | ||
17 | # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
18 | # doesn't handle partial vectors (doesn't have to if called from | ||
19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
20 | # schedule structure with the original nor does it make assumption | ||
21 | # about its alignment... | ||
22 | # | ||
23 | # Performance summary. aes-x86_64.pl column lists large-block CBC | ||
24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
25 | # byte processed with 128-bit key, and vpaes-x86_64.pl column - | ||
26 | # [also large-block CBC] encrypt/decrypt. | ||
27 | # | ||
28 | # aes-x86_64.pl vpaes-x86_64.pl | ||
29 | # | ||
30 | # Core 2(**) 30.5/43.7/14.3 21.8/25.7(***) | ||
31 | # Nehalem 30.5/42.2/14.6 9.8/11.8 | ||
32 | # Atom 63.9/79.0/32.1 64.0/84.8(***) | ||
33 | # | ||
34 | # (*) "Hyper-threading" in the context refers rather to cache shared | ||
35 | # among multiple cores, than to specifically Intel HTT. As vast | ||
36 | # majority of contemporary cores share cache, slower code path | ||
37 | # is common place. In other words "with-hyper-threading-off" | ||
38 | # results are presented mostly for reference purposes. | ||
39 | # | ||
40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
41 | # | ||
42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
43 | # pshufb, yet it's respectable +40%/78% improvement on Core 2 | ||
44 | # (as implied, over "hyper-threading-safe" code path). | ||
45 | # | ||
46 | # <appro@openssl.org> | ||
47 | |||
48 | $flavour = shift; | ||
49 | $output = shift; | ||
50 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
51 | |||
52 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
53 | |||
54 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
55 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
56 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
57 | die "can't locate x86_64-xlate.pl"; | ||
58 | |||
59 | open OUT,"| \"$^X\" $xlate $flavour $output"; | ||
60 | *STDOUT=*OUT; | ||
61 | |||
62 | $PREFIX="vpaes"; | ||
63 | |||
64 | $code.=<<___; | ||
65 | .text | ||
66 | |||
67 | ## | ||
68 | ## _aes_encrypt_core | ||
69 | ## | ||
70 | ## AES-encrypt %xmm0. | ||
71 | ## | ||
72 | ## Inputs: | ||
73 | ## %xmm0 = input | ||
74 | ## %xmm9-%xmm15 as in _vpaes_preheat | ||
75 | ## (%rdx) = scheduled keys | ||
76 | ## | ||
77 | ## Output in %xmm0 | ||
78 | ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax | ||
79 | ## Preserves %xmm6 - %xmm8 so you get some local vectors | ||
80 | ## | ||
81 | ## | ||
82 | .type _vpaes_encrypt_core,\@abi-omnipotent | ||
83 | .align 16 | ||
84 | _vpaes_encrypt_core: | ||
85 | _CET_ENDBR | ||
86 | mov %rdx, %r9 | ||
87 | mov \$16, %r11 | ||
88 | mov 240(%rdx),%eax | ||
89 | movdqa %xmm9, %xmm1 | ||
90 | movdqa .Lk_ipt(%rip), %xmm2 # iptlo | ||
91 | pandn %xmm0, %xmm1 | ||
92 | movdqu (%r9), %xmm5 # round0 key | ||
93 | psrld \$4, %xmm1 | ||
94 | pand %xmm9, %xmm0 | ||
95 | pshufb %xmm0, %xmm2 | ||
96 | movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi | ||
97 | pshufb %xmm1, %xmm0 | ||
98 | pxor %xmm5, %xmm2 | ||
99 | pxor %xmm2, %xmm0 | ||
100 | add \$16, %r9 | ||
101 | lea .Lk_mc_backward(%rip),%r10 | ||
102 | jmp .Lenc_entry | ||
103 | |||
104 | .align 16 | ||
105 | .Lenc_loop: | ||
106 | # middle of middle round | ||
107 | movdqa %xmm13, %xmm4 # 4 : sb1u | ||
108 | pshufb %xmm2, %xmm4 # 4 = sb1u | ||
109 | pxor %xmm5, %xmm4 # 4 = sb1u + k | ||
110 | movdqa %xmm12, %xmm0 # 0 : sb1t | ||
111 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
112 | pxor %xmm4, %xmm0 # 0 = A | ||
113 | movdqa %xmm15, %xmm5 # 4 : sb2u | ||
114 | pshufb %xmm2, %xmm5 # 4 = sb2u | ||
115 | movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] | ||
116 | movdqa %xmm14, %xmm2 # 2 : sb2t | ||
117 | pshufb %xmm3, %xmm2 # 2 = sb2t | ||
118 | pxor %xmm5, %xmm2 # 2 = 2A | ||
119 | movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] | ||
120 | movdqa %xmm0, %xmm3 # 3 = A | ||
121 | pshufb %xmm1, %xmm0 # 0 = B | ||
122 | add \$16, %r9 # next key | ||
123 | pxor %xmm2, %xmm0 # 0 = 2A+B | ||
124 | pshufb %xmm4, %xmm3 # 3 = D | ||
125 | add \$16, %r11 # next mc | ||
126 | pxor %xmm0, %xmm3 # 3 = 2A+B+D | ||
127 | pshufb %xmm1, %xmm0 # 0 = 2B+C | ||
128 | and \$0x30, %r11 # ... mod 4 | ||
129 | pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D | ||
130 | sub \$1,%rax # nr-- | ||
131 | |||
132 | .Lenc_entry: | ||
133 | # top of round | ||
134 | movdqa %xmm9, %xmm1 # 1 : i | ||
135 | pandn %xmm0, %xmm1 # 1 = i<<4 | ||
136 | psrld \$4, %xmm1 # 1 = i | ||
137 | pand %xmm9, %xmm0 # 0 = k | ||
138 | movdqa %xmm11, %xmm5 # 2 : a/k | ||
139 | pshufb %xmm0, %xmm5 # 2 = a/k | ||
140 | pxor %xmm1, %xmm0 # 0 = j | ||
141 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
142 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
143 | pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k | ||
144 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
145 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
146 | pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k | ||
147 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
148 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
149 | pxor %xmm0, %xmm2 # 2 = io | ||
150 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
151 | movdqu (%r9), %xmm5 | ||
152 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
153 | pxor %xmm1, %xmm3 # 3 = jo | ||
154 | jnz .Lenc_loop | ||
155 | |||
156 | # middle of last round | ||
157 | movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo | ||
158 | movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 | ||
159 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
160 | pxor %xmm5, %xmm4 # 4 = sb1u + k | ||
161 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
162 | movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] | ||
163 | pxor %xmm4, %xmm0 # 0 = A | ||
164 | pshufb %xmm1, %xmm0 | ||
165 | ret | ||
166 | .size _vpaes_encrypt_core,.-_vpaes_encrypt_core | ||
167 | |||
168 | ## | ||
169 | ## Decryption core | ||
170 | ## | ||
171 | ## Same API as encryption core. | ||
172 | ## | ||
173 | .type _vpaes_decrypt_core,\@abi-omnipotent | ||
174 | .align 16 | ||
175 | _vpaes_decrypt_core: | ||
176 | _CET_ENDBR | ||
177 | mov %rdx, %r9 # load key | ||
178 | mov 240(%rdx),%eax | ||
179 | movdqa %xmm9, %xmm1 | ||
180 | movdqa .Lk_dipt(%rip), %xmm2 # iptlo | ||
181 | pandn %xmm0, %xmm1 | ||
182 | mov %rax, %r11 | ||
183 | psrld \$4, %xmm1 | ||
184 | movdqu (%r9), %xmm5 # round0 key | ||
185 | shl \$4, %r11 | ||
186 | pand %xmm9, %xmm0 | ||
187 | pshufb %xmm0, %xmm2 | ||
188 | movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi | ||
189 | xor \$0x30, %r11 | ||
190 | lea .Lk_dsbd(%rip),%r10 | ||
191 | pshufb %xmm1, %xmm0 | ||
192 | and \$0x30, %r11 | ||
193 | pxor %xmm5, %xmm2 | ||
194 | movdqa .Lk_mc_forward+48(%rip), %xmm5 | ||
195 | pxor %xmm2, %xmm0 | ||
196 | add \$16, %r9 | ||
197 | add %r10, %r11 | ||
198 | jmp .Ldec_entry | ||
199 | |||
200 | .align 16 | ||
201 | .Ldec_loop: | ||
202 | ## | ||
203 | ## Inverse mix columns | ||
204 | ## | ||
205 | movdqa -0x20(%r10),%xmm4 # 4 : sb9u | ||
206 | pshufb %xmm2, %xmm4 # 4 = sb9u | ||
207 | pxor %xmm0, %xmm4 | ||
208 | movdqa -0x10(%r10),%xmm0 # 0 : sb9t | ||
209 | pshufb %xmm3, %xmm0 # 0 = sb9t | ||
210 | pxor %xmm4, %xmm0 # 0 = ch | ||
211 | add \$16, %r9 # next round key | ||
212 | |||
213 | pshufb %xmm5, %xmm0 # MC ch | ||
214 | movdqa 0x00(%r10),%xmm4 # 4 : sbdu | ||
215 | pshufb %xmm2, %xmm4 # 4 = sbdu | ||
216 | pxor %xmm0, %xmm4 # 4 = ch | ||
217 | movdqa 0x10(%r10),%xmm0 # 0 : sbdt | ||
218 | pshufb %xmm3, %xmm0 # 0 = sbdt | ||
219 | pxor %xmm4, %xmm0 # 0 = ch | ||
220 | sub \$1,%rax # nr-- | ||
221 | |||
222 | pshufb %xmm5, %xmm0 # MC ch | ||
223 | movdqa 0x20(%r10),%xmm4 # 4 : sbbu | ||
224 | pshufb %xmm2, %xmm4 # 4 = sbbu | ||
225 | pxor %xmm0, %xmm4 # 4 = ch | ||
226 | movdqa 0x30(%r10),%xmm0 # 0 : sbbt | ||
227 | pshufb %xmm3, %xmm0 # 0 = sbbt | ||
228 | pxor %xmm4, %xmm0 # 0 = ch | ||
229 | |||
230 | pshufb %xmm5, %xmm0 # MC ch | ||
231 | movdqa 0x40(%r10),%xmm4 # 4 : sbeu | ||
232 | pshufb %xmm2, %xmm4 # 4 = sbeu | ||
233 | pxor %xmm0, %xmm4 # 4 = ch | ||
234 | movdqa 0x50(%r10),%xmm0 # 0 : sbet | ||
235 | pshufb %xmm3, %xmm0 # 0 = sbet | ||
236 | pxor %xmm4, %xmm0 # 0 = ch | ||
237 | |||
238 | palignr \$12, %xmm5, %xmm5 | ||
239 | |||
240 | .Ldec_entry: | ||
241 | # top of round | ||
242 | movdqa %xmm9, %xmm1 # 1 : i | ||
243 | pandn %xmm0, %xmm1 # 1 = i<<4 | ||
244 | psrld \$4, %xmm1 # 1 = i | ||
245 | pand %xmm9, %xmm0 # 0 = k | ||
246 | movdqa %xmm11, %xmm2 # 2 : a/k | ||
247 | pshufb %xmm0, %xmm2 # 2 = a/k | ||
248 | pxor %xmm1, %xmm0 # 0 = j | ||
249 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
250 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
251 | pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k | ||
252 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
253 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
254 | pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k | ||
255 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
256 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
257 | pxor %xmm0, %xmm2 # 2 = io | ||
258 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
259 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
260 | pxor %xmm1, %xmm3 # 3 = jo | ||
261 | movdqu (%r9), %xmm0 | ||
262 | jnz .Ldec_loop | ||
263 | |||
264 | # middle of last round | ||
265 | movdqa 0x60(%r10), %xmm4 # 3 : sbou | ||
266 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
267 | pxor %xmm0, %xmm4 # 4 = sb1u + k | ||
268 | movdqa 0x70(%r10), %xmm0 # 0 : sbot | ||
269 | movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 | ||
270 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
271 | pxor %xmm4, %xmm0 # 0 = A | ||
272 | pshufb %xmm2, %xmm0 | ||
273 | ret | ||
274 | .size _vpaes_decrypt_core,.-_vpaes_decrypt_core | ||
275 | |||
276 | ######################################################## | ||
277 | ## ## | ||
278 | ## AES key schedule ## | ||
279 | ## ## | ||
280 | ######################################################## | ||
281 | .type _vpaes_schedule_core,\@abi-omnipotent | ||
282 | .align 16 | ||
283 | _vpaes_schedule_core: | ||
284 | _CET_ENDBR | ||
285 | # rdi = key | ||
286 | # rsi = size in bits | ||
287 | # rdx = buffer | ||
288 | # rcx = direction. 0=encrypt, 1=decrypt | ||
289 | |||
290 | call _vpaes_preheat # load the tables | ||
291 | movdqa .Lk_rcon(%rip), %xmm8 # load rcon | ||
292 | movdqu (%rdi), %xmm0 # load key (unaligned) | ||
293 | |||
294 | # input transform | ||
295 | movdqa %xmm0, %xmm3 | ||
296 | lea .Lk_ipt(%rip), %r11 | ||
297 | call _vpaes_schedule_transform | ||
298 | movdqa %xmm0, %xmm7 | ||
299 | |||
300 | lea .Lk_sr(%rip),%r10 | ||
301 | test %rcx, %rcx | ||
302 | jnz .Lschedule_am_decrypting | ||
303 | |||
304 | # encrypting, output zeroth round key after transform | ||
305 | movdqu %xmm0, (%rdx) | ||
306 | jmp .Lschedule_go | ||
307 | |||
308 | .Lschedule_am_decrypting: | ||
309 | # decrypting, output zeroth round key after shiftrows | ||
310 | movdqa (%r8,%r10),%xmm1 | ||
311 | pshufb %xmm1, %xmm3 | ||
312 | movdqu %xmm3, (%rdx) | ||
313 | xor \$0x30, %r8 | ||
314 | |||
315 | .Lschedule_go: | ||
316 | cmp \$192, %esi | ||
317 | ja .Lschedule_256 | ||
318 | je .Lschedule_192 | ||
319 | # 128: fall though | ||
320 | |||
321 | ## | ||
322 | ## .schedule_128 | ||
323 | ## | ||
324 | ## 128-bit specific part of key schedule. | ||
325 | ## | ||
326 | ## This schedule is really simple, because all its parts | ||
327 | ## are accomplished by the subroutines. | ||
328 | ## | ||
329 | .Lschedule_128: | ||
330 | mov \$10, %esi | ||
331 | |||
332 | .Loop_schedule_128: | ||
333 | call _vpaes_schedule_round | ||
334 | dec %rsi | ||
335 | jz .Lschedule_mangle_last | ||
336 | call _vpaes_schedule_mangle # write output | ||
337 | jmp .Loop_schedule_128 | ||
338 | |||
339 | ## | ||
340 | ## .aes_schedule_192 | ||
341 | ## | ||
342 | ## 192-bit specific part of key schedule. | ||
343 | ## | ||
344 | ## The main body of this schedule is the same as the 128-bit | ||
345 | ## schedule, but with more smearing. The long, high side is | ||
346 | ## stored in %xmm7 as before, and the short, low side is in | ||
347 | ## the high bits of %xmm6. | ||
348 | ## | ||
349 | ## This schedule is somewhat nastier, however, because each | ||
350 | ## round produces 192 bits of key material, or 1.5 round keys. | ||
351 | ## Therefore, on each cycle we do 2 rounds and produce 3 round | ||
352 | ## keys. | ||
353 | ## | ||
354 | .align 16 | ||
355 | .Lschedule_192: | ||
356 | movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) | ||
357 | call _vpaes_schedule_transform # input transform | ||
358 | movdqa %xmm0, %xmm6 # save short part | ||
359 | pxor %xmm4, %xmm4 # clear 4 | ||
360 | movhlps %xmm4, %xmm6 # clobber low side with zeros | ||
361 | mov \$4, %esi | ||
362 | |||
363 | .Loop_schedule_192: | ||
364 | call _vpaes_schedule_round | ||
365 | palignr \$8,%xmm6,%xmm0 | ||
366 | call _vpaes_schedule_mangle # save key n | ||
367 | call _vpaes_schedule_192_smear | ||
368 | call _vpaes_schedule_mangle # save key n+1 | ||
369 | call _vpaes_schedule_round | ||
370 | dec %rsi | ||
371 | jz .Lschedule_mangle_last | ||
372 | call _vpaes_schedule_mangle # save key n+2 | ||
373 | call _vpaes_schedule_192_smear | ||
374 | jmp .Loop_schedule_192 | ||
375 | |||
376 | ## | ||
377 | ## .aes_schedule_256 | ||
378 | ## | ||
379 | ## 256-bit specific part of key schedule. | ||
380 | ## | ||
381 | ## The structure here is very similar to the 128-bit | ||
382 | ## schedule, but with an additional "low side" in | ||
383 | ## %xmm6. The low side's rounds are the same as the | ||
384 | ## high side's, except no rcon and no rotation. | ||
385 | ## | ||
386 | .align 16 | ||
387 | .Lschedule_256: | ||
388 | movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) | ||
389 | call _vpaes_schedule_transform # input transform | ||
390 | mov \$7, %esi | ||
391 | |||
392 | .Loop_schedule_256: | ||
393 | call _vpaes_schedule_mangle # output low result | ||
394 | movdqa %xmm0, %xmm6 # save cur_lo in xmm6 | ||
395 | |||
396 | # high round | ||
397 | call _vpaes_schedule_round | ||
398 | dec %rsi | ||
399 | jz .Lschedule_mangle_last | ||
400 | call _vpaes_schedule_mangle | ||
401 | |||
402 | # low round. swap xmm7 and xmm6 | ||
403 | pshufd \$0xFF, %xmm0, %xmm0 | ||
404 | movdqa %xmm7, %xmm5 | ||
405 | movdqa %xmm6, %xmm7 | ||
406 | call _vpaes_schedule_low_round | ||
407 | movdqa %xmm5, %xmm7 | ||
408 | |||
409 | jmp .Loop_schedule_256 | ||
410 | |||
411 | |||
412 | ## | ||
413 | ## .aes_schedule_mangle_last | ||
414 | ## | ||
415 | ## Mangler for last round of key schedule | ||
416 | ## Mangles %xmm0 | ||
417 | ## when encrypting, outputs out(%xmm0) ^ 63 | ||
418 | ## when decrypting, outputs unskew(%xmm0) | ||
419 | ## | ||
420 | ## Always called right before return... jumps to cleanup and exits | ||
421 | ## | ||
422 | .align 16 | ||
423 | .Lschedule_mangle_last: | ||
424 | # schedule last round key from xmm0 | ||
425 | lea .Lk_deskew(%rip),%r11 # prepare to deskew | ||
426 | test %rcx, %rcx | ||
427 | jnz .Lschedule_mangle_last_dec | ||
428 | |||
429 | # encrypting | ||
430 | movdqa (%r8,%r10),%xmm1 | ||
431 | pshufb %xmm1, %xmm0 # output permute | ||
432 | lea .Lk_opt(%rip), %r11 # prepare to output transform | ||
433 | add \$32, %rdx | ||
434 | |||
435 | .Lschedule_mangle_last_dec: | ||
436 | add \$-16, %rdx | ||
437 | pxor .Lk_s63(%rip), %xmm0 | ||
438 | call _vpaes_schedule_transform # output transform | ||
439 | movdqu %xmm0, (%rdx) # save last key | ||
440 | |||
441 | # cleanup | ||
442 | pxor %xmm0, %xmm0 | ||
443 | pxor %xmm1, %xmm1 | ||
444 | pxor %xmm2, %xmm2 | ||
445 | pxor %xmm3, %xmm3 | ||
446 | pxor %xmm4, %xmm4 | ||
447 | pxor %xmm5, %xmm5 | ||
448 | pxor %xmm6, %xmm6 | ||
449 | pxor %xmm7, %xmm7 | ||
450 | ret | ||
451 | .size _vpaes_schedule_core,.-_vpaes_schedule_core | ||
452 | |||
453 | ## | ||
454 | ## .aes_schedule_192_smear | ||
455 | ## | ||
456 | ## Smear the short, low side in the 192-bit key schedule. | ||
457 | ## | ||
458 | ## Inputs: | ||
459 | ## %xmm7: high side, b a x y | ||
460 | ## %xmm6: low side, d c 0 0 | ||
461 | ## %xmm13: 0 | ||
462 | ## | ||
463 | ## Outputs: | ||
464 | ## %xmm6: b+c+d b+c 0 0 | ||
465 | ## %xmm0: b+c+d b+c b a | ||
466 | ## | ||
467 | .type _vpaes_schedule_192_smear,\@abi-omnipotent | ||
468 | .align 16 | ||
469 | _vpaes_schedule_192_smear: | ||
470 | _CET_ENDBR | ||
471 | pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0 | ||
472 | pxor %xmm0, %xmm6 # -> c+d c 0 0 | ||
473 | pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a | ||
474 | pxor %xmm0, %xmm6 # -> b+c+d b+c b a | ||
475 | movdqa %xmm6, %xmm0 | ||
476 | pxor %xmm1, %xmm1 | ||
477 | movhlps %xmm1, %xmm6 # clobber low side with zeros | ||
478 | ret | ||
479 | .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear | ||
480 | |||
481 | ## | ||
482 | ## .aes_schedule_round | ||
483 | ## | ||
484 | ## Runs one main round of the key schedule on %xmm0, %xmm7 | ||
485 | ## | ||
486 | ## Specifically, runs subbytes on the high dword of %xmm0 | ||
487 | ## then rotates it by one byte and xors into the low dword of | ||
488 | ## %xmm7. | ||
489 | ## | ||
490 | ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for | ||
491 | ## next rcon. | ||
492 | ## | ||
493 | ## Smears the dwords of %xmm7 by xoring the low into the | ||
494 | ## second low, result into third, result into highest. | ||
495 | ## | ||
496 | ## Returns results in %xmm7 = %xmm0. | ||
497 | ## Clobbers %xmm1-%xmm4, %r11. | ||
498 | ## | ||
499 | .type _vpaes_schedule_round,\@abi-omnipotent | ||
500 | .align 16 | ||
501 | _vpaes_schedule_round: | ||
502 | _CET_ENDBR | ||
503 | # extract rcon from xmm8 | ||
504 | pxor %xmm1, %xmm1 | ||
505 | palignr \$15, %xmm8, %xmm1 | ||
506 | palignr \$15, %xmm8, %xmm8 | ||
507 | pxor %xmm1, %xmm7 | ||
508 | |||
509 | # rotate | ||
510 | pshufd \$0xFF, %xmm0, %xmm0 | ||
511 | palignr \$1, %xmm0, %xmm0 | ||
512 | |||
513 | # fall through... | ||
514 | |||
515 | # low round: same as high round, but no rotation and no rcon. | ||
516 | _vpaes_schedule_low_round: | ||
517 | # smear xmm7 | ||
518 | movdqa %xmm7, %xmm1 | ||
519 | pslldq \$4, %xmm7 | ||
520 | pxor %xmm1, %xmm7 | ||
521 | movdqa %xmm7, %xmm1 | ||
522 | pslldq \$8, %xmm7 | ||
523 | pxor %xmm1, %xmm7 | ||
524 | pxor .Lk_s63(%rip), %xmm7 | ||
525 | |||
526 | # subbytes | ||
527 | movdqa %xmm9, %xmm1 | ||
528 | pandn %xmm0, %xmm1 | ||
529 | psrld \$4, %xmm1 # 1 = i | ||
530 | pand %xmm9, %xmm0 # 0 = k | ||
531 | movdqa %xmm11, %xmm2 # 2 : a/k | ||
532 | pshufb %xmm0, %xmm2 # 2 = a/k | ||
533 | pxor %xmm1, %xmm0 # 0 = j | ||
534 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
535 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
536 | pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k | ||
537 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
538 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
539 | pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k | ||
540 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
541 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
542 | pxor %xmm0, %xmm2 # 2 = io | ||
543 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
544 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
545 | pxor %xmm1, %xmm3 # 3 = jo | ||
546 | movdqa %xmm13, %xmm4 # 4 : sbou | ||
547 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
548 | movdqa %xmm12, %xmm0 # 0 : sbot | ||
549 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
550 | pxor %xmm4, %xmm0 # 0 = sbox output | ||
551 | |||
552 | # add in smeared stuff | ||
553 | pxor %xmm7, %xmm0 | ||
554 | movdqa %xmm0, %xmm7 | ||
555 | ret | ||
556 | .size _vpaes_schedule_round,.-_vpaes_schedule_round | ||
557 | |||
558 | ## | ||
559 | ## .aes_schedule_transform | ||
560 | ## | ||
561 | ## Linear-transform %xmm0 according to tables at (%r11) | ||
562 | ## | ||
563 | ## Requires that %xmm9 = 0x0F0F... as in preheat | ||
564 | ## Output in %xmm0 | ||
565 | ## Clobbers %xmm1, %xmm2 | ||
566 | ## | ||
567 | .type _vpaes_schedule_transform,\@abi-omnipotent | ||
568 | .align 16 | ||
569 | _vpaes_schedule_transform: | ||
570 | _CET_ENDBR | ||
571 | movdqa %xmm9, %xmm1 | ||
572 | pandn %xmm0, %xmm1 | ||
573 | psrld \$4, %xmm1 | ||
574 | pand %xmm9, %xmm0 | ||
575 | movdqa (%r11), %xmm2 # lo | ||
576 | pshufb %xmm0, %xmm2 | ||
577 | movdqa 16(%r11), %xmm0 # hi | ||
578 | pshufb %xmm1, %xmm0 | ||
579 | pxor %xmm2, %xmm0 | ||
580 | ret | ||
581 | .size _vpaes_schedule_transform,.-_vpaes_schedule_transform | ||
582 | |||
583 | ## | ||
584 | ## .aes_schedule_mangle | ||
585 | ## | ||
586 | ## Mangle xmm0 from (basis-transformed) standard version | ||
587 | ## to our version. | ||
588 | ## | ||
589 | ## On encrypt, | ||
590 | ## xor with 0x63 | ||
591 | ## multiply by circulant 0,1,1,1 | ||
592 | ## apply shiftrows transform | ||
593 | ## | ||
594 | ## On decrypt, | ||
595 | ## xor with 0x63 | ||
596 | ## multiply by "inverse mixcolumns" circulant E,B,D,9 | ||
597 | ## deskew | ||
598 | ## apply shiftrows transform | ||
599 | ## | ||
600 | ## | ||
601 | ## Writes out to (%rdx), and increments or decrements it | ||
602 | ## Keeps track of round number mod 4 in %r8 | ||
603 | ## Preserves xmm0 | ||
604 | ## Clobbers xmm1-xmm5 | ||
605 | ## | ||
606 | .type _vpaes_schedule_mangle,\@abi-omnipotent | ||
607 | .align 16 | ||
608 | _vpaes_schedule_mangle: | ||
609 | _CET_ENDBR | ||
610 | movdqa %xmm0, %xmm4 # save xmm0 for later | ||
611 | movdqa .Lk_mc_forward(%rip),%xmm5 | ||
612 | test %rcx, %rcx | ||
613 | jnz .Lschedule_mangle_dec | ||
614 | |||
615 | # encrypting | ||
616 | add \$16, %rdx | ||
617 | pxor .Lk_s63(%rip),%xmm4 | ||
618 | pshufb %xmm5, %xmm4 | ||
619 | movdqa %xmm4, %xmm3 | ||
620 | pshufb %xmm5, %xmm4 | ||
621 | pxor %xmm4, %xmm3 | ||
622 | pshufb %xmm5, %xmm4 | ||
623 | pxor %xmm4, %xmm3 | ||
624 | |||
625 | jmp .Lschedule_mangle_both | ||
626 | .align 16 | ||
627 | .Lschedule_mangle_dec: | ||
628 | # inverse mix columns | ||
629 | lea .Lk_dksd(%rip),%r11 | ||
630 | movdqa %xmm9, %xmm1 | ||
631 | pandn %xmm4, %xmm1 | ||
632 | psrld \$4, %xmm1 # 1 = hi | ||
633 | pand %xmm9, %xmm4 # 4 = lo | ||
634 | |||
635 | movdqa 0x00(%r11), %xmm2 | ||
636 | pshufb %xmm4, %xmm2 | ||
637 | movdqa 0x10(%r11), %xmm3 | ||
638 | pshufb %xmm1, %xmm3 | ||
639 | pxor %xmm2, %xmm3 | ||
640 | pshufb %xmm5, %xmm3 | ||
641 | |||
642 | movdqa 0x20(%r11), %xmm2 | ||
643 | pshufb %xmm4, %xmm2 | ||
644 | pxor %xmm3, %xmm2 | ||
645 | movdqa 0x30(%r11), %xmm3 | ||
646 | pshufb %xmm1, %xmm3 | ||
647 | pxor %xmm2, %xmm3 | ||
648 | pshufb %xmm5, %xmm3 | ||
649 | |||
650 | movdqa 0x40(%r11), %xmm2 | ||
651 | pshufb %xmm4, %xmm2 | ||
652 | pxor %xmm3, %xmm2 | ||
653 | movdqa 0x50(%r11), %xmm3 | ||
654 | pshufb %xmm1, %xmm3 | ||
655 | pxor %xmm2, %xmm3 | ||
656 | pshufb %xmm5, %xmm3 | ||
657 | |||
658 | movdqa 0x60(%r11), %xmm2 | ||
659 | pshufb %xmm4, %xmm2 | ||
660 | pxor %xmm3, %xmm2 | ||
661 | movdqa 0x70(%r11), %xmm3 | ||
662 | pshufb %xmm1, %xmm3 | ||
663 | pxor %xmm2, %xmm3 | ||
664 | |||
665 | add \$-16, %rdx | ||
666 | |||
667 | .Lschedule_mangle_both: | ||
668 | movdqa (%r8,%r10),%xmm1 | ||
669 | pshufb %xmm1,%xmm3 | ||
670 | add \$-16, %r8 | ||
671 | and \$0x30, %r8 | ||
672 | movdqu %xmm3, (%rdx) | ||
673 | ret | ||
674 | .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle | ||
675 | |||
676 | # | ||
677 | # Interface to OpenSSL | ||
678 | # | ||
679 | .globl ${PREFIX}_set_encrypt_key | ||
680 | .type ${PREFIX}_set_encrypt_key,\@function,3 | ||
681 | .align 16 | ||
682 | ${PREFIX}_set_encrypt_key: | ||
683 | _CET_ENDBR | ||
684 | ___ | ||
685 | $code.=<<___ if ($win64); | ||
686 | lea -0xb8(%rsp),%rsp | ||
687 | movaps %xmm6,0x10(%rsp) | ||
688 | movaps %xmm7,0x20(%rsp) | ||
689 | movaps %xmm8,0x30(%rsp) | ||
690 | movaps %xmm9,0x40(%rsp) | ||
691 | movaps %xmm10,0x50(%rsp) | ||
692 | movaps %xmm11,0x60(%rsp) | ||
693 | movaps %xmm12,0x70(%rsp) | ||
694 | movaps %xmm13,0x80(%rsp) | ||
695 | movaps %xmm14,0x90(%rsp) | ||
696 | movaps %xmm15,0xa0(%rsp) | ||
697 | .Lenc_key_body: | ||
698 | ___ | ||
699 | $code.=<<___; | ||
700 | mov %esi,%eax | ||
701 | shr \$5,%eax | ||
702 | add \$5,%eax | ||
703 | mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; | ||
704 | |||
705 | mov \$0,%ecx | ||
706 | mov \$0x30,%r8d | ||
707 | call _vpaes_schedule_core | ||
708 | ___ | ||
709 | $code.=<<___ if ($win64); | ||
710 | movaps 0x10(%rsp),%xmm6 | ||
711 | movaps 0x20(%rsp),%xmm7 | ||
712 | movaps 0x30(%rsp),%xmm8 | ||
713 | movaps 0x40(%rsp),%xmm9 | ||
714 | movaps 0x50(%rsp),%xmm10 | ||
715 | movaps 0x60(%rsp),%xmm11 | ||
716 | movaps 0x70(%rsp),%xmm12 | ||
717 | movaps 0x80(%rsp),%xmm13 | ||
718 | movaps 0x90(%rsp),%xmm14 | ||
719 | movaps 0xa0(%rsp),%xmm15 | ||
720 | lea 0xb8(%rsp),%rsp | ||
721 | .Lenc_key_epilogue: | ||
722 | ___ | ||
723 | $code.=<<___; | ||
724 | xor %eax,%eax | ||
725 | ret | ||
726 | .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key | ||
727 | |||
728 | .globl ${PREFIX}_set_decrypt_key | ||
729 | .type ${PREFIX}_set_decrypt_key,\@function,3 | ||
730 | .align 16 | ||
731 | ${PREFIX}_set_decrypt_key: | ||
732 | _CET_ENDBR | ||
733 | ___ | ||
734 | $code.=<<___ if ($win64); | ||
735 | lea -0xb8(%rsp),%rsp | ||
736 | movaps %xmm6,0x10(%rsp) | ||
737 | movaps %xmm7,0x20(%rsp) | ||
738 | movaps %xmm8,0x30(%rsp) | ||
739 | movaps %xmm9,0x40(%rsp) | ||
740 | movaps %xmm10,0x50(%rsp) | ||
741 | movaps %xmm11,0x60(%rsp) | ||
742 | movaps %xmm12,0x70(%rsp) | ||
743 | movaps %xmm13,0x80(%rsp) | ||
744 | movaps %xmm14,0x90(%rsp) | ||
745 | movaps %xmm15,0xa0(%rsp) | ||
746 | .Ldec_key_body: | ||
747 | ___ | ||
748 | $code.=<<___; | ||
749 | mov %esi,%eax | ||
750 | shr \$5,%eax | ||
751 | add \$5,%eax | ||
752 | mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; | ||
753 | shl \$4,%eax | ||
754 | lea 16(%rdx,%rax),%rdx | ||
755 | |||
756 | mov \$1,%ecx | ||
757 | mov %esi,%r8d | ||
758 | shr \$1,%r8d | ||
759 | and \$32,%r8d | ||
760 | xor \$32,%r8d # nbits==192?0:32 | ||
761 | call _vpaes_schedule_core | ||
762 | ___ | ||
763 | $code.=<<___ if ($win64); | ||
764 | movaps 0x10(%rsp),%xmm6 | ||
765 | movaps 0x20(%rsp),%xmm7 | ||
766 | movaps 0x30(%rsp),%xmm8 | ||
767 | movaps 0x40(%rsp),%xmm9 | ||
768 | movaps 0x50(%rsp),%xmm10 | ||
769 | movaps 0x60(%rsp),%xmm11 | ||
770 | movaps 0x70(%rsp),%xmm12 | ||
771 | movaps 0x80(%rsp),%xmm13 | ||
772 | movaps 0x90(%rsp),%xmm14 | ||
773 | movaps 0xa0(%rsp),%xmm15 | ||
774 | lea 0xb8(%rsp),%rsp | ||
775 | .Ldec_key_epilogue: | ||
776 | ___ | ||
777 | $code.=<<___; | ||
778 | xor %eax,%eax | ||
779 | ret | ||
780 | .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key | ||
781 | |||
782 | .globl ${PREFIX}_encrypt | ||
783 | .type ${PREFIX}_encrypt,\@function,3 | ||
784 | .align 16 | ||
785 | ${PREFIX}_encrypt: | ||
786 | _CET_ENDBR | ||
787 | ___ | ||
788 | $code.=<<___ if ($win64); | ||
789 | lea -0xb8(%rsp),%rsp | ||
790 | movaps %xmm6,0x10(%rsp) | ||
791 | movaps %xmm7,0x20(%rsp) | ||
792 | movaps %xmm8,0x30(%rsp) | ||
793 | movaps %xmm9,0x40(%rsp) | ||
794 | movaps %xmm10,0x50(%rsp) | ||
795 | movaps %xmm11,0x60(%rsp) | ||
796 | movaps %xmm12,0x70(%rsp) | ||
797 | movaps %xmm13,0x80(%rsp) | ||
798 | movaps %xmm14,0x90(%rsp) | ||
799 | movaps %xmm15,0xa0(%rsp) | ||
800 | .Lenc_body: | ||
801 | ___ | ||
802 | $code.=<<___; | ||
803 | movdqu (%rdi),%xmm0 | ||
804 | call _vpaes_preheat | ||
805 | call _vpaes_encrypt_core | ||
806 | movdqu %xmm0,(%rsi) | ||
807 | ___ | ||
808 | $code.=<<___ if ($win64); | ||
809 | movaps 0x10(%rsp),%xmm6 | ||
810 | movaps 0x20(%rsp),%xmm7 | ||
811 | movaps 0x30(%rsp),%xmm8 | ||
812 | movaps 0x40(%rsp),%xmm9 | ||
813 | movaps 0x50(%rsp),%xmm10 | ||
814 | movaps 0x60(%rsp),%xmm11 | ||
815 | movaps 0x70(%rsp),%xmm12 | ||
816 | movaps 0x80(%rsp),%xmm13 | ||
817 | movaps 0x90(%rsp),%xmm14 | ||
818 | movaps 0xa0(%rsp),%xmm15 | ||
819 | lea 0xb8(%rsp),%rsp | ||
820 | .Lenc_epilogue: | ||
821 | ___ | ||
822 | $code.=<<___; | ||
823 | ret | ||
824 | .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt | ||
825 | |||
826 | .globl ${PREFIX}_decrypt | ||
827 | .type ${PREFIX}_decrypt,\@function,3 | ||
828 | .align 16 | ||
829 | ${PREFIX}_decrypt: | ||
830 | _CET_ENDBR | ||
831 | ___ | ||
832 | $code.=<<___ if ($win64); | ||
833 | lea -0xb8(%rsp),%rsp | ||
834 | movaps %xmm6,0x10(%rsp) | ||
835 | movaps %xmm7,0x20(%rsp) | ||
836 | movaps %xmm8,0x30(%rsp) | ||
837 | movaps %xmm9,0x40(%rsp) | ||
838 | movaps %xmm10,0x50(%rsp) | ||
839 | movaps %xmm11,0x60(%rsp) | ||
840 | movaps %xmm12,0x70(%rsp) | ||
841 | movaps %xmm13,0x80(%rsp) | ||
842 | movaps %xmm14,0x90(%rsp) | ||
843 | movaps %xmm15,0xa0(%rsp) | ||
844 | .Ldec_body: | ||
845 | ___ | ||
846 | $code.=<<___; | ||
847 | movdqu (%rdi),%xmm0 | ||
848 | call _vpaes_preheat | ||
849 | call _vpaes_decrypt_core | ||
850 | movdqu %xmm0,(%rsi) | ||
851 | ___ | ||
852 | $code.=<<___ if ($win64); | ||
853 | movaps 0x10(%rsp),%xmm6 | ||
854 | movaps 0x20(%rsp),%xmm7 | ||
855 | movaps 0x30(%rsp),%xmm8 | ||
856 | movaps 0x40(%rsp),%xmm9 | ||
857 | movaps 0x50(%rsp),%xmm10 | ||
858 | movaps 0x60(%rsp),%xmm11 | ||
859 | movaps 0x70(%rsp),%xmm12 | ||
860 | movaps 0x80(%rsp),%xmm13 | ||
861 | movaps 0x90(%rsp),%xmm14 | ||
862 | movaps 0xa0(%rsp),%xmm15 | ||
863 | lea 0xb8(%rsp),%rsp | ||
864 | .Ldec_epilogue: | ||
865 | ___ | ||
866 | $code.=<<___; | ||
867 | ret | ||
868 | .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt | ||
869 | ___ | ||
870 | { | ||
871 | my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | ||
872 | # void AES_cbc_encrypt (const void char *inp, unsigned char *out, | ||
873 | # size_t length, const AES_KEY *key, | ||
874 | # unsigned char *ivp,const int enc); | ||
875 | $code.=<<___; | ||
876 | .globl ${PREFIX}_cbc_encrypt | ||
877 | .type ${PREFIX}_cbc_encrypt,\@function,6 | ||
878 | .align 16 | ||
879 | ${PREFIX}_cbc_encrypt: | ||
880 | _CET_ENDBR | ||
881 | xchg $key,$len | ||
882 | ___ | ||
883 | ($len,$key)=($key,$len); | ||
884 | $code.=<<___; | ||
885 | sub \$16,$len | ||
886 | jc .Lcbc_abort | ||
887 | ___ | ||
888 | $code.=<<___ if ($win64); | ||
889 | lea -0xb8(%rsp),%rsp | ||
890 | movaps %xmm6,0x10(%rsp) | ||
891 | movaps %xmm7,0x20(%rsp) | ||
892 | movaps %xmm8,0x30(%rsp) | ||
893 | movaps %xmm9,0x40(%rsp) | ||
894 | movaps %xmm10,0x50(%rsp) | ||
895 | movaps %xmm11,0x60(%rsp) | ||
896 | movaps %xmm12,0x70(%rsp) | ||
897 | movaps %xmm13,0x80(%rsp) | ||
898 | movaps %xmm14,0x90(%rsp) | ||
899 | movaps %xmm15,0xa0(%rsp) | ||
900 | .Lcbc_body: | ||
901 | ___ | ||
902 | $code.=<<___; | ||
903 | movdqu ($ivp),%xmm6 # load IV | ||
904 | sub $inp,$out | ||
905 | call _vpaes_preheat | ||
906 | cmp \$0,${enc}d | ||
907 | je .Lcbc_dec_loop | ||
908 | jmp .Lcbc_enc_loop | ||
909 | .align 16 | ||
910 | .Lcbc_enc_loop: | ||
911 | movdqu ($inp),%xmm0 | ||
912 | pxor %xmm6,%xmm0 | ||
913 | call _vpaes_encrypt_core | ||
914 | movdqa %xmm0,%xmm6 | ||
915 | movdqu %xmm0,($out,$inp) | ||
916 | lea 16($inp),$inp | ||
917 | sub \$16,$len | ||
918 | jnc .Lcbc_enc_loop | ||
919 | jmp .Lcbc_done | ||
920 | .align 16 | ||
921 | .Lcbc_dec_loop: | ||
922 | movdqu ($inp),%xmm0 | ||
923 | movdqa %xmm0,%xmm7 | ||
924 | call _vpaes_decrypt_core | ||
925 | pxor %xmm6,%xmm0 | ||
926 | movdqa %xmm7,%xmm6 | ||
927 | movdqu %xmm0,($out,$inp) | ||
928 | lea 16($inp),$inp | ||
929 | sub \$16,$len | ||
930 | jnc .Lcbc_dec_loop | ||
931 | .Lcbc_done: | ||
932 | movdqu %xmm6,($ivp) # save IV | ||
933 | ___ | ||
934 | $code.=<<___ if ($win64); | ||
935 | movaps 0x10(%rsp),%xmm6 | ||
936 | movaps 0x20(%rsp),%xmm7 | ||
937 | movaps 0x30(%rsp),%xmm8 | ||
938 | movaps 0x40(%rsp),%xmm9 | ||
939 | movaps 0x50(%rsp),%xmm10 | ||
940 | movaps 0x60(%rsp),%xmm11 | ||
941 | movaps 0x70(%rsp),%xmm12 | ||
942 | movaps 0x80(%rsp),%xmm13 | ||
943 | movaps 0x90(%rsp),%xmm14 | ||
944 | movaps 0xa0(%rsp),%xmm15 | ||
945 | lea 0xb8(%rsp),%rsp | ||
946 | .Lcbc_epilogue: | ||
947 | ___ | ||
948 | $code.=<<___; | ||
949 | .Lcbc_abort: | ||
950 | ret | ||
951 | .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt | ||
952 | ___ | ||
953 | } | ||
954 | $code.=<<___; | ||
955 | ## | ||
956 | ## _aes_preheat | ||
957 | ## | ||
958 | ## Fills register %r10 -> .aes_consts (so you can -fPIC) | ||
959 | ## and %xmm9-%xmm15 as specified below. | ||
960 | ## | ||
961 | .type _vpaes_preheat,\@abi-omnipotent | ||
962 | .align 16 | ||
963 | _vpaes_preheat: | ||
964 | _CET_ENDBR | ||
965 | lea .Lk_s0F(%rip), %r10 | ||
966 | movdqa -0x20(%r10), %xmm10 # .Lk_inv | ||
967 | movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 | ||
968 | movdqa 0x00(%r10), %xmm9 # .Lk_s0F | ||
969 | movdqa 0x30(%r10), %xmm13 # .Lk_sb1 | ||
970 | movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 | ||
971 | movdqa 0x50(%r10), %xmm15 # .Lk_sb2 | ||
972 | movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 | ||
973 | ret | ||
974 | .size _vpaes_preheat,.-_vpaes_preheat | ||
975 | ######################################################## | ||
976 | ## ## | ||
977 | ## Constants ## | ||
978 | ## ## | ||
979 | ######################################################## | ||
980 | .section .rodata | ||
981 | .type _vpaes_consts,\@object | ||
982 | .align 64 | ||
983 | _vpaes_consts: | ||
984 | .Lk_inv: # inv, inva | ||
985 | .quad 0x0E05060F0D080180, 0x040703090A0B0C02 | ||
986 | .quad 0x01040A060F0B0780, 0x030D0E0C02050809 | ||
987 | |||
988 | .Lk_s0F: # s0F | ||
989 | .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F | ||
990 | |||
991 | .Lk_ipt: # input transform (lo, hi) | ||
992 | .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 | ||
993 | .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 | ||
994 | |||
995 | .Lk_sb1: # sb1u, sb1t | ||
996 | .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 | ||
997 | .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF | ||
998 | .Lk_sb2: # sb2u, sb2t | ||
999 | .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD | ||
1000 | .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A | ||
1001 | .Lk_sbo: # sbou, sbot | ||
1002 | .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 | ||
1003 | .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA | ||
1004 | |||
1005 | .Lk_mc_forward: # mc_forward | ||
1006 | .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 | ||
1007 | .quad 0x080B0A0904070605, 0x000302010C0F0E0D | ||
1008 | .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 | ||
1009 | .quad 0x000302010C0F0E0D, 0x080B0A0904070605 | ||
1010 | |||
1011 | .Lk_mc_backward:# mc_backward | ||
1012 | .quad 0x0605040702010003, 0x0E0D0C0F0A09080B | ||
1013 | .quad 0x020100030E0D0C0F, 0x0A09080B06050407 | ||
1014 | .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 | ||
1015 | .quad 0x0A09080B06050407, 0x020100030E0D0C0F | ||
1016 | |||
1017 | .Lk_sr: # sr | ||
1018 | .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 | ||
1019 | .quad 0x030E09040F0A0500, 0x0B06010C07020D08 | ||
1020 | .quad 0x0F060D040B020900, 0x070E050C030A0108 | ||
1021 | .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 | ||
1022 | |||
1023 | .Lk_rcon: # rcon | ||
1024 | .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 | ||
1025 | |||
1026 | .Lk_s63: # s63: all equal to 0x63 transformed | ||
1027 | .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B | ||
1028 | |||
1029 | .Lk_opt: # output transform | ||
1030 | .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 | ||
1031 | .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 | ||
1032 | |||
1033 | .Lk_deskew: # deskew tables: inverts the sbox's "skew" | ||
1034 | .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A | ||
1035 | .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 | ||
1036 | |||
1037 | ## | ||
1038 | ## Decryption stuff | ||
1039 | ## Key schedule constants | ||
1040 | ## | ||
1041 | .Lk_dksd: # decryption key schedule: invskew x*D | ||
1042 | .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 | ||
1043 | .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E | ||
1044 | .Lk_dksb: # decryption key schedule: invskew x*B | ||
1045 | .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 | ||
1046 | .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 | ||
1047 | .Lk_dkse: # decryption key schedule: invskew x*E + 0x63 | ||
1048 | .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 | ||
1049 | .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 | ||
1050 | .Lk_dks9: # decryption key schedule: invskew x*9 | ||
1051 | .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC | ||
1052 | .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE | ||
1053 | |||
1054 | ## | ||
1055 | ## Decryption stuff | ||
1056 | ## Round function constants | ||
1057 | ## | ||
1058 | .Lk_dipt: # decryption input transform | ||
1059 | .quad 0x0F505B040B545F00, 0x154A411E114E451A | ||
1060 | .quad 0x86E383E660056500, 0x12771772F491F194 | ||
1061 | |||
1062 | .Lk_dsb9: # decryption sbox output *9*u, *9*t | ||
1063 | .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 | ||
1064 | .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 | ||
1065 | .Lk_dsbd: # decryption sbox output *D*u, *D*t | ||
1066 | .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 | ||
1067 | .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 | ||
1068 | .Lk_dsbb: # decryption sbox output *B*u, *B*t | ||
1069 | .quad 0xD022649296B44200, 0x602646F6B0F2D404 | ||
1070 | .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B | ||
1071 | .Lk_dsbe: # decryption sbox output *E*u, *E*t | ||
1072 | .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 | ||
1073 | .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 | ||
1074 | .Lk_dsbo: # decryption sbox final output | ||
1075 | .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D | ||
1076 | .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C | ||
1077 | .align 64 | ||
1078 | .size _vpaes_consts,.-_vpaes_consts | ||
1079 | .text | ||
1080 | ___ | ||
1081 | |||
1082 | if ($win64) { | ||
1083 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
1084 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
1085 | $rec="%rcx"; | ||
1086 | $frame="%rdx"; | ||
1087 | $context="%r8"; | ||
1088 | $disp="%r9"; | ||
1089 | |||
1090 | $code.=<<___; | ||
1091 | .extern __imp_RtlVirtualUnwind | ||
1092 | .type se_handler,\@abi-omnipotent | ||
1093 | .align 16 | ||
1094 | se_handler: | ||
1095 | _CET_ENDBR | ||
1096 | push %rsi | ||
1097 | push %rdi | ||
1098 | push %rbx | ||
1099 | push %rbp | ||
1100 | push %r12 | ||
1101 | push %r13 | ||
1102 | push %r14 | ||
1103 | push %r15 | ||
1104 | pushfq | ||
1105 | sub \$64,%rsp | ||
1106 | |||
1107 | mov 120($context),%rax # pull context->Rax | ||
1108 | mov 248($context),%rbx # pull context->Rip | ||
1109 | |||
1110 | mov 8($disp),%rsi # disp->ImageBase | ||
1111 | mov 56($disp),%r11 # disp->HandlerData | ||
1112 | |||
1113 | mov 0(%r11),%r10d # HandlerData[0] | ||
1114 | lea (%rsi,%r10),%r10 # prologue label | ||
1115 | cmp %r10,%rbx # context->Rip<prologue label | ||
1116 | jb .Lin_prologue | ||
1117 | |||
1118 | mov 152($context),%rax # pull context->Rsp | ||
1119 | |||
1120 | mov 4(%r11),%r10d # HandlerData[1] | ||
1121 | lea (%rsi,%r10),%r10 # epilogue label | ||
1122 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
1123 | jae .Lin_prologue | ||
1124 | |||
1125 | lea 16(%rax),%rsi # %xmm save area | ||
1126 | lea 512($context),%rdi # &context.Xmm6 | ||
1127 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
1128 | .long 0xa548f3fc # cld; rep movsq | ||
1129 | lea 0xb8(%rax),%rax # adjust stack pointer | ||
1130 | |||
1131 | .Lin_prologue: | ||
1132 | mov 8(%rax),%rdi | ||
1133 | mov 16(%rax),%rsi | ||
1134 | mov %rax,152($context) # restore context->Rsp | ||
1135 | mov %rsi,168($context) # restore context->Rsi | ||
1136 | mov %rdi,176($context) # restore context->Rdi | ||
1137 | |||
1138 | mov 40($disp),%rdi # disp->ContextRecord | ||
1139 | mov $context,%rsi # context | ||
1140 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
1141 | .long 0xa548f3fc # cld; rep movsq | ||
1142 | |||
1143 | mov $disp,%rsi | ||
1144 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
1145 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
1146 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
1147 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
1148 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
1149 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
1150 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
1151 | mov %r10,32(%rsp) # arg5 | ||
1152 | mov %r11,40(%rsp) # arg6 | ||
1153 | mov %r12,48(%rsp) # arg7 | ||
1154 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
1155 | call *__imp_RtlVirtualUnwind(%rip) | ||
1156 | |||
1157 | mov \$1,%eax # ExceptionContinueSearch | ||
1158 | add \$64,%rsp | ||
1159 | popfq | ||
1160 | pop %r15 | ||
1161 | pop %r14 | ||
1162 | pop %r13 | ||
1163 | pop %r12 | ||
1164 | pop %rbp | ||
1165 | pop %rbx | ||
1166 | pop %rdi | ||
1167 | pop %rsi | ||
1168 | ret | ||
1169 | .size se_handler,.-se_handler | ||
1170 | |||
1171 | .section .pdata | ||
1172 | .align 4 | ||
1173 | .rva .LSEH_begin_${PREFIX}_set_encrypt_key | ||
1174 | .rva .LSEH_end_${PREFIX}_set_encrypt_key | ||
1175 | .rva .LSEH_info_${PREFIX}_set_encrypt_key | ||
1176 | |||
1177 | .rva .LSEH_begin_${PREFIX}_set_decrypt_key | ||
1178 | .rva .LSEH_end_${PREFIX}_set_decrypt_key | ||
1179 | .rva .LSEH_info_${PREFIX}_set_decrypt_key | ||
1180 | |||
1181 | .rva .LSEH_begin_${PREFIX}_encrypt | ||
1182 | .rva .LSEH_end_${PREFIX}_encrypt | ||
1183 | .rva .LSEH_info_${PREFIX}_encrypt | ||
1184 | |||
1185 | .rva .LSEH_begin_${PREFIX}_decrypt | ||
1186 | .rva .LSEH_end_${PREFIX}_decrypt | ||
1187 | .rva .LSEH_info_${PREFIX}_decrypt | ||
1188 | |||
1189 | .rva .LSEH_begin_${PREFIX}_cbc_encrypt | ||
1190 | .rva .LSEH_end_${PREFIX}_cbc_encrypt | ||
1191 | .rva .LSEH_info_${PREFIX}_cbc_encrypt | ||
1192 | |||
1193 | .section .xdata | ||
1194 | .align 8 | ||
1195 | .LSEH_info_${PREFIX}_set_encrypt_key: | ||
1196 | .byte 9,0,0,0 | ||
1197 | .rva se_handler | ||
1198 | .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[] | ||
1199 | .LSEH_info_${PREFIX}_set_decrypt_key: | ||
1200 | .byte 9,0,0,0 | ||
1201 | .rva se_handler | ||
1202 | .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[] | ||
1203 | .LSEH_info_${PREFIX}_encrypt: | ||
1204 | .byte 9,0,0,0 | ||
1205 | .rva se_handler | ||
1206 | .rva .Lenc_body,.Lenc_epilogue # HandlerData[] | ||
1207 | .LSEH_info_${PREFIX}_decrypt: | ||
1208 | .byte 9,0,0,0 | ||
1209 | .rva se_handler | ||
1210 | .rva .Ldec_body,.Ldec_epilogue # HandlerData[] | ||
1211 | .LSEH_info_${PREFIX}_cbc_encrypt: | ||
1212 | .byte 9,0,0,0 | ||
1213 | .rva se_handler | ||
1214 | .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[] | ||
1215 | ___ | ||
1216 | } | ||
1217 | |||
1218 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
1219 | |||
1220 | print $code; | ||
1221 | |||
1222 | close STDOUT; | ||