path: root/src/lib/libcrypto/aes/asm
Diffstat (limited to 'src/lib/libcrypto/aes/asm')
-rw-r--r--	src/lib/libcrypto/aes/asm/aes-586.pl	|   30
-rwxr-xr-x	src/lib/libcrypto/aes/asm/aes-x86_64.pl	|   90
-rw-r--r--	src/lib/libcrypto/aes/asm/bsaes-x86_64.pl	| 3123
-rw-r--r--	src/lib/libcrypto/aes/asm/vpaes-x86.pl	|  911
-rw-r--r--	src/lib/libcrypto/aes/asm/vpaes-x86_64.pl	| 1222
5 files changed, 60 insertions(+), 5316 deletions(-)
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl
index 364099d4d3..402a1a3c46 100644
--- a/src/lib/libcrypto/aes/asm/aes-586.pl
+++ b/src/lib/libcrypto/aes/asm/aes-586.pl
@@ -1158,8 +1158,8 @@ sub enclast()
 	&data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
 	&previous();
 
-# void aes_encrypt_internal(const void *inp, void *out, const AES_KEY *key);
-&function_begin("aes_encrypt_internal");
+# void aes_encrypt_generic(const void *inp, void *out, const AES_KEY *key);
+&function_begin("aes_encrypt_generic");
 	&mov	($acc,&wparam(0));	# load inp
 	&mov	($key,&wparam(2));	# load key
 
@@ -1213,7 +1213,7 @@ sub enclast()
 	&mov	(&DWP(4,$acc),$s1);
 	&mov	(&DWP(8,$acc),$s2);
 	&mov	(&DWP(12,$acc),$s3);
-&function_end("aes_encrypt_internal");
+&function_end("aes_encrypt_generic");
 
 #--------------------------------------------------------------------#
 
@@ -1947,8 +1947,8 @@ sub declast()
 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
 	&previous();
 
-# void aes_decrypt_internal(const void *inp, void *out, const AES_KEY *key);
-&function_begin("aes_decrypt_internal");
+# void aes_decrypt_generic(const void *inp, void *out, const AES_KEY *key);
+&function_begin("aes_decrypt_generic");
 	&mov	($acc,&wparam(0));	# load inp
 	&mov	($key,&wparam(2));	# load key
 
@@ -2002,9 +2002,9 @@ sub declast()
 	&mov	(&DWP(4,$acc),$s1);
 	&mov	(&DWP(8,$acc),$s2);
 	&mov	(&DWP(12,$acc),$s3);
-&function_end("aes_decrypt_internal");
+&function_end("aes_decrypt_generic");
 
-# void aes_cbc_encrypt_internal(const void char *inp, unsigned char *out,
+# void aes_cbc_encrypt_generic(const void char *inp, unsigned char *out,
 #	size_t length, const AES_KEY *key, unsigned char *ivp,const int enc);
 {
 # stack frame layout
@@ -2028,7 +2028,7 @@ my $ivec=&DWP(60,"esp");	# ivec[16]
 my $aes_key=&DWP(76,"esp");	# copy of aes_key
 my $mark=&DWP(76+240,"esp");	# copy of aes_key->rounds
 
-&function_begin("aes_cbc_encrypt_internal");
+&function_begin("aes_cbc_encrypt_generic");
 	&mov	($s2 eq "ecx"? $s2 : "",&wparam(2));	# load len
 	&cmp	($s2,0);
 	&je	(&label("drop_out"));
@@ -2616,7 +2616,7 @@ my $mark=&DWP(76+240,"esp");	# copy of aes_key->rounds
 
 	&mov	("esp",$_esp);
 	&popf	();
-&function_end("aes_cbc_encrypt_internal");
+&function_end("aes_cbc_encrypt_generic");
 }
 
 #------------------------------------------------------------------#
@@ -2849,12 +2849,12 @@ sub enckey()
 	&set_label("exit");
 &function_end("_x86_AES_set_encrypt_key");
 
-# int aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits,
+# int aes_set_encrypt_key_generic(const unsigned char *userKey, const int bits,
 #	AES_KEY *key)
-&function_begin_B("aes_set_encrypt_key_internal");
+&function_begin_B("aes_set_encrypt_key_generic");
 	&call	("_x86_AES_set_encrypt_key");
 	&ret	();
-&function_end_B("aes_set_encrypt_key_internal");
+&function_end_B("aes_set_encrypt_key_generic");
 
 sub deckey()
 { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
@@ -2911,9 +2911,9 @@ sub deckey()
 	&mov	(&DWP(4*$i,$key),$tp1);
 }
 
-# int aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits,
+# int aes_set_decrypt_key_generic(const unsigned char *userKey, const int bits,
 #	AES_KEY *key)
-&function_begin_B("aes_set_decrypt_key_internal");
+&function_begin_B("aes_set_decrypt_key_generic");
 	&call	("_x86_AES_set_encrypt_key");
 	&cmp	("eax",0);
 	&je	(&label("proceed"));
@@ -2969,6 +2969,6 @@ sub deckey()
 	&jb	(&label("permute"));
 
 	&xor	("eax","eax");		# return success
-&function_end("aes_set_decrypt_key_internal");
+&function_end("aes_set_decrypt_key_generic");
 
 &asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
index 324c4a2be2..2c73627546 100755
--- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl
+++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
@@ -586,15 +586,15 @@ $code.=<<___;
 .size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
 ___
 
-# void aes_encrypt_internal(const void *inp, void *out, const AES_KEY *key);
+# void aes_encrypt_generic(const void *inp, void *out, const AES_KEY *key);
 $code.=<<___;
-.globl	aes_encrypt_internal
-.type	aes_encrypt_internal,\@function,3
+.globl	aes_encrypt_generic
+.type	aes_encrypt_generic,\@function,3
 .align	16
 .globl	asm_AES_encrypt
 .hidden	asm_AES_encrypt
 asm_AES_encrypt:
-aes_encrypt_internal:
+aes_encrypt_generic:
 	_CET_ENDBR
 	push	%rbx
 	push	%rbp
@@ -655,7 +655,7 @@ aes_encrypt_internal:
 	lea	48(%rsi),%rsp
 .Lenc_epilogue:
 	ret
-.size	aes_encrypt_internal,.-aes_encrypt_internal
+.size	aes_encrypt_generic,.-aes_encrypt_generic
 ___
 
 #------------------------------------------------------------------#
@@ -1188,15 +1188,15 @@ $code.=<<___;
 .size	_x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
 ___
 
-# void aes_decrypt_internal(const void *inp, void *out, const AES_KEY *key);
+# void aes_decrypt_generic(const void *inp, void *out, const AES_KEY *key);
 $code.=<<___;
-.globl	aes_decrypt_internal
-.type	aes_decrypt_internal,\@function,3
+.globl	aes_decrypt_generic
+.type	aes_decrypt_generic,\@function,3
 .align	16
 .globl	asm_AES_decrypt
 .hidden	asm_AES_decrypt
 asm_AES_decrypt:
-aes_decrypt_internal:
+aes_decrypt_generic:
 	_CET_ENDBR
 	push	%rbx
 	push	%rbp
@@ -1259,7 +1259,7 @@ aes_decrypt_internal:
 	lea	48(%rsi),%rsp
 .Ldec_epilogue:
 	ret
-.size	aes_decrypt_internal,.-aes_decrypt_internal
+.size	aes_decrypt_generic,.-aes_decrypt_generic
 ___
 #------------------------------------------------------------------#
 
@@ -1290,13 +1290,13 @@ $code.=<<___;
 ___
 }
 
-# int aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits,
+# int aes_set_encrypt_key_generic(const unsigned char *userKey, const int bits,
 #	AES_KEY *key)
 $code.=<<___;
-.globl	aes_set_encrypt_key_internal
-.type	aes_set_encrypt_key_internal,\@function,3
+.globl	aes_set_encrypt_key_generic
+.type	aes_set_encrypt_key_generic,\@function,3
 .align	16
-aes_set_encrypt_key_internal:
+aes_set_encrypt_key_generic:
 	_CET_ENDBR
 	push	%rbx
 	push	%rbp
@@ -1318,7 +1318,7 @@ aes_set_encrypt_key_internal:
 	add	\$56,%rsp
 .Lenc_key_epilogue:
 	ret
-.size	aes_set_encrypt_key_internal,.-aes_set_encrypt_key_internal
+.size	aes_set_encrypt_key_generic,.-aes_set_encrypt_key_generic
 
 .type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
 .align	16
@@ -1562,13 +1562,13 @@ $code.=<<___;
 ___
 }
 
-# int aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits,
+# int aes_set_decrypt_key_generic(const unsigned char *userKey, const int bits,
 #	AES_KEY *key)
 $code.=<<___;
-.globl	aes_set_decrypt_key_internal
-.type	aes_set_decrypt_key_internal,\@function,3
+.globl	aes_set_decrypt_key_generic
+.type	aes_set_decrypt_key_generic,\@function,3
 .align	16
-aes_set_decrypt_key_internal:
+aes_set_decrypt_key_generic:
 	_CET_ENDBR
 	push	%rbx
 	push	%rbp
@@ -1638,10 +1638,10 @@ $code.=<<___;
 	add	\$56,%rsp
 .Ldec_key_epilogue:
 	ret
-.size	aes_set_decrypt_key_internal,.-aes_set_decrypt_key_internal
+.size	aes_set_decrypt_key_generic,.-aes_set_decrypt_key_generic
 ___
 
-# void aes_cbc_encrypt_internal(const void char *inp, unsigned char *out,
+# void aes_cbc_encrypt_generic(const void char *inp, unsigned char *out,
 #	size_t length, const AES_KEY *key, unsigned char *ivp,const int enc);
 {
 # stack frame layout
@@ -1659,15 +1659,15 @@ my $aes_key="80(%rsp)";		# copy of aes_key
 my $mark="80+240(%rsp)";	# copy of aes_key->rounds
 
 $code.=<<___;
-.globl	aes_cbc_encrypt_internal
-.type	aes_cbc_encrypt_internal,\@function,6
+.globl	aes_cbc_encrypt_generic
+.type	aes_cbc_encrypt_generic,\@function,6
 .align	16
 .extern	OPENSSL_ia32cap_P
 .hidden	OPENSSL_ia32cap_P
 .globl	asm_AES_cbc_encrypt
 .hidden	asm_AES_cbc_encrypt
 asm_AES_cbc_encrypt:
-aes_cbc_encrypt_internal:
+aes_cbc_encrypt_generic:
 	_CET_ENDBR
 	cmp	\$0,%rdx	# check length
 	je	.Lcbc_epilogue
@@ -2117,7 +2117,7 @@ aes_cbc_encrypt_internal:
 	popfq
 .Lcbc_epilogue:
 	ret
-.size	aes_cbc_encrypt_internal,.-aes_cbc_encrypt_internal
+.size	aes_cbc_encrypt_generic,.-aes_cbc_encrypt_generic
 ___
 }
 
@@ -2782,45 +2782,45 @@ cbc_se_handler:
 
 .section	.pdata
 .align	4
-	.rva	.LSEH_begin_aes_encrypt_internal
-	.rva	.LSEH_end_aes_encrypt_internal
-	.rva	.LSEH_info_aes_encrypt_internal
+	.rva	.LSEH_begin_aes_encrypt_generic
+	.rva	.LSEH_end_aes_encrypt_generic
+	.rva	.LSEH_info_aes_encrypt_generic
 
-	.rva	.LSEH_begin_aes_decrypt_internal
-	.rva	.LSEH_end_aes_decrypt_internal
-	.rva	.LSEH_info_aes_decrypt_internal
+	.rva	.LSEH_begin_aes_decrypt_generic
+	.rva	.LSEH_end_aes_decrypt_generic
+	.rva	.LSEH_info_aes_decrypt_generic
 
-	.rva	.LSEH_begin_aes_set_encrypt_key_internal
-	.rva	.LSEH_end_aes_set_encrypt_key_internal
-	.rva	.LSEH_info_aes_set_encrypt_key_internal
+	.rva	.LSEH_begin_aes_set_encrypt_key_generic
+	.rva	.LSEH_end_aes_set_encrypt_key_generic
+	.rva	.LSEH_info_aes_set_encrypt_key_generic
 
-	.rva	.LSEH_begin_aes_set_decrypt_key_internal
-	.rva	.LSEH_end_aes_set_decrypt_key_internal
-	.rva	.LSEH_info_aes_set_decrypt_key_internal
+	.rva	.LSEH_begin_aes_set_decrypt_key_generic
+	.rva	.LSEH_end_aes_set_decrypt_key_generic
+	.rva	.LSEH_info_aes_set_decrypt_key_generic
 
-	.rva	.LSEH_begin_aes_cbc_encrypt_internal
-	.rva	.LSEH_end_aes_cbc_encrypt_internal
-	.rva	.LSEH_info_aes_cbc_encrypt_internal
+	.rva	.LSEH_begin_aes_cbc_encrypt_generic
+	.rva	.LSEH_end_aes_cbc_encrypt_generic
+	.rva	.LSEH_info_aes_cbc_encrypt_generic
 
 .section	.xdata
 .align	8
-.LSEH_info_aes_encrypt_internal:
+.LSEH_info_aes_encrypt_generic:
 	.byte	9,0,0,0
 	.rva	block_se_handler
 	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
-.LSEH_info_aes_decrypt_internal:
+.LSEH_info_aes_decrypt_generic:
 	.byte	9,0,0,0
 	.rva	block_se_handler
 	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
-.LSEH_info_aes_set_encrypt_key_internal:
+.LSEH_info_aes_set_encrypt_key_generic:
 	.byte	9,0,0,0
 	.rva	key_se_handler
 	.rva	.Lenc_key_prologue,.Lenc_key_epilogue	# HandlerData[]
-.LSEH_info_aes_set_decrypt_key_internal:
+.LSEH_info_aes_set_decrypt_key_generic:
 	.byte	9,0,0,0
 	.rva	key_se_handler
 	.rva	.Ldec_key_prologue,.Ldec_key_epilogue	# HandlerData[]
-.LSEH_info_aes_cbc_encrypt_internal:
+.LSEH_info_aes_cbc_encrypt_generic:
 	.byte	9,0,0,0
 	.rva	cbc_se_handler
 ___
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
deleted file mode 100644
index c44a338114..0000000000
--- a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
+++ /dev/null
@@ -1,3123 +0,0 @@
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode] ###
5### bitsliced implementation for Intel Core 2 processors ###
6### requires support of SSE extensions up to SSSE3 ###
7### Author: Emilia Käsper and Peter Schwabe ###
8### Date: 2009-03-19 ###
9### Public domain ###
10### ###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12### further information. ###
13###################################################################
14#
15# September 2011.
16#
17# Started as a transliteration to "perlasm", the original code has
18# undergone the following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22# from 12.5KB to 2.2KB;
23# - above was possible thanks to mixcolumns() modification that
24# allowed feeding its output back to aesenc[last]; this was
25# achieved at the cost of two additional inter-register moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement key setup subroutine, instead it
28# relies on conversion of "conventional" key schedule as returned
29# by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which made it
31# possible to skip one shiftrows(), reduce the bit-sliced key
32# schedule and speed up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38# Emilia's this(*) difference
39#
40# Core 2 9.30 8.69 +7%
41# Nehalem(**) 7.63 6.98 +9%
42# Atom 17.1 17.4 -2%(***)
43#
44# (*) Comparison is not completely fair, because "this" is ECB,
45# i.e. no extra processing such as counter value calculation
46# and xor-ing of input, as in Emilia's CTR implementation, is
47# performed. However, the CTR calculations account for no more
48# than 1% of total time, so the comparison is *rather* fair.
49#
50# (**) Results were collected on Westmere, which is considered to
51# be equivalent to Nehalem for this code.
52#
53# (***) The slowdown on Atom is rather strange per se, because the
54# original implementation has a number of 9+-byte instructions,
55# which are bad for the Atom front-end, and which I eliminated
56# completely. In an attempt to address the deterioration, sbox()
57# was tested in the FP SIMD "domain" (movaps instead of movdqa,
58# xorps instead of pxor, etc.). While it resulted in a nominal 4%
59# improvement on Atom, it hurt Westmere by more than a 2x factor.
60#
61# As for the key schedule conversion subroutine: the OpenSSL interface
62# relies on per-invocation on-the-fly conversion. This naturally
63# has an impact on performance, especially for short inputs. Conversion
64# time in CPU cycles and its ratio to CPU cycles spent in 8x block
65# function is:
66#
67# conversion conversion/8x block
68# Core 2 240 0.22
69# Nehalem 180 0.20
70# Atom 430 0.19
71#
72# The ratio values mean that 128-byte blocks will be processed
73# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74# etc. Then keep in mind that input sizes not divisible by 128 are
75# *effectively* slower, especially the shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78# it's still faster than ["hyper-threading-safe" code path in]
79# aes-x86_64.pl on all lengths above 64 bytes...
80#
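# (Sanity check of the ranges above: an invocation on 8 blocks, i.e.
# 128 bytes, costs one key conversion plus one 8x-block call, so the
# overhead is r/(1+r) -- from 0.19/1.19 = ~16% on Atom up to
# 0.22/1.22 = ~18% on Core 2; for 256 bytes it is r/(2+r) = ~9-10%,
# matching the figures quoted.)
#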
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2 9.83
87# Nehalem 7.74
88# Atom 19.0
89#
90# November 2011.
91#
92# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93# suboptimal, but XTS is meant to be used with larger blocks...
94#
95# <appro@openssl.org>
96
97$flavour = shift;
98$output = shift;
99if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100
101$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102
103$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106die "can't locate x86_64-xlate.pl";
107
108open OUT,"| \"$^X\" $xlate $flavour $output";
109*STDOUT=*OUT;
110
111my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
112my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
113my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
114
115{
116my ($key,$rounds,$const)=("%rax","%r10d","%r11");
117
118sub Sbox {
119# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
120# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
121my @b=@_[0..7];
122my @t=@_[8..11];
123my @s=@_[12..15];
124 &InBasisChange (@b);
125 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
126 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
127}
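# (Structure of Sbox, for orientation: InBasisChange moves the eight
# bit-planes into the basis used by the inverter, Inv_GF256 computes
# the GF(2^8) inversion via GF(2^2)/GF(2^4) tower arithmetic (see
# Mul_GF4 and friends below), and OutBasisChange changes back, giving
# the AES S-box on all 128 state bytes at once. The S-box affine
# constant 0x63 is not applied here -- it appears to be folded into
# the round keys by _bsaes_key_convert further down.)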
128
129sub InBasisChange {
130# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
131# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
132my @b=@_[0..7];
133$code.=<<___;
134 pxor @b[6], @b[5]
135 pxor @b[1], @b[2]
136 pxor @b[0], @b[3]
137 pxor @b[2], @b[6]
138 pxor @b[0], @b[5]
139
140 pxor @b[3], @b[6]
141 pxor @b[7], @b[3]
142 pxor @b[5], @b[7]
143 pxor @b[4], @b[3]
144 pxor @b[5], @b[4]
145 pxor @b[1], @b[3]
146
147 pxor @b[7], @b[2]
148 pxor @b[5], @b[1]
149___
150}
151
152sub OutBasisChange {
153# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
154# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
155my @b=@_[0..7];
156$code.=<<___;
157 pxor @b[6], @b[0]
158 pxor @b[4], @b[1]
159 pxor @b[0], @b[2]
160 pxor @b[6], @b[4]
161 pxor @b[1], @b[6]
162
163 pxor @b[5], @b[1]
164 pxor @b[3], @b[5]
165 pxor @b[7], @b[3]
166 pxor @b[5], @b[7]
167 pxor @b[5], @b[2]
168
169 pxor @b[7], @b[4]
170___
171}
172
173sub InvSbox {
174# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
175# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
176my @b=@_[0..7];
177my @t=@_[8..11];
178my @s=@_[12..15];
179 &InvInBasisChange (@b);
180 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
181 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
182}
183
184sub InvInBasisChange { # OutBasisChange in reverse
185my @b=@_[5,1,2,6,3,7,0,4];
186$code.=<<___
187 pxor @b[7], @b[4]
188
189 pxor @b[5], @b[7]
190 pxor @b[5], @b[2]
191 pxor @b[7], @b[3]
192 pxor @b[3], @b[5]
193 pxor @b[5], @b[1]
194
195 pxor @b[1], @b[6]
196 pxor @b[0], @b[2]
197 pxor @b[6], @b[4]
198 pxor @b[6], @b[0]
199 pxor @b[4], @b[1]
200___
201}
202
203sub InvOutBasisChange { # InBasisChange in reverse
204my @b=@_[2,5,7,3,6,1,0,4];
205$code.=<<___;
206 pxor @b[5], @b[1]
207 pxor @b[7], @b[2]
208
209 pxor @b[1], @b[3]
210 pxor @b[5], @b[4]
211 pxor @b[5], @b[7]
212 pxor @b[4], @b[3]
213 pxor @b[0], @b[5]
214 pxor @b[7], @b[3]
215 pxor @b[2], @b[6]
216 pxor @b[1], @b[2]
217 pxor @b[3], @b[6]
218
219 pxor @b[0], @b[3]
220 pxor @b[6], @b[5]
221___
222}
223
224sub Mul_GF4 {
225#;*************************************************************
226#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
227#;*************************************************************
228my ($x0,$x1,$y0,$y1,$t0)=@_;
229$code.=<<___;
230 movdqa $y0, $t0
231 pxor $y1, $t0
232 pand $x0, $t0
233 pxor $x1, $x0
234 pand $y0, $x1
235 pand $y1, $x0
236 pxor $x1, $x0
237 pxor $t0, $x1
238___
239}
240
241sub Mul_GF4_N { # not used, see next subroutine
242# multiply and scale by N
243my ($x0,$x1,$y0,$y1,$t0)=@_;
244$code.=<<___;
245 movdqa $y0, $t0
246 pxor $y1, $t0
247 pand $x0, $t0
248 pxor $x1, $x0
249 pand $y0, $x1
250 pand $y1, $x0
251 pxor $x0, $x1
252 pxor $t0, $x0
253___
254}
255
256sub Mul_GF4_N_GF4 {
257# interleaved Mul_GF4_N and Mul_GF4
258my ($x0,$x1,$y0,$y1,$t0,
259 $x2,$x3,$y2,$y3,$t1)=@_;
260$code.=<<___;
261 movdqa $y0, $t0
262 movdqa $y2, $t1
263 pxor $y1, $t0
264 pxor $y3, $t1
265 pand $x0, $t0
266 pand $x2, $t1
267 pxor $x1, $x0
268 pxor $x3, $x2
269 pand $y0, $x1
270 pand $y2, $x3
271 pand $y1, $x0
272 pand $y3, $x2
273 pxor $x0, $x1
274 pxor $x3, $x2
275 pxor $t0, $x0
276 pxor $t1, $x3
277___
278}
279sub Mul_GF16_2 {
280my @x=@_[0..7];
281my @y=@_[8..11];
282my @t=@_[12..15];
283$code.=<<___;
284 movdqa @x[0], @t[0]
285 movdqa @x[1], @t[1]
286___
287 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
288$code.=<<___;
289 pxor @x[2], @t[0]
290 pxor @x[3], @t[1]
291 pxor @y[2], @y[0]
292 pxor @y[3], @y[1]
293___
294 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
295 @x[2], @x[3], @y[2], @y[3], @t[2]);
296$code.=<<___;
297 pxor @t[0], @x[0]
298 pxor @t[0], @x[2]
299 pxor @t[1], @x[1]
300 pxor @t[1], @x[3]
301
302 movdqa @x[4], @t[0]
303 movdqa @x[5], @t[1]
304 pxor @x[6], @t[0]
305 pxor @x[7], @t[1]
306___
307 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
308 @x[6], @x[7], @y[2], @y[3], @t[2]);
309$code.=<<___;
310 pxor @y[2], @y[0]
311 pxor @y[3], @y[1]
312___
313 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
314$code.=<<___;
315 pxor @t[0], @x[4]
316 pxor @t[0], @x[6]
317 pxor @t[1], @x[5]
318 pxor @t[1], @x[7]
319___
320}
321sub Inv_GF256 {
322#;********************************************************************
323#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
324#;********************************************************************
325my @x=@_[0..7];
326my @t=@_[8..11];
327my @s=@_[12..15];
328# direct optimizations from hardware
329$code.=<<___;
330 movdqa @x[4], @t[3]
331 movdqa @x[5], @t[2]
332 movdqa @x[1], @t[1]
333 movdqa @x[7], @s[1]
334 movdqa @x[0], @s[0]
335
336 pxor @x[6], @t[3]
337 pxor @x[7], @t[2]
338 pxor @x[3], @t[1]
339 movdqa @t[3], @s[2]
340 pxor @x[6], @s[1]
341 movdqa @t[2], @t[0]
342 pxor @x[2], @s[0]
343 movdqa @t[3], @s[3]
344
345 por @t[1], @t[2]
346 por @s[0], @t[3]
347 pxor @t[0], @s[3]
348 pand @s[0], @s[2]
349 pxor @t[1], @s[0]
350 pand @t[1], @t[0]
351 pand @s[0], @s[3]
352 movdqa @x[3], @s[0]
353 pxor @x[2], @s[0]
354 pand @s[0], @s[1]
355 pxor @s[1], @t[3]
356 pxor @s[1], @t[2]
357 movdqa @x[4], @s[1]
358 movdqa @x[1], @s[0]
359 pxor @x[5], @s[1]
360 pxor @x[0], @s[0]
361 movdqa @s[1], @t[1]
362 pand @s[0], @s[1]
363 por @s[0], @t[1]
364 pxor @s[1], @t[0]
365 pxor @s[3], @t[3]
366 pxor @s[2], @t[2]
367 pxor @s[3], @t[1]
368 movdqa @x[7], @s[0]
369 pxor @s[2], @t[0]
370 movdqa @x[6], @s[1]
371 pxor @s[2], @t[1]
372 movdqa @x[5], @s[2]
373 pand @x[3], @s[0]
374 movdqa @x[4], @s[3]
375 pand @x[2], @s[1]
376 pand @x[1], @s[2]
377 por @x[0], @s[3]
378 pxor @s[0], @t[3]
379 pxor @s[1], @t[2]
380 pxor @s[2], @t[1]
381 pxor @s[3], @t[0]
382
383 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
384
385 # new smaller inversion
386
387 movdqa @t[3], @s[0]
388 pand @t[1], @t[3]
389 pxor @t[2], @s[0]
390
391 movdqa @t[0], @s[2]
392 movdqa @s[0], @s[3]
393 pxor @t[3], @s[2]
394 pand @s[2], @s[3]
395
396 movdqa @t[1], @s[1]
397 pxor @t[2], @s[3]
398 pxor @t[0], @s[1]
399
400 pxor @t[2], @t[3]
401
402 pand @t[3], @s[1]
403
404 movdqa @s[2], @t[2]
405 pxor @t[0], @s[1]
406
407 pxor @s[1], @t[2]
408 pxor @s[1], @t[1]
409
410 pand @t[0], @t[2]
411
412 pxor @t[2], @s[2]
413 pxor @t[2], @t[1]
414
415 pand @s[3], @s[2]
416
417 pxor @s[0], @s[2]
418___
419# output in s3, s2, s1, t1
420
421# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
422
423# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
424 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
425
426### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
427}
428
429# AES linear components
430
431sub ShiftRows {
432my @x=@_[0..7];
433my $mask=pop;
434$code.=<<___;
435 pxor 0x00($key),@x[0]
436 pxor 0x10($key),@x[1]
437 pshufb $mask,@x[0]
438 pxor 0x20($key),@x[2]
439 pshufb $mask,@x[1]
440 pxor 0x30($key),@x[3]
441 pshufb $mask,@x[2]
442 pxor 0x40($key),@x[4]
443 pshufb $mask,@x[3]
444 pxor 0x50($key),@x[5]
445 pshufb $mask,@x[4]
446 pxor 0x60($key),@x[6]
447 pshufb $mask,@x[5]
448 pxor 0x70($key),@x[7]
449 pshufb $mask,@x[6]
450 lea 0x80($key),$key
451 pshufb $mask,@x[7]
452___
453}
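# (Note that this ShiftRows also performs AddRoundKey: each slice is
# first xored with its 16 bytes of the bit-sliced round key, then the
# pshufb applies the row rotation, and $key advances by 0x80 -- the
# size of one bit-sliced round key.)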
454
455sub MixColumns {
456# modified to emit output in order suitable for feeding back to aesenc[last]
457my @x=@_[0..7];
458my @t=@_[8..15];
459my $inv=@_[16]; # optional
460$code.=<<___;
461 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
462 pshufd \$0x93, @x[1], @t[1]
463 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
464 pshufd \$0x93, @x[2], @t[2]
465 pxor @t[1], @x[1]
466 pshufd \$0x93, @x[3], @t[3]
467 pxor @t[2], @x[2]
468 pshufd \$0x93, @x[4], @t[4]
469 pxor @t[3], @x[3]
470 pshufd \$0x93, @x[5], @t[5]
471 pxor @t[4], @x[4]
472 pshufd \$0x93, @x[6], @t[6]
473 pxor @t[5], @x[5]
474 pshufd \$0x93, @x[7], @t[7]
475 pxor @t[6], @x[6]
476 pxor @t[7], @x[7]
477
478 pxor @x[0], @t[1]
479 pxor @x[7], @t[0]
480 pxor @x[7], @t[1]
481 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
482 pxor @x[1], @t[2]
483 pshufd \$0x4E, @x[1], @x[1]
484 pxor @x[4], @t[5]
485 pxor @t[0], @x[0]
486 pxor @x[5], @t[6]
487 pxor @t[1], @x[1]
488 pxor @x[3], @t[4]
489 pshufd \$0x4E, @x[4], @t[0]
490 pxor @x[6], @t[7]
491 pshufd \$0x4E, @x[5], @t[1]
492 pxor @x[2], @t[3]
493 pshufd \$0x4E, @x[3], @x[4]
494 pxor @x[7], @t[3]
495 pshufd \$0x4E, @x[7], @x[5]
496 pxor @x[7], @t[4]
497 pshufd \$0x4E, @x[6], @x[3]
498 pxor @t[4], @t[0]
499 pshufd \$0x4E, @x[2], @x[6]
500 pxor @t[5], @t[1]
501___
502$code.=<<___ if (!$inv);
503 pxor @t[3], @x[4]
504 pxor @t[7], @x[5]
505 pxor @t[6], @x[3]
506 movdqa @t[0], @x[2]
507 pxor @t[2], @x[6]
508 movdqa @t[1], @x[7]
509___
510$code.=<<___ if ($inv);
511 pxor @x[4], @t[3]
512 pxor @t[7], @x[5]
513 pxor @x[3], @t[6]
514 movdqa @t[0], @x[3]
515 pxor @t[2], @x[6]
516 movdqa @t[6], @x[2]
517 movdqa @t[1], @x[7]
518 movdqa @x[6], @x[4]
519 movdqa @t[3], @x[6]
520___
521}
522
523sub InvMixColumns_orig {
524my @x=@_[0..7];
525my @t=@_[8..15];
526
527$code.=<<___;
528 # multiplication by 0x0e
529 pshufd \$0x93, @x[7], @t[7]
530 movdqa @x[2], @t[2]
531 pxor @x[5], @x[7] # 7 5
532 pxor @x[5], @x[2] # 2 5
533 pshufd \$0x93, @x[0], @t[0]
534 movdqa @x[5], @t[5]
535 pxor @x[0], @x[5] # 5 0 [1]
536 pxor @x[1], @x[0] # 0 1
537 pshufd \$0x93, @x[1], @t[1]
538 pxor @x[2], @x[1] # 1 25
539 pxor @x[6], @x[0] # 01 6 [2]
540 pxor @x[3], @x[1] # 125 3 [4]
541 pshufd \$0x93, @x[3], @t[3]
542 pxor @x[0], @x[2] # 25 016 [3]
543 pxor @x[7], @x[3] # 3 75
544 pxor @x[6], @x[7] # 75 6 [0]
545 pshufd \$0x93, @x[6], @t[6]
546 movdqa @x[4], @t[4]
547 pxor @x[4], @x[6] # 6 4
548 pxor @x[3], @x[4] # 4 375 [6]
549 pxor @x[7], @x[3] # 375 756=36
550 pxor @t[5], @x[6] # 64 5 [7]
551 pxor @t[2], @x[3] # 36 2
552 pxor @t[4], @x[3] # 362 4 [5]
553 pshufd \$0x93, @t[5], @t[5]
554___
555 my @y = @x[7,5,0,2,1,3,4,6];
556$code.=<<___;
557 # multiplication by 0x0b
558 pxor @y[0], @y[1]
559 pxor @t[0], @y[0]
560 pxor @t[1], @y[1]
561 pshufd \$0x93, @t[2], @t[2]
562 pxor @t[5], @y[0]
563 pxor @t[6], @y[1]
564 pxor @t[7], @y[0]
565 pshufd \$0x93, @t[4], @t[4]
566 pxor @t[6], @t[7] # clobber t[7]
567 pxor @y[0], @y[1]
568
569 pxor @t[0], @y[3]
570 pshufd \$0x93, @t[0], @t[0]
571 pxor @t[1], @y[2]
572 pxor @t[1], @y[4]
573 pxor @t[2], @y[2]
574 pshufd \$0x93, @t[1], @t[1]
575 pxor @t[2], @y[3]
576 pxor @t[2], @y[5]
577 pxor @t[7], @y[2]
578 pshufd \$0x93, @t[2], @t[2]
579 pxor @t[3], @y[3]
580 pxor @t[3], @y[6]
581 pxor @t[3], @y[4]
582 pshufd \$0x93, @t[3], @t[3]
583 pxor @t[4], @y[7]
584 pxor @t[4], @y[5]
585 pxor @t[7], @y[7]
586 pxor @t[5], @y[3]
587 pxor @t[4], @y[4]
588 pxor @t[5], @t[7] # clobber t[7] even more
589
590 pxor @t[7], @y[5]
591 pshufd \$0x93, @t[4], @t[4]
592 pxor @t[7], @y[6]
593 pxor @t[7], @y[4]
594
595 pxor @t[5], @t[7]
596 pshufd \$0x93, @t[5], @t[5]
597 pxor @t[6], @t[7] # restore t[7]
598
599 # multiplication by 0x0d
600 pxor @y[7], @y[4]
601 pxor @t[4], @y[7]
602 pshufd \$0x93, @t[6], @t[6]
603 pxor @t[0], @y[2]
604 pxor @t[5], @y[7]
605 pxor @t[2], @y[2]
606 pshufd \$0x93, @t[7], @t[7]
607
608 pxor @y[1], @y[3]
609 pxor @t[1], @y[1]
610 pxor @t[0], @y[0]
611 pxor @t[0], @y[3]
612 pxor @t[5], @y[1]
613 pxor @t[5], @y[0]
614 pxor @t[7], @y[1]
615 pshufd \$0x93, @t[0], @t[0]
616 pxor @t[6], @y[0]
617 pxor @y[1], @y[3]
618 pxor @t[1], @y[4]
619 pshufd \$0x93, @t[1], @t[1]
620
621 pxor @t[7], @y[7]
622 pxor @t[2], @y[4]
623 pxor @t[2], @y[5]
624 pshufd \$0x93, @t[2], @t[2]
625 pxor @t[6], @y[2]
626 pxor @t[3], @t[6] # clobber t[6]
627 pxor @y[7], @y[4]
628 pxor @t[6], @y[3]
629
630 pxor @t[6], @y[6]
631 pxor @t[5], @y[5]
632 pxor @t[4], @y[6]
633 pshufd \$0x93, @t[4], @t[4]
634 pxor @t[6], @y[5]
635 pxor @t[7], @y[6]
636 pxor @t[3], @t[6] # restore t[6]
637
638 pshufd \$0x93, @t[5], @t[5]
639 pshufd \$0x93, @t[6], @t[6]
640 pshufd \$0x93, @t[7], @t[7]
641 pshufd \$0x93, @t[3], @t[3]
642
643 # multiplication by 0x09
644 pxor @y[1], @y[4]
645 pxor @y[1], @t[1] # t[1]=y[1]
646 pxor @t[5], @t[0] # clobber t[0]
647 pxor @t[5], @t[1]
648 pxor @t[0], @y[3]
649 pxor @y[0], @t[0] # t[0]=y[0]
650 pxor @t[6], @t[1]
651 pxor @t[7], @t[6] # clobber t[6]
652 pxor @t[1], @y[4]
653 pxor @t[4], @y[7]
654 pxor @y[4], @t[4] # t[4]=y[4]
655 pxor @t[3], @y[6]
656 pxor @y[3], @t[3] # t[3]=y[3]
657 pxor @t[2], @y[5]
658 pxor @y[2], @t[2] # t[2]=y[2]
659 pxor @t[7], @t[3]
660 pxor @y[5], @t[5] # t[5]=y[5]
661 pxor @t[6], @t[2]
662 pxor @t[6], @t[5]
663 pxor @y[6], @t[6] # t[6]=y[6]
664 pxor @y[7], @t[7] # t[7]=y[7]
665
666 movdqa @t[0],@XMM[0]
667 movdqa @t[1],@XMM[1]
668 movdqa @t[2],@XMM[2]
669 movdqa @t[3],@XMM[3]
670 movdqa @t[4],@XMM[4]
671 movdqa @t[5],@XMM[5]
672 movdqa @t[6],@XMM[6]
673 movdqa @t[7],@XMM[7]
674___
675}
676
677sub InvMixColumns {
678my @x=@_[0..7];
679my @t=@_[8..15];
680
681# Thanks to Jussi Kivilinna for providing pointer to
682#
683# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
684# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
685# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
686# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
687
688$code.=<<___;
689 # multiplication by 0x05-0x00-0x04-0x00
690 pshufd \$0x4E, @x[0], @t[0]
691 pshufd \$0x4E, @x[6], @t[6]
692 pxor @x[0], @t[0]
693 pshufd \$0x4E, @x[7], @t[7]
694 pxor @x[6], @t[6]
695 pshufd \$0x4E, @x[1], @t[1]
696 pxor @x[7], @t[7]
697 pshufd \$0x4E, @x[2], @t[2]
698 pxor @x[1], @t[1]
699 pshufd \$0x4E, @x[3], @t[3]
700 pxor @x[2], @t[2]
701 pxor @t[6], @x[0]
702 pxor @t[6], @x[1]
703 pshufd \$0x4E, @x[4], @t[4]
704 pxor @x[3], @t[3]
705 pxor @t[0], @x[2]
706 pxor @t[1], @x[3]
707 pshufd \$0x4E, @x[5], @t[5]
708 pxor @x[4], @t[4]
709 pxor @t[7], @x[1]
710 pxor @t[2], @x[4]
711 pxor @x[5], @t[5]
712
713 pxor @t[7], @x[2]
714 pxor @t[6], @x[3]
715 pxor @t[6], @x[4]
716 pxor @t[3], @x[5]
717 pxor @t[4], @x[6]
718 pxor @t[7], @x[4]
719 pxor @t[7], @x[5]
720 pxor @t[5], @x[7]
721___
722 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
723}
724
725sub aesenc { # not used
726my @b=@_[0..7];
727my @t=@_[8..15];
728$code.=<<___;
729 movdqa 0x30($const),@t[0] # .LSR
730___
731 &ShiftRows (@b,@t[0]);
732 &Sbox (@b,@t);
733 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
734}
735
736sub aesenclast { # not used
737my @b=@_[0..7];
738my @t=@_[8..15];
739$code.=<<___;
740 movdqa 0x40($const),@t[0] # .LSRM0
741___
742 &ShiftRows (@b,@t[0]);
743 &Sbox (@b,@t);
744$code.=<<___
745 pxor 0x00($key),@b[0]
746 pxor 0x10($key),@b[1]
747 pxor 0x20($key),@b[4]
748 pxor 0x30($key),@b[6]
749 pxor 0x40($key),@b[3]
750 pxor 0x50($key),@b[7]
751 pxor 0x60($key),@b[2]
752 pxor 0x70($key),@b[5]
753___
754}
755
756sub swapmove {
757my ($a,$b,$n,$mask,$t)=@_;
758$code.=<<___;
759 movdqa $b,$t
760 psrlq \$$n,$b
761 pxor $a,$b
762 pand $mask,$b
763 pxor $b,$a
764 psllq \$$n,$b
765 pxor $t,$b
766___
767}
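# (swapmove is the classic "delta swap": it exchanges the bits of $a
# selected by $mask with the bits of $b sitting $n positions to the
# left. A one-lane plain-Perl model, illustrative only:
#
#	sub swapmove_model {
#		my ($a, $b, $n, $mask) = @_;
#		my $x = ($a ^ ($b >> $n)) & $mask;
#		return ($a ^ $x, $b ^ ($x << $n));	# swapped pair
#	}
#
# swapmove2x below is simply two such swaps interleaved to hide
# instruction latency.)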
768sub swapmove2x {
769my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
770$code.=<<___;
771 movdqa $b0,$t0
772 psrlq \$$n,$b0
773 movdqa $b1,$t1
774 psrlq \$$n,$b1
775 pxor $a0,$b0
776 pxor $a1,$b1
777 pand $mask,$b0
778 pand $mask,$b1
779 pxor $b0,$a0
780 psllq \$$n,$b0
781 pxor $b1,$a1
782 psllq \$$n,$b1
783 pxor $t0,$b0
784 pxor $t1,$b1
785___
786}
787
788sub bitslice {
789my @x=reverse(@_[0..7]);
790my ($t0,$t1,$t2,$t3)=@_[8..11];
791$code.=<<___;
792 movdqa 0x00($const),$t0 # .LBS0
793 movdqa 0x10($const),$t1 # .LBS1
794___
795 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
796 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
797$code.=<<___;
798 movdqa 0x20($const),$t0 # .LBS2
799___
800 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
801 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
802
803 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
804 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
805}
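# (Taken together, the three swapmove passes with shift distances 1, 2
# and 4 -- the .LBS0/.LBS1/.LBS2 masks -- amount to transposing an 8x8
# bit matrix for every byte position: afterwards each register holds
# one bit-plane, i.e. one bit position of every input byte. The
# transpose is an involution, which is why the same routine also
# converts the planes back after the last round.)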
806
807$code.=<<___;
808.text
809
810.extern asm_AES_encrypt
811.extern asm_AES_decrypt
812
813.type _bsaes_encrypt8,\@abi-omnipotent
814.align 64
815_bsaes_encrypt8:
816 _CET_ENDBR
817 lea .LBS0(%rip), $const # constants table
818
819 movdqa ($key), @XMM[9] # round 0 key
820 lea 0x10($key), $key
821 movdqa 0x50($const), @XMM[8] # .LM0SR
822 pxor @XMM[9], @XMM[0] # xor with round0 key
823 pxor @XMM[9], @XMM[1]
824 pshufb @XMM[8], @XMM[0]
825 pxor @XMM[9], @XMM[2]
826 pshufb @XMM[8], @XMM[1]
827 pxor @XMM[9], @XMM[3]
828 pshufb @XMM[8], @XMM[2]
829 pxor @XMM[9], @XMM[4]
830 pshufb @XMM[8], @XMM[3]
831 pxor @XMM[9], @XMM[5]
832 pshufb @XMM[8], @XMM[4]
833 pxor @XMM[9], @XMM[6]
834 pshufb @XMM[8], @XMM[5]
835 pxor @XMM[9], @XMM[7]
836 pshufb @XMM[8], @XMM[6]
837 pshufb @XMM[8], @XMM[7]
838_bsaes_encrypt8_bitslice:
839___
840 &bitslice (@XMM[0..7, 8..11]);
841$code.=<<___;
842 dec $rounds
843 jmp .Lenc_sbox
844.align 16
845.Lenc_loop:
846___
847 &ShiftRows (@XMM[0..7, 8]);
848$code.=".Lenc_sbox:\n";
849 &Sbox (@XMM[0..7, 8..15]);
850$code.=<<___;
851 dec $rounds
852 jl .Lenc_done
853___
854 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
855$code.=<<___;
856 movdqa 0x30($const), @XMM[8] # .LSR
857 jnz .Lenc_loop
858 movdqa 0x40($const), @XMM[8] # .LSRM0
859 jmp .Lenc_loop
860.align 16
861.Lenc_done:
862___
863 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
864 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
865$code.=<<___;
866 movdqa ($key), @XMM[8] # last round key
867 pxor @XMM[8], @XMM[4]
868 pxor @XMM[8], @XMM[6]
869 pxor @XMM[8], @XMM[3]
870 pxor @XMM[8], @XMM[7]
871 pxor @XMM[8], @XMM[2]
872 pxor @XMM[8], @XMM[5]
873 pxor @XMM[8], @XMM[0]
874 pxor @XMM[8], @XMM[1]
875 ret
876.size _bsaes_encrypt8,.-_bsaes_encrypt8
877
878.type _bsaes_decrypt8,\@abi-omnipotent
879.align 64
880_bsaes_decrypt8:
881 _CET_ENDBR
882 lea .LBS0(%rip), $const # constants table
883
884 movdqa ($key), @XMM[9] # round 0 key
885 lea 0x10($key), $key
886 movdqa -0x30($const), @XMM[8] # .LM0ISR
887 pxor @XMM[9], @XMM[0] # xor with round0 key
888 pxor @XMM[9], @XMM[1]
889 pshufb @XMM[8], @XMM[0]
890 pxor @XMM[9], @XMM[2]
891 pshufb @XMM[8], @XMM[1]
892 pxor @XMM[9], @XMM[3]
893 pshufb @XMM[8], @XMM[2]
894 pxor @XMM[9], @XMM[4]
895 pshufb @XMM[8], @XMM[3]
896 pxor @XMM[9], @XMM[5]
897 pshufb @XMM[8], @XMM[4]
898 pxor @XMM[9], @XMM[6]
899 pshufb @XMM[8], @XMM[5]
900 pxor @XMM[9], @XMM[7]
901 pshufb @XMM[8], @XMM[6]
902 pshufb @XMM[8], @XMM[7]
903___
904 &bitslice (@XMM[0..7, 8..11]);
905$code.=<<___;
906 dec $rounds
907 jmp .Ldec_sbox
908.align 16
909.Ldec_loop:
910___
911 &ShiftRows (@XMM[0..7, 8]);
912$code.=".Ldec_sbox:\n";
913 &InvSbox (@XMM[0..7, 8..15]);
914$code.=<<___;
915 dec $rounds
916 jl .Ldec_done
917___
918 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
919$code.=<<___;
920 movdqa -0x10($const), @XMM[8] # .LISR
921 jnz .Ldec_loop
922 movdqa -0x20($const), @XMM[8] # .LISRM0
923 jmp .Ldec_loop
924.align 16
925.Ldec_done:
926___
927 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
928$code.=<<___;
929 movdqa ($key), @XMM[8] # last round key
930 pxor @XMM[8], @XMM[6]
931 pxor @XMM[8], @XMM[4]
932 pxor @XMM[8], @XMM[2]
933 pxor @XMM[8], @XMM[7]
934 pxor @XMM[8], @XMM[3]
935 pxor @XMM[8], @XMM[5]
936 pxor @XMM[8], @XMM[0]
937 pxor @XMM[8], @XMM[1]
938 ret
939.size _bsaes_decrypt8,.-_bsaes_decrypt8
940___
941}
942{
943my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
944
945sub bitslice_key {
946my @x=reverse(@_[0..7]);
947my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
948
949 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
950$code.=<<___;
951 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
952 movdqa @x[0], @x[2]
953 movdqa @x[1], @x[3]
954___
955 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
956
957 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
958$code.=<<___;
959 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
960 movdqa @x[0], @x[4]
961 movdqa @x[2], @x[6]
962 movdqa @x[1], @x[5]
963 movdqa @x[3], @x[7]
964___
965 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
966 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
967}
968
969$code.=<<___;
970.type _bsaes_key_convert,\@abi-omnipotent
971.align 16
972_bsaes_key_convert:
973 _CET_ENDBR
974 lea .Lmasks(%rip), $const
975 movdqu ($inp), %xmm7 # load round 0 key
976 lea 0x10($inp), $inp
977 movdqa 0x00($const), %xmm0 # 0x01...
978 movdqa 0x10($const), %xmm1 # 0x02...
979 movdqa 0x20($const), %xmm2 # 0x04...
980 movdqa 0x30($const), %xmm3 # 0x08...
981 movdqa 0x40($const), %xmm4 # .LM0
982 pcmpeqd %xmm5, %xmm5 # .LNOT
983
984 movdqu ($inp), %xmm6 # load round 1 key
985 movdqa %xmm7, ($out) # save round 0 key
986 lea 0x10($out), $out
987 dec $rounds
988 jmp .Lkey_loop
989.align 16
990.Lkey_loop:
991 pshufb %xmm4, %xmm6 # .LM0
992
993 movdqa %xmm0, %xmm8
994 movdqa %xmm1, %xmm9
995
996 pand %xmm6, %xmm8
997 pand %xmm6, %xmm9
998 movdqa %xmm2, %xmm10
999 pcmpeqb %xmm0, %xmm8
1000 psllq \$4, %xmm0 # 0x10...
1001 movdqa %xmm3, %xmm11
1002 pcmpeqb %xmm1, %xmm9
1003 psllq \$4, %xmm1 # 0x20...
1004
1005 pand %xmm6, %xmm10
1006 pand %xmm6, %xmm11
1007 movdqa %xmm0, %xmm12
1008 pcmpeqb %xmm2, %xmm10
1009 psllq \$4, %xmm2 # 0x40...
1010 movdqa %xmm1, %xmm13
1011 pcmpeqb %xmm3, %xmm11
1012 psllq \$4, %xmm3 # 0x80...
1013
1014 movdqa %xmm2, %xmm14
1015 movdqa %xmm3, %xmm15
1016 pxor %xmm5, %xmm8 # "pnot"
1017 pxor %xmm5, %xmm9
1018
1019 pand %xmm6, %xmm12
1020 pand %xmm6, %xmm13
1021 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1022 pcmpeqb %xmm0, %xmm12
1023 psrlq \$4, %xmm0 # 0x01...
1024 movdqa %xmm9, 0x10($out)
1025 pcmpeqb %xmm1, %xmm13
1026 psrlq \$4, %xmm1 # 0x02...
1027 lea 0x10($inp), $inp
1028
1029 pand %xmm6, %xmm14
1030 pand %xmm6, %xmm15
1031 movdqa %xmm10, 0x20($out)
1032 pcmpeqb %xmm2, %xmm14
1033 psrlq \$4, %xmm2 # 0x04...
1034 movdqa %xmm11, 0x30($out)
1035 pcmpeqb %xmm3, %xmm15
1036 psrlq \$4, %xmm3 # 0x08...
1037 movdqu ($inp), %xmm6 # load next round key
1038
1039 pxor %xmm5, %xmm13 # "pnot"
1040 pxor %xmm5, %xmm14
1041 movdqa %xmm12, 0x40($out)
1042 movdqa %xmm13, 0x50($out)
1043 movdqa %xmm14, 0x60($out)
1044 movdqa %xmm15, 0x70($out)
1045 lea 0x80($out),$out
1046 dec $rounds
1047 jnz .Lkey_loop
1048
1049 movdqa 0x50($const), %xmm7 # .L63
1050 #movdqa %xmm6, ($out) # don't save last round key
1051 ret
1052.size _bsaes_key_convert,.-_bsaes_key_convert
1053___
1054}
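# (What _bsaes_key_convert emits, per 16-byte round key: eight 128-bit
# slices where byte j of slice i is 0xff if bit i of the (pshufb-
# permuted) key byte j is set and 0x00 otherwise -- the pand/pcmpeqb
# pairs above -- with slices 0, 1, 5 and 6 complemented by the "pnot"
# xors. The complemented slices are exactly the set bits of 0x63,
# suggesting the S-box affine constant is folded into the schedule
# here; note also the .L63 mask loaded just before returning.)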
1055
1056if (0 && !$win64) {	# following four functions are an unsupported interface
1057 # used for benchmarking...
1058$code.=<<___;
1059.globl bsaes_enc_key_convert
1060.type bsaes_enc_key_convert,\@function,2
1061.align 16
1062bsaes_enc_key_convert:
1063 _CET_ENDBR
1064 mov 240($inp),%r10d # pass rounds
1065 mov $inp,%rcx # pass key
1066 mov $out,%rax # pass key schedule
1067 call _bsaes_key_convert
1068 pxor %xmm6,%xmm7 # fix up last round key
1069 movdqa %xmm7,(%rax) # save last round key
1070 ret
1071.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1072
1073.globl bsaes_encrypt_128
1074.type bsaes_encrypt_128,\@function,4
1075.align 16
1076bsaes_encrypt_128:
1077.Lenc128_loop:
1078 _CET_ENDBR
1079 movdqu 0x00($inp), @XMM[0] # load input
1080 movdqu 0x10($inp), @XMM[1]
1081 movdqu 0x20($inp), @XMM[2]
1082 movdqu 0x30($inp), @XMM[3]
1083 movdqu 0x40($inp), @XMM[4]
1084 movdqu 0x50($inp), @XMM[5]
1085 movdqu 0x60($inp), @XMM[6]
1086 movdqu 0x70($inp), @XMM[7]
1087 mov $key, %rax # pass the $key
1088 lea 0x80($inp), $inp
1089 mov \$10,%r10d
1090
1091 call _bsaes_encrypt8
1092
1093 movdqu @XMM[0], 0x00($out) # write output
1094 movdqu @XMM[1], 0x10($out)
1095 movdqu @XMM[4], 0x20($out)
1096 movdqu @XMM[6], 0x30($out)
1097 movdqu @XMM[3], 0x40($out)
1098 movdqu @XMM[7], 0x50($out)
1099 movdqu @XMM[2], 0x60($out)
1100 movdqu @XMM[5], 0x70($out)
1101 lea 0x80($out), $out
1102 sub \$0x80,$len
1103 ja .Lenc128_loop
1104 ret
1105.size bsaes_encrypt_128,.-bsaes_encrypt_128
1106
1107.globl bsaes_dec_key_convert
1108.type bsaes_dec_key_convert,\@function,2
1109.align 16
1110bsaes_dec_key_convert:
1111 _CET_ENDBR
1112 mov 240($inp),%r10d # pass rounds
1113 mov $inp,%rcx # pass key
1114 mov $out,%rax # pass key schedule
1115 call _bsaes_key_convert
1116 pxor ($out),%xmm7 # fix up round 0 key
1117 movdqa %xmm6,(%rax) # save last round key
1118 movdqa %xmm7,($out)
1119 ret
1120.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1121
1122.globl bsaes_decrypt_128
1123.type bsaes_decrypt_128,\@function,4
1124.align 16
1125bsaes_decrypt_128:
1126 _CET_ENDBR
1127.Ldec128_loop:
1128 movdqu 0x00($inp), @XMM[0] # load input
1129 movdqu 0x10($inp), @XMM[1]
1130 movdqu 0x20($inp), @XMM[2]
1131 movdqu 0x30($inp), @XMM[3]
1132 movdqu 0x40($inp), @XMM[4]
1133 movdqu 0x50($inp), @XMM[5]
1134 movdqu 0x60($inp), @XMM[6]
1135 movdqu 0x70($inp), @XMM[7]
1136 mov $key, %rax # pass the $key
1137 lea 0x80($inp), $inp
1138 mov \$10,%r10d
1139
1140 call _bsaes_decrypt8
1141
1142 movdqu @XMM[0], 0x00($out) # write output
1143 movdqu @XMM[1], 0x10($out)
1144 movdqu @XMM[6], 0x20($out)
1145 movdqu @XMM[4], 0x30($out)
1146 movdqu @XMM[2], 0x40($out)
1147 movdqu @XMM[7], 0x50($out)
1148 movdqu @XMM[3], 0x60($out)
1149 movdqu @XMM[5], 0x70($out)
1150 lea 0x80($out), $out
1151 sub \$0x80,$len
1152 ja .Ldec128_loop
1153 ret
1154.size bsaes_decrypt_128,.-bsaes_decrypt_128
1155___
1156}
1157{
1158######################################################################
1159#
1160# OpenSSL interface
1161#
1162my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1163 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1164my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1165
1166if ($ecb) {
1167$code.=<<___;
1168.globl bsaes_ecb_encrypt_blocks
1169.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1170.align 16
1171bsaes_ecb_encrypt_blocks:
1172 _CET_ENDBR
1173 mov %rsp, %rax
1174.Lecb_enc_prologue:
1175 push %rbp
1176 push %rbx
1177 push %r12
1178 push %r13
1179 push %r14
1180 push %r15
1181 lea -0x48(%rsp),%rsp
1182___
1183$code.=<<___ if ($win64);
1184 lea -0xa0(%rsp), %rsp
1185 movaps %xmm6, 0x40(%rsp)
1186 movaps %xmm7, 0x50(%rsp)
1187 movaps %xmm8, 0x60(%rsp)
1188 movaps %xmm9, 0x70(%rsp)
1189 movaps %xmm10, 0x80(%rsp)
1190 movaps %xmm11, 0x90(%rsp)
1191 movaps %xmm12, 0xa0(%rsp)
1192 movaps %xmm13, 0xb0(%rsp)
1193 movaps %xmm14, 0xc0(%rsp)
1194 movaps %xmm15, 0xd0(%rsp)
1195.Lecb_enc_body:
1196___
1197$code.=<<___;
1198 mov %rsp,%rbp # backup %rsp
1199 mov 240($arg4),%eax # rounds
1200 mov $arg1,$inp # backup arguments
1201 mov $arg2,$out
1202 mov $arg3,$len
1203 mov $arg4,$key
1204 cmp \$8,$arg3
1205 jb .Lecb_enc_short
1206
1207 mov %eax,%ebx # backup rounds
1208 shl \$7,%rax # 128 bytes per inner round key
1209 sub \$`128-32`,%rax # size of bit-sliced key schedule
1210 sub %rax,%rsp
1211 mov %rsp,%rax # pass key schedule
1212 mov $key,%rcx # pass key
1213 mov %ebx,%r10d # pass rounds
1214 call _bsaes_key_convert
1215 pxor %xmm6,%xmm7 # fix up last round key
1216 movdqa %xmm7,(%rax) # save last round key
1217
1218 sub \$8,$len
1219.Lecb_enc_loop:
1220 movdqu 0x00($inp), @XMM[0] # load input
1221 movdqu 0x10($inp), @XMM[1]
1222 movdqu 0x20($inp), @XMM[2]
1223 movdqu 0x30($inp), @XMM[3]
1224 movdqu 0x40($inp), @XMM[4]
1225 movdqu 0x50($inp), @XMM[5]
1226 mov %rsp, %rax # pass key schedule
1227 movdqu 0x60($inp), @XMM[6]
1228 mov %ebx,%r10d # pass rounds
1229 movdqu 0x70($inp), @XMM[7]
1230 lea 0x80($inp), $inp
1231
1232 call _bsaes_encrypt8
1233
1234 movdqu @XMM[0], 0x00($out) # write output
1235 movdqu @XMM[1], 0x10($out)
1236 movdqu @XMM[4], 0x20($out)
1237 movdqu @XMM[6], 0x30($out)
1238 movdqu @XMM[3], 0x40($out)
1239 movdqu @XMM[7], 0x50($out)
1240 movdqu @XMM[2], 0x60($out)
1241 movdqu @XMM[5], 0x70($out)
1242 lea 0x80($out), $out
1243 sub \$8,$len
1244 jnc .Lecb_enc_loop
1245
1246 add \$8,$len
1247 jz .Lecb_enc_done
1248
1249 movdqu 0x00($inp), @XMM[0] # load input
1250 mov %rsp, %rax # pass key schedule
1251 mov %ebx,%r10d # pass rounds
1252 cmp \$2,$len
1253 jb .Lecb_enc_one
1254 movdqu 0x10($inp), @XMM[1]
1255 je .Lecb_enc_two
1256 movdqu 0x20($inp), @XMM[2]
1257 cmp \$4,$len
1258 jb .Lecb_enc_three
1259 movdqu 0x30($inp), @XMM[3]
1260 je .Lecb_enc_four
1261 movdqu 0x40($inp), @XMM[4]
1262 cmp \$6,$len
1263 jb .Lecb_enc_five
1264 movdqu 0x50($inp), @XMM[5]
1265 je .Lecb_enc_six
1266 movdqu 0x60($inp), @XMM[6]
1267 call _bsaes_encrypt8
1268 movdqu @XMM[0], 0x00($out) # write output
1269 movdqu @XMM[1], 0x10($out)
1270 movdqu @XMM[4], 0x20($out)
1271 movdqu @XMM[6], 0x30($out)
1272 movdqu @XMM[3], 0x40($out)
1273 movdqu @XMM[7], 0x50($out)
1274 movdqu @XMM[2], 0x60($out)
1275 jmp .Lecb_enc_done
1276.align 16
1277.Lecb_enc_six:
1278 call _bsaes_encrypt8
1279 movdqu @XMM[0], 0x00($out) # write output
1280 movdqu @XMM[1], 0x10($out)
1281 movdqu @XMM[4], 0x20($out)
1282 movdqu @XMM[6], 0x30($out)
1283 movdqu @XMM[3], 0x40($out)
1284 movdqu @XMM[7], 0x50($out)
1285 jmp .Lecb_enc_done
1286.align 16
1287.Lecb_enc_five:
1288 call _bsaes_encrypt8
1289 movdqu @XMM[0], 0x00($out) # write output
1290 movdqu @XMM[1], 0x10($out)
1291 movdqu @XMM[4], 0x20($out)
1292 movdqu @XMM[6], 0x30($out)
1293 movdqu @XMM[3], 0x40($out)
1294 jmp .Lecb_enc_done
1295.align 16
1296.Lecb_enc_four:
1297 call _bsaes_encrypt8
1298 movdqu @XMM[0], 0x00($out) # write output
1299 movdqu @XMM[1], 0x10($out)
1300 movdqu @XMM[4], 0x20($out)
1301 movdqu @XMM[6], 0x30($out)
1302 jmp .Lecb_enc_done
1303.align 16
1304.Lecb_enc_three:
1305 call _bsaes_encrypt8
1306 movdqu @XMM[0], 0x00($out) # write output
1307 movdqu @XMM[1], 0x10($out)
1308 movdqu @XMM[4], 0x20($out)
1309 jmp .Lecb_enc_done
1310.align 16
1311.Lecb_enc_two:
1312 call _bsaes_encrypt8
1313 movdqu @XMM[0], 0x00($out) # write output
1314 movdqu @XMM[1], 0x10($out)
1315 jmp .Lecb_enc_done
1316.align 16
1317.Lecb_enc_one:
1318 call _bsaes_encrypt8
1319 movdqu @XMM[0], 0x00($out) # write output
1320 jmp .Lecb_enc_done
1321.align 16
1322.Lecb_enc_short:
1323 lea ($inp), $arg1
1324 lea ($out), $arg2
1325 lea ($key), $arg3
1326 call asm_AES_encrypt
1327 lea 16($inp), $inp
1328 lea 16($out), $out
1329 dec $len
1330 jnz .Lecb_enc_short
1331
1332.Lecb_enc_done:
1333 lea (%rsp),%rax
1334 pxor %xmm0, %xmm0
1335.Lecb_enc_bzero: # wipe key schedule [if any]
1336 movdqa %xmm0, 0x00(%rax)
1337 movdqa %xmm0, 0x10(%rax)
1338 lea 0x20(%rax), %rax
1339 cmp %rax, %rbp
1340 jb .Lecb_enc_bzero
1341
1342 lea (%rbp),%rsp # restore %rsp
1343___
1344$code.=<<___ if ($win64);
1345 movaps 0x40(%rbp), %xmm6
1346 movaps 0x50(%rbp), %xmm7
1347 movaps 0x60(%rbp), %xmm8
1348 movaps 0x70(%rbp), %xmm9
1349 movaps 0x80(%rbp), %xmm10
1350 movaps 0x90(%rbp), %xmm11
1351 movaps 0xa0(%rbp), %xmm12
1352 movaps 0xb0(%rbp), %xmm13
1353 movaps 0xc0(%rbp), %xmm14
1354 movaps 0xd0(%rbp), %xmm15
1355 lea 0xa0(%rbp), %rsp
1356___
1357$code.=<<___;
1358 mov 0x48(%rsp), %r15
1359 mov 0x50(%rsp), %r14
1360 mov 0x58(%rsp), %r13
1361 mov 0x60(%rsp), %r12
1362 mov 0x68(%rsp), %rbx
1363 mov 0x70(%rsp), %rax
1364 lea 0x78(%rsp), %rsp
1365 mov %rax, %rbp
1366.Lecb_enc_epilogue:
1367 ret
1368.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1369
1370.globl bsaes_ecb_decrypt_blocks
1371.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1372.align 16
1373bsaes_ecb_decrypt_blocks:
1374 _CET_ENDBR
1375 mov %rsp, %rax
1376.Lecb_dec_prologue:
1377 push %rbp
1378 push %rbx
1379 push %r12
1380 push %r13
1381 push %r14
1382 push %r15
1383 lea -0x48(%rsp),%rsp
1384___
1385$code.=<<___ if ($win64);
1386 lea -0xa0(%rsp), %rsp
1387 movaps %xmm6, 0x40(%rsp)
1388 movaps %xmm7, 0x50(%rsp)
1389 movaps %xmm8, 0x60(%rsp)
1390 movaps %xmm9, 0x70(%rsp)
1391 movaps %xmm10, 0x80(%rsp)
1392 movaps %xmm11, 0x90(%rsp)
1393 movaps %xmm12, 0xa0(%rsp)
1394 movaps %xmm13, 0xb0(%rsp)
1395 movaps %xmm14, 0xc0(%rsp)
1396 movaps %xmm15, 0xd0(%rsp)
1397.Lecb_dec_body:
1398___
1399$code.=<<___;
1400 mov %rsp,%rbp # backup %rsp
1401 mov 240($arg4),%eax # rounds
1402 mov $arg1,$inp # backup arguments
1403 mov $arg2,$out
1404 mov $arg3,$len
1405 mov $arg4,$key
1406 cmp \$8,$arg3
1407 jb .Lecb_dec_short
1408
1409 mov %eax,%ebx # backup rounds
1410 shl \$7,%rax # 128 bytes per inner round key
1411 sub \$`128-32`,%rax # size of bit-sliced key schedule
1412 sub %rax,%rsp
1413 mov %rsp,%rax # pass key schedule
1414 mov $key,%rcx # pass key
1415 mov %ebx,%r10d # pass rounds
1416 call _bsaes_key_convert
1417 pxor (%rsp),%xmm7 # fix up 0 round key
1418 movdqa %xmm6,(%rax) # save last round key
1419 movdqa %xmm7,(%rsp)
1420
1421 sub \$8,$len
1422.Lecb_dec_loop:
1423 movdqu 0x00($inp), @XMM[0] # load input
1424 movdqu 0x10($inp), @XMM[1]
1425 movdqu 0x20($inp), @XMM[2]
1426 movdqu 0x30($inp), @XMM[3]
1427 movdqu 0x40($inp), @XMM[4]
1428 movdqu 0x50($inp), @XMM[5]
1429 mov %rsp, %rax # pass key schedule
1430 movdqu 0x60($inp), @XMM[6]
1431 mov %ebx,%r10d # pass rounds
1432 movdqu 0x70($inp), @XMM[7]
1433 lea 0x80($inp), $inp
1434
1435 call _bsaes_decrypt8
1436
1437 movdqu @XMM[0], 0x00($out) # write output
1438 movdqu @XMM[1], 0x10($out)
1439 movdqu @XMM[6], 0x20($out)
1440 movdqu @XMM[4], 0x30($out)
1441 movdqu @XMM[2], 0x40($out)
1442 movdqu @XMM[7], 0x50($out)
1443 movdqu @XMM[3], 0x60($out)
1444 movdqu @XMM[5], 0x70($out)
1445 lea 0x80($out), $out
1446 sub \$8,$len
1447 jnc .Lecb_dec_loop
1448
1449 add \$8,$len
1450 jz .Lecb_dec_done
1451
1452 movdqu 0x00($inp), @XMM[0] # load input
1453 mov %rsp, %rax # pass key schedule
1454 mov %ebx,%r10d # pass rounds
1455 cmp \$2,$len
1456 jb .Lecb_dec_one
1457 movdqu 0x10($inp), @XMM[1]
1458 je .Lecb_dec_two
1459 movdqu 0x20($inp), @XMM[2]
1460 cmp \$4,$len
1461 jb .Lecb_dec_three
1462 movdqu 0x30($inp), @XMM[3]
1463 je .Lecb_dec_four
1464 movdqu 0x40($inp), @XMM[4]
1465 cmp \$6,$len
1466 jb .Lecb_dec_five
1467 movdqu 0x50($inp), @XMM[5]
1468 je .Lecb_dec_six
1469 movdqu 0x60($inp), @XMM[6]
1470 call _bsaes_decrypt8
1471 movdqu @XMM[0], 0x00($out) # write output
1472 movdqu @XMM[1], 0x10($out)
1473 movdqu @XMM[6], 0x20($out)
1474 movdqu @XMM[4], 0x30($out)
1475 movdqu @XMM[2], 0x40($out)
1476 movdqu @XMM[7], 0x50($out)
1477 movdqu @XMM[3], 0x60($out)
1478 jmp .Lecb_dec_done
1479.align 16
1480.Lecb_dec_six:
1481 call _bsaes_decrypt8
1482 movdqu @XMM[0], 0x00($out) # write output
1483 movdqu @XMM[1], 0x10($out)
1484 movdqu @XMM[6], 0x20($out)
1485 movdqu @XMM[4], 0x30($out)
1486 movdqu @XMM[2], 0x40($out)
1487 movdqu @XMM[7], 0x50($out)
1488 jmp .Lecb_dec_done
1489.align 16
1490.Lecb_dec_five:
1491 call _bsaes_decrypt8
1492 movdqu @XMM[0], 0x00($out) # write output
1493 movdqu @XMM[1], 0x10($out)
1494 movdqu @XMM[6], 0x20($out)
1495 movdqu @XMM[4], 0x30($out)
1496 movdqu @XMM[2], 0x40($out)
1497 jmp .Lecb_dec_done
1498.align 16
1499.Lecb_dec_four:
1500 call _bsaes_decrypt8
1501 movdqu @XMM[0], 0x00($out) # write output
1502 movdqu @XMM[1], 0x10($out)
1503 movdqu @XMM[6], 0x20($out)
1504 movdqu @XMM[4], 0x30($out)
1505 jmp .Lecb_dec_done
1506.align 16
1507.Lecb_dec_three:
1508 call _bsaes_decrypt8
1509 movdqu @XMM[0], 0x00($out) # write output
1510 movdqu @XMM[1], 0x10($out)
1511 movdqu @XMM[6], 0x20($out)
1512 jmp .Lecb_dec_done
1513.align 16
1514.Lecb_dec_two:
1515 call _bsaes_decrypt8
1516 movdqu @XMM[0], 0x00($out) # write output
1517 movdqu @XMM[1], 0x10($out)
1518 jmp .Lecb_dec_done
1519.align 16
1520.Lecb_dec_one:
1521 call _bsaes_decrypt8
1522 movdqu @XMM[0], 0x00($out) # write output
1523 jmp .Lecb_dec_done
1524.align 16
1525.Lecb_dec_short:
1526 lea ($inp), $arg1
1527 lea ($out), $arg2
1528 lea ($key), $arg3
1529 call asm_AES_decrypt
1530 lea 16($inp), $inp
1531 lea 16($out), $out
1532 dec $len
1533 jnz .Lecb_dec_short
1534
1535.Lecb_dec_done:
1536 lea (%rsp),%rax
1537 pxor %xmm0, %xmm0
1538.Lecb_dec_bzero: # wipe key schedule [if any]
1539 movdqa %xmm0, 0x00(%rax)
1540 movdqa %xmm0, 0x10(%rax)
1541 lea 0x20(%rax), %rax
1542 cmp %rax, %rbp
1543 jb .Lecb_dec_bzero
1544
1545 lea (%rbp),%rsp # restore %rsp
1546___
1547$code.=<<___ if ($win64);
1548 movaps 0x40(%rbp), %xmm6
1549 movaps 0x50(%rbp), %xmm7
1550 movaps 0x60(%rbp), %xmm8
1551 movaps 0x70(%rbp), %xmm9
1552 movaps 0x80(%rbp), %xmm10
1553 movaps 0x90(%rbp), %xmm11
1554 movaps 0xa0(%rbp), %xmm12
1555 movaps 0xb0(%rbp), %xmm13
1556 movaps 0xc0(%rbp), %xmm14
1557 movaps 0xd0(%rbp), %xmm15
1558 lea 0xa0(%rbp), %rsp
1559___
1560$code.=<<___;
1561 mov 0x48(%rsp), %r15
1562 mov 0x50(%rsp), %r14
1563 mov 0x58(%rsp), %r13
1564 mov 0x60(%rsp), %r12
1565 mov 0x68(%rsp), %rbx
1566 mov 0x70(%rsp), %rax
1567 lea 0x78(%rsp), %rsp
1568 mov %rax, %rbp
1569.Lecb_dec_epilogue:
1570 ret
1571.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1572___
1573}
1574$code.=<<___;
1575.extern asm_AES_cbc_encrypt
1576.globl bsaes_cbc_encrypt
1577.type bsaes_cbc_encrypt,\@abi-omnipotent
1578.align 16
1579bsaes_cbc_encrypt:
1580 _CET_ENDBR
1581___
1582$code.=<<___ if ($win64);
1583 mov 48(%rsp),$arg6 # pull direction flag
1584___
1585$code.=<<___;
1586 cmp \$0,$arg6
1587 jne asm_AES_cbc_encrypt
1588 cmp \$128,$arg3
1589 jb asm_AES_cbc_encrypt
1590
1591 mov %rsp, %rax
1592.Lcbc_dec_prologue:
1593 push %rbp
1594 push %rbx
1595 push %r12
1596 push %r13
1597 push %r14
1598 push %r15
1599 lea -0x48(%rsp), %rsp
1600___
1601$code.=<<___ if ($win64);
1602 mov 0xa0(%rsp),$arg5 # pull ivp
1603 lea -0xa0(%rsp), %rsp
1604 movaps %xmm6, 0x40(%rsp)
1605 movaps %xmm7, 0x50(%rsp)
1606 movaps %xmm8, 0x60(%rsp)
1607 movaps %xmm9, 0x70(%rsp)
1608 movaps %xmm10, 0x80(%rsp)
1609 movaps %xmm11, 0x90(%rsp)
1610 movaps %xmm12, 0xa0(%rsp)
1611 movaps %xmm13, 0xb0(%rsp)
1612 movaps %xmm14, 0xc0(%rsp)
1613 movaps %xmm15, 0xd0(%rsp)
1614.Lcbc_dec_body:
1615___
1616$code.=<<___;
1617 mov %rsp, %rbp # backup %rsp
1618 mov 240($arg4), %eax # rounds
1619 mov $arg1, $inp # backup arguments
1620 mov $arg2, $out
1621 mov $arg3, $len
1622 mov $arg4, $key
1623 mov $arg5, %rbx
1624 shr \$4, $len # bytes to blocks
1625
1626 mov %eax, %edx # rounds
1627 shl \$7, %rax # 128 bytes per inner round key
1628 sub \$`128-32`, %rax # size of bit-sliced key schedule
1629 sub %rax, %rsp
1630
1631 mov %rsp, %rax # pass key schedule
1632 mov $key, %rcx # pass key
1633 mov %edx, %r10d # pass rounds
1634 call _bsaes_key_convert
1635 pxor (%rsp),%xmm7 # fix up 0 round key
1636 movdqa %xmm6,(%rax) # save last round key
1637 movdqa %xmm7,(%rsp)
1638
1639 movdqu (%rbx), @XMM[15] # load IV
1640 sub \$8,$len
1641.Lcbc_dec_loop:
1642 movdqu 0x00($inp), @XMM[0] # load input
1643 movdqu 0x10($inp), @XMM[1]
1644 movdqu 0x20($inp), @XMM[2]
1645 movdqu 0x30($inp), @XMM[3]
1646 movdqu 0x40($inp), @XMM[4]
1647 movdqu 0x50($inp), @XMM[5]
1648 mov %rsp, %rax # pass key schedule
1649 movdqu 0x60($inp), @XMM[6]
1650 mov %edx,%r10d # pass rounds
1651 movdqu 0x70($inp), @XMM[7]
1652 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1653
1654 call _bsaes_decrypt8
1655
1656 pxor 0x20(%rbp), @XMM[0] # ^= IV
1657 movdqu 0x00($inp), @XMM[8] # re-load input
1658 movdqu 0x10($inp), @XMM[9]
1659 pxor @XMM[8], @XMM[1]
1660 movdqu 0x20($inp), @XMM[10]
1661 pxor @XMM[9], @XMM[6]
1662 movdqu 0x30($inp), @XMM[11]
1663 pxor @XMM[10], @XMM[4]
1664 movdqu 0x40($inp), @XMM[12]
1665 pxor @XMM[11], @XMM[2]
1666 movdqu 0x50($inp), @XMM[13]
1667 pxor @XMM[12], @XMM[7]
1668 movdqu 0x60($inp), @XMM[14]
1669 pxor @XMM[13], @XMM[3]
1670 movdqu 0x70($inp), @XMM[15] # IV
1671 pxor @XMM[14], @XMM[5]
1672 movdqu @XMM[0], 0x00($out) # write output
1673 lea 0x80($inp), $inp
1674 movdqu @XMM[1], 0x10($out)
1675 movdqu @XMM[6], 0x20($out)
1676 movdqu @XMM[4], 0x30($out)
1677 movdqu @XMM[2], 0x40($out)
1678 movdqu @XMM[7], 0x50($out)
1679 movdqu @XMM[3], 0x60($out)
1680 movdqu @XMM[5], 0x70($out)
1681 lea 0x80($out), $out
1682 sub \$8,$len
1683 jnc .Lcbc_dec_loop
1684
1685 add \$8,$len
1686 jz .Lcbc_dec_done
1687
1688 movdqu 0x00($inp), @XMM[0] # load input
1689 mov %rsp, %rax # pass key schedule
1690 mov %edx, %r10d # pass rounds
1691 cmp \$2,$len
1692 jb .Lcbc_dec_one
1693 movdqu 0x10($inp), @XMM[1]
1694 je .Lcbc_dec_two
1695 movdqu 0x20($inp), @XMM[2]
1696 cmp \$4,$len
1697 jb .Lcbc_dec_three
1698 movdqu 0x30($inp), @XMM[3]
1699 je .Lcbc_dec_four
1700 movdqu 0x40($inp), @XMM[4]
1701 cmp \$6,$len
1702 jb .Lcbc_dec_five
1703 movdqu 0x50($inp), @XMM[5]
1704 je .Lcbc_dec_six
1705 movdqu 0x60($inp), @XMM[6]
1706 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1707 call _bsaes_decrypt8
1708 pxor 0x20(%rbp), @XMM[0] # ^= IV
1709 movdqu 0x00($inp), @XMM[8] # re-load input
1710 movdqu 0x10($inp), @XMM[9]
1711 pxor @XMM[8], @XMM[1]
1712 movdqu 0x20($inp), @XMM[10]
1713 pxor @XMM[9], @XMM[6]
1714 movdqu 0x30($inp), @XMM[11]
1715 pxor @XMM[10], @XMM[4]
1716 movdqu 0x40($inp), @XMM[12]
1717 pxor @XMM[11], @XMM[2]
1718 movdqu 0x50($inp), @XMM[13]
1719 pxor @XMM[12], @XMM[7]
1720 movdqu 0x60($inp), @XMM[15] # IV
1721 pxor @XMM[13], @XMM[3]
1722 movdqu @XMM[0], 0x00($out) # write output
1723 movdqu @XMM[1], 0x10($out)
1724 movdqu @XMM[6], 0x20($out)
1725 movdqu @XMM[4], 0x30($out)
1726 movdqu @XMM[2], 0x40($out)
1727 movdqu @XMM[7], 0x50($out)
1728 movdqu @XMM[3], 0x60($out)
1729 jmp .Lcbc_dec_done
1730.align 16
1731.Lcbc_dec_six:
1732 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1733 call _bsaes_decrypt8
1734 pxor 0x20(%rbp), @XMM[0] # ^= IV
1735 movdqu 0x00($inp), @XMM[8] # re-load input
1736 movdqu 0x10($inp), @XMM[9]
1737 pxor @XMM[8], @XMM[1]
1738 movdqu 0x20($inp), @XMM[10]
1739 pxor @XMM[9], @XMM[6]
1740 movdqu 0x30($inp), @XMM[11]
1741 pxor @XMM[10], @XMM[4]
1742 movdqu 0x40($inp), @XMM[12]
1743 pxor @XMM[11], @XMM[2]
1744 movdqu 0x50($inp), @XMM[15] # IV
1745 pxor @XMM[12], @XMM[7]
1746 movdqu @XMM[0], 0x00($out) # write output
1747 movdqu @XMM[1], 0x10($out)
1748 movdqu @XMM[6], 0x20($out)
1749 movdqu @XMM[4], 0x30($out)
1750 movdqu @XMM[2], 0x40($out)
1751 movdqu @XMM[7], 0x50($out)
1752 jmp .Lcbc_dec_done
1753.align 16
1754.Lcbc_dec_five:
1755 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1756 call _bsaes_decrypt8
1757 pxor 0x20(%rbp), @XMM[0] # ^= IV
1758 movdqu 0x00($inp), @XMM[8] # re-load input
1759 movdqu 0x10($inp), @XMM[9]
1760 pxor @XMM[8], @XMM[1]
1761 movdqu 0x20($inp), @XMM[10]
1762 pxor @XMM[9], @XMM[6]
1763 movdqu 0x30($inp), @XMM[11]
1764 pxor @XMM[10], @XMM[4]
1765 movdqu 0x40($inp), @XMM[15] # IV
1766 pxor @XMM[11], @XMM[2]
1767 movdqu @XMM[0], 0x00($out) # write output
1768 movdqu @XMM[1], 0x10($out)
1769 movdqu @XMM[6], 0x20($out)
1770 movdqu @XMM[4], 0x30($out)
1771 movdqu @XMM[2], 0x40($out)
1772 jmp .Lcbc_dec_done
1773.align 16
1774.Lcbc_dec_four:
1775 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1776 call _bsaes_decrypt8
1777 pxor 0x20(%rbp), @XMM[0] # ^= IV
1778 movdqu 0x00($inp), @XMM[8] # re-load input
1779 movdqu 0x10($inp), @XMM[9]
1780 pxor @XMM[8], @XMM[1]
1781 movdqu 0x20($inp), @XMM[10]
1782 pxor @XMM[9], @XMM[6]
1783 movdqu 0x30($inp), @XMM[15] # IV
1784 pxor @XMM[10], @XMM[4]
1785 movdqu @XMM[0], 0x00($out) # write output
1786 movdqu @XMM[1], 0x10($out)
1787 movdqu @XMM[6], 0x20($out)
1788 movdqu @XMM[4], 0x30($out)
1789 jmp .Lcbc_dec_done
1790.align 16
1791.Lcbc_dec_three:
1792 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1793 call _bsaes_decrypt8
1794 pxor 0x20(%rbp), @XMM[0] # ^= IV
1795 movdqu 0x00($inp), @XMM[8] # re-load input
1796 movdqu 0x10($inp), @XMM[9]
1797 pxor @XMM[8], @XMM[1]
1798 movdqu 0x20($inp), @XMM[15] # IV
1799 pxor @XMM[9], @XMM[6]
1800 movdqu @XMM[0], 0x00($out) # write output
1801 movdqu @XMM[1], 0x10($out)
1802 movdqu @XMM[6], 0x20($out)
1803 jmp .Lcbc_dec_done
1804.align 16
1805.Lcbc_dec_two:
1806 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1807 call _bsaes_decrypt8
1808 pxor 0x20(%rbp), @XMM[0] # ^= IV
1809 movdqu 0x00($inp), @XMM[8] # re-load input
1810 movdqu 0x10($inp), @XMM[15] # IV
1811 pxor @XMM[8], @XMM[1]
1812 movdqu @XMM[0], 0x00($out) # write output
1813 movdqu @XMM[1], 0x10($out)
1814 jmp .Lcbc_dec_done
1815.align 16
1816.Lcbc_dec_one:
1817 lea ($inp), $arg1
1818 lea 0x20(%rbp), $arg2 # buffer output
1819 lea ($key), $arg3
1820 call asm_AES_decrypt # doesn't touch %xmm
1821 pxor 0x20(%rbp), @XMM[15] # ^= IV
1822 movdqu @XMM[15], ($out) # write output
1823 movdqa @XMM[0], @XMM[15] # IV
1824
1825.Lcbc_dec_done:
1826 movdqu @XMM[15], (%rbx) # return IV
1827 lea (%rsp), %rax
1828 pxor %xmm0, %xmm0
1829.Lcbc_dec_bzero: # wipe key schedule [if any]
1830 movdqa %xmm0, 0x00(%rax)
1831 movdqa %xmm0, 0x10(%rax)
1832 lea 0x20(%rax), %rax
1833 cmp %rax, %rbp
1834 ja .Lcbc_dec_bzero
1835
1836 lea (%rbp),%rsp # restore %rsp
1837___
1838$code.=<<___ if ($win64);
1839 movaps 0x40(%rbp), %xmm6
1840 movaps 0x50(%rbp), %xmm7
1841 movaps 0x60(%rbp), %xmm8
1842 movaps 0x70(%rbp), %xmm9
1843 movaps 0x80(%rbp), %xmm10
1844 movaps 0x90(%rbp), %xmm11
1845 movaps 0xa0(%rbp), %xmm12
1846 movaps 0xb0(%rbp), %xmm13
1847 movaps 0xc0(%rbp), %xmm14
1848 movaps 0xd0(%rbp), %xmm15
1849 lea 0xa0(%rbp), %rsp
1850___
1851$code.=<<___;
1852 mov 0x48(%rsp), %r15
1853 mov 0x50(%rsp), %r14
1854 mov 0x58(%rsp), %r13
1855 mov 0x60(%rsp), %r12
1856 mov 0x68(%rsp), %rbx
1857 mov 0x70(%rsp), %rax
1858 lea 0x78(%rsp), %rsp
1859 mov %rax, %rbp
1860.Lcbc_dec_epilogue:
1861 ret
1862.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1863
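The eight-lane loop above is plain CBC decryption: every plaintext block is XORed with the preceding ciphertext block, which is why the inputs are re-loaded after the _bsaes_decrypt8 call and why the last ciphertext block is handed back through %rbx as the caller's next IV. A byte-wise reference sketch, using AES_decrypt as a stand-in for the bit-sliced core:

#include <string.h>

#include <openssl/aes.h>

static void
cbc_decrypt_ref(const unsigned char *in, unsigned char *out, size_t len,
    const AES_KEY *key, unsigned char ivec[16])
{
	unsigned char prev[16], cur[16];
	size_t i;

	memcpy(prev, ivec, 16);
	for (; len >= 16; len -= 16, in += 16, out += 16) {
		memcpy(cur, in, 16);		/* keep ciphertext: it is the next IV */
		AES_decrypt(in, out, key);
		for (i = 0; i < 16; i++)
			out[i] ^= prev[i];	/* chain with previous ciphertext */
		memcpy(prev, cur, 16);
	}
	memcpy(ivec, prev, 16);			/* hand back the last ciphertext */
}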
1864.globl bsaes_ctr32_encrypt_blocks
1865.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1866.align 16
1867bsaes_ctr32_encrypt_blocks:
1868 _CET_ENDBR
1869 mov %rsp, %rax
1870.Lctr_enc_prologue:
1871 push %rbp
1872 push %rbx
1873 push %r12
1874 push %r13
1875 push %r14
1876 push %r15
1877 lea -0x48(%rsp), %rsp
1878___
1879$code.=<<___ if ($win64);
1880 mov 0xa0(%rsp),$arg5 # pull ivp
1881 lea -0xa0(%rsp), %rsp
1882 movaps %xmm6, 0x40(%rsp)
1883 movaps %xmm7, 0x50(%rsp)
1884 movaps %xmm8, 0x60(%rsp)
1885 movaps %xmm9, 0x70(%rsp)
1886 movaps %xmm10, 0x80(%rsp)
1887 movaps %xmm11, 0x90(%rsp)
1888 movaps %xmm12, 0xa0(%rsp)
1889 movaps %xmm13, 0xb0(%rsp)
1890 movaps %xmm14, 0xc0(%rsp)
1891 movaps %xmm15, 0xd0(%rsp)
1892.Lctr_enc_body:
1893___
1894$code.=<<___;
1895 mov %rsp, %rbp # backup %rsp
1896 movdqu ($arg5), %xmm0 # load counter
1897 mov 240($arg4), %eax # rounds
1898 mov $arg1, $inp # backup arguments
1899 mov $arg2, $out
1900 mov $arg3, $len
1901 mov $arg4, $key
1902 movdqa %xmm0, 0x20(%rbp) # copy counter
1903 cmp \$8, $arg3
1904 jb .Lctr_enc_short
1905
1906 mov %eax, %ebx # rounds
1907 shl \$7, %rax # 128 bytes per inner round key
1908 sub \$`128-32`, %rax # size of bit-sliced key schedule
1909 sub %rax, %rsp
1910
1911 mov %rsp, %rax # pass key schedule
1912 mov $key, %rcx # pass key
1913 mov %ebx, %r10d # pass rounds
1914 call _bsaes_key_convert
1915 pxor %xmm6,%xmm7 # fix up last round key
1916 movdqa %xmm7,(%rax) # save last round key
1917
1918 movdqa (%rsp), @XMM[9] # load round0 key
1919 lea .LADD1(%rip), %r11
1920 movdqa 0x20(%rbp), @XMM[0] # counter copy
1921 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1922 pshufb @XMM[8], @XMM[9] # byte swap upper part
1923 pshufb @XMM[8], @XMM[0]
1924 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1925 jmp .Lctr_enc_loop
1926.align 16
1927.Lctr_enc_loop:
1928 movdqa @XMM[0], 0x20(%rbp) # save counter
1929 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1930 movdqa @XMM[0], @XMM[2]
1931 paddd 0x00(%r11), @XMM[1] # .LADD1
1932 movdqa @XMM[0], @XMM[3]
1933 paddd 0x10(%r11), @XMM[2] # .LADD2
1934 movdqa @XMM[0], @XMM[4]
1935 paddd 0x20(%r11), @XMM[3] # .LADD3
1936 movdqa @XMM[0], @XMM[5]
1937 paddd 0x30(%r11), @XMM[4] # .LADD4
1938 movdqa @XMM[0], @XMM[6]
1939 paddd 0x40(%r11), @XMM[5] # .LADD5
1940 movdqa @XMM[0], @XMM[7]
1941 paddd 0x50(%r11), @XMM[6] # .LADD6
1942 paddd 0x60(%r11), @XMM[7] # .LADD7
1943
1944	# Borrow the prologue from _bsaes_encrypt8, using the opportunity
1945	# to flip the byte order in the 32-bit counters
1946 movdqa (%rsp), @XMM[9] # round 0 key
1947 lea 0x10(%rsp), %rax # pass key schedule
1948 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1949 pxor @XMM[9], @XMM[0] # xor with round0 key
1950 pxor @XMM[9], @XMM[1]
1951 pshufb @XMM[8], @XMM[0]
1952 pxor @XMM[9], @XMM[2]
1953 pshufb @XMM[8], @XMM[1]
1954 pxor @XMM[9], @XMM[3]
1955 pshufb @XMM[8], @XMM[2]
1956 pxor @XMM[9], @XMM[4]
1957 pshufb @XMM[8], @XMM[3]
1958 pxor @XMM[9], @XMM[5]
1959 pshufb @XMM[8], @XMM[4]
1960 pxor @XMM[9], @XMM[6]
1961 pshufb @XMM[8], @XMM[5]
1962 pxor @XMM[9], @XMM[7]
1963 pshufb @XMM[8], @XMM[6]
1964 lea .LBS0(%rip), %r11 # constants table
1965 pshufb @XMM[8], @XMM[7]
1966 mov %ebx,%r10d # pass rounds
1967
1968 call _bsaes_encrypt8_bitslice
1969
1970 sub \$8,$len
1971 jc .Lctr_enc_loop_done
1972
1973 movdqu 0x00($inp), @XMM[8] # load input
1974 movdqu 0x10($inp), @XMM[9]
1975 movdqu 0x20($inp), @XMM[10]
1976 movdqu 0x30($inp), @XMM[11]
1977 movdqu 0x40($inp), @XMM[12]
1978 movdqu 0x50($inp), @XMM[13]
1979 movdqu 0x60($inp), @XMM[14]
1980 movdqu 0x70($inp), @XMM[15]
1981 lea 0x80($inp),$inp
1982 pxor @XMM[0], @XMM[8]
1983 movdqa 0x20(%rbp), @XMM[0] # load counter
1984 pxor @XMM[9], @XMM[1]
1985 movdqu @XMM[8], 0x00($out) # write output
1986 pxor @XMM[10], @XMM[4]
1987 movdqu @XMM[1], 0x10($out)
1988 pxor @XMM[11], @XMM[6]
1989 movdqu @XMM[4], 0x20($out)
1990 pxor @XMM[12], @XMM[3]
1991 movdqu @XMM[6], 0x30($out)
1992 pxor @XMM[13], @XMM[7]
1993 movdqu @XMM[3], 0x40($out)
1994 pxor @XMM[14], @XMM[2]
1995 movdqu @XMM[7], 0x50($out)
1996 pxor @XMM[15], @XMM[5]
1997 movdqu @XMM[2], 0x60($out)
1998 lea .LADD1(%rip), %r11
1999 movdqu @XMM[5], 0x70($out)
2000 lea 0x80($out), $out
2001 paddd 0x70(%r11), @XMM[0] # .LADD8
2002 jnz .Lctr_enc_loop
2003
2004 jmp .Lctr_enc_done
2005.align 16
2006.Lctr_enc_loop_done:
2007 add \$8, $len
2008 movdqu 0x00($inp), @XMM[8] # load input
2009 pxor @XMM[8], @XMM[0]
2010 movdqu @XMM[0], 0x00($out) # write output
2011 cmp \$2,$len
2012 jb .Lctr_enc_done
2013 movdqu 0x10($inp), @XMM[9]
2014 pxor @XMM[9], @XMM[1]
2015 movdqu @XMM[1], 0x10($out)
2016 je .Lctr_enc_done
2017 movdqu 0x20($inp), @XMM[10]
2018 pxor @XMM[10], @XMM[4]
2019 movdqu @XMM[4], 0x20($out)
2020 cmp \$4,$len
2021 jb .Lctr_enc_done
2022 movdqu 0x30($inp), @XMM[11]
2023 pxor @XMM[11], @XMM[6]
2024 movdqu @XMM[6], 0x30($out)
2025 je .Lctr_enc_done
2026 movdqu 0x40($inp), @XMM[12]
2027 pxor @XMM[12], @XMM[3]
2028 movdqu @XMM[3], 0x40($out)
2029 cmp \$6,$len
2030 jb .Lctr_enc_done
2031 movdqu 0x50($inp), @XMM[13]
2032 pxor @XMM[13], @XMM[7]
2033 movdqu @XMM[7], 0x50($out)
2034 je .Lctr_enc_done
2035 movdqu 0x60($inp), @XMM[14]
2036 pxor @XMM[14], @XMM[2]
2037 movdqu @XMM[2], 0x60($out)
2038 jmp .Lctr_enc_done
2039
2040.align 16
2041.Lctr_enc_short:
2042 lea 0x20(%rbp), $arg1
2043 lea 0x30(%rbp), $arg2
2044 lea ($key), $arg3
2045 call asm_AES_encrypt
2046 movdqu ($inp), @XMM[1]
2047 lea 16($inp), $inp
2048 mov 0x2c(%rbp), %eax # load 32-bit counter
2049 bswap %eax
2050 pxor 0x30(%rbp), @XMM[1]
2051 inc %eax # increment
2052 movdqu @XMM[1], ($out)
2053 bswap %eax
2054 lea 16($out), $out
2055 mov %eax, 0x2c(%rsp) # save 32-bit counter
2056 dec $len
2057 jnz .Lctr_enc_short
2058
2059.Lctr_enc_done:
2060 lea (%rsp), %rax
2061 pxor %xmm0, %xmm0
2062.Lctr_enc_bzero: # wipe key schedule [if any]
2063 movdqa %xmm0, 0x00(%rax)
2064 movdqa %xmm0, 0x10(%rax)
2065 lea 0x20(%rax), %rax
2066 cmp %rax, %rbp
2067 ja .Lctr_enc_bzero
2068
2069 lea (%rbp),%rsp # restore %rsp
2070___
2071$code.=<<___ if ($win64);
2072 movaps 0x40(%rbp), %xmm6
2073 movaps 0x50(%rbp), %xmm7
2074 movaps 0x60(%rbp), %xmm8
2075 movaps 0x70(%rbp), %xmm9
2076 movaps 0x80(%rbp), %xmm10
2077 movaps 0x90(%rbp), %xmm11
2078 movaps 0xa0(%rbp), %xmm12
2079 movaps 0xb0(%rbp), %xmm13
2080 movaps 0xc0(%rbp), %xmm14
2081 movaps 0xd0(%rbp), %xmm15
2082 lea 0xa0(%rbp), %rsp
2083___
2084$code.=<<___;
2085 mov 0x48(%rsp), %r15
2086 mov 0x50(%rsp), %r14
2087 mov 0x58(%rsp), %r13
2088 mov 0x60(%rsp), %r12
2089 mov 0x68(%rsp), %rbx
2090 mov 0x70(%rsp), %rax
2091 lea 0x78(%rsp), %rsp
2092 mov %rax, %rbp
2093.Lctr_enc_epilogue:
2094 ret
2095.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2096___
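The routine implements the 32-bit counter convention used by GCM: only the low four bytes of the 16-byte counter are incremented, big-endian, which is exactly what the .LADD1-.LADD8 constants and the .LSWPUP/.LSWPUPM0SR shuffles above arrange for eight lanes at a time. A one-block-at-a-time reference sketch, with AES_encrypt standing in for the bit-sliced core:

#include <stdint.h>
#include <string.h>

#include <openssl/aes.h>

static void
ctr32_encrypt_ref(const unsigned char *in, unsigned char *out,
    size_t blocks, const AES_KEY *key, unsigned char ivec[16])
{
	unsigned char ks[16];
	uint32_t ctr;
	size_t i;

	/* the low 32 bits of the IV hold the big-endian block counter */
	ctr = (uint32_t)ivec[12] << 24 | ivec[13] << 16 |
	    ivec[14] << 8 | ivec[15];
	while (blocks-- > 0) {
		AES_encrypt(ivec, ks, key);
		for (i = 0; i < 16; i++)
			out[i] = in[i] ^ ks[i];
		ctr++;			/* wraps at 2^32; upper 96 bits fixed */
		ivec[12] = ctr >> 24;
		ivec[13] = ctr >> 16;
		ivec[14] = ctr >> 8;
		ivec[15] = ctr;
		in += 16;
		out += 16;
	}
}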
2097######################################################################
2098# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2099# const AES_KEY *key1, const AES_KEY *key2,
2100# const unsigned char iv[16]);
2101#
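Every tweak update in the loops below is a multiplication by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1: the pcmpgtd/pshufd/pand sequence broadcasts and isolates the top-bit carry, and .Lxts_magic supplies the 0x87 reduction constant. A byte-wise sketch of the same update:

#include <stdint.h>

static void
xts_tweak_mul_x(uint8_t t[16])
{
	unsigned int carry = 0, c;
	int i;

	for (i = 0; i < 16; i++) {	/* shift the 128-bit value left by 1 */
		c = t[i] >> 7;
		t[i] = (uint8_t)(t[i] << 1) | carry;
		carry = c;
	}
	if (carry)
		t[0] ^= 0x87;		/* reduce by the XTS polynomial */
}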
2102my ($twmask,$twres,$twtmp)=@XMM[13..15];
2103$arg6=~s/d$//;
2104
2105$code.=<<___;
2106.globl bsaes_xts_encrypt
2107.type bsaes_xts_encrypt,\@abi-omnipotent
2108.align 16
2109bsaes_xts_encrypt:
2110 _CET_ENDBR
2111 mov %rsp, %rax
2112.Lxts_enc_prologue:
2113 push %rbp
2114 push %rbx
2115 push %r12
2116 push %r13
2117 push %r14
2118 push %r15
2119 lea -0x48(%rsp), %rsp
2120___
2121$code.=<<___ if ($win64);
2122 mov 0xa0(%rsp),$arg5 # pull key2
2123 mov 0xa8(%rsp),$arg6 # pull ivp
2124 lea -0xa0(%rsp), %rsp
2125 movaps %xmm6, 0x40(%rsp)
2126 movaps %xmm7, 0x50(%rsp)
2127 movaps %xmm8, 0x60(%rsp)
2128 movaps %xmm9, 0x70(%rsp)
2129 movaps %xmm10, 0x80(%rsp)
2130 movaps %xmm11, 0x90(%rsp)
2131 movaps %xmm12, 0xa0(%rsp)
2132 movaps %xmm13, 0xb0(%rsp)
2133 movaps %xmm14, 0xc0(%rsp)
2134 movaps %xmm15, 0xd0(%rsp)
2135.Lxts_enc_body:
2136___
2137$code.=<<___;
2138 mov %rsp, %rbp # backup %rsp
2139 mov $arg1, $inp # backup arguments
2140 mov $arg2, $out
2141 mov $arg3, $len
2142 mov $arg4, $key
2143
2144 lea ($arg6), $arg1
2145 lea 0x20(%rbp), $arg2
2146 lea ($arg5), $arg3
2147 call asm_AES_encrypt # generate initial tweak
2148
2149 mov 240($key), %eax # rounds
2150 mov $len, %rbx # backup $len
2151
2152 mov %eax, %edx # rounds
2153 shl \$7, %rax # 128 bytes per inner round key
2154 sub \$`128-32`, %rax # size of bit-sliced key schedule
2155 sub %rax, %rsp
2156
2157 mov %rsp, %rax # pass key schedule
2158 mov $key, %rcx # pass key
2159 mov %edx, %r10d # pass rounds
2160 call _bsaes_key_convert
2161 pxor %xmm6, %xmm7 # fix up last round key
2162 movdqa %xmm7, (%rax) # save last round key
2163
2164 and \$-16, $len
2165 sub \$0x80, %rsp # place for tweak[8]
2166 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2167
2168 pxor $twtmp, $twtmp
2169 movdqa .Lxts_magic(%rip), $twmask
2170 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2171
2172 sub \$0x80, $len
2173 jc .Lxts_enc_short
2174 jmp .Lxts_enc_loop
2175
2176.align 16
2177.Lxts_enc_loop:
2178___
2179 for ($i=0;$i<7;$i++) {
2180 $code.=<<___;
2181 pshufd \$0x13, $twtmp, $twres
2182 pxor $twtmp, $twtmp
2183 movdqa @XMM[7], @XMM[$i]
2184 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2185 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2186 pand $twmask, $twres # isolate carry and residue
2187 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2188 pxor $twres, @XMM[7]
2189___
2190 $code.=<<___ if ($i>=1);
2191 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2192___
2193 $code.=<<___ if ($i>=2);
2194 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2195___
2196 }
2197$code.=<<___;
2198 movdqu 0x60($inp), @XMM[8+6]
2199 pxor @XMM[8+5], @XMM[5]
2200 movdqu 0x70($inp), @XMM[8+7]
2201 lea 0x80($inp), $inp
2202 movdqa @XMM[7], 0x70(%rsp)
2203 pxor @XMM[8+6], @XMM[6]
2204 lea 0x80(%rsp), %rax # pass key schedule
2205 pxor @XMM[8+7], @XMM[7]
2206 mov %edx, %r10d # pass rounds
2207
2208 call _bsaes_encrypt8
2209
2210 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2211 pxor 0x10(%rsp), @XMM[1]
2212 movdqu @XMM[0], 0x00($out) # write output
2213 pxor 0x20(%rsp), @XMM[4]
2214 movdqu @XMM[1], 0x10($out)
2215 pxor 0x30(%rsp), @XMM[6]
2216 movdqu @XMM[4], 0x20($out)
2217 pxor 0x40(%rsp), @XMM[3]
2218 movdqu @XMM[6], 0x30($out)
2219 pxor 0x50(%rsp), @XMM[7]
2220 movdqu @XMM[3], 0x40($out)
2221 pxor 0x60(%rsp), @XMM[2]
2222 movdqu @XMM[7], 0x50($out)
2223 pxor 0x70(%rsp), @XMM[5]
2224 movdqu @XMM[2], 0x60($out)
2225 movdqu @XMM[5], 0x70($out)
2226 lea 0x80($out), $out
2227
2228 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2229 pxor $twtmp, $twtmp
2230 movdqa .Lxts_magic(%rip), $twmask
2231 pcmpgtd @XMM[7], $twtmp
2232 pshufd \$0x13, $twtmp, $twres
2233 pxor $twtmp, $twtmp
2234 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2235 pand $twmask, $twres # isolate carry and residue
2236 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2237 pxor $twres, @XMM[7]
2238
2239 sub \$0x80,$len
2240 jnc .Lxts_enc_loop
2241
2242.Lxts_enc_short:
2243 add \$0x80, $len
2244 jz .Lxts_enc_done
2245___
2246 for ($i=0;$i<7;$i++) {
2247 $code.=<<___;
2248 pshufd \$0x13, $twtmp, $twres
2249 pxor $twtmp, $twtmp
2250 movdqa @XMM[7], @XMM[$i]
2251 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2252 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2253 pand $twmask, $twres # isolate carry and residue
2254 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2255 pxor $twres, @XMM[7]
2256___
2257 $code.=<<___ if ($i>=1);
2258 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2259 cmp \$`0x10*$i`,$len
2260 je .Lxts_enc_$i
2261___
2262 $code.=<<___ if ($i>=2);
2263 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2264___
2265 }
2266$code.=<<___;
2267 movdqu 0x60($inp), @XMM[8+6]
2268 pxor @XMM[8+5], @XMM[5]
2269 movdqa @XMM[7], 0x70(%rsp)
2270 lea 0x70($inp), $inp
2271 pxor @XMM[8+6], @XMM[6]
2272 lea 0x80(%rsp), %rax # pass key schedule
2273 mov %edx, %r10d # pass rounds
2274
2275 call _bsaes_encrypt8
2276
2277 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2278 pxor 0x10(%rsp), @XMM[1]
2279 movdqu @XMM[0], 0x00($out) # write output
2280 pxor 0x20(%rsp), @XMM[4]
2281 movdqu @XMM[1], 0x10($out)
2282 pxor 0x30(%rsp), @XMM[6]
2283 movdqu @XMM[4], 0x20($out)
2284 pxor 0x40(%rsp), @XMM[3]
2285 movdqu @XMM[6], 0x30($out)
2286 pxor 0x50(%rsp), @XMM[7]
2287 movdqu @XMM[3], 0x40($out)
2288 pxor 0x60(%rsp), @XMM[2]
2289 movdqu @XMM[7], 0x50($out)
2290 movdqu @XMM[2], 0x60($out)
2291 lea 0x70($out), $out
2292
2293 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2294 jmp .Lxts_enc_done
2295.align 16
2296.Lxts_enc_6:
2297 pxor @XMM[8+4], @XMM[4]
2298 lea 0x60($inp), $inp
2299 pxor @XMM[8+5], @XMM[5]
2300 lea 0x80(%rsp), %rax # pass key schedule
2301 mov %edx, %r10d # pass rounds
2302
2303 call _bsaes_encrypt8
2304
2305 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2306 pxor 0x10(%rsp), @XMM[1]
2307 movdqu @XMM[0], 0x00($out) # write output
2308 pxor 0x20(%rsp), @XMM[4]
2309 movdqu @XMM[1], 0x10($out)
2310 pxor 0x30(%rsp), @XMM[6]
2311 movdqu @XMM[4], 0x20($out)
2312 pxor 0x40(%rsp), @XMM[3]
2313 movdqu @XMM[6], 0x30($out)
2314 pxor 0x50(%rsp), @XMM[7]
2315 movdqu @XMM[3], 0x40($out)
2316 movdqu @XMM[7], 0x50($out)
2317 lea 0x60($out), $out
2318
2319 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2320 jmp .Lxts_enc_done
2321.align 16
2322.Lxts_enc_5:
2323 pxor @XMM[8+3], @XMM[3]
2324 lea 0x50($inp), $inp
2325 pxor @XMM[8+4], @XMM[4]
2326 lea 0x80(%rsp), %rax # pass key schedule
2327 mov %edx, %r10d # pass rounds
2328
2329 call _bsaes_encrypt8
2330
2331 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2332 pxor 0x10(%rsp), @XMM[1]
2333 movdqu @XMM[0], 0x00($out) # write output
2334 pxor 0x20(%rsp), @XMM[4]
2335 movdqu @XMM[1], 0x10($out)
2336 pxor 0x30(%rsp), @XMM[6]
2337 movdqu @XMM[4], 0x20($out)
2338 pxor 0x40(%rsp), @XMM[3]
2339 movdqu @XMM[6], 0x30($out)
2340 movdqu @XMM[3], 0x40($out)
2341 lea 0x50($out), $out
2342
2343 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2344 jmp .Lxts_enc_done
2345.align 16
2346.Lxts_enc_4:
2347 pxor @XMM[8+2], @XMM[2]
2348 lea 0x40($inp), $inp
2349 pxor @XMM[8+3], @XMM[3]
2350 lea 0x80(%rsp), %rax # pass key schedule
2351 mov %edx, %r10d # pass rounds
2352
2353 call _bsaes_encrypt8
2354
2355 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2356 pxor 0x10(%rsp), @XMM[1]
2357 movdqu @XMM[0], 0x00($out) # write output
2358 pxor 0x20(%rsp), @XMM[4]
2359 movdqu @XMM[1], 0x10($out)
2360 pxor 0x30(%rsp), @XMM[6]
2361 movdqu @XMM[4], 0x20($out)
2362 movdqu @XMM[6], 0x30($out)
2363 lea 0x40($out), $out
2364
2365 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2366 jmp .Lxts_enc_done
2367.align 16
2368.Lxts_enc_3:
2369 pxor @XMM[8+1], @XMM[1]
2370 lea 0x30($inp), $inp
2371 pxor @XMM[8+2], @XMM[2]
2372 lea 0x80(%rsp), %rax # pass key schedule
2373 mov %edx, %r10d # pass rounds
2374
2375 call _bsaes_encrypt8
2376
2377 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2378 pxor 0x10(%rsp), @XMM[1]
2379 movdqu @XMM[0], 0x00($out) # write output
2380 pxor 0x20(%rsp), @XMM[4]
2381 movdqu @XMM[1], 0x10($out)
2382 movdqu @XMM[4], 0x20($out)
2383 lea 0x30($out), $out
2384
2385 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2386 jmp .Lxts_enc_done
2387.align 16
2388.Lxts_enc_2:
2389 pxor @XMM[8+0], @XMM[0]
2390 lea 0x20($inp), $inp
2391 pxor @XMM[8+1], @XMM[1]
2392 lea 0x80(%rsp), %rax # pass key schedule
2393 mov %edx, %r10d # pass rounds
2394
2395 call _bsaes_encrypt8
2396
2397 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2398 pxor 0x10(%rsp), @XMM[1]
2399 movdqu @XMM[0], 0x00($out) # write output
2400 movdqu @XMM[1], 0x10($out)
2401 lea 0x20($out), $out
2402
2403 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2404 jmp .Lxts_enc_done
2405.align 16
2406.Lxts_enc_1:
2407 pxor @XMM[0], @XMM[8]
2408 lea 0x10($inp), $inp
2409 movdqa @XMM[8], 0x20(%rbp)
2410 lea 0x20(%rbp), $arg1
2411 lea 0x20(%rbp), $arg2
2412 lea ($key), $arg3
2413 call asm_AES_encrypt # doesn't touch %xmm
2414 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2415 #pxor @XMM[8], @XMM[0]
2416 #lea 0x80(%rsp), %rax # pass key schedule
2417 #mov %edx, %r10d # pass rounds
2418 #call _bsaes_encrypt8
2419 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2420 movdqu @XMM[0], 0x00($out) # write output
2421 lea 0x10($out), $out
2422
2423 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2424
2425.Lxts_enc_done:
2426 and \$15, %ebx
2427 jz .Lxts_enc_ret
2428 mov $out, %rdx
2429
2430.Lxts_enc_steal:
2431 movzb ($inp), %eax
2432 movzb -16(%rdx), %ecx
2433 lea 1($inp), $inp
2434 mov %al, -16(%rdx)
2435 mov %cl, 0(%rdx)
2436 lea 1(%rdx), %rdx
2437 sub \$1,%ebx
2438 jnz .Lxts_enc_steal
2439
2440 movdqu -16($out), @XMM[0]
2441 lea 0x20(%rbp), $arg1
2442 pxor @XMM[7], @XMM[0]
2443 lea 0x20(%rbp), $arg2
2444 movdqa @XMM[0], 0x20(%rbp)
2445 lea ($key), $arg3
2446 call asm_AES_encrypt # doesn't touch %xmm
2447 pxor 0x20(%rbp), @XMM[7]
2448 movdqu @XMM[7], -16($out)
2449
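The steal loop above is standard XTS ciphertext stealing: the r trailing plaintext bytes are spliced over the head of the last full ciphertext block, the displaced ciphertext bytes become the short final block, and the spliced block is re-encrypted XEX-style under the tweak left in @XMM[7]. A sketch with hypothetical names, where out points just past the last full output block and AES_encrypt stands in for asm_AES_encrypt:

#include <stdint.h>
#include <string.h>

#include <openssl/aes.h>

static void
xts_enc_steal_ref(const uint8_t *tail, uint8_t *out, size_t r,
    const AES_KEY *key, const uint8_t T[16])
{
	uint8_t blk[16];
	size_t i;

	memcpy(blk, out - 16, 16);	/* C_{m-1}, already written */
	memcpy(out, blk, r);		/* its first r bytes become the short C_m */
	memcpy(blk, tail, r);		/* splice the partial plaintext on top */
	for (i = 0; i < 16; i++)
		blk[i] ^= T[i];		/* XEX: whiten with the tweak */
	AES_encrypt(blk, blk, key);
	for (i = 0; i < 16; i++)
		blk[i] ^= T[i];
	memcpy(out - 16, blk, 16);	/* re-encrypted block replaces C_{m-1} */
}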
2450.Lxts_enc_ret:
2451 lea (%rsp), %rax
2452 pxor %xmm0, %xmm0
2453.Lxts_enc_bzero: # wipe key schedule [if any]
2454 movdqa %xmm0, 0x00(%rax)
2455 movdqa %xmm0, 0x10(%rax)
2456 lea 0x20(%rax), %rax
2457 cmp %rax, %rbp
2458 ja .Lxts_enc_bzero
2459
2460 lea (%rbp),%rsp # restore %rsp
2461___
2462$code.=<<___ if ($win64);
2463 movaps 0x40(%rbp), %xmm6
2464 movaps 0x50(%rbp), %xmm7
2465 movaps 0x60(%rbp), %xmm8
2466 movaps 0x70(%rbp), %xmm9
2467 movaps 0x80(%rbp), %xmm10
2468 movaps 0x90(%rbp), %xmm11
2469 movaps 0xa0(%rbp), %xmm12
2470 movaps 0xb0(%rbp), %xmm13
2471 movaps 0xc0(%rbp), %xmm14
2472 movaps 0xd0(%rbp), %xmm15
2473 lea 0xa0(%rbp), %rsp
2474___
2475$code.=<<___;
2476 mov 0x48(%rsp), %r15
2477 mov 0x50(%rsp), %r14
2478 mov 0x58(%rsp), %r13
2479 mov 0x60(%rsp), %r12
2480 mov 0x68(%rsp), %rbx
2481 mov 0x70(%rsp), %rax
2482 lea 0x78(%rsp), %rsp
2483 mov %rax, %rbp
2484.Lxts_enc_epilogue:
2485 ret
2486.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2487
2488.globl bsaes_xts_decrypt
2489.type bsaes_xts_decrypt,\@abi-omnipotent
2490.align 16
2491bsaes_xts_decrypt:
2492 _CET_ENDBR
2493 mov %rsp, %rax
2494.Lxts_dec_prologue:
2495 push %rbp
2496 push %rbx
2497 push %r12
2498 push %r13
2499 push %r14
2500 push %r15
2501 lea -0x48(%rsp), %rsp
2502___
2503$code.=<<___ if ($win64);
2504 mov 0xa0(%rsp),$arg5 # pull key2
2505 mov 0xa8(%rsp),$arg6 # pull ivp
2506 lea -0xa0(%rsp), %rsp
2507 movaps %xmm6, 0x40(%rsp)
2508 movaps %xmm7, 0x50(%rsp)
2509 movaps %xmm8, 0x60(%rsp)
2510 movaps %xmm9, 0x70(%rsp)
2511 movaps %xmm10, 0x80(%rsp)
2512 movaps %xmm11, 0x90(%rsp)
2513 movaps %xmm12, 0xa0(%rsp)
2514 movaps %xmm13, 0xb0(%rsp)
2515 movaps %xmm14, 0xc0(%rsp)
2516 movaps %xmm15, 0xd0(%rsp)
2517.Lxts_dec_body:
2518___
2519$code.=<<___;
2520 mov %rsp, %rbp # backup %rsp
2521 mov $arg1, $inp # backup arguments
2522 mov $arg2, $out
2523 mov $arg3, $len
2524 mov $arg4, $key
2525
2526 lea ($arg6), $arg1
2527 lea 0x20(%rbp), $arg2
2528 lea ($arg5), $arg3
2529 call asm_AES_encrypt # generate initial tweak
2530
2531 mov 240($key), %eax # rounds
2532 mov $len, %rbx # backup $len
2533
2534 mov %eax, %edx # rounds
2535 shl \$7, %rax # 128 bytes per inner round key
2536 sub \$`128-32`, %rax # size of bit-sliced key schedule
2537 sub %rax, %rsp
2538
2539 mov %rsp, %rax # pass key schedule
2540 mov $key, %rcx # pass key
2541 mov %edx, %r10d # pass rounds
2542 call _bsaes_key_convert
2543 pxor (%rsp), %xmm7 # fix up round 0 key
2544 movdqa %xmm6, (%rax) # save last round key
2545 movdqa %xmm7, (%rsp)
2546
2547 xor %eax, %eax # if ($len%16) len-=16;
2548 and \$-16, $len
2549 test \$15, %ebx
2550 setnz %al
2551 shl \$4, %rax
2552 sub %rax, $len
2553
2554 sub \$0x80, %rsp # place for tweak[8]
2555 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2556
2557 pxor $twtmp, $twtmp
2558 movdqa .Lxts_magic(%rip), $twmask
2559 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2560
2561 sub \$0x80, $len
2562 jc .Lxts_dec_short
2563 jmp .Lxts_dec_loop
2564
2565.align 16
2566.Lxts_dec_loop:
2567___
2568 for ($i=0;$i<7;$i++) {
2569 $code.=<<___;
2570 pshufd \$0x13, $twtmp, $twres
2571 pxor $twtmp, $twtmp
2572 movdqa @XMM[7], @XMM[$i]
2573 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2574 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2575 pand $twmask, $twres # isolate carry and residue
2576 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2577 pxor $twres, @XMM[7]
2578___
2579 $code.=<<___ if ($i>=1);
2580 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2581___
2582 $code.=<<___ if ($i>=2);
2583 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2584___
2585 }
2586$code.=<<___;
2587 movdqu 0x60($inp), @XMM[8+6]
2588 pxor @XMM[8+5], @XMM[5]
2589 movdqu 0x70($inp), @XMM[8+7]
2590 lea 0x80($inp), $inp
2591 movdqa @XMM[7], 0x70(%rsp)
2592 pxor @XMM[8+6], @XMM[6]
2593 lea 0x80(%rsp), %rax # pass key schedule
2594 pxor @XMM[8+7], @XMM[7]
2595 mov %edx, %r10d # pass rounds
2596
2597 call _bsaes_decrypt8
2598
2599 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2600 pxor 0x10(%rsp), @XMM[1]
2601 movdqu @XMM[0], 0x00($out) # write output
2602 pxor 0x20(%rsp), @XMM[6]
2603 movdqu @XMM[1], 0x10($out)
2604 pxor 0x30(%rsp), @XMM[4]
2605 movdqu @XMM[6], 0x20($out)
2606 pxor 0x40(%rsp), @XMM[2]
2607 movdqu @XMM[4], 0x30($out)
2608 pxor 0x50(%rsp), @XMM[7]
2609 movdqu @XMM[2], 0x40($out)
2610 pxor 0x60(%rsp), @XMM[3]
2611 movdqu @XMM[7], 0x50($out)
2612 pxor 0x70(%rsp), @XMM[5]
2613 movdqu @XMM[3], 0x60($out)
2614 movdqu @XMM[5], 0x70($out)
2615 lea 0x80($out), $out
2616
2617 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2618 pxor $twtmp, $twtmp
2619 movdqa .Lxts_magic(%rip), $twmask
2620 pcmpgtd @XMM[7], $twtmp
2621 pshufd \$0x13, $twtmp, $twres
2622 pxor $twtmp, $twtmp
2623 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2624 pand $twmask, $twres # isolate carry and residue
2625 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2626 pxor $twres, @XMM[7]
2627
2628 sub \$0x80,$len
2629 jnc .Lxts_dec_loop
2630
2631.Lxts_dec_short:
2632 add \$0x80, $len
2633 jz .Lxts_dec_done
2634___
2635 for ($i=0;$i<7;$i++) {
2636 $code.=<<___;
2637 pshufd \$0x13, $twtmp, $twres
2638 pxor $twtmp, $twtmp
2639 movdqa @XMM[7], @XMM[$i]
2640 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2641 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2642 pand $twmask, $twres # isolate carry and residue
2643 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2644 pxor $twres, @XMM[7]
2645___
2646 $code.=<<___ if ($i>=1);
2647 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2648 cmp \$`0x10*$i`,$len
2649 je .Lxts_dec_$i
2650___
2651 $code.=<<___ if ($i>=2);
2652 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2653___
2654 }
2655$code.=<<___;
2656 movdqu 0x60($inp), @XMM[8+6]
2657 pxor @XMM[8+5], @XMM[5]
2658 movdqa @XMM[7], 0x70(%rsp)
2659 lea 0x70($inp), $inp
2660 pxor @XMM[8+6], @XMM[6]
2661 lea 0x80(%rsp), %rax # pass key schedule
2662 mov %edx, %r10d # pass rounds
2663
2664 call _bsaes_decrypt8
2665
2666 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2667 pxor 0x10(%rsp), @XMM[1]
2668 movdqu @XMM[0], 0x00($out) # write output
2669 pxor 0x20(%rsp), @XMM[6]
2670 movdqu @XMM[1], 0x10($out)
2671 pxor 0x30(%rsp), @XMM[4]
2672 movdqu @XMM[6], 0x20($out)
2673 pxor 0x40(%rsp), @XMM[2]
2674 movdqu @XMM[4], 0x30($out)
2675 pxor 0x50(%rsp), @XMM[7]
2676 movdqu @XMM[2], 0x40($out)
2677 pxor 0x60(%rsp), @XMM[3]
2678 movdqu @XMM[7], 0x50($out)
2679 movdqu @XMM[3], 0x60($out)
2680 lea 0x70($out), $out
2681
2682 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2683 jmp .Lxts_dec_done
2684.align 16
2685.Lxts_dec_6:
2686 pxor @XMM[8+4], @XMM[4]
2687 lea 0x60($inp), $inp
2688 pxor @XMM[8+5], @XMM[5]
2689 lea 0x80(%rsp), %rax # pass key schedule
2690 mov %edx, %r10d # pass rounds
2691
2692 call _bsaes_decrypt8
2693
2694 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2695 pxor 0x10(%rsp), @XMM[1]
2696 movdqu @XMM[0], 0x00($out) # write output
2697 pxor 0x20(%rsp), @XMM[6]
2698 movdqu @XMM[1], 0x10($out)
2699 pxor 0x30(%rsp), @XMM[4]
2700 movdqu @XMM[6], 0x20($out)
2701 pxor 0x40(%rsp), @XMM[2]
2702 movdqu @XMM[4], 0x30($out)
2703 pxor 0x50(%rsp), @XMM[7]
2704 movdqu @XMM[2], 0x40($out)
2705 movdqu @XMM[7], 0x50($out)
2706 lea 0x60($out), $out
2707
2708 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2709 jmp .Lxts_dec_done
2710.align 16
2711.Lxts_dec_5:
2712 pxor @XMM[8+3], @XMM[3]
2713 lea 0x50($inp), $inp
2714 pxor @XMM[8+4], @XMM[4]
2715 lea 0x80(%rsp), %rax # pass key schedule
2716 mov %edx, %r10d # pass rounds
2717
2718 call _bsaes_decrypt8
2719
2720 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2721 pxor 0x10(%rsp), @XMM[1]
2722 movdqu @XMM[0], 0x00($out) # write output
2723 pxor 0x20(%rsp), @XMM[6]
2724 movdqu @XMM[1], 0x10($out)
2725 pxor 0x30(%rsp), @XMM[4]
2726 movdqu @XMM[6], 0x20($out)
2727 pxor 0x40(%rsp), @XMM[2]
2728 movdqu @XMM[4], 0x30($out)
2729 movdqu @XMM[2], 0x40($out)
2730 lea 0x50($out), $out
2731
2732 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2733 jmp .Lxts_dec_done
2734.align 16
2735.Lxts_dec_4:
2736 pxor @XMM[8+2], @XMM[2]
2737 lea 0x40($inp), $inp
2738 pxor @XMM[8+3], @XMM[3]
2739 lea 0x80(%rsp), %rax # pass key schedule
2740 mov %edx, %r10d # pass rounds
2741
2742 call _bsaes_decrypt8
2743
2744 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2745 pxor 0x10(%rsp), @XMM[1]
2746 movdqu @XMM[0], 0x00($out) # write output
2747 pxor 0x20(%rsp), @XMM[6]
2748 movdqu @XMM[1], 0x10($out)
2749 pxor 0x30(%rsp), @XMM[4]
2750 movdqu @XMM[6], 0x20($out)
2751 movdqu @XMM[4], 0x30($out)
2752 lea 0x40($out), $out
2753
2754 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2755 jmp .Lxts_dec_done
2756.align 16
2757.Lxts_dec_3:
2758 pxor @XMM[8+1], @XMM[1]
2759 lea 0x30($inp), $inp
2760 pxor @XMM[8+2], @XMM[2]
2761 lea 0x80(%rsp), %rax # pass key schedule
2762 mov %edx, %r10d # pass rounds
2763
2764 call _bsaes_decrypt8
2765
2766 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2767 pxor 0x10(%rsp), @XMM[1]
2768 movdqu @XMM[0], 0x00($out) # write output
2769 pxor 0x20(%rsp), @XMM[6]
2770 movdqu @XMM[1], 0x10($out)
2771 movdqu @XMM[6], 0x20($out)
2772 lea 0x30($out), $out
2773
2774 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2775 jmp .Lxts_dec_done
2776.align 16
2777.Lxts_dec_2:
2778 pxor @XMM[8+0], @XMM[0]
2779 lea 0x20($inp), $inp
2780 pxor @XMM[8+1], @XMM[1]
2781 lea 0x80(%rsp), %rax # pass key schedule
2782 mov %edx, %r10d # pass rounds
2783
2784 call _bsaes_decrypt8
2785
2786 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2787 pxor 0x10(%rsp), @XMM[1]
2788 movdqu @XMM[0], 0x00($out) # write output
2789 movdqu @XMM[1], 0x10($out)
2790 lea 0x20($out), $out
2791
2792 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2793 jmp .Lxts_dec_done
2794.align 16
2795.Lxts_dec_1:
2796 pxor @XMM[0], @XMM[8]
2797 lea 0x10($inp), $inp
2798 movdqa @XMM[8], 0x20(%rbp)
2799 lea 0x20(%rbp), $arg1
2800 lea 0x20(%rbp), $arg2
2801 lea ($key), $arg3
2802 call asm_AES_decrypt # doesn't touch %xmm
2803 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2804 #pxor @XMM[8], @XMM[0]
2805 #lea 0x80(%rsp), %rax # pass key schedule
2806 #mov %edx, %r10d # pass rounds
2807 #call _bsaes_decrypt8
2808 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2809 movdqu @XMM[0], 0x00($out) # write output
2810 lea 0x10($out), $out
2811
2812 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2813
2814.Lxts_dec_done:
2815 and \$15, %ebx
2816 jz .Lxts_dec_ret
2817
2818 pxor $twtmp, $twtmp
2819 movdqa .Lxts_magic(%rip), $twmask
2820 pcmpgtd @XMM[7], $twtmp
2821 pshufd \$0x13, $twtmp, $twres
2822 movdqa @XMM[7], @XMM[6]
2823 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2824 pand $twmask, $twres # isolate carry and residue
2825 movdqu ($inp), @XMM[0]
2826 pxor $twres, @XMM[7]
2827
2828 lea 0x20(%rbp), $arg1
2829 pxor @XMM[7], @XMM[0]
2830 lea 0x20(%rbp), $arg2
2831 movdqa @XMM[0], 0x20(%rbp)
2832 lea ($key), $arg3
2833 call asm_AES_decrypt # doesn't touch %xmm
2834 pxor 0x20(%rbp), @XMM[7]
2835 mov $out, %rdx
2836 movdqu @XMM[7], ($out)
2837
2838.Lxts_dec_steal:
2839 movzb 16($inp), %eax
2840 movzb (%rdx), %ecx
2841 lea 1($inp), $inp
2842 mov %al, (%rdx)
2843 mov %cl, 16(%rdx)
2844 lea 1(%rdx), %rdx
2845 sub \$1,%ebx
2846 jnz .Lxts_dec_steal
2847
2848 movdqu ($out), @XMM[0]
2849 lea 0x20(%rbp), $arg1
2850 pxor @XMM[6], @XMM[0]
2851 lea 0x20(%rbp), $arg2
2852 movdqa @XMM[0], 0x20(%rbp)
2853 lea ($key), $arg3
2854 call asm_AES_decrypt # doesn't touch %xmm
2855 pxor 0x20(%rbp), @XMM[6]
2856 movdqu @XMM[6], ($out)
2857
2858.Lxts_dec_ret:
2859 lea (%rsp), %rax
2860 pxor %xmm0, %xmm0
2861.Lxts_dec_bzero: # wipe key schedule [if any]
2862 movdqa %xmm0, 0x00(%rax)
2863 movdqa %xmm0, 0x10(%rax)
2864 lea 0x20(%rax), %rax
2865 cmp %rax, %rbp
2866 ja .Lxts_dec_bzero
2867
2868 lea (%rbp),%rsp # restore %rsp
2869___
2870$code.=<<___ if ($win64);
2871 movaps 0x40(%rbp), %xmm6
2872 movaps 0x50(%rbp), %xmm7
2873 movaps 0x60(%rbp), %xmm8
2874 movaps 0x70(%rbp), %xmm9
2875 movaps 0x80(%rbp), %xmm10
2876 movaps 0x90(%rbp), %xmm11
2877 movaps 0xa0(%rbp), %xmm12
2878 movaps 0xb0(%rbp), %xmm13
2879 movaps 0xc0(%rbp), %xmm14
2880 movaps 0xd0(%rbp), %xmm15
2881 lea 0xa0(%rbp), %rsp
2882___
2883$code.=<<___;
2884 mov 0x48(%rsp), %r15
2885 mov 0x50(%rsp), %r14
2886 mov 0x58(%rsp), %r13
2887 mov 0x60(%rsp), %r12
2888 mov 0x68(%rsp), %rbx
2889 mov 0x70(%rsp), %rax
2890 lea 0x78(%rsp), %rsp
2891 mov %rax, %rbp
2892.Lxts_dec_epilogue:
2893 ret
2894.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2895___
2896}
2897$code.=<<___;
2898.section .rodata
2899.type _bsaes_const,\@object
2900.align 64
2901_bsaes_const:
2902.LM0ISR: # InvShiftRows constants
2903 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2904.LISRM0:
2905 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2906.LISR:
2907 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2908.LBS0: # bit-slice constants
2909 .quad 0x5555555555555555, 0x5555555555555555
2910.LBS1:
2911 .quad 0x3333333333333333, 0x3333333333333333
2912.LBS2:
2913 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2914.LSR: # shiftrows constants
2915 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2916.LSRM0:
2917 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2918.LM0SR:
2919 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2920.LSWPUP: # byte-swap upper dword
2921 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2922.LSWPUPM0SR:
2923 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2924.LADD1: # counter increment constants
2925 .quad 0x0000000000000000, 0x0000000100000000
2926.LADD2:
2927 .quad 0x0000000000000000, 0x0000000200000000
2928.LADD3:
2929 .quad 0x0000000000000000, 0x0000000300000000
2930.LADD4:
2931 .quad 0x0000000000000000, 0x0000000400000000
2932.LADD5:
2933 .quad 0x0000000000000000, 0x0000000500000000
2934.LADD6:
2935 .quad 0x0000000000000000, 0x0000000600000000
2936.LADD7:
2937 .quad 0x0000000000000000, 0x0000000700000000
2938.LADD8:
2939 .quad 0x0000000000000000, 0x0000000800000000
2940.Lxts_magic:
2941 .long 0x87,0,1,0
2942.Lmasks:
2943 .quad 0x0101010101010101, 0x0101010101010101
2944 .quad 0x0202020202020202, 0x0202020202020202
2945 .quad 0x0404040404040404, 0x0404040404040404
2946 .quad 0x0808080808080808, 0x0808080808080808
2947.LM0:
2948 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2949.L63:
2950 .quad 0x6363636363636363, 0x6363636363636363
2951.align 64
2952.size _bsaes_const,.-_bsaes_const
2953.text
2954___
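The .LBS0/.LBS1/.LBS2 masks above (the 0x55.., 0x33.., 0x0f.. patterns) are the classic constants for the swapmove bit-matrix transposition that moves eight AES states into and out of bit-sliced form. A scalar sketch of a single exchange step:

#include <stdint.h>

/* exchange the bits selected by mask between a>>n and b */
static void
swapmove(uint64_t *a, uint64_t *b, int n, uint64_t mask)
{
	uint64_t t = ((*a >> n) ^ *b) & mask;

	*b ^= t;
	*a ^= t << n;
}
/* e.g. swapmove(&r0, &r1, 1, 0x5555555555555555ULL) exchanges
 * r0's odd-position bits with r1's even-position bits */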
2955
2956# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2957# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2958if ($win64) {
2959$rec="%rcx";
2960$frame="%rdx";
2961$context="%r8";
2962$disp="%r9";
2963
2964$code.=<<___;
2965.extern __imp_RtlVirtualUnwind
2966.type se_handler,\@abi-omnipotent
2967.align 16
2968se_handler:
2969 _CET_ENDBR
2970 push %rsi
2971 push %rdi
2972 push %rbx
2973 push %rbp
2974 push %r12
2975 push %r13
2976 push %r14
2977 push %r15
2978 pushfq
2979 sub \$64,%rsp
2980
2981 mov 120($context),%rax # pull context->Rax
2982 mov 248($context),%rbx # pull context->Rip
2983
2984 mov 8($disp),%rsi # disp->ImageBase
2985 mov 56($disp),%r11 # disp->HandlerData
2986
2987 mov 0(%r11),%r10d # HandlerData[0]
2988 lea (%rsi,%r10),%r10 # prologue label
2989 cmp %r10,%rbx # context->Rip<prologue label
2990 jb .Lin_prologue
2991
2992 mov 152($context),%rax # pull context->Rsp
2993
2994 mov 4(%r11),%r10d # HandlerData[1]
2995 lea (%rsi,%r10),%r10 # epilogue label
2996 cmp %r10,%rbx # context->Rip>=epilogue label
2997 jae .Lin_prologue
2998
2999 mov 160($context),%rax # pull context->Rbp
3000
3001 lea 0x40(%rax),%rsi # %xmm save area
3002 lea 512($context),%rdi # &context.Xmm6
3003 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
3004 .long 0xa548f3fc # cld; rep movsq
3005 lea 0xa0(%rax),%rax # adjust stack pointer
3006
3007 mov 0x70(%rax),%rbp
3008 mov 0x68(%rax),%rbx
3009 mov 0x60(%rax),%r12
3010 mov 0x58(%rax),%r13
3011 mov 0x50(%rax),%r14
3012 mov 0x48(%rax),%r15
3013 lea 0x78(%rax),%rax # adjust stack pointer
3014 mov %rbx,144($context) # restore context->Rbx
3015 mov %rbp,160($context) # restore context->Rbp
3016 mov %r12,216($context) # restore context->R12
3017 mov %r13,224($context) # restore context->R13
3018 mov %r14,232($context) # restore context->R14
3019 mov %r15,240($context) # restore context->R15
3020
3021.Lin_prologue:
3022 mov %rax,152($context) # restore context->Rsp
3023
3024 mov 40($disp),%rdi # disp->ContextRecord
3025 mov $context,%rsi # context
3026 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3027 .long 0xa548f3fc # cld; rep movsq
3028
3029 mov $disp,%rsi
3030 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3031 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3032 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3033 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3034 mov 40(%rsi),%r10 # disp->ContextRecord
3035 lea 56(%rsi),%r11 # &disp->HandlerData
3036 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3037 mov %r10,32(%rsp) # arg5
3038 mov %r11,40(%rsp) # arg6
3039 mov %r12,48(%rsp) # arg7
3040 mov %rcx,56(%rsp) # arg8, (NULL)
3041 call *__imp_RtlVirtualUnwind(%rip)
3042
3043 mov \$1,%eax # ExceptionContinueSearch
3044 add \$64,%rsp
3045 popfq
3046 pop %r15
3047 pop %r14
3048 pop %r13
3049 pop %r12
3050 pop %rbp
3051 pop %rbx
3052 pop %rdi
3053 pop %rsi
3054 ret
3055.size se_handler,.-se_handler
3056
3057.section .pdata
3058.align 4
3059___
3060$code.=<<___ if ($ecb);
3061 .rva .Lecb_enc_prologue
3062 .rva .Lecb_enc_epilogue
3063 .rva .Lecb_enc_info
3064
3065 .rva .Lecb_dec_prologue
3066 .rva .Lecb_dec_epilogue
3067 .rva .Lecb_dec_info
3068___
3069$code.=<<___;
3070 .rva .Lcbc_dec_prologue
3071 .rva .Lcbc_dec_epilogue
3072 .rva .Lcbc_dec_info
3073
3074 .rva .Lctr_enc_prologue
3075 .rva .Lctr_enc_epilogue
3076 .rva .Lctr_enc_info
3077
3078 .rva .Lxts_enc_prologue
3079 .rva .Lxts_enc_epilogue
3080 .rva .Lxts_enc_info
3081
3082 .rva .Lxts_dec_prologue
3083 .rva .Lxts_dec_epilogue
3084 .rva .Lxts_dec_info
3085
3086.section .xdata
3087.align 8
3088___
3089$code.=<<___ if ($ecb);
3090.Lecb_enc_info:
3091 .byte 9,0,0,0
3092 .rva se_handler
3093 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3094.Lecb_dec_info:
3095 .byte 9,0,0,0
3096 .rva se_handler
3097 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3098___
3099$code.=<<___;
3100.Lcbc_dec_info:
3101 .byte 9,0,0,0
3102 .rva se_handler
3103 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3104.Lctr_enc_info:
3105 .byte 9,0,0,0
3106 .rva se_handler
3107 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3108.Lxts_enc_info:
3109 .byte 9,0,0,0
3110 .rva se_handler
3111 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3112.Lxts_dec_info:
3113 .byte 9,0,0,0
3114 .rva se_handler
3115 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3116___
3117}
3118
3119$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3120
3121print $code;
3122
3123close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl
deleted file mode 100644
index 6e7bd36d05..0000000000
--- a/src/lib/libcrypto/aes/asm/vpaes-x86.pl
+++ /dev/null
@@ -1,911 +0,0 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Port of vpaes-x86_64.pl as a 32-bit "almost" drop-in replacement for
17# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (it doesn't have to if called from
19# EVP only). "Drop-in" implies that this module neither shares key
20# schedule structure with the original nor makes assumptions
21# about its alignment...
22#
23# Performance summary. The aes-586.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with a 128-bit key; the vpaes-x86.pl column lists
26# [also large-block CBC] encrypt/decrypt results.
27#
28# aes-586.pl vpaes-x86.pl
29#
30# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
31# Nehalem 27.9/40.4/18.1 10.3/12.0
32# Atom 102./119./60.1 64.5/85.3(***)
33#
34# (*)	"Hyper-threading" in this context refers to cache shared
35#	among multiple cores rather than specifically to Intel HTT. As
36#	the vast majority of contemporary cores share cache, the slower
37#	code path is commonplace. In other words, "with-hyper-threading-off"
38#	results are presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***) The less impressive improvement on Core 2 and Atom is due to
43#	slow pshufb; it is still a respectable +32%/65% improvement on
44#	Core 2 and +58%/40% on Atom (as implied, over the
45#	"hyper-threading-safe" code path).
46#
47# <appro@openssl.org>
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50push(@INC,"${dir}","${dir}../../perlasm");
51require "x86asm.pl";
52
53&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
54
55$PREFIX="vpaes";
56
57my ($round, $base, $magic, $key, $const, $inp, $out)=
58 ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
59
60 &rodataseg();
61&static_label("_vpaes_consts");
62&static_label("_vpaes_schedule_low_round");
63
64&set_label("_vpaes_consts",64);
65$k_inv=-0x30; # inv, inva
66 &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
67 &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
68
69$k_s0F=-0x10; # s0F
70 &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
71
72$k_ipt=0x00; # input transform (lo, hi)
73 &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
74 &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
75
76$k_sb1=0x20; # sb1u, sb1t
77 &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
78 &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
79$k_sb2=0x40; # sb2u, sb2t
80 &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
81 &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
82$k_sbo=0x60; # sbou, sbot
83 &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
84 &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
85
86$k_mc_forward=0x80; # mc_forward
87 &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
88 &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
89 &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
90 &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
91
92$k_mc_backward=0xc0; # mc_backward
93 &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
94 &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
95 &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
96 &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
97
98$k_sr=0x100; # sr
99 &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
100 &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
101 &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
102 &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
103
104$k_rcon=0x140; # rcon
105 &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
106
107$k_s63=0x150; # s63: all equal to 0x63 transformed
108 &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
109
110$k_opt=0x160; # output transform
111 &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
112 &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
113
114$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
115 &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
116 &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
117##
118## Decryption stuff
119## Key schedule constants
120##
121$k_dksd=0x1a0; # decryption key schedule: invskew x*D
122 &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
123 &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
124$k_dksb=0x1c0; # decryption key schedule: invskew x*B
125 &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
126 &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
127$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
128 &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
129 &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
130$k_dks9=0x200; # decryption key schedule: invskew x*9
131 &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
132 &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
133
134##
135## Decryption stuff
136## Round function constants
137##
138$k_dipt=0x220; # decryption input transform
139 &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
140 &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
141
142$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
143 &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
144 &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
145$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
146 &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
147 &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
148$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
149 &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
150 &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
151$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
152 &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
153 &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
154$k_dsbo=0x2c0; # decryption sbox final output
155 &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
156 &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
157 &previous();
158
159&function_begin_B("_vpaes_preheat");
160 &movdqa ("xmm7",&QWP($k_inv,$const));
161 &movdqa ("xmm6",&QWP($k_s0F,$const));
162 &ret ();
163&function_end_B("_vpaes_preheat");
164
165##
166## _vpaes_encrypt_core
167##
168## AES-encrypt %xmm0.
169##
170## Inputs:
171## %xmm0 = input
172## %xmm6-%xmm7 as in _vpaes_preheat
173## (%edx) = scheduled keys
174##
175## Output in %xmm0
176## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
177##
178##
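The building block behind this routine (and the rest of the module) is pshufb used as sixteen parallel 4-bit table lookups: the pandn/psrld/pand sequence splits each state byte into its high and low nibble against the k_s0F mask, and each nibble then indexes a 16-byte table. A scalar sketch of the lookup primitive, assuming the index high bits are clear as they are here:

#include <stdint.h>

static void
nibble_lookup(const uint8_t table[16], const uint8_t idx[16], uint8_t out[16])
{
	int i;

	for (i = 0; i < 16; i++)	/* pshufb semantics, high bits clear */
		out[i] = table[idx[i] & 0x0f];
}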
179&function_begin_B("_vpaes_encrypt_core");
180 &mov ($magic,16);
181 &mov ($round,&DWP(240,$key));
182	&movdqa	("xmm1","xmm6");
183 &movdqa ("xmm2",&QWP($k_ipt,$const));
184 &pandn ("xmm1","xmm0");
185 &movdqu ("xmm5",&QWP(0,$key));
186 &psrld ("xmm1",4);
187 &pand ("xmm0","xmm6");
188 &pshufb ("xmm2","xmm0");
189 &movdqa ("xmm0",&QWP($k_ipt+16,$const));
190 &pshufb ("xmm0","xmm1");
191 &pxor ("xmm2","xmm5");
192 &pxor ("xmm0","xmm2");
193 &add ($key,16);
194 &lea ($base,&DWP($k_mc_backward,$const));
195 &jmp (&label("enc_entry"));
196
197
198&set_label("enc_loop",16);
199 # middle of middle round
200 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
201 &pshufb ("xmm4","xmm2"); # 4 = sb1u
202 &pxor ("xmm4","xmm5"); # 4 = sb1u + k
203 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
204 &pshufb ("xmm0","xmm3"); # 0 = sb1t
205 &pxor ("xmm0","xmm4"); # 0 = A
206 &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
207 &pshufb ("xmm5","xmm2"); # 4 = sb2u
208 &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
209 &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
210 &pshufb ("xmm2","xmm3"); # 2 = sb2t
211 &pxor ("xmm2","xmm5"); # 2 = 2A
212 &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
213 &movdqa ("xmm3","xmm0"); # 3 = A
214 &pshufb ("xmm0","xmm1"); # 0 = B
215 &add ($key,16); # next key
216 &pxor ("xmm0","xmm2"); # 0 = 2A+B
217 &pshufb ("xmm3","xmm4"); # 3 = D
218 &add ($magic,16); # next mc
219 &pxor ("xmm3","xmm0"); # 3 = 2A+B+D
220 &pshufb ("xmm0","xmm1"); # 0 = 2B+C
221 &and ($magic,0x30); # ... mod 4
222 &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
223 &sub ($round,1); # nr--
224
225&set_label("enc_entry");
226 # top of round
227 &movdqa ("xmm1","xmm6"); # 1 : i
228 &pandn ("xmm1","xmm0"); # 1 = i<<4
229 &psrld ("xmm1",4); # 1 = i
230 &pand ("xmm0","xmm6"); # 0 = k
231 &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
232 &pshufb ("xmm5","xmm0"); # 2 = a/k
233 &pxor ("xmm0","xmm1"); # 0 = j
234 &movdqa ("xmm3","xmm7"); # 3 : 1/i
235 &pshufb ("xmm3","xmm1"); # 3 = 1/i
236 &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
237 &movdqa ("xmm4","xmm7"); # 4 : 1/j
238 &pshufb ("xmm4","xmm0"); # 4 = 1/j
239 &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
240 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
241 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
242 &pxor ("xmm2","xmm0"); # 2 = io
243 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
244 &movdqu ("xmm5",&QWP(0,$key));
245 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
246 &pxor ("xmm3","xmm1"); # 3 = jo
247 &jnz (&label("enc_loop"));
248
249 # middle of last round
250 &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
251 &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
252 &pshufb ("xmm4","xmm2"); # 4 = sbou
253 &pxor ("xmm4","xmm5"); # 4 = sb1u + k
254 &pshufb ("xmm0","xmm3"); # 0 = sb1t
255 &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
256 &pxor ("xmm0","xmm4"); # 0 = A
257 &pshufb ("xmm0","xmm1");
258 &ret ();
259&function_end_B("_vpaes_encrypt_core");
260
261##
262## Decryption core
263##
264## Same API as encryption core.
265##
266&function_begin_B("_vpaes_decrypt_core");
267 &mov ($round,&DWP(240,$key));
268 &lea ($base,&DWP($k_dsbd,$const));
269 &movdqa ("xmm1","xmm6");
270 &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
271 &pandn ("xmm1","xmm0");
272 &mov ($magic,$round);
273	&psrld	("xmm1",4);
274 &movdqu ("xmm5",&QWP(0,$key));
275 &shl ($magic,4);
276 &pand ("xmm0","xmm6");
277 &pshufb ("xmm2","xmm0");
278 &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
279 &xor ($magic,0x30);
280 &pshufb ("xmm0","xmm1");
281 &and ($magic,0x30);
282 &pxor ("xmm2","xmm5");
283 &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
284 &pxor ("xmm0","xmm2");
285 &add ($key,16);
286 &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
287 &jmp (&label("dec_entry"));
288
289&set_label("dec_loop",16);
290##
291## Inverse mix columns
292##
293 &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
294 &pshufb ("xmm4","xmm2"); # 4 = sb9u
295 &pxor ("xmm4","xmm0");
296 &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
297 &pshufb ("xmm0","xmm3"); # 0 = sb9t
298 &pxor ("xmm0","xmm4"); # 0 = ch
299 &add ($key,16); # next round key
300
301 &pshufb ("xmm0","xmm5"); # MC ch
302 &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
303 &pshufb ("xmm4","xmm2"); # 4 = sbdu
304 &pxor ("xmm4","xmm0"); # 4 = ch
305 &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
306 &pshufb ("xmm0","xmm3"); # 0 = sbdt
307 &pxor ("xmm0","xmm4"); # 0 = ch
308 &sub ($round,1); # nr--
309
310 &pshufb ("xmm0","xmm5"); # MC ch
311 &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
312 &pshufb ("xmm4","xmm2"); # 4 = sbbu
313 &pxor ("xmm4","xmm0"); # 4 = ch
314 &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
315 &pshufb ("xmm0","xmm3"); # 0 = sbbt
316 &pxor ("xmm0","xmm4"); # 0 = ch
317
318 &pshufb ("xmm0","xmm5"); # MC ch
319 &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
320 &pshufb ("xmm4","xmm2"); # 4 = sbeu
321 &pxor ("xmm4","xmm0"); # 4 = ch
322 &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
323 &pshufb ("xmm0","xmm3"); # 0 = sbet
324 &pxor ("xmm0","xmm4"); # 0 = ch
325
326 &palignr("xmm5","xmm5",12);
327
328&set_label("dec_entry");
329 # top of round
330 &movdqa ("xmm1","xmm6"); # 1 : i
331 &pandn ("xmm1","xmm0"); # 1 = i<<4
332 &psrld ("xmm1",4); # 1 = i
333 &pand ("xmm0","xmm6"); # 0 = k
334 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
335 &pshufb ("xmm2","xmm0"); # 2 = a/k
336 &pxor ("xmm0","xmm1"); # 0 = j
337 &movdqa ("xmm3","xmm7"); # 3 : 1/i
338 &pshufb ("xmm3","xmm1"); # 3 = 1/i
339 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
340 &movdqa ("xmm4","xmm7"); # 4 : 1/j
341 &pshufb ("xmm4","xmm0"); # 4 = 1/j
342 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
343 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
344 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
345 &pxor ("xmm2","xmm0"); # 2 = io
346 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
347 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
348 &pxor ("xmm3","xmm1"); # 3 = jo
349 &movdqu ("xmm0",&QWP(0,$key));
350 &jnz (&label("dec_loop"));
351
352 # middle of last round
353 &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
354 &pshufb ("xmm4","xmm2"); # 4 = sbou
355 &pxor ("xmm4","xmm0"); # 4 = sb1u + k
356 &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
357 &movdqa ("xmm2",&QWP(0,$magic));
358 &pshufb ("xmm0","xmm3"); # 0 = sb1t
359 &pxor ("xmm0","xmm4"); # 0 = A
360 &pshufb ("xmm0","xmm2");
361 &ret ();
362&function_end_B("_vpaes_decrypt_core");
363
364########################################################
365## ##
366## AES key schedule ##
367## ##
368########################################################
369&function_begin_B("_vpaes_schedule_core");
370 &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
371 &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
372
373 # input transform
374 &movdqa ("xmm3","xmm0");
375 &lea ($base,&DWP($k_ipt,$const));
376 &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
377 &call ("_vpaes_schedule_transform");
378 &movdqa ("xmm7","xmm0");
379
380 &test ($out,$out);
381 &jnz (&label("schedule_am_decrypting"));
382
383 # encrypting, output zeroth round key after transform
384 &movdqu (&QWP(0,$key),"xmm0");
385 &jmp (&label("schedule_go"));
386
387&set_label("schedule_am_decrypting");
388 # decrypting, output zeroth round key after shiftrows
389 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
390 &pshufb ("xmm3","xmm1");
391 &movdqu (&QWP(0,$key),"xmm3");
392 &xor ($magic,0x30);
393
394&set_label("schedule_go");
395 &cmp ($round,192);
396 &ja (&label("schedule_256"));
397 &je (&label("schedule_192"));
398	# 128: fall through
399
400##
401## .schedule_128
402##
403## 128-bit specific part of key schedule.
404##
405## This schedule is really simple, because all its parts
406## are accomplished by the subroutines.
407##
408&set_label("schedule_128");
409 &mov ($round,10);
410
411&set_label("loop_schedule_128");
412 &call ("_vpaes_schedule_round");
413 &dec ($round);
414 &jz (&label("schedule_mangle_last"));
415 &call ("_vpaes_schedule_mangle"); # write output
416 &jmp (&label("loop_schedule_128"));
417
418##
419## .aes_schedule_192
420##
421## 192-bit specific part of key schedule.
422##
423## The main body of this schedule is the same as the 128-bit
424## schedule, but with more smearing. The long, high side is
425## stored in %xmm7 as before, and the short, low side is in
426## the high bits of %xmm6.
427##
428## This schedule is somewhat nastier, however, because each
429## round produces 192 bits of key material, or 1.5 round keys.
430## Therefore, on each cycle we do 2 rounds and produce 3 round
431## keys.
432##
433&set_label("schedule_192",16);
434 &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
435 &call ("_vpaes_schedule_transform"); # input transform
436 &movdqa ("xmm6","xmm0"); # save short part
437 &pxor ("xmm4","xmm4"); # clear 4
438 &movhlps("xmm6","xmm4"); # clobber low side with zeros
439 &mov ($round,4);
440
441&set_label("loop_schedule_192");
442 &call ("_vpaes_schedule_round");
443 &palignr("xmm0","xmm6",8);
444 &call ("_vpaes_schedule_mangle"); # save key n
445 &call ("_vpaes_schedule_192_smear");
446 &call ("_vpaes_schedule_mangle"); # save key n+1
447 &call ("_vpaes_schedule_round");
448 &dec ($round);
449 &jz (&label("schedule_mangle_last"));
450 &call ("_vpaes_schedule_mangle"); # save key n+2
451 &call ("_vpaes_schedule_192_smear");
452 &jmp (&label("loop_schedule_192"));
453
454##
455## .aes_schedule_256
456##
457## 256-bit specific part of key schedule.
458##
459## The structure here is very similar to the 128-bit
460## schedule, but with an additional "low side" in
461## %xmm6. The low side's rounds are the same as the
462## high side's, except no rcon and no rotation.
463##
464&set_label("schedule_256",16);
465 &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
466 &call ("_vpaes_schedule_transform"); # input transform
467 &mov ($round,7);
468
469&set_label("loop_schedule_256");
470 &call ("_vpaes_schedule_mangle"); # output low result
471 &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
472
473 # high round
474 &call ("_vpaes_schedule_round");
475 &dec ($round);
476 &jz (&label("schedule_mangle_last"));
477 &call ("_vpaes_schedule_mangle");
478
479 # low round. swap xmm7 and xmm6
480 &pshufd ("xmm0","xmm0",0xFF);
481 &movdqa (&QWP(20,"esp"),"xmm7");
482 &movdqa ("xmm7","xmm6");
483 &call ("_vpaes_schedule_low_round");
484 &movdqa ("xmm7",&QWP(20,"esp"));
485
486 &jmp (&label("loop_schedule_256"));
487
488##
489## .aes_schedule_mangle_last
490##
491## Mangler for last round of key schedule
492## Mangles %xmm0
493## when encrypting, outputs out(%xmm0) ^ 63
494## when decrypting, outputs unskew(%xmm0)
495##
496## Always called right before return... jumps to cleanup and exits
497##
498&set_label("schedule_mangle_last",16);
499 # schedule last round key from xmm0
500 &lea ($base,&DWP($k_deskew,$const));
501 &test ($out,$out);
502 &jnz (&label("schedule_mangle_last_dec"));
503
504 # encrypting
505 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
506 &pshufb ("xmm0","xmm1"); # output permute
507 &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
508 &add ($key,32);
509
510&set_label("schedule_mangle_last_dec");
511 &add ($key,-16);
512 &pxor ("xmm0",&QWP($k_s63,$const));
513 &call ("_vpaes_schedule_transform"); # output transform
514 &movdqu (&QWP(0,$key),"xmm0"); # save last key
515
516 # cleanup
517 &pxor ("xmm0","xmm0");
518 &pxor ("xmm1","xmm1");
519 &pxor ("xmm2","xmm2");
520 &pxor ("xmm3","xmm3");
521 &pxor ("xmm4","xmm4");
522 &pxor ("xmm5","xmm5");
523 &pxor ("xmm6","xmm6");
524 &pxor ("xmm7","xmm7");
525 &ret ();
526&function_end_B("_vpaes_schedule_core");
527
528##
529## .aes_schedule_192_smear
530##
531## Smear the short, low side in the 192-bit key schedule.
532##
533## Inputs:
534## %xmm7: high side, b a x y
535## %xmm6: low side, d c 0 0
536## %xmm13: 0
537##
538## Outputs:
539## %xmm6: b+c+d b+c 0 0
540## %xmm0: b+c+d b+c b a
541##
542&function_begin_B("_vpaes_schedule_192_smear");
543 &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
544 &pxor ("xmm6","xmm0"); # -> c+d c 0 0
545 &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
546 &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
547 &movdqa ("xmm0","xmm6");
548 &pxor ("xmm1","xmm1");
549 &movhlps("xmm6","xmm1"); # clobber low side with zeros
550 &ret ();
551&function_end_B("_vpaes_schedule_192_smear");
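
The dword shuffles above are easier to follow outside the high-to-low notation of the comments. A minimal plain-Perl model of the same dataflow (not perlasm; one bit per symbolic dword, so the xors show which values were summed):

	sub pshufd { my ($src, $sel) = @_;	# SSE pshufd on 4 dwords, low dword first
	    [ map { $src->[ ($sel >> (2 * $_)) & 3 ] } 0 .. 3 ];
	}
	sub pxor { my ($a, $b) = @_; [ map { $a->[$_] ^ $b->[$_] } 0 .. 3 ] }

	my ($y, $x, $a, $b, $c, $d) = map { 1 << $_ } 0 .. 5;
	my $xmm7 = [ $y, $x, $a, $b ];		# "b a x y" (comments list the high dword first)
	my $xmm6 = [ 0, 0, $c, $d ];		# "d c 0 0"
	my $t = pshufd($xmm6, 0x80);		# d c 0 0 -> c 0 0 0
	$xmm6 = pxor($xmm6, $t);		# -> c+d c 0 0
	$t = pshufd($xmm7, 0xFE);		# b a _ _ -> b b b a
	$xmm6 = pxor($xmm6, $t);		# -> b+c+d b+c b a
	my $xmm0 = [ @$xmm6 ];			# movdqa
	@$xmm6[0, 1] = (0, 0);			# movhlps with zero: clear the low side
	# xmm0 = b+c+d b+c b a, xmm6 = b+c+d b+c 0 0, matching the comment above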
552
553##
554## .aes_schedule_round
555##
556## Runs one main round of the key schedule on %xmm0, %xmm7
557##
558## Specifically, runs subbytes on the high dword of %xmm0
559## then rotates it by one byte and xors into the low dword of
560## %xmm7.
561##
562## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
563## next rcon.
564##
565## Smears the dwords of %xmm7 by xoring the low into the
566## second low, result into third, result into highest.
567##
568## Returns results in %xmm7 = %xmm0.
569## Clobbers %xmm1-%xmm5.
570##
571&function_begin_B("_vpaes_schedule_round");
572 # extract rcon from xmm8
573 &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
574 &pxor ("xmm1","xmm1");
575 &palignr("xmm1","xmm2",15);
576 &palignr("xmm2","xmm2",15);
577 &pxor ("xmm7","xmm1");
578
579 # rotate
580 &pshufd ("xmm0","xmm0",0xFF);
581 &palignr("xmm0","xmm0",1);
582
583 # fall through...
584 &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
585
586 # low round: same as high round, but no rotation and no rcon.
587&set_label("_vpaes_schedule_low_round");
588 # smear xmm7
589 &movdqa ("xmm1","xmm7");
590 &pslldq ("xmm7",4);
591 &pxor ("xmm7","xmm1");
592 &movdqa ("xmm1","xmm7");
593 &pslldq ("xmm7",8);
594 &pxor ("xmm7","xmm1");
595 &pxor ("xmm7",&QWP($k_s63,$const));
596
597	# subbytes
598 &movdqa ("xmm4",&QWP($k_s0F,$const));
599 &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
600 &movdqa ("xmm1","xmm4");
601 &pandn ("xmm1","xmm0");
602 &psrld ("xmm1",4); # 1 = i
603 &pand ("xmm0","xmm4"); # 0 = k
604 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
605 &pshufb ("xmm2","xmm0"); # 2 = a/k
606 &pxor ("xmm0","xmm1"); # 0 = j
607 &movdqa ("xmm3","xmm5"); # 3 : 1/i
608 &pshufb ("xmm3","xmm1"); # 3 = 1/i
609 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
610 &movdqa ("xmm4","xmm5"); # 4 : 1/j
611 &pshufb ("xmm4","xmm0"); # 4 = 1/j
612 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
613 &movdqa ("xmm2","xmm5"); # 2 : 1/iak
614 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
615 &pxor ("xmm2","xmm0"); # 2 = io
616 &movdqa ("xmm3","xmm5"); # 3 : 1/jak
617 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
618 &pxor ("xmm3","xmm1"); # 3 = jo
619 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
620 &pshufb ("xmm4","xmm2"); # 4 = sbou
621 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
622 &pshufb ("xmm0","xmm3"); # 0 = sb1t
623 &pxor ("xmm0","xmm4"); # 0 = sbox output
624
625 # add in smeared stuff
626 &pxor ("xmm0","xmm7");
627 &movdqa ("xmm7","xmm0");
628 &ret ();
629&function_end_B("_vpaes_schedule_round");
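
The "smear xmm7" sequence above is a two-step prefix xor across the four dwords of %xmm7, so each dword absorbs all the dwords below it, exactly as classic AES key expansion xors each word into the next. A plain-Perl sketch of just that dataflow (dwords listed low to high):

	sub pslldq_dwords { my ($v, $n) = @_; [ (0) x $n, @$v[0 .. 3 - $n] ] }	# pslldq by 4*$n bytes
	sub pxor { my ($a, $b) = @_; [ map { $a->[$_] ^ $b->[$_] } 0 .. 3 ] }

	my $x = [ 1, 2, 4, 8 ];			# one bit per dword, to trace the xors
	$x = pxor($x, pslldq_dwords($x, 1));	# [w0, w0^w1, w1^w2, w2^w3]
	$x = pxor($x, pslldq_dwords($x, 2));	# [w0, w0^w1, w0^w1^w2, w0^w1^w2^w3]
	# $x is now [1, 3, 7, 15]: each dword has absorbed everything below it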
630
631##
632## .aes_schedule_transform
633##
634## Linear-transform %xmm0 according to tables at (%ebx)
635##
636## Output in %xmm0
637## Clobbers %xmm1, %xmm2
638##
639&function_begin_B("_vpaes_schedule_transform");
640 &movdqa ("xmm2",&QWP($k_s0F,$const));
641 &movdqa ("xmm1","xmm2");
642 &pandn ("xmm1","xmm0");
643 &psrld ("xmm1",4);
644 &pand ("xmm0","xmm2");
645 &movdqa ("xmm2",&QWP(0,$base));
646 &pshufb ("xmm2","xmm0");
647 &movdqa ("xmm0",&QWP(16,$base));
648 &pshufb ("xmm0","xmm1");
649 &pxor ("xmm0","xmm2");
650 &ret ();
651&function_end_B("_vpaes_schedule_transform");
652
653##
654## .aes_schedule_mangle
655##
656## Mangle xmm0 from (basis-transformed) standard version
657## to our version.
658##
659## On encrypt,
660## xor with 0x63
661## multiply by circulant 0,1,1,1
662## apply shiftrows transform
663##
664## On decrypt,
665## xor with 0x63
666## multiply by "inverse mixcolumns" circulant E,B,D,9
667## deskew
668## apply shiftrows transform
669##
670##
671## Writes out to (%edx), and increments or decrements it
672## Keeps track of round number mod 4 in %ecx
673## Preserves xmm0
674## Clobbers xmm1-xmm5
675##
676&function_begin_B("_vpaes_schedule_mangle");
677 &movdqa ("xmm4","xmm0"); # save xmm0 for later
678 &movdqa ("xmm5",&QWP($k_mc_forward,$const));
679 &test ($out,$out);
680 &jnz (&label("schedule_mangle_dec"));
681
682 # encrypting
683 &add ($key,16);
684 &pxor ("xmm4",&QWP($k_s63,$const));
685 &pshufb ("xmm4","xmm5");
686 &movdqa ("xmm3","xmm4");
687 &pshufb ("xmm4","xmm5");
688 &pxor ("xmm3","xmm4");
689 &pshufb ("xmm4","xmm5");
690 &pxor ("xmm3","xmm4");
691
692 &jmp (&label("schedule_mangle_both"));
693
694&set_label("schedule_mangle_dec",16);
695 # inverse mix columns
696 &movdqa ("xmm2",&QWP($k_s0F,$const));
697 &lea ($inp,&DWP($k_dksd,$const));
698 &movdqa ("xmm1","xmm2");
699 &pandn ("xmm1","xmm4");
700 &psrld ("xmm1",4); # 1 = hi
701 &pand ("xmm4","xmm2"); # 4 = lo
702
703 &movdqa ("xmm2",&QWP(0,$inp));
704 &pshufb ("xmm2","xmm4");
705 &movdqa ("xmm3",&QWP(0x10,$inp));
706 &pshufb ("xmm3","xmm1");
707 &pxor ("xmm3","xmm2");
708 &pshufb ("xmm3","xmm5");
709
710 &movdqa ("xmm2",&QWP(0x20,$inp));
711 &pshufb ("xmm2","xmm4");
712 &pxor ("xmm2","xmm3");
713 &movdqa ("xmm3",&QWP(0x30,$inp));
714 &pshufb ("xmm3","xmm1");
715 &pxor ("xmm3","xmm2");
716 &pshufb ("xmm3","xmm5");
717
718 &movdqa ("xmm2",&QWP(0x40,$inp));
719 &pshufb ("xmm2","xmm4");
720 &pxor ("xmm2","xmm3");
721 &movdqa ("xmm3",&QWP(0x50,$inp));
722 &pshufb ("xmm3","xmm1");
723 &pxor ("xmm3","xmm2");
724 &pshufb ("xmm3","xmm5");
725
726 &movdqa ("xmm2",&QWP(0x60,$inp));
727 &pshufb ("xmm2","xmm4");
728 &pxor ("xmm2","xmm3");
729 &movdqa ("xmm3",&QWP(0x70,$inp));
730 &pshufb ("xmm3","xmm1");
731 &pxor ("xmm3","xmm2");
732
733 &add ($key,-16);
734
735&set_label("schedule_mangle_both");
736 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
737 &pshufb ("xmm3","xmm1");
738 &add ($magic,-16);
739 &and ($magic,0x30);
740 &movdqu (&QWP(0,$key),"xmm3");
741 &ret ();
742&function_end_B("_vpaes_schedule_mangle");
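
On the encrypt path, the three pshufb/pxor pairs above implement the "multiply by circulant 0,1,1,1" from the header comment: with r the rotate-each-column-by-one-byte permutation (.Lk_mc_forward), the result is r(x) ^ r^2(x) ^ r^3(x). A plain-Perl sketch of that identity on a 16-byte string (a hypothetical stand-in for the s63-xored state, not the perlasm itself):

	sub mc_forward_once {			# rotate each 4-byte column by one byte, as .Lk_mc_forward does
	    my $x = shift;
	    join '', map { my $c = substr($x, 4 * $_, 4);
			   substr($c, 1) . substr($c, 0, 1) } 0 .. 3;
	}
	my $x   = pack 'C16', 0 .. 15;		# hypothetical state, already xored with .Lk_s63
	my $r1  = mc_forward_once($x);		# xmm4 after the first pshufb
	my $r2  = mc_forward_once($r1);		# after the second
	my $r3  = mc_forward_once($r2);		# after the third
	my $out = $r1 ^ $r2 ^ $r3;		# xmm3: circulant (0,1,1,1) applied per column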
743
744#
745# Interface to OpenSSL
746#
747&function_begin("${PREFIX}_set_encrypt_key");
748 &mov ($inp,&wparam(0)); # inp
749 &lea ($base,&DWP(-56,"esp"));
750 &mov ($round,&wparam(1)); # bits
751 &and ($base,-16);
752 &mov ($key,&wparam(2)); # key
753 &xchg ($base,"esp"); # alloca
754 &mov (&DWP(48,"esp"),$base);
755
756 &mov ($base,$round);
757 &shr ($base,5);
758 &add ($base,5);
759 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
760 &mov ($magic,0x30);
761 &mov ($out,0);
762
763 &picsetup($const);
764 &picsymbol($const, &label("_vpaes_consts"), $const);
765	&lea	($const,&DWP(0x30,$const));
766
767 &call ("_vpaes_schedule_core");
768
769 &mov ("esp",&DWP(48,"esp"));
770 &xor ("eax","eax");
771&function_end("${PREFIX}_set_encrypt_key");
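
The shr/add arithmetic above is worth a note: nbits/32+5 gives 9/11/13 rather than the usual 10/12/14, because the encrypt/decrypt cores use rounds as the count of middle rounds and run the final round separately. A quick plain-Perl check of the formula:

	for my $nbits (128, 192, 256) {
	    my $rounds = ($nbits >> 5) + 5;			# the shr/add above
	    printf "%d-bit key -> rounds = %d\n", $nbits, $rounds;	# 9, 11, 13
	}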
772
773&function_begin("${PREFIX}_set_decrypt_key");
774 &mov ($inp,&wparam(0)); # inp
775 &lea ($base,&DWP(-56,"esp"));
776 &mov ($round,&wparam(1)); # bits
777 &and ($base,-16);
778 &mov ($key,&wparam(2)); # key
779 &xchg ($base,"esp"); # alloca
780 &mov (&DWP(48,"esp"),$base);
781
782 &mov ($base,$round);
783 &shr ($base,5);
784 &add ($base,5);
785 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
786 &shl ($base,4);
787 &lea ($key,&DWP(16,$key,$base));
788
789 &mov ($out,1);
790 &mov ($magic,$round);
791 &shr ($magic,1);
792 &and ($magic,32);
793	&xor	($magic,32);		# nbits==192?0:32
794
795 &picsetup($const);
796 &picsymbol($const, &label("_vpaes_consts"), $const);
797	&lea	($const,&DWP(0x30,$const));
798
799 &call ("_vpaes_schedule_core");
800
801 &mov ("esp",&DWP(48,"esp"));
802 &xor ("eax","eax");
803&function_end("${PREFIX}_set_decrypt_key");
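
Likewise, the shr/and/xor sequence above selects the initial .Lk_sr row offset without a branch; only 192-bit keys start at offset 0. A plain-Perl check:

	for my $nbits (128, 192, 256) {
	    my $magic = (($nbits >> 1) & 32) ^ 32;		# nbits==192 ? 0 : 32
	    printf "%d bits -> magic = 0x%02x\n", $nbits, $magic;	# 0x20, 0x00, 0x20
	}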
804
805&function_begin("${PREFIX}_encrypt");
806 &picsetup($const);
807 &picsymbol($const, &label("_vpaes_consts"), $const);
808	&lea	($const,&DWP(0x30,$const));
809
810 &call ("_vpaes_preheat");
811 &mov ($inp,&wparam(0)); # inp
812 &lea ($base,&DWP(-56,"esp"));
813 &mov ($out,&wparam(1)); # out
814 &and ($base,-16);
815 &mov ($key,&wparam(2)); # key
816 &xchg ($base,"esp"); # alloca
817 &mov (&DWP(48,"esp"),$base);
818
819 &movdqu ("xmm0",&QWP(0,$inp));
820 &call ("_vpaes_encrypt_core");
821 &movdqu (&QWP(0,$out),"xmm0");
822
823 &mov ("esp",&DWP(48,"esp"));
824&function_end("${PREFIX}_encrypt");
825
826&function_begin("${PREFIX}_decrypt");
827 &picsetup($const);
828 &picsymbol($const, &label("_vpaes_consts"), $const);
829	&lea	($const,&DWP(0x30,$const));
830
831 &call ("_vpaes_preheat");
832 &mov ($inp,&wparam(0)); # inp
833 &lea ($base,&DWP(-56,"esp"));
834 &mov ($out,&wparam(1)); # out
835 &and ($base,-16);
836 &mov ($key,&wparam(2)); # key
837 &xchg ($base,"esp"); # alloca
838 &mov (&DWP(48,"esp"),$base);
839
840 &movdqu ("xmm0",&QWP(0,$inp));
841 &call ("_vpaes_decrypt_core");
842 &movdqu (&QWP(0,$out),"xmm0");
843
844 &mov ("esp",&DWP(48,"esp"));
845&function_end("${PREFIX}_decrypt");
846
847&function_begin("${PREFIX}_cbc_encrypt");
848 &mov ($inp,&wparam(0)); # inp
849 &mov ($out,&wparam(1)); # out
850 &mov ($round,&wparam(2)); # len
851 &mov ($key,&wparam(3)); # key
852 &sub ($round,16);
853 &jc (&label("cbc_abort"));
854 &lea ($base,&DWP(-56,"esp"));
855 &mov ($const,&wparam(4)); # ivp
856 &and ($base,-16);
857 &mov ($magic,&wparam(5)); # enc
858 &xchg ($base,"esp"); # alloca
859 &movdqu ("xmm1",&QWP(0,$const)); # load IV
860 &sub ($out,$inp);
861 &mov (&DWP(48,"esp"),$base);
862
863 &mov (&DWP(0,"esp"),$out); # save out
864	&mov	(&DWP(4,"esp"),$key);	# save key
865 &mov (&DWP(8,"esp"),$const); # save ivp
866 &mov ($out,$round); # $out works as $len
867
868 &picsetup($const);
869 &picsymbol($const, &label("_vpaes_consts"), $const);
870	&lea	($const,&DWP(0x30,$const));
871
872 &call ("_vpaes_preheat");
873 &cmp ($magic,0);
874 &je (&label("cbc_dec_loop"));
875 &jmp (&label("cbc_enc_loop"));
876
877&set_label("cbc_enc_loop",16);
878 &movdqu ("xmm0",&QWP(0,$inp)); # load input
879 &pxor ("xmm0","xmm1"); # inp^=iv
880 &call ("_vpaes_encrypt_core");
881 &mov ($base,&DWP(0,"esp")); # restore out
882 &mov ($key,&DWP(4,"esp")); # restore key
883 &movdqa ("xmm1","xmm0");
884 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
885 &lea ($inp,&DWP(16,$inp));
886 &sub ($out,16);
887 &jnc (&label("cbc_enc_loop"));
888 &jmp (&label("cbc_done"));
889
890&set_label("cbc_dec_loop",16);
891 &movdqu ("xmm0",&QWP(0,$inp)); # load input
892 &movdqa (&QWP(16,"esp"),"xmm1"); # save IV
893 &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
894 &call ("_vpaes_decrypt_core");
895 &mov ($base,&DWP(0,"esp")); # restore out
896 &mov ($key,&DWP(4,"esp")); # restore key
897 &pxor ("xmm0",&QWP(16,"esp")); # out^=iv
898 &movdqa ("xmm1",&QWP(32,"esp")); # load next IV
899 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
900 &lea ($inp,&DWP(16,$inp));
901 &sub ($out,16);
902 &jnc (&label("cbc_dec_loop"));
903
904&set_label("cbc_done");
905 &mov ($base,&DWP(8,"esp")); # restore ivp
906 &mov ("esp",&DWP(48,"esp"));
907 &movdqu (&QWP(0,$base),"xmm1"); # write IV
908&set_label("cbc_abort");
909&function_end("${PREFIX}_cbc_encrypt");
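
The two loops above are textbook CBC chaining: on encrypt the ciphertext block becomes the next IV, on decrypt the saved input block does. A minimal plain-Perl sketch of that dataflow, with $cipher and $inv as hypothetical stand-ins for the encrypt/decrypt cores (Perl's ^ xors equal-length strings bytewise):

	sub cbc_encrypt {
	    my ($cipher, $iv, @blocks) = @_;	# blocks are 16-byte strings
	    my @out;
	    for my $p (@blocks) {
		$iv = $cipher->($p ^ $iv);	# inp ^= iv; encrypt; result is the next IV
		push @out, $iv;
	    }
	    return (\@out, $iv);		# ciphertext blocks and final IV
	}
	sub cbc_decrypt {
	    my ($inv, $iv, @blocks) = @_;
	    my @out;
	    for my $c (@blocks) {
		push @out, $inv->($c) ^ $iv;	# out ^= iv; this input is the next IV
		$iv = $c;
	    }
	    return (\@out, $iv);
	}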
910
911&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
deleted file mode 100644
index 7d92e8d8ca..0000000000
--- a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
+++ /dev/null
@@ -1,1222 +0,0 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Interface to OpenSSL as "almost" drop-in replacement for
17# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (doesn't have to if called from
19# EVP only). "Drop-in" implies that this module doesn't share key
20# schedule structure with the original, nor does it make assumptions
21# about its alignment...
22#
23# Performance summary. aes-x86_64.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with 128-bit key, and vpaes-x86_64.pl column -
26# [also large-block CBC] encrypt/decrypt.
27#
28# aes-x86_64.pl vpaes-x86_64.pl
29#
30# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
31# Nehalem 30.5/42.2/14.6 9.8/11.8
32# Atom 63.9/79.0/32.1 64.0/84.8(***)
33#
34# (*)	"Hyper-threading" in this context refers to cache shared among
35#	multiple cores rather than to Intel HTT specifically. As the vast
36#	majority of contemporary cores share cache, the slower code path
37#	is commonplace. In other words, "with-hyper-threading-off"
38#	results are presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***)	The less impressive improvement on Core 2 and Atom is due to
43#	slow pshufb; it is still a respectable +40%/78% improvement on
44#	Core 2 (as implied, over the "hyper-threading-safe" code path).
45#
46# <appro@openssl.org>
47
48$flavour = shift;
49$output = shift;
50if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
51
52$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53
54$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57die "can't locate x86_64-xlate.pl";
58
59open OUT,"| \"$^X\" $xlate $flavour $output";
60*STDOUT=*OUT;
61
62$PREFIX="vpaes";
63
64$code.=<<___;
65.text
66
67##
68## _vpaes_encrypt_core
69##
70## AES-encrypt %xmm0.
71##
72## Inputs:
73## %xmm0 = input
74## %xmm9-%xmm15 as in _vpaes_preheat
75## (%rdx) = scheduled keys
76##
77## Output in %xmm0
78## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
79## Preserves %xmm6 - %xmm8 so you get some local vectors
80##
81##
82.type _vpaes_encrypt_core,\@abi-omnipotent
83.align 16
84_vpaes_encrypt_core:
85 _CET_ENDBR
86 mov %rdx, %r9
87 mov \$16, %r11
88 mov 240(%rdx),%eax
89 movdqa %xmm9, %xmm1
90 movdqa .Lk_ipt(%rip), %xmm2 # iptlo
91 pandn %xmm0, %xmm1
92 movdqu (%r9), %xmm5 # round0 key
93 psrld \$4, %xmm1
94 pand %xmm9, %xmm0
95 pshufb %xmm0, %xmm2
96 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
97 pshufb %xmm1, %xmm0
98 pxor %xmm5, %xmm2
99 pxor %xmm2, %xmm0
100 add \$16, %r9
101 lea .Lk_mc_backward(%rip),%r10
102 jmp .Lenc_entry
103
104.align 16
105.Lenc_loop:
106 # middle of middle round
107 movdqa %xmm13, %xmm4 # 4 : sb1u
108 pshufb %xmm2, %xmm4 # 4 = sb1u
109 pxor %xmm5, %xmm4 # 4 = sb1u + k
110 movdqa %xmm12, %xmm0 # 0 : sb1t
111 pshufb %xmm3, %xmm0 # 0 = sb1t
112 pxor %xmm4, %xmm0 # 0 = A
113 movdqa %xmm15, %xmm5 # 4 : sb2u
114 pshufb %xmm2, %xmm5 # 4 = sb2u
115 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
116 movdqa %xmm14, %xmm2 # 2 : sb2t
117 pshufb %xmm3, %xmm2 # 2 = sb2t
118 pxor %xmm5, %xmm2 # 2 = 2A
119 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
120 movdqa %xmm0, %xmm3 # 3 = A
121 pshufb %xmm1, %xmm0 # 0 = B
122 add \$16, %r9 # next key
123 pxor %xmm2, %xmm0 # 0 = 2A+B
124 pshufb %xmm4, %xmm3 # 3 = D
125 add \$16, %r11 # next mc
126 pxor %xmm0, %xmm3 # 3 = 2A+B+D
127 pshufb %xmm1, %xmm0 # 0 = 2B+C
128 and \$0x30, %r11 # ... mod 4
129 pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
130 sub \$1,%rax # nr--
131
132.Lenc_entry:
133 # top of round
134 movdqa %xmm9, %xmm1 # 1 : i
135 pandn %xmm0, %xmm1 # 1 = i<<4
136 psrld \$4, %xmm1 # 1 = i
137 pand %xmm9, %xmm0 # 0 = k
138 movdqa %xmm11, %xmm5 # 2 : a/k
139 pshufb %xmm0, %xmm5 # 2 = a/k
140 pxor %xmm1, %xmm0 # 0 = j
141 movdqa %xmm10, %xmm3 # 3 : 1/i
142 pshufb %xmm1, %xmm3 # 3 = 1/i
143 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
144 movdqa %xmm10, %xmm4 # 4 : 1/j
145 pshufb %xmm0, %xmm4 # 4 = 1/j
146 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
147 movdqa %xmm10, %xmm2 # 2 : 1/iak
148 pshufb %xmm3, %xmm2 # 2 = 1/iak
149 pxor %xmm0, %xmm2 # 2 = io
150 movdqa %xmm10, %xmm3 # 3 : 1/jak
151 movdqu (%r9), %xmm5
152 pshufb %xmm4, %xmm3 # 3 = 1/jak
153 pxor %xmm1, %xmm3 # 3 = jo
154 jnz .Lenc_loop
155
156 # middle of last round
157 movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
158 movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
159 pshufb %xmm2, %xmm4 # 4 = sbou
160 pxor %xmm5, %xmm4 # 4 = sb1u + k
161 pshufb %xmm3, %xmm0 # 0 = sb1t
162 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
163 pxor %xmm4, %xmm0 # 0 = A
164 pshufb %xmm1, %xmm0
165 ret
166.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
167
168##
169## Decryption core
170##
171## Same API as encryption core.
172##
173.type _vpaes_decrypt_core,\@abi-omnipotent
174.align 16
175_vpaes_decrypt_core:
176 _CET_ENDBR
177 mov %rdx, %r9 # load key
178 mov 240(%rdx),%eax
179 movdqa %xmm9, %xmm1
180 movdqa .Lk_dipt(%rip), %xmm2 # iptlo
181 pandn %xmm0, %xmm1
182 mov %rax, %r11
183 psrld \$4, %xmm1
184 movdqu (%r9), %xmm5 # round0 key
185 shl \$4, %r11
186 pand %xmm9, %xmm0
187 pshufb %xmm0, %xmm2
188 movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
189 xor \$0x30, %r11
190 lea .Lk_dsbd(%rip),%r10
191 pshufb %xmm1, %xmm0
192 and \$0x30, %r11
193 pxor %xmm5, %xmm2
194 movdqa .Lk_mc_forward+48(%rip), %xmm5
195 pxor %xmm2, %xmm0
196 add \$16, %r9
197 add %r10, %r11
198 jmp .Ldec_entry
199
200.align 16
201.Ldec_loop:
202##
203## Inverse mix columns
204##
205 movdqa -0x20(%r10),%xmm4 # 4 : sb9u
206 pshufb %xmm2, %xmm4 # 4 = sb9u
207 pxor %xmm0, %xmm4
208 movdqa -0x10(%r10),%xmm0 # 0 : sb9t
209 pshufb %xmm3, %xmm0 # 0 = sb9t
210 pxor %xmm4, %xmm0 # 0 = ch
211 add \$16, %r9 # next round key
212
213 pshufb %xmm5, %xmm0 # MC ch
214 movdqa 0x00(%r10),%xmm4 # 4 : sbdu
215 pshufb %xmm2, %xmm4 # 4 = sbdu
216 pxor %xmm0, %xmm4 # 4 = ch
217 movdqa 0x10(%r10),%xmm0 # 0 : sbdt
218 pshufb %xmm3, %xmm0 # 0 = sbdt
219 pxor %xmm4, %xmm0 # 0 = ch
220 sub \$1,%rax # nr--
221
222 pshufb %xmm5, %xmm0 # MC ch
223 movdqa 0x20(%r10),%xmm4 # 4 : sbbu
224 pshufb %xmm2, %xmm4 # 4 = sbbu
225 pxor %xmm0, %xmm4 # 4 = ch
226 movdqa 0x30(%r10),%xmm0 # 0 : sbbt
227 pshufb %xmm3, %xmm0 # 0 = sbbt
228 pxor %xmm4, %xmm0 # 0 = ch
229
230 pshufb %xmm5, %xmm0 # MC ch
231 movdqa 0x40(%r10),%xmm4 # 4 : sbeu
232 pshufb %xmm2, %xmm4 # 4 = sbeu
233 pxor %xmm0, %xmm4 # 4 = ch
234 movdqa 0x50(%r10),%xmm0 # 0 : sbet
235 pshufb %xmm3, %xmm0 # 0 = sbet
236 pxor %xmm4, %xmm0 # 0 = ch
237
238 palignr \$12, %xmm5, %xmm5
239
240.Ldec_entry:
241 # top of round
242 movdqa %xmm9, %xmm1 # 1 : i
243 pandn %xmm0, %xmm1 # 1 = i<<4
244 psrld \$4, %xmm1 # 1 = i
245 pand %xmm9, %xmm0 # 0 = k
246 movdqa %xmm11, %xmm2 # 2 : a/k
247 pshufb %xmm0, %xmm2 # 2 = a/k
248 pxor %xmm1, %xmm0 # 0 = j
249 movdqa %xmm10, %xmm3 # 3 : 1/i
250 pshufb %xmm1, %xmm3 # 3 = 1/i
251 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
252 movdqa %xmm10, %xmm4 # 4 : 1/j
253 pshufb %xmm0, %xmm4 # 4 = 1/j
254 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
255 movdqa %xmm10, %xmm2 # 2 : 1/iak
256 pshufb %xmm3, %xmm2 # 2 = 1/iak
257 pxor %xmm0, %xmm2 # 2 = io
258 movdqa %xmm10, %xmm3 # 3 : 1/jak
259 pshufb %xmm4, %xmm3 # 3 = 1/jak
260 pxor %xmm1, %xmm3 # 3 = jo
261 movdqu (%r9), %xmm0
262 jnz .Ldec_loop
263
264 # middle of last round
265 movdqa 0x60(%r10), %xmm4 # 3 : sbou
266 pshufb %xmm2, %xmm4 # 4 = sbou
267 pxor %xmm0, %xmm4 # 4 = sb1u + k
268 movdqa 0x70(%r10), %xmm0 # 0 : sbot
269 movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
270 pshufb %xmm3, %xmm0 # 0 = sb1t
271 pxor %xmm4, %xmm0 # 0 = A
272 pshufb %xmm2, %xmm0
273 ret
274.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
275
276########################################################
277## ##
278## AES key schedule ##
279## ##
280########################################################
281.type _vpaes_schedule_core,\@abi-omnipotent
282.align 16
283_vpaes_schedule_core:
284 _CET_ENDBR
285 # rdi = key
286 # rsi = size in bits
287 # rdx = buffer
288 # rcx = direction. 0=encrypt, 1=decrypt
289
290 call _vpaes_preheat # load the tables
291 movdqa .Lk_rcon(%rip), %xmm8 # load rcon
292 movdqu (%rdi), %xmm0 # load key (unaligned)
293
294 # input transform
295 movdqa %xmm0, %xmm3
296 lea .Lk_ipt(%rip), %r11
297 call _vpaes_schedule_transform
298 movdqa %xmm0, %xmm7
299
300 lea .Lk_sr(%rip),%r10
301 test %rcx, %rcx
302 jnz .Lschedule_am_decrypting
303
304 # encrypting, output zeroth round key after transform
305 movdqu %xmm0, (%rdx)
306 jmp .Lschedule_go
307
308.Lschedule_am_decrypting:
309 # decrypting, output zeroth round key after shiftrows
310 movdqa (%r8,%r10),%xmm1
311 pshufb %xmm1, %xmm3
312 movdqu %xmm3, (%rdx)
313 xor \$0x30, %r8
314
315.Lschedule_go:
316 cmp \$192, %esi
317 ja .Lschedule_256
318 je .Lschedule_192
319	# 128: fall through
320
321##
322## .schedule_128
323##
324## 128-bit specific part of key schedule.
325##
326## This schedule is really simple, because all its parts
327## are accomplished by the subroutines.
328##
329.Lschedule_128:
330 mov \$10, %esi
331
332.Loop_schedule_128:
333 call _vpaes_schedule_round
334 dec %rsi
335 jz .Lschedule_mangle_last
336 call _vpaes_schedule_mangle # write output
337 jmp .Loop_schedule_128
338
339##
340## .aes_schedule_192
341##
342## 192-bit specific part of key schedule.
343##
344## The main body of this schedule is the same as the 128-bit
345## schedule, but with more smearing. The long, high side is
346## stored in %xmm7 as before, and the short, low side is in
347## the high bits of %xmm6.
348##
349## This schedule is somewhat nastier, however, because each
350## round produces 192 bits of key material, or 1.5 round keys.
351## Therefore, on each cycle we do 2 rounds and produce 3 round
352## keys.
353##
354.align 16
355.Lschedule_192:
356 movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
357 call _vpaes_schedule_transform # input transform
358 movdqa %xmm0, %xmm6 # save short part
359 pxor %xmm4, %xmm4 # clear 4
360 movhlps %xmm4, %xmm6 # clobber low side with zeros
361 mov \$4, %esi
362
363.Loop_schedule_192:
364 call _vpaes_schedule_round
365 palignr \$8,%xmm6,%xmm0
366 call _vpaes_schedule_mangle # save key n
367 call _vpaes_schedule_192_smear
368 call _vpaes_schedule_mangle # save key n+1
369 call _vpaes_schedule_round
370 dec %rsi
371 jz .Lschedule_mangle_last
372 call _vpaes_schedule_mangle # save key n+2
373 call _vpaes_schedule_192_smear
374 jmp .Loop_schedule_192
375
376##
377## .aes_schedule_256
378##
379## 256-bit specific part of key schedule.
380##
381## The structure here is very similar to the 128-bit
382## schedule, but with an additional "low side" in
383## %xmm6. The low side's rounds are the same as the
384## high side's, except no rcon and no rotation.
385##
386.align 16
387.Lschedule_256:
388 movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
389 call _vpaes_schedule_transform # input transform
390 mov \$7, %esi
391
392.Loop_schedule_256:
393 call _vpaes_schedule_mangle # output low result
394 movdqa %xmm0, %xmm6 # save cur_lo in xmm6
395
396 # high round
397 call _vpaes_schedule_round
398 dec %rsi
399 jz .Lschedule_mangle_last
400 call _vpaes_schedule_mangle
401
402 # low round. swap xmm7 and xmm6
403 pshufd \$0xFF, %xmm0, %xmm0
404 movdqa %xmm7, %xmm5
405 movdqa %xmm6, %xmm7
406 call _vpaes_schedule_low_round
407 movdqa %xmm5, %xmm7
408
409 jmp .Loop_schedule_256
410
411
412##
413## .aes_schedule_mangle_last
414##
415## Mangler for last round of key schedule
416## Mangles %xmm0
417## when encrypting, outputs out(%xmm0) ^ 63
418## when decrypting, outputs unskew(%xmm0)
419##
420## Always called right before return... jumps to cleanup and exits
421##
422.align 16
423.Lschedule_mangle_last:
424 # schedule last round key from xmm0
425 lea .Lk_deskew(%rip),%r11 # prepare to deskew
426 test %rcx, %rcx
427 jnz .Lschedule_mangle_last_dec
428
429 # encrypting
430 movdqa (%r8,%r10),%xmm1
431 pshufb %xmm1, %xmm0 # output permute
432 lea .Lk_opt(%rip), %r11 # prepare to output transform
433 add \$32, %rdx
434
435.Lschedule_mangle_last_dec:
436 add \$-16, %rdx
437 pxor .Lk_s63(%rip), %xmm0
438 call _vpaes_schedule_transform # output transform
439 movdqu %xmm0, (%rdx) # save last key
440
441 # cleanup
442 pxor %xmm0, %xmm0
443 pxor %xmm1, %xmm1
444 pxor %xmm2, %xmm2
445 pxor %xmm3, %xmm3
446 pxor %xmm4, %xmm4
447 pxor %xmm5, %xmm5
448 pxor %xmm6, %xmm6
449 pxor %xmm7, %xmm7
450 ret
451.size _vpaes_schedule_core,.-_vpaes_schedule_core
452
453##
454## .aes_schedule_192_smear
455##
456## Smear the short, low side in the 192-bit key schedule.
457##
458## Inputs:
459## %xmm7: high side, b a x y
460## %xmm6: low side, d c 0 0
461## %xmm13: 0
462##
463## Outputs:
464## %xmm6: b+c+d b+c 0 0
465## %xmm0: b+c+d b+c b a
466##
467.type _vpaes_schedule_192_smear,\@abi-omnipotent
468.align 16
469_vpaes_schedule_192_smear:
470 _CET_ENDBR
471 pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
472 pxor %xmm0, %xmm6 # -> c+d c 0 0
473 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
474 pxor %xmm0, %xmm6 # -> b+c+d b+c b a
475 movdqa %xmm6, %xmm0
476 pxor %xmm1, %xmm1
477 movhlps %xmm1, %xmm6 # clobber low side with zeros
478 ret
479.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
480
481##
482## .aes_schedule_round
483##
484## Runs one main round of the key schedule on %xmm0, %xmm7
485##
486## Specifically, runs subbytes on the high dword of %xmm0
487## then rotates it by one byte and xors into the low dword of
488## %xmm7.
489##
490## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
491## next rcon.
492##
493## Smears the dwords of %xmm7 by xoring the low into the
494## second low, result into third, result into highest.
495##
496## Returns results in %xmm7 = %xmm0.
497## Clobbers %xmm1-%xmm4, %r11.
498##
499.type _vpaes_schedule_round,\@abi-omnipotent
500.align 16
501_vpaes_schedule_round:
502 _CET_ENDBR
503 # extract rcon from xmm8
504 pxor %xmm1, %xmm1
505 palignr \$15, %xmm8, %xmm1
506 palignr \$15, %xmm8, %xmm8
507 pxor %xmm1, %xmm7
508
509 # rotate
510 pshufd \$0xFF, %xmm0, %xmm0
511 palignr \$1, %xmm0, %xmm0
512
513 # fall through...
514
515 # low round: same as high round, but no rotation and no rcon.
516_vpaes_schedule_low_round:
517 # smear xmm7
518 movdqa %xmm7, %xmm1
519 pslldq \$4, %xmm7
520 pxor %xmm1, %xmm7
521 movdqa %xmm7, %xmm1
522 pslldq \$8, %xmm7
523 pxor %xmm1, %xmm7
524 pxor .Lk_s63(%rip), %xmm7
525
526 # subbytes
527 movdqa %xmm9, %xmm1
528 pandn %xmm0, %xmm1
529 psrld \$4, %xmm1 # 1 = i
530 pand %xmm9, %xmm0 # 0 = k
531 movdqa %xmm11, %xmm2 # 2 : a/k
532 pshufb %xmm0, %xmm2 # 2 = a/k
533 pxor %xmm1, %xmm0 # 0 = j
534 movdqa %xmm10, %xmm3 # 3 : 1/i
535 pshufb %xmm1, %xmm3 # 3 = 1/i
536 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
537 movdqa %xmm10, %xmm4 # 4 : 1/j
538 pshufb %xmm0, %xmm4 # 4 = 1/j
539 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
540 movdqa %xmm10, %xmm2 # 2 : 1/iak
541 pshufb %xmm3, %xmm2 # 2 = 1/iak
542 pxor %xmm0, %xmm2 # 2 = io
543 movdqa %xmm10, %xmm3 # 3 : 1/jak
544 pshufb %xmm4, %xmm3 # 3 = 1/jak
545 pxor %xmm1, %xmm3 # 3 = jo
546 movdqa %xmm13, %xmm4 # 4 : sbou
547 pshufb %xmm2, %xmm4 # 4 = sbou
548 movdqa %xmm12, %xmm0 # 0 : sbot
549 pshufb %xmm3, %xmm0 # 0 = sb1t
550 pxor %xmm4, %xmm0 # 0 = sbox output
551
552 # add in smeared stuff
553 pxor %xmm7, %xmm0
554 movdqa %xmm0, %xmm7
555 ret
556.size _vpaes_schedule_round,.-_vpaes_schedule_round
557
558##
559## .aes_schedule_transform
560##
561## Linear-transform %xmm0 according to tables at (%r11)
562##
563## Requires that %xmm9 = 0x0F0F... as in preheat
564## Output in %xmm0
565## Clobbers %xmm1, %xmm2
566##
567.type _vpaes_schedule_transform,\@abi-omnipotent
568.align 16
569_vpaes_schedule_transform:
570 _CET_ENDBR
571 movdqa %xmm9, %xmm1
572 pandn %xmm0, %xmm1
573 psrld \$4, %xmm1
574 pand %xmm9, %xmm0
575 movdqa (%r11), %xmm2 # lo
576 pshufb %xmm0, %xmm2
577 movdqa 16(%r11), %xmm0 # hi
578 pshufb %xmm1, %xmm0
579 pxor %xmm2, %xmm0
580 ret
581.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
582
583##
584## .aes_schedule_mangle
585##
586## Mangle xmm0 from (basis-transformed) standard version
587## to our version.
588##
589## On encrypt,
590## xor with 0x63
591## multiply by circulant 0,1,1,1
592## apply shiftrows transform
593##
594## On decrypt,
595## xor with 0x63
596## multiply by "inverse mixcolumns" circulant E,B,D,9
597## deskew
598## apply shiftrows transform
599##
600##
601## Writes out to (%rdx), and increments or decrements it
602## Keeps track of round number mod 4 in %r8
603## Preserves xmm0
604## Clobbers xmm1-xmm5
605##
606.type _vpaes_schedule_mangle,\@abi-omnipotent
607.align 16
608_vpaes_schedule_mangle:
609 _CET_ENDBR
610 movdqa %xmm0, %xmm4 # save xmm0 for later
611 movdqa .Lk_mc_forward(%rip),%xmm5
612 test %rcx, %rcx
613 jnz .Lschedule_mangle_dec
614
615 # encrypting
616 add \$16, %rdx
617 pxor .Lk_s63(%rip),%xmm4
618 pshufb %xmm5, %xmm4
619 movdqa %xmm4, %xmm3
620 pshufb %xmm5, %xmm4
621 pxor %xmm4, %xmm3
622 pshufb %xmm5, %xmm4
623 pxor %xmm4, %xmm3
624
625 jmp .Lschedule_mangle_both
626.align 16
627.Lschedule_mangle_dec:
628 # inverse mix columns
629 lea .Lk_dksd(%rip),%r11
630 movdqa %xmm9, %xmm1
631 pandn %xmm4, %xmm1
632 psrld \$4, %xmm1 # 1 = hi
633 pand %xmm9, %xmm4 # 4 = lo
634
635 movdqa 0x00(%r11), %xmm2
636 pshufb %xmm4, %xmm2
637 movdqa 0x10(%r11), %xmm3
638 pshufb %xmm1, %xmm3
639 pxor %xmm2, %xmm3
640 pshufb %xmm5, %xmm3
641
642 movdqa 0x20(%r11), %xmm2
643 pshufb %xmm4, %xmm2
644 pxor %xmm3, %xmm2
645 movdqa 0x30(%r11), %xmm3
646 pshufb %xmm1, %xmm3
647 pxor %xmm2, %xmm3
648 pshufb %xmm5, %xmm3
649
650 movdqa 0x40(%r11), %xmm2
651 pshufb %xmm4, %xmm2
652 pxor %xmm3, %xmm2
653 movdqa 0x50(%r11), %xmm3
654 pshufb %xmm1, %xmm3
655 pxor %xmm2, %xmm3
656 pshufb %xmm5, %xmm3
657
658 movdqa 0x60(%r11), %xmm2
659 pshufb %xmm4, %xmm2
660 pxor %xmm3, %xmm2
661 movdqa 0x70(%r11), %xmm3
662 pshufb %xmm1, %xmm3
663 pxor %xmm2, %xmm3
664
665 add \$-16, %rdx
666
667.Lschedule_mangle_both:
668 movdqa (%r8,%r10),%xmm1
669 pshufb %xmm1,%xmm3
670 add \$-16, %r8
671 and \$0x30, %r8
672 movdqu %xmm3, (%rdx)
673 ret
674.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
675
676#
677# Interface to OpenSSL
678#
679.globl ${PREFIX}_set_encrypt_key
680.type ${PREFIX}_set_encrypt_key,\@function,3
681.align 16
682${PREFIX}_set_encrypt_key:
683 _CET_ENDBR
684___
685$code.=<<___ if ($win64);
686 lea -0xb8(%rsp),%rsp
687 movaps %xmm6,0x10(%rsp)
688 movaps %xmm7,0x20(%rsp)
689 movaps %xmm8,0x30(%rsp)
690 movaps %xmm9,0x40(%rsp)
691 movaps %xmm10,0x50(%rsp)
692 movaps %xmm11,0x60(%rsp)
693 movaps %xmm12,0x70(%rsp)
694 movaps %xmm13,0x80(%rsp)
695 movaps %xmm14,0x90(%rsp)
696 movaps %xmm15,0xa0(%rsp)
697.Lenc_key_body:
698___
699$code.=<<___;
700 mov %esi,%eax
701 shr \$5,%eax
702 add \$5,%eax
703 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
704
705 mov \$0,%ecx
706 mov \$0x30,%r8d
707 call _vpaes_schedule_core
708___
709$code.=<<___ if ($win64);
710 movaps 0x10(%rsp),%xmm6
711 movaps 0x20(%rsp),%xmm7
712 movaps 0x30(%rsp),%xmm8
713 movaps 0x40(%rsp),%xmm9
714 movaps 0x50(%rsp),%xmm10
715 movaps 0x60(%rsp),%xmm11
716 movaps 0x70(%rsp),%xmm12
717 movaps 0x80(%rsp),%xmm13
718 movaps 0x90(%rsp),%xmm14
719 movaps 0xa0(%rsp),%xmm15
720 lea 0xb8(%rsp),%rsp
721.Lenc_key_epilogue:
722___
723$code.=<<___;
724 xor %eax,%eax
725 ret
726.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
727
728.globl ${PREFIX}_set_decrypt_key
729.type ${PREFIX}_set_decrypt_key,\@function,3
730.align 16
731${PREFIX}_set_decrypt_key:
732 _CET_ENDBR
733___
734$code.=<<___ if ($win64);
735 lea -0xb8(%rsp),%rsp
736 movaps %xmm6,0x10(%rsp)
737 movaps %xmm7,0x20(%rsp)
738 movaps %xmm8,0x30(%rsp)
739 movaps %xmm9,0x40(%rsp)
740 movaps %xmm10,0x50(%rsp)
741 movaps %xmm11,0x60(%rsp)
742 movaps %xmm12,0x70(%rsp)
743 movaps %xmm13,0x80(%rsp)
744 movaps %xmm14,0x90(%rsp)
745 movaps %xmm15,0xa0(%rsp)
746.Ldec_key_body:
747___
748$code.=<<___;
749 mov %esi,%eax
750 shr \$5,%eax
751 add \$5,%eax
752 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
753 shl \$4,%eax
754 lea 16(%rdx,%rax),%rdx
755
756 mov \$1,%ecx
757 mov %esi,%r8d
758 shr \$1,%r8d
759 and \$32,%r8d
760 xor \$32,%r8d # nbits==192?0:32
761 call _vpaes_schedule_core
762___
763$code.=<<___ if ($win64);
764 movaps 0x10(%rsp),%xmm6
765 movaps 0x20(%rsp),%xmm7
766 movaps 0x30(%rsp),%xmm8
767 movaps 0x40(%rsp),%xmm9
768 movaps 0x50(%rsp),%xmm10
769 movaps 0x60(%rsp),%xmm11
770 movaps 0x70(%rsp),%xmm12
771 movaps 0x80(%rsp),%xmm13
772 movaps 0x90(%rsp),%xmm14
773 movaps 0xa0(%rsp),%xmm15
774 lea 0xb8(%rsp),%rsp
775.Ldec_key_epilogue:
776___
777$code.=<<___;
778 xor %eax,%eax
779 ret
780.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
781
782.globl ${PREFIX}_encrypt
783.type ${PREFIX}_encrypt,\@function,3
784.align 16
785${PREFIX}_encrypt:
786 _CET_ENDBR
787___
788$code.=<<___ if ($win64);
789 lea -0xb8(%rsp),%rsp
790 movaps %xmm6,0x10(%rsp)
791 movaps %xmm7,0x20(%rsp)
792 movaps %xmm8,0x30(%rsp)
793 movaps %xmm9,0x40(%rsp)
794 movaps %xmm10,0x50(%rsp)
795 movaps %xmm11,0x60(%rsp)
796 movaps %xmm12,0x70(%rsp)
797 movaps %xmm13,0x80(%rsp)
798 movaps %xmm14,0x90(%rsp)
799 movaps %xmm15,0xa0(%rsp)
800.Lenc_body:
801___
802$code.=<<___;
803 movdqu (%rdi),%xmm0
804 call _vpaes_preheat
805 call _vpaes_encrypt_core
806 movdqu %xmm0,(%rsi)
807___
808$code.=<<___ if ($win64);
809 movaps 0x10(%rsp),%xmm6
810 movaps 0x20(%rsp),%xmm7
811 movaps 0x30(%rsp),%xmm8
812 movaps 0x40(%rsp),%xmm9
813 movaps 0x50(%rsp),%xmm10
814 movaps 0x60(%rsp),%xmm11
815 movaps 0x70(%rsp),%xmm12
816 movaps 0x80(%rsp),%xmm13
817 movaps 0x90(%rsp),%xmm14
818 movaps 0xa0(%rsp),%xmm15
819 lea 0xb8(%rsp),%rsp
820.Lenc_epilogue:
821___
822$code.=<<___;
823 ret
824.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
825
826.globl ${PREFIX}_decrypt
827.type ${PREFIX}_decrypt,\@function,3
828.align 16
829${PREFIX}_decrypt:
830 _CET_ENDBR
831___
832$code.=<<___ if ($win64);
833 lea -0xb8(%rsp),%rsp
834 movaps %xmm6,0x10(%rsp)
835 movaps %xmm7,0x20(%rsp)
836 movaps %xmm8,0x30(%rsp)
837 movaps %xmm9,0x40(%rsp)
838 movaps %xmm10,0x50(%rsp)
839 movaps %xmm11,0x60(%rsp)
840 movaps %xmm12,0x70(%rsp)
841 movaps %xmm13,0x80(%rsp)
842 movaps %xmm14,0x90(%rsp)
843 movaps %xmm15,0xa0(%rsp)
844.Ldec_body:
845___
846$code.=<<___;
847 movdqu (%rdi),%xmm0
848 call _vpaes_preheat
849 call _vpaes_decrypt_core
850 movdqu %xmm0,(%rsi)
851___
852$code.=<<___ if ($win64);
853 movaps 0x10(%rsp),%xmm6
854 movaps 0x20(%rsp),%xmm7
855 movaps 0x30(%rsp),%xmm8
856 movaps 0x40(%rsp),%xmm9
857 movaps 0x50(%rsp),%xmm10
858 movaps 0x60(%rsp),%xmm11
859 movaps 0x70(%rsp),%xmm12
860 movaps 0x80(%rsp),%xmm13
861 movaps 0x90(%rsp),%xmm14
862 movaps 0xa0(%rsp),%xmm15
863 lea 0xb8(%rsp),%rsp
864.Ldec_epilogue:
865___
866$code.=<<___;
867 ret
868.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
869___
870{
871my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
872# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
873# size_t length, const AES_KEY *key,
874# unsigned char *ivp,const int enc);
875$code.=<<___;
876.globl ${PREFIX}_cbc_encrypt
877.type ${PREFIX}_cbc_encrypt,\@function,6
878.align 16
879${PREFIX}_cbc_encrypt:
880 _CET_ENDBR
881 xchg $key,$len
882___
883($len,$key)=($key,$len);
884$code.=<<___;
885 sub \$16,$len
886 jc .Lcbc_abort
887___
888$code.=<<___ if ($win64);
889 lea -0xb8(%rsp),%rsp
890 movaps %xmm6,0x10(%rsp)
891 movaps %xmm7,0x20(%rsp)
892 movaps %xmm8,0x30(%rsp)
893 movaps %xmm9,0x40(%rsp)
894 movaps %xmm10,0x50(%rsp)
895 movaps %xmm11,0x60(%rsp)
896 movaps %xmm12,0x70(%rsp)
897 movaps %xmm13,0x80(%rsp)
898 movaps %xmm14,0x90(%rsp)
899 movaps %xmm15,0xa0(%rsp)
900.Lcbc_body:
901___
902$code.=<<___;
903 movdqu ($ivp),%xmm6 # load IV
904 sub $inp,$out
905 call _vpaes_preheat
906 cmp \$0,${enc}d
907 je .Lcbc_dec_loop
908 jmp .Lcbc_enc_loop
909.align 16
910.Lcbc_enc_loop:
911 movdqu ($inp),%xmm0
912 pxor %xmm6,%xmm0
913 call _vpaes_encrypt_core
914 movdqa %xmm0,%xmm6
915 movdqu %xmm0,($out,$inp)
916 lea 16($inp),$inp
917 sub \$16,$len
918 jnc .Lcbc_enc_loop
919 jmp .Lcbc_done
920.align 16
921.Lcbc_dec_loop:
922 movdqu ($inp),%xmm0
923 movdqa %xmm0,%xmm7
924 call _vpaes_decrypt_core
925 pxor %xmm6,%xmm0
926 movdqa %xmm7,%xmm6
927 movdqu %xmm0,($out,$inp)
928 lea 16($inp),$inp
929 sub \$16,$len
930 jnc .Lcbc_dec_loop
931.Lcbc_done:
932 movdqu %xmm6,($ivp) # save IV
933___
934$code.=<<___ if ($win64);
935 movaps 0x10(%rsp),%xmm6
936 movaps 0x20(%rsp),%xmm7
937 movaps 0x30(%rsp),%xmm8
938 movaps 0x40(%rsp),%xmm9
939 movaps 0x50(%rsp),%xmm10
940 movaps 0x60(%rsp),%xmm11
941 movaps 0x70(%rsp),%xmm12
942 movaps 0x80(%rsp),%xmm13
943 movaps 0x90(%rsp),%xmm14
944 movaps 0xa0(%rsp),%xmm15
945 lea 0xb8(%rsp),%rsp
946.Lcbc_epilogue:
947___
948$code.=<<___;
949.Lcbc_abort:
950 ret
951.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
952___
953}
954$code.=<<___;
955##
956## _vpaes_preheat
957##
958## Fills register %r10 -> .aes_consts (so you can -fPIC)
959## and %xmm9-%xmm15 as specified below.
960##
961.type _vpaes_preheat,\@abi-omnipotent
962.align 16
963_vpaes_preheat:
964 _CET_ENDBR
965 lea .Lk_s0F(%rip), %r10
966 movdqa -0x20(%r10), %xmm10 # .Lk_inv
967 movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
968 movdqa 0x00(%r10), %xmm9 # .Lk_s0F
969 movdqa 0x30(%r10), %xmm13 # .Lk_sb1
970 movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
971 movdqa 0x50(%r10), %xmm15 # .Lk_sb2
972 movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
973 ret
974.size _vpaes_preheat,.-_vpaes_preheat
975########################################################
976## ##
977## Constants ##
978## ##
979########################################################
980.section .rodata
981.type _vpaes_consts,\@object
982.align 64
983_vpaes_consts:
984.Lk_inv: # inv, inva
985 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
986 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
987
988.Lk_s0F: # s0F
989 .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
990
991.Lk_ipt: # input transform (lo, hi)
992 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
993 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
994
995.Lk_sb1: # sb1u, sb1t
996 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
997 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
998.Lk_sb2: # sb2u, sb2t
999 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
1000 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
1001.Lk_sbo: # sbou, sbot
1002 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
1003 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
1004
1005.Lk_mc_forward: # mc_forward
1006 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
1007 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
1008 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
1009 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
1010
1011.Lk_mc_backward:# mc_backward
1012 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
1013 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
1014 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
1015 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
1016
1017.Lk_sr: # sr
1018 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
1019 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
1020 .quad 0x0F060D040B020900, 0x070E050C030A0108
1021 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
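
For reference, .Lk_sr row k (bytes read low to high within each quad) is ShiftRows applied k times, i.e. the byte permutation dst[i] = src[5^k * i mod 16]; row 1 is plain ShiftRows. A one-loop plain-Perl check that regenerates the four rows:

	for my $k (0 .. 3) {
	    my $m = (5 ** $k) % 16;			# 1, 5, 9, 13
	    printf "row %d: %s\n", $k,
		join ' ', map { sprintf '%02x', ($m * $_) % 16 } 0 .. 15;
	}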
1022
1023.Lk_rcon: # rcon
1024 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
1025
1026.Lk_s63: # s63: all equal to 0x63 transformed
1027 .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
1028
1029.Lk_opt: # output transform
1030 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
1031 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
1032
1033.Lk_deskew: # deskew tables: inverts the sbox's "skew"
1034 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
1035 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
1036
1037##
1038## Decryption stuff
1039## Key schedule constants
1040##
1041.Lk_dksd: # decryption key schedule: invskew x*D
1042 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
1043 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
1044.Lk_dksb: # decryption key schedule: invskew x*B
1045 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
1046 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
1047.Lk_dkse: # decryption key schedule: invskew x*E + 0x63
1048 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
1049 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
1050.Lk_dks9: # decryption key schedule: invskew x*9
1051 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
1052 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
1053
1054##
1055## Decryption stuff
1056## Round function constants
1057##
1058.Lk_dipt: # decryption input transform
1059 .quad 0x0F505B040B545F00, 0x154A411E114E451A
1060 .quad 0x86E383E660056500, 0x12771772F491F194
1061
1062.Lk_dsb9: # decryption sbox output *9*u, *9*t
1063 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
1064 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
1065.Lk_dsbd: # decryption sbox output *D*u, *D*t
1066 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
1067 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
1068.Lk_dsbb: # decryption sbox output *B*u, *B*t
1069 .quad 0xD022649296B44200, 0x602646F6B0F2D404
1070 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
1071.Lk_dsbe: # decryption sbox output *E*u, *E*t
1072 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
1073 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
1074.Lk_dsbo: # decryption sbox final output
1075 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
1076 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
1077.align 64
1078.size _vpaes_consts,.-_vpaes_consts
1079.text
1080___
1081
1082if ($win64) {
1083# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1084# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1085$rec="%rcx";
1086$frame="%rdx";
1087$context="%r8";
1088$disp="%r9";
1089
1090$code.=<<___;
1091.extern __imp_RtlVirtualUnwind
1092.type se_handler,\@abi-omnipotent
1093.align 16
1094se_handler:
1095 _CET_ENDBR
1096 push %rsi
1097 push %rdi
1098 push %rbx
1099 push %rbp
1100 push %r12
1101 push %r13
1102 push %r14
1103 push %r15
1104 pushfq
1105 sub \$64,%rsp
1106
1107 mov 120($context),%rax # pull context->Rax
1108 mov 248($context),%rbx # pull context->Rip
1109
1110 mov 8($disp),%rsi # disp->ImageBase
1111 mov 56($disp),%r11 # disp->HandlerData
1112
1113 mov 0(%r11),%r10d # HandlerData[0]
1114 lea (%rsi,%r10),%r10 # prologue label
1115 cmp %r10,%rbx # context->Rip<prologue label
1116 jb .Lin_prologue
1117
1118 mov 152($context),%rax # pull context->Rsp
1119
1120 mov 4(%r11),%r10d # HandlerData[1]
1121 lea (%rsi,%r10),%r10 # epilogue label
1122 cmp %r10,%rbx # context->Rip>=epilogue label
1123 jae .Lin_prologue
1124
1125 lea 16(%rax),%rsi # %xmm save area
1126 lea 512($context),%rdi # &context.Xmm6
1127 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1128 .long 0xa548f3fc # cld; rep movsq
1129 lea 0xb8(%rax),%rax # adjust stack pointer
1130
1131.Lin_prologue:
1132 mov 8(%rax),%rdi
1133 mov 16(%rax),%rsi
1134 mov %rax,152($context) # restore context->Rsp
1135 mov %rsi,168($context) # restore context->Rsi
1136 mov %rdi,176($context) # restore context->Rdi
1137
1138 mov 40($disp),%rdi # disp->ContextRecord
1139 mov $context,%rsi # context
1140 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1141 .long 0xa548f3fc # cld; rep movsq
1142
1143 mov $disp,%rsi
1144 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1145 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1146 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1147 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1148 mov 40(%rsi),%r10 # disp->ContextRecord
1149 lea 56(%rsi),%r11 # &disp->HandlerData
1150 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1151 mov %r10,32(%rsp) # arg5
1152 mov %r11,40(%rsp) # arg6
1153 mov %r12,48(%rsp) # arg7
1154 mov %rcx,56(%rsp) # arg8, (NULL)
1155 call *__imp_RtlVirtualUnwind(%rip)
1156
1157 mov \$1,%eax # ExceptionContinueSearch
1158 add \$64,%rsp
1159 popfq
1160 pop %r15
1161 pop %r14
1162 pop %r13
1163 pop %r12
1164 pop %rbp
1165 pop %rbx
1166 pop %rdi
1167 pop %rsi
1168 ret
1169.size se_handler,.-se_handler
1170
1171.section .pdata
1172.align 4
1173 .rva .LSEH_begin_${PREFIX}_set_encrypt_key
1174 .rva .LSEH_end_${PREFIX}_set_encrypt_key
1175 .rva .LSEH_info_${PREFIX}_set_encrypt_key
1176
1177 .rva .LSEH_begin_${PREFIX}_set_decrypt_key
1178 .rva .LSEH_end_${PREFIX}_set_decrypt_key
1179 .rva .LSEH_info_${PREFIX}_set_decrypt_key
1180
1181 .rva .LSEH_begin_${PREFIX}_encrypt
1182 .rva .LSEH_end_${PREFIX}_encrypt
1183 .rva .LSEH_info_${PREFIX}_encrypt
1184
1185 .rva .LSEH_begin_${PREFIX}_decrypt
1186 .rva .LSEH_end_${PREFIX}_decrypt
1187 .rva .LSEH_info_${PREFIX}_decrypt
1188
1189 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
1190 .rva .LSEH_end_${PREFIX}_cbc_encrypt
1191 .rva .LSEH_info_${PREFIX}_cbc_encrypt
1192
1193.section .xdata
1194.align 8
1195.LSEH_info_${PREFIX}_set_encrypt_key:
1196 .byte 9,0,0,0
1197 .rva se_handler
1198 .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
1199.LSEH_info_${PREFIX}_set_decrypt_key:
1200 .byte 9,0,0,0
1201 .rva se_handler
1202 .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
1203.LSEH_info_${PREFIX}_encrypt:
1204 .byte 9,0,0,0
1205 .rva se_handler
1206 .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
1207.LSEH_info_${PREFIX}_decrypt:
1208 .byte 9,0,0,0
1209 .rva se_handler
1210 .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
1211.LSEH_info_${PREFIX}_cbc_encrypt:
1212 .byte 9,0,0,0
1213 .rva se_handler
1214 .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
1215___
1216}
1217
1218$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1219
1220print $code;
1221
1222close STDOUT;