diff options
Diffstat (limited to 'src/lib/libcrypto/aes')
-rw-r--r-- | src/lib/libcrypto/aes/Makefile.ssl | 103 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/aes.h | 28 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/aes_cbc.c | 82 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/aes_cfb.c | 160 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/aes_core.c | 209 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/aes_ctr.c | 90 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/aes_ofb.c | 94 | ||||
-rw-r--r-- | src/lib/libcrypto/aes/asm/aes-586.pl | 2403 | ||||
-rwxr-xr-x | src/lib/libcrypto/aes/asm/aes-x86_64.pl | 2 |
9 files changed, 2163 insertions, 1008 deletions
diff --git a/src/lib/libcrypto/aes/Makefile.ssl b/src/lib/libcrypto/aes/Makefile.ssl deleted file mode 100644 index f353aeb697..0000000000 --- a/src/lib/libcrypto/aes/Makefile.ssl +++ /dev/null | |||
@@ -1,103 +0,0 @@ | |||
1 | # | ||
2 | # crypto/aes/Makefile | ||
3 | # | ||
4 | |||
5 | DIR= aes | ||
6 | TOP= ../.. | ||
7 | CC= cc | ||
8 | CPP= $(CC) -E | ||
9 | INCLUDES= | ||
10 | CFLAG=-g | ||
11 | INSTALL_PREFIX= | ||
12 | OPENSSLDIR= /usr/local/ssl | ||
13 | INSTALLTOP= /usr/local/ssl | ||
14 | MAKE= make -f Makefile.ssl | ||
15 | MAKEDEPPROG= makedepend | ||
16 | MAKEDEPEND= $(TOP)/util/domd $(TOP) -MD $(MAKEDEPPROG) | ||
17 | MAKEFILE= Makefile.ssl | ||
18 | AR= ar r | ||
19 | |||
20 | # CFLAGS= -mpentiumpro $(INCLUDES) $(CFLAG) -O3 -fexpensive-optimizations -funroll-loops -fforce-addr | ||
21 | CFLAGS= $(INCLUDES) $(CFLAG) | ||
22 | |||
23 | GENERAL=Makefile | ||
24 | #TEST=aestest.c | ||
25 | TEST= | ||
26 | APPS= | ||
27 | |||
28 | LIB=$(TOP)/libcrypto.a | ||
29 | LIBSRC=aes_core.c aes_misc.c aes_ecb.c aes_cbc.c aes_cfb.c aes_ofb.c aes_ctr.c | ||
30 | LIBOBJ=aes_core.o aes_misc.o aes_ecb.o aes_cbc.o aes_cfb.o aes_ofb.o aes_ctr.o | ||
31 | |||
32 | SRC= $(LIBSRC) | ||
33 | |||
34 | EXHEADER= aes.h | ||
35 | HEADER= aes_locl.h $(EXHEADER) | ||
36 | |||
37 | ALL= $(GENERAL) $(SRC) $(HEADER) | ||
38 | |||
39 | top: | ||
40 | (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all) | ||
41 | |||
42 | all: lib | ||
43 | |||
44 | lib: $(LIBOBJ) | ||
45 | $(AR) $(LIB) $(LIBOBJ) | ||
46 | $(RANLIB) $(LIB) || echo Never mind. | ||
47 | @touch lib | ||
48 | |||
49 | $(LIBOBJ): $(LIBSRC) | ||
50 | |||
51 | files: | ||
52 | $(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO | ||
53 | |||
54 | links: | ||
55 | @sh $(TOP)/util/point.sh Makefile.ssl Makefile | ||
56 | @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER) | ||
57 | @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST) | ||
58 | @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS) | ||
59 | |||
60 | install: installs | ||
61 | |||
62 | installs: | ||
63 | @for i in $(EXHEADER) ; \ | ||
64 | do \ | ||
65 | (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \ | ||
66 | chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \ | ||
67 | done; | ||
68 | |||
69 | tags: | ||
70 | ctags $(SRC) | ||
71 | |||
72 | tests: | ||
73 | |||
74 | lint: | ||
75 | lint -DLINT $(INCLUDES) $(SRC)>fluff | ||
76 | |||
77 | depend: | ||
78 | $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC) | ||
79 | |||
80 | dclean: | ||
81 | $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new | ||
82 | mv -f Makefile.new $(MAKEFILE) | ||
83 | |||
84 | clean: | ||
85 | rm -f *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff | ||
86 | |||
87 | # DO NOT DELETE THIS LINE -- make depend depends on it. | ||
88 | |||
89 | aes_cbc.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
90 | aes_cbc.o: ../../include/openssl/opensslconf.h aes_cbc.c aes_locl.h | ||
91 | aes_cfb.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
92 | aes_cfb.o: ../../include/openssl/opensslconf.h aes_cfb.c aes_locl.h | ||
93 | aes_core.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
94 | aes_core.o: ../../include/openssl/opensslconf.h aes_core.c aes_locl.h | ||
95 | aes_ctr.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
96 | aes_ctr.o: ../../include/openssl/opensslconf.h aes_ctr.c aes_locl.h | ||
97 | aes_ecb.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
98 | aes_ecb.o: ../../include/openssl/opensslconf.h aes_ecb.c aes_locl.h | ||
99 | aes_misc.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
100 | aes_misc.o: ../../include/openssl/opensslconf.h | ||
101 | aes_misc.o: ../../include/openssl/opensslv.h aes_locl.h aes_misc.c | ||
102 | aes_ofb.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
103 | aes_ofb.o: ../../include/openssl/opensslconf.h aes_locl.h aes_ofb.c | ||
diff --git a/src/lib/libcrypto/aes/aes.h b/src/lib/libcrypto/aes/aes.h index 450f2b4051..d2c99730fe 100644 --- a/src/lib/libcrypto/aes/aes.h +++ b/src/lib/libcrypto/aes/aes.h | |||
@@ -58,6 +58,8 @@ | |||
58 | #error AES is disabled. | 58 | #error AES is disabled. |
59 | #endif | 59 | #endif |
60 | 60 | ||
61 | #include <stddef.h> | ||
62 | |||
61 | #define AES_ENCRYPT 1 | 63 | #define AES_ENCRYPT 1 |
62 | #define AES_DECRYPT 0 | 64 | #define AES_DECRYPT 0 |
63 | 65 | ||
@@ -66,10 +68,6 @@ | |||
66 | #define AES_MAXNR 14 | 68 | #define AES_MAXNR 14 |
67 | #define AES_BLOCK_SIZE 16 | 69 | #define AES_BLOCK_SIZE 16 |
68 | 70 | ||
69 | #ifdef OPENSSL_FIPS | ||
70 | #define FIPS_AES_SIZE_T int | ||
71 | #endif | ||
72 | |||
73 | #ifdef __cplusplus | 71 | #ifdef __cplusplus |
74 | extern "C" { | 72 | extern "C" { |
75 | #endif | 73 | #endif |
@@ -100,37 +98,32 @@ void AES_decrypt(const unsigned char *in, unsigned char *out, | |||
100 | void AES_ecb_encrypt(const unsigned char *in, unsigned char *out, | 98 | void AES_ecb_encrypt(const unsigned char *in, unsigned char *out, |
101 | const AES_KEY *key, const int enc); | 99 | const AES_KEY *key, const int enc); |
102 | void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | 100 | void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, |
103 | const unsigned long length, const AES_KEY *key, | 101 | size_t length, const AES_KEY *key, |
104 | unsigned char *ivec, const int enc); | 102 | unsigned char *ivec, const int enc); |
105 | void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, | 103 | void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, |
106 | const unsigned long length, const AES_KEY *key, | 104 | size_t length, const AES_KEY *key, |
107 | unsigned char *ivec, int *num, const int enc); | 105 | unsigned char *ivec, int *num, const int enc); |
108 | void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, | 106 | void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, |
109 | const unsigned long length, const AES_KEY *key, | 107 | size_t length, const AES_KEY *key, |
110 | unsigned char *ivec, int *num, const int enc); | 108 | unsigned char *ivec, int *num, const int enc); |
111 | void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, | 109 | void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, |
112 | const unsigned long length, const AES_KEY *key, | 110 | size_t length, const AES_KEY *key, |
113 | unsigned char *ivec, int *num, const int enc); | 111 | unsigned char *ivec, int *num, const int enc); |
114 | void AES_cfbr_encrypt_block(const unsigned char *in,unsigned char *out, | ||
115 | const int nbits,const AES_KEY *key, | ||
116 | unsigned char *ivec,const int enc); | ||
117 | void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, | 112 | void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, |
118 | const unsigned long length, const AES_KEY *key, | 113 | size_t length, const AES_KEY *key, |
119 | unsigned char *ivec, int *num); | 114 | unsigned char *ivec, int *num); |
120 | void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, | 115 | void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, |
121 | const unsigned long length, const AES_KEY *key, | 116 | size_t length, const AES_KEY *key, |
122 | unsigned char ivec[AES_BLOCK_SIZE], | 117 | unsigned char ivec[AES_BLOCK_SIZE], |
123 | unsigned char ecount_buf[AES_BLOCK_SIZE], | 118 | unsigned char ecount_buf[AES_BLOCK_SIZE], |
124 | unsigned int *num); | 119 | unsigned int *num); |
125 | |||
126 | /* For IGE, see also http://www.links.org/files/openssl-ige.pdf */ | ||
127 | /* NB: the IV is _two_ blocks long */ | 120 | /* NB: the IV is _two_ blocks long */ |
128 | void AES_ige_encrypt(const unsigned char *in, unsigned char *out, | 121 | void AES_ige_encrypt(const unsigned char *in, unsigned char *out, |
129 | const unsigned long length, const AES_KEY *key, | 122 | size_t length, const AES_KEY *key, |
130 | unsigned char *ivec, const int enc); | 123 | unsigned char *ivec, const int enc); |
131 | /* NB: the IV is _four_ blocks long */ | 124 | /* NB: the IV is _four_ blocks long */ |
132 | void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out, | 125 | void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out, |
133 | const unsigned long length, const AES_KEY *key, | 126 | size_t length, const AES_KEY *key, |
134 | const AES_KEY *key2, const unsigned char *ivec, | 127 | const AES_KEY *key2, const unsigned char *ivec, |
135 | const int enc); | 128 | const int enc); |
136 | 129 | ||
@@ -141,6 +134,7 @@ int AES_unwrap_key(AES_KEY *key, const unsigned char *iv, | |||
141 | unsigned char *out, | 134 | unsigned char *out, |
142 | const unsigned char *in, unsigned int inlen); | 135 | const unsigned char *in, unsigned int inlen); |
143 | 136 | ||
137 | |||
144 | #ifdef __cplusplus | 138 | #ifdef __cplusplus |
145 | } | 139 | } |
146 | #endif | 140 | #endif |
diff --git a/src/lib/libcrypto/aes/aes_cbc.c b/src/lib/libcrypto/aes/aes_cbc.c index 373864cd4b..227f75625d 100644 --- a/src/lib/libcrypto/aes/aes_cbc.c +++ b/src/lib/libcrypto/aes/aes_cbc.c | |||
@@ -49,85 +49,15 @@ | |||
49 | * | 49 | * |
50 | */ | 50 | */ |
51 | 51 | ||
52 | #ifndef AES_DEBUG | ||
53 | # ifndef NDEBUG | ||
54 | # define NDEBUG | ||
55 | # endif | ||
56 | #endif | ||
57 | #include <assert.h> | ||
58 | |||
59 | #include <openssl/aes.h> | 52 | #include <openssl/aes.h> |
60 | #include "aes_locl.h" | 53 | #include <openssl/modes.h> |
61 | 54 | ||
62 | #if !defined(OPENSSL_FIPS_AES_ASM) | ||
63 | void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | 55 | void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, |
64 | const unsigned long length, const AES_KEY *key, | 56 | size_t len, const AES_KEY *key, |
65 | unsigned char *ivec, const int enc) { | 57 | unsigned char *ivec, const int enc) { |
66 | 58 | ||
67 | unsigned long n; | 59 | if (enc) |
68 | unsigned long len = length; | 60 | CRYPTO_cbc128_encrypt(in,out,len,key,ivec,(block128_f)AES_encrypt); |
69 | unsigned char tmp[AES_BLOCK_SIZE]; | 61 | else |
70 | const unsigned char *iv = ivec; | 62 | CRYPTO_cbc128_decrypt(in,out,len,key,ivec,(block128_f)AES_decrypt); |
71 | |||
72 | assert(in && out && key && ivec); | ||
73 | assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc)); | ||
74 | |||
75 | if (AES_ENCRYPT == enc) { | ||
76 | while (len >= AES_BLOCK_SIZE) { | ||
77 | for(n=0; n < AES_BLOCK_SIZE; ++n) | ||
78 | out[n] = in[n] ^ iv[n]; | ||
79 | AES_encrypt(out, out, key); | ||
80 | iv = out; | ||
81 | len -= AES_BLOCK_SIZE; | ||
82 | in += AES_BLOCK_SIZE; | ||
83 | out += AES_BLOCK_SIZE; | ||
84 | } | ||
85 | if (len) { | ||
86 | for(n=0; n < len; ++n) | ||
87 | out[n] = in[n] ^ iv[n]; | ||
88 | for(n=len; n < AES_BLOCK_SIZE; ++n) | ||
89 | out[n] = iv[n]; | ||
90 | AES_encrypt(out, out, key); | ||
91 | iv = out; | ||
92 | } | ||
93 | memcpy(ivec,iv,AES_BLOCK_SIZE); | ||
94 | } else if (in != out) { | ||
95 | while (len >= AES_BLOCK_SIZE) { | ||
96 | AES_decrypt(in, out, key); | ||
97 | for(n=0; n < AES_BLOCK_SIZE; ++n) | ||
98 | out[n] ^= iv[n]; | ||
99 | iv = in; | ||
100 | len -= AES_BLOCK_SIZE; | ||
101 | in += AES_BLOCK_SIZE; | ||
102 | out += AES_BLOCK_SIZE; | ||
103 | } | ||
104 | if (len) { | ||
105 | AES_decrypt(in,tmp,key); | ||
106 | for(n=0; n < len; ++n) | ||
107 | out[n] = tmp[n] ^ iv[n]; | ||
108 | iv = in; | ||
109 | } | ||
110 | memcpy(ivec,iv,AES_BLOCK_SIZE); | ||
111 | } else { | ||
112 | while (len >= AES_BLOCK_SIZE) { | ||
113 | memcpy(tmp, in, AES_BLOCK_SIZE); | ||
114 | AES_decrypt(in, out, key); | ||
115 | for(n=0; n < AES_BLOCK_SIZE; ++n) | ||
116 | out[n] ^= ivec[n]; | ||
117 | memcpy(ivec, tmp, AES_BLOCK_SIZE); | ||
118 | len -= AES_BLOCK_SIZE; | ||
119 | in += AES_BLOCK_SIZE; | ||
120 | out += AES_BLOCK_SIZE; | ||
121 | } | ||
122 | if (len) { | ||
123 | memcpy(tmp, in, AES_BLOCK_SIZE); | ||
124 | AES_decrypt(tmp, out, key); | ||
125 | for(n=0; n < len; ++n) | ||
126 | out[n] ^= ivec[n]; | ||
127 | for(n=len; n < AES_BLOCK_SIZE; ++n) | ||
128 | out[n] = tmp[n]; | ||
129 | memcpy(ivec, tmp, AES_BLOCK_SIZE); | ||
130 | } | ||
131 | } | ||
132 | } | 63 | } |
133 | #endif | ||
diff --git a/src/lib/libcrypto/aes/aes_cfb.c b/src/lib/libcrypto/aes/aes_cfb.c index 49f0411010..0c6d058ce7 100644 --- a/src/lib/libcrypto/aes/aes_cfb.c +++ b/src/lib/libcrypto/aes/aes_cfb.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* crypto/aes/aes_cfb.c -*- mode:C; c-file-style: "eay" -*- */ | 1 | /* crypto/aes/aes_cfb.c -*- mode:C; c-file-style: "eay" -*- */ |
2 | /* ==================================================================== | 2 | /* ==================================================================== |
3 | * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. |
4 | * | 4 | * |
5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
@@ -48,73 +48,9 @@ | |||
48 | * ==================================================================== | 48 | * ==================================================================== |
49 | * | 49 | * |
50 | */ | 50 | */ |
51 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
52 | * All rights reserved. | ||
53 | * | ||
54 | * This package is an SSL implementation written | ||
55 | * by Eric Young (eay@cryptsoft.com). | ||
56 | * The implementation was written so as to conform with Netscapes SSL. | ||
57 | * | ||
58 | * This library is free for commercial and non-commercial use as long as | ||
59 | * the following conditions are aheared to. The following conditions | ||
60 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
61 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
62 | * included with this distribution is covered by the same copyright terms | ||
63 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
64 | * | ||
65 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
66 | * the code are not to be removed. | ||
67 | * If this package is used in a product, Eric Young should be given attribution | ||
68 | * as the author of the parts of the library used. | ||
69 | * This can be in the form of a textual message at program startup or | ||
70 | * in documentation (online or textual) provided with the package. | ||
71 | * | ||
72 | * Redistribution and use in source and binary forms, with or without | ||
73 | * modification, are permitted provided that the following conditions | ||
74 | * are met: | ||
75 | * 1. Redistributions of source code must retain the copyright | ||
76 | * notice, this list of conditions and the following disclaimer. | ||
77 | * 2. Redistributions in binary form must reproduce the above copyright | ||
78 | * notice, this list of conditions and the following disclaimer in the | ||
79 | * documentation and/or other materials provided with the distribution. | ||
80 | * 3. All advertising materials mentioning features or use of this software | ||
81 | * must display the following acknowledgement: | ||
82 | * "This product includes cryptographic software written by | ||
83 | * Eric Young (eay@cryptsoft.com)" | ||
84 | * The word 'cryptographic' can be left out if the rouines from the library | ||
85 | * being used are not cryptographic related :-). | ||
86 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
87 | * the apps directory (application code) you must include an acknowledgement: | ||
88 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
89 | * | ||
90 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
91 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
92 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
93 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
94 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
95 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
96 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
97 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
98 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
99 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
100 | * SUCH DAMAGE. | ||
101 | * | ||
102 | * The licence and distribution terms for any publically available version or | ||
103 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
104 | * copied and put under another distribution licence | ||
105 | * [including the GNU Public Licence.] | ||
106 | */ | ||
107 | |||
108 | #ifndef AES_DEBUG | ||
109 | # ifndef NDEBUG | ||
110 | # define NDEBUG | ||
111 | # endif | ||
112 | #endif | ||
113 | #include <assert.h> | ||
114 | 51 | ||
115 | #include <openssl/aes.h> | 52 | #include <openssl/aes.h> |
116 | #include "aes_locl.h" | 53 | #include <openssl/modes.h> |
117 | #include "e_os.h" | ||
118 | 54 | ||
119 | /* The input and output encrypted as though 128bit cfb mode is being | 55 | /* The input and output encrypted as though 128bit cfb mode is being |
120 | * used. The extra state information to record how much of the | 56 | * used. The extra state information to record how much of the |
@@ -122,104 +58,24 @@ | |||
122 | */ | 58 | */ |
123 | 59 | ||
124 | void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, | 60 | void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, |
125 | const unsigned long length, const AES_KEY *key, | 61 | size_t length, const AES_KEY *key, |
126 | unsigned char *ivec, int *num, const int enc) { | 62 | unsigned char *ivec, int *num, const int enc) { |
127 | 63 | ||
128 | unsigned int n; | 64 | CRYPTO_cfb128_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt); |
129 | unsigned long l = length; | ||
130 | unsigned char c; | ||
131 | |||
132 | assert(in && out && key && ivec && num); | ||
133 | |||
134 | n = *num; | ||
135 | |||
136 | if (enc) { | ||
137 | while (l--) { | ||
138 | if (n == 0) { | ||
139 | AES_encrypt(ivec, ivec, key); | ||
140 | } | ||
141 | ivec[n] = *(out++) = *(in++) ^ ivec[n]; | ||
142 | n = (n+1) % AES_BLOCK_SIZE; | ||
143 | } | ||
144 | } else { | ||
145 | while (l--) { | ||
146 | if (n == 0) { | ||
147 | AES_encrypt(ivec, ivec, key); | ||
148 | } | ||
149 | c = *(in); | ||
150 | *(out++) = *(in++) ^ ivec[n]; | ||
151 | ivec[n] = c; | ||
152 | n = (n+1) % AES_BLOCK_SIZE; | ||
153 | } | ||
154 | } | ||
155 | |||
156 | *num=n; | ||
157 | } | 65 | } |
158 | 66 | ||
159 | /* This expects a single block of size nbits for both in and out. Note that | ||
160 | it corrupts any extra bits in the last byte of out */ | ||
161 | void AES_cfbr_encrypt_block(const unsigned char *in,unsigned char *out, | ||
162 | const int nbits,const AES_KEY *key, | ||
163 | unsigned char *ivec,const int enc) | ||
164 | { | ||
165 | int n,rem,num; | ||
166 | unsigned char ovec[AES_BLOCK_SIZE*2]; | ||
167 | |||
168 | if (nbits<=0 || nbits>128) return; | ||
169 | |||
170 | /* fill in the first half of the new IV with the current IV */ | ||
171 | memcpy(ovec,ivec,AES_BLOCK_SIZE); | ||
172 | /* construct the new IV */ | ||
173 | AES_encrypt(ivec,ivec,key); | ||
174 | num = (nbits+7)/8; | ||
175 | if (enc) /* encrypt the input */ | ||
176 | for(n=0 ; n < num ; ++n) | ||
177 | out[n] = (ovec[AES_BLOCK_SIZE+n] = in[n] ^ ivec[n]); | ||
178 | else /* decrypt the input */ | ||
179 | for(n=0 ; n < num ; ++n) | ||
180 | out[n] = (ovec[AES_BLOCK_SIZE+n] = in[n]) ^ ivec[n]; | ||
181 | /* shift ovec left... */ | ||
182 | rem = nbits%8; | ||
183 | num = nbits/8; | ||
184 | if(rem==0) | ||
185 | memcpy(ivec,ovec+num,AES_BLOCK_SIZE); | ||
186 | else | ||
187 | for(n=0 ; n < AES_BLOCK_SIZE ; ++n) | ||
188 | ivec[n] = ovec[n+num]<<rem | ovec[n+num+1]>>(8-rem); | ||
189 | |||
190 | /* it is not necessary to cleanse ovec, since the IV is not secret */ | ||
191 | } | ||
192 | |||
193 | /* N.B. This expects the input to be packed, MS bit first */ | 67 | /* N.B. This expects the input to be packed, MS bit first */ |
194 | void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, | 68 | void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, |
195 | const unsigned long length, const AES_KEY *key, | 69 | size_t length, const AES_KEY *key, |
196 | unsigned char *ivec, int *num, const int enc) | 70 | unsigned char *ivec, int *num, const int enc) |
197 | { | 71 | { |
198 | unsigned int n; | 72 | CRYPTO_cfb128_1_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt); |
199 | unsigned char c[1],d[1]; | ||
200 | |||
201 | assert(in && out && key && ivec && num); | ||
202 | assert(*num == 0); | ||
203 | |||
204 | memset(out,0,(length+7)/8); | ||
205 | for(n=0 ; n < length ; ++n) | ||
206 | { | ||
207 | c[0]=(in[n/8]&(1 << (7-n%8))) ? 0x80 : 0; | ||
208 | AES_cfbr_encrypt_block(c,d,1,key,ivec,enc); | ||
209 | out[n/8]=(out[n/8]&~(1 << (7-n%8)))|((d[0]&0x80) >> (n%8)); | ||
210 | } | ||
211 | } | 73 | } |
212 | 74 | ||
213 | void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, | 75 | void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, |
214 | const unsigned long length, const AES_KEY *key, | 76 | size_t length, const AES_KEY *key, |
215 | unsigned char *ivec, int *num, const int enc) | 77 | unsigned char *ivec, int *num, const int enc) |
216 | { | 78 | { |
217 | unsigned int n; | 79 | CRYPTO_cfb128_8_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt); |
218 | |||
219 | assert(in && out && key && ivec && num); | ||
220 | assert(*num == 0); | ||
221 | |||
222 | for(n=0 ; n < length ; ++n) | ||
223 | AES_cfbr_encrypt_block(&in[n],&out[n],8,key,ivec,enc); | ||
224 | } | 80 | } |
225 | 81 | ||
diff --git a/src/lib/libcrypto/aes/aes_core.c b/src/lib/libcrypto/aes/aes_core.c index cffdd4daec..a7ec54f4da 100644 --- a/src/lib/libcrypto/aes/aes_core.c +++ b/src/lib/libcrypto/aes/aes_core.c | |||
@@ -37,12 +37,9 @@ | |||
37 | 37 | ||
38 | #include <stdlib.h> | 38 | #include <stdlib.h> |
39 | #include <openssl/aes.h> | 39 | #include <openssl/aes.h> |
40 | #ifdef OPENSSL_FIPS | ||
41 | #include <openssl/fips.h> | ||
42 | #endif | ||
43 | |||
44 | #include "aes_locl.h" | 40 | #include "aes_locl.h" |
45 | 41 | ||
42 | #ifndef AES_ASM | ||
46 | /* | 43 | /* |
47 | Te0[x] = S [x].[02, 01, 01, 03]; | 44 | Te0[x] = S [x].[02, 01, 01, 03]; |
48 | Te1[x] = S [x].[03, 02, 01, 01]; | 45 | Te1[x] = S [x].[03, 02, 01, 01]; |
@@ -635,10 +632,6 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | |||
635 | int i = 0; | 632 | int i = 0; |
636 | u32 temp; | 633 | u32 temp; |
637 | 634 | ||
638 | #ifdef OPENSSL_FIPS | ||
639 | FIPS_selftest_check(); | ||
640 | #endif | ||
641 | |||
642 | if (!userKey || !key) | 635 | if (!userKey || !key) |
643 | return -1; | 636 | return -1; |
644 | if (bits != 128 && bits != 192 && bits != 256) | 637 | if (bits != 128 && bits != 192 && bits != 256) |
@@ -781,7 +774,6 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | |||
781 | return 0; | 774 | return 0; |
782 | } | 775 | } |
783 | 776 | ||
784 | #ifndef AES_ASM | ||
785 | /* | 777 | /* |
786 | * Encrypt a single block | 778 | * Encrypt a single block |
787 | * in and out can overlap | 779 | * in and out can overlap |
@@ -1164,4 +1156,203 @@ void AES_decrypt(const unsigned char *in, unsigned char *out, | |||
1164 | PUTU32(out + 12, s3); | 1156 | PUTU32(out + 12, s3); |
1165 | } | 1157 | } |
1166 | 1158 | ||
1159 | #else /* AES_ASM */ | ||
1160 | |||
1161 | static const u8 Te4[256] = { | ||
1162 | 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U, | ||
1163 | 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U, | ||
1164 | 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U, | ||
1165 | 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U, | ||
1166 | 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU, | ||
1167 | 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U, | ||
1168 | 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU, | ||
1169 | 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U, | ||
1170 | 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U, | ||
1171 | 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U, | ||
1172 | 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU, | ||
1173 | 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU, | ||
1174 | 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U, | ||
1175 | 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U, | ||
1176 | 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U, | ||
1177 | 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U, | ||
1178 | 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U, | ||
1179 | 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U, | ||
1180 | 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U, | ||
1181 | 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU, | ||
1182 | 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU, | ||
1183 | 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U, | ||
1184 | 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U, | ||
1185 | 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U, | ||
1186 | 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U, | ||
1187 | 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU, | ||
1188 | 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU, | ||
1189 | 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU, | ||
1190 | 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U, | ||
1191 | 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU, | ||
1192 | 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U, | ||
1193 | 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U | ||
1194 | }; | ||
1195 | static const u32 rcon[] = { | ||
1196 | 0x01000000, 0x02000000, 0x04000000, 0x08000000, | ||
1197 | 0x10000000, 0x20000000, 0x40000000, 0x80000000, | ||
1198 | 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ | ||
1199 | }; | ||
1200 | |||
1201 | /** | ||
1202 | * Expand the cipher key into the encryption key schedule. | ||
1203 | */ | ||
1204 | int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | ||
1205 | AES_KEY *key) { | ||
1206 | u32 *rk; | ||
1207 | int i = 0; | ||
1208 | u32 temp; | ||
1209 | |||
1210 | if (!userKey || !key) | ||
1211 | return -1; | ||
1212 | if (bits != 128 && bits != 192 && bits != 256) | ||
1213 | return -2; | ||
1214 | |||
1215 | rk = key->rd_key; | ||
1216 | |||
1217 | if (bits==128) | ||
1218 | key->rounds = 10; | ||
1219 | else if (bits==192) | ||
1220 | key->rounds = 12; | ||
1221 | else | ||
1222 | key->rounds = 14; | ||
1223 | |||
1224 | rk[0] = GETU32(userKey ); | ||
1225 | rk[1] = GETU32(userKey + 4); | ||
1226 | rk[2] = GETU32(userKey + 8); | ||
1227 | rk[3] = GETU32(userKey + 12); | ||
1228 | if (bits == 128) { | ||
1229 | while (1) { | ||
1230 | temp = rk[3]; | ||
1231 | rk[4] = rk[0] ^ | ||
1232 | (Te4[(temp >> 16) & 0xff] << 24) ^ | ||
1233 | (Te4[(temp >> 8) & 0xff] << 16) ^ | ||
1234 | (Te4[(temp ) & 0xff] << 8) ^ | ||
1235 | (Te4[(temp >> 24) ]) ^ | ||
1236 | rcon[i]; | ||
1237 | rk[5] = rk[1] ^ rk[4]; | ||
1238 | rk[6] = rk[2] ^ rk[5]; | ||
1239 | rk[7] = rk[3] ^ rk[6]; | ||
1240 | if (++i == 10) { | ||
1241 | return 0; | ||
1242 | } | ||
1243 | rk += 4; | ||
1244 | } | ||
1245 | } | ||
1246 | rk[4] = GETU32(userKey + 16); | ||
1247 | rk[5] = GETU32(userKey + 20); | ||
1248 | if (bits == 192) { | ||
1249 | while (1) { | ||
1250 | temp = rk[ 5]; | ||
1251 | rk[ 6] = rk[ 0] ^ | ||
1252 | (Te4[(temp >> 16) & 0xff] << 24) ^ | ||
1253 | (Te4[(temp >> 8) & 0xff] << 16) ^ | ||
1254 | (Te4[(temp ) & 0xff] << 8) ^ | ||
1255 | (Te4[(temp >> 24) ]) ^ | ||
1256 | rcon[i]; | ||
1257 | rk[ 7] = rk[ 1] ^ rk[ 6]; | ||
1258 | rk[ 8] = rk[ 2] ^ rk[ 7]; | ||
1259 | rk[ 9] = rk[ 3] ^ rk[ 8]; | ||
1260 | if (++i == 8) { | ||
1261 | return 0; | ||
1262 | } | ||
1263 | rk[10] = rk[ 4] ^ rk[ 9]; | ||
1264 | rk[11] = rk[ 5] ^ rk[10]; | ||
1265 | rk += 6; | ||
1266 | } | ||
1267 | } | ||
1268 | rk[6] = GETU32(userKey + 24); | ||
1269 | rk[7] = GETU32(userKey + 28); | ||
1270 | if (bits == 256) { | ||
1271 | while (1) { | ||
1272 | temp = rk[ 7]; | ||
1273 | rk[ 8] = rk[ 0] ^ | ||
1274 | (Te4[(temp >> 16) & 0xff] << 24) ^ | ||
1275 | (Te4[(temp >> 8) & 0xff] << 16) ^ | ||
1276 | (Te4[(temp ) & 0xff] << 8) ^ | ||
1277 | (Te4[(temp >> 24) ]) ^ | ||
1278 | rcon[i]; | ||
1279 | rk[ 9] = rk[ 1] ^ rk[ 8]; | ||
1280 | rk[10] = rk[ 2] ^ rk[ 9]; | ||
1281 | rk[11] = rk[ 3] ^ rk[10]; | ||
1282 | if (++i == 7) { | ||
1283 | return 0; | ||
1284 | } | ||
1285 | temp = rk[11]; | ||
1286 | rk[12] = rk[ 4] ^ | ||
1287 | (Te4[(temp >> 24) ] << 24) ^ | ||
1288 | (Te4[(temp >> 16) & 0xff] << 16) ^ | ||
1289 | (Te4[(temp >> 8) & 0xff] << 8) ^ | ||
1290 | (Te4[(temp ) & 0xff]); | ||
1291 | rk[13] = rk[ 5] ^ rk[12]; | ||
1292 | rk[14] = rk[ 6] ^ rk[13]; | ||
1293 | rk[15] = rk[ 7] ^ rk[14]; | ||
1294 | |||
1295 | rk += 8; | ||
1296 | } | ||
1297 | } | ||
1298 | return 0; | ||
1299 | } | ||
1300 | |||
1301 | /** | ||
1302 | * Expand the cipher key into the decryption key schedule. | ||
1303 | */ | ||
1304 | int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | ||
1305 | AES_KEY *key) { | ||
1306 | |||
1307 | u32 *rk; | ||
1308 | int i, j, status; | ||
1309 | u32 temp; | ||
1310 | |||
1311 | /* first, start with an encryption schedule */ | ||
1312 | status = AES_set_encrypt_key(userKey, bits, key); | ||
1313 | if (status < 0) | ||
1314 | return status; | ||
1315 | |||
1316 | rk = key->rd_key; | ||
1317 | |||
1318 | /* invert the order of the round keys: */ | ||
1319 | for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) { | ||
1320 | temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp; | ||
1321 | temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; | ||
1322 | temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; | ||
1323 | temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp; | ||
1324 | } | ||
1325 | /* apply the inverse MixColumn transform to all round keys but the first and the last: */ | ||
1326 | for (i = 1; i < (key->rounds); i++) { | ||
1327 | rk += 4; | ||
1328 | for (j = 0; j < 4; j++) { | ||
1329 | u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m; | ||
1330 | |||
1331 | tp1 = rk[j]; | ||
1332 | m = tp1 & 0x80808080; | ||
1333 | tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^ | ||
1334 | ((m - (m >> 7)) & 0x1b1b1b1b); | ||
1335 | m = tp2 & 0x80808080; | ||
1336 | tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^ | ||
1337 | ((m - (m >> 7)) & 0x1b1b1b1b); | ||
1338 | m = tp4 & 0x80808080; | ||
1339 | tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^ | ||
1340 | ((m - (m >> 7)) & 0x1b1b1b1b); | ||
1341 | tp9 = tp8 ^ tp1; | ||
1342 | tpb = tp9 ^ tp2; | ||
1343 | tpd = tp9 ^ tp4; | ||
1344 | tpe = tp8 ^ tp4 ^ tp2; | ||
1345 | #if defined(ROTATE) | ||
1346 | rk[j] = tpe ^ ROTATE(tpd,16) ^ | ||
1347 | ROTATE(tp9,24) ^ ROTATE(tpb,8); | ||
1348 | #else | ||
1349 | rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ | ||
1350 | (tp9 >> 8) ^ (tp9 << 24) ^ | ||
1351 | (tpb >> 24) ^ (tpb << 8); | ||
1352 | #endif | ||
1353 | } | ||
1354 | } | ||
1355 | return 0; | ||
1356 | } | ||
1357 | |||
1167 | #endif /* AES_ASM */ | 1358 | #endif /* AES_ASM */ |
diff --git a/src/lib/libcrypto/aes/aes_ctr.c b/src/lib/libcrypto/aes/aes_ctr.c index f36982be1e..7c9d165d8a 100644 --- a/src/lib/libcrypto/aes/aes_ctr.c +++ b/src/lib/libcrypto/aes/aes_ctr.c | |||
@@ -49,91 +49,13 @@ | |||
49 | * | 49 | * |
50 | */ | 50 | */ |
51 | 51 | ||
52 | #ifndef AES_DEBUG | ||
53 | # ifndef NDEBUG | ||
54 | # define NDEBUG | ||
55 | # endif | ||
56 | #endif | ||
57 | #include <assert.h> | ||
58 | |||
59 | #include <openssl/aes.h> | 52 | #include <openssl/aes.h> |
60 | #include "aes_locl.h" | 53 | #include <openssl/modes.h> |
61 | |||
62 | /* NOTE: the IV/counter CTR mode is big-endian. The rest of the AES code | ||
63 | * is endian-neutral. */ | ||
64 | |||
65 | /* increment counter (128-bit int) by 1 */ | ||
66 | static void AES_ctr128_inc(unsigned char *counter) { | ||
67 | unsigned long c; | ||
68 | |||
69 | /* Grab bottom dword of counter and increment */ | ||
70 | c = GETU32(counter + 12); | ||
71 | c++; c &= 0xFFFFFFFF; | ||
72 | PUTU32(counter + 12, c); | ||
73 | |||
74 | /* if no overflow, we're done */ | ||
75 | if (c) | ||
76 | return; | ||
77 | |||
78 | /* Grab 1st dword of counter and increment */ | ||
79 | c = GETU32(counter + 8); | ||
80 | c++; c &= 0xFFFFFFFF; | ||
81 | PUTU32(counter + 8, c); | ||
82 | |||
83 | /* if no overflow, we're done */ | ||
84 | if (c) | ||
85 | return; | ||
86 | |||
87 | /* Grab 2nd dword of counter and increment */ | ||
88 | c = GETU32(counter + 4); | ||
89 | c++; c &= 0xFFFFFFFF; | ||
90 | PUTU32(counter + 4, c); | ||
91 | |||
92 | /* if no overflow, we're done */ | ||
93 | if (c) | ||
94 | return; | ||
95 | 54 | ||
96 | /* Grab top dword of counter and increment */ | ||
97 | c = GETU32(counter + 0); | ||
98 | c++; c &= 0xFFFFFFFF; | ||
99 | PUTU32(counter + 0, c); | ||
100 | } | ||
101 | |||
102 | /* The input encrypted as though 128bit counter mode is being | ||
103 | * used. The extra state information to record how much of the | ||
104 | * 128bit block we have used is contained in *num, and the | ||
105 | * encrypted counter is kept in ecount_buf. Both *num and | ||
106 | * ecount_buf must be initialised with zeros before the first | ||
107 | * call to AES_ctr128_encrypt(). | ||
108 | * | ||
109 | * This algorithm assumes that the counter is in the x lower bits | ||
110 | * of the IV (ivec), and that the application has full control over | ||
111 | * overflow and the rest of the IV. This implementation takes NO | ||
112 | * responsability for checking that the counter doesn't overflow | ||
113 | * into the rest of the IV when incremented. | ||
114 | */ | ||
115 | void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, | 55 | void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, |
116 | const unsigned long length, const AES_KEY *key, | 56 | size_t length, const AES_KEY *key, |
117 | unsigned char ivec[AES_BLOCK_SIZE], | 57 | unsigned char ivec[AES_BLOCK_SIZE], |
118 | unsigned char ecount_buf[AES_BLOCK_SIZE], | 58 | unsigned char ecount_buf[AES_BLOCK_SIZE], |
119 | unsigned int *num) { | 59 | unsigned int *num) { |
120 | 60 | CRYPTO_ctr128_encrypt(in,out,length,key,ivec,ecount_buf,num,(block128_f)AES_encrypt); | |
121 | unsigned int n; | ||
122 | unsigned long l=length; | ||
123 | |||
124 | assert(in && out && key && counter && num); | ||
125 | assert(*num < AES_BLOCK_SIZE); | ||
126 | |||
127 | n = *num; | ||
128 | |||
129 | while (l--) { | ||
130 | if (n == 0) { | ||
131 | AES_encrypt(ivec, ecount_buf, key); | ||
132 | AES_ctr128_inc(ivec); | ||
133 | } | ||
134 | *(out++) = *(in++) ^ ecount_buf[n]; | ||
135 | n = (n+1) % AES_BLOCK_SIZE; | ||
136 | } | ||
137 | |||
138 | *num=n; | ||
139 | } | 61 | } |
diff --git a/src/lib/libcrypto/aes/aes_ofb.c b/src/lib/libcrypto/aes/aes_ofb.c index f358bb39e2..50bf0b8325 100644 --- a/src/lib/libcrypto/aes/aes_ofb.c +++ b/src/lib/libcrypto/aes/aes_ofb.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* crypto/aes/aes_ofb.c -*- mode:C; c-file-style: "eay" -*- */ | 1 | /* crypto/aes/aes_ofb.c -*- mode:C; c-file-style: "eay" -*- */ |
2 | /* ==================================================================== | 2 | /* ==================================================================== |
3 | * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. |
4 | * | 4 | * |
5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
@@ -48,95 +48,13 @@ | |||
48 | * ==================================================================== | 48 | * ==================================================================== |
49 | * | 49 | * |
50 | */ | 50 | */ |
51 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
52 | * All rights reserved. | ||
53 | * | ||
54 | * This package is an SSL implementation written | ||
55 | * by Eric Young (eay@cryptsoft.com). | ||
56 | * The implementation was written so as to conform with Netscapes SSL. | ||
57 | * | ||
58 | * This library is free for commercial and non-commercial use as long as | ||
59 | * the following conditions are aheared to. The following conditions | ||
60 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
61 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
62 | * included with this distribution is covered by the same copyright terms | ||
63 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
64 | * | ||
65 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
66 | * the code are not to be removed. | ||
67 | * If this package is used in a product, Eric Young should be given attribution | ||
68 | * as the author of the parts of the library used. | ||
69 | * This can be in the form of a textual message at program startup or | ||
70 | * in documentation (online or textual) provided with the package. | ||
71 | * | ||
72 | * Redistribution and use in source and binary forms, with or without | ||
73 | * modification, are permitted provided that the following conditions | ||
74 | * are met: | ||
75 | * 1. Redistributions of source code must retain the copyright | ||
76 | * notice, this list of conditions and the following disclaimer. | ||
77 | * 2. Redistributions in binary form must reproduce the above copyright | ||
78 | * notice, this list of conditions and the following disclaimer in the | ||
79 | * documentation and/or other materials provided with the distribution. | ||
80 | * 3. All advertising materials mentioning features or use of this software | ||
81 | * must display the following acknowledgement: | ||
82 | * "This product includes cryptographic software written by | ||
83 | * Eric Young (eay@cryptsoft.com)" | ||
84 | * The word 'cryptographic' can be left out if the rouines from the library | ||
85 | * being used are not cryptographic related :-). | ||
86 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
87 | * the apps directory (application code) you must include an acknowledgement: | ||
88 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
89 | * | ||
90 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
91 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
92 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
93 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
94 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
95 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
96 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
97 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
98 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
99 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
100 | * SUCH DAMAGE. | ||
101 | * | ||
102 | * The licence and distribution terms for any publically available version or | ||
103 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
104 | * copied and put under another distribution licence | ||
105 | * [including the GNU Public Licence.] | ||
106 | */ | ||
107 | |||
108 | #ifndef AES_DEBUG | ||
109 | # ifndef NDEBUG | ||
110 | # define NDEBUG | ||
111 | # endif | ||
112 | #endif | ||
113 | #include <assert.h> | ||
114 | 51 | ||
115 | #include <openssl/aes.h> | 52 | #include <openssl/aes.h> |
116 | #include "aes_locl.h" | 53 | #include <openssl/modes.h> |
117 | 54 | ||
118 | /* The input and output encrypted as though 128bit ofb mode is being | ||
119 | * used. The extra state information to record how much of the | ||
120 | * 128bit block we have used is contained in *num; | ||
121 | */ | ||
122 | void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, | 55 | void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, |
123 | const unsigned long length, const AES_KEY *key, | 56 | size_t length, const AES_KEY *key, |
124 | unsigned char *ivec, int *num) { | 57 | unsigned char *ivec, int *num) |
125 | 58 | { | |
126 | unsigned int n; | 59 | CRYPTO_ofb128_encrypt(in,out,length,key,ivec,num,(block128_f)AES_encrypt); |
127 | unsigned long l=length; | ||
128 | |||
129 | assert(in && out && key && ivec && num); | ||
130 | |||
131 | n = *num; | ||
132 | |||
133 | while (l--) { | ||
134 | if (n == 0) { | ||
135 | AES_encrypt(ivec, ivec, key); | ||
136 | } | ||
137 | *(out++) = *(in++) ^ ivec[n]; | ||
138 | n = (n+1) % AES_BLOCK_SIZE; | ||
139 | } | ||
140 | |||
141 | *num=n; | ||
142 | } | 60 | } |
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl index e771e83953..aab40e6f1c 100644 --- a/src/lib/libcrypto/aes/asm/aes-586.pl +++ b/src/lib/libcrypto/aes/asm/aes-586.pl | |||
@@ -2,11 +2,12 @@ | |||
2 | # | 2 | # |
3 | # ==================================================================== | 3 | # ==================================================================== |
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
5 | # project. Rights for redistribution and usage in source and binary | 5 | # project. The module is, however, dual licensed under OpenSSL and |
6 | # forms are granted according to the OpenSSL license. | 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
7 | # ==================================================================== | 8 | # ==================================================================== |
8 | # | 9 | # |
9 | # Version 3.6. | 10 | # Version 4.3. |
10 | # | 11 | # |
11 | # You might fail to appreciate this module performance from the first | 12 | # You might fail to appreciate this module performance from the first |
12 | # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered | 13 | # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered |
@@ -81,11 +82,117 @@ | |||
81 | # AMD K8 20 19 | 82 | # AMD K8 20 19 |
82 | # PIII 25 23 | 83 | # PIII 25 23 |
83 | # Pentium 81 78 | 84 | # Pentium 81 78 |
84 | 85 | # | |
85 | push(@INC,"perlasm","../../perlasm"); | 86 | # Version 3.7 reimplements outer rounds as "compact." Meaning that |
87 | # first and last rounds reference compact 256 bytes S-box. This means | ||
88 | # that first round consumes a lot more CPU cycles and that encrypt | ||
89 | # and decrypt performance becomes asymmetric. Encrypt performance | ||
90 | # drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is | ||
91 | # aggressively pre-fetched. | ||
92 | # | ||
93 | # Version 4.0 effectively rolls back to 3.6 and instead implements | ||
94 | # additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact, | ||
95 | # which use exclusively 256 byte S-box. These functions are to be | ||
96 | # called in modes not concealing plain text, such as ECB, or when | ||
97 | # we're asked to process smaller amount of data [or unconditionally | ||
98 | # on hyper-threading CPU]. Currently it's called unconditionally from | ||
99 | # AES_[en|de]crypt, which affects all modes, but CBC. CBC routine | ||
100 | # still needs to be modified to switch between slower and faster | ||
101 | # mode when appropriate... But in either case benchmark landscape | ||
102 | # changes dramatically and below numbers are CPU cycles per processed | ||
103 | # byte for 128-bit key. | ||
104 | # | ||
105 | # ECB encrypt ECB decrypt CBC large chunk | ||
106 | # P4 56[60] 84[100] 23 | ||
107 | # AMD K8 48[44] 70[79] 18 | ||
108 | # PIII 41[50] 61[91] 24 | ||
109 | # Core 2 32[38] 45[70] 18.5 | ||
110 | # Pentium 120 160 77 | ||
111 | # | ||
112 | # Version 4.1 switches to compact S-box even in key schedule setup. | ||
113 | # | ||
114 | # Version 4.2 prefetches compact S-box in every SSE round or in other | ||
115 | # words every cache-line is *guaranteed* to be accessed within ~50 | ||
116 | # cycles window. Why just SSE? Because it's needed on hyper-threading | ||
117 | # CPU! Which is also why it's prefetched with 64 byte stride. Best | ||
118 | # part is that it has no negative effect on performance:-) | ||
119 | # | ||
120 | # Version 4.3 implements switch between compact and non-compact block | ||
121 | # functions in AES_cbc_encrypt depending on how much data was asked | ||
122 | # to be processed in one stroke. | ||
123 | # | ||
124 | ###################################################################### | ||
125 | # Timing attacks are classified in two classes: synchronous when | ||
126 | # attacker consciously initiates cryptographic operation and collects | ||
127 | # timing data of various character afterwards, and asynchronous when | ||
128 | # malicious code is executed on same CPU simultaneously with AES, | ||
129 | # instruments itself and performs statistical analysis of this data. | ||
130 | # | ||
131 | # As far as synchronous attacks go the root to the AES timing | ||
132 | # vulnerability is twofold. Firstly, of 256 S-box elements at most 160 | ||
133 | # are referred to in single 128-bit block operation. Well, in C | ||
134 | # implementation with 4 distinct tables it's actually as little as 40 | ||
135 | # references per 256 elements table, but anyway... Secondly, even | ||
136 | # though S-box elements are clustered into smaller amount of cache- | ||
137 | # lines, smaller than 160 and even 40, it turned out that for certain | ||
138 | # plain-text pattern[s] or simply put chosen plain-text and given key | ||
139 | # few cache-lines remain unaccessed during block operation. Now, if | ||
140 | # attacker can figure out this access pattern, he can deduct the key | ||
141 | # [or at least part of it]. The natural way to mitigate this kind of | ||
142 | # attacks is to minimize the amount of cache-lines in S-box and/or | ||
143 | # prefetch them to ensure that every one is accessed for more uniform | ||
144 | # timing. But note that *if* plain-text was concealed in such way that | ||
145 | # input to block function is distributed *uniformly*, then attack | ||
146 | # wouldn't apply. Now note that some encryption modes, most notably | ||
147 | # CBC, do mask the plain-text in this exact way [secure cipher output | ||
148 | # is distributed uniformly]. Yes, one still might find input that | ||
149 | # would reveal the information about given key, but if amount of | ||
150 | # candidate inputs to be tried is larger than amount of possible key | ||
151 | # combinations then attack becomes infeasible. This is why revised | ||
152 | # AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk | ||
153 | # of data is to be processed in one stroke. The current size limit of | ||
154 | # 512 bytes is chosen to provide same [diminishingly low] probability | ||
155 | # for cache-line to remain untouched in large chunk operation with | ||
156 | # large S-box as for single block operation with compact S-box and | ||
157 | # surely needs more careful consideration... | ||
158 | # | ||
159 | # As for asynchronous attacks. There are two flavours: attacker code | ||
160 | # being interleaved with AES on hyper-threading CPU at *instruction* | ||
161 | # level, and two processes time sharing single core. As for latter. | ||
162 | # Two vectors. 1. Given that attacker process has higher priority, | ||
163 | # yield execution to process performing AES just before timer fires | ||
164 | # off the scheduler, immediately regain control of CPU and analyze the | ||
165 | # cache state. For this attack to be efficient attacker would have to | ||
166 | # effectively slow down the operation by several *orders* of magnitude, | ||
167 | # by ratio of time slice to duration of handful of AES rounds, which is | ||
168 | # unlikely to remain unnoticed. Not to mention that this also means | ||
169 | # that he would spend correspondingly more time to collect enough | ||
170 | # statistical data to mount the attack. It's probably appropriate to | ||
171 | # say that if adversary reckons that this attack is beneficial and | ||
172 | # risks to be noticed, you probably have larger problems having him | ||
173 | # mere opportunity. In other words suggested code design expects you | ||
174 | # to preclude/mitigate this attack by overall system security design. | ||
175 | # 2. Attacker manages to make his code interrupt driven. In order for | ||
176 | # this kind of attack to be feasible, interrupt rate has to be high | ||
177 | # enough, again comparable to duration of handful of AES rounds. But | ||
178 | # is there interrupt source of such rate? Hardly, not even 1Gbps NIC | ||
179 | # generates interrupts at such raging rate... | ||
180 | # | ||
181 | # And now back to the former, hyper-threading CPU or more specifically | ||
182 | # Intel P4. Recall that asynchronous attack implies that malicious | ||
183 | # code instruments itself. And naturally instrumentation granularity | ||
184 | # has to be noticeably lower than duration of codepath accessing S-box. | ||
185 | # Given that all cache-lines are accessed during that time that is. | ||
186 | # Current implementation accesses *all* cache-lines within ~50 cycles | ||
187 | # window, which is actually *less* than RDTSC latency on Intel P4! | ||
188 | |||
189 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
190 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
86 | require "x86asm.pl"; | 191 | require "x86asm.pl"; |
87 | 192 | ||
88 | &asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386"); | 193 | &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386"); |
194 | &static_label("AES_Te"); | ||
195 | &static_label("AES_Td"); | ||
89 | 196 | ||
90 | $s0="eax"; | 197 | $s0="eax"; |
91 | $s1="ebx"; | 198 | $s1="ebx"; |
@@ -93,21 +200,36 @@ $s2="ecx"; | |||
93 | $s3="edx"; | 200 | $s3="edx"; |
94 | $key="edi"; | 201 | $key="edi"; |
95 | $acc="esi"; | 202 | $acc="esi"; |
203 | $tbl="ebp"; | ||
204 | |||
205 | # stack frame layout in _[x86|sse]_AES_* routines, frame is allocated | ||
206 | # by caller | ||
207 | $__ra=&DWP(0,"esp"); # return address | ||
208 | $__s0=&DWP(4,"esp"); # s0 backing store | ||
209 | $__s1=&DWP(8,"esp"); # s1 backing store | ||
210 | $__s2=&DWP(12,"esp"); # s2 backing store | ||
211 | $__s3=&DWP(16,"esp"); # s3 backing store | ||
212 | $__key=&DWP(20,"esp"); # pointer to key schedule | ||
213 | $__end=&DWP(24,"esp"); # pointer to end of key schedule | ||
214 | $__tbl=&DWP(28,"esp"); # %ebp backing store | ||
215 | |||
216 | # stack frame layout in AES_[en|de]crypt routines, which differs from | ||
217 | # above by 4 and overlaps by %ebp backing store | ||
218 | $_tbl=&DWP(24,"esp"); | ||
219 | $_esp=&DWP(28,"esp"); | ||
96 | 220 | ||
97 | $compromise=0; # $compromise=128 abstains from copying key | 221 | sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } |
98 | # schedule to stack when encrypting inputs | 222 | |
99 | # shorter than 128 bytes at the cost of | 223 | $speed_limit=512; # chunks smaller than $speed_limit are |
100 | # risksing aliasing with S-boxes. In return | 224 | # processed with compact routine in CBC mode |
101 | # you get way better, up to +70%, small block | ||
102 | # performance. | ||
103 | $small_footprint=1; # $small_footprint=1 code is ~5% slower [on | 225 | $small_footprint=1; # $small_footprint=1 code is ~5% slower [on |
104 | # recent µ-archs], but ~5 times smaller! | 226 | # recent µ-archs], but ~5 times smaller! |
105 | # I favor compact code to minimize cache | 227 | # I favor compact code to minimize cache |
106 | # contention and in hope to "collect" 5% back | 228 | # contention and in hope to "collect" 5% back |
107 | # in real-life applications... | 229 | # in real-life applications... |
230 | |||
108 | $vertical_spin=0; # shift "verticaly" defaults to 0, because of | 231 | $vertical_spin=0; # shift "verticaly" defaults to 0, because of |
109 | # its proof-of-concept status... | 232 | # its proof-of-concept status... |
110 | |||
111 | # Note that there is no decvert(), as well as last encryption round is | 233 | # Note that there is no decvert(), as well as last encryption round is |
112 | # performed with "horizontal" shifts. This is because this "vertical" | 234 | # performed with "horizontal" shifts. This is because this "vertical" |
113 | # implementation [one which groups shifts on a given $s[i] to form a | 235 | # implementation [one which groups shifts on a given $s[i] to form a |
@@ -170,17 +292,484 @@ sub encvert() | |||
170 | &movz ($v0,&HB($v1)); | 292 | &movz ($v0,&HB($v1)); |
171 | &and ($v1,0xFF); | 293 | &and ($v1,0xFF); |
172 | &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16 | 294 | &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16 |
173 | &mov ($key,&DWP(12,"esp")); # reincarnate v1 as key | 295 | &mov ($key,$__key); # reincarnate v1 as key |
174 | &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24 | 296 | &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24 |
175 | } | 297 | } |
176 | 298 | ||
299 | # Another experimental routine, which features "horizontal spin," but | ||
300 | # eliminates one reference to stack. Strangely enough runs slower... | ||
301 | sub enchoriz() | ||
302 | { my $v0 = $key, $v1 = $acc; | ||
303 | |||
304 | &movz ($v0,&LB($s0)); # 3, 2, 1, 0* | ||
305 | &rotr ($s2,8); # 8,11,10, 9 | ||
306 | &mov ($v1,&DWP(0,$te,$v0,8)); # 0 | ||
307 | &movz ($v0,&HB($s1)); # 7, 6, 5*, 4 | ||
308 | &rotr ($s3,16); # 13,12,15,14 | ||
309 | &xor ($v1,&DWP(3,$te,$v0,8)); # 5 | ||
310 | &movz ($v0,&HB($s2)); # 8,11,10*, 9 | ||
311 | &rotr ($s0,16); # 1, 0, 3, 2 | ||
312 | &xor ($v1,&DWP(2,$te,$v0,8)); # 10 | ||
313 | &movz ($v0,&HB($s3)); # 13,12,15*,14 | ||
314 | &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected | ||
315 | &mov ($__s0,$v1); # t[0] saved | ||
316 | |||
317 | &movz ($v0,&LB($s1)); # 7, 6, 5, 4* | ||
318 | &shr ($s1,16); # -, -, 7, 6 | ||
319 | &mov ($v1,&DWP(0,$te,$v0,8)); # 4 | ||
320 | &movz ($v0,&LB($s3)); # 13,12,15,14* | ||
321 | &xor ($v1,&DWP(2,$te,$v0,8)); # 14 | ||
322 | &movz ($v0,&HB($s0)); # 1, 0, 3*, 2 | ||
323 | &and ($s3,0xffff0000); # 13,12, -, - | ||
324 | &xor ($v1,&DWP(1,$te,$v0,8)); # 3 | ||
325 | &movz ($v0,&LB($s2)); # 8,11,10, 9* | ||
326 | &or ($s3,$s1); # 13,12, 7, 6 | ||
327 | &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected | ||
328 | &mov ($s1,$v1); # s[1]=t[1] | ||
329 | |||
330 | &movz ($v0,&LB($s0)); # 1, 0, 3, 2* | ||
331 | &shr ($s2,16); # -, -, 8,11 | ||
332 | &mov ($v1,&DWP(2,$te,$v0,8)); # 2 | ||
333 | &movz ($v0,&HB($s3)); # 13,12, 7*, 6 | ||
334 | &xor ($v1,&DWP(1,$te,$v0,8)); # 7 | ||
335 | &movz ($v0,&HB($s2)); # -, -, 8*,11 | ||
336 | &xor ($v1,&DWP(0,$te,$v0,8)); # 8 | ||
337 | &mov ($v0,$s3); | ||
338 | &shr ($v0,24); # 13 | ||
339 | &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected | ||
340 | |||
341 | &movz ($v0,&LB($s2)); # -, -, 8,11* | ||
342 | &shr ($s0,24); # 1* | ||
343 | &mov ($s2,&DWP(1,$te,$v0,8)); # 11 | ||
344 | &xor ($s2,&DWP(3,$te,$s0,8)); # 1 | ||
345 | &mov ($s0,$__s0); # s[0]=t[0] | ||
346 | &movz ($v0,&LB($s3)); # 13,12, 7, 6* | ||
347 | &shr ($s3,16); # , ,13,12 | ||
348 | &xor ($s2,&DWP(2,$te,$v0,8)); # 6 | ||
349 | &mov ($key,$__key); # reincarnate v0 as key | ||
350 | &and ($s3,0xff); # , ,13,12* | ||
351 | &mov ($s3,&DWP(0,$te,$s3,8)); # 12 | ||
352 | &xor ($s3,$s2); # s[3]=t[3] collected | ||
353 | &mov ($s2,$v1); # s[2]=t[2] | ||
354 | } | ||
355 | |||
356 | # More experimental code... SSE one... Even though this one eliminates | ||
357 | # *all* references to stack, it's not faster... | ||
358 | sub sse_encbody() | ||
359 | { | ||
360 | &movz ($acc,&LB("eax")); # 0 | ||
361 | &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0 | ||
362 | &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 | ||
363 | &movz ("edx",&HB("eax")); # 1 | ||
364 | &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1 | ||
365 | &shr ("eax",16); # 5, 4 | ||
366 | |||
367 | &movz ($acc,&LB("ebx")); # 10 | ||
368 | &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10 | ||
369 | &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8 | ||
370 | &movz ($acc,&HB("ebx")); # 11 | ||
371 | &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11 | ||
372 | &shr ("ebx",16); # 15,14 | ||
373 | |||
374 | &movz ($acc,&HB("eax")); # 5 | ||
375 | &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5 | ||
376 | &movq ("mm3",QWP(16,$key)); | ||
377 | &movz ($acc,&HB("ebx")); # 15 | ||
378 | &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15 | ||
379 | &movd ("mm0","ecx"); # t[0] collected | ||
380 | |||
381 | &movz ($acc,&LB("eax")); # 4 | ||
382 | &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4 | ||
383 | &movd ("eax","mm2"); # 7, 6, 3, 2 | ||
384 | &movz ($acc,&LB("ebx")); # 14 | ||
385 | &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14 | ||
386 | &movd ("ebx","mm6"); # 13,12, 9, 8 | ||
387 | |||
388 | &movz ($acc,&HB("eax")); # 3 | ||
389 | &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3 | ||
390 | &movz ($acc,&HB("ebx")); # 9 | ||
391 | &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9 | ||
392 | &movd ("mm1","ecx"); # t[1] collected | ||
393 | |||
394 | &movz ($acc,&LB("eax")); # 2 | ||
395 | &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2 | ||
396 | &shr ("eax",16); # 7, 6 | ||
397 | &punpckldq ("mm0","mm1"); # t[0,1] collected | ||
398 | &movz ($acc,&LB("ebx")); # 8 | ||
399 | &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8 | ||
400 | &shr ("ebx",16); # 13,12 | ||
401 | |||
402 | &movz ($acc,&HB("eax")); # 7 | ||
403 | &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7 | ||
404 | &pxor ("mm0","mm3"); | ||
405 | &movz ("eax",&LB("eax")); # 6 | ||
406 | &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6 | ||
407 | &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0 | ||
408 | &movz ($acc,&HB("ebx")); # 13 | ||
409 | &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13 | ||
410 | &xor ("ecx",&DWP(24,$key)); # t[2] | ||
411 | &movd ("mm4","ecx"); # t[2] collected | ||
412 | &movz ("ebx",&LB("ebx")); # 12 | ||
413 | &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12 | ||
414 | &shr ("ecx",16); | ||
415 | &movd ("eax","mm1"); # 5, 4, 1, 0 | ||
416 | &mov ("ebx",&DWP(28,$key)); # t[3] | ||
417 | &xor ("ebx","edx"); | ||
418 | &movd ("mm5","ebx"); # t[3] collected | ||
419 | &and ("ebx",0xffff0000); | ||
420 | &or ("ebx","ecx"); | ||
421 | |||
422 | &punpckldq ("mm4","mm5"); # t[2,3] collected | ||
423 | } | ||
424 | |||
425 | ###################################################################### | ||
426 | # "Compact" block function | ||
427 | ###################################################################### | ||
428 | |||
429 | sub enccompact() | ||
430 | { my $Fn = mov; | ||
431 | while ($#_>5) { pop(@_); $Fn=sub{}; } | ||
432 | my ($i,$te,@s)=@_; | ||
433 | my $tmp = $key; | ||
434 | my $out = $i==3?$s[0]:$acc; | ||
435 | |||
436 | # $Fn is used in first compact round and its purpose is to | ||
437 | # void restoration of some values from stack, so that after | ||
438 | # 4xenccompact with extra argument $key value is left there... | ||
439 | if ($i==3) { &$Fn ($key,$__key); }##%edx | ||
440 | else { &mov ($out,$s[0]); } | ||
441 | &and ($out,0xFF); | ||
442 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] | ||
443 | if ($i==2) { &shr ($s[0],24); }#%ecx[2] | ||
444 | &movz ($out,&BP(-128,$te,$out,1)); | ||
445 | |||
446 | if ($i==3) { $tmp=$s[1]; }##%eax | ||
447 | &movz ($tmp,&HB($s[1])); | ||
448 | &movz ($tmp,&BP(-128,$te,$tmp,1)); | ||
449 | &shl ($tmp,8); | ||
450 | &xor ($out,$tmp); | ||
451 | |||
452 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx | ||
453 | else { &mov ($tmp,$s[2]); | ||
454 | &shr ($tmp,16); } | ||
455 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] | ||
456 | &and ($tmp,0xFF); | ||
457 | &movz ($tmp,&BP(-128,$te,$tmp,1)); | ||
458 | &shl ($tmp,16); | ||
459 | &xor ($out,$tmp); | ||
460 | |||
461 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx | ||
462 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] | ||
463 | else { &mov ($tmp,$s[3]); | ||
464 | &shr ($tmp,24); } | ||
465 | &movz ($tmp,&BP(-128,$te,$tmp,1)); | ||
466 | &shl ($tmp,24); | ||
467 | &xor ($out,$tmp); | ||
468 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | ||
469 | if ($i==3) { &mov ($s[3],$acc); } | ||
470 | &comment(); | ||
471 | } | ||
472 | |||
473 | sub enctransform() | ||
474 | { my @s = ($s0,$s1,$s2,$s3); | ||
475 | my $i = shift; | ||
476 | my $tmp = $tbl; | ||
477 | my $r2 = $key ; | ||
478 | |||
479 | &mov ($acc,$s[$i]); | ||
480 | &and ($acc,0x80808080); | ||
481 | &mov ($tmp,$acc); | ||
482 | &shr ($tmp,7); | ||
483 | &lea ($r2,&DWP(0,$s[$i],$s[$i])); | ||
484 | &sub ($acc,$tmp); | ||
485 | &and ($r2,0xfefefefe); | ||
486 | &and ($acc,0x1b1b1b1b); | ||
487 | &mov ($tmp,$s[$i]); | ||
488 | &xor ($acc,$r2); # r2 | ||
489 | |||
490 | &xor ($s[$i],$acc); # r0 ^ r2 | ||
491 | &rotl ($s[$i],24); | ||
492 | &xor ($s[$i],$acc) # ROTATE(r2^r0,24) ^ r2 | ||
493 | &rotr ($tmp,16); | ||
494 | &xor ($s[$i],$tmp); | ||
495 | &rotr ($tmp,8); | ||
496 | &xor ($s[$i],$tmp); | ||
497 | } | ||
498 | |||
499 | &function_begin_B("_x86_AES_encrypt_compact"); | ||
500 | # note that caller is expected to allocate stack frame for me! | ||
501 | &mov ($__key,$key); # save key | ||
502 | |||
503 | &xor ($s0,&DWP(0,$key)); # xor with key | ||
504 | &xor ($s1,&DWP(4,$key)); | ||
505 | &xor ($s2,&DWP(8,$key)); | ||
506 | &xor ($s3,&DWP(12,$key)); | ||
507 | |||
508 | &mov ($acc,&DWP(240,$key)); # load key->rounds | ||
509 | &lea ($acc,&DWP(-2,$acc,$acc)); | ||
510 | &lea ($acc,&DWP(0,$key,$acc,8)); | ||
511 | &mov ($__end,$acc); # end of key schedule | ||
512 | |||
513 | # prefetch Te4 | ||
514 | &mov ($key,&DWP(0-128,$tbl)); | ||
515 | &mov ($acc,&DWP(32-128,$tbl)); | ||
516 | &mov ($key,&DWP(64-128,$tbl)); | ||
517 | &mov ($acc,&DWP(96-128,$tbl)); | ||
518 | &mov ($key,&DWP(128-128,$tbl)); | ||
519 | &mov ($acc,&DWP(160-128,$tbl)); | ||
520 | &mov ($key,&DWP(192-128,$tbl)); | ||
521 | &mov ($acc,&DWP(224-128,$tbl)); | ||
522 | |||
523 | &set_label("loop",16); | ||
524 | |||
525 | &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1); | ||
526 | &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1); | ||
527 | &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1); | ||
528 | &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1); | ||
529 | &enctransform(2); | ||
530 | &enctransform(3); | ||
531 | &enctransform(0); | ||
532 | &enctransform(1); | ||
533 | &mov ($key,$__key); | ||
534 | &mov ($tbl,$__tbl); | ||
535 | &add ($key,16); # advance rd_key | ||
536 | &xor ($s0,&DWP(0,$key)); | ||
537 | &xor ($s1,&DWP(4,$key)); | ||
538 | &xor ($s2,&DWP(8,$key)); | ||
539 | &xor ($s3,&DWP(12,$key)); | ||
540 | |||
541 | &cmp ($key,$__end); | ||
542 | &mov ($__key,$key); | ||
543 | &jb (&label("loop")); | ||
544 | |||
545 | &enccompact(0,$tbl,$s0,$s1,$s2,$s3); | ||
546 | &enccompact(1,$tbl,$s1,$s2,$s3,$s0); | ||
547 | &enccompact(2,$tbl,$s2,$s3,$s0,$s1); | ||
548 | &enccompact(3,$tbl,$s3,$s0,$s1,$s2); | ||
549 | |||
550 | &xor ($s0,&DWP(16,$key)); | ||
551 | &xor ($s1,&DWP(20,$key)); | ||
552 | &xor ($s2,&DWP(24,$key)); | ||
553 | &xor ($s3,&DWP(28,$key)); | ||
554 | |||
555 | &ret (); | ||
556 | &function_end_B("_x86_AES_encrypt_compact"); | ||
557 | |||
558 | ###################################################################### | ||
559 | # "Compact" SSE block function. | ||
560 | ###################################################################### | ||
561 | # | ||
562 | # Performance is not actually extraordinary in comparison to pure | ||
563 | # x86 code. In particular encrypt performance is virtually the same. | ||
564 | # Decrypt performance on the other hand is 15-20% better on newer | ||
565 | # µ-archs [but we're thankful for *any* improvement here], and ~50% | ||
566 | # better on PIII:-) And additionally on the pros side this code | ||
567 | # eliminates redundant references to stack and thus relieves/ | ||
568 | # minimizes the pressure on the memory bus. | ||
569 | # | ||
570 | # MMX register layout lsb | ||
571 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | ||
572 | # | mm4 | mm0 | | ||
573 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | ||
574 | # | s3 | s2 | s1 | s0 | | ||
575 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | ||
576 | # |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0| | ||
577 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | ||
578 | # | ||
579 | # Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8. | ||
580 | # In these terms the encryption and decryption "compact" permutation | ||
581 | # matrices can be depicted as follows: | ||
582 | # | ||
583 | # encryption lsb # decryption lsb | ||
584 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
585 | # | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 | | ||
586 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
587 | # | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 | | ||
588 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
589 | # | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 | | ||
590 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
591 | # | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 | | ||
592 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
593 | # | ||
594 | ###################################################################### | ||
595 | # Why not xmm registers? Short answer. It was actually tested and | ||
596 | # was not any faster but, on the *contrary*, slower, most notably on Intel CPUs. | ||
597 | # Longer answer. Main advantage of using mm registers is that movd | ||
598 | # latency is lower, especially on Intel P4. While arithmetic | ||
599 | # instructions are twice as many, they can be scheduled every cycle | ||
600 | # and not every second one when they are operating on xmm register, | ||
601 | # so that "arithmetic throughput" remains virtually the same. And | ||
602 | # finally the code can be executed even on older SSE-only CPUs:-) | ||
603 | |||
604 | sub sse_enccompact() | ||
605 | { | ||
606 | &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0 | ||
607 | &pshufw ("mm5","mm4",0x0d); # 15,14,11,10 | ||
608 | &movd ("eax","mm1"); # 5, 4, 1, 0 | ||
609 | &movd ("ebx","mm5"); # 15,14,11,10 | ||
610 | |||
611 | &movz ($acc,&LB("eax")); # 0 | ||
612 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 | ||
613 | &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 | ||
614 | &movz ("edx",&HB("eax")); # 1 | ||
615 | &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 | ||
616 | &shl ("edx",8); # 1 | ||
617 | &shr ("eax",16); # 5, 4 | ||
618 | |||
619 | &movz ($acc,&LB("ebx")); # 10 | ||
620 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 | ||
621 | &shl ($acc,16); # 10 | ||
622 | &or ("ecx",$acc); # 10 | ||
623 | &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8 | ||
624 | &movz ($acc,&HB("ebx")); # 11 | ||
625 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 | ||
626 | &shl ($acc,24); # 11 | ||
627 | &or ("edx",$acc); # 11 | ||
628 | &shr ("ebx",16); # 15,14 | ||
629 | |||
630 | &movz ($acc,&HB("eax")); # 5 | ||
631 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5 | ||
632 | &shl ($acc,8); # 5 | ||
633 | &or ("ecx",$acc); # 5 | ||
634 | &movz ($acc,&HB("ebx")); # 15 | ||
635 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 | ||
636 | &shl ($acc,24); # 15 | ||
637 | &or ("ecx",$acc); # 15 | ||
638 | &movd ("mm0","ecx"); # t[0] collected | ||
639 | |||
640 | &movz ($acc,&LB("eax")); # 4 | ||
641 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4 | ||
642 | &movd ("eax","mm2"); # 7, 6, 3, 2 | ||
643 | &movz ($acc,&LB("ebx")); # 14 | ||
644 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 | ||
645 | &shl ($acc,16); # 14 | ||
646 | &or ("ecx",$acc); # 14 | ||
647 | |||
648 | &movd ("ebx","mm6"); # 13,12, 9, 8 | ||
649 | &movz ($acc,&HB("eax")); # 3 | ||
650 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3 | ||
651 | &shl ($acc,24); # 3 | ||
652 | &or ("ecx",$acc); # 3 | ||
653 | &movz ($acc,&HB("ebx")); # 9 | ||
654 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 | ||
655 | &shl ($acc,8); # 9 | ||
656 | &or ("ecx",$acc); # 9 | ||
657 | &movd ("mm1","ecx"); # t[1] collected | ||
658 | |||
659 | &movz ($acc,&LB("ebx")); # 8 | ||
660 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8 | ||
661 | &shr ("ebx",16); # 13,12 | ||
662 | &movz ($acc,&LB("eax")); # 2 | ||
663 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 | ||
664 | &shl ($acc,16); # 2 | ||
665 | &or ("ecx",$acc); # 2 | ||
666 | &shr ("eax",16); # 7, 6 | ||
667 | |||
668 | &punpckldq ("mm0","mm1"); # t[0,1] collected | ||
669 | |||
670 | &movz ($acc,&HB("eax")); # 7 | ||
671 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 | ||
672 | &shl ($acc,24); # 7 | ||
673 | &or ("ecx",$acc); # 7 | ||
674 | &and ("eax",0xff); # 6 | ||
675 | &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6 | ||
676 | &shl ("eax",16); # 6 | ||
677 | &or ("edx","eax"); # 6 | ||
678 | &movz ($acc,&HB("ebx")); # 13 | ||
679 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 | ||
680 | &shl ($acc,8); # 13 | ||
681 | &or ("ecx",$acc); # 13 | ||
682 | &movd ("mm4","ecx"); # t[2] collected | ||
683 | &and ("ebx",0xff); # 12 | ||
684 | &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12 | ||
685 | &or ("edx","ebx"); # 12 | ||
686 | &movd ("mm5","edx"); # t[3] collected | ||
687 | |||
688 | &punpckldq ("mm4","mm5"); # t[2,3] collected | ||
689 | } | ||
690 | |||
691 | if (!$x86only) { | ||
692 | &function_begin_B("_sse_AES_encrypt_compact"); | ||
693 | &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0 | ||
694 | &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8 | ||
695 | |||
696 | # note that caller is expected to allocate stack frame for me! | ||
697 | &mov ($acc,&DWP(240,$key)); # load key->rounds | ||
698 | &lea ($acc,&DWP(-2,$acc,$acc)); | ||
699 | &lea ($acc,&DWP(0,$key,$acc,8)); | ||
700 | &mov ($__end,$acc); # end of key schedule | ||
701 | |||
702 | &mov ($s0,0x1b1b1b1b); # magic constant | ||
703 | &mov (&DWP(8,"esp"),$s0); | ||
704 | &mov (&DWP(12,"esp"),$s0); | ||
705 | |||
706 | # prefetch Te4 | ||
707 | &mov ($s0,&DWP(0-128,$tbl)); | ||
708 | &mov ($s1,&DWP(32-128,$tbl)); | ||
709 | &mov ($s2,&DWP(64-128,$tbl)); | ||
710 | &mov ($s3,&DWP(96-128,$tbl)); | ||
711 | &mov ($s0,&DWP(128-128,$tbl)); | ||
712 | &mov ($s1,&DWP(160-128,$tbl)); | ||
713 | &mov ($s2,&DWP(192-128,$tbl)); | ||
714 | &mov ($s3,&DWP(224-128,$tbl)); | ||
715 | |||
716 | &set_label("loop",16); | ||
717 | &sse_enccompact(); | ||
718 | &add ($key,16); | ||
719 | &cmp ($key,$__end); | ||
720 | &ja (&label("out")); | ||
721 | |||
722 | &movq ("mm2",&QWP(8,"esp")); | ||
723 | &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); | ||
724 | &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0 | ||
725 | &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4"); | ||
726 | &pand ("mm3","mm2"); &pand ("mm7","mm2"); | ||
727 | &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16) | ||
728 | &paddb ("mm0","mm0"); &paddb ("mm4","mm4"); | ||
729 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2 | ||
730 | &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0 | ||
731 | &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2 | ||
732 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16) | ||
733 | |||
734 | &movq ("mm2","mm3"); &movq ("mm6","mm7"); | ||
735 | &pslld ("mm3",8); &pslld ("mm7",8); | ||
736 | &psrld ("mm2",24); &psrld ("mm6",24); | ||
737 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8 | ||
738 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24 | ||
739 | |||
740 | &movq ("mm3","mm1"); &movq ("mm7","mm5"); | ||
741 | &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key)); | ||
742 | &psrld ("mm1",8); &psrld ("mm5",8); | ||
743 | &mov ($s0,&DWP(0-128,$tbl)); | ||
744 | &pslld ("mm3",24); &pslld ("mm7",24); | ||
745 | &mov ($s1,&DWP(64-128,$tbl)); | ||
746 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8 | ||
747 | &mov ($s2,&DWP(128-128,$tbl)); | ||
748 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24 | ||
749 | &mov ($s3,&DWP(192-128,$tbl)); | ||
750 | |||
751 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); | ||
752 | &jmp (&label("loop")); | ||
753 | |||
754 | &set_label("out",16); | ||
755 | &pxor ("mm0",&QWP(0,$key)); | ||
756 | &pxor ("mm4",&QWP(8,$key)); | ||
757 | |||
758 | &ret (); | ||
759 | &function_end_B("_sse_AES_encrypt_compact"); | ||
760 | } | ||
761 | |||
762 | ###################################################################### | ||
763 | # Vanilla block function. | ||
764 | ###################################################################### | ||
765 | |||
177 | sub encstep() | 766 | sub encstep() |
178 | { my ($i,$te,@s) = @_; | 767 | { my ($i,$te,@s) = @_; |
179 | my $tmp = $key; | 768 | my $tmp = $key; |
180 | my $out = $i==3?$s[0]:$acc; | 769 | my $out = $i==3?$s[0]:$acc; |
181 | 770 | ||
182 | # lines marked with #%e?x[i] denote "reordered" instructions... | 771 | # lines marked with #%e?x[i] denote "reordered" instructions... |
183 | if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx | 772 | if ($i==3) { &mov ($key,$__key); }##%edx |
184 | else { &mov ($out,$s[0]); | 773 | else { &mov ($out,$s[0]); |
185 | &and ($out,0xFF); } | 774 | &and ($out,0xFF); } |
186 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] | 775 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] |
@@ -191,14 +780,14 @@ sub encstep() | |||
191 | &movz ($tmp,&HB($s[1])); | 780 | &movz ($tmp,&HB($s[1])); |
192 | &xor ($out,&DWP(3,$te,$tmp,8)); | 781 | &xor ($out,&DWP(3,$te,$tmp,8)); |
193 | 782 | ||
194 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx | 783 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx |
195 | else { &mov ($tmp,$s[2]); | 784 | else { &mov ($tmp,$s[2]); |
196 | &shr ($tmp,16); } | 785 | &shr ($tmp,16); } |
197 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] | 786 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] |
198 | &and ($tmp,0xFF); | 787 | &and ($tmp,0xFF); |
199 | &xor ($out,&DWP(2,$te,$tmp,8)); | 788 | &xor ($out,&DWP(2,$te,$tmp,8)); |
200 | 789 | ||
201 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx | 790 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx |
202 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] | 791 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] |
203 | else { &mov ($tmp,$s[3]); | 792 | else { &mov ($tmp,$s[3]); |
204 | &shr ($tmp,24) } | 793 | &shr ($tmp,24) } |
@@ -213,7 +802,7 @@ sub enclast() | |||
213 | my $tmp = $key; | 802 | my $tmp = $key; |
214 | my $out = $i==3?$s[0]:$acc; | 803 | my $out = $i==3?$s[0]:$acc; |
215 | 804 | ||
216 | if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx | 805 | if ($i==3) { &mov ($key,$__key); }##%edx |
217 | else { &mov ($out,$s[0]); } | 806 | else { &mov ($out,$s[0]); } |
218 | &and ($out,0xFF); | 807 | &and ($out,0xFF); |
219 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] | 808 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] |
@@ -227,8 +816,8 @@ sub enclast() | |||
227 | &and ($tmp,0x0000ff00); | 816 | &and ($tmp,0x0000ff00); |
228 | &xor ($out,$tmp); | 817 | &xor ($out,$tmp); |
229 | 818 | ||
230 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx | 819 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx |
231 | else { mov ($tmp,$s[2]); | 820 | else { &mov ($tmp,$s[2]); |
232 | &shr ($tmp,16); } | 821 | &shr ($tmp,16); } |
233 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] | 822 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] |
234 | &and ($tmp,0xFF); | 823 | &and ($tmp,0xFF); |
@@ -236,7 +825,7 @@ sub enclast() | |||
236 | &and ($tmp,0x00ff0000); | 825 | &and ($tmp,0x00ff0000); |
237 | &xor ($out,$tmp); | 826 | &xor ($out,$tmp); |
238 | 827 | ||
239 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx | 828 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx |
240 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] | 829 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] |
241 | else { &mov ($tmp,$s[3]); | 830 | else { &mov ($tmp,$s[3]); |
242 | &shr ($tmp,24); } | 831 | &shr ($tmp,24); } |
@@ -247,10 +836,7 @@ sub enclast() | |||
247 | if ($i==3) { &mov ($s[3],$acc); } | 836 | if ($i==3) { &mov ($s[3],$acc); } |
248 | } | 837 | } |
249 | 838 | ||
250 | sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | 839 | &function_begin_B("_x86_AES_encrypt"); |
251 | |||
252 | &public_label("AES_Te"); | ||
253 | &function_begin_C("_x86_AES_encrypt"); | ||
254 | if ($vertical_spin) { | 840 | if ($vertical_spin) { |
255 | # I need high parts of volatile registers to be accessible... | 841 | # I need high parts of volatile registers to be accessible... |
256 | &exch ($s1="edi",$key="ebx"); | 842 | &exch ($s1="edi",$key="ebx"); |
@@ -258,7 +844,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
258 | } | 844 | } |
259 | 845 | ||
260 | # note that caller is expected to allocate stack frame for me! | 846 | # note that caller is expected to allocate stack frame for me! |
261 | &mov (&DWP(12,"esp"),$key); # save key | 847 | &mov ($__key,$key); # save key |
262 | 848 | ||
263 | &xor ($s0,&DWP(0,$key)); # xor with key | 849 | &xor ($s0,&DWP(0,$key)); # xor with key |
264 | &xor ($s1,&DWP(4,$key)); | 850 | &xor ($s1,&DWP(4,$key)); |
@@ -270,24 +856,24 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
270 | if ($small_footprint) { | 856 | if ($small_footprint) { |
271 | &lea ($acc,&DWP(-2,$acc,$acc)); | 857 | &lea ($acc,&DWP(-2,$acc,$acc)); |
272 | &lea ($acc,&DWP(0,$key,$acc,8)); | 858 | &lea ($acc,&DWP(0,$key,$acc,8)); |
273 | &mov (&DWP(16,"esp"),$acc); # end of key schedule | 859 | &mov ($__end,$acc); # end of key schedule |
274 | &align (4); | 860 | |
275 | &set_label("loop"); | 861 | &set_label("loop",16); |
276 | if ($vertical_spin) { | 862 | if ($vertical_spin) { |
277 | &encvert("ebp",$s0,$s1,$s2,$s3); | 863 | &encvert($tbl,$s0,$s1,$s2,$s3); |
278 | } else { | 864 | } else { |
279 | &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 865 | &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
280 | &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 866 | &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
281 | &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 867 | &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
282 | &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 868 | &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
283 | } | 869 | } |
284 | &add ($key,16); # advance rd_key | 870 | &add ($key,16); # advance rd_key |
285 | &xor ($s0,&DWP(0,$key)); | 871 | &xor ($s0,&DWP(0,$key)); |
286 | &xor ($s1,&DWP(4,$key)); | 872 | &xor ($s1,&DWP(4,$key)); |
287 | &xor ($s2,&DWP(8,$key)); | 873 | &xor ($s2,&DWP(8,$key)); |
288 | &xor ($s3,&DWP(12,$key)); | 874 | &xor ($s3,&DWP(12,$key)); |
289 | &cmp ($key,&DWP(16,"esp")); | 875 | &cmp ($key,$__end); |
290 | &mov (&DWP(12,"esp"),$key); | 876 | &mov ($__key,$key); |
291 | &jb (&label("loop")); | 877 | &jb (&label("loop")); |
292 | } | 878 | } |
293 | else { | 879 | else { |
@@ -296,15 +882,15 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
296 | &cmp ($acc,12); | 882 | &cmp ($acc,12); |
297 | &jle (&label("12rounds")); | 883 | &jle (&label("12rounds")); |
298 | 884 | ||
299 | &set_label("14rounds"); | 885 | &set_label("14rounds",4); |
300 | for ($i=1;$i<3;$i++) { | 886 | for ($i=1;$i<3;$i++) { |
301 | if ($vertical_spin) { | 887 | if ($vertical_spin) { |
302 | &encvert("ebp",$s0,$s1,$s2,$s3); | 888 | &encvert($tbl,$s0,$s1,$s2,$s3); |
303 | } else { | 889 | } else { |
304 | &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 890 | &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
305 | &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 891 | &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
306 | &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 892 | &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
307 | &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 893 | &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
308 | } | 894 | } |
309 | &xor ($s0,&DWP(16*$i+0,$key)); | 895 | &xor ($s0,&DWP(16*$i+0,$key)); |
310 | &xor ($s1,&DWP(16*$i+4,$key)); | 896 | &xor ($s1,&DWP(16*$i+4,$key)); |
@@ -312,16 +898,16 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
312 | &xor ($s3,&DWP(16*$i+12,$key)); | 898 | &xor ($s3,&DWP(16*$i+12,$key)); |
313 | } | 899 | } |
314 | &add ($key,32); | 900 | &add ($key,32); |
315 | &mov (&DWP(12,"esp"),$key); # advance rd_key | 901 | &mov ($__key,$key); # advance rd_key |
316 | &set_label("12rounds"); | 902 | &set_label("12rounds",4); |
317 | for ($i=1;$i<3;$i++) { | 903 | for ($i=1;$i<3;$i++) { |
318 | if ($vertical_spin) { | 904 | if ($vertical_spin) { |
319 | &encvert("ebp",$s0,$s1,$s2,$s3); | 905 | &encvert($tbl,$s0,$s1,$s2,$s3); |
320 | } else { | 906 | } else { |
321 | &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 907 | &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
322 | &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 908 | &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
323 | &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 909 | &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
324 | &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 910 | &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
325 | } | 911 | } |
326 | &xor ($s0,&DWP(16*$i+0,$key)); | 912 | &xor ($s0,&DWP(16*$i+0,$key)); |
327 | &xor ($s1,&DWP(16*$i+4,$key)); | 913 | &xor ($s1,&DWP(16*$i+4,$key)); |
@@ -329,16 +915,16 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
329 | &xor ($s3,&DWP(16*$i+12,$key)); | 915 | &xor ($s3,&DWP(16*$i+12,$key)); |
330 | } | 916 | } |
331 | &add ($key,32); | 917 | &add ($key,32); |
332 | &mov (&DWP(12,"esp"),$key); # advance rd_key | 918 | &mov ($__key,$key); # advance rd_key |
333 | &set_label("10rounds"); | 919 | &set_label("10rounds",4); |
334 | for ($i=1;$i<10;$i++) { | 920 | for ($i=1;$i<10;$i++) { |
335 | if ($vertical_spin) { | 921 | if ($vertical_spin) { |
336 | &encvert("ebp",$s0,$s1,$s2,$s3); | 922 | &encvert($tbl,$s0,$s1,$s2,$s3); |
337 | } else { | 923 | } else { |
338 | &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 924 | &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
339 | &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 925 | &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
340 | &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 926 | &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
341 | &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 927 | &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
342 | } | 928 | } |
343 | &xor ($s0,&DWP(16*$i+0,$key)); | 929 | &xor ($s0,&DWP(16*$i+0,$key)); |
344 | &xor ($s1,&DWP(16*$i+4,$key)); | 930 | &xor ($s1,&DWP(16*$i+4,$key)); |
@@ -352,10 +938,10 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
352 | &mov ($s1="ebx",$key="edi"); | 938 | &mov ($s1="ebx",$key="edi"); |
353 | &mov ($s2="ecx",$acc="esi"); | 939 | &mov ($s2="ecx",$acc="esi"); |
354 | } | 940 | } |
355 | &enclast(0,"ebp",$s0,$s1,$s2,$s3); | 941 | &enclast(0,$tbl,$s0,$s1,$s2,$s3); |
356 | &enclast(1,"ebp",$s1,$s2,$s3,$s0); | 942 | &enclast(1,$tbl,$s1,$s2,$s3,$s0); |
357 | &enclast(2,"ebp",$s2,$s3,$s0,$s1); | 943 | &enclast(2,$tbl,$s2,$s3,$s0,$s1); |
358 | &enclast(3,"ebp",$s3,$s0,$s1,$s2); | 944 | &enclast(3,$tbl,$s3,$s0,$s1,$s2); |
359 | 945 | ||
360 | &add ($key,$small_footprint?16:160); | 946 | &add ($key,$small_footprint?16:160); |
361 | &xor ($s0,&DWP(0,$key)); | 947 | &xor ($s0,&DWP(0,$key)); |
@@ -430,38 +1016,198 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
430 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); | 1016 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); |
431 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); | 1017 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); |
432 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); | 1018 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); |
1019 | |||
1020 | #Te4 # four copies of Te4 to choose from to avoid L1 aliasing | ||
1021 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
1022 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
1023 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
1024 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
1025 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
1026 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
1027 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
1028 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
1029 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
1030 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
1031 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
1032 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
1033 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
1034 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
1035 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
1036 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
1037 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
1038 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
1039 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
1040 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
1041 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
1042 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
1043 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
1044 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
1045 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
1046 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
1047 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
1048 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
1049 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
1050 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
1051 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
1052 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
1053 | |||
1054 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
1055 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
1056 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
1057 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
1058 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
1059 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
1060 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
1061 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
1062 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
1063 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
1064 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
1065 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
1066 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
1067 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
1068 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
1069 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
1070 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
1071 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
1072 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
1073 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
1074 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
1075 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
1076 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
1077 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
1078 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
1079 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
1080 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
1081 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
1082 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
1083 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
1084 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
1085 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
1086 | |||
1087 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
1088 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
1089 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
1090 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
1091 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
1092 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
1093 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
1094 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
1095 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
1096 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
1097 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
1098 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
1099 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
1100 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
1101 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
1102 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
1103 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
1104 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
1105 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
1106 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
1107 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
1108 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
1109 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
1110 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
1111 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
1112 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
1113 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
1114 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
1115 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
1116 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
1117 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
1118 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
1119 | |||
1120 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
1121 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
1122 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
1123 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
1124 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
1125 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
1126 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
1127 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
1128 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
1129 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
1130 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
1131 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
1132 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
1133 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
1134 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
1135 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
1136 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
1137 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
1138 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
1139 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
1140 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
1141 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
1142 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
1143 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
1144 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
1145 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
1146 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
1147 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
1148 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
1149 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
1150 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
1151 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
433 | #rcon: | 1152 | #rcon: |
434 | &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008); | 1153 | &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008); |
435 | &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080); | 1154 | &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080); |
436 | &data_word(0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0); | 1155 | &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000); |
1156 | &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000); | ||
437 | &function_end_B("_x86_AES_encrypt"); | 1157 | &function_end_B("_x86_AES_encrypt"); |
438 | 1158 | ||
439 | # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); | 1159 | # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); |
440 | &public_label("AES_Te"); | ||
441 | &function_begin("AES_encrypt"); | 1160 | &function_begin("AES_encrypt"); |
442 | &mov ($acc,&wparam(0)); # load inp | 1161 | &mov ($acc,&wparam(0)); # load inp |
443 | &mov ($key,&wparam(2)); # load key | 1162 | &mov ($key,&wparam(2)); # load key |
444 | 1163 | ||
445 | &mov ($s0,"esp"); | 1164 | &mov ($s0,"esp"); |
446 | &sub ("esp",24); | 1165 | &sub ("esp",36); |
447 | &and ("esp",-64); | 1166 | &and ("esp",-64); # align to cache-line |
448 | &add ("esp",4); | 1167 | |
449 | &mov (&DWP(16,"esp"),$s0); | 1168 | # place stack frame just "above" the key schedule |
1169 | &lea ($s1,&DWP(-64-63,$key)); | ||
1170 | &sub ($s1,"esp"); | ||
1171 | &neg ($s1); | ||
1172 | &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line | ||
1173 | &sub ("esp",$s1); | ||
1174 | &add ("esp",4); # 4 is reserved for caller's return address | ||
1175 | &mov ($_esp,$s0); # save stack pointer | ||
450 | 1176 | ||
451 | &call (&label("pic_point")); # make it PIC! | 1177 | &call (&label("pic_point")); # make it PIC! |
452 | &set_label("pic_point"); | 1178 | &set_label("pic_point"); |
453 | &blindpop("ebp"); | 1179 | &blindpop($tbl); |
454 | &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 1180 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only); |
455 | 1181 | &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); | |
1182 | |||
1183 | # pick Te4 copy which can't "overlap" with stack frame or key schedule | ||
1184 | &lea ($s1,&DWP(768-4,"esp")); | ||
1185 | &sub ($s1,$tbl); | ||
1186 | &and ($s1,0x300); | ||
1187 | &lea ($tbl,&DWP(2048+128,$tbl,$s1)); | ||
1188 | |||
1189 | if (!$x86only) { | ||
1190 | &bt (&DWP(0,$s0),25); # check for SSE bit | ||
1191 | &jnc (&label("x86")); | ||
1192 | |||
1193 | &movq ("mm0",&QWP(0,$acc)); | ||
1194 | &movq ("mm4",&QWP(8,$acc)); | ||
1195 | &call ("_sse_AES_encrypt_compact"); | ||
1196 | &mov ("esp",$_esp); # restore stack pointer | ||
1197 | &mov ($acc,&wparam(1)); # load out | ||
1198 | &movq (&QWP(0,$acc),"mm0"); # write output data | ||
1199 | &movq (&QWP(8,$acc),"mm4"); | ||
1200 | &emms (); | ||
1201 | &function_end_A(); | ||
1202 | } | ||
1203 | &set_label("x86",16); | ||
1204 | &mov ($_tbl,$tbl); | ||
456 | &mov ($s0,&DWP(0,$acc)); # load input data | 1205 | &mov ($s0,&DWP(0,$acc)); # load input data |
457 | &mov ($s1,&DWP(4,$acc)); | 1206 | &mov ($s1,&DWP(4,$acc)); |
458 | &mov ($s2,&DWP(8,$acc)); | 1207 | &mov ($s2,&DWP(8,$acc)); |
459 | &mov ($s3,&DWP(12,$acc)); | 1208 | &mov ($s3,&DWP(12,$acc)); |
460 | 1209 | &call ("_x86_AES_encrypt_compact"); | |
461 | &call ("_x86_AES_encrypt"); | 1210 | &mov ("esp",$_esp); # restore stack pointer |
462 | |||
463 | &mov ("esp",&DWP(16,"esp")); | ||
464 | |||
465 | &mov ($acc,&wparam(1)); # load out | 1211 | &mov ($acc,&wparam(1)); # load out |
466 | &mov (&DWP(0,$acc),$s0); # write output data | 1212 | &mov (&DWP(0,$acc),$s0); # write output data |
467 | &mov (&DWP(4,$acc),$s1); | 1213 | &mov (&DWP(4,$acc),$s1); |
@@ -469,7 +1215,370 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
469 | &mov (&DWP(12,$acc),$s3); | 1215 | &mov (&DWP(12,$acc),$s3); |
470 | &function_end("AES_encrypt"); | 1216 | &function_end("AES_encrypt"); |
471 | 1217 | ||
472 | #------------------------------------------------------------------# | 1218 | #--------------------------------------------------------------------# |
1219 | |||
1220 | ###################################################################### | ||
1221 | # "Compact" block function | ||
1222 | ###################################################################### | ||
1223 | |||
1224 | sub deccompact() | ||
1225 | { my $Fn = mov; | ||
1226 | while ($#_>5) { pop(@_); $Fn=sub{}; } | ||
1227 | my ($i,$td,@s)=@_; | ||
1228 | my $tmp = $key; | ||
1229 | my $out = $i==3?$s[0]:$acc; | ||
1230 | |||
1231 | # $Fn is used in the first compact round and its purpose is to | ||
1232 | # suppress restoration of some values from the stack, so that | ||
1233 | # after 4xdeccompact with the extra argument, the $key, $s0 and | ||
1234 | # $s1 values are left there... | ||
1235 | if($i==3) { &$Fn ($key,$__key); } | ||
1236 | else { &mov ($out,$s[0]); } | ||
1237 | &and ($out,0xFF); | ||
1238 | &movz ($out,&BP(-128,$td,$out,1)); | ||
1239 | |||
1240 | if ($i==3) { $tmp=$s[1]; } | ||
1241 | &movz ($tmp,&HB($s[1])); | ||
1242 | &movz ($tmp,&BP(-128,$td,$tmp,1)); | ||
1243 | &shl ($tmp,8); | ||
1244 | &xor ($out,$tmp); | ||
1245 | |||
1246 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); } | ||
1247 | else { mov ($tmp,$s[2]); } | ||
1248 | &shr ($tmp,16); | ||
1249 | &and ($tmp,0xFF); | ||
1250 | &movz ($tmp,&BP(-128,$td,$tmp,1)); | ||
1251 | &shl ($tmp,16); | ||
1252 | &xor ($out,$tmp); | ||
1253 | |||
1254 | if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); } | ||
1255 | else { &mov ($tmp,$s[3]); } | ||
1256 | &shr ($tmp,24); | ||
1257 | &movz ($tmp,&BP(-128,$td,$tmp,1)); | ||
1258 | &shl ($tmp,24); | ||
1259 | &xor ($out,$tmp); | ||
1260 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | ||
1261 | if ($i==3) { &$Fn ($s[3],$__s0); } | ||
1262 | } | ||
1263 | |||
1264 | # must be called with 2,3,0,1 as argument sequence!!! | ||
1265 | sub dectransform() | ||
1266 | { my @s = ($s0,$s1,$s2,$s3); | ||
1267 | my $i = shift; | ||
1268 | my $tmp = $key; | ||
1269 | my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1); | ||
1270 | my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1); | ||
1271 | my $tp8 = $tbl; | ||
1272 | |||
1273 | &mov ($acc,$s[$i]); | ||
1274 | &and ($acc,0x80808080); | ||
1275 | &mov ($tmp,$acc); | ||
1276 | &shr ($tmp,7); | ||
1277 | &lea ($tp2,&DWP(0,$s[$i],$s[$i])); | ||
1278 | &sub ($acc,$tmp); | ||
1279 | &and ($tp2,0xfefefefe); | ||
1280 | &and ($acc,0x1b1b1b1b); | ||
1281 | &xor ($acc,$tp2); | ||
1282 | &mov ($tp2,$acc); | ||
1283 | |||
1284 | &and ($acc,0x80808080); | ||
1285 | &mov ($tmp,$acc); | ||
1286 | &shr ($tmp,7); | ||
1287 | &lea ($tp4,&DWP(0,$tp2,$tp2)); | ||
1288 | &sub ($acc,$tmp); | ||
1289 | &and ($tp4,0xfefefefe); | ||
1290 | &and ($acc,0x1b1b1b1b); | ||
1291 | &xor ($tp2,$s[$i]); # tp2^tp1 | ||
1292 | &xor ($acc,$tp4); | ||
1293 | &mov ($tp4,$acc); | ||
1294 | |||
1295 | &and ($acc,0x80808080); | ||
1296 | &mov ($tmp,$acc); | ||
1297 | &shr ($tmp,7); | ||
1298 | &lea ($tp8,&DWP(0,$tp4,$tp4)); | ||
1299 | &sub ($acc,$tmp); | ||
1300 | &and ($tp8,0xfefefefe); | ||
1301 | &and ($acc,0x1b1b1b1b); | ||
1302 | &xor ($tp4,$s[$i]); # tp4^tp1 | ||
1303 | &rotl ($s[$i],8); # = ROTATE(tp1,8) | ||
1304 | &xor ($tp8,$acc); | ||
1305 | |||
1306 | &xor ($s[$i],$tp2); | ||
1307 | &xor ($tp2,$tp8); | ||
1308 | &rotl ($tp2,24); | ||
1309 | &xor ($s[$i],$tp4); | ||
1310 | &xor ($tp4,$tp8); | ||
1311 | &rotl ($tp4,16); | ||
1312 | &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) | ||
1313 | &rotl ($tp8,8); | ||
1314 | &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24) | ||
1315 | &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16) | ||
1316 | &mov ($s[0],$__s0) if($i==2); #prefetch $s0 | ||
1317 | &mov ($s[1],$__s1) if($i==3); #prefetch $s1 | ||
1318 | &mov ($s[2],$__s2) if($i==1); | ||
1319 | &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8) | ||
1320 | |||
1321 | &mov ($s[3],$__s3) if($i==1); | ||
1322 | &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2); | ||
1323 | } | ||
1324 | |||
1325 | &function_begin_B("_x86_AES_decrypt_compact"); | ||
1326 | # note that caller is expected to allocate stack frame for me! | ||
1327 | &mov ($__key,$key); # save key | ||
1328 | |||
1329 | &xor ($s0,&DWP(0,$key)); # xor with key | ||
1330 | &xor ($s1,&DWP(4,$key)); | ||
1331 | &xor ($s2,&DWP(8,$key)); | ||
1332 | &xor ($s3,&DWP(12,$key)); | ||
1333 | |||
1334 | &mov ($acc,&DWP(240,$key)); # load key->rounds | ||
1335 | |||
1336 | &lea ($acc,&DWP(-2,$acc,$acc)); | ||
1337 | &lea ($acc,&DWP(0,$key,$acc,8)); | ||
1338 | &mov ($__end,$acc); # end of key schedule | ||
1339 | |||
1340 | # prefetch Td4 | ||
1341 | &mov ($key,&DWP(0-128,$tbl)); | ||
1342 | &mov ($acc,&DWP(32-128,$tbl)); | ||
1343 | &mov ($key,&DWP(64-128,$tbl)); | ||
1344 | &mov ($acc,&DWP(96-128,$tbl)); | ||
1345 | &mov ($key,&DWP(128-128,$tbl)); | ||
1346 | &mov ($acc,&DWP(160-128,$tbl)); | ||
1347 | &mov ($key,&DWP(192-128,$tbl)); | ||
1348 | &mov ($acc,&DWP(224-128,$tbl)); | ||
1349 | |||
1350 | &set_label("loop",16); | ||
1351 | |||
1352 | &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1); | ||
1353 | &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1); | ||
1354 | &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1); | ||
1355 | &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1); | ||
1356 | &dectransform(2); | ||
1357 | &dectransform(3); | ||
1358 | &dectransform(0); | ||
1359 | &dectransform(1); | ||
1360 | &mov ($key,$__key); | ||
1361 | &mov ($tbl,$__tbl); | ||
1362 | &add ($key,16); # advance rd_key | ||
1363 | &xor ($s0,&DWP(0,$key)); | ||
1364 | &xor ($s1,&DWP(4,$key)); | ||
1365 | &xor ($s2,&DWP(8,$key)); | ||
1366 | &xor ($s3,&DWP(12,$key)); | ||
1367 | |||
1368 | &cmp ($key,$__end); | ||
1369 | &mov ($__key,$key); | ||
1370 | &jb (&label("loop")); | ||
1371 | |||
1372 | &deccompact(0,$tbl,$s0,$s3,$s2,$s1); | ||
1373 | &deccompact(1,$tbl,$s1,$s0,$s3,$s2); | ||
1374 | &deccompact(2,$tbl,$s2,$s1,$s0,$s3); | ||
1375 | &deccompact(3,$tbl,$s3,$s2,$s1,$s0); | ||
1376 | |||
1377 | &xor ($s0,&DWP(16,$key)); | ||
1378 | &xor ($s1,&DWP(20,$key)); | ||
1379 | &xor ($s2,&DWP(24,$key)); | ||
1380 | &xor ($s3,&DWP(28,$key)); | ||
1381 | |||
1382 | &ret (); | ||
1383 | &function_end_B("_x86_AES_decrypt_compact"); | ||
1384 | |||
1385 | ###################################################################### | ||
1386 | # "Compact" SSE block function. | ||
1387 | ###################################################################### | ||
1388 | |||
1389 | sub sse_deccompact() | ||
1390 | { | ||
1391 | &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0 | ||
1392 | &movd ("eax","mm1"); # 7, 6, 1, 0 | ||
1393 | |||
1394 | &pshufw ("mm5","mm4",0x09); # 13,12,11,10 | ||
1395 | &movz ($acc,&LB("eax")); # 0 | ||
1396 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 | ||
1397 | &movd ("ebx","mm5"); # 13,12,11,10 | ||
1398 | &movz ("edx",&HB("eax")); # 1 | ||
1399 | &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 | ||
1400 | &shl ("edx",8); # 1 | ||
1401 | |||
1402 | &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 | ||
1403 | &movz ($acc,&LB("ebx")); # 10 | ||
1404 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 | ||
1405 | &shl ($acc,16); # 10 | ||
1406 | &or ("ecx",$acc); # 10 | ||
1407 | &shr ("eax",16); # 7, 6 | ||
1408 | &movz ($acc,&HB("ebx")); # 11 | ||
1409 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 | ||
1410 | &shl ($acc,24); # 11 | ||
1411 | &or ("edx",$acc); # 11 | ||
1412 | &shr ("ebx",16); # 13,12 | ||
1413 | |||
1414 | &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 | ||
1415 | &movz ($acc,&HB("eax")); # 7 | ||
1416 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 | ||
1417 | &shl ($acc,24); # 7 | ||
1418 | &or ("ecx",$acc); # 7 | ||
1419 | &movz ($acc,&HB("ebx")); # 13 | ||
1420 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 | ||
1421 | &shl ($acc,8); # 13 | ||
1422 | &or ("ecx",$acc); # 13 | ||
1423 | &movd ("mm0","ecx"); # t[0] collected | ||
1424 | |||
1425 | &movz ($acc,&LB("eax")); # 6 | ||
1426 | &movd ("eax","mm2"); # 3, 2, 5, 4 | ||
1427 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6 | ||
1428 | &shl ("ecx",16); # 6 | ||
1429 | &movz ($acc,&LB("ebx")); # 12 | ||
1430 | &movd ("ebx","mm6"); # 9, 8,15,14 | ||
1431 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12 | ||
1432 | &or ("ecx",$acc); # 12 | ||
1433 | |||
1434 | &movz ($acc,&LB("eax")); # 4 | ||
1435 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4 | ||
1436 | &or ("edx",$acc); # 4 | ||
1437 | &movz ($acc,&LB("ebx")); # 14 | ||
1438 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 | ||
1439 | &shl ($acc,16); # 14 | ||
1440 | &or ("edx",$acc); # 14 | ||
1441 | &movd ("mm1","edx"); # t[1] collected | ||
1442 | |||
1443 | &movz ($acc,&HB("eax")); # 5 | ||
1444 | &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5 | ||
1445 | &shl ("edx",8); # 5 | ||
1446 | &movz ($acc,&HB("ebx")); # 15 | ||
1447 | &shr ("eax",16); # 3, 2 | ||
1448 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 | ||
1449 | &shl ($acc,24); # 15 | ||
1450 | &or ("edx",$acc); # 15 | ||
1451 | &shr ("ebx",16); # 9, 8 | ||
1452 | |||
1453 | &punpckldq ("mm0","mm1"); # t[0,1] collected | ||
1454 | |||
1455 | &movz ($acc,&HB("ebx")); # 9 | ||
1456 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 | ||
1457 | &shl ($acc,8); # 9 | ||
1458 | &or ("ecx",$acc); # 9 | ||
1459 | &and ("ebx",0xff); # 8 | ||
1460 | &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8 | ||
1461 | &or ("edx","ebx"); # 8 | ||
1462 | &movz ($acc,&LB("eax")); # 2 | ||
1463 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 | ||
1464 | &shl ($acc,16); # 2 | ||
1465 | &or ("edx",$acc); # 2 | ||
1466 | &movd ("mm4","edx"); # t[2] collected | ||
1467 | &movz ("eax",&HB("eax")); # 3 | ||
1468 | &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3 | ||
1469 | &shl ("eax",24); # 3 | ||
1470 | &or ("ecx","eax"); # 3 | ||
1471 | &movd ("mm5","ecx"); # t[3] collected | ||
1472 | |||
1473 | &punpckldq ("mm4","mm5"); # t[2,3] collected | ||
1474 | } | ||
1475 | |||
1476 | if (!$x86only) { | ||
1477 | &function_begin_B("_sse_AES_decrypt_compact"); | ||
1478 | &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0 | ||
1479 | &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8 | ||
1480 | |||
1481 | # note that caller is expected to allocate stack frame for me! | ||
1482 | &mov ($acc,&DWP(240,$key)); # load key->rounds | ||
1483 | &lea ($acc,&DWP(-2,$acc,$acc)); | ||
1484 | &lea ($acc,&DWP(0,$key,$acc,8)); | ||
1485 | &mov ($__end,$acc); # end of key schedule | ||
1486 | |||
1487 | &mov ($s0,0x1b1b1b1b); # magic constant | ||
1488 | &mov (&DWP(8,"esp"),$s0); | ||
1489 | &mov (&DWP(12,"esp"),$s0); | ||
1490 | |||
1491 | # prefetch Td4 | ||
1492 | &mov ($s0,&DWP(0-128,$tbl)); | ||
1493 | &mov ($s1,&DWP(32-128,$tbl)); | ||
1494 | &mov ($s2,&DWP(64-128,$tbl)); | ||
1495 | &mov ($s3,&DWP(96-128,$tbl)); | ||
1496 | &mov ($s0,&DWP(128-128,$tbl)); | ||
1497 | &mov ($s1,&DWP(160-128,$tbl)); | ||
1498 | &mov ($s2,&DWP(192-128,$tbl)); | ||
1499 | &mov ($s3,&DWP(224-128,$tbl)); | ||
1500 | |||
1501 | &set_label("loop",16); | ||
1502 | &sse_deccompact(); | ||
1503 | &add ($key,16); | ||
1504 | &cmp ($key,$__end); | ||
1505 | &ja (&label("out")); | ||
1506 | |||
1507 | # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N) | ||
1508 | &movq ("mm3","mm0"); &movq ("mm7","mm4"); | ||
1509 | &movq ("mm2","mm0",1); &movq ("mm6","mm4",1); | ||
1510 | &movq ("mm1","mm0"); &movq ("mm5","mm4"); | ||
1511 | &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16) | ||
1512 | &pslld ("mm2",8); &pslld ("mm6",8); | ||
1513 | &psrld ("mm3",8); &psrld ("mm7",8); | ||
1514 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8 | ||
1515 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8 | ||
1516 | &pslld ("mm2",16); &pslld ("mm6",16); | ||
1517 | &psrld ("mm3",16); &psrld ("mm7",16); | ||
1518 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24 | ||
1519 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24 | ||
1520 | |||
1521 | &movq ("mm3",&QWP(8,"esp")); | ||
1522 | &pxor ("mm2","mm2"); &pxor ("mm6","mm6"); | ||
1523 | &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5"); | ||
1524 | &pand ("mm2","mm3"); &pand ("mm6","mm3"); | ||
1525 | &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); | ||
1526 | &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2 | ||
1527 | &movq ("mm3","mm1"); &movq ("mm7","mm5"); | ||
1528 | &movq ("mm2","mm1"); &movq ("mm6","mm5"); | ||
1529 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2 | ||
1530 | &pslld ("mm3",24); &pslld ("mm7",24); | ||
1531 | &psrld ("mm2",8); &psrld ("mm6",8); | ||
1532 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24 | ||
1533 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8 | ||
1534 | |||
1535 | &movq ("mm2",&QWP(8,"esp")); | ||
1536 | &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); | ||
1537 | &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5"); | ||
1538 | &pand ("mm3","mm2"); &pand ("mm7","mm2"); | ||
1539 | &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); | ||
1540 | &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4 | ||
1541 | &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1); | ||
1542 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4 | ||
1543 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16) | ||
1544 | |||
1545 | &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); | ||
1546 | &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5"); | ||
1547 | &pand ("mm3","mm2"); &pand ("mm7","mm2"); | ||
1548 | &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); | ||
1549 | &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8 | ||
1550 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8 | ||
1551 | &movq ("mm3","mm1"); &movq ("mm7","mm5"); | ||
1552 | &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1); | ||
1553 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16) | ||
1554 | &pslld ("mm1",8); &pslld ("mm5",8); | ||
1555 | &psrld ("mm3",8); &psrld ("mm7",8); | ||
1556 | &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key)); | ||
1557 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8 | ||
1558 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8 | ||
1559 | &mov ($s0,&DWP(0-128,$tbl)); | ||
1560 | &pslld ("mm1",16); &pslld ("mm5",16); | ||
1561 | &mov ($s1,&DWP(64-128,$tbl)); | ||
1562 | &psrld ("mm3",16); &psrld ("mm7",16); | ||
1563 | &mov ($s2,&DWP(128-128,$tbl)); | ||
1564 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24 | ||
1565 | &mov ($s3,&DWP(192-128,$tbl)); | ||
1566 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24 | ||
1567 | |||
1568 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); | ||
1569 | &jmp (&label("loop")); | ||
1570 | |||
1571 | &set_label("out",16); | ||
1572 | &pxor ("mm0",&QWP(0,$key)); | ||
1573 | &pxor ("mm4",&QWP(8,$key)); | ||
1574 | |||
1575 | &ret (); | ||
1576 | &function_end_B("_sse_AES_decrypt_compact"); | ||
1577 | } | ||
1578 | |||
1579 | ###################################################################### | ||
1580 | # Vanilla block function. | ||
1581 | ###################################################################### | ||
473 | 1582 | ||
474 | sub decstep() | 1583 | sub decstep() |
475 | { my ($i,$td,@s) = @_; | 1584 | { my ($i,$td,@s) = @_; |
@@ -480,7 +1589,7 @@ sub decstep() | |||
480 | # optimal... or rather that all attempts to reorder didn't | 1589 | # optimal... or rather that all attempts to reorder didn't |
481 | # result in better performance [which by the way is not a | 1590 | # result in better performance [which by the way is not a |
482 | # bit lower than encryption]. | 1591 | # bit lower than encryption]. |
483 | if($i==3) { &mov ($key,&DWP(12,"esp")); } | 1592 | if($i==3) { &mov ($key,$__key); } |
484 | else { &mov ($out,$s[0]); } | 1593 | else { &mov ($out,$s[0]); } |
485 | &and ($out,0xFF); | 1594 | &and ($out,0xFF); |
486 | &mov ($out,&DWP(0,$td,$out,8)); | 1595 | &mov ($out,&DWP(0,$td,$out,8)); |
@@ -495,12 +1604,12 @@ sub decstep() | |||
495 | &and ($tmp,0xFF); | 1604 | &and ($tmp,0xFF); |
496 | &xor ($out,&DWP(2,$td,$tmp,8)); | 1605 | &xor ($out,&DWP(2,$td,$tmp,8)); |
497 | 1606 | ||
498 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); } | 1607 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); } |
499 | else { &mov ($tmp,$s[3]); } | 1608 | else { &mov ($tmp,$s[3]); } |
500 | &shr ($tmp,24); | 1609 | &shr ($tmp,24); |
501 | &xor ($out,&DWP(1,$td,$tmp,8)); | 1610 | &xor ($out,&DWP(1,$td,$tmp,8)); |
502 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | 1611 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } |
503 | if ($i==3) { &mov ($s[3],&DWP(4,"esp")); } | 1612 | if ($i==3) { &mov ($s[3],$__s0); } |
504 | &comment(); | 1613 | &comment(); |
505 | } | 1614 | } |
506 | 1615 | ||
@@ -509,14 +1618,24 @@ sub declast() | |||
509 | my $tmp = $key; | 1618 | my $tmp = $key; |
510 | my $out = $i==3?$s[0]:$acc; | 1619 | my $out = $i==3?$s[0]:$acc; |
511 | 1620 | ||
512 | if($i==3) { &mov ($key,&DWP(12,"esp")); } | 1621 | if($i==0) { &lea ($td,&DWP(2048+128,$td)); |
1622 | &mov ($tmp,&DWP(0-128,$td)); | ||
1623 | &mov ($acc,&DWP(32-128,$td)); | ||
1624 | &mov ($tmp,&DWP(64-128,$td)); | ||
1625 | &mov ($acc,&DWP(96-128,$td)); | ||
1626 | &mov ($tmp,&DWP(128-128,$td)); | ||
1627 | &mov ($acc,&DWP(160-128,$td)); | ||
1628 | &mov ($tmp,&DWP(192-128,$td)); | ||
1629 | &mov ($acc,&DWP(224-128,$td)); | ||
1630 | &lea ($td,&DWP(-128,$td)); } | ||
1631 | if($i==3) { &mov ($key,$__key); } | ||
513 | else { &mov ($out,$s[0]); } | 1632 | else { &mov ($out,$s[0]); } |
514 | &and ($out,0xFF); | 1633 | &and ($out,0xFF); |
515 | &movz ($out,&BP(2048,$td,$out,1)); | 1634 | &movz ($out,&BP(0,$td,$out,1)); |
516 | 1635 | ||
517 | if ($i==3) { $tmp=$s[1]; } | 1636 | if ($i==3) { $tmp=$s[1]; } |
518 | &movz ($tmp,&HB($s[1])); | 1637 | &movz ($tmp,&HB($s[1])); |
519 | &movz ($tmp,&BP(2048,$td,$tmp,1)); | 1638 | &movz ($tmp,&BP(0,$td,$tmp,1)); |
520 | &shl ($tmp,8); | 1639 | &shl ($tmp,8); |
521 | &xor ($out,$tmp); | 1640 | &xor ($out,$tmp); |
522 | 1641 | ||
@@ -524,24 +1643,24 @@ sub declast() | |||
524 | else { mov ($tmp,$s[2]); } | 1643 | else { mov ($tmp,$s[2]); } |
525 | &shr ($tmp,16); | 1644 | &shr ($tmp,16); |
526 | &and ($tmp,0xFF); | 1645 | &and ($tmp,0xFF); |
527 | &movz ($tmp,&BP(2048,$td,$tmp,1)); | 1646 | &movz ($tmp,&BP(0,$td,$tmp,1)); |
528 | &shl ($tmp,16); | 1647 | &shl ($tmp,16); |
529 | &xor ($out,$tmp); | 1648 | &xor ($out,$tmp); |
530 | 1649 | ||
531 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); } | 1650 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); } |
532 | else { &mov ($tmp,$s[3]); } | 1651 | else { &mov ($tmp,$s[3]); } |
533 | &shr ($tmp,24); | 1652 | &shr ($tmp,24); |
534 | &movz ($tmp,&BP(2048,$td,$tmp,1)); | 1653 | &movz ($tmp,&BP(0,$td,$tmp,1)); |
535 | &shl ($tmp,24); | 1654 | &shl ($tmp,24); |
536 | &xor ($out,$tmp); | 1655 | &xor ($out,$tmp); |
537 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | 1656 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } |
538 | if ($i==3) { &mov ($s[3],&DWP(4,"esp")); } | 1657 | if ($i==3) { &mov ($s[3],$__s0); |
1658 | &lea ($td,&DWP(-2048,$td)); } | ||
539 | } | 1659 | } |
540 | 1660 | ||
541 | &public_label("AES_Td"); | 1661 | &function_begin_B("_x86_AES_decrypt"); |
542 | &function_begin_C("_x86_AES_decrypt"); | ||
543 | # note that caller is expected to allocate stack frame for me! | 1662 | # note that caller is expected to allocate stack frame for me! |
544 | &mov (&DWP(12,"esp"),$key); # save key | 1663 | &mov ($__key,$key); # save key |
545 | 1664 | ||
546 | &xor ($s0,&DWP(0,$key)); # xor with key | 1665 | &xor ($s0,&DWP(0,$key)); # xor with key |
547 | &xor ($s1,&DWP(4,$key)); | 1666 | &xor ($s1,&DWP(4,$key)); |
@@ -553,20 +1672,19 @@ sub declast() | |||
553 | if ($small_footprint) { | 1672 | if ($small_footprint) { |
554 | &lea ($acc,&DWP(-2,$acc,$acc)); | 1673 | &lea ($acc,&DWP(-2,$acc,$acc)); |
555 | &lea ($acc,&DWP(0,$key,$acc,8)); | 1674 | &lea ($acc,&DWP(0,$key,$acc,8)); |
556 | &mov (&DWP(16,"esp"),$acc); # end of key schedule | 1675 | &mov ($__end,$acc); # end of key schedule |
557 | &align (4); | 1676 | &set_label("loop",16); |
558 | &set_label("loop"); | 1677 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
559 | &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1678 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
560 | &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1679 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
561 | &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1680 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
562 | &decstep(3,"ebp",$s3,$s2,$s1,$s0); | ||
563 | &add ($key,16); # advance rd_key | 1681 | &add ($key,16); # advance rd_key |
564 | &xor ($s0,&DWP(0,$key)); | 1682 | &xor ($s0,&DWP(0,$key)); |
565 | &xor ($s1,&DWP(4,$key)); | 1683 | &xor ($s1,&DWP(4,$key)); |
566 | &xor ($s2,&DWP(8,$key)); | 1684 | &xor ($s2,&DWP(8,$key)); |
567 | &xor ($s3,&DWP(12,$key)); | 1685 | &xor ($s3,&DWP(12,$key)); |
568 | &cmp ($key,&DWP(16,"esp")); | 1686 | &cmp ($key,$__end); |
569 | &mov (&DWP(12,"esp"),$key); | 1687 | &mov ($__key,$key); |
570 | &jb (&label("loop")); | 1688 | &jb (&label("loop")); |
571 | } | 1689 | } |
572 | else { | 1690 | else { |
@@ -575,38 +1693,38 @@ sub declast() | |||
575 | &cmp ($acc,12); | 1693 | &cmp ($acc,12); |
576 | &jle (&label("12rounds")); | 1694 | &jle (&label("12rounds")); |
577 | 1695 | ||
578 | &set_label("14rounds"); | 1696 | &set_label("14rounds",4); |
579 | for ($i=1;$i<3;$i++) { | 1697 | for ($i=1;$i<3;$i++) { |
580 | &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1698 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
581 | &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1699 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
582 | &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1700 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
583 | &decstep(3,"ebp",$s3,$s2,$s1,$s0); | 1701 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
584 | &xor ($s0,&DWP(16*$i+0,$key)); | 1702 | &xor ($s0,&DWP(16*$i+0,$key)); |
585 | &xor ($s1,&DWP(16*$i+4,$key)); | 1703 | &xor ($s1,&DWP(16*$i+4,$key)); |
586 | &xor ($s2,&DWP(16*$i+8,$key)); | 1704 | &xor ($s2,&DWP(16*$i+8,$key)); |
587 | &xor ($s3,&DWP(16*$i+12,$key)); | 1705 | &xor ($s3,&DWP(16*$i+12,$key)); |
588 | } | 1706 | } |
589 | &add ($key,32); | 1707 | &add ($key,32); |
590 | &mov (&DWP(12,"esp"),$key); # advance rd_key | 1708 | &mov ($__key,$key); # advance rd_key |
591 | &set_label("12rounds"); | 1709 | &set_label("12rounds",4); |
592 | for ($i=1;$i<3;$i++) { | 1710 | for ($i=1;$i<3;$i++) { |
593 | &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1711 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
594 | &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1712 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
595 | &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1713 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
596 | &decstep(3,"ebp",$s3,$s2,$s1,$s0); | 1714 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
597 | &xor ($s0,&DWP(16*$i+0,$key)); | 1715 | &xor ($s0,&DWP(16*$i+0,$key)); |
598 | &xor ($s1,&DWP(16*$i+4,$key)); | 1716 | &xor ($s1,&DWP(16*$i+4,$key)); |
599 | &xor ($s2,&DWP(16*$i+8,$key)); | 1717 | &xor ($s2,&DWP(16*$i+8,$key)); |
600 | &xor ($s3,&DWP(16*$i+12,$key)); | 1718 | &xor ($s3,&DWP(16*$i+12,$key)); |
601 | } | 1719 | } |
602 | &add ($key,32); | 1720 | &add ($key,32); |
603 | &mov (&DWP(12,"esp"),$key); # advance rd_key | 1721 | &mov ($__key,$key); # advance rd_key |
604 | &set_label("10rounds"); | 1722 | &set_label("10rounds",4); |
605 | for ($i=1;$i<10;$i++) { | 1723 | for ($i=1;$i<10;$i++) { |
606 | &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1724 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
607 | &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1725 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
608 | &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1726 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
609 | &decstep(3,"ebp",$s3,$s2,$s1,$s0); | 1727 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
610 | &xor ($s0,&DWP(16*$i+0,$key)); | 1728 | &xor ($s0,&DWP(16*$i+0,$key)); |
611 | &xor ($s1,&DWP(16*$i+4,$key)); | 1729 | &xor ($s1,&DWP(16*$i+4,$key)); |
612 | &xor ($s2,&DWP(16*$i+8,$key)); | 1730 | &xor ($s2,&DWP(16*$i+8,$key)); |
@@ -614,10 +1732,10 @@ sub declast() | |||
614 | } | 1732 | } |
615 | } | 1733 | } |
616 | 1734 | ||
617 | &declast(0,"ebp",$s0,$s3,$s2,$s1); | 1735 | &declast(0,$tbl,$s0,$s3,$s2,$s1); |
618 | &declast(1,"ebp",$s1,$s0,$s3,$s2); | 1736 | &declast(1,$tbl,$s1,$s0,$s3,$s2); |
619 | &declast(2,"ebp",$s2,$s1,$s0,$s3); | 1737 | &declast(2,$tbl,$s2,$s1,$s0,$s3); |
620 | &declast(3,"ebp",$s3,$s2,$s1,$s0); | 1738 | &declast(3,$tbl,$s3,$s2,$s1,$s0); |
621 | 1739 | ||
622 | &add ($key,$small_footprint?16:160); | 1740 | &add ($key,$small_footprint?16:160); |
623 | &xor ($s0,&DWP(0,$key)); | 1741 | &xor ($s0,&DWP(0,$key)); |
@@ -692,7 +1810,107 @@ sub declast() | |||
692 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); | 1810 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); |
693 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); | 1811 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); |
694 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); | 1812 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); |
695 | #Td4: | 1813 | |
1814 | #Td4: # four copies of Td4 to choose from to avoid L1 aliasing | ||
1815 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
1816 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
1817 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
1818 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
1819 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
1820 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
1821 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
1822 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
1823 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
1824 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
1825 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
1826 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
1827 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
1828 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
1829 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
1830 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
1831 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
1832 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
1833 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
1834 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
1835 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
1836 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
1837 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
1838 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
1839 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
1840 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
1841 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
1842 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
1843 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
1844 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
1845 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
1846 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
1847 | |||
1848 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
1849 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
1850 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
1851 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
1852 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
1853 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
1854 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
1855 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
1856 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
1857 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
1858 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
1859 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
1860 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
1861 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
1862 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
1863 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
1864 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
1865 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
1866 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
1867 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
1868 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
1869 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
1870 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
1871 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
1872 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
1873 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
1874 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
1875 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
1876 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
1877 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
1878 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
1879 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
1880 | |||
1881 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
1882 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
1883 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
1884 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
1885 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
1886 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
1887 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
1888 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
1889 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
1890 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
1891 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
1892 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
1893 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
1894 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
1895 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
1896 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
1897 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
1898 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
1899 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
1900 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
1901 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
1902 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
1903 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
1904 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
1905 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
1906 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
1907 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
1908 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
1909 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
1910 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
1911 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
1912 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
1913 | |||
696 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | 1914 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
697 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | 1915 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
698 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | 1916 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
@@ -728,43 +1946,57 @@ sub declast() | |||
728 | &function_end_B("_x86_AES_decrypt"); | 1946 | &function_end_B("_x86_AES_decrypt"); |
729 | 1947 | ||
730 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); | 1948 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); |
731 | &public_label("AES_Td"); | ||
732 | &function_begin("AES_decrypt"); | 1949 | &function_begin("AES_decrypt"); |
733 | &mov ($acc,&wparam(0)); # load inp | 1950 | &mov ($acc,&wparam(0)); # load inp |
734 | &mov ($key,&wparam(2)); # load key | 1951 | &mov ($key,&wparam(2)); # load key |
735 | 1952 | ||
736 | &mov ($s0,"esp"); | 1953 | &mov ($s0,"esp"); |
737 | &sub ("esp",24); | 1954 | &sub ("esp",36); |
738 | &and ("esp",-64); | 1955 | &and ("esp",-64); # align to cache-line |
739 | &add ("esp",4); | 1956 | |
740 | &mov (&DWP(16,"esp"),$s0); | 1957 | # place stack frame just "above" the key schedule |
1958 | &lea ($s1,&DWP(-64-63,$key)); | ||
1959 | &sub ($s1,"esp"); | ||
1960 | &neg ($s1); | ||
1961 | &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line | ||
1962 | &sub ("esp",$s1); | ||
1963 | &add ("esp",4); # 4 is reserved for caller's return address | ||
1964 | &mov ($_esp,$s0); # save stack pointer | ||
741 | 1965 | ||
742 | &call (&label("pic_point")); # make it PIC! | 1966 | &call (&label("pic_point")); # make it PIC! |
743 | &set_label("pic_point"); | 1967 | &set_label("pic_point"); |
744 | &blindpop("ebp"); | 1968 | &blindpop($tbl); |
745 | &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); | 1969 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only); |
746 | 1970 | &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl)); | |
747 | # prefetch Td4 | 1971 | |
748 | &lea ("ebp",&DWP(2048+128,"ebp")); | 1972 | # pick Td4 copy which can't "overlap" with stack frame or key schedule |
749 | &mov ($s0,&DWP(0-128,"ebp")); | 1973 | &lea ($s1,&DWP(768-4,"esp")); |
750 | &mov ($s1,&DWP(32-128,"ebp")); | 1974 | &sub ($s1,$tbl); |
751 | &mov ($s2,&DWP(64-128,"ebp")); | 1975 | &and ($s1,0x300); |
752 | &mov ($s3,&DWP(96-128,"ebp")); | 1976 | &lea ($tbl,&DWP(2048+128,$tbl,$s1)); |
753 | &mov ($s0,&DWP(128-128,"ebp")); | 1977 | |
754 | &mov ($s1,&DWP(160-128,"ebp")); | 1978 | if (!$x86only) { |
755 | &mov ($s2,&DWP(192-128,"ebp")); | 1979 | &bt (&DWP(0,$s0),25); # check for SSE bit |
756 | &mov ($s3,&DWP(224-128,"ebp")); | 1980 | &jnc (&label("x86")); |
757 | &lea ("ebp",&DWP(-2048-128,"ebp")); | 1981 | |
758 | 1982 | &movq ("mm0",&QWP(0,$acc)); | |
1983 | &movq ("mm4",&QWP(8,$acc)); | ||
1984 | &call ("_sse_AES_decrypt_compact"); | ||
1985 | &mov ("esp",$_esp); # restore stack pointer | ||
1986 | &mov ($acc,&wparam(1)); # load out | ||
1987 | &movq (&QWP(0,$acc),"mm0"); # write output data | ||
1988 | &movq (&QWP(8,$acc),"mm4"); | ||
1989 | &emms (); | ||
1990 | &function_end_A(); | ||
1991 | } | ||
1992 | &set_label("x86",16); | ||
1993 | &mov ($_tbl,$tbl); | ||
759 | &mov ($s0,&DWP(0,$acc)); # load input data | 1994 | &mov ($s0,&DWP(0,$acc)); # load input data |
760 | &mov ($s1,&DWP(4,$acc)); | 1995 | &mov ($s1,&DWP(4,$acc)); |
761 | &mov ($s2,&DWP(8,$acc)); | 1996 | &mov ($s2,&DWP(8,$acc)); |
762 | &mov ($s3,&DWP(12,$acc)); | 1997 | &mov ($s3,&DWP(12,$acc)); |
763 | 1998 | &call ("_x86_AES_decrypt_compact"); | |
764 | &call ("_x86_AES_decrypt"); | 1999 | &mov ("esp",$_esp); # restore stack pointer |
765 | |||
766 | &mov ("esp",&DWP(16,"esp")); | ||
767 | |||
768 | &mov ($acc,&wparam(1)); # load out | 2000 | &mov ($acc,&wparam(1)); # load out |
769 | &mov (&DWP(0,$acc),$s0); # write output data | 2001 | &mov (&DWP(0,$acc),$s0); # write output data |
770 | &mov (&DWP(4,$acc),$s1); | 2002 | &mov (&DWP(4,$acc),$s1); |
@@ -777,126 +2009,136 @@ sub declast() | |||
777 | # unsigned char *ivp,const int enc); | 2009 | # unsigned char *ivp,const int enc); |
778 | { | 2010 | { |
779 | # stack frame layout | 2011 | # stack frame layout |
780 | # -4(%esp) 0(%esp) return address | 2012 | # -4(%esp) # return address 0(%esp) |
781 | # 0(%esp) 4(%esp) tmp1 | 2013 | # 0(%esp) # s0 backing store 4(%esp) |
782 | # 4(%esp) 8(%esp) tmp2 | 2014 | # 4(%esp) # s1 backing store 8(%esp) |
783 | # 8(%esp) 12(%esp) key | 2015 | # 8(%esp) # s2 backing store 12(%esp) |
784 | # 12(%esp) 16(%esp) end of key schedule | 2016 | # 12(%esp) # s3 backing store 16(%esp) |
785 | my $_esp=&DWP(16,"esp"); #saved %esp | 2017 | # 16(%esp) # key backup 20(%esp) |
786 | my $_inp=&DWP(20,"esp"); #copy of wparam(0) | 2018 | # 20(%esp) # end of key schedule 24(%esp) |
787 | my $_out=&DWP(24,"esp"); #copy of wparam(1) | 2019 | # 24(%esp) # %ebp backup 28(%esp) |
788 | my $_len=&DWP(28,"esp"); #copy of wparam(2) | 2020 | # 28(%esp) # %esp backup |
789 | my $_key=&DWP(32,"esp"); #copy of wparam(3) | 2021 | my $_inp=&DWP(32,"esp"); # copy of wparam(0) |
790 | my $_ivp=&DWP(36,"esp"); #copy of wparam(4) | 2022 | my $_out=&DWP(36,"esp"); # copy of wparam(1) |
791 | my $_tmp=&DWP(40,"esp"); #volatile variable | 2023 | my $_len=&DWP(40,"esp"); # copy of wparam(2) |
792 | my $ivec=&DWP(44,"esp"); #ivec[16] | 2024 | my $_key=&DWP(44,"esp"); # copy of wparam(3) |
793 | my $aes_key=&DWP(60,"esp"); #copy of aes_key | 2025 | my $_ivp=&DWP(48,"esp"); # copy of wparam(4) |
794 | my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | 2026 | my $_tmp=&DWP(52,"esp"); # volatile variable |
795 | 2027 | # | |
796 | &public_label("AES_Te"); | 2028 | my $ivec=&DWP(60,"esp"); # ivec[16] |
797 | &public_label("AES_Td"); | 2029 | my $aes_key=&DWP(76,"esp"); # copy of aes_key |
2030 | my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds | ||
2031 | |||
798 | &function_begin("AES_cbc_encrypt"); | 2032 | &function_begin("AES_cbc_encrypt"); |
799 | &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len | 2033 | &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len |
800 | &cmp ($s2,0); | 2034 | &cmp ($s2,0); |
801 | &je (&label("enc_out")); | 2035 | &je (&label("drop_out")); |
802 | 2036 | ||
803 | &call (&label("pic_point")); # make it PIC! | 2037 | &call (&label("pic_point")); # make it PIC! |
804 | &set_label("pic_point"); | 2038 | &set_label("pic_point"); |
805 | &blindpop("ebp"); | 2039 | &blindpop($tbl); |
806 | 2040 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only); | |
807 | &pushf (); | ||
808 | &cld (); | ||
809 | 2041 | ||
810 | &cmp (&wparam(5),0); | 2042 | &cmp (&wparam(5),0); |
811 | &je (&label("DECRYPT")); | 2043 | &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); |
2044 | &jne (&label("picked_te")); | ||
2045 | &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl)); | ||
2046 | &set_label("picked_te"); | ||
812 | 2047 | ||
813 | &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 2048 | # one can argue if this is required |
814 | 2049 | &pushf (); | |
815 | # allocate aligned stack frame... | 2050 | &cld (); |
816 | &lea ($key,&DWP(-64-244,"esp")); | ||
817 | &and ($key,-64); | ||
818 | 2051 | ||
819 | # ... and make sure it doesn't alias with AES_Te modulo 4096 | 2052 | &cmp ($s2,$speed_limit); |
820 | &mov ($s0,"ebp"); | 2053 | &jb (&label("slow_way")); |
821 | &lea ($s1,&DWP(2048,"ebp")); | 2054 | &test ($s2,15); |
822 | &mov ($s3,$key); | 2055 | &jnz (&label("slow_way")); |
2056 | if (!$x86only) { | ||
2057 | &bt (&DWP(0,$s0),28); # check for hyper-threading bit | ||
2058 | &jc (&label("slow_way")); | ||
2059 | } | ||
2060 | # pre-allocate aligned stack frame... | ||
2061 | &lea ($acc,&DWP(-80-244,"esp")); | ||
2062 | &and ($acc,-64); | ||
2063 | |||
2064 | # ... and make sure it doesn't alias with $tbl modulo 4096 | ||
2065 | &mov ($s0,$tbl); | ||
2066 | &lea ($s1,&DWP(2048+256,$tbl)); | ||
2067 | &mov ($s3,$acc); | ||
823 | &and ($s0,0xfff); # s = %ebp&0xfff | 2068 | &and ($s0,0xfff); # s = %ebp&0xfff |
824 | &and ($s1,0xfff); # e = (%ebp+2048)&0xfff | 2069 | &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff |
825 | &and ($s3,0xfff); # p = %esp&0xfff | 2070 | &and ($s3,0xfff); # p = %esp&0xfff |
826 | 2071 | ||
827 | &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e); | 2072 | &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e); |
828 | &jb (&label("te_break_out")); | 2073 | &jb (&label("tbl_break_out")); |
829 | &sub ($s3,$s1); | 2074 | &sub ($s3,$s1); |
830 | &sub ($key,$s3); | 2075 | &sub ($acc,$s3); |
831 | &jmp (&label("te_ok")); | 2076 | &jmp (&label("tbl_ok")); |
832 | &set_label("te_break_out"); # else %esp -= (p-s)&0xfff + framesz; | 2077 | &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz; |
833 | &sub ($s3,$s0); | 2078 | &sub ($s3,$s0); |
834 | &and ($s3,0xfff); | 2079 | &and ($s3,0xfff); |
835 | &add ($s3,64+256); | 2080 | &add ($s3,384); |
836 | &sub ($key,$s3); | 2081 | &sub ($acc,$s3); |
837 | &align (4); | 2082 | &set_label("tbl_ok",4); |
838 | &set_label("te_ok"); | ||
839 | 2083 | ||
840 | &mov ($s0,&wparam(0)); # load inp | 2084 | &lea ($s3,&wparam(0)); # obtain pointer to parameter block |
841 | &mov ($s1,&wparam(1)); # load out | 2085 | &exch ("esp",$acc); # allocate stack frame |
842 | &mov ($s3,&wparam(3)); # load key | ||
843 | &mov ($acc,&wparam(4)); # load ivp | ||
844 | |||
845 | &exch ("esp",$key); | ||
846 | &add ("esp",4); # reserve for return address! | 2086 | &add ("esp",4); # reserve for return address! |
847 | &mov ($_esp,$key); # save %esp | 2087 | &mov ($_tbl,$tbl); # save %ebp |
2088 | &mov ($_esp,$acc); # save %esp | ||
2089 | |||
2090 | &mov ($s0,&DWP(0,$s3)); # load inp | ||
2091 | &mov ($s1,&DWP(4,$s3)); # load out | ||
2092 | #&mov ($s2,&DWP(8,$s3)); # load len | ||
2093 | &mov ($key,&DWP(12,$s3)); # load key | ||
2094 | &mov ($acc,&DWP(16,$s3)); # load ivp | ||
2095 | &mov ($s3,&DWP(20,$s3)); # load enc flag | ||
848 | 2096 | ||
849 | &mov ($_inp,$s0); # save copy of inp | 2097 | &mov ($_inp,$s0); # save copy of inp |
850 | &mov ($_out,$s1); # save copy of out | 2098 | &mov ($_out,$s1); # save copy of out |
851 | &mov ($_len,$s2); # save copy of len | 2099 | &mov ($_len,$s2); # save copy of len |
852 | &mov ($_key,$s3); # save copy of key | 2100 | &mov ($_key,$key); # save copy of key |
853 | &mov ($_ivp,$acc); # save copy of ivp | 2101 | &mov ($_ivp,$acc); # save copy of ivp |
854 | 2102 | ||
855 | &mov ($mark,0); # copy of aes_key->rounds = 0; | 2103 | &mov ($mark,0); # copy of aes_key->rounds = 0; |
856 | if ($compromise) { | ||
857 | &cmp ($s2,$compromise); | ||
858 | &jb (&label("skip_ecopy")); | ||
859 | } | ||
860 | # do we copy key schedule to stack? | 2104 | # do we copy key schedule to stack? |
861 | &mov ($s1 eq "ebx" ? $s1 : "",$s3); | 2105 | &mov ($s1 eq "ebx" ? $s1 : "",$key); |
862 | &mov ($s2 eq "ecx" ? $s2 : "",244/4); | 2106 | &mov ($s2 eq "ecx" ? $s2 : "",244/4); |
863 | &sub ($s1,"ebp"); | 2107 | &sub ($s1,$tbl); |
864 | &mov ("esi",$s3); | 2108 | &mov ("esi",$key); |
865 | &and ($s1,0xfff); | 2109 | &and ($s1,0xfff); |
866 | &lea ("edi",$aes_key); | 2110 | &lea ("edi",$aes_key); |
867 | &cmp ($s1,2048); | 2111 | &cmp ($s1,2048+256); |
868 | &jb (&label("do_ecopy")); | 2112 | &jb (&label("do_copy")); |
869 | &cmp ($s1,4096-244); | 2113 | &cmp ($s1,4096-244); |
870 | &jb (&label("skip_ecopy")); | 2114 | &jb (&label("skip_copy")); |
871 | &align (4); | 2115 | &set_label("do_copy",4); |
872 | &set_label("do_ecopy"); | ||
873 | &mov ($_key,"edi"); | 2116 | &mov ($_key,"edi"); |
874 | &data_word(0xA5F3F689); # rep movsd | 2117 | &data_word(0xA5F3F689); # rep movsd |
875 | &set_label("skip_ecopy"); | 2118 | &set_label("skip_copy"); |
876 | 2119 | ||
877 | &mov ($acc,$s0); | ||
878 | &mov ($key,16); | 2120 | &mov ($key,16); |
879 | &align (4); | 2121 | &set_label("prefetch_tbl",4); |
880 | &set_label("prefetch_te"); | 2122 | &mov ($s0,&DWP(0,$tbl)); |
881 | &mov ($s0,&DWP(0,"ebp")); | 2123 | &mov ($s1,&DWP(32,$tbl)); |
882 | &mov ($s1,&DWP(32,"ebp")); | 2124 | &mov ($s2,&DWP(64,$tbl)); |
883 | &mov ($s2,&DWP(64,"ebp")); | 2125 | &mov ($acc,&DWP(96,$tbl)); |
884 | &mov ($s3,&DWP(96,"ebp")); | 2126 | &lea ($tbl,&DWP(128,$tbl)); |
885 | &lea ("ebp",&DWP(128,"ebp")); | 2127 | &sub ($key,1); |
886 | &dec ($key); | 2128 | &jnz (&label("prefetch_tbl")); |
887 | &jnz (&label("prefetch_te")); | 2129 | &sub ($tbl,2048); |
888 | &sub ("ebp",2048); | 2130 | |
889 | 2131 | &mov ($acc,$_inp); | |
890 | &mov ($s2,$_len); | ||
891 | &mov ($key,$_ivp); | 2132 | &mov ($key,$_ivp); |
892 | &test ($s2,0xFFFFFFF0); | ||
893 | &jz (&label("enc_tail")); # short input... | ||
894 | 2133 | ||
2134 | &cmp ($s3,0); | ||
2135 | &je (&label("fast_decrypt")); | ||
2136 | |||
2137 | #----------------------------- ENCRYPT -----------------------------# | ||
895 | &mov ($s0,&DWP(0,$key)); # load iv | 2138 | &mov ($s0,&DWP(0,$key)); # load iv |
896 | &mov ($s1,&DWP(4,$key)); | 2139 | &mov ($s1,&DWP(4,$key)); |
897 | 2140 | ||
898 | &align (4); | 2141 | &set_label("fast_enc_loop",16); |
899 | &set_label("enc_loop"); | ||
900 | &mov ($s2,&DWP(8,$key)); | 2142 | &mov ($s2,&DWP(8,$key)); |
901 | &mov ($s3,&DWP(12,$key)); | 2143 | &mov ($s3,&DWP(12,$key)); |
902 | 2144 | ||
@@ -916,22 +2158,16 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
916 | &mov (&DWP(8,$key),$s2); | 2158 | &mov (&DWP(8,$key),$s2); |
917 | &mov (&DWP(12,$key),$s3); | 2159 | &mov (&DWP(12,$key),$s3); |
918 | 2160 | ||
2161 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
919 | &mov ($s2,$_len); # load len | 2162 | &mov ($s2,$_len); # load len |
920 | |||
921 | &lea ($acc,&DWP(16,$acc)); | ||
922 | &mov ($_inp,$acc); # save inp | 2163 | &mov ($_inp,$acc); # save inp |
923 | 2164 | &lea ($s3,&DWP(16,$key)); # advance out | |
924 | &lea ($s3,&DWP(16,$key)); | ||
925 | &mov ($_out,$s3); # save out | 2165 | &mov ($_out,$s3); # save out |
926 | 2166 | &sub ($s2,16); # decrease len | |
927 | &sub ($s2,16); | ||
928 | &test ($s2,0xFFFFFFF0); | ||
929 | &mov ($_len,$s2); # save len | 2167 | &mov ($_len,$s2); # save len |
930 | &jnz (&label("enc_loop")); | 2168 | &jnz (&label("fast_enc_loop")); |
931 | &test ($s2,15); | ||
932 | &jnz (&label("enc_tail")); | ||
933 | &mov ($acc,$_ivp); # load ivp | 2169 | &mov ($acc,$_ivp); # load ivp |
934 | &mov ($s2,&DWP(8,$key)); # restore last dwords | 2170 | &mov ($s2,&DWP(8,$key)); # restore last 2 dwords |
935 | &mov ($s3,&DWP(12,$key)); | 2171 | &mov ($s3,&DWP(12,$key)); |
936 | &mov (&DWP(0,$acc),$s0); # save ivec | 2172 | &mov (&DWP(0,$acc),$s0); # save ivec |
937 | &mov (&DWP(4,$acc),$s1); | 2173 | &mov (&DWP(4,$acc),$s1); |
@@ -949,125 +2185,20 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
949 | &set_label("skip_ezero") | 2185 | &set_label("skip_ezero") |
950 | &mov ("esp",$_esp); | 2186 | &mov ("esp",$_esp); |
951 | &popf (); | 2187 | &popf (); |
952 | &set_label("enc_out"); | 2188 | &set_label("drop_out"); |
953 | &function_end_A(); | 2189 | &function_end_A(); |
954 | &pushf (); # kludge, never executed | 2190 | &pushf (); # kludge, never executed |
955 | 2191 | ||
956 | &align (4); | ||
957 | &set_label("enc_tail"); | ||
958 | &mov ($s0,$key eq "edi" ? $key : ""); | ||
959 | &mov ($key,$_out); # load out | ||
960 | &push ($s0); # push ivp | ||
961 | &mov ($s1,16); | ||
962 | &sub ($s1,$s2); | ||
963 | &cmp ($key,$acc); # compare with inp | ||
964 | &je (&label("enc_in_place")); | ||
965 | &align (4); | ||
966 | &data_word(0xA4F3F689); # rep movsb # copy input | ||
967 | &jmp (&label("enc_skip_in_place")); | ||
968 | &set_label("enc_in_place"); | ||
969 | &lea ($key,&DWP(0,$key,$s2)); | ||
970 | &set_label("enc_skip_in_place"); | ||
971 | &mov ($s2,$s1); | ||
972 | &xor ($s0,$s0); | ||
973 | &align (4); | ||
974 | &data_word(0xAAF3F689); # rep stosb # zero tail | ||
975 | &pop ($key); # pop ivp | ||
976 | |||
977 | &mov ($acc,$_out); # output as input | ||
978 | &mov ($s0,&DWP(0,$key)); | ||
979 | &mov ($s1,&DWP(4,$key)); | ||
980 | &mov ($_len,16); # len=16 | ||
981 | &jmp (&label("enc_loop")); # one more spin... | ||
982 | |||
983 | #----------------------------- DECRYPT -----------------------------# | 2192 | #----------------------------- DECRYPT -----------------------------# |
984 | &align (4); | 2193 | &set_label("fast_decrypt",16); |
985 | &set_label("DECRYPT"); | ||
986 | &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); | ||
987 | |||
988 | # allocate aligned stack frame... | ||
989 | &lea ($key,&DWP(-64-244,"esp")); | ||
990 | &and ($key,-64); | ||
991 | |||
992 | # ... and make sure it doesn't alias with AES_Td modulo 4096 | ||
993 | &mov ($s0,"ebp"); | ||
994 | &lea ($s1,&DWP(2048+256,"ebp")); | ||
995 | &mov ($s3,$key); | ||
996 | &and ($s0,0xfff); # s = %ebp&0xfff | ||
997 | &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff | ||
998 | &and ($s3,0xfff); # p = %esp&0xfff | ||
999 | |||
1000 | &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e); | ||
1001 | &jb (&label("td_break_out")); | ||
1002 | &sub ($s3,$s1); | ||
1003 | &sub ($key,$s3); | ||
1004 | &jmp (&label("td_ok")); | ||
1005 | &set_label("td_break_out"); # else %esp -= (p-s)&0xfff + framesz; | ||
1006 | &sub ($s3,$s0); | ||
1007 | &and ($s3,0xfff); | ||
1008 | &add ($s3,64+256); | ||
1009 | &sub ($key,$s3); | ||
1010 | &align (4); | ||
1011 | &set_label("td_ok"); | ||
1012 | |||
1013 | &mov ($s0,&wparam(0)); # load inp | ||
1014 | &mov ($s1,&wparam(1)); # load out | ||
1015 | &mov ($s3,&wparam(3)); # load key | ||
1016 | &mov ($acc,&wparam(4)); # load ivp | ||
1017 | |||
1018 | &exch ("esp",$key); | ||
1019 | &add ("esp",4); # reserve for return address! | ||
1020 | &mov ($_esp,$key); # save %esp | ||
1021 | |||
1022 | &mov ($_inp,$s0); # save copy of inp | ||
1023 | &mov ($_out,$s1); # save copy of out | ||
1024 | &mov ($_len,$s2); # save copy of len | ||
1025 | &mov ($_key,$s3); # save copy of key | ||
1026 | &mov ($_ivp,$acc); # save copy of ivp | ||
1027 | |||
1028 | &mov ($mark,0); # copy of aes_key->rounds = 0; | ||
1029 | if ($compromise) { | ||
1030 | &cmp ($s2,$compromise); | ||
1031 | &jb (&label("skip_dcopy")); | ||
1032 | } | ||
1033 | # do we copy key schedule to stack? | ||
1034 | &mov ($s1 eq "ebx" ? $s1 : "",$s3); | ||
1035 | &mov ($s2 eq "ecx" ? $s2 : "",244/4); | ||
1036 | &sub ($s1,"ebp"); | ||
1037 | &mov ("esi",$s3); | ||
1038 | &and ($s1,0xfff); | ||
1039 | &lea ("edi",$aes_key); | ||
1040 | &cmp ($s1,2048+256); | ||
1041 | &jb (&label("do_dcopy")); | ||
1042 | &cmp ($s1,4096-244); | ||
1043 | &jb (&label("skip_dcopy")); | ||
1044 | &align (4); | ||
1045 | &set_label("do_dcopy"); | ||
1046 | &mov ($_key,"edi"); | ||
1047 | &data_word(0xA5F3F689); # rep movsd | ||
1048 | &set_label("skip_dcopy"); | ||
1049 | |||
1050 | &mov ($acc,$s0); | ||
1051 | &mov ($key,18); | ||
1052 | &align (4); | ||
1053 | &set_label("prefetch_td"); | ||
1054 | &mov ($s0,&DWP(0,"ebp")); | ||
1055 | &mov ($s1,&DWP(32,"ebp")); | ||
1056 | &mov ($s2,&DWP(64,"ebp")); | ||
1057 | &mov ($s3,&DWP(96,"ebp")); | ||
1058 | &lea ("ebp",&DWP(128,"ebp")); | ||
1059 | &dec ($key); | ||
1060 | &jnz (&label("prefetch_td")); | ||
1061 | &sub ("ebp",2048+256); | ||
1062 | 2194 | ||
1063 | &cmp ($acc,$_out); | 2195 | &cmp ($acc,$_out); |
1064 | &je (&label("dec_in_place")); # in-place processing... | 2196 | &je (&label("fast_dec_in_place")); # in-place processing... |
1065 | 2197 | ||
1066 | &mov ($key,$_ivp); # load ivp | ||
1067 | &mov ($_tmp,$key); | 2198 | &mov ($_tmp,$key); |
1068 | 2199 | ||
1069 | &align (4); | 2200 | &align (4); |
1070 | &set_label("dec_loop"); | 2201 | &set_label("fast_dec_loop",16); |
1071 | &mov ($s0,&DWP(0,$acc)); # read input | 2202 | &mov ($s0,&DWP(0,$acc)); # read input |
1072 | &mov ($s1,&DWP(4,$acc)); | 2203 | &mov ($s1,&DWP(4,$acc)); |
1073 | &mov ($s2,&DWP(8,$acc)); | 2204 | &mov ($s2,&DWP(8,$acc)); |
@@ -1083,27 +2214,24 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
1083 | &xor ($s2,&DWP(8,$key)); | 2214 | &xor ($s2,&DWP(8,$key)); |
1084 | &xor ($s3,&DWP(12,$key)); | 2215 | &xor ($s3,&DWP(12,$key)); |
1085 | 2216 | ||
1086 | &sub ($acc,16); | ||
1087 | &jc (&label("dec_partial")); | ||
1088 | &mov ($_len,$acc); # save len | ||
1089 | &mov ($acc,$_inp); # load inp | ||
1090 | &mov ($key,$_out); # load out | 2217 | &mov ($key,$_out); # load out |
2218 | &mov ($acc,$_inp); # load inp | ||
1091 | 2219 | ||
1092 | &mov (&DWP(0,$key),$s0); # write output | 2220 | &mov (&DWP(0,$key),$s0); # write output |
1093 | &mov (&DWP(4,$key),$s1); | 2221 | &mov (&DWP(4,$key),$s1); |
1094 | &mov (&DWP(8,$key),$s2); | 2222 | &mov (&DWP(8,$key),$s2); |
1095 | &mov (&DWP(12,$key),$s3); | 2223 | &mov (&DWP(12,$key),$s3); |
1096 | 2224 | ||
2225 | &mov ($s2,$_len); # load len | ||
1097 | &mov ($_tmp,$acc); # save ivp | 2226 | &mov ($_tmp,$acc); # save ivp |
1098 | &lea ($acc,&DWP(16,$acc)); | 2227 | &lea ($acc,&DWP(16,$acc)); # advance inp |
1099 | &mov ($_inp,$acc); # save inp | 2228 | &mov ($_inp,$acc); # save inp |
1100 | 2229 | &lea ($key,&DWP(16,$key)); # advance out | |
1101 | &lea ($key,&DWP(16,$key)); | ||
1102 | &mov ($_out,$key); # save out | 2230 | &mov ($_out,$key); # save out |
1103 | 2231 | &sub ($s2,16); # decrease len | |
1104 | &jnz (&label("dec_loop")); | 2232 | &mov ($_len,$s2); # save len |
2233 | &jnz (&label("fast_dec_loop")); | ||
1105 | &mov ($key,$_tmp); # load temp ivp | 2234 | &mov ($key,$_tmp); # load temp ivp |
1106 | &set_label("dec_end"); | ||
1107 | &mov ($acc,$_ivp); # load user ivp | 2235 | &mov ($acc,$_ivp); # load user ivp |
1108 | &mov ($s0,&DWP(0,$key)); # load iv | 2236 | &mov ($s0,&DWP(0,$key)); # load iv |
1109 | &mov ($s1,&DWP(4,$key)); | 2237 | &mov ($s1,&DWP(4,$key)); |
@@ -1113,31 +2241,16 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
1113 | &mov (&DWP(4,$acc),$s1); | 2241 | &mov (&DWP(4,$acc),$s1); |
1114 | &mov (&DWP(8,$acc),$s2); | 2242 | &mov (&DWP(8,$acc),$s2); |
1115 | &mov (&DWP(12,$acc),$s3); | 2243 | &mov (&DWP(12,$acc),$s3); |
1116 | &jmp (&label("dec_out")); | 2244 | &jmp (&label("fast_dec_out")); |
1117 | 2245 | ||
1118 | &align (4); | 2246 | &set_label("fast_dec_in_place",16); |
1119 | &set_label("dec_partial"); | 2247 | &set_label("fast_dec_in_place_loop"); |
1120 | &lea ($key,$ivec); | ||
1121 | &mov (&DWP(0,$key),$s0); # dump output to stack | ||
1122 | &mov (&DWP(4,$key),$s1); | ||
1123 | &mov (&DWP(8,$key),$s2); | ||
1124 | &mov (&DWP(12,$key),$s3); | ||
1125 | &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc)); | ||
1126 | &mov ($acc eq "esi" ? $acc : "",$key); | ||
1127 | &mov ($key eq "edi" ? $key : "",$_out); # load out | ||
1128 | &data_word(0xA4F3F689); # rep movsb # copy output | ||
1129 | &mov ($key,$_inp); # use inp as temp ivp | ||
1130 | &jmp (&label("dec_end")); | ||
1131 | |||
1132 | &align (4); | ||
1133 | &set_label("dec_in_place"); | ||
1134 | &set_label("dec_in_place_loop"); | ||
1135 | &lea ($key,$ivec); | ||
1136 | &mov ($s0,&DWP(0,$acc)); # read input | 2248 | &mov ($s0,&DWP(0,$acc)); # read input |
1137 | &mov ($s1,&DWP(4,$acc)); | 2249 | &mov ($s1,&DWP(4,$acc)); |
1138 | &mov ($s2,&DWP(8,$acc)); | 2250 | &mov ($s2,&DWP(8,$acc)); |
1139 | &mov ($s3,&DWP(12,$acc)); | 2251 | &mov ($s3,&DWP(12,$acc)); |
1140 | 2252 | ||
2253 | &lea ($key,$ivec); | ||
1141 | &mov (&DWP(0,$key),$s0); # copy to temp | 2254 | &mov (&DWP(0,$key),$s0); # copy to temp |
1142 | &mov (&DWP(4,$key),$s1); | 2255 | &mov (&DWP(4,$key),$s1); |
1143 | &mov (&DWP(8,$key),$s2); | 2256 | &mov (&DWP(8,$key),$s2); |
@@ -1158,7 +2271,7 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
1158 | &mov (&DWP(8,$acc),$s2); | 2271 | &mov (&DWP(8,$acc),$s2); |
1159 | &mov (&DWP(12,$acc),$s3); | 2272 | &mov (&DWP(12,$acc),$s3); |
1160 | 2273 | ||
1161 | &lea ($acc,&DWP(16,$acc)); | 2274 | &lea ($acc,&DWP(16,$acc)); # advance out |
1162 | &mov ($_out,$acc); # save out | 2275 | &mov ($_out,$acc); # save out |
1163 | 2276 | ||
1164 | &lea ($acc,$ivec); | 2277 | &lea ($acc,$ivec); |
@@ -1173,40 +2286,340 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
1173 | &mov (&DWP(12,$key),$s3); | 2286 | &mov (&DWP(12,$key),$s3); |
1174 | 2287 | ||
1175 | &mov ($acc,$_inp); # load inp | 2288 | &mov ($acc,$_inp); # load inp |
2289 | &mov ($s2,$_len); # load len | ||
2290 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
2291 | &mov ($_inp,$acc); # save inp | ||
2292 | &sub ($s2,16); # decrease len | ||
2293 | &mov ($_len,$s2); # save len | ||
2294 | &jnz (&label("fast_dec_in_place_loop")); | ||
2295 | |||
2296 | &set_label("fast_dec_out",4); | ||
2297 | &cmp ($mark,0); # was the key schedule copied? | ||
2298 | &mov ("edi",$_key); | ||
2299 | &je (&label("skip_dzero")); | ||
2300 | # zero copy of key schedule | ||
2301 | &mov ("ecx",240/4); | ||
2302 | &xor ("eax","eax"); | ||
2303 | &align (4); | ||
2304 | &data_word(0xABF3F689); # rep stosd | ||
2305 | &set_label("skip_dzero") | ||
2306 | &mov ("esp",$_esp); | ||
2307 | &popf (); | ||
2308 | &function_end_A(); | ||
2309 | &pushf (); # kludge, never executed | ||
2310 | |||
2311 | #--------------------------- SLOW ROUTINE ---------------------------# | ||
2312 | &set_label("slow_way",16); | ||
2313 | |||
2314 | &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap | ||
2315 | &mov ($key,&wparam(3)); # load key | ||
2316 | |||
2317 | # pre-allocate aligned stack frame... | ||
2318 | &lea ($acc,&DWP(-80,"esp")); | ||
2319 | &and ($acc,-64); | ||
2320 | |||
2321 | # ... and make sure it doesn't alias with $key modulo 1024 | ||
2322 | &lea ($s1,&DWP(-80-63,$key)); | ||
2323 | &sub ($s1,$acc); | ||
2324 | &neg ($s1); | ||
2325 | &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line | ||
2326 | &sub ($acc,$s1); | ||
2327 | |||
2328 | # pick S-box copy which can't overlap with stack frame or $key | ||
2329 | &lea ($s1,&DWP(768,$acc)); | ||
2330 | &sub ($s1,$tbl); | ||
2331 | &and ($s1,0x300); | ||
2332 | &lea ($tbl,&DWP(2048+128,$tbl,$s1)); | ||
2333 | |||
2334 | &lea ($s3,&wparam(0)); # pointer to parameter block | ||
2335 | |||
2336 | &exch ("esp",$acc); | ||
2337 | &add ("esp",4); # reserve for return address! | ||
2338 | &mov ($_tbl,$tbl); # save %ebp | ||
2339 | &mov ($_esp,$acc); # save %esp | ||
2340 | &mov ($_tmp,$s0); # save OPENSSL_ia32cap | ||
2341 | |||
2342 | &mov ($s0,&DWP(0,$s3)); # load inp | ||
2343 | &mov ($s1,&DWP(4,$s3)); # load out | ||
2344 | #&mov ($s2,&DWP(8,$s3)); # load len | ||
2345 | #&mov ($key,&DWP(12,$s3)); # load key | ||
2346 | &mov ($acc,&DWP(16,$s3)); # load ivp | ||
2347 | &mov ($s3,&DWP(20,$s3)); # load enc flag | ||
2348 | |||
2349 | &mov ($_inp,$s0); # save copy of inp | ||
2350 | &mov ($_out,$s1); # save copy of out | ||
2351 | &mov ($_len,$s2); # save copy of len | ||
2352 | &mov ($_key,$key); # save copy of key | ||
2353 | &mov ($_ivp,$acc); # save copy of ivp | ||
2354 | |||
2355 | &mov ($key,$acc); | ||
2356 | &mov ($acc,$s0); | ||
2357 | |||
2358 | &cmp ($s3,0); | ||
2359 | &je (&label("slow_decrypt")); | ||
2360 | |||
2361 | #--------------------------- SLOW ENCRYPT ---------------------------# | ||
2362 | &cmp ($s2,16); | ||
2363 | &mov ($s3,$s1); | ||
2364 | &jb (&label("slow_enc_tail")); | ||
2365 | |||
2366 | if (!$x86only) { | ||
2367 | &bt ($_tmp,25); # check for SSE bit | ||
2368 | &jnc (&label("slow_enc_x86")); | ||
2369 | |||
2370 | &movq ("mm0",&QWP(0,$key)); # load iv | ||
2371 | &movq ("mm4",&QWP(8,$key)); | ||
1176 | 2372 | ||
1177 | &lea ($acc,&DWP(16,$acc)); | 2373 | &set_label("slow_enc_loop_sse",16); |
2374 | &pxor ("mm0",&QWP(0,$acc)); # xor input data | ||
2375 | &pxor ("mm4",&QWP(8,$acc)); | ||
2376 | |||
2377 | &mov ($key,$_key); | ||
2378 | &call ("_sse_AES_encrypt_compact"); | ||
2379 | |||
2380 | &mov ($acc,$_inp); # load inp | ||
2381 | &mov ($key,$_out); # load out | ||
2382 | &mov ($s2,$_len); # load len | ||
2383 | |||
2384 | &movq (&QWP(0,$key),"mm0"); # save output data | ||
2385 | &movq (&QWP(8,$key),"mm4"); | ||
2386 | |||
2387 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
1178 | &mov ($_inp,$acc); # save inp | 2388 | &mov ($_inp,$acc); # save inp |
2389 | &lea ($s3,&DWP(16,$key)); # advance out | ||
2390 | &mov ($_out,$s3); # save out | ||
2391 | &sub ($s2,16); # decrease len | ||
2392 | &cmp ($s2,16); | ||
2393 | &mov ($_len,$s2); # save len | ||
2394 | &jae (&label("slow_enc_loop_sse")); | ||
2395 | &test ($s2,15); | ||
2396 | &jnz (&label("slow_enc_tail")); | ||
2397 | &mov ($acc,$_ivp); # load ivp | ||
2398 | &movq (&QWP(0,$acc),"mm0"); # save ivec | ||
2399 | &movq (&QWP(8,$acc),"mm4"); | ||
2400 | &emms (); | ||
2401 | &mov ("esp",$_esp); | ||
2402 | &popf (); | ||
2403 | &function_end_A(); | ||
2404 | &pushf (); # kludge, never executed | ||
2405 | } | ||
2406 | &set_label("slow_enc_x86",16); | ||
2407 | &mov ($s0,&DWP(0,$key)); # load iv | ||
2408 | &mov ($s1,&DWP(4,$key)); | ||
2409 | |||
2410 | &set_label("slow_enc_loop_x86",4); | ||
2411 | &mov ($s2,&DWP(8,$key)); | ||
2412 | &mov ($s3,&DWP(12,$key)); | ||
2413 | |||
2414 | &xor ($s0,&DWP(0,$acc)); # xor input data | ||
2415 | &xor ($s1,&DWP(4,$acc)); | ||
2416 | &xor ($s2,&DWP(8,$acc)); | ||
2417 | &xor ($s3,&DWP(12,$acc)); | ||
2418 | |||
2419 | &mov ($key,$_key); # load key | ||
2420 | &call ("_x86_AES_encrypt_compact"); | ||
2421 | |||
2422 | &mov ($acc,$_inp); # load inp | ||
2423 | &mov ($key,$_out); # load out | ||
2424 | |||
2425 | &mov (&DWP(0,$key),$s0); # save output data | ||
2426 | &mov (&DWP(4,$key),$s1); | ||
2427 | &mov (&DWP(8,$key),$s2); | ||
2428 | &mov (&DWP(12,$key),$s3); | ||
1179 | 2429 | ||
1180 | &mov ($s2,$_len); # load len | 2430 | &mov ($s2,$_len); # load len |
1181 | &sub ($s2,16); | 2431 | &lea ($acc,&DWP(16,$acc)); # advance inp |
1182 | &jc (&label("dec_in_place_partial")); | 2432 | &mov ($_inp,$acc); # save inp |
2433 | &lea ($s3,&DWP(16,$key)); # advance out | ||
2434 | &mov ($_out,$s3); # save out | ||
2435 | &sub ($s2,16); # decrease len | ||
2436 | &cmp ($s2,16); | ||
1183 | &mov ($_len,$s2); # save len | 2437 | &mov ($_len,$s2); # save len |
1184 | &jnz (&label("dec_in_place_loop")); | 2438 | &jae (&label("slow_enc_loop_x86")); |
1185 | &jmp (&label("dec_out")); | 2439 | &test ($s2,15); |
1186 | 2440 | &jnz (&label("slow_enc_tail")); | |
1187 | &align (4); | 2441 | &mov ($acc,$_ivp); # load ivp |
1188 | &set_label("dec_in_place_partial"); | 2442 | &mov ($s2,&DWP(8,$key)); # restore last dwords |
1189 | # one can argue if this is actually required... | 2443 | &mov ($s3,&DWP(12,$key)); |
1190 | &mov ($key eq "edi" ? $key : "",$_out); | 2444 | &mov (&DWP(0,$acc),$s0); # save ivec |
1191 | &lea ($acc eq "esi" ? $acc : "",$ivec); | 2445 | &mov (&DWP(4,$acc),$s1); |
2446 | &mov (&DWP(8,$acc),$s2); | ||
2447 | &mov (&DWP(12,$acc),$s3); | ||
2448 | |||
2449 | &mov ("esp",$_esp); | ||
2450 | &popf (); | ||
2451 | &function_end_A(); | ||
2452 | &pushf (); # kludge, never executed | ||
2453 | |||
2454 | &set_label("slow_enc_tail",16); | ||
2455 | &emms () if (!$x86only); | ||
2456 | &mov ($key eq "edi"? $key:"",$s3); # load out to edi | ||
2457 | &mov ($s1,16); | ||
2458 | &sub ($s1,$s2); | ||
2459 | &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp | ||
2460 | &je (&label("enc_in_place")); | ||
2461 | &align (4); | ||
2462 | &data_word(0xA4F3F689); # rep movsb # copy input | ||
2463 | &jmp (&label("enc_skip_in_place")); | ||
2464 | &set_label("enc_in_place"); | ||
1192 | &lea ($key,&DWP(0,$key,$s2)); | 2465 | &lea ($key,&DWP(0,$key,$s2)); |
1193 | &lea ($acc,&DWP(16,$acc,$s2)); | 2466 | &set_label("enc_skip_in_place"); |
1194 | &neg ($s2 eq "ecx" ? $s2 : ""); | 2467 | &mov ($s2,$s1); |
1195 | &data_word(0xA4F3F689); # rep movsb # restore tail | 2468 | &xor ($s0,$s0); |
1196 | 2469 | &align (4); | |
1197 | &align (4); | 2470 | &data_word(0xAAF3F689); # rep stosb # zero tail |
1198 | &set_label("dec_out"); | 2471 | |
1199 | &cmp ($mark,0); # was the key schedule copied? | 2472 | &mov ($key,$_ivp); # restore ivp |
1200 | &mov ("edi",$_key); | 2473 | &mov ($acc,$s3); # output as input |
1201 | &je (&label("skip_dzero")); | 2474 | &mov ($s0,&DWP(0,$key)); |
1202 | # zero copy of key schedule | 2475 | &mov ($s1,&DWP(4,$key)); |
1203 | &mov ("ecx",240/4); | 2476 | &mov ($_len,16); # len=16 |
1204 | &xor ("eax","eax"); | 2477 | &jmp (&label("slow_enc_loop_x86")); # one more spin... |
1205 | &align (4); | 2478 | |
1206 | &data_word(0xABF3F689); # rep stosd | 2479 | #--------------------------- SLOW DECRYPT ---------------------------# |
1207 | &set_label("skip_dzero") | 2480 | &set_label("slow_decrypt",16); |
1208 | &mov ("esp",$_esp); | 2481 | if (!$x86only) { |
1209 | &popf (); | 2482 | &bt ($_tmp,25); # check for SSE bit |
2483 | &jnc (&label("slow_dec_loop_x86")); | ||
2484 | |||
2485 | &set_label("slow_dec_loop_sse",4); | ||
2486 | &movq ("mm0",&QWP(0,$acc)); # read input | ||
2487 | &movq ("mm4",&QWP(8,$acc)); | ||
2488 | |||
2489 | &mov ($key,$_key); | ||
2490 | &call ("_sse_AES_decrypt_compact"); | ||
2491 | |||
2492 | &mov ($acc,$_inp); # load inp | ||
2493 | &lea ($s0,$ivec); | ||
2494 | &mov ($s1,$_out); # load out | ||
2495 | &mov ($s2,$_len); # load len | ||
2496 | &mov ($key,$_ivp); # load ivp | ||
2497 | |||
2498 | &movq ("mm1",&QWP(0,$acc)); # re-read input | ||
2499 | &movq ("mm5",&QWP(8,$acc)); | ||
2500 | |||
2501 | &pxor ("mm0",&QWP(0,$key)); # xor iv | ||
2502 | &pxor ("mm4",&QWP(8,$key)); | ||
2503 | |||
2504 | &movq (&QWP(0,$key),"mm1"); # copy input to iv | ||
2505 | &movq (&QWP(8,$key),"mm5"); | ||
2506 | |||
2507 | &sub ($s2,16); # decrease len | ||
2508 | &jc (&label("slow_dec_partial_sse")); | ||
2509 | |||
2510 | &movq (&QWP(0,$s1),"mm0"); # write output | ||
2511 | &movq (&QWP(8,$s1),"mm4"); | ||
2512 | |||
2513 | &lea ($s1,&DWP(16,$s1)); # advance out | ||
2514 | &mov ($_out,$s1); # save out | ||
2515 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
2516 | &mov ($_inp,$acc); # save inp | ||
2517 | &mov ($_len,$s2); # save len | ||
2518 | &jnz (&label("slow_dec_loop_sse")); | ||
2519 | &emms (); | ||
2520 | &mov ("esp",$_esp); | ||
2521 | &popf (); | ||
2522 | &function_end_A(); | ||
2523 | &pushf (); # kludge, never executed | ||
2524 | |||
2525 | &set_label("slow_dec_partial_sse",16); | ||
2526 | &movq (&QWP(0,$s0),"mm0"); # save output to temp | ||
2527 | &movq (&QWP(8,$s0),"mm4"); | ||
2528 | &emms (); | ||
2529 | |||
2530 | &add ($s2 eq "ecx" ? "ecx":"",16); | ||
2531 | &mov ("edi",$s1); # out | ||
2532 | &mov ("esi",$s0); # temp | ||
2533 | &align (4); | ||
2534 | &data_word(0xA4F3F689); # rep movsb # copy partial output | ||
2535 | |||
2536 | &mov ("esp",$_esp); | ||
2537 | &popf (); | ||
2538 | &function_end_A(); | ||
2539 | &pushf (); # kludge, never executed | ||
2540 | } | ||
2541 | &set_label("slow_dec_loop_x86",16); | ||
2542 | &mov ($s0,&DWP(0,$acc)); # read input | ||
2543 | &mov ($s1,&DWP(4,$acc)); | ||
2544 | &mov ($s2,&DWP(8,$acc)); | ||
2545 | &mov ($s3,&DWP(12,$acc)); | ||
2546 | |||
2547 | &lea ($key,$ivec); | ||
2548 | &mov (&DWP(0,$key),$s0); # copy to temp | ||
2549 | &mov (&DWP(4,$key),$s1); | ||
2550 | &mov (&DWP(8,$key),$s2); | ||
2551 | &mov (&DWP(12,$key),$s3); | ||
2552 | |||
2553 | &mov ($key,$_key); # load key | ||
2554 | &call ("_x86_AES_decrypt_compact"); | ||
2555 | |||
2556 | &mov ($key,$_ivp); # load ivp | ||
2557 | &mov ($acc,$_len); # load len | ||
2558 | &xor ($s0,&DWP(0,$key)); # xor iv | ||
2559 | &xor ($s1,&DWP(4,$key)); | ||
2560 | &xor ($s2,&DWP(8,$key)); | ||
2561 | &xor ($s3,&DWP(12,$key)); | ||
2562 | |||
2563 | &sub ($acc,16); | ||
2564 | &jc (&label("slow_dec_partial_x86")); | ||
2565 | |||
2566 | &mov ($_len,$acc); # save len | ||
2567 | &mov ($acc,$_out); # load out | ||
2568 | |||
2569 | &mov (&DWP(0,$acc),$s0); # write output | ||
2570 | &mov (&DWP(4,$acc),$s1); | ||
2571 | &mov (&DWP(8,$acc),$s2); | ||
2572 | &mov (&DWP(12,$acc),$s3); | ||
2573 | |||
2574 | &lea ($acc,&DWP(16,$acc)); # advance out | ||
2575 | &mov ($_out,$acc); # save out | ||
2576 | |||
2577 | &lea ($acc,$ivec); | ||
2578 | &mov ($s0,&DWP(0,$acc)); # read temp | ||
2579 | &mov ($s1,&DWP(4,$acc)); | ||
2580 | &mov ($s2,&DWP(8,$acc)); | ||
2581 | &mov ($s3,&DWP(12,$acc)); | ||
2582 | |||
2583 | &mov (&DWP(0,$key),$s0); # copy it to iv | ||
2584 | &mov (&DWP(4,$key),$s1); | ||
2585 | &mov (&DWP(8,$key),$s2); | ||
2586 | &mov (&DWP(12,$key),$s3); | ||
2587 | |||
2588 | &mov ($acc,$_inp); # load inp | ||
2589 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
2590 | &mov ($_inp,$acc); # save inp | ||
2591 | &jnz (&label("slow_dec_loop_x86")); | ||
2592 | &mov ("esp",$_esp); | ||
2593 | &popf (); | ||
2594 | &function_end_A(); | ||
2595 | &pushf (); # kludge, never executed | ||
2596 | |||
2597 | &set_label("slow_dec_partial_x86",16); | ||
2598 | &lea ($acc,$ivec); | ||
2599 | &mov (&DWP(0,$acc),$s0); # save output to temp | ||
2600 | &mov (&DWP(4,$acc),$s1); | ||
2601 | &mov (&DWP(8,$acc),$s2); | ||
2602 | &mov (&DWP(12,$acc),$s3); | ||
2603 | |||
2604 | &mov ($acc,$_inp); | ||
2605 | &mov ($s0,&DWP(0,$acc)); # re-read input | ||
2606 | &mov ($s1,&DWP(4,$acc)); | ||
2607 | &mov ($s2,&DWP(8,$acc)); | ||
2608 | &mov ($s3,&DWP(12,$acc)); | ||
2609 | |||
2610 | &mov (&DWP(0,$key),$s0); # copy it to iv | ||
2611 | &mov (&DWP(4,$key),$s1); | ||
2612 | &mov (&DWP(8,$key),$s2); | ||
2613 | &mov (&DWP(12,$key),$s3); | ||
2614 | |||
2615 | &mov ("ecx",$_len); | ||
2616 | &mov ("edi",$_out); | ||
2617 | &lea ("esi",$ivec); | ||
2618 | &align (4); | ||
2619 | &data_word(0xA4F3F689); # rep movsb # copy partial output | ||
2620 | |||
2621 | &mov ("esp",$_esp); | ||
2622 | &popf (); | ||
1210 | &function_end("AES_cbc_encrypt"); | 2623 | &function_end("AES_cbc_encrypt"); |
1211 | } | 2624 | } |
1212 | 2625 | ||
@@ -1215,35 +2628,31 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
1215 | sub enckey() | 2628 | sub enckey() |
1216 | { | 2629 | { |
1217 | &movz ("esi",&LB("edx")); # rk[i]>>0 | 2630 | &movz ("esi",&LB("edx")); # rk[i]>>0 |
1218 | &mov ("ebx",&DWP(2,"ebp","esi",8)); | 2631 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
1219 | &movz ("esi",&HB("edx")); # rk[i]>>8 | 2632 | &movz ("esi",&HB("edx")); # rk[i]>>8 |
1220 | &and ("ebx",0xFF000000); | 2633 | &shl ("ebx",24); |
1221 | &xor ("eax","ebx"); | 2634 | &xor ("eax","ebx"); |
1222 | 2635 | ||
1223 | &mov ("ebx",&DWP(2,"ebp","esi",8)); | 2636 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
1224 | &shr ("edx",16); | 2637 | &shr ("edx",16); |
1225 | &and ("ebx",0x000000FF); | ||
1226 | &movz ("esi",&LB("edx")); # rk[i]>>16 | 2638 | &movz ("esi",&LB("edx")); # rk[i]>>16 |
1227 | &xor ("eax","ebx"); | 2639 | &xor ("eax","ebx"); |
1228 | 2640 | ||
1229 | &mov ("ebx",&DWP(0,"ebp","esi",8)); | 2641 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
1230 | &movz ("esi",&HB("edx")); # rk[i]>>24 | 2642 | &movz ("esi",&HB("edx")); # rk[i]>>24 |
1231 | &and ("ebx",0x0000FF00); | 2643 | &shl ("ebx",8); |
1232 | &xor ("eax","ebx"); | 2644 | &xor ("eax","ebx"); |
1233 | 2645 | ||
1234 | &mov ("ebx",&DWP(0,"ebp","esi",8)); | 2646 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
1235 | &and ("ebx",0x00FF0000); | 2647 | &shl ("ebx",16); |
1236 | &xor ("eax","ebx"); | 2648 | &xor ("eax","ebx"); |
1237 | 2649 | ||
1238 | &xor ("eax",&DWP(2048,"ebp","ecx",4)); # rcon | 2650 | &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon |
1239 | } | 2651 | } |
1240 | 2652 | ||
1241 | # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | 2653 | &function_begin("_x86_AES_set_encrypt_key"); |
1242 | # AES_KEY *key) | 2654 | &mov ("esi",&wparam(1)); # user supplied key |
1243 | &public_label("AES_Te"); | 2655 | &mov ("edi",&wparam(3)); # private key schedule |
1244 | &function_begin("AES_set_encrypt_key", "", "_x86_AES_set_encrypt_key"); | ||
1245 | &mov ("esi",&wparam(0)); # user supplied key | ||
1246 | &mov ("edi",&wparam(2)); # private key schedule | ||
1247 | 2656 | ||
1248 | &test ("esi",-1); | 2657 | &test ("esi",-1); |
1249 | &jz (&label("badpointer")); | 2658 | &jz (&label("badpointer")); |
@@ -1252,10 +2661,21 @@ sub enckey() | |||
1252 | 2661 | ||
1253 | &call (&label("pic_point")); | 2662 | &call (&label("pic_point")); |
1254 | &set_label("pic_point"); | 2663 | &set_label("pic_point"); |
1255 | &blindpop("ebp"); | 2664 | &blindpop($tbl); |
1256 | &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 2665 | &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); |
1257 | 2666 | &lea ($tbl,&DWP(2048+128,$tbl)); | |
1258 | &mov ("ecx",&wparam(1)); # number of bits in key | 2667 | |
2668 | # prefetch Te4 | ||
2669 | &mov ("eax",&DWP(0-128,$tbl)); | ||
2670 | &mov ("ebx",&DWP(32-128,$tbl)); | ||
2671 | &mov ("ecx",&DWP(64-128,$tbl)); | ||
2672 | &mov ("edx",&DWP(96-128,$tbl)); | ||
2673 | &mov ("eax",&DWP(128-128,$tbl)); | ||
2674 | &mov ("ebx",&DWP(160-128,$tbl)); | ||
2675 | &mov ("ecx",&DWP(192-128,$tbl)); | ||
2676 | &mov ("edx",&DWP(224-128,$tbl)); | ||
2677 | |||
2678 | &mov ("ecx",&wparam(2)); # number of bits in key | ||
1259 | &cmp ("ecx",128); | 2679 | &cmp ("ecx",128); |
1260 | &je (&label("10rounds")); | 2680 | &je (&label("10rounds")); |
1261 | &cmp ("ecx",192); | 2681 | &cmp ("ecx",192); |
@@ -1394,24 +2814,23 @@ sub enckey() | |||
1394 | &mov ("edx","eax"); | 2814 | &mov ("edx","eax"); |
1395 | &mov ("eax",&DWP(16,"edi")); # rk[4] | 2815 | &mov ("eax",&DWP(16,"edi")); # rk[4] |
1396 | &movz ("esi",&LB("edx")); # rk[11]>>0 | 2816 | &movz ("esi",&LB("edx")); # rk[11]>>0 |
1397 | &mov ("ebx",&DWP(2,"ebp","esi",8)); | 2817 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
1398 | &movz ("esi",&HB("edx")); # rk[11]>>8 | 2818 | &movz ("esi",&HB("edx")); # rk[11]>>8 |
1399 | &and ("ebx",0x000000FF); | ||
1400 | &xor ("eax","ebx"); | 2819 | &xor ("eax","ebx"); |
1401 | 2820 | ||
1402 | &mov ("ebx",&DWP(0,"ebp","esi",8)); | 2821 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
1403 | &shr ("edx",16); | 2822 | &shr ("edx",16); |
1404 | &and ("ebx",0x0000FF00); | 2823 | &shl ("ebx",8); |
1405 | &movz ("esi",&LB("edx")); # rk[11]>>16 | 2824 | &movz ("esi",&LB("edx")); # rk[11]>>16 |
1406 | &xor ("eax","ebx"); | 2825 | &xor ("eax","ebx"); |
1407 | 2826 | ||
1408 | &mov ("ebx",&DWP(0,"ebp","esi",8)); | 2827 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
1409 | &movz ("esi",&HB("edx")); # rk[11]>>24 | 2828 | &movz ("esi",&HB("edx")); # rk[11]>>24 |
1410 | &and ("ebx",0x00FF0000); | 2829 | &shl ("ebx",16); |
1411 | &xor ("eax","ebx"); | 2830 | &xor ("eax","ebx"); |
1412 | 2831 | ||
1413 | &mov ("ebx",&DWP(2,"ebp","esi",8)); | 2832 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
1414 | &and ("ebx",0xFF000000); | 2833 | &shl ("ebx",24); |
1415 | &xor ("eax","ebx"); | 2834 | &xor ("eax","ebx"); |
1416 | 2835 | ||
1417 | &mov (&DWP(48,"edi"),"eax"); # rk[12] | 2836 | &mov (&DWP(48,"edi"),"eax"); # rk[12] |
@@ -1433,43 +2852,74 @@ sub enckey() | |||
1433 | &set_label("badpointer"); | 2852 | &set_label("badpointer"); |
1434 | &mov ("eax",-1); | 2853 | &mov ("eax",-1); |
1435 | &set_label("exit"); | 2854 | &set_label("exit"); |
1436 | &function_end("AES_set_encrypt_key"); | 2855 | &function_end("_x86_AES_set_encrypt_key"); |
1437 | 2856 | ||
1438 | sub deckey() | 2857 | # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, |
1439 | { my ($i,$ptr,$te,$td) = @_; | 2858 | # AES_KEY *key) |
2859 | &function_begin_B("AES_set_encrypt_key"); | ||
2860 | &call ("_x86_AES_set_encrypt_key"); | ||
2861 | &ret (); | ||
2862 | &function_end_B("AES_set_encrypt_key"); | ||
1440 | 2863 | ||
1441 | &mov ("eax",&DWP($i,$ptr)); | 2864 | sub deckey() |
1442 | &mov ("edx","eax"); | 2865 | { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; |
1443 | &movz ("ebx",&HB("eax")); | 2866 | my $tmp = $tbl; |
1444 | &shr ("edx",16); | 2867 | |
1445 | &and ("eax",0xFF); | 2868 | &mov ($acc,$tp1); |
1446 | &movz ("eax",&BP(2,$te,"eax",8)); | 2869 | &and ($acc,0x80808080); |
1447 | &movz ("ebx",&BP(2,$te,"ebx",8)); | 2870 | &mov ($tmp,$acc); |
1448 | &mov ("eax",&DWP(0,$td,"eax",8)); | 2871 | &shr ($tmp,7); |
1449 | &xor ("eax",&DWP(3,$td,"ebx",8)); | 2872 | &lea ($tp2,&DWP(0,$tp1,$tp1)); |
1450 | &movz ("ebx",&HB("edx")); | 2873 | &sub ($acc,$tmp); |
1451 | &and ("edx",0xFF); | 2874 | &and ($tp2,0xfefefefe); |
1452 | &movz ("edx",&BP(2,$te,"edx",8)); | 2875 | &and ($acc,0x1b1b1b1b); |
1453 | &movz ("ebx",&BP(2,$te,"ebx",8)); | 2876 | &xor ($acc,$tp2); |
1454 | &xor ("eax",&DWP(2,$td,"edx",8)); | 2877 | &mov ($tp2,$acc); |
1455 | &xor ("eax",&DWP(1,$td,"ebx",8)); | 2878 | |
1456 | &mov (&DWP($i,$ptr),"eax"); | 2879 | &and ($acc,0x80808080); |
2880 | &mov ($tmp,$acc); | ||
2881 | &shr ($tmp,7); | ||
2882 | &lea ($tp4,&DWP(0,$tp2,$tp2)); | ||
2883 | &sub ($acc,$tmp); | ||
2884 | &and ($tp4,0xfefefefe); | ||
2885 | &and ($acc,0x1b1b1b1b); | ||
2886 | &xor ($tp2,$tp1); # tp2^tp1 | ||
2887 | &xor ($acc,$tp4); | ||
2888 | &mov ($tp4,$acc); | ||
2889 | |||
2890 | &and ($acc,0x80808080); | ||
2891 | &mov ($tmp,$acc); | ||
2892 | &shr ($tmp,7); | ||
2893 | &lea ($tp8,&DWP(0,$tp4,$tp4)); | ||
2894 | &xor ($tp4,$tp1); # tp4^tp1 | ||
2895 | &sub ($acc,$tmp); | ||
2896 | &and ($tp8,0xfefefefe); | ||
2897 | &and ($acc,0x1b1b1b1b); | ||
2898 | &rotl ($tp1,8); # = ROTATE(tp1,8) | ||
2899 | &xor ($tp8,$acc); | ||
2900 | |||
2901 | &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load | ||
2902 | |||
2903 | &xor ($tp1,$tp2); | ||
2904 | &xor ($tp2,$tp8); | ||
2905 | &xor ($tp1,$tp4); | ||
2906 | &rotl ($tp2,24); | ||
2907 | &xor ($tp4,$tp8); | ||
2908 | &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) | ||
2909 | &rotl ($tp4,16); | ||
2910 | &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24) | ||
2911 | &rotl ($tp8,8); | ||
2912 | &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16) | ||
2913 | &mov ($tp2,$tmp); | ||
2914 | &xor ($tp1,$tp8); # ^= ROTATE(tp8,8) | ||
2915 | |||
2916 | &mov (&DWP(4*$i,$key),$tp1); | ||
1457 | } | 2917 | } |
1458 | 2918 | ||
1459 | # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | 2919 | # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, |
1460 | # AES_KEY *key) | 2920 | # AES_KEY *key) |
1461 | &public_label("AES_Td"); | ||
1462 | &public_label("AES_Te"); | ||
1463 | &function_begin_B("AES_set_decrypt_key"); | 2921 | &function_begin_B("AES_set_decrypt_key"); |
1464 | &mov ("eax",&wparam(0)); | ||
1465 | &mov ("ecx",&wparam(1)); | ||
1466 | &mov ("edx",&wparam(2)); | ||
1467 | &sub ("esp",12); | ||
1468 | &mov (&DWP(0,"esp"),"eax"); | ||
1469 | &mov (&DWP(4,"esp"),"ecx"); | ||
1470 | &mov (&DWP(8,"esp"),"edx"); | ||
1471 | &call ("_x86_AES_set_encrypt_key"); | 2922 | &call ("_x86_AES_set_encrypt_key"); |
1472 | &add ("esp",12); | ||
1473 | &cmp ("eax",0); | 2923 | &cmp ("eax",0); |
1474 | &je (&label("proceed")); | 2924 | &je (&label("proceed")); |
1475 | &ret (); | 2925 | &ret (); |
@@ -1485,8 +2935,7 @@ sub deckey() | |||
1485 | &lea ("ecx",&DWP(0,"","ecx",4)); | 2935 | &lea ("ecx",&DWP(0,"","ecx",4)); |
1486 | &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk | 2936 | &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk |
1487 | 2937 | ||
1488 | &align (4); | 2938 | &set_label("invert",4); # invert order of chunks |
1489 | &set_label("invert"); # invert order of chunks | ||
1490 | &mov ("eax",&DWP(0,"esi")); | 2939 | &mov ("eax",&DWP(0,"esi")); |
1491 | &mov ("ebx",&DWP(4,"esi")); | 2940 | &mov ("ebx",&DWP(4,"esi")); |
1492 | &mov ("ecx",&DWP(0,"edi")); | 2941 | &mov ("ecx",&DWP(0,"edi")); |
@@ -1508,26 +2957,24 @@ sub deckey() | |||
1508 | &cmp ("esi","edi"); | 2957 | &cmp ("esi","edi"); |
1509 | &jne (&label("invert")); | 2958 | &jne (&label("invert")); |
1510 | 2959 | ||
1511 | &call (&label("pic_point")); | 2960 | &mov ($key,&wparam(2)); |
1512 | &set_label("pic_point"); | 2961 | &mov ($acc,&DWP(240,$key)); # pull number of rounds |
1513 | blindpop("ebp"); | 2962 | &lea ($acc,&DWP(-2,$acc,$acc)); |
1514 | &lea ("edi",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); | 2963 | &lea ($acc,&DWP(0,$key,$acc,8)); |
1515 | &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 2964 | &mov (&wparam(2),$acc); |
1516 | 2965 | ||
1517 | &mov ("esi",&wparam(2)); | 2966 | &mov ($s0,&DWP(16,$key)); # modulo-scheduled load |
1518 | &mov ("ecx",&DWP(240,"esi")); # pull number of rounds | 2967 | &set_label("permute",4); # permute the key schedule |
1519 | &dec ("ecx"); | 2968 | &add ($key,16); |
1520 | &align (4); | 2969 | &deckey (0,$key,$s0,$s1,$s2,$s3); |
1521 | &set_label("permute"); # permute the key schedule | 2970 | &deckey (1,$key,$s1,$s2,$s3,$s0); |
1522 | &add ("esi",16); | 2971 | &deckey (2,$key,$s2,$s3,$s0,$s1); |
1523 | &deckey (0,"esi","ebp","edi"); | 2972 | &deckey (3,$key,$s3,$s0,$s1,$s2); |
1524 | &deckey (4,"esi","ebp","edi"); | 2973 | &cmp ($key,&wparam(2)); |
1525 | &deckey (8,"esi","ebp","edi"); | 2974 | &jb (&label("permute")); |
1526 | &deckey (12,"esi","ebp","edi"); | ||
1527 | &dec ("ecx"); | ||
1528 | &jnz (&label("permute")); | ||
1529 | 2975 | ||
1530 | &xor ("eax","eax"); # return success | 2976 | &xor ("eax","eax"); # return success |
1531 | &function_end("AES_set_decrypt_key"); | 2977 | &function_end("AES_set_decrypt_key"); |
2978 | &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
1532 | 2979 | ||
1533 | &asm_finish(); | 2980 | &asm_finish(); |
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl index a545e892ae..53e4ef85fd 100755 --- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl +++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl | |||
@@ -1669,7 +1669,7 @@ AES_cbc_encrypt: | |||
1669 | lea .LAES_Td(%rip),$sbox | 1669 | lea .LAES_Td(%rip),$sbox |
1670 | .Lcbc_picked_te: | 1670 | .Lcbc_picked_te: |
1671 | 1671 | ||
1672 | mov OPENSSL_ia32cap_P(%rip),%r10d | 1672 | mov PIC_GOT(OPENSSL_ia32cap_P),%r10d |
1673 | cmp \$$speed_limit,%rdx | 1673 | cmp \$$speed_limit,%rdx |
1674 | jb .Lcbc_slow_prologue | 1674 | jb .Lcbc_slow_prologue |
1675 | test \$15,%rdx | 1675 | test \$15,%rdx |