summary refs log tree commit diff
path: root/src/lib/libcrypto/aes
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/aes')
-rw-r--r-- src/lib/libcrypto/aes/Makefile.ssl 103
-rw-r--r-- src/lib/libcrypto/aes/aes.h 28
-rw-r--r-- src/lib/libcrypto/aes/aes_cbc.c 82
-rw-r--r-- src/lib/libcrypto/aes/aes_cfb.c 160
-rw-r--r-- src/lib/libcrypto/aes/aes_core.c 209
-rw-r--r-- src/lib/libcrypto/aes/aes_ctr.c 90
-rw-r--r-- src/lib/libcrypto/aes/aes_ofb.c 94
-rw-r--r-- src/lib/libcrypto/aes/asm/aes-586.pl 2403
-rwxr-xr-x src/lib/libcrypto/aes/asm/aes-x86_64.pl 2
9 files changed, 2163 insertions, 1008 deletions
diff --git a/src/lib/libcrypto/aes/Makefile.ssl b/src/lib/libcrypto/aes/Makefile.ssl
deleted file mode 100644
index f353aeb697..0000000000
--- a/src/lib/libcrypto/aes/Makefile.ssl
+++ /dev/null
@@ -1,103 +0,0 @@
1#
2# crypto/aes/Makefile
3#
4
5DIR= aes
6TOP= ../..
7CC= cc
8CPP= $(CC) -E
9INCLUDES=
10CFLAG=-g
11INSTALL_PREFIX=
12OPENSSLDIR= /usr/local/ssl
13INSTALLTOP= /usr/local/ssl
14MAKE= make -f Makefile.ssl
15MAKEDEPPROG= makedepend
16MAKEDEPEND= $(TOP)/util/domd $(TOP) -MD $(MAKEDEPPROG)
17MAKEFILE= Makefile.ssl
18AR= ar r
19
20# CFLAGS= -mpentiumpro $(INCLUDES) $(CFLAG) -O3 -fexpensive-optimizations -funroll-loops -fforce-addr
21CFLAGS= $(INCLUDES) $(CFLAG)
22
23GENERAL=Makefile
24#TEST=aestest.c
25TEST=
26APPS=
27
28LIB=$(TOP)/libcrypto.a
29LIBSRC=aes_core.c aes_misc.c aes_ecb.c aes_cbc.c aes_cfb.c aes_ofb.c aes_ctr.c
30LIBOBJ=aes_core.o aes_misc.o aes_ecb.o aes_cbc.o aes_cfb.o aes_ofb.o aes_ctr.o
31
32SRC= $(LIBSRC)
33
34EXHEADER= aes.h
35HEADER= aes_locl.h $(EXHEADER)
36
37ALL= $(GENERAL) $(SRC) $(HEADER)
38
39top:
40 (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
41
42all: lib
43
44lib: $(LIBOBJ)
45 $(AR) $(LIB) $(LIBOBJ)
46 $(RANLIB) $(LIB) || echo Never mind.
47 @touch lib
48
49$(LIBOBJ): $(LIBSRC)
50
51files:
52 $(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO
53
54links:
55 @sh $(TOP)/util/point.sh Makefile.ssl Makefile
56 @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
57 @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
58 @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
59
60install: installs
61
62installs:
63 @for i in $(EXHEADER) ; \
64 do \
65 (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
66 chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
67 done;
68
69tags:
70 ctags $(SRC)
71
72tests:
73
74lint:
75 lint -DLINT $(INCLUDES) $(SRC)>fluff
76
77depend:
78 $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
79
80dclean:
81 $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
82 mv -f Makefile.new $(MAKEFILE)
83
84clean:
85 rm -f *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
86
87# DO NOT DELETE THIS LINE -- make depend depends on it.
88
89aes_cbc.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h
90aes_cbc.o: ../../include/openssl/opensslconf.h aes_cbc.c aes_locl.h
91aes_cfb.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h
92aes_cfb.o: ../../include/openssl/opensslconf.h aes_cfb.c aes_locl.h
93aes_core.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h
94aes_core.o: ../../include/openssl/opensslconf.h aes_core.c aes_locl.h
95aes_ctr.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h
96aes_ctr.o: ../../include/openssl/opensslconf.h aes_ctr.c aes_locl.h
97aes_ecb.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h
98aes_ecb.o: ../../include/openssl/opensslconf.h aes_ecb.c aes_locl.h
99aes_misc.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h
100aes_misc.o: ../../include/openssl/opensslconf.h
101aes_misc.o: ../../include/openssl/opensslv.h aes_locl.h aes_misc.c
102aes_ofb.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h
103aes_ofb.o: ../../include/openssl/opensslconf.h aes_locl.h aes_ofb.c
diff --git a/src/lib/libcrypto/aes/aes.h b/src/lib/libcrypto/aes/aes.h
index 450f2b4051..d2c99730fe 100644
--- a/src/lib/libcrypto/aes/aes.h
+++ b/src/lib/libcrypto/aes/aes.h
@@ -58,6 +58,8 @@
58#error AES is disabled. 58#error AES is disabled.
59#endif 59#endif
60 60
61#include <stddef.h>
62
61#define AES_ENCRYPT 1 63#define AES_ENCRYPT 1
62#define AES_DECRYPT 0 64#define AES_DECRYPT 0
63 65
@@ -66,10 +68,6 @@
66#define AES_MAXNR 14 68#define AES_MAXNR 14
67#define AES_BLOCK_SIZE 16 69#define AES_BLOCK_SIZE 16
68 70
69#ifdef OPENSSL_FIPS
70#define FIPS_AES_SIZE_T int
71#endif
72
73#ifdef __cplusplus 71#ifdef __cplusplus
74extern "C" { 72extern "C" {
75#endif 73#endif
@@ -100,37 +98,32 @@ void AES_decrypt(const unsigned char *in, unsigned char *out,
100void AES_ecb_encrypt(const unsigned char *in, unsigned char *out, 98void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
101 const AES_KEY *key, const int enc); 99 const AES_KEY *key, const int enc);
102void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, 100void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
103 const unsigned long length, const AES_KEY *key, 101 size_t length, const AES_KEY *key,
104 unsigned char *ivec, const int enc); 102 unsigned char *ivec, const int enc);
105void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, 103void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
106 const unsigned long length, const AES_KEY *key, 104 size_t length, const AES_KEY *key,
107 unsigned char *ivec, int *num, const int enc); 105 unsigned char *ivec, int *num, const int enc);
108void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, 106void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
109 const unsigned long length, const AES_KEY *key, 107 size_t length, const AES_KEY *key,
110 unsigned char *ivec, int *num, const int enc); 108 unsigned char *ivec, int *num, const int enc);
111void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, 109void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
112 const unsigned long length, const AES_KEY *key, 110 size_t length, const AES_KEY *key,
113 unsigned char *ivec, int *num, const int enc); 111 unsigned char *ivec, int *num, const int enc);
114void AES_cfbr_encrypt_block(const unsigned char *in,unsigned char *out,
115 const int nbits,const AES_KEY *key,
116 unsigned char *ivec,const int enc);
117void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, 112void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
118 const unsigned long length, const AES_KEY *key, 113 size_t length, const AES_KEY *key,
119 unsigned char *ivec, int *num); 114 unsigned char *ivec, int *num);
120void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, 115void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
121 const unsigned long length, const AES_KEY *key, 116 size_t length, const AES_KEY *key,
122 unsigned char ivec[AES_BLOCK_SIZE], 117 unsigned char ivec[AES_BLOCK_SIZE],
123 unsigned char ecount_buf[AES_BLOCK_SIZE], 118 unsigned char ecount_buf[AES_BLOCK_SIZE],
124 unsigned int *num); 119 unsigned int *num);
125
126/* For IGE, see also http://www.links.org/files/openssl-ige.pdf */
127/* NB: the IV is _two_ blocks long */ 120/* NB: the IV is _two_ blocks long */
128void AES_ige_encrypt(const unsigned char *in, unsigned char *out, 121void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
129 const unsigned long length, const AES_KEY *key, 122 size_t length, const AES_KEY *key,
130 unsigned char *ivec, const int enc); 123 unsigned char *ivec, const int enc);
131/* NB: the IV is _four_ blocks long */ 124/* NB: the IV is _four_ blocks long */
132void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out, 125void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out,
133 const unsigned long length, const AES_KEY *key, 126 size_t length, const AES_KEY *key,
134 const AES_KEY *key2, const unsigned char *ivec, 127 const AES_KEY *key2, const unsigned char *ivec,
135 const int enc); 128 const int enc);
136 129
@@ -141,6 +134,7 @@ int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
141 unsigned char *out, 134 unsigned char *out,
142 const unsigned char *in, unsigned int inlen); 135 const unsigned char *in, unsigned int inlen);
143 136
137
144#ifdef __cplusplus 138#ifdef __cplusplus
145} 139}
146#endif 140#endif
diff --git a/src/lib/libcrypto/aes/aes_cbc.c b/src/lib/libcrypto/aes/aes_cbc.c
index 373864cd4b..227f75625d 100644
--- a/src/lib/libcrypto/aes/aes_cbc.c
+++ b/src/lib/libcrypto/aes/aes_cbc.c
@@ -49,85 +49,15 @@
49 * 49 *
50 */ 50 */
51 51
52#ifndef AES_DEBUG
53# ifndef NDEBUG
54# define NDEBUG
55# endif
56#endif
57#include <assert.h>
58
59#include <openssl/aes.h> 52#include <openssl/aes.h>
60#include "aes_locl.h" 53#include <openssl/modes.h>
61 54
62#if !defined(OPENSSL_FIPS_AES_ASM)
63void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, 55void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
64 const unsigned long length, const AES_KEY *key, 56 size_t len, const AES_KEY *key,
65 unsigned char *ivec, const int enc) { 57 unsigned char *ivec, const int enc) {
66 58
67 unsigned long n; 59 if (enc)
68 unsigned long len = length; 60 CRYPTO_cbc128_encrypt(in,out,len,key,ivec,(block128_f)AES_encrypt);
69 unsigned char tmp[AES_BLOCK_SIZE]; 61 else
70 const unsigned char *iv = ivec; 62 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,(block128_f)AES_decrypt);
71
72 assert(in && out && key && ivec);
73 assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc));
74
75 if (AES_ENCRYPT == enc) {
76 while (len >= AES_BLOCK_SIZE) {
77 for(n=0; n < AES_BLOCK_SIZE; ++n)
78 out[n] = in[n] ^ iv[n];
79 AES_encrypt(out, out, key);
80 iv = out;
81 len -= AES_BLOCK_SIZE;
82 in += AES_BLOCK_SIZE;
83 out += AES_BLOCK_SIZE;
84 }
85 if (len) {
86 for(n=0; n < len; ++n)
87 out[n] = in[n] ^ iv[n];
88 for(n=len; n < AES_BLOCK_SIZE; ++n)
89 out[n] = iv[n];
90 AES_encrypt(out, out, key);
91 iv = out;
92 }
93 memcpy(ivec,iv,AES_BLOCK_SIZE);
94 } else if (in != out) {
95 while (len >= AES_BLOCK_SIZE) {
96 AES_decrypt(in, out, key);
97 for(n=0; n < AES_BLOCK_SIZE; ++n)
98 out[n] ^= iv[n];
99 iv = in;
100 len -= AES_BLOCK_SIZE;
101 in += AES_BLOCK_SIZE;
102 out += AES_BLOCK_SIZE;
103 }
104 if (len) {
105 AES_decrypt(in,tmp,key);
106 for(n=0; n < len; ++n)
107 out[n] = tmp[n] ^ iv[n];
108 iv = in;
109 }
110 memcpy(ivec,iv,AES_BLOCK_SIZE);
111 } else {
112 while (len >= AES_BLOCK_SIZE) {
113 memcpy(tmp, in, AES_BLOCK_SIZE);
114 AES_decrypt(in, out, key);
115 for(n=0; n < AES_BLOCK_SIZE; ++n)
116 out[n] ^= ivec[n];
117 memcpy(ivec, tmp, AES_BLOCK_SIZE);
118 len -= AES_BLOCK_SIZE;
119 in += AES_BLOCK_SIZE;
120 out += AES_BLOCK_SIZE;
121 }
122 if (len) {
123 memcpy(tmp, in, AES_BLOCK_SIZE);
124 AES_decrypt(tmp, out, key);
125 for(n=0; n < len; ++n)
126 out[n] ^= ivec[n];
127 for(n=len; n < AES_BLOCK_SIZE; ++n)
128 out[n] = tmp[n];
129 memcpy(ivec, tmp, AES_BLOCK_SIZE);
130 }
131 }
132} 63}
133#endif
diff --git a/src/lib/libcrypto/aes/aes_cfb.c b/src/lib/libcrypto/aes/aes_cfb.c
index 49f0411010..0c6d058ce7 100644
--- a/src/lib/libcrypto/aes/aes_cfb.c
+++ b/src/lib/libcrypto/aes/aes_cfb.c
@@ -1,6 +1,6 @@
1/* crypto/aes/aes_cfb.c -*- mode:C; c-file-style: "eay" -*- */ 1/* crypto/aes/aes_cfb.c -*- mode:C; c-file-style: "eay" -*- */
2/* ==================================================================== 2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. 3 * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
4 * 4 *
5 * Redistribution and use in source and binary forms, with or without 5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions 6 * modification, are permitted provided that the following conditions
@@ -48,73 +48,9 @@
48 * ==================================================================== 48 * ====================================================================
49 * 49 *
50 */ 50 */
51/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
52 * All rights reserved.
53 *
54 * This package is an SSL implementation written
55 * by Eric Young (eay@cryptsoft.com).
56 * The implementation was written so as to conform with Netscapes SSL.
57 *
58 * This library is free for commercial and non-commercial use as long as
59 * the following conditions are aheared to. The following conditions
60 * apply to all code found in this distribution, be it the RC4, RSA,
61 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
62 * included with this distribution is covered by the same copyright terms
63 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
64 *
65 * Copyright remains Eric Young's, and as such any Copyright notices in
66 * the code are not to be removed.
67 * If this package is used in a product, Eric Young should be given attribution
68 * as the author of the parts of the library used.
69 * This can be in the form of a textual message at program startup or
70 * in documentation (online or textual) provided with the package.
71 *
72 * Redistribution and use in source and binary forms, with or without
73 * modification, are permitted provided that the following conditions
74 * are met:
75 * 1. Redistributions of source code must retain the copyright
76 * notice, this list of conditions and the following disclaimer.
77 * 2. Redistributions in binary form must reproduce the above copyright
78 * notice, this list of conditions and the following disclaimer in the
79 * documentation and/or other materials provided with the distribution.
80 * 3. All advertising materials mentioning features or use of this software
81 * must display the following acknowledgement:
82 * "This product includes cryptographic software written by
83 * Eric Young (eay@cryptsoft.com)"
84 * The word 'cryptographic' can be left out if the rouines from the library
85 * being used are not cryptographic related :-).
86 * 4. If you include any Windows specific code (or a derivative thereof) from
87 * the apps directory (application code) you must include an acknowledgement:
88 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
89 *
90 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
91 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
92 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
93 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
94 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
95 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
96 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
97 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
98 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
99 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
100 * SUCH DAMAGE.
101 *
102 * The licence and distribution terms for any publically available version or
103 * derivative of this code cannot be changed. i.e. this code cannot simply be
104 * copied and put under another distribution licence
105 * [including the GNU Public Licence.]
106 */
107
108#ifndef AES_DEBUG
109# ifndef NDEBUG
110# define NDEBUG
111# endif
112#endif
113#include <assert.h>
114 51
115#include <openssl/aes.h> 52#include <openssl/aes.h>
116#include "aes_locl.h" 53#include <openssl/modes.h>
117#include "e_os.h"
118 54
119/* The input and output encrypted as though 128bit cfb mode is being 55/* The input and output encrypted as though 128bit cfb mode is being
120 * used. The extra state information to record how much of the 56 * used. The extra state information to record how much of the
@@ -122,104 +58,24 @@
122 */ 58 */
123 59
124void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, 60void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
125 const unsigned long length, const AES_KEY *key, 61 size_t length, const AES_KEY *key,
126 unsigned char *ivec, int *num, const int enc) { 62 unsigned char *ivec, int *num, const int enc) {
127 63
128 unsigned int n; 64 CRYPTO_cfb128_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt);
129 unsigned long l = length;
130 unsigned char c;
131
132 assert(in && out && key && ivec && num);
133
134 n = *num;
135
136 if (enc) {
137 while (l--) {
138 if (n == 0) {
139 AES_encrypt(ivec, ivec, key);
140 }
141 ivec[n] = *(out++) = *(in++) ^ ivec[n];
142 n = (n+1) % AES_BLOCK_SIZE;
143 }
144 } else {
145 while (l--) {
146 if (n == 0) {
147 AES_encrypt(ivec, ivec, key);
148 }
149 c = *(in);
150 *(out++) = *(in++) ^ ivec[n];
151 ivec[n] = c;
152 n = (n+1) % AES_BLOCK_SIZE;
153 }
154 }
155
156 *num=n;
157} 65}
158 66
159/* This expects a single block of size nbits for both in and out. Note that
160 it corrupts any extra bits in the last byte of out */
161void AES_cfbr_encrypt_block(const unsigned char *in,unsigned char *out,
162 const int nbits,const AES_KEY *key,
163 unsigned char *ivec,const int enc)
164 {
165 int n,rem,num;
166 unsigned char ovec[AES_BLOCK_SIZE*2];
167
168 if (nbits<=0 || nbits>128) return;
169
170 /* fill in the first half of the new IV with the current IV */
171 memcpy(ovec,ivec,AES_BLOCK_SIZE);
172 /* construct the new IV */
173 AES_encrypt(ivec,ivec,key);
174 num = (nbits+7)/8;
175 if (enc) /* encrypt the input */
176 for(n=0 ; n < num ; ++n)
177 out[n] = (ovec[AES_BLOCK_SIZE+n] = in[n] ^ ivec[n]);
178 else /* decrypt the input */
179 for(n=0 ; n < num ; ++n)
180 out[n] = (ovec[AES_BLOCK_SIZE+n] = in[n]) ^ ivec[n];
181 /* shift ovec left... */
182 rem = nbits%8;
183 num = nbits/8;
184 if(rem==0)
185 memcpy(ivec,ovec+num,AES_BLOCK_SIZE);
186 else
187 for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
188 ivec[n] = ovec[n+num]<<rem | ovec[n+num+1]>>(8-rem);
189
190 /* it is not necessary to cleanse ovec, since the IV is not secret */
191 }
192
193/* N.B. This expects the input to be packed, MS bit first */ 67/* N.B. This expects the input to be packed, MS bit first */
194void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, 68void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
195 const unsigned long length, const AES_KEY *key, 69 size_t length, const AES_KEY *key,
196 unsigned char *ivec, int *num, const int enc) 70 unsigned char *ivec, int *num, const int enc)
197 { 71 {
198 unsigned int n; 72 CRYPTO_cfb128_1_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt);
199 unsigned char c[1],d[1];
200
201 assert(in && out && key && ivec && num);
202 assert(*num == 0);
203
204 memset(out,0,(length+7)/8);
205 for(n=0 ; n < length ; ++n)
206 {
207 c[0]=(in[n/8]&(1 << (7-n%8))) ? 0x80 : 0;
208 AES_cfbr_encrypt_block(c,d,1,key,ivec,enc);
209 out[n/8]=(out[n/8]&~(1 << (7-n%8)))|((d[0]&0x80) >> (n%8));
210 }
211 } 73 }
212 74
213void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, 75void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
214 const unsigned long length, const AES_KEY *key, 76 size_t length, const AES_KEY *key,
215 unsigned char *ivec, int *num, const int enc) 77 unsigned char *ivec, int *num, const int enc)
216 { 78 {
217 unsigned int n; 79 CRYPTO_cfb128_8_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt);
218
219 assert(in && out && key && ivec && num);
220 assert(*num == 0);
221
222 for(n=0 ; n < length ; ++n)
223 AES_cfbr_encrypt_block(&in[n],&out[n],8,key,ivec,enc);
224 } 80 }
225 81
diff --git a/src/lib/libcrypto/aes/aes_core.c b/src/lib/libcrypto/aes/aes_core.c
index cffdd4daec..a7ec54f4da 100644
--- a/src/lib/libcrypto/aes/aes_core.c
+++ b/src/lib/libcrypto/aes/aes_core.c
@@ -37,12 +37,9 @@
37 37
38#include <stdlib.h> 38#include <stdlib.h>
39#include <openssl/aes.h> 39#include <openssl/aes.h>
40#ifdef OPENSSL_FIPS
41#include <openssl/fips.h>
42#endif
43
44#include "aes_locl.h" 40#include "aes_locl.h"
45 41
42#ifndef AES_ASM
46/* 43/*
47Te0[x] = S [x].[02, 01, 01, 03]; 44Te0[x] = S [x].[02, 01, 01, 03];
48Te1[x] = S [x].[03, 02, 01, 01]; 45Te1[x] = S [x].[03, 02, 01, 01];
@@ -635,10 +632,6 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
635 int i = 0; 632 int i = 0;
636 u32 temp; 633 u32 temp;
637 634
638#ifdef OPENSSL_FIPS
639 FIPS_selftest_check();
640#endif
641
642 if (!userKey || !key) 635 if (!userKey || !key)
643 return -1; 636 return -1;
644 if (bits != 128 && bits != 192 && bits != 256) 637 if (bits != 128 && bits != 192 && bits != 256)
@@ -781,7 +774,6 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
781 return 0; 774 return 0;
782} 775}
783 776
784#ifndef AES_ASM
785/* 777/*
786 * Encrypt a single block 778 * Encrypt a single block
787 * in and out can overlap 779 * in and out can overlap
@@ -1164,4 +1156,203 @@ void AES_decrypt(const unsigned char *in, unsigned char *out,
1164 PUTU32(out + 12, s3); 1156 PUTU32(out + 12, s3);
1165} 1157}
1166 1158
1159#else /* AES_ASM */
1160
1161static const u8 Te4[256] = {
1162 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
1163 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
1164 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
1165 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
1166 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
1167 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
1168 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
1169 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
1170 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
1171 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
1172 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
1173 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
1174 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
1175 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
1176 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
1177 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
1178 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
1179 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
1180 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
1181 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
1182 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
1183 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
1184 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
1185 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
1186 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
1187 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
1188 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
1189 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
1190 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
1191 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
1192 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
1193 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
1194};
1195static const u32 rcon[] = {
1196 0x01000000, 0x02000000, 0x04000000, 0x08000000,
1197 0x10000000, 0x20000000, 0x40000000, 0x80000000,
1198 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
1199};
1200
1201/**
1202 * Expand the cipher key into the encryption key schedule.
1203 */
1204int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
1205 AES_KEY *key) {
1206 u32 *rk;
1207 int i = 0;
1208 u32 temp;
1209
1210 if (!userKey || !key)
1211 return -1;
1212 if (bits != 128 && bits != 192 && bits != 256)
1213 return -2;
1214
1215 rk = key->rd_key;
1216
1217 if (bits==128)
1218 key->rounds = 10;
1219 else if (bits==192)
1220 key->rounds = 12;
1221 else
1222 key->rounds = 14;
1223
1224 rk[0] = GETU32(userKey );
1225 rk[1] = GETU32(userKey + 4);
1226 rk[2] = GETU32(userKey + 8);
1227 rk[3] = GETU32(userKey + 12);
1228 if (bits == 128) {
1229 while (1) {
1230 temp = rk[3];
1231 rk[4] = rk[0] ^
1232 (Te4[(temp >> 16) & 0xff] << 24) ^
1233 (Te4[(temp >> 8) & 0xff] << 16) ^
1234 (Te4[(temp ) & 0xff] << 8) ^
1235 (Te4[(temp >> 24) ]) ^
1236 rcon[i];
1237 rk[5] = rk[1] ^ rk[4];
1238 rk[6] = rk[2] ^ rk[5];
1239 rk[7] = rk[3] ^ rk[6];
1240 if (++i == 10) {
1241 return 0;
1242 }
1243 rk += 4;
1244 }
1245 }
1246 rk[4] = GETU32(userKey + 16);
1247 rk[5] = GETU32(userKey + 20);
1248 if (bits == 192) {
1249 while (1) {
1250 temp = rk[ 5];
1251 rk[ 6] = rk[ 0] ^
1252 (Te4[(temp >> 16) & 0xff] << 24) ^
1253 (Te4[(temp >> 8) & 0xff] << 16) ^
1254 (Te4[(temp ) & 0xff] << 8) ^
1255 (Te4[(temp >> 24) ]) ^
1256 rcon[i];
1257 rk[ 7] = rk[ 1] ^ rk[ 6];
1258 rk[ 8] = rk[ 2] ^ rk[ 7];
1259 rk[ 9] = rk[ 3] ^ rk[ 8];
1260 if (++i == 8) {
1261 return 0;
1262 }
1263 rk[10] = rk[ 4] ^ rk[ 9];
1264 rk[11] = rk[ 5] ^ rk[10];
1265 rk += 6;
1266 }
1267 }
1268 rk[6] = GETU32(userKey + 24);
1269 rk[7] = GETU32(userKey + 28);
1270 if (bits == 256) {
1271 while (1) {
1272 temp = rk[ 7];
1273 rk[ 8] = rk[ 0] ^
1274 (Te4[(temp >> 16) & 0xff] << 24) ^
1275 (Te4[(temp >> 8) & 0xff] << 16) ^
1276 (Te4[(temp ) & 0xff] << 8) ^
1277 (Te4[(temp >> 24) ]) ^
1278 rcon[i];
1279 rk[ 9] = rk[ 1] ^ rk[ 8];
1280 rk[10] = rk[ 2] ^ rk[ 9];
1281 rk[11] = rk[ 3] ^ rk[10];
1282 if (++i == 7) {
1283 return 0;
1284 }
1285 temp = rk[11];
1286 rk[12] = rk[ 4] ^
1287 (Te4[(temp >> 24) ] << 24) ^
1288 (Te4[(temp >> 16) & 0xff] << 16) ^
1289 (Te4[(temp >> 8) & 0xff] << 8) ^
1290 (Te4[(temp ) & 0xff]);
1291 rk[13] = rk[ 5] ^ rk[12];
1292 rk[14] = rk[ 6] ^ rk[13];
1293 rk[15] = rk[ 7] ^ rk[14];
1294
1295 rk += 8;
1296 }
1297 }
1298 return 0;
1299}
1300
1301/**
1302 * Expand the cipher key into the decryption key schedule.
1303 */
1304int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
1305 AES_KEY *key) {
1306
1307 u32 *rk;
1308 int i, j, status;
1309 u32 temp;
1310
1311 /* first, start with an encryption schedule */
1312 status = AES_set_encrypt_key(userKey, bits, key);
1313 if (status < 0)
1314 return status;
1315
1316 rk = key->rd_key;
1317
1318 /* invert the order of the round keys: */
1319 for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
1320 temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
1321 temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
1322 temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
1323 temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
1324 }
1325 /* apply the inverse MixColumn transform to all round keys but the first and the last: */
1326 for (i = 1; i < (key->rounds); i++) {
1327 rk += 4;
1328 for (j = 0; j < 4; j++) {
1329 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
1330
1331 tp1 = rk[j];
1332 m = tp1 & 0x80808080;
1333 tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
1334 ((m - (m >> 7)) & 0x1b1b1b1b);
1335 m = tp2 & 0x80808080;
1336 tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
1337 ((m - (m >> 7)) & 0x1b1b1b1b);
1338 m = tp4 & 0x80808080;
1339 tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
1340 ((m - (m >> 7)) & 0x1b1b1b1b);
1341 tp9 = tp8 ^ tp1;
1342 tpb = tp9 ^ tp2;
1343 tpd = tp9 ^ tp4;
1344 tpe = tp8 ^ tp4 ^ tp2;
1345#if defined(ROTATE)
1346 rk[j] = tpe ^ ROTATE(tpd,16) ^
1347 ROTATE(tp9,24) ^ ROTATE(tpb,8);
1348#else
1349 rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
1350 (tp9 >> 8) ^ (tp9 << 24) ^
1351 (tpb >> 24) ^ (tpb << 8);
1352#endif
1353 }
1354 }
1355 return 0;
1356}
1357
1167#endif /* AES_ASM */ 1358#endif /* AES_ASM */
diff --git a/src/lib/libcrypto/aes/aes_ctr.c b/src/lib/libcrypto/aes/aes_ctr.c
index f36982be1e..7c9d165d8a 100644
--- a/src/lib/libcrypto/aes/aes_ctr.c
+++ b/src/lib/libcrypto/aes/aes_ctr.c
@@ -49,91 +49,13 @@
49 * 49 *
50 */ 50 */
51 51
52#ifndef AES_DEBUG
53# ifndef NDEBUG
54# define NDEBUG
55# endif
56#endif
57#include <assert.h>
58
59#include <openssl/aes.h> 52#include <openssl/aes.h>
60#include "aes_locl.h" 53#include <openssl/modes.h>
61
62/* NOTE: the IV/counter CTR mode is big-endian. The rest of the AES code
63 * is endian-neutral. */
64
65/* increment counter (128-bit int) by 1 */
66static void AES_ctr128_inc(unsigned char *counter) {
67 unsigned long c;
68
69 /* Grab bottom dword of counter and increment */
70 c = GETU32(counter + 12);
71 c++; c &= 0xFFFFFFFF;
72 PUTU32(counter + 12, c);
73
74 /* if no overflow, we're done */
75 if (c)
76 return;
77
78 /* Grab 1st dword of counter and increment */
79 c = GETU32(counter + 8);
80 c++; c &= 0xFFFFFFFF;
81 PUTU32(counter + 8, c);
82
83 /* if no overflow, we're done */
84 if (c)
85 return;
86
87 /* Grab 2nd dword of counter and increment */
88 c = GETU32(counter + 4);
89 c++; c &= 0xFFFFFFFF;
90 PUTU32(counter + 4, c);
91
92 /* if no overflow, we're done */
93 if (c)
94 return;
95 54
96 /* Grab top dword of counter and increment */
97 c = GETU32(counter + 0);
98 c++; c &= 0xFFFFFFFF;
99 PUTU32(counter + 0, c);
100}
101
102/* The input encrypted as though 128bit counter mode is being
103 * used. The extra state information to record how much of the
104 * 128bit block we have used is contained in *num, and the
105 * encrypted counter is kept in ecount_buf. Both *num and
106 * ecount_buf must be initialised with zeros before the first
107 * call to AES_ctr128_encrypt().
108 *
109 * This algorithm assumes that the counter is in the x lower bits
110 * of the IV (ivec), and that the application has full control over
111 * overflow and the rest of the IV. This implementation takes NO
112 * responsability for checking that the counter doesn't overflow
113 * into the rest of the IV when incremented.
114 */
115void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, 55void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
116 const unsigned long length, const AES_KEY *key, 56 size_t length, const AES_KEY *key,
117 unsigned char ivec[AES_BLOCK_SIZE], 57 unsigned char ivec[AES_BLOCK_SIZE],
118 unsigned char ecount_buf[AES_BLOCK_SIZE], 58 unsigned char ecount_buf[AES_BLOCK_SIZE],
119 unsigned int *num) { 59 unsigned int *num) {
120 60 CRYPTO_ctr128_encrypt(in,out,length,key,ivec,ecount_buf,num,(block128_f)AES_encrypt);
121 unsigned int n;
122 unsigned long l=length;
123
124 assert(in && out && key && counter && num);
125 assert(*num < AES_BLOCK_SIZE);
126
127 n = *num;
128
129 while (l--) {
130 if (n == 0) {
131 AES_encrypt(ivec, ecount_buf, key);
132 AES_ctr128_inc(ivec);
133 }
134 *(out++) = *(in++) ^ ecount_buf[n];
135 n = (n+1) % AES_BLOCK_SIZE;
136 }
137
138 *num=n;
139} 61}
diff --git a/src/lib/libcrypto/aes/aes_ofb.c b/src/lib/libcrypto/aes/aes_ofb.c
index f358bb39e2..50bf0b8325 100644
--- a/src/lib/libcrypto/aes/aes_ofb.c
+++ b/src/lib/libcrypto/aes/aes_ofb.c
@@ -1,6 +1,6 @@
1/* crypto/aes/aes_ofb.c -*- mode:C; c-file-style: "eay" -*- */ 1/* crypto/aes/aes_ofb.c -*- mode:C; c-file-style: "eay" -*- */
2/* ==================================================================== 2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. 3 * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
4 * 4 *
5 * Redistribution and use in source and binary forms, with or without 5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions 6 * modification, are permitted provided that the following conditions
@@ -48,95 +48,13 @@
48 * ==================================================================== 48 * ====================================================================
49 * 49 *
50 */ 50 */
51/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
52 * All rights reserved.
53 *
54 * This package is an SSL implementation written
55 * by Eric Young (eay@cryptsoft.com).
56 * The implementation was written so as to conform with Netscapes SSL.
57 *
58 * This library is free for commercial and non-commercial use as long as
59 * the following conditions are aheared to. The following conditions
60 * apply to all code found in this distribution, be it the RC4, RSA,
61 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
62 * included with this distribution is covered by the same copyright terms
63 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
64 *
65 * Copyright remains Eric Young's, and as such any Copyright notices in
66 * the code are not to be removed.
67 * If this package is used in a product, Eric Young should be given attribution
68 * as the author of the parts of the library used.
69 * This can be in the form of a textual message at program startup or
70 * in documentation (online or textual) provided with the package.
71 *
72 * Redistribution and use in source and binary forms, with or without
73 * modification, are permitted provided that the following conditions
74 * are met:
75 * 1. Redistributions of source code must retain the copyright
76 * notice, this list of conditions and the following disclaimer.
77 * 2. Redistributions in binary form must reproduce the above copyright
78 * notice, this list of conditions and the following disclaimer in the
79 * documentation and/or other materials provided with the distribution.
80 * 3. All advertising materials mentioning features or use of this software
81 * must display the following acknowledgement:
82 * "This product includes cryptographic software written by
83 * Eric Young (eay@cryptsoft.com)"
84 * The word 'cryptographic' can be left out if the rouines from the library
85 * being used are not cryptographic related :-).
86 * 4. If you include any Windows specific code (or a derivative thereof) from
87 * the apps directory (application code) you must include an acknowledgement:
88 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
89 *
90 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
91 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
92 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
93 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
94 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
95 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
96 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
97 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
98 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
99 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
100 * SUCH DAMAGE.
101 *
102 * The licence and distribution terms for any publically available version or
103 * derivative of this code cannot be changed. i.e. this code cannot simply be
104 * copied and put under another distribution licence
105 * [including the GNU Public Licence.]
106 */
107
108#ifndef AES_DEBUG
109# ifndef NDEBUG
110# define NDEBUG
111# endif
112#endif
113#include <assert.h>
114 51
115#include <openssl/aes.h> 52#include <openssl/aes.h>
116#include "aes_locl.h" 53#include <openssl/modes.h>
117 54
118/* The input and output encrypted as though 128bit ofb mode is being
119 * used. The extra state information to record how much of the
120 * 128bit block we have used is contained in *num;
121 */
122void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, 55void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
123 const unsigned long length, const AES_KEY *key, 56 size_t length, const AES_KEY *key,
124 unsigned char *ivec, int *num) { 57 unsigned char *ivec, int *num)
125 58{
126 unsigned int n; 59 CRYPTO_ofb128_encrypt(in,out,length,key,ivec,num,(block128_f)AES_encrypt);
127 unsigned long l=length;
128
129 assert(in && out && key && ivec && num);
130
131 n = *num;
132
133 while (l--) {
134 if (n == 0) {
135 AES_encrypt(ivec, ivec, key);
136 }
137 *(out++) = *(in++) ^ ivec[n];
138 n = (n+1) % AES_BLOCK_SIZE;
139 }
140
141 *num=n;
142} 60}
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl
index e771e83953..aab40e6f1c 100644
--- a/src/lib/libcrypto/aes/asm/aes-586.pl
+++ b/src/lib/libcrypto/aes/asm/aes-586.pl
@@ -2,11 +2,12 @@
2# 2#
3# ==================================================================== 3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary 5# project. The module is, however, dual licensed under OpenSSL and
6# forms are granted according to the OpenSSL license. 6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
7# ==================================================================== 8# ====================================================================
8# 9#
9# Version 3.6. 10# Version 4.3.
10# 11#
11# You might fail to appreciate this module performance from the first 12# You might fail to appreciate this module performance from the first
12# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered 13# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
@@ -81,11 +82,117 @@
81# AMD K8 20 19 82# AMD K8 20 19
82# PIII 25 23 83# PIII 25 23
83# Pentium 81 78 84# Pentium 81 78
84 85#
85push(@INC,"perlasm","../../perlasm"); 86# Version 3.7 reimplements outer rounds as "compact." Meaning that
87# first and last rounds reference compact 256 bytes S-box. This means
88# that first round consumes a lot more CPU cycles and that encrypt
89# and decrypt performance becomes asymmetric. Encrypt performance
90# drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
91# aggressively pre-fetched.
92#
93# Version 4.0 effectively rolls back to 3.6 and instead implements
94# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
95# which use exclusively 256 byte S-box. These functions are to be
96# called in modes not concealing plain text, such as ECB, or when
97# we're asked to process smaller amount of data [or unconditionally
98# on hyper-threading CPU]. Currently it's called unconditionally from
99# AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
100# still needs to be modified to switch between slower and faster
101# mode when appropriate... But in either case benchmark landscape
102# changes dramatically and below numbers are CPU cycles per processed
103# byte for 128-bit key.
104#
105# ECB encrypt ECB decrypt CBC large chunk
106# P4 56[60] 84[100] 23
107# AMD K8 48[44] 70[79] 18
108# PIII 41[50] 61[91] 24
109# Core 2 32[38] 45[70] 18.5
110# Pentium 120 160 77
111#
112# Version 4.1 switches to compact S-box even in key schedule setup.
113#
114# Version 4.2 prefetches compact S-box in every SSE round or in other
115# words every cache-line is *guaranteed* to be accessed within ~50
116# cycles window. Why just SSE? Because it's needed on hyper-threading
117# CPU! Which is also why it's prefetched with 64 byte stride. Best
118# part is that it has no negative effect on performance:-)
119#
120# Version 4.3 implements switch between compact and non-compact block
121# functions in AES_cbc_encrypt depending on how much data was asked
122# to be processed in one stroke.
123#
124######################################################################
125# Timing attacks are classified in two classes: synchronous when
126# attacker consciously initiates cryptographic operation and collects
127# timing data of various character afterwards, and asynchronous when
128# malicious code is executed on same CPU simultaneously with AES,
129# instruments itself and performs statistical analysis of this data.
130#
131# As far as synchronous attacks go the root to the AES timing
132# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
133# are referred to in single 128-bit block operation. Well, in C
134# implementation with 4 distinct tables it's actually as little as 40
135# references per 256 elements table, but anyway... Secondly, even
136# though S-box elements are clustered into smaller amount of cache-
137# lines, smaller than 160 and even 40, it turned out that for certain
138# plain-text pattern[s] or simply put chosen plain-text and given key
139# few cache-lines remain unaccessed during block operation. Now, if
140# attacker can figure out this access pattern, he can deduce the key
141# [or at least part of it]. The natural way to mitigate this kind of
142# attacks is to minimize the amount of cache-lines in S-box and/or
143# prefetch them to ensure that every one is accessed for more uniform
144# timing. But note that *if* plain-text was concealed in such way that
145# input to block function is distributed *uniformly*, then attack
146# wouldn't apply. Now note that some encryption modes, most notably
147# CBC, do mask the plain-text in this exact way [secure cipher output
148# is distributed uniformly]. Yes, one still might find input that
149# would reveal the information about given key, but if amount of
150# candidate inputs to be tried is larger than amount of possible key
151# combinations then attack becomes infeasible. This is why revised
152# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
153# of data is to be processed in one stroke. The current size limit of
154# 512 bytes is chosen to provide same [diminishingly low] probability
155# for cache-line to remain untouched in large chunk operation with
156# large S-box as for single block operation with compact S-box and
157# surely needs more careful consideration...
158#
159# As for asynchronous attacks. There are two flavours: attacker code
160# being interleaved with AES on hyper-threading CPU at *instruction*
161# level, and two processes time sharing single core. As for latter.
162# Two vectors. 1. Given that attacker process has higher priority,
163# yield execution to process performing AES just before timer fires
164# off the scheduler, immediately regain control of CPU and analyze the
165# cache state. For this attack to be efficient attacker would have to
166# effectively slow down the operation by several *orders* of magnitude,
167# by ratio of time slice to duration of handful of AES rounds, which is
168# unlikely to remain unnoticed. Not to mention that this also means
169# that he would spend correspondingly more time to collect enough
170# statistical data to mount the attack. It's probably appropriate to
171# say that if adversary reckons that this attack is beneficial and
172# risks to be noticed, you probably have larger problems having given
173# him the mere opportunity. In other words suggested code design expects you
174# to preclude/mitigate this attack by overall system security design.
175# 2. Attacker manages to make his code interrupt driven. In order for
176# this kind of attack to be feasible, interrupt rate has to be high
177# enough, again comparable to duration of handful of AES rounds. But
178# is there interrupt source of such rate? Hardly, not even 1Gbps NIC
179# generates interrupts at such raging rate...
180#
181# And now back to the former, hyper-threading CPU or more specifically
182# Intel P4. Recall that asynchronous attack implies that malicious
183# code instruments itself. And naturally instrumentation granularity
184# has to be noticeably lower than duration of codepath accessing S-box.
185# Given that all cache-lines are accessed during that time that is.
186# Current implementation accesses *all* cache-lines within ~50 cycles
187# window, which is actually *less* than RDTSC latency on Intel P4!
188
189$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
190push(@INC,"${dir}","${dir}../../perlasm");
86require "x86asm.pl"; 191require "x86asm.pl";
87 192
88&asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386"); 193&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
194&static_label("AES_Te");
195&static_label("AES_Td");
89 196
90$s0="eax"; 197$s0="eax";
91$s1="ebx"; 198$s1="ebx";
@@ -93,21 +200,36 @@ $s2="ecx";
93$s3="edx"; 200$s3="edx";
94$key="edi"; 201$key="edi";
95$acc="esi"; 202$acc="esi";
203$tbl="ebp";
204
205# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
206# by caller
207$__ra=&DWP(0,"esp"); # return address
208$__s0=&DWP(4,"esp"); # s0 backing store
209$__s1=&DWP(8,"esp"); # s1 backing store
210$__s2=&DWP(12,"esp"); # s2 backing store
211$__s3=&DWP(16,"esp"); # s3 backing store
212$__key=&DWP(20,"esp"); # pointer to key schedule
213$__end=&DWP(24,"esp"); # pointer to end of key schedule
214$__tbl=&DWP(28,"esp"); # %ebp backing store
215
216# stack frame layout in AES_[en|de]crypt routines, which differs from
217# above by 4 and overlaps by %ebp backing store
218$_tbl=&DWP(24,"esp");
219$_esp=&DWP(28,"esp");
96 220
97$compromise=0; # $compromise=128 abstains from copying key 221sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
98 # schedule to stack when encrypting inputs 222
99 # shorter than 128 bytes at the cost of 223$speed_limit=512; # chunks smaller than $speed_limit are
100 # risksing aliasing with S-boxes. In return 224 # processed with compact routine in CBC mode
101 # you get way better, up to +70%, small block
102 # performance.
103$small_footprint=1; # $small_footprint=1 code is ~5% slower [on 225$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
104 # recent µ-archs], but ~5 times smaller! 226 # recent µ-archs], but ~5 times smaller!
105 # I favor compact code to minimize cache 227 # I favor compact code to minimize cache
106 # contention and in hope to "collect" 5% back 228 # contention and in hope to "collect" 5% back
107 # in real-life applications... 229 # in real-life applications...
230
108$vertical_spin=0; # shift "verticaly" defaults to 0, because of 231$vertical_spin=0; # shift "verticaly" defaults to 0, because of
109 # its proof-of-concept status... 232 # its proof-of-concept status...
110
111# Note that there is no decvert(), as well as last encryption round is 233# Note that there is no decvert(), as well as last encryption round is
112# performed with "horizontal" shifts. This is because this "vertical" 234# performed with "horizontal" shifts. This is because this "vertical"
113# implementation [one which groups shifts on a given $s[i] to form a 235# implementation [one which groups shifts on a given $s[i] to form a
@@ -170,17 +292,484 @@ sub encvert()
170 &movz ($v0,&HB($v1)); 292 &movz ($v0,&HB($v1));
171 &and ($v1,0xFF); 293 &and ($v1,0xFF);
172 &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16 294 &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
173 &mov ($key,&DWP(12,"esp")); # reincarnate v1 as key 295 &mov ($key,$__key); # reincarnate v1 as key
174 &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24 296 &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
175} 297}
176 298
299# Another experimental routine, which features "horizontal spin," but
300# eliminates one reference to stack. Strangely enough runs slower...
# enchoriz: emits one table-driven encryption round over the global state
# registers $s0..$s3, using $key and $acc as scratch ($v0/$v1).
#  - t[0] is parked in the $__s0 stack slot and reloaded into $s0 near the end;
#  - $key is reloaded from $__key once $v0 is no longer needed.
# Both scratch aliases are now declared lexically in one list assignment; the
# previous "my $v0 = $key, $v1 = $acc;" made only $v0 lexical and silently
# assigned a package-global $v1.
# NOTE(review): $te is not declared or passed in here, so it resolves to a
# global that is not set anywhere in the visible code - confirm it is bound
# (e.g. to the table register) before this experimental routine is called.
301sub enchoriz()
302{ my ($v0,$v1) = ($key,$acc);
303
304 &movz ($v0,&LB($s0)); # 3, 2, 1, 0*
305 &rotr ($s2,8); # 8,11,10, 9
306 &mov ($v1,&DWP(0,$te,$v0,8)); # 0
307 &movz ($v0,&HB($s1)); # 7, 6, 5*, 4
308 &rotr ($s3,16); # 13,12,15,14
309 &xor ($v1,&DWP(3,$te,$v0,8)); # 5
310 &movz ($v0,&HB($s2)); # 8,11,10*, 9
311 &rotr ($s0,16); # 1, 0, 3, 2
312 &xor ($v1,&DWP(2,$te,$v0,8)); # 10
313 &movz ($v0,&HB($s3)); # 13,12,15*,14
314 &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected
315 &mov ($__s0,$v1); # t[0] saved
316
317 &movz ($v0,&LB($s1)); # 7, 6, 5, 4*
318 &shr ($s1,16); # -, -, 7, 6
319 &mov ($v1,&DWP(0,$te,$v0,8)); # 4
320 &movz ($v0,&LB($s3)); # 13,12,15,14*
321 &xor ($v1,&DWP(2,$te,$v0,8)); # 14
322 &movz ($v0,&HB($s0)); # 1, 0, 3*, 2
323 &and ($s3,0xffff0000); # 13,12, -, -
324 &xor ($v1,&DWP(1,$te,$v0,8)); # 3
325 &movz ($v0,&LB($s2)); # 8,11,10, 9*
326 &or ($s3,$s1); # 13,12, 7, 6
327 &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected
328 &mov ($s1,$v1); # s[1]=t[1]
329
330 &movz ($v0,&LB($s0)); # 1, 0, 3, 2*
331 &shr ($s2,16); # -, -, 8,11
332 &mov ($v1,&DWP(2,$te,$v0,8)); # 2
333 &movz ($v0,&HB($s3)); # 13,12, 7*, 6
334 &xor ($v1,&DWP(1,$te,$v0,8)); # 7
335 &movz ($v0,&HB($s2)); # -, -, 8*,11
336 &xor ($v1,&DWP(0,$te,$v0,8)); # 8
337 &mov ($v0,$s3);
338 &shr ($v0,24); # 13
339 &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected
340
341 &movz ($v0,&LB($s2)); # -, -, 8,11*
342 &shr ($s0,24); # 1*
343 &mov ($s2,&DWP(1,$te,$v0,8)); # 11
344 &xor ($s2,&DWP(3,$te,$s0,8)); # 1
345 &mov ($s0,$__s0); # s[0]=t[0]
346 &movz ($v0,&LB($s3)); # 13,12, 7, 6*
347 &shr ($s3,16); # , ,13,12
348 &xor ($s2,&DWP(2,$te,$v0,8)); # 6
349 &mov ($key,$__key); # reincarnate v0 as key
350 &and ($s3,0xff); # , ,13,12*
351 &mov ($s3,&DWP(0,$te,$s3,8)); # 12
352 &xor ($s3,$s2); # s[3]=t[3] collected
353 &mov ($s2,$v1); # s[2]=t[2]
354}
355
356# More experimental code... SSE one... Even though this one eliminates
357# *all* references to stack, it's not faster...
# sse_encbody: emits one table-driven round for the experimental MMX/SSE path.
# State bytes are consumed from eax/ebx (refilled from mm2/mm6, which are
# shuffled out of mm0/mm4), table entries are fetched at 8-byte stride from
# $tbl, and the round key is read from 16..28($key).
# Results: t[0,1] end up packed in mm0 and t[2,3] in mm4; eax/ebx are
# reloaded at the end with bytes for the next invocation (see the index
# comments on each line). Clobbers $acc, eax, ebx, ecx, edx, mm1-mm3, mm5, mm6.
358sub sse_encbody()
359{
360 &movz ($acc,&LB("eax")); # 0
361 &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0
362 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
363 &movz ("edx",&HB("eax")); # 1
364 &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1
365 &shr ("eax",16); # 5, 4
366
367 &movz ($acc,&LB("ebx")); # 10
368 &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10
369 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
370 &movz ($acc,&HB("ebx")); # 11
371 &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11
372 &shr ("ebx",16); # 15,14
373
374 &movz ($acc,&HB("eax")); # 5
375 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5
 # round-key words 0 and 1 for this round, applied to mm0 further below
376 &movq ("mm3",QWP(16,$key));
377 &movz ($acc,&HB("ebx")); # 15
378 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15
379 &movd ("mm0","ecx"); # t[0] collected
380
381 &movz ($acc,&LB("eax")); # 4
382 &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4
383 &movd ("eax","mm2"); # 7, 6, 3, 2
384 &movz ($acc,&LB("ebx")); # 14
385 &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14
386 &movd ("ebx","mm6"); # 13,12, 9, 8
387
388 &movz ($acc,&HB("eax")); # 3
389 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3
390 &movz ($acc,&HB("ebx")); # 9
391 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9
392 &movd ("mm1","ecx"); # t[1] collected
393
394 &movz ($acc,&LB("eax")); # 2
395 &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2
396 &shr ("eax",16); # 7, 6
397 &punpckldq ("mm0","mm1"); # t[0,1] collected
398 &movz ($acc,&LB("ebx")); # 8
399 &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8
400 &shr ("ebx",16); # 13,12
401
402 &movz ($acc,&HB("eax")); # 7
403 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7
 # fold round-key words 0,1 (loaded into mm3 above) into t[0,1]
404 &pxor ("mm0","mm3");
405 &movz ("eax",&LB("eax")); # 6
406 &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6
407 &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
408 &movz ($acc,&HB("ebx")); # 13
409 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13
410 &xor ("ecx",&DWP(24,$key)); # t[2]
411 &movd ("mm4","ecx"); # t[2] collected
412 &movz ("ebx",&LB("ebx")); # 12
413 &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12
 # keep the upper half of t[2] in the low 16 bits of ecx; merged into ebx below
414 &shr ("ecx",16);
415 &movd ("eax","mm1"); # 5, 4, 1, 0
416 &mov ("ebx",&DWP(28,$key)); # t[3]
417 &xor ("ebx","edx");
418 &movd ("mm5","ebx"); # t[3] collected
 # ebx = upper half of t[3] : upper half of t[2]
419 &and ("ebx",0xffff0000);
420 &or ("ebx","ecx");
421
422 &punpckldq ("mm4","mm5"); # t[2,3] collected
423}
424
425######################################################################
426# "Compact" block function
427######################################################################
428
# enccompact: emits the byte-substitution part of one "compact" round for
# output word $i, using the 256-byte S-box addressed as -128($te).
#
# Arguments: ($i, $te, @s[0..3]) plus an optional 7th "first round" flag.
#   $i   - index (0..3) of the output word being assembled;
#   $te  - register holding the S-box pointer (biased by +128);
#   @s   - the four state registers, rotated by the caller for each $i.
# When the extra 7th argument is present, $Fn becomes a no-op so the
# "$key <- $__key" reload for $i==3 is skipped (see comment below).
# Words 0 and 1 are stored to 4(%esp)/8(%esp); for $i==3 the previously
# saved words are reloaded from $__s0/$__s1 and $acc is moved to $s[3].
#
# $Fn is initialised with a real code reference (\&mov) rather than the
# bareword "mov": the bareword only worked as a symbolic reference and would
# turn into an argument-less call if "mov" were ever predeclared.
429sub enccompact()
430{ my $Fn = \&mov;
431 while ($#_>5) { pop(@_); $Fn=sub{}; }
432 my ($i,$te,@s)=@_;
433 my $tmp = $key;
434 my $out = $i==3?$s[0]:$acc;
435
436 # $Fn is used in first compact round and its purpose is to
437 # void restoration of some values from stack, so that after
438 # 4xenccompact with extra argument $key value is left there...
439 if ($i==3) { &$Fn ($key,$__key); }##%edx
440 else { &mov ($out,$s[0]); }
441 &and ($out,0xFF);
442 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
443 if ($i==2) { &shr ($s[0],24); }#%ecx[2]
444 &movz ($out,&BP(-128,$te,$out,1));
445
446 if ($i==3) { $tmp=$s[1]; }##%eax
447 &movz ($tmp,&HB($s[1]));
448 &movz ($tmp,&BP(-128,$te,$tmp,1));
449 &shl ($tmp,8);
450 &xor ($out,$tmp);
451
452 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
453 else { &mov ($tmp,$s[2]);
454 &shr ($tmp,16); }
455 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
456 &and ($tmp,0xFF);
457 &movz ($tmp,&BP(-128,$te,$tmp,1));
458 &shl ($tmp,16);
459 &xor ($out,$tmp);
460
461 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
462 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
463 else { &mov ($tmp,$s[3]);
464 &shr ($tmp,24); }
465 &movz ($tmp,&BP(-128,$te,$tmp,1));
466 &shl ($tmp,24);
467 &xor ($out,$tmp);
468 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
469 if ($i==3) { &mov ($s[3],$acc); }
470 &comment();
471}
472
# enctransform: emits the column-mixing step of a "compact" round for state
# word $s[$i] ($i is the only argument, 0..3), operating on all four packed
# bytes at once.
#   r0 = original word, r2 = each byte of r0 doubled in GF(2^8):
#        (r0+r0)&0xfefefefe  xor  0x1b in every byte whose top bit was set.
#   result = ROTL(r0^r2,24) ^ r2 ^ ROTR(r0,16) ^ ROTR(r0,24)
# Clobbers $acc, $tbl (used as $tmp) and $key (used as $r2); the caller must
# restore $tbl and $key afterwards.
#
# Fix: the "&xor ($s[$i],$acc)" before "&rotr" lacked its terminating ';',
# so Perl parsed the two lines as "&xor(...) & rotr(...)" (bitwise AND of the
# two calls' return values). Both calls still ran in order, but only by
# accident; the statements are now properly separated.
473sub enctransform()
474{ my @s = ($s0,$s1,$s2,$s3);
475 my $i = shift;
476 my $tmp = $tbl;
477 my $r2 = $key ;
478
479 &mov ($acc,$s[$i]);
480 &and ($acc,0x80808080); # top bit of every byte
481 &mov ($tmp,$acc);
482 &shr ($tmp,7);
483 &lea ($r2,&DWP(0,$s[$i],$s[$i])); # r0+r0
484 &sub ($acc,$tmp); # 0x7f in bytes that had the top bit set
485 &and ($r2,0xfefefefe); # drop carries that crossed byte lanes
486 &and ($acc,0x1b1b1b1b); # reduction constant for those bytes
487 &mov ($tmp,$s[$i]); # keep r0
488 &xor ($acc,$r2); # r2
489
490 &xor ($s[$i],$acc); # r0 ^ r2
491 &rotl ($s[$i],24);
492 &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
493 &rotr ($tmp,16);
494 &xor ($s[$i],$tmp);
495 &rotr ($tmp,8);
496 &xor ($s[$i],$tmp);
497}
498
# _x86_AES_encrypt_compact: emits the integer-only "compact" block encryption
# routine. State is carried in $s0..$s3, $key points at the key schedule and
# every S-box access uses a -128 displacement from $tbl. The stack frame
# ($__key, $__end, $__tbl, word backing store) is provided by the caller.
499&function_begin_B("_x86_AES_encrypt_compact");
500 # note that caller is expected to allocate stack frame for me!
501 &mov ($__key,$key); # save key
502
503 &xor ($s0,&DWP(0,$key)); # xor with key
504 &xor ($s1,&DWP(4,$key));
505 &xor ($s2,&DWP(8,$key));
506 &xor ($s3,&DWP(12,$key));
507
 # $__end = $key + 16*(rounds-1): acc = 2*rounds-2, then scaled by 8
508 &mov ($acc,&DWP(240,$key)); # load key->rounds
509 &lea ($acc,&DWP(-2,$acc,$acc));
510 &lea ($acc,&DWP(0,$key,$acc,8));
511 &mov ($__end,$acc); # end of key schedule
512
 # touch the 256-byte table at 32-byte stride; loaded values are discarded
513 # prefetch Te4
514 &mov ($key,&DWP(0-128,$tbl));
515 &mov ($acc,&DWP(32-128,$tbl));
516 &mov ($key,&DWP(64-128,$tbl));
517 &mov ($acc,&DWP(96-128,$tbl));
518 &mov ($key,&DWP(128-128,$tbl));
519 &mov ($acc,&DWP(160-128,$tbl));
520 &mov ($key,&DWP(192-128,$tbl));
521 &mov ($acc,&DWP(224-128,$tbl));
522
523 &set_label("loop",16);
524
 # the extra trailing argument makes enccompact skip its reload of $key
525 &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
526 &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
527 &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
528 &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
529 &enctransform(2);
530 &enctransform(3);
531 &enctransform(0);
532 &enctransform(1);
 # enctransform used $key and $tbl as scratch; restore both from the frame
533 &mov ($key,$__key);
534 &mov ($tbl,$__tbl);
535 &add ($key,16); # advance rd_key
536 &xor ($s0,&DWP(0,$key));
537 &xor ($s1,&DWP(4,$key));
538 &xor ($s2,&DWP(8,$key));
539 &xor ($s3,&DWP(12,$key));
540
 # keep iterating while the advanced key pointer is below $__end
541 &cmp ($key,$__end);
542 &mov ($__key,$key);
543 &jb (&label("loop"));
544
 # final round: substitution only (no enctransform), then last round key
545 &enccompact(0,$tbl,$s0,$s1,$s2,$s3);
546 &enccompact(1,$tbl,$s1,$s2,$s3,$s0);
547 &enccompact(2,$tbl,$s2,$s3,$s0,$s1);
548 &enccompact(3,$tbl,$s3,$s0,$s1,$s2);
549
550 &xor ($s0,&DWP(16,$key));
551 &xor ($s1,&DWP(20,$key));
552 &xor ($s2,&DWP(24,$key));
553 &xor ($s3,&DWP(28,$key));
554
555 &ret ();
556&function_end_B("_x86_AES_encrypt_compact");
557
558######################################################################
559# "Compact" SSE block function.
560######################################################################
561#
562# Performance is not actually extraordinary in comparison to pure
563# x86 code. In particular encrypt performance is virtually the same.
564# Decrypt performance on the other hand is 15-20% better on newer
565# µ-archs [but we're thankful for *any* improvement here], and ~50%
566# better on PIII:-) And additionally on the pros side this code
567# eliminates redundant references to stack and thus relieves/
568# minimizes the pressure on the memory bus.
569#
570# MMX register layout lsb
571# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
572# | mm4 | mm0 |
573# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
574# | s3 | s2 | s1 | s0 |
575# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
576# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
577# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
578#
579# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
580# In these terms encryption and decryption "compact" permutation
581# matrices can be depicted as following:
582#
583# encryption lsb # decryption lsb
584# +----++----+----+----+----+ # +----++----+----+----+----+
585# | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
586# +----++----+----+----+----+ # +----++----+----+----+----+
587# | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
588# +----++----+----+----+----+ # +----++----+----+----+----+
589# | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
590# +----++----+----+----+----+ # +----++----+----+----+----+
591# | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
592# +----++----+----+----+----+ # +----++----+----+----+----+
593#
594######################################################################
595# Why not xmm registers? Short answer. It was actually tested and
596# was not any faster, but *contrary*, most notably on Intel CPUs.
597# Longer answer. Main advantage of using mm registers is that movd
598# latency is lower, especially on Intel P4. While arithmetic
599# instructions are twice as many, they can be scheduled every cycle
600# and not every second one when they are operating on xmm register,
601# so that "arithmetic throughput" remains virtually the same. And
602# finally the code can be executed even on elder SSE-only CPUs:-)
603
# sse_enccompact: emits the byte-substitution part of one "compact" round for
# the MMX/SSE path. Input state is mm0 (bytes 0-7) and mm4 (bytes 8-15); each
# byte is looked up in the 256-byte S-box at -128($tbl) and shifted into the
# position given by the encryption permutation matrix. Output: t[0,1] packed
# in mm0 and t[2,3] packed in mm4.
# Clobbers $acc, eax, ebx, ecx, edx, mm1, mm2, mm5 and mm6.
604sub sse_enccompact()
605{
606 &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
607 &pshufw ("mm5","mm4",0x0d); # 15,14,11,10
608 &movd ("eax","mm1"); # 5, 4, 1, 0
609 &movd ("ebx","mm5"); # 15,14,11,10
610
 # ecx accumulates t[0]; edx accumulates t[3] throughout the routine
611 &movz ($acc,&LB("eax")); # 0
612 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
613 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
614 &movz ("edx",&HB("eax")); # 1
615 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
616 &shl ("edx",8); # 1
617 &shr ("eax",16); # 5, 4
618
619 &movz ($acc,&LB("ebx")); # 10
620 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
621 &shl ($acc,16); # 10
622 &or ("ecx",$acc); # 10
623 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
624 &movz ($acc,&HB("ebx")); # 11
625 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
626 &shl ($acc,24); # 11
627 &or ("edx",$acc); # 11
628 &shr ("ebx",16); # 15,14
629
630 &movz ($acc,&HB("eax")); # 5
631 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
632 &shl ($acc,8); # 5
633 &or ("ecx",$acc); # 5
634 &movz ($acc,&HB("ebx")); # 15
635 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
636 &shl ($acc,24); # 15
637 &or ("ecx",$acc); # 15
638 &movd ("mm0","ecx"); # t[0] collected
639
 # ecx is reused to accumulate t[1]
640 &movz ($acc,&LB("eax")); # 4
641 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
642 &movd ("eax","mm2"); # 7, 6, 3, 2
643 &movz ($acc,&LB("ebx")); # 14
644 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
645 &shl ($acc,16); # 14
646 &or ("ecx",$acc); # 14
647
648 &movd ("ebx","mm6"); # 13,12, 9, 8
649 &movz ($acc,&HB("eax")); # 3
650 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
651 &shl ($acc,24); # 3
652 &or ("ecx",$acc); # 3
653 &movz ($acc,&HB("ebx")); # 9
654 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
655 &shl ($acc,8); # 9
656 &or ("ecx",$acc); # 9
657 &movd ("mm1","ecx"); # t[1] collected
658
 # ecx is reused to accumulate t[2]
659 &movz ($acc,&LB("ebx")); # 8
660 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
661 &shr ("ebx",16); # 13,12
662 &movz ($acc,&LB("eax")); # 2
663 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
664 &shl ($acc,16); # 2
665 &or ("ecx",$acc); # 2
666 &shr ("eax",16); # 7, 6
667
668 &punpckldq ("mm0","mm1"); # t[0,1] collected
669
670 &movz ($acc,&HB("eax")); # 7
671 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
672 &shl ($acc,24); # 7
673 &or ("ecx",$acc); # 7
674 &and ("eax",0xff); # 6
675 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
676 &shl ("eax",16); # 6
677 &or ("edx","eax"); # 6
678 &movz ($acc,&HB("ebx")); # 13
679 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
680 &shl ($acc,8); # 13
681 &or ("ecx",$acc); # 13
682 &movd ("mm4","ecx"); # t[2] collected
 # byte 12 lands in the least significant byte of t[3], hence no shift
683 &and ("ebx",0xff); # 12
684 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
685 &or ("edx","ebx"); # 12
686 &movd ("mm5","edx"); # t[3] collected
687
688 &punpckldq ("mm4","mm5"); # t[2,3] collected
689}
690
# _sse_AES_encrypt_compact: emits the MMX/SSE "compact" block encryption
# routine; generated only when $x86only is false. State lives in mm0 (bytes
# 0-7) and mm4 (bytes 8-15), $key points at the key schedule and the S-box is
# addressed at -128($tbl). The stack frame is provided by the caller; the
# slots at 8(%esp) and 12(%esp) are used here to hold the 0x1b constant.
691 if (!$x86only) {
692&function_begin_B("_sse_AES_encrypt_compact");
693 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
694 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
695
696 # note that caller is expected to allocate stack frame for me!
 # $__end = $key + 16*(rounds-1): acc = 2*rounds-2, then scaled by 8
697 &mov ($acc,&DWP(240,$key)); # load key->rounds
698 &lea ($acc,&DWP(-2,$acc,$acc));
699 &lea ($acc,&DWP(0,$key,$acc,8));
700 &mov ($__end,$acc); # end of key schedule
701
 # build a quadword of 0x1b bytes on the stack; reloaded into mm2 each round
702 &mov ($s0,0x1b1b1b1b); # magic constant
703 &mov (&DWP(8,"esp"),$s0);
704 &mov (&DWP(12,"esp"),$s0);
705
 # touch the 256-byte table at 32-byte stride; loaded values are discarded
706 # prefetch Te4
707 &mov ($s0,&DWP(0-128,$tbl));
708 &mov ($s1,&DWP(32-128,$tbl));
709 &mov ($s2,&DWP(64-128,$tbl));
710 &mov ($s3,&DWP(96-128,$tbl));
711 &mov ($s0,&DWP(128-128,$tbl));
712 &mov ($s1,&DWP(160-128,$tbl));
713 &mov ($s2,&DWP(192-128,$tbl));
714 &mov ($s3,&DWP(224-128,$tbl));
715
716 &set_label("loop",16);
717 &sse_enccompact();
718 &add ($key,16);
 # once the advanced key pointer is past $__end, skip the column mix
719 &cmp ($key,$__end);
720 &ja (&label("out"));
721
 # column mix on both halves in parallel (left column: mm0, right: mm4).
 # pcmpgtb against zero yields 0xff for bytes with the top bit set; that
 # mask selects 0x1b, which is xored into the byte-wise doubled value.
722 &movq ("mm2",&QWP(8,"esp"));
723 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
724 &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0
725 &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4");
726 &pand ("mm3","mm2"); &pand ("mm7","mm2");
727 &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
728 &paddb ("mm0","mm0"); &paddb ("mm4","mm4");
729 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2
730 &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0
731 &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2
732 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16)
733
734 &movq ("mm2","mm3"); &movq ("mm6","mm7");
735 &pslld ("mm3",8); &pslld ("mm7",8);
736 &psrld ("mm2",24); &psrld ("mm6",24);
737 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8
738 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24
739
 # the integer loads interleaved below re-touch the table at 64-byte
 # stride; their results are not used
740 &movq ("mm3","mm1"); &movq ("mm7","mm5");
741 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
742 &psrld ("mm1",8); &psrld ("mm5",8);
743 &mov ($s0,&DWP(0-128,$tbl));
744 &pslld ("mm3",24); &pslld ("mm7",24);
745 &mov ($s1,&DWP(64-128,$tbl));
746 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8
747 &mov ($s2,&DWP(128-128,$tbl));
748 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24
749 &mov ($s3,&DWP(192-128,$tbl));
750
 # add the round key loaded into mm2/mm6 above
751 &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
752 &jmp (&label("loop"));
753
 # final round key ($key was already advanced before the branch here)
754 &set_label("out",16);
755 &pxor ("mm0",&QWP(0,$key));
756 &pxor ("mm4",&QWP(8,$key));
757
758 &ret ();
759&function_end_B("_sse_AES_encrypt_compact");
760 }
761
762######################################################################
763# Vanilla block function.
764######################################################################
765
177sub encstep() 766sub encstep()
178{ my ($i,$te,@s) = @_; 767{ my ($i,$te,@s) = @_;
179 my $tmp = $key; 768 my $tmp = $key;
180 my $out = $i==3?$s[0]:$acc; 769 my $out = $i==3?$s[0]:$acc;
181 770
182 # lines marked with #%e?x[i] denote "reordered" instructions... 771 # lines marked with #%e?x[i] denote "reordered" instructions...
183 if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx 772 if ($i==3) { &mov ($key,$__key); }##%edx
184 else { &mov ($out,$s[0]); 773 else { &mov ($out,$s[0]);
185 &and ($out,0xFF); } 774 &and ($out,0xFF); }
186 if ($i==1) { &shr ($s[0],16); }#%ebx[1] 775 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
@@ -191,14 +780,14 @@ sub encstep()
191 &movz ($tmp,&HB($s[1])); 780 &movz ($tmp,&HB($s[1]));
192 &xor ($out,&DWP(3,$te,$tmp,8)); 781 &xor ($out,&DWP(3,$te,$tmp,8));
193 782
194 if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx 783 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
195 else { &mov ($tmp,$s[2]); 784 else { &mov ($tmp,$s[2]);
196 &shr ($tmp,16); } 785 &shr ($tmp,16); }
197 if ($i==2) { &and ($s[1],0xFF); }#%edx[2] 786 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
198 &and ($tmp,0xFF); 787 &and ($tmp,0xFF);
199 &xor ($out,&DWP(2,$te,$tmp,8)); 788 &xor ($out,&DWP(2,$te,$tmp,8));
200 789
201 if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx 790 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
202 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] 791 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
203 else { &mov ($tmp,$s[3]); 792 else { &mov ($tmp,$s[3]);
204 &shr ($tmp,24) } 793 &shr ($tmp,24) }
@@ -213,7 +802,7 @@ sub enclast()
213 my $tmp = $key; 802 my $tmp = $key;
214 my $out = $i==3?$s[0]:$acc; 803 my $out = $i==3?$s[0]:$acc;
215 804
216 if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx 805 if ($i==3) { &mov ($key,$__key); }##%edx
217 else { &mov ($out,$s[0]); } 806 else { &mov ($out,$s[0]); }
218 &and ($out,0xFF); 807 &and ($out,0xFF);
219 if ($i==1) { &shr ($s[0],16); }#%ebx[1] 808 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
@@ -227,8 +816,8 @@ sub enclast()
227 &and ($tmp,0x0000ff00); 816 &and ($tmp,0x0000ff00);
228 &xor ($out,$tmp); 817 &xor ($out,$tmp);
229 818
230 if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx 819 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
231 else { mov ($tmp,$s[2]); 820 else { &mov ($tmp,$s[2]);
232 &shr ($tmp,16); } 821 &shr ($tmp,16); }
233 if ($i==2) { &and ($s[1],0xFF); }#%edx[2] 822 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
234 &and ($tmp,0xFF); 823 &and ($tmp,0xFF);
@@ -236,7 +825,7 @@ sub enclast()
236 &and ($tmp,0x00ff0000); 825 &and ($tmp,0x00ff0000);
237 &xor ($out,$tmp); 826 &xor ($out,$tmp);
238 827
239 if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx 828 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
240 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] 829 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
241 else { &mov ($tmp,$s[3]); 830 else { &mov ($tmp,$s[3]);
242 &shr ($tmp,24); } 831 &shr ($tmp,24); }
@@ -247,10 +836,7 @@ sub enclast()
247 if ($i==3) { &mov ($s[3],$acc); } 836 if ($i==3) { &mov ($s[3],$acc); }
248} 837}
249 838
250sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } 839&function_begin_B("_x86_AES_encrypt");
251
252&public_label("AES_Te");
253&function_begin_C("_x86_AES_encrypt");
254 if ($vertical_spin) { 840 if ($vertical_spin) {
255 # I need high parts of volatile registers to be accessible... 841 # I need high parts of volatile registers to be accessible...
256 &exch ($s1="edi",$key="ebx"); 842 &exch ($s1="edi",$key="ebx");
@@ -258,7 +844,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
258 } 844 }
259 845
260 # note that caller is expected to allocate stack frame for me! 846 # note that caller is expected to allocate stack frame for me!
261 &mov (&DWP(12,"esp"),$key); # save key 847 &mov ($__key,$key); # save key
262 848
263 &xor ($s0,&DWP(0,$key)); # xor with key 849 &xor ($s0,&DWP(0,$key)); # xor with key
264 &xor ($s1,&DWP(4,$key)); 850 &xor ($s1,&DWP(4,$key));
@@ -270,24 +856,24 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
270 if ($small_footprint) { 856 if ($small_footprint) {
271 &lea ($acc,&DWP(-2,$acc,$acc)); 857 &lea ($acc,&DWP(-2,$acc,$acc));
272 &lea ($acc,&DWP(0,$key,$acc,8)); 858 &lea ($acc,&DWP(0,$key,$acc,8));
273 &mov (&DWP(16,"esp"),$acc); # end of key schedule 859 &mov ($__end,$acc); # end of key schedule
274 &align (4); 860
275 &set_label("loop"); 861 &set_label("loop",16);
276 if ($vertical_spin) { 862 if ($vertical_spin) {
277 &encvert("ebp",$s0,$s1,$s2,$s3); 863 &encvert($tbl,$s0,$s1,$s2,$s3);
278 } else { 864 } else {
279 &encstep(0,"ebp",$s0,$s1,$s2,$s3); 865 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
280 &encstep(1,"ebp",$s1,$s2,$s3,$s0); 866 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
281 &encstep(2,"ebp",$s2,$s3,$s0,$s1); 867 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
282 &encstep(3,"ebp",$s3,$s0,$s1,$s2); 868 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
283 } 869 }
284 &add ($key,16); # advance rd_key 870 &add ($key,16); # advance rd_key
285 &xor ($s0,&DWP(0,$key)); 871 &xor ($s0,&DWP(0,$key));
286 &xor ($s1,&DWP(4,$key)); 872 &xor ($s1,&DWP(4,$key));
287 &xor ($s2,&DWP(8,$key)); 873 &xor ($s2,&DWP(8,$key));
288 &xor ($s3,&DWP(12,$key)); 874 &xor ($s3,&DWP(12,$key));
289 &cmp ($key,&DWP(16,"esp")); 875 &cmp ($key,$__end);
290 &mov (&DWP(12,"esp"),$key); 876 &mov ($__key,$key);
291 &jb (&label("loop")); 877 &jb (&label("loop"));
292 } 878 }
293 else { 879 else {
@@ -296,15 +882,15 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
296 &cmp ($acc,12); 882 &cmp ($acc,12);
297 &jle (&label("12rounds")); 883 &jle (&label("12rounds"));
298 884
299 &set_label("14rounds"); 885 &set_label("14rounds",4);
300 for ($i=1;$i<3;$i++) { 886 for ($i=1;$i<3;$i++) {
301 if ($vertical_spin) { 887 if ($vertical_spin) {
302 &encvert("ebp",$s0,$s1,$s2,$s3); 888 &encvert($tbl,$s0,$s1,$s2,$s3);
303 } else { 889 } else {
304 &encstep(0,"ebp",$s0,$s1,$s2,$s3); 890 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
305 &encstep(1,"ebp",$s1,$s2,$s3,$s0); 891 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
306 &encstep(2,"ebp",$s2,$s3,$s0,$s1); 892 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
307 &encstep(3,"ebp",$s3,$s0,$s1,$s2); 893 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
308 } 894 }
309 &xor ($s0,&DWP(16*$i+0,$key)); 895 &xor ($s0,&DWP(16*$i+0,$key));
310 &xor ($s1,&DWP(16*$i+4,$key)); 896 &xor ($s1,&DWP(16*$i+4,$key));
@@ -312,16 +898,16 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
312 &xor ($s3,&DWP(16*$i+12,$key)); 898 &xor ($s3,&DWP(16*$i+12,$key));
313 } 899 }
314 &add ($key,32); 900 &add ($key,32);
315 &mov (&DWP(12,"esp"),$key); # advance rd_key 901 &mov ($__key,$key); # advance rd_key
316 &set_label("12rounds"); 902 &set_label("12rounds",4);
317 for ($i=1;$i<3;$i++) { 903 for ($i=1;$i<3;$i++) {
318 if ($vertical_spin) { 904 if ($vertical_spin) {
319 &encvert("ebp",$s0,$s1,$s2,$s3); 905 &encvert($tbl,$s0,$s1,$s2,$s3);
320 } else { 906 } else {
321 &encstep(0,"ebp",$s0,$s1,$s2,$s3); 907 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
322 &encstep(1,"ebp",$s1,$s2,$s3,$s0); 908 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
323 &encstep(2,"ebp",$s2,$s3,$s0,$s1); 909 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
324 &encstep(3,"ebp",$s3,$s0,$s1,$s2); 910 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
325 } 911 }
326 &xor ($s0,&DWP(16*$i+0,$key)); 912 &xor ($s0,&DWP(16*$i+0,$key));
327 &xor ($s1,&DWP(16*$i+4,$key)); 913 &xor ($s1,&DWP(16*$i+4,$key));
@@ -329,16 +915,16 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
329 &xor ($s3,&DWP(16*$i+12,$key)); 915 &xor ($s3,&DWP(16*$i+12,$key));
330 } 916 }
331 &add ($key,32); 917 &add ($key,32);
332 &mov (&DWP(12,"esp"),$key); # advance rd_key 918 &mov ($__key,$key); # advance rd_key
333 &set_label("10rounds"); 919 &set_label("10rounds",4);
334 for ($i=1;$i<10;$i++) { 920 for ($i=1;$i<10;$i++) {
335 if ($vertical_spin) { 921 if ($vertical_spin) {
336 &encvert("ebp",$s0,$s1,$s2,$s3); 922 &encvert($tbl,$s0,$s1,$s2,$s3);
337 } else { 923 } else {
338 &encstep(0,"ebp",$s0,$s1,$s2,$s3); 924 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
339 &encstep(1,"ebp",$s1,$s2,$s3,$s0); 925 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
340 &encstep(2,"ebp",$s2,$s3,$s0,$s1); 926 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
341 &encstep(3,"ebp",$s3,$s0,$s1,$s2); 927 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
342 } 928 }
343 &xor ($s0,&DWP(16*$i+0,$key)); 929 &xor ($s0,&DWP(16*$i+0,$key));
344 &xor ($s1,&DWP(16*$i+4,$key)); 930 &xor ($s1,&DWP(16*$i+4,$key));
@@ -352,10 +938,10 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
352 &mov ($s1="ebx",$key="edi"); 938 &mov ($s1="ebx",$key="edi");
353 &mov ($s2="ecx",$acc="esi"); 939 &mov ($s2="ecx",$acc="esi");
354 } 940 }
355 &enclast(0,"ebp",$s0,$s1,$s2,$s3); 941 &enclast(0,$tbl,$s0,$s1,$s2,$s3);
356 &enclast(1,"ebp",$s1,$s2,$s3,$s0); 942 &enclast(1,$tbl,$s1,$s2,$s3,$s0);
357 &enclast(2,"ebp",$s2,$s3,$s0,$s1); 943 &enclast(2,$tbl,$s2,$s3,$s0,$s1);
358 &enclast(3,"ebp",$s3,$s0,$s1,$s2); 944 &enclast(3,$tbl,$s3,$s0,$s1,$s2);
359 945
360 &add ($key,$small_footprint?16:160); 946 &add ($key,$small_footprint?16:160);
361 &xor ($s0,&DWP(0,$key)); 947 &xor ($s0,&DWP(0,$key));
@@ -430,38 +1016,198 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
430 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); 1016 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
431 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); 1017 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
432 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); 1018 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
1019
1020#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
1021 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1022 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1023 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1024 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1025 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1026 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1027 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1028 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1029 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1030 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1031 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1032 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1033 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1034 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1035 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1036 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1037 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1038 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1039 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1040 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1041 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1042 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1043 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1044 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1045 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1046 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1047 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1048 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1049 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1050 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1051 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1052 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1053
1054 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1055 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1056 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1057 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1058 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1059 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1060 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1061 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1062 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1063 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1064 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1065 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1066 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1067 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1068 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1069 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1070 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1071 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1072 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1073 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1074 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1075 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1076 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1077 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1078 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1079 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1080 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1081 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1082 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1083 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1084 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1085 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1086
1087 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1088 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1089 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1090 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1091 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1092 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1093 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1094 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1095 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1096 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1097 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1098 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1099 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1100 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1101 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1102 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1103 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1104 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1105 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1106 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1107 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1108 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1109 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1110 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1111 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1112 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1113 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1114 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1115 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1116 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1117 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1118 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1119
1120 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1121 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1122 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1123 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1124 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1125 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1126 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1127 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1128 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1129 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1130 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1131 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1132 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1133 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1134 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1135 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1136 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1137 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1138 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1139 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1140 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1141 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1142 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1143 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1144 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1145 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1146 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1147 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1148 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1149 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1150 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1151 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
433#rcon: 1152#rcon:
434 &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008); 1153 &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
435 &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080); 1154 &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
436 &data_word(0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0); 1155 &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
1156 &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
437&function_end_B("_x86_AES_encrypt"); 1157&function_end_B("_x86_AES_encrypt");
438 1158
439# void AES_encrypt (const void *inp,void *out,const AES_KEY *key); 1159# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
440&public_label("AES_Te");
441&function_begin("AES_encrypt"); 1160&function_begin("AES_encrypt");
442 &mov ($acc,&wparam(0)); # load inp 1161 &mov ($acc,&wparam(0)); # load inp
443 &mov ($key,&wparam(2)); # load key 1162 &mov ($key,&wparam(2)); # load key
444 1163
445 &mov ($s0,"esp"); 1164 &mov ($s0,"esp");
446 &sub ("esp",24); 1165 &sub ("esp",36);
447 &and ("esp",-64); 1166 &and ("esp",-64); # align to cache-line
448 &add ("esp",4); 1167
449 &mov (&DWP(16,"esp"),$s0); 1168 # place stack frame just "above" the key schedule
1169 &lea ($s1,&DWP(-64-63,$key));
1170 &sub ($s1,"esp");
1171 &neg ($s1);
1172 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1173 &sub ("esp",$s1);
1174 &add ("esp",4); # 4 is reserved for caller's return address
1175 &mov ($_esp,$s0); # save stack pointer
450 1176
451 &call (&label("pic_point")); # make it PIC! 1177 &call (&label("pic_point")); # make it PIC!
452 &set_label("pic_point"); 1178 &set_label("pic_point");
453 &blindpop("ebp"); 1179 &blindpop($tbl);
454 &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); 1180 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
455 1181 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
1182
1183 # pick Te4 copy which can't "overlap" with stack frame or key schedule
1184 &lea ($s1,&DWP(768-4,"esp"));
1185 &sub ($s1,$tbl);
1186 &and ($s1,0x300);
1187 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
1188
1189 if (!$x86only) {
1190 &bt (&DWP(0,$s0),25); # check for SSE bit
1191 &jnc (&label("x86"));
1192
1193 &movq ("mm0",&QWP(0,$acc));
1194 &movq ("mm4",&QWP(8,$acc));
1195 &call ("_sse_AES_encrypt_compact");
1196 &mov ("esp",$_esp); # restore stack pointer
1197 &mov ($acc,&wparam(1)); # load out
1198 &movq (&QWP(0,$acc),"mm0"); # write output data
1199 &movq (&QWP(8,$acc),"mm4");
1200 &emms ();
1201 &function_end_A();
1202 }
1203 &set_label("x86",16);
1204 &mov ($_tbl,$tbl);
456 &mov ($s0,&DWP(0,$acc)); # load input data 1205 &mov ($s0,&DWP(0,$acc)); # load input data
457 &mov ($s1,&DWP(4,$acc)); 1206 &mov ($s1,&DWP(4,$acc));
458 &mov ($s2,&DWP(8,$acc)); 1207 &mov ($s2,&DWP(8,$acc));
459 &mov ($s3,&DWP(12,$acc)); 1208 &mov ($s3,&DWP(12,$acc));
460 1209 &call ("_x86_AES_encrypt_compact");
461 &call ("_x86_AES_encrypt"); 1210 &mov ("esp",$_esp); # restore stack pointer
462
463 &mov ("esp",&DWP(16,"esp"));
464
465 &mov ($acc,&wparam(1)); # load out 1211 &mov ($acc,&wparam(1)); # load out
466 &mov (&DWP(0,$acc),$s0); # write output data 1212 &mov (&DWP(0,$acc),$s0); # write output data
467 &mov (&DWP(4,$acc),$s1); 1213 &mov (&DWP(4,$acc),$s1);
@@ -469,7 +1215,370 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
469 &mov (&DWP(12,$acc),$s3); 1215 &mov (&DWP(12,$acc),$s3);
470&function_end("AES_encrypt"); 1216&function_end("AES_encrypt");
471 1217
472#------------------------------------------------------------------# 1218#--------------------------------------------------------------------#
1219
1220######################################################################
1221# "Compact" block function
1222######################################################################
1223
1224sub deccompact()
 # Emit the instructions that build one output word of a "compact"
 # decryption round out of four byte lookups.
 #   $i  - which of the four output words this call produces (0..3)
 #   $td - register holding the lookup base; every lookup uses a -128
 #         displacement, so the pointer is presumably biased by +128
 #         by whoever sets it up (not visible here - confirm)
 #   @s  - four state registers: byte 0 is taken from $s[0], byte 1
 #         from $s[1], byte 2 from $s[2] and byte 3 from $s[3]
 # A seventh argument (any value) turns the stack reloads into no-ops.
 # Words 0 and 1 are stored at 4+4*$i(%esp); word 2 stays in $acc until
 # the $i==3 call moves it into $s[1]; word 3 is built in $s[0].
 # $Fn is a real code reference: a bareword would only work through a
 # symbolic call and would be *called* if mov were ever predeclared.
1225{ my $Fn = \&mov;
1226 while ($#_>5) { pop(@_); $Fn=sub{}; }
1227 my ($i,$td,@s)=@_;
1228 my $tmp = $key;
1229 my $out = $i==3?$s[0]:$acc;
1230
1231 # $Fn is used in first compact round and its purpose is to
1232 # void restoration of some values from stack, so that after
1233 # 4xdeccompact with extra argument $key, $s0 and $s1 values
1234 # are left there...
 # byte 0: low byte of $s[0]
1235 if($i==3) { &$Fn ($key,$__key); }
1236 else { &mov ($out,$s[0]); }
1237 &and ($out,0xFF);
1238 &movz ($out,&BP(-128,$td,$out,1));
1239
 # byte 1: second byte of $s[1]; for $i==3 $key is busy, so $s[1]
 # itself serves as the temporary
1240 if ($i==3) { $tmp=$s[1]; }
1241 &movz ($tmp,&HB($s[1]));
1242 &movz ($tmp,&BP(-128,$td,$tmp,1));
1243 &shl ($tmp,8);
1244 &xor ($out,$tmp);
1245
 # byte 2: third byte of $s[2]; for $i==3 word 2 (left in $acc by the
 # $i==2 call) is moved into $s[1]
1246 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1247 else { &mov ($tmp,$s[2]); }
1248 &shr ($tmp,16);
1249 &and ($tmp,0xFF);
1250 &movz ($tmp,&BP(-128,$td,$tmp,1));
1251 &shl ($tmp,16);
1252 &xor ($out,$tmp);
1253
 # byte 3: top byte of $s[3]
1254 if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
1255 else { &mov ($tmp,$s[3]); }
1256 &shr ($tmp,24);
1257 &movz ($tmp,&BP(-128,$td,$tmp,1));
1258 &shl ($tmp,24);
1259 &xor ($out,$tmp);
 # words 0 and 1 are parked in the caller-provided stack frame
1260 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1261 if ($i==3) { &$Fn ($s[3],$__s0); }
1262}
1263
1264# must be called with 2,3,0,1 as argument sequence!!!
1265sub dectransform()
 # Emit the linear transform applied to state word $s[$i]: three
 # successive per-byte doublings (tp2, tp4, tp8) followed by the
 # xor/rotate combination annotated on the lines below.
 # NOTE(review): presumably this is AES InvMixColumns on one column;
 # confirm against the reference implementation.
 #   $i - index of the state word to transform (0..3)
 # Scratch registers are borrowed from two other state registers
 # ($tp2, $tp4), from $key ($tmp) and from $tbl ($tp8), so their
 # previous contents are destroyed by the emitted code.
1266{ my @s = ($s0,$s1,$s2,$s3);
1267 my $i = shift;
1268 my $tmp = $key;
 # pick two state registers as temporaries; for $i==1 the default
 # choice is overridden with $s[2]/$s[3]
1269 my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1);
1270 my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
1271 my $tp8 = $tbl;
1272
 # doubling step: $acc collects the top bit of every byte, the
 # shift/subtract turns each flagged 0x80 into 0x7f, the 0x1b mask
 # reduces that to 0x1b, and the result is xored into the value
 # shifted left by one (0xfefefefe drops bits carried between bytes)
1273 &mov ($acc,$s[$i]);
1274 &and ($acc,0x80808080);
1275 &mov ($tmp,$acc);
1276 &shr ($tmp,7);
1277 &lea ($tp2,&DWP(0,$s[$i],$s[$i]));
1278 &sub ($acc,$tmp);
1279 &and ($tp2,0xfefefefe);
1280 &and ($acc,0x1b1b1b1b);
1281 &xor ($acc,$tp2);
1282 &mov ($tp2,$acc);
1283
 # same doubling step applied to tp2, giving tp4
1284 &and ($acc,0x80808080);
1285 &mov ($tmp,$acc);
1286 &shr ($tmp,7);
1287 &lea ($tp4,&DWP(0,$tp2,$tp2));
1288 &sub ($acc,$tmp);
1289 &and ($tp4,0xfefefefe);
1290 &and ($acc,0x1b1b1b1b);
1291 &xor ($tp2,$s[$i]); # tp2^tp1
1292 &xor ($acc,$tp4);
1293 &mov ($tp4,$acc);
1294
 # same doubling step applied to tp4, giving tp8
1295 &and ($acc,0x80808080);
1296 &mov ($tmp,$acc);
1297 &shr ($tmp,7);
1298 &lea ($tp8,&DWP(0,$tp4,$tp4));
1299 &sub ($acc,$tmp);
1300 &and ($tp8,0xfefefefe);
1301 &and ($acc,0x1b1b1b1b);
1302 &xor ($tp4,$s[$i]); # tp4^tp1
1303 &rotl ($s[$i],8); # = ROTATE(tp1,8)
1304 &xor ($tp8,$acc);
1305
 # fold the rotated combinations into $s[$i]
1306 &xor ($s[$i],$tp2);
1307 &xor ($tp2,$tp8);
1308 &rotl ($tp2,24);
1309 &xor ($s[$i],$tp4);
1310 &xor ($tp4,$tp8);
1311 &rotl ($tp4,16);
1312 &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
1313 &rotl ($tp8,8);
1314 &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
1315 &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
 # reload state registers that served as temporaries, interleaved
 # with the last xor
1316 &mov ($s[0],$__s0) if($i==2); #prefetch $s0
1317 &mov ($s[1],$__s1) if($i==3); #prefetch $s1
1318 &mov ($s[2],$__s2) if($i==1);
1319 &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
1320
1321 &mov ($s[3],$__s3) if($i==1);
 # words 2 and 3 are written to 4+4*$i(%esp); presumably those are
 # the $__s2/$__s3 slots read back above - confirm the frame layout
1322 &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
1323}
1324
1325&function_begin_B("_x86_AES_decrypt_compact");
 # Integer-only compact decryption routine.  On entry the state is in
 # $s0..$s3, $key points at the key schedule and $tbl at the lookup
 # table; the result is left in $s0..$s3.  Uses the $__key, $__end,
 # $__tbl and $__s0..$__s3 slots of the caller's stack frame.
1326 # note that caller is expected to allocate stack frame for me!
1327 &mov ($__key,$key); # save key
1328
1329 &xor ($s0,&DWP(0,$key)); # xor with key
1330 &xor ($s1,&DWP(4,$key));
1331 &xor ($s2,&DWP(8,$key));
1332 &xor ($s3,&DWP(12,$key));
1333
1334 &mov ($acc,&DWP(240,$key)); # load key->rounds
1335
 # $acc = $key + 8*(2*rounds-2) = $key + 16*(rounds-1)
1336 &lea ($acc,&DWP(-2,$acc,$acc));
1337 &lea ($acc,&DWP(0,$key,$acc,8));
1338 &mov ($__end,$acc); # end of key schedule
1339
 # the loaded values are discarded; the loads touch the 256-byte
 # table at 32-byte intervals
1340 # prefetch Td4
1341 &mov ($key,&DWP(0-128,$tbl));
1342 &mov ($acc,&DWP(32-128,$tbl));
1343 &mov ($key,&DWP(64-128,$tbl));
1344 &mov ($acc,&DWP(96-128,$tbl));
1345 &mov ($key,&DWP(128-128,$tbl));
1346 &mov ($acc,&DWP(160-128,$tbl));
1347 &mov ($key,&DWP(192-128,$tbl));
1348 &mov ($acc,&DWP(224-128,$tbl));
1349
1350 &set_label("loop",16);
1351
 # the extra trailing argument makes deccompact skip its reloads of
 # $key and the state words from the stack frame
1352 &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);
1353 &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
1354 &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
1355 &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
 # dectransform must be invoked in exactly this 2,3,0,1 order
1356 &dectransform(2);
1357 &dectransform(3);
1358 &dectransform(0);
1359 &dectransform(1);
 # dectransform used $key and $tbl as scratch, so restore both
1360 &mov ($key,$__key);
1361 &mov ($tbl,$__tbl);
1362 &add ($key,16); # advance rd_key
1363 &xor ($s0,&DWP(0,$key));
1364 &xor ($s1,&DWP(4,$key));
1365 &xor ($s2,&DWP(8,$key));
1366 &xor ($s3,&DWP(12,$key));
1367
 # keep looping while the round-key pointer is below $__end
1368 &cmp ($key,$__end);
1369 &mov ($__key,$key);
1370 &jb (&label("loop"));
1371
 # last round: substitution only (no dectransform), with the normal
 # reloads from the stack frame enabled
1372 &deccompact(0,$tbl,$s0,$s3,$s2,$s1);
1373 &deccompact(1,$tbl,$s1,$s0,$s3,$s2);
1374 &deccompact(2,$tbl,$s2,$s1,$s0,$s3);
1375 &deccompact(3,$tbl,$s3,$s2,$s1,$s0);
1376
 # xor with the 16 bytes that follow the current round key
1377 &xor ($s0,&DWP(16,$key));
1378 &xor ($s1,&DWP(20,$key));
1379 &xor ($s2,&DWP(24,$key));
1380 &xor ($s3,&DWP(28,$key));
1381
1382 &ret ();
1383&function_end_B("_x86_AES_decrypt_compact");
1384
1385######################################################################
1386# "Compact" SSE block function.
1387######################################################################
1388
1389sub sse_deccompact()
 # Emit the byte-substitution step of a compact decryption round for
 # the MMX/SSE code path.  Takes no arguments: the state is read from
 # mm0 and mm4, every byte is looked up at -128($tbl,index), and the
 # four rebuilt words are returned packed as t[0],t[1] in mm0 and
 # t[2],t[3] in mm4.  eax, ebx, ecx, edx, $acc and mm1, mm2, mm5, mm6
 # are used as scratch.  The number(s) trailing each line name the
 # state byte(s) that line is handling.
1390{
1391 &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
1392 &movd ("eax","mm1"); # 7, 6, 1, 0
1393
1394 &pshufw ("mm5","mm4",0x09); # 13,12,11,10
1395 &movz ($acc,&LB("eax")); # 0
1396 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
1397 &movd ("ebx","mm5"); # 13,12,11,10
1398 &movz ("edx",&HB("eax")); # 1
1399 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
1400 &shl ("edx",8); # 1
1401
1402 &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
1403 &movz ($acc,&LB("ebx")); # 10
1404 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
1405 &shl ($acc,16); # 10
1406 &or ("ecx",$acc); # 10
1407 &shr ("eax",16); # 7, 6
1408 &movz ($acc,&HB("ebx")); # 11
1409 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
1410 &shl ($acc,24); # 11
1411 &or ("edx",$acc); # 11
1412 &shr ("ebx",16); # 13,12
1413
1414 &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
1415 &movz ($acc,&HB("eax")); # 7
1416 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
1417 &shl ($acc,24); # 7
1418 &or ("ecx",$acc); # 7
1419 &movz ($acc,&HB("ebx")); # 13
1420 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
1421 &shl ($acc,8); # 13
1422 &or ("ecx",$acc); # 13
1423 &movd ("mm0","ecx"); # t[0] collected
1424
 # ecx and edx are reused from here on to accumulate further words
1425 &movz ($acc,&LB("eax")); # 6
1426 &movd ("eax","mm2"); # 3, 2, 5, 4
1427 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
1428 &shl ("ecx",16); # 6
1429 &movz ($acc,&LB("ebx")); # 12
1430 &movd ("ebx","mm6"); # 9, 8,15,14
1431 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
1432 &or ("ecx",$acc); # 12
1433
1434 &movz ($acc,&LB("eax")); # 4
1435 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
1436 &or ("edx",$acc); # 4
1437 &movz ($acc,&LB("ebx")); # 14
1438 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
1439 &shl ($acc,16); # 14
1440 &or ("edx",$acc); # 14
1441 &movd ("mm1","edx"); # t[1] collected
1442
1443 &movz ($acc,&HB("eax")); # 5
1444 &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
1445 &shl ("edx",8); # 5
1446 &movz ($acc,&HB("ebx")); # 15
1447 &shr ("eax",16); # 3, 2
1448 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
1449 &shl ($acc,24); # 15
1450 &or ("edx",$acc); # 15
1451 &shr ("ebx",16); # 9, 8
1452
1453 &punpckldq ("mm0","mm1"); # t[0,1] collected
1454
1455 &movz ($acc,&HB("ebx")); # 9
1456 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
1457 &shl ($acc,8); # 9
1458 &or ("ecx",$acc); # 9
1459 &and ("ebx",0xff); # 8
1460 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
1461 &or ("edx","ebx"); # 8
1462 &movz ($acc,&LB("eax")); # 2
1463 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
1464 &shl ($acc,16); # 2
1465 &or ("edx",$acc); # 2
1466 &movd ("mm4","edx"); # t[2] collected
1467 &movz ("eax",&HB("eax")); # 3
1468 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
1469 &shl ("eax",24); # 3
1470 &or ("ecx","eax"); # 3
1471 &movd ("mm5","ecx"); # t[3] collected
1472
1473 &punpckldq ("mm4","mm5"); # t[2,3] collected
1474}
1475
 # MMX/SSE variant of the compact decryption routine; it is only
 # generated when $x86only is not set.  The state enters and leaves in
 # mm0/mm4, $key points at the key schedule and $tbl at the lookup
 # table.  8(%esp)..15(%esp) of the caller's frame hold a 0x1b mask.
1476 if (!$x86only) {
1477&function_begin_B("_sse_AES_decrypt_compact");
1478 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
1479 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
1480
1481 # note that caller is expected to allocate stack frame for me!
1482 &mov ($acc,&DWP(240,$key)); # load key->rounds
 # $acc = $key + 8*(2*rounds-2) = $key + 16*(rounds-1)
1483 &lea ($acc,&DWP(-2,$acc,$acc));
1484 &lea ($acc,&DWP(0,$key,$acc,8));
1485 &mov ($__end,$acc); # end of key schedule
1486
 # two dword stores build the 64-bit mask later read with movq
1487 &mov ($s0,0x1b1b1b1b); # magic constant
1488 &mov (&DWP(8,"esp"),$s0);
1489 &mov (&DWP(12,"esp"),$s0);
1490
 # the loaded values are discarded; the loads touch the 256-byte
 # table at 32-byte intervals
1491 # prefetch Td4
1492 &mov ($s0,&DWP(0-128,$tbl));
1493 &mov ($s1,&DWP(32-128,$tbl));
1494 &mov ($s2,&DWP(64-128,$tbl));
1495 &mov ($s3,&DWP(96-128,$tbl));
1496 &mov ($s0,&DWP(128-128,$tbl));
1497 &mov ($s1,&DWP(160-128,$tbl));
1498 &mov ($s2,&DWP(192-128,$tbl));
1499 &mov ($s3,&DWP(224-128,$tbl));
1500
1501 &set_label("loop",16);
1502 &sse_deccompact();
 # advance to the next round key; once the pointer is past $__end
 # the transform is skipped and only the final key xor remains
1503 &add ($key,16);
1504 &cmp ($key,$__end);
1505 &ja (&label("out"));
1506
 # both halves of the state (mm0 and mm4) are processed in lock-step,
 # one instruction for each per source line
1507 # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
1508 &movq ("mm3","mm0"); &movq ("mm7","mm4");
1509 &movq ("mm2","mm0",1); &movq ("mm6","mm4",1);
1510 &movq ("mm1","mm0"); &movq ("mm5","mm4");
1511 &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
1512 &pslld ("mm2",8); &pslld ("mm6",8);
1513 &psrld ("mm3",8); &psrld ("mm7",8);
1514 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8
1515 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8
1516 &pslld ("mm2",16); &pslld ("mm6",16);
1517 &psrld ("mm3",16); &psrld ("mm7",16);
1518 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24
1519 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24
1520
 # per-byte doubling: pcmpgtb against zero flags every byte whose top
 # bit is set, the flag is masked down to 0x1b and xored into the
 # byte-wise doubled value
1521 &movq ("mm3",&QWP(8,"esp"));
1522 &pxor ("mm2","mm2"); &pxor ("mm6","mm6");
1523 &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5");
1524 &pand ("mm2","mm3"); &pand ("mm6","mm3");
1525 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1526 &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2
1527 &movq ("mm3","mm1"); &movq ("mm7","mm5");
1528 &movq ("mm2","mm1"); &movq ("mm6","mm5");
1529 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2
1530 &pslld ("mm3",24); &pslld ("mm7",24);
1531 &psrld ("mm2",8); &psrld ("mm6",8);
1532 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24
1533 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8
1534
 # same doubling step again, this time keeping the mask in mm2
1535 &movq ("mm2",&QWP(8,"esp"));
1536 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1537 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1538 &pand ("mm3","mm2"); &pand ("mm7","mm2");
1539 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1540 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
1541 &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
1542 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
1543 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
1544
 # third doubling step; mm2 still holds the 0x1b mask
1545 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1546 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1547 &pand ("mm3","mm2"); &pand ("mm7","mm2");
1548 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1549 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8
1550 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8
1551 &movq ("mm3","mm1"); &movq ("mm7","mm5");
1552 &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);
1553 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
1554 &pslld ("mm1",8); &pslld ("mm5",8);
1555 &psrld ("mm3",8); &psrld ("mm7",8);
 # fetch the round key early; it is xored in just before the jump.
 # The interleaved loads into $s0..$s3 below are never used -
 # presumably they re-touch the table to keep it cached (confirm)
1556 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
1557 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
1558 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
1559 &mov ($s0,&DWP(0-128,$tbl));
1560 &pslld ("mm1",16); &pslld ("mm5",16);
1561 &mov ($s1,&DWP(64-128,$tbl));
1562 &psrld ("mm3",16); &psrld ("mm7",16);
1563 &mov ($s2,&DWP(128-128,$tbl));
1564 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
1565 &mov ($s3,&DWP(192-128,$tbl));
1566 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24
1567
1568 &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
1569 &jmp (&label("loop"));
1570
 # final key addition, using the round key $key now points at
1571 &set_label("out",16);
1572 &pxor ("mm0",&QWP(0,$key));
1573 &pxor ("mm4",&QWP(8,$key));
1574
1575 &ret ();
1576&function_end_B("_sse_AES_decrypt_compact");
1577 }
1578
1579######################################################################
1580# Vanilla block function.
1581######################################################################
473 1582
474sub decstep() 1583sub decstep()
475{ my ($i,$td,@s) = @_; 1584{ my ($i,$td,@s) = @_;
@@ -480,7 +1589,7 @@ sub decstep()
480 # optimal... or rather that all attempts to reorder didn't 1589 # optimal... or rather that all attempts to reorder didn't
481 # result in better performance [which by the way is not a 1590 # result in better performance [which by the way is not a
482 # bit lower than encryption]. 1591 # bit lower than encryption].
483 if($i==3) { &mov ($key,&DWP(12,"esp")); } 1592 if($i==3) { &mov ($key,$__key); }
484 else { &mov ($out,$s[0]); } 1593 else { &mov ($out,$s[0]); }
485 &and ($out,0xFF); 1594 &and ($out,0xFF);
486 &mov ($out,&DWP(0,$td,$out,8)); 1595 &mov ($out,&DWP(0,$td,$out,8));
@@ -495,12 +1604,12 @@ sub decstep()
495 &and ($tmp,0xFF); 1604 &and ($tmp,0xFF);
496 &xor ($out,&DWP(2,$td,$tmp,8)); 1605 &xor ($out,&DWP(2,$td,$tmp,8));
497 1606
498 if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); } 1607 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
499 else { &mov ($tmp,$s[3]); } 1608 else { &mov ($tmp,$s[3]); }
500 &shr ($tmp,24); 1609 &shr ($tmp,24);
501 &xor ($out,&DWP(1,$td,$tmp,8)); 1610 &xor ($out,&DWP(1,$td,$tmp,8));
502 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } 1611 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
503 if ($i==3) { &mov ($s[3],&DWP(4,"esp")); } 1612 if ($i==3) { &mov ($s[3],$__s0); }
504 &comment(); 1613 &comment();
505} 1614}
506 1615
@@ -509,14 +1618,24 @@ sub declast()
509 my $tmp = $key; 1618 my $tmp = $key;
510 my $out = $i==3?$s[0]:$acc; 1619 my $out = $i==3?$s[0]:$acc;
511 1620
512 if($i==3) { &mov ($key,&DWP(12,"esp")); } 1621 if($i==0) { &lea ($td,&DWP(2048+128,$td));
1622 &mov ($tmp,&DWP(0-128,$td));
1623 &mov ($acc,&DWP(32-128,$td));
1624 &mov ($tmp,&DWP(64-128,$td));
1625 &mov ($acc,&DWP(96-128,$td));
1626 &mov ($tmp,&DWP(128-128,$td));
1627 &mov ($acc,&DWP(160-128,$td));
1628 &mov ($tmp,&DWP(192-128,$td));
1629 &mov ($acc,&DWP(224-128,$td));
1630 &lea ($td,&DWP(-128,$td)); }
1631 if($i==3) { &mov ($key,$__key); }
513 else { &mov ($out,$s[0]); } 1632 else { &mov ($out,$s[0]); }
514 &and ($out,0xFF); 1633 &and ($out,0xFF);
515 &movz ($out,&BP(2048,$td,$out,1)); 1634 &movz ($out,&BP(0,$td,$out,1));
516 1635
517 if ($i==3) { $tmp=$s[1]; } 1636 if ($i==3) { $tmp=$s[1]; }
518 &movz ($tmp,&HB($s[1])); 1637 &movz ($tmp,&HB($s[1]));
519 &movz ($tmp,&BP(2048,$td,$tmp,1)); 1638 &movz ($tmp,&BP(0,$td,$tmp,1));
520 &shl ($tmp,8); 1639 &shl ($tmp,8);
521 &xor ($out,$tmp); 1640 &xor ($out,$tmp);
522 1641
@@ -524,24 +1643,24 @@ sub declast()
524 else { mov ($tmp,$s[2]); } 1643 else { mov ($tmp,$s[2]); }
525 &shr ($tmp,16); 1644 &shr ($tmp,16);
526 &and ($tmp,0xFF); 1645 &and ($tmp,0xFF);
527 &movz ($tmp,&BP(2048,$td,$tmp,1)); 1646 &movz ($tmp,&BP(0,$td,$tmp,1));
528 &shl ($tmp,16); 1647 &shl ($tmp,16);
529 &xor ($out,$tmp); 1648 &xor ($out,$tmp);
530 1649
531 if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); } 1650 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
532 else { &mov ($tmp,$s[3]); } 1651 else { &mov ($tmp,$s[3]); }
533 &shr ($tmp,24); 1652 &shr ($tmp,24);
534 &movz ($tmp,&BP(2048,$td,$tmp,1)); 1653 &movz ($tmp,&BP(0,$td,$tmp,1));
535 &shl ($tmp,24); 1654 &shl ($tmp,24);
536 &xor ($out,$tmp); 1655 &xor ($out,$tmp);
537 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } 1656 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
538 if ($i==3) { &mov ($s[3],&DWP(4,"esp")); } 1657 if ($i==3) { &mov ($s[3],$__s0);
1658 &lea ($td,&DWP(-2048,$td)); }
539} 1659}
540 1660
541&public_label("AES_Td"); 1661&function_begin_B("_x86_AES_decrypt");
542&function_begin_C("_x86_AES_decrypt");
543 # note that caller is expected to allocate stack frame for me! 1662 # note that caller is expected to allocate stack frame for me!
544 &mov (&DWP(12,"esp"),$key); # save key 1663 &mov ($__key,$key); # save key
545 1664
546 &xor ($s0,&DWP(0,$key)); # xor with key 1665 &xor ($s0,&DWP(0,$key)); # xor with key
547 &xor ($s1,&DWP(4,$key)); 1666 &xor ($s1,&DWP(4,$key));
@@ -553,20 +1672,19 @@ sub declast()
553 if ($small_footprint) { 1672 if ($small_footprint) {
554 &lea ($acc,&DWP(-2,$acc,$acc)); 1673 &lea ($acc,&DWP(-2,$acc,$acc));
555 &lea ($acc,&DWP(0,$key,$acc,8)); 1674 &lea ($acc,&DWP(0,$key,$acc,8));
556 &mov (&DWP(16,"esp"),$acc); # end of key schedule 1675 &mov ($__end,$acc); # end of key schedule
557 &align (4); 1676 &set_label("loop",16);
558 &set_label("loop"); 1677 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
559 &decstep(0,"ebp",$s0,$s3,$s2,$s1); 1678 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
560 &decstep(1,"ebp",$s1,$s0,$s3,$s2); 1679 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
561 &decstep(2,"ebp",$s2,$s1,$s0,$s3); 1680 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
562 &decstep(3,"ebp",$s3,$s2,$s1,$s0);
563 &add ($key,16); # advance rd_key 1681 &add ($key,16); # advance rd_key
564 &xor ($s0,&DWP(0,$key)); 1682 &xor ($s0,&DWP(0,$key));
565 &xor ($s1,&DWP(4,$key)); 1683 &xor ($s1,&DWP(4,$key));
566 &xor ($s2,&DWP(8,$key)); 1684 &xor ($s2,&DWP(8,$key));
567 &xor ($s3,&DWP(12,$key)); 1685 &xor ($s3,&DWP(12,$key));
568 &cmp ($key,&DWP(16,"esp")); 1686 &cmp ($key,$__end);
569 &mov (&DWP(12,"esp"),$key); 1687 &mov ($__key,$key);
570 &jb (&label("loop")); 1688 &jb (&label("loop"));
571 } 1689 }
572 else { 1690 else {
@@ -575,38 +1693,38 @@ sub declast()
575 &cmp ($acc,12); 1693 &cmp ($acc,12);
576 &jle (&label("12rounds")); 1694 &jle (&label("12rounds"));
577 1695
578 &set_label("14rounds"); 1696 &set_label("14rounds",4);
579 for ($i=1;$i<3;$i++) { 1697 for ($i=1;$i<3;$i++) {
580 &decstep(0,"ebp",$s0,$s3,$s2,$s1); 1698 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
581 &decstep(1,"ebp",$s1,$s0,$s3,$s2); 1699 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
582 &decstep(2,"ebp",$s2,$s1,$s0,$s3); 1700 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
583 &decstep(3,"ebp",$s3,$s2,$s1,$s0); 1701 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
584 &xor ($s0,&DWP(16*$i+0,$key)); 1702 &xor ($s0,&DWP(16*$i+0,$key));
585 &xor ($s1,&DWP(16*$i+4,$key)); 1703 &xor ($s1,&DWP(16*$i+4,$key));
586 &xor ($s2,&DWP(16*$i+8,$key)); 1704 &xor ($s2,&DWP(16*$i+8,$key));
587 &xor ($s3,&DWP(16*$i+12,$key)); 1705 &xor ($s3,&DWP(16*$i+12,$key));
588 } 1706 }
589 &add ($key,32); 1707 &add ($key,32);
590 &mov (&DWP(12,"esp"),$key); # advance rd_key 1708 &mov ($__key,$key); # advance rd_key
591 &set_label("12rounds"); 1709 &set_label("12rounds",4);
592 for ($i=1;$i<3;$i++) { 1710 for ($i=1;$i<3;$i++) {
593 &decstep(0,"ebp",$s0,$s3,$s2,$s1); 1711 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
594 &decstep(1,"ebp",$s1,$s0,$s3,$s2); 1712 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
595 &decstep(2,"ebp",$s2,$s1,$s0,$s3); 1713 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
596 &decstep(3,"ebp",$s3,$s2,$s1,$s0); 1714 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
597 &xor ($s0,&DWP(16*$i+0,$key)); 1715 &xor ($s0,&DWP(16*$i+0,$key));
598 &xor ($s1,&DWP(16*$i+4,$key)); 1716 &xor ($s1,&DWP(16*$i+4,$key));
599 &xor ($s2,&DWP(16*$i+8,$key)); 1717 &xor ($s2,&DWP(16*$i+8,$key));
600 &xor ($s3,&DWP(16*$i+12,$key)); 1718 &xor ($s3,&DWP(16*$i+12,$key));
601 } 1719 }
602 &add ($key,32); 1720 &add ($key,32);
603 &mov (&DWP(12,"esp"),$key); # advance rd_key 1721 &mov ($__key,$key); # advance rd_key
604 &set_label("10rounds"); 1722 &set_label("10rounds",4);
605 for ($i=1;$i<10;$i++) { 1723 for ($i=1;$i<10;$i++) {
606 &decstep(0,"ebp",$s0,$s3,$s2,$s1); 1724 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
607 &decstep(1,"ebp",$s1,$s0,$s3,$s2); 1725 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
608 &decstep(2,"ebp",$s2,$s1,$s0,$s3); 1726 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
609 &decstep(3,"ebp",$s3,$s2,$s1,$s0); 1727 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
610 &xor ($s0,&DWP(16*$i+0,$key)); 1728 &xor ($s0,&DWP(16*$i+0,$key));
611 &xor ($s1,&DWP(16*$i+4,$key)); 1729 &xor ($s1,&DWP(16*$i+4,$key));
612 &xor ($s2,&DWP(16*$i+8,$key)); 1730 &xor ($s2,&DWP(16*$i+8,$key));
@@ -614,10 +1732,10 @@ sub declast()
614 } 1732 }
615 } 1733 }
616 1734
617 &declast(0,"ebp",$s0,$s3,$s2,$s1); 1735 &declast(0,$tbl,$s0,$s3,$s2,$s1);
618 &declast(1,"ebp",$s1,$s0,$s3,$s2); 1736 &declast(1,$tbl,$s1,$s0,$s3,$s2);
619 &declast(2,"ebp",$s2,$s1,$s0,$s3); 1737 &declast(2,$tbl,$s2,$s1,$s0,$s3);
620 &declast(3,"ebp",$s3,$s2,$s1,$s0); 1738 &declast(3,$tbl,$s3,$s2,$s1,$s0);
621 1739
622 &add ($key,$small_footprint?16:160); 1740 &add ($key,$small_footprint?16:160);
623 &xor ($s0,&DWP(0,$key)); 1741 &xor ($s0,&DWP(0,$key));
@@ -692,7 +1810,107 @@ sub declast()
692 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); 1810 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
693 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); 1811 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
694 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); 1812 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
695#Td4: 1813
1814#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
1815 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1816 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1817 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1818 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1819 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1820 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1821 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1822 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1823 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1824 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1825 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1826 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1827 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1828 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1829 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1830 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1831 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1832 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1833 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1834 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1835 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1836 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1837 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1838 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1839 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1840 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1841 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1842 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1843 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1844 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1845 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1846 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1847
1848 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1849 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1850 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1851 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1852 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1853 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1854 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1855 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1856 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1857 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1858 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1859 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1860 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1861 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1862 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1863 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1864 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1865 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1866 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1867 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1868 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1869 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1870 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1871 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1872 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1873 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1874 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1875 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1876 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1877 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1878 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1879 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1880
1881 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1882 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1883 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1884 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1885 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1886 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1887 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1888 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1889 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1890 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1891 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1892 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1893 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1894 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1895 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1896 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1897 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1898 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1899 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1900 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1901 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1902 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1903 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1904 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1905 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1906 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1907 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1908 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1909 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1910 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1911 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1912 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1913
696 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); 1914 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
697 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); 1915 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
698 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); 1916 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
@@ -728,43 +1946,57 @@ sub declast()
728&function_end_B("_x86_AES_decrypt"); 1946&function_end_B("_x86_AES_decrypt");
729 1947
730# void AES_decrypt (const void *inp,void *out,const AES_KEY *key); 1948# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
731&public_label("AES_Td");
732&function_begin("AES_decrypt"); 1949&function_begin("AES_decrypt");
733 &mov ($acc,&wparam(0)); # load inp 1950 &mov ($acc,&wparam(0)); # load inp
734 &mov ($key,&wparam(2)); # load key 1951 &mov ($key,&wparam(2)); # load key
735 1952
736 &mov ($s0,"esp"); 1953 &mov ($s0,"esp");
737 &sub ("esp",24); 1954 &sub ("esp",36);
738 &and ("esp",-64); 1955 &and ("esp",-64); # align to cache-line
739 &add ("esp",4); 1956
740 &mov (&DWP(16,"esp"),$s0); 1957 # place stack frame just "above" the key schedule
1958 &lea ($s1,&DWP(-64-63,$key));
1959 &sub ($s1,"esp");
1960 &neg ($s1);
1961 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1962 &sub ("esp",$s1);
1963 &add ("esp",4); # 4 is reserved for caller's return address
1964 &mov ($_esp,$s0); # save stack pointer
741 1965
742 &call (&label("pic_point")); # make it PIC! 1966 &call (&label("pic_point")); # make it PIC!
743 &set_label("pic_point"); 1967 &set_label("pic_point");
744 &blindpop("ebp"); 1968 &blindpop($tbl);
745 &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); 1969 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
746 1970 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
747 # prefetch Td4 1971
748 &lea ("ebp",&DWP(2048+128,"ebp")); 1972 # pick Td4 copy which can't "overlap" with stack frame or key schedule
749 &mov ($s0,&DWP(0-128,"ebp")); 1973 &lea ($s1,&DWP(768-4,"esp"));
750 &mov ($s1,&DWP(32-128,"ebp")); 1974 &sub ($s1,$tbl);
751 &mov ($s2,&DWP(64-128,"ebp")); 1975 &and ($s1,0x300);
752 &mov ($s3,&DWP(96-128,"ebp")); 1976 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
753 &mov ($s0,&DWP(128-128,"ebp")); 1977
754 &mov ($s1,&DWP(160-128,"ebp")); 1978 if (!$x86only) {
755 &mov ($s2,&DWP(192-128,"ebp")); 1979 &bt (&DWP(0,$s0),25); # check for SSE bit
756 &mov ($s3,&DWP(224-128,"ebp")); 1980 &jnc (&label("x86"));
757 &lea ("ebp",&DWP(-2048-128,"ebp")); 1981
758 1982 &movq ("mm0",&QWP(0,$acc));
1983 &movq ("mm4",&QWP(8,$acc));
1984 &call ("_sse_AES_decrypt_compact");
1985 &mov ("esp",$_esp); # restore stack pointer
1986 &mov ($acc,&wparam(1)); # load out
1987 &movq (&QWP(0,$acc),"mm0"); # write output data
1988 &movq (&QWP(8,$acc),"mm4");
1989 &emms ();
1990 &function_end_A();
1991 }
1992 &set_label("x86",16);
1993 &mov ($_tbl,$tbl);
759 &mov ($s0,&DWP(0,$acc)); # load input data 1994 &mov ($s0,&DWP(0,$acc)); # load input data
760 &mov ($s1,&DWP(4,$acc)); 1995 &mov ($s1,&DWP(4,$acc));
761 &mov ($s2,&DWP(8,$acc)); 1996 &mov ($s2,&DWP(8,$acc));
762 &mov ($s3,&DWP(12,$acc)); 1997 &mov ($s3,&DWP(12,$acc));
763 1998 &call ("_x86_AES_decrypt_compact");
764 &call ("_x86_AES_decrypt"); 1999 &mov ("esp",$_esp); # restore stack pointer
765
766 &mov ("esp",&DWP(16,"esp"));
767
768 &mov ($acc,&wparam(1)); # load out 2000 &mov ($acc,&wparam(1)); # load out
769 &mov (&DWP(0,$acc),$s0); # write output data 2001 &mov (&DWP(0,$acc),$s0); # write output data
770 &mov (&DWP(4,$acc),$s1); 2002 &mov (&DWP(4,$acc),$s1);
@@ -777,126 +2009,136 @@ sub declast()
777# unsigned char *ivp,const int enc); 2009# unsigned char *ivp,const int enc);
778{ 2010{
779# stack frame layout 2011# stack frame layout
780# -4(%esp) 0(%esp) return address 2012# -4(%esp) # return address 0(%esp)
781# 0(%esp) 4(%esp) tmp1 2013# 0(%esp) # s0 backing store 4(%esp)
782# 4(%esp) 8(%esp) tmp2 2014# 4(%esp) # s1 backing store 8(%esp)
783# 8(%esp) 12(%esp) key 2015# 8(%esp) # s2 backing store 12(%esp)
784# 12(%esp) 16(%esp) end of key schedule 2016# 12(%esp) # s3 backing store 16(%esp)
785my $_esp=&DWP(16,"esp"); #saved %esp 2017# 16(%esp) # key backup 20(%esp)
786my $_inp=&DWP(20,"esp"); #copy of wparam(0) 2018# 20(%esp) # end of key schedule 24(%esp)
787my $_out=&DWP(24,"esp"); #copy of wparam(1) 2019# 24(%esp) # %ebp backup 28(%esp)
788my $_len=&DWP(28,"esp"); #copy of wparam(2) 2020# 28(%esp) # %esp backup
789my $_key=&DWP(32,"esp"); #copy of wparam(3) 2021my $_inp=&DWP(32,"esp"); # copy of wparam(0)
790my $_ivp=&DWP(36,"esp"); #copy of wparam(4) 2022my $_out=&DWP(36,"esp"); # copy of wparam(1)
791my $_tmp=&DWP(40,"esp"); #volatile variable 2023my $_len=&DWP(40,"esp"); # copy of wparam(2)
792my $ivec=&DWP(44,"esp"); #ivec[16] 2024my $_key=&DWP(44,"esp"); # copy of wparam(3)
793my $aes_key=&DWP(60,"esp"); #copy of aes_key 2025my $_ivp=&DWP(48,"esp"); # copy of wparam(4)
794my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds 2026my $_tmp=&DWP(52,"esp"); # volatile variable
795 2027#
796&public_label("AES_Te"); 2028my $ivec=&DWP(60,"esp"); # ivec[16]
797&public_label("AES_Td"); 2029my $aes_key=&DWP(76,"esp"); # copy of aes_key
2030my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
2031
798&function_begin("AES_cbc_encrypt"); 2032&function_begin("AES_cbc_encrypt");
799 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len 2033 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
800 &cmp ($s2,0); 2034 &cmp ($s2,0);
801 &je (&label("enc_out")); 2035 &je (&label("drop_out"));
802 2036
803 &call (&label("pic_point")); # make it PIC! 2037 &call (&label("pic_point")); # make it PIC!
804 &set_label("pic_point"); 2038 &set_label("pic_point");
805 &blindpop("ebp"); 2039 &blindpop($tbl);
806 2040 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
807 &pushf ();
808 &cld ();
809 2041
810 &cmp (&wparam(5),0); 2042 &cmp (&wparam(5),0);
811 &je (&label("DECRYPT")); 2043 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
2044 &jne (&label("picked_te"));
2045 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
2046 &set_label("picked_te");
812 2047
813 &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); 2048 # one can argue if this is required
814 2049 &pushf ();
815 # allocate aligned stack frame... 2050 &cld ();
816 &lea ($key,&DWP(-64-244,"esp"));
817 &and ($key,-64);
818 2051
819 # ... and make sure it doesn't alias with AES_Te modulo 4096 2052 &cmp ($s2,$speed_limit);
820 &mov ($s0,"ebp"); 2053 &jb (&label("slow_way"));
821 &lea ($s1,&DWP(2048,"ebp")); 2054 &test ($s2,15);
822 &mov ($s3,$key); 2055 &jnz (&label("slow_way"));
2056 if (!$x86only) {
2057 &bt (&DWP(0,$s0),28); # check for hyper-threading bit
2058 &jc (&label("slow_way"));
2059 }
2060 # pre-allocate aligned stack frame...
2061 &lea ($acc,&DWP(-80-244,"esp"));
2062 &and ($acc,-64);
2063
2064 # ... and make sure it doesn't alias with $tbl modulo 4096
2065 &mov ($s0,$tbl);
2066 &lea ($s1,&DWP(2048+256,$tbl));
2067 &mov ($s3,$acc);
823 &and ($s0,0xfff); # s = %ebp&0xfff 2068 &and ($s0,0xfff); # s = %ebp&0xfff
824 &and ($s1,0xfff); # e = (%ebp+2048)&0xfff 2069 &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
825 &and ($s3,0xfff); # p = %esp&0xfff 2070 &and ($s3,0xfff); # p = %esp&0xfff
826 2071
827 &cmp ($s3,$s1); # if (p>=e) %esp -= (p-e); 2072 &cmp ($s3,$s1); # if (p>=e) %esp -= (p-e);
828 &jb (&label("te_break_out")); 2073 &jb (&label("tbl_break_out"));
829 &sub ($s3,$s1); 2074 &sub ($s3,$s1);
830 &sub ($key,$s3); 2075 &sub ($acc,$s3);
831 &jmp (&label("te_ok")); 2076 &jmp (&label("tbl_ok"));
832 &set_label("te_break_out"); # else %esp -= (p-s)&0xfff + framesz; 2077 &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz;
833 &sub ($s3,$s0); 2078 &sub ($s3,$s0);
834 &and ($s3,0xfff); 2079 &and ($s3,0xfff);
835 &add ($s3,64+256); 2080 &add ($s3,384);
836 &sub ($key,$s3); 2081 &sub ($acc,$s3);
837 &align (4); 2082 &set_label("tbl_ok",4);
838 &set_label("te_ok");
839 2083
840 &mov ($s0,&wparam(0)); # load inp 2084 &lea ($s3,&wparam(0)); # obtain pointer to parameter block
841 &mov ($s1,&wparam(1)); # load out 2085 &exch ("esp",$acc); # allocate stack frame
842 &mov ($s3,&wparam(3)); # load key
843 &mov ($acc,&wparam(4)); # load ivp
844
845 &exch ("esp",$key);
846 &add ("esp",4); # reserve for return address! 2086 &add ("esp",4); # reserve for return address!
847 &mov ($_esp,$key); # save %esp 2087 &mov ($_tbl,$tbl); # save %ebp
2088 &mov ($_esp,$acc); # save %esp
2089
2090 &mov ($s0,&DWP(0,$s3)); # load inp
2091 &mov ($s1,&DWP(4,$s3)); # load out
2092 #&mov ($s2,&DWP(8,$s3)); # load len
2093 &mov ($key,&DWP(12,$s3)); # load key
2094 &mov ($acc,&DWP(16,$s3)); # load ivp
2095 &mov ($s3,&DWP(20,$s3)); # load enc flag
848 2096
849 &mov ($_inp,$s0); # save copy of inp 2097 &mov ($_inp,$s0); # save copy of inp
850 &mov ($_out,$s1); # save copy of out 2098 &mov ($_out,$s1); # save copy of out
851 &mov ($_len,$s2); # save copy of len 2099 &mov ($_len,$s2); # save copy of len
852 &mov ($_key,$s3); # save copy of key 2100 &mov ($_key,$key); # save copy of key
853 &mov ($_ivp,$acc); # save copy of ivp 2101 &mov ($_ivp,$acc); # save copy of ivp
854 2102
855 &mov ($mark,0); # copy of aes_key->rounds = 0; 2103 &mov ($mark,0); # copy of aes_key->rounds = 0;
856 if ($compromise) {
857 &cmp ($s2,$compromise);
858 &jb (&label("skip_ecopy"));
859 }
860 # do we copy key schedule to stack? 2104 # do we copy key schedule to stack?
861 &mov ($s1 eq "ebx" ? $s1 : "",$s3); 2105 &mov ($s1 eq "ebx" ? $s1 : "",$key);
862 &mov ($s2 eq "ecx" ? $s2 : "",244/4); 2106 &mov ($s2 eq "ecx" ? $s2 : "",244/4);
863 &sub ($s1,"ebp"); 2107 &sub ($s1,$tbl);
864 &mov ("esi",$s3); 2108 &mov ("esi",$key);
865 &and ($s1,0xfff); 2109 &and ($s1,0xfff);
866 &lea ("edi",$aes_key); 2110 &lea ("edi",$aes_key);
867 &cmp ($s1,2048); 2111 &cmp ($s1,2048+256);
868 &jb (&label("do_ecopy")); 2112 &jb (&label("do_copy"));
869 &cmp ($s1,4096-244); 2113 &cmp ($s1,4096-244);
870 &jb (&label("skip_ecopy")); 2114 &jb (&label("skip_copy"));
871 &align (4); 2115 &set_label("do_copy",4);
872 &set_label("do_ecopy");
873 &mov ($_key,"edi"); 2116 &mov ($_key,"edi");
874 &data_word(0xA5F3F689); # rep movsd 2117 &data_word(0xA5F3F689); # rep movsd
875 &set_label("skip_ecopy"); 2118 &set_label("skip_copy");
876 2119
877 &mov ($acc,$s0);
878 &mov ($key,16); 2120 &mov ($key,16);
879 &align (4); 2121 &set_label("prefetch_tbl",4);
880 &set_label("prefetch_te"); 2122 &mov ($s0,&DWP(0,$tbl));
881 &mov ($s0,&DWP(0,"ebp")); 2123 &mov ($s1,&DWP(32,$tbl));
882 &mov ($s1,&DWP(32,"ebp")); 2124 &mov ($s2,&DWP(64,$tbl));
883 &mov ($s2,&DWP(64,"ebp")); 2125 &mov ($acc,&DWP(96,$tbl));
884 &mov ($s3,&DWP(96,"ebp")); 2126 &lea ($tbl,&DWP(128,$tbl));
885 &lea ("ebp",&DWP(128,"ebp")); 2127 &sub ($key,1);
886 &dec ($key); 2128 &jnz (&label("prefetch_tbl"));
887 &jnz (&label("prefetch_te")); 2129 &sub ($tbl,2048);
888 &sub ("ebp",2048); 2130
889 2131 &mov ($acc,$_inp);
890 &mov ($s2,$_len);
891 &mov ($key,$_ivp); 2132 &mov ($key,$_ivp);
892 &test ($s2,0xFFFFFFF0);
893 &jz (&label("enc_tail")); # short input...
894 2133
2134 &cmp ($s3,0);
2135 &je (&label("fast_decrypt"));
2136
2137#----------------------------- ENCRYPT -----------------------------#
895 &mov ($s0,&DWP(0,$key)); # load iv 2138 &mov ($s0,&DWP(0,$key)); # load iv
896 &mov ($s1,&DWP(4,$key)); 2139 &mov ($s1,&DWP(4,$key));
897 2140
898 &align (4); 2141 &set_label("fast_enc_loop",16);
899 &set_label("enc_loop");
900 &mov ($s2,&DWP(8,$key)); 2142 &mov ($s2,&DWP(8,$key));
901 &mov ($s3,&DWP(12,$key)); 2143 &mov ($s3,&DWP(12,$key));
902 2144
@@ -916,22 +2158,16 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
916 &mov (&DWP(8,$key),$s2); 2158 &mov (&DWP(8,$key),$s2);
917 &mov (&DWP(12,$key),$s3); 2159 &mov (&DWP(12,$key),$s3);
918 2160
2161 &lea ($acc,&DWP(16,$acc)); # advance inp
919 &mov ($s2,$_len); # load len 2162 &mov ($s2,$_len); # load len
920
921 &lea ($acc,&DWP(16,$acc));
922 &mov ($_inp,$acc); # save inp 2163 &mov ($_inp,$acc); # save inp
923 2164 &lea ($s3,&DWP(16,$key)); # advance out
924 &lea ($s3,&DWP(16,$key));
925 &mov ($_out,$s3); # save out 2165 &mov ($_out,$s3); # save out
926 2166 &sub ($s2,16); # decrease len
927 &sub ($s2,16);
928 &test ($s2,0xFFFFFFF0);
929 &mov ($_len,$s2); # save len 2167 &mov ($_len,$s2); # save len
930 &jnz (&label("enc_loop")); 2168 &jnz (&label("fast_enc_loop"));
931 &test ($s2,15);
932 &jnz (&label("enc_tail"));
933 &mov ($acc,$_ivp); # load ivp 2169 &mov ($acc,$_ivp); # load ivp
934 &mov ($s2,&DWP(8,$key)); # restore last dwords 2170 &mov ($s2,&DWP(8,$key)); # restore last 2 dwords
935 &mov ($s3,&DWP(12,$key)); 2171 &mov ($s3,&DWP(12,$key));
936 &mov (&DWP(0,$acc),$s0); # save ivec 2172 &mov (&DWP(0,$acc),$s0); # save ivec
937 &mov (&DWP(4,$acc),$s1); 2173 &mov (&DWP(4,$acc),$s1);
@@ -949,125 +2185,20 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
949 &set_label("skip_ezero") 2185 &set_label("skip_ezero")
950 &mov ("esp",$_esp); 2186 &mov ("esp",$_esp);
951 &popf (); 2187 &popf ();
952 &set_label("enc_out"); 2188 &set_label("drop_out");
953 &function_end_A(); 2189 &function_end_A();
954 &pushf (); # kludge, never executed 2190 &pushf (); # kludge, never executed
955 2191
956 &align (4);
957 &set_label("enc_tail");
958 &mov ($s0,$key eq "edi" ? $key : "");
959 &mov ($key,$_out); # load out
960 &push ($s0); # push ivp
961 &mov ($s1,16);
962 &sub ($s1,$s2);
963 &cmp ($key,$acc); # compare with inp
964 &je (&label("enc_in_place"));
965 &align (4);
966 &data_word(0xA4F3F689); # rep movsb # copy input
967 &jmp (&label("enc_skip_in_place"));
968 &set_label("enc_in_place");
969 &lea ($key,&DWP(0,$key,$s2));
970 &set_label("enc_skip_in_place");
971 &mov ($s2,$s1);
972 &xor ($s0,$s0);
973 &align (4);
974 &data_word(0xAAF3F689); # rep stosb # zero tail
975 &pop ($key); # pop ivp
976
977 &mov ($acc,$_out); # output as input
978 &mov ($s0,&DWP(0,$key));
979 &mov ($s1,&DWP(4,$key));
980 &mov ($_len,16); # len=16
981 &jmp (&label("enc_loop")); # one more spin...
982
983#----------------------------- DECRYPT -----------------------------# 2192#----------------------------- DECRYPT -----------------------------#
984&align (4); 2193&set_label("fast_decrypt",16);
985&set_label("DECRYPT");
986 &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));
987
988 # allocate aligned stack frame...
989 &lea ($key,&DWP(-64-244,"esp"));
990 &and ($key,-64);
991
992 # ... and make sure it doesn't alias with AES_Td modulo 4096
993 &mov ($s0,"ebp");
994 &lea ($s1,&DWP(2048+256,"ebp"));
995 &mov ($s3,$key);
996 &and ($s0,0xfff); # s = %ebp&0xfff
997 &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
998 &and ($s3,0xfff); # p = %esp&0xfff
999
1000 &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e);
1001 &jb (&label("td_break_out"));
1002 &sub ($s3,$s1);
1003 &sub ($key,$s3);
1004 &jmp (&label("td_ok"));
1005 &set_label("td_break_out"); # else %esp -= (p-s)&0xfff + framesz;
1006 &sub ($s3,$s0);
1007 &and ($s3,0xfff);
1008 &add ($s3,64+256);
1009 &sub ($key,$s3);
1010 &align (4);
1011 &set_label("td_ok");
1012
1013 &mov ($s0,&wparam(0)); # load inp
1014 &mov ($s1,&wparam(1)); # load out
1015 &mov ($s3,&wparam(3)); # load key
1016 &mov ($acc,&wparam(4)); # load ivp
1017
1018 &exch ("esp",$key);
1019 &add ("esp",4); # reserve for return address!
1020 &mov ($_esp,$key); # save %esp
1021
1022 &mov ($_inp,$s0); # save copy of inp
1023 &mov ($_out,$s1); # save copy of out
1024 &mov ($_len,$s2); # save copy of len
1025 &mov ($_key,$s3); # save copy of key
1026 &mov ($_ivp,$acc); # save copy of ivp
1027
1028 &mov ($mark,0); # copy of aes_key->rounds = 0;
1029 if ($compromise) {
1030 &cmp ($s2,$compromise);
1031 &jb (&label("skip_dcopy"));
1032 }
1033 # do we copy key schedule to stack?
1034 &mov ($s1 eq "ebx" ? $s1 : "",$s3);
1035 &mov ($s2 eq "ecx" ? $s2 : "",244/4);
1036 &sub ($s1,"ebp");
1037 &mov ("esi",$s3);
1038 &and ($s1,0xfff);
1039 &lea ("edi",$aes_key);
1040 &cmp ($s1,2048+256);
1041 &jb (&label("do_dcopy"));
1042 &cmp ($s1,4096-244);
1043 &jb (&label("skip_dcopy"));
1044 &align (4);
1045 &set_label("do_dcopy");
1046 &mov ($_key,"edi");
1047 &data_word(0xA5F3F689); # rep movsd
1048 &set_label("skip_dcopy");
1049
1050 &mov ($acc,$s0);
1051 &mov ($key,18);
1052 &align (4);
1053 &set_label("prefetch_td");
1054 &mov ($s0,&DWP(0,"ebp"));
1055 &mov ($s1,&DWP(32,"ebp"));
1056 &mov ($s2,&DWP(64,"ebp"));
1057 &mov ($s3,&DWP(96,"ebp"));
1058 &lea ("ebp",&DWP(128,"ebp"));
1059 &dec ($key);
1060 &jnz (&label("prefetch_td"));
1061 &sub ("ebp",2048+256);
1062 2194
1063 &cmp ($acc,$_out); 2195 &cmp ($acc,$_out);
1064 &je (&label("dec_in_place")); # in-place processing... 2196 &je (&label("fast_dec_in_place")); # in-place processing...
1065 2197
1066 &mov ($key,$_ivp); # load ivp
1067 &mov ($_tmp,$key); 2198 &mov ($_tmp,$key);
1068 2199
1069 &align (4); 2200 &align (4);
1070 &set_label("dec_loop"); 2201 &set_label("fast_dec_loop",16);
1071 &mov ($s0,&DWP(0,$acc)); # read input 2202 &mov ($s0,&DWP(0,$acc)); # read input
1072 &mov ($s1,&DWP(4,$acc)); 2203 &mov ($s1,&DWP(4,$acc));
1073 &mov ($s2,&DWP(8,$acc)); 2204 &mov ($s2,&DWP(8,$acc));
@@ -1083,27 +2214,24 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
1083 &xor ($s2,&DWP(8,$key)); 2214 &xor ($s2,&DWP(8,$key));
1084 &xor ($s3,&DWP(12,$key)); 2215 &xor ($s3,&DWP(12,$key));
1085 2216
1086 &sub ($acc,16);
1087 &jc (&label("dec_partial"));
1088 &mov ($_len,$acc); # save len
1089 &mov ($acc,$_inp); # load inp
1090 &mov ($key,$_out); # load out 2217 &mov ($key,$_out); # load out
2218 &mov ($acc,$_inp); # load inp
1091 2219
1092 &mov (&DWP(0,$key),$s0); # write output 2220 &mov (&DWP(0,$key),$s0); # write output
1093 &mov (&DWP(4,$key),$s1); 2221 &mov (&DWP(4,$key),$s1);
1094 &mov (&DWP(8,$key),$s2); 2222 &mov (&DWP(8,$key),$s2);
1095 &mov (&DWP(12,$key),$s3); 2223 &mov (&DWP(12,$key),$s3);
1096 2224
2225 &mov ($s2,$_len); # load len
1097 &mov ($_tmp,$acc); # save ivp 2226 &mov ($_tmp,$acc); # save ivp
1098 &lea ($acc,&DWP(16,$acc)); 2227 &lea ($acc,&DWP(16,$acc)); # advance inp
1099 &mov ($_inp,$acc); # save inp 2228 &mov ($_inp,$acc); # save inp
1100 2229 &lea ($key,&DWP(16,$key)); # advance out
1101 &lea ($key,&DWP(16,$key));
1102 &mov ($_out,$key); # save out 2230 &mov ($_out,$key); # save out
1103 2231 &sub ($s2,16); # decrease len
1104 &jnz (&label("dec_loop")); 2232 &mov ($_len,$s2); # save len
2233 &jnz (&label("fast_dec_loop"));
1105 &mov ($key,$_tmp); # load temp ivp 2234 &mov ($key,$_tmp); # load temp ivp
1106 &set_label("dec_end");
1107 &mov ($acc,$_ivp); # load user ivp 2235 &mov ($acc,$_ivp); # load user ivp
1108 &mov ($s0,&DWP(0,$key)); # load iv 2236 &mov ($s0,&DWP(0,$key)); # load iv
1109 &mov ($s1,&DWP(4,$key)); 2237 &mov ($s1,&DWP(4,$key));
@@ -1113,31 +2241,16 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
1113 &mov (&DWP(4,$acc),$s1); 2241 &mov (&DWP(4,$acc),$s1);
1114 &mov (&DWP(8,$acc),$s2); 2242 &mov (&DWP(8,$acc),$s2);
1115 &mov (&DWP(12,$acc),$s3); 2243 &mov (&DWP(12,$acc),$s3);
1116 &jmp (&label("dec_out")); 2244 &jmp (&label("fast_dec_out"));
1117 2245
1118 &align (4); 2246 &set_label("fast_dec_in_place",16);
1119 &set_label("dec_partial"); 2247 &set_label("fast_dec_in_place_loop");
1120 &lea ($key,$ivec);
1121 &mov (&DWP(0,$key),$s0); # dump output to stack
1122 &mov (&DWP(4,$key),$s1);
1123 &mov (&DWP(8,$key),$s2);
1124 &mov (&DWP(12,$key),$s3);
1125 &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc));
1126 &mov ($acc eq "esi" ? $acc : "",$key);
1127 &mov ($key eq "edi" ? $key : "",$_out); # load out
1128 &data_word(0xA4F3F689); # rep movsb # copy output
1129 &mov ($key,$_inp); # use inp as temp ivp
1130 &jmp (&label("dec_end"));
1131
1132 &align (4);
1133 &set_label("dec_in_place");
1134 &set_label("dec_in_place_loop");
1135 &lea ($key,$ivec);
1136 &mov ($s0,&DWP(0,$acc)); # read input 2248 &mov ($s0,&DWP(0,$acc)); # read input
1137 &mov ($s1,&DWP(4,$acc)); 2249 &mov ($s1,&DWP(4,$acc));
1138 &mov ($s2,&DWP(8,$acc)); 2250 &mov ($s2,&DWP(8,$acc));
1139 &mov ($s3,&DWP(12,$acc)); 2251 &mov ($s3,&DWP(12,$acc));
1140 2252
2253 &lea ($key,$ivec);
1141 &mov (&DWP(0,$key),$s0); # copy to temp 2254 &mov (&DWP(0,$key),$s0); # copy to temp
1142 &mov (&DWP(4,$key),$s1); 2255 &mov (&DWP(4,$key),$s1);
1143 &mov (&DWP(8,$key),$s2); 2256 &mov (&DWP(8,$key),$s2);
@@ -1158,7 +2271,7 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
1158 &mov (&DWP(8,$acc),$s2); 2271 &mov (&DWP(8,$acc),$s2);
1159 &mov (&DWP(12,$acc),$s3); 2272 &mov (&DWP(12,$acc),$s3);
1160 2273
1161 &lea ($acc,&DWP(16,$acc)); 2274 &lea ($acc,&DWP(16,$acc)); # advance out
1162 &mov ($_out,$acc); # save out 2275 &mov ($_out,$acc); # save out
1163 2276
1164 &lea ($acc,$ivec); 2277 &lea ($acc,$ivec);
@@ -1173,40 +2286,340 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
1173 &mov (&DWP(12,$key),$s3); 2286 &mov (&DWP(12,$key),$s3);
1174 2287
1175 &mov ($acc,$_inp); # load inp 2288 &mov ($acc,$_inp); # load inp
2289 &mov ($s2,$_len); # load len
2290 &lea ($acc,&DWP(16,$acc)); # advance inp
2291 &mov ($_inp,$acc); # save inp
2292 &sub ($s2,16); # decrease len
2293 &mov ($_len,$s2); # save len
2294 &jnz (&label("fast_dec_in_place_loop"));
2295
2296 &set_label("fast_dec_out",4);
2297 &cmp ($mark,0); # was the key schedule copied?
2298 &mov ("edi",$_key);
2299 &je (&label("skip_dzero"));
2300 # zero copy of key schedule
2301 &mov ("ecx",240/4);
2302 &xor ("eax","eax");
2303 &align (4);
2304 &data_word(0xABF3F689); # rep stosd
2305 &set_label("skip_dzero")
2306 &mov ("esp",$_esp);
2307 &popf ();
2308 &function_end_A();
2309 &pushf (); # kludge, never executed
2310
2311#--------------------------- SLOW ROUTINE ---------------------------#
2312&set_label("slow_way",16);
2313
2314 &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
2315 &mov ($key,&wparam(3)); # load key
2316
2317 # pre-allocate aligned stack frame...
2318 &lea ($acc,&DWP(-80,"esp"));
2319 &and ($acc,-64);
2320
2321 # ... and make sure it doesn't alias with $key modulo 1024
2322 &lea ($s1,&DWP(-80-63,$key));
2323 &sub ($s1,$acc);
2324 &neg ($s1);
2325 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
2326 &sub ($acc,$s1);
2327
2328 # pick S-box copy which can't overlap with stack frame or $key
2329 &lea ($s1,&DWP(768,$acc));
2330 &sub ($s1,$tbl);
2331 &and ($s1,0x300);
2332 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
2333
2334 &lea ($s3,&wparam(0)); # pointer to parameter block
2335
2336 &exch ("esp",$acc);
2337 &add ("esp",4); # reserve for return address!
2338 &mov ($_tbl,$tbl); # save %ebp
2339 &mov ($_esp,$acc); # save %esp
2340 &mov ($_tmp,$s0); # save OPENSSL_ia32cap
2341
2342 &mov ($s0,&DWP(0,$s3)); # load inp
2343 &mov ($s1,&DWP(4,$s3)); # load out
2344 #&mov ($s2,&DWP(8,$s3)); # load len
2345 #&mov ($key,&DWP(12,$s3)); # load key
2346 &mov ($acc,&DWP(16,$s3)); # load ivp
2347 &mov ($s3,&DWP(20,$s3)); # load enc flag
2348
2349 &mov ($_inp,$s0); # save copy of inp
2350 &mov ($_out,$s1); # save copy of out
2351 &mov ($_len,$s2); # save copy of len
2352 &mov ($_key,$key); # save copy of key
2353 &mov ($_ivp,$acc); # save copy of ivp
2354
2355 &mov ($key,$acc);
2356 &mov ($acc,$s0);
2357
2358 &cmp ($s3,0);
2359 &je (&label("slow_decrypt"));
2360
2361#--------------------------- SLOW ENCRYPT ---------------------------#
2362 &cmp ($s2,16);
2363 &mov ($s3,$s1);
2364 &jb (&label("slow_enc_tail"));
2365
2366 if (!$x86only) {
2367 &bt ($_tmp,25); # check for SSE bit
2368 &jnc (&label("slow_enc_x86"));
2369
2370 &movq ("mm0",&QWP(0,$key)); # load iv
2371 &movq ("mm4",&QWP(8,$key));
1176 2372
1177 &lea ($acc,&DWP(16,$acc)); 2373 &set_label("slow_enc_loop_sse",16);
2374 &pxor ("mm0",&QWP(0,$acc)); # xor input data
2375 &pxor ("mm4",&QWP(8,$acc));
2376
2377 &mov ($key,$_key);
2378 &call ("_sse_AES_encrypt_compact");
2379
2380 &mov ($acc,$_inp); # load inp
2381 &mov ($key,$_out); # load out
2382 &mov ($s2,$_len); # load len
2383
2384 &movq (&QWP(0,$key),"mm0"); # save output data
2385 &movq (&QWP(8,$key),"mm4");
2386
2387 &lea ($acc,&DWP(16,$acc)); # advance inp
1178 &mov ($_inp,$acc); # save inp 2388 &mov ($_inp,$acc); # save inp
2389 &lea ($s3,&DWP(16,$key)); # advance out
2390 &mov ($_out,$s3); # save out
2391 &sub ($s2,16); # decrease len
2392 &cmp ($s2,16);
2393 &mov ($_len,$s2); # save len
2394 &jae (&label("slow_enc_loop_sse"));
2395 &test ($s2,15);
2396 &jnz (&label("slow_enc_tail"));
2397 &mov ($acc,$_ivp); # load ivp
2398 &movq (&QWP(0,$acc),"mm0"); # save ivec
2399 &movq (&QWP(8,$acc),"mm4");
2400 &emms ();
2401 &mov ("esp",$_esp);
2402 &popf ();
2403 &function_end_A();
2404 &pushf (); # kludge, never executed
2405 }
2406 &set_label("slow_enc_x86",16);
2407 &mov ($s0,&DWP(0,$key)); # load iv
2408 &mov ($s1,&DWP(4,$key));
2409
2410 &set_label("slow_enc_loop_x86",4);
2411 &mov ($s2,&DWP(8,$key));
2412 &mov ($s3,&DWP(12,$key));
2413
2414 &xor ($s0,&DWP(0,$acc)); # xor input data
2415 &xor ($s1,&DWP(4,$acc));
2416 &xor ($s2,&DWP(8,$acc));
2417 &xor ($s3,&DWP(12,$acc));
2418
2419 &mov ($key,$_key); # load key
2420 &call ("_x86_AES_encrypt_compact");
2421
2422 &mov ($acc,$_inp); # load inp
2423 &mov ($key,$_out); # load out
2424
2425 &mov (&DWP(0,$key),$s0); # save output data
2426 &mov (&DWP(4,$key),$s1);
2427 &mov (&DWP(8,$key),$s2);
2428 &mov (&DWP(12,$key),$s3);
1179 2429
1180 &mov ($s2,$_len); # load len 2430 &mov ($s2,$_len); # load len
1181 &sub ($s2,16); 2431 &lea ($acc,&DWP(16,$acc)); # advance inp
1182 &jc (&label("dec_in_place_partial")); 2432 &mov ($_inp,$acc); # save inp
2433 &lea ($s3,&DWP(16,$key)); # advance out
2434 &mov ($_out,$s3); # save out
2435 &sub ($s2,16); # decrease len
2436 &cmp ($s2,16);
1183 &mov ($_len,$s2); # save len 2437 &mov ($_len,$s2); # save len
1184 &jnz (&label("dec_in_place_loop")); 2438 &jae (&label("slow_enc_loop_x86"));
1185 &jmp (&label("dec_out")); 2439 &test ($s2,15);
1186 2440 &jnz (&label("slow_enc_tail"));
1187 &align (4); 2441 &mov ($acc,$_ivp); # load ivp
1188 &set_label("dec_in_place_partial"); 2442 &mov ($s2,&DWP(8,$key)); # restore last dwords
1189 # one can argue if this is actually required... 2443 &mov ($s3,&DWP(12,$key));
1190 &mov ($key eq "edi" ? $key : "",$_out); 2444 &mov (&DWP(0,$acc),$s0); # save ivec
1191 &lea ($acc eq "esi" ? $acc : "",$ivec); 2445 &mov (&DWP(4,$acc),$s1);
2446 &mov (&DWP(8,$acc),$s2);
2447 &mov (&DWP(12,$acc),$s3);
2448
2449 &mov ("esp",$_esp);
2450 &popf ();
2451 &function_end_A();
2452 &pushf (); # kludge, never executed
2453
2454 &set_label("slow_enc_tail",16);
2455 &emms () if (!$x86only);
2456 &mov ($key eq "edi"? $key:"",$s3); # load out to edi
2457 &mov ($s1,16);
2458 &sub ($s1,$s2);
2459 &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp
2460 &je (&label("enc_in_place"));
2461 &align (4);
2462 &data_word(0xA4F3F689); # rep movsb # copy input
2463 &jmp (&label("enc_skip_in_place"));
2464 &set_label("enc_in_place");
1192 &lea ($key,&DWP(0,$key,$s2)); 2465 &lea ($key,&DWP(0,$key,$s2));
1193 &lea ($acc,&DWP(16,$acc,$s2)); 2466 &set_label("enc_skip_in_place");
1194 &neg ($s2 eq "ecx" ? $s2 : ""); 2467 &mov ($s2,$s1);
1195 &data_word(0xA4F3F689); # rep movsb # restore tail 2468 &xor ($s0,$s0);
1196 2469 &align (4);
1197 &align (4); 2470 &data_word(0xAAF3F689); # rep stosb # zero tail
1198 &set_label("dec_out"); 2471
1199 &cmp ($mark,0); # was the key schedule copied? 2472 &mov ($key,$_ivp); # restore ivp
1200 &mov ("edi",$_key); 2473 &mov ($acc,$s3); # output as input
1201 &je (&label("skip_dzero")); 2474 &mov ($s0,&DWP(0,$key));
1202 # zero copy of key schedule 2475 &mov ($s1,&DWP(4,$key));
1203 &mov ("ecx",240/4); 2476 &mov ($_len,16); # len=16
1204 &xor ("eax","eax"); 2477 &jmp (&label("slow_enc_loop_x86")); # one more spin...
1205 &align (4); 2478
1206 &data_word(0xABF3F689); # rep stosd 2479#--------------------------- SLOW DECRYPT ---------------------------#
1207 &set_label("skip_dzero") 2480&set_label("slow_decrypt",16);
1208 &mov ("esp",$_esp); 2481 if (!$x86only) {
1209 &popf (); 2482 &bt ($_tmp,25); # check for SSE bit
2483 &jnc (&label("slow_dec_loop_x86"));
2484
2485 &set_label("slow_dec_loop_sse",4);
2486 &movq ("mm0",&QWP(0,$acc)); # read input
2487 &movq ("mm4",&QWP(8,$acc));
2488
2489 &mov ($key,$_key);
2490 &call ("_sse_AES_decrypt_compact");
2491
2492 &mov ($acc,$_inp); # load inp
2493 &lea ($s0,$ivec);
2494 &mov ($s1,$_out); # load out
2495 &mov ($s2,$_len); # load len
2496 &mov ($key,$_ivp); # load ivp
2497
2498 &movq ("mm1",&QWP(0,$acc)); # re-read input
2499 &movq ("mm5",&QWP(8,$acc));
2500
2501 &pxor ("mm0",&QWP(0,$key)); # xor iv
2502 &pxor ("mm4",&QWP(8,$key));
2503
2504 &movq (&QWP(0,$key),"mm1"); # copy input to iv
2505 &movq (&QWP(8,$key),"mm5");
2506
2507 &sub ($s2,16); # decrease len
2508 &jc (&label("slow_dec_partial_sse"));
2509
2510 &movq (&QWP(0,$s1),"mm0"); # write output
2511 &movq (&QWP(8,$s1),"mm4");
2512
2513 &lea ($s1,&DWP(16,$s1)); # advance out
2514 &mov ($_out,$s1); # save out
2515 &lea ($acc,&DWP(16,$acc)); # advance inp
2516 &mov ($_inp,$acc); # save inp
2517 &mov ($_len,$s2); # save len
2518 &jnz (&label("slow_dec_loop_sse"));
2519 &emms ();
2520 &mov ("esp",$_esp);
2521 &popf ();
2522 &function_end_A();
2523 &pushf (); # kludge, never executed
2524
2525 &set_label("slow_dec_partial_sse",16);
2526 &movq (&QWP(0,$s0),"mm0"); # save output to temp
2527 &movq (&QWP(8,$s0),"mm4");
2528 &emms ();
2529
2530 &add ($s2 eq "ecx" ? "ecx":"",16);
2531 &mov ("edi",$s1); # out
2532 &mov ("esi",$s0); # temp
2533 &align (4);
2534 &data_word(0xA4F3F689); # rep movsb # copy partial output
2535
2536 &mov ("esp",$_esp);
2537 &popf ();
2538 &function_end_A();
2539 &pushf (); # kludge, never executed
2540 }
2541 &set_label("slow_dec_loop_x86",16);
2542 &mov ($s0,&DWP(0,$acc)); # read input
2543 &mov ($s1,&DWP(4,$acc));
2544 &mov ($s2,&DWP(8,$acc));
2545 &mov ($s3,&DWP(12,$acc));
2546
2547 &lea ($key,$ivec);
2548 &mov (&DWP(0,$key),$s0); # copy to temp
2549 &mov (&DWP(4,$key),$s1);
2550 &mov (&DWP(8,$key),$s2);
2551 &mov (&DWP(12,$key),$s3);
2552
2553 &mov ($key,$_key); # load key
2554 &call ("_x86_AES_decrypt_compact");
2555
2556 &mov ($key,$_ivp); # load ivp
2557 &mov ($acc,$_len); # load len
2558 &xor ($s0,&DWP(0,$key)); # xor iv
2559 &xor ($s1,&DWP(4,$key));
2560 &xor ($s2,&DWP(8,$key));
2561 &xor ($s3,&DWP(12,$key));
2562
2563 &sub ($acc,16);
2564 &jc (&label("slow_dec_partial_x86"));
2565
2566 &mov ($_len,$acc); # save len
2567 &mov ($acc,$_out); # load out
2568
2569 &mov (&DWP(0,$acc),$s0); # write output
2570 &mov (&DWP(4,$acc),$s1);
2571 &mov (&DWP(8,$acc),$s2);
2572 &mov (&DWP(12,$acc),$s3);
2573
2574 &lea ($acc,&DWP(16,$acc)); # advance out
2575 &mov ($_out,$acc); # save out
2576
2577 &lea ($acc,$ivec);
2578 &mov ($s0,&DWP(0,$acc)); # read temp
2579 &mov ($s1,&DWP(4,$acc));
2580 &mov ($s2,&DWP(8,$acc));
2581 &mov ($s3,&DWP(12,$acc));
2582
2583 &mov (&DWP(0,$key),$s0); # copy it to iv
2584 &mov (&DWP(4,$key),$s1);
2585 &mov (&DWP(8,$key),$s2);
2586 &mov (&DWP(12,$key),$s3);
2587
2588 &mov ($acc,$_inp); # load inp
2589 &lea ($acc,&DWP(16,$acc)); # advance inp
2590 &mov ($_inp,$acc); # save inp
2591 &jnz (&label("slow_dec_loop_x86"));
2592 &mov ("esp",$_esp);
2593 &popf ();
2594 &function_end_A();
2595 &pushf (); # kludge, never executed
2596
2597 &set_label("slow_dec_partial_x86",16);
2598 &lea ($acc,$ivec);
2599 &mov (&DWP(0,$acc),$s0); # save output to temp
2600 &mov (&DWP(4,$acc),$s1);
2601 &mov (&DWP(8,$acc),$s2);
2602 &mov (&DWP(12,$acc),$s3);
2603
2604 &mov ($acc,$_inp);
2605 &mov ($s0,&DWP(0,$acc)); # re-read input
2606 &mov ($s1,&DWP(4,$acc));
2607 &mov ($s2,&DWP(8,$acc));
2608 &mov ($s3,&DWP(12,$acc));
2609
2610 &mov (&DWP(0,$key),$s0); # copy it to iv
2611 &mov (&DWP(4,$key),$s1);
2612 &mov (&DWP(8,$key),$s2);
2613 &mov (&DWP(12,$key),$s3);
2614
2615 &mov ("ecx",$_len);
2616 &mov ("edi",$_out);
2617 &lea ("esi",$ivec);
2618 &align (4);
2619 &data_word(0xA4F3F689); # rep movsb # copy partial output
2620
2621 &mov ("esp",$_esp);
2622 &popf ();
1210&function_end("AES_cbc_encrypt"); 2623&function_end("AES_cbc_encrypt");
1211} 2624}
1212 2625
@@ -1215,35 +2628,31 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
1215sub enckey() 2628sub enckey()
1216{ 2629{
1217 &movz ("esi",&LB("edx")); # rk[i]>>0 2630 &movz ("esi",&LB("edx")); # rk[i]>>0
1218 &mov ("ebx",&DWP(2,"ebp","esi",8)); 2631 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1219 &movz ("esi",&HB("edx")); # rk[i]>>8 2632 &movz ("esi",&HB("edx")); # rk[i]>>8
1220 &and ("ebx",0xFF000000); 2633 &shl ("ebx",24);
1221 &xor ("eax","ebx"); 2634 &xor ("eax","ebx");
1222 2635
1223 &mov ("ebx",&DWP(2,"ebp","esi",8)); 2636 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1224 &shr ("edx",16); 2637 &shr ("edx",16);
1225 &and ("ebx",0x000000FF);
1226 &movz ("esi",&LB("edx")); # rk[i]>>16 2638 &movz ("esi",&LB("edx")); # rk[i]>>16
1227 &xor ("eax","ebx"); 2639 &xor ("eax","ebx");
1228 2640
1229 &mov ("ebx",&DWP(0,"ebp","esi",8)); 2641 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1230 &movz ("esi",&HB("edx")); # rk[i]>>24 2642 &movz ("esi",&HB("edx")); # rk[i]>>24
1231 &and ("ebx",0x0000FF00); 2643 &shl ("ebx",8);
1232 &xor ("eax","ebx"); 2644 &xor ("eax","ebx");
1233 2645
1234 &mov ("ebx",&DWP(0,"ebp","esi",8)); 2646 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1235 &and ("ebx",0x00FF0000); 2647 &shl ("ebx",16);
1236 &xor ("eax","ebx"); 2648 &xor ("eax","ebx");
1237 2649
1238 &xor ("eax",&DWP(2048,"ebp","ecx",4)); # rcon 2650 &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon
1239} 2651}
1240 2652
1241# int AES_set_encrypt_key(const unsigned char *userKey, const int bits, 2653&function_begin("_x86_AES_set_encrypt_key");
1242# AES_KEY *key) 2654 &mov ("esi",&wparam(1)); # user supplied key
1243&public_label("AES_Te"); 2655 &mov ("edi",&wparam(3)); # private key schedule
1244&function_begin("AES_set_encrypt_key", "", "_x86_AES_set_encrypt_key");
1245 &mov ("esi",&wparam(0)); # user supplied key
1246 &mov ("edi",&wparam(2)); # private key schedule
1247 2656
1248 &test ("esi",-1); 2657 &test ("esi",-1);
1249 &jz (&label("badpointer")); 2658 &jz (&label("badpointer"));
@@ -1252,10 +2661,21 @@ sub enckey()
1252 2661
1253 &call (&label("pic_point")); 2662 &call (&label("pic_point"));
1254 &set_label("pic_point"); 2663 &set_label("pic_point");
1255 &blindpop("ebp"); 2664 &blindpop($tbl);
1256 &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); 2665 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
1257 2666 &lea ($tbl,&DWP(2048+128,$tbl));
1258 &mov ("ecx",&wparam(1)); # number of bits in key 2667
2668 # prefetch Te4
2669 &mov ("eax",&DWP(0-128,$tbl));
2670 &mov ("ebx",&DWP(32-128,$tbl));
2671 &mov ("ecx",&DWP(64-128,$tbl));
2672 &mov ("edx",&DWP(96-128,$tbl));
2673 &mov ("eax",&DWP(128-128,$tbl));
2674 &mov ("ebx",&DWP(160-128,$tbl));
2675 &mov ("ecx",&DWP(192-128,$tbl));
2676 &mov ("edx",&DWP(224-128,$tbl));
2677
2678 &mov ("ecx",&wparam(2)); # number of bits in key
1259 &cmp ("ecx",128); 2679 &cmp ("ecx",128);
1260 &je (&label("10rounds")); 2680 &je (&label("10rounds"));
1261 &cmp ("ecx",192); 2681 &cmp ("ecx",192);
@@ -1394,24 +2814,23 @@ sub enckey()
1394 &mov ("edx","eax"); 2814 &mov ("edx","eax");
1395 &mov ("eax",&DWP(16,"edi")); # rk[4] 2815 &mov ("eax",&DWP(16,"edi")); # rk[4]
1396 &movz ("esi",&LB("edx")); # rk[11]>>0 2816 &movz ("esi",&LB("edx")); # rk[11]>>0
1397 &mov ("ebx",&DWP(2,"ebp","esi",8)); 2817 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1398 &movz ("esi",&HB("edx")); # rk[11]>>8 2818 &movz ("esi",&HB("edx")); # rk[11]>>8
1399 &and ("ebx",0x000000FF);
1400 &xor ("eax","ebx"); 2819 &xor ("eax","ebx");
1401 2820
1402 &mov ("ebx",&DWP(0,"ebp","esi",8)); 2821 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1403 &shr ("edx",16); 2822 &shr ("edx",16);
1404 &and ("ebx",0x0000FF00); 2823 &shl ("ebx",8);
1405 &movz ("esi",&LB("edx")); # rk[11]>>16 2824 &movz ("esi",&LB("edx")); # rk[11]>>16
1406 &xor ("eax","ebx"); 2825 &xor ("eax","ebx");
1407 2826
1408 &mov ("ebx",&DWP(0,"ebp","esi",8)); 2827 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1409 &movz ("esi",&HB("edx")); # rk[11]>>24 2828 &movz ("esi",&HB("edx")); # rk[11]>>24
1410 &and ("ebx",0x00FF0000); 2829 &shl ("ebx",16);
1411 &xor ("eax","ebx"); 2830 &xor ("eax","ebx");
1412 2831
1413 &mov ("ebx",&DWP(2,"ebp","esi",8)); 2832 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1414 &and ("ebx",0xFF000000); 2833 &shl ("ebx",24);
1415 &xor ("eax","ebx"); 2834 &xor ("eax","ebx");
1416 2835
1417 &mov (&DWP(48,"edi"),"eax"); # rk[12] 2836 &mov (&DWP(48,"edi"),"eax"); # rk[12]
@@ -1433,43 +2852,74 @@ sub enckey()
1433 &set_label("badpointer"); 2852 &set_label("badpointer");
1434 &mov ("eax",-1); 2853 &mov ("eax",-1);
1435 &set_label("exit"); 2854 &set_label("exit");
1436&function_end("AES_set_encrypt_key"); 2855&function_end("_x86_AES_set_encrypt_key");
1437 2856
1438sub deckey() 2857# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
1439{ my ($i,$ptr,$te,$td) = @_; 2858# AES_KEY *key)
2859&function_begin_B("AES_set_encrypt_key");
2860 &call ("_x86_AES_set_encrypt_key");
2861 &ret ();
2862&function_end_B("AES_set_encrypt_key");
1440 2863
1441 &mov ("eax",&DWP($i,$ptr)); 2864sub deckey()
1442 &mov ("edx","eax"); 2865{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
1443 &movz ("ebx",&HB("eax")); 2866 my $tmp = $tbl;
1444 &shr ("edx",16); 2867
1445 &and ("eax",0xFF); 2868 &mov ($acc,$tp1);
1446 &movz ("eax",&BP(2,$te,"eax",8)); 2869 &and ($acc,0x80808080);
1447 &movz ("ebx",&BP(2,$te,"ebx",8)); 2870 &mov ($tmp,$acc);
1448 &mov ("eax",&DWP(0,$td,"eax",8)); 2871 &shr ($tmp,7);
1449 &xor ("eax",&DWP(3,$td,"ebx",8)); 2872 &lea ($tp2,&DWP(0,$tp1,$tp1));
1450 &movz ("ebx",&HB("edx")); 2873 &sub ($acc,$tmp);
1451 &and ("edx",0xFF); 2874 &and ($tp2,0xfefefefe);
1452 &movz ("edx",&BP(2,$te,"edx",8)); 2875 &and ($acc,0x1b1b1b1b);
1453 &movz ("ebx",&BP(2,$te,"ebx",8)); 2876 &xor ($acc,$tp2);
1454 &xor ("eax",&DWP(2,$td,"edx",8)); 2877 &mov ($tp2,$acc);
1455 &xor ("eax",&DWP(1,$td,"ebx",8)); 2878
1456 &mov (&DWP($i,$ptr),"eax"); 2879 &and ($acc,0x80808080);
2880 &mov ($tmp,$acc);
2881 &shr ($tmp,7);
2882 &lea ($tp4,&DWP(0,$tp2,$tp2));
2883 &sub ($acc,$tmp);
2884 &and ($tp4,0xfefefefe);
2885 &and ($acc,0x1b1b1b1b);
2886 &xor ($tp2,$tp1); # tp2^tp1
2887 &xor ($acc,$tp4);
2888 &mov ($tp4,$acc);
2889
2890 &and ($acc,0x80808080);
2891 &mov ($tmp,$acc);
2892 &shr ($tmp,7);
2893 &lea ($tp8,&DWP(0,$tp4,$tp4));
2894 &xor ($tp4,$tp1); # tp4^tp1
2895 &sub ($acc,$tmp);
2896 &and ($tp8,0xfefefefe);
2897 &and ($acc,0x1b1b1b1b);
2898 &rotl ($tp1,8); # = ROTATE(tp1,8)
2899 &xor ($tp8,$acc);
2900
2901 &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load
2902
2903 &xor ($tp1,$tp2);
2904 &xor ($tp2,$tp8);
2905 &xor ($tp1,$tp4);
2906 &rotl ($tp2,24);
2907 &xor ($tp4,$tp8);
2908 &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
2909 &rotl ($tp4,16);
2910 &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
2911 &rotl ($tp8,8);
2912 &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
2913 &mov ($tp2,$tmp);
2914 &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
2915
2916 &mov (&DWP(4*$i,$key),$tp1);
1457} 2917}
1458 2918
1459# int AES_set_decrypt_key(const unsigned char *userKey, const int bits, 2919# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
1460# AES_KEY *key) 2920# AES_KEY *key)
1461&public_label("AES_Td");
1462&public_label("AES_Te");
1463&function_begin_B("AES_set_decrypt_key"); 2921&function_begin_B("AES_set_decrypt_key");
1464 &mov ("eax",&wparam(0));
1465 &mov ("ecx",&wparam(1));
1466 &mov ("edx",&wparam(2));
1467 &sub ("esp",12);
1468 &mov (&DWP(0,"esp"),"eax");
1469 &mov (&DWP(4,"esp"),"ecx");
1470 &mov (&DWP(8,"esp"),"edx");
1471 &call ("_x86_AES_set_encrypt_key"); 2922 &call ("_x86_AES_set_encrypt_key");
1472 &add ("esp",12);
1473 &cmp ("eax",0); 2923 &cmp ("eax",0);
1474 &je (&label("proceed")); 2924 &je (&label("proceed"));
1475 &ret (); 2925 &ret ();
@@ -1485,8 +2935,7 @@ sub deckey()
1485 &lea ("ecx",&DWP(0,"","ecx",4)); 2935 &lea ("ecx",&DWP(0,"","ecx",4));
1486 &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk 2936 &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk
1487 2937
1488 &align (4); 2938 &set_label("invert",4); # invert order of chunks
1489 &set_label("invert"); # invert order of chunks
1490 &mov ("eax",&DWP(0,"esi")); 2939 &mov ("eax",&DWP(0,"esi"));
1491 &mov ("ebx",&DWP(4,"esi")); 2940 &mov ("ebx",&DWP(4,"esi"));
1492 &mov ("ecx",&DWP(0,"edi")); 2941 &mov ("ecx",&DWP(0,"edi"));
@@ -1508,26 +2957,24 @@ sub deckey()
1508 &cmp ("esi","edi"); 2957 &cmp ("esi","edi");
1509 &jne (&label("invert")); 2958 &jne (&label("invert"));
1510 2959
1511 &call (&label("pic_point")); 2960 &mov ($key,&wparam(2));
1512 &set_label("pic_point"); 2961 &mov ($acc,&DWP(240,$key)); # pull number of rounds
1513 blindpop("ebp"); 2962 &lea ($acc,&DWP(-2,$acc,$acc));
1514 &lea ("edi",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); 2963 &lea ($acc,&DWP(0,$key,$acc,8));
1515 &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); 2964 &mov (&wparam(2),$acc);
1516 2965
1517 &mov ("esi",&wparam(2)); 2966 &mov ($s0,&DWP(16,$key)); # modulo-scheduled load
1518 &mov ("ecx",&DWP(240,"esi")); # pull number of rounds 2967 &set_label("permute",4); # permute the key schedule
1519 &dec ("ecx"); 2968 &add ($key,16);
1520 &align (4); 2969 &deckey (0,$key,$s0,$s1,$s2,$s3);
1521 &set_label("permute"); # permute the key schedule 2970 &deckey (1,$key,$s1,$s2,$s3,$s0);
1522 &add ("esi",16); 2971 &deckey (2,$key,$s2,$s3,$s0,$s1);
1523 &deckey (0,"esi","ebp","edi"); 2972 &deckey (3,$key,$s3,$s0,$s1,$s2);
1524 &deckey (4,"esi","ebp","edi"); 2973 &cmp ($key,&wparam(2));
1525 &deckey (8,"esi","ebp","edi"); 2974 &jb (&label("permute"));
1526 &deckey (12,"esi","ebp","edi");
1527 &dec ("ecx");
1528 &jnz (&label("permute"));
1529 2975
1530 &xor ("eax","eax"); # return success 2976 &xor ("eax","eax"); # return success
1531&function_end("AES_set_decrypt_key"); 2977&function_end("AES_set_decrypt_key");
2978&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
1532 2979
1533&asm_finish(); 2980&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
index a545e892ae..53e4ef85fd 100755
--- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl
+++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
@@ -1669,7 +1669,7 @@ AES_cbc_encrypt:
1669 lea .LAES_Td(%rip),$sbox 1669 lea .LAES_Td(%rip),$sbox
1670.Lcbc_picked_te: 1670.Lcbc_picked_te:
1671 1671
1672 mov OPENSSL_ia32cap_P(%rip),%r10d 1672 mov PIC_GOT(OPENSSL_ia32cap_P),%r10d
1673 cmp \$$speed_limit,%rdx 1673 cmp \$$speed_limit,%rdx
1674 jb .Lcbc_slow_prologue 1674 jb .Lcbc_slow_prologue
1675 test \$15,%rdx 1675 test \$15,%rdx