diff options
| author | djm <> | 2010-10-01 22:54:21 +0000 |
|---|---|---|
| committer | djm <> | 2010-10-01 22:54:21 +0000 |
| commit | 829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2 (patch) | |
| tree | e03b9f1bd051e844b971936729e9df549a209130 /src/lib/libcrypto/aes | |
| parent | e6b755d2a53d3cac7a344dfdd6bf7c951cac754c (diff) | |
| download | openbsd-829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2.tar.gz openbsd-829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2.tar.bz2 openbsd-829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2.zip | |
import OpenSSL-1.0.0a
Diffstat (limited to 'src/lib/libcrypto/aes')
| -rw-r--r-- | src/lib/libcrypto/aes/aes.h | 28 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/aes_cbc.c | 82 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/aes_cfb.c | 160 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/aes_core.c | 209 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/aes_ctr.c | 90 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/aes_ige.c | 12 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/aes_ofb.c | 94 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/asm/aes-586.pl | 2401 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/asm/aes-armv4.pl | 1 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/asm/aes-ppc.pl | 269 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/asm/aes-s390x.pl | 6 | ||||
| -rwxr-xr-x | src/lib/libcrypto/aes/asm/aes-x86_64.pl | 2012 |
12 files changed, 3936 insertions, 1428 deletions
diff --git a/src/lib/libcrypto/aes/aes.h b/src/lib/libcrypto/aes/aes.h index 450f2b4051..d2c99730fe 100644 --- a/src/lib/libcrypto/aes/aes.h +++ b/src/lib/libcrypto/aes/aes.h | |||
| @@ -58,6 +58,8 @@ | |||
| 58 | #error AES is disabled. | 58 | #error AES is disabled. |
| 59 | #endif | 59 | #endif |
| 60 | 60 | ||
| 61 | #include <stddef.h> | ||
| 62 | |||
| 61 | #define AES_ENCRYPT 1 | 63 | #define AES_ENCRYPT 1 |
| 62 | #define AES_DECRYPT 0 | 64 | #define AES_DECRYPT 0 |
| 63 | 65 | ||
| @@ -66,10 +68,6 @@ | |||
| 66 | #define AES_MAXNR 14 | 68 | #define AES_MAXNR 14 |
| 67 | #define AES_BLOCK_SIZE 16 | 69 | #define AES_BLOCK_SIZE 16 |
| 68 | 70 | ||
| 69 | #ifdef OPENSSL_FIPS | ||
| 70 | #define FIPS_AES_SIZE_T int | ||
| 71 | #endif | ||
| 72 | |||
| 73 | #ifdef __cplusplus | 71 | #ifdef __cplusplus |
| 74 | extern "C" { | 72 | extern "C" { |
| 75 | #endif | 73 | #endif |
| @@ -100,37 +98,32 @@ void AES_decrypt(const unsigned char *in, unsigned char *out, | |||
| 100 | void AES_ecb_encrypt(const unsigned char *in, unsigned char *out, | 98 | void AES_ecb_encrypt(const unsigned char *in, unsigned char *out, |
| 101 | const AES_KEY *key, const int enc); | 99 | const AES_KEY *key, const int enc); |
| 102 | void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | 100 | void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, |
| 103 | const unsigned long length, const AES_KEY *key, | 101 | size_t length, const AES_KEY *key, |
| 104 | unsigned char *ivec, const int enc); | 102 | unsigned char *ivec, const int enc); |
| 105 | void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, | 103 | void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, |
| 106 | const unsigned long length, const AES_KEY *key, | 104 | size_t length, const AES_KEY *key, |
| 107 | unsigned char *ivec, int *num, const int enc); | 105 | unsigned char *ivec, int *num, const int enc); |
| 108 | void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, | 106 | void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, |
| 109 | const unsigned long length, const AES_KEY *key, | 107 | size_t length, const AES_KEY *key, |
| 110 | unsigned char *ivec, int *num, const int enc); | 108 | unsigned char *ivec, int *num, const int enc); |
| 111 | void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, | 109 | void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, |
| 112 | const unsigned long length, const AES_KEY *key, | 110 | size_t length, const AES_KEY *key, |
| 113 | unsigned char *ivec, int *num, const int enc); | 111 | unsigned char *ivec, int *num, const int enc); |
| 114 | void AES_cfbr_encrypt_block(const unsigned char *in,unsigned char *out, | ||
| 115 | const int nbits,const AES_KEY *key, | ||
| 116 | unsigned char *ivec,const int enc); | ||
| 117 | void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, | 112 | void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, |
| 118 | const unsigned long length, const AES_KEY *key, | 113 | size_t length, const AES_KEY *key, |
| 119 | unsigned char *ivec, int *num); | 114 | unsigned char *ivec, int *num); |
| 120 | void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, | 115 | void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, |
| 121 | const unsigned long length, const AES_KEY *key, | 116 | size_t length, const AES_KEY *key, |
| 122 | unsigned char ivec[AES_BLOCK_SIZE], | 117 | unsigned char ivec[AES_BLOCK_SIZE], |
| 123 | unsigned char ecount_buf[AES_BLOCK_SIZE], | 118 | unsigned char ecount_buf[AES_BLOCK_SIZE], |
| 124 | unsigned int *num); | 119 | unsigned int *num); |
| 125 | |||
| 126 | /* For IGE, see also http://www.links.org/files/openssl-ige.pdf */ | ||
| 127 | /* NB: the IV is _two_ blocks long */ | 120 | /* NB: the IV is _two_ blocks long */ |
| 128 | void AES_ige_encrypt(const unsigned char *in, unsigned char *out, | 121 | void AES_ige_encrypt(const unsigned char *in, unsigned char *out, |
| 129 | const unsigned long length, const AES_KEY *key, | 122 | size_t length, const AES_KEY *key, |
| 130 | unsigned char *ivec, const int enc); | 123 | unsigned char *ivec, const int enc); |
| 131 | /* NB: the IV is _four_ blocks long */ | 124 | /* NB: the IV is _four_ blocks long */ |
| 132 | void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out, | 125 | void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out, |
| 133 | const unsigned long length, const AES_KEY *key, | 126 | size_t length, const AES_KEY *key, |
| 134 | const AES_KEY *key2, const unsigned char *ivec, | 127 | const AES_KEY *key2, const unsigned char *ivec, |
| 135 | const int enc); | 128 | const int enc); |
| 136 | 129 | ||
| @@ -141,6 +134,7 @@ int AES_unwrap_key(AES_KEY *key, const unsigned char *iv, | |||
| 141 | unsigned char *out, | 134 | unsigned char *out, |
| 142 | const unsigned char *in, unsigned int inlen); | 135 | const unsigned char *in, unsigned int inlen); |
| 143 | 136 | ||
| 137 | |||
| 144 | #ifdef __cplusplus | 138 | #ifdef __cplusplus |
| 145 | } | 139 | } |
| 146 | #endif | 140 | #endif |
diff --git a/src/lib/libcrypto/aes/aes_cbc.c b/src/lib/libcrypto/aes/aes_cbc.c index 373864cd4b..227f75625d 100644 --- a/src/lib/libcrypto/aes/aes_cbc.c +++ b/src/lib/libcrypto/aes/aes_cbc.c | |||
| @@ -49,85 +49,15 @@ | |||
| 49 | * | 49 | * |
| 50 | */ | 50 | */ |
| 51 | 51 | ||
| 52 | #ifndef AES_DEBUG | ||
| 53 | # ifndef NDEBUG | ||
| 54 | # define NDEBUG | ||
| 55 | # endif | ||
| 56 | #endif | ||
| 57 | #include <assert.h> | ||
| 58 | |||
| 59 | #include <openssl/aes.h> | 52 | #include <openssl/aes.h> |
| 60 | #include "aes_locl.h" | 53 | #include <openssl/modes.h> |
| 61 | 54 | ||
| 62 | #if !defined(OPENSSL_FIPS_AES_ASM) | ||
| 63 | void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | 55 | void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, |
| 64 | const unsigned long length, const AES_KEY *key, | 56 | size_t len, const AES_KEY *key, |
| 65 | unsigned char *ivec, const int enc) { | 57 | unsigned char *ivec, const int enc) { |
| 66 | 58 | ||
| 67 | unsigned long n; | 59 | if (enc) |
| 68 | unsigned long len = length; | 60 | CRYPTO_cbc128_encrypt(in,out,len,key,ivec,(block128_f)AES_encrypt); |
| 69 | unsigned char tmp[AES_BLOCK_SIZE]; | 61 | else |
| 70 | const unsigned char *iv = ivec; | 62 | CRYPTO_cbc128_decrypt(in,out,len,key,ivec,(block128_f)AES_decrypt); |
| 71 | |||
| 72 | assert(in && out && key && ivec); | ||
| 73 | assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc)); | ||
| 74 | |||
| 75 | if (AES_ENCRYPT == enc) { | ||
| 76 | while (len >= AES_BLOCK_SIZE) { | ||
| 77 | for(n=0; n < AES_BLOCK_SIZE; ++n) | ||
| 78 | out[n] = in[n] ^ iv[n]; | ||
| 79 | AES_encrypt(out, out, key); | ||
| 80 | iv = out; | ||
| 81 | len -= AES_BLOCK_SIZE; | ||
| 82 | in += AES_BLOCK_SIZE; | ||
| 83 | out += AES_BLOCK_SIZE; | ||
| 84 | } | ||
| 85 | if (len) { | ||
| 86 | for(n=0; n < len; ++n) | ||
| 87 | out[n] = in[n] ^ iv[n]; | ||
| 88 | for(n=len; n < AES_BLOCK_SIZE; ++n) | ||
| 89 | out[n] = iv[n]; | ||
| 90 | AES_encrypt(out, out, key); | ||
| 91 | iv = out; | ||
| 92 | } | ||
| 93 | memcpy(ivec,iv,AES_BLOCK_SIZE); | ||
| 94 | } else if (in != out) { | ||
| 95 | while (len >= AES_BLOCK_SIZE) { | ||
| 96 | AES_decrypt(in, out, key); | ||
| 97 | for(n=0; n < AES_BLOCK_SIZE; ++n) | ||
| 98 | out[n] ^= iv[n]; | ||
| 99 | iv = in; | ||
| 100 | len -= AES_BLOCK_SIZE; | ||
| 101 | in += AES_BLOCK_SIZE; | ||
| 102 | out += AES_BLOCK_SIZE; | ||
| 103 | } | ||
| 104 | if (len) { | ||
| 105 | AES_decrypt(in,tmp,key); | ||
| 106 | for(n=0; n < len; ++n) | ||
| 107 | out[n] = tmp[n] ^ iv[n]; | ||
| 108 | iv = in; | ||
| 109 | } | ||
| 110 | memcpy(ivec,iv,AES_BLOCK_SIZE); | ||
| 111 | } else { | ||
| 112 | while (len >= AES_BLOCK_SIZE) { | ||
| 113 | memcpy(tmp, in, AES_BLOCK_SIZE); | ||
| 114 | AES_decrypt(in, out, key); | ||
| 115 | for(n=0; n < AES_BLOCK_SIZE; ++n) | ||
| 116 | out[n] ^= ivec[n]; | ||
| 117 | memcpy(ivec, tmp, AES_BLOCK_SIZE); | ||
| 118 | len -= AES_BLOCK_SIZE; | ||
| 119 | in += AES_BLOCK_SIZE; | ||
| 120 | out += AES_BLOCK_SIZE; | ||
| 121 | } | ||
| 122 | if (len) { | ||
| 123 | memcpy(tmp, in, AES_BLOCK_SIZE); | ||
| 124 | AES_decrypt(tmp, out, key); | ||
| 125 | for(n=0; n < len; ++n) | ||
| 126 | out[n] ^= ivec[n]; | ||
| 127 | for(n=len; n < AES_BLOCK_SIZE; ++n) | ||
| 128 | out[n] = tmp[n]; | ||
| 129 | memcpy(ivec, tmp, AES_BLOCK_SIZE); | ||
| 130 | } | ||
| 131 | } | ||
| 132 | } | 63 | } |
| 133 | #endif | ||
diff --git a/src/lib/libcrypto/aes/aes_cfb.c b/src/lib/libcrypto/aes/aes_cfb.c index 49f0411010..0c6d058ce7 100644 --- a/src/lib/libcrypto/aes/aes_cfb.c +++ b/src/lib/libcrypto/aes/aes_cfb.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* crypto/aes/aes_cfb.c -*- mode:C; c-file-style: "eay" -*- */ | 1 | /* crypto/aes/aes_cfb.c -*- mode:C; c-file-style: "eay" -*- */ |
| 2 | /* ==================================================================== | 2 | /* ==================================================================== |
| 3 | * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
| @@ -48,73 +48,9 @@ | |||
| 48 | * ==================================================================== | 48 | * ==================================================================== |
| 49 | * | 49 | * |
| 50 | */ | 50 | */ |
| 51 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 52 | * All rights reserved. | ||
| 53 | * | ||
| 54 | * This package is an SSL implementation written | ||
| 55 | * by Eric Young (eay@cryptsoft.com). | ||
| 56 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 57 | * | ||
| 58 | * This library is free for commercial and non-commercial use as long as | ||
| 59 | * the following conditions are aheared to. The following conditions | ||
| 60 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 61 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 62 | * included with this distribution is covered by the same copyright terms | ||
| 63 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 64 | * | ||
| 65 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 66 | * the code are not to be removed. | ||
| 67 | * If this package is used in a product, Eric Young should be given attribution | ||
| 68 | * as the author of the parts of the library used. | ||
| 69 | * This can be in the form of a textual message at program startup or | ||
| 70 | * in documentation (online or textual) provided with the package. | ||
| 71 | * | ||
| 72 | * Redistribution and use in source and binary forms, with or without | ||
| 73 | * modification, are permitted provided that the following conditions | ||
| 74 | * are met: | ||
| 75 | * 1. Redistributions of source code must retain the copyright | ||
| 76 | * notice, this list of conditions and the following disclaimer. | ||
| 77 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 78 | * notice, this list of conditions and the following disclaimer in the | ||
| 79 | * documentation and/or other materials provided with the distribution. | ||
| 80 | * 3. All advertising materials mentioning features or use of this software | ||
| 81 | * must display the following acknowledgement: | ||
| 82 | * "This product includes cryptographic software written by | ||
| 83 | * Eric Young (eay@cryptsoft.com)" | ||
| 84 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 85 | * being used are not cryptographic related :-). | ||
| 86 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 87 | * the apps directory (application code) you must include an acknowledgement: | ||
| 88 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 89 | * | ||
| 90 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 91 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 92 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 93 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 94 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 95 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 96 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 97 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 98 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 99 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 100 | * SUCH DAMAGE. | ||
| 101 | * | ||
| 102 | * The licence and distribution terms for any publically available version or | ||
| 103 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 104 | * copied and put under another distribution licence | ||
| 105 | * [including the GNU Public Licence.] | ||
| 106 | */ | ||
| 107 | |||
| 108 | #ifndef AES_DEBUG | ||
| 109 | # ifndef NDEBUG | ||
| 110 | # define NDEBUG | ||
| 111 | # endif | ||
| 112 | #endif | ||
| 113 | #include <assert.h> | ||
| 114 | 51 | ||
| 115 | #include <openssl/aes.h> | 52 | #include <openssl/aes.h> |
| 116 | #include "aes_locl.h" | 53 | #include <openssl/modes.h> |
| 117 | #include "e_os.h" | ||
| 118 | 54 | ||
| 119 | /* The input and output encrypted as though 128bit cfb mode is being | 55 | /* The input and output encrypted as though 128bit cfb mode is being |
| 120 | * used. The extra state information to record how much of the | 56 | * used. The extra state information to record how much of the |
| @@ -122,104 +58,24 @@ | |||
| 122 | */ | 58 | */ |
| 123 | 59 | ||
| 124 | void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, | 60 | void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, |
| 125 | const unsigned long length, const AES_KEY *key, | 61 | size_t length, const AES_KEY *key, |
| 126 | unsigned char *ivec, int *num, const int enc) { | 62 | unsigned char *ivec, int *num, const int enc) { |
| 127 | 63 | ||
| 128 | unsigned int n; | 64 | CRYPTO_cfb128_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt); |
| 129 | unsigned long l = length; | ||
| 130 | unsigned char c; | ||
| 131 | |||
| 132 | assert(in && out && key && ivec && num); | ||
| 133 | |||
| 134 | n = *num; | ||
| 135 | |||
| 136 | if (enc) { | ||
| 137 | while (l--) { | ||
| 138 | if (n == 0) { | ||
| 139 | AES_encrypt(ivec, ivec, key); | ||
| 140 | } | ||
| 141 | ivec[n] = *(out++) = *(in++) ^ ivec[n]; | ||
| 142 | n = (n+1) % AES_BLOCK_SIZE; | ||
| 143 | } | ||
| 144 | } else { | ||
| 145 | while (l--) { | ||
| 146 | if (n == 0) { | ||
| 147 | AES_encrypt(ivec, ivec, key); | ||
| 148 | } | ||
| 149 | c = *(in); | ||
| 150 | *(out++) = *(in++) ^ ivec[n]; | ||
| 151 | ivec[n] = c; | ||
| 152 | n = (n+1) % AES_BLOCK_SIZE; | ||
| 153 | } | ||
| 154 | } | ||
| 155 | |||
| 156 | *num=n; | ||
| 157 | } | 65 | } |
| 158 | 66 | ||
| 159 | /* This expects a single block of size nbits for both in and out. Note that | ||
| 160 | it corrupts any extra bits in the last byte of out */ | ||
| 161 | void AES_cfbr_encrypt_block(const unsigned char *in,unsigned char *out, | ||
| 162 | const int nbits,const AES_KEY *key, | ||
| 163 | unsigned char *ivec,const int enc) | ||
| 164 | { | ||
| 165 | int n,rem,num; | ||
| 166 | unsigned char ovec[AES_BLOCK_SIZE*2]; | ||
| 167 | |||
| 168 | if (nbits<=0 || nbits>128) return; | ||
| 169 | |||
| 170 | /* fill in the first half of the new IV with the current IV */ | ||
| 171 | memcpy(ovec,ivec,AES_BLOCK_SIZE); | ||
| 172 | /* construct the new IV */ | ||
| 173 | AES_encrypt(ivec,ivec,key); | ||
| 174 | num = (nbits+7)/8; | ||
| 175 | if (enc) /* encrypt the input */ | ||
| 176 | for(n=0 ; n < num ; ++n) | ||
| 177 | out[n] = (ovec[AES_BLOCK_SIZE+n] = in[n] ^ ivec[n]); | ||
| 178 | else /* decrypt the input */ | ||
| 179 | for(n=0 ; n < num ; ++n) | ||
| 180 | out[n] = (ovec[AES_BLOCK_SIZE+n] = in[n]) ^ ivec[n]; | ||
| 181 | /* shift ovec left... */ | ||
| 182 | rem = nbits%8; | ||
| 183 | num = nbits/8; | ||
| 184 | if(rem==0) | ||
| 185 | memcpy(ivec,ovec+num,AES_BLOCK_SIZE); | ||
| 186 | else | ||
| 187 | for(n=0 ; n < AES_BLOCK_SIZE ; ++n) | ||
| 188 | ivec[n] = ovec[n+num]<<rem | ovec[n+num+1]>>(8-rem); | ||
| 189 | |||
| 190 | /* it is not necessary to cleanse ovec, since the IV is not secret */ | ||
| 191 | } | ||
| 192 | |||
| 193 | /* N.B. This expects the input to be packed, MS bit first */ | 67 | /* N.B. This expects the input to be packed, MS bit first */ |
| 194 | void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, | 68 | void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, |
| 195 | const unsigned long length, const AES_KEY *key, | 69 | size_t length, const AES_KEY *key, |
| 196 | unsigned char *ivec, int *num, const int enc) | 70 | unsigned char *ivec, int *num, const int enc) |
| 197 | { | 71 | { |
| 198 | unsigned int n; | 72 | CRYPTO_cfb128_1_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt); |
| 199 | unsigned char c[1],d[1]; | ||
| 200 | |||
| 201 | assert(in && out && key && ivec && num); | ||
| 202 | assert(*num == 0); | ||
| 203 | |||
| 204 | memset(out,0,(length+7)/8); | ||
| 205 | for(n=0 ; n < length ; ++n) | ||
| 206 | { | ||
| 207 | c[0]=(in[n/8]&(1 << (7-n%8))) ? 0x80 : 0; | ||
| 208 | AES_cfbr_encrypt_block(c,d,1,key,ivec,enc); | ||
| 209 | out[n/8]=(out[n/8]&~(1 << (7-n%8)))|((d[0]&0x80) >> (n%8)); | ||
| 210 | } | ||
| 211 | } | 73 | } |
| 212 | 74 | ||
| 213 | void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, | 75 | void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, |
| 214 | const unsigned long length, const AES_KEY *key, | 76 | size_t length, const AES_KEY *key, |
| 215 | unsigned char *ivec, int *num, const int enc) | 77 | unsigned char *ivec, int *num, const int enc) |
| 216 | { | 78 | { |
| 217 | unsigned int n; | 79 | CRYPTO_cfb128_8_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt); |
| 218 | |||
| 219 | assert(in && out && key && ivec && num); | ||
| 220 | assert(*num == 0); | ||
| 221 | |||
| 222 | for(n=0 ; n < length ; ++n) | ||
| 223 | AES_cfbr_encrypt_block(&in[n],&out[n],8,key,ivec,enc); | ||
| 224 | } | 80 | } |
| 225 | 81 | ||
diff --git a/src/lib/libcrypto/aes/aes_core.c b/src/lib/libcrypto/aes/aes_core.c index cffdd4daec..a7ec54f4da 100644 --- a/src/lib/libcrypto/aes/aes_core.c +++ b/src/lib/libcrypto/aes/aes_core.c | |||
| @@ -37,12 +37,9 @@ | |||
| 37 | 37 | ||
| 38 | #include <stdlib.h> | 38 | #include <stdlib.h> |
| 39 | #include <openssl/aes.h> | 39 | #include <openssl/aes.h> |
| 40 | #ifdef OPENSSL_FIPS | ||
| 41 | #include <openssl/fips.h> | ||
| 42 | #endif | ||
| 43 | |||
| 44 | #include "aes_locl.h" | 40 | #include "aes_locl.h" |
| 45 | 41 | ||
| 42 | #ifndef AES_ASM | ||
| 46 | /* | 43 | /* |
| 47 | Te0[x] = S [x].[02, 01, 01, 03]; | 44 | Te0[x] = S [x].[02, 01, 01, 03]; |
| 48 | Te1[x] = S [x].[03, 02, 01, 01]; | 45 | Te1[x] = S [x].[03, 02, 01, 01]; |
| @@ -635,10 +632,6 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | |||
| 635 | int i = 0; | 632 | int i = 0; |
| 636 | u32 temp; | 633 | u32 temp; |
| 637 | 634 | ||
| 638 | #ifdef OPENSSL_FIPS | ||
| 639 | FIPS_selftest_check(); | ||
| 640 | #endif | ||
| 641 | |||
| 642 | if (!userKey || !key) | 635 | if (!userKey || !key) |
| 643 | return -1; | 636 | return -1; |
| 644 | if (bits != 128 && bits != 192 && bits != 256) | 637 | if (bits != 128 && bits != 192 && bits != 256) |
| @@ -781,7 +774,6 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | |||
| 781 | return 0; | 774 | return 0; |
| 782 | } | 775 | } |
| 783 | 776 | ||
| 784 | #ifndef AES_ASM | ||
| 785 | /* | 777 | /* |
| 786 | * Encrypt a single block | 778 | * Encrypt a single block |
| 787 | * in and out can overlap | 779 | * in and out can overlap |
| @@ -1164,4 +1156,203 @@ void AES_decrypt(const unsigned char *in, unsigned char *out, | |||
| 1164 | PUTU32(out + 12, s3); | 1156 | PUTU32(out + 12, s3); |
| 1165 | } | 1157 | } |
| 1166 | 1158 | ||
| 1159 | #else /* AES_ASM */ | ||
| 1160 | |||
| 1161 | static const u8 Te4[256] = { | ||
| 1162 | 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U, | ||
| 1163 | 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U, | ||
| 1164 | 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U, | ||
| 1165 | 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U, | ||
| 1166 | 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU, | ||
| 1167 | 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U, | ||
| 1168 | 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU, | ||
| 1169 | 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U, | ||
| 1170 | 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U, | ||
| 1171 | 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U, | ||
| 1172 | 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU, | ||
| 1173 | 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU, | ||
| 1174 | 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U, | ||
| 1175 | 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U, | ||
| 1176 | 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U, | ||
| 1177 | 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U, | ||
| 1178 | 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U, | ||
| 1179 | 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U, | ||
| 1180 | 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U, | ||
| 1181 | 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU, | ||
| 1182 | 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU, | ||
| 1183 | 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U, | ||
| 1184 | 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U, | ||
| 1185 | 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U, | ||
| 1186 | 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U, | ||
| 1187 | 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU, | ||
| 1188 | 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU, | ||
| 1189 | 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU, | ||
| 1190 | 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U, | ||
| 1191 | 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU, | ||
| 1192 | 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U, | ||
| 1193 | 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U | ||
| 1194 | }; | ||
| 1195 | static const u32 rcon[] = { | ||
| 1196 | 0x01000000, 0x02000000, 0x04000000, 0x08000000, | ||
| 1197 | 0x10000000, 0x20000000, 0x40000000, 0x80000000, | ||
| 1198 | 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ | ||
| 1199 | }; | ||
| 1200 | |||
| 1201 | /** | ||
| 1202 | * Expand the cipher key into the encryption key schedule. | ||
| 1203 | */ | ||
| 1204 | int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | ||
| 1205 | AES_KEY *key) { | ||
| 1206 | u32 *rk; | ||
| 1207 | int i = 0; | ||
| 1208 | u32 temp; | ||
| 1209 | |||
| 1210 | if (!userKey || !key) | ||
| 1211 | return -1; | ||
| 1212 | if (bits != 128 && bits != 192 && bits != 256) | ||
| 1213 | return -2; | ||
| 1214 | |||
| 1215 | rk = key->rd_key; | ||
| 1216 | |||
| 1217 | if (bits==128) | ||
| 1218 | key->rounds = 10; | ||
| 1219 | else if (bits==192) | ||
| 1220 | key->rounds = 12; | ||
| 1221 | else | ||
| 1222 | key->rounds = 14; | ||
| 1223 | |||
| 1224 | rk[0] = GETU32(userKey ); | ||
| 1225 | rk[1] = GETU32(userKey + 4); | ||
| 1226 | rk[2] = GETU32(userKey + 8); | ||
| 1227 | rk[3] = GETU32(userKey + 12); | ||
| 1228 | if (bits == 128) { | ||
| 1229 | while (1) { | ||
| 1230 | temp = rk[3]; | ||
| 1231 | rk[4] = rk[0] ^ | ||
| 1232 | (Te4[(temp >> 16) & 0xff] << 24) ^ | ||
| 1233 | (Te4[(temp >> 8) & 0xff] << 16) ^ | ||
| 1234 | (Te4[(temp ) & 0xff] << 8) ^ | ||
| 1235 | (Te4[(temp >> 24) ]) ^ | ||
| 1236 | rcon[i]; | ||
| 1237 | rk[5] = rk[1] ^ rk[4]; | ||
| 1238 | rk[6] = rk[2] ^ rk[5]; | ||
| 1239 | rk[7] = rk[3] ^ rk[6]; | ||
| 1240 | if (++i == 10) { | ||
| 1241 | return 0; | ||
| 1242 | } | ||
| 1243 | rk += 4; | ||
| 1244 | } | ||
| 1245 | } | ||
| 1246 | rk[4] = GETU32(userKey + 16); | ||
| 1247 | rk[5] = GETU32(userKey + 20); | ||
| 1248 | if (bits == 192) { | ||
| 1249 | while (1) { | ||
| 1250 | temp = rk[ 5]; | ||
| 1251 | rk[ 6] = rk[ 0] ^ | ||
| 1252 | (Te4[(temp >> 16) & 0xff] << 24) ^ | ||
| 1253 | (Te4[(temp >> 8) & 0xff] << 16) ^ | ||
| 1254 | (Te4[(temp ) & 0xff] << 8) ^ | ||
| 1255 | (Te4[(temp >> 24) ]) ^ | ||
| 1256 | rcon[i]; | ||
| 1257 | rk[ 7] = rk[ 1] ^ rk[ 6]; | ||
| 1258 | rk[ 8] = rk[ 2] ^ rk[ 7]; | ||
| 1259 | rk[ 9] = rk[ 3] ^ rk[ 8]; | ||
| 1260 | if (++i == 8) { | ||
| 1261 | return 0; | ||
| 1262 | } | ||
| 1263 | rk[10] = rk[ 4] ^ rk[ 9]; | ||
| 1264 | rk[11] = rk[ 5] ^ rk[10]; | ||
| 1265 | rk += 6; | ||
| 1266 | } | ||
| 1267 | } | ||
| 1268 | rk[6] = GETU32(userKey + 24); | ||
| 1269 | rk[7] = GETU32(userKey + 28); | ||
| 1270 | if (bits == 256) { | ||
| 1271 | while (1) { | ||
| 1272 | temp = rk[ 7]; | ||
| 1273 | rk[ 8] = rk[ 0] ^ | ||
| 1274 | (Te4[(temp >> 16) & 0xff] << 24) ^ | ||
| 1275 | (Te4[(temp >> 8) & 0xff] << 16) ^ | ||
| 1276 | (Te4[(temp ) & 0xff] << 8) ^ | ||
| 1277 | (Te4[(temp >> 24) ]) ^ | ||
| 1278 | rcon[i]; | ||
| 1279 | rk[ 9] = rk[ 1] ^ rk[ 8]; | ||
| 1280 | rk[10] = rk[ 2] ^ rk[ 9]; | ||
| 1281 | rk[11] = rk[ 3] ^ rk[10]; | ||
| 1282 | if (++i == 7) { | ||
| 1283 | return 0; | ||
| 1284 | } | ||
| 1285 | temp = rk[11]; | ||
| 1286 | rk[12] = rk[ 4] ^ | ||
| 1287 | (Te4[(temp >> 24) ] << 24) ^ | ||
| 1288 | (Te4[(temp >> 16) & 0xff] << 16) ^ | ||
| 1289 | (Te4[(temp >> 8) & 0xff] << 8) ^ | ||
| 1290 | (Te4[(temp ) & 0xff]); | ||
| 1291 | rk[13] = rk[ 5] ^ rk[12]; | ||
| 1292 | rk[14] = rk[ 6] ^ rk[13]; | ||
| 1293 | rk[15] = rk[ 7] ^ rk[14]; | ||
| 1294 | |||
| 1295 | rk += 8; | ||
| 1296 | } | ||
| 1297 | } | ||
| 1298 | return 0; | ||
| 1299 | } | ||
| 1300 | |||
| 1301 | /** | ||
| 1302 | * Expand the cipher key into the decryption key schedule. | ||
| 1303 | */ | ||
| 1304 | int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | ||
| 1305 | AES_KEY *key) { | ||
| 1306 | |||
| 1307 | u32 *rk; | ||
| 1308 | int i, j, status; | ||
| 1309 | u32 temp; | ||
| 1310 | |||
| 1311 | /* first, start with an encryption schedule */ | ||
| 1312 | status = AES_set_encrypt_key(userKey, bits, key); | ||
| 1313 | if (status < 0) | ||
| 1314 | return status; | ||
| 1315 | |||
| 1316 | rk = key->rd_key; | ||
| 1317 | |||
| 1318 | /* invert the order of the round keys: */ | ||
| 1319 | for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) { | ||
| 1320 | temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp; | ||
| 1321 | temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; | ||
| 1322 | temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; | ||
| 1323 | temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp; | ||
| 1324 | } | ||
| 1325 | /* apply the inverse MixColumn transform to all round keys but the first and the last: */ | ||
| 1326 | for (i = 1; i < (key->rounds); i++) { | ||
| 1327 | rk += 4; | ||
| 1328 | for (j = 0; j < 4; j++) { | ||
| 1329 | u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m; | ||
| 1330 | |||
| 1331 | tp1 = rk[j]; | ||
| 1332 | m = tp1 & 0x80808080; | ||
| 1333 | tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^ | ||
| 1334 | ((m - (m >> 7)) & 0x1b1b1b1b); | ||
| 1335 | m = tp2 & 0x80808080; | ||
| 1336 | tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^ | ||
| 1337 | ((m - (m >> 7)) & 0x1b1b1b1b); | ||
| 1338 | m = tp4 & 0x80808080; | ||
| 1339 | tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^ | ||
| 1340 | ((m - (m >> 7)) & 0x1b1b1b1b); | ||
| 1341 | tp9 = tp8 ^ tp1; | ||
| 1342 | tpb = tp9 ^ tp2; | ||
| 1343 | tpd = tp9 ^ tp4; | ||
| 1344 | tpe = tp8 ^ tp4 ^ tp2; | ||
| 1345 | #if defined(ROTATE) | ||
| 1346 | rk[j] = tpe ^ ROTATE(tpd,16) ^ | ||
| 1347 | ROTATE(tp9,24) ^ ROTATE(tpb,8); | ||
| 1348 | #else | ||
| 1349 | rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ | ||
| 1350 | (tp9 >> 8) ^ (tp9 << 24) ^ | ||
| 1351 | (tpb >> 24) ^ (tpb << 8); | ||
| 1352 | #endif | ||
| 1353 | } | ||
| 1354 | } | ||
| 1355 | return 0; | ||
| 1356 | } | ||
| 1357 | |||
| 1167 | #endif /* AES_ASM */ | 1358 | #endif /* AES_ASM */ |
diff --git a/src/lib/libcrypto/aes/aes_ctr.c b/src/lib/libcrypto/aes/aes_ctr.c index f36982be1e..7c9d165d8a 100644 --- a/src/lib/libcrypto/aes/aes_ctr.c +++ b/src/lib/libcrypto/aes/aes_ctr.c | |||
| @@ -49,91 +49,13 @@ | |||
| 49 | * | 49 | * |
| 50 | */ | 50 | */ |
| 51 | 51 | ||
| 52 | #ifndef AES_DEBUG | ||
| 53 | # ifndef NDEBUG | ||
| 54 | # define NDEBUG | ||
| 55 | # endif | ||
| 56 | #endif | ||
| 57 | #include <assert.h> | ||
| 58 | |||
| 59 | #include <openssl/aes.h> | 52 | #include <openssl/aes.h> |
| 60 | #include "aes_locl.h" | 53 | #include <openssl/modes.h> |
| 61 | |||
| 62 | /* NOTE: the IV/counter CTR mode is big-endian. The rest of the AES code | ||
| 63 | * is endian-neutral. */ | ||
| 64 | |||
| 65 | /* increment counter (128-bit int) by 1 */ | ||
| 66 | static void AES_ctr128_inc(unsigned char *counter) { | ||
| 67 | unsigned long c; | ||
| 68 | |||
| 69 | /* Grab bottom dword of counter and increment */ | ||
| 70 | c = GETU32(counter + 12); | ||
| 71 | c++; c &= 0xFFFFFFFF; | ||
| 72 | PUTU32(counter + 12, c); | ||
| 73 | |||
| 74 | /* if no overflow, we're done */ | ||
| 75 | if (c) | ||
| 76 | return; | ||
| 77 | |||
| 78 | /* Grab 1st dword of counter and increment */ | ||
| 79 | c = GETU32(counter + 8); | ||
| 80 | c++; c &= 0xFFFFFFFF; | ||
| 81 | PUTU32(counter + 8, c); | ||
| 82 | |||
| 83 | /* if no overflow, we're done */ | ||
| 84 | if (c) | ||
| 85 | return; | ||
| 86 | |||
| 87 | /* Grab 2nd dword of counter and increment */ | ||
| 88 | c = GETU32(counter + 4); | ||
| 89 | c++; c &= 0xFFFFFFFF; | ||
| 90 | PUTU32(counter + 4, c); | ||
| 91 | |||
| 92 | /* if no overflow, we're done */ | ||
| 93 | if (c) | ||
| 94 | return; | ||
| 95 | 54 | ||
| 96 | /* Grab top dword of counter and increment */ | ||
| 97 | c = GETU32(counter + 0); | ||
| 98 | c++; c &= 0xFFFFFFFF; | ||
| 99 | PUTU32(counter + 0, c); | ||
| 100 | } | ||
| 101 | |||
| 102 | /* The input encrypted as though 128bit counter mode is being | ||
| 103 | * used. The extra state information to record how much of the | ||
| 104 | * 128bit block we have used is contained in *num, and the | ||
| 105 | * encrypted counter is kept in ecount_buf. Both *num and | ||
| 106 | * ecount_buf must be initialised with zeros before the first | ||
| 107 | * call to AES_ctr128_encrypt(). | ||
| 108 | * | ||
| 109 | * This algorithm assumes that the counter is in the x lower bits | ||
| 110 | * of the IV (ivec), and that the application has full control over | ||
| 111 | * overflow and the rest of the IV. This implementation takes NO | ||
| 112 | * responsability for checking that the counter doesn't overflow | ||
| 113 | * into the rest of the IV when incremented. | ||
| 114 | */ | ||
| 115 | void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, | 55 | void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, |
| 116 | const unsigned long length, const AES_KEY *key, | 56 | size_t length, const AES_KEY *key, |
| 117 | unsigned char ivec[AES_BLOCK_SIZE], | 57 | unsigned char ivec[AES_BLOCK_SIZE], |
| 118 | unsigned char ecount_buf[AES_BLOCK_SIZE], | 58 | unsigned char ecount_buf[AES_BLOCK_SIZE], |
| 119 | unsigned int *num) { | 59 | unsigned int *num) { |
| 120 | 60 | CRYPTO_ctr128_encrypt(in,out,length,key,ivec,ecount_buf,num,(block128_f)AES_encrypt); | |
| 121 | unsigned int n; | ||
| 122 | unsigned long l=length; | ||
| 123 | |||
| 124 | assert(in && out && key && counter && num); | ||
| 125 | assert(*num < AES_BLOCK_SIZE); | ||
| 126 | |||
| 127 | n = *num; | ||
| 128 | |||
| 129 | while (l--) { | ||
| 130 | if (n == 0) { | ||
| 131 | AES_encrypt(ivec, ecount_buf, key); | ||
| 132 | AES_ctr128_inc(ivec); | ||
| 133 | } | ||
| 134 | *(out++) = *(in++) ^ ecount_buf[n]; | ||
| 135 | n = (n+1) % AES_BLOCK_SIZE; | ||
| 136 | } | ||
| 137 | |||
| 138 | *num=n; | ||
| 139 | } | 61 | } |
diff --git a/src/lib/libcrypto/aes/aes_ige.c b/src/lib/libcrypto/aes/aes_ige.c index 45d7096181..c161351e65 100644 --- a/src/lib/libcrypto/aes/aes_ige.c +++ b/src/lib/libcrypto/aes/aes_ige.c | |||
| @@ -77,11 +77,11 @@ typedef struct { | |||
| 77 | /* N.B. The IV for this mode is _twice_ the block size */ | 77 | /* N.B. The IV for this mode is _twice_ the block size */ |
| 78 | 78 | ||
| 79 | void AES_ige_encrypt(const unsigned char *in, unsigned char *out, | 79 | void AES_ige_encrypt(const unsigned char *in, unsigned char *out, |
| 80 | const unsigned long length, const AES_KEY *key, | 80 | size_t length, const AES_KEY *key, |
| 81 | unsigned char *ivec, const int enc) | 81 | unsigned char *ivec, const int enc) |
| 82 | { | 82 | { |
| 83 | unsigned long n; | 83 | size_t n; |
| 84 | unsigned long len; | 84 | size_t len = length; |
| 85 | 85 | ||
| 86 | OPENSSL_assert(in && out && key && ivec); | 86 | OPENSSL_assert(in && out && key && ivec); |
| 87 | OPENSSL_assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc)); | 87 | OPENSSL_assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc)); |
| @@ -211,12 +211,12 @@ void AES_ige_encrypt(const unsigned char *in, unsigned char *out, | |||
| 211 | /* N.B. The IV for this mode is _four times_ the block size */ | 211 | /* N.B. The IV for this mode is _four times_ the block size */ |
| 212 | 212 | ||
| 213 | void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out, | 213 | void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out, |
| 214 | const unsigned long length, const AES_KEY *key, | 214 | size_t length, const AES_KEY *key, |
| 215 | const AES_KEY *key2, const unsigned char *ivec, | 215 | const AES_KEY *key2, const unsigned char *ivec, |
| 216 | const int enc) | 216 | const int enc) |
| 217 | { | 217 | { |
| 218 | unsigned long n; | 218 | size_t n; |
| 219 | unsigned long len = length; | 219 | size_t len = length; |
| 220 | unsigned char tmp[AES_BLOCK_SIZE]; | 220 | unsigned char tmp[AES_BLOCK_SIZE]; |
| 221 | unsigned char tmp2[AES_BLOCK_SIZE]; | 221 | unsigned char tmp2[AES_BLOCK_SIZE]; |
| 222 | unsigned char tmp3[AES_BLOCK_SIZE]; | 222 | unsigned char tmp3[AES_BLOCK_SIZE]; |
diff --git a/src/lib/libcrypto/aes/aes_ofb.c b/src/lib/libcrypto/aes/aes_ofb.c index f358bb39e2..50bf0b8325 100644 --- a/src/lib/libcrypto/aes/aes_ofb.c +++ b/src/lib/libcrypto/aes/aes_ofb.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* crypto/aes/aes_ofb.c -*- mode:C; c-file-style: "eay" -*- */ | 1 | /* crypto/aes/aes_ofb.c -*- mode:C; c-file-style: "eay" -*- */ |
| 2 | /* ==================================================================== | 2 | /* ==================================================================== |
| 3 | * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
| @@ -48,95 +48,13 @@ | |||
| 48 | * ==================================================================== | 48 | * ==================================================================== |
| 49 | * | 49 | * |
| 50 | */ | 50 | */ |
| 51 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 52 | * All rights reserved. | ||
| 53 | * | ||
| 54 | * This package is an SSL implementation written | ||
| 55 | * by Eric Young (eay@cryptsoft.com). | ||
| 56 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 57 | * | ||
| 58 | * This library is free for commercial and non-commercial use as long as | ||
| 59 | * the following conditions are aheared to. The following conditions | ||
| 60 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 61 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 62 | * included with this distribution is covered by the same copyright terms | ||
| 63 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 64 | * | ||
| 65 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 66 | * the code are not to be removed. | ||
| 67 | * If this package is used in a product, Eric Young should be given attribution | ||
| 68 | * as the author of the parts of the library used. | ||
| 69 | * This can be in the form of a textual message at program startup or | ||
| 70 | * in documentation (online or textual) provided with the package. | ||
| 71 | * | ||
| 72 | * Redistribution and use in source and binary forms, with or without | ||
| 73 | * modification, are permitted provided that the following conditions | ||
| 74 | * are met: | ||
| 75 | * 1. Redistributions of source code must retain the copyright | ||
| 76 | * notice, this list of conditions and the following disclaimer. | ||
| 77 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 78 | * notice, this list of conditions and the following disclaimer in the | ||
| 79 | * documentation and/or other materials provided with the distribution. | ||
| 80 | * 3. All advertising materials mentioning features or use of this software | ||
| 81 | * must display the following acknowledgement: | ||
| 82 | * "This product includes cryptographic software written by | ||
| 83 | * Eric Young (eay@cryptsoft.com)" | ||
| 84 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 85 | * being used are not cryptographic related :-). | ||
| 86 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 87 | * the apps directory (application code) you must include an acknowledgement: | ||
| 88 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 89 | * | ||
| 90 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 91 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 92 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 93 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 94 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 95 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 96 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 97 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 98 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 99 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 100 | * SUCH DAMAGE. | ||
| 101 | * | ||
| 102 | * The licence and distribution terms for any publically available version or | ||
| 103 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 104 | * copied and put under another distribution licence | ||
| 105 | * [including the GNU Public Licence.] | ||
| 106 | */ | ||
| 107 | |||
| 108 | #ifndef AES_DEBUG | ||
| 109 | # ifndef NDEBUG | ||
| 110 | # define NDEBUG | ||
| 111 | # endif | ||
| 112 | #endif | ||
| 113 | #include <assert.h> | ||
| 114 | 51 | ||
| 115 | #include <openssl/aes.h> | 52 | #include <openssl/aes.h> |
| 116 | #include "aes_locl.h" | 53 | #include <openssl/modes.h> |
| 117 | 54 | ||
| 118 | /* The input and output encrypted as though 128bit ofb mode is being | ||
| 119 | * used. The extra state information to record how much of the | ||
| 120 | * 128bit block we have used is contained in *num; | ||
| 121 | */ | ||
| 122 | void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, | 55 | void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, |
| 123 | const unsigned long length, const AES_KEY *key, | 56 | size_t length, const AES_KEY *key, |
| 124 | unsigned char *ivec, int *num) { | 57 | unsigned char *ivec, int *num) |
| 125 | 58 | { | |
| 126 | unsigned int n; | 59 | CRYPTO_ofb128_encrypt(in,out,length,key,ivec,num,(block128_f)AES_encrypt); |
| 127 | unsigned long l=length; | ||
| 128 | |||
| 129 | assert(in && out && key && ivec && num); | ||
| 130 | |||
| 131 | n = *num; | ||
| 132 | |||
| 133 | while (l--) { | ||
| 134 | if (n == 0) { | ||
| 135 | AES_encrypt(ivec, ivec, key); | ||
| 136 | } | ||
| 137 | *(out++) = *(in++) ^ ivec[n]; | ||
| 138 | n = (n+1) % AES_BLOCK_SIZE; | ||
| 139 | } | ||
| 140 | |||
| 141 | *num=n; | ||
| 142 | } | 60 | } |
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl index 3bc46a968e..aab40e6f1c 100644 --- a/src/lib/libcrypto/aes/asm/aes-586.pl +++ b/src/lib/libcrypto/aes/asm/aes-586.pl | |||
| @@ -2,11 +2,12 @@ | |||
| 2 | # | 2 | # |
| 3 | # ==================================================================== | 3 | # ==================================================================== |
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
| 5 | # project. Rights for redistribution and usage in source and binary | 5 | # project. The module is, however, dual licensed under OpenSSL and |
| 6 | # forms are granted according to the OpenSSL license. | 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 7 | # ==================================================================== | 8 | # ==================================================================== |
| 8 | # | 9 | # |
| 9 | # Version 3.6. | 10 | # Version 4.3. |
| 10 | # | 11 | # |
| 11 | # You might fail to appreciate this module performance from the first | 12 | # You might fail to appreciate this module performance from the first |
| 12 | # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered | 13 | # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered |
| @@ -81,11 +82,117 @@ | |||
| 81 | # AMD K8 20 19 | 82 | # AMD K8 20 19 |
| 82 | # PIII 25 23 | 83 | # PIII 25 23 |
| 83 | # Pentium 81 78 | 84 | # Pentium 81 78 |
| 84 | 85 | # | |
| 85 | push(@INC,"perlasm","../../perlasm"); | 86 | # Version 3.7 reimplements outer rounds as "compact." Meaning that |
| 87 | # first and last rounds reference compact 256 bytes S-box. This means | ||
| 88 | # that first round consumes a lot more CPU cycles and that encrypt | ||
| 89 | # and decrypt performance becomes asymmetric. Encrypt performance | ||
| 90 | # drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is | ||
| 91 | # aggressively pre-fetched. | ||
| 92 | # | ||
| 93 | # Version 4.0 effectively rolls back to 3.6 and instead implements | ||
| 94 | # additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact, | ||
| 95 | # which use exclusively 256 byte S-box. These functions are to be | ||
| 96 | # called in modes not concealing plain text, such as ECB, or when | ||
| 97 | # we're asked to process smaller amount of data [or unconditionally | ||
| 98 | # on hyper-threading CPU]. Currently it's called unconditionally from | ||
| 99 | # AES_[en|de]crypt, which affects all modes, but CBC. CBC routine | ||
| 100 | # still needs to be modified to switch between slower and faster | ||
| 101 | # mode when appropriate... But in either case benchmark landscape | ||
| 102 | # changes dramatically and below numbers are CPU cycles per processed | ||
| 103 | # byte for 128-bit key. | ||
| 104 | # | ||
| 105 | # ECB encrypt ECB decrypt CBC large chunk | ||
| 106 | # P4 56[60] 84[100] 23 | ||
| 107 | # AMD K8 48[44] 70[79] 18 | ||
| 108 | # PIII 41[50] 61[91] 24 | ||
| 109 | # Core 2 32[38] 45[70] 18.5 | ||
| 110 | # Pentium 120 160 77 | ||
| 111 | # | ||
| 112 | # Version 4.1 switches to compact S-box even in key schedule setup. | ||
| 113 | # | ||
| 114 | # Version 4.2 prefetches compact S-box in every SSE round or in other | ||
| 115 | # words every cache-line is *guaranteed* to be accessed within ~50 | ||
| 116 | # cycles window. Why just SSE? Because it's needed on hyper-threading | ||
| 117 | # CPU! Which is also why it's prefetched with 64 byte stride. Best | ||
| 118 | # part is that it has no negative effect on performance:-) | ||
| 119 | # | ||
| 120 | # Version 4.3 implements switch between compact and non-compact block | ||
| 121 | # functions in AES_cbc_encrypt depending on how much data was asked | ||
| 122 | # to be processed in one stroke. | ||
| 123 | # | ||
| 124 | ###################################################################### | ||
| 125 | # Timing attacks are classified in two classes: synchronous when | ||
| 126 | # attacker consciously initiates cryptographic operation and collects | ||
| 127 | # timing data of various character afterwards, and asynchronous when | ||
| 128 | # malicious code is executed on same CPU simultaneously with AES, | ||
| 129 | # instruments itself and performs statistical analysis of this data. | ||
| 130 | # | ||
| 131 | # As far as synchronous attacks go the root to the AES timing | ||
| 132 | # vulnerability is twofold. Firstly, of 256 S-box elements at most 160 | ||
| 133 | # are referred to in single 128-bit block operation. Well, in C | ||
| 134 | # implementation with 4 distinct tables it's actually as little as 40 | ||
| 135 | # references per 256 elements table, but anyway... Secondly, even | ||
| 136 | # though S-box elements are clustered into smaller amount of cache- | ||
| 137 | # lines, smaller than 160 and even 40, it turned out that for certain | ||
| 138 | # plain-text pattern[s] or simply put chosen plain-text and given key | ||
| 139 | # few cache-lines remain unaccessed during block operation. Now, if | ||
| 140 | # attacker can figure out this access pattern, he can deduce the key | ||
| 141 | # [or at least part of it]. The natural way to mitigate this kind of | ||
| 142 | # attacks is to minimize the amount of cache-lines in S-box and/or | ||
| 143 | # prefetch them to ensure that every one is accessed for more uniform | ||
| 144 | # timing. But note that *if* plain-text was concealed in such way that | ||
| 145 | # input to block function is distributed *uniformly*, then attack | ||
| 146 | # wouldn't apply. Now note that some encryption modes, most notably | ||
| 147 | # CBC, do mask the plain-text in this exact way [secure cipher output | ||
| 148 | # is distributed uniformly]. Yes, one still might find input that | ||
| 149 | # would reveal the information about given key, but if amount of | ||
| 150 | # candidate inputs to be tried is larger than amount of possible key | ||
| 151 | # combinations then attack becomes infeasible. This is why revised | ||
| 152 | # AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk | ||
| 153 | # of data is to be processed in one stroke. The current size limit of | ||
| 154 | # 512 bytes is chosen to provide same [diminishingly low] probability | ||
| 155 | # for cache-line to remain untouched in large chunk operation with | ||
| 156 | # large S-box as for single block operation with compact S-box and | ||
| 157 | # surely needs more careful consideration... | ||
| 158 | # | ||
| 159 | # As for asynchronous attacks. There are two flavours: attacker code | ||
| 160 | # being interleaved with AES on hyper-threading CPU at *instruction* | ||
| 161 | # level, and two processes time sharing single core. As for latter. | ||
| 162 | # Two vectors. 1. Given that attacker process has higher priority, | ||
| 163 | # yield execution to process performing AES just before timer fires | ||
| 164 | # off the scheduler, immediately regain control of CPU and analyze the | ||
| 165 | # cache state. For this attack to be efficient attacker would have to | ||
| 166 | # effectively slow down the operation by several *orders* of magnitude, | ||
| 167 | # by ratio of time slice to duration of handful of AES rounds, which is | ||
| 168 | # unlikely to remain unnoticed. Not to mention that this also means | ||
| 169 | # that he would spend correspondingly more time to collect enough | ||
| 170 | # statistical data to mount the attack. It's probably appropriate to | ||
| 171 | # say that if adversary reckons that this attack is beneficial and | ||
| 172 | # risks to be noticed, you probably have larger problems having him | ||
| 173 | # mere opportunity. In other words suggested code design expects you | ||
| 174 | # to preclude/mitigate this attack by overall system security design. | ||
| 175 | # 2. Attacker manages to make his code interrupt driven. In order for | ||
| 176 | # this kind of attack to be feasible, interrupt rate has to be high | ||
| 177 | # enough, again comparable to duration of handful of AES rounds. But | ||
| 178 | # is there interrupt source of such rate? Hardly, not even 1Gbps NIC | ||
| 179 | # generates interrupts at such raging rate... | ||
| 180 | # | ||
| 181 | # And now back to the former, hyper-threading CPU or more specifically | ||
| 182 | # Intel P4. Recall that asynchronous attack implies that malicious | ||
| 183 | # code instruments itself. And naturally instrumentation granularity | ||
| 184 | # has to be noticeably lower than duration of codepath accessing S-box. | ||
| 185 | # Given that all cache-lines are accessed during that time that is. | ||
| 186 | # Current implementation accesses *all* cache-lines within ~50 cycles | ||
| 187 | # window, which is actually *less* than RDTSC latency on Intel P4! | ||
| 188 | |||
| 189 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 190 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 86 | require "x86asm.pl"; | 191 | require "x86asm.pl"; |
| 87 | 192 | ||
| 88 | &asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386"); | 193 | &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386"); |
| 194 | &static_label("AES_Te"); | ||
| 195 | &static_label("AES_Td"); | ||
| 89 | 196 | ||
| 90 | $s0="eax"; | 197 | $s0="eax"; |
| 91 | $s1="ebx"; | 198 | $s1="ebx"; |
| @@ -93,21 +200,36 @@ $s2="ecx"; | |||
| 93 | $s3="edx"; | 200 | $s3="edx"; |
| 94 | $key="edi"; | 201 | $key="edi"; |
| 95 | $acc="esi"; | 202 | $acc="esi"; |
| 203 | $tbl="ebp"; | ||
| 204 | |||
| 205 | # stack frame layout in _[x86|sse]_AES_* routines, frame is allocated | ||
| 206 | # by caller | ||
| 207 | $__ra=&DWP(0,"esp"); # return address | ||
| 208 | $__s0=&DWP(4,"esp"); # s0 backing store | ||
| 209 | $__s1=&DWP(8,"esp"); # s1 backing store | ||
| 210 | $__s2=&DWP(12,"esp"); # s2 backing store | ||
| 211 | $__s3=&DWP(16,"esp"); # s3 backing store | ||
| 212 | $__key=&DWP(20,"esp"); # pointer to key schedule | ||
| 213 | $__end=&DWP(24,"esp"); # pointer to end of key schedule | ||
| 214 | $__tbl=&DWP(28,"esp"); # %ebp backing store | ||
| 215 | |||
| 216 | # stack frame layout in AES_[en|de]crypt routines, which differs from | ||
| 217 | # above by 4 and overlaps by %ebp backing store | ||
| 218 | $_tbl=&DWP(24,"esp"); | ||
| 219 | $_esp=&DWP(28,"esp"); | ||
| 96 | 220 | ||
| 97 | $compromise=0; # $compromise=128 abstains from copying key | 221 | sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } |
| 98 | # schedule to stack when encrypting inputs | 222 | |
| 99 | # shorter than 128 bytes at the cost of | 223 | $speed_limit=512; # chunks smaller than $speed_limit are |
| 100 | # risksing aliasing with S-boxes. In return | 224 | # processed with compact routine in CBC mode |
| 101 | # you get way better, up to +70%, small block | ||
| 102 | # performance. | ||
| 103 | $small_footprint=1; # $small_footprint=1 code is ~5% slower [on | 225 | $small_footprint=1; # $small_footprint=1 code is ~5% slower [on |
| 104 | # recent µ-archs], but ~5 times smaller! | 226 | # recent µ-archs], but ~5 times smaller! |
| 105 | # I favor compact code to minimize cache | 227 | # I favor compact code to minimize cache |
| 106 | # contention and in hope to "collect" 5% back | 228 | # contention and in hope to "collect" 5% back |
| 107 | # in real-life applications... | 229 | # in real-life applications... |
| 230 | |||
| 108 | $vertical_spin=0; # shift "vertically" defaults to 0, because of | 231 | $vertical_spin=0; # shift "vertically" defaults to 0, because of |
| 109 | # its proof-of-concept status... | 232 | # its proof-of-concept status... |
| 110 | |||
| 111 | # Note that there is no decvert(), as well as last encryption round is | 233 | # Note that there is no decvert(), as well as last encryption round is |
| 112 | # performed with "horizontal" shifts. This is because this "vertical" | 234 | # performed with "horizontal" shifts. This is because this "vertical" |
| 113 | # implementation [one which groups shifts on a given $s[i] to form a | 235 | # implementation [one which groups shifts on a given $s[i] to form a |
| @@ -170,17 +292,484 @@ sub encvert() | |||
| 170 | &movz ($v0,&HB($v1)); | 292 | &movz ($v0,&HB($v1)); |
| 171 | &and ($v1,0xFF); | 293 | &and ($v1,0xFF); |
| 172 | &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16 | 294 | &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16 |
| 173 | &mov ($key,&DWP(12,"esp")); # reincarnate v1 as key | 295 | &mov ($key,$__key); # reincarnate v1 as key |
| 174 | &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24 | 296 | &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24 |
| 175 | } | 297 | } |
| 176 | 298 | ||
| 299 | # Another experimental routine, which features "horizontal spin," but | ||
| 300 | # eliminates one reference to stack. Strangely enough runs slower... | ||
| 301 | sub enchoriz() | ||
| 302 | { my $v0 = $key, $v1 = $acc; | ||
| 303 | |||
| 304 | &movz ($v0,&LB($s0)); # 3, 2, 1, 0* | ||
| 305 | &rotr ($s2,8); # 8,11,10, 9 | ||
| 306 | &mov ($v1,&DWP(0,$te,$v0,8)); # 0 | ||
| 307 | &movz ($v0,&HB($s1)); # 7, 6, 5*, 4 | ||
| 308 | &rotr ($s3,16); # 13,12,15,14 | ||
| 309 | &xor ($v1,&DWP(3,$te,$v0,8)); # 5 | ||
| 310 | &movz ($v0,&HB($s2)); # 8,11,10*, 9 | ||
| 311 | &rotr ($s0,16); # 1, 0, 3, 2 | ||
| 312 | &xor ($v1,&DWP(2,$te,$v0,8)); # 10 | ||
| 313 | &movz ($v0,&HB($s3)); # 13,12,15*,14 | ||
| 314 | &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected | ||
| 315 | &mov ($__s0,$v1); # t[0] saved | ||
| 316 | |||
| 317 | &movz ($v0,&LB($s1)); # 7, 6, 5, 4* | ||
| 318 | &shr ($s1,16); # -, -, 7, 6 | ||
| 319 | &mov ($v1,&DWP(0,$te,$v0,8)); # 4 | ||
| 320 | &movz ($v0,&LB($s3)); # 13,12,15,14* | ||
| 321 | &xor ($v1,&DWP(2,$te,$v0,8)); # 14 | ||
| 322 | &movz ($v0,&HB($s0)); # 1, 0, 3*, 2 | ||
| 323 | &and ($s3,0xffff0000); # 13,12, -, - | ||
| 324 | &xor ($v1,&DWP(1,$te,$v0,8)); # 3 | ||
| 325 | &movz ($v0,&LB($s2)); # 8,11,10, 9* | ||
| 326 | &or ($s3,$s1); # 13,12, 7, 6 | ||
| 327 | &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected | ||
| 328 | &mov ($s1,$v1); # s[1]=t[1] | ||
| 329 | |||
| 330 | &movz ($v0,&LB($s0)); # 1, 0, 3, 2* | ||
| 331 | &shr ($s2,16); # -, -, 8,11 | ||
| 332 | &mov ($v1,&DWP(2,$te,$v0,8)); # 2 | ||
| 333 | &movz ($v0,&HB($s3)); # 13,12, 7*, 6 | ||
| 334 | &xor ($v1,&DWP(1,$te,$v0,8)); # 7 | ||
| 335 | &movz ($v0,&HB($s2)); # -, -, 8*,11 | ||
| 336 | &xor ($v1,&DWP(0,$te,$v0,8)); # 8 | ||
| 337 | &mov ($v0,$s3); | ||
| 338 | &shr ($v0,24); # 13 | ||
| 339 | &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected | ||
| 340 | |||
| 341 | &movz ($v0,&LB($s2)); # -, -, 8,11* | ||
| 342 | &shr ($s0,24); # 1* | ||
| 343 | &mov ($s2,&DWP(1,$te,$v0,8)); # 11 | ||
| 344 | &xor ($s2,&DWP(3,$te,$s0,8)); # 1 | ||
| 345 | &mov ($s0,$__s0); # s[0]=t[0] | ||
| 346 | &movz ($v0,&LB($s3)); # 13,12, 7, 6* | ||
| 347 | &shr ($s3,16); # , ,13,12 | ||
| 348 | &xor ($s2,&DWP(2,$te,$v0,8)); # 6 | ||
| 349 | &mov ($key,$__key); # reincarnate v0 as key | ||
| 350 | &and ($s3,0xff); # , ,13,12* | ||
| 351 | &mov ($s3,&DWP(0,$te,$s3,8)); # 12 | ||
| 352 | &xor ($s3,$s2); # s[3]=t[3] collected | ||
| 353 | &mov ($s2,$v1); # s[2]=t[2] | ||
| 354 | } | ||
| 355 | |||
| 356 | # More experimental code... SSE one... Even though this one eliminates | ||
| 357 | # *all* references to stack, it's not faster... | ||
| 358 | sub sse_encbody() | ||
| 359 | { | ||
| 360 | &movz ($acc,&LB("eax")); # 0 | ||
| 361 | &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0 | ||
| 362 | &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 | ||
| 363 | &movz ("edx",&HB("eax")); # 1 | ||
| 364 | &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1 | ||
| 365 | &shr ("eax",16); # 5, 4 | ||
| 366 | |||
| 367 | &movz ($acc,&LB("ebx")); # 10 | ||
| 368 | &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10 | ||
| 369 | &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8 | ||
| 370 | &movz ($acc,&HB("ebx")); # 11 | ||
| 371 | &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11 | ||
| 372 | &shr ("ebx",16); # 15,14 | ||
| 373 | |||
| 374 | &movz ($acc,&HB("eax")); # 5 | ||
| 375 | &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5 | ||
| 376 | &movq ("mm3",QWP(16,$key)); | ||
| 377 | &movz ($acc,&HB("ebx")); # 15 | ||
| 378 | &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15 | ||
| 379 | &movd ("mm0","ecx"); # t[0] collected | ||
| 380 | |||
| 381 | &movz ($acc,&LB("eax")); # 4 | ||
| 382 | &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4 | ||
| 383 | &movd ("eax","mm2"); # 7, 6, 3, 2 | ||
| 384 | &movz ($acc,&LB("ebx")); # 14 | ||
| 385 | &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14 | ||
| 386 | &movd ("ebx","mm6"); # 13,12, 9, 8 | ||
| 387 | |||
| 388 | &movz ($acc,&HB("eax")); # 3 | ||
| 389 | &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3 | ||
| 390 | &movz ($acc,&HB("ebx")); # 9 | ||
| 391 | &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9 | ||
| 392 | &movd ("mm1","ecx"); # t[1] collected | ||
| 393 | |||
| 394 | &movz ($acc,&LB("eax")); # 2 | ||
| 395 | &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2 | ||
| 396 | &shr ("eax",16); # 7, 6 | ||
| 397 | &punpckldq ("mm0","mm1"); # t[0,1] collected | ||
| 398 | &movz ($acc,&LB("ebx")); # 8 | ||
| 399 | &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8 | ||
| 400 | &shr ("ebx",16); # 13,12 | ||
| 401 | |||
| 402 | &movz ($acc,&HB("eax")); # 7 | ||
| 403 | &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7 | ||
| 404 | &pxor ("mm0","mm3"); | ||
| 405 | &movz ("eax",&LB("eax")); # 6 | ||
| 406 | &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6 | ||
| 407 | &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0 | ||
| 408 | &movz ($acc,&HB("ebx")); # 13 | ||
| 409 | &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13 | ||
| 410 | &xor ("ecx",&DWP(24,$key)); # t[2] | ||
| 411 | &movd ("mm4","ecx"); # t[2] collected | ||
| 412 | &movz ("ebx",&LB("ebx")); # 12 | ||
| 413 | &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12 | ||
| 414 | &shr ("ecx",16); | ||
| 415 | &movd ("eax","mm1"); # 5, 4, 1, 0 | ||
| 416 | &mov ("ebx",&DWP(28,$key)); # t[3] | ||
| 417 | &xor ("ebx","edx"); | ||
| 418 | &movd ("mm5","ebx"); # t[3] collected | ||
| 419 | &and ("ebx",0xffff0000); | ||
| 420 | &or ("ebx","ecx"); | ||
| 421 | |||
| 422 | &punpckldq ("mm4","mm5"); # t[2,3] collected | ||
| 423 | } | ||
| 424 | |||
| 425 | ###################################################################### | ||
| 426 | # "Compact" block function | ||
| 427 | ###################################################################### | ||
| 428 | |||
| 429 | sub enccompact() | ||
| 430 | { my $Fn = mov; | ||
| 431 | while ($#_>5) { pop(@_); $Fn=sub{}; } | ||
| 432 | my ($i,$te,@s)=@_; | ||
| 433 | my $tmp = $key; | ||
| 434 | my $out = $i==3?$s[0]:$acc; | ||
| 435 | |||
| 436 | # $Fn is used in first compact round and its purpose is to | ||
| 437 | # void restoration of some values from stack, so that after | ||
| 438 | # 4xenccompact with extra argument $key value is left there... | ||
| 439 | if ($i==3) { &$Fn ($key,$__key); }##%edx | ||
| 440 | else { &mov ($out,$s[0]); } | ||
| 441 | &and ($out,0xFF); | ||
| 442 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] | ||
| 443 | if ($i==2) { &shr ($s[0],24); }#%ecx[2] | ||
| 444 | &movz ($out,&BP(-128,$te,$out,1)); | ||
| 445 | |||
| 446 | if ($i==3) { $tmp=$s[1]; }##%eax | ||
| 447 | &movz ($tmp,&HB($s[1])); | ||
| 448 | &movz ($tmp,&BP(-128,$te,$tmp,1)); | ||
| 449 | &shl ($tmp,8); | ||
| 450 | &xor ($out,$tmp); | ||
| 451 | |||
| 452 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx | ||
| 453 | else { &mov ($tmp,$s[2]); | ||
| 454 | &shr ($tmp,16); } | ||
| 455 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] | ||
| 456 | &and ($tmp,0xFF); | ||
| 457 | &movz ($tmp,&BP(-128,$te,$tmp,1)); | ||
| 458 | &shl ($tmp,16); | ||
| 459 | &xor ($out,$tmp); | ||
| 460 | |||
| 461 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx | ||
| 462 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] | ||
| 463 | else { &mov ($tmp,$s[3]); | ||
| 464 | &shr ($tmp,24); } | ||
| 465 | &movz ($tmp,&BP(-128,$te,$tmp,1)); | ||
| 466 | &shl ($tmp,24); | ||
| 467 | &xor ($out,$tmp); | ||
| 468 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | ||
| 469 | if ($i==3) { &mov ($s[3],$acc); } | ||
| 470 | &comment(); | ||
| 471 | } | ||
| 472 | |||
| 473 | sub enctransform() | ||
| 474 | { my @s = ($s0,$s1,$s2,$s3); | ||
| 475 | my $i = shift; | ||
| 476 | my $tmp = $tbl; | ||
| 477 | my $r2 = $key ; | ||
| 478 | |||
| 479 | &mov ($acc,$s[$i]); | ||
| 480 | &and ($acc,0x80808080); | ||
| 481 | &mov ($tmp,$acc); | ||
| 482 | &shr ($tmp,7); | ||
| 483 | &lea ($r2,&DWP(0,$s[$i],$s[$i])); | ||
| 484 | &sub ($acc,$tmp); | ||
| 485 | &and ($r2,0xfefefefe); | ||
| 486 | &and ($acc,0x1b1b1b1b); | ||
| 487 | &mov ($tmp,$s[$i]); | ||
| 488 | &xor ($acc,$r2); # r2 | ||
| 489 | |||
| 490 | &xor ($s[$i],$acc); # r0 ^ r2 | ||
| 491 | &rotl ($s[$i],24); | ||
| 492 | &xor ($s[$i],$acc) # ROTATE(r2^r0,24) ^ r2 | ||
| 493 | &rotr ($tmp,16); | ||
| 494 | &xor ($s[$i],$tmp); | ||
| 495 | &rotr ($tmp,8); | ||
| 496 | &xor ($s[$i],$tmp); | ||
| 497 | } | ||
| 498 | |||
| 499 | &function_begin_B("_x86_AES_encrypt_compact"); # state in $s0..$s3, key schedule in $key, byte table in $tbl; result is left in $s0..$s3 | ||
| 500 | # note that caller is expected to allocate stack frame for me! | ||
| 501 | &mov ($__key,$key); # save key | ||
| 502 | |||
| 503 | &xor ($s0,&DWP(0,$key)); # xor with key | ||
| 504 | &xor ($s1,&DWP(4,$key)); | ||
| 505 | &xor ($s2,&DWP(8,$key)); | ||
| 506 | &xor ($s3,&DWP(12,$key)); | ||
| 507 | |||
| 508 | &mov ($acc,&DWP(240,$key)); # load key->rounds | ||
| 509 | &lea ($acc,&DWP(-2,$acc,$acc)); # acc = 2*rounds-2 | ||
| 510 | &lea ($acc,&DWP(0,$key,$acc,8)); # acc = key+8*acc = key+16*(rounds-1) | ||
| 511 | &mov ($__end,$acc); # end of key schedule | ||
| 512 | |||
| 513 | # prefetch Te4 | ||
| 514 | &mov ($key,&DWP(0-128,$tbl)); # eight loads 32 bytes apart, covering offsets -128..+96 from $tbl | ||
| 515 | &mov ($acc,&DWP(32-128,$tbl)); | ||
| 516 | &mov ($key,&DWP(64-128,$tbl)); | ||
| 517 | &mov ($acc,&DWP(96-128,$tbl)); | ||
| 518 | &mov ($key,&DWP(128-128,$tbl)); | ||
| 519 | &mov ($acc,&DWP(160-128,$tbl)); | ||
| 520 | &mov ($key,&DWP(192-128,$tbl)); | ||
| 521 | &mov ($acc,&DWP(224-128,$tbl)); # NOTE(review): loaded values look unused, only the memory touch matters - confirm | ||
| 522 | |||
| 523 | &set_label("loop",16); | ||
| 524 | |||
| 525 | &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1); # one call per output word, state registers rotated by one each time | ||
| 526 | &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1); | ||
| 527 | &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1); | ||
| 528 | &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1); | ||
| 529 | &enctransform(2); # then enctransform on every state word | ||
| 530 | &enctransform(3); | ||
| 531 | &enctransform(0); | ||
| 532 | &enctransform(1); | ||
| 533 | &mov ($key,$__key); # enctransform used $key as scratch, reload it | ||
| 534 | &mov ($tbl,$__tbl); # enctransform used $tbl as scratch, reload it | ||
| 535 | &add ($key,16); # advance rd_key | ||
| 536 | &xor ($s0,&DWP(0,$key)); | ||
| 537 | &xor ($s1,&DWP(4,$key)); | ||
| 538 | &xor ($s2,&DWP(8,$key)); | ||
| 539 | &xor ($s3,&DWP(12,$key)); | ||
| 540 | |||
| 541 | &cmp ($key,$__end); # compare first; the mov below leaves the flags intact for jb | ||
| 542 | &mov ($__key,$key); | ||
| 543 | &jb (&label("loop")); # keep looping while key is below the end marker | ||
| 544 | |||
| 545 | &enccompact(0,$tbl,$s0,$s1,$s2,$s3); # final round: same step without the trailing argument and with no enctransform | ||
| 546 | &enccompact(1,$tbl,$s1,$s2,$s3,$s0); | ||
| 547 | &enccompact(2,$tbl,$s2,$s3,$s0,$s1); | ||
| 548 | &enccompact(3,$tbl,$s3,$s0,$s1,$s2); | ||
| 549 | |||
| 550 | &xor ($s0,&DWP(16,$key)); # xor with the 16 bytes at $key+16 | ||
| 551 | &xor ($s1,&DWP(20,$key)); | ||
| 552 | &xor ($s2,&DWP(24,$key)); | ||
| 553 | &xor ($s3,&DWP(28,$key)); | ||
| 554 | |||
| 555 | &ret (); | ||
| 556 | &function_end_B("_x86_AES_encrypt_compact"); | ||
| 557 | |||
| 558 | ###################################################################### | ||
| 559 | # "Compact" SSE block function. | ||
| 560 | ###################################################################### | ||
| 561 | # | ||
| 562 | # Performance is not actually extraordinary in comparison to pure | ||
| 563 | # x86 code. In particular encrypt performance is virtually the same. | ||
| 564 | # Decrypt performance on the other hand is 15-20% better on newer | ||
| 565 | # µ-archs [but we're thankful for *any* improvement here], and ~50% | ||
| 566 | # better on PIII:-) And additionally on the pros side this code | ||
| 567 | # eliminates redundant references to stack and thus relieves/ | ||
| 568 | # minimizes the pressure on the memory bus. | ||
| 569 | # | ||
| 570 | # MMX register layout lsb | ||
| 571 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | ||
| 572 | # | mm4 | mm0 | | ||
| 573 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | ||
| 574 | # | s3 | s2 | s1 | s0 | | ||
| 575 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | ||
| 576 | # |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0| | ||
| 577 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | ||
| 578 | # | ||
| 579 | # Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8. | ||
| 580 | # In these terms the encryption and decryption "compact" permutation | ||
| 581 | # matrices can be depicted as follows: | ||
| 582 | # | ||
| 583 | # encryption lsb # decryption lsb | ||
| 584 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
| 585 | # | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 | | ||
| 586 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
| 587 | # | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 | | ||
| 588 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
| 589 | # | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 | | ||
| 590 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
| 591 | # | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 | | ||
| 592 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
| 593 | # | ||
| 594 | ###################################################################### | ||
| 595 | # Why not xmm registers? Short answer. It was actually tested and | ||
| 596 | # was not any faster, but *contrary*, most notably on Intel CPUs. | ||
| 597 | # Longer answer. Main advantage of using mm registers is that movd | ||
| 598 | # latency is lower, especially on Intel P4. While arithmetic | ||
| 599 | # instructions are twice as many, they can be scheduled every cycle | ||
| 600 | # and not every second one when they are operating on xmm register, | ||
| 601 | # so that "arithmetic throughput" remains virtually the same. And | ||
| 602 | # finally the code can be executed even on older SSE-only CPUs:-) | ||
| 603 | |||
| 604 | sub sse_enccompact() # emit one byte-lookup-and-permute pass: state in mm0/mm4, result in mm0/mm4; scratch eax,ebx,ecx,edx,$acc,mm1,mm2,mm5,mm6 | ||
| 605 | { | ||
| 606 | &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0 | ||
| 607 | &pshufw ("mm5","mm4",0x0d); # 15,14,11,10 | ||
| 608 | &movd ("eax","mm1"); # 5, 4, 1, 0 | ||
| 609 | &movd ("ebx","mm5"); # 15,14,11,10 | ||
| 610 | |||
| 611 | &movz ($acc,&LB("eax")); # 0 | ||
| 612 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 (ecx starts t[0]) | ||
| 613 | &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 | ||
| 614 | &movz ("edx",&HB("eax")); # 1 | ||
| 615 | &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 | ||
| 616 | &shl ("edx",8); # 1 (edx starts t[3]) | ||
| 617 | &shr ("eax",16); # 5, 4 | ||
| 618 | |||
| 619 | &movz ($acc,&LB("ebx")); # 10 | ||
| 620 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 | ||
| 621 | &shl ($acc,16); # 10 | ||
| 622 | &or ("ecx",$acc); # 10 | ||
| 623 | &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8 | ||
| 624 | &movz ($acc,&HB("ebx")); # 11 | ||
| 625 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 | ||
| 626 | &shl ($acc,24); # 11 | ||
| 627 | &or ("edx",$acc); # 11 | ||
| 628 | &shr ("ebx",16); # 15,14 | ||
| 629 | |||
| 630 | &movz ($acc,&HB("eax")); # 5 | ||
| 631 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5 | ||
| 632 | &shl ($acc,8); # 5 | ||
| 633 | &or ("ecx",$acc); # 5 | ||
| 634 | &movz ($acc,&HB("ebx")); # 15 | ||
| 635 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 | ||
| 636 | &shl ($acc,24); # 15 | ||
| 637 | &or ("ecx",$acc); # 15 | ||
| 638 | &movd ("mm0","ecx"); # t[0] collected | ||
| 639 | |||
| 640 | &movz ($acc,&LB("eax")); # 4 | ||
| 641 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4 (ecx starts t[1]) | ||
| 642 | &movd ("eax","mm2"); # 7, 6, 3, 2 | ||
| 643 | &movz ($acc,&LB("ebx")); # 14 | ||
| 644 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 | ||
| 645 | &shl ($acc,16); # 14 | ||
| 646 | &or ("ecx",$acc); # 14 | ||
| 647 | |||
| 648 | &movd ("ebx","mm6"); # 13,12, 9, 8 | ||
| 649 | &movz ($acc,&HB("eax")); # 3 | ||
| 650 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3 | ||
| 651 | &shl ($acc,24); # 3 | ||
| 652 | &or ("ecx",$acc); # 3 | ||
| 653 | &movz ($acc,&HB("ebx")); # 9 | ||
| 654 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 | ||
| 655 | &shl ($acc,8); # 9 | ||
| 656 | &or ("ecx",$acc); # 9 | ||
| 657 | &movd ("mm1","ecx"); # t[1] collected | ||
| 658 | |||
| 659 | &movz ($acc,&LB("ebx")); # 8 | ||
| 660 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8 (ecx starts t[2]) | ||
| 661 | &shr ("ebx",16); # 13,12 | ||
| 662 | &movz ($acc,&LB("eax")); # 2 | ||
| 663 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 | ||
| 664 | &shl ($acc,16); # 2 | ||
| 665 | &or ("ecx",$acc); # 2 | ||
| 666 | &shr ("eax",16); # 7, 6 | ||
| 667 | |||
| 668 | &punpckldq ("mm0","mm1"); # t[0,1] collected in mm0 | ||
| 669 | |||
| 670 | &movz ($acc,&HB("eax")); # 7 | ||
| 671 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 | ||
| 672 | &shl ($acc,24); # 7 | ||
| 673 | &or ("ecx",$acc); # 7 | ||
| 674 | &and ("eax",0xff); # 6 | ||
| 675 | &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6 | ||
| 676 | &shl ("eax",16); # 6 | ||
| 677 | &or ("edx","eax"); # 6 | ||
| 678 | &movz ($acc,&HB("ebx")); # 13 | ||
| 679 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 | ||
| 680 | &shl ($acc,8); # 13 | ||
| 681 | &or ("ecx",$acc); # 13 | ||
| 682 | &movd ("mm4","ecx"); # t[2] collected | ||
| 683 | &and ("ebx",0xff); # 12 | ||
| 684 | &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12 | ||
| 685 | &or ("edx","ebx"); # 12 | ||
| 686 | &movd ("mm5","edx"); # t[3] collected | ||
| 687 | |||
| 688 | &punpckldq ("mm4","mm5"); # t[2,3] collected in mm4 | ||
| 689 | } | ||
| 690 | |||
| 691 | if (!$x86only) { # the MMX/SSE variant is only emitted when $x86only is false | ||
| 692 | &function_begin_B("_sse_AES_encrypt_compact"); # state in mm0/mm4, key schedule in $key, byte table in $tbl; result is left in mm0/mm4 | ||
| 693 | &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0 | ||
| 694 | &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8 | ||
| 695 | |||
| 696 | # note that caller is expected to allocate stack frame for me! | ||
| 697 | &mov ($acc,&DWP(240,$key)); # load key->rounds | ||
| 698 | &lea ($acc,&DWP(-2,$acc,$acc)); # acc = 2*rounds-2 | ||
| 699 | &lea ($acc,&DWP(0,$key,$acc,8)); # acc = key+8*acc = key+16*(rounds-1) | ||
| 700 | &mov ($__end,$acc); # end of key schedule | ||
| 701 | |||
| 702 | &mov ($s0,0x1b1b1b1b); # magic constant | ||
| 703 | &mov (&DWP(8,"esp"),$s0); # two copies on the stack form the 8-byte mask | ||
| 704 | &mov (&DWP(12,"esp"),$s0); # that the loop reads back with movq from 8(esp) | ||
| 705 | |||
| 706 | # prefetch Te4 | ||
| 707 | &mov ($s0,&DWP(0-128,$tbl)); # eight loads 32 bytes apart, covering offsets -128..+96 from $tbl | ||
| 708 | &mov ($s1,&DWP(32-128,$tbl)); | ||
| 709 | &mov ($s2,&DWP(64-128,$tbl)); | ||
| 710 | &mov ($s3,&DWP(96-128,$tbl)); | ||
| 711 | &mov ($s0,&DWP(128-128,$tbl)); | ||
| 712 | &mov ($s1,&DWP(160-128,$tbl)); | ||
| 713 | &mov ($s2,&DWP(192-128,$tbl)); | ||
| 714 | &mov ($s3,&DWP(224-128,$tbl)); | ||
| 715 | |||
| 716 | &set_label("loop",16); | ||
| 717 | &sse_enccompact(); # byte lookup and permutation; state stays in mm0/mm4 | ||
| 718 | &add ($key,16); # step to the next round key | ||
| 719 | &cmp ($key,$__end); | ||
| 720 | &ja (&label("out")); # past the end marker: last round, skip the mixing below | ||
| 721 | |||
| 722 | &movq ("mm2",&QWP(8,"esp")); # mm2 = 0x1b in every byte | ||
| 723 | &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); # mm3 = mm7 = 0 | ||
| 724 | &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0 | ||
| 725 | &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4"); # 0xff in every byte whose top bit is set (signed 0 > byte) | ||
| 726 | &pand ("mm3","mm2"); &pand ("mm7","mm2"); # 0x1b in those bytes | ||
| 727 | &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16) | ||
| 728 | &paddb ("mm0","mm0"); &paddb ("mm4","mm4"); # per-byte r0+r0 | ||
| 729 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2 | ||
| 730 | &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0 | ||
| 731 | &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2 | ||
| 732 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16) | ||
| 733 | |||
| 734 | &movq ("mm2","mm3"); &movq ("mm6","mm7"); | ||
| 735 | &pslld ("mm3",8); &pslld ("mm7",8); | ||
| 736 | &psrld ("mm2",24); &psrld ("mm6",24); | ||
| 737 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8 | ||
| 738 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24 | ||
| 739 | |||
| 740 | &movq ("mm3","mm1"); &movq ("mm7","mm5"); | ||
| 741 | &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key)); # fetch the next round key | ||
| 742 | &psrld ("mm1",8); &psrld ("mm5",8); | ||
| 743 | &mov ($s0,&DWP(0-128,$tbl)); # NOTE(review): result looks unused - presumably re-touches the table to keep it cached; confirm | ||
| 744 | &pslld ("mm3",24); &pslld ("mm7",24); | ||
| 745 | &mov ($s1,&DWP(64-128,$tbl)); | ||
| 746 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)>>8 | ||
| 747 | &mov ($s2,&DWP(128-128,$tbl)); | ||
| 748 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)<<24 | ||
| 749 | &mov ($s3,&DWP(192-128,$tbl)); | ||
| 750 | |||
| 751 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # xor in the round key fetched above | ||
| 752 | &jmp (&label("loop")); | ||
| 753 | |||
| 754 | &set_label("out",16); | ||
| 755 | &pxor ("mm0",&QWP(0,$key)); # xor in the last round key | ||
| 756 | &pxor ("mm4",&QWP(8,$key)); | ||
| 757 | |||
| 758 | &ret (); | ||
| 759 | &function_end_B("_sse_AES_encrypt_compact"); | ||
| 760 | } | ||
| 761 | |||
| 762 | ###################################################################### | ||
| 763 | # Vanilla block function. | ||
| 764 | ###################################################################### | ||
| 765 | |||
| 177 | sub encstep() | 766 | sub encstep() |
| 178 | { my ($i,$te,@s) = @_; | 767 | { my ($i,$te,@s) = @_; |
| 179 | my $tmp = $key; | 768 | my $tmp = $key; |
| 180 | my $out = $i==3?$s[0]:$acc; | 769 | my $out = $i==3?$s[0]:$acc; |
| 181 | 770 | ||
| 182 | # lines marked with #%e?x[i] denote "reordered" instructions... | 771 | # lines marked with #%e?x[i] denote "reordered" instructions... |
| 183 | if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx | 772 | if ($i==3) { &mov ($key,$__key); }##%edx |
| 184 | else { &mov ($out,$s[0]); | 773 | else { &mov ($out,$s[0]); |
| 185 | &and ($out,0xFF); } | 774 | &and ($out,0xFF); } |
| 186 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] | 775 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] |
| @@ -191,14 +780,14 @@ sub encstep() | |||
| 191 | &movz ($tmp,&HB($s[1])); | 780 | &movz ($tmp,&HB($s[1])); |
| 192 | &xor ($out,&DWP(3,$te,$tmp,8)); | 781 | &xor ($out,&DWP(3,$te,$tmp,8)); |
| 193 | 782 | ||
| 194 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx | 783 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx |
| 195 | else { &mov ($tmp,$s[2]); | 784 | else { &mov ($tmp,$s[2]); |
| 196 | &shr ($tmp,16); } | 785 | &shr ($tmp,16); } |
| 197 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] | 786 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] |
| 198 | &and ($tmp,0xFF); | 787 | &and ($tmp,0xFF); |
| 199 | &xor ($out,&DWP(2,$te,$tmp,8)); | 788 | &xor ($out,&DWP(2,$te,$tmp,8)); |
| 200 | 789 | ||
| 201 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx | 790 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx |
| 202 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] | 791 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] |
| 203 | else { &mov ($tmp,$s[3]); | 792 | else { &mov ($tmp,$s[3]); |
| 204 | &shr ($tmp,24) } | 793 | &shr ($tmp,24) } |
| @@ -213,7 +802,7 @@ sub enclast() | |||
| 213 | my $tmp = $key; | 802 | my $tmp = $key; |
| 214 | my $out = $i==3?$s[0]:$acc; | 803 | my $out = $i==3?$s[0]:$acc; |
| 215 | 804 | ||
| 216 | if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx | 805 | if ($i==3) { &mov ($key,$__key); }##%edx |
| 217 | else { &mov ($out,$s[0]); } | 806 | else { &mov ($out,$s[0]); } |
| 218 | &and ($out,0xFF); | 807 | &and ($out,0xFF); |
| 219 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] | 808 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] |
| @@ -227,8 +816,8 @@ sub enclast() | |||
| 227 | &and ($tmp,0x0000ff00); | 816 | &and ($tmp,0x0000ff00); |
| 228 | &xor ($out,$tmp); | 817 | &xor ($out,$tmp); |
| 229 | 818 | ||
| 230 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx | 819 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx |
| 231 | else { mov ($tmp,$s[2]); | 820 | else { &mov ($tmp,$s[2]); |
| 232 | &shr ($tmp,16); } | 821 | &shr ($tmp,16); } |
| 233 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] | 822 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] |
| 234 | &and ($tmp,0xFF); | 823 | &and ($tmp,0xFF); |
| @@ -236,7 +825,7 @@ sub enclast() | |||
| 236 | &and ($tmp,0x00ff0000); | 825 | &and ($tmp,0x00ff0000); |
| 237 | &xor ($out,$tmp); | 826 | &xor ($out,$tmp); |
| 238 | 827 | ||
| 239 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx | 828 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx |
| 240 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] | 829 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] |
| 241 | else { &mov ($tmp,$s[3]); | 830 | else { &mov ($tmp,$s[3]); |
| 242 | &shr ($tmp,24); } | 831 | &shr ($tmp,24); } |
| @@ -247,9 +836,6 @@ sub enclast() | |||
| 247 | if ($i==3) { &mov ($s[3],$acc); } | 836 | if ($i==3) { &mov ($s[3],$acc); } |
| 248 | } | 837 | } |
| 249 | 838 | ||
| 250 | sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | ||
| 251 | |||
| 252 | &public_label("AES_Te"); | ||
| 253 | &function_begin_B("_x86_AES_encrypt"); | 839 | &function_begin_B("_x86_AES_encrypt"); |
| 254 | if ($vertical_spin) { | 840 | if ($vertical_spin) { |
| 255 | # I need high parts of volatile registers to be accessible... | 841 | # I need high parts of volatile registers to be accessible... |
| @@ -258,7 +844,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 258 | } | 844 | } |
| 259 | 845 | ||
| 260 | # note that caller is expected to allocate stack frame for me! | 846 | # note that caller is expected to allocate stack frame for me! |
| 261 | &mov (&DWP(12,"esp"),$key); # save key | 847 | &mov ($__key,$key); # save key |
| 262 | 848 | ||
| 263 | &xor ($s0,&DWP(0,$key)); # xor with key | 849 | &xor ($s0,&DWP(0,$key)); # xor with key |
| 264 | &xor ($s1,&DWP(4,$key)); | 850 | &xor ($s1,&DWP(4,$key)); |
| @@ -270,24 +856,24 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 270 | if ($small_footprint) { | 856 | if ($small_footprint) { |
| 271 | &lea ($acc,&DWP(-2,$acc,$acc)); | 857 | &lea ($acc,&DWP(-2,$acc,$acc)); |
| 272 | &lea ($acc,&DWP(0,$key,$acc,8)); | 858 | &lea ($acc,&DWP(0,$key,$acc,8)); |
| 273 | &mov (&DWP(16,"esp"),$acc); # end of key schedule | 859 | &mov ($__end,$acc); # end of key schedule |
| 274 | &align (4); | 860 | |
| 275 | &set_label("loop"); | 861 | &set_label("loop",16); |
| 276 | if ($vertical_spin) { | 862 | if ($vertical_spin) { |
| 277 | &encvert("ebp",$s0,$s1,$s2,$s3); | 863 | &encvert($tbl,$s0,$s1,$s2,$s3); |
| 278 | } else { | 864 | } else { |
| 279 | &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 865 | &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
| 280 | &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 866 | &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
| 281 | &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 867 | &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
| 282 | &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 868 | &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
| 283 | } | 869 | } |
| 284 | &add ($key,16); # advance rd_key | 870 | &add ($key,16); # advance rd_key |
| 285 | &xor ($s0,&DWP(0,$key)); | 871 | &xor ($s0,&DWP(0,$key)); |
| 286 | &xor ($s1,&DWP(4,$key)); | 872 | &xor ($s1,&DWP(4,$key)); |
| 287 | &xor ($s2,&DWP(8,$key)); | 873 | &xor ($s2,&DWP(8,$key)); |
| 288 | &xor ($s3,&DWP(12,$key)); | 874 | &xor ($s3,&DWP(12,$key)); |
| 289 | &cmp ($key,&DWP(16,"esp")); | 875 | &cmp ($key,$__end); |
| 290 | &mov (&DWP(12,"esp"),$key); | 876 | &mov ($__key,$key); |
| 291 | &jb (&label("loop")); | 877 | &jb (&label("loop")); |
| 292 | } | 878 | } |
| 293 | else { | 879 | else { |
| @@ -296,15 +882,15 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 296 | &cmp ($acc,12); | 882 | &cmp ($acc,12); |
| 297 | &jle (&label("12rounds")); | 883 | &jle (&label("12rounds")); |
| 298 | 884 | ||
| 299 | &set_label("14rounds"); | 885 | &set_label("14rounds",4); |
| 300 | for ($i=1;$i<3;$i++) { | 886 | for ($i=1;$i<3;$i++) { |
| 301 | if ($vertical_spin) { | 887 | if ($vertical_spin) { |
| 302 | &encvert("ebp",$s0,$s1,$s2,$s3); | 888 | &encvert($tbl,$s0,$s1,$s2,$s3); |
| 303 | } else { | 889 | } else { |
| 304 | &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 890 | &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
| 305 | &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 891 | &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
| 306 | &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 892 | &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
| 307 | &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 893 | &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
| 308 | } | 894 | } |
| 309 | &xor ($s0,&DWP(16*$i+0,$key)); | 895 | &xor ($s0,&DWP(16*$i+0,$key)); |
| 310 | &xor ($s1,&DWP(16*$i+4,$key)); | 896 | &xor ($s1,&DWP(16*$i+4,$key)); |
| @@ -312,16 +898,16 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 312 | &xor ($s3,&DWP(16*$i+12,$key)); | 898 | &xor ($s3,&DWP(16*$i+12,$key)); |
| 313 | } | 899 | } |
| 314 | &add ($key,32); | 900 | &add ($key,32); |
| 315 | &mov (&DWP(12,"esp"),$key); # advance rd_key | 901 | &mov ($__key,$key); # advance rd_key |
| 316 | &set_label("12rounds"); | 902 | &set_label("12rounds",4); |
| 317 | for ($i=1;$i<3;$i++) { | 903 | for ($i=1;$i<3;$i++) { |
| 318 | if ($vertical_spin) { | 904 | if ($vertical_spin) { |
| 319 | &encvert("ebp",$s0,$s1,$s2,$s3); | 905 | &encvert($tbl,$s0,$s1,$s2,$s3); |
| 320 | } else { | 906 | } else { |
| 321 | &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 907 | &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
| 322 | &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 908 | &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
| 323 | &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 909 | &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
| 324 | &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 910 | &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
| 325 | } | 911 | } |
| 326 | &xor ($s0,&DWP(16*$i+0,$key)); | 912 | &xor ($s0,&DWP(16*$i+0,$key)); |
| 327 | &xor ($s1,&DWP(16*$i+4,$key)); | 913 | &xor ($s1,&DWP(16*$i+4,$key)); |
| @@ -329,16 +915,16 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 329 | &xor ($s3,&DWP(16*$i+12,$key)); | 915 | &xor ($s3,&DWP(16*$i+12,$key)); |
| 330 | } | 916 | } |
| 331 | &add ($key,32); | 917 | &add ($key,32); |
| 332 | &mov (&DWP(12,"esp"),$key); # advance rd_key | 918 | &mov ($__key,$key); # advance rd_key |
| 333 | &set_label("10rounds"); | 919 | &set_label("10rounds",4); |
| 334 | for ($i=1;$i<10;$i++) { | 920 | for ($i=1;$i<10;$i++) { |
| 335 | if ($vertical_spin) { | 921 | if ($vertical_spin) { |
| 336 | &encvert("ebp",$s0,$s1,$s2,$s3); | 922 | &encvert($tbl,$s0,$s1,$s2,$s3); |
| 337 | } else { | 923 | } else { |
| 338 | &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 924 | &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
| 339 | &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 925 | &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
| 340 | &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 926 | &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
| 341 | &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 927 | &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
| 342 | } | 928 | } |
| 343 | &xor ($s0,&DWP(16*$i+0,$key)); | 929 | &xor ($s0,&DWP(16*$i+0,$key)); |
| 344 | &xor ($s1,&DWP(16*$i+4,$key)); | 930 | &xor ($s1,&DWP(16*$i+4,$key)); |
| @@ -352,10 +938,10 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 352 | &mov ($s1="ebx",$key="edi"); | 938 | &mov ($s1="ebx",$key="edi"); |
| 353 | &mov ($s2="ecx",$acc="esi"); | 939 | &mov ($s2="ecx",$acc="esi"); |
| 354 | } | 940 | } |
| 355 | &enclast(0,"ebp",$s0,$s1,$s2,$s3); | 941 | &enclast(0,$tbl,$s0,$s1,$s2,$s3); |
| 356 | &enclast(1,"ebp",$s1,$s2,$s3,$s0); | 942 | &enclast(1,$tbl,$s1,$s2,$s3,$s0); |
| 357 | &enclast(2,"ebp",$s2,$s3,$s0,$s1); | 943 | &enclast(2,$tbl,$s2,$s3,$s0,$s1); |
| 358 | &enclast(3,"ebp",$s3,$s0,$s1,$s2); | 944 | &enclast(3,$tbl,$s3,$s0,$s1,$s2); |
| 359 | 945 | ||
| 360 | &add ($key,$small_footprint?16:160); | 946 | &add ($key,$small_footprint?16:160); |
| 361 | &xor ($s0,&DWP(0,$key)); | 947 | &xor ($s0,&DWP(0,$key)); |
| @@ -430,38 +1016,198 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 430 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); | 1016 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); |
| 431 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); | 1017 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); |
| 432 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); | 1018 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); |
| 1019 | |||
| 1020 | #Te4 # four copies of Te4 to choose from to avoid L1 aliasing | ||
| 1021 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
| 1022 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
| 1023 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
| 1024 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
| 1025 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
| 1026 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
| 1027 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
| 1028 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
| 1029 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
| 1030 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
| 1031 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
| 1032 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
| 1033 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
| 1034 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
| 1035 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
| 1036 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
| 1037 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
| 1038 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
| 1039 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
| 1040 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
| 1041 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
| 1042 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
| 1043 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
| 1044 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
| 1045 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
| 1046 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
| 1047 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
| 1048 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
| 1049 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
| 1050 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
| 1051 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
| 1052 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
| 1053 | |||
| 1054 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
| 1055 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
| 1056 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
| 1057 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
| 1058 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
| 1059 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
| 1060 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
| 1061 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
| 1062 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
| 1063 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
| 1064 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
| 1065 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
| 1066 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
| 1067 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
| 1068 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
| 1069 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
| 1070 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
| 1071 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
| 1072 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
| 1073 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
| 1074 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
| 1075 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
| 1076 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
| 1077 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
| 1078 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
| 1079 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
| 1080 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
| 1081 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
| 1082 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
| 1083 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
| 1084 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
| 1085 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
| 1086 | |||
| 1087 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
| 1088 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
| 1089 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
| 1090 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
| 1091 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
| 1092 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
| 1093 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
| 1094 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
| 1095 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
| 1096 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
| 1097 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
| 1098 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
| 1099 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
| 1100 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
| 1101 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
| 1102 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
| 1103 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
| 1104 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
| 1105 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
| 1106 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
| 1107 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
| 1108 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
| 1109 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
| 1110 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
| 1111 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
| 1112 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
| 1113 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
| 1114 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
| 1115 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
| 1116 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
| 1117 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
| 1118 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
| 1119 | |||
| 1120 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
| 1121 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
| 1122 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
| 1123 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
| 1124 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
| 1125 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
| 1126 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
| 1127 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
| 1128 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
| 1129 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
| 1130 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
| 1131 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
| 1132 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
| 1133 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
| 1134 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
| 1135 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
| 1136 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
| 1137 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
| 1138 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
| 1139 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
| 1140 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
| 1141 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
| 1142 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
| 1143 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
| 1144 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
| 1145 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
| 1146 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
| 1147 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
| 1148 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
| 1149 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
| 1150 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
| 1151 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
| 433 | #rcon: | 1152 | #rcon: |
| 434 | &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008); | 1153 | &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008); |
| 435 | &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080); | 1154 | &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080); |
| 436 | &data_word(0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0); | 1155 | &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000); |
| 1156 | &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000); | ||
| 437 | &function_end_B("_x86_AES_encrypt"); | 1157 | &function_end_B("_x86_AES_encrypt"); |
| 438 | 1158 | ||
| 439 | # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); | 1159 | # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); |
| 440 | &public_label("AES_Te"); | ||
| 441 | &function_begin("AES_encrypt"); | 1160 | &function_begin("AES_encrypt"); |
| 442 | &mov ($acc,&wparam(0)); # load inp | 1161 | &mov ($acc,&wparam(0)); # load inp |
| 443 | &mov ($key,&wparam(2)); # load key | 1162 | &mov ($key,&wparam(2)); # load key |
| 444 | 1163 | ||
| 445 | &mov ($s0,"esp"); | 1164 | &mov ($s0,"esp"); |
| 446 | &sub ("esp",24); | 1165 | &sub ("esp",36); |
| 447 | &and ("esp",-64); | 1166 | &and ("esp",-64); # align to cache-line |
| 448 | &add ("esp",4); | 1167 | |
| 449 | &mov (&DWP(16,"esp"),$s0); | 1168 | # place stack frame just "above" the key schedule |
| 1169 | &lea ($s1,&DWP(-64-63,$key)); | ||
| 1170 | &sub ($s1,"esp"); | ||
| 1171 | &neg ($s1); | ||
| 1172 | &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line | ||
| 1173 | &sub ("esp",$s1); | ||
| 1174 | &add ("esp",4); # 4 is reserved for caller's return address | ||
| 1175 | &mov ($_esp,$s0); # save stack pointer | ||
| 450 | 1176 | ||
| 451 | &call (&label("pic_point")); # make it PIC! | 1177 | &call (&label("pic_point")); # make it PIC! |
| 452 | &set_label("pic_point"); | 1178 | &set_label("pic_point"); |
| 453 | &blindpop("ebp"); | 1179 | &blindpop($tbl); |
| 454 | &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 1180 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only); |
| 455 | 1181 | &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); | |
| 1182 | |||
| 1183 | # pick Te4 copy which can't "overlap" with stack frame or key schedule | ||
| 1184 | &lea ($s1,&DWP(768-4,"esp")); | ||
| 1185 | &sub ($s1,$tbl); | ||
| 1186 | &and ($s1,0x300); | ||
| 1187 | &lea ($tbl,&DWP(2048+128,$tbl,$s1)); | ||
| 1188 | |||
| 1189 | if (!$x86only) { | ||
| 1190 | &bt (&DWP(0,$s0),25); # check for SSE bit | ||
| 1191 | &jnc (&label("x86")); | ||
| 1192 | |||
| 1193 | &movq ("mm0",&QWP(0,$acc)); | ||
| 1194 | &movq ("mm4",&QWP(8,$acc)); | ||
| 1195 | &call ("_sse_AES_encrypt_compact"); | ||
| 1196 | &mov ("esp",$_esp); # restore stack pointer | ||
| 1197 | &mov ($acc,&wparam(1)); # load out | ||
| 1198 | &movq (&QWP(0,$acc),"mm0"); # write output data | ||
| 1199 | &movq (&QWP(8,$acc),"mm4"); | ||
| 1200 | &emms (); | ||
| 1201 | &function_end_A(); | ||
| 1202 | } | ||
| 1203 | &set_label("x86",16); | ||
| 1204 | &mov ($_tbl,$tbl); | ||
| 456 | &mov ($s0,&DWP(0,$acc)); # load input data | 1205 | &mov ($s0,&DWP(0,$acc)); # load input data |
| 457 | &mov ($s1,&DWP(4,$acc)); | 1206 | &mov ($s1,&DWP(4,$acc)); |
| 458 | &mov ($s2,&DWP(8,$acc)); | 1207 | &mov ($s2,&DWP(8,$acc)); |
| 459 | &mov ($s3,&DWP(12,$acc)); | 1208 | &mov ($s3,&DWP(12,$acc)); |
| 460 | 1209 | &call ("_x86_AES_encrypt_compact"); | |
| 461 | &call ("_x86_AES_encrypt"); | 1210 | &mov ("esp",$_esp); # restore stack pointer |
| 462 | |||
| 463 | &mov ("esp",&DWP(16,"esp")); | ||
| 464 | |||
| 465 | &mov ($acc,&wparam(1)); # load out | 1211 | &mov ($acc,&wparam(1)); # load out |
| 466 | &mov (&DWP(0,$acc),$s0); # write output data | 1212 | &mov (&DWP(0,$acc),$s0); # write output data |
| 467 | &mov (&DWP(4,$acc),$s1); | 1213 | &mov (&DWP(4,$acc),$s1); |
| @@ -469,7 +1215,370 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 469 | &mov (&DWP(12,$acc),$s3); | 1215 | &mov (&DWP(12,$acc),$s3); |
| 470 | &function_end("AES_encrypt"); | 1216 | &function_end("AES_encrypt"); |
| 471 | 1217 | ||
| 472 | #------------------------------------------------------------------# | 1218 | #--------------------------------------------------------------------# |
| 1219 | |||
| 1220 | ###################################################################### | ||
| 1221 | # "Compact" block function | ||
| 1222 | ###################################################################### | ||
| 1223 | |||
| 1224 | sub deccompact() | ||
| 1225 | { my $Fn = mov; | ||
| 1226 | while ($#_>5) { pop(@_); $Fn=sub{}; } | ||
| 1227 | my ($i,$td,@s)=@_; | ||
| 1228 | my $tmp = $key; | ||
| 1229 | my $out = $i==3?$s[0]:$acc; | ||
| 1230 | |||
| 1231 | # $Fn is used in first compact round and its purpose is to | ||
| 1232 | # void restoration of some values from stack, so that after | ||
| 1233 | # 4xdeccompact with extra argument $key, $s0 and $s1 values | ||
| 1234 | # are left there... | ||
| 1235 | if($i==3) { &$Fn ($key,$__key); } | ||
| 1236 | else { &mov ($out,$s[0]); } | ||
| 1237 | &and ($out,0xFF); | ||
| 1238 | &movz ($out,&BP(-128,$td,$out,1)); | ||
| 1239 | |||
| 1240 | if ($i==3) { $tmp=$s[1]; } | ||
| 1241 | &movz ($tmp,&HB($s[1])); | ||
| 1242 | &movz ($tmp,&BP(-128,$td,$tmp,1)); | ||
| 1243 | &shl ($tmp,8); | ||
| 1244 | &xor ($out,$tmp); | ||
| 1245 | |||
| 1246 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); } | ||
| 1247 | else { mov ($tmp,$s[2]); } | ||
| 1248 | &shr ($tmp,16); | ||
| 1249 | &and ($tmp,0xFF); | ||
| 1250 | &movz ($tmp,&BP(-128,$td,$tmp,1)); | ||
| 1251 | &shl ($tmp,16); | ||
| 1252 | &xor ($out,$tmp); | ||
| 1253 | |||
| 1254 | if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); } | ||
| 1255 | else { &mov ($tmp,$s[3]); } | ||
| 1256 | &shr ($tmp,24); | ||
| 1257 | &movz ($tmp,&BP(-128,$td,$tmp,1)); | ||
| 1258 | &shl ($tmp,24); | ||
| 1259 | &xor ($out,$tmp); | ||
| 1260 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | ||
| 1261 | if ($i==3) { &$Fn ($s[3],$__s0); } | ||
| 1262 | } | ||
| 1263 | |||
| 1264 | # must be called with 2,3,0,1 as argument sequence!!! | ||
| 1265 | sub dectransform() | ||
| 1266 | { my @s = ($s0,$s1,$s2,$s3); | ||
| 1267 | my $i = shift; | ||
| 1268 | my $tmp = $key; | ||
| 1269 | my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1); | ||
| 1270 | my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1); | ||
| 1271 | my $tp8 = $tbl; | ||
| 1272 | |||
| 1273 | &mov ($acc,$s[$i]); | ||
| 1274 | &and ($acc,0x80808080); | ||
| 1275 | &mov ($tmp,$acc); | ||
| 1276 | &shr ($tmp,7); | ||
| 1277 | &lea ($tp2,&DWP(0,$s[$i],$s[$i])); | ||
| 1278 | &sub ($acc,$tmp); | ||
| 1279 | &and ($tp2,0xfefefefe); | ||
| 1280 | &and ($acc,0x1b1b1b1b); | ||
| 1281 | &xor ($acc,$tp2); | ||
| 1282 | &mov ($tp2,$acc); | ||
| 1283 | |||
| 1284 | &and ($acc,0x80808080); | ||
| 1285 | &mov ($tmp,$acc); | ||
| 1286 | &shr ($tmp,7); | ||
| 1287 | &lea ($tp4,&DWP(0,$tp2,$tp2)); | ||
| 1288 | &sub ($acc,$tmp); | ||
| 1289 | &and ($tp4,0xfefefefe); | ||
| 1290 | &and ($acc,0x1b1b1b1b); | ||
| 1291 | &xor ($tp2,$s[$i]); # tp2^tp1 | ||
| 1292 | &xor ($acc,$tp4); | ||
| 1293 | &mov ($tp4,$acc); | ||
| 1294 | |||
| 1295 | &and ($acc,0x80808080); | ||
| 1296 | &mov ($tmp,$acc); | ||
| 1297 | &shr ($tmp,7); | ||
| 1298 | &lea ($tp8,&DWP(0,$tp4,$tp4)); | ||
| 1299 | &sub ($acc,$tmp); | ||
| 1300 | &and ($tp8,0xfefefefe); | ||
| 1301 | &and ($acc,0x1b1b1b1b); | ||
| 1302 | &xor ($tp4,$s[$i]); # tp4^tp1 | ||
| 1303 | &rotl ($s[$i],8); # = ROTATE(tp1,8) | ||
| 1304 | &xor ($tp8,$acc); | ||
| 1305 | |||
| 1306 | &xor ($s[$i],$tp2); | ||
| 1307 | &xor ($tp2,$tp8); | ||
| 1308 | &rotl ($tp2,24); | ||
| 1309 | &xor ($s[$i],$tp4); | ||
| 1310 | &xor ($tp4,$tp8); | ||
| 1311 | &rotl ($tp4,16); | ||
| 1312 | &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) | ||
| 1313 | &rotl ($tp8,8); | ||
| 1314 | &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24) | ||
| 1315 | &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16) | ||
| 1316 | &mov ($s[0],$__s0) if($i==2); #prefetch $s0 | ||
| 1317 | &mov ($s[1],$__s1) if($i==3); #prefetch $s1 | ||
| 1318 | &mov ($s[2],$__s2) if($i==1); | ||
| 1319 | &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8) | ||
| 1320 | |||
| 1321 | &mov ($s[3],$__s3) if($i==1); | ||
| 1322 | &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2); | ||
| 1323 | } | ||
| 1324 | |||
| 1325 | &function_begin_B("_x86_AES_decrypt_compact"); | ||
| 1326 | # note that caller is expected to allocate stack frame for me! | ||
| 1327 | &mov ($__key,$key); # save key | ||
| 1328 | |||
| 1329 | &xor ($s0,&DWP(0,$key)); # xor with key | ||
| 1330 | &xor ($s1,&DWP(4,$key)); | ||
| 1331 | &xor ($s2,&DWP(8,$key)); | ||
| 1332 | &xor ($s3,&DWP(12,$key)); | ||
| 1333 | |||
| 1334 | &mov ($acc,&DWP(240,$key)); # load key->rounds | ||
| 1335 | |||
| 1336 | &lea ($acc,&DWP(-2,$acc,$acc)); | ||
| 1337 | &lea ($acc,&DWP(0,$key,$acc,8)); | ||
| 1338 | &mov ($__end,$acc); # end of key schedule | ||
| 1339 | |||
| 1340 | # prefetch Td4 | ||
| 1341 | &mov ($key,&DWP(0-128,$tbl)); | ||
| 1342 | &mov ($acc,&DWP(32-128,$tbl)); | ||
| 1343 | &mov ($key,&DWP(64-128,$tbl)); | ||
| 1344 | &mov ($acc,&DWP(96-128,$tbl)); | ||
| 1345 | &mov ($key,&DWP(128-128,$tbl)); | ||
| 1346 | &mov ($acc,&DWP(160-128,$tbl)); | ||
| 1347 | &mov ($key,&DWP(192-128,$tbl)); | ||
| 1348 | &mov ($acc,&DWP(224-128,$tbl)); | ||
| 1349 | |||
| 1350 | &set_label("loop",16); | ||
| 1351 | |||
| 1352 | &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1); | ||
| 1353 | &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1); | ||
| 1354 | &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1); | ||
| 1355 | &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1); | ||
| 1356 | &dectransform(2); | ||
| 1357 | &dectransform(3); | ||
| 1358 | &dectransform(0); | ||
| 1359 | &dectransform(1); | ||
| 1360 | &mov ($key,$__key); | ||
| 1361 | &mov ($tbl,$__tbl); | ||
| 1362 | &add ($key,16); # advance rd_key | ||
| 1363 | &xor ($s0,&DWP(0,$key)); | ||
| 1364 | &xor ($s1,&DWP(4,$key)); | ||
| 1365 | &xor ($s2,&DWP(8,$key)); | ||
| 1366 | &xor ($s3,&DWP(12,$key)); | ||
| 1367 | |||
| 1368 | &cmp ($key,$__end); | ||
| 1369 | &mov ($__key,$key); | ||
| 1370 | &jb (&label("loop")); | ||
| 1371 | |||
| 1372 | &deccompact(0,$tbl,$s0,$s3,$s2,$s1); | ||
| 1373 | &deccompact(1,$tbl,$s1,$s0,$s3,$s2); | ||
| 1374 | &deccompact(2,$tbl,$s2,$s1,$s0,$s3); | ||
| 1375 | &deccompact(3,$tbl,$s3,$s2,$s1,$s0); | ||
| 1376 | |||
| 1377 | &xor ($s0,&DWP(16,$key)); | ||
| 1378 | &xor ($s1,&DWP(20,$key)); | ||
| 1379 | &xor ($s2,&DWP(24,$key)); | ||
| 1380 | &xor ($s3,&DWP(28,$key)); | ||
| 1381 | |||
| 1382 | &ret (); | ||
| 1383 | &function_end_B("_x86_AES_decrypt_compact"); | ||
| 1384 | |||
| 1385 | ###################################################################### | ||
| 1386 | # "Compact" SSE block function. | ||
| 1387 | ###################################################################### | ||
| 1388 | |||
| 1389 | sub sse_deccompact() | ||
| 1390 | { | ||
| 1391 | &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0 | ||
| 1392 | &movd ("eax","mm1"); # 7, 6, 1, 0 | ||
| 1393 | |||
| 1394 | &pshufw ("mm5","mm4",0x09); # 13,12,11,10 | ||
| 1395 | &movz ($acc,&LB("eax")); # 0 | ||
| 1396 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 | ||
| 1397 | &movd ("ebx","mm5"); # 13,12,11,10 | ||
| 1398 | &movz ("edx",&HB("eax")); # 1 | ||
| 1399 | &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 | ||
| 1400 | &shl ("edx",8); # 1 | ||
| 1401 | |||
| 1402 | &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 | ||
| 1403 | &movz ($acc,&LB("ebx")); # 10 | ||
| 1404 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 | ||
| 1405 | &shl ($acc,16); # 10 | ||
| 1406 | &or ("ecx",$acc); # 10 | ||
| 1407 | &shr ("eax",16); # 7, 6 | ||
| 1408 | &movz ($acc,&HB("ebx")); # 11 | ||
| 1409 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 | ||
| 1410 | &shl ($acc,24); # 11 | ||
| 1411 | &or ("edx",$acc); # 11 | ||
| 1412 | &shr ("ebx",16); # 13,12 | ||
| 1413 | |||
| 1414 | &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 | ||
| 1415 | &movz ($acc,&HB("eax")); # 7 | ||
| 1416 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 | ||
| 1417 | &shl ($acc,24); # 7 | ||
| 1418 | &or ("ecx",$acc); # 7 | ||
| 1419 | &movz ($acc,&HB("ebx")); # 13 | ||
| 1420 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 | ||
| 1421 | &shl ($acc,8); # 13 | ||
| 1422 | &or ("ecx",$acc); # 13 | ||
| 1423 | &movd ("mm0","ecx"); # t[0] collected | ||
| 1424 | |||
| 1425 | &movz ($acc,&LB("eax")); # 6 | ||
| 1426 | &movd ("eax","mm2"); # 3, 2, 5, 4 | ||
| 1427 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6 | ||
| 1428 | &shl ("ecx",16); # 6 | ||
| 1429 | &movz ($acc,&LB("ebx")); # 12 | ||
| 1430 | &movd ("ebx","mm6"); # 9, 8,15,14 | ||
| 1431 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12 | ||
| 1432 | &or ("ecx",$acc); # 12 | ||
| 1433 | |||
| 1434 | &movz ($acc,&LB("eax")); # 4 | ||
| 1435 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4 | ||
| 1436 | &or ("edx",$acc); # 4 | ||
| 1437 | &movz ($acc,&LB("ebx")); # 14 | ||
| 1438 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 | ||
| 1439 | &shl ($acc,16); # 14 | ||
| 1440 | &or ("edx",$acc); # 14 | ||
| 1441 | &movd ("mm1","edx"); # t[1] collected | ||
| 1442 | |||
| 1443 | &movz ($acc,&HB("eax")); # 5 | ||
| 1444 | &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5 | ||
| 1445 | &shl ("edx",8); # 5 | ||
| 1446 | &movz ($acc,&HB("ebx")); # 15 | ||
| 1447 | &shr ("eax",16); # 3, 2 | ||
| 1448 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 | ||
| 1449 | &shl ($acc,24); # 15 | ||
| 1450 | &or ("edx",$acc); # 15 | ||
| 1451 | &shr ("ebx",16); # 9, 8 | ||
| 1452 | |||
| 1453 | &punpckldq ("mm0","mm1"); # t[0,1] collected | ||
| 1454 | |||
| 1455 | &movz ($acc,&HB("ebx")); # 9 | ||
| 1456 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 | ||
| 1457 | &shl ($acc,8); # 9 | ||
| 1458 | &or ("ecx",$acc); # 9 | ||
| 1459 | &and ("ebx",0xff); # 8 | ||
| 1460 | &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8 | ||
| 1461 | &or ("edx","ebx"); # 8 | ||
| 1462 | &movz ($acc,&LB("eax")); # 2 | ||
| 1463 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 | ||
| 1464 | &shl ($acc,16); # 2 | ||
| 1465 | &or ("edx",$acc); # 2 | ||
| 1466 | &movd ("mm4","edx"); # t[2] collected | ||
| 1467 | &movz ("eax",&HB("eax")); # 3 | ||
| 1468 | &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3 | ||
| 1469 | &shl ("eax",24); # 3 | ||
| 1470 | &or ("ecx","eax"); # 3 | ||
| 1471 | &movd ("mm5","ecx"); # t[3] collected | ||
| 1472 | |||
| 1473 | &punpckldq ("mm4","mm5"); # t[2,3] collected | ||
| 1474 | } | ||
| 1475 | |||
| 1476 | if (!$x86only) { | ||
| 1477 | &function_begin_B("_sse_AES_decrypt_compact"); | ||
| 1478 | &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0 | ||
| 1479 | &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8 | ||
| 1480 | |||
| 1481 | # note that caller is expected to allocate stack frame for me! | ||
| 1482 | &mov ($acc,&DWP(240,$key)); # load key->rounds | ||
| 1483 | &lea ($acc,&DWP(-2,$acc,$acc)); | ||
| 1484 | &lea ($acc,&DWP(0,$key,$acc,8)); | ||
| 1485 | &mov ($__end,$acc); # end of key schedule | ||
| 1486 | |||
| 1487 | &mov ($s0,0x1b1b1b1b); # magic constant | ||
| 1488 | &mov (&DWP(8,"esp"),$s0); | ||
| 1489 | &mov (&DWP(12,"esp"),$s0); | ||
| 1490 | |||
| 1491 | # prefetch Td4 | ||
| 1492 | &mov ($s0,&DWP(0-128,$tbl)); | ||
| 1493 | &mov ($s1,&DWP(32-128,$tbl)); | ||
| 1494 | &mov ($s2,&DWP(64-128,$tbl)); | ||
| 1495 | &mov ($s3,&DWP(96-128,$tbl)); | ||
| 1496 | &mov ($s0,&DWP(128-128,$tbl)); | ||
| 1497 | &mov ($s1,&DWP(160-128,$tbl)); | ||
| 1498 | &mov ($s2,&DWP(192-128,$tbl)); | ||
| 1499 | &mov ($s3,&DWP(224-128,$tbl)); | ||
| 1500 | |||
| 1501 | &set_label("loop",16); | ||
| 1502 | &sse_deccompact(); | ||
| 1503 | &add ($key,16); | ||
| 1504 | &cmp ($key,$__end); | ||
| 1505 | &ja (&label("out")); | ||
| 1506 | |||
| 1507 | # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N) | ||
| 1508 | &movq ("mm3","mm0"); &movq ("mm7","mm4"); | ||
| 1509 | &movq ("mm2","mm0",1); &movq ("mm6","mm4",1); | ||
| 1510 | &movq ("mm1","mm0"); &movq ("mm5","mm4"); | ||
| 1511 | &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16) | ||
| 1512 | &pslld ("mm2",8); &pslld ("mm6",8); | ||
| 1513 | &psrld ("mm3",8); &psrld ("mm7",8); | ||
| 1514 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8 | ||
| 1515 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8 | ||
| 1516 | &pslld ("mm2",16); &pslld ("mm6",16); | ||
| 1517 | &psrld ("mm3",16); &psrld ("mm7",16); | ||
| 1518 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24 | ||
| 1519 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24 | ||
| 1520 | |||
| 1521 | &movq ("mm3",&QWP(8,"esp")); | ||
| 1522 | &pxor ("mm2","mm2"); &pxor ("mm6","mm6"); | ||
| 1523 | &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5"); | ||
| 1524 | &pand ("mm2","mm3"); &pand ("mm6","mm3"); | ||
| 1525 | &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); | ||
| 1526 | &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2 | ||
| 1527 | &movq ("mm3","mm1"); &movq ("mm7","mm5"); | ||
| 1528 | &movq ("mm2","mm1"); &movq ("mm6","mm5"); | ||
| 1529 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2 | ||
| 1530 | &pslld ("mm3",24); &pslld ("mm7",24); | ||
| 1531 | &psrld ("mm2",8); &psrld ("mm6",8); | ||
| 1532 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24 | ||
| 1533 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8 | ||
| 1534 | |||
| 1535 | &movq ("mm2",&QWP(8,"esp")); | ||
| 1536 | &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); | ||
| 1537 | &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5"); | ||
| 1538 | &pand ("mm3","mm2"); &pand ("mm7","mm2"); | ||
| 1539 | &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); | ||
| 1540 | &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4 | ||
| 1541 | &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1); | ||
| 1542 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4 | ||
| 1543 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16) | ||
| 1544 | |||
| 1545 | &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); | ||
| 1546 | &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5"); | ||
| 1547 | &pand ("mm3","mm2"); &pand ("mm7","mm2"); | ||
| 1548 | &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); | ||
| 1549 | &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8 | ||
| 1550 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8 | ||
| 1551 | &movq ("mm3","mm1"); &movq ("mm7","mm5"); | ||
| 1552 | &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1); | ||
| 1553 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16) | ||
| 1554 | &pslld ("mm1",8); &pslld ("mm5",8); | ||
| 1555 | &psrld ("mm3",8); &psrld ("mm7",8); | ||
| 1556 | &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key)); | ||
| 1557 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8 | ||
| 1558 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8 | ||
| 1559 | &mov ($s0,&DWP(0-128,$tbl)); | ||
| 1560 | &pslld ("mm1",16); &pslld ("mm5",16); | ||
| 1561 | &mov ($s1,&DWP(64-128,$tbl)); | ||
| 1562 | &psrld ("mm3",16); &psrld ("mm7",16); | ||
| 1563 | &mov ($s2,&DWP(128-128,$tbl)); | ||
| 1564 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24 | ||
| 1565 | &mov ($s3,&DWP(192-128,$tbl)); | ||
| 1566 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24 | ||
| 1567 | |||
| 1568 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); | ||
| 1569 | &jmp (&label("loop")); | ||
| 1570 | |||
| 1571 | &set_label("out",16); | ||
| 1572 | &pxor ("mm0",&QWP(0,$key)); | ||
| 1573 | &pxor ("mm4",&QWP(8,$key)); | ||
| 1574 | |||
| 1575 | &ret (); | ||
| 1576 | &function_end_B("_sse_AES_decrypt_compact"); | ||
| 1577 | } | ||
| 1578 | |||
| 1579 | ###################################################################### | ||
| 1580 | # Vanilla block function. | ||
| 1581 | ###################################################################### | ||
| 473 | 1582 | ||
| 474 | sub decstep() | 1583 | sub decstep() |
| 475 | { my ($i,$td,@s) = @_; | 1584 | { my ($i,$td,@s) = @_; |
| @@ -480,7 +1589,7 @@ sub decstep() | |||
| 480 | # optimal... or rather that all attempts to reorder didn't | 1589 | # optimal... or rather that all attempts to reorder didn't |
| 481 | # result in better performance [which by the way is not a | 1590 | # result in better performance [which by the way is not a |
| 482 | # bit lower than ecryption]. | 1591 | # bit lower than ecryption]. |
| 483 | if($i==3) { &mov ($key,&DWP(12,"esp")); } | 1592 | if($i==3) { &mov ($key,$__key); } |
| 484 | else { &mov ($out,$s[0]); } | 1593 | else { &mov ($out,$s[0]); } |
| 485 | &and ($out,0xFF); | 1594 | &and ($out,0xFF); |
| 486 | &mov ($out,&DWP(0,$td,$out,8)); | 1595 | &mov ($out,&DWP(0,$td,$out,8)); |
| @@ -495,12 +1604,12 @@ sub decstep() | |||
| 495 | &and ($tmp,0xFF); | 1604 | &and ($tmp,0xFF); |
| 496 | &xor ($out,&DWP(2,$td,$tmp,8)); | 1605 | &xor ($out,&DWP(2,$td,$tmp,8)); |
| 497 | 1606 | ||
| 498 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); } | 1607 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); } |
| 499 | else { &mov ($tmp,$s[3]); } | 1608 | else { &mov ($tmp,$s[3]); } |
| 500 | &shr ($tmp,24); | 1609 | &shr ($tmp,24); |
| 501 | &xor ($out,&DWP(1,$td,$tmp,8)); | 1610 | &xor ($out,&DWP(1,$td,$tmp,8)); |
| 502 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | 1611 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } |
| 503 | if ($i==3) { &mov ($s[3],&DWP(4,"esp")); } | 1612 | if ($i==3) { &mov ($s[3],$__s0); } |
| 504 | &comment(); | 1613 | &comment(); |
| 505 | } | 1614 | } |
| 506 | 1615 | ||
| @@ -509,14 +1618,24 @@ sub declast() | |||
| 509 | my $tmp = $key; | 1618 | my $tmp = $key; |
| 510 | my $out = $i==3?$s[0]:$acc; | 1619 | my $out = $i==3?$s[0]:$acc; |
| 511 | 1620 | ||
| 512 | if($i==3) { &mov ($key,&DWP(12,"esp")); } | 1621 | if($i==0) { &lea ($td,&DWP(2048+128,$td)); |
| 1622 | &mov ($tmp,&DWP(0-128,$td)); | ||
| 1623 | &mov ($acc,&DWP(32-128,$td)); | ||
| 1624 | &mov ($tmp,&DWP(64-128,$td)); | ||
| 1625 | &mov ($acc,&DWP(96-128,$td)); | ||
| 1626 | &mov ($tmp,&DWP(128-128,$td)); | ||
| 1627 | &mov ($acc,&DWP(160-128,$td)); | ||
| 1628 | &mov ($tmp,&DWP(192-128,$td)); | ||
| 1629 | &mov ($acc,&DWP(224-128,$td)); | ||
| 1630 | &lea ($td,&DWP(-128,$td)); } | ||
| 1631 | if($i==3) { &mov ($key,$__key); } | ||
| 513 | else { &mov ($out,$s[0]); } | 1632 | else { &mov ($out,$s[0]); } |
| 514 | &and ($out,0xFF); | 1633 | &and ($out,0xFF); |
| 515 | &movz ($out,&BP(2048,$td,$out,1)); | 1634 | &movz ($out,&BP(0,$td,$out,1)); |
| 516 | 1635 | ||
| 517 | if ($i==3) { $tmp=$s[1]; } | 1636 | if ($i==3) { $tmp=$s[1]; } |
| 518 | &movz ($tmp,&HB($s[1])); | 1637 | &movz ($tmp,&HB($s[1])); |
| 519 | &movz ($tmp,&BP(2048,$td,$tmp,1)); | 1638 | &movz ($tmp,&BP(0,$td,$tmp,1)); |
| 520 | &shl ($tmp,8); | 1639 | &shl ($tmp,8); |
| 521 | &xor ($out,$tmp); | 1640 | &xor ($out,$tmp); |
| 522 | 1641 | ||
| @@ -524,24 +1643,24 @@ sub declast() | |||
| 524 | else { mov ($tmp,$s[2]); } | 1643 | else { mov ($tmp,$s[2]); } |
| 525 | &shr ($tmp,16); | 1644 | &shr ($tmp,16); |
| 526 | &and ($tmp,0xFF); | 1645 | &and ($tmp,0xFF); |
| 527 | &movz ($tmp,&BP(2048,$td,$tmp,1)); | 1646 | &movz ($tmp,&BP(0,$td,$tmp,1)); |
| 528 | &shl ($tmp,16); | 1647 | &shl ($tmp,16); |
| 529 | &xor ($out,$tmp); | 1648 | &xor ($out,$tmp); |
| 530 | 1649 | ||
| 531 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); } | 1650 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); } |
| 532 | else { &mov ($tmp,$s[3]); } | 1651 | else { &mov ($tmp,$s[3]); } |
| 533 | &shr ($tmp,24); | 1652 | &shr ($tmp,24); |
| 534 | &movz ($tmp,&BP(2048,$td,$tmp,1)); | 1653 | &movz ($tmp,&BP(0,$td,$tmp,1)); |
| 535 | &shl ($tmp,24); | 1654 | &shl ($tmp,24); |
| 536 | &xor ($out,$tmp); | 1655 | &xor ($out,$tmp); |
| 537 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | 1656 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } |
| 538 | if ($i==3) { &mov ($s[3],&DWP(4,"esp")); } | 1657 | if ($i==3) { &mov ($s[3],$__s0); |
| 1658 | &lea ($td,&DWP(-2048,$td)); } | ||
| 539 | } | 1659 | } |
| 540 | 1660 | ||
| 541 | &public_label("AES_Td"); | ||
| 542 | &function_begin_B("_x86_AES_decrypt"); | 1661 | &function_begin_B("_x86_AES_decrypt"); |
| 543 | # note that caller is expected to allocate stack frame for me! | 1662 | # note that caller is expected to allocate stack frame for me! |
| 544 | &mov (&DWP(12,"esp"),$key); # save key | 1663 | &mov ($__key,$key); # save key |
| 545 | 1664 | ||
| 546 | &xor ($s0,&DWP(0,$key)); # xor with key | 1665 | &xor ($s0,&DWP(0,$key)); # xor with key |
| 547 | &xor ($s1,&DWP(4,$key)); | 1666 | &xor ($s1,&DWP(4,$key)); |
| @@ -553,20 +1672,19 @@ sub declast() | |||
| 553 | if ($small_footprint) { | 1672 | if ($small_footprint) { |
| 554 | &lea ($acc,&DWP(-2,$acc,$acc)); | 1673 | &lea ($acc,&DWP(-2,$acc,$acc)); |
| 555 | &lea ($acc,&DWP(0,$key,$acc,8)); | 1674 | &lea ($acc,&DWP(0,$key,$acc,8)); |
| 556 | &mov (&DWP(16,"esp"),$acc); # end of key schedule | 1675 | &mov ($__end,$acc); # end of key schedule |
| 557 | &align (4); | 1676 | &set_label("loop",16); |
| 558 | &set_label("loop"); | 1677 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
| 559 | &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1678 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
| 560 | &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1679 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
| 561 | &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1680 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
| 562 | &decstep(3,"ebp",$s3,$s2,$s1,$s0); | ||
| 563 | &add ($key,16); # advance rd_key | 1681 | &add ($key,16); # advance rd_key |
| 564 | &xor ($s0,&DWP(0,$key)); | 1682 | &xor ($s0,&DWP(0,$key)); |
| 565 | &xor ($s1,&DWP(4,$key)); | 1683 | &xor ($s1,&DWP(4,$key)); |
| 566 | &xor ($s2,&DWP(8,$key)); | 1684 | &xor ($s2,&DWP(8,$key)); |
| 567 | &xor ($s3,&DWP(12,$key)); | 1685 | &xor ($s3,&DWP(12,$key)); |
| 568 | &cmp ($key,&DWP(16,"esp")); | 1686 | &cmp ($key,$__end); |
| 569 | &mov (&DWP(12,"esp"),$key); | 1687 | &mov ($__key,$key); |
| 570 | &jb (&label("loop")); | 1688 | &jb (&label("loop")); |
| 571 | } | 1689 | } |
| 572 | else { | 1690 | else { |
| @@ -575,38 +1693,38 @@ sub declast() | |||
| 575 | &cmp ($acc,12); | 1693 | &cmp ($acc,12); |
| 576 | &jle (&label("12rounds")); | 1694 | &jle (&label("12rounds")); |
| 577 | 1695 | ||
| 578 | &set_label("14rounds"); | 1696 | &set_label("14rounds",4); |
| 579 | for ($i=1;$i<3;$i++) { | 1697 | for ($i=1;$i<3;$i++) { |
| 580 | &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1698 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
| 581 | &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1699 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
| 582 | &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1700 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
| 583 | &decstep(3,"ebp",$s3,$s2,$s1,$s0); | 1701 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
| 584 | &xor ($s0,&DWP(16*$i+0,$key)); | 1702 | &xor ($s0,&DWP(16*$i+0,$key)); |
| 585 | &xor ($s1,&DWP(16*$i+4,$key)); | 1703 | &xor ($s1,&DWP(16*$i+4,$key)); |
| 586 | &xor ($s2,&DWP(16*$i+8,$key)); | 1704 | &xor ($s2,&DWP(16*$i+8,$key)); |
| 587 | &xor ($s3,&DWP(16*$i+12,$key)); | 1705 | &xor ($s3,&DWP(16*$i+12,$key)); |
| 588 | } | 1706 | } |
| 589 | &add ($key,32); | 1707 | &add ($key,32); |
| 590 | &mov (&DWP(12,"esp"),$key); # advance rd_key | 1708 | &mov ($__key,$key); # advance rd_key |
| 591 | &set_label("12rounds"); | 1709 | &set_label("12rounds",4); |
| 592 | for ($i=1;$i<3;$i++) { | 1710 | for ($i=1;$i<3;$i++) { |
| 593 | &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1711 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
| 594 | &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1712 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
| 595 | &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1713 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
| 596 | &decstep(3,"ebp",$s3,$s2,$s1,$s0); | 1714 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
| 597 | &xor ($s0,&DWP(16*$i+0,$key)); | 1715 | &xor ($s0,&DWP(16*$i+0,$key)); |
| 598 | &xor ($s1,&DWP(16*$i+4,$key)); | 1716 | &xor ($s1,&DWP(16*$i+4,$key)); |
| 599 | &xor ($s2,&DWP(16*$i+8,$key)); | 1717 | &xor ($s2,&DWP(16*$i+8,$key)); |
| 600 | &xor ($s3,&DWP(16*$i+12,$key)); | 1718 | &xor ($s3,&DWP(16*$i+12,$key)); |
| 601 | } | 1719 | } |
| 602 | &add ($key,32); | 1720 | &add ($key,32); |
| 603 | &mov (&DWP(12,"esp"),$key); # advance rd_key | 1721 | &mov ($__key,$key); # advance rd_key |
| 604 | &set_label("10rounds"); | 1722 | &set_label("10rounds",4); |
| 605 | for ($i=1;$i<10;$i++) { | 1723 | for ($i=1;$i<10;$i++) { |
| 606 | &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1724 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
| 607 | &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1725 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
| 608 | &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1726 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
| 609 | &decstep(3,"ebp",$s3,$s2,$s1,$s0); | 1727 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
| 610 | &xor ($s0,&DWP(16*$i+0,$key)); | 1728 | &xor ($s0,&DWP(16*$i+0,$key)); |
| 611 | &xor ($s1,&DWP(16*$i+4,$key)); | 1729 | &xor ($s1,&DWP(16*$i+4,$key)); |
| 612 | &xor ($s2,&DWP(16*$i+8,$key)); | 1730 | &xor ($s2,&DWP(16*$i+8,$key)); |
| @@ -614,10 +1732,10 @@ sub declast() | |||
| 614 | } | 1732 | } |
| 615 | } | 1733 | } |
| 616 | 1734 | ||
| 617 | &declast(0,"ebp",$s0,$s3,$s2,$s1); | 1735 | &declast(0,$tbl,$s0,$s3,$s2,$s1); |
| 618 | &declast(1,"ebp",$s1,$s0,$s3,$s2); | 1736 | &declast(1,$tbl,$s1,$s0,$s3,$s2); |
| 619 | &declast(2,"ebp",$s2,$s1,$s0,$s3); | 1737 | &declast(2,$tbl,$s2,$s1,$s0,$s3); |
| 620 | &declast(3,"ebp",$s3,$s2,$s1,$s0); | 1738 | &declast(3,$tbl,$s3,$s2,$s1,$s0); |
| 621 | 1739 | ||
| 622 | &add ($key,$small_footprint?16:160); | 1740 | &add ($key,$small_footprint?16:160); |
| 623 | &xor ($s0,&DWP(0,$key)); | 1741 | &xor ($s0,&DWP(0,$key)); |
| @@ -692,7 +1810,107 @@ sub declast() | |||
| 692 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); | 1810 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); |
| 693 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); | 1811 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); |
| 694 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); | 1812 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); |
| 695 | #Td4: | 1813 | |
| 1814 | #Td4: # four copies of Td4 to choose from to avoid L1 aliasing | ||
| 1815 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
| 1816 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
| 1817 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
| 1818 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
| 1819 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
| 1820 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
| 1821 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
| 1822 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
| 1823 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
| 1824 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
| 1825 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
| 1826 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
| 1827 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
| 1828 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
| 1829 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
| 1830 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
| 1831 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
| 1832 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
| 1833 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
| 1834 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
| 1835 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
| 1836 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
| 1837 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
| 1838 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
| 1839 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
| 1840 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
| 1841 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
| 1842 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
| 1843 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
| 1844 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
| 1845 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
| 1846 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
| 1847 | |||
| 1848 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
| 1849 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
| 1850 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
| 1851 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
| 1852 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
| 1853 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
| 1854 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
| 1855 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
| 1856 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
| 1857 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
| 1858 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
| 1859 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
| 1860 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
| 1861 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
| 1862 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
| 1863 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
| 1864 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
| 1865 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
| 1866 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
| 1867 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
| 1868 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
| 1869 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
| 1870 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
| 1871 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
| 1872 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
| 1873 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
| 1874 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
| 1875 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
| 1876 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
| 1877 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
| 1878 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
| 1879 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
| 1880 | |||
| 1881 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
| 1882 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
| 1883 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
| 1884 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
| 1885 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
| 1886 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
| 1887 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
| 1888 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
| 1889 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
| 1890 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
| 1891 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
| 1892 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
| 1893 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
| 1894 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
| 1895 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
| 1896 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
| 1897 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
| 1898 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
| 1899 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
| 1900 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
| 1901 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
| 1902 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
| 1903 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
| 1904 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
| 1905 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
| 1906 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
| 1907 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
| 1908 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
| 1909 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
| 1910 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
| 1911 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
| 1912 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
| 1913 | |||
| 696 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | 1914 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
| 697 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | 1915 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
| 698 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | 1916 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
| @@ -728,43 +1946,57 @@ sub declast() | |||
| 728 | &function_end_B("_x86_AES_decrypt"); | 1946 | &function_end_B("_x86_AES_decrypt"); |
| 729 | 1947 | ||
| 730 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); | 1948 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); |
| 731 | &public_label("AES_Td"); | ||
| 732 | &function_begin("AES_decrypt"); | 1949 | &function_begin("AES_decrypt"); |
| 733 | &mov ($acc,&wparam(0)); # load inp | 1950 | &mov ($acc,&wparam(0)); # load inp |
| 734 | &mov ($key,&wparam(2)); # load key | 1951 | &mov ($key,&wparam(2)); # load key |
| 735 | 1952 | ||
| 736 | &mov ($s0,"esp"); | 1953 | &mov ($s0,"esp"); |
| 737 | &sub ("esp",24); | 1954 | &sub ("esp",36); |
| 738 | &and ("esp",-64); | 1955 | &and ("esp",-64); # align to cache-line |
| 739 | &add ("esp",4); | 1956 | |
| 740 | &mov (&DWP(16,"esp"),$s0); | 1957 | # place stack frame just "above" the key schedule |
| 1958 | &lea ($s1,&DWP(-64-63,$key)); | ||
| 1959 | &sub ($s1,"esp"); | ||
| 1960 | &neg ($s1); | ||
| 1961 | &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line | ||
| 1962 | &sub ("esp",$s1); | ||
| 1963 | &add ("esp",4); # 4 is reserved for caller's return address | ||
| 1964 | &mov ($_esp,$s0); # save stack pointer | ||
| 741 | 1965 | ||
| 742 | &call (&label("pic_point")); # make it PIC! | 1966 | &call (&label("pic_point")); # make it PIC! |
| 743 | &set_label("pic_point"); | 1967 | &set_label("pic_point"); |
| 744 | &blindpop("ebp"); | 1968 | &blindpop($tbl); |
| 745 | &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); | 1969 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only); |
| 746 | 1970 | &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl)); | |
| 747 | # prefetch Td4 | 1971 | |
| 748 | &lea ("ebp",&DWP(2048+128,"ebp")); | 1972 | # pick Td4 copy which can't "overlap" with stack frame or key schedule |
| 749 | &mov ($s0,&DWP(0-128,"ebp")); | 1973 | &lea ($s1,&DWP(768-4,"esp")); |
| 750 | &mov ($s1,&DWP(32-128,"ebp")); | 1974 | &sub ($s1,$tbl); |
| 751 | &mov ($s2,&DWP(64-128,"ebp")); | 1975 | &and ($s1,0x300); |
| 752 | &mov ($s3,&DWP(96-128,"ebp")); | 1976 | &lea ($tbl,&DWP(2048+128,$tbl,$s1)); |
| 753 | &mov ($s0,&DWP(128-128,"ebp")); | 1977 | |
| 754 | &mov ($s1,&DWP(160-128,"ebp")); | 1978 | if (!$x86only) { |
| 755 | &mov ($s2,&DWP(192-128,"ebp")); | 1979 | &bt (&DWP(0,$s0),25); # check for SSE bit |
| 756 | &mov ($s3,&DWP(224-128,"ebp")); | 1980 | &jnc (&label("x86")); |
| 757 | &lea ("ebp",&DWP(-2048-128,"ebp")); | 1981 | |
| 758 | 1982 | &movq ("mm0",&QWP(0,$acc)); | |
| 1983 | &movq ("mm4",&QWP(8,$acc)); | ||
| 1984 | &call ("_sse_AES_decrypt_compact"); | ||
| 1985 | &mov ("esp",$_esp); # restore stack pointer | ||
| 1986 | &mov ($acc,&wparam(1)); # load out | ||
| 1987 | &movq (&QWP(0,$acc),"mm0"); # write output data | ||
| 1988 | &movq (&QWP(8,$acc),"mm4"); | ||
| 1989 | &emms (); | ||
| 1990 | &function_end_A(); | ||
| 1991 | } | ||
| 1992 | &set_label("x86",16); | ||
| 1993 | &mov ($_tbl,$tbl); | ||
| 759 | &mov ($s0,&DWP(0,$acc)); # load input data | 1994 | &mov ($s0,&DWP(0,$acc)); # load input data |
| 760 | &mov ($s1,&DWP(4,$acc)); | 1995 | &mov ($s1,&DWP(4,$acc)); |
| 761 | &mov ($s2,&DWP(8,$acc)); | 1996 | &mov ($s2,&DWP(8,$acc)); |
| 762 | &mov ($s3,&DWP(12,$acc)); | 1997 | &mov ($s3,&DWP(12,$acc)); |
| 763 | 1998 | &call ("_x86_AES_decrypt_compact"); | |
| 764 | &call ("_x86_AES_decrypt"); | 1999 | &mov ("esp",$_esp); # restore stack pointer |
| 765 | |||
| 766 | &mov ("esp",&DWP(16,"esp")); | ||
| 767 | |||
| 768 | &mov ($acc,&wparam(1)); # load out | 2000 | &mov ($acc,&wparam(1)); # load out |
| 769 | &mov (&DWP(0,$acc),$s0); # write output data | 2001 | &mov (&DWP(0,$acc),$s0); # write output data |
| 770 | &mov (&DWP(4,$acc),$s1); | 2002 | &mov (&DWP(4,$acc),$s1); |
| @@ -777,126 +2009,136 @@ sub declast() | |||
| 777 | # unsigned char *ivp,const int enc); | 2009 | # unsigned char *ivp,const int enc); |
| 778 | { | 2010 | { |
| 779 | # stack frame layout | 2011 | # stack frame layout |
| 780 | # -4(%esp) 0(%esp) return address | 2012 | # -4(%esp) # return address 0(%esp) |
| 781 | # 0(%esp) 4(%esp) tmp1 | 2013 | # 0(%esp) # s0 backing store 4(%esp) |
| 782 | # 4(%esp) 8(%esp) tmp2 | 2014 | # 4(%esp) # s1 backing store 8(%esp) |
| 783 | # 8(%esp) 12(%esp) key | 2015 | # 8(%esp) # s2 backing store 12(%esp) |
| 784 | # 12(%esp) 16(%esp) end of key schedule | 2016 | # 12(%esp) # s3 backing store 16(%esp) |
| 785 | my $_esp=&DWP(16,"esp"); #saved %esp | 2017 | # 16(%esp) # key backup 20(%esp) |
| 786 | my $_inp=&DWP(20,"esp"); #copy of wparam(0) | 2018 | # 20(%esp) # end of key schedule 24(%esp) |
| 787 | my $_out=&DWP(24,"esp"); #copy of wparam(1) | 2019 | # 24(%esp) # %ebp backup 28(%esp) |
| 788 | my $_len=&DWP(28,"esp"); #copy of wparam(2) | 2020 | # 28(%esp) # %esp backup |
| 789 | my $_key=&DWP(32,"esp"); #copy of wparam(3) | 2021 | my $_inp=&DWP(32,"esp"); # copy of wparam(0) |
| 790 | my $_ivp=&DWP(36,"esp"); #copy of wparam(4) | 2022 | my $_out=&DWP(36,"esp"); # copy of wparam(1) |
| 791 | my $_tmp=&DWP(40,"esp"); #volatile variable | 2023 | my $_len=&DWP(40,"esp"); # copy of wparam(2) |
| 792 | my $ivec=&DWP(44,"esp"); #ivec[16] | 2024 | my $_key=&DWP(44,"esp"); # copy of wparam(3) |
| 793 | my $aes_key=&DWP(60,"esp"); #copy of aes_key | 2025 | my $_ivp=&DWP(48,"esp"); # copy of wparam(4) |
| 794 | my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | 2026 | my $_tmp=&DWP(52,"esp"); # volatile variable |
| 795 | 2027 | # | |
| 796 | &public_label("AES_Te"); | 2028 | my $ivec=&DWP(60,"esp"); # ivec[16] |
| 797 | &public_label("AES_Td"); | 2029 | my $aes_key=&DWP(76,"esp"); # copy of aes_key |
| 2030 | my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds | ||
| 2031 | |||
| 798 | &function_begin("AES_cbc_encrypt"); | 2032 | &function_begin("AES_cbc_encrypt"); |
| 799 | &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len | 2033 | &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len |
| 800 | &cmp ($s2,0); | 2034 | &cmp ($s2,0); |
| 801 | &je (&label("enc_out")); | 2035 | &je (&label("drop_out")); |
| 802 | 2036 | ||
| 803 | &call (&label("pic_point")); # make it PIC! | 2037 | &call (&label("pic_point")); # make it PIC! |
| 804 | &set_label("pic_point"); | 2038 | &set_label("pic_point"); |
| 805 | &blindpop("ebp"); | 2039 | &blindpop($tbl); |
| 806 | 2040 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only); | |
| 807 | &pushf (); | ||
| 808 | &cld (); | ||
| 809 | 2041 | ||
| 810 | &cmp (&wparam(5),0); | 2042 | &cmp (&wparam(5),0); |
| 811 | &je (&label("DECRYPT")); | 2043 | &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); |
| 812 | 2044 | &jne (&label("picked_te")); | |
| 813 | &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 2045 | &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl)); |
| 2046 | &set_label("picked_te"); | ||
| 814 | 2047 | ||
| 815 | # allocate aligned stack frame... | 2048 | # one can argue if this is required |
| 816 | &lea ($key,&DWP(-64-244,"esp")); | 2049 | &pushf (); |
| 817 | &and ($key,-64); | 2050 | &cld (); |
| 818 | 2051 | ||
| 819 | # ... and make sure it doesn't alias with AES_Te modulo 4096 | 2052 | &cmp ($s2,$speed_limit); |
| 820 | &mov ($s0,"ebp"); | 2053 | &jb (&label("slow_way")); |
| 821 | &lea ($s1,&DWP(2048,"ebp")); | 2054 | &test ($s2,15); |
| 822 | &mov ($s3,$key); | 2055 | &jnz (&label("slow_way")); |
| 2056 | if (!$x86only) { | ||
| 2057 | &bt (&DWP(0,$s0),28); # check for hyper-threading bit | ||
| 2058 | &jc (&label("slow_way")); | ||
| 2059 | } | ||
| 2060 | # pre-allocate aligned stack frame... | ||
| 2061 | &lea ($acc,&DWP(-80-244,"esp")); | ||
| 2062 | &and ($acc,-64); | ||
| 2063 | |||
| 2064 | # ... and make sure it doesn't alias with $tbl modulo 4096 | ||
| 2065 | &mov ($s0,$tbl); | ||
| 2066 | &lea ($s1,&DWP(2048+256,$tbl)); | ||
| 2067 | &mov ($s3,$acc); | ||
| 823 | &and ($s0,0xfff); # s = %ebp&0xfff | 2068 | &and ($s0,0xfff); # s = %ebp&0xfff |
| 824 | &and ($s1,0xfff); # e = (%ebp+2048)&0xfff | 2069 | &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff |
| 825 | &and ($s3,0xfff); # p = %esp&0xfff | 2070 | &and ($s3,0xfff); # p = %esp&0xfff |
| 826 | 2071 | ||
| 827 | &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e); | 2072 | &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e); |
| 828 | &jb (&label("te_break_out")); | 2073 | &jb (&label("tbl_break_out")); |
| 829 | &sub ($s3,$s1); | 2074 | &sub ($s3,$s1); |
| 830 | &sub ($key,$s3); | 2075 | &sub ($acc,$s3); |
| 831 | &jmp (&label("te_ok")); | 2076 | &jmp (&label("tbl_ok")); |
| 832 | &set_label("te_break_out"); # else %esp -= (p-s)&0xfff + framesz; | 2077 | &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz; |
| 833 | &sub ($s3,$s0); | 2078 | &sub ($s3,$s0); |
| 834 | &and ($s3,0xfff); | 2079 | &and ($s3,0xfff); |
| 835 | &add ($s3,64+256); | 2080 | &add ($s3,384); |
| 836 | &sub ($key,$s3); | 2081 | &sub ($acc,$s3); |
| 837 | &align (4); | 2082 | &set_label("tbl_ok",4); |
| 838 | &set_label("te_ok"); | ||
| 839 | |||
| 840 | &mov ($s0,&wparam(0)); # load inp | ||
| 841 | &mov ($s1,&wparam(1)); # load out | ||
| 842 | &mov ($s3,&wparam(3)); # load key | ||
| 843 | &mov ($acc,&wparam(4)); # load ivp | ||
| 844 | 2083 | ||
| 845 | &exch ("esp",$key); | 2084 | &lea ($s3,&wparam(0)); # obtain pointer to parameter block |
| 2085 | &exch ("esp",$acc); # allocate stack frame | ||
| 846 | &add ("esp",4); # reserve for return address! | 2086 | &add ("esp",4); # reserve for return address! |
| 847 | &mov ($_esp,$key); # save %esp | 2087 | &mov ($_tbl,$tbl); # save %ebp |
| 2088 | &mov ($_esp,$acc); # save %esp | ||
| 2089 | |||
| 2090 | &mov ($s0,&DWP(0,$s3)); # load inp | ||
| 2091 | &mov ($s1,&DWP(4,$s3)); # load out | ||
| 2092 | #&mov ($s2,&DWP(8,$s3)); # load len | ||
| 2093 | &mov ($key,&DWP(12,$s3)); # load key | ||
| 2094 | &mov ($acc,&DWP(16,$s3)); # load ivp | ||
| 2095 | &mov ($s3,&DWP(20,$s3)); # load enc flag | ||
| 848 | 2096 | ||
| 849 | &mov ($_inp,$s0); # save copy of inp | 2097 | &mov ($_inp,$s0); # save copy of inp |
| 850 | &mov ($_out,$s1); # save copy of out | 2098 | &mov ($_out,$s1); # save copy of out |
| 851 | &mov ($_len,$s2); # save copy of len | 2099 | &mov ($_len,$s2); # save copy of len |
| 852 | &mov ($_key,$s3); # save copy of key | 2100 | &mov ($_key,$key); # save copy of key |
| 853 | &mov ($_ivp,$acc); # save copy of ivp | 2101 | &mov ($_ivp,$acc); # save copy of ivp |
| 854 | 2102 | ||
| 855 | &mov ($mark,0); # copy of aes_key->rounds = 0; | 2103 | &mov ($mark,0); # copy of aes_key->rounds = 0; |
| 856 | if ($compromise) { | ||
| 857 | &cmp ($s2,$compromise); | ||
| 858 | &jb (&label("skip_ecopy")); | ||
| 859 | } | ||
| 860 | # do we copy key schedule to stack? | 2104 | # do we copy key schedule to stack? |
| 861 | &mov ($s1 eq "ebx" ? $s1 : "",$s3); | 2105 | &mov ($s1 eq "ebx" ? $s1 : "",$key); |
| 862 | &mov ($s2 eq "ecx" ? $s2 : "",244/4); | 2106 | &mov ($s2 eq "ecx" ? $s2 : "",244/4); |
| 863 | &sub ($s1,"ebp"); | 2107 | &sub ($s1,$tbl); |
| 864 | &mov ("esi",$s3); | 2108 | &mov ("esi",$key); |
| 865 | &and ($s1,0xfff); | 2109 | &and ($s1,0xfff); |
| 866 | &lea ("edi",$aes_key); | 2110 | &lea ("edi",$aes_key); |
| 867 | &cmp ($s1,2048); | 2111 | &cmp ($s1,2048+256); |
| 868 | &jb (&label("do_ecopy")); | 2112 | &jb (&label("do_copy")); |
| 869 | &cmp ($s1,4096-244); | 2113 | &cmp ($s1,4096-244); |
| 870 | &jb (&label("skip_ecopy")); | 2114 | &jb (&label("skip_copy")); |
| 871 | &align (4); | 2115 | &set_label("do_copy",4); |
| 872 | &set_label("do_ecopy"); | ||
| 873 | &mov ($_key,"edi"); | 2116 | &mov ($_key,"edi"); |
| 874 | &data_word(0xA5F3F689); # rep movsd | 2117 | &data_word(0xA5F3F689); # rep movsd |
| 875 | &set_label("skip_ecopy"); | 2118 | &set_label("skip_copy"); |
| 876 | 2119 | ||
| 877 | &mov ($acc,$s0); | ||
| 878 | &mov ($key,16); | 2120 | &mov ($key,16); |
| 879 | &align (4); | 2121 | &set_label("prefetch_tbl",4); |
| 880 | &set_label("prefetch_te"); | 2122 | &mov ($s0,&DWP(0,$tbl)); |
| 881 | &mov ($s0,&DWP(0,"ebp")); | 2123 | &mov ($s1,&DWP(32,$tbl)); |
| 882 | &mov ($s1,&DWP(32,"ebp")); | 2124 | &mov ($s2,&DWP(64,$tbl)); |
| 883 | &mov ($s2,&DWP(64,"ebp")); | 2125 | &mov ($acc,&DWP(96,$tbl)); |
| 884 | &mov ($s3,&DWP(96,"ebp")); | 2126 | &lea ($tbl,&DWP(128,$tbl)); |
| 885 | &lea ("ebp",&DWP(128,"ebp")); | 2127 | &sub ($key,1); |
| 886 | &dec ($key); | 2128 | &jnz (&label("prefetch_tbl")); |
| 887 | &jnz (&label("prefetch_te")); | 2129 | &sub ($tbl,2048); |
| 888 | &sub ("ebp",2048); | 2130 | |
| 889 | 2131 | &mov ($acc,$_inp); | |
| 890 | &mov ($s2,$_len); | ||
| 891 | &mov ($key,$_ivp); | 2132 | &mov ($key,$_ivp); |
| 892 | &test ($s2,0xFFFFFFF0); | ||
| 893 | &jz (&label("enc_tail")); # short input... | ||
| 894 | 2133 | ||
| 2134 | &cmp ($s3,0); | ||
| 2135 | &je (&label("fast_decrypt")); | ||
| 2136 | |||
| 2137 | #----------------------------- ENCRYPT -----------------------------# | ||
| 895 | &mov ($s0,&DWP(0,$key)); # load iv | 2138 | &mov ($s0,&DWP(0,$key)); # load iv |
| 896 | &mov ($s1,&DWP(4,$key)); | 2139 | &mov ($s1,&DWP(4,$key)); |
| 897 | 2140 | ||
| 898 | &align (4); | 2141 | &set_label("fast_enc_loop",16); |
| 899 | &set_label("enc_loop"); | ||
| 900 | &mov ($s2,&DWP(8,$key)); | 2142 | &mov ($s2,&DWP(8,$key)); |
| 901 | &mov ($s3,&DWP(12,$key)); | 2143 | &mov ($s3,&DWP(12,$key)); |
| 902 | 2144 | ||
| @@ -916,22 +2158,16 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 916 | &mov (&DWP(8,$key),$s2); | 2158 | &mov (&DWP(8,$key),$s2); |
| 917 | &mov (&DWP(12,$key),$s3); | 2159 | &mov (&DWP(12,$key),$s3); |
| 918 | 2160 | ||
| 2161 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
| 919 | &mov ($s2,$_len); # load len | 2162 | &mov ($s2,$_len); # load len |
| 920 | |||
| 921 | &lea ($acc,&DWP(16,$acc)); | ||
| 922 | &mov ($_inp,$acc); # save inp | 2163 | &mov ($_inp,$acc); # save inp |
| 923 | 2164 | &lea ($s3,&DWP(16,$key)); # advance out | |
| 924 | &lea ($s3,&DWP(16,$key)); | ||
| 925 | &mov ($_out,$s3); # save out | 2165 | &mov ($_out,$s3); # save out |
| 926 | 2166 | &sub ($s2,16); # decrease len | |
| 927 | &sub ($s2,16); | ||
| 928 | &test ($s2,0xFFFFFFF0); | ||
| 929 | &mov ($_len,$s2); # save len | 2167 | &mov ($_len,$s2); # save len |
| 930 | &jnz (&label("enc_loop")); | 2168 | &jnz (&label("fast_enc_loop")); |
| 931 | &test ($s2,15); | ||
| 932 | &jnz (&label("enc_tail")); | ||
| 933 | &mov ($acc,$_ivp); # load ivp | 2169 | &mov ($acc,$_ivp); # load ivp |
| 934 | &mov ($s2,&DWP(8,$key)); # restore last dwords | 2170 | &mov ($s2,&DWP(8,$key)); # restore last 2 dwords |
| 935 | &mov ($s3,&DWP(12,$key)); | 2171 | &mov ($s3,&DWP(12,$key)); |
| 936 | &mov (&DWP(0,$acc),$s0); # save ivec | 2172 | &mov (&DWP(0,$acc),$s0); # save ivec |
| 937 | &mov (&DWP(4,$acc),$s1); | 2173 | &mov (&DWP(4,$acc),$s1); |
| @@ -949,125 +2185,20 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 949 | &set_label("skip_ezero") | 2185 | &set_label("skip_ezero") |
| 950 | &mov ("esp",$_esp); | 2186 | &mov ("esp",$_esp); |
| 951 | &popf (); | 2187 | &popf (); |
| 952 | &set_label("enc_out"); | 2188 | &set_label("drop_out"); |
| 953 | &function_end_A(); | 2189 | &function_end_A(); |
| 954 | &pushf (); # kludge, never executed | 2190 | &pushf (); # kludge, never executed |
| 955 | 2191 | ||
| 956 | &align (4); | ||
| 957 | &set_label("enc_tail"); | ||
| 958 | &mov ($s0,$key eq "edi" ? $key : ""); | ||
| 959 | &mov ($key,$_out); # load out | ||
| 960 | &push ($s0); # push ivp | ||
| 961 | &mov ($s1,16); | ||
| 962 | &sub ($s1,$s2); | ||
| 963 | &cmp ($key,$acc); # compare with inp | ||
| 964 | &je (&label("enc_in_place")); | ||
| 965 | &align (4); | ||
| 966 | &data_word(0xA4F3F689); # rep movsb # copy input | ||
| 967 | &jmp (&label("enc_skip_in_place")); | ||
| 968 | &set_label("enc_in_place"); | ||
| 969 | &lea ($key,&DWP(0,$key,$s2)); | ||
| 970 | &set_label("enc_skip_in_place"); | ||
| 971 | &mov ($s2,$s1); | ||
| 972 | &xor ($s0,$s0); | ||
| 973 | &align (4); | ||
| 974 | &data_word(0xAAF3F689); # rep stosb # zero tail | ||
| 975 | &pop ($key); # pop ivp | ||
| 976 | |||
| 977 | &mov ($acc,$_out); # output as input | ||
| 978 | &mov ($s0,&DWP(0,$key)); | ||
| 979 | &mov ($s1,&DWP(4,$key)); | ||
| 980 | &mov ($_len,16); # len=16 | ||
| 981 | &jmp (&label("enc_loop")); # one more spin... | ||
| 982 | |||
| 983 | #----------------------------- DECRYPT -----------------------------# | 2192 | #----------------------------- DECRYPT -----------------------------# |
| 984 | &align (4); | 2193 | &set_label("fast_decrypt",16); |
| 985 | &set_label("DECRYPT"); | ||
| 986 | &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); | ||
| 987 | |||
| 988 | # allocate aligned stack frame... | ||
| 989 | &lea ($key,&DWP(-64-244,"esp")); | ||
| 990 | &and ($key,-64); | ||
| 991 | |||
| 992 | # ... and make sure it doesn't alias with AES_Td modulo 4096 | ||
| 993 | &mov ($s0,"ebp"); | ||
| 994 | &lea ($s1,&DWP(2048+256,"ebp")); | ||
| 995 | &mov ($s3,$key); | ||
| 996 | &and ($s0,0xfff); # s = %ebp&0xfff | ||
| 997 | &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff | ||
| 998 | &and ($s3,0xfff); # p = %esp&0xfff | ||
| 999 | |||
| 1000 | &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e); | ||
| 1001 | &jb (&label("td_break_out")); | ||
| 1002 | &sub ($s3,$s1); | ||
| 1003 | &sub ($key,$s3); | ||
| 1004 | &jmp (&label("td_ok")); | ||
| 1005 | &set_label("td_break_out"); # else %esp -= (p-s)&0xfff + framesz; | ||
| 1006 | &sub ($s3,$s0); | ||
| 1007 | &and ($s3,0xfff); | ||
| 1008 | &add ($s3,64+256); | ||
| 1009 | &sub ($key,$s3); | ||
| 1010 | &align (4); | ||
| 1011 | &set_label("td_ok"); | ||
| 1012 | |||
| 1013 | &mov ($s0,&wparam(0)); # load inp | ||
| 1014 | &mov ($s1,&wparam(1)); # load out | ||
| 1015 | &mov ($s3,&wparam(3)); # load key | ||
| 1016 | &mov ($acc,&wparam(4)); # load ivp | ||
| 1017 | |||
| 1018 | &exch ("esp",$key); | ||
| 1019 | &add ("esp",4); # reserve for return address! | ||
| 1020 | &mov ($_esp,$key); # save %esp | ||
| 1021 | |||
| 1022 | &mov ($_inp,$s0); # save copy of inp | ||
| 1023 | &mov ($_out,$s1); # save copy of out | ||
| 1024 | &mov ($_len,$s2); # save copy of len | ||
| 1025 | &mov ($_key,$s3); # save copy of key | ||
| 1026 | &mov ($_ivp,$acc); # save copy of ivp | ||
| 1027 | |||
| 1028 | &mov ($mark,0); # copy of aes_key->rounds = 0; | ||
| 1029 | if ($compromise) { | ||
| 1030 | &cmp ($s2,$compromise); | ||
| 1031 | &jb (&label("skip_dcopy")); | ||
| 1032 | } | ||
| 1033 | # do we copy key schedule to stack? | ||
| 1034 | &mov ($s1 eq "ebx" ? $s1 : "",$s3); | ||
| 1035 | &mov ($s2 eq "ecx" ? $s2 : "",244/4); | ||
| 1036 | &sub ($s1,"ebp"); | ||
| 1037 | &mov ("esi",$s3); | ||
| 1038 | &and ($s1,0xfff); | ||
| 1039 | &lea ("edi",$aes_key); | ||
| 1040 | &cmp ($s1,2048+256); | ||
| 1041 | &jb (&label("do_dcopy")); | ||
| 1042 | &cmp ($s1,4096-244); | ||
| 1043 | &jb (&label("skip_dcopy")); | ||
| 1044 | &align (4); | ||
| 1045 | &set_label("do_dcopy"); | ||
| 1046 | &mov ($_key,"edi"); | ||
| 1047 | &data_word(0xA5F3F689); # rep movsd | ||
| 1048 | &set_label("skip_dcopy"); | ||
| 1049 | |||
| 1050 | &mov ($acc,$s0); | ||
| 1051 | &mov ($key,18); | ||
| 1052 | &align (4); | ||
| 1053 | &set_label("prefetch_td"); | ||
| 1054 | &mov ($s0,&DWP(0,"ebp")); | ||
| 1055 | &mov ($s1,&DWP(32,"ebp")); | ||
| 1056 | &mov ($s2,&DWP(64,"ebp")); | ||
| 1057 | &mov ($s3,&DWP(96,"ebp")); | ||
| 1058 | &lea ("ebp",&DWP(128,"ebp")); | ||
| 1059 | &dec ($key); | ||
| 1060 | &jnz (&label("prefetch_td")); | ||
| 1061 | &sub ("ebp",2048+256); | ||
| 1062 | 2194 | ||
| 1063 | &cmp ($acc,$_out); | 2195 | &cmp ($acc,$_out); |
| 1064 | &je (&label("dec_in_place")); # in-place processing... | 2196 | &je (&label("fast_dec_in_place")); # in-place processing... |
| 1065 | 2197 | ||
| 1066 | &mov ($key,$_ivp); # load ivp | ||
| 1067 | &mov ($_tmp,$key); | 2198 | &mov ($_tmp,$key); |
| 1068 | 2199 | ||
| 1069 | &align (4); | 2200 | &align (4); |
| 1070 | &set_label("dec_loop"); | 2201 | &set_label("fast_dec_loop",16); |
| 1071 | &mov ($s0,&DWP(0,$acc)); # read input | 2202 | &mov ($s0,&DWP(0,$acc)); # read input |
| 1072 | &mov ($s1,&DWP(4,$acc)); | 2203 | &mov ($s1,&DWP(4,$acc)); |
| 1073 | &mov ($s2,&DWP(8,$acc)); | 2204 | &mov ($s2,&DWP(8,$acc)); |
| @@ -1083,27 +2214,24 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 1083 | &xor ($s2,&DWP(8,$key)); | 2214 | &xor ($s2,&DWP(8,$key)); |
| 1084 | &xor ($s3,&DWP(12,$key)); | 2215 | &xor ($s3,&DWP(12,$key)); |
| 1085 | 2216 | ||
| 1086 | &sub ($acc,16); | ||
| 1087 | &jc (&label("dec_partial")); | ||
| 1088 | &mov ($_len,$acc); # save len | ||
| 1089 | &mov ($acc,$_inp); # load inp | ||
| 1090 | &mov ($key,$_out); # load out | 2217 | &mov ($key,$_out); # load out |
| 2218 | &mov ($acc,$_inp); # load inp | ||
| 1091 | 2219 | ||
| 1092 | &mov (&DWP(0,$key),$s0); # write output | 2220 | &mov (&DWP(0,$key),$s0); # write output |
| 1093 | &mov (&DWP(4,$key),$s1); | 2221 | &mov (&DWP(4,$key),$s1); |
| 1094 | &mov (&DWP(8,$key),$s2); | 2222 | &mov (&DWP(8,$key),$s2); |
| 1095 | &mov (&DWP(12,$key),$s3); | 2223 | &mov (&DWP(12,$key),$s3); |
| 1096 | 2224 | ||
| 2225 | &mov ($s2,$_len); # load len | ||
| 1097 | &mov ($_tmp,$acc); # save ivp | 2226 | &mov ($_tmp,$acc); # save ivp |
| 1098 | &lea ($acc,&DWP(16,$acc)); | 2227 | &lea ($acc,&DWP(16,$acc)); # advance inp |
| 1099 | &mov ($_inp,$acc); # save inp | 2228 | &mov ($_inp,$acc); # save inp |
| 1100 | 2229 | &lea ($key,&DWP(16,$key)); # advance out | |
| 1101 | &lea ($key,&DWP(16,$key)); | ||
| 1102 | &mov ($_out,$key); # save out | 2230 | &mov ($_out,$key); # save out |
| 1103 | 2231 | &sub ($s2,16); # decrease len | |
| 1104 | &jnz (&label("dec_loop")); | 2232 | &mov ($_len,$s2); # save len |
| 2233 | &jnz (&label("fast_dec_loop")); | ||
| 1105 | &mov ($key,$_tmp); # load temp ivp | 2234 | &mov ($key,$_tmp); # load temp ivp |
| 1106 | &set_label("dec_end"); | ||
| 1107 | &mov ($acc,$_ivp); # load user ivp | 2235 | &mov ($acc,$_ivp); # load user ivp |
| 1108 | &mov ($s0,&DWP(0,$key)); # load iv | 2236 | &mov ($s0,&DWP(0,$key)); # load iv |
| 1109 | &mov ($s1,&DWP(4,$key)); | 2237 | &mov ($s1,&DWP(4,$key)); |
| @@ -1113,31 +2241,16 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 1113 | &mov (&DWP(4,$acc),$s1); | 2241 | &mov (&DWP(4,$acc),$s1); |
| 1114 | &mov (&DWP(8,$acc),$s2); | 2242 | &mov (&DWP(8,$acc),$s2); |
| 1115 | &mov (&DWP(12,$acc),$s3); | 2243 | &mov (&DWP(12,$acc),$s3); |
| 1116 | &jmp (&label("dec_out")); | 2244 | &jmp (&label("fast_dec_out")); |
| 1117 | 2245 | ||
| 1118 | &align (4); | 2246 | &set_label("fast_dec_in_place",16); |
| 1119 | &set_label("dec_partial"); | 2247 | &set_label("fast_dec_in_place_loop"); |
| 1120 | &lea ($key,$ivec); | ||
| 1121 | &mov (&DWP(0,$key),$s0); # dump output to stack | ||
| 1122 | &mov (&DWP(4,$key),$s1); | ||
| 1123 | &mov (&DWP(8,$key),$s2); | ||
| 1124 | &mov (&DWP(12,$key),$s3); | ||
| 1125 | &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc)); | ||
| 1126 | &mov ($acc eq "esi" ? $acc : "",$key); | ||
| 1127 | &mov ($key eq "edi" ? $key : "",$_out); # load out | ||
| 1128 | &data_word(0xA4F3F689); # rep movsb # copy output | ||
| 1129 | &mov ($key,$_inp); # use inp as temp ivp | ||
| 1130 | &jmp (&label("dec_end")); | ||
| 1131 | |||
| 1132 | &align (4); | ||
| 1133 | &set_label("dec_in_place"); | ||
| 1134 | &set_label("dec_in_place_loop"); | ||
| 1135 | &lea ($key,$ivec); | ||
| 1136 | &mov ($s0,&DWP(0,$acc)); # read input | 2248 | &mov ($s0,&DWP(0,$acc)); # read input |
| 1137 | &mov ($s1,&DWP(4,$acc)); | 2249 | &mov ($s1,&DWP(4,$acc)); |
| 1138 | &mov ($s2,&DWP(8,$acc)); | 2250 | &mov ($s2,&DWP(8,$acc)); |
| 1139 | &mov ($s3,&DWP(12,$acc)); | 2251 | &mov ($s3,&DWP(12,$acc)); |
| 1140 | 2252 | ||
| 2253 | &lea ($key,$ivec); | ||
| 1141 | &mov (&DWP(0,$key),$s0); # copy to temp | 2254 | &mov (&DWP(0,$key),$s0); # copy to temp |
| 1142 | &mov (&DWP(4,$key),$s1); | 2255 | &mov (&DWP(4,$key),$s1); |
| 1143 | &mov (&DWP(8,$key),$s2); | 2256 | &mov (&DWP(8,$key),$s2); |
| @@ -1158,7 +2271,7 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 1158 | &mov (&DWP(8,$acc),$s2); | 2271 | &mov (&DWP(8,$acc),$s2); |
| 1159 | &mov (&DWP(12,$acc),$s3); | 2272 | &mov (&DWP(12,$acc),$s3); |
| 1160 | 2273 | ||
| 1161 | &lea ($acc,&DWP(16,$acc)); | 2274 | &lea ($acc,&DWP(16,$acc)); # advance out |
| 1162 | &mov ($_out,$acc); # save out | 2275 | &mov ($_out,$acc); # save out |
| 1163 | 2276 | ||
| 1164 | &lea ($acc,$ivec); | 2277 | &lea ($acc,$ivec); |
| @@ -1173,40 +2286,340 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 1173 | &mov (&DWP(12,$key),$s3); | 2286 | &mov (&DWP(12,$key),$s3); |
| 1174 | 2287 | ||
| 1175 | &mov ($acc,$_inp); # load inp | 2288 | &mov ($acc,$_inp); # load inp |
| 2289 | &mov ($s2,$_len); # load len | ||
| 2290 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
| 2291 | &mov ($_inp,$acc); # save inp | ||
| 2292 | &sub ($s2,16); # decrease len | ||
| 2293 | &mov ($_len,$s2); # save len | ||
| 2294 | &jnz (&label("fast_dec_in_place_loop")); | ||
| 2295 | |||
| 2296 | &set_label("fast_dec_out",4); | ||
| 2297 | &cmp ($mark,0); # was the key schedule copied? | ||
| 2298 | &mov ("edi",$_key); | ||
| 2299 | &je (&label("skip_dzero")); | ||
| 2300 | # zero copy of key schedule | ||
| 2301 | &mov ("ecx",240/4); | ||
| 2302 | &xor ("eax","eax"); | ||
| 2303 | &align (4); | ||
| 2304 | &data_word(0xABF3F689); # rep stosd | ||
| 2305 | &set_label("skip_dzero") | ||
| 2306 | &mov ("esp",$_esp); | ||
| 2307 | &popf (); | ||
| 2308 | &function_end_A(); | ||
| 2309 | &pushf (); # kludge, never executed | ||
| 2310 | |||
| 2311 | #--------------------------- SLOW ROUTINE ---------------------------# | ||
| 2312 | &set_label("slow_way",16); | ||
| 2313 | |||
| 2314 | &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap | ||
| 2315 | &mov ($key,&wparam(3)); # load key | ||
| 2316 | |||
| 2317 | # pre-allocate aligned stack frame... | ||
| 2318 | &lea ($acc,&DWP(-80,"esp")); | ||
| 2319 | &and ($acc,-64); | ||
| 2320 | |||
| 2321 | # ... and make sure it doesn't alias with $key modulo 1024 | ||
| 2322 | &lea ($s1,&DWP(-80-63,$key)); | ||
| 2323 | &sub ($s1,$acc); | ||
| 2324 | &neg ($s1); | ||
| 2325 | &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line | ||
| 2326 | &sub ($acc,$s1); | ||
| 2327 | |||
| 2328 | # pick S-box copy which can't overlap with stack frame or $key | ||
| 2329 | &lea ($s1,&DWP(768,$acc)); | ||
| 2330 | &sub ($s1,$tbl); | ||
| 2331 | &and ($s1,0x300); | ||
| 2332 | &lea ($tbl,&DWP(2048+128,$tbl,$s1)); | ||
| 2333 | |||
| 2334 | &lea ($s3,&wparam(0)); # pointer to parameter block | ||
| 2335 | |||
| 2336 | &exch ("esp",$acc); | ||
| 2337 | &add ("esp",4); # reserve for return address! | ||
| 2338 | &mov ($_tbl,$tbl); # save %ebp | ||
| 2339 | &mov ($_esp,$acc); # save %esp | ||
| 2340 | &mov ($_tmp,$s0); # save OPENSSL_ia32cap | ||
| 2341 | |||
| 2342 | &mov ($s0,&DWP(0,$s3)); # load inp | ||
| 2343 | &mov ($s1,&DWP(4,$s3)); # load out | ||
| 2344 | #&mov ($s2,&DWP(8,$s3)); # load len | ||
| 2345 | #&mov ($key,&DWP(12,$s3)); # load key | ||
| 2346 | &mov ($acc,&DWP(16,$s3)); # load ivp | ||
| 2347 | &mov ($s3,&DWP(20,$s3)); # load enc flag | ||
| 2348 | |||
| 2349 | &mov ($_inp,$s0); # save copy of inp | ||
| 2350 | &mov ($_out,$s1); # save copy of out | ||
| 2351 | &mov ($_len,$s2); # save copy of len | ||
| 2352 | &mov ($_key,$key); # save copy of key | ||
| 2353 | &mov ($_ivp,$acc); # save copy of ivp | ||
| 2354 | |||
| 2355 | &mov ($key,$acc); | ||
| 2356 | &mov ($acc,$s0); | ||
| 2357 | |||
| 2358 | &cmp ($s3,0); | ||
| 2359 | &je (&label("slow_decrypt")); | ||
| 2360 | |||
| 2361 | #--------------------------- SLOW ENCRYPT ---------------------------# | ||
| 2362 | &cmp ($s2,16); | ||
| 2363 | &mov ($s3,$s1); | ||
| 2364 | &jb (&label("slow_enc_tail")); | ||
| 2365 | |||
| 2366 | if (!$x86only) { | ||
| 2367 | &bt ($_tmp,25); # check for SSE bit | ||
| 2368 | &jnc (&label("slow_enc_x86")); | ||
| 1176 | 2369 | ||
| 1177 | &lea ($acc,&DWP(16,$acc)); | 2370 | &movq ("mm0",&QWP(0,$key)); # load iv |
| 2371 | &movq ("mm4",&QWP(8,$key)); | ||
| 2372 | |||
| 2373 | &set_label("slow_enc_loop_sse",16); | ||
| 2374 | &pxor ("mm0",&QWP(0,$acc)); # xor input data | ||
| 2375 | &pxor ("mm4",&QWP(8,$acc)); | ||
| 2376 | |||
| 2377 | &mov ($key,$_key); | ||
| 2378 | &call ("_sse_AES_encrypt_compact"); | ||
| 2379 | |||
| 2380 | &mov ($acc,$_inp); # load inp | ||
| 2381 | &mov ($key,$_out); # load out | ||
| 2382 | &mov ($s2,$_len); # load len | ||
| 2383 | |||
| 2384 | &movq (&QWP(0,$key),"mm0"); # save output data | ||
| 2385 | &movq (&QWP(8,$key),"mm4"); | ||
| 2386 | |||
| 2387 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
| 1178 | &mov ($_inp,$acc); # save inp | 2388 | &mov ($_inp,$acc); # save inp |
| 2389 | &lea ($s3,&DWP(16,$key)); # advance out | ||
| 2390 | &mov ($_out,$s3); # save out | ||
| 2391 | &sub ($s2,16); # decrease len | ||
| 2392 | &cmp ($s2,16); | ||
| 2393 | &mov ($_len,$s2); # save len | ||
| 2394 | &jae (&label("slow_enc_loop_sse")); | ||
| 2395 | &test ($s2,15); | ||
| 2396 | &jnz (&label("slow_enc_tail")); | ||
| 2397 | &mov ($acc,$_ivp); # load ivp | ||
| 2398 | &movq (&QWP(0,$acc),"mm0"); # save ivec | ||
| 2399 | &movq (&QWP(8,$acc),"mm4"); | ||
| 2400 | &emms (); | ||
| 2401 | &mov ("esp",$_esp); | ||
| 2402 | &popf (); | ||
| 2403 | &function_end_A(); | ||
| 2404 | &pushf (); # kludge, never executed | ||
| 2405 | } | ||
| 2406 | &set_label("slow_enc_x86",16); | ||
| 2407 | &mov ($s0,&DWP(0,$key)); # load iv | ||
| 2408 | &mov ($s1,&DWP(4,$key)); | ||
| 2409 | |||
| 2410 | &set_label("slow_enc_loop_x86",4); | ||
| 2411 | &mov ($s2,&DWP(8,$key)); | ||
| 2412 | &mov ($s3,&DWP(12,$key)); | ||
| 2413 | |||
| 2414 | &xor ($s0,&DWP(0,$acc)); # xor input data | ||
| 2415 | &xor ($s1,&DWP(4,$acc)); | ||
| 2416 | &xor ($s2,&DWP(8,$acc)); | ||
| 2417 | &xor ($s3,&DWP(12,$acc)); | ||
| 2418 | |||
| 2419 | &mov ($key,$_key); # load key | ||
| 2420 | &call ("_x86_AES_encrypt_compact"); | ||
| 2421 | |||
| 2422 | &mov ($acc,$_inp); # load inp | ||
| 2423 | &mov ($key,$_out); # load out | ||
| 2424 | |||
| 2425 | &mov (&DWP(0,$key),$s0); # save output data | ||
| 2426 | &mov (&DWP(4,$key),$s1); | ||
| 2427 | &mov (&DWP(8,$key),$s2); | ||
| 2428 | &mov (&DWP(12,$key),$s3); | ||
| 1179 | 2429 | ||
| 1180 | &mov ($s2,$_len); # load len | 2430 | &mov ($s2,$_len); # load len |
| 1181 | &sub ($s2,16); | 2431 | &lea ($acc,&DWP(16,$acc)); # advance inp |
| 1182 | &jc (&label("dec_in_place_partial")); | 2432 | &mov ($_inp,$acc); # save inp |
| 2433 | &lea ($s3,&DWP(16,$key)); # advance out | ||
| 2434 | &mov ($_out,$s3); # save out | ||
| 2435 | &sub ($s2,16); # decrease len | ||
| 2436 | &cmp ($s2,16); | ||
| 1183 | &mov ($_len,$s2); # save len | 2437 | &mov ($_len,$s2); # save len |
| 1184 | &jnz (&label("dec_in_place_loop")); | 2438 | &jae (&label("slow_enc_loop_x86")); |
| 1185 | &jmp (&label("dec_out")); | 2439 | &test ($s2,15); |
| 1186 | 2440 | &jnz (&label("slow_enc_tail")); | |
| 1187 | &align (4); | 2441 | &mov ($acc,$_ivp); # load ivp |
| 1188 | &set_label("dec_in_place_partial"); | 2442 | &mov ($s2,&DWP(8,$key)); # restore last dwords |
| 1189 | # one can argue if this is actually required... | 2443 | &mov ($s3,&DWP(12,$key)); |
| 1190 | &mov ($key eq "edi" ? $key : "",$_out); | 2444 | &mov (&DWP(0,$acc),$s0); # save ivec |
| 1191 | &lea ($acc eq "esi" ? $acc : "",$ivec); | 2445 | &mov (&DWP(4,$acc),$s1); |
| 2446 | &mov (&DWP(8,$acc),$s2); | ||
| 2447 | &mov (&DWP(12,$acc),$s3); | ||
| 2448 | |||
| 2449 | &mov ("esp",$_esp); | ||
| 2450 | &popf (); | ||
| 2451 | &function_end_A(); | ||
| 2452 | &pushf (); # kludge, never executed | ||
| 2453 | |||
| 2454 | &set_label("slow_enc_tail",16); | ||
| 2455 | &emms () if (!$x86only); | ||
| 2456 | &mov ($key eq "edi"? $key:"",$s3); # load out to edi | ||
| 2457 | &mov ($s1,16); | ||
| 2458 | &sub ($s1,$s2); | ||
| 2459 | &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp | ||
| 2460 | &je (&label("enc_in_place")); | ||
| 2461 | &align (4); | ||
| 2462 | &data_word(0xA4F3F689); # rep movsb # copy input | ||
| 2463 | &jmp (&label("enc_skip_in_place")); | ||
| 2464 | &set_label("enc_in_place"); | ||
| 1192 | &lea ($key,&DWP(0,$key,$s2)); | 2465 | &lea ($key,&DWP(0,$key,$s2)); |
| 1193 | &lea ($acc,&DWP(16,$acc,$s2)); | 2466 | &set_label("enc_skip_in_place"); |
| 1194 | &neg ($s2 eq "ecx" ? $s2 : ""); | 2467 | &mov ($s2,$s1); |
| 1195 | &data_word(0xA4F3F689); # rep movsb # restore tail | 2468 | &xor ($s0,$s0); |
| 1196 | 2469 | &align (4); | |
| 1197 | &align (4); | 2470 | &data_word(0xAAF3F689); # rep stosb # zero tail |
| 1198 | &set_label("dec_out"); | 2471 | |
| 1199 | &cmp ($mark,0); # was the key schedule copied? | 2472 | &mov ($key,$_ivp); # restore ivp |
| 1200 | &mov ("edi",$_key); | 2473 | &mov ($acc,$s3); # output as input |
| 1201 | &je (&label("skip_dzero")); | 2474 | &mov ($s0,&DWP(0,$key)); |
| 1202 | # zero copy of key schedule | 2475 | &mov ($s1,&DWP(4,$key)); |
| 1203 | &mov ("ecx",240/4); | 2476 | &mov ($_len,16); # len=16 |
| 1204 | &xor ("eax","eax"); | 2477 | &jmp (&label("slow_enc_loop_x86")); # one more spin... |
| 1205 | &align (4); | 2478 | |
| 1206 | &data_word(0xABF3F689); # rep stosd | 2479 | #--------------------------- SLOW DECRYPT ---------------------------# |
| 1207 | &set_label("skip_dzero") | 2480 | &set_label("slow_decrypt",16); |
| 1208 | &mov ("esp",$_esp); | 2481 | if (!$x86only) { |
| 1209 | &popf (); | 2482 | &bt ($_tmp,25); # check for SSE bit |
| 2483 | &jnc (&label("slow_dec_loop_x86")); | ||
| 2484 | |||
| 2485 | &set_label("slow_dec_loop_sse",4); | ||
| 2486 | &movq ("mm0",&QWP(0,$acc)); # read input | ||
| 2487 | &movq ("mm4",&QWP(8,$acc)); | ||
| 2488 | |||
| 2489 | &mov ($key,$_key); | ||
| 2490 | &call ("_sse_AES_decrypt_compact"); | ||
| 2491 | |||
| 2492 | &mov ($acc,$_inp); # load inp | ||
| 2493 | &lea ($s0,$ivec); | ||
| 2494 | &mov ($s1,$_out); # load out | ||
| 2495 | &mov ($s2,$_len); # load len | ||
| 2496 | &mov ($key,$_ivp); # load ivp | ||
| 2497 | |||
| 2498 | &movq ("mm1",&QWP(0,$acc)); # re-read input | ||
| 2499 | &movq ("mm5",&QWP(8,$acc)); | ||
| 2500 | |||
| 2501 | &pxor ("mm0",&QWP(0,$key)); # xor iv | ||
| 2502 | &pxor ("mm4",&QWP(8,$key)); | ||
| 2503 | |||
| 2504 | &movq (&QWP(0,$key),"mm1"); # copy input to iv | ||
| 2505 | &movq (&QWP(8,$key),"mm5"); | ||
| 2506 | |||
| 2507 | &sub ($s2,16); # decrease len | ||
| 2508 | &jc (&label("slow_dec_partial_sse")); | ||
| 2509 | |||
| 2510 | &movq (&QWP(0,$s1),"mm0"); # write output | ||
| 2511 | &movq (&QWP(8,$s1),"mm4"); | ||
| 2512 | |||
| 2513 | &lea ($s1,&DWP(16,$s1)); # advance out | ||
| 2514 | &mov ($_out,$s1); # save out | ||
| 2515 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
| 2516 | &mov ($_inp,$acc); # save inp | ||
| 2517 | &mov ($_len,$s2); # save len | ||
| 2518 | &jnz (&label("slow_dec_loop_sse")); | ||
| 2519 | &emms (); | ||
| 2520 | &mov ("esp",$_esp); | ||
| 2521 | &popf (); | ||
| 2522 | &function_end_A(); | ||
| 2523 | &pushf (); # kludge, never executed | ||
| 2524 | |||
| 2525 | &set_label("slow_dec_partial_sse",16); | ||
| 2526 | &movq (&QWP(0,$s0),"mm0"); # save output to temp | ||
| 2527 | &movq (&QWP(8,$s0),"mm4"); | ||
| 2528 | &emms (); | ||
| 2529 | |||
| 2530 | &add ($s2 eq "ecx" ? "ecx":"",16); | ||
| 2531 | &mov ("edi",$s1); # out | ||
| 2532 | &mov ("esi",$s0); # temp | ||
| 2533 | &align (4); | ||
| 2534 | &data_word(0xA4F3F689); # rep movsb # copy partial output | ||
| 2535 | |||
| 2536 | &mov ("esp",$_esp); | ||
| 2537 | &popf (); | ||
| 2538 | &function_end_A(); | ||
| 2539 | &pushf (); # kludge, never executed | ||
| 2540 | } | ||
| 2541 | &set_label("slow_dec_loop_x86",16); | ||
| 2542 | &mov ($s0,&DWP(0,$acc)); # read input | ||
| 2543 | &mov ($s1,&DWP(4,$acc)); | ||
| 2544 | &mov ($s2,&DWP(8,$acc)); | ||
| 2545 | &mov ($s3,&DWP(12,$acc)); | ||
| 2546 | |||
| 2547 | &lea ($key,$ivec); | ||
| 2548 | &mov (&DWP(0,$key),$s0); # copy to temp | ||
| 2549 | &mov (&DWP(4,$key),$s1); | ||
| 2550 | &mov (&DWP(8,$key),$s2); | ||
| 2551 | &mov (&DWP(12,$key),$s3); | ||
| 2552 | |||
| 2553 | &mov ($key,$_key); # load key | ||
| 2554 | &call ("_x86_AES_decrypt_compact"); | ||
| 2555 | |||
| 2556 | &mov ($key,$_ivp); # load ivp | ||
| 2557 | &mov ($acc,$_len); # load len | ||
| 2558 | &xor ($s0,&DWP(0,$key)); # xor iv | ||
| 2559 | &xor ($s1,&DWP(4,$key)); | ||
| 2560 | &xor ($s2,&DWP(8,$key)); | ||
| 2561 | &xor ($s3,&DWP(12,$key)); | ||
| 2562 | |||
| 2563 | &sub ($acc,16); | ||
| 2564 | &jc (&label("slow_dec_partial_x86")); | ||
| 2565 | |||
| 2566 | &mov ($_len,$acc); # save len | ||
| 2567 | &mov ($acc,$_out); # load out | ||
| 2568 | |||
| 2569 | &mov (&DWP(0,$acc),$s0); # write output | ||
| 2570 | &mov (&DWP(4,$acc),$s1); | ||
| 2571 | &mov (&DWP(8,$acc),$s2); | ||
| 2572 | &mov (&DWP(12,$acc),$s3); | ||
| 2573 | |||
| 2574 | &lea ($acc,&DWP(16,$acc)); # advance out | ||
| 2575 | &mov ($_out,$acc); # save out | ||
| 2576 | |||
| 2577 | &lea ($acc,$ivec); | ||
| 2578 | &mov ($s0,&DWP(0,$acc)); # read temp | ||
| 2579 | &mov ($s1,&DWP(4,$acc)); | ||
| 2580 | &mov ($s2,&DWP(8,$acc)); | ||
| 2581 | &mov ($s3,&DWP(12,$acc)); | ||
| 2582 | |||
| 2583 | &mov (&DWP(0,$key),$s0); # copy it to iv | ||
| 2584 | &mov (&DWP(4,$key),$s1); | ||
| 2585 | &mov (&DWP(8,$key),$s2); | ||
| 2586 | &mov (&DWP(12,$key),$s3); | ||
| 2587 | |||
| 2588 | &mov ($acc,$_inp); # load inp | ||
| 2589 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
| 2590 | &mov ($_inp,$acc); # save inp | ||
| 2591 | &jnz (&label("slow_dec_loop_x86")); | ||
| 2592 | &mov ("esp",$_esp); | ||
| 2593 | &popf (); | ||
| 2594 | &function_end_A(); | ||
| 2595 | &pushf (); # kludge, never executed | ||
| 2596 | |||
| 2597 | &set_label("slow_dec_partial_x86",16); | ||
| 2598 | &lea ($acc,$ivec); | ||
| 2599 | &mov (&DWP(0,$acc),$s0); # save output to temp | ||
| 2600 | &mov (&DWP(4,$acc),$s1); | ||
| 2601 | &mov (&DWP(8,$acc),$s2); | ||
| 2602 | &mov (&DWP(12,$acc),$s3); | ||
| 2603 | |||
| 2604 | &mov ($acc,$_inp); | ||
| 2605 | &mov ($s0,&DWP(0,$acc)); # re-read input | ||
| 2606 | &mov ($s1,&DWP(4,$acc)); | ||
| 2607 | &mov ($s2,&DWP(8,$acc)); | ||
| 2608 | &mov ($s3,&DWP(12,$acc)); | ||
| 2609 | |||
| 2610 | &mov (&DWP(0,$key),$s0); # copy it to iv | ||
| 2611 | &mov (&DWP(4,$key),$s1); | ||
| 2612 | &mov (&DWP(8,$key),$s2); | ||
| 2613 | &mov (&DWP(12,$key),$s3); | ||
| 2614 | |||
| 2615 | &mov ("ecx",$_len); | ||
| 2616 | &mov ("edi",$_out); | ||
| 2617 | &lea ("esi",$ivec); | ||
| 2618 | &align (4); | ||
| 2619 | &data_word(0xA4F3F689); # rep movsb # copy partial output | ||
| 2620 | |||
| 2621 | &mov ("esp",$_esp); | ||
| 2622 | &popf (); | ||
| 1210 | &function_end("AES_cbc_encrypt"); | 2623 | &function_end("AES_cbc_encrypt"); |
| 1211 | } | 2624 | } |
| 1212 | 2625 | ||
| @@ -1215,35 +2628,31 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 1215 | sub enckey() | 2628 | sub enckey() |
| 1216 | { | 2629 | { |
| 1217 | &movz ("esi",&LB("edx")); # rk[i]>>0 | 2630 | &movz ("esi",&LB("edx")); # rk[i]>>0 |
| 1218 | &mov ("ebx",&DWP(2,"ebp","esi",8)); | 2631 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1219 | &movz ("esi",&HB("edx")); # rk[i]>>8 | 2632 | &movz ("esi",&HB("edx")); # rk[i]>>8 |
| 1220 | &and ("ebx",0xFF000000); | 2633 | &shl ("ebx",24); |
| 1221 | &xor ("eax","ebx"); | 2634 | &xor ("eax","ebx"); |
| 1222 | 2635 | ||
| 1223 | &mov ("ebx",&DWP(2,"ebp","esi",8)); | 2636 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1224 | &shr ("edx",16); | 2637 | &shr ("edx",16); |
| 1225 | &and ("ebx",0x000000FF); | ||
| 1226 | &movz ("esi",&LB("edx")); # rk[i]>>16 | 2638 | &movz ("esi",&LB("edx")); # rk[i]>>16 |
| 1227 | &xor ("eax","ebx"); | 2639 | &xor ("eax","ebx"); |
| 1228 | 2640 | ||
| 1229 | &mov ("ebx",&DWP(0,"ebp","esi",8)); | 2641 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1230 | &movz ("esi",&HB("edx")); # rk[i]>>24 | 2642 | &movz ("esi",&HB("edx")); # rk[i]>>24 |
| 1231 | &and ("ebx",0x0000FF00); | 2643 | &shl ("ebx",8); |
| 1232 | &xor ("eax","ebx"); | 2644 | &xor ("eax","ebx"); |
| 1233 | 2645 | ||
| 1234 | &mov ("ebx",&DWP(0,"ebp","esi",8)); | 2646 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1235 | &and ("ebx",0x00FF0000); | 2647 | &shl ("ebx",16); |
| 1236 | &xor ("eax","ebx"); | 2648 | &xor ("eax","ebx"); |
| 1237 | 2649 | ||
| 1238 | &xor ("eax",&DWP(2048,"ebp","ecx",4)); # rcon | 2650 | &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon |
| 1239 | } | 2651 | } |
| 1240 | 2652 | ||
| 1241 | # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | 2653 | &function_begin("_x86_AES_set_encrypt_key"); |
| 1242 | # AES_KEY *key) | 2654 | &mov ("esi",&wparam(1)); # user supplied key |
| 1243 | &public_label("AES_Te"); | 2655 | &mov ("edi",&wparam(3)); # private key schedule |
| 1244 | &function_begin("AES_set_encrypt_key"); | ||
| 1245 | &mov ("esi",&wparam(0)); # user supplied key | ||
| 1246 | &mov ("edi",&wparam(2)); # private key schedule | ||
| 1247 | 2656 | ||
| 1248 | &test ("esi",-1); | 2657 | &test ("esi",-1); |
| 1249 | &jz (&label("badpointer")); | 2658 | &jz (&label("badpointer")); |
| @@ -1252,10 +2661,21 @@ sub enckey() | |||
| 1252 | 2661 | ||
| 1253 | &call (&label("pic_point")); | 2662 | &call (&label("pic_point")); |
| 1254 | &set_label("pic_point"); | 2663 | &set_label("pic_point"); |
| 1255 | &blindpop("ebp"); | 2664 | &blindpop($tbl); |
| 1256 | &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 2665 | &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); |
| 1257 | 2666 | &lea ($tbl,&DWP(2048+128,$tbl)); | |
| 1258 | &mov ("ecx",&wparam(1)); # number of bits in key | 2667 | |
| 2668 | # prefetch Te4 | ||
| 2669 | &mov ("eax",&DWP(0-128,$tbl)); | ||
| 2670 | &mov ("ebx",&DWP(32-128,$tbl)); | ||
| 2671 | &mov ("ecx",&DWP(64-128,$tbl)); | ||
| 2672 | &mov ("edx",&DWP(96-128,$tbl)); | ||
| 2673 | &mov ("eax",&DWP(128-128,$tbl)); | ||
| 2674 | &mov ("ebx",&DWP(160-128,$tbl)); | ||
| 2675 | &mov ("ecx",&DWP(192-128,$tbl)); | ||
| 2676 | &mov ("edx",&DWP(224-128,$tbl)); | ||
| 2677 | |||
| 2678 | &mov ("ecx",&wparam(2)); # number of bits in key | ||
| 1259 | &cmp ("ecx",128); | 2679 | &cmp ("ecx",128); |
| 1260 | &je (&label("10rounds")); | 2680 | &je (&label("10rounds")); |
| 1261 | &cmp ("ecx",192); | 2681 | &cmp ("ecx",192); |
| @@ -1394,24 +2814,23 @@ sub enckey() | |||
| 1394 | &mov ("edx","eax"); | 2814 | &mov ("edx","eax"); |
| 1395 | &mov ("eax",&DWP(16,"edi")); # rk[4] | 2815 | &mov ("eax",&DWP(16,"edi")); # rk[4] |
| 1396 | &movz ("esi",&LB("edx")); # rk[11]>>0 | 2816 | &movz ("esi",&LB("edx")); # rk[11]>>0 |
| 1397 | &mov ("ebx",&DWP(2,"ebp","esi",8)); | 2817 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1398 | &movz ("esi",&HB("edx")); # rk[11]>>8 | 2818 | &movz ("esi",&HB("edx")); # rk[11]>>8 |
| 1399 | &and ("ebx",0x000000FF); | ||
| 1400 | &xor ("eax","ebx"); | 2819 | &xor ("eax","ebx"); |
| 1401 | 2820 | ||
| 1402 | &mov ("ebx",&DWP(0,"ebp","esi",8)); | 2821 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1403 | &shr ("edx",16); | 2822 | &shr ("edx",16); |
| 1404 | &and ("ebx",0x0000FF00); | 2823 | &shl ("ebx",8); |
| 1405 | &movz ("esi",&LB("edx")); # rk[11]>>16 | 2824 | &movz ("esi",&LB("edx")); # rk[11]>>16 |
| 1406 | &xor ("eax","ebx"); | 2825 | &xor ("eax","ebx"); |
| 1407 | 2826 | ||
| 1408 | &mov ("ebx",&DWP(0,"ebp","esi",8)); | 2827 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1409 | &movz ("esi",&HB("edx")); # rk[11]>>24 | 2828 | &movz ("esi",&HB("edx")); # rk[11]>>24 |
| 1410 | &and ("ebx",0x00FF0000); | 2829 | &shl ("ebx",16); |
| 1411 | &xor ("eax","ebx"); | 2830 | &xor ("eax","ebx"); |
| 1412 | 2831 | ||
| 1413 | &mov ("ebx",&DWP(2,"ebp","esi",8)); | 2832 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1414 | &and ("ebx",0xFF000000); | 2833 | &shl ("ebx",24); |
| 1415 | &xor ("eax","ebx"); | 2834 | &xor ("eax","ebx"); |
| 1416 | 2835 | ||
| 1417 | &mov (&DWP(48,"edi"),"eax"); # rk[12] | 2836 | &mov (&DWP(48,"edi"),"eax"); # rk[12] |
| @@ -1433,43 +2852,74 @@ sub enckey() | |||
| 1433 | &set_label("badpointer"); | 2852 | &set_label("badpointer"); |
| 1434 | &mov ("eax",-1); | 2853 | &mov ("eax",-1); |
| 1435 | &set_label("exit"); | 2854 | &set_label("exit"); |
| 1436 | &function_end("AES_set_encrypt_key"); | 2855 | &function_end("_x86_AES_set_encrypt_key"); |
| 1437 | 2856 | ||
| 1438 | sub deckey() | 2857 | # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, |
| 1439 | { my ($i,$ptr,$te,$td) = @_; | 2858 | # AES_KEY *key) |
| 2859 | &function_begin_B("AES_set_encrypt_key"); | ||
| 2860 | &call ("_x86_AES_set_encrypt_key"); | ||
| 2861 | &ret (); | ||
| 2862 | &function_end_B("AES_set_encrypt_key"); | ||
| 1440 | 2863 | ||
| 1441 | &mov ("eax",&DWP($i,$ptr)); | 2864 | sub deckey() |
| 1442 | &mov ("edx","eax"); | 2865 | { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; |
| 1443 | &movz ("ebx",&HB("eax")); | 2866 | my $tmp = $tbl; |
| 1444 | &shr ("edx",16); | 2867 | |
| 1445 | &and ("eax",0xFF); | 2868 | &mov ($acc,$tp1); |
| 1446 | &movz ("eax",&BP(2,$te,"eax",8)); | 2869 | &and ($acc,0x80808080); |
| 1447 | &movz ("ebx",&BP(2,$te,"ebx",8)); | 2870 | &mov ($tmp,$acc); |
| 1448 | &mov ("eax",&DWP(0,$td,"eax",8)); | 2871 | &shr ($tmp,7); |
| 1449 | &xor ("eax",&DWP(3,$td,"ebx",8)); | 2872 | &lea ($tp2,&DWP(0,$tp1,$tp1)); |
| 1450 | &movz ("ebx",&HB("edx")); | 2873 | &sub ($acc,$tmp); |
| 1451 | &and ("edx",0xFF); | 2874 | &and ($tp2,0xfefefefe); |
| 1452 | &movz ("edx",&BP(2,$te,"edx",8)); | 2875 | &and ($acc,0x1b1b1b1b); |
| 1453 | &movz ("ebx",&BP(2,$te,"ebx",8)); | 2876 | &xor ($acc,$tp2); |
| 1454 | &xor ("eax",&DWP(2,$td,"edx",8)); | 2877 | &mov ($tp2,$acc); |
| 1455 | &xor ("eax",&DWP(1,$td,"ebx",8)); | 2878 | |
| 1456 | &mov (&DWP($i,$ptr),"eax"); | 2879 | &and ($acc,0x80808080); |
| 2880 | &mov ($tmp,$acc); | ||
| 2881 | &shr ($tmp,7); | ||
| 2882 | &lea ($tp4,&DWP(0,$tp2,$tp2)); | ||
| 2883 | &sub ($acc,$tmp); | ||
| 2884 | &and ($tp4,0xfefefefe); | ||
| 2885 | &and ($acc,0x1b1b1b1b); | ||
| 2886 | &xor ($tp2,$tp1); # tp2^tp1 | ||
| 2887 | &xor ($acc,$tp4); | ||
| 2888 | &mov ($tp4,$acc); | ||
| 2889 | |||
| 2890 | &and ($acc,0x80808080); | ||
| 2891 | &mov ($tmp,$acc); | ||
| 2892 | &shr ($tmp,7); | ||
| 2893 | &lea ($tp8,&DWP(0,$tp4,$tp4)); | ||
| 2894 | &xor ($tp4,$tp1); # tp4^tp1 | ||
| 2895 | &sub ($acc,$tmp); | ||
| 2896 | &and ($tp8,0xfefefefe); | ||
| 2897 | &and ($acc,0x1b1b1b1b); | ||
| 2898 | &rotl ($tp1,8); # = ROTATE(tp1,8) | ||
| 2899 | &xor ($tp8,$acc); | ||
| 2900 | |||
| 2901 | &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load | ||
| 2902 | |||
| 2903 | &xor ($tp1,$tp2); | ||
| 2904 | &xor ($tp2,$tp8); | ||
| 2905 | &xor ($tp1,$tp4); | ||
| 2906 | &rotl ($tp2,24); | ||
| 2907 | &xor ($tp4,$tp8); | ||
| 2908 | &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) | ||
| 2909 | &rotl ($tp4,16); | ||
| 2910 | &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24) | ||
| 2911 | &rotl ($tp8,8); | ||
| 2912 | &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16) | ||
| 2913 | &mov ($tp2,$tmp); | ||
| 2914 | &xor ($tp1,$tp8); # ^= ROTATE(tp8,8) | ||
| 2915 | |||
| 2916 | &mov (&DWP(4*$i,$key),$tp1); | ||
| 1457 | } | 2917 | } |
| 1458 | 2918 | ||
| 1459 | # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | 2919 | # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, |
| 1460 | # AES_KEY *key) | 2920 | # AES_KEY *key) |
| 1461 | &public_label("AES_Td"); | ||
| 1462 | &public_label("AES_Te"); | ||
| 1463 | &function_begin_B("AES_set_decrypt_key"); | 2921 | &function_begin_B("AES_set_decrypt_key"); |
| 1464 | &mov ("eax",&wparam(0)); | 2922 | &call ("_x86_AES_set_encrypt_key"); |
| 1465 | &mov ("ecx",&wparam(1)); | ||
| 1466 | &mov ("edx",&wparam(2)); | ||
| 1467 | &sub ("esp",12); | ||
| 1468 | &mov (&DWP(0,"esp"),"eax"); | ||
| 1469 | &mov (&DWP(4,"esp"),"ecx"); | ||
| 1470 | &mov (&DWP(8,"esp"),"edx"); | ||
| 1471 | &call ("AES_set_encrypt_key"); | ||
| 1472 | &add ("esp",12); | ||
| 1473 | &cmp ("eax",0); | 2923 | &cmp ("eax",0); |
| 1474 | &je (&label("proceed")); | 2924 | &je (&label("proceed")); |
| 1475 | &ret (); | 2925 | &ret (); |
| @@ -1485,8 +2935,7 @@ sub deckey() | |||
| 1485 | &lea ("ecx",&DWP(0,"","ecx",4)); | 2935 | &lea ("ecx",&DWP(0,"","ecx",4)); |
| 1486 | &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk | 2936 | &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk |
| 1487 | 2937 | ||
| 1488 | &align (4); | 2938 | &set_label("invert",4); # invert order of chunks |
| 1489 | &set_label("invert"); # invert order of chunks | ||
| 1490 | &mov ("eax",&DWP(0,"esi")); | 2939 | &mov ("eax",&DWP(0,"esi")); |
| 1491 | &mov ("ebx",&DWP(4,"esi")); | 2940 | &mov ("ebx",&DWP(4,"esi")); |
| 1492 | &mov ("ecx",&DWP(0,"edi")); | 2941 | &mov ("ecx",&DWP(0,"edi")); |
| @@ -1508,26 +2957,24 @@ sub deckey() | |||
| 1508 | &cmp ("esi","edi"); | 2957 | &cmp ("esi","edi"); |
| 1509 | &jne (&label("invert")); | 2958 | &jne (&label("invert")); |
| 1510 | 2959 | ||
| 1511 | &call (&label("pic_point")); | 2960 | &mov ($key,&wparam(2)); |
| 1512 | &set_label("pic_point"); | 2961 | &mov ($acc,&DWP(240,$key)); # pull number of rounds |
| 1513 | blindpop("ebp"); | 2962 | &lea ($acc,&DWP(-2,$acc,$acc)); |
| 1514 | &lea ("edi",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); | 2963 | &lea ($acc,&DWP(0,$key,$acc,8)); |
| 1515 | &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 2964 | &mov (&wparam(2),$acc); |
| 1516 | 2965 | ||
| 1517 | &mov ("esi",&wparam(2)); | 2966 | &mov ($s0,&DWP(16,$key)); # modulo-scheduled load |
| 1518 | &mov ("ecx",&DWP(240,"esi")); # pull number of rounds | 2967 | &set_label("permute",4); # permute the key schedule |
| 1519 | &dec ("ecx"); | 2968 | &add ($key,16); |
| 1520 | &align (4); | 2969 | &deckey (0,$key,$s0,$s1,$s2,$s3); |
| 1521 | &set_label("permute"); # permute the key schedule | 2970 | &deckey (1,$key,$s1,$s2,$s3,$s0); |
| 1522 | &add ("esi",16); | 2971 | &deckey (2,$key,$s2,$s3,$s0,$s1); |
| 1523 | &deckey (0,"esi","ebp","edi"); | 2972 | &deckey (3,$key,$s3,$s0,$s1,$s2); |
| 1524 | &deckey (4,"esi","ebp","edi"); | 2973 | &cmp ($key,&wparam(2)); |
| 1525 | &deckey (8,"esi","ebp","edi"); | 2974 | &jb (&label("permute")); |
| 1526 | &deckey (12,"esi","ebp","edi"); | ||
| 1527 | &dec ("ecx"); | ||
| 1528 | &jnz (&label("permute")); | ||
| 1529 | 2975 | ||
| 1530 | &xor ("eax","eax"); # return success | 2976 | &xor ("eax","eax"); # return success |
| 1531 | &function_end("AES_set_decrypt_key"); | 2977 | &function_end("AES_set_decrypt_key"); |
| 2978 | &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 1532 | 2979 | ||
| 1533 | &asm_finish(); | 2980 | &asm_finish(); |
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl index 15742c1ec5..690244111a 100644 --- a/src/lib/libcrypto/aes/asm/aes-armv4.pl +++ b/src/lib/libcrypto/aes/asm/aes-armv4.pl | |||
| @@ -1024,6 +1024,7 @@ _armv4_AES_decrypt: | |||
| 1024 | mov pc,lr @ return | 1024 | mov pc,lr @ return |
| 1025 | .size _armv4_AES_decrypt,.-_armv4_AES_decrypt | 1025 | .size _armv4_AES_decrypt,.-_armv4_AES_decrypt |
| 1026 | .asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | 1026 | .asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" |
| 1027 | .align 2 | ||
| 1027 | ___ | 1028 | ___ |
| 1028 | 1029 | ||
| 1029 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | 1030 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 |
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl index ce427655ef..f82c5e1814 100644 --- a/src/lib/libcrypto/aes/asm/aes-ppc.pl +++ b/src/lib/libcrypto/aes/asm/aes-ppc.pl | |||
| @@ -16,6 +16,19 @@ | |||
| 16 | # at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - | 16 | # at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - |
| 17 | # at 1/3 of ppc_AES_decrypt. | 17 | # at 1/3 of ppc_AES_decrypt. |
| 18 | 18 | ||
| 19 | # February 2010 | ||
| 20 | # | ||
| 21 | # Rescheduling instructions to favour Power6 pipeline gives 10% | ||
| 22 | # performance improvement on the platform in question (and marginal | ||
| 23 | # improvement even on others). It should be noted that Power6 fails | ||
| 24 | # to process byte in 18 cycles, only in 23, because it fails to issue | ||
| 25 | # 4 load instructions in two cycles, only in 3. As a result non-compact | ||
| 26 | # block subroutines are 25% slower than one would expect. Compact | ||
| 27 | # functions scale better, because they have pure computational part, | ||
| 28 | # which scales perfectly with clock frequency. To be specific | ||
| 29 | # ppc_AES_encrypt_compact operates at 42 cycles per byte, while | ||
| 30 | # ppc_AES_decrypt_compact - at 55 (in 64-bit build). | ||
| 31 | |||
| 19 | $flavour = shift; | 32 | $flavour = shift; |
| 20 | 33 | ||
| 21 | if ($flavour =~ /64/) { | 34 | if ($flavour =~ /64/) { |
| @@ -376,7 +389,7 @@ $code.=<<___; | |||
| 376 | addi $sp,$sp,$FRAME | 389 | addi $sp,$sp,$FRAME |
| 377 | blr | 390 | blr |
| 378 | 391 | ||
| 379 | .align 4 | 392 | .align 5 |
| 380 | Lppc_AES_encrypt: | 393 | Lppc_AES_encrypt: |
| 381 | lwz $acc00,240($key) | 394 | lwz $acc00,240($key) |
| 382 | lwz $t0,0($key) | 395 | lwz $t0,0($key) |
| @@ -397,46 +410,46 @@ Lppc_AES_encrypt: | |||
| 397 | Lenc_loop: | 410 | Lenc_loop: |
| 398 | rlwinm $acc00,$s0,`32-24+3`,21,28 | 411 | rlwinm $acc00,$s0,`32-24+3`,21,28 |
| 399 | rlwinm $acc01,$s1,`32-24+3`,21,28 | 412 | rlwinm $acc01,$s1,`32-24+3`,21,28 |
| 400 | lwz $t0,0($key) | ||
| 401 | lwz $t1,4($key) | ||
| 402 | rlwinm $acc02,$s2,`32-24+3`,21,28 | 413 | rlwinm $acc02,$s2,`32-24+3`,21,28 |
| 403 | rlwinm $acc03,$s3,`32-24+3`,21,28 | 414 | rlwinm $acc03,$s3,`32-24+3`,21,28 |
| 404 | lwz $t2,8($key) | 415 | lwz $t0,0($key) |
| 405 | lwz $t3,12($key) | 416 | lwz $t1,4($key) |
| 406 | rlwinm $acc04,$s1,`32-16+3`,21,28 | 417 | rlwinm $acc04,$s1,`32-16+3`,21,28 |
| 407 | rlwinm $acc05,$s2,`32-16+3`,21,28 | 418 | rlwinm $acc05,$s2,`32-16+3`,21,28 |
| 408 | lwzx $acc00,$Tbl0,$acc00 | 419 | lwz $t2,8($key) |
| 409 | lwzx $acc01,$Tbl0,$acc01 | 420 | lwz $t3,12($key) |
| 410 | rlwinm $acc06,$s3,`32-16+3`,21,28 | 421 | rlwinm $acc06,$s3,`32-16+3`,21,28 |
| 411 | rlwinm $acc07,$s0,`32-16+3`,21,28 | 422 | rlwinm $acc07,$s0,`32-16+3`,21,28 |
| 412 | lwzx $acc02,$Tbl0,$acc02 | 423 | lwzx $acc00,$Tbl0,$acc00 |
| 413 | lwzx $acc03,$Tbl0,$acc03 | 424 | lwzx $acc01,$Tbl0,$acc01 |
| 414 | rlwinm $acc08,$s2,`32-8+3`,21,28 | 425 | rlwinm $acc08,$s2,`32-8+3`,21,28 |
| 415 | rlwinm $acc09,$s3,`32-8+3`,21,28 | 426 | rlwinm $acc09,$s3,`32-8+3`,21,28 |
| 416 | lwzx $acc04,$Tbl1,$acc04 | 427 | lwzx $acc02,$Tbl0,$acc02 |
| 417 | lwzx $acc05,$Tbl1,$acc05 | 428 | lwzx $acc03,$Tbl0,$acc03 |
| 418 | rlwinm $acc10,$s0,`32-8+3`,21,28 | 429 | rlwinm $acc10,$s0,`32-8+3`,21,28 |
| 419 | rlwinm $acc11,$s1,`32-8+3`,21,28 | 430 | rlwinm $acc11,$s1,`32-8+3`,21,28 |
| 420 | lwzx $acc06,$Tbl1,$acc06 | 431 | lwzx $acc04,$Tbl1,$acc04 |
| 421 | lwzx $acc07,$Tbl1,$acc07 | 432 | lwzx $acc05,$Tbl1,$acc05 |
| 422 | rlwinm $acc12,$s3,`0+3`,21,28 | 433 | rlwinm $acc12,$s3,`0+3`,21,28 |
| 423 | rlwinm $acc13,$s0,`0+3`,21,28 | 434 | rlwinm $acc13,$s0,`0+3`,21,28 |
| 424 | lwzx $acc08,$Tbl2,$acc08 | 435 | lwzx $acc06,$Tbl1,$acc06 |
| 425 | lwzx $acc09,$Tbl2,$acc09 | 436 | lwzx $acc07,$Tbl1,$acc07 |
| 426 | rlwinm $acc14,$s1,`0+3`,21,28 | 437 | rlwinm $acc14,$s1,`0+3`,21,28 |
| 427 | rlwinm $acc15,$s2,`0+3`,21,28 | 438 | rlwinm $acc15,$s2,`0+3`,21,28 |
| 428 | lwzx $acc10,$Tbl2,$acc10 | 439 | lwzx $acc08,$Tbl2,$acc08 |
| 429 | lwzx $acc11,$Tbl2,$acc11 | 440 | lwzx $acc09,$Tbl2,$acc09 |
| 430 | xor $t0,$t0,$acc00 | 441 | xor $t0,$t0,$acc00 |
| 431 | xor $t1,$t1,$acc01 | 442 | xor $t1,$t1,$acc01 |
| 432 | lwzx $acc12,$Tbl3,$acc12 | 443 | lwzx $acc10,$Tbl2,$acc10 |
| 433 | lwzx $acc13,$Tbl3,$acc13 | 444 | lwzx $acc11,$Tbl2,$acc11 |
| 434 | xor $t2,$t2,$acc02 | 445 | xor $t2,$t2,$acc02 |
| 435 | xor $t3,$t3,$acc03 | 446 | xor $t3,$t3,$acc03 |
| 436 | lwzx $acc14,$Tbl3,$acc14 | 447 | lwzx $acc12,$Tbl3,$acc12 |
| 437 | lwzx $acc15,$Tbl3,$acc15 | 448 | lwzx $acc13,$Tbl3,$acc13 |
| 438 | xor $t0,$t0,$acc04 | 449 | xor $t0,$t0,$acc04 |
| 439 | xor $t1,$t1,$acc05 | 450 | xor $t1,$t1,$acc05 |
| 451 | lwzx $acc14,$Tbl3,$acc14 | ||
| 452 | lwzx $acc15,$Tbl3,$acc15 | ||
| 440 | xor $t2,$t2,$acc06 | 453 | xor $t2,$t2,$acc06 |
| 441 | xor $t3,$t3,$acc07 | 454 | xor $t3,$t3,$acc07 |
| 442 | xor $t0,$t0,$acc08 | 455 | xor $t0,$t0,$acc08 |
| @@ -452,60 +465,60 @@ Lenc_loop: | |||
| 452 | 465 | ||
| 453 | addi $Tbl2,$Tbl0,2048 | 466 | addi $Tbl2,$Tbl0,2048 |
| 454 | nop | 467 | nop |
| 455 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 | ||
| 456 | lwz $acc09,`2048+32`($Tbl0) | ||
| 457 | lwz $acc10,`2048+64`($Tbl0) | ||
| 458 | lwz $acc11,`2048+96`($Tbl0) | ||
| 459 | lwz $acc08,`2048+128`($Tbl0) | ||
| 460 | lwz $acc09,`2048+160`($Tbl0) | ||
| 461 | lwz $acc10,`2048+192`($Tbl0) | ||
| 462 | lwz $acc11,`2048+224`($Tbl0) | ||
| 463 | rlwinm $acc00,$s0,`32-24`,24,31 | ||
| 464 | rlwinm $acc01,$s1,`32-24`,24,31 | ||
| 465 | lwz $t0,0($key) | 468 | lwz $t0,0($key) |
| 466 | lwz $t1,4($key) | 469 | lwz $t1,4($key) |
| 467 | rlwinm $acc02,$s2,`32-24`,24,31 | 470 | rlwinm $acc00,$s0,`32-24`,24,31 |
| 468 | rlwinm $acc03,$s3,`32-24`,24,31 | 471 | rlwinm $acc01,$s1,`32-24`,24,31 |
| 469 | lwz $t2,8($key) | 472 | lwz $t2,8($key) |
| 470 | lwz $t3,12($key) | 473 | lwz $t3,12($key) |
| 474 | rlwinm $acc02,$s2,`32-24`,24,31 | ||
| 475 | rlwinm $acc03,$s3,`32-24`,24,31 | ||
| 476 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 | ||
| 477 | lwz $acc09,`2048+32`($Tbl0) | ||
| 471 | rlwinm $acc04,$s1,`32-16`,24,31 | 478 | rlwinm $acc04,$s1,`32-16`,24,31 |
| 472 | rlwinm $acc05,$s2,`32-16`,24,31 | 479 | rlwinm $acc05,$s2,`32-16`,24,31 |
| 473 | lbzx $acc00,$Tbl2,$acc00 | 480 | lwz $acc10,`2048+64`($Tbl0) |
| 474 | lbzx $acc01,$Tbl2,$acc01 | 481 | lwz $acc11,`2048+96`($Tbl0) |
| 475 | rlwinm $acc06,$s3,`32-16`,24,31 | 482 | rlwinm $acc06,$s3,`32-16`,24,31 |
| 476 | rlwinm $acc07,$s0,`32-16`,24,31 | 483 | rlwinm $acc07,$s0,`32-16`,24,31 |
| 477 | lbzx $acc02,$Tbl2,$acc02 | 484 | lwz $acc12,`2048+128`($Tbl0) |
| 478 | lbzx $acc03,$Tbl2,$acc03 | 485 | lwz $acc13,`2048+160`($Tbl0) |
| 479 | rlwinm $acc08,$s2,`32-8`,24,31 | 486 | rlwinm $acc08,$s2,`32-8`,24,31 |
| 480 | rlwinm $acc09,$s3,`32-8`,24,31 | 487 | rlwinm $acc09,$s3,`32-8`,24,31 |
| 481 | lbzx $acc04,$Tbl2,$acc04 | 488 | lwz $acc14,`2048+192`($Tbl0) |
| 482 | lbzx $acc05,$Tbl2,$acc05 | 489 | lwz $acc15,`2048+224`($Tbl0) |
| 483 | rlwinm $acc10,$s0,`32-8`,24,31 | 490 | rlwinm $acc10,$s0,`32-8`,24,31 |
| 484 | rlwinm $acc11,$s1,`32-8`,24,31 | 491 | rlwinm $acc11,$s1,`32-8`,24,31 |
| 485 | lbzx $acc06,$Tbl2,$acc06 | 492 | lbzx $acc00,$Tbl2,$acc00 |
| 486 | lbzx $acc07,$Tbl2,$acc07 | 493 | lbzx $acc01,$Tbl2,$acc01 |
| 487 | rlwinm $acc12,$s3,`0`,24,31 | 494 | rlwinm $acc12,$s3,`0`,24,31 |
| 488 | rlwinm $acc13,$s0,`0`,24,31 | 495 | rlwinm $acc13,$s0,`0`,24,31 |
| 489 | lbzx $acc08,$Tbl2,$acc08 | 496 | lbzx $acc02,$Tbl2,$acc02 |
| 490 | lbzx $acc09,$Tbl2,$acc09 | 497 | lbzx $acc03,$Tbl2,$acc03 |
| 491 | rlwinm $acc14,$s1,`0`,24,31 | 498 | rlwinm $acc14,$s1,`0`,24,31 |
| 492 | rlwinm $acc15,$s2,`0`,24,31 | 499 | rlwinm $acc15,$s2,`0`,24,31 |
| 493 | lbzx $acc10,$Tbl2,$acc10 | 500 | lbzx $acc04,$Tbl2,$acc04 |
| 494 | lbzx $acc11,$Tbl2,$acc11 | 501 | lbzx $acc05,$Tbl2,$acc05 |
| 495 | rlwinm $s0,$acc00,24,0,7 | 502 | rlwinm $s0,$acc00,24,0,7 |
| 496 | rlwinm $s1,$acc01,24,0,7 | 503 | rlwinm $s1,$acc01,24,0,7 |
| 497 | lbzx $acc12,$Tbl2,$acc12 | 504 | lbzx $acc06,$Tbl2,$acc06 |
| 498 | lbzx $acc13,$Tbl2,$acc13 | 505 | lbzx $acc07,$Tbl2,$acc07 |
| 499 | rlwinm $s2,$acc02,24,0,7 | 506 | rlwinm $s2,$acc02,24,0,7 |
| 500 | rlwinm $s3,$acc03,24,0,7 | 507 | rlwinm $s3,$acc03,24,0,7 |
| 501 | lbzx $acc14,$Tbl2,$acc14 | 508 | lbzx $acc08,$Tbl2,$acc08 |
| 502 | lbzx $acc15,$Tbl2,$acc15 | 509 | lbzx $acc09,$Tbl2,$acc09 |
| 503 | rlwimi $s0,$acc04,16,8,15 | 510 | rlwimi $s0,$acc04,16,8,15 |
| 504 | rlwimi $s1,$acc05,16,8,15 | 511 | rlwimi $s1,$acc05,16,8,15 |
| 512 | lbzx $acc10,$Tbl2,$acc10 | ||
| 513 | lbzx $acc11,$Tbl2,$acc11 | ||
| 505 | rlwimi $s2,$acc06,16,8,15 | 514 | rlwimi $s2,$acc06,16,8,15 |
| 506 | rlwimi $s3,$acc07,16,8,15 | 515 | rlwimi $s3,$acc07,16,8,15 |
| 516 | lbzx $acc12,$Tbl2,$acc12 | ||
| 517 | lbzx $acc13,$Tbl2,$acc13 | ||
| 507 | rlwimi $s0,$acc08,8,16,23 | 518 | rlwimi $s0,$acc08,8,16,23 |
| 508 | rlwimi $s1,$acc09,8,16,23 | 519 | rlwimi $s1,$acc09,8,16,23 |
| 520 | lbzx $acc14,$Tbl2,$acc14 | ||
| 521 | lbzx $acc15,$Tbl2,$acc15 | ||
| 509 | rlwimi $s2,$acc10,8,16,23 | 522 | rlwimi $s2,$acc10,8,16,23 |
| 510 | rlwimi $s3,$acc11,8,16,23 | 523 | rlwimi $s3,$acc11,8,16,23 |
| 511 | or $s0,$s0,$acc12 | 524 | or $s0,$s0,$acc12 |
| @@ -542,40 +555,40 @@ Lenc_compact_loop: | |||
| 542 | rlwinm $acc01,$s1,`32-24`,24,31 | 555 | rlwinm $acc01,$s1,`32-24`,24,31 |
| 543 | rlwinm $acc02,$s2,`32-24`,24,31 | 556 | rlwinm $acc02,$s2,`32-24`,24,31 |
| 544 | rlwinm $acc03,$s3,`32-24`,24,31 | 557 | rlwinm $acc03,$s3,`32-24`,24,31 |
| 545 | lbzx $acc00,$Tbl1,$acc00 | ||
| 546 | lbzx $acc01,$Tbl1,$acc01 | ||
| 547 | rlwinm $acc04,$s1,`32-16`,24,31 | 558 | rlwinm $acc04,$s1,`32-16`,24,31 |
| 548 | rlwinm $acc05,$s2,`32-16`,24,31 | 559 | rlwinm $acc05,$s2,`32-16`,24,31 |
| 549 | lbzx $acc02,$Tbl1,$acc02 | ||
| 550 | lbzx $acc03,$Tbl1,$acc03 | ||
| 551 | rlwinm $acc06,$s3,`32-16`,24,31 | 560 | rlwinm $acc06,$s3,`32-16`,24,31 |
| 552 | rlwinm $acc07,$s0,`32-16`,24,31 | 561 | rlwinm $acc07,$s0,`32-16`,24,31 |
| 553 | lbzx $acc04,$Tbl1,$acc04 | 562 | lbzx $acc00,$Tbl1,$acc00 |
| 554 | lbzx $acc05,$Tbl1,$acc05 | 563 | lbzx $acc01,$Tbl1,$acc01 |
| 555 | rlwinm $acc08,$s2,`32-8`,24,31 | 564 | rlwinm $acc08,$s2,`32-8`,24,31 |
| 556 | rlwinm $acc09,$s3,`32-8`,24,31 | 565 | rlwinm $acc09,$s3,`32-8`,24,31 |
| 557 | lbzx $acc06,$Tbl1,$acc06 | 566 | lbzx $acc02,$Tbl1,$acc02 |
| 558 | lbzx $acc07,$Tbl1,$acc07 | 567 | lbzx $acc03,$Tbl1,$acc03 |
| 559 | rlwinm $acc10,$s0,`32-8`,24,31 | 568 | rlwinm $acc10,$s0,`32-8`,24,31 |
| 560 | rlwinm $acc11,$s1,`32-8`,24,31 | 569 | rlwinm $acc11,$s1,`32-8`,24,31 |
| 561 | lbzx $acc08,$Tbl1,$acc08 | 570 | lbzx $acc04,$Tbl1,$acc04 |
| 562 | lbzx $acc09,$Tbl1,$acc09 | 571 | lbzx $acc05,$Tbl1,$acc05 |
| 563 | rlwinm $acc12,$s3,`0`,24,31 | 572 | rlwinm $acc12,$s3,`0`,24,31 |
| 564 | rlwinm $acc13,$s0,`0`,24,31 | 573 | rlwinm $acc13,$s0,`0`,24,31 |
| 565 | lbzx $acc10,$Tbl1,$acc10 | 574 | lbzx $acc06,$Tbl1,$acc06 |
| 566 | lbzx $acc11,$Tbl1,$acc11 | 575 | lbzx $acc07,$Tbl1,$acc07 |
| 567 | rlwinm $acc14,$s1,`0`,24,31 | 576 | rlwinm $acc14,$s1,`0`,24,31 |
| 568 | rlwinm $acc15,$s2,`0`,24,31 | 577 | rlwinm $acc15,$s2,`0`,24,31 |
| 569 | lbzx $acc12,$Tbl1,$acc12 | 578 | lbzx $acc08,$Tbl1,$acc08 |
| 570 | lbzx $acc13,$Tbl1,$acc13 | 579 | lbzx $acc09,$Tbl1,$acc09 |
| 571 | rlwinm $s0,$acc00,24,0,7 | 580 | rlwinm $s0,$acc00,24,0,7 |
| 572 | rlwinm $s1,$acc01,24,0,7 | 581 | rlwinm $s1,$acc01,24,0,7 |
| 573 | lbzx $acc14,$Tbl1,$acc14 | 582 | lbzx $acc10,$Tbl1,$acc10 |
| 574 | lbzx $acc15,$Tbl1,$acc15 | 583 | lbzx $acc11,$Tbl1,$acc11 |
| 575 | rlwinm $s2,$acc02,24,0,7 | 584 | rlwinm $s2,$acc02,24,0,7 |
| 576 | rlwinm $s3,$acc03,24,0,7 | 585 | rlwinm $s3,$acc03,24,0,7 |
| 586 | lbzx $acc12,$Tbl1,$acc12 | ||
| 587 | lbzx $acc13,$Tbl1,$acc13 | ||
| 577 | rlwimi $s0,$acc04,16,8,15 | 588 | rlwimi $s0,$acc04,16,8,15 |
| 578 | rlwimi $s1,$acc05,16,8,15 | 589 | rlwimi $s1,$acc05,16,8,15 |
| 590 | lbzx $acc14,$Tbl1,$acc14 | ||
| 591 | lbzx $acc15,$Tbl1,$acc15 | ||
| 579 | rlwimi $s2,$acc06,16,8,15 | 592 | rlwimi $s2,$acc06,16,8,15 |
| 580 | rlwimi $s3,$acc07,16,8,15 | 593 | rlwimi $s3,$acc07,16,8,15 |
| 581 | rlwimi $s0,$acc08,8,16,23 | 594 | rlwimi $s0,$acc08,8,16,23 |
| @@ -725,7 +738,7 @@ Lenc_compact_done: | |||
| 725 | addi $sp,$sp,$FRAME | 738 | addi $sp,$sp,$FRAME |
| 726 | blr | 739 | blr |
| 727 | 740 | ||
| 728 | .align 4 | 741 | .align 5 |
| 729 | Lppc_AES_decrypt: | 742 | Lppc_AES_decrypt: |
| 730 | lwz $acc00,240($key) | 743 | lwz $acc00,240($key) |
| 731 | lwz $t0,0($key) | 744 | lwz $t0,0($key) |
| @@ -746,46 +759,46 @@ Lppc_AES_decrypt: | |||
| 746 | Ldec_loop: | 759 | Ldec_loop: |
| 747 | rlwinm $acc00,$s0,`32-24+3`,21,28 | 760 | rlwinm $acc00,$s0,`32-24+3`,21,28 |
| 748 | rlwinm $acc01,$s1,`32-24+3`,21,28 | 761 | rlwinm $acc01,$s1,`32-24+3`,21,28 |
| 749 | lwz $t0,0($key) | ||
| 750 | lwz $t1,4($key) | ||
| 751 | rlwinm $acc02,$s2,`32-24+3`,21,28 | 762 | rlwinm $acc02,$s2,`32-24+3`,21,28 |
| 752 | rlwinm $acc03,$s3,`32-24+3`,21,28 | 763 | rlwinm $acc03,$s3,`32-24+3`,21,28 |
| 753 | lwz $t2,8($key) | 764 | lwz $t0,0($key) |
| 754 | lwz $t3,12($key) | 765 | lwz $t1,4($key) |
| 755 | rlwinm $acc04,$s3,`32-16+3`,21,28 | 766 | rlwinm $acc04,$s3,`32-16+3`,21,28 |
| 756 | rlwinm $acc05,$s0,`32-16+3`,21,28 | 767 | rlwinm $acc05,$s0,`32-16+3`,21,28 |
| 757 | lwzx $acc00,$Tbl0,$acc00 | 768 | lwz $t2,8($key) |
| 758 | lwzx $acc01,$Tbl0,$acc01 | 769 | lwz $t3,12($key) |
| 759 | rlwinm $acc06,$s1,`32-16+3`,21,28 | 770 | rlwinm $acc06,$s1,`32-16+3`,21,28 |
| 760 | rlwinm $acc07,$s2,`32-16+3`,21,28 | 771 | rlwinm $acc07,$s2,`32-16+3`,21,28 |
| 761 | lwzx $acc02,$Tbl0,$acc02 | 772 | lwzx $acc00,$Tbl0,$acc00 |
| 762 | lwzx $acc03,$Tbl0,$acc03 | 773 | lwzx $acc01,$Tbl0,$acc01 |
| 763 | rlwinm $acc08,$s2,`32-8+3`,21,28 | 774 | rlwinm $acc08,$s2,`32-8+3`,21,28 |
| 764 | rlwinm $acc09,$s3,`32-8+3`,21,28 | 775 | rlwinm $acc09,$s3,`32-8+3`,21,28 |
| 765 | lwzx $acc04,$Tbl1,$acc04 | 776 | lwzx $acc02,$Tbl0,$acc02 |
| 766 | lwzx $acc05,$Tbl1,$acc05 | 777 | lwzx $acc03,$Tbl0,$acc03 |
| 767 | rlwinm $acc10,$s0,`32-8+3`,21,28 | 778 | rlwinm $acc10,$s0,`32-8+3`,21,28 |
| 768 | rlwinm $acc11,$s1,`32-8+3`,21,28 | 779 | rlwinm $acc11,$s1,`32-8+3`,21,28 |
| 769 | lwzx $acc06,$Tbl1,$acc06 | 780 | lwzx $acc04,$Tbl1,$acc04 |
| 770 | lwzx $acc07,$Tbl1,$acc07 | 781 | lwzx $acc05,$Tbl1,$acc05 |
| 771 | rlwinm $acc12,$s1,`0+3`,21,28 | 782 | rlwinm $acc12,$s1,`0+3`,21,28 |
| 772 | rlwinm $acc13,$s2,`0+3`,21,28 | 783 | rlwinm $acc13,$s2,`0+3`,21,28 |
| 773 | lwzx $acc08,$Tbl2,$acc08 | 784 | lwzx $acc06,$Tbl1,$acc06 |
| 774 | lwzx $acc09,$Tbl2,$acc09 | 785 | lwzx $acc07,$Tbl1,$acc07 |
| 775 | rlwinm $acc14,$s3,`0+3`,21,28 | 786 | rlwinm $acc14,$s3,`0+3`,21,28 |
| 776 | rlwinm $acc15,$s0,`0+3`,21,28 | 787 | rlwinm $acc15,$s0,`0+3`,21,28 |
| 777 | lwzx $acc10,$Tbl2,$acc10 | 788 | lwzx $acc08,$Tbl2,$acc08 |
| 778 | lwzx $acc11,$Tbl2,$acc11 | 789 | lwzx $acc09,$Tbl2,$acc09 |
| 779 | xor $t0,$t0,$acc00 | 790 | xor $t0,$t0,$acc00 |
| 780 | xor $t1,$t1,$acc01 | 791 | xor $t1,$t1,$acc01 |
| 781 | lwzx $acc12,$Tbl3,$acc12 | 792 | lwzx $acc10,$Tbl2,$acc10 |
| 782 | lwzx $acc13,$Tbl3,$acc13 | 793 | lwzx $acc11,$Tbl2,$acc11 |
| 783 | xor $t2,$t2,$acc02 | 794 | xor $t2,$t2,$acc02 |
| 784 | xor $t3,$t3,$acc03 | 795 | xor $t3,$t3,$acc03 |
| 785 | lwzx $acc14,$Tbl3,$acc14 | 796 | lwzx $acc12,$Tbl3,$acc12 |
| 786 | lwzx $acc15,$Tbl3,$acc15 | 797 | lwzx $acc13,$Tbl3,$acc13 |
| 787 | xor $t0,$t0,$acc04 | 798 | xor $t0,$t0,$acc04 |
| 788 | xor $t1,$t1,$acc05 | 799 | xor $t1,$t1,$acc05 |
| 800 | lwzx $acc14,$Tbl3,$acc14 | ||
| 801 | lwzx $acc15,$Tbl3,$acc15 | ||
| 789 | xor $t2,$t2,$acc06 | 802 | xor $t2,$t2,$acc06 |
| 790 | xor $t3,$t3,$acc07 | 803 | xor $t3,$t3,$acc07 |
| 791 | xor $t0,$t0,$acc08 | 804 | xor $t0,$t0,$acc08 |
| @@ -801,56 +814,56 @@ Ldec_loop: | |||
| 801 | 814 | ||
| 802 | addi $Tbl2,$Tbl0,2048 | 815 | addi $Tbl2,$Tbl0,2048 |
| 803 | nop | 816 | nop |
| 804 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 | ||
| 805 | lwz $acc09,`2048+32`($Tbl0) | ||
| 806 | lwz $acc10,`2048+64`($Tbl0) | ||
| 807 | lwz $acc11,`2048+96`($Tbl0) | ||
| 808 | lwz $acc08,`2048+128`($Tbl0) | ||
| 809 | lwz $acc09,`2048+160`($Tbl0) | ||
| 810 | lwz $acc10,`2048+192`($Tbl0) | ||
| 811 | lwz $acc11,`2048+224`($Tbl0) | ||
| 812 | rlwinm $acc00,$s0,`32-24`,24,31 | ||
| 813 | rlwinm $acc01,$s1,`32-24`,24,31 | ||
| 814 | lwz $t0,0($key) | 817 | lwz $t0,0($key) |
| 815 | lwz $t1,4($key) | 818 | lwz $t1,4($key) |
| 816 | rlwinm $acc02,$s2,`32-24`,24,31 | 819 | rlwinm $acc00,$s0,`32-24`,24,31 |
| 817 | rlwinm $acc03,$s3,`32-24`,24,31 | 820 | rlwinm $acc01,$s1,`32-24`,24,31 |
| 818 | lwz $t2,8($key) | 821 | lwz $t2,8($key) |
| 819 | lwz $t3,12($key) | 822 | lwz $t3,12($key) |
| 823 | rlwinm $acc02,$s2,`32-24`,24,31 | ||
| 824 | rlwinm $acc03,$s3,`32-24`,24,31 | ||
| 825 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 | ||
| 826 | lwz $acc09,`2048+32`($Tbl0) | ||
| 820 | rlwinm $acc04,$s3,`32-16`,24,31 | 827 | rlwinm $acc04,$s3,`32-16`,24,31 |
| 821 | rlwinm $acc05,$s0,`32-16`,24,31 | 828 | rlwinm $acc05,$s0,`32-16`,24,31 |
| 829 | lwz $acc10,`2048+64`($Tbl0) | ||
| 830 | lwz $acc11,`2048+96`($Tbl0) | ||
| 822 | lbzx $acc00,$Tbl2,$acc00 | 831 | lbzx $acc00,$Tbl2,$acc00 |
| 823 | lbzx $acc01,$Tbl2,$acc01 | 832 | lbzx $acc01,$Tbl2,$acc01 |
| 833 | lwz $acc12,`2048+128`($Tbl0) | ||
| 834 | lwz $acc13,`2048+160`($Tbl0) | ||
| 824 | rlwinm $acc06,$s1,`32-16`,24,31 | 835 | rlwinm $acc06,$s1,`32-16`,24,31 |
| 825 | rlwinm $acc07,$s2,`32-16`,24,31 | 836 | rlwinm $acc07,$s2,`32-16`,24,31 |
| 826 | lbzx $acc02,$Tbl2,$acc02 | 837 | lwz $acc14,`2048+192`($Tbl0) |
| 827 | lbzx $acc03,$Tbl2,$acc03 | 838 | lwz $acc15,`2048+224`($Tbl0) |
| 828 | rlwinm $acc08,$s2,`32-8`,24,31 | 839 | rlwinm $acc08,$s2,`32-8`,24,31 |
| 829 | rlwinm $acc09,$s3,`32-8`,24,31 | 840 | rlwinm $acc09,$s3,`32-8`,24,31 |
| 830 | lbzx $acc04,$Tbl2,$acc04 | 841 | lbzx $acc02,$Tbl2,$acc02 |
| 831 | lbzx $acc05,$Tbl2,$acc05 | 842 | lbzx $acc03,$Tbl2,$acc03 |
| 832 | rlwinm $acc10,$s0,`32-8`,24,31 | 843 | rlwinm $acc10,$s0,`32-8`,24,31 |
| 833 | rlwinm $acc11,$s1,`32-8`,24,31 | 844 | rlwinm $acc11,$s1,`32-8`,24,31 |
| 834 | lbzx $acc06,$Tbl2,$acc06 | 845 | lbzx $acc04,$Tbl2,$acc04 |
| 835 | lbzx $acc07,$Tbl2,$acc07 | 846 | lbzx $acc05,$Tbl2,$acc05 |
| 836 | rlwinm $acc12,$s1,`0`,24,31 | 847 | rlwinm $acc12,$s1,`0`,24,31 |
| 837 | rlwinm $acc13,$s2,`0`,24,31 | 848 | rlwinm $acc13,$s2,`0`,24,31 |
| 838 | lbzx $acc08,$Tbl2,$acc08 | 849 | lbzx $acc06,$Tbl2,$acc06 |
| 839 | lbzx $acc09,$Tbl2,$acc09 | 850 | lbzx $acc07,$Tbl2,$acc07 |
| 840 | rlwinm $acc14,$s3,`0`,24,31 | 851 | rlwinm $acc14,$s3,`0`,24,31 |
| 841 | rlwinm $acc15,$s0,`0`,24,31 | 852 | rlwinm $acc15,$s0,`0`,24,31 |
| 842 | lbzx $acc10,$Tbl2,$acc10 | 853 | lbzx $acc08,$Tbl2,$acc08 |
| 843 | lbzx $acc11,$Tbl2,$acc11 | 854 | lbzx $acc09,$Tbl2,$acc09 |
| 844 | rlwinm $s0,$acc00,24,0,7 | 855 | rlwinm $s0,$acc00,24,0,7 |
| 845 | rlwinm $s1,$acc01,24,0,7 | 856 | rlwinm $s1,$acc01,24,0,7 |
| 846 | lbzx $acc12,$Tbl2,$acc12 | 857 | lbzx $acc10,$Tbl2,$acc10 |
| 847 | lbzx $acc13,$Tbl2,$acc13 | 858 | lbzx $acc11,$Tbl2,$acc11 |
| 848 | rlwinm $s2,$acc02,24,0,7 | 859 | rlwinm $s2,$acc02,24,0,7 |
| 849 | rlwinm $s3,$acc03,24,0,7 | 860 | rlwinm $s3,$acc03,24,0,7 |
| 850 | lbzx $acc14,$Tbl2,$acc14 | 861 | lbzx $acc12,$Tbl2,$acc12 |
| 851 | lbzx $acc15,$Tbl2,$acc15 | 862 | lbzx $acc13,$Tbl2,$acc13 |
| 852 | rlwimi $s0,$acc04,16,8,15 | 863 | rlwimi $s0,$acc04,16,8,15 |
| 853 | rlwimi $s1,$acc05,16,8,15 | 864 | rlwimi $s1,$acc05,16,8,15 |
| 865 | lbzx $acc14,$Tbl2,$acc14 | ||
| 866 | lbzx $acc15,$Tbl2,$acc15 | ||
| 854 | rlwimi $s2,$acc06,16,8,15 | 867 | rlwimi $s2,$acc06,16,8,15 |
| 855 | rlwimi $s3,$acc07,16,8,15 | 868 | rlwimi $s3,$acc07,16,8,15 |
| 856 | rlwimi $s0,$acc08,8,16,23 | 869 | rlwimi $s0,$acc08,8,16,23 |
| @@ -897,40 +910,40 @@ Ldec_compact_loop: | |||
| 897 | rlwinm $acc01,$s1,`32-24`,24,31 | 910 | rlwinm $acc01,$s1,`32-24`,24,31 |
| 898 | rlwinm $acc02,$s2,`32-24`,24,31 | 911 | rlwinm $acc02,$s2,`32-24`,24,31 |
| 899 | rlwinm $acc03,$s3,`32-24`,24,31 | 912 | rlwinm $acc03,$s3,`32-24`,24,31 |
| 900 | lbzx $acc00,$Tbl1,$acc00 | ||
| 901 | lbzx $acc01,$Tbl1,$acc01 | ||
| 902 | rlwinm $acc04,$s3,`32-16`,24,31 | 913 | rlwinm $acc04,$s3,`32-16`,24,31 |
| 903 | rlwinm $acc05,$s0,`32-16`,24,31 | 914 | rlwinm $acc05,$s0,`32-16`,24,31 |
| 904 | lbzx $acc02,$Tbl1,$acc02 | ||
| 905 | lbzx $acc03,$Tbl1,$acc03 | ||
| 906 | rlwinm $acc06,$s1,`32-16`,24,31 | 915 | rlwinm $acc06,$s1,`32-16`,24,31 |
| 907 | rlwinm $acc07,$s2,`32-16`,24,31 | 916 | rlwinm $acc07,$s2,`32-16`,24,31 |
| 908 | lbzx $acc04,$Tbl1,$acc04 | 917 | lbzx $acc00,$Tbl1,$acc00 |
| 909 | lbzx $acc05,$Tbl1,$acc05 | 918 | lbzx $acc01,$Tbl1,$acc01 |
| 910 | rlwinm $acc08,$s2,`32-8`,24,31 | 919 | rlwinm $acc08,$s2,`32-8`,24,31 |
| 911 | rlwinm $acc09,$s3,`32-8`,24,31 | 920 | rlwinm $acc09,$s3,`32-8`,24,31 |
| 912 | lbzx $acc06,$Tbl1,$acc06 | 921 | lbzx $acc02,$Tbl1,$acc02 |
| 913 | lbzx $acc07,$Tbl1,$acc07 | 922 | lbzx $acc03,$Tbl1,$acc03 |
| 914 | rlwinm $acc10,$s0,`32-8`,24,31 | 923 | rlwinm $acc10,$s0,`32-8`,24,31 |
| 915 | rlwinm $acc11,$s1,`32-8`,24,31 | 924 | rlwinm $acc11,$s1,`32-8`,24,31 |
| 916 | lbzx $acc08,$Tbl1,$acc08 | 925 | lbzx $acc04,$Tbl1,$acc04 |
| 917 | lbzx $acc09,$Tbl1,$acc09 | 926 | lbzx $acc05,$Tbl1,$acc05 |
| 918 | rlwinm $acc12,$s1,`0`,24,31 | 927 | rlwinm $acc12,$s1,`0`,24,31 |
| 919 | rlwinm $acc13,$s2,`0`,24,31 | 928 | rlwinm $acc13,$s2,`0`,24,31 |
| 920 | lbzx $acc10,$Tbl1,$acc10 | 929 | lbzx $acc06,$Tbl1,$acc06 |
| 921 | lbzx $acc11,$Tbl1,$acc11 | 930 | lbzx $acc07,$Tbl1,$acc07 |
| 922 | rlwinm $acc14,$s3,`0`,24,31 | 931 | rlwinm $acc14,$s3,`0`,24,31 |
| 923 | rlwinm $acc15,$s0,`0`,24,31 | 932 | rlwinm $acc15,$s0,`0`,24,31 |
| 924 | lbzx $acc12,$Tbl1,$acc12 | 933 | lbzx $acc08,$Tbl1,$acc08 |
| 925 | lbzx $acc13,$Tbl1,$acc13 | 934 | lbzx $acc09,$Tbl1,$acc09 |
| 926 | rlwinm $s0,$acc00,24,0,7 | 935 | rlwinm $s0,$acc00,24,0,7 |
| 927 | rlwinm $s1,$acc01,24,0,7 | 936 | rlwinm $s1,$acc01,24,0,7 |
| 928 | lbzx $acc14,$Tbl1,$acc14 | 937 | lbzx $acc10,$Tbl1,$acc10 |
| 929 | lbzx $acc15,$Tbl1,$acc15 | 938 | lbzx $acc11,$Tbl1,$acc11 |
| 930 | rlwinm $s2,$acc02,24,0,7 | 939 | rlwinm $s2,$acc02,24,0,7 |
| 931 | rlwinm $s3,$acc03,24,0,7 | 940 | rlwinm $s3,$acc03,24,0,7 |
| 941 | lbzx $acc12,$Tbl1,$acc12 | ||
| 942 | lbzx $acc13,$Tbl1,$acc13 | ||
| 932 | rlwimi $s0,$acc04,16,8,15 | 943 | rlwimi $s0,$acc04,16,8,15 |
| 933 | rlwimi $s1,$acc05,16,8,15 | 944 | rlwimi $s1,$acc05,16,8,15 |
| 945 | lbzx $acc14,$Tbl1,$acc14 | ||
| 946 | lbzx $acc15,$Tbl1,$acc15 | ||
| 934 | rlwimi $s2,$acc06,16,8,15 | 947 | rlwimi $s2,$acc06,16,8,15 |
| 935 | rlwimi $s3,$acc07,16,8,15 | 948 | rlwimi $s3,$acc07,16,8,15 |
| 936 | rlwimi $s0,$acc08,8,16,23 | 949 | rlwimi $s0,$acc08,8,16,23 |
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl index 4b27afd92f..7e01889298 100644 --- a/src/lib/libcrypto/aes/asm/aes-s390x.pl +++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl | |||
| @@ -765,6 +765,11 @@ $code.=<<___ if (!$softonly); | |||
| 765 | srl %r5,6 | 765 | srl %r5,6 |
| 766 | ar %r5,%r0 | 766 | ar %r5,%r0 |
| 767 | 767 | ||
| 768 | larl %r1,OPENSSL_s390xcap_P | ||
| 769 | lg %r0,0(%r1) | ||
| 770 | tmhl %r0,0x4000 # check for message-security assist | ||
| 771 | jz .Lekey_internal | ||
| 772 | |||
| 768 | lghi %r0,0 # query capability vector | 773 | lghi %r0,0 # query capability vector |
| 769 | la %r1,16($sp) | 774 | la %r1,16($sp) |
| 770 | .long 0xb92f0042 # kmc %r4,%r2 | 775 | .long 0xb92f0042 # kmc %r4,%r2 |
| @@ -1323,6 +1328,7 @@ $code.=<<___; | |||
| 1323 | 4: ex $len,0($s1) | 1328 | 4: ex $len,0($s1) |
| 1324 | j .Lcbc_dec_exit | 1329 | j .Lcbc_dec_exit |
| 1325 | .size AES_cbc_encrypt,.-AES_cbc_encrypt | 1330 | .size AES_cbc_encrypt,.-AES_cbc_encrypt |
| 1331 | .comm OPENSSL_s390xcap_P,8,8 | ||
| 1326 | ___ | 1332 | ___ |
| 1327 | } | 1333 | } |
| 1328 | $code.=<<___; | 1334 | $code.=<<___; |
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl index f616f1751f..a545e892ae 100755 --- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl +++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl | |||
| @@ -2,11 +2,12 @@ | |||
| 2 | # | 2 | # |
| 3 | # ==================================================================== | 3 | # ==================================================================== |
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
| 5 | # project. Rights for redistribution and usage in source and binary | 5 | # project. The module is, however, dual licensed under OpenSSL and |
| 6 | # forms are granted according to the OpenSSL license. | 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 7 | # ==================================================================== | 8 | # ==================================================================== |
| 8 | # | 9 | # |
| 9 | # Version 1.2. | 10 | # Version 2.1. |
| 10 | # | 11 | # |
| 11 | # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on | 12 | # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on |
| 12 | # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version | 13 | # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version |
| @@ -17,17 +18,29 @@ | |||
| 17 | # | 18 | # |
| 18 | # Performance in number of cycles per processed byte for 128-bit key: | 19 | # Performance in number of cycles per processed byte for 128-bit key: |
| 19 | # | 20 | # |
| 20 | # ECB CBC encrypt | 21 | # ECB encrypt ECB decrypt CBC large chunk |
| 21 | # AMD64 13.7 13.0(*) | 22 | # AMD64 33 41 13.0 |
| 22 | # EM64T 20.2 18.6(*) | 23 | # EM64T 38 59 18.6(*) |
| 24 | # Core 2 30 43 14.5(*) | ||
| 23 | # | 25 | # |
| 24 | # (*) CBC benchmarks are better than ECB thanks to custom ABI used | 26 | # (*) with hyper-threading off |
| 25 | # by the private block encryption function. | 27 | |
| 28 | $flavour = shift; | ||
| 29 | $output = shift; | ||
| 30 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 31 | |||
| 32 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 33 | |||
| 34 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 35 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 36 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 37 | die "can't locate x86_64-xlate.pl"; | ||
| 38 | |||
| 39 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 26 | 40 | ||
| 27 | $verticalspin=1; # unlike 32-bit version $verticalspin performs | 41 | $verticalspin=1; # unlike 32-bit version $verticalspin performs |
| 28 | # ~15% better on both AMD and Intel cores | 42 | # ~15% better on both AMD and Intel cores |
| 29 | $output=shift; | 43 | $speed_limit=512; # see aes-586.pl for details |
| 30 | open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"; | ||
| 31 | 44 | ||
| 32 | $code=".text\n"; | 45 | $code=".text\n"; |
| 33 | 46 | ||
| @@ -35,9 +48,9 @@ $s0="%eax"; | |||
| 35 | $s1="%ebx"; | 48 | $s1="%ebx"; |
| 36 | $s2="%ecx"; | 49 | $s2="%ecx"; |
| 37 | $s3="%edx"; | 50 | $s3="%edx"; |
| 38 | $acc0="%esi"; | 51 | $acc0="%esi"; $mask80="%rsi"; |
| 39 | $acc1="%edi"; | 52 | $acc1="%edi"; $maskfe="%rdi"; |
| 40 | $acc2="%ebp"; | 53 | $acc2="%ebp"; $mask1b="%rbp"; |
| 41 | $inp="%r8"; | 54 | $inp="%r8"; |
| 42 | $out="%r9"; | 55 | $out="%r9"; |
| 43 | $t0="%r10d"; | 56 | $t0="%r10d"; |
| @@ -51,6 +64,8 @@ sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; } | |||
| 51 | sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; | 64 | sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; |
| 52 | $r =~ s/%[er]([sd]i)/%\1l/; | 65 | $r =~ s/%[er]([sd]i)/%\1l/; |
| 53 | $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } | 66 | $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } |
| 67 | sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/; | ||
| 68 | $r =~ s/%r([0-9]+)/%r\1d/; $r; } | ||
| 54 | sub _data_word() | 69 | sub _data_word() |
| 55 | { my $i; | 70 | { my $i; |
| 56 | while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } | 71 | while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } |
| @@ -138,22 +153,17 @@ $code.=<<___; | |||
| 138 | movzb `&lo("$s0")`,$acc0 | 153 | movzb `&lo("$s0")`,$acc0 |
| 139 | movzb `&lo("$s1")`,$acc1 | 154 | movzb `&lo("$s1")`,$acc1 |
| 140 | movzb `&lo("$s2")`,$acc2 | 155 | movzb `&lo("$s2")`,$acc2 |
| 141 | mov 2($sbox,$acc0,8),$t0 | 156 | movzb 2($sbox,$acc0,8),$t0 |
| 142 | mov 2($sbox,$acc1,8),$t1 | 157 | movzb 2($sbox,$acc1,8),$t1 |
| 143 | mov 2($sbox,$acc2,8),$t2 | 158 | movzb 2($sbox,$acc2,8),$t2 |
| 144 | |||
| 145 | and \$0x000000ff,$t0 | ||
| 146 | and \$0x000000ff,$t1 | ||
| 147 | and \$0x000000ff,$t2 | ||
| 148 | 159 | ||
| 149 | movzb `&lo("$s3")`,$acc0 | 160 | movzb `&lo("$s3")`,$acc0 |
| 150 | movzb `&hi("$s1")`,$acc1 | 161 | movzb `&hi("$s1")`,$acc1 |
| 151 | movzb `&hi("$s2")`,$acc2 | 162 | movzb `&hi("$s2")`,$acc2 |
| 152 | mov 2($sbox,$acc0,8),$t3 | 163 | movzb 2($sbox,$acc0,8),$t3 |
| 153 | mov 0($sbox,$acc1,8),$acc1 #$t0 | 164 | mov 0($sbox,$acc1,8),$acc1 #$t0 |
| 154 | mov 0($sbox,$acc2,8),$acc2 #$t1 | 165 | mov 0($sbox,$acc2,8),$acc2 #$t1 |
| 155 | 166 | ||
| 156 | and \$0x000000ff,$t3 | ||
| 157 | and \$0x0000ff00,$acc1 | 167 | and \$0x0000ff00,$acc1 |
| 158 | and \$0x0000ff00,$acc2 | 168 | and \$0x0000ff00,$acc2 |
| 159 | 169 | ||
| @@ -345,6 +355,234 @@ $code.=<<___; | |||
| 345 | .size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt | 355 | .size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt |
| 346 | ___ | 356 | ___ |
| 347 | 357 | ||
| 358 | # it's possible to implement this by shifting tN by 8, filling least | ||
| 359 | # significant byte with byte load and finally bswap-ing at the end, | ||
| 360 | # but such partial register load kills Core 2... | ||
| 361 | sub enccompactvert() | ||
| 362 | { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); | ||
| 363 | |||
| 364 | $code.=<<___; | ||
| 365 | movzb `&lo("$s0")`,$t0 | ||
| 366 | movzb `&lo("$s1")`,$t1 | ||
| 367 | movzb `&lo("$s2")`,$t2 | ||
| 368 | movzb ($sbox,$t0,1),$t0 | ||
| 369 | movzb ($sbox,$t1,1),$t1 | ||
| 370 | movzb ($sbox,$t2,1),$t2 | ||
| 371 | |||
| 372 | movzb `&lo("$s3")`,$t3 | ||
| 373 | movzb `&hi("$s1")`,$acc0 | ||
| 374 | movzb `&hi("$s2")`,$acc1 | ||
| 375 | movzb ($sbox,$t3,1),$t3 | ||
| 376 | movzb ($sbox,$acc0,1),$t4 #$t0 | ||
| 377 | movzb ($sbox,$acc1,1),$t5 #$t1 | ||
| 378 | |||
| 379 | movzb `&hi("$s3")`,$acc2 | ||
| 380 | movzb `&hi("$s0")`,$acc0 | ||
| 381 | shr \$16,$s2 | ||
| 382 | movzb ($sbox,$acc2,1),$acc2 #$t2 | ||
| 383 | movzb ($sbox,$acc0,1),$acc0 #$t3 | ||
| 384 | shr \$16,$s3 | ||
| 385 | |||
| 386 | movzb `&lo("$s2")`,$acc1 | ||
| 387 | shl \$8,$t4 | ||
| 388 | shl \$8,$t5 | ||
| 389 | movzb ($sbox,$acc1,1),$acc1 #$t0 | ||
| 390 | xor $t4,$t0 | ||
| 391 | xor $t5,$t1 | ||
| 392 | |||
| 393 | movzb `&lo("$s3")`,$t4 | ||
| 394 | shr \$16,$s0 | ||
| 395 | shr \$16,$s1 | ||
| 396 | movzb `&lo("$s0")`,$t5 | ||
| 397 | shl \$8,$acc2 | ||
| 398 | shl \$8,$acc0 | ||
| 399 | movzb ($sbox,$t4,1),$t4 #$t1 | ||
| 400 | movzb ($sbox,$t5,1),$t5 #$t2 | ||
| 401 | xor $acc2,$t2 | ||
| 402 | xor $acc0,$t3 | ||
| 403 | |||
| 404 | movzb `&lo("$s1")`,$acc2 | ||
| 405 | movzb `&hi("$s3")`,$acc0 | ||
| 406 | shl \$16,$acc1 | ||
| 407 | movzb ($sbox,$acc2,1),$acc2 #$t3 | ||
| 408 | movzb ($sbox,$acc0,1),$acc0 #$t0 | ||
| 409 | xor $acc1,$t0 | ||
| 410 | |||
| 411 | movzb `&hi("$s0")`,$acc1 | ||
| 412 | shr \$8,$s2 | ||
| 413 | shr \$8,$s1 | ||
| 414 | movzb ($sbox,$acc1,1),$acc1 #$t1 | ||
| 415 | movzb ($sbox,$s2,1),$s3 #$t3 | ||
| 416 | movzb ($sbox,$s1,1),$s2 #$t2 | ||
| 417 | shl \$16,$t4 | ||
| 418 | shl \$16,$t5 | ||
| 419 | shl \$16,$acc2 | ||
| 420 | xor $t4,$t1 | ||
| 421 | xor $t5,$t2 | ||
| 422 | xor $acc2,$t3 | ||
| 423 | |||
| 424 | shl \$24,$acc0 | ||
| 425 | shl \$24,$acc1 | ||
| 426 | shl \$24,$s3 | ||
| 427 | xor $acc0,$t0 | ||
| 428 | shl \$24,$s2 | ||
| 429 | xor $acc1,$t1 | ||
| 430 | mov $t0,$s0 | ||
| 431 | mov $t1,$s1 | ||
| 432 | xor $t2,$s2 | ||
| 433 | xor $t3,$s3 | ||
| 434 | ___ | ||
| 435 | } | ||
| 436 | |||
| 437 | sub enctransform_ref() | ||
| 438 | { my $sn = shift; | ||
| 439 | my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d"); | ||
| 440 | |||
| 441 | $code.=<<___; | ||
| 442 | mov $sn,$acc | ||
| 443 | and \$0x80808080,$acc | ||
| 444 | mov $acc,$tmp | ||
| 445 | shr \$7,$tmp | ||
| 446 | lea ($sn,$sn),$r2 | ||
| 447 | sub $tmp,$acc | ||
| 448 | and \$0xfefefefe,$r2 | ||
| 449 | and \$0x1b1b1b1b,$acc | ||
| 450 | mov $sn,$tmp | ||
| 451 | xor $acc,$r2 | ||
| 452 | |||
| 453 | xor $r2,$sn | ||
| 454 | rol \$24,$sn | ||
| 455 | xor $r2,$sn | ||
| 456 | ror \$16,$tmp | ||
| 457 | xor $tmp,$sn | ||
| 458 | ror \$8,$tmp | ||
| 459 | xor $tmp,$sn | ||
| 460 | ___ | ||
| 461 | } | ||
| 462 | |||
| 463 | # unlike decrypt case it does not pay off to parallelize enctransform | ||
| 464 | sub enctransform() | ||
| 465 | { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d"); | ||
| 466 | |||
| 467 | $code.=<<___; | ||
| 468 | mov $s0,$acc0 | ||
| 469 | mov $s1,$acc1 | ||
| 470 | and \$0x80808080,$acc0 | ||
| 471 | and \$0x80808080,$acc1 | ||
| 472 | mov $acc0,$t0 | ||
| 473 | mov $acc1,$t1 | ||
| 474 | shr \$7,$t0 | ||
| 475 | lea ($s0,$s0),$r20 | ||
| 476 | shr \$7,$t1 | ||
| 477 | lea ($s1,$s1),$r21 | ||
| 478 | sub $t0,$acc0 | ||
| 479 | sub $t1,$acc1 | ||
| 480 | and \$0xfefefefe,$r20 | ||
| 481 | and \$0xfefefefe,$r21 | ||
| 482 | and \$0x1b1b1b1b,$acc0 | ||
| 483 | and \$0x1b1b1b1b,$acc1 | ||
| 484 | mov $s0,$t0 | ||
| 485 | mov $s1,$t1 | ||
| 486 | xor $acc0,$r20 | ||
| 487 | xor $acc1,$r21 | ||
| 488 | |||
| 489 | xor $r20,$s0 | ||
| 490 | xor $r21,$s1 | ||
| 491 | mov $s2,$acc0 | ||
| 492 | mov $s3,$acc1 | ||
| 493 | rol \$24,$s0 | ||
| 494 | rol \$24,$s1 | ||
| 495 | and \$0x80808080,$acc0 | ||
| 496 | and \$0x80808080,$acc1 | ||
| 497 | xor $r20,$s0 | ||
| 498 | xor $r21,$s1 | ||
| 499 | mov $acc0,$t2 | ||
| 500 | mov $acc1,$t3 | ||
| 501 | ror \$16,$t0 | ||
| 502 | ror \$16,$t1 | ||
| 503 | shr \$7,$t2 | ||
| 504 | lea ($s2,$s2),$r20 | ||
| 505 | xor $t0,$s0 | ||
| 506 | xor $t1,$s1 | ||
| 507 | shr \$7,$t3 | ||
| 508 | lea ($s3,$s3),$r21 | ||
| 509 | ror \$8,$t0 | ||
| 510 | ror \$8,$t1 | ||
| 511 | sub $t2,$acc0 | ||
| 512 | sub $t3,$acc1 | ||
| 513 | xor $t0,$s0 | ||
| 514 | xor $t1,$s1 | ||
| 515 | |||
| 516 | and \$0xfefefefe,$r20 | ||
| 517 | and \$0xfefefefe,$r21 | ||
| 518 | and \$0x1b1b1b1b,$acc0 | ||
| 519 | and \$0x1b1b1b1b,$acc1 | ||
| 520 | mov $s2,$t2 | ||
| 521 | mov $s3,$t3 | ||
| 522 | xor $acc0,$r20 | ||
| 523 | xor $acc1,$r21 | ||
| 524 | |||
| 525 | xor $r20,$s2 | ||
| 526 | xor $r21,$s3 | ||
| 527 | rol \$24,$s2 | ||
| 528 | rol \$24,$s3 | ||
| 529 | xor $r20,$s2 | ||
| 530 | xor $r21,$s3 | ||
| 531 | mov 0($sbox),$acc0 # prefetch Te4 | ||
| 532 | ror \$16,$t2 | ||
| 533 | ror \$16,$t3 | ||
| 534 | mov 64($sbox),$acc1 | ||
| 535 | xor $t2,$s2 | ||
| 536 | xor $t3,$s3 | ||
| 537 | mov 128($sbox),$r20 | ||
| 538 | ror \$8,$t2 | ||
| 539 | ror \$8,$t3 | ||
| 540 | mov 192($sbox),$r21 | ||
| 541 | xor $t2,$s2 | ||
| 542 | xor $t3,$s3 | ||
| 543 | ___ | ||
| 544 | } | ||
| 545 | |||
| 546 | $code.=<<___; | ||
| 547 | .type _x86_64_AES_encrypt_compact,\@abi-omnipotent | ||
| 548 | .align 16 | ||
| 549 | _x86_64_AES_encrypt_compact: | ||
| 550 | lea 128($sbox),$inp # size optimization | ||
| 551 | mov 0-128($inp),$acc1 # prefetch Te4 | ||
| 552 | mov 32-128($inp),$acc2 | ||
| 553 | mov 64-128($inp),$t0 | ||
| 554 | mov 96-128($inp),$t1 | ||
| 555 | mov 128-128($inp),$acc1 | ||
| 556 | mov 160-128($inp),$acc2 | ||
| 557 | mov 192-128($inp),$t0 | ||
| 558 | mov 224-128($inp),$t1 | ||
| 559 | jmp .Lenc_loop_compact | ||
| 560 | .align 16 | ||
| 561 | .Lenc_loop_compact: | ||
| 562 | xor 0($key),$s0 # xor with key | ||
| 563 | xor 4($key),$s1 | ||
| 564 | xor 8($key),$s2 | ||
| 565 | xor 12($key),$s3 | ||
| 566 | lea 16($key),$key | ||
| 567 | ___ | ||
| 568 | &enccompactvert(); | ||
| 569 | $code.=<<___; | ||
| 570 | cmp 16(%rsp),$key | ||
| 571 | je .Lenc_compact_done | ||
| 572 | ___ | ||
| 573 | &enctransform(); | ||
| 574 | $code.=<<___; | ||
| 575 | jmp .Lenc_loop_compact | ||
| 576 | .align 16 | ||
| 577 | .Lenc_compact_done: | ||
| 578 | xor 0($key),$s0 | ||
| 579 | xor 4($key),$s1 | ||
| 580 | xor 8($key),$s2 | ||
| 581 | xor 12($key),$s3 | ||
| 582 | .byte 0xf3,0xc3 # rep ret | ||
| 583 | .size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact | ||
| 584 | ___ | ||
| 585 | |||
| 348 | # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); | 586 | # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); |
| 349 | $code.=<<___; | 587 | $code.=<<___; |
| 350 | .globl AES_encrypt | 588 | .globl AES_encrypt |
| @@ -358,31 +596,57 @@ AES_encrypt: | |||
| 358 | push %r14 | 596 | push %r14 |
| 359 | push %r15 | 597 | push %r15 |
| 360 | 598 | ||
| 361 | mov %rdx,$key | 599 | # allocate frame "above" key schedule |
| 362 | mov %rdi,$inp | 600 | mov %rsp,%r10 |
| 363 | mov %rsi,$out | 601 | lea -63(%rdx),%rcx # %rdx is key argument |
| 364 | 602 | and \$-64,%rsp | |
| 365 | .picmeup $sbox | 603 | sub %rsp,%rcx |
| 366 | lea AES_Te-.($sbox),$sbox | 604 | neg %rcx |
| 367 | 605 | and \$0x3c0,%rcx | |
| 368 | mov 0($inp),$s0 | 606 | sub %rcx,%rsp |
| 369 | mov 4($inp),$s1 | 607 | sub \$32,%rsp |
| 370 | mov 8($inp),$s2 | ||
| 371 | mov 12($inp),$s3 | ||
| 372 | 608 | ||
| 373 | call _x86_64_AES_encrypt | 609 | mov %rsi,16(%rsp) # save out |
| 610 | mov %r10,24(%rsp) # save real stack pointer | ||
| 611 | .Lenc_prologue: | ||
| 374 | 612 | ||
| 375 | mov $s0,0($out) | 613 | mov %rdx,$key |
| 614 | mov 240($key),$rnds # load rounds | ||
| 615 | |||
| 616 | mov 0(%rdi),$s0 # load input vector | ||
| 617 | mov 4(%rdi),$s1 | ||
| 618 | mov 8(%rdi),$s2 | ||
| 619 | mov 12(%rdi),$s3 | ||
| 620 | |||
| 621 | shl \$4,$rnds | ||
| 622 | lea ($key,$rnds),%rbp | ||
| 623 | mov $key,(%rsp) # key schedule | ||
| 624 | mov %rbp,8(%rsp) # end of key schedule | ||
| 625 | |||
| 626 | # pick Te4 copy which can't "overlap" with stack frame or key schedule | ||
| 627 | lea .LAES_Te+2048(%rip),$sbox | ||
| 628 | lea 768(%rsp),%rbp | ||
| 629 | sub $sbox,%rbp | ||
| 630 | and \$0x300,%rbp | ||
| 631 | lea ($sbox,%rbp),$sbox | ||
| 632 | |||
| 633 | call _x86_64_AES_encrypt_compact | ||
| 634 | |||
| 635 | mov 16(%rsp),$out # restore out | ||
| 636 | mov 24(%rsp),%rsi # restore saved stack pointer | ||
| 637 | mov $s0,0($out) # write output vector | ||
| 376 | mov $s1,4($out) | 638 | mov $s1,4($out) |
| 377 | mov $s2,8($out) | 639 | mov $s2,8($out) |
| 378 | mov $s3,12($out) | 640 | mov $s3,12($out) |
| 379 | 641 | ||
| 380 | pop %r15 | 642 | mov (%rsi),%r15 |
| 381 | pop %r14 | 643 | mov 8(%rsi),%r14 |
| 382 | pop %r13 | 644 | mov 16(%rsi),%r13 |
| 383 | pop %r12 | 645 | mov 24(%rsi),%r12 |
| 384 | pop %rbp | 646 | mov 32(%rsi),%rbp |
| 385 | pop %rbx | 647 | mov 40(%rsi),%rbx |
| 648 | lea 48(%rsi),%rsp | ||
| 649 | .Lenc_epilogue: | ||
| 386 | ret | 650 | ret |
| 387 | .size AES_encrypt,.-AES_encrypt | 651 | .size AES_encrypt,.-AES_encrypt |
| 388 | ___ | 652 | ___ |
| @@ -453,19 +717,20 @@ sub declastvert() | |||
| 453 | { my $t3="%r8d"; # zaps $inp! | 717 | { my $t3="%r8d"; # zaps $inp! |
| 454 | 718 | ||
| 455 | $code.=<<___; | 719 | $code.=<<___; |
| 720 | lea 2048($sbox),$sbox # size optimization | ||
| 456 | movzb `&lo("$s0")`,$acc0 | 721 | movzb `&lo("$s0")`,$acc0 |
| 457 | movzb `&lo("$s1")`,$acc1 | 722 | movzb `&lo("$s1")`,$acc1 |
| 458 | movzb `&lo("$s2")`,$acc2 | 723 | movzb `&lo("$s2")`,$acc2 |
| 459 | movzb 2048($sbox,$acc0,1),$t0 | 724 | movzb ($sbox,$acc0,1),$t0 |
| 460 | movzb 2048($sbox,$acc1,1),$t1 | 725 | movzb ($sbox,$acc1,1),$t1 |
| 461 | movzb 2048($sbox,$acc2,1),$t2 | 726 | movzb ($sbox,$acc2,1),$t2 |
| 462 | 727 | ||
| 463 | movzb `&lo("$s3")`,$acc0 | 728 | movzb `&lo("$s3")`,$acc0 |
| 464 | movzb `&hi("$s3")`,$acc1 | 729 | movzb `&hi("$s3")`,$acc1 |
| 465 | movzb `&hi("$s0")`,$acc2 | 730 | movzb `&hi("$s0")`,$acc2 |
| 466 | movzb 2048($sbox,$acc0,1),$t3 | 731 | movzb ($sbox,$acc0,1),$t3 |
| 467 | movzb 2048($sbox,$acc1,1),$acc1 #$t0 | 732 | movzb ($sbox,$acc1,1),$acc1 #$t0 |
| 468 | movzb 2048($sbox,$acc2,1),$acc2 #$t1 | 733 | movzb ($sbox,$acc2,1),$acc2 #$t1 |
| 469 | 734 | ||
| 470 | shl \$8,$acc1 | 735 | shl \$8,$acc1 |
| 471 | shl \$8,$acc2 | 736 | shl \$8,$acc2 |
| @@ -477,8 +742,8 @@ $code.=<<___; | |||
| 477 | movzb `&hi("$s1")`,$acc0 | 742 | movzb `&hi("$s1")`,$acc0 |
| 478 | movzb `&hi("$s2")`,$acc1 | 743 | movzb `&hi("$s2")`,$acc1 |
| 479 | shr \$16,$s0 | 744 | shr \$16,$s0 |
| 480 | movzb 2048($sbox,$acc0,1),$acc0 #$t2 | 745 | movzb ($sbox,$acc0,1),$acc0 #$t2 |
| 481 | movzb 2048($sbox,$acc1,1),$acc1 #$t3 | 746 | movzb ($sbox,$acc1,1),$acc1 #$t3 |
| 482 | 747 | ||
| 483 | shl \$8,$acc0 | 748 | shl \$8,$acc0 |
| 484 | shl \$8,$acc1 | 749 | shl \$8,$acc1 |
| @@ -490,9 +755,9 @@ $code.=<<___; | |||
| 490 | movzb `&lo("$s2")`,$acc0 | 755 | movzb `&lo("$s2")`,$acc0 |
| 491 | movzb `&lo("$s3")`,$acc1 | 756 | movzb `&lo("$s3")`,$acc1 |
| 492 | movzb `&lo("$s0")`,$acc2 | 757 | movzb `&lo("$s0")`,$acc2 |
| 493 | movzb 2048($sbox,$acc0,1),$acc0 #$t0 | 758 | movzb ($sbox,$acc0,1),$acc0 #$t0 |
| 494 | movzb 2048($sbox,$acc1,1),$acc1 #$t1 | 759 | movzb ($sbox,$acc1,1),$acc1 #$t1 |
| 495 | movzb 2048($sbox,$acc2,1),$acc2 #$t2 | 760 | movzb ($sbox,$acc2,1),$acc2 #$t2 |
| 496 | 761 | ||
| 497 | shl \$16,$acc0 | 762 | shl \$16,$acc0 |
| 498 | shl \$16,$acc1 | 763 | shl \$16,$acc1 |
| @@ -505,9 +770,9 @@ $code.=<<___; | |||
| 505 | movzb `&lo("$s1")`,$acc0 | 770 | movzb `&lo("$s1")`,$acc0 |
| 506 | movzb `&hi("$s1")`,$acc1 | 771 | movzb `&hi("$s1")`,$acc1 |
| 507 | movzb `&hi("$s2")`,$acc2 | 772 | movzb `&hi("$s2")`,$acc2 |
| 508 | movzb 2048($sbox,$acc0,1),$acc0 #$t3 | 773 | movzb ($sbox,$acc0,1),$acc0 #$t3 |
| 509 | movzb 2048($sbox,$acc1,1),$acc1 #$t0 | 774 | movzb ($sbox,$acc1,1),$acc1 #$t0 |
| 510 | movzb 2048($sbox,$acc2,1),$acc2 #$t1 | 775 | movzb ($sbox,$acc2,1),$acc2 #$t1 |
| 511 | 776 | ||
| 512 | shl \$16,$acc0 | 777 | shl \$16,$acc0 |
| 513 | shl \$24,$acc1 | 778 | shl \$24,$acc1 |
| @@ -520,8 +785,8 @@ $code.=<<___; | |||
| 520 | movzb `&hi("$s3")`,$acc0 | 785 | movzb `&hi("$s3")`,$acc0 |
| 521 | movzb `&hi("$s0")`,$acc1 | 786 | movzb `&hi("$s0")`,$acc1 |
| 522 | mov 16+12($key),$s3 | 787 | mov 16+12($key),$s3 |
| 523 | movzb 2048($sbox,$acc0,1),$acc0 #$t2 | 788 | movzb ($sbox,$acc0,1),$acc0 #$t2 |
| 524 | movzb 2048($sbox,$acc1,1),$acc1 #$t3 | 789 | movzb ($sbox,$acc1,1),$acc1 #$t3 |
| 525 | mov 16+0($key),$s0 | 790 | mov 16+0($key),$s0 |
| 526 | 791 | ||
| 527 | shl \$24,$acc0 | 792 | shl \$24,$acc0 |
| @@ -532,6 +797,7 @@ $code.=<<___; | |||
| 532 | 797 | ||
| 533 | mov 16+4($key),$s1 | 798 | mov 16+4($key),$s1 |
| 534 | mov 16+8($key),$s2 | 799 | mov 16+8($key),$s2 |
| 800 | lea -2048($sbox),$sbox | ||
| 535 | xor $t0,$s0 | 801 | xor $t0,$s0 |
| 536 | xor $t1,$s1 | 802 | xor $t1,$s1 |
| 537 | xor $t2,$s2 | 803 | xor $t2,$s2 |
| @@ -659,6 +925,260 @@ $code.=<<___; | |||
| 659 | .size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt | 925 | .size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt |
| 660 | ___ | 926 | ___ |
| 661 | 927 | ||
| 928 | sub deccompactvert() | ||
| 929 | { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); | ||
| 930 | |||
| 931 | $code.=<<___; | ||
| 932 | movzb `&lo("$s0")`,$t0 | ||
| 933 | movzb `&lo("$s1")`,$t1 | ||
| 934 | movzb `&lo("$s2")`,$t2 | ||
| 935 | movzb ($sbox,$t0,1),$t0 | ||
| 936 | movzb ($sbox,$t1,1),$t1 | ||
| 937 | movzb ($sbox,$t2,1),$t2 | ||
| 938 | |||
| 939 | movzb `&lo("$s3")`,$t3 | ||
| 940 | movzb `&hi("$s3")`,$acc0 | ||
| 941 | movzb `&hi("$s0")`,$acc1 | ||
| 942 | movzb ($sbox,$t3,1),$t3 | ||
| 943 | movzb ($sbox,$acc0,1),$t4 #$t0 | ||
| 944 | movzb ($sbox,$acc1,1),$t5 #$t1 | ||
| 945 | |||
| 946 | movzb `&hi("$s1")`,$acc2 | ||
| 947 | movzb `&hi("$s2")`,$acc0 | ||
| 948 | shr \$16,$s2 | ||
| 949 | movzb ($sbox,$acc2,1),$acc2 #$t2 | ||
| 950 | movzb ($sbox,$acc0,1),$acc0 #$t3 | ||
| 951 | shr \$16,$s3 | ||
| 952 | |||
| 953 | movzb `&lo("$s2")`,$acc1 | ||
| 954 | shl \$8,$t4 | ||
| 955 | shl \$8,$t5 | ||
| 956 | movzb ($sbox,$acc1,1),$acc1 #$t0 | ||
| 957 | xor $t4,$t0 | ||
| 958 | xor $t5,$t1 | ||
| 959 | |||
| 960 | movzb `&lo("$s3")`,$t4 | ||
| 961 | shr \$16,$s0 | ||
| 962 | shr \$16,$s1 | ||
| 963 | movzb `&lo("$s0")`,$t5 | ||
| 964 | shl \$8,$acc2 | ||
| 965 | shl \$8,$acc0 | ||
| 966 | movzb ($sbox,$t4,1),$t4 #$t1 | ||
| 967 | movzb ($sbox,$t5,1),$t5 #$t2 | ||
| 968 | xor $acc2,$t2 | ||
| 969 | xor $acc0,$t3 | ||
| 970 | |||
| 971 | movzb `&lo("$s1")`,$acc2 | ||
| 972 | movzb `&hi("$s1")`,$acc0 | ||
| 973 | shl \$16,$acc1 | ||
| 974 | movzb ($sbox,$acc2,1),$acc2 #$t3 | ||
| 975 | movzb ($sbox,$acc0,1),$acc0 #$t0 | ||
| 976 | xor $acc1,$t0 | ||
| 977 | |||
| 978 | movzb `&hi("$s2")`,$acc1 | ||
| 979 | shl \$16,$t4 | ||
| 980 | shl \$16,$t5 | ||
| 981 | movzb ($sbox,$acc1,1),$s1 #$t1 | ||
| 982 | xor $t4,$t1 | ||
| 983 | xor $t5,$t2 | ||
| 984 | |||
| 985 | movzb `&hi("$s3")`,$acc1 | ||
| 986 | shr \$8,$s0 | ||
| 987 | shl \$16,$acc2 | ||
| 988 | movzb ($sbox,$acc1,1),$s2 #$t2 | ||
| 989 | movzb ($sbox,$s0,1),$s3 #$t3 | ||
| 990 | xor $acc2,$t3 | ||
| 991 | |||
| 992 | shl \$24,$acc0 | ||
| 993 | shl \$24,$s1 | ||
| 994 | shl \$24,$s2 | ||
| 995 | xor $acc0,$t0 | ||
| 996 | shl \$24,$s3 | ||
| 997 | xor $t1,$s1 | ||
| 998 | mov $t0,$s0 | ||
| 999 | xor $t2,$s2 | ||
| 1000 | xor $t3,$s3 | ||
| 1001 | ___ | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | # parallelized version! input is pair of 64-bit values: %rax=s1.s0 | ||
| 1005 | # and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1, | ||
| 1006 | # %ecx=s2 and %edx=s3. | ||
| 1007 | sub dectransform() | ||
| 1008 | { my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx"); | ||
| 1009 | my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx"); | ||
| 1010 | my $prefetch = shift; | ||
| 1011 | |||
| 1012 | $code.=<<___; | ||
| 1013 | mov $tp10,$acc0 | ||
| 1014 | mov $tp18,$acc8 | ||
| 1015 | and $mask80,$acc0 | ||
| 1016 | and $mask80,$acc8 | ||
| 1017 | mov $acc0,$tp40 | ||
| 1018 | mov $acc8,$tp48 | ||
| 1019 | shr \$7,$tp40 | ||
| 1020 | lea ($tp10,$tp10),$tp20 | ||
| 1021 | shr \$7,$tp48 | ||
| 1022 | lea ($tp18,$tp18),$tp28 | ||
| 1023 | sub $tp40,$acc0 | ||
| 1024 | sub $tp48,$acc8 | ||
| 1025 | and $maskfe,$tp20 | ||
| 1026 | and $maskfe,$tp28 | ||
| 1027 | and $mask1b,$acc0 | ||
| 1028 | and $mask1b,$acc8 | ||
| 1029 | xor $tp20,$acc0 | ||
| 1030 | xor $tp28,$acc8 | ||
| 1031 | mov $acc0,$tp20 | ||
| 1032 | mov $acc8,$tp28 | ||
| 1033 | |||
| 1034 | and $mask80,$acc0 | ||
| 1035 | and $mask80,$acc8 | ||
| 1036 | mov $acc0,$tp80 | ||
| 1037 | mov $acc8,$tp88 | ||
| 1038 | shr \$7,$tp80 | ||
| 1039 | lea ($tp20,$tp20),$tp40 | ||
| 1040 | shr \$7,$tp88 | ||
| 1041 | lea ($tp28,$tp28),$tp48 | ||
| 1042 | sub $tp80,$acc0 | ||
| 1043 | sub $tp88,$acc8 | ||
| 1044 | and $maskfe,$tp40 | ||
| 1045 | and $maskfe,$tp48 | ||
| 1046 | and $mask1b,$acc0 | ||
| 1047 | and $mask1b,$acc8 | ||
| 1048 | xor $tp40,$acc0 | ||
| 1049 | xor $tp48,$acc8 | ||
| 1050 | mov $acc0,$tp40 | ||
| 1051 | mov $acc8,$tp48 | ||
| 1052 | |||
| 1053 | and $mask80,$acc0 | ||
| 1054 | and $mask80,$acc8 | ||
| 1055 | mov $acc0,$tp80 | ||
| 1056 | mov $acc8,$tp88 | ||
| 1057 | shr \$7,$tp80 | ||
| 1058 | xor $tp10,$tp20 # tp2^=tp1 | ||
| 1059 | shr \$7,$tp88 | ||
| 1060 | xor $tp18,$tp28 # tp2^=tp1 | ||
| 1061 | sub $tp80,$acc0 | ||
| 1062 | sub $tp88,$acc8 | ||
| 1063 | lea ($tp40,$tp40),$tp80 | ||
| 1064 | lea ($tp48,$tp48),$tp88 | ||
| 1065 | xor $tp10,$tp40 # tp4^=tp1 | ||
| 1066 | xor $tp18,$tp48 # tp4^=tp1 | ||
| 1067 | and $maskfe,$tp80 | ||
| 1068 | and $maskfe,$tp88 | ||
| 1069 | and $mask1b,$acc0 | ||
| 1070 | and $mask1b,$acc8 | ||
| 1071 | xor $acc0,$tp80 | ||
| 1072 | xor $acc8,$tp88 | ||
| 1073 | |||
| 1074 | xor $tp80,$tp10 # tp1^=tp8 | ||
| 1075 | xor $tp88,$tp18 # tp1^=tp8 | ||
| 1076 | xor $tp80,$tp20 # tp2^tp1^=tp8 | ||
| 1077 | xor $tp88,$tp28 # tp2^tp1^=tp8 | ||
| 1078 | mov $tp10,$acc0 | ||
| 1079 | mov $tp18,$acc8 | ||
| 1080 | xor $tp80,$tp40 # tp4^tp1^=tp8 | ||
| 1081 | xor $tp88,$tp48 # tp4^tp1^=tp8 | ||
| 1082 | shr \$32,$acc0 | ||
| 1083 | shr \$32,$acc8 | ||
| 1084 | xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1 | ||
| 1085 | xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 | ||
| 1086 | rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8) | ||
| 1087 | rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8) | ||
| 1088 | xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 | ||
| 1089 | xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 | ||
| 1090 | |||
| 1091 | rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) | ||
| 1092 | rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8) | ||
| 1093 | xor `&LO("$tp80")`,`&LO("$tp10")` | ||
| 1094 | xor `&LO("$tp88")`,`&LO("$tp18")` | ||
| 1095 | shr \$32,$tp80 | ||
| 1096 | shr \$32,$tp88 | ||
| 1097 | xor `&LO("$tp80")`,`&LO("$acc0")` | ||
| 1098 | xor `&LO("$tp88")`,`&LO("$acc8")` | ||
| 1099 | |||
| 1100 | mov $tp20,$tp80 | ||
| 1101 | mov $tp28,$tp88 | ||
| 1102 | shr \$32,$tp80 | ||
| 1103 | shr \$32,$tp88 | ||
| 1104 | rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24) | ||
| 1105 | rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24) | ||
| 1106 | rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) | ||
| 1107 | rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) | ||
| 1108 | xor `&LO("$tp20")`,`&LO("$tp10")` | ||
| 1109 | xor `&LO("$tp28")`,`&LO("$tp18")` | ||
| 1110 | mov $tp40,$tp20 | ||
| 1111 | mov $tp48,$tp28 | ||
| 1112 | xor `&LO("$tp80")`,`&LO("$acc0")` | ||
| 1113 | xor `&LO("$tp88")`,`&LO("$acc8")` | ||
| 1114 | |||
| 1115 | `"mov 0($sbox),$mask80" if ($prefetch)` | ||
| 1116 | shr \$32,$tp20 | ||
| 1117 | shr \$32,$tp28 | ||
| 1118 | `"mov 64($sbox),$maskfe" if ($prefetch)` | ||
| 1119 | rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16) | ||
| 1120 | rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16) | ||
| 1121 | `"mov 128($sbox),$mask1b" if ($prefetch)` | ||
| 1122 | rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16) | ||
| 1123 | rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) | ||
| 1124 | `"mov 192($sbox),$tp80" if ($prefetch)` | ||
| 1125 | xor `&LO("$tp40")`,`&LO("$tp10")` | ||
| 1126 | xor `&LO("$tp48")`,`&LO("$tp18")` | ||
| 1127 | `"mov 256($sbox),$tp88" if ($prefetch)` | ||
| 1128 | xor `&LO("$tp20")`,`&LO("$acc0")` | ||
| 1129 | xor `&LO("$tp28")`,`&LO("$acc8")` | ||
| 1130 | ___ | ||
| 1131 | } | ||
| 1132 | |||
| 1133 | $code.=<<___; | ||
| 1134 | .type _x86_64_AES_decrypt_compact,\@abi-omnipotent | ||
| 1135 | .align 16 | ||
| 1136 | _x86_64_AES_decrypt_compact: | ||
| 1137 | lea 128($sbox),$inp # size optimization | ||
| 1138 | mov 0-128($inp),$acc1 # prefetch Td4 | ||
| 1139 | mov 32-128($inp),$acc2 | ||
| 1140 | mov 64-128($inp),$t0 | ||
| 1141 | mov 96-128($inp),$t1 | ||
| 1142 | mov 128-128($inp),$acc1 | ||
| 1143 | mov 160-128($inp),$acc2 | ||
| 1144 | mov 192-128($inp),$t0 | ||
| 1145 | mov 224-128($inp),$t1 | ||
| 1146 | jmp .Ldec_loop_compact | ||
| 1147 | |||
| 1148 | .align 16 | ||
| 1149 | .Ldec_loop_compact: | ||
| 1150 | xor 0($key),$s0 # xor with key | ||
| 1151 | xor 4($key),$s1 | ||
| 1152 | xor 8($key),$s2 | ||
| 1153 | xor 12($key),$s3 | ||
| 1154 | lea 16($key),$key | ||
| 1155 | ___ | ||
| 1156 | &deccompactvert(); | ||
| 1157 | $code.=<<___; | ||
| 1158 | cmp 16(%rsp),$key | ||
| 1159 | je .Ldec_compact_done | ||
| 1160 | |||
| 1161 | mov 256+0($sbox),$mask80 | ||
| 1162 | shl \$32,%rbx | ||
| 1163 | shl \$32,%rdx | ||
| 1164 | mov 256+8($sbox),$maskfe | ||
| 1165 | or %rbx,%rax | ||
| 1166 | or %rdx,%rcx | ||
| 1167 | mov 256+16($sbox),$mask1b | ||
| 1168 | ___ | ||
| 1169 | &dectransform(1); | ||
| 1170 | $code.=<<___; | ||
| 1171 | jmp .Ldec_loop_compact | ||
| 1172 | .align 16 | ||
| 1173 | .Ldec_compact_done: | ||
| 1174 | xor 0($key),$s0 | ||
| 1175 | xor 4($key),$s1 | ||
| 1176 | xor 8($key),$s2 | ||
| 1177 | xor 12($key),$s3 | ||
| 1178 | .byte 0xf3,0xc3 # rep ret | ||
| 1179 | .size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact | ||
| 1180 | ___ | ||
| 1181 | |||
| 662 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); | 1182 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); |
| 663 | $code.=<<___; | 1183 | $code.=<<___; |
| 664 | .globl AES_decrypt | 1184 | .globl AES_decrypt |
| @@ -672,43 +1192,59 @@ AES_decrypt: | |||
| 672 | push %r14 | 1192 | push %r14 |
| 673 | push %r15 | 1193 | push %r15 |
| 674 | 1194 | ||
| 675 | mov %rdx,$key | 1195 | # allocate frame "above" key schedule |
| 676 | mov %rdi,$inp | 1196 | mov %rsp,%r10 |
| 677 | mov %rsi,$out | 1197 | lea -63(%rdx),%rcx # %rdx is key argument |
| 1198 | and \$-64,%rsp | ||
| 1199 | sub %rsp,%rcx | ||
| 1200 | neg %rcx | ||
| 1201 | and \$0x3c0,%rcx | ||
| 1202 | sub %rcx,%rsp | ||
| 1203 | sub \$32,%rsp | ||
| 1204 | |||
| 1205 | mov %rsi,16(%rsp) # save out | ||
| 1206 | mov %r10,24(%rsp) # save real stack pointer | ||
| 1207 | .Ldec_prologue: | ||
| 678 | 1208 | ||
| 679 | .picmeup $sbox | 1209 | mov %rdx,$key |
| 680 | lea AES_Td-.($sbox),$sbox | 1210 | mov 240($key),$rnds # load rounds |
| 681 | 1211 | ||
| 682 | # prefetch Td4 | 1212 | mov 0(%rdi),$s0 # load input vector |
| 683 | lea 2048+128($sbox),$sbox; | 1213 | mov 4(%rdi),$s1 |
| 684 | mov 0-128($sbox),$s0 | 1214 | mov 8(%rdi),$s2 |
| 685 | mov 32-128($sbox),$s1 | 1215 | mov 12(%rdi),$s3 |
| 686 | mov 64-128($sbox),$s2 | 1216 | |
| 687 | mov 96-128($sbox),$s3 | 1217 | shl \$4,$rnds |
| 688 | mov 128-128($sbox),$s0 | 1218 | lea ($key,$rnds),%rbp |
| 689 | mov 160-128($sbox),$s1 | 1219 | mov $key,(%rsp) # key schedule |
| 690 | mov 192-128($sbox),$s2 | 1220 | mov %rbp,8(%rsp) # end of key schedule |
| 691 | mov 224-128($sbox),$s3 | 1221 | |
| 692 | lea -2048-128($sbox),$sbox; | 1222 | # pick Td4 copy which can't "overlap" with stack frame or key schedule |
| 693 | 1223 | lea .LAES_Td+2048(%rip),$sbox | |
| 694 | mov 0($inp),$s0 | 1224 | lea 768(%rsp),%rbp |
| 695 | mov 4($inp),$s1 | 1225 | sub $sbox,%rbp |
| 696 | mov 8($inp),$s2 | 1226 | and \$0x300,%rbp |
| 697 | mov 12($inp),$s3 | 1227 | lea ($sbox,%rbp),$sbox |
| 698 | 1228 | shr \$3,%rbp # recall "magic" constants! | |
| 699 | call _x86_64_AES_decrypt | 1229 | add %rbp,$sbox |
| 700 | 1230 | ||
| 701 | mov $s0,0($out) | 1231 | call _x86_64_AES_decrypt_compact |
| 1232 | |||
| 1233 | mov 16(%rsp),$out # restore out | ||
| 1234 | mov 24(%rsp),%rsi # restore saved stack pointer | ||
| 1235 | mov $s0,0($out) # write output vector | ||
| 702 | mov $s1,4($out) | 1236 | mov $s1,4($out) |
| 703 | mov $s2,8($out) | 1237 | mov $s2,8($out) |
| 704 | mov $s3,12($out) | 1238 | mov $s3,12($out) |
| 705 | 1239 | ||
| 706 | pop %r15 | 1240 | mov (%rsi),%r15 |
| 707 | pop %r14 | 1241 | mov 8(%rsi),%r14 |
| 708 | pop %r13 | 1242 | mov 16(%rsi),%r13 |
| 709 | pop %r12 | 1243 | mov 24(%rsi),%r12 |
| 710 | pop %rbp | 1244 | mov 32(%rsi),%rbp |
| 711 | pop %rbx | 1245 | mov 40(%rsi),%rbx |
| 1246 | lea 48(%rsi),%rsp | ||
| 1247 | .Ldec_epilogue: | ||
| 712 | ret | 1248 | ret |
| 713 | .size AES_decrypt,.-AES_decrypt | 1249 | .size AES_decrypt,.-AES_decrypt |
| 714 | ___ | 1250 | ___ |
| @@ -718,27 +1254,26 @@ sub enckey() | |||
| 718 | { | 1254 | { |
| 719 | $code.=<<___; | 1255 | $code.=<<___; |
| 720 | movz %dl,%esi # rk[i]>>0 | 1256 | movz %dl,%esi # rk[i]>>0 |
| 721 | mov 2(%rbp,%rsi,8),%ebx | 1257 | movzb -128(%rbp,%rsi),%ebx |
| 722 | movz %dh,%esi # rk[i]>>8 | 1258 | movz %dh,%esi # rk[i]>>8 |
| 723 | and \$0xFF000000,%ebx | 1259 | shl \$24,%ebx |
| 724 | xor %ebx,%eax | 1260 | xor %ebx,%eax |
| 725 | 1261 | ||
| 726 | mov 2(%rbp,%rsi,8),%ebx | 1262 | movzb -128(%rbp,%rsi),%ebx |
| 727 | shr \$16,%edx | 1263 | shr \$16,%edx |
| 728 | and \$0x000000FF,%ebx | ||
| 729 | movz %dl,%esi # rk[i]>>16 | 1264 | movz %dl,%esi # rk[i]>>16 |
| 730 | xor %ebx,%eax | 1265 | xor %ebx,%eax |
| 731 | 1266 | ||
| 732 | mov 0(%rbp,%rsi,8),%ebx | 1267 | movzb -128(%rbp,%rsi),%ebx |
| 733 | movz %dh,%esi # rk[i]>>24 | 1268 | movz %dh,%esi # rk[i]>>24 |
| 734 | and \$0x0000FF00,%ebx | 1269 | shl \$8,%ebx |
| 735 | xor %ebx,%eax | 1270 | xor %ebx,%eax |
| 736 | 1271 | ||
| 737 | mov 0(%rbp,%rsi,8),%ebx | 1272 | movzb -128(%rbp,%rsi),%ebx |
| 738 | and \$0x00FF0000,%ebx | 1273 | shl \$16,%ebx |
| 739 | xor %ebx,%eax | 1274 | xor %ebx,%eax |
| 740 | 1275 | ||
| 741 | xor 2048(%rbp,%rcx,4),%eax # rcon | 1276 | xor 1024-128(%rbp,%rcx,4),%eax # rcon |
| 742 | ___ | 1277 | ___ |
| 743 | } | 1278 | } |
| 744 | 1279 | ||
| @@ -751,7 +1286,29 @@ $code.=<<___; | |||
| 751 | AES_set_encrypt_key: | 1286 | AES_set_encrypt_key: |
| 752 | push %rbx | 1287 | push %rbx |
| 753 | push %rbp | 1288 | push %rbp |
| 1289 | push %r12 # redundant, but allows to share | ||
| 1290 | push %r13 # exception handler... | ||
| 1291 | push %r14 | ||
| 1292 | push %r15 | ||
| 1293 | sub \$8,%rsp | ||
| 1294 | .Lenc_key_prologue: | ||
| 1295 | |||
| 1296 | call _x86_64_AES_set_encrypt_key | ||
| 1297 | |||
| 1298 | mov 8(%rsp),%r15 | ||
| 1299 | mov 16(%rsp),%r14 | ||
| 1300 | mov 24(%rsp),%r13 | ||
| 1301 | mov 32(%rsp),%r12 | ||
| 1302 | mov 40(%rsp),%rbp | ||
| 1303 | mov 48(%rsp),%rbx | ||
| 1304 | add \$56,%rsp | ||
| 1305 | .Lenc_key_epilogue: | ||
| 1306 | ret | ||
| 1307 | .size AES_set_encrypt_key,.-AES_set_encrypt_key | ||
| 754 | 1308 | ||
| 1309 | .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent | ||
| 1310 | .align 16 | ||
| 1311 | _x86_64_AES_set_encrypt_key: | ||
| 755 | mov %esi,%ecx # %ecx=bits | 1312 | mov %esi,%ecx # %ecx=bits |
| 756 | mov %rdi,%rsi # %rsi=userKey | 1313 | mov %rdi,%rsi # %rsi=userKey |
| 757 | mov %rdx,%rdi # %rdi=key | 1314 | mov %rdx,%rdi # %rdi=key |
| @@ -761,8 +1318,18 @@ AES_set_encrypt_key: | |||
| 761 | test \$-1,%rdi | 1318 | test \$-1,%rdi |
| 762 | jz .Lbadpointer | 1319 | jz .Lbadpointer |
| 763 | 1320 | ||
| 764 | .picmeup %rbp | 1321 | lea .LAES_Te(%rip),%rbp |
| 765 | lea AES_Te-.(%rbp),%rbp | 1322 | lea 2048+128(%rbp),%rbp |
| 1323 | |||
| 1324 | # prefetch Te4 | ||
| 1325 | mov 0-128(%rbp),%eax | ||
| 1326 | mov 32-128(%rbp),%ebx | ||
| 1327 | mov 64-128(%rbp),%r8d | ||
| 1328 | mov 96-128(%rbp),%edx | ||
| 1329 | mov 128-128(%rbp),%eax | ||
| 1330 | mov 160-128(%rbp),%ebx | ||
| 1331 | mov 192-128(%rbp),%r8d | ||
| 1332 | mov 224-128(%rbp),%edx | ||
| 766 | 1333 | ||
| 767 | cmp \$128,%ecx | 1334 | cmp \$128,%ecx |
| 768 | je .L10rounds | 1335 | je .L10rounds |
| @@ -774,15 +1341,12 @@ AES_set_encrypt_key: | |||
| 774 | jmp .Lexit | 1341 | jmp .Lexit |
| 775 | 1342 | ||
| 776 | .L10rounds: | 1343 | .L10rounds: |
| 777 | mov 0(%rsi),%eax # copy first 4 dwords | 1344 | mov 0(%rsi),%rax # copy first 4 dwords |
| 778 | mov 4(%rsi),%ebx | 1345 | mov 8(%rsi),%rdx |
| 779 | mov 8(%rsi),%ecx | 1346 | mov %rax,0(%rdi) |
| 780 | mov 12(%rsi),%edx | 1347 | mov %rdx,8(%rdi) |
| 781 | mov %eax,0(%rdi) | ||
| 782 | mov %ebx,4(%rdi) | ||
| 783 | mov %ecx,8(%rdi) | ||
| 784 | mov %edx,12(%rdi) | ||
| 785 | 1348 | ||
| 1349 | shr \$32,%rdx | ||
| 786 | xor %ecx,%ecx | 1350 | xor %ecx,%ecx |
| 787 | jmp .L10shortcut | 1351 | jmp .L10shortcut |
| 788 | .align 4 | 1352 | .align 4 |
| @@ -810,19 +1374,14 @@ $code.=<<___; | |||
| 810 | jmp .Lexit | 1374 | jmp .Lexit |
| 811 | 1375 | ||
| 812 | .L12rounds: | 1376 | .L12rounds: |
| 813 | mov 0(%rsi),%eax # copy first 6 dwords | 1377 | mov 0(%rsi),%rax # copy first 6 dwords |
| 814 | mov 4(%rsi),%ebx | 1378 | mov 8(%rsi),%rbx |
| 815 | mov 8(%rsi),%ecx | 1379 | mov 16(%rsi),%rdx |
| 816 | mov 12(%rsi),%edx | 1380 | mov %rax,0(%rdi) |
| 817 | mov %eax,0(%rdi) | 1381 | mov %rbx,8(%rdi) |
| 818 | mov %ebx,4(%rdi) | 1382 | mov %rdx,16(%rdi) |
| 819 | mov %ecx,8(%rdi) | 1383 | |
| 820 | mov %edx,12(%rdi) | 1384 | shr \$32,%rdx |
| 821 | mov 16(%rsi),%ecx | ||
| 822 | mov 20(%rsi),%edx | ||
| 823 | mov %ecx,16(%rdi) | ||
| 824 | mov %edx,20(%rdi) | ||
| 825 | |||
| 826 | xor %ecx,%ecx | 1385 | xor %ecx,%ecx |
| 827 | jmp .L12shortcut | 1386 | jmp .L12shortcut |
| 828 | .align 4 | 1387 | .align 4 |
| @@ -858,30 +1417,23 @@ $code.=<<___; | |||
| 858 | jmp .Lexit | 1417 | jmp .Lexit |
| 859 | 1418 | ||
| 860 | .L14rounds: | 1419 | .L14rounds: |
| 861 | mov 0(%rsi),%eax # copy first 8 dwords | 1420 | mov 0(%rsi),%rax # copy first 8 dwords |
| 862 | mov 4(%rsi),%ebx | 1421 | mov 8(%rsi),%rbx |
| 863 | mov 8(%rsi),%ecx | 1422 | mov 16(%rsi),%rcx |
| 864 | mov 12(%rsi),%edx | 1423 | mov 24(%rsi),%rdx |
| 865 | mov %eax,0(%rdi) | 1424 | mov %rax,0(%rdi) |
| 866 | mov %ebx,4(%rdi) | 1425 | mov %rbx,8(%rdi) |
| 867 | mov %ecx,8(%rdi) | 1426 | mov %rcx,16(%rdi) |
| 868 | mov %edx,12(%rdi) | 1427 | mov %rdx,24(%rdi) |
| 869 | mov 16(%rsi),%eax | 1428 | |
| 870 | mov 20(%rsi),%ebx | 1429 | shr \$32,%rdx |
| 871 | mov 24(%rsi),%ecx | ||
| 872 | mov 28(%rsi),%edx | ||
| 873 | mov %eax,16(%rdi) | ||
| 874 | mov %ebx,20(%rdi) | ||
| 875 | mov %ecx,24(%rdi) | ||
| 876 | mov %edx,28(%rdi) | ||
| 877 | |||
| 878 | xor %ecx,%ecx | 1430 | xor %ecx,%ecx |
| 879 | jmp .L14shortcut | 1431 | jmp .L14shortcut |
| 880 | .align 4 | 1432 | .align 4 |
| 881 | .L14loop: | 1433 | .L14loop: |
| 1434 | mov 0(%rdi),%eax # rk[0] | ||
| 882 | mov 28(%rdi),%edx # rk[4] | 1435 | mov 28(%rdi),%edx # rk[4] |
| 883 | .L14shortcut: | 1436 | .L14shortcut: |
| 884 | mov 0(%rdi),%eax # rk[0] | ||
| 885 | ___ | 1437 | ___ |
| 886 | &enckey (); | 1438 | &enckey (); |
| 887 | $code.=<<___; | 1439 | $code.=<<___; |
| @@ -900,24 +1452,23 @@ $code.=<<___; | |||
| 900 | mov %eax,%edx | 1452 | mov %eax,%edx |
| 901 | mov 16(%rdi),%eax # rk[4] | 1453 | mov 16(%rdi),%eax # rk[4] |
| 902 | movz %dl,%esi # rk[11]>>0 | 1454 | movz %dl,%esi # rk[11]>>0 |
| 903 | mov 2(%rbp,%rsi,8),%ebx | 1455 | movzb -128(%rbp,%rsi),%ebx |
| 904 | movz %dh,%esi # rk[11]>>8 | 1456 | movz %dh,%esi # rk[11]>>8 |
| 905 | and \$0x000000FF,%ebx | ||
| 906 | xor %ebx,%eax | 1457 | xor %ebx,%eax |
| 907 | 1458 | ||
| 908 | mov 0(%rbp,%rsi,8),%ebx | 1459 | movzb -128(%rbp,%rsi),%ebx |
| 909 | shr \$16,%edx | 1460 | shr \$16,%edx |
| 910 | and \$0x0000FF00,%ebx | 1461 | shl \$8,%ebx |
| 911 | movz %dl,%esi # rk[11]>>16 | 1462 | movz %dl,%esi # rk[11]>>16 |
| 912 | xor %ebx,%eax | 1463 | xor %ebx,%eax |
| 913 | 1464 | ||
| 914 | mov 0(%rbp,%rsi,8),%ebx | 1465 | movzb -128(%rbp,%rsi),%ebx |
| 915 | movz %dh,%esi # rk[11]>>24 | 1466 | movz %dh,%esi # rk[11]>>24 |
| 916 | and \$0x00FF0000,%ebx | 1467 | shl \$16,%ebx |
| 917 | xor %ebx,%eax | 1468 | xor %ebx,%eax |
| 918 | 1469 | ||
| 919 | mov 2(%rbp,%rsi,8),%ebx | 1470 | movzb -128(%rbp,%rsi),%ebx |
| 920 | and \$0xFF000000,%ebx | 1471 | shl \$24,%ebx |
| 921 | xor %ebx,%eax | 1472 | xor %ebx,%eax |
| 922 | 1473 | ||
| 923 | mov %eax,48(%rdi) # rk[12] | 1474 | mov %eax,48(%rdi) # rk[12] |
| @@ -938,31 +1489,61 @@ $code.=<<___; | |||
| 938 | .Lbadpointer: | 1489 | .Lbadpointer: |
| 939 | mov \$-1,%rax | 1490 | mov \$-1,%rax |
| 940 | .Lexit: | 1491 | .Lexit: |
| 941 | pop %rbp | 1492 | .byte 0xf3,0xc3 # rep ret |
| 942 | pop %rbx | 1493 | .size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key |
| 943 | ret | ||
| 944 | .size AES_set_encrypt_key,.-AES_set_encrypt_key | ||
| 945 | ___ | 1494 | ___ |
| 946 | 1495 | ||
| 947 | sub deckey() | 1496 | sub deckey_ref() |
| 948 | { my ($i,$ptr,$te,$td) = @_; | 1497 | { my ($i,$ptr,$te,$td) = @_; |
| 1498 | my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d"); | ||
| 949 | $code.=<<___; | 1499 | $code.=<<___; |
| 950 | mov $i($ptr),%eax | 1500 | mov $i($ptr),$tp1 |
| 951 | mov %eax,%edx | 1501 | mov $tp1,$acc |
| 952 | movz %ah,%ebx | 1502 | and \$0x80808080,$acc |
| 953 | shr \$16,%edx | 1503 | mov $acc,$tp4 |
| 954 | and \$0xFF,%eax | 1504 | shr \$7,$tp4 |
| 955 | movzb 2($te,%rax,8),%rax | 1505 | lea 0($tp1,$tp1),$tp2 |
| 956 | movzb 2($te,%rbx,8),%rbx | 1506 | sub $tp4,$acc |
| 957 | mov 0($td,%rax,8),%eax | 1507 | and \$0xfefefefe,$tp2 |
| 958 | xor 3($td,%rbx,8),%eax | 1508 | and \$0x1b1b1b1b,$acc |
| 959 | movzb %dh,%ebx | 1509 | xor $tp2,$acc |
| 960 | and \$0xFF,%edx | 1510 | mov $acc,$tp2 |
| 961 | movzb 2($te,%rdx,8),%rdx | 1511 | |
| 962 | movzb 2($te,%rbx,8),%rbx | 1512 | and \$0x80808080,$acc |
| 963 | xor 2($td,%rdx,8),%eax | 1513 | mov $acc,$tp8 |
| 964 | xor 1($td,%rbx,8),%eax | 1514 | shr \$7,$tp8 |
| 965 | mov %eax,$i($ptr) | 1515 | lea 0($tp2,$tp2),$tp4 |
| 1516 | sub $tp8,$acc | ||
| 1517 | and \$0xfefefefe,$tp4 | ||
| 1518 | and \$0x1b1b1b1b,$acc | ||
| 1519 | xor $tp1,$tp2 # tp2^tp1 | ||
| 1520 | xor $tp4,$acc | ||
| 1521 | mov $acc,$tp4 | ||
| 1522 | |||
| 1523 | and \$0x80808080,$acc | ||
| 1524 | mov $acc,$tp8 | ||
| 1525 | shr \$7,$tp8 | ||
| 1526 | sub $tp8,$acc | ||
| 1527 | lea 0($tp4,$tp4),$tp8 | ||
| 1528 | xor $tp1,$tp4 # tp4^tp1 | ||
| 1529 | and \$0xfefefefe,$tp8 | ||
| 1530 | and \$0x1b1b1b1b,$acc | ||
| 1531 | xor $acc,$tp8 | ||
| 1532 | |||
| 1533 | xor $tp8,$tp1 # tp1^tp8 | ||
| 1534 | rol \$8,$tp1 # ROTATE(tp1^tp8,8) | ||
| 1535 | xor $tp8,$tp2 # tp2^tp1^tp8 | ||
| 1536 | xor $tp8,$tp4 # tp4^tp1^tp8 | ||
| 1537 | xor $tp2,$tp8 | ||
| 1538 | xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2 | ||
| 1539 | |||
| 1540 | xor $tp8,$tp1 | ||
| 1541 | rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24) | ||
| 1542 | xor $tp2,$tp1 | ||
| 1543 | rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16) | ||
| 1544 | xor $tp4,$tp1 | ||
| 1545 | |||
| 1546 | mov $tp1,$i($ptr) | ||
| 966 | ___ | 1547 | ___ |
| 967 | } | 1548 | } |
| 968 | 1549 | ||
| @@ -973,19 +1554,23 @@ $code.=<<___; | |||
| 973 | .type AES_set_decrypt_key,\@function,3 | 1554 | .type AES_set_decrypt_key,\@function,3 |
| 974 | .align 16 | 1555 | .align 16 |
| 975 | AES_set_decrypt_key: | 1556 | AES_set_decrypt_key: |
| 976 | push %rdx | 1557 | push %rbx |
| 977 | call AES_set_encrypt_key | 1558 | push %rbp |
| 978 | cmp \$0,%eax | 1559 | push %r12 |
| 979 | je .Lproceed | 1560 | push %r13 |
| 980 | lea 24(%rsp),%rsp | 1561 | push %r14 |
| 981 | ret | 1562 | push %r15 |
| 982 | .Lproceed: | 1563 | push %rdx # save key schedule |
| 1564 | .Ldec_key_prologue: | ||
| 1565 | |||
| 1566 | call _x86_64_AES_set_encrypt_key | ||
| 983 | mov (%rsp),%r8 # restore key schedule | 1567 | mov (%rsp),%r8 # restore key schedule |
| 984 | mov %rbx,(%rsp) | 1568 | cmp \$0,%eax |
| 1569 | jne .Labort | ||
| 985 | 1570 | ||
| 986 | mov 240(%r8),%ecx # pull number of rounds | 1571 | mov 240(%r8),%r14d # pull number of rounds |
| 987 | xor %rdi,%rdi | 1572 | xor %rdi,%rdi |
| 988 | lea (%rdi,%rcx,4),%rcx | 1573 | lea (%rdi,%r14d,4),%rcx |
| 989 | mov %r8,%rsi | 1574 | mov %r8,%rsi |
| 990 | lea (%r8,%rcx,4),%rdi # pointer to last chunk | 1575 | lea (%r8,%rcx,4),%rdi # pointer to last chunk |
| 991 | .align 4 | 1576 | .align 4 |
| @@ -1003,27 +1588,39 @@ AES_set_decrypt_key: | |||
| 1003 | cmp %rsi,%rdi | 1588 | cmp %rsi,%rdi |
| 1004 | jne .Linvert | 1589 | jne .Linvert |
| 1005 | 1590 | ||
| 1006 | .picmeup %r9 | 1591 | lea .LAES_Te+2048+1024(%rip),%rax # rcon |
| 1007 | lea AES_Td-.(%r9),%rdi | ||
| 1008 | lea AES_Te-AES_Td(%rdi),%r9 | ||
| 1009 | 1592 | ||
| 1010 | mov %r8,%rsi | 1593 | mov 40(%rax),$mask80 |
| 1011 | mov 240(%r8),%ecx # pull number of rounds | 1594 | mov 48(%rax),$maskfe |
| 1012 | sub \$1,%ecx | 1595 | mov 56(%rax),$mask1b |
| 1596 | |||
| 1597 | mov %r8,$key | ||
| 1598 | sub \$1,%r14d | ||
| 1013 | .align 4 | 1599 | .align 4 |
| 1014 | .Lpermute: | 1600 | .Lpermute: |
| 1015 | lea 16(%rsi),%rsi | 1601 | lea 16($key),$key |
| 1602 | mov 0($key),%rax | ||
| 1603 | mov 8($key),%rcx | ||
| 1016 | ___ | 1604 | ___ |
| 1017 | &deckey (0,"%rsi","%r9","%rdi"); | 1605 | &dectransform (); |
| 1018 | &deckey (4,"%rsi","%r9","%rdi"); | ||
| 1019 | &deckey (8,"%rsi","%r9","%rdi"); | ||
| 1020 | &deckey (12,"%rsi","%r9","%rdi"); | ||
| 1021 | $code.=<<___; | 1606 | $code.=<<___; |
| 1022 | sub \$1,%ecx | 1607 | mov %eax,0($key) |
| 1608 | mov %ebx,4($key) | ||
| 1609 | mov %ecx,8($key) | ||
| 1610 | mov %edx,12($key) | ||
| 1611 | sub \$1,%r14d | ||
| 1023 | jnz .Lpermute | 1612 | jnz .Lpermute |
| 1024 | 1613 | ||
| 1025 | xor %rax,%rax | 1614 | xor %rax,%rax |
| 1026 | pop %rbx | 1615 | .Labort: |
| 1616 | mov 8(%rsp),%r15 | ||
| 1617 | mov 16(%rsp),%r14 | ||
| 1618 | mov 24(%rsp),%r13 | ||
| 1619 | mov 32(%rsp),%r12 | ||
| 1620 | mov 40(%rsp),%rbp | ||
| 1621 | mov 48(%rsp),%rbx | ||
| 1622 | add \$56,%rsp | ||
| 1623 | .Ldec_key_epilogue: | ||
| 1027 | ret | 1624 | ret |
| 1028 | .size AES_set_decrypt_key,.-AES_set_decrypt_key | 1625 | .size AES_set_decrypt_key,.-AES_set_decrypt_key |
| 1029 | ___ | 1626 | ___ |
| @@ -1034,47 +1631,59 @@ ___ | |||
| 1034 | { | 1631 | { |
| 1035 | # stack frame layout | 1632 | # stack frame layout |
| 1036 | # -8(%rsp) return address | 1633 | # -8(%rsp) return address |
| 1037 | my $_rsp="0(%rsp)"; # saved %rsp | 1634 | my $keyp="0(%rsp)"; # one to pass as $key |
| 1038 | my $_len="8(%rsp)"; # copy of 3rd parameter, length | 1635 | my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds]) |
| 1039 | my $_key="16(%rsp)"; # copy of 4th parameter, key | 1636 | my $_rsp="16(%rsp)"; # saved %rsp |
| 1040 | my $_ivp="24(%rsp)"; # copy of 5th parameter, ivp | 1637 | my $_inp="24(%rsp)"; # copy of 1st parameter, inp |
| 1041 | my $keyp="32(%rsp)"; # one to pass as $key | 1638 | my $_out="32(%rsp)"; # copy of 2nd parameter, out |
| 1042 | my $ivec="40(%rsp)"; # ivec[16] | 1639 | my $_len="40(%rsp)"; # copy of 3rd parameter, length |
| 1043 | my $aes_key="56(%rsp)"; # copy of aes_key | 1640 | my $_key="48(%rsp)"; # copy of 4th parameter, key |
| 1044 | my $mark="56+240(%rsp)"; # copy of aes_key->rounds | 1641 | my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp |
| 1642 | my $ivec="64(%rsp)"; # ivec[16] | ||
| 1643 | my $aes_key="80(%rsp)"; # copy of aes_key | ||
| 1644 | my $mark="80+240(%rsp)"; # copy of aes_key->rounds | ||
| 1045 | 1645 | ||
| 1046 | $code.=<<___; | 1646 | $code.=<<___; |
| 1047 | .globl AES_cbc_encrypt | 1647 | .globl AES_cbc_encrypt |
| 1048 | .type AES_cbc_encrypt,\@function,6 | 1648 | .type AES_cbc_encrypt,\@function,6 |
| 1049 | .align 16 | 1649 | .align 16 |
| 1650 | .extern OPENSSL_ia32cap_P | ||
| 1050 | AES_cbc_encrypt: | 1651 | AES_cbc_encrypt: |
| 1051 | cmp \$0,%rdx # check length | 1652 | cmp \$0,%rdx # check length |
| 1052 | je .Lcbc_just_ret | 1653 | je .Lcbc_epilogue |
| 1654 | pushfq | ||
| 1053 | push %rbx | 1655 | push %rbx |
| 1054 | push %rbp | 1656 | push %rbp |
| 1055 | push %r12 | 1657 | push %r12 |
| 1056 | push %r13 | 1658 | push %r13 |
| 1057 | push %r14 | 1659 | push %r14 |
| 1058 | push %r15 | 1660 | push %r15 |
| 1059 | pushfq | 1661 | .Lcbc_prologue: |
| 1662 | |||
| 1060 | cld | 1663 | cld |
| 1061 | mov %r9d,%r9d # clear upper half of enc | 1664 | mov %r9d,%r9d # clear upper half of enc |
| 1062 | 1665 | ||
| 1063 | .picmeup $sbox | 1666 | lea .LAES_Te(%rip),$sbox |
| 1064 | .Lcbc_pic_point: | ||
| 1065 | |||
| 1066 | cmp \$0,%r9 | 1667 | cmp \$0,%r9 |
| 1067 | je .LDECRYPT | 1668 | jne .Lcbc_picked_te |
| 1068 | 1669 | lea .LAES_Td(%rip),$sbox | |
| 1069 | lea AES_Te-.Lcbc_pic_point($sbox),$sbox | 1670 | .Lcbc_picked_te: |
| 1671 | |||
| 1672 | mov OPENSSL_ia32cap_P(%rip),%r10d | ||
| 1673 | cmp \$$speed_limit,%rdx | ||
| 1674 | jb .Lcbc_slow_prologue | ||
| 1675 | test \$15,%rdx | ||
| 1676 | jnz .Lcbc_slow_prologue | ||
| 1677 | bt \$28,%r10d | ||
| 1678 | jc .Lcbc_slow_prologue | ||
| 1070 | 1679 | ||
| 1071 | # allocate aligned stack frame... | 1680 | # allocate aligned stack frame... |
| 1072 | lea -64-248(%rsp),$key | 1681 | lea -88-248(%rsp),$key |
| 1073 | and \$-64,$key | 1682 | and \$-64,$key |
| 1074 | 1683 | ||
| 1075 | # ... and make it doesn't alias with AES_Te modulo 4096 | 1684 | # ... and make sure it doesn't alias with AES_T[ed] modulo 4096 |
| 1076 | mov $sbox,%r10 | 1685 | mov $sbox,%r10 |
| 1077 | lea 2048($sbox),%r11 | 1686 | lea 2304($sbox),%r11 |
| 1078 | mov $key,%r12 | 1687 | mov $key,%r12 |
| 1079 | and \$0xFFF,%r10 # s = $sbox&0xfff | 1688 | and \$0xFFF,%r10 # s = $sbox&0xfff |
| 1080 | and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff | 1689 | and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff |
| @@ -1094,22 +1703,27 @@ AES_cbc_encrypt: | |||
| 1094 | .Lcbc_te_ok: | 1703 | .Lcbc_te_ok: |
| 1095 | 1704 | ||
| 1096 | xchg %rsp,$key | 1705 | xchg %rsp,$key |
| 1097 | add \$8,%rsp # reserve for return address! | 1706 | #add \$8,%rsp # reserve for return address! |
| 1098 | mov $key,$_rsp # save %rsp | 1707 | mov $key,$_rsp # save %rsp |
| 1708 | .Lcbc_fast_body: | ||
| 1709 | mov %rdi,$_inp # save copy of inp | ||
| 1710 | mov %rsi,$_out # save copy of out | ||
| 1099 | mov %rdx,$_len # save copy of len | 1711 | mov %rdx,$_len # save copy of len |
| 1100 | mov %rcx,$_key # save copy of key | 1712 | mov %rcx,$_key # save copy of key |
| 1101 | mov %r8,$_ivp # save copy of ivp | 1713 | mov %r8,$_ivp # save copy of ivp |
| 1102 | movl \$0,$mark # copy of aes_key->rounds = 0; | 1714 | movl \$0,$mark # copy of aes_key->rounds = 0; |
| 1103 | mov %r8,%rbp # rearrange input arguments | 1715 | mov %r8,%rbp # rearrange input arguments |
| 1716 | mov %r9,%rbx | ||
| 1104 | mov %rsi,$out | 1717 | mov %rsi,$out |
| 1105 | mov %rdi,$inp | 1718 | mov %rdi,$inp |
| 1106 | mov %rcx,$key | 1719 | mov %rcx,$key |
| 1107 | 1720 | ||
| 1721 | mov 240($key),%eax # key->rounds | ||
| 1108 | # do we copy key schedule to stack? | 1722 | # do we copy key schedule to stack? |
| 1109 | mov $key,%r10 | 1723 | mov $key,%r10 |
| 1110 | sub $sbox,%r10 | 1724 | sub $sbox,%r10 |
| 1111 | and \$0xfff,%r10 | 1725 | and \$0xfff,%r10 |
| 1112 | cmp \$2048,%r10 | 1726 | cmp \$2304,%r10 |
| 1113 | jb .Lcbc_do_ecopy | 1727 | jb .Lcbc_do_ecopy |
| 1114 | cmp \$4096-248,%r10 | 1728 | cmp \$4096-248,%r10 |
| 1115 | jb .Lcbc_skip_ecopy | 1729 | jb .Lcbc_skip_ecopy |
| @@ -1120,12 +1734,11 @@ AES_cbc_encrypt: | |||
| 1120 | lea $aes_key,$key | 1734 | lea $aes_key,$key |
| 1121 | mov \$240/8,%ecx | 1735 | mov \$240/8,%ecx |
| 1122 | .long 0x90A548F3 # rep movsq | 1736 | .long 0x90A548F3 # rep movsq |
| 1123 | mov (%rsi),%eax # copy aes_key->rounds | 1737 | mov %eax,(%rdi) # copy aes_key->rounds |
| 1124 | mov %eax,(%rdi) | ||
| 1125 | .Lcbc_skip_ecopy: | 1738 | .Lcbc_skip_ecopy: |
| 1126 | mov $key,$keyp # save key pointer | 1739 | mov $key,$keyp # save key pointer |
| 1127 | 1740 | ||
| 1128 | mov \$16,%ecx | 1741 | mov \$18,%ecx |
| 1129 | .align 4 | 1742 | .align 4 |
| 1130 | .Lcbc_prefetch_te: | 1743 | .Lcbc_prefetch_te: |
| 1131 | mov 0($sbox),%r10 | 1744 | mov 0($sbox),%r10 |
| @@ -1135,184 +1748,77 @@ AES_cbc_encrypt: | |||
| 1135 | lea 128($sbox),$sbox | 1748 | lea 128($sbox),$sbox |
| 1136 | sub \$1,%ecx | 1749 | sub \$1,%ecx |
| 1137 | jnz .Lcbc_prefetch_te | 1750 | jnz .Lcbc_prefetch_te |
| 1138 | sub \$2048,$sbox | 1751 | lea -2304($sbox),$sbox |
| 1139 | 1752 | ||
| 1140 | test \$-16,%rdx # check upon length | 1753 | cmp \$0,%rbx |
| 1141 | mov %rdx,%r10 | 1754 | je .LFAST_DECRYPT |
| 1755 | |||
| 1756 | #----------------------------- ENCRYPT -----------------------------# | ||
| 1142 | mov 0(%rbp),$s0 # load iv | 1757 | mov 0(%rbp),$s0 # load iv |
| 1143 | mov 4(%rbp),$s1 | 1758 | mov 4(%rbp),$s1 |
| 1144 | mov 8(%rbp),$s2 | 1759 | mov 8(%rbp),$s2 |
| 1145 | mov 12(%rbp),$s3 | 1760 | mov 12(%rbp),$s3 |
| 1146 | jz .Lcbc_enc_tail # short input... | ||
| 1147 | 1761 | ||
| 1148 | .align 4 | 1762 | .align 4 |
| 1149 | .Lcbc_enc_loop: | 1763 | .Lcbc_fast_enc_loop: |
| 1150 | xor 0($inp),$s0 | 1764 | xor 0($inp),$s0 |
| 1151 | xor 4($inp),$s1 | 1765 | xor 4($inp),$s1 |
| 1152 | xor 8($inp),$s2 | 1766 | xor 8($inp),$s2 |
| 1153 | xor 12($inp),$s3 | 1767 | xor 12($inp),$s3 |
| 1154 | mov $inp,$ivec # if ($verticalspin) save inp | ||
| 1155 | |||
| 1156 | mov $keyp,$key # restore key | 1768 | mov $keyp,$key # restore key |
| 1769 | mov $inp,$_inp # if ($verticalspin) save inp | ||
| 1770 | |||
| 1157 | call _x86_64_AES_encrypt | 1771 | call _x86_64_AES_encrypt |
| 1158 | 1772 | ||
| 1159 | mov $ivec,$inp # if ($verticalspin) restore inp | 1773 | mov $_inp,$inp # if ($verticalspin) restore inp |
| 1774 | mov $_len,%r10 | ||
| 1160 | mov $s0,0($out) | 1775 | mov $s0,0($out) |
| 1161 | mov $s1,4($out) | 1776 | mov $s1,4($out) |
| 1162 | mov $s2,8($out) | 1777 | mov $s2,8($out) |
| 1163 | mov $s3,12($out) | 1778 | mov $s3,12($out) |
| 1164 | 1779 | ||
| 1165 | mov $_len,%r10 | ||
| 1166 | lea 16($inp),$inp | 1780 | lea 16($inp),$inp |
| 1167 | lea 16($out),$out | 1781 | lea 16($out),$out |
| 1168 | sub \$16,%r10 | 1782 | sub \$16,%r10 |
| 1169 | test \$-16,%r10 | 1783 | test \$-16,%r10 |
| 1170 | mov %r10,$_len | 1784 | mov %r10,$_len |
| 1171 | jnz .Lcbc_enc_loop | 1785 | jnz .Lcbc_fast_enc_loop |
| 1172 | test \$15,%r10 | ||
| 1173 | jnz .Lcbc_enc_tail | ||
| 1174 | mov $_ivp,%rbp # restore ivp | 1786 | mov $_ivp,%rbp # restore ivp |
| 1175 | mov $s0,0(%rbp) # save ivec | 1787 | mov $s0,0(%rbp) # save ivec |
| 1176 | mov $s1,4(%rbp) | 1788 | mov $s1,4(%rbp) |
| 1177 | mov $s2,8(%rbp) | 1789 | mov $s2,8(%rbp) |
| 1178 | mov $s3,12(%rbp) | 1790 | mov $s3,12(%rbp) |
| 1179 | 1791 | ||
| 1180 | .align 4 | 1792 | jmp .Lcbc_fast_cleanup |
| 1181 | .Lcbc_cleanup: | 1793 | |
| 1182 | cmpl \$0,$mark # was the key schedule copied? | ||
| 1183 | lea $aes_key,%rdi | ||
| 1184 | mov $_rsp,%rsp | ||
| 1185 | je .Lcbc_exit | ||
| 1186 | mov \$240/8,%ecx | ||
| 1187 | xor %rax,%rax | ||
| 1188 | .long 0x90AB48F3 # rep stosq | ||
| 1189 | .Lcbc_exit: | ||
| 1190 | popfq | ||
| 1191 | pop %r15 | ||
| 1192 | pop %r14 | ||
| 1193 | pop %r13 | ||
| 1194 | pop %r12 | ||
| 1195 | pop %rbp | ||
| 1196 | pop %rbx | ||
| 1197 | .Lcbc_just_ret: | ||
| 1198 | ret | ||
| 1199 | .align 4 | ||
| 1200 | .Lcbc_enc_tail: | ||
| 1201 | mov %rax,%r11 | ||
| 1202 | mov %rcx,%r12 | ||
| 1203 | mov %r10,%rcx | ||
| 1204 | mov $inp,%rsi | ||
| 1205 | mov $out,%rdi | ||
| 1206 | .long 0xF689A4F3 # rep movsb | ||
| 1207 | mov \$16,%rcx # zero tail | ||
| 1208 | sub %r10,%rcx | ||
| 1209 | xor %rax,%rax | ||
| 1210 | .long 0xF689AAF3 # rep stosb | ||
| 1211 | mov $out,$inp # this is not a mistake! | ||
| 1212 | movq \$16,$_len # len=16 | ||
| 1213 | mov %r11,%rax | ||
| 1214 | mov %r12,%rcx | ||
| 1215 | jmp .Lcbc_enc_loop # one more spin... | ||
| 1216 | #----------------------------- DECRYPT -----------------------------# | 1794 | #----------------------------- DECRYPT -----------------------------# |
| 1217 | .align 16 | 1795 | .align 16 |
| 1218 | .LDECRYPT: | 1796 | .LFAST_DECRYPT: |
| 1219 | lea AES_Td-.Lcbc_pic_point($sbox),$sbox | ||
| 1220 | |||
| 1221 | # allocate aligned stack frame... | ||
| 1222 | lea -64-248(%rsp),$key | ||
| 1223 | and \$-64,$key | ||
| 1224 | |||
| 1225 | # ... and make it doesn't alias with AES_Td modulo 4096 | ||
| 1226 | mov $sbox,%r10 | ||
| 1227 | lea 2304($sbox),%r11 | ||
| 1228 | mov $key,%r12 | ||
| 1229 | and \$0xFFF,%r10 # s = $sbox&0xfff | ||
| 1230 | and \$0xFFF,%r11 # e = ($sbox+2048+256)&0xfff | ||
| 1231 | and \$0xFFF,%r12 # p = %rsp&0xfff | ||
| 1232 | |||
| 1233 | cmp %r11,%r12 # if (p=>e) %rsp =- (p-e); | ||
| 1234 | jb .Lcbc_td_break_out | ||
| 1235 | sub %r11,%r12 | ||
| 1236 | sub %r12,$key | ||
| 1237 | jmp .Lcbc_td_ok | ||
| 1238 | .Lcbc_td_break_out: # else %rsp -= (p-s)&0xfff + framesz | ||
| 1239 | sub %r10,%r12 | ||
| 1240 | and \$0xFFF,%r12 | ||
| 1241 | add \$320,%r12 | ||
| 1242 | sub %r12,$key | ||
| 1243 | .align 4 | ||
| 1244 | .Lcbc_td_ok: | ||
| 1245 | |||
| 1246 | xchg %rsp,$key | ||
| 1247 | add \$8,%rsp # reserve for return address! | ||
| 1248 | mov $key,$_rsp # save %rsp | ||
| 1249 | mov %rdx,$_len # save copy of len | ||
| 1250 | mov %rcx,$_key # save copy of key | ||
| 1251 | mov %r8,$_ivp # save copy of ivp | ||
| 1252 | movl \$0,$mark # copy of aes_key->rounds = 0; | ||
| 1253 | mov %r8,%rbp # rearrange input arguments | ||
| 1254 | mov %rsi,$out | ||
| 1255 | mov %rdi,$inp | ||
| 1256 | mov %rcx,$key | ||
| 1257 | |||
| 1258 | # do we copy key schedule to stack? | ||
| 1259 | mov $key,%r10 | ||
| 1260 | sub $sbox,%r10 | ||
| 1261 | and \$0xfff,%r10 | ||
| 1262 | cmp \$2304,%r10 | ||
| 1263 | jb .Lcbc_do_dcopy | ||
| 1264 | cmp \$4096-248,%r10 | ||
| 1265 | jb .Lcbc_skip_dcopy | ||
| 1266 | .align 4 | ||
| 1267 | .Lcbc_do_dcopy: | ||
| 1268 | mov $key,%rsi | ||
| 1269 | lea $aes_key,%rdi | ||
| 1270 | lea $aes_key,$key | ||
| 1271 | mov \$240/8,%ecx | ||
| 1272 | .long 0x90A548F3 # rep movsq | ||
| 1273 | mov (%rsi),%eax # copy aes_key->rounds | ||
| 1274 | mov %eax,(%rdi) | ||
| 1275 | .Lcbc_skip_dcopy: | ||
| 1276 | mov $key,$keyp # save key pointer | ||
| 1277 | |||
| 1278 | mov \$18,%ecx | ||
| 1279 | .align 4 | ||
| 1280 | .Lcbc_prefetch_td: | ||
| 1281 | mov 0($sbox),%r10 | ||
| 1282 | mov 32($sbox),%r11 | ||
| 1283 | mov 64($sbox),%r12 | ||
| 1284 | mov 96($sbox),%r13 | ||
| 1285 | lea 128($sbox),$sbox | ||
| 1286 | sub \$1,%ecx | ||
| 1287 | jnz .Lcbc_prefetch_td | ||
| 1288 | sub \$2304,$sbox | ||
| 1289 | |||
| 1290 | cmp $inp,$out | 1797 | cmp $inp,$out |
| 1291 | je .Lcbc_dec_in_place | 1798 | je .Lcbc_fast_dec_in_place |
| 1292 | 1799 | ||
| 1293 | mov %rbp,$ivec | 1800 | mov %rbp,$ivec |
| 1294 | .align 4 | 1801 | .align 4 |
| 1295 | .Lcbc_dec_loop: | 1802 | .Lcbc_fast_dec_loop: |
| 1296 | mov 0($inp),$s0 # read input | 1803 | mov 0($inp),$s0 # read input |
| 1297 | mov 4($inp),$s1 | 1804 | mov 4($inp),$s1 |
| 1298 | mov 8($inp),$s2 | 1805 | mov 8($inp),$s2 |
| 1299 | mov 12($inp),$s3 | 1806 | mov 12($inp),$s3 |
| 1300 | mov $inp,8+$ivec # if ($verticalspin) save inp | ||
| 1301 | |||
| 1302 | mov $keyp,$key # restore key | 1807 | mov $keyp,$key # restore key |
| 1808 | mov $inp,$_inp # if ($verticalspin) save inp | ||
| 1809 | |||
| 1303 | call _x86_64_AES_decrypt | 1810 | call _x86_64_AES_decrypt |
| 1304 | 1811 | ||
| 1305 | mov $ivec,%rbp # load ivp | 1812 | mov $ivec,%rbp # load ivp |
| 1306 | mov 8+$ivec,$inp # if ($verticalspin) restore inp | 1813 | mov $_inp,$inp # if ($verticalspin) restore inp |
| 1814 | mov $_len,%r10 # load len | ||
| 1307 | xor 0(%rbp),$s0 # xor iv | 1815 | xor 0(%rbp),$s0 # xor iv |
| 1308 | xor 4(%rbp),$s1 | 1816 | xor 4(%rbp),$s1 |
| 1309 | xor 8(%rbp),$s2 | 1817 | xor 8(%rbp),$s2 |
| 1310 | xor 12(%rbp),$s3 | 1818 | xor 12(%rbp),$s3 |
| 1311 | mov $inp,%rbp # current input, next iv | 1819 | mov $inp,%rbp # current input, next iv |
| 1312 | 1820 | ||
| 1313 | mov $_len,%r10 # load len | ||
| 1314 | sub \$16,%r10 | 1821 | sub \$16,%r10 |
| 1315 | jc .Lcbc_dec_partial | ||
| 1316 | mov %r10,$_len # update len | 1822 | mov %r10,$_len # update len |
| 1317 | mov %rbp,$ivec # update ivp | 1823 | mov %rbp,$ivec # update ivp |
| 1318 | 1824 | ||
| @@ -1323,81 +1829,281 @@ AES_cbc_encrypt: | |||
| 1323 | 1829 | ||
| 1324 | lea 16($inp),$inp | 1830 | lea 16($inp),$inp |
| 1325 | lea 16($out),$out | 1831 | lea 16($out),$out |
| 1326 | jnz .Lcbc_dec_loop | 1832 | jnz .Lcbc_fast_dec_loop |
| 1327 | .Lcbc_dec_end: | ||
| 1328 | mov $_ivp,%r12 # load user ivp | 1833 | mov $_ivp,%r12 # load user ivp |
| 1329 | mov 0(%rbp),%r10 # load iv | 1834 | mov 0(%rbp),%r10 # load iv |
| 1330 | mov 8(%rbp),%r11 | 1835 | mov 8(%rbp),%r11 |
| 1331 | mov %r10,0(%r12) # copy back to user | 1836 | mov %r10,0(%r12) # copy back to user |
| 1332 | mov %r11,8(%r12) | 1837 | mov %r11,8(%r12) |
| 1333 | jmp .Lcbc_cleanup | 1838 | jmp .Lcbc_fast_cleanup |
| 1334 | |||
| 1335 | .align 4 | ||
| 1336 | .Lcbc_dec_partial: | ||
| 1337 | mov $s0,0+$ivec # dump output to stack | ||
| 1338 | mov $s1,4+$ivec | ||
| 1339 | mov $s2,8+$ivec | ||
| 1340 | mov $s3,12+$ivec | ||
| 1341 | mov $out,%rdi | ||
| 1342 | lea $ivec,%rsi | ||
| 1343 | mov \$16,%rcx | ||
| 1344 | add %r10,%rcx # number of bytes to copy | ||
| 1345 | .long 0xF689A4F3 # rep movsb | ||
| 1346 | jmp .Lcbc_dec_end | ||
| 1347 | 1839 | ||
| 1348 | .align 16 | 1840 | .align 16 |
| 1349 | .Lcbc_dec_in_place: | 1841 | .Lcbc_fast_dec_in_place: |
| 1842 | mov 0(%rbp),%r10 # copy iv to stack | ||
| 1843 | mov 8(%rbp),%r11 | ||
| 1844 | mov %r10,0+$ivec | ||
| 1845 | mov %r11,8+$ivec | ||
| 1846 | .align 4 | ||
| 1847 | .Lcbc_fast_dec_in_place_loop: | ||
| 1350 | mov 0($inp),$s0 # load input | 1848 | mov 0($inp),$s0 # load input |
| 1351 | mov 4($inp),$s1 | 1849 | mov 4($inp),$s1 |
| 1352 | mov 8($inp),$s2 | 1850 | mov 8($inp),$s2 |
| 1353 | mov 12($inp),$s3 | 1851 | mov 12($inp),$s3 |
| 1852 | mov $keyp,$key # restore key | ||
| 1853 | mov $inp,$_inp # if ($verticalspin) save inp | ||
| 1354 | 1854 | ||
| 1355 | mov $inp,$ivec # if ($verticalspin) save inp | ||
| 1356 | mov $keyp,$key | ||
| 1357 | call _x86_64_AES_decrypt | 1855 | call _x86_64_AES_decrypt |
| 1358 | 1856 | ||
| 1359 | mov $ivec,$inp # if ($verticalspin) restore inp | 1857 | mov $_inp,$inp # if ($verticalspin) restore inp |
| 1360 | mov $_ivp,%rbp | 1858 | mov $_len,%r10 |
| 1361 | xor 0(%rbp),$s0 | 1859 | xor 0+$ivec,$s0 |
| 1362 | xor 4(%rbp),$s1 | 1860 | xor 4+$ivec,$s1 |
| 1363 | xor 8(%rbp),$s2 | 1861 | xor 8+$ivec,$s2 |
| 1364 | xor 12(%rbp),$s3 | 1862 | xor 12+$ivec,$s3 |
| 1863 | |||
| 1864 | mov 0($inp),%r11 # load input | ||
| 1865 | mov 8($inp),%r12 | ||
| 1866 | sub \$16,%r10 | ||
| 1867 | jz .Lcbc_fast_dec_in_place_done | ||
| 1365 | 1868 | ||
| 1366 | mov 0($inp),%r10 # copy input to iv | 1869 | mov %r11,0+$ivec # copy input to iv |
| 1367 | mov 8($inp),%r11 | 1870 | mov %r12,8+$ivec |
| 1368 | mov %r10,0(%rbp) | ||
| 1369 | mov %r11,8(%rbp) | ||
| 1370 | 1871 | ||
| 1371 | mov $s0,0($out) # save output [zaps input] | 1872 | mov $s0,0($out) # save output [zaps input] |
| 1372 | mov $s1,4($out) | 1873 | mov $s1,4($out) |
| 1373 | mov $s2,8($out) | 1874 | mov $s2,8($out) |
| 1374 | mov $s3,12($out) | 1875 | mov $s3,12($out) |
| 1375 | 1876 | ||
| 1376 | mov $_len,%rcx | ||
| 1377 | lea 16($inp),$inp | 1877 | lea 16($inp),$inp |
| 1378 | lea 16($out),$out | 1878 | lea 16($out),$out |
| 1379 | sub \$16,%rcx | 1879 | mov %r10,$_len |
| 1380 | jc .Lcbc_dec_in_place_partial | 1880 | jmp .Lcbc_fast_dec_in_place_loop |
| 1381 | mov %rcx,$_len | 1881 | .Lcbc_fast_dec_in_place_done: |
| 1382 | jnz .Lcbc_dec_in_place | 1882 | mov $_ivp,%rdi |
| 1383 | jmp .Lcbc_cleanup | 1883 | mov %r11,0(%rdi) # copy iv back to user |
| 1884 | mov %r12,8(%rdi) | ||
| 1885 | |||
| 1886 | mov $s0,0($out) # save output [zaps input] | ||
| 1887 | mov $s1,4($out) | ||
| 1888 | mov $s2,8($out) | ||
| 1889 | mov $s3,12($out) | ||
| 1384 | 1890 | ||
| 1385 | .align 4 | 1891 | .align 4 |
| 1386 | .Lcbc_dec_in_place_partial: | 1892 | .Lcbc_fast_cleanup: |
| 1387 | # one can argue if this is actually required | 1893 | cmpl \$0,$mark # was the key schedule copied? |
| 1388 | lea ($out,%rcx),%rdi | 1894 | lea $aes_key,%rdi |
| 1389 | lea (%rbp,%rcx),%rsi | 1895 | je .Lcbc_exit |
| 1390 | neg %rcx | 1896 | mov \$240/8,%ecx |
| 1391 | .long 0xF689A4F3 # rep movsb # restore tail | 1897 | xor %rax,%rax |
| 1392 | jmp .Lcbc_cleanup | 1898 | .long 0x90AB48F3 # rep stosq |
| 1899 | |||
| 1900 | jmp .Lcbc_exit | ||
| 1901 | |||
| 1902 | #--------------------------- SLOW ROUTINE ---------------------------# | ||
| 1903 | .align 16 | ||
| 1904 | .Lcbc_slow_prologue: | ||
| 1905 | # allocate aligned stack frame... | ||
| 1906 | lea -88(%rsp),%rbp | ||
| 1907 | and \$-64,%rbp | ||
| 1908 | # ... just "above" key schedule | ||
| 1909 | lea -88-63(%rcx),%r10 | ||
| 1910 | sub %rbp,%r10 | ||
| 1911 | neg %r10 | ||
| 1912 | and \$0x3c0,%r10 | ||
| 1913 | sub %r10,%rbp | ||
| 1914 | |||
| 1915 | xchg %rsp,%rbp | ||
| 1916 | #add \$8,%rsp # reserve for return address! | ||
| 1917 | mov %rbp,$_rsp # save %rsp | ||
| 1918 | .Lcbc_slow_body: | ||
| 1919 | #mov %rdi,$_inp # save copy of inp | ||
| 1920 | #mov %rsi,$_out # save copy of out | ||
| 1921 | #mov %rdx,$_len # save copy of len | ||
| 1922 | #mov %rcx,$_key # save copy of key | ||
| 1923 | mov %r8,$_ivp # save copy of ivp | ||
| 1924 | mov %r8,%rbp # rearrange input arguments | ||
| 1925 | mov %r9,%rbx | ||
| 1926 | mov %rsi,$out | ||
| 1927 | mov %rdi,$inp | ||
| 1928 | mov %rcx,$key | ||
| 1929 | mov %rdx,%r10 | ||
| 1930 | |||
| 1931 | mov 240($key),%eax | ||
| 1932 | mov $key,$keyp # save key pointer | ||
| 1933 | shl \$4,%eax | ||
| 1934 | lea ($key,%rax),%rax | ||
| 1935 | mov %rax,$keyend | ||
| 1936 | |||
| 1937 | # pick Te4 copy which can't "overlap" with stack frame or key schedule | ||
| 1938 | lea 2048($sbox),$sbox | ||
| 1939 | lea 768-8(%rsp),%rax | ||
| 1940 | sub $sbox,%rax | ||
| 1941 | and \$0x300,%rax | ||
| 1942 | lea ($sbox,%rax),$sbox | ||
| 1943 | |||
| 1944 | cmp \$0,%rbx | ||
| 1945 | je .LSLOW_DECRYPT | ||
| 1946 | |||
| 1947 | #--------------------------- SLOW ENCRYPT ---------------------------# | ||
| 1948 | test \$-16,%r10 # check upon length | ||
| 1949 | mov 0(%rbp),$s0 # load iv | ||
| 1950 | mov 4(%rbp),$s1 | ||
| 1951 | mov 8(%rbp),$s2 | ||
| 1952 | mov 12(%rbp),$s3 | ||
| 1953 | jz .Lcbc_slow_enc_tail # short input... | ||
| 1954 | |||
| 1955 | .align 4 | ||
| 1956 | .Lcbc_slow_enc_loop: | ||
| 1957 | xor 0($inp),$s0 | ||
| 1958 | xor 4($inp),$s1 | ||
| 1959 | xor 8($inp),$s2 | ||
| 1960 | xor 12($inp),$s3 | ||
| 1961 | mov $keyp,$key # restore key | ||
| 1962 | mov $inp,$_inp # save inp | ||
| 1963 | mov $out,$_out # save out | ||
| 1964 | mov %r10,$_len # save len | ||
| 1965 | |||
| 1966 | call _x86_64_AES_encrypt_compact | ||
| 1967 | |||
| 1968 | mov $_inp,$inp # restore inp | ||
| 1969 | mov $_out,$out # restore out | ||
| 1970 | mov $_len,%r10 # restore len | ||
| 1971 | mov $s0,0($out) | ||
| 1972 | mov $s1,4($out) | ||
| 1973 | mov $s2,8($out) | ||
| 1974 | mov $s3,12($out) | ||
| 1975 | |||
| 1976 | lea 16($inp),$inp | ||
| 1977 | lea 16($out),$out | ||
| 1978 | sub \$16,%r10 | ||
| 1979 | test \$-16,%r10 | ||
| 1980 | jnz .Lcbc_slow_enc_loop | ||
| 1981 | test \$15,%r10 | ||
| 1982 | jnz .Lcbc_slow_enc_tail | ||
| 1983 | mov $_ivp,%rbp # restore ivp | ||
| 1984 | mov $s0,0(%rbp) # save ivec | ||
| 1985 | mov $s1,4(%rbp) | ||
| 1986 | mov $s2,8(%rbp) | ||
| 1987 | mov $s3,12(%rbp) | ||
| 1988 | |||
| 1989 | jmp .Lcbc_exit | ||
| 1990 | |||
| 1991 | .align 4 | ||
| 1992 | .Lcbc_slow_enc_tail: | ||
| 1993 | mov %rax,%r11 | ||
| 1994 | mov %rcx,%r12 | ||
| 1995 | mov %r10,%rcx | ||
| 1996 | mov $inp,%rsi | ||
| 1997 | mov $out,%rdi | ||
| 1998 | .long 0x9066A4F3 # rep movsb | ||
| 1999 | mov \$16,%rcx # zero tail | ||
| 2000 | sub %r10,%rcx | ||
| 2001 | xor %rax,%rax | ||
| 2002 | .long 0x9066AAF3 # rep stosb | ||
| 2003 | mov $out,$inp # this is not a mistake! | ||
| 2004 | mov \$16,%r10 # len=16 | ||
| 2005 | mov %r11,%rax | ||
| 2006 | mov %r12,%rcx | ||
| 2007 | jmp .Lcbc_slow_enc_loop # one more spin... | ||
| 2008 | #--------------------------- SLOW DECRYPT ---------------------------# | ||
| 2009 | .align 16 | ||
| 2010 | .LSLOW_DECRYPT: | ||
| 2011 | shr \$3,%rax | ||
| 2012 | add %rax,$sbox # recall "magic" constants! | ||
| 2013 | |||
| 2014 | mov 0(%rbp),%r11 # copy iv to stack | ||
| 2015 | mov 8(%rbp),%r12 | ||
| 2016 | mov %r11,0+$ivec | ||
| 2017 | mov %r12,8+$ivec | ||
| 2018 | |||
| 2019 | .align 4 | ||
| 2020 | .Lcbc_slow_dec_loop: | ||
| 2021 | mov 0($inp),$s0 # load input | ||
| 2022 | mov 4($inp),$s1 | ||
| 2023 | mov 8($inp),$s2 | ||
| 2024 | mov 12($inp),$s3 | ||
| 2025 | mov $keyp,$key # restore key | ||
| 2026 | mov $inp,$_inp # save inp | ||
| 2027 | mov $out,$_out # save out | ||
| 2028 | mov %r10,$_len # save len | ||
| 2029 | |||
| 2030 | call _x86_64_AES_decrypt_compact | ||
| 2031 | |||
| 2032 | mov $_inp,$inp # restore inp | ||
| 2033 | mov $_out,$out # restore out | ||
| 2034 | mov $_len,%r10 | ||
| 2035 | xor 0+$ivec,$s0 | ||
| 2036 | xor 4+$ivec,$s1 | ||
| 2037 | xor 8+$ivec,$s2 | ||
| 2038 | xor 12+$ivec,$s3 | ||
| 2039 | |||
| 2040 | mov 0($inp),%r11 # load input | ||
| 2041 | mov 8($inp),%r12 | ||
| 2042 | sub \$16,%r10 | ||
| 2043 | jc .Lcbc_slow_dec_partial | ||
| 2044 | jz .Lcbc_slow_dec_done | ||
| 2045 | |||
| 2046 | mov %r11,0+$ivec # copy input to iv | ||
| 2047 | mov %r12,8+$ivec | ||
| 2048 | |||
| 2049 | mov $s0,0($out) # save output [can zap input] | ||
| 2050 | mov $s1,4($out) | ||
| 2051 | mov $s2,8($out) | ||
| 2052 | mov $s3,12($out) | ||
| 2053 | |||
| 2054 | lea 16($inp),$inp | ||
| 2055 | lea 16($out),$out | ||
| 2056 | jmp .Lcbc_slow_dec_loop | ||
| 2057 | .Lcbc_slow_dec_done: | ||
| 2058 | mov $_ivp,%rdi | ||
| 2059 | mov %r11,0(%rdi) # copy iv back to user | ||
| 2060 | mov %r12,8(%rdi) | ||
| 2061 | |||
| 2062 | mov $s0,0($out) # save output [can zap input] | ||
| 2063 | mov $s1,4($out) | ||
| 2064 | mov $s2,8($out) | ||
| 2065 | mov $s3,12($out) | ||
| 2066 | |||
| 2067 | jmp .Lcbc_exit | ||
| 2068 | |||
| 2069 | .align 4 | ||
| 2070 | .Lcbc_slow_dec_partial: | ||
| 2071 | mov $_ivp,%rdi | ||
| 2072 | mov %r11,0(%rdi) # copy iv back to user | ||
| 2073 | mov %r12,8(%rdi) | ||
| 2074 | |||
| 2075 | mov $s0,0+$ivec # save output to stack | ||
| 2076 | mov $s1,4+$ivec | ||
| 2077 | mov $s2,8+$ivec | ||
| 2078 | mov $s3,12+$ivec | ||
| 2079 | |||
| 2080 | mov $out,%rdi | ||
| 2081 | lea $ivec,%rsi | ||
| 2082 | lea 16(%r10),%rcx | ||
| 2083 | .long 0x9066A4F3 # rep movsb | ||
| 2084 | jmp .Lcbc_exit | ||
| 2085 | |||
| 2086 | .align 16 | ||
| 2087 | .Lcbc_exit: | ||
| 2088 | mov $_rsp,%rsi | ||
| 2089 | mov (%rsi),%r15 | ||
| 2090 | mov 8(%rsi),%r14 | ||
| 2091 | mov 16(%rsi),%r13 | ||
| 2092 | mov 24(%rsi),%r12 | ||
| 2093 | mov 32(%rsi),%rbp | ||
| 2094 | mov 40(%rsi),%rbx | ||
| 2095 | lea 48(%rsi),%rsp | ||
| 2096 | .Lcbc_popfq: | ||
| 2097 | popfq | ||
| 2098 | .Lcbc_epilogue: | ||
| 2099 | ret | ||
| 1393 | .size AES_cbc_encrypt,.-AES_cbc_encrypt | 2100 | .size AES_cbc_encrypt,.-AES_cbc_encrypt |
| 1394 | ___ | 2101 | ___ |
| 1395 | } | 2102 | } |
| 1396 | 2103 | ||
| 1397 | $code.=<<___; | 2104 | $code.=<<___; |
| 1398 | .globl AES_Te | ||
| 1399 | .align 64 | 2105 | .align 64 |
| 1400 | AES_Te: | 2106 | .LAES_Te: |
| 1401 | ___ | 2107 | ___ |
| 1402 | &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); | 2108 | &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); |
| 1403 | &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); | 2109 | &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); |
| @@ -1463,16 +2169,149 @@ ___ | |||
| 1463 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); | 2169 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); |
| 1464 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); | 2170 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); |
| 1465 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); | 2171 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); |
| 2172 | |||
| 2173 | #Te4 # four copies of Te4 to choose from to avoid L1 aliasing | ||
| 2174 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
| 2175 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
| 2176 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
| 2177 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
| 2178 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
| 2179 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
| 2180 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
| 2181 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
| 2182 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
| 2183 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
| 2184 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
| 2185 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
| 2186 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
| 2187 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
| 2188 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
| 2189 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
| 2190 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
| 2191 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
| 2192 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
| 2193 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
| 2194 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
| 2195 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
| 2196 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
| 2197 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
| 2198 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
| 2199 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
| 2200 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
| 2201 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
| 2202 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
| 2203 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
| 2204 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
| 2205 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
| 2206 | |||
| 2207 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
| 2208 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
| 2209 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
| 2210 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
| 2211 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
| 2212 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
| 2213 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
| 2214 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
| 2215 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
| 2216 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
| 2217 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
| 2218 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
| 2219 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
| 2220 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
| 2221 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
| 2222 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
| 2223 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
| 2224 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
| 2225 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
| 2226 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
| 2227 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
| 2228 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
| 2229 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
| 2230 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
| 2231 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
| 2232 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
| 2233 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
| 2234 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
| 2235 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
| 2236 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
| 2237 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
| 2238 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
| 2239 | |||
| 2240 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
| 2241 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
| 2242 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
| 2243 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
| 2244 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
| 2245 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
| 2246 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
| 2247 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
| 2248 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
| 2249 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
| 2250 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
| 2251 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
| 2252 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
| 2253 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
| 2254 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
| 2255 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
| 2256 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
| 2257 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
| 2258 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
| 2259 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
| 2260 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
| 2261 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
| 2262 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
| 2263 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
| 2264 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
| 2265 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
| 2266 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
| 2267 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
| 2268 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
| 2269 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
| 2270 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
| 2271 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
| 2272 | |||
| 2273 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
| 2274 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
| 2275 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
| 2276 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
| 2277 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
| 2278 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
| 2279 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
| 2280 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
| 2281 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
| 2282 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
| 2283 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
| 2284 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
| 2285 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
| 2286 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
| 2287 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
| 2288 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
| 2289 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
| 2290 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
| 2291 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
| 2292 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
| 2293 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
| 2294 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
| 2295 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
| 2296 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
| 2297 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
| 2298 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
| 2299 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
| 2300 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
| 2301 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
| 2302 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
| 2303 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
| 2304 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
| 1466 | #rcon: | 2305 | #rcon: |
| 1467 | $code.=<<___; | 2306 | $code.=<<___; |
| 1468 | .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 | 2307 | .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 |
| 1469 | .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 | 2308 | .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 |
| 1470 | .long 0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0 | 2309 | .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080 |
| 2310 | .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b | ||
| 1471 | ___ | 2311 | ___ |
| 1472 | $code.=<<___; | 2312 | $code.=<<___; |
| 1473 | .globl AES_Td | ||
| 1474 | .align 64 | 2313 | .align 64 |
| 1475 | AES_Td: | 2314 | .LAES_Td: |
| 1476 | ___ | 2315 | ___ |
| 1477 | &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); | 2316 | &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); |
| 1478 | &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); | 2317 | &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); |
| @@ -1538,7 +2377,116 @@ ___ | |||
| 1538 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); | 2377 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); |
| 1539 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); | 2378 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); |
| 1540 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); | 2379 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); |
| 1541 | #Td4: | 2380 | |
| 2381 | #Td4: # four copies of Td4 to choose from to avoid L1 aliasing | ||
| 2382 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
| 2383 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
| 2384 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
| 2385 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
| 2386 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
| 2387 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
| 2388 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
| 2389 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
| 2390 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
| 2391 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
| 2392 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
| 2393 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
| 2394 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
| 2395 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
| 2396 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
| 2397 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
| 2398 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
| 2399 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
| 2400 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
| 2401 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
| 2402 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
| 2403 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
| 2404 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
| 2405 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
| 2406 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
| 2407 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
| 2408 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
| 2409 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
| 2410 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
| 2411 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
| 2412 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
| 2413 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
| 2414 | $code.=<<___; | ||
| 2415 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
| 2416 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
| 2417 | ___ | ||
| 2418 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
| 2419 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
| 2420 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
| 2421 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
| 2422 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
| 2423 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
| 2424 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
| 2425 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
| 2426 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
| 2427 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
| 2428 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
| 2429 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
| 2430 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
| 2431 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
| 2432 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
| 2433 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
| 2434 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
| 2435 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
| 2436 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
| 2437 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
| 2438 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
| 2439 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
| 2440 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
| 2441 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
| 2442 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
| 2443 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
| 2444 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
| 2445 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
| 2446 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
| 2447 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
| 2448 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
| 2449 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
| 2450 | $code.=<<___; | ||
| 2451 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
| 2452 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
| 2453 | ___ | ||
| 2454 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
| 2455 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
| 2456 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
| 2457 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
| 2458 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
| 2459 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
| 2460 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
| 2461 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
| 2462 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
| 2463 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
| 2464 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
| 2465 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
| 2466 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
| 2467 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
| 2468 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
| 2469 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
| 2470 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
| 2471 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
| 2472 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
| 2473 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
| 2474 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
| 2475 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
| 2476 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
| 2477 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
| 2478 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
| 2479 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
| 2480 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
| 2481 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
| 2482 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
| 2483 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
| 2484 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
| 2485 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
| 2486 | $code.=<<___; | ||
| 2487 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
| 2488 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
| 2489 | ___ | ||
| 1542 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | 2490 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
| 1543 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | 2491 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
| 1544 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | 2492 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
| @@ -1571,6 +2519,288 @@ ___ | |||
| 1571 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | 2519 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); |
| 1572 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | 2520 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); |
| 1573 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | 2521 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); |
| 2522 | $code.=<<___; | ||
| 2523 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
| 2524 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
| 2525 | .asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 2526 | .align 64 | ||
| 2527 | ___ | ||
| 2528 | |||
| 2529 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 2530 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 2531 | if ($win64) { | ||
| 2532 | $rec="%rcx"; | ||
| 2533 | $frame="%rdx"; | ||
| 2534 | $context="%r8"; | ||
| 2535 | $disp="%r9"; | ||
| 2536 | |||
| 2537 | $code.=<<___; | ||
| 2538 | .extern __imp_RtlVirtualUnwind | ||
| 2539 | .type block_se_handler,\@abi-omnipotent | ||
| 2540 | .align 16 | ||
| 2541 | block_se_handler: | ||
| 2542 | push %rsi | ||
| 2543 | push %rdi | ||
| 2544 | push %rbx | ||
| 2545 | push %rbp | ||
| 2546 | push %r12 | ||
| 2547 | push %r13 | ||
| 2548 | push %r14 | ||
| 2549 | push %r15 | ||
| 2550 | pushfq | ||
| 2551 | sub \$64,%rsp | ||
| 2552 | |||
| 2553 | mov 120($context),%rax # pull context->Rax, used as the stack address if Rip<prologue label | ||
| 2554 | mov 248($context),%rbx # pull context->Rip | ||
| 2555 | |||
| 2556 | mov 8($disp),%rsi # disp->ImageBase | ||
| 2557 | mov 56($disp),%r11 # disp->HandlerData, pair of 32-bit offsets from ImageBase | ||
| 2558 | |||
| 2559 | mov 0(%r11),%r10d # HandlerData[0], 32-bit offset | ||
| 2560 | lea (%rsi,%r10),%r10 # ImageBase+HandlerData[0] = prologue label | ||
| 2561 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 2562 | jb .Lin_block_prologue | ||
| 2563 | |||
| 2564 | mov 152($context),%rax # pull context->Rsp | ||
| 2565 | |||
| 2566 | mov 4(%r11),%r10d # HandlerData[1], 32-bit offset | ||
| 2567 | lea (%rsi,%r10),%r10 # ImageBase+HandlerData[1] = epilogue label | ||
| 2568 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 2569 | jae .Lin_block_prologue | ||
| 2570 | |||
| 2571 | mov 24(%rax),%rax # pull saved real stack pointer from 24(Rsp) | ||
| 2572 | lea 48(%rax),%rax # adjust past the six 8-byte register slots read below | ||
| 2573 | |||
| 2574 | mov -8(%rax),%rbx | ||
| 2575 | mov -16(%rax),%rbp | ||
| 2576 | mov -24(%rax),%r12 | ||
| 2577 | mov -32(%rax),%r13 | ||
| 2578 | mov -40(%rax),%r14 | ||
| 2579 | mov -48(%rax),%r15 | ||
| 2580 | mov %rbx,144($context) # restore context->Rbx | ||
| 2581 | mov %rbp,160($context) # restore context->Rbp | ||
| 2582 | mov %r12,216($context) # restore context->R12 | ||
| 2583 | mov %r13,224($context) # restore context->R13 | ||
| 2584 | mov %r14,232($context) # restore context->R14 | ||
| 2585 | mov %r15,240($context) # restore context->R15 | ||
| 2586 | |||
| 2587 | .Lin_block_prologue: | ||
| 2588 | mov 8(%rax),%rdi | ||
| 2589 | mov 16(%rax),%rsi | ||
| 2590 | mov %rax,152($context) # restore context->Rsp from the address selected above | ||
| 2591 | mov %rsi,168($context) # restore context->Rsi, loaded from 16 above that address | ||
| 2592 | mov %rdi,176($context) # restore context->Rdi, loaded from 8 above that address | ||
| 2593 | |||
| 2594 | jmp .Lcommon_seh_exit | ||
| 2595 | .size block_se_handler,.-block_se_handler | ||
| 2596 | |||
| 2597 | .type key_se_handler,\@abi-omnipotent | ||
| 2598 | .align 16 | ||
| 2599 | key_se_handler: | ||
| 2600 | push %rsi | ||
| 2601 | push %rdi | ||
| 2602 | push %rbx | ||
| 2603 | push %rbp | ||
| 2604 | push %r12 | ||
| 2605 | push %r13 | ||
| 2606 | push %r14 | ||
| 2607 | push %r15 | ||
| 2608 | pushfq | ||
| 2609 | sub \$64,%rsp | ||
| 2610 | |||
| 2611 | mov 120($context),%rax # pull context->Rax, used as the stack address if Rip<prologue label | ||
| 2612 | mov 248($context),%rbx # pull context->Rip | ||
| 2613 | |||
| 2614 | mov 8($disp),%rsi # disp->ImageBase | ||
| 2615 | mov 56($disp),%r11 # disp->HandlerData, pair of 32-bit offsets from ImageBase | ||
| 2616 | |||
| 2617 | mov 0(%r11),%r10d # HandlerData[0], 32-bit offset | ||
| 2618 | lea (%rsi,%r10),%r10 # ImageBase+HandlerData[0] = prologue label | ||
| 2619 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 2620 | jb .Lin_key_prologue | ||
| 2621 | |||
| 2622 | mov 152($context),%rax # pull context->Rsp, the six registers are read from 8..48 above it | ||
| 2623 | |||
| 2624 | mov 4(%r11),%r10d # HandlerData[1], 32-bit offset | ||
| 2625 | lea (%rsi,%r10),%r10 # ImageBase+HandlerData[1] = epilogue label | ||
| 2626 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 2627 | jae .Lin_key_prologue | ||
| 2628 | |||
| 2629 | lea 56(%rax),%rax | ||
| 2630 | |||
| 2631 | mov -8(%rax),%rbx | ||
| 2632 | mov -16(%rax),%rbp | ||
| 2633 | mov -24(%rax),%r12 | ||
| 2634 | mov -32(%rax),%r13 | ||
| 2635 | mov -40(%rax),%r14 | ||
| 2636 | mov -48(%rax),%r15 | ||
| 2637 | mov %rbx,144($context) # restore context->Rbx | ||
| 2638 | mov %rbp,160($context) # restore context->Rbp | ||
| 2639 | mov %r12,216($context) # restore context->R12 | ||
| 2640 | mov %r13,224($context) # restore context->R13 | ||
| 2641 | mov %r14,232($context) # restore context->R14 | ||
| 2642 | mov %r15,240($context) # restore context->R15 | ||
| 2643 | |||
| 2644 | .Lin_key_prologue: | ||
| 2645 | mov 8(%rax),%rdi | ||
| 2646 | mov 16(%rax),%rsi | ||
| 2647 | mov %rax,152($context) # restore context->Rsp from the address selected above | ||
| 2648 | mov %rsi,168($context) # restore context->Rsi, loaded from 16 above that address | ||
| 2649 | mov %rdi,176($context) # restore context->Rdi, loaded from 8 above that address | ||
| 2650 | |||
| 2651 | jmp .Lcommon_seh_exit | ||
| 2652 | .size key_se_handler,.-key_se_handler | ||
| 2653 | |||
| 2654 | .type cbc_se_handler,\@abi-omnipotent | ||
| 2655 | .align 16 | ||
| 2656 | cbc_se_handler: | ||
| 2657 | push %rsi | ||
| 2658 | push %rdi | ||
| 2659 | push %rbx | ||
| 2660 | push %rbp | ||
| 2661 | push %r12 | ||
| 2662 | push %r13 | ||
| 2663 | push %r14 | ||
| 2664 | push %r15 | ||
| 2665 | pushfq | ||
| 2666 | sub \$64,%rsp | ||
| 2667 | |||
| 2668 | mov 120($context),%rax # pull context->Rax, kept as base address on the prologue and frame-setup paths | ||
| 2669 | mov 248($context),%rbx # pull context->Rip | ||
| 2670 | |||
| 2671 | lea .Lcbc_prologue(%rip),%r10 | ||
| 2672 | cmp %r10,%rbx # context->Rip<.Lcbc_prologue, only Rsp/Rsi/Rdi are restored | ||
| 2673 | jb .Lin_cbc_prologue | ||
| 2674 | |||
| 2675 | lea .Lcbc_fast_body(%rip),%r10 | ||
| 2676 | cmp %r10,%rbx # context->Rip<.Lcbc_fast_body, registers are read relative to Rax | ||
| 2677 | jb .Lin_cbc_frame_setup | ||
| 2678 | |||
| 2679 | lea .Lcbc_slow_prologue(%rip),%r10 | ||
| 2680 | cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue, handled as body | ||
| 2681 | jb .Lin_cbc_body | ||
| 2682 | |||
| 2683 | lea .Lcbc_slow_body(%rip),%r10 | ||
| 2684 | cmp %r10,%rbx # context->Rip<.Lcbc_slow_body, registers are read relative to Rax | ||
| 2685 | jb .Lin_cbc_frame_setup | ||
| 2686 | |||
| 2687 | .Lin_cbc_body: | ||
| 2688 | mov 152($context),%rax # pull context->Rsp | ||
| 2689 | |||
| 2690 | lea .Lcbc_epilogue(%rip),%r10 | ||
| 2691 | cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue, Rsp is used unadjusted | ||
| 2692 | jae .Lin_cbc_prologue | ||
| 2693 | |||
| 2694 | lea 8(%rax),%rax | ||
| 2695 | |||
| 2696 | lea .Lcbc_popfq(%rip),%r10 | ||
| 2697 | cmp %r10,%rbx # context->Rip>=.Lcbc_popfq, Rsp+8 is used | ||
| 2698 | jae .Lin_cbc_prologue | ||
| 2699 | |||
| 2700 | mov `16-8`(%rax),%rax # biased $_rsp | ||
| 2701 | lea 56(%rax),%rax | ||
| 2702 | |||
| 2703 | .Lin_cbc_frame_setup: | ||
| 2704 | mov -16(%rax),%rbx | ||
| 2705 | mov -24(%rax),%rbp | ||
| 2706 | mov -32(%rax),%r12 | ||
| 2707 | mov -40(%rax),%r13 | ||
| 2708 | mov -48(%rax),%r14 | ||
| 2709 | mov -56(%rax),%r15 | ||
| 2710 | mov %rbx,144($context) # restore context->Rbx | ||
| 2711 | mov %rbp,160($context) # restore context->Rbp | ||
| 2712 | mov %r12,216($context) # restore context->R12 | ||
| 2713 | mov %r13,224($context) # restore context->R13 | ||
| 2714 | mov %r14,232($context) # restore context->R14 | ||
| 2715 | mov %r15,240($context) # restore context->R15 | ||
| 2716 | |||
| 2717 | .Lin_cbc_prologue: | ||
| 2718 | mov 8(%rax),%rdi | ||
| 2719 | mov 16(%rax),%rsi | ||
| 2720 | mov %rax,152($context) # restore context->Rsp from the address selected above | ||
| 2721 | mov %rsi,168($context) # restore context->Rsi, loaded from 16 above that address | ||
| 2722 | mov %rdi,176($context) # restore context->Rdi, loaded from 8 above that address | ||
| 2723 | |||
| 2724 | .Lcommon_seh_exit: | ||
| 2725 | |||
| 2726 | mov 40($disp),%rdi # disp->ContextRecord, destination of the copy | ||
| 2727 | mov $context,%rsi # context, source of the copy | ||
| 2728 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) in quadwords | ||
| 2729 | .long 0xa548f3fc # cld; rep movsq | ||
| 2730 | |||
| 2731 | mov $disp,%rsi | ||
| 2732 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 2733 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 2734 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 2735 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 2736 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 2737 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 2738 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 2739 | mov %r10,32(%rsp) # arg5, disp->ContextRecord | ||
| 2740 | mov %r11,40(%rsp) # arg6, &disp->HandlerData | ||
| 2741 | mov %r12,48(%rsp) # arg7, &disp->EstablisherFrame | ||
| 2742 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 2743 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 2744 | |||
| 2745 | mov \$1,%eax # return ExceptionContinueSearch | ||
| 2746 | add \$64,%rsp | ||
| 2747 | popfq | ||
| 2748 | pop %r15 | ||
| 2749 | pop %r14 | ||
| 2750 | pop %r13 | ||
| 2751 | pop %r12 | ||
| 2752 | pop %rbp | ||
| 2753 | pop %rbx | ||
| 2754 | pop %rdi | ||
| 2755 | pop %rsi | ||
| 2756 | ret | ||
| 2757 | .size cbc_se_handler,.-cbc_se_handler | ||
| 2758 | |||
| 2759 | .section .pdata | ||
| 2760 | .align 4 | ||
| 2761 | .rva .LSEH_begin_AES_encrypt | ||
| 2762 | .rva .LSEH_end_AES_encrypt | ||
| 2763 | .rva .LSEH_info_AES_encrypt | ||
| 2764 | |||
| 2765 | .rva .LSEH_begin_AES_decrypt | ||
| 2766 | .rva .LSEH_end_AES_decrypt | ||
| 2767 | .rva .LSEH_info_AES_decrypt | ||
| 2768 | |||
| 2769 | .rva .LSEH_begin_AES_set_encrypt_key | ||
| 2770 | .rva .LSEH_end_AES_set_encrypt_key | ||
| 2771 | .rva .LSEH_info_AES_set_encrypt_key | ||
| 2772 | |||
| 2773 | .rva .LSEH_begin_AES_set_decrypt_key | ||
| 2774 | .rva .LSEH_end_AES_set_decrypt_key | ||
| 2775 | .rva .LSEH_info_AES_set_decrypt_key | ||
| 2776 | |||
| 2777 | .rva .LSEH_begin_AES_cbc_encrypt | ||
| 2778 | .rva .LSEH_end_AES_cbc_encrypt | ||
| 2779 | .rva .LSEH_info_AES_cbc_encrypt | ||
| 2780 | |||
| 2781 | .section .xdata | ||
| 2782 | .align 8 | ||
| 2783 | .LSEH_info_AES_encrypt: | ||
| 2784 | .byte 9,0,0,0 | ||
| 2785 | .rva block_se_handler | ||
| 2786 | .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]: prologue and epilogue labels compared against Rip by block_se_handler | ||
| 2787 | .LSEH_info_AES_decrypt: | ||
| 2788 | .byte 9,0,0,0 | ||
| 2789 | .rva block_se_handler | ||
| 2790 | .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]: prologue and epilogue labels compared against Rip by block_se_handler | ||
| 2791 | .LSEH_info_AES_set_encrypt_key: | ||
| 2792 | .byte 9,0,0,0 | ||
| 2793 | .rva key_se_handler | ||
| 2794 | .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]: prologue and epilogue labels compared against Rip by key_se_handler | ||
| 2795 | .LSEH_info_AES_set_decrypt_key: | ||
| 2796 | .byte 9,0,0,0 | ||
| 2797 | .rva key_se_handler | ||
| 2798 | .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]: prologue and epilogue labels compared against Rip by key_se_handler | ||
| 2799 | .LSEH_info_AES_cbc_encrypt: | ||
| 2800 | .byte 9,0,0,0 | ||
| 2801 | .rva cbc_se_handler | ||
| 2802 | ___ | ||
| 2803 | } | ||
| 1574 | 2804 | ||
| 1575 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | 2805 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; |
| 1576 | 2806 | ||
