diff options
| author | djm <> | 2010-10-01 22:59:01 +0000 |
|---|---|---|
| committer | djm <> | 2010-10-01 22:59:01 +0000 |
| commit | 8922d4bc4a8b8893d72a48deb2cdf58215f98505 (patch) | |
| tree | 939b752540947d33507b3acc48d76a8bfb7c3dc3 /src/lib/libcrypto/aes | |
| parent | 76262f7bf9262f965142b1b2b2105cb279c5c696 (diff) | |
| download | openbsd-8922d4bc4a8b8893d72a48deb2cdf58215f98505.tar.gz openbsd-8922d4bc4a8b8893d72a48deb2cdf58215f98505.tar.bz2 openbsd-8922d4bc4a8b8893d72a48deb2cdf58215f98505.zip | |
resolve conflicts, fix local changes
Diffstat (limited to 'src/lib/libcrypto/aes')
| -rw-r--r-- | src/lib/libcrypto/aes/Makefile.ssl | 103 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/aes.h | 28 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/aes_cbc.c | 82 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/aes_cfb.c | 160 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/aes_core.c | 209 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/aes_ctr.c | 90 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/aes_ofb.c | 94 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/asm/aes-586.pl | 2403 | ||||
| -rwxr-xr-x | src/lib/libcrypto/aes/asm/aes-x86_64.pl | 2 |
9 files changed, 2163 insertions, 1008 deletions
diff --git a/src/lib/libcrypto/aes/Makefile.ssl b/src/lib/libcrypto/aes/Makefile.ssl deleted file mode 100644 index f353aeb697..0000000000 --- a/src/lib/libcrypto/aes/Makefile.ssl +++ /dev/null | |||
| @@ -1,103 +0,0 @@ | |||
| 1 | # | ||
| 2 | # crypto/aes/Makefile | ||
| 3 | # | ||
| 4 | |||
| 5 | DIR= aes | ||
| 6 | TOP= ../.. | ||
| 7 | CC= cc | ||
| 8 | CPP= $(CC) -E | ||
| 9 | INCLUDES= | ||
| 10 | CFLAG=-g | ||
| 11 | INSTALL_PREFIX= | ||
| 12 | OPENSSLDIR= /usr/local/ssl | ||
| 13 | INSTALLTOP= /usr/local/ssl | ||
| 14 | MAKE= make -f Makefile.ssl | ||
| 15 | MAKEDEPPROG= makedepend | ||
| 16 | MAKEDEPEND= $(TOP)/util/domd $(TOP) -MD $(MAKEDEPPROG) | ||
| 17 | MAKEFILE= Makefile.ssl | ||
| 18 | AR= ar r | ||
| 19 | |||
| 20 | # CFLAGS= -mpentiumpro $(INCLUDES) $(CFLAG) -O3 -fexpensive-optimizations -funroll-loops -fforce-addr | ||
| 21 | CFLAGS= $(INCLUDES) $(CFLAG) | ||
| 22 | |||
| 23 | GENERAL=Makefile | ||
| 24 | #TEST=aestest.c | ||
| 25 | TEST= | ||
| 26 | APPS= | ||
| 27 | |||
| 28 | LIB=$(TOP)/libcrypto.a | ||
| 29 | LIBSRC=aes_core.c aes_misc.c aes_ecb.c aes_cbc.c aes_cfb.c aes_ofb.c aes_ctr.c | ||
| 30 | LIBOBJ=aes_core.o aes_misc.o aes_ecb.o aes_cbc.o aes_cfb.o aes_ofb.o aes_ctr.o | ||
| 31 | |||
| 32 | SRC= $(LIBSRC) | ||
| 33 | |||
| 34 | EXHEADER= aes.h | ||
| 35 | HEADER= aes_locl.h $(EXHEADER) | ||
| 36 | |||
| 37 | ALL= $(GENERAL) $(SRC) $(HEADER) | ||
| 38 | |||
| 39 | top: | ||
| 40 | (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all) | ||
| 41 | |||
| 42 | all: lib | ||
| 43 | |||
| 44 | lib: $(LIBOBJ) | ||
| 45 | $(AR) $(LIB) $(LIBOBJ) | ||
| 46 | $(RANLIB) $(LIB) || echo Never mind. | ||
| 47 | @touch lib | ||
| 48 | |||
| 49 | $(LIBOBJ): $(LIBSRC) | ||
| 50 | |||
| 51 | files: | ||
| 52 | $(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO | ||
| 53 | |||
| 54 | links: | ||
| 55 | @sh $(TOP)/util/point.sh Makefile.ssl Makefile | ||
| 56 | @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER) | ||
| 57 | @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST) | ||
| 58 | @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS) | ||
| 59 | |||
| 60 | install: installs | ||
| 61 | |||
| 62 | installs: | ||
| 63 | @for i in $(EXHEADER) ; \ | ||
| 64 | do \ | ||
| 65 | (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \ | ||
| 66 | chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \ | ||
| 67 | done; | ||
| 68 | |||
| 69 | tags: | ||
| 70 | ctags $(SRC) | ||
| 71 | |||
| 72 | tests: | ||
| 73 | |||
| 74 | lint: | ||
| 75 | lint -DLINT $(INCLUDES) $(SRC)>fluff | ||
| 76 | |||
| 77 | depend: | ||
| 78 | $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC) | ||
| 79 | |||
| 80 | dclean: | ||
| 81 | $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new | ||
| 82 | mv -f Makefile.new $(MAKEFILE) | ||
| 83 | |||
| 84 | clean: | ||
| 85 | rm -f *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff | ||
| 86 | |||
| 87 | # DO NOT DELETE THIS LINE -- make depend depends on it. | ||
| 88 | |||
| 89 | aes_cbc.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
| 90 | aes_cbc.o: ../../include/openssl/opensslconf.h aes_cbc.c aes_locl.h | ||
| 91 | aes_cfb.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
| 92 | aes_cfb.o: ../../include/openssl/opensslconf.h aes_cfb.c aes_locl.h | ||
| 93 | aes_core.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
| 94 | aes_core.o: ../../include/openssl/opensslconf.h aes_core.c aes_locl.h | ||
| 95 | aes_ctr.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
| 96 | aes_ctr.o: ../../include/openssl/opensslconf.h aes_ctr.c aes_locl.h | ||
| 97 | aes_ecb.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
| 98 | aes_ecb.o: ../../include/openssl/opensslconf.h aes_ecb.c aes_locl.h | ||
| 99 | aes_misc.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
| 100 | aes_misc.o: ../../include/openssl/opensslconf.h | ||
| 101 | aes_misc.o: ../../include/openssl/opensslv.h aes_locl.h aes_misc.c | ||
| 102 | aes_ofb.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h | ||
| 103 | aes_ofb.o: ../../include/openssl/opensslconf.h aes_locl.h aes_ofb.c | ||
diff --git a/src/lib/libcrypto/aes/aes.h b/src/lib/libcrypto/aes/aes.h index 450f2b4051..d2c99730fe 100644 --- a/src/lib/libcrypto/aes/aes.h +++ b/src/lib/libcrypto/aes/aes.h | |||
| @@ -58,6 +58,8 @@ | |||
| 58 | #error AES is disabled. | 58 | #error AES is disabled. |
| 59 | #endif | 59 | #endif |
| 60 | 60 | ||
| 61 | #include <stddef.h> | ||
| 62 | |||
| 61 | #define AES_ENCRYPT 1 | 63 | #define AES_ENCRYPT 1 |
| 62 | #define AES_DECRYPT 0 | 64 | #define AES_DECRYPT 0 |
| 63 | 65 | ||
| @@ -66,10 +68,6 @@ | |||
| 66 | #define AES_MAXNR 14 | 68 | #define AES_MAXNR 14 |
| 67 | #define AES_BLOCK_SIZE 16 | 69 | #define AES_BLOCK_SIZE 16 |
| 68 | 70 | ||
| 69 | #ifdef OPENSSL_FIPS | ||
| 70 | #define FIPS_AES_SIZE_T int | ||
| 71 | #endif | ||
| 72 | |||
| 73 | #ifdef __cplusplus | 71 | #ifdef __cplusplus |
| 74 | extern "C" { | 72 | extern "C" { |
| 75 | #endif | 73 | #endif |
| @@ -100,37 +98,32 @@ void AES_decrypt(const unsigned char *in, unsigned char *out, | |||
| 100 | void AES_ecb_encrypt(const unsigned char *in, unsigned char *out, | 98 | void AES_ecb_encrypt(const unsigned char *in, unsigned char *out, |
| 101 | const AES_KEY *key, const int enc); | 99 | const AES_KEY *key, const int enc); |
| 102 | void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | 100 | void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, |
| 103 | const unsigned long length, const AES_KEY *key, | 101 | size_t length, const AES_KEY *key, |
| 104 | unsigned char *ivec, const int enc); | 102 | unsigned char *ivec, const int enc); |
| 105 | void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, | 103 | void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, |
| 106 | const unsigned long length, const AES_KEY *key, | 104 | size_t length, const AES_KEY *key, |
| 107 | unsigned char *ivec, int *num, const int enc); | 105 | unsigned char *ivec, int *num, const int enc); |
| 108 | void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, | 106 | void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, |
| 109 | const unsigned long length, const AES_KEY *key, | 107 | size_t length, const AES_KEY *key, |
| 110 | unsigned char *ivec, int *num, const int enc); | 108 | unsigned char *ivec, int *num, const int enc); |
| 111 | void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, | 109 | void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, |
| 112 | const unsigned long length, const AES_KEY *key, | 110 | size_t length, const AES_KEY *key, |
| 113 | unsigned char *ivec, int *num, const int enc); | 111 | unsigned char *ivec, int *num, const int enc); |
| 114 | void AES_cfbr_encrypt_block(const unsigned char *in,unsigned char *out, | ||
| 115 | const int nbits,const AES_KEY *key, | ||
| 116 | unsigned char *ivec,const int enc); | ||
| 117 | void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, | 112 | void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, |
| 118 | const unsigned long length, const AES_KEY *key, | 113 | size_t length, const AES_KEY *key, |
| 119 | unsigned char *ivec, int *num); | 114 | unsigned char *ivec, int *num); |
| 120 | void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, | 115 | void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, |
| 121 | const unsigned long length, const AES_KEY *key, | 116 | size_t length, const AES_KEY *key, |
| 122 | unsigned char ivec[AES_BLOCK_SIZE], | 117 | unsigned char ivec[AES_BLOCK_SIZE], |
| 123 | unsigned char ecount_buf[AES_BLOCK_SIZE], | 118 | unsigned char ecount_buf[AES_BLOCK_SIZE], |
| 124 | unsigned int *num); | 119 | unsigned int *num); |
| 125 | |||
| 126 | /* For IGE, see also http://www.links.org/files/openssl-ige.pdf */ | ||
| 127 | /* NB: the IV is _two_ blocks long */ | 120 | /* NB: the IV is _two_ blocks long */ |
| 128 | void AES_ige_encrypt(const unsigned char *in, unsigned char *out, | 121 | void AES_ige_encrypt(const unsigned char *in, unsigned char *out, |
| 129 | const unsigned long length, const AES_KEY *key, | 122 | size_t length, const AES_KEY *key, |
| 130 | unsigned char *ivec, const int enc); | 123 | unsigned char *ivec, const int enc); |
| 131 | /* NB: the IV is _four_ blocks long */ | 124 | /* NB: the IV is _four_ blocks long */ |
| 132 | void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out, | 125 | void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out, |
| 133 | const unsigned long length, const AES_KEY *key, | 126 | size_t length, const AES_KEY *key, |
| 134 | const AES_KEY *key2, const unsigned char *ivec, | 127 | const AES_KEY *key2, const unsigned char *ivec, |
| 135 | const int enc); | 128 | const int enc); |
| 136 | 129 | ||
| @@ -141,6 +134,7 @@ int AES_unwrap_key(AES_KEY *key, const unsigned char *iv, | |||
| 141 | unsigned char *out, | 134 | unsigned char *out, |
| 142 | const unsigned char *in, unsigned int inlen); | 135 | const unsigned char *in, unsigned int inlen); |
| 143 | 136 | ||
| 137 | |||
| 144 | #ifdef __cplusplus | 138 | #ifdef __cplusplus |
| 145 | } | 139 | } |
| 146 | #endif | 140 | #endif |
diff --git a/src/lib/libcrypto/aes/aes_cbc.c b/src/lib/libcrypto/aes/aes_cbc.c index 373864cd4b..227f75625d 100644 --- a/src/lib/libcrypto/aes/aes_cbc.c +++ b/src/lib/libcrypto/aes/aes_cbc.c | |||
| @@ -49,85 +49,15 @@ | |||
| 49 | * | 49 | * |
| 50 | */ | 50 | */ |
| 51 | 51 | ||
| 52 | #ifndef AES_DEBUG | ||
| 53 | # ifndef NDEBUG | ||
| 54 | # define NDEBUG | ||
| 55 | # endif | ||
| 56 | #endif | ||
| 57 | #include <assert.h> | ||
| 58 | |||
| 59 | #include <openssl/aes.h> | 52 | #include <openssl/aes.h> |
| 60 | #include "aes_locl.h" | 53 | #include <openssl/modes.h> |
| 61 | 54 | ||
| 62 | #if !defined(OPENSSL_FIPS_AES_ASM) | ||
| 63 | void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | 55 | void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, |
| 64 | const unsigned long length, const AES_KEY *key, | 56 | size_t len, const AES_KEY *key, |
| 65 | unsigned char *ivec, const int enc) { | 57 | unsigned char *ivec, const int enc) { |
| 66 | 58 | ||
| 67 | unsigned long n; | 59 | if (enc) |
| 68 | unsigned long len = length; | 60 | CRYPTO_cbc128_encrypt(in,out,len,key,ivec,(block128_f)AES_encrypt); |
| 69 | unsigned char tmp[AES_BLOCK_SIZE]; | 61 | else |
| 70 | const unsigned char *iv = ivec; | 62 | CRYPTO_cbc128_decrypt(in,out,len,key,ivec,(block128_f)AES_decrypt); |
| 71 | |||
| 72 | assert(in && out && key && ivec); | ||
| 73 | assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc)); | ||
| 74 | |||
| 75 | if (AES_ENCRYPT == enc) { | ||
| 76 | while (len >= AES_BLOCK_SIZE) { | ||
| 77 | for(n=0; n < AES_BLOCK_SIZE; ++n) | ||
| 78 | out[n] = in[n] ^ iv[n]; | ||
| 79 | AES_encrypt(out, out, key); | ||
| 80 | iv = out; | ||
| 81 | len -= AES_BLOCK_SIZE; | ||
| 82 | in += AES_BLOCK_SIZE; | ||
| 83 | out += AES_BLOCK_SIZE; | ||
| 84 | } | ||
| 85 | if (len) { | ||
| 86 | for(n=0; n < len; ++n) | ||
| 87 | out[n] = in[n] ^ iv[n]; | ||
| 88 | for(n=len; n < AES_BLOCK_SIZE; ++n) | ||
| 89 | out[n] = iv[n]; | ||
| 90 | AES_encrypt(out, out, key); | ||
| 91 | iv = out; | ||
| 92 | } | ||
| 93 | memcpy(ivec,iv,AES_BLOCK_SIZE); | ||
| 94 | } else if (in != out) { | ||
| 95 | while (len >= AES_BLOCK_SIZE) { | ||
| 96 | AES_decrypt(in, out, key); | ||
| 97 | for(n=0; n < AES_BLOCK_SIZE; ++n) | ||
| 98 | out[n] ^= iv[n]; | ||
| 99 | iv = in; | ||
| 100 | len -= AES_BLOCK_SIZE; | ||
| 101 | in += AES_BLOCK_SIZE; | ||
| 102 | out += AES_BLOCK_SIZE; | ||
| 103 | } | ||
| 104 | if (len) { | ||
| 105 | AES_decrypt(in,tmp,key); | ||
| 106 | for(n=0; n < len; ++n) | ||
| 107 | out[n] = tmp[n] ^ iv[n]; | ||
| 108 | iv = in; | ||
| 109 | } | ||
| 110 | memcpy(ivec,iv,AES_BLOCK_SIZE); | ||
| 111 | } else { | ||
| 112 | while (len >= AES_BLOCK_SIZE) { | ||
| 113 | memcpy(tmp, in, AES_BLOCK_SIZE); | ||
| 114 | AES_decrypt(in, out, key); | ||
| 115 | for(n=0; n < AES_BLOCK_SIZE; ++n) | ||
| 116 | out[n] ^= ivec[n]; | ||
| 117 | memcpy(ivec, tmp, AES_BLOCK_SIZE); | ||
| 118 | len -= AES_BLOCK_SIZE; | ||
| 119 | in += AES_BLOCK_SIZE; | ||
| 120 | out += AES_BLOCK_SIZE; | ||
| 121 | } | ||
| 122 | if (len) { | ||
| 123 | memcpy(tmp, in, AES_BLOCK_SIZE); | ||
| 124 | AES_decrypt(tmp, out, key); | ||
| 125 | for(n=0; n < len; ++n) | ||
| 126 | out[n] ^= ivec[n]; | ||
| 127 | for(n=len; n < AES_BLOCK_SIZE; ++n) | ||
| 128 | out[n] = tmp[n]; | ||
| 129 | memcpy(ivec, tmp, AES_BLOCK_SIZE); | ||
| 130 | } | ||
| 131 | } | ||
| 132 | } | 63 | } |
| 133 | #endif | ||
diff --git a/src/lib/libcrypto/aes/aes_cfb.c b/src/lib/libcrypto/aes/aes_cfb.c index 49f0411010..0c6d058ce7 100644 --- a/src/lib/libcrypto/aes/aes_cfb.c +++ b/src/lib/libcrypto/aes/aes_cfb.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* crypto/aes/aes_cfb.c -*- mode:C; c-file-style: "eay" -*- */ | 1 | /* crypto/aes/aes_cfb.c -*- mode:C; c-file-style: "eay" -*- */ |
| 2 | /* ==================================================================== | 2 | /* ==================================================================== |
| 3 | * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
| @@ -48,73 +48,9 @@ | |||
| 48 | * ==================================================================== | 48 | * ==================================================================== |
| 49 | * | 49 | * |
| 50 | */ | 50 | */ |
| 51 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 52 | * All rights reserved. | ||
| 53 | * | ||
| 54 | * This package is an SSL implementation written | ||
| 55 | * by Eric Young (eay@cryptsoft.com). | ||
| 56 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 57 | * | ||
| 58 | * This library is free for commercial and non-commercial use as long as | ||
| 59 | * the following conditions are aheared to. The following conditions | ||
| 60 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 61 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 62 | * included with this distribution is covered by the same copyright terms | ||
| 63 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 64 | * | ||
| 65 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 66 | * the code are not to be removed. | ||
| 67 | * If this package is used in a product, Eric Young should be given attribution | ||
| 68 | * as the author of the parts of the library used. | ||
| 69 | * This can be in the form of a textual message at program startup or | ||
| 70 | * in documentation (online or textual) provided with the package. | ||
| 71 | * | ||
| 72 | * Redistribution and use in source and binary forms, with or without | ||
| 73 | * modification, are permitted provided that the following conditions | ||
| 74 | * are met: | ||
| 75 | * 1. Redistributions of source code must retain the copyright | ||
| 76 | * notice, this list of conditions and the following disclaimer. | ||
| 77 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 78 | * notice, this list of conditions and the following disclaimer in the | ||
| 79 | * documentation and/or other materials provided with the distribution. | ||
| 80 | * 3. All advertising materials mentioning features or use of this software | ||
| 81 | * must display the following acknowledgement: | ||
| 82 | * "This product includes cryptographic software written by | ||
| 83 | * Eric Young (eay@cryptsoft.com)" | ||
| 84 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 85 | * being used are not cryptographic related :-). | ||
| 86 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 87 | * the apps directory (application code) you must include an acknowledgement: | ||
| 88 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 89 | * | ||
| 90 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 91 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 92 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 93 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 94 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 95 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 96 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 97 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 98 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 99 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 100 | * SUCH DAMAGE. | ||
| 101 | * | ||
| 102 | * The licence and distribution terms for any publically available version or | ||
| 103 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 104 | * copied and put under another distribution licence | ||
| 105 | * [including the GNU Public Licence.] | ||
| 106 | */ | ||
| 107 | |||
| 108 | #ifndef AES_DEBUG | ||
| 109 | # ifndef NDEBUG | ||
| 110 | # define NDEBUG | ||
| 111 | # endif | ||
| 112 | #endif | ||
| 113 | #include <assert.h> | ||
| 114 | 51 | ||
| 115 | #include <openssl/aes.h> | 52 | #include <openssl/aes.h> |
| 116 | #include "aes_locl.h" | 53 | #include <openssl/modes.h> |
| 117 | #include "e_os.h" | ||
| 118 | 54 | ||
| 119 | /* The input and output encrypted as though 128bit cfb mode is being | 55 | /* The input and output encrypted as though 128bit cfb mode is being |
| 120 | * used. The extra state information to record how much of the | 56 | * used. The extra state information to record how much of the |
| @@ -122,104 +58,24 @@ | |||
| 122 | */ | 58 | */ |
| 123 | 59 | ||
| 124 | void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, | 60 | void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, |
| 125 | const unsigned long length, const AES_KEY *key, | 61 | size_t length, const AES_KEY *key, |
| 126 | unsigned char *ivec, int *num, const int enc) { | 62 | unsigned char *ivec, int *num, const int enc) { |
| 127 | 63 | ||
| 128 | unsigned int n; | 64 | CRYPTO_cfb128_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt); |
| 129 | unsigned long l = length; | ||
| 130 | unsigned char c; | ||
| 131 | |||
| 132 | assert(in && out && key && ivec && num); | ||
| 133 | |||
| 134 | n = *num; | ||
| 135 | |||
| 136 | if (enc) { | ||
| 137 | while (l--) { | ||
| 138 | if (n == 0) { | ||
| 139 | AES_encrypt(ivec, ivec, key); | ||
| 140 | } | ||
| 141 | ivec[n] = *(out++) = *(in++) ^ ivec[n]; | ||
| 142 | n = (n+1) % AES_BLOCK_SIZE; | ||
| 143 | } | ||
| 144 | } else { | ||
| 145 | while (l--) { | ||
| 146 | if (n == 0) { | ||
| 147 | AES_encrypt(ivec, ivec, key); | ||
| 148 | } | ||
| 149 | c = *(in); | ||
| 150 | *(out++) = *(in++) ^ ivec[n]; | ||
| 151 | ivec[n] = c; | ||
| 152 | n = (n+1) % AES_BLOCK_SIZE; | ||
| 153 | } | ||
| 154 | } | ||
| 155 | |||
| 156 | *num=n; | ||
| 157 | } | 65 | } |
| 158 | 66 | ||
| 159 | /* This expects a single block of size nbits for both in and out. Note that | ||
| 160 | it corrupts any extra bits in the last byte of out */ | ||
| 161 | void AES_cfbr_encrypt_block(const unsigned char *in,unsigned char *out, | ||
| 162 | const int nbits,const AES_KEY *key, | ||
| 163 | unsigned char *ivec,const int enc) | ||
| 164 | { | ||
| 165 | int n,rem,num; | ||
| 166 | unsigned char ovec[AES_BLOCK_SIZE*2]; | ||
| 167 | |||
| 168 | if (nbits<=0 || nbits>128) return; | ||
| 169 | |||
| 170 | /* fill in the first half of the new IV with the current IV */ | ||
| 171 | memcpy(ovec,ivec,AES_BLOCK_SIZE); | ||
| 172 | /* construct the new IV */ | ||
| 173 | AES_encrypt(ivec,ivec,key); | ||
| 174 | num = (nbits+7)/8; | ||
| 175 | if (enc) /* encrypt the input */ | ||
| 176 | for(n=0 ; n < num ; ++n) | ||
| 177 | out[n] = (ovec[AES_BLOCK_SIZE+n] = in[n] ^ ivec[n]); | ||
| 178 | else /* decrypt the input */ | ||
| 179 | for(n=0 ; n < num ; ++n) | ||
| 180 | out[n] = (ovec[AES_BLOCK_SIZE+n] = in[n]) ^ ivec[n]; | ||
| 181 | /* shift ovec left... */ | ||
| 182 | rem = nbits%8; | ||
| 183 | num = nbits/8; | ||
| 184 | if(rem==0) | ||
| 185 | memcpy(ivec,ovec+num,AES_BLOCK_SIZE); | ||
| 186 | else | ||
| 187 | for(n=0 ; n < AES_BLOCK_SIZE ; ++n) | ||
| 188 | ivec[n] = ovec[n+num]<<rem | ovec[n+num+1]>>(8-rem); | ||
| 189 | |||
| 190 | /* it is not necessary to cleanse ovec, since the IV is not secret */ | ||
| 191 | } | ||
| 192 | |||
| 193 | /* N.B. This expects the input to be packed, MS bit first */ | 67 | /* N.B. This expects the input to be packed, MS bit first */ |
| 194 | void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, | 68 | void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, |
| 195 | const unsigned long length, const AES_KEY *key, | 69 | size_t length, const AES_KEY *key, |
| 196 | unsigned char *ivec, int *num, const int enc) | 70 | unsigned char *ivec, int *num, const int enc) |
| 197 | { | 71 | { |
| 198 | unsigned int n; | 72 | CRYPTO_cfb128_1_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt); |
| 199 | unsigned char c[1],d[1]; | ||
| 200 | |||
| 201 | assert(in && out && key && ivec && num); | ||
| 202 | assert(*num == 0); | ||
| 203 | |||
| 204 | memset(out,0,(length+7)/8); | ||
| 205 | for(n=0 ; n < length ; ++n) | ||
| 206 | { | ||
| 207 | c[0]=(in[n/8]&(1 << (7-n%8))) ? 0x80 : 0; | ||
| 208 | AES_cfbr_encrypt_block(c,d,1,key,ivec,enc); | ||
| 209 | out[n/8]=(out[n/8]&~(1 << (7-n%8)))|((d[0]&0x80) >> (n%8)); | ||
| 210 | } | ||
| 211 | } | 73 | } |
| 212 | 74 | ||
| 213 | void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, | 75 | void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, |
| 214 | const unsigned long length, const AES_KEY *key, | 76 | size_t length, const AES_KEY *key, |
| 215 | unsigned char *ivec, int *num, const int enc) | 77 | unsigned char *ivec, int *num, const int enc) |
| 216 | { | 78 | { |
| 217 | unsigned int n; | 79 | CRYPTO_cfb128_8_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt); |
| 218 | |||
| 219 | assert(in && out && key && ivec && num); | ||
| 220 | assert(*num == 0); | ||
| 221 | |||
| 222 | for(n=0 ; n < length ; ++n) | ||
| 223 | AES_cfbr_encrypt_block(&in[n],&out[n],8,key,ivec,enc); | ||
| 224 | } | 80 | } |
| 225 | 81 | ||
diff --git a/src/lib/libcrypto/aes/aes_core.c b/src/lib/libcrypto/aes/aes_core.c index cffdd4daec..a7ec54f4da 100644 --- a/src/lib/libcrypto/aes/aes_core.c +++ b/src/lib/libcrypto/aes/aes_core.c | |||
| @@ -37,12 +37,9 @@ | |||
| 37 | 37 | ||
| 38 | #include <stdlib.h> | 38 | #include <stdlib.h> |
| 39 | #include <openssl/aes.h> | 39 | #include <openssl/aes.h> |
| 40 | #ifdef OPENSSL_FIPS | ||
| 41 | #include <openssl/fips.h> | ||
| 42 | #endif | ||
| 43 | |||
| 44 | #include "aes_locl.h" | 40 | #include "aes_locl.h" |
| 45 | 41 | ||
| 42 | #ifndef AES_ASM | ||
| 46 | /* | 43 | /* |
| 47 | Te0[x] = S [x].[02, 01, 01, 03]; | 44 | Te0[x] = S [x].[02, 01, 01, 03]; |
| 48 | Te1[x] = S [x].[03, 02, 01, 01]; | 45 | Te1[x] = S [x].[03, 02, 01, 01]; |
| @@ -635,10 +632,6 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | |||
| 635 | int i = 0; | 632 | int i = 0; |
| 636 | u32 temp; | 633 | u32 temp; |
| 637 | 634 | ||
| 638 | #ifdef OPENSSL_FIPS | ||
| 639 | FIPS_selftest_check(); | ||
| 640 | #endif | ||
| 641 | |||
| 642 | if (!userKey || !key) | 635 | if (!userKey || !key) |
| 643 | return -1; | 636 | return -1; |
| 644 | if (bits != 128 && bits != 192 && bits != 256) | 637 | if (bits != 128 && bits != 192 && bits != 256) |
| @@ -781,7 +774,6 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | |||
| 781 | return 0; | 774 | return 0; |
| 782 | } | 775 | } |
| 783 | 776 | ||
| 784 | #ifndef AES_ASM | ||
| 785 | /* | 777 | /* |
| 786 | * Encrypt a single block | 778 | * Encrypt a single block |
| 787 | * in and out can overlap | 779 | * in and out can overlap |
| @@ -1164,4 +1156,203 @@ void AES_decrypt(const unsigned char *in, unsigned char *out, | |||
| 1164 | PUTU32(out + 12, s3); | 1156 | PUTU32(out + 12, s3); |
| 1165 | } | 1157 | } |
| 1166 | 1158 | ||
| 1159 | #else /* AES_ASM */ | ||
| 1160 | |||
| 1161 | static const u8 Te4[256] = { | ||
| 1162 | 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U, | ||
| 1163 | 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U, | ||
| 1164 | 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U, | ||
| 1165 | 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U, | ||
| 1166 | 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU, | ||
| 1167 | 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U, | ||
| 1168 | 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU, | ||
| 1169 | 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U, | ||
| 1170 | 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U, | ||
| 1171 | 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U, | ||
| 1172 | 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU, | ||
| 1173 | 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU, | ||
| 1174 | 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U, | ||
| 1175 | 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U, | ||
| 1176 | 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U, | ||
| 1177 | 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U, | ||
| 1178 | 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U, | ||
| 1179 | 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U, | ||
| 1180 | 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U, | ||
| 1181 | 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU, | ||
| 1182 | 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU, | ||
| 1183 | 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U, | ||
| 1184 | 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U, | ||
| 1185 | 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U, | ||
| 1186 | 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U, | ||
| 1187 | 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU, | ||
| 1188 | 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU, | ||
| 1189 | 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU, | ||
| 1190 | 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U, | ||
| 1191 | 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU, | ||
| 1192 | 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U, | ||
| 1193 | 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U | ||
| 1194 | }; | ||
| 1195 | static const u32 rcon[] = { | ||
| 1196 | 0x01000000, 0x02000000, 0x04000000, 0x08000000, | ||
| 1197 | 0x10000000, 0x20000000, 0x40000000, 0x80000000, | ||
| 1198 | 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ | ||
| 1199 | }; | ||
| 1200 | |||
| 1201 | /** | ||
| 1202 | * Expand the cipher key into the encryption key schedule. | ||
| 1203 | */ | ||
| 1204 | int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | ||
| 1205 | AES_KEY *key) { | ||
| 1206 | u32 *rk; | ||
| 1207 | int i = 0; | ||
| 1208 | u32 temp; | ||
| 1209 | |||
| 1210 | if (!userKey || !key) | ||
| 1211 | return -1; | ||
| 1212 | if (bits != 128 && bits != 192 && bits != 256) | ||
| 1213 | return -2; | ||
| 1214 | |||
| 1215 | rk = key->rd_key; | ||
| 1216 | |||
| 1217 | if (bits==128) | ||
| 1218 | key->rounds = 10; | ||
| 1219 | else if (bits==192) | ||
| 1220 | key->rounds = 12; | ||
| 1221 | else | ||
| 1222 | key->rounds = 14; | ||
| 1223 | |||
| 1224 | rk[0] = GETU32(userKey ); | ||
| 1225 | rk[1] = GETU32(userKey + 4); | ||
| 1226 | rk[2] = GETU32(userKey + 8); | ||
| 1227 | rk[3] = GETU32(userKey + 12); | ||
| 1228 | if (bits == 128) { | ||
| 1229 | while (1) { | ||
| 1230 | temp = rk[3]; | ||
| 1231 | rk[4] = rk[0] ^ | ||
| 1232 | (Te4[(temp >> 16) & 0xff] << 24) ^ | ||
| 1233 | (Te4[(temp >> 8) & 0xff] << 16) ^ | ||
| 1234 | (Te4[(temp ) & 0xff] << 8) ^ | ||
| 1235 | (Te4[(temp >> 24) ]) ^ | ||
| 1236 | rcon[i]; | ||
| 1237 | rk[5] = rk[1] ^ rk[4]; | ||
| 1238 | rk[6] = rk[2] ^ rk[5]; | ||
| 1239 | rk[7] = rk[3] ^ rk[6]; | ||
| 1240 | if (++i == 10) { | ||
| 1241 | return 0; | ||
| 1242 | } | ||
| 1243 | rk += 4; | ||
| 1244 | } | ||
| 1245 | } | ||
| 1246 | rk[4] = GETU32(userKey + 16); | ||
| 1247 | rk[5] = GETU32(userKey + 20); | ||
| 1248 | if (bits == 192) { | ||
| 1249 | while (1) { | ||
| 1250 | temp = rk[ 5]; | ||
| 1251 | rk[ 6] = rk[ 0] ^ | ||
| 1252 | (Te4[(temp >> 16) & 0xff] << 24) ^ | ||
| 1253 | (Te4[(temp >> 8) & 0xff] << 16) ^ | ||
| 1254 | (Te4[(temp ) & 0xff] << 8) ^ | ||
| 1255 | (Te4[(temp >> 24) ]) ^ | ||
| 1256 | rcon[i]; | ||
| 1257 | rk[ 7] = rk[ 1] ^ rk[ 6]; | ||
| 1258 | rk[ 8] = rk[ 2] ^ rk[ 7]; | ||
| 1259 | rk[ 9] = rk[ 3] ^ rk[ 8]; | ||
| 1260 | if (++i == 8) { | ||
| 1261 | return 0; | ||
| 1262 | } | ||
| 1263 | rk[10] = rk[ 4] ^ rk[ 9]; | ||
| 1264 | rk[11] = rk[ 5] ^ rk[10]; | ||
| 1265 | rk += 6; | ||
| 1266 | } | ||
| 1267 | } | ||
| 1268 | rk[6] = GETU32(userKey + 24); | ||
| 1269 | rk[7] = GETU32(userKey + 28); | ||
| 1270 | if (bits == 256) { | ||
| 1271 | while (1) { | ||
| 1272 | temp = rk[ 7]; | ||
| 1273 | rk[ 8] = rk[ 0] ^ | ||
| 1274 | (Te4[(temp >> 16) & 0xff] << 24) ^ | ||
| 1275 | (Te4[(temp >> 8) & 0xff] << 16) ^ | ||
| 1276 | (Te4[(temp ) & 0xff] << 8) ^ | ||
| 1277 | (Te4[(temp >> 24) ]) ^ | ||
| 1278 | rcon[i]; | ||
| 1279 | rk[ 9] = rk[ 1] ^ rk[ 8]; | ||
| 1280 | rk[10] = rk[ 2] ^ rk[ 9]; | ||
| 1281 | rk[11] = rk[ 3] ^ rk[10]; | ||
| 1282 | if (++i == 7) { | ||
| 1283 | return 0; | ||
| 1284 | } | ||
| 1285 | temp = rk[11]; | ||
| 1286 | rk[12] = rk[ 4] ^ | ||
| 1287 | (Te4[(temp >> 24) ] << 24) ^ | ||
| 1288 | (Te4[(temp >> 16) & 0xff] << 16) ^ | ||
| 1289 | (Te4[(temp >> 8) & 0xff] << 8) ^ | ||
| 1290 | (Te4[(temp ) & 0xff]); | ||
| 1291 | rk[13] = rk[ 5] ^ rk[12]; | ||
| 1292 | rk[14] = rk[ 6] ^ rk[13]; | ||
| 1293 | rk[15] = rk[ 7] ^ rk[14]; | ||
| 1294 | |||
| 1295 | rk += 8; | ||
| 1296 | } | ||
| 1297 | } | ||
| 1298 | return 0; | ||
| 1299 | } | ||
| 1300 | |||
| 1301 | /** | ||
| 1302 | * Expand the cipher key into the decryption key schedule. | ||
| 1303 | */ | ||
| 1304 | int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | ||
| 1305 | AES_KEY *key) { | ||
| 1306 | |||
| 1307 | u32 *rk; | ||
| 1308 | int i, j, status; | ||
| 1309 | u32 temp; | ||
| 1310 | |||
| 1311 | /* first, start with an encryption schedule */ | ||
| 1312 | status = AES_set_encrypt_key(userKey, bits, key); | ||
| 1313 | if (status < 0) | ||
| 1314 | return status; | ||
| 1315 | |||
| 1316 | rk = key->rd_key; | ||
| 1317 | |||
| 1318 | /* invert the order of the round keys: */ | ||
| 1319 | for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) { | ||
| 1320 | temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp; | ||
| 1321 | temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; | ||
| 1322 | temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; | ||
| 1323 | temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp; | ||
| 1324 | } | ||
| 1325 | /* apply the inverse MixColumn transform to all round keys but the first and the last: */ | ||
| 1326 | for (i = 1; i < (key->rounds); i++) { | ||
| 1327 | rk += 4; | ||
| 1328 | for (j = 0; j < 4; j++) { | ||
| 1329 | u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m; | ||
| 1330 | |||
| 1331 | tp1 = rk[j]; | ||
| 1332 | m = tp1 & 0x80808080; | ||
| 1333 | tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^ | ||
| 1334 | ((m - (m >> 7)) & 0x1b1b1b1b); | ||
| 1335 | m = tp2 & 0x80808080; | ||
| 1336 | tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^ | ||
| 1337 | ((m - (m >> 7)) & 0x1b1b1b1b); | ||
| 1338 | m = tp4 & 0x80808080; | ||
| 1339 | tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^ | ||
| 1340 | ((m - (m >> 7)) & 0x1b1b1b1b); | ||
| 1341 | tp9 = tp8 ^ tp1; | ||
| 1342 | tpb = tp9 ^ tp2; | ||
| 1343 | tpd = tp9 ^ tp4; | ||
| 1344 | tpe = tp8 ^ tp4 ^ tp2; | ||
| 1345 | #if defined(ROTATE) | ||
| 1346 | rk[j] = tpe ^ ROTATE(tpd,16) ^ | ||
| 1347 | ROTATE(tp9,24) ^ ROTATE(tpb,8); | ||
| 1348 | #else | ||
| 1349 | rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ | ||
| 1350 | (tp9 >> 8) ^ (tp9 << 24) ^ | ||
| 1351 | (tpb >> 24) ^ (tpb << 8); | ||
| 1352 | #endif | ||
| 1353 | } | ||
| 1354 | } | ||
| 1355 | return 0; | ||
| 1356 | } | ||
| 1357 | |||
| 1167 | #endif /* AES_ASM */ | 1358 | #endif /* AES_ASM */ |
diff --git a/src/lib/libcrypto/aes/aes_ctr.c b/src/lib/libcrypto/aes/aes_ctr.c index f36982be1e..7c9d165d8a 100644 --- a/src/lib/libcrypto/aes/aes_ctr.c +++ b/src/lib/libcrypto/aes/aes_ctr.c | |||
| @@ -49,91 +49,13 @@ | |||
| 49 | * | 49 | * |
| 50 | */ | 50 | */ |
| 51 | 51 | ||
| 52 | #ifndef AES_DEBUG | ||
| 53 | # ifndef NDEBUG | ||
| 54 | # define NDEBUG | ||
| 55 | # endif | ||
| 56 | #endif | ||
| 57 | #include <assert.h> | ||
| 58 | |||
| 59 | #include <openssl/aes.h> | 52 | #include <openssl/aes.h> |
| 60 | #include "aes_locl.h" | 53 | #include <openssl/modes.h> |
| 61 | |||
| 62 | /* NOTE: the IV/counter CTR mode is big-endian. The rest of the AES code | ||
| 63 | * is endian-neutral. */ | ||
| 64 | |||
| 65 | /* increment counter (128-bit int) by 1 */ | ||
| 66 | static void AES_ctr128_inc(unsigned char *counter) { | ||
| 67 | unsigned long c; | ||
| 68 | |||
| 69 | /* Grab bottom dword of counter and increment */ | ||
| 70 | c = GETU32(counter + 12); | ||
| 71 | c++; c &= 0xFFFFFFFF; | ||
| 72 | PUTU32(counter + 12, c); | ||
| 73 | |||
| 74 | /* if no overflow, we're done */ | ||
| 75 | if (c) | ||
| 76 | return; | ||
| 77 | |||
| 78 | /* Grab 1st dword of counter and increment */ | ||
| 79 | c = GETU32(counter + 8); | ||
| 80 | c++; c &= 0xFFFFFFFF; | ||
| 81 | PUTU32(counter + 8, c); | ||
| 82 | |||
| 83 | /* if no overflow, we're done */ | ||
| 84 | if (c) | ||
| 85 | return; | ||
| 86 | |||
| 87 | /* Grab 2nd dword of counter and increment */ | ||
| 88 | c = GETU32(counter + 4); | ||
| 89 | c++; c &= 0xFFFFFFFF; | ||
| 90 | PUTU32(counter + 4, c); | ||
| 91 | |||
| 92 | /* if no overflow, we're done */ | ||
| 93 | if (c) | ||
| 94 | return; | ||
| 95 | 54 | ||
| 96 | /* Grab top dword of counter and increment */ | ||
| 97 | c = GETU32(counter + 0); | ||
| 98 | c++; c &= 0xFFFFFFFF; | ||
| 99 | PUTU32(counter + 0, c); | ||
| 100 | } | ||
| 101 | |||
| 102 | /* The input encrypted as though 128bit counter mode is being | ||
| 103 | * used. The extra state information to record how much of the | ||
| 104 | * 128bit block we have used is contained in *num, and the | ||
| 105 | * encrypted counter is kept in ecount_buf. Both *num and | ||
| 106 | * ecount_buf must be initialised with zeros before the first | ||
| 107 | * call to AES_ctr128_encrypt(). | ||
| 108 | * | ||
| 109 | * This algorithm assumes that the counter is in the x lower bits | ||
| 110 | * of the IV (ivec), and that the application has full control over | ||
| 111 | * overflow and the rest of the IV. This implementation takes NO | ||
| 112 | * responsability for checking that the counter doesn't overflow | ||
| 113 | * into the rest of the IV when incremented. | ||
| 114 | */ | ||
| 115 | void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, | 55 | void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, |
| 116 | const unsigned long length, const AES_KEY *key, | 56 | size_t length, const AES_KEY *key, |
| 117 | unsigned char ivec[AES_BLOCK_SIZE], | 57 | unsigned char ivec[AES_BLOCK_SIZE], |
| 118 | unsigned char ecount_buf[AES_BLOCK_SIZE], | 58 | unsigned char ecount_buf[AES_BLOCK_SIZE], |
| 119 | unsigned int *num) { | 59 | unsigned int *num) { |
| 120 | 60 | CRYPTO_ctr128_encrypt(in,out,length,key,ivec,ecount_buf,num,(block128_f)AES_encrypt); | |
| 121 | unsigned int n; | ||
| 122 | unsigned long l=length; | ||
| 123 | |||
| 124 | assert(in && out && key && counter && num); | ||
| 125 | assert(*num < AES_BLOCK_SIZE); | ||
| 126 | |||
| 127 | n = *num; | ||
| 128 | |||
| 129 | while (l--) { | ||
| 130 | if (n == 0) { | ||
| 131 | AES_encrypt(ivec, ecount_buf, key); | ||
| 132 | AES_ctr128_inc(ivec); | ||
| 133 | } | ||
| 134 | *(out++) = *(in++) ^ ecount_buf[n]; | ||
| 135 | n = (n+1) % AES_BLOCK_SIZE; | ||
| 136 | } | ||
| 137 | |||
| 138 | *num=n; | ||
| 139 | } | 61 | } |
diff --git a/src/lib/libcrypto/aes/aes_ofb.c b/src/lib/libcrypto/aes/aes_ofb.c index f358bb39e2..50bf0b8325 100644 --- a/src/lib/libcrypto/aes/aes_ofb.c +++ b/src/lib/libcrypto/aes/aes_ofb.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* crypto/aes/aes_ofb.c -*- mode:C; c-file-style: "eay" -*- */ | 1 | /* crypto/aes/aes_ofb.c -*- mode:C; c-file-style: "eay" -*- */ |
| 2 | /* ==================================================================== | 2 | /* ==================================================================== |
| 3 | * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
| @@ -48,95 +48,13 @@ | |||
| 48 | * ==================================================================== | 48 | * ==================================================================== |
| 49 | * | 49 | * |
| 50 | */ | 50 | */ |
| 51 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 52 | * All rights reserved. | ||
| 53 | * | ||
| 54 | * This package is an SSL implementation written | ||
| 55 | * by Eric Young (eay@cryptsoft.com). | ||
| 56 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 57 | * | ||
| 58 | * This library is free for commercial and non-commercial use as long as | ||
| 59 | * the following conditions are aheared to. The following conditions | ||
| 60 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 61 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 62 | * included with this distribution is covered by the same copyright terms | ||
| 63 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 64 | * | ||
| 65 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 66 | * the code are not to be removed. | ||
| 67 | * If this package is used in a product, Eric Young should be given attribution | ||
| 68 | * as the author of the parts of the library used. | ||
| 69 | * This can be in the form of a textual message at program startup or | ||
| 70 | * in documentation (online or textual) provided with the package. | ||
| 71 | * | ||
| 72 | * Redistribution and use in source and binary forms, with or without | ||
| 73 | * modification, are permitted provided that the following conditions | ||
| 74 | * are met: | ||
| 75 | * 1. Redistributions of source code must retain the copyright | ||
| 76 | * notice, this list of conditions and the following disclaimer. | ||
| 77 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 78 | * notice, this list of conditions and the following disclaimer in the | ||
| 79 | * documentation and/or other materials provided with the distribution. | ||
| 80 | * 3. All advertising materials mentioning features or use of this software | ||
| 81 | * must display the following acknowledgement: | ||
| 82 | * "This product includes cryptographic software written by | ||
| 83 | * Eric Young (eay@cryptsoft.com)" | ||
| 84 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 85 | * being used are not cryptographic related :-). | ||
| 86 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 87 | * the apps directory (application code) you must include an acknowledgement: | ||
| 88 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 89 | * | ||
| 90 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 91 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 92 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 93 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 94 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 95 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 96 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 97 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 98 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 99 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 100 | * SUCH DAMAGE. | ||
| 101 | * | ||
| 102 | * The licence and distribution terms for any publically available version or | ||
| 103 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 104 | * copied and put under another distribution licence | ||
| 105 | * [including the GNU Public Licence.] | ||
| 106 | */ | ||
| 107 | |||
| 108 | #ifndef AES_DEBUG | ||
| 109 | # ifndef NDEBUG | ||
| 110 | # define NDEBUG | ||
| 111 | # endif | ||
| 112 | #endif | ||
| 113 | #include <assert.h> | ||
| 114 | 51 | ||
| 115 | #include <openssl/aes.h> | 52 | #include <openssl/aes.h> |
| 116 | #include "aes_locl.h" | 53 | #include <openssl/modes.h> |
| 117 | 54 | ||
| 118 | /* The input and output encrypted as though 128bit ofb mode is being | ||
| 119 | * used. The extra state information to record how much of the | ||
| 120 | * 128bit block we have used is contained in *num; | ||
| 121 | */ | ||
| 122 | void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, | 55 | void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, |
| 123 | const unsigned long length, const AES_KEY *key, | 56 | size_t length, const AES_KEY *key, |
| 124 | unsigned char *ivec, int *num) { | 57 | unsigned char *ivec, int *num) |
| 125 | 58 | { | |
| 126 | unsigned int n; | 59 | CRYPTO_ofb128_encrypt(in,out,length,key,ivec,num,(block128_f)AES_encrypt); |
| 127 | unsigned long l=length; | ||
| 128 | |||
| 129 | assert(in && out && key && ivec && num); | ||
| 130 | |||
| 131 | n = *num; | ||
| 132 | |||
| 133 | while (l--) { | ||
| 134 | if (n == 0) { | ||
| 135 | AES_encrypt(ivec, ivec, key); | ||
| 136 | } | ||
| 137 | *(out++) = *(in++) ^ ivec[n]; | ||
| 138 | n = (n+1) % AES_BLOCK_SIZE; | ||
| 139 | } | ||
| 140 | |||
| 141 | *num=n; | ||
| 142 | } | 60 | } |
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl index e771e83953..aab40e6f1c 100644 --- a/src/lib/libcrypto/aes/asm/aes-586.pl +++ b/src/lib/libcrypto/aes/asm/aes-586.pl | |||
| @@ -2,11 +2,12 @@ | |||
| 2 | # | 2 | # |
| 3 | # ==================================================================== | 3 | # ==================================================================== |
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
| 5 | # project. Rights for redistribution and usage in source and binary | 5 | # project. The module is, however, dual licensed under OpenSSL and |
| 6 | # forms are granted according to the OpenSSL license. | 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 7 | # ==================================================================== | 8 | # ==================================================================== |
| 8 | # | 9 | # |
| 9 | # Version 3.6. | 10 | # Version 4.3. |
| 10 | # | 11 | # |
| 11 | # You might fail to appreciate this module performance from the first | 12 | # You might fail to appreciate this module performance from the first |
| 12 | # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered | 13 | # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered |
| @@ -81,11 +82,117 @@ | |||
| 81 | # AMD K8 20 19 | 82 | # AMD K8 20 19 |
| 82 | # PIII 25 23 | 83 | # PIII 25 23 |
| 83 | # Pentium 81 78 | 84 | # Pentium 81 78 |
| 84 | 85 | # | |
| 85 | push(@INC,"perlasm","../../perlasm"); | 86 | # Version 3.7 reimplements outer rounds as "compact." Meaning that |
| 87 | # first and last rounds reference compact 256 bytes S-box. This means | ||
| 88 | # that first round consumes a lot more CPU cycles and that encrypt | ||
| 89 | # and decrypt performance becomes asymmetric. Encrypt performance | ||
| 90 | # drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is | ||
| 91 | # aggressively pre-fetched. | ||
| 92 | # | ||
| 93 | # Version 4.0 effectively rolls back to 3.6 and instead implements | ||
| 94 | # additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact, | ||
| 95 | # which use exclusively 256 byte S-box. These functions are to be | ||
| 96 | # called in modes not concealing plain text, such as ECB, or when | ||
| 97 | # we're asked to process smaller amount of data [or unconditionally | ||
| 98 | # on hyper-threading CPU]. Currently it's called unconditionally from | ||
| 99 | # AES_[en|de]crypt, which affects all modes, but CBC. CBC routine | ||
| 100 | # still needs to be modified to switch between slower and faster | ||
| 101 | # mode when appropriate... But in either case benchmark landscape | ||
| 102 | # changes dramatically and below numbers are CPU cycles per processed | ||
| 103 | # byte for 128-bit key. | ||
| 104 | # | ||
| 105 | # ECB encrypt ECB decrypt CBC large chunk | ||
| 106 | # P4 56[60] 84[100] 23 | ||
| 107 | # AMD K8 48[44] 70[79] 18 | ||
| 108 | # PIII 41[50] 61[91] 24 | ||
| 109 | # Core 2 32[38] 45[70] 18.5 | ||
| 110 | # Pentium 120 160 77 | ||
| 111 | # | ||
| 112 | # Version 4.1 switches to compact S-box even in key schedule setup. | ||
| 113 | # | ||
| 114 | # Version 4.2 prefetches compact S-box in every SSE round or in other | ||
| 115 | # words every cache-line is *guaranteed* to be accessed within ~50 | ||
| 116 | # cycles window. Why just SSE? Because it's needed on hyper-threading | ||
| 117 | # CPU! Which is also why it's prefetched with 64 byte stride. Best | ||
| 118 | # part is that it has no negative effect on performance:-) | ||
| 119 | # | ||
| 120 | # Version 4.3 implements switch between compact and non-compact block | ||
| 121 | # functions in AES_cbc_encrypt depending on how much data was asked | ||
| 122 | # to be processed in one stroke. | ||
| 123 | # | ||
| 124 | ###################################################################### | ||
| 125 | # Timing attacks are classified in two classes: synchronous when | ||
| 126 | # attacker consciously initiates cryptographic operation and collects | ||
| 127 | # timing data of various character afterwards, and asynchronous when | ||
| 128 | # malicious code is executed on same CPU simultaneously with AES, | ||
| 129 | # instruments itself and performs statistical analysis of this data. | ||
| 130 | # | ||
| 131 | # As far as synchronous attacks go the root to the AES timing | ||
| 132 | # vulnerability is twofold. Firstly, of 256 S-box elements at most 160 | ||
| 133 | # are referred to in single 128-bit block operation. Well, in C | ||
| 134 | # implementation with 4 distinct tables it's actually as little as 40 | ||
| 135 | # references per 256 elements table, but anyway... Secondly, even | ||
| 136 | # though S-box elements are clustered into smaller amount of cache- | ||
| 137 | # lines, smaller than 160 and even 40, it turned out that for certain | ||
| 138 | # plain-text pattern[s] or simply put chosen plain-text and given key | ||
| 139 | # few cache-lines remain unaccessed during block operation. Now, if | ||
| 140 | # attacker can figure out this access pattern, he can deduce the key | ||
| 141 | # [or at least part of it]. The natural way to mitigate this kind of | ||
| 142 | # attacks is to minimize the amount of cache-lines in S-box and/or | ||
| 143 | # prefetch them to ensure that every one is accessed for more uniform | ||
| 144 | # timing. But note that *if* plain-text was concealed in such way that | ||
| 145 | # input to block function is distributed *uniformly*, then attack | ||
| 146 | # wouldn't apply. Now note that some encryption modes, most notably | ||
| 147 | # CBC, do mask the plain-text in this exact way [secure cipher output | ||
| 148 | # is distributed uniformly]. Yes, one still might find input that | ||
| 149 | # would reveal the information about given key, but if amount of | ||
| 150 | # candidate inputs to be tried is larger than amount of possible key | ||
| 151 | # combinations then attack becomes infeasible. This is why revised | ||
| 152 | # AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk | ||
| 153 | # of data is to be processed in one stroke. The current size limit of | ||
| 154 | # 512 bytes is chosen to provide same [diminishingly low] probability | ||
| 155 | # for cache-line to remain untouched in large chunk operation with | ||
| 156 | # large S-box as for single block operation with compact S-box and | ||
| 157 | # surely needs more careful consideration... | ||
| 158 | # | ||
| 159 | # As for asynchronous attacks. There are two flavours: attacker code | ||
| 160 | # being interleaved with AES on hyper-threading CPU at *instruction* | ||
| 161 | # level, and two processes time sharing single core. As for latter. | ||
| 162 | # Two vectors. 1. Given that attacker process has higher priority, | ||
| 163 | # yield execution to process performing AES just before timer fires | ||
| 164 | # off the scheduler, immediately regain control of CPU and analyze the | ||
| 165 | # cache state. For this attack to be efficient attacker would have to | ||
| 166 | # effectively slow down the operation by several *orders* of magnitude, | ||
| 167 | # by ratio of time slice to duration of handful of AES rounds, which | ||
| 168 | # is unlikely to remain unnoticed. Not to mention that this also means | ||
| 169 | # that he would spend correspondingly more time to collect enough | ||
| 170 | # statistical data to mount the attack. It's probably appropriate to | ||
| 171 | # say that if adversary reckons that this attack is beneficial and | ||
| 172 | # risks to be noticed, you probably have larger problems having him | ||
| 173 | # mere opportunity. In other words suggested code design expects you | ||
| 174 | # to preclude/mitigate this attack by overall system security design. | ||
| 175 | # 2. Attacker manages to make his code interrupt driven. In order for | ||
| 176 | # this kind of attack to be feasible, interrupt rate has to be high | ||
| 177 | # enough, again comparable to duration of handful of AES rounds. But | ||
| 178 | # is there interrupt source of such rate? Hardly, not even 1Gbps NIC | ||
| 179 | # generates interrupts at such raging rate... | ||
| 180 | # | ||
| 181 | # And now back to the former, hyper-threading CPU or more specifically | ||
| 182 | # Intel P4. Recall that asynchronous attack implies that malicious | ||
| 183 | # code instruments itself. And naturally instrumentation granularity | ||
| 184 | # has to be noticeably lower than duration of codepath accessing S-box. | ||
| 185 | # Given that all cache-lines are accessed during that time that is. | ||
| 186 | # Current implementation accesses *all* cache-lines within ~50 cycles | ||
| 187 | # window, which is actually *less* than RDTSC latency on Intel P4! | ||
| 188 | |||
| 189 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 190 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 86 | require "x86asm.pl"; | 191 | require "x86asm.pl"; |
| 87 | 192 | ||
| 88 | &asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386"); | 193 | &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386"); |
| 194 | &static_label("AES_Te"); | ||
| 195 | &static_label("AES_Td"); | ||
| 89 | 196 | ||
| 90 | $s0="eax"; | 197 | $s0="eax"; |
| 91 | $s1="ebx"; | 198 | $s1="ebx"; |
| @@ -93,21 +200,36 @@ $s2="ecx"; | |||
| 93 | $s3="edx"; | 200 | $s3="edx"; |
| 94 | $key="edi"; | 201 | $key="edi"; |
| 95 | $acc="esi"; | 202 | $acc="esi"; |
| 203 | $tbl="ebp"; | ||
| 204 | |||
| 205 | # stack frame layout in _[x86|sse]_AES_* routines, frame is allocated | ||
| 206 | # by caller | ||
| 207 | $__ra=&DWP(0,"esp"); # return address | ||
| 208 | $__s0=&DWP(4,"esp"); # s0 backing store | ||
| 209 | $__s1=&DWP(8,"esp"); # s1 backing store | ||
| 210 | $__s2=&DWP(12,"esp"); # s2 backing store | ||
| 211 | $__s3=&DWP(16,"esp"); # s3 backing store | ||
| 212 | $__key=&DWP(20,"esp"); # pointer to key schedule | ||
| 213 | $__end=&DWP(24,"esp"); # pointer to end of key schedule | ||
| 214 | $__tbl=&DWP(28,"esp"); # %ebp backing store | ||
| 215 | |||
| 216 | # stack frame layout in AES_[en|de]crypt routines, which differs from | ||
| 217 | # above by 4 and overlaps by %ebp backing store | ||
| 218 | $_tbl=&DWP(24,"esp"); | ||
| 219 | $_esp=&DWP(28,"esp"); | ||
| 96 | 220 | ||
| 97 | $compromise=0; # $compromise=128 abstains from copying key | 221 | sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } |
| 98 | # schedule to stack when encrypting inputs | 222 | |
| 99 | # shorter than 128 bytes at the cost of | 223 | $speed_limit=512; # chunks smaller than $speed_limit are |
| 100 | # risksing aliasing with S-boxes. In return | 224 | # processed with compact routine in CBC mode |
| 101 | # you get way better, up to +70%, small block | ||
| 102 | # performance. | ||
| 103 | $small_footprint=1; # $small_footprint=1 code is ~5% slower [on | 225 | $small_footprint=1; # $small_footprint=1 code is ~5% slower [on |
| 104 | # recent µ-archs], but ~5 times smaller! | 226 | # recent µ-archs], but ~5 times smaller! |
| 105 | # I favor compact code to minimize cache | 227 | # I favor compact code to minimize cache |
| 106 | # contention and in hope to "collect" 5% back | 228 | # contention and in hope to "collect" 5% back |
| 107 | # in real-life applications... | 229 | # in real-life applications... |
| 230 | |||
| 108 | $vertical_spin=0; # shift "vertically" defaults to 0, because of | 231 | $vertical_spin=0; # shift "vertically" defaults to 0, because of |
| 109 | # its proof-of-concept status... | 232 | # its proof-of-concept status... |
| 110 | |||
| 111 | # Note that there is no decvert(), as well as last encryption round is | 233 | # Note that there is no decvert(), as well as last encryption round is |
| 112 | # performed with "horizontal" shifts. This is because this "vertical" | 234 | # performed with "horizontal" shifts. This is because this "vertical" |
| 113 | # implementation [one which groups shifts on a given $s[i] to form a | 235 | # implementation [one which groups shifts on a given $s[i] to form a |
| @@ -170,17 +292,484 @@ sub encvert() | |||
| 170 | &movz ($v0,&HB($v1)); | 292 | &movz ($v0,&HB($v1)); |
| 171 | &and ($v1,0xFF); | 293 | &and ($v1,0xFF); |
| 172 | &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16 | 294 | &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16 |
| 173 | &mov ($key,&DWP(12,"esp")); # reincarnate v1 as key | 295 | &mov ($key,$__key); # reincarnate v1 as key |
| 174 | &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24 | 296 | &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24 |
| 175 | } | 297 | } |
| 176 | 298 | ||
| 299 | # Another experimental routine, which features "horizontal spin," but | ||
| 300 | # eliminates one reference to stack. Strangely enough runs slower... | ||
| 301 | sub enchoriz() | ||
| 302 | { my $v0 = $key, $v1 = $acc; | ||
| 303 | |||
| 304 | &movz ($v0,&LB($s0)); # 3, 2, 1, 0* | ||
| 305 | &rotr ($s2,8); # 8,11,10, 9 | ||
| 306 | &mov ($v1,&DWP(0,$te,$v0,8)); # 0 | ||
| 307 | &movz ($v0,&HB($s1)); # 7, 6, 5*, 4 | ||
| 308 | &rotr ($s3,16); # 13,12,15,14 | ||
| 309 | &xor ($v1,&DWP(3,$te,$v0,8)); # 5 | ||
| 310 | &movz ($v0,&HB($s2)); # 8,11,10*, 9 | ||
| 311 | &rotr ($s0,16); # 1, 0, 3, 2 | ||
| 312 | &xor ($v1,&DWP(2,$te,$v0,8)); # 10 | ||
| 313 | &movz ($v0,&HB($s3)); # 13,12,15*,14 | ||
| 314 | &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected | ||
| 315 | &mov ($__s0,$v1); # t[0] saved | ||
| 316 | |||
| 317 | &movz ($v0,&LB($s1)); # 7, 6, 5, 4* | ||
| 318 | &shr ($s1,16); # -, -, 7, 6 | ||
| 319 | &mov ($v1,&DWP(0,$te,$v0,8)); # 4 | ||
| 320 | &movz ($v0,&LB($s3)); # 13,12,15,14* | ||
| 321 | &xor ($v1,&DWP(2,$te,$v0,8)); # 14 | ||
| 322 | &movz ($v0,&HB($s0)); # 1, 0, 3*, 2 | ||
| 323 | &and ($s3,0xffff0000); # 13,12, -, - | ||
| 324 | &xor ($v1,&DWP(1,$te,$v0,8)); # 3 | ||
| 325 | &movz ($v0,&LB($s2)); # 8,11,10, 9* | ||
| 326 | &or ($s3,$s1); # 13,12, 7, 6 | ||
| 327 | &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected | ||
| 328 | &mov ($s1,$v1); # s[1]=t[1] | ||
| 329 | |||
| 330 | &movz ($v0,&LB($s0)); # 1, 0, 3, 2* | ||
| 331 | &shr ($s2,16); # -, -, 8,11 | ||
| 332 | &mov ($v1,&DWP(2,$te,$v0,8)); # 2 | ||
| 333 | &movz ($v0,&HB($s3)); # 13,12, 7*, 6 | ||
| 334 | &xor ($v1,&DWP(1,$te,$v0,8)); # 7 | ||
| 335 | &movz ($v0,&HB($s2)); # -, -, 8*,11 | ||
| 336 | &xor ($v1,&DWP(0,$te,$v0,8)); # 8 | ||
| 337 | &mov ($v0,$s3); | ||
| 338 | &shr ($v0,24); # 13 | ||
| 339 | &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected | ||
| 340 | |||
| 341 | &movz ($v0,&LB($s2)); # -, -, 8,11* | ||
| 342 | &shr ($s0,24); # 1* | ||
| 343 | &mov ($s2,&DWP(1,$te,$v0,8)); # 11 | ||
| 344 | &xor ($s2,&DWP(3,$te,$s0,8)); # 1 | ||
| 345 | &mov ($s0,$__s0); # s[0]=t[0] | ||
| 346 | &movz ($v0,&LB($s3)); # 13,12, 7, 6* | ||
| 347 | &shr ($s3,16); # , ,13,12 | ||
| 348 | &xor ($s2,&DWP(2,$te,$v0,8)); # 6 | ||
| 349 | &mov ($key,$__key); # reincarnate v0 as key | ||
| 350 | &and ($s3,0xff); # , ,13,12* | ||
| 351 | &mov ($s3,&DWP(0,$te,$s3,8)); # 12 | ||
| 352 | &xor ($s3,$s2); # s[3]=t[3] collected | ||
| 353 | &mov ($s2,$v1); # s[2]=t[2] | ||
| 354 | } | ||
| 355 | |||
| 356 | # More experimental code... SSE one... Even though this one eliminates | ||
| 357 | # *all* references to stack, it's not faster... | ||
| 358 | sub sse_encbody() | ||
| 359 | { | ||
| 360 | &movz ($acc,&LB("eax")); # 0 | ||
| 361 | &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0 | ||
| 362 | &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 | ||
| 363 | &movz ("edx",&HB("eax")); # 1 | ||
| 364 | &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1 | ||
| 365 | &shr ("eax",16); # 5, 4 | ||
| 366 | |||
| 367 | &movz ($acc,&LB("ebx")); # 10 | ||
| 368 | &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10 | ||
| 369 | &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8 | ||
| 370 | &movz ($acc,&HB("ebx")); # 11 | ||
| 371 | &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11 | ||
| 372 | &shr ("ebx",16); # 15,14 | ||
| 373 | |||
| 374 | &movz ($acc,&HB("eax")); # 5 | ||
| 375 | &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5 | ||
| 376 | &movq ("mm3",QWP(16,$key)); | ||
| 377 | &movz ($acc,&HB("ebx")); # 15 | ||
| 378 | &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15 | ||
| 379 | &movd ("mm0","ecx"); # t[0] collected | ||
| 380 | |||
| 381 | &movz ($acc,&LB("eax")); # 4 | ||
| 382 | &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4 | ||
| 383 | &movd ("eax","mm2"); # 7, 6, 3, 2 | ||
| 384 | &movz ($acc,&LB("ebx")); # 14 | ||
| 385 | &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14 | ||
| 386 | &movd ("ebx","mm6"); # 13,12, 9, 8 | ||
| 387 | |||
| 388 | &movz ($acc,&HB("eax")); # 3 | ||
| 389 | &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3 | ||
| 390 | &movz ($acc,&HB("ebx")); # 9 | ||
| 391 | &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9 | ||
| 392 | &movd ("mm1","ecx"); # t[1] collected | ||
| 393 | |||
| 394 | &movz ($acc,&LB("eax")); # 2 | ||
| 395 | &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2 | ||
| 396 | &shr ("eax",16); # 7, 6 | ||
| 397 | &punpckldq ("mm0","mm1"); # t[0,1] collected | ||
| 398 | &movz ($acc,&LB("ebx")); # 8 | ||
| 399 | &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8 | ||
| 400 | &shr ("ebx",16); # 13,12 | ||
| 401 | |||
| 402 | &movz ($acc,&HB("eax")); # 7 | ||
| 403 | &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7 | ||
| 404 | &pxor ("mm0","mm3"); | ||
| 405 | &movz ("eax",&LB("eax")); # 6 | ||
| 406 | &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6 | ||
| 407 | &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0 | ||
| 408 | &movz ($acc,&HB("ebx")); # 13 | ||
| 409 | &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13 | ||
| 410 | &xor ("ecx",&DWP(24,$key)); # t[2] | ||
| 411 | &movd ("mm4","ecx"); # t[2] collected | ||
| 412 | &movz ("ebx",&LB("ebx")); # 12 | ||
| 413 | &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12 | ||
| 414 | &shr ("ecx",16); | ||
| 415 | &movd ("eax","mm1"); # 5, 4, 1, 0 | ||
| 416 | &mov ("ebx",&DWP(28,$key)); # t[3] | ||
| 417 | &xor ("ebx","edx"); | ||
| 418 | &movd ("mm5","ebx"); # t[3] collected | ||
| 419 | &and ("ebx",0xffff0000); | ||
| 420 | &or ("ebx","ecx"); | ||
| 421 | |||
| 422 | &punpckldq ("mm4","mm5"); # t[2,3] collected | ||
| 423 | } | ||
| 424 | |||
| 425 | ###################################################################### | ||
| 426 | # "Compact" block function | ||
| 427 | ###################################################################### | ||
| 428 | |||
| 429 | sub enccompact() | ||
| 430 | { my $Fn = mov; | ||
| 431 | while ($#_>5) { pop(@_); $Fn=sub{}; } | ||
| 432 | my ($i,$te,@s)=@_; | ||
| 433 | my $tmp = $key; | ||
| 434 | my $out = $i==3?$s[0]:$acc; | ||
| 435 | |||
| 436 | # Emits word $i of a "compact" round: one byte from each of $s[0..3] is looked up in the byte table at $te-128 and merged into $out by shl/xor. | ||
| 437 | # $Fn is "mov" by default; any argument beyond the sixth is popped and turns $Fn into an empty sub. It is used in first compact round and its | ||
| 438 | # purpose is to void restoration of $key from $__key at $i==3, so that after 4xenccompact with extra argument $key value is left there... | ||
| 439 | if ($i==3) { &$Fn ($key,$__key); }##%edx | ||
| 440 | else { &mov ($out,$s[0]); } | ||
| 441 | &and ($out,0xFF); | ||
| 442 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] | ||
| 443 | if ($i==2) { &shr ($s[0],24); }#%ecx[2] | ||
| 444 | &movz ($out,&BP(-128,$te,$out,1)); | ||
| 445 | |||
| 446 | if ($i==3) { $tmp=$s[1]; }##%eax | ||
| 447 | &movz ($tmp,&HB($s[1])); | ||
| 448 | &movz ($tmp,&BP(-128,$te,$tmp,1)); | ||
| 449 | &shl ($tmp,8); | ||
| 450 | &xor ($out,$tmp); | ||
| 451 | |||
| 452 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx | ||
| 453 | else { &mov ($tmp,$s[2]); | ||
| 454 | &shr ($tmp,16); } | ||
| 455 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] | ||
| 456 | &and ($tmp,0xFF); | ||
| 457 | &movz ($tmp,&BP(-128,$te,$tmp,1)); | ||
| 458 | &shl ($tmp,16); | ||
| 459 | &xor ($out,$tmp); | ||
| 460 | |||
| 461 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx | ||
| 462 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] | ||
| 463 | else { &mov ($tmp,$s[3]); | ||
| 464 | &shr ($tmp,24); } | ||
| 465 | &movz ($tmp,&BP(-128,$te,$tmp,1)); | ||
| 466 | &shl ($tmp,24); | ||
| 467 | &xor ($out,$tmp); | ||
| 468 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | ||
| 469 | if ($i==3) { &mov ($s[3],$acc); } | ||
| 470 | &comment(); | ||
| 471 | } | ||
| 472 | |||
| 473 | sub enctransform() # emit the column-mixing transform of state word $s[$i] ($i is the only argument); all four packed bytes are processed at once | ||
| 474 | { my @s = ($s0,$s1,$s2,$s3); | ||
| 475 | my $i = shift; | ||
| 476 | my $tmp = $tbl; # the $tbl register is borrowed as scratch here | ||
| 477 | my $r2 = $key ; # the $key register is borrowed as scratch here | ||
| 478 | |||
| 479 | &mov ($acc,$s[$i]); | ||
| 480 | &and ($acc,0x80808080); # top bit of every byte of r0 | ||
| 481 | &mov ($tmp,$acc); | ||
| 482 | &shr ($tmp,7); | ||
| 483 | &lea ($r2,&DWP(0,$s[$i],$s[$i])); # r0+r0, i.e. r0<<1 | ||
| 484 | &sub ($acc,$tmp); # 0x80-0x01 = 0x7f in every byte whose top bit was set | ||
| 485 | &and ($r2,0xfefefefe); # drop bits shifted in from the neighbouring byte | ||
| 486 | &and ($acc,0x1b1b1b1b); # 0x1b reduction term for exactly those bytes | ||
| 487 | &mov ($tmp,$s[$i]); # tmp = r0 | ||
| 488 | &xor ($acc,$r2); # r2 = every byte of r0 doubled, with 0x1b folded in on overflow | ||
| 489 | |||
| 490 | &xor ($s[$i],$acc); # r0 ^ r2 | ||
| 491 | &rotl ($s[$i],24); | ||
| 492 | &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2 (statement is now terminated; it used to run into the next call as a bitwise-and expression) | ||
| 493 | &rotr ($tmp,16); | ||
| 494 | &xor ($s[$i],$tmp); # ^= r0 rotated by 16 | ||
| 495 | &rotr ($tmp,8); # tmp is now r0 rotated right by 24 in total | ||
| 496 | &xor ($s[$i],$tmp); | ||
| 497 | } | ||
| 498 | |||
| 499 | &function_begin_B("_x86_AES_encrypt_compact"); | ||
| 500 | # state is held in $s0..$s3, $key points at the key schedule, $tbl at the byte table; note that caller is expected to allocate stack frame for me! | ||
| 501 | &mov ($__key,$key); # save key | ||
| 502 | |||
| 503 | &xor ($s0,&DWP(0,$key)); # xor with key | ||
| 504 | &xor ($s1,&DWP(4,$key)); | ||
| 505 | &xor ($s2,&DWP(8,$key)); | ||
| 506 | &xor ($s3,&DWP(12,$key)); | ||
| 507 | |||
| 508 | &mov ($acc,&DWP(240,$key)); # load key->rounds | ||
| 509 | &lea ($acc,&DWP(-2,$acc,$acc)); | ||
| 510 | &lea ($acc,&DWP(0,$key,$acc,8)); | ||
| 511 | &mov ($__end,$acc); # end of key schedule: $key+8*(2*rounds-2), i.e. $key+16*(rounds-1); the loop below runs while $key is below it | ||
| 512 | |||
| 513 | # prefetch Te4: touch one dword every 32 bytes from $tbl-128 to $tbl+96; both destination registers are overwritten before being read | ||
| 514 | &mov ($key,&DWP(0-128,$tbl)); | ||
| 515 | &mov ($acc,&DWP(32-128,$tbl)); | ||
| 516 | &mov ($key,&DWP(64-128,$tbl)); | ||
| 517 | &mov ($acc,&DWP(96-128,$tbl)); | ||
| 518 | &mov ($key,&DWP(128-128,$tbl)); | ||
| 519 | &mov ($acc,&DWP(160-128,$tbl)); | ||
| 520 | &mov ($key,&DWP(192-128,$tbl)); | ||
| 521 | &mov ($acc,&DWP(224-128,$tbl)); | ||
| 522 | |||
| 523 | &set_label("loop",16); | ||
| 524 | |||
| 525 | &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1); | ||
| 526 | &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1); | ||
| 527 | &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1); | ||
| 528 | &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1); | ||
| 529 | &enctransform(2); | ||
| 530 | &enctransform(3); | ||
| 531 | &enctransform(0); | ||
| 532 | &enctransform(1); | ||
| 533 | &mov ($key,$__key); | ||
| 534 | &mov ($tbl,$__tbl); | ||
| 535 | &add ($key,16); # advance rd_key | ||
| 536 | &xor ($s0,&DWP(0,$key)); | ||
| 537 | &xor ($s1,&DWP(4,$key)); | ||
| 538 | &xor ($s2,&DWP(8,$key)); | ||
| 539 | &xor ($s3,&DWP(12,$key)); | ||
| 540 | |||
| 541 | &cmp ($key,$__end); | ||
| 542 | &mov ($__key,$key); | ||
| 543 | &jb (&label("loop")); | ||
| 544 | |||
| 545 | &enccompact(0,$tbl,$s0,$s1,$s2,$s3); | ||
| 546 | &enccompact(1,$tbl,$s1,$s2,$s3,$s0); | ||
| 547 | &enccompact(2,$tbl,$s2,$s3,$s0,$s1); | ||
| 548 | &enccompact(3,$tbl,$s3,$s0,$s1,$s2); | ||
| 549 | |||
| 550 | &xor ($s0,&DWP(16,$key)); | ||
| 551 | &xor ($s1,&DWP(20,$key)); | ||
| 552 | &xor ($s2,&DWP(24,$key)); | ||
| 553 | &xor ($s3,&DWP(28,$key)); | ||
| 554 | |||
| 555 | &ret (); | ||
| 556 | &function_end_B("_x86_AES_encrypt_compact"); | ||
| 557 | |||
| 558 | ###################################################################### | ||
| 559 | # "Compact" SSE block function. | ||
| 560 | ###################################################################### | ||
| 561 | # | ||
| 562 | # Performance is not actually extraordinary in comparison to pure | ||
| 563 | # x86 code. In particular encrypt performance is virtually the same. | ||
| 564 | # Decrypt performance on the other hand is 15-20% better on newer | ||
| 565 | # µ-archs [but we're thankful for *any* improvement here], and ~50% | ||
| 566 | # better on PIII:-) And additionally on the pros side this code | ||
| 567 | # eliminates redundant references to stack and thus relieves/ | ||
| 568 | # minimizes the pressure on the memory bus. | ||
| 569 | # | ||
| 570 | # MMX register layout lsb | ||
| 571 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | ||
| 572 | # | mm4 | mm0 | | ||
| 573 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | ||
| 574 | # | s3 | s2 | s1 | s0 | | ||
| 575 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | ||
| 576 | # |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0| | ||
| 577 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | ||
| 578 | # | ||
| 579 | # Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8. | ||
| 580 | # In these terms encryption and decryption "compact" permutation | ||
| 581 | # matrices can be depicted as follows: | ||
| 582 | # | ||
| 583 | # encryption lsb # decryption lsb | ||
| 584 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
| 585 | # | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 | | ||
| 586 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
| 587 | # | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 | | ||
| 588 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
| 589 | # | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 | | ||
| 590 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
| 591 | # | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 | | ||
| 592 | # +----++----+----+----+----+ # +----++----+----+----+----+ | ||
| 593 | # | ||
| 594 | ###################################################################### | ||
| 595 | # Why not xmm registers? Short answer: it was actually tested and | ||
| 596 | # was not any faster, but on the *contrary*, most notably on Intel CPUs. | ||
| 597 | # Longer answer. Main advantage of using mm registers is that movd | ||
| 598 | # latency is lower, especially on Intel P4. While arithmetic | ||
| 599 | # instructions are twice as many, they can be scheduled every cycle | ||
| 600 | # and not every second one when they are operating on xmm register, | ||
| 601 | # so that "arithmetic throughput" remains virtually the same. And | ||
| 602 | # finally the code can be executed even on older SSE-only CPUs:-) | ||
| 603 | |||
| 604 | sub sse_enccompact() | ||
| 605 | { | ||
| 606 | &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0 (numbers are byte indexes of the 16-byte state held in mm0/mm4) | ||
| 607 | &pshufw ("mm5","mm4",0x0d); # 15,14,11,10 | ||
| 608 | &movd ("eax","mm1"); # 5, 4, 1, 0 | ||
| 609 | &movd ("ebx","mm5"); # 15,14,11,10 | ||
| 610 | |||
| 611 | &movz ($acc,&LB("eax")); # 0 | ||
| 612 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 (every lookup is a single byte load from $tbl-128+index; ecx collects t[0], then t[1], then t[2]) | ||
| 613 | &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 | ||
| 614 | &movz ("edx",&HB("eax")); # 1 | ||
| 615 | &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 (edx collects t[3] piecewise over the whole routine) | ||
| 616 | &shl ("edx",8); # 1 | ||
| 617 | &shr ("eax",16); # 5, 4 | ||
| 618 | |||
| 619 | &movz ($acc,&LB("ebx")); # 10 | ||
| 620 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 | ||
| 621 | &shl ($acc,16); # 10 | ||
| 622 | &or ("ecx",$acc); # 10 | ||
| 623 | &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8 | ||
| 624 | &movz ($acc,&HB("ebx")); # 11 | ||
| 625 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 | ||
| 626 | &shl ($acc,24); # 11 | ||
| 627 | &or ("edx",$acc); # 11 | ||
| 628 | &shr ("ebx",16); # 15,14 | ||
| 629 | |||
| 630 | &movz ($acc,&HB("eax")); # 5 | ||
| 631 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5 | ||
| 632 | &shl ($acc,8); # 5 | ||
| 633 | &or ("ecx",$acc); # 5 | ||
| 634 | &movz ($acc,&HB("ebx")); # 15 | ||
| 635 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 | ||
| 636 | &shl ($acc,24); # 15 | ||
| 637 | &or ("ecx",$acc); # 15 | ||
| 638 | &movd ("mm0","ecx"); # t[0] collected | ||
| 639 | |||
| 640 | &movz ($acc,&LB("eax")); # 4 | ||
| 641 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4 | ||
| 642 | &movd ("eax","mm2"); # 7, 6, 3, 2 | ||
| 643 | &movz ($acc,&LB("ebx")); # 14 | ||
| 644 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 | ||
| 645 | &shl ($acc,16); # 14 | ||
| 646 | &or ("ecx",$acc); # 14 | ||
| 647 | |||
| 648 | &movd ("ebx","mm6"); # 13,12, 9, 8 | ||
| 649 | &movz ($acc,&HB("eax")); # 3 | ||
| 650 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3 | ||
| 651 | &shl ($acc,24); # 3 | ||
| 652 | &or ("ecx",$acc); # 3 | ||
| 653 | &movz ($acc,&HB("ebx")); # 9 | ||
| 654 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 | ||
| 655 | &shl ($acc,8); # 9 | ||
| 656 | &or ("ecx",$acc); # 9 | ||
| 657 | &movd ("mm1","ecx"); # t[1] collected | ||
| 658 | |||
| 659 | &movz ($acc,&LB("ebx")); # 8 | ||
| 660 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8 | ||
| 661 | &shr ("ebx",16); # 13,12 | ||
| 662 | &movz ($acc,&LB("eax")); # 2 | ||
| 663 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 | ||
| 664 | &shl ($acc,16); # 2 | ||
| 665 | &or ("ecx",$acc); # 2 | ||
| 666 | &shr ("eax",16); # 7, 6 | ||
| 667 | |||
| 668 | &punpckldq ("mm0","mm1"); # t[0,1] collected | ||
| 669 | |||
| 670 | &movz ($acc,&HB("eax")); # 7 | ||
| 671 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 | ||
| 672 | &shl ($acc,24); # 7 | ||
| 673 | &or ("ecx",$acc); # 7 | ||
| 674 | &and ("eax",0xff); # 6 | ||
| 675 | &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6 | ||
| 676 | &shl ("eax",16); # 6 | ||
| 677 | &or ("edx","eax"); # 6 | ||
| 678 | &movz ($acc,&HB("ebx")); # 13 | ||
| 679 | &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 | ||
| 680 | &shl ($acc,8); # 13 | ||
| 681 | &or ("ecx",$acc); # 13 | ||
| 682 | &movd ("mm4","ecx"); # t[2] collected | ||
| 683 | &and ("ebx",0xff); # 12 | ||
| 684 | &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12 | ||
| 685 | &or ("edx","ebx"); # 12 | ||
| 686 | &movd ("mm5","edx"); # t[3] collected | ||
| 687 | |||
| 688 | &punpckldq ("mm4","mm5"); # t[2,3] collected | ||
| 689 | } | ||
| 690 | |||
| 691 | if (!$x86only) { | ||
| 692 | &function_begin_B("_sse_AES_encrypt_compact"); | ||
| 693 | &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0 | ||
| 694 | &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8 | ||
| 695 | |||
| 696 | # the state is taken from and left in mm0 (bytes 7..0) and mm4 (bytes 15..8); note that caller is expected to allocate stack frame for me! | ||
| 697 | &mov ($acc,&DWP(240,$key)); # load key->rounds | ||
| 698 | &lea ($acc,&DWP(-2,$acc,$acc)); | ||
| 699 | &lea ($acc,&DWP(0,$key,$acc,8)); | ||
| 700 | &mov ($__end,$acc); # end of key schedule: $key+16*(rounds-1); the loop below leaves once $key has advanced past it | ||
| 701 | |||
| 702 | &mov ($s0,0x1b1b1b1b); # magic constant: 0x1b in every byte, kept as a qword at 8(%esp) and reloaded into mm2 each round to mask the pcmpgtb result | ||
| 703 | &mov (&DWP(8,"esp"),$s0); | ||
| 704 | &mov (&DWP(12,"esp"),$s0); | ||
| 705 | |||
| 706 | # prefetch Te4: touch one dword every 32 bytes from $tbl-128 to $tbl+96 | ||
| 707 | &mov ($s0,&DWP(0-128,$tbl)); | ||
| 708 | &mov ($s1,&DWP(32-128,$tbl)); | ||
| 709 | &mov ($s2,&DWP(64-128,$tbl)); | ||
| 710 | &mov ($s3,&DWP(96-128,$tbl)); | ||
| 711 | &mov ($s0,&DWP(128-128,$tbl)); | ||
| 712 | &mov ($s1,&DWP(160-128,$tbl)); | ||
| 713 | &mov ($s2,&DWP(192-128,$tbl)); | ||
| 714 | &mov ($s3,&DWP(224-128,$tbl)); | ||
| 715 | |||
| 716 | &set_label("loop",16); | ||
| 717 | &sse_enccompact(); | ||
| 718 | &add ($key,16); | ||
| 719 | &cmp ($key,$__end); | ||
| 720 | &ja (&label("out")); | ||
| 721 | |||
| 722 | &movq ("mm2",&QWP(8,"esp")); | ||
| 723 | &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); | ||
| 724 | &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0 | ||
| 725 | &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4"); | ||
| 726 | &pand ("mm3","mm2"); &pand ("mm7","mm2"); | ||
| 727 | &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16) | ||
| 728 | &paddb ("mm0","mm0"); &paddb ("mm4","mm4"); | ||
| 729 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2 | ||
| 730 | &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0 | ||
| 731 | &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2 | ||
| 732 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16) | ||
| 733 | |||
| 734 | &movq ("mm2","mm3"); &movq ("mm6","mm7"); | ||
| 735 | &pslld ("mm3",8); &pslld ("mm7",8); | ||
| 736 | &psrld ("mm2",24); &psrld ("mm6",24); | ||
| 737 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8 | ||
| 738 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24 | ||
| 739 | |||
| 740 | &movq ("mm3","mm1"); &movq ("mm7","mm5"); | ||
| 741 | &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key)); | ||
| 742 | &psrld ("mm1",8); &psrld ("mm5",8); | ||
| 743 | &mov ($s0,&DWP(0-128,$tbl)); | ||
| 744 | &pslld ("mm3",24); &pslld ("mm7",24); | ||
| 745 | &mov ($s1,&DWP(64-128,$tbl)); | ||
| 746 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)>>8 | ||
| 747 | &mov ($s2,&DWP(128-128,$tbl)); | ||
| 748 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)<<24 | ||
| 749 | &mov ($s3,&DWP(192-128,$tbl)); | ||
| 750 | |||
| 751 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); | ||
| 752 | &jmp (&label("loop")); | ||
| 753 | |||
| 754 | &set_label("out",16); | ||
| 755 | &pxor ("mm0",&QWP(0,$key)); | ||
| 756 | &pxor ("mm4",&QWP(8,$key)); | ||
| 757 | |||
| 758 | &ret (); | ||
| 759 | &function_end_B("_sse_AES_encrypt_compact"); | ||
| 760 | } | ||
| 761 | |||
| 762 | ###################################################################### | ||
| 763 | # Vanilla block function. | ||
| 764 | ###################################################################### | ||
| 765 | |||
| 177 | sub encstep() | 766 | sub encstep() |
| 178 | { my ($i,$te,@s) = @_; | 767 | { my ($i,$te,@s) = @_; |
| 179 | my $tmp = $key; | 768 | my $tmp = $key; |
| 180 | my $out = $i==3?$s[0]:$acc; | 769 | my $out = $i==3?$s[0]:$acc; |
| 181 | 770 | ||
| 182 | # lines marked with #%e?x[i] denote "reordered" instructions... | 771 | # lines marked with #%e?x[i] denote "reordered" instructions... |
| 183 | if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx | 772 | if ($i==3) { &mov ($key,$__key); }##%edx |
| 184 | else { &mov ($out,$s[0]); | 773 | else { &mov ($out,$s[0]); |
| 185 | &and ($out,0xFF); } | 774 | &and ($out,0xFF); } |
| 186 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] | 775 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] |
| @@ -191,14 +780,14 @@ sub encstep() | |||
| 191 | &movz ($tmp,&HB($s[1])); | 780 | &movz ($tmp,&HB($s[1])); |
| 192 | &xor ($out,&DWP(3,$te,$tmp,8)); | 781 | &xor ($out,&DWP(3,$te,$tmp,8)); |
| 193 | 782 | ||
| 194 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx | 783 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx |
| 195 | else { &mov ($tmp,$s[2]); | 784 | else { &mov ($tmp,$s[2]); |
| 196 | &shr ($tmp,16); } | 785 | &shr ($tmp,16); } |
| 197 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] | 786 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] |
| 198 | &and ($tmp,0xFF); | 787 | &and ($tmp,0xFF); |
| 199 | &xor ($out,&DWP(2,$te,$tmp,8)); | 788 | &xor ($out,&DWP(2,$te,$tmp,8)); |
| 200 | 789 | ||
| 201 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx | 790 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx |
| 202 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] | 791 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] |
| 203 | else { &mov ($tmp,$s[3]); | 792 | else { &mov ($tmp,$s[3]); |
| 204 | &shr ($tmp,24) } | 793 | &shr ($tmp,24) } |
| @@ -213,7 +802,7 @@ sub enclast() | |||
| 213 | my $tmp = $key; | 802 | my $tmp = $key; |
| 214 | my $out = $i==3?$s[0]:$acc; | 803 | my $out = $i==3?$s[0]:$acc; |
| 215 | 804 | ||
| 216 | if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx | 805 | if ($i==3) { &mov ($key,$__key); }##%edx |
| 217 | else { &mov ($out,$s[0]); } | 806 | else { &mov ($out,$s[0]); } |
| 218 | &and ($out,0xFF); | 807 | &and ($out,0xFF); |
| 219 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] | 808 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] |
| @@ -227,8 +816,8 @@ sub enclast() | |||
| 227 | &and ($tmp,0x0000ff00); | 816 | &and ($tmp,0x0000ff00); |
| 228 | &xor ($out,$tmp); | 817 | &xor ($out,$tmp); |
| 229 | 818 | ||
| 230 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx | 819 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx |
| 231 | else { mov ($tmp,$s[2]); | 820 | else { &mov ($tmp,$s[2]); |
| 232 | &shr ($tmp,16); } | 821 | &shr ($tmp,16); } |
| 233 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] | 822 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] |
| 234 | &and ($tmp,0xFF); | 823 | &and ($tmp,0xFF); |
| @@ -236,7 +825,7 @@ sub enclast() | |||
| 236 | &and ($tmp,0x00ff0000); | 825 | &and ($tmp,0x00ff0000); |
| 237 | &xor ($out,$tmp); | 826 | &xor ($out,$tmp); |
| 238 | 827 | ||
| 239 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx | 828 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx |
| 240 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] | 829 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] |
| 241 | else { &mov ($tmp,$s[3]); | 830 | else { &mov ($tmp,$s[3]); |
| 242 | &shr ($tmp,24); } | 831 | &shr ($tmp,24); } |
| @@ -247,10 +836,7 @@ sub enclast() | |||
| 247 | if ($i==3) { &mov ($s[3],$acc); } | 836 | if ($i==3) { &mov ($s[3],$acc); } |
| 248 | } | 837 | } |
| 249 | 838 | ||
| 250 | sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | 839 | &function_begin_B("_x86_AES_encrypt"); |
| 251 | |||
| 252 | &public_label("AES_Te"); | ||
| 253 | &function_begin_C("_x86_AES_encrypt"); | ||
| 254 | if ($vertical_spin) { | 840 | if ($vertical_spin) { |
| 255 | # I need high parts of volatile registers to be accessible... | 841 | # I need high parts of volatile registers to be accessible... |
| 256 | &exch ($s1="edi",$key="ebx"); | 842 | &exch ($s1="edi",$key="ebx"); |
| @@ -258,7 +844,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 258 | } | 844 | } |
| 259 | 845 | ||
| 260 | # note that caller is expected to allocate stack frame for me! | 846 | # note that caller is expected to allocate stack frame for me! |
| 261 | &mov (&DWP(12,"esp"),$key); # save key | 847 | &mov ($__key,$key); # save key |
| 262 | 848 | ||
| 263 | &xor ($s0,&DWP(0,$key)); # xor with key | 849 | &xor ($s0,&DWP(0,$key)); # xor with key |
| 264 | &xor ($s1,&DWP(4,$key)); | 850 | &xor ($s1,&DWP(4,$key)); |
| @@ -270,24 +856,24 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 270 | if ($small_footprint) { | 856 | if ($small_footprint) { |
| 271 | &lea ($acc,&DWP(-2,$acc,$acc)); | 857 | &lea ($acc,&DWP(-2,$acc,$acc)); |
| 272 | &lea ($acc,&DWP(0,$key,$acc,8)); | 858 | &lea ($acc,&DWP(0,$key,$acc,8)); |
| 273 | &mov (&DWP(16,"esp"),$acc); # end of key schedule | 859 | &mov ($__end,$acc); # end of key schedule |
| 274 | &align (4); | 860 | |
| 275 | &set_label("loop"); | 861 | &set_label("loop",16); |
| 276 | if ($vertical_spin) { | 862 | if ($vertical_spin) { |
| 277 | &encvert("ebp",$s0,$s1,$s2,$s3); | 863 | &encvert($tbl,$s0,$s1,$s2,$s3); |
| 278 | } else { | 864 | } else { |
| 279 | &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 865 | &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
| 280 | &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 866 | &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
| 281 | &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 867 | &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
| 282 | &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 868 | &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
| 283 | } | 869 | } |
| 284 | &add ($key,16); # advance rd_key | 870 | &add ($key,16); # advance rd_key |
| 285 | &xor ($s0,&DWP(0,$key)); | 871 | &xor ($s0,&DWP(0,$key)); |
| 286 | &xor ($s1,&DWP(4,$key)); | 872 | &xor ($s1,&DWP(4,$key)); |
| 287 | &xor ($s2,&DWP(8,$key)); | 873 | &xor ($s2,&DWP(8,$key)); |
| 288 | &xor ($s3,&DWP(12,$key)); | 874 | &xor ($s3,&DWP(12,$key)); |
| 289 | &cmp ($key,&DWP(16,"esp")); | 875 | &cmp ($key,$__end); |
| 290 | &mov (&DWP(12,"esp"),$key); | 876 | &mov ($__key,$key); |
| 291 | &jb (&label("loop")); | 877 | &jb (&label("loop")); |
| 292 | } | 878 | } |
| 293 | else { | 879 | else { |
| @@ -296,15 +882,15 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 296 | &cmp ($acc,12); | 882 | &cmp ($acc,12); |
| 297 | &jle (&label("12rounds")); | 883 | &jle (&label("12rounds")); |
| 298 | 884 | ||
| 299 | &set_label("14rounds"); | 885 | &set_label("14rounds",4); |
| 300 | for ($i=1;$i<3;$i++) { | 886 | for ($i=1;$i<3;$i++) { |
| 301 | if ($vertical_spin) { | 887 | if ($vertical_spin) { |
| 302 | &encvert("ebp",$s0,$s1,$s2,$s3); | 888 | &encvert($tbl,$s0,$s1,$s2,$s3); |
| 303 | } else { | 889 | } else { |
| 304 | &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 890 | &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
| 305 | &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 891 | &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
| 306 | &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 892 | &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
| 307 | &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 893 | &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
| 308 | } | 894 | } |
| 309 | &xor ($s0,&DWP(16*$i+0,$key)); | 895 | &xor ($s0,&DWP(16*$i+0,$key)); |
| 310 | &xor ($s1,&DWP(16*$i+4,$key)); | 896 | &xor ($s1,&DWP(16*$i+4,$key)); |
| @@ -312,16 +898,16 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 312 | &xor ($s3,&DWP(16*$i+12,$key)); | 898 | &xor ($s3,&DWP(16*$i+12,$key)); |
| 313 | } | 899 | } |
| 314 | &add ($key,32); | 900 | &add ($key,32); |
| 315 | &mov (&DWP(12,"esp"),$key); # advance rd_key | 901 | &mov ($__key,$key); # advance rd_key |
| 316 | &set_label("12rounds"); | 902 | &set_label("12rounds",4); |
| 317 | for ($i=1;$i<3;$i++) { | 903 | for ($i=1;$i<3;$i++) { |
| 318 | if ($vertical_spin) { | 904 | if ($vertical_spin) { |
| 319 | &encvert("ebp",$s0,$s1,$s2,$s3); | 905 | &encvert($tbl,$s0,$s1,$s2,$s3); |
| 320 | } else { | 906 | } else { |
| 321 | &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 907 | &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
| 322 | &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 908 | &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
| 323 | &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 909 | &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
| 324 | &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 910 | &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
| 325 | } | 911 | } |
| 326 | &xor ($s0,&DWP(16*$i+0,$key)); | 912 | &xor ($s0,&DWP(16*$i+0,$key)); |
| 327 | &xor ($s1,&DWP(16*$i+4,$key)); | 913 | &xor ($s1,&DWP(16*$i+4,$key)); |
| @@ -329,16 +915,16 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 329 | &xor ($s3,&DWP(16*$i+12,$key)); | 915 | &xor ($s3,&DWP(16*$i+12,$key)); |
| 330 | } | 916 | } |
| 331 | &add ($key,32); | 917 | &add ($key,32); |
| 332 | &mov (&DWP(12,"esp"),$key); # advance rd_key | 918 | &mov ($__key,$key); # advance rd_key |
| 333 | &set_label("10rounds"); | 919 | &set_label("10rounds",4); |
| 334 | for ($i=1;$i<10;$i++) { | 920 | for ($i=1;$i<10;$i++) { |
| 335 | if ($vertical_spin) { | 921 | if ($vertical_spin) { |
| 336 | &encvert("ebp",$s0,$s1,$s2,$s3); | 922 | &encvert($tbl,$s0,$s1,$s2,$s3); |
| 337 | } else { | 923 | } else { |
| 338 | &encstep(0,"ebp",$s0,$s1,$s2,$s3); | 924 | &encstep(0,$tbl,$s0,$s1,$s2,$s3); |
| 339 | &encstep(1,"ebp",$s1,$s2,$s3,$s0); | 925 | &encstep(1,$tbl,$s1,$s2,$s3,$s0); |
| 340 | &encstep(2,"ebp",$s2,$s3,$s0,$s1); | 926 | &encstep(2,$tbl,$s2,$s3,$s0,$s1); |
| 341 | &encstep(3,"ebp",$s3,$s0,$s1,$s2); | 927 | &encstep(3,$tbl,$s3,$s0,$s1,$s2); |
| 342 | } | 928 | } |
| 343 | &xor ($s0,&DWP(16*$i+0,$key)); | 929 | &xor ($s0,&DWP(16*$i+0,$key)); |
| 344 | &xor ($s1,&DWP(16*$i+4,$key)); | 930 | &xor ($s1,&DWP(16*$i+4,$key)); |
| @@ -352,10 +938,10 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 352 | &mov ($s1="ebx",$key="edi"); | 938 | &mov ($s1="ebx",$key="edi"); |
| 353 | &mov ($s2="ecx",$acc="esi"); | 939 | &mov ($s2="ecx",$acc="esi"); |
| 354 | } | 940 | } |
| 355 | &enclast(0,"ebp",$s0,$s1,$s2,$s3); | 941 | &enclast(0,$tbl,$s0,$s1,$s2,$s3); |
| 356 | &enclast(1,"ebp",$s1,$s2,$s3,$s0); | 942 | &enclast(1,$tbl,$s1,$s2,$s3,$s0); |
| 357 | &enclast(2,"ebp",$s2,$s3,$s0,$s1); | 943 | &enclast(2,$tbl,$s2,$s3,$s0,$s1); |
| 358 | &enclast(3,"ebp",$s3,$s0,$s1,$s2); | 944 | &enclast(3,$tbl,$s3,$s0,$s1,$s2); |
| 359 | 945 | ||
| 360 | &add ($key,$small_footprint?16:160); | 946 | &add ($key,$small_footprint?16:160); |
| 361 | &xor ($s0,&DWP(0,$key)); | 947 | &xor ($s0,&DWP(0,$key)); |
| @@ -430,38 +1016,198 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 430 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); | 1016 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); |
| 431 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); | 1017 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); |
| 432 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); | 1018 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); |
| 1019 | |||
| 1020 | #Te4 # four copies of Te4 to choose from to avoid L1 aliasing | ||
| 1021 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
| 1022 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
| 1023 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
| 1024 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
| 1025 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
| 1026 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
| 1027 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
| 1028 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
| 1029 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
| 1030 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
| 1031 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
| 1032 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
| 1033 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
| 1034 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
| 1035 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
| 1036 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
| 1037 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
| 1038 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
| 1039 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
| 1040 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
| 1041 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
| 1042 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
| 1043 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
| 1044 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
| 1045 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
| 1046 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
| 1047 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
| 1048 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
| 1049 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
| 1050 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
| 1051 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
| 1052 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
| 1053 | |||
| 1054 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
| 1055 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
| 1056 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
| 1057 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
| 1058 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
| 1059 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
| 1060 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
| 1061 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
| 1062 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
| 1063 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
| 1064 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
| 1065 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
| 1066 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
| 1067 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
| 1068 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
| 1069 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
| 1070 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
| 1071 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
| 1072 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
| 1073 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
| 1074 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
| 1075 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
| 1076 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
| 1077 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
| 1078 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
| 1079 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
| 1080 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
| 1081 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
| 1082 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
| 1083 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
| 1084 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
| 1085 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
| 1086 | |||
| 1087 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
| 1088 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
| 1089 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
| 1090 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
| 1091 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
| 1092 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
| 1093 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
| 1094 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
| 1095 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
| 1096 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
| 1097 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
| 1098 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
| 1099 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
| 1100 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
| 1101 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
| 1102 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
| 1103 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
| 1104 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
| 1105 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
| 1106 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
| 1107 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
| 1108 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
| 1109 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
| 1110 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
| 1111 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
| 1112 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
| 1113 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
| 1114 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
| 1115 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
| 1116 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
| 1117 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
| 1118 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
| 1119 | |||
| 1120 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
| 1121 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
| 1122 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
| 1123 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
| 1124 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
| 1125 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
| 1126 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
| 1127 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
| 1128 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
| 1129 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
| 1130 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
| 1131 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
| 1132 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
| 1133 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
| 1134 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
| 1135 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
| 1136 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
| 1137 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
| 1138 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
| 1139 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
| 1140 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
| 1141 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
| 1142 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
| 1143 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
| 1144 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
| 1145 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
| 1146 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
| 1147 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
| 1148 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
| 1149 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
| 1150 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
| 1151 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
| 433 | #rcon: | 1152 | #rcon: |
| 434 | &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008); | 1153 | &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008); |
| 435 | &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080); | 1154 | &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080); |
| 436 | &data_word(0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0); | 1155 | &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000); |
| 1156 | &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000); | ||
| 437 | &function_end_B("_x86_AES_encrypt"); | 1157 | &function_end_B("_x86_AES_encrypt"); |
| 438 | 1158 | ||
| 439 | # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); | 1159 | # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); |
| 440 | &public_label("AES_Te"); | ||
| 441 | &function_begin("AES_encrypt"); | 1160 | &function_begin("AES_encrypt"); |
| 442 | &mov ($acc,&wparam(0)); # load inp | 1161 | &mov ($acc,&wparam(0)); # load inp |
| 443 | &mov ($key,&wparam(2)); # load key | 1162 | &mov ($key,&wparam(2)); # load key |
| 444 | 1163 | ||
| 445 | &mov ($s0,"esp"); | 1164 | &mov ($s0,"esp"); |
| 446 | &sub ("esp",24); | 1165 | &sub ("esp",36); |
| 447 | &and ("esp",-64); | 1166 | &and ("esp",-64); # align to cache-line |
| 448 | &add ("esp",4); | 1167 | |
| 449 | &mov (&DWP(16,"esp"),$s0); | 1168 | # place stack frame just "above" the key schedule |
| 1169 | &lea ($s1,&DWP(-64-63,$key)); | ||
| 1170 | &sub ($s1,"esp"); | ||
| 1171 | &neg ($s1); | ||
| 1172 | &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line | ||
| 1173 | &sub ("esp",$s1); | ||
| 1174 | &add ("esp",4); # 4 is reserved for caller's return address | ||
| 1175 | &mov ($_esp,$s0); # save stack pointer | ||
| 450 | 1176 | ||
| 451 | &call (&label("pic_point")); # make it PIC! | 1177 | &call (&label("pic_point")); # make it PIC! |
| 452 | &set_label("pic_point"); | 1178 | &set_label("pic_point"); |
| 453 | &blindpop("ebp"); | 1179 | &blindpop($tbl); |
| 454 | &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 1180 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only); |
| 455 | 1181 | &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); | |
| 1182 | |||
| 1183 | # pick Te4 copy which can't "overlap" with stack frame or key schedule | ||
| 1184 | &lea ($s1,&DWP(768-4,"esp")); | ||
| 1185 | &sub ($s1,$tbl); | ||
| 1186 | &and ($s1,0x300); | ||
| 1187 | &lea ($tbl,&DWP(2048+128,$tbl,$s1)); | ||
| 1188 | |||
| 1189 | if (!$x86only) { | ||
| 1190 | &bt (&DWP(0,$s0),25); # check for SSE bit | ||
| 1191 | &jnc (&label("x86")); | ||
| 1192 | |||
| 1193 | &movq ("mm0",&QWP(0,$acc)); | ||
| 1194 | &movq ("mm4",&QWP(8,$acc)); | ||
| 1195 | &call ("_sse_AES_encrypt_compact"); | ||
| 1196 | &mov ("esp",$_esp); # restore stack pointer | ||
| 1197 | &mov ($acc,&wparam(1)); # load out | ||
| 1198 | &movq (&QWP(0,$acc),"mm0"); # write output data | ||
| 1199 | &movq (&QWP(8,$acc),"mm4"); | ||
| 1200 | &emms (); | ||
| 1201 | &function_end_A(); | ||
| 1202 | } | ||
| 1203 | &set_label("x86",16); | ||
| 1204 | &mov ($_tbl,$tbl); | ||
| 456 | &mov ($s0,&DWP(0,$acc)); # load input data | 1205 | &mov ($s0,&DWP(0,$acc)); # load input data |
| 457 | &mov ($s1,&DWP(4,$acc)); | 1206 | &mov ($s1,&DWP(4,$acc)); |
| 458 | &mov ($s2,&DWP(8,$acc)); | 1207 | &mov ($s2,&DWP(8,$acc)); |
| 459 | &mov ($s3,&DWP(12,$acc)); | 1208 | &mov ($s3,&DWP(12,$acc)); |
| 460 | 1209 | &call ("_x86_AES_encrypt_compact"); | |
| 461 | &call ("_x86_AES_encrypt"); | 1210 | &mov ("esp",$_esp); # restore stack pointer |
| 462 | |||
| 463 | &mov ("esp",&DWP(16,"esp")); | ||
| 464 | |||
| 465 | &mov ($acc,&wparam(1)); # load out | 1211 | &mov ($acc,&wparam(1)); # load out |
| 466 | &mov (&DWP(0,$acc),$s0); # write output data | 1212 | &mov (&DWP(0,$acc),$s0); # write output data |
| 467 | &mov (&DWP(4,$acc),$s1); | 1213 | &mov (&DWP(4,$acc),$s1); |
| @@ -469,7 +1215,370 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } | |||
| 469 | &mov (&DWP(12,$acc),$s3); | 1215 | &mov (&DWP(12,$acc),$s3); |
| 470 | &function_end("AES_encrypt"); | 1216 | &function_end("AES_encrypt"); |
| 471 | 1217 | ||
| 472 | #------------------------------------------------------------------# | 1218 | #--------------------------------------------------------------------# |
| 1219 | |||
| 1220 | ###################################################################### | ||
| 1221 | # "Compact" block function | ||
| 1222 | ###################################################################### | ||
| 1223 | |||
# deccompact($i, $td, @s [, $extra])
#
# Emits the instructions that build state word $i of one "compact"
# decryption round.  One byte is taken from each of the four registers
# in @s (bits 0-7 of $s[0], the &HB() byte of $s[1], bits 16-23 of
# $s[2], bits 24-31 of $s[3]); each byte is substituted through the
# byte table addressed at -128($td) and shifted back to the byte
# position it came from, and the four results are xor-ed together.
#
#   $i      index of the word being produced (0..3)
#   $td     register holding the table pointer (biased by +128)
#   @s      the four state registers, in the order their bytes are used
#   $extra  any 7th argument: suppresses the stack reloads described
#           below (its value is ignored, only its presence matters)
#
# Where the result ends up:
#   $i <  2  stored to 4+4*$i(%esp)
#   $i == 2  left in $acc
#   $i == 3  built in place in $s[0]; the $i==2 result is moved from
#            $acc into $s[1] once $s[1] has been consumed
sub deccompact()
{ my $Fn = \&mov;	# real code reference: no bareword/symbolic call
	while ($#_>5) { pop(@_); $Fn=sub{}; }
	my ($i,$td,@s)=@_;
	my $tmp = $key;
	my $out = $i==3?$s[0]:$acc;

	# $Fn guards the three reloads emitted in the $i==3 step ($key from
	# $__key, $s[2] from $__s1, $s[3] from $__s0).  When the caller
	# passes the extra argument $Fn is a no-op, so none of them is
	# emitted and the values written to the stack by the $i<2 steps
	# are simply left there for the caller to pick up.
	if($i==3) { &$Fn ($key,$__key); }
	else { &mov ($out,$s[0]); }
	&and ($out,0xFF);
	&movz ($out,&BP(-128,$td,$out,1));

	# for $i==3 the source register itself is reused as scratch
	if ($i==3) { $tmp=$s[1]; }
	&movz ($tmp,&HB($s[1]));
	&movz ($tmp,&BP(-128,$td,$tmp,1));
	&shl ($tmp,8);
	&xor ($out,$tmp);

	if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
	else { &mov ($tmp,$s[2]); }
	&shr ($tmp,16);
	&and ($tmp,0xFF);
	&movz ($tmp,&BP(-128,$td,$tmp,1));
	&shl ($tmp,16);
	&xor ($out,$tmp);

	if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
	else { &mov ($tmp,$s[3]); }
	&shr ($tmp,24);
	&movz ($tmp,&BP(-128,$td,$tmp,1));
	&shl ($tmp,24);
	&xor ($out,$tmp);
	if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
	if ($i==3) { &$Fn ($s[3],$__s0); }
}
| 1263 | |||
# dectransform($i)
#
# Emits the code that transforms state word $s[$i] in place.  Three
# successive packed doublings in GF(2^8) produce tp2, tp4 and tp8 from
# tp1 = $s[$i] (mask the top bit of every byte, turn it into 0x1b,
# xor that into the byte-wise left shift), and the result is
#
#   ROTATE(tp1,8) ^ (tp2^tp1) ^ (tp4^tp1) ^ tp8
#     ^ ROTATE(tp8^tp2^tp1,24) ^ ROTATE(tp8^tp4^tp1,16) ^ ROTATE(tp8,8)
#
# NOTE(review): this looks like the AES inverse MixColumns step applied
# to one column -- confirm against the full file.
#
# Scratch registers: $key (tmp), $tbl (tp8) and two *other* state
# registers (tp2, tp4); all of them are clobbered.  Words 2 and 3 are
# written back to 4+4*$i(%esp), and the state registers used as scratch
# are reloaded from $__s0..$__s3 along the way.
#
# must be called with 2,3,0,1 as argument sequence!!!
# (presumably because the scratch/reload choices above only work out
# in that order -- verify before changing the callers)
sub dectransform()
{ my @s = ($s0,$s1,$s2,$s3);
	my $i = shift;
	my $tmp = $key;
	# plain element access instead of one-element slices; for $i==1 the
	# scratch registers are fixed to $s[2]/$s[3]
	my $tp2 = $i==1 ? $s[2] : $s[($i+2)%4];
	my $tp4 = $i==1 ? $s[3] : $s[($i+3)%4];
	my $tp8 = $tbl;

	# tp2 = 2*tp1, byte-wise
	&mov ($acc,$s[$i]);
	&and ($acc,0x80808080);
	&mov ($tmp,$acc);
	&shr ($tmp,7);
	&lea ($tp2,&DWP(0,$s[$i],$s[$i]));
	&sub ($acc,$tmp);
	&and ($tp2,0xfefefefe);
	&and ($acc,0x1b1b1b1b);
	&xor ($acc,$tp2);
	&mov ($tp2,$acc);

	# tp4 = 2*tp2, byte-wise
	&and ($acc,0x80808080);
	&mov ($tmp,$acc);
	&shr ($tmp,7);
	&lea ($tp4,&DWP(0,$tp2,$tp2));
	&sub ($acc,$tmp);
	&and ($tp4,0xfefefefe);
	&and ($acc,0x1b1b1b1b);
	&xor ($tp2,$s[$i]); # tp2^tp1
	&xor ($acc,$tp4);
	&mov ($tp4,$acc);

	# tp8 = 2*tp4, byte-wise
	&and ($acc,0x80808080);
	&mov ($tmp,$acc);
	&shr ($tmp,7);
	&lea ($tp8,&DWP(0,$tp4,$tp4));
	&sub ($acc,$tmp);
	&and ($tp8,0xfefefefe);
	&and ($acc,0x1b1b1b1b);
	&xor ($tp4,$s[$i]); # tp4^tp1
	&rotl ($s[$i],8); # = ROTATE(tp1,8)
	&xor ($tp8,$acc);

	# fold everything into $s[$i]
	&xor ($s[$i],$tp2);
	&xor ($tp2,$tp8);
	&rotl ($tp2,24);
	&xor ($s[$i],$tp4);
	&xor ($tp4,$tp8);
	&rotl ($tp4,16);
	&xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
	&rotl ($tp8,8);
	&xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
	&xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
	&mov ($s[0],$__s0) if($i==2); #prefetch $s0
	&mov ($s[1],$__s1) if($i==3); #prefetch $s1
	&mov ($s[2],$__s2) if($i==1);
	&xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)

	&mov ($s[3],$__s3) if($i==1);
	&mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
}
| 1324 | |||
# _x86_AES_decrypt_compact: integer-only "compact" decryption of one
# block.  The state is passed in and returned in $s0..$s3, $key points
# at the key schedule and $tbl at the byte table (biased by +128, see
# the -128 displacements below).  $acc, $key and $tbl are used as
# scratch; $key and $tbl are reloaded from $__key/$__tbl every round.
&function_begin_B("_x86_AES_decrypt_compact");
	# note that caller is expected to allocate stack frame for me!
	&mov ($__key,$key); # save key

	&xor ($s0,&DWP(0,$key)); # xor with key
	&xor ($s1,&DWP(4,$key));
	&xor ($s2,&DWP(8,$key));
	&xor ($s3,&DWP(12,$key));

	&mov ($acc,&DWP(240,$key)); # load key->rounds

	# $acc = $key + 16*(rounds-1), computed as $key + 8*(2*rounds-2)
	&lea ($acc,&DWP(-2,$acc,$acc));
	&lea ($acc,&DWP(0,$key,$acc,8));
	&mov ($__end,$acc); # end of key schedule

	# prefetch Td4
	# (one load every 32 bytes across the 256-byte table; the loaded
	# values are discarded, only the memory touch matters)
	&mov ($key,&DWP(0-128,$tbl));
	&mov ($acc,&DWP(32-128,$tbl));
	&mov ($key,&DWP(64-128,$tbl));
	&mov ($acc,&DWP(96-128,$tbl));
	&mov ($key,&DWP(128-128,$tbl));
	&mov ($acc,&DWP(160-128,$tbl));
	&mov ($key,&DWP(192-128,$tbl));
	&mov ($acc,&DWP(224-128,$tbl));

	&set_label("loop",16);

		# the trailing 1 is the "extra argument" of deccompact: it
		# suppresses the stack reloads, dectransform does them instead
		&deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);
		&deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
		&deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
		&deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
		# order 2,3,0,1 is mandatory, see dectransform
		&dectransform(2);
		&dectransform(3);
		&dectransform(0);
		&dectransform(1);
		# dectransform used $key and $tbl as scratch, restore them
		&mov ($key,$__key);
		&mov ($tbl,$__tbl);
		&add ($key,16); # advance rd_key
		&xor ($s0,&DWP(0,$key));
		&xor ($s1,&DWP(4,$key));
		&xor ($s2,&DWP(8,$key));
		&xor ($s3,&DWP(12,$key));

	&cmp ($key,$__end);
	&mov ($__key,$key);
	&jb (&label("loop"));

	# last round: substitution only, no dectransform, and with the
	# stack reloads enabled (no extra argument)
	&deccompact(0,$tbl,$s0,$s3,$s2,$s1);
	&deccompact(1,$tbl,$s1,$s0,$s3,$s2);
	&deccompact(2,$tbl,$s2,$s1,$s0,$s3);
	&deccompact(3,$tbl,$s3,$s2,$s1,$s0);

	# final round key sits 16 bytes past the last one used in the loop
	&xor ($s0,&DWP(16,$key));
	&xor ($s1,&DWP(20,$key));
	&xor ($s2,&DWP(24,$key));
	&xor ($s3,&DWP(28,$key));

	&ret ();
&function_end_B("_x86_AES_decrypt_compact");
| 1384 | |||
| 1385 | ###################################################################### | ||
| 1386 | # "Compact" SSE block function. | ||
| 1387 | ###################################################################### | ||
| 1388 | |||
# sse_deccompact()
#
# Emits one byte-substitution step for the MMX/SSE code path.  The
# 16-byte state is taken from mm0 (bytes 0-7) and mm4 (bytes 8-15).
# Every byte is looked up in the byte table at -128($tbl) and the
# results are collected as four 32-bit words t[0..3], which are
# returned packed as mm0 = t[0,1] and mm4 = t[2,3].
#
# The number in each trailing comment is the index of the state byte
# the instruction is working on.  Clobbers eax, ebx, ecx, edx, $acc and
# mm1, mm2, mm5, mm6.  The instruction interleaving is deliberate; do
# not reorder.
sub sse_deccompact()
{
	&pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
	&movd ("eax","mm1"); # 7, 6, 1, 0

	&pshufw ("mm5","mm4",0x09); # 13,12,11,10
	&movz ($acc,&LB("eax")); # 0
	&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
	&movd ("ebx","mm5"); # 13,12,11,10
	&movz ("edx",&HB("eax")); # 1
	&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
	&shl ("edx",8); # 1

	&pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
	&movz ($acc,&LB("ebx")); # 10
	&movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
	&shl ($acc,16); # 10
	&or ("ecx",$acc); # 10
	&shr ("eax",16); # 7, 6
	&movz ($acc,&HB("ebx")); # 11
	&movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
	&shl ($acc,24); # 11
	&or ("edx",$acc); # 11
	&shr ("ebx",16); # 13,12

	&pshufw ("mm6","mm4",0x03); # 9, 8,15,14
	&movz ($acc,&HB("eax")); # 7
	&movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
	&shl ($acc,24); # 7
	&or ("ecx",$acc); # 7
	&movz ($acc,&HB("ebx")); # 13
	&movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
	&shl ($acc,8); # 13
	&or ("ecx",$acc); # 13
	&movd ("mm0","ecx"); # t[0] collected

	&movz ($acc,&LB("eax")); # 6
	&movd ("eax","mm2"); # 3, 2, 5, 4
	&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
	&shl ("ecx",16); # 6
	&movz ($acc,&LB("ebx")); # 12
	&movd ("ebx","mm6"); # 9, 8,15,14
	&movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
	&or ("ecx",$acc); # 12

	&movz ($acc,&LB("eax")); # 4
	&movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
	&or ("edx",$acc); # 4
	&movz ($acc,&LB("ebx")); # 14
	&movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
	&shl ($acc,16); # 14
	&or ("edx",$acc); # 14
	&movd ("mm1","edx"); # t[1] collected

	&movz ($acc,&HB("eax")); # 5
	&movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
	&shl ("edx",8); # 5
	&movz ($acc,&HB("ebx")); # 15
	&shr ("eax",16); # 3, 2
	&movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
	&shl ($acc,24); # 15
	&or ("edx",$acc); # 15
	&shr ("ebx",16); # 9, 8

	&punpckldq ("mm0","mm1"); # t[0,1] collected

	&movz ($acc,&HB("ebx")); # 9
	&movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
	&shl ($acc,8); # 9
	&or ("ecx",$acc); # 9
	&and ("ebx",0xff); # 8
	&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
	&or ("edx","ebx"); # 8
	&movz ($acc,&LB("eax")); # 2
	&movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
	&shl ($acc,16); # 2
	&or ("edx",$acc); # 2
	&movd ("mm4","edx"); # t[2] collected
	&movz ("eax",&HB("eax")); # 3
	&movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
	&shl ("eax",24); # 3
	&or ("ecx","eax"); # 3
	&movd ("mm5","ecx"); # t[3] collected

	&punpckldq ("mm4","mm5"); # t[2,3] collected
}
| 1475 | |||
# _sse_AES_decrypt_compact: MMX/SSE "compact" decryption of one block.
# Only generated when $x86only is false.  The state is passed in and
# returned in mm0 (bytes 0-7) and mm4 (bytes 8-15); $key points at the
# key schedule and $tbl at the byte table (biased by +128).
# Each loop iteration runs sse_deccompact, advances $key by 16 and,
# unless that was the last round, applies the column transform to both
# halves in parallel (left statement on each line works on mm0, right
# statement on mm4) and xors in the next round key.
if (!$x86only) {
&function_begin_B("_sse_AES_decrypt_compact");
	&pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
	&pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8

	# note that caller is expected to allocate stack frame for me!
	&mov ($acc,&DWP(240,$key)); # load key->rounds
	# $acc = $key + 16*(rounds-1), computed as $key + 8*(2*rounds-2)
	&lea ($acc,&DWP(-2,$acc,$acc));
	&lea ($acc,&DWP(0,$key,$acc,8));
	&mov ($__end,$acc); # end of key schedule

	# 8 bytes of 0x1b at 8(%esp); read back below with movq as the
	# per-byte reduction constant for the GF(2^8) doublings
	&mov ($s0,0x1b1b1b1b); # magic constant
	&mov (&DWP(8,"esp"),$s0);
	&mov (&DWP(12,"esp"),$s0);

	# prefetch Td4
	# (one load every 32 bytes; the loaded values are discarded)
	&mov ($s0,&DWP(0-128,$tbl));
	&mov ($s1,&DWP(32-128,$tbl));
	&mov ($s2,&DWP(64-128,$tbl));
	&mov ($s3,&DWP(96-128,$tbl));
	&mov ($s0,&DWP(128-128,$tbl));
	&mov ($s1,&DWP(160-128,$tbl));
	&mov ($s2,&DWP(192-128,$tbl));
	&mov ($s3,&DWP(224-128,$tbl));

	&set_label("loop",16);
		&sse_deccompact();
		&add ($key,16);
		&cmp ($key,$__end);
		&ja (&label("out"));	# past the end: last round has no transform

		# ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
		# NOTE(review): the third argument (1) to movq on the second
		# line is passed through to the assembler helper; its meaning
		# is not visible here -- confirm in the perlasm module.
		&movq ("mm3","mm0"); &movq ("mm7","mm4");
		&movq ("mm2","mm0",1); &movq ("mm6","mm4",1);
		&movq ("mm1","mm0"); &movq ("mm5","mm4");
		&pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
		&pslld ("mm2",8); &pslld ("mm6",8);
		&psrld ("mm3",8); &psrld ("mm7",8);
		&pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8
		&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8
		&pslld ("mm2",16); &pslld ("mm6",16);
		&psrld ("mm3",16); &psrld ("mm7",16);
		&pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24
		&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24

		# tp2 = 2*tp0 per byte: pcmpgtb against zero yields 0xff for
		# bytes with the top bit set, pand turns that into 0x1b, paddb
		# doubles the byte and pxor applies the reduction
		&movq ("mm3",&QWP(8,"esp"));
		&pxor ("mm2","mm2"); &pxor ("mm6","mm6");
		&pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5");
		&pand ("mm2","mm3"); &pand ("mm6","mm3");
		&paddb ("mm1","mm1"); &paddb ("mm5","mm5");
		&pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2
		&movq ("mm3","mm1"); &movq ("mm7","mm5");
		&movq ("mm2","mm1"); &movq ("mm6","mm5");
		&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2
		&pslld ("mm3",24); &pslld ("mm7",24);
		&psrld ("mm2",8); &psrld ("mm6",8);
		&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24
		&pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8

		# tp4 = 2*tp2 per byte, same doubling sequence
		&movq ("mm2",&QWP(8,"esp"));
		&pxor ("mm3","mm3"); &pxor ("mm7","mm7");
		&pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
		&pand ("mm3","mm2"); &pand ("mm7","mm2");
		&paddb ("mm1","mm1"); &paddb ("mm5","mm5");
		&pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
		&pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
		&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
		&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)

		# tp8 = 2*tp4 per byte (mm2 still holds the 0x1b constant)
		&pxor ("mm3","mm3"); &pxor ("mm7","mm7");
		&pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
		&pand ("mm3","mm2"); &pand ("mm7","mm2");
		&paddb ("mm1","mm1"); &paddb ("mm5","mm5");
		&pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8
		&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8
		&movq ("mm3","mm1"); &movq ("mm7","mm5");
		&pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);
		&pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
		&pslld ("mm1",8); &pslld ("mm5",8);
		&psrld ("mm3",8); &psrld ("mm7",8);
		# next round key is fetched early into mm2/mm6
		&movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
		&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
		&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
		# the four integer loads interleaved below discard their
		# results; presumably they re-touch the table to keep it
		# cached for the next round -- TODO confirm
		&mov ($s0,&DWP(0-128,$tbl));
		&pslld ("mm1",16); &pslld ("mm5",16);
		&mov ($s1,&DWP(64-128,$tbl));
		&psrld ("mm3",16); &psrld ("mm7",16);
		&mov ($s2,&DWP(128-128,$tbl));
		&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
		&mov ($s3,&DWP(192-128,$tbl));
		&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24

		# add round key
		&pxor ("mm0","mm2"); &pxor ("mm4","mm6");
		&jmp (&label("loop"));

	&set_label("out",16);
	# final round key
	&pxor ("mm0",&QWP(0,$key));
	&pxor ("mm4",&QWP(8,$key));

	&ret ();
&function_end_B("_sse_AES_decrypt_compact");
}
| 1578 | |||
| 1579 | ###################################################################### | ||
| 1580 | # Vanilla block function. | ||
| 1581 | ###################################################################### | ||
| 473 | 1582 | ||
| 474 | sub decstep() | 1583 | sub decstep() |
| 475 | { my ($i,$td,@s) = @_; | 1584 | { my ($i,$td,@s) = @_; |
| @@ -480,7 +1589,7 @@ sub decstep() | |||
| 480 | # optimal... or rather that all attempts to reorder didn't | 1589 | # optimal... or rather that all attempts to reorder didn't |
| 481 | # result in better performance [which by the way is not a | 1590 | # result in better performance [which by the way is not a |
| 482 | # bit lower than encryption]. | 1591 | # bit lower than encryption]. |
| 483 | if($i==3) { &mov ($key,&DWP(12,"esp")); } | 1592 | if($i==3) { &mov ($key,$__key); } |
| 484 | else { &mov ($out,$s[0]); } | 1593 | else { &mov ($out,$s[0]); } |
| 485 | &and ($out,0xFF); | 1594 | &and ($out,0xFF); |
| 486 | &mov ($out,&DWP(0,$td,$out,8)); | 1595 | &mov ($out,&DWP(0,$td,$out,8)); |
| @@ -495,12 +1604,12 @@ sub decstep() | |||
| 495 | &and ($tmp,0xFF); | 1604 | &and ($tmp,0xFF); |
| 496 | &xor ($out,&DWP(2,$td,$tmp,8)); | 1605 | &xor ($out,&DWP(2,$td,$tmp,8)); |
| 497 | 1606 | ||
| 498 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); } | 1607 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); } |
| 499 | else { &mov ($tmp,$s[3]); } | 1608 | else { &mov ($tmp,$s[3]); } |
| 500 | &shr ($tmp,24); | 1609 | &shr ($tmp,24); |
| 501 | &xor ($out,&DWP(1,$td,$tmp,8)); | 1610 | &xor ($out,&DWP(1,$td,$tmp,8)); |
| 502 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | 1611 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } |
| 503 | if ($i==3) { &mov ($s[3],&DWP(4,"esp")); } | 1612 | if ($i==3) { &mov ($s[3],$__s0); } |
| 504 | &comment(); | 1613 | &comment(); |
| 505 | } | 1614 | } |
| 506 | 1615 | ||
| @@ -509,14 +1618,24 @@ sub declast() | |||
| 509 | my $tmp = $key; | 1618 | my $tmp = $key; |
| 510 | my $out = $i==3?$s[0]:$acc; | 1619 | my $out = $i==3?$s[0]:$acc; |
| 511 | 1620 | ||
| 512 | if($i==3) { &mov ($key,&DWP(12,"esp")); } | 1621 | if($i==0) { &lea ($td,&DWP(2048+128,$td)); |
| 1622 | &mov ($tmp,&DWP(0-128,$td)); | ||
| 1623 | &mov ($acc,&DWP(32-128,$td)); | ||
| 1624 | &mov ($tmp,&DWP(64-128,$td)); | ||
| 1625 | &mov ($acc,&DWP(96-128,$td)); | ||
| 1626 | &mov ($tmp,&DWP(128-128,$td)); | ||
| 1627 | &mov ($acc,&DWP(160-128,$td)); | ||
| 1628 | &mov ($tmp,&DWP(192-128,$td)); | ||
| 1629 | &mov ($acc,&DWP(224-128,$td)); | ||
| 1630 | &lea ($td,&DWP(-128,$td)); } | ||
| 1631 | if($i==3) { &mov ($key,$__key); } | ||
| 513 | else { &mov ($out,$s[0]); } | 1632 | else { &mov ($out,$s[0]); } |
| 514 | &and ($out,0xFF); | 1633 | &and ($out,0xFF); |
| 515 | &movz ($out,&BP(2048,$td,$out,1)); | 1634 | &movz ($out,&BP(0,$td,$out,1)); |
| 516 | 1635 | ||
| 517 | if ($i==3) { $tmp=$s[1]; } | 1636 | if ($i==3) { $tmp=$s[1]; } |
| 518 | &movz ($tmp,&HB($s[1])); | 1637 | &movz ($tmp,&HB($s[1])); |
| 519 | &movz ($tmp,&BP(2048,$td,$tmp,1)); | 1638 | &movz ($tmp,&BP(0,$td,$tmp,1)); |
| 520 | &shl ($tmp,8); | 1639 | &shl ($tmp,8); |
| 521 | &xor ($out,$tmp); | 1640 | &xor ($out,$tmp); |
| 522 | 1641 | ||
| @@ -524,24 +1643,24 @@ sub declast() | |||
| 524 | else { mov ($tmp,$s[2]); } | 1643 | else { mov ($tmp,$s[2]); } |
| 525 | &shr ($tmp,16); | 1644 | &shr ($tmp,16); |
| 526 | &and ($tmp,0xFF); | 1645 | &and ($tmp,0xFF); |
| 527 | &movz ($tmp,&BP(2048,$td,$tmp,1)); | 1646 | &movz ($tmp,&BP(0,$td,$tmp,1)); |
| 528 | &shl ($tmp,16); | 1647 | &shl ($tmp,16); |
| 529 | &xor ($out,$tmp); | 1648 | &xor ($out,$tmp); |
| 530 | 1649 | ||
| 531 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); } | 1650 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); } |
| 532 | else { &mov ($tmp,$s[3]); } | 1651 | else { &mov ($tmp,$s[3]); } |
| 533 | &shr ($tmp,24); | 1652 | &shr ($tmp,24); |
| 534 | &movz ($tmp,&BP(2048,$td,$tmp,1)); | 1653 | &movz ($tmp,&BP(0,$td,$tmp,1)); |
| 535 | &shl ($tmp,24); | 1654 | &shl ($tmp,24); |
| 536 | &xor ($out,$tmp); | 1655 | &xor ($out,$tmp); |
| 537 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | 1656 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } |
| 538 | if ($i==3) { &mov ($s[3],&DWP(4,"esp")); } | 1657 | if ($i==3) { &mov ($s[3],$__s0); |
| 1658 | &lea ($td,&DWP(-2048,$td)); } | ||
| 539 | } | 1659 | } |
| 540 | 1660 | ||
| 541 | &public_label("AES_Td"); | 1661 | &function_begin_B("_x86_AES_decrypt"); |
| 542 | &function_begin_C("_x86_AES_decrypt"); | ||
| 543 | # note that caller is expected to allocate stack frame for me! | 1662 | # note that caller is expected to allocate stack frame for me! |
| 544 | &mov (&DWP(12,"esp"),$key); # save key | 1663 | &mov ($__key,$key); # save key |
| 545 | 1664 | ||
| 546 | &xor ($s0,&DWP(0,$key)); # xor with key | 1665 | &xor ($s0,&DWP(0,$key)); # xor with key |
| 547 | &xor ($s1,&DWP(4,$key)); | 1666 | &xor ($s1,&DWP(4,$key)); |
| @@ -553,20 +1672,19 @@ sub declast() | |||
| 553 | if ($small_footprint) { | 1672 | if ($small_footprint) { |
| 554 | &lea ($acc,&DWP(-2,$acc,$acc)); | 1673 | &lea ($acc,&DWP(-2,$acc,$acc)); |
| 555 | &lea ($acc,&DWP(0,$key,$acc,8)); | 1674 | &lea ($acc,&DWP(0,$key,$acc,8)); |
| 556 | &mov (&DWP(16,"esp"),$acc); # end of key schedule | 1675 | &mov ($__end,$acc); # end of key schedule |
| 557 | &align (4); | 1676 | &set_label("loop",16); |
| 558 | &set_label("loop"); | 1677 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
| 559 | &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1678 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
| 560 | &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1679 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
| 561 | &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1680 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
| 562 | &decstep(3,"ebp",$s3,$s2,$s1,$s0); | ||
| 563 | &add ($key,16); # advance rd_key | 1681 | &add ($key,16); # advance rd_key |
| 564 | &xor ($s0,&DWP(0,$key)); | 1682 | &xor ($s0,&DWP(0,$key)); |
| 565 | &xor ($s1,&DWP(4,$key)); | 1683 | &xor ($s1,&DWP(4,$key)); |
| 566 | &xor ($s2,&DWP(8,$key)); | 1684 | &xor ($s2,&DWP(8,$key)); |
| 567 | &xor ($s3,&DWP(12,$key)); | 1685 | &xor ($s3,&DWP(12,$key)); |
| 568 | &cmp ($key,&DWP(16,"esp")); | 1686 | &cmp ($key,$__end); |
| 569 | &mov (&DWP(12,"esp"),$key); | 1687 | &mov ($__key,$key); |
| 570 | &jb (&label("loop")); | 1688 | &jb (&label("loop")); |
| 571 | } | 1689 | } |
| 572 | else { | 1690 | else { |
| @@ -575,38 +1693,38 @@ sub declast() | |||
| 575 | &cmp ($acc,12); | 1693 | &cmp ($acc,12); |
| 576 | &jle (&label("12rounds")); | 1694 | &jle (&label("12rounds")); |
| 577 | 1695 | ||
| 578 | &set_label("14rounds"); | 1696 | &set_label("14rounds",4); |
| 579 | for ($i=1;$i<3;$i++) { | 1697 | for ($i=1;$i<3;$i++) { |
| 580 | &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1698 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
| 581 | &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1699 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
| 582 | &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1700 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
| 583 | &decstep(3,"ebp",$s3,$s2,$s1,$s0); | 1701 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
| 584 | &xor ($s0,&DWP(16*$i+0,$key)); | 1702 | &xor ($s0,&DWP(16*$i+0,$key)); |
| 585 | &xor ($s1,&DWP(16*$i+4,$key)); | 1703 | &xor ($s1,&DWP(16*$i+4,$key)); |
| 586 | &xor ($s2,&DWP(16*$i+8,$key)); | 1704 | &xor ($s2,&DWP(16*$i+8,$key)); |
| 587 | &xor ($s3,&DWP(16*$i+12,$key)); | 1705 | &xor ($s3,&DWP(16*$i+12,$key)); |
| 588 | } | 1706 | } |
| 589 | &add ($key,32); | 1707 | &add ($key,32); |
| 590 | &mov (&DWP(12,"esp"),$key); # advance rd_key | 1708 | &mov ($__key,$key); # advance rd_key |
| 591 | &set_label("12rounds"); | 1709 | &set_label("12rounds",4); |
| 592 | for ($i=1;$i<3;$i++) { | 1710 | for ($i=1;$i<3;$i++) { |
| 593 | &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1711 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
| 594 | &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1712 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
| 595 | &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1713 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
| 596 | &decstep(3,"ebp",$s3,$s2,$s1,$s0); | 1714 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
| 597 | &xor ($s0,&DWP(16*$i+0,$key)); | 1715 | &xor ($s0,&DWP(16*$i+0,$key)); |
| 598 | &xor ($s1,&DWP(16*$i+4,$key)); | 1716 | &xor ($s1,&DWP(16*$i+4,$key)); |
| 599 | &xor ($s2,&DWP(16*$i+8,$key)); | 1717 | &xor ($s2,&DWP(16*$i+8,$key)); |
| 600 | &xor ($s3,&DWP(16*$i+12,$key)); | 1718 | &xor ($s3,&DWP(16*$i+12,$key)); |
| 601 | } | 1719 | } |
| 602 | &add ($key,32); | 1720 | &add ($key,32); |
| 603 | &mov (&DWP(12,"esp"),$key); # advance rd_key | 1721 | &mov ($__key,$key); # advance rd_key |
| 604 | &set_label("10rounds"); | 1722 | &set_label("10rounds",4); |
| 605 | for ($i=1;$i<10;$i++) { | 1723 | for ($i=1;$i<10;$i++) { |
| 606 | &decstep(0,"ebp",$s0,$s3,$s2,$s1); | 1724 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); |
| 607 | &decstep(1,"ebp",$s1,$s0,$s3,$s2); | 1725 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); |
| 608 | &decstep(2,"ebp",$s2,$s1,$s0,$s3); | 1726 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); |
| 609 | &decstep(3,"ebp",$s3,$s2,$s1,$s0); | 1727 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); |
| 610 | &xor ($s0,&DWP(16*$i+0,$key)); | 1728 | &xor ($s0,&DWP(16*$i+0,$key)); |
| 611 | &xor ($s1,&DWP(16*$i+4,$key)); | 1729 | &xor ($s1,&DWP(16*$i+4,$key)); |
| 612 | &xor ($s2,&DWP(16*$i+8,$key)); | 1730 | &xor ($s2,&DWP(16*$i+8,$key)); |
| @@ -614,10 +1732,10 @@ sub declast() | |||
| 614 | } | 1732 | } |
| 615 | } | 1733 | } |
| 616 | 1734 | ||
| 617 | &declast(0,"ebp",$s0,$s3,$s2,$s1); | 1735 | &declast(0,$tbl,$s0,$s3,$s2,$s1); |
| 618 | &declast(1,"ebp",$s1,$s0,$s3,$s2); | 1736 | &declast(1,$tbl,$s1,$s0,$s3,$s2); |
| 619 | &declast(2,"ebp",$s2,$s1,$s0,$s3); | 1737 | &declast(2,$tbl,$s2,$s1,$s0,$s3); |
| 620 | &declast(3,"ebp",$s3,$s2,$s1,$s0); | 1738 | &declast(3,$tbl,$s3,$s2,$s1,$s0); |
| 621 | 1739 | ||
| 622 | &add ($key,$small_footprint?16:160); | 1740 | &add ($key,$small_footprint?16:160); |
| 623 | &xor ($s0,&DWP(0,$key)); | 1741 | &xor ($s0,&DWP(0,$key)); |
| @@ -692,7 +1810,107 @@ sub declast() | |||
| 692 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); | 1810 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); |
| 693 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); | 1811 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); |
| 694 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); | 1812 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); |
| 695 | #Td4: | 1813 | |
| 1814 | #Td4: # four copies of Td4 to choose from to avoid L1 aliasing | ||
| 1815 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
| 1816 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
| 1817 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
| 1818 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
| 1819 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
| 1820 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
| 1821 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
| 1822 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
| 1823 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
| 1824 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
| 1825 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
| 1826 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
| 1827 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
| 1828 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
| 1829 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
| 1830 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
| 1831 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
| 1832 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
| 1833 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
| 1834 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
| 1835 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
| 1836 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
| 1837 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
| 1838 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
| 1839 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
| 1840 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
| 1841 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
| 1842 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
| 1843 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
| 1844 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
| 1845 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
| 1846 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
| 1847 | |||
| 1848 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
| 1849 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
| 1850 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
| 1851 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
| 1852 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
| 1853 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
| 1854 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
| 1855 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
| 1856 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
| 1857 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
| 1858 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
| 1859 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
| 1860 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
| 1861 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
| 1862 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
| 1863 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
| 1864 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
| 1865 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
| 1866 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
| 1867 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
| 1868 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
| 1869 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
| 1870 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
| 1871 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
| 1872 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
| 1873 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
| 1874 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
| 1875 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
| 1876 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
| 1877 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
| 1878 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
| 1879 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
| 1880 | |||
| 1881 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
| 1882 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
| 1883 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
| 1884 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
| 1885 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
| 1886 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
| 1887 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
| 1888 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
| 1889 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
| 1890 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
| 1891 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
| 1892 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
| 1893 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
| 1894 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
| 1895 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
| 1896 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
| 1897 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
| 1898 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
| 1899 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
| 1900 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
| 1901 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
| 1902 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
| 1903 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
| 1904 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
| 1905 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
| 1906 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
| 1907 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
| 1908 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
| 1909 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
| 1910 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
| 1911 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
| 1912 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
| 1913 | |||
| 696 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | 1914 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
| 697 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | 1915 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); |
| 698 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | 1916 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); |
| @@ -728,43 +1946,57 @@ sub declast() | |||
| 728 | &function_end_B("_x86_AES_decrypt"); | 1946 | &function_end_B("_x86_AES_decrypt"); |
| 729 | 1947 | ||
| 730 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); | 1948 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); |
| 731 | &public_label("AES_Td"); | ||
| 732 | &function_begin("AES_decrypt"); | 1949 | &function_begin("AES_decrypt"); |
| 733 | &mov ($acc,&wparam(0)); # load inp | 1950 | &mov ($acc,&wparam(0)); # load inp |
| 734 | &mov ($key,&wparam(2)); # load key | 1951 | &mov ($key,&wparam(2)); # load key |
| 735 | 1952 | ||
| 736 | &mov ($s0,"esp"); | 1953 | &mov ($s0,"esp"); |
| 737 | &sub ("esp",24); | 1954 | &sub ("esp",36); |
| 738 | &and ("esp",-64); | 1955 | &and ("esp",-64); # align to cache-line |
| 739 | &add ("esp",4); | 1956 | |
| 740 | &mov (&DWP(16,"esp"),$s0); | 1957 | # place stack frame just "above" the key schedule |
| 1958 | &lea ($s1,&DWP(-64-63,$key)); | ||
| 1959 | &sub ($s1,"esp"); | ||
| 1960 | &neg ($s1); | ||
| 1961 | &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line | ||
| 1962 | &sub ("esp",$s1); | ||
| 1963 | &add ("esp",4); # 4 is reserved for caller's return address | ||
| 1964 | &mov ($_esp,$s0); # save stack pointer | ||
| 741 | 1965 | ||
| 742 | &call (&label("pic_point")); # make it PIC! | 1966 | &call (&label("pic_point")); # make it PIC! |
| 743 | &set_label("pic_point"); | 1967 | &set_label("pic_point"); |
| 744 | &blindpop("ebp"); | 1968 | &blindpop($tbl); |
| 745 | &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); | 1969 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only); |
| 746 | 1970 | &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl)); | |
| 747 | # prefetch Td4 | 1971 | |
| 748 | &lea ("ebp",&DWP(2048+128,"ebp")); | 1972 | # pick Td4 copy which can't "overlap" with stack frame or key schedule |
| 749 | &mov ($s0,&DWP(0-128,"ebp")); | 1973 | &lea ($s1,&DWP(768-4,"esp")); |
| 750 | &mov ($s1,&DWP(32-128,"ebp")); | 1974 | &sub ($s1,$tbl); |
| 751 | &mov ($s2,&DWP(64-128,"ebp")); | 1975 | &and ($s1,0x300); |
| 752 | &mov ($s3,&DWP(96-128,"ebp")); | 1976 | &lea ($tbl,&DWP(2048+128,$tbl,$s1)); |
| 753 | &mov ($s0,&DWP(128-128,"ebp")); | 1977 | |
| 754 | &mov ($s1,&DWP(160-128,"ebp")); | 1978 | if (!$x86only) { |
| 755 | &mov ($s2,&DWP(192-128,"ebp")); | 1979 | &bt (&DWP(0,$s0),25); # check for SSE bit |
| 756 | &mov ($s3,&DWP(224-128,"ebp")); | 1980 | &jnc (&label("x86")); |
| 757 | &lea ("ebp",&DWP(-2048-128,"ebp")); | 1981 | |
| 758 | 1982 | &movq ("mm0",&QWP(0,$acc)); | |
| 1983 | &movq ("mm4",&QWP(8,$acc)); | ||
| 1984 | &call ("_sse_AES_decrypt_compact"); | ||
| 1985 | &mov ("esp",$_esp); # restore stack pointer | ||
| 1986 | &mov ($acc,&wparam(1)); # load out | ||
| 1987 | &movq (&QWP(0,$acc),"mm0"); # write output data | ||
| 1988 | &movq (&QWP(8,$acc),"mm4"); | ||
| 1989 | &emms (); | ||
| 1990 | &function_end_A(); | ||
| 1991 | } | ||
| 1992 | &set_label("x86",16); | ||
| 1993 | &mov ($_tbl,$tbl); | ||
| 759 | &mov ($s0,&DWP(0,$acc)); # load input data | 1994 | &mov ($s0,&DWP(0,$acc)); # load input data |
| 760 | &mov ($s1,&DWP(4,$acc)); | 1995 | &mov ($s1,&DWP(4,$acc)); |
| 761 | &mov ($s2,&DWP(8,$acc)); | 1996 | &mov ($s2,&DWP(8,$acc)); |
| 762 | &mov ($s3,&DWP(12,$acc)); | 1997 | &mov ($s3,&DWP(12,$acc)); |
| 763 | 1998 | &call ("_x86_AES_decrypt_compact"); | |
| 764 | &call ("_x86_AES_decrypt"); | 1999 | &mov ("esp",$_esp); # restore stack pointer |
| 765 | |||
| 766 | &mov ("esp",&DWP(16,"esp")); | ||
| 767 | |||
| 768 | &mov ($acc,&wparam(1)); # load out | 2000 | &mov ($acc,&wparam(1)); # load out |
| 769 | &mov (&DWP(0,$acc),$s0); # write output data | 2001 | &mov (&DWP(0,$acc),$s0); # write output data |
| 770 | &mov (&DWP(4,$acc),$s1); | 2002 | &mov (&DWP(4,$acc),$s1); |
| @@ -777,126 +2009,136 @@ sub declast() | |||
| 777 | # unsigned char *ivp,const int enc); | 2009 | # unsigned char *ivp,const int enc); |
| 778 | { | 2010 | { |
| 779 | # stack frame layout | 2011 | # stack frame layout |
| 780 | # -4(%esp) 0(%esp) return address | 2012 | # -4(%esp) # return address 0(%esp) |
| 781 | # 0(%esp) 4(%esp) tmp1 | 2013 | # 0(%esp) # s0 backing store 4(%esp) |
| 782 | # 4(%esp) 8(%esp) tmp2 | 2014 | # 4(%esp) # s1 backing store 8(%esp) |
| 783 | # 8(%esp) 12(%esp) key | 2015 | # 8(%esp) # s2 backing store 12(%esp) |
| 784 | # 12(%esp) 16(%esp) end of key schedule | 2016 | # 12(%esp) # s3 backing store 16(%esp) |
| 785 | my $_esp=&DWP(16,"esp"); #saved %esp | 2017 | # 16(%esp) # key backup 20(%esp) |
| 786 | my $_inp=&DWP(20,"esp"); #copy of wparam(0) | 2018 | # 20(%esp) # end of key schedule 24(%esp) |
| 787 | my $_out=&DWP(24,"esp"); #copy of wparam(1) | 2019 | # 24(%esp) # %ebp backup 28(%esp) |
| 788 | my $_len=&DWP(28,"esp"); #copy of wparam(2) | 2020 | # 28(%esp) # %esp backup |
| 789 | my $_key=&DWP(32,"esp"); #copy of wparam(3) | 2021 | my $_inp=&DWP(32,"esp"); # copy of wparam(0) |
| 790 | my $_ivp=&DWP(36,"esp"); #copy of wparam(4) | 2022 | my $_out=&DWP(36,"esp"); # copy of wparam(1) |
| 791 | my $_tmp=&DWP(40,"esp"); #volatile variable | 2023 | my $_len=&DWP(40,"esp"); # copy of wparam(2) |
| 792 | my $ivec=&DWP(44,"esp"); #ivec[16] | 2024 | my $_key=&DWP(44,"esp"); # copy of wparam(3) |
| 793 | my $aes_key=&DWP(60,"esp"); #copy of aes_key | 2025 | my $_ivp=&DWP(48,"esp"); # copy of wparam(4) |
| 794 | my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | 2026 | my $_tmp=&DWP(52,"esp"); # volatile variable |
| 795 | 2027 | # | |
| 796 | &public_label("AES_Te"); | 2028 | my $ivec=&DWP(60,"esp"); # ivec[16] |
| 797 | &public_label("AES_Td"); | 2029 | my $aes_key=&DWP(76,"esp"); # copy of aes_key |
| 2030 | my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds | ||
| 2031 | |||
| 798 | &function_begin("AES_cbc_encrypt"); | 2032 | &function_begin("AES_cbc_encrypt"); |
| 799 | &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len | 2033 | &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len |
| 800 | &cmp ($s2,0); | 2034 | &cmp ($s2,0); |
| 801 | &je (&label("enc_out")); | 2035 | &je (&label("drop_out")); |
| 802 | 2036 | ||
| 803 | &call (&label("pic_point")); # make it PIC! | 2037 | &call (&label("pic_point")); # make it PIC! |
| 804 | &set_label("pic_point"); | 2038 | &set_label("pic_point"); |
| 805 | &blindpop("ebp"); | 2039 | &blindpop($tbl); |
| 806 | 2040 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only); | |
| 807 | &pushf (); | ||
| 808 | &cld (); | ||
| 809 | 2041 | ||
| 810 | &cmp (&wparam(5),0); | 2042 | &cmp (&wparam(5),0); |
| 811 | &je (&label("DECRYPT")); | 2043 | &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); |
| 2044 | &jne (&label("picked_te")); | ||
| 2045 | &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl)); | ||
| 2046 | &set_label("picked_te"); | ||
| 812 | 2047 | ||
| 813 | &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 2048 | # one can argue if this is required |
| 814 | 2049 | &pushf (); | |
| 815 | # allocate aligned stack frame... | 2050 | &cld (); |
| 816 | &lea ($key,&DWP(-64-244,"esp")); | ||
| 817 | &and ($key,-64); | ||
| 818 | 2051 | ||
| 819 | # ... and make sure it doesn't alias with AES_Te modulo 4096 | 2052 | &cmp ($s2,$speed_limit); |
| 820 | &mov ($s0,"ebp"); | 2053 | &jb (&label("slow_way")); |
| 821 | &lea ($s1,&DWP(2048,"ebp")); | 2054 | &test ($s2,15); |
| 822 | &mov ($s3,$key); | 2055 | &jnz (&label("slow_way")); |
| 2056 | if (!$x86only) { | ||
| 2057 | &bt (&DWP(0,$s0),28); # check for hyper-threading bit | ||
| 2058 | &jc (&label("slow_way")); | ||
| 2059 | } | ||
| 2060 | # pre-allocate aligned stack frame... | ||
| 2061 | &lea ($acc,&DWP(-80-244,"esp")); | ||
| 2062 | &and ($acc,-64); | ||
| 2063 | |||
| 2064 | # ... and make sure it doesn't alias with $tbl modulo 4096 | ||
| 2065 | &mov ($s0,$tbl); | ||
| 2066 | &lea ($s1,&DWP(2048+256,$tbl)); | ||
| 2067 | &mov ($s3,$acc); | ||
| 823 | &and ($s0,0xfff); # s = %ebp&0xfff | 2068 | &and ($s0,0xfff); # s = %ebp&0xfff |
| 824 | &and ($s1,0xfff); # e = (%ebp+2048)&0xfff | 2069 | &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff |
| 825 | &and ($s3,0xfff); # p = %esp&0xfff | 2070 | &and ($s3,0xfff); # p = %esp&0xfff |
| 826 | 2071 | ||
| 827 | &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e); | 2072 | &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e); |
| 828 | &jb (&label("te_break_out")); | 2073 | &jb (&label("tbl_break_out")); |
| 829 | &sub ($s3,$s1); | 2074 | &sub ($s3,$s1); |
| 830 | &sub ($key,$s3); | 2075 | &sub ($acc,$s3); |
| 831 | &jmp (&label("te_ok")); | 2076 | &jmp (&label("tbl_ok")); |
| 832 | &set_label("te_break_out"); # else %esp -= (p-s)&0xfff + framesz; | 2077 | &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz; |
| 833 | &sub ($s3,$s0); | 2078 | &sub ($s3,$s0); |
| 834 | &and ($s3,0xfff); | 2079 | &and ($s3,0xfff); |
| 835 | &add ($s3,64+256); | 2080 | &add ($s3,384); |
| 836 | &sub ($key,$s3); | 2081 | &sub ($acc,$s3); |
| 837 | &align (4); | 2082 | &set_label("tbl_ok",4); |
| 838 | &set_label("te_ok"); | ||
| 839 | 2083 | ||
| 840 | &mov ($s0,&wparam(0)); # load inp | 2084 | &lea ($s3,&wparam(0)); # obtain pointer to parameter block |
| 841 | &mov ($s1,&wparam(1)); # load out | 2085 | &exch ("esp",$acc); # allocate stack frame |
| 842 | &mov ($s3,&wparam(3)); # load key | ||
| 843 | &mov ($acc,&wparam(4)); # load ivp | ||
| 844 | |||
| 845 | &exch ("esp",$key); | ||
| 846 | &add ("esp",4); # reserve for return address! | 2086 | &add ("esp",4); # reserve for return address! |
| 847 | &mov ($_esp,$key); # save %esp | 2087 | &mov ($_tbl,$tbl); # save %ebp |
| 2088 | &mov ($_esp,$acc); # save %esp | ||
| 2089 | |||
| 2090 | &mov ($s0,&DWP(0,$s3)); # load inp | ||
| 2091 | &mov ($s1,&DWP(4,$s3)); # load out | ||
| 2092 | #&mov ($s2,&DWP(8,$s3)); # load len | ||
| 2093 | &mov ($key,&DWP(12,$s3)); # load key | ||
| 2094 | &mov ($acc,&DWP(16,$s3)); # load ivp | ||
| 2095 | &mov ($s3,&DWP(20,$s3)); # load enc flag | ||
| 848 | 2096 | ||
| 849 | &mov ($_inp,$s0); # save copy of inp | 2097 | &mov ($_inp,$s0); # save copy of inp |
| 850 | &mov ($_out,$s1); # save copy of out | 2098 | &mov ($_out,$s1); # save copy of out |
| 851 | &mov ($_len,$s2); # save copy of len | 2099 | &mov ($_len,$s2); # save copy of len |
| 852 | &mov ($_key,$s3); # save copy of key | 2100 | &mov ($_key,$key); # save copy of key |
| 853 | &mov ($_ivp,$acc); # save copy of ivp | 2101 | &mov ($_ivp,$acc); # save copy of ivp |
| 854 | 2102 | ||
| 855 | &mov ($mark,0); # copy of aes_key->rounds = 0; | 2103 | &mov ($mark,0); # copy of aes_key->rounds = 0; |
| 856 | if ($compromise) { | ||
| 857 | &cmp ($s2,$compromise); | ||
| 858 | &jb (&label("skip_ecopy")); | ||
| 859 | } | ||
| 860 | # do we copy key schedule to stack? | 2104 | # do we copy key schedule to stack? |
| 861 | &mov ($s1 eq "ebx" ? $s1 : "",$s3); | 2105 | &mov ($s1 eq "ebx" ? $s1 : "",$key); |
| 862 | &mov ($s2 eq "ecx" ? $s2 : "",244/4); | 2106 | &mov ($s2 eq "ecx" ? $s2 : "",244/4); |
| 863 | &sub ($s1,"ebp"); | 2107 | &sub ($s1,$tbl); |
| 864 | &mov ("esi",$s3); | 2108 | &mov ("esi",$key); |
| 865 | &and ($s1,0xfff); | 2109 | &and ($s1,0xfff); |
| 866 | &lea ("edi",$aes_key); | 2110 | &lea ("edi",$aes_key); |
| 867 | &cmp ($s1,2048); | 2111 | &cmp ($s1,2048+256); |
| 868 | &jb (&label("do_ecopy")); | 2112 | &jb (&label("do_copy")); |
| 869 | &cmp ($s1,4096-244); | 2113 | &cmp ($s1,4096-244); |
| 870 | &jb (&label("skip_ecopy")); | 2114 | &jb (&label("skip_copy")); |
| 871 | &align (4); | 2115 | &set_label("do_copy",4); |
| 872 | &set_label("do_ecopy"); | ||
| 873 | &mov ($_key,"edi"); | 2116 | &mov ($_key,"edi"); |
| 874 | &data_word(0xA5F3F689); # rep movsd | 2117 | &data_word(0xA5F3F689); # rep movsd |
| 875 | &set_label("skip_ecopy"); | 2118 | &set_label("skip_copy"); |
| 876 | 2119 | ||
| 877 | &mov ($acc,$s0); | ||
| 878 | &mov ($key,16); | 2120 | &mov ($key,16); |
| 879 | &align (4); | 2121 | &set_label("prefetch_tbl",4); |
| 880 | &set_label("prefetch_te"); | 2122 | &mov ($s0,&DWP(0,$tbl)); |
| 881 | &mov ($s0,&DWP(0,"ebp")); | 2123 | &mov ($s1,&DWP(32,$tbl)); |
| 882 | &mov ($s1,&DWP(32,"ebp")); | 2124 | &mov ($s2,&DWP(64,$tbl)); |
| 883 | &mov ($s2,&DWP(64,"ebp")); | 2125 | &mov ($acc,&DWP(96,$tbl)); |
| 884 | &mov ($s3,&DWP(96,"ebp")); | 2126 | &lea ($tbl,&DWP(128,$tbl)); |
| 885 | &lea ("ebp",&DWP(128,"ebp")); | 2127 | &sub ($key,1); |
| 886 | &dec ($key); | 2128 | &jnz (&label("prefetch_tbl")); |
| 887 | &jnz (&label("prefetch_te")); | 2129 | &sub ($tbl,2048); |
| 888 | &sub ("ebp",2048); | 2130 | |
| 889 | 2131 | &mov ($acc,$_inp); | |
| 890 | &mov ($s2,$_len); | ||
| 891 | &mov ($key,$_ivp); | 2132 | &mov ($key,$_ivp); |
| 892 | &test ($s2,0xFFFFFFF0); | ||
| 893 | &jz (&label("enc_tail")); # short input... | ||
| 894 | 2133 | ||
| 2134 | &cmp ($s3,0); | ||
| 2135 | &je (&label("fast_decrypt")); | ||
| 2136 | |||
| 2137 | #----------------------------- ENCRYPT -----------------------------# | ||
| 895 | &mov ($s0,&DWP(0,$key)); # load iv | 2138 | &mov ($s0,&DWP(0,$key)); # load iv |
| 896 | &mov ($s1,&DWP(4,$key)); | 2139 | &mov ($s1,&DWP(4,$key)); |
| 897 | 2140 | ||
| 898 | &align (4); | 2141 | &set_label("fast_enc_loop",16); |
| 899 | &set_label("enc_loop"); | ||
| 900 | &mov ($s2,&DWP(8,$key)); | 2142 | &mov ($s2,&DWP(8,$key)); |
| 901 | &mov ($s3,&DWP(12,$key)); | 2143 | &mov ($s3,&DWP(12,$key)); |
| 902 | 2144 | ||
| @@ -916,22 +2158,16 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 916 | &mov (&DWP(8,$key),$s2); | 2158 | &mov (&DWP(8,$key),$s2); |
| 917 | &mov (&DWP(12,$key),$s3); | 2159 | &mov (&DWP(12,$key),$s3); |
| 918 | 2160 | ||
| 2161 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
| 919 | &mov ($s2,$_len); # load len | 2162 | &mov ($s2,$_len); # load len |
| 920 | |||
| 921 | &lea ($acc,&DWP(16,$acc)); | ||
| 922 | &mov ($_inp,$acc); # save inp | 2163 | &mov ($_inp,$acc); # save inp |
| 923 | 2164 | &lea ($s3,&DWP(16,$key)); # advance out | |
| 924 | &lea ($s3,&DWP(16,$key)); | ||
| 925 | &mov ($_out,$s3); # save out | 2165 | &mov ($_out,$s3); # save out |
| 926 | 2166 | &sub ($s2,16); # decrease len | |
| 927 | &sub ($s2,16); | ||
| 928 | &test ($s2,0xFFFFFFF0); | ||
| 929 | &mov ($_len,$s2); # save len | 2167 | &mov ($_len,$s2); # save len |
| 930 | &jnz (&label("enc_loop")); | 2168 | &jnz (&label("fast_enc_loop")); |
| 931 | &test ($s2,15); | ||
| 932 | &jnz (&label("enc_tail")); | ||
| 933 | &mov ($acc,$_ivp); # load ivp | 2169 | &mov ($acc,$_ivp); # load ivp |
| 934 | &mov ($s2,&DWP(8,$key)); # restore last dwords | 2170 | &mov ($s2,&DWP(8,$key)); # restore last 2 dwords |
| 935 | &mov ($s3,&DWP(12,$key)); | 2171 | &mov ($s3,&DWP(12,$key)); |
| 936 | &mov (&DWP(0,$acc),$s0); # save ivec | 2172 | &mov (&DWP(0,$acc),$s0); # save ivec |
| 937 | &mov (&DWP(4,$acc),$s1); | 2173 | &mov (&DWP(4,$acc),$s1); |
| @@ -949,125 +2185,20 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 949 | &set_label("skip_ezero") | 2185 | &set_label("skip_ezero") |
| 950 | &mov ("esp",$_esp); | 2186 | &mov ("esp",$_esp); |
| 951 | &popf (); | 2187 | &popf (); |
| 952 | &set_label("enc_out"); | 2188 | &set_label("drop_out"); |
| 953 | &function_end_A(); | 2189 | &function_end_A(); |
| 954 | &pushf (); # kludge, never executed | 2190 | &pushf (); # kludge, never executed |
| 955 | 2191 | ||
| 956 | &align (4); | ||
| 957 | &set_label("enc_tail"); | ||
| 958 | &mov ($s0,$key eq "edi" ? $key : ""); | ||
| 959 | &mov ($key,$_out); # load out | ||
| 960 | &push ($s0); # push ivp | ||
| 961 | &mov ($s1,16); | ||
| 962 | &sub ($s1,$s2); | ||
| 963 | &cmp ($key,$acc); # compare with inp | ||
| 964 | &je (&label("enc_in_place")); | ||
| 965 | &align (4); | ||
| 966 | &data_word(0xA4F3F689); # rep movsb # copy input | ||
| 967 | &jmp (&label("enc_skip_in_place")); | ||
| 968 | &set_label("enc_in_place"); | ||
| 969 | &lea ($key,&DWP(0,$key,$s2)); | ||
| 970 | &set_label("enc_skip_in_place"); | ||
| 971 | &mov ($s2,$s1); | ||
| 972 | &xor ($s0,$s0); | ||
| 973 | &align (4); | ||
| 974 | &data_word(0xAAF3F689); # rep stosb # zero tail | ||
| 975 | &pop ($key); # pop ivp | ||
| 976 | |||
| 977 | &mov ($acc,$_out); # output as input | ||
| 978 | &mov ($s0,&DWP(0,$key)); | ||
| 979 | &mov ($s1,&DWP(4,$key)); | ||
| 980 | &mov ($_len,16); # len=16 | ||
| 981 | &jmp (&label("enc_loop")); # one more spin... | ||
| 982 | |||
| 983 | #----------------------------- DECRYPT -----------------------------# | 2192 | #----------------------------- DECRYPT -----------------------------# |
| 984 | &align (4); | 2193 | &set_label("fast_decrypt",16); |
| 985 | &set_label("DECRYPT"); | ||
| 986 | &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); | ||
| 987 | |||
| 988 | # allocate aligned stack frame... | ||
| 989 | &lea ($key,&DWP(-64-244,"esp")); | ||
| 990 | &and ($key,-64); | ||
| 991 | |||
| 992 | # ... and make sure it doesn't alias with AES_Td modulo 4096 | ||
| 993 | &mov ($s0,"ebp"); | ||
| 994 | &lea ($s1,&DWP(2048+256,"ebp")); | ||
| 995 | &mov ($s3,$key); | ||
| 996 | &and ($s0,0xfff); # s = %ebp&0xfff | ||
| 997 | &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff | ||
| 998 | &and ($s3,0xfff); # p = %esp&0xfff | ||
| 999 | |||
| 1000 | &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e); | ||
| 1001 | &jb (&label("td_break_out")); | ||
| 1002 | &sub ($s3,$s1); | ||
| 1003 | &sub ($key,$s3); | ||
| 1004 | &jmp (&label("td_ok")); | ||
| 1005 | &set_label("td_break_out"); # else %esp -= (p-s)&0xfff + framesz; | ||
| 1006 | &sub ($s3,$s0); | ||
| 1007 | &and ($s3,0xfff); | ||
| 1008 | &add ($s3,64+256); | ||
| 1009 | &sub ($key,$s3); | ||
| 1010 | &align (4); | ||
| 1011 | &set_label("td_ok"); | ||
| 1012 | |||
| 1013 | &mov ($s0,&wparam(0)); # load inp | ||
| 1014 | &mov ($s1,&wparam(1)); # load out | ||
| 1015 | &mov ($s3,&wparam(3)); # load key | ||
| 1016 | &mov ($acc,&wparam(4)); # load ivp | ||
| 1017 | |||
| 1018 | &exch ("esp",$key); | ||
| 1019 | &add ("esp",4); # reserve for return address! | ||
| 1020 | &mov ($_esp,$key); # save %esp | ||
| 1021 | |||
| 1022 | &mov ($_inp,$s0); # save copy of inp | ||
| 1023 | &mov ($_out,$s1); # save copy of out | ||
| 1024 | &mov ($_len,$s2); # save copy of len | ||
| 1025 | &mov ($_key,$s3); # save copy of key | ||
| 1026 | &mov ($_ivp,$acc); # save copy of ivp | ||
| 1027 | |||
| 1028 | &mov ($mark,0); # copy of aes_key->rounds = 0; | ||
| 1029 | if ($compromise) { | ||
| 1030 | &cmp ($s2,$compromise); | ||
| 1031 | &jb (&label("skip_dcopy")); | ||
| 1032 | } | ||
| 1033 | # do we copy key schedule to stack? | ||
| 1034 | &mov ($s1 eq "ebx" ? $s1 : "",$s3); | ||
| 1035 | &mov ($s2 eq "ecx" ? $s2 : "",244/4); | ||
| 1036 | &sub ($s1,"ebp"); | ||
| 1037 | &mov ("esi",$s3); | ||
| 1038 | &and ($s1,0xfff); | ||
| 1039 | &lea ("edi",$aes_key); | ||
| 1040 | &cmp ($s1,2048+256); | ||
| 1041 | &jb (&label("do_dcopy")); | ||
| 1042 | &cmp ($s1,4096-244); | ||
| 1043 | &jb (&label("skip_dcopy")); | ||
| 1044 | &align (4); | ||
| 1045 | &set_label("do_dcopy"); | ||
| 1046 | &mov ($_key,"edi"); | ||
| 1047 | &data_word(0xA5F3F689); # rep movsd | ||
| 1048 | &set_label("skip_dcopy"); | ||
| 1049 | |||
| 1050 | &mov ($acc,$s0); | ||
| 1051 | &mov ($key,18); | ||
| 1052 | &align (4); | ||
| 1053 | &set_label("prefetch_td"); | ||
| 1054 | &mov ($s0,&DWP(0,"ebp")); | ||
| 1055 | &mov ($s1,&DWP(32,"ebp")); | ||
| 1056 | &mov ($s2,&DWP(64,"ebp")); | ||
| 1057 | &mov ($s3,&DWP(96,"ebp")); | ||
| 1058 | &lea ("ebp",&DWP(128,"ebp")); | ||
| 1059 | &dec ($key); | ||
| 1060 | &jnz (&label("prefetch_td")); | ||
| 1061 | &sub ("ebp",2048+256); | ||
| 1062 | 2194 | ||
| 1063 | &cmp ($acc,$_out); | 2195 | &cmp ($acc,$_out); |
| 1064 | &je (&label("dec_in_place")); # in-place processing... | 2196 | &je (&label("fast_dec_in_place")); # in-place processing... |
| 1065 | 2197 | ||
| 1066 | &mov ($key,$_ivp); # load ivp | ||
| 1067 | &mov ($_tmp,$key); | 2198 | &mov ($_tmp,$key); |
| 1068 | 2199 | ||
| 1069 | &align (4); | 2200 | &align (4); |
| 1070 | &set_label("dec_loop"); | 2201 | &set_label("fast_dec_loop",16); |
| 1071 | &mov ($s0,&DWP(0,$acc)); # read input | 2202 | &mov ($s0,&DWP(0,$acc)); # read input |
| 1072 | &mov ($s1,&DWP(4,$acc)); | 2203 | &mov ($s1,&DWP(4,$acc)); |
| 1073 | &mov ($s2,&DWP(8,$acc)); | 2204 | &mov ($s2,&DWP(8,$acc)); |
| @@ -1083,27 +2214,24 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 1083 | &xor ($s2,&DWP(8,$key)); | 2214 | &xor ($s2,&DWP(8,$key)); |
| 1084 | &xor ($s3,&DWP(12,$key)); | 2215 | &xor ($s3,&DWP(12,$key)); |
| 1085 | 2216 | ||
| 1086 | &sub ($acc,16); | ||
| 1087 | &jc (&label("dec_partial")); | ||
| 1088 | &mov ($_len,$acc); # save len | ||
| 1089 | &mov ($acc,$_inp); # load inp | ||
| 1090 | &mov ($key,$_out); # load out | 2217 | &mov ($key,$_out); # load out |
| 2218 | &mov ($acc,$_inp); # load inp | ||
| 1091 | 2219 | ||
| 1092 | &mov (&DWP(0,$key),$s0); # write output | 2220 | &mov (&DWP(0,$key),$s0); # write output |
| 1093 | &mov (&DWP(4,$key),$s1); | 2221 | &mov (&DWP(4,$key),$s1); |
| 1094 | &mov (&DWP(8,$key),$s2); | 2222 | &mov (&DWP(8,$key),$s2); |
| 1095 | &mov (&DWP(12,$key),$s3); | 2223 | &mov (&DWP(12,$key),$s3); |
| 1096 | 2224 | ||
| 2225 | &mov ($s2,$_len); # load len | ||
| 1097 | &mov ($_tmp,$acc); # save ivp | 2226 | &mov ($_tmp,$acc); # save ivp |
| 1098 | &lea ($acc,&DWP(16,$acc)); | 2227 | &lea ($acc,&DWP(16,$acc)); # advance inp |
| 1099 | &mov ($_inp,$acc); # save inp | 2228 | &mov ($_inp,$acc); # save inp |
| 1100 | 2229 | &lea ($key,&DWP(16,$key)); # advance out | |
| 1101 | &lea ($key,&DWP(16,$key)); | ||
| 1102 | &mov ($_out,$key); # save out | 2230 | &mov ($_out,$key); # save out |
| 1103 | 2231 | &sub ($s2,16); # decrease len | |
| 1104 | &jnz (&label("dec_loop")); | 2232 | &mov ($_len,$s2); # save len |
| 2233 | &jnz (&label("fast_dec_loop")); | ||
| 1105 | &mov ($key,$_tmp); # load temp ivp | 2234 | &mov ($key,$_tmp); # load temp ivp |
| 1106 | &set_label("dec_end"); | ||
| 1107 | &mov ($acc,$_ivp); # load user ivp | 2235 | &mov ($acc,$_ivp); # load user ivp |
| 1108 | &mov ($s0,&DWP(0,$key)); # load iv | 2236 | &mov ($s0,&DWP(0,$key)); # load iv |
| 1109 | &mov ($s1,&DWP(4,$key)); | 2237 | &mov ($s1,&DWP(4,$key)); |
| @@ -1113,31 +2241,16 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 1113 | &mov (&DWP(4,$acc),$s1); | 2241 | &mov (&DWP(4,$acc),$s1); |
| 1114 | &mov (&DWP(8,$acc),$s2); | 2242 | &mov (&DWP(8,$acc),$s2); |
| 1115 | &mov (&DWP(12,$acc),$s3); | 2243 | &mov (&DWP(12,$acc),$s3); |
| 1116 | &jmp (&label("dec_out")); | 2244 | &jmp (&label("fast_dec_out")); |
| 1117 | 2245 | ||
| 1118 | &align (4); | 2246 | &set_label("fast_dec_in_place",16); |
| 1119 | &set_label("dec_partial"); | 2247 | &set_label("fast_dec_in_place_loop"); |
| 1120 | &lea ($key,$ivec); | ||
| 1121 | &mov (&DWP(0,$key),$s0); # dump output to stack | ||
| 1122 | &mov (&DWP(4,$key),$s1); | ||
| 1123 | &mov (&DWP(8,$key),$s2); | ||
| 1124 | &mov (&DWP(12,$key),$s3); | ||
| 1125 | &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc)); | ||
| 1126 | &mov ($acc eq "esi" ? $acc : "",$key); | ||
| 1127 | &mov ($key eq "edi" ? $key : "",$_out); # load out | ||
| 1128 | &data_word(0xA4F3F689); # rep movsb # copy output | ||
| 1129 | &mov ($key,$_inp); # use inp as temp ivp | ||
| 1130 | &jmp (&label("dec_end")); | ||
| 1131 | |||
| 1132 | &align (4); | ||
| 1133 | &set_label("dec_in_place"); | ||
| 1134 | &set_label("dec_in_place_loop"); | ||
| 1135 | &lea ($key,$ivec); | ||
| 1136 | &mov ($s0,&DWP(0,$acc)); # read input | 2248 | &mov ($s0,&DWP(0,$acc)); # read input |
| 1137 | &mov ($s1,&DWP(4,$acc)); | 2249 | &mov ($s1,&DWP(4,$acc)); |
| 1138 | &mov ($s2,&DWP(8,$acc)); | 2250 | &mov ($s2,&DWP(8,$acc)); |
| 1139 | &mov ($s3,&DWP(12,$acc)); | 2251 | &mov ($s3,&DWP(12,$acc)); |
| 1140 | 2252 | ||
| 2253 | &lea ($key,$ivec); | ||
| 1141 | &mov (&DWP(0,$key),$s0); # copy to temp | 2254 | &mov (&DWP(0,$key),$s0); # copy to temp |
| 1142 | &mov (&DWP(4,$key),$s1); | 2255 | &mov (&DWP(4,$key),$s1); |
| 1143 | &mov (&DWP(8,$key),$s2); | 2256 | &mov (&DWP(8,$key),$s2); |
| @@ -1158,7 +2271,7 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 1158 | &mov (&DWP(8,$acc),$s2); | 2271 | &mov (&DWP(8,$acc),$s2); |
| 1159 | &mov (&DWP(12,$acc),$s3); | 2272 | &mov (&DWP(12,$acc),$s3); |
| 1160 | 2273 | ||
| 1161 | &lea ($acc,&DWP(16,$acc)); | 2274 | &lea ($acc,&DWP(16,$acc)); # advance out |
| 1162 | &mov ($_out,$acc); # save out | 2275 | &mov ($_out,$acc); # save out |
| 1163 | 2276 | ||
| 1164 | &lea ($acc,$ivec); | 2277 | &lea ($acc,$ivec); |
| @@ -1173,40 +2286,340 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 1173 | &mov (&DWP(12,$key),$s3); | 2286 | &mov (&DWP(12,$key),$s3); |
| 1174 | 2287 | ||
| 1175 | &mov ($acc,$_inp); # load inp | 2288 | &mov ($acc,$_inp); # load inp |
| 2289 | &mov ($s2,$_len); # load len | ||
| 2290 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
| 2291 | &mov ($_inp,$acc); # save inp | ||
| 2292 | &sub ($s2,16); # decrease len | ||
| 2293 | &mov ($_len,$s2); # save len | ||
| 2294 | &jnz (&label("fast_dec_in_place_loop")); | ||
| 2295 | |||
| 2296 | &set_label("fast_dec_out",4); | ||
| 2297 | &cmp ($mark,0); # was the key schedule copied? | ||
| 2298 | &mov ("edi",$_key); | ||
| 2299 | &je (&label("skip_dzero")); | ||
| 2300 | # zero copy of key schedule | ||
| 2301 | &mov ("ecx",240/4); | ||
| 2302 | &xor ("eax","eax"); | ||
| 2303 | &align (4); | ||
| 2304 | &data_word(0xABF3F689); # rep stosd | ||
| 2305 | &set_label("skip_dzero") | ||
| 2306 | &mov ("esp",$_esp); | ||
| 2307 | &popf (); | ||
| 2308 | &function_end_A(); | ||
| 2309 | &pushf (); # kludge, never executed | ||
| 2310 | |||
| 2311 | #--------------------------- SLOW ROUTINE ---------------------------# | ||
| 2312 | &set_label("slow_way",16); | ||
| 2313 | |||
| 2314 | &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap | ||
| 2315 | &mov ($key,&wparam(3)); # load key | ||
| 2316 | |||
| 2317 | # pre-allocate aligned stack frame... | ||
| 2318 | &lea ($acc,&DWP(-80,"esp")); | ||
| 2319 | &and ($acc,-64); | ||
| 2320 | |||
| 2321 | # ... and make sure it doesn't alias with $key modulo 1024 | ||
| 2322 | &lea ($s1,&DWP(-80-63,$key)); | ||
| 2323 | &sub ($s1,$acc); | ||
| 2324 | &neg ($s1); | ||
| 2325 | &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line | ||
| 2326 | &sub ($acc,$s1); | ||
| 2327 | |||
| 2328 | # pick S-box copy which can't overlap with stack frame or $key | ||
| 2329 | &lea ($s1,&DWP(768,$acc)); | ||
| 2330 | &sub ($s1,$tbl); | ||
| 2331 | &and ($s1,0x300); | ||
| 2332 | &lea ($tbl,&DWP(2048+128,$tbl,$s1)); | ||
| 2333 | |||
| 2334 | &lea ($s3,&wparam(0)); # pointer to parameter block | ||
| 2335 | |||
| 2336 | &exch ("esp",$acc); | ||
| 2337 | &add ("esp",4); # reserve for return address! | ||
| 2338 | &mov ($_tbl,$tbl); # save %ebp | ||
| 2339 | &mov ($_esp,$acc); # save %esp | ||
| 2340 | &mov ($_tmp,$s0); # save OPENSSL_ia32cap | ||
| 2341 | |||
| 2342 | &mov ($s0,&DWP(0,$s3)); # load inp | ||
| 2343 | &mov ($s1,&DWP(4,$s3)); # load out | ||
| 2344 | #&mov ($s2,&DWP(8,$s3)); # load len | ||
| 2345 | #&mov ($key,&DWP(12,$s3)); # load key | ||
| 2346 | &mov ($acc,&DWP(16,$s3)); # load ivp | ||
| 2347 | &mov ($s3,&DWP(20,$s3)); # load enc flag | ||
| 2348 | |||
| 2349 | &mov ($_inp,$s0); # save copy of inp | ||
| 2350 | &mov ($_out,$s1); # save copy of out | ||
| 2351 | &mov ($_len,$s2); # save copy of len | ||
| 2352 | &mov ($_key,$key); # save copy of key | ||
| 2353 | &mov ($_ivp,$acc); # save copy of ivp | ||
| 2354 | |||
| 2355 | &mov ($key,$acc); | ||
| 2356 | &mov ($acc,$s0); | ||
| 2357 | |||
| 2358 | &cmp ($s3,0); | ||
| 2359 | &je (&label("slow_decrypt")); | ||
| 2360 | |||
| 2361 | #--------------------------- SLOW ENCRYPT ---------------------------# | ||
| 2362 | &cmp ($s2,16); | ||
| 2363 | &mov ($s3,$s1); | ||
| 2364 | &jb (&label("slow_enc_tail")); | ||
| 2365 | |||
| 2366 | if (!$x86only) { | ||
| 2367 | &bt ($_tmp,25); # check for SSE bit | ||
| 2368 | &jnc (&label("slow_enc_x86")); | ||
| 2369 | |||
| 2370 | &movq ("mm0",&QWP(0,$key)); # load iv | ||
| 2371 | &movq ("mm4",&QWP(8,$key)); | ||
| 1176 | 2372 | ||
| 1177 | &lea ($acc,&DWP(16,$acc)); | 2373 | &set_label("slow_enc_loop_sse",16); |
| 2374 | &pxor ("mm0",&QWP(0,$acc)); # xor input data | ||
| 2375 | &pxor ("mm4",&QWP(8,$acc)); | ||
| 2376 | |||
| 2377 | &mov ($key,$_key); | ||
| 2378 | &call ("_sse_AES_encrypt_compact"); | ||
| 2379 | |||
| 2380 | &mov ($acc,$_inp); # load inp | ||
| 2381 | &mov ($key,$_out); # load out | ||
| 2382 | &mov ($s2,$_len); # load len | ||
| 2383 | |||
| 2384 | &movq (&QWP(0,$key),"mm0"); # save output data | ||
| 2385 | &movq (&QWP(8,$key),"mm4"); | ||
| 2386 | |||
| 2387 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
| 1178 | &mov ($_inp,$acc); # save inp | 2388 | &mov ($_inp,$acc); # save inp |
| 2389 | &lea ($s3,&DWP(16,$key)); # advance out | ||
| 2390 | &mov ($_out,$s3); # save out | ||
| 2391 | &sub ($s2,16); # decrease len | ||
| 2392 | &cmp ($s2,16); | ||
| 2393 | &mov ($_len,$s2); # save len | ||
| 2394 | &jae (&label("slow_enc_loop_sse")); | ||
| 2395 | &test ($s2,15); | ||
| 2396 | &jnz (&label("slow_enc_tail")); | ||
| 2397 | &mov ($acc,$_ivp); # load ivp | ||
| 2398 | &movq (&QWP(0,$acc),"mm0"); # save ivec | ||
| 2399 | &movq (&QWP(8,$acc),"mm4"); | ||
| 2400 | &emms (); | ||
| 2401 | &mov ("esp",$_esp); | ||
| 2402 | &popf (); | ||
| 2403 | &function_end_A(); | ||
| 2404 | &pushf (); # kludge, never executed | ||
| 2405 | } | ||
| 2406 | &set_label("slow_enc_x86",16); | ||
| 2407 | &mov ($s0,&DWP(0,$key)); # load iv | ||
| 2408 | &mov ($s1,&DWP(4,$key)); | ||
| 2409 | |||
| 2410 | &set_label("slow_enc_loop_x86",4); | ||
| 2411 | &mov ($s2,&DWP(8,$key)); | ||
| 2412 | &mov ($s3,&DWP(12,$key)); | ||
| 2413 | |||
| 2414 | &xor ($s0,&DWP(0,$acc)); # xor input data | ||
| 2415 | &xor ($s1,&DWP(4,$acc)); | ||
| 2416 | &xor ($s2,&DWP(8,$acc)); | ||
| 2417 | &xor ($s3,&DWP(12,$acc)); | ||
| 2418 | |||
| 2419 | &mov ($key,$_key); # load key | ||
| 2420 | &call ("_x86_AES_encrypt_compact"); | ||
| 2421 | |||
| 2422 | &mov ($acc,$_inp); # load inp | ||
| 2423 | &mov ($key,$_out); # load out | ||
| 2424 | |||
| 2425 | &mov (&DWP(0,$key),$s0); # save output data | ||
| 2426 | &mov (&DWP(4,$key),$s1); | ||
| 2427 | &mov (&DWP(8,$key),$s2); | ||
| 2428 | &mov (&DWP(12,$key),$s3); | ||
| 1179 | 2429 | ||
| 1180 | &mov ($s2,$_len); # load len | 2430 | &mov ($s2,$_len); # load len |
| 1181 | &sub ($s2,16); | 2431 | &lea ($acc,&DWP(16,$acc)); # advance inp |
| 1182 | &jc (&label("dec_in_place_partial")); | 2432 | &mov ($_inp,$acc); # save inp |
| 2433 | &lea ($s3,&DWP(16,$key)); # advance out | ||
| 2434 | &mov ($_out,$s3); # save out | ||
| 2435 | &sub ($s2,16); # decrease len | ||
| 2436 | &cmp ($s2,16); | ||
| 1183 | &mov ($_len,$s2); # save len | 2437 | &mov ($_len,$s2); # save len |
| 1184 | &jnz (&label("dec_in_place_loop")); | 2438 | &jae (&label("slow_enc_loop_x86")); |
| 1185 | &jmp (&label("dec_out")); | 2439 | &test ($s2,15); |
| 1186 | 2440 | &jnz (&label("slow_enc_tail")); | |
| 1187 | &align (4); | 2441 | &mov ($acc,$_ivp); # load ivp |
| 1188 | &set_label("dec_in_place_partial"); | 2442 | &mov ($s2,&DWP(8,$key)); # restore last dwords |
| 1189 | # one can argue if this is actually required... | 2443 | &mov ($s3,&DWP(12,$key)); |
| 1190 | &mov ($key eq "edi" ? $key : "",$_out); | 2444 | &mov (&DWP(0,$acc),$s0); # save ivec |
| 1191 | &lea ($acc eq "esi" ? $acc : "",$ivec); | 2445 | &mov (&DWP(4,$acc),$s1); |
| 2446 | &mov (&DWP(8,$acc),$s2); | ||
| 2447 | &mov (&DWP(12,$acc),$s3); | ||
| 2448 | |||
| 2449 | &mov ("esp",$_esp); | ||
| 2450 | &popf (); | ||
| 2451 | &function_end_A(); | ||
| 2452 | &pushf (); # kludge, never executed | ||
| 2453 | |||
| 2454 | &set_label("slow_enc_tail",16); | ||
| 2455 | &emms () if (!$x86only); | ||
| 2456 | &mov ($key eq "edi"? $key:"",$s3); # load out to edi | ||
| 2457 | &mov ($s1,16); | ||
| 2458 | &sub ($s1,$s2); | ||
| 2459 | &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp | ||
| 2460 | &je (&label("enc_in_place")); | ||
| 2461 | &align (4); | ||
| 2462 | &data_word(0xA4F3F689); # rep movsb # copy input | ||
| 2463 | &jmp (&label("enc_skip_in_place")); | ||
| 2464 | &set_label("enc_in_place"); | ||
| 1192 | &lea ($key,&DWP(0,$key,$s2)); | 2465 | &lea ($key,&DWP(0,$key,$s2)); |
| 1193 | &lea ($acc,&DWP(16,$acc,$s2)); | 2466 | &set_label("enc_skip_in_place"); |
| 1194 | &neg ($s2 eq "ecx" ? $s2 : ""); | 2467 | &mov ($s2,$s1); |
| 1195 | &data_word(0xA4F3F689); # rep movsb # restore tail | 2468 | &xor ($s0,$s0); |
| 1196 | 2469 | &align (4); | |
| 1197 | &align (4); | 2470 | &data_word(0xAAF3F689); # rep stosb # zero tail |
| 1198 | &set_label("dec_out"); | 2471 | |
| 1199 | &cmp ($mark,0); # was the key schedule copied? | 2472 | &mov ($key,$_ivp); # restore ivp |
| 1200 | &mov ("edi",$_key); | 2473 | &mov ($acc,$s3); # output as input |
| 1201 | &je (&label("skip_dzero")); | 2474 | &mov ($s0,&DWP(0,$key)); |
| 1202 | # zero copy of key schedule | 2475 | &mov ($s1,&DWP(4,$key)); |
| 1203 | &mov ("ecx",240/4); | 2476 | &mov ($_len,16); # len=16 |
| 1204 | &xor ("eax","eax"); | 2477 | &jmp (&label("slow_enc_loop_x86")); # one more spin... |
| 1205 | &align (4); | 2478 | |
| 1206 | &data_word(0xABF3F689); # rep stosd | 2479 | #--------------------------- SLOW DECRYPT ---------------------------# |
| 1207 | &set_label("skip_dzero") | 2480 | &set_label("slow_decrypt",16); |
| 1208 | &mov ("esp",$_esp); | 2481 | if (!$x86only) { |
| 1209 | &popf (); | 2482 | &bt ($_tmp,25); # check for SSE bit |
| 2483 | &jnc (&label("slow_dec_loop_x86")); | ||
| 2484 | |||
| 2485 | &set_label("slow_dec_loop_sse",4); | ||
| 2486 | &movq ("mm0",&QWP(0,$acc)); # read input | ||
| 2487 | &movq ("mm4",&QWP(8,$acc)); | ||
| 2488 | |||
| 2489 | &mov ($key,$_key); | ||
| 2490 | &call ("_sse_AES_decrypt_compact"); | ||
| 2491 | |||
| 2492 | &mov ($acc,$_inp); # load inp | ||
| 2493 | &lea ($s0,$ivec); | ||
| 2494 | &mov ($s1,$_out); # load out | ||
| 2495 | &mov ($s2,$_len); # load len | ||
| 2496 | &mov ($key,$_ivp); # load ivp | ||
| 2497 | |||
| 2498 | &movq ("mm1",&QWP(0,$acc)); # re-read input | ||
| 2499 | &movq ("mm5",&QWP(8,$acc)); | ||
| 2500 | |||
| 2501 | &pxor ("mm0",&QWP(0,$key)); # xor iv | ||
| 2502 | &pxor ("mm4",&QWP(8,$key)); | ||
| 2503 | |||
| 2504 | &movq (&QWP(0,$key),"mm1"); # copy input to iv | ||
| 2505 | &movq (&QWP(8,$key),"mm5"); | ||
| 2506 | |||
| 2507 | &sub ($s2,16); # decrease len | ||
| 2508 | &jc (&label("slow_dec_partial_sse")); | ||
| 2509 | |||
| 2510 | &movq (&QWP(0,$s1),"mm0"); # write output | ||
| 2511 | &movq (&QWP(8,$s1),"mm4"); | ||
| 2512 | |||
| 2513 | &lea ($s1,&DWP(16,$s1)); # advance out | ||
| 2514 | &mov ($_out,$s1); # save out | ||
| 2515 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
| 2516 | &mov ($_inp,$acc); # save inp | ||
| 2517 | &mov ($_len,$s2); # save len | ||
| 2518 | &jnz (&label("slow_dec_loop_sse")); | ||
| 2519 | &emms (); | ||
| 2520 | &mov ("esp",$_esp); | ||
| 2521 | &popf (); | ||
| 2522 | &function_end_A(); | ||
| 2523 | &pushf (); # kludge, never executed | ||
| 2524 | |||
| 2525 | &set_label("slow_dec_partial_sse",16); | ||
| 2526 | &movq (&QWP(0,$s0),"mm0"); # save output to temp | ||
| 2527 | &movq (&QWP(8,$s0),"mm4"); | ||
| 2528 | &emms (); | ||
| 2529 | |||
| 2530 | &add ($s2 eq "ecx" ? "ecx":"",16); | ||
| 2531 | &mov ("edi",$s1); # out | ||
| 2532 | &mov ("esi",$s0); # temp | ||
| 2533 | &align (4); | ||
| 2534 | &data_word(0xA4F3F689); # rep movsb # copy partial output | ||
| 2535 | |||
| 2536 | &mov ("esp",$_esp); | ||
| 2537 | &popf (); | ||
| 2538 | &function_end_A(); | ||
| 2539 | &pushf (); # kludge, never executed | ||
| 2540 | } | ||
| 2541 | &set_label("slow_dec_loop_x86",16); | ||
| 2542 | &mov ($s0,&DWP(0,$acc)); # read input | ||
| 2543 | &mov ($s1,&DWP(4,$acc)); | ||
| 2544 | &mov ($s2,&DWP(8,$acc)); | ||
| 2545 | &mov ($s3,&DWP(12,$acc)); | ||
| 2546 | |||
| 2547 | &lea ($key,$ivec); | ||
| 2548 | &mov (&DWP(0,$key),$s0); # copy to temp | ||
| 2549 | &mov (&DWP(4,$key),$s1); | ||
| 2550 | &mov (&DWP(8,$key),$s2); | ||
| 2551 | &mov (&DWP(12,$key),$s3); | ||
| 2552 | |||
| 2553 | &mov ($key,$_key); # load key | ||
| 2554 | &call ("_x86_AES_decrypt_compact"); | ||
| 2555 | |||
| 2556 | &mov ($key,$_ivp); # load ivp | ||
| 2557 | &mov ($acc,$_len); # load len | ||
| 2558 | &xor ($s0,&DWP(0,$key)); # xor iv | ||
| 2559 | &xor ($s1,&DWP(4,$key)); | ||
| 2560 | &xor ($s2,&DWP(8,$key)); | ||
| 2561 | &xor ($s3,&DWP(12,$key)); | ||
| 2562 | |||
| 2563 | &sub ($acc,16); | ||
| 2564 | &jc (&label("slow_dec_partial_x86")); | ||
| 2565 | |||
| 2566 | &mov ($_len,$acc); # save len | ||
| 2567 | &mov ($acc,$_out); # load out | ||
| 2568 | |||
| 2569 | &mov (&DWP(0,$acc),$s0); # write output | ||
| 2570 | &mov (&DWP(4,$acc),$s1); | ||
| 2571 | &mov (&DWP(8,$acc),$s2); | ||
| 2572 | &mov (&DWP(12,$acc),$s3); | ||
| 2573 | |||
| 2574 | &lea ($acc,&DWP(16,$acc)); # advance out | ||
| 2575 | &mov ($_out,$acc); # save out | ||
| 2576 | |||
| 2577 | &lea ($acc,$ivec); | ||
| 2578 | &mov ($s0,&DWP(0,$acc)); # read temp | ||
| 2579 | &mov ($s1,&DWP(4,$acc)); | ||
| 2580 | &mov ($s2,&DWP(8,$acc)); | ||
| 2581 | &mov ($s3,&DWP(12,$acc)); | ||
| 2582 | |||
| 2583 | &mov (&DWP(0,$key),$s0); # copy it to iv | ||
| 2584 | &mov (&DWP(4,$key),$s1); | ||
| 2585 | &mov (&DWP(8,$key),$s2); | ||
| 2586 | &mov (&DWP(12,$key),$s3); | ||
| 2587 | |||
| 2588 | &mov ($acc,$_inp); # load inp | ||
| 2589 | &lea ($acc,&DWP(16,$acc)); # advance inp | ||
| 2590 | &mov ($_inp,$acc); # save inp | ||
| 2591 | &jnz (&label("slow_dec_loop_x86")); | ||
| 2592 | &mov ("esp",$_esp); | ||
| 2593 | &popf (); | ||
| 2594 | &function_end_A(); | ||
| 2595 | &pushf (); # kludge, never executed | ||
| 2596 | |||
| 2597 | &set_label("slow_dec_partial_x86",16); | ||
| 2598 | &lea ($acc,$ivec); | ||
| 2599 | &mov (&DWP(0,$acc),$s0); # save output to temp | ||
| 2600 | &mov (&DWP(4,$acc),$s1); | ||
| 2601 | &mov (&DWP(8,$acc),$s2); | ||
| 2602 | &mov (&DWP(12,$acc),$s3); | ||
| 2603 | |||
| 2604 | &mov ($acc,$_inp); | ||
| 2605 | &mov ($s0,&DWP(0,$acc)); # re-read input | ||
| 2606 | &mov ($s1,&DWP(4,$acc)); | ||
| 2607 | &mov ($s2,&DWP(8,$acc)); | ||
| 2608 | &mov ($s3,&DWP(12,$acc)); | ||
| 2609 | |||
| 2610 | &mov (&DWP(0,$key),$s0); # copy it to iv | ||
| 2611 | &mov (&DWP(4,$key),$s1); | ||
| 2612 | &mov (&DWP(8,$key),$s2); | ||
| 2613 | &mov (&DWP(12,$key),$s3); | ||
| 2614 | |||
| 2615 | &mov ("ecx",$_len); | ||
| 2616 | &mov ("edi",$_out); | ||
| 2617 | &lea ("esi",$ivec); | ||
| 2618 | &align (4); | ||
| 2619 | &data_word(0xA4F3F689); # rep movsb # copy partial output | ||
| 2620 | |||
| 2621 | &mov ("esp",$_esp); | ||
| 2622 | &popf (); | ||
| 1210 | &function_end("AES_cbc_encrypt"); | 2623 | &function_end("AES_cbc_encrypt"); |
| 1211 | } | 2624 | } |
| 1212 | 2625 | ||
| @@ -1215,35 +2628,31 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds | |||
| 1215 | sub enckey() | 2628 | sub enckey() |
| 1216 | { | 2629 | { |
| 1217 | &movz ("esi",&LB("edx")); # rk[i]>>0 | 2630 | &movz ("esi",&LB("edx")); # rk[i]>>0 |
| 1218 | &mov ("ebx",&DWP(2,"ebp","esi",8)); | 2631 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1219 | &movz ("esi",&HB("edx")); # rk[i]>>8 | 2632 | &movz ("esi",&HB("edx")); # rk[i]>>8 |
| 1220 | &and ("ebx",0xFF000000); | 2633 | &shl ("ebx",24); |
| 1221 | &xor ("eax","ebx"); | 2634 | &xor ("eax","ebx"); |
| 1222 | 2635 | ||
| 1223 | &mov ("ebx",&DWP(2,"ebp","esi",8)); | 2636 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1224 | &shr ("edx",16); | 2637 | &shr ("edx",16); |
| 1225 | &and ("ebx",0x000000FF); | ||
| 1226 | &movz ("esi",&LB("edx")); # rk[i]>>16 | 2638 | &movz ("esi",&LB("edx")); # rk[i]>>16 |
| 1227 | &xor ("eax","ebx"); | 2639 | &xor ("eax","ebx"); |
| 1228 | 2640 | ||
| 1229 | &mov ("ebx",&DWP(0,"ebp","esi",8)); | 2641 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1230 | &movz ("esi",&HB("edx")); # rk[i]>>24 | 2642 | &movz ("esi",&HB("edx")); # rk[i]>>24 |
| 1231 | &and ("ebx",0x0000FF00); | 2643 | &shl ("ebx",8); |
| 1232 | &xor ("eax","ebx"); | 2644 | &xor ("eax","ebx"); |
| 1233 | 2645 | ||
| 1234 | &mov ("ebx",&DWP(0,"ebp","esi",8)); | 2646 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1235 | &and ("ebx",0x00FF0000); | 2647 | &shl ("ebx",16); |
| 1236 | &xor ("eax","ebx"); | 2648 | &xor ("eax","ebx"); |
| 1237 | 2649 | ||
| 1238 | &xor ("eax",&DWP(2048,"ebp","ecx",4)); # rcon | 2650 | &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon |
| 1239 | } | 2651 | } |
| 1240 | 2652 | ||
| 1241 | # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | 2653 | &function_begin("_x86_AES_set_encrypt_key"); |
| 1242 | # AES_KEY *key) | 2654 | &mov ("esi",&wparam(1)); # user supplied key |
| 1243 | &public_label("AES_Te"); | 2655 | &mov ("edi",&wparam(3)); # private key schedule |
| 1244 | &function_begin("AES_set_encrypt_key", "", "_x86_AES_set_encrypt_key"); | ||
| 1245 | &mov ("esi",&wparam(0)); # user supplied key | ||
| 1246 | &mov ("edi",&wparam(2)); # private key schedule | ||
| 1247 | 2656 | ||
| 1248 | &test ("esi",-1); | 2657 | &test ("esi",-1); |
| 1249 | &jz (&label("badpointer")); | 2658 | &jz (&label("badpointer")); |
| @@ -1252,10 +2661,21 @@ sub enckey() | |||
| 1252 | 2661 | ||
| 1253 | &call (&label("pic_point")); | 2662 | &call (&label("pic_point")); |
| 1254 | &set_label("pic_point"); | 2663 | &set_label("pic_point"); |
| 1255 | &blindpop("ebp"); | 2664 | &blindpop($tbl); |
| 1256 | &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 2665 | &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); |
| 1257 | 2666 | &lea ($tbl,&DWP(2048+128,$tbl)); | |
| 1258 | &mov ("ecx",&wparam(1)); # number of bits in key | 2667 | |
| 2668 | # prefetch Te4 | ||
| 2669 | &mov ("eax",&DWP(0-128,$tbl)); | ||
| 2670 | &mov ("ebx",&DWP(32-128,$tbl)); | ||
| 2671 | &mov ("ecx",&DWP(64-128,$tbl)); | ||
| 2672 | &mov ("edx",&DWP(96-128,$tbl)); | ||
| 2673 | &mov ("eax",&DWP(128-128,$tbl)); | ||
| 2674 | &mov ("ebx",&DWP(160-128,$tbl)); | ||
| 2675 | &mov ("ecx",&DWP(192-128,$tbl)); | ||
| 2676 | &mov ("edx",&DWP(224-128,$tbl)); | ||
| 2677 | |||
| 2678 | &mov ("ecx",&wparam(2)); # number of bits in key | ||
| 1259 | &cmp ("ecx",128); | 2679 | &cmp ("ecx",128); |
| 1260 | &je (&label("10rounds")); | 2680 | &je (&label("10rounds")); |
| 1261 | &cmp ("ecx",192); | 2681 | &cmp ("ecx",192); |
| @@ -1394,24 +2814,23 @@ sub enckey() | |||
| 1394 | &mov ("edx","eax"); | 2814 | &mov ("edx","eax"); |
| 1395 | &mov ("eax",&DWP(16,"edi")); # rk[4] | 2815 | &mov ("eax",&DWP(16,"edi")); # rk[4] |
| 1396 | &movz ("esi",&LB("edx")); # rk[11]>>0 | 2816 | &movz ("esi",&LB("edx")); # rk[11]>>0 |
| 1397 | &mov ("ebx",&DWP(2,"ebp","esi",8)); | 2817 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1398 | &movz ("esi",&HB("edx")); # rk[11]>>8 | 2818 | &movz ("esi",&HB("edx")); # rk[11]>>8 |
| 1399 | &and ("ebx",0x000000FF); | ||
| 1400 | &xor ("eax","ebx"); | 2819 | &xor ("eax","ebx"); |
| 1401 | 2820 | ||
| 1402 | &mov ("ebx",&DWP(0,"ebp","esi",8)); | 2821 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1403 | &shr ("edx",16); | 2822 | &shr ("edx",16); |
| 1404 | &and ("ebx",0x0000FF00); | 2823 | &shl ("ebx",8); |
| 1405 | &movz ("esi",&LB("edx")); # rk[11]>>16 | 2824 | &movz ("esi",&LB("edx")); # rk[11]>>16 |
| 1406 | &xor ("eax","ebx"); | 2825 | &xor ("eax","ebx"); |
| 1407 | 2826 | ||
| 1408 | &mov ("ebx",&DWP(0,"ebp","esi",8)); | 2827 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1409 | &movz ("esi",&HB("edx")); # rk[11]>>24 | 2828 | &movz ("esi",&HB("edx")); # rk[11]>>24 |
| 1410 | &and ("ebx",0x00FF0000); | 2829 | &shl ("ebx",16); |
| 1411 | &xor ("eax","ebx"); | 2830 | &xor ("eax","ebx"); |
| 1412 | 2831 | ||
| 1413 | &mov ("ebx",&DWP(2,"ebp","esi",8)); | 2832 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
| 1414 | &and ("ebx",0xFF000000); | 2833 | &shl ("ebx",24); |
| 1415 | &xor ("eax","ebx"); | 2834 | &xor ("eax","ebx"); |
| 1416 | 2835 | ||
| 1417 | &mov (&DWP(48,"edi"),"eax"); # rk[12] | 2836 | &mov (&DWP(48,"edi"),"eax"); # rk[12] |
| @@ -1433,43 +2852,74 @@ sub enckey() | |||
| 1433 | &set_label("badpointer"); | 2852 | &set_label("badpointer"); |
| 1434 | &mov ("eax",-1); | 2853 | &mov ("eax",-1); |
| 1435 | &set_label("exit"); | 2854 | &set_label("exit"); |
| 1436 | &function_end("AES_set_encrypt_key"); | 2855 | &function_end("_x86_AES_set_encrypt_key"); |
| 1437 | 2856 | ||
| 1438 | sub deckey() | 2857 | # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, |
| 1439 | { my ($i,$ptr,$te,$td) = @_; | 2858 | # AES_KEY *key) |
| 2859 | &function_begin_B("AES_set_encrypt_key"); | ||
| 2860 | &call ("_x86_AES_set_encrypt_key"); | ||
| 2861 | &ret (); | ||
| 2862 | &function_end_B("AES_set_encrypt_key"); | ||
| 1440 | 2863 | ||
| 1441 | &mov ("eax",&DWP($i,$ptr)); | 2864 | sub deckey() |
| 1442 | &mov ("edx","eax"); | 2865 | { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; |
| 1443 | &movz ("ebx",&HB("eax")); | 2866 | my $tmp = $tbl; |
| 1444 | &shr ("edx",16); | 2867 | |
| 1445 | &and ("eax",0xFF); | 2868 | &mov ($acc,$tp1); |
| 1446 | &movz ("eax",&BP(2,$te,"eax",8)); | 2869 | &and ($acc,0x80808080); |
| 1447 | &movz ("ebx",&BP(2,$te,"ebx",8)); | 2870 | &mov ($tmp,$acc); |
| 1448 | &mov ("eax",&DWP(0,$td,"eax",8)); | 2871 | &shr ($tmp,7); |
| 1449 | &xor ("eax",&DWP(3,$td,"ebx",8)); | 2872 | &lea ($tp2,&DWP(0,$tp1,$tp1)); |
| 1450 | &movz ("ebx",&HB("edx")); | 2873 | &sub ($acc,$tmp); |
| 1451 | &and ("edx",0xFF); | 2874 | &and ($tp2,0xfefefefe); |
| 1452 | &movz ("edx",&BP(2,$te,"edx",8)); | 2875 | &and ($acc,0x1b1b1b1b); |
| 1453 | &movz ("ebx",&BP(2,$te,"ebx",8)); | 2876 | &xor ($acc,$tp2); |
| 1454 | &xor ("eax",&DWP(2,$td,"edx",8)); | 2877 | &mov ($tp2,$acc); |
| 1455 | &xor ("eax",&DWP(1,$td,"ebx",8)); | 2878 | |
| 1456 | &mov (&DWP($i,$ptr),"eax"); | 2879 | &and ($acc,0x80808080); |
| 2880 | &mov ($tmp,$acc); | ||
| 2881 | &shr ($tmp,7); | ||
| 2882 | &lea ($tp4,&DWP(0,$tp2,$tp2)); | ||
| 2883 | &sub ($acc,$tmp); | ||
| 2884 | &and ($tp4,0xfefefefe); | ||
| 2885 | &and ($acc,0x1b1b1b1b); | ||
| 2886 | &xor ($tp2,$tp1); # tp2^tp1 | ||
| 2887 | &xor ($acc,$tp4); | ||
| 2888 | &mov ($tp4,$acc); | ||
| 2889 | |||
| 2890 | &and ($acc,0x80808080); | ||
| 2891 | &mov ($tmp,$acc); | ||
| 2892 | &shr ($tmp,7); | ||
| 2893 | &lea ($tp8,&DWP(0,$tp4,$tp4)); | ||
| 2894 | &xor ($tp4,$tp1); # tp4^tp1 | ||
| 2895 | &sub ($acc,$tmp); | ||
| 2896 | &and ($tp8,0xfefefefe); | ||
| 2897 | &and ($acc,0x1b1b1b1b); | ||
| 2898 | &rotl ($tp1,8); # = ROTATE(tp1,8) | ||
| 2899 | &xor ($tp8,$acc); | ||
| 2900 | |||
| 2901 | &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load | ||
| 2902 | |||
| 2903 | &xor ($tp1,$tp2); | ||
| 2904 | &xor ($tp2,$tp8); | ||
| 2905 | &xor ($tp1,$tp4); | ||
| 2906 | &rotl ($tp2,24); | ||
| 2907 | &xor ($tp4,$tp8); | ||
| 2908 | &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) | ||
| 2909 | &rotl ($tp4,16); | ||
| 2910 | &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24) | ||
| 2911 | &rotl ($tp8,8); | ||
| 2912 | &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16) | ||
| 2913 | &mov ($tp2,$tmp); | ||
| 2914 | &xor ($tp1,$tp8); # ^= ROTATE(tp8,8) | ||
| 2915 | |||
| 2916 | &mov (&DWP(4*$i,$key),$tp1); | ||
| 1457 | } | 2917 | } |
| 1458 | 2918 | ||
| 1459 | # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | 2919 | # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, |
| 1460 | # AES_KEY *key) | 2920 | # AES_KEY *key) |
| 1461 | &public_label("AES_Td"); | ||
| 1462 | &public_label("AES_Te"); | ||
| 1463 | &function_begin_B("AES_set_decrypt_key"); | 2921 | &function_begin_B("AES_set_decrypt_key"); |
| 1464 | &mov ("eax",&wparam(0)); | ||
| 1465 | &mov ("ecx",&wparam(1)); | ||
| 1466 | &mov ("edx",&wparam(2)); | ||
| 1467 | &sub ("esp",12); | ||
| 1468 | &mov (&DWP(0,"esp"),"eax"); | ||
| 1469 | &mov (&DWP(4,"esp"),"ecx"); | ||
| 1470 | &mov (&DWP(8,"esp"),"edx"); | ||
| 1471 | &call ("_x86_AES_set_encrypt_key"); | 2922 | &call ("_x86_AES_set_encrypt_key"); |
| 1472 | &add ("esp",12); | ||
| 1473 | &cmp ("eax",0); | 2923 | &cmp ("eax",0); |
| 1474 | &je (&label("proceed")); | 2924 | &je (&label("proceed")); |
| 1475 | &ret (); | 2925 | &ret (); |
| @@ -1485,8 +2935,7 @@ sub deckey() | |||
| 1485 | &lea ("ecx",&DWP(0,"","ecx",4)); | 2935 | &lea ("ecx",&DWP(0,"","ecx",4)); |
| 1486 | &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk | 2936 | &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk |
| 1487 | 2937 | ||
| 1488 | &align (4); | 2938 | &set_label("invert",4); # invert order of chunks |
| 1489 | &set_label("invert"); # invert order of chunks | ||
| 1490 | &mov ("eax",&DWP(0,"esi")); | 2939 | &mov ("eax",&DWP(0,"esi")); |
| 1491 | &mov ("ebx",&DWP(4,"esi")); | 2940 | &mov ("ebx",&DWP(4,"esi")); |
| 1492 | &mov ("ecx",&DWP(0,"edi")); | 2941 | &mov ("ecx",&DWP(0,"edi")); |
| @@ -1508,26 +2957,24 @@ sub deckey() | |||
| 1508 | &cmp ("esi","edi"); | 2957 | &cmp ("esi","edi"); |
| 1509 | &jne (&label("invert")); | 2958 | &jne (&label("invert")); |
| 1510 | 2959 | ||
| 1511 | &call (&label("pic_point")); | 2960 | &mov ($key,&wparam(2)); |
| 1512 | &set_label("pic_point"); | 2961 | &mov ($acc,&DWP(240,$key)); # pull number of rounds |
| 1513 | blindpop("ebp"); | 2962 | &lea ($acc,&DWP(-2,$acc,$acc)); |
| 1514 | &lea ("edi",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); | 2963 | &lea ($acc,&DWP(0,$key,$acc,8)); |
| 1515 | &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); | 2964 | &mov (&wparam(2),$acc); |
| 1516 | 2965 | ||
| 1517 | &mov ("esi",&wparam(2)); | 2966 | &mov ($s0,&DWP(16,$key)); # modulo-scheduled load |
| 1518 | &mov ("ecx",&DWP(240,"esi")); # pull number of rounds | 2967 | &set_label("permute",4); # permute the key schedule |
| 1519 | &dec ("ecx"); | 2968 | &add ($key,16); |
| 1520 | &align (4); | 2969 | &deckey (0,$key,$s0,$s1,$s2,$s3); |
| 1521 | &set_label("permute"); # permute the key schedule | 2970 | &deckey (1,$key,$s1,$s2,$s3,$s0); |
| 1522 | &add ("esi",16); | 2971 | &deckey (2,$key,$s2,$s3,$s0,$s1); |
| 1523 | &deckey (0,"esi","ebp","edi"); | 2972 | &deckey (3,$key,$s3,$s0,$s1,$s2); |
| 1524 | &deckey (4,"esi","ebp","edi"); | 2973 | &cmp ($key,&wparam(2)); |
| 1525 | &deckey (8,"esi","ebp","edi"); | 2974 | &jb (&label("permute")); |
| 1526 | &deckey (12,"esi","ebp","edi"); | ||
| 1527 | &dec ("ecx"); | ||
| 1528 | &jnz (&label("permute")); | ||
| 1529 | 2975 | ||
| 1530 | &xor ("eax","eax"); # return success | 2976 | &xor ("eax","eax"); # return success |
| 1531 | &function_end("AES_set_decrypt_key"); | 2977 | &function_end("AES_set_decrypt_key"); |
| 2978 | &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 1532 | 2979 | ||
| 1533 | &asm_finish(); | 2980 | &asm_finish(); |
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl index a545e892ae..53e4ef85fd 100755 --- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl +++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl | |||
| @@ -1669,7 +1669,7 @@ AES_cbc_encrypt: | |||
| 1669 | lea .LAES_Td(%rip),$sbox | 1669 | lea .LAES_Td(%rip),$sbox |
| 1670 | .Lcbc_picked_te: | 1670 | .Lcbc_picked_te: |
| 1671 | 1671 | ||
| 1672 | mov OPENSSL_ia32cap_P(%rip),%r10d | 1672 | mov PIC_GOT(OPENSSL_ia32cap_P),%r10d |
| 1673 | cmp \$$speed_limit,%rdx | 1673 | cmp \$$speed_limit,%rdx |
| 1674 | jb .Lcbc_slow_prologue | 1674 | jb .Lcbc_slow_prologue |
| 1675 | test \$15,%rdx | 1675 | test \$15,%rdx |
