summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/aes
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/aes')
-rw-r--r--src/lib/libcrypto/aes/README3
-rw-r--r--src/lib/libcrypto/aes/aes.h142
-rw-r--r--src/lib/libcrypto/aes/aes_cbc.c63
-rw-r--r--src/lib/libcrypto/aes/aes_cfb.c81
-rw-r--r--src/lib/libcrypto/aes/aes_core.c1358
-rw-r--r--src/lib/libcrypto/aes/aes_ctr.c61
-rw-r--r--src/lib/libcrypto/aes/aes_ecb.c73
-rw-r--r--src/lib/libcrypto/aes/aes_ige.c323
-rw-r--r--src/lib/libcrypto/aes/aes_locl.h89
-rw-r--r--src/lib/libcrypto/aes/aes_misc.c64
-rw-r--r--src/lib/libcrypto/aes/aes_ofb.c60
-rw-r--r--src/lib/libcrypto/aes/aes_wrap.c259
-rw-r--r--src/lib/libcrypto/aes/aes_x86core.c1063
-rw-r--r--src/lib/libcrypto/aes/asm/aes-586.pl2980
-rw-r--r--src/lib/libcrypto/aes/asm/aes-armv4.pl1030
-rw-r--r--src/lib/libcrypto/aes/asm/aes-ia64.S1123
-rw-r--r--src/lib/libcrypto/aes/asm/aes-ppc.pl1189
-rw-r--r--src/lib/libcrypto/aes/asm/aes-s390x.pl1339
-rwxr-xr-xsrc/lib/libcrypto/aes/asm/aes-sparcv9.pl1181
-rwxr-xr-xsrc/lib/libcrypto/aes/asm/aes-x86_64.pl2809
-rw-r--r--src/lib/libcrypto/aes/asm/aesni-x86_64.pl992
21 files changed, 0 insertions, 16282 deletions
diff --git a/src/lib/libcrypto/aes/README b/src/lib/libcrypto/aes/README
deleted file mode 100644
index 0f9620a80e..0000000000
--- a/src/lib/libcrypto/aes/README
+++ /dev/null
@@ -1,3 +0,0 @@
1This is an OpenSSL-compatible version of AES (also called Rijndael).
2aes_core.c is basically the same as rijndael-alg-fst.c but with an
3API that looks like the rest of the OpenSSL symmetric cipher suite.
diff --git a/src/lib/libcrypto/aes/aes.h b/src/lib/libcrypto/aes/aes.h
deleted file mode 100644
index d2c99730fe..0000000000
--- a/src/lib/libcrypto/aes/aes.h
+++ /dev/null
@@ -1,142 +0,0 @@
1/* crypto/aes/aes.h -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#ifndef HEADER_AES_H
53#define HEADER_AES_H
54
55#include <openssl/opensslconf.h>
56
57#ifdef OPENSSL_NO_AES
58#error AES is disabled.
59#endif
60
61#include <stddef.h>
62
63#define AES_ENCRYPT 1
64#define AES_DECRYPT 0
65
66/* Because array size can't be a const in C, the following two are macros.
67 AES_MAXNR is the maximum number of rounds; AES_BLOCK_SIZE is in bytes. */
68#define AES_MAXNR 14
69#define AES_BLOCK_SIZE 16
70
71#ifdef __cplusplus
72extern "C" {
73#endif
74
75/* This should be a hidden type, but EVP requires that the size be known */
76struct aes_key_st {
77#ifdef AES_LONG
78 unsigned long rd_key[4 *(AES_MAXNR + 1)];
79#else
80 unsigned int rd_key[4 *(AES_MAXNR + 1)];
81#endif
82 int rounds;
83};
84typedef struct aes_key_st AES_KEY;
85
86const char *AES_options(void);
87
88int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
89 AES_KEY *key);
90int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
91 AES_KEY *key);
92
93void AES_encrypt(const unsigned char *in, unsigned char *out,
94 const AES_KEY *key);
95void AES_decrypt(const unsigned char *in, unsigned char *out,
96 const AES_KEY *key);
97
98void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
99 const AES_KEY *key, const int enc);
100void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
101 size_t length, const AES_KEY *key,
102 unsigned char *ivec, const int enc);
103void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
104 size_t length, const AES_KEY *key,
105 unsigned char *ivec, int *num, const int enc);
106void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
107 size_t length, const AES_KEY *key,
108 unsigned char *ivec, int *num, const int enc);
109void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
110 size_t length, const AES_KEY *key,
111 unsigned char *ivec, int *num, const int enc);
112void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
113 size_t length, const AES_KEY *key,
114 unsigned char *ivec, int *num);
115void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
116 size_t length, const AES_KEY *key,
117 unsigned char ivec[AES_BLOCK_SIZE],
118 unsigned char ecount_buf[AES_BLOCK_SIZE],
119 unsigned int *num);
120/* NB: the IV is _two_ blocks long */
121void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
122 size_t length, const AES_KEY *key,
123 unsigned char *ivec, const int enc);
124/* NB: the IV is _four_ blocks long */
125void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out,
126 size_t length, const AES_KEY *key,
127 const AES_KEY *key2, const unsigned char *ivec,
128 const int enc);
129
130int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
131 unsigned char *out,
132 const unsigned char *in, unsigned int inlen);
133int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
134 unsigned char *out,
135 const unsigned char *in, unsigned int inlen);
136
137
138#ifdef __cplusplus
139}
140#endif
141
142#endif /* !HEADER_AES_H */
diff --git a/src/lib/libcrypto/aes/aes_cbc.c b/src/lib/libcrypto/aes/aes_cbc.c
deleted file mode 100644
index 227f75625d..0000000000
--- a/src/lib/libcrypto/aes/aes_cbc.c
+++ /dev/null
@@ -1,63 +0,0 @@
1/* crypto/aes/aes_cbc.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/modes.h>
54
55void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
56 size_t len, const AES_KEY *key,
57 unsigned char *ivec, const int enc) {
58
59 if (enc)
60 CRYPTO_cbc128_encrypt(in,out,len,key,ivec,(block128_f)AES_encrypt);
61 else
62 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,(block128_f)AES_decrypt);
63}
diff --git a/src/lib/libcrypto/aes/aes_cfb.c b/src/lib/libcrypto/aes/aes_cfb.c
deleted file mode 100644
index 0c6d058ce7..0000000000
--- a/src/lib/libcrypto/aes/aes_cfb.c
+++ /dev/null
@@ -1,81 +0,0 @@
1/* crypto/aes/aes_cfb.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/modes.h>
54
55/* The input and output are encrypted as though 128bit cfb mode is being
56 * used. The extra state information to record how much of the
57 * 128bit block we have used is contained in *num.
58 */
59
60void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
61 size_t length, const AES_KEY *key,
62 unsigned char *ivec, int *num, const int enc) {
63
64 CRYPTO_cfb128_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt);
65}
66
67/* N.B. This expects the input to be packed, MS bit first */
68void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
69 size_t length, const AES_KEY *key,
70 unsigned char *ivec, int *num, const int enc)
71 {
72 CRYPTO_cfb128_1_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt);
73 }
74
75void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
76 size_t length, const AES_KEY *key,
77 unsigned char *ivec, int *num, const int enc)
78 {
79 CRYPTO_cfb128_8_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt);
80 }
81
diff --git a/src/lib/libcrypto/aes/aes_core.c b/src/lib/libcrypto/aes/aes_core.c
deleted file mode 100644
index a7ec54f4da..0000000000
--- a/src/lib/libcrypto/aes/aes_core.c
+++ /dev/null
@@ -1,1358 +0,0 @@
1/* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */
2/**
3 * rijndael-alg-fst.c
4 *
5 * @version 3.0 (December 2000)
6 *
7 * Optimised ANSI C code for the Rijndael cipher (now AES)
8 *
9 * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10 * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11 * @author Paulo Barreto <paulo.barreto@terra.com.br>
12 *
13 * This code is hereby placed in the public domain.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/* Note: rewritten a little bit to provide error control and an OpenSSL-
29 compatible API */
30
31#ifndef AES_DEBUG
32# ifndef NDEBUG
33# define NDEBUG
34# endif
35#endif
36#include <assert.h>
37
38#include <stdlib.h>
39#include <openssl/aes.h>
40#include "aes_locl.h"
41
42#ifndef AES_ASM
43/*
44Te0[x] = S [x].[02, 01, 01, 03];
45Te1[x] = S [x].[03, 02, 01, 01];
46Te2[x] = S [x].[01, 03, 02, 01];
47Te3[x] = S [x].[01, 01, 03, 02];
48
49Td0[x] = Si[x].[0e, 09, 0d, 0b];
50Td1[x] = Si[x].[0b, 0e, 09, 0d];
51Td2[x] = Si[x].[0d, 0b, 0e, 09];
52Td3[x] = Si[x].[09, 0d, 0b, 0e];
53Td4[x] = Si[x].[01];
54*/
55
56static const u32 Te0[256] = {
57 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
58 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
59 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
60 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
61 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
62 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
63 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
64 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
65 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
66 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
67 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
68 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
69 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
70 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
71 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
72 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
73 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
74 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
75 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
76 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
77 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
78 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
79 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
80 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
81 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
82 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
83 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
84 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
85 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
86 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
87 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
88 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
89 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
90 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
91 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
92 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
93 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
94 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
95 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
96 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
97 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
98 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
99 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
100 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
101 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
102 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
103 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
104 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
105 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
106 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
107 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
108 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
109 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
110 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
111 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
112 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
113 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
114 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
115 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
116 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
117 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
118 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
119 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
120 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
121};
122static const u32 Te1[256] = {
123 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
124 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
125 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
126 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
127 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
128 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
129 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
130 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
131 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
132 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
133 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
134 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
135 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
136 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
137 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
138 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
139 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
140 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
141 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
142 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
143 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
144 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
145 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
146 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
147 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
148 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
149 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
150 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
151 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
152 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
153 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
154 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
155 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
156 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
157 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
158 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
159 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
160 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
161 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
162 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
163 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
164 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
165 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
166 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
167 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
168 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
169 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
170 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
171 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
172 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
173 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
174 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
175 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
176 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
177 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
178 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
179 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
180 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
181 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
182 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
183 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
184 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
185 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
186 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
187};
188static const u32 Te2[256] = {
189 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
190 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
191 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
192 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
193 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
194 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
195 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
196 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
197 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
198 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
199 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
200 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
201 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
202 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
203 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
204 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
205 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
206 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
207 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
208 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
209 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
210 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
211 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
212 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
213 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
214 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
215 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
216 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
217 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
218 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
219 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
220 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
221 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
222 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
223 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
224 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
225 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
226 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
227 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
228 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
229 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
230 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
231 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
232 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
233 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
234 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
235 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
236 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
237 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
238 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
239 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
240 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
241 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
242 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
243 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
244 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
245 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
246 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
247 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
248 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
249 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
250 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
251 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
252 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
253};
254static const u32 Te3[256] = {
255 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
256 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
257 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
258 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
259 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
260 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
261 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
262 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
263 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
264 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
265 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
266 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
267 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
268 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
269 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
270 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
271 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
272 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
273 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
274 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
275 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
276 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
277 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
278 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
279 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
280 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
281 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
282 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
283 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
284 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
285 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
286 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
287 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
288 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
289 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
290 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
291 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
292 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
293 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
294 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
295 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
296 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
297 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
298 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
299 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
300 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
301 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
302 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
303 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
304 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
305 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
306 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
307 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
308 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
309 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
310 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
311 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
312 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
313 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
314 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
315 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
316 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
317 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
318 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
319};
320
/*
 * Td0: first of the four 256-entry word tables used by the inverse cipher.
 * Each entry packs the inverse S-box value Si[x] multiplied in GF(2^8) by
 * the InvMixColumns coefficients: Td0[x] = Si[x].[0e, 09, 0d, 0b]
 * (e.g. Td0[0x00] = 0x51f4a750 for Si[0x00] = 0x52).
 * AES_decrypt indexes it with the top byte of a state word (s >> 24);
 * AES_set_decrypt_key uses it when applying the inverse MixColumn
 * transform to the round keys.
 */
321static const u32 Td0[256] = {
322 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
323 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
324 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
325 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
326 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
327 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
328 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
329 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
330 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
331 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
332 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
333 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
334 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
335 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
336 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
337 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
338 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
339 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
340 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
341 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
342 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
343 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
344 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
345 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
346 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
347 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
348 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
349 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
350 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
351 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
352 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
353 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
354 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
355 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
356 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
357 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
358 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
359 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
360 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
361 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
362 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
363 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
364 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
365 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
366 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
367 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
368 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
369 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
370 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
371 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
372 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
373 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
374 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
375 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
376 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
377 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
378 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
379 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
380 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
381 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
382 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
383 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
384 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
385 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
386};
/*
 * Td1: second inverse-cipher lookup table.
 * Every entry is the matching Td0 entry rotated right by 8 bits:
 * Td1[x] = Si[x].[0b, 0e, 09, 0d] (e.g. Td1[0x00] = 0x5051f4a7).
 * AES_decrypt indexes it with the second byte of a state word,
 * (s >> 16) & 0xff.
 */
387static const u32 Td1[256] = {
388 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
389 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
390 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
391 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
392 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
393 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
394 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
395 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
396 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
397 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
398 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
399 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
400 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
401 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
402 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
403 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
404 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
405 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
406 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
407 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
408 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
409 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
410 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
411 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
412 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
413 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
414 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
415 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
416 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
417 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
418 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
419 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
420 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
421 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
422 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
423 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
424 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
425 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
426 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
427 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
428 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
429 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
430 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
431 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
432 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
433 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
434 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
435 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
436 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
437 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
438 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
439 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
440 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
441 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
442 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
443 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
444 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
445 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
446 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
447 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
448 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
449 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
450 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
451 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
452};
/*
 * Td2: third inverse-cipher lookup table.
 * Every entry is the matching Td0 entry rotated right by 16 bits:
 * Td2[x] = Si[x].[0d, 0b, 0e, 09] (e.g. Td2[0x00] = 0xa75051f4).
 * AES_decrypt indexes it with the third byte of a state word,
 * (s >> 8) & 0xff.
 */
453static const u32 Td2[256] = {
454 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
455 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
456 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
457 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
458 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
459 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
460 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
461 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
462 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
463 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
464 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
465 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
466 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
467 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
468 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
469 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
470 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
471 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
472 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
473 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
474 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
475 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
476 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
477 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
478 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
479 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
480 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
481 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
482 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
483 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
484 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
485 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
486 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
487 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
488 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
489 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
490 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
491 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
492 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
493 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
494 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
495 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
496 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
497 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
498 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
499 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
500 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
501 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
502 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
503 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
504 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
505 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
506 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
507 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
508 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
509 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
510 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
511 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
512 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
513 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
514 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
515 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
516 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
517 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
518};
/*
 * Td3: fourth inverse-cipher lookup table.
 * Every entry is the matching Td0 entry rotated right by 24 bits:
 * Td3[x] = Si[x].[09, 0d, 0b, 0e] (e.g. Td3[0x00] = 0xf4a75051).
 * AES_decrypt indexes it with the low byte of a state word, s & 0xff.
 */
519static const u32 Td3[256] = {
520 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
521 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
522 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
523 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
524 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
525 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
526 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
527 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
528 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
529 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
530 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
531 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
532 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
533 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
534 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
535 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
536 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
537 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
538 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
539 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
540 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
541 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
542 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
543 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
544 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
545 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
546 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
547 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
548 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
549 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
550 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
551 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
552 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
553 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
554 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
555 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
556 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
557 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
558 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
559 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
560 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
561 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
562 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
563 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
564 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
565 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
566 0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
567 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
568 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
569 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
570 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
571 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
572 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
573 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
574 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
575 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
576 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
577 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
578 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
579 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
580 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
581 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
582 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
583 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
584};
/*
 * Td4: the AES inverse S-box, one byte per entry (Td4[x] = Si[x];
 * Td4[0x00] = 0x52).
 * NOTE(review): not referenced by the key-schedule routines; presumably
 * consumed by the final decryption round -- confirm in AES_decrypt.
 */
585static const u8 Td4[256] = {
586 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
587 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
588 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
589 0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
590 0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
591 0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
592 0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
593 0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
594 0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
595 0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
596 0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
597 0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
598 0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
599 0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
600 0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
601 0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
602 0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
603 0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
604 0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
605 0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
606 0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
607 0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
608 0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
609 0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
610 0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
611 0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
612 0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
613 0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
614 0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
615 0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
616 0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
617 0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU,
618};
/*
 * rcon: key-schedule round constants, stored in the top byte of each word.
 * AES_set_encrypt_key XORs rcon[i] into the first word of every expansion
 * step; 128-bit keys use all ten entries (i = 0..9), 192-bit keys the
 * first eight and 256-bit keys the first seven.
 */
619static const u32 rcon[] = {
620 0x01000000, 0x02000000, 0x04000000, 0x08000000,
621 0x10000000, 0x20000000, 0x40000000, 0x80000000,
622 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
623};
624
625/**
626 * Expand the cipher key into the encryption key schedule.
627 */
628int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
629 AES_KEY *key) {
630
631 u32 *rk;
632 int i = 0;
633 u32 temp;
634
635 if (!userKey || !key)
636 return -1;
637 if (bits != 128 && bits != 192 && bits != 256)
638 return -2;
639
640 rk = key->rd_key;
641
642 if (bits==128)
643 key->rounds = 10;
644 else if (bits==192)
645 key->rounds = 12;
646 else
647 key->rounds = 14;
648
649 rk[0] = GETU32(userKey );
650 rk[1] = GETU32(userKey + 4);
651 rk[2] = GETU32(userKey + 8);
652 rk[3] = GETU32(userKey + 12);
653 if (bits == 128) {
654 while (1) {
655 temp = rk[3];
656 rk[4] = rk[0] ^
657 (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
658 (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
659 (Te0[(temp ) & 0xff] & 0x0000ff00) ^
660 (Te1[(temp >> 24) ] & 0x000000ff) ^
661 rcon[i];
662 rk[5] = rk[1] ^ rk[4];
663 rk[6] = rk[2] ^ rk[5];
664 rk[7] = rk[3] ^ rk[6];
665 if (++i == 10) {
666 return 0;
667 }
668 rk += 4;
669 }
670 }
671 rk[4] = GETU32(userKey + 16);
672 rk[5] = GETU32(userKey + 20);
673 if (bits == 192) {
674 while (1) {
675 temp = rk[ 5];
676 rk[ 6] = rk[ 0] ^
677 (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
678 (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
679 (Te0[(temp ) & 0xff] & 0x0000ff00) ^
680 (Te1[(temp >> 24) ] & 0x000000ff) ^
681 rcon[i];
682 rk[ 7] = rk[ 1] ^ rk[ 6];
683 rk[ 8] = rk[ 2] ^ rk[ 7];
684 rk[ 9] = rk[ 3] ^ rk[ 8];
685 if (++i == 8) {
686 return 0;
687 }
688 rk[10] = rk[ 4] ^ rk[ 9];
689 rk[11] = rk[ 5] ^ rk[10];
690 rk += 6;
691 }
692 }
693 rk[6] = GETU32(userKey + 24);
694 rk[7] = GETU32(userKey + 28);
695 if (bits == 256) {
696 while (1) {
697 temp = rk[ 7];
698 rk[ 8] = rk[ 0] ^
699 (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
700 (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
701 (Te0[(temp ) & 0xff] & 0x0000ff00) ^
702 (Te1[(temp >> 24) ] & 0x000000ff) ^
703 rcon[i];
704 rk[ 9] = rk[ 1] ^ rk[ 8];
705 rk[10] = rk[ 2] ^ rk[ 9];
706 rk[11] = rk[ 3] ^ rk[10];
707 if (++i == 7) {
708 return 0;
709 }
710 temp = rk[11];
711 rk[12] = rk[ 4] ^
712 (Te2[(temp >> 24) ] & 0xff000000) ^
713 (Te3[(temp >> 16) & 0xff] & 0x00ff0000) ^
714 (Te0[(temp >> 8) & 0xff] & 0x0000ff00) ^
715 (Te1[(temp ) & 0xff] & 0x000000ff);
716 rk[13] = rk[ 5] ^ rk[12];
717 rk[14] = rk[ 6] ^ rk[13];
718 rk[15] = rk[ 7] ^ rk[14];
719
720 rk += 8;
721 }
722 }
723 return 0;
724}
725
726/**
727 * Expand the cipher key into the decryption key schedule.
728 */
729int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
730 AES_KEY *key) {
731
732 u32 *rk;
733 int i, j, status;
734 u32 temp;
735
736 /* first, start with an encryption schedule */
737 status = AES_set_encrypt_key(userKey, bits, key);
738 if (status < 0)
739 return status;
740
741 rk = key->rd_key;
742
743 /* invert the order of the round keys: */
744 for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
745 temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
746 temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
747 temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
748 temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
749 }
750 /* apply the inverse MixColumn transform to all round keys but the first and the last: */
751 for (i = 1; i < (key->rounds); i++) {
752 rk += 4;
753 rk[0] =
754 Td0[Te1[(rk[0] >> 24) ] & 0xff] ^
755 Td1[Te1[(rk[0] >> 16) & 0xff] & 0xff] ^
756 Td2[Te1[(rk[0] >> 8) & 0xff] & 0xff] ^
757 Td3[Te1[(rk[0] ) & 0xff] & 0xff];
758 rk[1] =
759 Td0[Te1[(rk[1] >> 24) ] & 0xff] ^
760 Td1[Te1[(rk[1] >> 16) & 0xff] & 0xff] ^
761 Td2[Te1[(rk[1] >> 8) & 0xff] & 0xff] ^
762 Td3[Te1[(rk[1] ) & 0xff] & 0xff];
763 rk[2] =
764 Td0[Te1[(rk[2] >> 24) ] & 0xff] ^
765 Td1[Te1[(rk[2] >> 16) & 0xff] & 0xff] ^
766 Td2[Te1[(rk[2] >> 8) & 0xff] & 0xff] ^
767 Td3[Te1[(rk[2] ) & 0xff] & 0xff];
768 rk[3] =
769 Td0[Te1[(rk[3] >> 24) ] & 0xff] ^
770 Td1[Te1[(rk[3] >> 16) & 0xff] & 0xff] ^
771 Td2[Te1[(rk[3] >> 8) & 0xff] & 0xff] ^
772 Td3[Te1[(rk[3] ) & 0xff] & 0xff];
773 }
774 return 0;
775}
776
777/*
778 * Encrypt a single block
779 * in and out can overlap
780 */
781void AES_encrypt(const unsigned char *in, unsigned char *out,
782 const AES_KEY *key) {
783
784 const u32 *rk;
785 u32 s0, s1, s2, s3, t0, t1, t2, t3;
786#ifndef FULL_UNROLL
787 int r;
788#endif /* ?FULL_UNROLL */
789
790 assert(in && out && key);
791 rk = key->rd_key;
792
793 /*
794 * map byte array block to cipher state
795 * and add initial round key:
796 */
797 s0 = GETU32(in ) ^ rk[0];
798 s1 = GETU32(in + 4) ^ rk[1];
799 s2 = GETU32(in + 8) ^ rk[2];
800 s3 = GETU32(in + 12) ^ rk[3];
801#ifdef FULL_UNROLL
802 /* round 1: */
803 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
804 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
805 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
806 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
807 /* round 2: */
808 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
809 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
810 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
811 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
812 /* round 3: */
813 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
814 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
815 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
816 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
817 /* round 4: */
818 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
819 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
820 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
821 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
822 /* round 5: */
823 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
824 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
825 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
826 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
827 /* round 6: */
828 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
829 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
830 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
831 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
832 /* round 7: */
833 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
834 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
835 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
836 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
837 /* round 8: */
838 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
839 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
840 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
841 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
842 /* round 9: */
843 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
844 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
845 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
846 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
847 if (key->rounds > 10) {
848 /* round 10: */
849 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
850 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
851 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
852 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
853 /* round 11: */
854 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
855 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
856 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
857 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
858 if (key->rounds > 12) {
859 /* round 12: */
860 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
861 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
862 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
863 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
864 /* round 13: */
865 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
866 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
867 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
868 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
869 }
870 }
871 rk += key->rounds << 2;
872#else /* !FULL_UNROLL */
873 /*
874 * Nr - 1 full rounds:
875 */
876 r = key->rounds >> 1;
877 for (;;) {
878 t0 =
879 Te0[(s0 >> 24) ] ^
880 Te1[(s1 >> 16) & 0xff] ^
881 Te2[(s2 >> 8) & 0xff] ^
882 Te3[(s3 ) & 0xff] ^
883 rk[4];
884 t1 =
885 Te0[(s1 >> 24) ] ^
886 Te1[(s2 >> 16) & 0xff] ^
887 Te2[(s3 >> 8) & 0xff] ^
888 Te3[(s0 ) & 0xff] ^
889 rk[5];
890 t2 =
891 Te0[(s2 >> 24) ] ^
892 Te1[(s3 >> 16) & 0xff] ^
893 Te2[(s0 >> 8) & 0xff] ^
894 Te3[(s1 ) & 0xff] ^
895 rk[6];
896 t3 =
897 Te0[(s3 >> 24) ] ^
898 Te1[(s0 >> 16) & 0xff] ^
899 Te2[(s1 >> 8) & 0xff] ^
900 Te3[(s2 ) & 0xff] ^
901 rk[7];
902
903 rk += 8;
904 if (--r == 0) {
905 break;
906 }
907
908 s0 =
909 Te0[(t0 >> 24) ] ^
910 Te1[(t1 >> 16) & 0xff] ^
911 Te2[(t2 >> 8) & 0xff] ^
912 Te3[(t3 ) & 0xff] ^
913 rk[0];
914 s1 =
915 Te0[(t1 >> 24) ] ^
916 Te1[(t2 >> 16) & 0xff] ^
917 Te2[(t3 >> 8) & 0xff] ^
918 Te3[(t0 ) & 0xff] ^
919 rk[1];
920 s2 =
921 Te0[(t2 >> 24) ] ^
922 Te1[(t3 >> 16) & 0xff] ^
923 Te2[(t0 >> 8) & 0xff] ^
924 Te3[(t1 ) & 0xff] ^
925 rk[2];
926 s3 =
927 Te0[(t3 >> 24) ] ^
928 Te1[(t0 >> 16) & 0xff] ^
929 Te2[(t1 >> 8) & 0xff] ^
930 Te3[(t2 ) & 0xff] ^
931 rk[3];
932 }
933#endif /* ?FULL_UNROLL */
934 /*
935 * apply last round and
936 * map cipher state to byte array block:
937 */
938 s0 =
939 (Te2[(t0 >> 24) ] & 0xff000000) ^
940 (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^
941 (Te0[(t2 >> 8) & 0xff] & 0x0000ff00) ^
942 (Te1[(t3 ) & 0xff] & 0x000000ff) ^
943 rk[0];
944 PUTU32(out , s0);
945 s1 =
946 (Te2[(t1 >> 24) ] & 0xff000000) ^
947 (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^
948 (Te0[(t3 >> 8) & 0xff] & 0x0000ff00) ^
949 (Te1[(t0 ) & 0xff] & 0x000000ff) ^
950 rk[1];
951 PUTU32(out + 4, s1);
952 s2 =
953 (Te2[(t2 >> 24) ] & 0xff000000) ^
954 (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^
955 (Te0[(t0 >> 8) & 0xff] & 0x0000ff00) ^
956 (Te1[(t1 ) & 0xff] & 0x000000ff) ^
957 rk[2];
958 PUTU32(out + 8, s2);
959 s3 =
960 (Te2[(t3 >> 24) ] & 0xff000000) ^
961 (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^
962 (Te0[(t1 >> 8) & 0xff] & 0x0000ff00) ^
963 (Te1[(t2 ) & 0xff] & 0x000000ff) ^
964 rk[3];
965 PUTU32(out + 12, s3);
966}
967
968/*
969 * Decrypt a single block
970 * in and out can overlap
971 */
972void AES_decrypt(const unsigned char *in, unsigned char *out,
973 const AES_KEY *key) {
974
975 const u32 *rk;
976 u32 s0, s1, s2, s3, t0, t1, t2, t3;
977#ifndef FULL_UNROLL
978 int r;
979#endif /* ?FULL_UNROLL */
980
981 assert(in && out && key);
982 rk = key->rd_key;
983
984 /*
985 * map byte array block to cipher state
986 * and add initial round key:
987 */
988 s0 = GETU32(in ) ^ rk[0];
989 s1 = GETU32(in + 4) ^ rk[1];
990 s2 = GETU32(in + 8) ^ rk[2];
991 s3 = GETU32(in + 12) ^ rk[3];
992#ifdef FULL_UNROLL
993 /* round 1: */
994 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
995 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
996 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
997 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
998 /* round 2: */
999 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
1000 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
1001 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
1002 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
1003 /* round 3: */
1004 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
1005 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
1006 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
1007 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
1008 /* round 4: */
1009 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
1010 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
1011 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
1012 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
1013 /* round 5: */
1014 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
1015 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
1016 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
1017 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
1018 /* round 6: */
1019 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
1020 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
1021 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
1022 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
1023 /* round 7: */
1024 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
1025 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
1026 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
1027 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
1028 /* round 8: */
1029 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
1030 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
1031 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
1032 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
1033 /* round 9: */
1034 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
1035 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
1036 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
1037 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
1038 if (key->rounds > 10) {
1039 /* round 10: */
1040 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
1041 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
1042 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
1043 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
1044 /* round 11: */
1045 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
1046 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
1047 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
1048 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
1049 if (key->rounds > 12) {
1050 /* round 12: */
1051 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
1052 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
1053 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
1054 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
1055 /* round 13: */
1056 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
1057 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
1058 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
1059 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
1060 }
1061 }
1062 rk += key->rounds << 2;
1063#else /* !FULL_UNROLL */
1064 /*
1065 * Nr - 1 full rounds:
1066 */
1067 r = key->rounds >> 1;
1068 for (;;) {
1069 t0 =
1070 Td0[(s0 >> 24) ] ^
1071 Td1[(s3 >> 16) & 0xff] ^
1072 Td2[(s2 >> 8) & 0xff] ^
1073 Td3[(s1 ) & 0xff] ^
1074 rk[4];
1075 t1 =
1076 Td0[(s1 >> 24) ] ^
1077 Td1[(s0 >> 16) & 0xff] ^
1078 Td2[(s3 >> 8) & 0xff] ^
1079 Td3[(s2 ) & 0xff] ^
1080 rk[5];
1081 t2 =
1082 Td0[(s2 >> 24) ] ^
1083 Td1[(s1 >> 16) & 0xff] ^
1084 Td2[(s0 >> 8) & 0xff] ^
1085 Td3[(s3 ) & 0xff] ^
1086 rk[6];
1087 t3 =
1088 Td0[(s3 >> 24) ] ^
1089 Td1[(s2 >> 16) & 0xff] ^
1090 Td2[(s1 >> 8) & 0xff] ^
1091 Td3[(s0 ) & 0xff] ^
1092 rk[7];
1093
1094 rk += 8;
1095 if (--r == 0) {
1096 break;
1097 }
1098
1099 s0 =
1100 Td0[(t0 >> 24) ] ^
1101 Td1[(t3 >> 16) & 0xff] ^
1102 Td2[(t2 >> 8) & 0xff] ^
1103 Td3[(t1 ) & 0xff] ^
1104 rk[0];
1105 s1 =
1106 Td0[(t1 >> 24) ] ^
1107 Td1[(t0 >> 16) & 0xff] ^
1108 Td2[(t3 >> 8) & 0xff] ^
1109 Td3[(t2 ) & 0xff] ^
1110 rk[1];
1111 s2 =
1112 Td0[(t2 >> 24) ] ^
1113 Td1[(t1 >> 16) & 0xff] ^
1114 Td2[(t0 >> 8) & 0xff] ^
1115 Td3[(t3 ) & 0xff] ^
1116 rk[2];
1117 s3 =
1118 Td0[(t3 >> 24) ] ^
1119 Td1[(t2 >> 16) & 0xff] ^
1120 Td2[(t1 >> 8) & 0xff] ^
1121 Td3[(t0 ) & 0xff] ^
1122 rk[3];
1123 }
1124#endif /* ?FULL_UNROLL */
1125 /*
1126 * apply last round and
1127 * map cipher state to byte array block:
1128 */
1129 s0 =
1130 (Td4[(t0 >> 24) ] << 24) ^
1131 (Td4[(t3 >> 16) & 0xff] << 16) ^
1132 (Td4[(t2 >> 8) & 0xff] << 8) ^
1133 (Td4[(t1 ) & 0xff]) ^
1134 rk[0];
1135 PUTU32(out , s0);
1136 s1 =
1137 (Td4[(t1 >> 24) ] << 24) ^
1138 (Td4[(t0 >> 16) & 0xff] << 16) ^
1139 (Td4[(t3 >> 8) & 0xff] << 8) ^
1140 (Td4[(t2 ) & 0xff]) ^
1141 rk[1];
1142 PUTU32(out + 4, s1);
1143 s2 =
1144 (Td4[(t2 >> 24) ] << 24) ^
1145 (Td4[(t1 >> 16) & 0xff] << 16) ^
1146 (Td4[(t0 >> 8) & 0xff] << 8) ^
1147 (Td4[(t3 ) & 0xff]) ^
1148 rk[2];
1149 PUTU32(out + 8, s2);
1150 s3 =
1151 (Td4[(t3 >> 24) ] << 24) ^
1152 (Td4[(t2 >> 16) & 0xff] << 16) ^
1153 (Td4[(t1 >> 8) & 0xff] << 8) ^
1154 (Td4[(t0 ) & 0xff]) ^
1155 rk[3];
1156 PUTU32(out + 12, s3);
1157}
1158
1159#else /* AES_ASM */
1160
1161static const u8 Te4[256] = {
1162 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
1163 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
1164 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
1165 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
1166 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
1167 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
1168 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
1169 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
1170 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
1171 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
1172 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
1173 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
1174 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
1175 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
1176 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
1177 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
1178 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
1179 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
1180 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
1181 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
1182 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
1183 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
1184 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
1185 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
1186 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
1187 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
1188 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
1189 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
1190 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
1191 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
1192 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
1193 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
1194};
1195static const u32 rcon[] = {
1196 0x01000000, 0x02000000, 0x04000000, 0x08000000,
1197 0x10000000, 0x20000000, 0x40000000, 0x80000000,
1198 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
1199};
1200
1201/**
1202 * Expand the cipher key into the encryption key schedule.
1203 */
1204int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
1205 AES_KEY *key) {
1206 u32 *rk;
1207 int i = 0;
1208 u32 temp;
1209
1210 if (!userKey || !key)
1211 return -1;
1212 if (bits != 128 && bits != 192 && bits != 256)
1213 return -2;
1214
1215 rk = key->rd_key;
1216
1217 if (bits==128)
1218 key->rounds = 10;
1219 else if (bits==192)
1220 key->rounds = 12;
1221 else
1222 key->rounds = 14;
1223
1224 rk[0] = GETU32(userKey );
1225 rk[1] = GETU32(userKey + 4);
1226 rk[2] = GETU32(userKey + 8);
1227 rk[3] = GETU32(userKey + 12);
1228 if (bits == 128) {
1229 while (1) {
1230 temp = rk[3];
1231 rk[4] = rk[0] ^
1232 (Te4[(temp >> 16) & 0xff] << 24) ^
1233 (Te4[(temp >> 8) & 0xff] << 16) ^
1234 (Te4[(temp ) & 0xff] << 8) ^
1235 (Te4[(temp >> 24) ]) ^
1236 rcon[i];
1237 rk[5] = rk[1] ^ rk[4];
1238 rk[6] = rk[2] ^ rk[5];
1239 rk[7] = rk[3] ^ rk[6];
1240 if (++i == 10) {
1241 return 0;
1242 }
1243 rk += 4;
1244 }
1245 }
1246 rk[4] = GETU32(userKey + 16);
1247 rk[5] = GETU32(userKey + 20);
1248 if (bits == 192) {
1249 while (1) {
1250 temp = rk[ 5];
1251 rk[ 6] = rk[ 0] ^
1252 (Te4[(temp >> 16) & 0xff] << 24) ^
1253 (Te4[(temp >> 8) & 0xff] << 16) ^
1254 (Te4[(temp ) & 0xff] << 8) ^
1255 (Te4[(temp >> 24) ]) ^
1256 rcon[i];
1257 rk[ 7] = rk[ 1] ^ rk[ 6];
1258 rk[ 8] = rk[ 2] ^ rk[ 7];
1259 rk[ 9] = rk[ 3] ^ rk[ 8];
1260 if (++i == 8) {
1261 return 0;
1262 }
1263 rk[10] = rk[ 4] ^ rk[ 9];
1264 rk[11] = rk[ 5] ^ rk[10];
1265 rk += 6;
1266 }
1267 }
1268 rk[6] = GETU32(userKey + 24);
1269 rk[7] = GETU32(userKey + 28);
1270 if (bits == 256) {
1271 while (1) {
1272 temp = rk[ 7];
1273 rk[ 8] = rk[ 0] ^
1274 (Te4[(temp >> 16) & 0xff] << 24) ^
1275 (Te4[(temp >> 8) & 0xff] << 16) ^
1276 (Te4[(temp ) & 0xff] << 8) ^
1277 (Te4[(temp >> 24) ]) ^
1278 rcon[i];
1279 rk[ 9] = rk[ 1] ^ rk[ 8];
1280 rk[10] = rk[ 2] ^ rk[ 9];
1281 rk[11] = rk[ 3] ^ rk[10];
1282 if (++i == 7) {
1283 return 0;
1284 }
1285 temp = rk[11];
1286 rk[12] = rk[ 4] ^
1287 (Te4[(temp >> 24) ] << 24) ^
1288 (Te4[(temp >> 16) & 0xff] << 16) ^
1289 (Te4[(temp >> 8) & 0xff] << 8) ^
1290 (Te4[(temp ) & 0xff]);
1291 rk[13] = rk[ 5] ^ rk[12];
1292 rk[14] = rk[ 6] ^ rk[13];
1293 rk[15] = rk[ 7] ^ rk[14];
1294
1295 rk += 8;
1296 }
1297 }
1298 return 0;
1299}
1300
1301/**
1302 * Expand the cipher key into the decryption key schedule.
1303 */
1304int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
1305 AES_KEY *key) {
1306
1307 u32 *rk;
1308 int i, j, status;
1309 u32 temp;
1310
1311 /* first, start with an encryption schedule */
1312 status = AES_set_encrypt_key(userKey, bits, key);
1313 if (status < 0)
1314 return status;
1315
1316 rk = key->rd_key;
1317
1318 /* invert the order of the round keys: */
1319 for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
1320 temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
1321 temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
1322 temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
1323 temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
1324 }
1325 /* apply the inverse MixColumn transform to all round keys but the first and the last: */
1326 for (i = 1; i < (key->rounds); i++) {
1327 rk += 4;
1328 for (j = 0; j < 4; j++) {
1329 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
1330
1331 tp1 = rk[j];
1332 m = tp1 & 0x80808080;
1333 tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
1334 ((m - (m >> 7)) & 0x1b1b1b1b);
1335 m = tp2 & 0x80808080;
1336 tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
1337 ((m - (m >> 7)) & 0x1b1b1b1b);
1338 m = tp4 & 0x80808080;
1339 tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
1340 ((m - (m >> 7)) & 0x1b1b1b1b);
1341 tp9 = tp8 ^ tp1;
1342 tpb = tp9 ^ tp2;
1343 tpd = tp9 ^ tp4;
1344 tpe = tp8 ^ tp4 ^ tp2;
1345#if defined(ROTATE)
1346 rk[j] = tpe ^ ROTATE(tpd,16) ^
1347 ROTATE(tp9,24) ^ ROTATE(tpb,8);
1348#else
1349 rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
1350 (tp9 >> 8) ^ (tp9 << 24) ^
1351 (tpb >> 24) ^ (tpb << 8);
1352#endif
1353 }
1354 }
1355 return 0;
1356}
1357
1358#endif /* AES_ASM */
diff --git a/src/lib/libcrypto/aes/aes_ctr.c b/src/lib/libcrypto/aes/aes_ctr.c
deleted file mode 100644
index 7c9d165d8a..0000000000
--- a/src/lib/libcrypto/aes/aes_ctr.c
+++ /dev/null
@@ -1,61 +0,0 @@
1/* crypto/aes/aes_ctr.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/modes.h>
54
55void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
56 size_t length, const AES_KEY *key,
57 unsigned char ivec[AES_BLOCK_SIZE],
58 unsigned char ecount_buf[AES_BLOCK_SIZE],
59 unsigned int *num) {
60 CRYPTO_ctr128_encrypt(in,out,length,key,ivec,ecount_buf,num,(block128_f)AES_encrypt);
61}
diff --git a/src/lib/libcrypto/aes/aes_ecb.c b/src/lib/libcrypto/aes/aes_ecb.c
deleted file mode 100644
index 28aa561c2d..0000000000
--- a/src/lib/libcrypto/aes/aes_ecb.c
+++ /dev/null
@@ -1,73 +0,0 @@
1/* crypto/aes/aes_ecb.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#ifndef AES_DEBUG
53# ifndef NDEBUG
54# define NDEBUG
55# endif
56#endif
57#include <assert.h>
58
59#include <openssl/aes.h>
60#include "aes_locl.h"
61
62void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
63 const AES_KEY *key, const int enc) {
64
65 assert(in && out && key);
66 assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc));
67
68 if (AES_ENCRYPT == enc)
69 AES_encrypt(in, out, key);
70 else
71 AES_decrypt(in, out, key);
72}
73
diff --git a/src/lib/libcrypto/aes/aes_ige.c b/src/lib/libcrypto/aes/aes_ige.c
deleted file mode 100644
index c161351e65..0000000000
--- a/src/lib/libcrypto/aes/aes_ige.c
+++ /dev/null
@@ -1,323 +0,0 @@
1/* crypto/aes/aes_ige.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include "cryptlib.h"
53
54#include <openssl/aes.h>
55#include "aes_locl.h"
56
57#define N_WORDS (AES_BLOCK_SIZE / sizeof(unsigned long))
58typedef struct {
59 unsigned long data[N_WORDS];
60} aes_block_t;
61
62/* XXX: probably some better way to do this */
63#if defined(__i386__) || defined(__x86_64__)
64#define UNALIGNED_MEMOPS_ARE_FAST 1
65#else
66#define UNALIGNED_MEMOPS_ARE_FAST 0
67#endif
68
69#if UNALIGNED_MEMOPS_ARE_FAST
70#define load_block(d, s) (d) = *(const aes_block_t *)(s)
71#define store_block(d, s) *(aes_block_t *)(d) = (s)
72#else
73#define load_block(d, s) memcpy((d).data, (s), AES_BLOCK_SIZE)
74#define store_block(d, s) memcpy((d), (s).data, AES_BLOCK_SIZE)
75#endif
76
77/* N.B. The IV for this mode is _twice_ the block size */
78
79void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
80 size_t length, const AES_KEY *key,
81 unsigned char *ivec, const int enc)
82 {
83 size_t n;
84 size_t len = length;
85
86 OPENSSL_assert(in && out && key && ivec);
87 OPENSSL_assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc));
88 OPENSSL_assert((length%AES_BLOCK_SIZE) == 0);
89
90 len = length / AES_BLOCK_SIZE;
91
92 if (AES_ENCRYPT == enc)
93 {
94 if (in != out &&
95 (UNALIGNED_MEMOPS_ARE_FAST || ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(long)==0))
96 {
97 aes_block_t *ivp = (aes_block_t *)ivec;
98 aes_block_t *iv2p = (aes_block_t *)(ivec + AES_BLOCK_SIZE);
99
100 while (len)
101 {
102 aes_block_t *inp = (aes_block_t *)in;
103 aes_block_t *outp = (aes_block_t *)out;
104
105 for(n=0 ; n < N_WORDS; ++n)
106 outp->data[n] = inp->data[n] ^ ivp->data[n];
107 AES_encrypt((unsigned char *)outp->data, (unsigned char *)outp->data, key);
108 for(n=0 ; n < N_WORDS; ++n)
109 outp->data[n] ^= iv2p->data[n];
110 ivp = outp;
111 iv2p = inp;
112 --len;
113 in += AES_BLOCK_SIZE;
114 out += AES_BLOCK_SIZE;
115 }
116 memcpy(ivec, ivp->data, AES_BLOCK_SIZE);
117 memcpy(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
118 }
119 else
120 {
121 aes_block_t tmp, tmp2;
122 aes_block_t iv;
123 aes_block_t iv2;
124
125 load_block(iv, ivec);
126 load_block(iv2, ivec + AES_BLOCK_SIZE);
127
128 while (len)
129 {
130 load_block(tmp, in);
131 for(n=0 ; n < N_WORDS; ++n)
132 tmp2.data[n] = tmp.data[n] ^ iv.data[n];
133 AES_encrypt((unsigned char *)tmp2.data, (unsigned char *)tmp2.data, key);
134 for(n=0 ; n < N_WORDS; ++n)
135 tmp2.data[n] ^= iv2.data[n];
136 store_block(out, tmp2);
137 iv = tmp2;
138 iv2 = tmp;
139 --len;
140 in += AES_BLOCK_SIZE;
141 out += AES_BLOCK_SIZE;
142 }
143 memcpy(ivec, iv.data, AES_BLOCK_SIZE);
144 memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
145 }
146 }
147 else
148 {
149 if (in != out &&
150 (UNALIGNED_MEMOPS_ARE_FAST || ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(long)==0))
151 {
152 aes_block_t *ivp = (aes_block_t *)ivec;
153 aes_block_t *iv2p = (aes_block_t *)(ivec + AES_BLOCK_SIZE);
154
155 while (len)
156 {
157 aes_block_t tmp;
158 aes_block_t *inp = (aes_block_t *)in;
159 aes_block_t *outp = (aes_block_t *)out;
160
161 for(n=0 ; n < N_WORDS; ++n)
162 tmp.data[n] = inp->data[n] ^ iv2p->data[n];
163 AES_decrypt((unsigned char *)tmp.data, (unsigned char *)outp->data, key);
164 for(n=0 ; n < N_WORDS; ++n)
165 outp->data[n] ^= ivp->data[n];
166 ivp = inp;
167 iv2p = outp;
168 --len;
169 in += AES_BLOCK_SIZE;
170 out += AES_BLOCK_SIZE;
171 }
172 memcpy(ivec, ivp->data, AES_BLOCK_SIZE);
173 memcpy(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
174 }
175 else
176 {
177 aes_block_t tmp, tmp2;
178 aes_block_t iv;
179 aes_block_t iv2;
180
181 load_block(iv, ivec);
182 load_block(iv2, ivec + AES_BLOCK_SIZE);
183
184 while (len)
185 {
186 load_block(tmp, in);
187 tmp2 = tmp;
188 for(n=0 ; n < N_WORDS; ++n)
189 tmp.data[n] ^= iv2.data[n];
190 AES_decrypt((unsigned char *)tmp.data, (unsigned char *)tmp.data, key);
191 for(n=0 ; n < N_WORDS; ++n)
192 tmp.data[n] ^= iv.data[n];
193 store_block(out, tmp);
194 iv = tmp2;
195 iv2 = tmp;
196 --len;
197 in += AES_BLOCK_SIZE;
198 out += AES_BLOCK_SIZE;
199 }
200 memcpy(ivec, iv.data, AES_BLOCK_SIZE);
201 memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
202 }
203 }
204 }
205
206/*
207 * Note that its effectively impossible to do biIGE in anything other
208 * than a single pass, so no provision is made for chaining.
209 */
210
211/* N.B. The IV for this mode is _four times_ the block size */
212
213void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out,
214 size_t length, const AES_KEY *key,
215 const AES_KEY *key2, const unsigned char *ivec,
216 const int enc)
217 {
218 size_t n;
219 size_t len = length;
220 unsigned char tmp[AES_BLOCK_SIZE];
221 unsigned char tmp2[AES_BLOCK_SIZE];
222 unsigned char tmp3[AES_BLOCK_SIZE];
223 unsigned char prev[AES_BLOCK_SIZE];
224 const unsigned char *iv;
225 const unsigned char *iv2;
226
227 OPENSSL_assert(in && out && key && ivec);
228 OPENSSL_assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc));
229 OPENSSL_assert((length%AES_BLOCK_SIZE) == 0);
230
231 if (AES_ENCRYPT == enc)
232 {
233 /* XXX: Do a separate case for when in != out (strictly should
234 check for overlap, too) */
235
236 /* First the forward pass */
237 iv = ivec;
238 iv2 = ivec + AES_BLOCK_SIZE;
239 while (len >= AES_BLOCK_SIZE)
240 {
241 for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
242 out[n] = in[n] ^ iv[n];
243 AES_encrypt(out, out, key);
244 for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
245 out[n] ^= iv2[n];
246 iv = out;
247 memcpy(prev, in, AES_BLOCK_SIZE);
248 iv2 = prev;
249 len -= AES_BLOCK_SIZE;
250 in += AES_BLOCK_SIZE;
251 out += AES_BLOCK_SIZE;
252 }
253
254 /* And now backwards */
255 iv = ivec + AES_BLOCK_SIZE*2;
256 iv2 = ivec + AES_BLOCK_SIZE*3;
257 len = length;
258 while(len >= AES_BLOCK_SIZE)
259 {
260 out -= AES_BLOCK_SIZE;
261 /* XXX: reduce copies by alternating between buffers */
262 memcpy(tmp, out, AES_BLOCK_SIZE);
263 for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
264 out[n] ^= iv[n];
265 /* hexdump(stdout, "out ^ iv", out, AES_BLOCK_SIZE); */
266 AES_encrypt(out, out, key);
267 /* hexdump(stdout,"enc", out, AES_BLOCK_SIZE); */
268 /* hexdump(stdout,"iv2", iv2, AES_BLOCK_SIZE); */
269 for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
270 out[n] ^= iv2[n];
271 /* hexdump(stdout,"out", out, AES_BLOCK_SIZE); */
272 iv = out;
273 memcpy(prev, tmp, AES_BLOCK_SIZE);
274 iv2 = prev;
275 len -= AES_BLOCK_SIZE;
276 }
277 }
278 else
279 {
280 /* First backwards */
281 iv = ivec + AES_BLOCK_SIZE*2;
282 iv2 = ivec + AES_BLOCK_SIZE*3;
283 in += length;
284 out += length;
285 while (len >= AES_BLOCK_SIZE)
286 {
287 in -= AES_BLOCK_SIZE;
288 out -= AES_BLOCK_SIZE;
289 memcpy(tmp, in, AES_BLOCK_SIZE);
290 memcpy(tmp2, in, AES_BLOCK_SIZE);
291 for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
292 tmp[n] ^= iv2[n];
293 AES_decrypt(tmp, out, key);
294 for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
295 out[n] ^= iv[n];
296 memcpy(tmp3, tmp2, AES_BLOCK_SIZE);
297 iv = tmp3;
298 iv2 = out;
299 len -= AES_BLOCK_SIZE;
300 }
301
302 /* And now forwards */
303 iv = ivec;
304 iv2 = ivec + AES_BLOCK_SIZE;
305 len = length;
306 while (len >= AES_BLOCK_SIZE)
307 {
308 memcpy(tmp, out, AES_BLOCK_SIZE);
309 memcpy(tmp2, out, AES_BLOCK_SIZE);
310 for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
311 tmp[n] ^= iv2[n];
312 AES_decrypt(tmp, out, key);
313 for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
314 out[n] ^= iv[n];
315 memcpy(tmp3, tmp2, AES_BLOCK_SIZE);
316 iv = tmp3;
317 iv2 = out;
318 len -= AES_BLOCK_SIZE;
319 in += AES_BLOCK_SIZE;
320 out += AES_BLOCK_SIZE;
321 }
322 }
323 }
diff --git a/src/lib/libcrypto/aes/aes_locl.h b/src/lib/libcrypto/aes/aes_locl.h
deleted file mode 100644
index 054b442d41..0000000000
--- a/src/lib/libcrypto/aes/aes_locl.h
+++ /dev/null
@@ -1,89 +0,0 @@
1/* crypto/aes/aes_locl.h -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#ifndef HEADER_AES_LOCL_H
53#define HEADER_AES_LOCL_H
54
55#include <openssl/e_os2.h>
56
57#ifdef OPENSSL_NO_AES
58#error AES is disabled.
59#endif
60
61#include <stdio.h>
62#include <stdlib.h>
63#include <string.h>
64
/* Fixed-purpose integer aliases used throughout the AES code. */
#ifdef AES_LONG
typedef unsigned long u32;
#else
typedef unsigned int u32;
#endif
typedef unsigned short u16;
typedef unsigned char u8;

/*
 * GETU32(p)      read four bytes at p as one big-endian word
 * PUTU32(ct, st) store the low 32 bits of st at ct, most significant
 *                byte first
 *
 * The MSVC/x86 variant accesses the memory as a u32 and swaps the bytes
 * with rotate intrinsics; the generic variant works byte by byte.
 */
#if defined(_MSC_VER) && \
    (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
# define SWAP(x)						\
	((_lrotl(x, 8) & 0x00ff00ff) | (_lrotr(x, 8) & 0xff00ff00))
# define GETU32(p)	SWAP(*((u32 *)(p)))
# define PUTU32(ct, st)	{ *((u32 *)(ct)) = SWAP((st)); }
#else
# define GETU32(pt)						\
	(((u32)(pt)[0] << 24) ^					\
	 ((u32)(pt)[1] << 16) ^					\
	 ((u32)(pt)[2] <<  8) ^					\
	 ((u32)(pt)[3]))
# define PUTU32(ct, st)	{					\
	(ct)[0] = (u8)((st) >> 24);				\
	(ct)[1] = (u8)((st) >> 16);				\
	(ct)[2] = (u8)((st) >>  8);				\
	(ct)[3] = (u8)(st); }
#endif

/* Upper bounds for a 256-bit key: 32-bit key words, key bytes, rounds. */
#define MAXKC   (256/32)
#define MAXKB   (256/8)
#define MAXNR   14

/* This controls loop-unrolling in aes_core.c */
#undef FULL_UNROLL
88
89#endif /* !HEADER_AES_LOCL_H */
diff --git a/src/lib/libcrypto/aes/aes_misc.c b/src/lib/libcrypto/aes/aes_misc.c
deleted file mode 100644
index 4fead1b4c7..0000000000
--- a/src/lib/libcrypto/aes/aes_misc.c
+++ /dev/null
@@ -1,64 +0,0 @@
1/* crypto/aes/aes_misc.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/opensslv.h>
53#include <openssl/aes.h>
54#include "aes_locl.h"
55
56const char AES_version[]="AES" OPENSSL_VERSION_PTEXT;
57
/*
 * Report which build variant of the AES core was compiled in.
 *
 * Returns a static string: "aes(full)" when FULL_UNROLL is defined at
 * compile time, "aes(partial)" otherwise.
 */
const char *AES_options(void)
{
#ifndef FULL_UNROLL
	return "aes(partial)";
#else
	return "aes(full)";
#endif
}
diff --git a/src/lib/libcrypto/aes/aes_ofb.c b/src/lib/libcrypto/aes/aes_ofb.c
deleted file mode 100644
index 50bf0b8325..0000000000
--- a/src/lib/libcrypto/aes/aes_ofb.c
+++ /dev/null
@@ -1,60 +0,0 @@
1/* crypto/aes/aes_ofb.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/modes.h>
54
55void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
56 size_t length, const AES_KEY *key,
57 unsigned char *ivec, int *num)
58{
59 CRYPTO_ofb128_encrypt(in,out,length,key,ivec,num,(block128_f)AES_encrypt);
60}
diff --git a/src/lib/libcrypto/aes/aes_wrap.c b/src/lib/libcrypto/aes/aes_wrap.c
deleted file mode 100644
index e2d73d37ce..0000000000
--- a/src/lib/libcrypto/aes/aes_wrap.c
+++ /dev/null
@@ -1,259 +0,0 @@
1/* crypto/aes/aes_wrap.c */
2/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
3 * project.
4 */
5/* ====================================================================
6 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. All advertising materials mentioning features or use of this
21 * software must display the following acknowledgment:
22 * "This product includes software developed by the OpenSSL Project
23 * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24 *
25 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26 * endorse or promote products derived from this software without
27 * prior written permission. For written permission, please contact
28 * licensing@OpenSSL.org.
29 *
30 * 5. Products derived from this software may not be called "OpenSSL"
31 * nor may "OpenSSL" appear in their names without prior written
32 * permission of the OpenSSL Project.
33 *
34 * 6. Redistributions of any form whatsoever must retain the following
35 * acknowledgment:
36 * "This product includes software developed by the OpenSSL Project
37 * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50 * OF THE POSSIBILITY OF SUCH DAMAGE.
51 * ====================================================================
52 */
53
54#include "cryptlib.h"
55#include <openssl/aes.h>
56#include <openssl/bio.h>
57
58static const unsigned char default_iv[] = {
59 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
60};
61
62int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
63 unsigned char *out,
64 const unsigned char *in, unsigned int inlen)
65 {
66 unsigned char *A, B[16], *R;
67 unsigned int i, j, t;
68 if ((inlen & 0x7) || (inlen < 8))
69 return -1;
70 A = B;
71 t = 1;
72 memcpy(out + 8, in, inlen);
73 if (!iv)
74 iv = default_iv;
75
76 memcpy(A, iv, 8);
77
78 for (j = 0; j < 6; j++)
79 {
80 R = out + 8;
81 for (i = 0; i < inlen; i += 8, t++, R += 8)
82 {
83 memcpy(B + 8, R, 8);
84 AES_encrypt(B, B, key);
85 A[7] ^= (unsigned char)(t & 0xff);
86 if (t > 0xff)
87 {
88 A[6] ^= (unsigned char)((t >> 8) & 0xff);
89 A[5] ^= (unsigned char)((t >> 16) & 0xff);
90 A[4] ^= (unsigned char)((t >> 24) & 0xff);
91 }
92 memcpy(R, B + 8, 8);
93 }
94 }
95 memcpy(out, A, 8);
96 return inlen + 8;
97 }
98
99int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
100 unsigned char *out,
101 const unsigned char *in, unsigned int inlen)
102 {
103 unsigned char *A, B[16], *R;
104 unsigned int i, j, t;
105 inlen -= 8;
106 if (inlen & 0x7)
107 return -1;
108 if (inlen < 8)
109 return -1;
110 A = B;
111 t = 6 * (inlen >> 3);
112 memcpy(A, in, 8);
113 memcpy(out, in + 8, inlen);
114 for (j = 0; j < 6; j++)
115 {
116 R = out + inlen - 8;
117 for (i = 0; i < inlen; i += 8, t--, R -= 8)
118 {
119 A[7] ^= (unsigned char)(t & 0xff);
120 if (t > 0xff)
121 {
122 A[6] ^= (unsigned char)((t >> 8) & 0xff);
123 A[5] ^= (unsigned char)((t >> 16) & 0xff);
124 A[4] ^= (unsigned char)((t >> 24) & 0xff);
125 }
126 memcpy(B + 8, R, 8);
127 AES_decrypt(B, B, key);
128 memcpy(R, B + 8, 8);
129 }
130 }
131 if (!iv)
132 iv = default_iv;
133 if (memcmp(A, iv, 8))
134 {
135 OPENSSL_cleanse(out, inlen);
136 return 0;
137 }
138 return inlen;
139 }
140
141#ifdef AES_WRAP_TEST
142
143int AES_wrap_unwrap_test(const unsigned char *kek, int keybits,
144 const unsigned char *iv,
145 const unsigned char *eout,
146 const unsigned char *key, int keylen)
147 {
148 unsigned char *otmp = NULL, *ptmp = NULL;
149 int r, ret = 0;
150 AES_KEY wctx;
151 otmp = OPENSSL_malloc(keylen + 8);
152 ptmp = OPENSSL_malloc(keylen);
153 if (!otmp || !ptmp)
154 return 0;
155 if (AES_set_encrypt_key(kek, keybits, &wctx))
156 goto err;
157 r = AES_wrap_key(&wctx, iv, otmp, key, keylen);
158 if (r <= 0)
159 goto err;
160
161 if (eout && memcmp(eout, otmp, keylen))
162 goto err;
163
164 if (AES_set_decrypt_key(kek, keybits, &wctx))
165 goto err;
166 r = AES_unwrap_key(&wctx, iv, ptmp, otmp, r);
167
168 if (memcmp(key, ptmp, keylen))
169 goto err;
170
171 ret = 1;
172
173 err:
174 if (otmp)
175 OPENSSL_free(otmp);
176 if (ptmp)
177 OPENSSL_free(ptmp);
178
179 return ret;
180
181 }
182
183
184
/*
 * Stand-alone driver (built only with AES_WRAP_TEST): runs six wrap /
 * unwrap known-answer tests and prints one "Key test result" line per
 * test to stderr (1 = pass, 0 = fail).
 *
 * Returns 0 when every test passes and 1 otherwise.
 */
int main(int argc, char **argv)
{

static const unsigned char kek[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
};

static const unsigned char key[] = {
  0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
  0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};

static const unsigned char e1[] = {
  0x1f, 0xa6, 0x8b, 0x0a, 0x81, 0x12, 0xb4, 0x47,
  0xae, 0xf3, 0x4b, 0xd8, 0xfb, 0x5a, 0x7b, 0x82,
  0x9d, 0x3e, 0x86, 0x23, 0x71, 0xd2, 0xcf, 0xe5
};

static const unsigned char e2[] = {
  0x96, 0x77, 0x8b, 0x25, 0xae, 0x6c, 0xa4, 0x35,
  0xf9, 0x2b, 0x5b, 0x97, 0xc0, 0x50, 0xae, 0xd2,
  0x46, 0x8a, 0xb8, 0xa1, 0x7a, 0xd8, 0x4e, 0x5d
};

static const unsigned char e3[] = {
  0x64, 0xe8, 0xc3, 0xf9, 0xce, 0x0f, 0x5b, 0xa2,
  0x63, 0xe9, 0x77, 0x79, 0x05, 0x81, 0x8a, 0x2a,
  0x93, 0xc8, 0x19, 0x1e, 0x7d, 0x6e, 0x8a, 0xe7
};

static const unsigned char e4[] = {
  0x03, 0x1d, 0x33, 0x26, 0x4e, 0x15, 0xd3, 0x32,
  0x68, 0xf2, 0x4e, 0xc2, 0x60, 0x74, 0x3e, 0xdc,
  0xe1, 0xc6, 0xc7, 0xdd, 0xee, 0x72, 0x5a, 0x93,
  0x6b, 0xa8, 0x14, 0x91, 0x5c, 0x67, 0x62, 0xd2
};

static const unsigned char e5[] = {
  0xa8, 0xf9, 0xbc, 0x16, 0x12, 0xc6, 0x8b, 0x3f,
  0xf6, 0xe6, 0xf4, 0xfb, 0xe3, 0x0e, 0x71, 0xe4,
  0x76, 0x9c, 0x8b, 0x80, 0xa3, 0x2c, 0xb8, 0x95,
  0x8c, 0xd5, 0xd1, 0x7d, 0x6b, 0x25, 0x4d, 0xa1
};

static const unsigned char e6[] = {
  0x28, 0xc9, 0xf4, 0x04, 0xc4, 0xb8, 0x10, 0xf4,
  0xcb, 0xcc, 0xb3, 0x5c, 0xfb, 0x87, 0xf8, 0x26,
  0x3f, 0x57, 0x86, 0xe2, 0xd8, 0x0e, 0xd3, 0x26,
  0xcb, 0xc7, 0xf0, 0xe7, 0x1a, 0x99, 0xf4, 0x3b,
  0xfb, 0x98, 0x8b, 0x9b, 0x7a, 0x02, 0xdd, 0x21
};

	/* One row per test: KEK size in bits, expected wrapped output,
	 * and how many bytes of `key` are wrapped. */
	static const struct
		{
		int keybits;
		const unsigned char *expected;
		int keylen;
		} cases[] = {
		{ 128, e1, 16 },
		{ 192, e2, 16 },
		{ 256, e3, 16 },
		{ 192, e4, 24 },
		{ 256, e5, 24 },
		{ 256, e6, 32 }
		};
	unsigned int n;
	int ret, failed = 0;

	(void)argc;
	(void)argv;

	for (n = 0; n < sizeof(cases) / sizeof(cases[0]); n++)
		{
		ret = AES_wrap_unwrap_test(kek, cases[n].keybits, NULL,
					   cases[n].expected, key,
					   cases[n].keylen);
		fprintf(stderr, "Key test result %d\n", ret);
		if (ret != 1)
			failed = 1;
		}

	/* Let the exit status tell scripts whether anything failed. */
	return failed;
}
257
258
259#endif
diff --git a/src/lib/libcrypto/aes/aes_x86core.c b/src/lib/libcrypto/aes/aes_x86core.c
deleted file mode 100644
index d323e265c0..0000000000
--- a/src/lib/libcrypto/aes/aes_x86core.c
+++ /dev/null
@@ -1,1063 +0,0 @@
1/* crypto/aes/aes_x86core.c -*- mode:C; c-file-style: "eay" -*- */
2/**
3 * rijndael-alg-fst.c
4 *
5 * @version 3.0 (December 2000)
6 *
7 * Optimised ANSI C code for the Rijndael cipher (now AES)
8 *
9 * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10 * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11 * @author Paulo Barreto <paulo.barreto@terra.com.br>
12 *
13 * This code is hereby placed in the public domain.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * This is an experimental x86[_64] derivative. It assumes little-endian
30 * byte order and expects the CPU to tolerate unaligned memory references.
31 * It is used as a playground for cache-timing attack mitigations and
32 * serves as the reference C implementation for the x86[_64] assembler.
33 *
34 * <appro@fy.chalmers.se>
35 */
36
37
38#ifndef AES_DEBUG
39# ifndef NDEBUG
40# define NDEBUG
41# endif
42#endif
43#include <assert.h>
44
45#include <stdlib.h>
46#include <openssl/aes.h>
47#include "aes_locl.h"
48
49/*
50 * These two parameters control which table, 256-byte or 2KB, is
51 * referenced in outer and respectively inner rounds.
52 */
53#define AES_COMPACT_IN_OUTER_ROUNDS
54#ifdef AES_COMPACT_IN_OUTER_ROUNDS
55/* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
56 * adding AES_COMPACT_IN_INNER_ROUNDS slows the benchmark down *further*
57 * by a factor of ~2. */
58# undef AES_COMPACT_IN_INNER_ROUNDS
59#endif
60
#if 1
/*
 * Read one word from every 32-byte stretch of a 256-byte table.
 * Both the loads and the final store go through volatile-qualified
 * objects, so the compiler has to perform them even though the value
 * is never used.  The table itself is not modified.
 */
static void prefetch256(const void *table)
{
	volatile unsigned long *words = (void *)table;
	volatile unsigned long sink;
	unsigned long acc = 0;
	int idx = 0;

	/* 32 is common least cache-line size */
	while (idx < 256/sizeof(words[0]))
		{
		acc ^= words[idx];
		idx += 32/sizeof(words[0]);
		}

	sink = acc;
}
#else
# define prefetch256(t)
#endif
76
77#undef GETU32
78#define GETU32(p) (*((u32*)(p)))
79
80#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
81typedef unsigned __int64 u64;
82#define U64(C) C##UI64
83#elif defined(__arch64__)
84typedef unsigned long u64;
85#define U64(C) C##UL
86#else
87typedef unsigned long long u64;
88#define U64(C) C##ULL
89#endif
90
91#undef ROTATE
92#if defined(_MSC_VER) || defined(__ICC)
93# define ROTATE(a,n) _lrotl(a,n)
94#elif defined(__GNUC__) && __GNUC__>=2
95# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
96# define ROTATE(a,n) ({ register unsigned int ret; \
97 asm ( \
98 "roll %1,%0" \
99 : "=r"(ret) \
100 : "I"(n), "0"(a) \
101 : "cc"); \
102 ret; \
103 })
104# endif
105#endif
106/*
107Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
108Te0[x] = S [x].[02, 01, 01, 03];
109Te1[x] = S [x].[03, 02, 01, 01];
110Te2[x] = S [x].[01, 03, 02, 01];
111Te3[x] = S [x].[01, 01, 03, 02];
112*/
113#define Te0 (u32)((u64*)((u8*)Te+0))
114#define Te1 (u32)((u64*)((u8*)Te+3))
115#define Te2 (u32)((u64*)((u8*)Te+2))
116#define Te3 (u32)((u64*)((u8*)Te+1))
117/*
118Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
119Td0[x] = Si[x].[0e, 09, 0d, 0b];
120Td1[x] = Si[x].[0b, 0e, 09, 0d];
121Td2[x] = Si[x].[0d, 0b, 0e, 09];
122Td3[x] = Si[x].[09, 0d, 0b, 0e];
123Td4[x] = Si[x].[01];
124*/
125#define Td0 (u32)((u64*)((u8*)Td+0))
126#define Td1 (u32)((u64*)((u8*)Td+3))
127#define Td2 (u32)((u64*)((u8*)Td+2))
128#define Td3 (u32)((u64*)((u8*)Td+1))
129
130static const u64 Te[256] = {
131 U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
132 U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
133 U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
134 U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
135 U64(0x5030306050303060), U64(0x0301010203010102),
136 U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
137 U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
138 U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
139 U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
140 U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
141 U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
142 U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
143 U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
144 U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
145 U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
146 U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
147 U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
148 U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
149 U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
150 U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
151 U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
152 U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
153 U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
154 U64(0x5331316253313162), U64(0x3f15152a3f15152a),
155 U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
156 U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
157 U64(0x2818183028181830), U64(0xa1969637a1969637),
158 U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
159 U64(0x0907070e0907070e), U64(0x3612122436121224),
160 U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
161 U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
162 U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
163 U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
164 U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
165 U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
166 U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
167 U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
168 U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
169 U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
170 U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
171 U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
172 U64(0x0000000000000000), U64(0x2cededc12cededc1),
173 U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
174 U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
175 U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
176 U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
177 U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
178 U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
179 U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
180 U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
181 U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
182 U64(0x5533336655333366), U64(0x9485851194858511),
183 U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
184 U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
185 U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
186 U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
187 U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
188 U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
189 U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
190 U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
191 U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
192 U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
193 U64(0x3010102030101020), U64(0x1affffe51affffe5),
194 U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
195 U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
196 U64(0x3513132635131326), U64(0x2fececc32fececc3),
197 U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
198 U64(0xcc444488cc444488), U64(0x3917172e3917172e),
199 U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
200 U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
201 U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
202 U64(0x2b1919322b191932), U64(0x957373e6957373e6),
203 U64(0xa06060c0a06060c0), U64(0x9881811998818119),
204 U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
205 U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
206 U64(0xab90903bab90903b), U64(0x8388880b8388880b),
207 U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
208 U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
209 U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
210 U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
211 U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
212 U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
213 U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
214 U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
215 U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
216 U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
217 U64(0xa8919139a8919139), U64(0xa4959531a4959531),
218 U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
219 U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
220 U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
221 U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
222 U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
223 U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
224 U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
225 U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
226 U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
227 U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
228 U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
229 U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
230 U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
231 U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
232 U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
233 U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
234 U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
235 U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
236 U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
237 U64(0xd8484890d8484890), U64(0x0503030605030306),
238 U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
239 U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
240 U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
241 U64(0x9186861791868617), U64(0x58c1c19958c1c199),
242 U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
243 U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
244 U64(0xb398982bb398982b), U64(0x3311112233111122),
245 U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
246 U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
247 U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
248 U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
249 U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
250 U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
251 U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
252 U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
253 U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
254 U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
255 U64(0xc3414182c3414182), U64(0xb0999929b0999929),
256 U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
257 U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
258 U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
259};
260
261static const u8 Te4[256] = {
262 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
263 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
264 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
265 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
266 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
267 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
268 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
269 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
270 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
271 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
272 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
273 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
274 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
275 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
276 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
277 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
278 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
279 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
280 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
281 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
282 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
283 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
284 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
285 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
286 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
287 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
288 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
289 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
290 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
291 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
292 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
293 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
294};
295
296static const u64 Td[256] = {
297 U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
298 U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
299 U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
300 U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
301 U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
302 U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
303 U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
304 U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
305 U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
306 U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
307 U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
308 U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
309 U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
310 U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
311 U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
312 U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
313 U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
314 U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
315 U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
316 U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
317 U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
318 U64(0x6033519760335197), U64(0x457f5362457f5362),
319 U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
320 U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
321 U64(0x5868487058684870), U64(0x19fd458f19fd458f),
322 U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
323 U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
324 U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
325 U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
326 U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
327 U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
328 U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
329 U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
330 U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
331 U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
332 U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
333 U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
334 U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
335 U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
336 U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
337 U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
338 U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
339 U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
340 U64(0x6fd406046fd40604), U64(0xff155060ff155060),
341 U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
342 U64(0xcc434089cc434089), U64(0x779ed967779ed967),
343 U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
344 U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
345 U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
346 U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
347 U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
348 U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
349 U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
350 U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
351 U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
352 U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
353 U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
354 U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
355 U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
356 U64(0x694b775a694b775a), U64(0x161a121c161a121c),
357 U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
358 U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
359 U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
360 U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
361 U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
362 U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
363 U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
364 U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
365 U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
366 U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
367 U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
368 U64(0x4022971340229713), U64(0x2011c6842011c684),
369 U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
370 U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
371 U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
372 U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
373 U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
374 U64(0xfa489411fa489411), U64(0x2264e9472264e947),
375 U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
376 U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
377 U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
378 U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
379 U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
380 U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
381 U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
382 U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
383 U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
384 U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
385 U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
386 U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
387 U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
388 U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
389 U64(0x097826cd097826cd), U64(0xf418596ef418596e),
390 U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
391 U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
392 U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
393 U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
394 U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
395 U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
396 U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
397 U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
398 U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
399 U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
400 U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
401 U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
402 U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
403 U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
404 U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
405 U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
406 U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
407 U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
408 U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
409 U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
410 U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
411 U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
412 U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
413 U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
414 U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
415 U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
416 U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
417 U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
418 U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
419 U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
420 U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
421 U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
422 U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
423 U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
424 U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
425};
/*
 * Td4: 256-entry byte substitution table indexed by a single state byte.
 * AES_decrypt() uses it for the last round and, when the AES_COMPACT_*
 * macros are defined, for the other rounds as well.
 * NOTE(review): the entries look like the standard AES inverse S-box
 * (first entry 0x52) - confirm against FIPS-197 before relying on it.
 * The element type is u8, so an entry must be widened to u32 before it
 * is shifted left by 24.
 */
426static const u8 Td4[256] = {
427 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
428 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
429 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
430 0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
431 0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
432 0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
433 0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
434 0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
435 0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
436 0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
437 0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
438 0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
439 0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
440 0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
441 0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
442 0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
443 0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
444 0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
445 0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
446 0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
447 0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
448 0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
449 0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
450 0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
451 0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
452 0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
453 0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
454 0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
455 0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
456 0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
457 0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
458 0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
459};
460
/*
 * Round constants for the key expansion.  AES_set_encrypt_key() XORs
 * rcon[i] into the first word of each newly generated group of round-key
 * words, with i counting the expansion iterations (at most 10 of them,
 * reached for 128-bit keys).
 */
461static const u32 rcon[] = {
462 0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
463 0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
464 0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
465};
466
467/**
468 * Expand the cipher key into the encryption key schedule.
469 */
470int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
471 AES_KEY *key) {
472
473 u32 *rk;
474 int i = 0;
475 u32 temp;
476
477 if (!userKey || !key)
478 return -1;
479 if (bits != 128 && bits != 192 && bits != 256)
480 return -2;
481
482 rk = key->rd_key;
483
484 if (bits==128)
485 key->rounds = 10;
486 else if (bits==192)
487 key->rounds = 12;
488 else
489 key->rounds = 14;
490
491 rk[0] = GETU32(userKey );
492 rk[1] = GETU32(userKey + 4);
493 rk[2] = GETU32(userKey + 8);
494 rk[3] = GETU32(userKey + 12);
495 if (bits == 128) {
496 while (1) {
497 temp = rk[3];
498 rk[4] = rk[0] ^
499 (Te4[(temp >> 8) & 0xff] ) ^
500 (Te4[(temp >> 16) & 0xff] << 8) ^
501 (Te4[(temp >> 24) ] << 16) ^
502 (Te4[(temp ) & 0xff] << 24) ^
503 rcon[i];
504 rk[5] = rk[1] ^ rk[4];
505 rk[6] = rk[2] ^ rk[5];
506 rk[7] = rk[3] ^ rk[6];
507 if (++i == 10) {
508 return 0;
509 }
510 rk += 4;
511 }
512 }
513 rk[4] = GETU32(userKey + 16);
514 rk[5] = GETU32(userKey + 20);
515 if (bits == 192) {
516 while (1) {
517 temp = rk[ 5];
518 rk[ 6] = rk[ 0] ^
519 (Te4[(temp >> 8) & 0xff] ) ^
520 (Te4[(temp >> 16) & 0xff] << 8) ^
521 (Te4[(temp >> 24) ] << 16) ^
522 (Te4[(temp ) & 0xff] << 24) ^
523 rcon[i];
524 rk[ 7] = rk[ 1] ^ rk[ 6];
525 rk[ 8] = rk[ 2] ^ rk[ 7];
526 rk[ 9] = rk[ 3] ^ rk[ 8];
527 if (++i == 8) {
528 return 0;
529 }
530 rk[10] = rk[ 4] ^ rk[ 9];
531 rk[11] = rk[ 5] ^ rk[10];
532 rk += 6;
533 }
534 }
535 rk[6] = GETU32(userKey + 24);
536 rk[7] = GETU32(userKey + 28);
537 if (bits == 256) {
538 while (1) {
539 temp = rk[ 7];
540 rk[ 8] = rk[ 0] ^
541 (Te4[(temp >> 8) & 0xff] ) ^
542 (Te4[(temp >> 16) & 0xff] << 8) ^
543 (Te4[(temp >> 24) ] << 16) ^
544 (Te4[(temp ) & 0xff] << 24) ^
545 rcon[i];
546 rk[ 9] = rk[ 1] ^ rk[ 8];
547 rk[10] = rk[ 2] ^ rk[ 9];
548 rk[11] = rk[ 3] ^ rk[10];
549 if (++i == 7) {
550 return 0;
551 }
552 temp = rk[11];
553 rk[12] = rk[ 4] ^
554 (Te4[(temp ) & 0xff] ) ^
555 (Te4[(temp >> 8) & 0xff] << 8) ^
556 (Te4[(temp >> 16) & 0xff] << 16) ^
557 (Te4[(temp >> 24) ] << 24);
558 rk[13] = rk[ 5] ^ rk[12];
559 rk[14] = rk[ 6] ^ rk[13];
560 rk[15] = rk[ 7] ^ rk[14];
561
562 rk += 8;
563 }
564 }
565 return 0;
566}
567
568/**
569 * Expand the cipher key into the decryption key schedule.
570 */
571int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
572 AES_KEY *key) {
573
574 u32 *rk;
575 int i, j, status;
576 u32 temp;
577
578 /* first, start with an encryption schedule */
579 status = AES_set_encrypt_key(userKey, bits, key);
580 if (status < 0)
581 return status;
582
583 rk = key->rd_key;
584
585 /* invert the order of the round keys: */
586 for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
587 temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
588 temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
589 temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
590 temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
591 }
592 /* apply the inverse MixColumn transform to all round keys but the first and the last: */
593 for (i = 1; i < (key->rounds); i++) {
594 rk += 4;
595#if 1
596 for (j = 0; j < 4; j++) {
597 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
598
599 tp1 = rk[j];
600 m = tp1 & 0x80808080;
601 tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
602 ((m - (m >> 7)) & 0x1b1b1b1b);
603 m = tp2 & 0x80808080;
604 tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
605 ((m - (m >> 7)) & 0x1b1b1b1b);
606 m = tp4 & 0x80808080;
607 tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
608 ((m - (m >> 7)) & 0x1b1b1b1b);
609 tp9 = tp8 ^ tp1;
610 tpb = tp9 ^ tp2;
611 tpd = tp9 ^ tp4;
612 tpe = tp8 ^ tp4 ^ tp2;
613#if defined(ROTATE)
614 rk[j] = tpe ^ ROTATE(tpd,16) ^
615 ROTATE(tp9,8) ^ ROTATE(tpb,24);
616#else
617 rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
618 (tp9 >> 24) ^ (tp9 << 8) ^
619 (tpb >> 8) ^ (tpb << 24);
620#endif
621 }
622#else
623 rk[0] =
624 Td0[Te2[(rk[0] ) & 0xff] & 0xff] ^
625 Td1[Te2[(rk[0] >> 8) & 0xff] & 0xff] ^
626 Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
627 Td3[Te2[(rk[0] >> 24) ] & 0xff];
628 rk[1] =
629 Td0[Te2[(rk[1] ) & 0xff] & 0xff] ^
630 Td1[Te2[(rk[1] >> 8) & 0xff] & 0xff] ^
631 Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
632 Td3[Te2[(rk[1] >> 24) ] & 0xff];
633 rk[2] =
634 Td0[Te2[(rk[2] ) & 0xff] & 0xff] ^
635 Td1[Te2[(rk[2] >> 8) & 0xff] & 0xff] ^
636 Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
637 Td3[Te2[(rk[2] >> 24) ] & 0xff];
638 rk[3] =
639 Td0[Te2[(rk[3] ) & 0xff] & 0xff] ^
640 Td1[Te2[(rk[3] >> 8) & 0xff] & 0xff] ^
641 Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
642 Td3[Te2[(rk[3] >> 24) ] & 0xff];
643#endif
644 }
645 return 0;
646}
647
648/*
649 * Encrypt a single block
650 * in and out can overlap
651 */
652void AES_encrypt(const unsigned char *in, unsigned char *out,
653 const AES_KEY *key) {
654
655 const u32 *rk;
656 u32 s0, s1, s2, s3, t[4];
657 int r;
658
659 assert(in && out && key);
660 rk = key->rd_key;
661
662 /*
663 * map byte array block to cipher state
664 * and add initial round key:
665 */
666 s0 = GETU32(in ) ^ rk[0];
667 s1 = GETU32(in + 4) ^ rk[1];
668 s2 = GETU32(in + 8) ^ rk[2];
669 s3 = GETU32(in + 12) ^ rk[3];
670
671#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
672 prefetch256(Te4);
673
674 t[0] = Te4[(s0 ) & 0xff] ^
675 Te4[(s1 >> 8) & 0xff] << 8 ^
676 Te4[(s2 >> 16) & 0xff] << 16 ^
677 Te4[(s3 >> 24) ] << 24;
678 t[1] = Te4[(s1 ) & 0xff] ^
679 Te4[(s2 >> 8) & 0xff] << 8 ^
680 Te4[(s3 >> 16) & 0xff] << 16 ^
681 Te4[(s0 >> 24) ] << 24;
682 t[2] = Te4[(s2 ) & 0xff] ^
683 Te4[(s3 >> 8) & 0xff] << 8 ^
684 Te4[(s0 >> 16) & 0xff] << 16 ^
685 Te4[(s1 >> 24) ] << 24;
686 t[3] = Te4[(s3 ) & 0xff] ^
687 Te4[(s0 >> 8) & 0xff] << 8 ^
688 Te4[(s1 >> 16) & 0xff] << 16 ^
689 Te4[(s2 >> 24) ] << 24;
690
691 /* now do the linear transform using words */
692 { int i;
693 u32 r0, r1, r2;
694
695 for (i = 0; i < 4; i++) {
696 r0 = t[i];
697 r1 = r0 & 0x80808080;
698 r2 = ((r0 & 0x7f7f7f7f) << 1) ^
699 ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
700#if defined(ROTATE)
701 t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
702 ROTATE(r0,16) ^ ROTATE(r0,8);
703#else
704 t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
705 (r0 << 16) ^ (r0 >> 16) ^
706 (r0 << 8) ^ (r0 >> 24);
707#endif
708 t[i] ^= rk[4+i];
709 }
710 }
711#else
712 t[0] = Te0[(s0 ) & 0xff] ^
713 Te1[(s1 >> 8) & 0xff] ^
714 Te2[(s2 >> 16) & 0xff] ^
715 Te3[(s3 >> 24) ] ^
716 rk[4];
717 t[1] = Te0[(s1 ) & 0xff] ^
718 Te1[(s2 >> 8) & 0xff] ^
719 Te2[(s3 >> 16) & 0xff] ^
720 Te3[(s0 >> 24) ] ^
721 rk[5];
722 t[2] = Te0[(s2 ) & 0xff] ^
723 Te1[(s3 >> 8) & 0xff] ^
724 Te2[(s0 >> 16) & 0xff] ^
725 Te3[(s1 >> 24) ] ^
726 rk[6];
727 t[3] = Te0[(s3 ) & 0xff] ^
728 Te1[(s0 >> 8) & 0xff] ^
729 Te2[(s1 >> 16) & 0xff] ^
730 Te3[(s2 >> 24) ] ^
731 rk[7];
732#endif
733 s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
734
735 /*
736 * Nr - 2 full rounds:
737 */
738 for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
739#if defined(AES_COMPACT_IN_INNER_ROUNDS)
740 t[0] = Te4[(s0 ) & 0xff] ^
741 Te4[(s1 >> 8) & 0xff] << 8 ^
742 Te4[(s2 >> 16) & 0xff] << 16 ^
743 Te4[(s3 >> 24) ] << 24;
744 t[1] = Te4[(s1 ) & 0xff] ^
745 Te4[(s2 >> 8) & 0xff] << 8 ^
746 Te4[(s3 >> 16) & 0xff] << 16 ^
747 Te4[(s0 >> 24) ] << 24;
748 t[2] = Te4[(s2 ) & 0xff] ^
749 Te4[(s3 >> 8) & 0xff] << 8 ^
750 Te4[(s0 >> 16) & 0xff] << 16 ^
751 Te4[(s1 >> 24) ] << 24;
752 t[3] = Te4[(s3 ) & 0xff] ^
753 Te4[(s0 >> 8) & 0xff] << 8 ^
754 Te4[(s1 >> 16) & 0xff] << 16 ^
755 Te4[(s2 >> 24) ] << 24;
756
757 /* now do the linear transform using words */
758 { int i;
759 u32 r0, r1, r2;
760
761 for (i = 0; i < 4; i++) {
762 r0 = t[i];
763 r1 = r0 & 0x80808080;
764 r2 = ((r0 & 0x7f7f7f7f) << 1) ^
765 ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
766#if defined(ROTATE)
767 t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
768 ROTATE(r0,16) ^ ROTATE(r0,8);
769#else
770 t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
771 (r0 << 16) ^ (r0 >> 16) ^
772 (r0 << 8) ^ (r0 >> 24);
773#endif
774 t[i] ^= rk[i];
775 }
776 }
777#else
778 t[0] = Te0[(s0 ) & 0xff] ^
779 Te1[(s1 >> 8) & 0xff] ^
780 Te2[(s2 >> 16) & 0xff] ^
781 Te3[(s3 >> 24) ] ^
782 rk[0];
783 t[1] = Te0[(s1 ) & 0xff] ^
784 Te1[(s2 >> 8) & 0xff] ^
785 Te2[(s3 >> 16) & 0xff] ^
786 Te3[(s0 >> 24) ] ^
787 rk[1];
788 t[2] = Te0[(s2 ) & 0xff] ^
789 Te1[(s3 >> 8) & 0xff] ^
790 Te2[(s0 >> 16) & 0xff] ^
791 Te3[(s1 >> 24) ] ^
792 rk[2];
793 t[3] = Te0[(s3 ) & 0xff] ^
794 Te1[(s0 >> 8) & 0xff] ^
795 Te2[(s1 >> 16) & 0xff] ^
796 Te3[(s2 >> 24) ] ^
797 rk[3];
798#endif
799 s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
800 }
801 /*
802 * apply last round and
803 * map cipher state to byte array block:
804 */
805#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
806 prefetch256(Te4);
807
808 *(u32*)(out+0) =
809 Te4[(s0 ) & 0xff] ^
810 Te4[(s1 >> 8) & 0xff] << 8 ^
811 Te4[(s2 >> 16) & 0xff] << 16 ^
812 Te4[(s3 >> 24) ] << 24 ^
813 rk[0];
814 *(u32*)(out+4) =
815 Te4[(s1 ) & 0xff] ^
816 Te4[(s2 >> 8) & 0xff] << 8 ^
817 Te4[(s3 >> 16) & 0xff] << 16 ^
818 Te4[(s0 >> 24) ] << 24 ^
819 rk[1];
820 *(u32*)(out+8) =
821 Te4[(s2 ) & 0xff] ^
822 Te4[(s3 >> 8) & 0xff] << 8 ^
823 Te4[(s0 >> 16) & 0xff] << 16 ^
824 Te4[(s1 >> 24) ] << 24 ^
825 rk[2];
826 *(u32*)(out+12) =
827 Te4[(s3 ) & 0xff] ^
828 Te4[(s0 >> 8) & 0xff] << 8 ^
829 Te4[(s1 >> 16) & 0xff] << 16 ^
830 Te4[(s2 >> 24) ] << 24 ^
831 rk[3];
832#else
833 *(u32*)(out+0) =
834 (Te2[(s0 ) & 0xff] & 0x000000ffU) ^
835 (Te3[(s1 >> 8) & 0xff] & 0x0000ff00U) ^
836 (Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
837 (Te1[(s3 >> 24) ] & 0xff000000U) ^
838 rk[0];
839 *(u32*)(out+4) =
840 (Te2[(s1 ) & 0xff] & 0x000000ffU) ^
841 (Te3[(s2 >> 8) & 0xff] & 0x0000ff00U) ^
842 (Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
843 (Te1[(s0 >> 24) ] & 0xff000000U) ^
844 rk[1];
845 *(u32*)(out+8) =
846 (Te2[(s2 ) & 0xff] & 0x000000ffU) ^
847 (Te3[(s3 >> 8) & 0xff] & 0x0000ff00U) ^
848 (Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
849 (Te1[(s1 >> 24) ] & 0xff000000U) ^
850 rk[2];
851 *(u32*)(out+12) =
852 (Te2[(s3 ) & 0xff] & 0x000000ffU) ^
853 (Te3[(s0 >> 8) & 0xff] & 0x0000ff00U) ^
854 (Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
855 (Te1[(s2 >> 24) ] & 0xff000000U) ^
856 rk[3];
857#endif
858}
859
860/*
861 * Decrypt a single block
862 * in and out can overlap
863 */
864void AES_decrypt(const unsigned char *in, unsigned char *out,
865 const AES_KEY *key) {
866
867 const u32 *rk;
868 u32 s0, s1, s2, s3, t[4];
869 int r;
870
871 assert(in && out && key);
872 rk = key->rd_key;
873
874 /*
875 * map byte array block to cipher state
876 * and add initial round key:
877 */
878 s0 = GETU32(in ) ^ rk[0];
879 s1 = GETU32(in + 4) ^ rk[1];
880 s2 = GETU32(in + 8) ^ rk[2];
881 s3 = GETU32(in + 12) ^ rk[3];
882
883#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
884 prefetch256(Td4);
885
886 t[0] = Td4[(s0 ) & 0xff] ^
887 Td4[(s3 >> 8) & 0xff] << 8 ^
888 Td4[(s2 >> 16) & 0xff] << 16 ^
889 Td4[(s1 >> 24) ] << 24;
890 t[1] = Td4[(s1 ) & 0xff] ^
891 Td4[(s0 >> 8) & 0xff] << 8 ^
892 Td4[(s3 >> 16) & 0xff] << 16 ^
893 Td4[(s2 >> 24) ] << 24;
894 t[2] = Td4[(s2 ) & 0xff] ^
895 Td4[(s1 >> 8) & 0xff] << 8 ^
896 Td4[(s0 >> 16) & 0xff] << 16 ^
897 Td4[(s3 >> 24) ] << 24;
898 t[3] = Td4[(s3 ) & 0xff] ^
899 Td4[(s2 >> 8) & 0xff] << 8 ^
900 Td4[(s1 >> 16) & 0xff] << 16 ^
901 Td4[(s0 >> 24) ] << 24;
902
903 /* now do the linear transform using words */
904 { int i;
905 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
906
907 for (i = 0; i < 4; i++) {
908 tp1 = t[i];
909 m = tp1 & 0x80808080;
910 tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
911 ((m - (m >> 7)) & 0x1b1b1b1b);
912 m = tp2 & 0x80808080;
913 tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
914 ((m - (m >> 7)) & 0x1b1b1b1b);
915 m = tp4 & 0x80808080;
916 tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
917 ((m - (m >> 7)) & 0x1b1b1b1b);
918 tp9 = tp8 ^ tp1;
919 tpb = tp9 ^ tp2;
920 tpd = tp9 ^ tp4;
921 tpe = tp8 ^ tp4 ^ tp2;
922#if defined(ROTATE)
923 t[i] = tpe ^ ROTATE(tpd,16) ^
924 ROTATE(tp9,8) ^ ROTATE(tpb,24);
925#else
926 t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
927 (tp9 >> 24) ^ (tp9 << 8) ^
928 (tpb >> 8) ^ (tpb << 24);
929#endif
930 t[i] ^= rk[4+i];
931 }
932 }
933#else
934 t[0] = Td0[(s0 ) & 0xff] ^
935 Td1[(s3 >> 8) & 0xff] ^
936 Td2[(s2 >> 16) & 0xff] ^
937 Td3[(s1 >> 24) ] ^
938 rk[4];
939 t[1] = Td0[(s1 ) & 0xff] ^
940 Td1[(s0 >> 8) & 0xff] ^
941 Td2[(s3 >> 16) & 0xff] ^
942 Td3[(s2 >> 24) ] ^
943 rk[5];
944 t[2] = Td0[(s2 ) & 0xff] ^
945 Td1[(s1 >> 8) & 0xff] ^
946 Td2[(s0 >> 16) & 0xff] ^
947 Td3[(s3 >> 24) ] ^
948 rk[6];
949 t[3] = Td0[(s3 ) & 0xff] ^
950 Td1[(s2 >> 8) & 0xff] ^
951 Td2[(s1 >> 16) & 0xff] ^
952 Td3[(s0 >> 24) ] ^
953 rk[7];
954#endif
955 s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
956
957 /*
958 * Nr - 2 full rounds:
959 */
960 for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
961#if defined(AES_COMPACT_IN_INNER_ROUNDS)
962 t[0] = Td4[(s0 ) & 0xff] ^
963 Td4[(s3 >> 8) & 0xff] << 8 ^
964 Td4[(s2 >> 16) & 0xff] << 16 ^
965 Td4[(s1 >> 24) ] << 24;
966 t[1] = Td4[(s1 ) & 0xff] ^
967 Td4[(s0 >> 8) & 0xff] << 8 ^
968 Td4[(s3 >> 16) & 0xff] << 16 ^
969 Td4[(s2 >> 24) ] << 24;
970 t[2] = Td4[(s2 ) & 0xff] ^
971 Td4[(s1 >> 8) & 0xff] << 8 ^
972 Td4[(s0 >> 16) & 0xff] << 16 ^
973 Td4[(s3 >> 24) ] << 24;
974 t[3] = Td4[(s3 ) & 0xff] ^
975 Td4[(s2 >> 8) & 0xff] << 8 ^
976 Td4[(s1 >> 16) & 0xff] << 16 ^
977 Td4[(s0 >> 24) ] << 24;
978
979 /* now do the linear transform using words */
980 { int i;
981 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
982
983 for (i = 0; i < 4; i++) {
984 tp1 = t[i];
985 m = tp1 & 0x80808080;
986 tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
987 ((m - (m >> 7)) & 0x1b1b1b1b);
988 m = tp2 & 0x80808080;
989 tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
990 ((m - (m >> 7)) & 0x1b1b1b1b);
991 m = tp4 & 0x80808080;
992 tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
993 ((m - (m >> 7)) & 0x1b1b1b1b);
994 tp9 = tp8 ^ tp1;
995 tpb = tp9 ^ tp2;
996 tpd = tp9 ^ tp4;
997 tpe = tp8 ^ tp4 ^ tp2;
998#if defined(ROTATE)
999 t[i] = tpe ^ ROTATE(tpd,16) ^
1000 ROTATE(tp9,8) ^ ROTATE(tpb,24);
1001#else
1002 t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
1003 (tp9 >> 24) ^ (tp9 << 8) ^
1004 (tpb >> 8) ^ (tpb << 24);
1005#endif
1006 t[i] ^= rk[i];
1007 }
1008 }
1009#else
1010 t[0] = Td0[(s0 ) & 0xff] ^
1011 Td1[(s3 >> 8) & 0xff] ^
1012 Td2[(s2 >> 16) & 0xff] ^
1013 Td3[(s1 >> 24) ] ^
1014 rk[0];
1015 t[1] = Td0[(s1 ) & 0xff] ^
1016 Td1[(s0 >> 8) & 0xff] ^
1017 Td2[(s3 >> 16) & 0xff] ^
1018 Td3[(s2 >> 24) ] ^
1019 rk[1];
1020 t[2] = Td0[(s2 ) & 0xff] ^
1021 Td1[(s1 >> 8) & 0xff] ^
1022 Td2[(s0 >> 16) & 0xff] ^
1023 Td3[(s3 >> 24) ] ^
1024 rk[2];
1025 t[3] = Td0[(s3 ) & 0xff] ^
1026 Td1[(s2 >> 8) & 0xff] ^
1027 Td2[(s1 >> 16) & 0xff] ^
1028 Td3[(s0 >> 24) ] ^
1029 rk[3];
1030#endif
1031 s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
1032 }
1033 /*
1034 * apply last round and
1035 * map cipher state to byte array block:
1036 */
1037 prefetch256(Td4);
1038
1039 *(u32*)(out+0) =
1040 (Td4[(s0 ) & 0xff]) ^
1041 (Td4[(s3 >> 8) & 0xff] << 8) ^
1042 (Td4[(s2 >> 16) & 0xff] << 16) ^
1043 (Td4[(s1 >> 24) ] << 24) ^
1044 rk[0];
1045 *(u32*)(out+4) =
1046 (Td4[(s1 ) & 0xff]) ^
1047 (Td4[(s0 >> 8) & 0xff] << 8) ^
1048 (Td4[(s3 >> 16) & 0xff] << 16) ^
1049 (Td4[(s2 >> 24) ] << 24) ^
1050 rk[1];
1051 *(u32*)(out+8) =
1052 (Td4[(s2 ) & 0xff]) ^
1053 (Td4[(s1 >> 8) & 0xff] << 8) ^
1054 (Td4[(s0 >> 16) & 0xff] << 16) ^
1055 (Td4[(s3 >> 24) ] << 24) ^
1056 rk[2];
1057 *(u32*)(out+12) =
1058 (Td4[(s3 ) & 0xff]) ^
1059 (Td4[(s2 >> 8) & 0xff] << 8) ^
1060 (Td4[(s1 >> 16) & 0xff] << 16) ^
1061 (Td4[(s0 >> 24) ] << 24) ^
1062 rk[3];
1063}
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl
deleted file mode 100644
index aab40e6f1c..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-586.pl
+++ /dev/null
@@ -1,2980 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Version 4.3.
11#
12# You might fail to appreciate this module performance from the first
13# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
14# to be *the* best Intel C compiler without -KPIC, performance appears
15# to be virtually identical... But try to re-configure with shared
16# library support... Aha! Intel compiler "suddenly" lags behind by 30%
17# [on P4, more on others]:-) And if compared to position-independent
18# code generated by GNU C, this code performs *more* than *twice* as
19# fast! Yes, all this buzz about PIC means that unlike other hand-
20# coded implementations, this one was explicitly designed to be safe
21# to use even in shared library context... This also means that this
22# code isn't necessarily absolutely fastest "ever," because in order
23# to achieve position independence an extra register has to be
24# off-loaded to stack, which affects the benchmark result.
25#
26# Special note about instruction choice. Do you recall RC4_INT code
27# performing poorly on P4? It might be the time to figure out why.
28# RC4_INT code implies effective address calculations in base+offset*4
29# form. Trouble is that it seems that offset scaling turned to be
30# critical path... At least eliminating scaling resulted in 2.8x RC4
31# performance improvement [as you might recall]. As AES code is hungry
32# for scaling too, I [try to] avoid the latter by favoring off-by-2
33# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
34#
35# As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
36# void. Performance improvement with off-by-2 shifts was observed on
37# intermediate implementation, which was spilling yet another register
38# to stack... Final offset*4 code below runs just a tad faster on P4,
39# but exhibits up to 10% improvement on other cores.
40#
41# Second version is "monolithic" replacement for aes_core.c, which in
42# addition to AES_[de|en]crypt implements AES_set_[de|en]crypt_key.
43# This made it possible to implement little-endian variant of the
44# algorithm without modifying the base C code. Motivating factor for
45# the undertaken effort was that it appeared that in tight IA-32
46# register window little-endian flavor could achieve slightly higher
47# Instruction Level Parallelism, and it indeed resulted in up to 15%
48# better performance on most recent µ-archs...
49#
50# Third version adds AES_cbc_encrypt implementation, which resulted in
51# up to 40% performance improvement of CBC benchmark results. 40% was
52# observed on P4 core, where "overall" improvement coefficient, i.e. if
53# compared to PIC generated by GCC and in CBC mode, was observed to be
54# as large as 4x:-) CBC performance is virtually identical to ECB now
55# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
56# Opteron, because certain function prologues and epilogues are
57# effectively taken out of the loop...
58#
59# Version 3.2 implements compressed tables and prefetch of these tables
60# in CBC[!] mode. Former means that 3/4 of table references are now
61# misaligned, which unfortunately has negative impact on elder IA-32
62# implementations, Pentium suffered 30% penalty, PIII - 10%.
63#
64# Version 3.3 avoids L1 cache aliasing between stack frame and
65# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
66# latter is achieved by copying the key schedule to controlled place in
67# stack. This unfortunately has rather strong impact on small block CBC
68# performance, ~2x deterioration on 16-byte block if compared to 3.3.
69#
70# Version 3.5 checks if there is L1 cache aliasing between user-supplied
71# key schedule and S-boxes and abstains from copying the former if
72# there is no. This allows end-user to consciously retain small block
73# performance by aligning key schedule in specific manner.
74#
75# Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
76#
77# Current ECB performance numbers for 128-bit key in CPU cycles per
78# processed byte [measure commonly used by AES benchmarkers] are:
79#
80# small footprint fully unrolled
81# P4 24 22
82# AMD K8 20 19
83# PIII 25 23
84# Pentium 81 78
85#
86# Version 3.7 reimplements outer rounds as "compact." Meaning that
87# first and last rounds reference compact 256 bytes S-box. This means
88# that first round consumes a lot more CPU cycles and that encrypt
89# and decrypt performance becomes asymmetric. Encrypt performance
90# drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
91# aggressively pre-fetched.
92#
93# Version 4.0 effectively rolls back to 3.6 and instead implements
94# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
95# which use exclusively 256 byte S-box. These functions are to be
96# called in modes not concealing plain text, such as ECB, or when
97# we're asked to process smaller amount of data [or unconditionally
98# on hyper-threading CPU]. Currently it's called unconditionally from
99# AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
100# still needs to be modified to switch between slower and faster
101# mode when appropriate... But in either case benchmark landscape
102# changes dramatically and below numbers are CPU cycles per processed
103# byte for 128-bit key.
104#
105# ECB encrypt ECB decrypt CBC large chunk
106# P4 56[60] 84[100] 23
107# AMD K8 48[44] 70[79] 18
108# PIII 41[50] 61[91] 24
109# Core 2 32[38] 45[70] 18.5
110# Pentium 120 160 77
111#
112# Version 4.1 switches to compact S-box even in key schedule setup.
113#
114# Version 4.2 prefetches compact S-box in every SSE round or in other
115# words every cache-line is *guaranteed* to be accessed within ~50
116# cycles window. Why just SSE? Because it's needed on hyper-threading
117# CPU! Which is also why it's prefetched with 64 byte stride. Best
118# part is that it has no negative effect on performance:-)
119#
120# Version 4.3 implements switch between compact and non-compact block
121# functions in AES_cbc_encrypt depending on how much data was asked
122# to be processed in one stroke.
123#
124######################################################################
125# Timing attacks are classified in two classes: synchronous when
126# attacker consciously initiates cryptographic operation and collects
127# timing data of various character afterwards, and asynchronous when
128# malicious code is executed on same CPU simultaneously with AES,
129# instruments itself and performs statistical analysis of this data.
130#
131# As far as synchronous attacks go the root to the AES timing
132# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
133# are referred to in single 128-bit block operation. Well, in C
134# implementation with 4 distinct tables it's actually as little as 40
135# references per 256 elements table, but anyway... Secondly, even
136# though S-box elements are clustered into smaller amount of cache-
137# lines, smaller than 160 and even 40, it turned out that for certain
138# plain-text pattern[s] or simply put chosen plain-text and given key
139# few cache-lines remain unaccessed during block operation. Now, if
140# attacker can figure out this access pattern, he can deduce the key
141# [or at least part of it]. The natural way to mitigate this kind of
142# attacks is to minimize the amount of cache-lines in S-box and/or
143# prefetch them to ensure that every one is accessed for more uniform
144# timing. But note that *if* plain-text was concealed in such way that
145# input to block function is distributed *uniformly*, then attack
146# wouldn't apply. Now note that some encryption modes, most notably
147# CBC, do mask the plain-text in this exact way [secure cipher output
148# is distributed uniformly]. Yes, one still might find input that
149# would reveal the information about given key, but if amount of
150# candidate inputs to be tried is larger than amount of possible key
151# combinations then attack becomes infeasible. This is why revised
152# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
153# of data is to be processed in one stroke. The current size limit of
154# 512 bytes is chosen to provide same [diminishingly low] probability
155# for cache-line to remain untouched in large chunk operation with
156# large S-box as for single block operation with compact S-box and
157# surely needs more careful consideration...
158#
159# As for asynchronous attacks. There are two flavours: attacker code
160# being interleaved with AES on hyper-threading CPU at *instruction*
161# level, and two processes time sharing single core. As for latter.
162# Two vectors. 1. Given that attacker process has higher priority,
163# yield execution to process performing AES just before timer fires
164# off the scheduler, immediately regain control of CPU and analyze the
165# cache state. For this attack to be efficient attacker would have to
166# effectively slow down the operation by several *orders* of magnitude,
167# by ratio of time slice to duration of handful of AES rounds, which
168# is unlikely to remain unnoticed. Not to mention that this also means
169# that he would spend correspondingly more time to collect enough
170# statistical data to mount the attack. It's probably appropriate to
171# say that if adversary reckons that this attack is beneficial and
172# risks being noticed, you probably have larger problems in giving him
173# the mere opportunity. In other words suggested code design expects you
174# to preclude/mitigate this attack by overall system security design.
175# 2. Attacker manages to make his code interrupt driven. In order for
176# this kind of attack to be feasible, interrupt rate has to be high
177# enough, again comparable to duration of handful of AES rounds. But
178# is there interrupt source of such rate? Hardly, not even 1Gbps NIC
179# generates interrupts at such raging rate...
180#
181# And now back to the former, hyper-threading CPU or more specifically
182# Intel P4. Recall that asynchronous attack implies that malicious
183# code instruments itself. And naturally instrumentation granularity
184# has to be noticeably lower than duration of codepath accessing S-box.
185# Given that all cache-lines are accessed during that time that is.
186# Current implementation accesses *all* cache-lines within ~50 cycles
187# window, which is actually *less* than RDTSC latency on Intel P4!
188
# Take the directory part of $0 (everything up to the last "/" or "\")
# and add it, together with its ../../perlasm sibling, to @INC so that
# the require below can locate x86asm.pl.
189$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
190push(@INC,"${dir}","${dir}../../perlasm");
191require "x86asm.pl";
192
# asm_init receives the first command-line argument, this file's name and
# a flag that is true when the last command-line argument is the string
# "386"; the same flag is kept in $x86only.
193&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
# NOTE(review): static_label comes from x86asm.pl; presumably it marks the
# two table labels as local to the generated module - confirm there.
194&static_label("AES_Te");
195&static_label("AES_Td");
196
# Symbolic names for the 32-bit x86 registers used by the generator.
# NOTE(review): $s0..$s3 presumably carry the four state words and $tbl
# the table base; encvert() additionally borrows $acc and $key as scratch
# ($v0/$v1) - confirm against the routines further down.
197$s0="eax";
198$s1="ebx";
199$s2="ecx";
200$s3="edx";
201$key="edi";
202$acc="esi";
203$tbl="ebp";
204
205# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
206# by caller
# Each variable below is a DWP() operand at a fixed offset from %esp.
207$__ra=&DWP(0,"esp"); # return address
208$__s0=&DWP(4,"esp"); # s0 backing store
209$__s1=&DWP(8,"esp"); # s1 backing store
210$__s2=&DWP(12,"esp"); # s2 backing store
211$__s3=&DWP(16,"esp"); # s3 backing store
212$__key=&DWP(20,"esp"); # pointer to key schedule
213$__end=&DWP(24,"esp"); # pointer to end of key schedule
214$__tbl=&DWP(28,"esp"); # %ebp backing store
215
216# stack frame layout in AES_[en|de]crypt routines, which differs from
217# above by 4 and overlaps by %ebp backing store
# i.e. $_tbl at offset 24 here is the slot addressed as $__tbl (offset 28)
# above.  NOTE(review): $_esp presumably holds the saved stack pointer -
# confirm in the AES_[en|de]crypt bodies.
218$_tbl=&DWP(24,"esp");
219$_esp=&DWP(28,"esp");
220
# Hand every argument to data_word() twice in a row (one call per
# argument, with the value duplicated); processing stops at the first
# undefined argument.
sub _data_word()
{   my $word;

    while (defined($word = shift)) {
	&data_word($word, $word);
    }
}
222
223$speed_limit=512; # chunks smaller than $speed_limit are
224 # processed with compact routine in CBC mode
225$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
226 # recent µ-archs], but ~5 times smaller!
227 # I favor compact code to minimize cache
228 # contention and in hope to "collect" 5% back
229 # in real-life applications...
230
231$vertical_spin=0; # shift "vertically" defaults to 0, because of
232 # its proof-of-concept status...
233# Note that there is no decvert(), as well as last encryption round is
234# performed with "horizontal" shifts. This is because this "vertical"
235# implementation [one which groups shifts on a given $s[i] to form a
236# "column," unlike "horizontal" one, which groups shifts on different
237# $s[i] to form a "row"] is work in progress. It was observed to run
238# few percents faster on Intel cores, but not AMD. On AMD K8 core it's
239# whole 12% slower:-( So we face a trade-off... Shall it be resolved
240# some day? Till then the code is considered experimental and by
241# default remains dormant...
242
# Emit one table-driven encryption round with "vertical" shifts: all four
# bytes of one state word are consumed before the next word is touched.
#   $te - register holding the table base (callers pass $tbl)
#   @s  - the four state registers in s0..s3 order
# s2 and s1 are parked at 4(%esp) and 8(%esp) and reloaded part-way
# through.  $acc and $key are used as scratch; $key is reloaded from
# $__key just before the final xor.
sub encvert()
{ my ($te,@s) = @_;
  # Both scratch names have to be declared in one parenthesised list.
  # The former "my $v0 = $acc, $v1 = $key;" declared only $v0; the comma
  # operator turned the second half into an assignment to a package
  # variable $v1.
  my ($v0,$v1) = ($acc,$key);

	&mov	($v0,$s[3]);				# copy s3
	&mov	(&DWP(4,"esp"),$s[2]);			# save s2
	&mov	($v1,$s[0]);				# copy s0
	&mov	(&DWP(8,"esp"),$s[1]);			# save s1

	&movz	($s[2],&HB($s[0]));
	&and	($s[0],0xFF);
	&mov	($s[0],&DWP(0,$te,$s[0],8));		# s0>>0
	&shr	($v1,16);
	&mov	($s[3],&DWP(3,$te,$s[2],8));		# s0>>8
	&movz	($s[1],&HB($v1));
	&and	($v1,0xFF);
	&mov	($s[2],&DWP(2,$te,$v1,8));		# s0>>16
	 &mov	($v1,$v0);
	&mov	($s[1],&DWP(1,$te,$s[1],8));		# s0>>24

	&and	($v0,0xFF);
	&xor	($s[3],&DWP(0,$te,$v0,8));		# s3>>0
	&movz	($v0,&HB($v1));
	&shr	($v1,16);
	&xor	($s[2],&DWP(3,$te,$v0,8));		# s3>>8
	&movz	($v0,&HB($v1));
	&and	($v1,0xFF);
	&xor	($s[1],&DWP(2,$te,$v1,8));		# s3>>16
	 &mov	($v1,&DWP(4,"esp"));			# restore s2
	&xor	($s[0],&DWP(1,$te,$v0,8));		# s3>>24

	&mov	($v0,$v1);
	&and	($v1,0xFF);
	&xor	($s[2],&DWP(0,$te,$v1,8));		# s2>>0
	&movz	($v1,&HB($v0));
	&shr	($v0,16);
	&xor	($s[1],&DWP(3,$te,$v1,8));		# s2>>8
	&movz	($v1,&HB($v0));
	&and	($v0,0xFF);
	&xor	($s[0],&DWP(2,$te,$v0,8));		# s2>>16
	 &mov	($v0,&DWP(8,"esp"));			# restore s1
	&xor	($s[3],&DWP(1,$te,$v1,8));		# s2>>24

	&mov	($v1,$v0);
	&and	($v0,0xFF);
	&xor	($s[1],&DWP(0,$te,$v0,8));		# s1>>0
	&movz	($v0,&HB($v1));
	&shr	($v1,16);
	&xor	($s[0],&DWP(3,$te,$v0,8));		# s1>>8
	&movz	($v0,&HB($v1));
	&and	($v1,0xFF);
	&xor	($s[3],&DWP(2,$te,$v1,8));		# s1>>16
	 &mov	($key,$__key);				# reincarnate v1 as key
	&xor	($s[2],&DWP(1,$te,$v0,8));		# s1>>24
}
298
299# Another experimental routine, which features "horizontal spin," but
300# eliminates one reference to stack. Strangely enough runs slower...
# Experimental "horizontal spin" round that keeps all but one
# intermediate word in registers (only t[0] goes through $__s0).
#   $te (optional) - register holding the table base
# The state lives in the file-level registers $s0..$s3; $key and $acc
# are used as scratch and $key is reloaded from $__key near the end.
sub enchoriz()
{ # The original body referenced $te without ever unpacking it from @_.
  # Accept it as an optional argument; otherwise keep using a package
  # $te if one is defined (the right-hand $te below is still the outer
  # variable), and fall back to $tbl, which the other round generators
  # are called with.
  # NOTE(review): no assignment to a package $te is visible in this part
  # of the file -- confirm against the rest of the file.
  my $te = @_ ? $_[0] : (defined($te) ? $te : $tbl);
  # Parenthesised list so that *both* scratch names are lexical; the
  # former "my $v0 = $key, $v1 = $acc;" declared only $v0.
  my ($v0,$v1) = ($key,$acc);

	&movz	($v0,&LB($s0));			#  3, 2, 1, 0*
	&rotr	($s2,8);			#  8,11,10, 9
	&mov	($v1,&DWP(0,$te,$v0,8));	#  0
	&movz	($v0,&HB($s1));			#  7, 6, 5*, 4
	&rotr	($s3,16);			# 13,12,15,14
	&xor	($v1,&DWP(3,$te,$v0,8));	#  5
	&movz	($v0,&HB($s2));			#  8,11,10*, 9
	&rotr	($s0,16);			#  1, 0, 3, 2
	&xor	($v1,&DWP(2,$te,$v0,8));	# 10
	&movz	($v0,&HB($s3));			# 13,12,15*,14
	&xor	($v1,&DWP(1,$te,$v0,8));	# 15, t[0] collected
	&mov	($__s0,$v1);			# t[0] saved

	&movz	($v0,&LB($s1));			#  7, 6, 5, 4*
	&shr	($s1,16);			#  -, -, 7, 6
	&mov	($v1,&DWP(0,$te,$v0,8));	#  4
	&movz	($v0,&LB($s3));			# 13,12,15,14*
	&xor	($v1,&DWP(2,$te,$v0,8));	# 14
	&movz	($v0,&HB($s0));			#  1, 0, 3*, 2
	&and	($s3,0xffff0000);		# 13,12, -, -
	&xor	($v1,&DWP(1,$te,$v0,8));	#  3
	&movz	($v0,&LB($s2));			#  8,11,10, 9*
	&or	($s3,$s1);			# 13,12, 7, 6
	&xor	($v1,&DWP(3,$te,$v0,8));	#  9, t[1] collected
	&mov	($s1,$v1);			#  s[1]=t[1]

	&movz	($v0,&LB($s0));			#  1, 0, 3, 2*
	&shr	($s2,16);			#  -, -, 8,11
	&mov	($v1,&DWP(2,$te,$v0,8));	#  2
	&movz	($v0,&HB($s3));			# 13,12, 7*, 6
	&xor	($v1,&DWP(1,$te,$v0,8));	#  7
	&movz	($v0,&HB($s2));			#  -, -, 8*,11
	&xor	($v1,&DWP(0,$te,$v0,8));	#  8
	&mov	($v0,$s3);
	&shr	($v0,24);			# 13
	&xor	($v1,&DWP(3,$te,$v0,8));	# 13, t[2] collected

	&movz	($v0,&LB($s2));			#  -, -, 8,11*
	&shr	($s0,24);			#  1*
	&mov	($s2,&DWP(1,$te,$v0,8));	# 11
	&xor	($s2,&DWP(3,$te,$s0,8));	#  1
	&mov	($s0,$__s0);			# s[0]=t[0]
	&movz	($v0,&LB($s3));			# 13,12, 7, 6*
	&shr	($s3,16);			#   ,  ,13,12
	&xor	($s2,&DWP(2,$te,$v0,8));	#  6
	&mov	($key,$__key);			# reincarnate v0 as key
	&and	($s3,0xff);			#   ,  ,13,12*
	&mov	($s3,&DWP(0,$te,$s3,8));	# 12
	&xor	($s3,$s2);			# s[3]=t[3] collected
	&mov	($s2,$v1);			# s[2]=t[2]
}
355
356# More experimental code... SSE one... Even though this one eliminates
357# *all* references to stack, it's not faster...
# Experimental MMX/SSE round body.  State bytes arrive in eax/ebx (the
# remaining ones are fetched from mm0/mm4 via mm2/mm6 on the way), the
# result is packed into mm0 (t[0,1]) and mm4 (t[2,3]), and the round key
# is read at 16..28($key).  The byte numbers in the comments follow the
# "MMX register layout" diagram further down in this file.
sub sse_encbody()
{ # $te->($disp,$byte[,$idx]) zero-extends $byte into the index register
  # ($acc unless $idx is given) and returns the operand for the dword at
  # $disp($tbl,index,8).
  my $te = sub {
	my ($disp,$byte,$idx) = @_;
	$idx = $acc unless (defined($idx));
	&movz	($idx,$byte);
	&DWP	($disp,$tbl,$idx,8);
  };

	&mov	("ecx",$te->(0,&LB("eax")));		#  0
	&pshufw	("mm2","mm0",0x0d);			# 7, 6, 3, 2
	&mov	("edx",$te->(3,&HB("eax"),"edx"));	#  1
	&shr	("eax",16);				# 5, 4

	&xor	("ecx",$te->(2,&LB("ebx")));		# 10
	&pshufw	("mm6","mm4",0x08);			# 13,12, 9, 8
	&xor	("edx",$te->(1,&HB("ebx")));		# 11
	&shr	("ebx",16);				# 15,14

	&xor	("ecx",$te->(3,&HB("eax")));		#  5
	&movq	("mm3",&QWP(16,$key));
	&xor	("ecx",$te->(1,&HB("ebx")));		# 15
	&movd	("mm0","ecx");				# t[0] collected

	&mov	("ecx",$te->(0,&LB("eax")));		#  4
	&movd	("eax","mm2");				# 7, 6, 3, 2
	&xor	("ecx",$te->(2,&LB("ebx")));		# 14
	&movd	("ebx","mm6");				# 13,12, 9, 8

	&xor	("ecx",$te->(1,&HB("eax")));		#  3
	&xor	("ecx",$te->(3,&HB("ebx")));		#  9
	&movd	("mm1","ecx");				# t[1] collected

	&mov	("ecx",$te->(2,&LB("eax")));		#  2
	&shr	("eax",16);				# 7, 6
	&punpckldq	("mm0","mm1");			# t[0,1] collected
	&xor	("ecx",$te->(0,&LB("ebx")));		#  8
	&shr	("ebx",16);				# 13,12

	&xor	("ecx",$te->(1,&HB("eax")));		#  7
	&pxor	("mm0","mm3");
	&xor	("edx",$te->(2,&LB("eax"),"eax"));	#  6
	&pshufw	("mm1","mm0",0x08);			# 5, 4, 1, 0
	&xor	("ecx",$te->(3,&HB("ebx")));		# 13
	&xor	("ecx",&DWP(24,$key));			# t[2]
	&movd	("mm4","ecx");				# t[2] collected
	&xor	("edx",$te->(0,&LB("ebx"),"ebx"));	# 12
	&shr	("ecx",16);
	&movd	("eax","mm1");				# 5, 4, 1, 0
	&mov	("ebx",&DWP(28,$key));			# t[3]
	&xor	("ebx","edx");
	&movd	("mm5","ebx");				# t[3] collected
	&and	("ebx",0xffff0000);
	&or	("ebx","ecx");

	&punpckldq	("mm4","mm5");			# t[2,3] collected
}
424
425######################################################################
426# "Compact" block function
427######################################################################
428
# Emit one quarter of a "compact" round: build output word $i from one
# byte of each state register, using the 256-byte table addressed as
# -128($te,index,1).
#   $i   - step number 0..3
#   $te  - register holding the table pointer
#   @s   - the four state registers, rotated so that $s[0] is word $i
#   any further argument (the loop body passes a trailing 1) suppresses
#   the reload of $key from $__key in step 3
# Steps 0 and 1 store their word at 4+4*$i(%esp); step 2 leaves it in
# $acc; step 3 builds in $s[0] and pulls the other three words back in.
sub enccompact()
{ # Whether step 3 restores $key from the stack.  This used to be done
  # with "my $Fn = mov;" -- a bareword string invoked as a symbolic
  # reference -- swapped for an empty sub; a plain flag emits exactly
  # the same instructions without depending on symbolic references.
  my $reload_key = 1;
  while ($#_>5) { pop(@_); $reload_key = 0; }
  my ($i,$te,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# Skipping the reload in the first compact round leaves $key free
	# for the caller, which restores it itself after the four steps.
	if ($i==3)  {	&mov	($key,$__key) if ($reload_key);	}##%edx
	else        {	&mov	($out,$s[0]);			}
			&and	($out,0xFF);
	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
			&movz	($out,&BP(-128,$te,$out,1));

	if ($i==3)  {	$tmp=$s[1];				}##%eax
			&movz	($tmp,&HB($s[1]));
			&movz	($tmp,&BP(-128,$te,$tmp,1));
			&shl	($tmp,8);
			&xor	($out,$tmp);

	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$__s0);		}##%ebx
	else        {	&mov	($tmp,$s[2]);
			&shr	($tmp,16);			}
	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
			&and	($tmp,0xFF);
			&movz	($tmp,&BP(-128,$te,$tmp,1));
			&shl	($tmp,16);
			&xor	($out,$tmp);

	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}##%ecx
	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
	else        {	&mov	($tmp,$s[3]);
			&shr	($tmp,24);			}
			&movz	($tmp,&BP(-128,$te,$tmp,1));
			&shl	($tmp,24);
			&xor	($out,$tmp);
	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
	if ($i==3)  {	&mov	($s[3],$acc);			}
	&comment();
}
472
# Emit the in-register column transform for state word $i (0..3) that
# follows the compact byte substitution.  With r0 the input word and r2
# its byte-wise doubling (each byte shifted left by one and xor-ed with
# 0x1b when its top bit was set), the word becomes
#	r2 ^ ROTATE(r2^r0,24) ^ ROTATE(r0,16) ^ ROTATE(r0,8)
# as emitted by the rotl 24 / rotr 16 / rotr 8 sequence below.
# $acc, $tbl and $key are used as scratch here.
sub enctransform()
{ my @s = ($s0,$s1,$s2,$s3);
  my $i = shift;
  my $tmp = $tbl;
  my $r2  = $key ;

	&mov	($acc,$s[$i]);
	&and	($acc,0x80808080);		# top bit of every byte
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&lea	($r2,&DWP(0,$s[$i],$s[$i]));	# r0+r0
	&sub	($acc,$tmp);			# 0x80 -> 0x7f per flagged byte
	&and	($r2,0xfefefefe);		# drop bits carried across bytes
	&and	($acc,0x1b1b1b1b);		# reduction constant where needed
	&mov	($tmp,$s[$i]);
	&xor	($acc,$r2);	# r2

	&xor	($s[$i],$acc);	# r0 ^ r2
	&rotl	($s[$i],24);
	# This statement used to lack its terminating ';', so the following
	# "&rotr(...)" was parsed as the right operand of a bitwise '&'.
	# Both emitters still ran in order, but only by accident.
	&xor	($s[$i],$acc);	# ROTATE(r2^r0,24) ^ r2
	&rotr	($tmp,16);
	&xor	($s[$i],$tmp);
	&rotr	($tmp,8);
	&xor	($s[$i],$tmp);
}
498
# _x86_AES_encrypt_compact: compact-table block encryption.  On entry the
# state is in $s0..$s3, $key points at the key schedule and $tbl at the
# compact table; the caller has already allocated the stack frame.
&function_begin_B("_x86_AES_encrypt_compact");
{	my @state = ($s0,$s1,$s2,$s3);

	# note that caller is expected to allocate stack frame for me!
	&mov	($__key,$key);			# save key

	foreach my $w (0..3) {			# xor with key
	    &xor	($state[$w],&DWP(4*$w,$key));
	}

	&mov	($acc,&DWP(240,$key));		# load key->rounds
	&lea	($acc,&DWP(-2,$acc,$acc));	# 2*rounds-2 ...
	&lea	($acc,&DWP(0,$key,$acc,8));	# ... times 8, past $key
	&mov	($__end,$acc);			# end of key schedule

	# prefetch Te4: touch the table every 32 bytes, alternating the
	# two scratch registers
	foreach my $line (0..7) {
	    &mov	(($line&1) ? $acc : $key, &DWP(32*$line-128,$tbl));
	}

	&set_label("loop",16);

	# four compact steps on the rotating register order; the extra
	# argument keeps step 3 from reloading $key
	foreach my $step (0..3) {
	    &enccompact($step,$tbl,(@state,@state)[$step..$step+3],1);
	}
	foreach my $w (2,3,0,1) {
	    &enctransform($w);
	}
	&mov	($key,$__key);			# enctransform clobbered both
	&mov	($tbl,$__tbl);
	&add	($key,16);			# advance rd_key
	foreach my $w (0..3) {
	    &xor	($state[$w],&DWP(4*$w,$key));
	}

	&cmp	($key,$__end);
	&mov	($__key,$key);
	&jb	(&label("loop"));

	# last round: substitution only, no transform
	foreach my $step (0..3) {
	    &enccompact($step,$tbl,(@state,@state)[$step..$step+3]);
	}

	foreach my $w (0..3) {
	    &xor	($state[$w],&DWP(16+4*$w,$key));
	}

	&ret	();
}
&function_end_B("_x86_AES_encrypt_compact");
557
558######################################################################
559# "Compact" SSE block function.
560######################################################################
561#
562# Performance is not actually extraordinary in comparison to pure
563# x86 code. In particular encrypt performance is virtually the same.
564# Decrypt performance on the other hand is 15-20% better on newer
565# µ-archs [but we're thankful for *any* improvement here], and ~50%
566# better on PIII:-) And additionally on the pros side this code
567# eliminates redundant references to stack and thus relieves/
568# minimizes the pressure on the memory bus.
569#
570# MMX register layout lsb
571# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
572# | mm4 | mm0 |
573# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
574# | s3 | s2 | s1 | s0 |
575# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
576# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
577# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
578#
579# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
580# In these terms encryption and decryption "compact" permutation
581# matrices can be depicted as follows:
582#
583# encryption lsb # decryption lsb
584# +----++----+----+----+----+ # +----++----+----+----+----+
585# | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
586# +----++----+----+----+----+ # +----++----+----+----+----+
587# | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
588# +----++----+----+----+----+ # +----++----+----+----+----+
589# | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
590# +----++----+----+----+----+ # +----++----+----+----+----+
591# | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
592# +----++----+----+----+----+ # +----++----+----+----+----+
593#
594######################################################################
595# Why not xmm registers? Short answer. It was actually tested and
596# was not any faster, but on the *contrary*, most notably on Intel CPUs.
597# Longer answer. Main advantage of using mm registers is that movd
598# latency is lower, especially on Intel P4. While arithmetic
599# instructions are twice as many, they can be scheduled every cycle
600# and not every second one when they are operating on xmm register,
601# so that "arithmetic throughput" remains virtually the same. And
602# finally the code can be executed even on elder SSE-only CPUs:-)
603
# Emit one compact substitution pass over the state held in mm0/mm4.
# Bytes are pulled into eax/ebx, looked up in the byte table at
# -128($tbl,index,1) and reassembled: t[0] and t[2] are built in ecx,
# t[1] in ecx as well, t[3] in edx; the result is repacked into mm0
# (t[0,1]) and mm4 (t[2,3]).  Byte numbers in the comments follow the
# "MMX register layout" diagram above.
sub sse_enccompact()
{ # $begin->($byte): start a new word in ecx with the substituted $byte
  my $begin = sub {
	my ($byte) = @_;
	&movz	($acc,$byte);
	&movz	("ecx",&BP(-128,$tbl,$acc,1));
  };
  # $merge->($dst,$shift,$byte): substitute $byte via $acc, shift it
  # into position and OR it into $dst
  my $merge = sub {
	my ($dst,$shift,$byte) = @_;
	&movz	($acc,$byte);
	&movz	($acc,&BP(-128,$tbl,$acc,1));
	&shl	($acc,$shift);
	&or	($dst,$acc);
  };

	&pshufw	("mm1","mm0",0x08);		# 5, 4, 1, 0
	&pshufw	("mm5","mm4",0x0d);		# 15,14,11,10
	&movd	("eax","mm1");			# 5, 4, 1, 0
	&movd	("ebx","mm5");			# 15,14,11,10

	$begin->(&LB("eax"));			#  0
	&pshufw	("mm2","mm0",0x0d);		# 7, 6, 3, 2
	&movz	("edx",&HB("eax"));		#  1
	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
	&shl	("edx",8);			#  1
	&shr	("eax",16);			# 5, 4

	$merge->("ecx",16,&LB("ebx"));		# 10
	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
	$merge->("edx",24,&HB("ebx"));		# 11
	&shr	("ebx",16);			# 15,14

	$merge->("ecx",8,&HB("eax"));		#  5
	$merge->("ecx",24,&HB("ebx"));		# 15
	&movd	("mm0","ecx");			# t[0] collected

	$begin->(&LB("eax"));			#  4
	&movd	("eax","mm2");			# 7, 6, 3, 2
	$merge->("ecx",16,&LB("ebx"));		# 14

	&movd	("ebx","mm6");			# 13,12, 9, 8
	$merge->("ecx",24,&HB("eax"));		#  3
	$merge->("ecx",8,&HB("ebx"));		#  9
	&movd	("mm1","ecx");			# t[1] collected

	$begin->(&LB("ebx"));			#  8
	&shr	("ebx",16);			# 13,12
	$merge->("ecx",16,&LB("eax"));		#  2
	&shr	("eax",16);			# 7, 6

	&punpckldq	("mm0","mm1");		# t[0,1] collected

	$merge->("ecx",24,&HB("eax"));		#  7
	&and	("eax",0xff);			#  6
	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  6
	&shl	("eax",16);			#  6
	&or	("edx","eax");			#  6
	$merge->("ecx",8,&HB("ebx"));		# 13
	&movd	("mm4","ecx");			# t[2] collected
	&and	("ebx",0xff);			# 12
	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	# 12
	&or	("edx","ebx");			# 12
	&movd	("mm5","edx");			# t[3] collected

	&punpckldq	("mm4","mm5");		# t[2,3] collected
}
690
unless ($x86only) {
# _sse_AES_encrypt_compact: MMX/SSE flavour of the compact block
# function.  State is in mm0 (low 8 bytes) and mm4 (high 8 bytes), $key
# points at the key schedule and $tbl at the compact table.
&function_begin_B("_sse_AES_encrypt_compact");
	&pxor	("mm0",&QWP(0,$key));	#  7, 6, 5, 4, 3, 2, 1, 0
	&pxor	("mm4",&QWP(8,$key));	# 15,14,13,12,11,10, 9, 8

	# note that caller is expected to allocate stack frame for me!
	&mov	($acc,&DWP(240,$key));		# load key->rounds
	&lea	($acc,&DWP(-2,$acc,$acc));
	&lea	($acc,&DWP(0,$key,$acc,8));
	&mov	($__end,$acc);			# end of key schedule

	# keep the 0x1b magic constant as a quadword at 8(%esp)
	&mov	($s0,0x1b1b1b1b);
	foreach my $disp (8,12) {
	    &mov	(&DWP($disp,"esp"),$s0);
	}

	# prefetch Te4: one load every 32 bytes, cycling through s0..s3
	foreach my $line (0..7) {
	    &mov	(($s0,$s1,$s2,$s3)[$line%4],&DWP(32*$line-128,$tbl));
	}

	&set_label("loop",16);
	&sse_enccompact();
	&add	($key,16);
	&cmp	($key,$__end);
	&ja	(&label("out"));

	# Column transform on both halves; every step is emitted for mm0
	# first and then for mm4.
	&movq	("mm2",&QWP(8,"esp"));
	&pxor	("mm3","mm3");
	&pxor	("mm7","mm7");
	&movq	("mm1","mm0");			# r0
	&movq	("mm5","mm4");
	&pcmpgtb("mm3","mm0");
	&pcmpgtb("mm7","mm4");
	&pand	("mm3","mm2");
	&pand	("mm7","mm2");
	&pshufw	("mm2","mm0",0xb1);		# ROTATE(r0,16)
	&pshufw	("mm6","mm4",0xb1);
	&paddb	("mm0","mm0");
	&paddb	("mm4","mm4");
	&pxor	("mm0","mm3");			# = r2
	&pxor	("mm4","mm7");
	&pshufw	("mm3","mm2",0xb1);		# r0
	&pshufw	("mm7","mm6",0xb1);
	&pxor	("mm1","mm0");			# r0^r2
	&pxor	("mm5","mm4");
	&pxor	("mm0","mm2");			# ^= ROTATE(r0,16)
	&pxor	("mm4","mm6");

	&movq	("mm2","mm3");
	&movq	("mm6","mm7");
	&pslld	("mm3",8);
	&pslld	("mm7",8);
	&psrld	("mm2",24);
	&psrld	("mm6",24);
	&pxor	("mm0","mm3");			# ^= r0<<8
	&pxor	("mm4","mm7");
	&pxor	("mm0","mm2");			# ^= r0>>24
	&pxor	("mm4","mm6");

	&movq	("mm3","mm1");
	&movq	("mm7","mm5");
	&movq	("mm2",&QWP(0,$key));		# next round key
	&movq	("mm6",&QWP(8,$key));
	&psrld	("mm1",8);
	&psrld	("mm5",8);
	&mov	($s0,&DWP(0-128,$tbl));		# table loads interleaved
	&pslld	("mm3",24);
	&pslld	("mm7",24);
	&mov	($s1,&DWP(64-128,$tbl));
	&pxor	("mm0","mm1");			# ^= (r2^r0)<<8
	&pxor	("mm4","mm5");
	&mov	($s2,&DWP(128-128,$tbl));
	&pxor	("mm0","mm3");			# ^= (r2^r0)>>24
	&pxor	("mm4","mm7");
	&mov	($s3,&DWP(192-128,$tbl));

	&pxor	("mm0","mm2");
	&pxor	("mm4","mm6");
	&jmp	(&label("loop"));

	&set_label("out",16);
	&pxor	("mm0",&QWP(0,$key));
	&pxor	("mm4",&QWP(8,$key));

	&ret	();
&function_end_B("_sse_AES_encrypt_compact");
}
761
762######################################################################
763# Vanilla block function.
764######################################################################
765
# Emit one quarter of a table-driven round: output word $i is the xor of
# four dwords read at byte offsets 0, 3, 2 and 1 within the 8-byte table
# entries at ($te,index,8), one index byte from each state register.
#   $i  - step number 0..3
#   $te - register holding the table pointer
#   @s  - the four state registers, rotated so that $s[0] is word $i
# Steps 0 and 1 park their word at 4+4*$i(%esp); step 2 leaves it in
# $acc; step 3 builds in $s[0], reloads $key from $__key and pulls the
# three earlier words back into $s[1], $s[2] and $s[3].
sub encstep()
{ my ($i,$te,@s) = @_;
  my $final = ($i==3);
  my $idx = $key;			# scratch for table indices
  my $res = $final ? $s[0] : $acc;	# register the word is built in

	# byte 0 of s[0]; in the final step s[0] was already masked by the
	# previous step's "and" on its s[1]
	if ($final) {	&mov	($key,$__key);
	} else {	&mov	($res,$s[0]);
			&and	($res,0xFF);
	}
	&shr	($s[0],16)	if ($i==1);	# pre-shift for a later step
	&shr	($s[0],24)	if ($i==2);
	&mov	($res,&DWP(0,$te,$res,8));

	# byte 1 of s[1]
	$idx = $s[1]		if ($final);
	&movz	($idx,&HB($s[1]));
	&xor	($res,&DWP(3,$te,$idx,8));

	# byte 2 of s[2]
	if ($final) {	$idx = $s[2];
			&mov	($s[1],$__s0);
	} else {	&mov	($idx,$s[2]);
			&shr	($idx,16);
	}
	&and	($s[1],0xFF)	if ($i==2);
	&and	($idx,0xFF);
	&xor	($res,&DWP(2,$te,$idx,8));

	# byte 3 of s[3]
	if ($final) {	$idx = $s[3];
			&mov	($s[2],$__s1);
	} elsif ($i==2) {
			&movz	($idx,&HB($s[3]));
	} else {	&mov	($idx,$s[3]);
			&shr	($idx,24);
	}
	&xor	($res,&DWP(1,$te,$idx,8));

	&mov	(&DWP(4+4*$i,"esp"),$res)	if ($i<2);
	&mov	($s[3],$acc)			if ($final);
	&comment();
}
799
# Emit one quarter of the last round.  Same register choreography as
# encstep(), but each table dword is masked down to a single byte lane
# (0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000) before being combined,
# and the entries are read at byte offsets 2, 0, 0 and 2.
#   $i  - step number 0..3
#   $te - register holding the table pointer
#   @s  - the four state registers, rotated so that $s[0] is word $i
sub enclast()
{ my ($i,$te,@s)=@_;
  my $final = ($i==3);
  my $idx = $key;			# scratch for table indices
  my $res = $final ? $s[0] : $acc;	# register the word is built in

	# lane 0 from byte 0 of s[0]
	if ($final) {	&mov	($key,$__key);	}
	else {		&mov	($res,$s[0]);	}
	&and	($res,0xFF);
	&shr	($s[0],16)	if ($i==1);
	&shr	($s[0],24)	if ($i==2);
	&mov	($res,&DWP(2,$te,$res,8));
	&and	($res,0x000000ff);

	# lane 1 from byte 1 of s[1]
	$idx = $s[1]		if ($final);
	&movz	($idx,&HB($s[1]));
	&mov	($idx,&DWP(0,$te,$idx,8));
	&and	($idx,0x0000ff00);
	&xor	($res,$idx);

	# lane 2 from byte 2 of s[2]
	if ($final) {	$idx = $s[2];
			&mov	($s[1],$__s0);
	} else {	&mov	($idx,$s[2]);
			&shr	($idx,16);
	}
	&and	($s[1],0xFF)	if ($i==2);
	&and	($idx,0xFF);
	&mov	($idx,&DWP(0,$te,$idx,8));
	&and	($idx,0x00ff0000);
	&xor	($res,$idx);

	# lane 3 from byte 3 of s[3]
	if ($final) {	$idx = $s[3];
			&mov	($s[2],$__s1);
	} elsif ($i==2) {
			&movz	($idx,&HB($s[3]));
	} else {	&mov	($idx,$s[3]);
			&shr	($idx,24);
	}
	&mov	($idx,&DWP(2,$te,$idx,8));
	&and	($idx,0xff000000);
	&xor	($res,$idx);

	&mov	(&DWP(4+4*$i,"esp"),$res)	if ($i<2);
	&mov	($s[3],$acc)			if ($final);
}
838
839&function_begin_B("_x86_AES_encrypt");
840 if ($vertical_spin) {
841 # I need high parts of volatile registers to be accessible...
842 &exch ($s1="edi",$key="ebx");
843 &mov ($s2="esi",$acc="ecx");
844 }
845
846 # note that caller is expected to allocate stack frame for me!
847 &mov ($__key,$key); # save key
848
849 &xor ($s0,&DWP(0,$key)); # xor with key
850 &xor ($s1,&DWP(4,$key));
851 &xor ($s2,&DWP(8,$key));
852 &xor ($s3,&DWP(12,$key));
853
854 &mov ($acc,&DWP(240,$key)); # load key->rounds
855
856 if ($small_footprint) {
857 &lea ($acc,&DWP(-2,$acc,$acc));
858 &lea ($acc,&DWP(0,$key,$acc,8));
859 &mov ($__end,$acc); # end of key schedule
860
861 &set_label("loop",16);
862 if ($vertical_spin) {
863 &encvert($tbl,$s0,$s1,$s2,$s3);
864 } else {
865 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
866 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
867 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
868 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
869 }
870 &add ($key,16); # advance rd_key
871 &xor ($s0,&DWP(0,$key));
872 &xor ($s1,&DWP(4,$key));
873 &xor ($s2,&DWP(8,$key));
874 &xor ($s3,&DWP(12,$key));
875 &cmp ($key,$__end);
876 &mov ($__key,$key);
877 &jb (&label("loop"));
878 }
879 else {
880 &cmp ($acc,10);
881 &jle (&label("10rounds"));
882 &cmp ($acc,12);
883 &jle (&label("12rounds"));
884
885 &set_label("14rounds",4);
886 for ($i=1;$i<3;$i++) {
887 if ($vertical_spin) {
888 &encvert($tbl,$s0,$s1,$s2,$s3);
889 } else {
890 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
891 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
892 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
893 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
894 }
895 &xor ($s0,&DWP(16*$i+0,$key));
896 &xor ($s1,&DWP(16*$i+4,$key));
897 &xor ($s2,&DWP(16*$i+8,$key));
898 &xor ($s3,&DWP(16*$i+12,$key));
899 }
900 &add ($key,32);
901 &mov ($__key,$key); # advance rd_key
902 &set_label("12rounds",4);
903 for ($i=1;$i<3;$i++) {
904 if ($vertical_spin) {
905 &encvert($tbl,$s0,$s1,$s2,$s3);
906 } else {
907 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
908 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
909 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
910 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
911 }
912 &xor ($s0,&DWP(16*$i+0,$key));
913 &xor ($s1,&DWP(16*$i+4,$key));
914 &xor ($s2,&DWP(16*$i+8,$key));
915 &xor ($s3,&DWP(16*$i+12,$key));
916 }
917 &add ($key,32);
918 &mov ($__key,$key); # advance rd_key
919 &set_label("10rounds",4);
920 for ($i=1;$i<10;$i++) {
921 if ($vertical_spin) {
922 &encvert($tbl,$s0,$s1,$s2,$s3);
923 } else {
924 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
925 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
926 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
927 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
928 }
929 &xor ($s0,&DWP(16*$i+0,$key));
930 &xor ($s1,&DWP(16*$i+4,$key));
931 &xor ($s2,&DWP(16*$i+8,$key));
932 &xor ($s3,&DWP(16*$i+12,$key));
933 }
934 }
935
936 if ($vertical_spin) {
937 # "reincarnate" some registers for "horizontal" spin...
938 &mov ($s1="ebx",$key="edi");
939 &mov ($s2="ecx",$acc="esi");
940 }
941 &enclast(0,$tbl,$s0,$s1,$s2,$s3);
942 &enclast(1,$tbl,$s1,$s2,$s3,$s0);
943 &enclast(2,$tbl,$s2,$s3,$s0,$s1);
944 &enclast(3,$tbl,$s3,$s0,$s1,$s2);
945
946 &add ($key,$small_footprint?16:160);
947 &xor ($s0,&DWP(0,$key));
948 &xor ($s1,&DWP(4,$key));
949 &xor ($s2,&DWP(8,$key));
950 &xor ($s3,&DWP(12,$key));
951
952 &ret ();
953
954&set_label("AES_Te",64); # Yes! I keep it in the code segment!
955 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
956 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
957 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
958 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
959 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
960 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
961 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
962 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
963 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
964 &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
965 &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
966 &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
967 &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
968 &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
969 &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
970 &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
971 &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
972 &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
973 &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
974 &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
975 &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
976 &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
977 &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
978 &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
979 &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
980 &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
981 &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
982 &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
983 &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
984 &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
985 &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
986 &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
987 &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
988 &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
989 &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
990 &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
991 &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
992 &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
993 &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
994 &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
995 &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
996 &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
997 &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
998 &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
999 &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
1000 &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
1001 &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
1002 &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
1003 &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
1004 &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
1005 &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
1006 &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
1007 &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
1008 &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
1009 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
1010 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
1011 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
1012 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
1013 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
1014 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
1015 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
1016 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
1017 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
1018 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
1019
1020#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
1021 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1022 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1023 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1024 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1025 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1026 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1027 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1028 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1029 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1030 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1031 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1032 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1033 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1034 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1035 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1036 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1037 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1038 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1039 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1040 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1041 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1042 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1043 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1044 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1045 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1046 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1047 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1048 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1049 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1050 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1051 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1052 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1053
1054 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1055 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1056 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1057 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1058 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1059 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1060 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1061 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1062 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1063 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1064 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1065 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1066 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1067 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1068 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1069 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1070 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1071 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1072 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1073 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1074 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1075 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1076 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1077 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1078 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1079 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1080 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1081 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1082 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1083 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1084 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1085 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1086
1087 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1088 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1089 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1090 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1091 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1092 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1093 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1094 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1095 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1096 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1097 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1098 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1099 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1100 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1101 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1102 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1103 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1104 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1105 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1106 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1107 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1108 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1109 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1110 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1111 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1112 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1113 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1114 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1115 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1116 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1117 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1118 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1119
1120 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1121 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1122 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1123 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1124 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1125 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1126 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1127 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1128 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1129 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1130 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1131 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1132 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1133 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1134 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1135 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1136 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1137 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1138 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1139 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1140 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1141 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1142 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1143 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1144 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1145 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1146 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1147 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1148 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1149 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1150 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1151 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1152#rcon:
1153 &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
1154 &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
1155 &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
1156 &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
1157&function_end_B("_x86_AES_encrypt");
1158
1159# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Public single-block encryption entry point.  The generated code:
#   1. carves out a cache-line-aligned stack frame whose position is
#      derived from the key schedule address, remembering the original
#      stack pointer in $_esp;
#   2. obtains the address of AES_Te position-independently and offsets
#      it to one of the Te4 copies;
#   3. unless $x86only is set, tests bit 25 of the word that picmeup
#      resolved for OPENSSL_ia32cap_P and, if set, runs
#      _sse_AES_encrypt_compact on mm0/mm4;
#   4. otherwise runs _x86_AES_encrypt_compact on $s0..$s3.
# Either path restores the stack pointer from $_esp before storing the
# 16 output bytes.
1160&function_begin("AES_encrypt");
1161 &mov ($acc,&wparam(0)); # load inp
1162 &mov ($key,&wparam(2)); # load key
1163
1164 &mov ($s0,"esp"); # keep the incoming stack pointer for $_esp below
1165 &sub ("esp",36);
1166 &and ("esp",-64); # align to cache-line
1167
1168 # place stack frame just "above" the key schedule
1169 &lea ($s1,&DWP(-64-63,$key));
1170 &sub ($s1,"esp");
1171 &neg ($s1);
1172 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1173 &sub ("esp",$s1);
1174 &add ("esp",4); # 4 is reserved for caller's return address
1175 &mov ($_esp,$s0); # save stack pointer
1176
1177 &call (&label("pic_point")); # make it PIC!
1178 &set_label("pic_point");
1179 &blindpop($tbl); # presumably pops the return address pushed by the call above
1180 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
1181 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
1182
1183 # pick Te4 copy which can't "overlap" with stack frame or key schedule
1184 &lea ($s1,&DWP(768-4,"esp"));
1185 &sub ($s1,$tbl);
1186 &and ($s1,0x300); # one of four 256-byte steps: 0, 0x100, 0x200, 0x300
1187 &lea ($tbl,&DWP(2048+128,$tbl,$s1)); # callees address the table with a -128 bias
1188
1189 if (!$x86only) {
1190 &bt (&DWP(0,$s0),25); # check for SSE bit
1191 &jnc (&label("x86"));
1192
1193 &movq ("mm0",&QWP(0,$acc)); # input bytes 0-7
1194 &movq ("mm4",&QWP(8,$acc)); # input bytes 8-15
1195 &call ("_sse_AES_encrypt_compact");
1196 &mov ("esp",$_esp); # restore stack pointer
1197 &mov ($acc,&wparam(1)); # load out
1198 &movq (&QWP(0,$acc),"mm0"); # write output data
1199 &movq (&QWP(8,$acc),"mm4");
1200 &emms ();
1201 &function_end_A();
1202 }
1203 &set_label("x86",16);
1204 &mov ($_tbl,$tbl); # save the selected table pointer in $_tbl
1205 &mov ($s0,&DWP(0,$acc)); # load input data
1206 &mov ($s1,&DWP(4,$acc));
1207 &mov ($s2,&DWP(8,$acc));
1208 &mov ($s3,&DWP(12,$acc));
1209 &call ("_x86_AES_encrypt_compact");
1210 &mov ("esp",$_esp); # restore stack pointer
1211 &mov ($acc,&wparam(1)); # load out
1212 &mov (&DWP(0,$acc),$s0); # write output data
1213 &mov (&DWP(4,$acc),$s1);
1214 &mov (&DWP(8,$acc),$s2);
1215 &mov (&DWP(12,$acc),$s3);
1216&function_end("AES_encrypt");
1217
1218#--------------------------------------------------------------------#
1219
1220######################################################################
1221# "Compact" block function
1222######################################################################
1223
# deccompact($i,$td,@s[,$extra])
#
# Emits the code that builds output word $i (0..3) of one "compact"
# decryption round.  Four byte-wide lookups are made at -128 off $td,
# indexed by byte 0 of $s[0], byte 1 of $s[1], byte 2 of $s[2] and
# byte 3 of $s[3]; the results are shifted into place and xor-ed
# together.
#
#   $i   - index of the word being produced
#   $td  - register holding the (biased) table pointer
#   @s   - the four state registers, already rotated by the caller
#   any further argument is discarded and replaces $Fn with a no-op
#
# Where the result ends up:
#   $i==0,1 : built in $acc and stored at 4+4*$i off esp
#   $i==2   : left in $acc
#   $i==3   : built in $s[0]; $s[1], $s[2] and $s[3] serve as scratch,
#             $s[1] then receives $acc, and (unless $Fn is the no-op)
#             $key, $s[2] and $s[3] are reloaded from $__key, $__s1 and
#             $__s0.
1224sub deccompact()
1225{ my $Fn = \&mov; # real code reference, so the call works under "use strict" too
1226 while ($#_>5) { pop(@_); $Fn=sub{}; }
1227 my ($i,$td,@s)=@_;
1228 my $tmp = $key;
1229 my $out = $i==3?$s[0]:$acc;
1230
1231 # $Fn is used in first compact round and its purpose is to
1232 # void restoration of some values from stack, so that after
1233 # 4xdeccompact with extra argument $key, $s0 and $s1 values
1234 # are left there...
1235 if($i==3) { &$Fn ($key,$__key); }
1236 else { &mov ($out,$s[0]); }
1237 &and ($out,0xFF); # byte 0 of $s[0]
1238 &movz ($out,&BP(-128,$td,$out,1));
1239
1240 if ($i==3) { $tmp=$s[1]; } # $key is not used as scratch here; reuse $s[1]
1241 &movz ($tmp,&HB($s[1])); # byte 1 of $s[1]
1242 &movz ($tmp,&BP(-128,$td,$tmp,1));
1243 &shl ($tmp,8);
1244 &xor ($out,$tmp);
1245
1246 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1247 else { &mov ($tmp,$s[2]); }
1248 &shr ($tmp,16);
1249 &and ($tmp,0xFF); # byte 2 of $s[2]
1250 &movz ($tmp,&BP(-128,$td,$tmp,1));
1251 &shl ($tmp,16);
1252 &xor ($out,$tmp);
1253
1254 if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
1255 else { &mov ($tmp,$s[3]); }
1256 &shr ($tmp,24); # byte 3 of $s[3]
1257 &movz ($tmp,&BP(-128,$td,$tmp,1));
1258 &shl ($tmp,24);
1259 &xor ($out,$tmp);
1260 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1261 if ($i==3) { &$Fn ($s[3],$__s0); }
1262}
1263
1264# must be called with 2,3,0,1 as argument sequence!!!
#
# dectransform($i)
#
# Emits the per-word transform applied to state register $s[$i] between
# table-lookup rounds of the compact decrypt loop (this appears to be
# AES InvMixColumns; confirm against the algorithm description).  Three
# successive packed "doubling" steps produce tp2, tp4 and tp8 from the
# word; each step isolates the top bit of every byte (0x80808080),
# turns it into a per-byte 0x1b mask, and xors that into the word
# shifted left by one with the low bits cleared (0xfefefefe).  The
# results are then combined with rotations as noted on each line.
#
# Scratch registers are borrowed from the other state words and from
# $key/$tbl, which is why the call order matters:
#   - for $i>=2 the finished word is stored at 4+4*$i off esp;
#   - $s[0] is reloaded from $__s0 after $i==2, $s[1] from $__s1 after
#     $i==3, and $s[2]/$s[3] from $__s2/$__s3 after $i==1.
# $key and $tbl are left clobbered; the caller reloads them.
1265sub dectransform()
1266{ my @s = ($s0,$s1,$s2,$s3);
1267 my $i = shift;
1268 my $tmp = $key;
1269 my $tp2 = $s[($i+2)%4]; $tp2 = $s[2] if ($i==1);
1270 my $tp4 = $s[($i+3)%4]; $tp4 = $s[3] if ($i==1);
1271 my $tp8 = $tbl;
1272
1273 &mov ($acc,$s[$i]);
1274 &and ($acc,0x80808080); # top bit of every byte
1275 &mov ($tmp,$acc);
1276 &shr ($tmp,7);
1277 &lea ($tp2,&DWP(0,$s[$i],$s[$i])); # tp1+tp1, i.e. tp1<<1
1278 &sub ($acc,$tmp); # 0x80 -> 0x7f in bytes whose top bit was set
1279 &and ($tp2,0xfefefefe);
1280 &and ($acc,0x1b1b1b1b); # 0x7f -> 0x1b
1281 &xor ($acc,$tp2);
1282 &mov ($tp2,$acc); # tp2
1283
1284 &and ($acc,0x80808080);
1285 &mov ($tmp,$acc);
1286 &shr ($tmp,7);
1287 &lea ($tp4,&DWP(0,$tp2,$tp2));
1288 &sub ($acc,$tmp);
1289 &and ($tp4,0xfefefefe);
1290 &and ($acc,0x1b1b1b1b);
1291 &xor ($tp2,$s[$i]); # tp2^tp1
1292 &xor ($acc,$tp4);
1293 &mov ($tp4,$acc); # tp4
1294
1295 &and ($acc,0x80808080);
1296 &mov ($tmp,$acc);
1297 &shr ($tmp,7);
1298 &lea ($tp8,&DWP(0,$tp4,$tp4));
1299 &sub ($acc,$tmp);
1300 &and ($tp8,0xfefefefe);
1301 &and ($acc,0x1b1b1b1b);
1302 &xor ($tp4,$s[$i]); # tp4^tp1
1303 &rotl ($s[$i],8); # = ROTATE(tp1,8)
1304 &xor ($tp8,$acc); # tp8
1305
1306 &xor ($s[$i],$tp2);
1307 &xor ($tp2,$tp8);
1308 &rotl ($tp2,24);
1309 &xor ($s[$i],$tp4);
1310 &xor ($tp4,$tp8);
1311 &rotl ($tp4,16);
1312 &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
1313 &rotl ($tp8,8);
1314 &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
1315 &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
1316 &mov ($s[0],$__s0) if($i==2); #prefetch $s0
1317 &mov ($s[1],$__s1) if($i==3); #prefetch $s1
1318 &mov ($s[2],$__s2) if($i==1);
1319 &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
1320
1321 &mov ($s[3],$__s3) if($i==1);
1322 &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
1323}
1324
# _x86_AES_decrypt_compact: integer-register "compact" decryption of one
# block.  On entry the state is in $s0..$s3, $key points at the key
# schedule and $tbl at the byte table (addressed with a -128 bias).
# The generated code xors in the first round key, then loops: four
# deccompact lookups, four dectransform calls, next round key.  The loop
# runs while the advanced key pointer is below $__end; a final set of
# lookups and one more round-key xor follow.  Result is left in
# $s0..$s3.
1325&function_begin_B("_x86_AES_decrypt_compact");
1326 # note that caller is expected to allocate stack frame for me!
1327 &mov ($__key,$key); # save key
1328
1329 &xor ($s0,&DWP(0,$key)); # xor with key
1330 &xor ($s1,&DWP(4,$key));
1331 &xor ($s2,&DWP(8,$key));
1332 &xor ($s3,&DWP(12,$key));
1333
1334 &mov ($acc,&DWP(240,$key)); # load key->rounds
1335
1336 &lea ($acc,&DWP(-2,$acc,$acc)); # 2*rounds-2
1337 &lea ($acc,&DWP(0,$key,$acc,8)); # key+16*(rounds-1)
1338 &mov ($__end,$acc); # end of key schedule
1339
1340 # prefetch Td4
	# (one dword is read every 32 bytes across 256 bytes; the loaded
	# values are overwritten and never used)
1341 &mov ($key,&DWP(0-128,$tbl));
1342 &mov ($acc,&DWP(32-128,$tbl));
1343 &mov ($key,&DWP(64-128,$tbl));
1344 &mov ($acc,&DWP(96-128,$tbl));
1345 &mov ($key,&DWP(128-128,$tbl));
1346 &mov ($acc,&DWP(160-128,$tbl));
1347 &mov ($key,&DWP(192-128,$tbl));
1348 &mov ($acc,&DWP(224-128,$tbl));
1349
1350 &set_label("loop",16);
1351
	# the extra trailing argument makes deccompact skip its reloads from
	# the stack; dectransform performs them instead
1352 &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);
1353 &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
1354 &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
1355 &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
1356 &dectransform(2);
1357 &dectransform(3);
1358 &dectransform(0);
1359 &dectransform(1);
1360 &mov ($key,$__key); # $key and $tbl were used as scratch above
1361 &mov ($tbl,$__tbl);
1362 &add ($key,16); # advance rd_key
1363 &xor ($s0,&DWP(0,$key));
1364 &xor ($s1,&DWP(4,$key));
1365 &xor ($s2,&DWP(8,$key));
1366 &xor ($s3,&DWP(12,$key));
1367
1368 &cmp ($key,$__end);
1369 &mov ($__key,$key);
1370 &jb (&label("loop"));
1371
	# final round: lookups only, with deccompact's own reloads enabled
1372 &deccompact(0,$tbl,$s0,$s3,$s2,$s1);
1373 &deccompact(1,$tbl,$s1,$s0,$s3,$s2);
1374 &deccompact(2,$tbl,$s2,$s1,$s0,$s3);
1375 &deccompact(3,$tbl,$s3,$s2,$s1,$s0);
1376
1377 &xor ($s0,&DWP(16,$key)); # last round key sits 16 bytes past $key
1378 &xor ($s1,&DWP(20,$key));
1379 &xor ($s2,&DWP(24,$key));
1380 &xor ($s3,&DWP(28,$key));
1381
1382 &ret ();
1383&function_end_B("_x86_AES_decrypt_compact");
1384
1385######################################################################
1386# "Compact" SSE block function.
1387######################################################################
1388
# sse_deccompact()
#
# Emits the table-lookup step of the MMX/SSE "compact" decrypt round.
# The 16 state bytes arrive in mm0 and mm4.  pshufw/movd move 32-bit
# groups of them into eax and ebx, each byte is replaced by a byte-wide
# lookup at -128 off $tbl, and the substituted bytes are assembled in
# ecx and edx.  On exit mm0 holds t[0],t[1] and mm4 holds t[2],t[3].
# eax, ebx, ecx, edx, $acc and mm1, mm2, mm5, mm6 are used as scratch.
# The number in each trailing comment is the index of the state byte
# being handled on that line.
1389sub sse_deccompact()
1390{
1391 &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
1392 &movd ("eax","mm1"); # 7, 6, 1, 0
1393
1394 &pshufw ("mm5","mm4",0x09); # 13,12,11,10
1395 &movz ($acc,&LB("eax")); # 0
1396 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
1397 &movd ("ebx","mm5"); # 13,12,11,10
1398 &movz ("edx",&HB("eax")); # 1
1399 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
1400 &shl ("edx",8); # 1
1401
1402 &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
1403 &movz ($acc,&LB("ebx")); # 10
1404 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
1405 &shl ($acc,16); # 10
1406 &or ("ecx",$acc); # 10
1407 &shr ("eax",16); # 7, 6
1408 &movz ($acc,&HB("ebx")); # 11
1409 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
1410 &shl ($acc,24); # 11
1411 &or ("edx",$acc); # 11
1412 &shr ("ebx",16); # 13,12
1413
1414 &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
1415 &movz ($acc,&HB("eax")); # 7
1416 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
1417 &shl ($acc,24); # 7
1418 &or ("ecx",$acc); # 7
1419 &movz ($acc,&HB("ebx")); # 13
1420 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
1421 &shl ($acc,8); # 13
1422 &or ("ecx",$acc); # 13
1423 &movd ("mm0","ecx"); # t[0] collected
1424
1425 &movz ($acc,&LB("eax")); # 6
1426 &movd ("eax","mm2"); # 3, 2, 5, 4
1427 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
1428 &shl ("ecx",16); # 6
1429 &movz ($acc,&LB("ebx")); # 12
1430 &movd ("ebx","mm6"); # 9, 8,15,14
1431 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
1432 &or ("ecx",$acc); # 12
1433
1434 &movz ($acc,&LB("eax")); # 4
1435 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
1436 &or ("edx",$acc); # 4
1437 &movz ($acc,&LB("ebx")); # 14
1438 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
1439 &shl ($acc,16); # 14
1440 &or ("edx",$acc); # 14
1441 &movd ("mm1","edx"); # t[1] collected
1442
1443 &movz ($acc,&HB("eax")); # 5
1444 &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
1445 &shl ("edx",8); # 5
1446 &movz ($acc,&HB("ebx")); # 15
1447 &shr ("eax",16); # 3, 2
1448 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
1449 &shl ($acc,24); # 15
1450 &or ("edx",$acc); # 15
1451 &shr ("ebx",16); # 9, 8
1452
1453 &punpckldq ("mm0","mm1"); # t[0,1] collected
1454
1455 &movz ($acc,&HB("ebx")); # 9
1456 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
1457 &shl ($acc,8); # 9
1458 &or ("ecx",$acc); # 9
1459 &and ("ebx",0xff); # 8
1460 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
1461 &or ("edx","ebx"); # 8
1462 &movz ($acc,&LB("eax")); # 2
1463 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
1464 &shl ($acc,16); # 2
1465 &or ("edx",$acc); # 2
1466 &movd ("mm4","edx"); # t[2] collected
1467 &movz ("eax",&HB("eax")); # 3
1468 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
1469 &shl ("eax",24); # 3
1470 &or ("ecx","eax"); # 3
1471 &movd ("mm5","ecx"); # t[3] collected
1472
1473 &punpckldq ("mm4","mm5"); # t[2,3] collected
1474}
1475
# _sse_AES_decrypt_compact: MMX/SSE "compact" decryption of one block;
# only generated when $x86only is false.  The state is carried in mm0
# (bytes 0-7) and mm4 (bytes 8-15), $key points at the key schedule and
# $tbl at the byte table (addressed with a -128 bias).  Each loop pass
# runs sse_deccompact, advances $key by 16 and leaves through "out"
# once $key is above $__end; otherwise it applies the packed
# tp2/tp4/tp8 transform to both halves and xors in the round key.
# The result is left in mm0/mm4.
1476 if (!$x86only) {
1477&function_begin_B("_sse_AES_decrypt_compact");
1478 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
1479 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
1480
1481 # note that caller is expected to allocate stack frame for me!
1482 &mov ($acc,&DWP(240,$key)); # load key->rounds
1483 &lea ($acc,&DWP(-2,$acc,$acc)); # 2*rounds-2
1484 &lea ($acc,&DWP(0,$key,$acc,8)); # key+16*(rounds-1)
1485 &mov ($__end,$acc); # end of key schedule
1486
1487 &mov ($s0,0x1b1b1b1b); # magic constant
1488 &mov (&DWP(8,"esp"),$s0); # kept as a qword at 8 off esp, reloaded in the loop
1489 &mov (&DWP(12,"esp"),$s0);
1490
1491 # prefetch Td4
	# (one dword is read every 32 bytes across 256 bytes; the loaded
	# values are overwritten and never used)
1492 &mov ($s0,&DWP(0-128,$tbl));
1493 &mov ($s1,&DWP(32-128,$tbl));
1494 &mov ($s2,&DWP(64-128,$tbl));
1495 &mov ($s3,&DWP(96-128,$tbl));
1496 &mov ($s0,&DWP(128-128,$tbl));
1497 &mov ($s1,&DWP(160-128,$tbl));
1498 &mov ($s2,&DWP(192-128,$tbl));
1499 &mov ($s3,&DWP(224-128,$tbl));
1500
1501 &set_label("loop",16);
1502 &sse_deccompact();
1503 &add ($key,16);
1504 &cmp ($key,$__end);
1505 &ja (&label("out")); # past the end marker: only the final key xor remains
1506
	# Left column handles mm0 (scratch mm1-mm3), right column handles
	# mm4 (scratch mm5-mm7); the two halves are processed identically.
1507 # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
1508 &movq ("mm3","mm0"); &movq ("mm7","mm4");
1509 &movq ("mm2","mm0",1); &movq ("mm6","mm4",1);
1510 &movq ("mm1","mm0"); &movq ("mm5","mm4");
1511 &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
1512 &pslld ("mm2",8); &pslld ("mm6",8);
1513 &psrld ("mm3",8); &psrld ("mm7",8);
1514 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8
1515 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8
1516 &pslld ("mm2",16); &pslld ("mm6",16);
1517 &psrld ("mm3",16); &psrld ("mm7",16);
1518 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24
1519 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24
1520
	# packed doubling: bytes with the top bit set (signed compare against
	# zero) get the 0x1b constant xor-ed into the byte-wise doubled value
1521 &movq ("mm3",&QWP(8,"esp"));
1522 &pxor ("mm2","mm2"); &pxor ("mm6","mm6");
1523 &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5");
1524 &pand ("mm2","mm3"); &pand ("mm6","mm3");
1525 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1526 &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2
1527 &movq ("mm3","mm1"); &movq ("mm7","mm5");
1528 &movq ("mm2","mm1"); &movq ("mm6","mm5");
1529 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2
1530 &pslld ("mm3",24); &pslld ("mm7",24);
1531 &psrld ("mm2",8); &psrld ("mm6",8);
1532 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24
1533 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8
1534
1535 &movq ("mm2",&QWP(8,"esp"));
1536 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1537 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1538 &pand ("mm3","mm2"); &pand ("mm7","mm2");
1539 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1540 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
1541 &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
1542 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
1543 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
1544
1545 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1546 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1547 &pand ("mm3","mm2"); &pand ("mm7","mm2");
1548 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1549 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8
1550 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8
1551 &movq ("mm3","mm1"); &movq ("mm7","mm5");
1552 &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);
1553 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
1554 &pslld ("mm1",8); &pslld ("mm5",8);
1555 &psrld ("mm3",8); &psrld ("mm7",8);
1556 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key)); # fetch this round's key early
1557 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
1558 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
	# the four integer loads interleaved below touch the table at
	# 64-byte steps; their results are not used
1559 &mov ($s0,&DWP(0-128,$tbl));
1560 &pslld ("mm1",16); &pslld ("mm5",16);
1561 &mov ($s1,&DWP(64-128,$tbl));
1562 &psrld ("mm3",16); &psrld ("mm7",16);
1563 &mov ($s2,&DWP(128-128,$tbl));
1564 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
1565 &mov ($s3,&DWP(192-128,$tbl));
1566 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24
1567
1568 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # xor in the round key fetched above
1569 &jmp (&label("loop"));
1570
1571 &set_label("out",16);
1572 &pxor ("mm0",&QWP(0,$key)); # final round key
1573 &pxor ("mm4",&QWP(8,$key));
1574
1575 &ret ();
1576&function_end_B("_sse_AES_decrypt_compact");
1577 }
1578
1579######################################################################
1580# Vanilla block function.
1581######################################################################
1582
# decstep($i,$td,@s)
#
# Emits the code that builds output word $i (0..3) of one full-table
# decryption round.  Four 32-bit reads are made from $td with an index
# scale of 8: byte 0 of $s[0] at byte offset 0, byte 1 of $s[1] at
# offset 3, byte 2 of $s[2] at offset 2 and byte 3 of $s[3] at
# offset 1; the four values are xor-ed together.
#
# Where the result ends up:
#   $i==0,1 : built in $acc and stored at 4+4*$i off esp
#   $i==2   : left in $acc
#   $i==3   : built in $s[0]; $key is reloaded from $__key, $s[1]
#             receives $acc, and $s[2]/$s[3] are reloaded from
#             $__s1/$__s0.
1583sub decstep()
1584{ my ($i,$td,@s) = @_;
1585 my $tmp = $key;
1586 my $out = $i==3?$s[0]:$acc;
1587
1588 # no instructions are reordered, as performance appears
1589 # optimal... or rather that all attempts to reorder didn't
1590 # result in better performance [which by the way is not a
1591 # bit lower than encryption].
1592 if($i==3) { &mov ($key,$__key); }
1593 else { &mov ($out,$s[0]); }
1594 &and ($out,0xFF); # byte 0 of $s[0]
1595 &mov ($out,&DWP(0,$td,$out,8));
1596
1597 if ($i==3) { $tmp=$s[1]; } # $key was just reloaded; use $s[1] as scratch instead
1598 &movz ($tmp,&HB($s[1])); # byte 1 of $s[1]
1599 &xor ($out,&DWP(3,$td,$tmp,8));
1600
1601 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1602 else { &mov ($tmp,$s[2]); }
1603 &shr ($tmp,16);
1604 &and ($tmp,0xFF); # byte 2 of $s[2]
1605 &xor ($out,&DWP(2,$td,$tmp,8));
1606
1607 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
1608 else { &mov ($tmp,$s[3]); }
1609 &shr ($tmp,24); # byte 3 of $s[3]
1610 &xor ($out,&DWP(1,$td,$tmp,8));
1611 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1612 if ($i==3) { &mov ($s[3],$__s0); }
1613 &comment();
1614}
1615
# declast($i,$td,@s)
#
# Emits the code that builds output word $i (0..3) of the final
# decryption round.  Unlike decstep it uses byte-wide lookups: byte 0 of
# $s[0], byte 1 of $s[1], byte 2 of $s[2] and byte 3 of $s[3] are each
# looked up, shifted into place and xor-ed together.
#
# $td is moved to the byte table for the duration of the four calls:
# the $i==0 call advances it by 2048 (touching one dword every 32 bytes
# of the 256-byte table on the way, presumably as a cache prefetch) and
# the $i==3 call moves it back by 2048.  The calls therefore have to be
# made in 0,1,2,3 order.
#
# Where the result ends up:
#   $i==0,1 : built in $acc and stored at 4+4*$i off esp
#   $i==2   : left in $acc
#   $i==3   : built in $s[0]; $key is reloaded from $__key, $s[1]
#             receives $acc, and $s[2]/$s[3] are reloaded from
#             $__s1/$__s0.
1616sub declast()
1617{ my ($i,$td,@s)=@_;
1618 my $tmp = $key;
1619 my $out = $i==3?$s[0]:$acc;
1620
1621 if($i==0) { &lea ($td,&DWP(2048+128,$td));
1622 &mov ($tmp,&DWP(0-128,$td));
1623 &mov ($acc,&DWP(32-128,$td));
1624 &mov ($tmp,&DWP(64-128,$td));
1625 &mov ($acc,&DWP(96-128,$td));
1626 &mov ($tmp,&DWP(128-128,$td));
1627 &mov ($acc,&DWP(160-128,$td));
1628 &mov ($tmp,&DWP(192-128,$td));
1629 &mov ($acc,&DWP(224-128,$td));
1630 &lea ($td,&DWP(-128,$td)); } # net effect: $td += 2048
1631 if($i==3) { &mov ($key,$__key); }
1632 else { &mov ($out,$s[0]); }
1633 &and ($out,0xFF); # byte 0 of $s[0]
1634 &movz ($out,&BP(0,$td,$out,1));
1635
1636 if ($i==3) { $tmp=$s[1]; } # $key was just reloaded; use $s[1] as scratch instead
1637 &movz ($tmp,&HB($s[1])); # byte 1 of $s[1]
1638 &movz ($tmp,&BP(0,$td,$tmp,1));
1639 &shl ($tmp,8);
1640 &xor ($out,$tmp);
1641
1642 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1643 else { &mov ($tmp,$s[2]); }
1644 &shr ($tmp,16);
1645 &and ($tmp,0xFF); # byte 2 of $s[2]
1646 &movz ($tmp,&BP(0,$td,$tmp,1));
1647 &shl ($tmp,16);
1648 &xor ($out,$tmp);
1649
1650 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
1651 else { &mov ($tmp,$s[3]); }
1652 &shr ($tmp,24); # byte 3 of $s[3]
1653 &movz ($tmp,&BP(0,$td,$tmp,1));
1654 &shl ($tmp,24);
1655 &xor ($out,$tmp);
1656 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1657 if ($i==3) { &mov ($s[3],$__s0);
1658 &lea ($td,&DWP(-2048,$td)); } # undo the +2048 applied when $i==0
1659}
1660
1661&function_begin_B("_x86_AES_decrypt");
1662 # note that caller is expected to allocate stack frame for me!
1663 &mov ($__key,$key); # save key
1664
1665 &xor ($s0,&DWP(0,$key)); # xor with key
1666 &xor ($s1,&DWP(4,$key));
1667 &xor ($s2,&DWP(8,$key));
1668 &xor ($s3,&DWP(12,$key));
1669
1670 &mov ($acc,&DWP(240,$key)); # load key->rounds
1671
1672 if ($small_footprint) {
1673 &lea ($acc,&DWP(-2,$acc,$acc));
1674 &lea ($acc,&DWP(0,$key,$acc,8));
1675 &mov ($__end,$acc); # end of key schedule
1676 &set_label("loop",16);
1677 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1678 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1679 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1680 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1681 &add ($key,16); # advance rd_key
1682 &xor ($s0,&DWP(0,$key));
1683 &xor ($s1,&DWP(4,$key));
1684 &xor ($s2,&DWP(8,$key));
1685 &xor ($s3,&DWP(12,$key));
1686 &cmp ($key,$__end);
1687 &mov ($__key,$key);
1688 &jb (&label("loop"));
1689 }
1690 else {
1691 &cmp ($acc,10);
1692 &jle (&label("10rounds"));
1693 &cmp ($acc,12);
1694 &jle (&label("12rounds"));
1695
1696 &set_label("14rounds",4);
1697 for ($i=1;$i<3;$i++) {
1698 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1699 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1700 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1701 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1702 &xor ($s0,&DWP(16*$i+0,$key));
1703 &xor ($s1,&DWP(16*$i+4,$key));
1704 &xor ($s2,&DWP(16*$i+8,$key));
1705 &xor ($s3,&DWP(16*$i+12,$key));
1706 }
1707 &add ($key,32);
1708 &mov ($__key,$key); # advance rd_key
1709 &set_label("12rounds",4);
1710 for ($i=1;$i<3;$i++) {
1711 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1712 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1713 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1714 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1715 &xor ($s0,&DWP(16*$i+0,$key));
1716 &xor ($s1,&DWP(16*$i+4,$key));
1717 &xor ($s2,&DWP(16*$i+8,$key));
1718 &xor ($s3,&DWP(16*$i+12,$key));
1719 }
1720 &add ($key,32);
1721 &mov ($__key,$key); # advance rd_key
1722 &set_label("10rounds",4);
1723 for ($i=1;$i<10;$i++) {
1724 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1725 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1726 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1727 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1728 &xor ($s0,&DWP(16*$i+0,$key));
1729 &xor ($s1,&DWP(16*$i+4,$key));
1730 &xor ($s2,&DWP(16*$i+8,$key));
1731 &xor ($s3,&DWP(16*$i+12,$key));
1732 }
1733 }
1734
1735 &declast(0,$tbl,$s0,$s3,$s2,$s1);
1736 &declast(1,$tbl,$s1,$s0,$s3,$s2);
1737 &declast(2,$tbl,$s2,$s1,$s0,$s3);
1738 &declast(3,$tbl,$s3,$s2,$s1,$s0);
1739
1740 &add ($key,$small_footprint?16:160);
1741 &xor ($s0,&DWP(0,$key));
1742 &xor ($s1,&DWP(4,$key));
1743 &xor ($s2,&DWP(8,$key));
1744 &xor ($s3,&DWP(12,$key));
1745
1746 &ret ();
1747
1748&set_label("AES_Td",64); # Yes! I keep it in the code segment!
1749 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
1750 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
1751 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
1752 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
1753 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
1754 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
1755 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
1756 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
1757 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
1758 &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
1759 &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
1760 &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
1761 &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
1762 &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
1763 &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
1764 &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
1765 &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
1766 &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
1767 &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
1768 &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
1769 &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
1770 &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
1771 &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
1772 &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
1773 &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
1774 &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
1775 &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
1776 &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
1777 &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
1778 &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
1779 &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
1780 &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
1781 &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
1782 &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
1783 &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
1784 &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
1785 &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
1786 &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
1787 &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
1788 &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
1789 &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
1790 &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
1791 &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
1792 &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
1793 &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
1794 &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
1795 &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
1796 &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
1797 &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
1798 &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
1799 &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
1800 &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
1801 &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
1802 &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
1803 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
1804 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
1805 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
1806 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
1807 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
1808 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
1809 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
1810 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
1811 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
1812 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
1813
1814#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
1815 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1816 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1817 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1818 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1819 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1820 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1821 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1822 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1823 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1824 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1825 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1826 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1827 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1828 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1829 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1830 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1831 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1832 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1833 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1834 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1835 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1836 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1837 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1838 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1839 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1840 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1841 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1842 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1843 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1844 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1845 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1846 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1847
1848 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1849 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1850 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1851 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1852 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1853 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1854 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1855 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1856 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1857 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1858 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1859 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1860 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1861 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1862 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1863 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1864 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1865 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1866 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1867 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1868 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1869 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1870 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1871 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1872 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1873 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1874 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1875 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1876 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1877 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1878 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1879 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1880
1881 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1882 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1883 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1884 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1885 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1886 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1887 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1888 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1889 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1890 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1891 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1892 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1893 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1894 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1895 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1896 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1897 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1898 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1899 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1900 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1901 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1902 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1903 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1904 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1905 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1906 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1907 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1908 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1909 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1910 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1911 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1912 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1913
1914 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1915 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1916 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1917 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1918 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1919 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1920 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1921 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1922 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1923 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1924 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1925 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1926 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1927 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1928 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1929 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1930 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1931 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1932 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1933 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1934 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1935 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1936 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1937 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1938 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1939 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1940 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1941 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1942 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1943 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1944 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1945 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1946&function_end_B("_x86_AES_decrypt");
1947
1948# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Emits the public single-block AES_decrypt entry point.
#   wparam(0) - inp: 16 bytes to decrypt
#   wparam(1) - out: receives the 16 decrypted bytes
#   wparam(2) - key: key schedule
# The routine carves out a 64-byte aligned stack frame positioned relative
# to the key schedule, locates AES_Td position-independently, selects one
# of the Td4 copies, and then runs either the "_sse_" or the "_x86_"
# compact decryption core.  The caller's %esp is kept in $_esp and restored
# before wparam(1) is read back.
1949&function_begin("AES_decrypt");
1950 &mov ($acc,&wparam(0)); # load inp
1951 &mov ($key,&wparam(2)); # load key
1952
 # $s0 carries the incoming %esp until it can be parked in $_esp below
1953 &mov ($s0,"esp");
1954 &sub ("esp",36);
1955 &and ("esp",-64); # align to cache-line
1956
1957 # place stack frame just "above" the key schedule
1958 &lea ($s1,&DWP(-64-63,$key));
1959 &sub ($s1,"esp");
1960 &neg ($s1);
1961 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1962 &sub ("esp",$s1);
1963 &add ("esp",4); # 4 is reserved for caller's return address
1964 &mov ($_esp,$s0); # save stack pointer
1965
 # call/pop pair yields the run-time address of pic_point in $tbl; the
 # table address is then formed as a link-time offset from that label
1966 &call (&label("pic_point")); # make it PIC!
1967 &set_label("pic_point");
1968 &blindpop($tbl);
1969 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
1970 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
1971
1972 # pick Td4 copy which can't "overlap" with stack frame or key schedule
 # (the 0x300 mask selects one of four 256-byte strides past AES_Td+2048;
 # the extra +128 biases $tbl to the middle of the chosen copy)
1973 &lea ($s1,&DWP(768-4,"esp"));
1974 &sub ($s1,$tbl);
1975 &and ($s1,0x300);
1976 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
1977
 # Capability-dispatched path, generated only when $x86only is false:
 # $s0 was pointed at OPENSSL_ia32cap_P by picmeup above.  The block is
 # carried in mm0/mm4 and the path returns through function_end_A.
1978 if (!$x86only) {
1979 &bt (&DWP(0,$s0),25); # check for SSE bit
1980 &jnc (&label("x86"));
1981
1982 &movq ("mm0",&QWP(0,$acc));
1983 &movq ("mm4",&QWP(8,$acc));
1984 &call ("_sse_AES_decrypt_compact");
1985 &mov ("esp",$_esp); # restore stack pointer
1986 &mov ($acc,&wparam(1)); # load out
1987 &movq (&QWP(0,$acc),"mm0"); # write output data
1988 &movq (&QWP(8,$acc),"mm4");
1989 &emms ();
1990 &function_end_A();
1991 }
 # Integer-only path: the block is carried in $s0..$s3 and the selected
 # table pointer is parked in the $_tbl frame slot before the call.
1992 &set_label("x86",16);
1993 &mov ($_tbl,$tbl);
1994 &mov ($s0,&DWP(0,$acc)); # load input data
1995 &mov ($s1,&DWP(4,$acc));
1996 &mov ($s2,&DWP(8,$acc));
1997 &mov ($s3,&DWP(12,$acc));
1998 &call ("_x86_AES_decrypt_compact");
1999 &mov ("esp",$_esp); # restore stack pointer
2000 &mov ($acc,&wparam(1)); # load out
2001 &mov (&DWP(0,$acc),$s0); # write output data
2002 &mov (&DWP(4,$acc),$s1);
2003 &mov (&DWP(8,$acc),$s2);
2004 &mov (&DWP(12,$acc),$s3);
2005&function_end("AES_decrypt");
2006
2007# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
2008# size_t length, const AES_KEY *key,
2009# unsigned char *ivp,const int enc);
2010{
2011# stack frame layout
# (left column: offsets as used by AES_cbc_encrypt itself; right column:
# the same slots shifted by 4 — presumably as seen from inside a called
# core once its return address has been pushed; confirm against the cores)
2012# -4(%esp) # return address 0(%esp)
2013# 0(%esp) # s0 backing store 4(%esp)
2014# 4(%esp) # s1 backing store 8(%esp)
2015# 8(%esp) # s2 backing store 12(%esp)
2016# 12(%esp) # s3 backing store 16(%esp)
2017# 16(%esp) # key backup 20(%esp)
2018# 20(%esp) # end of key schedule 24(%esp)
2019# 24(%esp) # %ebp backup 28(%esp)
2020# 28(%esp) # %esp backup
# Frame slots private to AES_cbc_encrypt.  The five parameter copies let
# the loops reload/advance their state after each call to a cipher core.
2021my $_inp=&DWP(32,"esp"); # copy of wparam(0)
2022my $_out=&DWP(36,"esp"); # copy of wparam(1)
2023my $_len=&DWP(40,"esp"); # copy of wparam(2)
2024my $_key=&DWP(44,"esp"); # copy of wparam(3)
2025my $_ivp=&DWP(48,"esp"); # copy of wparam(4)
2026my $_tmp=&DWP(52,"esp"); # volatile variable
2027#
# $ivec is a 16-byte scratch block; $aes_key is where the fast path may
# place a private copy of the key schedule; $mark sits 240 bytes into that
# copy and doubles as the "was the schedule copied?" flag (it is zeroed
# first and becomes non-zero only if the copy overwrites it).
2028my $ivec=&DWP(60,"esp"); # ivec[16]
2029my $aes_key=&DWP(76,"esp"); # copy of aes_key
2030my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
2031
# Entry, path selection and fast-path frame set-up for AES_cbc_encrypt.
#   wparam(0) inp, wparam(1) out, wparam(2) len, wparam(3) key,
#   wparam(4) ivp, wparam(5) enc (non-zero selects encryption)
# A zero length returns immediately via "drop_out".  The fast path is taken
# only when len >= $speed_limit, len is a multiple of 16 and (unless
# $x86only) bit 28 of OPENSSL_ia32cap_P is clear; everything else goes to
# "slow_way".
2032&function_begin("AES_cbc_encrypt");
2033 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
2034 &cmp ($s2,0);
2035 &je (&label("drop_out"));
2036
2037 &call (&label("pic_point")); # make it PIC!
2038 &set_label("pic_point");
2039 &blindpop($tbl);
2040 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
2041
 # flags from the cmp survive the lea: $tbl = AES_Te when enc != 0,
 # otherwise it is moved on to AES_Td
2042 &cmp (&wparam(5),0);
2043 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
2044 &jne (&label("picked_te"));
2045 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
2046 &set_label("picked_te");
2047
2048 # one can argue if this is required
 # (every exit path below pairs this pushf with a popf)
2049 &pushf ();
2050 &cld ();
2051
2052 &cmp ($s2,$speed_limit);
2053 &jb (&label("slow_way"));
2054 &test ($s2,15);
2055 &jnz (&label("slow_way"));
2056 if (!$x86only) {
2057 &bt (&DWP(0,$s0),28); # check for hyper-threading bit
2058 &jc (&label("slow_way"));
2059 }
2060 # pre-allocate aligned stack frame...
2061 &lea ($acc,&DWP(-80-244,"esp"));
2062 &and ($acc,-64);
2063
2064 # ... and make sure it doesn't alias with $tbl modulo 4096
2065 &mov ($s0,$tbl);
2066 &lea ($s1,&DWP(2048+256,$tbl));
2067 &mov ($s3,$acc);
2068 &and ($s0,0xfff); # s = %ebp&0xfff
2069 &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
2070 &and ($s3,0xfff); # p = %esp&0xfff
2071
2072 &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e);
2073 &jb (&label("tbl_break_out"));
2074 &sub ($s3,$s1);
2075 &sub ($acc,$s3);
2076 &jmp (&label("tbl_ok"));
2077 &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz;
2078 &sub ($s3,$s0);
2079 &and ($s3,0xfff);
2080 &add ($s3,384);
2081 &sub ($acc,$s3);
2082 &set_label("tbl_ok",4);
2083
 # $s3 keeps addressing the caller's arguments after %esp is switched;
 # after the exchange $acc holds the old %esp, which goes into $_esp
2084 &lea ($s3,&wparam(0)); # obtain pointer to parameter block
2085 &exch ("esp",$acc); # allocate stack frame
2086 &add ("esp",4); # reserve for return address!
2087 &mov ($_tbl,$tbl); # save %ebp
2088 &mov ($_esp,$acc); # save %esp
2089
2090 &mov ($s0,&DWP(0,$s3)); # load inp
2091 &mov ($s1,&DWP(4,$s3)); # load out
2092 #&mov ($s2,&DWP(8,$s3)); # load len
2093 &mov ($key,&DWP(12,$s3)); # load key
2094 &mov ($acc,&DWP(16,$s3)); # load ivp
2095 &mov ($s3,&DWP(20,$s3)); # load enc flag
2096
2097 &mov ($_inp,$s0); # save copy of inp
2098 &mov ($_out,$s1); # save copy of out
2099 &mov ($_len,$s2); # save copy of len
2100 &mov ($_key,$key); # save copy of key
2101 &mov ($_ivp,$acc); # save copy of ivp
2102
2103 &mov ($mark,0); # copy of aes_key->rounds = 0;
2104 # do we copy key schedule to stack?
 # The schedule is copied (244/4 dwords, to $aes_key) when its address
 # relative to $tbl, modulo 4096, is below 2048+256 or at/above 4096-244;
 # otherwise the caller's schedule is used in place.  When copied, $_key is
 # redirected to the copy and the copy overwrites $mark.
2105 &mov ($s1 eq "ebx" ? $s1 : "",$key);
2106 &mov ($s2 eq "ecx" ? $s2 : "",244/4);
2107 &sub ($s1,$tbl);
2108 &mov ("esi",$key);
2109 &and ($s1,0xfff);
2110 &lea ("edi",$aes_key);
2111 &cmp ($s1,2048+256);
2112 &jb (&label("do_copy"));
2113 &cmp ($s1,4096-244);
2114 &jb (&label("skip_copy"));
2115 &set_label("do_copy",4);
2116 &mov ($_key,"edi");
 # bytes 89 F6 F3 A5: "mov esi,esi" (2-byte filler) then "rep movsd"
2117 &data_word(0xA5F3F689); # rep movsd
2118 &set_label("skip_copy");
2119
 # touch the table: 16 iterations x 128 bytes, four loads 32 bytes apart
 # per iteration, then rewind $tbl by the 2048 bytes walked
2120 &mov ($key,16);
2121 &set_label("prefetch_tbl",4);
2122 &mov ($s0,&DWP(0,$tbl));
2123 &mov ($s1,&DWP(32,$tbl));
2124 &mov ($s2,&DWP(64,$tbl));
2125 &mov ($acc,&DWP(96,$tbl));
2126 &lea ($tbl,&DWP(128,$tbl));
2127 &sub ($key,1);
2128 &jnz (&label("prefetch_tbl"));
2129 &sub ($tbl,2048);
2130
2131 &mov ($acc,$_inp);
2132 &mov ($key,$_ivp);
2133
 # $s3 still holds the enc flag loaded above
2134 &cmp ($s3,0);
2135 &je (&label("fast_decrypt"));
2136
2137#----------------------------- ENCRYPT -----------------------------#
# Fast-path CBC encryption loop (len is a non-zero multiple of 16 here).
# On entry $acc = inp and $key = ivp.  The running IV lives in $s0..$s3:
# words 0/1 stay in registers across iterations, words 2/3 are reloaded
# from the block $key points at (ivp first, then the ciphertext block just
# written).  After the last block the IV is stored back through $_ivp, the
# stack copy of the key schedule (if one was made) is wiped, and the
# routine returns.
2138 &mov ($s0,&DWP(0,$key)); # load iv
2139 &mov ($s1,&DWP(4,$key));
2140
2141 &set_label("fast_enc_loop",16);
2142 &mov ($s2,&DWP(8,$key));
2143 &mov ($s3,&DWP(12,$key));
2144
2145 &xor ($s0,&DWP(0,$acc)); # xor input data
2146 &xor ($s1,&DWP(4,$acc));
2147 &xor ($s2,&DWP(8,$acc));
2148 &xor ($s3,&DWP(12,$acc));
2149
2150 &mov ($key,$_key); # load key
2151 &call ("_x86_AES_encrypt");
2152
2153 &mov ($acc,$_inp); # load inp
2154 &mov ($key,$_out); # load out
2155
2156 &mov (&DWP(0,$key),$s0); # save output data
2157 &mov (&DWP(4,$key),$s1);
2158 &mov (&DWP(8,$key),$s2);
2159 &mov (&DWP(12,$key),$s3);
2160
2161 &lea ($acc,&DWP(16,$acc)); # advance inp
2162 &mov ($s2,$_len); # load len
2163 &mov ($_inp,$acc); # save inp
2164 &lea ($s3,&DWP(16,$key)); # advance out
2165 &mov ($_out,$s3); # save out
2166 &sub ($s2,16); # decrease len
2167 &mov ($_len,$s2); # save len
2168 &jnz (&label("fast_enc_loop"));
2169 &mov ($acc,$_ivp); # load ivp
2170 &mov ($s2,&DWP(8,$key)); # restore last 2 dwords
2171 &mov ($s3,&DWP(12,$key));
2172 &mov (&DWP(0,$acc),$s0); # save ivec
2173 &mov (&DWP(4,$acc),$s1);
2174 &mov (&DWP(8,$acc),$s2);
2175 &mov (&DWP(12,$acc),$s3);
2176
 # $mark is non-zero only if the key schedule was copied to the frame;
 # in that case $_key points at the copy and 240/4 dwords are zeroed
2177 &cmp ($mark,0); # was the key schedule copied?
2178 &mov ("edi",$_key);
2179 &je (&label("skip_ezero"));
2180 # zero copy of key schedule
2181 &mov ("ecx",240/4);
2182 &xor ("eax","eax");
2183 &align (4);
 # bytes 89 F6 F3 AB: "mov esi,esi" (2-byte filler) then "rep stosd"
2184 &data_word(0xABF3F689); # rep stosd
 # the statement terminator matters: without it Perl parses this line and
 # the next as one expression, set_label(...) & mov(...)
2185 &set_label("skip_ezero");
2186 &mov ("esp",$_esp);
2187 &popf ();
2188 &set_label("drop_out");
2189 &function_end_A();
2190 &pushf (); # kludge, never executed
2191
2192#----------------------------- DECRYPT -----------------------------#
# Fast-path CBC decryption (len is a non-zero multiple of 16 here).
# On entry $acc = inp and $key = ivp.  Two loops are generated:
#  - fast_dec_loop (inp != out): $_tmp tracks a pointer to the current IV,
#    first the caller's ivp, then the ciphertext block just consumed;
#    the final IV is copied to the caller's ivp after the loop.
#  - fast_dec_in_place_loop (inp == out): each ciphertext block is saved in
#    $ivec before being overwritten, then copied into ivp for the next
#    round.
# Both finish at fast_dec_out, which wipes the stack copy of the key
# schedule if one was made.
2193&set_label("fast_decrypt",16);
2194
2195 &cmp ($acc,$_out);
2196 &je (&label("fast_dec_in_place")); # in-place processing...
2197
2198 &mov ($_tmp,$key);
2199
2200 &align (4);
2201 &set_label("fast_dec_loop",16);
2202 &mov ($s0,&DWP(0,$acc)); # read input
2203 &mov ($s1,&DWP(4,$acc));
2204 &mov ($s2,&DWP(8,$acc));
2205 &mov ($s3,&DWP(12,$acc));
2206
2207 &mov ($key,$_key); # load key
2208 &call ("_x86_AES_decrypt");
2209
2210 &mov ($key,$_tmp); # load ivp
 # note: $acc is reloaded with inp a few lines down before being used
2211 &mov ($acc,$_len); # load len
2212 &xor ($s0,&DWP(0,$key)); # xor iv
2213 &xor ($s1,&DWP(4,$key));
2214 &xor ($s2,&DWP(8,$key));
2215 &xor ($s3,&DWP(12,$key));
2216
2217 &mov ($key,$_out); # load out
2218 &mov ($acc,$_inp); # load inp
2219
2220 &mov (&DWP(0,$key),$s0); # write output
2221 &mov (&DWP(4,$key),$s1);
2222 &mov (&DWP(8,$key),$s2);
2223 &mov (&DWP(12,$key),$s3);
2224
2225 &mov ($s2,$_len); # load len
 # the ciphertext block just processed becomes the next IV
2226 &mov ($_tmp,$acc); # save ivp
2227 &lea ($acc,&DWP(16,$acc)); # advance inp
2228 &mov ($_inp,$acc); # save inp
2229 &lea ($key,&DWP(16,$key)); # advance out
2230 &mov ($_out,$key); # save out
2231 &sub ($s2,16); # decrease len
2232 &mov ($_len,$s2); # save len
2233 &jnz (&label("fast_dec_loop"));
2234 &mov ($key,$_tmp); # load temp ivp
2235 &mov ($acc,$_ivp); # load user ivp
2236 &mov ($s0,&DWP(0,$key)); # load iv
2237 &mov ($s1,&DWP(4,$key));
2238 &mov ($s2,&DWP(8,$key));
2239 &mov ($s3,&DWP(12,$key));
2240 &mov (&DWP(0,$acc),$s0); # copy back to user
2241 &mov (&DWP(4,$acc),$s1);
2242 &mov (&DWP(8,$acc),$s2);
2243 &mov (&DWP(12,$acc),$s3);
2244 &jmp (&label("fast_dec_out"));
2245
2246 &set_label("fast_dec_in_place",16);
2247 &set_label("fast_dec_in_place_loop");
2248 &mov ($s0,&DWP(0,$acc)); # read input
2249 &mov ($s1,&DWP(4,$acc));
2250 &mov ($s2,&DWP(8,$acc));
2251 &mov ($s3,&DWP(12,$acc));
2252
2253 &lea ($key,$ivec);
2254 &mov (&DWP(0,$key),$s0); # copy to temp
2255 &mov (&DWP(4,$key),$s1);
2256 &mov (&DWP(8,$key),$s2);
2257 &mov (&DWP(12,$key),$s3);
2258
2259 &mov ($key,$_key); # load key
2260 &call ("_x86_AES_decrypt");
2261
2262 &mov ($key,$_ivp); # load ivp
2263 &mov ($acc,$_out); # load out
2264 &xor ($s0,&DWP(0,$key)); # xor iv
2265 &xor ($s1,&DWP(4,$key));
2266 &xor ($s2,&DWP(8,$key));
2267 &xor ($s3,&DWP(12,$key));
2268
2269 &mov (&DWP(0,$acc),$s0); # write output
2270 &mov (&DWP(4,$acc),$s1);
2271 &mov (&DWP(8,$acc),$s2);
2272 &mov (&DWP(12,$acc),$s3);
2273
2274 &lea ($acc,&DWP(16,$acc)); # advance out
2275 &mov ($_out,$acc); # save out
2276
2277 &lea ($acc,$ivec);
2278 &mov ($s0,&DWP(0,$acc)); # read temp
2279 &mov ($s1,&DWP(4,$acc));
2280 &mov ($s2,&DWP(8,$acc));
2281 &mov ($s3,&DWP(12,$acc));
2282
2283 &mov (&DWP(0,$key),$s0); # copy iv
2284 &mov (&DWP(4,$key),$s1);
2285 &mov (&DWP(8,$key),$s2);
2286 &mov (&DWP(12,$key),$s3);
2287
2288 &mov ($acc,$_inp); # load inp
2289 &mov ($s2,$_len); # load len
2290 &lea ($acc,&DWP(16,$acc)); # advance inp
2291 &mov ($_inp,$acc); # save inp
2292 &sub ($s2,16); # decrease len
2293 &mov ($_len,$s2); # save len
2294 &jnz (&label("fast_dec_in_place_loop"));
2295
2296 &set_label("fast_dec_out",4);
 # $mark is non-zero only if the key schedule was copied to the frame;
 # in that case $_key points at the copy and 240/4 dwords are zeroed
2297 &cmp ($mark,0); # was the key schedule copied?
2298 &mov ("edi",$_key);
2299 &je (&label("skip_dzero"));
2300 # zero copy of key schedule
2301 &mov ("ecx",240/4);
2302 &xor ("eax","eax");
2303 &align (4);
 # bytes 89 F6 F3 AB: "mov esi,esi" (2-byte filler) then "rep stosd"
2304 &data_word(0xABF3F689); # rep stosd
 # the statement terminator matters: without it Perl parses this line and
 # the next as one expression, set_label(...) & mov(...)
2305 &set_label("skip_dzero");
2306 &mov ("esp",$_esp);
2307 &popf ();
2308 &function_end_A();
2309 &pushf (); # kludge, never executed
2310
2311#--------------------------- SLOW ROUTINE ---------------------------#
# Slow-path set-up, reached when the fast-path conditions are not met.
# On entry $s2 = len, $tbl = AES_Te or AES_Td and (unless $x86only) $s0
# points at OPENSSL_ia32cap_P.  A small 64-byte aligned frame is placed
# relative to the key schedule, one of the 256-byte S-box copies is
# selected, the parameters are copied into the frame and control is
# dispatched on the enc flag.  The key schedule is used in place (never
# copied) on this path.
2312&set_label("slow_way",16);
2313
2314 &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
2315 &mov ($key,&wparam(3)); # load key
2316
2317 # pre-allocate aligned stack frame...
2318 &lea ($acc,&DWP(-80,"esp"));
2319 &and ($acc,-64);
2320
2321 # ... and make sure it doesn't alias with $key modulo 1024
2322 &lea ($s1,&DWP(-80-63,$key));
2323 &sub ($s1,$acc);
2324 &neg ($s1);
2325 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
2326 &sub ($acc,$s1);
2327
2328 # pick S-box copy which can't overlap with stack frame or $key
 # (0x300 mask = one of four 256-byte strides past table+2048; +128 bias)
2329 &lea ($s1,&DWP(768,$acc));
2330 &sub ($s1,$tbl);
2331 &and ($s1,0x300);
2332 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
2333
2334 &lea ($s3,&wparam(0)); # pointer to parameter block
2335
 # after the exchange $acc holds the old %esp, which goes into $_esp
2336 &exch ("esp",$acc);
2337 &add ("esp",4); # reserve for return address!
2338 &mov ($_tbl,$tbl); # save %ebp
2339 &mov ($_esp,$acc); # save %esp
 # with $x86only nothing loaded $s0 above; $_tmp is only tested by the
 # !$x86only code further down
2340 &mov ($_tmp,$s0); # save OPENSSL_ia32cap
2341
2342 &mov ($s0,&DWP(0,$s3)); # load inp
2343 &mov ($s1,&DWP(4,$s3)); # load out
2344 #&mov ($s2,&DWP(8,$s3)); # load len
2345 #&mov ($key,&DWP(12,$s3)); # load key
2346 &mov ($acc,&DWP(16,$s3)); # load ivp
2347 &mov ($s3,&DWP(20,$s3)); # load enc flag
2348
2349 &mov ($_inp,$s0); # save copy of inp
2350 &mov ($_out,$s1); # save copy of out
2351 &mov ($_len,$s2); # save copy of len
2352 &mov ($_key,$key); # save copy of key
2353 &mov ($_ivp,$acc); # save copy of ivp
2354
 # from here on: $key = ivp, $acc = inp, $s1 = out, $s2 = len
2355 &mov ($key,$acc);
2356 &mov ($acc,$s0);
2357
2358 &cmp ($s3,0);
2359 &je (&label("slow_decrypt"));
2360
2361#--------------------------- SLOW ENCRYPT ---------------------------#
# Slow-path CBC encryption.  On entry $key = ivp, $acc = inp, $s1 = out,
# $s2 = len.  Full 16-byte blocks are processed by slow_enc_loop_sse (IV
# kept in mm0/mm4) or slow_enc_loop_x86 (IV kept in $s0..$s3, as in the
# fast path).  A trailing partial block is handled by slow_enc_tail, which
# copies the remaining bytes to out, zero-pads them to 16 bytes and runs
# the x86 loop once more over that padded block.
2362 &cmp ($s2,16);
2363 &mov ($s3,$s1);
2364 &jb (&label("slow_enc_tail"));
2365
2366 if (!$x86only) {
2367 &bt ($_tmp,25); # check for SSE bit
2368 &jnc (&label("slow_enc_x86"));
2369
2370 &movq ("mm0",&QWP(0,$key)); # load iv
2371 &movq ("mm4",&QWP(8,$key));
2372
2373 &set_label("slow_enc_loop_sse",16);
2374 &pxor ("mm0",&QWP(0,$acc)); # xor input data
2375 &pxor ("mm4",&QWP(8,$acc));
2376
2377 &mov ($key,$_key);
2378 &call ("_sse_AES_encrypt_compact");
2379
2380 &mov ($acc,$_inp); # load inp
2381 &mov ($key,$_out); # load out
2382 &mov ($s2,$_len); # load len
2383
2384 &movq (&QWP(0,$key),"mm0"); # save output data
2385 &movq (&QWP(8,$key),"mm4");
2386
2387 &lea ($acc,&DWP(16,$acc)); # advance inp
2388 &mov ($_inp,$acc); # save inp
2389 &lea ($s3,&DWP(16,$key)); # advance out
2390 &mov ($_out,$s3); # save out
2391 &sub ($s2,16); # decrease len
2392 &cmp ($s2,16);
2393 &mov ($_len,$s2); # save len
2394 &jae (&label("slow_enc_loop_sse"));
 # fewer than 16 bytes left: either done (0) or a partial tail (1..15)
2395 &test ($s2,15);
2396 &jnz (&label("slow_enc_tail"));
2397 &mov ($acc,$_ivp); # load ivp
2398 &movq (&QWP(0,$acc),"mm0"); # save ivec
2399 &movq (&QWP(8,$acc),"mm4");
2400 &emms ();
2401 &mov ("esp",$_esp);
2402 &popf ();
2403 &function_end_A();
2404 &pushf (); # kludge, never executed
2405 }
2406 &set_label("slow_enc_x86",16);
2407 &mov ($s0,&DWP(0,$key)); # load iv
2408 &mov ($s1,&DWP(4,$key));
2409
2410 &set_label("slow_enc_loop_x86",4);
2411 &mov ($s2,&DWP(8,$key));
2412 &mov ($s3,&DWP(12,$key));
2413
2414 &xor ($s0,&DWP(0,$acc)); # xor input data
2415 &xor ($s1,&DWP(4,$acc));
2416 &xor ($s2,&DWP(8,$acc));
2417 &xor ($s3,&DWP(12,$acc));
2418
2419 &mov ($key,$_key); # load key
2420 &call ("_x86_AES_encrypt_compact");
2421
2422 &mov ($acc,$_inp); # load inp
2423 &mov ($key,$_out); # load out
2424
2425 &mov (&DWP(0,$key),$s0); # save output data
2426 &mov (&DWP(4,$key),$s1);
2427 &mov (&DWP(8,$key),$s2);
2428 &mov (&DWP(12,$key),$s3);
2429
2430 &mov ($s2,$_len); # load len
2431 &lea ($acc,&DWP(16,$acc)); # advance inp
2432 &mov ($_inp,$acc); # save inp
2433 &lea ($s3,&DWP(16,$key)); # advance out
2434 &mov ($_out,$s3); # save out
2435 &sub ($s2,16); # decrease len
2436 &cmp ($s2,16);
2437 &mov ($_len,$s2); # save len
2438 &jae (&label("slow_enc_loop_x86"));
 # fewer than 16 bytes left: either done (0) or a partial tail (1..15)
2439 &test ($s2,15);
2440 &jnz (&label("slow_enc_tail"));
2441 &mov ($acc,$_ivp); # load ivp
2442 &mov ($s2,&DWP(8,$key)); # restore last dwords
2443 &mov ($s3,&DWP(12,$key));
2444 &mov (&DWP(0,$acc),$s0); # save ivec
2445 &mov (&DWP(4,$acc),$s1);
2446 &mov (&DWP(8,$acc),$s2);
2447 &mov (&DWP(12,$acc),$s3);
2448
2449 &mov ("esp",$_esp);
2450 &popf ();
2451 &function_end_A();
2452 &pushf (); # kludge, never executed
2453
 # Tail handling.  On entry $s3 = current out, $acc = current inp and
 # $s2 = remaining byte count (1..15).  The "$x eq reg ? $x : """ guards
 # only document which physical register the rep-string opcodes need.
 # NOTE(review): the IV reloaded below comes from $_ivp, which this path
 # has not written since entry; when the tail follows one or more full
 # blocks this looks like the caller's original IV rather than the last
 # ciphertext block — confirm against the C reference implementation.
2454 &set_label("slow_enc_tail",16);
2455 &emms () if (!$x86only);
2456 &mov ($key eq "edi"? $key:"",$s3); # load out to edi
2457 &mov ($s1,16);
2458 &sub ($s1,$s2);
2459 &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp
2460 &je (&label("enc_in_place"));
2461 &align (4);
 # bytes 89 F6 F3 A4: "mov esi,esi" (2-byte filler) then "rep movsb"
2462 &data_word(0xA4F3F689); # rep movsb # copy input
2463 &jmp (&label("enc_skip_in_place"));
2464 &set_label("enc_in_place");
 # in place: nothing to copy, just step past the existing tail bytes
2465 &lea ($key,&DWP(0,$key,$s2));
2466 &set_label("enc_skip_in_place");
 # $s1 = 16 - remaining: number of padding bytes to zero
2467 &mov ($s2,$s1);
2468 &xor ($s0,$s0);
2469 &align (4);
 # bytes 89 F6 F3 AA: "mov esi,esi" (2-byte filler) then "rep stosb"
2470 &data_word(0xAAF3F689); # rep stosb # zero tail
2471
2472 &mov ($key,$_ivp); # restore ivp
2473 &mov ($acc,$s3); # output as input
2474 &mov ($s0,&DWP(0,$key));
2475 &mov ($s1,&DWP(4,$key));
2476 &mov ($_len,16); # len=16
2477 &jmp (&label("slow_enc_loop_x86")); # one more spin...
2478
2479#--------------------------- SLOW DECRYPT ---------------------------#
# Slow-path CBC decryption.  On entry $acc = inp; out, len, key and ivp are
# taken from their frame copies.  Each iteration decrypts one 16-byte
# block, xors it with the IV stored at ivp, and stores the ciphertext block
# into ivp as the next IV.  The length is decremented by 16 per block; if
# that borrows (fewer than 16 bytes were left) the block is parked in
# $ivec and only the remaining bytes are copied to out by "rep movsb".
2480&set_label("slow_decrypt",16);
2481 if (!$x86only) {
2482 &bt ($_tmp,25); # check for SSE bit
2483 &jnc (&label("slow_dec_loop_x86"));
2484
2485 &set_label("slow_dec_loop_sse",4);
2486 &movq ("mm0",&QWP(0,$acc)); # read input
2487 &movq ("mm4",&QWP(8,$acc));
2488
2489 &mov ($key,$_key);
2490 &call ("_sse_AES_decrypt_compact");
2491
2492 &mov ($acc,$_inp); # load inp
2493 &lea ($s0,$ivec);
2494 &mov ($s1,$_out); # load out
2495 &mov ($s2,$_len); # load len
2496 &mov ($key,$_ivp); # load ivp
2497
2498 &movq ("mm1",&QWP(0,$acc)); # re-read input
2499 &movq ("mm5",&QWP(8,$acc));
2500
2501 &pxor ("mm0",&QWP(0,$key)); # xor iv
2502 &pxor ("mm4",&QWP(8,$key));
2503
2504 &movq (&QWP(0,$key),"mm1"); # copy input to iv
2505 &movq (&QWP(8,$key),"mm5");
2506
2507 &sub ($s2,16); # decrease len
2508 &jc (&label("slow_dec_partial_sse"));
2509
2510 &movq (&QWP(0,$s1),"mm0"); # write output
2511 &movq (&QWP(8,$s1),"mm4");
2512
2513 &lea ($s1,&DWP(16,$s1)); # advance out
2514 &mov ($_out,$s1); # save out
2515 &lea ($acc,&DWP(16,$acc)); # advance inp
2516 &mov ($_inp,$acc); # save inp
 # the lea/mov instructions above leave the flags of the "sub" intact
2517 &mov ($_len,$s2); # save len
2518 &jnz (&label("slow_dec_loop_sse"));
2519 &emms ();
2520 &mov ("esp",$_esp);
2521 &popf ();
2522 &function_end_A();
2523 &pushf (); # kludge, never executed
2524
2525 &set_label("slow_dec_partial_sse",16);
2526 &movq (&QWP(0,$s0),"mm0"); # save output to temp
2527 &movq (&QWP(8,$s0),"mm4");
2528 &emms ();
2529
 # undo the subtraction: ecx = number of bytes that were actually left
2530 &add ($s2 eq "ecx" ? "ecx":"",16);
2531 &mov ("edi",$s1); # out
2532 &mov ("esi",$s0); # temp
2533 &align (4);
 # bytes 89 F6 F3 A4: "mov esi,esi" (2-byte filler) then "rep movsb"
2534 &data_word(0xA4F3F689); # rep movsb # copy partial output
2535
2536 &mov ("esp",$_esp);
2537 &popf ();
2538 &function_end_A();
2539 &pushf (); # kludge, never executed
2540 }
2541 &set_label("slow_dec_loop_x86",16);
2542 &mov ($s0,&DWP(0,$acc)); # read input
2543 &mov ($s1,&DWP(4,$acc));
2544 &mov ($s2,&DWP(8,$acc));
2545 &mov ($s3,&DWP(12,$acc));
2546
2547 &lea ($key,$ivec);
2548 &mov (&DWP(0,$key),$s0); # copy to temp
2549 &mov (&DWP(4,$key),$s1);
2550 &mov (&DWP(8,$key),$s2);
2551 &mov (&DWP(12,$key),$s3);
2552
2553 &mov ($key,$_key); # load key
2554 &call ("_x86_AES_decrypt_compact");
2555
2556 &mov ($key,$_ivp); # load ivp
2557 &mov ($acc,$_len); # load len
2558 &xor ($s0,&DWP(0,$key)); # xor iv
2559 &xor ($s1,&DWP(4,$key));
2560 &xor ($s2,&DWP(8,$key));
2561 &xor ($s3,&DWP(12,$key));
2562
2563 &sub ($acc,16);
2564 &jc (&label("slow_dec_partial_x86"));
2565
2566 &mov ($_len,$acc); # save len
2567 &mov ($acc,$_out); # load out
2568
2569 &mov (&DWP(0,$acc),$s0); # write output
2570 &mov (&DWP(4,$acc),$s1);
2571 &mov (&DWP(8,$acc),$s2);
2572 &mov (&DWP(12,$acc),$s3);
2573
2574 &lea ($acc,&DWP(16,$acc)); # advance out
2575 &mov ($_out,$acc); # save out
2576
2577 &lea ($acc,$ivec);
2578 &mov ($s0,&DWP(0,$acc)); # read temp
2579 &mov ($s1,&DWP(4,$acc));
2580 &mov ($s2,&DWP(8,$acc));
2581 &mov ($s3,&DWP(12,$acc));
2582
2583 &mov (&DWP(0,$key),$s0); # copy it to iv
2584 &mov (&DWP(4,$key),$s1);
2585 &mov (&DWP(8,$key),$s2);
2586 &mov (&DWP(12,$key),$s3);
2587
2588 &mov ($acc,$_inp); # load inp
2589 &lea ($acc,&DWP(16,$acc)); # advance inp
2590 &mov ($_inp,$acc); # save inp
 # only mov/lea were emitted since "sub $acc,16", so this jnz still tests
 # whether the remaining length reached zero
2591 &jnz (&label("slow_dec_loop_x86"));
2592 &mov ("esp",$_esp);
2593 &popf ();
2594 &function_end_A();
2595 &pushf (); # kludge, never executed
2596
 # partial final block: $_len was not updated before the branch, so it
 # still holds the number of bytes that were actually left
2597 &set_label("slow_dec_partial_x86",16);
2598 &lea ($acc,$ivec);
2599 &mov (&DWP(0,$acc),$s0); # save output to temp
2600 &mov (&DWP(4,$acc),$s1);
2601 &mov (&DWP(8,$acc),$s2);
2602 &mov (&DWP(12,$acc),$s3);
2603
2604 &mov ($acc,$_inp);
2605 &mov ($s0,&DWP(0,$acc)); # re-read input
2606 &mov ($s1,&DWP(4,$acc));
2607 &mov ($s2,&DWP(8,$acc));
2608 &mov ($s3,&DWP(12,$acc));
2609
2610 &mov (&DWP(0,$key),$s0); # copy it to iv
2611 &mov (&DWP(4,$key),$s1);
2612 &mov (&DWP(8,$key),$s2);
2613 &mov (&DWP(12,$key),$s3);
2614
2615 &mov ("ecx",$_len);
2616 &mov ("edi",$_out);
2617 &lea ("esi",$ivec);
2618 &align (4);
 # bytes 89 F6 F3 A4: "mov esi,esi" (2-byte filler) then "rep movsb"
2619 &data_word(0xA4F3F689); # rep movsb # copy partial output
2620
2621 &mov ("esp",$_esp);
2622 &popf ();
2623&function_end("AES_cbc_encrypt");
2624}
2625
2626#------------------------------------------------------------------#
2627
# enckey: emits the core step of the encryption key expansion.
# Register contract of the generated code (no Perl arguments are taken):
#   in:  edx = last word of the previous round key, eax = word to update,
#        ecx = round-constant index, $tbl = S-box base + 128
#   out: eax ^= the four S-box substituted bytes of edx, placed at bit
#        offsets 24/0/8/16 for source bytes 0/1/2/3 (i.e. substituted and
#        rotated by one byte), ^ the dword at $tbl+1024-128+4*ecx (rcon)
#   clobbers: ebx, esi, edx
2628sub enckey()
2629{
2630 &movz ("esi",&LB("edx")); # rk[i]>>0
2631 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2632 &movz ("esi",&HB("edx")); # rk[i]>>8
2633 &shl ("ebx",24);
2634 &xor ("eax","ebx");
2635
2636 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2637 &shr ("edx",16);
2638 &movz ("esi",&LB("edx")); # rk[i]>>16
2639 &xor ("eax","ebx");
2640
2641 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2642 &movz ("esi",&HB("edx")); # rk[i]>>24
2643 &shl ("ebx",8);
2644 &xor ("eax","ebx");
2645
2646 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2647 &shl ("ebx",16);
2648 &xor ("eax","ebx");
2649
2650 &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon
2651}
2652
# Internal key-expansion worker shared by AES_set_encrypt_key and
# AES_set_decrypt_key, both of which reach it with a plain "call".
# Arguments are read from wparam(1..3) instead of wparam(0..2) —
# presumably because the wrapper's call pushes one extra return address;
# confirm against wparam() in x86asm.pl.
# Result in eax: 0 on success, -1 if either pointer is NULL, -2 if the bit
# count is not 128, 192 or 256.
2653&function_begin("_x86_AES_set_encrypt_key");
2654 &mov ("esi",&wparam(1)); # user supplied key
2655 &mov ("edi",&wparam(3)); # private key schedule
2656
2657 &test ("esi",-1);
2658 &jz (&label("badpointer"));
2659 &test ("edi",-1);
2660 &jz (&label("badpointer"));
2661
 # position-independent address of AES_Te, then biased so that
 # -128($tbl) addresses the byte table 2048 bytes into AES_Te
2662 &call (&label("pic_point"));
2663 &set_label("pic_point");
2664 &blindpop($tbl);
2665 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
2666 &lea ($tbl,&DWP(2048+128,$tbl));
2667
2668 # prefetch Te4
 # (eight loads 32 bytes apart cover the 256-byte table; values unused)
2669 &mov ("eax",&DWP(0-128,$tbl));
2670 &mov ("ebx",&DWP(32-128,$tbl));
2671 &mov ("ecx",&DWP(64-128,$tbl));
2672 &mov ("edx",&DWP(96-128,$tbl));
2673 &mov ("eax",&DWP(128-128,$tbl));
2674 &mov ("ebx",&DWP(160-128,$tbl));
2675 &mov ("ecx",&DWP(192-128,$tbl));
2676 &mov ("edx",&DWP(224-128,$tbl));
2677
2678 &mov ("ecx",&wparam(2)); # number of bits in key
2679 &cmp ("ecx",128);
2680 &je (&label("10rounds"));
2681 &cmp ("ecx",192);
2682 &je (&label("12rounds"));
2683 &cmp ("ecx",256);
2684 &je (&label("14rounds"));
2685 &mov ("eax",-2); # invalid number of bits
2686 &jmp (&label("exit"));
2687
 # 128-bit key: copy the 4 user key words, then derive 4 new words per
 # iteration for ecx = 0..9.  The first iteration enters at "10shortcut"
 # because eax/edx already hold rk[0]/rk[3] from the copy.
2688 &set_label("10rounds");
2689 &mov ("eax",&DWP(0,"esi")); # copy first 4 dwords
2690 &mov ("ebx",&DWP(4,"esi"));
2691 &mov ("ecx",&DWP(8,"esi"));
2692 &mov ("edx",&DWP(12,"esi"));
2693 &mov (&DWP(0,"edi"),"eax");
2694 &mov (&DWP(4,"edi"),"ebx");
2695 &mov (&DWP(8,"edi"),"ecx");
2696 &mov (&DWP(12,"edi"),"edx");
2697
2698 &xor ("ecx","ecx");
2699 &jmp (&label("10shortcut"));
2700
2701 &align (4);
2702 &set_label("10loop");
2703 &mov ("eax",&DWP(0,"edi")); # rk[0]
2704 &mov ("edx",&DWP(12,"edi")); # rk[3]
2705 &set_label("10shortcut");
2706 &enckey ();
2707
2708 &mov (&DWP(16,"edi"),"eax"); # rk[4]
2709 &xor ("eax",&DWP(4,"edi"));
2710 &mov (&DWP(20,"edi"),"eax"); # rk[5]
2711 &xor ("eax",&DWP(8,"edi"));
2712 &mov (&DWP(24,"edi"),"eax"); # rk[6]
2713 &xor ("eax",&DWP(12,"edi"));
2714 &mov (&DWP(28,"edi"),"eax"); # rk[7]
2715 &inc ("ecx");
2716 &add ("edi",16);
2717 &cmp ("ecx",10);
2718 &jl (&label("10loop"));
2719
 # edi has advanced 10*16 bytes, so 80(edi) is byte offset 240 of the
 # schedule
2720 &mov (&DWP(80,"edi"),10); # setup number of rounds
2721 &xor ("eax","eax");
2722 &jmp (&label("exit"));
2723
 # 192-bit key: copy the 6 user key words, then derive 6 new words per
 # iteration.  The loop exits from the middle of the eighth iteration
 # (ecx == 7), after only the first four words have been produced.
2724 &set_label("12rounds");
2725 &mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
2726 &mov ("ebx",&DWP(4,"esi"));
2727 &mov ("ecx",&DWP(8,"esi"));
2728 &mov ("edx",&DWP(12,"esi"));
2729 &mov (&DWP(0,"edi"),"eax");
2730 &mov (&DWP(4,"edi"),"ebx");
2731 &mov (&DWP(8,"edi"),"ecx");
2732 &mov (&DWP(12,"edi"),"edx");
2733 &mov ("ecx",&DWP(16,"esi"));
2734 &mov ("edx",&DWP(20,"esi"));
2735 &mov (&DWP(16,"edi"),"ecx");
2736 &mov (&DWP(20,"edi"),"edx");
2737
2738 &xor ("ecx","ecx");
2739 &jmp (&label("12shortcut"));
2740
2741 &align (4);
2742 &set_label("12loop");
2743 &mov ("eax",&DWP(0,"edi")); # rk[0]
2744 &mov ("edx",&DWP(20,"edi")); # rk[5]
2745 &set_label("12shortcut");
2746 &enckey ();
2747
2748 &mov (&DWP(24,"edi"),"eax"); # rk[6]
2749 &xor ("eax",&DWP(4,"edi"));
2750 &mov (&DWP(28,"edi"),"eax"); # rk[7]
2751 &xor ("eax",&DWP(8,"edi"));
2752 &mov (&DWP(32,"edi"),"eax"); # rk[8]
2753 &xor ("eax",&DWP(12,"edi"));
2754 &mov (&DWP(36,"edi"),"eax"); # rk[9]
2755
2756 &cmp ("ecx",7);
2757 &je (&label("12break"));
2758 &inc ("ecx");
2759
2760 &xor ("eax",&DWP(16,"edi"));
2761 &mov (&DWP(40,"edi"),"eax"); # rk[10]
2762 &xor ("eax",&DWP(20,"edi"));
2763 &mov (&DWP(44,"edi"),"eax"); # rk[11]
2764
2765 &add ("edi",24);
2766 &jmp (&label("12loop"));
2767
2768 &set_label("12break");
 # edi has advanced 7*24 bytes, so 72(edi) is byte offset 240 of the
 # schedule
2769 &mov (&DWP(72,"edi"),12); # setup number of rounds
2770 &xor ("eax","eax");
2771 &jmp (&label("exit"));
2772
 # 256-bit key: copy the 8 user key words, then derive 8 new words per
 # iteration.  The first four use enckey (with rcon); the second four use
 # the inline sequence below, which substitutes the bytes of rk[11]
 # without rotating them and without a round constant.  The loop exits
 # from the middle of the seventh iteration (ecx == 6).
2773 &set_label("14rounds");
2774 &mov ("eax",&DWP(0,"esi")); # copy first 8 dwords
2775 &mov ("ebx",&DWP(4,"esi"));
2776 &mov ("ecx",&DWP(8,"esi"));
2777 &mov ("edx",&DWP(12,"esi"));
2778 &mov (&DWP(0,"edi"),"eax");
2779 &mov (&DWP(4,"edi"),"ebx");
2780 &mov (&DWP(8,"edi"),"ecx");
2781 &mov (&DWP(12,"edi"),"edx");
2782 &mov ("eax",&DWP(16,"esi"));
2783 &mov ("ebx",&DWP(20,"esi"));
2784 &mov ("ecx",&DWP(24,"esi"));
2785 &mov ("edx",&DWP(28,"esi"));
2786 &mov (&DWP(16,"edi"),"eax");
2787 &mov (&DWP(20,"edi"),"ebx");
2788 &mov (&DWP(24,"edi"),"ecx");
2789 &mov (&DWP(28,"edi"),"edx");
2790
2791 &xor ("ecx","ecx");
2792 &jmp (&label("14shortcut"));
2793
2794 &align (4);
2795 &set_label("14loop");
2796 &mov ("edx",&DWP(28,"edi")); # rk[7]
2797 &set_label("14shortcut");
 # on the first pass edx already holds rk[7] from the copy above, but eax
 # holds rk[4], so rk[0] is (re)loaded on every pass
2798 &mov ("eax",&DWP(0,"edi")); # rk[0]
2799
2800 &enckey ();
2801
2802 &mov (&DWP(32,"edi"),"eax"); # rk[8]
2803 &xor ("eax",&DWP(4,"edi"));
2804 &mov (&DWP(36,"edi"),"eax"); # rk[9]
2805 &xor ("eax",&DWP(8,"edi"));
2806 &mov (&DWP(40,"edi"),"eax"); # rk[10]
2807 &xor ("eax",&DWP(12,"edi"));
2808 &mov (&DWP(44,"edi"),"eax"); # rk[11]
2809
2810 &cmp ("ecx",6);
2811 &je (&label("14break"));
2812 &inc ("ecx");
2813
 # rk[12] = rk[4] ^ S-box bytes of rk[11], byte positions unchanged
 # (shifts 0/8/16/24 for source bytes 0/1/2/3)
2814 &mov ("edx","eax");
2815 &mov ("eax",&DWP(16,"edi")); # rk[4]
2816 &movz ("esi",&LB("edx")); # rk[11]>>0
2817 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2818 &movz ("esi",&HB("edx")); # rk[11]>>8
2819 &xor ("eax","ebx");
2820
2821 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2822 &shr ("edx",16);
2823 &shl ("ebx",8);
2824 &movz ("esi",&LB("edx")); # rk[11]>>16
2825 &xor ("eax","ebx");
2826
2827 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2828 &movz ("esi",&HB("edx")); # rk[11]>>24
2829 &shl ("ebx",16);
2830 &xor ("eax","ebx");
2831
2832 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2833 &shl ("ebx",24);
2834 &xor ("eax","ebx");
2835
2836 &mov (&DWP(48,"edi"),"eax"); # rk[12]
2837 &xor ("eax",&DWP(20,"edi"));
2838 &mov (&DWP(52,"edi"),"eax"); # rk[13]
2839 &xor ("eax",&DWP(24,"edi"));
2840 &mov (&DWP(56,"edi"),"eax"); # rk[14]
2841 &xor ("eax",&DWP(28,"edi"));
2842 &mov (&DWP(60,"edi"),"eax"); # rk[15]
2843
2844 &add ("edi",32);
2845 &jmp (&label("14loop"));
2846
2847 &set_label("14break");
 # edi has advanced 6*32 bytes, so 48(edi) is byte offset 240 of the
 # schedule
2848 &mov (&DWP(48,"edi"),14); # setup number of rounds
2849 &xor ("eax","eax");
2850 &jmp (&label("exit"));
2851
 # common exits: eax = -1 for a NULL pointer; every other path arrives at
 # "exit" with eax already set (0 or -2)
2852 &set_label("badpointer");
2853 &mov ("eax",-1);
2854 &set_label("exit");
2855&function_end("_x86_AES_set_encrypt_key");
2856
2857# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
2858# AES_KEY *key)
#
# Public wrapper: calls the internal worker and returns its eax result
# unchanged.  No registers are saved and no frame is built here.
2859&function_begin_B("AES_set_encrypt_key");
2860 &call ("_x86_AES_set_encrypt_key");
2861 &ret ();
2862&function_end_B("AES_set_encrypt_key");
2863
# deckey($i,$key,$tp1,$tp2,$tp4,$tp8)
#
# Emits code that transforms word $i of the round key at $key in place,
# for use by AES_set_decrypt_key.
#   $i    - index of the word to transform (byte offset 4*$i from $key)
#   $key  - register holding the round-key pointer
#   $tp1  - register holding the word to transform on entry
#   $tp2, $tp4, $tp8 - scratch registers
# The three mask sequences (0x80808080 / 0xfefefefe / 0x1b1b1b1b) each
# double four packed GF(2^8) bytes, giving tp2, tp4 and tp8 from tp1; the
# result is assembled from those with byte rotations as noted inline.
# $acc and $tbl (used as $tmp) are clobbered.  Word $i+1 is loaded early
# and left in $tp2 on exit, which is why callers rotate the register
# arguments by one position between consecutive calls.
#
# Declared without a prototype: the sub takes six arguments, so the former
# empty prototype "()" was wrong and only harmless because callers use "&".
2864sub deckey
2865{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
2866 my $tmp = $tbl;
2867
 # tp2 = 2*tp1 (bytewise, in GF(2^8))
2868 &mov ($acc,$tp1);
2869 &and ($acc,0x80808080);
2870 &mov ($tmp,$acc);
2871 &shr ($tmp,7);
2872 &lea ($tp2,&DWP(0,$tp1,$tp1));
2873 &sub ($acc,$tmp);
2874 &and ($tp2,0xfefefefe);
2875 &and ($acc,0x1b1b1b1b);
2876 &xor ($acc,$tp2);
2877 &mov ($tp2,$acc);
2878
 # tp4 = 2*tp2
2879 &and ($acc,0x80808080);
2880 &mov ($tmp,$acc);
2881 &shr ($tmp,7);
2882 &lea ($tp4,&DWP(0,$tp2,$tp2));
2883 &sub ($acc,$tmp);
2884 &and ($tp4,0xfefefefe);
2885 &and ($acc,0x1b1b1b1b);
2886 &xor ($tp2,$tp1); # tp2^tp1
2887 &xor ($acc,$tp4);
2888 &mov ($tp4,$acc);
2889
 # tp8 = 2*tp4
2890 &and ($acc,0x80808080);
2891 &mov ($tmp,$acc);
2892 &shr ($tmp,7);
2893 &lea ($tp8,&DWP(0,$tp4,$tp4));
2894 &xor ($tp4,$tp1); # tp4^tp1
2895 &sub ($acc,$tmp);
2896 &and ($tp8,0xfefefefe);
2897 &and ($acc,0x1b1b1b1b);
2898 &rotl ($tp1,8); # = ROTATE(tp1,8)
2899 &xor ($tp8,$acc);
2900
2901 &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load
2902
2903 &xor ($tp1,$tp2);
2904 &xor ($tp2,$tp8);
2905 &xor ($tp1,$tp4);
2906 &rotl ($tp2,24);
2907 &xor ($tp4,$tp8);
2908 &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
2909 &rotl ($tp4,16);
2910 &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
2911 &rotl ($tp8,8);
2912 &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
 # hand the pre-loaded next word to the following deckey call
2913 &mov ($tp2,$tmp);
2914 &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
2915
2916 &mov (&DWP(4*$i,$key),$tp1);
2917}
2918
2919# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
2920# AES_KEY *key)
#
# Builds the encryption schedule with the internal worker and, if that
# returned 0, converts it in place for decryption:
#  1. the 16-byte round keys are swapped end-for-end ("invert");
#  2. every round key except the first and the last is run through deckey
#     ("permute").
# A non-zero worker result is returned to the caller unchanged.  On success
# eax is 0.
2921&function_begin_B("AES_set_decrypt_key");
2922 &call ("_x86_AES_set_encrypt_key");
2923 &cmp ("eax",0);
2924 &je (&label("proceed"));
2925 &ret ();
2926
2927 &set_label("proceed");
 # registers are saved by hand only on the success path; presumably this
 # mirrors what function_begin would push, so that wparam() and
 # function_end below line up — confirm against x86asm.pl
2928 &push ("ebp");
2929 &push ("ebx");
2930 &push ("esi");
2931 &push ("edi");
2932
 # esi = first round key, edi = key + 16*rounds = last round key
2933 &mov ("esi",&wparam(2));
2934 &mov ("ecx",&DWP(240,"esi")); # pull number of rounds
2935 &lea ("ecx",&DWP(0,"","ecx",4));
2936 &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk
2937
 # swap 16 bytes per iteration, walking inwards until the pointers meet
2938 &set_label("invert",4); # invert order of chunks
2939 &mov ("eax",&DWP(0,"esi"));
2940 &mov ("ebx",&DWP(4,"esi"));
2941 &mov ("ecx",&DWP(0,"edi"));
2942 &mov ("edx",&DWP(4,"edi"));
2943 &mov (&DWP(0,"edi"),"eax");
2944 &mov (&DWP(4,"edi"),"ebx");
2945 &mov (&DWP(0,"esi"),"ecx");
2946 &mov (&DWP(4,"esi"),"edx");
2947 &mov ("eax",&DWP(8,"esi"));
2948 &mov ("ebx",&DWP(12,"esi"));
2949 &mov ("ecx",&DWP(8,"edi"));
2950 &mov ("edx",&DWP(12,"edi"));
2951 &mov (&DWP(8,"edi"),"eax");
2952 &mov (&DWP(12,"edi"),"ebx");
2953 &mov (&DWP(8,"esi"),"ecx");
2954 &mov (&DWP(12,"esi"),"edx");
2955 &add ("esi",16);
2956 &sub ("edi",16);
2957 &cmp ("esi","edi");
2958 &jne (&label("invert"));
2959
 # $acc = key + 8*(2*rounds-2) = key + 16*(rounds-1); the argument slot
 # wparam(2) is reused to hold this end marker for the loop below
2960 &mov ($key,&wparam(2));
2961 &mov ($acc,&DWP(240,$key)); # pull number of rounds
2962 &lea ($acc,&DWP(-2,$acc,$acc));
2963 &lea ($acc,&DWP(0,$key,$acc,8));
2964 &mov (&wparam(2),$acc);
2965
2966 &mov ($s0,&DWP(16,$key)); # modulo-scheduled load
2967 &set_label("permute",4); # permute the key schedule
2968 &add ($key,16);
 # register roles rotate by one per call because each deckey leaves the
 # next word to process in its $tp2 argument
2969 &deckey (0,$key,$s0,$s1,$s2,$s3);
2970 &deckey (1,$key,$s1,$s2,$s3,$s0);
2971 &deckey (2,$key,$s2,$s3,$s0,$s1);
2972 &deckey (3,$key,$s3,$s0,$s1,$s2);
2973 &cmp ($key,&wparam(2));
2974 &jb (&label("permute"));
2975
2976 &xor ("eax","eax"); # return success
2977&function_end("AES_set_decrypt_key");
# Trailer: embed the identification string, then finish the module.
# The "@" is escaped so Perl does not interpolate an array in the string.
2978&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
2979
2980&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl
deleted file mode 100644
index c51ee1fbf6..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-armv4.pl
+++ /dev/null
@@ -1,1030 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for ARMv4
11
12# January 2007.
13#
14# Code uses a single 1K S-box and is >2 times faster than code generated
15# by gcc-3.4.1. This is thanks to a unique feature of the ARMv4 ISA, which
16# allows merging a logical or arithmetic operation with a shift or rotate
17# in one instruction and emitting the combined result every cycle. The module
18# is endian-neutral. The performance is ~42 cycles/byte for a 128-bit
19# key [on a single-issue Xscale PXA250 core].
20
21# May 2007.
22#
23# AES_set_[en|de]crypt_key is added.
24
25# July 2010.
26#
27# Rescheduling for dual-issue pipeline resulted in 12% improvement on
28# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
29
30while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} # take the first argument shaped like name.ext as the output file
31if ($output) { open STDOUT,">",$output or die "can't open $output: $!"; } # no file argument: keep writing to the inherited STDOUT
32# Register allocation shared by every routine in the generated code.
33$s0="r0"; # s0-s3: the four 32-bit words of the AES state
34$s1="r1";
35$s2="r2";
36$s3="r3";
37$t1="r4"; # t1-t3: temporaries
38$t2="r5";
39$t3="r6";
40$i1="r7"; # i1-i3: scratch, mostly byte indices for table lookups
41$i2="r8";
42$i3="r9";
43
44$tbl="r10"; # base address of the current lookup table
45$key="r11"; # pointer into the key schedule
46$rounds="r12"; # round counter; the public entry points also use it as the data pointer
47
48$code=<<___;
49.text
50.code 32
51
52.type AES_Te,%object
53.align 5
54AES_Te:
55.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
56.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
57.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
58.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
59.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
60.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
61.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
62.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
63.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
64.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
65.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
66.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
67.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
68.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
69.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
70.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
71.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
72.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
73.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
74.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
75.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
76.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
77.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
78.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
79.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
80.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
81.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
82.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
83.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
84.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
85.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
86.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
87.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
88.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
89.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
90.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
91.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
92.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
93.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
94.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
95.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
96.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
97.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
98.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
99.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
100.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
101.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
102.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
103.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
104.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
105.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
106.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
107.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
108.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
109.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
110.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
111.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
112.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
113.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
114.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
115.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
116.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
117.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
118.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
119@ Te4[256]: byte-wide S-box used by AES_set_encrypt_key
120.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
121.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
122.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
123.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
124.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
125.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
126.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
127.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
128.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
129.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
130.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
131.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
132.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
133.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
134.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
135.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
136.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
137.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
138.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
139.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
140.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
141.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
142.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
143.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
144.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
145.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
146.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
147.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
148.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
149.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
150.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
151.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
152@ rcon[]: key-schedule round constants, zero-padded to 16 words
153.word 0x01000000, 0x02000000, 0x04000000, 0x08000000
154.word 0x10000000, 0x20000000, 0x40000000, 0x80000000
155.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
156.size AES_Te,.-AES_Te
157@ AES_encrypt: load the 16 input bytes as four big-endian words, run the rounds, store the result the same way.
158@ void AES_encrypt(const unsigned char *in, unsigned char *out,
159@ const AES_KEY *key) {
160.global AES_encrypt
161.type AES_encrypt,%function
162.align 5
163AES_encrypt:
164 sub r3,pc,#8 @ AES_encrypt
165 stmdb sp!,{r1,r4-r12,lr} @ keep out pointer (r1) on the stack along with r4-r12 and lr
166 mov $rounds,r0 @ inp
167 mov $key,r2 @ key schedule pointer
168 sub $tbl,r3,#AES_encrypt-AES_Te @ Te
169
170 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
171 ldrb $t1,[$rounds,#2] @ manner...
172 ldrb $t2,[$rounds,#1]
173 ldrb $t3,[$rounds,#0]
174 orr $s0,$s0,$t1,lsl#8
175 ldrb $s1,[$rounds,#7]
176 orr $s0,$s0,$t2,lsl#16
177 ldrb $t1,[$rounds,#6]
178 orr $s0,$s0,$t3,lsl#24
179 ldrb $t2,[$rounds,#5]
180 ldrb $t3,[$rounds,#4]
181 orr $s1,$s1,$t1,lsl#8
182 ldrb $s2,[$rounds,#11]
183 orr $s1,$s1,$t2,lsl#16
184 ldrb $t1,[$rounds,#10]
185 orr $s1,$s1,$t3,lsl#24
186 ldrb $t2,[$rounds,#9]
187 ldrb $t3,[$rounds,#8]
188 orr $s2,$s2,$t1,lsl#8
189 ldrb $s3,[$rounds,#15]
190 orr $s2,$s2,$t2,lsl#16
191 ldrb $t1,[$rounds,#14]
192 orr $s2,$s2,$t3,lsl#24
193 ldrb $t2,[$rounds,#13]
194 ldrb $t3,[$rounds,#12]
195 orr $s3,$s3,$t1,lsl#8
196 orr $s3,$s3,$t2,lsl#16
197 orr $s3,$s3,$t3,lsl#24
198
199 bl _armv4_AES_encrypt @ transforms the state held in r0-r3
200
201 ldr $rounds,[sp],#4 @ pop out
202 mov $t1,$s0,lsr#24 @ write output in endian-neutral
203 mov $t2,$s0,lsr#16 @ manner...
204 mov $t3,$s0,lsr#8
205 strb $t1,[$rounds,#0]
206 strb $t2,[$rounds,#1]
207 mov $t1,$s1,lsr#24
208 strb $t3,[$rounds,#2]
209 mov $t2,$s1,lsr#16
210 strb $s0,[$rounds,#3]
211 mov $t3,$s1,lsr#8
212 strb $t1,[$rounds,#4]
213 strb $t2,[$rounds,#5]
214 mov $t1,$s2,lsr#24
215 strb $t3,[$rounds,#6]
216 mov $t2,$s2,lsr#16
217 strb $s1,[$rounds,#7]
218 mov $t3,$s2,lsr#8
219 strb $t1,[$rounds,#8]
220 strb $t2,[$rounds,#9]
221 mov $t1,$s3,lsr#24
222 strb $t3,[$rounds,#10]
223 mov $t2,$s3,lsr#16
224 strb $s2,[$rounds,#11]
225 mov $t3,$s3,lsr#8
226 strb $t1,[$rounds,#12]
227 strb $t2,[$rounds,#13]
228 strb $t3,[$rounds,#14]
229 strb $s3,[$rounds,#15]
230
231 ldmia sp!,{r4-r12,lr} @ restore the registers saved on entry
232 tst lr,#1 @ bit 0 of the return address selects the return instruction
233 moveq pc,lr @ be binary compatible with V4, yet
234 bx lr @ interoperable with Thumb ISA:-)
235.size AES_encrypt,.-AES_encrypt
236@ _armv4_AES_encrypt: state in r0-r3, Te base in r10, key schedule in r11; result is left in r0-r3. Uses r4-r9, r12 and lr as scratch; r11 is advanced.
237.type _armv4_AES_encrypt,%function
238.align 2
239_armv4_AES_encrypt:
240 str lr,[sp,#-4]! @ push lr
241 ldmia $key!,{$t1-$i1} @ load round key 0 and step the key pointer by 16
242 eor $s0,$s0,$t1
243 ldr $rounds,[$key,#240-16] @ rounds field, offset 240 from the start of the schedule
244 eor $s1,$s1,$t2
245 eor $s2,$s2,$t3
246 eor $s3,$s3,$i1
247 sub $rounds,$rounds,#1 @ the final round is done after the loop
248 mov lr,#255 @ lr doubles as the byte mask
249
250 and $i1,lr,$s0
251 and $i2,lr,$s0,lsr#8
252 and $i3,lr,$s0,lsr#16
253 mov $s0,$s0,lsr#24
254.Lenc_loop:
255 ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0]
256 and $i1,lr,$s1,lsr#16 @ i0
257 ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
258 and $i2,lr,$s1
259 ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
260 and $i3,lr,$s1,lsr#8
261 ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
262 mov $s1,$s1,lsr#24
263
264 ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16]
265 ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0]
266 ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8]
267 eor $s0,$s0,$i1,ror#8
268 ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
269 and $i1,lr,$s2,lsr#8 @ i0
270 eor $t2,$t2,$i2,ror#8
271 and $i2,lr,$s2,lsr#16 @ i1
272 eor $t3,$t3,$i3,ror#8
273 and $i3,lr,$s2
274 eor $s1,$s1,$t1,ror#24
275 ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
276 mov $s2,$s2,lsr#24
277
278 ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
279 ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
280 eor $s0,$s0,$i1,ror#16
281 ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
282 and $i1,lr,$s3 @ i0
283 eor $s1,$s1,$i2,ror#8
284 and $i2,lr,$s3,lsr#8 @ i1
285 eor $t3,$t3,$i3,ror#16
286 and $i3,lr,$s3,lsr#16 @ i2
287 eor $s2,$s2,$t2,ror#16
288 ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
289 mov $s3,$s3,lsr#24
290
291 ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
292 ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
293 eor $s0,$s0,$i1,ror#24
294 ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
295 eor $s1,$s1,$i2,ror#16
296 ldr $i1,[$key],#16
297 eor $s2,$s2,$i3,ror#8
298 ldr $t1,[$key,#-12]
299 eor $s3,$s3,$t3,ror#8
300
301 ldr $t2,[$key,#-8]
302 eor $s0,$s0,$i1
303 ldr $t3,[$key,#-4]
304 and $i1,lr,$s0
305 eor $s1,$s1,$t1
306 and $i2,lr,$s0,lsr#8
307 eor $s2,$s2,$t2
308 and $i3,lr,$s0,lsr#16
309 eor $s3,$s3,$t3
310 mov $s0,$s0,lsr#24
311
312 subs $rounds,$rounds,#1
313 bne .Lenc_loop
314
315 add $tbl,$tbl,#2 @ the byte at offset 2 of every Te word is the plain S-box value
316
317 ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0]
318 and $i1,lr,$s1,lsr#16 @ i0
319 ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
320 and $i2,lr,$s1
321 ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
322 and $i3,lr,$s1,lsr#8
323 ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
324 mov $s1,$s1,lsr#24
325
326 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16]
327 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0]
328 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8]
329 eor $s0,$i1,$s0,lsl#8
330 ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
331 and $i1,lr,$s2,lsr#8 @ i0
332 eor $t2,$i2,$t2,lsl#8
333 and $i2,lr,$s2,lsr#16 @ i1
334 eor $t3,$i3,$t3,lsl#8
335 and $i3,lr,$s2
336 eor $s1,$t1,$s1,lsl#24
337 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
338 mov $s2,$s2,lsr#24
339
340 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
341 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
342 eor $s0,$i1,$s0,lsl#8
343 ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
344 and $i1,lr,$s3 @ i0
345 eor $s1,$s1,$i2,lsl#16
346 and $i2,lr,$s3,lsr#8 @ i1
347 eor $t3,$i3,$t3,lsl#8
348 and $i3,lr,$s3,lsr#16 @ i2
349 eor $s2,$t2,$s2,lsl#24
350 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
351 mov $s3,$s3,lsr#24
352
353 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
354 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
355 eor $s0,$i1,$s0,lsl#8
356 ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
357 ldr $i1,[$key,#0]
358 eor $s1,$s1,$i2,lsl#8
359 ldr $t1,[$key,#4]
360 eor $s2,$s2,$i3,lsl#16
361 ldr $t2,[$key,#8]
362 eor $s3,$t3,$s3,lsl#24
363 ldr $t3,[$key,#12]
364
365 eor $s0,$s0,$i1
366 eor $s1,$s1,$t1
367 eor $s2,$s2,$t2
368 eor $s3,$s3,$t3
369
370 sub $tbl,$tbl,#2 @ restore the Te base for the caller
371 ldr pc,[sp],#4 @ pop and return
372.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
373@ AES_set_encrypt_key: r0 = user key bytes, r1 = key length in bits, r2 = key schedule. Returns 0 in r0, or -1 if a pointer is NULL or bits is not 128, 192 or 256.
374.global AES_set_encrypt_key
375.type AES_set_encrypt_key,%function
376.align 5
377AES_set_encrypt_key:
378 sub r3,pc,#8 @ AES_set_encrypt_key
379 teq r0,#0 @ NULL user key?
380 moveq r0,#-1
381 beq .Labrt
382 teq r2,#0 @ NULL key schedule?
383 moveq r0,#-1
384 beq .Labrt
385
386 teq r1,#128
387 beq .Lok
388 teq r1,#192
389 beq .Lok
390 teq r1,#256
391 movne r0,#-1
392 bne .Labrt
393
394.Lok: stmdb sp!,{r4-r12,lr}
395 sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
396
397 mov $rounds,r0 @ inp
398 mov lr,r1 @ bits
399 mov $key,r2 @ key
400
401 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
402 ldrb $t1,[$rounds,#2] @ manner...
403 ldrb $t2,[$rounds,#1]
404 ldrb $t3,[$rounds,#0]
405 orr $s0,$s0,$t1,lsl#8
406 ldrb $s1,[$rounds,#7]
407 orr $s0,$s0,$t2,lsl#16
408 ldrb $t1,[$rounds,#6]
409 orr $s0,$s0,$t3,lsl#24
410 ldrb $t2,[$rounds,#5]
411 ldrb $t3,[$rounds,#4]
412 orr $s1,$s1,$t1,lsl#8
413 ldrb $s2,[$rounds,#11]
414 orr $s1,$s1,$t2,lsl#16
415 ldrb $t1,[$rounds,#10]
416 orr $s1,$s1,$t3,lsl#24
417 ldrb $t2,[$rounds,#9]
418 ldrb $t3,[$rounds,#8]
419 orr $s2,$s2,$t1,lsl#8
420 ldrb $s3,[$rounds,#15]
421 orr $s2,$s2,$t2,lsl#16
422 ldrb $t1,[$rounds,#14]
423 orr $s2,$s2,$t3,lsl#24
424 ldrb $t2,[$rounds,#13]
425 ldrb $t3,[$rounds,#12]
426 orr $s3,$s3,$t1,lsl#8
427 str $s0,[$key],#16
428 orr $s3,$s3,$t2,lsl#16
429 str $s1,[$key,#-12]
430 orr $s3,$s3,$t3,lsl#24
431 str $s2,[$key,#-8]
432 str $s3,[$key,#-4]
433
434 teq lr,#128
435 bne .Lnot128
436 mov $rounds,#10 @ 10 rounds, and also the iteration count of the loop below
437 str $rounds,[$key,#240-16] @ rounds field, offset 240 from the start of the schedule
438 add $t3,$tbl,#256 @ rcon
439 mov lr,#255
440
441.L128_loop:
442 and $t2,lr,$s3,lsr#24 @ S-box each byte of rk[3] and rotate the word left by 8 bits
443 and $i1,lr,$s3,lsr#16
444 ldrb $t2,[$tbl,$t2]
445 and $i2,lr,$s3,lsr#8
446 ldrb $i1,[$tbl,$i1]
447 and $i3,lr,$s3
448 ldrb $i2,[$tbl,$i2]
449 orr $t2,$t2,$i1,lsl#24
450 ldrb $i3,[$tbl,$i3]
451 orr $t2,$t2,$i2,lsl#16
452 ldr $t1,[$t3],#4 @ rcon[i++]
453 orr $t2,$t2,$i3,lsl#8
454 eor $t2,$t2,$t1
455 eor $s0,$s0,$t2 @ rk[4]=rk[0]^...
456 eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4]
457 str $s0,[$key],#16
458 eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
459 str $s1,[$key,#-12]
460 eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
461 str $s2,[$key,#-8]
462 subs $rounds,$rounds,#1
463 str $s3,[$key,#-4]
464 bne .L128_loop
465 sub r2,$key,#176 @ point r2 back at the start of the schedule (11 round keys of 16 bytes)
466 b .Ldone
467
468.Lnot128:
469 ldrb $i2,[$rounds,#19]
470 ldrb $t1,[$rounds,#18]
471 ldrb $t2,[$rounds,#17]
472 ldrb $t3,[$rounds,#16]
473 orr $i2,$i2,$t1,lsl#8
474 ldrb $i3,[$rounds,#23]
475 orr $i2,$i2,$t2,lsl#16
476 ldrb $t1,[$rounds,#22]
477 orr $i2,$i2,$t3,lsl#24
478 ldrb $t2,[$rounds,#21]
479 ldrb $t3,[$rounds,#20]
480 orr $i3,$i3,$t1,lsl#8
481 orr $i3,$i3,$t2,lsl#16
482 str $i2,[$key],#8
483 orr $i3,$i3,$t3,lsl#24
484 str $i3,[$key,#-4]
485
486 teq lr,#192
487 bne .Lnot192
488 mov $rounds,#12 @ 12 rounds for a 192-bit key
489 str $rounds,[$key,#240-24]
490 add $t3,$tbl,#256 @ rcon
491 mov lr,#255
492 mov $rounds,#8 @ iteration count: each pass emits up to 6 words
493
494.L192_loop:
495 and $t2,lr,$i3,lsr#24
496 and $i1,lr,$i3,lsr#16
497 ldrb $t2,[$tbl,$t2]
498 and $i2,lr,$i3,lsr#8
499 ldrb $i1,[$tbl,$i1]
500 and $i3,lr,$i3
501 ldrb $i2,[$tbl,$i2]
502 orr $t2,$t2,$i1,lsl#24
503 ldrb $i3,[$tbl,$i3]
504 orr $t2,$t2,$i2,lsl#16
505 ldr $t1,[$t3],#4 @ rcon[i++]
506 orr $t2,$t2,$i3,lsl#8
507 eor $i3,$t2,$t1
508 eor $s0,$s0,$i3 @ rk[6]=rk[0]^...
509 eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6]
510 str $s0,[$key],#24
511 eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
512 str $s1,[$key,#-20]
513 eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
514 str $s2,[$key,#-16]
515 subs $rounds,$rounds,#1
516 str $s3,[$key,#-12]
517 subeq r2,$key,#216 @ last pass: point r2 back at the start of the schedule
518 beq .Ldone
519
520 ldr $i1,[$key,#-32]
521 ldr $i2,[$key,#-28]
522 eor $i1,$i1,$s3 @ rk[10]=rk[4]^rk[9]
523 eor $i3,$i2,$i1 @ rk[11]=rk[5]^rk[10]
524 str $i1,[$key,#-8]
525 str $i3,[$key,#-4]
526 b .L192_loop
527
528.Lnot192:
529 ldrb $i2,[$rounds,#27]
530 ldrb $t1,[$rounds,#26]
531 ldrb $t2,[$rounds,#25]
532 ldrb $t3,[$rounds,#24]
533 orr $i2,$i2,$t1,lsl#8
534 ldrb $i3,[$rounds,#31]
535 orr $i2,$i2,$t2,lsl#16
536 ldrb $t1,[$rounds,#30]
537 orr $i2,$i2,$t3,lsl#24
538 ldrb $t2,[$rounds,#29]
539 ldrb $t3,[$rounds,#28]
540 orr $i3,$i3,$t1,lsl#8
541 orr $i3,$i3,$t2,lsl#16
542 str $i2,[$key],#8
543 orr $i3,$i3,$t3,lsl#24
544 str $i3,[$key,#-4]
545
546 mov $rounds,#14 @ 14 rounds for a 256-bit key
547 str $rounds,[$key,#240-32]
548 add $t3,$tbl,#256 @ rcon
549 mov lr,#255
550 mov $rounds,#7 @ iteration count: each pass emits up to 8 words
551
552.L256_loop:
553 and $t2,lr,$i3,lsr#24
554 and $i1,lr,$i3,lsr#16
555 ldrb $t2,[$tbl,$t2]
556 and $i2,lr,$i3,lsr#8
557 ldrb $i1,[$tbl,$i1]
558 and $i3,lr,$i3
559 ldrb $i2,[$tbl,$i2]
560 orr $t2,$t2,$i1,lsl#24
561 ldrb $i3,[$tbl,$i3]
562 orr $t2,$t2,$i2,lsl#16
563 ldr $t1,[$t3],#4 @ rcon[i++]
564 orr $t2,$t2,$i3,lsl#8
565 eor $i3,$t2,$t1
566 eor $s0,$s0,$i3 @ rk[8]=rk[0]^...
567 eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8]
568 str $s0,[$key],#32
569 eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
570 str $s1,[$key,#-28]
571 eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
572 str $s2,[$key,#-24]
573 subs $rounds,$rounds,#1
574 str $s3,[$key,#-20]
575 subeq r2,$key,#256 @ last pass: point r2 back at the start of the schedule
576 beq .Ldone
577
578 and $t2,lr,$s3 @ S-box each byte of rk[11] in place: no rotation, no rcon
579 and $i1,lr,$s3,lsr#8
580 ldrb $t2,[$tbl,$t2]
581 and $i2,lr,$s3,lsr#16
582 ldrb $i1,[$tbl,$i1]
583 and $i3,lr,$s3,lsr#24
584 ldrb $i2,[$tbl,$i2]
585 orr $t2,$t2,$i1,lsl#8
586 ldrb $i3,[$tbl,$i3]
587 orr $t2,$t2,$i2,lsl#16
588 ldr $t1,[$key,#-48]
589 orr $t2,$t2,$i3,lsl#24
590
591 ldr $i1,[$key,#-44]
592 ldr $i2,[$key,#-40]
593 eor $t1,$t1,$t2 @ rk[12]=rk[4]^...
594 ldr $i3,[$key,#-36]
595 eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12]
596 str $t1,[$key,#-16]
597 eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
598 str $i1,[$key,#-12]
599 eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
600 str $i2,[$key,#-8]
601 str $i3,[$key,#-4]
602 b .L256_loop
603
604.Ldone: mov r0,#0 @ return success
605 ldmia sp!,{r4-r12,lr}
606.Labrt: tst lr,#1
607 moveq pc,lr @ be binary compatible with V4, yet
608 bx lr @ interoperable with Thumb ISA:-)
609.size AES_set_encrypt_key,.-AES_set_encrypt_key
610@ AES_set_decrypt_key: build the encryption schedule, reverse the order of its round keys, then apply the tp9/tpb/tpd/tpe column mixing to every round key except the first and the last.
611.global AES_set_decrypt_key
612.type AES_set_decrypt_key,%function
613.align 5
614AES_set_decrypt_key:
615 str lr,[sp,#-4]! @ push lr
616 bl AES_set_encrypt_key @ same arguments; r0 is 0 on success
617 teq r0,#0
618 ldrne lr,[sp],#4 @ pop lr
619 bne .Labrt
620
621 stmdb sp!,{r4-r12} @ lr is already on the stack from the push above
622
623 ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
624 mov $key,r2 @ which is AES_KEY *key
625 mov $i1,r2
626 add $i2,r2,$rounds,lsl#4 @ address of the last round key
627
628.Linv: ldr $s0,[$i1] @ swap 16-byte round keys, working inwards from both ends
629 ldr $s1,[$i1,#4]
630 ldr $s2,[$i1,#8]
631 ldr $s3,[$i1,#12]
632 ldr $t1,[$i2]
633 ldr $t2,[$i2,#4]
634 ldr $t3,[$i2,#8]
635 ldr $i3,[$i2,#12]
636 str $s0,[$i2],#-16
637 str $s1,[$i2,#16+4]
638 str $s2,[$i2,#16+8]
639 str $s3,[$i2,#16+12]
640 str $t1,[$i1],#16
641 str $t2,[$i1,#-12]
642 str $t3,[$i1,#-8]
643 str $i3,[$i1,#-4]
644 teq $i1,$i2 @ stop when both pointers reach the middle round key
645 bne .Linv
646___
647$mask80=$i1; # the index registers are free after .Linv, so reuse them for the masks
648$mask1b=$i2;
649$mask7f=$i3;
650$code.=<<___;
651 ldr $s0,[$key,#16]! @ prefetch tp1
652 mov $mask80,#0x80
653 mov $mask1b,#0x1b
654 orr $mask80,$mask80,#0x8000
655 orr $mask1b,$mask1b,#0x1b00
656 orr $mask80,$mask80,$mask80,lsl#16 @ 0x80808080
657 orr $mask1b,$mask1b,$mask1b,lsl#16 @ 0x1b1b1b1b
658 sub $rounds,$rounds,#1
659 mvn $mask7f,$mask80 @ 0x7f7f7f7f
660 mov $rounds,$rounds,lsl#2 @ (rounds-1)*4
661
662.Lmix: and $t1,$s0,$mask80 @ doubling in GF(2^8) on all four bytes of the word at once
663 and $s1,$s0,$mask7f
664 sub $t1,$t1,$t1,lsr#7
665 and $t1,$t1,$mask1b
666 eor $s1,$t1,$s1,lsl#1 @ tp2
667
668 and $t1,$s1,$mask80
669 and $s2,$s1,$mask7f
670 sub $t1,$t1,$t1,lsr#7
671 and $t1,$t1,$mask1b
672 eor $s2,$t1,$s2,lsl#1 @ tp4
673
674 and $t1,$s2,$mask80
675 and $s3,$s2,$mask7f
676 sub $t1,$t1,$t1,lsr#7
677 and $t1,$t1,$mask1b
678 eor $s3,$t1,$s3,lsl#1 @ tp8
679
680 eor $t1,$s1,$s2
681 eor $t2,$s0,$s3 @ tp9
682 eor $t1,$t1,$s3 @ tpe
683 eor $t1,$t1,$s1,ror#24
684 eor $t1,$t1,$t2,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8)
685 eor $t1,$t1,$s2,ror#16
686 eor $t1,$t1,$t2,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16)
687 eor $t1,$t1,$t2,ror#8 @ ^= ROTATE(tp9,24)
688
689 ldr $s0,[$key,#4] @ prefetch tp1
690 str $t1,[$key],#4
691 subs $rounds,$rounds,#1
692 bne .Lmix
693
694 mov r0,#0
695 ldmia sp!,{r4-r12,lr} @ also pops the lr pushed on entry
696 tst lr,#1
697 moveq pc,lr @ be binary compatible with V4, yet
698 bx lr @ interoperable with Thumb ISA:-)
699.size AES_set_decrypt_key,.-AES_set_decrypt_key
700
701.type AES_Td,%object
702.align 5
703AES_Td:
704.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
705.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
706.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
707.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
708.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
709.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
710.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
711.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
712.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
713.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
714.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
715.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
716.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
717.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
718.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
719.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
720.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
721.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
722.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
723.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
724.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
725.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
726.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
727.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
728.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
729.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
730.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
731.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
732.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
733.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
734.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
735.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
736.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
737.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
738.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
739.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
740.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
741.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
742.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
743.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
744.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
745.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
746.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
747.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
748.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
749.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
750.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
751.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
752.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
753.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
754.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
755.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
756.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
757.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
758.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
759.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
760.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
761.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
762.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
763.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
764.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
765.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
766.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
767.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
768@ Td4[256]: byte-wide inverse S-box used by the last decryption round
769.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
770.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
771.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
772.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
773.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
774.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
775.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
776.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
777.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
778.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
779.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
780.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
781.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
782.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
783.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
784.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
785.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
786.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
787.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
788.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
789.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
790.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
791.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
792.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
793.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
794.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
795.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
796.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
797.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
798.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
799.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
800.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
801.size AES_Td,.-AES_Td
802@ AES_decrypt: load the 16 input bytes as four big-endian words, run the inverse rounds, store the result the same way.
803@ void AES_decrypt(const unsigned char *in, unsigned char *out,
804@ const AES_KEY *key) {
805.global AES_decrypt
806.type AES_decrypt,%function
807.align 5
808AES_decrypt:
809 sub r3,pc,#8 @ AES_decrypt
810 stmdb sp!,{r1,r4-r12,lr} @ keep out pointer (r1) on the stack along with r4-r12 and lr
811 mov $rounds,r0 @ inp
812 mov $key,r2 @ key schedule pointer
813 sub $tbl,r3,#AES_decrypt-AES_Td @ Td
814
815 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
816 ldrb $t1,[$rounds,#2] @ manner...
817 ldrb $t2,[$rounds,#1]
818 ldrb $t3,[$rounds,#0]
819 orr $s0,$s0,$t1,lsl#8
820 ldrb $s1,[$rounds,#7]
821 orr $s0,$s0,$t2,lsl#16
822 ldrb $t1,[$rounds,#6]
823 orr $s0,$s0,$t3,lsl#24
824 ldrb $t2,[$rounds,#5]
825 ldrb $t3,[$rounds,#4]
826 orr $s1,$s1,$t1,lsl#8
827 ldrb $s2,[$rounds,#11]
828 orr $s1,$s1,$t2,lsl#16
829 ldrb $t1,[$rounds,#10]
830 orr $s1,$s1,$t3,lsl#24
831 ldrb $t2,[$rounds,#9]
832 ldrb $t3,[$rounds,#8]
833 orr $s2,$s2,$t1,lsl#8
834 ldrb $s3,[$rounds,#15]
835 orr $s2,$s2,$t2,lsl#16
836 ldrb $t1,[$rounds,#14]
837 orr $s2,$s2,$t3,lsl#24
838 ldrb $t2,[$rounds,#13]
839 ldrb $t3,[$rounds,#12]
840 orr $s3,$s3,$t1,lsl#8
841 orr $s3,$s3,$t2,lsl#16
842 orr $s3,$s3,$t3,lsl#24
843
844 bl _armv4_AES_decrypt @ transforms the state held in r0-r3
845
846 ldr $rounds,[sp],#4 @ pop out
847 mov $t1,$s0,lsr#24 @ write output in endian-neutral
848 mov $t2,$s0,lsr#16 @ manner...
849 mov $t3,$s0,lsr#8
850 strb $t1,[$rounds,#0]
851 strb $t2,[$rounds,#1]
852 mov $t1,$s1,lsr#24
853 strb $t3,[$rounds,#2]
854 mov $t2,$s1,lsr#16
855 strb $s0,[$rounds,#3]
856 mov $t3,$s1,lsr#8
857 strb $t1,[$rounds,#4]
858 strb $t2,[$rounds,#5]
859 mov $t1,$s2,lsr#24
860 strb $t3,[$rounds,#6]
861 mov $t2,$s2,lsr#16
862 strb $s1,[$rounds,#7]
863 mov $t3,$s2,lsr#8
864 strb $t1,[$rounds,#8]
865 strb $t2,[$rounds,#9]
866 mov $t1,$s3,lsr#24
867 strb $t3,[$rounds,#10]
868 mov $t2,$s3,lsr#16
869 strb $s2,[$rounds,#11]
870 mov $t3,$s3,lsr#8
871 strb $t1,[$rounds,#12]
872 strb $t2,[$rounds,#13]
873 strb $t3,[$rounds,#14]
874 strb $s3,[$rounds,#15]
875
876 ldmia sp!,{r4-r12,lr} @ restore the registers saved on entry
877 tst lr,#1 @ bit 0 of the return address selects the return instruction
878 moveq pc,lr @ be binary compatible with V4, yet
879 bx lr @ interoperable with Thumb ISA:-)
880.size AES_decrypt,.-AES_decrypt
881@ _armv4_AES_decrypt: state in r0-r3, Td base in r10, key schedule in r11; result is left in r0-r3. Uses r4-r9, r12 and lr as scratch; r11 is advanced.
882.type _armv4_AES_decrypt,%function
883.align 2
884_armv4_AES_decrypt:
885 str lr,[sp,#-4]! @ push lr
886 ldmia $key!,{$t1-$i1} @ load round key 0 and step the key pointer by 16
887 eor $s0,$s0,$t1
888 ldr $rounds,[$key,#240-16] @ rounds field, offset 240 from the start of the schedule
889 eor $s1,$s1,$t2
890 eor $s2,$s2,$t3
891 eor $s3,$s3,$i1
892 sub $rounds,$rounds,#1 @ the final round is done after the loop
893 mov lr,#255 @ lr doubles as the byte mask
894
895 and $i1,lr,$s0,lsr#16
896 and $i2,lr,$s0,lsr#8
897 and $i3,lr,$s0
898 mov $s0,$s0,lsr#24
899.Ldec_loop:
900 ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16]
901 and $i1,lr,$s1 @ i0
902 ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
903 and $i2,lr,$s1,lsr#16
904 ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
905 and $i3,lr,$s1,lsr#8
906 ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
907 mov $s1,$s1,lsr#24
908
909 ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0]
910 ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16]
911 ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8]
912 eor $s0,$s0,$i1,ror#24
913 ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
914 and $i1,lr,$s2,lsr#8 @ i0
915 eor $t2,$i2,$t2,ror#8
916 and $i2,lr,$s2 @ i1
917 eor $t3,$i3,$t3,ror#8
918 and $i3,lr,$s2,lsr#16
919 eor $s1,$s1,$t1,ror#8
920 ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
921 mov $s2,$s2,lsr#24
922
923 ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
924 ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
925 eor $s0,$s0,$i1,ror#16
926 ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
927 and $i1,lr,$s3,lsr#16 @ i0
928 eor $s1,$s1,$i2,ror#24
929 and $i2,lr,$s3,lsr#8 @ i1
930 eor $t3,$i3,$t3,ror#8
931 and $i3,lr,$s3 @ i2
932 eor $s2,$s2,$t2,ror#8
933 ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
934 mov $s3,$s3,lsr#24
935
936 ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
937 ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
938 eor $s0,$s0,$i1,ror#8
939 ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
940 eor $s1,$s1,$i2,ror#16
941 eor $s2,$s2,$i3,ror#24
942 ldr $i1,[$key],#16
943 eor $s3,$s3,$t3,ror#8
944
945 ldr $t1,[$key,#-12]
946 ldr $t2,[$key,#-8]
947 eor $s0,$s0,$i1
948 ldr $t3,[$key,#-4]
949 and $i1,lr,$s0,lsr#16
950 eor $s1,$s1,$t1
951 and $i2,lr,$s0,lsr#8
952 eor $s2,$s2,$t2
953 and $i3,lr,$s0
954 eor $s3,$s3,$t3
955 mov $s0,$s0,lsr#24
956
957 subs $rounds,$rounds,#1
958 bne .Ldec_loop
959
960 add $tbl,$tbl,#1024 @ Td4 sits right after the 256-word Td table
961
962 ldr $t2,[$tbl,#0] @ prefetch Td4
963 ldr $t3,[$tbl,#32]
964 ldr $t1,[$tbl,#64]
965 ldr $t2,[$tbl,#96]
966 ldr $t3,[$tbl,#128]
967 ldr $t1,[$tbl,#160]
968 ldr $t2,[$tbl,#192]
969 ldr $t3,[$tbl,#224]
970
971 ldrb $s0,[$tbl,$s0] @ Td4[s0>>24]
972 ldrb $t1,[$tbl,$i1] @ Td4[s0>>16]
973 and $i1,lr,$s1 @ i0
974 ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
975 and $i2,lr,$s1,lsr#16
976 ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
977 and $i3,lr,$s1,lsr#8
978
979 ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
980 ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
981 ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
982 eor $s0,$i1,$s0,lsl#24
983 ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
984 eor $s1,$t1,$s1,lsl#8
985 and $i1,lr,$s2,lsr#8 @ i0
986 eor $t2,$t2,$i2,lsl#8
987 and $i2,lr,$s2 @ i1
988 eor $t3,$t3,$i3,lsl#8
989 ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
990 and $i3,lr,$s2,lsr#16
991
992 ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
993 ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
994 eor $s0,$s0,$i1,lsl#8
995 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
996 eor $s1,$i2,$s1,lsl#16
997 and $i1,lr,$s3,lsr#16 @ i0
998 eor $s2,$t2,$s2,lsl#16
999 and $i2,lr,$s3,lsr#8 @ i1
1000 eor $t3,$t3,$i3,lsl#16
1001 ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
1002 and $i3,lr,$s3 @ i2
1003
1004 ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
1005 ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
1006 ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
1007 eor $s0,$s0,$i1,lsl#16
1008 ldr $i1,[$key,#0]
1009 eor $s1,$s1,$i2,lsl#8
1010 ldr $t1,[$key,#4]
1011 eor $s2,$i3,$s2,lsl#8
1012 ldr $t2,[$key,#8]
1013 eor $s3,$t3,$s3,lsl#24
1014 ldr $t3,[$key,#12]
1015
1016 eor $s0,$s0,$i1
1017 eor $s1,$s1,$t1
1018 eor $s2,$s2,$t2
1019 eor $s3,$s3,$t3
1020
1021 sub $tbl,$tbl,#1024 @ restore the Td base for the caller
1022 ldr pc,[sp],#4 @ pop and return
1023.size _armv4_AES_decrypt,.-_armv4_AES_decrypt
1024.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
1025.align 2
1026___
1027
1028$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
1029print $code;
1030close STDOUT or die "error closing STDOUT: $!"; # enforce flush; a failed flush would otherwise leave a silently truncated file
diff --git a/src/lib/libcrypto/aes/asm/aes-ia64.S b/src/lib/libcrypto/aes/asm/aes-ia64.S
deleted file mode 100644
index 7f6c4c3662..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-ia64.S
+++ /dev/null
@@ -1,1123 +0,0 @@
1// ====================================================================
2// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
3// project. Rights for redistribution and usage in source and binary
4// forms are granted according to the OpenSSL license.
5// ====================================================================
6//
7// What's wrong with compiler generated code? Compiler never uses
8// variable 'shr' which is pairable with 'extr'/'dep' instructions.
9// Then it uses 'zxt' which is an I-type, but can be replaced with
10// 'and' which in turn can be assigned to M-port [there are twice as
11// many M-ports as there are I-ports on Itanium 2]. By sacrificing a few
12// registers for small constants (255, 24 and 16) to be used with
13// 'shr' and 'and' instructions I can achieve better ILP, Instruction
14// Level Parallelism, and performance. This code outperforms GCC 3.3
15// generated code by over factor of 2 (two), GCC 3.4 - by 70% and
16// HP C - by 40%. Measured best-case scenario, i.e. aligned
17// big-endian input, ECB timing on Itanium 2 is (18 + 13*rounds)
18// ticks per block, or 9.25 CPU cycles per byte for 128 bit key.
19
20// Version 1.2 mitigates the hazard of cache-timing attacks by
21// a) compressing S-boxes from 8KB to 2KB+256B, b) scheduling
22// references to S-boxes for L2 cache latency, c) prefetching T[ed]4
23// prior to the last round. As a result performance dropped to (26 + 15*rounds)
24// ticks per block or 11 cycles per byte processed with 128-bit key.
25// This is ~16% deterioration. For reference Itanium 2 L1 cache has
26// 64 bytes line size and L2 - 128 bytes...
27
28.ident "aes-ia64.S, version 1.2"
29.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
30.explicit
31.text
32
// Register aliases and build-time macros used by the encrypt and decrypt
// routines in this file.
//
// rk0/rk1: the two key-schedule pointers; every LDKEY through them
// post-increments by 2*KSZ.
33rk0=r8; rk1=r9;
34
// Scratch registers in which the public entry points keep ar.pfs, ar.lc and
// the predicate file, followed by the small constants held in registers
// (loaded with 0xff, 24 and 16 by the entry points).
35pfssave=r2;
36lcsave=r10;
37prsave=r3;
38maskff=r11;
39twenty4=r14;
40sixteen=r15;
41
// teXY: temporary for the lookup in table X that is indexed by a byte of
// state word sY.  Each one first holds the index/address and then the value
// that was loaded from it.
42te00=r16; te11=r17; te22=r18; te33=r19;
43te01=r20; te12=r21; te23=r22; te30=r23;
44te02=r24; te13=r25; te20=r26; te31=r27;
45te03=r28; te10=r29; te21=r30; te32=r31;
46
// t0-t3 receive the round key and accumulate the new state; s0-s3 hold the
// current state.  They sit pairwise in adjacent registers r32-r39, the eight
// registers the core routines declare as rotating in their alloc.
47// these are rotating...
48t0=r32; s0=r33;
49t1=r34; s1=r35;
50t2=r36; s2=r37;
51t3=r38; s3=r39;
52
// te0-te3: table pointers (te3 doubles as the round count on entry to the
// core routines).
53te0=r40; te1=r41; te2=r42; te3=r43;
54
// ADDP: pointer add; addp4 is selected for HP-UX builds without _LP64.
55#if defined(_HPUX_SOURCE) && !defined(_LP64)
56# define ADDP addp4
57#else
58# define ADDP add
59#endif
60
// Byte offsets added to the te0 base to form te1-te3; TE1 and TE3 swap
// between the big-endian and the default build.
// NOTE(review): every table word is stored twice (see the data4 pairs at
// AES_Te), so an ld4 at offset 1..3 presumably yields the byte-rotated entry
// that a separate Te1..Te3 table would hold - confirm.
61// Offsets from Te0
62#define TE0 0
63#define TE2 2
64#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
65#define TE1 3
66#define TE3 1
67#else
68#define TE1 1
69#define TE3 3
70#endif
71
// KSZ is the size in bytes of one key-schedule element and LDKEY the load
// used to fetch it; both may be predefined by the build.
72// This implies that AES_KEY comprises 32-bit key schedule elements
73// even on LP64 platforms.
74#ifndef KSZ
75# define KSZ 4
76# define LDKEY ld4
77#endif
78
// _ia64_AES_encrypt: core block transform called by AES_encrypt.
// It is entered with br.call through b6 (hence .altrp b6) and returns with
// br.ret b6.  Its operands arrive in r32-r43 (12 inputs in the alloc below):
// t0/s0 .. t3/s3 interleaved in r32-r39 and te0-te3 in r40-r43.
79.proc _ia64_AES_encrypt#
80// Input: rk0-rk1
81// te0
82// te3 as AES_KEY->rounds!!!
83// s0-s3
84// maskff,twenty4,sixteen
85// Output: r16,r20,r24,r28 as s0-s3
86// Clobber: r16-r31,rk0-rk1,r32-r43
87.align 32
88_ia64_AES_encrypt:
89 .prologue
90 .altrp b6
91 .body
// Loop setup: 8 rotating registers (r32-r39), p16 as the only rotating
// predicate that is set, epilogue count 2 and loop count rounds-3.  te3
// carries the round count until it has been copied to ar.lc; only then is it
// turned into a table pointer.  The first four key words are loaded and
// xor-ed into the state before the loop starts.
92{ .mmi; alloc r16=ar.pfs,12,0,0,8
93 LDKEY t0=[rk0],2*KSZ
94 mov pr.rot=1<<16 }
95{ .mmi; LDKEY t1=[rk1],2*KSZ
96 add te1=TE1,te0
97 add te3=-3,te3 };;
98{ .mib; LDKEY t2=[rk0],2*KSZ
99 mov ar.ec=2 }
100{ .mib; LDKEY t3=[rk1],2*KSZ
101 add te2=TE2,te0
102 brp.loop.imp .Le_top,.Le_end-16 };;
103
104{ .mmi; xor s0=s0,t0
105 xor s1=s1,t1
106 mov ar.lc=te3 }
107{ .mmi; xor s2=s2,t2
108 xor s3=s3,t3
109 add te3=TE3,te0 };;
110
// Main-round loop.  Trailing comments read cycle/column:operation.  Each pass
// loads four key words into t0-t3, extracts the sixteen state bytes, scales
// them by 8 (shladd ..,3,..) into the word table and xors the loaded words
// into t0-t3.
// NOTE(review): t* and s* occupy adjacent rotating registers, so br.ctop
// presumably renames each t into the matching s for the next pass - confirm
// against the IA-64 register rotation rules.
111.align 32
112.Le_top:
113{ .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
114 (p0) and te33=s3,maskff // 0/0:s3&0xff
115 (p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
116{ .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
117 (p0) and te30=s0,maskff // 0/1:s0&0xff
118 (p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24
119{ .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
120 (p0) shladd te33=te33,3,te3 // 1/0:te3+s3&0xff
121 (p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
122{ .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
123 (p0) shladd te30=te30,3,te3 // 1/1:te3+s0
124 (p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24
125{ .mmi; (p0) ld4 te33=[te33] // 2/0:te3[s3&0xff]
126 (p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff
127 (p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
128{ .mmi; (p0) ld4 te30=[te30] // 2/1:te3[s0]
129 (p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8
130 (p0) shr.u te02=s2,twenty4 };; // 2/2:s2>>24
131{ .mmi; (p0) ld4 te22=[te22] // 3/0:te2[s2>>8]
132 (p0) shladd te20=te20,3,te2 // 3/2:te2+s0>>8
133 (p0) extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
134{ .mmi; (p0) ld4 te23=[te23] // 3/1:te2[s3>>8]
135 (p0) shladd te00=te00,3,te0 // 3/0:te0+s0>>24
136 (p0) shr.u te03=s3,twenty4 };; // 3/3:s3>>24
137{ .mmi; (p0) ld4 te20=[te20] // 4/2:te2[s0>>8]
138 (p0) shladd te21=te21,3,te2 // 4/3:te2+s1>>8
139 (p0) extr.u te11=s1,16,8 } // 4/0:s1>>16&0xff
140{ .mmi; (p0) ld4 te00=[te00] // 4/0:te0[s0>>24]
141 (p0) shladd te01=te01,3,te0 // 4/1:te0+s1>>24
142 (p0) shr.u te13=s3,sixteen };; // 4/2:s3>>16
143{ .mmi; (p0) ld4 te21=[te21] // 5/3:te2[s1>>8]
144 (p0) shladd te11=te11,3,te1 // 5/0:te1+s1>>16
145 (p0) extr.u te12=s2,16,8 } // 5/1:s2>>16&0xff
146{ .mmi; (p0) ld4 te01=[te01] // 5/1:te0[s1>>24]
147 (p0) shladd te02=te02,3,te0 // 5/2:te0+s2>>24
148 (p0) and te31=s1,maskff };; // 5/2:s1&0xff
149{ .mmi; (p0) ld4 te11=[te11] // 6/0:te1[s1>>16]
150 (p0) shladd te12=te12,3,te1 // 6/1:te1+s2>>16
151 (p0) extr.u te10=s0,16,8 } // 6/3:s0>>16&0xff
152{ .mmi; (p0) ld4 te02=[te02] // 6/2:te0[s2>>24]
153 (p0) shladd te03=te03,3,te0 // 6/3:te0+s3>>24
154 (p0) and te32=s2,maskff };; // 6/3:s2&0xff
155
156{ .mmi; (p0) ld4 te12=[te12] // 7/1:te1[s2>>16]
157 (p0) shladd te31=te31,3,te3 // 7/2:te3+s1&0xff
158 (p0) and te13=te13,maskff} // 7/2:s3>>16&0xff
159{ .mmi; (p0) ld4 te03=[te03] // 7/3:te0[s3>>24]
160 (p0) shladd te32=te32,3,te3 // 7/3:te3+s2
161 (p0) xor t0=t0,te33 };; // 7/0:
162{ .mmi; (p0) ld4 te31=[te31] // 8/2:te3[s1]
163 (p0) shladd te13=te13,3,te1 // 8/2:te1+s3>>16
164 (p0) xor t0=t0,te22 } // 8/0:
165{ .mmi; (p0) ld4 te32=[te32] // 8/3:te3[s2]
166 (p0) shladd te10=te10,3,te1 // 8/3:te1+s0>>16
167 (p0) xor t1=t1,te30 };; // 8/1:
168{ .mmi; (p0) ld4 te13=[te13] // 9/2:te1[s3>>16]
169 (p0) ld4 te10=[te10] // 9/3:te1[s0>>16]
170 (p0) xor t0=t0,te00 };; // 9/0: !L2 scheduling
171{ .mmi; (p0) xor t1=t1,te23 // 10[9]/1:
172 (p0) xor t2=t2,te20 // 10[9]/2:
173 (p0) xor t3=t3,te21 };; // 10[9]/3:
174{ .mmi; (p0) xor t0=t0,te11 // 11[10]/0:done!
175 (p0) xor t1=t1,te01 // 11[10]/1:
176 (p0) xor t2=t2,te02 };; // 11[10]/2: !L2 scheduling
177{ .mmi; (p0) xor t3=t3,te03 // 12[10]/3:
178 (p16) cmp.eq p0,p17=r0,r0 };; // 12[10]/clear (p17)
179{ .mmi; (p0) xor t1=t1,te12 // 13[11]/1:done!
180 (p0) xor t2=t2,te31 // 13[11]/2:
181 (p0) xor t3=t3,te32 } // 13[11]/3:
// The (p17) adds only take effect on a pass where p16 did not clear p17
// above.  They move te0-te3 from the 2048-byte word table to the byte table
// that follows it, at offsets 0, 64, 128 and 192.
182{ .mmi; (p17) add te0=2048,te0 // 13[11]/
183 (p17) add te1=2048+64-TE1,te1};; // 13[11]/
184{ .mib; (p0) xor t2=t2,te13 // 14[12]/2:done!
185 (p17) add te2=2048+128-TE2,te2} // 14[12]/
186{ .mib; (p0) xor t3=t3,te10 // 14[12]/3:done!
187 (p17) add te3=2048+192-TE3,te3 // 14[12]/
188 br.ctop.sptk .Le_top };;
189.Le_end:
190
191
// Touch the byte table through all four pointers before the last round; the
// loaded values are overwritten below and never used.
192{ .mmi; ld8 te12=[te0] // prefetch Te4
193 ld8 te31=[te1] }
194{ .mmi; ld8 te10=[te2]
195 ld8 te32=[te3] }
196
// Last round: byte-wide lookups (ld1) relative to te0.  The bytes are merged
// with dep/shl, xor-ed with the final four key words and left in
// r16/r20/r24/r28.
197{ .mmi; LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
198 and te33=s3,maskff // 0/0:s3&0xff
199 extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
200{ .mmi; LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
201 and te30=s0,maskff // 0/1:s0&0xff
202 shr.u te00=s0,twenty4 };; // 0/0:s0>>24
203{ .mmi; LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
204 add te33=te33,te0 // 1/0:te0+s3&0xff
205 extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
206{ .mmi; LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
207 add te30=te30,te0 // 1/1:te0+s0
208 shr.u te01=s1,twenty4 };; // 1/1:s1>>24
209{ .mmi; ld1 te33=[te33] // 2/0:te0[s3&0xff]
210 add te22=te22,te0 // 2/0:te0+s2>>8&0xff
211 extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
212{ .mmi; ld1 te30=[te30] // 2/1:te0[s0]
213 add te23=te23,te0 // 2/1:te0+s3>>8
214 shr.u te02=s2,twenty4 };; // 2/2:s2>>24
215{ .mmi; ld1 te22=[te22] // 3/0:te0[s2>>8]
216 add te20=te20,te0 // 3/2:te0+s0>>8
217 extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
218{ .mmi; ld1 te23=[te23] // 3/1:te0[s3>>8]
219 add te00=te00,te0 // 3/0:te0+s0>>24
220 shr.u te03=s3,twenty4 };; // 3/3:s3>>24
221{ .mmi; ld1 te20=[te20] // 4/2:te0[s0>>8]
222 add te21=te21,te0 // 4/3:te0+s1>>8
223 extr.u te11=s1,16,8 } // 4/0:s1>>16&0xff
224{ .mmi; ld1 te00=[te00] // 4/0:te0[s0>>24]
225 add te01=te01,te0 // 4/1:te0+s1>>24
226 shr.u te13=s3,sixteen };; // 4/2:s3>>16
227{ .mmi; ld1 te21=[te21] // 5/3:te0[s1>>8]
228 add te11=te11,te0 // 5/0:te0+s1>>16
229 extr.u te12=s2,16,8 } // 5/1:s2>>16&0xff
230{ .mmi; ld1 te01=[te01] // 5/1:te0[s1>>24]
231 add te02=te02,te0 // 5/2:te0+s2>>24
232 and te31=s1,maskff };; // 5/2:s1&0xff
233{ .mmi; ld1 te11=[te11] // 6/0:te0[s1>>16]
234 add te12=te12,te0 // 6/1:te0+s2>>16
235 extr.u te10=s0,16,8 } // 6/3:s0>>16&0xff
236{ .mmi; ld1 te02=[te02] // 6/2:te0[s2>>24]
237 add te03=te03,te0 // 6/3:te0+s3>>24
238 and te32=s2,maskff };; // 6/3:s2&0xff
239
240{ .mmi; ld1 te12=[te12] // 7/1:te0[s2>>16]
241 add te31=te31,te0 // 7/2:te0+s1&0xff
242 dep te33=te22,te33,8,8} // 7/0:
243{ .mmi; ld1 te03=[te03] // 7/3:te0[s3>>24]
244 add te32=te32,te0 // 7/3:te0+s2
245 and te13=te13,maskff};; // 7/2:s3>>16&0xff
246{ .mmi; ld1 te31=[te31] // 8/2:te0[s1]
247 add te13=te13,te0 // 8/2:te0+s3>>16
248 dep te30=te23,te30,8,8} // 8/1:
249{ .mmi; ld1 te32=[te32] // 8/3:te0[s2]
250 add te10=te10,te0 // 8/3:te0+s0>>16
251 shl te00=te00,twenty4};; // 8/0:
252{ .mii; ld1 te13=[te13] // 9/2:te0[s3>>16]
253 dep te33=te11,te33,16,8 // 9/0:
254 shl te01=te01,twenty4};; // 9/1:
255{ .mii; ld1 te10=[te10] // 10/3:te0[s0>>16]
256 dep te31=te20,te31,8,8 // 10/2:
257 shl te02=te02,twenty4};; // 10/2:
258{ .mii; xor t0=t0,te33 // 11/0:
259 dep te32=te21,te32,8,8 // 11/3:
260 shl te12=te12,sixteen};; // 11/1:
261{ .mii; xor r16=t0,te00 // 12/0:done!
262 dep te31=te13,te31,16,8 // 12/2:
263 shl te03=te03,twenty4};; // 12/3:
264{ .mmi; xor t1=t1,te01 // 13/1:
265 xor t2=t2,te02 // 13/2:
266 dep te32=te10,te32,16,8};; // 13/3:
267{ .mmi; xor t1=t1,te30 // 14/1:
268 xor r24=t2,te31 // 14/2:done!
269 xor t3=t3,te32 };; // 14/3:
270{ .mib; xor r20=t1,te12 // 15/1:done!
271 xor r28=t3,te03 // 15/3:done!
272 br.ret.sptk b6 };;
273.endp _ia64_AES_encrypt#
274
// Public entry point: encrypts the 16-byte block at in (in0) into out (in1)
// with the key schedule at key (in2).  It marshals the block into the output
// registers expected by _ia64_AES_encrypt (out1/out3/out5/out7 = s0-s3,
// out8 = table base, out11 = round count), calls it through b6 and stores
// r16/r20/r24/r28 as the result.
275// void AES_encrypt (const void *in,void *out,const AES_KEY *key);
276.global AES_encrypt#
277.proc AES_encrypt#
278.align 32
279AES_encrypt:
280 .prologue
281 .save ar.pfs,pfssave
// Frame: 3 inputs, 1 local (loc0 keeps the caller's user mask), 12 outputs.
// out0 receives the low two address bits of in for the alignment test and
// r3 receives ip so the table can be addressed relative to this routine.
282{ .mmi; alloc pfssave=ar.pfs,3,1,12,0
283 and out0=3,in0
284 mov r3=ip }
285{ .mmi; ADDP in0=0,in0
286 mov loc0=psr.um
287 ADDP out11=KSZ*60,in2 };; // &AES_KEY->rounds
288
// r3 is read here as ip and overwritten in the same bundle, since prsave is
// an alias for r3.
289{ .mmi; ld4 out11=[out11] // AES_KEY->rounds
290 add out8=(AES_Te#-AES_encrypt#),r3 // Te0
291 .save pr,prsave
292 mov prsave=pr }
293{ .mmi; rum 1<<3 // clear um.ac
294 .save ar.lc,lcsave
295 mov lcsave=ar.lc };;
296
297 .body
// Word-wise fast path, compiled only for HP-UX: used when in is 4-byte
// aligned, and again for the stores when out is 4-byte aligned (p6 records
// the out alignment test).  Without _HPUX_SOURCE control falls straight into
// .Le_i_unaligned.
298#if defined(_HPUX_SOURCE) // HPUX is big-endian, cut 15+15 cycles...
299{ .mib; cmp.ne p6,p0=out0,r0
300 add out0=4,in0
301(p6) br.dpnt.many .Le_i_unaligned };;
302
303{ .mmi; ld4 out1=[in0],8 // s0
304 and out9=3,in1
305 mov twenty4=24 }
306{ .mmi; ld4 out3=[out0],8 // s1
307 ADDP rk0=0,in2
308 mov sixteen=16 };;
309{ .mmi; ld4 out5=[in0] // s2
310 cmp.ne p6,p0=out9,r0
311 mov maskff=0xff }
312{ .mmb; ld4 out7=[out0] // s3
313 ADDP rk1=KSZ,in2
314 br.call.sptk.many b6=_ia64_AES_encrypt };;
315
316{ .mib; ADDP in0=4,in1
317 ADDP in1=0,in1
318(p6) br.spnt .Le_o_unaligned };;
319
// Aligned output: restore user mask, ar.pfs, ar.lc and predicates, then
// store the four result words.
320{ .mii; mov psr.um=loc0
321 mov ar.pfs=pfssave
322 mov ar.lc=lcsave };;
323{ .mmi; st4 [in1]=r16,8 // s0
324 st4 [in0]=r20,8 // s1
325 mov pr=prsave,0x1ffff };;
326{ .mmb; st4 [in1]=r24 // s2
327 st4 [in0]=r28 // s3
328 br.ret.sptk.many b0 };;
329#endif
330
// Byte-wise input path: four interleaved pointers (in+0..in+3) load the 16
// input bytes one at a time.
331.align 32
332.Le_i_unaligned:
333{ .mmi; add out0=1,in0
334 add out2=2,in0
335 add out4=3,in0 };;
336{ .mmi; ld1 r16=[in0],4
337 ld1 r17=[out0],4 }//;;
338{ .mmi; ld1 r18=[out2],4
339 ld1 out1=[out4],4 };; // s0
340{ .mmi; ld1 r20=[in0],4
341 ld1 r21=[out0],4 }//;;
342{ .mmi; ld1 r22=[out2],4
343 ld1 out3=[out4],4 };; // s1
344{ .mmi; ld1 r24=[in0],4
345 ld1 r25=[out0],4 }//;;
346{ .mmi; ld1 r26=[out2],4
347 ld1 out5=[out4],4 };; // s2
348{ .mmi; ld1 r28=[in0]
349 ld1 r29=[out0] }//;;
350{ .mmi; ld1 r30=[out2]
351 ld1 out7=[out4] };; // s3
352
// Assemble each state word with the lowest-addressed byte in bits 24-31
// (byte 3 was loaded directly into the low byte), while setting up the
// key pointers and the constant registers in the spare slots.
353{ .mii;
354 dep out1=r16,out1,24,8 //;;
355 dep out3=r20,out3,24,8 }//;;
356{ .mii; ADDP rk0=0,in2
357 dep out5=r24,out5,24,8 //;;
358 dep out7=r28,out7,24,8 };;
359{ .mii; ADDP rk1=KSZ,in2
360 dep out1=r17,out1,16,8 //;;
361 dep out3=r21,out3,16,8 }//;;
362{ .mii; mov twenty4=24
363 dep out5=r25,out5,16,8 //;;
364 dep out7=r29,out7,16,8 };;
365{ .mii; mov sixteen=16
366 dep out1=r18,out1,8,8 //;;
367 dep out3=r22,out3,8,8 }//;;
368{ .mii; mov maskff=0xff
369 dep out5=r26,out5,8,8 //;;
370 dep out7=r30,out7,8,8 };;
371
// The core routine returns to the bundle after this call, which is the
// byte-wise output path below.
372{ .mib; br.call.sptk.many b6=_ia64_AES_encrypt };;
373
// Byte-wise output path: split r16/r20/r24/r28 into bytes and store them
// most-significant byte first through four interleaved pointers
// (out+0..out+3), restoring pr, ar.pfs, ar.lc and the user mask on the way.
374.Le_o_unaligned:
375{ .mii; ADDP out0=0,in1
376 extr.u r17=r16,8,8 // s0
377 shr.u r19=r16,twenty4 }//;;
378{ .mii; ADDP out1=1,in1
379 extr.u r18=r16,16,8
380 shr.u r23=r20,twenty4 }//;; // s1
381{ .mii; ADDP out2=2,in1
382 extr.u r21=r20,8,8
383 shr.u r22=r20,sixteen }//;;
384{ .mii; ADDP out3=3,in1
385 extr.u r25=r24,8,8 // s2
386 shr.u r27=r24,twenty4 };;
387{ .mii; st1 [out3]=r16,4
388 extr.u r26=r24,16,8
389 shr.u r31=r28,twenty4 }//;; // s3
390{ .mii; st1 [out2]=r17,4
391 extr.u r29=r28,8,8
392 shr.u r30=r28,sixteen }//;;
393
394{ .mmi; st1 [out1]=r18,4
395 st1 [out0]=r19,4 };;
396{ .mmi; st1 [out3]=r20,4
397 st1 [out2]=r21,4 }//;;
398{ .mmi; st1 [out1]=r22,4
399 st1 [out0]=r23,4 };;
400{ .mmi; st1 [out3]=r24,4
401 st1 [out2]=r25,4
402 mov pr=prsave,0x1ffff }//;;
403{ .mmi; st1 [out1]=r26,4
404 st1 [out0]=r27,4
405 mov ar.pfs=pfssave };;
406{ .mmi; st1 [out3]=r28
407 st1 [out2]=r29
408 mov ar.lc=lcsave }//;;
409{ .mmi; st1 [out1]=r30
410 st1 [out0]=r31 }
411{ .mfb; mov psr.um=loc0 // restore user mask
412 br.ret.sptk.many b0 };;
413.endp AES_encrypt#
414
415// *AES_decrypt are autogenerated by the following script:
416#if 0
417#!/usr/bin/env perl
# Reads the encrypt source (files named on the command line, or standard
# input) and prints the decrypt counterpart on standard output.
#
# Start by printing a copy of this script, wrapped in a disabled
# preprocessor region, so the generated code carries its own recipe.
418print "// *AES_decrypt are autogenerated by the following script:\n#if 0\n";
419open(PROG,'<'.$0); while(<PROG>) { print; } close(PROG);
420print "#endif\n";
421while(<>) {
 # Pass nothing through until the core encrypt procedure begins.
422 $process=1 if (/\.proc\s+_ia64_AES_encrypt/);
423 next if (!$process);
424
 # Rename the lookup temporaries and the state word each one is taken from.
 # The new names are spelled with a td prefix first so that a register that
 # was already renamed cannot be matched by a later rule; td is turned back
 # into te further down.  Rules that are commented out map a name to itself.
425 #s/te00=s0/td00=s0/; s/te00/td00/g;
426 s/te11=s1/td13=s3/; s/te11/td13/g;
427 #s/te22=s2/td22=s2/; s/te22/td22/g;
428 s/te33=s3/td31=s1/; s/te33/td31/g;
429
430 #s/te01=s1/td01=s1/; s/te01/td01/g;
431 s/te12=s2/td10=s0/; s/te12/td10/g;
432 #s/te23=s3/td23=s3/; s/te23/td23/g;
433 s/te30=s0/td32=s2/; s/te30/td32/g;
434
435 #s/te02=s2/td02=s2/; s/te02/td02/g;
436 s/te13=s3/td11=s1/; s/te13/td11/g;
437 #s/te20=s0/td20=s0/; s/te20/td20/g;
438 s/te31=s1/td33=s3/; s/te31/td33/g;
439
440 #s/te03=s3/td03=s3/; s/te03/td03/g;
441 s/te10=s0/td12=s2/; s/te10/td12/g;
442 #s/te21=s1/td21=s1/; s/te21/td21/g;
443 s/te32=s2/td30=s0/; s/te32/td30/g;
444
 # Restore the te spelling now that all renames for this line are done.
445 s/td/te/g;
446
 # Retarget procedure names, local labels and the table symbol.
447 s/AES_encrypt/AES_decrypt/g;
448 s/\.Le_/.Ld_/g;
449 s/AES_Te#/AES_Td#/g;
450
451 print;
452
 # Stop once the closing .endp of the public routine has been printed.
453 exit if (/\.endp\s+AES_decrypt/);
454}
455#endif
// _ia64_AES_decrypt: core block transform called by AES_decrypt.
// It is entered with br.call through b6 (hence .altrp b6) and returns with
// br.ret b6.  Its operands arrive in r32-r43 (12 inputs in the alloc below):
// t0/s0 .. t3/s3 interleaved in r32-r39 and te0-te3 in r40-r43.  The te*
// names are kept from the encrypt routine even though AES_decrypt passes
// the AES_Td table in te0.
//
// NOTE: the comment preceding this routine says it is generated from
// _ia64_AES_encrypt by renaming registers.  Most trailing comments were
// carried over unchanged, so where the source word was renamed they still
// describe the encrypt data flow (e.g. "s3&0xff" next to "and te31=s1").
// Trust the instructions, not the trailing comments.
456.proc _ia64_AES_decrypt#
457// Input: rk0-rk1
458// te0
459// te3 as AES_KEY->rounds!!!
460// s0-s3
461// maskff,twenty4,sixteen
462// Output: r16,r20,r24,r28 as s0-s3
463// Clobber: r16-r31,rk0-rk1,r32-r43
464.align 32
465_ia64_AES_decrypt:
466 .prologue
467 .altrp b6
468 .body
// Loop setup: 8 rotating registers (r32-r39), p16 as the only rotating
// predicate that is set, epilogue count 2 and loop count rounds-3.  te3
// carries the round count until it has been copied to ar.lc.  The first
// four key words are loaded and xor-ed into the state before the loop.
469{ .mmi; alloc r16=ar.pfs,12,0,0,8
470 LDKEY t0=[rk0],2*KSZ
471 mov pr.rot=1<<16 }
472{ .mmi; LDKEY t1=[rk1],2*KSZ
473 add te1=TE1,te0
474 add te3=-3,te3 };;
475{ .mib; LDKEY t2=[rk0],2*KSZ
476 mov ar.ec=2 }
477{ .mib; LDKEY t3=[rk1],2*KSZ
478 add te2=TE2,te0
479 brp.loop.imp .Ld_top,.Ld_end-16 };;
480
481{ .mmi; xor s0=s0,t0
482 xor s1=s1,t1
483 mov ar.lc=te3 }
484{ .mmi; xor s2=s2,t2
485 xor s3=s3,t3
486 add te3=TE3,te0 };;
487
// Main-round loop: each pass loads four key words into t0-t3, extracts the
// sixteen state bytes, scales them by 8 into the word table and xors the
// loaded words into t0-t3.
488.align 32
489.Ld_top:
490{ .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
491 (p0) and te31=s1,maskff // 0/0:s3&0xff
492 (p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
493{ .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
494 (p0) and te32=s2,maskff // 0/1:s0&0xff
495 (p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24
496{ .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
497 (p0) shladd te31=te31,3,te3 // 1/0:te0+s0>>24
498 (p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
499{ .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
500 (p0) shladd te32=te32,3,te3 // 1/1:te3+s0
501 (p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24
502{ .mmi; (p0) ld4 te31=[te31] // 2/0:te3[s3&0xff]
503 (p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff
504 (p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
505{ .mmi; (p0) ld4 te32=[te32] // 2/1:te3[s0]
506 (p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8
507 (p0) shr.u te02=s2,twenty4 };; // 2/2:s2>>24
508{ .mmi; (p0) ld4 te22=[te22] // 3/0:te2[s2>>8]
509 (p0) shladd te20=te20,3,te2 // 3/2:te2+s0>>8
510 (p0) extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
511{ .mmi; (p0) ld4 te23=[te23] // 3/1:te2[s3>>8]
512 (p0) shladd te00=te00,3,te0 // 3/0:te0+s0>>24
513 (p0) shr.u te03=s3,twenty4 };; // 3/3:s3>>24
514{ .mmi; (p0) ld4 te20=[te20] // 4/2:te2[s0>>8]
515 (p0) shladd te21=te21,3,te2 // 4/3:te3+s2
516 (p0) extr.u te13=s3,16,8 } // 4/0:s1>>16&0xff
517{ .mmi; (p0) ld4 te00=[te00] // 4/0:te0[s0>>24]
518 (p0) shladd te01=te01,3,te0 // 4/1:te0+s1>>24
519 (p0) shr.u te11=s1,sixteen };; // 4/2:s3>>16
520{ .mmi; (p0) ld4 te21=[te21] // 5/3:te2[s1>>8]
521 (p0) shladd te13=te13,3,te1 // 5/0:te1+s1>>16
522 (p0) extr.u te10=s0,16,8 } // 5/1:s2>>16&0xff
523{ .mmi; (p0) ld4 te01=[te01] // 5/1:te0[s1>>24]
524 (p0) shladd te02=te02,3,te0 // 5/2:te0+s2>>24
525 (p0) and te33=s3,maskff };; // 5/2:s1&0xff
526{ .mmi; (p0) ld4 te13=[te13] // 6/0:te1[s1>>16]
527 (p0) shladd te10=te10,3,te1 // 6/1:te1+s2>>16
528 (p0) extr.u te12=s2,16,8 } // 6/3:s0>>16&0xff
529{ .mmi; (p0) ld4 te02=[te02] // 6/2:te0[s2>>24]
530 (p0) shladd te03=te03,3,te0 // 6/3:te1+s0>>16
531 (p0) and te30=s0,maskff };; // 6/3:s2&0xff
532
533{ .mmi; (p0) ld4 te10=[te10] // 7/1:te1[s2>>16]
534 (p0) shladd te33=te33,3,te3 // 7/2:te3+s1&0xff
535 (p0) and te11=te11,maskff} // 7/2:s3>>16&0xff
536{ .mmi; (p0) ld4 te03=[te03] // 7/3:te0[s3>>24]
537 (p0) shladd te30=te30,3,te3 // 7/3:te3+s2
538 (p0) xor t0=t0,te31 };; // 7/0:
539{ .mmi; (p0) ld4 te33=[te33] // 8/2:te3[s1]
540 (p0) shladd te11=te11,3,te1 // 8/2:te1+s3>>16
541 (p0) xor t0=t0,te22 } // 8/0:
542{ .mmi; (p0) ld4 te30=[te30] // 8/3:te3[s2]
543 (p0) shladd te12=te12,3,te1 // 8/3:te1+s0>>16
544 (p0) xor t1=t1,te32 };; // 8/1:
545{ .mmi; (p0) ld4 te11=[te11] // 9/2:te1[s3>>16]
546 (p0) ld4 te12=[te12] // 9/3:te1[s0>>16]
547 (p0) xor t0=t0,te00 };; // 9/0: !L2 scheduling
548{ .mmi; (p0) xor t1=t1,te23 // 10[9]/1:
549 (p0) xor t2=t2,te20 // 10[9]/2:
550 (p0) xor t3=t3,te21 };; // 10[9]/3:
551{ .mmi; (p0) xor t0=t0,te13 // 11[10]/0:done!
552 (p0) xor t1=t1,te01 // 11[10]/1:
553 (p0) xor t2=t2,te02 };; // 11[10]/2: !L2 scheduling
554{ .mmi; (p0) xor t3=t3,te03 // 12[10]/3:
555 (p16) cmp.eq p0,p17=r0,r0 };; // 12[10]/clear (p17)
556{ .mmi; (p0) xor t1=t1,te10 // 13[11]/1:done!
557 (p0) xor t2=t2,te33 // 13[11]/2:
558 (p0) xor t3=t3,te30 } // 13[11]/3:
// The (p17) adds only take effect on a pass where p16 did not clear p17
// above.  They move te0-te3 from the 2048-byte word table to the byte table
// that follows it, at offsets 0, 64, 128 and 192.
559{ .mmi; (p17) add te0=2048,te0 // 13[11]/
560 (p17) add te1=2048+64-TE1,te1};; // 13[11]/
561{ .mib; (p0) xor t2=t2,te11 // 14[12]/2:done!
562 (p17) add te2=2048+128-TE2,te2} // 14[12]/
563{ .mib; (p0) xor t3=t3,te12 // 14[12]/3:done!
564 (p17) add te3=2048+192-TE3,te3 // 14[12]/
565 br.ctop.sptk .Ld_top };;
566.Ld_end:
567
568
// Touch the byte table through all four pointers before the last round; the
// loaded values are overwritten below and never used.
569{ .mmi; ld8 te10=[te0] // prefetch Td4
570 ld8 te33=[te1] }
571{ .mmi; ld8 te12=[te2]
572 ld8 te30=[te3] }
573
// Last round: byte-wide lookups (ld1) relative to te0.  The bytes are merged
// with dep/shl, xor-ed with the final four key words and left in
// r16/r20/r24/r28.
574{ .mmi; LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
575 and te31=s1,maskff // 0/0:s3&0xff
576 extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
577{ .mmi; LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
578 and te32=s2,maskff // 0/1:s0&0xff
579 shr.u te00=s0,twenty4 };; // 0/0:s0>>24
580{ .mmi; LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
581 add te31=te31,te0 // 1/0:te0+s0>>24
582 extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
583{ .mmi; LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
584 add te32=te32,te0 // 1/1:te0+s0
585 shr.u te01=s1,twenty4 };; // 1/1:s1>>24
586{ .mmi; ld1 te31=[te31] // 2/0:te0[s3&0xff]
587 add te22=te22,te0 // 2/0:te0+s2>>8&0xff
588 extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
589{ .mmi; ld1 te32=[te32] // 2/1:te0[s0]
590 add te23=te23,te0 // 2/1:te0+s3>>8
591 shr.u te02=s2,twenty4 };; // 2/2:s2>>24
592{ .mmi; ld1 te22=[te22] // 3/0:te0[s2>>8]
593 add te20=te20,te0 // 3/2:te0+s0>>8
594 extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
595{ .mmi; ld1 te23=[te23] // 3/1:te0[s3>>8]
596 add te00=te00,te0 // 3/0:te0+s0>>24
597 shr.u te03=s3,twenty4 };; // 3/3:s3>>24
598{ .mmi; ld1 te20=[te20] // 4/2:te0[s0>>8]
599 add te21=te21,te0 // 4/3:te0+s2
600 extr.u te13=s3,16,8 } // 4/0:s1>>16&0xff
601{ .mmi; ld1 te00=[te00] // 4/0:te0[s0>>24]
602 add te01=te01,te0 // 4/1:te0+s1>>24
603 shr.u te11=s1,sixteen };; // 4/2:s3>>16
604{ .mmi; ld1 te21=[te21] // 5/3:te0[s1>>8]
605 add te13=te13,te0 // 5/0:te0+s1>>16
606 extr.u te10=s0,16,8 } // 5/1:s2>>16&0xff
607{ .mmi; ld1 te01=[te01] // 5/1:te0[s1>>24]
608 add te02=te02,te0 // 5/2:te0+s2>>24
609 and te33=s3,maskff };; // 5/2:s1&0xff
610{ .mmi; ld1 te13=[te13] // 6/0:te0[s1>>16]
611 add te10=te10,te0 // 6/1:te0+s2>>16
612 extr.u te12=s2,16,8 } // 6/3:s0>>16&0xff
613{ .mmi; ld1 te02=[te02] // 6/2:te0[s2>>24]
614 add te03=te03,te0 // 6/3:te0+s0>>16
615 and te30=s0,maskff };; // 6/3:s2&0xff
616
617{ .mmi; ld1 te10=[te10] // 7/1:te0[s2>>16]
618 add te33=te33,te0 // 7/2:te0+s1&0xff
619 dep te31=te22,te31,8,8} // 7/0:
620{ .mmi; ld1 te03=[te03] // 7/3:te0[s3>>24]
621 add te30=te30,te0 // 7/3:te0+s2
622 and te11=te11,maskff};; // 7/2:s3>>16&0xff
623{ .mmi; ld1 te33=[te33] // 8/2:te0[s1]
624 add te11=te11,te0 // 8/2:te0+s3>>16
625 dep te32=te23,te32,8,8} // 8/1:
626{ .mmi; ld1 te30=[te30] // 8/3:te0[s2]
627 add te12=te12,te0 // 8/3:te0+s0>>16
628 shl te00=te00,twenty4};; // 8/0:
629{ .mii; ld1 te11=[te11] // 9/2:te0[s3>>16]
630 dep te31=te13,te31,16,8 // 9/0:
631 shl te01=te01,twenty4};; // 9/1:
632{ .mii; ld1 te12=[te12] // 10/3:te0[s0>>16]
633 dep te33=te20,te33,8,8 // 10/2:
634 shl te02=te02,twenty4};; // 10/2:
635{ .mii; xor t0=t0,te31 // 11/0:
636 dep te30=te21,te30,8,8 // 11/3:
637 shl te10=te10,sixteen};; // 11/1:
638{ .mii; xor r16=t0,te00 // 12/0:done!
639 dep te33=te11,te33,16,8 // 12/2:
640 shl te03=te03,twenty4};; // 12/3:
641{ .mmi; xor t1=t1,te01 // 13/1:
642 xor t2=t2,te02 // 13/2:
643 dep te30=te12,te30,16,8};; // 13/3:
644{ .mmi; xor t1=t1,te32 // 14/1:
645 xor r24=t2,te33 // 14/2:done!
646 xor t3=t3,te30 };; // 14/3:
647{ .mib; xor r20=t1,te10 // 15/1:done!
648 xor r28=t3,te03 // 15/3:done!
649 br.ret.sptk b6 };;
650.endp _ia64_AES_decrypt#
651
// Public entry point: decrypts the 16-byte block at in (in0) into out (in1)
// with the key schedule at key (in2).  It marshals the block into the output
// registers expected by _ia64_AES_decrypt (out1/out3/out5/out7 = s0-s3,
// out8 = table base, out11 = round count), calls it through b6 and stores
// r16/r20/r24/r28 as the result.
652// void AES_decrypt (const void *in,void *out,const AES_KEY *key);
653.global AES_decrypt#
654.proc AES_decrypt#
655.align 32
656AES_decrypt:
657 .prologue
658 .save ar.pfs,pfssave
// Frame: 3 inputs, 1 local (loc0 keeps the caller's user mask), 12 outputs.
// out0 receives the low two address bits of in for the alignment test and
// r3 receives ip so the table can be addressed relative to this routine.
659{ .mmi; alloc pfssave=ar.pfs,3,1,12,0
660 and out0=3,in0
661 mov r3=ip }
662{ .mmi; ADDP in0=0,in0
663 mov loc0=psr.um
664 ADDP out11=KSZ*60,in2 };; // &AES_KEY->rounds
665
// r3 is read here as ip and overwritten in the same bundle, since prsave is
// an alias for r3.
666{ .mmi; ld4 out11=[out11] // AES_KEY->rounds
667 add out8=(AES_Td#-AES_decrypt#),r3 // AES_Td base
668 .save pr,prsave
669 mov prsave=pr }
670{ .mmi; rum 1<<3 // clear um.ac
671 .save ar.lc,lcsave
672 mov lcsave=ar.lc };;
673
674 .body
// Word-wise fast path, compiled only for HP-UX: used when in is 4-byte
// aligned, and again for the stores when out is 4-byte aligned (p6 records
// the out alignment test).  Without _HPUX_SOURCE control falls straight into
// .Ld_i_unaligned.
675#if defined(_HPUX_SOURCE) // HPUX is big-endian, cut 15+15 cycles...
676{ .mib; cmp.ne p6,p0=out0,r0
677 add out0=4,in0
678(p6) br.dpnt.many .Ld_i_unaligned };;
679
680{ .mmi; ld4 out1=[in0],8 // s0
681 and out9=3,in1
682 mov twenty4=24 }
683{ .mmi; ld4 out3=[out0],8 // s1
684 ADDP rk0=0,in2
685 mov sixteen=16 };;
686{ .mmi; ld4 out5=[in0] // s2
687 cmp.ne p6,p0=out9,r0
688 mov maskff=0xff }
689{ .mmb; ld4 out7=[out0] // s3
690 ADDP rk1=KSZ,in2
691 br.call.sptk.many b6=_ia64_AES_decrypt };;
692
693{ .mib; ADDP in0=4,in1
694 ADDP in1=0,in1
695(p6) br.spnt .Ld_o_unaligned };;
696
// Aligned output: restore user mask, ar.pfs, ar.lc and predicates, then
// store the four result words.
697{ .mii; mov psr.um=loc0
698 mov ar.pfs=pfssave
699 mov ar.lc=lcsave };;
700{ .mmi; st4 [in1]=r16,8 // s0
701 st4 [in0]=r20,8 // s1
702 mov pr=prsave,0x1ffff };;
703{ .mmb; st4 [in1]=r24 // s2
704 st4 [in0]=r28 // s3
705 br.ret.sptk.many b0 };;
706#endif
707
// Byte-wise input path: four interleaved pointers (in+0..in+3) load the 16
// input bytes one at a time.
708.align 32
709.Ld_i_unaligned:
710{ .mmi; add out0=1,in0
711 add out2=2,in0
712 add out4=3,in0 };;
713{ .mmi; ld1 r16=[in0],4
714 ld1 r17=[out0],4 }//;;
715{ .mmi; ld1 r18=[out2],4
716 ld1 out1=[out4],4 };; // s0
717{ .mmi; ld1 r20=[in0],4
718 ld1 r21=[out0],4 }//;;
719{ .mmi; ld1 r22=[out2],4
720 ld1 out3=[out4],4 };; // s1
721{ .mmi; ld1 r24=[in0],4
722 ld1 r25=[out0],4 }//;;
723{ .mmi; ld1 r26=[out2],4
724 ld1 out5=[out4],4 };; // s2
725{ .mmi; ld1 r28=[in0]
726 ld1 r29=[out0] }//;;
727{ .mmi; ld1 r30=[out2]
728 ld1 out7=[out4] };; // s3
729
// Assemble each state word with the lowest-addressed byte in bits 24-31
// (byte 3 was loaded directly into the low byte), while setting up the
// key pointers and the constant registers in the spare slots.
730{ .mii;
731 dep out1=r16,out1,24,8 //;;
732 dep out3=r20,out3,24,8 }//;;
733{ .mii; ADDP rk0=0,in2
734 dep out5=r24,out5,24,8 //;;
735 dep out7=r28,out7,24,8 };;
736{ .mii; ADDP rk1=KSZ,in2
737 dep out1=r17,out1,16,8 //;;
738 dep out3=r21,out3,16,8 }//;;
739{ .mii; mov twenty4=24
740 dep out5=r25,out5,16,8 //;;
741 dep out7=r29,out7,16,8 };;
742{ .mii; mov sixteen=16
743 dep out1=r18,out1,8,8 //;;
744 dep out3=r22,out3,8,8 }//;;
745{ .mii; mov maskff=0xff
746 dep out5=r26,out5,8,8 //;;
747 dep out7=r30,out7,8,8 };;
748
// The core routine returns to the bundle after this call, which is the
// byte-wise output path below.
749{ .mib; br.call.sptk.many b6=_ia64_AES_decrypt };;
750
// Byte-wise output path: split r16/r20/r24/r28 into bytes and store them
// most-significant byte first through four interleaved pointers
// (out+0..out+3), restoring pr, ar.pfs, ar.lc and the user mask on the way.
751.Ld_o_unaligned:
752{ .mii; ADDP out0=0,in1
753 extr.u r17=r16,8,8 // s0
754 shr.u r19=r16,twenty4 }//;;
755{ .mii; ADDP out1=1,in1
756 extr.u r18=r16,16,8
757 shr.u r23=r20,twenty4 }//;; // s1
758{ .mii; ADDP out2=2,in1
759 extr.u r21=r20,8,8
760 shr.u r22=r20,sixteen }//;;
761{ .mii; ADDP out3=3,in1
762 extr.u r25=r24,8,8 // s2
763 shr.u r27=r24,twenty4 };;
764{ .mii; st1 [out3]=r16,4
765 extr.u r26=r24,16,8
766 shr.u r31=r28,twenty4 }//;; // s3
767{ .mii; st1 [out2]=r17,4
768 extr.u r29=r28,8,8
769 shr.u r30=r28,sixteen }//;;
770
771{ .mmi; st1 [out1]=r18,4
772 st1 [out0]=r19,4 };;
773{ .mmi; st1 [out3]=r20,4
774 st1 [out2]=r21,4 }//;;
775{ .mmi; st1 [out1]=r22,4
776 st1 [out0]=r23,4 };;
777{ .mmi; st1 [out3]=r24,4
778 st1 [out2]=r25,4
779 mov pr=prsave,0x1ffff }//;;
780{ .mmi; st1 [out1]=r26,4
781 st1 [out0]=r27,4
782 mov ar.pfs=pfssave };;
783{ .mmi; st1 [out3]=r28
784 st1 [out2]=r29
785 mov ar.lc=lcsave }//;;
786{ .mmi; st1 [out1]=r30
787 st1 [out0]=r31 }
788{ .mfb; mov psr.um=loc0 // restore user mask
789 br.ret.sptk.many b0 };;
790.endp AES_decrypt#
791
792// leave it in .text segment...
793.align 64
794.global AES_Te#
795.type AES_Te#,@object
796AES_Te: data4 0xc66363a5,0xc66363a5, 0xf87c7c84,0xf87c7c84
797 data4 0xee777799,0xee777799, 0xf67b7b8d,0xf67b7b8d
798 data4 0xfff2f20d,0xfff2f20d, 0xd66b6bbd,0xd66b6bbd
799 data4 0xde6f6fb1,0xde6f6fb1, 0x91c5c554,0x91c5c554
800 data4 0x60303050,0x60303050, 0x02010103,0x02010103
801 data4 0xce6767a9,0xce6767a9, 0x562b2b7d,0x562b2b7d
802 data4 0xe7fefe19,0xe7fefe19, 0xb5d7d762,0xb5d7d762
803 data4 0x4dababe6,0x4dababe6, 0xec76769a,0xec76769a
804 data4 0x8fcaca45,0x8fcaca45, 0x1f82829d,0x1f82829d
805 data4 0x89c9c940,0x89c9c940, 0xfa7d7d87,0xfa7d7d87
806 data4 0xeffafa15,0xeffafa15, 0xb25959eb,0xb25959eb
807 data4 0x8e4747c9,0x8e4747c9, 0xfbf0f00b,0xfbf0f00b
808 data4 0x41adadec,0x41adadec, 0xb3d4d467,0xb3d4d467
809 data4 0x5fa2a2fd,0x5fa2a2fd, 0x45afafea,0x45afafea
810 data4 0x239c9cbf,0x239c9cbf, 0x53a4a4f7,0x53a4a4f7
811 data4 0xe4727296,0xe4727296, 0x9bc0c05b,0x9bc0c05b
812 data4 0x75b7b7c2,0x75b7b7c2, 0xe1fdfd1c,0xe1fdfd1c
813 data4 0x3d9393ae,0x3d9393ae, 0x4c26266a,0x4c26266a
814 data4 0x6c36365a,0x6c36365a, 0x7e3f3f41,0x7e3f3f41
815 data4 0xf5f7f702,0xf5f7f702, 0x83cccc4f,0x83cccc4f
816 data4 0x6834345c,0x6834345c, 0x51a5a5f4,0x51a5a5f4
817 data4 0xd1e5e534,0xd1e5e534, 0xf9f1f108,0xf9f1f108
818 data4 0xe2717193,0xe2717193, 0xabd8d873,0xabd8d873
819 data4 0x62313153,0x62313153, 0x2a15153f,0x2a15153f
820 data4 0x0804040c,0x0804040c, 0x95c7c752,0x95c7c752
821 data4 0x46232365,0x46232365, 0x9dc3c35e,0x9dc3c35e
822 data4 0x30181828,0x30181828, 0x379696a1,0x379696a1
823 data4 0x0a05050f,0x0a05050f, 0x2f9a9ab5,0x2f9a9ab5
824 data4 0x0e070709,0x0e070709, 0x24121236,0x24121236
825 data4 0x1b80809b,0x1b80809b, 0xdfe2e23d,0xdfe2e23d
826 data4 0xcdebeb26,0xcdebeb26, 0x4e272769,0x4e272769
827 data4 0x7fb2b2cd,0x7fb2b2cd, 0xea75759f,0xea75759f
828 data4 0x1209091b,0x1209091b, 0x1d83839e,0x1d83839e
829 data4 0x582c2c74,0x582c2c74, 0x341a1a2e,0x341a1a2e
830 data4 0x361b1b2d,0x361b1b2d, 0xdc6e6eb2,0xdc6e6eb2
831 data4 0xb45a5aee,0xb45a5aee, 0x5ba0a0fb,0x5ba0a0fb
832 data4 0xa45252f6,0xa45252f6, 0x763b3b4d,0x763b3b4d
833 data4 0xb7d6d661,0xb7d6d661, 0x7db3b3ce,0x7db3b3ce
834 data4 0x5229297b,0x5229297b, 0xdde3e33e,0xdde3e33e
835 data4 0x5e2f2f71,0x5e2f2f71, 0x13848497,0x13848497
836 data4 0xa65353f5,0xa65353f5, 0xb9d1d168,0xb9d1d168
837 data4 0x00000000,0x00000000, 0xc1eded2c,0xc1eded2c
838 data4 0x40202060,0x40202060, 0xe3fcfc1f,0xe3fcfc1f
839 data4 0x79b1b1c8,0x79b1b1c8, 0xb65b5bed,0xb65b5bed
840 data4 0xd46a6abe,0xd46a6abe, 0x8dcbcb46,0x8dcbcb46
841 data4 0x67bebed9,0x67bebed9, 0x7239394b,0x7239394b
842 data4 0x944a4ade,0x944a4ade, 0x984c4cd4,0x984c4cd4
843 data4 0xb05858e8,0xb05858e8, 0x85cfcf4a,0x85cfcf4a
844 data4 0xbbd0d06b,0xbbd0d06b, 0xc5efef2a,0xc5efef2a
845 data4 0x4faaaae5,0x4faaaae5, 0xedfbfb16,0xedfbfb16
846 data4 0x864343c5,0x864343c5, 0x9a4d4dd7,0x9a4d4dd7
847 data4 0x66333355,0x66333355, 0x11858594,0x11858594
848 data4 0x8a4545cf,0x8a4545cf, 0xe9f9f910,0xe9f9f910
849 data4 0x04020206,0x04020206, 0xfe7f7f81,0xfe7f7f81
850 data4 0xa05050f0,0xa05050f0, 0x783c3c44,0x783c3c44
851 data4 0x259f9fba,0x259f9fba, 0x4ba8a8e3,0x4ba8a8e3
852 data4 0xa25151f3,0xa25151f3, 0x5da3a3fe,0x5da3a3fe
853 data4 0x804040c0,0x804040c0, 0x058f8f8a,0x058f8f8a
854 data4 0x3f9292ad,0x3f9292ad, 0x219d9dbc,0x219d9dbc
855 data4 0x70383848,0x70383848, 0xf1f5f504,0xf1f5f504
856 data4 0x63bcbcdf,0x63bcbcdf, 0x77b6b6c1,0x77b6b6c1
857 data4 0xafdada75,0xafdada75, 0x42212163,0x42212163
858 data4 0x20101030,0x20101030, 0xe5ffff1a,0xe5ffff1a
859 data4 0xfdf3f30e,0xfdf3f30e, 0xbfd2d26d,0xbfd2d26d
860 data4 0x81cdcd4c,0x81cdcd4c, 0x180c0c14,0x180c0c14
861 data4 0x26131335,0x26131335, 0xc3ecec2f,0xc3ecec2f
862 data4 0xbe5f5fe1,0xbe5f5fe1, 0x359797a2,0x359797a2
863 data4 0x884444cc,0x884444cc, 0x2e171739,0x2e171739
864 data4 0x93c4c457,0x93c4c457, 0x55a7a7f2,0x55a7a7f2
865 data4 0xfc7e7e82,0xfc7e7e82, 0x7a3d3d47,0x7a3d3d47
866 data4 0xc86464ac,0xc86464ac, 0xba5d5de7,0xba5d5de7
867 data4 0x3219192b,0x3219192b, 0xe6737395,0xe6737395
868 data4 0xc06060a0,0xc06060a0, 0x19818198,0x19818198
869 data4 0x9e4f4fd1,0x9e4f4fd1, 0xa3dcdc7f,0xa3dcdc7f
870 data4 0x44222266,0x44222266, 0x542a2a7e,0x542a2a7e
871 data4 0x3b9090ab,0x3b9090ab, 0x0b888883,0x0b888883
872 data4 0x8c4646ca,0x8c4646ca, 0xc7eeee29,0xc7eeee29
873 data4 0x6bb8b8d3,0x6bb8b8d3, 0x2814143c,0x2814143c
874 data4 0xa7dede79,0xa7dede79, 0xbc5e5ee2,0xbc5e5ee2
875 data4 0x160b0b1d,0x160b0b1d, 0xaddbdb76,0xaddbdb76
876 data4 0xdbe0e03b,0xdbe0e03b, 0x64323256,0x64323256
877 data4 0x743a3a4e,0x743a3a4e, 0x140a0a1e,0x140a0a1e
878 data4 0x924949db,0x924949db, 0x0c06060a,0x0c06060a
879 data4 0x4824246c,0x4824246c, 0xb85c5ce4,0xb85c5ce4
880 data4 0x9fc2c25d,0x9fc2c25d, 0xbdd3d36e,0xbdd3d36e
881 data4 0x43acacef,0x43acacef, 0xc46262a6,0xc46262a6
882 data4 0x399191a8,0x399191a8, 0x319595a4,0x319595a4
883 data4 0xd3e4e437,0xd3e4e437, 0xf279798b,0xf279798b
884 data4 0xd5e7e732,0xd5e7e732, 0x8bc8c843,0x8bc8c843
885 data4 0x6e373759,0x6e373759, 0xda6d6db7,0xda6d6db7
886 data4 0x018d8d8c,0x018d8d8c, 0xb1d5d564,0xb1d5d564
887 data4 0x9c4e4ed2,0x9c4e4ed2, 0x49a9a9e0,0x49a9a9e0
888 data4 0xd86c6cb4,0xd86c6cb4, 0xac5656fa,0xac5656fa
889 data4 0xf3f4f407,0xf3f4f407, 0xcfeaea25,0xcfeaea25
890 data4 0xca6565af,0xca6565af, 0xf47a7a8e,0xf47a7a8e
891 data4 0x47aeaee9,0x47aeaee9, 0x10080818,0x10080818
892 data4 0x6fbabad5,0x6fbabad5, 0xf0787888,0xf0787888
893 data4 0x4a25256f,0x4a25256f, 0x5c2e2e72,0x5c2e2e72
894 data4 0x381c1c24,0x381c1c24, 0x57a6a6f1,0x57a6a6f1
895 data4 0x73b4b4c7,0x73b4b4c7, 0x97c6c651,0x97c6c651
896 data4 0xcbe8e823,0xcbe8e823, 0xa1dddd7c,0xa1dddd7c
897 data4 0xe874749c,0xe874749c, 0x3e1f1f21,0x3e1f1f21
898 data4 0x964b4bdd,0x964b4bdd, 0x61bdbddc,0x61bdbddc
899 data4 0x0d8b8b86,0x0d8b8b86, 0x0f8a8a85,0x0f8a8a85
900 data4 0xe0707090,0xe0707090, 0x7c3e3e42,0x7c3e3e42
901 data4 0x71b5b5c4,0x71b5b5c4, 0xcc6666aa,0xcc6666aa
902 data4 0x904848d8,0x904848d8, 0x06030305,0x06030305
903 data4 0xf7f6f601,0xf7f6f601, 0x1c0e0e12,0x1c0e0e12
904 data4 0xc26161a3,0xc26161a3, 0x6a35355f,0x6a35355f
905 data4 0xae5757f9,0xae5757f9, 0x69b9b9d0,0x69b9b9d0
906 data4 0x17868691,0x17868691, 0x99c1c158,0x99c1c158
907 data4 0x3a1d1d27,0x3a1d1d27, 0x279e9eb9,0x279e9eb9
908 data4 0xd9e1e138,0xd9e1e138, 0xebf8f813,0xebf8f813
909 data4 0x2b9898b3,0x2b9898b3, 0x22111133,0x22111133
910 data4 0xd26969bb,0xd26969bb, 0xa9d9d970,0xa9d9d970
911 data4 0x078e8e89,0x078e8e89, 0x339494a7,0x339494a7
912 data4 0x2d9b9bb6,0x2d9b9bb6, 0x3c1e1e22,0x3c1e1e22
913 data4 0x15878792,0x15878792, 0xc9e9e920,0xc9e9e920
914 data4 0x87cece49,0x87cece49, 0xaa5555ff,0xaa5555ff
915 data4 0x50282878,0x50282878, 0xa5dfdf7a,0xa5dfdf7a
916 data4 0x038c8c8f,0x038c8c8f, 0x59a1a1f8,0x59a1a1f8
917 data4 0x09898980,0x09898980, 0x1a0d0d17,0x1a0d0d17
918 data4 0x65bfbfda,0x65bfbfda, 0xd7e6e631,0xd7e6e631
919 data4 0x844242c6,0x844242c6, 0xd06868b8,0xd06868b8
920 data4 0x824141c3,0x824141c3, 0x299999b0,0x299999b0
921 data4 0x5a2d2d77,0x5a2d2d77, 0x1e0f0f11,0x1e0f0f11
922 data4 0x7bb0b0cb,0x7bb0b0cb, 0xa85454fc,0xa85454fc
923 data4 0x6dbbbbd6,0x6dbbbbd6, 0x2c16163a,0x2c16163a
924// Te4:
925 data1 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
926 data1 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
927 data1 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
928 data1 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
929 data1 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
930 data1 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
931 data1 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
932 data1 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
933 data1 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
934 data1 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
935 data1 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
936 data1 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
937 data1 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
938 data1 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
939 data1 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
940 data1 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
941 data1 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
942 data1 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
943 data1 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
944 data1 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
945 data1 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
946 data1 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
947 data1 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
948 data1 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
949 data1 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
950 data1 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
951 data1 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
952 data1 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
953 data1 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
954 data1 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
955 data1 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
956 data1 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
957.size AES_Te#,2048+256 // HP-UX assembler fails to ".-AES_Te#"
958
959.align 64
960.global AES_Td#
961.type AES_Td#,@object
962AES_Td: data4 0x51f4a750,0x51f4a750, 0x7e416553,0x7e416553
963 data4 0x1a17a4c3,0x1a17a4c3, 0x3a275e96,0x3a275e96
964 data4 0x3bab6bcb,0x3bab6bcb, 0x1f9d45f1,0x1f9d45f1
965 data4 0xacfa58ab,0xacfa58ab, 0x4be30393,0x4be30393
966 data4 0x2030fa55,0x2030fa55, 0xad766df6,0xad766df6
967 data4 0x88cc7691,0x88cc7691, 0xf5024c25,0xf5024c25
968 data4 0x4fe5d7fc,0x4fe5d7fc, 0xc52acbd7,0xc52acbd7
969 data4 0x26354480,0x26354480, 0xb562a38f,0xb562a38f
970 data4 0xdeb15a49,0xdeb15a49, 0x25ba1b67,0x25ba1b67
971 data4 0x45ea0e98,0x45ea0e98, 0x5dfec0e1,0x5dfec0e1
972 data4 0xc32f7502,0xc32f7502, 0x814cf012,0x814cf012
973 data4 0x8d4697a3,0x8d4697a3, 0x6bd3f9c6,0x6bd3f9c6
974 data4 0x038f5fe7,0x038f5fe7, 0x15929c95,0x15929c95
975 data4 0xbf6d7aeb,0xbf6d7aeb, 0x955259da,0x955259da
976 data4 0xd4be832d,0xd4be832d, 0x587421d3,0x587421d3
977 data4 0x49e06929,0x49e06929, 0x8ec9c844,0x8ec9c844
978 data4 0x75c2896a,0x75c2896a, 0xf48e7978,0xf48e7978
979 data4 0x99583e6b,0x99583e6b, 0x27b971dd,0x27b971dd
980 data4 0xbee14fb6,0xbee14fb6, 0xf088ad17,0xf088ad17
981 data4 0xc920ac66,0xc920ac66, 0x7dce3ab4,0x7dce3ab4
982 data4 0x63df4a18,0x63df4a18, 0xe51a3182,0xe51a3182
983 data4 0x97513360,0x97513360, 0x62537f45,0x62537f45
984 data4 0xb16477e0,0xb16477e0, 0xbb6bae84,0xbb6bae84
985 data4 0xfe81a01c,0xfe81a01c, 0xf9082b94,0xf9082b94
986 data4 0x70486858,0x70486858, 0x8f45fd19,0x8f45fd19
987 data4 0x94de6c87,0x94de6c87, 0x527bf8b7,0x527bf8b7
988 data4 0xab73d323,0xab73d323, 0x724b02e2,0x724b02e2
989 data4 0xe31f8f57,0xe31f8f57, 0x6655ab2a,0x6655ab2a
990 data4 0xb2eb2807,0xb2eb2807, 0x2fb5c203,0x2fb5c203
991 data4 0x86c57b9a,0x86c57b9a, 0xd33708a5,0xd33708a5
992 data4 0x302887f2,0x302887f2, 0x23bfa5b2,0x23bfa5b2
993 data4 0x02036aba,0x02036aba, 0xed16825c,0xed16825c
994 data4 0x8acf1c2b,0x8acf1c2b, 0xa779b492,0xa779b492
995 data4 0xf307f2f0,0xf307f2f0, 0x4e69e2a1,0x4e69e2a1
996 data4 0x65daf4cd,0x65daf4cd, 0x0605bed5,0x0605bed5
997 data4 0xd134621f,0xd134621f, 0xc4a6fe8a,0xc4a6fe8a
998 data4 0x342e539d,0x342e539d, 0xa2f355a0,0xa2f355a0
999 data4 0x058ae132,0x058ae132, 0xa4f6eb75,0xa4f6eb75
1000 data4 0x0b83ec39,0x0b83ec39, 0x4060efaa,0x4060efaa
1001 data4 0x5e719f06,0x5e719f06, 0xbd6e1051,0xbd6e1051
1002 data4 0x3e218af9,0x3e218af9, 0x96dd063d,0x96dd063d
1003 data4 0xdd3e05ae,0xdd3e05ae, 0x4de6bd46,0x4de6bd46
1004 data4 0x91548db5,0x91548db5, 0x71c45d05,0x71c45d05
1005 data4 0x0406d46f,0x0406d46f, 0x605015ff,0x605015ff
1006 data4 0x1998fb24,0x1998fb24, 0xd6bde997,0xd6bde997
1007 data4 0x894043cc,0x894043cc, 0x67d99e77,0x67d99e77
1008 data4 0xb0e842bd,0xb0e842bd, 0x07898b88,0x07898b88
1009 data4 0xe7195b38,0xe7195b38, 0x79c8eedb,0x79c8eedb
1010 data4 0xa17c0a47,0xa17c0a47, 0x7c420fe9,0x7c420fe9
1011 data4 0xf8841ec9,0xf8841ec9, 0x00000000,0x00000000
1012 data4 0x09808683,0x09808683, 0x322bed48,0x322bed48
1013 data4 0x1e1170ac,0x1e1170ac, 0x6c5a724e,0x6c5a724e
1014 data4 0xfd0efffb,0xfd0efffb, 0x0f853856,0x0f853856
1015 data4 0x3daed51e,0x3daed51e, 0x362d3927,0x362d3927
1016 data4 0x0a0fd964,0x0a0fd964, 0x685ca621,0x685ca621
1017 data4 0x9b5b54d1,0x9b5b54d1, 0x24362e3a,0x24362e3a
1018 data4 0x0c0a67b1,0x0c0a67b1, 0x9357e70f,0x9357e70f
1019 data4 0xb4ee96d2,0xb4ee96d2, 0x1b9b919e,0x1b9b919e
1020 data4 0x80c0c54f,0x80c0c54f, 0x61dc20a2,0x61dc20a2
1021 data4 0x5a774b69,0x5a774b69, 0x1c121a16,0x1c121a16
1022 data4 0xe293ba0a,0xe293ba0a, 0xc0a02ae5,0xc0a02ae5
1023 data4 0x3c22e043,0x3c22e043, 0x121b171d,0x121b171d
1024 data4 0x0e090d0b,0x0e090d0b, 0xf28bc7ad,0xf28bc7ad
1025 data4 0x2db6a8b9,0x2db6a8b9, 0x141ea9c8,0x141ea9c8
1026 data4 0x57f11985,0x57f11985, 0xaf75074c,0xaf75074c
1027 data4 0xee99ddbb,0xee99ddbb, 0xa37f60fd,0xa37f60fd
1028 data4 0xf701269f,0xf701269f, 0x5c72f5bc,0x5c72f5bc
1029 data4 0x44663bc5,0x44663bc5, 0x5bfb7e34,0x5bfb7e34
1030 data4 0x8b432976,0x8b432976, 0xcb23c6dc,0xcb23c6dc
1031 data4 0xb6edfc68,0xb6edfc68, 0xb8e4f163,0xb8e4f163
1032 data4 0xd731dcca,0xd731dcca, 0x42638510,0x42638510
1033 data4 0x13972240,0x13972240, 0x84c61120,0x84c61120
1034 data4 0x854a247d,0x854a247d, 0xd2bb3df8,0xd2bb3df8
1035 data4 0xaef93211,0xaef93211, 0xc729a16d,0xc729a16d
1036 data4 0x1d9e2f4b,0x1d9e2f4b, 0xdcb230f3,0xdcb230f3
1037 data4 0x0d8652ec,0x0d8652ec, 0x77c1e3d0,0x77c1e3d0
1038 data4 0x2bb3166c,0x2bb3166c, 0xa970b999,0xa970b999
1039 data4 0x119448fa,0x119448fa, 0x47e96422,0x47e96422
1040 data4 0xa8fc8cc4,0xa8fc8cc4, 0xa0f03f1a,0xa0f03f1a
1041 data4 0x567d2cd8,0x567d2cd8, 0x223390ef,0x223390ef
1042 data4 0x87494ec7,0x87494ec7, 0xd938d1c1,0xd938d1c1
1043 data4 0x8ccaa2fe,0x8ccaa2fe, 0x98d40b36,0x98d40b36
1044 data4 0xa6f581cf,0xa6f581cf, 0xa57ade28,0xa57ade28
1045 data4 0xdab78e26,0xdab78e26, 0x3fadbfa4,0x3fadbfa4
1046 data4 0x2c3a9de4,0x2c3a9de4, 0x5078920d,0x5078920d
1047 data4 0x6a5fcc9b,0x6a5fcc9b, 0x547e4662,0x547e4662
1048 data4 0xf68d13c2,0xf68d13c2, 0x90d8b8e8,0x90d8b8e8
1049 data4 0x2e39f75e,0x2e39f75e, 0x82c3aff5,0x82c3aff5
1050 data4 0x9f5d80be,0x9f5d80be, 0x69d0937c,0x69d0937c
1051 data4 0x6fd52da9,0x6fd52da9, 0xcf2512b3,0xcf2512b3
1052 data4 0xc8ac993b,0xc8ac993b, 0x10187da7,0x10187da7
1053 data4 0xe89c636e,0xe89c636e, 0xdb3bbb7b,0xdb3bbb7b
1054 data4 0xcd267809,0xcd267809, 0x6e5918f4,0x6e5918f4
1055 data4 0xec9ab701,0xec9ab701, 0x834f9aa8,0x834f9aa8
1056 data4 0xe6956e65,0xe6956e65, 0xaaffe67e,0xaaffe67e
1057 data4 0x21bccf08,0x21bccf08, 0xef15e8e6,0xef15e8e6
1058 data4 0xbae79bd9,0xbae79bd9, 0x4a6f36ce,0x4a6f36ce
1059 data4 0xea9f09d4,0xea9f09d4, 0x29b07cd6,0x29b07cd6
1060 data4 0x31a4b2af,0x31a4b2af, 0x2a3f2331,0x2a3f2331
1061 data4 0xc6a59430,0xc6a59430, 0x35a266c0,0x35a266c0
1062 data4 0x744ebc37,0x744ebc37, 0xfc82caa6,0xfc82caa6
1063 data4 0xe090d0b0,0xe090d0b0, 0x33a7d815,0x33a7d815
1064 data4 0xf104984a,0xf104984a, 0x41ecdaf7,0x41ecdaf7
1065 data4 0x7fcd500e,0x7fcd500e, 0x1791f62f,0x1791f62f
1066 data4 0x764dd68d,0x764dd68d, 0x43efb04d,0x43efb04d
1067 data4 0xccaa4d54,0xccaa4d54, 0xe49604df,0xe49604df
1068 data4 0x9ed1b5e3,0x9ed1b5e3, 0x4c6a881b,0x4c6a881b
1069 data4 0xc12c1fb8,0xc12c1fb8, 0x4665517f,0x4665517f
1070 data4 0x9d5eea04,0x9d5eea04, 0x018c355d,0x018c355d
1071 data4 0xfa877473,0xfa877473, 0xfb0b412e,0xfb0b412e
1072 data4 0xb3671d5a,0xb3671d5a, 0x92dbd252,0x92dbd252
1073 data4 0xe9105633,0xe9105633, 0x6dd64713,0x6dd64713
1074 data4 0x9ad7618c,0x9ad7618c, 0x37a10c7a,0x37a10c7a
1075 data4 0x59f8148e,0x59f8148e, 0xeb133c89,0xeb133c89
1076 data4 0xcea927ee,0xcea927ee, 0xb761c935,0xb761c935
1077 data4 0xe11ce5ed,0xe11ce5ed, 0x7a47b13c,0x7a47b13c
1078 data4 0x9cd2df59,0x9cd2df59, 0x55f2733f,0x55f2733f
1079 data4 0x1814ce79,0x1814ce79, 0x73c737bf,0x73c737bf
1080 data4 0x53f7cdea,0x53f7cdea, 0x5ffdaa5b,0x5ffdaa5b
1081 data4 0xdf3d6f14,0xdf3d6f14, 0x7844db86,0x7844db86
1082 data4 0xcaaff381,0xcaaff381, 0xb968c43e,0xb968c43e
1083 data4 0x3824342c,0x3824342c, 0xc2a3405f,0xc2a3405f
1084 data4 0x161dc372,0x161dc372, 0xbce2250c,0xbce2250c
1085 data4 0x283c498b,0x283c498b, 0xff0d9541,0xff0d9541
1086 data4 0x39a80171,0x39a80171, 0x080cb3de,0x080cb3de
1087 data4 0xd8b4e49c,0xd8b4e49c, 0x6456c190,0x6456c190
1088 data4 0x7bcb8461,0x7bcb8461, 0xd532b670,0xd532b670
1089 data4 0x486c5c74,0x486c5c74, 0xd0b85742,0xd0b85742
1090// Td4:
1091 data1 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
1092 data1 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
1093 data1 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
1094 data1 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
1095 data1 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
1096 data1 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
1097 data1 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
1098 data1 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
1099 data1 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
1100 data1 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
1101 data1 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
1102 data1 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
1103 data1 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
1104 data1 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
1105 data1 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
1106 data1 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
1107 data1 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
1108 data1 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
1109 data1 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
1110 data1 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
1111 data1 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
1112 data1 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
1113 data1 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
1114 data1 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
1115 data1 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
1116 data1 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
1117 data1 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1118 data1 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1119 data1 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1120 data1 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1121 data1 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1122 data1 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1123.size AES_Td#,2048+256 // HP-UX assembler fails to ".-AES_Td#"
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl
deleted file mode 100644
index f82c5e1814..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-ppc.pl
+++ /dev/null
@@ -1,1189 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Needs more work: key setup, page boundaries, CBC routine...
11#
12# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
13# 128-bit key, which is ~40% better than 64-bit code generated by gcc
14# 4.0. But these are not the ones currently used! Their "compact"
15# counterparts are, for security reasons. ppc_AES_encrypt_compact runs
16# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact -
17# at 1/3 of ppc_AES_decrypt.
18
19# February 2010
20#
21# Rescheduling instructions to favour Power6 pipeline gives 10%
22# performance improvement on the platform in question (and marginal
23# improvement even on others). It should be noted that Power6 fails
24# to process byte in 18 cycles, only in 23, because it fails to issue
25# 4 load instructions in two cycles, only in 3. As a result, non-compact
26# block subroutines are 25% slower than one would expect. Compact
27# functions scale better, because they have pure computational part,
28# which scales perfectly with clock frequency. To be specific
29# ppc_AES_encrypt_compact operates at 42 cycles per byte, while
30# ppc_AES_decrypt_compact - at 55 (in 64-bit build).
31
# Command line: <flavour> <output>.  The flavour string must contain "64" or
# "32"; it selects the pointer size and the matching store-with-update, load
# and store mnemonics used for stack frame handling in the generated code.
$flavour = shift;

if ($flavour =~ /64/) {
    $SIZE_T = 8;
    $STU    = "stdu";
    $POP    = "ld";
    $PUSH   = "std";
} elsif ($flavour =~ /32/) {
    $SIZE_T = 4;
    $STU    = "stwu";
    $POP    = "lwz";
    $PUSH   = "stw";
} else { die "nonsense $flavour"; }

# Locate ppc-xlate.pl either next to this script or in ../../perlasm/.
# $dir is taken from $1 only when the match succeeded; when $0 carries no
# directory component it is empty, so the current directory is searched.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator; the next
# command-line argument (the output name) is handed on to it.  The call is
# parenthesised and checked with low-precedence "or": with "||" the test was
# applied to the command string, which is always true, so a failed open was
# never reported.
open(STDOUT, "| $^X $xlate $flavour " . shift) or die "can't call $xlate: $!";

# Stack frame size used by the public entry points: 32 pointer-sized slots.
$FRAME=32*$SIZE_T;
54
# Append one "\t.long\tw,w" line to $code for every argument, i.e. each
# 32-bit table word is emitted twice in a row (one 8-byte table entry).
# Emission stops at the first undefined argument.
#
# Declared without a prototype: the former empty "()" prototype stated that
# the sub takes no arguments even though it consumes its argument list, and
# it only worked because callers use the &-form, which bypasses prototypes.
# Existing &_data_word(...) calls behave exactly as before.
sub _data_word
{
    for my $word (@_) {
        last unless defined $word;
        $code .= sprintf "\t.long\t0x%08x,0x%08x\n", $word, $word;
    }
}
59
# Symbolic names for the PowerPC registers used by the generated code.
60$sp="r1";  # stack pointer; the entry point allocates its frame relative to it
61$toc="r2";  # saved and restored by the entry point, since r2 doubles as $Tbl3
62$inp="r3";  # pointer to the 16-byte input block
63$out="r4";  # pointer to the 16-byte output block
64$key="r5";  # pointer to the key schedule
65
66$Tbl0="r3";  # table base; shares r3 with $inp, so the input is loaded first
67$Tbl1="r6";
68$Tbl2="r7";
69$Tbl3="r2";  # shares r2 with $toc; remapped below for 32-bit builds
70
71$s0="r8";  # $s0..$s3: the four 32-bit words of the cipher state
72$s1="r9";
73$s2="r10";
74$s3="r11";
75
76$t0="r12";  # $t0..$t3: round-key words; $t0/$t1 may be remapped below
77$t1="r13";
78$t2="r14";
79$t3="r15";
80
81$acc00="r16";  # $acc00..$acc15: scratch for table indices and looked-up values
82$acc01="r17";
83$acc02="r18";
84$acc03="r19";
85
86$acc04="r20";
87$acc05="r21";
88$acc06="r22";
89$acc07="r23";
90
91$acc08="r24";
92$acc09="r25";
93$acc10="r26";
94$acc11="r27";
95
96$acc12="r28";
97$acc13="r29";
98$acc14="r30";
99$acc15="r31";
100
101# stay away from TLS pointer
102if ($SIZE_T==8) { die if ($t1 ne "r13"); $t1="r0"; }  # 64-bit: move $t1 off r13
103else { die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0"; }  # 32-bit: move $Tbl3 off r2 (takes r12), $t0 becomes r0
104$mask80=$Tbl2;  # the compact routine reuses $Tbl2/$Tbl3 for its
105$mask1b=$Tbl3;  # 0x80808080 and 0x1b1b1b1b constants
106
# Emit the two position-independent table locators, LAES_Te and LAES_Td.
# Each stub saves the link register in r0, executes a branch-and-link to the
# following instruction to obtain its own address, adds the fixed distance
# to the first data entry, leaves the result in $Tbl0 and restores the link
# register.  Every stub is six instructions (24 bytes); the .space padding
# places LAES_Td 32 bytes after LAES_Te and the table data 128 bytes after
# LAES_Te.  The Td distance additionally skips 2048+256 bytes, which is the
# size of the Te word table plus the byte table emitted right after this
# heredoc.  Do not insert or remove instructions here without adjusting the
# addi displacements and the .space amounts.
107$code.=<<___;
108.machine "any"
109.text
110
111.align 7
112LAES_Te:
113 mflr r0
114 bcl 20,31,\$+4
115 mflr $Tbl0 ; vvvvv "distance" between . and 1st data entry
116 addi $Tbl0,$Tbl0,`128-8`
117 mtlr r0
118 blr
119 .space `32-24`
120LAES_Td:
121 mflr r0
122 bcl 20,31,\$+4
123 mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
124 addi $Tbl0,$Tbl0,`128-8-32+2048+256`
125 mtlr r0
126 blr
127 .space `128-32-24`
128___
129&_data_word(
130 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
131 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
132 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
133 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
134 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
135 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
136 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
137 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
138 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
139 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
140 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
141 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
142 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
143 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
144 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
145 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
146 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
147 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
148 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
149 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
150 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
151 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
152 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
153 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
154 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
155 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
156 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
157 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
158 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
159 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
160 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
161 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
162 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
163 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
164 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
165 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
166 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
167 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
168 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
169 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
170 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
171 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
172 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
173 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
174 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
175 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
176 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
177 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
178 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
179 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
180 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
181 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
182 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
183 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
184 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
185 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
186 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
187 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
188 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
189 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
190 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
191 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
192 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
193 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
194$code.=<<___;
195.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
196.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
197.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
198.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
199.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
200.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
201.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
202.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
203.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
204.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
205.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
206.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
207.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
208.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
209.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
210.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
211.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
212.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
213.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
214.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
215.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
216.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
217.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
218.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
219.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
220.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
221.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
222.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
223.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
224.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
225.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
226.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
227___
228&_data_word(
229 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
230 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
231 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
232 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
233 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
234 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
235 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
236 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
237 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
238 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
239 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
240 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
241 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
242 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
243 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
244 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
245 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
246 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
247 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
248 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
249 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
250 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
251 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
252 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
253 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
254 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
255 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
256 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
257 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
258 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
259 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
260 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
261 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
262 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
263 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
264 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
265 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
266 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
267 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
268 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
269 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
270 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
271 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
272 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
273 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
274 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
275 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
276 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
277 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
278 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
279 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
280 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
281 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
282 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
283 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
284 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
285 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
286 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
287 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
288 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
289 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
290 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
291 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
292 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
293$code.=<<___;
294.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
295.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
296.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
297.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
298.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
299.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
300.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
301.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
302.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
303.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
304.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
305.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
306.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
307.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
308.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
309.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
310.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
311.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
312.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
313.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
314.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
315.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
316.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
317.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
318.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
319.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
320.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
321.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
322.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
323.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
324.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
325.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
326
327
328.globl .AES_encrypt
329.align 7
330.AES_encrypt:
331 mflr r0 ; public entry point: encrypts one 16-byte block; keep the return address in r0
332 $STU $sp,-$FRAME($sp) ; allocate the stack frame
333
334 $PUSH r0,`$FRAME-$SIZE_T*21`($sp) ; save the return address, then the toc register and r13-r31
335 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
336 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
337 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
338 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
339 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
340 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
341 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
342 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
343 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
344 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
345 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
346 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
347 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
348 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
349 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
350 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
351 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
352 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
353 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
354 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
355
356 lwz $s0,0($inp) ; load the 16-byte input block into s0-s3
357 lwz $s1,4($inp)
358 lwz $s2,8($inp)
359 lwz $s3,12($inp)
360 bl LAES_Te ; point Tbl0 at the Te table; Tbl0 shares r3 with the input pointer, hence the loads above
361 bl Lppc_AES_encrypt_compact ; run the compact encryption routine on s0-s3
362 stw $s0,0($out) ; store the resulting state to the output block
363 stw $s1,4($out)
364 stw $s2,8($out)
365 stw $s3,12($out)
366
367 $POP r0,`$FRAME-$SIZE_T*21`($sp) ; reload the return address and the saved registers
368 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
369 $POP r13,`$FRAME-$SIZE_T*19`($sp)
370 $POP r14,`$FRAME-$SIZE_T*18`($sp)
371 $POP r15,`$FRAME-$SIZE_T*17`($sp)
372 $POP r16,`$FRAME-$SIZE_T*16`($sp)
373 $POP r17,`$FRAME-$SIZE_T*15`($sp)
374 $POP r18,`$FRAME-$SIZE_T*14`($sp)
375 $POP r19,`$FRAME-$SIZE_T*13`($sp)
376 $POP r20,`$FRAME-$SIZE_T*12`($sp)
377 $POP r21,`$FRAME-$SIZE_T*11`($sp)
378 $POP r22,`$FRAME-$SIZE_T*10`($sp)
379 $POP r23,`$FRAME-$SIZE_T*9`($sp)
380 $POP r24,`$FRAME-$SIZE_T*8`($sp)
381 $POP r25,`$FRAME-$SIZE_T*7`($sp)
382 $POP r26,`$FRAME-$SIZE_T*6`($sp)
383 $POP r27,`$FRAME-$SIZE_T*5`($sp)
384 $POP r28,`$FRAME-$SIZE_T*4`($sp)
385 $POP r29,`$FRAME-$SIZE_T*3`($sp)
386 $POP r30,`$FRAME-$SIZE_T*2`($sp)
387 $POP r31,`$FRAME-$SIZE_T*1`($sp)
388 mtlr r0 ; put the return address back into the link register
389 addi $sp,$sp,$FRAME ; release the stack frame
390 blr
391
392.align 5
393Lppc_AES_encrypt:
394 lwz $acc00,240($key) ; table-driven encryption of the state in s0-s3; the word at offset 240 of the key schedule sets the loop count
395 lwz $t0,0($key) ; first round key, four words
396 lwz $t1,4($key)
397 lwz $t2,8($key)
398 lwz $t3,12($key)
399 addi $Tbl1,$Tbl0,3 ; Tbl1-Tbl3 address the same table at byte offsets 3, 2 and 1; every entry holds its word twice, so an offset load reads across the pair
400 addi $Tbl2,$Tbl0,2
401 addi $Tbl3,$Tbl0,1
402 addi $acc00,$acc00,-1 ; the loop runs count-1 times, the last round is handled after the loop
403 addi $key,$key,16
404 xor $s0,$s0,$t0 ; xor the state with the first round key
405 xor $s1,$s1,$t1
406 xor $s2,$s2,$t2
407 xor $s3,$s3,$t3
408 mtctr $acc00
409.align 4
410Lenc_loop:
411 rlwinm $acc00,$s0,`32-24+3`,21,28 ; extract one state byte and scale it by 8, the size of a table entry
412 rlwinm $acc01,$s1,`32-24+3`,21,28
413 rlwinm $acc02,$s2,`32-24+3`,21,28
414 rlwinm $acc03,$s3,`32-24+3`,21,28
415 lwz $t0,0($key) ; next round key, interleaved with the index computation
416 lwz $t1,4($key)
417 rlwinm $acc04,$s1,`32-16+3`,21,28
418 rlwinm $acc05,$s2,`32-16+3`,21,28
419 lwz $t2,8($key)
420 lwz $t3,12($key)
421 rlwinm $acc06,$s3,`32-16+3`,21,28
422 rlwinm $acc07,$s0,`32-16+3`,21,28
423 lwzx $acc00,$Tbl0,$acc00 ; table lookups, each index register is overwritten by the value it selects
424 lwzx $acc01,$Tbl0,$acc01
425 rlwinm $acc08,$s2,`32-8+3`,21,28
426 rlwinm $acc09,$s3,`32-8+3`,21,28
427 lwzx $acc02,$Tbl0,$acc02
428 lwzx $acc03,$Tbl0,$acc03
429 rlwinm $acc10,$s0,`32-8+3`,21,28
430 rlwinm $acc11,$s1,`32-8+3`,21,28
431 lwzx $acc04,$Tbl1,$acc04
432 lwzx $acc05,$Tbl1,$acc05
433 rlwinm $acc12,$s3,`0+3`,21,28
434 rlwinm $acc13,$s0,`0+3`,21,28
435 lwzx $acc06,$Tbl1,$acc06
436 lwzx $acc07,$Tbl1,$acc07
437 rlwinm $acc14,$s1,`0+3`,21,28
438 rlwinm $acc15,$s2,`0+3`,21,28
439 lwzx $acc08,$Tbl2,$acc08
440 lwzx $acc09,$Tbl2,$acc09
441 xor $t0,$t0,$acc00 ; fold the looked-up words into the round key words
442 xor $t1,$t1,$acc01
443 lwzx $acc10,$Tbl2,$acc10
444 lwzx $acc11,$Tbl2,$acc11
445 xor $t2,$t2,$acc02
446 xor $t3,$t3,$acc03
447 lwzx $acc12,$Tbl3,$acc12
448 lwzx $acc13,$Tbl3,$acc13
449 xor $t0,$t0,$acc04
450 xor $t1,$t1,$acc05
451 lwzx $acc14,$Tbl3,$acc14
452 lwzx $acc15,$Tbl3,$acc15
453 xor $t2,$t2,$acc06
454 xor $t3,$t3,$acc07
455 xor $t0,$t0,$acc08
456 xor $t1,$t1,$acc09
457 xor $t2,$t2,$acc10
458 xor $t3,$t3,$acc11
459 xor $s0,$t0,$acc12 ; the final xor of the round writes the new state
460 xor $s1,$t1,$acc13
461 xor $s2,$t2,$acc14
462 xor $s3,$t3,$acc15
463 addi $key,$key,16 ; advance to the next round key
464 bdnz- Lenc_loop
465
466 addi $Tbl2,$Tbl0,2048 ; last round: Tbl2 now addresses the byte table that follows the 2048-byte word table
467 nop
468 lwz $t0,0($key) ; last round key
469 lwz $t1,4($key)
470 rlwinm $acc00,$s0,`32-24`,24,31 ; unscaled byte indices this time, the byte table has one-byte entries
471 rlwinm $acc01,$s1,`32-24`,24,31
472 lwz $t2,8($key)
473 lwz $t3,12($key)
474 rlwinm $acc02,$s2,`32-24`,24,31
475 rlwinm $acc03,$s3,`32-24`,24,31
476 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
477 lwz $acc09,`2048+32`($Tbl0)
478 rlwinm $acc04,$s1,`32-16`,24,31
479 rlwinm $acc05,$s2,`32-16`,24,31
480 lwz $acc10,`2048+64`($Tbl0)
481 lwz $acc11,`2048+96`($Tbl0)
482 rlwinm $acc06,$s3,`32-16`,24,31
483 rlwinm $acc07,$s0,`32-16`,24,31
484 lwz $acc12,`2048+128`($Tbl0)
485 lwz $acc13,`2048+160`($Tbl0)
486 rlwinm $acc08,$s2,`32-8`,24,31 ; the prefetched values are discarded, acc08-acc15 are reused as indices from here on
487 rlwinm $acc09,$s3,`32-8`,24,31
488 lwz $acc14,`2048+192`($Tbl0)
489 lwz $acc15,`2048+224`($Tbl0)
490 rlwinm $acc10,$s0,`32-8`,24,31
491 rlwinm $acc11,$s1,`32-8`,24,31
492 lbzx $acc00,$Tbl2,$acc00 ; byte lookups
493 lbzx $acc01,$Tbl2,$acc01
494 rlwinm $acc12,$s3,`0`,24,31
495 rlwinm $acc13,$s0,`0`,24,31
496 lbzx $acc02,$Tbl2,$acc02
497 lbzx $acc03,$Tbl2,$acc03
498 rlwinm $acc14,$s1,`0`,24,31
499 rlwinm $acc15,$s2,`0`,24,31
500 lbzx $acc04,$Tbl2,$acc04
501 lbzx $acc05,$Tbl2,$acc05
502 rlwinm $s0,$acc00,24,0,7 ; rebuild each state word from four looked-up bytes, most significant byte first
503 rlwinm $s1,$acc01,24,0,7
504 lbzx $acc06,$Tbl2,$acc06
505 lbzx $acc07,$Tbl2,$acc07
506 rlwinm $s2,$acc02,24,0,7
507 rlwinm $s3,$acc03,24,0,7
508 lbzx $acc08,$Tbl2,$acc08
509 lbzx $acc09,$Tbl2,$acc09
510 rlwimi $s0,$acc04,16,8,15
511 rlwimi $s1,$acc05,16,8,15
512 lbzx $acc10,$Tbl2,$acc10
513 lbzx $acc11,$Tbl2,$acc11
514 rlwimi $s2,$acc06,16,8,15
515 rlwimi $s3,$acc07,16,8,15
516 lbzx $acc12,$Tbl2,$acc12
517 lbzx $acc13,$Tbl2,$acc13
518 rlwimi $s0,$acc08,8,16,23
519 rlwimi $s1,$acc09,8,16,23
520 lbzx $acc14,$Tbl2,$acc14
521 lbzx $acc15,$Tbl2,$acc15
522 rlwimi $s2,$acc10,8,16,23
523 rlwimi $s3,$acc11,8,16,23
524 or $s0,$s0,$acc12
525 or $s1,$s1,$acc13
526 or $s2,$s2,$acc14
527 or $s3,$s3,$acc15
528 xor $s0,$s0,$t0 ; xor with the last round key; the result stays in s0-s3
529 xor $s1,$s1,$t1
530 xor $s2,$s2,$t2
531 xor $s3,$s3,$t3
532 blr
533
# Lppc_AES_encrypt_compact: internal block-encryption routine that looks up
# state bytes only in the 256-entry byte table located 2048 bytes past $Tbl0
# and performs the column mixing arithmetically on packed bytes.
# In:   $s0-$s3 = state words, $key = key schedule pointer, $Tbl0 = table base.
# Out:  $s0-$s3 = processed state.
# Uses: $t0-$t3, $acc00-$acc15, $Tbl1, $mask80, $mask1b, CTR; $key is advanced.
534.align 4
535Lppc_AES_encrypt_compact:
# The word at offset 240 of the key structure becomes the loop count (see mtctr).
536 lwz $acc00,240($key)
537 lwz $t0,0($key)
538 lwz $t1,4($key)
539 lwz $t2,8($key)
540 lwz $t3,12($key)
# $Tbl1 -> byte table used by every lbzx below.
541 addi $Tbl1,$Tbl0,2048
# Build the constants 0x80808080 and 0x1b1b1b1b (lis sets the upper halfword,
# ori fills in the lower one).
542 lis $mask80,0x8080
543 lis $mask1b,0x1b1b
544 addi $key,$key,16
545 ori $mask80,$mask80,0x8080
546 ori $mask1b,$mask1b,0x1b1b
547 mtctr $acc00
548.align 4
549Lenc_compact_loop:
# Add the round key fetched during the previous iteration (or the prologue).
550 xor $s0,$s0,$t0
551 xor $s1,$s1,$t1
552 xor $s2,$s2,$t2
553 xor $s3,$s3,$t3
# Isolate all 16 state bytes and substitute each through the byte table.
# Output word i is assembled from byte 3 of s[i], byte 2 of s[i+1],
# byte 1 of s[i+2] and byte 0 of s[i+3] (indices modulo 4), so the row
# rotation is folded into the choice of source registers.
554 rlwinm $acc00,$s0,`32-24`,24,31
555 rlwinm $acc01,$s1,`32-24`,24,31
556 rlwinm $acc02,$s2,`32-24`,24,31
557 rlwinm $acc03,$s3,`32-24`,24,31
558 rlwinm $acc04,$s1,`32-16`,24,31
559 rlwinm $acc05,$s2,`32-16`,24,31
560 rlwinm $acc06,$s3,`32-16`,24,31
561 rlwinm $acc07,$s0,`32-16`,24,31
562 lbzx $acc00,$Tbl1,$acc00
563 lbzx $acc01,$Tbl1,$acc01
564 rlwinm $acc08,$s2,`32-8`,24,31
565 rlwinm $acc09,$s3,`32-8`,24,31
566 lbzx $acc02,$Tbl1,$acc02
567 lbzx $acc03,$Tbl1,$acc03
568 rlwinm $acc10,$s0,`32-8`,24,31
569 rlwinm $acc11,$s1,`32-8`,24,31
570 lbzx $acc04,$Tbl1,$acc04
571 lbzx $acc05,$Tbl1,$acc05
572 rlwinm $acc12,$s3,`0`,24,31
573 rlwinm $acc13,$s0,`0`,24,31
574 lbzx $acc06,$Tbl1,$acc06
575 lbzx $acc07,$Tbl1,$acc07
576 rlwinm $acc14,$s1,`0`,24,31
577 rlwinm $acc15,$s2,`0`,24,31
578 lbzx $acc08,$Tbl1,$acc08
579 lbzx $acc09,$Tbl1,$acc09
# Reassemble the substituted bytes: bits 0-7 first, then insert bits 8-15
# and 16-23 with rlwimi; the lowest byte is OR-ed in further down.
580 rlwinm $s0,$acc00,24,0,7
581 rlwinm $s1,$acc01,24,0,7
582 lbzx $acc10,$Tbl1,$acc10
583 lbzx $acc11,$Tbl1,$acc11
584 rlwinm $s2,$acc02,24,0,7
585 rlwinm $s3,$acc03,24,0,7
586 lbzx $acc12,$Tbl1,$acc12
587 lbzx $acc13,$Tbl1,$acc13
588 rlwimi $s0,$acc04,16,8,15
589 rlwimi $s1,$acc05,16,8,15
590 lbzx $acc14,$Tbl1,$acc14
591 lbzx $acc15,$Tbl1,$acc15
592 rlwimi $s2,$acc06,16,8,15
593 rlwimi $s3,$acc07,16,8,15
594 rlwimi $s0,$acc08,8,16,23
595 rlwimi $s1,$acc09,8,16,23
596 rlwimi $s2,$acc10,8,16,23
597 rlwimi $s3,$acc11,8,16,23
# Fetch the next round key while the last byte of each word is merged.
598 lwz $t0,0($key)
599 lwz $t1,4($key)
600 or $s0,$s0,$acc12
601 or $s1,$s1,$acc13
602 lwz $t2,8($key)
603 lwz $t3,12($key)
604 or $s2,$s2,$acc14
605 or $s3,$s3,$acc15
606
607 addi $key,$key,16
# bdz decrements CTR; once it reaches zero the final iteration skips the
# column mixing below and only the closing key addition is applied.
608 bdz Lenc_compact_done
609
# Column mixing on four packed bytes per register. r2 is every byte of r0
# doubled, with 0x1b folded in wherever the top bit of that byte was set.
610 and $acc00,$s0,$mask80 # r1=r0&0x80808080
611 and $acc01,$s1,$mask80
612 and $acc02,$s2,$mask80
613 and $acc03,$s3,$mask80
614 srwi $acc04,$acc00,7 # r1>>7
615 srwi $acc05,$acc01,7
616 srwi $acc06,$acc02,7
617 srwi $acc07,$acc03,7
618 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
619 andc $acc09,$s1,$mask80
620 andc $acc10,$s2,$mask80
621 andc $acc11,$s3,$mask80
622 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
623 sub $acc01,$acc01,$acc05
624 sub $acc02,$acc02,$acc06
625 sub $acc03,$acc03,$acc07
626 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
627 add $acc09,$acc09,$acc09
628 add $acc10,$acc10,$acc10
629 add $acc11,$acc11,$acc11
630 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
631 and $acc01,$acc01,$mask1b
632 and $acc02,$acc02,$mask1b
633 and $acc03,$acc03,$mask1b
634 xor $acc00,$acc00,$acc08 # r2
635 xor $acc01,$acc01,$acc09
636 xor $acc02,$acc02,$acc10
637 xor $acc03,$acc03,$acc11
638
# Combine r2 with rotated copies of r0 to form the mixed column.
639 rotlwi $acc12,$s0,16 # ROTATE(r0,16)
640 rotlwi $acc13,$s1,16
641 rotlwi $acc14,$s2,16
642 rotlwi $acc15,$s3,16
643 xor $s0,$s0,$acc00 # r0^r2
644 xor $s1,$s1,$acc01
645 xor $s2,$s2,$acc02
646 xor $s3,$s3,$acc03
647 rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
648 rotrwi $s1,$s1,24
649 rotrwi $s2,$s2,24
650 rotrwi $s3,$s3,24
651 xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
652 xor $s1,$s1,$acc01
653 xor $s2,$s2,$acc02
654 xor $s3,$s3,$acc03
655 rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
656 rotlwi $acc09,$acc13,8
657 rotlwi $acc10,$acc14,8
658 rotlwi $acc11,$acc15,8
659 xor $s0,$s0,$acc12 #
660 xor $s1,$s1,$acc13
661 xor $s2,$s2,$acc14
662 xor $s3,$s3,$acc15
663 xor $s0,$s0,$acc08 #
664 xor $s1,$s1,$acc09
665 xor $s2,$s2,$acc10
666 xor $s3,$s3,$acc11
667
668 b Lenc_compact_loop
669.align 4
670Lenc_compact_done:
# Closing key addition with the round key loaded just before the bdz.
671 xor $s0,$s0,$t0
672 xor $s1,$s1,$t1
673 xor $s2,$s2,$t2
674 xor $s3,$s3,$t3
675 blr
676
# .AES_decrypt: exported entry point that decrypts one 16-byte block.
# It reads four words from $inp, runs Lppc_AES_decrypt_compact on them and
# writes the four result words to $out. $key is passed through untouched to
# the inner routine. The link register, $toc and r13-r31 are saved in a
# $FRAME-byte stack frame and restored before returning.
677.globl .AES_decrypt
678.align 7
679.AES_decrypt:
# Keep the return address in r0 and allocate the frame.
680 mflr r0
681 $STU $sp,-$FRAME($sp)
682
# Save area occupies the top 21 $SIZE_T-sized slots of the frame.
683 $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
684 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
685 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
686 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
687 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
688 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
689 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
690 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
691 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
692 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
693 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
694 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
695 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
696 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
697 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
698 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
699 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
700 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
701 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
702 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
703 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
704
# Load the input block as four 32-bit words.
705 lwz $s0,0($inp)
706 lwz $s1,4($inp)
707 lwz $s2,8($inp)
708 lwz $s3,12($inp)
# LAES_Td is defined outside this excerpt; presumably it leaves the address
# of the decryption tables in $Tbl0 -- TODO confirm against its definition.
709 bl LAES_Td
710 bl Lppc_AES_decrypt_compact
# Store the result block.
711 stw $s0,0($out)
712 stw $s1,4($out)
713 stw $s2,8($out)
714 stw $s3,12($out)
715
# Restore everything saved in the prologue, in the same slot order.
716 $POP r0,`$FRAME-$SIZE_T*21`($sp)
717 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
718 $POP r13,`$FRAME-$SIZE_T*19`($sp)
719 $POP r14,`$FRAME-$SIZE_T*18`($sp)
720 $POP r15,`$FRAME-$SIZE_T*17`($sp)
721 $POP r16,`$FRAME-$SIZE_T*16`($sp)
722 $POP r17,`$FRAME-$SIZE_T*15`($sp)
723 $POP r18,`$FRAME-$SIZE_T*14`($sp)
724 $POP r19,`$FRAME-$SIZE_T*13`($sp)
725 $POP r20,`$FRAME-$SIZE_T*12`($sp)
726 $POP r21,`$FRAME-$SIZE_T*11`($sp)
727 $POP r22,`$FRAME-$SIZE_T*10`($sp)
728 $POP r23,`$FRAME-$SIZE_T*9`($sp)
729 $POP r24,`$FRAME-$SIZE_T*8`($sp)
730 $POP r25,`$FRAME-$SIZE_T*7`($sp)
731 $POP r26,`$FRAME-$SIZE_T*6`($sp)
732 $POP r27,`$FRAME-$SIZE_T*5`($sp)
733 $POP r28,`$FRAME-$SIZE_T*4`($sp)
734 $POP r29,`$FRAME-$SIZE_T*3`($sp)
735 $POP r30,`$FRAME-$SIZE_T*2`($sp)
736 $POP r31,`$FRAME-$SIZE_T*1`($sp)
# Reinstate the return address, release the frame and return.
737 mtlr r0
738 addi $sp,$sp,$FRAME
739 blr
740
# Lppc_AES_decrypt: internal table-driven block decryption.
# In:   $s0-$s3 = state words, $key = key schedule pointer, $Tbl0 = table base.
# Out:  $s0-$s3 = processed state.
# Uses: $t0-$t3, $acc00-$acc15, $Tbl1-$Tbl3, CTR; $key is advanced.
# NOTE(review): nothing in this excerpt branches to this label (.AES_decrypt
# calls the compact variant) -- confirm whether it is still referenced.
741.align 5
742Lppc_AES_decrypt:
# Loop count source: word at offset 240 of the key structure.
743 lwz $acc00,240($key)
744 lwz $t0,0($key)
745 lwz $t1,4($key)
746 lwz $t2,8($key)
747 lwz $t3,12($key)
# Three extra base pointers at byte offsets 3, 2 and 1 into the same table.
# Presumably every table entry holds its 32-bit word twice, so a word load at
# these offsets returns a byte-rotated copy -- TODO confirm against the table
# definition, which is outside this excerpt.
748 addi $Tbl1,$Tbl0,3
749 addi $Tbl2,$Tbl0,2
750 addi $Tbl3,$Tbl0,1
# The loop runs (count - 1) times; the last round is unrolled after it.
751 addi $acc00,$acc00,-1
752 addi $key,$key,16
# Initial key addition.
753 xor $s0,$s0,$t0
754 xor $s1,$s1,$t1
755 xor $s2,$s2,$t2
756 xor $s3,$s3,$t3
757 mtctr $acc00
758.align 4
759Ldec_loop:
# Each rlwinm isolates one state byte already multiplied by 8 (mask bits
# 21-28), i.e. a byte offset into a table of 8-byte entries. Output word i
# draws byte 3 from s[i], byte 2 from s[i-1], byte 1 from s[i-2] and byte 0
# from s[i-3] (indices modulo 4).
760 rlwinm $acc00,$s0,`32-24+3`,21,28
761 rlwinm $acc01,$s1,`32-24+3`,21,28
762 rlwinm $acc02,$s2,`32-24+3`,21,28
763 rlwinm $acc03,$s3,`32-24+3`,21,28
764 lwz $t0,0($key)
765 lwz $t1,4($key)
766 rlwinm $acc04,$s3,`32-16+3`,21,28
767 rlwinm $acc05,$s0,`32-16+3`,21,28
768 lwz $t2,8($key)
769 lwz $t3,12($key)
770 rlwinm $acc06,$s1,`32-16+3`,21,28
771 rlwinm $acc07,$s2,`32-16+3`,21,28
772 lwzx $acc00,$Tbl0,$acc00
773 lwzx $acc01,$Tbl0,$acc01
774 rlwinm $acc08,$s2,`32-8+3`,21,28
775 rlwinm $acc09,$s3,`32-8+3`,21,28
776 lwzx $acc02,$Tbl0,$acc02
777 lwzx $acc03,$Tbl0,$acc03
778 rlwinm $acc10,$s0,`32-8+3`,21,28
779 rlwinm $acc11,$s1,`32-8+3`,21,28
780 lwzx $acc04,$Tbl1,$acc04
781 lwzx $acc05,$Tbl1,$acc05
782 rlwinm $acc12,$s1,`0+3`,21,28
783 rlwinm $acc13,$s2,`0+3`,21,28
784 lwzx $acc06,$Tbl1,$acc06
785 lwzx $acc07,$Tbl1,$acc07
786 rlwinm $acc14,$s3,`0+3`,21,28
787 rlwinm $acc15,$s0,`0+3`,21,28
788 lwzx $acc08,$Tbl2,$acc08
789 lwzx $acc09,$Tbl2,$acc09
# Accumulate the four table words of each column into the round key.
790 xor $t0,$t0,$acc00
791 xor $t1,$t1,$acc01
792 lwzx $acc10,$Tbl2,$acc10
793 lwzx $acc11,$Tbl2,$acc11
794 xor $t2,$t2,$acc02
795 xor $t3,$t3,$acc03
796 lwzx $acc12,$Tbl3,$acc12
797 lwzx $acc13,$Tbl3,$acc13
798 xor $t0,$t0,$acc04
799 xor $t1,$t1,$acc05
800 lwzx $acc14,$Tbl3,$acc14
801 lwzx $acc15,$Tbl3,$acc15
802 xor $t2,$t2,$acc06
803 xor $t3,$t3,$acc07
804 xor $t0,$t0,$acc08
805 xor $t1,$t1,$acc09
806 xor $t2,$t2,$acc10
807 xor $t3,$t3,$acc11
808 xor $s0,$t0,$acc12
809 xor $s1,$t1,$acc13
810 xor $s2,$t2,$acc14
811 xor $s3,$t3,$acc15
812 addi $key,$key,16
813 bdnz- Ldec_loop
814
# Last round: byte-table lookups only, using the table 2048 bytes past $Tbl0.
815 addi $Tbl2,$Tbl0,2048
816 nop
817 lwz $t0,0($key)
818 lwz $t1,4($key)
819 rlwinm $acc00,$s0,`32-24`,24,31
820 rlwinm $acc01,$s1,`32-24`,24,31
821 lwz $t2,8($key)
822 lwz $t3,12($key)
823 rlwinm $acc02,$s2,`32-24`,24,31
824 rlwinm $acc03,$s3,`32-24`,24,31
# The eight loads from 2048+0 .. 2048+224 are issued only to touch the byte
# table at 32-byte intervals; their destination registers are overwritten
# before being read.
825 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
826 lwz $acc09,`2048+32`($Tbl0)
827 rlwinm $acc04,$s3,`32-16`,24,31
828 rlwinm $acc05,$s0,`32-16`,24,31
829 lwz $acc10,`2048+64`($Tbl0)
830 lwz $acc11,`2048+96`($Tbl0)
831 lbzx $acc00,$Tbl2,$acc00
832 lbzx $acc01,$Tbl2,$acc01
833 lwz $acc12,`2048+128`($Tbl0)
834 lwz $acc13,`2048+160`($Tbl0)
835 rlwinm $acc06,$s1,`32-16`,24,31
836 rlwinm $acc07,$s2,`32-16`,24,31
837 lwz $acc14,`2048+192`($Tbl0)
838 lwz $acc15,`2048+224`($Tbl0)
839 rlwinm $acc08,$s2,`32-8`,24,31
840 rlwinm $acc09,$s3,`32-8`,24,31
841 lbzx $acc02,$Tbl2,$acc02
842 lbzx $acc03,$Tbl2,$acc03
843 rlwinm $acc10,$s0,`32-8`,24,31
844 rlwinm $acc11,$s1,`32-8`,24,31
845 lbzx $acc04,$Tbl2,$acc04
846 lbzx $acc05,$Tbl2,$acc05
847 rlwinm $acc12,$s1,`0`,24,31
848 rlwinm $acc13,$s2,`0`,24,31
849 lbzx $acc06,$Tbl2,$acc06
850 lbzx $acc07,$Tbl2,$acc07
851 rlwinm $acc14,$s3,`0`,24,31
852 rlwinm $acc15,$s0,`0`,24,31
853 lbzx $acc08,$Tbl2,$acc08
854 lbzx $acc09,$Tbl2,$acc09
# Pack the substituted bytes back into words, most significant byte first.
855 rlwinm $s0,$acc00,24,0,7
856 rlwinm $s1,$acc01,24,0,7
857 lbzx $acc10,$Tbl2,$acc10
858 lbzx $acc11,$Tbl2,$acc11
859 rlwinm $s2,$acc02,24,0,7
860 rlwinm $s3,$acc03,24,0,7
861 lbzx $acc12,$Tbl2,$acc12
862 lbzx $acc13,$Tbl2,$acc13
863 rlwimi $s0,$acc04,16,8,15
864 rlwimi $s1,$acc05,16,8,15
865 lbzx $acc14,$Tbl2,$acc14
866 lbzx $acc15,$Tbl2,$acc15
867 rlwimi $s2,$acc06,16,8,15
868 rlwimi $s3,$acc07,16,8,15
869 rlwimi $s0,$acc08,8,16,23
870 rlwimi $s1,$acc09,8,16,23
871 rlwimi $s2,$acc10,8,16,23
872 rlwimi $s3,$acc11,8,16,23
873 or $s0,$s0,$acc12
874 or $s1,$s1,$acc13
875 or $s2,$s2,$acc14
876 or $s3,$s3,$acc15
# Closing key addition.
877 xor $s0,$s0,$t0
878 xor $s1,$s1,$t1
879 xor $s2,$s2,$t2
880 xor $s3,$s3,$t3
881 blr
882
# Lppc_AES_decrypt_compact: internal block-decryption routine that looks up
# state bytes only in the 256-entry byte table located 2048 bytes past $Tbl0
# and computes the inverse column mixing arithmetically on packed bytes.
# In:   $s0-$s3 = state words, $key = key schedule pointer, $Tbl0 = table base.
# Out:  $s0-$s3 = processed state.
# Uses: $t0-$t3, $acc00-$acc15, $Tbl1, $mask80, $mask1b, CTR; $key is advanced.
# The generator emits a different middle section for $SIZE_T==8 (two state
# words handled per 64-bit register) and $SIZE_T==4.
883.align 4
884Lppc_AES_decrypt_compact:
# Loop count source: word at offset 240 of the key structure.
885 lwz $acc00,240($key)
886 lwz $t0,0($key)
887 lwz $t1,4($key)
888 lwz $t2,8($key)
889 lwz $t3,12($key)
890 addi $Tbl1,$Tbl0,2048
# Constants 0x80808080 and 0x1b1b1b1b.
891 lis $mask80,0x8080
892 lis $mask1b,0x1b1b
893 addi $key,$key,16
894 ori $mask80,$mask80,0x8080
895 ori $mask1b,$mask1b,0x1b1b
896___
# 64-bit builds only: copy the low 32 bits of each mask into its upper half
# so the masks cover both words packed into one register below.
897$code.=<<___ if ($SIZE_T==8);
898 insrdi $mask80,$mask80,32,0
899 insrdi $mask1b,$mask1b,32,0
900___
901$code.=<<___;
902 mtctr $acc00
903.align 4
904Ldec_compact_loop:
# Add the round key fetched during the previous iteration (or the prologue).
905 xor $s0,$s0,$t0
906 xor $s1,$s1,$t1
907 xor $s2,$s2,$t2
908 xor $s3,$s3,$t3
# Isolate all 16 state bytes and substitute each through the byte table.
# Output word i is assembled from byte 3 of s[i], byte 2 of s[i-1],
# byte 1 of s[i-2] and byte 0 of s[i-3] (indices modulo 4).
909 rlwinm $acc00,$s0,`32-24`,24,31
910 rlwinm $acc01,$s1,`32-24`,24,31
911 rlwinm $acc02,$s2,`32-24`,24,31
912 rlwinm $acc03,$s3,`32-24`,24,31
913 rlwinm $acc04,$s3,`32-16`,24,31
914 rlwinm $acc05,$s0,`32-16`,24,31
915 rlwinm $acc06,$s1,`32-16`,24,31
916 rlwinm $acc07,$s2,`32-16`,24,31
917 lbzx $acc00,$Tbl1,$acc00
918 lbzx $acc01,$Tbl1,$acc01
919 rlwinm $acc08,$s2,`32-8`,24,31
920 rlwinm $acc09,$s3,`32-8`,24,31
921 lbzx $acc02,$Tbl1,$acc02
922 lbzx $acc03,$Tbl1,$acc03
923 rlwinm $acc10,$s0,`32-8`,24,31
924 rlwinm $acc11,$s1,`32-8`,24,31
925 lbzx $acc04,$Tbl1,$acc04
926 lbzx $acc05,$Tbl1,$acc05
927 rlwinm $acc12,$s1,`0`,24,31
928 rlwinm $acc13,$s2,`0`,24,31
929 lbzx $acc06,$Tbl1,$acc06
930 lbzx $acc07,$Tbl1,$acc07
931 rlwinm $acc14,$s3,`0`,24,31
932 rlwinm $acc15,$s0,`0`,24,31
933 lbzx $acc08,$Tbl1,$acc08
934 lbzx $acc09,$Tbl1,$acc09
# Pack the substituted bytes back into words, most significant byte first.
935 rlwinm $s0,$acc00,24,0,7
936 rlwinm $s1,$acc01,24,0,7
937 lbzx $acc10,$Tbl1,$acc10
938 lbzx $acc11,$Tbl1,$acc11
939 rlwinm $s2,$acc02,24,0,7
940 rlwinm $s3,$acc03,24,0,7
941 lbzx $acc12,$Tbl1,$acc12
942 lbzx $acc13,$Tbl1,$acc13
943 rlwimi $s0,$acc04,16,8,15
944 rlwimi $s1,$acc05,16,8,15
945 lbzx $acc14,$Tbl1,$acc14
946 lbzx $acc15,$Tbl1,$acc15
947 rlwimi $s2,$acc06,16,8,15
948 rlwimi $s3,$acc07,16,8,15
949 rlwimi $s0,$acc08,8,16,23
950 rlwimi $s1,$acc09,8,16,23
951 rlwimi $s2,$acc10,8,16,23
952 rlwimi $s3,$acc11,8,16,23
# Fetch the next round key while the last byte of each word is merged.
953 lwz $t0,0($key)
954 lwz $t1,4($key)
955 or $s0,$s0,$acc12
956 or $s1,$s1,$acc13
957 lwz $t2,8($key)
958 lwz $t3,12($key)
959 or $s2,$s2,$acc14
960 or $s3,$s3,$acc15
961
962 addi $key,$key,16
# bdz decrements CTR; once it reaches zero the final iteration skips the
# inverse column mixing and only the closing key addition is applied.
963 bdz Ldec_compact_done
964___
# 64-bit variant: $s1 is inserted into the upper half of $s0 and $s3 into
# the upper half of $s2, the doublings r2/r4/r8 are computed on both halves
# at once, and extrdi then copies each upper half into the odd-numbered
# accumulator expected by the shared tail.
965$code.=<<___ if ($SIZE_T==8);
966 # vectorized permutation improves decrypt performance by 10%
967 insrdi $s0,$s1,32,0
968 insrdi $s2,$s3,32,0
969
970 and $acc00,$s0,$mask80 # r1=r0&0x80808080
971 and $acc02,$s2,$mask80
972 srdi $acc04,$acc00,7 # r1>>7
973 srdi $acc06,$acc02,7
974 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
975 andc $acc10,$s2,$mask80
976 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
977 sub $acc02,$acc02,$acc06
978 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
979 add $acc10,$acc10,$acc10
980 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
981 and $acc02,$acc02,$mask1b
982 xor $acc00,$acc00,$acc08 # r2
983 xor $acc02,$acc02,$acc10
984
985 and $acc04,$acc00,$mask80 # r1=r2&0x80808080
986 and $acc06,$acc02,$mask80
987 srdi $acc08,$acc04,7 # r1>>7
988 srdi $acc10,$acc06,7
989 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
990 andc $acc14,$acc02,$mask80
991 sub $acc04,$acc04,$acc08 # r1-(r1>>7)
992 sub $acc06,$acc06,$acc10
993 add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
994 add $acc14,$acc14,$acc14
995 and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
996 and $acc06,$acc06,$mask1b
997 xor $acc04,$acc04,$acc12 # r4
998 xor $acc06,$acc06,$acc14
999
1000 and $acc08,$acc04,$mask80 # r1=r4&0x80808080
1001 and $acc10,$acc06,$mask80
1002 srdi $acc12,$acc08,7 # r1>>7
1003 srdi $acc14,$acc10,7
1004 sub $acc08,$acc08,$acc12 # r1-(r1>>7)
1005 sub $acc10,$acc10,$acc14
1006 andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
1007 andc $acc14,$acc06,$mask80
1008 add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
1009 add $acc14,$acc14,$acc14
1010 and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1011 and $acc10,$acc10,$mask1b
1012 xor $acc08,$acc08,$acc12 # r8
1013 xor $acc10,$acc10,$acc14
1014
1015 xor $acc00,$acc00,$s0 # r2^r0
1016 xor $acc02,$acc02,$s2
1017 xor $acc04,$acc04,$s0 # r4^r0
1018 xor $acc06,$acc06,$s2
1019
1020 extrdi $acc01,$acc00,32,0
1021 extrdi $acc03,$acc02,32,0
1022 extrdi $acc05,$acc04,32,0
1023 extrdi $acc07,$acc06,32,0
1024 extrdi $acc09,$acc08,32,0
1025 extrdi $acc11,$acc10,32,0
1026___
# 32-bit variant: the same three doublings, one state word per register.
# On exit acc00-03 hold r2^r0, acc04-07 hold r4^r0 and acc08-11 hold r8.
1027$code.=<<___ if ($SIZE_T==4);
1028 and $acc00,$s0,$mask80 # r1=r0&0x80808080
1029 and $acc01,$s1,$mask80
1030 and $acc02,$s2,$mask80
1031 and $acc03,$s3,$mask80
1032 srwi $acc04,$acc00,7 # r1>>7
1033 srwi $acc05,$acc01,7
1034 srwi $acc06,$acc02,7
1035 srwi $acc07,$acc03,7
1036 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
1037 andc $acc09,$s1,$mask80
1038 andc $acc10,$s2,$mask80
1039 andc $acc11,$s3,$mask80
1040 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
1041 sub $acc01,$acc01,$acc05
1042 sub $acc02,$acc02,$acc06
1043 sub $acc03,$acc03,$acc07
1044 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
1045 add $acc09,$acc09,$acc09
1046 add $acc10,$acc10,$acc10
1047 add $acc11,$acc11,$acc11
1048 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1049 and $acc01,$acc01,$mask1b
1050 and $acc02,$acc02,$mask1b
1051 and $acc03,$acc03,$mask1b
1052 xor $acc00,$acc00,$acc08 # r2
1053 xor $acc01,$acc01,$acc09
1054 xor $acc02,$acc02,$acc10
1055 xor $acc03,$acc03,$acc11
1056
1057 and $acc04,$acc00,$mask80 # r1=r2&0x80808080
1058 and $acc05,$acc01,$mask80
1059 and $acc06,$acc02,$mask80
1060 and $acc07,$acc03,$mask80
1061 srwi $acc08,$acc04,7 # r1>>7
1062 srwi $acc09,$acc05,7
1063 srwi $acc10,$acc06,7
1064 srwi $acc11,$acc07,7
1065 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
1066 andc $acc13,$acc01,$mask80
1067 andc $acc14,$acc02,$mask80
1068 andc $acc15,$acc03,$mask80
1069 sub $acc04,$acc04,$acc08 # r1-(r1>>7)
1070 sub $acc05,$acc05,$acc09
1071 sub $acc06,$acc06,$acc10
1072 sub $acc07,$acc07,$acc11
1073 add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
1074 add $acc13,$acc13,$acc13
1075 add $acc14,$acc14,$acc14
1076 add $acc15,$acc15,$acc15
1077 and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1078 and $acc05,$acc05,$mask1b
1079 and $acc06,$acc06,$mask1b
1080 and $acc07,$acc07,$mask1b
1081 xor $acc04,$acc04,$acc12 # r4
1082 xor $acc05,$acc05,$acc13
1083 xor $acc06,$acc06,$acc14
1084 xor $acc07,$acc07,$acc15
1085
1086 and $acc08,$acc04,$mask80 # r1=r4&0x80808080
1087 and $acc09,$acc05,$mask80
1088 and $acc10,$acc06,$mask80
1089 and $acc11,$acc07,$mask80
1090 srwi $acc12,$acc08,7 # r1>>7
1091 srwi $acc13,$acc09,7
1092 srwi $acc14,$acc10,7
1093 srwi $acc15,$acc11,7
1094 sub $acc08,$acc08,$acc12 # r1-(r1>>7)
1095 sub $acc09,$acc09,$acc13
1096 sub $acc10,$acc10,$acc14
1097 sub $acc11,$acc11,$acc15
1098 andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
1099 andc $acc13,$acc05,$mask80
1100 andc $acc14,$acc06,$mask80
1101 andc $acc15,$acc07,$mask80
1102 add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
1103 add $acc13,$acc13,$acc13
1104 add $acc14,$acc14,$acc14
1105 add $acc15,$acc15,$acc15
1106 and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1107 and $acc09,$acc09,$mask1b
1108 and $acc10,$acc10,$mask1b
1109 and $acc11,$acc11,$mask1b
1110 xor $acc08,$acc08,$acc12 # r8
1111 xor $acc09,$acc09,$acc13
1112 xor $acc10,$acc10,$acc14
1113 xor $acc11,$acc11,$acc15
1114
1115 xor $acc00,$acc00,$s0 # r2^r0
1116 xor $acc01,$acc01,$s1
1117 xor $acc02,$acc02,$s2
1118 xor $acc03,$acc03,$s3
1119 xor $acc04,$acc04,$s0 # r4^r0
1120 xor $acc05,$acc05,$s1
1121 xor $acc06,$acc06,$s2
1122 xor $acc07,$acc07,$s3
1123___
# Shared tail for both word sizes: fold the rotated r0 and the r2/r4/r8
# combinations into $s0-$s3 using 32-bit rotates, then loop.
1124$code.=<<___;
1125 rotrwi $s0,$s0,8 # = ROTATE(r0,8)
1126 rotrwi $s1,$s1,8
1127 rotrwi $s2,$s2,8
1128 rotrwi $s3,$s3,8
1129 xor $s0,$s0,$acc00 # ^= r2^r0
1130 xor $s1,$s1,$acc01
1131 xor $s2,$s2,$acc02
1132 xor $s3,$s3,$acc03
1133 xor $acc00,$acc00,$acc08
1134 xor $acc01,$acc01,$acc09
1135 xor $acc02,$acc02,$acc10
1136 xor $acc03,$acc03,$acc11
1137 xor $s0,$s0,$acc04 # ^= r4^r0
1138 xor $s1,$s1,$acc05
1139 xor $s2,$s2,$acc06
1140 xor $s3,$s3,$acc07
1141 rotrwi $acc00,$acc00,24
1142 rotrwi $acc01,$acc01,24
1143 rotrwi $acc02,$acc02,24
1144 rotrwi $acc03,$acc03,24
1145 xor $acc04,$acc04,$acc08
1146 xor $acc05,$acc05,$acc09
1147 xor $acc06,$acc06,$acc10
1148 xor $acc07,$acc07,$acc11
1149 xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
1150 xor $s1,$s1,$acc09
1151 xor $s2,$s2,$acc10
1152 xor $s3,$s3,$acc11
1153 rotrwi $acc04,$acc04,16
1154 rotrwi $acc05,$acc05,16
1155 rotrwi $acc06,$acc06,16
1156 rotrwi $acc07,$acc07,16
1157 xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
1158 xor $s1,$s1,$acc01
1159 xor $s2,$s2,$acc02
1160 xor $s3,$s3,$acc03
1161 rotrwi $acc08,$acc08,8
1162 rotrwi $acc09,$acc09,8
1163 rotrwi $acc10,$acc10,8
1164 rotrwi $acc11,$acc11,8
1165 xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
1166 xor $s1,$s1,$acc05
1167 xor $s2,$s2,$acc06
1168 xor $s3,$s3,$acc07
1169 xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
1170 xor $s1,$s1,$acc09
1171 xor $s2,$s2,$acc10
1172 xor $s3,$s3,$acc11
1173
1174 b Ldec_compact_loop
1175.align 4
1176Ldec_compact_done:
# Closing key addition with the round key loaded just before the bdz.
1177 xor $s0,$s0,$t0
1178 xor $s1,$s1,$t1
1179 xor $s2,$s2,$t2
1180 xor $s3,$s3,$t3
1181 blr
1182.long 0
1183.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
1184.align 7
1185___
1186
# Post-process the accumulated assembly text: every `...` span is replaced
# by the result of evaluating its contents as a Perl expression (this is how
# offsets such as `32-24` or `$FRAME-$SIZE_T*21` become literal numbers).
1187$code =~ s/\`([^\`]*)\`/eval $1/gem;
# Write the finished assembly to standard output and close the handle.
1188print $code;
1189close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl
deleted file mode 100644
index 7e01889298..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-s390x.pl
+++ /dev/null
@@ -1,1339 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for s390x.
11
12# April 2007.
13#
14# Software performance improvement over gcc-generated code is ~70% and
15# in absolute terms is ~73 cycles per byte processed with 128-bit key.
16# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
17# *strictly* in-order execution and issued instruction [in this case
18# load value from memory is critical] has to complete before execution
19# flow proceeds. S-boxes are compressed to 2KB[+256B].
20#
21# As for hardware acceleration support. It's basically a "teaser," as
22# it can and should be improved in several ways. Most notably support
23# for CBC is not utilized, nor multiple blocks are ever processed.
24# Then software key schedule can be postponed till hardware support
25# detection... Performance improvement over assembler is reportedly
26# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
27# support is implemented.
28
29# May 2007.
30#
31# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
32# for 128-bit keys, if hardware support is detected.
33
34# January 2009.
35#
36# Add support for hardware AES192/256 and reschedule instructions to
37# minimize/avoid Address Generation Interlock hazard and to favour
38# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
39# almost 50% on z9. The gain is smaller on z10, because being dual-
40# issue z10 makes it impossible to eliminate the interlock condition:
41# critical path is not long enough. Yet it spends ~24 cycles per byte
42# processed with 128-bit key.
43#
44# Unlike previous version hardware support detection takes place only
45# at the moment of key schedule setup, which is denoted in key->rounds.
46# This is done, because deferred key setup can't be made MT-safe, not
47# for key lengths longer than 128 bits.
48#
49# Add AES_cbc_encrypt, which gives incredible performance improvement,
50# it was measured to be ~6.6x. It's less than previously mentioned 8x,
51# because software implementation was optimized.
52
# Build-time switch: when true, the "if (!$softonly)" heredoc fragments that
# use the hardware cipher instruction are left out of the generated code.
53$softonly=0; # allow hardware support
54
# Symbolic register names used throughout the generated assembly.
# Several names deliberately share one register:
#   %r0 is both $t0 and $mask,
#   %r2 is both $t2 and $inp,
#   %r3 is $t3, $out and $bits,
# so $inp/$out are only valid until the corresponding temporary is written.
55$t0="%r0"; $mask="%r0";
56$t1="%r1";
57$t2="%r2"; $inp="%r2";
58$t3="%r3"; $out="%r3"; $bits="%r3";
59$key="%r4";
# Extra index registers for table lookups.
60$i1="%r5";
61$i2="%r6";
62$i3="%r7";
# The four 32-bit state words.
63$s0="%r8";
64$s1="%r9";
65$s2="%r10";
66$s3="%r11";
# Table base pointer, loop counter, return address and stack pointer.
67$tbl="%r12";
68$rounds="%r13";
69$ra="%r14";
70$sp="%r15";
71
# _data_word(LIST): append one ".long" directive per argument to the global
# $code buffer. Each value is written twice on its line, formatted as
# 0x%08x, so every table entry occupies two identical 32-bit words.
# Iteration stops at the first undefined argument.
# NOTE(review): the empty prototype "()" declares a sub taking no arguments;
# it has no effect on calls written as &_data_word(...), which bypass
# prototype checking. Consider dropping the prototype.
72sub _data_word()
73{ my $i;
74 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
75}
76
77$code=<<___;
78.text
79
80.type AES_Te,\@object
81.align 256
82AES_Te:
83___
84&_data_word(
85 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
86 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
87 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
88 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
89 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
90 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
91 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
92 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
93 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
94 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
95 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
96 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
97 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
98 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
99 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
100 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
101 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
102 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
103 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
104 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
105 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
106 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
107 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
108 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
109 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
110 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
111 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
112 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
113 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
114 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
115 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
116 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
117 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
118 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
119 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
120 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
121 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
122 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
123 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
124 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
125 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
126 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
127 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
128 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
129 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
130 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
131 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
132 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
133 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
134 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
135 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
136 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
137 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
138 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
139 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
140 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
141 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
142 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
143 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
144 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
145 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
146 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
147 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
148 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
149$code.=<<___;
150# Te4[256]
151.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
152.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
153.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
154.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
155.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
156.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
157.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
158.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
159.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
160.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
161.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
162.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
163.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
164.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
165.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
166.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
167.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
168.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
169.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
170.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
171.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
172.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
173.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
174.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
175.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
176.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
177.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
178.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
179.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
180.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
181.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
182.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
183# rcon[]
184.long 0x01000000, 0x02000000, 0x04000000, 0x08000000
185.long 0x10000000, 0x20000000, 0x40000000, 0x80000000
186.long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
187.align 256
188.size AES_Te,.-AES_Te
189
# AES_encrypt: exported single-block encryption entry point.
# Arguments arrive as $inp (%r2), $out (%r3) and $key (%r4).
# Unless the generator was run with $softonly set, the word at offset
# 240 of the key structure is tested first: values below 16 take the software
# path at .Lesoft, anything else is handed to the km instruction as-is.
190# void AES_encrypt(const unsigned char *inp, unsigned char *out,
191# const AES_KEY *key) {
192.globl AES_encrypt
193.type AES_encrypt,\@function
194AES_encrypt:
195___
196$code.=<<___ if (!$softonly);
# %r0 = word at key+240; compare it (unsigned) with 16.
197 l %r0,240($key)
198 lhi %r1,16
199 clr %r0,%r1
200 jl .Lesoft
201
# Operand setup for km. The order matters because %r4 ($key) and %r3 ($out)
# are reused: the key address is copied to %r1 before %r4 receives the output
# address, and %r3 receives the length only after $out has been copied.
# %r2 already holds the input address, hence the commented-out "la".
202 la %r1,0($key)
203 #la %r2,0($inp)
204 la %r4,0($out)
205 lghi %r3,16 # single block length
206 .long 0xb92e0042 # km %r4,%r2
# Branch back 4 bytes (to the km word above) when condition code 3 is set.
207 brc 1,.-4 # can this happen?
208 br %r14
209.align 64
210.Lesoft:
211___
212$code.=<<___;
# Software path: save %r3..%r14 starting at 24($sp); the slot at 24($sp)
# therefore keeps the caller's output pointer.
213 stmg %r3,$ra,24($sp)
214
# Load the input block as four zero-extended 32-bit words.
215 llgf $s0,0($inp)
216 llgf $s1,4($inp)
217 llgf $s2,8($inp)
218 llgf $s3,12($inp)
219
220 larl $tbl,AES_Te
221 bras $ra,_s390x_AES_encrypt
222
# $out shares %r3 with $t3, which the worker overwrites, so reload it from
# the save area before storing the result.
223 lg $out,24($sp)
224 st $s0,0($out)
225 st $s1,4($out)
226 st $s2,8($out)
227 st $s3,12($out)
228
# Restore %r6..%r14 (saved at 48($sp) by the stmg above) and return.
229 lmg %r6,$ra,48($sp)
230 br $ra
231.size AES_encrypt,.-AES_encrypt
232
# _s390x_AES_encrypt: internal worker for the software encryption path.
# In:   $s0-$s3 = state words, $key = key schedule pointer, $tbl = AES_Te.
# Out:  $s0-$s3 = encrypted state.
# Uses: $t0/$mask, $t1-$t3, $i1-$i3, $rounds; $key is advanced by 16 per loop
#       iteration; $ra is used as a scratch index and therefore saved at
#       152($sp) on entry and reloaded before returning.
# Table layout: _data_word stores every table word twice (8 bytes per entry),
# so indices are byte values scaled by 8 and displacements 0..3 into an entry
# read the word rotated by 0..3 bytes (labelled Te0, Te3, Te2, Te1 below).
233.type _s390x_AES_encrypt,\@function
234.align 16
235_s390x_AES_encrypt:
236 stg $ra,152($sp)
# Initial key addition.
237 x $s0,0($key)
238 x $s1,4($key)
239 x $s2,8($key)
240 x $s3,12($key)
# Loop count = (word at key+240) - 1; the last round is unrolled after the loop.
241 l $rounds,240($key)
# $mask = 0xff<<3: selects one byte already scaled to an 8-byte table stride.
242 llill $mask,`0xff<<3`
243 aghi $rounds,-1
# Jump over the alignment padding that precedes the loop head.
244 j .Lenc_loop
245.align 16
246.Lenc_loop:
# Split s0 into four scaled byte indices ($s0 itself keeps the top byte).
247 sllg $t1,$s0,`0+3`
248 srlg $t2,$s0,`8-3`
249 srlg $t3,$s0,`16-3`
250 srl $s0,`24-3`
251 nr $s0,$mask
252 ngr $t1,$mask
253 nr $t2,$mask
254 nr $t3,$mask
255
# Same for s1.
256 srlg $i1,$s1,`16-3` # i0
257 sllg $i2,$s1,`0+3`
258 srlg $i3,$s1,`8-3`
259 srl $s1,`24-3`
260 nr $i1,$mask
261 nr $s1,$mask
262 ngr $i2,$mask
263 nr $i3,$mask
264
# Table words for the bytes of s0 start the four output accumulators:
# $s0 -> column 0, $t3 -> column 3, $t2 -> column 2, $t1 -> column 1.
265 l $s0,0($s0,$tbl) # Te0[s0>>24]
266 l $t1,1($t1,$tbl) # Te3[s0>>0]
267 l $t2,2($t2,$tbl) # Te2[s0>>8]
268 l $t3,3($t3,$tbl) # Te1[s0>>16]
269
270 x $s0,3($i1,$tbl) # Te1[s1>>16]
271 l $s1,0($s1,$tbl) # Te0[s1>>24]
272 x $t2,1($i2,$tbl) # Te3[s1>>0]
273 x $t3,2($i3,$tbl) # Te2[s1>>8]
274
# Indices from s2.
275 srlg $i1,$s2,`8-3` # i0
276 srlg $i2,$s2,`16-3` # i1
277 nr $i1,$mask
278 nr $i2,$mask
279 sllg $i3,$s2,`0+3`
280 srl $s2,`24-3`
281 nr $s2,$mask
282 ngr $i3,$mask
283
# Fold the pending column-1 word into $s1, then reuse $t1 and $ra as index
# registers for s3. The key pointer is advanced here for the loads below.
284 xr $s1,$t1
285 srlg $ra,$s3,`8-3` # i1
286 sllg $t1,$s3,`0+3` # i0
287 nr $ra,$mask
288 la $key,16($key)
289 ngr $t1,$mask
290
291 x $s0,2($i1,$tbl) # Te2[s2>>8]
292 x $s1,3($i2,$tbl) # Te1[s2>>16]
293 l $s2,0($s2,$tbl) # Te0[s2>>24]
294 x $t3,1($i3,$tbl) # Te3[s2>>0]
295
296 srlg $i3,$s3,`16-3` # i2
297 xr $s2,$t2
298 srl $s3,`24-3`
299 nr $i3,$mask
300 nr $s3,$mask
301
# Add this round's key ($t3 carries column 3 until it is merged into $s3).
302 x $s0,0($key)
303 x $s1,4($key)
304 x $s2,8($key)
305 x $t3,12($key)
306
307 x $s0,1($t1,$tbl) # Te3[s3>>0]
308 x $s1,2($ra,$tbl) # Te2[s3>>8]
309 x $s2,3($i3,$tbl) # Te1[s3>>16]
310 l $s3,0($s3,$tbl) # Te0[s3>>24]
311 xr $s3,$t3
312
# Decrement $rounds and repeat while it is non-zero.
313 brct $rounds,.Lenc_loop
314 .align 16
315
# Last round: single-byte loads (llgc) at displacement 2 of each entry,
# shifted into position and OR-ed together; no column mixing.
316 sllg $t1,$s0,`0+3`
317 srlg $t2,$s0,`8-3`
318 ngr $t1,$mask
319 srlg $t3,$s0,`16-3`
320 srl $s0,`24-3`
321 nr $s0,$mask
322 nr $t2,$mask
323 nr $t3,$mask
324
325 srlg $i1,$s1,`16-3` # i0
326 sllg $i2,$s1,`0+3`
327 ngr $i2,$mask
328 srlg $i3,$s1,`8-3`
329 srl $s1,`24-3`
330 nr $i1,$mask
331 nr $s1,$mask
332 nr $i3,$mask
333
334 llgc $s0,2($s0,$tbl) # Te4[s0>>24]
335 llgc $t1,2($t1,$tbl) # Te4[s0>>0]
336 sll $s0,24
337 llgc $t2,2($t2,$tbl) # Te4[s0>>8]
338 llgc $t3,2($t3,$tbl) # Te4[s0>>16]
339 sll $t2,8
340 sll $t3,16
341
342 llgc $i1,2($i1,$tbl) # Te4[s1>>16]
343 llgc $s1,2($s1,$tbl) # Te4[s1>>24]
344 llgc $i2,2($i2,$tbl) # Te4[s1>>0]
345 llgc $i3,2($i3,$tbl) # Te4[s1>>8]
346 sll $i1,16
347 sll $s1,24
348 sll $i3,8
349 or $s0,$i1
350 or $s1,$t1
351 or $t2,$i2
352 or $t3,$i3
353
354 srlg $i1,$s2,`8-3` # i0
355 srlg $i2,$s2,`16-3` # i1
356 nr $i1,$mask
357 nr $i2,$mask
358 sllg $i3,$s2,`0+3`
359 srl $s2,`24-3`
360 ngr $i3,$mask
361 nr $s2,$mask
362
363 sllg $t1,$s3,`0+3` # i0
364 srlg $ra,$s3,`8-3` # i1
365 ngr $t1,$mask
366
367 llgc $i1,2($i1,$tbl) # Te4[s2>>8]
368 llgc $i2,2($i2,$tbl) # Te4[s2>>16]
369 sll $i1,8
370 llgc $s2,2($s2,$tbl) # Te4[s2>>24]
371 llgc $i3,2($i3,$tbl) # Te4[s2>>0]
372 sll $i2,16
373 nr $ra,$mask
374 sll $s2,24
375 or $s0,$i1
376 or $s1,$i2
377 or $s2,$t2
378 or $t3,$i3
379
380 srlg $i3,$s3,`16-3` # i2
381 srl $s3,`24-3`
382 nr $i3,$mask
383 nr $s3,$mask
384
# $t0 shares %r0 with $mask; all masking is finished, so it can now hold the
# first word of the last round key. $key was not advanced in this round,
# hence the 16..28 displacements here and below.
385 l $t0,16($key)
386 l $t2,20($key)
387
388 llgc $i1,2($t1,$tbl) # Te4[s3>>0]
389 llgc $i2,2($ra,$tbl) # Te4[s3>>8]
390 llgc $i3,2($i3,$tbl) # Te4[s3>>16]
391 llgc $s3,2($s3,$tbl) # Te4[s3>>24]
392 sll $i2,8
393 sll $i3,16
394 sll $s3,24
395 or $s0,$i1
396 or $s1,$i2
397 or $s2,$i3
398 or $s3,$t3
399
# Reload the return address saved on entry, apply the last key and return.
400 lg $ra,152($sp)
401 xr $s0,$t0
402 xr $s1,$t2
403 x $s2,24($key)
404 x $s3,28($key)
405
406 br $ra
407.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
408___
409
410$code.=<<___;
411.type AES_Td,\@object
412.align 256
413AES_Td:
414___
415&_data_word(
416 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
417 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
418 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
419 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
420 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
421 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
422 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
423 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
424 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
425 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
426 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
427 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
428 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
429 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
430 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
431 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
432 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
433 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
434 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
435 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
436 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
437 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
438 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
439 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
440 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
441 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
442 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
443 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
444 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
445 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
446 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
447 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
448 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
449 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
450 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
451 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
452 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
453 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
454 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
455 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
456 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
457 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
458 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
459 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
460 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
461 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
462 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
463 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
464 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
465 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
466 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
467 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
468 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
469 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
470 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
471 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
472 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
473 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
474 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
475 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
476 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
477 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
478 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
479 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
# Emit the 256-byte Td4 table directly after the _data_word() output (the
# decrypt code below reads it at offset 2048 from AES_Td), close AES_Td with
# its .size directive, and open the public AES_decrypt entry point.
# $key (and $inp/$out used further down) are register aliases defined
# earlier in the file, outside this excerpt.
480$code.=<<___;
481# Td4[256]
482.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
483.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
484.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
485.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
486.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
487.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
488.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
489.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
490.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
491.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
492.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
493.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
494.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
495.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
496.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
497.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
498.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
499.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
500.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
501.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
502.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
503.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
504.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
505.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
506.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
507.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
508.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
509.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
510.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
511.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
512.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
513.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
514.size AES_Td,.-AES_Td
515
516# void AES_decrypt(const unsigned char *inp, unsigned char *out,
517# const AES_KEY *key) {
518.globl AES_decrypt
519.type AES_decrypt,\@function
520AES_decrypt:
521___
# Hardware path of AES_decrypt, emitted unless $softonly is set.
# The word at 240(key) holds either a software round count (10/12/14, stored
# by the software key schedule) or a km function code (18 and up, stored by
# the hardware key setup). Values below 16 branch to the software path at
# .Ldsoft; otherwise the single 16-byte block is processed by the KM
# instruction, which is hand-encoded with .long, and the function returns.
522$code.=<<___ if (!$softonly);
523 l %r0,240($key)
524 lhi %r1,16
525 clr %r0,%r1
526 jl .Ldsoft
527
528 la %r1,0($key)
529 #la %r2,0($inp)
530 la %r4,0($out)
531 lghi %r3,16 # single block length
532 .long 0xb92e0042 # km %r4,%r2
533 brc 1,.-4 # can this happen?
534 br %r14
535.align 64
536.Ldsoft:
537___
# Software path of AES_decrypt followed by the internal _s390x_AES_decrypt
# core.
#
# AES_decrypt: saves %r3..ra at 24(sp), loads the 16-byte input as four
# 32-bit words into s0..s3, points tbl at AES_Td, calls the core, reloads
# the output pointer from 24(sp), stores the four result words and restores
# %r6..ra before returning.
#
# _s390x_AES_decrypt: parks the return address at 152(sp) because ra is
# reused as a scratch index register inside the rounds. It xors the state
# with the first round key and runs rounds-1 iterations of .Ldec_loop.
# Table indices are pre-scaled by 8 (mask 0xff<<3) and the byte
# displacement 0..3 selects the Td0..Td3 view of the same table; key is
# advanced by 16 per iteration. The final round uses byte lookups in Td4 at
# offset 2048 (touched up front as a prefetch), reassembles the four words
# with shifts and ORs, and xors in the last round key from 16..28(key).
538$code.=<<___;
539 stmg %r3,$ra,24($sp)
540
541 llgf $s0,0($inp)
542 llgf $s1,4($inp)
543 llgf $s2,8($inp)
544 llgf $s3,12($inp)
545
546 larl $tbl,AES_Td
547 bras $ra,_s390x_AES_decrypt
548
549 lg $out,24($sp)
550 st $s0,0($out)
551 st $s1,4($out)
552 st $s2,8($out)
553 st $s3,12($out)
554
555 lmg %r6,$ra,48($sp)
556 br $ra
557.size AES_decrypt,.-AES_decrypt
558
559.type _s390x_AES_decrypt,\@function
560.align 16
561_s390x_AES_decrypt:
562 stg $ra,152($sp)
563 x $s0,0($key)
564 x $s1,4($key)
565 x $s2,8($key)
566 x $s3,12($key)
567 l $rounds,240($key)
568 llill $mask,`0xff<<3`
569 aghi $rounds,-1
570 j .Ldec_loop
571.align 16
572.Ldec_loop:
573 srlg $t1,$s0,`16-3`
574 srlg $t2,$s0,`8-3`
575 sllg $t3,$s0,`0+3`
576 srl $s0,`24-3`
577 nr $s0,$mask
578 nr $t1,$mask
579 nr $t2,$mask
580 ngr $t3,$mask
581
582 sllg $i1,$s1,`0+3` # i0
583 srlg $i2,$s1,`16-3`
584 srlg $i3,$s1,`8-3`
585 srl $s1,`24-3`
586 ngr $i1,$mask
587 nr $s1,$mask
588 nr $i2,$mask
589 nr $i3,$mask
590
591 l $s0,0($s0,$tbl) # Td0[s0>>24]
592 l $t1,3($t1,$tbl) # Td1[s0>>16]
593 l $t2,2($t2,$tbl) # Td2[s0>>8]
594 l $t3,1($t3,$tbl) # Td3[s0>>0]
595
596 x $s0,1($i1,$tbl) # Td3[s1>>0]
597 l $s1,0($s1,$tbl) # Td0[s1>>24]
598 x $t2,3($i2,$tbl) # Td1[s1>>16]
599 x $t3,2($i3,$tbl) # Td2[s1>>8]
600
601 srlg $i1,$s2,`8-3` # i0
602 sllg $i2,$s2,`0+3` # i1
603 srlg $i3,$s2,`16-3`
604 srl $s2,`24-3`
605 nr $i1,$mask
606 ngr $i2,$mask
607 nr $s2,$mask
608 nr $i3,$mask
609
610 xr $s1,$t1
611 srlg $ra,$s3,`8-3` # i1
612 srlg $t1,$s3,`16-3` # i0
613 nr $ra,$mask
614 la $key,16($key)
615 nr $t1,$mask
616
617 x $s0,2($i1,$tbl) # Td2[s2>>8]
618 x $s1,1($i2,$tbl) # Td3[s2>>0]
619 l $s2,0($s2,$tbl) # Td0[s2>>24]
620 x $t3,3($i3,$tbl) # Td1[s2>>16]
621
622 sllg $i3,$s3,`0+3` # i2
623 srl $s3,`24-3`
624 ngr $i3,$mask
625 nr $s3,$mask
626
627 xr $s2,$t2
628 x $s0,0($key)
629 x $s1,4($key)
630 x $s2,8($key)
631 x $t3,12($key)
632
633 x $s0,3($t1,$tbl) # Td1[s3>>16]
634 x $s1,2($ra,$tbl) # Td2[s3>>8]
635 x $s2,1($i3,$tbl) # Td3[s3>>0]
636 l $s3,0($s3,$tbl) # Td0[s3>>24]
637 xr $s3,$t3
638
639 brct $rounds,.Ldec_loop
640 .align 16
641
642 l $t1,`2048+0`($tbl) # prefetch Td4
643 l $t2,`2048+64`($tbl)
644 l $t3,`2048+128`($tbl)
645 l $i1,`2048+192`($tbl)
646 llill $mask,0xff
647
648 srlg $i3,$s0,24 # i0
649 srlg $t1,$s0,16
650 srlg $t2,$s0,8
651 nr $s0,$mask # i3
652 nr $t1,$mask
653
654 srlg $i1,$s1,24
655 nr $t2,$mask
656 srlg $i2,$s1,16
657 srlg $ra,$s1,8
658 nr $s1,$mask # i0
659 nr $i2,$mask
660 nr $ra,$mask
661
662 llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
663 llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
664 llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
665 sll $t1,16
666 llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
667 sllg $s0,$i3,24
668 sll $t2,8
669
670 llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
671 llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
672 llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
673 sll $i1,24
674 llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
675 sll $i2,16
676 sll $i3,8
677 or $s0,$s1
678 or $t1,$i1
679 or $t2,$i2
680 or $t3,$i3
681
682 srlg $i1,$s2,8 # i0
683 srlg $i2,$s2,24
684 srlg $i3,$s2,16
685 nr $s2,$mask # i1
686 nr $i1,$mask
687 nr $i3,$mask
688 llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
689 llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
690 llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
691 llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
692 sll $i1,8
693 sll $i2,24
694 or $s0,$i1
695 sll $i3,16
696 or $t2,$i2
697 or $t3,$i3
698
699 srlg $i1,$s3,16 # i0
700 srlg $i2,$s3,8 # i1
701 srlg $i3,$s3,24
702 nr $s3,$mask # i2
703 nr $i1,$mask
704 nr $i2,$mask
705
706 lg $ra,152($sp)
707 or $s1,$t1
708 l $t0,16($key)
709 l $t1,20($key)
710
711 llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
712 llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
713 sll $i1,16
714 llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
715 llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
716 sll $i2,8
717 sll $s3,24
718 or $s0,$i1
719 or $s1,$i2
720 or $s2,$t2
721 or $s3,$t3
722
723 xr $s0,$t0
724 xr $s1,$t1
725 x $s2,24($key)
726 x $s3,28($key)
727
728 br $ra
729.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
730___
731
# Entry and argument validation of AES_set_encrypt_key.
# Returns -1 in %r2 (via .Lminus1, emitted further down) when inp or key is
# NULL, and -2 when bits is not 128, 192 or 256; valid requests continue at
# .Lproceed.
# Note: the prototype comment emitted into the assembly says "void", but
# the routine returns a status in %r2, which AES_set_decrypt_key tests.
732$code.=<<___;
733# void AES_set_encrypt_key(const unsigned char *in, int bits,
734# AES_KEY *key) {
735.globl AES_set_encrypt_key
736.type AES_set_encrypt_key,\@function
737.align 16
738AES_set_encrypt_key:
739 lghi $t0,0
740 clgr $inp,$t0
741 je .Lminus1
742 clgr $key,$t0
743 je .Lminus1
744
745 lghi $t0,128
746 clr $bits,$t0
747 je .Lproceed
748 lghi $t0,192
749 clr $bits,$t0
750 je .Lproceed
751 lghi $t0,256
752 clr $bits,$t0
753 je .Lproceed
754 lghi %r2,-2
755 br %r14
756
757.align 16
758.Lproceed:
759___
# Hardware key setup, emitted unless $softonly is set.
# Maps bits 128/192/256 to function code 18/19/20, tests the
# message-security-assist bit in OPENSSL_s390xcap_P, then issues the KMC
# query function (code 0) to obtain the capability vector at 16(sp). If
# either check fails, control goes to the software expansion at
# .Lekey_internal. Otherwise the raw key (16, 24 or 32 bytes) is copied into
# the schedule unchanged, bits is saved at 236(key), the function code at
# 240(key), and 0 is returned.
760$code.=<<___ if (!$softonly);
761 # convert bits to km code, [128,192,256]->[18,19,20]
762 lhi %r5,-128
763 lhi %r0,18
764 ar %r5,$bits
765 srl %r5,6
766 ar %r5,%r0
767
768 larl %r1,OPENSSL_s390xcap_P
769 lg %r0,0(%r1)
770 tmhl %r0,0x4000 # check for message-security assist
771 jz .Lekey_internal
772
773 lghi %r0,0 # query capability vector
774 la %r1,16($sp)
775 .long 0xb92f0042 # kmc %r4,%r2
776
777 llihh %r1,0x8000
778 srlg %r1,%r1,0(%r5)
779 ng %r1,16($sp)
780 jz .Lekey_internal
781
782 lmg %r0,%r1,0($inp) # just copy 128 bits...
783 stmg %r0,%r1,0($key)
784 lhi %r0,192
785 cr $bits,%r0
786 jl 1f
787 lg %r1,16($inp)
788 stg %r1,16($key)
789 je 1f
790 lg %r1,24($inp)
791 stg %r1,24($key)
7921: st $bits,236($key) # save bits
793 st %r5,240($key) # save km code
794 lghi %r2,0
795 br %r14
796___
# Software key expansion (.Lekey_internal), the .Lminus1 error exit, and the
# prologue of AES_set_decrypt_key.
#
# .Lekey_internal saves %r6-%r13 at 48(sp) and points tbl at AES_Te+2048,
# which the code reads as the Te4 byte table, with the rcon words at offset
# 256 from it. The first four key words are copied to the schedule, then
# one of three loops runs depending on bits:
#   128: stores rounds=10 at 240(key); .L128_loop runs 10 times, 4 new
#        words per pass, key advanced by 16.
#   192: stores rounds=12; .L192_loop runs 8 times, key advanced by 24
#        per pass via .L192_continue.
#   256: stores rounds=14; .L256_loop runs 7 times, key advanced by 32
#        per pass via .L256_continue.
# The round count is stored before key is advanced, so it lands at offset
# 240 of the caller's structure. Each loop exits by restoring %r6-%r13 and
# returning 0 in %r2.
#
# AES_set_decrypt_key stashes key at 32(sp) and ra at 112(sp); both slots
# lie outside the 48..111(sp) area that .Lekey_internal uses for %r6-%r13.
# It calls AES_set_encrypt_key and returns at once if that reported a
# non-zero status.
797$code.=<<___;
798.align 16
799.Lekey_internal:
800 stmg %r6,%r13,48($sp) # all non-volatile regs
801
802 larl $tbl,AES_Te+2048
803
804 llgf $s0,0($inp)
805 llgf $s1,4($inp)
806 llgf $s2,8($inp)
807 llgf $s3,12($inp)
808 st $s0,0($key)
809 st $s1,4($key)
810 st $s2,8($key)
811 st $s3,12($key)
812 lghi $t0,128
813 cr $bits,$t0
814 jne .Lnot128
815
816 llill $mask,0xff
817 lghi $t3,0 # i=0
818 lghi $rounds,10
819 st $rounds,240($key)
820
821 llgfr $t2,$s3 # temp=rk[3]
822 srlg $i1,$s3,8
823 srlg $i2,$s3,16
824 srlg $i3,$s3,24
825 nr $t2,$mask
826 nr $i1,$mask
827 nr $i2,$mask
828
829.align 16
830.L128_loop:
831 la $t2,0($t2,$tbl)
832 la $i1,0($i1,$tbl)
833 la $i2,0($i2,$tbl)
834 la $i3,0($i3,$tbl)
835 icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
836 icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
837 icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
838 icm $t2,1,0($i3) # Te4[rk[3]>>24]
839 x $t2,256($t3,$tbl) # rcon[i]
840 xr $s0,$t2 # rk[4]=rk[0]^...
841 xr $s1,$s0 # rk[5]=rk[1]^rk[4]
842 xr $s2,$s1 # rk[6]=rk[2]^rk[5]
843 xr $s3,$s2 # rk[7]=rk[3]^rk[6]
844
845 llgfr $t2,$s3 # temp=rk[3]
846 srlg $i1,$s3,8
847 srlg $i2,$s3,16
848 nr $t2,$mask
849 nr $i1,$mask
850 srlg $i3,$s3,24
851 nr $i2,$mask
852
853 st $s0,16($key)
854 st $s1,20($key)
855 st $s2,24($key)
856 st $s3,28($key)
857 la $key,16($key) # key+=4
858 la $t3,4($t3) # i++
859 brct $rounds,.L128_loop
860 lghi %r2,0
861 lmg %r6,%r13,48($sp)
862 br $ra
863
864.align 16
865.Lnot128:
866 llgf $t0,16($inp)
867 llgf $t1,20($inp)
868 st $t0,16($key)
869 st $t1,20($key)
870 lghi $t0,192
871 cr $bits,$t0
872 jne .Lnot192
873
874 llill $mask,0xff
875 lghi $t3,0 # i=0
876 lghi $rounds,12
877 st $rounds,240($key)
878 lghi $rounds,8
879
880 srlg $i1,$t1,8
881 srlg $i2,$t1,16
882 srlg $i3,$t1,24
883 nr $t1,$mask
884 nr $i1,$mask
885 nr $i2,$mask
886
887.align 16
888.L192_loop:
889 la $t1,0($t1,$tbl)
890 la $i1,0($i1,$tbl)
891 la $i2,0($i2,$tbl)
892 la $i3,0($i3,$tbl)
893 icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
894 icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
895 icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
896 icm $t1,1,0($i3) # Te4[rk[5]>>24]
897 x $t1,256($t3,$tbl) # rcon[i]
898 xr $s0,$t1 # rk[6]=rk[0]^...
899 xr $s1,$s0 # rk[7]=rk[1]^rk[6]
900 xr $s2,$s1 # rk[8]=rk[2]^rk[7]
901 xr $s3,$s2 # rk[9]=rk[3]^rk[8]
902
903 st $s0,24($key)
904 st $s1,28($key)
905 st $s2,32($key)
906 st $s3,36($key)
907 brct $rounds,.L192_continue
908 lghi %r2,0
909 lmg %r6,%r13,48($sp)
910 br $ra
911
912.align 16
913.L192_continue:
914 lgr $t1,$s3
915 x $t1,16($key) # rk[10]=rk[4]^rk[9]
916 st $t1,40($key)
917 x $t1,20($key) # rk[11]=rk[5]^rk[10]
918 st $t1,44($key)
919
920 srlg $i1,$t1,8
921 srlg $i2,$t1,16
922 srlg $i3,$t1,24
923 nr $t1,$mask
924 nr $i1,$mask
925 nr $i2,$mask
926
927 la $key,24($key) # key+=6
928 la $t3,4($t3) # i++
929 j .L192_loop
930
931.align 16
932.Lnot192:
933 llgf $t0,24($inp)
934 llgf $t1,28($inp)
935 st $t0,24($key)
936 st $t1,28($key)
937 llill $mask,0xff
938 lghi $t3,0 # i=0
939 lghi $rounds,14
940 st $rounds,240($key)
941 lghi $rounds,7
942
943 srlg $i1,$t1,8
944 srlg $i2,$t1,16
945 srlg $i3,$t1,24
946 nr $t1,$mask
947 nr $i1,$mask
948 nr $i2,$mask
949
950.align 16
951.L256_loop:
952 la $t1,0($t1,$tbl)
953 la $i1,0($i1,$tbl)
954 la $i2,0($i2,$tbl)
955 la $i3,0($i3,$tbl)
956 icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
957 icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
958 icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
959 icm $t1,1,0($i3) # Te4[rk[7]>>24]
960 x $t1,256($t3,$tbl) # rcon[i]
961 xr $s0,$t1 # rk[8]=rk[0]^...
962 xr $s1,$s0 # rk[9]=rk[1]^rk[8]
963 xr $s2,$s1 # rk[10]=rk[2]^rk[9]
964 xr $s3,$s2 # rk[11]=rk[3]^rk[10]
965 st $s0,32($key)
966 st $s1,36($key)
967 st $s2,40($key)
968 st $s3,44($key)
969 brct $rounds,.L256_continue
970 lghi %r2,0
971 lmg %r6,%r13,48($sp)
972 br $ra
973
974.align 16
975.L256_continue:
976 lgr $t1,$s3 # temp=rk[11]
977 srlg $i1,$s3,8
978 srlg $i2,$s3,16
979 srlg $i3,$s3,24
980 nr $t1,$mask
981 nr $i1,$mask
982 nr $i2,$mask
983 la $t1,0($t1,$tbl)
984 la $i1,0($i1,$tbl)
985 la $i2,0($i2,$tbl)
986 la $i3,0($i3,$tbl)
987 llgc $t1,0($t1) # Te4[rk[11]>>0]
988 icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
989 icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
990 icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
991 x $t1,16($key) # rk[12]=rk[4]^...
992 st $t1,48($key)
993 x $t1,20($key) # rk[13]=rk[5]^rk[12]
994 st $t1,52($key)
995 x $t1,24($key) # rk[14]=rk[6]^rk[13]
996 st $t1,56($key)
997 x $t1,28($key) # rk[15]=rk[7]^rk[14]
998 st $t1,60($key)
999
1000 srlg $i1,$t1,8
1001 srlg $i2,$t1,16
1002 srlg $i3,$t1,24
1003 nr $t1,$mask
1004 nr $i1,$mask
1005 nr $i2,$mask
1006
1007 la $key,32($key) # key+=8
1008 la $t3,4($t3) # i++
1009 j .L256_loop
1010
1011.Lminus1:
1012 lghi %r2,-1
1013 br $ra
1014.size AES_set_encrypt_key,.-AES_set_encrypt_key
1015
1016# void AES_set_decrypt_key(const unsigned char *in, int bits,
1017# AES_KEY *key) {
1018.globl AES_set_decrypt_key
1019.type AES_set_decrypt_key,\@function
1020.align 16
1021AES_set_decrypt_key:
1022 stg $key,32($sp) # I rely on AES_set_encrypt_key to
1023 stg $ra,112($sp) # save non-volatile registers!
1024 bras $ra,AES_set_encrypt_key
1025 lg $key,32($sp)
1026 lg $ra,112($sp)
1027 ltgr %r2,%r2
1028 bnzr $ra
1029___
# Hardware shortcut of AES_set_decrypt_key, emitted unless $softonly is set.
# If 240(key) holds a km function code (>= 16), the schedule is just the raw
# key, so the "decrypt" bit 0x80 is OR-ed into the stored code and the
# routine returns. A software round count (< 16) jumps to .Lgo instead.
# NOTE(review): .Ldkey_internal is not referenced in the visible part of the
# file. As written it re-runs .Lekey_internal and falls through to .Lgo;
# confirm whether it is still needed.
1030$code.=<<___ if (!$softonly);
1031 l $t0,240($key)
1032 lhi $t1,16
1033 cr $t0,$t1
1034 jl .Lgo
1035 oill $t0,0x80 # set "decrypt" bit
1036 st $t0,240($key)
1037 br $ra
1038
1039.align 16
1040.Ldkey_internal:
1041 stg $key,32($sp)
1042 stg $ra,40($sp)
1043 bras $ra,.Lekey_internal
1044 lg $key,32($sp)
1045 lg $ra,40($sp)
1046___
# Software decrypt-key step 1: reverse the order of the 16-byte round keys
# in place. i1 starts at the first round key and moves up by 16; i2 starts
# at key + 16*rounds (the last round key) and moves down by 16. Each pass of
# .Linv swaps the two 16-byte entries; rounds/2 passes are made, so the
# middle entry stays where it is.
1047$code.=<<___;
1048
1049.Lgo: llgf $rounds,240($key)
1050 la $i1,0($key)
1051 sllg $i2,$rounds,4
1052 la $i2,0($i2,$key)
1053 srl $rounds,1
1054 lghi $t1,-16
1055
1056.align 16
1057.Linv: lmg $s0,$s1,0($i1)
1058 lmg $s2,$s3,0($i2)
1059 stmg $s0,$s1,0($i2)
1060 stmg $s2,$s3,0($i1)
1061 la $i1,16($i1)
1062 la $i2,0($t1,$i2)
1063 brct $rounds,.Linv
1064___
# Software decrypt-key step 2: apply the inverse MixColumns transform to
# every word of the inner round keys. (rounds-1)*4 words are processed,
# starting at offset 16, so the first and last 16-byte round keys are left
# untouched.
#
# The index registers i1..i3 are no longer needed at this point and are
# reused under new names to hold the byte-replicated constants 0x80808080,
# 0x1b1b1b1b and 0xfefefefe. With those masks each word tp1 is doubled in
# GF(2^8) on all four bytes at once to give tp2, tp4 and tp8; the result is
#   tp2^tp4^tp8 ^ rot(tp1^tp2^tp8) ^ rot(tp1^tp4^tp8) ^ rot(tp1^tp8)
# with the rotation amounts shown in the emitted comments.
# On exit %r6-%r13 are reloaded from the area saved by AES_set_encrypt_key
# and 0 is returned in %r2.
1065$mask80=$i1;
1066$mask1b=$i2;
1067$maskfe=$i3;
1068$code.=<<___;
1069 llgf $rounds,240($key)
1070 aghi $rounds,-1
1071 sll $rounds,2 # (rounds-1)*4
1072 llilh $mask80,0x8080
1073 llilh $mask1b,0x1b1b
1074 llilh $maskfe,0xfefe
1075 oill $mask80,0x8080
1076 oill $mask1b,0x1b1b
1077 oill $maskfe,0xfefe
1078
1079.align 16
1080.Lmix: l $s0,16($key) # tp1
1081 lr $s1,$s0
1082 ngr $s1,$mask80
1083 srlg $t1,$s1,7
1084 slr $s1,$t1
1085 nr $s1,$mask1b
1086 sllg $t1,$s0,1
1087 nr $t1,$maskfe
1088 xr $s1,$t1 # tp2
1089
1090 lr $s2,$s1
1091 ngr $s2,$mask80
1092 srlg $t1,$s2,7
1093 slr $s2,$t1
1094 nr $s2,$mask1b
1095 sllg $t1,$s1,1
1096 nr $t1,$maskfe
1097 xr $s2,$t1 # tp4
1098
1099 lr $s3,$s2
1100 ngr $s3,$mask80
1101 srlg $t1,$s3,7
1102 slr $s3,$t1
1103 nr $s3,$mask1b
1104 sllg $t1,$s2,1
1105 nr $t1,$maskfe
1106 xr $s3,$t1 # tp8
1107
1108 xr $s1,$s0 # tp2^tp1
1109 xr $s2,$s0 # tp4^tp1
1110 rll $s0,$s0,24 # = ROTATE(tp1,8)
1111 xr $s2,$s3 # ^=tp8
1112 xr $s0,$s1 # ^=tp2^tp1
1113 xr $s1,$s3 # tp2^tp1^tp8
1114 xr $s0,$s2 # ^=tp4^tp1^tp8
1115 rll $s1,$s1,8
1116 rll $s2,$s2,16
1117 xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
1118 rll $s3,$s3,24
1119 xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
1120 xr $s0,$s3 # ^= ROTATE(tp8,8)
1121
1122 st $s0,16($key)
1123 la $key,4($key)
1124 brct $rounds,.Lmix
1125
1126 lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
1127 lghi %r2,0
1128 br $ra
1129.size AES_set_decrypt_key,.-AES_set_decrypt_key
1130___
1131
# CBC mode wrapper. The block below re-declares the register aliases with
# my, because this routine has a different argument layout from the
# single-block routines: out and length are swapped on entry so that the
# length is in %r3 and the output pointer in %r4.
1132#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1133# size_t length, const AES_KEY *key,
1134# unsigned char *ivec, const int enc)
1135{
1136my $inp="%r2";
1137my $out="%r4"; # length and out are swapped
1138my $len="%r3";
1139my $key="%r5";
1140my $ivp="%r6";
1141
# Entry point: exchange %r3 and %r4 with three XORs (no scratch register).
1142$code.=<<___;
1143.globl AES_cbc_encrypt
1144.type AES_cbc_encrypt,\@function
1145.align 16
1146AES_cbc_encrypt:
1147 xgr %r3,%r4 # flip %r3 and %r4, out and len
1148 xgr %r4,%r3
1149 xgr %r3,%r4
1150___
# Hardware path, emitted unless $softonly is set. It is taken when 240(key)
# is at least 16, i.e. holds a kmc function code. A parameter block is built
# at 16(sp): the 16-byte ivec followed by 32 bytes copied from the key
# schedule. KMC then processes the whole-block part of the input and the
# updated chaining value is copied back to ivec at .Lkmc_done.
# A trailing partial block goes through .Lkmc_truncated:
#   encrypt: the residue is copied into a zeroed 16-byte buffer at 128(sp)
#            and that padded block is encrypted to out;
#   decrypt: one block is decrypted into the buffer at 128(sp) and only the
#            residual bytes are copied to out.
# The residue copies use EX on an MVC template, hence the length-minus-one
# adjustment of the count.
1151$code.=<<___ if (!$softonly);
1152 lhi %r0,16
1153 cl %r0,240($key)
1154 jh .Lcbc_software
1155
1156 lg %r0,0($ivp) # copy ivec
1157 lg %r1,8($ivp)
1158 stmg %r0,%r1,16($sp)
1159 lmg %r0,%r1,0($key) # copy key, cover 256 bit
1160 stmg %r0,%r1,32($sp)
1161 lmg %r0,%r1,16($key)
1162 stmg %r0,%r1,48($sp)
1163 l %r0,240($key) # load kmc code
1164 lghi $key,15 # res=len%16, len-=res;
1165 ngr $key,$len
1166 slgr $len,$key
1167 la %r1,16($sp) # parameter block - ivec || key
1168 jz .Lkmc_truncated
1169 .long 0xb92f0042 # kmc %r4,%r2
1170 brc 1,.-4 # pay attention to "partial completion"
1171 ltr $key,$key
1172 jnz .Lkmc_truncated
1173.Lkmc_done:
1174 lmg %r0,%r1,16($sp) # copy ivec to caller
1175 stg %r0,0($ivp)
1176 stg %r1,8($ivp)
1177 br $ra
1178.align 16
1179.Lkmc_truncated:
1180 ahi $key,-1 # it's the way it's encoded in mvc
1181 tmll %r0,0x80
1182 jnz .Lkmc_truncated_dec
1183 lghi %r1,0
1184 stg %r1,128($sp)
1185 stg %r1,136($sp)
1186 bras %r1,1f
1187 mvc 128(1,$sp),0($inp)
11881: ex $key,0(%r1)
1189 la %r1,16($sp) # restore parameter block
1190 la $inp,128($sp)
1191 lghi $len,16
1192 .long 0xb92f0042 # kmc %r4,%r2
1193 j .Lkmc_done
1194.align 16
1195.Lkmc_truncated_dec:
1196 stg $out,64($sp)
1197 la $out,128($sp)
1198 lghi $len,16
1199 .long 0xb92f0042 # kmc %r4,%r2
1200 lg $out,64($sp)
1201 bras %r1,2f
1202 mvc 0(1,$out),128($sp)
12032: ex $key,0(%r1)
1204 j .Lkmc_done
1205.align 16
1206.Lcbc_software:
1207___
# Software path. Registers key..ra are saved at 40(sp) and the enc flag is
# read from 164(sp); zero selects .Lcbc_decrypt.
#   Encrypt: the chaining value lives in s0..s3. Each input block is XOR-ed
#   into it, _s390x_AES_encrypt is called with the key pointer copied to
#   %r4, and the result is stored to out. A short final block is copied
#   into a zeroed buffer at 128(sp) and processed as one more block
#   (.Lcbc_enc_tail). The last ciphertext block is written back to ivec.
#   Decrypt: the previous ciphertext block (initially ivec) is kept at
#   128(sp). Each block is decrypted with _s390x_AES_decrypt, XOR-ed with
#   that value and stored; a short final block stores only the residual
#   bytes (.Lcbc_dec_tail). The last ciphertext block read is written back
#   to ivec.
# NOTE(review): a length of 0 on the encrypt path reaches .Lcbc_enc_tail
# with a count of -1, which EX/MVC would treat as a 256-byte copy. Callers
# presumably never pass 0; confirm.
1208$code.=<<___;
1209 stmg $key,$ra,40($sp)
1210 lhi %r0,0
1211 cl %r0,164($sp)
1212 je .Lcbc_decrypt
1213
1214 larl $tbl,AES_Te
1215
1216 llgf $s0,0($ivp)
1217 llgf $s1,4($ivp)
1218 llgf $s2,8($ivp)
1219 llgf $s3,12($ivp)
1220
1221 lghi $t0,16
1222 slgr $len,$t0
1223 brc 4,.Lcbc_enc_tail # if borrow
1224.Lcbc_enc_loop:
1225 stmg $inp,$out,16($sp)
1226 x $s0,0($inp)
1227 x $s1,4($inp)
1228 x $s2,8($inp)
1229 x $s3,12($inp)
1230 lgr %r4,$key
1231
1232 bras $ra,_s390x_AES_encrypt
1233
1234 lmg $inp,$key,16($sp)
1235 st $s0,0($out)
1236 st $s1,4($out)
1237 st $s2,8($out)
1238 st $s3,12($out)
1239
1240 la $inp,16($inp)
1241 la $out,16($out)
1242 lghi $t0,16
1243 ltgr $len,$len
1244 jz .Lcbc_enc_done
1245 slgr $len,$t0
1246 brc 4,.Lcbc_enc_tail # if borrow
1247 j .Lcbc_enc_loop
1248.align 16
1249.Lcbc_enc_done:
1250 lg $ivp,48($sp)
1251 st $s0,0($ivp)
1252 st $s1,4($ivp)
1253 st $s2,8($ivp)
1254 st $s3,12($ivp)
1255
1256 lmg %r7,$ra,56($sp)
1257 br $ra
1258
1259.align 16
1260.Lcbc_enc_tail:
1261 aghi $len,15
1262 lghi $t0,0
1263 stg $t0,128($sp)
1264 stg $t0,136($sp)
1265 bras $t1,3f
1266 mvc 128(1,$sp),0($inp)
12673: ex $len,0($t1)
1268 lghi $len,0
1269 la $inp,128($sp)
1270 j .Lcbc_enc_loop
1271
1272.align 16
1273.Lcbc_decrypt:
1274 larl $tbl,AES_Td
1275
1276 lg $t0,0($ivp)
1277 lg $t1,8($ivp)
1278 stmg $t0,$t1,128($sp)
1279
1280.Lcbc_dec_loop:
1281 stmg $inp,$out,16($sp)
1282 llgf $s0,0($inp)
1283 llgf $s1,4($inp)
1284 llgf $s2,8($inp)
1285 llgf $s3,12($inp)
1286 lgr %r4,$key
1287
1288 bras $ra,_s390x_AES_decrypt
1289
1290 lmg $inp,$key,16($sp)
1291 sllg $s0,$s0,32
1292 sllg $s2,$s2,32
1293 lr $s0,$s1
1294 lr $s2,$s3
1295
1296 lg $t0,0($inp)
1297 lg $t1,8($inp)
1298 xg $s0,128($sp)
1299 xg $s2,136($sp)
1300 lghi $s1,16
1301 slgr $len,$s1
1302 brc 4,.Lcbc_dec_tail # if borrow
1303 brc 2,.Lcbc_dec_done # if zero
1304 stg $s0,0($out)
1305 stg $s2,8($out)
1306 stmg $t0,$t1,128($sp)
1307
1308 la $inp,16($inp)
1309 la $out,16($out)
1310 j .Lcbc_dec_loop
1311
1312.Lcbc_dec_done:
1313 stg $s0,0($out)
1314 stg $s2,8($out)
1315.Lcbc_dec_exit:
1316 lmg $ivp,$ra,48($sp)
1317 stmg $t0,$t1,0($ivp)
1318
1319 br $ra
1320
1321.align 16
1322.Lcbc_dec_tail:
1323 aghi $len,15
1324 stg $s0,128($sp)
1325 stg $s2,136($sp)
1326 bras $s1,4f
1327 mvc 0(1,$out),128($sp)
13284: ex $len,0($s1)
1329 j .Lcbc_dec_exit
1330.size AES_cbc_encrypt,.-AES_cbc_encrypt
1331.comm OPENSSL_s390xcap_P,8,8
1332___
1333}
# Append the identification string, then post-process the accumulated text:
# every backtick-quoted span (for example the shift amount written as 16-3
# between backticks) is replaced by the result of eval-ing it as Perl.
# Finally the finished assembly is written to standard output.
1334$code.=<<___;
1335.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
1336___
1337
1338$code =~ s/\`([^\`]*)\`/eval $1/gem;
1339print $code;
diff --git a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
deleted file mode 100755
index c57b3a2d6d..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
+++ /dev/null
@@ -1,1181 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# Version 1.1
10#
11# The major reason for undertaking this effort was to mitigate the hazard
12# of cache-timing attacks. This is [currently and initially!] addressed in
13# two ways. 1. S-boxes are compressed from 5KB to 2KB+256B in size each.
14# 2. References to them are scheduled for L2 cache latency, meaning
15# that the tables don't have to reside in L1 cache. Once again, this
16# is an initial draft and one should expect more countermeasures to
17# be implemented...
18#
19# Version 1.1 prefetches T[ed]4 in order to mitigate attack on last
20# round.
21#
22# Even though performance was not the primary goal [on the contrary,
23# extra shifts "induced" by the compressed S-box and the longer loop
24# epilogue "induced" by scheduling for L2 have a negative effect on
25# performance], the code turned out to run in ~23 cycles per processed
26# byte en-/decrypted with a 128-bit key. This is a pretty good result for
27# code with the mentioned qualities on an UltraSPARC core. Compared to Sun
28# C generated code my encrypt procedure runs just a few percent faster,
29# while the decrypt one is a whole 50% faster [yes, Sun C failed to generate
30# an optimal decrypt procedure]. Compared to GNU C generated code both
31# procedures are more than 60% faster:-)
32
# Target selection: default to 32-bit output; switch to 64-bit when any
# command-line argument contains -m64 or -xarch=v9.
33$bits=32;
34for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# $bias and $frame are used below when building %sp-relative addresses and
# the register-window save size; they differ between the two targets.
35if ($bits==64) { $bias=2047; $frame=192; }
36else { $bias=0; $frame=112; }
# Extra bytes reserved on top of $frame by the save instruction.
37$locals=16;
38
# Sixteen accumulator registers, four per output word of a round. They
# receive the table index first and then the value loaded from the table.
39$acc0="%l0";
40$acc1="%o0";
41$acc2="%o1";
42$acc3="%o2";
43
44$acc4="%l1";
45$acc5="%o3";
46$acc6="%o4";
47$acc7="%o5";
48
49$acc8="%l2";
50$acc9="%o7";
51$acc10="%g1";
52$acc11="%g2";
53
54$acc12="%l3";
55$acc13="%g3";
56$acc14="%g4";
57$acc15="%g5";
58
# Round-key words / intermediate state.
59$t0="%l4";
60$t1="%l5";
61$t2="%l6";
62$t3="%l7";
63
# Cipher state words, table base, key schedule pointer and round counter.
64$s0="%i0";
65$s1="%i1";
66$s2="%i2";
67$s3="%i3";
68$tbl="%i4";
69$key="%i5";
70$rounds="%i7"; # aliases with return address, which is off-loaded to stack
# Append one ".long" directive per argument to the global $code buffer.
# Each 32-bit word is emitted twice on the same line, so one table entry
# occupies 8 bytes in the generated assembly.
#
# Arguments: a list of integers; processing stops at the first undefined
# element. Returns nothing useful; the result is the text added to $code.
#
# The sub takes a variable number of arguments, so it is declared without a
# prototype. The former empty prototype "()" declared a zero-argument sub
# and only worked because every call site uses the &-form, which bypasses
# prototype checking.
sub _data_word
{
    for my $word (@_) {
        last unless defined $word;
        $code .= sprintf "\t.long\t0x%08x,0x%08x\n", $word, $word;
    }
    return;
}
76
# For 64-bit output only: emit .register directives marking %g2 and %g3 as
# scratch registers; both are used as accumulators ($acc11, $acc13).
77$code.=<<___ if ($bits==64);
78.register %g2,#scratch
79.register %g3,#scratch
80___
# Open the text section and start the AES_Te table, aligned to 256 bytes.
# The table words are appended by the _data_word() call that follows.
81$code.=<<___;
82.section ".text",#alloc,#execinstr
83
84.align 256
85AES_Te:
86___
87&_data_word(
88 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
89 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
90 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
91 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
92 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
93 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
94 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
95 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
96 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
97 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
98 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
99 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
100 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
101 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
102 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
103 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
104 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
105 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
106 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
107 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
108 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
109 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
110 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
111 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
112 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
113 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
114 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
115 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
116 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
117 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
118 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
119 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
120 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
121 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
122 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
123 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
124 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
125 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
126 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
127 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
128 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
129 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
130 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
131 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
132 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
133 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
134 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
135 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
136 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
137 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
138 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
139 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
140 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
141 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
142 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
143 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
144 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
145 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
146 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
147 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
148 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
149 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
150 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
151 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
152$code.=<<___;
153 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
154 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
155 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
156 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
157 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
158 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
159 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
160 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
161 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
162 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
163 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
164 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
165 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
166 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
167 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
168 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
169 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
170 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
171 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
172 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
173 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
174 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
175 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
176 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
177 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
178 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
179 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
180 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
181 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
182 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
183 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
184 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
185.type AES_Te,#object
186.size AES_Te,(.-AES_Te)
187
188.align 64
189.skip 16
190_sparcv9_AES_encrypt:
191 save %sp,-$frame-$locals,%sp
192 stx %i7,[%sp+$bias+$frame+0] ! off-load return address
193 ld [$key+240],$rounds
194 ld [$key+0],$t0
195 ld [$key+4],$t1 !
196 ld [$key+8],$t2
197 srl $rounds,1,$rounds
198 xor $t0,$s0,$s0
199 ld [$key+12],$t3
200 srl $s0,21,$acc0
201 xor $t1,$s1,$s1
202 ld [$key+16],$t0
203 srl $s1,13,$acc1 !
204 xor $t2,$s2,$s2
205 ld [$key+20],$t1
206 xor $t3,$s3,$s3
207 ld [$key+24],$t2
208 and $acc0,2040,$acc0
209 ld [$key+28],$t3
210 nop
211.Lenc_loop:
212 srl $s2,5,$acc2 !
213 and $acc1,2040,$acc1
214 ldx [$tbl+$acc0],$acc0
215 sll $s3,3,$acc3
216 and $acc2,2040,$acc2
217 ldx [$tbl+$acc1],$acc1
218 srl $s1,21,$acc4
219 and $acc3,2040,$acc3
220 ldx [$tbl+$acc2],$acc2 !
221 srl $s2,13,$acc5
222 and $acc4,2040,$acc4
223 ldx [$tbl+$acc3],$acc3
224 srl $s3,5,$acc6
225 and $acc5,2040,$acc5
226 ldx [$tbl+$acc4],$acc4
227 fmovs %f0,%f0
228 sll $s0,3,$acc7 !
229 and $acc6,2040,$acc6
230 ldx [$tbl+$acc5],$acc5
231 srl $s2,21,$acc8
232 and $acc7,2040,$acc7
233 ldx [$tbl+$acc6],$acc6
234 srl $s3,13,$acc9
235 and $acc8,2040,$acc8
236 ldx [$tbl+$acc7],$acc7 !
237 srl $s0,5,$acc10
238 and $acc9,2040,$acc9
239 ldx [$tbl+$acc8],$acc8
240 sll $s1,3,$acc11
241 and $acc10,2040,$acc10
242 ldx [$tbl+$acc9],$acc9
243 fmovs %f0,%f0
244 srl $s3,21,$acc12 !
245 and $acc11,2040,$acc11
246 ldx [$tbl+$acc10],$acc10
247 srl $s0,13,$acc13
248 and $acc12,2040,$acc12
249 ldx [$tbl+$acc11],$acc11
250 srl $s1,5,$acc14
251 and $acc13,2040,$acc13
252 ldx [$tbl+$acc12],$acc12 !
253 sll $s2,3,$acc15
254 and $acc14,2040,$acc14
255 ldx [$tbl+$acc13],$acc13
256 and $acc15,2040,$acc15
257 add $key,32,$key
258 ldx [$tbl+$acc14],$acc14
259 fmovs %f0,%f0
260 subcc $rounds,1,$rounds !
261 ldx [$tbl+$acc15],$acc15
262 bz,a,pn %icc,.Lenc_last
263 add $tbl,2048,$rounds
264
265 srlx $acc1,8,$acc1
266 xor $acc0,$t0,$t0
267 ld [$key+0],$s0
268 fmovs %f0,%f0
269 srlx $acc2,16,$acc2 !
270 xor $acc1,$t0,$t0
271 ld [$key+4],$s1
272 srlx $acc3,24,$acc3
273 xor $acc2,$t0,$t0
274 ld [$key+8],$s2
275 srlx $acc5,8,$acc5
276 xor $acc3,$t0,$t0
277 ld [$key+12],$s3 !
278 srlx $acc6,16,$acc6
279 xor $acc4,$t1,$t1
280 fmovs %f0,%f0
281 srlx $acc7,24,$acc7
282 xor $acc5,$t1,$t1
283 srlx $acc9,8,$acc9
284 xor $acc6,$t1,$t1
285 srlx $acc10,16,$acc10 !
286 xor $acc7,$t1,$t1
287 srlx $acc11,24,$acc11
288 xor $acc8,$t2,$t2
289 srlx $acc13,8,$acc13
290 xor $acc9,$t2,$t2
291 srlx $acc14,16,$acc14
292 xor $acc10,$t2,$t2
293 srlx $acc15,24,$acc15 !
294 xor $acc11,$t2,$t2
295 xor $acc12,$acc14,$acc14
296 xor $acc13,$t3,$t3
297 srl $t0,21,$acc0
298 xor $acc14,$t3,$t3
299 srl $t1,13,$acc1
300 xor $acc15,$t3,$t3
301
302 and $acc0,2040,$acc0 !
303 srl $t2,5,$acc2
304 and $acc1,2040,$acc1
305 ldx [$tbl+$acc0],$acc0
306 sll $t3,3,$acc3
307 and $acc2,2040,$acc2
308 ldx [$tbl+$acc1],$acc1
309 fmovs %f0,%f0
310 srl $t1,21,$acc4 !
311 and $acc3,2040,$acc3
312 ldx [$tbl+$acc2],$acc2
313 srl $t2,13,$acc5
314 and $acc4,2040,$acc4
315 ldx [$tbl+$acc3],$acc3
316 srl $t3,5,$acc6
317 and $acc5,2040,$acc5
318 ldx [$tbl+$acc4],$acc4 !
319 sll $t0,3,$acc7
320 and $acc6,2040,$acc6
321 ldx [$tbl+$acc5],$acc5
322 srl $t2,21,$acc8
323 and $acc7,2040,$acc7
324 ldx [$tbl+$acc6],$acc6
325 fmovs %f0,%f0
326 srl $t3,13,$acc9 !
327 and $acc8,2040,$acc8
328 ldx [$tbl+$acc7],$acc7
329 srl $t0,5,$acc10
330 and $acc9,2040,$acc9
331 ldx [$tbl+$acc8],$acc8
332 sll $t1,3,$acc11
333 and $acc10,2040,$acc10
334 ldx [$tbl+$acc9],$acc9 !
335 srl $t3,21,$acc12
336 and $acc11,2040,$acc11
337 ldx [$tbl+$acc10],$acc10
338 srl $t0,13,$acc13
339 and $acc12,2040,$acc12
340 ldx [$tbl+$acc11],$acc11
341 fmovs %f0,%f0
342 srl $t1,5,$acc14 !
343 and $acc13,2040,$acc13
344 ldx [$tbl+$acc12],$acc12
345 sll $t2,3,$acc15
346 and $acc14,2040,$acc14
347 ldx [$tbl+$acc13],$acc13
348 srlx $acc1,8,$acc1
349 and $acc15,2040,$acc15
350 ldx [$tbl+$acc14],$acc14 !
351
352 srlx $acc2,16,$acc2
353 xor $acc0,$s0,$s0
354 ldx [$tbl+$acc15],$acc15
355 srlx $acc3,24,$acc3
356 xor $acc1,$s0,$s0
357 ld [$key+16],$t0
358 fmovs %f0,%f0
359 srlx $acc5,8,$acc5 !
360 xor $acc2,$s0,$s0
361 ld [$key+20],$t1
362 srlx $acc6,16,$acc6
363 xor $acc3,$s0,$s0
364 ld [$key+24],$t2
365 srlx $acc7,24,$acc7
366 xor $acc4,$s1,$s1
367 ld [$key+28],$t3 !
368 srlx $acc9,8,$acc9
369 xor $acc5,$s1,$s1
370 ldx [$tbl+2048+0],%g0 ! prefetch te4
371 srlx $acc10,16,$acc10
372 xor $acc6,$s1,$s1
373 ldx [$tbl+2048+32],%g0 ! prefetch te4
374 srlx $acc11,24,$acc11
375 xor $acc7,$s1,$s1
376 ldx [$tbl+2048+64],%g0 ! prefetch te4
377 srlx $acc13,8,$acc13
378 xor $acc8,$s2,$s2
379 ldx [$tbl+2048+96],%g0 ! prefetch te4
380 srlx $acc14,16,$acc14 !
381 xor $acc9,$s2,$s2
382 ldx [$tbl+2048+128],%g0 ! prefetch te4
383 srlx $acc15,24,$acc15
384 xor $acc10,$s2,$s2
385 ldx [$tbl+2048+160],%g0 ! prefetch te4
386 srl $s0,21,$acc0
387 xor $acc11,$s2,$s2
388 ldx [$tbl+2048+192],%g0 ! prefetch te4
389 xor $acc12,$acc14,$acc14
390 xor $acc13,$s3,$s3
391 ldx [$tbl+2048+224],%g0 ! prefetch te4
392 srl $s1,13,$acc1 !
393 xor $acc14,$s3,$s3
394 xor $acc15,$s3,$s3
395 ba .Lenc_loop
396 and $acc0,2040,$acc0
397
398.align 32
399.Lenc_last:
400 srlx $acc1,8,$acc1 !
401 xor $acc0,$t0,$t0
402 ld [$key+0],$s0
403 srlx $acc2,16,$acc2
404 xor $acc1,$t0,$t0
405 ld [$key+4],$s1
406 srlx $acc3,24,$acc3
407 xor $acc2,$t0,$t0
408 ld [$key+8],$s2 !
409 srlx $acc5,8,$acc5
410 xor $acc3,$t0,$t0
411 ld [$key+12],$s3
412 srlx $acc6,16,$acc6
413 xor $acc4,$t1,$t1
414 srlx $acc7,24,$acc7
415 xor $acc5,$t1,$t1
416 srlx $acc9,8,$acc9 !
417 xor $acc6,$t1,$t1
418 srlx $acc10,16,$acc10
419 xor $acc7,$t1,$t1
420 srlx $acc11,24,$acc11
421 xor $acc8,$t2,$t2
422 srlx $acc13,8,$acc13
423 xor $acc9,$t2,$t2
424 srlx $acc14,16,$acc14 !
425 xor $acc10,$t2,$t2
426 srlx $acc15,24,$acc15
427 xor $acc11,$t2,$t2
428 xor $acc12,$acc14,$acc14
429 xor $acc13,$t3,$t3
430 srl $t0,24,$acc0
431 xor $acc14,$t3,$t3
432 srl $t1,16,$acc1 !
433 xor $acc15,$t3,$t3
434
435 srl $t2,8,$acc2
436 and $acc1,255,$acc1
437 ldub [$rounds+$acc0],$acc0
438 srl $t1,24,$acc4
439 and $acc2,255,$acc2
440 ldub [$rounds+$acc1],$acc1
441 srl $t2,16,$acc5 !
442 and $t3,255,$acc3
443 ldub [$rounds+$acc2],$acc2
444 ldub [$rounds+$acc3],$acc3
445 srl $t3,8,$acc6
446 and $acc5,255,$acc5
447 ldub [$rounds+$acc4],$acc4
448 fmovs %f0,%f0
449 srl $t2,24,$acc8 !
450 and $acc6,255,$acc6
451 ldub [$rounds+$acc5],$acc5
452 srl $t3,16,$acc9
453 and $t0,255,$acc7
454 ldub [$rounds+$acc6],$acc6
455 ldub [$rounds+$acc7],$acc7
456 fmovs %f0,%f0
457 srl $t0,8,$acc10 !
458 and $acc9,255,$acc9
459 ldub [$rounds+$acc8],$acc8
460 srl $t3,24,$acc12
461 and $acc10,255,$acc10
462 ldub [$rounds+$acc9],$acc9
463 srl $t0,16,$acc13
464 and $t1,255,$acc11
465 ldub [$rounds+$acc10],$acc10 !
466 srl $t1,8,$acc14
467 and $acc13,255,$acc13
468 ldub [$rounds+$acc11],$acc11
469 ldub [$rounds+$acc12],$acc12
470 and $acc14,255,$acc14
471 ldub [$rounds+$acc13],$acc13
472 and $t2,255,$acc15
473 ldub [$rounds+$acc14],$acc14 !
474
475 sll $acc0,24,$acc0
476 xor $acc3,$s0,$s0
477 ldub [$rounds+$acc15],$acc15
478 sll $acc1,16,$acc1
479 xor $acc0,$s0,$s0
480 ldx [%sp+$bias+$frame+0],%i7 ! restore return address
481 fmovs %f0,%f0
482 sll $acc2,8,$acc2 !
483 xor $acc1,$s0,$s0
484 sll $acc4,24,$acc4
485 xor $acc2,$s0,$s0
486 sll $acc5,16,$acc5
487 xor $acc7,$s1,$s1
488 sll $acc6,8,$acc6
489 xor $acc4,$s1,$s1
490 sll $acc8,24,$acc8 !
491 xor $acc5,$s1,$s1
492 sll $acc9,16,$acc9
493 xor $acc11,$s2,$s2
494 sll $acc10,8,$acc10
495 xor $acc6,$s1,$s1
496 sll $acc12,24,$acc12
497 xor $acc8,$s2,$s2
498 sll $acc13,16,$acc13 !
499 xor $acc9,$s2,$s2
500 sll $acc14,8,$acc14
501 xor $acc10,$s2,$s2
502 xor $acc12,$acc14,$acc14
503 xor $acc13,$s3,$s3
504 xor $acc14,$s3,$s3
505 xor $acc15,$s3,$s3
506
507 ret
508 restore
509.type _sparcv9_AES_encrypt,#function
510.size _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
511
512.align 32
513.globl AES_encrypt
514AES_encrypt:
515 or %o0,%o1,%g1
516 andcc %g1,3,%g0
517 bnz,pn %xcc,.Lunaligned_enc
518 save %sp,-$frame,%sp
519
520 ld [%i0+0],%o0
521 ld [%i0+4],%o1
522 ld [%i0+8],%o2
523 ld [%i0+12],%o3
524
5251: call .+8
526 add %o7,AES_Te-1b,%o4
527 call _sparcv9_AES_encrypt
528 mov %i2,%o5
529
530 st %o0,[%i1+0]
531 st %o1,[%i1+4]
532 st %o2,[%i1+8]
533 st %o3,[%i1+12]
534
535 ret
536 restore
537
538.align 32
539.Lunaligned_enc:
540 ldub [%i0+0],%l0
541 ldub [%i0+1],%l1
542 ldub [%i0+2],%l2
543
544 sll %l0,24,%l0
545 ldub [%i0+3],%l3
546 sll %l1,16,%l1
547 ldub [%i0+4],%l4
548 sll %l2,8,%l2
549 or %l1,%l0,%l0
550 ldub [%i0+5],%l5
551 sll %l4,24,%l4
552 or %l3,%l2,%l2
553 ldub [%i0+6],%l6
554 sll %l5,16,%l5
555 or %l0,%l2,%o0
556 ldub [%i0+7],%l7
557
558 sll %l6,8,%l6
559 or %l5,%l4,%l4
560 ldub [%i0+8],%l0
561 or %l7,%l6,%l6
562 ldub [%i0+9],%l1
563 or %l4,%l6,%o1
564 ldub [%i0+10],%l2
565
566 sll %l0,24,%l0
567 ldub [%i0+11],%l3
568 sll %l1,16,%l1
569 ldub [%i0+12],%l4
570 sll %l2,8,%l2
571 or %l1,%l0,%l0
572 ldub [%i0+13],%l5
573 sll %l4,24,%l4
574 or %l3,%l2,%l2
575 ldub [%i0+14],%l6
576 sll %l5,16,%l5
577 or %l0,%l2,%o2
578 ldub [%i0+15],%l7
579
580 sll %l6,8,%l6
581 or %l5,%l4,%l4
582 or %l7,%l6,%l6
583 or %l4,%l6,%o3
584
5851: call .+8
586 add %o7,AES_Te-1b,%o4
587 call _sparcv9_AES_encrypt
588 mov %i2,%o5
589
590 srl %o0,24,%l0
591 srl %o0,16,%l1
592 stb %l0,[%i1+0]
593 srl %o0,8,%l2
594 stb %l1,[%i1+1]
595 stb %l2,[%i1+2]
596 srl %o1,24,%l4
597 stb %o0,[%i1+3]
598
599 srl %o1,16,%l5
600 stb %l4,[%i1+4]
601 srl %o1,8,%l6
602 stb %l5,[%i1+5]
603 stb %l6,[%i1+6]
604 srl %o2,24,%l0
605 stb %o1,[%i1+7]
606
607 srl %o2,16,%l1
608 stb %l0,[%i1+8]
609 srl %o2,8,%l2
610 stb %l1,[%i1+9]
611 stb %l2,[%i1+10]
612 srl %o3,24,%l4
613 stb %o2,[%i1+11]
614
615 srl %o3,16,%l5
616 stb %l4,[%i1+12]
617 srl %o3,8,%l6
618 stb %l5,[%i1+13]
619 stb %l6,[%i1+14]
620 stb %o3,[%i1+15]
621
622 ret
623 restore
624.type AES_encrypt,#function
625.size AES_encrypt,(.-AES_encrypt)
626
627___
628
629$code.=<<___;
630.align 256
631AES_Td:
632___
633&_data_word(
634 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
635 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
636 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
637 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
638 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
639 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
640 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
641 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
642 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
643 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
644 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
645 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
646 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
647 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
648 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
649 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
650 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
651 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
652 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
653 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
654 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
655 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
656 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
657 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
658 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
659 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
660 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
661 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
662 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
663 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
664 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
665 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
666 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
667 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
668 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
669 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
670 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
671 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
672 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
673 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
674 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
675 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
676 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
677 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
678 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
679 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
680 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
681 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
682 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
683 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
684 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
685 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
686 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
687 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
688 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
689 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
690 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
691 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
692 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
693 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
694 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
695 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
696 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
697 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
698$code.=<<___;
699 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
700 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
701 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
702 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
703 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
704 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
705 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
706 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
707 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
708 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
709 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
710 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
711 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
712 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
713 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
714 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
715 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
716 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
717 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
718 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
719 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
720 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
721 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
722 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
723 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
724 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
725 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
726 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
727 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
728 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
729 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
730 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
731.type AES_Td,#object
732.size AES_Td,(.-AES_Td)
733
734.align 64
735.skip 16
736_sparcv9_AES_decrypt:
737 save %sp,-$frame-$locals,%sp
738 stx %i7,[%sp+$bias+$frame+0] ! off-load return address
739 ld [$key+240],$rounds
740 ld [$key+0],$t0
741 ld [$key+4],$t1 !
742 ld [$key+8],$t2
743 ld [$key+12],$t3
744 srl $rounds,1,$rounds
745 xor $t0,$s0,$s0
746 ld [$key+16],$t0
747 xor $t1,$s1,$s1
748 ld [$key+20],$t1
749 srl $s0,21,$acc0 !
750 xor $t2,$s2,$s2
751 ld [$key+24],$t2
752 xor $t3,$s3,$s3
753 and $acc0,2040,$acc0
754 ld [$key+28],$t3
755 srl $s3,13,$acc1
756 nop
757.Ldec_loop:
758 srl $s2,5,$acc2 !
759 and $acc1,2040,$acc1
760 ldx [$tbl+$acc0],$acc0
761 sll $s1,3,$acc3
762 and $acc2,2040,$acc2
763 ldx [$tbl+$acc1],$acc1
764 srl $s1,21,$acc4
765 and $acc3,2040,$acc3
766 ldx [$tbl+$acc2],$acc2 !
767 srl $s0,13,$acc5
768 and $acc4,2040,$acc4
769 ldx [$tbl+$acc3],$acc3
770 srl $s3,5,$acc6
771 and $acc5,2040,$acc5
772 ldx [$tbl+$acc4],$acc4
773 fmovs %f0,%f0
774 sll $s2,3,$acc7 !
775 and $acc6,2040,$acc6
776 ldx [$tbl+$acc5],$acc5
777 srl $s2,21,$acc8
778 and $acc7,2040,$acc7
779 ldx [$tbl+$acc6],$acc6
780 srl $s1,13,$acc9
781 and $acc8,2040,$acc8
782 ldx [$tbl+$acc7],$acc7 !
783 srl $s0,5,$acc10
784 and $acc9,2040,$acc9
785 ldx [$tbl+$acc8],$acc8
786 sll $s3,3,$acc11
787 and $acc10,2040,$acc10
788 ldx [$tbl+$acc9],$acc9
789 fmovs %f0,%f0
790 srl $s3,21,$acc12 !
791 and $acc11,2040,$acc11
792 ldx [$tbl+$acc10],$acc10
793 srl $s2,13,$acc13
794 and $acc12,2040,$acc12
795 ldx [$tbl+$acc11],$acc11
796 srl $s1,5,$acc14
797 and $acc13,2040,$acc13
798 ldx [$tbl+$acc12],$acc12 !
799 sll $s0,3,$acc15
800 and $acc14,2040,$acc14
801 ldx [$tbl+$acc13],$acc13
802 and $acc15,2040,$acc15
803 add $key,32,$key
804 ldx [$tbl+$acc14],$acc14
805 fmovs %f0,%f0
806 subcc $rounds,1,$rounds !
807 ldx [$tbl+$acc15],$acc15
808 bz,a,pn %icc,.Ldec_last
809 add $tbl,2048,$rounds
810
811 srlx $acc1,8,$acc1
812 xor $acc0,$t0,$t0
813 ld [$key+0],$s0
814 fmovs %f0,%f0
815 srlx $acc2,16,$acc2 !
816 xor $acc1,$t0,$t0
817 ld [$key+4],$s1
818 srlx $acc3,24,$acc3
819 xor $acc2,$t0,$t0
820 ld [$key+8],$s2
821 srlx $acc5,8,$acc5
822 xor $acc3,$t0,$t0
823 ld [$key+12],$s3 !
824 srlx $acc6,16,$acc6
825 xor $acc4,$t1,$t1
826 fmovs %f0,%f0
827 srlx $acc7,24,$acc7
828 xor $acc5,$t1,$t1
829 srlx $acc9,8,$acc9
830 xor $acc6,$t1,$t1
831 srlx $acc10,16,$acc10 !
832 xor $acc7,$t1,$t1
833 srlx $acc11,24,$acc11
834 xor $acc8,$t2,$t2
835 srlx $acc13,8,$acc13
836 xor $acc9,$t2,$t2
837 srlx $acc14,16,$acc14
838 xor $acc10,$t2,$t2
839 srlx $acc15,24,$acc15 !
840 xor $acc11,$t2,$t2
841 xor $acc12,$acc14,$acc14
842 xor $acc13,$t3,$t3
843 srl $t0,21,$acc0
844 xor $acc14,$t3,$t3
845 xor $acc15,$t3,$t3
846 srl $t3,13,$acc1
847
848 and $acc0,2040,$acc0 !
849 srl $t2,5,$acc2
850 and $acc1,2040,$acc1
851 ldx [$tbl+$acc0],$acc0
852 sll $t1,3,$acc3
853 and $acc2,2040,$acc2
854 ldx [$tbl+$acc1],$acc1
855 fmovs %f0,%f0
856 srl $t1,21,$acc4 !
857 and $acc3,2040,$acc3
858 ldx [$tbl+$acc2],$acc2
859 srl $t0,13,$acc5
860 and $acc4,2040,$acc4
861 ldx [$tbl+$acc3],$acc3
862 srl $t3,5,$acc6
863 and $acc5,2040,$acc5
864 ldx [$tbl+$acc4],$acc4 !
865 sll $t2,3,$acc7
866 and $acc6,2040,$acc6
867 ldx [$tbl+$acc5],$acc5
868 srl $t2,21,$acc8
869 and $acc7,2040,$acc7
870 ldx [$tbl+$acc6],$acc6
871 fmovs %f0,%f0
872 srl $t1,13,$acc9 !
873 and $acc8,2040,$acc8
874 ldx [$tbl+$acc7],$acc7
875 srl $t0,5,$acc10
876 and $acc9,2040,$acc9
877 ldx [$tbl+$acc8],$acc8
878 sll $t3,3,$acc11
879 and $acc10,2040,$acc10
880 ldx [$tbl+$acc9],$acc9 !
881 srl $t3,21,$acc12
882 and $acc11,2040,$acc11
883 ldx [$tbl+$acc10],$acc10
884 srl $t2,13,$acc13
885 and $acc12,2040,$acc12
886 ldx [$tbl+$acc11],$acc11
887 fmovs %f0,%f0
888 srl $t1,5,$acc14 !
889 and $acc13,2040,$acc13
890 ldx [$tbl+$acc12],$acc12
891 sll $t0,3,$acc15
892 and $acc14,2040,$acc14
893 ldx [$tbl+$acc13],$acc13
894 srlx $acc1,8,$acc1
895 and $acc15,2040,$acc15
896 ldx [$tbl+$acc14],$acc14 !
897
898 srlx $acc2,16,$acc2
899 xor $acc0,$s0,$s0
900 ldx [$tbl+$acc15],$acc15
901 srlx $acc3,24,$acc3
902 xor $acc1,$s0,$s0
903 ld [$key+16],$t0
904 fmovs %f0,%f0
905 srlx $acc5,8,$acc5 !
906 xor $acc2,$s0,$s0
907 ld [$key+20],$t1
908 srlx $acc6,16,$acc6
909 xor $acc3,$s0,$s0
910 ld [$key+24],$t2
911 srlx $acc7,24,$acc7
912 xor $acc4,$s1,$s1
913 ld [$key+28],$t3 !
914 srlx $acc9,8,$acc9
915 xor $acc5,$s1,$s1
916 ldx [$tbl+2048+0],%g0 ! prefetch td4
917 srlx $acc10,16,$acc10
918 xor $acc6,$s1,$s1
919 ldx [$tbl+2048+32],%g0 ! prefetch td4
920 srlx $acc11,24,$acc11
921 xor $acc7,$s1,$s1
922 ldx [$tbl+2048+64],%g0 ! prefetch td4
923 srlx $acc13,8,$acc13
924 xor $acc8,$s2,$s2
925 ldx [$tbl+2048+96],%g0 ! prefetch td4
926 srlx $acc14,16,$acc14 !
927 xor $acc9,$s2,$s2
928 ldx [$tbl+2048+128],%g0 ! prefetch td4
929 srlx $acc15,24,$acc15
930 xor $acc10,$s2,$s2
931 ldx [$tbl+2048+160],%g0 ! prefetch td4
932 srl $s0,21,$acc0
933 xor $acc11,$s2,$s2
934 ldx [$tbl+2048+192],%g0 ! prefetch td4
935 xor $acc12,$acc14,$acc14
936 xor $acc13,$s3,$s3
937 ldx [$tbl+2048+224],%g0 ! prefetch td4
938 and $acc0,2040,$acc0 !
939 xor $acc14,$s3,$s3
940 xor $acc15,$s3,$s3
941 ba .Ldec_loop
942 srl $s3,13,$acc1
943
944.align 32
945.Ldec_last:
946 srlx $acc1,8,$acc1 !
947 xor $acc0,$t0,$t0
948 ld [$key+0],$s0
949 srlx $acc2,16,$acc2
950 xor $acc1,$t0,$t0
951 ld [$key+4],$s1
952 srlx $acc3,24,$acc3
953 xor $acc2,$t0,$t0
954 ld [$key+8],$s2 !
955 srlx $acc5,8,$acc5
956 xor $acc3,$t0,$t0
957 ld [$key+12],$s3
958 srlx $acc6,16,$acc6
959 xor $acc4,$t1,$t1
960 srlx $acc7,24,$acc7
961 xor $acc5,$t1,$t1
962 srlx $acc9,8,$acc9 !
963 xor $acc6,$t1,$t1
964 srlx $acc10,16,$acc10
965 xor $acc7,$t1,$t1
966 srlx $acc11,24,$acc11
967 xor $acc8,$t2,$t2
968 srlx $acc13,8,$acc13
969 xor $acc9,$t2,$t2
970 srlx $acc14,16,$acc14 !
971 xor $acc10,$t2,$t2
972 srlx $acc15,24,$acc15
973 xor $acc11,$t2,$t2
974 xor $acc12,$acc14,$acc14
975 xor $acc13,$t3,$t3
976 srl $t0,24,$acc0
977 xor $acc14,$t3,$t3
978 xor $acc15,$t3,$t3 !
979 srl $t3,16,$acc1
980
981 srl $t2,8,$acc2
982 and $acc1,255,$acc1
983 ldub [$rounds+$acc0],$acc0
984 srl $t1,24,$acc4
985 and $acc2,255,$acc2
986 ldub [$rounds+$acc1],$acc1
987 srl $t0,16,$acc5 !
988 and $t1,255,$acc3
989 ldub [$rounds+$acc2],$acc2
990 ldub [$rounds+$acc3],$acc3
991 srl $t3,8,$acc6
992 and $acc5,255,$acc5
993 ldub [$rounds+$acc4],$acc4
994 fmovs %f0,%f0
995 srl $t2,24,$acc8 !
996 and $acc6,255,$acc6
997 ldub [$rounds+$acc5],$acc5
998 srl $t1,16,$acc9
999 and $t2,255,$acc7
1000 ldub [$rounds+$acc6],$acc6
1001 ldub [$rounds+$acc7],$acc7
1002 fmovs %f0,%f0
1003 srl $t0,8,$acc10 !
1004 and $acc9,255,$acc9
1005 ldub [$rounds+$acc8],$acc8
1006 srl $t3,24,$acc12
1007 and $acc10,255,$acc10
1008 ldub [$rounds+$acc9],$acc9
1009 srl $t2,16,$acc13
1010 and $t3,255,$acc11
1011 ldub [$rounds+$acc10],$acc10 !
1012 srl $t1,8,$acc14
1013 and $acc13,255,$acc13
1014 ldub [$rounds+$acc11],$acc11
1015 ldub [$rounds+$acc12],$acc12
1016 and $acc14,255,$acc14
1017 ldub [$rounds+$acc13],$acc13
1018 and $t0,255,$acc15
1019 ldub [$rounds+$acc14],$acc14 !
1020
1021 sll $acc0,24,$acc0
1022 xor $acc3,$s0,$s0
1023 ldub [$rounds+$acc15],$acc15
1024 sll $acc1,16,$acc1
1025 xor $acc0,$s0,$s0
1026 ldx [%sp+$bias+$frame+0],%i7 ! restore return address
1027 fmovs %f0,%f0
1028 sll $acc2,8,$acc2 !
1029 xor $acc1,$s0,$s0
1030 sll $acc4,24,$acc4
1031 xor $acc2,$s0,$s0
1032 sll $acc5,16,$acc5
1033 xor $acc7,$s1,$s1
1034 sll $acc6,8,$acc6
1035 xor $acc4,$s1,$s1
1036 sll $acc8,24,$acc8 !
1037 xor $acc5,$s1,$s1
1038 sll $acc9,16,$acc9
1039 xor $acc11,$s2,$s2
1040 sll $acc10,8,$acc10
1041 xor $acc6,$s1,$s1
1042 sll $acc12,24,$acc12
1043 xor $acc8,$s2,$s2
1044 sll $acc13,16,$acc13 !
1045 xor $acc9,$s2,$s2
1046 sll $acc14,8,$acc14
1047 xor $acc10,$s2,$s2
1048 xor $acc12,$acc14,$acc14
1049 xor $acc13,$s3,$s3
1050 xor $acc14,$s3,$s3
1051 xor $acc15,$s3,$s3
1052
1053 ret
1054 restore
1055.type _sparcv9_AES_decrypt,#function
1056.size _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1057
1058.align 32
1059.globl AES_decrypt
1060AES_decrypt:
1061 or %o0,%o1,%g1
1062 andcc %g1,3,%g0
1063 bnz,pn %xcc,.Lunaligned_dec
1064 save %sp,-$frame,%sp
1065
1066 ld [%i0+0],%o0
1067 ld [%i0+4],%o1
1068 ld [%i0+8],%o2
1069 ld [%i0+12],%o3
1070
10711: call .+8
1072 add %o7,AES_Td-1b,%o4
1073 call _sparcv9_AES_decrypt
1074 mov %i2,%o5
1075
1076 st %o0,[%i1+0]
1077 st %o1,[%i1+4]
1078 st %o2,[%i1+8]
1079 st %o3,[%i1+12]
1080
1081 ret
1082 restore
1083
1084.align 32
1085.Lunaligned_dec:
1086 ldub [%i0+0],%l0
1087 ldub [%i0+1],%l1
1088 ldub [%i0+2],%l2
1089
1090 sll %l0,24,%l0
1091 ldub [%i0+3],%l3
1092 sll %l1,16,%l1
1093 ldub [%i0+4],%l4
1094 sll %l2,8,%l2
1095 or %l1,%l0,%l0
1096 ldub [%i0+5],%l5
1097 sll %l4,24,%l4
1098 or %l3,%l2,%l2
1099 ldub [%i0+6],%l6
1100 sll %l5,16,%l5
1101 or %l0,%l2,%o0
1102 ldub [%i0+7],%l7
1103
1104 sll %l6,8,%l6
1105 or %l5,%l4,%l4
1106 ldub [%i0+8],%l0
1107 or %l7,%l6,%l6
1108 ldub [%i0+9],%l1
1109 or %l4,%l6,%o1
1110 ldub [%i0+10],%l2
1111
1112 sll %l0,24,%l0
1113 ldub [%i0+11],%l3
1114 sll %l1,16,%l1
1115 ldub [%i0+12],%l4
1116 sll %l2,8,%l2
1117 or %l1,%l0,%l0
1118 ldub [%i0+13],%l5
1119 sll %l4,24,%l4
1120 or %l3,%l2,%l2
1121 ldub [%i0+14],%l6
1122 sll %l5,16,%l5
1123 or %l0,%l2,%o2
1124 ldub [%i0+15],%l7
1125
1126 sll %l6,8,%l6
1127 or %l5,%l4,%l4
1128 or %l7,%l6,%l6
1129 or %l4,%l6,%o3
1130
11311: call .+8
1132 add %o7,AES_Td-1b,%o4
1133 call _sparcv9_AES_decrypt
1134 mov %i2,%o5
1135
1136 srl %o0,24,%l0
1137 srl %o0,16,%l1
1138 stb %l0,[%i1+0]
1139 srl %o0,8,%l2
1140 stb %l1,[%i1+1]
1141 stb %l2,[%i1+2]
1142 srl %o1,24,%l4
1143 stb %o0,[%i1+3]
1144
1145 srl %o1,16,%l5
1146 stb %l4,[%i1+4]
1147 srl %o1,8,%l6
1148 stb %l5,[%i1+5]
1149 stb %l6,[%i1+6]
1150 srl %o2,24,%l0
1151 stb %o1,[%i1+7]
1152
1153 srl %o2,16,%l1
1154 stb %l0,[%i1+8]
1155 srl %o2,8,%l2
1156 stb %l1,[%i1+9]
1157 stb %l2,[%i1+10]
1158 srl %o3,24,%l4
1159 stb %o2,[%i1+11]
1160
1161 srl %o3,16,%l5
1162 stb %l4,[%i1+12]
1163 srl %o3,8,%l6
1164 stb %l5,[%i1+13]
1165 stb %l6,[%i1+14]
1166 stb %o3,[%i1+15]
1167
1168 ret
1169 restore
1170.type AES_decrypt,#function
1171.size AES_decrypt,(.-AES_decrypt)
1172___
1173
# fmovs instructions substituting for FP nops were originally added
# to meet specific instruction alignment requirements to maximize ILP.
# As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have
# undesired effect, so just omit them and sacrifice some portion of
# percent in performance...
#
# Every "fmovs ..." up to end of line is blanked out; /m makes $ match at
# each line end and /g covers all occurrences.  The replacement is a plain
# empty string, so no /e (code evaluation) is needed.
$code =~ s/fmovs.*$//gm;

print $code;
# Buffered write errors only surface at close, so check it explicitly.
close STDOUT or die "error closing STDOUT: $!";
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
deleted file mode 100755
index 53e4ef85fd..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl
+++ /dev/null
@@ -1,2809 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Version 2.1.
11#
12# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
13# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
14# [you'll notice a lot of resemblance], such as compressed S-boxes
15# in little-endian byte order, prefetch of these tables in CBC mode,
16# as well as avoiding L1 cache aliasing between stack frame and key
17# schedule and already mentioned tables, compressed Td4...
18#
19# Performance in number of cycles per processed byte for 128-bit key:
20#
21# ECB encrypt ECB decrypt CBC large chunk
22# AMD64 33 41 13.0
23# EM64T 38 59 18.6(*)
24# Core 2 30 43 14.5(*)
25#
26# (*) with hyper-threading off
27
# Command line: [flavour] output.  If the first argument contains a dot it
# is taken to be the output file name and no flavour is set.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected by a nasm/masm/mingw64 flavour or by a .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the x86_64-xlate.pl translator next to this script first, then
# under ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed from here on is piped through the translator.  The
# interpreter and script paths are quoted so that directories containing
# spaces survive the shell, and a failure to start the pipe is fatal
# instead of silently producing no output.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";
40
# Code-generation knobs.  Unlike the 32-bit version, $verticalspin performs
# ~15% better on both AMD and Intel cores.  See aes-586.pl for details on
# $speed_limit.
($verticalspin, $speed_limit) = (1, 512);

# Generated assembly is accumulated in $code, starting with the section
# directive.
$code = ".text\n";

# Register assignment shared by the generator subs below.
($s0, $s1, $s2, $s3) = ("%eax", "%ebx", "%ecx", "%edx");
($acc0, $acc1, $acc2) = ("%esi", "%edi", "%ebp");
# 64-bit names of the same three registers that back $acc0..$acc2.
($mask80, $maskfe, $mask1b) = ("%rsi", "%rdi", "%rbp");
($inp, $out) = ("%r8", "%r9");
($t0, $t1, $t2) = ("%r10d", "%r11d", "%r12d");
($rnds, $sbox, $key) = ("%r13d", "%r14", "%r15");
62
# hi(REG): return the high-byte alias (%ah..%dh) of a legacy register given
# by its 32- or 64-bit name (%eax/%rax .. %edx/%rdx).  Any other name is
# returned unchanged.  No prototype is declared: the sub takes one argument.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; }
# lo(REG): return the low-byte alias of a register name.
#   %eax/%rax .. %edx/%rdx -> %al .. %dl
#   %esi/%rsi, %edi/%rdi   -> %sil, %dil
#   %rN / %rNd             -> %rNb
# Names matching none of the patterns (e.g. %ebp) are returned unchanged.
# No prototype is declared: the sub takes one argument.
sub lo { my $r=shift;
	$r =~ s/%[er]([a-d])x/%${1}l/;
	$r =~ s/%[er]([sd]i)/%${1}l/;
	$r =~ s/%(r[0-9]+)[d]?/%${1}b/;
	$r; }
# LO(REG): return the 32-bit name of a 64-bit register.
#   %rax, %rsi, ... -> %eax, %esi, ...
#   %rN             -> %rNd
# A name that is already in %rNd form is left as is (the optional trailing
# "d" is consumed), so the mapping is idempotent.  Other names are returned
# unchanged.  No prototype is declared: the sub takes one argument.
sub LO { my $r=shift;
	$r =~ s/%r([a-z]+)/%e${1}/;
	$r =~ s/%r([0-9]+)d?/%r${1}d/;
	$r; }
# _data_word(LIST): append one ".long" line per value to $code, with the
# value written twice on that line (as 8-digit hex).  Emission stops at the
# first undefined element.  No prototype is declared: the sub takes a list.
sub _data_word
{   for my $word (@_) {
	last unless defined($word);
	$code.=sprintf".long\t0x%08x,0x%08x\n",$word,$word;
    }
}
# data_word(LIST): append a single ".long" directive to $code holding all
# values as comma-separated 8-digit hex words.  An empty list emits nothing
# (previously it produced a spurious zero word).  No prototype is declared:
# the sub takes a list.
sub data_word
{   return unless @_;
    $code.=".long\t".join(",", map { sprintf"0x%08x",$_ } @_)."\n";
}
80
# data_byte(LIST): append a single ".byte" directive to $code holding all
# values as comma-separated 2-digit hex bytes.  Each value is masked with
# 0xff first.  An empty list emits nothing (previously it produced a
# spurious zero byte).  No prototype is declared: the sub takes a list.
sub data_byte
{   return unless @_;
    $code.=".byte\t".join(",", map { sprintf"0x%02x",$_&0xff } @_)."\n";
}
88
# encvert(): append one inner encryption round to $code, handling all four
# state registers ($s0..$s3) in a single interleaved instruction stream.
# Takes no arguments; works on the file-level register variables.
#
# Emitted code, as visible in the template below:
#   - each byte of $s0..$s3 is extracted with movzb (or after shr 16) and
#     used as an index into the table at $sbox with an 8-byte stride,
#     reading at byte offsets 0..3; results are accumulated in $t0..$t3;
#   - $key is advanced by 16 and the four words at 0/4/8/12($key) are loaded
#     into $s0..$s3 and XORed with $t0..$t3, leaving the new state in
#     $s0..$s3.
# $t3 is %r8d, the 32-bit name of the register held in $inp, so the emitted
# code destroys $inp.
# NOTE(review): the backticked &lo()/&hi() expressions are stored literally
# in $code here; presumably they are evaluated later in the script -- confirm.
89sub encvert()
90{ my $t3="%r8d"; # zaps $inp!
91
92$code.=<<___;
93 # favor 3-way issue Opteron pipeline...
94 movzb `&lo("$s0")`,$acc0
95 movzb `&lo("$s1")`,$acc1
96 movzb `&lo("$s2")`,$acc2
97 mov 0($sbox,$acc0,8),$t0
98 mov 0($sbox,$acc1,8),$t1
99 mov 0($sbox,$acc2,8),$t2
100
101 movzb `&hi("$s1")`,$acc0
102 movzb `&hi("$s2")`,$acc1
103 movzb `&lo("$s3")`,$acc2
104 xor 3($sbox,$acc0,8),$t0
105 xor 3($sbox,$acc1,8),$t1
106 mov 0($sbox,$acc2,8),$t3
107
108 movzb `&hi("$s3")`,$acc0
109 shr \$16,$s2
110 movzb `&hi("$s0")`,$acc2
111 xor 3($sbox,$acc0,8),$t2
112 shr \$16,$s3
113 xor 3($sbox,$acc2,8),$t3
114
115 shr \$16,$s1
116 lea 16($key),$key
117 shr \$16,$s0
118
119 movzb `&lo("$s2")`,$acc0
120 movzb `&lo("$s3")`,$acc1
121 movzb `&lo("$s0")`,$acc2
122 xor 2($sbox,$acc0,8),$t0
123 xor 2($sbox,$acc1,8),$t1
124 xor 2($sbox,$acc2,8),$t2
125
126 movzb `&hi("$s3")`,$acc0
127 movzb `&hi("$s0")`,$acc1
128 movzb `&lo("$s1")`,$acc2
129 xor 1($sbox,$acc0,8),$t0
130 xor 1($sbox,$acc1,8),$t1
131 xor 2($sbox,$acc2,8),$t3
132
133 mov 12($key),$s3
134 movzb `&hi("$s1")`,$acc1
135 movzb `&hi("$s2")`,$acc2
136 mov 0($key),$s0
137 xor 1($sbox,$acc1,8),$t2
138 xor 1($sbox,$acc2,8),$t3
139
140 mov 4($key),$s1
141 mov 8($key),$s2
142 xor $t0,$s0
143 xor $t1,$s1
144 xor $t2,$s2
145 xor $t3,$s3
146___
147}
148
# enclastvert(): append the final encryption round to $code, in the same
# interleaved ("vertical") layout as encvert().  Takes no arguments; works
# on the file-level register variables.
#
# Emitted code, as visible in the template below:
#   - every table access contributes exactly one byte lane: the lowest lane
#     is read with movzb from byte offset 2 of the 8-byte table entry, the
#     other lanes are read as 32-bit words from offset 0 or 2 and masked
#     with 0x0000ff00, 0x00ff0000 or 0xff000000 before being XORed into
#     $t0..$t3;
#   - $key is not advanced; the round key is read at 16+0/4/8/12($key) into
#     $s0..$s3 and XORed with $t0..$t3, leaving the result in $s0..$s3.
# $t3 is %r8d, the 32-bit name of the register held in $inp, so the emitted
# code destroys $inp.
# NOTE(review): the backticked &lo()/&hi() expressions are stored literally
# in $code here; presumably they are evaluated later in the script -- confirm.
149sub enclastvert()
150{ my $t3="%r8d"; # zaps $inp!
151
152$code.=<<___;
153 movzb `&lo("$s0")`,$acc0
154 movzb `&lo("$s1")`,$acc1
155 movzb `&lo("$s2")`,$acc2
156 movzb 2($sbox,$acc0,8),$t0
157 movzb 2($sbox,$acc1,8),$t1
158 movzb 2($sbox,$acc2,8),$t2
159
160 movzb `&lo("$s3")`,$acc0
161 movzb `&hi("$s1")`,$acc1
162 movzb `&hi("$s2")`,$acc2
163 movzb 2($sbox,$acc0,8),$t3
164 mov 0($sbox,$acc1,8),$acc1 #$t0
165 mov 0($sbox,$acc2,8),$acc2 #$t1
166
167 and \$0x0000ff00,$acc1
168 and \$0x0000ff00,$acc2
169
170 xor $acc1,$t0
171 xor $acc2,$t1
172 shr \$16,$s2
173
174 movzb `&hi("$s3")`,$acc0
175 movzb `&hi("$s0")`,$acc1
176 shr \$16,$s3
177 mov 0($sbox,$acc0,8),$acc0 #$t2
178 mov 0($sbox,$acc1,8),$acc1 #$t3
179
180 and \$0x0000ff00,$acc0
181 and \$0x0000ff00,$acc1
182 shr \$16,$s1
183 xor $acc0,$t2
184 xor $acc1,$t3
185 shr \$16,$s0
186
187 movzb `&lo("$s2")`,$acc0
188 movzb `&lo("$s3")`,$acc1
189 movzb `&lo("$s0")`,$acc2
190 mov 0($sbox,$acc0,8),$acc0 #$t0
191 mov 0($sbox,$acc1,8),$acc1 #$t1
192 mov 0($sbox,$acc2,8),$acc2 #$t2
193
194 and \$0x00ff0000,$acc0
195 and \$0x00ff0000,$acc1
196 and \$0x00ff0000,$acc2
197
198 xor $acc0,$t0
199 xor $acc1,$t1
200 xor $acc2,$t2
201
202 movzb `&lo("$s1")`,$acc0
203 movzb `&hi("$s3")`,$acc1
204 movzb `&hi("$s0")`,$acc2
205 mov 0($sbox,$acc0,8),$acc0 #$t3
206 mov 2($sbox,$acc1,8),$acc1 #$t0
207 mov 2($sbox,$acc2,8),$acc2 #$t1
208
209 and \$0x00ff0000,$acc0
210 and \$0xff000000,$acc1
211 and \$0xff000000,$acc2
212
213 xor $acc0,$t3
214 xor $acc1,$t0
215 xor $acc2,$t1
216
217 movzb `&hi("$s1")`,$acc0
218 movzb `&hi("$s2")`,$acc1
219 mov 16+12($key),$s3
220 mov 2($sbox,$acc0,8),$acc0 #$t2
221 mov 2($sbox,$acc1,8),$acc1 #$t3
222 mov 16+0($key),$s0
223
224 and \$0xff000000,$acc0
225 and \$0xff000000,$acc1
226
227 xor $acc0,$t2
228 xor $acc1,$t3
229
230 mov 16+4($key),$s1
231 mov 16+8($key),$s2
232 xor $t0,$s0
233 xor $t1,$s1
234 xor $t2,$s2
235 xor $t3,$s3
236___
237}
238
# encstep(INDEX, S0, S1, S2, S3): append the code computing output word
# INDEX (0..3) of one inner encryption round to $code.
#
# The register list is passed already rotated so that its first element is
# the word supplying the low byte.  For INDEX 0..2 the result goes to
# $t0..$t2 and $acc0..$acc2 serve as scratch.  For INDEX 3 the result
# replaces the first listed register, the other three listed registers are
# used as scratch, and they are finally reloaded from $t0..$t2.  The round
# key word at 4*INDEX($key) is XORed in; $key is advanced by 16 on INDEX 0.
sub encstep()
{   my ($idx, @st) = @_;
    my $final = ($idx == 3);
    my ($ra, $rb, $rc) = $final ? @st[1..3] : ($acc0, $acc1, $acc2);
    my $dst = ($t0, $t1, $t2, $st[0])[$idx];
    my @asm;

    push @asm, "movzb ".&lo($st[0]).",$dst";
    push @asm, "mov $st[2],$rb"		unless $final;
    push @asm, "lea 16($key),$key"	if $idx == 0;

    push @asm, "movzb ".&hi($st[1]).",$ra";
    push @asm, "mov 0($sbox,$dst,8),$dst";

    push @asm, "shr \$16,$rb";
    push @asm, "mov $st[3],$rc"		unless $final;
    push @asm, "xor 3($sbox,$ra,8),$dst";

    push @asm, "movzb ".&lo($rb).",$rb";
    push @asm, "shr \$24,$rc";
    push @asm, "xor 4*$idx($key),$dst";

    push @asm, "xor 2($sbox,$rb,8),$dst";
    push @asm, "xor 1($sbox,$rc,8),$dst";

    # Last step: move the three parked results back into the state.
    if ($final) {
	push @asm, "mov $t0,$st[1]";
	push @asm, "mov $t1,$st[2]";
	push @asm, "mov $t2,$st[3]";
    }

    $code .= " $_\n" for @asm;
    $code .= "\n";
}
274
# enclast(INDEX, S0, S1, S2, S3): append the code computing output word
# INDEX (0..3) of the final encryption round to $code.
#
# Register selection follows encstep(): $t0..$t2 receive words 0..2 with
# $acc0..$acc2 as scratch, while for INDEX 3 the result replaces the first
# listed register, the remaining listed registers are the scratch, and they
# are reloaded from $t0..$t2 in between the closing XORs.  Each table value
# is masked down to one byte lane (0x000000ff .. 0xff000000) before being
# combined.  No round key is applied here.
sub enclast()
{   my ($idx, @st) = @_;
    my $final = ($idx == 3);
    my ($ra, $rb, $rc) = $final ? @st[1..3] : ($acc0, $acc1, $acc2);
    my $dst = ($t0, $t1, $t2, $st[0])[$idx];
    my @asm;

    push @asm, "movzb ".&lo($st[0]).",$dst";
    push @asm, "mov $st[2],$rb"		unless $final;

    push @asm, "mov 2($sbox,$dst,8),$dst";
    push @asm, "shr \$16,$rb";
    push @asm, "mov $st[3],$rc"		unless $final;

    push @asm, "and \$0x000000ff,$dst";
    push @asm, "movzb ".&hi($st[1]).",$ra";
    push @asm, "movzb ".&lo($rb).",$rb";
    push @asm, "shr \$24,$rc";

    push @asm, "mov 0($sbox,$ra,8),$ra";
    push @asm, "mov 0($sbox,$rb,8),$rb";
    push @asm, "mov 2($sbox,$rc,8),$rc";

    push @asm, "and \$0x0000ff00,$ra";
    push @asm, "and \$0x00ff0000,$rb";
    push @asm, "and \$0xff000000,$rc";

    # Each scratch register is refilled right after its last use.
    push @asm, "xor $ra,$dst";
    push @asm, "mov $t0,$st[1]"		if $final;
    push @asm, "xor $rb,$dst";
    push @asm, "mov $t1,$st[2]"		if $final;
    push @asm, "xor $rc,$dst";
    push @asm, "mov $t2,$st[3]"		if $final;

    $code .= " $_\n" for @asm;
    $code .= "\n";
}
315
# Emit _x86_64_AES_encrypt, the table-driven block encryption routine.
# The generated code XORs the state in $s0..$s3 with the round key at
# 0($key), loads the round count from 240($key) into $rnds, subtracts one
# and then repeats the .Lenc_loop body, decrementing $rnds each time, until
# it reaches zero.  The last round is emitted once after the loop.
316$code.=<<___;
317.type _x86_64_AES_encrypt,\@abi-omnipotent
318.align 16
319_x86_64_AES_encrypt:
320 xor 0($key),$s0 # xor with key
321 xor 4($key),$s1
322 xor 8($key),$s2
323 xor 12($key),$s3
324
325 mov 240($key),$rnds # load key->rounds
326 sub \$1,$rnds
327 jmp .Lenc_loop
328.align 16
329.Lenc_loop:
330___
# Loop body: either the single interleaved routine (encvert) or four
# per-word steps with the state registers rotated by one position per call.
331 if ($verticalspin) { &encvert(); }
332 else { &encstep(0,$s0,$s1,$s2,$s3);
333 &encstep(1,$s1,$s2,$s3,$s0);
334 &encstep(2,$s2,$s3,$s0,$s1);
335 &encstep(3,$s3,$s0,$s1,$s2);
336 }
337$code.=<<___;
338 sub \$1,$rnds
339 jnz .Lenc_loop
340___
# Final round.  In the enclast variant the closing round-key XOR (key
# words at 16+0..16+12 from $key) is emitted here, after the four steps.
341 if ($verticalspin) { &enclastvert(); }
342 else { &enclast(0,$s0,$s1,$s2,$s3);
343 &enclast(1,$s1,$s2,$s3,$s0);
344 &enclast(2,$s2,$s3,$s0,$s1);
345 &enclast(3,$s3,$s0,$s1,$s2);
346 $code.=<<___;
347 xor 16+0($key),$s0 # xor with key
348 xor 16+4($key),$s1
349 xor 16+8($key),$s2
350 xor 16+12($key),$s3
351___
352 }
353$code.=<<___;
354 .byte 0xf3,0xc3 # rep ret
355.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
356___
357
358# it's possible to implement this by shifting tN by 8, filling least
359# significant byte with byte load and finally bswap-ing at the end,
360# but such partial register load kills Core 2...
#
# enccompactvert() -- append to $code one round's worth of byte-wise table
# substitution for the compact encryption path.  Takes no arguments.
# Every lookup is a single-byte load at ($sbox,index,1).  Each output word
# is assembled from byte 0 of one state word, byte 1 of the next, byte 2
# of the one after and byte 3 of the last (e.g. the new $s0 takes its
# bytes from $s0,$s1,$s2,$s3 in that order), shifted into place and XORed
# together.  %r8d, %r9d and %r13d are used as extra temporaries, and the
# results are left in $s0..$s3.  No round key is applied here.
361sub enccompactvert()
362{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
363
364$code.=<<___;
365 movzb `&lo("$s0")`,$t0
366 movzb `&lo("$s1")`,$t1
367 movzb `&lo("$s2")`,$t2
368 movzb ($sbox,$t0,1),$t0
369 movzb ($sbox,$t1,1),$t1
370 movzb ($sbox,$t2,1),$t2
371
372 movzb `&lo("$s3")`,$t3
373 movzb `&hi("$s1")`,$acc0
374 movzb `&hi("$s2")`,$acc1
375 movzb ($sbox,$t3,1),$t3
376 movzb ($sbox,$acc0,1),$t4 #$t0
377 movzb ($sbox,$acc1,1),$t5 #$t1
378
379 movzb `&hi("$s3")`,$acc2
380 movzb `&hi("$s0")`,$acc0
381 shr \$16,$s2
382 movzb ($sbox,$acc2,1),$acc2 #$t2
383 movzb ($sbox,$acc0,1),$acc0 #$t3
384 shr \$16,$s3
385
386 movzb `&lo("$s2")`,$acc1
387 shl \$8,$t4
388 shl \$8,$t5
389 movzb ($sbox,$acc1,1),$acc1 #$t0
390 xor $t4,$t0
391 xor $t5,$t1
392
393 movzb `&lo("$s3")`,$t4
394 shr \$16,$s0
395 shr \$16,$s1
396 movzb `&lo("$s0")`,$t5
397 shl \$8,$acc2
398 shl \$8,$acc0
399 movzb ($sbox,$t4,1),$t4 #$t1
400 movzb ($sbox,$t5,1),$t5 #$t2
401 xor $acc2,$t2
402 xor $acc0,$t3
403
404 movzb `&lo("$s1")`,$acc2
405 movzb `&hi("$s3")`,$acc0
406 shl \$16,$acc1
407 movzb ($sbox,$acc2,1),$acc2 #$t3
408 movzb ($sbox,$acc0,1),$acc0 #$t0
409 xor $acc1,$t0
410
411 movzb `&hi("$s0")`,$acc1
412 shr \$8,$s2
413 shr \$8,$s1
414 movzb ($sbox,$acc1,1),$acc1 #$t1
415 movzb ($sbox,$s2,1),$s3 #$t3
416 movzb ($sbox,$s1,1),$s2 #$t2
417 shl \$16,$t4
418 shl \$16,$t5
419 shl \$16,$acc2
420 xor $t4,$t1
421 xor $t5,$t2
422 xor $acc2,$t3
423
424 shl \$24,$acc0
425 shl \$24,$acc1
426 shl \$24,$s3
427 xor $acc0,$t0
428 shl \$24,$s2
429 xor $acc1,$t1
430 mov $t0,$s0
431 mov $t1,$s1
432 xor $t2,$s2
433 xor $t3,$s3
434___
435}
436
# enctransform_ref($sn) -- append to $code a straight-line transform of one
# 32-bit state word held in register $sn, treating it as four packed bytes.
#
#   $sn  name of the 32-bit register to transform in place
#
# The emitted code first forms $r2, the per-byte doubling of $sn: the high
# bit of every byte is isolated with 0x80808080, turned into a 0x1b
# correction per byte, and XORed into the masked left shift.  It then
# combines ($sn^$r2) rotated left by 24, $r2, and the original word rotated
# right by 16 and by 24.  This has the shape of the AES MixColumns step
# applied to one column.  %r8d, %r9d and %r13d are clobbered.
437sub enctransform_ref()
438{ my $sn = shift;
439 my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
440
441$code.=<<___;
442 mov $sn,$acc
443 and \$0x80808080,$acc
444 mov $acc,$tmp
445 shr \$7,$tmp
446 lea ($sn,$sn),$r2
447 sub $tmp,$acc
448 and \$0xfefefefe,$r2
449 and \$0x1b1b1b1b,$acc
450 mov $sn,$tmp
451 xor $acc,$r2
452
453 xor $r2,$sn
454 rol \$24,$sn
455 xor $r2,$sn
456 ror \$16,$tmp
457 xor $tmp,$sn
458 ror \$8,$tmp
459 xor $tmp,$sn
460___
461}
462
463# unlike decrypt case it does not pay off to parallelize enctransform
#
# enctransform() -- append to $code the packed-byte transform of all four
# state words $s0..$s3.  Takes no arguments.  The arithmetic applied to
# each word is the same mask/double/rotate sequence that enctransform_ref
# emits for a single register; here the words are processed two at a time
# ($s0/$s1, then $s2/$s3) with the instructions of each pair interleaved.
# $acc0, $acc1, $acc2, $t0..$t2, %r8d and %r9d are used as scratch.
# The tail of the second pair also issues four loads from $sbox at offsets
# 0, 64, 128 and 192 whose results are not used afterwards (the inline
# comment marks them as a Te4 prefetch).
464sub enctransform()
465{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
466
467$code.=<<___;
468 mov $s0,$acc0
469 mov $s1,$acc1
470 and \$0x80808080,$acc0
471 and \$0x80808080,$acc1
472 mov $acc0,$t0
473 mov $acc1,$t1
474 shr \$7,$t0
475 lea ($s0,$s0),$r20
476 shr \$7,$t1
477 lea ($s1,$s1),$r21
478 sub $t0,$acc0
479 sub $t1,$acc1
480 and \$0xfefefefe,$r20
481 and \$0xfefefefe,$r21
482 and \$0x1b1b1b1b,$acc0
483 and \$0x1b1b1b1b,$acc1
484 mov $s0,$t0
485 mov $s1,$t1
486 xor $acc0,$r20
487 xor $acc1,$r21
488
489 xor $r20,$s0
490 xor $r21,$s1
491 mov $s2,$acc0
492 mov $s3,$acc1
493 rol \$24,$s0
494 rol \$24,$s1
495 and \$0x80808080,$acc0
496 and \$0x80808080,$acc1
497 xor $r20,$s0
498 xor $r21,$s1
499 mov $acc0,$t2
500 mov $acc1,$t3
501 ror \$16,$t0
502 ror \$16,$t1
503 shr \$7,$t2
504 lea ($s2,$s2),$r20
505 xor $t0,$s0
506 xor $t1,$s1
507 shr \$7,$t3
508 lea ($s3,$s3),$r21
509 ror \$8,$t0
510 ror \$8,$t1
511 sub $t2,$acc0
512 sub $t3,$acc1
513 xor $t0,$s0
514 xor $t1,$s1
515
516 and \$0xfefefefe,$r20
517 and \$0xfefefefe,$r21
518 and \$0x1b1b1b1b,$acc0
519 and \$0x1b1b1b1b,$acc1
520 mov $s2,$t2
521 mov $s3,$t3
522 xor $acc0,$r20
523 xor $acc1,$r21
524
525 xor $r20,$s2
526 xor $r21,$s3
527 rol \$24,$s2
528 rol \$24,$s3
529 xor $r20,$s2
530 xor $r21,$s3
531 mov 0($sbox),$acc0 # prefetch Te4
532 ror \$16,$t2
533 ror \$16,$t3
534 mov 64($sbox),$acc1
535 xor $t2,$s2
536 xor $t3,$s3
537 mov 128($sbox),$r20
538 ror \$8,$t2
539 ror \$8,$t3
540 mov 192($sbox),$r21
541 xor $t2,$s2
542 xor $t3,$s3
543___
544}
545
# Emit _x86_64_AES_encrypt_compact, the encryption routine built on
# byte-wise table lookups.  On entry it touches the 256-byte table at
# $sbox with eight loads spaced 32 bytes apart (results discarded).  Each
# pass of .Lenc_loop_compact XORs the state with the round key at $key,
# advances $key by 16 and runs the enccompactvert substitution.  The loop
# ends when $key equals the pointer stored at 16(%rsp); otherwise the
# enctransform step is applied and the loop repeats.  After the last
# substitution the state is XORed with the round key at $key once more.
# NOTE(review): 16(%rsp) here appears to be the "end of key schedule" slot
# that AES_encrypt stores at 8(%rsp) before the call (the call pushes an
# 8-byte return address) -- confirm for any other caller.
546$code.=<<___;
547.type _x86_64_AES_encrypt_compact,\@abi-omnipotent
548.align 16
549_x86_64_AES_encrypt_compact:
550 lea 128($sbox),$inp # size optimization
551 mov 0-128($inp),$acc1 # prefetch Te4
552 mov 32-128($inp),$acc2
553 mov 64-128($inp),$t0
554 mov 96-128($inp),$t1
555 mov 128-128($inp),$acc1
556 mov 160-128($inp),$acc2
557 mov 192-128($inp),$t0
558 mov 224-128($inp),$t1
559 jmp .Lenc_loop_compact
560.align 16
561.Lenc_loop_compact:
562 xor 0($key),$s0 # xor with key
563 xor 4($key),$s1
564 xor 8($key),$s2
565 xor 12($key),$s3
566 lea 16($key),$key
567___
568 &enccompactvert();
569$code.=<<___;
570 cmp 16(%rsp),$key
571 je .Lenc_compact_done
572___
573 &enctransform();
574$code.=<<___;
575 jmp .Lenc_loop_compact
576.align 16
577.Lenc_compact_done:
578 xor 0($key),$s0
579 xor 4($key),$s1
580 xor 8($key),$s2
581 xor 12($key),$s3
582 .byte 0xf3,0xc3 # rep ret
583.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
584___
585
586# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Emit the public AES_encrypt entry point.  The generated code:
#   - pushes %rbx, %rbp and %r12..%r15, then carves a 64-byte-aligned
#     frame whose position depends on the address of the key argument;
#   - stores the output pointer at 16(%rsp) and the pre-adjustment stack
#     pointer at 24(%rsp);
#   - loads the 16-byte input block from (%rdi) into $s0..$s3;
#   - computes key + 16*rounds (rounds read from 240($key)) and stores the
#     key pointer and that end pointer at (%rsp) and 8(%rsp);
#   - selects one of the table copies starting at .LAES_Te+2048, offset by
#     a multiple of 0x100 derived from the frame address;
#   - calls _x86_64_AES_encrypt_compact and writes $s0..$s3 to the output;
#   - reloads the six saved registers relative to the saved stack pointer
#     and restores %rsp.
587$code.=<<___;
588.globl AES_encrypt
589.type AES_encrypt,\@function,3
590.align 16
591AES_encrypt:
592 push %rbx
593 push %rbp
594 push %r12
595 push %r13
596 push %r14
597 push %r15
598
599 # allocate frame "above" key schedule
600 mov %rsp,%r10
601 lea -63(%rdx),%rcx # %rdx is key argument
602 and \$-64,%rsp
603 sub %rsp,%rcx
604 neg %rcx
605 and \$0x3c0,%rcx
606 sub %rcx,%rsp
607 sub \$32,%rsp
608
609 mov %rsi,16(%rsp) # save out
610 mov %r10,24(%rsp) # save real stack pointer
611.Lenc_prologue:
612
613 mov %rdx,$key
614 mov 240($key),$rnds # load rounds
615
616 mov 0(%rdi),$s0 # load input vector
617 mov 4(%rdi),$s1
618 mov 8(%rdi),$s2
619 mov 12(%rdi),$s3
620
621 shl \$4,$rnds
622 lea ($key,$rnds),%rbp
623 mov $key,(%rsp) # key schedule
624 mov %rbp,8(%rsp) # end of key schedule
625
626 # pick Te4 copy which can't "overlap" with stack frame or key schedule
627 lea .LAES_Te+2048(%rip),$sbox
628 lea 768(%rsp),%rbp
629 sub $sbox,%rbp
630 and \$0x300,%rbp
631 lea ($sbox,%rbp),$sbox
632
633 call _x86_64_AES_encrypt_compact
634
635 mov 16(%rsp),$out # restore out
636 mov 24(%rsp),%rsi # restore saved stack pointer
637 mov $s0,0($out) # write output vector
638 mov $s1,4($out)
639 mov $s2,8($out)
640 mov $s3,12($out)
641
642 mov (%rsi),%r15
643 mov 8(%rsi),%r14
644 mov 16(%rsi),%r13
645 mov 24(%rsi),%r12
646 mov 32(%rsi),%rbp
647 mov 40(%rsi),%rbx
648 lea 48(%rsi),%rsp
649.Lenc_epilogue:
650 ret
651.size AES_encrypt,.-AES_encrypt
652___
653
654#------------------------------------------------------------------#
655
# decvert() -- append to $code one full decryption round with all four
# output words interleaved.  Takes no arguments.
# Every byte of the state indexes the 8-byte-stride table at $sbox, using
# displacement 0 for byte 0, 3 for byte 1, 2 for byte 2 and 1 for byte 3.
# Each output word draws byte 0 from its own state word and bytes 1..3
# from the previous three words in turn (e.g. the new $s0 uses
# $s0,$s3,$s2,$s1).  $key is advanced by 16 part-way through, and the
# results are XORed with the round key found at the new $key, ending up in
# $s0..$s3.  %r8d serves as the fourth temporary.
656sub decvert()
657{ my $t3="%r8d"; # zaps $inp!
658
659$code.=<<___;
660 # favor 3-way issue Opteron pipeline...
661 movzb `&lo("$s0")`,$acc0
662 movzb `&lo("$s1")`,$acc1
663 movzb `&lo("$s2")`,$acc2
664 mov 0($sbox,$acc0,8),$t0
665 mov 0($sbox,$acc1,8),$t1
666 mov 0($sbox,$acc2,8),$t2
667
668 movzb `&hi("$s3")`,$acc0
669 movzb `&hi("$s0")`,$acc1
670 movzb `&lo("$s3")`,$acc2
671 xor 3($sbox,$acc0,8),$t0
672 xor 3($sbox,$acc1,8),$t1
673 mov 0($sbox,$acc2,8),$t3
674
675 movzb `&hi("$s1")`,$acc0
676 shr \$16,$s0
677 movzb `&hi("$s2")`,$acc2
678 xor 3($sbox,$acc0,8),$t2
679 shr \$16,$s3
680 xor 3($sbox,$acc2,8),$t3
681
682 shr \$16,$s1
683 lea 16($key),$key
684 shr \$16,$s2
685
686 movzb `&lo("$s2")`,$acc0
687 movzb `&lo("$s3")`,$acc1
688 movzb `&lo("$s0")`,$acc2
689 xor 2($sbox,$acc0,8),$t0
690 xor 2($sbox,$acc1,8),$t1
691 xor 2($sbox,$acc2,8),$t2
692
693 movzb `&hi("$s1")`,$acc0
694 movzb `&hi("$s2")`,$acc1
695 movzb `&lo("$s1")`,$acc2
696 xor 1($sbox,$acc0,8),$t0
697 xor 1($sbox,$acc1,8),$t1
698 xor 2($sbox,$acc2,8),$t3
699
700 movzb `&hi("$s3")`,$acc0
701 mov 12($key),$s3
702 movzb `&hi("$s0")`,$acc2
703 xor 1($sbox,$acc0,8),$t2
704 mov 0($key),$s0
705 xor 1($sbox,$acc2,8),$t3
706
707 xor $t0,$s0
708 mov 4($key),$s1
709 mov 8($key),$s2
710 xor $t2,$s2
711 xor $t1,$s1
712 xor $t3,$s3
713___
714}
715
# declastvert() -- append to $code the final decryption round with all
# four output words interleaved.  Takes no arguments.
# $sbox is temporarily advanced by 2048 so that the single-byte lookups
# ($sbox,index,1) need no displacement, and is moved back by 2048 before
# the end.  Each looked-up byte is shifted to its lane (0, 8, 16 or 24
# bits) and XORed into $t0..$t3; the byte-to-word assignment is the same
# as in decvert.  The round key is read from 16+0..16+12 relative to $key
# ($key itself is not advanced) and XORed with the results into $s0..$s3.
# %r8d serves as the fourth temporary.
716sub declastvert()
717{ my $t3="%r8d"; # zaps $inp!
718
719$code.=<<___;
720 lea 2048($sbox),$sbox # size optimization
721 movzb `&lo("$s0")`,$acc0
722 movzb `&lo("$s1")`,$acc1
723 movzb `&lo("$s2")`,$acc2
724 movzb ($sbox,$acc0,1),$t0
725 movzb ($sbox,$acc1,1),$t1
726 movzb ($sbox,$acc2,1),$t2
727
728 movzb `&lo("$s3")`,$acc0
729 movzb `&hi("$s3")`,$acc1
730 movzb `&hi("$s0")`,$acc2
731 movzb ($sbox,$acc0,1),$t3
732 movzb ($sbox,$acc1,1),$acc1 #$t0
733 movzb ($sbox,$acc2,1),$acc2 #$t1
734
735 shl \$8,$acc1
736 shl \$8,$acc2
737
738 xor $acc1,$t0
739 xor $acc2,$t1
740 shr \$16,$s3
741
742 movzb `&hi("$s1")`,$acc0
743 movzb `&hi("$s2")`,$acc1
744 shr \$16,$s0
745 movzb ($sbox,$acc0,1),$acc0 #$t2
746 movzb ($sbox,$acc1,1),$acc1 #$t3
747
748 shl \$8,$acc0
749 shl \$8,$acc1
750 shr \$16,$s1
751 xor $acc0,$t2
752 xor $acc1,$t3
753 shr \$16,$s2
754
755 movzb `&lo("$s2")`,$acc0
756 movzb `&lo("$s3")`,$acc1
757 movzb `&lo("$s0")`,$acc2
758 movzb ($sbox,$acc0,1),$acc0 #$t0
759 movzb ($sbox,$acc1,1),$acc1 #$t1
760 movzb ($sbox,$acc2,1),$acc2 #$t2
761
762 shl \$16,$acc0
763 shl \$16,$acc1
764 shl \$16,$acc2
765
766 xor $acc0,$t0
767 xor $acc1,$t1
768 xor $acc2,$t2
769
770 movzb `&lo("$s1")`,$acc0
771 movzb `&hi("$s1")`,$acc1
772 movzb `&hi("$s2")`,$acc2
773 movzb ($sbox,$acc0,1),$acc0 #$t3
774 movzb ($sbox,$acc1,1),$acc1 #$t0
775 movzb ($sbox,$acc2,1),$acc2 #$t1
776
777 shl \$16,$acc0
778 shl \$24,$acc1
779 shl \$24,$acc2
780
781 xor $acc0,$t3
782 xor $acc1,$t0
783 xor $acc2,$t1
784
785 movzb `&hi("$s3")`,$acc0
786 movzb `&hi("$s0")`,$acc1
787 mov 16+12($key),$s3
788 movzb ($sbox,$acc0,1),$acc0 #$t2
789 movzb ($sbox,$acc1,1),$acc1 #$t3
790 mov 16+0($key),$s0
791
792 shl \$24,$acc0
793 shl \$24,$acc1
794
795 xor $acc0,$t2
796 xor $acc1,$t3
797
798 mov 16+4($key),$s1
799 mov 16+8($key),$s2
800 lea -2048($sbox),$sbox
801 xor $t0,$s0
802 xor $t1,$s1
803 xor $t2,$s2
804 xor $t3,$s3
805___
806}
807
# decstep($i,@s) -- append to $code the instructions that compute output
# word $i (0..3) of one full decryption round by table lookup.
#
#   $i  index of the output word being produced
#   @s  four state register names in the order chosen by the caller: byte
#       0 of the result index comes from $s[0], byte 1 from $s[1], byte 2
#       from $s[2] and byte 3 from $s[3]
#
# Words 0..2 are accumulated in $t0..$t2 with $acc0..$acc2 as index
# scratch.  Word 3 is accumulated in $s[0] itself and uses $s[1..3]
# directly as scratch; afterwards $t2, $t1 and $t0 are moved into $s[1],
# $s[2] and $s[3] respectively.  No round key is applied here.
sub decstep()
{
    my ($word,@reg) = @_;
    my $in_place = ($word==3);

    my ($byte1,$byte2,$byte3) = $in_place ? ($reg[1],$reg[2],$reg[3])
                                          : ($acc0,$acc1,$acc2);
    my $dst = ($t0,$t1,$t2,$reg[0])[$word];

    unless ($in_place) {
        $code .= " mov $reg[0],$dst\n";
        $code .= " mov $reg[2],$byte2\n";
    }
    $code .= " and \$0xFF,$dst\n";

    $code .= " mov 0($sbox,$dst,8),$dst\n";
    $code .= " shr \$16,$byte2\n";
    unless ($in_place) { $code .= " mov $reg[3],$byte3\n"; }

    $code .= " movzb ".&hi($reg[1]).",$byte1\n";
    $code .= " and \$0xFF,$byte2\n";
    $code .= " shr \$24,$byte3\n";

    $code .= " xor 3($sbox,$byte1,8),$dst\n";
    $code .= " xor 2($sbox,$byte2,8),$dst\n";
    $code .= " xor 1($sbox,$byte3,8),$dst\n";

    # Last word done: move the three parked results back into the state.
    if ($in_place) {
        $code .= " mov $t2,$reg[1]\n";
        $code .= " mov $t1,$reg[2]\n";
        $code .= " mov $t0,$reg[3]\n";
    }
    $code .= "\n";
}
839
# declast($i,@s) -- append to $code the instructions that compute output
# word $i (0..3) of the final decryption round.
#
#   $i  index of the output word being produced
#   @s  four state register names, ordered by the caller in the same way
#       as for decstep
#
# Each byte is looked up with a single-byte load at 2048($sbox,index,1)
# and shifted to its lane (0, 8, 16 or 24 bits) before being combined.
# Words 0..2 go to $t0..$t2 using $acc0..$acc2 as scratch; word 3 is built
# in $s[0], uses $s[1..3] as scratch, and $t2, $t1, $t0 are moved into
# $s[1], $s[2], $s[3] interleaved with the final XORs.  No round key is
# applied here.
sub declast()
{
    my ($word,@reg) = @_;
    my $in_place = ($word==3);

    my ($byte1,$byte2,$byte3) = $in_place ? ($reg[1],$reg[2],$reg[3])
                                          : ($acc0,$acc1,$acc2);
    my $dst = ($t0,$t1,$t2,$reg[0])[$word];

    unless ($in_place) {
        $code .= " mov $reg[0],$dst\n";
        $code .= " mov $reg[2],$byte2\n";
    }
    $code .= " and \$0xFF,$dst\n";

    $code .= " movzb 2048($sbox,$dst,1),$dst\n";
    $code .= " shr \$16,$byte2\n";
    unless ($in_place) { $code .= " mov $reg[3],$byte3\n"; }

    $code .= " movzb ".&hi($reg[1]).",$byte1\n";
    $code .= " and \$0xFF,$byte2\n";
    $code .= " shr \$24,$byte3\n";

    $code .= " movzb 2048($sbox,$byte1,1),$byte1\n";
    $code .= " movzb 2048($sbox,$byte2,1),$byte2\n";
    $code .= " movzb 2048($sbox,$byte3,1),$byte3\n";

    $code .= " shl \$8,$byte1\n";
    $code .= " shl \$16,$byte2\n";
    $code .= " shl \$24,$byte3\n";

    # Fold the three lanes into the result; for the last word each fold is
    # followed by moving one parked result back into the state.
    my @fold = ( [$byte1,$t2,$reg[1]],
                 [$byte2,$t1,$reg[2]],
                 [$byte3,$t0,$reg[3]] );
    for my $step (@fold) {
        my ($lane,$parked,$home) = @$step;
        $code .= " xor $lane,$dst\n";
        $code .= " mov $parked,$home\n" if $in_place;
    }
    $code .= "\n";
}
878
# Emit _x86_64_AES_decrypt, the table-driven block decryption routine.
# The generated code XORs the state in $s0..$s3 with the round key at
# 0($key), loads the round count from 240($key) into $rnds, subtracts one
# and then repeats the .Ldec_loop body, decrementing $rnds each time, until
# it reaches zero.  The last round is emitted once after the loop.
879$code.=<<___;
880.type _x86_64_AES_decrypt,\@abi-omnipotent
881.align 16
882_x86_64_AES_decrypt:
883 xor 0($key),$s0 # xor with key
884 xor 4($key),$s1
885 xor 8($key),$s2
886 xor 12($key),$s3
887
888 mov 240($key),$rnds # load key->rounds
889 sub \$1,$rnds
890 jmp .Ldec_loop
891.align 16
892.Ldec_loop:
893___
# Loop body: either the single interleaved routine (decvert) or four
# per-word steps.  decstep does not touch the key, so in that variant the
# key pointer advance and round-key XOR are emitted here after the steps.
894 if ($verticalspin) { &decvert(); }
895 else { &decstep(0,$s0,$s3,$s2,$s1);
896 &decstep(1,$s1,$s0,$s3,$s2);
897 &decstep(2,$s2,$s1,$s0,$s3);
898 &decstep(3,$s3,$s2,$s1,$s0);
899 $code.=<<___;
900 lea 16($key),$key
901 xor 0($key),$s0 # xor with key
902 xor 4($key),$s1
903 xor 8($key),$s2
904 xor 12($key),$s3
905___
906 }
907$code.=<<___;
908 sub \$1,$rnds
909 jnz .Ldec_loop
910___
# Final round.  In the declast variant the closing round-key XOR (key
# words at 16+0..16+12 from $key) is emitted here, after the four steps.
911 if ($verticalspin) { &declastvert(); }
912 else { &declast(0,$s0,$s3,$s2,$s1);
913 &declast(1,$s1,$s0,$s3,$s2);
914 &declast(2,$s2,$s1,$s0,$s3);
915 &declast(3,$s3,$s2,$s1,$s0);
916 $code.=<<___;
917 xor 16+0($key),$s0 # xor with key
918 xor 16+4($key),$s1
919 xor 16+8($key),$s2
920 xor 16+12($key),$s3
921___
922 }
923$code.=<<___;
924 .byte 0xf3,0xc3 # rep ret
925.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
926___
927
# deccompactvert() -- append to $code one round's worth of byte-wise table
# substitution for the compact decryption path.  Takes no arguments.
# Every lookup is a single-byte load at ($sbox,index,1).  Each output word
# is assembled from byte 0 of its own state word and bytes 1..3 of the
# previous three words in turn (e.g. the new $s0 takes its bytes from
# $s0,$s3,$s2,$s1 in that order), shifted into place and XORed together.
# %r8d, %r9d and %r13d are used as extra temporaries, and the results are
# left in $s0..$s3.  No round key is applied here.
928sub deccompactvert()
929{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
930
931$code.=<<___;
932 movzb `&lo("$s0")`,$t0
933 movzb `&lo("$s1")`,$t1
934 movzb `&lo("$s2")`,$t2
935 movzb ($sbox,$t0,1),$t0
936 movzb ($sbox,$t1,1),$t1
937 movzb ($sbox,$t2,1),$t2
938
939 movzb `&lo("$s3")`,$t3
940 movzb `&hi("$s3")`,$acc0
941 movzb `&hi("$s0")`,$acc1
942 movzb ($sbox,$t3,1),$t3
943 movzb ($sbox,$acc0,1),$t4 #$t0
944 movzb ($sbox,$acc1,1),$t5 #$t1
945
946 movzb `&hi("$s1")`,$acc2
947 movzb `&hi("$s2")`,$acc0
948 shr \$16,$s2
949 movzb ($sbox,$acc2,1),$acc2 #$t2
950 movzb ($sbox,$acc0,1),$acc0 #$t3
951 shr \$16,$s3
952
953 movzb `&lo("$s2")`,$acc1
954 shl \$8,$t4
955 shl \$8,$t5
956 movzb ($sbox,$acc1,1),$acc1 #$t0
957 xor $t4,$t0
958 xor $t5,$t1
959
960 movzb `&lo("$s3")`,$t4
961 shr \$16,$s0
962 shr \$16,$s1
963 movzb `&lo("$s0")`,$t5
964 shl \$8,$acc2
965 shl \$8,$acc0
966 movzb ($sbox,$t4,1),$t4 #$t1
967 movzb ($sbox,$t5,1),$t5 #$t2
968 xor $acc2,$t2
969 xor $acc0,$t3
970
971 movzb `&lo("$s1")`,$acc2
972 movzb `&hi("$s1")`,$acc0
973 shl \$16,$acc1
974 movzb ($sbox,$acc2,1),$acc2 #$t3
975 movzb ($sbox,$acc0,1),$acc0 #$t0
976 xor $acc1,$t0
977
978 movzb `&hi("$s2")`,$acc1
979 shl \$16,$t4
980 shl \$16,$t5
981 movzb ($sbox,$acc1,1),$s1 #$t1
982 xor $t4,$t1
983 xor $t5,$t2
984
985 movzb `&hi("$s3")`,$acc1
986 shr \$8,$s0
987 shl \$16,$acc2
988 movzb ($sbox,$acc1,1),$s2 #$t2
989 movzb ($sbox,$s0,1),$s3 #$t3
990 xor $acc2,$t3
991
992 shl \$24,$acc0
993 shl \$24,$s1
994 shl \$24,$s2
995 xor $acc0,$t0
996 shl \$24,$s3
997 xor $t1,$s1
998 mov $t0,$s0
999 xor $t2,$s2
1000 xor $t3,$s3
1001___
1002}
1003
1004# parallelized version! input is pair of 64-bit values: %rax=s1.s0
1005# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
1006# %ecx=s2 and %edx=s3.
#
# dectransform($prefetch) -- append to $code the packed-byte transform
# used between rounds of the compact decryption path and when converting
# an encryption key schedule for decryption.
#
#   $prefetch  optional; when true, five extra loads from $sbox at offsets
#              0, 64, 128, 192 and 256 are interleaved into the tail of
#              the sequence.  When false or omitted those slots expand to
#              nothing.
#
# The emitted code expects $mask80, $maskfe and $mask1b to already hold the
# three 64-bit masks it ANDs with.  Three successive per-byte doublings
# produce tp2, tp4 and tp8 from the input tp1; the combinations tp1^tp8,
# tp2^tp1^tp8 and tp4^tp1^tp8 are rotated by 8, 24 and 16 bits within each
# 32-bit half and XORed with tp8^tp4^tp2.  %r8..%r13 are clobbered.  With
# $prefetch set, $mask80, $maskfe and $mask1b are also overwritten by the
# extra loads.
# Note that $acc0 is redeclared locally as %rbx, hiding the file-level
# $acc0 for the duration of this sub.
1007sub dectransform()
1008{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
1009 my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
1010 my $prefetch = shift;
1011
1012$code.=<<___;
1013 mov $tp10,$acc0
1014 mov $tp18,$acc8
1015 and $mask80,$acc0
1016 and $mask80,$acc8
1017 mov $acc0,$tp40
1018 mov $acc8,$tp48
1019 shr \$7,$tp40
1020 lea ($tp10,$tp10),$tp20
1021 shr \$7,$tp48
1022 lea ($tp18,$tp18),$tp28
1023 sub $tp40,$acc0
1024 sub $tp48,$acc8
1025 and $maskfe,$tp20
1026 and $maskfe,$tp28
1027 and $mask1b,$acc0
1028 and $mask1b,$acc8
1029 xor $tp20,$acc0
1030 xor $tp28,$acc8
1031 mov $acc0,$tp20
1032 mov $acc8,$tp28
1033
1034 and $mask80,$acc0
1035 and $mask80,$acc8
1036 mov $acc0,$tp80
1037 mov $acc8,$tp88
1038 shr \$7,$tp80
1039 lea ($tp20,$tp20),$tp40
1040 shr \$7,$tp88
1041 lea ($tp28,$tp28),$tp48
1042 sub $tp80,$acc0
1043 sub $tp88,$acc8
1044 and $maskfe,$tp40
1045 and $maskfe,$tp48
1046 and $mask1b,$acc0
1047 and $mask1b,$acc8
1048 xor $tp40,$acc0
1049 xor $tp48,$acc8
1050 mov $acc0,$tp40
1051 mov $acc8,$tp48
1052
1053 and $mask80,$acc0
1054 and $mask80,$acc8
1055 mov $acc0,$tp80
1056 mov $acc8,$tp88
1057 shr \$7,$tp80
1058 xor $tp10,$tp20 # tp2^=tp1
1059 shr \$7,$tp88
1060 xor $tp18,$tp28 # tp2^=tp1
1061 sub $tp80,$acc0
1062 sub $tp88,$acc8
1063 lea ($tp40,$tp40),$tp80
1064 lea ($tp48,$tp48),$tp88
1065 xor $tp10,$tp40 # tp4^=tp1
1066 xor $tp18,$tp48 # tp4^=tp1
1067 and $maskfe,$tp80
1068 and $maskfe,$tp88
1069 and $mask1b,$acc0
1070 and $mask1b,$acc8
1071 xor $acc0,$tp80
1072 xor $acc8,$tp88
1073
1074 xor $tp80,$tp10 # tp1^=tp8
1075 xor $tp88,$tp18 # tp1^=tp8
1076 xor $tp80,$tp20 # tp2^tp1^=tp8
1077 xor $tp88,$tp28 # tp2^tp1^=tp8
1078 mov $tp10,$acc0
1079 mov $tp18,$acc8
1080 xor $tp80,$tp40 # tp4^tp1^=tp8
1081 xor $tp88,$tp48 # tp4^tp1^=tp8
1082 shr \$32,$acc0
1083 shr \$32,$acc8
1084 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
1085 xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
1086 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
1087 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
1088 xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1089 xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1090
1091 rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
1092 rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
1093 xor `&LO("$tp80")`,`&LO("$tp10")`
1094 xor `&LO("$tp88")`,`&LO("$tp18")`
1095 shr \$32,$tp80
1096 shr \$32,$tp88
1097 xor `&LO("$tp80")`,`&LO("$acc0")`
1098 xor `&LO("$tp88")`,`&LO("$acc8")`
1099
1100 mov $tp20,$tp80
1101 mov $tp28,$tp88
1102 shr \$32,$tp80
1103 shr \$32,$tp88
1104 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
1105 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
1106 rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
1107 rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
1108 xor `&LO("$tp20")`,`&LO("$tp10")`
1109 xor `&LO("$tp28")`,`&LO("$tp18")`
1110 mov $tp40,$tp20
1111 mov $tp48,$tp28
1112 xor `&LO("$tp80")`,`&LO("$acc0")`
1113 xor `&LO("$tp88")`,`&LO("$acc8")`
1114
1115 `"mov 0($sbox),$mask80" if ($prefetch)`
1116 shr \$32,$tp20
1117 shr \$32,$tp28
1118 `"mov 64($sbox),$maskfe" if ($prefetch)`
1119 rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
1120 rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
1121 `"mov 128($sbox),$mask1b" if ($prefetch)`
1122 rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
1123 rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
1124 `"mov 192($sbox),$tp80" if ($prefetch)`
1125 xor `&LO("$tp40")`,`&LO("$tp10")`
1126 xor `&LO("$tp48")`,`&LO("$tp18")`
1127 `"mov 256($sbox),$tp88" if ($prefetch)`
1128 xor `&LO("$tp20")`,`&LO("$acc0")`
1129 xor `&LO("$tp28")`,`&LO("$acc8")`
1130___
1131}
1132
# Emit _x86_64_AES_decrypt_compact, the decryption routine built on
# byte-wise table lookups.  On entry it touches the 256-byte table at
# $sbox with eight loads spaced 32 bytes apart (results discarded).  Each
# pass of .Ldec_loop_compact XORs the state with the round key at $key,
# advances $key by 16 and runs the deccompactvert substitution.  The loop
# ends when $key equals the pointer stored at 16(%rsp).  Otherwise the
# three masks are reloaded from 256+0, 256+8 and 256+16 past $sbox, the
# four 32-bit state words are packed into %rax (%rbx in the upper half)
# and %rcx (%rdx in the upper half), and dectransform is applied with its
# prefetch loads enabled.  After the last substitution the state is XORed
# with the round key at $key once more.
1133$code.=<<___;
1134.type _x86_64_AES_decrypt_compact,\@abi-omnipotent
1135.align 16
1136_x86_64_AES_decrypt_compact:
1137 lea 128($sbox),$inp # size optimization
1138 mov 0-128($inp),$acc1 # prefetch Td4
1139 mov 32-128($inp),$acc2
1140 mov 64-128($inp),$t0
1141 mov 96-128($inp),$t1
1142 mov 128-128($inp),$acc1
1143 mov 160-128($inp),$acc2
1144 mov 192-128($inp),$t0
1145 mov 224-128($inp),$t1
1146 jmp .Ldec_loop_compact
1147
1148.align 16
1149.Ldec_loop_compact:
1150 xor 0($key),$s0 # xor with key
1151 xor 4($key),$s1
1152 xor 8($key),$s2
1153 xor 12($key),$s3
1154 lea 16($key),$key
1155___
1156 &deccompactvert();
1157$code.=<<___;
1158 cmp 16(%rsp),$key
1159 je .Ldec_compact_done
1160
1161 mov 256+0($sbox),$mask80
1162 shl \$32,%rbx
1163 shl \$32,%rdx
1164 mov 256+8($sbox),$maskfe
1165 or %rbx,%rax
1166 or %rdx,%rcx
1167 mov 256+16($sbox),$mask1b
1168___
1169 &dectransform(1);
1170$code.=<<___;
1171 jmp .Ldec_loop_compact
1172.align 16
1173.Ldec_compact_done:
1174 xor 0($key),$s0
1175 xor 4($key),$s1
1176 xor 8($key),$s2
1177 xor 12($key),$s3
1178 .byte 0xf3,0xc3 # rep ret
1179.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
1180___
1181
1182# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Emit the public AES_decrypt entry point.  The generated code:
#   - pushes %rbx, %rbp and %r12..%r15, then carves a 64-byte-aligned
#     frame whose position depends on the address of the key argument;
#   - stores the output pointer at 16(%rsp) and the pre-adjustment stack
#     pointer at 24(%rsp);
#   - loads the 16-byte input block from (%rdi) into $s0..$s3;
#   - computes key + 16*rounds (rounds read from 240($key)) and stores the
#     key pointer and that end pointer at (%rsp) and 8(%rsp);
#   - selects one of the table copies starting at .LAES_Td+2048: the
#     offset is a multiple of 0x100 derived from the frame address, plus
#     one eighth of that value again (the step between copies is therefore
#     0x120 bytes, leaving room for the constants that
#     _x86_64_AES_decrypt_compact reads at 256($sbox) onwards);
#   - calls _x86_64_AES_decrypt_compact and writes $s0..$s3 to the output;
#   - reloads the six saved registers relative to the saved stack pointer
#     and restores %rsp.
1183$code.=<<___;
1184.globl AES_decrypt
1185.type AES_decrypt,\@function,3
1186.align 16
1187AES_decrypt:
1188 push %rbx
1189 push %rbp
1190 push %r12
1191 push %r13
1192 push %r14
1193 push %r15
1194
1195 # allocate frame "above" key schedule
1196 mov %rsp,%r10
1197 lea -63(%rdx),%rcx # %rdx is key argument
1198 and \$-64,%rsp
1199 sub %rsp,%rcx
1200 neg %rcx
1201 and \$0x3c0,%rcx
1202 sub %rcx,%rsp
1203 sub \$32,%rsp
1204
1205 mov %rsi,16(%rsp) # save out
1206 mov %r10,24(%rsp) # save real stack pointer
1207.Ldec_prologue:
1208
1209 mov %rdx,$key
1210 mov 240($key),$rnds # load rounds
1211
1212 mov 0(%rdi),$s0 # load input vector
1213 mov 4(%rdi),$s1
1214 mov 8(%rdi),$s2
1215 mov 12(%rdi),$s3
1216
1217 shl \$4,$rnds
1218 lea ($key,$rnds),%rbp
1219 mov $key,(%rsp) # key schedule
1220 mov %rbp,8(%rsp) # end of key schedule
1221
1222 # pick Td4 copy which can't "overlap" with stack frame or key schedule
1223 lea .LAES_Td+2048(%rip),$sbox
1224 lea 768(%rsp),%rbp
1225 sub $sbox,%rbp
1226 and \$0x300,%rbp
1227 lea ($sbox,%rbp),$sbox
1228 shr \$3,%rbp # recall "magic" constants!
1229 add %rbp,$sbox
1230
1231 call _x86_64_AES_decrypt_compact
1232
1233 mov 16(%rsp),$out # restore out
1234 mov 24(%rsp),%rsi # restore saved stack pointer
1235 mov $s0,0($out) # write output vector
1236 mov $s1,4($out)
1237 mov $s2,8($out)
1238 mov $s3,12($out)
1239
1240 mov (%rsi),%r15
1241 mov 8(%rsi),%r14
1242 mov 16(%rsi),%r13
1243 mov 24(%rsi),%r12
1244 mov 32(%rsi),%rbp
1245 mov 40(%rsi),%rbx
1246 lea 48(%rsi),%rsp
1247.Ldec_epilogue:
1248 ret
1249.size AES_decrypt,.-AES_decrypt
1250___
1251#------------------------------------------------------------------#
1252
# enckey() -- append to $code the core step of the encryption key
# expansion.  Takes no arguments.
# The emitted code expects the previous round-key word in %edx, the word
# to be updated in %eax, the iteration index in %rcx, and %rbp pointing
# 128 bytes past the start of a byte table.  Each of the four bytes of
# %edx is replaced by its table entry and placed in a rotated lane (byte
# 0 goes to bits 24..31, byte 1 to bits 0..7, byte 2 to bits 8..15, byte 3
# to bits 16..23), XORed into %eax, followed by the round constant read
# from 1024-128(%rbp,%rcx,4).  %ebx, %esi and %edx are clobbered.
1253sub enckey()
1254{
1255$code.=<<___;
1256 movz %dl,%esi # rk[i]>>0
1257 movzb -128(%rbp,%rsi),%ebx
1258 movz %dh,%esi # rk[i]>>8
1259 shl \$24,%ebx
1260 xor %ebx,%eax
1261
1262 movzb -128(%rbp,%rsi),%ebx
1263 shr \$16,%edx
1264 movz %dl,%esi # rk[i]>>16
1265 xor %ebx,%eax
1266
1267 movzb -128(%rbp,%rsi),%ebx
1268 movz %dh,%esi # rk[i]>>24
1269 shl \$8,%ebx
1270 xor %ebx,%eax
1271
1272 movzb -128(%rbp,%rsi),%ebx
1273 shl \$16,%ebx
1274 xor %ebx,%eax
1275
1276 xor 1024-128(%rbp,%rcx,4),%eax # rcon
1277___
1278}
1279
1280# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
1281# AES_KEY *key)
#
# Emit AES_set_encrypt_key and its worker _x86_64_AES_set_encrypt_key.
#
# The public wrapper only saves %rbx, %rbp and %r12..%r15, reserves 8 more
# bytes, calls the worker and restores the registers.
#
# The worker moves bits to %ecx, userKey to %rsi and key to %rdi, and
# returns in %rax:
#    0  on success,
#   -1  if userKey or key is a null pointer (.Lbadpointer),
#   -2  if bits is not 128, 192 or 256.
# After touching the byte table at .LAES_Te+2048 it branches on the key
# size.  Each branch copies the user key into the schedule, then loops,
# using the enckey sequence to derive the first word of each new group and
# chaining the remaining words by XOR, and finally stores the round count
# (10, 12 or 14) so that it lands at byte offset 240 from the start of the
# schedule.  The 256-bit branch additionally runs an inline substitution
# without rotation or round constant to derive rk[12].
#
# NOTE(review): in .L14loop the inline comment on `mov 28(%rdi),%edx` says
# rk[4], but byte offset 28 is word index 7; the comment lives inside the
# emitted assembly text and has been left untouched here.
1282$code.=<<___;
1283.globl AES_set_encrypt_key
1284.type AES_set_encrypt_key,\@function,3
1285.align 16
1286AES_set_encrypt_key:
1287 push %rbx
1288 push %rbp
1289 push %r12 # redundant, but allows to share
1290 push %r13 # exception handler...
1291 push %r14
1292 push %r15
1293 sub \$8,%rsp
1294.Lenc_key_prologue:
1295
1296 call _x86_64_AES_set_encrypt_key
1297
1298 mov 8(%rsp),%r15
1299 mov 16(%rsp),%r14
1300 mov 24(%rsp),%r13
1301 mov 32(%rsp),%r12
1302 mov 40(%rsp),%rbp
1303 mov 48(%rsp),%rbx
1304 add \$56,%rsp
1305.Lenc_key_epilogue:
1306 ret
1307.size AES_set_encrypt_key,.-AES_set_encrypt_key
1308
1309.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
1310.align 16
1311_x86_64_AES_set_encrypt_key:
1312 mov %esi,%ecx # %ecx=bits
1313 mov %rdi,%rsi # %rsi=userKey
1314 mov %rdx,%rdi # %rdi=key
1315
1316 test \$-1,%rsi
1317 jz .Lbadpointer
1318 test \$-1,%rdi
1319 jz .Lbadpointer
1320
1321 lea .LAES_Te(%rip),%rbp
1322 lea 2048+128(%rbp),%rbp
1323
1324 # prefetch Te4
1325 mov 0-128(%rbp),%eax
1326 mov 32-128(%rbp),%ebx
1327 mov 64-128(%rbp),%r8d
1328 mov 96-128(%rbp),%edx
1329 mov 128-128(%rbp),%eax
1330 mov 160-128(%rbp),%ebx
1331 mov 192-128(%rbp),%r8d
1332 mov 224-128(%rbp),%edx
1333
1334 cmp \$128,%ecx
1335 je .L10rounds
1336 cmp \$192,%ecx
1337 je .L12rounds
1338 cmp \$256,%ecx
1339 je .L14rounds
1340 mov \$-2,%rax # invalid number of bits
1341 jmp .Lexit
1342
1343.L10rounds:
1344 mov 0(%rsi),%rax # copy first 4 dwords
1345 mov 8(%rsi),%rdx
1346 mov %rax,0(%rdi)
1347 mov %rdx,8(%rdi)
1348
1349 shr \$32,%rdx
1350 xor %ecx,%ecx
1351 jmp .L10shortcut
1352.align 4
1353.L10loop:
1354 mov 0(%rdi),%eax # rk[0]
1355 mov 12(%rdi),%edx # rk[3]
1356.L10shortcut:
1357___
1358 &enckey ();
1359$code.=<<___;
1360 mov %eax,16(%rdi) # rk[4]
1361 xor 4(%rdi),%eax
1362 mov %eax,20(%rdi) # rk[5]
1363 xor 8(%rdi),%eax
1364 mov %eax,24(%rdi) # rk[6]
1365 xor 12(%rdi),%eax
1366 mov %eax,28(%rdi) # rk[7]
1367 add \$1,%ecx
1368 lea 16(%rdi),%rdi
1369 cmp \$10,%ecx
1370 jl .L10loop
1371
1372 movl \$10,80(%rdi) # setup number of rounds
1373 xor %rax,%rax
1374 jmp .Lexit
1375
1376.L12rounds:
1377 mov 0(%rsi),%rax # copy first 6 dwords
1378 mov 8(%rsi),%rbx
1379 mov 16(%rsi),%rdx
1380 mov %rax,0(%rdi)
1381 mov %rbx,8(%rdi)
1382 mov %rdx,16(%rdi)
1383
1384 shr \$32,%rdx
1385 xor %ecx,%ecx
1386 jmp .L12shortcut
1387.align 4
1388.L12loop:
1389 mov 0(%rdi),%eax # rk[0]
1390 mov 20(%rdi),%edx # rk[5]
1391.L12shortcut:
1392___
1393 &enckey ();
1394$code.=<<___;
1395 mov %eax,24(%rdi) # rk[6]
1396 xor 4(%rdi),%eax
1397 mov %eax,28(%rdi) # rk[7]
1398 xor 8(%rdi),%eax
1399 mov %eax,32(%rdi) # rk[8]
1400 xor 12(%rdi),%eax
1401 mov %eax,36(%rdi) # rk[9]
1402
1403 cmp \$7,%ecx
1404 je .L12break
1405 add \$1,%ecx
1406
1407 xor 16(%rdi),%eax
1408 mov %eax,40(%rdi) # rk[10]
1409 xor 20(%rdi),%eax
1410 mov %eax,44(%rdi) # rk[11]
1411
1412 lea 24(%rdi),%rdi
1413 jmp .L12loop
1414.L12break:
1415 movl \$12,72(%rdi) # setup number of rounds
1416 xor %rax,%rax
1417 jmp .Lexit
1418
1419.L14rounds:
1420 mov 0(%rsi),%rax # copy first 8 dwords
1421 mov 8(%rsi),%rbx
1422 mov 16(%rsi),%rcx
1423 mov 24(%rsi),%rdx
1424 mov %rax,0(%rdi)
1425 mov %rbx,8(%rdi)
1426 mov %rcx,16(%rdi)
1427 mov %rdx,24(%rdi)
1428
1429 shr \$32,%rdx
1430 xor %ecx,%ecx
1431 jmp .L14shortcut
1432.align 4
1433.L14loop:
1434 mov 0(%rdi),%eax # rk[0]
1435 mov 28(%rdi),%edx # rk[4]
1436.L14shortcut:
1437___
1438 &enckey ();
1439$code.=<<___;
1440 mov %eax,32(%rdi) # rk[8]
1441 xor 4(%rdi),%eax
1442 mov %eax,36(%rdi) # rk[9]
1443 xor 8(%rdi),%eax
1444 mov %eax,40(%rdi) # rk[10]
1445 xor 12(%rdi),%eax
1446 mov %eax,44(%rdi) # rk[11]
1447
1448 cmp \$6,%ecx
1449 je .L14break
1450 add \$1,%ecx
1451
1452 mov %eax,%edx
1453 mov 16(%rdi),%eax # rk[4]
1454 movz %dl,%esi # rk[11]>>0
1455 movzb -128(%rbp,%rsi),%ebx
1456 movz %dh,%esi # rk[11]>>8
1457 xor %ebx,%eax
1458
1459 movzb -128(%rbp,%rsi),%ebx
1460 shr \$16,%edx
1461 shl \$8,%ebx
1462 movz %dl,%esi # rk[11]>>16
1463 xor %ebx,%eax
1464
1465 movzb -128(%rbp,%rsi),%ebx
1466 movz %dh,%esi # rk[11]>>24
1467 shl \$16,%ebx
1468 xor %ebx,%eax
1469
1470 movzb -128(%rbp,%rsi),%ebx
1471 shl \$24,%ebx
1472 xor %ebx,%eax
1473
1474 mov %eax,48(%rdi) # rk[12]
1475 xor 20(%rdi),%eax
1476 mov %eax,52(%rdi) # rk[13]
1477 xor 24(%rdi),%eax
1478 mov %eax,56(%rdi) # rk[14]
1479 xor 28(%rdi),%eax
1480 mov %eax,60(%rdi) # rk[15]
1481
1482 lea 32(%rdi),%rdi
1483 jmp .L14loop
1484.L14break:
1485 movl \$14,48(%rdi) # setup number of rounds
1486 xor %rax,%rax
1487 jmp .Lexit
1488
1489.Lbadpointer:
1490 mov \$-1,%rax
1491.Lexit:
1492 .byte 0xf3,0xc3 # rep ret
1493.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
1494___
1495
# deckey_ref($i,$ptr,$te,$td) -- append to $code a straight-line transform
# of one 32-bit round-key word in memory.
#
#   $i    byte displacement of the word
#   $ptr  base register; the word is read from and written back to
#         $i($ptr)
#   $te   accepted but not referenced in the emitted code
#   $td   accepted but not referenced in the emitted code
#
# The word is loaded into %eax as tp1; three successive per-byte doublings
# (masks 0x80808080, 0xfefefefe, 0x1b1b1b1b) give tp2, tp4 and tp8.  The
# result stored back is ROTATE(tp1^tp8,8) ^ (tp8^tp4^tp2) ^
# ROTATE(tp2^tp1^tp8,24) ^ ROTATE(tp4^tp1^tp8,16).  %eax, %ebx, %edi, %edx
# and %r8d are clobbered.
1496sub deckey_ref()
1497{ my ($i,$ptr,$te,$td) = @_;
1498 my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
1499$code.=<<___;
1500 mov $i($ptr),$tp1
1501 mov $tp1,$acc
1502 and \$0x80808080,$acc
1503 mov $acc,$tp4
1504 shr \$7,$tp4
1505 lea 0($tp1,$tp1),$tp2
1506 sub $tp4,$acc
1507 and \$0xfefefefe,$tp2
1508 and \$0x1b1b1b1b,$acc
1509 xor $tp2,$acc
1510 mov $acc,$tp2
1511
1512 and \$0x80808080,$acc
1513 mov $acc,$tp8
1514 shr \$7,$tp8
1515 lea 0($tp2,$tp2),$tp4
1516 sub $tp8,$acc
1517 and \$0xfefefefe,$tp4
1518 and \$0x1b1b1b1b,$acc
1519 xor $tp1,$tp2 # tp2^tp1
1520 xor $tp4,$acc
1521 mov $acc,$tp4
1522
1523 and \$0x80808080,$acc
1524 mov $acc,$tp8
1525 shr \$7,$tp8
1526 sub $tp8,$acc
1527 lea 0($tp4,$tp4),$tp8
1528 xor $tp1,$tp4 # tp4^tp1
1529 and \$0xfefefefe,$tp8
1530 and \$0x1b1b1b1b,$acc
1531 xor $acc,$tp8
1532
1533 xor $tp8,$tp1 # tp1^tp8
1534 rol \$8,$tp1 # ROTATE(tp1^tp8,8)
1535 xor $tp8,$tp2 # tp2^tp1^tp8
1536 xor $tp8,$tp4 # tp4^tp1^tp8
1537 xor $tp2,$tp8
1538 xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
1539
1540 xor $tp8,$tp1
1541 rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24)
1542 xor $tp2,$tp1
1543 rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16)
1544 xor $tp4,$tp1
1545
1546 mov $tp1,$i($ptr)
1547___
1548}
1549
1550# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
1551# AES_KEY *key)
#
# Emit AES_set_decrypt_key.  The generated code:
#   - saves %rbx, %rbp, %r12..%r15 and the key schedule pointer (%rdx);
#   - calls _x86_64_AES_set_encrypt_key; a non-zero result in %eax skips
#     straight to .Labort, so that value becomes the return value;
#   - .Linvert: swaps 16-byte round keys pairwise from both ends of the
#     schedule, walking inwards until the two pointers meet;
#   - loads the three 64-bit masks from offsets 40, 48 and 56 past
#     .LAES_Te+2048+1024;
#   - .Lpermute: applies dectransform (without prefetch loads) to each
#     round key starting at schedule+16, for rounds-1 iterations;
#   - returns 0 in %rax, restoring the saved registers and releasing the
#     56 bytes of stack.
1552$code.=<<___;
1553.globl AES_set_decrypt_key
1554.type AES_set_decrypt_key,\@function,3
1555.align 16
1556AES_set_decrypt_key:
1557 push %rbx
1558 push %rbp
1559 push %r12
1560 push %r13
1561 push %r14
1562 push %r15
1563 push %rdx # save key schedule
1564.Ldec_key_prologue:
1565
1566 call _x86_64_AES_set_encrypt_key
1567 mov (%rsp),%r8 # restore key schedule
1568 cmp \$0,%eax
1569 jne .Labort
1570
1571 mov 240(%r8),%r14d # pull number of rounds
1572 xor %rdi,%rdi
1573 lea (%rdi,%r14d,4),%rcx
1574 mov %r8,%rsi
1575 lea (%r8,%rcx,4),%rdi # pointer to last chunk
1576.align 4
1577.Linvert:
1578 mov 0(%rsi),%rax
1579 mov 8(%rsi),%rbx
1580 mov 0(%rdi),%rcx
1581 mov 8(%rdi),%rdx
1582 mov %rax,0(%rdi)
1583 mov %rbx,8(%rdi)
1584 mov %rcx,0(%rsi)
1585 mov %rdx,8(%rsi)
1586 lea 16(%rsi),%rsi
1587 lea -16(%rdi),%rdi
1588 cmp %rsi,%rdi
1589 jne .Linvert
1590
1591 lea .LAES_Te+2048+1024(%rip),%rax # rcon
1592
1593 mov 40(%rax),$mask80
1594 mov 48(%rax),$maskfe
1595 mov 56(%rax),$mask1b
1596
1597 mov %r8,$key
1598 sub \$1,%r14d
1599.align 4
1600.Lpermute:
1601 lea 16($key),$key
1602 mov 0($key),%rax
1603 mov 8($key),%rcx
1604___
1605 &dectransform ();
1606$code.=<<___;
1607 mov %eax,0($key)
1608 mov %ebx,4($key)
1609 mov %ecx,8($key)
1610 mov %edx,12($key)
1611 sub \$1,%r14d
1612 jnz .Lpermute
1613
1614 xor %rax,%rax
1615.Labort:
1616 mov 8(%rsp),%r15
1617 mov 16(%rsp),%r14
1618 mov 24(%rsp),%r13
1619 mov 32(%rsp),%r12
1620 mov 40(%rsp),%rbp
1621 mov 48(%rsp),%rbx
1622 add \$56,%rsp
1623.Ldec_key_epilogue:
1624 ret
1625.size AES_set_decrypt_key,.-AES_set_decrypt_key
1626___
1627
1628# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
1629# size_t length, const AES_KEY *key,
1630# unsigned char *ivp,const int enc);
1631{
1632# stack frame layout
1633# -8(%rsp) return address
1634my $keyp="0(%rsp)"; # one to pass as $key
1635my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds])
1636my $_rsp="16(%rsp)"; # saved %rsp
1637my $_inp="24(%rsp)"; # copy of 1st parameter, inp
1638my $_out="32(%rsp)"; # copy of 2nd parameter, out
1639my $_len="40(%rsp)"; # copy of 3rd parameter, length
1640my $_key="48(%rsp)"; # copy of 4th parameter, key
1641my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp
1642my $ivec="64(%rsp)"; # ivec[16]
1643my $aes_key="80(%rsp)"; # copy of aes_key
1644my $mark="80+240(%rsp)"; # copy of aes_key->rounds
1645
1646$code.=<<___;
1647.globl AES_cbc_encrypt
1648.type AES_cbc_encrypt,\@function,6
1649.align 16
1650.extern OPENSSL_ia32cap_P
1651AES_cbc_encrypt:
1652 cmp \$0,%rdx # check length
1653 je .Lcbc_epilogue
1654 pushfq
1655 push %rbx
1656 push %rbp
1657 push %r12
1658 push %r13
1659 push %r14
1660 push %r15
1661.Lcbc_prologue:
1662
1663 cld
1664 mov %r9d,%r9d # clear upper half of enc
1665
1666 lea .LAES_Te(%rip),$sbox
1667 cmp \$0,%r9
1668 jne .Lcbc_picked_te
1669 lea .LAES_Td(%rip),$sbox
1670.Lcbc_picked_te:
1671
1672 mov PIC_GOT(OPENSSL_ia32cap_P),%r10d
1673 cmp \$$speed_limit,%rdx
1674 jb .Lcbc_slow_prologue
1675 test \$15,%rdx
1676 jnz .Lcbc_slow_prologue
1677 bt \$28,%r10d
1678 jc .Lcbc_slow_prologue
1679
1680 # allocate aligned stack frame...
1681 lea -88-248(%rsp),$key
1682 and \$-64,$key
1683
1684 # ... and make sure it doesn't alias with AES_T[ed] modulo 4096
1685 mov $sbox,%r10
1686 lea 2304($sbox),%r11
1687 mov $key,%r12
1688 and \$0xFFF,%r10 # s = $sbox&0xfff
1689 and \$0xFFF,%r11 # e = ($sbox+2304)&0xfff
1690 and \$0xFFF,%r12 # p = %rsp&0xfff
1691
1692 cmp %r11,%r12 # if (p>=e) %rsp -= (p-e);
1693 jb .Lcbc_te_break_out
1694 sub %r11,%r12
1695 sub %r12,$key
1696 jmp .Lcbc_te_ok
1697.Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz
1698 sub %r10,%r12
1699 and \$0xFFF,%r12
1700 add \$320,%r12
1701 sub %r12,$key
1702.align 4
1703.Lcbc_te_ok:
1704
1705 xchg %rsp,$key
1706 #add \$8,%rsp # reserve for return address!
1707 mov $key,$_rsp # save %rsp
1708.Lcbc_fast_body:
1709 mov %rdi,$_inp # save copy of inp
1710 mov %rsi,$_out # save copy of out
1711 mov %rdx,$_len # save copy of len
1712 mov %rcx,$_key # save copy of key
1713 mov %r8,$_ivp # save copy of ivp
1714 movl \$0,$mark # copy of aes_key->rounds = 0;
1715 mov %r8,%rbp # rearrange input arguments
1716 mov %r9,%rbx
1717 mov %rsi,$out
1718 mov %rdi,$inp
1719 mov %rcx,$key
1720
1721 mov 240($key),%eax # key->rounds
1722 # do we copy key schedule to stack?
1723 mov $key,%r10
1724 sub $sbox,%r10
1725 and \$0xfff,%r10
1726 cmp \$2304,%r10
1727 jb .Lcbc_do_ecopy
1728 cmp \$4096-248,%r10
1729 jb .Lcbc_skip_ecopy
1730.align 4
1731.Lcbc_do_ecopy:
1732 mov $key,%rsi
1733 lea $aes_key,%rdi
1734 lea $aes_key,$key
1735 mov \$240/8,%ecx
1736 .long 0x90A548F3 # rep movsq
1737 mov %eax,(%rdi) # copy aes_key->rounds
1738.Lcbc_skip_ecopy:
1739 mov $key,$keyp # save key pointer
1740
1741 mov \$18,%ecx
1742.align 4
1743.Lcbc_prefetch_te:
1744 mov 0($sbox),%r10
1745 mov 32($sbox),%r11
1746 mov 64($sbox),%r12
1747 mov 96($sbox),%r13
1748 lea 128($sbox),$sbox
1749 sub \$1,%ecx
1750 jnz .Lcbc_prefetch_te
1751 lea -2304($sbox),$sbox
1752
1753 cmp \$0,%rbx
1754 je .LFAST_DECRYPT
1755
1756#----------------------------- ENCRYPT -----------------------------#
1757 mov 0(%rbp),$s0 # load iv
1758 mov 4(%rbp),$s1
1759 mov 8(%rbp),$s2
1760 mov 12(%rbp),$s3
1761
1762.align 4
1763.Lcbc_fast_enc_loop:
1764 xor 0($inp),$s0
1765 xor 4($inp),$s1
1766 xor 8($inp),$s2
1767 xor 12($inp),$s3
1768 mov $keyp,$key # restore key
1769 mov $inp,$_inp # if ($verticalspin) save inp
1770
1771 call _x86_64_AES_encrypt
1772
1773 mov $_inp,$inp # if ($verticalspin) restore inp
1774 mov $_len,%r10
1775 mov $s0,0($out)
1776 mov $s1,4($out)
1777 mov $s2,8($out)
1778 mov $s3,12($out)
1779
1780 lea 16($inp),$inp
1781 lea 16($out),$out
1782 sub \$16,%r10
1783 test \$-16,%r10
1784 mov %r10,$_len
1785 jnz .Lcbc_fast_enc_loop
1786 mov $_ivp,%rbp # restore ivp
1787 mov $s0,0(%rbp) # save ivec
1788 mov $s1,4(%rbp)
1789 mov $s2,8(%rbp)
1790 mov $s3,12(%rbp)
1791
1792 jmp .Lcbc_fast_cleanup
1793
1794#----------------------------- DECRYPT -----------------------------#
1795.align 16
1796.LFAST_DECRYPT:
1797 cmp $inp,$out
1798 je .Lcbc_fast_dec_in_place
1799
1800 mov %rbp,$ivec
1801.align 4
1802.Lcbc_fast_dec_loop:
1803 mov 0($inp),$s0 # read input
1804 mov 4($inp),$s1
1805 mov 8($inp),$s2
1806 mov 12($inp),$s3
1807 mov $keyp,$key # restore key
1808 mov $inp,$_inp # if ($verticalspin) save inp
1809
1810 call _x86_64_AES_decrypt
1811
1812 mov $ivec,%rbp # load ivp
1813 mov $_inp,$inp # if ($verticalspin) restore inp
1814 mov $_len,%r10 # load len
1815 xor 0(%rbp),$s0 # xor iv
1816 xor 4(%rbp),$s1
1817 xor 8(%rbp),$s2
1818 xor 12(%rbp),$s3
1819 mov $inp,%rbp # current input, next iv
1820
1821 sub \$16,%r10
1822 mov %r10,$_len # update len
1823 mov %rbp,$ivec # update ivp
1824
1825 mov $s0,0($out) # write output
1826 mov $s1,4($out)
1827 mov $s2,8($out)
1828 mov $s3,12($out)
1829
1830 lea 16($inp),$inp
1831 lea 16($out),$out
1832 jnz .Lcbc_fast_dec_loop
1833 mov $_ivp,%r12 # load user ivp
1834 mov 0(%rbp),%r10 # load iv
1835 mov 8(%rbp),%r11
1836 mov %r10,0(%r12) # copy back to user
1837 mov %r11,8(%r12)
1838 jmp .Lcbc_fast_cleanup
1839
1840.align 16
1841.Lcbc_fast_dec_in_place:
1842 mov 0(%rbp),%r10 # copy iv to stack
1843 mov 8(%rbp),%r11
1844 mov %r10,0+$ivec
1845 mov %r11,8+$ivec
1846.align 4
1847.Lcbc_fast_dec_in_place_loop:
1848 mov 0($inp),$s0 # load input
1849 mov 4($inp),$s1
1850 mov 8($inp),$s2
1851 mov 12($inp),$s3
1852 mov $keyp,$key # restore key
1853 mov $inp,$_inp # if ($verticalspin) save inp
1854
1855 call _x86_64_AES_decrypt
1856
1857 mov $_inp,$inp # if ($verticalspin) restore inp
1858 mov $_len,%r10
1859 xor 0+$ivec,$s0
1860 xor 4+$ivec,$s1
1861 xor 8+$ivec,$s2
1862 xor 12+$ivec,$s3
1863
1864 mov 0($inp),%r11 # load input
1865 mov 8($inp),%r12
1866 sub \$16,%r10
1867 jz .Lcbc_fast_dec_in_place_done
1868
1869 mov %r11,0+$ivec # copy input to iv
1870 mov %r12,8+$ivec
1871
1872 mov $s0,0($out) # save output [zaps input]
1873 mov $s1,4($out)
1874 mov $s2,8($out)
1875 mov $s3,12($out)
1876
1877 lea 16($inp),$inp
1878 lea 16($out),$out
1879 mov %r10,$_len
1880 jmp .Lcbc_fast_dec_in_place_loop
1881.Lcbc_fast_dec_in_place_done:
1882 mov $_ivp,%rdi
1883 mov %r11,0(%rdi) # copy iv back to user
1884 mov %r12,8(%rdi)
1885
1886 mov $s0,0($out) # save output [zaps input]
1887 mov $s1,4($out)
1888 mov $s2,8($out)
1889 mov $s3,12($out)
1890
1891.align 4
1892.Lcbc_fast_cleanup:
1893 cmpl \$0,$mark # was the key schedule copied?
1894 lea $aes_key,%rdi
1895 je .Lcbc_exit
1896 mov \$240/8,%ecx
1897 xor %rax,%rax
1898 .long 0x90AB48F3 # rep stosq
1899
1900 jmp .Lcbc_exit
1901
1902#--------------------------- SLOW ROUTINE ---------------------------#
1903.align 16
1904.Lcbc_slow_prologue:
1905 # allocate aligned stack frame...
1906 lea -88(%rsp),%rbp
1907 and \$-64,%rbp
1908 # ... just "above" key schedule
1909 lea -88-63(%rcx),%r10
1910 sub %rbp,%r10
1911 neg %r10
1912 and \$0x3c0,%r10
1913 sub %r10,%rbp
1914
1915 xchg %rsp,%rbp
1916 #add \$8,%rsp # reserve for return address!
1917 mov %rbp,$_rsp # save %rsp
1918.Lcbc_slow_body:
1919 #mov %rdi,$_inp # save copy of inp
1920 #mov %rsi,$_out # save copy of out
1921 #mov %rdx,$_len # save copy of len
1922 #mov %rcx,$_key # save copy of key
1923 mov %r8,$_ivp # save copy of ivp
1924 mov %r8,%rbp # rearrange input arguments
1925 mov %r9,%rbx
1926 mov %rsi,$out
1927 mov %rdi,$inp
1928 mov %rcx,$key
1929 mov %rdx,%r10
1930
1931 mov 240($key),%eax
1932 mov $key,$keyp # save key pointer
1933 shl \$4,%eax
1934 lea ($key,%rax),%rax
1935 mov %rax,$keyend
1936
1937 # pick Te4 copy which can't "overlap" with stack frame or key schedule
1938 lea 2048($sbox),$sbox
1939 lea 768-8(%rsp),%rax
1940 sub $sbox,%rax
1941 and \$0x300,%rax
1942 lea ($sbox,%rax),$sbox
1943
1944 cmp \$0,%rbx
1945 je .LSLOW_DECRYPT
1946
1947#--------------------------- SLOW ENCRYPT ---------------------------#
1948 test \$-16,%r10 # check upon length
1949 mov 0(%rbp),$s0 # load iv
1950 mov 4(%rbp),$s1
1951 mov 8(%rbp),$s2
1952 mov 12(%rbp),$s3
1953 jz .Lcbc_slow_enc_tail # short input...
1954
1955.align 4
1956.Lcbc_slow_enc_loop:
1957 xor 0($inp),$s0
1958 xor 4($inp),$s1
1959 xor 8($inp),$s2
1960 xor 12($inp),$s3
1961 mov $keyp,$key # restore key
1962 mov $inp,$_inp # save inp
1963 mov $out,$_out # save out
1964 mov %r10,$_len # save len
1965
1966 call _x86_64_AES_encrypt_compact
1967
1968 mov $_inp,$inp # restore inp
1969 mov $_out,$out # restore out
1970 mov $_len,%r10 # restore len
1971 mov $s0,0($out)
1972 mov $s1,4($out)
1973 mov $s2,8($out)
1974 mov $s3,12($out)
1975
1976 lea 16($inp),$inp
1977 lea 16($out),$out
1978 sub \$16,%r10
1979 test \$-16,%r10
1980 jnz .Lcbc_slow_enc_loop
1981 test \$15,%r10
1982 jnz .Lcbc_slow_enc_tail
1983 mov $_ivp,%rbp # restore ivp
1984 mov $s0,0(%rbp) # save ivec
1985 mov $s1,4(%rbp)
1986 mov $s2,8(%rbp)
1987 mov $s3,12(%rbp)
1988
1989 jmp .Lcbc_exit
1990
1991.align 4
1992.Lcbc_slow_enc_tail:
1993 mov %rax,%r11
1994 mov %rcx,%r12
1995 mov %r10,%rcx
1996 mov $inp,%rsi
1997 mov $out,%rdi
1998 .long 0x9066A4F3 # rep movsb
1999 mov \$16,%rcx # zero tail
2000 sub %r10,%rcx
2001 xor %rax,%rax
2002 .long 0x9066AAF3 # rep stosb
2003 mov $out,$inp # this is not a mistake!
2004 mov \$16,%r10 # len=16
2005 mov %r11,%rax
2006 mov %r12,%rcx
2007 jmp .Lcbc_slow_enc_loop # one more spin...
2008#--------------------------- SLOW DECRYPT ---------------------------#
2009.align 16
2010.LSLOW_DECRYPT:
2011 shr \$3,%rax
2012 add %rax,$sbox # recall "magic" constants!
2013
2014 mov 0(%rbp),%r11 # copy iv to stack
2015 mov 8(%rbp),%r12
2016 mov %r11,0+$ivec
2017 mov %r12,8+$ivec
2018
2019.align 4
2020.Lcbc_slow_dec_loop:
2021 mov 0($inp),$s0 # load input
2022 mov 4($inp),$s1
2023 mov 8($inp),$s2
2024 mov 12($inp),$s3
2025 mov $keyp,$key # restore key
2026 mov $inp,$_inp # save inp
2027 mov $out,$_out # save out
2028 mov %r10,$_len # save len
2029
2030 call _x86_64_AES_decrypt_compact
2031
2032 mov $_inp,$inp # restore inp
2033 mov $_out,$out # restore out
2034 mov $_len,%r10
2035 xor 0+$ivec,$s0
2036 xor 4+$ivec,$s1
2037 xor 8+$ivec,$s2
2038 xor 12+$ivec,$s3
2039
2040 mov 0($inp),%r11 # load input
2041 mov 8($inp),%r12
2042 sub \$16,%r10
2043 jc .Lcbc_slow_dec_partial
2044 jz .Lcbc_slow_dec_done
2045
2046 mov %r11,0+$ivec # copy input to iv
2047 mov %r12,8+$ivec
2048
2049 mov $s0,0($out) # save output [can zap input]
2050 mov $s1,4($out)
2051 mov $s2,8($out)
2052 mov $s3,12($out)
2053
2054 lea 16($inp),$inp
2055 lea 16($out),$out
2056 jmp .Lcbc_slow_dec_loop
2057.Lcbc_slow_dec_done:
2058 mov $_ivp,%rdi
2059 mov %r11,0(%rdi) # copy iv back to user
2060 mov %r12,8(%rdi)
2061
2062 mov $s0,0($out) # save output [can zap input]
2063 mov $s1,4($out)
2064 mov $s2,8($out)
2065 mov $s3,12($out)
2066
2067 jmp .Lcbc_exit
2068
2069.align 4
2070.Lcbc_slow_dec_partial:
2071 mov $_ivp,%rdi
2072 mov %r11,0(%rdi) # copy iv back to user
2073 mov %r12,8(%rdi)
2074
2075 mov $s0,0+$ivec # save output to stack
2076 mov $s1,4+$ivec
2077 mov $s2,8+$ivec
2078 mov $s3,12+$ivec
2079
2080 mov $out,%rdi
2081 lea $ivec,%rsi
2082 lea 16(%r10),%rcx
2083 .long 0x9066A4F3 # rep movsb
2084 jmp .Lcbc_exit
2085
2086.align 16
2087.Lcbc_exit:
2088 mov $_rsp,%rsi
2089 mov (%rsi),%r15
2090 mov 8(%rsi),%r14
2091 mov 16(%rsi),%r13
2092 mov 24(%rsi),%r12
2093 mov 32(%rsi),%rbp
2094 mov 40(%rsi),%rbx
2095 lea 48(%rsi),%rsp
2096.Lcbc_popfq:
2097 popfq
2098.Lcbc_epilogue:
2099 ret
2100.size AES_cbc_encrypt,.-AES_cbc_encrypt
2101___
2102}
2103
2104$code.=<<___;
2105.align 64
2106.LAES_Te:
2107___
2108 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
2109 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
2110 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
2111 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
2112 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
2113 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
2114 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
2115 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
2116 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
2117 &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
2118 &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
2119 &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
2120 &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
2121 &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
2122 &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
2123 &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
2124 &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
2125 &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
2126 &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
2127 &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
2128 &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
2129 &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
2130 &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
2131 &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
2132 &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
2133 &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
2134 &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
2135 &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
2136 &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
2137 &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
2138 &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
2139 &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
2140 &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
2141 &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
2142 &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
2143 &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
2144 &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
2145 &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
2146 &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
2147 &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
2148 &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
2149 &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
2150 &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
2151 &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
2152 &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
2153 &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
2154 &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
2155 &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
2156 &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
2157 &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
2158 &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
2159 &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
2160 &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
2161 &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
2162 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
2163 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
2164 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
2165 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
2166 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
2167 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
2168 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
2169 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
2170 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
2171 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
2172
2173#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
2174 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2175 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2176 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2177 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2178 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2179 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2180 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2181 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2182 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2183 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2184 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2185 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2186 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2187 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2188 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2189 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2190 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2191 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2192 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2193 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2194 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2195 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2196 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2197 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2198 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2199 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2200 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2201 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2202 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2203 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2204 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2205 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2206
2207 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2208 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2209 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2210 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2211 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2212 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2213 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2214 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2215 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2216 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2217 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2218 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2219 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2220 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2221 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2222 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2223 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2224 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2225 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2226 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2227 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2228 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2229 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2230 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2231 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2232 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2233 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2234 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2235 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2236 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2237 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2238 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2239
2240 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2241 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2242 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2243 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2244 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2245 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2246 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2247 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2248 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2249 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2250 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2251 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2252 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2253 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2254 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2255 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2256 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2257 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2258 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2259 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2260 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2261 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2262 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2263 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2264 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2265 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2266 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2267 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2268 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2269 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2270 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2271 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2272
2273 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2274 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2275 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2276 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2277 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2278 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2279 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2280 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2281 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2282 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2283 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2284 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2285 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2286 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2287 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2288 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2289 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2290 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2291 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2292 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2293 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2294 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2295 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2296 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2297 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2298 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2299 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2300 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2301 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2302 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2303 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2304 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2305#rcon: ten AES round constants (0x01..0x36), then pairs of the byte-replicated masks 0x80, 0xfe and 0x1b
2306$code.=<<___;
2307 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008
2308 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080
2309 .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
2310 .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
2311___
2312$code.=<<___;
2313.align 64
2314.LAES_Td:
2315___
2316 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
2317 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
2318 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
2319 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
2320 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
2321 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
2322 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
2323 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
2324 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
2325 &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
2326 &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
2327 &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
2328 &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
2329 &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
2330 &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
2331 &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
2332 &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
2333 &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
2334 &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
2335 &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
2336 &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
2337 &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
2338 &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
2339 &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
2340 &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
2341 &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
2342 &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
2343 &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
2344 &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
2345 &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
2346 &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
2347 &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
2348 &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
2349 &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
2350 &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
2351 &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
2352 &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
2353 &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
2354 &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
2355 &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
2356 &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
2357 &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
2358 &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
2359 &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
2360 &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
2361 &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
2362 &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
2363 &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
2364 &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
2365 &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
2366 &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
2367 &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
2368 &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
2369 &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
2370 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
2371 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
2372 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
2373 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
2374 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
2375 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
2376 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
2377 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
2378 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
2379 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
2380
2381#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
2382 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2383 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2384 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2385 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2386 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2387 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2388 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2389 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2390 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2391 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2392 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2393 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2394 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2395 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2396 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2397 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2398 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2399 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2400 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2401 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2402 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2403 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2404 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2405 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2406 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2407 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2408 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2409 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2410 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2411 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2412 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2413 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2414$code.=<<___;
2415 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2416 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2417___
2418 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2419 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2420 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2421 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2422 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2423 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2424 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2425 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2426 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2427 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2428 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2429 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2430 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2431 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2432 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2433 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2434 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2435 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2436 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2437 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2438 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2439 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2440 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2441 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2442 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2443 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2444 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2445 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2446 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2447 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2448 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2449 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2450$code.=<<___;
2451 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2452 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2453___
2454 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2455 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2456 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2457 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2458 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2459 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2460 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2461 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2462 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2463 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2464 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2465 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2466 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2467 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2468 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2469 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2470 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2471 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2472 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2473 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2474 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2475 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2476 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2477 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2478 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2479 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2480 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2481 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2482 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2483 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2484 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2485 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2486$code.=<<___;
2487 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2488 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2489___
2490 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2491 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2492 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2493 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2494 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2495 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2496 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2497 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2498 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2499 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2500 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2501 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2502 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2503 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2504 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2505 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2506 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2507 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2508 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2509 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2510 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2511 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2512 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2513 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2514 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2515 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2516 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2517 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2518 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2519 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2520 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2521 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2522$code.=<<___;
2523 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2524 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2525.asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2526.align 64
2527___
2528
2529# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2530# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2531if ($win64) {
2532$rec="%rcx";
2533$frame="%rdx";
2534$context="%r8";
2535$disp="%r9";
2536
2537$code.=<<___;
2538.extern __imp_RtlVirtualUnwind
2539.type block_se_handler,\@abi-omnipotent
2540.align 16
2541block_se_handler:
2542 push %rsi
2543 push %rdi
2544 push %rbx
2545 push %rbp
2546 push %r12
2547 push %r13
2548 push %r14
2549 push %r15
2550 pushfq
2551 sub \$64,%rsp
2552
2553 mov 120($context),%rax # pull context->Rax
2554 mov 248($context),%rbx # pull context->Rip
2555
2556 mov 8($disp),%rsi # disp->ImageBase
2557 mov 56($disp),%r11 # disp->HandlerData
2558
2559 mov 0(%r11),%r10d # HandlerData[0]
2560 lea (%rsi,%r10),%r10 # prologue label
2561 cmp %r10,%rbx # context->Rip<prologue label
2562 jb .Lin_block_prologue
2563
2564 mov 152($context),%rax # pull context->Rsp
2565
2566 mov 4(%r11),%r10d # HandlerData[1]
2567 lea (%rsi,%r10),%r10 # epilogue label
2568 cmp %r10,%rbx # context->Rip>=epilogue label
2569 jae .Lin_block_prologue
2570
2571 mov 24(%rax),%rax # pull saved real stack pointer
2572 lea 48(%rax),%rax # adjust...
2573
2574 mov -8(%rax),%rbx
2575 mov -16(%rax),%rbp
2576 mov -24(%rax),%r12
2577 mov -32(%rax),%r13
2578 mov -40(%rax),%r14
2579 mov -48(%rax),%r15
2580 mov %rbx,144($context) # restore context->Rbx
2581 mov %rbp,160($context) # restore context->Rbp
2582 mov %r12,216($context) # restore context->R12
2583 mov %r13,224($context) # restore context->R13
2584 mov %r14,232($context) # restore context->R14
2585 mov %r15,240($context) # restore context->R15
2586
2587.Lin_block_prologue:
2588 mov 8(%rax),%rdi
2589 mov 16(%rax),%rsi
2590 mov %rax,152($context) # restore context->Rsp
2591 mov %rsi,168($context) # restore context->Rsi
2592 mov %rdi,176($context) # restore context->Rdi
2593
2594 jmp .Lcommon_seh_exit
2595.size block_se_handler,.-block_se_handler
2596
2597.type key_se_handler,\@abi-omnipotent
2598.align 16
2599key_se_handler:
2600 push %rsi
2601 push %rdi
2602 push %rbx
2603 push %rbp
2604 push %r12
2605 push %r13
2606 push %r14
2607 push %r15
2608 pushfq
2609 sub \$64,%rsp
2610
2611 mov 120($context),%rax # pull context->Rax
2612 mov 248($context),%rbx # pull context->Rip
2613
2614 mov 8($disp),%rsi # disp->ImageBase
2615 mov 56($disp),%r11 # disp->HandlerData
2616
2617 mov 0(%r11),%r10d # HandlerData[0]
2618 lea (%rsi,%r10),%r10 # prologue label
2619 cmp %r10,%rbx # context->Rip<prologue label
2620 jb .Lin_key_prologue
2621
2622 mov 152($context),%rax # pull context->Rsp
2623
2624 mov 4(%r11),%r10d # HandlerData[1]
2625 lea (%rsi,%r10),%r10 # epilogue label
2626 cmp %r10,%rbx # context->Rip>=epilogue label
2627 jae .Lin_key_prologue
2628
2629 lea 56(%rax),%rax
2630
2631 mov -8(%rax),%rbx
2632 mov -16(%rax),%rbp
2633 mov -24(%rax),%r12
2634 mov -32(%rax),%r13
2635 mov -40(%rax),%r14
2636 mov -48(%rax),%r15
2637 mov %rbx,144($context) # restore context->Rbx
2638 mov %rbp,160($context) # restore context->Rbp
2639 mov %r12,216($context) # restore context->R12
2640 mov %r13,224($context) # restore context->R13
2641 mov %r14,232($context) # restore context->R14
2642 mov %r15,240($context) # restore context->R15
2643
2644.Lin_key_prologue:
2645 mov 8(%rax),%rdi
2646 mov 16(%rax),%rsi
2647 mov %rax,152($context) # restore context->Rsp
2648 mov %rsi,168($context) # restore context->Rsi
2649 mov %rdi,176($context) # restore context->Rdi
2650
2651 jmp .Lcommon_seh_exit
2652.size key_se_handler,.-key_se_handler
2653
2654.type cbc_se_handler,\@abi-omnipotent
2655.align 16
2656cbc_se_handler:
2657 push %rsi
2658 push %rdi
2659 push %rbx
2660 push %rbp
2661 push %r12
2662 push %r13
2663 push %r14
2664 push %r15
2665 pushfq
2666 sub \$64,%rsp
2667
2668 mov 120($context),%rax # pull context->Rax
2669 mov 248($context),%rbx # pull context->Rip
2670
2671 lea .Lcbc_prologue(%rip),%r10
2672 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
2673 jb .Lin_cbc_prologue
2674
2675 lea .Lcbc_fast_body(%rip),%r10
2676 cmp %r10,%rbx # context->Rip<.Lcbc_fast_body
2677 jb .Lin_cbc_frame_setup
2678
2679 lea .Lcbc_slow_prologue(%rip),%r10
2680 cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue
2681 jb .Lin_cbc_body
2682
2683 lea .Lcbc_slow_body(%rip),%r10
2684 cmp %r10,%rbx # context->Rip<.Lcbc_slow_body
2685 jb .Lin_cbc_frame_setup
2686
2687.Lin_cbc_body:
2688 mov 152($context),%rax # pull context->Rsp
2689
2690 lea .Lcbc_epilogue(%rip),%r10
2691 cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue
2692 jae .Lin_cbc_prologue
2693
2694 lea 8(%rax),%rax
2695
2696 lea .Lcbc_popfq(%rip),%r10
2697 cmp %r10,%rbx # context->Rip>=.Lcbc_popfq
2698 jae .Lin_cbc_prologue
2699
2700 mov `16-8`(%rax),%rax # biased $_rsp
2701 lea 56(%rax),%rax
2702
2703.Lin_cbc_frame_setup:
2704 mov -16(%rax),%rbx
2705 mov -24(%rax),%rbp
2706 mov -32(%rax),%r12
2707 mov -40(%rax),%r13
2708 mov -48(%rax),%r14
2709 mov -56(%rax),%r15
2710 mov %rbx,144($context) # restore context->Rbx
2711 mov %rbp,160($context) # restore context->Rbp
2712 mov %r12,216($context) # restore context->R12
2713 mov %r13,224($context) # restore context->R13
2714 mov %r14,232($context) # restore context->R14
2715 mov %r15,240($context) # restore context->R15
2716
2717.Lin_cbc_prologue:
2718 mov 8(%rax),%rdi
2719 mov 16(%rax),%rsi
2720 mov %rax,152($context) # restore context->Rsp
2721 mov %rsi,168($context) # restore context->Rsi
2722 mov %rdi,176($context) # restore context->Rdi
2723
2724.Lcommon_seh_exit:
2725
2726 mov 40($disp),%rdi # disp->ContextRecord
2727 mov $context,%rsi # context
2728 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2729 .long 0xa548f3fc # cld; rep movsq
2730
2731 mov $disp,%rsi
2732 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2733 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2734 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2735 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2736 mov 40(%rsi),%r10 # disp->ContextRecord
2737 lea 56(%rsi),%r11 # &disp->HandlerData
2738 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2739 mov %r10,32(%rsp) # arg5
2740 mov %r11,40(%rsp) # arg6
2741 mov %r12,48(%rsp) # arg7
2742 mov %rcx,56(%rsp) # arg8, (NULL)
2743 call *__imp_RtlVirtualUnwind(%rip)
2744
2745 mov \$1,%eax # ExceptionContinueSearch
2746 add \$64,%rsp
2747 popfq
2748 pop %r15
2749 pop %r14
2750 pop %r13
2751 pop %r12
2752 pop %rbp
2753 pop %rbx
2754 pop %rdi
2755 pop %rsi
2756 ret
2757.size cbc_se_handler,.-cbc_se_handler
2758
2759.section .pdata
2760.align 4
2761 .rva .LSEH_begin_AES_encrypt
2762 .rva .LSEH_end_AES_encrypt
2763 .rva .LSEH_info_AES_encrypt
2764
2765 .rva .LSEH_begin_AES_decrypt
2766 .rva .LSEH_end_AES_decrypt
2767 .rva .LSEH_info_AES_decrypt
2768
2769 .rva .LSEH_begin_AES_set_encrypt_key
2770 .rva .LSEH_end_AES_set_encrypt_key
2771 .rva .LSEH_info_AES_set_encrypt_key
2772
2773 .rva .LSEH_begin_AES_set_decrypt_key
2774 .rva .LSEH_end_AES_set_decrypt_key
2775 .rva .LSEH_info_AES_set_decrypt_key
2776
2777 .rva .LSEH_begin_AES_cbc_encrypt
2778 .rva .LSEH_end_AES_cbc_encrypt
2779 .rva .LSEH_info_AES_cbc_encrypt
2780
2781.section .xdata
2782.align 8
2783.LSEH_info_AES_encrypt:
2784 .byte 9,0,0,0
2785 .rva block_se_handler
2786 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
2787.LSEH_info_AES_decrypt:
2788 .byte 9,0,0,0
2789 .rva block_se_handler
2790 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
2791.LSEH_info_AES_set_encrypt_key:
2792 .byte 9,0,0,0
2793 .rva key_se_handler
2794 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
2795.LSEH_info_AES_set_decrypt_key:
2796 .byte 9,0,0,0
2797 .rva key_se_handler
2798 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
2799.LSEH_info_AES_cbc_encrypt:
2800 .byte 9,0,0,0
2801 .rva cbc_se_handler
2802___
2803}
2804
2805$code =~ s/\`([^\`]*)\`/eval($1)/gem;
2806
2807print $code;
2808
# STDOUT is a pipe to the perlasm translator; close() on a pipe waits for
# the child and returns false if it exited non-zero or a write failed, so
# check it instead of silently leaving a truncated assembly file behind.
2809close STDOUT or die "error closing STDOUT: $!";
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
deleted file mode 100644
index 49e0f4b351..0000000000
--- a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
+++ /dev/null
@@ -1,992 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for Intel AES-NI extension. In
11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
13# details].
14
15$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
16 # generates drop-in replacement for
17 # crypto/aes/asm/aes-x86_64.pl:-)
18
19$flavour = shift;
20$output = shift;
21if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
22
23$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
24
25$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
26( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
27( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
28die "can't locate x86_64-xlate.pl";
29
# Redirect everything printed from here on through x86_64-xlate.pl.
# Abort right away if the pipe cannot be set up rather than generating
# output into an invalid handle.
30open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
31
32$movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
33@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
34 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
35
36$code=".text\n";
37
38$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
39# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
40$inp="%rdi";
41$out="%rsi";
42$len="%rdx";
43$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
44$ivp="%r8"; # cbc
45
46$rnds_="%r10d"; # backup copy for $rounds
47$key_="%r11"; # backup copy for $key
48
49# %xmm register layout
50$inout0="%xmm0"; $inout1="%xmm1";
51$inout2="%xmm2"; $inout3="%xmm3";
52$rndkey0="%xmm4"; $rndkey1="%xmm5";
53
54$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt
55$in1="%xmm8"; $in2="%xmm9";
56
57# Inline version of internal aesni_[en|de]crypt1.
58#
59# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
60# cycles which take care of loop variables...
# aesni_generate1($p, $key, $rounds): append to $code an inline
# single-block cipher sequence that transforms $inout0 in place.
#   $p      - instruction suffix; every call site in this file passes
#             "enc" or "dec", selecting aes${p} / aes${p}last
#   $key    - register pointing at the key schedule; the emitted code
#             advances it, so callers reload it afterwards
#   $rounds - register holding the loop count; the emitted code
#             decrements it until it reaches zero
# The enclosing bare block keeps $sn private to this sub; it is bumped
# on every expansion so each emitted copy gets a unique loop label.
61{ my $sn;
62sub aesni_generate1 {
63my ($p,$key,$rounds)=@_;
64++$sn;
65$code.=<<___;
66 $movkey ($key),$rndkey0
67 $movkey 16($key),$rndkey1
68 lea 32($key),$key
69 pxor $rndkey0,$inout0
70.Loop_${p}1_$sn:
71 aes${p} $rndkey1,$inout0
72 dec $rounds
73 $movkey ($key),$rndkey1
74 lea 16($key),$key
75 jnz .Loop_${p}1_$sn # loop body is 16 bytes
76 aes${p}last $rndkey1,$inout0
77___
78}}
79# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
80#
81{ my ($inp,$out,$key) = @_4args;
82
83$code.=<<___;
84.globl ${PREFIX}_encrypt
85.type ${PREFIX}_encrypt,\@abi-omnipotent
86.align 16
87${PREFIX}_encrypt:
88 movups ($inp),$inout0 # load input
89 mov 240($key),$rounds # pull $rounds
90___
91 &aesni_generate1("enc",$key,$rounds);
92$code.=<<___;
93 movups $inout0,($out) # output
94 ret
95.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
96
97.globl ${PREFIX}_decrypt
98.type ${PREFIX}_decrypt,\@abi-omnipotent
99.align 16
100${PREFIX}_decrypt:
101 movups ($inp),$inout0 # load input
102 mov 240($key),$rounds # pull $rounds
103___
104 &aesni_generate1("dec",$key,$rounds);
105$code.=<<___;
106 movups $inout0,($out) # output
107 ret
108.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
109___
110}
111
112# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
113# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
114# latency is 6, it turned out that it can be scheduled only every
115# *second* cycle. Thus 3x interleave is the one providing optimal
116# utilization, i.e. when subroutine's throughput is virtually same as
117# of non-interleaved subroutine [for number of input blocks up to 3].
118# This is why it makes no sense to implement 2x subroutine. As soon
119# as/if Intel improves throughput by making it possible to schedule
120# the instructions in question *every* cycle I would have to
121# implement 6x interleave and use it in loop...
# aesni_generate3($dir): append to $code the private 3-way interleaved
# routine _aesni_${dir}rypt3, where $dir is "enc" or "dec" (the only
# values passed by the calls below).
# The emitted routine uses the file-level register assignments ($key,
# $rounds, $rndkey0/1, $inout0-2). It halves $rounds up front because
# each loop iteration applies two round keys to all three blocks, then
# finishes with one more aes${dir} round and the aes${dir}last round.
122sub aesni_generate3 {
123my $dir=shift;
124# As already mentioned it takes in $key and $rounds, which are *not*
125# preserved. $inout[0-2] is cipher/clear text...
126$code.=<<___;
127.type _aesni_${dir}rypt3,\@abi-omnipotent
128.align 16
129_aesni_${dir}rypt3:
130 $movkey ($key),$rndkey0
131 shr \$1,$rounds
132 $movkey 16($key),$rndkey1
133 lea 32($key),$key
134 pxor $rndkey0,$inout0
135 pxor $rndkey0,$inout1
136 pxor $rndkey0,$inout2
137
138.L${dir}_loop3:
139 aes${dir} $rndkey1,$inout0
140 $movkey ($key),$rndkey0
141 aes${dir} $rndkey1,$inout1
142 dec $rounds
143 aes${dir} $rndkey1,$inout2
144 aes${dir} $rndkey0,$inout0
145 $movkey 16($key),$rndkey1
146 aes${dir} $rndkey0,$inout1
147 lea 32($key),$key
148 aes${dir} $rndkey0,$inout2
149 jnz .L${dir}_loop3
150
151 aes${dir} $rndkey1,$inout0
152 $movkey ($key),$rndkey0
153 aes${dir} $rndkey1,$inout1
154 aes${dir} $rndkey1,$inout2
155 aes${dir}last $rndkey0,$inout0
156 aes${dir}last $rndkey0,$inout1
157 aes${dir}last $rndkey0,$inout2
158 ret
159.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
160___
161}
162# 4x interleave is implemented to improve small block performance,
163# most notably [and naturally] 4 block by ~30%. One can argue that one
164# should have implemented 5x as well, but improvement would be <20%,
165# so it's not worth it...
# aesni_generate4($dir): append to $code the private 4-way interleaved
# routine _aesni_${dir}rypt4, where $dir is "enc" or "dec" (the only
# values passed by the calls below).
# Same structure as the 3-way variant with $inout3 added: $rounds is
# halved up front, each loop iteration applies two round keys to all
# four blocks, and the tail issues one more aes${dir} round followed by
# the aes${dir}last round.
166sub aesni_generate4 {
167my $dir=shift;
168# As already mentioned it takes in $key and $rounds, which are *not*
169# preserved. $inout[0-3] is cipher/clear text...
170$code.=<<___;
171.type _aesni_${dir}rypt4,\@abi-omnipotent
172.align 16
173_aesni_${dir}rypt4:
174 $movkey ($key),$rndkey0
175 shr \$1,$rounds
176 $movkey 16($key),$rndkey1
177 lea 32($key),$key
178 pxor $rndkey0,$inout0
179 pxor $rndkey0,$inout1
180 pxor $rndkey0,$inout2
181 pxor $rndkey0,$inout3
182
183.L${dir}_loop4:
184 aes${dir} $rndkey1,$inout0
185 $movkey ($key),$rndkey0
186 aes${dir} $rndkey1,$inout1
187 dec $rounds
188 aes${dir} $rndkey1,$inout2
189 aes${dir} $rndkey1,$inout3
190 aes${dir} $rndkey0,$inout0
191 $movkey 16($key),$rndkey1
192 aes${dir} $rndkey0,$inout1
193 lea 32($key),$key
194 aes${dir} $rndkey0,$inout2
195 aes${dir} $rndkey0,$inout3
196 jnz .L${dir}_loop4
197
198 aes${dir} $rndkey1,$inout0
199 $movkey ($key),$rndkey0
200 aes${dir} $rndkey1,$inout1
201 aes${dir} $rndkey1,$inout2
202 aes${dir} $rndkey1,$inout3
203 aes${dir}last $rndkey0,$inout0
204 aes${dir}last $rndkey0,$inout1
205 aes${dir}last $rndkey0,$inout2
206 aes${dir}last $rndkey0,$inout3
207 ret
208.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
209___
210}
211&aesni_generate3("enc") if ($PREFIX eq "aesni");
212&aesni_generate3("dec");
213&aesni_generate4("enc") if ($PREFIX eq "aesni");
214&aesni_generate4("dec");
215
216if ($PREFIX eq "aesni") {
217# void aesni_ecb_encrypt (const void *in, void *out,
218# size_t length, const AES_KEY *key,
219# int enc);
220$code.=<<___;
221.globl aesni_ecb_encrypt
222.type aesni_ecb_encrypt,\@function,5
223.align 16
224aesni_ecb_encrypt:
225 cmp \$16,$len # check length
226 jb .Lecb_ret
227
228 mov 240($key),$rounds # pull $rounds
229 and \$-16,$len
230 mov $key,$key_ # backup $key
231 test %r8d,%r8d # 5th argument
232 mov $rounds,$rnds_ # backup $rounds
233 jz .Lecb_decrypt
234#--------------------------- ECB ENCRYPT ------------------------------#
235 sub \$0x40,$len
236 jbe .Lecb_enc_tail
237 jmp .Lecb_enc_loop3
238.align 16
239.Lecb_enc_loop3:
240 movups ($inp),$inout0
241 movups 0x10($inp),$inout1
242 movups 0x20($inp),$inout2
243 call _aesni_encrypt3
244 sub \$0x30,$len
245 lea 0x30($inp),$inp
246 lea 0x30($out),$out
247 movups $inout0,-0x30($out)
248 mov $rnds_,$rounds # restore $rounds
249 movups $inout1,-0x20($out)
250 mov $key_,$key # restore $key
251 movups $inout2,-0x10($out)
252 ja .Lecb_enc_loop3
253
254.Lecb_enc_tail:
255 add \$0x40,$len
256 jz .Lecb_ret
257
258 cmp \$0x10,$len
259 movups ($inp),$inout0
260 je .Lecb_enc_one
261 cmp \$0x20,$len
262 movups 0x10($inp),$inout1
263 je .Lecb_enc_two
264 cmp \$0x30,$len
265 movups 0x20($inp),$inout2
266 je .Lecb_enc_three
267 movups 0x30($inp),$inout3
268 call _aesni_encrypt4
269 movups $inout0,($out)
270 movups $inout1,0x10($out)
271 movups $inout2,0x20($out)
272 movups $inout3,0x30($out)
273 jmp .Lecb_ret
274.align 16
275.Lecb_enc_one:
276___
277 &aesni_generate1("enc",$key,$rounds);
278$code.=<<___;
279 movups $inout0,($out)
280 jmp .Lecb_ret
281.align 16
282.Lecb_enc_two:
283 call _aesni_encrypt3
284 movups $inout0,($out)
285 movups $inout1,0x10($out)
286 jmp .Lecb_ret
287.align 16
288.Lecb_enc_three:
289 call _aesni_encrypt3
290 movups $inout0,($out)
291 movups $inout1,0x10($out)
292 movups $inout2,0x20($out)
293 jmp .Lecb_ret
294 #--------------------------- ECB DECRYPT ------------------------------#
295.align 16
296.Lecb_decrypt:
297 sub \$0x40,$len
298 jbe .Lecb_dec_tail
299 jmp .Lecb_dec_loop3
300.align 16
301.Lecb_dec_loop3:
302 movups ($inp),$inout0
303 movups 0x10($inp),$inout1
304 movups 0x20($inp),$inout2
305 call _aesni_decrypt3
306 sub \$0x30,$len
307 lea 0x30($inp),$inp
308 lea 0x30($out),$out
309 movups $inout0,-0x30($out)
310 mov $rnds_,$rounds # restore $rounds
311 movups $inout1,-0x20($out)
312 mov $key_,$key # restore $key
313 movups $inout2,-0x10($out)
314 ja .Lecb_dec_loop3
315
316.Lecb_dec_tail:
317 add \$0x40,$len
318 jz .Lecb_ret
319
320 cmp \$0x10,$len
321 movups ($inp),$inout0
322 je .Lecb_dec_one
323 cmp \$0x20,$len
324 movups 0x10($inp),$inout1
325 je .Lecb_dec_two
326 cmp \$0x30,$len
327 movups 0x20($inp),$inout2
328 je .Lecb_dec_three
329 movups 0x30($inp),$inout3
330 call _aesni_decrypt4
331 movups $inout0,($out)
332 movups $inout1,0x10($out)
333 movups $inout2,0x20($out)
334 movups $inout3,0x30($out)
335 jmp .Lecb_ret
336.align 16
337.Lecb_dec_one:
338___
339 &aesni_generate1("dec",$key,$rounds);
340$code.=<<___;
341 movups $inout0,($out)
342 jmp .Lecb_ret
343.align 16
344.Lecb_dec_two:
345 call _aesni_decrypt3
346 movups $inout0,($out)
347 movups $inout1,0x10($out)
348 jmp .Lecb_ret
349.align 16
350.Lecb_dec_three:
351 call _aesni_decrypt3
352 movups $inout0,($out)
353 movups $inout1,0x10($out)
354 movups $inout2,0x20($out)
355
356.Lecb_ret:
357 ret
358.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
359___
360}
361
362# void $PREFIX_cbc_encrypt (const void *inp, void *out,
363# size_t length, const AES_KEY *key,
364# unsigned char *ivp,const int enc);
365$reserved = $win64?0x40:-0x18; # used in decrypt
366$code.=<<___;
367.globl ${PREFIX}_cbc_encrypt
368.type ${PREFIX}_cbc_encrypt,\@function,6
369.align 16
370${PREFIX}_cbc_encrypt:
371 test $len,$len # check length
372 jz .Lcbc_ret
373
374 mov 240($key),$rnds_ # pull $rounds
375 mov $key,$key_ # backup $key
376 test %r9d,%r9d # 6th argument
377 jz .Lcbc_decrypt
378#--------------------------- CBC ENCRYPT ------------------------------#
379 movups ($ivp),$inout0 # load iv as initial state
380 cmp \$16,$len
381 mov $rnds_,$rounds
382 jb .Lcbc_enc_tail
383 sub \$16,$len
384 jmp .Lcbc_enc_loop
385.align 16
386.Lcbc_enc_loop:
387 movups ($inp),$inout1 # load input
388 lea 16($inp),$inp
389 pxor $inout1,$inout0
390___
391 &aesni_generate1("enc",$key,$rounds);
392$code.=<<___;
393 sub \$16,$len
394 lea 16($out),$out
395 mov $rnds_,$rounds # restore $rounds
396 mov $key_,$key # restore $key
397 movups $inout0,-16($out) # store output
398 jnc .Lcbc_enc_loop
399 add \$16,$len
400 jnz .Lcbc_enc_tail
401 movups $inout0,($ivp)
402 jmp .Lcbc_ret
403
404.Lcbc_enc_tail:
405 mov $len,%rcx # zaps $key
406 xchg $inp,$out # $inp is %rsi and $out is %rdi now
407 .long 0x9066A4F3 # rep movsb
408 mov \$16,%ecx # zero tail
409 sub $len,%rcx
410 xor %eax,%eax
411 .long 0x9066AAF3 # rep stosb
412 lea -16(%rdi),%rdi # rewind $out by 1 block
413 mov $rnds_,$rounds # restore $rounds
414 mov %rdi,%rsi # $inp and $out are the same
415 mov $key_,$key # restore $key
416 xor $len,$len # len=16
417 jmp .Lcbc_enc_loop # one more spin
418 #--------------------------- CBC DECRYPT ------------------------------#
419.align 16
420.Lcbc_decrypt:
421___
422$code.=<<___ if ($win64);
423 lea -0x58(%rsp),%rsp
424 movaps %xmm6,(%rsp)
425 movaps %xmm7,0x10(%rsp)
426 movaps %xmm8,0x20(%rsp)
427 movaps %xmm9,0x30(%rsp)
428.Lcbc_decrypt_body:
429___
430$code.=<<___;
431 movups ($ivp),$iv
432 sub \$0x40,$len
433 mov $rnds_,$rounds
434 jbe .Lcbc_dec_tail
435 jmp .Lcbc_dec_loop3
436.align 16
437.Lcbc_dec_loop3:
438 movups ($inp),$inout0
439 movups 0x10($inp),$inout1
440 movups 0x20($inp),$inout2
441 movaps $inout0,$in0
442 movaps $inout1,$in1
443 movaps $inout2,$in2
444 call _aesni_decrypt3
445 sub \$0x30,$len
446 lea 0x30($inp),$inp
447 lea 0x30($out),$out
448 pxor $iv,$inout0
449 pxor $in0,$inout1
450 movaps $in2,$iv
451 pxor $in1,$inout2
452 movups $inout0,-0x30($out)
453 mov $rnds_,$rounds # restore $rounds
454 movups $inout1,-0x20($out)
455 mov $key_,$key # restore $key
456 movups $inout2,-0x10($out)
457 ja .Lcbc_dec_loop3
458
459.Lcbc_dec_tail:
460 add \$0x40,$len
461 movups $iv,($ivp)
462 jz .Lcbc_dec_ret
463
464 movups ($inp),$inout0
465 cmp \$0x10,$len
466 movaps $inout0,$in0
467 jbe .Lcbc_dec_one
468 movups 0x10($inp),$inout1
469 cmp \$0x20,$len
470 movaps $inout1,$in1
471 jbe .Lcbc_dec_two
472 movups 0x20($inp),$inout2
473 cmp \$0x30,$len
474 movaps $inout2,$in2
475 jbe .Lcbc_dec_three
476 movups 0x30($inp),$inout3
477 call _aesni_decrypt4
478 pxor $iv,$inout0
479 movups 0x30($inp),$iv
480 pxor $in0,$inout1
481 movups $inout0,($out)
482 pxor $in1,$inout2
483 movups $inout1,0x10($out)
484 pxor $in2,$inout3
485 movups $inout2,0x20($out)
486 movaps $inout3,$inout0
487 lea 0x30($out),$out
488 jmp .Lcbc_dec_tail_collected
489.align 16
490.Lcbc_dec_one:
491___
492 &aesni_generate1("dec",$key,$rounds);
493$code.=<<___;
494 pxor $iv,$inout0
495 movaps $in0,$iv
496 jmp .Lcbc_dec_tail_collected
497.align 16
498.Lcbc_dec_two:
499 call _aesni_decrypt3
500 pxor $iv,$inout0
501 pxor $in0,$inout1
502 movups $inout0,($out)
503 movaps $in1,$iv
504 movaps $inout1,$inout0
505 lea 0x10($out),$out
506 jmp .Lcbc_dec_tail_collected
507.align 16
508.Lcbc_dec_three:
509 call _aesni_decrypt3
510 pxor $iv,$inout0
511 pxor $in0,$inout1
512 movups $inout0,($out)
513 pxor $in1,$inout2
514 movups $inout1,0x10($out)
515 movaps $in2,$iv
516 movaps $inout2,$inout0
517 lea 0x20($out),$out
518 jmp .Lcbc_dec_tail_collected
519.align 16
520.Lcbc_dec_tail_collected:
521 and \$15,$len
522 movups $iv,($ivp)
523 jnz .Lcbc_dec_tail_partial
524 movups $inout0,($out)
525 jmp .Lcbc_dec_ret
526.Lcbc_dec_tail_partial:
527 movaps $inout0,$reserved(%rsp)
528 mov $out,%rdi
529 mov $len,%rcx
530 lea $reserved(%rsp),%rsi
531 .long 0x9066A4F3 # rep movsb
532
533.Lcbc_dec_ret:
534___
535$code.=<<___ if ($win64);
536 movaps (%rsp),%xmm6
537 movaps 0x10(%rsp),%xmm7
538 movaps 0x20(%rsp),%xmm8
539 movaps 0x30(%rsp),%xmm9
540 lea 0x58(%rsp),%rsp
541___
542$code.=<<___;
543.Lcbc_ret:
544 ret
545.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
546___
547
548# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
549# int bits, AES_KEY *key)
550{ my ($inp,$bits,$key) = @_4args;
551 $bits =~ s/%r/%e/;
552
553$code.=<<___;
554.globl ${PREFIX}_set_decrypt_key
555.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
556.align 16
557${PREFIX}_set_decrypt_key:
558 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
559 call _aesni_set_encrypt_key
560 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
561 test %eax,%eax
562 jnz .Ldec_key_ret
563 lea 16($key,$bits),$inp # points at the end of key schedule
564
565 $movkey ($key),%xmm0 # just swap
566 $movkey ($inp),%xmm1
567 $movkey %xmm0,($inp)
568 $movkey %xmm1,($key)
569 lea 16($key),$key
570 lea -16($inp),$inp
571
572.Ldec_key_inverse:
573 $movkey ($key),%xmm0 # swap and inverse
574 $movkey ($inp),%xmm1
575 aesimc %xmm0,%xmm0
576 aesimc %xmm1,%xmm1
577 lea 16($key),$key
578 lea -16($inp),$inp
579 cmp $key,$inp
580 $movkey %xmm0,16($inp)
581 $movkey %xmm1,-16($key)
582 ja .Ldec_key_inverse
583
584 $movkey ($key),%xmm0 # inverse middle
585 aesimc %xmm0,%xmm0
586 $movkey %xmm0,($inp)
587.Ldec_key_ret:
588 add \$8,%rsp
589 ret
590.LSEH_end_set_decrypt_key:
591.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
592___
593
594# This is based on submission by
595#
596# Huang Ying <ying.huang@intel.com>
597# Vinodh Gopal <vinodh.gopal@intel.com>
598# Kahraman Akdemir
599#
600# Aggressively optimized with respect to aeskeygenassist's critical path
601# and is contained in %xmm0-5 to meet Win64 ABI requirement.
602#
603$code.=<<___;
604.globl ${PREFIX}_set_encrypt_key
605.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
606.align 16
607${PREFIX}_set_encrypt_key:
608_aesni_set_encrypt_key:
609 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
610 test $inp,$inp
611 mov \$-1,%rax
612 jz .Lenc_key_ret
613 test $key,$key
614 jz .Lenc_key_ret
615
616 movups ($inp),%xmm0 # pull first 128 bits of *userKey
617 pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0
618 lea 16($key),%rax
619 cmp \$256,$bits
620 je .L14rounds
621 cmp \$192,$bits
622 je .L12rounds
623 cmp \$128,$bits
624 jne .Lbad_keybits
625
626.L10rounds:
627 mov \$9,$bits # 10 rounds for 128-bit key
628 $movkey %xmm0,($key) # round 0
629 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
630 call .Lkey_expansion_128_cold
631 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
632 call .Lkey_expansion_128
633 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
634 call .Lkey_expansion_128
635 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
636 call .Lkey_expansion_128
637 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
638 call .Lkey_expansion_128
639 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
640 call .Lkey_expansion_128
641 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
642 call .Lkey_expansion_128
643 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
644 call .Lkey_expansion_128
645 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
646 call .Lkey_expansion_128
647 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
648 call .Lkey_expansion_128
649 $movkey %xmm0,(%rax)
650 mov $bits,80(%rax) # 240(%rdx)
651 xor %eax,%eax
652 jmp .Lenc_key_ret
653
654.align 16
655.L12rounds:
656 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
657 mov \$11,$bits # 12 rounds for 192
658 $movkey %xmm0,($key) # round 0
659 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
660 call .Lkey_expansion_192a_cold
661 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
662 call .Lkey_expansion_192b
663 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
664 call .Lkey_expansion_192a
665 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
666 call .Lkey_expansion_192b
667 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
668 call .Lkey_expansion_192a
669 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
670 call .Lkey_expansion_192b
671 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
672 call .Lkey_expansion_192a
673 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
674 call .Lkey_expansion_192b
675 $movkey %xmm0,(%rax)
676 mov $bits,48(%rax) # 240(%rdx)
677 xor %rax, %rax
678 jmp .Lenc_key_ret
679
680.align 16
681.L14rounds:
682 movups 16($inp),%xmm2 # remaning half of *userKey
683 mov \$13,$bits # 14 rounds for 256
684 lea 16(%rax),%rax
685 $movkey %xmm0,($key) # round 0
686 $movkey %xmm2,16($key) # round 1
687 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
688 call .Lkey_expansion_256a_cold
689 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
690 call .Lkey_expansion_256b
691 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
692 call .Lkey_expansion_256a
693 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
694 call .Lkey_expansion_256b
695 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
696 call .Lkey_expansion_256a
697 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
698 call .Lkey_expansion_256b
699 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
700 call .Lkey_expansion_256a
701 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
702 call .Lkey_expansion_256b
703 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
704 call .Lkey_expansion_256a
705 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
706 call .Lkey_expansion_256b
707 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
708 call .Lkey_expansion_256a
709 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
710 call .Lkey_expansion_256b
711 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
712 call .Lkey_expansion_256a
713 $movkey %xmm0,(%rax)
714 mov $bits,16(%rax) # 240(%rdx)
715 xor %rax,%rax
716 jmp .Lenc_key_ret
717
718.align 16
719.Lbad_keybits:
720 mov \$-2,%rax
721.Lenc_key_ret:
722 add \$8,%rsp
723 ret
724.LSEH_end_set_encrypt_key:
725
726.align 16
727.Lkey_expansion_128:
728 $movkey %xmm0,(%rax)
729 lea 16(%rax),%rax
730.Lkey_expansion_128_cold:
731 shufps \$0b00010000,%xmm0,%xmm4
732 pxor %xmm4, %xmm0
733 shufps \$0b10001100,%xmm0,%xmm4
734 pxor %xmm4, %xmm0
735 pshufd \$0b11111111,%xmm1,%xmm1 # critical path
736 pxor %xmm1,%xmm0
737 ret
738
739.align 16
740.Lkey_expansion_192a:
741 $movkey %xmm0,(%rax)
742 lea 16(%rax),%rax
743.Lkey_expansion_192a_cold:
744 movaps %xmm2, %xmm5
745.Lkey_expansion_192b_warm:
746 shufps \$0b00010000,%xmm0,%xmm4
747 movaps %xmm2,%xmm3
748 pxor %xmm4,%xmm0
749 shufps \$0b10001100,%xmm0,%xmm4
750 pslldq \$4,%xmm3
751 pxor %xmm4,%xmm0
752 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
753 pxor %xmm3,%xmm2
754 pxor %xmm1,%xmm0
755 pshufd \$0b11111111,%xmm0,%xmm3
756 pxor %xmm3,%xmm2
757 ret
758
759.align 16
760.Lkey_expansion_192b:
761 movaps %xmm0,%xmm3
762 shufps \$0b01000100,%xmm0,%xmm5
763 $movkey %xmm5,(%rax)
764 shufps \$0b01001110,%xmm2,%xmm3
765 $movkey %xmm3,16(%rax)
766 lea 32(%rax),%rax
767 jmp .Lkey_expansion_192b_warm
768
769.align 16
770.Lkey_expansion_256a:
771 $movkey %xmm2,(%rax)
772 lea 16(%rax),%rax
773.Lkey_expansion_256a_cold:
774 shufps \$0b00010000,%xmm0,%xmm4
775 pxor %xmm4,%xmm0
776 shufps \$0b10001100,%xmm0,%xmm4
777 pxor %xmm4,%xmm0
778 pshufd \$0b11111111,%xmm1,%xmm1 # critical path
779 pxor %xmm1,%xmm0
780 ret
781
782.align 16
783.Lkey_expansion_256b:
784 $movkey %xmm0,(%rax)
785 lea 16(%rax),%rax
786
787 shufps \$0b00010000,%xmm2,%xmm4
788 pxor %xmm4,%xmm2
789 shufps \$0b10001100,%xmm2,%xmm4
790 pxor %xmm4,%xmm2
791 pshufd \$0b10101010,%xmm1,%xmm1 # critical path
792 pxor %xmm1,%xmm2
793 ret
794.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
795___
796}
797
# Append an identification string to the generated assembly, followed by an
# alignment directive. The "\@" keeps Perl from interpolating an array inside
# the heredoc.
$code.=<<___;
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
802
803# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
804# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
# Win64 structured-exception-handling support.  Emits two language-specific
# handlers plus the .pdata/.xdata tables that bind them to the public
# routines:
#
#   cbc_se_handler - looks at context->Rip to decide where inside
#                    cbc_encrypt the fault happened.  Between
#                    .Lcbc_decrypt_body and .Lcbc_ret it copies 8 quadwords
#                    from the top of the stack into context+512 (labelled
#                    &context.Xmm6 below) and advances the saved stack
#                    pointer by 0x58; in every case it then reloads
#                    context->Rsi/Rdi from 16(%rax)/8(%rax).
#   ecb_se_handler - only reloads context->Rsi/Rdi from the stack.
#
# Both fall into .Lcommon_seh_exit, which copies the CONTEXT (154 quadwords)
# to disp->ContextRecord, calls RtlVirtualUnwind and returns 1
# (ExceptionContinueSearch).
#
# Handler arguments arrive in these registers:
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_decrypt(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lin_prologue

	lea	.Lcbc_decrypt_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
	jb	.Lrestore_rax

	lea	.Lcbc_ret(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lin_prologue

	lea	0(%rax),%rsi		# top of stack
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0x58(%rax),%rax		# adjust stack pointer
	jmp	.Lin_prologue

.Lrestore_rax:
	mov	120($context),%rax
.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	cbc_se_handler,.-cbc_se_handler

.type	ecb_se_handler,\@abi-omnipotent
.align	16
ecb_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.Lcommon_seh_exit:

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ecb_se_handler,.-ecb_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_${PREFIX}_ecb_encrypt
	.rva	.LSEH_end_${PREFIX}_ecb_encrypt
	.rva	.LSEH_info_ecb

	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_cbc

	.rva	${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_set_decrypt_key
	.rva	.LSEH_info_key

	.rva	${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_set_encrypt_key
	.rva	.LSEH_info_key
.section	.xdata
.align	8
.LSEH_info_ecb:
	.byte	9,0,0,0
	.rva	ecb_se_handler
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00
___
}
946
# rex(\@opcode, $dst, $src)
#
# Append a REX prefix byte to the opcode list when either register number is
# 8 or higher; leave the list untouched when both are below 8.
#
#   \@opcode  reference to the list of instruction bytes, modified in place
#   $dst      register number; sets bit 0x04 of the prefix when >= 8
#   $src      register number; sets bit 0x01 of the prefix when >= 8
#
# The prefix is kept in a lexical and the caller's array is reached through
# the reference, so no package variables are created or rebound.
sub rex {
    my ($opcode,$dst,$src)=@_;

    return unless ($dst>=8 || $src>=8);

    my $prefix=0x40;
    $prefix|=0x04 if ($dst>=8);
    $prefix|=0x01 if ($src>=8);
    push @$opcode,$prefix;
    return;
}
958
# aesni($line)
#
# Translate one AES-NI instruction, written in AT&T syntax with register-only
# operands, into an equivalent ".byte" directive, so the output does not
# depend on the assembler recognising the mnemonic.
#
#   $line    text such as 'aesenc %xmm1,%xmm0' or
#            'aeskeygenassist $0x1,%xmm2,%xmm1'
#   returns  ".byte\t<b0>,<b1>,..." (decimal byte values) for a recognised
#            instruction; otherwise $line unchanged
#
# An unrecognised aes* mnemonic is returned verbatim rather than as undef:
# the caller substitutes the return value into the generated code, so undef
# would silently delete the instruction instead of letting the assembler
# report it.
sub aesni {
    my $line=shift;
    my @opcode=(0x66);		# every encoding produced here starts with 0x66

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures to lexicals before calling anything else.
	my ($imm,$src,$dst)=($2,$3,$4);

	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# oct() decodes a "0x.." (or other leading-zero) immediate; anything
	# else is emitted as written.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);

	if (defined(my $op=$opcodelet{$mnemonic})) {
	    rex(\@opcode,$dst,$src);
	    push @opcode,0x0f,0x38,$op;
	    push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	    return ".byte\t".join(',',@opcode);
	}
	# Unknown mnemonic: fall through and hand the line back untouched.
    }
    return $line;
}
985
# Post-process the accumulated assembly and write it out:
#  1. replace every `...` span with the result of eval'ing its contents;
#  2. hand each line fragment running from an "aes" word start to the last
#     %xmmN operand to aesni() (anything after that operand on the line, such
#     as a trailing comment, is dropped by the match).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;

# Buffered write failures are only reported when the handle is closed, so
# check the result; otherwise a truncated output file would go unnoticed and
# the script would still exit with status 0.
close STDOUT or die "error closing STDOUT: $!";
992