Diffstat (limited to 'src/lib/libcrypto/aes')
-rw-r--r--  src/lib/libcrypto/aes/aes.c                  226
-rw-r--r--  src/lib/libcrypto/aes/aes.h                  120
-rw-r--r--  src/lib/libcrypto/aes/aes_core.c            1229
-rw-r--r--  src/lib/libcrypto/aes/aes_ige.c              195
-rw-r--r--  src/lib/libcrypto/aes/aes_local.h             76
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-586.pl        2974
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-armv4.pl      1134
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-mips.pl       1613
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-parisc.pl     1030
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-ppc.pl        1344
-rwxr-xr-x  src/lib/libcrypto/aes/asm/aes-sparcv9.pl    1217
-rwxr-xr-x  src/lib/libcrypto/aes/asm/aes-x86_64.pl     2834
-rw-r--r--  src/lib/libcrypto/aes/asm/aesni-x86.pl      2188
-rw-r--r--  src/lib/libcrypto/aes/asm/aesni-x86_64.pl   3080
-rw-r--r--  src/lib/libcrypto/aes/asm/bsaes-x86_64.pl   3123
-rw-r--r--  src/lib/libcrypto/aes/asm/vpaes-x86.pl       911
-rw-r--r--  src/lib/libcrypto/aes/asm/vpaes-x86_64.pl   1222
17 files changed, 0 insertions(+), 24516 deletions(-)
diff --git a/src/lib/libcrypto/aes/aes.c b/src/lib/libcrypto/aes/aes.c
deleted file mode 100644
index d36a006360..0000000000
--- a/src/lib/libcrypto/aes/aes.c
+++ /dev/null
@@ -1,226 +0,0 @@
/* $OpenBSD: aes.c,v 1.4 2024/08/11 13:02:39 jsing Exp $ */
/* ====================================================================
 * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 *
 */

#include <string.h>

#include <openssl/aes.h>
#include <openssl/bio.h>
#include <openssl/modes.h>

#include "crypto_arch.h"

static const unsigned char aes_wrap_default_iv[] = {
	0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
};

#ifdef HAVE_AES_CBC_ENCRYPT_INTERNAL
void aes_cbc_encrypt_internal(const unsigned char *in, unsigned char *out,
    size_t len, const AES_KEY *key, unsigned char *ivec, const int enc);

#else
static inline void
aes_cbc_encrypt_internal(const unsigned char *in, unsigned char *out,
    size_t len, const AES_KEY *key, unsigned char *ivec, const int enc)
{
	if (enc)
		CRYPTO_cbc128_encrypt(in, out, len, key, ivec,
		    (block128_f)AES_encrypt);
	else
		CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
		    (block128_f)AES_decrypt);
}
#endif

void
AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
    size_t len, const AES_KEY *key, unsigned char *ivec, const int enc)
{
	aes_cbc_encrypt_internal(in, out, len, key, ivec, enc);
}
LCRYPTO_ALIAS(AES_cbc_encrypt);

/*
 * The input and output are encrypted as though 128-bit CFB mode were being
 * used. The extra state information recording how much of the 128-bit block
 * we have used is contained in *num.
 */

void
AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, size_t length,
    const AES_KEY *key, unsigned char *ivec, int *num, const int enc)
{
	CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc,
	    (block128_f)AES_encrypt);
}
LCRYPTO_ALIAS(AES_cfb128_encrypt);

/* N.B. This expects the input to be packed, MS bit first. */
void
AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, size_t length,
    const AES_KEY *key, unsigned char *ivec, int *num, const int enc)
{
	CRYPTO_cfb128_1_encrypt(in, out, length, key, ivec, num, enc,
	    (block128_f)AES_encrypt);
}
LCRYPTO_ALIAS(AES_cfb1_encrypt);

void
AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, size_t length,
    const AES_KEY *key, unsigned char *ivec, int *num, const int enc)
{
	CRYPTO_cfb128_8_encrypt(in, out, length, key, ivec, num, enc,
	    (block128_f)AES_encrypt);
}
LCRYPTO_ALIAS(AES_cfb8_encrypt);

void
AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char ivec[AES_BLOCK_SIZE],
    unsigned char ecount_buf[AES_BLOCK_SIZE], unsigned int *num)
{
	CRYPTO_ctr128_encrypt(in, out, length, key, ivec, ecount_buf, num,
	    (block128_f)AES_encrypt);
}
LCRYPTO_ALIAS(AES_ctr128_encrypt);

void
AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
    const AES_KEY *key, const int enc)
{
	if (AES_ENCRYPT == enc)
		AES_encrypt(in, out, key);
	else
		AES_decrypt(in, out, key);
}
LCRYPTO_ALIAS(AES_ecb_encrypt);

void
AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, size_t length,
    const AES_KEY *key, unsigned char *ivec, int *num)
{
	CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num,
	    (block128_f)AES_encrypt);
}
LCRYPTO_ALIAS(AES_ofb128_encrypt);

int
AES_wrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out,
    const unsigned char *in, unsigned int inlen)
{
	unsigned char *A, B[16], *R;
	unsigned int i, j, t;

	if ((inlen & 0x7) || (inlen < 16))
		return -1;
	A = B;
	t = 1;
	memmove(out + 8, in, inlen);
	if (!iv)
		iv = aes_wrap_default_iv;

	memcpy(A, iv, 8);

	for (j = 0; j < 6; j++) {
		R = out + 8;
		for (i = 0; i < inlen; i += 8, t++, R += 8) {
			memcpy(B + 8, R, 8);
			AES_encrypt(B, B, key);
			A[7] ^= (unsigned char)(t & 0xff);
			if (t > 0xff) {
				A[6] ^= (unsigned char)((t >> 8) & 0xff);
				A[5] ^= (unsigned char)((t >> 16) & 0xff);
				A[4] ^= (unsigned char)((t >> 24) & 0xff);
			}
			memcpy(R, B + 8, 8);
		}
	}
	memcpy(out, A, 8);
	return inlen + 8;
}
LCRYPTO_ALIAS(AES_wrap_key);

int
AES_unwrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out,
    const unsigned char *in, unsigned int inlen)
{
	unsigned char *A, B[16], *R;
	unsigned int i, j, t;

	if ((inlen & 0x7) || (inlen < 24))
		return -1;
	inlen -= 8;
	A = B;
	t = 6 * (inlen >> 3);
	memcpy(A, in, 8);
	memmove(out, in + 8, inlen);
	for (j = 0; j < 6; j++) {
		R = out + inlen - 8;
		for (i = 0; i < inlen; i += 8, t--, R -= 8) {
			A[7] ^= (unsigned char)(t & 0xff);
			if (t > 0xff) {
				A[6] ^= (unsigned char)((t >> 8) & 0xff);
				A[5] ^= (unsigned char)((t >> 16) & 0xff);
				A[4] ^= (unsigned char)((t >> 24) & 0xff);
			}
			memcpy(B + 8, R, 8);
			AES_decrypt(B, B, key);
			memcpy(R, B + 8, 8);
		}
	}
	if (!iv)
		iv = aes_wrap_default_iv;
	if (memcmp(A, iv, 8)) {
		explicit_bzero(out, inlen);
		return 0;
	}
	return inlen;
}
LCRYPTO_ALIAS(AES_unwrap_key);
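[Editor's note: the wrap/unwrap pair above implements RFC 3394 AES key wrap with the
default 0xA6... IV. A minimal usage sketch against this API, not part of the diff; the
sample values follow RFC 3394's first test vector:]

#include <stdio.h>
#include <string.h>

#include <openssl/aes.h>

int
main(void)
{
	/* 128-bit KEK and 16 bytes of key material to wrap. */
	static const unsigned char kek[16] = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	};
	static const unsigned char kdata[16] = {
		0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
		0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
	};
	unsigned char wrapped[sizeof(kdata) + 8], unwrapped[sizeof(kdata)];
	AES_KEY wkey, uwkey;

	/* Wrapping uses the encrypt schedule, unwrapping the decrypt one. */
	AES_set_encrypt_key(kek, 128, &wkey);
	if (AES_wrap_key(&wkey, NULL, wrapped, kdata, sizeof(kdata)) !=
	    sizeof(wrapped))
		return 1;

	AES_set_decrypt_key(kek, 128, &uwkey);
	if (AES_unwrap_key(&uwkey, NULL, unwrapped, wrapped,
	    sizeof(wrapped)) != sizeof(unwrapped))
		return 1;	/* 0 here means the IV integrity check failed */

	printf("round trip ok: %d\n",
	    memcmp(kdata, unwrapped, sizeof(kdata)) == 0);
	return 0;
}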
diff --git a/src/lib/libcrypto/aes/aes.h b/src/lib/libcrypto/aes/aes.h
deleted file mode 100644
index 8903a8ef8d..0000000000
--- a/src/lib/libcrypto/aes/aes.h
+++ /dev/null
@@ -1,120 +0,0 @@
/* $OpenBSD: aes.h,v 1.16 2025/01/25 17:59:44 tb Exp $ */
/* ====================================================================
 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 *
 */

#ifndef HEADER_AES_H
#define HEADER_AES_H

#include <openssl/opensslconf.h>

#include <stddef.h>

#define AES_ENCRYPT	1
#define AES_DECRYPT	0

/* Because an array size can't be a const in C, the following two are macros.
   Both sizes are in bytes. */
#define AES_MAXNR 14
#define AES_BLOCK_SIZE 16

#ifdef __cplusplus
extern "C" {
#endif

/* This should be a hidden type, but EVP requires that the size be known */
struct aes_key_st {
	unsigned int rd_key[4 * (AES_MAXNR + 1)];
	int rounds;
};
typedef struct aes_key_st AES_KEY;

int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
    AES_KEY *key);
int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
    AES_KEY *key);

void AES_encrypt(const unsigned char *in, unsigned char *out,
    const AES_KEY *key);
void AES_decrypt(const unsigned char *in, unsigned char *out,
    const AES_KEY *key);

void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
    const AES_KEY *key, const int enc);
void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char *ivec, const int enc);
void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char *ivec, int *num,
    const int enc);
void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char *ivec, int *num,
    const int enc);
void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char *ivec, int *num,
    const int enc);
void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char *ivec, int *num);
void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char ivec[AES_BLOCK_SIZE],
    unsigned char ecount_buf[AES_BLOCK_SIZE], unsigned int *num);
/* NB: the IV is _two_ blocks long */
void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char *ivec, const int enc);

int AES_wrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out,
    const unsigned char *in, unsigned int inlen);
int AES_unwrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out,
    const unsigned char *in, unsigned int inlen);


#ifdef __cplusplus
}
#endif

#endif /* !HEADER_AES_H */
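[Editor's note: a minimal consumer sketch of the raw-block API declared above,
not part of the diff; the all-zero key is a placeholder:]

#include <string.h>

#include <openssl/aes.h>

int
main(void)
{
	static const unsigned char key[16] = { 0 };	/* placeholder key */
	static const unsigned char pt[AES_BLOCK_SIZE] = "sixteen byte msg";
	unsigned char ct[AES_BLOCK_SIZE], rt[AES_BLOCK_SIZE];
	AES_KEY enc, dec;

	/* Both schedules derive from the same raw key bytes. */
	if (AES_set_encrypt_key(key, 128, &enc) != 0 ||
	    AES_set_decrypt_key(key, 128, &dec) != 0)
		return 1;

	AES_encrypt(pt, ct, &enc);	/* one 16-byte block, ECB-style */
	AES_decrypt(ct, rt, &dec);

	return memcmp(pt, rt, AES_BLOCK_SIZE) != 0;
}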
diff --git a/src/lib/libcrypto/aes/aes_core.c b/src/lib/libcrypto/aes/aes_core.c
deleted file mode 100644
index 4383d74903..0000000000
--- a/src/lib/libcrypto/aes/aes_core.c
+++ /dev/null
@@ -1,1229 +0,0 @@
/* $OpenBSD: aes_core.c,v 1.25 2024/11/13 21:00:57 tb Exp $ */
/**
 * rijndael-alg-fst.c
 *
 * @version 3.0 (December 2000)
 *
 * Optimised ANSI C code for the Rijndael cipher (now AES)
 *
 * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
 * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
 * @author Paulo Barreto <paulo.barreto@terra.com.br>
 *
 * This code is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Note: rewritten a little bit to provide error control and an OpenSSL-
 * compatible API.
 */

#include <stdlib.h>

#include <openssl/aes.h>

#include "aes_local.h"
#include "crypto_arch.h"
#include "crypto_internal.h"

/*
Te0[x] = S [x].[02, 01, 01, 03];
Te1[x] = S [x].[03, 02, 01, 01];
Te2[x] = S [x].[01, 03, 02, 01];
Te3[x] = S [x].[01, 01, 03, 02];

Td0[x] = Si[x].[0e, 09, 0d, 0b];
Td1[x] = Si[x].[0b, 0e, 09, 0d];
Td2[x] = Si[x].[0d, 0b, 0e, 09];
Td3[x] = Si[x].[09, 0d, 0b, 0e];
Td4[x] = Si[x].[01];
*/

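/*
 * Editor's sketch, not part of the original file: with the AES S-box in a
 * hypothetical table sbox[256], each Te0 entry below is S[x] multiplied by
 * the column [02, 01, 01, 03] in GF(2^8), per the comment above; the other
 * Te tables are byte rotations of Te0.
 */
static u32
te0_entry(const u8 sbox[256], u8 x)
{
	u8 s = sbox[x];
	u8 s2 = (s << 1) ^ ((s >> 7) * 0x1b);	/* xtime: multiply by 02 */

	/* bytes, high to low: 02.S[x], S[x], S[x], 03.S[x] (03 = 02 ^ 01) */
	return ((u32)s2 << 24) | ((u32)s << 16) | ((u32)s << 8) |
	    (u32)(s2 ^ s);
}
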
#if !defined(HAVE_AES_SET_ENCRYPT_KEY_INTERNAL) || \
    !defined(HAVE_AES_SET_DECRYPT_KEY_INTERNAL) || \
    !defined(HAVE_AES_ENCRYPT_INTERNAL) || \
    !defined(HAVE_AES_DECRYPT_INTERNAL)
static const u32 Te0[256] = {
	0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
	0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
	0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
	0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
	0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
	0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
	0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
	0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
	0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
	0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
	0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
	0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
	0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
	0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
	0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
	0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
	0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
	0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
	0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
	0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
	0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
	0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
	0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
	0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
	0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
	0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
	0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
	0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
	0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
	0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
	0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
	0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
	0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
	0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
	0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
	0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
	0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
	0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
	0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
	0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
	0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
	0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
	0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
	0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
	0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
	0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
	0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
	0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
	0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
	0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
	0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
	0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
	0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
	0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
	0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
	0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
	0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
	0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
	0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
	0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
	0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
	0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
	0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
	0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
};
static const u32 Te1[256] = {
	0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
	0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
	0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
	0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
	0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
	0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
	0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
	0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
	0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
	0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
	0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
	0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
	0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
	0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
	0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
	0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
	0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
	0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
	0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
	0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
	0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
	0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
	0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
	0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
	0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
	0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
	0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
	0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
	0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
	0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
	0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
	0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
	0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
	0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
	0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
	0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
	0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
	0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
	0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
	0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
	0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
	0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
	0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
	0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
	0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
	0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
	0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
	0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
	0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
	0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
	0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
	0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
	0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
	0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
	0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
	0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
	0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
	0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
	0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
	0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
	0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
	0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
	0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
	0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
};
static const u32 Te2[256] = {
	0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
	0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
	0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
	0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
	0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
	0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
	0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
	0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
	0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
	0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
	0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
	0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
	0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
	0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
	0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
	0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
	0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
	0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
	0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
	0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
	0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
	0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
	0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
	0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
	0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
	0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
	0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
	0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
	0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
	0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
	0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
	0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
	0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
	0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
	0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
	0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
	0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
	0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
	0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
	0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
	0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
	0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
	0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
	0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
	0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
	0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
	0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
	0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
	0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
	0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
	0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
	0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
	0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
	0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
	0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
	0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
	0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
	0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
	0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
	0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
	0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
	0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
	0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
	0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
};
static const u32 Te3[256] = {
	0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
	0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
	0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
	0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
	0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
	0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
	0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
	0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
	0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
	0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
	0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
	0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
	0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
	0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
	0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
	0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
	0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
	0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
	0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
	0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
	0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
	0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
	0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
	0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
	0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
	0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
	0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
	0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
	0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
	0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
	0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
	0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
	0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
	0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
	0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
	0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
	0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
	0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
	0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
	0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
	0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
	0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
	0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
	0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
	0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
	0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
	0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
	0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
	0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
	0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
	0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
	0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
	0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
	0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
	0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
	0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
	0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
	0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
	0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
	0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
	0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
	0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
	0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
	0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
};

static const u32 Td0[256] = {
	0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
	0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
	0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
	0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
	0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
	0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
	0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
	0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
	0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
	0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
	0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
	0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
	0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
	0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
	0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
	0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
	0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
	0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
	0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
	0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
	0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
	0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
	0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
	0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
	0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
	0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
	0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
	0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
	0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
	0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
	0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
	0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
	0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
	0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
	0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
	0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
	0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
	0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
	0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
	0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
	0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
	0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
	0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
	0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
	0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
	0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
	0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
	0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
	0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
	0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
	0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
	0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
	0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
	0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
	0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
	0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
	0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
	0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
	0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
	0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
	0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
	0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
	0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
	0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
};
static const u32 Td1[256] = {
	0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
	0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
	0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
	0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
	0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
	0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
	0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
	0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
	0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
	0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
	0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
	0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
	0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
	0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
	0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
	0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
	0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
	0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
	0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
	0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
	0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
	0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
	0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
	0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
	0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
	0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
	0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
	0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
	0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
	0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
	0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
	0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
	0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
	0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
	0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
	0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
	0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
	0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
	0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
	0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
	0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
	0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
	0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
	0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
	0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
	0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
	0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
	0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
	0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
	0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
	0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
	0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
	0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
	0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
	0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
	0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
	0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
	0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
	0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
	0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
	0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
	0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
	0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
	0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
};
static const u32 Td2[256] = {
	0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
	0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
	0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
	0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
	0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
	0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
	0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
	0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
	0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
	0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
	0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
	0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
	0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
	0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
	0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
	0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
	0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
	0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
	0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
	0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
	0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
	0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
	0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
	0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
	0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
	0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
	0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
	0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
	0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
	0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
	0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
	0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
	0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
	0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
	0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
	0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
	0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
	0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
	0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
	0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
	0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
	0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
	0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
	0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
	0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
	0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
	0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
	0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
	0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
	0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
	0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
	0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
	0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
	0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
	0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
	0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
	0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
	0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
	0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
	0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
	0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
	0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
	0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
	0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
};
static const u32 Td3[256] = {
	0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
	0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
	0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
	0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
	0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
	0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
	0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
	0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
	0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
	0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
	0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
	0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
	0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
	0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
	0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
	0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
	0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
	0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
	0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
	0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
	0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
	0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
	0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
	0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
	0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
	0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
	0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
	0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
	0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
	0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
	0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
	0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
	0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
	0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
	0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
	0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
	0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
	0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
	0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
	0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
	0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
	0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
	0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
	0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
	0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
	0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
	0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
	0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
	0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
	0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
	0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
	0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
	0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
	0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
	0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
	0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
	0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
	0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
	0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
	0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
	0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
	0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
	0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
	0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
};
#endif

#if !defined(HAVE_AES_ENCRYPT_INTERNAL) || \
    !defined(HAVE_AES_DECRYPT_INTERNAL)
static const u8 Td4[256] = {
	0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
	0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
	0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
	0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
	0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
	0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
	0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
	0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
	0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
	0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
	0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
	0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
	0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
	0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
	0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
	0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
	0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
	0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
	0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
	0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
	0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
	0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
	0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
	0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
	0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
	0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
	0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
	0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
	0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
	0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
	0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
	0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU,
};
#endif

#if !defined(HAVE_AES_SET_ENCRYPT_KEY_INTERNAL) || \
    !defined(HAVE_AES_SET_DECRYPT_KEY_INTERNAL)
static const u32 rcon[] = {
	0x01000000, 0x02000000, 0x04000000, 0x08000000,
	0x10000000, 0x20000000, 0x40000000, 0x80000000,
	0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};
#endif
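
/*
 * Editor's sketch, not part of the original file: the rcon values are
 * successive powers of 02 in GF(2^8), placed in the high byte, so the
 * table above can be regenerated with xtime.
 */
static void
gen_rcon(u32 rc[10])
{
	u8 r = 0x01;
	int i;

	for (i = 0; i < 10; i++) {
		rc[i] = (u32)r << 24;
		r = (r << 1) ^ ((r >> 7) * 0x1b);	/* xtime: 0x80 -> 0x1b */
	}
}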

#ifdef HAVE_AES_SET_ENCRYPT_KEY_INTERNAL
int aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits,
    AES_KEY *key);

#else

/*
 * Expand the cipher key into the encryption key schedule.
 */
static inline int
aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits,
    AES_KEY *key)
{
	u32 *rk;
	int i = 0;
	u32 temp;

	if (!userKey || !key)
		return -1;
	if (bits != 128 && bits != 192 && bits != 256)
		return -2;

	rk = key->rd_key;

	if (bits == 128)
		key->rounds = 10;
	else if (bits == 192)
		key->rounds = 12;
	else
		key->rounds = 14;

	rk[0] = crypto_load_be32toh(&userKey[0 * 4]);
	rk[1] = crypto_load_be32toh(&userKey[1 * 4]);
	rk[2] = crypto_load_be32toh(&userKey[2 * 4]);
	rk[3] = crypto_load_be32toh(&userKey[3 * 4]);
	if (bits == 128) {
		while (1) {
			temp = rk[3];
			rk[4] = rk[0] ^
			    (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
			    (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
			    (Te0[(temp) & 0xff] & 0x0000ff00) ^
			    (Te1[(temp >> 24)] & 0x000000ff) ^
			    rcon[i];
			rk[5] = rk[1] ^ rk[4];
			rk[6] = rk[2] ^ rk[5];
			rk[7] = rk[3] ^ rk[6];
			if (++i == 10) {
				return 0;
			}
			rk += 4;
		}
	}
	rk[4] = crypto_load_be32toh(&userKey[4 * 4]);
	rk[5] = crypto_load_be32toh(&userKey[5 * 4]);
	if (bits == 192) {
		while (1) {
			temp = rk[5];
			rk[6] = rk[0] ^
			    (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
			    (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
			    (Te0[(temp) & 0xff] & 0x0000ff00) ^
			    (Te1[(temp >> 24)] & 0x000000ff) ^
			    rcon[i];
			rk[7] = rk[1] ^ rk[6];
			rk[8] = rk[2] ^ rk[7];
			rk[9] = rk[3] ^ rk[8];
			if (++i == 8) {
				return 0;
			}
			rk[10] = rk[4] ^ rk[9];
			rk[11] = rk[5] ^ rk[10];
			rk += 6;
		}
	}
	rk[6] = crypto_load_be32toh(&userKey[6 * 4]);
	rk[7] = crypto_load_be32toh(&userKey[7 * 4]);
	if (bits == 256) {
		while (1) {
			temp = rk[7];
			rk[8] = rk[0] ^
			    (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
			    (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
			    (Te0[(temp) & 0xff] & 0x0000ff00) ^
			    (Te1[(temp >> 24)] & 0x000000ff) ^
			    rcon[i];
			rk[9] = rk[1] ^ rk[8];
			rk[10] = rk[2] ^ rk[9];
			rk[11] = rk[3] ^ rk[10];
			if (++i == 7) {
				return 0;
			}
			temp = rk[11];
			rk[12] = rk[4] ^
			    (Te2[(temp >> 24)] & 0xff000000) ^
			    (Te3[(temp >> 16) & 0xff] & 0x00ff0000) ^
			    (Te0[(temp >> 8) & 0xff] & 0x0000ff00) ^
			    (Te1[(temp) & 0xff] & 0x000000ff);
			rk[13] = rk[5] ^ rk[12];
			rk[14] = rk[6] ^ rk[13];
			rk[15] = rk[7] ^ rk[14];

			rk += 8;
		}
	}
	return 0;
}
#endif
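
/*
 * Editor's note, not part of the original file: the schedule built above
 * holds 4 * (rounds + 1) 32-bit words -- 44, 52 or 60 for 128-, 192- and
 * 256-bit keys -- which is why aes_key_st reserves 4 * (AES_MAXNR + 1) = 60.
 */
static inline int
aes_schedule_words(const AES_KEY *key)
{
	return 4 * (key->rounds + 1);	/* 44, 52 or 60 words */
}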

int
AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
{
	return aes_set_encrypt_key_internal(userKey, bits, key);
}
LCRYPTO_ALIAS(AES_set_encrypt_key);

#ifdef HAVE_AES_SET_DECRYPT_KEY_INTERNAL
int aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits,
    AES_KEY *key);

#else
/*
 * Expand the cipher key into the decryption key schedule.
 */
static inline int
aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits,
    AES_KEY *key)
{
	u32 *rk;
	int i, j, status;
	u32 temp;

	/* first, start with an encryption schedule */
	status = AES_set_encrypt_key(userKey, bits, key);
	if (status < 0)
		return status;

	rk = key->rd_key;

	/* invert the order of the round keys: */
	for (i = 0, j = 4 * (key->rounds); i < j; i += 4, j -= 4) {
		temp = rk[i];
		rk[i] = rk[j];
		rk[j] = temp;
		temp = rk[i + 1];
		rk[i + 1] = rk[j + 1];
		rk[j + 1] = temp;
		temp = rk[i + 2];
		rk[i + 2] = rk[j + 2];
		rk[j + 2] = temp;
		temp = rk[i + 3];
		rk[i + 3] = rk[j + 3];
		rk[j + 3] = temp;
	}
	/* apply the inverse MixColumn transform to all round keys but the first and the last: */
	for (i = 1; i < (key->rounds); i++) {
		rk += 4;
		rk[0] =
		    Td0[Te1[(rk[0] >> 24)] & 0xff] ^
		    Td1[Te1[(rk[0] >> 16) & 0xff] & 0xff] ^
		    Td2[Te1[(rk[0] >> 8) & 0xff] & 0xff] ^
		    Td3[Te1[(rk[0]) & 0xff] & 0xff];
		rk[1] =
		    Td0[Te1[(rk[1] >> 24)] & 0xff] ^
		    Td1[Te1[(rk[1] >> 16) & 0xff] & 0xff] ^
		    Td2[Te1[(rk[1] >> 8) & 0xff] & 0xff] ^
		    Td3[Te1[(rk[1]) & 0xff] & 0xff];
		rk[2] =
		    Td0[Te1[(rk[2] >> 24)] & 0xff] ^
		    Td1[Te1[(rk[2] >> 16) & 0xff] & 0xff] ^
		    Td2[Te1[(rk[2] >> 8) & 0xff] & 0xff] ^
		    Td3[Te1[(rk[2]) & 0xff] & 0xff];
		rk[3] =
		    Td0[Te1[(rk[3] >> 24)] & 0xff] ^
		    Td1[Te1[(rk[3] >> 16) & 0xff] & 0xff] ^
		    Td2[Te1[(rk[3] >> 8) & 0xff] & 0xff] ^
		    Td3[Te1[(rk[3]) & 0xff] & 0xff];
	}
	return 0;
}
#endif

int
AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
{
	return aes_set_decrypt_key_internal(userKey, bits, key);
}
LCRYPTO_ALIAS(AES_set_decrypt_key);
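
/*
 * Editor's note, not part of the original file: the table composition in
 * the loop above works because the low byte of Te1[x] is 01.S[x] = S[x],
 * so Td0[Te1[x] & 0xff] = Td0[S[x]] = Si[S[x]].[0e, 09, 0d, 0b] =
 * x.[0e, 09, 0d, 0b] -- exactly the InvMixColumns contribution of byte x,
 * with no separate S-box table needed.
 */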

#ifdef HAVE_AES_ENCRYPT_INTERNAL
void aes_encrypt_internal(const unsigned char *in, unsigned char *out,
    const AES_KEY *key);

#else
/*
 * Encrypt a single block - in and out can overlap.
 */
static inline void
aes_encrypt_internal(const unsigned char *in, unsigned char *out,
    const AES_KEY *key)
{
	const u32 *rk;
	u32 s0, s1, s2, s3, t0, t1, t2, t3;
#ifndef FULL_UNROLL
	int r;
#endif /* ?FULL_UNROLL */

	rk = key->rd_key;

	/*
	 * map byte array block to cipher state
	 * and add initial round key:
	 */
	s0 = crypto_load_be32toh(&in[0 * 4]) ^ rk[0];
	s1 = crypto_load_be32toh(&in[1 * 4]) ^ rk[1];
	s2 = crypto_load_be32toh(&in[2 * 4]) ^ rk[2];
	s3 = crypto_load_be32toh(&in[3 * 4]) ^ rk[3];
#ifdef FULL_UNROLL
	/* round 1: */
	t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
	t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
	t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
	t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
	/* round 2: */
	s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
	s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
	s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
	s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
	/* round 3: */
	t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
	t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
	t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
	t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
	/* round 4: */
	s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
	s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
	s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
	s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
	/* round 5: */
	t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
	t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
	t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
	t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
	/* round 6: */
	s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
	s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
	s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
	s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
	/* round 7: */
	t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
	t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
	t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
	t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
	/* round 8: */
	s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
	s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
	s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
	s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
	/* round 9: */
	t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
	t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
	t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
	t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
	if (key->rounds > 10) {
		/* round 10: */
		s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
		s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
		s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
		s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
		/* round 11: */
		t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
		t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
		t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
		t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
		if (key->rounds > 12) {
			/* round 12: */
			s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
			s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
			s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
			s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
			/* round 13: */
			t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
			t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
			t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
			t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
		}
	}
	rk += key->rounds << 2;
#else /* !FULL_UNROLL */
	/*
	 * Nr - 1 full rounds:
	 */
	r = key->rounds >> 1;
	for (;;) {
		t0 =
		    Te0[(s0 >> 24)] ^
		    Te1[(s1 >> 16) & 0xff] ^
		    Te2[(s2 >> 8) & 0xff] ^
		    Te3[(s3) & 0xff] ^
		    rk[4];
		t1 =
		    Te0[(s1 >> 24)] ^
		    Te1[(s2 >> 16) & 0xff] ^
		    Te2[(s3 >> 8) & 0xff] ^
		    Te3[(s0) & 0xff] ^
		    rk[5];
		t2 =
		    Te0[(s2 >> 24)] ^
		    Te1[(s3 >> 16) & 0xff] ^
		    Te2[(s0 >> 8) & 0xff] ^
		    Te3[(s1) & 0xff] ^
		    rk[6];
		t3 =
		    Te0[(s3 >> 24)] ^
		    Te1[(s0 >> 16) & 0xff] ^
		    Te2[(s1 >> 8) & 0xff] ^
		    Te3[(s2) & 0xff] ^
		    rk[7];

		rk += 8;
		if (--r == 0) {
			break;
		}

		s0 =
		    Te0[(t0 >> 24)] ^
		    Te1[(t1 >> 16) & 0xff] ^
		    Te2[(t2 >> 8) & 0xff] ^
		    Te3[(t3) & 0xff] ^
		    rk[0];
		s1 =
		    Te0[(t1 >> 24)] ^
		    Te1[(t2 >> 16) & 0xff] ^
		    Te2[(t3 >> 8) & 0xff] ^
		    Te3[(t0) & 0xff] ^
		    rk[1];
		s2 =
		    Te0[(t2 >> 24)] ^
		    Te1[(t3 >> 16) & 0xff] ^
		    Te2[(t0 >> 8) & 0xff] ^
		    Te3[(t1) & 0xff] ^
		    rk[2];
		s3 =
		    Te0[(t3 >> 24)] ^
		    Te1[(t0 >> 16) & 0xff] ^
		    Te2[(t1 >> 8) & 0xff] ^
		    Te3[(t2) & 0xff] ^
		    rk[3];
	}
#endif /* ?FULL_UNROLL */
	/*
	 * apply last round and
	 * map cipher state to byte array block:
	 */
	s0 =
	    (Te2[(t0 >> 24)] & 0xff000000) ^
	    (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^
	    (Te0[(t2 >> 8) & 0xff] & 0x0000ff00) ^
	    (Te1[(t3) & 0xff] & 0x000000ff) ^
	    rk[0];
	crypto_store_htobe32(&out[0 * 4], s0);
	s1 =
	    (Te2[(t1 >> 24)] & 0xff000000) ^
	    (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^
	    (Te0[(t3 >> 8) & 0xff] & 0x0000ff00) ^
1001 (Te1[(t0) & 0xff] & 0x000000ff) ^
1002 rk[1];
1003 crypto_store_htobe32(&out[1 * 4], s1);
1004 s2 =
1005 (Te2[(t2 >> 24)] & 0xff000000) ^
1006 (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^
1007 (Te0[(t0 >> 8) & 0xff] & 0x0000ff00) ^
1008 (Te1[(t1) & 0xff] & 0x000000ff) ^
1009 rk[2];
1010 crypto_store_htobe32(&out[2 * 4], s2);
1011 s3 =
1012 (Te2[(t3 >> 24)] & 0xff000000) ^
1013 (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^
1014 (Te0[(t1 >> 8) & 0xff] & 0x0000ff00) ^
1015 (Te1[(t2) & 0xff] & 0x000000ff) ^
1016 rk[3];
1017 crypto_store_htobe32(&out[3 * 4], s3);
1018}
1019#endif
1020
1021void
1022AES_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
1023{
1024 aes_encrypt_internal(in, out, key);
1025}
1026LCRYPTO_ALIAS(AES_encrypt);
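
A minimal caller sketch for reference (not part of the deleted file): key and
plaintext bytes are illustrative, and only the public <openssl/aes.h> API
wrapped above is assumed.

#include <stdio.h>
#include <openssl/aes.h>

int
main(void)
{
	/* 16 bytes exactly; the string's NUL is not stored. */
	static const unsigned char key_bytes[16] = "0123456789abcdef";
	unsigned char in[AES_BLOCK_SIZE] = "one 16-byte blk";	/* 15 chars + NUL */
	unsigned char out[AES_BLOCK_SIZE];
	AES_KEY key;
	size_t i;

	/* Expand the user key into the schedule aes_encrypt_internal() consumes. */
	if (AES_set_encrypt_key(key_bytes, 128, &key) != 0)
		return 1;
	AES_encrypt(in, out, &key);	/* single block; in and out may overlap */
	for (i = 0; i < sizeof(out); i++)
		printf("%02x", out[i]);
	printf("\n");
	return 0;
}
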
1027
1028#ifdef HAVE_AES_DECRYPT_INTERNAL
1029void aes_decrypt_internal(const unsigned char *in, unsigned char *out,
1030 const AES_KEY *key);
1031
1032#else
1033/*
1034 * Decrypt a single block - in and out can overlap.
1035 */
1036static inline void
1037aes_decrypt_internal(const unsigned char *in, unsigned char *out,
1038 const AES_KEY *key)
1039{
1040 const u32 *rk;
1041 u32 s0, s1, s2, s3, t0, t1, t2, t3;
1042#ifndef FULL_UNROLL
1043 int r;
1044#endif /* ?FULL_UNROLL */
1045
1046 rk = key->rd_key;
1047
1048 /*
1049 * map byte array block to cipher state
1050 * and add initial round key:
1051 */
1052 s0 = crypto_load_be32toh(&in[0 * 4]) ^ rk[0];
1053 s1 = crypto_load_be32toh(&in[1 * 4]) ^ rk[1];
1054 s2 = crypto_load_be32toh(&in[2 * 4]) ^ rk[2];
1055 s3 = crypto_load_be32toh(&in[3 * 4]) ^ rk[3];
1056#ifdef FULL_UNROLL
1057 /* round 1: */
1058 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
1059 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
1060 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
1061 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
1062 /* round 2: */
1063 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
1064 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
1065 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
1066 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
1067 /* round 3: */
1068 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
1069 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
1070 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
1071 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
1072 /* round 4: */
1073 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
1074 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
1075 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
1076 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
1077 /* round 5: */
1078 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
1079 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
1080 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
1081 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
1082 /* round 6: */
1083 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
1084 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
1085 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
1086 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
1087 /* round 7: */
1088 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
1089 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
1090 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
1091 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
1092 /* round 8: */
1093 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
1094 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
1095 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
1096 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
1097 /* round 9: */
1098 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
1099 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
1100 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
1101 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
1102 if (key->rounds > 10) {
1103 /* round 10: */
1104 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
1105 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
1106 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
1107 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
1108 /* round 11: */
1109 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
1110 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
1111 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
1112 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
1113 if (key->rounds > 12) {
1114 /* round 12: */
1115 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
1116 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
1117 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
1118 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
1119 /* round 13: */
1120 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
1121 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
1122 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
1123 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
1124 }
1125 }
1126 rk += key->rounds << 2;
1127#else /* !FULL_UNROLL */
1128 /*
1129 * Nr - 1 full rounds:
1130 */
1131 r = key->rounds >> 1;
1132 for (;;) {
1133 t0 =
1134 Td0[(s0 >> 24)] ^
1135 Td1[(s3 >> 16) & 0xff] ^
1136 Td2[(s2 >> 8) & 0xff] ^
1137 Td3[(s1) & 0xff] ^
1138 rk[4];
1139 t1 =
1140 Td0[(s1 >> 24)] ^
1141 Td1[(s0 >> 16) & 0xff] ^
1142 Td2[(s3 >> 8) & 0xff] ^
1143 Td3[(s2) & 0xff] ^
1144 rk[5];
1145 t2 =
1146 Td0[(s2 >> 24)] ^
1147 Td1[(s1 >> 16) & 0xff] ^
1148 Td2[(s0 >> 8) & 0xff] ^
1149 Td3[(s3) & 0xff] ^
1150 rk[6];
1151 t3 =
1152 Td0[(s3 >> 24)] ^
1153 Td1[(s2 >> 16) & 0xff] ^
1154 Td2[(s1 >> 8) & 0xff] ^
1155 Td3[(s0) & 0xff] ^
1156 rk[7];
1157
1158 rk += 8;
1159 if (--r == 0) {
1160 break;
1161 }
1162
1163 s0 =
1164 Td0[(t0 >> 24)] ^
1165 Td1[(t3 >> 16) & 0xff] ^
1166 Td2[(t2 >> 8) & 0xff] ^
1167 Td3[(t1) & 0xff] ^
1168 rk[0];
1169 s1 =
1170 Td0[(t1 >> 24)] ^
1171 Td1[(t0 >> 16) & 0xff] ^
1172 Td2[(t3 >> 8) & 0xff] ^
1173 Td3[(t2) & 0xff] ^
1174 rk[1];
1175 s2 =
1176 Td0[(t2 >> 24)] ^
1177 Td1[(t1 >> 16) & 0xff] ^
1178 Td2[(t0 >> 8) & 0xff] ^
1179 Td3[(t3) & 0xff] ^
1180 rk[2];
1181 s3 =
1182 Td0[(t3 >> 24)] ^
1183 Td1[(t2 >> 16) & 0xff] ^
1184 Td2[(t1 >> 8) & 0xff] ^
1185 Td3[(t0) & 0xff] ^
1186 rk[3];
1187 }
1188#endif /* ?FULL_UNROLL */
1189 /*
1190 * apply last round and
1191 * map cipher state to byte array block:
1192 */
1193 s0 =
1194 (((uint32_t)Td4[(t0 >> 24)]) << 24) ^
1195 (Td4[(t3 >> 16) & 0xff] << 16) ^
1196 (Td4[(t2 >> 8) & 0xff] << 8) ^
1197 (Td4[(t1) & 0xff]) ^
1198 rk[0];
1199 crypto_store_htobe32(&out[0 * 4], s0);
1200 s1 =
1201 (((uint32_t)Td4[(t1 >> 24)]) << 24) ^
1202 (Td4[(t0 >> 16) & 0xff] << 16) ^
1203 (Td4[(t3 >> 8) & 0xff] << 8) ^
1204 (Td4[(t2) & 0xff]) ^
1205 rk[1];
1206 crypto_store_htobe32(&out[1 * 4], s1);
1207 s2 =
1208 (((uint32_t)Td4[(t2 >> 24)]) << 24) ^
1209 (Td4[(t1 >> 16) & 0xff] << 16) ^
1210 (Td4[(t0 >> 8) & 0xff] << 8) ^
1211 (Td4[(t3) & 0xff]) ^
1212 rk[2];
1213 crypto_store_htobe32(&out[2 * 4], s2);
1214 s3 =
1215 (((uint32_t)Td4[(t3 >> 24)]) << 24) ^
1216 (Td4[(t2 >> 16) & 0xff] << 16) ^
1217 (Td4[(t1 >> 8) & 0xff] << 8) ^
1218 (Td4[(t0) & 0xff]) ^
1219 rk[3];
1220 crypto_store_htobe32(&out[3 * 4], s3);
1221}
1222#endif
1223
1224void
1225AES_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
1226{
1227 aes_decrypt_internal(in, out, key);
1228}
1229LCRYPTO_ALIAS(AES_decrypt);
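
And the matching decrypt side, as a round-trip sketch (again illustrative, not
part of the deleted file): note that decryption needs its own key schedule,
expanded with AES_set_decrypt_key() rather than AES_set_encrypt_key().

#include <assert.h>
#include <string.h>
#include <openssl/aes.h>

int
main(void)
{
	static const unsigned char key_bytes[16] = "0123456789abcdef";
	unsigned char buf[AES_BLOCK_SIZE] = "one 16-byte blk";
	unsigned char tmp[AES_BLOCK_SIZE];
	AES_KEY enc_key, dec_key;

	assert(AES_set_encrypt_key(key_bytes, 128, &enc_key) == 0);
	assert(AES_set_decrypt_key(key_bytes, 128, &dec_key) == 0);

	AES_encrypt(buf, tmp, &enc_key);
	AES_decrypt(tmp, tmp, &dec_key);	/* in-place: in and out overlap */
	assert(memcmp(buf, tmp, AES_BLOCK_SIZE) == 0);
	return 0;
}
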
diff --git a/src/lib/libcrypto/aes/aes_ige.c b/src/lib/libcrypto/aes/aes_ige.c
deleted file mode 100644
index 1a6fcfcfbf..0000000000
--- a/src/lib/libcrypto/aes/aes_ige.c
+++ /dev/null
@@ -1,195 +0,0 @@
1/* $OpenBSD: aes_ige.c,v 1.10 2024/03/30 05:14:12 joshua Exp $ */
2/* ====================================================================
3 * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/crypto.h>
54
55#include "aes_local.h"
56
57#define N_WORDS (AES_BLOCK_SIZE / sizeof(unsigned long))
58typedef struct {
59 unsigned long data[N_WORDS];
60} aes_block_t;
61
62/* XXX: probably some better way to do this */
63#if defined(__i386__) || defined(__x86_64__)
64#define UNALIGNED_MEMOPS_ARE_FAST 1
65#else
66#define UNALIGNED_MEMOPS_ARE_FAST 0
67#endif
68
69#if UNALIGNED_MEMOPS_ARE_FAST
70#define load_block(d, s) (d) = *(const aes_block_t *)(s)
71#define store_block(d, s) *(aes_block_t *)(d) = (s)
72#else
73#define load_block(d, s) memcpy((d).data, (s), AES_BLOCK_SIZE)
74#define store_block(d, s) memcpy((d), (s).data, AES_BLOCK_SIZE)
75#endif
76
77/* N.B. The IV for this mode is _twice_ the block size */
78
79void
80AES_ige_encrypt(const unsigned char *in, unsigned char *out, size_t length,
81 const AES_KEY *key, unsigned char *ivec, const int enc)
82{
83 size_t n;
84 size_t len;
85
86 OPENSSL_assert((length % AES_BLOCK_SIZE) == 0);
87
88 len = length / AES_BLOCK_SIZE;
89
90 if (AES_ENCRYPT == enc) {
91 if (in != out && (UNALIGNED_MEMOPS_ARE_FAST ||
92 ((size_t)in|(size_t)out|(size_t)ivec) %
93 sizeof(long) == 0)) {
94 aes_block_t *ivp = (aes_block_t *)ivec;
95 aes_block_t *iv2p = (aes_block_t *)(ivec + AES_BLOCK_SIZE);
96
97 while (len) {
98 aes_block_t *inp = (aes_block_t *)in;
99 aes_block_t *outp = (aes_block_t *)out;
100
101 for (n = 0; n < N_WORDS; ++n)
102 outp->data[n] = inp->data[n] ^ ivp->data[n];
103 AES_encrypt((unsigned char *)outp->data, (unsigned char *)outp->data, key);
104 for (n = 0; n < N_WORDS; ++n)
105 outp->data[n] ^= iv2p->data[n];
106 ivp = outp;
107 iv2p = inp;
108 --len;
109 in += AES_BLOCK_SIZE;
110 out += AES_BLOCK_SIZE;
111 }
112 memmove(ivec, ivp->data, AES_BLOCK_SIZE);
113 memmove(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
114 } else {
115 aes_block_t tmp, tmp2;
116 aes_block_t iv;
117 aes_block_t iv2;
118
119 load_block(iv, ivec);
120 load_block(iv2, ivec + AES_BLOCK_SIZE);
121
122 while (len) {
123 load_block(tmp, in);
124 for (n = 0; n < N_WORDS; ++n)
125 tmp2.data[n] = tmp.data[n] ^ iv.data[n];
126 AES_encrypt((unsigned char *)tmp2.data,
127 (unsigned char *)tmp2.data, key);
128 for (n = 0; n < N_WORDS; ++n)
129 tmp2.data[n] ^= iv2.data[n];
130 store_block(out, tmp2);
131 iv = tmp2;
132 iv2 = tmp;
133 --len;
134 in += AES_BLOCK_SIZE;
135 out += AES_BLOCK_SIZE;
136 }
137 memcpy(ivec, iv.data, AES_BLOCK_SIZE);
138 memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
139 }
140 } else {
141 if (in != out && (UNALIGNED_MEMOPS_ARE_FAST ||
142 ((size_t)in|(size_t)out|(size_t)ivec) %
143 sizeof(long) == 0)) {
144 aes_block_t *ivp = (aes_block_t *)ivec;
145 aes_block_t *iv2p = (aes_block_t *)(ivec + AES_BLOCK_SIZE);
146
147 while (len) {
148 aes_block_t tmp;
149 aes_block_t *inp = (aes_block_t *)in;
150 aes_block_t *outp = (aes_block_t *)out;
151
152 for (n = 0; n < N_WORDS; ++n)
153 tmp.data[n] = inp->data[n] ^ iv2p->data[n];
154 AES_decrypt((unsigned char *)tmp.data,
155 (unsigned char *)outp->data, key);
156 for (n = 0; n < N_WORDS; ++n)
157 outp->data[n] ^= ivp->data[n];
158 ivp = inp;
159 iv2p = outp;
160 --len;
161 in += AES_BLOCK_SIZE;
162 out += AES_BLOCK_SIZE;
163 }
164 memmove(ivec, ivp->data, AES_BLOCK_SIZE);
165 memmove(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
166 } else {
167 aes_block_t tmp, tmp2;
168 aes_block_t iv;
169 aes_block_t iv2;
170
171 load_block(iv, ivec);
172 load_block(iv2, ivec + AES_BLOCK_SIZE);
173
174 while (len) {
175 load_block(tmp, in);
176 tmp2 = tmp;
177 for (n = 0; n < N_WORDS; ++n)
178 tmp.data[n] ^= iv2.data[n];
179 AES_decrypt((unsigned char *)tmp.data,
180 (unsigned char *)tmp.data, key);
181 for (n = 0; n < N_WORDS; ++n)
182 tmp.data[n] ^= iv.data[n];
183 store_block(out, tmp);
184 iv = tmp2;
185 iv2 = tmp;
186 --len;
187 in += AES_BLOCK_SIZE;
188 out += AES_BLOCK_SIZE;
189 }
190 memcpy(ivec, iv.data, AES_BLOCK_SIZE);
191 memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
192 }
193 }
194}
195LCRYPTO_ALIAS(AES_ige_encrypt);
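
A usage sketch for the IGE entry point (illustrative values, not part of the
deleted file): per the N.B. above, ivec is 2*AES_BLOCK_SIZE bytes and is
updated in place, so the caller must keep a copy to decrypt. Since the
decrypt path calls AES_decrypt(), decryption takes a schedule built with
AES_set_decrypt_key() together with enc = AES_DECRYPT.

#include <assert.h>
#include <string.h>
#include <openssl/aes.h>

int
main(void)
{
	static const unsigned char key_bytes[16] = "0123456789abcdef";
	unsigned char iv[2 * AES_BLOCK_SIZE] = { 0 };	/* use a random IV in practice */
	unsigned char iv_copy[2 * AES_BLOCK_SIZE];
	unsigned char msg[2 * AES_BLOCK_SIZE] = "thirty-two bytes of plaintext..";
	unsigned char ct[sizeof(msg)], pt[sizeof(msg)];
	AES_KEY enc_key, dec_key;

	memcpy(iv_copy, iv, sizeof(iv));	/* ivec is clobbered by the call */
	assert(AES_set_encrypt_key(key_bytes, 128, &enc_key) == 0);
	assert(AES_set_decrypt_key(key_bytes, 128, &dec_key) == 0);

	/* length must be a multiple of AES_BLOCK_SIZE */
	AES_ige_encrypt(msg, ct, sizeof(msg), &enc_key, iv, AES_ENCRYPT);
	AES_ige_encrypt(ct, pt, sizeof(ct), &dec_key, iv_copy, AES_DECRYPT);
	assert(memcmp(msg, pt, sizeof(msg)) == 0);
	return 0;
}
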
diff --git a/src/lib/libcrypto/aes/aes_local.h b/src/lib/libcrypto/aes/aes_local.h
deleted file mode 100644
index e0714df409..0000000000
--- a/src/lib/libcrypto/aes/aes_local.h
+++ /dev/null
@@ -1,76 +0,0 @@
1/* $OpenBSD: aes_local.h,v 1.4 2025/01/25 17:59:44 tb Exp $ */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#ifndef HEADER_AES_LOCAL_H
53#define HEADER_AES_LOCAL_H
54
55#include <openssl/opensslconf.h>
56
57#include <stdio.h>
58#include <stdlib.h>
59#include <string.h>
60
61__BEGIN_HIDDEN_DECLS
62
63typedef unsigned int u32;
64typedef unsigned short u16;
65typedef unsigned char u8;
66
67#define MAXKC (256/32)
68#define MAXKB (256/8)
69#define MAXNR 14
70
71/* This controls loop-unrolling in aes_core.c */
72#undef FULL_UNROLL
73
74__END_HIDDEN_DECLS
75
76#endif /* !HEADER_AES_LOCAL_H */
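
The MAXKC/MAXKB/MAXNR limits above encode the AES-256 parameters: with
Nk = keybits/32 key words, AES uses Nr = Nk + 6 rounds, so a 256-bit key
gives Nk = MAXKC = 8, MAXKB = 32 key bytes and Nr = MAXNR = 14. A quick
cross-check (aes_rounds() is an illustrative helper, not part of the header):

#include <assert.h>

static int
aes_rounds(int keybits)
{
	return keybits / 32 + 6;	/* Nr = Nk + 6 */
}

int
main(void)
{
	assert(aes_rounds(128) == 10);	/* the "rounds > 10" branch in aes.c */
	assert(aes_rounds(192) == 12);	/* ... and "rounds > 12" */
	assert(aes_rounds(256) == 14);	/* MAXNR */
	return 0;
}
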
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl
deleted file mode 100644
index 364099d4d3..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-586.pl
+++ /dev/null
@@ -1,2974 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Version 4.3.
11#
12# You might fail to appreciate this module's performance from the first
13# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
14# to be *the* best Intel C compiler without -KPIC, performance appears
15# to be virtually identical... But try to re-configure with shared
16# library support... Aha! Intel compiler "suddenly" lags behind by 30%
17# [on P4, more on others]:-) And if compared to position-independent
18# code generated by GNU C, this code performs *more* than *twice* as
19# fast! Yes, all this buzz about PIC means that unlike other hand-
20# coded implementations, this one was explicitly designed to be safe
21# to use even in shared library context... This also means that this
22# code isn't necessarily absolutely fastest "ever," because in order
23# to achieve position independence an extra register has to be
24# off-loaded to stack, which affects the benchmark result.
25#
26# Special note about instruction choice. Do you recall RC4_INT code
27# performing poorly on P4? It might be the time to figure out why.
28# RC4_INT code implies effective address calculations in base+offset*4
29# form. Trouble is that it seems that offset scaling turned out to be
30# the critical path... At least eliminating scaling resulted in 2.8x RC4
31# performance improvement [as you might recall]. As AES code is hungry
32# for scaling too, I [try to] avoid the latter by favoring off-by-2
33# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
34#
35# As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
36# out to be void. Performance improvement with off-by-2 shifts was observed on
37# intermediate implementation, which was spilling yet another register
38# to stack... Final offset*4 code below runs just a tad faster on P4,
39# but exhibits up to 10% improvement on other cores.
40#
41# Second version is "monolithic" replacement for aes_core.c, which in
42# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
43# This made it possible to implement little-endian variant of the
44# algorithm without modifying the base C code. Motivating factor for
45# the undertaken effort was that it appeared that in tight IA-32
46# register window little-endian flavor could achieve slightly higher
47# Instruction Level Parallelism, and it indeed resulted in up to 15%
48# better performance on most recent µ-archs...
49#
50# Third version adds AES_cbc_encrypt implementation, which resulted in
51# up to 40% performance improvement of CBC benchmark results. 40% was
52# observed on P4 core, where "overall" improvement coefficient, i.e. if
53# compared to PIC generated by GCC and in CBC mode, was observed to be
54# as large as 4x:-) CBC performance is virtually identical to ECB now
55# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
56# Opteron, because certain function prologues and epilogues are
57# effectively taken out of the loop...
58#
59# Version 3.2 implements compressed tables and prefetch of these tables
60# in CBC[!] mode. Former means that 3/4 of table references are now
61# misaligned, which unfortunately has a negative impact on elder IA-32
62# implementations: Pentium suffered a 30% penalty, PIII 10%.
63#
64# Version 3.3 avoids L1 cache aliasing between stack frame and
65# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
66# latter is achieved by copying the key schedule to controlled place in
67# stack. This unfortunately has rather strong impact on small block CBC
68# performance, ~2x deterioration on 16-byte block if compared to 3.3.
69#
70# Version 3.5 checks if there is L1 cache aliasing between user-supplied
71# key schedule and S-boxes and abstains from copying the former if
72# there is none. This allows the end-user to consciously retain small block
73# performance by aligning key schedule in specific manner.
74#
75# Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
76#
77# Current ECB performance numbers for 128-bit key in CPU cycles per
78# processed byte [measure commonly used by AES benchmarkers] are:
79#
80# small footprint fully unrolled
81# P4 24 22
82# AMD K8 20 19
83# PIII 25 23
84# Pentium 81 78
85#
86# Version 3.7 reimplements outer rounds as "compact." Meaning that
87# first and last rounds reference the compact 256-byte S-box. This means
88# that first round consumes a lot more CPU cycles and that encrypt
89# and decrypt performance becomes asymmetric. Encrypt performance
90# drops by 10-12%, while decrypt drops by 20-25%:-( The 256-byte S-box is
91# aggressively pre-fetched.
92#
93# Version 4.0 effectively rolls back to 3.6 and instead implements
94# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
95# which use exclusively 256 byte S-box. These functions are to be
96# called in modes not concealing plain text, such as ECB, or when
97# we're asked to process smaller amount of data [or unconditionally
98# on hyper-threading CPU]. Currently it's called unconditionally from
99# AES_[en|de]crypt, which affects all modes but CBC. The CBC routine
100# still needs to be modified to switch between slower and faster
101# mode when appropriate... But in either case benchmark landscape
102# changes dramatically and below numbers are CPU cycles per processed
103# byte for 128-bit key.
104#
105# ECB encrypt ECB decrypt CBC large chunk
106# P4 56[60] 84[100] 23
107# AMD K8 48[44] 70[79] 18
108# PIII 41[50] 61[91] 24
109# Core 2 32[38] 45[70] 18.5
110# Pentium 120 160 77
111#
112# Version 4.1 switches to compact S-box even in key schedule setup.
113#
114# Version 4.2 prefetches compact S-box in every SSE round or in other
115# words every cache-line is *guaranteed* to be accessed within ~50
116# cycles window. Why just SSE? Because it's needed on hyper-threading
117# CPU! Which is also why it's prefetched with 64 byte stride. Best
118# part is that it has no negative effect on performance:-)
119#
120# Version 4.3 implements switch between compact and non-compact block
121# functions in AES_cbc_encrypt depending on how much data was asked
122# to be processed in one stroke.
123#
124######################################################################
125# Timing attacks are classified in two classes: synchronous when
126# attacker consciously initiates cryptographic operation and collects
127# timing data of various character afterwards, and asynchronous when
128# malicious code is executed on same CPU simultaneously with AES,
129# instruments itself and performs statistical analysis of this data.
130#
131# As far as synchronous attacks go, the root of the AES timing
132# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
133# are referred to in single 128-bit block operation. Well, in C
134# implementation with 4 distinct tables it's actually as little as 40
135# references per 256 elements table, but anyway... Secondly, even
136# though S-box elements are clustered into smaller amount of cache-
137# lines, smaller than 160 and even 40, it turned out that for certain
138# plain-text pattern[s] or simply put chosen plain-text and given key
139# few cache-lines remain unaccessed during block operation. Now, if
140# attacker can figure out this access pattern, he can deduce the key
141# [or at least part of it]. The natural way to mitigate this kind of
142# attack is to minimize the amount of cache-lines in S-box and/or
143# prefetch them to ensure that every one is accessed for more uniform
144# timing. But note that *if* plain-text was concealed in such way that
145# input to block function is distributed *uniformly*, then attack
146# wouldn't apply. Now note that some encryption modes, most notably
147# CBC, do mask the plain-text in this exact way [secure cipher output
148# is distributed uniformly]. Yes, one still might find input that
149# would reveal the information about given key, but if amount of
150# candidate inputs to be tried is larger than amount of possible key
151# combinations then attack becomes infeasible. This is why revised
152# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
153# of data is to be processed in one stroke. The current size limit of
154# 512 bytes is chosen to provide the same [diminishingly low] probability
155# for cache-line to remain untouched in large chunk operation with
156# large S-box as for single block operation with compact S-box and
157# surely needs more careful consideration...
158#
159# As for asynchronous attacks. There are two flavours: attacker code
160# being interleaved with AES on hyper-threading CPU at *instruction*
161# level, and two processes time-sharing a single core. As for the latter.
162# Two vectors. 1. Given that attacker process has higher priority,
163# yield execution to process performing AES just before timer fires
164# off the scheduler, immediately regain control of CPU and analyze the
165# cache state. For this attack to be efficient the attacker would have to
166# effectively slow down the operation by several *orders* of magnitude,
167# by the ratio of time slice to duration of a handful of AES rounds, which
168# is unlikely to remain unnoticed. Not to mention that this also means
169# that he would spend correspondingly more time to collect enough
170# statistical data to mount the attack. It's probably appropriate to
171# say that if an adversary reckons that this attack is beneficial and
172# risks being noticed, you probably have larger problems than his
173# mere opportunity. In other words the suggested code design expects you
174# to preclude/mitigate this attack by overall system security design.
175# 2. Attacker manages to make his code interrupt driven. In order for
176# this kind of attack to be feasible, interrupt rate has to be high
177# enough, again comparable to duration of handful of AES rounds. But
178# is there an interrupt source of such rate? Hardly, not even a 1Gbps NIC
179# generates interrupts at such raging rate...
180#
181# And now back to the former, hyper-threading CPU or more specifically
182# Intel P4. Recall that asynchronous attack implies that malicious
183# code instruments itself. And naturally instrumentation granularity
184# has to be noticeably lower than duration of codepath accessing S-box.
185# Given that all cache-lines are accessed during that time that is.
186# Current implementation accesses *all* cache-lines within ~50 cycles
187# window, which is actually *less* than RDTSC latency on Intel P4!
188
189$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
190push(@INC,"${dir}","${dir}../../perlasm");
191require "x86asm.pl";
192
193&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
194&static_label("AES_Te");
195&static_label("AES_Td");
196
197$s0="eax";
198$s1="ebx";
199$s2="ecx";
200$s3="edx";
201$key="edi";
202$acc="esi";
203$tbl="ebp";
204
205# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
206# by caller
207$__ra=&DWP(0,"esp"); # return address
208$__s0=&DWP(4,"esp"); # s0 backing store
209$__s1=&DWP(8,"esp"); # s1 backing store
210$__s2=&DWP(12,"esp"); # s2 backing store
211$__s3=&DWP(16,"esp"); # s3 backing store
212$__key=&DWP(20,"esp"); # pointer to key schedule
213$__end=&DWP(24,"esp"); # pointer to end of key schedule
214$__tbl=&DWP(28,"esp"); # %ebp backing store
215
216# stack frame layout in AES_[en|de]crypt routines, which differs from
217# the above by 4 and overlaps with the %ebp backing store
218$_tbl=&DWP(24,"esp");
219$_esp=&DWP(28,"esp");
220
221sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
222
223$speed_limit=512; # chunks smaller than $speed_limit are
224 # processed with compact routine in CBC mode
225$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
226 # recent µ-archs], but ~5 times smaller!
227 # I favor compact code to minimize cache
228 # contention and in hope to "collect" 5% back
229 # in real-life applications...
230
231$vertical_spin=0; # shift "vertically" defaults to 0, because of
232 # its proof-of-concept status...
233# Note that there is no decvert(), as well as last encryption round is
234# performed with "horizontal" shifts. This is because this "vertical"
235# implementation [one which groups shifts on a given $s[i] to form a
236# "column," unlike "horizontal" one, which groups shifts on different
237# $s[i] to form a "row"] is work in progress. It was observed to run
238# few percents faster on Intel cores, but not AMD. On AMD K8 core it's
239# whole 12% slower:-( So we face a trade-off... Shall it be resolved
240# some day? Till then the code is considered experimental and by
241# default remains dormant...
242
243sub encvert()
244{ my ($te,@s) = @_;
245 my ($v0, $v1) = ($acc, $key);
246
247 &mov ($v0,$s[3]); # copy s3
248 &mov (&DWP(4,"esp"),$s[2]); # save s2
249 &mov ($v1,$s[0]); # copy s0
250 &mov (&DWP(8,"esp"),$s[1]); # save s1
251
252 &movz ($s[2],&HB($s[0]));
253 &and ($s[0],0xFF);
254 &mov ($s[0],&DWP(0,$te,$s[0],8)); # s0>>0
255 &shr ($v1,16);
256 &mov ($s[3],&DWP(3,$te,$s[2],8)); # s0>>8
257 &movz ($s[1],&HB($v1));
258 &and ($v1,0xFF);
259 &mov ($s[2],&DWP(2,$te,$v1,8)); # s0>>16
260 &mov ($v1,$v0);
261 &mov ($s[1],&DWP(1,$te,$s[1],8)); # s0>>24
262
263 &and ($v0,0xFF);
264 &xor ($s[3],&DWP(0,$te,$v0,8)); # s3>>0
265 &movz ($v0,&HB($v1));
266 &shr ($v1,16);
267 &xor ($s[2],&DWP(3,$te,$v0,8)); # s3>>8
268 &movz ($v0,&HB($v1));
269 &and ($v1,0xFF);
270 &xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16
271 &mov ($v1,&DWP(4,"esp")); # restore s2
272 &xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24
273
274 &mov ($v0,$v1);
275 &and ($v1,0xFF);
276 &xor ($s[2],&DWP(0,$te,$v1,8)); # s2>>0
277 &movz ($v1,&HB($v0));
278 &shr ($v0,16);
279 &xor ($s[1],&DWP(3,$te,$v1,8)); # s2>>8
280 &movz ($v1,&HB($v0));
281 &and ($v0,0xFF);
282 &xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16
283 &mov ($v0,&DWP(8,"esp")); # restore s1
284 &xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24
285
286 &mov ($v1,$v0);
287 &and ($v0,0xFF);
288 &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0
289 &movz ($v0,&HB($v1));
290 &shr ($v1,16);
291 &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8
292 &movz ($v0,&HB($v1));
293 &and ($v1,0xFF);
294 &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
295 &mov ($key,$__key); # reincarnate v1 as key
296 &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
297}
298
299# Another experimental routine, which features "horizontal spin," but
300# eliminates one reference to stack. Strangely enough runs slower...
301sub enchoriz()
302{ my ($v0, $v1) = ($key, $acc);
303
304 &movz ($v0,&LB($s0)); # 3, 2, 1, 0*
305 &rotr ($s2,8); # 8,11,10, 9
306 &mov ($v1,&DWP(0,$te,$v0,8)); # 0
307 &movz ($v0,&HB($s1)); # 7, 6, 5*, 4
308 &rotr ($s3,16); # 13,12,15,14
309 &xor ($v1,&DWP(3,$te,$v0,8)); # 5
310 &movz ($v0,&HB($s2)); # 8,11,10*, 9
311 &rotr ($s0,16); # 1, 0, 3, 2
312 &xor ($v1,&DWP(2,$te,$v0,8)); # 10
313 &movz ($v0,&HB($s3)); # 13,12,15*,14
314 &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected
315 &mov ($__s0,$v1); # t[0] saved
316
317 &movz ($v0,&LB($s1)); # 7, 6, 5, 4*
318 &shr ($s1,16); # -, -, 7, 6
319 &mov ($v1,&DWP(0,$te,$v0,8)); # 4
320 &movz ($v0,&LB($s3)); # 13,12,15,14*
321 &xor ($v1,&DWP(2,$te,$v0,8)); # 14
322 &movz ($v0,&HB($s0)); # 1, 0, 3*, 2
323 &and ($s3,0xffff0000); # 13,12, -, -
324 &xor ($v1,&DWP(1,$te,$v0,8)); # 3
325 &movz ($v0,&LB($s2)); # 8,11,10, 9*
326 &or ($s3,$s1); # 13,12, 7, 6
327 &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected
328 &mov ($s1,$v1); # s[1]=t[1]
329
330 &movz ($v0,&LB($s0)); # 1, 0, 3, 2*
331 &shr ($s2,16); # -, -, 8,11
332 &mov ($v1,&DWP(2,$te,$v0,8)); # 2
333 &movz ($v0,&HB($s3)); # 13,12, 7*, 6
334 &xor ($v1,&DWP(1,$te,$v0,8)); # 7
335 &movz ($v0,&HB($s2)); # -, -, 8*,11
336 &xor ($v1,&DWP(0,$te,$v0,8)); # 8
337 &mov ($v0,$s3);
338 &shr ($v0,24); # 13
339 &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected
340
341 &movz ($v0,&LB($s2)); # -, -, 8,11*
342 &shr ($s0,24); # 1*
343 &mov ($s2,&DWP(1,$te,$v0,8)); # 11
344 &xor ($s2,&DWP(3,$te,$s0,8)); # 1
345 &mov ($s0,$__s0); # s[0]=t[0]
346 &movz ($v0,&LB($s3)); # 13,12, 7, 6*
347 &shr ($s3,16); # , ,13,12
348 &xor ($s2,&DWP(2,$te,$v0,8)); # 6
349 &mov ($key,$__key); # reincarnate v0 as key
350 &and ($s3,0xff); # , ,13,12*
351 &mov ($s3,&DWP(0,$te,$s3,8)); # 12
352 &xor ($s3,$s2); # s[2]=t[3] collected
353 &mov ($s2,$v1); # s[2]=t[2]
354}
355
356# More experimental code... SSE one... Even though this one eliminates
357# *all* references to stack, it's not faster...
358sub sse_encbody()
359{
360 &movz ($acc,&LB("eax")); # 0
361 &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0
362 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
363 &movz ("edx",&HB("eax")); # 1
364 &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1
365 &shr ("eax",16); # 5, 4
366
367 &movz ($acc,&LB("ebx")); # 10
368 &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10
369 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
370 &movz ($acc,&HB("ebx")); # 11
371 &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11
372 &shr ("ebx",16); # 15,14
373
374 &movz ($acc,&HB("eax")); # 5
375 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5
376 &movq ("mm3",QWP(16,$key));
377 &movz ($acc,&HB("ebx")); # 15
378 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15
379 &movd ("mm0","ecx"); # t[0] collected
380
381 &movz ($acc,&LB("eax")); # 4
382 &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4
383 &movd ("eax","mm2"); # 7, 6, 3, 2
384 &movz ($acc,&LB("ebx")); # 14
385 &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14
386 &movd ("ebx","mm6"); # 13,12, 9, 8
387
388 &movz ($acc,&HB("eax")); # 3
389 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3
390 &movz ($acc,&HB("ebx")); # 9
391 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9
392 &movd ("mm1","ecx"); # t[1] collected
393
394 &movz ($acc,&LB("eax")); # 2
395 &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2
396 &shr ("eax",16); # 7, 6
397 &punpckldq ("mm0","mm1"); # t[0,1] collected
398 &movz ($acc,&LB("ebx")); # 8
399 &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8
400 &shr ("ebx",16); # 13,12
401
402 &movz ($acc,&HB("eax")); # 7
403 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7
404 &pxor ("mm0","mm3");
405 &movz ("eax",&LB("eax")); # 6
406 &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6
407 &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
408 &movz ($acc,&HB("ebx")); # 13
409 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13
410 &xor ("ecx",&DWP(24,$key)); # t[2]
411 &movd ("mm4","ecx"); # t[2] collected
412 &movz ("ebx",&LB("ebx")); # 12
413 &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12
414 &shr ("ecx",16);
415 &movd ("eax","mm1"); # 5, 4, 1, 0
416 &mov ("ebx",&DWP(28,$key)); # t[3]
417 &xor ("ebx","edx");
418 &movd ("mm5","ebx"); # t[3] collected
419 &and ("ebx",0xffff0000);
420 &or ("ebx","ecx");
421
422 &punpckldq ("mm4","mm5"); # t[2,3] collected
423}
424
425######################################################################
426# "Compact" block function
427######################################################################
428
429sub enccompact()
430{ my $Fn = \&mov;
431 while ($#_>5) { pop(@_); $Fn=sub{}; }
432 my ($i,$te,@s)=@_;
433 my $tmp = $key;
434 my $out = $i==3?$s[0]:$acc;
435
436 # $Fn is used in first compact round and its purpose is to
437 # void restoration of some values from stack, so that after
438 # 4xenccompact with extra argument $key value is left there...
439 if ($i==3) { &$Fn ($key,$__key); }##%edx
440 else { &mov ($out,$s[0]); }
441 &and ($out,0xFF);
442 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
443 if ($i==2) { &shr ($s[0],24); }#%ecx[2]
444 &movz ($out,&BP(-128,$te,$out,1));
445
446 if ($i==3) { $tmp=$s[1]; }##%eax
447 &movz ($tmp,&HB($s[1]));
448 &movz ($tmp,&BP(-128,$te,$tmp,1));
449 &shl ($tmp,8);
450 &xor ($out,$tmp);
451
452 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
453 else { &mov ($tmp,$s[2]);
454 &shr ($tmp,16); }
455 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
456 &and ($tmp,0xFF);
457 &movz ($tmp,&BP(-128,$te,$tmp,1));
458 &shl ($tmp,16);
459 &xor ($out,$tmp);
460
461 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
462 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
463 else { &mov ($tmp,$s[3]);
464 &shr ($tmp,24); }
465 &movz ($tmp,&BP(-128,$te,$tmp,1));
466 &shl ($tmp,24);
467 &xor ($out,$tmp);
468 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
469 if ($i==3) { &mov ($s[3],$acc); }
470 &comment();
471}
472
473sub enctransform()
474{ my @s = ($s0,$s1,$s2,$s3);
475 my $i = shift;
476 my $tmp = $tbl;
477 my $r2 = $key ;
478
479 &mov ($acc,$s[$i]);
480 &and ($acc,0x80808080);
481 &mov ($tmp,$acc);
482 &shr ($tmp,7);
483 &lea ($r2,&DWP(0,$s[$i],$s[$i]));
484 &sub ($acc,$tmp);
485 &and ($r2,0xfefefefe);
486 &and ($acc,0x1b1b1b1b);
487 &mov ($tmp,$s[$i]);
488 &xor ($acc,$r2); # r2
489
490 &xor ($s[$i],$acc); # r0 ^ r2
491 &rotl ($s[$i],24);
492 &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
493 &rotr ($tmp,16);
494 &xor ($s[$i],$tmp);
495 &rotr ($tmp,8);
496 &xor ($s[$i],$tmp);
497}
498
499&function_begin_B("_x86_AES_encrypt_compact");
500 # note that caller is expected to allocate stack frame for me!
501 &mov ($__key,$key); # save key
502
503 &xor ($s0,&DWP(0,$key)); # xor with key
504 &xor ($s1,&DWP(4,$key));
505 &xor ($s2,&DWP(8,$key));
506 &xor ($s3,&DWP(12,$key));
507
508 &mov ($acc,&DWP(240,$key)); # load key->rounds
509 &lea ($acc,&DWP(-2,$acc,$acc));
510 &lea ($acc,&DWP(0,$key,$acc,8));
511 &mov ($__end,$acc); # end of key schedule
512
513 # prefetch Te4
514 &mov ($key,&DWP(0-128,$tbl));
515 &mov ($acc,&DWP(32-128,$tbl));
516 &mov ($key,&DWP(64-128,$tbl));
517 &mov ($acc,&DWP(96-128,$tbl));
518 &mov ($key,&DWP(128-128,$tbl));
519 &mov ($acc,&DWP(160-128,$tbl));
520 &mov ($key,&DWP(192-128,$tbl));
521 &mov ($acc,&DWP(224-128,$tbl));
522
523 &set_label("loop",16);
524
525 &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
526 &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
527 &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
528 &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
529 &enctransform(2);
530 &enctransform(3);
531 &enctransform(0);
532 &enctransform(1);
533 &mov ($key,$__key);
534 &mov ($tbl,$__tbl);
535 &add ($key,16); # advance rd_key
536 &xor ($s0,&DWP(0,$key));
537 &xor ($s1,&DWP(4,$key));
538 &xor ($s2,&DWP(8,$key));
539 &xor ($s3,&DWP(12,$key));
540
541 &cmp ($key,$__end);
542 &mov ($__key,$key);
543 &jb (&label("loop"));
544
545 &enccompact(0,$tbl,$s0,$s1,$s2,$s3);
546 &enccompact(1,$tbl,$s1,$s2,$s3,$s0);
547 &enccompact(2,$tbl,$s2,$s3,$s0,$s1);
548 &enccompact(3,$tbl,$s3,$s0,$s1,$s2);
549
550 &xor ($s0,&DWP(16,$key));
551 &xor ($s1,&DWP(20,$key));
552 &xor ($s2,&DWP(24,$key));
553 &xor ($s3,&DWP(28,$key));
554
555 &ret ();
556&function_end_B("_x86_AES_encrypt_compact");
557
558######################################################################
559# "Compact" SSE block function.
560######################################################################
561#
562# Performance is not actually extraordinary in comparison to pure
563# x86 code. In particular encrypt performance is virtually the same.
564# Decrypt performance on the other hand is 15-20% better on newer
565# µ-archs [but we're thankful for *any* improvement here], and ~50%
566# better on PIII:-) And additionally on the pros side this code
567# eliminates redundant references to stack and thus relieves/
568# minimizes the pressure on the memory bus.
569#
570# MMX register layout lsb
571# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
572# | mm4 | mm0 |
573# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
574# | s3 | s2 | s1 | s0 |
575# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
576# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
577# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
578#
579# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
580# In this terms encryption and decryption "compact" permutation
581# matrices can be depicted as following:
582#
583# encryption lsb # decryption lsb
584# +----++----+----+----+----+ # +----++----+----+----+----+
585# | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
586# +----++----+----+----+----+ # +----++----+----+----+----+
587# | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
588# +----++----+----+----+----+ # +----++----+----+----+----+
589# | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
590# +----++----+----+----+----+ # +----++----+----+----+----+
591# | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
592# +----++----+----+----+----+ # +----++----+----+----+----+
593#
594######################################################################
595# Why not xmm registers? Short answer. It was actually tested and
596# was not any faster but, on the *contrary*, slower, most notably on Intel CPUs.
597# Longer answer. Main advantage of using mm registers is that movd
598# latency is lower, especially on Intel P4. While arithmetic
599# instructions are twice as many, they can be scheduled every cycle
600# and not every second one when they are operating on xmm register,
601# so that "arithmetic throughput" remains virtually the same. And
602# finally the code can be executed even on elder SSE-only CPUs:-)
603
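# Editorial sketch, not part of the original module: the byte-index
# convention above ("Indexes translate as s[N/4]>>(8*(N%4))") in plain
# Perl; state_byte() is an illustrative helper, unused by this file.
#
#   sub state_byte { my ($n, @s) = @_; ($s[$n >> 2] >> (8 * ($n & 3))) & 0xff }
#
# e.g. state_byte(5, @s) == ($s[1] >> 8) & 0xff, so the encryption
# permutation row "t0 || 15 | 10 | 5 | 0" gathers s3>>24, s2>>16,
# s1>>8 and s0>>0, exactly as sse_enccompact() below collects t[0].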
604sub sse_enccompact()
605{
606 &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
607 &pshufw ("mm5","mm4",0x0d); # 15,14,11,10
608 &movd ("eax","mm1"); # 5, 4, 1, 0
609 &movd ("ebx","mm5"); # 15,14,11,10
610
611 &movz ($acc,&LB("eax")); # 0
612 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
613 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
614 &movz ("edx",&HB("eax")); # 1
615 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
616 &shl ("edx",8); # 1
617 &shr ("eax",16); # 5, 4
618
619 &movz ($acc,&LB("ebx")); # 10
620 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
621 &shl ($acc,16); # 10
622 &or ("ecx",$acc); # 10
623 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
624 &movz ($acc,&HB("ebx")); # 11
625 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
626 &shl ($acc,24); # 11
627 &or ("edx",$acc); # 11
628 &shr ("ebx",16); # 15,14
629
630 &movz ($acc,&HB("eax")); # 5
631 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
632 &shl ($acc,8); # 5
633 &or ("ecx",$acc); # 5
634 &movz ($acc,&HB("ebx")); # 15
635 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
636 &shl ($acc,24); # 15
637 &or ("ecx",$acc); # 15
638 &movd ("mm0","ecx"); # t[0] collected
639
640 &movz ($acc,&LB("eax")); # 4
641 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
642 &movd ("eax","mm2"); # 7, 6, 3, 2
643 &movz ($acc,&LB("ebx")); # 14
644 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
645 &shl ($acc,16); # 14
646 &or ("ecx",$acc); # 14
647
648 &movd ("ebx","mm6"); # 13,12, 9, 8
649 &movz ($acc,&HB("eax")); # 3
650 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
651 &shl ($acc,24); # 3
652 &or ("ecx",$acc); # 3
653 &movz ($acc,&HB("ebx")); # 9
654 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
655 &shl ($acc,8); # 9
656 &or ("ecx",$acc); # 9
657 &movd ("mm1","ecx"); # t[1] collected
658
659 &movz ($acc,&LB("ebx")); # 8
660 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
661 &shr ("ebx",16); # 13,12
662 &movz ($acc,&LB("eax")); # 2
663 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
664 &shl ($acc,16); # 2
665 &or ("ecx",$acc); # 2
666 &shr ("eax",16); # 7, 6
667
668 &punpckldq ("mm0","mm1"); # t[0,1] collected
669
670 &movz ($acc,&HB("eax")); # 7
671 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
672 &shl ($acc,24); # 7
673 &or ("ecx",$acc); # 7
674 &and ("eax",0xff); # 6
675 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
676 &shl ("eax",16); # 6
677 &or ("edx","eax"); # 6
678 &movz ($acc,&HB("ebx")); # 13
679 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
680 &shl ($acc,8); # 13
681 &or ("ecx",$acc); # 13
682 &movd ("mm4","ecx"); # t[2] collected
683 &and ("ebx",0xff); # 12
684 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
685 &or ("edx","ebx"); # 12
686 &movd ("mm5","edx"); # t[3] collected
687
688 &punpckldq ("mm4","mm5"); # t[2,3] collected
689}
690
691 if (!$x86only) {
692&function_begin_B("_sse_AES_encrypt_compact");
693 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
694 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
695
696 # note that caller is expected to allocate stack frame for me!
697 &mov ($acc,&DWP(240,$key)); # load key->rounds
698 &lea ($acc,&DWP(-2,$acc,$acc));
699 &lea ($acc,&DWP(0,$key,$acc,8));
700 &mov ($__end,$acc); # end of key schedule
701
702 &mov ($s0,0x1b1b1b1b); # magic constant
703 &mov (&DWP(8,"esp"),$s0);
704 &mov (&DWP(12,"esp"),$s0);
705
706 # prefetch Te4
707 &mov ($s0,&DWP(0-128,$tbl));
708 &mov ($s1,&DWP(32-128,$tbl));
709 &mov ($s2,&DWP(64-128,$tbl));
710 &mov ($s3,&DWP(96-128,$tbl));
711 &mov ($s0,&DWP(128-128,$tbl));
712 &mov ($s1,&DWP(160-128,$tbl));
713 &mov ($s2,&DWP(192-128,$tbl));
714 &mov ($s3,&DWP(224-128,$tbl));
715
716 &set_label("loop",16);
717 &sse_enccompact();
718 &add ($key,16);
719 &cmp ($key,$__end);
720 &ja (&label("out"));
721
722 &movq ("mm2",&QWP(8,"esp"));
723 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
724 &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0
725 &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4");
726 &pand ("mm3","mm2"); &pand ("mm7","mm2");
727 &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
728 &paddb ("mm0","mm0"); &paddb ("mm4","mm4");
729 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2
730 &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0
731 &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2
732 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16)
733
734 &movq ("mm2","mm3"); &movq ("mm6","mm7");
735 &pslld ("mm3",8); &pslld ("mm7",8);
736 &psrld ("mm2",24); &psrld ("mm6",24);
737 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8
738 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24
739
740 &movq ("mm3","mm1"); &movq ("mm7","mm5");
741 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
742 &psrld ("mm1",8); &psrld ("mm5",8);
743 &mov ($s0,&DWP(0-128,$tbl));
744 &pslld ("mm3",24); &pslld ("mm7",24);
745 &mov ($s1,&DWP(64-128,$tbl));
746 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8
747 &mov ($s2,&DWP(128-128,$tbl));
748 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24
749 &mov ($s3,&DWP(192-128,$tbl));
750
751 &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
752 &jmp (&label("loop"));
753
754 &set_label("out",16);
755 &pxor ("mm0",&QWP(0,$key));
756 &pxor ("mm4",&QWP(8,$key));
757
758 &ret ();
759&function_end_B("_sse_AES_encrypt_compact");
760 }
761
762######################################################################
763# Vanilla block function.
764######################################################################
765
766sub encstep()
767{ my ($i,$te,@s) = @_;
768 my $tmp = $key;
769 my $out = $i==3?$s[0]:$acc;
770
771 # lines marked with #%e?x[i] denote "reordered" instructions...
772 if ($i==3) { &mov ($key,$__key); }##%edx
773 else { &mov ($out,$s[0]);
774 &and ($out,0xFF); }
775 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
776 if ($i==2) { &shr ($s[0],24); }#%ecx[2]
777 &mov ($out,&DWP(0,$te,$out,8));
778
779 if ($i==3) { $tmp=$s[1]; }##%eax
780 &movz ($tmp,&HB($s[1]));
781 &xor ($out,&DWP(3,$te,$tmp,8));
782
783 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
784 else { &mov ($tmp,$s[2]);
785 &shr ($tmp,16); }
786 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
787 &and ($tmp,0xFF);
788 &xor ($out,&DWP(2,$te,$tmp,8));
789
790 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
791 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
792 else { &mov ($tmp,$s[3]);
793 &shr ($tmp,24) }
794 &xor ($out,&DWP(1,$te,$tmp,8));
795 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
796 if ($i==3) { &mov ($s[3],$acc); }
797 &comment();
798}
799
800sub enclast()
801{ my ($i,$te,@s)=@_;
802 my $tmp = $key;
803 my $out = $i==3?$s[0]:$acc;
804
805 if ($i==3) { &mov ($key,$__key); }##%edx
806 else { &mov ($out,$s[0]); }
807 &and ($out,0xFF);
808 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
809 if ($i==2) { &shr ($s[0],24); }#%ecx[2]
810 &mov ($out,&DWP(2,$te,$out,8));
811 &and ($out,0x000000ff);
812
813 if ($i==3) { $tmp=$s[1]; }##%eax
814 &movz ($tmp,&HB($s[1]));
815 &mov ($tmp,&DWP(0,$te,$tmp,8));
816 &and ($tmp,0x0000ff00);
817 &xor ($out,$tmp);
818
819 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
820 else { &mov ($tmp,$s[2]);
821 &shr ($tmp,16); }
822 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
823 &and ($tmp,0xFF);
824 &mov ($tmp,&DWP(0,$te,$tmp,8));
825 &and ($tmp,0x00ff0000);
826 &xor ($out,$tmp);
827
828 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
829 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
830 else { &mov ($tmp,$s[3]);
831 &shr ($tmp,24); }
832 &mov ($tmp,&DWP(2,$te,$tmp,8));
833 &and ($tmp,0xff000000);
834 &xor ($out,$tmp);
835 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
836 if ($i==3) { &mov ($s[3],$acc); }
837}
838
839&function_begin_B("_x86_AES_encrypt");
840 if ($vertical_spin) {
841 # I need high parts of volatile registers to be accessible...
842 &exch ($s1="edi",$key="ebx");
843 &mov ($s2="esi",$acc="ecx");
844 }
845
846 # note that caller is expected to allocate stack frame for me!
847 &mov ($__key,$key); # save key
848
849 &xor ($s0,&DWP(0,$key)); # xor with key
850 &xor ($s1,&DWP(4,$key));
851 &xor ($s2,&DWP(8,$key));
852 &xor ($s3,&DWP(12,$key));
853
854 &mov ($acc,&DWP(240,$key)); # load key->rounds
855
856 if ($small_footprint) {
857 &lea ($acc,&DWP(-2,$acc,$acc));
858 &lea ($acc,&DWP(0,$key,$acc,8));
859 &mov ($__end,$acc); # end of key schedule
860
861 &set_label("loop",16);
862 if ($vertical_spin) {
863 &encvert($tbl,$s0,$s1,$s2,$s3);
864 } else {
865 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
866 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
867 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
868 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
869 }
870 &add ($key,16); # advance rd_key
871 &xor ($s0,&DWP(0,$key));
872 &xor ($s1,&DWP(4,$key));
873 &xor ($s2,&DWP(8,$key));
874 &xor ($s3,&DWP(12,$key));
875 &cmp ($key,$__end);
876 &mov ($__key,$key);
877 &jb (&label("loop"));
878 }
879 else {
880 &cmp ($acc,10);
881 &jle (&label("10rounds"));
882 &cmp ($acc,12);
883 &jle (&label("12rounds"));
884
885 &set_label("14rounds",4);
886 for ($i=1;$i<3;$i++) {
887 if ($vertical_spin) {
888 &encvert($tbl,$s0,$s1,$s2,$s3);
889 } else {
890 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
891 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
892 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
893 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
894 }
895 &xor ($s0,&DWP(16*$i+0,$key));
896 &xor ($s1,&DWP(16*$i+4,$key));
897 &xor ($s2,&DWP(16*$i+8,$key));
898 &xor ($s3,&DWP(16*$i+12,$key));
899 }
900 &add ($key,32);
901 &mov ($__key,$key); # advance rd_key
902 &set_label("12rounds",4);
903 for ($i=1;$i<3;$i++) {
904 if ($vertical_spin) {
905 &encvert($tbl,$s0,$s1,$s2,$s3);
906 } else {
907 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
908 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
909 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
910 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
911 }
912 &xor ($s0,&DWP(16*$i+0,$key));
913 &xor ($s1,&DWP(16*$i+4,$key));
914 &xor ($s2,&DWP(16*$i+8,$key));
915 &xor ($s3,&DWP(16*$i+12,$key));
916 }
917 &add ($key,32);
918 &mov ($__key,$key); # advance rd_key
919 &set_label("10rounds",4);
920 for ($i=1;$i<10;$i++) {
921 if ($vertical_spin) {
922 &encvert($tbl,$s0,$s1,$s2,$s3);
923 } else {
924 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
925 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
926 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
927 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
928 }
929 &xor ($s0,&DWP(16*$i+0,$key));
930 &xor ($s1,&DWP(16*$i+4,$key));
931 &xor ($s2,&DWP(16*$i+8,$key));
932 &xor ($s3,&DWP(16*$i+12,$key));
933 }
934 }
935
936 if ($vertical_spin) {
937 # "reincarnate" some registers for "horizontal" spin...
938 &mov ($s1="ebx",$key="edi");
939 &mov ($s2="ecx",$acc="esi");
940 }
941 &enclast(0,$tbl,$s0,$s1,$s2,$s3);
942 &enclast(1,$tbl,$s1,$s2,$s3,$s0);
943 &enclast(2,$tbl,$s2,$s3,$s0,$s1);
944 &enclast(3,$tbl,$s3,$s0,$s1,$s2);
945
946 &add ($key,$small_footprint?16:160);
947 &xor ($s0,&DWP(0,$key));
948 &xor ($s1,&DWP(4,$key));
949 &xor ($s2,&DWP(8,$key));
950 &xor ($s3,&DWP(12,$key));
951
952 &ret ();
953&function_end_B("_x86_AES_encrypt");
954
955 &rodataseg();
956&set_label("AES_Te",64);
957 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
958 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
959 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
960 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
961 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
962 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
963 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
964 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
965 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
966 &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
967 &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
968 &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
969 &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
970 &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
971 &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
972 &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
973 &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
974 &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
975 &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
976 &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
977 &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
978 &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
979 &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
980 &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
981 &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
982 &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
983 &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
984 &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
985 &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
986 &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
987 &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
988 &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
989 &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
990 &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
991 &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
992 &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
993 &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
994 &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
995 &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
996 &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
997 &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
998 &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
999 &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
1000 &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
1001 &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
1002 &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
1003 &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
1004 &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
1005 &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
1006 &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
1007 &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
1008 &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
1009 &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
1010 &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
1011 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
1012 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
1013 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
1014 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
1015 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
1016 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
1017 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
1018 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
1019 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
1020 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
1021
1022#Te4: # four copies of Te4 to choose from to avoid L1 aliasing
1023 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1024 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1025 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1026 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1027 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1028 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1029 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1030 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1031 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1032 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1033 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1034 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1035 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1036 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1037 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1038 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1039 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1040 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1041 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1042 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1043 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1044 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1045 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1046 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1047 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1048 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1049 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1050 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1051 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1052 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1053 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1054 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1055
1056 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1057 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1058 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1059 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1060 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1061 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1062 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1063 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1064 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1065 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1066 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1067 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1068 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1069 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1070 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1071 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1072 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1073 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1074 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1075 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1076 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1077 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1078 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1079 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1080 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1081 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1082 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1083 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1084 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1085 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1086 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1087 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1088
1089 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1090 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1091 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1092 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1093 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1094 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1095 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1096 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1097 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1098 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1099 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1100 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1101 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1102 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1103 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1104 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1105 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1106 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1107 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1108 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1109 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1110 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1111 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1112 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1113 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1114 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1115 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1116 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1117 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1118 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1119 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1120 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1121
1122 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1123 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1124 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1125 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1126 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1127 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1128 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1129 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1130 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1131 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1132 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1133 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1134 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1135 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1136 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1137 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1138 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1139 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1140 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1141 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1142 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1143 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1144 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1145 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1146 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1147 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1148 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1149 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1150 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1151 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1152 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1153 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1154#rcon:
1155 &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
1156 &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
1157 &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
1158 &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
1159 &previous();
1160
1161# void aes_encrypt_internal(const void *inp, void *out, const AES_KEY *key);
1162&function_begin("aes_encrypt_internal");
1163 &mov ($acc,&wparam(0)); # load inp
1164 &mov ($key,&wparam(2)); # load key
1165
1166 &mov ($s0,"esp");
1167 &sub ("esp",36);
1168 &and ("esp",-64); # align to cache-line
1169
1170 # place stack frame just "above" the key schedule
1171 &lea ($s1,&DWP(-64-63,$key));
1172 &sub ($s1,"esp");
1173 &neg ($s1);
1174 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1175 &sub ("esp",$s1);
1176 &add ("esp",4); # 4 is reserved for caller's return address
1177 &mov ($_esp,$s0); # save stack pointer
1178
1179 &picsetup($tbl);
1180 &picsymbol($s0, "OPENSSL_ia32cap_P", $tbl);
1181 &picsymbol($tbl, &label("AES_Te"), $tbl);
1182
1183 # pick Te4 copy which can't "overlap" with stack frame or key schedule
1184 &lea ($s1,&DWP(768-4,"esp"));
1185 &sub ($s1,$tbl);
1186 &and ($s1,0x300);
1187 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
1188
1189 if (!$x86only) {
1190 &bt (&DWP(0,$s0),"\$IA32CAP_BIT0_SSE"); # check for SSE bit
1191 &jnc (&label("x86"));
1192
1193 &movq ("mm0",&QWP(0,$acc));
1194 &movq ("mm4",&QWP(8,$acc));
1195 &call ("_sse_AES_encrypt_compact");
1196 &mov ("esp",$_esp); # restore stack pointer
1197 &mov ($acc,&wparam(1)); # load out
1198 &movq (&QWP(0,$acc),"mm0"); # write output data
1199 &movq (&QWP(8,$acc),"mm4");
1200 &emms ();
1201 &function_end_A();
1202 }
1203 &set_label("x86",16);
1204 &mov ($_tbl,$tbl);
1205 &mov ($s0,&DWP(0,$acc)); # load input data
1206 &mov ($s1,&DWP(4,$acc));
1207 &mov ($s2,&DWP(8,$acc));
1208 &mov ($s3,&DWP(12,$acc));
1209 &call ("_x86_AES_encrypt_compact");
1210 &mov ("esp",$_esp); # restore stack pointer
1211 &mov ($acc,&wparam(1)); # load out
1212 &mov (&DWP(0,$acc),$s0); # write output data
1213 &mov (&DWP(4,$acc),$s1);
1214 &mov (&DWP(8,$acc),$s2);
1215 &mov (&DWP(12,$acc),$s3);
1216&function_end("aes_encrypt_internal");
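# The Te4 copy picker above, in scalar form: the distance between the
# table and the aligned stack frame, reduced to address bits 8-9,
# selects one of the four 256-byte Te4 copies, and the extra 128 lets
# the compact lookups use short -128..127 displacements. A sketch with
# plain integers standing in for the two addresses:

sub pick_te4 {
	my ($te, $esp) = @_;				# hypothetical addresses
	my $sel = (($esp + 768 - 4) - $te) & 0x300;	# 0, 256, 512 or 768
	return $te + 2048 + 128 + $sel;			# chosen copy, biased by 128
}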
1217
1218#--------------------------------------------------------------------#
1219
1220######################################################################
1221# "Compact" block function
1222######################################################################
1223
1224sub deccompact()
1225{ my $Fn = \&mov;
1226 while ($#_>5) { pop(@_); $Fn=sub{}; }
1227 my ($i,$td,@s)=@_;
1228 my $tmp = $key;
1229 my $out = $i==3?$s[0]:$acc;
1230
1231 # $Fn is used in the first compact round; its purpose is to
1232 # suppress restoration of some values from the stack, so that
1233 # after 4x deccompact with the extra argument, the $key, $s0
1234 # and $s1 values are left there...
1235 if($i==3) { &$Fn ($key,$__key); }
1236 else { &mov ($out,$s[0]); }
1237 &and ($out,0xFF);
1238 &movz ($out,&BP(-128,$td,$out,1));
1239
1240 if ($i==3) { $tmp=$s[1]; }
1241 &movz ($tmp,&HB($s[1]));
1242 &movz ($tmp,&BP(-128,$td,$tmp,1));
1243 &shl ($tmp,8);
1244 &xor ($out,$tmp);
1245
1246 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1247 else { &mov ($tmp,$s[2]); }
1248 &shr ($tmp,16);
1249 &and ($tmp,0xFF);
1250 &movz ($tmp,&BP(-128,$td,$tmp,1));
1251 &shl ($tmp,16);
1252 &xor ($out,$tmp);
1253
1254 if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
1255 else { &mov ($tmp,$s[3]); }
1256 &shr ($tmp,24);
1257 &movz ($tmp,&BP(-128,$td,$tmp,1));
1258 &shl ($tmp,24);
1259 &xor ($out,$tmp);
1260 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1261 if ($i==3) { &$Fn ($s[3],$__s0); }
1262}
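# The $Fn dispatch above, isolated: any argument beyond the sixth swaps
# the restoring &mov for an empty closure, so one generator body emits
# both the in-loop variant (restores suppressed) and the final-round
# variant. A toy model, not part of the generator:

sub pick_emitter {
	my @args = @_;
	my $Fn = \&mov;			# default: emit the restoring mov
	$Fn = sub {} if @args > 6;	# extra 7th argument: emit nothing
	return $Fn;
}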
1263
1264# must be called with 2,3,0,1 as argument sequence!!!
1265sub dectransform()
1266{ my @s = ($s0,$s1,$s2,$s3);
1267 my $i = shift;
1268 my $tmp = $key;
1269 my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1);
1270 my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
1271 my $tp8 = $tbl;
1272
1273 &mov ($acc,$s[$i]);
1274 &and ($acc,0x80808080);
1275 &mov ($tmp,$acc);
1276 &shr ($tmp,7);
1277 &lea ($tp2,&DWP(0,$s[$i],$s[$i]));
1278 &sub ($acc,$tmp);
1279 &and ($tp2,0xfefefefe);
1280 &and ($acc,0x1b1b1b1b);
1281 &xor ($acc,$tp2);
1282 &mov ($tp2,$acc);
1283
1284 &and ($acc,0x80808080);
1285 &mov ($tmp,$acc);
1286 &shr ($tmp,7);
1287 &lea ($tp4,&DWP(0,$tp2,$tp2));
1288 &sub ($acc,$tmp);
1289 &and ($tp4,0xfefefefe);
1290 &and ($acc,0x1b1b1b1b);
1291 &xor ($tp2,$s[$i]); # tp2^tp1
1292 &xor ($acc,$tp4);
1293 &mov ($tp4,$acc);
1294
1295 &and ($acc,0x80808080);
1296 &mov ($tmp,$acc);
1297 &shr ($tmp,7);
1298 &lea ($tp8,&DWP(0,$tp4,$tp4));
1299 &sub ($acc,$tmp);
1300 &and ($tp8,0xfefefefe);
1301 &and ($acc,0x1b1b1b1b);
1302 &xor ($tp4,$s[$i]); # tp4^tp1
1303 &rotl ($s[$i],8); # = ROTATE(tp1,8)
1304 &xor ($tp8,$acc);
1305
1306 &xor ($s[$i],$tp2);
1307 &xor ($tp2,$tp8);
1308 &rotl ($tp2,24);
1309 &xor ($s[$i],$tp4);
1310 &xor ($tp4,$tp8);
1311 &rotl ($tp4,16);
1312 &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
1313 &rotl ($tp8,8);
1314 &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
1315 &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
1316 &mov ($s[0],$__s0) if($i==2); #prefetch $s0
1317 &mov ($s[1],$__s1) if($i==3); #prefetch $s1
1318 &mov ($s[2],$__s2) if($i==1);
1319 &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
1320
1321 &mov ($s[3],$__s3) if($i==1);
1322 &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
1323}
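# The mask dance above (0x80808080, 0xfefefefe, 0x1b1b1b1b) doubles four
# GF(2^8) elements packed in one 32-bit word: collect the high bits,
# shift every byte left by one, then fold the overflow back in with the
# AES reduction polynomial. The same computation as scalar Perl, for
# reference only:

sub xtime4 {	# multiply four packed GF(2^8) bytes by x (i.e. by 2)
	my $x = shift;
	my $hi = $x & 0x80808080;			# bytes about to overflow
	my $d  = ($x << 1) & 0xfefefefe;		# per-byte left shift
	return $d ^ (($hi - ($hi >> 7)) & 0x1b1b1b1b);	# reduce mod x^8+x^4+x^3+x+1
}

# dectransform chains three of these (tp2, tp4, tp8 above) and combines
# byte rotations of the results to realize InvMixColumns per column.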
1324
1325&function_begin_B("_x86_AES_decrypt_compact");
1326 # note that caller is expected to allocate stack frame for me!
1327 &mov ($__key,$key); # save key
1328
1329 &xor ($s0,&DWP(0,$key)); # xor with key
1330 &xor ($s1,&DWP(4,$key));
1331 &xor ($s2,&DWP(8,$key));
1332 &xor ($s3,&DWP(12,$key));
1333
1334 &mov ($acc,&DWP(240,$key)); # load key->rounds
1335
1336 &lea ($acc,&DWP(-2,$acc,$acc));
1337 &lea ($acc,&DWP(0,$key,$acc,8));
1338 &mov ($__end,$acc); # end of key schedule
1339
1340 # prefetch Td4
1341 &mov ($key,&DWP(0-128,$tbl));
1342 &mov ($acc,&DWP(32-128,$tbl));
1343 &mov ($key,&DWP(64-128,$tbl));
1344 &mov ($acc,&DWP(96-128,$tbl));
1345 &mov ($key,&DWP(128-128,$tbl));
1346 &mov ($acc,&DWP(160-128,$tbl));
1347 &mov ($key,&DWP(192-128,$tbl));
1348 &mov ($acc,&DWP(224-128,$tbl));
1349
1350 &set_label("loop",16);
1351
1352 &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);
1353 &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
1354 &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
1355 &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
1356 &dectransform(2);
1357 &dectransform(3);
1358 &dectransform(0);
1359 &dectransform(1);
1360 &mov ($key,$__key);
1361 &mov ($tbl,$__tbl);
1362 &add ($key,16); # advance rd_key
1363 &xor ($s0,&DWP(0,$key));
1364 &xor ($s1,&DWP(4,$key));
1365 &xor ($s2,&DWP(8,$key));
1366 &xor ($s3,&DWP(12,$key));
1367
1368 &cmp ($key,$__end);
1369 &mov ($__key,$key);
1370 &jb (&label("loop"));
1371
1372 &deccompact(0,$tbl,$s0,$s3,$s2,$s1);
1373 &deccompact(1,$tbl,$s1,$s0,$s3,$s2);
1374 &deccompact(2,$tbl,$s2,$s1,$s0,$s3);
1375 &deccompact(3,$tbl,$s3,$s2,$s1,$s0);
1376
1377 &xor ($s0,&DWP(16,$key));
1378 &xor ($s1,&DWP(20,$key));
1379 &xor ($s2,&DWP(24,$key));
1380 &xor ($s3,&DWP(28,$key));
1381
1382 &ret ();
1383&function_end_B("_x86_AES_decrypt_compact");
1384
1385######################################################################
1386# "Compact" SSE block function.
1387######################################################################
1388
1389sub sse_deccompact()
1390{
1391 &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
1392 &movd ("eax","mm1"); # 7, 6, 1, 0
1393
1394 &pshufw ("mm5","mm4",0x09); # 13,12,11,10
1395 &movz ($acc,&LB("eax")); # 0
1396 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
1397 &movd ("ebx","mm5"); # 13,12,11,10
1398 &movz ("edx",&HB("eax")); # 1
1399 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
1400 &shl ("edx",8); # 1
1401
1402 &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
1403 &movz ($acc,&LB("ebx")); # 10
1404 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
1405 &shl ($acc,16); # 10
1406 &or ("ecx",$acc); # 10
1407 &shr ("eax",16); # 7, 6
1408 &movz ($acc,&HB("ebx")); # 11
1409 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
1410 &shl ($acc,24); # 11
1411 &or ("edx",$acc); # 11
1412 &shr ("ebx",16); # 13,12
1413
1414 &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
1415 &movz ($acc,&HB("eax")); # 7
1416 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
1417 &shl ($acc,24); # 7
1418 &or ("ecx",$acc); # 7
1419 &movz ($acc,&HB("ebx")); # 13
1420 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
1421 &shl ($acc,8); # 13
1422 &or ("ecx",$acc); # 13
1423 &movd ("mm0","ecx"); # t[0] collected
1424
1425 &movz ($acc,&LB("eax")); # 6
1426 &movd ("eax","mm2"); # 3, 2, 5, 4
1427 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
1428 &shl ("ecx",16); # 6
1429 &movz ($acc,&LB("ebx")); # 12
1430 &movd ("ebx","mm6"); # 9, 8,15,14
1431 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
1432 &or ("ecx",$acc); # 12
1433
1434 &movz ($acc,&LB("eax")); # 4
1435 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
1436 &or ("edx",$acc); # 4
1437 &movz ($acc,&LB("ebx")); # 14
1438 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
1439 &shl ($acc,16); # 14
1440 &or ("edx",$acc); # 14
1441 &movd ("mm1","edx"); # t[1] collected
1442
1443 &movz ($acc,&HB("eax")); # 5
1444 &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
1445 &shl ("edx",8); # 5
1446 &movz ($acc,&HB("ebx")); # 15
1447 &shr ("eax",16); # 3, 2
1448 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
1449 &shl ($acc,24); # 15
1450 &or ("edx",$acc); # 15
1451 &shr ("ebx",16); # 9, 8
1452
1453 &punpckldq ("mm0","mm1"); # t[0,1] collected
1454
1455 &movz ($acc,&HB("ebx")); # 9
1456 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
1457 &shl ($acc,8); # 9
1458 &or ("ecx",$acc); # 9
1459 &and ("ebx",0xff); # 8
1460 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
1461 &or ("edx","ebx"); # 8
1462 &movz ($acc,&LB("eax")); # 2
1463 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
1464 &shl ($acc,16); # 2
1465 &or ("edx",$acc); # 2
1466 &movd ("mm4","edx"); # t[2] collected
1467 &movz ("eax",&HB("eax")); # 3
1468 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
1469 &shl ("eax",24); # 3
1470 &or ("ecx","eax"); # 3
1471 &movd ("mm5","ecx"); # t[3] collected
1472
1473 &punpckldq ("mm4","mm5"); # t[2,3] collected
1474}
1475
1476 if (!$x86only) {
1477&function_begin_B("_sse_AES_decrypt_compact");
1478 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
1479 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
1480
1481 # note that caller is expected to allocate stack frame for me!
1482 &mov ($acc,&DWP(240,$key)); # load key->rounds
1483 &lea ($acc,&DWP(-2,$acc,$acc));
1484 &lea ($acc,&DWP(0,$key,$acc,8));
1485 &mov ($__end,$acc); # end of key schedule
1486
1487 &mov ($s0,0x1b1b1b1b); # magic constant
1488 &mov (&DWP(8,"esp"),$s0);
1489 &mov (&DWP(12,"esp"),$s0);
1490
1491 # prefetch Td4
1492 &mov ($s0,&DWP(0-128,$tbl));
1493 &mov ($s1,&DWP(32-128,$tbl));
1494 &mov ($s2,&DWP(64-128,$tbl));
1495 &mov ($s3,&DWP(96-128,$tbl));
1496 &mov ($s0,&DWP(128-128,$tbl));
1497 &mov ($s1,&DWP(160-128,$tbl));
1498 &mov ($s2,&DWP(192-128,$tbl));
1499 &mov ($s3,&DWP(224-128,$tbl));
1500
1501 &set_label("loop",16);
1502 &sse_deccompact();
1503 &add ($key,16);
1504 &cmp ($key,$__end);
1505 &ja (&label("out"));
1506
1507 # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
1508 &movq ("mm3","mm0"); &movq ("mm7","mm4");
1509 &movq ("mm2","mm0",1); &movq ("mm6","mm4",1);
1510 &movq ("mm1","mm0"); &movq ("mm5","mm4");
1511 &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
1512 &pslld ("mm2",8); &pslld ("mm6",8);
1513 &psrld ("mm3",8); &psrld ("mm7",8);
1514 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8
1515 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8
1516 &pslld ("mm2",16); &pslld ("mm6",16);
1517 &psrld ("mm3",16); &psrld ("mm7",16);
1518 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24
1519 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24
1520
1521 &movq ("mm3",&QWP(8,"esp"));
1522 &pxor ("mm2","mm2"); &pxor ("mm6","mm6");
1523 &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5");
1524 &pand ("mm2","mm3"); &pand ("mm6","mm3");
1525 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1526 &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2
1527 &movq ("mm3","mm1"); &movq ("mm7","mm5");
1528 &movq ("mm2","mm1"); &movq ("mm6","mm5");
1529 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2
1530 &pslld ("mm3",24); &pslld ("mm7",24);
1531 &psrld ("mm2",8); &psrld ("mm6",8);
1532 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24
1533 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8
1534
1535 &movq ("mm2",&QWP(8,"esp"));
1536 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1537 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1538 &pand ("mm3","mm2"); &pand ("mm7","mm2");
1539 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1540 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
1541 &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
1542 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
1543 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
1544
1545 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1546 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1547 &pand ("mm3","mm2"); &pand ("mm7","mm2");
1548 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1549 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8
1550 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8
1551 &movq ("mm3","mm1"); &movq ("mm7","mm5");
1552 &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);
1553 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
1554 &pslld ("mm1",8); &pslld ("mm5",8);
1555 &psrld ("mm3",8); &psrld ("mm7",8);
1556 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
1557 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
1558 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
1559 &mov ($s0,&DWP(0-128,$tbl));
1560 &pslld ("mm1",16); &pslld ("mm5",16);
1561 &mov ($s1,&DWP(64-128,$tbl));
1562 &psrld ("mm3",16); &psrld ("mm7",16);
1563 &mov ($s2,&DWP(128-128,$tbl));
1564 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
1565 &mov ($s3,&DWP(192-128,$tbl));
1566 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24
1567
1568 &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
1569 &jmp (&label("loop"));
1570
1571 &set_label("out",16);
1572 &pxor ("mm0",&QWP(0,$key));
1573 &pxor ("mm4",&QWP(8,$key));
1574
1575 &ret ();
1576&function_end_B("_sse_AES_decrypt_compact");
1577 }
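# MMX has no rotate instruction, so the pslld/psrld/pxor groups in the
# loop above synthesize dword rotations from shifts and xors (pshufw
# 0xb1 covers the 16-bit case); and since ROTATE(x^y,N) ==
# ROTATE(x,N)^ROTATE(y,N), the code may accumulate xors first and
# rotate the combined terms once. What one synthesized rotate computes
# per 32-bit lane, as a scalar sketch:

sub rot32 {
	my ($x, $n) = @_;		# $n = 1..31
	return (($x << $n) | ($x >> (32 - $n))) & 0xffffffff;
}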
1578
1579######################################################################
1580# Vanilla block function.
1581######################################################################
1582
1583sub decstep()
1584{ my ($i,$td,@s) = @_;
1585 my $tmp = $key;
1586 my $out = $i==3?$s[0]:$acc;
1587
1588 # no instructions are reordered here, as performance appears
1589 # optimal... or rather, no attempt to reorder them resulted
1590 # in better performance [which, by the way, is not a bit
1591 # lower than encryption].
1592 if($i==3) { &mov ($key,$__key); }
1593 else { &mov ($out,$s[0]); }
1594 &and ($out,0xFF);
1595 &mov ($out,&DWP(0,$td,$out,8));
1596
1597 if ($i==3) { $tmp=$s[1]; }
1598 &movz ($tmp,&HB($s[1]));
1599 &xor ($out,&DWP(3,$td,$tmp,8));
1600
1601 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1602 else { &mov ($tmp,$s[2]); }
1603 &shr ($tmp,16);
1604 &and ($tmp,0xFF);
1605 &xor ($out,&DWP(2,$td,$tmp,8));
1606
1607 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
1608 else { &mov ($tmp,$s[3]); }
1609 &shr ($tmp,24);
1610 &xor ($out,&DWP(1,$td,$tmp,8));
1611 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1612 if ($i==3) { &mov ($s[3],$__s0); }
1613 &comment();
1614}
1615
1616sub declast()
1617{ my ($i,$td,@s)=@_;
1618 my $tmp = $key;
1619 my $out = $i==3?$s[0]:$acc;
1620
1621 if($i==0) { &lea ($td,&DWP(2048+128,$td));
1622 &mov ($tmp,&DWP(0-128,$td));
1623 &mov ($acc,&DWP(32-128,$td));
1624 &mov ($tmp,&DWP(64-128,$td));
1625 &mov ($acc,&DWP(96-128,$td));
1626 &mov ($tmp,&DWP(128-128,$td));
1627 &mov ($acc,&DWP(160-128,$td));
1628 &mov ($tmp,&DWP(192-128,$td));
1629 &mov ($acc,&DWP(224-128,$td));
1630 &lea ($td,&DWP(-128,$td)); }
1631 if($i==3) { &mov ($key,$__key); }
1632 else { &mov ($out,$s[0]); }
1633 &and ($out,0xFF);
1634 &movz ($out,&BP(0,$td,$out,1));
1635
1636 if ($i==3) { $tmp=$s[1]; }
1637 &movz ($tmp,&HB($s[1]));
1638 &movz ($tmp,&BP(0,$td,$tmp,1));
1639 &shl ($tmp,8);
1640 &xor ($out,$tmp);
1641
1642 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1643 else { &mov ($tmp,$s[2]); }
1644 &shr ($tmp,16);
1645 &and ($tmp,0xFF);
1646 &movz ($tmp,&BP(0,$td,$tmp,1));
1647 &shl ($tmp,16);
1648 &xor ($out,$tmp);
1649
1650 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
1651 else { &mov ($tmp,$s[3]); }
1652 &shr ($tmp,24);
1653 &movz ($tmp,&BP(0,$td,$tmp,1));
1654 &shl ($tmp,24);
1655 &xor ($out,$tmp);
1656 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1657 if ($i==3) { &mov ($s[3],$__s0);
1658 &lea ($td,&DWP(-2048,$td)); }
1659}
1660
1661&function_begin_B("_x86_AES_decrypt");
1662 # note that caller is expected to allocate stack frame for me!
1663 &mov ($__key,$key); # save key
1664
1665 &xor ($s0,&DWP(0,$key)); # xor with key
1666 &xor ($s1,&DWP(4,$key));
1667 &xor ($s2,&DWP(8,$key));
1668 &xor ($s3,&DWP(12,$key));
1669
1670 &mov ($acc,&DWP(240,$key)); # load key->rounds
1671
1672 if ($small_footprint) {
1673 &lea ($acc,&DWP(-2,$acc,$acc));
1674 &lea ($acc,&DWP(0,$key,$acc,8));
1675 &mov ($__end,$acc); # end of key schedule
1676 &set_label("loop",16);
1677 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1678 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1679 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1680 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1681 &add ($key,16); # advance rd_key
1682 &xor ($s0,&DWP(0,$key));
1683 &xor ($s1,&DWP(4,$key));
1684 &xor ($s2,&DWP(8,$key));
1685 &xor ($s3,&DWP(12,$key));
1686 &cmp ($key,$__end);
1687 &mov ($__key,$key);
1688 &jb (&label("loop"));
1689 }
1690 else {
1691 &cmp ($acc,10);
1692 &jle (&label("10rounds"));
1693 &cmp ($acc,12);
1694 &jle (&label("12rounds"));
1695
1696 &set_label("14rounds",4);
1697 for ($i=1;$i<3;$i++) {
1698 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1699 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1700 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1701 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1702 &xor ($s0,&DWP(16*$i+0,$key));
1703 &xor ($s1,&DWP(16*$i+4,$key));
1704 &xor ($s2,&DWP(16*$i+8,$key));
1705 &xor ($s3,&DWP(16*$i+12,$key));
1706 }
1707 &add ($key,32);
1708 &mov ($__key,$key); # advance rd_key
1709 &set_label("12rounds",4);
1710 for ($i=1;$i<3;$i++) {
1711 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1712 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1713 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1714 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1715 &xor ($s0,&DWP(16*$i+0,$key));
1716 &xor ($s1,&DWP(16*$i+4,$key));
1717 &xor ($s2,&DWP(16*$i+8,$key));
1718 &xor ($s3,&DWP(16*$i+12,$key));
1719 }
1720 &add ($key,32);
1721 &mov ($__key,$key); # advance rd_key
1722 &set_label("10rounds",4);
1723 for ($i=1;$i<10;$i++) {
1724 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1725 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1726 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1727 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1728 &xor ($s0,&DWP(16*$i+0,$key));
1729 &xor ($s1,&DWP(16*$i+4,$key));
1730 &xor ($s2,&DWP(16*$i+8,$key));
1731 &xor ($s3,&DWP(16*$i+12,$key));
1732 }
1733 }
1734
1735 &declast(0,$tbl,$s0,$s3,$s2,$s1);
1736 &declast(1,$tbl,$s1,$s0,$s3,$s2);
1737 &declast(2,$tbl,$s2,$s1,$s0,$s3);
1738 &declast(3,$tbl,$s3,$s2,$s1,$s0);
1739
1740 &add ($key,$small_footprint?16:160);
1741 &xor ($s0,&DWP(0,$key));
1742 &xor ($s1,&DWP(4,$key));
1743 &xor ($s2,&DWP(8,$key));
1744 &xor ($s3,&DWP(12,$key));
1745
1746 &ret ();
1747&function_end_B("_x86_AES_decrypt");
1748
1749 &rodataseg();
1750&set_label("AES_Td",64);
1751 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
1752 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
1753 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
1754 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
1755 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
1756 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
1757 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
1758 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
1759 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
1760 &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
1761 &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
1762 &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
1763 &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
1764 &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
1765 &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
1766 &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
1767 &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
1768 &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
1769 &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
1770 &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
1771 &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
1772 &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
1773 &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
1774 &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
1775 &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
1776 &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
1777 &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
1778 &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
1779 &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
1780 &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
1781 &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
1782 &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
1783 &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
1784 &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
1785 &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
1786 &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
1787 &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
1788 &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
1789 &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
1790 &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
1791 &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
1792 &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
1793 &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
1794 &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
1795 &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
1796 &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
1797 &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
1798 &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
1799 &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
1800 &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
1801 &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
1802 &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
1803 &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
1804 &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
1805 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
1806 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
1807 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
1808 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
1809 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
1810 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
1811 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
1812 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
1813 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
1814 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
1815
1816#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
1817 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1818 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1819 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1820 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1821 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1822 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1823 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1824 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1825 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1826 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1827 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1828 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1829 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1830 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1831 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1832 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1833 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1834 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1835 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1836 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1837 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1838 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1839 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1840 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1841 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1842 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1843 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1844 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1845 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1846 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1847 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1848 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1849
1850 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1851 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1852 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1853 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1854 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1855 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1856 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1857 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1858 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1859 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1860 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1861 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1862 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1863 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1864 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1865 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1866 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1867 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1868 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1869 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1870 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1871 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1872 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1873 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1874 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1875 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1876 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1877 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1878 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1879 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1880 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1881 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1882
1883 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1884 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1885 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1886 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1887 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1888 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1889 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1890 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1891 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1892 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1893 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1894 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1895 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1896 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1897 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1898 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1899 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1900 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1901 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1902 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1903 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1904 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1905 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1906 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1907 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1908 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1909 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1910 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1911 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1912 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1913 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1914 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1915
1916 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1917 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1918 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1919 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1920 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1921 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1922 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1923 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1924 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1925 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1926 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1927 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1928 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1929 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1930 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1931 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1932 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1933 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1934 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1935 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1936 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1937 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1938 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1939 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1940 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1941 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1942 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1943 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1944 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1945 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1946 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1947 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1948 &previous();
1949
1950# void aes_decrypt_internal(const void *inp, void *out, const AES_KEY *key);
1951&function_begin("aes_decrypt_internal");
1952 &mov ($acc,&wparam(0)); # load inp
1953 &mov ($key,&wparam(2)); # load key
1954
1955 &mov ($s0,"esp");
1956 &sub ("esp",36);
1957 &and ("esp",-64); # align to cache-line
1958
1959 # place stack frame just "above" the key schedule
1960 &lea ($s1,&DWP(-64-63,$key));
1961 &sub ($s1,"esp");
1962 &neg ($s1);
1963 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1964 &sub ("esp",$s1);
1965 &add ("esp",4); # 4 is reserved for caller's return address
1966 &mov ($_esp,$s0); # save stack pointer
1967
1968 &picsetup($tbl);
1969 &picsymbol($s0, "OPENSSL_ia32cap_P", $tbl);
1970 &picsymbol($tbl, &label("AES_Td"), $tbl);
1971
1972 # pick Td4 copy which can't "overlap" with stack frame or key schedule
1973 &lea ($s1,&DWP(768-4,"esp"));
1974 &sub ($s1,$tbl);
1975 &and ($s1,0x300);
1976 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
1977
1978 if (!$x86only) {
1979 &bt (&DWP(0,$s0),"\$IA32CAP_BIT0_SSE"); # check for SSE bit
1980 &jnc (&label("x86"));
1981
1982 &movq ("mm0",&QWP(0,$acc));
1983 &movq ("mm4",&QWP(8,$acc));
1984 &call ("_sse_AES_decrypt_compact");
1985 &mov ("esp",$_esp); # restore stack pointer
1986 &mov ($acc,&wparam(1)); # load out
1987 &movq (&QWP(0,$acc),"mm0"); # write output data
1988 &movq (&QWP(8,$acc),"mm4");
1989 &emms ();
1990 &function_end_A();
1991 }
1992 &set_label("x86",16);
1993 &mov ($_tbl,$tbl);
1994 &mov ($s0,&DWP(0,$acc)); # load input data
1995 &mov ($s1,&DWP(4,$acc));
1996 &mov ($s2,&DWP(8,$acc));
1997 &mov ($s3,&DWP(12,$acc));
1998 &call ("_x86_AES_decrypt_compact");
1999 &mov ("esp",$_esp); # restore stack pointer
2000 &mov ($acc,&wparam(1)); # load out
2001 &mov (&DWP(0,$acc),$s0); # write output data
2002 &mov (&DWP(4,$acc),$s1);
2003 &mov (&DWP(8,$acc),$s2);
2004 &mov (&DWP(12,$acc),$s3);
2005&function_end("aes_decrypt_internal");
2006
2007# void aes_cbc_encrypt_internal(const unsigned char *inp, unsigned char *out,
2008# size_t length, const AES_KEY *key, unsigned char *ivp, const int enc);
2009{
2010# stack frame layout
2011# -4(%esp) # return address 0(%esp)
2012# 0(%esp) # s0 backing store 4(%esp)
2013# 4(%esp) # s1 backing store 8(%esp)
2014# 8(%esp) # s2 backing store 12(%esp)
2015# 12(%esp) # s3 backing store 16(%esp)
2016# 16(%esp) # key backup 20(%esp)
2017# 20(%esp) # end of key schedule 24(%esp)
2018# 24(%esp) # %ebp backup 28(%esp)
2019# 28(%esp) # %esp backup
2020my $_inp=&DWP(32,"esp"); # copy of wparam(0)
2021my $_out=&DWP(36,"esp"); # copy of wparam(1)
2022my $_len=&DWP(40,"esp"); # copy of wparam(2)
2023my $_key=&DWP(44,"esp"); # copy of wparam(3)
2024my $_ivp=&DWP(48,"esp"); # copy of wparam(4)
2025my $_tmp=&DWP(52,"esp"); # volatile variable
2026#
2027my $ivec=&DWP(60,"esp"); # ivec[16]
2028my $aes_key=&DWP(76,"esp"); # copy of aes_key
2029my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
2030
2031&function_begin("aes_cbc_encrypt_internal");
2032 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
2033 &cmp ($s2,0);
2034 &je (&label("drop_out"));
2035
2036 &picsetup($tbl);
2037 &picsymbol($s0, "OPENSSL_ia32cap_P", $tbl);
2038 &picsymbol($tbl, &label("AES_Te"), $tbl);
2039 &cmp (&wparam(5),0);
2040 &jne (&label("picked_te"));
2041 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
2042 &set_label("picked_te");
2043
2044 # one can argue whether this is required
2045 &pushf ();
2046 &cld ();
2047
2048 &cmp ($s2,$speed_limit);
2049 &jb (&label("slow_way"));
2050 &test ($s2,15);
2051 &jnz (&label("slow_way"));
2052 if (!$x86only) {
2053 &bt (&DWP(0,$s0),"\$IA32CAP_BIT0_HT"); # check for hyper-threading bit
2054 &jc (&label("slow_way"));
2055 }
2056 # pre-allocate aligned stack frame...
2057 &lea ($acc,&DWP(-80-244,"esp"));
2058 &and ($acc,-64);
2059
2060 # ... and make sure it doesn't alias with $tbl modulo 4096
2061 &mov ($s0,$tbl);
2062 &lea ($s1,&DWP(2048+256,$tbl));
2063 &mov ($s3,$acc);
2064 &and ($s0,0xfff); # s = %ebp&0xfff
2065 &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
2066 &and ($s3,0xfff); # p = %esp&0xfff
2067
2068 &cmp ($s3,$s1); # if (p>=e) %esp -= (p-e);
2069 &jb (&label("tbl_break_out"));
2070 &sub ($s3,$s1);
2071 &sub ($acc,$s3);
2072 &jmp (&label("tbl_ok"));
2073 &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz;
2074 &sub ($s3,$s0);
2075 &and ($s3,0xfff);
2076 &add ($s3,384);
2077 &sub ($acc,$s3);
2078 &set_label("tbl_ok",4);
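# The branchy adjustment above keeps the prospective stack frame off the
# cache lines occupied by the 2048+256 bytes of tables, with addresses
# compared modulo 4096. The same logic in scalar form, integers standing
# in for %ebp and the candidate frame pointer:

sub dodge_tbl {
	my ($tbl, $frame) = @_;
	my $s = $tbl & 0xfff;			# table start, mod 4096
	my $e = ($tbl + 2048 + 256) & 0xfff;	# table end, mod 4096
	my $p = $frame & 0xfff;			# frame, mod 4096
	if ($p >= $e)	{ $frame -= $p - $e; }
	else		{ $frame -= (($p - $s) & 0xfff) + 384; }
	return $frame;
}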
2079
2080 &lea ($s3,&wparam(0)); # obtain pointer to parameter block
2081 &exch ("esp",$acc); # allocate stack frame
2082 &add ("esp",4); # reserve for return address!
2083 &mov ($_tbl,$tbl); # save %ebp
2084 &mov ($_esp,$acc); # save %esp
2085
2086 &mov ($s0,&DWP(0,$s3)); # load inp
2087 &mov ($s1,&DWP(4,$s3)); # load out
2088 #&mov ($s2,&DWP(8,$s3)); # load len
2089 &mov ($key,&DWP(12,$s3)); # load key
2090 &mov ($acc,&DWP(16,$s3)); # load ivp
2091 &mov ($s3,&DWP(20,$s3)); # load enc flag
2092
2093 &mov ($_inp,$s0); # save copy of inp
2094 &mov ($_out,$s1); # save copy of out
2095 &mov ($_len,$s2); # save copy of len
2096 &mov ($_key,$key); # save copy of key
2097 &mov ($_ivp,$acc); # save copy of ivp
2098
2099 &mov ($mark,0); # copy of aes_key->rounds = 0;
2100 # do we copy key schedule to stack?
2101 &mov ($s1 eq "ebx" ? $s1 : "",$key);
2102 &mov ($s2 eq "ecx" ? $s2 : "",244/4);
2103 &sub ($s1,$tbl);
2104 &mov ("esi",$key);
2105 &and ($s1,0xfff);
2106 &lea ("edi",$aes_key);
2107 &cmp ($s1,2048+256);
2108 &jb (&label("do_copy"));
2109 &cmp ($s1,4096-244);
2110 &jb (&label("skip_copy"));
2111 &set_label("do_copy",4);
2112 &mov ($_key,"edi");
2113 &data_word(0xA5F3F689); # rep movsd
2114 &set_label("skip_copy");
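# The two compares above implement the copy decision: the key schedule
# is copied onto the stack whenever it sits close enough to the tables
# modulo 4096 that their cache lines could collide. The predicate,
# spelled out as a sketch over hypothetical addresses:

sub must_copy_key {
	my ($key, $tbl) = @_;
	my $dist = ($key - $tbl) & 0xfff;
	return $dist < 2048 + 256 || $dist >= 4096 - 244;
}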
2115
2116 &mov ($key,16);
2117 &set_label("prefetch_tbl",4);
2118 &mov ($s0,&DWP(0,$tbl));
2119 &mov ($s1,&DWP(32,$tbl));
2120 &mov ($s2,&DWP(64,$tbl));
2121 &mov ($acc,&DWP(96,$tbl));
2122 &lea ($tbl,&DWP(128,$tbl));
2123 &sub ($key,1);
2124 &jnz (&label("prefetch_tbl"));
2125 &sub ($tbl,2048);
2126
2127 &mov ($acc,$_inp);
2128 &mov ($key,$_ivp);
2129
2130 &cmp ($s3,0);
2131 &je (&label("fast_decrypt"));
2132
2133#----------------------------- ENCRYPT -----------------------------#
2134 &mov ($s0,&DWP(0,$key)); # load iv
2135 &mov ($s1,&DWP(4,$key));
2136
2137 &set_label("fast_enc_loop",16);
2138 &mov ($s2,&DWP(8,$key));
2139 &mov ($s3,&DWP(12,$key));
2140
2141 &xor ($s0,&DWP(0,$acc)); # xor input data
2142 &xor ($s1,&DWP(4,$acc));
2143 &xor ($s2,&DWP(8,$acc));
2144 &xor ($s3,&DWP(12,$acc));
2145
2146 &mov ($key,$_key); # load key
2147 &call ("_x86_AES_encrypt");
2148
2149 &mov ($acc,$_inp); # load inp
2150 &mov ($key,$_out); # load out
2151
2152 &mov (&DWP(0,$key),$s0); # save output data
2153 &mov (&DWP(4,$key),$s1);
2154 &mov (&DWP(8,$key),$s2);
2155 &mov (&DWP(12,$key),$s3);
2156
2157 &lea ($acc,&DWP(16,$acc)); # advance inp
2158 &mov ($s2,$_len); # load len
2159 &mov ($_inp,$acc); # save inp
2160 &lea ($s3,&DWP(16,$key)); # advance out
2161 &mov ($_out,$s3); # save out
2162 &sub ($s2,16); # decrease len
2163 &mov ($_len,$s2); # save len
2164 &jnz (&label("fast_enc_loop"));
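# The loop above is textbook CBC encryption, C[i] = E_K(P[i] ^ C[i-1])
# with C[-1] = IV; the state registers carry the running ciphertext, so
# the iv is never reloaded inside the loop. The chaining with a generic
# one-block encryptor, as a sketch over integer "blocks":

sub cbc_encrypt_blocks {
	my ($E, $iv, @pt) = @_;		# $E: any one-block encrypt callback
	my @ct;
	for my $p (@pt) {
		$iv = $E->($p ^ $iv);	# C[i] = E_K(P[i] xor C[i-1])
		push @ct, $iv;
	}
	return @ct;			# $iv ends up as the chaining value
}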
2165 &mov ($acc,$_ivp); # load ivp
2166 &mov ($s2,&DWP(8,$key)); # restore last 2 dwords
2167 &mov ($s3,&DWP(12,$key));
2168 &mov (&DWP(0,$acc),$s0); # save ivec
2169 &mov (&DWP(4,$acc),$s1);
2170 &mov (&DWP(8,$acc),$s2);
2171 &mov (&DWP(12,$acc),$s3);
2172
2173 &cmp ($mark,0); # was the key schedule copied?
2174 &mov ("edi",$_key);
2175 &je (&label("skip_ezero"));
2176 # zero copy of key schedule
2177 &mov ("ecx",240/4);
2178 &xor ("eax","eax");
2179 &align (4);
2180 &data_word(0xABF3F689); # rep stosd
2181 &set_label("skip_ezero")
2182 &mov ("esp",$_esp);
2183 &popf ();
2184 &set_label("drop_out");
2185 &function_end_A();
2186 &pushf (); # kludge, never executed
2187
2188#----------------------------- DECRYPT -----------------------------#
2189&set_label("fast_decrypt",16);
2190
2191 &cmp ($acc,$_out);
2192 &je (&label("fast_dec_in_place")); # in-place processing...
2193
2194 &mov ($_tmp,$key);
2195
2196 &align (4);
2197 &set_label("fast_dec_loop",16);
2198 &mov ($s0,&DWP(0,$acc)); # read input
2199 &mov ($s1,&DWP(4,$acc));
2200 &mov ($s2,&DWP(8,$acc));
2201 &mov ($s3,&DWP(12,$acc));
2202
2203 &mov ($key,$_key); # load key
2204 &call ("_x86_AES_decrypt");
2205
2206 &mov ($key,$_tmp); # load ivp
2207 &mov ($acc,$_len); # load len
2208 &xor ($s0,&DWP(0,$key)); # xor iv
2209 &xor ($s1,&DWP(4,$key));
2210 &xor ($s2,&DWP(8,$key));
2211 &xor ($s3,&DWP(12,$key));
2212
2213 &mov ($key,$_out); # load out
2214 &mov ($acc,$_inp); # load inp
2215
2216 &mov (&DWP(0,$key),$s0); # write output
2217 &mov (&DWP(4,$key),$s1);
2218 &mov (&DWP(8,$key),$s2);
2219 &mov (&DWP(12,$key),$s3);
2220
2221 &mov ($s2,$_len); # load len
2222 &mov ($_tmp,$acc); # save ivp
2223 &lea ($acc,&DWP(16,$acc)); # advance inp
2224 &mov ($_inp,$acc); # save inp
2225 &lea ($key,&DWP(16,$key)); # advance out
2226 &mov ($_out,$key); # save out
2227 &sub ($s2,16); # decrease len
2228 &mov ($_len,$s2); # save len
2229 &jnz (&label("fast_dec_loop"));
2230 &mov ($key,$_tmp); # load temp ivp
2231 &mov ($acc,$_ivp); # load user ivp
2232 &mov ($s0,&DWP(0,$key)); # load iv
2233 &mov ($s1,&DWP(4,$key));
2234 &mov ($s2,&DWP(8,$key));
2235 &mov ($s3,&DWP(12,$key));
2236 &mov (&DWP(0,$acc),$s0); # copy back to user
2237 &mov (&DWP(4,$acc),$s1);
2238 &mov (&DWP(8,$acc),$s2);
2239 &mov (&DWP(12,$acc),$s3);
2240 &jmp (&label("fast_dec_out"));
2241
2242 &set_label("fast_dec_in_place",16);
2243 &set_label("fast_dec_in_place_loop");
2244 &mov ($s0,&DWP(0,$acc)); # read input
2245 &mov ($s1,&DWP(4,$acc));
2246 &mov ($s2,&DWP(8,$acc));
2247 &mov ($s3,&DWP(12,$acc));
2248
2249 &lea ($key,$ivec);
2250 &mov (&DWP(0,$key),$s0); # copy to temp
2251 &mov (&DWP(4,$key),$s1);
2252 &mov (&DWP(8,$key),$s2);
2253 &mov (&DWP(12,$key),$s3);
2254
2255 &mov ($key,$_key); # load key
2256 &call ("_x86_AES_decrypt");
2257
2258 &mov ($key,$_ivp); # load ivp
2259 &mov ($acc,$_out); # load out
2260 &xor ($s0,&DWP(0,$key)); # xor iv
2261 &xor ($s1,&DWP(4,$key));
2262 &xor ($s2,&DWP(8,$key));
2263 &xor ($s3,&DWP(12,$key));
2264
2265 &mov (&DWP(0,$acc),$s0); # write output
2266 &mov (&DWP(4,$acc),$s1);
2267 &mov (&DWP(8,$acc),$s2);
2268 &mov (&DWP(12,$acc),$s3);
2269
2270 &lea ($acc,&DWP(16,$acc)); # advance out
2271 &mov ($_out,$acc); # save out
2272
2273 &lea ($acc,$ivec);
2274 &mov ($s0,&DWP(0,$acc)); # read temp
2275 &mov ($s1,&DWP(4,$acc));
2276 &mov ($s2,&DWP(8,$acc));
2277 &mov ($s3,&DWP(12,$acc));
2278
2279 &mov (&DWP(0,$key),$s0); # copy iv
2280 &mov (&DWP(4,$key),$s1);
2281 &mov (&DWP(8,$key),$s2);
2282 &mov (&DWP(12,$key),$s3);
2283
2284 &mov ($acc,$_inp); # load inp
2285 &mov ($s2,$_len); # load len
2286 &lea ($acc,&DWP(16,$acc)); # advance inp
2287 &mov ($_inp,$acc); # save inp
2288 &sub ($s2,16); # decrease len
2289 &mov ($_len,$s2); # save len
2290 &jnz (&label("fast_dec_in_place_loop"));
2291
2292 &set_label("fast_dec_out",4);
2293 &cmp ($mark,0); # was the key schedule copied?
2294 &mov ("edi",$_key);
2295 &je (&label("skip_dzero"));
2296	# zero out the copy of the key schedule
2297 &mov ("ecx",240/4);
2298 &xor ("eax","eax");
2299 &align (4);
2300 &data_word(0xABF3F689); # rep stosd
2301	&set_label("skip_dzero");
2302 &mov ("esp",$_esp);
2303 &popf ();
2304 &function_end_A();
2305 &pushf (); # kludge, never executed
2306
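# [Editorial sketch] The in-place decrypt path above must stash each
# ciphertext block in the $ivec temporary *before* decrypting it, because
# the plaintext overwrites the very bytes that serve as the chaining value
# for the next block. A Perl model of that ordering, assuming a
# hypothetical aes_decrypt_block($block16, $key):
sub cbc_decrypt_in_place_sketch {
	my ($blocks, $iv, $key) = @_;	# ref to 16-byte strings, IV, key
	my $chain = $iv;
	for my $b (@$blocks) {		# $b aliases the block, so writes are in place
		my $saved = $b;		# keep the ciphertext before overwriting it
		$b = aes_decrypt_block($b, $key) ^ $chain;
		$chain = $saved;	# the saved ciphertext chains forward
	}
	return $chain;			# updated IV
}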
2307#--------------------------- SLOW ROUTINE ---------------------------#
2308&set_label("slow_way",16);
2309
2310 &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
2311 &mov ($key,&wparam(3)); # load key
2312
2313 # pre-allocate aligned stack frame...
2314 &lea ($acc,&DWP(-80,"esp"));
2315 &and ($acc,-64);
2316
2317 # ... and make sure it doesn't alias with $key modulo 1024
2318 &lea ($s1,&DWP(-80-63,$key));
2319 &sub ($s1,$acc);
2320 &neg ($s1);
2321 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
2322 &sub ($acc,$s1);
2323
2324 # pick S-box copy which can't overlap with stack frame or $key
2325 &lea ($s1,&DWP(768,$acc));
2326 &sub ($s1,$tbl);
2327 &and ($s1,0x300);
2328 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
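	# [Editorial note] the two adjustments above keep the aligned stack
	# frame, the key schedule and the selected S-box copy off the same
	# cache lines (modulo 1024), avoiding aliasing-induced evictions
	# between them in this compact path.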
2329
2330 &lea ($s3,&wparam(0)); # pointer to parameter block
2331
2332 &exch ("esp",$acc);
2333 &add ("esp",4); # reserve for return address!
2334 &mov ($_tbl,$tbl); # save %ebp
2335 &mov ($_esp,$acc); # save %esp
2336 &mov ($_tmp,$s0); # save OPENSSL_ia32cap
2337
2338 &mov ($s0,&DWP(0,$s3)); # load inp
2339 &mov ($s1,&DWP(4,$s3)); # load out
2340 #&mov ($s2,&DWP(8,$s3)); # load len
2341 #&mov ($key,&DWP(12,$s3)); # load key
2342 &mov ($acc,&DWP(16,$s3)); # load ivp
2343 &mov ($s3,&DWP(20,$s3)); # load enc flag
2344
2345 &mov ($_inp,$s0); # save copy of inp
2346 &mov ($_out,$s1); # save copy of out
2347 &mov ($_len,$s2); # save copy of len
2348 &mov ($_key,$key); # save copy of key
2349 &mov ($_ivp,$acc); # save copy of ivp
2350
2351 &mov ($key,$acc);
2352 &mov ($acc,$s0);
2353
2354 &cmp ($s3,0);
2355 &je (&label("slow_decrypt"));
2356
2357#--------------------------- SLOW ENCRYPT ---------------------------#
2358 &cmp ($s2,16);
2359 &mov ($s3,$s1);
2360 &jb (&label("slow_enc_tail"));
2361
2362 if (!$x86only) {
2363 &bt ($_tmp,"\$IA32CAP_BIT0_SSE"); # check for SSE bit
2364 &jnc (&label("slow_enc_x86"));
2365
2366 &movq ("mm0",&QWP(0,$key)); # load iv
2367 &movq ("mm4",&QWP(8,$key));
2368
2369 &set_label("slow_enc_loop_sse",16);
2370 &pxor ("mm0",&QWP(0,$acc)); # xor input data
2371 &pxor ("mm4",&QWP(8,$acc));
2372
2373 &mov ($key,$_key);
2374 &call ("_sse_AES_encrypt_compact");
2375
2376 &mov ($acc,$_inp); # load inp
2377 &mov ($key,$_out); # load out
2378 &mov ($s2,$_len); # load len
2379
2380 &movq (&QWP(0,$key),"mm0"); # save output data
2381 &movq (&QWP(8,$key),"mm4");
2382
2383 &lea ($acc,&DWP(16,$acc)); # advance inp
2384 &mov ($_inp,$acc); # save inp
2385 &lea ($s3,&DWP(16,$key)); # advance out
2386 &mov ($_out,$s3); # save out
2387 &sub ($s2,16); # decrease len
2388 &cmp ($s2,16);
2389 &mov ($_len,$s2); # save len
2390 &jae (&label("slow_enc_loop_sse"));
2391 &test ($s2,15);
2392 &jnz (&label("slow_enc_tail"));
2393 &mov ($acc,$_ivp); # load ivp
2394 &movq (&QWP(0,$acc),"mm0"); # save ivec
2395 &movq (&QWP(8,$acc),"mm4");
2396 &emms ();
2397 &mov ("esp",$_esp);
2398 &popf ();
2399 &function_end_A();
2400 &pushf (); # kludge, never executed
2401 }
2402 &set_label("slow_enc_x86",16);
2403 &mov ($s0,&DWP(0,$key)); # load iv
2404 &mov ($s1,&DWP(4,$key));
2405
2406 &set_label("slow_enc_loop_x86",4);
2407 &mov ($s2,&DWP(8,$key));
2408 &mov ($s3,&DWP(12,$key));
2409
2410 &xor ($s0,&DWP(0,$acc)); # xor input data
2411 &xor ($s1,&DWP(4,$acc));
2412 &xor ($s2,&DWP(8,$acc));
2413 &xor ($s3,&DWP(12,$acc));
2414
2415 &mov ($key,$_key); # load key
2416 &call ("_x86_AES_encrypt_compact");
2417
2418 &mov ($acc,$_inp); # load inp
2419 &mov ($key,$_out); # load out
2420
2421 &mov (&DWP(0,$key),$s0); # save output data
2422 &mov (&DWP(4,$key),$s1);
2423 &mov (&DWP(8,$key),$s2);
2424 &mov (&DWP(12,$key),$s3);
2425
2426 &mov ($s2,$_len); # load len
2427 &lea ($acc,&DWP(16,$acc)); # advance inp
2428 &mov ($_inp,$acc); # save inp
2429 &lea ($s3,&DWP(16,$key)); # advance out
2430 &mov ($_out,$s3); # save out
2431 &sub ($s2,16); # decrease len
2432 &cmp ($s2,16);
2433 &mov ($_len,$s2); # save len
2434 &jae (&label("slow_enc_loop_x86"));
2435 &test ($s2,15);
2436 &jnz (&label("slow_enc_tail"));
2437 &mov ($acc,$_ivp); # load ivp
2438 &mov ($s2,&DWP(8,$key)); # restore last dwords
2439 &mov ($s3,&DWP(12,$key));
2440 &mov (&DWP(0,$acc),$s0); # save ivec
2441 &mov (&DWP(4,$acc),$s1);
2442 &mov (&DWP(8,$acc),$s2);
2443 &mov (&DWP(12,$acc),$s3);
2444
2445 &mov ("esp",$_esp);
2446 &popf ();
2447 &function_end_A();
2448 &pushf (); # kludge, never executed
2449
2450 &set_label("slow_enc_tail",16);
2451 &emms () if (!$x86only);
2452 &mov ($key eq "edi"? $key:"",$s3); # load out to edi
2453 &mov ($s1,16);
2454 &sub ($s1,$s2);
2455 &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp
2456 &je (&label("enc_in_place"));
2457 &align (4);
2458 &data_word(0xA4F3F689); # rep movsb # copy input
2459 &jmp (&label("enc_skip_in_place"));
2460 &set_label("enc_in_place");
2461 &lea ($key,&DWP(0,$key,$s2));
2462 &set_label("enc_skip_in_place");
2463 &mov ($s2,$s1);
2464 &xor ($s0,$s0);
2465 &align (4);
2466 &data_word(0xAAF3F689); # rep stosb # zero tail
2467
2468 &mov ($key,$_ivp); # restore ivp
2469 &mov ($acc,$s3); # output as input
2470 &mov ($s0,&DWP(0,$key));
2471 &mov ($s1,&DWP(4,$key));
2472 &mov ($_len,16); # len=16
2473 &jmp (&label("slow_enc_loop_x86")); # one more spin...
2474
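# [Editorial note] slow_enc_tail handles a trailing partial block the way
# the historical AES_cbc_encrypt interface did: the remaining input bytes
# are copied (or left in place), the rest of the 16-byte block is
# zero-filled with rep stosb, and the loop runs one final time, so a length
# that is not a multiple of 16 still yields a full final ciphertext block.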
2475#--------------------------- SLOW DECRYPT ---------------------------#
2476&set_label("slow_decrypt",16);
2477 if (!$x86only) {
2478 &bt ($_tmp,"\$IA32CAP_BIT0_SSE"); # check for SSE bit
2479 &jnc (&label("slow_dec_loop_x86"));
2480
2481 &set_label("slow_dec_loop_sse",4);
2482 &movq ("mm0",&QWP(0,$acc)); # read input
2483 &movq ("mm4",&QWP(8,$acc));
2484
2485 &mov ($key,$_key);
2486 &call ("_sse_AES_decrypt_compact");
2487
2488 &mov ($acc,$_inp); # load inp
2489 &lea ($s0,$ivec);
2490 &mov ($s1,$_out); # load out
2491 &mov ($s2,$_len); # load len
2492 &mov ($key,$_ivp); # load ivp
2493
2494 &movq ("mm1",&QWP(0,$acc)); # re-read input
2495 &movq ("mm5",&QWP(8,$acc));
2496
2497 &pxor ("mm0",&QWP(0,$key)); # xor iv
2498 &pxor ("mm4",&QWP(8,$key));
2499
2500 &movq (&QWP(0,$key),"mm1"); # copy input to iv
2501 &movq (&QWP(8,$key),"mm5");
2502
2503 &sub ($s2,16); # decrease len
2504 &jc (&label("slow_dec_partial_sse"));
2505
2506 &movq (&QWP(0,$s1),"mm0"); # write output
2507 &movq (&QWP(8,$s1),"mm4");
2508
2509 &lea ($s1,&DWP(16,$s1)); # advance out
2510 &mov ($_out,$s1); # save out
2511 &lea ($acc,&DWP(16,$acc)); # advance inp
2512 &mov ($_inp,$acc); # save inp
2513 &mov ($_len,$s2); # save len
2514 &jnz (&label("slow_dec_loop_sse"));
2515 &emms ();
2516 &mov ("esp",$_esp);
2517 &popf ();
2518 &function_end_A();
2519 &pushf (); # kludge, never executed
2520
2521 &set_label("slow_dec_partial_sse",16);
2522 &movq (&QWP(0,$s0),"mm0"); # save output to temp
2523 &movq (&QWP(8,$s0),"mm4");
2524 &emms ();
2525
2526 &add ($s2 eq "ecx" ? "ecx":"",16);
2527 &mov ("edi",$s1); # out
2528 &mov ("esi",$s0); # temp
2529 &align (4);
2530 &data_word(0xA4F3F689); # rep movsb # copy partial output
2531
2532 &mov ("esp",$_esp);
2533 &popf ();
2534 &function_end_A();
2535 &pushf (); # kludge, never executed
2536 }
2537 &set_label("slow_dec_loop_x86",16);
2538 &mov ($s0,&DWP(0,$acc)); # read input
2539 &mov ($s1,&DWP(4,$acc));
2540 &mov ($s2,&DWP(8,$acc));
2541 &mov ($s3,&DWP(12,$acc));
2542
2543 &lea ($key,$ivec);
2544 &mov (&DWP(0,$key),$s0); # copy to temp
2545 &mov (&DWP(4,$key),$s1);
2546 &mov (&DWP(8,$key),$s2);
2547 &mov (&DWP(12,$key),$s3);
2548
2549 &mov ($key,$_key); # load key
2550 &call ("_x86_AES_decrypt_compact");
2551
2552 &mov ($key,$_ivp); # load ivp
2553 &mov ($acc,$_len); # load len
2554 &xor ($s0,&DWP(0,$key)); # xor iv
2555 &xor ($s1,&DWP(4,$key));
2556 &xor ($s2,&DWP(8,$key));
2557 &xor ($s3,&DWP(12,$key));
2558
2559 &sub ($acc,16);
2560 &jc (&label("slow_dec_partial_x86"));
2561
2562 &mov ($_len,$acc); # save len
2563 &mov ($acc,$_out); # load out
2564
2565 &mov (&DWP(0,$acc),$s0); # write output
2566 &mov (&DWP(4,$acc),$s1);
2567 &mov (&DWP(8,$acc),$s2);
2568 &mov (&DWP(12,$acc),$s3);
2569
2570 &lea ($acc,&DWP(16,$acc)); # advance out
2571 &mov ($_out,$acc); # save out
2572
2573 &lea ($acc,$ivec);
2574 &mov ($s0,&DWP(0,$acc)); # read temp
2575 &mov ($s1,&DWP(4,$acc));
2576 &mov ($s2,&DWP(8,$acc));
2577 &mov ($s3,&DWP(12,$acc));
2578
2579 &mov (&DWP(0,$key),$s0); # copy it to iv
2580 &mov (&DWP(4,$key),$s1);
2581 &mov (&DWP(8,$key),$s2);
2582 &mov (&DWP(12,$key),$s3);
2583
2584 &mov ($acc,$_inp); # load inp
2585 &lea ($acc,&DWP(16,$acc)); # advance inp
2586 &mov ($_inp,$acc); # save inp
2587 &jnz (&label("slow_dec_loop_x86"));
2588 &mov ("esp",$_esp);
2589 &popf ();
2590 &function_end_A();
2591 &pushf (); # kludge, never executed
2592
2593 &set_label("slow_dec_partial_x86",16);
2594 &lea ($acc,$ivec);
2595 &mov (&DWP(0,$acc),$s0); # save output to temp
2596 &mov (&DWP(4,$acc),$s1);
2597 &mov (&DWP(8,$acc),$s2);
2598 &mov (&DWP(12,$acc),$s3);
2599
2600 &mov ($acc,$_inp);
2601 &mov ($s0,&DWP(0,$acc)); # re-read input
2602 &mov ($s1,&DWP(4,$acc));
2603 &mov ($s2,&DWP(8,$acc));
2604 &mov ($s3,&DWP(12,$acc));
2605
2606 &mov (&DWP(0,$key),$s0); # copy it to iv
2607 &mov (&DWP(4,$key),$s1);
2608 &mov (&DWP(8,$key),$s2);
2609 &mov (&DWP(12,$key),$s3);
2610
2611 &mov ("ecx",$_len);
2612 &mov ("edi",$_out);
2613 &lea ("esi",$ivec);
2614 &align (4);
2615 &data_word(0xA4F3F689); # rep movsb # copy partial output
2616
2617 &mov ("esp",$_esp);
2618 &popf ();
2619&function_end("aes_cbc_encrypt_internal");
2620}
2621
2622#------------------------------------------------------------------#
2623
2624sub enckey()
2625{
2626 &movz ("esi",&LB("edx")); # rk[i]>>0
2627 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2628 &movz ("esi",&HB("edx")); # rk[i]>>8
2629 &shl ("ebx",24);
2630 &xor ("eax","ebx");
2631
2632 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2633 &shr ("edx",16);
2634 &movz ("esi",&LB("edx")); # rk[i]>>16
2635 &xor ("eax","ebx");
2636
2637 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2638 &movz ("esi",&HB("edx")); # rk[i]>>24
2639 &shl ("ebx",8);
2640 &xor ("eax","ebx");
2641
2642 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2643 &shl ("ebx",16);
2644 &xor ("eax","ebx");
2645
2646 &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon
2647}
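# [Editorial sketch] enckey() above is the key-expansion core: it rotates
# the previous round-key word, substitutes each byte through the S-box
# (the Te4 bytes kept at $tbl-128) and folds in the round constant; the
# x86 code merely does this in the table's little-endian byte layout. The
# same step in conventional big-endian notation, with @sbox and @rcon as
# assumed lookup tables:
sub expand_core_sketch {
	my ($w, $i) = @_;		# previous round-key word, round index
	$w = (($w << 8) | ($w >> 24)) & 0xffffffff;	# RotWord
	my $out = 0;
	$out |= $sbox[($w >> $_) & 0xff] << $_ for (0, 8, 16, 24); # SubWord
	return $out ^ ($rcon[$i] << 24);	# fold rcon into the top byte
}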
2648
2649&function_begin("_x86_AES_set_encrypt_key");
2650 &mov ("esi",&wparam(1)); # user supplied key
2651 &mov ("edi",&wparam(3)); # private key schedule
2652
2653 &test ("esi",-1);
2654 &jz (&label("badpointer"));
2655 &test ("edi",-1);
2656 &jz (&label("badpointer"));
2657
2658 &picsetup($tbl);
2659 &picsymbol($tbl, &label("AES_Te"), $tbl);
2660
2661 &lea ($tbl,&DWP(2048+128,$tbl));
2662
2663 # prefetch Te4
2664 &mov ("eax",&DWP(0-128,$tbl));
2665 &mov ("ebx",&DWP(32-128,$tbl));
2666 &mov ("ecx",&DWP(64-128,$tbl));
2667 &mov ("edx",&DWP(96-128,$tbl));
2668 &mov ("eax",&DWP(128-128,$tbl));
2669 &mov ("ebx",&DWP(160-128,$tbl));
2670 &mov ("ecx",&DWP(192-128,$tbl));
2671 &mov ("edx",&DWP(224-128,$tbl));
2672
2673 &mov ("ecx",&wparam(2)); # number of bits in key
2674 &cmp ("ecx",128);
2675 &je (&label("10rounds"));
2676 &cmp ("ecx",192);
2677 &je (&label("12rounds"));
2678 &cmp ("ecx",256);
2679 &je (&label("14rounds"));
2680 &mov ("eax",-2); # invalid number of bits
2681 &jmp (&label("exit"));
2682
2683 &set_label("10rounds");
2684 &mov ("eax",&DWP(0,"esi")); # copy first 4 dwords
2685 &mov ("ebx",&DWP(4,"esi"));
2686 &mov ("ecx",&DWP(8,"esi"));
2687 &mov ("edx",&DWP(12,"esi"));
2688 &mov (&DWP(0,"edi"),"eax");
2689 &mov (&DWP(4,"edi"),"ebx");
2690 &mov (&DWP(8,"edi"),"ecx");
2691 &mov (&DWP(12,"edi"),"edx");
2692
2693 &xor ("ecx","ecx");
2694 &jmp (&label("10shortcut"));
2695
2696 &align (4);
2697 &set_label("10loop");
2698 &mov ("eax",&DWP(0,"edi")); # rk[0]
2699 &mov ("edx",&DWP(12,"edi")); # rk[3]
2700 &set_label("10shortcut");
2701 &enckey ();
2702
2703 &mov (&DWP(16,"edi"),"eax"); # rk[4]
2704 &xor ("eax",&DWP(4,"edi"));
2705 &mov (&DWP(20,"edi"),"eax"); # rk[5]
2706 &xor ("eax",&DWP(8,"edi"));
2707 &mov (&DWP(24,"edi"),"eax"); # rk[6]
2708 &xor ("eax",&DWP(12,"edi"));
2709 &mov (&DWP(28,"edi"),"eax"); # rk[7]
2710 &inc ("ecx");
2711 &add ("edi",16);
2712 &cmp ("ecx",10);
2713 &jl (&label("10loop"));
2714
2715 &mov (&DWP(80,"edi"),10); # setup number of rounds
2716 &xor ("eax","eax");
2717 &jmp (&label("exit"));
2718
2719 &set_label("12rounds");
2720 &mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
2721 &mov ("ebx",&DWP(4,"esi"));
2722 &mov ("ecx",&DWP(8,"esi"));
2723 &mov ("edx",&DWP(12,"esi"));
2724 &mov (&DWP(0,"edi"),"eax");
2725 &mov (&DWP(4,"edi"),"ebx");
2726 &mov (&DWP(8,"edi"),"ecx");
2727 &mov (&DWP(12,"edi"),"edx");
2728 &mov ("ecx",&DWP(16,"esi"));
2729 &mov ("edx",&DWP(20,"esi"));
2730 &mov (&DWP(16,"edi"),"ecx");
2731 &mov (&DWP(20,"edi"),"edx");
2732
2733 &xor ("ecx","ecx");
2734 &jmp (&label("12shortcut"));
2735
2736 &align (4);
2737 &set_label("12loop");
2738 &mov ("eax",&DWP(0,"edi")); # rk[0]
2739 &mov ("edx",&DWP(20,"edi")); # rk[5]
2740 &set_label("12shortcut");
2741 &enckey ();
2742
2743 &mov (&DWP(24,"edi"),"eax"); # rk[6]
2744 &xor ("eax",&DWP(4,"edi"));
2745 &mov (&DWP(28,"edi"),"eax"); # rk[7]
2746 &xor ("eax",&DWP(8,"edi"));
2747 &mov (&DWP(32,"edi"),"eax"); # rk[8]
2748 &xor ("eax",&DWP(12,"edi"));
2749 &mov (&DWP(36,"edi"),"eax"); # rk[9]
2750
2751 &cmp ("ecx",7);
2752 &je (&label("12break"));
2753 &inc ("ecx");
2754
2755 &xor ("eax",&DWP(16,"edi"));
2756 &mov (&DWP(40,"edi"),"eax"); # rk[10]
2757 &xor ("eax",&DWP(20,"edi"));
2758 &mov (&DWP(44,"edi"),"eax"); # rk[11]
2759
2760 &add ("edi",24);
2761 &jmp (&label("12loop"));
2762
2763 &set_label("12break");
2764 &mov (&DWP(72,"edi"),12); # setup number of rounds
2765 &xor ("eax","eax");
2766 &jmp (&label("exit"));
2767
2768 &set_label("14rounds");
2769 &mov ("eax",&DWP(0,"esi")); # copy first 8 dwords
2770 &mov ("ebx",&DWP(4,"esi"));
2771 &mov ("ecx",&DWP(8,"esi"));
2772 &mov ("edx",&DWP(12,"esi"));
2773 &mov (&DWP(0,"edi"),"eax");
2774 &mov (&DWP(4,"edi"),"ebx");
2775 &mov (&DWP(8,"edi"),"ecx");
2776 &mov (&DWP(12,"edi"),"edx");
2777 &mov ("eax",&DWP(16,"esi"));
2778 &mov ("ebx",&DWP(20,"esi"));
2779 &mov ("ecx",&DWP(24,"esi"));
2780 &mov ("edx",&DWP(28,"esi"));
2781 &mov (&DWP(16,"edi"),"eax");
2782 &mov (&DWP(20,"edi"),"ebx");
2783 &mov (&DWP(24,"edi"),"ecx");
2784 &mov (&DWP(28,"edi"),"edx");
2785
2786 &xor ("ecx","ecx");
2787 &jmp (&label("14shortcut"));
2788
2789 &align (4);
2790 &set_label("14loop");
2791 &mov ("edx",&DWP(28,"edi")); # rk[7]
2792 &set_label("14shortcut");
2793 &mov ("eax",&DWP(0,"edi")); # rk[0]
2794
2795 &enckey ();
2796
2797 &mov (&DWP(32,"edi"),"eax"); # rk[8]
2798 &xor ("eax",&DWP(4,"edi"));
2799 &mov (&DWP(36,"edi"),"eax"); # rk[9]
2800 &xor ("eax",&DWP(8,"edi"));
2801 &mov (&DWP(40,"edi"),"eax"); # rk[10]
2802 &xor ("eax",&DWP(12,"edi"));
2803 &mov (&DWP(44,"edi"),"eax"); # rk[11]
2804
2805 &cmp ("ecx",6);
2806 &je (&label("14break"));
2807 &inc ("ecx");
2808
2809 &mov ("edx","eax");
2810 &mov ("eax",&DWP(16,"edi")); # rk[4]
2811 &movz ("esi",&LB("edx")); # rk[11]>>0
2812 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2813 &movz ("esi",&HB("edx")); # rk[11]>>8
2814 &xor ("eax","ebx");
2815
2816 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2817 &shr ("edx",16);
2818 &shl ("ebx",8);
2819 &movz ("esi",&LB("edx")); # rk[11]>>16
2820 &xor ("eax","ebx");
2821
2822 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2823 &movz ("esi",&HB("edx")); # rk[11]>>24
2824 &shl ("ebx",16);
2825 &xor ("eax","ebx");
2826
2827 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2828 &shl ("ebx",24);
2829 &xor ("eax","ebx");
2830
2831 &mov (&DWP(48,"edi"),"eax"); # rk[12]
2832 &xor ("eax",&DWP(20,"edi"));
2833 &mov (&DWP(52,"edi"),"eax"); # rk[13]
2834 &xor ("eax",&DWP(24,"edi"));
2835 &mov (&DWP(56,"edi"),"eax"); # rk[14]
2836 &xor ("eax",&DWP(28,"edi"));
2837 &mov (&DWP(60,"edi"),"eax"); # rk[15]
2838
2839 &add ("edi",32);
2840 &jmp (&label("14loop"));
2841
2842 &set_label("14break");
2843 &mov (&DWP(48,"edi"),14); # setup number of rounds
2844 &xor ("eax","eax");
2845 &jmp (&label("exit"));
2846
2847 &set_label("badpointer");
2848 &mov ("eax",-1);
2849 &set_label("exit");
2850&function_end("_x86_AES_set_encrypt_key");
2851
2852# int aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits,
2853# AES_KEY *key)
2854&function_begin_B("aes_set_encrypt_key_internal");
2855 &call ("_x86_AES_set_encrypt_key");
2856 &ret ();
2857&function_end_B("aes_set_encrypt_key_internal");
2858
2859sub deckey()
2860{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
2861 my $tmp = $tbl;
2862
2863 &mov ($acc,$tp1);
2864 &and ($acc,0x80808080);
2865 &mov ($tmp,$acc);
2866 &shr ($tmp,7);
2867 &lea ($tp2,&DWP(0,$tp1,$tp1));
2868 &sub ($acc,$tmp);
2869 &and ($tp2,0xfefefefe);
2870 &and ($acc,0x1b1b1b1b);
2871 &xor ($acc,$tp2);
2872 &mov ($tp2,$acc);
2873
2874 &and ($acc,0x80808080);
2875 &mov ($tmp,$acc);
2876 &shr ($tmp,7);
2877 &lea ($tp4,&DWP(0,$tp2,$tp2));
2878 &sub ($acc,$tmp);
2879 &and ($tp4,0xfefefefe);
2880 &and ($acc,0x1b1b1b1b);
2881 &xor ($tp2,$tp1); # tp2^tp1
2882 &xor ($acc,$tp4);
2883 &mov ($tp4,$acc);
2884
2885 &and ($acc,0x80808080);
2886 &mov ($tmp,$acc);
2887 &shr ($tmp,7);
2888 &lea ($tp8,&DWP(0,$tp4,$tp4));
2889 &xor ($tp4,$tp1); # tp4^tp1
2890 &sub ($acc,$tmp);
2891 &and ($tp8,0xfefefefe);
2892 &and ($acc,0x1b1b1b1b);
2893 &rotl ($tp1,8); # = ROTATE(tp1,8)
2894 &xor ($tp8,$acc);
2895
2896 &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load
2897
2898 &xor ($tp1,$tp2);
2899 &xor ($tp2,$tp8);
2900 &xor ($tp1,$tp4);
2901 &rotl ($tp2,24);
2902 &xor ($tp4,$tp8);
2903 &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
2904 &rotl ($tp4,16);
2905 &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
2906 &rotl ($tp8,8);
2907 &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
2908 &mov ($tp2,$tmp);
2909 &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
2910
2911 &mov (&DWP(4*$i,$key),$tp1);
2912}
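# [Editorial sketch] deckey() above applies InvMixColumns to one round-key
# word so the inverted schedule works with the equivalent inverse cipher.
# Its 2x/4x/8x multiples (tp2/tp4/tp8) come from doubling four GF(2^8)
# bytes at once in a single 32-bit register. That packed doubling step as
# a stand-alone Perl helper (editorial, not part of the module):
sub xtime_packed_sketch {
	my ($tp1) = @_;
	my $hi  = $tp1 & 0x80808080;		   # high bit of each byte
	my $tp2 = ($tp1 << 1) & 0xfefefefe;	   # per-byte shift, no cross-byte carry
	my $red = ($hi - ($hi >> 7)) & 0x1b1b1b1b; # 0x1b where a byte overflowed
	return $tp2 ^ $red;	# reduce modulo x^8 + x^4 + x^3 + x + 1
}
# e.g. xtime_packed_sketch(0x80402010) == 0x1b804020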
2913
2914# int aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits,
2915# AES_KEY *key)
2916&function_begin_B("aes_set_decrypt_key_internal");
2917 &call ("_x86_AES_set_encrypt_key");
2918 &cmp ("eax",0);
2919 &je (&label("proceed"));
2920 &ret ();
2921
2922 &set_label("proceed");
2923 &push ("ebp");
2924 &push ("ebx");
2925 &push ("esi");
2926 &push ("edi");
2927
2928 &mov ("esi",&wparam(2));
2929 &mov ("ecx",&DWP(240,"esi")); # pull number of rounds
2930 &lea ("ecx",&DWP(0,"","ecx",4));
2931 &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk
2932
2933 &set_label("invert",4); # invert order of chunks
2934 &mov ("eax",&DWP(0,"esi"));
2935 &mov ("ebx",&DWP(4,"esi"));
2936 &mov ("ecx",&DWP(0,"edi"));
2937 &mov ("edx",&DWP(4,"edi"));
2938 &mov (&DWP(0,"edi"),"eax");
2939 &mov (&DWP(4,"edi"),"ebx");
2940 &mov (&DWP(0,"esi"),"ecx");
2941 &mov (&DWP(4,"esi"),"edx");
2942 &mov ("eax",&DWP(8,"esi"));
2943 &mov ("ebx",&DWP(12,"esi"));
2944 &mov ("ecx",&DWP(8,"edi"));
2945 &mov ("edx",&DWP(12,"edi"));
2946 &mov (&DWP(8,"edi"),"eax");
2947 &mov (&DWP(12,"edi"),"ebx");
2948 &mov (&DWP(8,"esi"),"ecx");
2949 &mov (&DWP(12,"esi"),"edx");
2950 &add ("esi",16);
2951 &sub ("edi",16);
2952 &cmp ("esi","edi");
2953 &jne (&label("invert"));
2954
2955 &mov ($key,&wparam(2));
2956 &mov ($acc,&DWP(240,$key)); # pull number of rounds
2957 &lea ($acc,&DWP(-2,$acc,$acc));
2958 &lea ($acc,&DWP(0,$key,$acc,8));
2959 &mov (&wparam(2),$acc);
2960
2961 &mov ($s0,&DWP(16,$key)); # modulo-scheduled load
2962 &set_label("permute",4); # permute the key schedule
2963 &add ($key,16);
2964 &deckey (0,$key,$s0,$s1,$s2,$s3);
2965 &deckey (1,$key,$s1,$s2,$s3,$s0);
2966 &deckey (2,$key,$s2,$s3,$s0,$s1);
2967 &deckey (3,$key,$s3,$s0,$s1,$s2);
2968 &cmp ($key,&wparam(2));
2969 &jb (&label("permute"));
2970
2971 &xor ("eax","eax"); # return success
2972&function_end("aes_set_decrypt_key_internal");
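# [Editorial sketch] aes_set_decrypt_key_internal above builds the
# decryption schedule in two passes: the "invert" loop swaps the 16-byte
# round keys end for end, then the "permute" loop runs deckey() over every
# round key except the first and last. In outline, with hypothetical
# set_encrypt_key_sketch() and inv_mix_column_sketch() helpers:
sub set_decrypt_key_sketch {
	my ($userkey, $bits, $rk) = @_;	# $rk: ref to array of 32-bit words
	set_encrypt_key_sketch($userkey, $bits, $rk) == 0 or return -1;
	my $rounds = @$rk / 4 - 1;
	# reverse the order of the 4-word round keys
	for (my ($i, $j) = (0, 4 * $rounds); $i < $j; $i += 4, $j -= 4) {
		@$rk[$i .. $i + 3, $j .. $j + 3] =
		    @$rk[$j .. $j + 3, $i .. $i + 3];
	}
	# InvMixColumns on all but the first and last round keys
	$rk->[$_] = inv_mix_column_sketch($rk->[$_]) for (4 .. 4 * $rounds - 1);
	return 0;
}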
2973
2974&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl
deleted file mode 100644
index 0048ee5b29..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-armv4.pl
+++ /dev/null
@@ -1,1134 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for ARMv4
11
12# January 2007.
13#
14# The code uses a single 1KB S-box and is more than twice as fast as code
15# generated by gcc-3.4.1. This is thanks to a unique feature of the ARMv4
16# ISA, which allows a logical or arithmetic operation to be merged with a
17# shift or rotate in a single instruction, emitting the combined result
18# every cycle. The module is endian-neutral. Performance is ~42 cycles per
19# byte for a 128-bit key [on a single-issue XScale PXA250 core].
20
21# May 2007.
22#
23# AES_set_[en|de]crypt_key is added.
24
25# July 2010.
26#
27# Rescheduling for the dual-issue pipeline resulted in a 12% improvement
28# on a Cortex-A8 core, at ~25 cycles per byte processed with a 128-bit key.
29
30# February 2011.
31#
32# Profiler-assisted and platform-specific optimization resulted in a 16%
33# improvement on a Cortex-A8 core, at ~21.5 cycles per byte.
34
35while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
36open STDOUT,">$output";
37
38$s0="r0";
39$s1="r1";
40$s2="r2";
41$s3="r3";
42$t1="r4";
43$t2="r5";
44$t3="r6";
45$i1="r7";
46$i2="r8";
47$i3="r9";
48
49$tbl="r10";
50$key="r11";
51$rounds="r12";
52
53$code=<<___;
54#include "arm_arch.h"
55.text
56.code 32
57
58.type AES_Te,%object
59.align 5
60AES_Te:
61.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
62.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
63.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
64.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
65.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
66.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
67.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
68.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
69.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
70.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
71.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
72.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
73.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
74.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
75.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
76.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
77.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
78.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
79.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
80.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
81.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
82.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
83.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
84.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
85.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
86.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
87.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
88.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
89.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
90.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
91.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
92.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
93.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
94.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
95.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
96.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
97.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
98.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
99.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
100.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
101.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
102.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
103.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
104.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
105.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
106.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
107.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
108.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
109.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
110.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
111.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
112.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
113.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
114.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
115.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
116.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
117.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
118.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
119.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
120.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
121.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
122.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
123.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
124.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
125@ Te4[256]
126.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
127.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
128.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
129.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
130.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
131.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
132.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
133.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
134.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
135.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
136.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
137.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
138.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
139.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
140.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
141.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
142.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
143.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
144.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
145.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
146.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
147.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
148.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
149.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
150.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
151.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
152.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
153.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
154.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
155.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
156.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
157.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
158@ rcon[]
159.word 0x01000000, 0x02000000, 0x04000000, 0x08000000
160.word 0x10000000, 0x20000000, 0x40000000, 0x80000000
161.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
162.size AES_Te,.-AES_Te
163
164@ void aes_encrypt_internal(const unsigned char *in, unsigned char *out,
165@ const AES_KEY *key) {
166.global aes_encrypt_internal
167.type aes_encrypt_internal,%function
168.align 5
169aes_encrypt_internal:
170 sub r3,pc,#8 @ aes_encrypt_internal
171 stmdb sp!,{r1,r4-r12,lr}
172 mov $rounds,r0 @ inp
173 mov $key,r2
174 sub $tbl,r3,#aes_encrypt_internal-AES_Te @ Te
175#if __ARM_ARCH__<7 || defined(__STRICT_ALIGNMENT)
176 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
177 ldrb $t1,[$rounds,#2] @ manner...
178 ldrb $t2,[$rounds,#1]
179 ldrb $t3,[$rounds,#0]
180 orr $s0,$s0,$t1,lsl#8
181 ldrb $s1,[$rounds,#7]
182 orr $s0,$s0,$t2,lsl#16
183 ldrb $t1,[$rounds,#6]
184 orr $s0,$s0,$t3,lsl#24
185 ldrb $t2,[$rounds,#5]
186 ldrb $t3,[$rounds,#4]
187 orr $s1,$s1,$t1,lsl#8
188 ldrb $s2,[$rounds,#11]
189 orr $s1,$s1,$t2,lsl#16
190 ldrb $t1,[$rounds,#10]
191 orr $s1,$s1,$t3,lsl#24
192 ldrb $t2,[$rounds,#9]
193 ldrb $t3,[$rounds,#8]
194 orr $s2,$s2,$t1,lsl#8
195 ldrb $s3,[$rounds,#15]
196 orr $s2,$s2,$t2,lsl#16
197 ldrb $t1,[$rounds,#14]
198 orr $s2,$s2,$t3,lsl#24
199 ldrb $t2,[$rounds,#13]
200 ldrb $t3,[$rounds,#12]
201 orr $s3,$s3,$t1,lsl#8
202 orr $s3,$s3,$t2,lsl#16
203 orr $s3,$s3,$t3,lsl#24
204#else
205 ldr $s0,[$rounds,#0]
206 ldr $s1,[$rounds,#4]
207 ldr $s2,[$rounds,#8]
208 ldr $s3,[$rounds,#12]
209#ifdef __ARMEL__
210 rev $s0,$s0
211 rev $s1,$s1
212 rev $s2,$s2
213 rev $s3,$s3
214#endif
215#endif
216 bl _armv4_AES_encrypt
217
218 ldr $rounds,[sp],#4 @ pop out
219#if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
220#ifdef __ARMEL__
221 rev $s0,$s0
222 rev $s1,$s1
223 rev $s2,$s2
224 rev $s3,$s3
225#endif
226 str $s0,[$rounds,#0]
227 str $s1,[$rounds,#4]
228 str $s2,[$rounds,#8]
229 str $s3,[$rounds,#12]
230#else
231 mov $t1,$s0,lsr#24 @ write output in endian-neutral
232 mov $t2,$s0,lsr#16 @ manner...
233 mov $t3,$s0,lsr#8
234 strb $t1,[$rounds,#0]
235 strb $t2,[$rounds,#1]
236 mov $t1,$s1,lsr#24
237 strb $t3,[$rounds,#2]
238 mov $t2,$s1,lsr#16
239 strb $s0,[$rounds,#3]
240 mov $t3,$s1,lsr#8
241 strb $t1,[$rounds,#4]
242 strb $t2,[$rounds,#5]
243 mov $t1,$s2,lsr#24
244 strb $t3,[$rounds,#6]
245 mov $t2,$s2,lsr#16
246 strb $s1,[$rounds,#7]
247 mov $t3,$s2,lsr#8
248 strb $t1,[$rounds,#8]
249 strb $t2,[$rounds,#9]
250 mov $t1,$s3,lsr#24
251 strb $t3,[$rounds,#10]
252 mov $t2,$s3,lsr#16
253 strb $s2,[$rounds,#11]
254 mov $t3,$s3,lsr#8
255 strb $t1,[$rounds,#12]
256 strb $t2,[$rounds,#13]
257 strb $t3,[$rounds,#14]
258 strb $s3,[$rounds,#15]
259#endif
260#if __ARM_ARCH__>=5
261 ldmia sp!,{r4-r12,pc}
262#else
263 ldmia sp!,{r4-r12,lr}
264 tst lr,#1
265 moveq pc,lr @ be binary compatible with V4, yet
266 bx lr @ interoperable with Thumb ISA:-)
267#endif
268.size aes_encrypt_internal,.-aes_encrypt_internal
269
270.type _armv4_AES_encrypt,%function
271.align 2
272_armv4_AES_encrypt:
273 str lr,[sp,#-4]! @ push lr
274 ldmia $key!,{$t1-$i1}
275 eor $s0,$s0,$t1
276 ldr $rounds,[$key,#240-16]
277 eor $s1,$s1,$t2
278 eor $s2,$s2,$t3
279 eor $s3,$s3,$i1
280 sub $rounds,$rounds,#1
281 mov lr,#255
282
283 and $i1,lr,$s0
284 and $i2,lr,$s0,lsr#8
285 and $i3,lr,$s0,lsr#16
286 mov $s0,$s0,lsr#24
287.Lenc_loop:
288 ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0]
289 and $i1,lr,$s1,lsr#16 @ i0
290 ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
291 and $i2,lr,$s1
292 ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
293 and $i3,lr,$s1,lsr#8
294 ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
295 mov $s1,$s1,lsr#24
296
297 ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16]
298 ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0]
299 ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8]
300 eor $s0,$s0,$i1,ror#8
301 ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
302 and $i1,lr,$s2,lsr#8 @ i0
303 eor $t2,$t2,$i2,ror#8
304 and $i2,lr,$s2,lsr#16 @ i1
305 eor $t3,$t3,$i3,ror#8
306 and $i3,lr,$s2
307 ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
308 eor $s1,$s1,$t1,ror#24
309 ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
310 mov $s2,$s2,lsr#24
311
312 ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
313 eor $s0,$s0,$i1,ror#16
314 ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
315 and $i1,lr,$s3 @ i0
316 eor $s1,$s1,$i2,ror#8
317 and $i2,lr,$s3,lsr#8 @ i1
318 eor $t3,$t3,$i3,ror#16
319 and $i3,lr,$s3,lsr#16 @ i2
320 ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
321 eor $s2,$s2,$t2,ror#16
322 ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
323 mov $s3,$s3,lsr#24
324
325 ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
326 eor $s0,$s0,$i1,ror#24
327 ldr $i1,[$key],#16
328 eor $s1,$s1,$i2,ror#16
329 ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
330 eor $s2,$s2,$i3,ror#8
331 ldr $t1,[$key,#-12]
332 eor $s3,$s3,$t3,ror#8
333
334 ldr $t2,[$key,#-8]
335 eor $s0,$s0,$i1
336 ldr $t3,[$key,#-4]
337 and $i1,lr,$s0
338 eor $s1,$s1,$t1
339 and $i2,lr,$s0,lsr#8
340 eor $s2,$s2,$t2
341 and $i3,lr,$s0,lsr#16
342 eor $s3,$s3,$t3
343 mov $s0,$s0,lsr#24
344
345 subs $rounds,$rounds,#1
346 bne .Lenc_loop
347
348 add $tbl,$tbl,#2
349
350 ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0]
351 and $i1,lr,$s1,lsr#16 @ i0
352 ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
353 and $i2,lr,$s1
354 ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
355 and $i3,lr,$s1,lsr#8
356 ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
357 mov $s1,$s1,lsr#24
358
359 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16]
360 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0]
361 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8]
362 eor $s0,$i1,$s0,lsl#8
363 ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
364 and $i1,lr,$s2,lsr#8 @ i0
365 eor $t2,$i2,$t2,lsl#8
366 and $i2,lr,$s2,lsr#16 @ i1
367 eor $t3,$i3,$t3,lsl#8
368 and $i3,lr,$s2
369 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
370 eor $s1,$t1,$s1,lsl#24
371 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
372 mov $s2,$s2,lsr#24
373
374 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
375 eor $s0,$i1,$s0,lsl#8
376 ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
377 and $i1,lr,$s3 @ i0
378 eor $s1,$s1,$i2,lsl#16
379 and $i2,lr,$s3,lsr#8 @ i1
380 eor $t3,$i3,$t3,lsl#8
381 and $i3,lr,$s3,lsr#16 @ i2
382 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
383 eor $s2,$t2,$s2,lsl#24
384 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
385 mov $s3,$s3,lsr#24
386
387 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
388 eor $s0,$i1,$s0,lsl#8
389 ldr $i1,[$key,#0]
390 ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
391 eor $s1,$s1,$i2,lsl#8
392 ldr $t1,[$key,#4]
393 eor $s2,$s2,$i3,lsl#16
394 ldr $t2,[$key,#8]
395 eor $s3,$t3,$s3,lsl#24
396 ldr $t3,[$key,#12]
397
398 eor $s0,$s0,$i1
399 eor $s1,$s1,$t1
400 eor $s2,$s2,$t2
401 eor $s3,$s3,$t3
402
403 sub $tbl,$tbl,#2
404 ldr pc,[sp],#4 @ pop and return
405.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
406
407.global aes_set_encrypt_key_internal
408.type aes_set_encrypt_key_internal,%function
409.align 5
410aes_set_encrypt_key_internal:
411_armv4_AES_set_encrypt_key:
412 sub r3,pc,#8 @ aes_set_encrypt_key_internal
413 teq r0,#0
414 moveq r0,#-1
415 beq .Labrt
416 teq r2,#0
417 moveq r0,#-1
418 beq .Labrt
419
420 teq r1,#128
421 beq .Lok
422 teq r1,#192
423 beq .Lok
424 teq r1,#256
425 movne r0,#-1
426 bne .Labrt
427
428.Lok: stmdb sp!,{r4-r12,lr}
429 sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
430
431 mov $rounds,r0 @ inp
432 mov lr,r1 @ bits
433 mov $key,r2 @ key
434
435#if __ARM_ARCH__<7 || defined(__STRICT_ALIGNMENT)
436 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
437 ldrb $t1,[$rounds,#2] @ manner...
438 ldrb $t2,[$rounds,#1]
439 ldrb $t3,[$rounds,#0]
440 orr $s0,$s0,$t1,lsl#8
441 ldrb $s1,[$rounds,#7]
442 orr $s0,$s0,$t2,lsl#16
443 ldrb $t1,[$rounds,#6]
444 orr $s0,$s0,$t3,lsl#24
445 ldrb $t2,[$rounds,#5]
446 ldrb $t3,[$rounds,#4]
447 orr $s1,$s1,$t1,lsl#8
448 ldrb $s2,[$rounds,#11]
449 orr $s1,$s1,$t2,lsl#16
450 ldrb $t1,[$rounds,#10]
451 orr $s1,$s1,$t3,lsl#24
452 ldrb $t2,[$rounds,#9]
453 ldrb $t3,[$rounds,#8]
454 orr $s2,$s2,$t1,lsl#8
455 ldrb $s3,[$rounds,#15]
456 orr $s2,$s2,$t2,lsl#16
457 ldrb $t1,[$rounds,#14]
458 orr $s2,$s2,$t3,lsl#24
459 ldrb $t2,[$rounds,#13]
460 ldrb $t3,[$rounds,#12]
461 orr $s3,$s3,$t1,lsl#8
462 str $s0,[$key],#16
463 orr $s3,$s3,$t2,lsl#16
464 str $s1,[$key,#-12]
465 orr $s3,$s3,$t3,lsl#24
466 str $s2,[$key,#-8]
467 str $s3,[$key,#-4]
468#else
469 ldr $s0,[$rounds,#0]
470 ldr $s1,[$rounds,#4]
471 ldr $s2,[$rounds,#8]
472 ldr $s3,[$rounds,#12]
473#ifdef __ARMEL__
474 rev $s0,$s0
475 rev $s1,$s1
476 rev $s2,$s2
477 rev $s3,$s3
478#endif
479 str $s0,[$key],#16
480 str $s1,[$key,#-12]
481 str $s2,[$key,#-8]
482 str $s3,[$key,#-4]
483#endif
484
485 teq lr,#128
486 bne .Lnot128
487 mov $rounds,#10
488 str $rounds,[$key,#240-16]
489 add $t3,$tbl,#256 @ rcon
490 mov lr,#255
491
492.L128_loop:
493 and $t2,lr,$s3,lsr#24
494 and $i1,lr,$s3,lsr#16
495 ldrb $t2,[$tbl,$t2]
496 and $i2,lr,$s3,lsr#8
497 ldrb $i1,[$tbl,$i1]
498 and $i3,lr,$s3
499 ldrb $i2,[$tbl,$i2]
500 orr $t2,$t2,$i1,lsl#24
501 ldrb $i3,[$tbl,$i3]
502 orr $t2,$t2,$i2,lsl#16
503 ldr $t1,[$t3],#4 @ rcon[i++]
504 orr $t2,$t2,$i3,lsl#8
505 eor $t2,$t2,$t1
506 eor $s0,$s0,$t2 @ rk[4]=rk[0]^...
507 eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4]
508 str $s0,[$key],#16
509 eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
510 str $s1,[$key,#-12]
511 eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
512 str $s2,[$key,#-8]
513 subs $rounds,$rounds,#1
514 str $s3,[$key,#-4]
515 bne .L128_loop
516 sub r2,$key,#176
517 b .Ldone
518
519.Lnot128:
520#if __ARM_ARCH__<7 || defined(__STRICT_ALIGNMENT)
521 ldrb $i2,[$rounds,#19]
522 ldrb $t1,[$rounds,#18]
523 ldrb $t2,[$rounds,#17]
524 ldrb $t3,[$rounds,#16]
525 orr $i2,$i2,$t1,lsl#8
526 ldrb $i3,[$rounds,#23]
527 orr $i2,$i2,$t2,lsl#16
528 ldrb $t1,[$rounds,#22]
529 orr $i2,$i2,$t3,lsl#24
530 ldrb $t2,[$rounds,#21]
531 ldrb $t3,[$rounds,#20]
532 orr $i3,$i3,$t1,lsl#8
533 orr $i3,$i3,$t2,lsl#16
534 str $i2,[$key],#8
535 orr $i3,$i3,$t3,lsl#24
536 str $i3,[$key,#-4]
537#else
538 ldr $i2,[$rounds,#16]
539 ldr $i3,[$rounds,#20]
540#ifdef __ARMEL__
541 rev $i2,$i2
542 rev $i3,$i3
543#endif
544 str $i2,[$key],#8
545 str $i3,[$key,#-4]
546#endif
547
548 teq lr,#192
549 bne .Lnot192
550 mov $rounds,#12
551 str $rounds,[$key,#240-24]
552 add $t3,$tbl,#256 @ rcon
553 mov lr,#255
554 mov $rounds,#8
555
556.L192_loop:
557 and $t2,lr,$i3,lsr#24
558 and $i1,lr,$i3,lsr#16
559 ldrb $t2,[$tbl,$t2]
560 and $i2,lr,$i3,lsr#8
561 ldrb $i1,[$tbl,$i1]
562 and $i3,lr,$i3
563 ldrb $i2,[$tbl,$i2]
564 orr $t2,$t2,$i1,lsl#24
565 ldrb $i3,[$tbl,$i3]
566 orr $t2,$t2,$i2,lsl#16
567 ldr $t1,[$t3],#4 @ rcon[i++]
568 orr $t2,$t2,$i3,lsl#8
569 eor $i3,$t2,$t1
570 eor $s0,$s0,$i3 @ rk[6]=rk[0]^...
571 eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6]
572 str $s0,[$key],#24
573 eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
574 str $s1,[$key,#-20]
575 eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
576 str $s2,[$key,#-16]
577 subs $rounds,$rounds,#1
578 str $s3,[$key,#-12]
579 subeq r2,$key,#216
580 beq .Ldone
581
582 ldr $i1,[$key,#-32]
583 ldr $i2,[$key,#-28]
584 eor $i1,$i1,$s3 @ rk[10]=rk[4]^rk[9]
585 eor $i3,$i2,$i1 @ rk[11]=rk[5]^rk[10]
586 str $i1,[$key,#-8]
587 str $i3,[$key,#-4]
588 b .L192_loop
589
590.Lnot192:
591#if __ARM_ARCH__<7 || defined(__STRICT_ALIGNMENT)
592 ldrb $i2,[$rounds,#27]
593 ldrb $t1,[$rounds,#26]
594 ldrb $t2,[$rounds,#25]
595 ldrb $t3,[$rounds,#24]
596 orr $i2,$i2,$t1,lsl#8
597 ldrb $i3,[$rounds,#31]
598 orr $i2,$i2,$t2,lsl#16
599 ldrb $t1,[$rounds,#30]
600 orr $i2,$i2,$t3,lsl#24
601 ldrb $t2,[$rounds,#29]
602 ldrb $t3,[$rounds,#28]
603 orr $i3,$i3,$t1,lsl#8
604 orr $i3,$i3,$t2,lsl#16
605 str $i2,[$key],#8
606 orr $i3,$i3,$t3,lsl#24
607 str $i3,[$key,#-4]
608#else
609 ldr $i2,[$rounds,#24]
610 ldr $i3,[$rounds,#28]
611#ifdef __ARMEL__
612 rev $i2,$i2
613 rev $i3,$i3
614#endif
615 str $i2,[$key],#8
616 str $i3,[$key,#-4]
617#endif
618
619 mov $rounds,#14
620 str $rounds,[$key,#240-32]
621 add $t3,$tbl,#256 @ rcon
622 mov lr,#255
623 mov $rounds,#7
624
625.L256_loop:
626 and $t2,lr,$i3,lsr#24
627 and $i1,lr,$i3,lsr#16
628 ldrb $t2,[$tbl,$t2]
629 and $i2,lr,$i3,lsr#8
630 ldrb $i1,[$tbl,$i1]
631 and $i3,lr,$i3
632 ldrb $i2,[$tbl,$i2]
633 orr $t2,$t2,$i1,lsl#24
634 ldrb $i3,[$tbl,$i3]
635 orr $t2,$t2,$i2,lsl#16
636 ldr $t1,[$t3],#4 @ rcon[i++]
637 orr $t2,$t2,$i3,lsl#8
638 eor $i3,$t2,$t1
639 eor $s0,$s0,$i3 @ rk[8]=rk[0]^...
640 eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8]
641 str $s0,[$key],#32
642 eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
643 str $s1,[$key,#-28]
644 eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
645 str $s2,[$key,#-24]
646 subs $rounds,$rounds,#1
647 str $s3,[$key,#-20]
648 subeq r2,$key,#256
649 beq .Ldone
650
651 and $t2,lr,$s3
652 and $i1,lr,$s3,lsr#8
653 ldrb $t2,[$tbl,$t2]
654 and $i2,lr,$s3,lsr#16
655 ldrb $i1,[$tbl,$i1]
656 and $i3,lr,$s3,lsr#24
657 ldrb $i2,[$tbl,$i2]
658 orr $t2,$t2,$i1,lsl#8
659 ldrb $i3,[$tbl,$i3]
660 orr $t2,$t2,$i2,lsl#16
661 ldr $t1,[$key,#-48]
662 orr $t2,$t2,$i3,lsl#24
663
664 ldr $i1,[$key,#-44]
665 ldr $i2,[$key,#-40]
666 eor $t1,$t1,$t2 @ rk[12]=rk[4]^...
667 ldr $i3,[$key,#-36]
668 eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12]
669 str $t1,[$key,#-16]
670 eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
671 str $i1,[$key,#-12]
672 eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
673 str $i2,[$key,#-8]
674 str $i3,[$key,#-4]
675 b .L256_loop
676
677.Ldone: mov r0,#0
678 ldmia sp!,{r4-r12,lr}
679.Labrt: tst lr,#1
680 moveq pc,lr @ be binary compatible with V4, yet
681 bx lr @ interoperable with Thumb ISA:-)
682.size aes_set_encrypt_key_internal,.-aes_set_encrypt_key_internal
683
684.global aes_set_decrypt_key_internal
685.type aes_set_decrypt_key_internal,%function
686.align 5
687aes_set_decrypt_key_internal:
688 str lr,[sp,#-4]! @ push lr
689 bl _armv4_AES_set_encrypt_key
690 teq r0,#0
691 ldrne lr,[sp],#4 @ pop lr
692 bne .Labrt
693
694 stmdb sp!,{r4-r12}
695
696 ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
697 mov $key,r2 @ which is AES_KEY *key
698 mov $i1,r2
699 add $i2,r2,$rounds,lsl#4
700
701.Linv: ldr $s0,[$i1]
702 ldr $s1,[$i1,#4]
703 ldr $s2,[$i1,#8]
704 ldr $s3,[$i1,#12]
705 ldr $t1,[$i2]
706 ldr $t2,[$i2,#4]
707 ldr $t3,[$i2,#8]
708 ldr $i3,[$i2,#12]
709 str $s0,[$i2],#-16
710 str $s1,[$i2,#16+4]
711 str $s2,[$i2,#16+8]
712 str $s3,[$i2,#16+12]
713 str $t1,[$i1],#16
714 str $t2,[$i1,#-12]
715 str $t3,[$i1,#-8]
716 str $i3,[$i1,#-4]
717 teq $i1,$i2
718 bne .Linv
719___
720$mask80=$i1;
721$mask1b=$i2;
722$mask7f=$i3;
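# [Editorial note] the .Lmix loop emitted below is the same packed GF(2^8)
# doubling used by deckey() in aes-586.pl: $mask80 selects each byte's high
# bit, $mask7f clears it before the left shift, and $mask1b supplies the
# 0x1b reduction term, so tp2/tp4/tp8 are computed four bytes per register.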
723$code.=<<___;
724 ldr $s0,[$key,#16]! @ prefetch tp1
725 mov $mask80,#0x80
726 mov $mask1b,#0x1b
727 orr $mask80,$mask80,#0x8000
728 orr $mask1b,$mask1b,#0x1b00
729 orr $mask80,$mask80,$mask80,lsl#16
730 orr $mask1b,$mask1b,$mask1b,lsl#16
731 sub $rounds,$rounds,#1
732 mvn $mask7f,$mask80
733 mov $rounds,$rounds,lsl#2 @ (rounds-1)*4
734
735.Lmix: and $t1,$s0,$mask80
736 and $s1,$s0,$mask7f
737 sub $t1,$t1,$t1,lsr#7
738 and $t1,$t1,$mask1b
739 eor $s1,$t1,$s1,lsl#1 @ tp2
740
741 and $t1,$s1,$mask80
742 and $s2,$s1,$mask7f
743 sub $t1,$t1,$t1,lsr#7
744 and $t1,$t1,$mask1b
745 eor $s2,$t1,$s2,lsl#1 @ tp4
746
747 and $t1,$s2,$mask80
748 and $s3,$s2,$mask7f
749 sub $t1,$t1,$t1,lsr#7
750 and $t1,$t1,$mask1b
751 eor $s3,$t1,$s3,lsl#1 @ tp8
752
753 eor $t1,$s1,$s2
754 eor $t2,$s0,$s3 @ tp9
755 eor $t1,$t1,$s3 @ tpe
756 eor $t1,$t1,$s1,ror#24
757 eor $t1,$t1,$t2,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8)
758 eor $t1,$t1,$s2,ror#16
759 eor $t1,$t1,$t2,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16)
760 eor $t1,$t1,$t2,ror#8 @ ^= ROTATE(tp9,24)
761
762 ldr $s0,[$key,#4] @ prefetch tp1
763 str $t1,[$key],#4
764 subs $rounds,$rounds,#1
765 bne .Lmix
766
767 mov r0,#0
768#if __ARM_ARCH__>=5
769 ldmia sp!,{r4-r12,pc}
770#else
771 ldmia sp!,{r4-r12,lr}
772 tst lr,#1
773 moveq pc,lr @ be binary compatible with V4, yet
774 bx lr @ interoperable with Thumb ISA:-)
775#endif
776.size aes_set_decrypt_key_internal,.-aes_set_decrypt_key_internal
777
778.type AES_Td,%object
779.align 5
780AES_Td:
781.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
782.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
783.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
784.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
785.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
786.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
787.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
788.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
789.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
790.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
791.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
792.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
793.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
794.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
795.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
796.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
797.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
798.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
799.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
800.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
801.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
802.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
803.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
804.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
805.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
806.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
807.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
808.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
809.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
810.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
811.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
812.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
813.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
814.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
815.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
816.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
817.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
818.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
819.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
820.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
821.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
822.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
823.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
824.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
825.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
826.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
827.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
828.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
829.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
830.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
831.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
832.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
833.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
834.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
835.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
836.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
837.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
838.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
839.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
840.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
841.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
842.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
843.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
844.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
845@ Td4[256]
846.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
847.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
848.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
849.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
850.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
851.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
852.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
853.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
854.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
855.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
856.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
857.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
858.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
859.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
860.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
861.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
862.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
863.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
864.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
865.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
866.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
867.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
868.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
869.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
870.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
871.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
872.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
873.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
874.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
875.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
876.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
877.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
878.size AES_Td,.-AES_Td
879
880@ void aes_decrypt_internal(const unsigned char *in, unsigned char *out,
881@ const AES_KEY *key) {
882.global aes_decrypt_internal
883.type aes_decrypt_internal,%function
884.align 5
885aes_decrypt_internal:
886 sub r3,pc,#8 @ aes_decrypt_internal
887 stmdb sp!,{r1,r4-r12,lr}
888 mov $rounds,r0 @ inp
889 mov $key,r2
890 sub $tbl,r3,#aes_decrypt_internal-AES_Td @ Td
891#if __ARM_ARCH__<7 || defined(__STRICT_ALIGNMENT)
892 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
893 ldrb $t1,[$rounds,#2] @ manner...
894 ldrb $t2,[$rounds,#1]
895 ldrb $t3,[$rounds,#0]
896 orr $s0,$s0,$t1,lsl#8
897 ldrb $s1,[$rounds,#7]
898 orr $s0,$s0,$t2,lsl#16
899 ldrb $t1,[$rounds,#6]
900 orr $s0,$s0,$t3,lsl#24
901 ldrb $t2,[$rounds,#5]
902 ldrb $t3,[$rounds,#4]
903 orr $s1,$s1,$t1,lsl#8
904 ldrb $s2,[$rounds,#11]
905 orr $s1,$s1,$t2,lsl#16
906 ldrb $t1,[$rounds,#10]
907 orr $s1,$s1,$t3,lsl#24
908 ldrb $t2,[$rounds,#9]
909 ldrb $t3,[$rounds,#8]
910 orr $s2,$s2,$t1,lsl#8
911 ldrb $s3,[$rounds,#15]
912 orr $s2,$s2,$t2,lsl#16
913 ldrb $t1,[$rounds,#14]
914 orr $s2,$s2,$t3,lsl#24
915 ldrb $t2,[$rounds,#13]
916 ldrb $t3,[$rounds,#12]
917 orr $s3,$s3,$t1,lsl#8
918 orr $s3,$s3,$t2,lsl#16
919 orr $s3,$s3,$t3,lsl#24
920#else
921 ldr $s0,[$rounds,#0]
922 ldr $s1,[$rounds,#4]
923 ldr $s2,[$rounds,#8]
924 ldr $s3,[$rounds,#12]
925#ifdef __ARMEL__
926 rev $s0,$s0
927 rev $s1,$s1
928 rev $s2,$s2
929 rev $s3,$s3
930#endif
931#endif
932 bl _armv4_AES_decrypt
933
934 ldr $rounds,[sp],#4 @ pop out
935#if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
936#ifdef __ARMEL__
937 rev $s0,$s0
938 rev $s1,$s1
939 rev $s2,$s2
940 rev $s3,$s3
941#endif
942 str $s0,[$rounds,#0]
943 str $s1,[$rounds,#4]
944 str $s2,[$rounds,#8]
945 str $s3,[$rounds,#12]
946#else
947 mov $t1,$s0,lsr#24 @ write output in endian-neutral
948 mov $t2,$s0,lsr#16 @ manner...
949 mov $t3,$s0,lsr#8
950 strb $t1,[$rounds,#0]
951 strb $t2,[$rounds,#1]
952 mov $t1,$s1,lsr#24
953 strb $t3,[$rounds,#2]
954 mov $t2,$s1,lsr#16
955 strb $s0,[$rounds,#3]
956 mov $t3,$s1,lsr#8
957 strb $t1,[$rounds,#4]
958 strb $t2,[$rounds,#5]
959 mov $t1,$s2,lsr#24
960 strb $t3,[$rounds,#6]
961 mov $t2,$s2,lsr#16
962 strb $s1,[$rounds,#7]
963 mov $t3,$s2,lsr#8
964 strb $t1,[$rounds,#8]
965 strb $t2,[$rounds,#9]
966 mov $t1,$s3,lsr#24
967 strb $t3,[$rounds,#10]
968 mov $t2,$s3,lsr#16
969 strb $s2,[$rounds,#11]
970 mov $t3,$s3,lsr#8
971 strb $t1,[$rounds,#12]
972 strb $t2,[$rounds,#13]
973 strb $t3,[$rounds,#14]
974 strb $s3,[$rounds,#15]
975#endif
976#if __ARM_ARCH__>=5
977 ldmia sp!,{r4-r12,pc}
978#else
979 ldmia sp!,{r4-r12,lr}
980 tst lr,#1
981 moveq pc,lr @ be binary compatible with V4, yet
982 bx lr @ interoperable with Thumb ISA:-)
983#endif
984.size aes_decrypt_internal,.-aes_decrypt_internal
985
986.type _armv4_AES_decrypt,%function
987.align 2
988_armv4_AES_decrypt:
989 str lr,[sp,#-4]! @ push lr
990 ldmia $key!,{$t1-$i1}
991 eor $s0,$s0,$t1
992 ldr $rounds,[$key,#240-16]
993 eor $s1,$s1,$t2
994 eor $s2,$s2,$t3
995 eor $s3,$s3,$i1
996 sub $rounds,$rounds,#1
997 mov lr,#255
998
999 and $i1,lr,$s0,lsr#16
1000 and $i2,lr,$s0,lsr#8
1001 and $i3,lr,$s0
1002 mov $s0,$s0,lsr#24
1003.Ldec_loop:
1004 ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16]
1005 and $i1,lr,$s1 @ i0
1006 ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
1007 and $i2,lr,$s1,lsr#16
1008 ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
1009 and $i3,lr,$s1,lsr#8
1010 ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
1011 mov $s1,$s1,lsr#24
1012
1013 ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0]
1014 ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16]
1015 ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8]
1016 eor $s0,$s0,$i1,ror#24
1017 ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
1018 and $i1,lr,$s2,lsr#8 @ i0
1019 eor $t2,$i2,$t2,ror#8
1020 and $i2,lr,$s2 @ i1
1021 eor $t3,$i3,$t3,ror#8
1022 and $i3,lr,$s2,lsr#16
1023 ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
1024 eor $s1,$s1,$t1,ror#8
1025 ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
1026 mov $s2,$s2,lsr#24
1027
1028 ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
1029 eor $s0,$s0,$i1,ror#16
1030 ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
1031 and $i1,lr,$s3,lsr#16 @ i0
1032 eor $s1,$s1,$i2,ror#24
1033 and $i2,lr,$s3,lsr#8 @ i1
1034 eor $t3,$i3,$t3,ror#8
1035 and $i3,lr,$s3 @ i2
1036 ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
1037 eor $s2,$s2,$t2,ror#8
1038 ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
1039 mov $s3,$s3,lsr#24
1040
1041 ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
1042 eor $s0,$s0,$i1,ror#8
1043 ldr $i1,[$key],#16
1044 eor $s1,$s1,$i2,ror#16
1045 ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
1046 eor $s2,$s2,$i3,ror#24
1047
1048 ldr $t1,[$key,#-12]
1049 eor $s0,$s0,$i1
1050 ldr $t2,[$key,#-8]
1051 eor $s3,$s3,$t3,ror#8
1052 ldr $t3,[$key,#-4]
1053 and $i1,lr,$s0,lsr#16
1054 eor $s1,$s1,$t1
1055 and $i2,lr,$s0,lsr#8
1056 eor $s2,$s2,$t2
1057 and $i3,lr,$s0
1058 eor $s3,$s3,$t3
1059 mov $s0,$s0,lsr#24
1060
1061 subs $rounds,$rounds,#1
1062 bne .Ldec_loop
1063
1064 add $tbl,$tbl,#1024
1065
1066 ldr $t2,[$tbl,#0] @ prefetch Td4
1067 ldr $t3,[$tbl,#32]
1068 ldr $t1,[$tbl,#64]
1069 ldr $t2,[$tbl,#96]
1070 ldr $t3,[$tbl,#128]
1071 ldr $t1,[$tbl,#160]
1072 ldr $t2,[$tbl,#192]
1073 ldr $t3,[$tbl,#224]
1074
1075 ldrb $s0,[$tbl,$s0] @ Td4[s0>>24]
1076 ldrb $t1,[$tbl,$i1] @ Td4[s0>>16]
1077 and $i1,lr,$s1 @ i0
1078 ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
1079 and $i2,lr,$s1,lsr#16
1080 ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
1081 and $i3,lr,$s1,lsr#8
1082
1083 ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
1084 ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
1085 ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
1086 eor $s0,$i1,$s0,lsl#24
1087 ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
1088 eor $s1,$t1,$s1,lsl#8
1089 and $i1,lr,$s2,lsr#8 @ i0
1090 eor $t2,$t2,$i2,lsl#8
1091 and $i2,lr,$s2 @ i1
1092 ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
1093 eor $t3,$t3,$i3,lsl#8
1094 ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
1095 and $i3,lr,$s2,lsr#16
1096
1097 ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
1098 eor $s0,$s0,$i1,lsl#8
1099 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
1100 eor $s1,$i2,$s1,lsl#16
1101 and $i1,lr,$s3,lsr#16 @ i0
1102 eor $s2,$t2,$s2,lsl#16
1103 and $i2,lr,$s3,lsr#8 @ i1
1104 ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
1105 eor $t3,$t3,$i3,lsl#16
1106 ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
1107 and $i3,lr,$s3 @ i2
1108
1109 ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
1110 ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
1111 eor $s0,$s0,$i1,lsl#16
1112 ldr $i1,[$key,#0]
1113 eor $s1,$s1,$i2,lsl#8
1114 ldr $t1,[$key,#4]
1115 eor $s2,$i3,$s2,lsl#8
1116 ldr $t2,[$key,#8]
1117 eor $s3,$t3,$s3,lsl#24
1118 ldr $t3,[$key,#12]
1119
1120 eor $s0,$s0,$i1
1121 eor $s1,$s1,$t1
1122 eor $s2,$s2,$t2
1123 eor $s3,$s3,$t3
1124
1125 sub $tbl,$tbl,#1024
1126 ldr pc,[sp],#4 @ pop and return
1127.size _armv4_AES_decrypt,.-_armv4_AES_decrypt
1128.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
1129.align 2
1130___
1131
1132$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
1133print $code;
1134close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/aes/asm/aes-mips.pl b/src/lib/libcrypto/aes/asm/aes-mips.pl
deleted file mode 100644
index 9a5df878f5..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-mips.pl
+++ /dev/null
@@ -1,1613 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for MIPS
11
12# October 2010
13#
14# Code uses a 1K[+256B] S-box and on a single-issue core [such as R5000]
15# spends ~68 cycles per byte processed with a 128-bit key. This is ~16%
16# faster than gcc-generated code, which is not very impressive. But
17# recall that the compressed S-box requires extra processing, namely
18# additional rotations. Rotations are implemented with lwl/lwr pairs,
19# which are normally used for loading unaligned data. Another cool
20# thing about this module is its endian neutrality, which means that
21# it processes data without ever changing byte order...
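For orientation, here is the trick the comment above describes, as a minimal C sketch rather than this module's code: with a single 1KB Te0 table, the three remaining lookups of a classic four-table round are recovered by rotating the looked-up word (done with lwl/lwr pairs here, and with ror operands in the ARM module), so only 1K plus the 256-byte Te4 byte table is needed. Te0 is truncated and the helper names are illustrative.

#include <stdint.h>

/* Illustrative, truncated: the real Te0 has 256 words (1KB), packing
 * (2*S[x], S[x], S[x], 3*S[x]) from most to least significant byte. */
static const uint32_t Te0[256] = { 0xc66363a5, 0xf87c7c84 /* ... */ };

static uint32_t
rotr32(uint32_t v, unsigned n)
{
	return (v >> n) | (v << (32 - n));	/* n is 8, 16 or 24 here */
}

/* One output column of an inner round: the Te1/Te2/Te3 lookups are
 * replaced by byte rotations of the single Te0 table. */
static uint32_t
round_column(uint32_t s0, uint32_t s1, uint32_t s2, uint32_t s3,
    uint32_t rk)
{
	return Te0[s0 >> 24] ^
	    rotr32(Te0[(s1 >> 16) & 0xff], 8) ^
	    rotr32(Te0[(s2 >> 8) & 0xff], 16) ^
	    rotr32(Te0[s3 & 0xff], 24) ^
	    rk;
}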
22
23######################################################################
24# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
25# widely used. Then there is a new contender: NUBI. It appears that if
26# one picks the latter, it's possible to arrange code in an ABI-neutral
27# manner. Therefore let's stick to the NUBI register layout:
28#
29($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
30($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
31($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
32($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
33#
34# The return value is placed in $a0. The following coding rules
35# facilitate interoperability:
36#
37# - never ever touch $tp, "thread pointer", former $gp;
38# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
39# old code];
40# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
41#
42# For reference here is register layout for N32/64 MIPS ABIs:
43#
44# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
45# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
46# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
47# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
48# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
49#
50$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
51
52if ($flavour =~ /64/i) {
53 $LA="dla";
54} else {
55 $LA="la";
56}
57
58if ($flavour =~ /64|n32/i) {
59 $PTR_ADD="dadd"; # incidentally works even on n32
60 $PTR_SUB="dsub"; # incidentally works even on n32
61 $REG_S="sd";
62 $REG_L="ld";
63 $PTR_SLL="dsll"; # incidentally works even on n32
64 $SZREG=8;
65} else {
66 $PTR_ADD="add";
67 $PTR_SUB="sub";
68 $REG_S="sw";
69 $REG_L="lw";
70 $PTR_SLL="sll";
71 $SZREG=4;
72}
73$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
74#
75# <appro@openssl.org>
76#
77######################################################################
78
79$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0 if ($ENV{CC});
80
81for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
82open STDOUT,">$output";
83
84if (!defined($big_endian))
85{ $big_endian=(unpack('L',pack('N',1))==1); }
86
89
90my ($MSB,$LSB)=(0,3); # automatically converted to little-endian
91
92$code.=<<___;
93.text
94#if !defined(__vxworks) || defined(__pic__)
95.option pic2
96#endif
97.set noat
98___
99
100{{{
101my $FRAMESIZE=16*$SZREG;
102my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
103
104my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
105my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
106my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23));
107my ($key0,$cnt)=($gp,$fp);
108
109# instruction ordering is "stolen" from the output of the MIPSpro
110# assembler invoked with -mips3 -O3 arguments...
111$code.=<<___;
112.align 5
113.ent _mips_AES_encrypt
114_mips_AES_encrypt:
115 .frame $sp,0,$ra
116 .set reorder
117 lw $t0,0($key)
118 lw $t1,4($key)
119 lw $t2,8($key)
120 lw $t3,12($key)
121 lw $cnt,240($key)
122 $PTR_ADD $key0,$key,16
123
124 xor $s0,$t0
125 xor $s1,$t1
126 xor $s2,$t2
127 xor $s3,$t3
128
129 sub $cnt,1
130 _xtr $i0,$s1,16-2
131.Loop_enc:
132 _xtr $i1,$s2,16-2
133 _xtr $i2,$s3,16-2
134 _xtr $i3,$s0,16-2
135 and $i0,0x3fc
136 and $i1,0x3fc
137 and $i2,0x3fc
138 and $i3,0x3fc
139 $PTR_ADD $i0,$Tbl
140 $PTR_ADD $i1,$Tbl
141 $PTR_ADD $i2,$Tbl
142 $PTR_ADD $i3,$Tbl
143 lwl $t0,3($i0) # Te1[s1>>16]
144 lwl $t1,3($i1) # Te1[s2>>16]
145 lwl $t2,3($i2) # Te1[s3>>16]
146 lwl $t3,3($i3) # Te1[s0>>16]
147 lwr $t0,2($i0) # Te1[s1>>16]
148 lwr $t1,2($i1) # Te1[s2>>16]
149 lwr $t2,2($i2) # Te1[s3>>16]
150 lwr $t3,2($i3) # Te1[s0>>16]
151
152 _xtr $i0,$s2,8-2
153 _xtr $i1,$s3,8-2
154 _xtr $i2,$s0,8-2
155 _xtr $i3,$s1,8-2
156 and $i0,0x3fc
157 and $i1,0x3fc
158 and $i2,0x3fc
159 and $i3,0x3fc
160 $PTR_ADD $i0,$Tbl
161 $PTR_ADD $i1,$Tbl
162 $PTR_ADD $i2,$Tbl
163 $PTR_ADD $i3,$Tbl
164 lwl $t4,2($i0) # Te2[s2>>8]
165 lwl $t5,2($i1) # Te2[s3>>8]
166 lwl $t6,2($i2) # Te2[s0>>8]
167 lwl $t7,2($i3) # Te2[s1>>8]
168 lwr $t4,1($i0) # Te2[s2>>8]
169 lwr $t5,1($i1) # Te2[s3>>8]
170 lwr $t6,1($i2) # Te2[s0>>8]
171 lwr $t7,1($i3) # Te2[s1>>8]
172
173 _xtr $i0,$s3,0-2
174 _xtr $i1,$s0,0-2
175 _xtr $i2,$s1,0-2
176 _xtr $i3,$s2,0-2
177 and $i0,0x3fc
178 and $i1,0x3fc
179 and $i2,0x3fc
180 and $i3,0x3fc
181 $PTR_ADD $i0,$Tbl
182 $PTR_ADD $i1,$Tbl
183 $PTR_ADD $i2,$Tbl
184 $PTR_ADD $i3,$Tbl
185 lwl $t8,1($i0) # Te3[s3]
186 lwl $t9,1($i1) # Te3[s0]
187 lwl $t10,1($i2) # Te3[s1]
188 lwl $t11,1($i3) # Te3[s2]
189 lwr $t8,0($i0) # Te3[s3]
190 lwr $t9,0($i1) # Te3[s0]
191 lwr $t10,0($i2) # Te3[s1]
192 lwr $t11,0($i3) # Te3[s2]
193
194 _xtr $i0,$s0,24-2
195 _xtr $i1,$s1,24-2
196 _xtr $i2,$s2,24-2
197 _xtr $i3,$s3,24-2
198 and $i0,0x3fc
199 and $i1,0x3fc
200 and $i2,0x3fc
201 and $i3,0x3fc
202 $PTR_ADD $i0,$Tbl
203 $PTR_ADD $i1,$Tbl
204 $PTR_ADD $i2,$Tbl
205 $PTR_ADD $i3,$Tbl
206 xor $t0,$t4
207 xor $t1,$t5
208 xor $t2,$t6
209 xor $t3,$t7
210 lw $t4,0($i0) # Te0[s0>>24]
211 lw $t5,0($i1) # Te0[s1>>24]
212 lw $t6,0($i2) # Te0[s2>>24]
213 lw $t7,0($i3) # Te0[s3>>24]
214
215 lw $s0,0($key0)
216 lw $s1,4($key0)
217 lw $s2,8($key0)
218 lw $s3,12($key0)
219
220 xor $t0,$t8
221 xor $t1,$t9
222 xor $t2,$t10
223 xor $t3,$t11
224
225 xor $t0,$t4
226 xor $t1,$t5
227 xor $t2,$t6
228 xor $t3,$t7
229
230 sub $cnt,1
231 $PTR_ADD $key0,16
232 xor $s0,$t0
233 xor $s1,$t1
234 xor $s2,$t2
235 xor $s3,$t3
236 .set noreorder
237 bnez $cnt,.Loop_enc
238 _xtr $i0,$s1,16-2
239
240 .set reorder
241 _xtr $i1,$s2,16-2
242 _xtr $i2,$s3,16-2
243 _xtr $i3,$s0,16-2
244 and $i0,0x3fc
245 and $i1,0x3fc
246 and $i2,0x3fc
247 and $i3,0x3fc
248 $PTR_ADD $i0,$Tbl
249 $PTR_ADD $i1,$Tbl
250 $PTR_ADD $i2,$Tbl
251 $PTR_ADD $i3,$Tbl
252 lbu $t0,2($i0) # Te4[s1>>16]
253 lbu $t1,2($i1) # Te4[s2>>16]
254 lbu $t2,2($i2) # Te4[s3>>16]
255 lbu $t3,2($i3) # Te4[s0>>16]
256
257 _xtr $i0,$s2,8-2
258 _xtr $i1,$s3,8-2
259 _xtr $i2,$s0,8-2
260 _xtr $i3,$s1,8-2
261 and $i0,0x3fc
262 and $i1,0x3fc
263 and $i2,0x3fc
264 and $i3,0x3fc
265 $PTR_ADD $i0,$Tbl
266 $PTR_ADD $i1,$Tbl
267 $PTR_ADD $i2,$Tbl
268 $PTR_ADD $i3,$Tbl
269 lbu $t4,2($i0) # Te4[s2>>8]
270 lbu $t5,2($i1) # Te4[s3>>8]
271 lbu $t6,2($i2) # Te4[s0>>8]
272 lbu $t7,2($i3) # Te4[s1>>8]
273
274 _xtr $i0,$s0,24-2
275 _xtr $i1,$s1,24-2
276 _xtr $i2,$s2,24-2
277 _xtr $i3,$s3,24-2
278 and $i0,0x3fc
279 and $i1,0x3fc
280 and $i2,0x3fc
281 and $i3,0x3fc
282 $PTR_ADD $i0,$Tbl
283 $PTR_ADD $i1,$Tbl
284 $PTR_ADD $i2,$Tbl
285 $PTR_ADD $i3,$Tbl
286 lbu $t8,2($i0) # Te4[s0>>24]
287 lbu $t9,2($i1) # Te4[s1>>24]
288 lbu $t10,2($i2) # Te4[s2>>24]
289 lbu $t11,2($i3) # Te4[s3>>24]
290
291 _xtr $i0,$s3,0-2
292 _xtr $i1,$s0,0-2
293 _xtr $i2,$s1,0-2
294 _xtr $i3,$s2,0-2
295 and $i0,0x3fc
296 and $i1,0x3fc
297 and $i2,0x3fc
298 and $i3,0x3fc
299
300 _ins $t0,16
301 _ins $t1,16
302 _ins $t2,16
303 _ins $t3,16
304
305 _ins $t4,8
306 _ins $t5,8
307 _ins $t6,8
308 _ins $t7,8
309
310 xor $t0,$t4
311 xor $t1,$t5
312 xor $t2,$t6
313 xor $t3,$t7
314
315 $PTR_ADD $i0,$Tbl
316 $PTR_ADD $i1,$Tbl
317 $PTR_ADD $i2,$Tbl
318 $PTR_ADD $i3,$Tbl
319 lbu $t4,2($i0) # Te4[s3]
320 lbu $t5,2($i1) # Te4[s0]
321 lbu $t6,2($i2) # Te4[s1]
322 lbu $t7,2($i3) # Te4[s2]
323
324 _ins $t8,24
325 _ins $t9,24
326 _ins $t10,24
327 _ins $t11,24
328
329 lw $s0,0($key0)
330 lw $s1,4($key0)
331 lw $s2,8($key0)
332 lw $s3,12($key0)
333
334 xor $t0,$t8
335 xor $t1,$t9
336 xor $t2,$t10
337 xor $t3,$t11
338
339 _ins $t4,0
340 _ins $t5,0
341 _ins $t6,0
342 _ins $t7,0
343
344 xor $t0,$t4
345 xor $t1,$t5
346 xor $t2,$t6
347 xor $t3,$t7
348
349 xor $s0,$t0
350 xor $s1,$t1
351 xor $s2,$t2
352 xor $s3,$t3
353
354 jr $ra
355.end _mips_AES_encrypt
356
357.align 5
358.globl aes_encrypt_internal
359.ent aes_encrypt_internal
360aes_encrypt_internal:
361 .frame $sp,$FRAMESIZE,$ra
362 .mask $SAVED_REGS_MASK,-$SZREG
363 .set noreorder
364___
365$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
366 .cpload $pf
367___
368$code.=<<___;
369 $PTR_SUB $sp,$FRAMESIZE
370 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
371 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
372 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
373 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
374 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
375 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
376 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
377 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
378 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
379 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
380___
381$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
382 $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
383 $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
384 $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
385 $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
386 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
387___
388$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
389 .cplocal $Tbl
390 .cpsetup $pf,$zero,aes_encrypt_internal
391___
392$code.=<<___;
393 .set reorder
394 $LA $Tbl,AES_Te # PIC-ified 'load address'
395
396 lwl $s0,0+$MSB($inp)
397 lwl $s1,4+$MSB($inp)
398 lwl $s2,8+$MSB($inp)
399 lwl $s3,12+$MSB($inp)
400 lwr $s0,0+$LSB($inp)
401 lwr $s1,4+$LSB($inp)
402 lwr $s2,8+$LSB($inp)
403 lwr $s3,12+$LSB($inp)
404
405 bal _mips_AES_encrypt
406
407 swr $s0,0+$LSB($out)
408 swr $s1,4+$LSB($out)
409 swr $s2,8+$LSB($out)
410 swr $s3,12+$LSB($out)
411 swl $s0,0+$MSB($out)
412 swl $s1,4+$MSB($out)
413 swl $s2,8+$MSB($out)
414 swl $s3,12+$MSB($out)
415
416 .set noreorder
417 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
418 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
419 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
420 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
421 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
422 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
423 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
424 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
425 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
426 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
427___
428$code.=<<___ if ($flavour =~ /nubi/i);
429 $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
430 $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
431 $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
432 $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
433 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
434___
435$code.=<<___;
436 jr $ra
437 $PTR_ADD $sp,$FRAMESIZE
438.end aes_encrypt_internal
439___
440
441$code.=<<___;
442.align 5
443.ent _mips_AES_decrypt
444_mips_AES_decrypt:
445 .frame $sp,0,$ra
446 .set reorder
447 lw $t0,0($key)
448 lw $t1,4($key)
449 lw $t2,8($key)
450 lw $t3,12($key)
451 lw $cnt,240($key)
452 $PTR_ADD $key0,$key,16
453
454 xor $s0,$t0
455 xor $s1,$t1
456 xor $s2,$t2
457 xor $s3,$t3
458
459 sub $cnt,1
460 _xtr $i0,$s3,16-2
461.Loop_dec:
462 _xtr $i1,$s0,16-2
463 _xtr $i2,$s1,16-2
464 _xtr $i3,$s2,16-2
465 and $i0,0x3fc
466 and $i1,0x3fc
467 and $i2,0x3fc
468 and $i3,0x3fc
469 $PTR_ADD $i0,$Tbl
470 $PTR_ADD $i1,$Tbl
471 $PTR_ADD $i2,$Tbl
472 $PTR_ADD $i3,$Tbl
473 lwl $t0,3($i0) # Td1[s3>>16]
474 lwl $t1,3($i1) # Td1[s0>>16]
475 lwl $t2,3($i2) # Td1[s1>>16]
476 lwl $t3,3($i3) # Td1[s2>>16]
477 lwr $t0,2($i0) # Td1[s3>>16]
478 lwr $t1,2($i1) # Td1[s0>>16]
479 lwr $t2,2($i2) # Td1[s1>>16]
480 lwr $t3,2($i3) # Td1[s2>>16]
481
482 _xtr $i0,$s2,8-2
483 _xtr $i1,$s3,8-2
484 _xtr $i2,$s0,8-2
485 _xtr $i3,$s1,8-2
486 and $i0,0x3fc
487 and $i1,0x3fc
488 and $i2,0x3fc
489 and $i3,0x3fc
490 $PTR_ADD $i0,$Tbl
491 $PTR_ADD $i1,$Tbl
492 $PTR_ADD $i2,$Tbl
493 $PTR_ADD $i3,$Tbl
494 lwl $t4,2($i0) # Td2[s2>>8]
495 lwl $t5,2($i1) # Td2[s3>>8]
496 lwl $t6,2($i2) # Td2[s0>>8]
497 lwl $t7,2($i3) # Td2[s1>>8]
498 lwr $t4,1($i0) # Td2[s2>>8]
499 lwr $t5,1($i1) # Td2[s3>>8]
500 lwr $t6,1($i2) # Td2[s0>>8]
501 lwr $t7,1($i3) # Td2[s1>>8]
502
503 _xtr $i0,$s1,0-2
504 _xtr $i1,$s2,0-2
505 _xtr $i2,$s3,0-2
506 _xtr $i3,$s0,0-2
507 and $i0,0x3fc
508 and $i1,0x3fc
509 and $i2,0x3fc
510 and $i3,0x3fc
511 $PTR_ADD $i0,$Tbl
512 $PTR_ADD $i1,$Tbl
513 $PTR_ADD $i2,$Tbl
514 $PTR_ADD $i3,$Tbl
515 lwl $t8,1($i0) # Td3[s1]
516 lwl $t9,1($i1) # Td3[s2]
517 lwl $t10,1($i2) # Td3[s3]
518 lwl $t11,1($i3) # Td3[s0]
519 lwr $t8,0($i0) # Td3[s1]
520 lwr $t9,0($i1) # Td3[s2]
521 lwr $t10,0($i2) # Td3[s3]
522 lwr $t11,0($i3) # Td3[s0]
523
524 _xtr $i0,$s0,24-2
525 _xtr $i1,$s1,24-2
526 _xtr $i2,$s2,24-2
527 _xtr $i3,$s3,24-2
528 and $i0,0x3fc
529 and $i1,0x3fc
530 and $i2,0x3fc
531 and $i3,0x3fc
532 $PTR_ADD $i0,$Tbl
533 $PTR_ADD $i1,$Tbl
534 $PTR_ADD $i2,$Tbl
535 $PTR_ADD $i3,$Tbl
536
537 xor $t0,$t4
538 xor $t1,$t5
539 xor $t2,$t6
540 xor $t3,$t7
541
542
543 lw $t4,0($i0) # Td0[s0>>24]
544 lw $t5,0($i1) # Td0[s1>>24]
545 lw $t6,0($i2) # Td0[s2>>24]
546 lw $t7,0($i3) # Td0[s3>>24]
547
548 lw $s0,0($key0)
549 lw $s1,4($key0)
550 lw $s2,8($key0)
551 lw $s3,12($key0)
552
553 xor $t0,$t8
554 xor $t1,$t9
555 xor $t2,$t10
556 xor $t3,$t11
557
558 xor $t0,$t4
559 xor $t1,$t5
560 xor $t2,$t6
561 xor $t3,$t7
562
563 sub $cnt,1
564 $PTR_ADD $key0,16
565 xor $s0,$t0
566 xor $s1,$t1
567 xor $s2,$t2
568 xor $s3,$t3
569 .set noreorder
570 bnez $cnt,.Loop_dec
571 _xtr $i0,$s3,16-2
572
573 .set reorder
574 lw $t4,1024($Tbl) # prefetch Td4
575 lw $t5,1024+32($Tbl)
576 lw $t6,1024+64($Tbl)
577 lw $t7,1024+96($Tbl)
578 lw $t8,1024+128($Tbl)
579 lw $t9,1024+160($Tbl)
580 lw $t10,1024+192($Tbl)
581 lw $t11,1024+224($Tbl)
582
583 _xtr $i0,$s3,16
584 _xtr $i1,$s0,16
585 _xtr $i2,$s1,16
586 _xtr $i3,$s2,16
587 and $i0,0xff
588 and $i1,0xff
589 and $i2,0xff
590 and $i3,0xff
591 $PTR_ADD $i0,$Tbl
592 $PTR_ADD $i1,$Tbl
593 $PTR_ADD $i2,$Tbl
594 $PTR_ADD $i3,$Tbl
595 lbu $t0,1024($i0) # Td4[s3>>16]
596 lbu $t1,1024($i1) # Td4[s0>>16]
597 lbu $t2,1024($i2) # Td4[s1>>16]
598 lbu $t3,1024($i3) # Td4[s2>>16]
599
600 _xtr $i0,$s2,8
601 _xtr $i1,$s3,8
602 _xtr $i2,$s0,8
603 _xtr $i3,$s1,8
604 and $i0,0xff
605 and $i1,0xff
606 and $i2,0xff
607 and $i3,0xff
608 $PTR_ADD $i0,$Tbl
609 $PTR_ADD $i1,$Tbl
610 $PTR_ADD $i2,$Tbl
611 $PTR_ADD $i3,$Tbl
612 lbu $t4,1024($i0) # Td4[s2>>8]
613 lbu $t5,1024($i1) # Td4[s3>>8]
614 lbu $t6,1024($i2) # Td4[s0>>8]
615 lbu $t7,1024($i3) # Td4[s1>>8]
616
617 _xtr $i0,$s0,24
618 _xtr $i1,$s1,24
619 _xtr $i2,$s2,24
620 _xtr $i3,$s3,24
621 $PTR_ADD $i0,$Tbl
622 $PTR_ADD $i1,$Tbl
623 $PTR_ADD $i2,$Tbl
624 $PTR_ADD $i3,$Tbl
625 lbu $t8,1024($i0) # Td4[s0>>24]
626 lbu $t9,1024($i1) # Td4[s1>>24]
627 lbu $t10,1024($i2) # Td4[s2>>24]
628 lbu $t11,1024($i3) # Td4[s3>>24]
629
630 _xtr $i0,$s1,0
631 _xtr $i1,$s2,0
632 _xtr $i2,$s3,0
633 _xtr $i3,$s0,0
634
635 _ins $t0,16
636 _ins $t1,16
637 _ins $t2,16
638 _ins $t3,16
639
640 _ins $t4,8
641 _ins $t5,8
642 _ins $t6,8
643 _ins $t7,8
644
645 xor $t0,$t4
646 xor $t1,$t5
647 xor $t2,$t6
648 xor $t3,$t7
649
650 $PTR_ADD $i0,$Tbl
651 $PTR_ADD $i1,$Tbl
652 $PTR_ADD $i2,$Tbl
653 $PTR_ADD $i3,$Tbl
654 lbu $t4,1024($i0) # Td4[s1]
655 lbu $t5,1024($i1) # Td4[s2]
656 lbu $t6,1024($i2) # Td4[s3]
657 lbu $t7,1024($i3) # Td4[s0]
658
659 _ins $t8,24
660 _ins $t9,24
661 _ins $t10,24
662 _ins $t11,24
663
664 lw $s0,0($key0)
665 lw $s1,4($key0)
666 lw $s2,8($key0)
667 lw $s3,12($key0)
668
669 _ins $t4,0
670 _ins $t5,0
671 _ins $t6,0
672 _ins $t7,0
673
674
675 xor $t0,$t8
676 xor $t1,$t9
677 xor $t2,$t10
678 xor $t3,$t11
679
680 xor $t0,$t4
681 xor $t1,$t5
682 xor $t2,$t6
683 xor $t3,$t7
684
685 xor $s0,$t0
686 xor $s1,$t1
687 xor $s2,$t2
688 xor $s3,$t3
689
690 jr $ra
691.end _mips_AES_decrypt
692
693.align 5
694.globl aes_decrypt_internal
695.ent aes_decrypt_internal
696aes_decrypt_internal:
697 .frame $sp,$FRAMESIZE,$ra
698 .mask $SAVED_REGS_MASK,-$SZREG
699 .set noreorder
700___
701$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
702 .cpload $pf
703___
704$code.=<<___;
705 $PTR_SUB $sp,$FRAMESIZE
706 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
707 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
708 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
709 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
710 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
711 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
712 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
713 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
714 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
715 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
716___
717$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
718 $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
719 $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
720 $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
721 $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
722 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
723___
724$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
725 .cplocal $Tbl
726 .cpsetup $pf,$zero,aes_decrypt_internal
727___
728$code.=<<___;
729 .set reorder
730 $LA $Tbl,AES_Td # PIC-ified 'load address'
731
732 lwl $s0,0+$MSB($inp)
733 lwl $s1,4+$MSB($inp)
734 lwl $s2,8+$MSB($inp)
735 lwl $s3,12+$MSB($inp)
736 lwr $s0,0+$LSB($inp)
737 lwr $s1,4+$LSB($inp)
738 lwr $s2,8+$LSB($inp)
739 lwr $s3,12+$LSB($inp)
740
741 bal _mips_AES_decrypt
742
743 swr $s0,0+$LSB($out)
744 swr $s1,4+$LSB($out)
745 swr $s2,8+$LSB($out)
746 swr $s3,12+$LSB($out)
747 swl $s0,0+$MSB($out)
748 swl $s1,4+$MSB($out)
749 swl $s2,8+$MSB($out)
750 swl $s3,12+$MSB($out)
751
752 .set noreorder
753 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
754 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
755 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
756 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
757 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
758 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
759 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
760 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
761 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
762 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
763___
764$code.=<<___ if ($flavour =~ /nubi/i);
765 $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
766 $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
767 $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
768 $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
769 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
770___
771$code.=<<___;
772 jr $ra
773 $PTR_ADD $sp,$FRAMESIZE
774.end aes_decrypt_internal
775___
776}}}
777
778{{{
779my $FRAMESIZE=8*$SZREG;
780my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000;
781
782my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);
783my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
784my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
785my ($rcon,$cnt)=($gp,$fp);
786
787$code.=<<___;
788.align 5
789.ent _mips_AES_set_encrypt_key
790_mips_AES_set_encrypt_key:
791 .frame $sp,0,$ra
792 .set noreorder
793 beqz $inp,.Lekey_done
794 li $t0,-1
795 beqz $key,.Lekey_done
796 $PTR_ADD $rcon,$Tbl,1024+256
797
798 .set reorder
799 lwl $rk0,0+$MSB($inp) # load 128 bits
800 lwl $rk1,4+$MSB($inp)
801 lwl $rk2,8+$MSB($inp)
802 lwl $rk3,12+$MSB($inp)
803 li $at,128
804 lwr $rk0,0+$LSB($inp)
805 lwr $rk1,4+$LSB($inp)
806 lwr $rk2,8+$LSB($inp)
807 lwr $rk3,12+$LSB($inp)
808 .set noreorder
809 beq $bits,$at,.L128bits
810 li $cnt,10
811
812 .set reorder
813 lwl $rk4,16+$MSB($inp) # load 192 bits
814 lwl $rk5,20+$MSB($inp)
815 li $at,192
816 lwr $rk4,16+$LSB($inp)
817 lwr $rk5,20+$LSB($inp)
818 .set noreorder
819 beq $bits,$at,.L192bits
820 li $cnt,8
821
822 .set reorder
823 lwl $rk6,24+$MSB($inp) # load 256 bits
824 lwl $rk7,28+$MSB($inp)
825 li $at,256
826 lwr $rk6,24+$LSB($inp)
827 lwr $rk7,28+$LSB($inp)
828 .set noreorder
829 beq $bits,$at,.L256bits
830 li $cnt,7
831
832 b .Lekey_done
833 li $t0,-2
834
835.align 4
836.L128bits:
837 .set reorder
838 srl $i0,$rk3,16
839 srl $i1,$rk3,8
840 and $i0,0xff
841 and $i1,0xff
842 and $i2,$rk3,0xff
843 srl $i3,$rk3,24
844 $PTR_ADD $i0,$Tbl
845 $PTR_ADD $i1,$Tbl
846 $PTR_ADD $i2,$Tbl
847 $PTR_ADD $i3,$Tbl
848 lbu $i0,1024($i0)
849 lbu $i1,1024($i1)
850 lbu $i2,1024($i2)
851 lbu $i3,1024($i3)
852
853 sw $rk0,0($key)
854 sw $rk1,4($key)
855 sw $rk2,8($key)
856 sw $rk3,12($key)
857 sub $cnt,1
858 $PTR_ADD $key,16
859
860 _bias $i0,24
861 _bias $i1,16
862 _bias $i2,8
863 _bias $i3,0
864
865 xor $rk0,$i0
866 lw $i0,0($rcon)
867 xor $rk0,$i1
868 xor $rk0,$i2
869 xor $rk0,$i3
870 xor $rk0,$i0
871
872 xor $rk1,$rk0
873 xor $rk2,$rk1
874 xor $rk3,$rk2
875
876 .set noreorder
877 bnez $cnt,.L128bits
878 $PTR_ADD $rcon,4
879
880 sw $rk0,0($key)
881 sw $rk1,4($key)
882 sw $rk2,8($key)
883 li $cnt,10
884 sw $rk3,12($key)
885 li $t0,0
886 sw $cnt,80($key)
887 b .Lekey_done
888 $PTR_SUB $key,10*16
889
890.align 4
891.L192bits:
892 .set reorder
893 srl $i0,$rk5,16
894 srl $i1,$rk5,8
895 and $i0,0xff
896 and $i1,0xff
897 and $i2,$rk5,0xff
898 srl $i3,$rk5,24
899 $PTR_ADD $i0,$Tbl
900 $PTR_ADD $i1,$Tbl
901 $PTR_ADD $i2,$Tbl
902 $PTR_ADD $i3,$Tbl
903 lbu $i0,1024($i0)
904 lbu $i1,1024($i1)
905 lbu $i2,1024($i2)
906 lbu $i3,1024($i3)
907
908 sw $rk0,0($key)
909 sw $rk1,4($key)
910 sw $rk2,8($key)
911 sw $rk3,12($key)
912 sw $rk4,16($key)
913 sw $rk5,20($key)
914 sub $cnt,1
915 $PTR_ADD $key,24
916
917 _bias $i0,24
918 _bias $i1,16
919 _bias $i2,8
920 _bias $i3,0
921
922 xor $rk0,$i0
923 lw $i0,0($rcon)
924 xor $rk0,$i1
925 xor $rk0,$i2
926 xor $rk0,$i3
927 xor $rk0,$i0
928
929 xor $rk1,$rk0
930 xor $rk2,$rk1
931 xor $rk3,$rk2
932 xor $rk4,$rk3
933 xor $rk5,$rk4
934
935 .set noreorder
936 bnez $cnt,.L192bits
937 $PTR_ADD $rcon,4
938
939 sw $rk0,0($key)
940 sw $rk1,4($key)
941 sw $rk2,8($key)
942 li $cnt,12
943 sw $rk3,12($key)
944 li $t0,0
945 sw $cnt,48($key)
946 b .Lekey_done
947 $PTR_SUB $key,12*16
948
949.align 4
950.L256bits:
951 .set reorder
952 srl $i0,$rk7,16
953 srl $i1,$rk7,8
954 and $i0,0xff
955 and $i1,0xff
956 and $i2,$rk7,0xff
957 srl $i3,$rk7,24
958 $PTR_ADD $i0,$Tbl
959 $PTR_ADD $i1,$Tbl
960 $PTR_ADD $i2,$Tbl
961 $PTR_ADD $i3,$Tbl
962 lbu $i0,1024($i0)
963 lbu $i1,1024($i1)
964 lbu $i2,1024($i2)
965 lbu $i3,1024($i3)
966
967 sw $rk0,0($key)
968 sw $rk1,4($key)
969 sw $rk2,8($key)
970 sw $rk3,12($key)
971 sw $rk4,16($key)
972 sw $rk5,20($key)
973 sw $rk6,24($key)
974 sw $rk7,28($key)
975 sub $cnt,1
976
977 _bias $i0,24
978 _bias $i1,16
979 _bias $i2,8
980 _bias $i3,0
981
982 xor $rk0,$i0
983 lw $i0,0($rcon)
984 xor $rk0,$i1
985 xor $rk0,$i2
986 xor $rk0,$i3
987 xor $rk0,$i0
988
989 xor $rk1,$rk0
990 xor $rk2,$rk1
991 xor $rk3,$rk2
992 beqz $cnt,.L256bits_done
993
994 srl $i0,$rk3,24
995 srl $i1,$rk3,16
996 srl $i2,$rk3,8
997 and $i3,$rk3,0xff
998 and $i1,0xff
999 and $i2,0xff
1000 $PTR_ADD $i0,$Tbl
1001 $PTR_ADD $i1,$Tbl
1002 $PTR_ADD $i2,$Tbl
1003 $PTR_ADD $i3,$Tbl
1004 lbu $i0,1024($i0)
1005 lbu $i1,1024($i1)
1006 lbu $i2,1024($i2)
1007 lbu $i3,1024($i3)
1008 sll $i0,24
1009 sll $i1,16
1010 sll $i2,8
1011
1012 xor $rk4,$i0
1013 xor $rk4,$i1
1014 xor $rk4,$i2
1015 xor $rk4,$i3
1016
1017 xor $rk5,$rk4
1018 xor $rk6,$rk5
1019 xor $rk7,$rk6
1020
1021 $PTR_ADD $key,32
1022 .set noreorder
1023 b .L256bits
1024 $PTR_ADD $rcon,4
1025
1026.L256bits_done:
1027 sw $rk0,32($key)
1028 sw $rk1,36($key)
1029 sw $rk2,40($key)
1030 li $cnt,14
1031 sw $rk3,44($key)
1032 li $t0,0
1033 sw $cnt,48($key)
1034 $PTR_SUB $key,12*16
1035
1036.Lekey_done:
1037 jr $ra
1038 nop
1039.end _mips_AES_set_encrypt_key
1040
1041.globl aes_set_encrypt_key_internal
1042.ent aes_set_encrypt_key_internal
1043aes_set_encrypt_key_internal:
1044 .frame $sp,$FRAMESIZE,$ra
1045 .mask $SAVED_REGS_MASK,-$SZREG
1046 .set noreorder
1047___
1048$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
1049 .cpload $pf
1050___
1051$code.=<<___;
1052 $PTR_SUB $sp,$FRAMESIZE
1053 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
1054 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
1055___
1056$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1057 $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
1058 $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
1059 $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
1060 $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
1061 $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
1062___
1063$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
1064 .cplocal $Tbl
1065 .cpsetup $pf,$zero,aes_set_encrypt_key_internal
1066___
1067$code.=<<___;
1068 .set reorder
1069 $LA $Tbl,AES_Te # PIC-ified 'load address'
1070
1071 bal _mips_AES_set_encrypt_key
1072
1073 .set noreorder
1074 move $a0,$t0
1075 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
1076 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
1077___
1078$code.=<<___ if ($flavour =~ /nubi/i);
1079	$REG_L	$s3,$FRAMESIZE-3*$SZREG($sp)
1080	$REG_L	$s2,$FRAMESIZE-4*$SZREG($sp)
1081	$REG_L	$s1,$FRAMESIZE-5*$SZREG($sp)
1082	$REG_L	$s0,$FRAMESIZE-6*$SZREG($sp)
1083	$REG_L	$gp,$FRAMESIZE-7*$SZREG($sp)
1084___
1085$code.=<<___;
1086 jr $ra
1087 $PTR_ADD $sp,$FRAMESIZE
1088.end aes_set_encrypt_key_internal
1089___
1090
1091my ($head,$tail)=($inp,$bits);
1092my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
1093my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
1094$code.=<<___;
1095.align 5
1096.globl aes_set_decrypt_key_internal
1097.ent aes_set_decrypt_key_internal
1098aes_set_decrypt_key_internal:
1099 .frame $sp,$FRAMESIZE,$ra
1100 .mask $SAVED_REGS_MASK,-$SZREG
1101 .set noreorder
1102___
1103$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
1104 .cpload $pf
1105___
1106$code.=<<___;
1107 $PTR_SUB $sp,$FRAMESIZE
1108 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
1109 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
1110___
1111$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1112 $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
1113 $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
1114 $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
1115 $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
1116 $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
1117___
1118$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
1119 .cplocal $Tbl
1120 .cpsetup $pf,$zero,aes_set_decrypt_key_internal
1121___
1122$code.=<<___;
1123 .set reorder
1124 $LA $Tbl,AES_Te # PIC-ified 'load address'
1125
1126 bal _mips_AES_set_encrypt_key
1127
1128 bltz $t0,.Ldkey_done
1129
1130 sll $at,$cnt,4
1131 $PTR_ADD $head,$key,0
1132 $PTR_ADD $tail,$key,$at
1133.align 4
1134.Lswap:
1135 lw $rk0,0($head)
1136 lw $rk1,4($head)
1137 lw $rk2,8($head)
1138 lw $rk3,12($head)
1139 lw $rk4,0($tail)
1140 lw $rk5,4($tail)
1141 lw $rk6,8($tail)
1142 lw $rk7,12($tail)
1143 sw $rk0,0($tail)
1144 sw $rk1,4($tail)
1145 sw $rk2,8($tail)
1146 sw $rk3,12($tail)
1147 $PTR_ADD $head,16
1148 $PTR_SUB $tail,16
1149 sw $rk4,-16($head)
1150 sw $rk5,-12($head)
1151 sw $rk6,-8($head)
1152 sw $rk7,-4($head)
1153 bne $head,$tail,.Lswap
1154
1155 lw $tp1,16($key) # modulo-scheduled
1156 lui $x80808080,0x8080
1157 sub $cnt,1
1158 or $x80808080,0x8080
1159 sll $cnt,2
1160 $PTR_ADD $key,16
1161 lui $x1b1b1b1b,0x1b1b
1162 nor $x7f7f7f7f,$zero,$x80808080
1163 or $x1b1b1b1b,0x1b1b
1164.align 4
1165.Lmix:
1166 and $m,$tp1,$x80808080
1167 and $tp2,$tp1,$x7f7f7f7f
1168 srl $tp4,$m,7
1169 addu $tp2,$tp2 # tp2<<1
1170 subu $m,$tp4
1171 and $m,$x1b1b1b1b
1172 xor $tp2,$m
1173
1174 and $m,$tp2,$x80808080
1175 and $tp4,$tp2,$x7f7f7f7f
1176 srl $tp8,$m,7
1177 addu $tp4,$tp4 # tp4<<1
1178 subu $m,$tp8
1179 and $m,$x1b1b1b1b
1180 xor $tp4,$m
1181
1182 and $m,$tp4,$x80808080
1183 and $tp8,$tp4,$x7f7f7f7f
1184 srl $tp9,$m,7
1185 addu $tp8,$tp8 # tp8<<1
1186 subu $m,$tp9
1187 and $m,$x1b1b1b1b
1188 xor $tp8,$m
1189
1190 xor $tp9,$tp8,$tp1
1191 xor $tpe,$tp8,$tp4
1192 xor $tpb,$tp9,$tp2
1193 xor $tpd,$tp9,$tp4
1194
1195 _ror $tp1,$tpd,16
1196 xor $tpe,$tp2
1197 _ror $tp2,$tpd,-16
1198 xor $tpe,$tp1
1199 _ror $tp1,$tp9,8
1200 xor $tpe,$tp2
1201 _ror $tp2,$tp9,-24
1202 xor $tpe,$tp1
1203 _ror $tp1,$tpb,24
1204 xor $tpe,$tp2
1205 _ror $tp2,$tpb,-8
1206 xor $tpe,$tp1
1207 lw $tp1,4($key) # modulo-scheduled
1208 xor $tpe,$tp2
1209 sub $cnt,1
1210 sw $tpe,0($key)
1211 $PTR_ADD $key,4
1212 bnez $cnt,.Lmix
1213
1214 li $t0,0
1215.Ldkey_done:
1216 .set noreorder
1217 move $a0,$t0
1218 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
1219 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
1220___
1221$code.=<<___ if ($flavour =~ /nubi/i);
1222	$REG_L	$s3,$FRAMESIZE-3*$SZREG($sp)
1223	$REG_L	$s2,$FRAMESIZE-4*$SZREG($sp)
1224	$REG_L	$s1,$FRAMESIZE-5*$SZREG($sp)
1225	$REG_L	$s0,$FRAMESIZE-6*$SZREG($sp)
1226	$REG_L	$gp,$FRAMESIZE-7*$SZREG($sp)
1227___
1228$code.=<<___;
1229 jr $ra
1230 $PTR_ADD $sp,$FRAMESIZE
1231.end aes_set_decrypt_key_internal
1232___
1233}}}
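The .Lmix loop above is the InvMixColumns pass that aes_set_decrypt_key_internal applies to the round keys after reversing their order; its building block is a branchless doubling of four GF(2^8) elements packed in one word, built from the 0x80808080, 0x7f7f7f7f and 0x1b1b1b1b masks. A C sketch of that one step (the helper name is mine, not the module's):

#include <stdint.h>

/* Double all four bytes of tp in GF(2^8): shift each byte left with
 * the carries masked off, then xor in the reduction byte 0x1b wherever
 * a top bit was set.  Mirrors the and/srl/addu/subu/xor sequence that
 * derives tp2, tp4 and tp8 in .Lmix. */
static uint32_t
xtime4(uint32_t tp)
{
	uint32_t m = tp & 0x80808080;

	return ((tp & 0x7f7f7f7f) << 1) ^ ((m - (m >> 7)) & 0x1b1b1b1b);
}

Applying it three times yields 2x, 4x and 8x of every key byte, from which the InvMixColumns coefficients 9x, 11x, 13x and 14x are obtained by xors, exactly as tp9, tpb, tpd and tpe are formed above.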
1234
1235######################################################################
1236# Tables are kept in an endian-neutral manner
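A quick way to convince yourself of the claim: a plain aligned load of the first entry below returns 0xc66363a5 on a big-endian host and 0xa56363c6 on a little-endian one, and the generator compensates by mirroring shift amounts (the _xtr/_ins machinery at the end of this file) instead of ever swapping data bytes. A stand-alone C check, purely illustrative:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* The first Te0 entry exactly as the .byte directives lay it out. */
	static const uint8_t entry[4] = { 0xc6, 0x63, 0x63, 0xa5 };
	uint32_t w;

	memcpy(&w, entry, sizeof(w));	/* what a plain lw returns */
	printf("Te0[0] as a host word: 0x%08x\n", w);
	return 0;
}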
1237$code.=<<___;
1238.rdata
1239.align 6
1240AES_Te:
1241.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0
1242.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
1243.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
1244.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
1245.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
1246.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
1247.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
1248.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
1249.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
1250.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
1251.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
1252.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
1253.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
1254.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
1255.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
1256.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
1257.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
1258.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
1259.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
1260.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
1261.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
1262.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
1263.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
1264.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
1265.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
1266.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
1267.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
1268.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
1269.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
1270.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
1271.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
1272.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
1273.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e
1274.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e
1275.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2
1276.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb
1277.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d
1278.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce
1279.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e
1280.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97
1281.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68
1282.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c
1283.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f
1284.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed
1285.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46
1286.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b
1287.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4
1288.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a
1289.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a
1290.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16
1291.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7
1292.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94
1293.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10
1294.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81
1295.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44
1296.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3
1297.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe
1298.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a
1299.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc
1300.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04
1301.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1
1302.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63
1303.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a
1304.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d
1305.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14
1306.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f
1307.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2
1308.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39
1309.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2
1310.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47
1311.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7
1312.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95
1313.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98
1314.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f
1315.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e
1316.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83
1317.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29
1318.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c
1319.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2
1320.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76
1321.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56
1322.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e
1323.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a
1324.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4
1325.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e
1326.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6
1327.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4
1328.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b
1329.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43
1330.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7
1331.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64
1332.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0
1333.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa
1334.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25
1335.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e
1336.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18
1337.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88
1338.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72
1339.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1
1340.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51
1341.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c
1342.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21
1343.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc
1344.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85
1345.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42
1346.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa
1347.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05
1348.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12
1349.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f
1350.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0
1351.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58
1352.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9
1353.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13
1354.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33
1355.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70
1356.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7
1357.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22
1358.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20
1359.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff
1360.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a
1361.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8
1362.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17
1363.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31
1364.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8
1365.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0
1366.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11
1367.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
1368.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
1369
1370.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4
1371.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
1372.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
1373.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
1374.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
1375.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
1376.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
1377.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
1378.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
1379.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
1380.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
1381.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
1382.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
1383.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
1384.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
1385.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
1386.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
1387.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
1388.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
1389.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
1390.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
1391.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
1392.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
1393.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
1394.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
1395.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
1396.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
1397.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
1398.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
1399.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
1400.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
1401.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
1402
1403.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon
1404.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
1405.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
1406.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
1407.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
1408
1409.align 6
1410AES_Td:
1411.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0
1412.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
1413.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1
1414.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93
1415.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6
1416.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25
1417.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7
1418.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f
1419.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67
1420.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1
1421.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12
1422.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6
1423.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95
1424.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda
1425.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3
1426.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44
1427.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78
1428.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd
1429.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17
1430.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4
1431.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82
1432.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45
1433.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84
1434.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94
1435.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19
1436.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7
1437.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2
1438.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a
1439.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03
1440.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5
1441.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2
1442.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c
1443.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92
1444.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1
1445.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5
1446.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a
1447.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0
1448.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75
1449.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa
1450.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51
1451.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d
1452.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46
1453.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05
1454.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff
1455.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97
1456.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77
1457.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88
1458.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb
1459.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9
1460.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00
1461.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48
1462.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e
1463.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56
1464.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27
1465.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21
1466.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a
1467.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f
1468.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e
1469.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2
1470.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16
1471.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5
1472.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d
1473.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad
1474.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8
1475.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c
1476.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd
1477.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc
1478.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34
1479.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc
1480.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63
1481.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10
1482.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20
1483.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8
1484.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d
1485.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3
1486.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0
1487.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99
1488.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22
1489.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a
1490.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef
1491.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1
1492.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36
1493.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28
1494.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4
1495.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d
1496.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62
1497.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8
1498.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5
1499.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c
1500.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3
1501.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7
1502.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b
1503.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4
1504.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8
1505.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e
1506.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6
1507.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce
1508.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6
1509.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31
1510.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0
1511.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6
1512.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15
1513.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7
1514.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f
1515.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d
1516.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf
1517.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b
1518.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f
1519.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d
1520.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e
1521.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52
1522.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13
1523.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a
1524.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89
1525.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35
1526.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c
1527.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f
1528.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf
1529.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b
1530.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86
1531.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e
1532.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f
1533.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c
1534.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41
1535.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde
1536.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90
1537.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70
1538.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42
1539
1540.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4
1541.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
1542.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
1543.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
1544.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
1545.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
1546.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
1547.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
1548.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
1549.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
1550.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
1551.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
1552.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
1553.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
1554.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
1555.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
1556.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
1557.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
1558.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
1559.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
1560.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
1561.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
1562.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
1563.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
1564.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
1565.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
1566.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1567.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1568.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1569.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1570.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1571.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1572___
1573
1574foreach (split("\n",$code)) {
1575 s/\`([^\`]*)\`/eval $1/ge;
1576
1577	# the made-up _instructions (_xtr, _ins, _ror and _bias) cope
1578	# with byte-order dependencies...
1579 if (/^\s+_/) {
1580 s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/;
1581
1582 s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/
1583 sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
1584 : eval("24-$3"))/e or
1585 s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
1586 sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
1587 : eval("24-$3"))/e or
1588 s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/
1589 sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
1590 : eval("$3*-1"))/e or
1591 s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
1592 sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
1593 : eval("($3-16)&31"))/e;
1594
1595 s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/
1596 sprintf("sll\t$1,$2,$3")/e or
1597 s/srl\s+(\$[0-9]+),(\$[0-9]+),0/
1598 sprintf("and\t$1,$2,0xff")/e or
1599 s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/;
1600 }
1601
1602 # convert lwl/lwr and swr/swl to little-endian order
1603 if (!$big_endian && /^\s+[sl]w[lr]\s+/) {
1604 s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/
1605 sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or
1606 s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/
1607 sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e;
1608 }
1609
1610 print $_,"\n";
1611}
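Concretely, '_xtr $i0,$s1,16-2' expands to srl $i0,$s1,14 on a big-endian host and, because the substitution builds the string "24-16-2", to srl $i0,$s1,6 on a little-endian one; the following 'and $i0,0x3fc' then leaves the selected state byte already multiplied by four, a ready-made offset into a table of 32-bit words, and any negative shift count falls through to the srl-to-sll rewrite just above. The same arithmetic as a C sketch (the helper name is mine):

#include <stdint.h>

/* Offset produced by an `_xtr reg,s,bitpos-2` / `and reg,0x3fc` pair:
 * the byte of s at bit position bitpos (mirrored when the data was
 * loaded unswapped on a little-endian host), pre-scaled by 4. */
static uint32_t
xtr_offset(uint32_t s, int bitpos, int big_endian)
{
	int e = big_endian ? bitpos - 2 : 22 - bitpos;

	return (e >= 0 ? s >> e : s << -e) & 0x3fc;
}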
1612
1613close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-parisc.pl b/src/lib/libcrypto/aes/asm/aes-parisc.pl
deleted file mode 100644
index 4e4592b56b..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-parisc.pl
+++ /dev/null
@@ -1,1030 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for PA-RISC.
11#
12# June 2009.
13#
14# The module is a mechanical transliteration of aes-sparcv9.pl, but with
15# a twist: S-boxes are compressed even further, down to 1K+256B. On
16# PA-7100LC performance is ~40% better than gcc 3.2 generated code and
17# is about 33 cycles per byte processed with a 128-bit key. Newer CPUs
18# perform at 16 cycles per byte. It's not faster than code generated
19# by the vendor compiler, but recall that it has compressed S-boxes,
20# which require extra processing.
21#
22# Special thanks to polarhome.com for providing an HP-UX account.
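One detail worth noting before the code: the encrypt/decrypt entry points handle unaligned input without byte loads. The prologue rounds the pointer down, loads one extra word, programs %sar (via mtctl ...,%cr11 below) with 32 minus eight times the misalignment, and realigns with vshd funnel shifts. The same idea in C, assuming a big-endian host as on PA-RISC; this is a sketch, not the module's code, and the casted aligned loads stand in for lw:

#include <stdint.h>

static uint32_t
load_word_unaligned(const uint8_t *p)
{
	uintptr_t k = (uintptr_t)p & 3;	/* misalignment in bytes */
	const uint32_t *q = (const uint32_t *)(p - k);
	unsigned sh;

	if (k == 0)
		return q[0];		/* already aligned */
	sh = 8 * (unsigned)k;
	/* vshd q[0],q[1] with %sar = 32 - sh */
	return (q[0] << sh) | (q[1] >> (32 - sh));
}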
23
24$flavour = shift;
25$output = shift;
26open STDOUT,">$output";
27
28if ($flavour =~ /64/) {
29 $LEVEL ="2.0W";
30 $SIZE_T =8;
31 $FRAME_MARKER =80;
32 $SAVED_RP =16;
33 $PUSH ="std";
34 $PUSHMA ="std,ma";
35 $POP ="ldd";
36 $POPMB ="ldd,mb";
37} else {
38 $LEVEL ="1.0";
39 $SIZE_T =4;
40 $FRAME_MARKER =48;
41 $SAVED_RP =20;
42 $PUSH ="stw";
43 $PUSHMA ="stwm";
44 $POP ="ldw";
45 $POPMB ="ldwm";
46}
47
48$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
49 # [+ argument transfer]
50$inp="%r26"; # arg0
51$out="%r25"; # arg1
52$key="%r24"; # arg2
53
54($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4");
55($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8");
56
57($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7,
58 $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) =
59("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16",
60"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26");
61
62$tbl="%r28";
63$rounds="%r29";
64
65$code=<<___;
66 .LEVEL $LEVEL
67 .text
68
69 .EXPORT aes_encrypt_internal,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
70 .ALIGN 64
71aes_encrypt_internal
72 .PROC
73 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
74 .ENTRY
75 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
76 $PUSHMA %r3,$FRAME(%sp)
77 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
78 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
79 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
80 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
81 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
82 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
83 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
84 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
85 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
86 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
87 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
88 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
89 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
90 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
91 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
92
93 ldi 3,$t0
94#ifdef __PIC__
95 addil LT'L\$AES_Te, %r19
96 ldw RT'L\$AES_Te(%r1), $tbl
97#else
98 ldil L'L\$AES_Te, %t1
99 ldo R'L\$AES_Te(%t1), $tbl
100#endif
101
102 and $inp,$t0,$t0
103 sub $inp,$t0,$inp
104 ldw 0($inp),$s0
105 ldw 4($inp),$s1
106 ldw 8($inp),$s2
107 comib,= 0,$t0,L\$enc_inp_aligned
108 ldw 12($inp),$s3
109
110 sh3addl $t0,%r0,$t0
111 subi 32,$t0,$t0
112 mtctl $t0,%cr11
113 ldw 16($inp),$t1
114 vshd $s0,$s1,$s0
115 vshd $s1,$s2,$s1
116 vshd $s2,$s3,$s2
117 vshd $s3,$t1,$s3
118
119L\$enc_inp_aligned
120 bl _parisc_AES_encrypt,%r31
121 nop
122
123 extru,<> $out,31,2,%r0
124 b L\$enc_out_aligned
125 nop
126
127 _srm $s0,24,$acc0
128 _srm $s0,16,$acc1
129 stb $acc0,0($out)
130 _srm $s0,8,$acc2
131 stb $acc1,1($out)
132 _srm $s1,24,$acc4
133 stb $acc2,2($out)
134 _srm $s1,16,$acc5
135 stb $s0,3($out)
136 _srm $s1,8,$acc6
137 stb $acc4,4($out)
138 _srm $s2,24,$acc0
139 stb $acc5,5($out)
140 _srm $s2,16,$acc1
141 stb $acc6,6($out)
142 _srm $s2,8,$acc2
143 stb $s1,7($out)
144 _srm $s3,24,$acc4
145 stb $acc0,8($out)
146 _srm $s3,16,$acc5
147 stb $acc1,9($out)
148 _srm $s3,8,$acc6
149 stb $acc2,10($out)
150 stb $s2,11($out)
151 stb $acc4,12($out)
152 stb $acc5,13($out)
153 stb $acc6,14($out)
154 b L\$enc_done
155 stb $s3,15($out)
156
157L\$enc_out_aligned
158 stw $s0,0($out)
159 stw $s1,4($out)
160 stw $s2,8($out)
161 stw $s3,12($out)
162
163L\$enc_done
164 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
165 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
166 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
167 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
168 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
169 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
170 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
171 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
172 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
173 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
174 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
175 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
176 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
177 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
178 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
179 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
180 bv (%r2)
181 .EXIT
182 $POPMB -$FRAME(%sp),%r3
183 .PROCEND
184
185 .ALIGN 16
186_parisc_AES_encrypt
187 .PROC
188 .CALLINFO MILLICODE
189 .ENTRY
190 ldw 240($key),$rounds
191 ldw 0($key),$t0
192 ldw 4($key),$t1
193 ldw 8($key),$t2
194 _srm $rounds,1,$rounds
195 xor $t0,$s0,$s0
196 ldw 12($key),$t3
197 _srm $s0,24,$acc0
198 xor $t1,$s1,$s1
199 ldw 16($key),$t0
200 _srm $s1,16,$acc1
201 xor $t2,$s2,$s2
202 ldw 20($key),$t1
203 xor $t3,$s3,$s3
204 ldw 24($key),$t2
205 ldw 28($key),$t3
206L\$enc_loop
207 _srm $s2,8,$acc2
208 ldwx,s $acc0($tbl),$acc0
209 _srm $s3,0,$acc3
210 ldwx,s $acc1($tbl),$acc1
211 _srm $s1,24,$acc4
212 ldwx,s $acc2($tbl),$acc2
213 _srm $s2,16,$acc5
214 ldwx,s $acc3($tbl),$acc3
215 _srm $s3,8,$acc6
216 ldwx,s $acc4($tbl),$acc4
217 _srm $s0,0,$acc7
218 ldwx,s $acc5($tbl),$acc5
219 _srm $s2,24,$acc8
220 ldwx,s $acc6($tbl),$acc6
221 _srm $s3,16,$acc9
222 ldwx,s $acc7($tbl),$acc7
223 _srm $s0,8,$acc10
224 ldwx,s $acc8($tbl),$acc8
225 _srm $s1,0,$acc11
226 ldwx,s $acc9($tbl),$acc9
227 _srm $s3,24,$acc12
228 ldwx,s $acc10($tbl),$acc10
229 _srm $s0,16,$acc13
230 ldwx,s $acc11($tbl),$acc11
231 _srm $s1,8,$acc14
232 ldwx,s $acc12($tbl),$acc12
233 _srm $s2,0,$acc15
234 ldwx,s $acc13($tbl),$acc13
235 ldwx,s $acc14($tbl),$acc14
236 ldwx,s $acc15($tbl),$acc15
237 addib,= -1,$rounds,L\$enc_last
238 ldo 32($key),$key
239
240 _ror $acc1,8,$acc1
241 xor $acc0,$t0,$t0
242 ldw 0($key),$s0
243 _ror $acc2,16,$acc2
244 xor $acc1,$t0,$t0
245 ldw 4($key),$s1
246 _ror $acc3,24,$acc3
247 xor $acc2,$t0,$t0
248 ldw 8($key),$s2
249 _ror $acc5,8,$acc5
250 xor $acc3,$t0,$t0
251 ldw 12($key),$s3
252 _ror $acc6,16,$acc6
253 xor $acc4,$t1,$t1
254 _ror $acc7,24,$acc7
255 xor $acc5,$t1,$t1
256 _ror $acc9,8,$acc9
257 xor $acc6,$t1,$t1
258 _ror $acc10,16,$acc10
259 xor $acc7,$t1,$t1
260 _ror $acc11,24,$acc11
261 xor $acc8,$t2,$t2
262 _ror $acc13,8,$acc13
263 xor $acc9,$t2,$t2
264 _ror $acc14,16,$acc14
265 xor $acc10,$t2,$t2
266 _ror $acc15,24,$acc15
267 xor $acc11,$t2,$t2
268 xor $acc12,$acc14,$acc14
269 xor $acc13,$t3,$t3
270 _srm $t0,24,$acc0
271 xor $acc14,$t3,$t3
272 _srm $t1,16,$acc1
273 xor $acc15,$t3,$t3
274
275 _srm $t2,8,$acc2
276 ldwx,s $acc0($tbl),$acc0
277 _srm $t3,0,$acc3
278 ldwx,s $acc1($tbl),$acc1
279 _srm $t1,24,$acc4
280 ldwx,s $acc2($tbl),$acc2
281 _srm $t2,16,$acc5
282 ldwx,s $acc3($tbl),$acc3
283 _srm $t3,8,$acc6
284 ldwx,s $acc4($tbl),$acc4
285 _srm $t0,0,$acc7
286 ldwx,s $acc5($tbl),$acc5
287 _srm $t2,24,$acc8
288 ldwx,s $acc6($tbl),$acc6
289 _srm $t3,16,$acc9
290 ldwx,s $acc7($tbl),$acc7
291 _srm $t0,8,$acc10
292 ldwx,s $acc8($tbl),$acc8
293 _srm $t1,0,$acc11
294 ldwx,s $acc9($tbl),$acc9
295 _srm $t3,24,$acc12
296 ldwx,s $acc10($tbl),$acc10
297 _srm $t0,16,$acc13
298 ldwx,s $acc11($tbl),$acc11
299 _srm $t1,8,$acc14
300 ldwx,s $acc12($tbl),$acc12
301 _srm $t2,0,$acc15
302 ldwx,s $acc13($tbl),$acc13
303 _ror $acc1,8,$acc1
304 ldwx,s $acc14($tbl),$acc14
305
306 _ror $acc2,16,$acc2
307 xor $acc0,$s0,$s0
308 ldwx,s $acc15($tbl),$acc15
309 _ror $acc3,24,$acc3
310 xor $acc1,$s0,$s0
311 ldw 16($key),$t0
312 _ror $acc5,8,$acc5
313 xor $acc2,$s0,$s0
314 ldw 20($key),$t1
315 _ror $acc6,16,$acc6
316 xor $acc3,$s0,$s0
317 ldw 24($key),$t2
318 _ror $acc7,24,$acc7
319 xor $acc4,$s1,$s1
320 ldw 28($key),$t3
321 _ror $acc9,8,$acc9
322 xor $acc5,$s1,$s1
323 ldw 1024+0($tbl),%r0 ; prefetch te4
324 _ror $acc10,16,$acc10
325 xor $acc6,$s1,$s1
326 ldw 1024+32($tbl),%r0 ; prefetch te4
327 _ror $acc11,24,$acc11
328 xor $acc7,$s1,$s1
329 ldw 1024+64($tbl),%r0 ; prefetch te4
330 _ror $acc13,8,$acc13
331 xor $acc8,$s2,$s2
332 ldw 1024+96($tbl),%r0 ; prefetch te4
333 _ror $acc14,16,$acc14
334 xor $acc9,$s2,$s2
335 ldw 1024+128($tbl),%r0 ; prefetch te4
336 _ror $acc15,24,$acc15
337 xor $acc10,$s2,$s2
338 ldw 1024+160($tbl),%r0 ; prefetch te4
339 _srm $s0,24,$acc0
340 xor $acc11,$s2,$s2
341 ldw 1024+192($tbl),%r0 ; prefetch te4
342 xor $acc12,$acc14,$acc14
343 xor $acc13,$s3,$s3
344 ldw 1024+224($tbl),%r0 ; prefetch te4
345 _srm $s1,16,$acc1
346 xor $acc14,$s3,$s3
347 b L\$enc_loop
348 xor $acc15,$s3,$s3
349
350 .ALIGN 16
351L\$enc_last
352 ldo 1024($tbl),$rounds
353 _ror $acc1,8,$acc1
354 xor $acc0,$t0,$t0
355 ldw 0($key),$s0
356 _ror $acc2,16,$acc2
357 xor $acc1,$t0,$t0
358 ldw 4($key),$s1
359 _ror $acc3,24,$acc3
360 xor $acc2,$t0,$t0
361 ldw 8($key),$s2
362 _ror $acc5,8,$acc5
363 xor $acc3,$t0,$t0
364 ldw 12($key),$s3
365 _ror $acc6,16,$acc6
366 xor $acc4,$t1,$t1
367 _ror $acc7,24,$acc7
368 xor $acc5,$t1,$t1
369 _ror $acc9,8,$acc9
370 xor $acc6,$t1,$t1
371 _ror $acc10,16,$acc10
372 xor $acc7,$t1,$t1
373 _ror $acc11,24,$acc11
374 xor $acc8,$t2,$t2
375 _ror $acc13,8,$acc13
376 xor $acc9,$t2,$t2
377 _ror $acc14,16,$acc14
378 xor $acc10,$t2,$t2
379 _ror $acc15,24,$acc15
380 xor $acc11,$t2,$t2
381 xor $acc12,$acc14,$acc14
382 xor $acc13,$t3,$t3
383 _srm $t0,24,$acc0
384 xor $acc14,$t3,$t3
385 _srm $t1,16,$acc1
386 xor $acc15,$t3,$t3
387
388 _srm $t2,8,$acc2
389 ldbx $acc0($rounds),$acc0
390 _srm $t1,24,$acc4
391 ldbx $acc1($rounds),$acc1
392 _srm $t2,16,$acc5
393 _srm $t3,0,$acc3
394 ldbx $acc2($rounds),$acc2
395 ldbx $acc3($rounds),$acc3
396 _srm $t3,8,$acc6
397 ldbx $acc4($rounds),$acc4
398 _srm $t2,24,$acc8
399 ldbx $acc5($rounds),$acc5
400 _srm $t3,16,$acc9
401 _srm $t0,0,$acc7
402 ldbx $acc6($rounds),$acc6
403 ldbx $acc7($rounds),$acc7
404 _srm $t0,8,$acc10
405 ldbx $acc8($rounds),$acc8
406 _srm $t3,24,$acc12
407 ldbx $acc9($rounds),$acc9
408 _srm $t0,16,$acc13
409 _srm $t1,0,$acc11
410 ldbx $acc10($rounds),$acc10
411 _srm $t1,8,$acc14
412 ldbx $acc11($rounds),$acc11
413 ldbx $acc12($rounds),$acc12
414 ldbx $acc13($rounds),$acc13
415 _srm $t2,0,$acc15
416 ldbx $acc14($rounds),$acc14
417
418 dep $acc0,7,8,$acc3
419 ldbx $acc15($rounds),$acc15
420 dep $acc4,7,8,$acc7
421 dep $acc1,15,8,$acc3
422 dep $acc5,15,8,$acc7
423 dep $acc2,23,8,$acc3
424 dep $acc6,23,8,$acc7
425 xor $acc3,$s0,$s0
426 xor $acc7,$s1,$s1
427 dep $acc8,7,8,$acc11
428 dep $acc12,7,8,$acc15
429 dep $acc9,15,8,$acc11
430 dep $acc13,15,8,$acc15
431 dep $acc10,23,8,$acc11
432 dep $acc14,23,8,$acc15
433 xor $acc11,$s2,$s2
434
435 bv (%r31)
436 .EXIT
437 xor $acc15,$s3,$s3
438 .PROCEND
439
440 .section .rodata
441 .ALIGN 64
442L\$AES_Te
443 .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
444 .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
445 .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
446 .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
447 .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
448 .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
449 .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
450 .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
451 .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
452 .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
453 .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
454 .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
455 .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
456 .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
457 .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
458 .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
459 .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
460 .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
461 .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
462 .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
463 .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
464 .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
465 .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
466 .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
467 .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
468 .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
469 .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
470 .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
471 .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
472 .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
473 .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
474 .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
475 .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
476 .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
477 .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
478 .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
479 .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
480 .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
481 .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
482 .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
483 .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
484 .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
485 .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
486 .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
487 .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
488 .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
489 .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
490 .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
491 .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
492 .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
493 .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
494 .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
495 .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
496 .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
497 .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
498 .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
499 .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
500 .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
501 .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
502 .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
503 .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
504 .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
505 .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
506 .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
507 .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
508 .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
509 .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
510 .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
511 .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
512 .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
513 .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
514 .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
515 .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
516 .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
517 .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
518 .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
519 .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
520 .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
521 .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
522 .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
523 .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
524 .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
525 .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
526 .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
527 .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
528 .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
529 .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
530 .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
531 .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
532 .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
533 .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
534 .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
535 .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
536 .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
537 .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
538 .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
539 .previous
540___
541
542$code.=<<___;
543 .EXPORT aes_decrypt_internal,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
544 .ALIGN 16
545aes_decrypt_internal
546 .PROC
547 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
548 .ENTRY
549 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
550 $PUSHMA %r3,$FRAME(%sp)
551 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
552 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
553 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
554 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
555 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
556 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
557 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
558 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
559 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
560 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
561 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
562 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
563 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
564 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
565 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
566
567 ldi 3,$t0
568#ifdef __PIC__
569 addil LT'L\$AES_Td, %r19
570 ldw RT'L\$AES_Td(%r1), $tbl
571#else
572 ldil L'L\$AES_Td, %t1
573 ldo R'L\$AES_Td(%t1), $tbl
574#endif
575
576 and $inp,$t0,$t0
577 sub $inp,$t0,$inp
578 ldw 0($inp),$s0
579 ldw 4($inp),$s1
580 ldw 8($inp),$s2
581 comib,= 0,$t0,L\$dec_inp_aligned
582 ldw 12($inp),$s3
583
584 sh3addl $t0,%r0,$t0
585 subi 32,$t0,$t0
586 mtctl $t0,%cr11
587 ldw 16($inp),$t1
588 vshd $s0,$s1,$s0
589 vshd $s1,$s2,$s1
590 vshd $s2,$s3,$s2
591 vshd $s3,$t1,$s3
592
593L\$dec_inp_aligned
594 bl _parisc_AES_decrypt,%r31
595 nop
596
597 extru,<> $out,31,2,%r0
598 b L\$dec_out_aligned
599 nop
600
601 _srm $s0,24,$acc0
602 _srm $s0,16,$acc1
603 stb $acc0,0($out)
604 _srm $s0,8,$acc2
605 stb $acc1,1($out)
606 _srm $s1,24,$acc4
607 stb $acc2,2($out)
608 _srm $s1,16,$acc5
609 stb $s0,3($out)
610 _srm $s1,8,$acc6
611 stb $acc4,4($out)
612 _srm $s2,24,$acc0
613 stb $acc5,5($out)
614 _srm $s2,16,$acc1
615 stb $acc6,6($out)
616 _srm $s2,8,$acc2
617 stb $s1,7($out)
618 _srm $s3,24,$acc4
619 stb $acc0,8($out)
620 _srm $s3,16,$acc5
621 stb $acc1,9($out)
622 _srm $s3,8,$acc6
623 stb $acc2,10($out)
624 stb $s2,11($out)
625 stb $acc4,12($out)
626 stb $acc5,13($out)
627 stb $acc6,14($out)
628 b L\$dec_done
629 stb $s3,15($out)
630
631L\$dec_out_aligned
632 stw $s0,0($out)
633 stw $s1,4($out)
634 stw $s2,8($out)
635 stw $s3,12($out)
636
637L\$dec_done
638 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
639 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
640 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
641 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
642 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
643 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
644 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
645 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
646 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
647 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
648 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
649 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
650 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
651 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
652 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
653 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
654 bv (%r2)
655 .EXIT
656 $POPMB -$FRAME(%sp),%r3
657 .PROCEND
658
659 .ALIGN 16
660_parisc_AES_decrypt
661 .PROC
662 .CALLINFO MILLICODE
663 .ENTRY
664 ldw 240($key),$rounds
665 ldw 0($key),$t0
666 ldw 4($key),$t1
667 ldw 8($key),$t2
668 ldw 12($key),$t3
669 _srm $rounds,1,$rounds
670 xor $t0,$s0,$s0
671 ldw 16($key),$t0
672 xor $t1,$s1,$s1
673 ldw 20($key),$t1
674 _srm $s0,24,$acc0
675 xor $t2,$s2,$s2
676 ldw 24($key),$t2
677 xor $t3,$s3,$s3
678 ldw 28($key),$t3
679 _srm $s3,16,$acc1
680L\$dec_loop
681 _srm $s2,8,$acc2
682 ldwx,s $acc0($tbl),$acc0
683 _srm $s1,0,$acc3
684 ldwx,s $acc1($tbl),$acc1
685 _srm $s1,24,$acc4
686 ldwx,s $acc2($tbl),$acc2
687 _srm $s0,16,$acc5
688 ldwx,s $acc3($tbl),$acc3
689 _srm $s3,8,$acc6
690 ldwx,s $acc4($tbl),$acc4
691 _srm $s2,0,$acc7
692 ldwx,s $acc5($tbl),$acc5
693 _srm $s2,24,$acc8
694 ldwx,s $acc6($tbl),$acc6
695 _srm $s1,16,$acc9
696 ldwx,s $acc7($tbl),$acc7
697 _srm $s0,8,$acc10
698 ldwx,s $acc8($tbl),$acc8
699 _srm $s3,0,$acc11
700 ldwx,s $acc9($tbl),$acc9
701 _srm $s3,24,$acc12
702 ldwx,s $acc10($tbl),$acc10
703 _srm $s2,16,$acc13
704 ldwx,s $acc11($tbl),$acc11
705 _srm $s1,8,$acc14
706 ldwx,s $acc12($tbl),$acc12
707 _srm $s0,0,$acc15
708 ldwx,s $acc13($tbl),$acc13
709 ldwx,s $acc14($tbl),$acc14
710 ldwx,s $acc15($tbl),$acc15
711 addib,= -1,$rounds,L\$dec_last
712 ldo 32($key),$key
713
714 _ror $acc1,8,$acc1
715 xor $acc0,$t0,$t0
716 ldw 0($key),$s0
717 _ror $acc2,16,$acc2
718 xor $acc1,$t0,$t0
719 ldw 4($key),$s1
720 _ror $acc3,24,$acc3
721 xor $acc2,$t0,$t0
722 ldw 8($key),$s2
723 _ror $acc5,8,$acc5
724 xor $acc3,$t0,$t0
725 ldw 12($key),$s3
726 _ror $acc6,16,$acc6
727 xor $acc4,$t1,$t1
728 _ror $acc7,24,$acc7
729 xor $acc5,$t1,$t1
730 _ror $acc9,8,$acc9
731 xor $acc6,$t1,$t1
732 _ror $acc10,16,$acc10
733 xor $acc7,$t1,$t1
734 _ror $acc11,24,$acc11
735 xor $acc8,$t2,$t2
736 _ror $acc13,8,$acc13
737 xor $acc9,$t2,$t2
738 _ror $acc14,16,$acc14
739 xor $acc10,$t2,$t2
740 _ror $acc15,24,$acc15
741 xor $acc11,$t2,$t2
742 xor $acc12,$acc14,$acc14
743 xor $acc13,$t3,$t3
744 _srm $t0,24,$acc0
745 xor $acc14,$t3,$t3
746 xor $acc15,$t3,$t3
747 _srm $t3,16,$acc1
748
749 _srm $t2,8,$acc2
750 ldwx,s $acc0($tbl),$acc0
751 _srm $t1,0,$acc3
752 ldwx,s $acc1($tbl),$acc1
753 _srm $t1,24,$acc4
754 ldwx,s $acc2($tbl),$acc2
755 _srm $t0,16,$acc5
756 ldwx,s $acc3($tbl),$acc3
757 _srm $t3,8,$acc6
758 ldwx,s $acc4($tbl),$acc4
759 _srm $t2,0,$acc7
760 ldwx,s $acc5($tbl),$acc5
761 _srm $t2,24,$acc8
762 ldwx,s $acc6($tbl),$acc6
763 _srm $t1,16,$acc9
764 ldwx,s $acc7($tbl),$acc7
765 _srm $t0,8,$acc10
766 ldwx,s $acc8($tbl),$acc8
767 _srm $t3,0,$acc11
768 ldwx,s $acc9($tbl),$acc9
769 _srm $t3,24,$acc12
770 ldwx,s $acc10($tbl),$acc10
771 _srm $t2,16,$acc13
772 ldwx,s $acc11($tbl),$acc11
773 _srm $t1,8,$acc14
774 ldwx,s $acc12($tbl),$acc12
775 _srm $t0,0,$acc15
776 ldwx,s $acc13($tbl),$acc13
777 _ror $acc1,8,$acc1
778 ldwx,s $acc14($tbl),$acc14
779
780 _ror $acc2,16,$acc2
781 xor $acc0,$s0,$s0
782 ldwx,s $acc15($tbl),$acc15
783 _ror $acc3,24,$acc3
784 xor $acc1,$s0,$s0
785 ldw 16($key),$t0
786 _ror $acc5,8,$acc5
787 xor $acc2,$s0,$s0
788 ldw 20($key),$t1
789 _ror $acc6,16,$acc6
790 xor $acc3,$s0,$s0
791 ldw 24($key),$t2
792 _ror $acc7,24,$acc7
793 xor $acc4,$s1,$s1
794 ldw 28($key),$t3
795 _ror $acc9,8,$acc9
796 xor $acc5,$s1,$s1
797 ldw 1024+0($tbl),%r0 ; prefetch td4
798 _ror $acc10,16,$acc10
799 xor $acc6,$s1,$s1
800 ldw 1024+32($tbl),%r0 ; prefetch td4
801 _ror $acc11,24,$acc11
802 xor $acc7,$s1,$s1
803 ldw 1024+64($tbl),%r0 ; prefetch td4
804 _ror $acc13,8,$acc13
805 xor $acc8,$s2,$s2
806 ldw 1024+96($tbl),%r0 ; prefetch td4
807 _ror $acc14,16,$acc14
808 xor $acc9,$s2,$s2
809 ldw 1024+128($tbl),%r0 ; prefetch td4
810 _ror $acc15,24,$acc15
811 xor $acc10,$s2,$s2
812 ldw 1024+160($tbl),%r0 ; prefetch td4
813 _srm $s0,24,$acc0
814 xor $acc11,$s2,$s2
815 ldw 1024+192($tbl),%r0 ; prefetch td4
816 xor $acc12,$acc14,$acc14
817 xor $acc13,$s3,$s3
818 ldw 1024+224($tbl),%r0 ; prefetch td4
819 xor $acc14,$s3,$s3
820 xor $acc15,$s3,$s3
821 b L\$dec_loop
822 _srm $s3,16,$acc1
823
824 .ALIGN 16
825L\$dec_last
826 ldo 1024($tbl),$rounds
827 _ror $acc1,8,$acc1
828 xor $acc0,$t0,$t0
829 ldw 0($key),$s0
830 _ror $acc2,16,$acc2
831 xor $acc1,$t0,$t0
832 ldw 4($key),$s1
833 _ror $acc3,24,$acc3
834 xor $acc2,$t0,$t0
835 ldw 8($key),$s2
836 _ror $acc5,8,$acc5
837 xor $acc3,$t0,$t0
838 ldw 12($key),$s3
839 _ror $acc6,16,$acc6
840 xor $acc4,$t1,$t1
841 _ror $acc7,24,$acc7
842 xor $acc5,$t1,$t1
843 _ror $acc9,8,$acc9
844 xor $acc6,$t1,$t1
845 _ror $acc10,16,$acc10
846 xor $acc7,$t1,$t1
847 _ror $acc11,24,$acc11
848 xor $acc8,$t2,$t2
849 _ror $acc13,8,$acc13
850 xor $acc9,$t2,$t2
851 _ror $acc14,16,$acc14
852 xor $acc10,$t2,$t2
853 _ror $acc15,24,$acc15
854 xor $acc11,$t2,$t2
855 xor $acc12,$acc14,$acc14
856 xor $acc13,$t3,$t3
857 _srm $t0,24,$acc0
858 xor $acc14,$t3,$t3
859 xor $acc15,$t3,$t3
860 _srm $t3,16,$acc1
861
862 _srm $t2,8,$acc2
863 ldbx $acc0($rounds),$acc0
864 _srm $t1,24,$acc4
865 ldbx $acc1($rounds),$acc1
866 _srm $t0,16,$acc5
867 _srm $t1,0,$acc3
868 ldbx $acc2($rounds),$acc2
869 ldbx $acc3($rounds),$acc3
870 _srm $t3,8,$acc6
871 ldbx $acc4($rounds),$acc4
872 _srm $t2,24,$acc8
873 ldbx $acc5($rounds),$acc5
874 _srm $t1,16,$acc9
875 _srm $t2,0,$acc7
876 ldbx $acc6($rounds),$acc6
877 ldbx $acc7($rounds),$acc7
878 _srm $t0,8,$acc10
879 ldbx $acc8($rounds),$acc8
880 _srm $t3,24,$acc12
881 ldbx $acc9($rounds),$acc9
882 _srm $t2,16,$acc13
883 _srm $t3,0,$acc11
884 ldbx $acc10($rounds),$acc10
885 _srm $t1,8,$acc14
886 ldbx $acc11($rounds),$acc11
887 ldbx $acc12($rounds),$acc12
888 ldbx $acc13($rounds),$acc13
889 _srm $t0,0,$acc15
890 ldbx $acc14($rounds),$acc14
891
892 dep $acc0,7,8,$acc3
893 ldbx $acc15($rounds),$acc15
894 dep $acc4,7,8,$acc7
895 dep $acc1,15,8,$acc3
896 dep $acc5,15,8,$acc7
897 dep $acc2,23,8,$acc3
898 dep $acc6,23,8,$acc7
899 xor $acc3,$s0,$s0
900 xor $acc7,$s1,$s1
901 dep $acc8,7,8,$acc11
902 dep $acc12,7,8,$acc15
903 dep $acc9,15,8,$acc11
904 dep $acc13,15,8,$acc15
905 dep $acc10,23,8,$acc11
906 dep $acc14,23,8,$acc15
907 xor $acc11,$s2,$s2
908
909 bv (%r31)
910 .EXIT
911 xor $acc15,$s3,$s3
912 .PROCEND
913
914 .section .rodata
915 .ALIGN 64
916L\$AES_Td
917 .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
918 .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
919 .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
920 .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
921 .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
922 .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
923 .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
924 .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
925 .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
926 .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
927 .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
928 .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
929 .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
930 .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
931 .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
932 .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
933 .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
934 .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
935 .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
936 .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
937 .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
938 .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
939 .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
940 .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
941 .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
942 .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
943 .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
944 .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
945 .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
946 .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
947 .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
948 .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
949 .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
950 .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
951 .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
952 .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
953 .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
954 .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
955 .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
956 .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
957 .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
958 .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
959 .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
960 .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
961 .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
962 .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
963 .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
964 .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
965 .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
966 .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
967 .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
968 .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
969 .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
970 .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
971 .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
972 .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
973 .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
974 .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
975 .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
976 .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
977 .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
978 .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
979 .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
980 .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
981 .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
982 .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
983 .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
984 .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
985 .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
986 .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
987 .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
988 .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
989 .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
990 .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
991 .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
992 .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
993 .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
994 .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
995 .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
996 .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
997 .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
998 .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
999 .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
1000 .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
1001 .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
1002 .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
1003 .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
1004 .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
1005 .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
1006 .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
1007 .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1008 .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1009 .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1010 .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1011 .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1012 .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1013 .previous
1014___
1015
1016foreach (split("\n",$code)) {
1017 s/\`([^\`]*)\`/eval $1/ge;
1018
1019 # translate made-up instructions: _ror, _srm
1020 s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or
1021
1022 s/_srm(\s+%r[0-9]+),([0-9]+),/
1023 $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
1024 : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
1025
1026 s/,\*/,/ if ($SIZE_T==4);
1027 s/\bbv\b(.*\(%r2\))/bve$1/ if ($SIZE_T==8);
1028 print $_,"\n";
1029}
1030close STDOUT;
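
The post-processing loop above rewrites the made-up _ror and _srm mnemonics
into real PA-RISC instructions: _ror becomes a shd double-register shift of a
register with itself, and _srm becomes an extru (extrd,u on 64-bit) bit-field
extract of a single byte, where 31-$2 / 63-$2 converts the shift amount into
the big-endian bit position those instructions expect. A minimal standalone
Perl sketch of the same rewriting, run on two hypothetical input lines (not
part of the generator):

    #!/usr/bin/env perl
    # Demo of the _ror/_srm rewriting pass above, on hypothetical input.
    my $SIZE_T = 4;                            # assume a 32-bit build
    my @lines = ("\t_ror\t%r5,8,%r6", "\t_srm\t%r7,24,%r8");
    foreach (@lines) {
        s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or
        s/_srm(\s+%r[0-9]+),([0-9]+),/
            $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
            :            sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
        print $_,"\n";   # "shd %r5,%r5,8,%r6" then "extru %r7,7,8,%r8"
    }
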
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl
deleted file mode 100644
index 178ba56b3e..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-ppc.pl
+++ /dev/null
@@ -1,1344 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Needs more work: key setup, CBC routine...
11#
12# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with a
13# 128-bit key, which is ~40% better than 64-bit code generated by gcc
14# 4.0. But these are not the ones currently used! Their "compact"
15# counterparts are, for security reasons. ppc_AES_encrypt_compact runs
16# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact runs
17# at 1/3 of ppc_AES_decrypt speed.
18
19# February 2010
20#
21# Rescheduling instructions to favour the Power6 pipeline gave a 10%
22# performance improvement on the platform in question (and a marginal
23# improvement even on others). It should be noted that Power6 cannot
24# process a byte in 18 cycles, only in 23, because it cannot issue
25# 4 load instructions in two cycles, only in 3. As a result the
26# non-compact block subroutines are 25% slower than one would expect.
27# Compact functions scale better, because their pure computational
28# part scales perfectly with clock frequency. To be specific,
29# ppc_AES_encrypt_compact operates at 42 cycles per byte, while
30# ppc_AES_decrypt_compact operates at 55 (in a 64-bit build).
31
32$flavour = shift;
33
34if ($flavour =~ /64/) {
35 $SIZE_T =8;
36 $LRSAVE =2*$SIZE_T;
37 $STU ="stdu";
38 $POP ="ld";
39 $PUSH ="std";
40} elsif ($flavour =~ /32/) {
41 $SIZE_T =4;
42 $LRSAVE =$SIZE_T;
43 $STU ="stwu";
44 $POP ="lwz";
45 $PUSH ="stw";
46} else { die "nonsense $flavour"; }
47
48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
50( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
51die "can't locate ppc-xlate.pl";
52
53open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
54
55$FRAME=32*$SIZE_T;
56
57sub _data_word()
58{ my $i;
59 while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
60}
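
Each table word is deliberately emitted twice: a 4-byte big-endian load taken
at byte offset k inside the doubled 8-byte entry returns the word rotated by
k bytes, which is why Lppc_AES_encrypt below can point Tbl1..Tbl3 at byte
offsets 3, 2 and 1 from the same base and treat them as pre-rotated copies of
one table. A short standalone Perl sketch of that property:

    # A doubled entry read at byte offsets 0..3 yields four rotations.
    my $w    = 0xc66363a5;                 # first Te entry, as an example
    my $pair = pack("NN", $w, $w);         # the 8-byte doubled entry
    printf "offset %d: %08x\n", $_, unpack("N", substr($pair, $_, 4))
        for 0..3;
    # offset 0: c66363a5  1: 6363a5c6  2: 63a5c663  3: a5c66363
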
61
62$sp="r1";
63$toc="r2";
64$inp="r3";
65$out="r4";
66$key="r5";
67
68$Tbl0="r3";
69$Tbl1="r6";
70$Tbl2="r7";
71$Tbl3="r2";
72
73$s0="r8";
74$s1="r9";
75$s2="r10";
76$s3="r11";
77
78$t0="r12";
79$t1="r13";
80$t2="r14";
81$t3="r15";
82
83$acc00="r16";
84$acc01="r17";
85$acc02="r18";
86$acc03="r19";
87
88$acc04="r20";
89$acc05="r21";
90$acc06="r22";
91$acc07="r23";
92
93$acc08="r24";
94$acc09="r25";
95$acc10="r26";
96$acc11="r27";
97
98$acc12="r28";
99$acc13="r29";
100$acc14="r30";
101$acc15="r31";
102
103# stay away from TLS pointer
104if ($SIZE_T==8) { die if ($t1 ne "r13"); $t1="r0"; }
105else { die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0"; }
106$mask80=$Tbl2;
107$mask1b=$Tbl3;
108
109$code.=<<___;
110.machine "any"
111.text
112
113.align 7
114LAES_Te:
115 mflr r0
116 bcl 20,31,\$+4
117 mflr $Tbl0 ; vvvvv "distance" between . and 1st data entry
118 addi $Tbl0,$Tbl0,`128-8`
119 mtlr r0
120 blr
121 .space `64-12*4`
122LAES_Td:
123 mflr r0
124 bcl 20,31,\$+4
125 mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
126 addi $Tbl0,$Tbl0,`128-64-8+2048+256`
127 mtlr r0
128 blr
129 .space `128-64-12*4`
130___
131&_data_word(
132 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
133 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
134 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
135 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
136 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
137 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
138 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
139 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
140 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
141 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
142 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
143 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
144 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
145 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
146 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
147 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
148 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
149 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
150 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
151 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
152 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
153 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
154 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
155 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
156 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
157 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
158 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
159 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
160 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
161 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
162 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
163 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
164 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
165 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
166 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
167 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
168 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
169 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
170 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
171 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
172 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
173 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
174 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
175 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
176 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
177 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
178 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
179 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
180 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
181 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
182 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
183 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
184 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
185 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
186 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
187 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
188 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
189 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
190 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
191 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
192 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
193 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
194 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
195 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
196$code.=<<___;
197.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
198.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
199.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
200.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
201.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
202.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
203.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
204.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
205.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
206.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
207.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
208.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
209.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
210.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
211.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
212.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
213.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
214.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
215.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
216.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
217.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
218.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
219.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
220.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
221.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
222.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
223.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
224.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
225.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
226.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
227.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
228.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
229___
230&_data_word(
231 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
232 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
233 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
234 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
235 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
236 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
237 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
238 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
239 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
240 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
241 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
242 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
243 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
244 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
245 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
246 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
247 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
248 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
249 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
250 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
251 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
252 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
253 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
254 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
255 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
256 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
257 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
258 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
259 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
260 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
261 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
262 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
263 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
264 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
265 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
266 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
267 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
268 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
269 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
270 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
271 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
272 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
273 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
274 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
275 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
276 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
277 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
278 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
279 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
280 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
281 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
282 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
283 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
284 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
285 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
286 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
287 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
288 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
289 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
290 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
291 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
292 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
293 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
294 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
295$code.=<<___;
296.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
297.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
298.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
299.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
300.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
301.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
302.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
303.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
304.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
305.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
306.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
307.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
308.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
309.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
310.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
311.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
312.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
313.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
314.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
315.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
316.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
317.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
318.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
319.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
320.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
321.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
322.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
323.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
324.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
325.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
326.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
327.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
328
329
330.globl .aes_encrypt_internal
331.align 7
332.aes_encrypt_internal:
333 $STU $sp,-$FRAME($sp)
334 mflr r0
335
336 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
337 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
338 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
339 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
340 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
341 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
342 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
343 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
344 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
345 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
346 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
347 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
348 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
349 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
350 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
351 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
352 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
353 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
354 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
355 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
356 $PUSH r0,`$FRAME+$LRSAVE`($sp)
357
358 andi. $t0,$inp,3
359 andi. $t1,$out,3
360 or. $t0,$t0,$t1
361 bne Lenc_unaligned
362
363Lenc_unaligned_ok:
364 lwz $s0,0($inp)
365 lwz $s1,4($inp)
366 lwz $s2,8($inp)
367 lwz $s3,12($inp)
368 bl LAES_Te
369 bl Lppc_AES_encrypt_compact
370 stw $s0,0($out)
371 stw $s1,4($out)
372 stw $s2,8($out)
373 stw $s3,12($out)
374 b Lenc_done
375
376Lenc_unaligned:
377 subfic $t0,$inp,4096
378 subfic $t1,$out,4096
379 andi. $t0,$t0,4096-16
380 beq Lenc_xpage
381 andi. $t1,$t1,4096-16
382 bne Lenc_unaligned_ok
383
384Lenc_xpage:
385 lbz $acc00,0($inp)
386 lbz $acc01,1($inp)
387 lbz $acc02,2($inp)
388 lbz $s0,3($inp)
389 lbz $acc04,4($inp)
390 lbz $acc05,5($inp)
391 lbz $acc06,6($inp)
392 lbz $s1,7($inp)
393 lbz $acc08,8($inp)
394 lbz $acc09,9($inp)
395 lbz $acc10,10($inp)
396 insrwi $s0,$acc00,8,0
397 lbz $s2,11($inp)
398 insrwi $s1,$acc04,8,0
399 lbz $acc12,12($inp)
400 insrwi $s0,$acc01,8,8
401 lbz $acc13,13($inp)
402 insrwi $s1,$acc05,8,8
403 lbz $acc14,14($inp)
404 insrwi $s0,$acc02,8,16
405 lbz $s3,15($inp)
406 insrwi $s1,$acc06,8,16
407 insrwi $s2,$acc08,8,0
408 insrwi $s3,$acc12,8,0
409 insrwi $s2,$acc09,8,8
410 insrwi $s3,$acc13,8,8
411 insrwi $s2,$acc10,8,16
412 insrwi $s3,$acc14,8,16
413
414 bl LAES_Te
415 bl Lppc_AES_encrypt_compact
416
417 extrwi $acc00,$s0,8,0
418 extrwi $acc01,$s0,8,8
419 stb $acc00,0($out)
420 extrwi $acc02,$s0,8,16
421 stb $acc01,1($out)
422 stb $acc02,2($out)
423 extrwi $acc04,$s1,8,0
424 stb $s0,3($out)
425 extrwi $acc05,$s1,8,8
426 stb $acc04,4($out)
427 extrwi $acc06,$s1,8,16
428 stb $acc05,5($out)
429 stb $acc06,6($out)
430 extrwi $acc08,$s2,8,0
431 stb $s1,7($out)
432 extrwi $acc09,$s2,8,8
433 stb $acc08,8($out)
434 extrwi $acc10,$s2,8,16
435 stb $acc09,9($out)
436 stb $acc10,10($out)
437 extrwi $acc12,$s3,8,0
438 stb $s2,11($out)
439 extrwi $acc13,$s3,8,8
440 stb $acc12,12($out)
441 extrwi $acc14,$s3,8,16
442 stb $acc13,13($out)
443 stb $acc14,14($out)
444 stb $s3,15($out)
445
446Lenc_done:
447 $POP r0,`$FRAME+$LRSAVE`($sp)
448 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
449 $POP r13,`$FRAME-$SIZE_T*19`($sp)
450 $POP r14,`$FRAME-$SIZE_T*18`($sp)
451 $POP r15,`$FRAME-$SIZE_T*17`($sp)
452 $POP r16,`$FRAME-$SIZE_T*16`($sp)
453 $POP r17,`$FRAME-$SIZE_T*15`($sp)
454 $POP r18,`$FRAME-$SIZE_T*14`($sp)
455 $POP r19,`$FRAME-$SIZE_T*13`($sp)
456 $POP r20,`$FRAME-$SIZE_T*12`($sp)
457 $POP r21,`$FRAME-$SIZE_T*11`($sp)
458 $POP r22,`$FRAME-$SIZE_T*10`($sp)
459 $POP r23,`$FRAME-$SIZE_T*9`($sp)
460 $POP r24,`$FRAME-$SIZE_T*8`($sp)
461 $POP r25,`$FRAME-$SIZE_T*7`($sp)
462 $POP r26,`$FRAME-$SIZE_T*6`($sp)
463 $POP r27,`$FRAME-$SIZE_T*5`($sp)
464 $POP r28,`$FRAME-$SIZE_T*4`($sp)
465 $POP r29,`$FRAME-$SIZE_T*3`($sp)
466 $POP r30,`$FRAME-$SIZE_T*2`($sp)
467 $POP r31,`$FRAME-$SIZE_T*1`($sp)
468 mtlr r0
469 addi $sp,$sp,$FRAME
470 blr
471
472.align 5
473Lppc_AES_encrypt:
474 lwz $acc00,240($key)
475 addi $Tbl1,$Tbl0,3
476 lwz $t0,0($key)
477 addi $Tbl2,$Tbl0,2
478 lwz $t1,4($key)
479 addi $Tbl3,$Tbl0,1
480 lwz $t2,8($key)
481 addi $acc00,$acc00,-1
482 lwz $t3,12($key)
483 addi $key,$key,16
484 xor $s0,$s0,$t0
485 xor $s1,$s1,$t1
486 xor $s2,$s2,$t2
487 xor $s3,$s3,$t3
488 mtctr $acc00
489.align 4
490Lenc_loop:
491 rlwinm $acc00,$s0,`32-24+3`,21,28
492 rlwinm $acc01,$s1,`32-24+3`,21,28
493 rlwinm $acc02,$s2,`32-24+3`,21,28
494 rlwinm $acc03,$s3,`32-24+3`,21,28
495 lwz $t0,0($key)
496 rlwinm $acc04,$s1,`32-16+3`,21,28
497 lwz $t1,4($key)
498 rlwinm $acc05,$s2,`32-16+3`,21,28
499 lwz $t2,8($key)
500 rlwinm $acc06,$s3,`32-16+3`,21,28
501 lwz $t3,12($key)
502 rlwinm $acc07,$s0,`32-16+3`,21,28
503 lwzx $acc00,$Tbl0,$acc00
504 rlwinm $acc08,$s2,`32-8+3`,21,28
505 lwzx $acc01,$Tbl0,$acc01
506 rlwinm $acc09,$s3,`32-8+3`,21,28
507 lwzx $acc02,$Tbl0,$acc02
508 rlwinm $acc10,$s0,`32-8+3`,21,28
509 lwzx $acc03,$Tbl0,$acc03
510 rlwinm $acc11,$s1,`32-8+3`,21,28
511 lwzx $acc04,$Tbl1,$acc04
512 rlwinm $acc12,$s3,`0+3`,21,28
513 lwzx $acc05,$Tbl1,$acc05
514 rlwinm $acc13,$s0,`0+3`,21,28
515 lwzx $acc06,$Tbl1,$acc06
516 rlwinm $acc14,$s1,`0+3`,21,28
517 lwzx $acc07,$Tbl1,$acc07
518 rlwinm $acc15,$s2,`0+3`,21,28
519 lwzx $acc08,$Tbl2,$acc08
520 xor $t0,$t0,$acc00
521 lwzx $acc09,$Tbl2,$acc09
522 xor $t1,$t1,$acc01
523 lwzx $acc10,$Tbl2,$acc10
524 xor $t2,$t2,$acc02
525 lwzx $acc11,$Tbl2,$acc11
526 xor $t3,$t3,$acc03
527 lwzx $acc12,$Tbl3,$acc12
528 xor $t0,$t0,$acc04
529 lwzx $acc13,$Tbl3,$acc13
530 xor $t1,$t1,$acc05
531 lwzx $acc14,$Tbl3,$acc14
532 xor $t2,$t2,$acc06
533 lwzx $acc15,$Tbl3,$acc15
534 xor $t3,$t3,$acc07
535 xor $t0,$t0,$acc08
536 xor $t1,$t1,$acc09
537 xor $t2,$t2,$acc10
538 xor $t3,$t3,$acc11
539 xor $s0,$t0,$acc12
540 xor $s1,$t1,$acc13
541 xor $s2,$t2,$acc14
542 xor $s3,$t3,$acc15
543 addi $key,$key,16
544 bdnz- Lenc_loop
545
546 addi $Tbl2,$Tbl0,2048
547 nop
548 lwz $t0,0($key)
549 rlwinm $acc00,$s0,`32-24`,24,31
550 lwz $t1,4($key)
551 rlwinm $acc01,$s1,`32-24`,24,31
552 lwz $t2,8($key)
553 rlwinm $acc02,$s2,`32-24`,24,31
554 lwz $t3,12($key)
555 rlwinm $acc03,$s3,`32-24`,24,31
556 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
557 rlwinm $acc04,$s1,`32-16`,24,31
558 lwz $acc09,`2048+32`($Tbl0)
559 rlwinm $acc05,$s2,`32-16`,24,31
560 lwz $acc10,`2048+64`($Tbl0)
561 rlwinm $acc06,$s3,`32-16`,24,31
562 lwz $acc11,`2048+96`($Tbl0)
563 rlwinm $acc07,$s0,`32-16`,24,31
564 lwz $acc12,`2048+128`($Tbl0)
565 rlwinm $acc08,$s2,`32-8`,24,31
566 lwz $acc13,`2048+160`($Tbl0)
567 rlwinm $acc09,$s3,`32-8`,24,31
568 lwz $acc14,`2048+192`($Tbl0)
569 rlwinm $acc10,$s0,`32-8`,24,31
570 lwz $acc15,`2048+224`($Tbl0)
571 rlwinm $acc11,$s1,`32-8`,24,31
572 lbzx $acc00,$Tbl2,$acc00
573 rlwinm $acc12,$s3,`0`,24,31
574 lbzx $acc01,$Tbl2,$acc01
575 rlwinm $acc13,$s0,`0`,24,31
576 lbzx $acc02,$Tbl2,$acc02
577 rlwinm $acc14,$s1,`0`,24,31
578 lbzx $acc03,$Tbl2,$acc03
579 rlwinm $acc15,$s2,`0`,24,31
580 lbzx $acc04,$Tbl2,$acc04
581 rlwinm $s0,$acc00,24,0,7
582 lbzx $acc05,$Tbl2,$acc05
583 rlwinm $s1,$acc01,24,0,7
584 lbzx $acc06,$Tbl2,$acc06
585 rlwinm $s2,$acc02,24,0,7
586 lbzx $acc07,$Tbl2,$acc07
587 rlwinm $s3,$acc03,24,0,7
588 lbzx $acc08,$Tbl2,$acc08
589 rlwimi $s0,$acc04,16,8,15
590 lbzx $acc09,$Tbl2,$acc09
591 rlwimi $s1,$acc05,16,8,15
592 lbzx $acc10,$Tbl2,$acc10
593 rlwimi $s2,$acc06,16,8,15
594 lbzx $acc11,$Tbl2,$acc11
595 rlwimi $s3,$acc07,16,8,15
596 lbzx $acc12,$Tbl2,$acc12
597 rlwimi $s0,$acc08,8,16,23
598 lbzx $acc13,$Tbl2,$acc13
599 rlwimi $s1,$acc09,8,16,23
600 lbzx $acc14,$Tbl2,$acc14
601 rlwimi $s2,$acc10,8,16,23
602 lbzx $acc15,$Tbl2,$acc15
603 rlwimi $s3,$acc11,8,16,23
604 or $s0,$s0,$acc12
605 or $s1,$s1,$acc13
606 or $s2,$s2,$acc14
607 or $s3,$s3,$acc15
608 xor $s0,$s0,$t0
609 xor $s1,$s1,$t1
610 xor $s2,$s2,$t2
611 xor $s3,$s3,$t3
612 blr
613
614.align 4
615Lppc_AES_encrypt_compact:
616 lwz $acc00,240($key)
617 addi $Tbl1,$Tbl0,2048
618 lwz $t0,0($key)
619 lis $mask80,0x8080
620 lwz $t1,4($key)
621 lis $mask1b,0x1b1b
622 lwz $t2,8($key)
623 ori $mask80,$mask80,0x8080
624 lwz $t3,12($key)
625 ori $mask1b,$mask1b,0x1b1b
626 addi $key,$key,16
627 mtctr $acc00
628.align 4
629Lenc_compact_loop:
630 xor $s0,$s0,$t0
631 xor $s1,$s1,$t1
632 rlwinm $acc00,$s0,`32-24`,24,31
633 xor $s2,$s2,$t2
634 rlwinm $acc01,$s1,`32-24`,24,31
635 xor $s3,$s3,$t3
636 rlwinm $acc02,$s2,`32-24`,24,31
637 rlwinm $acc03,$s3,`32-24`,24,31
638 rlwinm $acc04,$s1,`32-16`,24,31
639 rlwinm $acc05,$s2,`32-16`,24,31
640 rlwinm $acc06,$s3,`32-16`,24,31
641 rlwinm $acc07,$s0,`32-16`,24,31
642 lbzx $acc00,$Tbl1,$acc00
643 rlwinm $acc08,$s2,`32-8`,24,31
644 lbzx $acc01,$Tbl1,$acc01
645 rlwinm $acc09,$s3,`32-8`,24,31
646 lbzx $acc02,$Tbl1,$acc02
647 rlwinm $acc10,$s0,`32-8`,24,31
648 lbzx $acc03,$Tbl1,$acc03
649 rlwinm $acc11,$s1,`32-8`,24,31
650 lbzx $acc04,$Tbl1,$acc04
651 rlwinm $acc12,$s3,`0`,24,31
652 lbzx $acc05,$Tbl1,$acc05
653 rlwinm $acc13,$s0,`0`,24,31
654 lbzx $acc06,$Tbl1,$acc06
655 rlwinm $acc14,$s1,`0`,24,31
656 lbzx $acc07,$Tbl1,$acc07
657 rlwinm $acc15,$s2,`0`,24,31
658 lbzx $acc08,$Tbl1,$acc08
659 rlwinm $s0,$acc00,24,0,7
660 lbzx $acc09,$Tbl1,$acc09
661 rlwinm $s1,$acc01,24,0,7
662 lbzx $acc10,$Tbl1,$acc10
663 rlwinm $s2,$acc02,24,0,7
664 lbzx $acc11,$Tbl1,$acc11
665 rlwinm $s3,$acc03,24,0,7
666 lbzx $acc12,$Tbl1,$acc12
667 rlwimi $s0,$acc04,16,8,15
668 lbzx $acc13,$Tbl1,$acc13
669 rlwimi $s1,$acc05,16,8,15
670 lbzx $acc14,$Tbl1,$acc14
671 rlwimi $s2,$acc06,16,8,15
672 lbzx $acc15,$Tbl1,$acc15
673 rlwimi $s3,$acc07,16,8,15
674 rlwimi $s0,$acc08,8,16,23
675 rlwimi $s1,$acc09,8,16,23
676 rlwimi $s2,$acc10,8,16,23
677 rlwimi $s3,$acc11,8,16,23
678 lwz $t0,0($key)
679 or $s0,$s0,$acc12
680 lwz $t1,4($key)
681 or $s1,$s1,$acc13
682 lwz $t2,8($key)
683 or $s2,$s2,$acc14
684 lwz $t3,12($key)
685 or $s3,$s3,$acc15
686
687 addi $key,$key,16
688 bdz Lenc_compact_done
689
690 and $acc00,$s0,$mask80 # r1=r0&0x80808080
691 and $acc01,$s1,$mask80
692 and $acc02,$s2,$mask80
693 and $acc03,$s3,$mask80
694 srwi $acc04,$acc00,7 # r1>>7
695 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
696 srwi $acc05,$acc01,7
697 andc $acc09,$s1,$mask80
698 srwi $acc06,$acc02,7
699 andc $acc10,$s2,$mask80
700 srwi $acc07,$acc03,7
701 andc $acc11,$s3,$mask80
702 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
703 sub $acc01,$acc01,$acc05
704 sub $acc02,$acc02,$acc06
705 sub $acc03,$acc03,$acc07
706 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
707 add $acc09,$acc09,$acc09
708 add $acc10,$acc10,$acc10
709 add $acc11,$acc11,$acc11
710 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
711 and $acc01,$acc01,$mask1b
712 and $acc02,$acc02,$mask1b
713 and $acc03,$acc03,$mask1b
714 xor $acc00,$acc00,$acc08 # r2
715 xor $acc01,$acc01,$acc09
716 rotlwi $acc12,$s0,16 # ROTATE(r0,16)
717 xor $acc02,$acc02,$acc10
718 rotlwi $acc13,$s1,16
719 xor $acc03,$acc03,$acc11
720 rotlwi $acc14,$s2,16
721
722 xor $s0,$s0,$acc00 # r0^r2
723 rotlwi $acc15,$s3,16
724 xor $s1,$s1,$acc01
725 rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
726 xor $s2,$s2,$acc02
727 rotrwi $s1,$s1,24
728 xor $s3,$s3,$acc03
729 rotrwi $s2,$s2,24
730 xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
731 rotrwi $s3,$s3,24
732 xor $s1,$s1,$acc01
733 xor $s2,$s2,$acc02
734 xor $s3,$s3,$acc03
735 rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
736 xor $s0,$s0,$acc12 #
737 rotlwi $acc09,$acc13,8
738 xor $s1,$s1,$acc13
739 rotlwi $acc10,$acc14,8
740 xor $s2,$s2,$acc14
741 rotlwi $acc11,$acc15,8
742 xor $s3,$s3,$acc15
743 xor $s0,$s0,$acc08 #
744 xor $s1,$s1,$acc09
745 xor $s2,$s2,$acc10
746 xor $s3,$s3,$acc11
747
748 b Lenc_compact_loop
749.align 4
750Lenc_compact_done:
751 xor $s0,$s0,$t0
752 xor $s1,$s1,$t1
753 xor $s2,$s2,$t2
754 xor $s3,$s3,$t3
755 blr
756
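
The mask80/mask1b arithmetic in Lenc_compact_loop above is a branch-free
GF(2^8) doubling (the AES xtime operation) applied to all four bytes of a
word at once: r1-(r1>>7) turns each set high bit into 0x7f, the mask1b AND
reduces that to the 0x1b polynomial, and the result is xored with the
left-shifted low seven bits of each byte. A Perl sketch of the same identity
(xtime4 is a hypothetical name, not a routine in this file):

    # xtime on four packed bytes, mirroring the mask80/mask1b arithmetic.
    sub xtime4 {
        my $r0  = shift;
        my $r1  = $r0 & 0x80808080;                  # high bit of each byte
        my $red = ($r1 - ($r1 >> 7)) & 0x1b1b1b1b;   # 0x1b where it was set
        return (($r0 & 0x7f7f7f7f) << 1) ^ $red;     # per-byte 2*x in GF(2^8)
    }
    printf "%08x\n", xtime4(0x01020380);             # prints 0204061b

The rest of the loop then combines this r2 with byte rotations of r0 and
r0^r2, exactly as the ROTATE comments indicate, to finish MixColumns.
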
757.globl .aes_decrypt_internal
758.align 7
759.aes_decrypt_internal:
760 $STU $sp,-$FRAME($sp)
761 mflr r0
762
763 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
764 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
765 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
766 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
767 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
768 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
769 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
770 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
771 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
772 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
773 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
774 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
775 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
776 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
777 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
778 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
779 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
780 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
781 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
782 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
783 $PUSH r0,`$FRAME+$LRSAVE`($sp)
784
785 andi. $t0,$inp,3
786 andi. $t1,$out,3
787 or. $t0,$t0,$t1
788 bne Ldec_unaligned
789
790Ldec_unaligned_ok:
791 lwz $s0,0($inp)
792 lwz $s1,4($inp)
793 lwz $s2,8($inp)
794 lwz $s3,12($inp)
795 bl LAES_Td
796 bl Lppc_AES_decrypt_compact
797 stw $s0,0($out)
798 stw $s1,4($out)
799 stw $s2,8($out)
800 stw $s3,12($out)
801 b Ldec_done
802
803Ldec_unaligned:
804 subfic $t0,$inp,4096
805 subfic $t1,$out,4096
806 andi. $t0,$t0,4096-16
807 beq Ldec_xpage
808 andi. $t1,$t1,4096-16
809 bne Ldec_unaligned_ok
810
811Ldec_xpage:
812 lbz $acc00,0($inp)
813 lbz $acc01,1($inp)
814 lbz $acc02,2($inp)
815 lbz $s0,3($inp)
816 lbz $acc04,4($inp)
817 lbz $acc05,5($inp)
818 lbz $acc06,6($inp)
819 lbz $s1,7($inp)
820 lbz $acc08,8($inp)
821 lbz $acc09,9($inp)
822 lbz $acc10,10($inp)
823 insrwi $s0,$acc00,8,0
824 lbz $s2,11($inp)
825 insrwi $s1,$acc04,8,0
826 lbz $acc12,12($inp)
827 insrwi $s0,$acc01,8,8
828 lbz $acc13,13($inp)
829 insrwi $s1,$acc05,8,8
830 lbz $acc14,14($inp)
831 insrwi $s0,$acc02,8,16
832 lbz $s3,15($inp)
833 insrwi $s1,$acc06,8,16
834 insrwi $s2,$acc08,8,0
835 insrwi $s3,$acc12,8,0
836 insrwi $s2,$acc09,8,8
837 insrwi $s3,$acc13,8,8
838 insrwi $s2,$acc10,8,16
839 insrwi $s3,$acc14,8,16
840
841 bl LAES_Td
842 bl Lppc_AES_decrypt_compact
843
844 extrwi $acc00,$s0,8,0
845 extrwi $acc01,$s0,8,8
846 stb $acc00,0($out)
847 extrwi $acc02,$s0,8,16
848 stb $acc01,1($out)
849 stb $acc02,2($out)
850 extrwi $acc04,$s1,8,0
851 stb $s0,3($out)
852 extrwi $acc05,$s1,8,8
853 stb $acc04,4($out)
854 extrwi $acc06,$s1,8,16
855 stb $acc05,5($out)
856 stb $acc06,6($out)
857 extrwi $acc08,$s2,8,0
858 stb $s1,7($out)
859 extrwi $acc09,$s2,8,8
860 stb $acc08,8($out)
861 extrwi $acc10,$s2,8,16
862 stb $acc09,9($out)
863 stb $acc10,10($out)
864 extrwi $acc12,$s3,8,0
865 stb $s2,11($out)
866 extrwi $acc13,$s3,8,8
867 stb $acc12,12($out)
868 extrwi $acc14,$s3,8,16
869 stb $acc13,13($out)
870 stb $acc14,14($out)
871 stb $s3,15($out)
872
873Ldec_done:
874 $POP r0,`$FRAME+$LRSAVE`($sp)
875 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
876 $POP r13,`$FRAME-$SIZE_T*19`($sp)
877 $POP r14,`$FRAME-$SIZE_T*18`($sp)
878 $POP r15,`$FRAME-$SIZE_T*17`($sp)
879 $POP r16,`$FRAME-$SIZE_T*16`($sp)
880 $POP r17,`$FRAME-$SIZE_T*15`($sp)
881 $POP r18,`$FRAME-$SIZE_T*14`($sp)
882 $POP r19,`$FRAME-$SIZE_T*13`($sp)
883 $POP r20,`$FRAME-$SIZE_T*12`($sp)
884 $POP r21,`$FRAME-$SIZE_T*11`($sp)
885 $POP r22,`$FRAME-$SIZE_T*10`($sp)
886 $POP r23,`$FRAME-$SIZE_T*9`($sp)
887 $POP r24,`$FRAME-$SIZE_T*8`($sp)
888 $POP r25,`$FRAME-$SIZE_T*7`($sp)
889 $POP r26,`$FRAME-$SIZE_T*6`($sp)
890 $POP r27,`$FRAME-$SIZE_T*5`($sp)
891 $POP r28,`$FRAME-$SIZE_T*4`($sp)
892 $POP r29,`$FRAME-$SIZE_T*3`($sp)
893 $POP r30,`$FRAME-$SIZE_T*2`($sp)
894 $POP r31,`$FRAME-$SIZE_T*1`($sp)
895 mtlr r0
896 addi $sp,$sp,$FRAME
897 blr
898
899.align 5
900Lppc_AES_decrypt:
901 lwz $acc00,240($key)
902 addi $Tbl1,$Tbl0,3
903 lwz $t0,0($key)
904 addi $Tbl2,$Tbl0,2
905 lwz $t1,4($key)
906 addi $Tbl3,$Tbl0,1
907 lwz $t2,8($key)
908 addi $acc00,$acc00,-1
909 lwz $t3,12($key)
910 addi $key,$key,16
911 xor $s0,$s0,$t0
912 xor $s1,$s1,$t1
913 xor $s2,$s2,$t2
914 xor $s3,$s3,$t3
915 mtctr $acc00
916.align 4
917Ldec_loop:
918 rlwinm $acc00,$s0,`32-24+3`,21,28
919 rlwinm $acc01,$s1,`32-24+3`,21,28
920 rlwinm $acc02,$s2,`32-24+3`,21,28
921 rlwinm $acc03,$s3,`32-24+3`,21,28
922 lwz $t0,0($key)
923 rlwinm $acc04,$s3,`32-16+3`,21,28
924 lwz $t1,4($key)
925 rlwinm $acc05,$s0,`32-16+3`,21,28
926 lwz $t2,8($key)
927 rlwinm $acc06,$s1,`32-16+3`,21,28
928 lwz $t3,12($key)
929 rlwinm $acc07,$s2,`32-16+3`,21,28
930 lwzx $acc00,$Tbl0,$acc00
931 rlwinm $acc08,$s2,`32-8+3`,21,28
932 lwzx $acc01,$Tbl0,$acc01
933 rlwinm $acc09,$s3,`32-8+3`,21,28
934 lwzx $acc02,$Tbl0,$acc02
935 rlwinm $acc10,$s0,`32-8+3`,21,28
936 lwzx $acc03,$Tbl0,$acc03
937 rlwinm $acc11,$s1,`32-8+3`,21,28
938 lwzx $acc04,$Tbl1,$acc04
939 rlwinm $acc12,$s1,`0+3`,21,28
940 lwzx $acc05,$Tbl1,$acc05
941 rlwinm $acc13,$s2,`0+3`,21,28
942 lwzx $acc06,$Tbl1,$acc06
943 rlwinm $acc14,$s3,`0+3`,21,28
944 lwzx $acc07,$Tbl1,$acc07
945 rlwinm $acc15,$s0,`0+3`,21,28
946 lwzx $acc08,$Tbl2,$acc08
947 xor $t0,$t0,$acc00
948 lwzx $acc09,$Tbl2,$acc09
949 xor $t1,$t1,$acc01
950 lwzx $acc10,$Tbl2,$acc10
951 xor $t2,$t2,$acc02
952 lwzx $acc11,$Tbl2,$acc11
953 xor $t3,$t3,$acc03
954 lwzx $acc12,$Tbl3,$acc12
955 xor $t0,$t0,$acc04
956 lwzx $acc13,$Tbl3,$acc13
957 xor $t1,$t1,$acc05
958 lwzx $acc14,$Tbl3,$acc14
959 xor $t2,$t2,$acc06
960 lwzx $acc15,$Tbl3,$acc15
961 xor $t3,$t3,$acc07
962 xor $t0,$t0,$acc08
963 xor $t1,$t1,$acc09
964 xor $t2,$t2,$acc10
965 xor $t3,$t3,$acc11
966 xor $s0,$t0,$acc12
967 xor $s1,$t1,$acc13
968 xor $s2,$t2,$acc14
969 xor $s3,$t3,$acc15
970 addi $key,$key,16
971 bdnz- Ldec_loop
972
973 addi $Tbl2,$Tbl0,2048
974 nop
975 lwz $t0,0($key)
976 rlwinm $acc00,$s0,`32-24`,24,31
977 lwz $t1,4($key)
978 rlwinm $acc01,$s1,`32-24`,24,31
979 lwz $t2,8($key)
980 rlwinm $acc02,$s2,`32-24`,24,31
981 lwz $t3,12($key)
982 rlwinm $acc03,$s3,`32-24`,24,31
983 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
984 rlwinm $acc04,$s3,`32-16`,24,31
985 lwz $acc09,`2048+32`($Tbl0)
986 rlwinm $acc05,$s0,`32-16`,24,31
987 lwz $acc10,`2048+64`($Tbl0)
988 lbzx $acc00,$Tbl2,$acc00
989 lwz $acc11,`2048+96`($Tbl0)
990 lbzx $acc01,$Tbl2,$acc01
991 lwz $acc12,`2048+128`($Tbl0)
992 rlwinm $acc06,$s1,`32-16`,24,31
993 lwz $acc13,`2048+160`($Tbl0)
994 rlwinm $acc07,$s2,`32-16`,24,31
995 lwz $acc14,`2048+192`($Tbl0)
996 rlwinm $acc08,$s2,`32-8`,24,31
997 lwz $acc15,`2048+224`($Tbl0)
998 rlwinm $acc09,$s3,`32-8`,24,31
999 lbzx $acc02,$Tbl2,$acc02
1000 rlwinm $acc10,$s0,`32-8`,24,31
1001 lbzx $acc03,$Tbl2,$acc03
1002 rlwinm $acc11,$s1,`32-8`,24,31
1003 lbzx $acc04,$Tbl2,$acc04
1004 rlwinm $acc12,$s1,`0`,24,31
1005 lbzx $acc05,$Tbl2,$acc05
1006 rlwinm $acc13,$s2,`0`,24,31
1007 lbzx $acc06,$Tbl2,$acc06
1008 rlwinm $acc14,$s3,`0`,24,31
1009 lbzx $acc07,$Tbl2,$acc07
1010 rlwinm $acc15,$s0,`0`,24,31
1011 lbzx $acc08,$Tbl2,$acc08
1012 rlwinm $s0,$acc00,24,0,7
1013 lbzx $acc09,$Tbl2,$acc09
1014 rlwinm $s1,$acc01,24,0,7
1015 lbzx $acc10,$Tbl2,$acc10
1016 rlwinm $s2,$acc02,24,0,7
1017 lbzx $acc11,$Tbl2,$acc11
1018 rlwinm $s3,$acc03,24,0,7
1019 lbzx $acc12,$Tbl2,$acc12
1020 rlwimi $s0,$acc04,16,8,15
1021 lbzx $acc13,$Tbl2,$acc13
1022 rlwimi $s1,$acc05,16,8,15
1023 lbzx $acc14,$Tbl2,$acc14
1024 rlwimi $s2,$acc06,16,8,15
1025 lbzx $acc15,$Tbl2,$acc15
1026 rlwimi $s3,$acc07,16,8,15
1027 rlwimi $s0,$acc08,8,16,23
1028 rlwimi $s1,$acc09,8,16,23
1029 rlwimi $s2,$acc10,8,16,23
1030 rlwimi $s3,$acc11,8,16,23
1031 or $s0,$s0,$acc12
1032 or $s1,$s1,$acc13
1033 or $s2,$s2,$acc14
1034 or $s3,$s3,$acc15
1035 xor $s0,$s0,$t0
1036 xor $s1,$s1,$t1
1037 xor $s2,$s2,$t2
1038 xor $s3,$s3,$t3
1039 blr
1040
1041.align 4
1042Lppc_AES_decrypt_compact:
1043 lwz $acc00,240($key)
1044 addi $Tbl1,$Tbl0,2048
1045 lwz $t0,0($key)
1046 lis $mask80,0x8080
1047 lwz $t1,4($key)
1048 lis $mask1b,0x1b1b
1049 lwz $t2,8($key)
1050 ori $mask80,$mask80,0x8080
1051 lwz $t3,12($key)
1052 ori $mask1b,$mask1b,0x1b1b
1053 addi $key,$key,16
1054___
1055$code.=<<___ if ($SIZE_T==8);
1056 insrdi $mask80,$mask80,32,0
1057 insrdi $mask1b,$mask1b,32,0
1058___
1059$code.=<<___;
1060 mtctr $acc00
1061.align 4
1062Ldec_compact_loop:
1063 xor $s0,$s0,$t0
1064 xor $s1,$s1,$t1
1065 rlwinm $acc00,$s0,`32-24`,24,31
1066 xor $s2,$s2,$t2
1067 rlwinm $acc01,$s1,`32-24`,24,31
1068 xor $s3,$s3,$t3
1069 rlwinm $acc02,$s2,`32-24`,24,31
1070 rlwinm $acc03,$s3,`32-24`,24,31
1071 rlwinm $acc04,$s3,`32-16`,24,31
1072 rlwinm $acc05,$s0,`32-16`,24,31
1073 rlwinm $acc06,$s1,`32-16`,24,31
1074 rlwinm $acc07,$s2,`32-16`,24,31
1075 lbzx $acc00,$Tbl1,$acc00
1076 rlwinm $acc08,$s2,`32-8`,24,31
1077 lbzx $acc01,$Tbl1,$acc01
1078 rlwinm $acc09,$s3,`32-8`,24,31
1079 lbzx $acc02,$Tbl1,$acc02
1080 rlwinm $acc10,$s0,`32-8`,24,31
1081 lbzx $acc03,$Tbl1,$acc03
1082 rlwinm $acc11,$s1,`32-8`,24,31
1083 lbzx $acc04,$Tbl1,$acc04
1084 rlwinm $acc12,$s1,`0`,24,31
1085 lbzx $acc05,$Tbl1,$acc05
1086 rlwinm $acc13,$s2,`0`,24,31
1087 lbzx $acc06,$Tbl1,$acc06
1088 rlwinm $acc14,$s3,`0`,24,31
1089 lbzx $acc07,$Tbl1,$acc07
1090 rlwinm $acc15,$s0,`0`,24,31
1091 lbzx $acc08,$Tbl1,$acc08
1092 rlwinm $s0,$acc00,24,0,7
1093 lbzx $acc09,$Tbl1,$acc09
1094 rlwinm $s1,$acc01,24,0,7
1095 lbzx $acc10,$Tbl1,$acc10
1096 rlwinm $s2,$acc02,24,0,7
1097 lbzx $acc11,$Tbl1,$acc11
1098 rlwinm $s3,$acc03,24,0,7
1099 lbzx $acc12,$Tbl1,$acc12
1100 rlwimi $s0,$acc04,16,8,15
1101 lbzx $acc13,$Tbl1,$acc13
1102 rlwimi $s1,$acc05,16,8,15
1103 lbzx $acc14,$Tbl1,$acc14
1104 rlwimi $s2,$acc06,16,8,15
1105 lbzx $acc15,$Tbl1,$acc15
1106 rlwimi $s3,$acc07,16,8,15
1107 rlwimi $s0,$acc08,8,16,23
1108 rlwimi $s1,$acc09,8,16,23
1109 rlwimi $s2,$acc10,8,16,23
1110 rlwimi $s3,$acc11,8,16,23
1111 lwz $t0,0($key)
1112 or $s0,$s0,$acc12
1113 lwz $t1,4($key)
1114 or $s1,$s1,$acc13
1115 lwz $t2,8($key)
1116 or $s2,$s2,$acc14
1117 lwz $t3,12($key)
1118 or $s3,$s3,$acc15
1119
1120 addi $key,$key,16
1121 bdz Ldec_compact_done
1122___
1123$code.=<<___ if ($SIZE_T==8);
1124 # vectorized permutation improves decrypt performance by 10%
1125 insrdi $s0,$s1,32,0
1126 insrdi $s2,$s3,32,0
1127
1128 and $acc00,$s0,$mask80 # r1=r0&0x80808080
1129 and $acc02,$s2,$mask80
1130 srdi $acc04,$acc00,7 # r1>>7
1131 srdi $acc06,$acc02,7
1132 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
1133 andc $acc10,$s2,$mask80
1134 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
1135 sub $acc02,$acc02,$acc06
1136 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
1137 add $acc10,$acc10,$acc10
1138 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1139 and $acc02,$acc02,$mask1b
1140 xor $acc00,$acc00,$acc08 # r2
1141 xor $acc02,$acc02,$acc10
1142
1143 and $acc04,$acc00,$mask80 # r1=r2&0x80808080
1144 and $acc06,$acc02,$mask80
1145 srdi $acc08,$acc04,7 # r1>>7
1146 srdi $acc10,$acc06,7
1147 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
1148 andc $acc14,$acc02,$mask80
1149 sub $acc04,$acc04,$acc08 # r1-(r1>>7)
1150 sub $acc06,$acc06,$acc10
1151 add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
1152 add $acc14,$acc14,$acc14
1153 and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1154 and $acc06,$acc06,$mask1b
1155 xor $acc04,$acc04,$acc12 # r4
1156 xor $acc06,$acc06,$acc14
1157
1158 and $acc08,$acc04,$mask80 # r1=r4&0x80808080
1159 and $acc10,$acc06,$mask80
1160 srdi $acc12,$acc08,7 # r1>>7
1161 srdi $acc14,$acc10,7
1162 sub $acc08,$acc08,$acc12 # r1-(r1>>7)
1163 sub $acc10,$acc10,$acc14
1164 andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
1165 andc $acc14,$acc06,$mask80
1166 add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
1167 add $acc14,$acc14,$acc14
1168 and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1169 and $acc10,$acc10,$mask1b
1170 xor $acc08,$acc08,$acc12 # r8
1171 xor $acc10,$acc10,$acc14
1172
1173 xor $acc00,$acc00,$s0 # r2^r0
1174 xor $acc02,$acc02,$s2
1175 xor $acc04,$acc04,$s0 # r4^r0
1176 xor $acc06,$acc06,$s2
1177
1178 extrdi $acc01,$acc00,32,0
1179 extrdi $acc03,$acc02,32,0
1180 extrdi $acc05,$acc04,32,0
1181 extrdi $acc07,$acc06,32,0
1182 extrdi $acc09,$acc08,32,0
1183 extrdi $acc11,$acc10,32,0
1184___
1185$code.=<<___ if ($SIZE_T==4);
1186 and $acc00,$s0,$mask80 # r1=r0&0x80808080
1187 and $acc01,$s1,$mask80
1188 and $acc02,$s2,$mask80
1189 and $acc03,$s3,$mask80
1190 srwi $acc04,$acc00,7 # r1>>7
1191 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
1192 srwi $acc05,$acc01,7
1193 andc $acc09,$s1,$mask80
1194 srwi $acc06,$acc02,7
1195 andc $acc10,$s2,$mask80
1196 srwi $acc07,$acc03,7
1197 andc $acc11,$s3,$mask80
1198 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
1199 sub $acc01,$acc01,$acc05
1200 sub $acc02,$acc02,$acc06
1201 sub $acc03,$acc03,$acc07
1202 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
1203 add $acc09,$acc09,$acc09
1204 add $acc10,$acc10,$acc10
1205 add $acc11,$acc11,$acc11
1206 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1207 and $acc01,$acc01,$mask1b
1208 and $acc02,$acc02,$mask1b
1209 and $acc03,$acc03,$mask1b
1210 xor $acc00,$acc00,$acc08 # r2
1211 xor $acc01,$acc01,$acc09
1212 xor $acc02,$acc02,$acc10
1213 xor $acc03,$acc03,$acc11
1214
1215 and $acc04,$acc00,$mask80 # r1=r2&0x80808080
1216 and $acc05,$acc01,$mask80
1217 and $acc06,$acc02,$mask80
1218 and $acc07,$acc03,$mask80
1219 srwi $acc08,$acc04,7 # r1>>7
1220 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
1221 srwi $acc09,$acc05,7
1222 andc $acc13,$acc01,$mask80
1223 srwi $acc10,$acc06,7
1224 andc $acc14,$acc02,$mask80
1225 srwi $acc11,$acc07,7
1226 andc $acc15,$acc03,$mask80
1227 sub $acc04,$acc04,$acc08 # r1-(r1>>7)
1228 sub $acc05,$acc05,$acc09
1229 sub $acc06,$acc06,$acc10
1230 sub $acc07,$acc07,$acc11
1231 add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
1232 add $acc13,$acc13,$acc13
1233 add $acc14,$acc14,$acc14
1234 add $acc15,$acc15,$acc15
1235 and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1236 and $acc05,$acc05,$mask1b
1237 and $acc06,$acc06,$mask1b
1238 and $acc07,$acc07,$mask1b
1239 xor $acc04,$acc04,$acc12 # r4
1240 xor $acc05,$acc05,$acc13
1241 xor $acc06,$acc06,$acc14
1242 xor $acc07,$acc07,$acc15
1243
1244 and $acc08,$acc04,$mask80 # r1=r4&0x80808080
1245 and $acc09,$acc05,$mask80
1246 srwi $acc12,$acc08,7 # r1>>7
1247 and $acc10,$acc06,$mask80
1248 srwi $acc13,$acc09,7
1249 and $acc11,$acc07,$mask80
1250 srwi $acc14,$acc10,7
1251 sub $acc08,$acc08,$acc12 # r1-(r1>>7)
1252 srwi $acc15,$acc11,7
1253 sub $acc09,$acc09,$acc13
1254 sub $acc10,$acc10,$acc14
1255 sub $acc11,$acc11,$acc15
1256 andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
1257 andc $acc13,$acc05,$mask80
1258 andc $acc14,$acc06,$mask80
1259 andc $acc15,$acc07,$mask80
1260 add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
1261 add $acc13,$acc13,$acc13
1262 add $acc14,$acc14,$acc14
1263 add $acc15,$acc15,$acc15
1264 and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1265 and $acc09,$acc09,$mask1b
1266 and $acc10,$acc10,$mask1b
1267 and $acc11,$acc11,$mask1b
1268 xor $acc08,$acc08,$acc12 # r8
1269 xor $acc09,$acc09,$acc13
1270 xor $acc10,$acc10,$acc14
1271 xor $acc11,$acc11,$acc15
1272
1273 xor $acc00,$acc00,$s0 # r2^r0
1274 xor $acc01,$acc01,$s1
1275 xor $acc02,$acc02,$s2
1276 xor $acc03,$acc03,$s3
1277 xor $acc04,$acc04,$s0 # r4^r0
1278 xor $acc05,$acc05,$s1
1279 xor $acc06,$acc06,$s2
1280 xor $acc07,$acc07,$s3
1281___
1282$code.=<<___;
1283 rotrwi $s0,$s0,8 # = ROTATE(r0,8)
1284 rotrwi $s1,$s1,8
1285 xor $s0,$s0,$acc00 # ^= r2^r0
1286 rotrwi $s2,$s2,8
1287 xor $s1,$s1,$acc01
1288 rotrwi $s3,$s3,8
1289 xor $s2,$s2,$acc02
1290 xor $s3,$s3,$acc03
1291 xor $acc00,$acc00,$acc08
1292 xor $acc01,$acc01,$acc09
1293 xor $acc02,$acc02,$acc10
1294 xor $acc03,$acc03,$acc11
1295 xor $s0,$s0,$acc04 # ^= r4^r0
1296 rotrwi $acc00,$acc00,24
1297 xor $s1,$s1,$acc05
1298 rotrwi $acc01,$acc01,24
1299 xor $s2,$s2,$acc06
1300 rotrwi $acc02,$acc02,24
1301 xor $s3,$s3,$acc07
1302 rotrwi $acc03,$acc03,24
1303 xor $acc04,$acc04,$acc08
1304 xor $acc05,$acc05,$acc09
1305 xor $acc06,$acc06,$acc10
1306 xor $acc07,$acc07,$acc11
1307 xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
1308 rotrwi $acc04,$acc04,16
1309 xor $s1,$s1,$acc09
1310 rotrwi $acc05,$acc05,16
1311 xor $s2,$s2,$acc10
1312 rotrwi $acc06,$acc06,16
1313 xor $s3,$s3,$acc11
1314 rotrwi $acc07,$acc07,16
1315 xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
1316 rotrwi $acc08,$acc08,8
1317 xor $s1,$s1,$acc01
1318 rotrwi $acc09,$acc09,8
1319 xor $s2,$s2,$acc02
1320 rotrwi $acc10,$acc10,8
1321 xor $s3,$s3,$acc03
1322 rotrwi $acc11,$acc11,8
1323 xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
1324 xor $s1,$s1,$acc05
1325 xor $s2,$s2,$acc06
1326 xor $s3,$s3,$acc07
1327 xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
1328 xor $s1,$s1,$acc09
1329 xor $s2,$s2,$acc10
1330 xor $s3,$s3,$acc11
1331
1332 b Ldec_compact_loop
1333.align 4
1334Ldec_compact_done:
1335 xor $s0,$s0,$t0
1336 xor $s1,$s1,$t1
1337 xor $s2,$s2,$t2
1338 xor $s3,$s3,$t3
1339 blr
1340___
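
[Editor's note] The mask80/mask1b block above is SIMD-within-a-register GF(2^8) arithmetic: $mask80 = 0x80808080 and $mask1b = 0x1b1b1b1b let one integer operation double four field elements at a time, producing the r2, r4 and r8 multiples that the rotate/xor epilogue combines into InvMixColumns. A C sketch of the same computation, reconstructed from the register comments (names are mine, not from the script):

    #include <stdint.h>

    #define ROTR(v, n)	(((v) >> (n)) | ((v) << (32 - (n))))

    /* Double four GF(2^8) bytes packed in one word, reducing modulo
     * x^8 + x^4 + x^3 + x + 1 -- the mask80/mask1b sequence. */
    static uint32_t
    xtime4(uint32_t r0)
    {
    	uint32_t r1 = r0 & 0x80808080;		/* high bits mark overflow */
    	r1 = (r1 - (r1 >> 7)) & 0x1b1b1b1b;	/* 0x1b per overflowing byte */
    	return ((r0 & 0x7f7f7f7f) << 1) ^ r1;
    }

    /* InvMixColumns on one state word, combining r2/r4/r8 with the same
     * rotations as the Ldec_compact_loop epilogue (big-endian byte order). */
    static uint32_t
    inv_mix_column(uint32_t r0)
    {
    	uint32_t r2 = xtime4(r0), r4 = xtime4(r2), r8 = xtime4(r4);

    	return ROTR(r0, 8) ^ (r2 ^ r0) ^ (r4 ^ r0) ^ r8 ^
    	    ROTR(r8 ^ r2 ^ r0, 24) ^ ROTR(r8 ^ r4 ^ r0, 16) ^ ROTR(r8, 8);
    }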
1341
1342$code =~ s/\`([^\`]*)\`/eval $1/gem;
1343print $code;
1344close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
deleted file mode 100755
index 1348d09594..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
+++ /dev/null
@@ -1,1217 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# Version 1.1
10#
11# The major reason for undertaking this effort was to mitigate the
12# hazard of cache-timing attacks. This is [currently and initially!]
13# addressed in two ways. 1. S-boxes are compressed from 5KB to
14# 2KB+256B each. 2. References to them are scheduled for L2 cache
15# latency, meaning that the tables don't have to reside in L1 cache.
16# Once again, this is an initial draft and one should expect more
17# countermeasures to be implemented...
18#
19# Version 1.1 prefetches T[ed]4 in order to mitigate attacks on the
20# last round.
21#
22# Even though performance was not the primary goal [on the contrary,
23# the extra shifts "induced" by the compressed S-box and the longer
24# loop epilogue "induced" by scheduling for L2 hurt performance],
25# the code turned out to run at ~23 cycles per processed byte en-/
26# decrypted with a 128-bit key. This is a pretty good result for code
27# with the mentioned qualities on an UltraSPARC core. Compared to Sun
28# C generated code my encrypt procedure runs just a few percent
29# faster, while the decrypt one is a whole 50% faster [yes, Sun C
30# failed to generate an optimal decrypt procedure]. Compared to GNU C
31# generated code both procedures are more than 60% faster:-)
32
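
[Editor's note] The compression relies on _data_word (defined below) emitting every table word twice, so each of the 256 entries occupies an aligned 8-byte slot: one 2KB table plus a 256-byte S-box replaces the usual four 1KB tables, because a 64-bit load shifted right by 8, 16 or 24 bits yields the rotated variants for free. The recurring "& 2040" in the code is just 0xff << 3, the byte offset of an 8-byte entry. A C model of one lookup lane (helper name is hypothetical):

    #include <stdint.h>

    /* te[i] holds the table word w twice: (w << 32) | w.  Truncating
     * te[i] >> (8 * k) to 32 bits returns w rotated right by 8*k, so a
     * single table serves where Te0..Te3 are normally used. */
    static uint32_t
    te_lookup(const uint64_t te[256], uint32_t s, int byte_pos, int k)
    {
    	uint32_t idx = (s >> (24 - 8 * byte_pos)) & 0xff;  /* asm: shift, & 2040 */

    	return (uint32_t)(te[idx] >> (8 * k));
    }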
33$bits=32;
34for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
35if ($bits==64) { $bias=2047; $frame=192; }
36else { $bias=0; $frame=112; }
37$locals=16;
38
39$acc0="%l0";
40$acc1="%o0";
41$acc2="%o1";
42$acc3="%o2";
43
44$acc4="%l1";
45$acc5="%o3";
46$acc6="%o4";
47$acc7="%o5";
48
49$acc8="%l2";
50$acc9="%o7";
51$acc10="%g1";
52$acc11="%g2";
53
54$acc12="%l3";
55$acc13="%g3";
56$acc14="%g4";
57$acc15="%g5";
58
59$t0="%l4";
60$t1="%l5";
61$t2="%l6";
62$t3="%l7";
63
64$s0="%i0";
65$s1="%i1";
66$s2="%i2";
67$s3="%i3";
68$tbl="%i4";
69$key="%i5";
70$rounds="%i7"; # aliases with return address, which is off-loaded to stack
71
72sub _data_word()
73{ my $i;
74 while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
75}
76
77$code.=<<___ if ($bits==64);
78.register %g2,#scratch
79.register %g3,#scratch
80___
81$code.=<<___;
82.section ".rodata",#alloc
83
84.align 256
85AES_Te:
86___
87&_data_word(
88 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
89 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
90 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
91 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
92 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
93 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
94 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
95 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
96 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
97 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
98 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
99 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
100 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
101 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
102 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
103 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
104 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
105 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
106 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
107 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
108 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
109 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
110 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
111 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
112 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
113 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
114 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
115 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
116 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
117 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
118 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
119 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
120 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
121 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
122 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
123 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
124 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
125 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
126 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
127 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
128 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
129 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
130 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
131 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
132 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
133 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
134 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
135 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
136 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
137 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
138 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
139 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
140 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
141 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
142 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
143 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
144 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
145 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
146 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
147 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
148 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
149 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
150 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
151 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
152$code.=<<___;
153 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
154 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
155 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
156 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
157 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
158 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
159 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
160 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
161 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
162 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
163 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
164 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
165 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
166 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
167 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
168 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
169 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
170 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
171 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
172 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
173 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
174 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
175 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
176 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
177 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
178 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
179 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
180 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
181 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
182 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
183 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
184 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
185.type AES_Te,#object
186.size AES_Te,(.-AES_Te)
187
188.section ".text",#alloc,#execinstr
189.align 64
190.skip 16
191_sparcv9_AES_encrypt:
192 save %sp,-$frame-$locals,%sp
193 stx %i7,[%sp+$bias+$frame+0] ! off-load return address
194 ld [$key+240],$rounds
195 ld [$key+0],$t0
196 ld [$key+4],$t1 !
197 ld [$key+8],$t2
198 srl $rounds,1,$rounds
199 xor $t0,$s0,$s0
200 ld [$key+12],$t3
201 srl $s0,21,$acc0
202 xor $t1,$s1,$s1
203 ld [$key+16],$t0
204 srl $s1,13,$acc1 !
205 xor $t2,$s2,$s2
206 ld [$key+20],$t1
207 xor $t3,$s3,$s3
208 ld [$key+24],$t2
209 and $acc0,2040,$acc0
210 ld [$key+28],$t3
211 nop
212.Lenc_loop:
213 srl $s2,5,$acc2 !
214 and $acc1,2040,$acc1
215 ldx [$tbl+$acc0],$acc0
216 sll $s3,3,$acc3
217 and $acc2,2040,$acc2
218 ldx [$tbl+$acc1],$acc1
219 srl $s1,21,$acc4
220 and $acc3,2040,$acc3
221 ldx [$tbl+$acc2],$acc2 !
222 srl $s2,13,$acc5
223 and $acc4,2040,$acc4
224 ldx [$tbl+$acc3],$acc3
225 srl $s3,5,$acc6
226 and $acc5,2040,$acc5
227 ldx [$tbl+$acc4],$acc4
228 fmovs %f0,%f0
229 sll $s0,3,$acc7 !
230 and $acc6,2040,$acc6
231 ldx [$tbl+$acc5],$acc5
232 srl $s2,21,$acc8
233 and $acc7,2040,$acc7
234 ldx [$tbl+$acc6],$acc6
235 srl $s3,13,$acc9
236 and $acc8,2040,$acc8
237 ldx [$tbl+$acc7],$acc7 !
238 srl $s0,5,$acc10
239 and $acc9,2040,$acc9
240 ldx [$tbl+$acc8],$acc8
241 sll $s1,3,$acc11
242 and $acc10,2040,$acc10
243 ldx [$tbl+$acc9],$acc9
244 fmovs %f0,%f0
245 srl $s3,21,$acc12 !
246 and $acc11,2040,$acc11
247 ldx [$tbl+$acc10],$acc10
248 srl $s0,13,$acc13
249 and $acc12,2040,$acc12
250 ldx [$tbl+$acc11],$acc11
251 srl $s1,5,$acc14
252 and $acc13,2040,$acc13
253 ldx [$tbl+$acc12],$acc12 !
254 sll $s2,3,$acc15
255 and $acc14,2040,$acc14
256 ldx [$tbl+$acc13],$acc13
257 and $acc15,2040,$acc15
258 add $key,32,$key
259 ldx [$tbl+$acc14],$acc14
260 fmovs %f0,%f0
261 subcc $rounds,1,$rounds !
262 ldx [$tbl+$acc15],$acc15
263 bz,a,pn %icc,.Lenc_last
264 add $tbl,2048,$rounds
265
266 srlx $acc1,8,$acc1
267 xor $acc0,$t0,$t0
268 ld [$key+0],$s0
269 fmovs %f0,%f0
270 srlx $acc2,16,$acc2 !
271 xor $acc1,$t0,$t0
272 ld [$key+4],$s1
273 srlx $acc3,24,$acc3
274 xor $acc2,$t0,$t0
275 ld [$key+8],$s2
276 srlx $acc5,8,$acc5
277 xor $acc3,$t0,$t0
278 ld [$key+12],$s3 !
279 srlx $acc6,16,$acc6
280 xor $acc4,$t1,$t1
281 fmovs %f0,%f0
282 srlx $acc7,24,$acc7
283 xor $acc5,$t1,$t1
284 srlx $acc9,8,$acc9
285 xor $acc6,$t1,$t1
286 srlx $acc10,16,$acc10 !
287 xor $acc7,$t1,$t1
288 srlx $acc11,24,$acc11
289 xor $acc8,$t2,$t2
290 srlx $acc13,8,$acc13
291 xor $acc9,$t2,$t2
292 srlx $acc14,16,$acc14
293 xor $acc10,$t2,$t2
294 srlx $acc15,24,$acc15 !
295 xor $acc11,$t2,$t2
296 xor $acc12,$acc14,$acc14
297 xor $acc13,$t3,$t3
298 srl $t0,21,$acc0
299 xor $acc14,$t3,$t3
300 srl $t1,13,$acc1
301 xor $acc15,$t3,$t3
302
303 and $acc0,2040,$acc0 !
304 srl $t2,5,$acc2
305 and $acc1,2040,$acc1
306 ldx [$tbl+$acc0],$acc0
307 sll $t3,3,$acc3
308 and $acc2,2040,$acc2
309 ldx [$tbl+$acc1],$acc1
310 fmovs %f0,%f0
311 srl $t1,21,$acc4 !
312 and $acc3,2040,$acc3
313 ldx [$tbl+$acc2],$acc2
314 srl $t2,13,$acc5
315 and $acc4,2040,$acc4
316 ldx [$tbl+$acc3],$acc3
317 srl $t3,5,$acc6
318 and $acc5,2040,$acc5
319 ldx [$tbl+$acc4],$acc4 !
320 sll $t0,3,$acc7
321 and $acc6,2040,$acc6
322 ldx [$tbl+$acc5],$acc5
323 srl $t2,21,$acc8
324 and $acc7,2040,$acc7
325 ldx [$tbl+$acc6],$acc6
326 fmovs %f0,%f0
327 srl $t3,13,$acc9 !
328 and $acc8,2040,$acc8
329 ldx [$tbl+$acc7],$acc7
330 srl $t0,5,$acc10
331 and $acc9,2040,$acc9
332 ldx [$tbl+$acc8],$acc8
333 sll $t1,3,$acc11
334 and $acc10,2040,$acc10
335 ldx [$tbl+$acc9],$acc9 !
336 srl $t3,21,$acc12
337 and $acc11,2040,$acc11
338 ldx [$tbl+$acc10],$acc10
339 srl $t0,13,$acc13
340 and $acc12,2040,$acc12
341 ldx [$tbl+$acc11],$acc11
342 fmovs %f0,%f0
343 srl $t1,5,$acc14 !
344 and $acc13,2040,$acc13
345 ldx [$tbl+$acc12],$acc12
346 sll $t2,3,$acc15
347 and $acc14,2040,$acc14
348 ldx [$tbl+$acc13],$acc13
349 srlx $acc1,8,$acc1
350 and $acc15,2040,$acc15
351 ldx [$tbl+$acc14],$acc14 !
352
353 srlx $acc2,16,$acc2
354 xor $acc0,$s0,$s0
355 ldx [$tbl+$acc15],$acc15
356 srlx $acc3,24,$acc3
357 xor $acc1,$s0,$s0
358 ld [$key+16],$t0
359 fmovs %f0,%f0
360 srlx $acc5,8,$acc5 !
361 xor $acc2,$s0,$s0
362 ld [$key+20],$t1
363 srlx $acc6,16,$acc6
364 xor $acc3,$s0,$s0
365 ld [$key+24],$t2
366 srlx $acc7,24,$acc7
367 xor $acc4,$s1,$s1
368 ld [$key+28],$t3 !
369 srlx $acc9,8,$acc9
370 xor $acc5,$s1,$s1
371 ldx [$tbl+2048+0],%g0 ! prefetch te4
372 srlx $acc10,16,$acc10
373 xor $acc6,$s1,$s1
374 ldx [$tbl+2048+32],%g0 ! prefetch te4
375 srlx $acc11,24,$acc11
376 xor $acc7,$s1,$s1
377 ldx [$tbl+2048+64],%g0 ! prefetch te4
378 srlx $acc13,8,$acc13
379 xor $acc8,$s2,$s2
380 ldx [$tbl+2048+96],%g0 ! prefetch te4
381 srlx $acc14,16,$acc14 !
382 xor $acc9,$s2,$s2
383 ldx [$tbl+2048+128],%g0 ! prefetch te4
384 srlx $acc15,24,$acc15
385 xor $acc10,$s2,$s2
386 ldx [$tbl+2048+160],%g0 ! prefetch te4
387 srl $s0,21,$acc0
388 xor $acc11,$s2,$s2
389 ldx [$tbl+2048+192],%g0 ! prefetch te4
390 xor $acc12,$acc14,$acc14
391 xor $acc13,$s3,$s3
392 ldx [$tbl+2048+224],%g0 ! prefetch te4
393 srl $s1,13,$acc1 !
394 xor $acc14,$s3,$s3
395 xor $acc15,$s3,$s3
396 ba .Lenc_loop
397 and $acc0,2040,$acc0
398
399.align 32
400.Lenc_last:
401 srlx $acc1,8,$acc1 !
402 xor $acc0,$t0,$t0
403 ld [$key+0],$s0
404 srlx $acc2,16,$acc2
405 xor $acc1,$t0,$t0
406 ld [$key+4],$s1
407 srlx $acc3,24,$acc3
408 xor $acc2,$t0,$t0
409 ld [$key+8],$s2 !
410 srlx $acc5,8,$acc5
411 xor $acc3,$t0,$t0
412 ld [$key+12],$s3
413 srlx $acc6,16,$acc6
414 xor $acc4,$t1,$t1
415 srlx $acc7,24,$acc7
416 xor $acc5,$t1,$t1
417 srlx $acc9,8,$acc9 !
418 xor $acc6,$t1,$t1
419 srlx $acc10,16,$acc10
420 xor $acc7,$t1,$t1
421 srlx $acc11,24,$acc11
422 xor $acc8,$t2,$t2
423 srlx $acc13,8,$acc13
424 xor $acc9,$t2,$t2
425 srlx $acc14,16,$acc14 !
426 xor $acc10,$t2,$t2
427 srlx $acc15,24,$acc15
428 xor $acc11,$t2,$t2
429 xor $acc12,$acc14,$acc14
430 xor $acc13,$t3,$t3
431 srl $t0,24,$acc0
432 xor $acc14,$t3,$t3
433 srl $t1,16,$acc1 !
434 xor $acc15,$t3,$t3
435
436 srl $t2,8,$acc2
437 and $acc1,255,$acc1
438 ldub [$rounds+$acc0],$acc0
439 srl $t1,24,$acc4
440 and $acc2,255,$acc2
441 ldub [$rounds+$acc1],$acc1
442 srl $t2,16,$acc5 !
443 and $t3,255,$acc3
444 ldub [$rounds+$acc2],$acc2
445 ldub [$rounds+$acc3],$acc3
446 srl $t3,8,$acc6
447 and $acc5,255,$acc5
448 ldub [$rounds+$acc4],$acc4
449 fmovs %f0,%f0
450 srl $t2,24,$acc8 !
451 and $acc6,255,$acc6
452 ldub [$rounds+$acc5],$acc5
453 srl $t3,16,$acc9
454 and $t0,255,$acc7
455 ldub [$rounds+$acc6],$acc6
456 ldub [$rounds+$acc7],$acc7
457 fmovs %f0,%f0
458 srl $t0,8,$acc10 !
459 and $acc9,255,$acc9
460 ldub [$rounds+$acc8],$acc8
461 srl $t3,24,$acc12
462 and $acc10,255,$acc10
463 ldub [$rounds+$acc9],$acc9
464 srl $t0,16,$acc13
465 and $t1,255,$acc11
466 ldub [$rounds+$acc10],$acc10 !
467 srl $t1,8,$acc14
468 and $acc13,255,$acc13
469 ldub [$rounds+$acc11],$acc11
470 ldub [$rounds+$acc12],$acc12
471 and $acc14,255,$acc14
472 ldub [$rounds+$acc13],$acc13
473 and $t2,255,$acc15
474 ldub [$rounds+$acc14],$acc14 !
475
476 sll $acc0,24,$acc0
477 xor $acc3,$s0,$s0
478 ldub [$rounds+$acc15],$acc15
479 sll $acc1,16,$acc1
480 xor $acc0,$s0,$s0
481 ldx [%sp+$bias+$frame+0],%i7 ! restore return address
482 fmovs %f0,%f0
483 sll $acc2,8,$acc2 !
484 xor $acc1,$s0,$s0
485 sll $acc4,24,$acc4
486 xor $acc2,$s0,$s0
487 sll $acc5,16,$acc5
488 xor $acc7,$s1,$s1
489 sll $acc6,8,$acc6
490 xor $acc4,$s1,$s1
491 sll $acc8,24,$acc8 !
492 xor $acc5,$s1,$s1
493 sll $acc9,16,$acc9
494 xor $acc11,$s2,$s2
495 sll $acc10,8,$acc10
496 xor $acc6,$s1,$s1
497 sll $acc12,24,$acc12
498 xor $acc8,$s2,$s2
499 sll $acc13,16,$acc13 !
500 xor $acc9,$s2,$s2
501 sll $acc14,8,$acc14
502 xor $acc10,$s2,$s2
503 xor $acc12,$acc14,$acc14
504 xor $acc13,$s3,$s3
505 xor $acc14,$s3,$s3
506 xor $acc15,$s3,$s3
507
508 ret
509 restore
510.type _sparcv9_AES_encrypt,#function
511.size _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
512
513.align 32
514.globl aes_encrypt_internal
515aes_encrypt_internal:
516 save %sp,-$frame,%sp
517#ifdef __PIC__
518 sethi %hi(_GLOBAL_OFFSET_TABLE_-4), %o5
519 rd %pc, %o4
520 or %o5, %lo(_GLOBAL_OFFSET_TABLE_+4), %o5
521 add %o5, %o4, %o5
522#endif
523
524 or %i0,%i1,%g1
525 andcc %g1,3,%g0
526 bnz,pn %xcc,.Lunaligned_enc
527 nop
528
529 ld [%i0+0],%o0
530 ld [%i0+4],%o1
531 ld [%i0+8],%o2
532 ld [%i0+12],%o3
533
534#ifdef __PIC__
535 set AES_Te, %o4
536 ldx [%o4+%o5], %o4
537#else
538 set AES_Te, %o4
539#endif
540 call _sparcv9_AES_encrypt
541 mov %i2,%o5
542
543 st %o0,[%i1+0]
544 st %o1,[%i1+4]
545 st %o2,[%i1+8]
546 st %o3,[%i1+12]
547
548 ret
549 restore
550
551.align 32
552.Lunaligned_enc:
553 ldub [%i0+0],%l0
554 ldub [%i0+1],%l1
555 ldub [%i0+2],%l2
556
557 sll %l0,24,%l0
558 ldub [%i0+3],%l3
559 sll %l1,16,%l1
560 ldub [%i0+4],%l4
561 sll %l2,8,%l2
562 or %l1,%l0,%l0
563 ldub [%i0+5],%l5
564 sll %l4,24,%l4
565 or %l3,%l2,%l2
566 ldub [%i0+6],%l6
567 sll %l5,16,%l5
568 or %l0,%l2,%o0
569 ldub [%i0+7],%l7
570
571 sll %l6,8,%l6
572 or %l5,%l4,%l4
573 ldub [%i0+8],%l0
574 or %l7,%l6,%l6
575 ldub [%i0+9],%l1
576 or %l4,%l6,%o1
577 ldub [%i0+10],%l2
578
579 sll %l0,24,%l0
580 ldub [%i0+11],%l3
581 sll %l1,16,%l1
582 ldub [%i0+12],%l4
583 sll %l2,8,%l2
584 or %l1,%l0,%l0
585 ldub [%i0+13],%l5
586 sll %l4,24,%l4
587 or %l3,%l2,%l2
588 ldub [%i0+14],%l6
589 sll %l5,16,%l5
590 or %l0,%l2,%o2
591 ldub [%i0+15],%l7
592
593 sll %l6,8,%l6
594 or %l5,%l4,%l4
595 or %l7,%l6,%l6
596 or %l4,%l6,%o3
597
598#ifdef __PIC__
599 set AES_Te, %o4
600 ldx [%o4+%o5], %o4
601#else
602 set AES_Te, %o4
603#endif
604 call _sparcv9_AES_encrypt
605 mov %i2,%o5
606
607 srl %o0,24,%l0
608 srl %o0,16,%l1
609 stb %l0,[%i1+0]
610 srl %o0,8,%l2
611 stb %l1,[%i1+1]
612 stb %l2,[%i1+2]
613 srl %o1,24,%l4
614 stb %o0,[%i1+3]
615
616 srl %o1,16,%l5
617 stb %l4,[%i1+4]
618 srl %o1,8,%l6
619 stb %l5,[%i1+5]
620 stb %l6,[%i1+6]
621 srl %o2,24,%l0
622 stb %o1,[%i1+7]
623
624 srl %o2,16,%l1
625 stb %l0,[%i1+8]
626 srl %o2,8,%l2
627 stb %l1,[%i1+9]
628 stb %l2,[%i1+10]
629 srl %o3,24,%l4
630 stb %o2,[%i1+11]
631
632 srl %o3,16,%l5
633 stb %l4,[%i1+12]
634 srl %o3,8,%l6
635 stb %l5,[%i1+13]
636 stb %l6,[%i1+14]
637 stb %o3,[%i1+15]
638
639 ret
640 restore
641.type aes_encrypt_internal,#function
642.size aes_encrypt_internal,(.-aes_encrypt_internal)
643
644___
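
[Editor's note] SPARC traps on misaligned word accesses, so the .Lunaligned_enc path above assembles and stores each big-endian word one byte at a time; the ldub/sll/or and srl/stb sequences amount to the following C helpers (a sketch, not from the source):

    #include <stdint.h>

    static uint32_t
    be32_load(const unsigned char *p)		/* ldub/sll/or sequence */
    {
    	return (uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 |
    	    (uint32_t)p[2] << 8 | p[3];
    }

    static void
    be32_store(unsigned char *p, uint32_t w)	/* srl/stb sequence */
    {
    	p[0] = w >> 24;
    	p[1] = w >> 16;
    	p[2] = w >> 8;
    	p[3] = w;
    }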
645
646$code.=<<___;
647.section ".rodata",#alloc
648.align 256
649AES_Td:
650___
651&_data_word(
652 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
653 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
654 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
655 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
656 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
657 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
658 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
659 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
660 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
661 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
662 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
663 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
664 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
665 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
666 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
667 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
668 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
669 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
670 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
671 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
672 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
673 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
674 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
675 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
676 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
677 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
678 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
679 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
680 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
681 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
682 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
683 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
684 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
685 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
686 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
687 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
688 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
689 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
690 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
691 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
692 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
693 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
694 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
695 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
696 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
697 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
698 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
699 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
700 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
701 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
702 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
703 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
704 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
705 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
706 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
707 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
708 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
709 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
710 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
711 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
712 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
713 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
714 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
715 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
716$code.=<<___;
717 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
718 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
719 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
720 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
721 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
722 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
723 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
724 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
725 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
726 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
727 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
728 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
729 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
730 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
731 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
732 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
733 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
734 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
735 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
736 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
737 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
738 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
739 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
740 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
741 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
742 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
743 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
744 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
745 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
746 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
747 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
748 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
749.type AES_Td,#object
750.size AES_Td,(.-AES_Td)
751
752.section ".text",#alloc,#execinstr
753.align 64
754.skip 16
755_sparcv9_AES_decrypt:
756 save %sp,-$frame-$locals,%sp
757 stx %i7,[%sp+$bias+$frame+0] ! off-load return address
758 ld [$key+240],$rounds
759 ld [$key+0],$t0
760 ld [$key+4],$t1 !
761 ld [$key+8],$t2
762 ld [$key+12],$t3
763 srl $rounds,1,$rounds
764 xor $t0,$s0,$s0
765 ld [$key+16],$t0
766 xor $t1,$s1,$s1
767 ld [$key+20],$t1
768 srl $s0,21,$acc0 !
769 xor $t2,$s2,$s2
770 ld [$key+24],$t2
771 xor $t3,$s3,$s3
772 and $acc0,2040,$acc0
773 ld [$key+28],$t3
774 srl $s3,13,$acc1
775 nop
776.Ldec_loop:
777 srl $s2,5,$acc2 !
778 and $acc1,2040,$acc1
779 ldx [$tbl+$acc0],$acc0
780 sll $s1,3,$acc3
781 and $acc2,2040,$acc2
782 ldx [$tbl+$acc1],$acc1
783 srl $s1,21,$acc4
784 and $acc3,2040,$acc3
785 ldx [$tbl+$acc2],$acc2 !
786 srl $s0,13,$acc5
787 and $acc4,2040,$acc4
788 ldx [$tbl+$acc3],$acc3
789 srl $s3,5,$acc6
790 and $acc5,2040,$acc5
791 ldx [$tbl+$acc4],$acc4
792 fmovs %f0,%f0
793 sll $s2,3,$acc7 !
794 and $acc6,2040,$acc6
795 ldx [$tbl+$acc5],$acc5
796 srl $s2,21,$acc8
797 and $acc7,2040,$acc7
798 ldx [$tbl+$acc6],$acc6
799 srl $s1,13,$acc9
800 and $acc8,2040,$acc8
801 ldx [$tbl+$acc7],$acc7 !
802 srl $s0,5,$acc10
803 and $acc9,2040,$acc9
804 ldx [$tbl+$acc8],$acc8
805 sll $s3,3,$acc11
806 and $acc10,2040,$acc10
807 ldx [$tbl+$acc9],$acc9
808 fmovs %f0,%f0
809 srl $s3,21,$acc12 !
810 and $acc11,2040,$acc11
811 ldx [$tbl+$acc10],$acc10
812 srl $s2,13,$acc13
813 and $acc12,2040,$acc12
814 ldx [$tbl+$acc11],$acc11
815 srl $s1,5,$acc14
816 and $acc13,2040,$acc13
817 ldx [$tbl+$acc12],$acc12 !
818 sll $s0,3,$acc15
819 and $acc14,2040,$acc14
820 ldx [$tbl+$acc13],$acc13
821 and $acc15,2040,$acc15
822 add $key,32,$key
823 ldx [$tbl+$acc14],$acc14
824 fmovs %f0,%f0
825 subcc $rounds,1,$rounds !
826 ldx [$tbl+$acc15],$acc15
827 bz,a,pn %icc,.Ldec_last
828 add $tbl,2048,$rounds
829
830 srlx $acc1,8,$acc1
831 xor $acc0,$t0,$t0
832 ld [$key+0],$s0
833 fmovs %f0,%f0
834 srlx $acc2,16,$acc2 !
835 xor $acc1,$t0,$t0
836 ld [$key+4],$s1
837 srlx $acc3,24,$acc3
838 xor $acc2,$t0,$t0
839 ld [$key+8],$s2
840 srlx $acc5,8,$acc5
841 xor $acc3,$t0,$t0
842 ld [$key+12],$s3 !
843 srlx $acc6,16,$acc6
844 xor $acc4,$t1,$t1
845 fmovs %f0,%f0
846 srlx $acc7,24,$acc7
847 xor $acc5,$t1,$t1
848 srlx $acc9,8,$acc9
849 xor $acc6,$t1,$t1
850 srlx $acc10,16,$acc10 !
851 xor $acc7,$t1,$t1
852 srlx $acc11,24,$acc11
853 xor $acc8,$t2,$t2
854 srlx $acc13,8,$acc13
855 xor $acc9,$t2,$t2
856 srlx $acc14,16,$acc14
857 xor $acc10,$t2,$t2
858 srlx $acc15,24,$acc15 !
859 xor $acc11,$t2,$t2
860 xor $acc12,$acc14,$acc14
861 xor $acc13,$t3,$t3
862 srl $t0,21,$acc0
863 xor $acc14,$t3,$t3
864 xor $acc15,$t3,$t3
865 srl $t3,13,$acc1
866
867 and $acc0,2040,$acc0 !
868 srl $t2,5,$acc2
869 and $acc1,2040,$acc1
870 ldx [$tbl+$acc0],$acc0
871 sll $t1,3,$acc3
872 and $acc2,2040,$acc2
873 ldx [$tbl+$acc1],$acc1
874 fmovs %f0,%f0
875 srl $t1,21,$acc4 !
876 and $acc3,2040,$acc3
877 ldx [$tbl+$acc2],$acc2
878 srl $t0,13,$acc5
879 and $acc4,2040,$acc4
880 ldx [$tbl+$acc3],$acc3
881 srl $t3,5,$acc6
882 and $acc5,2040,$acc5
883 ldx [$tbl+$acc4],$acc4 !
884 sll $t2,3,$acc7
885 and $acc6,2040,$acc6
886 ldx [$tbl+$acc5],$acc5
887 srl $t2,21,$acc8
888 and $acc7,2040,$acc7
889 ldx [$tbl+$acc6],$acc6
890 fmovs %f0,%f0
891 srl $t1,13,$acc9 !
892 and $acc8,2040,$acc8
893 ldx [$tbl+$acc7],$acc7
894 srl $t0,5,$acc10
895 and $acc9,2040,$acc9
896 ldx [$tbl+$acc8],$acc8
897 sll $t3,3,$acc11
898 and $acc10,2040,$acc10
899 ldx [$tbl+$acc9],$acc9 !
900 srl $t3,21,$acc12
901 and $acc11,2040,$acc11
902 ldx [$tbl+$acc10],$acc10
903 srl $t2,13,$acc13
904 and $acc12,2040,$acc12
905 ldx [$tbl+$acc11],$acc11
906 fmovs %f0,%f0
907 srl $t1,5,$acc14 !
908 and $acc13,2040,$acc13
909 ldx [$tbl+$acc12],$acc12
910 sll $t0,3,$acc15
911 and $acc14,2040,$acc14
912 ldx [$tbl+$acc13],$acc13
913 srlx $acc1,8,$acc1
914 and $acc15,2040,$acc15
915 ldx [$tbl+$acc14],$acc14 !
916
917 srlx $acc2,16,$acc2
918 xor $acc0,$s0,$s0
919 ldx [$tbl+$acc15],$acc15
920 srlx $acc3,24,$acc3
921 xor $acc1,$s0,$s0
922 ld [$key+16],$t0
923 fmovs %f0,%f0
924 srlx $acc5,8,$acc5 !
925 xor $acc2,$s0,$s0
926 ld [$key+20],$t1
927 srlx $acc6,16,$acc6
928 xor $acc3,$s0,$s0
929 ld [$key+24],$t2
930 srlx $acc7,24,$acc7
931 xor $acc4,$s1,$s1
932 ld [$key+28],$t3 !
933 srlx $acc9,8,$acc9
934 xor $acc5,$s1,$s1
935 ldx [$tbl+2048+0],%g0 ! prefetch td4
936 srlx $acc10,16,$acc10
937 xor $acc6,$s1,$s1
938 ldx [$tbl+2048+32],%g0 ! prefetch td4
939 srlx $acc11,24,$acc11
940 xor $acc7,$s1,$s1
941 ldx [$tbl+2048+64],%g0 ! prefetch td4
942 srlx $acc13,8,$acc13
943 xor $acc8,$s2,$s2
944 ldx [$tbl+2048+96],%g0 ! prefetch td4
945 srlx $acc14,16,$acc14 !
946 xor $acc9,$s2,$s2
947 ldx [$tbl+2048+128],%g0 ! prefetch td4
948 srlx $acc15,24,$acc15
949 xor $acc10,$s2,$s2
950 ldx [$tbl+2048+160],%g0 ! prefetch td4
951 srl $s0,21,$acc0
952 xor $acc11,$s2,$s2
953 ldx [$tbl+2048+192],%g0 ! prefetch td4
954 xor $acc12,$acc14,$acc14
955 xor $acc13,$s3,$s3
956 ldx [$tbl+2048+224],%g0 ! prefetch td4
957 and $acc0,2040,$acc0 !
958 xor $acc14,$s3,$s3
959 xor $acc15,$s3,$s3
960 ba .Ldec_loop
961 srl $s3,13,$acc1
962
963.align 32
964.Ldec_last:
965 srlx $acc1,8,$acc1 !
966 xor $acc0,$t0,$t0
967 ld [$key+0],$s0
968 srlx $acc2,16,$acc2
969 xor $acc1,$t0,$t0
970 ld [$key+4],$s1
971 srlx $acc3,24,$acc3
972 xor $acc2,$t0,$t0
973 ld [$key+8],$s2 !
974 srlx $acc5,8,$acc5
975 xor $acc3,$t0,$t0
976 ld [$key+12],$s3
977 srlx $acc6,16,$acc6
978 xor $acc4,$t1,$t1
979 srlx $acc7,24,$acc7
980 xor $acc5,$t1,$t1
981 srlx $acc9,8,$acc9 !
982 xor $acc6,$t1,$t1
983 srlx $acc10,16,$acc10
984 xor $acc7,$t1,$t1
985 srlx $acc11,24,$acc11
986 xor $acc8,$t2,$t2
987 srlx $acc13,8,$acc13
988 xor $acc9,$t2,$t2
989 srlx $acc14,16,$acc14 !
990 xor $acc10,$t2,$t2
991 srlx $acc15,24,$acc15
992 xor $acc11,$t2,$t2
993 xor $acc12,$acc14,$acc14
994 xor $acc13,$t3,$t3
995 srl $t0,24,$acc0
996 xor $acc14,$t3,$t3
997 xor $acc15,$t3,$t3 !
998 srl $t3,16,$acc1
999
1000 srl $t2,8,$acc2
1001 and $acc1,255,$acc1
1002 ldub [$rounds+$acc0],$acc0
1003 srl $t1,24,$acc4
1004 and $acc2,255,$acc2
1005 ldub [$rounds+$acc1],$acc1
1006 srl $t0,16,$acc5 !
1007 and $t1,255,$acc3
1008 ldub [$rounds+$acc2],$acc2
1009 ldub [$rounds+$acc3],$acc3
1010 srl $t3,8,$acc6
1011 and $acc5,255,$acc5
1012 ldub [$rounds+$acc4],$acc4
1013 fmovs %f0,%f0
1014 srl $t2,24,$acc8 !
1015 and $acc6,255,$acc6
1016 ldub [$rounds+$acc5],$acc5
1017 srl $t1,16,$acc9
1018 and $t2,255,$acc7
1019 ldub [$rounds+$acc6],$acc6
1020 ldub [$rounds+$acc7],$acc7
1021 fmovs %f0,%f0
1022 srl $t0,8,$acc10 !
1023 and $acc9,255,$acc9
1024 ldub [$rounds+$acc8],$acc8
1025 srl $t3,24,$acc12
1026 and $acc10,255,$acc10
1027 ldub [$rounds+$acc9],$acc9
1028 srl $t2,16,$acc13
1029 and $t3,255,$acc11
1030 ldub [$rounds+$acc10],$acc10 !
1031 srl $t1,8,$acc14
1032 and $acc13,255,$acc13
1033 ldub [$rounds+$acc11],$acc11
1034 ldub [$rounds+$acc12],$acc12
1035 and $acc14,255,$acc14
1036 ldub [$rounds+$acc13],$acc13
1037 and $t0,255,$acc15
1038 ldub [$rounds+$acc14],$acc14 !
1039
1040 sll $acc0,24,$acc0
1041 xor $acc3,$s0,$s0
1042 ldub [$rounds+$acc15],$acc15
1043 sll $acc1,16,$acc1
1044 xor $acc0,$s0,$s0
1045 ldx [%sp+$bias+$frame+0],%i7 ! restore return address
1046 fmovs %f0,%f0
1047 sll $acc2,8,$acc2 !
1048 xor $acc1,$s0,$s0
1049 sll $acc4,24,$acc4
1050 xor $acc2,$s0,$s0
1051 sll $acc5,16,$acc5
1052 xor $acc7,$s1,$s1
1053 sll $acc6,8,$acc6
1054 xor $acc4,$s1,$s1
1055 sll $acc8,24,$acc8 !
1056 xor $acc5,$s1,$s1
1057 sll $acc9,16,$acc9
1058 xor $acc11,$s2,$s2
1059 sll $acc10,8,$acc10
1060 xor $acc6,$s1,$s1
1061 sll $acc12,24,$acc12
1062 xor $acc8,$s2,$s2
1063 sll $acc13,16,$acc13 !
1064 xor $acc9,$s2,$s2
1065 sll $acc14,8,$acc14
1066 xor $acc10,$s2,$s2
1067 xor $acc12,$acc14,$acc14
1068 xor $acc13,$s3,$s3
1069 xor $acc14,$s3,$s3
1070 xor $acc15,$s3,$s3
1071
1072 ret
1073 restore
1074.type _sparcv9_AES_decrypt,#function
1075.size _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1076
1077.align 32
1078.globl aes_decrypt_internal
1079aes_decrypt_internal:
1080 save %sp,-$frame,%sp
1081#ifdef __PIC__
1082 sethi %hi(_GLOBAL_OFFSET_TABLE_-4), %o5
1083 rd %pc, %o4
1084 or %o5, %lo(_GLOBAL_OFFSET_TABLE_+4), %o5
1085 add %o5, %o4, %o5
1086#endif
1087
1088 or %i0,%i1,%g1
1089 andcc %g1,3,%g0
1090 bnz,pn %xcc,.Lunaligned_dec
1091 nop
1092
1093 ld [%i0+0],%o0
1094 ld [%i0+4],%o1
1095 ld [%i0+8],%o2
1096 ld [%i0+12],%o3
1097
1098#ifdef __PIC__
1099 set AES_Td, %o4
1100 ldx [%o4+%o5], %o4
1101#else
1102 set AES_Td, %o4
1103#endif
1104 call _sparcv9_AES_decrypt
1105 mov %i2,%o5
1106
1107 st %o0,[%i1+0]
1108 st %o1,[%i1+4]
1109 st %o2,[%i1+8]
1110 st %o3,[%i1+12]
1111
1112 ret
1113 restore
1114
1115.align 32
1116.Lunaligned_dec:
1117 ldub [%i0+0],%l0
1118 ldub [%i0+1],%l1
1119 ldub [%i0+2],%l2
1120
1121 sll %l0,24,%l0
1122 ldub [%i0+3],%l3
1123 sll %l1,16,%l1
1124 ldub [%i0+4],%l4
1125 sll %l2,8,%l2
1126 or %l1,%l0,%l0
1127 ldub [%i0+5],%l5
1128 sll %l4,24,%l4
1129 or %l3,%l2,%l2
1130 ldub [%i0+6],%l6
1131 sll %l5,16,%l5
1132 or %l0,%l2,%o0
1133 ldub [%i0+7],%l7
1134
1135 sll %l6,8,%l6
1136 or %l5,%l4,%l4
1137 ldub [%i0+8],%l0
1138 or %l7,%l6,%l6
1139 ldub [%i0+9],%l1
1140 or %l4,%l6,%o1
1141 ldub [%i0+10],%l2
1142
1143 sll %l0,24,%l0
1144 ldub [%i0+11],%l3
1145 sll %l1,16,%l1
1146 ldub [%i0+12],%l4
1147 sll %l2,8,%l2
1148 or %l1,%l0,%l0
1149 ldub [%i0+13],%l5
1150 sll %l4,24,%l4
1151 or %l3,%l2,%l2
1152 ldub [%i0+14],%l6
1153 sll %l5,16,%l5
1154 or %l0,%l2,%o2
1155 ldub [%i0+15],%l7
1156
1157 sll %l6,8,%l6
1158 or %l5,%l4,%l4
1159 or %l7,%l6,%l6
1160 or %l4,%l6,%o3
1161
1162#ifdef __PIC__
1163 set AES_Td, %o4
1164 ldx [%o4+%o5], %o4
1165#else
1166 set AES_Td, %o4
1167#endif
1168 call _sparcv9_AES_decrypt
1169 mov %i2,%o5
1170
1171 srl %o0,24,%l0
1172 srl %o0,16,%l1
1173 stb %l0,[%i1+0]
1174 srl %o0,8,%l2
1175 stb %l1,[%i1+1]
1176 stb %l2,[%i1+2]
1177 srl %o1,24,%l4
1178 stb %o0,[%i1+3]
1179
1180 srl %o1,16,%l5
1181 stb %l4,[%i1+4]
1182 srl %o1,8,%l6
1183 stb %l5,[%i1+5]
1184 stb %l6,[%i1+6]
1185 srl %o2,24,%l0
1186 stb %o1,[%i1+7]
1187
1188 srl %o2,16,%l1
1189 stb %l0,[%i1+8]
1190 srl %o2,8,%l2
1191 stb %l1,[%i1+9]
1192 stb %l2,[%i1+10]
1193 srl %o3,24,%l4
1194 stb %o2,[%i1+11]
1195
1196 srl %o3,16,%l5
1197 stb %l4,[%i1+12]
1198 srl %o3,8,%l6
1199 stb %l5,[%i1+13]
1200 stb %l6,[%i1+14]
1201 stb %o3,[%i1+15]
1202
1203 ret
1204 restore
1205.type aes_decrypt_internal,#function
1206.size aes_decrypt_internal,(.-aes_decrypt_internal)
1207___
1208
1209# fmovs instructions substituting for FP nops were originally added
1210# to meet specific instruction alignment requirements to maximize ILP.
1211# As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have
1212# an undesired effect, so just omit them and sacrifice a fraction of a
1213# percent in performance...
1214$code =~ s/fmovs.*$//gm;
1215
1216print $code;
1217close STDOUT; # ensure flush
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
deleted file mode 100755
index 324c4a2be2..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl
+++ /dev/null
@@ -1,2834 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Version 2.1.
11#
12# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
13# an Opteron 240 CPU], plus all the bells-n-whistles from the 32-bit
14# version [you'll notice a lot of resemblance], such as compressed
15# S-boxes in little-endian byte order, prefetch of these tables in CBC
16# mode, as well as avoiding L1 cache aliasing between the stack frame,
17# the key schedule and the already mentioned tables, compressed Td4...
18#
19# Performance in number of cycles per processed byte for 128-bit key:
20#
21# ECB encrypt ECB decrypt CBC large chunk
22# AMD64 33 41 13.0
23# EM64T 38 59 18.6(*)
24# Core 2 30 43 14.5(*)
25#
26# (*) with hyper-threading off
27
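
[Editor's note] "Compressed S-boxes in little-endian byte order" is the same doubled-entry layout as in aes-sparcv9.pl, exploited here through byte-granular addressing: _data_word below stores each 32-bit word twice per 8-byte entry, and the 0(...), 1(...), 2(...), 3(...) displacements in encvert read that word rotated right by 0, 8, 16 or 24 bits on little-endian x86. A small C model (my reconstruction from the addressing, not an API of the script):

    #include <stdint.h>
    #include <string.h>

    /* Entry i is the word w stored twice (8 bytes).  On little-endian
     * hardware an unaligned 4-byte load at offset k within the entry
     * returns w rotated right by 8*k, so offsets 0..3 stand in for the
     * four classic tables. */
    static uint32_t
    te_rot(const unsigned char *te, unsigned idx, unsigned k)
    {
    	uint32_t w;

    	memcpy(&w, te + 8 * idx + k, sizeof(w));
    	return w;
    }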
28$flavour = shift;
29$output = shift;
30if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
31
32$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
33
34$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
35( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
36( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
37die "can't locate x86_64-xlate.pl";
38
39open OUT,"| \"$^X\" $xlate $flavour $output";
40*STDOUT=*OUT;
41
42$verticalspin=1; # unlike 32-bit version $verticalspin performs
43 # ~15% better on both AMD and Intel cores
44$speed_limit=512; # see aes-586.pl for details
45
46$code=".text\n";
47
48$s0="%eax";
49$s1="%ebx";
50$s2="%ecx";
51$s3="%edx";
52$acc0="%esi"; $mask80="%rsi";
53$acc1="%edi"; $maskfe="%rdi";
54$acc2="%ebp"; $mask1b="%rbp";
55$inp="%r8";
56$out="%r9";
57$t0="%r10d";
58$t1="%r11d";
59$t2="%r12d";
60$rnds="%r13d";
61$sbox="%r14";
62$key="%r15";
63
64sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }
65sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
66 $r =~ s/%[er]([sd]i)/%\1l/;
67 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
68sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
69 $r =~ s/%r([0-9]+)/%r\1d/; $r; }
70sub _data_word()
71{ my $i;
72 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
73}
74sub data_word()
75{ my $i;
76 my $last=pop(@_);
77 $code.=".long\t";
78 while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
79 $code.=sprintf"0x%08x\n",$last;
80}
81
82sub data_byte()
83{ my $i;
84 my $last=pop(@_);
85 $code.=".byte\t";
86 while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
87 $code.=sprintf"0x%02x\n",$last&0xff;
88}
89
90sub encvert()
91{ my $t3="%r8d"; # zaps $inp!
92
93$code.=<<___;
94 # favor 3-way issue Opteron pipeline...
95 movzb `&lo("$s0")`,$acc0
96 movzb `&lo("$s1")`,$acc1
97 movzb `&lo("$s2")`,$acc2
98 mov 0($sbox,$acc0,8),$t0
99 mov 0($sbox,$acc1,8),$t1
100 mov 0($sbox,$acc2,8),$t2
101
102 movzb `&hi("$s1")`,$acc0
103 movzb `&hi("$s2")`,$acc1
104 movzb `&lo("$s3")`,$acc2
105 xor 3($sbox,$acc0,8),$t0
106 xor 3($sbox,$acc1,8),$t1
107 mov 0($sbox,$acc2,8),$t3
108
109 movzb `&hi("$s3")`,$acc0
110 shr \$16,$s2
111 movzb `&hi("$s0")`,$acc2
112 xor 3($sbox,$acc0,8),$t2
113 shr \$16,$s3
114 xor 3($sbox,$acc2,8),$t3
115
116 shr \$16,$s1
117 lea 16($key),$key
118 shr \$16,$s0
119
120 movzb `&lo("$s2")`,$acc0
121 movzb `&lo("$s3")`,$acc1
122 movzb `&lo("$s0")`,$acc2
123 xor 2($sbox,$acc0,8),$t0
124 xor 2($sbox,$acc1,8),$t1
125 xor 2($sbox,$acc2,8),$t2
126
127 movzb `&hi("$s3")`,$acc0
128 movzb `&hi("$s0")`,$acc1
129 movzb `&lo("$s1")`,$acc2
130 xor 1($sbox,$acc0,8),$t0
131 xor 1($sbox,$acc1,8),$t1
132 xor 2($sbox,$acc2,8),$t3
133
134 mov 12($key),$s3
135 movzb `&hi("$s1")`,$acc1
136 movzb `&hi("$s2")`,$acc2
137 mov 0($key),$s0
138 xor 1($sbox,$acc1,8),$t2
139 xor 1($sbox,$acc2,8),$t3
140
141 mov 4($key),$s1
142 mov 8($key),$s2
143 xor $t0,$s0
144 xor $t1,$s1
145 xor $t2,$s2
146 xor $t3,$s3
147___
148}
149
150sub enclastvert()
151{ my $t3="%r8d"; # zaps $inp!
152
153$code.=<<___;
154 movzb `&lo("$s0")`,$acc0
155 movzb `&lo("$s1")`,$acc1
156 movzb `&lo("$s2")`,$acc2
157 movzb 2($sbox,$acc0,8),$t0
158 movzb 2($sbox,$acc1,8),$t1
159 movzb 2($sbox,$acc2,8),$t2
160
161 movzb `&lo("$s3")`,$acc0
162 movzb `&hi("$s1")`,$acc1
163 movzb `&hi("$s2")`,$acc2
164 movzb 2($sbox,$acc0,8),$t3
165 mov 0($sbox,$acc1,8),$acc1 #$t0
166 mov 0($sbox,$acc2,8),$acc2 #$t1
167
168 and \$0x0000ff00,$acc1
169 and \$0x0000ff00,$acc2
170
171 xor $acc1,$t0
172 xor $acc2,$t1
173 shr \$16,$s2
174
175 movzb `&hi("$s3")`,$acc0
176 movzb `&hi("$s0")`,$acc1
177 shr \$16,$s3
178 mov 0($sbox,$acc0,8),$acc0 #$t2
179 mov 0($sbox,$acc1,8),$acc1 #$t3
180
181 and \$0x0000ff00,$acc0
182 and \$0x0000ff00,$acc1
183 shr \$16,$s1
184 xor $acc0,$t2
185 xor $acc1,$t3
186 shr \$16,$s0
187
188 movzb `&lo("$s2")`,$acc0
189 movzb `&lo("$s3")`,$acc1
190 movzb `&lo("$s0")`,$acc2
191 mov 0($sbox,$acc0,8),$acc0 #$t0
192 mov 0($sbox,$acc1,8),$acc1 #$t1
193 mov 0($sbox,$acc2,8),$acc2 #$t2
194
195 and \$0x00ff0000,$acc0
196 and \$0x00ff0000,$acc1
197 and \$0x00ff0000,$acc2
198
199 xor $acc0,$t0
200 xor $acc1,$t1
201 xor $acc2,$t2
202
203 movzb `&lo("$s1")`,$acc0
204 movzb `&hi("$s3")`,$acc1
205 movzb `&hi("$s0")`,$acc2
206 mov 0($sbox,$acc0,8),$acc0 #$t3
207 mov 2($sbox,$acc1,8),$acc1 #$t0
208 mov 2($sbox,$acc2,8),$acc2 #$t1
209
210 and \$0x00ff0000,$acc0
211 and \$0xff000000,$acc1
212 and \$0xff000000,$acc2
213
214 xor $acc0,$t3
215 xor $acc1,$t0
216 xor $acc2,$t1
217
218 movzb `&hi("$s1")`,$acc0
219 movzb `&hi("$s2")`,$acc1
220 mov 16+12($key),$s3
221 mov 2($sbox,$acc0,8),$acc0 #$t2
222 mov 2($sbox,$acc1,8),$acc1 #$t3
223 mov 16+0($key),$s0
224
225 and \$0xff000000,$acc0
226 and \$0xff000000,$acc1
227
228 xor $acc0,$t2
229 xor $acc1,$t3
230
231 mov 16+4($key),$s1
232 mov 16+8($key),$s2
233 xor $t0,$s0
234 xor $t1,$s1
235 xor $t2,$s2
236 xor $t3,$s3
237___
238}
239
240sub encstep()
241{ my ($i,@s) = @_;
242 my $tmp0=$acc0;
243 my $tmp1=$acc1;
244 my $tmp2=$acc2;
245 my $out=($t0,$t1,$t2,$s[0])[$i];
246
247 if ($i==3) {
248 $tmp0=$s[1];
249 $tmp1=$s[2];
250 $tmp2=$s[3];
251 }
252 $code.=" movzb ".&lo($s[0]).",$out\n";
253 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
254 $code.=" lea 16($key),$key\n" if ($i==0);
255
256 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
257 $code.=" mov 0($sbox,$out,8),$out\n";
258
259 $code.=" shr \$16,$tmp1\n";
260 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
261 $code.=" xor 3($sbox,$tmp0,8),$out\n";
262
263 $code.=" movzb ".&lo($tmp1).",$tmp1\n";
264 $code.=" shr \$24,$tmp2\n";
265 $code.=" xor 4*$i($key),$out\n";
266
267 $code.=" xor 2($sbox,$tmp1,8),$out\n";
268 $code.=" xor 1($sbox,$tmp2,8),$out\n";
269
270 $code.=" mov $t0,$s[1]\n" if ($i==3);
271 $code.=" mov $t1,$s[2]\n" if ($i==3);
272 $code.=" mov $t2,$s[3]\n" if ($i==3);
273 $code.="\n";
274}
275
276sub enclast()
277{ my ($i,@s)=@_;
278 my $tmp0=$acc0;
279 my $tmp1=$acc1;
280 my $tmp2=$acc2;
281 my $out=($t0,$t1,$t2,$s[0])[$i];
282
283 if ($i==3) {
284 $tmp0=$s[1];
285 $tmp1=$s[2];
286 $tmp2=$s[3];
287 }
288 $code.=" movzb ".&lo($s[0]).",$out\n";
289 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
290
291 $code.=" mov 2($sbox,$out,8),$out\n";
292 $code.=" shr \$16,$tmp1\n";
293 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
294
295 $code.=" and \$0x000000ff,$out\n";
296 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
297 $code.=" movzb ".&lo($tmp1).",$tmp1\n";
298 $code.=" shr \$24,$tmp2\n";
299
300 $code.=" mov 0($sbox,$tmp0,8),$tmp0\n";
301 $code.=" mov 0($sbox,$tmp1,8),$tmp1\n";
302 $code.=" mov 2($sbox,$tmp2,8),$tmp2\n";
303
304 $code.=" and \$0x0000ff00,$tmp0\n";
305 $code.=" and \$0x00ff0000,$tmp1\n";
306 $code.=" and \$0xff000000,$tmp2\n";
307
308 $code.=" xor $tmp0,$out\n";
309 $code.=" mov $t0,$s[1]\n" if ($i==3);
310 $code.=" xor $tmp1,$out\n";
311 $code.=" mov $t1,$s[2]\n" if ($i==3);
312 $code.=" xor $tmp2,$out\n";
313 $code.=" mov $t2,$s[3]\n" if ($i==3);
314 $code.="\n";
315}
316
317$code.=<<___;
318.type _x86_64_AES_encrypt,\@abi-omnipotent
319.align 16
320_x86_64_AES_encrypt:
321 _CET_ENDBR
322 xor 0($key),$s0 # xor with key
323 xor 4($key),$s1
324 xor 8($key),$s2
325 xor 12($key),$s3
326
327 mov 240($key),$rnds # load key->rounds
328 sub \$1,$rnds
329 jmp .Lenc_loop
330.align 16
331.Lenc_loop:
332___
333 if ($verticalspin) { &encvert(); }
334 else { &encstep(0,$s0,$s1,$s2,$s3);
335 &encstep(1,$s1,$s2,$s3,$s0);
336 &encstep(2,$s2,$s3,$s0,$s1);
337 &encstep(3,$s3,$s0,$s1,$s2);
338 }
339$code.=<<___;
340 sub \$1,$rnds
341 jnz .Lenc_loop
342___
343 if ($verticalspin) { &enclastvert(); }
344 else { &enclast(0,$s0,$s1,$s2,$s3);
345 &enclast(1,$s1,$s2,$s3,$s0);
346 &enclast(2,$s2,$s3,$s0,$s1);
347 &enclast(3,$s3,$s0,$s1,$s2);
348 $code.=<<___;
349 xor 16+0($key),$s0 # xor with key
350 xor 16+4($key),$s1
351 xor 16+8($key),$s2
352 xor 16+12($key),$s3
353___
354 }
355$code.=<<___;
356 retq
357.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
358___
359
360# it's possible to implement this by shifting tN by 8, filling the
361# least significant byte with a byte load and finally bswap-ing at
362# the end, but such a partial register load kills Core 2...
363sub enccompactvert()
364{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
365
366$code.=<<___;
367 movzb `&lo("$s0")`,$t0
368 movzb `&lo("$s1")`,$t1
369 movzb `&lo("$s2")`,$t2
370 movzb ($sbox,$t0,1),$t0
371 movzb ($sbox,$t1,1),$t1
372 movzb ($sbox,$t2,1),$t2
373
374 movzb `&lo("$s3")`,$t3
375 movzb `&hi("$s1")`,$acc0
376 movzb `&hi("$s2")`,$acc1
377 movzb ($sbox,$t3,1),$t3
378 movzb ($sbox,$acc0,1),$t4 #$t0
379 movzb ($sbox,$acc1,1),$t5 #$t1
380
381 movzb `&hi("$s3")`,$acc2
382 movzb `&hi("$s0")`,$acc0
383 shr \$16,$s2
384 movzb ($sbox,$acc2,1),$acc2 #$t2
385 movzb ($sbox,$acc0,1),$acc0 #$t3
386 shr \$16,$s3
387
388 movzb `&lo("$s2")`,$acc1
389 shl \$8,$t4
390 shl \$8,$t5
391 movzb ($sbox,$acc1,1),$acc1 #$t0
392 xor $t4,$t0
393 xor $t5,$t1
394
395 movzb `&lo("$s3")`,$t4
396 shr \$16,$s0
397 shr \$16,$s1
398 movzb `&lo("$s0")`,$t5
399 shl \$8,$acc2
400 shl \$8,$acc0
401 movzb ($sbox,$t4,1),$t4 #$t1
402 movzb ($sbox,$t5,1),$t5 #$t2
403 xor $acc2,$t2
404 xor $acc0,$t3
405
406 movzb `&lo("$s1")`,$acc2
407 movzb `&hi("$s3")`,$acc0
408 shl \$16,$acc1
409 movzb ($sbox,$acc2,1),$acc2 #$t3
410 movzb ($sbox,$acc0,1),$acc0 #$t0
411 xor $acc1,$t0
412
413 movzb `&hi("$s0")`,$acc1
414 shr \$8,$s2
415 shr \$8,$s1
416 movzb ($sbox,$acc1,1),$acc1 #$t1
417 movzb ($sbox,$s2,1),$s3 #$t3
418 movzb ($sbox,$s1,1),$s2 #$t2
419 shl \$16,$t4
420 shl \$16,$t5
421 shl \$16,$acc2
422 xor $t4,$t1
423 xor $t5,$t2
424 xor $acc2,$t3
425
426 shl \$24,$acc0
427 shl \$24,$acc1
428 shl \$24,$s3
429 xor $acc0,$t0
430 shl \$24,$s2
431 xor $acc1,$t1
432 mov $t0,$s0
433 mov $t1,$s1
434 xor $t2,$s2
435 xor $t3,$s3
436___
437}
438
439sub enctransform_ref()
440{ my $sn = shift;
441 my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
442
443$code.=<<___;
444 mov $sn,$acc
445 and \$0x80808080,$acc
446 mov $acc,$tmp
447 shr \$7,$tmp
448 lea ($sn,$sn),$r2
449 sub $tmp,$acc
450 and \$0xfefefefe,$r2
451 and \$0x1b1b1b1b,$acc
452 mov $sn,$tmp
453 xor $acc,$r2
454
455 xor $r2,$sn
456 rol \$24,$sn
457 xor $r2,$sn
458 ror \$16,$tmp
459 xor $tmp,$sn
460 ror \$8,$tmp
461 xor $tmp,$sn
462___
463}
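
[Editor's note] enctransform_ref is the scalar reference for the per-word MixColumns step: the 0x80808080/0xfefefefe/0x1b1b1b1b masks double four GF(2^8) bytes at once, and the rol/ror ladder folds in the rotated copies. In C, roughly (same doubling idiom as in the PPC sketch earlier; names are mine):

    #include <stdint.h>

    #define ROR(v, n)	(((v) >> (n)) | ((v) << (32 - (n))))

    static uint32_t
    xtime4(uint32_t s)			/* double 4 bytes in GF(2^8) */
    {
    	uint32_t hi = s & 0x80808080;
    	return ((s << 1) & 0xfefefefe) ^ ((hi - (hi >> 7)) & 0x1b1b1b1b);
    }

    /* One word of MixColumns, mirroring enctransform_ref's xor/rol/ror
     * chain: out = 2*s ^ ror8(3*s) ^ ror16(s) ^ ror24(s), lane-wise. */
    static uint32_t
    mix_column(uint32_t s)
    {
    	uint32_t x2 = xtime4(s);
    	return x2 ^ ROR(s ^ x2, 8) ^ ROR(s, 16) ^ ROR(s, 24);
    }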
464
465# unlike the decrypt case, it does not pay off to parallelize enctransform
466sub enctransform()
467{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
468
469$code.=<<___;
470 mov $s0,$acc0
471 mov $s1,$acc1
472 and \$0x80808080,$acc0
473 and \$0x80808080,$acc1
474 mov $acc0,$t0
475 mov $acc1,$t1
476 shr \$7,$t0
477 lea ($s0,$s0),$r20
478 shr \$7,$t1
479 lea ($s1,$s1),$r21
480 sub $t0,$acc0
481 sub $t1,$acc1
482 and \$0xfefefefe,$r20
483 and \$0xfefefefe,$r21
484 and \$0x1b1b1b1b,$acc0
485 and \$0x1b1b1b1b,$acc1
486 mov $s0,$t0
487 mov $s1,$t1
488 xor $acc0,$r20
489 xor $acc1,$r21
490
491 xor $r20,$s0
492 xor $r21,$s1
493 mov $s2,$acc0
494 mov $s3,$acc1
495 rol \$24,$s0
496 rol \$24,$s1
497 and \$0x80808080,$acc0
498 and \$0x80808080,$acc1
499 xor $r20,$s0
500 xor $r21,$s1
501 mov $acc0,$t2
502 mov $acc1,$t3
503 ror \$16,$t0
504 ror \$16,$t1
505 shr \$7,$t2
506 lea ($s2,$s2),$r20
507 xor $t0,$s0
508 xor $t1,$s1
509 shr \$7,$t3
510 lea ($s3,$s3),$r21
511 ror \$8,$t0
512 ror \$8,$t1
513 sub $t2,$acc0
514 sub $t3,$acc1
515 xor $t0,$s0
516 xor $t1,$s1
517
518 and \$0xfefefefe,$r20
519 and \$0xfefefefe,$r21
520 and \$0x1b1b1b1b,$acc0
521 and \$0x1b1b1b1b,$acc1
522 mov $s2,$t2
523 mov $s3,$t3
524 xor $acc0,$r20
525 xor $acc1,$r21
526
527 xor $r20,$s2
528 xor $r21,$s3
529 rol \$24,$s2
530 rol \$24,$s3
531 xor $r20,$s2
532 xor $r21,$s3
533 mov 0($sbox),$acc0 # prefetch Te4
534 ror \$16,$t2
535 ror \$16,$t3
536 mov 64($sbox),$acc1
537 xor $t2,$s2
538 xor $t3,$s3
539 mov 128($sbox),$r20
540 ror \$8,$t2
541 ror \$8,$t3
542 mov 192($sbox),$r21
543 xor $t2,$s2
544 xor $t3,$s3
545___
546}
547
548$code.=<<___;
549.type _x86_64_AES_encrypt_compact,\@abi-omnipotent
550.align 16
551_x86_64_AES_encrypt_compact:
552 _CET_ENDBR
553 lea 128($sbox),$inp # size optimization
554 mov 0-128($inp),$acc1 # prefetch Te4
555 mov 32-128($inp),$acc2
556 mov 64-128($inp),$t0
557 mov 96-128($inp),$t1
558 mov 128-128($inp),$acc1
559 mov 160-128($inp),$acc2
560 mov 192-128($inp),$t0
561 mov 224-128($inp),$t1
562 jmp .Lenc_loop_compact
563.align 16
564.Lenc_loop_compact:
565 xor 0($key),$s0 # xor with key
566 xor 4($key),$s1
567 xor 8($key),$s2
568 xor 12($key),$s3
569 lea 16($key),$key
570___
571 &enccompactvert();
572$code.=<<___;
573 cmp 16(%rsp),$key
574 je .Lenc_compact_done
575___
576 &enctransform();
577$code.=<<___;
578 jmp .Lenc_loop_compact
579.align 16
580.Lenc_compact_done:
581 xor 0($key),$s0
582 xor 4($key),$s1
583 xor 8($key),$s2
584 xor 12($key),$s3
585 retq
586.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
587___
588
589# void aes_encrypt_internal(const void *inp, void *out, const AES_KEY *key);
590$code.=<<___;
591.globl aes_encrypt_internal
592.type aes_encrypt_internal,\@function,3
593.align 16
594.globl asm_AES_encrypt
595.hidden asm_AES_encrypt
596asm_AES_encrypt:
597aes_encrypt_internal:
598 _CET_ENDBR
599 push %rbx
600 push %rbp
601 push %r12
602 push %r13
603 push %r14
604 push %r15
605
606 # allocate frame "above" key schedule
607 mov %rsp,%r10
608 lea -63(%rdx),%rcx # %rdx is key argument
609 and \$-64,%rsp
610 sub %rsp,%rcx
611 neg %rcx
612 and \$0x3c0,%rcx
613 sub %rcx,%rsp
614 sub \$32,%rsp
615
616 mov %rsi,16(%rsp) # save out
617 mov %r10,24(%rsp) # save real stack pointer
618.Lenc_prologue:
619
620 mov %rdx,$key
621 mov 240($key),$rnds # load rounds
622
623 mov 0(%rdi),$s0 # load input vector
624 mov 4(%rdi),$s1
625 mov 8(%rdi),$s2
626 mov 12(%rdi),$s3
627
628 shl \$4,$rnds
629 lea ($key,$rnds),%rbp
630 mov $key,(%rsp) # key schedule
631 mov %rbp,8(%rsp) # end of key schedule
632
633 # pick Te4 copy which can't "overlap" with stack frame or key schedule
634 lea .LAES_Te+2048(%rip),$sbox
635 lea 768(%rsp),%rbp
636 sub $sbox,%rbp
637 and \$0x300,%rbp
638 lea ($sbox,%rbp),$sbox
639
640 call _x86_64_AES_encrypt_compact
641
642 mov 16(%rsp),$out # restore out
643 mov 24(%rsp),%rsi # restore saved stack pointer
644 mov $s0,0($out) # write output vector
645 mov $s1,4($out)
646 mov $s2,8($out)
647 mov $s3,12($out)
648
649 mov (%rsi),%r15
650 mov 8(%rsi),%r14
651 mov 16(%rsi),%r13
652 mov 24(%rsi),%r12
653 mov 32(%rsi),%rbp
654 mov 40(%rsi),%rbx
655 lea 48(%rsi),%rsp
656.Lenc_epilogue:
657 ret
658.size aes_encrypt_internal,.-aes_encrypt_internal
659___
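
[Editor's note] The .Lenc_prologue arithmetic above is the anti-aliasing trick the header comment advertises: align the stack to 64 bytes, then retreat it by a key-dependent multiple of 64 (masked to 0x3c0) so the scratch frame never shares L1 cache sets with the key schedule; the later "and $0x300,%rbp" picks a Te4 copy with the same goal. A C paraphrase of the six stack instructions (helper name invented for illustration):

    #include <stdint.h>

    /* Paraphrase of the .Lenc_prologue stack adjustment: rsp is the
     * incoming stack pointer, key the key-schedule address (%rdx). */
    static uintptr_t
    place_frame(uintptr_t rsp, uintptr_t key)
    {
    	rsp &= ~(uintptr_t)63;			/* and  $-64,%rsp */
    	rsp -= (rsp - (key - 63)) & 0x3c0;	/* keep frame off the key's L1 sets */
    	return rsp - 32;			/* four 8-byte scratch slots */
    }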
660
661#------------------------------------------------------------------#
662
663sub decvert()
664{ my $t3="%r8d"; # zaps $inp!
665
666$code.=<<___;
667 # favor 3-way issue Opteron pipeline...
668 movzb `&lo("$s0")`,$acc0
669 movzb `&lo("$s1")`,$acc1
670 movzb `&lo("$s2")`,$acc2
671 mov 0($sbox,$acc0,8),$t0
672 mov 0($sbox,$acc1,8),$t1
673 mov 0($sbox,$acc2,8),$t2
674
675 movzb `&hi("$s3")`,$acc0
676 movzb `&hi("$s0")`,$acc1
677 movzb `&lo("$s3")`,$acc2
678 xor 3($sbox,$acc0,8),$t0
679 xor 3($sbox,$acc1,8),$t1
680 mov 0($sbox,$acc2,8),$t3
681
682 movzb `&hi("$s1")`,$acc0
683 shr \$16,$s0
684 movzb `&hi("$s2")`,$acc2
685 xor 3($sbox,$acc0,8),$t2
686 shr \$16,$s3
687 xor 3($sbox,$acc2,8),$t3
688
689 shr \$16,$s1
690 lea 16($key),$key
691 shr \$16,$s2
692
693 movzb `&lo("$s2")`,$acc0
694 movzb `&lo("$s3")`,$acc1
695 movzb `&lo("$s0")`,$acc2
696 xor 2($sbox,$acc0,8),$t0
697 xor 2($sbox,$acc1,8),$t1
698 xor 2($sbox,$acc2,8),$t2
699
700 movzb `&hi("$s1")`,$acc0
701 movzb `&hi("$s2")`,$acc1
702 movzb `&lo("$s1")`,$acc2
703 xor 1($sbox,$acc0,8),$t0
704 xor 1($sbox,$acc1,8),$t1
705 xor 2($sbox,$acc2,8),$t3
706
707 movzb `&hi("$s3")`,$acc0
708 mov 12($key),$s3
709 movzb `&hi("$s0")`,$acc2
710 xor 1($sbox,$acc0,8),$t2
711 mov 0($key),$s0
712 xor 1($sbox,$acc2,8),$t3
713
714 xor $t0,$s0
715 mov 4($key),$s1
716 mov 8($key),$s2
717 xor $t2,$s2
718 xor $t1,$s1
719 xor $t3,$s3
720___
721}
722
723sub declastvert()
724{ my $t3="%r8d"; # zaps $inp!
725
726$code.=<<___;
727 lea 2048($sbox),$sbox # size optimization
728 movzb `&lo("$s0")`,$acc0
729 movzb `&lo("$s1")`,$acc1
730 movzb `&lo("$s2")`,$acc2
731 movzb ($sbox,$acc0,1),$t0
732 movzb ($sbox,$acc1,1),$t1
733 movzb ($sbox,$acc2,1),$t2
734
735 movzb `&lo("$s3")`,$acc0
736 movzb `&hi("$s3")`,$acc1
737 movzb `&hi("$s0")`,$acc2
738 movzb ($sbox,$acc0,1),$t3
739 movzb ($sbox,$acc1,1),$acc1 #$t0
740 movzb ($sbox,$acc2,1),$acc2 #$t1
741
742 shl \$8,$acc1
743 shl \$8,$acc2
744
745 xor $acc1,$t0
746 xor $acc2,$t1
747 shr \$16,$s3
748
749 movzb `&hi("$s1")`,$acc0
750 movzb `&hi("$s2")`,$acc1
751 shr \$16,$s0
752 movzb ($sbox,$acc0,1),$acc0 #$t2
753 movzb ($sbox,$acc1,1),$acc1 #$t3
754
755 shl \$8,$acc0
756 shl \$8,$acc1
757 shr \$16,$s1
758 xor $acc0,$t2
759 xor $acc1,$t3
760 shr \$16,$s2
761
762 movzb `&lo("$s2")`,$acc0
763 movzb `&lo("$s3")`,$acc1
764 movzb `&lo("$s0")`,$acc2
765 movzb ($sbox,$acc0,1),$acc0 #$t0
766 movzb ($sbox,$acc1,1),$acc1 #$t1
767 movzb ($sbox,$acc2,1),$acc2 #$t2
768
769 shl \$16,$acc0
770 shl \$16,$acc1
771 shl \$16,$acc2
772
773 xor $acc0,$t0
774 xor $acc1,$t1
775 xor $acc2,$t2
776
777 movzb `&lo("$s1")`,$acc0
778 movzb `&hi("$s1")`,$acc1
779 movzb `&hi("$s2")`,$acc2
780 movzb ($sbox,$acc0,1),$acc0 #$t3
781 movzb ($sbox,$acc1,1),$acc1 #$t0
782 movzb ($sbox,$acc2,1),$acc2 #$t1
783
784 shl \$16,$acc0
785 shl \$24,$acc1
786 shl \$24,$acc2
787
788 xor $acc0,$t3
789 xor $acc1,$t0
790 xor $acc2,$t1
791
792 movzb `&hi("$s3")`,$acc0
793 movzb `&hi("$s0")`,$acc1
794 mov 16+12($key),$s3
795 movzb ($sbox,$acc0,1),$acc0 #$t2
796 movzb ($sbox,$acc1,1),$acc1 #$t3
797 mov 16+0($key),$s0
798
799 shl \$24,$acc0
800 shl \$24,$acc1
801
802 xor $acc0,$t2
803 xor $acc1,$t3
804
805 mov 16+4($key),$s1
806 mov 16+8($key),$s2
807 lea -2048($sbox),$sbox
808 xor $t0,$s0
809 xor $t1,$s1
810 xor $t2,$s2
811 xor $t3,$s3
812___
813}
814
815sub decstep()
816{ my ($i,@s) = @_;
817 my $tmp0=$acc0;
818 my $tmp1=$acc1;
819 my $tmp2=$acc2;
820 my $out=($t0,$t1,$t2,$s[0])[$i];
821
822 $code.=" mov $s[0],$out\n" if ($i!=3);
823 $tmp1=$s[2] if ($i==3);
824 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
825 $code.=" and \$0xFF,$out\n";
826
827 $code.=" mov 0($sbox,$out,8),$out\n";
828 $code.=" shr \$16,$tmp1\n";
829 $tmp2=$s[3] if ($i==3);
830 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
831
832 $tmp0=$s[1] if ($i==3);
833 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
834 $code.=" and \$0xFF,$tmp1\n";
835 $code.=" shr \$24,$tmp2\n";
836
837 $code.=" xor 3($sbox,$tmp0,8),$out\n";
838 $code.=" xor 2($sbox,$tmp1,8),$out\n";
839 $code.=" xor 1($sbox,$tmp2,8),$out\n";
840
841 $code.=" mov $t2,$s[1]\n" if ($i==3);
842 $code.=" mov $t1,$s[2]\n" if ($i==3);
843 $code.=" mov $t0,$s[3]\n" if ($i==3);
844 $code.="\n";
845}
846
847sub declast()
848{ my ($i,@s)=@_;
849 my $tmp0=$acc0;
850 my $tmp1=$acc1;
851 my $tmp2=$acc2;
852 my $out=($t0,$t1,$t2,$s[0])[$i];
853
854 $code.=" mov $s[0],$out\n" if ($i!=3);
855 $tmp1=$s[2] if ($i==3);
856 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
857 $code.=" and \$0xFF,$out\n";
858
859 $code.=" movzb 2048($sbox,$out,1),$out\n";
860 $code.=" shr \$16,$tmp1\n";
861 $tmp2=$s[3] if ($i==3);
862 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
863
864 $tmp0=$s[1] if ($i==3);
865 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
866 $code.=" and \$0xFF,$tmp1\n";
867 $code.=" shr \$24,$tmp2\n";
868
869 $code.=" movzb 2048($sbox,$tmp0,1),$tmp0\n";
870 $code.=" movzb 2048($sbox,$tmp1,1),$tmp1\n";
871 $code.=" movzb 2048($sbox,$tmp2,1),$tmp2\n";
872
873 $code.=" shl \$8,$tmp0\n";
874 $code.=" shl \$16,$tmp1\n";
875 $code.=" shl \$24,$tmp2\n";
876
877 $code.=" xor $tmp0,$out\n";
878 $code.=" mov $t2,$s[1]\n" if ($i==3);
879 $code.=" xor $tmp1,$out\n";
880 $code.=" mov $t1,$s[2]\n" if ($i==3);
881 $code.=" xor $tmp2,$out\n";
882 $code.=" mov $t0,$s[3]\n" if ($i==3);
883 $code.="\n";
884}
885
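# Note: decstep()/declast() above fetch from 8-byte-stride table entries and
# xor in unaligned reads at offsets 1..3. This only works because each Td
# word appears to be emitted twice back to back (see &_data_word), so a
# misaligned dword load yields a byte-rotated copy of the entry without any
# rol instruction.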
886$code.=<<___;
887.type _x86_64_AES_decrypt,\@abi-omnipotent
888.align 16
889_x86_64_AES_decrypt:
890 _CET_ENDBR
891 xor 0($key),$s0 # xor with key
892 xor 4($key),$s1
893 xor 8($key),$s2
894 xor 12($key),$s3
895
896 mov 240($key),$rnds # load key->rounds
897 sub \$1,$rnds
898 jmp .Ldec_loop
899.align 16
900.Ldec_loop:
901___
902 if ($verticalspin) { &decvert(); }
903 else { &decstep(0,$s0,$s3,$s2,$s1);
904 &decstep(1,$s1,$s0,$s3,$s2);
905 &decstep(2,$s2,$s1,$s0,$s3);
906 &decstep(3,$s3,$s2,$s1,$s0);
907 $code.=<<___;
908 lea 16($key),$key
909 xor 0($key),$s0 # xor with key
910 xor 4($key),$s1
911 xor 8($key),$s2
912 xor 12($key),$s3
913___
914 }
915$code.=<<___;
916 sub \$1,$rnds
917 jnz .Ldec_loop
918___
919 if ($verticalspin) { &declastvert(); }
920 else { &declast(0,$s0,$s3,$s2,$s1);
921 &declast(1,$s1,$s0,$s3,$s2);
922 &declast(2,$s2,$s1,$s0,$s3);
923 &declast(3,$s3,$s2,$s1,$s0);
924 $code.=<<___;
925 xor 16+0($key),$s0 # xor with key
926 xor 16+4($key),$s1
927 xor 16+8($key),$s2
928 xor 16+12($key),$s3
929___
930 }
931$code.=<<___;
932 retq
933.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
934___
935
936sub deccompactvert()
937{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
938
939$code.=<<___;
940 movzb `&lo("$s0")`,$t0
941 movzb `&lo("$s1")`,$t1
942 movzb `&lo("$s2")`,$t2
943 movzb ($sbox,$t0,1),$t0
944 movzb ($sbox,$t1,1),$t1
945 movzb ($sbox,$t2,1),$t2
946
947 movzb `&lo("$s3")`,$t3
948 movzb `&hi("$s3")`,$acc0
949 movzb `&hi("$s0")`,$acc1
950 movzb ($sbox,$t3,1),$t3
951 movzb ($sbox,$acc0,1),$t4 #$t0
952 movzb ($sbox,$acc1,1),$t5 #$t1
953
954 movzb `&hi("$s1")`,$acc2
955 movzb `&hi("$s2")`,$acc0
956 shr \$16,$s2
957 movzb ($sbox,$acc2,1),$acc2 #$t2
958 movzb ($sbox,$acc0,1),$acc0 #$t3
959 shr \$16,$s3
960
961 movzb `&lo("$s2")`,$acc1
962 shl \$8,$t4
963 shl \$8,$t5
964 movzb ($sbox,$acc1,1),$acc1 #$t0
965 xor $t4,$t0
966 xor $t5,$t1
967
968 movzb `&lo("$s3")`,$t4
969 shr \$16,$s0
970 shr \$16,$s1
971 movzb `&lo("$s0")`,$t5
972 shl \$8,$acc2
973 shl \$8,$acc0
974 movzb ($sbox,$t4,1),$t4 #$t1
975 movzb ($sbox,$t5,1),$t5 #$t2
976 xor $acc2,$t2
977 xor $acc0,$t3
978
979 movzb `&lo("$s1")`,$acc2
980 movzb `&hi("$s1")`,$acc0
981 shl \$16,$acc1
982 movzb ($sbox,$acc2,1),$acc2 #$t3
983 movzb ($sbox,$acc0,1),$acc0 #$t0
984 xor $acc1,$t0
985
986 movzb `&hi("$s2")`,$acc1
987 shl \$16,$t4
988 shl \$16,$t5
989 movzb ($sbox,$acc1,1),$s1 #$t1
990 xor $t4,$t1
991 xor $t5,$t2
992
993 movzb `&hi("$s3")`,$acc1
994 shr \$8,$s0
995 shl \$16,$acc2
996 movzb ($sbox,$acc1,1),$s2 #$t2
997 movzb ($sbox,$s0,1),$s3 #$t3
998 xor $acc2,$t3
999
1000 shl \$24,$acc0
1001 shl \$24,$s1
1002 shl \$24,$s2
1003 xor $acc0,$t0
1004 shl \$24,$s3
1005 xor $t1,$s1
1006 mov $t0,$s0
1007 xor $t2,$s2
1008 xor $t3,$s3
1009___
1010}
1011
1012# parallelized version! input is a pair of 64-bit values: %rax=s1.s0
1013# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
1014# %ecx=s2 and %edx=s3.
1015sub dectransform()
1016{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
1017 my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
1018 my $prefetch = shift;
1019
1020$code.=<<___;
1021 mov $tp10,$acc0
1022 mov $tp18,$acc8
1023 and $mask80,$acc0
1024 and $mask80,$acc8
1025 mov $acc0,$tp40
1026 mov $acc8,$tp48
1027 shr \$7,$tp40
1028 lea ($tp10,$tp10),$tp20
1029 shr \$7,$tp48
1030 lea ($tp18,$tp18),$tp28
1031 sub $tp40,$acc0
1032 sub $tp48,$acc8
1033 and $maskfe,$tp20
1034 and $maskfe,$tp28
1035 and $mask1b,$acc0
1036 and $mask1b,$acc8
1037 xor $tp20,$acc0
1038 xor $tp28,$acc8
1039 mov $acc0,$tp20
1040 mov $acc8,$tp28
1041
1042 and $mask80,$acc0
1043 and $mask80,$acc8
1044 mov $acc0,$tp80
1045 mov $acc8,$tp88
1046 shr \$7,$tp80
1047 lea ($tp20,$tp20),$tp40
1048 shr \$7,$tp88
1049 lea ($tp28,$tp28),$tp48
1050 sub $tp80,$acc0
1051 sub $tp88,$acc8
1052 and $maskfe,$tp40
1053 and $maskfe,$tp48
1054 and $mask1b,$acc0
1055 and $mask1b,$acc8
1056 xor $tp40,$acc0
1057 xor $tp48,$acc8
1058 mov $acc0,$tp40
1059 mov $acc8,$tp48
1060
1061 and $mask80,$acc0
1062 and $mask80,$acc8
1063 mov $acc0,$tp80
1064 mov $acc8,$tp88
1065 shr \$7,$tp80
1066 xor $tp10,$tp20 # tp2^=tp1
1067 shr \$7,$tp88
1068 xor $tp18,$tp28 # tp2^=tp1
1069 sub $tp80,$acc0
1070 sub $tp88,$acc8
1071 lea ($tp40,$tp40),$tp80
1072 lea ($tp48,$tp48),$tp88
1073 xor $tp10,$tp40 # tp4^=tp1
1074 xor $tp18,$tp48 # tp4^=tp1
1075 and $maskfe,$tp80
1076 and $maskfe,$tp88
1077 and $mask1b,$acc0
1078 and $mask1b,$acc8
1079 xor $acc0,$tp80
1080 xor $acc8,$tp88
1081
1082 xor $tp80,$tp10 # tp1^=tp8
1083 xor $tp88,$tp18 # tp1^=tp8
1084 xor $tp80,$tp20 # tp2^tp1^=tp8
1085 xor $tp88,$tp28 # tp2^tp1^=tp8
1086 mov $tp10,$acc0
1087 mov $tp18,$acc8
1088 xor $tp80,$tp40 # tp4^tp1^=tp8
1089 xor $tp88,$tp48 # tp4^tp1^=tp8
1090 shr \$32,$acc0
1091 shr \$32,$acc8
1092 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
1093 xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
1094 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
1095 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
1096 xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1097 xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1098
1099 rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
1100 rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
1101 xor `&LO("$tp80")`,`&LO("$tp10")`
1102 xor `&LO("$tp88")`,`&LO("$tp18")`
1103 shr \$32,$tp80
1104 shr \$32,$tp88
1105 xor `&LO("$tp80")`,`&LO("$acc0")`
1106 xor `&LO("$tp88")`,`&LO("$acc8")`
1107
1108 mov $tp20,$tp80
1109 mov $tp28,$tp88
1110 shr \$32,$tp80
1111 shr \$32,$tp88
1112 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
1113 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
1114 rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
1115 rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
1116 xor `&LO("$tp20")`,`&LO("$tp10")`
1117 xor `&LO("$tp28")`,`&LO("$tp18")`
1118 mov $tp40,$tp20
1119 mov $tp48,$tp28
1120 xor `&LO("$tp80")`,`&LO("$acc0")`
1121 xor `&LO("$tp88")`,`&LO("$acc8")`
1122
1123 `"mov 0($sbox),$mask80" if ($prefetch)`
1124 shr \$32,$tp20
1125 shr \$32,$tp28
1126 `"mov 64($sbox),$maskfe" if ($prefetch)`
1127 rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
1128 rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
1129 `"mov 128($sbox),$mask1b" if ($prefetch)`
1130 rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
1131 rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
1132 `"mov 192($sbox),$tp80" if ($prefetch)`
1133 xor `&LO("$tp40")`,`&LO("$tp10")`
1134 xor `&LO("$tp48")`,`&LO("$tp18")`
1135 `"mov 256($sbox),$tp88" if ($prefetch)`
1136 xor `&LO("$tp20")`,`&LO("$acc0")`
1137 xor `&LO("$tp28")`,`&LO("$acc8")`
1138___
1139}
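
# Sketch of the identity dectransform() above and deckey_ref() below
# implement (hypothetical helper, never called). The assembly chains the
# packed doubling from _ref_xtime_packed32() three times to get tp2, tp4 and
# tp8, working on two columns per 64-bit register pair, then recombines them
# with rotates into InvMixColumns; per 32-bit column that is:
sub _ref_inv_mix_column {
	my ($tp1) = @_;
	my $rot = sub { my ($x, $n) = @_;
			(($x << $n) | ($x >> (32 - $n))) & 0xffffffff };
	my $tp2 = _ref_xtime_packed32($tp1);		# 2*x
	my $tp4 = _ref_xtime_packed32($tp2);		# 4*x
	my $tp8 = _ref_xtime_packed32($tp4);		# 8*x
	my $tp9 = $tp8 ^ $tp1;				# 9*x
	my $tpb = $tp9 ^ $tp2;				# 0x0b*x
	my $tpd = $tp9 ^ $tp4;				# 0x0d*x
	my $tpe = $tp8 ^ $tp4 ^ $tp2;			# 0x0e*x
	return $tpe ^ $rot->($tp9, 8) ^ $rot->($tpb, 24) ^ $rot->($tpd, 16);
}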
1140
1141$code.=<<___;
1142.type _x86_64_AES_decrypt_compact,\@abi-omnipotent
1143.align 16
1144_x86_64_AES_decrypt_compact:
1145 _CET_ENDBR
1146 lea 128($sbox),$inp # size optimization
1147 mov 0-128($inp),$acc1 # prefetch Td4
1148 mov 32-128($inp),$acc2
1149 mov 64-128($inp),$t0
1150 mov 96-128($inp),$t1
1151 mov 128-128($inp),$acc1
1152 mov 160-128($inp),$acc2
1153 mov 192-128($inp),$t0
1154 mov 224-128($inp),$t1
1155 jmp .Ldec_loop_compact
1156
1157.align 16
1158.Ldec_loop_compact:
1159 xor 0($key),$s0 # xor with key
1160 xor 4($key),$s1
1161 xor 8($key),$s2
1162 xor 12($key),$s3
1163 lea 16($key),$key
1164___
1165 &deccompactvert();
1166$code.=<<___;
1167 cmp 16(%rsp),$key
1168 je .Ldec_compact_done
1169
1170 mov 256+0($sbox),$mask80
1171 shl \$32,%rbx
1172 shl \$32,%rdx
1173 mov 256+8($sbox),$maskfe
1174 or %rbx,%rax
1175 or %rdx,%rcx
1176 mov 256+16($sbox),$mask1b
1177___
1178 &dectransform(1);
1179$code.=<<___;
1180 jmp .Ldec_loop_compact
1181.align 16
1182.Ldec_compact_done:
1183 xor 0($key),$s0
1184 xor 4($key),$s1
1185 xor 8($key),$s2
1186 xor 12($key),$s3
1187 retq
1188.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
1189___
1190
1191# void aes_decrypt_internal(const void *inp, void *out, const AES_KEY *key);
1192$code.=<<___;
1193.globl aes_decrypt_internal
1194.type aes_decrypt_internal,\@function,3
1195.align 16
1196.globl asm_AES_decrypt
1197.hidden asm_AES_decrypt
1198asm_AES_decrypt:
1199aes_decrypt_internal:
1200 _CET_ENDBR
1201 push %rbx
1202 push %rbp
1203 push %r12
1204 push %r13
1205 push %r14
1206 push %r15
1207
1208 # allocate frame "above" key schedule
1209 mov %rsp,%r10
1210 lea -63(%rdx),%rcx # %rdx is key argument
1211 and \$-64,%rsp
1212 sub %rsp,%rcx
1213 neg %rcx
1214 and \$0x3c0,%rcx
1215 sub %rcx,%rsp
1216 sub \$32,%rsp
1217
1218 mov %rsi,16(%rsp) # save out
1219 mov %r10,24(%rsp) # save real stack pointer
1220.Ldec_prologue:
1221
1222 mov %rdx,$key
1223 mov 240($key),$rnds # load rounds
1224
1225 mov 0(%rdi),$s0 # load input vector
1226 mov 4(%rdi),$s1
1227 mov 8(%rdi),$s2
1228 mov 12(%rdi),$s3
1229
1230 shl \$4,$rnds
1231 lea ($key,$rnds),%rbp
1232 mov $key,(%rsp) # key schedule
1233 mov %rbp,8(%rsp) # end of key schedule
1234
1235 # pick Td4 copy which can't "overlap" with stack frame or key schedule
1236 lea .LAES_Td+2048(%rip),$sbox
1237 lea 768(%rsp),%rbp
1238 sub $sbox,%rbp
1239 and \$0x300,%rbp
1240 lea ($sbox,%rbp),$sbox
1241 shr \$3,%rbp # recall "magic" constants!
1242 add %rbp,$sbox
1243
1244 call _x86_64_AES_decrypt_compact
1245
1246 mov 16(%rsp),$out # restore out
1247 mov 24(%rsp),%rsi # restore saved stack pointer
1248 mov $s0,0($out) # write output vector
1249 mov $s1,4($out)
1250 mov $s2,8($out)
1251 mov $s3,12($out)
1252
1253 mov (%rsi),%r15
1254 mov 8(%rsi),%r14
1255 mov 16(%rsi),%r13
1256 mov 24(%rsi),%r12
1257 mov 32(%rsi),%rbp
1258 mov 40(%rsi),%rbx
1259 lea 48(%rsi),%rsp
1260.Ldec_epilogue:
1261 ret
1262.size aes_decrypt_internal,.-aes_decrypt_internal
1263___
1264#------------------------------------------------------------------#
1265
1266sub enckey()
1267{
1268$code.=<<___;
1269 movz %dl,%esi # rk[i]>>0
1270 movzb -128(%rbp,%rsi),%ebx
1271 movz %dh,%esi # rk[i]>>8
1272 shl \$24,%ebx
1273 xor %ebx,%eax
1274
1275 movzb -128(%rbp,%rsi),%ebx
1276 shr \$16,%edx
1277 movz %dl,%esi # rk[i]>>16
1278 xor %ebx,%eax
1279
1280 movzb -128(%rbp,%rsi),%ebx
1281 movz %dh,%esi # rk[i]>>24
1282 shl \$8,%ebx
1283 xor %ebx,%eax
1284
1285 movzb -128(%rbp,%rsi),%ebx
1286 shl \$16,%ebx
1287 xor %ebx,%eax
1288
1289 xor 1024-128(%rbp,%rcx,4),%eax # rcon
1290___
1291}
1292
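# For reference (hypothetical, not used by the generator): enckey() above
# emits one FIPS-197 key-schedule core step, SubWord(RotWord(w)) ^ Rcon, for
# words stored low byte first; @te4 stands in for the 256 Te4 S-box bytes.
sub _ref_key_schedule_core {
	my ($w, $rcon, @te4) = @_;
	my @b = map { ($w >> 8 * $_) & 0xff } 0 .. 3;	# b[0] = low byte
	my $sw = $te4[$b[1]]				# RotWord + SubWord,
	       | ($te4[$b[2]] <<  8)			# matching the \$24/\$8/\$16
	       | ($te4[$b[3]] << 16)			# lane shifts above
	       | ($te4[$b[0]] << 24);
	return $sw ^ $rcon;
}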
1293# int aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits,
1294# AES_KEY *key)
1295$code.=<<___;
1296.globl aes_set_encrypt_key_internal
1297.type aes_set_encrypt_key_internal,\@function,3
1298.align 16
1299aes_set_encrypt_key_internal:
1300 _CET_ENDBR
1301 push %rbx
1302 push %rbp
1303 push %r12 # redundant, but allows us to share the
1304 push %r13 # exception handler...
1305 push %r14
1306 push %r15
1307 sub \$8,%rsp
1308.Lenc_key_prologue:
1309
1310 call _x86_64_AES_set_encrypt_key
1311
1312 mov 8(%rsp),%r15
1313 mov 16(%rsp),%r14
1314 mov 24(%rsp),%r13
1315 mov 32(%rsp),%r12
1316 mov 40(%rsp),%rbp
1317 mov 48(%rsp),%rbx
1318 add \$56,%rsp
1319.Lenc_key_epilogue:
1320 ret
1321.size aes_set_encrypt_key_internal,.-aes_set_encrypt_key_internal
1322
1323.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
1324.align 16
1325_x86_64_AES_set_encrypt_key:
1326 _CET_ENDBR
1327 mov %esi,%ecx # %ecx=bits
1328 mov %rdi,%rsi # %rsi=userKey
1329 mov %rdx,%rdi # %rdi=key
1330
1331 test \$-1,%rsi
1332 jz .Lbadpointer
1333 test \$-1,%rdi
1334 jz .Lbadpointer
1335
1336 lea .LAES_Te(%rip),%rbp
1337 lea 2048+128(%rbp),%rbp
1338
1339 # prefetch Te4
1340 mov 0-128(%rbp),%eax
1341 mov 32-128(%rbp),%ebx
1342 mov 64-128(%rbp),%r8d
1343 mov 96-128(%rbp),%edx
1344 mov 128-128(%rbp),%eax
1345 mov 160-128(%rbp),%ebx
1346 mov 192-128(%rbp),%r8d
1347 mov 224-128(%rbp),%edx
1348
1349 cmp \$128,%ecx
1350 je .L10rounds
1351 cmp \$192,%ecx
1352 je .L12rounds
1353 cmp \$256,%ecx
1354 je .L14rounds
1355 mov \$-2,%rax # invalid number of bits
1356 jmp .Lexit
1357
1358.L10rounds:
1359 mov 0(%rsi),%rax # copy first 4 dwords
1360 mov 8(%rsi),%rdx
1361 mov %rax,0(%rdi)
1362 mov %rdx,8(%rdi)
1363
1364 shr \$32,%rdx
1365 xor %ecx,%ecx
1366 jmp .L10shortcut
1367.align 4
1368.L10loop:
1369 mov 0(%rdi),%eax # rk[0]
1370 mov 12(%rdi),%edx # rk[3]
1371.L10shortcut:
1372___
1373 &enckey ();
1374$code.=<<___;
1375 mov %eax,16(%rdi) # rk[4]
1376 xor 4(%rdi),%eax
1377 mov %eax,20(%rdi) # rk[5]
1378 xor 8(%rdi),%eax
1379 mov %eax,24(%rdi) # rk[6]
1380 xor 12(%rdi),%eax
1381 mov %eax,28(%rdi) # rk[7]
1382 add \$1,%ecx
1383 lea 16(%rdi),%rdi
1384 cmp \$10,%ecx
1385 jl .L10loop
1386
1387 movl \$10,80(%rdi) # set number of rounds
1388 xor %rax,%rax
1389 jmp .Lexit
1390
1391.L12rounds:
1392 mov 0(%rsi),%rax # copy first 6 dwords
1393 mov 8(%rsi),%rbx
1394 mov 16(%rsi),%rdx
1395 mov %rax,0(%rdi)
1396 mov %rbx,8(%rdi)
1397 mov %rdx,16(%rdi)
1398
1399 shr \$32,%rdx
1400 xor %ecx,%ecx
1401 jmp .L12shortcut
1402.align 4
1403.L12loop:
1404 mov 0(%rdi),%eax # rk[0]
1405 mov 20(%rdi),%edx # rk[5]
1406.L12shortcut:
1407___
1408 &enckey ();
1409$code.=<<___;
1410 mov %eax,24(%rdi) # rk[6]
1411 xor 4(%rdi),%eax
1412 mov %eax,28(%rdi) # rk[7]
1413 xor 8(%rdi),%eax
1414 mov %eax,32(%rdi) # rk[8]
1415 xor 12(%rdi),%eax
1416 mov %eax,36(%rdi) # rk[9]
1417
1418 cmp \$7,%ecx
1419 je .L12break
1420 add \$1,%ecx
1421
1422 xor 16(%rdi),%eax
1423 mov %eax,40(%rdi) # rk[10]
1424 xor 20(%rdi),%eax
1425 mov %eax,44(%rdi) # rk[11]
1426
1427 lea 24(%rdi),%rdi
1428 jmp .L12loop
1429.L12break:
1430 movl \$12,72(%rdi) # set number of rounds
1431 xor %rax,%rax
1432 jmp .Lexit
1433
1434.L14rounds:
1435 mov 0(%rsi),%rax # copy first 8 dwords
1436 mov 8(%rsi),%rbx
1437 mov 16(%rsi),%rcx
1438 mov 24(%rsi),%rdx
1439 mov %rax,0(%rdi)
1440 mov %rbx,8(%rdi)
1441 mov %rcx,16(%rdi)
1442 mov %rdx,24(%rdi)
1443
1444 shr \$32,%rdx
1445 xor %ecx,%ecx
1446 jmp .L14shortcut
1447.align 4
1448.L14loop:
1449 mov 0(%rdi),%eax # rk[0]
1450 mov 28(%rdi),%edx # rk[7]
1451.L14shortcut:
1452___
1453 &enckey ();
1454$code.=<<___;
1455 mov %eax,32(%rdi) # rk[8]
1456 xor 4(%rdi),%eax
1457 mov %eax,36(%rdi) # rk[9]
1458 xor 8(%rdi),%eax
1459 mov %eax,40(%rdi) # rk[10]
1460 xor 12(%rdi),%eax
1461 mov %eax,44(%rdi) # rk[11]
1462
1463 cmp \$6,%ecx
1464 je .L14break
1465 add \$1,%ecx
1466
1467 mov %eax,%edx
1468 mov 16(%rdi),%eax # rk[4]
1469 movz %dl,%esi # rk[11]>>0
1470 movzb -128(%rbp,%rsi),%ebx
1471 movz %dh,%esi # rk[11]>>8
1472 xor %ebx,%eax
1473
1474 movzb -128(%rbp,%rsi),%ebx
1475 shr \$16,%edx
1476 shl \$8,%ebx
1477 movz %dl,%esi # rk[11]>>16
1478 xor %ebx,%eax
1479
1480 movzb -128(%rbp,%rsi),%ebx
1481 movz %dh,%esi # rk[11]>>24
1482 shl \$16,%ebx
1483 xor %ebx,%eax
1484
1485 movzb -128(%rbp,%rsi),%ebx
1486 shl \$24,%ebx
1487 xor %ebx,%eax
1488
1489 mov %eax,48(%rdi) # rk[12]
1490 xor 20(%rdi),%eax
1491 mov %eax,52(%rdi) # rk[13]
1492 xor 24(%rdi),%eax
1493 mov %eax,56(%rdi) # rk[14]
1494 xor 28(%rdi),%eax
1495 mov %eax,60(%rdi) # rk[15]
1496
1497 lea 32(%rdi),%rdi
1498 jmp .L14loop
1499.L14break:
1500 movl \$14,48(%rdi) # set number of rounds
1501 xor %rax,%rax
1502 jmp .Lexit
1503
1504.Lbadpointer:
1505 mov \$-1,%rax
1506.Lexit:
1507 retq
1508.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
1509___
1510
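# Note: for 256-bit keys the .L14loop above also performs the extra FIPS-197
# step for Nk=8, a plain SubWord with no rotation and no rcon (hence the
# straight 0/8/16/24 lane shifts on rk[11], unlike enckey()); a hypothetical
# per-word sketch:
sub _ref_subword {
	my ($w, @te4) = @_;
	my $r = 0;
	$r |= $te4[($w >> 8 * $_) & 0xff] << 8 * $_ for 0 .. 3;
	return $r;
}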
1511sub deckey_ref()
1512{ my ($i,$ptr,$te,$td) = @_;
1513 my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
1514$code.=<<___;
1515 mov $i($ptr),$tp1
1516 mov $tp1,$acc
1517 and \$0x80808080,$acc
1518 mov $acc,$tp4
1519 shr \$7,$tp4
1520 lea 0($tp1,$tp1),$tp2
1521 sub $tp4,$acc
1522 and \$0xfefefefe,$tp2
1523 and \$0x1b1b1b1b,$acc
1524 xor $tp2,$acc
1525 mov $acc,$tp2
1526
1527 and \$0x80808080,$acc
1528 mov $acc,$tp8
1529 shr \$7,$tp8
1530 lea 0($tp2,$tp2),$tp4
1531 sub $tp8,$acc
1532 and \$0xfefefefe,$tp4
1533 and \$0x1b1b1b1b,$acc
1534 xor $tp1,$tp2 # tp2^tp1
1535 xor $tp4,$acc
1536 mov $acc,$tp4
1537
1538 and \$0x80808080,$acc
1539 mov $acc,$tp8
1540 shr \$7,$tp8
1541 sub $tp8,$acc
1542 lea 0($tp4,$tp4),$tp8
1543 xor $tp1,$tp4 # tp4^tp1
1544 and \$0xfefefefe,$tp8
1545 and \$0x1b1b1b1b,$acc
1546 xor $acc,$tp8
1547
1548 xor $tp8,$tp1 # tp1^tp8
1549 rol \$8,$tp1 # ROTATE(tp1^tp8,8)
1550 xor $tp8,$tp2 # tp2^tp1^tp8
1551 xor $tp8,$tp4 # tp4^tp1^tp8
1552 xor $tp2,$tp8
1553 xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
1554
1555 xor $tp8,$tp1
1556 rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24)
1557 xor $tp2,$tp1
1558 rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16)
1559 xor $tp4,$tp1
1560
1561 mov $tp1,$i($ptr)
1562___
1563}
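
# Note: deckey_ref() appears to be kept as the scalar reference for the
# transform above; aes_set_decrypt_key_internal below emits dectransform()
# rather than this routine when converting the encrypt schedule in place.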
1564
1565# int aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits,
1566# AES_KEY *key)
1567$code.=<<___;
1568.globl aes_set_decrypt_key_internal
1569.type aes_set_decrypt_key_internal,\@function,3
1570.align 16
1571aes_set_decrypt_key_internal:
1572 _CET_ENDBR
1573 push %rbx
1574 push %rbp
1575 push %r12
1576 push %r13
1577 push %r14
1578 push %r15
1579 push %rdx # save key schedule
1580.Ldec_key_prologue:
1581
1582 call _x86_64_AES_set_encrypt_key
1583 mov (%rsp),%r8 # restore key schedule
1584 cmp \$0,%eax
1585 jne .Labort
1586
1587 mov 240(%r8),%r14d # pull number of rounds
1588 xor %rdi,%rdi
1589 lea (%rdi,%r14d,4),%rcx
1590 mov %r8,%rsi
1591 lea (%r8,%rcx,4),%rdi # pointer to last chunk
1592.align 4
1593.Linvert:
1594 mov 0(%rsi),%rax
1595 mov 8(%rsi),%rbx
1596 mov 0(%rdi),%rcx
1597 mov 8(%rdi),%rdx
1598 mov %rax,0(%rdi)
1599 mov %rbx,8(%rdi)
1600 mov %rcx,0(%rsi)
1601 mov %rdx,8(%rsi)
1602 lea 16(%rsi),%rsi
1603 lea -16(%rdi),%rdi
1604 cmp %rsi,%rdi
1605 jne .Linvert
1606
1607 lea .LAES_Te+2048+1024(%rip),%rax # rcon
1608
1609 mov 40(%rax),$mask80
1610 mov 48(%rax),$maskfe
1611 mov 56(%rax),$mask1b
1612
1613 mov %r8,$key
1614 sub \$1,%r14d
1615.align 4
1616.Lpermute:
1617 lea 16($key),$key
1618 mov 0($key),%rax
1619 mov 8($key),%rcx
1620___
1621 &dectransform ();
1622$code.=<<___;
1623 mov %eax,0($key)
1624 mov %ebx,4($key)
1625 mov %ecx,8($key)
1626 mov %edx,12($key)
1627 sub \$1,%r14d
1628 jnz .Lpermute
1629
1630 xor %rax,%rax
1631.Labort:
1632 mov 8(%rsp),%r15
1633 mov 16(%rsp),%r14
1634 mov 24(%rsp),%r13
1635 mov 32(%rsp),%r12
1636 mov 40(%rsp),%rbp
1637 mov 48(%rsp),%rbx
1638 add \$56,%rsp
1639.Ldec_key_epilogue:
1640 ret
1641.size aes_set_decrypt_key_internal,.-aes_set_decrypt_key_internal
1642___
1643
1644# void aes_cbc_encrypt_internal(const unsigned char *inp, unsigned char *out,
1645# size_t length, const AES_KEY *key, unsigned char *ivp, const int enc);
1646{
1647# stack frame layout
1648# -8(%rsp) return address
1649my $keyp="0(%rsp)"; # one to pass as $key
1650my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds])
1651my $_rsp="16(%rsp)"; # saved %rsp
1652my $_inp="24(%rsp)"; # copy of 1st parameter, inp
1653my $_out="32(%rsp)"; # copy of 2nd parameter, out
1654my $_len="40(%rsp)"; # copy of 3rd parameter, length
1655my $_key="48(%rsp)"; # copy of 4th parameter, key
1656my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp
1657my $ivec="64(%rsp)"; # ivec[16]
1658my $aes_key="80(%rsp)"; # copy of aes_key
1659my $mark="80+240(%rsp)"; # copy of aes_key->rounds
1660
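# Sketch of the chaining both paths below implement (hypothetical helper,
# never called; blocks and iv are 16-byte strings, $enc_block stands in for
# one call into the block cipher, and ^ is Perl's bitwise string xor):
sub _ref_cbc_encrypt {
	my ($enc_block, $iv, @blocks) = @_;
	my @out;
	for my $p (@blocks) {
		$iv = $enc_block->($p ^ $iv);	# C[i] = E(P[i] ^ C[i-1])
		push @out, $iv;
	}
	return @out;
	# decryption inverts this: P[i] = D(C[i]) ^ C[i-1], as in the
	# DECRYPT paths below
}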
1661$code.=<<___;
1662.globl aes_cbc_encrypt_internal
1663.type aes_cbc_encrypt_internal,\@function,6
1664.align 16
1665.extern OPENSSL_ia32cap_P
1666.hidden OPENSSL_ia32cap_P
1667.globl asm_AES_cbc_encrypt
1668.hidden asm_AES_cbc_encrypt
1669asm_AES_cbc_encrypt:
1670aes_cbc_encrypt_internal:
1671 _CET_ENDBR
1672 cmp \$0,%rdx # check length
1673 je .Lcbc_epilogue
1674 pushfq
1675 push %rbx
1676 push %rbp
1677 push %r12
1678 push %r13
1679 push %r14
1680 push %r15
1681.Lcbc_prologue:
1682
1683 cld
1684 mov %r9d,%r9d # clear upper half of enc
1685
1686 lea .LAES_Te(%rip),$sbox
1687 cmp \$0,%r9
1688 jne .Lcbc_picked_te
1689 lea .LAES_Td(%rip),$sbox
1690.Lcbc_picked_te:
1691
1692 mov OPENSSL_ia32cap_P(%rip),%r10d
1693 cmp \$$speed_limit,%rdx
1694 jb .Lcbc_slow_prologue
1695 test \$15,%rdx
1696 jnz .Lcbc_slow_prologue
1697 bt \$IA32CAP_BIT0_HT,%r10d
1698 jc .Lcbc_slow_prologue
1699
1700 # allocate aligned stack frame...
1701 lea -88-248(%rsp),$key
1702 and \$-64,$key
1703
1704 # ... and make sure it doesn't alias with AES_T[ed] modulo 4096
1705 mov $sbox,%r10
1706 lea 2304($sbox),%r11
1707 mov $key,%r12
1708 and \$0xFFF,%r10 # s = $sbox&0xfff
1709 and \$0xFFF,%r11 # e = ($sbox+2304)&0xfff
1710 and \$0xFFF,%r12 # p = %rsp&0xfff
1711
1712 cmp %r11,%r12 # if (p >= e) %rsp -= (p - e);
1713 jb .Lcbc_te_break_out
1714 sub %r11,%r12
1715 sub %r12,$key
1716 jmp .Lcbc_te_ok
1717.Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz
1718 sub %r10,%r12
1719 and \$0xFFF,%r12
1720 add \$320,%r12
1721 sub %r12,$key
1722.align 4
1723.Lcbc_te_ok:
1724
1725 xchg %rsp,$key
1726 #add \$8,%rsp # reserve for return address!
1727 mov $key,$_rsp # save %rsp
1728.Lcbc_fast_body:
1729 mov %rdi,$_inp # save copy of inp
1730 mov %rsi,$_out # save copy of out
1731 mov %rdx,$_len # save copy of len
1732 mov %rcx,$_key # save copy of key
1733 mov %r8,$_ivp # save copy of ivp
1734 movl \$0,$mark # copy of aes_key->rounds = 0;
1735 mov %r8,%rbp # rearrange input arguments
1736 mov %r9,%rbx
1737 mov %rsi,$out
1738 mov %rdi,$inp
1739 mov %rcx,$key
1740
1741 mov 240($key),%eax # key->rounds
1742 # do we copy key schedule to stack?
1743 mov $key,%r10
1744 sub $sbox,%r10
1745 and \$0xfff,%r10
1746 cmp \$2304,%r10
1747 jb .Lcbc_do_ecopy
1748 cmp \$4096-248,%r10
1749 jb .Lcbc_skip_ecopy
1750.align 4
1751.Lcbc_do_ecopy:
1752 mov $key,%rsi
1753 lea $aes_key,%rdi
1754 lea $aes_key,$key
1755 mov \$240/8,%ecx
1756 .long 0x90A548F3 # rep movsq
1757 mov %eax,(%rdi) # copy aes_key->rounds
1758.Lcbc_skip_ecopy:
1759 mov $key,$keyp # save key pointer
1760
1761 mov \$18,%ecx
1762.align 4
1763.Lcbc_prefetch_te:
1764 mov 0($sbox),%r10
1765 mov 32($sbox),%r11
1766 mov 64($sbox),%r12
1767 mov 96($sbox),%r13
1768 lea 128($sbox),$sbox
1769 sub \$1,%ecx
1770 jnz .Lcbc_prefetch_te
1771 lea -2304($sbox),$sbox
1772
1773 cmp \$0,%rbx
1774 je .LFAST_DECRYPT
1775
1776#----------------------------- ENCRYPT -----------------------------#
1777 mov 0(%rbp),$s0 # load iv
1778 mov 4(%rbp),$s1
1779 mov 8(%rbp),$s2
1780 mov 12(%rbp),$s3
1781
1782.align 4
1783.Lcbc_fast_enc_loop:
1784 xor 0($inp),$s0
1785 xor 4($inp),$s1
1786 xor 8($inp),$s2
1787 xor 12($inp),$s3
1788 mov $keyp,$key # restore key
1789 mov $inp,$_inp # if ($verticalspin) save inp
1790
1791 call _x86_64_AES_encrypt
1792
1793 mov $_inp,$inp # if ($verticalspin) restore inp
1794 mov $_len,%r10
1795 mov $s0,0($out)
1796 mov $s1,4($out)
1797 mov $s2,8($out)
1798 mov $s3,12($out)
1799
1800 lea 16($inp),$inp
1801 lea 16($out),$out
1802 sub \$16,%r10
1803 test \$-16,%r10
1804 mov %r10,$_len
1805 jnz .Lcbc_fast_enc_loop
1806 mov $_ivp,%rbp # restore ivp
1807 mov $s0,0(%rbp) # save ivec
1808 mov $s1,4(%rbp)
1809 mov $s2,8(%rbp)
1810 mov $s3,12(%rbp)
1811
1812 jmp .Lcbc_fast_cleanup
1813
1814#----------------------------- DECRYPT -----------------------------#
1815.align 16
1816.LFAST_DECRYPT:
1817 cmp $inp,$out
1818 je .Lcbc_fast_dec_in_place
1819
1820 mov %rbp,$ivec
1821.align 4
1822.Lcbc_fast_dec_loop:
1823 mov 0($inp),$s0 # read input
1824 mov 4($inp),$s1
1825 mov 8($inp),$s2
1826 mov 12($inp),$s3
1827 mov $keyp,$key # restore key
1828 mov $inp,$_inp # if ($verticalspin) save inp
1829
1830 call _x86_64_AES_decrypt
1831
1832 mov $ivec,%rbp # load ivp
1833 mov $_inp,$inp # if ($verticalspin) restore inp
1834 mov $_len,%r10 # load len
1835 xor 0(%rbp),$s0 # xor iv
1836 xor 4(%rbp),$s1
1837 xor 8(%rbp),$s2
1838 xor 12(%rbp),$s3
1839 mov $inp,%rbp # current input, next iv
1840
1841 sub \$16,%r10
1842 mov %r10,$_len # update len
1843 mov %rbp,$ivec # update ivp
1844
1845 mov $s0,0($out) # write output
1846 mov $s1,4($out)
1847 mov $s2,8($out)
1848 mov $s3,12($out)
1849
1850 lea 16($inp),$inp
1851 lea 16($out),$out
1852 jnz .Lcbc_fast_dec_loop
1853 mov $_ivp,%r12 # load user ivp
1854 mov 0(%rbp),%r10 # load iv
1855 mov 8(%rbp),%r11
1856 mov %r10,0(%r12) # copy back to user
1857 mov %r11,8(%r12)
1858 jmp .Lcbc_fast_cleanup
1859
1860.align 16
1861.Lcbc_fast_dec_in_place:
1862 mov 0(%rbp),%r10 # copy iv to stack
1863 mov 8(%rbp),%r11
1864 mov %r10,0+$ivec
1865 mov %r11,8+$ivec
1866.align 4
1867.Lcbc_fast_dec_in_place_loop:
1868 mov 0($inp),$s0 # load input
1869 mov 4($inp),$s1
1870 mov 8($inp),$s2
1871 mov 12($inp),$s3
1872 mov $keyp,$key # restore key
1873 mov $inp,$_inp # if ($verticalspin) save inp
1874
1875 call _x86_64_AES_decrypt
1876
1877 mov $_inp,$inp # if ($verticalspin) restore inp
1878 mov $_len,%r10
1879 xor 0+$ivec,$s0
1880 xor 4+$ivec,$s1
1881 xor 8+$ivec,$s2
1882 xor 12+$ivec,$s3
1883
1884 mov 0($inp),%r11 # load input
1885 mov 8($inp),%r12
1886 sub \$16,%r10
1887 jz .Lcbc_fast_dec_in_place_done
1888
1889 mov %r11,0+$ivec # copy input to iv
1890 mov %r12,8+$ivec
1891
1892 mov $s0,0($out) # save output [zaps input]
1893 mov $s1,4($out)
1894 mov $s2,8($out)
1895 mov $s3,12($out)
1896
1897 lea 16($inp),$inp
1898 lea 16($out),$out
1899 mov %r10,$_len
1900 jmp .Lcbc_fast_dec_in_place_loop
1901.Lcbc_fast_dec_in_place_done:
1902 mov $_ivp,%rdi
1903 mov %r11,0(%rdi) # copy iv back to user
1904 mov %r12,8(%rdi)
1905
1906 mov $s0,0($out) # save output [zaps input]
1907 mov $s1,4($out)
1908 mov $s2,8($out)
1909 mov $s3,12($out)
1910
1911.align 4
1912.Lcbc_fast_cleanup:
1913 cmpl \$0,$mark # was the key schedule copied?
1914 lea $aes_key,%rdi
1915 je .Lcbc_exit
1916 mov \$240/8,%ecx
1917 xor %rax,%rax
1918 .long 0x90AB48F3 # rep stosq
1919
1920 jmp .Lcbc_exit
1921
1922#--------------------------- SLOW ROUTINE ---------------------------#
1923.align 16
1924.Lcbc_slow_prologue:
1925 # allocate aligned stack frame...
1926 lea -88(%rsp),%rbp
1927 and \$-64,%rbp
1928 # ... just "above" key schedule
1929 lea -88-63(%rcx),%r10
1930 sub %rbp,%r10
1931 neg %r10
1932 and \$0x3c0,%r10
1933 sub %r10,%rbp
1934
1935 xchg %rsp,%rbp
1936 #add \$8,%rsp # reserve for return address!
1937 mov %rbp,$_rsp # save %rsp
1938.Lcbc_slow_body:
1939 #mov %rdi,$_inp # save copy of inp
1940 #mov %rsi,$_out # save copy of out
1941 #mov %rdx,$_len # save copy of len
1942 #mov %rcx,$_key # save copy of key
1943 mov %r8,$_ivp # save copy of ivp
1944 mov %r8,%rbp # rearrange input arguments
1945 mov %r9,%rbx
1946 mov %rsi,$out
1947 mov %rdi,$inp
1948 mov %rcx,$key
1949 mov %rdx,%r10
1950
1951 mov 240($key),%eax
1952 mov $key,$keyp # save key pointer
1953 shl \$4,%eax
1954 lea ($key,%rax),%rax
1955 mov %rax,$keyend
1956
1957 # pick Te4 copy which can't "overlap" with stack frame or key schedule
1958 lea 2048($sbox),$sbox
1959 lea 768-8(%rsp),%rax
1960 sub $sbox,%rax
1961 and \$0x300,%rax
1962 lea ($sbox,%rax),$sbox
1963
1964 cmp \$0,%rbx
1965 je .LSLOW_DECRYPT
1966
1967#--------------------------- SLOW ENCRYPT ---------------------------#
1968 test \$-16,%r10 # check length
1969 mov 0(%rbp),$s0 # load iv
1970 mov 4(%rbp),$s1
1971 mov 8(%rbp),$s2
1972 mov 12(%rbp),$s3
1973 jz .Lcbc_slow_enc_tail # short input...
1974
1975.align 4
1976.Lcbc_slow_enc_loop:
1977 xor 0($inp),$s0
1978 xor 4($inp),$s1
1979 xor 8($inp),$s2
1980 xor 12($inp),$s3
1981 mov $keyp,$key # restore key
1982 mov $inp,$_inp # save inp
1983 mov $out,$_out # save out
1984 mov %r10,$_len # save len
1985
1986 call _x86_64_AES_encrypt_compact
1987
1988 mov $_inp,$inp # restore inp
1989 mov $_out,$out # restore out
1990 mov $_len,%r10 # restore len
1991 mov $s0,0($out)
1992 mov $s1,4($out)
1993 mov $s2,8($out)
1994 mov $s3,12($out)
1995
1996 lea 16($inp),$inp
1997 lea 16($out),$out
1998 sub \$16,%r10
1999 test \$-16,%r10
2000 jnz .Lcbc_slow_enc_loop
2001 test \$15,%r10
2002 jnz .Lcbc_slow_enc_tail
2003 mov $_ivp,%rbp # restore ivp
2004 mov $s0,0(%rbp) # save ivec
2005 mov $s1,4(%rbp)
2006 mov $s2,8(%rbp)
2007 mov $s3,12(%rbp)
2008
2009 jmp .Lcbc_exit
2010
2011.align 4
2012.Lcbc_slow_enc_tail:
2013 mov %rax,%r11
2014 mov %rcx,%r12
2015 mov %r10,%rcx
2016 mov $inp,%rsi
2017 mov $out,%rdi
2018 .long 0x9066A4F3 # rep movsb
2019 mov \$16,%rcx # zero tail
2020 sub %r10,%rcx
2021 xor %rax,%rax
2022 .long 0x9066AAF3 # rep stosb
2023 mov $out,$inp # this is not a mistake!
2024 mov \$16,%r10 # len=16
2025 mov %r11,%rax
2026 mov %r12,%rcx
2027 jmp .Lcbc_slow_enc_loop # one more spin...
2028#--------------------------- SLOW DECRYPT ---------------------------#
2029.align 16
2030.LSLOW_DECRYPT:
2031 shr \$3,%rax
2032 add %rax,$sbox # recall "magic" constants!
2033
2034 mov 0(%rbp),%r11 # copy iv to stack
2035 mov 8(%rbp),%r12
2036 mov %r11,0+$ivec
2037 mov %r12,8+$ivec
2038
2039.align 4
2040.Lcbc_slow_dec_loop:
2041 mov 0($inp),$s0 # load input
2042 mov 4($inp),$s1
2043 mov 8($inp),$s2
2044 mov 12($inp),$s3
2045 mov $keyp,$key # restore key
2046 mov $inp,$_inp # save inp
2047 mov $out,$_out # save out
2048 mov %r10,$_len # save len
2049
2050 call _x86_64_AES_decrypt_compact
2051
2052 mov $_inp,$inp # restore inp
2053 mov $_out,$out # restore out
2054 mov $_len,%r10
2055 xor 0+$ivec,$s0
2056 xor 4+$ivec,$s1
2057 xor 8+$ivec,$s2
2058 xor 12+$ivec,$s3
2059
2060 mov 0($inp),%r11 # load input
2061 mov 8($inp),%r12
2062 sub \$16,%r10
2063 jc .Lcbc_slow_dec_partial
2064 jz .Lcbc_slow_dec_done
2065
2066 mov %r11,0+$ivec # copy input to iv
2067 mov %r12,8+$ivec
2068
2069 mov $s0,0($out) # save output [can zap input]
2070 mov $s1,4($out)
2071 mov $s2,8($out)
2072 mov $s3,12($out)
2073
2074 lea 16($inp),$inp
2075 lea 16($out),$out
2076 jmp .Lcbc_slow_dec_loop
2077.Lcbc_slow_dec_done:
2078 mov $_ivp,%rdi
2079 mov %r11,0(%rdi) # copy iv back to user
2080 mov %r12,8(%rdi)
2081
2082 mov $s0,0($out) # save output [can zap input]
2083 mov $s1,4($out)
2084 mov $s2,8($out)
2085 mov $s3,12($out)
2086
2087 jmp .Lcbc_exit
2088
2089.align 4
2090.Lcbc_slow_dec_partial:
2091 mov $_ivp,%rdi
2092 mov %r11,0(%rdi) # copy iv back to user
2093 mov %r12,8(%rdi)
2094
2095 mov $s0,0+$ivec # save output to stack
2096 mov $s1,4+$ivec
2097 mov $s2,8+$ivec
2098 mov $s3,12+$ivec
2099
2100 mov $out,%rdi
2101 lea $ivec,%rsi
2102 lea 16(%r10),%rcx
2103 .long 0x9066A4F3 # rep movsb
2104 jmp .Lcbc_exit
2105
2106.align 16
2107.Lcbc_exit:
2108 mov $_rsp,%rsi
2109 mov (%rsi),%r15
2110 mov 8(%rsi),%r14
2111 mov 16(%rsi),%r13
2112 mov 24(%rsi),%r12
2113 mov 32(%rsi),%rbp
2114 mov 40(%rsi),%rbx
2115 lea 48(%rsi),%rsp
2116.Lcbc_popfq:
2117 popfq
2118.Lcbc_epilogue:
2119 ret
2120.size aes_cbc_encrypt_internal,.-aes_cbc_encrypt_internal
2121___
2122}
2123
2124$code.=<<___;
2125.section .rodata
2126.align 64
2127.LAES_Te:
2128___
2129 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
2130 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
2131 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
2132 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
2133 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
2134 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
2135 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
2136 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
2137 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
2138 &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
2139 &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
2140 &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
2141 &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
2142 &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
2143 &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
2144 &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
2145 &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
2146 &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
2147 &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
2148 &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
2149 &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
2150 &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
2151 &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
2152 &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
2153 &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
2154 &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
2155 &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
2156 &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
2157 &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
2158 &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
2159 &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
2160 &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
2161 &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
2162 &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
2163 &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
2164 &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
2165 &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
2166 &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
2167 &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
2168 &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
2169 &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
2170 &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
2171 &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
2172 &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
2173 &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
2174 &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
2175 &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
2176 &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
2177 &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
2178 &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
2179 &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
2180 &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
2181 &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
2182 &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
2183 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
2184 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
2185 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
2186 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
2187 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
2188 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
2189 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
2190 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
2191 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
2192 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
2193
2194#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
2195 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2196 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2197 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2198 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2199 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2200 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2201 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2202 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2203 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2204 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2205 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2206 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2207 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2208 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2209 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2210 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2211 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2212 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2213 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2214 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2215 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2216 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2217 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2218 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2219 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2220 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2221 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2222 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2223 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2224 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2225 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2226 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2227
2228 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2229 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2230 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2231 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2232 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2233 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2234 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2235 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2236 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2237 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2238 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2239 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2240 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2241 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2242 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2243 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2244 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2245 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2246 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2247 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2248 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2249 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2250 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2251 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2252 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2253 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2254 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2255 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2256 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2257 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2258 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2259 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2260
2261 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2262 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2263 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2264 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2265 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2266 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2267 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2268 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2269 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2270 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2271 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2272 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2273 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2274 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2275 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2276 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2277 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2278 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2279 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2280 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2281 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2282 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2283 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2284 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2285 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2286 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2287 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2288 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2289 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2290 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2291 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2292 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2293
2294 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2295 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2296 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2297 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2298 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2299 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2300 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2301 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2302 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2303 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2304 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2305 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2306 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2307 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2308 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2309 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2310 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2311 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2312 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2313 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2314 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2315 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2316 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2317 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2318 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2319 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2320 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2321 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2322 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2323 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2324 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2325 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2326#rcon:
2327$code.=<<___;
2328 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008
2329 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080
2330 .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
2331 .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
2332___
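# Note: the ten .long values 0x01..0x36 above are the AES round constants
# (successive GF(2^8) doublings of 1), and the trailing
# 0x80808080/0xfefefefe/0x1b1b1b1b words are the SWAR masks that the compact
# code paths load from this block.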
2333$code.=<<___;
2334.align 64
2335.LAES_Td:
2336___
2337 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
2338 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
2339 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
2340 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
2341 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
2342 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
2343 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
2344 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
2345 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
2346 &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
2347 &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
2348 &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
2349 &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
2350 &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
2351 &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
2352 &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
2353 &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
2354 &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
2355 &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
2356 &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
2357 &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
2358 &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
2359 &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
2360 &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
2361 &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
2362 &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
2363 &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
2364 &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
2365 &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
2366 &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
2367 &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
2368 &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
2369 &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
2370 &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
2371 &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
2372 &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
2373 &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
2374 &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
2375 &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
2376 &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
2377 &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
2378 &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
2379 &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
2380 &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
2381 &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
2382 &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
2383 &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
2384 &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
2385 &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
2386 &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
2387 &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
2388 &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
2389 &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
2390 &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
2391 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
2392 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
2393 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
2394 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
2395 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
2396 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
2397 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
2398 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
2399 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
2400 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
2401
2402#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
2403 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2404 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2405 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2406 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2407 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2408 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2409 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2410 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2411 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2412 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2413 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2414 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2415 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2416 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2417 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2418 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2419 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2420 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2421 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2422 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2423 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2424 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2425 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2426 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2427 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2428 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2429 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2430 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2431 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2432 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2433 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2434 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2435$code.=<<___;
2436 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2437 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2438___
2439 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2440 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2441 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2442 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2443 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2444 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2445 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2446 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2447 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2448 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2449 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2450 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2451 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2452 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2453 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2454 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2455 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2456 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2457 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2458 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2459 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2460 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2461 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2462 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2463 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2464 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2465 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2466 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2467 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2468 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2469 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2470 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2471$code.=<<___;
2472 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2473 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2474___
2475 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2476 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2477 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2478 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2479 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2480 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2481 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2482 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2483 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2484 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2485 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2486 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2487 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2488 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2489 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2490 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2491 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2492 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2493 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2494 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2495 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2496 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2497 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2498 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2499 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2500 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2501 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2502 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2503 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2504 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2505 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2506 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2507$code.=<<___;
2508 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2509 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2510___
2511 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2512 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2513 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2514 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2515 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2516 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2517 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2518 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2519 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2520 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2521 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2522 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2523 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2524 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2525 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2526 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2527 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2528 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2529 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2530 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2531 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2532 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2533 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2534 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2535 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2536 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2537 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2538 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2539 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2540 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2541 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2542 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2543$code.=<<___;
2544 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2545 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2546.align 64
2547.text
2548___
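# The .long trailer after each of the four replicated Td4 copies above
# holds the byte-sliced "xtime" masks (mask80/maskfe/mask1b) that the
# compact decrypt path uses to double packed GF(2^8) bytes without a
# table lookup.  A hedged sketch of the same computation in plain Perl
# (illustrative helper, not part of this module):
sub xtime4 {
	my ($x) = @_;				# four bytes packed in a dword
	my $hi = $x & 0x80808080;		# top bit of every byte
	my $lo = ($x << 1) & 0xfefefefe;	# shift, drop cross-byte carries
	return $lo ^ (($hi >> 7) * 0x1b);	# reduce by x^8+x^4+x^3+x+1
}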
2549
2550# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2551# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2552if ($win64) {
2553$rec="%rcx";
2554$frame="%rdx";
2555$context="%r8";
2556$disp="%r9";
2557
2558$code.=<<___;
2559.extern __imp_RtlVirtualUnwind
2560.type block_se_handler,\@abi-omnipotent
2561.align 16
2562block_se_handler:
2563 _CET_ENDBR
2564 push %rsi
2565 push %rdi
2566 push %rbx
2567 push %rbp
2568 push %r12
2569 push %r13
2570 push %r14
2571 push %r15
2572 pushfq
2573 sub \$64,%rsp
2574
2575 mov 120($context),%rax # pull context->Rax
2576 mov 248($context),%rbx # pull context->Rip
2577
2578 mov 8($disp),%rsi # disp->ImageBase
2579 mov 56($disp),%r11 # disp->HandlerData
2580
2581 mov 0(%r11),%r10d # HandlerData[0]
2582 lea (%rsi,%r10),%r10 # prologue label
2583 cmp %r10,%rbx # context->Rip<prologue label
2584 jb .Lin_block_prologue
2585
2586 mov 152($context),%rax # pull context->Rsp
2587
2588 mov 4(%r11),%r10d # HandlerData[1]
2589 lea (%rsi,%r10),%r10 # epilogue label
2590 cmp %r10,%rbx # context->Rip>=epilogue label
2591 jae .Lin_block_prologue
2592
2593 mov 24(%rax),%rax # pull saved real stack pointer
2594 lea 48(%rax),%rax # adjust...
2595
2596 mov -8(%rax),%rbx
2597 mov -16(%rax),%rbp
2598 mov -24(%rax),%r12
2599 mov -32(%rax),%r13
2600 mov -40(%rax),%r14
2601 mov -48(%rax),%r15
2602 mov %rbx,144($context) # restore context->Rbx
2603 mov %rbp,160($context) # restore context->Rbp
2604 mov %r12,216($context) # restore context->R12
2605 mov %r13,224($context) # restore context->R13
2606 mov %r14,232($context) # restore context->R14
2607 mov %r15,240($context) # restore context->R15
2608
2609.Lin_block_prologue:
2610 mov 8(%rax),%rdi
2611 mov 16(%rax),%rsi
2612 mov %rax,152($context) # restore context->Rsp
2613 mov %rsi,168($context) # restore context->Rsi
2614 mov %rdi,176($context) # restore context->Rdi
2615
2616 jmp .Lcommon_seh_exit
2617.size block_se_handler,.-block_se_handler
2618
2619.type key_se_handler,\@abi-omnipotent
2620.align 16
2621key_se_handler:
2622 _CET_ENDBR
2623 push %rsi
2624 push %rdi
2625 push %rbx
2626 push %rbp
2627 push %r12
2628 push %r13
2629 push %r14
2630 push %r15
2631 pushfq
2632 sub \$64,%rsp
2633
2634 mov 120($context),%rax # pull context->Rax
2635 mov 248($context),%rbx # pull context->Rip
2636
2637 mov 8($disp),%rsi # disp->ImageBase
2638 mov 56($disp),%r11 # disp->HandlerData
2639
2640 mov 0(%r11),%r10d # HandlerData[0]
2641 lea (%rsi,%r10),%r10 # prologue label
2642 cmp %r10,%rbx # context->Rip<prologue label
2643 jb .Lin_key_prologue
2644
2645 mov 152($context),%rax # pull context->Rsp
2646
2647 mov 4(%r11),%r10d # HandlerData[1]
2648 lea (%rsi,%r10),%r10 # epilogue label
2649 cmp %r10,%rbx # context->Rip>=epilogue label
2650 jae .Lin_key_prologue
2651
2652 lea 56(%rax),%rax
2653
2654 mov -8(%rax),%rbx
2655 mov -16(%rax),%rbp
2656 mov -24(%rax),%r12
2657 mov -32(%rax),%r13
2658 mov -40(%rax),%r14
2659 mov -48(%rax),%r15
2660 mov %rbx,144($context) # restore context->Rbx
2661 mov %rbp,160($context) # restore context->Rbp
2662 mov %r12,216($context) # restore context->R12
2663 mov %r13,224($context) # restore context->R13
2664 mov %r14,232($context) # restore context->R14
2665 mov %r15,240($context) # restore context->R15
2666
2667.Lin_key_prologue:
2668 mov 8(%rax),%rdi
2669 mov 16(%rax),%rsi
2670 mov %rax,152($context) # restore context->Rsp
2671 mov %rsi,168($context) # restore context->Rsi
2672 mov %rdi,176($context) # restore context->Rdi
2673
2674 jmp .Lcommon_seh_exit
2675.size key_se_handler,.-key_se_handler
2676
2677.type cbc_se_handler,\@abi-omnipotent
2678.align 16
2679cbc_se_handler:
2680 _CET_ENDBR
2681 push %rsi
2682 push %rdi
2683 push %rbx
2684 push %rbp
2685 push %r12
2686 push %r13
2687 push %r14
2688 push %r15
2689 pushfq
2690 sub \$64,%rsp
2691
2692 mov 120($context),%rax # pull context->Rax
2693 mov 248($context),%rbx # pull context->Rip
2694
2695 lea .Lcbc_prologue(%rip),%r10
2696 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
2697 jb .Lin_cbc_prologue
2698
2699 lea .Lcbc_fast_body(%rip),%r10
2700 cmp %r10,%rbx # context->Rip<.Lcbc_fast_body
2701 jb .Lin_cbc_frame_setup
2702
2703 lea .Lcbc_slow_prologue(%rip),%r10
2704 cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue
2705 jb .Lin_cbc_body
2706
2707 lea .Lcbc_slow_body(%rip),%r10
2708 cmp %r10,%rbx # context->Rip<.Lcbc_slow_body
2709 jb .Lin_cbc_frame_setup
2710
2711.Lin_cbc_body:
2712 mov 152($context),%rax # pull context->Rsp
2713
2714 lea .Lcbc_epilogue(%rip),%r10
2715 cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue
2716 jae .Lin_cbc_prologue
2717
2718 lea 8(%rax),%rax
2719
2720 lea .Lcbc_popfq(%rip),%r10
2721 cmp %r10,%rbx # context->Rip>=.Lcbc_popfq
2722 jae .Lin_cbc_prologue
2723
2724 mov `16-8`(%rax),%rax # biased $_rsp
2725 lea 56(%rax),%rax
2726
2727.Lin_cbc_frame_setup:
2728 mov -16(%rax),%rbx
2729 mov -24(%rax),%rbp
2730 mov -32(%rax),%r12
2731 mov -40(%rax),%r13
2732 mov -48(%rax),%r14
2733 mov -56(%rax),%r15
2734 mov %rbx,144($context) # restore context->Rbx
2735 mov %rbp,160($context) # restore context->Rbp
2736 mov %r12,216($context) # restore context->R12
2737 mov %r13,224($context) # restore context->R13
2738 mov %r14,232($context) # restore context->R14
2739 mov %r15,240($context) # restore context->R15
2740
2741.Lin_cbc_prologue:
2742 mov 8(%rax),%rdi
2743 mov 16(%rax),%rsi
2744 mov %rax,152($context) # restore context->Rsp
2745 mov %rsi,168($context) # restore context->Rsi
2746 mov %rdi,176($context) # restore context->Rdi
2747
2748.Lcommon_seh_exit:
2749
2750 mov 40($disp),%rdi # disp->ContextRecord
2751 mov $context,%rsi # context
2752 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2753 .long 0xa548f3fc # cld; rep movsq
2754
2755 mov $disp,%rsi
2756 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2757 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2758 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2759 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2760 mov 40(%rsi),%r10 # disp->ContextRecord
2761 lea 56(%rsi),%r11 # &disp->HandlerData
2762 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2763 mov %r10,32(%rsp) # arg5
2764 mov %r11,40(%rsp) # arg6
2765 mov %r12,48(%rsp) # arg7
2766 mov %rcx,56(%rsp) # arg8, (NULL)
2767 call *__imp_RtlVirtualUnwind(%rip)
2768
2769 mov \$1,%eax # ExceptionContinueSearch
2770 add \$64,%rsp
2771 popfq
2772 pop %r15
2773 pop %r14
2774 pop %r13
2775 pop %r12
2776 pop %rbp
2777 pop %rbx
2778 pop %rdi
2779 pop %rsi
2780 ret
2781.size cbc_se_handler,.-cbc_se_handler
2782
2783.section .pdata
2784.align 4
2785 .rva .LSEH_begin_aes_encrypt_internal
2786 .rva .LSEH_end_aes_encrypt_internal
2787 .rva .LSEH_info_aes_encrypt_internal
2788
2789 .rva .LSEH_begin_aes_decrypt_internal
2790 .rva .LSEH_end_aes_decrypt_internal
2791 .rva .LSEH_info_aes_decrypt_internal
2792
2793 .rva .LSEH_begin_aes_set_encrypt_key_internal
2794 .rva .LSEH_end_aes_set_encrypt_key_internal
2795 .rva .LSEH_info_aes_set_encrypt_key_internal
2796
2797 .rva .LSEH_begin_aes_set_decrypt_key_internal
2798 .rva .LSEH_end_aes_set_decrypt_key_internal
2799 .rva .LSEH_info_aes_set_decrypt_key_internal
2800
2801 .rva .LSEH_begin_aes_cbc_encrypt_internal
2802 .rva .LSEH_end_aes_cbc_encrypt_internal
2803 .rva .LSEH_info_aes_cbc_encrypt_internal
2804
2805.section .xdata
2806.align 8
2807.LSEH_info_aes_encrypt_internal:
2808 .byte 9,0,0,0
2809 .rva block_se_handler
2810 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
2811.LSEH_info_aes_decrypt_internal:
2812 .byte 9,0,0,0
2813 .rva block_se_handler
2814 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
2815.LSEH_info_aes_set_encrypt_key_internal:
2816 .byte 9,0,0,0
2817 .rva key_se_handler
2818 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
2819.LSEH_info_aes_set_decrypt_key_internal:
2820 .byte 9,0,0,0
2821 .rva key_se_handler
2822 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
2823.LSEH_info_aes_cbc_encrypt_internal:
2824 .byte 9,0,0,0
2825 .rva cbc_se_handler
2826___
2827}
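# A hedged sketch (hypothetical helper, not part of this module) of how
# the .xdata records above are composed.  The leading ".byte 9,0,0,0" is
# the Win64 UNWIND_INFO header: version 1 in the low three bits plus
# UNW_FLAG_EHANDLER (1<<3), then zero prologue size, zero unwind codes
# and no frame register -- which is why the handlers above re-derive the
# frame from HandlerData[] themselves.
sub xdata_record {
	my ($handler, @handler_data) = @_;
	$code .= "	.byte	9,0,0,0\n";	# ver 1 | UNW_FLAG_EHANDLER
	$code .= "	.rva	$handler\n";	# language-specific handler
	$code .= "	.rva	" . join(",", @handler_data) . "\n" if (@handler_data);
}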
2828
2829$code =~ s/\`([^\`]*)\`/eval($1)/gem;
2830
2831print "#include \"x86_arch.h\"\n";
2832print $code;
2833
2834close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86.pl b/src/lib/libcrypto/aes/asm/aesni-x86.pl
deleted file mode 100644
index ff44415611..0000000000
--- a/src/lib/libcrypto/aes/asm/aesni-x86.pl
+++ /dev/null
@@ -1,2188 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for the Intel AES-NI extension. In
11# the OpenSSL context it's used with the Intel engine, but it can also
12# be used as a drop-in replacement for crypto/aes/asm/aes-586.pl [see
13# below for details].
14#
15# Performance.
16#
17# To start with, see the corresponding paragraph in aesni-x86_64.pl...
18# Instead of filling in a table similar to the one found there, I've
19# chosen to summarize *comparison* results for raw ECB, CTR and CBC
20# benchmarks. The simplified table below gives 32-bit performance
21# relative to the 64-bit one at every given point. Ratios vary across
22# encryption modes, hence the interval values.
23#
24# 16-byte 64-byte 256-byte 1-KB 8-KB
25# 53-67% 67-84% 91-94% 95-98% 97-99.5%
26#
27# Lower ratios for smaller block sizes are perfectly understandable,
28# because function call overhead is higher in 32-bit mode. For the
29# largest 8-KB blocks performance is virtually the same: 32-bit code
30# is less than 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
31
32# January 2011
33#
34# See aesni-x86_64.pl for details. Unlike the x86_64 version, this
35# module interleaves at most 6 aes[enc|dec] instructions, because there
36# are not enough registers for an 8x interleave [which should be optimal
37# for Sandy Bridge]. Actually, the performance results for the 6x
38# interleave factor presented in aesni-x86_64.pl (except for CTR) are
39# for this module.
40
41# April 2011
42#
43# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44# one byte out of 8KB with a 128-bit key; Sandy Bridge, 1.09.
45
46$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
47 # generates drop-in replacement for
48 # crypto/aes/asm/aes-586.pl:-)
49$inline=1; # inline _aesni_[en|de]crypt
50
51$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52push(@INC,"${dir}","${dir}../../perlasm");
53require "x86asm.pl";
54
55&asm_init($ARGV[0],$0);
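# Typical invocation (an assumption -- the flavour strings come from
# perlasm/x86asm.pl): "perl aesni-x86.pl elf > aesni-x86.S"; other
# flavours such as "a.out", "coff" or "win32n" select other object
# formats.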
56
57if ($PREFIX eq "aesni") { $movekey=*movups; }
58else { $movekey=*movups; }
59
60$len="eax";
61$rounds="ecx";
62$key="edx";
63$inp="esi";
64$out="edi";
65$rounds_="ebx"; # backup copy for $rounds
66$key_="ebp"; # backup copy for $key
67
68$rndkey0="xmm0";
69$rndkey1="xmm1";
70$inout0="xmm2";
71$inout1="xmm3";
72$inout2="xmm4";
73$inout3="xmm5"; $in1="xmm5";
74$inout4="xmm6"; $in0="xmm6";
75$inout5="xmm7"; $ivec="xmm7";
76
77# AESNI extension
78sub aeskeygenassist
79{ my($dst,$src,$imm)=@_;
80 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
81 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
82}
83sub aescommon
84{ my($opcodelet,$dst,$src)=@_;
85 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
86 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
87}
88sub aesimc { aescommon(0xdb,@_); }
89sub aesenc { aescommon(0xdc,@_); }
90sub aesenclast { aescommon(0xdd,@_); }
91sub aesdec { aescommon(0xde,@_); }
92sub aesdeclast { aescommon(0xdf,@_); }
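# Worked example: &aesenc("xmm2","xmm1") matches the xmm([0-7]):xmm([0-7])
# pattern above with $1=2, $2=1 and emits
#	0x66,0x0f,0x38,0xdc,0xc0|(2<<3)|1  =>  66 0f 38 dc d1
# i.e. a hand-assembled "aesenc %xmm1,%xmm2".  Emitting raw bytes this
# way presumably keeps the module usable with assemblers that predate
# the AES-NI mnemonics.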
93
94# Inline version of internal aesni_[en|de]crypt1
95{ my $sn;
96sub aesni_inline_generate1
97{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
98 $sn++;
99
100 &$movekey ($rndkey0,&QWP(0,$key));
101 &$movekey ($rndkey1,&QWP(16,$key));
102 &xorps ($ivec,$rndkey0) if (defined($ivec));
103 &lea ($key,&DWP(32,$key));
104 &xorps ($inout,$ivec) if (defined($ivec));
105 &xorps ($inout,$rndkey0) if (!defined($ivec));
106 &set_label("${p}1_loop_$sn");
107 eval"&aes${p} ($inout,$rndkey1)";
108 &dec ($rounds);
109 &$movekey ($rndkey1,&QWP(0,$key));
110 &lea ($key,&DWP(16,$key));
111 &jnz (&label("${p}1_loop_$sn"));
112 eval"&aes${p}last ($inout,$rndkey1)";
113}}
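# $sn gives each inlined expansion its own loop label ("${p}1_loop_1",
# "${p}1_loop_2", ...), since the body above is pasted at every call
# site rather than shared via call/ret.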
114
115sub aesni_generate1 # fully unrolled loop
116{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
117
118 &function_begin_B("_aesni_${p}rypt1");
119 &movups ($rndkey0,&QWP(0,$key));
120 &$movekey ($rndkey1,&QWP(0x10,$key));
121 &xorps ($inout,$rndkey0);
122 &$movekey ($rndkey0,&QWP(0x20,$key));
123 &lea ($key,&DWP(0x30,$key));
124 &cmp ($rounds,11);
125 &jb (&label("${p}128"));
126 &lea ($key,&DWP(0x20,$key));
127 &je (&label("${p}192"));
128 &lea ($key,&DWP(0x20,$key));
129 eval"&aes${p} ($inout,$rndkey1)";
130 &$movekey ($rndkey1,&QWP(-0x40,$key));
131 eval"&aes${p} ($inout,$rndkey0)";
132 &$movekey ($rndkey0,&QWP(-0x30,$key));
133 &set_label("${p}192");
134 eval"&aes${p} ($inout,$rndkey1)";
135 &$movekey ($rndkey1,&QWP(-0x20,$key));
136 eval"&aes${p} ($inout,$rndkey0)";
137 &$movekey ($rndkey0,&QWP(-0x10,$key));
138 &set_label("${p}128");
139 eval"&aes${p} ($inout,$rndkey1)";
140 &$movekey ($rndkey1,&QWP(0,$key));
141 eval"&aes${p} ($inout,$rndkey0)";
142 &$movekey ($rndkey0,&QWP(0x10,$key));
143 eval"&aes${p} ($inout,$rndkey1)";
144 &$movekey ($rndkey1,&QWP(0x20,$key));
145 eval"&aes${p} ($inout,$rndkey0)";
146 &$movekey ($rndkey0,&QWP(0x30,$key));
147 eval"&aes${p} ($inout,$rndkey1)";
148 &$movekey ($rndkey1,&QWP(0x40,$key));
149 eval"&aes${p} ($inout,$rndkey0)";
150 &$movekey ($rndkey0,&QWP(0x50,$key));
151 eval"&aes${p} ($inout,$rndkey1)";
152 &$movekey ($rndkey1,&QWP(0x60,$key));
153 eval"&aes${p} ($inout,$rndkey0)";
154 &$movekey ($rndkey0,&QWP(0x70,$key));
155 eval"&aes${p} ($inout,$rndkey1)";
156 eval"&aes${p}last ($inout,$rndkey0)";
157 &ret();
158 &function_end_B("_aesni_${p}rypt1");
159}
160
161# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
162&aesni_generate1("enc") if (!$inline);
163&function_begin_B("${PREFIX}_encrypt");
164 &mov ("eax",&wparam(0));
165 &mov ($key,&wparam(2));
166 &movups ($inout0,&QWP(0,"eax"));
167 &mov ($rounds,&DWP(240,$key));
168 &mov ("eax",&wparam(1));
169 if ($inline)
170 { &aesni_inline_generate1("enc"); }
171 else
172 { &call ("_aesni_encrypt1"); }
173 &movups (&QWP(0,"eax"),$inout0);
174 &ret ();
175&function_end_B("${PREFIX}_encrypt");
176
177# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
178&aesni_generate1("dec") if(!$inline);
179&function_begin_B("${PREFIX}_decrypt");
180 &mov ("eax",&wparam(0));
181 &mov ($key,&wparam(2));
182 &movups ($inout0,&QWP(0,"eax"));
183 &mov ($rounds,&DWP(240,$key));
184 &mov ("eax",&wparam(1));
185 if ($inline)
186 { &aesni_inline_generate1("dec"); }
187 else
188 { &call ("_aesni_decrypt1"); }
189 &movups (&QWP(0,"eax"),$inout0);
190 &ret ();
191&function_end_B("${PREFIX}_decrypt");
192
193# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
194# factor. Why 3x subroutine were originally used in loops? Even though
195# aes[enc|dec] latency was originally 6, it could be scheduled only
196# every *2nd* cycle. Thus 3x interleave was the one providing optimal
197# utilization, i.e. when subroutine's throughput is virtually same as
198# of non-interleaved subroutine [for number of input blocks up to 3].
199# This is why it makes no sense to implement 2x subroutine.
200# aes[enc|dec] latency in next processor generation is 8, but the
201# instructions can be scheduled every cycle. Optimal interleave for
202# new processor is therefore 8x, but it's unfeasible to accommodate it
203# in XMM registers addreassable in 32-bit mode and therefore 6x is
204# used instead...
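# In short, the in-flight block count needed to hide latency is
# latency / issue interval: 6/2 = 3 on the original parts and 8/1 = 8
# on the newer ones; 6x is the best an eight-register 32-bit XMM budget
# allows.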
205
206sub aesni_generate3
207{ my $p=shift;
208
209 &function_begin_B("_aesni_${p}rypt3");
210 &$movekey ($rndkey0,&QWP(0,$key));
211 &shr ($rounds,1);
212 &$movekey ($rndkey1,&QWP(16,$key));
213 &lea ($key,&DWP(32,$key));
214 &xorps ($inout0,$rndkey0);
215 &pxor ($inout1,$rndkey0);
216 &pxor ($inout2,$rndkey0);
217 &$movekey ($rndkey0,&QWP(0,$key));
218
219 &set_label("${p}3_loop");
220 eval"&aes${p} ($inout0,$rndkey1)";
221 eval"&aes${p} ($inout1,$rndkey1)";
222 &dec ($rounds);
223 eval"&aes${p} ($inout2,$rndkey1)";
224 &$movekey ($rndkey1,&QWP(16,$key));
225 eval"&aes${p} ($inout0,$rndkey0)";
226 eval"&aes${p} ($inout1,$rndkey0)";
227 &lea ($key,&DWP(32,$key));
228 eval"&aes${p} ($inout2,$rndkey0)";
229 &$movekey ($rndkey0,&QWP(0,$key));
230 &jnz (&label("${p}3_loop"));
231 eval"&aes${p} ($inout0,$rndkey1)";
232 eval"&aes${p} ($inout1,$rndkey1)";
233 eval"&aes${p} ($inout2,$rndkey1)";
234 eval"&aes${p}last ($inout0,$rndkey0)";
235 eval"&aes${p}last ($inout1,$rndkey0)";
236 eval"&aes${p}last ($inout2,$rndkey0)";
237 &ret();
238 &function_end_B("_aesni_${p}rypt3");
239}
240
241# 4x interleave is implemented to improve small-block performance,
242# most notably [and naturally] 4-block performance, by ~30%. One could
243# argue that 5x should have been implemented as well, but the
244# improvement would be <20%, so it's not worth it...
245sub aesni_generate4
246{ my $p=shift;
247
248 &function_begin_B("_aesni_${p}rypt4");
249 &$movekey ($rndkey0,&QWP(0,$key));
250 &$movekey ($rndkey1,&QWP(16,$key));
251 &shr ($rounds,1);
252 &lea ($key,&DWP(32,$key));
253 &xorps ($inout0,$rndkey0);
254 &pxor ($inout1,$rndkey0);
255 &pxor ($inout2,$rndkey0);
256 &pxor ($inout3,$rndkey0);
257 &$movekey ($rndkey0,&QWP(0,$key));
258
259 &set_label("${p}4_loop");
260 eval"&aes${p} ($inout0,$rndkey1)";
261 eval"&aes${p} ($inout1,$rndkey1)";
262 &dec ($rounds);
263 eval"&aes${p} ($inout2,$rndkey1)";
264 eval"&aes${p} ($inout3,$rndkey1)";
265 &$movekey ($rndkey1,&QWP(16,$key));
266 eval"&aes${p} ($inout0,$rndkey0)";
267 eval"&aes${p} ($inout1,$rndkey0)";
268 &lea ($key,&DWP(32,$key));
269 eval"&aes${p} ($inout2,$rndkey0)";
270 eval"&aes${p} ($inout3,$rndkey0)";
271 &$movekey ($rndkey0,&QWP(0,$key));
272 &jnz (&label("${p}4_loop"));
273
274 eval"&aes${p} ($inout0,$rndkey1)";
275 eval"&aes${p} ($inout1,$rndkey1)";
276 eval"&aes${p} ($inout2,$rndkey1)";
277 eval"&aes${p} ($inout3,$rndkey1)";
278 eval"&aes${p}last ($inout0,$rndkey0)";
279 eval"&aes${p}last ($inout1,$rndkey0)";
280 eval"&aes${p}last ($inout2,$rndkey0)";
281 eval"&aes${p}last ($inout3,$rndkey0)";
282 &ret();
283 &function_end_B("_aesni_${p}rypt4");
284}
285
286sub aesni_generate6
287{ my $p=shift;
288
289 &function_begin_B("_aesni_${p}rypt6");
290 &static_label("_aesni_${p}rypt6_enter");
291 &$movekey ($rndkey0,&QWP(0,$key));
292 &shr ($rounds,1);
293 &$movekey ($rndkey1,&QWP(16,$key));
294 &lea ($key,&DWP(32,$key));
295 &xorps ($inout0,$rndkey0);
296 &pxor ($inout1,$rndkey0); # pxor does better here
297 eval"&aes${p} ($inout0,$rndkey1)";
298 &pxor ($inout2,$rndkey0);
299 eval"&aes${p} ($inout1,$rndkey1)";
300 &pxor ($inout3,$rndkey0);
301 &dec ($rounds);
302 eval"&aes${p} ($inout2,$rndkey1)";
303 &pxor ($inout4,$rndkey0);
304 eval"&aes${p} ($inout3,$rndkey1)";
305 &pxor ($inout5,$rndkey0);
306 eval"&aes${p} ($inout4,$rndkey1)";
307 &$movekey ($rndkey0,&QWP(0,$key));
308 eval"&aes${p} ($inout5,$rndkey1)";
309 &jmp (&label("_aesni_${p}rypt6_enter"));
310
311 &set_label("${p}6_loop",16);
312 eval"&aes${p} ($inout0,$rndkey1)";
313 eval"&aes${p} ($inout1,$rndkey1)";
314 &dec ($rounds);
315 eval"&aes${p} ($inout2,$rndkey1)";
316 eval"&aes${p} ($inout3,$rndkey1)";
317 eval"&aes${p} ($inout4,$rndkey1)";
318 eval"&aes${p} ($inout5,$rndkey1)";
319 &set_label("_aesni_${p}rypt6_enter",16);
320 &$movekey ($rndkey1,&QWP(16,$key));
321 eval"&aes${p} ($inout0,$rndkey0)";
322 eval"&aes${p} ($inout1,$rndkey0)";
323 &lea ($key,&DWP(32,$key));
324 eval"&aes${p} ($inout2,$rndkey0)";
325 eval"&aes${p} ($inout3,$rndkey0)";
326 eval"&aes${p} ($inout4,$rndkey0)";
327 eval"&aes${p} ($inout5,$rndkey0)";
328 &$movekey ($rndkey0,&QWP(0,$key));
329 &jnz (&label("${p}6_loop"));
330
331 eval"&aes${p} ($inout0,$rndkey1)";
332 eval"&aes${p} ($inout1,$rndkey1)";
333 eval"&aes${p} ($inout2,$rndkey1)";
334 eval"&aes${p} ($inout3,$rndkey1)";
335 eval"&aes${p} ($inout4,$rndkey1)";
336 eval"&aes${p} ($inout5,$rndkey1)";
337 eval"&aes${p}last ($inout0,$rndkey0)";
338 eval"&aes${p}last ($inout1,$rndkey0)";
339 eval"&aes${p}last ($inout2,$rndkey0)";
340 eval"&aes${p}last ($inout3,$rndkey0)";
341 eval"&aes${p}last ($inout4,$rndkey0)";
342 eval"&aes${p}last ($inout5,$rndkey0)";
343 &ret();
344 &function_end_B("_aesni_${p}rypt6");
345}
346&aesni_generate3("enc") if ($PREFIX eq "aesni");
347&aesni_generate3("dec");
348&aesni_generate4("enc") if ($PREFIX eq "aesni");
349&aesni_generate4("dec");
350&aesni_generate6("enc") if ($PREFIX eq "aesni");
351&aesni_generate6("dec");
352
353if ($PREFIX eq "aesni") {
354######################################################################
355# void aesni_ecb_encrypt (const void *in, void *out,
356# size_t length, const AES_KEY *key,
357# int enc);
358&function_begin("aesni_ecb_encrypt");
359 &mov ($inp,&wparam(0));
360 &mov ($out,&wparam(1));
361 &mov ($len,&wparam(2));
362 &mov ($key,&wparam(3));
363 &mov ($rounds_,&wparam(4));
364 &and ($len,-16);
365 &jz (&label("ecb_ret"));
366 &mov ($rounds,&DWP(240,$key));
367 &test ($rounds_,$rounds_);
368 &jz (&label("ecb_decrypt"));
369
370 &mov ($key_,$key); # backup $key
371 &mov ($rounds_,$rounds); # backup $rounds
372 &cmp ($len,0x60);
373 &jb (&label("ecb_enc_tail"));
374
375 &movdqu ($inout0,&QWP(0,$inp));
376 &movdqu ($inout1,&QWP(0x10,$inp));
377 &movdqu ($inout2,&QWP(0x20,$inp));
378 &movdqu ($inout3,&QWP(0x30,$inp));
379 &movdqu ($inout4,&QWP(0x40,$inp));
380 &movdqu ($inout5,&QWP(0x50,$inp));
381 &lea ($inp,&DWP(0x60,$inp));
382 &sub ($len,0x60);
383 &jmp (&label("ecb_enc_loop6_enter"));
384
385&set_label("ecb_enc_loop6",16);
386 &movups (&QWP(0,$out),$inout0);
387 &movdqu ($inout0,&QWP(0,$inp));
388 &movups (&QWP(0x10,$out),$inout1);
389 &movdqu ($inout1,&QWP(0x10,$inp));
390 &movups (&QWP(0x20,$out),$inout2);
391 &movdqu ($inout2,&QWP(0x20,$inp));
392 &movups (&QWP(0x30,$out),$inout3);
393 &movdqu ($inout3,&QWP(0x30,$inp));
394 &movups (&QWP(0x40,$out),$inout4);
395 &movdqu ($inout4,&QWP(0x40,$inp));
396 &movups (&QWP(0x50,$out),$inout5);
397 &lea ($out,&DWP(0x60,$out));
398 &movdqu ($inout5,&QWP(0x50,$inp));
399 &lea ($inp,&DWP(0x60,$inp));
400&set_label("ecb_enc_loop6_enter");
401
402 &call ("_aesni_encrypt6");
403
404 &mov ($key,$key_); # restore $key
405 &mov ($rounds,$rounds_); # restore $rounds
406 &sub ($len,0x60);
407 &jnc (&label("ecb_enc_loop6"));
408
409 &movups (&QWP(0,$out),$inout0);
410 &movups (&QWP(0x10,$out),$inout1);
411 &movups (&QWP(0x20,$out),$inout2);
412 &movups (&QWP(0x30,$out),$inout3);
413 &movups (&QWP(0x40,$out),$inout4);
414 &movups (&QWP(0x50,$out),$inout5);
415 &lea ($out,&DWP(0x60,$out));
416 &add ($len,0x60);
417 &jz (&label("ecb_ret"));
418
419&set_label("ecb_enc_tail");
420 &movups ($inout0,&QWP(0,$inp));
421 &cmp ($len,0x20);
422 &jb (&label("ecb_enc_one"));
423 &movups ($inout1,&QWP(0x10,$inp));
424 &je (&label("ecb_enc_two"));
425 &movups ($inout2,&QWP(0x20,$inp));
426 &cmp ($len,0x40);
427 &jb (&label("ecb_enc_three"));
428 &movups ($inout3,&QWP(0x30,$inp));
429 &je (&label("ecb_enc_four"));
430 &movups ($inout4,&QWP(0x40,$inp));
431 &xorps ($inout5,$inout5);
432 &call ("_aesni_encrypt6");
433 &movups (&QWP(0,$out),$inout0);
434 &movups (&QWP(0x10,$out),$inout1);
435 &movups (&QWP(0x20,$out),$inout2);
436 &movups (&QWP(0x30,$out),$inout3);
437 &movups (&QWP(0x40,$out),$inout4);
438	&jmp	(&label("ecb_ret"));
439
440&set_label("ecb_enc_one",16);
441 if ($inline)
442 { &aesni_inline_generate1("enc"); }
443 else
444 { &call ("_aesni_encrypt1"); }
445 &movups (&QWP(0,$out),$inout0);
446 &jmp (&label("ecb_ret"));
447
448&set_label("ecb_enc_two",16);
449 &xorps ($inout2,$inout2);
450 &call ("_aesni_encrypt3");
451 &movups (&QWP(0,$out),$inout0);
452 &movups (&QWP(0x10,$out),$inout1);
453 &jmp (&label("ecb_ret"));
454
455&set_label("ecb_enc_three",16);
456 &call ("_aesni_encrypt3");
457 &movups (&QWP(0,$out),$inout0);
458 &movups (&QWP(0x10,$out),$inout1);
459 &movups (&QWP(0x20,$out),$inout2);
460 &jmp (&label("ecb_ret"));
461
462&set_label("ecb_enc_four",16);
463 &call ("_aesni_encrypt4");
464 &movups (&QWP(0,$out),$inout0);
465 &movups (&QWP(0x10,$out),$inout1);
466 &movups (&QWP(0x20,$out),$inout2);
467 &movups (&QWP(0x30,$out),$inout3);
468 &jmp (&label("ecb_ret"));
469######################################################################
470&set_label("ecb_decrypt",16);
471 &mov ($key_,$key); # backup $key
472 &mov ($rounds_,$rounds); # backup $rounds
473 &cmp ($len,0x60);
474 &jb (&label("ecb_dec_tail"));
475
476 &movdqu ($inout0,&QWP(0,$inp));
477 &movdqu ($inout1,&QWP(0x10,$inp));
478 &movdqu ($inout2,&QWP(0x20,$inp));
479 &movdqu ($inout3,&QWP(0x30,$inp));
480 &movdqu ($inout4,&QWP(0x40,$inp));
481 &movdqu ($inout5,&QWP(0x50,$inp));
482 &lea ($inp,&DWP(0x60,$inp));
483 &sub ($len,0x60);
484 &jmp (&label("ecb_dec_loop6_enter"));
485
486&set_label("ecb_dec_loop6",16);
487 &movups (&QWP(0,$out),$inout0);
488 &movdqu ($inout0,&QWP(0,$inp));
489 &movups (&QWP(0x10,$out),$inout1);
490 &movdqu ($inout1,&QWP(0x10,$inp));
491 &movups (&QWP(0x20,$out),$inout2);
492 &movdqu ($inout2,&QWP(0x20,$inp));
493 &movups (&QWP(0x30,$out),$inout3);
494 &movdqu ($inout3,&QWP(0x30,$inp));
495 &movups (&QWP(0x40,$out),$inout4);
496 &movdqu ($inout4,&QWP(0x40,$inp));
497 &movups (&QWP(0x50,$out),$inout5);
498 &lea ($out,&DWP(0x60,$out));
499 &movdqu ($inout5,&QWP(0x50,$inp));
500 &lea ($inp,&DWP(0x60,$inp));
501&set_label("ecb_dec_loop6_enter");
502
503 &call ("_aesni_decrypt6");
504
505 &mov ($key,$key_); # restore $key
506 &mov ($rounds,$rounds_); # restore $rounds
507 &sub ($len,0x60);
508 &jnc (&label("ecb_dec_loop6"));
509
510 &movups (&QWP(0,$out),$inout0);
511 &movups (&QWP(0x10,$out),$inout1);
512 &movups (&QWP(0x20,$out),$inout2);
513 &movups (&QWP(0x30,$out),$inout3);
514 &movups (&QWP(0x40,$out),$inout4);
515 &movups (&QWP(0x50,$out),$inout5);
516 &lea ($out,&DWP(0x60,$out));
517 &add ($len,0x60);
518 &jz (&label("ecb_ret"));
519
520&set_label("ecb_dec_tail");
521 &movups ($inout0,&QWP(0,$inp));
522 &cmp ($len,0x20);
523 &jb (&label("ecb_dec_one"));
524 &movups ($inout1,&QWP(0x10,$inp));
525 &je (&label("ecb_dec_two"));
526 &movups ($inout2,&QWP(0x20,$inp));
527 &cmp ($len,0x40);
528 &jb (&label("ecb_dec_three"));
529 &movups ($inout3,&QWP(0x30,$inp));
530 &je (&label("ecb_dec_four"));
531 &movups ($inout4,&QWP(0x40,$inp));
532 &xorps ($inout5,$inout5);
533 &call ("_aesni_decrypt6");
534 &movups (&QWP(0,$out),$inout0);
535 &movups (&QWP(0x10,$out),$inout1);
536 &movups (&QWP(0x20,$out),$inout2);
537 &movups (&QWP(0x30,$out),$inout3);
538 &movups (&QWP(0x40,$out),$inout4);
539 &jmp (&label("ecb_ret"));
540
541&set_label("ecb_dec_one",16);
542 if ($inline)
543 { &aesni_inline_generate1("dec"); }
544 else
545 { &call ("_aesni_decrypt1"); }
546 &movups (&QWP(0,$out),$inout0);
547 &jmp (&label("ecb_ret"));
548
549&set_label("ecb_dec_two",16);
550 &xorps ($inout2,$inout2);
551 &call ("_aesni_decrypt3");
552 &movups (&QWP(0,$out),$inout0);
553 &movups (&QWP(0x10,$out),$inout1);
554 &jmp (&label("ecb_ret"));
555
556&set_label("ecb_dec_three",16);
557 &call ("_aesni_decrypt3");
558 &movups (&QWP(0,$out),$inout0);
559 &movups (&QWP(0x10,$out),$inout1);
560 &movups (&QWP(0x20,$out),$inout2);
561 &jmp (&label("ecb_ret"));
562
563&set_label("ecb_dec_four",16);
564 &call ("_aesni_decrypt4");
565 &movups (&QWP(0,$out),$inout0);
566 &movups (&QWP(0x10,$out),$inout1);
567 &movups (&QWP(0x20,$out),$inout2);
568 &movups (&QWP(0x30,$out),$inout3);
569
570&set_label("ecb_ret");
571&function_end("aesni_ecb_encrypt");
572
573######################################################################
574# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
575# size_t blocks, const AES_KEY *key,
576# const char *ivec,char *cmac);
577#
578# Handles only complete blocks, operates on a 64-bit counter, and
579# does not update *ivec! Nor does it finalize the CMAC value
580# (see engine/eng_aesni.c for details).
581#
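# Per-block flow of the encrypt path below, summarized (not upstream
# text):
#	cmac ^= pt[i]
#	X = AES(ctr);  cmac = AES(cmac)		# two tracks interleaved
#	ct[i] = pt[i] ^ X;  ctr64++
# The two-track interleave is why $rounds is halved and two round keys
# are consumed per iteration of the ccm64_enc2_loop.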
582{ my $cmac=$inout1;
583&function_begin("aesni_ccm64_encrypt_blocks");
584 &mov ($inp,&wparam(0));
585 &mov ($out,&wparam(1));
586 &mov ($len,&wparam(2));
587 &mov ($key,&wparam(3));
588 &mov ($rounds_,&wparam(4));
589 &mov ($rounds,&wparam(5));
590 &mov ($key_,"esp");
591 &sub ("esp",60);
592 &and ("esp",-16); # align stack
593 &mov (&DWP(48,"esp"),$key_);
594
595 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
596 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
597 &mov ($rounds,&DWP(240,$key));
598
599 # compose byte-swap control mask for pshufb on stack
600 &mov (&DWP(0,"esp"),0x0c0d0e0f);
601 &mov (&DWP(4,"esp"),0x08090a0b);
602 &mov (&DWP(8,"esp"),0x04050607);
603 &mov (&DWP(12,"esp"),0x00010203);
604
605 # compose counter increment vector on stack
606 &mov ($rounds_,1);
607 &xor ($key_,$key_);
608 &mov (&DWP(16,"esp"),$rounds_);
609 &mov (&DWP(20,"esp"),$key_);
610 &mov (&DWP(24,"esp"),$key_);
611 &mov (&DWP(28,"esp"),$key_);
612
613 &shr ($rounds,1);
614 &lea ($key_,&DWP(0,$key));
615 &movdqa ($inout3,&QWP(0,"esp"));
616 &movdqa ($inout0,$ivec);
617 &mov ($rounds_,$rounds);
618 &pshufb ($ivec,$inout3);
619
620&set_label("ccm64_enc_outer");
621 &$movekey ($rndkey0,&QWP(0,$key_));
622 &mov ($rounds,$rounds_);
623 &movups ($in0,&QWP(0,$inp));
624
625 &xorps ($inout0,$rndkey0);
626 &$movekey ($rndkey1,&QWP(16,$key_));
627 &xorps ($rndkey0,$in0);
628 &lea ($key,&DWP(32,$key_));
629 &xorps ($cmac,$rndkey0); # cmac^=inp
630 &$movekey ($rndkey0,&QWP(0,$key));
631
632&set_label("ccm64_enc2_loop");
633 &aesenc ($inout0,$rndkey1);
634 &dec ($rounds);
635 &aesenc ($cmac,$rndkey1);
636 &$movekey ($rndkey1,&QWP(16,$key));
637 &aesenc ($inout0,$rndkey0);
638 &lea ($key,&DWP(32,$key));
639 &aesenc ($cmac,$rndkey0);
640 &$movekey ($rndkey0,&QWP(0,$key));
641 &jnz (&label("ccm64_enc2_loop"));
642 &aesenc ($inout0,$rndkey1);
643 &aesenc ($cmac,$rndkey1);
644 &paddq ($ivec,&QWP(16,"esp"));
645 &aesenclast ($inout0,$rndkey0);
646 &aesenclast ($cmac,$rndkey0);
647
648 &dec ($len);
649 &lea ($inp,&DWP(16,$inp));
650 &xorps ($in0,$inout0); # inp^=E(ivec)
651 &movdqa ($inout0,$ivec);
652 &movups (&QWP(0,$out),$in0); # save output
653 &lea ($out,&DWP(16,$out));
654 &pshufb ($inout0,$inout3);
655 &jnz (&label("ccm64_enc_outer"));
656
657 &mov ("esp",&DWP(48,"esp"));
658 &mov ($out,&wparam(5));
659 &movups (&QWP(0,$out),$cmac);
660&function_end("aesni_ccm64_encrypt_blocks");
661
662&function_begin("aesni_ccm64_decrypt_blocks");
663 &mov ($inp,&wparam(0));
664 &mov ($out,&wparam(1));
665 &mov ($len,&wparam(2));
666 &mov ($key,&wparam(3));
667 &mov ($rounds_,&wparam(4));
668 &mov ($rounds,&wparam(5));
669 &mov ($key_,"esp");
670 &sub ("esp",60);
671 &and ("esp",-16); # align stack
672 &mov (&DWP(48,"esp"),$key_);
673
674 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
675 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
676 &mov ($rounds,&DWP(240,$key));
677
678 # compose byte-swap control mask for pshufb on stack
679 &mov (&DWP(0,"esp"),0x0c0d0e0f);
680 &mov (&DWP(4,"esp"),0x08090a0b);
681 &mov (&DWP(8,"esp"),0x04050607);
682 &mov (&DWP(12,"esp"),0x00010203);
683
684 # compose counter increment vector on stack
685 &mov ($rounds_,1);
686 &xor ($key_,$key_);
687 &mov (&DWP(16,"esp"),$rounds_);
688 &mov (&DWP(20,"esp"),$key_);
689 &mov (&DWP(24,"esp"),$key_);
690 &mov (&DWP(28,"esp"),$key_);
691
692 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
693 &movdqa ($inout0,$ivec);
694
695 &mov ($key_,$key);
696 &mov ($rounds_,$rounds);
697
698 &pshufb ($ivec,$inout3);
699 if ($inline)
700 { &aesni_inline_generate1("enc"); }
701 else
702 { &call ("_aesni_encrypt1"); }
703 &movups ($in0,&QWP(0,$inp)); # load inp
704 &paddq ($ivec,&QWP(16,"esp"));
705 &lea ($inp,&QWP(16,$inp));
706 &jmp (&label("ccm64_dec_outer"));
707
708&set_label("ccm64_dec_outer",16);
709 &xorps ($in0,$inout0); # inp ^= E(ivec)
710 &movdqa ($inout0,$ivec);
711 &mov ($rounds,$rounds_);
712 &movups (&QWP(0,$out),$in0); # save output
713 &lea ($out,&DWP(16,$out));
714 &pshufb ($inout0,$inout3);
715
716 &sub ($len,1);
717 &jz (&label("ccm64_dec_break"));
718
719 &$movekey ($rndkey0,&QWP(0,$key_));
720 &shr ($rounds,1);
721 &$movekey ($rndkey1,&QWP(16,$key_));
722 &xorps ($in0,$rndkey0);
723 &lea ($key,&DWP(32,$key_));
724 &xorps ($inout0,$rndkey0);
725 &xorps ($cmac,$in0); # cmac^=out
726 &$movekey ($rndkey0,&QWP(0,$key));
727
728&set_label("ccm64_dec2_loop");
729 &aesenc ($inout0,$rndkey1);
730 &dec ($rounds);
731 &aesenc ($cmac,$rndkey1);
732 &$movekey ($rndkey1,&QWP(16,$key));
733 &aesenc ($inout0,$rndkey0);
734 &lea ($key,&DWP(32,$key));
735 &aesenc ($cmac,$rndkey0);
736 &$movekey ($rndkey0,&QWP(0,$key));
737 &jnz (&label("ccm64_dec2_loop"));
738 &movups ($in0,&QWP(0,$inp)); # load inp
739 &paddq ($ivec,&QWP(16,"esp"));
740 &aesenc ($inout0,$rndkey1);
741 &aesenc ($cmac,$rndkey1);
742 &lea ($inp,&QWP(16,$inp));
743 &aesenclast ($inout0,$rndkey0);
744 &aesenclast ($cmac,$rndkey0);
745 &jmp (&label("ccm64_dec_outer"));
746
747&set_label("ccm64_dec_break",16);
748 &mov ($key,$key_);
749 if ($inline)
750 { &aesni_inline_generate1("enc",$cmac,$in0); }
751 else
752 { &call ("_aesni_encrypt1",$cmac); }
753
754 &mov ("esp",&DWP(48,"esp"));
755 &mov ($out,&wparam(5));
756 &movups (&QWP(0,$out),$cmac);
757&function_end("aesni_ccm64_decrypt_blocks");
758}
759
760######################################################################
761# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
762# size_t blocks, const AES_KEY *key,
763# const char *ivec);
764#
765# Handles only complete blocks, operates on a 32-bit counter, and
766# does not update *ivec! (see engine/eng_aesni.c for details)
767#
768# stack layout:
769# 0 pshufb mask
770# 16 vector addend: 0,6,6,6
771# 32 counter-less ivec
772# 48 1st triplet of counter vector
773# 64 2nd triplet of counter vector
774# 80 saved %esp
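# Counter handling in brief (a summary of the code below): the 32-bit
# big-endian counter c is pulled out of the ivec with pextrd and
# byte-swapped; two triplets (c,c+1,c+2) and (c+3,c+4,c+5) are kept at
# 48(%esp) and 64(%esp), re-merged with the counter-less ivec on each
# iteration, and both bumped by the addend at 16(%esp), i.e. +6 in
# every counter lane.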
775
776&function_begin("aesni_ctr32_encrypt_blocks");
777 &mov ($inp,&wparam(0));
778 &mov ($out,&wparam(1));
779 &mov ($len,&wparam(2));
780 &mov ($key,&wparam(3));
781 &mov ($rounds_,&wparam(4));
782 &mov ($key_,"esp");
783 &sub ("esp",88);
784 &and ("esp",-16); # align stack
785 &mov (&DWP(80,"esp"),$key_);
786
787 &cmp ($len,1);
788 &je (&label("ctr32_one_shortcut"));
789
790 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
791
792 # compose byte-swap control mask for pshufb on stack
793 &mov (&DWP(0,"esp"),0x0c0d0e0f);
794 &mov (&DWP(4,"esp"),0x08090a0b);
795 &mov (&DWP(8,"esp"),0x04050607);
796 &mov (&DWP(12,"esp"),0x00010203);
797
798 # compose counter increment vector on stack
799 &mov ($rounds,6);
800 &xor ($key_,$key_);
801 &mov (&DWP(16,"esp"),$rounds);
802 &mov (&DWP(20,"esp"),$rounds);
803 &mov (&DWP(24,"esp"),$rounds);
804 &mov (&DWP(28,"esp"),$key_);
805
806 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
807 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
808
809 &mov ($rounds,&DWP(240,$key)); # key->rounds
810
811 # compose 2 vectors of 3x32-bit counters
812 &bswap ($rounds_);
813 &pxor ($rndkey1,$rndkey1);
814 &pxor ($rndkey0,$rndkey0);
815 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
816 &pinsrd ($rndkey1,$rounds_,0);
817 &lea ($key_,&DWP(3,$rounds_));
818 &pinsrd ($rndkey0,$key_,0);
819 &inc ($rounds_);
820 &pinsrd ($rndkey1,$rounds_,1);
821 &inc ($key_);
822 &pinsrd ($rndkey0,$key_,1);
823 &inc ($rounds_);
824 &pinsrd ($rndkey1,$rounds_,2);
825 &inc ($key_);
826 &pinsrd ($rndkey0,$key_,2);
827 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
828 &pshufb ($rndkey1,$inout0); # byte swap
829 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
830 &pshufb ($rndkey0,$inout0); # byte swap
831
832 &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
833 &pshufd ($inout1,$rndkey1,2<<6);
834 &cmp ($len,6);
835 &jb (&label("ctr32_tail"));
836 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
837 &shr ($rounds,1);
838 &mov ($key_,$key); # backup $key
839 &mov ($rounds_,$rounds); # backup $rounds
840 &sub ($len,6);
841 &jmp (&label("ctr32_loop6"));
842
843&set_label("ctr32_loop6",16);
844 &pshufd ($inout2,$rndkey1,1<<6);
845 &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
846 &pshufd ($inout3,$rndkey0,3<<6);
847 &por ($inout0,$rndkey1); # merge counter-less ivec
848 &pshufd ($inout4,$rndkey0,2<<6);
849 &por ($inout1,$rndkey1);
850 &pshufd ($inout5,$rndkey0,1<<6);
851 &por ($inout2,$rndkey1);
852 &por ($inout3,$rndkey1);
853 &por ($inout4,$rndkey1);
854 &por ($inout5,$rndkey1);
855
856 # inlining _aesni_encrypt6's prologue gives ~4% improvement...
857 &$movekey ($rndkey0,&QWP(0,$key_));
858 &$movekey ($rndkey1,&QWP(16,$key_));
859 &lea ($key,&DWP(32,$key_));
860 &dec ($rounds);
861 &pxor ($inout0,$rndkey0);
862 &pxor ($inout1,$rndkey0);
863 &aesenc ($inout0,$rndkey1);
864 &pxor ($inout2,$rndkey0);
865 &aesenc ($inout1,$rndkey1);
866 &pxor ($inout3,$rndkey0);
867 &aesenc ($inout2,$rndkey1);
868 &pxor ($inout4,$rndkey0);
869 &aesenc ($inout3,$rndkey1);
870 &pxor ($inout5,$rndkey0);
871 &aesenc ($inout4,$rndkey1);
872 &$movekey ($rndkey0,&QWP(0,$key));
873 &aesenc ($inout5,$rndkey1);
874
875 &call (&label("_aesni_encrypt6_enter"));
876
877 &movups ($rndkey1,&QWP(0,$inp));
878 &movups ($rndkey0,&QWP(0x10,$inp));
879 &xorps ($inout0,$rndkey1);
880 &movups ($rndkey1,&QWP(0x20,$inp));
881 &xorps ($inout1,$rndkey0);
882 &movups (&QWP(0,$out),$inout0);
883 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
884 &xorps ($inout2,$rndkey1);
885 &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
886 &movups (&QWP(0x10,$out),$inout1);
887 &movups (&QWP(0x20,$out),$inout2);
888
889 &paddd ($rndkey1,$rndkey0); # 1st triplet increment
890 &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
891 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
892
893 &movups ($inout1,&QWP(0x30,$inp));
894 &movups ($inout2,&QWP(0x40,$inp));
895 &xorps ($inout3,$inout1);
896 &movups ($inout1,&QWP(0x50,$inp));
897 &lea ($inp,&DWP(0x60,$inp));
898 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
899 &pshufb ($rndkey1,$inout0); # byte swap
900 &xorps ($inout4,$inout2);
901 &movups (&QWP(0x30,$out),$inout3);
902 &xorps ($inout5,$inout1);
903 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
904 &pshufb ($rndkey0,$inout0); # byte swap
905 &movups (&QWP(0x40,$out),$inout4);
906 &pshufd ($inout0,$rndkey1,3<<6);
907 &movups (&QWP(0x50,$out),$inout5);
908 &lea ($out,&DWP(0x60,$out));
909
910 &mov ($rounds,$rounds_);
911 &pshufd ($inout1,$rndkey1,2<<6);
912 &sub ($len,6);
913 &jnc (&label("ctr32_loop6"));
914
915 &add ($len,6);
916 &jz (&label("ctr32_ret"));
917 &mov ($key,$key_);
918 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
919 &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec
920
921&set_label("ctr32_tail");
922 &por ($inout0,$inout5);
923 &cmp ($len,2);
924 &jb (&label("ctr32_one"));
925
926 &pshufd ($inout2,$rndkey1,1<<6);
927 &por ($inout1,$inout5);
928 &je (&label("ctr32_two"));
929
930 &pshufd ($inout3,$rndkey0,3<<6);
931 &por ($inout2,$inout5);
932 &cmp ($len,4);
933 &jb (&label("ctr32_three"));
934
935 &pshufd ($inout4,$rndkey0,2<<6);
936 &por ($inout3,$inout5);
937 &je (&label("ctr32_four"));
938
939 &por ($inout4,$inout5);
940 &call ("_aesni_encrypt6");
941 &movups ($rndkey1,&QWP(0,$inp));
942 &movups ($rndkey0,&QWP(0x10,$inp));
943 &xorps ($inout0,$rndkey1);
944 &movups ($rndkey1,&QWP(0x20,$inp));
945 &xorps ($inout1,$rndkey0);
946 &movups ($rndkey0,&QWP(0x30,$inp));
947 &xorps ($inout2,$rndkey1);
948 &movups ($rndkey1,&QWP(0x40,$inp));
949 &xorps ($inout3,$rndkey0);
950 &movups (&QWP(0,$out),$inout0);
951 &xorps ($inout4,$rndkey1);
952 &movups (&QWP(0x10,$out),$inout1);
953 &movups (&QWP(0x20,$out),$inout2);
954 &movups (&QWP(0x30,$out),$inout3);
955 &movups (&QWP(0x40,$out),$inout4);
956 &jmp (&label("ctr32_ret"));
957
958&set_label("ctr32_one_shortcut",16);
959 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
960 &mov ($rounds,&DWP(240,$key));
961
962&set_label("ctr32_one");
963 if ($inline)
964 { &aesni_inline_generate1("enc"); }
965 else
966 { &call ("_aesni_encrypt1"); }
967 &movups ($in0,&QWP(0,$inp));
968 &xorps ($in0,$inout0);
969 &movups (&QWP(0,$out),$in0);
970 &jmp (&label("ctr32_ret"));
971
972&set_label("ctr32_two",16);
973 &call ("_aesni_encrypt3");
974 &movups ($inout3,&QWP(0,$inp));
975 &movups ($inout4,&QWP(0x10,$inp));
976 &xorps ($inout0,$inout3);
977 &xorps ($inout1,$inout4);
978 &movups (&QWP(0,$out),$inout0);
979 &movups (&QWP(0x10,$out),$inout1);
980 &jmp (&label("ctr32_ret"));
981
982&set_label("ctr32_three",16);
983 &call ("_aesni_encrypt3");
984 &movups ($inout3,&QWP(0,$inp));
985 &movups ($inout4,&QWP(0x10,$inp));
986 &xorps ($inout0,$inout3);
987 &movups ($inout5,&QWP(0x20,$inp));
988 &xorps ($inout1,$inout4);
989 &movups (&QWP(0,$out),$inout0);
990 &xorps ($inout2,$inout5);
991 &movups (&QWP(0x10,$out),$inout1);
992 &movups (&QWP(0x20,$out),$inout2);
993 &jmp (&label("ctr32_ret"));
994
995&set_label("ctr32_four",16);
996 &call ("_aesni_encrypt4");
997 &movups ($inout4,&QWP(0,$inp));
998 &movups ($inout5,&QWP(0x10,$inp));
999 &movups ($rndkey1,&QWP(0x20,$inp));
1000 &xorps ($inout0,$inout4);
1001 &movups ($rndkey0,&QWP(0x30,$inp));
1002 &xorps ($inout1,$inout5);
1003 &movups (&QWP(0,$out),$inout0);
1004 &xorps ($inout2,$rndkey1);
1005 &movups (&QWP(0x10,$out),$inout1);
1006 &xorps ($inout3,$rndkey0);
1007 &movups (&QWP(0x20,$out),$inout2);
1008 &movups (&QWP(0x30,$out),$inout3);
1009
1010&set_label("ctr32_ret");
1011 &mov ("esp",&DWP(80,"esp"));
1012&function_end("aesni_ctr32_encrypt_blocks");
1013
1014######################################################################
1015# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1016# const AES_KEY *key1, const AES_KEY *key2
1017# const unsigned char iv[16]);
1018#
1019{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1020
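# Tweak update in brief: between blocks the XTS tweak is multiplied by x
# in GF(2^128), i.e. T = (T << 1) ^ (0x87 if bit 127 was set).  The code
# below does this branchlessly: pcmpgtd broadcasts the sign bits into
# dword lane masks, pshufd 0x13 routes the top lane's mask to the low
# dword (ANDed with 0x87, the residue) and the bit-63 mask to the high
# qword (ANDed with 1, the cross-half carry), paddq doubles both 64-bit
# halves, and pand/pxor with the (0x87,0,1,0) constant at 96(%esp) fold
# residue and carry back in.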
1021&function_begin("aesni_xts_encrypt");
1022 &mov ($key,&wparam(4)); # key2
1023 &mov ($inp,&wparam(5)); # clear-text tweak
1024
1025 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1026 &movups ($inout0,&QWP(0,$inp));
1027 if ($inline)
1028 { &aesni_inline_generate1("enc"); }
1029 else
1030 { &call ("_aesni_encrypt1"); }
1031
1032 &mov ($inp,&wparam(0));
1033 &mov ($out,&wparam(1));
1034 &mov ($len,&wparam(2));
1035 &mov ($key,&wparam(3)); # key1
1036
1037 &mov ($key_,"esp");
1038 &sub ("esp",16*7+8);
1039 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1040 &and ("esp",-16); # align stack
1041
1042 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1043 &mov (&DWP(16*6+4,"esp"),0);
1044 &mov (&DWP(16*6+8,"esp"),1);
1045 &mov (&DWP(16*6+12,"esp"),0);
1046 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1047 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1048
1049 &movdqa ($tweak,$inout0);
1050 &pxor ($twtmp,$twtmp);
1051 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1052 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1053
1054 &and ($len,-16);
1055 &mov ($key_,$key); # backup $key
1056 &mov ($rounds_,$rounds); # backup $rounds
1057 &sub ($len,16*6);
1058 &jc (&label("xts_enc_short"));
1059
1060 &shr ($rounds,1);
1061 &mov ($rounds_,$rounds);
1062 &jmp (&label("xts_enc_loop6"));
1063
1064&set_label("xts_enc_loop6",16);
1065 for ($i=0;$i<4;$i++) {
1066 &pshufd ($twres,$twtmp,0x13);
1067 &pxor ($twtmp,$twtmp);
1068 &movdqa (&QWP(16*$i,"esp"),$tweak);
1069 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1070 &pand ($twres,$twmask); # isolate carry and residue
1071 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1072 &pxor ($tweak,$twres);
1073 }
1074 &pshufd ($inout5,$twtmp,0x13);
1075 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1076 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1077 &$movekey ($rndkey0,&QWP(0,$key_));
1078 &pand ($inout5,$twmask); # isolate carry and residue
1079 &movups ($inout0,&QWP(0,$inp)); # load input
1080 &pxor ($inout5,$tweak);
1081
1082 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1083 &movdqu ($inout1,&QWP(16*1,$inp));
1084 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1085 &movdqu ($inout2,&QWP(16*2,$inp));
1086 &pxor ($inout1,$rndkey0);
1087 &movdqu ($inout3,&QWP(16*3,$inp));
1088 &pxor ($inout2,$rndkey0);
1089 &movdqu ($inout4,&QWP(16*4,$inp));
1090 &pxor ($inout3,$rndkey0);
1091 &movdqu ($rndkey1,&QWP(16*5,$inp));
1092 &pxor ($inout4,$rndkey0);
1093 &lea ($inp,&DWP(16*6,$inp));
1094 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1095 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1096 &pxor ($inout5,$rndkey1);
1097
1098 &$movekey ($rndkey1,&QWP(16,$key_));
1099 &lea ($key,&DWP(32,$key_));
1100 &pxor ($inout1,&QWP(16*1,"esp"));
1101 &aesenc ($inout0,$rndkey1);
1102 &pxor ($inout2,&QWP(16*2,"esp"));
1103 &aesenc ($inout1,$rndkey1);
1104 &pxor ($inout3,&QWP(16*3,"esp"));
1105 &dec ($rounds);
1106 &aesenc ($inout2,$rndkey1);
1107 &pxor ($inout4,&QWP(16*4,"esp"));
1108 &aesenc ($inout3,$rndkey1);
1109 &pxor ($inout5,$rndkey0);
1110 &aesenc ($inout4,$rndkey1);
1111 &$movekey ($rndkey0,&QWP(0,$key));
1112 &aesenc ($inout5,$rndkey1);
1113 &call (&label("_aesni_encrypt6_enter"));
1114
1115 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1116 &pxor ($twtmp,$twtmp);
1117 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1118 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1119 &xorps ($inout1,&QWP(16*1,"esp"));
1120 &movups (&QWP(16*0,$out),$inout0); # write output
1121 &xorps ($inout2,&QWP(16*2,"esp"));
1122 &movups (&QWP(16*1,$out),$inout1);
1123 &xorps ($inout3,&QWP(16*3,"esp"));
1124 &movups (&QWP(16*2,$out),$inout2);
1125 &xorps ($inout4,&QWP(16*4,"esp"));
1126 &movups (&QWP(16*3,$out),$inout3);
1127 &xorps ($inout5,$tweak);
1128 &movups (&QWP(16*4,$out),$inout4);
1129 &pshufd ($twres,$twtmp,0x13);
1130 &movups (&QWP(16*5,$out),$inout5);
1131 &lea ($out,&DWP(16*6,$out));
1132 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1133
1134 &pxor ($twtmp,$twtmp);
1135 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1136 &pand ($twres,$twmask); # isolate carry and residue
1137 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1138 &mov ($rounds,$rounds_); # restore $rounds
1139 &pxor ($tweak,$twres);
1140
1141 &sub ($len,16*6);
1142 &jnc (&label("xts_enc_loop6"));
1143
1144 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
1145 &mov ($key,$key_); # restore $key
1146 &mov ($rounds_,$rounds);
1147
1148&set_label("xts_enc_short");
1149 &add ($len,16*6);
1150 &jz (&label("xts_enc_done6x"));
1151
1152 &movdqa ($inout3,$tweak); # put aside previous tweak
1153 &cmp ($len,0x20);
1154 &jb (&label("xts_enc_one"));
1155
1156 &pshufd ($twres,$twtmp,0x13);
1157 &pxor ($twtmp,$twtmp);
1158 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1159 &pand ($twres,$twmask); # isolate carry and residue
1160 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1161 &pxor ($tweak,$twres);
1162 &je (&label("xts_enc_two"));
1163
1164 &pshufd ($twres,$twtmp,0x13);
1165 &pxor ($twtmp,$twtmp);
1166 &movdqa ($inout4,$tweak); # put aside previous tweak
1167 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1168 &pand ($twres,$twmask); # isolate carry and residue
1169 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1170 &pxor ($tweak,$twres);
1171 &cmp ($len,0x40);
1172 &jb (&label("xts_enc_three"));
1173
1174 &pshufd ($twres,$twtmp,0x13);
1175 &pxor ($twtmp,$twtmp);
1176 &movdqa ($inout5,$tweak); # put aside previous tweak
1177 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1178 &pand ($twres,$twmask); # isolate carry and residue
1179 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1180 &pxor ($tweak,$twres);
1181 &movdqa (&QWP(16*0,"esp"),$inout3);
1182 &movdqa (&QWP(16*1,"esp"),$inout4);
1183 &je (&label("xts_enc_four"));
1184
1185 &movdqa (&QWP(16*2,"esp"),$inout5);
1186 &pshufd ($inout5,$twtmp,0x13);
1187 &movdqa (&QWP(16*3,"esp"),$tweak);
1188 &paddq ($tweak,$tweak); # &psllq($inout0,1);
1189 &pand ($inout5,$twmask); # isolate carry and residue
1190 &pxor ($inout5,$tweak);
1191
1192 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1193 &movdqu ($inout1,&QWP(16*1,$inp));
1194 &movdqu ($inout2,&QWP(16*2,$inp));
1195 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1196 &movdqu ($inout3,&QWP(16*3,$inp));
1197 &pxor ($inout1,&QWP(16*1,"esp"));
1198 &movdqu ($inout4,&QWP(16*4,$inp));
1199 &pxor ($inout2,&QWP(16*2,"esp"));
1200 &lea ($inp,&DWP(16*5,$inp));
1201 &pxor ($inout3,&QWP(16*3,"esp"));
1202 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1203 &pxor ($inout4,$inout5);
1204
1205 &call ("_aesni_encrypt6");
1206
1207 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1208 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1209 &xorps ($inout1,&QWP(16*1,"esp"));
1210 &xorps ($inout2,&QWP(16*2,"esp"));
1211 &movups (&QWP(16*0,$out),$inout0); # write output
1212 &xorps ($inout3,&QWP(16*3,"esp"));
1213 &movups (&QWP(16*1,$out),$inout1);
1214 &xorps ($inout4,$tweak);
1215 &movups (&QWP(16*2,$out),$inout2);
1216 &movups (&QWP(16*3,$out),$inout3);
1217 &movups (&QWP(16*4,$out),$inout4);
1218 &lea ($out,&DWP(16*5,$out));
1219 &jmp (&label("xts_enc_done"));
1220
1221&set_label("xts_enc_one",16);
1222 &movups ($inout0,&QWP(16*0,$inp)); # load input
1223 &lea ($inp,&DWP(16*1,$inp));
1224 &xorps ($inout0,$inout3); # input^=tweak
1225 if ($inline)
1226 { &aesni_inline_generate1("enc"); }
1227 else
1228 { &call ("_aesni_encrypt1"); }
1229 &xorps ($inout0,$inout3); # output^=tweak
1230 &movups (&QWP(16*0,$out),$inout0); # write output
1231 &lea ($out,&DWP(16*1,$out));
1232
1233 &movdqa ($tweak,$inout3); # last tweak
1234 &jmp (&label("xts_enc_done"));
1235
1236&set_label("xts_enc_two",16);
1237 &movaps ($inout4,$tweak); # put aside last tweak
1238
1239 &movups ($inout0,&QWP(16*0,$inp)); # load input
1240 &movups ($inout1,&QWP(16*1,$inp));
1241 &lea ($inp,&DWP(16*2,$inp));
1242 &xorps ($inout0,$inout3); # input^=tweak
1243 &xorps ($inout1,$inout4);
1244 &xorps ($inout2,$inout2);
1245
1246 &call ("_aesni_encrypt3");
1247
1248 &xorps ($inout0,$inout3); # output^=tweak
1249 &xorps ($inout1,$inout4);
1250 &movups (&QWP(16*0,$out),$inout0); # write output
1251 &movups (&QWP(16*1,$out),$inout1);
1252 &lea ($out,&DWP(16*2,$out));
1253
1254 &movdqa ($tweak,$inout4); # last tweak
1255 &jmp (&label("xts_enc_done"));
1256
1257&set_label("xts_enc_three",16);
1258 &movaps ($inout5,$tweak); # put aside last tweak
1259 &movups ($inout0,&QWP(16*0,$inp)); # load input
1260 &movups ($inout1,&QWP(16*1,$inp));
1261 &movups ($inout2,&QWP(16*2,$inp));
1262 &lea ($inp,&DWP(16*3,$inp));
1263 &xorps ($inout0,$inout3); # input^=tweak
1264 &xorps ($inout1,$inout4);
1265 &xorps ($inout2,$inout5);
1266
1267 &call ("_aesni_encrypt3");
1268
1269 &xorps ($inout0,$inout3); # output^=tweak
1270 &xorps ($inout1,$inout4);
1271 &xorps ($inout2,$inout5);
1272 &movups (&QWP(16*0,$out),$inout0); # write output
1273 &movups (&QWP(16*1,$out),$inout1);
1274 &movups (&QWP(16*2,$out),$inout2);
1275 &lea ($out,&DWP(16*3,$out));
1276
1277 &movdqa ($tweak,$inout5); # last tweak
1278 &jmp (&label("xts_enc_done"));
1279
1280&set_label("xts_enc_four",16);
1281 &movaps ($inout4,$tweak); # put aside last tweak
1282
1283 &movups ($inout0,&QWP(16*0,$inp)); # load input
1284 &movups ($inout1,&QWP(16*1,$inp));
1285 &movups ($inout2,&QWP(16*2,$inp));
1286 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1287 &movups ($inout3,&QWP(16*3,$inp));
1288 &lea ($inp,&DWP(16*4,$inp));
1289 &xorps ($inout1,&QWP(16*1,"esp"));
1290 &xorps ($inout2,$inout5);
1291 &xorps ($inout3,$inout4);
1292
1293 &call ("_aesni_encrypt4");
1294
1295 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1296 &xorps ($inout1,&QWP(16*1,"esp"));
1297 &xorps ($inout2,$inout5);
1298 &movups (&QWP(16*0,$out),$inout0); # write output
1299 &xorps ($inout3,$inout4);
1300 &movups (&QWP(16*1,$out),$inout1);
1301 &movups (&QWP(16*2,$out),$inout2);
1302 &movups (&QWP(16*3,$out),$inout3);
1303 &lea ($out,&DWP(16*4,$out));
1304
1305 &movdqa ($tweak,$inout4); # last tweak
1306 &jmp (&label("xts_enc_done"));
1307
1308&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1309 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1310 &and ($len,15);
1311 &jz (&label("xts_enc_ret"));
1312 &movdqa ($inout3,$tweak);
1313 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1314 &jmp (&label("xts_enc_steal"));
1315
1316&set_label("xts_enc_done",16);
1317 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1318 &pxor ($twtmp,$twtmp);
1319 &and ($len,15);
1320 &jz (&label("xts_enc_ret"));
1321
1322 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1323 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1324 &pshufd ($inout3,$twtmp,0x13);
1325 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1326 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1327 &pxor ($inout3,$tweak);
1328
1329&set_label("xts_enc_steal");
1330 &movz ($rounds,&BP(0,$inp));
1331 &movz ($key,&BP(-16,$out));
1332 &lea ($inp,&DWP(1,$inp));
1333 &mov (&BP(-16,$out),&LB($rounds));
1334 &mov (&BP(0,$out),&LB($key));
1335 &lea ($out,&DWP(1,$out));
1336 &sub ($len,1);
1337 &jnz (&label("xts_enc_steal"));
1338
1339 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1340 &mov ($key,$key_); # restore $key
1341 &mov ($rounds,$rounds_); # restore $rounds
1342
1343 &movups ($inout0,&QWP(-16,$out)); # load input
1344 &xorps ($inout0,$inout3); # input^=tweak
1345 if ($inline)
1346 { &aesni_inline_generate1("enc"); }
1347 else
1348 { &call ("_aesni_encrypt1"); }
1349 &xorps ($inout0,$inout3); # output^=tweak
1350 &movups (&QWP(-16,$out),$inout0); # write output
1351
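# The steal loop above implements XTS ciphertext stealing for a final
# partial block of $len%16 bytes: plaintext tail bytes are copied over
# the head of the last full ciphertext block while the displaced
# ciphertext bytes become the short output tail; the merged block is
# then re-encrypted under the final tweak. In byte terms, roughly
# (a sketch with hypothetical names, not the generated code):
#
#	C_tail = substr(C_last, 0, rem)			# stolen ciphertext
#	C_last = E(key1, (P_tail . substr(C_last, rem)) ^ T) ^ T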
1352&set_label("xts_enc_ret");
1353 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1354&function_end("aesni_xts_encrypt");
1355
1356&function_begin("aesni_xts_decrypt");
1357 &mov ($key,&wparam(4)); # key2
1358 &mov ($inp,&wparam(5)); # clear-text tweak
1359
1360 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1361 &movups ($inout0,&QWP(0,$inp));
1362 if ($inline)
1363 { &aesni_inline_generate1("enc"); }
1364 else
1365 { &call ("_aesni_encrypt1"); }
1366
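# (The block above computes the initial tweak T0 = E(key2, iv); every
# subsequent per-block tweak is obtained from it by repeated doubling
# in GF(2^128) -- see the note at the tweak-mask setup below.)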
1367 &mov ($inp,&wparam(0));
1368 &mov ($out,&wparam(1));
1369 &mov ($len,&wparam(2));
1370 &mov ($key,&wparam(3)); # key1
1371
1372 &mov ($key_,"esp");
1373 &sub ("esp",16*7+8);
1374 &and ("esp",-16); # align stack
1375
1376 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1377 &test ($len,15);
1378 &setnz (&LB($rounds_));
1379 &shl ($rounds_,4);
1380 &sub ($len,$rounds_);
1381
1382 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1383 &mov (&DWP(16*6+4,"esp"),0);
1384 &mov (&DWP(16*6+8,"esp"),1);
1385 &mov (&DWP(16*6+12,"esp"),0);
1386 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1387 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1388
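# The constant composed above is the XTS tweak-doubling mask: as two
# qwords it reads [0x87, 1]. Each tweak update below multiplies the
# tweak by x in GF(2^128) (polynomial x^128+x^7+x^2+x+1): paddq shifts
# both 64-bit halves left by one, while pcmpgtd/pshufd/pand recover the
# two lost carry bits -- bit 63 is re-inserted into the high half and
# bit 127 folds back into the low byte as 0x87. A scalar Perl model of
# one update (a sketch; assumes a Perl built with 64-bit integers):
sub xts_double {
	my ($lo, $hi) = @_;			# tweak as two 64-bit halves
	my $carry_lo = ($lo >> 63) & 1;		# bit 63 moves into the high half
	my $carry_hi = ($hi >> 63) & 1;		# bit 127 wraps around, reduced
	$lo = ($lo << 1) & 0xffffffffffffffff;
	$hi = (($hi << 1) & 0xffffffffffffffff) | $carry_lo;
	$lo ^= 0x87 if $carry_hi;
	return ($lo, $hi);
}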
1389 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1390 &mov ($key_,$key); # backup $key
1391 &mov ($rounds_,$rounds); # backup $rounds
1392
1393 &movdqa ($tweak,$inout0);
1394 &pxor ($twtmp,$twtmp);
1395 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1396 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1397
1398 &and ($len,-16);
1399 &sub ($len,16*6);
1400 &jc (&label("xts_dec_short"));
1401
1402 &shr ($rounds,1);
1403 &mov ($rounds_,$rounds);
1404 &jmp (&label("xts_dec_loop6"));
1405
1406&set_label("xts_dec_loop6",16);
1407 for ($i=0;$i<4;$i++) {
1408 &pshufd ($twres,$twtmp,0x13);
1409 &pxor ($twtmp,$twtmp);
1410 &movdqa (&QWP(16*$i,"esp"),$tweak);
1411 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1412 &pand ($twres,$twmask); # isolate carry and residue
1413 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1414 &pxor ($tweak,$twres);
1415 }
1416 &pshufd ($inout5,$twtmp,0x13);
1417 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1418 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1419 &$movekey ($rndkey0,&QWP(0,$key_));
1420 &pand ($inout5,$twmask); # isolate carry and residue
1421 &movups ($inout0,&QWP(0,$inp)); # load input
1422 &pxor ($inout5,$tweak);
1423
1424	# inline _aesni_decrypt6 prologue and flip xor with tweak and key[0]
1425 &movdqu ($inout1,&QWP(16*1,$inp));
1426 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1427 &movdqu ($inout2,&QWP(16*2,$inp));
1428 &pxor ($inout1,$rndkey0);
1429 &movdqu ($inout3,&QWP(16*3,$inp));
1430 &pxor ($inout2,$rndkey0);
1431 &movdqu ($inout4,&QWP(16*4,$inp));
1432 &pxor ($inout3,$rndkey0);
1433 &movdqu ($rndkey1,&QWP(16*5,$inp));
1434 &pxor ($inout4,$rndkey0);
1435 &lea ($inp,&DWP(16*6,$inp));
1436 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1437 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1438 &pxor ($inout5,$rndkey1);
1439
1440 &$movekey ($rndkey1,&QWP(16,$key_));
1441 &lea ($key,&DWP(32,$key_));
1442 &pxor ($inout1,&QWP(16*1,"esp"));
1443 &aesdec ($inout0,$rndkey1);
1444 &pxor ($inout2,&QWP(16*2,"esp"));
1445 &aesdec ($inout1,$rndkey1);
1446 &pxor ($inout3,&QWP(16*3,"esp"));
1447 &dec ($rounds);
1448 &aesdec ($inout2,$rndkey1);
1449 &pxor ($inout4,&QWP(16*4,"esp"));
1450 &aesdec ($inout3,$rndkey1);
1451 &pxor ($inout5,$rndkey0);
1452 &aesdec ($inout4,$rndkey1);
1453 &$movekey ($rndkey0,&QWP(0,$key));
1454 &aesdec ($inout5,$rndkey1);
1455 &call (&label("_aesni_decrypt6_enter"));
1456
1457 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1458 &pxor ($twtmp,$twtmp);
1459 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1460 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1461 &xorps ($inout1,&QWP(16*1,"esp"));
1462 &movups (&QWP(16*0,$out),$inout0); # write output
1463 &xorps ($inout2,&QWP(16*2,"esp"));
1464 &movups (&QWP(16*1,$out),$inout1);
1465 &xorps ($inout3,&QWP(16*3,"esp"));
1466 &movups (&QWP(16*2,$out),$inout2);
1467 &xorps ($inout4,&QWP(16*4,"esp"));
1468 &movups (&QWP(16*3,$out),$inout3);
1469 &xorps ($inout5,$tweak);
1470 &movups (&QWP(16*4,$out),$inout4);
1471 &pshufd ($twres,$twtmp,0x13);
1472 &movups (&QWP(16*5,$out),$inout5);
1473 &lea ($out,&DWP(16*6,$out));
1474 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1475
1476 &pxor ($twtmp,$twtmp);
1477 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1478 &pand ($twres,$twmask); # isolate carry and residue
1479 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1480 &mov ($rounds,$rounds_); # restore $rounds
1481 &pxor ($tweak,$twres);
1482
1483 &sub ($len,16*6);
1484 &jnc (&label("xts_dec_loop6"));
1485
1486 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
1487 &mov ($key,$key_); # restore $key
1488 &mov ($rounds_,$rounds);
1489
1490&set_label("xts_dec_short");
1491 &add ($len,16*6);
1492 &jz (&label("xts_dec_done6x"));
1493
1494 &movdqa ($inout3,$tweak); # put aside previous tweak
1495 &cmp ($len,0x20);
1496 &jb (&label("xts_dec_one"));
1497
1498 &pshufd ($twres,$twtmp,0x13);
1499 &pxor ($twtmp,$twtmp);
1500 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1501 &pand ($twres,$twmask); # isolate carry and residue
1502 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1503 &pxor ($tweak,$twres);
1504 &je (&label("xts_dec_two"));
1505
1506 &pshufd ($twres,$twtmp,0x13);
1507 &pxor ($twtmp,$twtmp);
1508 &movdqa ($inout4,$tweak); # put aside previous tweak
1509 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1510 &pand ($twres,$twmask); # isolate carry and residue
1511 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1512 &pxor ($tweak,$twres);
1513 &cmp ($len,0x40);
1514 &jb (&label("xts_dec_three"));
1515
1516 &pshufd ($twres,$twtmp,0x13);
1517 &pxor ($twtmp,$twtmp);
1518 &movdqa ($inout5,$tweak); # put aside previous tweak
1519 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1520 &pand ($twres,$twmask); # isolate carry and residue
1521 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1522 &pxor ($tweak,$twres);
1523 &movdqa (&QWP(16*0,"esp"),$inout3);
1524 &movdqa (&QWP(16*1,"esp"),$inout4);
1525 &je (&label("xts_dec_four"));
1526
1527 &movdqa (&QWP(16*2,"esp"),$inout5);
1528 &pshufd ($inout5,$twtmp,0x13);
1529 &movdqa (&QWP(16*3,"esp"),$tweak);
1530	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1531 &pand ($inout5,$twmask); # isolate carry and residue
1532 &pxor ($inout5,$tweak);
1533
1534 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1535 &movdqu ($inout1,&QWP(16*1,$inp));
1536 &movdqu ($inout2,&QWP(16*2,$inp));
1537 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1538 &movdqu ($inout3,&QWP(16*3,$inp));
1539 &pxor ($inout1,&QWP(16*1,"esp"));
1540 &movdqu ($inout4,&QWP(16*4,$inp));
1541 &pxor ($inout2,&QWP(16*2,"esp"));
1542 &lea ($inp,&DWP(16*5,$inp));
1543 &pxor ($inout3,&QWP(16*3,"esp"));
1544 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1545 &pxor ($inout4,$inout5);
1546
1547 &call ("_aesni_decrypt6");
1548
1549 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1550 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1551 &xorps ($inout1,&QWP(16*1,"esp"));
1552 &xorps ($inout2,&QWP(16*2,"esp"));
1553 &movups (&QWP(16*0,$out),$inout0); # write output
1554 &xorps ($inout3,&QWP(16*3,"esp"));
1555 &movups (&QWP(16*1,$out),$inout1);
1556 &xorps ($inout4,$tweak);
1557 &movups (&QWP(16*2,$out),$inout2);
1558 &movups (&QWP(16*3,$out),$inout3);
1559 &movups (&QWP(16*4,$out),$inout4);
1560 &lea ($out,&DWP(16*5,$out));
1561 &jmp (&label("xts_dec_done"));
1562
1563&set_label("xts_dec_one",16);
1564 &movups ($inout0,&QWP(16*0,$inp)); # load input
1565 &lea ($inp,&DWP(16*1,$inp));
1566 &xorps ($inout0,$inout3); # input^=tweak
1567 if ($inline)
1568 { &aesni_inline_generate1("dec"); }
1569 else
1570 { &call ("_aesni_decrypt1"); }
1571 &xorps ($inout0,$inout3); # output^=tweak
1572 &movups (&QWP(16*0,$out),$inout0); # write output
1573 &lea ($out,&DWP(16*1,$out));
1574
1575 &movdqa ($tweak,$inout3); # last tweak
1576 &jmp (&label("xts_dec_done"));
1577
1578&set_label("xts_dec_two",16);
1579 &movaps ($inout4,$tweak); # put aside last tweak
1580
1581 &movups ($inout0,&QWP(16*0,$inp)); # load input
1582 &movups ($inout1,&QWP(16*1,$inp));
1583 &lea ($inp,&DWP(16*2,$inp));
1584 &xorps ($inout0,$inout3); # input^=tweak
1585 &xorps ($inout1,$inout4);
1586
1587 &call ("_aesni_decrypt3");
1588
1589 &xorps ($inout0,$inout3); # output^=tweak
1590 &xorps ($inout1,$inout4);
1591 &movups (&QWP(16*0,$out),$inout0); # write output
1592 &movups (&QWP(16*1,$out),$inout1);
1593 &lea ($out,&DWP(16*2,$out));
1594
1595 &movdqa ($tweak,$inout4); # last tweak
1596 &jmp (&label("xts_dec_done"));
1597
1598&set_label("xts_dec_three",16);
1599 &movaps ($inout5,$tweak); # put aside last tweak
1600 &movups ($inout0,&QWP(16*0,$inp)); # load input
1601 &movups ($inout1,&QWP(16*1,$inp));
1602 &movups ($inout2,&QWP(16*2,$inp));
1603 &lea ($inp,&DWP(16*3,$inp));
1604 &xorps ($inout0,$inout3); # input^=tweak
1605 &xorps ($inout1,$inout4);
1606 &xorps ($inout2,$inout5);
1607
1608 &call ("_aesni_decrypt3");
1609
1610 &xorps ($inout0,$inout3); # output^=tweak
1611 &xorps ($inout1,$inout4);
1612 &xorps ($inout2,$inout5);
1613 &movups (&QWP(16*0,$out),$inout0); # write output
1614 &movups (&QWP(16*1,$out),$inout1);
1615 &movups (&QWP(16*2,$out),$inout2);
1616 &lea ($out,&DWP(16*3,$out));
1617
1618 &movdqa ($tweak,$inout5); # last tweak
1619 &jmp (&label("xts_dec_done"));
1620
1621&set_label("xts_dec_four",16);
1622 &movaps ($inout4,$tweak); # put aside last tweak
1623
1624 &movups ($inout0,&QWP(16*0,$inp)); # load input
1625 &movups ($inout1,&QWP(16*1,$inp));
1626 &movups ($inout2,&QWP(16*2,$inp));
1627 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1628 &movups ($inout3,&QWP(16*3,$inp));
1629 &lea ($inp,&DWP(16*4,$inp));
1630 &xorps ($inout1,&QWP(16*1,"esp"));
1631 &xorps ($inout2,$inout5);
1632 &xorps ($inout3,$inout4);
1633
1634 &call ("_aesni_decrypt4");
1635
1636 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1637 &xorps ($inout1,&QWP(16*1,"esp"));
1638 &xorps ($inout2,$inout5);
1639 &movups (&QWP(16*0,$out),$inout0); # write output
1640 &xorps ($inout3,$inout4);
1641 &movups (&QWP(16*1,$out),$inout1);
1642 &movups (&QWP(16*2,$out),$inout2);
1643 &movups (&QWP(16*3,$out),$inout3);
1644 &lea ($out,&DWP(16*4,$out));
1645
1646 &movdqa ($tweak,$inout4); # last tweak
1647 &jmp (&label("xts_dec_done"));
1648
1649&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1650 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1651 &and ($len,15);
1652 &jz (&label("xts_dec_ret"));
1653 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1654 &jmp (&label("xts_dec_only_one_more"));
1655
1656&set_label("xts_dec_done",16);
1657 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1658 &pxor ($twtmp,$twtmp);
1659 &and ($len,15);
1660 &jz (&label("xts_dec_ret"));
1661
1662 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1663 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1664 &pshufd ($twres,$twtmp,0x13);
1665 &pxor ($twtmp,$twtmp);
1666 &movdqa ($twmask,&QWP(16*6,"esp"));
1667 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1668 &pand ($twres,$twmask); # isolate carry and residue
1669 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1670 &pxor ($tweak,$twres);
1671
1672&set_label("xts_dec_only_one_more");
1673 &pshufd ($inout3,$twtmp,0x13);
1674 &movdqa ($inout4,$tweak); # put aside previous tweak
1675 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1676 &pand ($inout3,$twmask); # isolate carry and residue
1677 &pxor ($inout3,$tweak);
1678
1679 &mov ($key,$key_); # restore $key
1680 &mov ($rounds,$rounds_); # restore $rounds
1681
1682 &movups ($inout0,&QWP(0,$inp)); # load input
1683 &xorps ($inout0,$inout3); # input^=tweak
1684 if ($inline)
1685 { &aesni_inline_generate1("dec"); }
1686 else
1687 { &call ("_aesni_decrypt1"); }
1688 &xorps ($inout0,$inout3); # output^=tweak
1689 &movups (&QWP(0,$out),$inout0); # write output
1690
1691&set_label("xts_dec_steal");
1692 &movz ($rounds,&BP(16,$inp));
1693 &movz ($key,&BP(0,$out));
1694 &lea ($inp,&DWP(1,$inp));
1695 &mov (&BP(0,$out),&LB($rounds));
1696 &mov (&BP(16,$out),&LB($key));
1697 &lea ($out,&DWP(1,$out));
1698 &sub ($len,1);
1699 &jnz (&label("xts_dec_steal"));
1700
1701 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1702 &mov ($key,$key_); # restore $key
1703 &mov ($rounds,$rounds_); # restore $rounds
1704
1705 &movups ($inout0,&QWP(0,$out)); # load input
1706 &xorps ($inout0,$inout4); # input^=tweak
1707 if ($inline)
1708 { &aesni_inline_generate1("dec"); }
1709 else
1710 { &call ("_aesni_decrypt1"); }
1711 &xorps ($inout0,$inout4); # output^=tweak
1712 &movups (&QWP(0,$out),$inout0); # write output
1713
1714&set_label("xts_dec_ret");
1715 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1716&function_end("aesni_xts_decrypt");
1717}
1718}
1719
1720######################################################################
1721# void $PREFIX_cbc_encrypt (const void *inp, void *out,
1722# size_t length, const AES_KEY *key,
1723# unsigned char *ivp,const int enc);
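# For reference, this generates textbook CBC: C[i] = E(P[i] ^ C[i-1])
# with C[-1] = IV for encryption. A minimal Perl model of the mode (a
# sketch; $E is a hypothetical one-block encrypt coderef and blocks are
# 16-byte strings, combined with Perl's bitwise string ^):
sub cbc_encrypt_ref {
	my ($E, $iv, @blocks) = @_;
	my @out;
	my $prev = $iv;
	for my $p (@blocks) {
		$prev = $E->($p ^ $prev);	# chain previous ciphertext in
		push @out, $prev;
	}
	return @out;
}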
1724&function_begin("${PREFIX}_cbc_encrypt");
1725 &mov ($inp,&wparam(0));
1726 &mov ($rounds_,"esp");
1727 &mov ($out,&wparam(1));
1728 &sub ($rounds_,24);
1729 &mov ($len,&wparam(2));
1730 &and ($rounds_,-16);
1731 &mov ($key,&wparam(3));
1732 &mov ($key_,&wparam(4));
1733 &test ($len,$len);
1734 &jz (&label("cbc_abort"));
1735
1736 &cmp (&wparam(5),0);
1737 &xchg ($rounds_,"esp"); # alloca
1738 &movups ($ivec,&QWP(0,$key_)); # load IV
1739 &mov ($rounds,&DWP(240,$key));
1740 &mov ($key_,$key); # backup $key
1741 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
1742 &mov ($rounds_,$rounds); # backup $rounds
1743 &je (&label("cbc_decrypt"));
1744
1745 &movaps ($inout0,$ivec);
1746 &cmp ($len,16);
1747 &jb (&label("cbc_enc_tail"));
1748 &sub ($len,16);
1749 &jmp (&label("cbc_enc_loop"));
1750
1751&set_label("cbc_enc_loop",16);
1752 &movups ($ivec,&QWP(0,$inp)); # input actually
1753 &lea ($inp,&DWP(16,$inp));
1754 if ($inline)
1755 { &aesni_inline_generate1("enc",$inout0,$ivec); }
1756 else
1757 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
1758 &mov ($rounds,$rounds_); # restore $rounds
1759 &mov ($key,$key_); # restore $key
1760 &movups (&QWP(0,$out),$inout0); # store output
1761 &lea ($out,&DWP(16,$out));
1762 &sub ($len,16);
1763 &jnc (&label("cbc_enc_loop"));
1764 &add ($len,16);
1765 &jnz (&label("cbc_enc_tail"));
1766 &movaps ($ivec,$inout0);
1767 &jmp (&label("cbc_ret"));
1768
1769&set_label("cbc_enc_tail");
1770 &mov ("ecx",$len); # zaps $rounds
1771 &data_word(0xA4F3F689); # rep movsb
1772 &mov ("ecx",16); # zero tail
1773 &sub ("ecx",$len);
1774 &xor ("eax","eax"); # zaps $len
1775 &data_word(0xAAF3F689); # rep stosb
1776 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
1777 &mov ($rounds,$rounds_); # restore $rounds
1778 &mov ($inp,$out); # $inp and $out are the same
1779 &mov ($key,$key_); # restore $key
1780 &jmp (&label("cbc_enc_loop"));
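# The tail path above copies the remaining $len input bytes into the
# last block (rep movsb) and zero-fills the rest (rep stosb) -- i.e.
# the final partial block is zero-padded, not padded per any scheme --
# then loops once more with $inp == $out to encrypt it in place.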
1781######################################################################
1782&set_label("cbc_decrypt",16);
1783 &cmp ($len,0x50);
1784 &jbe (&label("cbc_dec_tail"));
1785 &movaps (&QWP(0,"esp"),$ivec); # save IV
1786 &sub ($len,0x50);
1787 &jmp (&label("cbc_dec_loop6_enter"));
1788
1789&set_label("cbc_dec_loop6",16);
1790 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
1791 &movups (&QWP(0,$out),$inout5);
1792 &lea ($out,&DWP(0x10,$out));
1793&set_label("cbc_dec_loop6_enter");
1794 &movdqu ($inout0,&QWP(0,$inp));
1795 &movdqu ($inout1,&QWP(0x10,$inp));
1796 &movdqu ($inout2,&QWP(0x20,$inp));
1797 &movdqu ($inout3,&QWP(0x30,$inp));
1798 &movdqu ($inout4,&QWP(0x40,$inp));
1799 &movdqu ($inout5,&QWP(0x50,$inp));
1800
1801 &call ("_aesni_decrypt6");
1802
1803 &movups ($rndkey1,&QWP(0,$inp));
1804 &movups ($rndkey0,&QWP(0x10,$inp));
1805 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
1806 &xorps ($inout1,$rndkey1);
1807 &movups ($rndkey1,&QWP(0x20,$inp));
1808 &xorps ($inout2,$rndkey0);
1809 &movups ($rndkey0,&QWP(0x30,$inp));
1810 &xorps ($inout3,$rndkey1);
1811 &movups ($rndkey1,&QWP(0x40,$inp));
1812 &xorps ($inout4,$rndkey0);
1813 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
1814 &xorps ($inout5,$rndkey1);
1815 &movups (&QWP(0,$out),$inout0);
1816 &movups (&QWP(0x10,$out),$inout1);
1817 &lea ($inp,&DWP(0x60,$inp));
1818 &movups (&QWP(0x20,$out),$inout2);
1819	&mov	($rounds,$rounds_);		# restore $rounds
1820 &movups (&QWP(0x30,$out),$inout3);
1821 &mov ($key,$key_); # restore $key
1822 &movups (&QWP(0x40,$out),$inout4);
1823 &lea ($out,&DWP(0x50,$out));
1824 &sub ($len,0x60);
1825 &ja (&label("cbc_dec_loop6"));
1826
1827 &movaps ($inout0,$inout5);
1828 &movaps ($ivec,$rndkey0);
1829 &add ($len,0x50);
1830 &jle (&label("cbc_dec_tail_collected"));
1831 &movups (&QWP(0,$out),$inout0);
1832 &lea ($out,&DWP(0x10,$out));
1833&set_label("cbc_dec_tail");
1834 &movups ($inout0,&QWP(0,$inp));
1835 &movaps ($in0,$inout0);
1836 &cmp ($len,0x10);
1837 &jbe (&label("cbc_dec_one"));
1838
1839 &movups ($inout1,&QWP(0x10,$inp));
1840 &movaps ($in1,$inout1);
1841 &cmp ($len,0x20);
1842 &jbe (&label("cbc_dec_two"));
1843
1844 &movups ($inout2,&QWP(0x20,$inp));
1845 &cmp ($len,0x30);
1846 &jbe (&label("cbc_dec_three"));
1847
1848 &movups ($inout3,&QWP(0x30,$inp));
1849 &cmp ($len,0x40);
1850 &jbe (&label("cbc_dec_four"));
1851
1852 &movups ($inout4,&QWP(0x40,$inp));
1853 &movaps (&QWP(0,"esp"),$ivec); # save IV
1854 &movups ($inout0,&QWP(0,$inp));
1855 &xorps ($inout5,$inout5);
1856 &call ("_aesni_decrypt6");
1857 &movups ($rndkey1,&QWP(0,$inp));
1858 &movups ($rndkey0,&QWP(0x10,$inp));
1859 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
1860 &xorps ($inout1,$rndkey1);
1861 &movups ($rndkey1,&QWP(0x20,$inp));
1862 &xorps ($inout2,$rndkey0);
1863 &movups ($rndkey0,&QWP(0x30,$inp));
1864 &xorps ($inout3,$rndkey1);
1865 &movups ($ivec,&QWP(0x40,$inp)); # IV
1866 &xorps ($inout4,$rndkey0);
1867 &movups (&QWP(0,$out),$inout0);
1868 &movups (&QWP(0x10,$out),$inout1);
1869 &movups (&QWP(0x20,$out),$inout2);
1870 &movups (&QWP(0x30,$out),$inout3);
1871 &lea ($out,&DWP(0x40,$out));
1872 &movaps ($inout0,$inout4);
1873 &sub ($len,0x50);
1874 &jmp (&label("cbc_dec_tail_collected"));
1875
1876&set_label("cbc_dec_one",16);
1877 if ($inline)
1878 { &aesni_inline_generate1("dec"); }
1879 else
1880 { &call ("_aesni_decrypt1"); }
1881 &xorps ($inout0,$ivec);
1882 &movaps ($ivec,$in0);
1883 &sub ($len,0x10);
1884 &jmp (&label("cbc_dec_tail_collected"));
1885
1886&set_label("cbc_dec_two",16);
1887 &xorps ($inout2,$inout2);
1888 &call ("_aesni_decrypt3");
1889 &xorps ($inout0,$ivec);
1890 &xorps ($inout1,$in0);
1891 &movups (&QWP(0,$out),$inout0);
1892 &movaps ($inout0,$inout1);
1893 &lea ($out,&DWP(0x10,$out));
1894 &movaps ($ivec,$in1);
1895 &sub ($len,0x20);
1896 &jmp (&label("cbc_dec_tail_collected"));
1897
1898&set_label("cbc_dec_three",16);
1899 &call ("_aesni_decrypt3");
1900 &xorps ($inout0,$ivec);
1901 &xorps ($inout1,$in0);
1902 &xorps ($inout2,$in1);
1903 &movups (&QWP(0,$out),$inout0);
1904 &movaps ($inout0,$inout2);
1905 &movups (&QWP(0x10,$out),$inout1);
1906 &lea ($out,&DWP(0x20,$out));
1907 &movups ($ivec,&QWP(0x20,$inp));
1908 &sub ($len,0x30);
1909 &jmp (&label("cbc_dec_tail_collected"));
1910
1911&set_label("cbc_dec_four",16);
1912 &call ("_aesni_decrypt4");
1913 &movups ($rndkey1,&QWP(0x10,$inp));
1914 &movups ($rndkey0,&QWP(0x20,$inp));
1915 &xorps ($inout0,$ivec);
1916 &movups ($ivec,&QWP(0x30,$inp));
1917 &xorps ($inout1,$in0);
1918 &movups (&QWP(0,$out),$inout0);
1919 &xorps ($inout2,$rndkey1);
1920 &movups (&QWP(0x10,$out),$inout1);
1921 &xorps ($inout3,$rndkey0);
1922 &movups (&QWP(0x20,$out),$inout2);
1923 &lea ($out,&DWP(0x30,$out));
1924 &movaps ($inout0,$inout3);
1925 &sub ($len,0x40);
1926
1927&set_label("cbc_dec_tail_collected");
1928 &and ($len,15);
1929 &jnz (&label("cbc_dec_tail_partial"));
1930 &movups (&QWP(0,$out),$inout0);
1931 &jmp (&label("cbc_ret"));
1932
1933&set_label("cbc_dec_tail_partial",16);
1934 &movaps (&QWP(0,"esp"),$inout0);
1935 &mov ("ecx",16);
1936 &mov ($inp,"esp");
1937 &sub ("ecx",$len);
1938 &data_word(0xA4F3F689); # rep movsb
1939
1940&set_label("cbc_ret");
1941 &mov ("esp",&DWP(16,"esp")); # pull original %esp
1942 &mov ($key_,&wparam(4));
1943 &movups (&QWP(0,$key_),$ivec); # output IV
1944&set_label("cbc_abort");
1945&function_end("${PREFIX}_cbc_encrypt");
1946
1947######################################################################
1948# Mechanical port from aesni-x86_64.pl.
1949#
1950# _aesni_set_encrypt_key is a private interface,
1951# input:
1952# "eax" const unsigned char *userKey
1953# $rounds int bits
1954# $key AES_KEY *key
1955# output:
1956# "eax" return code
1957# $rounds	rounds
1958
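# The aeskeygenassist immediates used below are the AES round constants
# rcon[i] = x^(i-1) in GF(2^8), i.e. successive doublings reduced by the
# AES polynomial 0x11b: 0x01,0x02,0x04,...,0x80,0x1b,0x36. A sketch:
sub rcon_list {
	my ($n) = @_;
	my @rc = (0x01);
	while (@rc < $n) {
		my $r = $rc[-1] << 1;
		$r ^= 0x11b if $r & 0x100;	# reduce modulo x^8+x^4+x^3+x+1
		push @rc, $r;
	}
	return @rc;	# e.g. rcon_list(10) gives the ten 128-bit-key constants
}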
1959&function_begin_B("_aesni_set_encrypt_key");
1960 &test ("eax","eax");
1961 &jz (&label("bad_pointer"));
1962 &test ($key,$key);
1963 &jz (&label("bad_pointer"));
1964
1965 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
1966 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
1967 &lea ($key,&DWP(16,$key));
1968 &cmp ($rounds,256);
1969 &je (&label("14rounds"));
1970 &cmp ($rounds,192);
1971 &je (&label("12rounds"));
1972 &cmp ($rounds,128);
1973 &jne (&label("bad_keybits"));
1974
1975&set_label("10rounds",16);
1976 &mov ($rounds,9);
1977 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
1978 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
1979 &call (&label("key_128_cold"));
1980 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
1981 &call (&label("key_128"));
1982 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
1983 &call (&label("key_128"));
1984 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
1985 &call (&label("key_128"));
1986 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
1987 &call (&label("key_128"));
1988 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
1989 &call (&label("key_128"));
1990 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
1991 &call (&label("key_128"));
1992 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
1993 &call (&label("key_128"));
1994 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
1995 &call (&label("key_128"));
1996 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
1997 &call (&label("key_128"));
1998 &$movekey (&QWP(0,$key),"xmm0");
1999 &mov (&DWP(80,$key),$rounds);
2000 &xor ("eax","eax");
2001 &ret();
2002
2003&set_label("key_128",16);
2004 &$movekey (&QWP(0,$key),"xmm0");
2005 &lea ($key,&DWP(16,$key));
2006&set_label("key_128_cold");
2007 &shufps ("xmm4","xmm0",0b00010000);
2008 &xorps ("xmm0","xmm4");
2009 &shufps ("xmm4","xmm0",0b10001100);
2010 &xorps ("xmm0","xmm4");
2011 &shufps ("xmm1","xmm1",0b11111111); # critical path
2012 &xorps ("xmm0","xmm1");
2013 &ret();
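# key_128/key_128_cold compute one AES-128 expansion step entirely in
# SSE registers: the shufps 0b11111111 broadcasts t = SubWord(RotWord(w3))
# ^ rcon to all lanes, and the two shufps/xorps pairs turn [w0,w1,w2,w3]
# into the running XOR [w0, w0^w1, w0^w1^w2, w0^w1^w2^w3], so one final
# xorps yields the next four round-key words. Word-level model (a sketch):
sub expand128_step {
	my ($w, $t) = @_;	# $w: four 32-bit words, $t: SubWord(RotWord(w3))^rcon
	my @n = ($w->[0] ^ $t);
	$n[$_] = $w->[$_] ^ $n[$_ - 1] for 1 .. 3;
	return \@n;
}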
2014
2015&set_label("12rounds",16);
2016 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
2017 &mov ($rounds,11);
2018	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
2019 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
2020 &call (&label("key_192a_cold"));
2021 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
2022 &call (&label("key_192b"));
2023 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
2024 &call (&label("key_192a"));
2025 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
2026 &call (&label("key_192b"));
2027 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
2028 &call (&label("key_192a"));
2029 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
2030 &call (&label("key_192b"));
2031 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
2032 &call (&label("key_192a"));
2033 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
2034 &call (&label("key_192b"));
2035 &$movekey (&QWP(0,$key),"xmm0");
2036 &mov (&DWP(48,$key),$rounds);
2037 &xor ("eax","eax");
2038 &ret();
2039
2040&set_label("key_192a",16);
2041 &$movekey (&QWP(0,$key),"xmm0");
2042 &lea ($key,&DWP(16,$key));
2043&set_label("key_192a_cold",16);
2044 &movaps ("xmm5","xmm2");
2045&set_label("key_192b_warm");
2046 &shufps ("xmm4","xmm0",0b00010000);
2047 &movdqa ("xmm3","xmm2");
2048 &xorps ("xmm0","xmm4");
2049 &shufps ("xmm4","xmm0",0b10001100);
2050 &pslldq ("xmm3",4);
2051 &xorps ("xmm0","xmm4");
2052 &pshufd ("xmm1","xmm1",0b01010101); # critical path
2053 &pxor ("xmm2","xmm3");
2054 &pxor ("xmm0","xmm1");
2055 &pshufd ("xmm3","xmm0",0b11111111);
2056 &pxor ("xmm2","xmm3");
2057 &ret();
2058
2059&set_label("key_192b",16);
2060 &movaps ("xmm3","xmm0");
2061 &shufps ("xmm5","xmm0",0b01000100);
2062 &$movekey (&QWP(0,$key),"xmm5");
2063 &shufps ("xmm3","xmm2",0b01001110);
2064 &$movekey (&QWP(16,$key),"xmm3");
2065 &lea ($key,&DWP(32,$key));
2066 &jmp (&label("key_192b_warm"));
2067
2068&set_label("14rounds",16);
2069 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
2070 &mov ($rounds,13);
2071 &lea ($key,&DWP(16,$key));
2072 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
2073 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
2074 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
2075 &call (&label("key_256a_cold"));
2076 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
2077 &call (&label("key_256b"));
2078 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
2079 &call (&label("key_256a"));
2080 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
2081 &call (&label("key_256b"));
2082 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
2083 &call (&label("key_256a"));
2084 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
2085 &call (&label("key_256b"));
2086 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
2087 &call (&label("key_256a"));
2088 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
2089 &call (&label("key_256b"));
2090 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
2091 &call (&label("key_256a"));
2092 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
2093 &call (&label("key_256b"));
2094 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
2095 &call (&label("key_256a"));
2096 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
2097 &call (&label("key_256b"));
2098 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
2099 &call (&label("key_256a"));
2100 &$movekey (&QWP(0,$key),"xmm0");
2101 &mov (&DWP(16,$key),$rounds);
2102 &xor ("eax","eax");
2103 &ret();
2104
2105&set_label("key_256a",16);
2106 &$movekey (&QWP(0,$key),"xmm2");
2107 &lea ($key,&DWP(16,$key));
2108&set_label("key_256a_cold");
2109 &shufps ("xmm4","xmm0",0b00010000);
2110 &xorps ("xmm0","xmm4");
2111 &shufps ("xmm4","xmm0",0b10001100);
2112 &xorps ("xmm0","xmm4");
2113 &shufps ("xmm1","xmm1",0b11111111); # critical path
2114 &xorps ("xmm0","xmm1");
2115 &ret();
2116
2117&set_label("key_256b",16);
2118 &$movekey (&QWP(0,$key),"xmm0");
2119 &lea ($key,&DWP(16,$key));
2120
2121 &shufps ("xmm4","xmm2",0b00010000);
2122 &xorps ("xmm2","xmm4");
2123 &shufps ("xmm4","xmm2",0b10001100);
2124 &xorps ("xmm2","xmm4");
2125 &shufps ("xmm1","xmm1",0b10101010); # critical path
2126 &xorps ("xmm2","xmm1");
2127 &ret();
2128
2129&set_label("bad_pointer",4);
2130 &mov ("eax",-1);
2131 &ret ();
2132&set_label("bad_keybits",4);
2133 &mov ("eax",-2);
2134 &ret ();
2135&function_end_B("_aesni_set_encrypt_key");
2136
2137# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
2138# AES_KEY *key)
2139&function_begin_B("${PREFIX}_set_encrypt_key");
2140 &mov ("eax",&wparam(0));
2141 &mov ($rounds,&wparam(1));
2142 &mov ($key,&wparam(2));
2143 &call ("_aesni_set_encrypt_key");
2144 &ret ();
2145&function_end_B("${PREFIX}_set_encrypt_key");
2146
2147# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
2148# AES_KEY *key)
2149&function_begin_B("${PREFIX}_set_decrypt_key");
2150 &mov ("eax",&wparam(0));
2151 &mov ($rounds,&wparam(1));
2152 &mov ($key,&wparam(2));
2153 &call ("_aesni_set_encrypt_key");
2154 &mov ($key,&wparam(2));
2155	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
2156 &test ("eax","eax");
2157 &jnz (&label("dec_key_ret"));
2158 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
2159
2160 &$movekey ("xmm0",&QWP(0,$key)); # just swap
2161 &$movekey ("xmm1",&QWP(0,"eax"));
2162 &$movekey (&QWP(0,"eax"),"xmm0");
2163 &$movekey (&QWP(0,$key),"xmm1");
2164 &lea ($key,&DWP(16,$key));
2165 &lea ("eax",&DWP(-16,"eax"));
2166
2167&set_label("dec_key_inverse");
2168 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
2169 &$movekey ("xmm1",&QWP(0,"eax"));
2170 &aesimc ("xmm0","xmm0");
2171 &aesimc ("xmm1","xmm1");
2172 &lea ($key,&DWP(16,$key));
2173 &lea ("eax",&DWP(-16,"eax"));
2174 &$movekey (&QWP(16,"eax"),"xmm0");
2175 &$movekey (&QWP(-16,$key),"xmm1");
2176 &cmp ("eax",$key);
2177 &ja (&label("dec_key_inverse"));
2178
2179 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
2180 &aesimc ("xmm0","xmm0");
2181 &$movekey (&QWP(0,$key),"xmm0");
2182
2183 &xor ("eax","eax"); # return success
2184&set_label("dec_key_ret");
2185 &ret ();
2186&function_end_B("${PREFIX}_set_decrypt_key");
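# Note: the decrypt schedule built above is the AES "equivalent inverse
# cipher" -- the encrypt schedule with round-key order reversed and
# aesimc (InvMixColumns) applied to every round key except the first
# and last, which lets aesdec walk the keys in forward order.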
2187
2188&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
deleted file mode 100644
index 441524036a..0000000000
--- a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
+++ /dev/null
@@ -1,3080 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for Intel AES-NI extension. In
11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
13# details].
14#
15# Performance.
16#
17# Given the aes(enc|dec) instructions' latency, asymptotic performance
18# for non-parallelizable modes such as CBC encrypt is 3.75 cycles per
19# byte processed with 128-bit key. And given their throughput,
20# asymptotic performance for parallelizable modes is 1.25 cycles per
21# byte. Being an asymptotic limit, it's not something you commonly
22# achieve in reality, but how close does one get? Below are results
23# collected for different modes and block sizes. Pairs of numbers are
24# for en-/decryption.
25#
26# 16-byte 64-byte 256-byte 1-KB 8-KB
27# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
28# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
29# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
30# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
31# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
32# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
33#
34# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
35# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
36# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
37# The results were collected with specially crafted speed.c benchmark
38# in order to compare them with results reported in "Intel Advanced
39# Encryption Standard (AES) New Instruction Set" White Paper Revision
40# 3.0 dated May 2010. All above results are consistently better. This
41# module also provides better performance for block sizes smaller than
42# 128 bytes in points *not* represented in the above table.
43#
44# Looking at the results for 8-KB buffer.
45#
46# CFB and OFB results are far from the limit, because implementation
47# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
48# single-block aesni_encrypt, which is not the most optimal way to go.
49# CBC encrypt result is unexpectedly high and there is no documented
50# explanation for it. Seemingly there is a small penalty for feeding
51# the result back to AES unit the way it's done in CBC mode. There is
52# nothing one can do and the result appears optimal. CCM result is
53# identical to CBC, because CBC-MAC is essentially CBC encrypt without
54# saving output. CCM CTR "stays invisible," because it's neatly
55# interleaved with CBC-MAC. This provides ~30% improvement over
56# "straightforward" CCM implementation with CTR and CBC-MAC performed
57# disjointly. Parallelizable modes practically achieve the theoretical
58# limit.
59#
60# Looking at how results vary with buffer size.
61#
62# Curves are practically saturated at 1-KB buffer size. In most cases
63# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
64# CTR curve doesn't follow this pattern and is the "slowest" changing one
65# with "256-byte" result being 87% of "8-KB." This is because overhead
66# in CTR mode is most computationally intensive. Small-block CCM
67# decrypt is slower than encrypt, because first CTR and last CBC-MAC
68# iterations can't be interleaved.
69#
70# Results for 192- and 256-bit keys.
71#
72# EVP-free results were observed to scale perfectly with number of
73# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
74# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
75# are a tad smaller, because the above mentioned penalty biases all
76# results by the same constant value. In a similar way function call
77# overhead affects small-block performance, as well as OFB and CFB
78# results. Differences are not large, most common coefficients are
79# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
80# observes even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
81
82# January 2011
83#
84# While Westmere processor features 6 cycles latency for aes[enc|dec]
85# instructions, which can be scheduled every second cycle, Sandy
86# Bridge spends 8 cycles per instruction, but it can schedule them
87# every cycle. This means that code targeting Westmere would perform
88# suboptimally on Sandy Bridge. Therefore this update.
89#
90# In addition, non-parallelizable CBC encrypt (as well as CCM) is
91# optimized. Relative improvement might appear modest, 8% on Westmere,
92# but in absolute terms it's 3.77 cycles per byte encrypted with
93# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
94# should be compared to asymptotic limits of 3.75 for Westmere and
95# 5.00 for Sandy Bridge. Actually, the fact that they get this close
96# to asymptotic limits is quite amazing. Indeed, the limit is
97# calculated as latency times number of rounds, 10 for 128-bit key,
98# and divided by 16, the number of bytes in block, or in other words
99# it accounts *solely* for aesenc instructions. But there are extra
100# instructions, and numbers so close to the asymptotic limits mean
101# that it's as if it takes as little as *one* additional cycle to
102# execute all of them. How is it possible? It is possible thanks to
103# out-of-order execution logic, which manages to overlap post-
104# processing of previous block, things like saving the output, with
105# actual encryption of current block, as well as pre-processing of
106# current block, things like fetching input and xor-ing it with
107# 0-round element of the key schedule, with actual encryption of
108# previous block. Keep this in mind...
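# (Worked out: 6-cycle aes[enc|dec] latency x 10 rounds / 16 bytes =
# 3.75 cycles per byte on Westmere, and 8 x 10 / 16 = 5.00 on Sandy
# Bridge -- the limits quoted above.)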
109#
110# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
111# performance is achieved by interleaving instructions working on
112# independent blocks. In which case asymptotic limit for such modes
113# can be obtained by dividing above mentioned numbers by AES
114# instructions' interleave factor. Westmere can execute at most 3
115# instructions at a time, meaning that optimal interleave factor is 3,
116# and that's where the "magic" number of 1.25 come from. "Optimal
117# interleave factor" means that increase of interleave factor does
118# not improve performance. The formula has proven to reflect reality
119# pretty well on Westmere... Sandy Bridge on the other hand can
120# execute up to 8 AES instructions at a time, so how does varying
121# interleave factor affect the performance? Here is table for ECB
122# (numbers are cycles per byte processed with 128-bit key):
123#
124# instruction interleave factor 3x 6x 8x
125# theoretical asymptotic limit 1.67 0.83 0.625
126# measured performance for 8KB block 1.05 0.86 0.84
127#
128# "as if" interleave factor 4.7x 5.8x 6.0x
129#
130# Further data for other parallelizable modes:
131#
132# CBC decrypt 1.16 0.93 0.93
133# CTR 1.14 0.91 n/a
134#
135# Well, given the 3x column it's probably inappropriate to call the
136# limit asymptotic, if it can be surpassed, isn't it? What happens
137# there? Rewind to the CBC paragraph for the answer. Yes, out-of-order
138# execution magic is responsible. The processor overlaps not only the
139# additional instructions with AES ones, but even AES instructions
140# processing adjacent triplets of independent blocks. In the 6x case
141# additional instructions still claim a disproportionately small amount
142# of additional cycles, but in the 8x case the instruction count must be
143# a tad too high for out-of-order logic to cope with, and the AES unit
144# remains underutilized... As you can see 8x interleave is hardly
145# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
146# utilizes 6x interleave because of limited register bank capacity.
147#
148# Higher interleave factors do have negative impact on Westmere
149# performance. While for ECB mode it's negligible ~1.5%, other
150# parallelizables perform ~5% worse, which is outweighed by ~25%
151# improvement on Sandy Bridge. To balance regression on Westmere
152# CTR mode was implemented with 6x aesenc interleave factor.
153
154# April 2011
155#
156# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
157# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
158# in CTR mode AES instruction interleave factor was chosen to be 6x.
159
160$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
161 # generates drop-in replacement for
162 # crypto/aes/asm/aes-x86_64.pl:-)
163
164$flavour = shift;
165$output = shift;
166if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
167
168$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
169
170$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
171( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
172( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
173die "can't locate x86_64-xlate.pl";
174
175open OUT,"| \"$^X\" $xlate $flavour $output";
176*STDOUT=*OUT;
177
178$movkey = "movups";		# same instruction for either $PREFIX value
179@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
180 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
181
182$code=".text\n";
183
184$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
185# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
186$inp="%rdi";
187$out="%rsi";
188$len="%rdx";
189$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
190$ivp="%r8"; # cbc, ctr, ...
191
192$rnds_="%r10d"; # backup copy for $rounds
193$key_="%r11"; # backup copy for $key
194
195# %xmm register layout
196$rndkey0="%xmm0"; $rndkey1="%xmm1";
197$inout0="%xmm2"; $inout1="%xmm3";
198$inout2="%xmm4"; $inout3="%xmm5";
199$inout4="%xmm6"; $inout5="%xmm7";
200$inout6="%xmm8"; $inout7="%xmm9";
201
202$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
203$in0="%xmm8"; $iv="%xmm9";
204
205# Inline version of internal aesni_[en|de]crypt1.
206#
207# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
208# cycles which take care of loop variables...
209{ my $sn;
210sub aesni_generate1 {
211my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
212++$sn;
213$code.=<<___;
214 $movkey ($key),$rndkey0
215 $movkey 16($key),$rndkey1
216___
217$code.=<<___ if (defined($ivec));
218 xorps $rndkey0,$ivec
219 lea 32($key),$key
220 xorps $ivec,$inout
221___
222$code.=<<___ if (!defined($ivec));
223 lea 32($key),$key
224 xorps $rndkey0,$inout
225___
226$code.=<<___;
227.Loop_${p}1_$sn:
228 aes${p} $rndkey1,$inout
229 dec $rounds
230 $movkey ($key),$rndkey1
231 lea 16($key),$key
232 jnz .Loop_${p}1_$sn # loop body is 16 bytes
233 aes${p}last $rndkey1,$inout
234___
235}}
236# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
237#
238{ my ($inp,$out,$key) = @_4args;
239
240$code.=<<___;
241.globl ${PREFIX}_encrypt
242.type ${PREFIX}_encrypt,\@abi-omnipotent
243.align 16
244${PREFIX}_encrypt:
245 _CET_ENDBR
246 movups ($inp),$inout0 # load input
247 mov 240($key),$rounds # key->rounds
248___
249 &aesni_generate1("enc",$key,$rounds);
250$code.=<<___;
251 movups $inout0,($out) # output
252 ret
253.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
254
255.globl ${PREFIX}_decrypt
256.type ${PREFIX}_decrypt,\@abi-omnipotent
257.align 16
258${PREFIX}_decrypt:
259 _CET_ENDBR
260 movups ($inp),$inout0 # load input
261 mov 240($key),$rounds # key->rounds
262___
263 &aesni_generate1("dec",$key,$rounds);
264$code.=<<___;
265 movups $inout0,($out) # output
266 ret
267.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
268___
269}
270
271# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
272# factor. Why were 3x subroutines originally used in loops? Even though
273# aes[enc|dec] latency was originally 6, it could be scheduled only
274# every *2nd* cycle. Thus 3x interleave was the one providing optimal
275# utilization, i.e. when the subroutine's throughput is virtually the
276# same as that of a non-interleaved subroutine [for number of input
277# blocks up to 3]. This is why it makes no sense to implement a 2x
278# subroutine. aes[enc|dec] latency in the next processor generation is
279# 8, but the instructions can be scheduled every cycle. Optimal
280# interleave for the new processor is therefore 8x...
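# (I.e. interleave ~= latency/issue interval: 6/2 = 3 blocks keep the
# older AES unit busy, 8/1 = 8 blocks the newer one.)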
281sub aesni_generate3 {
282my $dir=shift;
283# As already mentioned it takes in $key and $rounds, which are *not*
284# preserved. $inout[0-2] is cipher/clear text...
285$code.=<<___;
286.type _aesni_${dir}rypt3,\@abi-omnipotent
287.align 16
288_aesni_${dir}rypt3:
289 _CET_ENDBR
290 $movkey ($key),$rndkey0
291 shr \$1,$rounds
292 $movkey 16($key),$rndkey1
293 lea 32($key),$key
294 xorps $rndkey0,$inout0
295 xorps $rndkey0,$inout1
296 xorps $rndkey0,$inout2
297 $movkey ($key),$rndkey0
298
299.L${dir}_loop3:
300 aes${dir} $rndkey1,$inout0
301 aes${dir} $rndkey1,$inout1
302 dec $rounds
303 aes${dir} $rndkey1,$inout2
304 $movkey 16($key),$rndkey1
305 aes${dir} $rndkey0,$inout0
306 aes${dir} $rndkey0,$inout1
307 lea 32($key),$key
308 aes${dir} $rndkey0,$inout2
309 $movkey ($key),$rndkey0
310 jnz .L${dir}_loop3
311
312 aes${dir} $rndkey1,$inout0
313 aes${dir} $rndkey1,$inout1
314 aes${dir} $rndkey1,$inout2
315 aes${dir}last $rndkey0,$inout0
316 aes${dir}last $rndkey0,$inout1
317 aes${dir}last $rndkey0,$inout2
318 ret
319.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
320___
321}
322# 4x interleave is implemented to improve small block performance,
323# most notably [and naturally] the 4-block case by ~30%. One can argue
324# that 5x should have been implemented as well, but the improvement
325# would be <20%, so it's not worth it...
326sub aesni_generate4 {
327my $dir=shift;
328# As already mentioned it takes in $key and $rounds, which are *not*
329# preserved. $inout[0-3] is cipher/clear text...
330$code.=<<___;
331.type _aesni_${dir}rypt4,\@abi-omnipotent
332.align 16
333_aesni_${dir}rypt4:
334 _CET_ENDBR
335 $movkey ($key),$rndkey0
336 shr \$1,$rounds
337 $movkey 16($key),$rndkey1
338 lea 32($key),$key
339 xorps $rndkey0,$inout0
340 xorps $rndkey0,$inout1
341 xorps $rndkey0,$inout2
342 xorps $rndkey0,$inout3
343 $movkey ($key),$rndkey0
344
345.L${dir}_loop4:
346 aes${dir} $rndkey1,$inout0
347 aes${dir} $rndkey1,$inout1
348 dec $rounds
349 aes${dir} $rndkey1,$inout2
350 aes${dir} $rndkey1,$inout3
351 $movkey 16($key),$rndkey1
352 aes${dir} $rndkey0,$inout0
353 aes${dir} $rndkey0,$inout1
354 lea 32($key),$key
355 aes${dir} $rndkey0,$inout2
356 aes${dir} $rndkey0,$inout3
357 $movkey ($key),$rndkey0
358 jnz .L${dir}_loop4
359
360 aes${dir} $rndkey1,$inout0
361 aes${dir} $rndkey1,$inout1
362 aes${dir} $rndkey1,$inout2
363 aes${dir} $rndkey1,$inout3
364 aes${dir}last $rndkey0,$inout0
365 aes${dir}last $rndkey0,$inout1
366 aes${dir}last $rndkey0,$inout2
367 aes${dir}last $rndkey0,$inout3
368 ret
369.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
370___
371}
372sub aesni_generate6 {
373my $dir=shift;
374# As already mentioned it takes in $key and $rounds, which are *not*
375# preserved. $inout[0-5] is cipher/clear text...
376$code.=<<___;
377.type _aesni_${dir}rypt6,\@abi-omnipotent
378.align 16
379_aesni_${dir}rypt6:
380 _CET_ENDBR
381 $movkey ($key),$rndkey0
382 shr \$1,$rounds
383 $movkey 16($key),$rndkey1
384 lea 32($key),$key
385 xorps $rndkey0,$inout0
386 pxor $rndkey0,$inout1
387 aes${dir} $rndkey1,$inout0
388 pxor $rndkey0,$inout2
389 aes${dir} $rndkey1,$inout1
390 pxor $rndkey0,$inout3
391 aes${dir} $rndkey1,$inout2
392 pxor $rndkey0,$inout4
393 aes${dir} $rndkey1,$inout3
394 pxor $rndkey0,$inout5
395 dec $rounds
396 aes${dir} $rndkey1,$inout4
397 $movkey ($key),$rndkey0
398 aes${dir} $rndkey1,$inout5
399 jmp .L${dir}_loop6_enter
400.align 16
401.L${dir}_loop6:
402 aes${dir} $rndkey1,$inout0
403 aes${dir} $rndkey1,$inout1
404 dec $rounds
405 aes${dir} $rndkey1,$inout2
406 aes${dir} $rndkey1,$inout3
407 aes${dir} $rndkey1,$inout4
408 aes${dir} $rndkey1,$inout5
409.L${dir}_loop6_enter: # happens to be 16-byte aligned
410 $movkey 16($key),$rndkey1
411 aes${dir} $rndkey0,$inout0
412 aes${dir} $rndkey0,$inout1
413 lea 32($key),$key
414 aes${dir} $rndkey0,$inout2
415 aes${dir} $rndkey0,$inout3
416 aes${dir} $rndkey0,$inout4
417 aes${dir} $rndkey0,$inout5
418 $movkey ($key),$rndkey0
419 jnz .L${dir}_loop6
420
421 aes${dir} $rndkey1,$inout0
422 aes${dir} $rndkey1,$inout1
423 aes${dir} $rndkey1,$inout2
424 aes${dir} $rndkey1,$inout3
425 aes${dir} $rndkey1,$inout4
426 aes${dir} $rndkey1,$inout5
427 aes${dir}last $rndkey0,$inout0
428 aes${dir}last $rndkey0,$inout1
429 aes${dir}last $rndkey0,$inout2
430 aes${dir}last $rndkey0,$inout3
431 aes${dir}last $rndkey0,$inout4
432 aes${dir}last $rndkey0,$inout5
433 ret
434.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
435___
436}
437sub aesni_generate8 {
438my $dir=shift;
439# As already mentioned it takes in $key and $rounds, which are *not*
440# preserved. $inout[0-7] is cipher/clear text...
441$code.=<<___;
442.type _aesni_${dir}rypt8,\@abi-omnipotent
443.align 16
444_aesni_${dir}rypt8:
445 _CET_ENDBR
446 $movkey ($key),$rndkey0
447 shr \$1,$rounds
448 $movkey 16($key),$rndkey1
449 lea 32($key),$key
450 xorps $rndkey0,$inout0
451 xorps $rndkey0,$inout1
452 aes${dir} $rndkey1,$inout0
453 pxor $rndkey0,$inout2
454 aes${dir} $rndkey1,$inout1
455 pxor $rndkey0,$inout3
456 aes${dir} $rndkey1,$inout2
457 pxor $rndkey0,$inout4
458 aes${dir} $rndkey1,$inout3
459 pxor $rndkey0,$inout5
460 dec $rounds
461 aes${dir} $rndkey1,$inout4
462 pxor $rndkey0,$inout6
463 aes${dir} $rndkey1,$inout5
464 pxor $rndkey0,$inout7
465 $movkey ($key),$rndkey0
466 aes${dir} $rndkey1,$inout6
467 aes${dir} $rndkey1,$inout7
468 $movkey 16($key),$rndkey1
469 jmp .L${dir}_loop8_enter
470.align 16
471.L${dir}_loop8:
472 aes${dir} $rndkey1,$inout0
473 aes${dir} $rndkey1,$inout1
474 dec $rounds
475 aes${dir} $rndkey1,$inout2
476 aes${dir} $rndkey1,$inout3
477 aes${dir} $rndkey1,$inout4
478 aes${dir} $rndkey1,$inout5
479 aes${dir} $rndkey1,$inout6
480 aes${dir} $rndkey1,$inout7
481 $movkey 16($key),$rndkey1
482.L${dir}_loop8_enter: # happens to be 16-byte aligned
483 aes${dir} $rndkey0,$inout0
484 aes${dir} $rndkey0,$inout1
485 lea 32($key),$key
486 aes${dir} $rndkey0,$inout2
487 aes${dir} $rndkey0,$inout3
488 aes${dir} $rndkey0,$inout4
489 aes${dir} $rndkey0,$inout5
490 aes${dir} $rndkey0,$inout6
491 aes${dir} $rndkey0,$inout7
492 $movkey ($key),$rndkey0
493 jnz .L${dir}_loop8
494
495 aes${dir} $rndkey1,$inout0
496 aes${dir} $rndkey1,$inout1
497 aes${dir} $rndkey1,$inout2
498 aes${dir} $rndkey1,$inout3
499 aes${dir} $rndkey1,$inout4
500 aes${dir} $rndkey1,$inout5
501 aes${dir} $rndkey1,$inout6
502 aes${dir} $rndkey1,$inout7
503 aes${dir}last $rndkey0,$inout0
504 aes${dir}last $rndkey0,$inout1
505 aes${dir}last $rndkey0,$inout2
506 aes${dir}last $rndkey0,$inout3
507 aes${dir}last $rndkey0,$inout4
508 aes${dir}last $rndkey0,$inout5
509 aes${dir}last $rndkey0,$inout6
510 aes${dir}last $rndkey0,$inout7
511 ret
512.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
513___
514}
515&aesni_generate3("enc") if ($PREFIX eq "aesni");
516&aesni_generate3("dec");
517&aesni_generate4("enc") if ($PREFIX eq "aesni");
518&aesni_generate4("dec");
519&aesni_generate6("enc") if ($PREFIX eq "aesni");
520&aesni_generate6("dec");
521&aesni_generate8("enc") if ($PREFIX eq "aesni");
522&aesni_generate8("dec");
523
524if ($PREFIX eq "aesni") {
525########################################################################
526# void aesni_ecb_encrypt (const void *in, void *out,
527# size_t length, const AES_KEY *key,
528# int enc);
529$code.=<<___;
530.globl aesni_ecb_encrypt
531.type aesni_ecb_encrypt,\@function,5
532.align 16
533aesni_ecb_encrypt:
534 _CET_ENDBR
535 and \$-16,$len
536 jz .Lecb_ret
537
538 mov 240($key),$rounds # key->rounds
539 $movkey ($key),$rndkey0
540 mov $key,$key_ # backup $key
541 mov $rounds,$rnds_ # backup $rounds
542 test %r8d,%r8d # 5th argument
543 jz .Lecb_decrypt
544#--------------------------- ECB ENCRYPT ------------------------------#
545 cmp \$0x80,$len
546 jb .Lecb_enc_tail
547
548 movdqu ($inp),$inout0
549 movdqu 0x10($inp),$inout1
550 movdqu 0x20($inp),$inout2
551 movdqu 0x30($inp),$inout3
552 movdqu 0x40($inp),$inout4
553 movdqu 0x50($inp),$inout5
554 movdqu 0x60($inp),$inout6
555 movdqu 0x70($inp),$inout7
556 lea 0x80($inp),$inp
557 sub \$0x80,$len
558 jmp .Lecb_enc_loop8_enter
559.align 16
560.Lecb_enc_loop8:
561 movups $inout0,($out)
562 mov $key_,$key # restore $key
563 movdqu ($inp),$inout0
564 mov $rnds_,$rounds # restore $rounds
565 movups $inout1,0x10($out)
566 movdqu 0x10($inp),$inout1
567 movups $inout2,0x20($out)
568 movdqu 0x20($inp),$inout2
569 movups $inout3,0x30($out)
570 movdqu 0x30($inp),$inout3
571 movups $inout4,0x40($out)
572 movdqu 0x40($inp),$inout4
573 movups $inout5,0x50($out)
574 movdqu 0x50($inp),$inout5
575 movups $inout6,0x60($out)
576 movdqu 0x60($inp),$inout6
577 movups $inout7,0x70($out)
578 lea 0x80($out),$out
579 movdqu 0x70($inp),$inout7
580 lea 0x80($inp),$inp
581.Lecb_enc_loop8_enter:
582
583 call _aesni_encrypt8
584
585 sub \$0x80,$len
586 jnc .Lecb_enc_loop8
587
588 movups $inout0,($out)
589 mov $key_,$key # restore $key
590 movups $inout1,0x10($out)
591 mov $rnds_,$rounds # restore $rounds
592 movups $inout2,0x20($out)
593 movups $inout3,0x30($out)
594 movups $inout4,0x40($out)
595 movups $inout5,0x50($out)
596 movups $inout6,0x60($out)
597 movups $inout7,0x70($out)
598 lea 0x80($out),$out
599 add \$0x80,$len
600 jz .Lecb_ret
601
602.Lecb_enc_tail:
603 movups ($inp),$inout0
604 cmp \$0x20,$len
605 jb .Lecb_enc_one
606 movups 0x10($inp),$inout1
607 je .Lecb_enc_two
608 movups 0x20($inp),$inout2
609 cmp \$0x40,$len
610 jb .Lecb_enc_three
611 movups 0x30($inp),$inout3
612 je .Lecb_enc_four
613 movups 0x40($inp),$inout4
614 cmp \$0x60,$len
615 jb .Lecb_enc_five
616 movups 0x50($inp),$inout5
617 je .Lecb_enc_six
618 movdqu 0x60($inp),$inout6
619 call _aesni_encrypt8
620 movups $inout0,($out)
621 movups $inout1,0x10($out)
622 movups $inout2,0x20($out)
623 movups $inout3,0x30($out)
624 movups $inout4,0x40($out)
625 movups $inout5,0x50($out)
626 movups $inout6,0x60($out)
627 jmp .Lecb_ret
628.align 16
629.Lecb_enc_one:
630___
631 &aesni_generate1("enc",$key,$rounds);
632$code.=<<___;
633 movups $inout0,($out)
634 jmp .Lecb_ret
635.align 16
636.Lecb_enc_two:
637 xorps $inout2,$inout2
638 call _aesni_encrypt3
639 movups $inout0,($out)
640 movups $inout1,0x10($out)
641 jmp .Lecb_ret
642.align 16
643.Lecb_enc_three:
644 call _aesni_encrypt3
645 movups $inout0,($out)
646 movups $inout1,0x10($out)
647 movups $inout2,0x20($out)
648 jmp .Lecb_ret
649.align 16
650.Lecb_enc_four:
651 call _aesni_encrypt4
652 movups $inout0,($out)
653 movups $inout1,0x10($out)
654 movups $inout2,0x20($out)
655 movups $inout3,0x30($out)
656 jmp .Lecb_ret
657.align 16
658.Lecb_enc_five:
659 xorps $inout5,$inout5
660 call _aesni_encrypt6
661 movups $inout0,($out)
662 movups $inout1,0x10($out)
663 movups $inout2,0x20($out)
664 movups $inout3,0x30($out)
665 movups $inout4,0x40($out)
666 jmp .Lecb_ret
667.align 16
668.Lecb_enc_six:
669 call _aesni_encrypt6
670 movups $inout0,($out)
671 movups $inout1,0x10($out)
672 movups $inout2,0x20($out)
673 movups $inout3,0x30($out)
674 movups $inout4,0x40($out)
675 movups $inout5,0x50($out)
676 jmp .Lecb_ret
677 #--------------------------- ECB DECRYPT ------------------------------#
678.align 16
679.Lecb_decrypt:
680 cmp \$0x80,$len
681 jb .Lecb_dec_tail
682
683 movdqu ($inp),$inout0
684 movdqu 0x10($inp),$inout1
685 movdqu 0x20($inp),$inout2
686 movdqu 0x30($inp),$inout3
687 movdqu 0x40($inp),$inout4
688 movdqu 0x50($inp),$inout5
689 movdqu 0x60($inp),$inout6
690 movdqu 0x70($inp),$inout7
691 lea 0x80($inp),$inp
692 sub \$0x80,$len
693 jmp .Lecb_dec_loop8_enter
694.align 16
695.Lecb_dec_loop8:
696 movups $inout0,($out)
697 mov $key_,$key # restore $key
698 movdqu ($inp),$inout0
699 mov $rnds_,$rounds # restore $rounds
700 movups $inout1,0x10($out)
701 movdqu 0x10($inp),$inout1
702 movups $inout2,0x20($out)
703 movdqu 0x20($inp),$inout2
704 movups $inout3,0x30($out)
705 movdqu 0x30($inp),$inout3
706 movups $inout4,0x40($out)
707 movdqu 0x40($inp),$inout4
708 movups $inout5,0x50($out)
709 movdqu 0x50($inp),$inout5
710 movups $inout6,0x60($out)
711 movdqu 0x60($inp),$inout6
712 movups $inout7,0x70($out)
713 lea 0x80($out),$out
714 movdqu 0x70($inp),$inout7
715 lea 0x80($inp),$inp
716.Lecb_dec_loop8_enter:
717
718 call _aesni_decrypt8
719
720 $movkey ($key_),$rndkey0
721 sub \$0x80,$len
722 jnc .Lecb_dec_loop8
723
724 movups $inout0,($out)
725 mov $key_,$key # restore $key
726 movups $inout1,0x10($out)
727 mov $rnds_,$rounds # restore $rounds
728 movups $inout2,0x20($out)
729 movups $inout3,0x30($out)
730 movups $inout4,0x40($out)
731 movups $inout5,0x50($out)
732 movups $inout6,0x60($out)
733 movups $inout7,0x70($out)
734 lea 0x80($out),$out
735 add \$0x80,$len
736 jz .Lecb_ret
737
738.Lecb_dec_tail:
739 movups ($inp),$inout0
740 cmp \$0x20,$len
741 jb .Lecb_dec_one
742 movups 0x10($inp),$inout1
743 je .Lecb_dec_two
744 movups 0x20($inp),$inout2
745 cmp \$0x40,$len
746 jb .Lecb_dec_three
747 movups 0x30($inp),$inout3
748 je .Lecb_dec_four
749 movups 0x40($inp),$inout4
750 cmp \$0x60,$len
751 jb .Lecb_dec_five
752 movups 0x50($inp),$inout5
753 je .Lecb_dec_six
754 movups 0x60($inp),$inout6
755 $movkey ($key),$rndkey0
756 call _aesni_decrypt8
757 movups $inout0,($out)
758 movups $inout1,0x10($out)
759 movups $inout2,0x20($out)
760 movups $inout3,0x30($out)
761 movups $inout4,0x40($out)
762 movups $inout5,0x50($out)
763 movups $inout6,0x60($out)
764 jmp .Lecb_ret
765.align 16
766.Lecb_dec_one:
767___
768 &aesni_generate1("dec",$key,$rounds);
769$code.=<<___;
770 movups $inout0,($out)
771 jmp .Lecb_ret
772.align 16
773.Lecb_dec_two:
774 xorps $inout2,$inout2
775 call _aesni_decrypt3
776 movups $inout0,($out)
777 movups $inout1,0x10($out)
778 jmp .Lecb_ret
779.align 16
780.Lecb_dec_three:
781 call _aesni_decrypt3
782 movups $inout0,($out)
783 movups $inout1,0x10($out)
784 movups $inout2,0x20($out)
785 jmp .Lecb_ret
786.align 16
787.Lecb_dec_four:
788 call _aesni_decrypt4
789 movups $inout0,($out)
790 movups $inout1,0x10($out)
791 movups $inout2,0x20($out)
792 movups $inout3,0x30($out)
793 jmp .Lecb_ret
794.align 16
795.Lecb_dec_five:
796 xorps $inout5,$inout5
797 call _aesni_decrypt6
798 movups $inout0,($out)
799 movups $inout1,0x10($out)
800 movups $inout2,0x20($out)
801 movups $inout3,0x30($out)
802 movups $inout4,0x40($out)
803 jmp .Lecb_ret
804.align 16
805.Lecb_dec_six:
806 call _aesni_decrypt6
807 movups $inout0,($out)
808 movups $inout1,0x10($out)
809 movups $inout2,0x20($out)
810 movups $inout3,0x30($out)
811 movups $inout4,0x40($out)
812 movups $inout5,0x50($out)
813
814.Lecb_ret:
815 ret
816.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
817___
818
819{
820######################################################################
821# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
822# size_t blocks, const AES_KEY *key,
823# const char *ivec,char *cmac);
824#
825	# Handles only complete blocks, operates on a 64-bit counter and
826	# does not update *ivec! Nor does it finalize the CMAC value
827	# (see engine/eng_aesni.c for details)
828#
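#
# A hedged caller-side sketch in C ("in", "out", "blocks", "key", "ctr"
# and "cmac" are illustrative names, not part of this file): the caller
# formats the counter block and finalizes the tag itself, since this
# routine leaves *ivec untouched and the CMAC unfinalized.
#
#	unsigned char ctr[16], cmac[16];
#	/* format ctr (flags || nonce || 64-bit counter) and CBC-MAC the
#	   CCM header into cmac per NIST SP 800-38C beforehand */
#	aesni_ccm64_encrypt_blocks(in, out, blocks, &key,
#	    (const char *)ctr, (char *)cmac);
#	/* afterwards: encrypt cmac to obtain the final tag */
#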
829{
830my $cmac="%r9"; # 6th argument
831
832my $increment="%xmm6";
833my $bswap_mask="%xmm7";
834
835$code.=<<___;
836.globl aesni_ccm64_encrypt_blocks
837.type aesni_ccm64_encrypt_blocks,\@function,6
838.align 16
839aesni_ccm64_encrypt_blocks:
840 _CET_ENDBR
841___
842$code.=<<___ if ($win64);
843 lea -0x58(%rsp),%rsp
844 movaps %xmm6,(%rsp)
845 movaps %xmm7,0x10(%rsp)
846 movaps %xmm8,0x20(%rsp)
847 movaps %xmm9,0x30(%rsp)
848.Lccm64_enc_body:
849___
850$code.=<<___;
851 mov 240($key),$rounds # key->rounds
852 movdqu ($ivp),$iv
853 movdqa .Lincrement64(%rip),$increment
854 movdqa .Lbswap_mask(%rip),$bswap_mask
855
856 shr \$1,$rounds
857 lea 0($key),$key_
858 movdqu ($cmac),$inout1
859 movdqa $iv,$inout0
860 mov $rounds,$rnds_
861 pshufb $bswap_mask,$iv
862 jmp .Lccm64_enc_outer
863.align 16
864.Lccm64_enc_outer:
865 $movkey ($key_),$rndkey0
866 mov $rnds_,$rounds
867 movups ($inp),$in0 # load inp
868
869 xorps $rndkey0,$inout0 # counter
870 $movkey 16($key_),$rndkey1
871 xorps $in0,$rndkey0
872 lea 32($key_),$key
873 xorps $rndkey0,$inout1 # cmac^=inp
874 $movkey ($key),$rndkey0
875
876.Lccm64_enc2_loop:
877 aesenc $rndkey1,$inout0
878 dec $rounds
879 aesenc $rndkey1,$inout1
880 $movkey 16($key),$rndkey1
881 aesenc $rndkey0,$inout0
882 lea 32($key),$key
883 aesenc $rndkey0,$inout1
884 $movkey 0($key),$rndkey0
885 jnz .Lccm64_enc2_loop
886 aesenc $rndkey1,$inout0
887 aesenc $rndkey1,$inout1
888 paddq $increment,$iv
889 aesenclast $rndkey0,$inout0
890 aesenclast $rndkey0,$inout1
891
892 dec $len
893 lea 16($inp),$inp
894 xorps $inout0,$in0 # inp ^= E(iv)
895 movdqa $iv,$inout0
896 movups $in0,($out) # save output
897 lea 16($out),$out
898 pshufb $bswap_mask,$inout0
899 jnz .Lccm64_enc_outer
900
901 movups $inout1,($cmac)
902___
903$code.=<<___ if ($win64);
904 movaps (%rsp),%xmm6
905 movaps 0x10(%rsp),%xmm7
906 movaps 0x20(%rsp),%xmm8
907 movaps 0x30(%rsp),%xmm9
908 lea 0x58(%rsp),%rsp
909.Lccm64_enc_ret:
910___
911$code.=<<___;
912 ret
913.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
914___
915######################################################################
916$code.=<<___;
917.globl aesni_ccm64_decrypt_blocks
918.type aesni_ccm64_decrypt_blocks,\@function,6
919.align 16
920aesni_ccm64_decrypt_blocks:
921 _CET_ENDBR
922___
923$code.=<<___ if ($win64);
924 lea -0x58(%rsp),%rsp
925 movaps %xmm6,(%rsp)
926 movaps %xmm7,0x10(%rsp)
927 movaps %xmm8,0x20(%rsp)
928 movaps %xmm9,0x30(%rsp)
929.Lccm64_dec_body:
930___
931$code.=<<___;
932 mov 240($key),$rounds # key->rounds
933 movups ($ivp),$iv
934 movdqu ($cmac),$inout1
935 movdqa .Lincrement64(%rip),$increment
936 movdqa .Lbswap_mask(%rip),$bswap_mask
937
938 movaps $iv,$inout0
939 mov $rounds,$rnds_
940 mov $key,$key_
941 pshufb $bswap_mask,$iv
942___
943 &aesni_generate1("enc",$key,$rounds);
944$code.=<<___;
945 movups ($inp),$in0 # load inp
946 paddq $increment,$iv
947 lea 16($inp),$inp
948 jmp .Lccm64_dec_outer
949.align 16
950.Lccm64_dec_outer:
951 xorps $inout0,$in0 # inp ^= E(iv)
952 movdqa $iv,$inout0
953 mov $rnds_,$rounds
954 movups $in0,($out) # save output
955 lea 16($out),$out
956 pshufb $bswap_mask,$inout0
957
958 sub \$1,$len
959 jz .Lccm64_dec_break
960
961 $movkey ($key_),$rndkey0
962 shr \$1,$rounds
963 $movkey 16($key_),$rndkey1
964 xorps $rndkey0,$in0
965 lea 32($key_),$key
966 xorps $rndkey0,$inout0
967 xorps $in0,$inout1 # cmac^=out
968 $movkey ($key),$rndkey0
969
970.Lccm64_dec2_loop:
971 aesenc $rndkey1,$inout0
972 dec $rounds
973 aesenc $rndkey1,$inout1
974 $movkey 16($key),$rndkey1
975 aesenc $rndkey0,$inout0
976 lea 32($key),$key
977 aesenc $rndkey0,$inout1
978 $movkey 0($key),$rndkey0
979 jnz .Lccm64_dec2_loop
980 movups ($inp),$in0 # load inp
981 paddq $increment,$iv
982 aesenc $rndkey1,$inout0
983 aesenc $rndkey1,$inout1
984 lea 16($inp),$inp
985 aesenclast $rndkey0,$inout0
986 aesenclast $rndkey0,$inout1
987 jmp .Lccm64_dec_outer
988
989.align 16
990.Lccm64_dec_break:
991 #xorps $in0,$inout1 # cmac^=out
992___
993 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
994$code.=<<___;
995 movups $inout1,($cmac)
996___
997$code.=<<___ if ($win64);
998 movaps (%rsp),%xmm6
999 movaps 0x10(%rsp),%xmm7
1000 movaps 0x20(%rsp),%xmm8
1001 movaps 0x30(%rsp),%xmm9
1002 lea 0x58(%rsp),%rsp
1003.Lccm64_dec_ret:
1004___
1005$code.=<<___;
1006 ret
1007.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1008___
1009}
1010######################################################################
1011# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1012# size_t blocks, const AES_KEY *key,
1013# const char *ivec);
1014#
1015	# Handles only complete blocks, operates on a 32-bit counter and
1016	# does not update *ivec! (see engine/eng_aesni.c for details)
1017#
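#
# A hedged caller-side sketch in C ("blocks" and "ivec" are illustrative
# names): the counter lives big-endian in ivec[12..15] (the code below
# extracts dword 3 and byte-swaps it), and because *ivec is not written
# back, the caller must advance the counter between calls.
#
#	unsigned char ivec[16];	/* nonce || 32-bit big-endian counter */
#	aesni_ctr32_encrypt_blocks(in, out, blocks, &key,
#	    (const char *)ivec);
#	/* caller: add "blocks" to the counter in ivec[12..15] */
#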
1018{
1019my $frame_size = 0x20+($win64?160:0);
1020my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
1021my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
1022my $bswap_mask="%xmm15";
1023
1024$code.=<<___;
1025.globl aesni_ctr32_encrypt_blocks
1026.type aesni_ctr32_encrypt_blocks,\@function,5
1027.align 16
1028aesni_ctr32_encrypt_blocks:
1029 _CET_ENDBR
1030 lea (%rsp),%rax
1031 push %rbp
1032 sub \$$frame_size,%rsp
1033___
1034$code.=<<___ if ($win64);
1035 movaps %xmm6,0x20(%rsp)
1036 movaps %xmm7,0x30(%rsp)
1037 movaps %xmm8,0x40(%rsp)
1038 movaps %xmm9,0x50(%rsp)
1039 movaps %xmm10,0x60(%rsp)
1040 movaps %xmm11,0x70(%rsp)
1041 movaps %xmm12,0x80(%rsp)
1042 movaps %xmm13,0x90(%rsp)
1043 movaps %xmm14,0xa0(%rsp)
1044 movaps %xmm15,0xb0(%rsp)
1045.Lctr32_body:
1046___
1047$code.=<<___;
1048 lea -8(%rax),%rbp
1049 cmp \$1,$len
1050 je .Lctr32_one_shortcut
1051
1052 movdqu ($ivp),$ivec
1053 movdqa .Lbswap_mask(%rip),$bswap_mask
1054 xor $rounds,$rounds
1055 pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
1056 pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
1057
1058 mov 240($key),$rounds # key->rounds
1059 bswap $rnds_
1060 pxor $iv0,$iv0 # vector of 3 32-bit counters
1061 pxor $iv1,$iv1 # vector of 3 32-bit counters
1062 pinsrd \$0,$rnds_,$iv0
1063 lea 3($rnds_),$key_
1064 pinsrd \$0,$key_,$iv1
1065 inc $rnds_
1066 pinsrd \$1,$rnds_,$iv0
1067 inc $key_
1068 pinsrd \$1,$key_,$iv1
1069 inc $rnds_
1070 pinsrd \$2,$rnds_,$iv0
1071 inc $key_
1072 pinsrd \$2,$key_,$iv1
1073 movdqa $iv0,0x00(%rsp)
1074 pshufb $bswap_mask,$iv0
1075 movdqa $iv1,0x10(%rsp)
1076 pshufb $bswap_mask,$iv1
1077
1078 pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
1079 pshufd \$`2<<6`,$iv0,$inout1
1080 pshufd \$`1<<6`,$iv0,$inout2
1081 cmp \$6,$len
1082 jb .Lctr32_tail
1083 shr \$1,$rounds
1084 mov $key,$key_ # backup $key
1085 mov $rounds,$rnds_ # backup $rounds
1086 sub \$6,$len
1087 jmp .Lctr32_loop6
1088
1089.align 16
1090.Lctr32_loop6:
1091 pshufd \$`3<<6`,$iv1,$inout3
1092 por $ivec,$inout0 # merge counter-less ivec
1093 $movkey ($key_),$rndkey0
1094 pshufd \$`2<<6`,$iv1,$inout4
1095 por $ivec,$inout1
1096 $movkey 16($key_),$rndkey1
1097 pshufd \$`1<<6`,$iv1,$inout5
1098 por $ivec,$inout2
1099 por $ivec,$inout3
1100 xorps $rndkey0,$inout0
1101 por $ivec,$inout4
1102 por $ivec,$inout5
1103
1104 # inline _aesni_encrypt6 and interleave last rounds
1105 # with own code...
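#	(interleaving hides aesenc latency: the counter-vector updates
#	and loads execute in the pipeline gaps between dependent AES
#	rounds instead of stalling before and after a plain call)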
1106
1107 pxor $rndkey0,$inout1
1108 aesenc $rndkey1,$inout0
1109 lea 32($key_),$key
1110 pxor $rndkey0,$inout2
1111 aesenc $rndkey1,$inout1
1112 movdqa .Lincrement32(%rip),$iv1
1113 pxor $rndkey0,$inout3
1114 aesenc $rndkey1,$inout2
1115 movdqa (%rsp),$iv0
1116 pxor $rndkey0,$inout4
1117 aesenc $rndkey1,$inout3
1118 pxor $rndkey0,$inout5
1119 $movkey ($key),$rndkey0
1120 dec $rounds
1121 aesenc $rndkey1,$inout4
1122 aesenc $rndkey1,$inout5
1123 jmp .Lctr32_enc_loop6_enter
1124.align 16
1125.Lctr32_enc_loop6:
1126 aesenc $rndkey1,$inout0
1127 aesenc $rndkey1,$inout1
1128 dec $rounds
1129 aesenc $rndkey1,$inout2
1130 aesenc $rndkey1,$inout3
1131 aesenc $rndkey1,$inout4
1132 aesenc $rndkey1,$inout5
1133.Lctr32_enc_loop6_enter:
1134 $movkey 16($key),$rndkey1
1135 aesenc $rndkey0,$inout0
1136 aesenc $rndkey0,$inout1
1137 lea 32($key),$key
1138 aesenc $rndkey0,$inout2
1139 aesenc $rndkey0,$inout3
1140 aesenc $rndkey0,$inout4
1141 aesenc $rndkey0,$inout5
1142 $movkey ($key),$rndkey0
1143 jnz .Lctr32_enc_loop6
1144
1145 aesenc $rndkey1,$inout0
1146 paddd $iv1,$iv0 # increment counter vector
1147 aesenc $rndkey1,$inout1
1148 paddd 0x10(%rsp),$iv1
1149 aesenc $rndkey1,$inout2
1150 movdqa $iv0,0x00(%rsp) # save counter vector
1151 aesenc $rndkey1,$inout3
1152 movdqa $iv1,0x10(%rsp)
1153 aesenc $rndkey1,$inout4
1154 pshufb $bswap_mask,$iv0 # byte swap
1155 aesenc $rndkey1,$inout5
1156 pshufb $bswap_mask,$iv1
1157
1158 aesenclast $rndkey0,$inout0
1159 movups ($inp),$in0 # load input
1160 aesenclast $rndkey0,$inout1
1161 movups 0x10($inp),$in1
1162 aesenclast $rndkey0,$inout2
1163 movups 0x20($inp),$in2
1164 aesenclast $rndkey0,$inout3
1165 movups 0x30($inp),$in3
1166 aesenclast $rndkey0,$inout4
1167 movups 0x40($inp),$rndkey1
1168 aesenclast $rndkey0,$inout5
1169 movups 0x50($inp),$rndkey0
1170 lea 0x60($inp),$inp
1171
1172 xorps $inout0,$in0 # xor
1173 pshufd \$`3<<6`,$iv0,$inout0
1174 xorps $inout1,$in1
1175 pshufd \$`2<<6`,$iv0,$inout1
1176 movups $in0,($out) # store output
1177 xorps $inout2,$in2
1178 pshufd \$`1<<6`,$iv0,$inout2
1179 movups $in1,0x10($out)
1180 xorps $inout3,$in3
1181 movups $in2,0x20($out)
1182 xorps $inout4,$rndkey1
1183 movups $in3,0x30($out)
1184 xorps $inout5,$rndkey0
1185 movups $rndkey1,0x40($out)
1186 movups $rndkey0,0x50($out)
1187 lea 0x60($out),$out
1188 mov $rnds_,$rounds
1189 sub \$6,$len
1190 jnc .Lctr32_loop6
1191
1192 add \$6,$len
1193 jz .Lctr32_done
1194 mov $key_,$key # restore $key
1195 lea 1($rounds,$rounds),$rounds # restore original value
1196
1197.Lctr32_tail:
1198 por $ivec,$inout0
1199 movups ($inp),$in0
1200 cmp \$2,$len
1201 jb .Lctr32_one
1202
1203 por $ivec,$inout1
1204 movups 0x10($inp),$in1
1205 je .Lctr32_two
1206
1207 pshufd \$`3<<6`,$iv1,$inout3
1208 por $ivec,$inout2
1209 movups 0x20($inp),$in2
1210 cmp \$4,$len
1211 jb .Lctr32_three
1212
1213 pshufd \$`2<<6`,$iv1,$inout4
1214 por $ivec,$inout3
1215 movups 0x30($inp),$in3
1216 je .Lctr32_four
1217
1218 por $ivec,$inout4
1219 xorps $inout5,$inout5
1220
1221 call _aesni_encrypt6
1222
1223 movups 0x40($inp),$rndkey1
1224 xorps $inout0,$in0
1225 xorps $inout1,$in1
1226 movups $in0,($out)
1227 xorps $inout2,$in2
1228 movups $in1,0x10($out)
1229 xorps $inout3,$in3
1230 movups $in2,0x20($out)
1231 xorps $inout4,$rndkey1
1232 movups $in3,0x30($out)
1233 movups $rndkey1,0x40($out)
1234 jmp .Lctr32_done
1235
1236.align 16
1237.Lctr32_one_shortcut:
1238 movups ($ivp),$inout0
1239 movups ($inp),$in0
1240 mov 240($key),$rounds # key->rounds
1241.Lctr32_one:
1242___
1243 &aesni_generate1("enc",$key,$rounds);
1244$code.=<<___;
1245 xorps $inout0,$in0
1246 movups $in0,($out)
1247 jmp .Lctr32_done
1248
1249.align 16
1250.Lctr32_two:
1251 xorps $inout2,$inout2
1252 call _aesni_encrypt3
1253 xorps $inout0,$in0
1254 xorps $inout1,$in1
1255 movups $in0,($out)
1256 movups $in1,0x10($out)
1257 jmp .Lctr32_done
1258
1259.align 16
1260.Lctr32_three:
1261 call _aesni_encrypt3
1262 xorps $inout0,$in0
1263 xorps $inout1,$in1
1264 movups $in0,($out)
1265 xorps $inout2,$in2
1266 movups $in1,0x10($out)
1267 movups $in2,0x20($out)
1268 jmp .Lctr32_done
1269
1270.align 16
1271.Lctr32_four:
1272 call _aesni_encrypt4
1273 xorps $inout0,$in0
1274 xorps $inout1,$in1
1275 movups $in0,($out)
1276 xorps $inout2,$in2
1277 movups $in1,0x10($out)
1278 xorps $inout3,$in3
1279 movups $in2,0x20($out)
1280 movups $in3,0x30($out)
1281
1282.Lctr32_done:
1283___
1284$code.=<<___ if ($win64);
1285 movaps 0x20(%rsp),%xmm6
1286 movaps 0x30(%rsp),%xmm7
1287 movaps 0x40(%rsp),%xmm8
1288 movaps 0x50(%rsp),%xmm9
1289 movaps 0x60(%rsp),%xmm10
1290 movaps 0x70(%rsp),%xmm11
1291 movaps 0x80(%rsp),%xmm12
1292 movaps 0x90(%rsp),%xmm13
1293 movaps 0xa0(%rsp),%xmm14
1294 movaps 0xb0(%rsp),%xmm15
1295___
1296$code.=<<___;
1297 lea (%rbp),%rsp
1298 pop %rbp
1299.Lctr32_ret:
1300 ret
1301.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1302___
1303}
1304
1305######################################################################
1306	# void aesni_xts_[en|de]crypt(const char *inp, char *out, size_t len,
1307	#			const AES_KEY *key1, const AES_KEY *key2,
1308# const unsigned char iv[16]);
1309#
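#
# A hedged caller-side sketch in C (assuming $PREFIX is "aesni"; the key
# buffer names are illustrative).  Note that key2 must be an *encryption*
# schedule even for aesni_xts_decrypt, since both paths below generate
# the tweak with an "enc" operation on iv under key2; a trailing partial
# block is handled by ciphertext stealing.
#
#	AES_KEY k1, k2;
#	aesni_set_encrypt_key(data_key, 128, &k1);
#	aesni_set_encrypt_key(tweak_key, 128, &k2);
#	aesni_xts_encrypt(in, out, len, &k1, &k2, iv);
#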
1310{
1311my @tweak=map("%xmm$_",(10..15));
1312my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1313my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1314my $frame_size = 0x60 + ($win64?160:0);
1315
1316$code.=<<___;
1317.globl aesni_xts_encrypt
1318.type aesni_xts_encrypt,\@function,6
1319.align 16
1320aesni_xts_encrypt:
1321 _CET_ENDBR
1322 lea (%rsp),%rax
1323 push %rbp
1324 sub \$$frame_size,%rsp
1325___
1326$code.=<<___ if ($win64);
1327 movaps %xmm6,0x60(%rsp)
1328 movaps %xmm7,0x70(%rsp)
1329 movaps %xmm8,0x80(%rsp)
1330 movaps %xmm9,0x90(%rsp)
1331 movaps %xmm10,0xa0(%rsp)
1332 movaps %xmm11,0xb0(%rsp)
1333 movaps %xmm12,0xc0(%rsp)
1334 movaps %xmm13,0xd0(%rsp)
1335 movaps %xmm14,0xe0(%rsp)
1336 movaps %xmm15,0xf0(%rsp)
1337.Lxts_enc_body:
1338___
1339$code.=<<___;
1340 lea -8(%rax),%rbp
1341 movups ($ivp),@tweak[5] # load clear-text tweak
1342 mov 240(%r8),$rounds # key2->rounds
1343 mov 240($key),$rnds_ # key1->rounds
1344___
1345 # generate the tweak
1346 &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
1347$code.=<<___;
1348 mov $key,$key_ # backup $key
1349 mov $rnds_,$rounds # backup $rounds
1350 mov $len,$len_ # backup $len
1351 and \$-16,$len
1352
1353 movdqa .Lxts_magic(%rip),$twmask
1354 pxor $twtmp,$twtmp
1355 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1356___
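# Each iteration below doubles the tweak in GF(2^128): pcmpgtd against
# zero captures the sign bit of each 64-bit half as a dword mask, pshufd
# with immediate 0x13 routes those masks to where they carry into, pand
# with .Lxts_magic keeps the reduction constant 0x87 (for bit 127) and
# the inter-half carry bit (for bit 63), paddq shifts both halves left
# by one, and pxor folds the carries back in.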
1357 for ($i=0;$i<4;$i++) {
1358 $code.=<<___;
1359 pshufd \$0x13,$twtmp,$twres
1360 pxor $twtmp,$twtmp
1361 movdqa @tweak[5],@tweak[$i]
1362 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1363 pand $twmask,$twres # isolate carry and residue
1364 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1365 pxor $twres,@tweak[5]
1366___
1367 }
1368$code.=<<___;
1369 sub \$16*6,$len
1370 jc .Lxts_enc_short
1371
1372 shr \$1,$rounds
1373 sub \$1,$rounds
1374 mov $rounds,$rnds_
1375 jmp .Lxts_enc_grandloop
1376
1377.align 16
1378.Lxts_enc_grandloop:
1379 pshufd \$0x13,$twtmp,$twres
1380 movdqa @tweak[5],@tweak[4]
1381 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1382 movdqu `16*0`($inp),$inout0 # load input
1383 pand $twmask,$twres # isolate carry and residue
1384 movdqu `16*1`($inp),$inout1
1385 pxor $twres,@tweak[5]
1386
1387 movdqu `16*2`($inp),$inout2
1388 pxor @tweak[0],$inout0 # input^=tweak
1389 movdqu `16*3`($inp),$inout3
1390 pxor @tweak[1],$inout1
1391 movdqu `16*4`($inp),$inout4
1392 pxor @tweak[2],$inout2
1393 movdqu `16*5`($inp),$inout5
1394 lea `16*6`($inp),$inp
1395 pxor @tweak[3],$inout3
1396 $movkey ($key_),$rndkey0
1397 pxor @tweak[4],$inout4
1398 pxor @tweak[5],$inout5
1399
1400 # inline _aesni_encrypt6 and interleave first and last rounds
1401 # with own code...
1402 $movkey 16($key_),$rndkey1
1403 pxor $rndkey0,$inout0
1404 pxor $rndkey0,$inout1
1405 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
1406 aesenc $rndkey1,$inout0
1407 lea 32($key_),$key
1408 pxor $rndkey0,$inout2
1409 movdqa @tweak[1],`16*1`(%rsp)
1410 aesenc $rndkey1,$inout1
1411 pxor $rndkey0,$inout3
1412 movdqa @tweak[2],`16*2`(%rsp)
1413 aesenc $rndkey1,$inout2
1414 pxor $rndkey0,$inout4
1415 movdqa @tweak[3],`16*3`(%rsp)
1416 aesenc $rndkey1,$inout3
1417 pxor $rndkey0,$inout5
1418 $movkey ($key),$rndkey0
1419 dec $rounds
1420 movdqa @tweak[4],`16*4`(%rsp)
1421 aesenc $rndkey1,$inout4
1422 movdqa @tweak[5],`16*5`(%rsp)
1423 aesenc $rndkey1,$inout5
1424 pxor $twtmp,$twtmp
1425 pcmpgtd @tweak[5],$twtmp
1426 jmp .Lxts_enc_loop6_enter
1427
1428.align 16
1429.Lxts_enc_loop6:
1430 aesenc $rndkey1,$inout0
1431 aesenc $rndkey1,$inout1
1432 dec $rounds
1433 aesenc $rndkey1,$inout2
1434 aesenc $rndkey1,$inout3
1435 aesenc $rndkey1,$inout4
1436 aesenc $rndkey1,$inout5
1437.Lxts_enc_loop6_enter:
1438 $movkey 16($key),$rndkey1
1439 aesenc $rndkey0,$inout0
1440 aesenc $rndkey0,$inout1
1441 lea 32($key),$key
1442 aesenc $rndkey0,$inout2
1443 aesenc $rndkey0,$inout3
1444 aesenc $rndkey0,$inout4
1445 aesenc $rndkey0,$inout5
1446 $movkey ($key),$rndkey0
1447 jnz .Lxts_enc_loop6
1448
1449 pshufd \$0x13,$twtmp,$twres
1450 pxor $twtmp,$twtmp
1451 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1452 aesenc $rndkey1,$inout0
1453 pand $twmask,$twres # isolate carry and residue
1454 aesenc $rndkey1,$inout1
1455 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1456 aesenc $rndkey1,$inout2
1457 pxor $twres,@tweak[5]
1458 aesenc $rndkey1,$inout3
1459 aesenc $rndkey1,$inout4
1460 aesenc $rndkey1,$inout5
1461 $movkey 16($key),$rndkey1
1462
1463 pshufd \$0x13,$twtmp,$twres
1464 pxor $twtmp,$twtmp
1465 movdqa @tweak[5],@tweak[0]
1466 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1467 aesenc $rndkey0,$inout0
1468 pand $twmask,$twres # isolate carry and residue
1469 aesenc $rndkey0,$inout1
1470 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1471 aesenc $rndkey0,$inout2
1472 pxor $twres,@tweak[5]
1473 aesenc $rndkey0,$inout3
1474 aesenc $rndkey0,$inout4
1475 aesenc $rndkey0,$inout5
1476 $movkey 32($key),$rndkey0
1477
1478 pshufd \$0x13,$twtmp,$twres
1479 pxor $twtmp,$twtmp
1480 movdqa @tweak[5],@tweak[1]
1481 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1482 aesenc $rndkey1,$inout0
1483 pand $twmask,$twres # isolate carry and residue
1484 aesenc $rndkey1,$inout1
1485 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1486 aesenc $rndkey1,$inout2
1487 pxor $twres,@tweak[5]
1488 aesenc $rndkey1,$inout3
1489 aesenc $rndkey1,$inout4
1490 aesenc $rndkey1,$inout5
1491
1492 pshufd \$0x13,$twtmp,$twres
1493 pxor $twtmp,$twtmp
1494 movdqa @tweak[5],@tweak[2]
1495 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1496 aesenclast $rndkey0,$inout0
1497 pand $twmask,$twres # isolate carry and residue
1498 aesenclast $rndkey0,$inout1
1499 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1500 aesenclast $rndkey0,$inout2
1501 pxor $twres,@tweak[5]
1502 aesenclast $rndkey0,$inout3
1503 aesenclast $rndkey0,$inout4
1504 aesenclast $rndkey0,$inout5
1505
1506 pshufd \$0x13,$twtmp,$twres
1507 pxor $twtmp,$twtmp
1508 movdqa @tweak[5],@tweak[3]
1509 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1510 xorps `16*0`(%rsp),$inout0 # output^=tweak
1511 pand $twmask,$twres # isolate carry and residue
1512 xorps `16*1`(%rsp),$inout1
1513 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1514 pxor $twres,@tweak[5]
1515
1516 xorps `16*2`(%rsp),$inout2
1517 movups $inout0,`16*0`($out) # write output
1518 xorps `16*3`(%rsp),$inout3
1519 movups $inout1,`16*1`($out)
1520 xorps `16*4`(%rsp),$inout4
1521 movups $inout2,`16*2`($out)
1522 xorps `16*5`(%rsp),$inout5
1523 movups $inout3,`16*3`($out)
1524 mov $rnds_,$rounds # restore $rounds
1525 movups $inout4,`16*4`($out)
1526 movups $inout5,`16*5`($out)
1527 lea `16*6`($out),$out
1528 sub \$16*6,$len
1529 jnc .Lxts_enc_grandloop
1530
1531 lea 3($rounds,$rounds),$rounds # restore original value
1532 mov $key_,$key # restore $key
1533 mov $rounds,$rnds_ # backup $rounds
1534
1535.Lxts_enc_short:
1536 add \$16*6,$len
1537 jz .Lxts_enc_done
1538
1539 cmp \$0x20,$len
1540 jb .Lxts_enc_one
1541 je .Lxts_enc_two
1542
1543 cmp \$0x40,$len
1544 jb .Lxts_enc_three
1545 je .Lxts_enc_four
1546
1547 pshufd \$0x13,$twtmp,$twres
1548 movdqa @tweak[5],@tweak[4]
1549 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1550 movdqu ($inp),$inout0
1551 pand $twmask,$twres # isolate carry and residue
1552 movdqu 16*1($inp),$inout1
1553 pxor $twres,@tweak[5]
1554
1555 movdqu 16*2($inp),$inout2
1556 pxor @tweak[0],$inout0
1557 movdqu 16*3($inp),$inout3
1558 pxor @tweak[1],$inout1
1559 movdqu 16*4($inp),$inout4
1560 lea 16*5($inp),$inp
1561 pxor @tweak[2],$inout2
1562 pxor @tweak[3],$inout3
1563 pxor @tweak[4],$inout4
1564
1565 call _aesni_encrypt6
1566
1567 xorps @tweak[0],$inout0
1568 movdqa @tweak[5],@tweak[0]
1569 xorps @tweak[1],$inout1
1570 xorps @tweak[2],$inout2
1571 movdqu $inout0,($out)
1572 xorps @tweak[3],$inout3
1573 movdqu $inout1,16*1($out)
1574 xorps @tweak[4],$inout4
1575 movdqu $inout2,16*2($out)
1576 movdqu $inout3,16*3($out)
1577 movdqu $inout4,16*4($out)
1578 lea 16*5($out),$out
1579 jmp .Lxts_enc_done
1580
1581.align 16
1582.Lxts_enc_one:
1583 movups ($inp),$inout0
1584 lea 16*1($inp),$inp
1585 xorps @tweak[0],$inout0
1586___
1587 &aesni_generate1("enc",$key,$rounds);
1588$code.=<<___;
1589 xorps @tweak[0],$inout0
1590 movdqa @tweak[1],@tweak[0]
1591 movups $inout0,($out)
1592 lea 16*1($out),$out
1593 jmp .Lxts_enc_done
1594
1595.align 16
1596.Lxts_enc_two:
1597 movups ($inp),$inout0
1598 movups 16($inp),$inout1
1599 lea 32($inp),$inp
1600 xorps @tweak[0],$inout0
1601 xorps @tweak[1],$inout1
1602
1603 call _aesni_encrypt3
1604
1605 xorps @tweak[0],$inout0
1606 movdqa @tweak[2],@tweak[0]
1607 xorps @tweak[1],$inout1
1608 movups $inout0,($out)
1609 movups $inout1,16*1($out)
1610 lea 16*2($out),$out
1611 jmp .Lxts_enc_done
1612
1613.align 16
1614.Lxts_enc_three:
1615 movups ($inp),$inout0
1616 movups 16*1($inp),$inout1
1617 movups 16*2($inp),$inout2
1618 lea 16*3($inp),$inp
1619 xorps @tweak[0],$inout0
1620 xorps @tweak[1],$inout1
1621 xorps @tweak[2],$inout2
1622
1623 call _aesni_encrypt3
1624
1625 xorps @tweak[0],$inout0
1626 movdqa @tweak[3],@tweak[0]
1627 xorps @tweak[1],$inout1
1628 xorps @tweak[2],$inout2
1629 movups $inout0,($out)
1630 movups $inout1,16*1($out)
1631 movups $inout2,16*2($out)
1632 lea 16*3($out),$out
1633 jmp .Lxts_enc_done
1634
1635.align 16
1636.Lxts_enc_four:
1637 movups ($inp),$inout0
1638 movups 16*1($inp),$inout1
1639 movups 16*2($inp),$inout2
1640 xorps @tweak[0],$inout0
1641 movups 16*3($inp),$inout3
1642 lea 16*4($inp),$inp
1643 xorps @tweak[1],$inout1
1644 xorps @tweak[2],$inout2
1645 xorps @tweak[3],$inout3
1646
1647 call _aesni_encrypt4
1648
1649 xorps @tweak[0],$inout0
1650 movdqa @tweak[5],@tweak[0]
1651 xorps @tweak[1],$inout1
1652 xorps @tweak[2],$inout2
1653 movups $inout0,($out)
1654 xorps @tweak[3],$inout3
1655 movups $inout1,16*1($out)
1656 movups $inout2,16*2($out)
1657 movups $inout3,16*3($out)
1658 lea 16*4($out),$out
1659 jmp .Lxts_enc_done
1660
1661.align 16
1662.Lxts_enc_done:
1663 and \$15,$len_
1664 jz .Lxts_enc_ret
1665 mov $len_,$len
1666
1667.Lxts_enc_steal:
1668 movzb ($inp),%eax # borrow $rounds ...
1669 movzb -16($out),%ecx # ... and $key
1670 lea 1($inp),$inp
1671 mov %al,-16($out)
1672 mov %cl,0($out)
1673 lea 1($out),$out
1674 sub \$1,$len
1675 jnz .Lxts_enc_steal
1676
1677 sub $len_,$out # rewind $out
1678 mov $key_,$key # restore $key
1679 mov $rnds_,$rounds # restore $rounds
1680
1681 movups -16($out),$inout0
1682 xorps @tweak[0],$inout0
1683___
1684 &aesni_generate1("enc",$key,$rounds);
1685$code.=<<___;
1686 xorps @tweak[0],$inout0
1687 movups $inout0,-16($out)
1688
1689.Lxts_enc_ret:
1690___
1691$code.=<<___ if ($win64);
1692 movaps 0x60(%rsp),%xmm6
1693 movaps 0x70(%rsp),%xmm7
1694 movaps 0x80(%rsp),%xmm8
1695 movaps 0x90(%rsp),%xmm9
1696 movaps 0xa0(%rsp),%xmm10
1697 movaps 0xb0(%rsp),%xmm11
1698 movaps 0xc0(%rsp),%xmm12
1699 movaps 0xd0(%rsp),%xmm13
1700 movaps 0xe0(%rsp),%xmm14
1701 movaps 0xf0(%rsp),%xmm15
1702___
1703$code.=<<___;
1704 lea (%rbp),%rsp
1705 pop %rbp
1706.Lxts_enc_epilogue:
1707 ret
1708.size aesni_xts_encrypt,.-aesni_xts_encrypt
1709___
1710
1711$code.=<<___;
1712.globl aesni_xts_decrypt
1713.type aesni_xts_decrypt,\@function,6
1714.align 16
1715aesni_xts_decrypt:
1716 _CET_ENDBR
1717 lea (%rsp),%rax
1718 push %rbp
1719 sub \$$frame_size,%rsp
1720___
1721$code.=<<___ if ($win64);
1722 movaps %xmm6,0x60(%rsp)
1723 movaps %xmm7,0x70(%rsp)
1724 movaps %xmm8,0x80(%rsp)
1725 movaps %xmm9,0x90(%rsp)
1726 movaps %xmm10,0xa0(%rsp)
1727 movaps %xmm11,0xb0(%rsp)
1728 movaps %xmm12,0xc0(%rsp)
1729 movaps %xmm13,0xd0(%rsp)
1730 movaps %xmm14,0xe0(%rsp)
1731 movaps %xmm15,0xf0(%rsp)
1732.Lxts_dec_body:
1733___
1734$code.=<<___;
1735 lea -8(%rax),%rbp
1736 movups ($ivp),@tweak[5] # load clear-text tweak
1737 mov 240($key2),$rounds # key2->rounds
1738 mov 240($key),$rnds_ # key1->rounds
1739___
1740 # generate the tweak
1741 &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
1742$code.=<<___;
1743 xor %eax,%eax # if ($len%16) len-=16;
1744 test \$15,$len
1745 setnz %al
1746 shl \$4,%rax
1747 sub %rax,$len
1748
1749 mov $key,$key_ # backup $key
1750 mov $rnds_,$rounds # backup $rounds
1751 mov $len,$len_ # backup $len
1752 and \$-16,$len
1753
1754 movdqa .Lxts_magic(%rip),$twmask
1755 pxor $twtmp,$twtmp
1756 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1757___
1758 for ($i=0;$i<4;$i++) {
1759 $code.=<<___;
1760 pshufd \$0x13,$twtmp,$twres
1761 pxor $twtmp,$twtmp
1762 movdqa @tweak[5],@tweak[$i]
1763 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1764 pand $twmask,$twres # isolate carry and residue
1765 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1766 pxor $twres,@tweak[5]
1767___
1768 }
1769$code.=<<___;
1770 sub \$16*6,$len
1771 jc .Lxts_dec_short
1772
1773 shr \$1,$rounds
1774 sub \$1,$rounds
1775 mov $rounds,$rnds_
1776 jmp .Lxts_dec_grandloop
1777
1778.align 16
1779.Lxts_dec_grandloop:
1780 pshufd \$0x13,$twtmp,$twres
1781 movdqa @tweak[5],@tweak[4]
1782 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1783 movdqu `16*0`($inp),$inout0 # load input
1784 pand $twmask,$twres # isolate carry and residue
1785 movdqu `16*1`($inp),$inout1
1786 pxor $twres,@tweak[5]
1787
1788 movdqu `16*2`($inp),$inout2
1789 pxor @tweak[0],$inout0 # input^=tweak
1790 movdqu `16*3`($inp),$inout3
1791 pxor @tweak[1],$inout1
1792 movdqu `16*4`($inp),$inout4
1793 pxor @tweak[2],$inout2
1794 movdqu `16*5`($inp),$inout5
1795 lea `16*6`($inp),$inp
1796 pxor @tweak[3],$inout3
1797 $movkey ($key_),$rndkey0
1798 pxor @tweak[4],$inout4
1799 pxor @tweak[5],$inout5
1800
1801 # inline _aesni_decrypt6 and interleave first and last rounds
1802 # with own code...
1803 $movkey 16($key_),$rndkey1
1804 pxor $rndkey0,$inout0
1805 pxor $rndkey0,$inout1
1806 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
1807 aesdec $rndkey1,$inout0
1808 lea 32($key_),$key
1809 pxor $rndkey0,$inout2
1810 movdqa @tweak[1],`16*1`(%rsp)
1811 aesdec $rndkey1,$inout1
1812 pxor $rndkey0,$inout3
1813 movdqa @tweak[2],`16*2`(%rsp)
1814 aesdec $rndkey1,$inout2
1815 pxor $rndkey0,$inout4
1816 movdqa @tweak[3],`16*3`(%rsp)
1817 aesdec $rndkey1,$inout3
1818 pxor $rndkey0,$inout5
1819 $movkey ($key),$rndkey0
1820 dec $rounds
1821 movdqa @tweak[4],`16*4`(%rsp)
1822 aesdec $rndkey1,$inout4
1823 movdqa @tweak[5],`16*5`(%rsp)
1824 aesdec $rndkey1,$inout5
1825 pxor $twtmp,$twtmp
1826 pcmpgtd @tweak[5],$twtmp
1827 jmp .Lxts_dec_loop6_enter
1828
1829.align 16
1830.Lxts_dec_loop6:
1831 aesdec $rndkey1,$inout0
1832 aesdec $rndkey1,$inout1
1833 dec $rounds
1834 aesdec $rndkey1,$inout2
1835 aesdec $rndkey1,$inout3
1836 aesdec $rndkey1,$inout4
1837 aesdec $rndkey1,$inout5
1838.Lxts_dec_loop6_enter:
1839 $movkey 16($key),$rndkey1
1840 aesdec $rndkey0,$inout0
1841 aesdec $rndkey0,$inout1
1842 lea 32($key),$key
1843 aesdec $rndkey0,$inout2
1844 aesdec $rndkey0,$inout3
1845 aesdec $rndkey0,$inout4
1846 aesdec $rndkey0,$inout5
1847 $movkey ($key),$rndkey0
1848 jnz .Lxts_dec_loop6
1849
1850 pshufd \$0x13,$twtmp,$twres
1851 pxor $twtmp,$twtmp
1852 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1853 aesdec $rndkey1,$inout0
1854 pand $twmask,$twres # isolate carry and residue
1855 aesdec $rndkey1,$inout1
1856 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1857 aesdec $rndkey1,$inout2
1858 pxor $twres,@tweak[5]
1859 aesdec $rndkey1,$inout3
1860 aesdec $rndkey1,$inout4
1861 aesdec $rndkey1,$inout5
1862 $movkey 16($key),$rndkey1
1863
1864 pshufd \$0x13,$twtmp,$twres
1865 pxor $twtmp,$twtmp
1866 movdqa @tweak[5],@tweak[0]
1867 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1868 aesdec $rndkey0,$inout0
1869 pand $twmask,$twres # isolate carry and residue
1870 aesdec $rndkey0,$inout1
1871 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1872 aesdec $rndkey0,$inout2
1873 pxor $twres,@tweak[5]
1874 aesdec $rndkey0,$inout3
1875 aesdec $rndkey0,$inout4
1876 aesdec $rndkey0,$inout5
1877 $movkey 32($key),$rndkey0
1878
1879 pshufd \$0x13,$twtmp,$twres
1880 pxor $twtmp,$twtmp
1881 movdqa @tweak[5],@tweak[1]
1882 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1883 aesdec $rndkey1,$inout0
1884 pand $twmask,$twres # isolate carry and residue
1885 aesdec $rndkey1,$inout1
1886 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1887 aesdec $rndkey1,$inout2
1888 pxor $twres,@tweak[5]
1889 aesdec $rndkey1,$inout3
1890 aesdec $rndkey1,$inout4
1891 aesdec $rndkey1,$inout5
1892
1893 pshufd \$0x13,$twtmp,$twres
1894 pxor $twtmp,$twtmp
1895 movdqa @tweak[5],@tweak[2]
1896 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1897 aesdeclast $rndkey0,$inout0
1898 pand $twmask,$twres # isolate carry and residue
1899 aesdeclast $rndkey0,$inout1
1900 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1901 aesdeclast $rndkey0,$inout2
1902 pxor $twres,@tweak[5]
1903 aesdeclast $rndkey0,$inout3
1904 aesdeclast $rndkey0,$inout4
1905 aesdeclast $rndkey0,$inout5
1906
1907 pshufd \$0x13,$twtmp,$twres
1908 pxor $twtmp,$twtmp
1909 movdqa @tweak[5],@tweak[3]
1910 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1911 xorps `16*0`(%rsp),$inout0 # output^=tweak
1912 pand $twmask,$twres # isolate carry and residue
1913 xorps `16*1`(%rsp),$inout1
1914 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1915 pxor $twres,@tweak[5]
1916
1917 xorps `16*2`(%rsp),$inout2
1918 movups $inout0,`16*0`($out) # write output
1919 xorps `16*3`(%rsp),$inout3
1920 movups $inout1,`16*1`($out)
1921 xorps `16*4`(%rsp),$inout4
1922 movups $inout2,`16*2`($out)
1923 xorps `16*5`(%rsp),$inout5
1924 movups $inout3,`16*3`($out)
1925 mov $rnds_,$rounds # restore $rounds
1926 movups $inout4,`16*4`($out)
1927 movups $inout5,`16*5`($out)
1928 lea `16*6`($out),$out
1929 sub \$16*6,$len
1930 jnc .Lxts_dec_grandloop
1931
1932 lea 3($rounds,$rounds),$rounds # restore original value
1933 mov $key_,$key # restore $key
1934 mov $rounds,$rnds_ # backup $rounds
1935
1936.Lxts_dec_short:
1937 add \$16*6,$len
1938 jz .Lxts_dec_done
1939
1940 cmp \$0x20,$len
1941 jb .Lxts_dec_one
1942 je .Lxts_dec_two
1943
1944 cmp \$0x40,$len
1945 jb .Lxts_dec_three
1946 je .Lxts_dec_four
1947
1948 pshufd \$0x13,$twtmp,$twres
1949 movdqa @tweak[5],@tweak[4]
1950 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1951 movdqu ($inp),$inout0
1952 pand $twmask,$twres # isolate carry and residue
1953 movdqu 16*1($inp),$inout1
1954 pxor $twres,@tweak[5]
1955
1956 movdqu 16*2($inp),$inout2
1957 pxor @tweak[0],$inout0
1958 movdqu 16*3($inp),$inout3
1959 pxor @tweak[1],$inout1
1960 movdqu 16*4($inp),$inout4
1961 lea 16*5($inp),$inp
1962 pxor @tweak[2],$inout2
1963 pxor @tweak[3],$inout3
1964 pxor @tweak[4],$inout4
1965
1966 call _aesni_decrypt6
1967
1968 xorps @tweak[0],$inout0
1969 xorps @tweak[1],$inout1
1970 xorps @tweak[2],$inout2
1971 movdqu $inout0,($out)
1972 xorps @tweak[3],$inout3
1973 movdqu $inout1,16*1($out)
1974 xorps @tweak[4],$inout4
1975 movdqu $inout2,16*2($out)
1976 pxor $twtmp,$twtmp
1977 movdqu $inout3,16*3($out)
1978 pcmpgtd @tweak[5],$twtmp
1979 movdqu $inout4,16*4($out)
1980 lea 16*5($out),$out
1981 pshufd \$0x13,$twtmp,@tweak[1] # $twres
1982 and \$15,$len_
1983 jz .Lxts_dec_ret
1984
1985 movdqa @tweak[5],@tweak[0]
1986 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1987 pand $twmask,@tweak[1] # isolate carry and residue
1988 pxor @tweak[5],@tweak[1]
1989 jmp .Lxts_dec_done2
1990
1991.align 16
1992.Lxts_dec_one:
1993 movups ($inp),$inout0
1994 lea 16*1($inp),$inp
1995 xorps @tweak[0],$inout0
1996___
1997 &aesni_generate1("dec",$key,$rounds);
1998$code.=<<___;
1999 xorps @tweak[0],$inout0
2000 movdqa @tweak[1],@tweak[0]
2001 movups $inout0,($out)
2002 movdqa @tweak[2],@tweak[1]
2003 lea 16*1($out),$out
2004 jmp .Lxts_dec_done
2005
2006.align 16
2007.Lxts_dec_two:
2008 movups ($inp),$inout0
2009 movups 16($inp),$inout1
2010 lea 32($inp),$inp
2011 xorps @tweak[0],$inout0
2012 xorps @tweak[1],$inout1
2013
2014 call _aesni_decrypt3
2015
2016 xorps @tweak[0],$inout0
2017 movdqa @tweak[2],@tweak[0]
2018 xorps @tweak[1],$inout1
2019 movdqa @tweak[3],@tweak[1]
2020 movups $inout0,($out)
2021 movups $inout1,16*1($out)
2022 lea 16*2($out),$out
2023 jmp .Lxts_dec_done
2024
2025.align 16
2026.Lxts_dec_three:
2027 movups ($inp),$inout0
2028 movups 16*1($inp),$inout1
2029 movups 16*2($inp),$inout2
2030 lea 16*3($inp),$inp
2031 xorps @tweak[0],$inout0
2032 xorps @tweak[1],$inout1
2033 xorps @tweak[2],$inout2
2034
2035 call _aesni_decrypt3
2036
2037 xorps @tweak[0],$inout0
2038 movdqa @tweak[3],@tweak[0]
2039 xorps @tweak[1],$inout1
2040 movdqa @tweak[5],@tweak[1]
2041 xorps @tweak[2],$inout2
2042 movups $inout0,($out)
2043 movups $inout1,16*1($out)
2044 movups $inout2,16*2($out)
2045 lea 16*3($out),$out
2046 jmp .Lxts_dec_done
2047
2048.align 16
2049.Lxts_dec_four:
2050 pshufd \$0x13,$twtmp,$twres
2051 movdqa @tweak[5],@tweak[4]
2052 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2053 movups ($inp),$inout0
2054 pand $twmask,$twres # isolate carry and residue
2055 movups 16*1($inp),$inout1
2056 pxor $twres,@tweak[5]
2057
2058 movups 16*2($inp),$inout2
2059 xorps @tweak[0],$inout0
2060 movups 16*3($inp),$inout3
2061 lea 16*4($inp),$inp
2062 xorps @tweak[1],$inout1
2063 xorps @tweak[2],$inout2
2064 xorps @tweak[3],$inout3
2065
2066 call _aesni_decrypt4
2067
2068 xorps @tweak[0],$inout0
2069 movdqa @tweak[4],@tweak[0]
2070 xorps @tweak[1],$inout1
2071 movdqa @tweak[5],@tweak[1]
2072 xorps @tweak[2],$inout2
2073 movups $inout0,($out)
2074 xorps @tweak[3],$inout3
2075 movups $inout1,16*1($out)
2076 movups $inout2,16*2($out)
2077 movups $inout3,16*3($out)
2078 lea 16*4($out),$out
2079 jmp .Lxts_dec_done
2080
2081.align 16
2082.Lxts_dec_done:
2083 and \$15,$len_
2084 jz .Lxts_dec_ret
2085.Lxts_dec_done2:
2086 mov $len_,$len
2087 mov $key_,$key # restore $key
2088 mov $rnds_,$rounds # restore $rounds
2089
2090 movups ($inp),$inout0
2091 xorps @tweak[1],$inout0
2092___
2093 &aesni_generate1("dec",$key,$rounds);
2094$code.=<<___;
2095 xorps @tweak[1],$inout0
2096 movups $inout0,($out)
2097
2098.Lxts_dec_steal:
2099 movzb 16($inp),%eax # borrow $rounds ...
2100 movzb ($out),%ecx # ... and $key
2101 lea 1($inp),$inp
2102 mov %al,($out)
2103 mov %cl,16($out)
2104 lea 1($out),$out
2105 sub \$1,$len
2106 jnz .Lxts_dec_steal
2107
2108 sub $len_,$out # rewind $out
2109 mov $key_,$key # restore $key
2110 mov $rnds_,$rounds # restore $rounds
2111
2112 movups ($out),$inout0
2113 xorps @tweak[0],$inout0
2114___
2115 &aesni_generate1("dec",$key,$rounds);
2116$code.=<<___;
2117 xorps @tweak[0],$inout0
2118 movups $inout0,($out)
2119
2120.Lxts_dec_ret:
2121___
2122$code.=<<___ if ($win64);
2123 movaps 0x60(%rsp),%xmm6
2124 movaps 0x70(%rsp),%xmm7
2125 movaps 0x80(%rsp),%xmm8
2126 movaps 0x90(%rsp),%xmm9
2127 movaps 0xa0(%rsp),%xmm10
2128 movaps 0xb0(%rsp),%xmm11
2129 movaps 0xc0(%rsp),%xmm12
2130 movaps 0xd0(%rsp),%xmm13
2131 movaps 0xe0(%rsp),%xmm14
2132 movaps 0xf0(%rsp),%xmm15
2133___
2134$code.=<<___;
2135 lea (%rbp),%rsp
2136 pop %rbp
2137.Lxts_dec_epilogue:
2138 ret
2139.size aesni_xts_decrypt,.-aesni_xts_decrypt
2140___
2141} }}
2142
2143########################################################################
2144# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2145# size_t length, const AES_KEY *key,
2146	#			unsigned char *ivp, const int enc);
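#
# A hedged caller-side sketch in C (assuming $PREFIX is "aesni"; segment
# names are illustrative): the 6th argument selects the direction, and
# unlike the ECB/CTR/CCM helpers above this routine writes the updated
# IV back through ivp, so consecutive calls chain naturally.
#
#	unsigned char iv[16] = {0};
#	aesni_cbc_encrypt(p1, c1, len1, &key, iv, 1);	/* first segment */
#	aesni_cbc_encrypt(p2, c2, len2, &key, iv, 1);	/* chain continues */
#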
2147{
2148my $frame_size = 0x10 + ($win64?0x40:0); # used in decrypt
2149$code.=<<___;
2150.globl ${PREFIX}_cbc_encrypt
2151.type ${PREFIX}_cbc_encrypt,\@function,6
2152.align 16
2153${PREFIX}_cbc_encrypt:
2154 _CET_ENDBR
2155 test $len,$len # check length
2156 jz .Lcbc_ret
2157
2158 mov 240($key),$rnds_ # key->rounds
2159 mov $key,$key_ # backup $key
2160 test %r9d,%r9d # 6th argument
2161 jz .Lcbc_decrypt
2162#--------------------------- CBC ENCRYPT ------------------------------#
2163 movups ($ivp),$inout0 # load iv as initial state
2164 mov $rnds_,$rounds
2165 cmp \$16,$len
2166 jb .Lcbc_enc_tail
2167 sub \$16,$len
2168 jmp .Lcbc_enc_loop
2169.align 16
2170.Lcbc_enc_loop:
2171 movups ($inp),$inout1 # load input
2172 lea 16($inp),$inp
2173 #xorps $inout1,$inout0
2174___
2175 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
2176$code.=<<___;
2177 mov $rnds_,$rounds # restore $rounds
2178 mov $key_,$key # restore $key
2179 movups $inout0,0($out) # store output
2180 lea 16($out),$out
2181 sub \$16,$len
2182 jnc .Lcbc_enc_loop
2183 add \$16,$len
2184 jnz .Lcbc_enc_tail
2185 movups $inout0,($ivp)
2186 jmp .Lcbc_ret
2187
2188.Lcbc_enc_tail:
2189 mov $len,%rcx # zaps $key
2190 xchg $inp,$out # $inp is %rsi and $out is %rdi now
2191 .long 0x9066A4F3 # rep movsb
2192 mov \$16,%ecx # zero tail
2193 sub $len,%rcx
2194 xor %eax,%eax
2195 .long 0x9066AAF3 # rep stosb
2196 lea -16(%rdi),%rdi # rewind $out by 1 block
2197 mov $rnds_,$rounds # restore $rounds
2198 mov %rdi,%rsi # $inp and $out are the same
2199 mov $key_,$key # restore $key
2200 xor $len,$len # len=16
2201 jmp .Lcbc_enc_loop # one more spin
2202 #--------------------------- CBC DECRYPT ------------------------------#
2203.align 16
2204.Lcbc_decrypt:
2205 lea (%rsp),%rax
2206 push %rbp
2207 sub \$$frame_size,%rsp
2208___
2209$code.=<<___ if ($win64);
2210 movaps %xmm6,0x10(%rsp)
2211 movaps %xmm7,0x20(%rsp)
2212 movaps %xmm8,0x30(%rsp)
2213 movaps %xmm9,0x40(%rsp)
2214.Lcbc_decrypt_body:
2215___
2216$code.=<<___;
2217 lea -8(%rax),%rbp
2218 movups ($ivp),$iv
2219 mov $rnds_,$rounds
2220 cmp \$0x70,$len
2221 jbe .Lcbc_dec_tail
2222 shr \$1,$rnds_
2223 sub \$0x70,$len
2224 mov $rnds_,$rounds
2225 movaps $iv,(%rsp)
2226 jmp .Lcbc_dec_loop8_enter
2227.align 16
2228.Lcbc_dec_loop8:
2229 movaps $rndkey0,(%rsp) # save IV
2230 movups $inout7,($out)
2231 lea 0x10($out),$out
2232.Lcbc_dec_loop8_enter:
2233 $movkey ($key),$rndkey0
2234 movups ($inp),$inout0 # load input
2235 movups 0x10($inp),$inout1
2236 $movkey 16($key),$rndkey1
2237
2238 lea 32($key),$key
2239 movdqu 0x20($inp),$inout2
2240 xorps $rndkey0,$inout0
2241 movdqu 0x30($inp),$inout3
2242 xorps $rndkey0,$inout1
2243 movdqu 0x40($inp),$inout4
2244 aesdec $rndkey1,$inout0
2245 pxor $rndkey0,$inout2
2246 movdqu 0x50($inp),$inout5
2247 aesdec $rndkey1,$inout1
2248 pxor $rndkey0,$inout3
2249 movdqu 0x60($inp),$inout6
2250 aesdec $rndkey1,$inout2
2251 pxor $rndkey0,$inout4
2252 movdqu 0x70($inp),$inout7
2253 aesdec $rndkey1,$inout3
2254 pxor $rndkey0,$inout5
2255 dec $rounds
2256 aesdec $rndkey1,$inout4
2257 pxor $rndkey0,$inout6
2258 aesdec $rndkey1,$inout5
2259 pxor $rndkey0,$inout7
2260 $movkey ($key),$rndkey0
2261 aesdec $rndkey1,$inout6
2262 aesdec $rndkey1,$inout7
2263 $movkey 16($key),$rndkey1
2264
2265 call .Ldec_loop8_enter
2266
2267 movups ($inp),$rndkey1 # re-load input
2268 movups 0x10($inp),$rndkey0
2269 xorps (%rsp),$inout0 # ^= IV
2270 xorps $rndkey1,$inout1
2271 movups 0x20($inp),$rndkey1
2272 xorps $rndkey0,$inout2
2273 movups 0x30($inp),$rndkey0
2274 xorps $rndkey1,$inout3
2275 movups 0x40($inp),$rndkey1
2276 xorps $rndkey0,$inout4
2277 movups 0x50($inp),$rndkey0
2278 xorps $rndkey1,$inout5
2279 movups 0x60($inp),$rndkey1
2280 xorps $rndkey0,$inout6
2281 movups 0x70($inp),$rndkey0 # IV
2282 xorps $rndkey1,$inout7
2283 movups $inout0,($out)
2284 movups $inout1,0x10($out)
2285 movups $inout2,0x20($out)
2286 movups $inout3,0x30($out)
2287 mov $rnds_,$rounds # restore $rounds
2288 movups $inout4,0x40($out)
2289 mov $key_,$key # restore $key
2290 movups $inout5,0x50($out)
2291 lea 0x80($inp),$inp
2292 movups $inout6,0x60($out)
2293 lea 0x70($out),$out
2294 sub \$0x80,$len
2295 ja .Lcbc_dec_loop8
2296
2297 movaps $inout7,$inout0
2298 movaps $rndkey0,$iv
2299 add \$0x70,$len
2300 jle .Lcbc_dec_tail_collected
2301 movups $inout0,($out)
2302 lea 1($rnds_,$rnds_),$rounds
2303 lea 0x10($out),$out
2304.Lcbc_dec_tail:
2305 movups ($inp),$inout0
2306 movaps $inout0,$in0
2307 cmp \$0x10,$len
2308 jbe .Lcbc_dec_one
2309
2310 movups 0x10($inp),$inout1
2311 movaps $inout1,$in1
2312 cmp \$0x20,$len
2313 jbe .Lcbc_dec_two
2314
2315 movups 0x20($inp),$inout2
2316 movaps $inout2,$in2
2317 cmp \$0x30,$len
2318 jbe .Lcbc_dec_three
2319
2320 movups 0x30($inp),$inout3
2321 cmp \$0x40,$len
2322 jbe .Lcbc_dec_four
2323
2324 movups 0x40($inp),$inout4
2325 cmp \$0x50,$len
2326 jbe .Lcbc_dec_five
2327
2328 movups 0x50($inp),$inout5
2329 cmp \$0x60,$len
2330 jbe .Lcbc_dec_six
2331
2332 movups 0x60($inp),$inout6
2333 movaps $iv,(%rsp) # save IV
2334 call _aesni_decrypt8
2335 movups ($inp),$rndkey1
2336 movups 0x10($inp),$rndkey0
2337 xorps (%rsp),$inout0 # ^= IV
2338 xorps $rndkey1,$inout1
2339 movups 0x20($inp),$rndkey1
2340 xorps $rndkey0,$inout2
2341 movups 0x30($inp),$rndkey0
2342 xorps $rndkey1,$inout3
2343 movups 0x40($inp),$rndkey1
2344 xorps $rndkey0,$inout4
2345 movups 0x50($inp),$rndkey0
2346 xorps $rndkey1,$inout5
2347 movups 0x60($inp),$iv # IV
2348 xorps $rndkey0,$inout6
2349 movups $inout0,($out)
2350 movups $inout1,0x10($out)
2351 movups $inout2,0x20($out)
2352 movups $inout3,0x30($out)
2353 movups $inout4,0x40($out)
2354 movups $inout5,0x50($out)
2355 lea 0x60($out),$out
2356 movaps $inout6,$inout0
2357 sub \$0x70,$len
2358 jmp .Lcbc_dec_tail_collected
2359.align 16
2360.Lcbc_dec_one:
2361___
2362 &aesni_generate1("dec",$key,$rounds);
2363$code.=<<___;
2364 xorps $iv,$inout0
2365 movaps $in0,$iv
2366 sub \$0x10,$len
2367 jmp .Lcbc_dec_tail_collected
2368.align 16
2369.Lcbc_dec_two:
2370 xorps $inout2,$inout2
2371 call _aesni_decrypt3
2372 xorps $iv,$inout0
2373 xorps $in0,$inout1
2374 movups $inout0,($out)
2375 movaps $in1,$iv
2376 movaps $inout1,$inout0
2377 lea 0x10($out),$out
2378 sub \$0x20,$len
2379 jmp .Lcbc_dec_tail_collected
2380.align 16
2381.Lcbc_dec_three:
2382 call _aesni_decrypt3
2383 xorps $iv,$inout0
2384 xorps $in0,$inout1
2385 movups $inout0,($out)
2386 xorps $in1,$inout2
2387 movups $inout1,0x10($out)
2388 movaps $in2,$iv
2389 movaps $inout2,$inout0
2390 lea 0x20($out),$out
2391 sub \$0x30,$len
2392 jmp .Lcbc_dec_tail_collected
2393.align 16
2394.Lcbc_dec_four:
2395 call _aesni_decrypt4
2396 xorps $iv,$inout0
2397 movups 0x30($inp),$iv
2398 xorps $in0,$inout1
2399 movups $inout0,($out)
2400 xorps $in1,$inout2
2401 movups $inout1,0x10($out)
2402 xorps $in2,$inout3
2403 movups $inout2,0x20($out)
2404 movaps $inout3,$inout0
2405 lea 0x30($out),$out
2406 sub \$0x40,$len
2407 jmp .Lcbc_dec_tail_collected
2408.align 16
2409.Lcbc_dec_five:
2410 xorps $inout5,$inout5
2411 call _aesni_decrypt6
2412 movups 0x10($inp),$rndkey1
2413 movups 0x20($inp),$rndkey0
2414 xorps $iv,$inout0
2415 xorps $in0,$inout1
2416 xorps $rndkey1,$inout2
2417 movups 0x30($inp),$rndkey1
2418 xorps $rndkey0,$inout3
2419 movups 0x40($inp),$iv
2420 xorps $rndkey1,$inout4
2421 movups $inout0,($out)
2422 movups $inout1,0x10($out)
2423 movups $inout2,0x20($out)
2424 movups $inout3,0x30($out)
2425 lea 0x40($out),$out
2426 movaps $inout4,$inout0
2427 sub \$0x50,$len
2428 jmp .Lcbc_dec_tail_collected
2429.align 16
2430.Lcbc_dec_six:
2431 call _aesni_decrypt6
2432 movups 0x10($inp),$rndkey1
2433 movups 0x20($inp),$rndkey0
2434 xorps $iv,$inout0
2435 xorps $in0,$inout1
2436 xorps $rndkey1,$inout2
2437 movups 0x30($inp),$rndkey1
2438 xorps $rndkey0,$inout3
2439 movups 0x40($inp),$rndkey0
2440 xorps $rndkey1,$inout4
2441 movups 0x50($inp),$iv
2442 xorps $rndkey0,$inout5
2443 movups $inout0,($out)
2444 movups $inout1,0x10($out)
2445 movups $inout2,0x20($out)
2446 movups $inout3,0x30($out)
2447 movups $inout4,0x40($out)
2448 lea 0x50($out),$out
2449 movaps $inout5,$inout0
2450 sub \$0x60,$len
2451 jmp .Lcbc_dec_tail_collected
2452.align 16
2453.Lcbc_dec_tail_collected:
2454 and \$15,$len
2455 movups $iv,($ivp)
2456 jnz .Lcbc_dec_tail_partial
2457 movups $inout0,($out)
2458 jmp .Lcbc_dec_ret
2459.align 16
2460.Lcbc_dec_tail_partial:
2461 movaps $inout0,(%rsp)
2462 mov \$16,%rcx
2463 mov $out,%rdi
2464 sub $len,%rcx
2465 lea (%rsp),%rsi
2466 .long 0x9066A4F3 # rep movsb
2467
2468.Lcbc_dec_ret:
2469___
2470$code.=<<___ if ($win64);
2471 movaps 0x10(%rsp),%xmm6
2472 movaps 0x20(%rsp),%xmm7
2473 movaps 0x30(%rsp),%xmm8
2474 movaps 0x40(%rsp),%xmm9
2475___
2476$code.=<<___;
2477 lea (%rbp),%rsp
2478 pop %rbp
2479.Lcbc_ret:
2480 ret
2481.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
2482___
2483}
2484# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
2485# int bits, AES_KEY *key)
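#
# A hedged caller-side sketch in C (assuming $PREFIX is "aesni"); the
# return codes match the assembly below: 0 on success, -1 for a NULL
# userKey or key pointer, -2 for a key size other than 128/192/256 bits.
#
#	AES_KEY ks;
#	if (aesni_set_encrypt_key(userKey, 256, &ks) != 0)
#		handle_error();	/* illustrative error path */
#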
2486{ my ($inp,$bits,$key) = @_4args;
2487 $bits =~ s/%r/%e/;
2488
2489$code.=<<___;
2490.globl ${PREFIX}_set_decrypt_key
2491.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
2492.align 16
2493${PREFIX}_set_decrypt_key:
2494 _CET_ENDBR
2495 sub \$8,%rsp
2496 call __aesni_set_encrypt_key
2497	shl	\$4,$bits			# rounds-1 after __aesni_set_encrypt_key
2498 test %eax,%eax
2499 jnz .Ldec_key_ret
2500 lea 16($key,$bits),$inp # points at the end of key schedule
2501
2502 $movkey ($key),%xmm0 # just swap
2503 $movkey ($inp),%xmm1
2504 $movkey %xmm0,($inp)
2505 $movkey %xmm1,($key)
2506 lea 16($key),$key
2507 lea -16($inp),$inp
2508
2509.Ldec_key_inverse:
2510 $movkey ($key),%xmm0 # swap and inverse
2511 $movkey ($inp),%xmm1
2512 aesimc %xmm0,%xmm0
2513 aesimc %xmm1,%xmm1
2514 lea 16($key),$key
2515 lea -16($inp),$inp
2516 $movkey %xmm0,16($inp)
2517 $movkey %xmm1,-16($key)
2518 cmp $key,$inp
2519 ja .Ldec_key_inverse
2520
2521 $movkey ($key),%xmm0 # inverse middle
2522 aesimc %xmm0,%xmm0
2523 $movkey %xmm0,($inp)
2524.Ldec_key_ret:
2525 add \$8,%rsp
2526 ret
2527.LSEH_end_set_decrypt_key:
2528.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
2529___
2530
2531	# This is based on a submission by
2532#
2533# Huang Ying <ying.huang@intel.com>
2534# Vinodh Gopal <vinodh.gopal@intel.com>
2535# Kahraman Akdemir
2536#
2537	# Aggressively optimized with respect to aeskeygenassist's critical path;
2538	# register usage is confined to %xmm0-5 to meet the Win64 ABI requirement.
2539#
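# How the expansion below works: each aeskeygenassist performs the
# SubWord/RotWord/Rcon step on the previous round key, and the
# .Lkey_expansion_* helpers finish the round by XOR-folding the previous
# key words into place (the shufps/xorps chains).  The 192-bit schedule
# yields one and a half round keys per aeskeygenassist, which is why its
# comments label rounds in pairs.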
2540$code.=<<___;
2541.globl ${PREFIX}_set_encrypt_key
2542.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
2543.align 16
2544${PREFIX}_set_encrypt_key:
2545 _CET_ENDBR
2546__aesni_set_encrypt_key:
2547 sub \$8,%rsp
2548 mov \$-1,%rax
2549 test $inp,$inp
2550 jz .Lenc_key_ret
2551 test $key,$key
2552 jz .Lenc_key_ret
2553
2554 movups ($inp),%xmm0 # pull first 128 bits of *userKey
2555 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
2556 lea 16($key),%rax
2557 cmp \$256,$bits
2558 je .L14rounds
2559 cmp \$192,$bits
2560 je .L12rounds
2561 cmp \$128,$bits
2562 jne .Lbad_keybits
2563
2564.L10rounds:
2565 mov \$9,$bits # 10 rounds for 128-bit key
2566 $movkey %xmm0,($key) # round 0
2567 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
2568 call .Lkey_expansion_128_cold
2569 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
2570 call .Lkey_expansion_128
2571 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
2572 call .Lkey_expansion_128
2573 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
2574 call .Lkey_expansion_128
2575 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
2576 call .Lkey_expansion_128
2577 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
2578 call .Lkey_expansion_128
2579 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
2580 call .Lkey_expansion_128
2581 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
2582 call .Lkey_expansion_128
2583 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
2584 call .Lkey_expansion_128
2585 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
2586 call .Lkey_expansion_128
2587 $movkey %xmm0,(%rax)
2588 mov $bits,80(%rax) # 240(%rdx)
2589 xor %eax,%eax
2590 jmp .Lenc_key_ret
2591
2592.align 16
2593.L12rounds:
2594 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
2595 mov \$11,$bits # 12 rounds for 192
2596 $movkey %xmm0,($key) # round 0
2597 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
2598 call .Lkey_expansion_192a_cold
2599 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
2600 call .Lkey_expansion_192b
2601 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
2602 call .Lkey_expansion_192a
2603 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
2604 call .Lkey_expansion_192b
2605 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
2606 call .Lkey_expansion_192a
2607 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
2608 call .Lkey_expansion_192b
2609 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
2610 call .Lkey_expansion_192a
2611 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
2612 call .Lkey_expansion_192b
2613 $movkey %xmm0,(%rax)
2614 mov $bits,48(%rax) # 240(%rdx)
2615	xor	%rax,%rax
2616 jmp .Lenc_key_ret
2617
2618.align 16
2619.L14rounds:
2620 movups 16($inp),%xmm2 # remaining half of *userKey
2621 mov \$13,$bits # 14 rounds for 256
2622 lea 16(%rax),%rax
2623 $movkey %xmm0,($key) # round 0
2624 $movkey %xmm2,16($key) # round 1
2625 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
2626 call .Lkey_expansion_256a_cold
2627 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
2628 call .Lkey_expansion_256b
2629 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
2630 call .Lkey_expansion_256a
2631 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
2632 call .Lkey_expansion_256b
2633 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
2634 call .Lkey_expansion_256a
2635 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
2636 call .Lkey_expansion_256b
2637 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
2638 call .Lkey_expansion_256a
2639 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
2640 call .Lkey_expansion_256b
2641 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
2642 call .Lkey_expansion_256a
2643 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
2644 call .Lkey_expansion_256b
2645 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
2646 call .Lkey_expansion_256a
2647 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
2648 call .Lkey_expansion_256b
2649 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
2650 call .Lkey_expansion_256a
2651 $movkey %xmm0,(%rax)
2652 mov $bits,16(%rax) # 240(%rdx)
2653 xor %rax,%rax
2654 jmp .Lenc_key_ret
2655
2656.align 16
2657.Lbad_keybits:
2658 mov \$-2,%rax
2659.Lenc_key_ret:
2660 add \$8,%rsp
2661 ret
2662.LSEH_end_set_encrypt_key:
2663
2664.align 16
2665.Lkey_expansion_128:
2666 $movkey %xmm0,(%rax)
2667 lea 16(%rax),%rax
2668.Lkey_expansion_128_cold:
2669 shufps \$0b00010000,%xmm0,%xmm4
2670 xorps %xmm4, %xmm0
2671 shufps \$0b10001100,%xmm0,%xmm4
2672 xorps %xmm4, %xmm0
2673 shufps \$0b11111111,%xmm1,%xmm1 # critical path
2674 xorps %xmm1,%xmm0
2675 ret
2676
2677.align 16
2678.Lkey_expansion_192a:
2679 $movkey %xmm0,(%rax)
2680 lea 16(%rax),%rax
2681.Lkey_expansion_192a_cold:
2682 movaps %xmm2, %xmm5
2683.Lkey_expansion_192b_warm:
2684 shufps \$0b00010000,%xmm0,%xmm4
2685 movdqa %xmm2,%xmm3
2686 xorps %xmm4,%xmm0
2687 shufps \$0b10001100,%xmm0,%xmm4
2688 pslldq \$4,%xmm3
2689 xorps %xmm4,%xmm0
2690 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
2691 pxor %xmm3,%xmm2
2692 pxor %xmm1,%xmm0
2693 pshufd \$0b11111111,%xmm0,%xmm3
2694 pxor %xmm3,%xmm2
2695 ret
2696
2697.align 16
2698.Lkey_expansion_192b:
2699 movaps %xmm0,%xmm3
2700 shufps \$0b01000100,%xmm0,%xmm5
2701 $movkey %xmm5,(%rax)
2702 shufps \$0b01001110,%xmm2,%xmm3
2703 $movkey %xmm3,16(%rax)
2704 lea 32(%rax),%rax
2705 jmp .Lkey_expansion_192b_warm
2706
2707.align 16
2708.Lkey_expansion_256a:
2709 $movkey %xmm2,(%rax)
2710 lea 16(%rax),%rax
2711.Lkey_expansion_256a_cold:
2712 shufps \$0b00010000,%xmm0,%xmm4
2713 xorps %xmm4,%xmm0
2714 shufps \$0b10001100,%xmm0,%xmm4
2715 xorps %xmm4,%xmm0
2716 shufps \$0b11111111,%xmm1,%xmm1 # critical path
2717 xorps %xmm1,%xmm0
2718 ret
2719
2720.align 16
2721.Lkey_expansion_256b:
2722 $movkey %xmm0,(%rax)
2723 lea 16(%rax),%rax
2724
2725 shufps \$0b00010000,%xmm2,%xmm4
2726 xorps %xmm4,%xmm2
2727 shufps \$0b10001100,%xmm2,%xmm4
2728 xorps %xmm4,%xmm2
2729 shufps \$0b10101010,%xmm1,%xmm1 # critical path
2730 xorps %xmm1,%xmm2
2731 ret
2732.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
2733.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
2734___
2735}
2736
2737$code.=<<___;
2738.section .rodata
2739.align 64
2740.Lbswap_mask:
2741 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
2742.Lincrement32:
2743 .long 6,6,6,0
2744.Lincrement64:
2745 .long 1,0,0,0
2746.Lxts_magic:
2747 .long 0x87,0,1,0
2748.align 64
2749.text
2750___
2751
2752	# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec, ULONG64 frame,
2753	#		CONTEXT *context, DISPATCHER_CONTEXT *disp)
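#
# During Win64 exception unwinding these handlers locate the frame of
# the interrupted function, copy its saved %xmm6+ registers back into
# the CONTEXT record, restore the stack and frame registers, and hand
# control to RtlVirtualUnwind; the .pdata/.xdata sections at the end
# bind them to the code ranges above.
#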
2754if ($win64) {
2755$rec="%rcx";
2756$frame="%rdx";
2757$context="%r8";
2758$disp="%r9";
2759
2760$code.=<<___;
2761.extern __imp_RtlVirtualUnwind
2762___
2763$code.=<<___ if ($PREFIX eq "aesni");
2764.type ecb_se_handler,\@abi-omnipotent
2765.align 16
2766ecb_se_handler:
2767 _CET_ENDBR
2768 push %rsi
2769 push %rdi
2770 push %rbx
2771 push %rbp
2772 push %r12
2773 push %r13
2774 push %r14
2775 push %r15
2776 pushfq
2777 sub \$64,%rsp
2778
2779 mov 152($context),%rax # pull context->Rsp
2780
2781 jmp .Lcommon_seh_tail
2782.size ecb_se_handler,.-ecb_se_handler
2783
2784.type ccm64_se_handler,\@abi-omnipotent
2785.align 16
2786ccm64_se_handler:
2787 _CET_ENDBR
2788 push %rsi
2789 push %rdi
2790 push %rbx
2791 push %rbp
2792 push %r12
2793 push %r13
2794 push %r14
2795 push %r15
2796 pushfq
2797 sub \$64,%rsp
2798
2799 mov 120($context),%rax # pull context->Rax
2800 mov 248($context),%rbx # pull context->Rip
2801
2802 mov 8($disp),%rsi # disp->ImageBase
2803 mov 56($disp),%r11 # disp->HandlerData
2804
2805 mov 0(%r11),%r10d # HandlerData[0]
2806 lea (%rsi,%r10),%r10 # prologue label
2807 cmp %r10,%rbx # context->Rip<prologue label
2808 jb .Lcommon_seh_tail
2809
2810 mov 152($context),%rax # pull context->Rsp
2811
2812 mov 4(%r11),%r10d # HandlerData[1]
2813 lea (%rsi,%r10),%r10 # epilogue label
2814 cmp %r10,%rbx # context->Rip>=epilogue label
2815 jae .Lcommon_seh_tail
2816
2817 lea 0(%rax),%rsi # %xmm save area
2818 lea 512($context),%rdi # &context.Xmm6
2819 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
2820 .long 0xa548f3fc # cld; rep movsq
2821 lea 0x58(%rax),%rax # adjust stack pointer
2822
2823 jmp .Lcommon_seh_tail
2824.size ccm64_se_handler,.-ccm64_se_handler
2825
2826.type ctr32_se_handler,\@abi-omnipotent
2827.align 16
2828ctr32_se_handler:
2829 _CET_ENDBR
2830 push %rsi
2831 push %rdi
2832 push %rbx
2833 push %rbp
2834 push %r12
2835 push %r13
2836 push %r14
2837 push %r15
2838 pushfq
2839 sub \$64,%rsp
2840
2841 mov 120($context),%rax # pull context->Rax
2842 mov 248($context),%rbx # pull context->Rip
2843
2844 lea .Lctr32_body(%rip),%r10
2845 cmp %r10,%rbx # context->Rip<"prologue" label
2846 jb .Lcommon_seh_tail
2847
2848 mov 152($context),%rax # pull context->Rsp
2849
2850 lea .Lctr32_ret(%rip),%r10
2851 cmp %r10,%rbx
2852 jae .Lcommon_seh_tail
2853
2854 lea 0x20(%rax),%rsi # %xmm save area
2855 lea 512($context),%rdi # &context.Xmm6
2856 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2857 .long 0xa548f3fc # cld; rep movsq
2858
2859 jmp .Lcommon_rbp_tail
2860.size ctr32_se_handler,.-ctr32_se_handler
2861
2862.type xts_se_handler,\@abi-omnipotent
2863.align 16
2864xts_se_handler:
2865 _CET_ENDBR
2866 push %rsi
2867 push %rdi
2868 push %rbx
2869 push %rbp
2870 push %r12
2871 push %r13
2872 push %r14
2873 push %r15
2874 pushfq
2875 sub \$64,%rsp
2876
2877 mov 120($context),%rax # pull context->Rax
2878 mov 248($context),%rbx # pull context->Rip
2879
2880 mov 8($disp),%rsi # disp->ImageBase
2881 mov 56($disp),%r11 # disp->HandlerData
2882
2883 mov 0(%r11),%r10d # HandlerData[0]
2884 lea (%rsi,%r10),%r10 # prologue label
2885 cmp %r10,%rbx # context->Rip<prologue label
2886 jb .Lcommon_seh_tail
2887
2888 mov 152($context),%rax # pull context->Rsp
2889
2890 mov 4(%r11),%r10d # HandlerData[1]
2891 lea (%rsi,%r10),%r10 # epilogue label
2892 cmp %r10,%rbx # context->Rip>=epilogue label
2893 jae .Lcommon_seh_tail
2894
2895 lea 0x60(%rax),%rsi # %xmm save area
2896 lea 512($context),%rdi # & context.Xmm6
2897 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2898 .long 0xa548f3fc # cld; rep movsq
2899
2900 jmp .Lcommon_rbp_tail
2901.size xts_se_handler,.-xts_se_handler
2902___
2903$code.=<<___;
2904.type cbc_se_handler,\@abi-omnipotent
2905.align 16
2906cbc_se_handler:
2907 _CET_ENDBR
2908 push %rsi
2909 push %rdi
2910 push %rbx
2911 push %rbp
2912 push %r12
2913 push %r13
2914 push %r14
2915 push %r15
2916 pushfq
2917 sub \$64,%rsp
2918
2919 mov 152($context),%rax # pull context->Rsp
2920 mov 248($context),%rbx # pull context->Rip
2921
2922 lea .Lcbc_decrypt(%rip),%r10
2923 cmp %r10,%rbx # context->Rip<"prologue" label
2924 jb .Lcommon_seh_tail
2925
2926 lea .Lcbc_decrypt_body(%rip),%r10
2927 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
2928 jb .Lrestore_cbc_rax
2929
2930 lea .Lcbc_ret(%rip),%r10
2931 cmp %r10,%rbx # context->Rip>="epilogue" label
2932 jae .Lcommon_seh_tail
2933
2934 lea 16(%rax),%rsi # %xmm save area
2935 lea 512($context),%rdi # &context.Xmm6
2936 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
2937 .long 0xa548f3fc # cld; rep movsq
2938
2939.Lcommon_rbp_tail:
2940 mov 160($context),%rax # pull context->Rbp
2941 mov (%rax),%rbp # restore saved %rbp
2942 lea 8(%rax),%rax # adjust stack pointer
2943 mov %rbp,160($context) # restore context->Rbp
2944 jmp .Lcommon_seh_tail
2945
2946.Lrestore_cbc_rax:
2947 mov 120($context),%rax
2948
2949.Lcommon_seh_tail:
2950 mov 8(%rax),%rdi
2951 mov 16(%rax),%rsi
2952 mov %rax,152($context) # restore context->Rsp
2953 mov %rsi,168($context) # restore context->Rsi
2954 mov %rdi,176($context) # restore context->Rdi
2955
2956 mov 40($disp),%rdi # disp->ContextRecord
2957 mov $context,%rsi # context
2958	mov	\$154,%ecx		# sizeof(CONTEXT)/sizeof(%rax)
2959 .long 0xa548f3fc # cld; rep movsq
2960
2961 mov $disp,%rsi
2962 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2963 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2964 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2965 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2966 mov 40(%rsi),%r10 # disp->ContextRecord
2967 lea 56(%rsi),%r11 # &disp->HandlerData
2968 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2969 mov %r10,32(%rsp) # arg5
2970 mov %r11,40(%rsp) # arg6
2971 mov %r12,48(%rsp) # arg7
2972 mov %rcx,56(%rsp) # arg8, (NULL)
2973 call *__imp_RtlVirtualUnwind(%rip)
2974
2975 mov \$1,%eax # ExceptionContinueSearch
2976 add \$64,%rsp
2977 popfq
2978 pop %r15
2979 pop %r14
2980 pop %r13
2981 pop %r12
2982 pop %rbp
2983 pop %rbx
2984 pop %rdi
2985 pop %rsi
2986 ret
2987.size cbc_se_handler,.-cbc_se_handler
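	# All of the handlers above follow the same pattern: if the
	# faulting RIP lies within the function body, copy the saved
	# %xmm6-%xmm15 back into the CONTEXT record and recover the
	# original %rsp (plus %rbp/%rsi/%rdi where applicable), then let
	# RtlVirtualUnwind process the frame and return
	# ExceptionContinueSearch (1).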
2988
2989.section .pdata
2990.align 4
2991___
2992$code.=<<___ if ($PREFIX eq "aesni");
2993 .rva .LSEH_begin_aesni_ecb_encrypt
2994 .rva .LSEH_end_aesni_ecb_encrypt
2995 .rva .LSEH_info_ecb
2996
2997 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
2998 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
2999 .rva .LSEH_info_ccm64_enc
3000
3001 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
3002 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
3003 .rva .LSEH_info_ccm64_dec
3004
3005 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
3006 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
3007 .rva .LSEH_info_ctr32
3008
3009 .rva .LSEH_begin_aesni_xts_encrypt
3010 .rva .LSEH_end_aesni_xts_encrypt
3011 .rva .LSEH_info_xts_enc
3012
3013 .rva .LSEH_begin_aesni_xts_decrypt
3014 .rva .LSEH_end_aesni_xts_decrypt
3015 .rva .LSEH_info_xts_dec
3016___
3017$code.=<<___;
3018 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
3019 .rva .LSEH_end_${PREFIX}_cbc_encrypt
3020 .rva .LSEH_info_cbc
3021
3022 .rva ${PREFIX}_set_decrypt_key
3023 .rva .LSEH_end_set_decrypt_key
3024 .rva .LSEH_info_key
3025
3026 .rva ${PREFIX}_set_encrypt_key
3027 .rva .LSEH_end_set_encrypt_key
3028 .rva .LSEH_info_key
3029.section .xdata
3030.align 8
3031___
3032$code.=<<___ if ($PREFIX eq "aesni");
3033.LSEH_info_ecb:
3034 .byte 9,0,0,0
3035 .rva ecb_se_handler
3036.LSEH_info_ccm64_enc:
3037 .byte 9,0,0,0
3038 .rva ccm64_se_handler
3039 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
3040.LSEH_info_ccm64_dec:
3041 .byte 9,0,0,0
3042 .rva ccm64_se_handler
3043 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
3044.LSEH_info_ctr32:
3045 .byte 9,0,0,0
3046 .rva ctr32_se_handler
3047.LSEH_info_xts_enc:
3048 .byte 9,0,0,0
3049 .rva xts_se_handler
3050 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3051.LSEH_info_xts_dec:
3052 .byte 9,0,0,0
3053 .rva xts_se_handler
3054 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3055___
3056$code.=<<___;
3057.LSEH_info_cbc:
3058 .byte 9,0,0,0
3059 .rva cbc_se_handler
3060.LSEH_info_key:
3061 .byte 0x01,0x04,0x01,0x00
3062 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
3063___
3064}
3065
3066sub rex {
3067 local *opcode=shift;
3068 my ($dst,$src)=@_;
3069 my $rex=0;
3070
3071 $rex|=0x04 if($dst>=8);
3072 $rex|=0x01 if($src>=8);
3073 push @opcode,$rex|0x40 if($rex);
3074}
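# rex() appends a REX prefix byte to @opcode when either operand is an
# extended register: 0x40 | 0x04 (REX.R) if $dst >= 8, | 0x01 (REX.B)
# if $src >= 8. No prefix byte is emitted for legacy-only operand pairs.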
3075
3076$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3077
3078print $code;
3079
3080close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
deleted file mode 100644
index c44a338114..0000000000
--- a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
+++ /dev/null
@@ -1,3123 +0,0 @@
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode] ###
5### bitsliced implementation for Intel Core 2 processors ###
6### requires support of SSE extensions up to SSSE3 ###
7### Author: Emilia Käsper and Peter Schwabe ###
8### Date: 2009-03-19 ###
9### Public domain ###
10### ###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12### further information. ###
13###################################################################
14#
15# September 2011.
16#
17# Started as a transliteration to "perlasm", the original code has
18# undergone the following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22# from 12.5KB to 2.2KB;
23# - the above was possible thanks to a mixcolumns() modification that
24#   allows its output to be fed back to aesenc[last]; this was
25#   achieved at the cost of two additional inter-register moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement a key setup subroutine; instead it
28#   relies on conversion of the "conventional" key schedule as returned
29#   by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which made it
31#   possible to skip one shiftrows(), reduce the bit-sliced key
32#   schedule and speed up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38# Emilia's this(*) difference
39#
40# Core 2 9.30 8.69 +7%
41# Nehalem(**) 7.63 6.98 +9%
42# Atom 17.1 17.4 -2%(***)
43#
44# (*)	The comparison is not completely fair, because "this" is ECB,
45#	i.e. no extra processing such as counter value calculation
46#	and xor-ing of the input, as in Emilia's CTR implementation,
47#	is performed. However, the CTR calculations account for no
48#	more than 1% of total time, so the comparison is *rather* fair.
49#
50# (**) Results were collected on Westmere, which is considered to
51# be equivalent to Nehalem for this code.
52#
53# (***)	The slowdown on Atom is rather strange per se: the original
54#	implementation has a number of 9+-byte instructions, which
55#	are bad for the Atom front-end and which I eliminated completely.
56#	In an attempt to address the deterioration, sbox() was tested
57#	in the FP SIMD "domain" (movaps instead of movdqa, xorps
58#	instead of pxor, etc.). While this gave a nominal 4% improvement
59#	on Atom, it hurt Westmere by more than a 2x factor.
60#
61# As for the key schedule conversion subroutine: the interface to
62# OpenSSL relies on per-invocation on-the-fly conversion. This
63# naturally impacts performance, especially for short inputs. Conversion
64# time in CPU cycles and its ratio to CPU cycles spent in 8x block
65# function is:
66#
67# conversion conversion/8x block
68# Core 2 240 0.22
69# Nehalem 180 0.20
70# Atom 430 0.19
71#
72# The ratio values mean that 128-byte blocks will be processed
73# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74# etc. Also keep in mind that input sizes not divisible by 128 are
75# *effectively* slower, especially the shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78# it's still faster than ["hyper-threading-safe" code path in]
79# aes-x86_64.pl on all lengths above 64 bytes...
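# A back-of-the-envelope reading of the above: with r denoting the
# conversion/8x-block ratio from the table, an n*128-byte input is
# slower by roughly r/(n+r). A stand-alone sketch (not part of this
# module) with the Core 2 ratio:
#
#	my $r = 0.22;
#	printf "%4d bytes: %2.0f%% slower\n", 128 * $_, 100 * $r / ($_ + $r)
#	    for 1 .. 3;
#
# prints 18%, 10% and 7%, matching the figures quoted above.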
80#
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2 9.83
87# Nehalem 7.74
88# Atom 19.0
89#
90# November 2011.
91#
92# Add bsaes_xts_[en|de]crypt. Performance on blocks shorter than 80
93# bytes is suboptimal, but XTS is meant to be used with larger blocks...
94#
95# <appro@openssl.org>
96
97$flavour = shift;
98$output = shift;
99if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100
101$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102
103$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106die "can't locate x86_64-xlate.pl";
107
108open OUT,"| \"$^X\" $xlate $flavour $output";
109*STDOUT=*OUT;
110
111my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");	# $ivp is left undef here; IV handling is per-mode below
112my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
113my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
114
115{
116my ($key,$rounds,$const)=("%rax","%r10d","%r11");
117
118sub Sbox {
119# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
120# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
121my @b=@_[0..7];
122my @t=@_[8..11];
123my @s=@_[12..15];
124 &InBasisChange (@b);
125 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
126 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
127}
128
129sub InBasisChange {
130# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
131# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
132my @b=@_[0..7];
133$code.=<<___;
134 pxor @b[6], @b[5]
135 pxor @b[1], @b[2]
136 pxor @b[0], @b[3]
137 pxor @b[2], @b[6]
138 pxor @b[0], @b[5]
139
140 pxor @b[3], @b[6]
141 pxor @b[7], @b[3]
142 pxor @b[5], @b[7]
143 pxor @b[4], @b[3]
144 pxor @b[5], @b[4]
145 pxor @b[1], @b[3]
146
147 pxor @b[7], @b[2]
148 pxor @b[5], @b[1]
149___
150}
151
152sub OutBasisChange {
153# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
154# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
155my @b=@_[0..7];
156$code.=<<___;
157 pxor @b[6], @b[0]
158 pxor @b[4], @b[1]
159 pxor @b[0], @b[2]
160 pxor @b[6], @b[4]
161 pxor @b[1], @b[6]
162
163 pxor @b[5], @b[1]
164 pxor @b[3], @b[5]
165 pxor @b[7], @b[3]
166 pxor @b[5], @b[7]
167 pxor @b[5], @b[2]
168
169 pxor @b[7], @b[4]
170___
171}
172
173sub InvSbox {
174# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
175# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
176my @b=@_[0..7];
177my @t=@_[8..11];
178my @s=@_[12..15];
179 &InvInBasisChange (@b);
180 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
181 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
182}
183
184sub InvInBasisChange { # OutBasisChange in reverse
185my @b=@_[5,1,2,6,3,7,0,4];
186$code.=<<___
187 pxor @b[7], @b[4]
188
189 pxor @b[5], @b[7]
190 pxor @b[5], @b[2]
191 pxor @b[7], @b[3]
192 pxor @b[3], @b[5]
193 pxor @b[5], @b[1]
194
195 pxor @b[1], @b[6]
196 pxor @b[0], @b[2]
197 pxor @b[6], @b[4]
198 pxor @b[6], @b[0]
199 pxor @b[4], @b[1]
200___
201}
202
203sub InvOutBasisChange { # InBasisChange in reverse
204my @b=@_[2,5,7,3,6,1,0,4];
205$code.=<<___;
206 pxor @b[5], @b[1]
207 pxor @b[7], @b[2]
208
209 pxor @b[1], @b[3]
210 pxor @b[5], @b[4]
211 pxor @b[5], @b[7]
212 pxor @b[4], @b[3]
213 pxor @b[0], @b[5]
214 pxor @b[7], @b[3]
215 pxor @b[2], @b[6]
216 pxor @b[1], @b[2]
217 pxor @b[3], @b[6]
218
219 pxor @b[0], @b[3]
220 pxor @b[6], @b[5]
221___
222}
223
224sub Mul_GF4 {
225#;*************************************************************
226#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
227#;*************************************************************
228my ($x0,$x1,$y0,$y1,$t0)=@_;
229$code.=<<___;
230 movdqa $y0, $t0
231 pxor $y1, $t0
232 pand $x0, $t0
233 pxor $x1, $x0
234 pand $y0, $x1
235 pand $y1, $x0
236 pxor $x1, $x0
237 pxor $t0, $x1
238___
239}
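# In boolean terms, with x0/x1 denoting the original inputs, the
# sequence above computes
#
#	x0' = ((x0 ^ x1) & y1) ^ (x1 & y0)
#	x1' =  (x1 & y0) ^ (x0 & (y0 ^ y1))
#
# i.e. a GF(2^2) multiplication costing three ANDs, applied to all 128
# bit positions of the slices at once.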
240
241sub Mul_GF4_N { # not used, see next subroutine
242# multiply and scale by N
243my ($x0,$x1,$y0,$y1,$t0)=@_;
244$code.=<<___;
245 movdqa $y0, $t0
246 pxor $y1, $t0
247 pand $x0, $t0
248 pxor $x1, $x0
249 pand $y0, $x1
250 pand $y1, $x0
251 pxor $x0, $x1
252 pxor $t0, $x0
253___
254}
255
256sub Mul_GF4_N_GF4 {
257# interleaved Mul_GF4_N and Mul_GF4
258my ($x0,$x1,$y0,$y1,$t0,
259 $x2,$x3,$y2,$y3,$t1)=@_;
260$code.=<<___;
261 movdqa $y0, $t0
262 movdqa $y2, $t1
263 pxor $y1, $t0
264 pxor $y3, $t1
265 pand $x0, $t0
266 pand $x2, $t1
267 pxor $x1, $x0
268 pxor $x3, $x2
269 pand $y0, $x1
270 pand $y2, $x3
271 pand $y1, $x0
272 pand $y3, $x2
273 pxor $x0, $x1
274 pxor $x3, $x2
275 pxor $t0, $x0
276 pxor $t1, $x3
277___
278}
279sub Mul_GF16_2 {
280my @x=@_[0..7];
281my @y=@_[8..11];
282my @t=@_[12..15];
283$code.=<<___;
284 movdqa @x[0], @t[0]
285 movdqa @x[1], @t[1]
286___
287 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
288$code.=<<___;
289 pxor @x[2], @t[0]
290 pxor @x[3], @t[1]
291 pxor @y[2], @y[0]
292 pxor @y[3], @y[1]
293___
294	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
295 @x[2], @x[3], @y[2], @y[3], @t[2]);
296$code.=<<___;
297 pxor @t[0], @x[0]
298 pxor @t[0], @x[2]
299 pxor @t[1], @x[1]
300 pxor @t[1], @x[3]
301
302 movdqa @x[4], @t[0]
303 movdqa @x[5], @t[1]
304 pxor @x[6], @t[0]
305 pxor @x[7], @t[1]
306___
307 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
308 @x[6], @x[7], @y[2], @y[3], @t[2]);
309$code.=<<___;
310 pxor @y[2], @y[0]
311 pxor @y[3], @y[1]
312___
313 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
314$code.=<<___;
315 pxor @t[0], @x[4]
316 pxor @t[0], @x[6]
317 pxor @t[1], @x[5]
318 pxor @t[1], @x[7]
319___
320}
321sub Inv_GF256 {
322#;********************************************************************
323#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
324#;********************************************************************
325my @x=@_[0..7];
326my @t=@_[8..11];
327my @s=@_[12..15];
328# direct optimizations from hardware
329$code.=<<___;
330 movdqa @x[4], @t[3]
331 movdqa @x[5], @t[2]
332 movdqa @x[1], @t[1]
333 movdqa @x[7], @s[1]
334 movdqa @x[0], @s[0]
335
336 pxor @x[6], @t[3]
337 pxor @x[7], @t[2]
338 pxor @x[3], @t[1]
339 movdqa @t[3], @s[2]
340 pxor @x[6], @s[1]
341 movdqa @t[2], @t[0]
342 pxor @x[2], @s[0]
343 movdqa @t[3], @s[3]
344
345 por @t[1], @t[2]
346 por @s[0], @t[3]
347 pxor @t[0], @s[3]
348 pand @s[0], @s[2]
349 pxor @t[1], @s[0]
350 pand @t[1], @t[0]
351 pand @s[0], @s[3]
352 movdqa @x[3], @s[0]
353 pxor @x[2], @s[0]
354 pand @s[0], @s[1]
355 pxor @s[1], @t[3]
356 pxor @s[1], @t[2]
357 movdqa @x[4], @s[1]
358 movdqa @x[1], @s[0]
359 pxor @x[5], @s[1]
360 pxor @x[0], @s[0]
361 movdqa @s[1], @t[1]
362 pand @s[0], @s[1]
363 por @s[0], @t[1]
364 pxor @s[1], @t[0]
365 pxor @s[3], @t[3]
366 pxor @s[2], @t[2]
367 pxor @s[3], @t[1]
368 movdqa @x[7], @s[0]
369 pxor @s[2], @t[0]
370 movdqa @x[6], @s[1]
371 pxor @s[2], @t[1]
372 movdqa @x[5], @s[2]
373 pand @x[3], @s[0]
374 movdqa @x[4], @s[3]
375 pand @x[2], @s[1]
376 pand @x[1], @s[2]
377 por @x[0], @s[3]
378 pxor @s[0], @t[3]
379 pxor @s[1], @t[2]
380 pxor @s[2], @t[1]
381 pxor @s[3], @t[0]
382
383 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
384
385 # new smaller inversion
386
387 movdqa @t[3], @s[0]
388 pand @t[1], @t[3]
389 pxor @t[2], @s[0]
390
391 movdqa @t[0], @s[2]
392 movdqa @s[0], @s[3]
393 pxor @t[3], @s[2]
394 pand @s[2], @s[3]
395
396 movdqa @t[1], @s[1]
397 pxor @t[2], @s[3]
398 pxor @t[0], @s[1]
399
400 pxor @t[2], @t[3]
401
402 pand @t[3], @s[1]
403
404 movdqa @s[2], @t[2]
405 pxor @t[0], @s[1]
406
407 pxor @s[1], @t[2]
408 pxor @s[1], @t[1]
409
410 pand @t[0], @t[2]
411
412 pxor @t[2], @s[2]
413 pxor @t[2], @t[1]
414
415 pand @s[3], @s[2]
416
417 pxor @s[0], @s[2]
418___
419# output in s3, s2, s1, t1
420
421# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
422
423# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
424 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
425
426### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
427}
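# Inv_GF256 is the inversion half of the bit-sliced S-box: GF(2^8)
# inversion decomposed tower-field style into GF(2^4)/GF(2^2)
# arithmetic, cf. Mul_GF4 and Mul_GF16_2 above. Being pure AND/OR/XOR
# on bit-slices, it is inherently constant-time.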
428
429# AES linear components
430
431sub ShiftRows {
432my @x=@_[0..7];
433my $mask=pop;
434$code.=<<___;
435 pxor 0x00($key),@x[0]
436 pxor 0x10($key),@x[1]
437 pshufb $mask,@x[0]
438 pxor 0x20($key),@x[2]
439 pshufb $mask,@x[1]
440 pxor 0x30($key),@x[3]
441 pshufb $mask,@x[2]
442 pxor 0x40($key),@x[4]
443 pshufb $mask,@x[3]
444 pxor 0x50($key),@x[5]
445 pshufb $mask,@x[4]
446 pxor 0x60($key),@x[6]
447 pshufb $mask,@x[5]
448 pxor 0x70($key),@x[7]
449 pshufb $mask,@x[6]
450 lea 0x80($key),$key
451 pshufb $mask,@x[7]
452___
453}
454
455sub MixColumns {
456# modified to emit output in order suitable for feeding back to aesenc[last]
457my @x=@_[0..7];
458my @t=@_[8..15];
459my $inv=@_[16]; # optional
460$code.=<<___;
461 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
462 pshufd \$0x93, @x[1], @t[1]
463 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
464 pshufd \$0x93, @x[2], @t[2]
465 pxor @t[1], @x[1]
466 pshufd \$0x93, @x[3], @t[3]
467 pxor @t[2], @x[2]
468 pshufd \$0x93, @x[4], @t[4]
469 pxor @t[3], @x[3]
470 pshufd \$0x93, @x[5], @t[5]
471 pxor @t[4], @x[4]
472 pshufd \$0x93, @x[6], @t[6]
473 pxor @t[5], @x[5]
474 pshufd \$0x93, @x[7], @t[7]
475 pxor @t[6], @x[6]
476 pxor @t[7], @x[7]
477
478 pxor @x[0], @t[1]
479 pxor @x[7], @t[0]
480 pxor @x[7], @t[1]
481 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
482 pxor @x[1], @t[2]
483 pshufd \$0x4E, @x[1], @x[1]
484 pxor @x[4], @t[5]
485 pxor @t[0], @x[0]
486 pxor @x[5], @t[6]
487 pxor @t[1], @x[1]
488 pxor @x[3], @t[4]
489 pshufd \$0x4E, @x[4], @t[0]
490 pxor @x[6], @t[7]
491 pshufd \$0x4E, @x[5], @t[1]
492 pxor @x[2], @t[3]
493 pshufd \$0x4E, @x[3], @x[4]
494 pxor @x[7], @t[3]
495 pshufd \$0x4E, @x[7], @x[5]
496 pxor @x[7], @t[4]
497 pshufd \$0x4E, @x[6], @x[3]
498 pxor @t[4], @t[0]
499 pshufd \$0x4E, @x[2], @x[6]
500 pxor @t[5], @t[1]
501___
502$code.=<<___ if (!$inv);
503 pxor @t[3], @x[4]
504 pxor @t[7], @x[5]
505 pxor @t[6], @x[3]
506 movdqa @t[0], @x[2]
507 pxor @t[2], @x[6]
508 movdqa @t[1], @x[7]
509___
510$code.=<<___ if ($inv);
511 pxor @x[4], @t[3]
512 pxor @t[7], @x[5]
513 pxor @x[3], @t[6]
514 movdqa @t[0], @x[3]
515 pxor @t[2], @x[6]
516 movdqa @t[6], @x[2]
517 movdqa @t[1], @x[7]
518 movdqa @x[6], @x[4]
519 movdqa @t[3], @x[6]
520___
521}
522
523sub InvMixColumns_orig {
524my @x=@_[0..7];
525my @t=@_[8..15];
526
527$code.=<<___;
528 # multiplication by 0x0e
529 pshufd \$0x93, @x[7], @t[7]
530 movdqa @x[2], @t[2]
531 pxor @x[5], @x[7] # 7 5
532 pxor @x[5], @x[2] # 2 5
533 pshufd \$0x93, @x[0], @t[0]
534 movdqa @x[5], @t[5]
535 pxor @x[0], @x[5] # 5 0 [1]
536 pxor @x[1], @x[0] # 0 1
537 pshufd \$0x93, @x[1], @t[1]
538 pxor @x[2], @x[1] # 1 25
539 pxor @x[6], @x[0] # 01 6 [2]
540 pxor @x[3], @x[1] # 125 3 [4]
541 pshufd \$0x93, @x[3], @t[3]
542 pxor @x[0], @x[2] # 25 016 [3]
543 pxor @x[7], @x[3] # 3 75
544 pxor @x[6], @x[7] # 75 6 [0]
545 pshufd \$0x93, @x[6], @t[6]
546 movdqa @x[4], @t[4]
547 pxor @x[4], @x[6] # 6 4
548 pxor @x[3], @x[4] # 4 375 [6]
549 pxor @x[7], @x[3] # 375 756=36
550 pxor @t[5], @x[6] # 64 5 [7]
551 pxor @t[2], @x[3] # 36 2
552 pxor @t[4], @x[3] # 362 4 [5]
553 pshufd \$0x93, @t[5], @t[5]
554___
555 my @y = @x[7,5,0,2,1,3,4,6];
556$code.=<<___;
557 # multiplication by 0x0b
558 pxor @y[0], @y[1]
559 pxor @t[0], @y[0]
560 pxor @t[1], @y[1]
561 pshufd \$0x93, @t[2], @t[2]
562 pxor @t[5], @y[0]
563 pxor @t[6], @y[1]
564 pxor @t[7], @y[0]
565 pshufd \$0x93, @t[4], @t[4]
566 pxor @t[6], @t[7] # clobber t[7]
567 pxor @y[0], @y[1]
568
569 pxor @t[0], @y[3]
570 pshufd \$0x93, @t[0], @t[0]
571 pxor @t[1], @y[2]
572 pxor @t[1], @y[4]
573 pxor @t[2], @y[2]
574 pshufd \$0x93, @t[1], @t[1]
575 pxor @t[2], @y[3]
576 pxor @t[2], @y[5]
577 pxor @t[7], @y[2]
578 pshufd \$0x93, @t[2], @t[2]
579 pxor @t[3], @y[3]
580 pxor @t[3], @y[6]
581 pxor @t[3], @y[4]
582 pshufd \$0x93, @t[3], @t[3]
583 pxor @t[4], @y[7]
584 pxor @t[4], @y[5]
585 pxor @t[7], @y[7]
586 pxor @t[5], @y[3]
587 pxor @t[4], @y[4]
588 pxor @t[5], @t[7] # clobber t[7] even more
589
590 pxor @t[7], @y[5]
591 pshufd \$0x93, @t[4], @t[4]
592 pxor @t[7], @y[6]
593 pxor @t[7], @y[4]
594
595 pxor @t[5], @t[7]
596 pshufd \$0x93, @t[5], @t[5]
597 pxor @t[6], @t[7] # restore t[7]
598
599 # multiplication by 0x0d
600 pxor @y[7], @y[4]
601 pxor @t[4], @y[7]
602 pshufd \$0x93, @t[6], @t[6]
603 pxor @t[0], @y[2]
604 pxor @t[5], @y[7]
605 pxor @t[2], @y[2]
606 pshufd \$0x93, @t[7], @t[7]
607
608 pxor @y[1], @y[3]
609 pxor @t[1], @y[1]
610 pxor @t[0], @y[0]
611 pxor @t[0], @y[3]
612 pxor @t[5], @y[1]
613 pxor @t[5], @y[0]
614 pxor @t[7], @y[1]
615 pshufd \$0x93, @t[0], @t[0]
616 pxor @t[6], @y[0]
617 pxor @y[1], @y[3]
618 pxor @t[1], @y[4]
619 pshufd \$0x93, @t[1], @t[1]
620
621 pxor @t[7], @y[7]
622 pxor @t[2], @y[4]
623 pxor @t[2], @y[5]
624 pshufd \$0x93, @t[2], @t[2]
625 pxor @t[6], @y[2]
626 pxor @t[3], @t[6] # clobber t[6]
627 pxor @y[7], @y[4]
628 pxor @t[6], @y[3]
629
630 pxor @t[6], @y[6]
631 pxor @t[5], @y[5]
632 pxor @t[4], @y[6]
633 pshufd \$0x93, @t[4], @t[4]
634 pxor @t[6], @y[5]
635 pxor @t[7], @y[6]
636 pxor @t[3], @t[6] # restore t[6]
637
638 pshufd \$0x93, @t[5], @t[5]
639 pshufd \$0x93, @t[6], @t[6]
640 pshufd \$0x93, @t[7], @t[7]
641 pshufd \$0x93, @t[3], @t[3]
642
643 # multiplication by 0x09
644 pxor @y[1], @y[4]
645 pxor @y[1], @t[1] # t[1]=y[1]
646 pxor @t[5], @t[0] # clobber t[0]
647 pxor @t[5], @t[1]
648 pxor @t[0], @y[3]
649 pxor @y[0], @t[0] # t[0]=y[0]
650 pxor @t[6], @t[1]
651 pxor @t[7], @t[6] # clobber t[6]
652 pxor @t[1], @y[4]
653 pxor @t[4], @y[7]
654 pxor @y[4], @t[4] # t[4]=y[4]
655 pxor @t[3], @y[6]
656 pxor @y[3], @t[3] # t[3]=y[3]
657 pxor @t[2], @y[5]
658 pxor @y[2], @t[2] # t[2]=y[2]
659 pxor @t[7], @t[3]
660 pxor @y[5], @t[5] # t[5]=y[5]
661 pxor @t[6], @t[2]
662 pxor @t[6], @t[5]
663 pxor @y[6], @t[6] # t[6]=y[6]
664 pxor @y[7], @t[7] # t[7]=y[7]
665
666 movdqa @t[0],@XMM[0]
667 movdqa @t[1],@XMM[1]
668 movdqa @t[2],@XMM[2]
669 movdqa @t[3],@XMM[3]
670 movdqa @t[4],@XMM[4]
671 movdqa @t[5],@XMM[5]
672 movdqa @t[6],@XMM[6]
673 movdqa @t[7],@XMM[7]
674___
675}
676
677sub InvMixColumns {
678my @x=@_[0..7];
679my @t=@_[8..15];
680
681# Thanks to Jussi Kivilinna for providing a pointer to
682#
683# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
684# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
685# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
686# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
687
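# The factorisation is easy to check with a few lines of stand-alone
# Perl (a sketch, not used by this module): gmul() multiplies in
# GF(2^8) modulo the AES polynomial, and the loop multiplies the two
# column polynomials mod x^4+1.
#
#	sub gmul {
#		my ($a, $b) = @_;
#		my $r = 0;
#		for (1 .. 8) {
#			$r ^= $a if $b & 1;
#			$a = (($a << 1) ^ (($a & 0x80) ? 0x1b : 0)) & 0xff;
#			$b >>= 1;
#		}
#		return $r;
#	}
#	my @a = (0x02, 0x01, 0x01, 0x03);	# MixColumns c(x)
#	my @b = (0x05, 0x00, 0x04, 0x00);	# the 05-00-04-00 factor
#	for my $k (0 .. 3) {
#		my $c = 0;
#		$c ^= gmul($a[$_], $b[($k - $_ + 4) % 4]) for 0 .. 3;
#		printf "%02x ", $c;
#	}
#
# which prints "0e 09 0d 0b", the coefficients of the InvMixColumns
# polynomial {0b}x^3 + {0d}x^2 + {09}x + {0e}.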
688$code.=<<___;
689 # multiplication by 0x05-0x00-0x04-0x00
690 pshufd \$0x4E, @x[0], @t[0]
691 pshufd \$0x4E, @x[6], @t[6]
692 pxor @x[0], @t[0]
693 pshufd \$0x4E, @x[7], @t[7]
694 pxor @x[6], @t[6]
695 pshufd \$0x4E, @x[1], @t[1]
696 pxor @x[7], @t[7]
697 pshufd \$0x4E, @x[2], @t[2]
698 pxor @x[1], @t[1]
699 pshufd \$0x4E, @x[3], @t[3]
700 pxor @x[2], @t[2]
701 pxor @t[6], @x[0]
702 pxor @t[6], @x[1]
703 pshufd \$0x4E, @x[4], @t[4]
704 pxor @x[3], @t[3]
705 pxor @t[0], @x[2]
706 pxor @t[1], @x[3]
707 pshufd \$0x4E, @x[5], @t[5]
708 pxor @x[4], @t[4]
709 pxor @t[7], @x[1]
710 pxor @t[2], @x[4]
711 pxor @x[5], @t[5]
712
713 pxor @t[7], @x[2]
714 pxor @t[6], @x[3]
715 pxor @t[6], @x[4]
716 pxor @t[3], @x[5]
717 pxor @t[4], @x[6]
718 pxor @t[7], @x[4]
719 pxor @t[7], @x[5]
720 pxor @t[5], @x[7]
721___
722 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
723}
724
725sub aesenc { # not used
726my @b=@_[0..7];
727my @t=@_[8..15];
728$code.=<<___;
729 movdqa 0x30($const),@t[0] # .LSR
730___
731 &ShiftRows (@b,@t[0]);
732 &Sbox (@b,@t);
733 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
734}
735
736sub aesenclast { # not used
737my @b=@_[0..7];
738my @t=@_[8..15];
739$code.=<<___;
740 movdqa 0x40($const),@t[0] # .LSRM0
741___
742 &ShiftRows (@b,@t[0]);
743 &Sbox (@b,@t);
744$code.=<<___
745 pxor 0x00($key),@b[0]
746 pxor 0x10($key),@b[1]
747 pxor 0x20($key),@b[4]
748 pxor 0x30($key),@b[6]
749 pxor 0x40($key),@b[3]
750 pxor 0x50($key),@b[7]
751 pxor 0x60($key),@b[2]
752 pxor 0x70($key),@b[5]
753___
754}
755
756sub swapmove {
757my ($a,$b,$n,$mask,$t)=@_;
758$code.=<<___;
759 movdqa $b,$t
760 psrlq \$$n,$b
761 pxor $a,$b
762 pand $mask,$b
763 pxor $b,$a
764 psllq \$$n,$b
765 pxor $t,$b
766___
767}
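# swapmove is the classic "delta swap": it exchanges the bits of $a
# selected by $mask with the bits of $b sitting $n positions higher.
# A stand-alone scalar sketch of the same operation on plain integers:
#
#	sub delta_swap {
#		my ($a, $b, $n, $mask) = @_;
#		my $x = (($b >> $n) ^ $a) & $mask;
#		return ($a ^ $x, $b ^ ($x << $n));
#	}
#
# Three such swaps, with $n = 1, 2 and 4, are all it takes to transpose
# an 8x8 bit matrix; see bitslice() below.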
768sub swapmove2x {
769my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
770$code.=<<___;
771 movdqa $b0,$t0
772 psrlq \$$n,$b0
773 movdqa $b1,$t1
774 psrlq \$$n,$b1
775 pxor $a0,$b0
776 pxor $a1,$b1
777 pand $mask,$b0
778 pand $mask,$b1
779 pxor $b0,$a0
780 psllq \$$n,$b0
781 pxor $b1,$a1
782 psllq \$$n,$b1
783 pxor $t0,$b0
784 pxor $t1,$b1
785___
786}
787
788sub bitslice {
789my @x=reverse(@_[0..7]);
790my ($t0,$t1,$t2,$t3)=@_[8..11];
791$code.=<<___;
792 movdqa 0x00($const),$t0 # .LBS0
793 movdqa 0x10($const),$t1 # .LBS1
794___
795 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
796 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
797$code.=<<___;
798 movdqa 0x20($const),$t0 # .LBS2
799___
800 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
801 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
802
803 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
804 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
805}
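# bitslice converts eight 128-bit blocks held "horizontally" in eight
# XMM registers into eight bit-planes: the three passes with swap
# distances 1, 2 and 4 are the log2(8) stages of an 8x8 bit-matrix
# transpose, after which register i holds bit i of every byte. All of
# the AND/XOR S-box logic above operates on that representation.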
806
807$code.=<<___;
808.text
809
810.extern asm_AES_encrypt
811.extern asm_AES_decrypt
812
813.type _bsaes_encrypt8,\@abi-omnipotent
814.align 64
815_bsaes_encrypt8:
816 _CET_ENDBR
817 lea .LBS0(%rip), $const # constants table
818
819 movdqa ($key), @XMM[9] # round 0 key
820 lea 0x10($key), $key
821 movdqa 0x50($const), @XMM[8] # .LM0SR
822 pxor @XMM[9], @XMM[0] # xor with round0 key
823 pxor @XMM[9], @XMM[1]
824 pshufb @XMM[8], @XMM[0]
825 pxor @XMM[9], @XMM[2]
826 pshufb @XMM[8], @XMM[1]
827 pxor @XMM[9], @XMM[3]
828 pshufb @XMM[8], @XMM[2]
829 pxor @XMM[9], @XMM[4]
830 pshufb @XMM[8], @XMM[3]
831 pxor @XMM[9], @XMM[5]
832 pshufb @XMM[8], @XMM[4]
833 pxor @XMM[9], @XMM[6]
834 pshufb @XMM[8], @XMM[5]
835 pxor @XMM[9], @XMM[7]
836 pshufb @XMM[8], @XMM[6]
837 pshufb @XMM[8], @XMM[7]
838_bsaes_encrypt8_bitslice:
839___
840 &bitslice (@XMM[0..7, 8..11]);
841$code.=<<___;
842 dec $rounds
843 jmp .Lenc_sbox
844.align 16
845.Lenc_loop:
846___
847 &ShiftRows (@XMM[0..7, 8]);
848$code.=".Lenc_sbox:\n";
849 &Sbox (@XMM[0..7, 8..15]);
850$code.=<<___;
851 dec $rounds
852 jl .Lenc_done
853___
854 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
855$code.=<<___;
856 movdqa 0x30($const), @XMM[8] # .LSR
857 jnz .Lenc_loop
858 movdqa 0x40($const), @XMM[8] # .LSRM0
859 jmp .Lenc_loop
860.align 16
861.Lenc_done:
862___
863 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
864 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
865$code.=<<___;
866 movdqa ($key), @XMM[8] # last round key
867 pxor @XMM[8], @XMM[4]
868 pxor @XMM[8], @XMM[6]
869 pxor @XMM[8], @XMM[3]
870 pxor @XMM[8], @XMM[7]
871 pxor @XMM[8], @XMM[2]
872 pxor @XMM[8], @XMM[5]
873 pxor @XMM[8], @XMM[0]
874 pxor @XMM[8], @XMM[1]
875 ret
876.size _bsaes_encrypt8,.-_bsaes_encrypt8
877
878.type _bsaes_decrypt8,\@abi-omnipotent
879.align 64
880_bsaes_decrypt8:
881 _CET_ENDBR
882 lea .LBS0(%rip), $const # constants table
883
884 movdqa ($key), @XMM[9] # round 0 key
885 lea 0x10($key), $key
886 movdqa -0x30($const), @XMM[8] # .LM0ISR
887 pxor @XMM[9], @XMM[0] # xor with round0 key
888 pxor @XMM[9], @XMM[1]
889 pshufb @XMM[8], @XMM[0]
890 pxor @XMM[9], @XMM[2]
891 pshufb @XMM[8], @XMM[1]
892 pxor @XMM[9], @XMM[3]
893 pshufb @XMM[8], @XMM[2]
894 pxor @XMM[9], @XMM[4]
895 pshufb @XMM[8], @XMM[3]
896 pxor @XMM[9], @XMM[5]
897 pshufb @XMM[8], @XMM[4]
898 pxor @XMM[9], @XMM[6]
899 pshufb @XMM[8], @XMM[5]
900 pxor @XMM[9], @XMM[7]
901 pshufb @XMM[8], @XMM[6]
902 pshufb @XMM[8], @XMM[7]
903___
904 &bitslice (@XMM[0..7, 8..11]);
905$code.=<<___;
906 dec $rounds
907 jmp .Ldec_sbox
908.align 16
909.Ldec_loop:
910___
911 &ShiftRows (@XMM[0..7, 8]);
912$code.=".Ldec_sbox:\n";
913 &InvSbox (@XMM[0..7, 8..15]);
914$code.=<<___;
915 dec $rounds
916 jl .Ldec_done
917___
918 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
919$code.=<<___;
920 movdqa -0x10($const), @XMM[8] # .LISR
921 jnz .Ldec_loop
922 movdqa -0x20($const), @XMM[8] # .LISRM0
923 jmp .Ldec_loop
924.align 16
925.Ldec_done:
926___
927 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
928$code.=<<___;
929 movdqa ($key), @XMM[8] # last round key
930 pxor @XMM[8], @XMM[6]
931 pxor @XMM[8], @XMM[4]
932 pxor @XMM[8], @XMM[2]
933 pxor @XMM[8], @XMM[7]
934 pxor @XMM[8], @XMM[3]
935 pxor @XMM[8], @XMM[5]
936 pxor @XMM[8], @XMM[0]
937 pxor @XMM[8], @XMM[1]
938 ret
939.size _bsaes_decrypt8,.-_bsaes_decrypt8
940___
941}
942{
943my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
944
945sub bitslice_key {
946my @x=reverse(@_[0..7]);
947my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
948
949 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
950$code.=<<___;
951 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
952 movdqa @x[0], @x[2]
953 movdqa @x[1], @x[3]
954___
955 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
956
957 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
958$code.=<<___;
959 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
960 movdqa @x[0], @x[4]
961 movdqa @x[2], @x[6]
962 movdqa @x[1], @x[5]
963 movdqa @x[3], @x[7]
964___
965 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
966 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
967}
968
969$code.=<<___;
970.type _bsaes_key_convert,\@abi-omnipotent
971.align 16
972_bsaes_key_convert:
973 _CET_ENDBR
974 lea .Lmasks(%rip), $const
975 movdqu ($inp), %xmm7 # load round 0 key
976 lea 0x10($inp), $inp
977 movdqa 0x00($const), %xmm0 # 0x01...
978 movdqa 0x10($const), %xmm1 # 0x02...
979 movdqa 0x20($const), %xmm2 # 0x04...
980 movdqa 0x30($const), %xmm3 # 0x08...
981 movdqa 0x40($const), %xmm4 # .LM0
982 pcmpeqd %xmm5, %xmm5 # .LNOT
983
984 movdqu ($inp), %xmm6 # load round 1 key
985 movdqa %xmm7, ($out) # save round 0 key
986 lea 0x10($out), $out
987 dec $rounds
988 jmp .Lkey_loop
989.align 16
990.Lkey_loop:
991 pshufb %xmm4, %xmm6 # .LM0
992
993 movdqa %xmm0, %xmm8
994 movdqa %xmm1, %xmm9
995
996 pand %xmm6, %xmm8
997 pand %xmm6, %xmm9
998 movdqa %xmm2, %xmm10
999 pcmpeqb %xmm0, %xmm8
1000 psllq \$4, %xmm0 # 0x10...
1001 movdqa %xmm3, %xmm11
1002 pcmpeqb %xmm1, %xmm9
1003 psllq \$4, %xmm1 # 0x20...
1004
1005 pand %xmm6, %xmm10
1006 pand %xmm6, %xmm11
1007 movdqa %xmm0, %xmm12
1008 pcmpeqb %xmm2, %xmm10
1009 psllq \$4, %xmm2 # 0x40...
1010 movdqa %xmm1, %xmm13
1011 pcmpeqb %xmm3, %xmm11
1012 psllq \$4, %xmm3 # 0x80...
1013
1014 movdqa %xmm2, %xmm14
1015 movdqa %xmm3, %xmm15
1016 pxor %xmm5, %xmm8 # "pnot"
1017 pxor %xmm5, %xmm9
1018
1019 pand %xmm6, %xmm12
1020 pand %xmm6, %xmm13
1021 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1022 pcmpeqb %xmm0, %xmm12
1023 psrlq \$4, %xmm0 # 0x01...
1024 movdqa %xmm9, 0x10($out)
1025 pcmpeqb %xmm1, %xmm13
1026 psrlq \$4, %xmm1 # 0x02...
1027 lea 0x10($inp), $inp
1028
1029 pand %xmm6, %xmm14
1030 pand %xmm6, %xmm15
1031 movdqa %xmm10, 0x20($out)
1032 pcmpeqb %xmm2, %xmm14
1033 psrlq \$4, %xmm2 # 0x04...
1034 movdqa %xmm11, 0x30($out)
1035 pcmpeqb %xmm3, %xmm15
1036 psrlq \$4, %xmm3 # 0x08...
1037 movdqu ($inp), %xmm6 # load next round key
1038
1039 pxor %xmm5, %xmm13 # "pnot"
1040 pxor %xmm5, %xmm14
1041 movdqa %xmm12, 0x40($out)
1042 movdqa %xmm13, 0x50($out)
1043 movdqa %xmm14, 0x60($out)
1044 movdqa %xmm15, 0x70($out)
1045 lea 0x80($out),$out
1046 dec $rounds
1047 jnz .Lkey_loop
1048
1049 movdqa 0x50($const), %xmm7 # .L63
1050 #movdqa %xmm6, ($out) # don't save last round key
1051 ret
1052.size _bsaes_key_convert,.-_bsaes_key_convert
1053___
1054}
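# The four "pnot"s in the loop above invert bit-slices 0, 1, 5 and 6,
# i.e. they XOR every key byte with 0x63 = 0b01100011. The bit-sliced
# Sbox() omits the affine constant of the AES S-box, so it is folded
# into the converted round keys here and, via the .L63 constant left in
# %xmm7 for the callers, into the round 0 or last round key as well,
# depending on direction.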
1055
1056if (0 && !$win64) {	# the following four functions are an unsupported
1057			# interface, used for benchmarking...
1058$code.=<<___;
1059.globl bsaes_enc_key_convert
1060.type bsaes_enc_key_convert,\@function,2
1061.align 16
1062bsaes_enc_key_convert:
1063 _CET_ENDBR
1064 mov 240($inp),%r10d # pass rounds
1065 mov $inp,%rcx # pass key
1066 mov $out,%rax # pass key schedule
1067 call _bsaes_key_convert
1068 pxor %xmm6,%xmm7 # fix up last round key
1069 movdqa %xmm7,(%rax) # save last round key
1070 ret
1071.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1072
1073.globl bsaes_encrypt_128
1074.type bsaes_encrypt_128,\@function,4
1075.align 16
1076bsaes_encrypt_128:
1077	_CET_ENDBR
1078.Lenc128_loop:
1079 movdqu 0x00($inp), @XMM[0] # load input
1080 movdqu 0x10($inp), @XMM[1]
1081 movdqu 0x20($inp), @XMM[2]
1082 movdqu 0x30($inp), @XMM[3]
1083 movdqu 0x40($inp), @XMM[4]
1084 movdqu 0x50($inp), @XMM[5]
1085 movdqu 0x60($inp), @XMM[6]
1086 movdqu 0x70($inp), @XMM[7]
1087 mov $key, %rax # pass the $key
1088 lea 0x80($inp), $inp
1089 mov \$10,%r10d
1090
1091 call _bsaes_encrypt8
1092
1093 movdqu @XMM[0], 0x00($out) # write output
1094 movdqu @XMM[1], 0x10($out)
1095 movdqu @XMM[4], 0x20($out)
1096 movdqu @XMM[6], 0x30($out)
1097 movdqu @XMM[3], 0x40($out)
1098 movdqu @XMM[7], 0x50($out)
1099 movdqu @XMM[2], 0x60($out)
1100 movdqu @XMM[5], 0x70($out)
1101 lea 0x80($out), $out
1102 sub \$0x80,$len
1103 ja .Lenc128_loop
1104 ret
1105.size bsaes_encrypt_128,.-bsaes_encrypt_128
1106
1107.globl bsaes_dec_key_convert
1108.type bsaes_dec_key_convert,\@function,2
1109.align 16
1110bsaes_dec_key_convert:
1111 _CET_ENDBR
1112 mov 240($inp),%r10d # pass rounds
1113 mov $inp,%rcx # pass key
1114 mov $out,%rax # pass key schedule
1115 call _bsaes_key_convert
1116 pxor ($out),%xmm7 # fix up round 0 key
1117 movdqa %xmm6,(%rax) # save last round key
1118 movdqa %xmm7,($out)
1119 ret
1120.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1121
1122.globl bsaes_decrypt_128
1123.type bsaes_decrypt_128,\@function,4
1124.align 16
1125bsaes_decrypt_128:
1126 _CET_ENDBR
1127.Ldec128_loop:
1128 movdqu 0x00($inp), @XMM[0] # load input
1129 movdqu 0x10($inp), @XMM[1]
1130 movdqu 0x20($inp), @XMM[2]
1131 movdqu 0x30($inp), @XMM[3]
1132 movdqu 0x40($inp), @XMM[4]
1133 movdqu 0x50($inp), @XMM[5]
1134 movdqu 0x60($inp), @XMM[6]
1135 movdqu 0x70($inp), @XMM[7]
1136 mov $key, %rax # pass the $key
1137 lea 0x80($inp), $inp
1138 mov \$10,%r10d
1139
1140 call _bsaes_decrypt8
1141
1142 movdqu @XMM[0], 0x00($out) # write output
1143 movdqu @XMM[1], 0x10($out)
1144 movdqu @XMM[6], 0x20($out)
1145 movdqu @XMM[4], 0x30($out)
1146 movdqu @XMM[2], 0x40($out)
1147 movdqu @XMM[7], 0x50($out)
1148 movdqu @XMM[3], 0x60($out)
1149 movdqu @XMM[5], 0x70($out)
1150 lea 0x80($out), $out
1151 sub \$0x80,$len
1152 ja .Ldec128_loop
1153 ret
1154.size bsaes_decrypt_128,.-bsaes_decrypt_128
1155___
1156}
1157{
1158######################################################################
1159#
1160# OpenSSL interface
1161#
1162my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1163 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1164my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1165
1166if ($ecb) {
1167$code.=<<___;
1168.globl bsaes_ecb_encrypt_blocks
1169.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1170.align 16
1171bsaes_ecb_encrypt_blocks:
1172 _CET_ENDBR
1173 mov %rsp, %rax
1174.Lecb_enc_prologue:
1175 push %rbp
1176 push %rbx
1177 push %r12
1178 push %r13
1179 push %r14
1180 push %r15
1181 lea -0x48(%rsp),%rsp
1182___
1183$code.=<<___ if ($win64);
1184 lea -0xa0(%rsp), %rsp
1185 movaps %xmm6, 0x40(%rsp)
1186 movaps %xmm7, 0x50(%rsp)
1187 movaps %xmm8, 0x60(%rsp)
1188 movaps %xmm9, 0x70(%rsp)
1189 movaps %xmm10, 0x80(%rsp)
1190 movaps %xmm11, 0x90(%rsp)
1191 movaps %xmm12, 0xa0(%rsp)
1192 movaps %xmm13, 0xb0(%rsp)
1193 movaps %xmm14, 0xc0(%rsp)
1194 movaps %xmm15, 0xd0(%rsp)
1195.Lecb_enc_body:
1196___
1197$code.=<<___;
1198 mov %rsp,%rbp # backup %rsp
1199 mov 240($arg4),%eax # rounds
1200 mov $arg1,$inp # backup arguments
1201 mov $arg2,$out
1202 mov $arg3,$len
1203 mov $arg4,$key
1204 cmp \$8,$arg3
1205 jb .Lecb_enc_short
1206
1207 mov %eax,%ebx # backup rounds
1208 shl \$7,%rax # 128 bytes per inner round key
1209 sub \$`128-32`,%rax # size of bit-sliced key schedule
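	# (i.e. 128*rounds - 96 bytes: 16-byte round 0 key plus
	# (rounds-1)*128 bytes of bit-sliced keys plus 16-byte last round
	# key; 1184 bytes for AES-128's 10 rounds)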
1210 sub %rax,%rsp
1211 mov %rsp,%rax # pass key schedule
1212 mov $key,%rcx # pass key
1213 mov %ebx,%r10d # pass rounds
1214 call _bsaes_key_convert
1215 pxor %xmm6,%xmm7 # fix up last round key
1216 movdqa %xmm7,(%rax) # save last round key
1217
1218 sub \$8,$len
1219.Lecb_enc_loop:
1220 movdqu 0x00($inp), @XMM[0] # load input
1221 movdqu 0x10($inp), @XMM[1]
1222 movdqu 0x20($inp), @XMM[2]
1223 movdqu 0x30($inp), @XMM[3]
1224 movdqu 0x40($inp), @XMM[4]
1225 movdqu 0x50($inp), @XMM[5]
1226 mov %rsp, %rax # pass key schedule
1227 movdqu 0x60($inp), @XMM[6]
1228 mov %ebx,%r10d # pass rounds
1229 movdqu 0x70($inp), @XMM[7]
1230 lea 0x80($inp), $inp
1231
1232 call _bsaes_encrypt8
1233
1234 movdqu @XMM[0], 0x00($out) # write output
1235 movdqu @XMM[1], 0x10($out)
1236 movdqu @XMM[4], 0x20($out)
1237 movdqu @XMM[6], 0x30($out)
1238 movdqu @XMM[3], 0x40($out)
1239 movdqu @XMM[7], 0x50($out)
1240 movdqu @XMM[2], 0x60($out)
1241 movdqu @XMM[5], 0x70($out)
1242 lea 0x80($out), $out
1243 sub \$8,$len
1244 jnc .Lecb_enc_loop
1245
1246 add \$8,$len
1247 jz .Lecb_enc_done
1248
1249 movdqu 0x00($inp), @XMM[0] # load input
1250 mov %rsp, %rax # pass key schedule
1251 mov %ebx,%r10d # pass rounds
1252 cmp \$2,$len
1253 jb .Lecb_enc_one
1254 movdqu 0x10($inp), @XMM[1]
1255 je .Lecb_enc_two
1256 movdqu 0x20($inp), @XMM[2]
1257 cmp \$4,$len
1258 jb .Lecb_enc_three
1259 movdqu 0x30($inp), @XMM[3]
1260 je .Lecb_enc_four
1261 movdqu 0x40($inp), @XMM[4]
1262 cmp \$6,$len
1263 jb .Lecb_enc_five
1264 movdqu 0x50($inp), @XMM[5]
1265 je .Lecb_enc_six
1266 movdqu 0x60($inp), @XMM[6]
1267 call _bsaes_encrypt8
1268 movdqu @XMM[0], 0x00($out) # write output
1269 movdqu @XMM[1], 0x10($out)
1270 movdqu @XMM[4], 0x20($out)
1271 movdqu @XMM[6], 0x30($out)
1272 movdqu @XMM[3], 0x40($out)
1273 movdqu @XMM[7], 0x50($out)
1274 movdqu @XMM[2], 0x60($out)
1275 jmp .Lecb_enc_done
1276.align 16
1277.Lecb_enc_six:
1278 call _bsaes_encrypt8
1279 movdqu @XMM[0], 0x00($out) # write output
1280 movdqu @XMM[1], 0x10($out)
1281 movdqu @XMM[4], 0x20($out)
1282 movdqu @XMM[6], 0x30($out)
1283 movdqu @XMM[3], 0x40($out)
1284 movdqu @XMM[7], 0x50($out)
1285 jmp .Lecb_enc_done
1286.align 16
1287.Lecb_enc_five:
1288 call _bsaes_encrypt8
1289 movdqu @XMM[0], 0x00($out) # write output
1290 movdqu @XMM[1], 0x10($out)
1291 movdqu @XMM[4], 0x20($out)
1292 movdqu @XMM[6], 0x30($out)
1293 movdqu @XMM[3], 0x40($out)
1294 jmp .Lecb_enc_done
1295.align 16
1296.Lecb_enc_four:
1297 call _bsaes_encrypt8
1298 movdqu @XMM[0], 0x00($out) # write output
1299 movdqu @XMM[1], 0x10($out)
1300 movdqu @XMM[4], 0x20($out)
1301 movdqu @XMM[6], 0x30($out)
1302 jmp .Lecb_enc_done
1303.align 16
1304.Lecb_enc_three:
1305 call _bsaes_encrypt8
1306 movdqu @XMM[0], 0x00($out) # write output
1307 movdqu @XMM[1], 0x10($out)
1308 movdqu @XMM[4], 0x20($out)
1309 jmp .Lecb_enc_done
1310.align 16
1311.Lecb_enc_two:
1312 call _bsaes_encrypt8
1313 movdqu @XMM[0], 0x00($out) # write output
1314 movdqu @XMM[1], 0x10($out)
1315 jmp .Lecb_enc_done
1316.align 16
1317.Lecb_enc_one:
1318 call _bsaes_encrypt8
1319 movdqu @XMM[0], 0x00($out) # write output
1320 jmp .Lecb_enc_done
1321.align 16
1322.Lecb_enc_short:
1323 lea ($inp), $arg1
1324 lea ($out), $arg2
1325 lea ($key), $arg3
1326 call asm_AES_encrypt
1327 lea 16($inp), $inp
1328 lea 16($out), $out
1329 dec $len
1330 jnz .Lecb_enc_short
1331
1332.Lecb_enc_done:
1333 lea (%rsp),%rax
1334 pxor %xmm0, %xmm0
1335.Lecb_enc_bzero: # wipe key schedule [if any]
1336 movdqa %xmm0, 0x00(%rax)
1337 movdqa %xmm0, 0x10(%rax)
1338 lea 0x20(%rax), %rax
1339 cmp %rax, %rbp
1340 jb .Lecb_enc_bzero
1341
1342 lea (%rbp),%rsp # restore %rsp
1343___
1344$code.=<<___ if ($win64);
1345 movaps 0x40(%rbp), %xmm6
1346 movaps 0x50(%rbp), %xmm7
1347 movaps 0x60(%rbp), %xmm8
1348 movaps 0x70(%rbp), %xmm9
1349 movaps 0x80(%rbp), %xmm10
1350 movaps 0x90(%rbp), %xmm11
1351 movaps 0xa0(%rbp), %xmm12
1352 movaps 0xb0(%rbp), %xmm13
1353 movaps 0xc0(%rbp), %xmm14
1354 movaps 0xd0(%rbp), %xmm15
1355 lea 0xa0(%rbp), %rsp
1356___
1357$code.=<<___;
1358 mov 0x48(%rsp), %r15
1359 mov 0x50(%rsp), %r14
1360 mov 0x58(%rsp), %r13
1361 mov 0x60(%rsp), %r12
1362 mov 0x68(%rsp), %rbx
1363 mov 0x70(%rsp), %rax
1364 lea 0x78(%rsp), %rsp
1365 mov %rax, %rbp
1366.Lecb_enc_epilogue:
1367 ret
1368.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1369
1370.globl bsaes_ecb_decrypt_blocks
1371.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1372.align 16
1373bsaes_ecb_decrypt_blocks:
1374 _CET_ENDBR
1375 mov %rsp, %rax
1376.Lecb_dec_prologue:
1377 push %rbp
1378 push %rbx
1379 push %r12
1380 push %r13
1381 push %r14
1382 push %r15
1383 lea -0x48(%rsp),%rsp
1384___
1385$code.=<<___ if ($win64);
1386 lea -0xa0(%rsp), %rsp
1387 movaps %xmm6, 0x40(%rsp)
1388 movaps %xmm7, 0x50(%rsp)
1389 movaps %xmm8, 0x60(%rsp)
1390 movaps %xmm9, 0x70(%rsp)
1391 movaps %xmm10, 0x80(%rsp)
1392 movaps %xmm11, 0x90(%rsp)
1393 movaps %xmm12, 0xa0(%rsp)
1394 movaps %xmm13, 0xb0(%rsp)
1395 movaps %xmm14, 0xc0(%rsp)
1396 movaps %xmm15, 0xd0(%rsp)
1397.Lecb_dec_body:
1398___
1399$code.=<<___;
1400 mov %rsp,%rbp # backup %rsp
1401 mov 240($arg4),%eax # rounds
1402 mov $arg1,$inp # backup arguments
1403 mov $arg2,$out
1404 mov $arg3,$len
1405 mov $arg4,$key
1406 cmp \$8,$arg3
1407 jb .Lecb_dec_short
1408
1409 mov %eax,%ebx # backup rounds
1410 shl \$7,%rax # 128 bytes per inner round key
1411 sub \$`128-32`,%rax # size of bit-sliced key schedule
1412 sub %rax,%rsp
1413 mov %rsp,%rax # pass key schedule
1414 mov $key,%rcx # pass key
1415 mov %ebx,%r10d # pass rounds
1416 call _bsaes_key_convert
1417	pxor	(%rsp),%xmm7		# fix up round 0 key
1418 movdqa %xmm6,(%rax) # save last round key
1419 movdqa %xmm7,(%rsp)
1420
1421 sub \$8,$len
1422.Lecb_dec_loop:
1423 movdqu 0x00($inp), @XMM[0] # load input
1424 movdqu 0x10($inp), @XMM[1]
1425 movdqu 0x20($inp), @XMM[2]
1426 movdqu 0x30($inp), @XMM[3]
1427 movdqu 0x40($inp), @XMM[4]
1428 movdqu 0x50($inp), @XMM[5]
1429 mov %rsp, %rax # pass key schedule
1430 movdqu 0x60($inp), @XMM[6]
1431 mov %ebx,%r10d # pass rounds
1432 movdqu 0x70($inp), @XMM[7]
1433 lea 0x80($inp), $inp
1434
1435 call _bsaes_decrypt8
1436
1437 movdqu @XMM[0], 0x00($out) # write output
1438 movdqu @XMM[1], 0x10($out)
1439 movdqu @XMM[6], 0x20($out)
1440 movdqu @XMM[4], 0x30($out)
1441 movdqu @XMM[2], 0x40($out)
1442 movdqu @XMM[7], 0x50($out)
1443 movdqu @XMM[3], 0x60($out)
1444 movdqu @XMM[5], 0x70($out)
1445 lea 0x80($out), $out
1446 sub \$8,$len
1447 jnc .Lecb_dec_loop
1448
1449 add \$8,$len
1450 jz .Lecb_dec_done
1451
1452 movdqu 0x00($inp), @XMM[0] # load input
1453 mov %rsp, %rax # pass key schedule
1454 mov %ebx,%r10d # pass rounds
1455 cmp \$2,$len
1456 jb .Lecb_dec_one
1457 movdqu 0x10($inp), @XMM[1]
1458 je .Lecb_dec_two
1459 movdqu 0x20($inp), @XMM[2]
1460 cmp \$4,$len
1461 jb .Lecb_dec_three
1462 movdqu 0x30($inp), @XMM[3]
1463 je .Lecb_dec_four
1464 movdqu 0x40($inp), @XMM[4]
1465 cmp \$6,$len
1466 jb .Lecb_dec_five
1467 movdqu 0x50($inp), @XMM[5]
1468 je .Lecb_dec_six
1469 movdqu 0x60($inp), @XMM[6]
1470 call _bsaes_decrypt8
1471 movdqu @XMM[0], 0x00($out) # write output
1472 movdqu @XMM[1], 0x10($out)
1473 movdqu @XMM[6], 0x20($out)
1474 movdqu @XMM[4], 0x30($out)
1475 movdqu @XMM[2], 0x40($out)
1476 movdqu @XMM[7], 0x50($out)
1477 movdqu @XMM[3], 0x60($out)
1478 jmp .Lecb_dec_done
1479.align 16
1480.Lecb_dec_six:
1481 call _bsaes_decrypt8
1482 movdqu @XMM[0], 0x00($out) # write output
1483 movdqu @XMM[1], 0x10($out)
1484 movdqu @XMM[6], 0x20($out)
1485 movdqu @XMM[4], 0x30($out)
1486 movdqu @XMM[2], 0x40($out)
1487 movdqu @XMM[7], 0x50($out)
1488 jmp .Lecb_dec_done
1489.align 16
1490.Lecb_dec_five:
1491 call _bsaes_decrypt8
1492 movdqu @XMM[0], 0x00($out) # write output
1493 movdqu @XMM[1], 0x10($out)
1494 movdqu @XMM[6], 0x20($out)
1495 movdqu @XMM[4], 0x30($out)
1496 movdqu @XMM[2], 0x40($out)
1497 jmp .Lecb_dec_done
1498.align 16
1499.Lecb_dec_four:
1500 call _bsaes_decrypt8
1501 movdqu @XMM[0], 0x00($out) # write output
1502 movdqu @XMM[1], 0x10($out)
1503 movdqu @XMM[6], 0x20($out)
1504 movdqu @XMM[4], 0x30($out)
1505 jmp .Lecb_dec_done
1506.align 16
1507.Lecb_dec_three:
1508 call _bsaes_decrypt8
1509 movdqu @XMM[0], 0x00($out) # write output
1510 movdqu @XMM[1], 0x10($out)
1511 movdqu @XMM[6], 0x20($out)
1512 jmp .Lecb_dec_done
1513.align 16
1514.Lecb_dec_two:
1515 call _bsaes_decrypt8
1516 movdqu @XMM[0], 0x00($out) # write output
1517 movdqu @XMM[1], 0x10($out)
1518 jmp .Lecb_dec_done
1519.align 16
1520.Lecb_dec_one:
1521 call _bsaes_decrypt8
1522 movdqu @XMM[0], 0x00($out) # write output
1523 jmp .Lecb_dec_done
1524.align 16
1525.Lecb_dec_short:
1526 lea ($inp), $arg1
1527 lea ($out), $arg2
1528 lea ($key), $arg3
1529 call asm_AES_decrypt
1530 lea 16($inp), $inp
1531 lea 16($out), $out
1532 dec $len
1533 jnz .Lecb_dec_short
1534
1535.Lecb_dec_done:
1536 lea (%rsp),%rax
1537 pxor %xmm0, %xmm0
1538.Lecb_dec_bzero: # wipe key schedule [if any]
1539 movdqa %xmm0, 0x00(%rax)
1540 movdqa %xmm0, 0x10(%rax)
1541 lea 0x20(%rax), %rax
1542 cmp %rax, %rbp
1543 jb .Lecb_dec_bzero
1544
1545 lea (%rbp),%rsp # restore %rsp
1546___
1547$code.=<<___ if ($win64);
1548 movaps 0x40(%rbp), %xmm6
1549 movaps 0x50(%rbp), %xmm7
1550 movaps 0x60(%rbp), %xmm8
1551 movaps 0x70(%rbp), %xmm9
1552 movaps 0x80(%rbp), %xmm10
1553 movaps 0x90(%rbp), %xmm11
1554 movaps 0xa0(%rbp), %xmm12
1555 movaps 0xb0(%rbp), %xmm13
1556 movaps 0xc0(%rbp), %xmm14
1557 movaps 0xd0(%rbp), %xmm15
1558 lea 0xa0(%rbp), %rsp
1559___
1560$code.=<<___;
1561 mov 0x48(%rsp), %r15
1562 mov 0x50(%rsp), %r14
1563 mov 0x58(%rsp), %r13
1564 mov 0x60(%rsp), %r12
1565 mov 0x68(%rsp), %rbx
1566 mov 0x70(%rsp), %rax
1567 lea 0x78(%rsp), %rsp
1568 mov %rax, %rbp
1569.Lecb_dec_epilogue:
1570 ret
1571.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1572___
1573}
1574$code.=<<___;
1575.extern asm_AES_cbc_encrypt
1576.globl bsaes_cbc_encrypt
1577.type bsaes_cbc_encrypt,\@abi-omnipotent
1578.align 16
1579bsaes_cbc_encrypt:
1580 _CET_ENDBR
1581___
1582$code.=<<___ if ($win64);
1583 mov 48(%rsp),$arg6 # pull direction flag
1584___
1585$code.=<<___;
1586 cmp \$0,$arg6
1587 jne asm_AES_cbc_encrypt
1588 cmp \$128,$arg3
1589 jb asm_AES_cbc_encrypt
1590
1591 mov %rsp, %rax
1592.Lcbc_dec_prologue:
1593 push %rbp
1594 push %rbx
1595 push %r12
1596 push %r13
1597 push %r14
1598 push %r15
1599 lea -0x48(%rsp), %rsp
1600___
1601$code.=<<___ if ($win64);
1602 mov 0xa0(%rsp),$arg5 # pull ivp
1603 lea -0xa0(%rsp), %rsp
1604 movaps %xmm6, 0x40(%rsp)
1605 movaps %xmm7, 0x50(%rsp)
1606 movaps %xmm8, 0x60(%rsp)
1607 movaps %xmm9, 0x70(%rsp)
1608 movaps %xmm10, 0x80(%rsp)
1609 movaps %xmm11, 0x90(%rsp)
1610 movaps %xmm12, 0xa0(%rsp)
1611 movaps %xmm13, 0xb0(%rsp)
1612 movaps %xmm14, 0xc0(%rsp)
1613 movaps %xmm15, 0xd0(%rsp)
1614.Lcbc_dec_body:
1615___
1616$code.=<<___;
1617 mov %rsp, %rbp # backup %rsp
1618 mov 240($arg4), %eax # rounds
1619 mov $arg1, $inp # backup arguments
1620 mov $arg2, $out
1621 mov $arg3, $len
1622 mov $arg4, $key
1623 mov $arg5, %rbx
1624 shr \$4, $len # bytes to blocks
1625
1626 mov %eax, %edx # rounds
1627 shl \$7, %rax # 128 bytes per inner round key
1628 sub \$`128-32`, %rax # size of bit-sliced key schedule
1629 sub %rax, %rsp
1630
1631 mov %rsp, %rax # pass key schedule
1632 mov $key, %rcx # pass key
1633 mov %edx, %r10d # pass rounds
1634 call _bsaes_key_convert
1635	pxor	(%rsp),%xmm7		# fix up round 0 key
1636 movdqa %xmm6,(%rax) # save last round key
1637 movdqa %xmm7,(%rsp)
1638
1639 movdqu (%rbx), @XMM[15] # load IV
1640 sub \$8,$len
1641.Lcbc_dec_loop:
1642 movdqu 0x00($inp), @XMM[0] # load input
1643 movdqu 0x10($inp), @XMM[1]
1644 movdqu 0x20($inp), @XMM[2]
1645 movdqu 0x30($inp), @XMM[3]
1646 movdqu 0x40($inp), @XMM[4]
1647 movdqu 0x50($inp), @XMM[5]
1648 mov %rsp, %rax # pass key schedule
1649 movdqu 0x60($inp), @XMM[6]
1650 mov %edx,%r10d # pass rounds
1651 movdqu 0x70($inp), @XMM[7]
1652 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1653
1654 call _bsaes_decrypt8
1655
1656 pxor 0x20(%rbp), @XMM[0] # ^= IV
1657 movdqu 0x00($inp), @XMM[8] # re-load input
1658 movdqu 0x10($inp), @XMM[9]
1659 pxor @XMM[8], @XMM[1]
1660 movdqu 0x20($inp), @XMM[10]
1661 pxor @XMM[9], @XMM[6]
1662 movdqu 0x30($inp), @XMM[11]
1663 pxor @XMM[10], @XMM[4]
1664 movdqu 0x40($inp), @XMM[12]
1665 pxor @XMM[11], @XMM[2]
1666 movdqu 0x50($inp), @XMM[13]
1667 pxor @XMM[12], @XMM[7]
1668 movdqu 0x60($inp), @XMM[14]
1669 pxor @XMM[13], @XMM[3]
1670 movdqu 0x70($inp), @XMM[15] # IV
1671 pxor @XMM[14], @XMM[5]
1672 movdqu @XMM[0], 0x00($out) # write output
1673 lea 0x80($inp), $inp
1674 movdqu @XMM[1], 0x10($out)
1675 movdqu @XMM[6], 0x20($out)
1676 movdqu @XMM[4], 0x30($out)
1677 movdqu @XMM[2], 0x40($out)
1678 movdqu @XMM[7], 0x50($out)
1679 movdqu @XMM[3], 0x60($out)
1680 movdqu @XMM[5], 0x70($out)
1681 lea 0x80($out), $out
1682 sub \$8,$len
1683 jnc .Lcbc_dec_loop
1684
1685 add \$8,$len
1686 jz .Lcbc_dec_done
1687
1688 movdqu 0x00($inp), @XMM[0] # load input
1689 mov %rsp, %rax # pass key schedule
1690 mov %edx, %r10d # pass rounds
1691 cmp \$2,$len
1692 jb .Lcbc_dec_one
1693 movdqu 0x10($inp), @XMM[1]
1694 je .Lcbc_dec_two
1695 movdqu 0x20($inp), @XMM[2]
1696 cmp \$4,$len
1697 jb .Lcbc_dec_three
1698 movdqu 0x30($inp), @XMM[3]
1699 je .Lcbc_dec_four
1700 movdqu 0x40($inp), @XMM[4]
1701 cmp \$6,$len
1702 jb .Lcbc_dec_five
1703 movdqu 0x50($inp), @XMM[5]
1704 je .Lcbc_dec_six
1705 movdqu 0x60($inp), @XMM[6]
1706 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1707 call _bsaes_decrypt8
1708 pxor 0x20(%rbp), @XMM[0] # ^= IV
1709 movdqu 0x00($inp), @XMM[8] # re-load input
1710 movdqu 0x10($inp), @XMM[9]
1711 pxor @XMM[8], @XMM[1]
1712 movdqu 0x20($inp), @XMM[10]
1713 pxor @XMM[9], @XMM[6]
1714 movdqu 0x30($inp), @XMM[11]
1715 pxor @XMM[10], @XMM[4]
1716 movdqu 0x40($inp), @XMM[12]
1717 pxor @XMM[11], @XMM[2]
1718 movdqu 0x50($inp), @XMM[13]
1719 pxor @XMM[12], @XMM[7]
1720 movdqu 0x60($inp), @XMM[15] # IV
1721 pxor @XMM[13], @XMM[3]
1722 movdqu @XMM[0], 0x00($out) # write output
1723 movdqu @XMM[1], 0x10($out)
1724 movdqu @XMM[6], 0x20($out)
1725 movdqu @XMM[4], 0x30($out)
1726 movdqu @XMM[2], 0x40($out)
1727 movdqu @XMM[7], 0x50($out)
1728 movdqu @XMM[3], 0x60($out)
1729 jmp .Lcbc_dec_done
1730.align 16
1731.Lcbc_dec_six:
1732 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1733 call _bsaes_decrypt8
1734 pxor 0x20(%rbp), @XMM[0] # ^= IV
1735 movdqu 0x00($inp), @XMM[8] # re-load input
1736 movdqu 0x10($inp), @XMM[9]
1737 pxor @XMM[8], @XMM[1]
1738 movdqu 0x20($inp), @XMM[10]
1739 pxor @XMM[9], @XMM[6]
1740 movdqu 0x30($inp), @XMM[11]
1741 pxor @XMM[10], @XMM[4]
1742 movdqu 0x40($inp), @XMM[12]
1743 pxor @XMM[11], @XMM[2]
1744 movdqu 0x50($inp), @XMM[15] # IV
1745 pxor @XMM[12], @XMM[7]
1746 movdqu @XMM[0], 0x00($out) # write output
1747 movdqu @XMM[1], 0x10($out)
1748 movdqu @XMM[6], 0x20($out)
1749 movdqu @XMM[4], 0x30($out)
1750 movdqu @XMM[2], 0x40($out)
1751 movdqu @XMM[7], 0x50($out)
1752 jmp .Lcbc_dec_done
1753.align 16
1754.Lcbc_dec_five:
1755 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1756 call _bsaes_decrypt8
1757 pxor 0x20(%rbp), @XMM[0] # ^= IV
1758 movdqu 0x00($inp), @XMM[8] # re-load input
1759 movdqu 0x10($inp), @XMM[9]
1760 pxor @XMM[8], @XMM[1]
1761 movdqu 0x20($inp), @XMM[10]
1762 pxor @XMM[9], @XMM[6]
1763 movdqu 0x30($inp), @XMM[11]
1764 pxor @XMM[10], @XMM[4]
1765 movdqu 0x40($inp), @XMM[15] # IV
1766 pxor @XMM[11], @XMM[2]
1767 movdqu @XMM[0], 0x00($out) # write output
1768 movdqu @XMM[1], 0x10($out)
1769 movdqu @XMM[6], 0x20($out)
1770 movdqu @XMM[4], 0x30($out)
1771 movdqu @XMM[2], 0x40($out)
1772 jmp .Lcbc_dec_done
1773.align 16
1774.Lcbc_dec_four:
1775 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1776 call _bsaes_decrypt8
1777 pxor 0x20(%rbp), @XMM[0] # ^= IV
1778 movdqu 0x00($inp), @XMM[8] # re-load input
1779 movdqu 0x10($inp), @XMM[9]
1780 pxor @XMM[8], @XMM[1]
1781 movdqu 0x20($inp), @XMM[10]
1782 pxor @XMM[9], @XMM[6]
1783 movdqu 0x30($inp), @XMM[15] # IV
1784 pxor @XMM[10], @XMM[4]
1785 movdqu @XMM[0], 0x00($out) # write output
1786 movdqu @XMM[1], 0x10($out)
1787 movdqu @XMM[6], 0x20($out)
1788 movdqu @XMM[4], 0x30($out)
1789 jmp .Lcbc_dec_done
1790.align 16
1791.Lcbc_dec_three:
1792 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1793 call _bsaes_decrypt8
1794 pxor 0x20(%rbp), @XMM[0] # ^= IV
1795 movdqu 0x00($inp), @XMM[8] # re-load input
1796 movdqu 0x10($inp), @XMM[9]
1797 pxor @XMM[8], @XMM[1]
1798 movdqu 0x20($inp), @XMM[15] # IV
1799 pxor @XMM[9], @XMM[6]
1800 movdqu @XMM[0], 0x00($out) # write output
1801 movdqu @XMM[1], 0x10($out)
1802 movdqu @XMM[6], 0x20($out)
1803 jmp .Lcbc_dec_done
1804.align 16
1805.Lcbc_dec_two:
1806 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1807 call _bsaes_decrypt8
1808 pxor 0x20(%rbp), @XMM[0] # ^= IV
1809 movdqu 0x00($inp), @XMM[8] # re-load input
1810 movdqu 0x10($inp), @XMM[15] # IV
1811 pxor @XMM[8], @XMM[1]
1812 movdqu @XMM[0], 0x00($out) # write output
1813 movdqu @XMM[1], 0x10($out)
1814 jmp .Lcbc_dec_done
1815.align 16
1816.Lcbc_dec_one:
1817 lea ($inp), $arg1
1818 lea 0x20(%rbp), $arg2 # buffer output
1819 lea ($key), $arg3
1820 call asm_AES_decrypt # doesn't touch %xmm
1821 pxor 0x20(%rbp), @XMM[15] # ^= IV
1822 movdqu @XMM[15], ($out) # write output
1823 movdqa @XMM[0], @XMM[15] # IV
1824
1825.Lcbc_dec_done:
1826 movdqu @XMM[15], (%rbx) # return IV
1827 lea (%rsp), %rax
1828 pxor %xmm0, %xmm0
1829.Lcbc_dec_bzero: # wipe key schedule [if any]
1830 movdqa %xmm0, 0x00(%rax)
1831 movdqa %xmm0, 0x10(%rax)
1832 lea 0x20(%rax), %rax
1833 cmp %rax, %rbp
1834 ja .Lcbc_dec_bzero
1835
1836 lea (%rbp),%rsp # restore %rsp
1837___
1838$code.=<<___ if ($win64);
1839 movaps 0x40(%rbp), %xmm6
1840 movaps 0x50(%rbp), %xmm7
1841 movaps 0x60(%rbp), %xmm8
1842 movaps 0x70(%rbp), %xmm9
1843 movaps 0x80(%rbp), %xmm10
1844 movaps 0x90(%rbp), %xmm11
1845 movaps 0xa0(%rbp), %xmm12
1846 movaps 0xb0(%rbp), %xmm13
1847 movaps 0xc0(%rbp), %xmm14
1848 movaps 0xd0(%rbp), %xmm15
1849 lea 0xa0(%rbp), %rsp
1850___
1851$code.=<<___;
1852 mov 0x48(%rsp), %r15
1853 mov 0x50(%rsp), %r14
1854 mov 0x58(%rsp), %r13
1855 mov 0x60(%rsp), %r12
1856 mov 0x68(%rsp), %rbx
1857 mov 0x70(%rsp), %rax
1858 lea 0x78(%rsp), %rsp
1859 mov %rax, %rbp
1860.Lcbc_dec_epilogue:
1861 ret
1862.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
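# The decryption loop above computes P[i] = D(C[i]) ^ C[i-1], with
# C[-1] = IV, which is why each batch re-loads its ciphertext after
# _bsaes_decrypt8: the previous ciphertext blocks double as the
# chaining values. That is what makes eight-way parallel CBC decryption
# possible, and why encryption (inherently serial) is delegated to
# asm_AES_cbc_encrypt at the top of the function.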
1863
1864.globl bsaes_ctr32_encrypt_blocks
1865.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1866.align 16
1867bsaes_ctr32_encrypt_blocks:
1868 _CET_ENDBR
1869 mov %rsp, %rax
1870.Lctr_enc_prologue:
1871 push %rbp
1872 push %rbx
1873 push %r12
1874 push %r13
1875 push %r14
1876 push %r15
1877 lea -0x48(%rsp), %rsp
1878___
1879$code.=<<___ if ($win64);
1880 mov 0xa0(%rsp),$arg5 # pull ivp
1881 lea -0xa0(%rsp), %rsp
1882 movaps %xmm6, 0x40(%rsp)
1883 movaps %xmm7, 0x50(%rsp)
1884 movaps %xmm8, 0x60(%rsp)
1885 movaps %xmm9, 0x70(%rsp)
1886 movaps %xmm10, 0x80(%rsp)
1887 movaps %xmm11, 0x90(%rsp)
1888 movaps %xmm12, 0xa0(%rsp)
1889 movaps %xmm13, 0xb0(%rsp)
1890 movaps %xmm14, 0xc0(%rsp)
1891 movaps %xmm15, 0xd0(%rsp)
1892.Lctr_enc_body:
1893___
1894$code.=<<___;
1895 mov %rsp, %rbp # backup %rsp
1896 movdqu ($arg5), %xmm0 # load counter
1897 mov 240($arg4), %eax # rounds
1898 mov $arg1, $inp # backup arguments
1899 mov $arg2, $out
1900 mov $arg3, $len
1901 mov $arg4, $key
1902 movdqa %xmm0, 0x20(%rbp) # copy counter
1903 cmp \$8, $arg3
1904 jb .Lctr_enc_short
1905
1906 mov %eax, %ebx # rounds
1907 shl \$7, %rax # 128 bytes per inner round key
1908 sub \$`128-32`, %rax # size of bit-sliced key schedule
1909 sub %rax, %rsp
1910
1911 mov %rsp, %rax # pass key schedule
1912 mov $key, %rcx # pass key
1913 mov %ebx, %r10d # pass rounds
1914 call _bsaes_key_convert
1915 pxor %xmm6,%xmm7 # fix up last round key
1916 movdqa %xmm7,(%rax) # save last round key
1917
1918 movdqa (%rsp), @XMM[9] # load round0 key
1919 lea .LADD1(%rip), %r11
1920 movdqa 0x20(%rbp), @XMM[0] # counter copy
1921 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1922 pshufb @XMM[8], @XMM[9] # byte swap upper part
1923 pshufb @XMM[8], @XMM[0]
1924 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1925 jmp .Lctr_enc_loop
1926.align 16
1927.Lctr_enc_loop:
1928 movdqa @XMM[0], 0x20(%rbp) # save counter
1929 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1930 movdqa @XMM[0], @XMM[2]
1931 paddd 0x00(%r11), @XMM[1] # .LADD1
1932 movdqa @XMM[0], @XMM[3]
1933 paddd 0x10(%r11), @XMM[2] # .LADD2
1934 movdqa @XMM[0], @XMM[4]
1935 paddd 0x20(%r11), @XMM[3] # .LADD3
1936 movdqa @XMM[0], @XMM[5]
1937 paddd 0x30(%r11), @XMM[4] # .LADD4
1938 movdqa @XMM[0], @XMM[6]
1939 paddd 0x40(%r11), @XMM[5] # .LADD5
1940 movdqa @XMM[0], @XMM[7]
1941 paddd 0x50(%r11), @XMM[6] # .LADD6
1942 paddd 0x60(%r11), @XMM[7] # .LADD7
1943
1944	# Borrow the prologue from _bsaes_encrypt8 to take the opportunity
1945	# to flip the byte order of the 32-bit counter
1946 movdqa (%rsp), @XMM[9] # round 0 key
1947 lea 0x10(%rsp), %rax # pass key schedule
1948 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1949 pxor @XMM[9], @XMM[0] # xor with round0 key
1950 pxor @XMM[9], @XMM[1]
1951 pshufb @XMM[8], @XMM[0]
1952 pxor @XMM[9], @XMM[2]
1953 pshufb @XMM[8], @XMM[1]
1954 pxor @XMM[9], @XMM[3]
1955 pshufb @XMM[8], @XMM[2]
1956 pxor @XMM[9], @XMM[4]
1957 pshufb @XMM[8], @XMM[3]
1958 pxor @XMM[9], @XMM[5]
1959 pshufb @XMM[8], @XMM[4]
1960 pxor @XMM[9], @XMM[6]
1961 pshufb @XMM[8], @XMM[5]
1962 pxor @XMM[9], @XMM[7]
1963 pshufb @XMM[8], @XMM[6]
1964 lea .LBS0(%rip), %r11 # constants table
1965 pshufb @XMM[8], @XMM[7]
1966 mov %ebx,%r10d # pass rounds
1967
1968 call _bsaes_encrypt8_bitslice
1969
1970 sub \$8,$len
1971 jc .Lctr_enc_loop_done
1972
1973 movdqu 0x00($inp), @XMM[8] # load input
1974 movdqu 0x10($inp), @XMM[9]
1975 movdqu 0x20($inp), @XMM[10]
1976 movdqu 0x30($inp), @XMM[11]
1977 movdqu 0x40($inp), @XMM[12]
1978 movdqu 0x50($inp), @XMM[13]
1979 movdqu 0x60($inp), @XMM[14]
1980 movdqu 0x70($inp), @XMM[15]
1981 lea 0x80($inp),$inp
1982 pxor @XMM[0], @XMM[8]
1983 movdqa 0x20(%rbp), @XMM[0] # load counter
1984 pxor @XMM[9], @XMM[1]
1985 movdqu @XMM[8], 0x00($out) # write output
1986 pxor @XMM[10], @XMM[4]
1987 movdqu @XMM[1], 0x10($out)
1988 pxor @XMM[11], @XMM[6]
1989 movdqu @XMM[4], 0x20($out)
1990 pxor @XMM[12], @XMM[3]
1991 movdqu @XMM[6], 0x30($out)
1992 pxor @XMM[13], @XMM[7]
1993 movdqu @XMM[3], 0x40($out)
1994 pxor @XMM[14], @XMM[2]
1995 movdqu @XMM[7], 0x50($out)
1996 pxor @XMM[15], @XMM[5]
1997 movdqu @XMM[2], 0x60($out)
1998 lea .LADD1(%rip), %r11
1999 movdqu @XMM[5], 0x70($out)
2000 lea 0x80($out), $out
2001 paddd 0x70(%r11), @XMM[0] # .LADD8
2002 jnz .Lctr_enc_loop
2003
2004 jmp .Lctr_enc_done
2005.align 16
2006.Lctr_enc_loop_done:
2007 add \$8, $len
2008 movdqu 0x00($inp), @XMM[8] # load input
2009 pxor @XMM[8], @XMM[0]
2010 movdqu @XMM[0], 0x00($out) # write output
2011 cmp \$2,$len
2012 jb .Lctr_enc_done
2013 movdqu 0x10($inp), @XMM[9]
2014 pxor @XMM[9], @XMM[1]
2015 movdqu @XMM[1], 0x10($out)
2016 je .Lctr_enc_done
2017 movdqu 0x20($inp), @XMM[10]
2018 pxor @XMM[10], @XMM[4]
2019 movdqu @XMM[4], 0x20($out)
2020 cmp \$4,$len
2021 jb .Lctr_enc_done
2022 movdqu 0x30($inp), @XMM[11]
2023 pxor @XMM[11], @XMM[6]
2024 movdqu @XMM[6], 0x30($out)
2025 je .Lctr_enc_done
2026 movdqu 0x40($inp), @XMM[12]
2027 pxor @XMM[12], @XMM[3]
2028 movdqu @XMM[3], 0x40($out)
2029 cmp \$6,$len
2030 jb .Lctr_enc_done
2031 movdqu 0x50($inp), @XMM[13]
2032 pxor @XMM[13], @XMM[7]
2033 movdqu @XMM[7], 0x50($out)
2034 je .Lctr_enc_done
2035 movdqu 0x60($inp), @XMM[14]
2036 pxor @XMM[14], @XMM[2]
2037 movdqu @XMM[2], 0x60($out)
2038 jmp .Lctr_enc_done
2039
2040.align 16
2041.Lctr_enc_short:
2042 lea 0x20(%rbp), $arg1
2043 lea 0x30(%rbp), $arg2
2044 lea ($key), $arg3
2045 call asm_AES_encrypt
2046 movdqu ($inp), @XMM[1]
2047 lea 16($inp), $inp
2048 mov 0x2c(%rbp), %eax # load 32-bit counter
2049 bswap %eax
2050 pxor 0x30(%rbp), @XMM[1]
2051 inc %eax # increment
2052 movdqu @XMM[1], ($out)
2053 bswap %eax
2054 lea 16($out), $out
2055	mov %eax, 0x2c(%rsp)		# save 32-bit counter (%rsp == %rbp on this path)
2056 dec $len
2057 jnz .Lctr_enc_short
2058
2059.Lctr_enc_done:
2060 lea (%rsp), %rax
2061 pxor %xmm0, %xmm0
2062.Lctr_enc_bzero: # wipe key schedule [if any]
2063 movdqa %xmm0, 0x00(%rax)
2064 movdqa %xmm0, 0x10(%rax)
2065 lea 0x20(%rax), %rax
2066 cmp %rax, %rbp
2067 ja .Lctr_enc_bzero
2068
2069 lea (%rbp),%rsp # restore %rsp
2070___
2071$code.=<<___ if ($win64);
2072 movaps 0x40(%rbp), %xmm6
2073 movaps 0x50(%rbp), %xmm7
2074 movaps 0x60(%rbp), %xmm8
2075 movaps 0x70(%rbp), %xmm9
2076 movaps 0x80(%rbp), %xmm10
2077 movaps 0x90(%rbp), %xmm11
2078 movaps 0xa0(%rbp), %xmm12
2079 movaps 0xb0(%rbp), %xmm13
2080 movaps 0xc0(%rbp), %xmm14
2081 movaps 0xd0(%rbp), %xmm15
2082 lea 0xa0(%rbp), %rsp
2083___
2084$code.=<<___;
2085 mov 0x48(%rsp), %r15
2086 mov 0x50(%rsp), %r14
2087 mov 0x58(%rsp), %r13
2088 mov 0x60(%rsp), %r12
2089 mov 0x68(%rsp), %rbx
2090 mov 0x70(%rsp), %rax
2091 lea 0x78(%rsp), %rsp
2092 mov %rax, %rbp
2093.Lctr_enc_epilogue:
2094 ret
2095.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2096___
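A scalar sketch of the counter handling above (illustration only, not part of
the module): only the last four bytes of the counter block are treated as a
big-endian integer, which is what the bswap/inc/bswap sequence in
.Lctr_enc_short does one block at a time, and what the .LADD1..LADD8
additions do eight blocks at a time on byte-swapped counters.

#include <stdint.h>

/* increment the low 32 bits of the counter block, big-endian */
static void
ctr32_inc(uint8_t counter[16])
{
	uint32_t c;

	c = (uint32_t)counter[12] << 24 | (uint32_t)counter[13] << 16 |
	    (uint32_t)counter[14] << 8 | (uint32_t)counter[15];
	c++;	/* wraps mod 2^32; the upper 96 bits are never touched */
	counter[12] = c >> 24;
	counter[13] = c >> 16;
	counter[14] = c >> 8;
	counter[15] = c;
}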
2097######################################################################
2098# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2099# const AES_KEY *key1, const AES_KEY *key2,
2100# const unsigned char iv[16]);
2101#
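The tweak update that the pshufd/paddq/pand/pcmpgtd/pxor sequences below
vectorize is multiplication by x in GF(2^128) with the reduction polynomial
x^128 + x^7 + x^2 + x + 1 (the 0x87 in .Lxts_magic). A scalar sketch,
assuming a little-endian two-word tweak:

#include <stdint.h>

static void
xts_tweak_double(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;	/* bit shifted out of the top */

	t[1] = t[1] << 1 | t[0] >> 63;
	t[0] = t[0] << 1 ^ (carry * 0x87);	/* fold x^128 back in */
}

Like the SIMD version, this is branch-free: the carry is folded in by
multiplication rather than a conditional, so the tweak update leaks nothing
through timing.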
2102my ($twmask,$twres,$twtmp)=@XMM[13..15];
2103$arg6=~s/d$//;
2104
2105$code.=<<___;
2106.globl bsaes_xts_encrypt
2107.type bsaes_xts_encrypt,\@abi-omnipotent
2108.align 16
2109bsaes_xts_encrypt:
2110 _CET_ENDBR
2111 mov %rsp, %rax
2112.Lxts_enc_prologue:
2113 push %rbp
2114 push %rbx
2115 push %r12
2116 push %r13
2117 push %r14
2118 push %r15
2119 lea -0x48(%rsp), %rsp
2120___
2121$code.=<<___ if ($win64);
2122 mov 0xa0(%rsp),$arg5 # pull key2
2123 mov 0xa8(%rsp),$arg6 # pull ivp
2124 lea -0xa0(%rsp), %rsp
2125 movaps %xmm6, 0x40(%rsp)
2126 movaps %xmm7, 0x50(%rsp)
2127 movaps %xmm8, 0x60(%rsp)
2128 movaps %xmm9, 0x70(%rsp)
2129 movaps %xmm10, 0x80(%rsp)
2130 movaps %xmm11, 0x90(%rsp)
2131 movaps %xmm12, 0xa0(%rsp)
2132 movaps %xmm13, 0xb0(%rsp)
2133 movaps %xmm14, 0xc0(%rsp)
2134 movaps %xmm15, 0xd0(%rsp)
2135.Lxts_enc_body:
2136___
2137$code.=<<___;
2138 mov %rsp, %rbp # backup %rsp
2139 mov $arg1, $inp # backup arguments
2140 mov $arg2, $out
2141 mov $arg3, $len
2142 mov $arg4, $key
2143
2144 lea ($arg6), $arg1
2145 lea 0x20(%rbp), $arg2
2146 lea ($arg5), $arg3
2147 call asm_AES_encrypt # generate initial tweak
2148
2149 mov 240($key), %eax # rounds
2150 mov $len, %rbx # backup $len
2151
2152 mov %eax, %edx # rounds
2153 shl \$7, %rax # 128 bytes per inner round key
2154 sub \$`128-32`, %rax # size of bit-sliced key schedule
2155 sub %rax, %rsp
2156
2157 mov %rsp, %rax # pass key schedule
2158 mov $key, %rcx # pass key
2159 mov %edx, %r10d # pass rounds
2160 call _bsaes_key_convert
2161 pxor %xmm6, %xmm7 # fix up last round key
2162 movdqa %xmm7, (%rax) # save last round key
2163
2164 and \$-16, $len
2165 sub \$0x80, %rsp # place for tweak[8]
2166 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2167
2168 pxor $twtmp, $twtmp
2169 movdqa .Lxts_magic(%rip), $twmask
2170 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2171
2172 sub \$0x80, $len
2173 jc .Lxts_enc_short
2174 jmp .Lxts_enc_loop
2175
2176.align 16
2177.Lxts_enc_loop:
2178___
2179 for ($i=0;$i<7;$i++) {
2180 $code.=<<___;
2181 pshufd \$0x13, $twtmp, $twres
2182 pxor $twtmp, $twtmp
2183 movdqa @XMM[7], @XMM[$i]
2184 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2185 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2186 pand $twmask, $twres # isolate carry and residue
2187 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2188 pxor $twres, @XMM[7]
2189___
2190 $code.=<<___ if ($i>=1);
2191 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2192___
2193 $code.=<<___ if ($i>=2);
2194 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2195___
2196 }
2197$code.=<<___;
2198 movdqu 0x60($inp), @XMM[8+6]
2199 pxor @XMM[8+5], @XMM[5]
2200 movdqu 0x70($inp), @XMM[8+7]
2201 lea 0x80($inp), $inp
2202 movdqa @XMM[7], 0x70(%rsp)
2203 pxor @XMM[8+6], @XMM[6]
2204 lea 0x80(%rsp), %rax # pass key schedule
2205 pxor @XMM[8+7], @XMM[7]
2206 mov %edx, %r10d # pass rounds
2207
2208 call _bsaes_encrypt8
2209
2210 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2211 pxor 0x10(%rsp), @XMM[1]
2212 movdqu @XMM[0], 0x00($out) # write output
2213 pxor 0x20(%rsp), @XMM[4]
2214 movdqu @XMM[1], 0x10($out)
2215 pxor 0x30(%rsp), @XMM[6]
2216 movdqu @XMM[4], 0x20($out)
2217 pxor 0x40(%rsp), @XMM[3]
2218 movdqu @XMM[6], 0x30($out)
2219 pxor 0x50(%rsp), @XMM[7]
2220 movdqu @XMM[3], 0x40($out)
2221 pxor 0x60(%rsp), @XMM[2]
2222 movdqu @XMM[7], 0x50($out)
2223 pxor 0x70(%rsp), @XMM[5]
2224 movdqu @XMM[2], 0x60($out)
2225 movdqu @XMM[5], 0x70($out)
2226 lea 0x80($out), $out
2227
2228 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2229 pxor $twtmp, $twtmp
2230 movdqa .Lxts_magic(%rip), $twmask
2231 pcmpgtd @XMM[7], $twtmp
2232 pshufd \$0x13, $twtmp, $twres
2233 pxor $twtmp, $twtmp
2234 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2235 pand $twmask, $twres # isolate carry and residue
2236 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2237 pxor $twres, @XMM[7]
2238
2239 sub \$0x80,$len
2240 jnc .Lxts_enc_loop
2241
2242.Lxts_enc_short:
2243 add \$0x80, $len
2244 jz .Lxts_enc_done
2245___
2246 for ($i=0;$i<7;$i++) {
2247 $code.=<<___;
2248 pshufd \$0x13, $twtmp, $twres
2249 pxor $twtmp, $twtmp
2250 movdqa @XMM[7], @XMM[$i]
2251 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2252 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2253 pand $twmask, $twres # isolate carry and residue
2254 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2255 pxor $twres, @XMM[7]
2256___
2257 $code.=<<___ if ($i>=1);
2258 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2259 cmp \$`0x10*$i`,$len
2260 je .Lxts_enc_$i
2261___
2262 $code.=<<___ if ($i>=2);
2263 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2264___
2265 }
2266$code.=<<___;
2267 movdqu 0x60($inp), @XMM[8+6]
2268 pxor @XMM[8+5], @XMM[5]
2269 movdqa @XMM[7], 0x70(%rsp)
2270 lea 0x70($inp), $inp
2271 pxor @XMM[8+6], @XMM[6]
2272 lea 0x80(%rsp), %rax # pass key schedule
2273 mov %edx, %r10d # pass rounds
2274
2275 call _bsaes_encrypt8
2276
2277 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2278 pxor 0x10(%rsp), @XMM[1]
2279 movdqu @XMM[0], 0x00($out) # write output
2280 pxor 0x20(%rsp), @XMM[4]
2281 movdqu @XMM[1], 0x10($out)
2282 pxor 0x30(%rsp), @XMM[6]
2283 movdqu @XMM[4], 0x20($out)
2284 pxor 0x40(%rsp), @XMM[3]
2285 movdqu @XMM[6], 0x30($out)
2286 pxor 0x50(%rsp), @XMM[7]
2287 movdqu @XMM[3], 0x40($out)
2288 pxor 0x60(%rsp), @XMM[2]
2289 movdqu @XMM[7], 0x50($out)
2290 movdqu @XMM[2], 0x60($out)
2291 lea 0x70($out), $out
2292
2293 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2294 jmp .Lxts_enc_done
2295.align 16
2296.Lxts_enc_6:
2297 pxor @XMM[8+4], @XMM[4]
2298 lea 0x60($inp), $inp
2299 pxor @XMM[8+5], @XMM[5]
2300 lea 0x80(%rsp), %rax # pass key schedule
2301 mov %edx, %r10d # pass rounds
2302
2303 call _bsaes_encrypt8
2304
2305 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2306 pxor 0x10(%rsp), @XMM[1]
2307 movdqu @XMM[0], 0x00($out) # write output
2308 pxor 0x20(%rsp), @XMM[4]
2309 movdqu @XMM[1], 0x10($out)
2310 pxor 0x30(%rsp), @XMM[6]
2311 movdqu @XMM[4], 0x20($out)
2312 pxor 0x40(%rsp), @XMM[3]
2313 movdqu @XMM[6], 0x30($out)
2314 pxor 0x50(%rsp), @XMM[7]
2315 movdqu @XMM[3], 0x40($out)
2316 movdqu @XMM[7], 0x50($out)
2317 lea 0x60($out), $out
2318
2319 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2320 jmp .Lxts_enc_done
2321.align 16
2322.Lxts_enc_5:
2323 pxor @XMM[8+3], @XMM[3]
2324 lea 0x50($inp), $inp
2325 pxor @XMM[8+4], @XMM[4]
2326 lea 0x80(%rsp), %rax # pass key schedule
2327 mov %edx, %r10d # pass rounds
2328
2329 call _bsaes_encrypt8
2330
2331 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2332 pxor 0x10(%rsp), @XMM[1]
2333 movdqu @XMM[0], 0x00($out) # write output
2334 pxor 0x20(%rsp), @XMM[4]
2335 movdqu @XMM[1], 0x10($out)
2336 pxor 0x30(%rsp), @XMM[6]
2337 movdqu @XMM[4], 0x20($out)
2338 pxor 0x40(%rsp), @XMM[3]
2339 movdqu @XMM[6], 0x30($out)
2340 movdqu @XMM[3], 0x40($out)
2341 lea 0x50($out), $out
2342
2343 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2344 jmp .Lxts_enc_done
2345.align 16
2346.Lxts_enc_4:
2347 pxor @XMM[8+2], @XMM[2]
2348 lea 0x40($inp), $inp
2349 pxor @XMM[8+3], @XMM[3]
2350 lea 0x80(%rsp), %rax # pass key schedule
2351 mov %edx, %r10d # pass rounds
2352
2353 call _bsaes_encrypt8
2354
2355 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2356 pxor 0x10(%rsp), @XMM[1]
2357 movdqu @XMM[0], 0x00($out) # write output
2358 pxor 0x20(%rsp), @XMM[4]
2359 movdqu @XMM[1], 0x10($out)
2360 pxor 0x30(%rsp), @XMM[6]
2361 movdqu @XMM[4], 0x20($out)
2362 movdqu @XMM[6], 0x30($out)
2363 lea 0x40($out), $out
2364
2365 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2366 jmp .Lxts_enc_done
2367.align 16
2368.Lxts_enc_3:
2369 pxor @XMM[8+1], @XMM[1]
2370 lea 0x30($inp), $inp
2371 pxor @XMM[8+2], @XMM[2]
2372 lea 0x80(%rsp), %rax # pass key schedule
2373 mov %edx, %r10d # pass rounds
2374
2375 call _bsaes_encrypt8
2376
2377 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2378 pxor 0x10(%rsp), @XMM[1]
2379 movdqu @XMM[0], 0x00($out) # write output
2380 pxor 0x20(%rsp), @XMM[4]
2381 movdqu @XMM[1], 0x10($out)
2382 movdqu @XMM[4], 0x20($out)
2383 lea 0x30($out), $out
2384
2385 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2386 jmp .Lxts_enc_done
2387.align 16
2388.Lxts_enc_2:
2389 pxor @XMM[8+0], @XMM[0]
2390 lea 0x20($inp), $inp
2391 pxor @XMM[8+1], @XMM[1]
2392 lea 0x80(%rsp), %rax # pass key schedule
2393 mov %edx, %r10d # pass rounds
2394
2395 call _bsaes_encrypt8
2396
2397 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2398 pxor 0x10(%rsp), @XMM[1]
2399 movdqu @XMM[0], 0x00($out) # write output
2400 movdqu @XMM[1], 0x10($out)
2401 lea 0x20($out), $out
2402
2403 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2404 jmp .Lxts_enc_done
2405.align 16
2406.Lxts_enc_1:
2407 pxor @XMM[0], @XMM[8]
2408 lea 0x10($inp), $inp
2409 movdqa @XMM[8], 0x20(%rbp)
2410 lea 0x20(%rbp), $arg1
2411 lea 0x20(%rbp), $arg2
2412 lea ($key), $arg3
2413 call asm_AES_encrypt # doesn't touch %xmm
2414 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2415 #pxor @XMM[8], @XMM[0]
2416 #lea 0x80(%rsp), %rax # pass key schedule
2417 #mov %edx, %r10d # pass rounds
2418 #call _bsaes_encrypt8
2419 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2420 movdqu @XMM[0], 0x00($out) # write output
2421 lea 0x10($out), $out
2422
2423 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2424
2425.Lxts_enc_done:
2426 and \$15, %ebx
2427 jz .Lxts_enc_ret
2428 mov $out, %rdx
2429
2430.Lxts_enc_steal:
2431 movzb ($inp), %eax
2432 movzb -16(%rdx), %ecx
2433 lea 1($inp), $inp
2434 mov %al, -16(%rdx)
2435 mov %cl, 0(%rdx)
2436 lea 1(%rdx), %rdx
2437 sub \$1,%ebx
2438 jnz .Lxts_enc_steal
2439
2440 movdqu -16($out), @XMM[0]
2441 lea 0x20(%rbp), $arg1
2442 pxor @XMM[7], @XMM[0]
2443 lea 0x20(%rbp), $arg2
2444 movdqa @XMM[0], 0x20(%rbp)
2445 lea ($key), $arg3
2446 call asm_AES_encrypt # doesn't touch %xmm
2447 pxor 0x20(%rbp), @XMM[7]
2448 movdqu @XMM[7], -16($out)
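What .Lxts_enc_steal implements is standard XTS ciphertext stealing: the
partial plaintext tail is swapped into the last full ciphertext block, the
displaced ciphertext bytes become the final partial output, and the patched
block is encrypted once more under the next tweak (@XMM[7] above). A sketch
of the byte swap, with hypothetical buffer names:

#include <stddef.h>

static void
xts_enc_steal(unsigned char *last_full, const unsigned char *tail_in,
    unsigned char *tail_out, size_t taillen)	/* taillen = len % 16 */
{
	size_t i;

	for (i = 0; i < taillen; i++) {
		tail_out[i] = last_full[i];	/* stolen ciphertext bytes */
		last_full[i] = tail_in[i];	/* plaintext tail moves in */
	}
	/* the caller then re-encrypts last_full under the next tweak */
}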
2449
2450.Lxts_enc_ret:
2451 lea (%rsp), %rax
2452 pxor %xmm0, %xmm0
2453.Lxts_enc_bzero: # wipe key schedule [if any]
2454 movdqa %xmm0, 0x00(%rax)
2455 movdqa %xmm0, 0x10(%rax)
2456 lea 0x20(%rax), %rax
2457 cmp %rax, %rbp
2458 ja .Lxts_enc_bzero
2459
2460 lea (%rbp),%rsp # restore %rsp
2461___
2462$code.=<<___ if ($win64);
2463 movaps 0x40(%rbp), %xmm6
2464 movaps 0x50(%rbp), %xmm7
2465 movaps 0x60(%rbp), %xmm8
2466 movaps 0x70(%rbp), %xmm9
2467 movaps 0x80(%rbp), %xmm10
2468 movaps 0x90(%rbp), %xmm11
2469 movaps 0xa0(%rbp), %xmm12
2470 movaps 0xb0(%rbp), %xmm13
2471 movaps 0xc0(%rbp), %xmm14
2472 movaps 0xd0(%rbp), %xmm15
2473 lea 0xa0(%rbp), %rsp
2474___
2475$code.=<<___;
2476 mov 0x48(%rsp), %r15
2477 mov 0x50(%rsp), %r14
2478 mov 0x58(%rsp), %r13
2479 mov 0x60(%rsp), %r12
2480 mov 0x68(%rsp), %rbx
2481 mov 0x70(%rsp), %rax
2482 lea 0x78(%rsp), %rsp
2483 mov %rax, %rbp
2484.Lxts_enc_epilogue:
2485 ret
2486.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2487
2488.globl bsaes_xts_decrypt
2489.type bsaes_xts_decrypt,\@abi-omnipotent
2490.align 16
2491bsaes_xts_decrypt:
2492 _CET_ENDBR
2493 mov %rsp, %rax
2494.Lxts_dec_prologue:
2495 push %rbp
2496 push %rbx
2497 push %r12
2498 push %r13
2499 push %r14
2500 push %r15
2501 lea -0x48(%rsp), %rsp
2502___
2503$code.=<<___ if ($win64);
2504 mov 0xa0(%rsp),$arg5 # pull key2
2505 mov 0xa8(%rsp),$arg6 # pull ivp
2506 lea -0xa0(%rsp), %rsp
2507 movaps %xmm6, 0x40(%rsp)
2508 movaps %xmm7, 0x50(%rsp)
2509 movaps %xmm8, 0x60(%rsp)
2510 movaps %xmm9, 0x70(%rsp)
2511 movaps %xmm10, 0x80(%rsp)
2512 movaps %xmm11, 0x90(%rsp)
2513 movaps %xmm12, 0xa0(%rsp)
2514 movaps %xmm13, 0xb0(%rsp)
2515 movaps %xmm14, 0xc0(%rsp)
2516 movaps %xmm15, 0xd0(%rsp)
2517.Lxts_dec_body:
2518___
2519$code.=<<___;
2520 mov %rsp, %rbp # backup %rsp
2521 mov $arg1, $inp # backup arguments
2522 mov $arg2, $out
2523 mov $arg3, $len
2524 mov $arg4, $key
2525
2526 lea ($arg6), $arg1
2527 lea 0x20(%rbp), $arg2
2528 lea ($arg5), $arg3
2529 call asm_AES_encrypt # generate initial tweak
2530
2531 mov 240($key), %eax # rounds
2532 mov $len, %rbx # backup $len
2533
2534 mov %eax, %edx # rounds
2535 shl \$7, %rax # 128 bytes per inner round key
2536 sub \$`128-32`, %rax # size of bit-sliced key schedule
2537 sub %rax, %rsp
2538
2539 mov %rsp, %rax # pass key schedule
2540 mov $key, %rcx # pass key
2541 mov %edx, %r10d # pass rounds
2542 call _bsaes_key_convert
2543 pxor (%rsp), %xmm7 # fix up round 0 key
2544 movdqa %xmm6, (%rax) # save last round key
2545 movdqa %xmm7, (%rsp)
2546
2547 xor %eax, %eax # if ($len%16) len-=16;
2548 and \$-16, $len
2549 test \$15, %ebx
2550 setnz %al
2551 shl \$4, %rax
2552 sub %rax, $len
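The setnz/shl pair above reserves the last full block for ciphertext
stealing without branching on the length; a scalar sketch of the same
computation (assumes len >= 16 whenever a partial tail exists, as XTS
requires):

#include <stddef.h>

/* number of bytes to process before the stealing tail */
static size_t
xts_full_len(size_t len)
{
	size_t partial = (len & 15) != 0;	/* the setnz above */

	return (len & ~(size_t)15) - (partial << 4);
}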
2553
2554 sub \$0x80, %rsp # place for tweak[8]
2555 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2556
2557 pxor $twtmp, $twtmp
2558 movdqa .Lxts_magic(%rip), $twmask
2559 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2560
2561 sub \$0x80, $len
2562 jc .Lxts_dec_short
2563 jmp .Lxts_dec_loop
2564
2565.align 16
2566.Lxts_dec_loop:
2567___
2568 for ($i=0;$i<7;$i++) {
2569 $code.=<<___;
2570 pshufd \$0x13, $twtmp, $twres
2571 pxor $twtmp, $twtmp
2572 movdqa @XMM[7], @XMM[$i]
2573 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2574 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2575 pand $twmask, $twres # isolate carry and residue
2576 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2577 pxor $twres, @XMM[7]
2578___
2579 $code.=<<___ if ($i>=1);
2580 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2581___
2582 $code.=<<___ if ($i>=2);
2583 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2584___
2585 }
2586$code.=<<___;
2587 movdqu 0x60($inp), @XMM[8+6]
2588 pxor @XMM[8+5], @XMM[5]
2589 movdqu 0x70($inp), @XMM[8+7]
2590 lea 0x80($inp), $inp
2591 movdqa @XMM[7], 0x70(%rsp)
2592 pxor @XMM[8+6], @XMM[6]
2593 lea 0x80(%rsp), %rax # pass key schedule
2594 pxor @XMM[8+7], @XMM[7]
2595 mov %edx, %r10d # pass rounds
2596
2597 call _bsaes_decrypt8
2598
2599 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2600 pxor 0x10(%rsp), @XMM[1]
2601 movdqu @XMM[0], 0x00($out) # write output
2602 pxor 0x20(%rsp), @XMM[6]
2603 movdqu @XMM[1], 0x10($out)
2604 pxor 0x30(%rsp), @XMM[4]
2605 movdqu @XMM[6], 0x20($out)
2606 pxor 0x40(%rsp), @XMM[2]
2607 movdqu @XMM[4], 0x30($out)
2608 pxor 0x50(%rsp), @XMM[7]
2609 movdqu @XMM[2], 0x40($out)
2610 pxor 0x60(%rsp), @XMM[3]
2611 movdqu @XMM[7], 0x50($out)
2612 pxor 0x70(%rsp), @XMM[5]
2613 movdqu @XMM[3], 0x60($out)
2614 movdqu @XMM[5], 0x70($out)
2615 lea 0x80($out), $out
2616
2617 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2618 pxor $twtmp, $twtmp
2619 movdqa .Lxts_magic(%rip), $twmask
2620 pcmpgtd @XMM[7], $twtmp
2621 pshufd \$0x13, $twtmp, $twres
2622 pxor $twtmp, $twtmp
2623 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2624 pand $twmask, $twres # isolate carry and residue
2625 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2626 pxor $twres, @XMM[7]
2627
2628 sub \$0x80,$len
2629 jnc .Lxts_dec_loop
2630
2631.Lxts_dec_short:
2632 add \$0x80, $len
2633 jz .Lxts_dec_done
2634___
2635 for ($i=0;$i<7;$i++) {
2636 $code.=<<___;
2637 pshufd \$0x13, $twtmp, $twres
2638 pxor $twtmp, $twtmp
2639 movdqa @XMM[7], @XMM[$i]
2640 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2641 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2642 pand $twmask, $twres # isolate carry and residue
2643 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2644 pxor $twres, @XMM[7]
2645___
2646 $code.=<<___ if ($i>=1);
2647 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2648 cmp \$`0x10*$i`,$len
2649 je .Lxts_dec_$i
2650___
2651 $code.=<<___ if ($i>=2);
2652 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2653___
2654 }
2655$code.=<<___;
2656 movdqu 0x60($inp), @XMM[8+6]
2657 pxor @XMM[8+5], @XMM[5]
2658 movdqa @XMM[7], 0x70(%rsp)
2659 lea 0x70($inp), $inp
2660 pxor @XMM[8+6], @XMM[6]
2661 lea 0x80(%rsp), %rax # pass key schedule
2662 mov %edx, %r10d # pass rounds
2663
2664 call _bsaes_decrypt8
2665
2666 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2667 pxor 0x10(%rsp), @XMM[1]
2668 movdqu @XMM[0], 0x00($out) # write output
2669 pxor 0x20(%rsp), @XMM[6]
2670 movdqu @XMM[1], 0x10($out)
2671 pxor 0x30(%rsp), @XMM[4]
2672 movdqu @XMM[6], 0x20($out)
2673 pxor 0x40(%rsp), @XMM[2]
2674 movdqu @XMM[4], 0x30($out)
2675 pxor 0x50(%rsp), @XMM[7]
2676 movdqu @XMM[2], 0x40($out)
2677 pxor 0x60(%rsp), @XMM[3]
2678 movdqu @XMM[7], 0x50($out)
2679 movdqu @XMM[3], 0x60($out)
2680 lea 0x70($out), $out
2681
2682 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2683 jmp .Lxts_dec_done
2684.align 16
2685.Lxts_dec_6:
2686 pxor @XMM[8+4], @XMM[4]
2687 lea 0x60($inp), $inp
2688 pxor @XMM[8+5], @XMM[5]
2689 lea 0x80(%rsp), %rax # pass key schedule
2690 mov %edx, %r10d # pass rounds
2691
2692 call _bsaes_decrypt8
2693
2694 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2695 pxor 0x10(%rsp), @XMM[1]
2696 movdqu @XMM[0], 0x00($out) # write output
2697 pxor 0x20(%rsp), @XMM[6]
2698 movdqu @XMM[1], 0x10($out)
2699 pxor 0x30(%rsp), @XMM[4]
2700 movdqu @XMM[6], 0x20($out)
2701 pxor 0x40(%rsp), @XMM[2]
2702 movdqu @XMM[4], 0x30($out)
2703 pxor 0x50(%rsp), @XMM[7]
2704 movdqu @XMM[2], 0x40($out)
2705 movdqu @XMM[7], 0x50($out)
2706 lea 0x60($out), $out
2707
2708 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2709 jmp .Lxts_dec_done
2710.align 16
2711.Lxts_dec_5:
2712 pxor @XMM[8+3], @XMM[3]
2713 lea 0x50($inp), $inp
2714 pxor @XMM[8+4], @XMM[4]
2715 lea 0x80(%rsp), %rax # pass key schedule
2716 mov %edx, %r10d # pass rounds
2717
2718 call _bsaes_decrypt8
2719
2720 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2721 pxor 0x10(%rsp), @XMM[1]
2722 movdqu @XMM[0], 0x00($out) # write output
2723 pxor 0x20(%rsp), @XMM[6]
2724 movdqu @XMM[1], 0x10($out)
2725 pxor 0x30(%rsp), @XMM[4]
2726 movdqu @XMM[6], 0x20($out)
2727 pxor 0x40(%rsp), @XMM[2]
2728 movdqu @XMM[4], 0x30($out)
2729 movdqu @XMM[2], 0x40($out)
2730 lea 0x50($out), $out
2731
2732 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2733 jmp .Lxts_dec_done
2734.align 16
2735.Lxts_dec_4:
2736 pxor @XMM[8+2], @XMM[2]
2737 lea 0x40($inp), $inp
2738 pxor @XMM[8+3], @XMM[3]
2739 lea 0x80(%rsp), %rax # pass key schedule
2740 mov %edx, %r10d # pass rounds
2741
2742 call _bsaes_decrypt8
2743
2744 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2745 pxor 0x10(%rsp), @XMM[1]
2746 movdqu @XMM[0], 0x00($out) # write output
2747 pxor 0x20(%rsp), @XMM[6]
2748 movdqu @XMM[1], 0x10($out)
2749 pxor 0x30(%rsp), @XMM[4]
2750 movdqu @XMM[6], 0x20($out)
2751 movdqu @XMM[4], 0x30($out)
2752 lea 0x40($out), $out
2753
2754 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2755 jmp .Lxts_dec_done
2756.align 16
2757.Lxts_dec_3:
2758 pxor @XMM[8+1], @XMM[1]
2759 lea 0x30($inp), $inp
2760 pxor @XMM[8+2], @XMM[2]
2761 lea 0x80(%rsp), %rax # pass key schedule
2762 mov %edx, %r10d # pass rounds
2763
2764 call _bsaes_decrypt8
2765
2766 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2767 pxor 0x10(%rsp), @XMM[1]
2768 movdqu @XMM[0], 0x00($out) # write output
2769 pxor 0x20(%rsp), @XMM[6]
2770 movdqu @XMM[1], 0x10($out)
2771 movdqu @XMM[6], 0x20($out)
2772 lea 0x30($out), $out
2773
2774 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2775 jmp .Lxts_dec_done
2776.align 16
2777.Lxts_dec_2:
2778 pxor @XMM[8+0], @XMM[0]
2779 lea 0x20($inp), $inp
2780 pxor @XMM[8+1], @XMM[1]
2781 lea 0x80(%rsp), %rax # pass key schedule
2782 mov %edx, %r10d # pass rounds
2783
2784 call _bsaes_decrypt8
2785
2786 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2787 pxor 0x10(%rsp), @XMM[1]
2788 movdqu @XMM[0], 0x00($out) # write output
2789 movdqu @XMM[1], 0x10($out)
2790 lea 0x20($out), $out
2791
2792 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2793 jmp .Lxts_dec_done
2794.align 16
2795.Lxts_dec_1:
2796 pxor @XMM[0], @XMM[8]
2797 lea 0x10($inp), $inp
2798 movdqa @XMM[8], 0x20(%rbp)
2799 lea 0x20(%rbp), $arg1
2800 lea 0x20(%rbp), $arg2
2801 lea ($key), $arg3
2802 call asm_AES_decrypt # doesn't touch %xmm
2803 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2804 #pxor @XMM[8], @XMM[0]
2805 #lea 0x80(%rsp), %rax # pass key schedule
2806 #mov %edx, %r10d # pass rounds
2807 #call _bsaes_decrypt8
2808 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2809 movdqu @XMM[0], 0x00($out) # write output
2810 lea 0x10($out), $out
2811
2812 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2813
2814.Lxts_dec_done:
2815 and \$15, %ebx
2816 jz .Lxts_dec_ret
2817
2818 pxor $twtmp, $twtmp
2819 movdqa .Lxts_magic(%rip), $twmask
2820 pcmpgtd @XMM[7], $twtmp
2821 pshufd \$0x13, $twtmp, $twres
2822 movdqa @XMM[7], @XMM[6]
2823 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2824 pand $twmask, $twres # isolate carry and residue
2825 movdqu ($inp), @XMM[0]
2826 pxor $twres, @XMM[7]
2827
2828 lea 0x20(%rbp), $arg1
2829 pxor @XMM[7], @XMM[0]
2830 lea 0x20(%rbp), $arg2
2831 movdqa @XMM[0], 0x20(%rbp)
2832 lea ($key), $arg3
2833 call asm_AES_decrypt # doesn't touch %xmm
2834 pxor 0x20(%rbp), @XMM[7]
2835 mov $out, %rdx
2836 movdqu @XMM[7], ($out)
2837
2838.Lxts_dec_steal:
2839 movzb 16($inp), %eax
2840 movzb (%rdx), %ecx
2841 lea 1($inp), $inp
2842 mov %al, (%rdx)
2843 mov %cl, 16(%rdx)
2844 lea 1(%rdx), %rdx
2845 sub \$1,%ebx
2846 jnz .Lxts_dec_steal
2847
2848 movdqu ($out), @XMM[0]
2849 lea 0x20(%rbp), $arg1
2850 pxor @XMM[6], @XMM[0]
2851 lea 0x20(%rbp), $arg2
2852 movdqa @XMM[0], 0x20(%rbp)
2853 lea ($key), $arg3
2854 call asm_AES_decrypt # doesn't touch %xmm
2855 pxor 0x20(%rbp), @XMM[6]
2856 movdqu @XMM[6], ($out)
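On the decrypt side the stealing runs with the two final tweaks in swapped
order: the last full ciphertext block is decrypted under the later tweak
(@XMM[7], computed above) and the patched block under the earlier one saved
in @XMM[6]. In outline, with hypothetical helpers:

	/*
	 * sketch only; xts_dec_block() and swap_tail() are stand-ins
	 * (see the encrypt-side sketch for the byte swap):
	 *
	 *   xts_dec_block(last_full, tweak_next);    decrypt under @XMM[7]
	 *   swap_tail(last_full, tail_in, tail_out);
	 *   xts_dec_block(last_full, tweak_prev);    decrypt under @XMM[6]
	 */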
2857
2858.Lxts_dec_ret:
2859 lea (%rsp), %rax
2860 pxor %xmm0, %xmm0
2861.Lxts_dec_bzero: # wipe key schedule [if any]
2862 movdqa %xmm0, 0x00(%rax)
2863 movdqa %xmm0, 0x10(%rax)
2864 lea 0x20(%rax), %rax
2865 cmp %rax, %rbp
2866 ja .Lxts_dec_bzero
2867
2868 lea (%rbp),%rsp # restore %rsp
2869___
2870$code.=<<___ if ($win64);
2871 movaps 0x40(%rbp), %xmm6
2872 movaps 0x50(%rbp), %xmm7
2873 movaps 0x60(%rbp), %xmm8
2874 movaps 0x70(%rbp), %xmm9
2875 movaps 0x80(%rbp), %xmm10
2876 movaps 0x90(%rbp), %xmm11
2877 movaps 0xa0(%rbp), %xmm12
2878 movaps 0xb0(%rbp), %xmm13
2879 movaps 0xc0(%rbp), %xmm14
2880 movaps 0xd0(%rbp), %xmm15
2881 lea 0xa0(%rbp), %rsp
2882___
2883$code.=<<___;
2884 mov 0x48(%rsp), %r15
2885 mov 0x50(%rsp), %r14
2886 mov 0x58(%rsp), %r13
2887 mov 0x60(%rsp), %r12
2888 mov 0x68(%rsp), %rbx
2889 mov 0x70(%rsp), %rax
2890 lea 0x78(%rsp), %rsp
2891 mov %rax, %rbp
2892.Lxts_dec_epilogue:
2893 ret
2894.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2895___
2896}
2897$code.=<<___;
2898.section .rodata
2899.type _bsaes_const,\@object
2900.align 64
2901_bsaes_const:
2902.LM0ISR: # InvShiftRows constants
2903 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2904.LISRM0:
2905 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2906.LISR:
2907 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2908.LBS0: # bit-slice constants
2909 .quad 0x5555555555555555, 0x5555555555555555
2910.LBS1:
2911 .quad 0x3333333333333333, 0x3333333333333333
2912.LBS2:
2913 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2914.LSR: # shiftrows constants
2915 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2916.LSRM0:
2917 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2918.LM0SR:
2919 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2920.LSWPUP: # byte-swap upper dword
2921 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2922.LSWPUPM0SR:
2923 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2924.LADD1: # counter increment constants
2925 .quad 0x0000000000000000, 0x0000000100000000
2926.LADD2:
2927 .quad 0x0000000000000000, 0x0000000200000000
2928.LADD3:
2929 .quad 0x0000000000000000, 0x0000000300000000
2930.LADD4:
2931 .quad 0x0000000000000000, 0x0000000400000000
2932.LADD5:
2933 .quad 0x0000000000000000, 0x0000000500000000
2934.LADD6:
2935 .quad 0x0000000000000000, 0x0000000600000000
2936.LADD7:
2937 .quad 0x0000000000000000, 0x0000000700000000
2938.LADD8:
2939 .quad 0x0000000000000000, 0x0000000800000000
2940.Lxts_magic:
2941 .long 0x87,0,1,0
2942.Lmasks:
2943 .quad 0x0101010101010101, 0x0101010101010101
2944 .quad 0x0202020202020202, 0x0202020202020202
2945 .quad 0x0404040404040404, 0x0404040404040404
2946 .quad 0x0808080808080808, 0x0808080808080808
2947.LM0:
2948 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2949.L63:
2950 .quad 0x6363636363636363, 0x6363636363636363
2951.align 64
2952.size _bsaes_const,.-_bsaes_const
2953.text
2954___
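The .LBS0/.LBS1/.LBS2 masks (0x55..., 0x33..., 0x0f...) are the classic
constants of a bit-matrix transpose, used when entering and leaving the
bitsliced representation. The underlying primitive, commonly called
swapmove, exchanges groups of bits between two words; a scalar sketch:

#include <stdint.h>

/* exchange the bits selected by mask between b and (a >> n) */
static inline void
swapmove(uint64_t *a, uint64_t *b, int n, uint64_t mask)
{
	uint64_t t = ((*a >> n) ^ *b) & mask;

	*b ^= t;
	*a ^= t << n;
}

/* a full 8x8 bit transpose applies this with (n, mask) = (1, 0x5555...),
 * (2, 0x3333...) and (4, 0x0f0f...) across register pairs */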
2955
2956# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2957# CONTEXT *context,DISPATCHER_CONTEXT *disp)
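se_handler below is a Win64 structured-exception handler: HandlerData[]
carries the .Lxxx_body/.Lxxx_epilogue label pair of each function, and if
the faulting RIP falls inside that window the frame is live and the saved
registers must be recovered before the unwind continues. A rough C outline
(types per <windows.h>; the body is illustrative only):

#include <windows.h>

EXCEPTION_DISPOSITION
se_handler(EXCEPTION_RECORD *rec, ULONG64 frame, CONTEXT *context,
    DISPATCHER_CONTEXT *disp)
{
	DWORD *data = (DWORD *)disp->HandlerData;
	ULONG64 body = disp->ImageBase + data[0];
	ULONG64 epilogue = disp->ImageBase + data[1];

	if (context->Rip >= body && context->Rip < epilogue) {
		/*
		 * Frame is live: recover %xmm6-%xmm15 from the save area
		 * at 0x40(%rbp) and the pushed GPRs above it, as the
		 * rep-movsq blocks in the assembly below do.
		 */
	}
	return ExceptionContinueSearch;	/* let the unwind proceed */
}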
2958if ($win64) {
2959$rec="%rcx";
2960$frame="%rdx";
2961$context="%r8";
2962$disp="%r9";
2963
2964$code.=<<___;
2965.extern __imp_RtlVirtualUnwind
2966.type se_handler,\@abi-omnipotent
2967.align 16
2968se_handler:
2969 _CET_ENDBR
2970 push %rsi
2971 push %rdi
2972 push %rbx
2973 push %rbp
2974 push %r12
2975 push %r13
2976 push %r14
2977 push %r15
2978 pushfq
2979 sub \$64,%rsp
2980
2981 mov 120($context),%rax # pull context->Rax
2982 mov 248($context),%rbx # pull context->Rip
2983
2984 mov 8($disp),%rsi # disp->ImageBase
2985 mov 56($disp),%r11 # disp->HandlerData
2986
2987 mov 0(%r11),%r10d # HandlerData[0]
2988 lea (%rsi,%r10),%r10 # prologue label
2989 cmp %r10,%rbx # context->Rip<prologue label
2990 jb .Lin_prologue
2991
2992 mov 152($context),%rax # pull context->Rsp
2993
2994 mov 4(%r11),%r10d # HandlerData[1]
2995 lea (%rsi,%r10),%r10 # epilogue label
2996 cmp %r10,%rbx # context->Rip>=epilogue label
2997 jae .Lin_prologue
2998
2999 mov 160($context),%rax # pull context->Rbp
3000
3001 lea 0x40(%rax),%rsi # %xmm save area
3002 lea 512($context),%rdi # &context.Xmm6
3003 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
3004 .long 0xa548f3fc # cld; rep movsq
3005 lea 0xa0(%rax),%rax # adjust stack pointer
3006
3007 mov 0x70(%rax),%rbp
3008 mov 0x68(%rax),%rbx
3009 mov 0x60(%rax),%r12
3010 mov 0x58(%rax),%r13
3011 mov 0x50(%rax),%r14
3012 mov 0x48(%rax),%r15
3013 lea 0x78(%rax),%rax # adjust stack pointer
3014 mov %rbx,144($context) # restore context->Rbx
3015 mov %rbp,160($context) # restore context->Rbp
3016 mov %r12,216($context) # restore context->R12
3017 mov %r13,224($context) # restore context->R13
3018 mov %r14,232($context) # restore context->R14
3019 mov %r15,240($context) # restore context->R15
3020
3021.Lin_prologue:
3022 mov %rax,152($context) # restore context->Rsp
3023
3024 mov 40($disp),%rdi # disp->ContextRecord
3025 mov $context,%rsi # context
3026 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3027 .long 0xa548f3fc # cld; rep movsq
3028
3029 mov $disp,%rsi
3030 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3031 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3032 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3033 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3034 mov 40(%rsi),%r10 # disp->ContextRecord
3035 lea 56(%rsi),%r11 # &disp->HandlerData
3036 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3037 mov %r10,32(%rsp) # arg5
3038 mov %r11,40(%rsp) # arg6
3039 mov %r12,48(%rsp) # arg7
3040 mov %rcx,56(%rsp) # arg8, (NULL)
3041 call *__imp_RtlVirtualUnwind(%rip)
3042
3043 mov \$1,%eax # ExceptionContinueSearch
3044 add \$64,%rsp
3045 popfq
3046 pop %r15
3047 pop %r14
3048 pop %r13
3049 pop %r12
3050 pop %rbp
3051 pop %rbx
3052 pop %rdi
3053 pop %rsi
3054 ret
3055.size se_handler,.-se_handler
3056
3057.section .pdata
3058.align 4
3059___
3060$code.=<<___ if ($ecb);
3061 .rva .Lecb_enc_prologue
3062 .rva .Lecb_enc_epilogue
3063 .rva .Lecb_enc_info
3064
3065 .rva .Lecb_dec_prologue
3066 .rva .Lecb_dec_epilogue
3067 .rva .Lecb_dec_info
3068___
3069$code.=<<___;
3070 .rva .Lcbc_dec_prologue
3071 .rva .Lcbc_dec_epilogue
3072 .rva .Lcbc_dec_info
3073
3074 .rva .Lctr_enc_prologue
3075 .rva .Lctr_enc_epilogue
3076 .rva .Lctr_enc_info
3077
3078 .rva .Lxts_enc_prologue
3079 .rva .Lxts_enc_epilogue
3080 .rva .Lxts_enc_info
3081
3082 .rva .Lxts_dec_prologue
3083 .rva .Lxts_dec_epilogue
3084 .rva .Lxts_dec_info
3085
3086.section .xdata
3087.align 8
3088___
3089$code.=<<___ if ($ecb);
3090.Lecb_enc_info:
3091 .byte 9,0,0,0
3092 .rva se_handler
3093 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3094.Lecb_dec_info:
3095 .byte 9,0,0,0
3096 .rva se_handler
3097 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3098___
3099$code.=<<___;
3100.Lcbc_dec_info:
3101 .byte 9,0,0,0
3102 .rva se_handler
3103 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3104.Lctr_enc_info:
3105 .byte 9,0,0,0
3106 .rva se_handler
3107 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3108.Lxts_enc_info:
3109 .byte 9,0,0,0
3110 .rva se_handler
3111 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3112.Lxts_dec_info:
3113 .byte 9,0,0,0
3114 .rva se_handler
3115 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3116___
3117}
3118
3119$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3120
3121print $code;
3122
3123close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl
deleted file mode 100644
index 6e7bd36d05..0000000000
--- a/src/lib/libcrypto/aes/asm/vpaes-x86.pl
+++ /dev/null
@@ -1,911 +0,0 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
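The constant-time property comes from pshufb: it performs sixteen parallel
lookups into a 16-byte in-register table, so no memory access ever depends
on secret data, unlike the cache-timing-prone T-table approach of
aes-586.pl. A minimal illustration with intrinsics (not part of this
module):

#include <tmmintrin.h>	/* SSSE3 */

/* sixteen data-independent table lookups; indices must be 0..15 */
static inline __m128i
lookup16(__m128i table, __m128i nibbles)
{
	return _mm_shuffle_epi8(table, nibbles);
}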
13######################################################################
14# September 2011.
15#
16# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
17# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (doesn't have to if called from
19# EVP only). "Drop-in" implies that this module doesn't share key
20# schedule structure with the original, nor does it make assumptions
21# about its alignment...
22#
23# Performance summary. aes-586.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with a 128-bit key, and the vpaes-x86.pl column
26# lists [also large-block CBC] encrypt/decrypt results.
27#
28# aes-586.pl vpaes-x86.pl
29#
30# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
31# Nehalem 27.9/40.4/18.1 10.3/12.0
32# Atom 102./119./60.1 64.5/85.3(***)
33#
34# (*)	"Hyper-threading" in this context refers to cache shared among
35#	multiple cores rather than to Intel HTT specifically. As the
36#	vast majority of contemporary cores share cache, the slower
37#	code path is commonplace. In other words, "with-hyper-threading-off"
38#	results are presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***)	The less impressive improvement on Core 2 and Atom is due to
43#	slow pshufb, yet it is a respectable +32%/65% improvement on
44#	Core 2 and +58%/40% on Atom (as implied, over the
45#	"hyper-threading-safe" code path).
46#
47# <appro@openssl.org>
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50push(@INC,"${dir}","${dir}../../perlasm");
51require "x86asm.pl";
52
53&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
54
55$PREFIX="vpaes";
56
57my ($round, $base, $magic, $key, $const, $inp, $out)=
58 ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
59
60 &rodataseg();
61&static_label("_vpaes_consts");
62&static_label("_vpaes_schedule_low_round");
63
64&set_label("_vpaes_consts",64);
65$k_inv=-0x30; # inv, inva
66 &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
67 &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
68
69$k_s0F=-0x10; # s0F
70 &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
71
72$k_ipt=0x00; # input transform (lo, hi)
73 &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
74 &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
75
76$k_sb1=0x20; # sb1u, sb1t
77 &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
78 &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
79$k_sb2=0x40; # sb2u, sb2t
80 &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
81 &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
82$k_sbo=0x60; # sbou, sbot
83 &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
84 &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
85
86$k_mc_forward=0x80; # mc_forward
87 &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
88 &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
89 &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
90 &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
91
92$k_mc_backward=0xc0; # mc_backward
93 &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
94 &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
95 &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
96 &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
97
98$k_sr=0x100; # sr
99 &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
100 &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
101 &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
102 &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
103
104$k_rcon=0x140; # rcon
105 &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
106
107$k_s63=0x150; # s63: all equal to 0x63 transformed
108 &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
109
110$k_opt=0x160; # output transform
111 &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
112 &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
113
114$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
115 &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
116 &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
117##
118## Decryption stuff
119## Key schedule constants
120##
121$k_dksd=0x1a0; # decryption key schedule: invskew x*D
122 &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
123 &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
124$k_dksb=0x1c0; # decryption key schedule: invskew x*B
125 &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
126 &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
127$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
128 &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
129 &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
130$k_dks9=0x200; # decryption key schedule: invskew x*9
131 &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
132 &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
133
134##
135## Decryption stuff
136## Round function constants
137##
138$k_dipt=0x220; # decryption input transform
139 &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
140 &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
141
142$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
143 &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
144 &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
145$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
146 &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
147 &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
148$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
149 &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
150 &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
151$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
152 &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
153 &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
154$k_dsbo=0x2c0; # decryption sbox final output
155 &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
156 &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
157 &previous();
158
159&function_begin_B("_vpaes_preheat");
160 &movdqa ("xmm7",&QWP($k_inv,$const));
161 &movdqa ("xmm6",&QWP($k_s0F,$const));
162 &ret ();
163&function_end_B("_vpaes_preheat");
164
165##
166## _aes_encrypt_core
167##
168## AES-encrypt %xmm0.
169##
170## Inputs:
171## %xmm0 = input
172## %xmm6-%xmm7 as in _vpaes_preheat
173## (%edx) = scheduled keys
174##
175## Output in %xmm0
176## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
177##
178##
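Every round starts by splitting each byte of the state into its two
nibbles, which is what the pandn/psrld/pand triple does; with intrinsics
this is roughly (a sketch, with m0f standing for the 0x0f mask that
_vpaes_preheat loads into xmm6):

#include <emmintrin.h>	/* SSE2 */

static inline void
split_nibbles(__m128i x, __m128i m0f, __m128i *lo, __m128i *hi)
{
	*lo = _mm_and_si128(x, m0f);		/* pand */
	/* pandn keeps only the high nibbles, so the 32-bit psrld
	 * cannot drag bits across byte boundaries */
	*hi = _mm_srli_epi32(_mm_andnot_si128(m0f, x), 4);
}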
179&function_begin_B("_vpaes_encrypt_core");
180 &mov ($magic,16);
181 &mov ($round,&DWP(240,$key));
182	&movdqa	("xmm1","xmm6");
183 &movdqa ("xmm2",&QWP($k_ipt,$const));
184 &pandn ("xmm1","xmm0");
185 &movdqu ("xmm5",&QWP(0,$key));
186 &psrld ("xmm1",4);
187 &pand ("xmm0","xmm6");
188 &pshufb ("xmm2","xmm0");
189 &movdqa ("xmm0",&QWP($k_ipt+16,$const));
190 &pshufb ("xmm0","xmm1");
191 &pxor ("xmm2","xmm5");
192 &pxor ("xmm0","xmm2");
193 &add ($key,16);
194 &lea ($base,&DWP($k_mc_backward,$const));
195 &jmp (&label("enc_entry"));
196
197
198&set_label("enc_loop",16);
199 # middle of middle round
200 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
201 &pshufb ("xmm4","xmm2"); # 4 = sb1u
202 &pxor ("xmm4","xmm5"); # 4 = sb1u + k
203 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
204 &pshufb ("xmm0","xmm3"); # 0 = sb1t
205 &pxor ("xmm0","xmm4"); # 0 = A
206 &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
207 &pshufb ("xmm5","xmm2"); # 4 = sb2u
208 &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
209 &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
210 &pshufb ("xmm2","xmm3"); # 2 = sb2t
211 &pxor ("xmm2","xmm5"); # 2 = 2A
212 &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
213 &movdqa ("xmm3","xmm0"); # 3 = A
214 &pshufb ("xmm0","xmm1"); # 0 = B
215 &add ($key,16); # next key
216 &pxor ("xmm0","xmm2"); # 0 = 2A+B
217 &pshufb ("xmm3","xmm4"); # 3 = D
218 &add ($magic,16); # next mc
219 &pxor ("xmm3","xmm0"); # 3 = 2A+B+D
220 &pshufb ("xmm0","xmm1"); # 0 = 2B+C
221 &and ($magic,0x30); # ... mod 4
222 &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
223 &sub ($round,1); # nr--
224
225&set_label("enc_entry");
226 # top of round
227 &movdqa ("xmm1","xmm6"); # 1 : i
228 &pandn ("xmm1","xmm0"); # 1 = i<<4
229 &psrld ("xmm1",4); # 1 = i
230 &pand ("xmm0","xmm6"); # 0 = k
231 &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
232 &pshufb ("xmm5","xmm0"); # 2 = a/k
233 &pxor ("xmm0","xmm1"); # 0 = j
234 &movdqa ("xmm3","xmm7"); # 3 : 1/i
235 &pshufb ("xmm3","xmm1"); # 3 = 1/i
236 &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
237 &movdqa ("xmm4","xmm7"); # 4 : 1/j
238 &pshufb ("xmm4","xmm0"); # 4 = 1/j
239 &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
240 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
241 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
242 &pxor ("xmm2","xmm0"); # 2 = io
243 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
244 &movdqu ("xmm5",&QWP(0,$key));
245 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
246 &pxor ("xmm3","xmm1"); # 3 = jo
247 &jnz (&label("enc_loop"));
248
249 # middle of last round
250 &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
251 &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
252 &pshufb ("xmm4","xmm2"); # 4 = sbou
253 &pxor ("xmm4","xmm5"); # 4 = sb1u + k
254 &pshufb ("xmm0","xmm3"); # 0 = sb1t
255 &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
256 &pxor ("xmm0","xmm4"); # 0 = A
257 &pshufb ("xmm0","xmm1");
258 &ret ();
259&function_end_B("_vpaes_encrypt_core");
260
261##
262## Decryption core
263##
264## Same API as encryption core.
265##
266&function_begin_B("_vpaes_decrypt_core");
267 &mov ($round,&DWP(240,$key));
268 &lea ($base,&DWP($k_dsbd,$const));
269 &movdqa ("xmm1","xmm6");
270 &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
271 &pandn ("xmm1","xmm0");
272 &mov ($magic,$round);
273	&psrld	("xmm1",4);
274 &movdqu ("xmm5",&QWP(0,$key));
275 &shl ($magic,4);
276 &pand ("xmm0","xmm6");
277 &pshufb ("xmm2","xmm0");
278 &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
279 &xor ($magic,0x30);
280 &pshufb ("xmm0","xmm1");
281 &and ($magic,0x30);
282 &pxor ("xmm2","xmm5");
283 &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
284 &pxor ("xmm0","xmm2");
285 &add ($key,16);
286 &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
287 &jmp (&label("dec_entry"));
288
289&set_label("dec_loop",16);
290##
291## Inverse mix columns
292##
293 &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
294 &pshufb ("xmm4","xmm2"); # 4 = sb9u
295 &pxor ("xmm4","xmm0");
296 &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
297 &pshufb ("xmm0","xmm3"); # 0 = sb9t
298 &pxor ("xmm0","xmm4"); # 0 = ch
299 &add ($key,16); # next round key
300
301 &pshufb ("xmm0","xmm5"); # MC ch
302 &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
303 &pshufb ("xmm4","xmm2"); # 4 = sbdu
304 &pxor ("xmm4","xmm0"); # 4 = ch
305 &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
306 &pshufb ("xmm0","xmm3"); # 0 = sbdt
307 &pxor ("xmm0","xmm4"); # 0 = ch
308 &sub ($round,1); # nr--
309
310 &pshufb ("xmm0","xmm5"); # MC ch
311 &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
312 &pshufb ("xmm4","xmm2"); # 4 = sbbu
313 &pxor ("xmm4","xmm0"); # 4 = ch
314 &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
315 &pshufb ("xmm0","xmm3"); # 0 = sbbt
316 &pxor ("xmm0","xmm4"); # 0 = ch
317
318 &pshufb ("xmm0","xmm5"); # MC ch
319 &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
320 &pshufb ("xmm4","xmm2"); # 4 = sbeu
321 &pxor ("xmm4","xmm0"); # 4 = ch
322 &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
323 &pshufb ("xmm0","xmm3"); # 0 = sbet
324 &pxor ("xmm0","xmm4"); # 0 = ch
325
326 &palignr("xmm5","xmm5",12);
327
328&set_label("dec_entry");
329 # top of round
330 &movdqa ("xmm1","xmm6"); # 1 : i
331 &pandn ("xmm1","xmm0"); # 1 = i<<4
332 &psrld ("xmm1",4); # 1 = i
333 &pand ("xmm0","xmm6"); # 0 = k
334 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
335 &pshufb ("xmm2","xmm0"); # 2 = a/k
336 &pxor ("xmm0","xmm1"); # 0 = j
337 &movdqa ("xmm3","xmm7"); # 3 : 1/i
338 &pshufb ("xmm3","xmm1"); # 3 = 1/i
339 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
340 &movdqa ("xmm4","xmm7"); # 4 : 1/j
341 &pshufb ("xmm4","xmm0"); # 4 = 1/j
342 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
343 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
344 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
345 &pxor ("xmm2","xmm0"); # 2 = io
346 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
347 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
348 &pxor ("xmm3","xmm1"); # 3 = jo
349 &movdqu ("xmm0",&QWP(0,$key));
350 &jnz (&label("dec_loop"));
351
352 # middle of last round
353 &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
354 &pshufb ("xmm4","xmm2"); # 4 = sbou
355 &pxor ("xmm4","xmm0"); # 4 = sb1u + k
356 &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
357 &movdqa ("xmm2",&QWP(0,$magic));
358 &pshufb ("xmm0","xmm3"); # 0 = sb1t
359 &pxor ("xmm0","xmm4"); # 0 = A
360 &pshufb ("xmm0","xmm2");
361 &ret ();
362&function_end_B("_vpaes_decrypt_core");
363
364########################################################
365## ##
366## AES key schedule ##
367## ##
368########################################################
369&function_begin_B("_vpaes_schedule_core");
370 &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
371 &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
372
373 # input transform
374 &movdqa ("xmm3","xmm0");
375 &lea ($base,&DWP($k_ipt,$const));
376 &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
377 &call ("_vpaes_schedule_transform");
378 &movdqa ("xmm7","xmm0");
379
380 &test ($out,$out);
381 &jnz (&label("schedule_am_decrypting"));
382
383 # encrypting, output zeroth round key after transform
384 &movdqu (&QWP(0,$key),"xmm0");
385 &jmp (&label("schedule_go"));
386
387&set_label("schedule_am_decrypting");
388 # decrypting, output zeroth round key after shiftrows
389 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
390 &pshufb ("xmm3","xmm1");
391 &movdqu (&QWP(0,$key),"xmm3");
392 &xor ($magic,0x30);
393
394&set_label("schedule_go");
395 &cmp ($round,192);
396 &ja (&label("schedule_256"));
397 &je (&label("schedule_192"));
398	# 128: fall through
399
400##
401## .schedule_128
402##
403## 128-bit specific part of key schedule.
404##
405## This schedule is really simple, because all its parts
406## are accomplished by the subroutines.
407##
408&set_label("schedule_128");
409 &mov ($round,10);
410
411&set_label("loop_schedule_128");
412 &call ("_vpaes_schedule_round");
413 &dec ($round);
414 &jz (&label("schedule_mangle_last"));
415 &call ("_vpaes_schedule_mangle"); # write output
416 &jmp (&label("loop_schedule_128"));
417
418##
419## .aes_schedule_192
420##
421## 192-bit specific part of key schedule.
422##
423## The main body of this schedule is the same as the 128-bit
424## schedule, but with more smearing. The long, high side is
425## stored in %xmm7 as before, and the short, low side is in
426## the high bits of %xmm6.
427##
428## This schedule is somewhat nastier, however, because each
429## round produces 192 bits of key material, or 1.5 round keys.
430## Therefore, on each cycle we do 2 rounds and produce 3 round
431## keys.
432##
433&set_label("schedule_192",16);
434 &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
435 &call ("_vpaes_schedule_transform"); # input transform
436 &movdqa ("xmm6","xmm0"); # save short part
437 &pxor ("xmm4","xmm4"); # clear 4
438 &movhlps("xmm6","xmm4"); # clobber low side with zeros
439 &mov ($round,4);
440
441&set_label("loop_schedule_192");
442 &call ("_vpaes_schedule_round");
443 &palignr("xmm0","xmm6",8);
444 &call ("_vpaes_schedule_mangle"); # save key n
445 &call ("_vpaes_schedule_192_smear");
446 &call ("_vpaes_schedule_mangle"); # save key n+1
447 &call ("_vpaes_schedule_round");
448 &dec ($round);
449 &jz (&label("schedule_mangle_last"));
450 &call ("_vpaes_schedule_mangle"); # save key n+2
451 &call ("_vpaes_schedule_192_smear");
452 &jmp (&label("loop_schedule_192"));
453
454##
455## .aes_schedule_256
456##
457## 256-bit specific part of key schedule.
458##
459## The structure here is very similar to the 128-bit
460## schedule, but with an additional "low side" in
461## %xmm6. The low side's rounds are the same as the
462## high side's, except no rcon and no rotation.
463##
464&set_label("schedule_256",16);
465 &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
466 &call ("_vpaes_schedule_transform"); # input transform
467 &mov ($round,7);
468
469&set_label("loop_schedule_256");
470 &call ("_vpaes_schedule_mangle"); # output low result
471 &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
472
473 # high round
474 &call ("_vpaes_schedule_round");
475 &dec ($round);
476 &jz (&label("schedule_mangle_last"));
477 &call ("_vpaes_schedule_mangle");
478
479 # low round. swap xmm7 and xmm6
480 &pshufd ("xmm0","xmm0",0xFF);
481 &movdqa (&QWP(20,"esp"),"xmm7");
482 &movdqa ("xmm7","xmm6");
483 &call ("_vpaes_schedule_low_round");
484 &movdqa ("xmm7",&QWP(20,"esp"));
485
486 &jmp (&label("loop_schedule_256"));
487
488##
489## .aes_schedule_mangle_last
490##
491## Mangler for last round of key schedule
492## Mangles %xmm0
493## when encrypting, outputs out(%xmm0) ^ 63
494## when decrypting, outputs unskew(%xmm0)
495##
496## Always called right before return... jumps to cleanup and exits
497##
498&set_label("schedule_mangle_last",16);
499 # schedule last round key from xmm0
500 &lea ($base,&DWP($k_deskew,$const));
501 &test ($out,$out);
502 &jnz (&label("schedule_mangle_last_dec"));
503
504 # encrypting
505 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
506 &pshufb ("xmm0","xmm1"); # output permute
507 &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
508 &add ($key,32);
509
510&set_label("schedule_mangle_last_dec");
511 &add ($key,-16);
512 &pxor ("xmm0",&QWP($k_s63,$const));
513 &call ("_vpaes_schedule_transform"); # output transform
514 &movdqu (&QWP(0,$key),"xmm0"); # save last key
515
516 # cleanup
517 &pxor ("xmm0","xmm0");
518 &pxor ("xmm1","xmm1");
519 &pxor ("xmm2","xmm2");
520 &pxor ("xmm3","xmm3");
521 &pxor ("xmm4","xmm4");
522 &pxor ("xmm5","xmm5");
523 &pxor ("xmm6","xmm6");
524 &pxor ("xmm7","xmm7");
525 &ret ();
526&function_end_B("_vpaes_schedule_core");
527
528##
529## .aes_schedule_192_smear
530##
531## Smear the short, low side in the 192-bit key schedule.
532##
533## Inputs:
534## %xmm7: high side, b a x y
535## %xmm6: low side, d c 0 0
536## %xmm13: 0
537##
538## Outputs:
539## %xmm6: b+c+d b+c 0 0
540## %xmm0: b+c+d b+c b a
541##
542&function_begin_B("_vpaes_schedule_192_smear");
543 &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
544 &pxor ("xmm6","xmm0"); # -> c+d c 0 0
545 &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
546 &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
547 &movdqa ("xmm0","xmm6");
548 &pxor ("xmm1","xmm1");
549 &movhlps("xmm6","xmm1"); # clobber low side with zeros
550 &ret ();
551&function_end_B("_vpaes_schedule_192_smear");
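The two pshufd immediates above decode as dword selectors (result dword i
takes source dword (imm >> 2i) & 3); a scalar sketch of the instruction,
for checking the "d c 0 0 -> c 0 0 0" style comments:

#include <stdint.h>

static void
pshufd(uint32_t dst[4], const uint32_t src[4], unsigned imm)
{
	int i;

	for (i = 0; i < 4; i++)
		dst[i] = src[(imm >> (2 * i)) & 3];
}

/* imm 0x80 picks {s0,s0,s0,s2}: "d c 0 0" becomes "c 0 0 0";
 * imm 0xFE picks {s2,s3,s3,s3}: "b a x y" becomes "b b b a"
 * (dwords written high to low, matching the comments above) */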
552
553##
554## .aes_schedule_round
555##
556## Runs one main round of the key schedule on %xmm0, %xmm7
557##
558## Specifically, runs subbytes on the high dword of %xmm0
559## then rotates it by one byte and xors into the low dword of
560## %xmm7.
561##
562## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
563## next rcon.
564##
565## Smears the dwords of %xmm7 by xoring the low into the
566## second low, result into third, result into highest.
567##
568## Returns results in %xmm7 = %xmm0.
569## Clobbers %xmm1-%xmm5.
570##
571&function_begin_B("_vpaes_schedule_round");
572 # extract rcon from xmm8
573 &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
574 &pxor ("xmm1","xmm1");
575 &palignr("xmm1","xmm2",15);
576 &palignr("xmm2","xmm2",15);
577 &pxor ("xmm7","xmm1");
578
579 # rotate
580 &pshufd ("xmm0","xmm0",0xFF);
581 &palignr("xmm0","xmm0",1);
582
583 # fall through...
584 &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
585
586 # low round: same as high round, but no rotation and no rcon.
587&set_label("_vpaes_schedule_low_round");
588 # smear xmm7
589 &movdqa ("xmm1","xmm7");
590 &pslldq ("xmm7",4);
591 &pxor ("xmm7","xmm1");
592 &movdqa ("xmm1","xmm7");
593 &pslldq ("xmm7",8);
594 &pxor ("xmm7","xmm1");
595 &pxor ("xmm7",&QWP($k_s63,$const));
596
597 # subbyte
598 &movdqa ("xmm4",&QWP($k_s0F,$const));
599 &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
600 &movdqa ("xmm1","xmm4");
601 &pandn ("xmm1","xmm0");
602 &psrld ("xmm1",4); # 1 = i
603 &pand ("xmm0","xmm4"); # 0 = k
604 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
605 &pshufb ("xmm2","xmm0"); # 2 = a/k
606 &pxor ("xmm0","xmm1"); # 0 = j
607 &movdqa ("xmm3","xmm5"); # 3 : 1/i
608 &pshufb ("xmm3","xmm1"); # 3 = 1/i
609 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
610 &movdqa ("xmm4","xmm5"); # 4 : 1/j
611 &pshufb ("xmm4","xmm0"); # 4 = 1/j
612 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
613 &movdqa ("xmm2","xmm5"); # 2 : 1/iak
614 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
615 &pxor ("xmm2","xmm0"); # 2 = io
616 &movdqa ("xmm3","xmm5"); # 3 : 1/jak
617 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
618 &pxor ("xmm3","xmm1"); # 3 = jo
619 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
620 &pshufb ("xmm4","xmm2"); # 4 = sbou
621 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
622 &pshufb ("xmm0","xmm3"); # 0 = sb1t
623 &pxor ("xmm0","xmm4"); # 0 = sbox output
624
625 # add in smeared stuff
626 &pxor ("xmm0","xmm7");
627 &movdqa ("xmm7","xmm0");
628 &ret ();
629&function_end_B("_vpaes_schedule_round");
630
631##
632## .aes_schedule_transform
633##
634## Linear-transform %xmm0 according to tables at (%ebx)
635##
636## Output in %xmm0
637## Clobbers %xmm1, %xmm2
638##
639&function_begin_B("_vpaes_schedule_transform");
640 &movdqa ("xmm2",&QWP($k_s0F,$const));
641 &movdqa ("xmm1","xmm2");
642 &pandn ("xmm1","xmm0");
643 &psrld ("xmm1",4);
644 &pand ("xmm0","xmm2");
645 &movdqa ("xmm2",&QWP(0,$base));
646 &pshufb ("xmm2","xmm0");
647 &movdqa ("xmm0",&QWP(16,$base));
648 &pshufb ("xmm0","xmm1");
649 &pxor ("xmm0","xmm2");
650 &ret ();
651&function_end_B("_vpaes_schedule_transform");
652
653##
654## .aes_schedule_mangle
655##
656## Mangle xmm0 from (basis-transformed) standard version
657## to our version.
658##
659## On encrypt,
660## xor with 0x63
661## multiply by circulant 0,1,1,1
662## apply shiftrows transform
663##
664## On decrypt,
665## xor with 0x63
666## multiply by "inverse mixcolumns" circulant E,B,D,9
667## deskew
668## apply shiftrows transform
669##
670##
671## Writes out to (%edx), and increments or decrements it
672## Keeps track of round number mod 4 in %ecx
673## Preserves xmm0
674## Clobbers xmm1-xmm5
675##
676&function_begin_B("_vpaes_schedule_mangle");
677 &movdqa ("xmm4","xmm0"); # save xmm0 for later
678 &movdqa ("xmm5",&QWP($k_mc_forward,$const));
679 &test ($out,$out);
680 &jnz (&label("schedule_mangle_dec"));
681
682 # encrypting
683 &add ($key,16);
684 &pxor ("xmm4",&QWP($k_s63,$const));
685 &pshufb ("xmm4","xmm5");
686 &movdqa ("xmm3","xmm4");
687 &pshufb ("xmm4","xmm5");
688 &pxor ("xmm3","xmm4");
689 &pshufb ("xmm4","xmm5");
690 &pxor ("xmm3","xmm4");
691
692 &jmp (&label("schedule_mangle_both"));
693
694&set_label("schedule_mangle_dec",16);
695 # inverse mix columns
696 &movdqa ("xmm2",&QWP($k_s0F,$const));
697 &lea ($inp,&DWP($k_dksd,$const));
698 &movdqa ("xmm1","xmm2");
699 &pandn ("xmm1","xmm4");
700 &psrld ("xmm1",4); # 1 = hi
701 &pand ("xmm4","xmm2"); # 4 = lo
702
703 &movdqa ("xmm2",&QWP(0,$inp));
704 &pshufb ("xmm2","xmm4");
705 &movdqa ("xmm3",&QWP(0x10,$inp));
706 &pshufb ("xmm3","xmm1");
707 &pxor ("xmm3","xmm2");
708 &pshufb ("xmm3","xmm5");
709
710 &movdqa ("xmm2",&QWP(0x20,$inp));
711 &pshufb ("xmm2","xmm4");
712 &pxor ("xmm2","xmm3");
713 &movdqa ("xmm3",&QWP(0x30,$inp));
714 &pshufb ("xmm3","xmm1");
715 &pxor ("xmm3","xmm2");
716 &pshufb ("xmm3","xmm5");
717
718 &movdqa ("xmm2",&QWP(0x40,$inp));
719 &pshufb ("xmm2","xmm4");
720 &pxor ("xmm2","xmm3");
721 &movdqa ("xmm3",&QWP(0x50,$inp));
722 &pshufb ("xmm3","xmm1");
723 &pxor ("xmm3","xmm2");
724 &pshufb ("xmm3","xmm5");
725
726 &movdqa ("xmm2",&QWP(0x60,$inp));
727 &pshufb ("xmm2","xmm4");
728 &pxor ("xmm2","xmm3");
729 &movdqa ("xmm3",&QWP(0x70,$inp));
730 &pshufb ("xmm3","xmm1");
731 &pxor ("xmm3","xmm2");
732
733 &add ($key,-16);
734
735&set_label("schedule_mangle_both");
736 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
737 &pshufb ("xmm3","xmm1");
738 &add ($magic,-16);
739 &and ($magic,0x30);
740 &movdqu (&QWP(0,$key),"xmm3");
741 &ret ();
742&function_end_B("_vpaes_schedule_mangle");
743
744#
745# Interface to OpenSSL
746#
747&function_begin("${PREFIX}_set_encrypt_key");
748 &mov ($inp,&wparam(0)); # inp
749 &lea ($base,&DWP(-56,"esp"));
750 &mov ($round,&wparam(1)); # bits
751 &and ($base,-16);
752 &mov ($key,&wparam(2)); # key
753 &xchg ($base,"esp"); # alloca
754 &mov (&DWP(48,"esp"),$base);
755
756 &mov ($base,$round);
757 &shr ($base,5);
758 &add ($base,5);
759 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
760 &mov ($magic,0x30);
761 &mov ($out,0);
762
763 &picsetup($const);
764 &picsymbol($const, &label("_vpaes_consts"), $const);
765	&lea ($const,&DWP(0x30,$const));
766
767 &call ("_vpaes_schedule_core");
768
769 &mov ("esp",&DWP(48,"esp"));
770 &xor ("eax","eax");
771&function_end("${PREFIX}_set_encrypt_key");
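For reference, the shr/add above stores bits/32 + 5, i.e. 9, 11 and 13 for
128-, 192- and 256-bit keys; this appears to serve as vpaes's own
middle-round count rather than the 10/12/14 a generic AES_KEY carries, one
more way in which, as the header notes, the key schedules are not
interchangeable.

static int
vpaes_rounds(int bits)	/* mirrors the shr/add above */
{
	return (bits >> 5) + 5;	/* 128 -> 9, 192 -> 11, 256 -> 13 */
}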
772
773&function_begin("${PREFIX}_set_decrypt_key");
774 &mov ($inp,&wparam(0)); # inp
775 &lea ($base,&DWP(-56,"esp"));
776 &mov ($round,&wparam(1)); # bits
777 &and ($base,-16);
778 &mov ($key,&wparam(2)); # key
779 &xchg ($base,"esp"); # alloca
780 &mov (&DWP(48,"esp"),$base);
781
782 &mov ($base,$round);
783 &shr ($base,5);
784 &add ($base,5);
785 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
786 &shl ($base,4);
787 &lea ($key,&DWP(16,$key,$base));
788
789 &mov ($out,1);
790 &mov ($magic,$round);
791 &shr ($magic,1);
792 &and ($magic,32);
793	&xor	($magic,32);		# nbits==192?0:32;
794
795 &picsetup($const);
796 &picsymbol($const, &label("_vpaes_consts"), $const);
797	&lea	($const,&DWP(0x30,$const));
798
799 &call ("_vpaes_schedule_core");
800
801 &mov ("esp",&DWP(48,"esp"));
802 &xor ("eax","eax");
803&function_end("${PREFIX}_set_decrypt_key");
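
The shr/and/xor sequence that computes $magic evaluates to 0 only for 192-bit keys, as the comment notes; an illustrative check in plain Perl:

    for my $nbits (128, 192, 256) {
        my $magic = (($nbits >> 1) & 32) ^ 32;   # shr $1 ; and $32 ; xor $32
        printf "%d -> %d\n", $nbits, $magic;     # 32, 0, 32
    }
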
804
805&function_begin("${PREFIX}_encrypt");
806 &picsetup($const);
807 &picsymbol($const, &label("_vpaes_consts"), $const);
808	&lea	($const,&DWP(0x30,$const));
809
810 &call ("_vpaes_preheat");
811 &mov ($inp,&wparam(0)); # inp
812 &lea ($base,&DWP(-56,"esp"));
813 &mov ($out,&wparam(1)); # out
814 &and ($base,-16);
815 &mov ($key,&wparam(2)); # key
816 &xchg ($base,"esp"); # alloca
817 &mov (&DWP(48,"esp"),$base);
818
819 &movdqu ("xmm0",&QWP(0,$inp));
820 &call ("_vpaes_encrypt_core");
821 &movdqu (&QWP(0,$out),"xmm0");
822
823 &mov ("esp",&DWP(48,"esp"));
824&function_end("${PREFIX}_encrypt");
825
826&function_begin("${PREFIX}_decrypt");
827 &picsetup($const);
828 &picsymbol($const, &label("_vpaes_consts"), $const);
829	&lea	($const,&DWP(0x30,$const));
830
831 &call ("_vpaes_preheat");
832 &mov ($inp,&wparam(0)); # inp
833 &lea ($base,&DWP(-56,"esp"));
834 &mov ($out,&wparam(1)); # out
835 &and ($base,-16);
836 &mov ($key,&wparam(2)); # key
837 &xchg ($base,"esp"); # alloca
838 &mov (&DWP(48,"esp"),$base);
839
840 &movdqu ("xmm0",&QWP(0,$inp));
841 &call ("_vpaes_decrypt_core");
842 &movdqu (&QWP(0,$out),"xmm0");
843
844 &mov ("esp",&DWP(48,"esp"));
845&function_end("${PREFIX}_decrypt");
846
847&function_begin("${PREFIX}_cbc_encrypt");
848 &mov ($inp,&wparam(0)); # inp
849 &mov ($out,&wparam(1)); # out
850 &mov ($round,&wparam(2)); # len
851 &mov ($key,&wparam(3)); # key
852 &sub ($round,16);
853 &jc (&label("cbc_abort"));
854 &lea ($base,&DWP(-56,"esp"));
855 &mov ($const,&wparam(4)); # ivp
856 &and ($base,-16);
857 &mov ($magic,&wparam(5)); # enc
858 &xchg ($base,"esp"); # alloca
859 &movdqu ("xmm1",&QWP(0,$const)); # load IV
860 &sub ($out,$inp);
861 &mov (&DWP(48,"esp"),$base);
862
863 &mov (&DWP(0,"esp"),$out); # save out
864	&mov	(&DWP(4,"esp"),$key);	# save key
865 &mov (&DWP(8,"esp"),$const); # save ivp
866 &mov ($out,$round); # $out works as $len
867
868 &picsetup($const);
869 &picsymbol($const, &label("_vpaes_consts"), $const);
870	&lea	($const,&DWP(0x30,$const));
871
872 &call ("_vpaes_preheat");
873 &cmp ($magic,0);
874 &je (&label("cbc_dec_loop"));
875 &jmp (&label("cbc_enc_loop"));
876
877&set_label("cbc_enc_loop",16);
878 &movdqu ("xmm0",&QWP(0,$inp)); # load input
879 &pxor ("xmm0","xmm1"); # inp^=iv
880 &call ("_vpaes_encrypt_core");
881 &mov ($base,&DWP(0,"esp")); # restore out
882 &mov ($key,&DWP(4,"esp")); # restore key
883 &movdqa ("xmm1","xmm0");
884 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
885 &lea ($inp,&DWP(16,$inp));
886 &sub ($out,16);
887 &jnc (&label("cbc_enc_loop"));
888 &jmp (&label("cbc_done"));
889
890&set_label("cbc_dec_loop",16);
891 &movdqu ("xmm0",&QWP(0,$inp)); # load input
892 &movdqa (&QWP(16,"esp"),"xmm1"); # save IV
893 &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
894 &call ("_vpaes_decrypt_core");
895 &mov ($base,&DWP(0,"esp")); # restore out
896 &mov ($key,&DWP(4,"esp")); # restore key
897 &pxor ("xmm0",&QWP(16,"esp")); # out^=iv
898 &movdqa ("xmm1",&QWP(32,"esp")); # load next IV
899 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
900 &lea ($inp,&DWP(16,$inp));
901 &sub ($out,16);
902 &jnc (&label("cbc_dec_loop"));
903
904&set_label("cbc_done");
905 &mov ($base,&DWP(8,"esp")); # restore ivp
906 &mov ("esp",&DWP(48,"esp"));
907 &movdqu (&QWP(0,$base),"xmm1"); # write IV
908&set_label("cbc_abort");
909&function_end("${PREFIX}_cbc_encrypt");
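
Both loops above implement textbook CBC chaining: the encrypt loop feeds each fresh ciphertext block back in as the next IV, and the decrypt loop saves the incoming ciphertext block as the future IV before decrypting it. A minimal sketch of that dataflow in plain Perl, with toy_cipher as a hypothetical stand-in for _vpaes_encrypt_core:

    use strict;
    use warnings;

    sub toy_cipher { scalar reverse $_[0] }     # placeholder block cipher

    sub cbc_encrypt {
        my ($plain, $iv) = @_;                  # length($plain) % 16 == 0
        my $out = '';
        for (my $i = 0; $i < length $plain; $i += 16) {
            my $blk = substr($plain, $i, 16) ^ $iv;   # inp ^= iv
            $iv = toy_cipher($blk);                   # ciphertext is next IV
            $out .= $iv;
        }
        return $out;
    }

    printf "%s\n", unpack("H*", cbc_encrypt("A" x 32, "\0" x 16));
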
910
911&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
deleted file mode 100644
index 7d92e8d8ca..0000000000
--- a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
+++ /dev/null
@@ -1,1222 +0,0 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Interface to OpenSSL as "almost" drop-in replacement for
17# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (doesn't have to if called from
19# EVP only). "Drop-in" implies that this module doesn't share key
20# schedule structure with the original, nor does it make assumptions
21# about its alignment...
22#
23# Performance summary. aes-x86_64.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with 128-bit key, and vpaes-x86_64.pl column -
26# [also large-block CBC] encrypt/decrypt.
27#
28# aes-x86_64.pl vpaes-x86_64.pl
29#
30# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
31# Nehalem 30.5/42.2/14.6 9.8/11.8
32# Atom 63.9/79.0/32.1 64.0/84.8(***)
33#
34# (*)	"Hyper-threading" in this context refers to cache shared
35#	among multiple cores, rather than to Intel HTT specifically.
36#	As the vast majority of contemporary cores share cache, the
37#	slower code path is commonplace. In other words, the
38#	"with-hyper-threading-off" results are mostly for reference.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***) Less impressive improvement on Core 2 and Atom is due to slow
43# pshufb, yet it's respectable +40%/78% improvement on Core 2
44# (as implied, over "hyper-threading-safe" code path).
45#
46# <appro@openssl.org>
47
48$flavour = shift;
49$output = shift;
50if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
51
52$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53
54$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57die "can't locate x86_64-xlate.pl";
58
59open OUT,"| \"$^X\" $xlate $flavour $output";
60*STDOUT=*OUT;
61
62$PREFIX="vpaes";
63
64$code.=<<___;
65.text
66
67##
68## _vpaes_encrypt_core
69##
70## AES-encrypt %xmm0.
71##
72## Inputs:
73## %xmm0 = input
74## %xmm9-%xmm15 as in _vpaes_preheat
75## (%rdx) = scheduled keys
76##
77## Output in %xmm0
78## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
79## Preserves %xmm6 - %xmm8 so you get some local vectors
80##
81##
82.type _vpaes_encrypt_core,\@abi-omnipotent
83.align 16
84_vpaes_encrypt_core:
85 _CET_ENDBR
86 mov %rdx, %r9
87 mov \$16, %r11
88 mov 240(%rdx),%eax
89 movdqa %xmm9, %xmm1
90 movdqa .Lk_ipt(%rip), %xmm2 # iptlo
91 pandn %xmm0, %xmm1
92 movdqu (%r9), %xmm5 # round0 key
93 psrld \$4, %xmm1
94 pand %xmm9, %xmm0
95 pshufb %xmm0, %xmm2
96 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
97 pshufb %xmm1, %xmm0
98 pxor %xmm5, %xmm2
99 pxor %xmm2, %xmm0
100 add \$16, %r9
101 lea .Lk_mc_backward(%rip),%r10
102 jmp .Lenc_entry
103
104.align 16
105.Lenc_loop:
106 # middle of middle round
107 movdqa %xmm13, %xmm4 # 4 : sb1u
108 pshufb %xmm2, %xmm4 # 4 = sb1u
109 pxor %xmm5, %xmm4 # 4 = sb1u + k
110 movdqa %xmm12, %xmm0 # 0 : sb1t
111 pshufb %xmm3, %xmm0 # 0 = sb1t
112 pxor %xmm4, %xmm0 # 0 = A
113 movdqa %xmm15, %xmm5 # 4 : sb2u
114 pshufb %xmm2, %xmm5 # 4 = sb2u
115 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
116 movdqa %xmm14, %xmm2 # 2 : sb2t
117 pshufb %xmm3, %xmm2 # 2 = sb2t
118 pxor %xmm5, %xmm2 # 2 = 2A
119 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
120 movdqa %xmm0, %xmm3 # 3 = A
121 pshufb %xmm1, %xmm0 # 0 = B
122 add \$16, %r9 # next key
123 pxor %xmm2, %xmm0 # 0 = 2A+B
124 pshufb %xmm4, %xmm3 # 3 = D
125 add \$16, %r11 # next mc
126 pxor %xmm0, %xmm3 # 3 = 2A+B+D
127 pshufb %xmm1, %xmm0 # 0 = 2B+C
128 and \$0x30, %r11 # ... mod 4
129 pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
130 sub \$1,%rax # nr--
131
132.Lenc_entry:
133 # top of round
134 movdqa %xmm9, %xmm1 # 1 : i
135 pandn %xmm0, %xmm1 # 1 = i<<4
136 psrld \$4, %xmm1 # 1 = i
137 pand %xmm9, %xmm0 # 0 = k
138 movdqa %xmm11, %xmm5 # 2 : a/k
139 pshufb %xmm0, %xmm5 # 2 = a/k
140 pxor %xmm1, %xmm0 # 0 = j
141 movdqa %xmm10, %xmm3 # 3 : 1/i
142 pshufb %xmm1, %xmm3 # 3 = 1/i
143 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
144 movdqa %xmm10, %xmm4 # 4 : 1/j
145 pshufb %xmm0, %xmm4 # 4 = 1/j
146 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
147 movdqa %xmm10, %xmm2 # 2 : 1/iak
148 pshufb %xmm3, %xmm2 # 2 = 1/iak
149 pxor %xmm0, %xmm2 # 2 = io
150 movdqa %xmm10, %xmm3 # 3 : 1/jak
151 movdqu (%r9), %xmm5
152 pshufb %xmm4, %xmm3 # 3 = 1/jak
153 pxor %xmm1, %xmm3 # 3 = jo
154 jnz .Lenc_loop
155
156 # middle of last round
157 movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
158 movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
159 pshufb %xmm2, %xmm4 # 4 = sbou
160 pxor %xmm5, %xmm4 # 4 = sb1u + k
161 pshufb %xmm3, %xmm0 # 0 = sb1t
162 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
163 pxor %xmm4, %xmm0 # 0 = A
164 pshufb %xmm1, %xmm0
165 ret
166.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
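
The pand/psrld/pshufb pattern at .Lenc_entry is the heart of the module: any GF(2)-linear byte map T satisfies T(x) = T(x & 0x0f) ^ T(x & 0xf0), so two 16-entry tables, indexed by low and high nibble, stand in for a 256-entry table, and pshufb evaluates 16 lanes at once. A byte-at-a-time sketch in plain Perl (illustrative; the identity tables below are placeholders, not the real .Lk_* constants):

    use strict;
    use warnings;

    my @lo = map { $_ }      0 .. 15;       # T applied to (x & 0x0f)
    my @hi = map { $_ << 4 } 0 .. 15;       # T applied to (x & 0xf0)

    sub transform {
        my $x = shift;
        return $lo[$x & 0x0f] ^ $hi[$x >> 4];   # pand / psrld+pshufb / pxor
    }

    printf "0x%02x\n", transform(0xa7);         # 0xa7 with identity tables
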
167
168##
169## Decryption core
170##
171## Same API as encryption core.
172##
173.type _vpaes_decrypt_core,\@abi-omnipotent
174.align 16
175_vpaes_decrypt_core:
176 _CET_ENDBR
177 mov %rdx, %r9 # load key
178 mov 240(%rdx),%eax
179 movdqa %xmm9, %xmm1
180 movdqa .Lk_dipt(%rip), %xmm2 # iptlo
181 pandn %xmm0, %xmm1
182 mov %rax, %r11
183 psrld \$4, %xmm1
184 movdqu (%r9), %xmm5 # round0 key
185 shl \$4, %r11
186 pand %xmm9, %xmm0
187 pshufb %xmm0, %xmm2
188 movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
189 xor \$0x30, %r11
190 lea .Lk_dsbd(%rip),%r10
191 pshufb %xmm1, %xmm0
192 and \$0x30, %r11
193 pxor %xmm5, %xmm2
194 movdqa .Lk_mc_forward+48(%rip), %xmm5
195 pxor %xmm2, %xmm0
196 add \$16, %r9
197 add %r10, %r11
198 jmp .Ldec_entry
199
200.align 16
201.Ldec_loop:
202##
203## Inverse mix columns
204##
205 movdqa -0x20(%r10),%xmm4 # 4 : sb9u
206 pshufb %xmm2, %xmm4 # 4 = sb9u
207 pxor %xmm0, %xmm4
208 movdqa -0x10(%r10),%xmm0 # 0 : sb9t
209 pshufb %xmm3, %xmm0 # 0 = sb9t
210 pxor %xmm4, %xmm0 # 0 = ch
211 add \$16, %r9 # next round key
212
213 pshufb %xmm5, %xmm0 # MC ch
214 movdqa 0x00(%r10),%xmm4 # 4 : sbdu
215 pshufb %xmm2, %xmm4 # 4 = sbdu
216 pxor %xmm0, %xmm4 # 4 = ch
217 movdqa 0x10(%r10),%xmm0 # 0 : sbdt
218 pshufb %xmm3, %xmm0 # 0 = sbdt
219 pxor %xmm4, %xmm0 # 0 = ch
220 sub \$1,%rax # nr--
221
222 pshufb %xmm5, %xmm0 # MC ch
223 movdqa 0x20(%r10),%xmm4 # 4 : sbbu
224 pshufb %xmm2, %xmm4 # 4 = sbbu
225 pxor %xmm0, %xmm4 # 4 = ch
226 movdqa 0x30(%r10),%xmm0 # 0 : sbbt
227 pshufb %xmm3, %xmm0 # 0 = sbbt
228 pxor %xmm4, %xmm0 # 0 = ch
229
230 pshufb %xmm5, %xmm0 # MC ch
231 movdqa 0x40(%r10),%xmm4 # 4 : sbeu
232 pshufb %xmm2, %xmm4 # 4 = sbeu
233 pxor %xmm0, %xmm4 # 4 = ch
234 movdqa 0x50(%r10),%xmm0 # 0 : sbet
235 pshufb %xmm3, %xmm0 # 0 = sbet
236 pxor %xmm4, %xmm0 # 0 = ch
237
238 palignr \$12, %xmm5, %xmm5
239
240.Ldec_entry:
241 # top of round
242 movdqa %xmm9, %xmm1 # 1 : i
243 pandn %xmm0, %xmm1 # 1 = i<<4
244 psrld \$4, %xmm1 # 1 = i
245 pand %xmm9, %xmm0 # 0 = k
246 movdqa %xmm11, %xmm2 # 2 : a/k
247 pshufb %xmm0, %xmm2 # 2 = a/k
248 pxor %xmm1, %xmm0 # 0 = j
249 movdqa %xmm10, %xmm3 # 3 : 1/i
250 pshufb %xmm1, %xmm3 # 3 = 1/i
251 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
252 movdqa %xmm10, %xmm4 # 4 : 1/j
253 pshufb %xmm0, %xmm4 # 4 = 1/j
254 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
255 movdqa %xmm10, %xmm2 # 2 : 1/iak
256 pshufb %xmm3, %xmm2 # 2 = 1/iak
257 pxor %xmm0, %xmm2 # 2 = io
258 movdqa %xmm10, %xmm3 # 3 : 1/jak
259 pshufb %xmm4, %xmm3 # 3 = 1/jak
260 pxor %xmm1, %xmm3 # 3 = jo
261 movdqu (%r9), %xmm0
262 jnz .Ldec_loop
263
264 # middle of last round
265 movdqa 0x60(%r10), %xmm4 # 3 : sbou
266 pshufb %xmm2, %xmm4 # 4 = sbou
267 pxor %xmm0, %xmm4 # 4 = sb1u + k
268 movdqa 0x70(%r10), %xmm0 # 0 : sbot
269 movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
270 pshufb %xmm3, %xmm0 # 0 = sb1t
271 pxor %xmm4, %xmm0 # 0 = A
272 pshufb %xmm2, %xmm0
273 ret
274.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
275
276########################################################
277## ##
278## AES key schedule ##
279## ##
280########################################################
281.type _vpaes_schedule_core,\@abi-omnipotent
282.align 16
283_vpaes_schedule_core:
284 _CET_ENDBR
285 # rdi = key
286 # rsi = size in bits
287 # rdx = buffer
288 # rcx = direction. 0=encrypt, 1=decrypt
289
290 call _vpaes_preheat # load the tables
291 movdqa .Lk_rcon(%rip), %xmm8 # load rcon
292 movdqu (%rdi), %xmm0 # load key (unaligned)
293
294 # input transform
295 movdqa %xmm0, %xmm3
296 lea .Lk_ipt(%rip), %r11
297 call _vpaes_schedule_transform
298 movdqa %xmm0, %xmm7
299
300 lea .Lk_sr(%rip),%r10
301 test %rcx, %rcx
302 jnz .Lschedule_am_decrypting
303
304 # encrypting, output zeroth round key after transform
305 movdqu %xmm0, (%rdx)
306 jmp .Lschedule_go
307
308.Lschedule_am_decrypting:
309 # decrypting, output zeroth round key after shiftrows
310 movdqa (%r8,%r10),%xmm1
311 pshufb %xmm1, %xmm3
312 movdqu %xmm3, (%rdx)
313 xor \$0x30, %r8
314
315.Lschedule_go:
316 cmp \$192, %esi
317 ja .Lschedule_256
318 je .Lschedule_192
319	# 128: fall through
320
321##
322## .schedule_128
323##
324## 128-bit specific part of key schedule.
325##
326## This schedule is really simple, because all its parts
327## are accomplished by the subroutines.
328##
329.Lschedule_128:
330 mov \$10, %esi
331
332.Loop_schedule_128:
333 call _vpaes_schedule_round
334 dec %rsi
335 jz .Lschedule_mangle_last
336 call _vpaes_schedule_mangle # write output
337 jmp .Loop_schedule_128
338
339##
340## .aes_schedule_192
341##
342## 192-bit specific part of key schedule.
343##
344## The main body of this schedule is the same as the 128-bit
345## schedule, but with more smearing. The long, high side is
346## stored in %xmm7 as before, and the short, low side is in
347## the high bits of %xmm6.
348##
349## This schedule is somewhat nastier, however, because each
350## round produces 192 bits of key material, or 1.5 round keys.
351## Therefore, on each cycle we do 2 rounds and produce 3 round
352## keys.
353##
354.align 16
355.Lschedule_192:
356 movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
357 call _vpaes_schedule_transform # input transform
358 movdqa %xmm0, %xmm6 # save short part
359 pxor %xmm4, %xmm4 # clear 4
360 movhlps %xmm4, %xmm6 # clobber low side with zeros
361 mov \$4, %esi
362
363.Loop_schedule_192:
364 call _vpaes_schedule_round
365 palignr \$8,%xmm6,%xmm0
366 call _vpaes_schedule_mangle # save key n
367 call _vpaes_schedule_192_smear
368 call _vpaes_schedule_mangle # save key n+1
369 call _vpaes_schedule_round
370 dec %rsi
371 jz .Lschedule_mangle_last
372 call _vpaes_schedule_mangle # save key n+2
373 call _vpaes_schedule_192_smear
374 jmp .Loop_schedule_192
375
376##
377## .aes_schedule_256
378##
379## 256-bit specific part of key schedule.
380##
381## The structure here is very similar to the 128-bit
382## schedule, but with an additional "low side" in
383## %xmm6. The low side's rounds are the same as the
384## high side's, except no rcon and no rotation.
385##
386.align 16
387.Lschedule_256:
388 movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
389 call _vpaes_schedule_transform # input transform
390 mov \$7, %esi
391
392.Loop_schedule_256:
393 call _vpaes_schedule_mangle # output low result
394 movdqa %xmm0, %xmm6 # save cur_lo in xmm6
395
396 # high round
397 call _vpaes_schedule_round
398 dec %rsi
399 jz .Lschedule_mangle_last
400 call _vpaes_schedule_mangle
401
402 # low round. swap xmm7 and xmm6
403 pshufd \$0xFF, %xmm0, %xmm0
404 movdqa %xmm7, %xmm5
405 movdqa %xmm6, %xmm7
406 call _vpaes_schedule_low_round
407 movdqa %xmm5, %xmm7
408
409 jmp .Loop_schedule_256
410
411
412##
413## .aes_schedule_mangle_last
414##
415## Mangler for last round of key schedule
416## Mangles %xmm0
417## when encrypting, outputs out(%xmm0) ^ 63
418## when decrypting, outputs unskew(%xmm0)
419##
420## Always called right before return... jumps to cleanup and exits
421##
422.align 16
423.Lschedule_mangle_last:
424 # schedule last round key from xmm0
425 lea .Lk_deskew(%rip),%r11 # prepare to deskew
426 test %rcx, %rcx
427 jnz .Lschedule_mangle_last_dec
428
429 # encrypting
430 movdqa (%r8,%r10),%xmm1
431 pshufb %xmm1, %xmm0 # output permute
432 lea .Lk_opt(%rip), %r11 # prepare to output transform
433 add \$32, %rdx
434
435.Lschedule_mangle_last_dec:
436 add \$-16, %rdx
437 pxor .Lk_s63(%rip), %xmm0
438 call _vpaes_schedule_transform # output transform
439 movdqu %xmm0, (%rdx) # save last key
440
441 # cleanup
442 pxor %xmm0, %xmm0
443 pxor %xmm1, %xmm1
444 pxor %xmm2, %xmm2
445 pxor %xmm3, %xmm3
446 pxor %xmm4, %xmm4
447 pxor %xmm5, %xmm5
448 pxor %xmm6, %xmm6
449 pxor %xmm7, %xmm7
450 ret
451.size _vpaes_schedule_core,.-_vpaes_schedule_core
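
Counting what the core emits: one zeroth-round key before .Lschedule_go, then one key per mangle, for rounds+1 16-byte round keys in total. Illustrative Perl:

    for my $nbits (128, 192, 256) {
        my $rounds = $nbits / 32 + 5;
        printf "%d-bit: %2d round keys = %3d bytes\n",
            $nbits, $rounds + 1, 16 * ($rounds + 1);   # 176, 208, 240 bytes
    }
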
452
453##
454## .aes_schedule_192_smear
455##
456## Smear the short, low side in the 192-bit key schedule.
457##
458## Inputs:
459## %xmm7: high side, b a x y
460## %xmm6: low side, d c 0 0
461## %xmm13: 0
462##
463## Outputs:
464## %xmm6: b+c+d b+c 0 0
465## %xmm0: b+c+d b+c b a
466##
467.type _vpaes_schedule_192_smear,\@abi-omnipotent
468.align 16
469_vpaes_schedule_192_smear:
470 _CET_ENDBR
471 pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
472 pxor %xmm0, %xmm6 # -> c+d c 0 0
473 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
474 pxor %xmm0, %xmm6 # -> b+c+d b+c b a
475 movdqa %xmm6, %xmm0
476 pxor %xmm1, %xmm1
477 movhlps %xmm1, %xmm6 # clobber low side with zeros
478 ret
479.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
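
The two pshufd immediates are dword gathers whose 2-bit fields select source dwords, low field first. Replaying the 0x80 case from the comments in plain Perl (illustrative; dwords are listed low-to-high here, while the comments read high-to-low):

    use strict;
    use warnings;

    sub pshufd {                       # imm8 = four 2-bit source selectors
        my ($imm, @src) = @_;          # @src = (dw0, dw1, dw2, dw3)
        return map { $src[($imm >> (2 * $_)) & 3] } 0 .. 3;
    }

    my @xmm6 = ('0', '0', 'c', 'd');   # the comments' "d c 0 0"
    print join(' ', reverse pshufd(0x80, @xmm6)), "\n";   # c 0 0 0
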
480
481##
482## .aes_schedule_round
483##
484## Runs one main round of the key schedule on %xmm0, %xmm7
485##
486## Specifically, runs subbytes on the high dword of %xmm0
487## then rotates it by one byte and xors into the low dword of
488## %xmm7.
489##
490## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
491## next rcon.
492##
493## Smears the dwords of %xmm7 by xoring the low into the
494## second low, result into third, result into highest.
495##
496## Returns results in %xmm7 = %xmm0.
497## Clobbers %xmm1-%xmm4, %r11.
498##
499.type _vpaes_schedule_round,\@abi-omnipotent
500.align 16
501_vpaes_schedule_round:
502 _CET_ENDBR
503 # extract rcon from xmm8
504 pxor %xmm1, %xmm1
505 palignr \$15, %xmm8, %xmm1
506 palignr \$15, %xmm8, %xmm8
507 pxor %xmm1, %xmm7
508
509 # rotate
510 pshufd \$0xFF, %xmm0, %xmm0
511 palignr \$1, %xmm0, %xmm0
512
513 # fall through...
514
515 # low round: same as high round, but no rotation and no rcon.
516_vpaes_schedule_low_round:
517 # smear xmm7
518 movdqa %xmm7, %xmm1
519 pslldq \$4, %xmm7
520 pxor %xmm1, %xmm7
521 movdqa %xmm7, %xmm1
522 pslldq \$8, %xmm7
523 pxor %xmm1, %xmm7
524 pxor .Lk_s63(%rip), %xmm7
525
526 # subbytes
527 movdqa %xmm9, %xmm1
528 pandn %xmm0, %xmm1
529 psrld \$4, %xmm1 # 1 = i
530 pand %xmm9, %xmm0 # 0 = k
531 movdqa %xmm11, %xmm2 # 2 : a/k
532 pshufb %xmm0, %xmm2 # 2 = a/k
533 pxor %xmm1, %xmm0 # 0 = j
534 movdqa %xmm10, %xmm3 # 3 : 1/i
535 pshufb %xmm1, %xmm3 # 3 = 1/i
536 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
537 movdqa %xmm10, %xmm4 # 4 : 1/j
538 pshufb %xmm0, %xmm4 # 4 = 1/j
539 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
540 movdqa %xmm10, %xmm2 # 2 : 1/iak
541 pshufb %xmm3, %xmm2 # 2 = 1/iak
542 pxor %xmm0, %xmm2 # 2 = io
543 movdqa %xmm10, %xmm3 # 3 : 1/jak
544 pshufb %xmm4, %xmm3 # 3 = 1/jak
545 pxor %xmm1, %xmm3 # 3 = jo
546 movdqa %xmm13, %xmm4 # 4 : sbou
547 pshufb %xmm2, %xmm4 # 4 = sbou
548 movdqa %xmm12, %xmm0 # 0 : sbot
549 pshufb %xmm3, %xmm0 # 0 = sb1t
550 pxor %xmm4, %xmm0 # 0 = sbox output
551
552 # add in smeared stuff
553 pxor %xmm7, %xmm0
554 movdqa %xmm0, %xmm7
555 ret
556.size _vpaes_schedule_round,.-_vpaes_schedule_round
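
The smear in _vpaes_schedule_low_round (pslldq $4, pxor; pslldq $8, pxor) is a prefix XOR over the four dwords of %xmm7, computed in two doubling steps; its serial equivalent in plain Perl (illustrative):

    my @w = (0x00000001, 0x00000010, 0x00000100, 0x00001000);
    $w[$_] ^= $w[$_ - 1] for 1 .. 3;      # w1 ^= w0; w2 ^= w1; w3 ^= w2
    printf "%08x %08x %08x %08x\n", @w;   # 00000001 00000011 00000111 00001111
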
557
558##
559## .aes_schedule_transform
560##
561## Linear-transform %xmm0 according to tables at (%r11)
562##
563## Requires that %xmm9 = 0x0F0F... as in preheat
564## Output in %xmm0
565## Clobbers %xmm1, %xmm2
566##
567.type _vpaes_schedule_transform,\@abi-omnipotent
568.align 16
569_vpaes_schedule_transform:
570 _CET_ENDBR
571 movdqa %xmm9, %xmm1
572 pandn %xmm0, %xmm1
573 psrld \$4, %xmm1
574 pand %xmm9, %xmm0
575 movdqa (%r11), %xmm2 # lo
576 pshufb %xmm0, %xmm2
577 movdqa 16(%r11), %xmm0 # hi
578 pshufb %xmm1, %xmm0
579 pxor %xmm2, %xmm0
580 ret
581.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
582
583##
584## .aes_schedule_mangle
585##
586## Mangle xmm0 from (basis-transformed) standard version
587## to our version.
588##
589## On encrypt,
590## xor with 0x63
591## multiply by circulant 0,1,1,1
592## apply shiftrows transform
593##
594## On decrypt,
595## xor with 0x63
596## multiply by "inverse mixcolumns" circulant E,B,D,9
597## deskew
598## apply shiftrows transform
599##
600##
601## Writes out to (%rdx), and increments or decrements it
602## Keeps track of round number mod 4 in %r8
603## Preserves xmm0
604## Clobbers xmm1-xmm5
605##
606.type _vpaes_schedule_mangle,\@abi-omnipotent
607.align 16
608_vpaes_schedule_mangle:
609 _CET_ENDBR
610 movdqa %xmm0, %xmm4 # save xmm0 for later
611 movdqa .Lk_mc_forward(%rip),%xmm5
612 test %rcx, %rcx
613 jnz .Lschedule_mangle_dec
614
615 # encrypting
616 add \$16, %rdx
617 pxor .Lk_s63(%rip),%xmm4
618 pshufb %xmm5, %xmm4
619 movdqa %xmm4, %xmm3
620 pshufb %xmm5, %xmm4
621 pxor %xmm4, %xmm3
622 pshufb %xmm5, %xmm4
623 pxor %xmm4, %xmm3
624
625 jmp .Lschedule_mangle_both
626.align 16
627.Lschedule_mangle_dec:
628 # inverse mix columns
629 lea .Lk_dksd(%rip),%r11
630 movdqa %xmm9, %xmm1
631 pandn %xmm4, %xmm1
632 psrld \$4, %xmm1 # 1 = hi
633 pand %xmm9, %xmm4 # 4 = lo
634
635 movdqa 0x00(%r11), %xmm2
636 pshufb %xmm4, %xmm2
637 movdqa 0x10(%r11), %xmm3
638 pshufb %xmm1, %xmm3
639 pxor %xmm2, %xmm3
640 pshufb %xmm5, %xmm3
641
642 movdqa 0x20(%r11), %xmm2
643 pshufb %xmm4, %xmm2
644 pxor %xmm3, %xmm2
645 movdqa 0x30(%r11), %xmm3
646 pshufb %xmm1, %xmm3
647 pxor %xmm2, %xmm3
648 pshufb %xmm5, %xmm3
649
650 movdqa 0x40(%r11), %xmm2
651 pshufb %xmm4, %xmm2
652 pxor %xmm3, %xmm2
653 movdqa 0x50(%r11), %xmm3
654 pshufb %xmm1, %xmm3
655 pxor %xmm2, %xmm3
656 pshufb %xmm5, %xmm3
657
658 movdqa 0x60(%r11), %xmm2
659 pshufb %xmm4, %xmm2
660 pxor %xmm3, %xmm2
661 movdqa 0x70(%r11), %xmm3
662 pshufb %xmm1, %xmm3
663 pxor %xmm2, %xmm3
664
665 add \$-16, %rdx
666
667.Lschedule_mangle_both:
668 movdqa (%r8,%r10),%xmm1
669 pshufb %xmm1,%xmm3
670 add \$-16, %r8
671 and \$0x30, %r8
672 movdqu %xmm3, (%rdx)
673 ret
674.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
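
The add $-16 / and $0x30 pair at .Lschedule_mangle_both keeps %r8 cycling through the four 16-byte rows of .Lk_sr; stepping it in plain Perl (illustrative):

    my $r8 = 0x30;                     # initial value from set_encrypt_key
    for (1 .. 6) {
        printf "0x%02x ", $r8;         # 0x30 0x20 0x10 0x00 0x30 0x20
        $r8 = ($r8 - 16) & 0x30;
    }
    print "\n";
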
675
676#
677# Interface to OpenSSL
678#
679.globl ${PREFIX}_set_encrypt_key
680.type ${PREFIX}_set_encrypt_key,\@function,3
681.align 16
682${PREFIX}_set_encrypt_key:
683 _CET_ENDBR
684___
685$code.=<<___ if ($win64);
686 lea -0xb8(%rsp),%rsp
687 movaps %xmm6,0x10(%rsp)
688 movaps %xmm7,0x20(%rsp)
689 movaps %xmm8,0x30(%rsp)
690 movaps %xmm9,0x40(%rsp)
691 movaps %xmm10,0x50(%rsp)
692 movaps %xmm11,0x60(%rsp)
693 movaps %xmm12,0x70(%rsp)
694 movaps %xmm13,0x80(%rsp)
695 movaps %xmm14,0x90(%rsp)
696 movaps %xmm15,0xa0(%rsp)
697.Lenc_key_body:
698___
699$code.=<<___;
700 mov %esi,%eax
701 shr \$5,%eax
702 add \$5,%eax
703 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
704
705 mov \$0,%ecx
706 mov \$0x30,%r8d
707 call _vpaes_schedule_core
708___
709$code.=<<___ if ($win64);
710 movaps 0x10(%rsp),%xmm6
711 movaps 0x20(%rsp),%xmm7
712 movaps 0x30(%rsp),%xmm8
713 movaps 0x40(%rsp),%xmm9
714 movaps 0x50(%rsp),%xmm10
715 movaps 0x60(%rsp),%xmm11
716 movaps 0x70(%rsp),%xmm12
717 movaps 0x80(%rsp),%xmm13
718 movaps 0x90(%rsp),%xmm14
719 movaps 0xa0(%rsp),%xmm15
720 lea 0xb8(%rsp),%rsp
721.Lenc_key_epilogue:
722___
723$code.=<<___;
724 xor %eax,%eax
725 ret
726.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
727
728.globl ${PREFIX}_set_decrypt_key
729.type ${PREFIX}_set_decrypt_key,\@function,3
730.align 16
731${PREFIX}_set_decrypt_key:
732 _CET_ENDBR
733___
734$code.=<<___ if ($win64);
735 lea -0xb8(%rsp),%rsp
736 movaps %xmm6,0x10(%rsp)
737 movaps %xmm7,0x20(%rsp)
738 movaps %xmm8,0x30(%rsp)
739 movaps %xmm9,0x40(%rsp)
740 movaps %xmm10,0x50(%rsp)
741 movaps %xmm11,0x60(%rsp)
742 movaps %xmm12,0x70(%rsp)
743 movaps %xmm13,0x80(%rsp)
744 movaps %xmm14,0x90(%rsp)
745 movaps %xmm15,0xa0(%rsp)
746.Ldec_key_body:
747___
748$code.=<<___;
749 mov %esi,%eax
750 shr \$5,%eax
751 add \$5,%eax
752 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
753 shl \$4,%eax
754 lea 16(%rdx,%rax),%rdx
755
756 mov \$1,%ecx
757 mov %esi,%r8d
758 shr \$1,%r8d
759 and \$32,%r8d
760 xor \$32,%r8d # nbits==192?0:32
761 call _vpaes_schedule_core
762___
763$code.=<<___ if ($win64);
764 movaps 0x10(%rsp),%xmm6
765 movaps 0x20(%rsp),%xmm7
766 movaps 0x30(%rsp),%xmm8
767 movaps 0x40(%rsp),%xmm9
768 movaps 0x50(%rsp),%xmm10
769 movaps 0x60(%rsp),%xmm11
770 movaps 0x70(%rsp),%xmm12
771 movaps 0x80(%rsp),%xmm13
772 movaps 0x90(%rsp),%xmm14
773 movaps 0xa0(%rsp),%xmm15
774 lea 0xb8(%rsp),%rsp
775.Ldec_key_epilogue:
776___
777$code.=<<___;
778 xor %eax,%eax
779 ret
780.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
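
For decryption the schedule is written back-to-front, so the lea above first moves %rdx one 16-byte slot past the last round key, and the mangle path pre-decrements before each store. The pointer arithmetic spelled out in plain Perl (illustrative):

    for my $nbits (128, 192, 256) {
        my $rounds = ($nbits >> 5) + 5;
        my $end    = 16 + ($rounds << 4);  # lea 16(%rdx,%rax), %rax = rounds*16
        printf "%d-bit: write pointer starts at key+%d\n", $nbits, $end;
    }
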
781
782.globl ${PREFIX}_encrypt
783.type ${PREFIX}_encrypt,\@function,3
784.align 16
785${PREFIX}_encrypt:
786 _CET_ENDBR
787___
788$code.=<<___ if ($win64);
789 lea -0xb8(%rsp),%rsp
790 movaps %xmm6,0x10(%rsp)
791 movaps %xmm7,0x20(%rsp)
792 movaps %xmm8,0x30(%rsp)
793 movaps %xmm9,0x40(%rsp)
794 movaps %xmm10,0x50(%rsp)
795 movaps %xmm11,0x60(%rsp)
796 movaps %xmm12,0x70(%rsp)
797 movaps %xmm13,0x80(%rsp)
798 movaps %xmm14,0x90(%rsp)
799 movaps %xmm15,0xa0(%rsp)
800.Lenc_body:
801___
802$code.=<<___;
803 movdqu (%rdi),%xmm0
804 call _vpaes_preheat
805 call _vpaes_encrypt_core
806 movdqu %xmm0,(%rsi)
807___
808$code.=<<___ if ($win64);
809 movaps 0x10(%rsp),%xmm6
810 movaps 0x20(%rsp),%xmm7
811 movaps 0x30(%rsp),%xmm8
812 movaps 0x40(%rsp),%xmm9
813 movaps 0x50(%rsp),%xmm10
814 movaps 0x60(%rsp),%xmm11
815 movaps 0x70(%rsp),%xmm12
816 movaps 0x80(%rsp),%xmm13
817 movaps 0x90(%rsp),%xmm14
818 movaps 0xa0(%rsp),%xmm15
819 lea 0xb8(%rsp),%rsp
820.Lenc_epilogue:
821___
822$code.=<<___;
823 ret
824.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
825
826.globl ${PREFIX}_decrypt
827.type ${PREFIX}_decrypt,\@function,3
828.align 16
829${PREFIX}_decrypt:
830 _CET_ENDBR
831___
832$code.=<<___ if ($win64);
833 lea -0xb8(%rsp),%rsp
834 movaps %xmm6,0x10(%rsp)
835 movaps %xmm7,0x20(%rsp)
836 movaps %xmm8,0x30(%rsp)
837 movaps %xmm9,0x40(%rsp)
838 movaps %xmm10,0x50(%rsp)
839 movaps %xmm11,0x60(%rsp)
840 movaps %xmm12,0x70(%rsp)
841 movaps %xmm13,0x80(%rsp)
842 movaps %xmm14,0x90(%rsp)
843 movaps %xmm15,0xa0(%rsp)
844.Ldec_body:
845___
846$code.=<<___;
847 movdqu (%rdi),%xmm0
848 call _vpaes_preheat
849 call _vpaes_decrypt_core
850 movdqu %xmm0,(%rsi)
851___
852$code.=<<___ if ($win64);
853 movaps 0x10(%rsp),%xmm6
854 movaps 0x20(%rsp),%xmm7
855 movaps 0x30(%rsp),%xmm8
856 movaps 0x40(%rsp),%xmm9
857 movaps 0x50(%rsp),%xmm10
858 movaps 0x60(%rsp),%xmm11
859 movaps 0x70(%rsp),%xmm12
860 movaps 0x80(%rsp),%xmm13
861 movaps 0x90(%rsp),%xmm14
862 movaps 0xa0(%rsp),%xmm15
863 lea 0xb8(%rsp),%rsp
864.Ldec_epilogue:
865___
866$code.=<<___;
867 ret
868.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
869___
870{
871my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
872# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
873# size_t length, const AES_KEY *key,
874# unsigned char *ivp,const int enc);
875$code.=<<___;
876.globl ${PREFIX}_cbc_encrypt
877.type ${PREFIX}_cbc_encrypt,\@function,6
878.align 16
879${PREFIX}_cbc_encrypt:
880 _CET_ENDBR
881 xchg $key,$len
882___
883($len,$key)=($key,$len);
884$code.=<<___;
885 sub \$16,$len
886 jc .Lcbc_abort
887___
888$code.=<<___ if ($win64);
889 lea -0xb8(%rsp),%rsp
890 movaps %xmm6,0x10(%rsp)
891 movaps %xmm7,0x20(%rsp)
892 movaps %xmm8,0x30(%rsp)
893 movaps %xmm9,0x40(%rsp)
894 movaps %xmm10,0x50(%rsp)
895 movaps %xmm11,0x60(%rsp)
896 movaps %xmm12,0x70(%rsp)
897 movaps %xmm13,0x80(%rsp)
898 movaps %xmm14,0x90(%rsp)
899 movaps %xmm15,0xa0(%rsp)
900.Lcbc_body:
901___
902$code.=<<___;
903 movdqu ($ivp),%xmm6 # load IV
904 sub $inp,$out
905 call _vpaes_preheat
906 cmp \$0,${enc}d
907 je .Lcbc_dec_loop
908 jmp .Lcbc_enc_loop
909.align 16
910.Lcbc_enc_loop:
911 movdqu ($inp),%xmm0
912 pxor %xmm6,%xmm0
913 call _vpaes_encrypt_core
914 movdqa %xmm0,%xmm6
915 movdqu %xmm0,($out,$inp)
916 lea 16($inp),$inp
917 sub \$16,$len
918 jnc .Lcbc_enc_loop
919 jmp .Lcbc_done
920.align 16
921.Lcbc_dec_loop:
922 movdqu ($inp),%xmm0
923 movdqa %xmm0,%xmm7
924 call _vpaes_decrypt_core
925 pxor %xmm6,%xmm0
926 movdqa %xmm7,%xmm6
927 movdqu %xmm0,($out,$inp)
928 lea 16($inp),$inp
929 sub \$16,$len
930 jnc .Lcbc_dec_loop
931.Lcbc_done:
932 movdqu %xmm6,($ivp) # save IV
933___
934$code.=<<___ if ($win64);
935 movaps 0x10(%rsp),%xmm6
936 movaps 0x20(%rsp),%xmm7
937 movaps 0x30(%rsp),%xmm8
938 movaps 0x40(%rsp),%xmm9
939 movaps 0x50(%rsp),%xmm10
940 movaps 0x60(%rsp),%xmm11
941 movaps 0x70(%rsp),%xmm12
942 movaps 0x80(%rsp),%xmm13
943 movaps 0x90(%rsp),%xmm14
944 movaps 0xa0(%rsp),%xmm15
945 lea 0xb8(%rsp),%rsp
946.Lcbc_epilogue:
947___
948$code.=<<___;
949.Lcbc_abort:
950 ret
951.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
952___
953}
954$code.=<<___;
955##
956## _vpaes_preheat
957##
958## Fills register %r10 -> .Lk_s0F (so you can -fPIC)
959## and %xmm9-%xmm15 as specified below.
960##
961.type _vpaes_preheat,\@abi-omnipotent
962.align 16
963_vpaes_preheat:
964 _CET_ENDBR
965 lea .Lk_s0F(%rip), %r10
966 movdqa -0x20(%r10), %xmm10 # .Lk_inv
967 movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
968 movdqa 0x00(%r10), %xmm9 # .Lk_s0F
969 movdqa 0x30(%r10), %xmm13 # .Lk_sb1
970 movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
971 movdqa 0x50(%r10), %xmm15 # .Lk_sb2
972 movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
973 ret
974.size _vpaes_preheat,.-_vpaes_preheat
975########################################################
976## ##
977## Constants ##
978## ##
979########################################################
980.section .rodata
981.type _vpaes_consts,\@object
982.align 64
983_vpaes_consts:
984.Lk_inv: # inv, inva
985 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
986 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
987
988.Lk_s0F: # s0F
989 .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
990
991.Lk_ipt: # input transform (lo, hi)
992 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
993 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
994
995.Lk_sb1: # sb1u, sb1t
996 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
997 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
998.Lk_sb2: # sb2u, sb2t
999 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
1000 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
1001.Lk_sbo: # sbou, sbot
1002 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
1003 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
1004
1005.Lk_mc_forward: # mc_forward
1006 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
1007 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
1008 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
1009 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
1010
1011.Lk_mc_backward:# mc_backward
1012 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
1013 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
1014 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
1015 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
1016
1017.Lk_sr: # sr
1018 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
1019 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
1020 .quad 0x0F060D040B020900, 0x070E050C030A0108
1021 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
1022
1023.Lk_rcon: # rcon
1024 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
1025
1026.Lk_s63: # s63: all equal to 0x63 transformed
1027 .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
1028
1029.Lk_opt: # output transform
1030 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
1031 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
1032
1033.Lk_deskew: # deskew tables: inverts the sbox's "skew"
1034 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
1035 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
1036
1037##
1038## Decryption stuff
1039## Key schedule constants
1040##
1041.Lk_dksd: # decryption key schedule: invskew x*D
1042 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
1043 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
1044.Lk_dksb: # decryption key schedule: invskew x*B
1045 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
1046 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
1047.Lk_dkse: # decryption key schedule: invskew x*E + 0x63
1048 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
1049 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
1050.Lk_dks9: # decryption key schedule: invskew x*9
1051 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
1052 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
1053
1054##
1055## Decryption stuff
1056## Round function constants
1057##
1058.Lk_dipt: # decryption input transform
1059 .quad 0x0F505B040B545F00, 0x154A411E114E451A
1060 .quad 0x86E383E660056500, 0x12771772F491F194
1061
1062.Lk_dsb9: # decryption sbox output *9*u, *9*t
1063 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
1064 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
1065.Lk_dsbd: # decryption sbox output *D*u, *D*t
1066 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
1067 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
1068.Lk_dsbb: # decryption sbox output *B*u, *B*t
1069 .quad 0xD022649296B44200, 0x602646F6B0F2D404
1070 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
1071.Lk_dsbe: # decryption sbox output *E*u, *E*t
1072 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
1073 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
1074.Lk_dsbo: # decryption sbox final output
1075 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
1076 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
1077.align 64
1078.size _vpaes_consts,.-_vpaes_consts
1079.text
1080___
1081
1082if ($win64) {
1083# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1084# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1085$rec="%rcx";
1086$frame="%rdx";
1087$context="%r8";
1088$disp="%r9";
1089
1090$code.=<<___;
1091.extern __imp_RtlVirtualUnwind
1092.type se_handler,\@abi-omnipotent
1093.align 16
1094se_handler:
1095 _CET_ENDBR
1096 push %rsi
1097 push %rdi
1098 push %rbx
1099 push %rbp
1100 push %r12
1101 push %r13
1102 push %r14
1103 push %r15
1104 pushfq
1105 sub \$64,%rsp
1106
1107 mov 120($context),%rax # pull context->Rax
1108 mov 248($context),%rbx # pull context->Rip
1109
1110 mov 8($disp),%rsi # disp->ImageBase
1111 mov 56($disp),%r11 # disp->HandlerData
1112
1113 mov 0(%r11),%r10d # HandlerData[0]
1114 lea (%rsi,%r10),%r10 # prologue label
1115 cmp %r10,%rbx # context->Rip<prologue label
1116 jb .Lin_prologue
1117
1118 mov 152($context),%rax # pull context->Rsp
1119
1120 mov 4(%r11),%r10d # HandlerData[1]
1121 lea (%rsi,%r10),%r10 # epilogue label
1122 cmp %r10,%rbx # context->Rip>=epilogue label
1123 jae .Lin_prologue
1124
1125 lea 16(%rax),%rsi # %xmm save area
1126 lea 512($context),%rdi # &context.Xmm6
1127 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1128 .long 0xa548f3fc # cld; rep movsq
1129 lea 0xb8(%rax),%rax # adjust stack pointer
1130
1131.Lin_prologue:
1132 mov 8(%rax),%rdi
1133 mov 16(%rax),%rsi
1134 mov %rax,152($context) # restore context->Rsp
1135 mov %rsi,168($context) # restore context->Rsi
1136 mov %rdi,176($context) # restore context->Rdi
1137
1138 mov 40($disp),%rdi # disp->ContextRecord
1139 mov $context,%rsi # context
1140 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1141 .long 0xa548f3fc # cld; rep movsq
1142
1143 mov $disp,%rsi
1144 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1145 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1146 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1147 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1148 mov 40(%rsi),%r10 # disp->ContextRecord
1149 lea 56(%rsi),%r11 # &disp->HandlerData
1150 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1151 mov %r10,32(%rsp) # arg5
1152 mov %r11,40(%rsp) # arg6
1153 mov %r12,48(%rsp) # arg7
1154 mov %rcx,56(%rsp) # arg8, (NULL)
1155 call *__imp_RtlVirtualUnwind(%rip)
1156
1157 mov \$1,%eax # ExceptionContinueSearch
1158 add \$64,%rsp
1159 popfq
1160 pop %r15
1161 pop %r14
1162 pop %r13
1163 pop %r12
1164 pop %rbp
1165 pop %rbx
1166 pop %rdi
1167 pop %rsi
1168 ret
1169.size se_handler,.-se_handler
1170
1171.section .pdata
1172.align 4
1173 .rva .LSEH_begin_${PREFIX}_set_encrypt_key
1174 .rva .LSEH_end_${PREFIX}_set_encrypt_key
1175 .rva .LSEH_info_${PREFIX}_set_encrypt_key
1176
1177 .rva .LSEH_begin_${PREFIX}_set_decrypt_key
1178 .rva .LSEH_end_${PREFIX}_set_decrypt_key
1179 .rva .LSEH_info_${PREFIX}_set_decrypt_key
1180
1181 .rva .LSEH_begin_${PREFIX}_encrypt
1182 .rva .LSEH_end_${PREFIX}_encrypt
1183 .rva .LSEH_info_${PREFIX}_encrypt
1184
1185 .rva .LSEH_begin_${PREFIX}_decrypt
1186 .rva .LSEH_end_${PREFIX}_decrypt
1187 .rva .LSEH_info_${PREFIX}_decrypt
1188
1189 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
1190 .rva .LSEH_end_${PREFIX}_cbc_encrypt
1191 .rva .LSEH_info_${PREFIX}_cbc_encrypt
1192
1193.section .xdata
1194.align 8
1195.LSEH_info_${PREFIX}_set_encrypt_key:
1196 .byte 9,0,0,0
1197 .rva se_handler
1198 .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
1199.LSEH_info_${PREFIX}_set_decrypt_key:
1200 .byte 9,0,0,0
1201 .rva se_handler
1202 .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
1203.LSEH_info_${PREFIX}_encrypt:
1204 .byte 9,0,0,0
1205 .rva se_handler
1206 .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
1207.LSEH_info_${PREFIX}_decrypt:
1208 .byte 9,0,0,0
1209 .rva se_handler
1210 .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
1211.LSEH_info_${PREFIX}_cbc_encrypt:
1212 .byte 9,0,0,0
1213 .rva se_handler
1214 .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
1215___
1216}
1217
1218$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1219
1220print $code;
1221
1222close STDOUT;