summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/aes
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/aes')
-rw-r--r--src/lib/libcrypto/aes/aes.c296
-rw-r--r--src/lib/libcrypto/aes/aes_amd64.c201
-rw-r--r--src/lib/libcrypto/aes/aes_core.c101
-rw-r--r--src/lib/libcrypto/aes/aes_i386.c201
-rw-r--r--src/lib/libcrypto/aes/aes_ige.c195
-rw-r--r--src/lib/libcrypto/aes/aes_local.h31
-rw-r--r--src/lib/libcrypto/aes/asm/aes-586.pl30
-rwxr-xr-xsrc/lib/libcrypto/aes/asm/aes-x86_64.pl90
-rw-r--r--src/lib/libcrypto/aes/asm/bsaes-x86_64.pl3123
-rw-r--r--src/lib/libcrypto/aes/asm/vpaes-x86.pl911
-rw-r--r--src/lib/libcrypto/aes/asm/vpaes-x86_64.pl1222
11 files changed, 797 insertions, 5604 deletions
diff --git a/src/lib/libcrypto/aes/aes.c b/src/lib/libcrypto/aes/aes.c
index d36a006360..693badcd66 100644
--- a/src/lib/libcrypto/aes/aes.c
+++ b/src/lib/libcrypto/aes/aes.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: aes.c,v 1.4 2024/08/11 13:02:39 jsing Exp $ */ 1/* $OpenBSD: aes.c,v 1.14 2025/07/22 09:13:49 jsing Exp $ */
2/* ==================================================================== 2/* ====================================================================
3 * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. 3 * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
4 * 4 *
@@ -46,21 +46,72 @@
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE. 47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ==================================================================== 48 * ====================================================================
49 *
50 */ 49 */
51 50
52#include <string.h> 51#include <string.h>
53 52
54#include <openssl/aes.h> 53#include <openssl/aes.h>
55#include <openssl/bio.h> 54#include <openssl/bio.h>
55#include <openssl/crypto.h>
56#include <openssl/modes.h> 56#include <openssl/modes.h>
57 57
58#include "crypto_arch.h" 58#include "crypto_arch.h"
59#include "crypto_internal.h"
60#include "modes_local.h"
59 61
60static const unsigned char aes_wrap_default_iv[] = { 62static const unsigned char aes_wrap_default_iv[] = {
61 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 63 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
62}; 64};
63 65
66int aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits,
67 AES_KEY *key);
68int aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits,
69 AES_KEY *key);
70void aes_encrypt_internal(const unsigned char *in, unsigned char *out,
71 const AES_KEY *key);
72void aes_decrypt_internal(const unsigned char *in, unsigned char *out,
73 const AES_KEY *key);
74
75int
76AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
77{
78 return aes_set_encrypt_key_internal(userKey, bits, key);
79}
80LCRYPTO_ALIAS(AES_set_encrypt_key);
81
82int
83AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
84{
85 return aes_set_decrypt_key_internal(userKey, bits, key);
86}
87LCRYPTO_ALIAS(AES_set_decrypt_key);
88
89void
90AES_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
91{
92 aes_encrypt_internal(in, out, key);
93}
94LCRYPTO_ALIAS(AES_encrypt);
95
96void
97AES_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
98{
99 aes_decrypt_internal(in, out, key);
100}
101LCRYPTO_ALIAS(AES_decrypt);
102
103void
104aes_encrypt_block128(const unsigned char *in, unsigned char *out, const void *key)
105{
106 aes_encrypt_internal(in, out, key);
107}
108
109void
110aes_decrypt_block128(const unsigned char *in, unsigned char *out, const void *key)
111{
112 aes_decrypt_internal(in, out, key);
113}
114
64#ifdef HAVE_AES_CBC_ENCRYPT_INTERNAL 115#ifdef HAVE_AES_CBC_ENCRYPT_INTERNAL
65void aes_cbc_encrypt_internal(const unsigned char *in, unsigned char *out, 116void aes_cbc_encrypt_internal(const unsigned char *in, unsigned char *out,
66 size_t len, const AES_KEY *key, unsigned char *ivec, const int enc); 117 size_t len, const AES_KEY *key, unsigned char *ivec, const int enc);
@@ -72,10 +123,10 @@ aes_cbc_encrypt_internal(const unsigned char *in, unsigned char *out,
72{ 123{
73 if (enc) 124 if (enc)
74 CRYPTO_cbc128_encrypt(in, out, len, key, ivec, 125 CRYPTO_cbc128_encrypt(in, out, len, key, ivec,
75 (block128_f)AES_encrypt); 126 aes_encrypt_block128);
76 else 127 else
77 CRYPTO_cbc128_decrypt(in, out, len, key, ivec, 128 CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
78 (block128_f)AES_decrypt); 129 aes_decrypt_block128);
79} 130}
80#endif 131#endif
81 132
@@ -98,7 +149,7 @@ AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, size_t length,
98 const AES_KEY *key, unsigned char *ivec, int *num, const int enc) 149 const AES_KEY *key, unsigned char *ivec, int *num, const int enc)
99{ 150{
100 CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc, 151 CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc,
101 (block128_f)AES_encrypt); 152 aes_encrypt_block128);
102} 153}
103LCRYPTO_ALIAS(AES_cfb128_encrypt); 154LCRYPTO_ALIAS(AES_cfb128_encrypt);
104 155
@@ -108,7 +159,7 @@ AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, size_t length,
108 const AES_KEY *key, unsigned char *ivec, int *num, const int enc) 159 const AES_KEY *key, unsigned char *ivec, int *num, const int enc)
109{ 160{
110 CRYPTO_cfb128_1_encrypt(in, out, length, key, ivec, num, enc, 161 CRYPTO_cfb128_1_encrypt(in, out, length, key, ivec, num, enc,
111 (block128_f)AES_encrypt); 162 aes_encrypt_block128);
112} 163}
113LCRYPTO_ALIAS(AES_cfb1_encrypt); 164LCRYPTO_ALIAS(AES_cfb1_encrypt);
114 165
@@ -117,17 +168,134 @@ AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, size_t length,
117 const AES_KEY *key, unsigned char *ivec, int *num, const int enc) 168 const AES_KEY *key, unsigned char *ivec, int *num, const int enc)
118{ 169{
119 CRYPTO_cfb128_8_encrypt(in, out, length, key, ivec, num, enc, 170 CRYPTO_cfb128_8_encrypt(in, out, length, key, ivec, num, enc,
120 (block128_f)AES_encrypt); 171 aes_encrypt_block128);
121} 172}
122LCRYPTO_ALIAS(AES_cfb8_encrypt); 173LCRYPTO_ALIAS(AES_cfb8_encrypt);
123 174
124void 175void
176aes_ccm64_encrypt_generic(const unsigned char *in, unsigned char *out,
177 size_t blocks, const void *key, const unsigned char ivec[16],
178 unsigned char cmac[16], int encrypt)
179{
180 uint8_t iv[AES_BLOCK_SIZE], buf[AES_BLOCK_SIZE];
181 uint8_t in_mask;
182 uint64_t ctr;
183 int i;
184
185 in_mask = 0 - (encrypt != 0);
186
187 memcpy(iv, ivec, sizeof(iv));
188
189 ctr = crypto_load_be64toh(&iv[8]);
190
191 while (blocks > 0) {
192 crypto_store_htobe64(&iv[8], ctr);
193 aes_encrypt_internal(iv, buf, key);
194 ctr++;
195
196 for (i = 0; i < 16; i++) {
197 out[i] = in[i] ^ buf[i];
198 cmac[i] ^= (in[i] & in_mask) | (out[i] & ~in_mask);
199 }
200
201 aes_encrypt_internal(cmac, cmac, key);
202
203 in += 16;
204 out += 16;
205 blocks--;
206 }
207
208 explicit_bzero(buf, sizeof(buf));
209 explicit_bzero(iv, sizeof(iv));
210}
211
212#ifdef HAVE_AES_CCM64_ENCRYPT_INTERNAL
213void aes_ccm64_encrypt_internal(const unsigned char *in, unsigned char *out,
214 size_t blocks, const void *key, const unsigned char ivec[16],
215 unsigned char cmac[16], int encrypt);
216
217#else
218static inline void
219aes_ccm64_encrypt_internal(const unsigned char *in, unsigned char *out,
220 size_t blocks, const void *key, const unsigned char ivec[16],
221 unsigned char cmac[16], int encrypt)
222{
223 aes_ccm64_encrypt_generic(in, out, blocks, key, ivec, cmac, encrypt);
224}
225#endif
226
227void
228aes_ccm64_encrypt_ccm128f(const unsigned char *in, unsigned char *out,
229 size_t blocks, const void *key, const unsigned char ivec[16],
230 unsigned char cmac[16])
231{
232 aes_ccm64_encrypt_internal(in, out, blocks, key, ivec, cmac, 1);
233}
234
235void
236aes_ccm64_decrypt_ccm128f(const unsigned char *in, unsigned char *out,
237 size_t blocks, const void *key, const unsigned char ivec[16],
238 unsigned char cmac[16])
239{
240 aes_ccm64_encrypt_internal(in, out, blocks, key, ivec, cmac, 0);
241}
242
243void
244aes_ctr32_encrypt_generic(const unsigned char *in, unsigned char *out,
245 size_t blocks, const AES_KEY *key, const unsigned char ivec[AES_BLOCK_SIZE])
246{
247 uint8_t iv[AES_BLOCK_SIZE], buf[AES_BLOCK_SIZE];
248 uint32_t ctr;
249 int i;
250
251 memcpy(iv, ivec, sizeof(iv));
252
253 ctr = crypto_load_be32toh(&iv[12]);
254
255 while (blocks > 0) {
256 crypto_store_htobe32(&iv[12], ctr);
257 aes_encrypt_internal(iv, buf, key);
258 ctr++;
259
260 for (i = 0; i < AES_BLOCK_SIZE; i++)
261 out[i] = in[i] ^ buf[i];
262
263 in += 16;
264 out += 16;
265 blocks--;
266 }
267
268 explicit_bzero(buf, sizeof(buf));
269 explicit_bzero(iv, sizeof(iv));
270}
271
272#ifdef HAVE_AES_CTR32_ENCRYPT_INTERNAL
273void aes_ctr32_encrypt_internal(const unsigned char *in, unsigned char *out,
274 size_t blocks, const AES_KEY *key, const unsigned char ivec[AES_BLOCK_SIZE]);
275
276#else
277static inline void
278aes_ctr32_encrypt_internal(const unsigned char *in, unsigned char *out,
279 size_t blocks, const AES_KEY *key, const unsigned char ivec[AES_BLOCK_SIZE])
280{
281 aes_ctr32_encrypt_generic(in, out, blocks, key, ivec);
282}
283#endif
284
285void
286aes_ctr32_encrypt_ctr128f(const unsigned char *in, unsigned char *out, size_t blocks,
287 const void *key, const unsigned char ivec[AES_BLOCK_SIZE])
288{
289 aes_ctr32_encrypt_internal(in, out, blocks, key, ivec);
290}
291
292void
125AES_ctr128_encrypt(const unsigned char *in, unsigned char *out, 293AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
126 size_t length, const AES_KEY *key, unsigned char ivec[AES_BLOCK_SIZE], 294 size_t length, const AES_KEY *key, unsigned char ivec[AES_BLOCK_SIZE],
127 unsigned char ecount_buf[AES_BLOCK_SIZE], unsigned int *num) 295 unsigned char ecount_buf[AES_BLOCK_SIZE], unsigned int *num)
128{ 296{
129 CRYPTO_ctr128_encrypt(in, out, length, key, ivec, ecount_buf, num, 297 CRYPTO_ctr128_encrypt_ctr32(in, out, length, key, ivec, ecount_buf,
130 (block128_f)AES_encrypt); 298 num, aes_ctr32_encrypt_ctr128f);
131} 299}
132LCRYPTO_ALIAS(AES_ctr128_encrypt); 300LCRYPTO_ALIAS(AES_ctr128_encrypt);
133 301
@@ -142,15 +310,121 @@ AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
142} 310}
143LCRYPTO_ALIAS(AES_ecb_encrypt); 311LCRYPTO_ALIAS(AES_ecb_encrypt);
144 312
313#ifndef HAVE_AES_ECB_ENCRYPT_INTERNAL
314void
315aes_ecb_encrypt_internal(const unsigned char *in, unsigned char *out,
316 size_t len, const AES_KEY *key, int encrypt)
317{
318 while (len >= AES_BLOCK_SIZE) {
319 AES_ecb_encrypt(in, out, key, encrypt);
320 in += AES_BLOCK_SIZE;
321 out += AES_BLOCK_SIZE;
322 len -= AES_BLOCK_SIZE;
323 }
324}
325#endif
326
327#define N_WORDS (AES_BLOCK_SIZE / sizeof(unsigned long))
328typedef struct {
329 unsigned long data[N_WORDS];
330} aes_block_t;
331
332void
333AES_ige_encrypt(const unsigned char *in, unsigned char *out, size_t length,
334 const AES_KEY *key, unsigned char *ivec, const int enc)
335{
336 aes_block_t tmp, tmp2;
337 aes_block_t iv;
338 aes_block_t iv2;
339 size_t n;
340 size_t len;
341
342 /* N.B. The IV for this mode is _twice_ the block size */
343
344 OPENSSL_assert((length % AES_BLOCK_SIZE) == 0);
345
346 len = length / AES_BLOCK_SIZE;
347
348 memcpy(iv.data, ivec, AES_BLOCK_SIZE);
349 memcpy(iv2.data, ivec + AES_BLOCK_SIZE, AES_BLOCK_SIZE);
350
351 if (AES_ENCRYPT == enc) {
352 while (len) {
353 memcpy(tmp.data, in, AES_BLOCK_SIZE);
354 for (n = 0; n < N_WORDS; ++n)
355 tmp2.data[n] = tmp.data[n] ^ iv.data[n];
356 AES_encrypt((unsigned char *)tmp2.data,
357 (unsigned char *)tmp2.data, key);
358 for (n = 0; n < N_WORDS; ++n)
359 tmp2.data[n] ^= iv2.data[n];
360 memcpy(out, tmp2.data, AES_BLOCK_SIZE);
361 iv = tmp2;
362 iv2 = tmp;
363 --len;
364 in += AES_BLOCK_SIZE;
365 out += AES_BLOCK_SIZE;
366 }
367 } else {
368 while (len) {
369 memcpy(tmp.data, in, AES_BLOCK_SIZE);
370 tmp2 = tmp;
371 for (n = 0; n < N_WORDS; ++n)
372 tmp.data[n] ^= iv2.data[n];
373 AES_decrypt((unsigned char *)tmp.data,
374 (unsigned char *)tmp.data, key);
375 for (n = 0; n < N_WORDS; ++n)
376 tmp.data[n] ^= iv.data[n];
377 memcpy(out, tmp.data, AES_BLOCK_SIZE);
378 iv = tmp2;
379 iv2 = tmp;
380 --len;
381 in += AES_BLOCK_SIZE;
382 out += AES_BLOCK_SIZE;
383 }
384 }
385 memcpy(ivec, iv.data, AES_BLOCK_SIZE);
386 memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
387}
388LCRYPTO_ALIAS(AES_ige_encrypt);
389
145void 390void
146AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, size_t length, 391AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, size_t length,
147 const AES_KEY *key, unsigned char *ivec, int *num) 392 const AES_KEY *key, unsigned char *ivec, int *num)
148{ 393{
149 CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num, 394 CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num,
150 (block128_f)AES_encrypt); 395 aes_encrypt_block128);
151} 396}
152LCRYPTO_ALIAS(AES_ofb128_encrypt); 397LCRYPTO_ALIAS(AES_ofb128_encrypt);
153 398
399void
400aes_xts_encrypt_generic(const unsigned char *in, unsigned char *out, size_t len,
401 const AES_KEY *key1, const AES_KEY *key2, const unsigned char iv[16],
402 int encrypt)
403{
404 XTS128_CONTEXT xctx;
405
406 if (encrypt)
407 xctx.block1 = aes_encrypt_block128;
408 else
409 xctx.block1 = aes_decrypt_block128;
410
411 xctx.block2 = aes_encrypt_block128;
412 xctx.key1 = key1;
413 xctx.key2 = key2;
414
415 CRYPTO_xts128_encrypt(&xctx, iv, in, out, len, encrypt);
416}
417
418#ifndef HAVE_AES_XTS_ENCRYPT_INTERNAL
419void
420aes_xts_encrypt_internal(const unsigned char *in, unsigned char *out, size_t len,
421 const AES_KEY *key1, const AES_KEY *key2, const unsigned char iv[16],
422 int encrypt)
423{
424 aes_xts_encrypt_generic(in, out, len, key1, key2, iv, encrypt);
425}
426#endif
427
154int 428int
155AES_wrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out, 429AES_wrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out,
156 const unsigned char *in, unsigned int inlen) 430 const unsigned char *in, unsigned int inlen)
@@ -217,7 +491,7 @@ AES_unwrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out,
217 } 491 }
218 if (!iv) 492 if (!iv)
219 iv = aes_wrap_default_iv; 493 iv = aes_wrap_default_iv;
220 if (memcmp(A, iv, 8)) { 494 if (timingsafe_memcmp(A, iv, 8) != 0) {
221 explicit_bzero(out, inlen); 495 explicit_bzero(out, inlen);
222 return 0; 496 return 0;
223 } 497 }
diff --git a/src/lib/libcrypto/aes/aes_amd64.c b/src/lib/libcrypto/aes/aes_amd64.c
new file mode 100644
index 0000000000..183a5cce14
--- /dev/null
+++ b/src/lib/libcrypto/aes/aes_amd64.c
@@ -0,0 +1,201 @@
1/* $OpenBSD: aes_amd64.c,v 1.5 2025/07/22 09:13:49 jsing Exp $ */
2/*
3 * Copyright (c) 2025 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <openssl/aes.h>
19
20#include "crypto_arch.h"
21#include "modes_local.h"
22
23int aes_set_encrypt_key_generic(const unsigned char *userKey, const int bits,
24 AES_KEY *key);
25int aes_set_decrypt_key_generic(const unsigned char *userKey, const int bits,
26 AES_KEY *key);
27
28void aes_encrypt_generic(const unsigned char *in, unsigned char *out,
29 const AES_KEY *key);
30void aes_decrypt_generic(const unsigned char *in, unsigned char *out,
31 const AES_KEY *key);
32
33void aes_cbc_encrypt_generic(const unsigned char *in, unsigned char *out,
34 size_t len, const AES_KEY *key, unsigned char *ivec, const int enc);
35
36void aes_ccm64_encrypt_generic(const unsigned char *in, unsigned char *out,
37 size_t blocks, const void *key, const unsigned char ivec[16],
38 unsigned char cmac[16], int encrypt);
39
40void aes_ctr32_encrypt_generic(const unsigned char *in, unsigned char *out,
41 size_t blocks, const AES_KEY *key, const unsigned char ivec[AES_BLOCK_SIZE]);
42
43void aes_xts_encrypt_generic(const unsigned char *in, unsigned char *out,
44 size_t len, const AES_KEY *key1, const AES_KEY *key2,
45 const unsigned char iv[16], int encrypt);
46
47int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
48 AES_KEY *key);
49int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
50 AES_KEY *key);
51
52void aesni_encrypt(const unsigned char *in, unsigned char *out,
53 const AES_KEY *key);
54void aesni_decrypt(const unsigned char *in, unsigned char *out,
55 const AES_KEY *key);
56
57void aesni_cbc_encrypt(const unsigned char *in, unsigned char *out,
58 size_t len, const AES_KEY *key, unsigned char *ivec, const int enc);
59
60void aesni_ccm64_encrypt_blocks(const unsigned char *in, unsigned char *out,
61 size_t blocks, const void *key, const unsigned char ivec[16],
62 unsigned char cmac[16]);
63
64void aesni_ccm64_decrypt_blocks(const unsigned char *in, unsigned char *out,
65 size_t blocks, const void *key, const unsigned char ivec[16],
66 unsigned char cmac[16]);
67
68void aesni_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
69 size_t blocks, const void *key, const unsigned char *ivec);
70
71void aesni_ecb_encrypt(const unsigned char *in, unsigned char *out,
72 size_t length, const AES_KEY *key, int enc);
73
74void aesni_xts_encrypt(const unsigned char *in, unsigned char *out,
75 size_t length, const AES_KEY *key1, const AES_KEY *key2,
76 const unsigned char iv[16]);
77
78void aesni_xts_decrypt(const unsigned char *in, unsigned char *out,
79 size_t length, const AES_KEY *key1, const AES_KEY *key2,
80 const unsigned char iv[16]);
81
82int
83aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits,
84 AES_KEY *key)
85{
86 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_AES) != 0)
87 return aesni_set_encrypt_key(userKey, bits, key);
88
89 return aes_set_encrypt_key_generic(userKey, bits, key);
90}
91
92int
93aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits,
94 AES_KEY *key)
95{
96 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_AES) != 0)
97 return aesni_set_decrypt_key(userKey, bits, key);
98
99 return aes_set_decrypt_key_generic(userKey, bits, key);
100}
101
102void
103aes_encrypt_internal(const unsigned char *in, unsigned char *out,
104 const AES_KEY *key)
105{
106 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_AES) != 0) {
107 aesni_encrypt(in, out, key);
108 return;
109 }
110
111 aes_encrypt_generic(in, out, key);
112}
113
114void
115aes_decrypt_internal(const unsigned char *in, unsigned char *out,
116 const AES_KEY *key)
117{
118 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_AES) != 0) {
119 aesni_decrypt(in, out, key);
120 return;
121 }
122
123 aes_decrypt_generic(in, out, key);
124}
125
126void
127aes_cbc_encrypt_internal(const unsigned char *in, unsigned char *out,
128 size_t len, const AES_KEY *key, unsigned char *ivec, const int enc)
129{
130 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_AES) != 0) {
131 aesni_cbc_encrypt(in, out, len, key, ivec, enc);
132 return;
133 }
134
135 aes_cbc_encrypt_generic(in, out, len, key, ivec, enc);
136}
137
138void
139aes_ccm64_encrypt_internal(const unsigned char *in, unsigned char *out,
140 size_t blocks, const void *key, const unsigned char ivec[16],
141 unsigned char cmac[16], int encrypt)
142{
143 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_AES) != 0) {
144 if (encrypt)
145 aesni_ccm64_encrypt_blocks(in, out, blocks, key, ivec, cmac);
146 else
147 aesni_ccm64_decrypt_blocks(in, out, blocks, key, ivec, cmac);
148 return;
149 }
150
151 aes_ccm64_encrypt_generic(in, out, blocks, key, ivec, cmac, encrypt);
152}
153
154void
155aes_ctr32_encrypt_internal(const unsigned char *in, unsigned char *out,
156 size_t blocks, const AES_KEY *key, const unsigned char ivec[AES_BLOCK_SIZE])
157{
158 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_AES) != 0) {
159 aesni_ctr32_encrypt_blocks(in, out, blocks, key, ivec);
160 return;
161 }
162
163 aes_ctr32_encrypt_generic(in, out, blocks, key, ivec);
164}
165
166void
167aes_ecb_encrypt_internal(const unsigned char *in, unsigned char *out,
168 size_t len, const AES_KEY *key, int encrypt)
169{
170 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_AES) != 0) {
171 aesni_ecb_encrypt(in, out, len, key, encrypt);
172 return;
173 }
174
175 while (len >= AES_BLOCK_SIZE) {
176 if (encrypt)
177 aes_encrypt_generic(in, out, key);
178 else
179 aes_decrypt_generic(in, out, key);
180
181 in += AES_BLOCK_SIZE;
182 out += AES_BLOCK_SIZE;
183 len -= AES_BLOCK_SIZE;
184 }
185}
186
187void
188aes_xts_encrypt_internal(const unsigned char *in, unsigned char *out,
189 size_t len, const AES_KEY *key1, const AES_KEY *key2,
190 const unsigned char iv[16], int encrypt)
191{
192 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_AES) != 0) {
193 if (encrypt)
194 aesni_xts_encrypt(in, out, len, key1, key2, iv);
195 else
196 aesni_xts_decrypt(in, out, len, key1, key2, iv);
197 return;
198 }
199
200 aes_xts_encrypt_generic(in, out, len, key1, key2, iv, encrypt);
201}
diff --git a/src/lib/libcrypto/aes/aes_core.c b/src/lib/libcrypto/aes/aes_core.c
index 4383d74903..8eccb998d3 100644
--- a/src/lib/libcrypto/aes/aes_core.c
+++ b/src/lib/libcrypto/aes/aes_core.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: aes_core.c,v 1.25 2024/11/13 21:00:57 tb Exp $ */ 1/* $OpenBSD: aes_core.c,v 1.27 2025/04/21 12:23:09 jsing Exp $ */
2/** 2/**
3 * rijndael-alg-fst.c 3 * rijndael-alg-fst.c
4 * 4 *
@@ -30,7 +30,7 @@
30 * compatible API. 30 * compatible API.
31 */ 31 */
32 32
33#include <stdlib.h> 33#include <stdint.h>
34 34
35#include <openssl/aes.h> 35#include <openssl/aes.h>
36 36
@@ -55,7 +55,7 @@ Td4[x] = Si[x].[01];
55 !defined(HAVE_AES_SET_DECRYPT_KEY_INTERNAL) || \ 55 !defined(HAVE_AES_SET_DECRYPT_KEY_INTERNAL) || \
56 !defined(HAVE_AES_ENCRYPT_INTERNAL) || \ 56 !defined(HAVE_AES_ENCRYPT_INTERNAL) || \
57 !defined(HAVE_AES_DECRYPT_INTERNAL) 57 !defined(HAVE_AES_DECRYPT_INTERNAL)
58static const u32 Te0[256] = { 58static const uint32_t Te0[256] = {
59 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU, 59 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
60 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U, 60 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
61 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU, 61 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
@@ -121,7 +121,7 @@ static const u32 Te0[256] = {
121 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U, 121 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
122 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU, 122 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
123}; 123};
124static const u32 Te1[256] = { 124static const uint32_t Te1[256] = {
125 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU, 125 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
126 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U, 126 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
127 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU, 127 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
@@ -187,7 +187,7 @@ static const u32 Te1[256] = {
187 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU, 187 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
188 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U, 188 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
189}; 189};
190static const u32 Te2[256] = { 190static const uint32_t Te2[256] = {
191 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU, 191 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
192 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U, 192 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
193 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU, 193 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
@@ -253,7 +253,7 @@ static const u32 Te2[256] = {
253 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU, 253 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
254 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U, 254 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
255}; 255};
256static const u32 Te3[256] = { 256static const uint32_t Te3[256] = {
257 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U, 257 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
258 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U, 258 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
259 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U, 259 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
@@ -320,7 +320,7 @@ static const u32 Te3[256] = {
320 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU, 320 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
321}; 321};
322 322
323static const u32 Td0[256] = { 323static const uint32_t Td0[256] = {
324 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U, 324 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
325 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U, 325 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
326 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U, 326 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
@@ -386,7 +386,7 @@ static const u32 Td0[256] = {
386 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U, 386 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
387 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U, 387 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
388}; 388};
389static const u32 Td1[256] = { 389static const uint32_t Td1[256] = {
390 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU, 390 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
391 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U, 391 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
392 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU, 392 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
@@ -452,7 +452,7 @@ static const u32 Td1[256] = {
452 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U, 452 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
453 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U, 453 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
454}; 454};
455static const u32 Td2[256] = { 455static const uint32_t Td2[256] = {
456 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U, 456 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
457 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U, 457 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
458 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U, 458 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
@@ -518,7 +518,7 @@ static const u32 Td2[256] = {
518 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U, 518 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
519 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U, 519 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
520}; 520};
521static const u32 Td3[256] = { 521static const uint32_t Td3[256] = {
522 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU, 522 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
523 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU, 523 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
524 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U, 524 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
@@ -588,7 +588,7 @@ static const u32 Td3[256] = {
588 588
589#if !defined(HAVE_AES_ENCRYPT_INTERNAL) || \ 589#if !defined(HAVE_AES_ENCRYPT_INTERNAL) || \
590 !defined(HAVE_AES_DECRYPT_INTERNAL) 590 !defined(HAVE_AES_DECRYPT_INTERNAL)
591static const u8 Td4[256] = { 591static const uint8_t Td4[256] = {
592 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U, 592 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
593 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU, 593 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
594 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U, 594 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
@@ -626,29 +626,24 @@ static const u8 Td4[256] = {
626 626
627#if !defined(HAVE_AES_SET_ENCRYPT_KEY_INTERNAL) || \ 627#if !defined(HAVE_AES_SET_ENCRYPT_KEY_INTERNAL) || \
628 !defined(HAVE_AES_SET_DECRYPT_KEY_INTERNAL) 628 !defined(HAVE_AES_SET_DECRYPT_KEY_INTERNAL)
629static const u32 rcon[] = { 629static const uint32_t rcon[] = {
630 0x01000000, 0x02000000, 0x04000000, 0x08000000, 630 0x01000000, 0x02000000, 0x04000000, 0x08000000,
631 0x10000000, 0x20000000, 0x40000000, 0x80000000, 631 0x10000000, 0x20000000, 0x40000000, 0x80000000,
632 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ 632 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
633}; 633};
634#endif 634#endif
635 635
636#ifdef HAVE_AES_SET_ENCRYPT_KEY_INTERNAL 636#ifndef HAVE_AES_SET_ENCRYPT_KEY_INTERNAL
637int aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits,
638 AES_KEY *key);
639
640#else
641
642/* 637/*
643 * Expand the cipher key into the encryption key schedule. 638 * Expand the cipher key into the encryption key schedule.
644 */ 639 */
645static inline int 640int
646aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits, 641aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits,
647 AES_KEY *key) 642 AES_KEY *key)
648{ 643{
649 u32 *rk; 644 uint32_t *rk;
650 int i = 0; 645 int i = 0;
651 u32 temp; 646 uint32_t temp;
652 647
653 if (!userKey || !key) 648 if (!userKey || !key)
654 return -1; 649 return -1;
@@ -742,28 +737,17 @@ aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits,
742} 737}
743#endif 738#endif
744 739
745int 740#ifndef HAVE_AES_SET_DECRYPT_KEY_INTERNAL
746AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
747{
748 return aes_set_encrypt_key_internal(userKey, bits, key);
749}
750LCRYPTO_ALIAS(AES_set_encrypt_key);
751
752#ifdef HAVE_AES_SET_DECRYPT_KEY_INTERNAL
753int aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits,
754 AES_KEY *key);
755
756#else
757/* 741/*
758 * Expand the cipher key into the decryption key schedule. 742 * Expand the cipher key into the decryption key schedule.
759 */ 743 */
760static inline int 744int
761aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits, 745aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits,
762 AES_KEY *key) 746 AES_KEY *key)
763{ 747{
764 u32 *rk; 748 uint32_t *rk;
765 int i, j, status; 749 int i, j, status;
766 u32 temp; 750 uint32_t temp;
767 751
768 /* first, start with an encryption schedule */ 752 /* first, start with an encryption schedule */
769 status = AES_set_encrypt_key(userKey, bits, key); 753 status = AES_set_encrypt_key(userKey, bits, key);
@@ -815,27 +799,16 @@ aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits,
815} 799}
816#endif 800#endif
817 801
818int 802#ifndef HAVE_AES_ENCRYPT_INTERNAL
819AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
820{
821 return aes_set_decrypt_key_internal(userKey, bits, key);
822}
823LCRYPTO_ALIAS(AES_set_decrypt_key);
824
825#ifdef HAVE_AES_ENCRYPT_INTERNAL
826void aes_encrypt_internal(const unsigned char *in, unsigned char *out,
827 const AES_KEY *key);
828
829#else
830/* 803/*
831 * Encrypt a single block - in and out can overlap. 804 * Encrypt a single block - in and out can overlap.
832 */ 805 */
833static inline void 806void
834aes_encrypt_internal(const unsigned char *in, unsigned char *out, 807aes_encrypt_internal(const unsigned char *in, unsigned char *out,
835 const AES_KEY *key) 808 const AES_KEY *key)
836{ 809{
837 const u32 *rk; 810 const uint32_t *rk;
838 u32 s0, s1, s2, s3, t0, t1, t2, t3; 811 uint32_t s0, s1, s2, s3, t0, t1, t2, t3;
839#ifndef FULL_UNROLL 812#ifndef FULL_UNROLL
840 int r; 813 int r;
841#endif /* ?FULL_UNROLL */ 814#endif /* ?FULL_UNROLL */
@@ -1018,27 +991,16 @@ aes_encrypt_internal(const unsigned char *in, unsigned char *out,
1018} 991}
1019#endif 992#endif
1020 993
1021void 994#ifndef HAVE_AES_DECRYPT_INTERNAL
1022AES_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
1023{
1024 aes_encrypt_internal(in, out, key);
1025}
1026LCRYPTO_ALIAS(AES_encrypt);
1027
1028#ifdef HAVE_AES_DECRYPT_INTERNAL
1029void aes_decrypt_internal(const unsigned char *in, unsigned char *out,
1030 const AES_KEY *key);
1031
1032#else
1033/* 995/*
1034 * Decrypt a single block - in and out can overlap. 996 * Decrypt a single block - in and out can overlap.
1035 */ 997 */
1036static inline void 998void
1037aes_decrypt_internal(const unsigned char *in, unsigned char *out, 999aes_decrypt_internal(const unsigned char *in, unsigned char *out,
1038 const AES_KEY *key) 1000 const AES_KEY *key)
1039{ 1001{
1040 const u32 *rk; 1002 const uint32_t *rk;
1041 u32 s0, s1, s2, s3, t0, t1, t2, t3; 1003 uint32_t s0, s1, s2, s3, t0, t1, t2, t3;
1042#ifndef FULL_UNROLL 1004#ifndef FULL_UNROLL
1043 int r; 1005 int r;
1044#endif /* ?FULL_UNROLL */ 1006#endif /* ?FULL_UNROLL */
@@ -1220,10 +1182,3 @@ aes_decrypt_internal(const unsigned char *in, unsigned char *out,
1220 crypto_store_htobe32(&out[3 * 4], s3); 1182 crypto_store_htobe32(&out[3 * 4], s3);
1221} 1183}
1222#endif 1184#endif
1223
1224void
1225AES_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
1226{
1227 aes_decrypt_internal(in, out, key);
1228}
1229LCRYPTO_ALIAS(AES_decrypt);
diff --git a/src/lib/libcrypto/aes/aes_i386.c b/src/lib/libcrypto/aes/aes_i386.c
new file mode 100644
index 0000000000..85a14454da
--- /dev/null
+++ b/src/lib/libcrypto/aes/aes_i386.c
@@ -0,0 +1,201 @@
1/* $OpenBSD: aes_i386.c,v 1.5 2025/07/22 09:13:49 jsing Exp $ */
2/*
3 * Copyright (c) 2025 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <openssl/aes.h>
19
20#include "crypto_arch.h"
21#include "modes_local.h"
22
23int aes_set_encrypt_key_generic(const unsigned char *userKey, const int bits,
24 AES_KEY *key);
25int aes_set_decrypt_key_generic(const unsigned char *userKey, const int bits,
26 AES_KEY *key);
27
28void aes_encrypt_generic(const unsigned char *in, unsigned char *out,
29 const AES_KEY *key);
30void aes_decrypt_generic(const unsigned char *in, unsigned char *out,
31 const AES_KEY *key);
32
33void aes_cbc_encrypt_generic(const unsigned char *in, unsigned char *out,
34 size_t len, const AES_KEY *key, unsigned char *ivec, const int enc);
35
36void aes_ccm64_encrypt_generic(const unsigned char *in, unsigned char *out,
37 size_t blocks, const void *key, const unsigned char ivec[16],
38 unsigned char cmac[16], int encrypt);
39
40void aes_ctr32_encrypt_generic(const unsigned char *in, unsigned char *out,
41 size_t blocks, const AES_KEY *key, const unsigned char ivec[AES_BLOCK_SIZE]);
42
43void aes_xts_encrypt_generic(const unsigned char *in, unsigned char *out,
44 size_t len, const AES_KEY *key1, const AES_KEY *key2,
45 const unsigned char iv[16], int encrypt);
46
47int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
48 AES_KEY *key);
49int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
50 AES_KEY *key);
51
52void aesni_encrypt(const unsigned char *in, unsigned char *out,
53 const AES_KEY *key);
54void aesni_decrypt(const unsigned char *in, unsigned char *out,
55 const AES_KEY *key);
56
57void aesni_cbc_encrypt(const unsigned char *in, unsigned char *out,
58 size_t len, const AES_KEY *key, unsigned char *ivec, const int enc);
59
60void aesni_ccm64_encrypt_blocks(const unsigned char *in, unsigned char *out,
61 size_t blocks, const void *key, const unsigned char ivec[16],
62 unsigned char cmac[16]);
63
64void aesni_ccm64_decrypt_blocks(const unsigned char *in, unsigned char *out,
65 size_t blocks, const void *key, const unsigned char ivec[16],
66 unsigned char cmac[16]);
67
68void aesni_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
69 size_t blocks, const void *key, const unsigned char *ivec);
70
71void aesni_ecb_encrypt(const unsigned char *in, unsigned char *out,
72 size_t length, const AES_KEY *key, int enc);
73
74void aesni_xts_encrypt(const unsigned char *in, unsigned char *out,
75 size_t length, const AES_KEY *key1, const AES_KEY *key2,
76 const unsigned char iv[16]);
77
78void aesni_xts_decrypt(const unsigned char *in, unsigned char *out,
79 size_t length, const AES_KEY *key1, const AES_KEY *key2,
80 const unsigned char iv[16]);
81
82int
83aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits,
84 AES_KEY *key)
85{
86 if ((crypto_cpu_caps_i386 & CRYPTO_CPU_CAPS_I386_AES) != 0)
87 return aesni_set_encrypt_key(userKey, bits, key);
88
89 return aes_set_encrypt_key_generic(userKey, bits, key);
90}
91
92int
93aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits,
94 AES_KEY *key)
95{
96 if ((crypto_cpu_caps_i386 & CRYPTO_CPU_CAPS_I386_AES) != 0)
97 return aesni_set_decrypt_key(userKey, bits, key);
98
99 return aes_set_decrypt_key_generic(userKey, bits, key);
100}
101
102void
103aes_encrypt_internal(const unsigned char *in, unsigned char *out,
104 const AES_KEY *key)
105{
106 if ((crypto_cpu_caps_i386 & CRYPTO_CPU_CAPS_I386_AES) != 0) {
107 aesni_encrypt(in, out, key);
108 return;
109 }
110
111 aes_encrypt_generic(in, out, key);
112}
113
114void
115aes_decrypt_internal(const unsigned char *in, unsigned char *out,
116 const AES_KEY *key)
117{
118 if ((crypto_cpu_caps_i386 & CRYPTO_CPU_CAPS_I386_AES) != 0) {
119 aesni_decrypt(in, out, key);
120 return;
121 }
122
123 aes_decrypt_generic(in, out, key);
124}
125
126void
127aes_cbc_encrypt_internal(const unsigned char *in, unsigned char *out,
128 size_t len, const AES_KEY *key, unsigned char *ivec, const int enc)
129{
130 if ((crypto_cpu_caps_i386 & CRYPTO_CPU_CAPS_I386_AES) != 0) {
131 aesni_cbc_encrypt(in, out, len, key, ivec, enc);
132 return;
133 }
134
135 aes_cbc_encrypt_generic(in, out, len, key, ivec, enc);
136}
137
138void
139aes_ccm64_encrypt_internal(const unsigned char *in, unsigned char *out,
140 size_t blocks, const void *key, const unsigned char ivec[16],
141 unsigned char cmac[16], int encrypt)
142{
143 if ((crypto_cpu_caps_i386 & CRYPTO_CPU_CAPS_I386_AES) != 0) {
144 if (encrypt)
145 aesni_ccm64_encrypt_blocks(in, out, blocks, key, ivec, cmac);
146 else
147 aesni_ccm64_decrypt_blocks(in, out, blocks, key, ivec, cmac);
148 return;
149 }
150
151 aes_ccm64_encrypt_generic(in, out, blocks, key, ivec, cmac, encrypt);
152}
153
154void
155aes_ctr32_encrypt_internal(const unsigned char *in, unsigned char *out,
156 size_t blocks, const AES_KEY *key, const unsigned char ivec[AES_BLOCK_SIZE])
157{
158 if ((crypto_cpu_caps_i386 & CRYPTO_CPU_CAPS_I386_AES) != 0) {
159 aesni_ctr32_encrypt_blocks(in, out, blocks, key, ivec);
160 return;
161 }
162
163 aes_ctr32_encrypt_generic(in, out, blocks, key, ivec);
164}
165
166void
167aes_ecb_encrypt_internal(const unsigned char *in, unsigned char *out,
168 size_t len, const AES_KEY *key, int encrypt)
169{
170 if ((crypto_cpu_caps_i386 & CRYPTO_CPU_CAPS_I386_AES) != 0) {
171 aesni_ecb_encrypt(in, out, len, key, encrypt);
172 return;
173 }
174
175 while (len >= AES_BLOCK_SIZE) {
176 if (encrypt)
177 aes_encrypt_generic(in, out, key);
178 else
179 aes_decrypt_generic(in, out, key);
180
181 in += AES_BLOCK_SIZE;
182 out += AES_BLOCK_SIZE;
183 len -= AES_BLOCK_SIZE;
184 }
185}
186
187void
188aes_xts_encrypt_internal(const unsigned char *in, unsigned char *out,
189 size_t len, const AES_KEY *key1, const AES_KEY *key2,
190 const unsigned char iv[16], int encrypt)
191{
192 if ((crypto_cpu_caps_i386 & CRYPTO_CPU_CAPS_I386_AES) != 0) {
193 if (encrypt)
194 aesni_xts_encrypt(in, out, len, key1, key2, iv);
195 else
196 aesni_xts_decrypt(in, out, len, key1, key2, iv);
197 return;
198 }
199
200 aes_xts_encrypt_generic(in, out, len, key1, key2, iv, encrypt);
201}
diff --git a/src/lib/libcrypto/aes/aes_ige.c b/src/lib/libcrypto/aes/aes_ige.c
deleted file mode 100644
index 1a6fcfcfbf..0000000000
--- a/src/lib/libcrypto/aes/aes_ige.c
+++ /dev/null
@@ -1,195 +0,0 @@
1/* $OpenBSD: aes_ige.c,v 1.10 2024/03/30 05:14:12 joshua Exp $ */
2/* ====================================================================
3 * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/crypto.h>
54
55#include "aes_local.h"
56
57#define N_WORDS (AES_BLOCK_SIZE / sizeof(unsigned long))
58typedef struct {
59 unsigned long data[N_WORDS];
60} aes_block_t;
61
62/* XXX: probably some better way to do this */
63#if defined(__i386__) || defined(__x86_64__)
64#define UNALIGNED_MEMOPS_ARE_FAST 1
65#else
66#define UNALIGNED_MEMOPS_ARE_FAST 0
67#endif
68
69#if UNALIGNED_MEMOPS_ARE_FAST
70#define load_block(d, s) (d) = *(const aes_block_t *)(s)
71#define store_block(d, s) *(aes_block_t *)(d) = (s)
72#else
73#define load_block(d, s) memcpy((d).data, (s), AES_BLOCK_SIZE)
74#define store_block(d, s) memcpy((d), (s).data, AES_BLOCK_SIZE)
75#endif
76
77/* N.B. The IV for this mode is _twice_ the block size */
78
79void
80AES_ige_encrypt(const unsigned char *in, unsigned char *out, size_t length,
81 const AES_KEY *key, unsigned char *ivec, const int enc)
82{
83 size_t n;
84 size_t len;
85
86 OPENSSL_assert((length % AES_BLOCK_SIZE) == 0);
87
88 len = length / AES_BLOCK_SIZE;
89
90 if (AES_ENCRYPT == enc) {
91 if (in != out && (UNALIGNED_MEMOPS_ARE_FAST ||
92 ((size_t)in|(size_t)out|(size_t)ivec) %
93 sizeof(long) == 0)) {
94 aes_block_t *ivp = (aes_block_t *)ivec;
95 aes_block_t *iv2p = (aes_block_t *)(ivec + AES_BLOCK_SIZE);
96
97 while (len) {
98 aes_block_t *inp = (aes_block_t *)in;
99 aes_block_t *outp = (aes_block_t *)out;
100
101 for (n = 0; n < N_WORDS; ++n)
102 outp->data[n] = inp->data[n] ^ ivp->data[n];
103 AES_encrypt((unsigned char *)outp->data, (unsigned char *)outp->data, key);
104 for (n = 0; n < N_WORDS; ++n)
105 outp->data[n] ^= iv2p->data[n];
106 ivp = outp;
107 iv2p = inp;
108 --len;
109 in += AES_BLOCK_SIZE;
110 out += AES_BLOCK_SIZE;
111 }
112 memmove(ivec, ivp->data, AES_BLOCK_SIZE);
113 memmove(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
114 } else {
115 aes_block_t tmp, tmp2;
116 aes_block_t iv;
117 aes_block_t iv2;
118
119 load_block(iv, ivec);
120 load_block(iv2, ivec + AES_BLOCK_SIZE);
121
122 while (len) {
123 load_block(tmp, in);
124 for (n = 0; n < N_WORDS; ++n)
125 tmp2.data[n] = tmp.data[n] ^ iv.data[n];
126 AES_encrypt((unsigned char *)tmp2.data,
127 (unsigned char *)tmp2.data, key);
128 for (n = 0; n < N_WORDS; ++n)
129 tmp2.data[n] ^= iv2.data[n];
130 store_block(out, tmp2);
131 iv = tmp2;
132 iv2 = tmp;
133 --len;
134 in += AES_BLOCK_SIZE;
135 out += AES_BLOCK_SIZE;
136 }
137 memcpy(ivec, iv.data, AES_BLOCK_SIZE);
138 memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
139 }
140 } else {
141 if (in != out && (UNALIGNED_MEMOPS_ARE_FAST ||
142 ((size_t)in|(size_t)out|(size_t)ivec) %
143 sizeof(long) == 0)) {
144 aes_block_t *ivp = (aes_block_t *)ivec;
145 aes_block_t *iv2p = (aes_block_t *)(ivec + AES_BLOCK_SIZE);
146
147 while (len) {
148 aes_block_t tmp;
149 aes_block_t *inp = (aes_block_t *)in;
150 aes_block_t *outp = (aes_block_t *)out;
151
152 for (n = 0; n < N_WORDS; ++n)
153 tmp.data[n] = inp->data[n] ^ iv2p->data[n];
154 AES_decrypt((unsigned char *)tmp.data,
155 (unsigned char *)outp->data, key);
156 for (n = 0; n < N_WORDS; ++n)
157 outp->data[n] ^= ivp->data[n];
158 ivp = inp;
159 iv2p = outp;
160 --len;
161 in += AES_BLOCK_SIZE;
162 out += AES_BLOCK_SIZE;
163 }
164 memmove(ivec, ivp->data, AES_BLOCK_SIZE);
165 memmove(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
166 } else {
167 aes_block_t tmp, tmp2;
168 aes_block_t iv;
169 aes_block_t iv2;
170
171 load_block(iv, ivec);
172 load_block(iv2, ivec + AES_BLOCK_SIZE);
173
174 while (len) {
175 load_block(tmp, in);
176 tmp2 = tmp;
177 for (n = 0; n < N_WORDS; ++n)
178 tmp.data[n] ^= iv2.data[n];
179 AES_decrypt((unsigned char *)tmp.data,
180 (unsigned char *)tmp.data, key);
181 for (n = 0; n < N_WORDS; ++n)
182 tmp.data[n] ^= iv.data[n];
183 store_block(out, tmp);
184 iv = tmp2;
185 iv2 = tmp;
186 --len;
187 in += AES_BLOCK_SIZE;
188 out += AES_BLOCK_SIZE;
189 }
190 memcpy(ivec, iv.data, AES_BLOCK_SIZE);
191 memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
192 }
193 }
194}
195LCRYPTO_ALIAS(AES_ige_encrypt);
diff --git a/src/lib/libcrypto/aes/aes_local.h b/src/lib/libcrypto/aes/aes_local.h
index e0714df409..a265eaac1d 100644
--- a/src/lib/libcrypto/aes/aes_local.h
+++ b/src/lib/libcrypto/aes/aes_local.h
@@ -1,4 +1,4 @@
1/* $OpenBSD: aes_local.h,v 1.4 2025/01/25 17:59:44 tb Exp $ */ 1/* $OpenBSD: aes_local.h,v 1.11 2025/07/22 09:29:31 jsing Exp $ */
2/* ==================================================================== 2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. 3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 * 4 *
@@ -60,17 +60,30 @@
60 60
61__BEGIN_HIDDEN_DECLS 61__BEGIN_HIDDEN_DECLS
62 62
63typedef unsigned int u32;
64typedef unsigned short u16;
65typedef unsigned char u8;
66
67#define MAXKC (256/32)
68#define MAXKB (256/8)
69#define MAXNR 14
70
71/* This controls loop-unrolling in aes_core.c */ 63/* This controls loop-unrolling in aes_core.c */
72#undef FULL_UNROLL 64#undef FULL_UNROLL
73 65
66void aes_encrypt_block128(const unsigned char *in, unsigned char *out,
67 const void *key);
68
69void aes_ctr32_encrypt_ctr128f(const unsigned char *in, unsigned char *out,
70 size_t blocks, const void *key, const unsigned char ivec[AES_BLOCK_SIZE]);
71
72void aes_ccm64_encrypt_ccm128f(const unsigned char *in, unsigned char *out,
73 size_t blocks, const void *key, const unsigned char ivec[16],
74 unsigned char cmac[16]);
75
76void aes_ccm64_decrypt_ccm128f(const unsigned char *in, unsigned char *out,
77 size_t blocks, const void *key, const unsigned char ivec[16],
78 unsigned char cmac[16]);
79
80void aes_ecb_encrypt_internal(const unsigned char *in, unsigned char *out,
81 size_t len, const AES_KEY *key, int encrypt);
82
83void aes_xts_encrypt_internal(const char unsigned *in, char unsigned *out,
84 size_t len, const AES_KEY *key1, const AES_KEY *key2,
85 const unsigned char iv[16], int encrypt);
86
74__END_HIDDEN_DECLS 87__END_HIDDEN_DECLS
75 88
76#endif /* !HEADER_AES_LOCAL_H */ 89#endif /* !HEADER_AES_LOCAL_H */
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl
index 364099d4d3..402a1a3c46 100644
--- a/src/lib/libcrypto/aes/asm/aes-586.pl
+++ b/src/lib/libcrypto/aes/asm/aes-586.pl
@@ -1158,8 +1158,8 @@ sub enclast()
1158 &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000); 1158 &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
1159 &previous(); 1159 &previous();
1160 1160
1161# void aes_encrypt_internal(const void *inp, void *out, const AES_KEY *key); 1161# void aes_encrypt_generic(const void *inp, void *out, const AES_KEY *key);
1162&function_begin("aes_encrypt_internal"); 1162&function_begin("aes_encrypt_generic");
1163 &mov ($acc,&wparam(0)); # load inp 1163 &mov ($acc,&wparam(0)); # load inp
1164 &mov ($key,&wparam(2)); # load key 1164 &mov ($key,&wparam(2)); # load key
1165 1165
@@ -1213,7 +1213,7 @@ sub enclast()
1213 &mov (&DWP(4,$acc),$s1); 1213 &mov (&DWP(4,$acc),$s1);
1214 &mov (&DWP(8,$acc),$s2); 1214 &mov (&DWP(8,$acc),$s2);
1215 &mov (&DWP(12,$acc),$s3); 1215 &mov (&DWP(12,$acc),$s3);
1216&function_end("aes_encrypt_internal"); 1216&function_end("aes_encrypt_generic");
1217 1217
1218#--------------------------------------------------------------------# 1218#--------------------------------------------------------------------#
1219 1219
@@ -1947,8 +1947,8 @@ sub declast()
1947 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); 1947 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1948 &previous(); 1948 &previous();
1949 1949
1950# void aes_decrypt_internal(const void *inp, void *out, const AES_KEY *key); 1950# void aes_decrypt_generic(const void *inp, void *out, const AES_KEY *key);
1951&function_begin("aes_decrypt_internal"); 1951&function_begin("aes_decrypt_generic");
1952 &mov ($acc,&wparam(0)); # load inp 1952 &mov ($acc,&wparam(0)); # load inp
1953 &mov ($key,&wparam(2)); # load key 1953 &mov ($key,&wparam(2)); # load key
1954 1954
@@ -2002,9 +2002,9 @@ sub declast()
2002 &mov (&DWP(4,$acc),$s1); 2002 &mov (&DWP(4,$acc),$s1);
2003 &mov (&DWP(8,$acc),$s2); 2003 &mov (&DWP(8,$acc),$s2);
2004 &mov (&DWP(12,$acc),$s3); 2004 &mov (&DWP(12,$acc),$s3);
2005&function_end("aes_decrypt_internal"); 2005&function_end("aes_decrypt_generic");
2006 2006
2007# void aes_cbc_encrypt_internal(const void char *inp, unsigned char *out, 2007# void aes_cbc_encrypt_generic(const void char *inp, unsigned char *out,
2008# size_t length, const AES_KEY *key, unsigned char *ivp,const int enc); 2008# size_t length, const AES_KEY *key, unsigned char *ivp,const int enc);
2009{ 2009{
2010# stack frame layout 2010# stack frame layout
@@ -2028,7 +2028,7 @@ my $ivec=&DWP(60,"esp"); # ivec[16]
2028my $aes_key=&DWP(76,"esp"); # copy of aes_key 2028my $aes_key=&DWP(76,"esp"); # copy of aes_key
2029my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds 2029my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
2030 2030
2031&function_begin("aes_cbc_encrypt_internal"); 2031&function_begin("aes_cbc_encrypt_generic");
2032 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len 2032 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
2033 &cmp ($s2,0); 2033 &cmp ($s2,0);
2034 &je (&label("drop_out")); 2034 &je (&label("drop_out"));
@@ -2616,7 +2616,7 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
2616 2616
2617 &mov ("esp",$_esp); 2617 &mov ("esp",$_esp);
2618 &popf (); 2618 &popf ();
2619&function_end("aes_cbc_encrypt_internal"); 2619&function_end("aes_cbc_encrypt_generic");
2620} 2620}
2621 2621
2622#------------------------------------------------------------------# 2622#------------------------------------------------------------------#
@@ -2849,12 +2849,12 @@ sub enckey()
2849 &set_label("exit"); 2849 &set_label("exit");
2850&function_end("_x86_AES_set_encrypt_key"); 2850&function_end("_x86_AES_set_encrypt_key");
2851 2851
2852# int aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits, 2852# int aes_set_encrypt_key_generic(const unsigned char *userKey, const int bits,
2853# AES_KEY *key) 2853# AES_KEY *key)
2854&function_begin_B("aes_set_encrypt_key_internal"); 2854&function_begin_B("aes_set_encrypt_key_generic");
2855 &call ("_x86_AES_set_encrypt_key"); 2855 &call ("_x86_AES_set_encrypt_key");
2856 &ret (); 2856 &ret ();
2857&function_end_B("aes_set_encrypt_key_internal"); 2857&function_end_B("aes_set_encrypt_key_generic");
2858 2858
2859sub deckey() 2859sub deckey()
2860{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; 2860{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
@@ -2911,9 +2911,9 @@ sub deckey()
2911 &mov (&DWP(4*$i,$key),$tp1); 2911 &mov (&DWP(4*$i,$key),$tp1);
2912} 2912}
2913 2913
2914# int aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits, 2914# int aes_set_decrypt_key_generic(const unsigned char *userKey, const int bits,
2915# AES_KEY *key) 2915# AES_KEY *key)
2916&function_begin_B("aes_set_decrypt_key_internal"); 2916&function_begin_B("aes_set_decrypt_key_generic");
2917 &call ("_x86_AES_set_encrypt_key"); 2917 &call ("_x86_AES_set_encrypt_key");
2918 &cmp ("eax",0); 2918 &cmp ("eax",0);
2919 &je (&label("proceed")); 2919 &je (&label("proceed"));
@@ -2969,6 +2969,6 @@ sub deckey()
2969 &jb (&label("permute")); 2969 &jb (&label("permute"));
2970 2970
2971 &xor ("eax","eax"); # return success 2971 &xor ("eax","eax"); # return success
2972&function_end("aes_set_decrypt_key_internal"); 2972&function_end("aes_set_decrypt_key_generic");
2973 2973
2974&asm_finish(); 2974&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
index 324c4a2be2..2c73627546 100755
--- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl
+++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
@@ -586,15 +586,15 @@ $code.=<<___;
586.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact 586.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
587___ 587___
588 588
589# void aes_encrypt_internal(const void *inp, void *out, const AES_KEY *key); 589# void aes_encrypt_generic(const void *inp, void *out, const AES_KEY *key);
590$code.=<<___; 590$code.=<<___;
591.globl aes_encrypt_internal 591.globl aes_encrypt_generic
592.type aes_encrypt_internal,\@function,3 592.type aes_encrypt_generic,\@function,3
593.align 16 593.align 16
594.globl asm_AES_encrypt 594.globl asm_AES_encrypt
595.hidden asm_AES_encrypt 595.hidden asm_AES_encrypt
596asm_AES_encrypt: 596asm_AES_encrypt:
597aes_encrypt_internal: 597aes_encrypt_generic:
598 _CET_ENDBR 598 _CET_ENDBR
599 push %rbx 599 push %rbx
600 push %rbp 600 push %rbp
@@ -655,7 +655,7 @@ aes_encrypt_internal:
655 lea 48(%rsi),%rsp 655 lea 48(%rsi),%rsp
656.Lenc_epilogue: 656.Lenc_epilogue:
657 ret 657 ret
658.size aes_encrypt_internal,.-aes_encrypt_internal 658.size aes_encrypt_generic,.-aes_encrypt_generic
659___ 659___
660 660
661#------------------------------------------------------------------# 661#------------------------------------------------------------------#
@@ -1188,15 +1188,15 @@ $code.=<<___;
1188.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact 1188.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
1189___ 1189___
1190 1190
1191# void aes_decrypt_internal(const void *inp, void *out, const AES_KEY *key); 1191# void aes_decrypt_generic(const void *inp, void *out, const AES_KEY *key);
1192$code.=<<___; 1192$code.=<<___;
1193.globl aes_decrypt_internal 1193.globl aes_decrypt_generic
1194.type aes_decrypt_internal,\@function,3 1194.type aes_decrypt_generic,\@function,3
1195.align 16 1195.align 16
1196.globl asm_AES_decrypt 1196.globl asm_AES_decrypt
1197.hidden asm_AES_decrypt 1197.hidden asm_AES_decrypt
1198asm_AES_decrypt: 1198asm_AES_decrypt:
1199aes_decrypt_internal: 1199aes_decrypt_generic:
1200 _CET_ENDBR 1200 _CET_ENDBR
1201 push %rbx 1201 push %rbx
1202 push %rbp 1202 push %rbp
@@ -1259,7 +1259,7 @@ aes_decrypt_internal:
1259 lea 48(%rsi),%rsp 1259 lea 48(%rsi),%rsp
1260.Ldec_epilogue: 1260.Ldec_epilogue:
1261 ret 1261 ret
1262.size aes_decrypt_internal,.-aes_decrypt_internal 1262.size aes_decrypt_generic,.-aes_decrypt_generic
1263___ 1263___
1264#------------------------------------------------------------------# 1264#------------------------------------------------------------------#
1265 1265
@@ -1290,13 +1290,13 @@ $code.=<<___;
1290___ 1290___
1291} 1291}
1292 1292
1293# int aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits, 1293# int aes_set_encrypt_key_generic(const unsigned char *userKey, const int bits,
1294# AES_KEY *key) 1294# AES_KEY *key)
1295$code.=<<___; 1295$code.=<<___;
1296.globl aes_set_encrypt_key_internal 1296.globl aes_set_encrypt_key_generic
1297.type aes_set_encrypt_key_internal,\@function,3 1297.type aes_set_encrypt_key_generic,\@function,3
1298.align 16 1298.align 16
1299aes_set_encrypt_key_internal: 1299aes_set_encrypt_key_generic:
1300 _CET_ENDBR 1300 _CET_ENDBR
1301 push %rbx 1301 push %rbx
1302 push %rbp 1302 push %rbp
@@ -1318,7 +1318,7 @@ aes_set_encrypt_key_internal:
1318 add \$56,%rsp 1318 add \$56,%rsp
1319.Lenc_key_epilogue: 1319.Lenc_key_epilogue:
1320 ret 1320 ret
1321.size aes_set_encrypt_key_internal,.-aes_set_encrypt_key_internal 1321.size aes_set_encrypt_key_generic,.-aes_set_encrypt_key_generic
1322 1322
1323.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent 1323.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
1324.align 16 1324.align 16
@@ -1562,13 +1562,13 @@ $code.=<<___;
1562___ 1562___
1563} 1563}
1564 1564
1565# int aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits, 1565# int aes_set_decrypt_key_generic(const unsigned char *userKey, const int bits,
1566# AES_KEY *key) 1566# AES_KEY *key)
1567$code.=<<___; 1567$code.=<<___;
1568.globl aes_set_decrypt_key_internal 1568.globl aes_set_decrypt_key_generic
1569.type aes_set_decrypt_key_internal,\@function,3 1569.type aes_set_decrypt_key_generic,\@function,3
1570.align 16 1570.align 16
1571aes_set_decrypt_key_internal: 1571aes_set_decrypt_key_generic:
1572 _CET_ENDBR 1572 _CET_ENDBR
1573 push %rbx 1573 push %rbx
1574 push %rbp 1574 push %rbp
@@ -1638,10 +1638,10 @@ $code.=<<___;
1638 add \$56,%rsp 1638 add \$56,%rsp
1639.Ldec_key_epilogue: 1639.Ldec_key_epilogue:
1640 ret 1640 ret
1641.size aes_set_decrypt_key_internal,.-aes_set_decrypt_key_internal 1641.size aes_set_decrypt_key_generic,.-aes_set_decrypt_key_generic
1642___ 1642___
1643 1643
1644# void aes_cbc_encrypt_internal(const void char *inp, unsigned char *out, 1644# void aes_cbc_encrypt_generic(const void char *inp, unsigned char *out,
1645# size_t length, const AES_KEY *key, unsigned char *ivp,const int enc); 1645# size_t length, const AES_KEY *key, unsigned char *ivp,const int enc);
1646{ 1646{
1647# stack frame layout 1647# stack frame layout
@@ -1659,15 +1659,15 @@ my $aes_key="80(%rsp)"; # copy of aes_key
1659my $mark="80+240(%rsp)"; # copy of aes_key->rounds 1659my $mark="80+240(%rsp)"; # copy of aes_key->rounds
1660 1660
1661$code.=<<___; 1661$code.=<<___;
1662.globl aes_cbc_encrypt_internal 1662.globl aes_cbc_encrypt_generic
1663.type aes_cbc_encrypt_internal,\@function,6 1663.type aes_cbc_encrypt_generic,\@function,6
1664.align 16 1664.align 16
1665.extern OPENSSL_ia32cap_P 1665.extern OPENSSL_ia32cap_P
1666.hidden OPENSSL_ia32cap_P 1666.hidden OPENSSL_ia32cap_P
1667.globl asm_AES_cbc_encrypt 1667.globl asm_AES_cbc_encrypt
1668.hidden asm_AES_cbc_encrypt 1668.hidden asm_AES_cbc_encrypt
1669asm_AES_cbc_encrypt: 1669asm_AES_cbc_encrypt:
1670aes_cbc_encrypt_internal: 1670aes_cbc_encrypt_generic:
1671 _CET_ENDBR 1671 _CET_ENDBR
1672 cmp \$0,%rdx # check length 1672 cmp \$0,%rdx # check length
1673 je .Lcbc_epilogue 1673 je .Lcbc_epilogue
@@ -2117,7 +2117,7 @@ aes_cbc_encrypt_internal:
2117 popfq 2117 popfq
2118.Lcbc_epilogue: 2118.Lcbc_epilogue:
2119 ret 2119 ret
2120.size aes_cbc_encrypt_internal,.-aes_cbc_encrypt_internal 2120.size aes_cbc_encrypt_generic,.-aes_cbc_encrypt_generic
2121___ 2121___
2122} 2122}
2123 2123
@@ -2782,45 +2782,45 @@ cbc_se_handler:
2782 2782
2783.section .pdata 2783.section .pdata
2784.align 4 2784.align 4
2785 .rva .LSEH_begin_aes_encrypt_internal 2785 .rva .LSEH_begin_aes_encrypt_generic
2786 .rva .LSEH_end_aes_encrypt_internal 2786 .rva .LSEH_end_aes_encrypt_generic
2787 .rva .LSEH_info_aes_encrypt_internal 2787 .rva .LSEH_info_aes_encrypt_generic
2788 2788
2789 .rva .LSEH_begin_aes_decrypt_internal 2789 .rva .LSEH_begin_aes_decrypt_generic
2790 .rva .LSEH_end_aes_decrypt_internal 2790 .rva .LSEH_end_aes_decrypt_generic
2791 .rva .LSEH_info_aes_decrypt_internal 2791 .rva .LSEH_info_aes_decrypt_generic
2792 2792
2793 .rva .LSEH_begin_aes_set_encrypt_key_internal 2793 .rva .LSEH_begin_aes_set_encrypt_key_generic
2794 .rva .LSEH_end_aes_set_encrypt_key_internal 2794 .rva .LSEH_end_aes_set_encrypt_key_generic
2795 .rva .LSEH_info_aes_set_encrypt_key_internal 2795 .rva .LSEH_info_aes_set_encrypt_key_generic
2796 2796
2797 .rva .LSEH_begin_aes_set_decrypt_key_internal 2797 .rva .LSEH_begin_aes_set_decrypt_key_generic
2798 .rva .LSEH_end_aes_set_decrypt_key_internal 2798 .rva .LSEH_end_aes_set_decrypt_key_generic
2799 .rva .LSEH_info_aes_set_decrypt_key_internal 2799 .rva .LSEH_info_aes_set_decrypt_key_generic
2800 2800
2801 .rva .LSEH_begin_aes_cbc_encrypt_internal 2801 .rva .LSEH_begin_aes_cbc_encrypt_generic
2802 .rva .LSEH_end_aes_cbc_encrypt_internal 2802 .rva .LSEH_end_aes_cbc_encrypt_generic
2803 .rva .LSEH_info_aes_cbc_encrypt_internal 2803 .rva .LSEH_info_aes_cbc_encrypt_generic
2804 2804
2805.section .xdata 2805.section .xdata
2806.align 8 2806.align 8
2807.LSEH_info_aes_encrypt_internal: 2807.LSEH_info_aes_encrypt_generic:
2808 .byte 9,0,0,0 2808 .byte 9,0,0,0
2809 .rva block_se_handler 2809 .rva block_se_handler
2810 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[] 2810 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
2811.LSEH_info_aes_decrypt_internal: 2811.LSEH_info_aes_decrypt_generic:
2812 .byte 9,0,0,0 2812 .byte 9,0,0,0
2813 .rva block_se_handler 2813 .rva block_se_handler
2814 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] 2814 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
2815.LSEH_info_aes_set_encrypt_key_internal: 2815.LSEH_info_aes_set_encrypt_key_generic:
2816 .byte 9,0,0,0 2816 .byte 9,0,0,0
2817 .rva key_se_handler 2817 .rva key_se_handler
2818 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] 2818 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
2819.LSEH_info_aes_set_decrypt_key_internal: 2819.LSEH_info_aes_set_decrypt_key_generic:
2820 .byte 9,0,0,0 2820 .byte 9,0,0,0
2821 .rva key_se_handler 2821 .rva key_se_handler
2822 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] 2822 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
2823.LSEH_info_aes_cbc_encrypt_internal: 2823.LSEH_info_aes_cbc_encrypt_generic:
2824 .byte 9,0,0,0 2824 .byte 9,0,0,0
2825 .rva cbc_se_handler 2825 .rva cbc_se_handler
2826___ 2826___
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
deleted file mode 100644
index c44a338114..0000000000
--- a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
+++ /dev/null
@@ -1,3123 +0,0 @@
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode] ###
5### bitsliced implementation for Intel Core 2 processors ###
6### requires support of SSE extensions up to SSSE3 ###
7### Author: Emilia Käsper and Peter Schwabe ###
8### Date: 2009-03-19 ###
9### Public domain ###
10### ###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12### further information. ###
13###################################################################
14#
15# September 2011.
16#
17# Started as transliteration to "perlasm" the original code has
18# undergone following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22# from 12.5KB to 2.2KB;
23# - above was possible thanks to mixcolumns() modification that
24# allowed to feed its output back to aesenc[last], this was
25# achieved at cost of two additional inter-registers moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement key setup subroutine, instead it
28# relies on conversion of "conventional" key schedule as returned
29# by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which allowed
31# to skip one shiftrows(), reduce bit-sliced key schedule and
32# speed-up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38# Emilia's this(*) difference
39#
40# Core 2 9.30 8.69 +7%
41# Nehalem(**) 7.63 6.98 +9%
42# Atom 17.1 17.4 -2%(***)
43#
44# (*) Comparison is not completely fair, because "this" is ECB,
45# i.e. no extra processing such as counter values calculation
46# and xor-ing input as in Emilia's CTR implementation is
47# performed. However, the CTR calculations stand for not more
48# than 1% of total time, so comparison is *rather* fair.
49#
50# (**) Results were collected on Westmere, which is considered to
51# be equivalent to Nehalem for this code.
52#
53# (***) Slowdown on Atom is rather strange per se, because original
54# implementation has a number of 9+-bytes instructions, which
55# are bad for Atom front-end, and which I eliminated completely.
56# In attempt to address deterioration sbox() was tested in FP
57# SIMD "domain" (movaps instead of movdqa, xorps instead of
58# pxor, etc.). While it resulted in nominal 4% improvement on
59# Atom, it hurted Westmere by more than 2x factor.
60#
61# As for key schedule conversion subroutine. Interface to OpenSSL
62# relies on per-invocation on-the-fly conversion. This naturally
63# has impact on performance, especially for short inputs. Conversion
64# time in CPU cycles and its ratio to CPU cycles spent in 8x block
65# function is:
66#
67# conversion conversion/8x block
68# Core 2 240 0.22
69# Nehalem 180 0.20
70# Atom 430 0.19
71#
72# The ratio values mean that 128-byte blocks will be processed
73# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74# etc. Then keep in mind that input sizes not divisible by 128 are
75# *effectively* slower, especially shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78# it's still faster than ["hyper-threading-safe" code path in]
79# aes-x86_64.pl on all lengths above 64 bytes...
80#
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2 9.83
87# Nehalem 7.74
88# Atom 19.0
89#
90# November 2011.
91#
92# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93# suboptimal, but XTS is meant to be used with larger blocks...
94#
95# <appro@openssl.org>
96
97$flavour = shift;
98$output = shift;
99if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100
101$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102
103$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106die "can't locate x86_64-xlate.pl";
107
108open OUT,"| \"$^X\" $xlate $flavour $output";
109*STDOUT=*OUT;
110
111my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
112my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
113my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
114
115{
116my ($key,$rounds,$const)=("%rax","%r10d","%r11");
117
118sub Sbox {
119# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
120# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
121my @b=@_[0..7];
122my @t=@_[8..11];
123my @s=@_[12..15];
124 &InBasisChange (@b);
125 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
126 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
127}
128
129sub InBasisChange {
130# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
131# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
132my @b=@_[0..7];
133$code.=<<___;
134 pxor @b[6], @b[5]
135 pxor @b[1], @b[2]
136 pxor @b[0], @b[3]
137 pxor @b[2], @b[6]
138 pxor @b[0], @b[5]
139
140 pxor @b[3], @b[6]
141 pxor @b[7], @b[3]
142 pxor @b[5], @b[7]
143 pxor @b[4], @b[3]
144 pxor @b[5], @b[4]
145 pxor @b[1], @b[3]
146
147 pxor @b[7], @b[2]
148 pxor @b[5], @b[1]
149___
150}
151
152sub OutBasisChange {
153# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
154# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
155my @b=@_[0..7];
156$code.=<<___;
157 pxor @b[6], @b[0]
158 pxor @b[4], @b[1]
159 pxor @b[0], @b[2]
160 pxor @b[6], @b[4]
161 pxor @b[1], @b[6]
162
163 pxor @b[5], @b[1]
164 pxor @b[3], @b[5]
165 pxor @b[7], @b[3]
166 pxor @b[5], @b[7]
167 pxor @b[5], @b[2]
168
169 pxor @b[7], @b[4]
170___
171}
172
173sub InvSbox {
174# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
175# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
176my @b=@_[0..7];
177my @t=@_[8..11];
178my @s=@_[12..15];
179 &InvInBasisChange (@b);
180 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
181 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
182}
183
184sub InvInBasisChange { # OutBasisChange in reverse
185my @b=@_[5,1,2,6,3,7,0,4];
186$code.=<<___
187 pxor @b[7], @b[4]
188
189 pxor @b[5], @b[7]
190 pxor @b[5], @b[2]
191 pxor @b[7], @b[3]
192 pxor @b[3], @b[5]
193 pxor @b[5], @b[1]
194
195 pxor @b[1], @b[6]
196 pxor @b[0], @b[2]
197 pxor @b[6], @b[4]
198 pxor @b[6], @b[0]
199 pxor @b[4], @b[1]
200___
201}
202
203sub InvOutBasisChange { # InBasisChange in reverse
204my @b=@_[2,5,7,3,6,1,0,4];
205$code.=<<___;
206 pxor @b[5], @b[1]
207 pxor @b[7], @b[2]
208
209 pxor @b[1], @b[3]
210 pxor @b[5], @b[4]
211 pxor @b[5], @b[7]
212 pxor @b[4], @b[3]
213 pxor @b[0], @b[5]
214 pxor @b[7], @b[3]
215 pxor @b[2], @b[6]
216 pxor @b[1], @b[2]
217 pxor @b[3], @b[6]
218
219 pxor @b[0], @b[3]
220 pxor @b[6], @b[5]
221___
222}
223
224sub Mul_GF4 {
225#;*************************************************************
226#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
227#;*************************************************************
228my ($x0,$x1,$y0,$y1,$t0)=@_;
229$code.=<<___;
230 movdqa $y0, $t0
231 pxor $y1, $t0
232 pand $x0, $t0
233 pxor $x1, $x0
234 pand $y0, $x1
235 pand $y1, $x0
236 pxor $x1, $x0
237 pxor $t0, $x1
238___
239}
240
241sub Mul_GF4_N { # not used, see next subroutine
242# multiply and scale by N
243my ($x0,$x1,$y0,$y1,$t0)=@_;
244$code.=<<___;
245 movdqa $y0, $t0
246 pxor $y1, $t0
247 pand $x0, $t0
248 pxor $x1, $x0
249 pand $y0, $x1
250 pand $y1, $x0
251 pxor $x0, $x1
252 pxor $t0, $x0
253___
254}
255
256sub Mul_GF4_N_GF4 {
257# interleaved Mul_GF4_N and Mul_GF4
258my ($x0,$x1,$y0,$y1,$t0,
259 $x2,$x3,$y2,$y3,$t1)=@_;
260$code.=<<___;
261 movdqa $y0, $t0
262 movdqa $y2, $t1
263 pxor $y1, $t0
264 pxor $y3, $t1
265 pand $x0, $t0
266 pand $x2, $t1
267 pxor $x1, $x0
268 pxor $x3, $x2
269 pand $y0, $x1
270 pand $y2, $x3
271 pand $y1, $x0
272 pand $y3, $x2
273 pxor $x0, $x1
274 pxor $x3, $x2
275 pxor $t0, $x0
276 pxor $t1, $x3
277___
278}
279sub Mul_GF16_2 {
280my @x=@_[0..7];
281my @y=@_[8..11];
282my @t=@_[12..15];
283$code.=<<___;
284 movdqa @x[0], @t[0]
285 movdqa @x[1], @t[1]
286___
287 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
288$code.=<<___;
289 pxor @x[2], @t[0]
290 pxor @x[3], @t[1]
291 pxor @y[2], @y[0]
292 pxor @y[3], @y[1]
293___
294 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
295 @x[2], @x[3], @y[2], @y[3], @t[2]);
296$code.=<<___;
297 pxor @t[0], @x[0]
298 pxor @t[0], @x[2]
299 pxor @t[1], @x[1]
300 pxor @t[1], @x[3]
301
302 movdqa @x[4], @t[0]
303 movdqa @x[5], @t[1]
304 pxor @x[6], @t[0]
305 pxor @x[7], @t[1]
306___
307 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
308 @x[6], @x[7], @y[2], @y[3], @t[2]);
309$code.=<<___;
310 pxor @y[2], @y[0]
311 pxor @y[3], @y[1]
312___
313 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
314$code.=<<___;
315 pxor @t[0], @x[4]
316 pxor @t[0], @x[6]
317 pxor @t[1], @x[5]
318 pxor @t[1], @x[7]
319___
320}
321sub Inv_GF256 {
322#;********************************************************************
323#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
324#;********************************************************************
325my @x=@_[0..7];
326my @t=@_[8..11];
327my @s=@_[12..15];
328# direct optimizations from hardware
329$code.=<<___;
330 movdqa @x[4], @t[3]
331 movdqa @x[5], @t[2]
332 movdqa @x[1], @t[1]
333 movdqa @x[7], @s[1]
334 movdqa @x[0], @s[0]
335
336 pxor @x[6], @t[3]
337 pxor @x[7], @t[2]
338 pxor @x[3], @t[1]
339 movdqa @t[3], @s[2]
340 pxor @x[6], @s[1]
341 movdqa @t[2], @t[0]
342 pxor @x[2], @s[0]
343 movdqa @t[3], @s[3]
344
345 por @t[1], @t[2]
346 por @s[0], @t[3]
347 pxor @t[0], @s[3]
348 pand @s[0], @s[2]
349 pxor @t[1], @s[0]
350 pand @t[1], @t[0]
351 pand @s[0], @s[3]
352 movdqa @x[3], @s[0]
353 pxor @x[2], @s[0]
354 pand @s[0], @s[1]
355 pxor @s[1], @t[3]
356 pxor @s[1], @t[2]
357 movdqa @x[4], @s[1]
358 movdqa @x[1], @s[0]
359 pxor @x[5], @s[1]
360 pxor @x[0], @s[0]
361 movdqa @s[1], @t[1]
362 pand @s[0], @s[1]
363 por @s[0], @t[1]
364 pxor @s[1], @t[0]
365 pxor @s[3], @t[3]
366 pxor @s[2], @t[2]
367 pxor @s[3], @t[1]
368 movdqa @x[7], @s[0]
369 pxor @s[2], @t[0]
370 movdqa @x[6], @s[1]
371 pxor @s[2], @t[1]
372 movdqa @x[5], @s[2]
373 pand @x[3], @s[0]
374 movdqa @x[4], @s[3]
375 pand @x[2], @s[1]
376 pand @x[1], @s[2]
377 por @x[0], @s[3]
378 pxor @s[0], @t[3]
379 pxor @s[1], @t[2]
380 pxor @s[2], @t[1]
381 pxor @s[3], @t[0]
382
383 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
384
385 # new smaller inversion
386
387 movdqa @t[3], @s[0]
388 pand @t[1], @t[3]
389 pxor @t[2], @s[0]
390
391 movdqa @t[0], @s[2]
392 movdqa @s[0], @s[3]
393 pxor @t[3], @s[2]
394 pand @s[2], @s[3]
395
396 movdqa @t[1], @s[1]
397 pxor @t[2], @s[3]
398 pxor @t[0], @s[1]
399
400 pxor @t[2], @t[3]
401
402 pand @t[3], @s[1]
403
404 movdqa @s[2], @t[2]
405 pxor @t[0], @s[1]
406
407 pxor @s[1], @t[2]
408 pxor @s[1], @t[1]
409
410 pand @t[0], @t[2]
411
412 pxor @t[2], @s[2]
413 pxor @t[2], @t[1]
414
415 pand @s[3], @s[2]
416
417 pxor @s[0], @s[2]
418___
419# output in s3, s2, s1, t1
420
421# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
422
423# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
424 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
425
426### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
427}
428
429# AES linear components
430
431sub ShiftRows {
432my @x=@_[0..7];
433my $mask=pop;
434$code.=<<___;
435 pxor 0x00($key),@x[0]
436 pxor 0x10($key),@x[1]
437 pshufb $mask,@x[0]
438 pxor 0x20($key),@x[2]
439 pshufb $mask,@x[1]
440 pxor 0x30($key),@x[3]
441 pshufb $mask,@x[2]
442 pxor 0x40($key),@x[4]
443 pshufb $mask,@x[3]
444 pxor 0x50($key),@x[5]
445 pshufb $mask,@x[4]
446 pxor 0x60($key),@x[6]
447 pshufb $mask,@x[5]
448 pxor 0x70($key),@x[7]
449 pshufb $mask,@x[6]
450 lea 0x80($key),$key
451 pshufb $mask,@x[7]
452___
453}
454
455sub MixColumns {
456# modified to emit output in order suitable for feeding back to aesenc[last]
457my @x=@_[0..7];
458my @t=@_[8..15];
459my $inv=@_[16]; # optional
460$code.=<<___;
461 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
462 pshufd \$0x93, @x[1], @t[1]
463 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
464 pshufd \$0x93, @x[2], @t[2]
465 pxor @t[1], @x[1]
466 pshufd \$0x93, @x[3], @t[3]
467 pxor @t[2], @x[2]
468 pshufd \$0x93, @x[4], @t[4]
469 pxor @t[3], @x[3]
470 pshufd \$0x93, @x[5], @t[5]
471 pxor @t[4], @x[4]
472 pshufd \$0x93, @x[6], @t[6]
473 pxor @t[5], @x[5]
474 pshufd \$0x93, @x[7], @t[7]
475 pxor @t[6], @x[6]
476 pxor @t[7], @x[7]
477
478 pxor @x[0], @t[1]
479 pxor @x[7], @t[0]
480 pxor @x[7], @t[1]
481 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
482 pxor @x[1], @t[2]
483 pshufd \$0x4E, @x[1], @x[1]
484 pxor @x[4], @t[5]
485 pxor @t[0], @x[0]
486 pxor @x[5], @t[6]
487 pxor @t[1], @x[1]
488 pxor @x[3], @t[4]
489 pshufd \$0x4E, @x[4], @t[0]
490 pxor @x[6], @t[7]
491 pshufd \$0x4E, @x[5], @t[1]
492 pxor @x[2], @t[3]
493 pshufd \$0x4E, @x[3], @x[4]
494 pxor @x[7], @t[3]
495 pshufd \$0x4E, @x[7], @x[5]
496 pxor @x[7], @t[4]
497 pshufd \$0x4E, @x[6], @x[3]
498 pxor @t[4], @t[0]
499 pshufd \$0x4E, @x[2], @x[6]
500 pxor @t[5], @t[1]
501___
502$code.=<<___ if (!$inv);
503 pxor @t[3], @x[4]
504 pxor @t[7], @x[5]
505 pxor @t[6], @x[3]
506 movdqa @t[0], @x[2]
507 pxor @t[2], @x[6]
508 movdqa @t[1], @x[7]
509___
510$code.=<<___ if ($inv);
511 pxor @x[4], @t[3]
512 pxor @t[7], @x[5]
513 pxor @x[3], @t[6]
514 movdqa @t[0], @x[3]
515 pxor @t[2], @x[6]
516 movdqa @t[6], @x[2]
517 movdqa @t[1], @x[7]
518 movdqa @x[6], @x[4]
519 movdqa @t[3], @x[6]
520___
521}
522
523sub InvMixColumns_orig {
524my @x=@_[0..7];
525my @t=@_[8..15];
526
527$code.=<<___;
528 # multiplication by 0x0e
529 pshufd \$0x93, @x[7], @t[7]
530 movdqa @x[2], @t[2]
531 pxor @x[5], @x[7] # 7 5
532 pxor @x[5], @x[2] # 2 5
533 pshufd \$0x93, @x[0], @t[0]
534 movdqa @x[5], @t[5]
535 pxor @x[0], @x[5] # 5 0 [1]
536 pxor @x[1], @x[0] # 0 1
537 pshufd \$0x93, @x[1], @t[1]
538 pxor @x[2], @x[1] # 1 25
539 pxor @x[6], @x[0] # 01 6 [2]
540 pxor @x[3], @x[1] # 125 3 [4]
541 pshufd \$0x93, @x[3], @t[3]
542 pxor @x[0], @x[2] # 25 016 [3]
543 pxor @x[7], @x[3] # 3 75
544 pxor @x[6], @x[7] # 75 6 [0]
545 pshufd \$0x93, @x[6], @t[6]
546 movdqa @x[4], @t[4]
547 pxor @x[4], @x[6] # 6 4
548 pxor @x[3], @x[4] # 4 375 [6]
549 pxor @x[7], @x[3] # 375 756=36
550 pxor @t[5], @x[6] # 64 5 [7]
551 pxor @t[2], @x[3] # 36 2
552 pxor @t[4], @x[3] # 362 4 [5]
553 pshufd \$0x93, @t[5], @t[5]
554___
555 my @y = @x[7,5,0,2,1,3,4,6];
556$code.=<<___;
557 # multiplication by 0x0b
558 pxor @y[0], @y[1]
559 pxor @t[0], @y[0]
560 pxor @t[1], @y[1]
561 pshufd \$0x93, @t[2], @t[2]
562 pxor @t[5], @y[0]
563 pxor @t[6], @y[1]
564 pxor @t[7], @y[0]
565 pshufd \$0x93, @t[4], @t[4]
566 pxor @t[6], @t[7] # clobber t[7]
567 pxor @y[0], @y[1]
568
569 pxor @t[0], @y[3]
570 pshufd \$0x93, @t[0], @t[0]
571 pxor @t[1], @y[2]
572 pxor @t[1], @y[4]
573 pxor @t[2], @y[2]
574 pshufd \$0x93, @t[1], @t[1]
575 pxor @t[2], @y[3]
576 pxor @t[2], @y[5]
577 pxor @t[7], @y[2]
578 pshufd \$0x93, @t[2], @t[2]
579 pxor @t[3], @y[3]
580 pxor @t[3], @y[6]
581 pxor @t[3], @y[4]
582 pshufd \$0x93, @t[3], @t[3]
583 pxor @t[4], @y[7]
584 pxor @t[4], @y[5]
585 pxor @t[7], @y[7]
586 pxor @t[5], @y[3]
587 pxor @t[4], @y[4]
588 pxor @t[5], @t[7] # clobber t[7] even more
589
590 pxor @t[7], @y[5]
591 pshufd \$0x93, @t[4], @t[4]
592 pxor @t[7], @y[6]
593 pxor @t[7], @y[4]
594
595 pxor @t[5], @t[7]
596 pshufd \$0x93, @t[5], @t[5]
597 pxor @t[6], @t[7] # restore t[7]
598
599 # multiplication by 0x0d
600 pxor @y[7], @y[4]
601 pxor @t[4], @y[7]
602 pshufd \$0x93, @t[6], @t[6]
603 pxor @t[0], @y[2]
604 pxor @t[5], @y[7]
605 pxor @t[2], @y[2]
606 pshufd \$0x93, @t[7], @t[7]
607
608 pxor @y[1], @y[3]
609 pxor @t[1], @y[1]
610 pxor @t[0], @y[0]
611 pxor @t[0], @y[3]
612 pxor @t[5], @y[1]
613 pxor @t[5], @y[0]
614 pxor @t[7], @y[1]
615 pshufd \$0x93, @t[0], @t[0]
616 pxor @t[6], @y[0]
617 pxor @y[1], @y[3]
618 pxor @t[1], @y[4]
619 pshufd \$0x93, @t[1], @t[1]
620
621 pxor @t[7], @y[7]
622 pxor @t[2], @y[4]
623 pxor @t[2], @y[5]
624 pshufd \$0x93, @t[2], @t[2]
625 pxor @t[6], @y[2]
626 pxor @t[3], @t[6] # clobber t[6]
627 pxor @y[7], @y[4]
628 pxor @t[6], @y[3]
629
630 pxor @t[6], @y[6]
631 pxor @t[5], @y[5]
632 pxor @t[4], @y[6]
633 pshufd \$0x93, @t[4], @t[4]
634 pxor @t[6], @y[5]
635 pxor @t[7], @y[6]
636 pxor @t[3], @t[6] # restore t[6]
637
638 pshufd \$0x93, @t[5], @t[5]
639 pshufd \$0x93, @t[6], @t[6]
640 pshufd \$0x93, @t[7], @t[7]
641 pshufd \$0x93, @t[3], @t[3]
642
643 # multiplication by 0x09
644 pxor @y[1], @y[4]
645 pxor @y[1], @t[1] # t[1]=y[1]
646 pxor @t[5], @t[0] # clobber t[0]
647 pxor @t[5], @t[1]
648 pxor @t[0], @y[3]
649 pxor @y[0], @t[0] # t[0]=y[0]
650 pxor @t[6], @t[1]
651 pxor @t[7], @t[6] # clobber t[6]
652 pxor @t[1], @y[4]
653 pxor @t[4], @y[7]
654 pxor @y[4], @t[4] # t[4]=y[4]
655 pxor @t[3], @y[6]
656 pxor @y[3], @t[3] # t[3]=y[3]
657 pxor @t[2], @y[5]
658 pxor @y[2], @t[2] # t[2]=y[2]
659 pxor @t[7], @t[3]
660 pxor @y[5], @t[5] # t[5]=y[5]
661 pxor @t[6], @t[2]
662 pxor @t[6], @t[5]
663 pxor @y[6], @t[6] # t[6]=y[6]
664 pxor @y[7], @t[7] # t[7]=y[7]
665
666 movdqa @t[0],@XMM[0]
667 movdqa @t[1],@XMM[1]
668 movdqa @t[2],@XMM[2]
669 movdqa @t[3],@XMM[3]
670 movdqa @t[4],@XMM[4]
671 movdqa @t[5],@XMM[5]
672 movdqa @t[6],@XMM[6]
673 movdqa @t[7],@XMM[7]
674___
675}
676
677sub InvMixColumns {
678my @x=@_[0..7];
679my @t=@_[8..15];
680
681# Thanks to Jussi Kivilinna for providing pointer to
682#
683# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
684# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
685# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
686# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
687
688$code.=<<___;
689 # multiplication by 0x05-0x00-0x04-0x00
690 pshufd \$0x4E, @x[0], @t[0]
691 pshufd \$0x4E, @x[6], @t[6]
692 pxor @x[0], @t[0]
693 pshufd \$0x4E, @x[7], @t[7]
694 pxor @x[6], @t[6]
695 pshufd \$0x4E, @x[1], @t[1]
696 pxor @x[7], @t[7]
697 pshufd \$0x4E, @x[2], @t[2]
698 pxor @x[1], @t[1]
699 pshufd \$0x4E, @x[3], @t[3]
700 pxor @x[2], @t[2]
701 pxor @t[6], @x[0]
702 pxor @t[6], @x[1]
703 pshufd \$0x4E, @x[4], @t[4]
704 pxor @x[3], @t[3]
705 pxor @t[0], @x[2]
706 pxor @t[1], @x[3]
707 pshufd \$0x4E, @x[5], @t[5]
708 pxor @x[4], @t[4]
709 pxor @t[7], @x[1]
710 pxor @t[2], @x[4]
711 pxor @x[5], @t[5]
712
713 pxor @t[7], @x[2]
714 pxor @t[6], @x[3]
715 pxor @t[6], @x[4]
716 pxor @t[3], @x[5]
717 pxor @t[4], @x[6]
718 pxor @t[7], @x[4]
719 pxor @t[7], @x[5]
720 pxor @t[5], @x[7]
721___
722 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
723}
724
725sub aesenc { # not used
726my @b=@_[0..7];
727my @t=@_[8..15];
728$code.=<<___;
729 movdqa 0x30($const),@t[0] # .LSR
730___
731 &ShiftRows (@b,@t[0]);
732 &Sbox (@b,@t);
733 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
734}
735
736sub aesenclast { # not used
737my @b=@_[0..7];
738my @t=@_[8..15];
739$code.=<<___;
740 movdqa 0x40($const),@t[0] # .LSRM0
741___
742 &ShiftRows (@b,@t[0]);
743 &Sbox (@b,@t);
744$code.=<<___
745 pxor 0x00($key),@b[0]
746 pxor 0x10($key),@b[1]
747 pxor 0x20($key),@b[4]
748 pxor 0x30($key),@b[6]
749 pxor 0x40($key),@b[3]
750 pxor 0x50($key),@b[7]
751 pxor 0x60($key),@b[2]
752 pxor 0x70($key),@b[5]
753___
754}
755
756sub swapmove {
757my ($a,$b,$n,$mask,$t)=@_;
758$code.=<<___;
759 movdqa $b,$t
760 psrlq \$$n,$b
761 pxor $a,$b
762 pand $mask,$b
763 pxor $b,$a
764 psllq \$$n,$b
765 pxor $t,$b
766___
767}
768sub swapmove2x {
769my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
770$code.=<<___;
771 movdqa $b0,$t0
772 psrlq \$$n,$b0
773 movdqa $b1,$t1
774 psrlq \$$n,$b1
775 pxor $a0,$b0
776 pxor $a1,$b1
777 pand $mask,$b0
778 pand $mask,$b1
779 pxor $b0,$a0
780 psllq \$$n,$b0
781 pxor $b1,$a1
782 psllq \$$n,$b1
783 pxor $t0,$b0
784 pxor $t1,$b1
785___
786}
787
788sub bitslice {
789my @x=reverse(@_[0..7]);
790my ($t0,$t1,$t2,$t3)=@_[8..11];
791$code.=<<___;
792 movdqa 0x00($const),$t0 # .LBS0
793 movdqa 0x10($const),$t1 # .LBS1
794___
795 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
796 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
797$code.=<<___;
798 movdqa 0x20($const),$t0 # .LBS2
799___
800 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
801 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
802
803 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
804 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
805}
806
807$code.=<<___;
808.text
809
810.extern asm_AES_encrypt
811.extern asm_AES_decrypt
812
813.type _bsaes_encrypt8,\@abi-omnipotent
814.align 64
815_bsaes_encrypt8:
816 _CET_ENDBR
817 lea .LBS0(%rip), $const # constants table
818
819 movdqa ($key), @XMM[9] # round 0 key
820 lea 0x10($key), $key
821 movdqa 0x50($const), @XMM[8] # .LM0SR
822 pxor @XMM[9], @XMM[0] # xor with round0 key
823 pxor @XMM[9], @XMM[1]
824 pshufb @XMM[8], @XMM[0]
825 pxor @XMM[9], @XMM[2]
826 pshufb @XMM[8], @XMM[1]
827 pxor @XMM[9], @XMM[3]
828 pshufb @XMM[8], @XMM[2]
829 pxor @XMM[9], @XMM[4]
830 pshufb @XMM[8], @XMM[3]
831 pxor @XMM[9], @XMM[5]
832 pshufb @XMM[8], @XMM[4]
833 pxor @XMM[9], @XMM[6]
834 pshufb @XMM[8], @XMM[5]
835 pxor @XMM[9], @XMM[7]
836 pshufb @XMM[8], @XMM[6]
837 pshufb @XMM[8], @XMM[7]
838_bsaes_encrypt8_bitslice:
839___
840 &bitslice (@XMM[0..7, 8..11]);
841$code.=<<___;
842 dec $rounds
843 jmp .Lenc_sbox
844.align 16
845.Lenc_loop:
846___
847 &ShiftRows (@XMM[0..7, 8]);
848$code.=".Lenc_sbox:\n";
849 &Sbox (@XMM[0..7, 8..15]);
850$code.=<<___;
851 dec $rounds
852 jl .Lenc_done
853___
854 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
855$code.=<<___;
856 movdqa 0x30($const), @XMM[8] # .LSR
857 jnz .Lenc_loop
858 movdqa 0x40($const), @XMM[8] # .LSRM0
859 jmp .Lenc_loop
860.align 16
861.Lenc_done:
862___
863 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
864 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
865$code.=<<___;
866 movdqa ($key), @XMM[8] # last round key
867 pxor @XMM[8], @XMM[4]
868 pxor @XMM[8], @XMM[6]
869 pxor @XMM[8], @XMM[3]
870 pxor @XMM[8], @XMM[7]
871 pxor @XMM[8], @XMM[2]
872 pxor @XMM[8], @XMM[5]
873 pxor @XMM[8], @XMM[0]
874 pxor @XMM[8], @XMM[1]
875 ret
876.size _bsaes_encrypt8,.-_bsaes_encrypt8
877
878.type _bsaes_decrypt8,\@abi-omnipotent
879.align 64
880_bsaes_decrypt8:
881 _CET_ENDBR
882 lea .LBS0(%rip), $const # constants table
883
884 movdqa ($key), @XMM[9] # round 0 key
885 lea 0x10($key), $key
886 movdqa -0x30($const), @XMM[8] # .LM0ISR
887 pxor @XMM[9], @XMM[0] # xor with round0 key
888 pxor @XMM[9], @XMM[1]
889 pshufb @XMM[8], @XMM[0]
890 pxor @XMM[9], @XMM[2]
891 pshufb @XMM[8], @XMM[1]
892 pxor @XMM[9], @XMM[3]
893 pshufb @XMM[8], @XMM[2]
894 pxor @XMM[9], @XMM[4]
895 pshufb @XMM[8], @XMM[3]
896 pxor @XMM[9], @XMM[5]
897 pshufb @XMM[8], @XMM[4]
898 pxor @XMM[9], @XMM[6]
899 pshufb @XMM[8], @XMM[5]
900 pxor @XMM[9], @XMM[7]
901 pshufb @XMM[8], @XMM[6]
902 pshufb @XMM[8], @XMM[7]
903___
904 &bitslice (@XMM[0..7, 8..11]);
905$code.=<<___;
906 dec $rounds
907 jmp .Ldec_sbox
908.align 16
909.Ldec_loop:
910___
911 &ShiftRows (@XMM[0..7, 8]);
912$code.=".Ldec_sbox:\n";
913 &InvSbox (@XMM[0..7, 8..15]);
914$code.=<<___;
915 dec $rounds
916 jl .Ldec_done
917___
918 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
919$code.=<<___;
920 movdqa -0x10($const), @XMM[8] # .LISR
921 jnz .Ldec_loop
922 movdqa -0x20($const), @XMM[8] # .LISRM0
923 jmp .Ldec_loop
924.align 16
925.Ldec_done:
926___
927 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
928$code.=<<___;
929 movdqa ($key), @XMM[8] # last round key
930 pxor @XMM[8], @XMM[6]
931 pxor @XMM[8], @XMM[4]
932 pxor @XMM[8], @XMM[2]
933 pxor @XMM[8], @XMM[7]
934 pxor @XMM[8], @XMM[3]
935 pxor @XMM[8], @XMM[5]
936 pxor @XMM[8], @XMM[0]
937 pxor @XMM[8], @XMM[1]
938 ret
939.size _bsaes_decrypt8,.-_bsaes_decrypt8
940___
941}
942{
943my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
944
945sub bitslice_key {
946my @x=reverse(@_[0..7]);
947my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
948
949 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
950$code.=<<___;
951 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
952 movdqa @x[0], @x[2]
953 movdqa @x[1], @x[3]
954___
955 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
956
957 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
958$code.=<<___;
959 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
960 movdqa @x[0], @x[4]
961 movdqa @x[2], @x[6]
962 movdqa @x[1], @x[5]
963 movdqa @x[3], @x[7]
964___
965 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
966 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
967}
968
969$code.=<<___;
970.type _bsaes_key_convert,\@abi-omnipotent
971.align 16
972_bsaes_key_convert:
973 _CET_ENDBR
974 lea .Lmasks(%rip), $const
975 movdqu ($inp), %xmm7 # load round 0 key
976 lea 0x10($inp), $inp
977 movdqa 0x00($const), %xmm0 # 0x01...
978 movdqa 0x10($const), %xmm1 # 0x02...
979 movdqa 0x20($const), %xmm2 # 0x04...
980 movdqa 0x30($const), %xmm3 # 0x08...
981 movdqa 0x40($const), %xmm4 # .LM0
982 pcmpeqd %xmm5, %xmm5 # .LNOT
983
984 movdqu ($inp), %xmm6 # load round 1 key
985 movdqa %xmm7, ($out) # save round 0 key
986 lea 0x10($out), $out
987 dec $rounds
988 jmp .Lkey_loop
989.align 16
990.Lkey_loop:
991 pshufb %xmm4, %xmm6 # .LM0
992
993 movdqa %xmm0, %xmm8
994 movdqa %xmm1, %xmm9
995
996 pand %xmm6, %xmm8
997 pand %xmm6, %xmm9
998 movdqa %xmm2, %xmm10
999 pcmpeqb %xmm0, %xmm8
1000 psllq \$4, %xmm0 # 0x10...
1001 movdqa %xmm3, %xmm11
1002 pcmpeqb %xmm1, %xmm9
1003 psllq \$4, %xmm1 # 0x20...
1004
1005 pand %xmm6, %xmm10
1006 pand %xmm6, %xmm11
1007 movdqa %xmm0, %xmm12
1008 pcmpeqb %xmm2, %xmm10
1009 psllq \$4, %xmm2 # 0x40...
1010 movdqa %xmm1, %xmm13
1011 pcmpeqb %xmm3, %xmm11
1012 psllq \$4, %xmm3 # 0x80...
1013
1014 movdqa %xmm2, %xmm14
1015 movdqa %xmm3, %xmm15
1016 pxor %xmm5, %xmm8 # "pnot"
1017 pxor %xmm5, %xmm9
1018
1019 pand %xmm6, %xmm12
1020 pand %xmm6, %xmm13
1021 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1022 pcmpeqb %xmm0, %xmm12
1023 psrlq \$4, %xmm0 # 0x01...
1024 movdqa %xmm9, 0x10($out)
1025 pcmpeqb %xmm1, %xmm13
1026 psrlq \$4, %xmm1 # 0x02...
1027 lea 0x10($inp), $inp
1028
1029 pand %xmm6, %xmm14
1030 pand %xmm6, %xmm15
1031 movdqa %xmm10, 0x20($out)
1032 pcmpeqb %xmm2, %xmm14
1033 psrlq \$4, %xmm2 # 0x04...
1034 movdqa %xmm11, 0x30($out)
1035 pcmpeqb %xmm3, %xmm15
1036 psrlq \$4, %xmm3 # 0x08...
1037 movdqu ($inp), %xmm6 # load next round key
1038
1039 pxor %xmm5, %xmm13 # "pnot"
1040 pxor %xmm5, %xmm14
1041 movdqa %xmm12, 0x40($out)
1042 movdqa %xmm13, 0x50($out)
1043 movdqa %xmm14, 0x60($out)
1044 movdqa %xmm15, 0x70($out)
1045 lea 0x80($out),$out
1046 dec $rounds
1047 jnz .Lkey_loop
1048
1049 movdqa 0x50($const), %xmm7 # .L63
1050 #movdqa %xmm6, ($out) # don't save last round key
1051 ret
1052.size _bsaes_key_convert,.-_bsaes_key_convert
1053___
1054}
1055
1056if (0 && !$win64) { # following four functions are unsupported interface
1057 # used for benchmarking...
1058$code.=<<___;
1059.globl bsaes_enc_key_convert
1060.type bsaes_enc_key_convert,\@function,2
1061.align 16
1062bsaes_enc_key_convert:
1063 _CET_ENDBR
1064 mov 240($inp),%r10d # pass rounds
1065 mov $inp,%rcx # pass key
1066 mov $out,%rax # pass key schedule
1067 call _bsaes_key_convert
1068 pxor %xmm6,%xmm7 # fix up last round key
1069 movdqa %xmm7,(%rax) # save last round key
1070 ret
1071.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1072
1073.globl bsaes_encrypt_128
1074.type bsaes_encrypt_128,\@function,4
1075.align 16
1076bsaes_encrypt_128:
1077.Lenc128_loop:
1078 _CET_ENDBR
1079 movdqu 0x00($inp), @XMM[0] # load input
1080 movdqu 0x10($inp), @XMM[1]
1081 movdqu 0x20($inp), @XMM[2]
1082 movdqu 0x30($inp), @XMM[3]
1083 movdqu 0x40($inp), @XMM[4]
1084 movdqu 0x50($inp), @XMM[5]
1085 movdqu 0x60($inp), @XMM[6]
1086 movdqu 0x70($inp), @XMM[7]
1087 mov $key, %rax # pass the $key
1088 lea 0x80($inp), $inp
1089 mov \$10,%r10d
1090
1091 call _bsaes_encrypt8
1092
1093 movdqu @XMM[0], 0x00($out) # write output
1094 movdqu @XMM[1], 0x10($out)
1095 movdqu @XMM[4], 0x20($out)
1096 movdqu @XMM[6], 0x30($out)
1097 movdqu @XMM[3], 0x40($out)
1098 movdqu @XMM[7], 0x50($out)
1099 movdqu @XMM[2], 0x60($out)
1100 movdqu @XMM[5], 0x70($out)
1101 lea 0x80($out), $out
1102 sub \$0x80,$len
1103 ja .Lenc128_loop
1104 ret
1105.size bsaes_encrypt_128,.-bsaes_encrypt_128
1106
1107.globl bsaes_dec_key_convert
1108.type bsaes_dec_key_convert,\@function,2
1109.align 16
1110bsaes_dec_key_convert:
1111 _CET_ENDBR
1112 mov 240($inp),%r10d # pass rounds
1113 mov $inp,%rcx # pass key
1114 mov $out,%rax # pass key schedule
1115 call _bsaes_key_convert
1116 pxor ($out),%xmm7 # fix up round 0 key
1117 movdqa %xmm6,(%rax) # save last round key
1118 movdqa %xmm7,($out)
1119 ret
1120.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1121
1122.globl bsaes_decrypt_128
1123.type bsaes_decrypt_128,\@function,4
1124.align 16
1125bsaes_decrypt_128:
1126 _CET_ENDBR
1127.Ldec128_loop:
1128 movdqu 0x00($inp), @XMM[0] # load input
1129 movdqu 0x10($inp), @XMM[1]
1130 movdqu 0x20($inp), @XMM[2]
1131 movdqu 0x30($inp), @XMM[3]
1132 movdqu 0x40($inp), @XMM[4]
1133 movdqu 0x50($inp), @XMM[5]
1134 movdqu 0x60($inp), @XMM[6]
1135 movdqu 0x70($inp), @XMM[7]
1136 mov $key, %rax # pass the $key
1137 lea 0x80($inp), $inp
1138 mov \$10,%r10d
1139
1140 call _bsaes_decrypt8
1141
1142 movdqu @XMM[0], 0x00($out) # write output
1143 movdqu @XMM[1], 0x10($out)
1144 movdqu @XMM[6], 0x20($out)
1145 movdqu @XMM[4], 0x30($out)
1146 movdqu @XMM[2], 0x40($out)
1147 movdqu @XMM[7], 0x50($out)
1148 movdqu @XMM[3], 0x60($out)
1149 movdqu @XMM[5], 0x70($out)
1150 lea 0x80($out), $out
1151 sub \$0x80,$len
1152 ja .Ldec128_loop
1153 ret
1154.size bsaes_decrypt_128,.-bsaes_decrypt_128
1155___
1156}
1157{
1158######################################################################
1159#
1160# OpenSSL interface
1161#
1162my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1163 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1164my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1165
1166if ($ecb) {
1167$code.=<<___;
# void bsaes_ecb_encrypt_blocks(const unsigned char *in, unsigned char *out,
#	size_t blocks, const AES_KEY *key);
#
# Bit-sliced AES-ECB encryption of `blocks` 16-byte blocks.  For 8 or more
# blocks the key schedule is converted to bit-sliced form on the stack and
# data is processed 8 blocks per iteration through _bsaes_encrypt8; a tail
# of 1-7 blocks reuses _bsaes_encrypt8 and simply discards the surplus
# outputs.  Fewer than 8 blocks are encrypted one at a time with the
# table-based asm_AES_encrypt.  The stack copy of the key schedule is
# wiped before returning.
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	_CET_ENDBR
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	# Win64 ABI: xmm6-xmm15 are non-volatile, preserve them
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	# _bsaes_encrypt8 returns blocks in the order 0,1,4,6,3,7,2,5
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	# 1-7 leftover blocks: load what remains, dispatch on count
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	# fewer than 8 blocks total: table-based AES, one block per iteration
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_enc_bzero		# was "jb": inverted condition left key
					# material unwiped (and never terminated
					# when %rbp==%rsp); "ja" matches the
					# cbc/ctr wipe loops below

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

# void bsaes_ecb_decrypt_blocks(const unsigned char *in, unsigned char *out,
#	size_t blocks, const AES_KEY *key);
#
# Bit-sliced AES-ECB decryption, mirror image of bsaes_ecb_encrypt_blocks:
# >= 8 blocks go through _bsaes_decrypt8 eight at a time (the converted key
# schedule additionally swaps the roles of the first and last round keys),
# 1-7 leftover blocks reuse _bsaes_decrypt8 with surplus outputs dropped,
# and < 8 blocks total fall back to the table-based asm_AES_decrypt.  The
# stack copy of the key schedule is wiped before returning.
.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	_CET_ENDBR
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	# Win64 ABI: xmm6-xmm15 are non-volatile, preserve them
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	# _bsaes_decrypt8 returns blocks in the order 0,1,6,4,2,7,3,5
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	# 1-7 leftover blocks: load what remains, dispatch on count
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	# fewer than 8 blocks total: table-based AES, one block per iteration
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_dec_bzero		# was "jb": inverted condition left key
					# material unwiped (and never terminated
					# when %rbp==%rsp); "ja" matches the
					# cbc/ctr wipe loops below

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1572___
1573}
$code.=<<___;
.extern	asm_AES_cbc_encrypt
# void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
#	size_t len, const AES_KEY *key, unsigned char ivec[16], int enc);
#
# Only CBC *decryption* of at least 128 bytes (8 blocks) is bit-sliced;
# encryption (enc != 0) and short inputs tail-call the table-based
# asm_AES_cbc_encrypt before any registers are saved.  Works 8 blocks per
# iteration through _bsaes_decrypt8, XORing each plaintext with the
# previous ciphertext block, with a 1-7 block tail and a single-block
# fallback via asm_AES_decrypt.  The updated IV is written back to ivec
# and the stack key schedule is wiped on exit.
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
bsaes_cbc_encrypt:
	_CET_ENDBR
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt	# encryption: delegate (tail call)
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt	# < 8 blocks: delegate (tail call)

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	# Win64 ABI: 5th arg comes from the stack; xmm6-xmm15 are non-volatile
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	# CBC chaining: re-load ciphertext and XOR into the decrypted
	# blocks (output order of _bsaes_decrypt8 is 0,1,6,4,2,7,3,5);
	# the last ciphertext block becomes the next IV
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len
	jz	.Lcbc_dec_done

	# 1-7 leftover blocks: load what remains, dispatch on count
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	# single block: table-based AES into a stack buffer, then chain
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

# void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
#	size_t blocks, const AES_KEY *key, const unsigned char ivec[16]);
#
# CTR mode with a 32-bit big-endian counter in the last 4 bytes of ivec.
# For >= 8 blocks, eight counter values are materialized per iteration
# (via the .LADD1..8 constants, with byte order pre-flipped so the
# increments are plain paddd), encrypted through the bit-sliced path and
# XORed with the input.  Fewer than 8 blocks fall back to one-at-a-time
# asm_AES_encrypt with an explicit bswap/inc/bswap of the counter word.
# The stack key schedule is wiped on exit.
.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	_CET_ENDBR
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	# Win64 ABI: 5th arg comes from the stack; xmm6-xmm15 are non-volatile
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	# note: paddd below does not touch EFLAGS, so the jnz after it
	# still tests this subtraction
	sub	\$8,$len
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	# 1-7 leftover blocks: keystream already in @XMM, XOR as needed
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	# < 8 blocks: encrypt the counter block at 0x20(%rbp) into
	# 0x30(%rbp) with the table-based AES, one block per iteration
	# (%rsp == %rbp here, no key schedule was carved off the stack)
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
# Aliases for the xmm registers used as XTS tweak scratch below.
my ($twmask,$twres,$twtmp)=@XMM[13..15];
# Strip a trailing "d" so $arg6 names the full 64-bit register: the 6th
# argument is now a pointer (iv), not a 32-bit value.
# NOTE(review): assumes $arg6 was previously a 32-bit "...d" register
# name — confirm against the argument setup earlier in this file.
$arg6=~s/d$//;

2105$code.=<<___;
2106.globl bsaes_xts_encrypt
2107.type bsaes_xts_encrypt,\@abi-omnipotent
2108.align 16
2109bsaes_xts_encrypt:
2110 _CET_ENDBR
2111 mov %rsp, %rax
2112.Lxts_enc_prologue:
2113 push %rbp
2114 push %rbx
2115 push %r12
2116 push %r13
2117 push %r14
2118 push %r15
2119 lea -0x48(%rsp), %rsp
2120___
2121$code.=<<___ if ($win64);
2122 mov 0xa0(%rsp),$arg5 # pull key2
2123 mov 0xa8(%rsp),$arg6 # pull ivp
2124 lea -0xa0(%rsp), %rsp
2125 movaps %xmm6, 0x40(%rsp)
2126 movaps %xmm7, 0x50(%rsp)
2127 movaps %xmm8, 0x60(%rsp)
2128 movaps %xmm9, 0x70(%rsp)
2129 movaps %xmm10, 0x80(%rsp)
2130 movaps %xmm11, 0x90(%rsp)
2131 movaps %xmm12, 0xa0(%rsp)
2132 movaps %xmm13, 0xb0(%rsp)
2133 movaps %xmm14, 0xc0(%rsp)
2134 movaps %xmm15, 0xd0(%rsp)
2135.Lxts_enc_body:
2136___
2137$code.=<<___;
2138 mov %rsp, %rbp # backup %rsp
2139 mov $arg1, $inp # backup arguments
2140 mov $arg2, $out
2141 mov $arg3, $len
2142 mov $arg4, $key
2143
2144 lea ($arg6), $arg1
2145 lea 0x20(%rbp), $arg2
2146 lea ($arg5), $arg3
2147 call asm_AES_encrypt # generate initial tweak
2148
2149 mov 240($key), %eax # rounds
2150 mov $len, %rbx # backup $len
2151
2152 mov %eax, %edx # rounds
2153 shl \$7, %rax # 128 bytes per inner round key
2154 sub \$`128-32`, %rax # size of bit-sliced key schedule
2155 sub %rax, %rsp
2156
2157 mov %rsp, %rax # pass key schedule
2158 mov $key, %rcx # pass key
2159 mov %edx, %r10d # pass rounds
2160 call _bsaes_key_convert
2161 pxor %xmm6, %xmm7 # fix up last round key
2162 movdqa %xmm7, (%rax) # save last round key
2163
2164 and \$-16, $len
2165 sub \$0x80, %rsp # place for tweak[8]
2166 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2167
2168 pxor $twtmp, $twtmp
2169 movdqa .Lxts_magic(%rip), $twmask
2170 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2171
2172 sub \$0x80, $len
2173 jc .Lxts_enc_short
2174 jmp .Lxts_enc_loop
2175
2176.align 16
2177.Lxts_enc_loop:
2178___
2179 for ($i=0;$i<7;$i++) {
2180 $code.=<<___;
2181 pshufd \$0x13, $twtmp, $twres
2182 pxor $twtmp, $twtmp
2183 movdqa @XMM[7], @XMM[$i]
2184 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2185 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2186 pand $twmask, $twres # isolate carry and residue
2187 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2188 pxor $twres, @XMM[7]
2189___
2190 $code.=<<___ if ($i>=1);
2191 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2192___
2193 $code.=<<___ if ($i>=2);
2194 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2195___
2196 }
2197$code.=<<___;
2198 movdqu 0x60($inp), @XMM[8+6]
2199 pxor @XMM[8+5], @XMM[5]
2200 movdqu 0x70($inp), @XMM[8+7]
2201 lea 0x80($inp), $inp
2202 movdqa @XMM[7], 0x70(%rsp)
2203 pxor @XMM[8+6], @XMM[6]
2204 lea 0x80(%rsp), %rax # pass key schedule
2205 pxor @XMM[8+7], @XMM[7]
2206 mov %edx, %r10d # pass rounds
2207
2208 call _bsaes_encrypt8
2209
2210 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2211 pxor 0x10(%rsp), @XMM[1]
2212 movdqu @XMM[0], 0x00($out) # write output
2213 pxor 0x20(%rsp), @XMM[4]
2214 movdqu @XMM[1], 0x10($out)
2215 pxor 0x30(%rsp), @XMM[6]
2216 movdqu @XMM[4], 0x20($out)
2217 pxor 0x40(%rsp), @XMM[3]
2218 movdqu @XMM[6], 0x30($out)
2219 pxor 0x50(%rsp), @XMM[7]
2220 movdqu @XMM[3], 0x40($out)
2221 pxor 0x60(%rsp), @XMM[2]
2222 movdqu @XMM[7], 0x50($out)
2223 pxor 0x70(%rsp), @XMM[5]
2224 movdqu @XMM[2], 0x60($out)
2225 movdqu @XMM[5], 0x70($out)
2226 lea 0x80($out), $out
2227
2228 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2229 pxor $twtmp, $twtmp
2230 movdqa .Lxts_magic(%rip), $twmask
2231 pcmpgtd @XMM[7], $twtmp
2232 pshufd \$0x13, $twtmp, $twres
2233 pxor $twtmp, $twtmp
2234 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2235 pand $twmask, $twres # isolate carry and residue
2236 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2237 pxor $twres, @XMM[7]
2238
2239 sub \$0x80,$len
2240 jnc .Lxts_enc_loop
2241
2242.Lxts_enc_short:
2243 add \$0x80, $len
2244 jz .Lxts_enc_done
2245___
2246 for ($i=0;$i<7;$i++) {
2247 $code.=<<___;
2248 pshufd \$0x13, $twtmp, $twres
2249 pxor $twtmp, $twtmp
2250 movdqa @XMM[7], @XMM[$i]
2251 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2252 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2253 pand $twmask, $twres # isolate carry and residue
2254 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2255 pxor $twres, @XMM[7]
2256___
2257 $code.=<<___ if ($i>=1);
2258 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2259 cmp \$`0x10*$i`,$len
2260 je .Lxts_enc_$i
2261___
2262 $code.=<<___ if ($i>=2);
2263 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2264___
2265 }
2266$code.=<<___;
2267 movdqu 0x60($inp), @XMM[8+6]
2268 pxor @XMM[8+5], @XMM[5]
2269 movdqa @XMM[7], 0x70(%rsp)
2270 lea 0x70($inp), $inp
2271 pxor @XMM[8+6], @XMM[6]
2272 lea 0x80(%rsp), %rax # pass key schedule
2273 mov %edx, %r10d # pass rounds
2274
2275 call _bsaes_encrypt8
2276
2277 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2278 pxor 0x10(%rsp), @XMM[1]
2279 movdqu @XMM[0], 0x00($out) # write output
2280 pxor 0x20(%rsp), @XMM[4]
2281 movdqu @XMM[1], 0x10($out)
2282 pxor 0x30(%rsp), @XMM[6]
2283 movdqu @XMM[4], 0x20($out)
2284 pxor 0x40(%rsp), @XMM[3]
2285 movdqu @XMM[6], 0x30($out)
2286 pxor 0x50(%rsp), @XMM[7]
2287 movdqu @XMM[3], 0x40($out)
2288 pxor 0x60(%rsp), @XMM[2]
2289 movdqu @XMM[7], 0x50($out)
2290 movdqu @XMM[2], 0x60($out)
2291 lea 0x70($out), $out
2292
2293 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2294 jmp .Lxts_enc_done
2295.align 16
2296.Lxts_enc_6:
2297 pxor @XMM[8+4], @XMM[4]
2298 lea 0x60($inp), $inp
2299 pxor @XMM[8+5], @XMM[5]
2300 lea 0x80(%rsp), %rax # pass key schedule
2301 mov %edx, %r10d # pass rounds
2302
2303 call _bsaes_encrypt8
2304
2305 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2306 pxor 0x10(%rsp), @XMM[1]
2307 movdqu @XMM[0], 0x00($out) # write output
2308 pxor 0x20(%rsp), @XMM[4]
2309 movdqu @XMM[1], 0x10($out)
2310 pxor 0x30(%rsp), @XMM[6]
2311 movdqu @XMM[4], 0x20($out)
2312 pxor 0x40(%rsp), @XMM[3]
2313 movdqu @XMM[6], 0x30($out)
2314 pxor 0x50(%rsp), @XMM[7]
2315 movdqu @XMM[3], 0x40($out)
2316 movdqu @XMM[7], 0x50($out)
2317 lea 0x60($out), $out
2318
2319 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2320 jmp .Lxts_enc_done
2321.align 16
2322.Lxts_enc_5:
2323 pxor @XMM[8+3], @XMM[3]
2324 lea 0x50($inp), $inp
2325 pxor @XMM[8+4], @XMM[4]
2326 lea 0x80(%rsp), %rax # pass key schedule
2327 mov %edx, %r10d # pass rounds
2328
2329 call _bsaes_encrypt8
2330
2331 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2332 pxor 0x10(%rsp), @XMM[1]
2333 movdqu @XMM[0], 0x00($out) # write output
2334 pxor 0x20(%rsp), @XMM[4]
2335 movdqu @XMM[1], 0x10($out)
2336 pxor 0x30(%rsp), @XMM[6]
2337 movdqu @XMM[4], 0x20($out)
2338 pxor 0x40(%rsp), @XMM[3]
2339 movdqu @XMM[6], 0x30($out)
2340 movdqu @XMM[3], 0x40($out)
2341 lea 0x50($out), $out
2342
2343 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2344 jmp .Lxts_enc_done
2345.align 16
2346.Lxts_enc_4:
2347 pxor @XMM[8+2], @XMM[2]
2348 lea 0x40($inp), $inp
2349 pxor @XMM[8+3], @XMM[3]
2350 lea 0x80(%rsp), %rax # pass key schedule
2351 mov %edx, %r10d # pass rounds
2352
2353 call _bsaes_encrypt8
2354
2355 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2356 pxor 0x10(%rsp), @XMM[1]
2357 movdqu @XMM[0], 0x00($out) # write output
2358 pxor 0x20(%rsp), @XMM[4]
2359 movdqu @XMM[1], 0x10($out)
2360 pxor 0x30(%rsp), @XMM[6]
2361 movdqu @XMM[4], 0x20($out)
2362 movdqu @XMM[6], 0x30($out)
2363 lea 0x40($out), $out
2364
2365 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2366 jmp .Lxts_enc_done
2367.align 16
2368.Lxts_enc_3:
2369 pxor @XMM[8+1], @XMM[1]
2370 lea 0x30($inp), $inp
2371 pxor @XMM[8+2], @XMM[2]
2372 lea 0x80(%rsp), %rax # pass key schedule
2373 mov %edx, %r10d # pass rounds
2374
2375 call _bsaes_encrypt8
2376
2377 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2378 pxor 0x10(%rsp), @XMM[1]
2379 movdqu @XMM[0], 0x00($out) # write output
2380 pxor 0x20(%rsp), @XMM[4]
2381 movdqu @XMM[1], 0x10($out)
2382 movdqu @XMM[4], 0x20($out)
2383 lea 0x30($out), $out
2384
2385 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2386 jmp .Lxts_enc_done
2387.align 16
2388.Lxts_enc_2:
2389 pxor @XMM[8+0], @XMM[0]
2390 lea 0x20($inp), $inp
2391 pxor @XMM[8+1], @XMM[1]
2392 lea 0x80(%rsp), %rax # pass key schedule
2393 mov %edx, %r10d # pass rounds
2394
2395 call _bsaes_encrypt8
2396
2397 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2398 pxor 0x10(%rsp), @XMM[1]
2399 movdqu @XMM[0], 0x00($out) # write output
2400 movdqu @XMM[1], 0x10($out)
2401 lea 0x20($out), $out
2402
2403 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2404 jmp .Lxts_enc_done
2405.align 16
2406.Lxts_enc_1:
2407 pxor @XMM[0], @XMM[8]
2408 lea 0x10($inp), $inp
2409 movdqa @XMM[8], 0x20(%rbp)
2410 lea 0x20(%rbp), $arg1
2411 lea 0x20(%rbp), $arg2
2412 lea ($key), $arg3
2413 call asm_AES_encrypt # doesn't touch %xmm
2414 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2415 #pxor @XMM[8], @XMM[0]
2416 #lea 0x80(%rsp), %rax # pass key schedule
2417 #mov %edx, %r10d # pass rounds
2418 #call _bsaes_encrypt8
2419 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2420 movdqu @XMM[0], 0x00($out) # write output
2421 lea 0x10($out), $out
2422
2423 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2424
2425.Lxts_enc_done:
2426 and \$15, %ebx
2427 jz .Lxts_enc_ret
2428 mov $out, %rdx
2429
2430.Lxts_enc_steal:
2431 movzb ($inp), %eax
2432 movzb -16(%rdx), %ecx
2433 lea 1($inp), $inp
2434 mov %al, -16(%rdx)
2435 mov %cl, 0(%rdx)
2436 lea 1(%rdx), %rdx
2437 sub \$1,%ebx
2438 jnz .Lxts_enc_steal
2439
2440 movdqu -16($out), @XMM[0]
2441 lea 0x20(%rbp), $arg1
2442 pxor @XMM[7], @XMM[0]
2443 lea 0x20(%rbp), $arg2
2444 movdqa @XMM[0], 0x20(%rbp)
2445 lea ($key), $arg3
2446 call asm_AES_encrypt # doesn't touch %xmm
2447 pxor 0x20(%rbp), @XMM[7]
2448 movdqu @XMM[7], -16($out)
2449
2450.Lxts_enc_ret:
2451 lea (%rsp), %rax
2452 pxor %xmm0, %xmm0
2453.Lxts_enc_bzero: # wipe key schedule [if any]
2454 movdqa %xmm0, 0x00(%rax)
2455 movdqa %xmm0, 0x10(%rax)
2456 lea 0x20(%rax), %rax
2457 cmp %rax, %rbp
2458 ja .Lxts_enc_bzero
2459
2460 lea (%rbp),%rsp # restore %rsp
2461___
2462$code.=<<___ if ($win64);
2463 movaps 0x40(%rbp), %xmm6
2464 movaps 0x50(%rbp), %xmm7
2465 movaps 0x60(%rbp), %xmm8
2466 movaps 0x70(%rbp), %xmm9
2467 movaps 0x80(%rbp), %xmm10
2468 movaps 0x90(%rbp), %xmm11
2469 movaps 0xa0(%rbp), %xmm12
2470 movaps 0xb0(%rbp), %xmm13
2471 movaps 0xc0(%rbp), %xmm14
2472 movaps 0xd0(%rbp), %xmm15
2473 lea 0xa0(%rbp), %rsp
2474___
2475$code.=<<___;
2476 mov 0x48(%rsp), %r15
2477 mov 0x50(%rsp), %r14
2478 mov 0x58(%rsp), %r13
2479 mov 0x60(%rsp), %r12
2480 mov 0x68(%rsp), %rbx
2481 mov 0x70(%rsp), %rax
2482 lea 0x78(%rsp), %rsp
2483 mov %rax, %rbp
2484.Lxts_enc_epilogue:
2485 ret
2486.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2487
# void bsaes_xts_decrypt(inp, out, len, key1, key2, iv[16])
# Bit-sliced AES-XTS decryption; main loop handles 8 blocks per pass.
# (Signature inferred from argument use: key2 and ivp are pulled from the
# stack on Win64 below — confirm against the public header.)
2488.globl bsaes_xts_decrypt
2489.type bsaes_xts_decrypt,\@abi-omnipotent
2490.align 16
2491bsaes_xts_decrypt:
2492 _CET_ENDBR
2493 mov %rsp, %rax
2494.Lxts_dec_prologue:
2495 push %rbp
2496 push %rbx
2497 push %r12
2498 push %r13
2499 push %r14
2500 push %r15
2501 lea -0x48(%rsp), %rsp
2502___
$code.=<<___ if ($win64);   # Win64: extra args live on the stack; xmm6-15 are callee-saved
2503$code.=<<___ if ($win64);
2504 mov 0xa0(%rsp),$arg5 # pull key2
2505 mov 0xa8(%rsp),$arg6 # pull ivp
2506 lea -0xa0(%rsp), %rsp
2507 movaps %xmm6, 0x40(%rsp)
2508 movaps %xmm7, 0x50(%rsp)
2509 movaps %xmm8, 0x60(%rsp)
2510 movaps %xmm9, 0x70(%rsp)
2511 movaps %xmm10, 0x80(%rsp)
2512 movaps %xmm11, 0x90(%rsp)
2513 movaps %xmm12, 0xa0(%rsp)
2514 movaps %xmm13, 0xb0(%rsp)
2515 movaps %xmm14, 0xc0(%rsp)
2516 movaps %xmm15, 0xd0(%rsp)
2517.Lxts_dec_body:
2518___
2519$code.=<<___;
2520 mov %rsp, %rbp # backup %rsp
2521 mov $arg1, $inp # backup arguments
2522 mov $arg2, $out
2523 mov $arg3, $len
2524 mov $arg4, $key
2525
# Initial tweak = AES-encrypt(iv) under key2 (note: ENcrypt, even on the
# decrypt side — the tweak stream is always produced by encryption).
2526 lea ($arg6), $arg1
2527 lea 0x20(%rbp), $arg2
2528 lea ($arg5), $arg3
2529 call asm_AES_encrypt # generate initial tweak
2530
2531 mov 240($key), %eax # rounds
2532 mov $len, %rbx # backup $len
2533
# Carve out stack space for the bit-sliced key schedule and convert key1.
2534 mov %eax, %edx # rounds
2535 shl \$7, %rax # 128 bytes per inner round key
2536 sub \$`128-32`, %rax # size of bit-sliced key schedule
2537 sub %rax, %rsp
2538
2539 mov %rsp, %rax # pass key schedule
2540 mov $key, %rcx # pass key
2541 mov %edx, %r10d # pass rounds
2542 call _bsaes_key_convert
2543 pxor (%rsp), %xmm7 # fix up round 0 key
2544 movdqa %xmm6, (%rax) # save last round key
2545 movdqa %xmm7, (%rsp)
2546
# If there is a trailing partial block, hold back one full block for
# ciphertext stealing (see .Lxts_dec_done).
2547 xor %eax, %eax # if ($len%16) len-=16;
2548 and \$-16, $len
2549 test \$15, %ebx
2550 setnz %al
2551 shl \$4, %rax
2552 sub %rax, $len
2553
2554 sub \$0x80, %rsp # place for tweak[8]
2555 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2556
2557 pxor $twtmp, $twtmp
2558 movdqa .Lxts_magic(%rip), $twmask
2559 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2560
2561 sub \$0x80, $len
2562 jc .Lxts_dec_short
2563 jmp .Lxts_dec_loop
2564
2565.align 16
2566.Lxts_dec_loop:
2567___
# Emit code for tweaks 1..7: each tweak is the previous one doubled in
# GF(2^128) (reduction constant 0x87, from .Lxts_magic), with each tweak
# saved to the stack and XORed into the matching input block.
2568 for ($i=0;$i<7;$i++) {
2569 $code.=<<___;
2570 pshufd \$0x13, $twtmp, $twres
2571 pxor $twtmp, $twtmp
2572 movdqa @XMM[7], @XMM[$i]
2573 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2574 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2575 pand $twmask, $twres # isolate carry and residue
2576 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2577 pxor $twres, @XMM[7]
2578___
2579 $code.=<<___ if ($i>=1);
2580 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2581___
2582 $code.=<<___ if ($i>=2);
2583 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2584___
2585 }
2586$code.=<<___;
2587 movdqu 0x60($inp), @XMM[8+6]
2588 pxor @XMM[8+5], @XMM[5]
2589 movdqu 0x70($inp), @XMM[8+7]
2590 lea 0x80($inp), $inp
2591 movdqa @XMM[7], 0x70(%rsp)
2592 pxor @XMM[8+6], @XMM[6]
2593 lea 0x80(%rsp), %rax # pass key schedule
2594 pxor @XMM[8+7], @XMM[7]
2595 mov %edx, %r10d # pass rounds
2596
2597 call _bsaes_decrypt8
2598
# Output registers come back in bit-sliced order 0,1,6,4,2,7,3,5.
2599 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2600 pxor 0x10(%rsp), @XMM[1]
2601 movdqu @XMM[0], 0x00($out) # write output
2602 pxor 0x20(%rsp), @XMM[6]
2603 movdqu @XMM[1], 0x10($out)
2604 pxor 0x30(%rsp), @XMM[4]
2605 movdqu @XMM[6], 0x20($out)
2606 pxor 0x40(%rsp), @XMM[2]
2607 movdqu @XMM[4], 0x30($out)
2608 pxor 0x50(%rsp), @XMM[7]
2609 movdqu @XMM[2], 0x40($out)
2610 pxor 0x60(%rsp), @XMM[3]
2611 movdqu @XMM[7], 0x50($out)
2612 pxor 0x70(%rsp), @XMM[5]
2613 movdqu @XMM[3], 0x60($out)
2614 movdqu @XMM[5], 0x70($out)
2615 lea 0x80($out), $out
2616
2617 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2618 pxor $twtmp, $twtmp
2619 movdqa .Lxts_magic(%rip), $twmask
2620 pcmpgtd @XMM[7], $twtmp
2621 pshufd \$0x13, $twtmp, $twres
2622 pxor $twtmp, $twtmp
2623 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2624 pand $twmask, $twres # isolate carry and residue
2625 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2626 pxor $twres, @XMM[7]
2627
2628 sub \$0x80,$len
2629 jnc .Lxts_dec_loop
2630
# Fewer than 8 blocks remain: restore the residual length and dispatch
# on the exact block count via the cmp/je ladder emitted below.
2631.Lxts_dec_short:
2632 add \$0x80, $len
2633 jz .Lxts_dec_done
2634___
2635 for ($i=0;$i<7;$i++) {
2636 $code.=<<___;
2637 pshufd \$0x13, $twtmp, $twres
2638 pxor $twtmp, $twtmp
2639 movdqa @XMM[7], @XMM[$i]
2640 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2641 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2642 pand $twmask, $twres # isolate carry and residue
2643 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2644 pxor $twres, @XMM[7]
2645___
2646 $code.=<<___ if ($i>=1);
2647 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2648 cmp \$`0x10*$i`,$len
2649 je .Lxts_dec_$i
2650___
2651 $code.=<<___ if ($i>=2);
2652 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2653___
2654 }
2655$code.=<<___;
2656 movdqu 0x60($inp), @XMM[8+6]
2657 pxor @XMM[8+5], @XMM[5]
2658 movdqa @XMM[7], 0x70(%rsp)
2659 lea 0x70($inp), $inp
2660 pxor @XMM[8+6], @XMM[6]
2661 lea 0x80(%rsp), %rax # pass key schedule
2662 mov %edx, %r10d # pass rounds
2663
2664 call _bsaes_decrypt8
2665
2666 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2667 pxor 0x10(%rsp), @XMM[1]
2668 movdqu @XMM[0], 0x00($out) # write output
2669 pxor 0x20(%rsp), @XMM[6]
2670 movdqu @XMM[1], 0x10($out)
2671 pxor 0x30(%rsp), @XMM[4]
2672 movdqu @XMM[6], 0x20($out)
2673 pxor 0x40(%rsp), @XMM[2]
2674 movdqu @XMM[4], 0x30($out)
2675 pxor 0x50(%rsp), @XMM[7]
2676 movdqu @XMM[2], 0x40($out)
2677 pxor 0x60(%rsp), @XMM[3]
2678 movdqu @XMM[7], 0x50($out)
2679 movdqu @XMM[3], 0x60($out)
2680 lea 0x70($out), $out
2681
2682 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2683 jmp .Lxts_dec_done
2684.align 16
2685.Lxts_dec_6:
2686 pxor @XMM[8+4], @XMM[4]
2687 lea 0x60($inp), $inp
2688 pxor @XMM[8+5], @XMM[5]
2689 lea 0x80(%rsp), %rax # pass key schedule
2690 mov %edx, %r10d # pass rounds
2691
2692 call _bsaes_decrypt8
2693
2694 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2695 pxor 0x10(%rsp), @XMM[1]
2696 movdqu @XMM[0], 0x00($out) # write output
2697 pxor 0x20(%rsp), @XMM[6]
2698 movdqu @XMM[1], 0x10($out)
2699 pxor 0x30(%rsp), @XMM[4]
2700 movdqu @XMM[6], 0x20($out)
2701 pxor 0x40(%rsp), @XMM[2]
2702 movdqu @XMM[4], 0x30($out)
2703 pxor 0x50(%rsp), @XMM[7]
2704 movdqu @XMM[2], 0x40($out)
2705 movdqu @XMM[7], 0x50($out)
2706 lea 0x60($out), $out
2707
2708 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2709 jmp .Lxts_dec_done
2710.align 16
2711.Lxts_dec_5:
2712 pxor @XMM[8+3], @XMM[3]
2713 lea 0x50($inp), $inp
2714 pxor @XMM[8+4], @XMM[4]
2715 lea 0x80(%rsp), %rax # pass key schedule
2716 mov %edx, %r10d # pass rounds
2717
2718 call _bsaes_decrypt8
2719
2720 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2721 pxor 0x10(%rsp), @XMM[1]
2722 movdqu @XMM[0], 0x00($out) # write output
2723 pxor 0x20(%rsp), @XMM[6]
2724 movdqu @XMM[1], 0x10($out)
2725 pxor 0x30(%rsp), @XMM[4]
2726 movdqu @XMM[6], 0x20($out)
2727 pxor 0x40(%rsp), @XMM[2]
2728 movdqu @XMM[4], 0x30($out)
2729 movdqu @XMM[2], 0x40($out)
2730 lea 0x50($out), $out
2731
2732 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2733 jmp .Lxts_dec_done
2734.align 16
2735.Lxts_dec_4:
2736 pxor @XMM[8+2], @XMM[2]
2737 lea 0x40($inp), $inp
2738 pxor @XMM[8+3], @XMM[3]
2739 lea 0x80(%rsp), %rax # pass key schedule
2740 mov %edx, %r10d # pass rounds
2741
2742 call _bsaes_decrypt8
2743
2744 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2745 pxor 0x10(%rsp), @XMM[1]
2746 movdqu @XMM[0], 0x00($out) # write output
2747 pxor 0x20(%rsp), @XMM[6]
2748 movdqu @XMM[1], 0x10($out)
2749 pxor 0x30(%rsp), @XMM[4]
2750 movdqu @XMM[6], 0x20($out)
2751 movdqu @XMM[4], 0x30($out)
2752 lea 0x40($out), $out
2753
2754 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2755 jmp .Lxts_dec_done
2756.align 16
2757.Lxts_dec_3:
2758 pxor @XMM[8+1], @XMM[1]
2759 lea 0x30($inp), $inp
2760 pxor @XMM[8+2], @XMM[2]
2761 lea 0x80(%rsp), %rax # pass key schedule
2762 mov %edx, %r10d # pass rounds
2763
2764 call _bsaes_decrypt8
2765
2766 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2767 pxor 0x10(%rsp), @XMM[1]
2768 movdqu @XMM[0], 0x00($out) # write output
2769 pxor 0x20(%rsp), @XMM[6]
2770 movdqu @XMM[1], 0x10($out)
2771 movdqu @XMM[6], 0x20($out)
2772 lea 0x30($out), $out
2773
2774 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2775 jmp .Lxts_dec_done
2776.align 16
2777.Lxts_dec_2:
2778 pxor @XMM[8+0], @XMM[0]
2779 lea 0x20($inp), $inp
2780 pxor @XMM[8+1], @XMM[1]
2781 lea 0x80(%rsp), %rax # pass key schedule
2782 mov %edx, %r10d # pass rounds
2783
2784 call _bsaes_decrypt8
2785
2786 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2787 pxor 0x10(%rsp), @XMM[1]
2788 movdqu @XMM[0], 0x00($out) # write output
2789 movdqu @XMM[1], 0x10($out)
2790 lea 0x20($out), $out
2791
2792 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2793 jmp .Lxts_dec_done
2794.align 16
# Single block: not worth the bit-sliced path, use the scalar AES routine.
2795.Lxts_dec_1:
2796 pxor @XMM[0], @XMM[8]
2797 lea 0x10($inp), $inp
2798 movdqa @XMM[8], 0x20(%rbp)
2799 lea 0x20(%rbp), $arg1
2800 lea 0x20(%rbp), $arg2
2801 lea ($key), $arg3
2802 call asm_AES_decrypt # doesn't touch %xmm
2803 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2804 #pxor @XMM[8], @XMM[0]
2805 #lea 0x80(%rsp), %rax # pass key schedule
2806 #mov %edx, %r10d # pass rounds
2807 #call _bsaes_decrypt8
2808 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2809 movdqu @XMM[0], 0x00($out) # write output
2810 lea 0x10($out), $out
2811
2812 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2813
# Ciphertext stealing for a trailing partial block: the last complete
# block is decrypted with the NEXT tweak (computed below into XMM7 while
# the current tweak is preserved in XMM6), tail bytes are then swapped
# with the partial block, and the combined block is decrypted with the
# preserved tweak.
2814.Lxts_dec_done:
2815 and \$15, %ebx
2816 jz .Lxts_dec_ret
2817
2818 pxor $twtmp, $twtmp
2819 movdqa .Lxts_magic(%rip), $twmask
2820 pcmpgtd @XMM[7], $twtmp
2821 pshufd \$0x13, $twtmp, $twres
2822 movdqa @XMM[7], @XMM[6]
2823 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2824 pand $twmask, $twres # isolate carry and residue
2825 movdqu ($inp), @XMM[0]
2826 pxor $twres, @XMM[7]
2827
2828 lea 0x20(%rbp), $arg1
2829 pxor @XMM[7], @XMM[0]
2830 lea 0x20(%rbp), $arg2
2831 movdqa @XMM[0], 0x20(%rbp)
2832 lea ($key), $arg3
2833 call asm_AES_decrypt # doesn't touch %xmm
2834 pxor 0x20(%rbp), @XMM[7]
2835 mov $out, %rdx
2836 movdqu @XMM[7], ($out)
2837
2838.Lxts_dec_steal:
2839 movzb 16($inp), %eax
2840 movzb (%rdx), %ecx
2841 lea 1($inp), $inp
2842 mov %al, (%rdx)
2843 mov %cl, 16(%rdx)
2844 lea 1(%rdx), %rdx
2845 sub \$1,%ebx
2846 jnz .Lxts_dec_steal
2847
2848 movdqu ($out), @XMM[0]
2849 lea 0x20(%rbp), $arg1
2850 pxor @XMM[6], @XMM[0]
2851 lea 0x20(%rbp), $arg2
2852 movdqa @XMM[0], 0x20(%rbp)
2853 lea ($key), $arg3
2854 call asm_AES_decrypt # doesn't touch %xmm
2855 pxor 0x20(%rbp), @XMM[6]
2856 movdqu @XMM[6], ($out)
2857
# Scrub the stack copy of the key schedule and tweaks before returning.
2858.Lxts_dec_ret:
2859 lea (%rsp), %rax
2860 pxor %xmm0, %xmm0
2861.Lxts_dec_bzero: # wipe key schedule [if any]
2862 movdqa %xmm0, 0x00(%rax)
2863 movdqa %xmm0, 0x10(%rax)
2864 lea 0x20(%rax), %rax
2865 cmp %rax, %rbp
2866 ja .Lxts_dec_bzero
2867
2868 lea (%rbp),%rsp # restore %rsp
2869___
2870$code.=<<___ if ($win64);
2871 movaps 0x40(%rbp), %xmm6
2872 movaps 0x50(%rbp), %xmm7
2873 movaps 0x60(%rbp), %xmm8
2874 movaps 0x70(%rbp), %xmm9
2875 movaps 0x80(%rbp), %xmm10
2876 movaps 0x90(%rbp), %xmm11
2877 movaps 0xa0(%rbp), %xmm12
2878 movaps 0xb0(%rbp), %xmm13
2879 movaps 0xc0(%rbp), %xmm14
2880 movaps 0xd0(%rbp), %xmm15
2881 lea 0xa0(%rbp), %rsp
2882___
2883$code.=<<___;
2884 mov 0x48(%rsp), %r15
2885 mov 0x50(%rsp), %r14
2886 mov 0x58(%rsp), %r13
2887 mov 0x60(%rsp), %r12
2888 mov 0x68(%rsp), %rbx
2889 mov 0x70(%rsp), %rax
2890 lea 0x78(%rsp), %rsp
2891 mov %rax, %rbp
2892.Lxts_dec_epilogue:
2893 ret
2894.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2895___
2896}
# Read-only constant tables used by the bit-sliced AES code: shuffle masks
# for (Inv)ShiftRows and bit-slicing, CTR increment vectors, and the XTS
# GF(2^128) reduction constant.  Emitted into .rodata; the trailing .text
# switches back for subsequent code.
2897$code.=<<___;
2898.section .rodata
2899.type _bsaes_const,\@object
2900.align 64
2901_bsaes_const:
2902.LM0ISR: # InvShiftRows constants
2903 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2904.LISRM0:
2905 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2906.LISR:
2907 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2908.LBS0: # bit-slice constants
2909 .quad 0x5555555555555555, 0x5555555555555555
2910.LBS1:
2911 .quad 0x3333333333333333, 0x3333333333333333
2912.LBS2:
2913 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2914.LSR: # shiftrows constants
2915 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2916.LSRM0:
2917 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2918.LM0SR:
2919 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2920.LSWPUP: # byte-swap upper dword
2921 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2922.LSWPUPM0SR:
2923 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2924.LADD1: # counter increment constants
2925 .quad 0x0000000000000000, 0x0000000100000000
2926.LADD2:
2927 .quad 0x0000000000000000, 0x0000000200000000
2928.LADD3:
2929 .quad 0x0000000000000000, 0x0000000300000000
2930.LADD4:
2931 .quad 0x0000000000000000, 0x0000000400000000
2932.LADD5:
2933 .quad 0x0000000000000000, 0x0000000500000000
2934.LADD6:
2935 .quad 0x0000000000000000, 0x0000000600000000
2936.LADD7:
2937 .quad 0x0000000000000000, 0x0000000700000000
2938.LADD8:
2939 .quad 0x0000000000000000, 0x0000000800000000
2940.Lxts_magic: # GF(2^128) reduction constant 0x87 for XTS tweak doubling
2941 .long 0x87,0,1,0
2942.Lmasks:
2943 .quad 0x0101010101010101, 0x0101010101010101
2944 .quad 0x0202020202020202, 0x0202020202020202
2945 .quad 0x0404040404040404, 0x0404040404040404
2946 .quad 0x0808080808080808, 0x0808080808080808
2947.LM0:
2948 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2949.L63: # AES affine constant 0x63, replicated
2950 .quad 0x6363636363636363, 0x6363636363636363
2951.align 64
2952.size _bsaes_const,.-_bsaes_const
2953.text
2954___
2955
2956# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2957# CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64-only structured exception handling support.  se_handler unwinds a
# frame belonging to one of the bsaes entry points: it restores the saved
# xmm6-xmm15 and the non-volatile GPRs from the stack frame into the
# CONTEXT record, then defers to RtlVirtualUnwind.  HandlerData[0]/[1]
# hold the body/epilogue labels bracketing the region the handler covers.
2958if ($win64) {
2959$rec="%rcx";
2960$frame="%rdx";
2961$context="%r8";
2962$disp="%r9";
2963
2964$code.=<<___;
2965.extern __imp_RtlVirtualUnwind
2966.type se_handler,\@abi-omnipotent
2967.align 16
2968se_handler:
2969 _CET_ENDBR
2970 push %rsi
2971 push %rdi
2972 push %rbx
2973 push %rbp
2974 push %r12
2975 push %r13
2976 push %r14
2977 push %r15
2978 pushfq
2979 sub \$64,%rsp
2980
2981 mov 120($context),%rax # pull context->Rax
2982 mov 248($context),%rbx # pull context->Rip
2983
2984 mov 8($disp),%rsi # disp->ImageBase
2985 mov 56($disp),%r11 # disp->HandlerData
2986
# If the fault is before the body label, no registers were saved yet.
2987 mov 0(%r11),%r10d # HandlerData[0]
2988 lea (%rsi,%r10),%r10 # prologue label
2989 cmp %r10,%rbx # context->Rip<prologue label
2990 jb .Lin_prologue
2991
2992 mov 152($context),%rax # pull context->Rsp
2993
# Likewise if the fault is at/after the epilogue label.
2994 mov 4(%r11),%r10d # HandlerData[1]
2995 lea (%rsi,%r10),%r10 # epilogue label
2996 cmp %r10,%rbx # context->Rip>=epilogue label
2997 jae .Lin_prologue
2998
2999 mov 160($context),%rax # pull context->Rbp
3000
3001 lea 0x40(%rax),%rsi # %xmm save area
3002 lea 512($context),%rdi # &context.Xmm6
3003 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
3004 .long 0xa548f3fc # cld; rep movsq
3005 lea 0xa0(%rax),%rax # adjust stack pointer
3006
3007 mov 0x70(%rax),%rbp
3008 mov 0x68(%rax),%rbx
3009 mov 0x60(%rax),%r12
3010 mov 0x58(%rax),%r13
3011 mov 0x50(%rax),%r14
3012 mov 0x48(%rax),%r15
3013 lea 0x78(%rax),%rax # adjust stack pointer
3014 mov %rbx,144($context) # restore context->Rbx
3015 mov %rbp,160($context) # restore context->Rbp
3016 mov %r12,216($context) # restore context->R12
3017 mov %r13,224($context) # restore context->R13
3018 mov %r14,232($context) # restore context->R14
3019 mov %r15,240($context) # restore context->R15
3020
3021.Lin_prologue:
3022 mov %rax,152($context) # restore context->Rsp
3023
3024 mov 40($disp),%rdi # disp->ContextRecord
3025 mov $context,%rsi # context
3026 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3027 .long 0xa548f3fc # cld; rep movsq
3028
3029 mov $disp,%rsi
3030 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3031 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3032 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3033 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3034 mov 40(%rsi),%r10 # disp->ContextRecord
3035 lea 56(%rsi),%r11 # &disp->HandlerData
3036 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3037 mov %r10,32(%rsp) # arg5
3038 mov %r11,40(%rsp) # arg6
3039 mov %r12,48(%rsp) # arg7
3040 mov %rcx,56(%rsp) # arg8, (NULL)
3041 call *__imp_RtlVirtualUnwind(%rip)
3042
3043 mov \$1,%eax # ExceptionContinueSearch
3044 add \$64,%rsp
3045 popfq
3046 pop %r15
3047 pop %r14
3048 pop %r13
3049 pop %r12
3050 pop %rbp
3051 pop %rbx
3052 pop %rdi
3053 pop %rsi
3054 ret
3055.size se_handler,.-se_handler
3056
3057.section .pdata
3058.align 4
3059___
$code.=<<___ if ($ecb);
# .pdata: one prologue/epilogue/info triple per exported function.
3060$code.=<<___ if ($ecb);
3061 .rva .Lecb_enc_prologue
3062 .rva .Lecb_enc_epilogue
3063 .rva .Lecb_enc_info
3064
3065 .rva .Lecb_dec_prologue
3066 .rva .Lecb_dec_epilogue
3067 .rva .Lecb_dec_info
3068___
3069$code.=<<___;
3070 .rva .Lcbc_dec_prologue
3071 .rva .Lcbc_dec_epilogue
3072 .rva .Lcbc_dec_info
3073
3074 .rva .Lctr_enc_prologue
3075 .rva .Lctr_enc_epilogue
3076 .rva .Lctr_enc_info
3077
3078 .rva .Lxts_enc_prologue
3079 .rva .Lxts_enc_epilogue
3080 .rva .Lxts_enc_info
3081
3082 .rva .Lxts_dec_prologue
3083 .rva .Lxts_dec_epilogue
3084 .rva .Lxts_dec_info
3085
3086.section .xdata
3087.align 8
3088___
3089$code.=<<___ if ($ecb);
3090.Lecb_enc_info:
3091 .byte 9,0,0,0
3092 .rva se_handler
3093 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3094.Lecb_dec_info:
3095 .byte 9,0,0,0
3096 .rva se_handler
3097 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3098___
3099$code.=<<___;
3100.Lcbc_dec_info:
3101 .byte 9,0,0,0
3102 .rva se_handler
3103 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3104.Lctr_enc_info:
3105 .byte 9,0,0,0
3106 .rva se_handler
3107 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3108.Lxts_enc_info:
3109 .byte 9,0,0,0
3110 .rva se_handler
3111 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3112.Lxts_dec_info:
3113 .byte 9,0,0,0
3114 .rva se_handler
3115 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3116___
3117}
3118
# Post-process the template: evaluate every backtick-quoted expression
# embedded in the assembly (e.g. `0x10*$i`) into its literal value.
3119$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3120
# Emit the finished assembly on stdout for the build system to assemble.
3121print $code;
3122
3123close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl
deleted file mode 100644
index 6e7bd36d05..0000000000
--- a/src/lib/libcrypto/aes/asm/vpaes-x86.pl
+++ /dev/null
@@ -1,911 +0,0 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
17# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (doesn't have to if called from
19# EVP only). "Drop-in" implies that this module doesn't share key
20# schedule structure with the original nor does it make assumption
21# about its alignment...
22#
23# Performance summary. aes-586.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with 128-bit key, and vpaes-x86.pl column - [also
26# large-block CBC] encrypt/decrypt.
27#
28# aes-586.pl vpaes-x86.pl
29#
30# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
31# Nehalem 27.9/40.4/18.1 10.3/12.0
32# Atom 102./119./60.1 64.5/85.3(***)
33#
34# (*) "Hyper-threading" in the context refers rather to cache shared
35# among multiple cores, than to specifically Intel HTT. As vast
36# majority of contemporary cores share cache, slower code path
37# is common place. In other words "with-hyper-threading-off"
38# results are presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***) Less impressive improvement on Core 2 and Atom is due to slow
43# pshufb, yet it's respectable +32%/65% improvement on Core 2
44# and +58%/40% on Atom (as implied, over "hyper-threading-safe"
45# code path).
46#
47# <appro@openssl.org>
48
# Locate this script's own directory and make the shared perlasm
# framework (x86asm.pl) loadable from ../../perlasm.
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50push(@INC,"${dir}","${dir}../../perlasm");
51require "x86asm.pl";
52
# Output flavour comes from argv[0]; a trailing "386" argument sets
# $x86only — NOTE(review): presumably restricts emitted code to plain
# i386 instructions, confirm against x86asm.pl.
53&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
54
55$PREFIX="vpaes";
56
# Register role map used by all emitters below: round counter, table base,
# shuffle-rotation index, key pointer, constants base, input, output.
57my ($round, $base, $magic, $key, $const, $inp, $out)=
58 ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
59
# Constant tables for the SSSE3 vector-permute AES implementation.  The
# $k_* scalars are byte offsets relative to the constants base register
# ($const), which points at the "_vpaes_consts" label (negative offsets
# address the inv/s0F tables placed before it).
 60 &rodataseg();
61&static_label("_vpaes_consts");
62&static_label("_vpaes_schedule_low_round");
63
64&set_label("_vpaes_consts",64);
65$k_inv=-0x30; # inv, inva
66 &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
67 &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
68
69$k_s0F=-0x10; # s0F
70 &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
71
72$k_ipt=0x00; # input transform (lo, hi)
73 &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
74 &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
75
76$k_sb1=0x20; # sb1u, sb1t
77 &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
78 &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
79$k_sb2=0x40; # sb2u, sb2t
80 &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
81 &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
82$k_sbo=0x60; # sbou, sbot
83 &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
84 &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
85
86$k_mc_forward=0x80; # mc_forward
87 &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
88 &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
89 &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
90 &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
91
92$k_mc_backward=0xc0; # mc_backward
93 &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
94 &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
95 &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
96 &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
97
98$k_sr=0x100; # sr
99 &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
100 &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
101 &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
102 &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
103
104$k_rcon=0x140; # rcon
105 &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
106
107$k_s63=0x150; # s63: all equal to 0x63 transformed
108 &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
109
110$k_opt=0x160; # output transform
111 &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
112 &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
113
114$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
115 &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
116 &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
117##
118## Decryption stuff
119## Key schedule constants
120##
121$k_dksd=0x1a0; # decryption key schedule: invskew x*D
122 &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
123 &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
124$k_dksb=0x1c0; # decryption key schedule: invskew x*B
125 &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
126 &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
127$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
128 &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
129 &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
130$k_dks9=0x200; # decryption key schedule: invskew x*9
131 &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
132 &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
133
134##
135## Decryption stuff
136## Round function constants
137##
138$k_dipt=0x220; # decryption input transform
139 &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
140 &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
141
142$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
143 &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
144 &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
145$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
146 &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
147 &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
148$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
149 &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
150 &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
151$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
152 &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
153 &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
154$k_dsbo=0x2c0; # decryption sbox final output
155 &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
156 &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
157 &previous();
158
# Preload the constants both cores expect in fixed registers:
# xmm7 <- k_inv (S-box inversion table), xmm6 <- k_s0F (low-nibble mask,
# all bytes 0x0F).  Must be called before _vpaes_{en,de}crypt_core.
159&function_begin_B("_vpaes_preheat");
160 &movdqa ("xmm7",&QWP($k_inv,$const));
161 &movdqa ("xmm6",&QWP($k_s0F,$const));
162 &ret ();
163&function_end_B("_vpaes_preheat");
164
165##
166## _aes_encrypt_core
167##
168## AES-encrypt %xmm0.
169##
170## Inputs:
171## %xmm0 = input
172## %xmm6-%xmm7 as in _vpaes_preheat
173## (%edx) = scheduled keys
174##
175## Output in %xmm0
176## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
177##
178##
# AES-encrypt xmm0 using vector permutes (pshufb table lookups) so no
# data-dependent memory accesses occur — constant-time by construction.
# Expects xmm6/xmm7 preloaded by _vpaes_preheat; key schedule at ($key).
179&function_begin_B("_vpaes_encrypt_core");
180 &mov ($magic,16);
181 &mov ($round,&DWP(240,$key));
182 &movdqa ("xmm1","xmm6")
# NOTE(review): the line above is missing its ';' — the '&' starting the
# next line therefore parses as Perl bitwise AND between the two calls.
# Both emitters still run in order (code emission is a side effect), so
# output is unaffected, but adding the semicolon would be clearer.
183 &movdqa ("xmm2",&QWP($k_ipt,$const));
184 &pandn ("xmm1","xmm0");
185 &movdqu ("xmm5",&QWP(0,$key));
186 &psrld ("xmm1",4);
187 &pand ("xmm0","xmm6");
188 &pshufb ("xmm2","xmm0");
189 &movdqa ("xmm0",&QWP($k_ipt+16,$const));
190 &pshufb ("xmm0","xmm1");
191 &pxor ("xmm2","xmm5");
192 &pxor ("xmm0","xmm2");
193 &add ($key,16);
194 &lea ($base,&DWP($k_mc_backward,$const));
195 &jmp (&label("enc_entry"));
196
197
198&set_label("enc_loop",16);
199 # middle of middle round
200 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
201 &pshufb ("xmm4","xmm2"); # 4 = sb1u
202 &pxor ("xmm4","xmm5"); # 4 = sb1u + k
203 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
204 &pshufb ("xmm0","xmm3"); # 0 = sb1t
205 &pxor ("xmm0","xmm4"); # 0 = A
206 &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
207 &pshufb ("xmm5","xmm2"); # 4 = sb2u
208 &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
209 &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
210 &pshufb ("xmm2","xmm3"); # 2 = sb2t
211 &pxor ("xmm2","xmm5"); # 2 = 2A
212 &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
213 &movdqa ("xmm3","xmm0"); # 3 = A
214 &pshufb ("xmm0","xmm1"); # 0 = B
215 &add ($key,16); # next key
216 &pxor ("xmm0","xmm2"); # 0 = 2A+B
217 &pshufb ("xmm3","xmm4"); # 3 = D
218 &add ($magic,16); # next mc
219 &pxor ("xmm3","xmm0"); # 3 = 2A+B+D
220 &pshufb ("xmm0","xmm1"); # 0 = 2B+C
221 &and ($magic,0x30); # ... mod 4
222 &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
223 &sub ($round,1); # nr--
224
225&set_label("enc_entry");
226 # top of round
227 &movdqa ("xmm1","xmm6"); # 1 : i
228 &pandn ("xmm1","xmm0"); # 1 = i<<4
229 &psrld ("xmm1",4); # 1 = i
230 &pand ("xmm0","xmm6"); # 0 = k
231 &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
232 &pshufb ("xmm5","xmm0"); # 2 = a/k
233 &pxor ("xmm0","xmm1"); # 0 = j
234 &movdqa ("xmm3","xmm7"); # 3 : 1/i
235 &pshufb ("xmm3","xmm1"); # 3 = 1/i
236 &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
237 &movdqa ("xmm4","xmm7"); # 4 : 1/j
238 &pshufb ("xmm4","xmm0"); # 4 = 1/j
239 &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
240 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
241 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
242 &pxor ("xmm2","xmm0"); # 2 = io
243 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
244 &movdqu ("xmm5",&QWP(0,$key));
245 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
246 &pxor ("xmm3","xmm1"); # 3 = jo
# jnz consumes the flags set by "sub round,1" above (first entry jumps
# here with flags from the prologue's add, which is non-zero).
247 &jnz (&label("enc_loop"));
248
249 # middle of last round
250 &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
251 &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
252 &pshufb ("xmm4","xmm2"); # 4 = sbou
253 &pxor ("xmm4","xmm5"); # 4 = sb1u + k
254 &pshufb ("xmm0","xmm3"); # 0 = sb1t
255 &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
256 &pxor ("xmm0","xmm4"); # 0 = A
257 &pshufb ("xmm0","xmm1");
258 &ret ();
259&function_end_B("_vpaes_encrypt_core");
260
261##
262## Decryption core
263##
264## Same API as encryption core.
265##
# AES-decrypt xmm0; same calling convention and constant-time property
# as _vpaes_encrypt_core (xmm6/xmm7 preloaded via _vpaes_preheat).
# $magic is derived from the round count to pick the right .Lk_sr row
# for the final InvShiftRows-equivalent shuffle.
266&function_begin_B("_vpaes_decrypt_core");
267 &mov ($round,&DWP(240,$key));
268 &lea ($base,&DWP($k_dsbd,$const));
269 &movdqa ("xmm1","xmm6");
270 &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
271 &pandn ("xmm1","xmm0");
272 &mov ($magic,$round);
273 &psrld ("xmm1",4)
# NOTE(review): missing ';' above — the following '&' parses as Perl
# bitwise AND, yet both calls still execute in order (emission is a side
# effect).  Harmless but worth fixing for readability.
274 &movdqu ("xmm5",&QWP(0,$key));
275 &shl ($magic,4);
276 &pand ("xmm0","xmm6");
277 &pshufb ("xmm2","xmm0");
278 &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
279 &xor ($magic,0x30);
280 &pshufb ("xmm0","xmm1");
281 &and ($magic,0x30);
282 &pxor ("xmm2","xmm5");
283 &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
284 &pxor ("xmm0","xmm2");
285 &add ($key,16);
286 &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
287 &jmp (&label("dec_entry"));

288
289&set_label("dec_loop",16);
290##
291## Inverse mix columns
292##
293 &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
294 &pshufb ("xmm4","xmm2"); # 4 = sb9u
295 &pxor ("xmm4","xmm0");
296 &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
297 &pshufb ("xmm0","xmm3"); # 0 = sb9t
298 &pxor ("xmm0","xmm4"); # 0 = ch
299 &add ($key,16); # next round key
300
301 &pshufb ("xmm0","xmm5"); # MC ch
302 &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
303 &pshufb ("xmm4","xmm2"); # 4 = sbdu
304 &pxor ("xmm4","xmm0"); # 4 = ch
305 &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
306 &pshufb ("xmm0","xmm3"); # 0 = sbdt
307 &pxor ("xmm0","xmm4"); # 0 = ch
308 &sub ($round,1); # nr--
309
310 &pshufb ("xmm0","xmm5"); # MC ch
311 &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
312 &pshufb ("xmm4","xmm2"); # 4 = sbbu
313 &pxor ("xmm4","xmm0"); # 4 = ch
314 &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
315 &pshufb ("xmm0","xmm3"); # 0 = sbbt
316 &pxor ("xmm0","xmm4"); # 0 = ch
317
318 &pshufb ("xmm0","xmm5"); # MC ch
319 &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
320 &pshufb ("xmm4","xmm2"); # 4 = sbeu
321 &pxor ("xmm4","xmm0"); # 4 = ch
322 &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
323 &pshufb ("xmm0","xmm3"); # 0 = sbet
324 &pxor ("xmm0","xmm4"); # 0 = ch
325
# Rotate the mix-columns shuffle mask by 4 bytes for the next round.
326 &palignr("xmm5","xmm5",12);
327
328&set_label("dec_entry");
329 # top of round
330 &movdqa ("xmm1","xmm6"); # 1 : i
331 &pandn ("xmm1","xmm0"); # 1 = i<<4
332 &psrld ("xmm1",4); # 1 = i
333 &pand ("xmm0","xmm6"); # 0 = k
334 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
335 &pshufb ("xmm2","xmm0"); # 2 = a/k
336 &pxor ("xmm0","xmm1"); # 0 = j
337 &movdqa ("xmm3","xmm7"); # 3 : 1/i
338 &pshufb ("xmm3","xmm1"); # 3 = 1/i
339 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
340 &movdqa ("xmm4","xmm7"); # 4 : 1/j
341 &pshufb ("xmm4","xmm0"); # 4 = 1/j
342 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
343 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
344 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
345 &pxor ("xmm2","xmm0"); # 2 = io
346 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
347 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
348 &pxor ("xmm3","xmm1"); # 3 = jo
349 &movdqu ("xmm0",&QWP(0,$key));
# jnz consumes flags from "sub round,1" in the loop body above.
350 &jnz (&label("dec_loop"));
351
352 # middle of last round
353 &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
354 &pshufb ("xmm4","xmm2"); # 4 = sbou
355 &pxor ("xmm4","xmm0"); # 4 = sb1u + k
356 &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
357 &movdqa ("xmm2",&QWP(0,$magic));
358 &pshufb ("xmm0","xmm3"); # 0 = sb1t
359 &pxor ("xmm0","xmm4"); # 0 = A
360 &pshufb ("xmm0","xmm2");
361 &ret ();
362&function_end_B("_vpaes_decrypt_core");
363
364########################################################
365## ##
366## AES key schedule ##
367## ##
368########################################################
369&function_begin_B("_vpaes_schedule_core");
370 &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
371 &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
372
373 # input transform
374 &movdqa ("xmm3","xmm0");
375 &lea ($base,&DWP($k_ipt,$const));
376 &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
377 &call ("_vpaes_schedule_transform");
378 &movdqa ("xmm7","xmm0");
379
380 &test ($out,$out);
381 &jnz (&label("schedule_am_decrypting"));
382
383 # encrypting, output zeroth round key after transform
384 &movdqu (&QWP(0,$key),"xmm0");
385 &jmp (&label("schedule_go"));
386
387&set_label("schedule_am_decrypting");
388 # decrypting, output zeroth round key after shiftrows
389 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
390 &pshufb ("xmm3","xmm1");
391 &movdqu (&QWP(0,$key),"xmm3");
392 &xor ($magic,0x30);
393
394&set_label("schedule_go");
395 &cmp ($round,192);
396 &ja (&label("schedule_256"));
397 &je (&label("schedule_192"));
399 # 128: fall through
399
400##
401## .schedule_128
402##
403## 128-bit specific part of key schedule.
404##
405## This schedule is really simple, because all its parts
406## are accomplished by the subroutines.
407##
408&set_label("schedule_128");
409 &mov ($round,10);
410
411&set_label("loop_schedule_128");
412 &call ("_vpaes_schedule_round");
413 &dec ($round);
414 &jz (&label("schedule_mangle_last"));
415 &call ("_vpaes_schedule_mangle"); # write output
416 &jmp (&label("loop_schedule_128"));
417
418##
419## .aes_schedule_192
420##
421## 192-bit specific part of key schedule.
422##
423## The main body of this schedule is the same as the 128-bit
424## schedule, but with more smearing. The long, high side is
425## stored in %xmm7 as before, and the short, low side is in
426## the high bits of %xmm6.
427##
428## This schedule is somewhat nastier, however, because each
429## round produces 192 bits of key material, or 1.5 round keys.
430## Therefore, on each cycle we do 2 rounds and produce 3 round
431## keys.
432##
433&set_label("schedule_192",16);
434 &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
435 &call ("_vpaes_schedule_transform"); # input transform
436 &movdqa ("xmm6","xmm0"); # save short part
437 &pxor ("xmm4","xmm4"); # clear 4
438 &movhlps("xmm6","xmm4"); # clobber low side with zeros
439 &mov ($round,4);
440
441&set_label("loop_schedule_192");
442 &call ("_vpaes_schedule_round");
443 &palignr("xmm0","xmm6",8);
444 &call ("_vpaes_schedule_mangle"); # save key n
445 &call ("_vpaes_schedule_192_smear");
446 &call ("_vpaes_schedule_mangle"); # save key n+1
447 &call ("_vpaes_schedule_round");
448 &dec ($round);
449 &jz (&label("schedule_mangle_last"));
450 &call ("_vpaes_schedule_mangle"); # save key n+2
451 &call ("_vpaes_schedule_192_smear");
452 &jmp (&label("loop_schedule_192"));
453
454##
455## .aes_schedule_256
456##
457## 256-bit specific part of key schedule.
458##
459## The structure here is very similar to the 128-bit
460## schedule, but with an additional "low side" in
461## %xmm6. The low side's rounds are the same as the
462## high side's, except no rcon and no rotation.
463##
464&set_label("schedule_256",16);
465 &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
466 &call ("_vpaes_schedule_transform"); # input transform
467 &mov ($round,7);
468
469&set_label("loop_schedule_256");
470 &call ("_vpaes_schedule_mangle"); # output low result
471 &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
472
473 # high round
474 &call ("_vpaes_schedule_round");
475 &dec ($round);
476 &jz (&label("schedule_mangle_last"));
477 &call ("_vpaes_schedule_mangle");
478
479 # low round. swap xmm7 and xmm6
480 &pshufd ("xmm0","xmm0",0xFF);
481 &movdqa (&QWP(20,"esp"),"xmm7");
482 &movdqa ("xmm7","xmm6");
483 &call ("_vpaes_schedule_low_round");
484 &movdqa ("xmm7",&QWP(20,"esp"));
485
486 &jmp (&label("loop_schedule_256"));
487
488##
489## .aes_schedule_mangle_last
490##
491## Mangler for last round of key schedule
492## Mangles %xmm0
493## when encrypting, outputs out(%xmm0) ^ 63
494## when decrypting, outputs unskew(%xmm0)
495##
496## Always called right before return... jumps to cleanup and exits
497##
498&set_label("schedule_mangle_last",16);
499 # schedule last round key from xmm0
500 &lea ($base,&DWP($k_deskew,$const));
501 &test ($out,$out);
502 &jnz (&label("schedule_mangle_last_dec"));
503
504 # encrypting
505 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
506 &pshufb ("xmm0","xmm1"); # output permute
507 &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
508 &add ($key,32);
509
510&set_label("schedule_mangle_last_dec");
511 &add ($key,-16);
512 &pxor ("xmm0",&QWP($k_s63,$const));
513 &call ("_vpaes_schedule_transform"); # output transform
514 &movdqu (&QWP(0,$key),"xmm0"); # save last key
515
516 # cleanup
517 &pxor ("xmm0","xmm0");
518 &pxor ("xmm1","xmm1");
519 &pxor ("xmm2","xmm2");
520 &pxor ("xmm3","xmm3");
521 &pxor ("xmm4","xmm4");
522 &pxor ("xmm5","xmm5");
523 &pxor ("xmm6","xmm6");
524 &pxor ("xmm7","xmm7");
525 &ret ();
526&function_end_B("_vpaes_schedule_core");
527
528##
529## .aes_schedule_192_smear
530##
531## Smear the short, low side in the 192-bit key schedule.
532##
533## Inputs:
534## %xmm7: high side, b a x y
535## %xmm6: low side, d c 0 0
536## %xmm13: 0
537##
538## Outputs:
539## %xmm6: b+c+d b+c 0 0
540## %xmm0: b+c+d b+c b a
541##
542&function_begin_B("_vpaes_schedule_192_smear");
543 &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
544 &pxor ("xmm6","xmm0"); # -> c+d c 0 0
545 &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
546 &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
547 &movdqa ("xmm0","xmm6");
548 &pxor ("xmm1","xmm1");
549 &movhlps("xmm6","xmm1"); # clobber low side with zeros
550 &ret ();
551&function_end_B("_vpaes_schedule_192_smear");
552
553##
554## .aes_schedule_round
555##
556## Runs one main round of the key schedule on %xmm0, %xmm7
557##
558## Specifically, runs subbytes on the high dword of %xmm0
559## then rotates it by one byte and xors into the low dword of
560## %xmm7.
561##
562## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
563## next rcon.
564##
565## Smears the dwords of %xmm7 by xoring the low into the
566## second low, result into third, result into highest.
567##
568## Returns results in %xmm7 = %xmm0.
569## Clobbers %xmm1-%xmm5.
570##
571&function_begin_B("_vpaes_schedule_round");
572 # extract rcon from xmm8
573 &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
574 &pxor ("xmm1","xmm1");
575 &palignr("xmm1","xmm2",15);
576 &palignr("xmm2","xmm2",15);
577 &pxor ("xmm7","xmm1");
578
579 # rotate
580 &pshufd ("xmm0","xmm0",0xFF);
581 &palignr("xmm0","xmm0",1);
582
583 # fall through...
584 &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
585
586 # low round: same as high round, but no rotation and no rcon.
587&set_label("_vpaes_schedule_low_round");
588 # smear xmm7
589 &movdqa ("xmm1","xmm7");
590 &pslldq ("xmm7",4);
591 &pxor ("xmm7","xmm1");
592 &movdqa ("xmm1","xmm7");
593 &pslldq ("xmm7",8);
594 &pxor ("xmm7","xmm1");
595 &pxor ("xmm7",&QWP($k_s63,$const));
596
597 # subbyte
598 &movdqa ("xmm4",&QWP($k_s0F,$const));
599 &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
600 &movdqa ("xmm1","xmm4");
601 &pandn ("xmm1","xmm0");
602 &psrld ("xmm1",4); # 1 = i
603 &pand ("xmm0","xmm4"); # 0 = k
604 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
605 &pshufb ("xmm2","xmm0"); # 2 = a/k
606 &pxor ("xmm0","xmm1"); # 0 = j
607 &movdqa ("xmm3","xmm5"); # 3 : 1/i
608 &pshufb ("xmm3","xmm1"); # 3 = 1/i
609 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
610 &movdqa ("xmm4","xmm5"); # 4 : 1/j
611 &pshufb ("xmm4","xmm0"); # 4 = 1/j
612 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
613 &movdqa ("xmm2","xmm5"); # 2 : 1/iak
614 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
615 &pxor ("xmm2","xmm0"); # 2 = io
616 &movdqa ("xmm3","xmm5"); # 3 : 1/jak
617 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
618 &pxor ("xmm3","xmm1"); # 3 = jo
619 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
620 &pshufb ("xmm4","xmm2"); # 4 = sbou
621 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
622 &pshufb ("xmm0","xmm3"); # 0 = sb1t
623 &pxor ("xmm0","xmm4"); # 0 = sbox output
624
625 # add in smeared stuff
626 &pxor ("xmm0","xmm7");
627 &movdqa ("xmm7","xmm0");
628 &ret ();
629&function_end_B("_vpaes_schedule_round");
630
631##
632## .aes_schedule_transform
633##
634## Linear-transform %xmm0 according to tables at (%ebx)
635##
636## Output in %xmm0
637## Clobbers %xmm1, %xmm2
638##
639&function_begin_B("_vpaes_schedule_transform");
640 &movdqa ("xmm2",&QWP($k_s0F,$const));
641 &movdqa ("xmm1","xmm2");
642 &pandn ("xmm1","xmm0");
643 &psrld ("xmm1",4);
644 &pand ("xmm0","xmm2");
645 &movdqa ("xmm2",&QWP(0,$base));
646 &pshufb ("xmm2","xmm0");
647 &movdqa ("xmm0",&QWP(16,$base));
648 &pshufb ("xmm0","xmm1");
649 &pxor ("xmm0","xmm2");
650 &ret ();
651&function_end_B("_vpaes_schedule_transform");
652
653##
654## .aes_schedule_mangle
655##
656## Mangle xmm0 from (basis-transformed) standard version
657## to our version.
658##
659## On encrypt,
660## xor with 0x63
661## multiply by circulant 0,1,1,1
662## apply shiftrows transform
663##
664## On decrypt,
665## xor with 0x63
666## multiply by "inverse mixcolumns" circulant E,B,D,9
667## deskew
668## apply shiftrows transform
669##
670##
671## Writes out to (%edx), and increments or decrements it
672## Keeps track of round number mod 4 in %ecx
673## Preserves xmm0
674## Clobbers xmm1-xmm5
675##
676&function_begin_B("_vpaes_schedule_mangle");
677 &movdqa ("xmm4","xmm0"); # save xmm0 for later
678 &movdqa ("xmm5",&QWP($k_mc_forward,$const));
679 &test ($out,$out);
680 &jnz (&label("schedule_mangle_dec"));
681
682 # encrypting
683 &add ($key,16);
684 &pxor ("xmm4",&QWP($k_s63,$const));
685 &pshufb ("xmm4","xmm5");
686 &movdqa ("xmm3","xmm4");
687 &pshufb ("xmm4","xmm5");
688 &pxor ("xmm3","xmm4");
689 &pshufb ("xmm4","xmm5");
690 &pxor ("xmm3","xmm4");
691
692 &jmp (&label("schedule_mangle_both"));
693
694&set_label("schedule_mangle_dec",16);
695 # inverse mix columns
696 &movdqa ("xmm2",&QWP($k_s0F,$const));
697 &lea ($inp,&DWP($k_dksd,$const));
698 &movdqa ("xmm1","xmm2");
699 &pandn ("xmm1","xmm4");
700 &psrld ("xmm1",4); # 1 = hi
701 &pand ("xmm4","xmm2"); # 4 = lo
702
703 &movdqa ("xmm2",&QWP(0,$inp));
704 &pshufb ("xmm2","xmm4");
705 &movdqa ("xmm3",&QWP(0x10,$inp));
706 &pshufb ("xmm3","xmm1");
707 &pxor ("xmm3","xmm2");
708 &pshufb ("xmm3","xmm5");
709
710 &movdqa ("xmm2",&QWP(0x20,$inp));
711 &pshufb ("xmm2","xmm4");
712 &pxor ("xmm2","xmm3");
713 &movdqa ("xmm3",&QWP(0x30,$inp));
714 &pshufb ("xmm3","xmm1");
715 &pxor ("xmm3","xmm2");
716 &pshufb ("xmm3","xmm5");
717
718 &movdqa ("xmm2",&QWP(0x40,$inp));
719 &pshufb ("xmm2","xmm4");
720 &pxor ("xmm2","xmm3");
721 &movdqa ("xmm3",&QWP(0x50,$inp));
722 &pshufb ("xmm3","xmm1");
723 &pxor ("xmm3","xmm2");
724 &pshufb ("xmm3","xmm5");
725
726 &movdqa ("xmm2",&QWP(0x60,$inp));
727 &pshufb ("xmm2","xmm4");
728 &pxor ("xmm2","xmm3");
729 &movdqa ("xmm3",&QWP(0x70,$inp));
730 &pshufb ("xmm3","xmm1");
731 &pxor ("xmm3","xmm2");
732
733 &add ($key,-16);
734
735&set_label("schedule_mangle_both");
736 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
737 &pshufb ("xmm3","xmm1");
738 &add ($magic,-16);
739 &and ($magic,0x30);
740 &movdqu (&QWP(0,$key),"xmm3");
741 &ret ();
742&function_end_B("_vpaes_schedule_mangle");
743
744#
745# Interface to OpenSSL
746#
747&function_begin("${PREFIX}_set_encrypt_key");
748 &mov ($inp,&wparam(0)); # inp
749 &lea ($base,&DWP(-56,"esp"));
750 &mov ($round,&wparam(1)); # bits
751 &and ($base,-16);
752 &mov ($key,&wparam(2)); # key
753 &xchg ($base,"esp"); # alloca
754 &mov (&DWP(48,"esp"),$base);
755
756 &mov ($base,$round);
757 &shr ($base,5);
758 &add ($base,5);
759 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
760 &mov ($magic,0x30);
761 &mov ($out,0);
762
763 &picsetup($const);
764 &picsymbol($const, &label("_vpaes_consts"), $const);
765 &lea ($const,&DWP(0x30,$const))
766
767 &call ("_vpaes_schedule_core");
768
769 &mov ("esp",&DWP(48,"esp"));
770 &xor ("eax","eax");
771&function_end("${PREFIX}_set_encrypt_key");
772
773&function_begin("${PREFIX}_set_decrypt_key");
774 &mov ($inp,&wparam(0)); # inp
775 &lea ($base,&DWP(-56,"esp"));
776 &mov ($round,&wparam(1)); # bits
777 &and ($base,-16);
778 &mov ($key,&wparam(2)); # key
779 &xchg ($base,"esp"); # alloca
780 &mov (&DWP(48,"esp"),$base);
781
782 &mov ($base,$round);
783 &shr ($base,5);
784 &add ($base,5);
785 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
786 &shl ($base,4);
787 &lea ($key,&DWP(16,$key,$base));
788
789 &mov ($out,1);
790 &mov ($magic,$round);
791 &shr ($magic,1);
792 &and ($magic,32);
793 &xor ($magic,32); # nbits==192?0:32;
794
795 &picsetup($const);
796 &picsymbol($const, &label("_vpaes_consts"), $const);
797 &lea ($const,&DWP(0x30,$const))
798
799 &call ("_vpaes_schedule_core");
800
801 &mov ("esp",&DWP(48,"esp"));
802 &xor ("eax","eax");
803&function_end("${PREFIX}_set_decrypt_key");
804
805&function_begin("${PREFIX}_encrypt");
806 &picsetup($const);
807 &picsymbol($const, &label("_vpaes_consts"), $const);
808 &lea ($const,&DWP(0x30,$const))
809
810 &call ("_vpaes_preheat");
811 &mov ($inp,&wparam(0)); # inp
812 &lea ($base,&DWP(-56,"esp"));
813 &mov ($out,&wparam(1)); # out
814 &and ($base,-16);
815 &mov ($key,&wparam(2)); # key
816 &xchg ($base,"esp"); # alloca
817 &mov (&DWP(48,"esp"),$base);
818
819 &movdqu ("xmm0",&QWP(0,$inp));
820 &call ("_vpaes_encrypt_core");
821 &movdqu (&QWP(0,$out),"xmm0");
822
823 &mov ("esp",&DWP(48,"esp"));
824&function_end("${PREFIX}_encrypt");
825
826&function_begin("${PREFIX}_decrypt");
827 &picsetup($const);
828 &picsymbol($const, &label("_vpaes_consts"), $const);
829 &lea ($const,&DWP(0x30,$const))
830
831 &call ("_vpaes_preheat");
832 &mov ($inp,&wparam(0)); # inp
833 &lea ($base,&DWP(-56,"esp"));
834 &mov ($out,&wparam(1)); # out
835 &and ($base,-16);
836 &mov ($key,&wparam(2)); # key
837 &xchg ($base,"esp"); # alloca
838 &mov (&DWP(48,"esp"),$base);
839
840 &movdqu ("xmm0",&QWP(0,$inp));
841 &call ("_vpaes_decrypt_core");
842 &movdqu (&QWP(0,$out),"xmm0");
843
844 &mov ("esp",&DWP(48,"esp"));
845&function_end("${PREFIX}_decrypt");
846
847&function_begin("${PREFIX}_cbc_encrypt");
848 &mov ($inp,&wparam(0)); # inp
849 &mov ($out,&wparam(1)); # out
850 &mov ($round,&wparam(2)); # len
851 &mov ($key,&wparam(3)); # key
852 &sub ($round,16);
853 &jc (&label("cbc_abort"));
854 &lea ($base,&DWP(-56,"esp"));
855 &mov ($const,&wparam(4)); # ivp
856 &and ($base,-16);
857 &mov ($magic,&wparam(5)); # enc
858 &xchg ($base,"esp"); # alloca
859 &movdqu ("xmm1",&QWP(0,$const)); # load IV
860 &sub ($out,$inp);
861 &mov (&DWP(48,"esp"),$base);
862
863 &mov (&DWP(0,"esp"),$out); # save out
864 &mov (&DWP(4,"esp"),$key) # save key
865 &mov (&DWP(8,"esp"),$const); # save ivp
866 &mov ($out,$round); # $out works as $len
867
868 &picsetup($const);
869 &picsymbol($const, &label("_vpaes_consts"), $const);
870 &lea ($const,&DWP(0x30,$const))
871
872 &call ("_vpaes_preheat");
873 &cmp ($magic,0);
874 &je (&label("cbc_dec_loop"));
875 &jmp (&label("cbc_enc_loop"));
876
877&set_label("cbc_enc_loop",16);
878 &movdqu ("xmm0",&QWP(0,$inp)); # load input
879 &pxor ("xmm0","xmm1"); # inp^=iv
880 &call ("_vpaes_encrypt_core");
881 &mov ($base,&DWP(0,"esp")); # restore out
882 &mov ($key,&DWP(4,"esp")); # restore key
883 &movdqa ("xmm1","xmm0");
884 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
885 &lea ($inp,&DWP(16,$inp));
886 &sub ($out,16);
887 &jnc (&label("cbc_enc_loop"));
888 &jmp (&label("cbc_done"));
889
890&set_label("cbc_dec_loop",16);
891 &movdqu ("xmm0",&QWP(0,$inp)); # load input
892 &movdqa (&QWP(16,"esp"),"xmm1"); # save IV
893 &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
894 &call ("_vpaes_decrypt_core");
895 &mov ($base,&DWP(0,"esp")); # restore out
896 &mov ($key,&DWP(4,"esp")); # restore key
897 &pxor ("xmm0",&QWP(16,"esp")); # out^=iv
898 &movdqa ("xmm1",&QWP(32,"esp")); # load next IV
899 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
900 &lea ($inp,&DWP(16,$inp));
901 &sub ($out,16);
902 &jnc (&label("cbc_dec_loop"));
903
904&set_label("cbc_done");
905 &mov ($base,&DWP(8,"esp")); # restore ivp
906 &mov ("esp",&DWP(48,"esp"));
907 &movdqu (&QWP(0,$base),"xmm1"); # write IV
908&set_label("cbc_abort");
909&function_end("${PREFIX}_cbc_encrypt");
910
911&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
deleted file mode 100644
index 7d92e8d8ca..0000000000
--- a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
+++ /dev/null
@@ -1,1222 +0,0 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Interface to OpenSSL as "almost" drop-in replacement for
17# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (doesn't have to if called from
19# EVP only). "Drop-in" implies that this module doesn't share key
20# schedule structure with the original nor does it make assumption
21# about its alignment...
22#
23# Performance summary. aes-x86_64.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with 128-bit key, and vpaes-x86_64.pl column -
26# [also large-block CBC] encrypt/decrypt.
27#
28# aes-x86_64.pl vpaes-x86_64.pl
29#
30# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
31# Nehalem 30.5/42.2/14.6 9.8/11.8
32# Atom 63.9/79.0/32.1 64.0/84.8(***)
33#
34# (*) "Hyper-threading" in the context refers rather to cache shared
35# among multiple cores, than to specifically Intel HTT. As vast
36# majority of contemporary cores share cache, slower code path
37# is common place. In other words "with-hyper-threading-off"
38# results are presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***) Less impressive improvement on Core 2 and Atom is due to slow
43# pshufb, yet it's respectable +40%/78% improvement on Core 2
44# (as implied, over "hyper-threading-safe" code path).
45#
46# <appro@openssl.org>
47
48$flavour = shift;
49$output = shift;
50if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
51
52$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53
54$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57die "can't locate x86_64-xlate.pl";
58
59open OUT,"| \"$^X\" $xlate $flavour $output";
60*STDOUT=*OUT;
61
62$PREFIX="vpaes";
63
64$code.=<<___;
65.text
66
67##
68## _aes_encrypt_core
69##
70## AES-encrypt %xmm0.
71##
72## Inputs:
73## %xmm0 = input
74## %xmm9-%xmm15 as in _vpaes_preheat
75## (%rdx) = scheduled keys
76##
77## Output in %xmm0
78## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
79## Preserves %xmm6 - %xmm8 so you get some local vectors
80##
81##
82.type _vpaes_encrypt_core,\@abi-omnipotent
83.align 16
84_vpaes_encrypt_core:
85 _CET_ENDBR
86 mov %rdx, %r9
87 mov \$16, %r11
88 mov 240(%rdx),%eax
89 movdqa %xmm9, %xmm1
90 movdqa .Lk_ipt(%rip), %xmm2 # iptlo
91 pandn %xmm0, %xmm1
92 movdqu (%r9), %xmm5 # round0 key
93 psrld \$4, %xmm1
94 pand %xmm9, %xmm0
95 pshufb %xmm0, %xmm2
96 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
97 pshufb %xmm1, %xmm0
98 pxor %xmm5, %xmm2
99 pxor %xmm2, %xmm0
100 add \$16, %r9
101 lea .Lk_mc_backward(%rip),%r10
102 jmp .Lenc_entry
103
104.align 16
105.Lenc_loop:
106 # middle of middle round
107 movdqa %xmm13, %xmm4 # 4 : sb1u
108 pshufb %xmm2, %xmm4 # 4 = sb1u
109 pxor %xmm5, %xmm4 # 4 = sb1u + k
110 movdqa %xmm12, %xmm0 # 0 : sb1t
111 pshufb %xmm3, %xmm0 # 0 = sb1t
112 pxor %xmm4, %xmm0 # 0 = A
113 movdqa %xmm15, %xmm5 # 4 : sb2u
114 pshufb %xmm2, %xmm5 # 4 = sb2u
115 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
116 movdqa %xmm14, %xmm2 # 2 : sb2t
117 pshufb %xmm3, %xmm2 # 2 = sb2t
118 pxor %xmm5, %xmm2 # 2 = 2A
119 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
120 movdqa %xmm0, %xmm3 # 3 = A
121 pshufb %xmm1, %xmm0 # 0 = B
122 add \$16, %r9 # next key
123 pxor %xmm2, %xmm0 # 0 = 2A+B
124 pshufb %xmm4, %xmm3 # 3 = D
125 add \$16, %r11 # next mc
126 pxor %xmm0, %xmm3 # 3 = 2A+B+D
127 pshufb %xmm1, %xmm0 # 0 = 2B+C
128 and \$0x30, %r11 # ... mod 4
129 pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
130 sub \$1,%rax # nr--
131
132.Lenc_entry:
133 # top of round
134 movdqa %xmm9, %xmm1 # 1 : i
135 pandn %xmm0, %xmm1 # 1 = i<<4
136 psrld \$4, %xmm1 # 1 = i
137 pand %xmm9, %xmm0 # 0 = k
138 movdqa %xmm11, %xmm5 # 2 : a/k
139 pshufb %xmm0, %xmm5 # 2 = a/k
140 pxor %xmm1, %xmm0 # 0 = j
141 movdqa %xmm10, %xmm3 # 3 : 1/i
142 pshufb %xmm1, %xmm3 # 3 = 1/i
143 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
144 movdqa %xmm10, %xmm4 # 4 : 1/j
145 pshufb %xmm0, %xmm4 # 4 = 1/j
146 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
147 movdqa %xmm10, %xmm2 # 2 : 1/iak
148 pshufb %xmm3, %xmm2 # 2 = 1/iak
149 pxor %xmm0, %xmm2 # 2 = io
150 movdqa %xmm10, %xmm3 # 3 : 1/jak
151 movdqu (%r9), %xmm5
152 pshufb %xmm4, %xmm3 # 3 = 1/jak
153 pxor %xmm1, %xmm3 # 3 = jo
154 jnz .Lenc_loop
155
156 # middle of last round
157 movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
158 movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
159 pshufb %xmm2, %xmm4 # 4 = sbou
160 pxor %xmm5, %xmm4 # 4 = sb1u + k
161 pshufb %xmm3, %xmm0 # 0 = sb1t
162 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
163 pxor %xmm4, %xmm0 # 0 = A
164 pshufb %xmm1, %xmm0
165 ret
166.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
167
168##
169## Decryption core
170##
171## Same API as encryption core.
172##
173.type _vpaes_decrypt_core,\@abi-omnipotent
174.align 16
175_vpaes_decrypt_core:
176 _CET_ENDBR
177 mov %rdx, %r9 # load key
178 mov 240(%rdx),%eax
179 movdqa %xmm9, %xmm1
180 movdqa .Lk_dipt(%rip), %xmm2 # iptlo
181 pandn %xmm0, %xmm1
182 mov %rax, %r11
183 psrld \$4, %xmm1
184 movdqu (%r9), %xmm5 # round0 key
185 shl \$4, %r11
186 pand %xmm9, %xmm0
187 pshufb %xmm0, %xmm2
188 movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
189 xor \$0x30, %r11
190 lea .Lk_dsbd(%rip),%r10
191 pshufb %xmm1, %xmm0
192 and \$0x30, %r11
193 pxor %xmm5, %xmm2
194 movdqa .Lk_mc_forward+48(%rip), %xmm5
195 pxor %xmm2, %xmm0
196 add \$16, %r9
197 add %r10, %r11
198 jmp .Ldec_entry
199
200.align 16
201.Ldec_loop:
202##
203## Inverse mix columns
204##
205 movdqa -0x20(%r10),%xmm4 # 4 : sb9u
206 pshufb %xmm2, %xmm4 # 4 = sb9u
207 pxor %xmm0, %xmm4
208 movdqa -0x10(%r10),%xmm0 # 0 : sb9t
209 pshufb %xmm3, %xmm0 # 0 = sb9t
210 pxor %xmm4, %xmm0 # 0 = ch
211 add \$16, %r9 # next round key
212
213 pshufb %xmm5, %xmm0 # MC ch
214 movdqa 0x00(%r10),%xmm4 # 4 : sbdu
215 pshufb %xmm2, %xmm4 # 4 = sbdu
216 pxor %xmm0, %xmm4 # 4 = ch
217 movdqa 0x10(%r10),%xmm0 # 0 : sbdt
218 pshufb %xmm3, %xmm0 # 0 = sbdt
219 pxor %xmm4, %xmm0 # 0 = ch
220 sub \$1,%rax # nr--
221
222 pshufb %xmm5, %xmm0 # MC ch
223 movdqa 0x20(%r10),%xmm4 # 4 : sbbu
224 pshufb %xmm2, %xmm4 # 4 = sbbu
225 pxor %xmm0, %xmm4 # 4 = ch
226 movdqa 0x30(%r10),%xmm0 # 0 : sbbt
227 pshufb %xmm3, %xmm0 # 0 = sbbt
228 pxor %xmm4, %xmm0 # 0 = ch
229
230 pshufb %xmm5, %xmm0 # MC ch
231 movdqa 0x40(%r10),%xmm4 # 4 : sbeu
232 pshufb %xmm2, %xmm4 # 4 = sbeu
233 pxor %xmm0, %xmm4 # 4 = ch
234 movdqa 0x50(%r10),%xmm0 # 0 : sbet
235 pshufb %xmm3, %xmm0 # 0 = sbet
236 pxor %xmm4, %xmm0 # 0 = ch
237
238 palignr \$12, %xmm5, %xmm5
239
240.Ldec_entry:
241 # top of round
242 movdqa %xmm9, %xmm1 # 1 : i
243 pandn %xmm0, %xmm1 # 1 = i<<4
244 psrld \$4, %xmm1 # 1 = i
245 pand %xmm9, %xmm0 # 0 = k
246 movdqa %xmm11, %xmm2 # 2 : a/k
247 pshufb %xmm0, %xmm2 # 2 = a/k
248 pxor %xmm1, %xmm0 # 0 = j
249 movdqa %xmm10, %xmm3 # 3 : 1/i
250 pshufb %xmm1, %xmm3 # 3 = 1/i
251 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
252 movdqa %xmm10, %xmm4 # 4 : 1/j
253 pshufb %xmm0, %xmm4 # 4 = 1/j
254 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
255 movdqa %xmm10, %xmm2 # 2 : 1/iak
256 pshufb %xmm3, %xmm2 # 2 = 1/iak
257 pxor %xmm0, %xmm2 # 2 = io
258 movdqa %xmm10, %xmm3 # 3 : 1/jak
259 pshufb %xmm4, %xmm3 # 3 = 1/jak
260 pxor %xmm1, %xmm3 # 3 = jo
261 movdqu (%r9), %xmm0
262 jnz .Ldec_loop
263
264 # middle of last round
265 movdqa 0x60(%r10), %xmm4 # 3 : sbou
266 pshufb %xmm2, %xmm4 # 4 = sbou
267 pxor %xmm0, %xmm4 # 4 = sb1u + k
268 movdqa 0x70(%r10), %xmm0 # 0 : sbot
269 movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
270 pshufb %xmm3, %xmm0 # 0 = sb1t
271 pxor %xmm4, %xmm0 # 0 = A
272 pshufb %xmm2, %xmm0
273 ret
274.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
275
276########################################################
277## ##
278## AES key schedule ##
279## ##
280########################################################
281.type _vpaes_schedule_core,\@abi-omnipotent
282.align 16
283_vpaes_schedule_core:
284 _CET_ENDBR
285 # rdi = key
286 # rsi = size in bits
287 # rdx = buffer
288 # rcx = direction. 0=encrypt, 1=decrypt
289
290 call _vpaes_preheat # load the tables
291 movdqa .Lk_rcon(%rip), %xmm8 # load rcon
292 movdqu (%rdi), %xmm0 # load key (unaligned)
293
294 # input transform
295 movdqa %xmm0, %xmm3
296 lea .Lk_ipt(%rip), %r11
297 call _vpaes_schedule_transform
298 movdqa %xmm0, %xmm7
299
300 lea .Lk_sr(%rip),%r10
301 test %rcx, %rcx
302 jnz .Lschedule_am_decrypting
303
304 # encrypting, output zeroth round key after transform
305 movdqu %xmm0, (%rdx)
306 jmp .Lschedule_go
307
308.Lschedule_am_decrypting:
309 # decrypting, output zeroth round key after shiftrows
310 movdqa (%r8,%r10),%xmm1
311 pshufb %xmm1, %xmm3
312 movdqu %xmm3, (%rdx)
313 xor \$0x30, %r8
314
315.Lschedule_go:
316 cmp \$192, %esi
317 ja .Lschedule_256
318 je .Lschedule_192
319 # 128: fall through
320
321##
322## .schedule_128
323##
324## 128-bit specific part of key schedule.
325##
326## This schedule is really simple, because all its parts
327## are accomplished by the subroutines.
328##
329.Lschedule_128:
330 mov \$10, %esi
331
332.Loop_schedule_128:
333 call _vpaes_schedule_round
334 dec %rsi
335 jz .Lschedule_mangle_last
336 call _vpaes_schedule_mangle # write output
337 jmp .Loop_schedule_128
338
339##
340## .aes_schedule_192
341##
342## 192-bit specific part of key schedule.
343##
344## The main body of this schedule is the same as the 128-bit
345## schedule, but with more smearing. The long, high side is
346## stored in %xmm7 as before, and the short, low side is in
347## the high bits of %xmm6.
348##
349## This schedule is somewhat nastier, however, because each
350## round produces 192 bits of key material, or 1.5 round keys.
351## Therefore, on each cycle we do 2 rounds and produce 3 round
352## keys.
353##
354.align 16
355.Lschedule_192:
356 movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
357 call _vpaes_schedule_transform # input transform
358 movdqa %xmm0, %xmm6 # save short part
359 pxor %xmm4, %xmm4 # clear 4
360 movhlps %xmm4, %xmm6 # clobber low side with zeros
361 mov \$4, %esi
362
363.Loop_schedule_192:
364 call _vpaes_schedule_round
365 palignr \$8,%xmm6,%xmm0
366 call _vpaes_schedule_mangle # save key n
367 call _vpaes_schedule_192_smear
368 call _vpaes_schedule_mangle # save key n+1
369 call _vpaes_schedule_round
370 dec %rsi
371 jz .Lschedule_mangle_last
372 call _vpaes_schedule_mangle # save key n+2
373 call _vpaes_schedule_192_smear
374 jmp .Loop_schedule_192
375
376##
377## .aes_schedule_256
378##
379## 256-bit specific part of key schedule.
380##
381## The structure here is very similar to the 128-bit
382## schedule, but with an additional "low side" in
383## %xmm6. The low side's rounds are the same as the
384## high side's, except no rcon and no rotation.
385##
386.align 16
387.Lschedule_256:
388 movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
389 call _vpaes_schedule_transform # input transform
390 mov \$7, %esi
391
392.Loop_schedule_256:
393 call _vpaes_schedule_mangle # output low result
394 movdqa %xmm0, %xmm6 # save cur_lo in xmm6
395
396 # high round
397 call _vpaes_schedule_round
398 dec %rsi
399 jz .Lschedule_mangle_last
400 call _vpaes_schedule_mangle
401
402 # low round. swap xmm7 and xmm6
403 pshufd \$0xFF, %xmm0, %xmm0
404 movdqa %xmm7, %xmm5
405 movdqa %xmm6, %xmm7
406 call _vpaes_schedule_low_round
407 movdqa %xmm5, %xmm7
408
409 jmp .Loop_schedule_256
410
411
412##
413## .aes_schedule_mangle_last
414##
415## Mangler for last round of key schedule
416## Mangles %xmm0
417## when encrypting, outputs out(%xmm0) ^ 63
418## when decrypting, outputs unskew(%xmm0)
419##
420## Always called right before return... jumps to cleanup and exits
421##
422.align 16
423.Lschedule_mangle_last:
424 # schedule last round key from xmm0
425 lea .Lk_deskew(%rip),%r11 # prepare to deskew
426 test %rcx, %rcx
427 jnz .Lschedule_mangle_last_dec
428
429 # encrypting
430 movdqa (%r8,%r10),%xmm1
431 pshufb %xmm1, %xmm0 # output permute
432 lea .Lk_opt(%rip), %r11 # prepare to output transform
433 add \$32, %rdx
434
435.Lschedule_mangle_last_dec:
436 add \$-16, %rdx
437 pxor .Lk_s63(%rip), %xmm0
438 call _vpaes_schedule_transform # output transform
439 movdqu %xmm0, (%rdx) # save last key
440
441 # cleanup
442 pxor %xmm0, %xmm0
443 pxor %xmm1, %xmm1
444 pxor %xmm2, %xmm2
445 pxor %xmm3, %xmm3
446 pxor %xmm4, %xmm4
447 pxor %xmm5, %xmm5
448 pxor %xmm6, %xmm6
449 pxor %xmm7, %xmm7
450 ret
451.size _vpaes_schedule_core,.-_vpaes_schedule_core
452
453##
454## .aes_schedule_192_smear
455##
456## Smear the short, low side in the 192-bit key schedule.
457##
458## Inputs:
459## %xmm7: high side, b a x y
460## %xmm6: low side, d c 0 0
461## %xmm13: 0
## NOTE(review): the code below zeroes %xmm1 itself and never reads
## %xmm13 — the "%xmm13: 0" input claim looks stale; confirm.
462##
463## Outputs:
464## %xmm6: b+c+d b+c 0 0
465## %xmm0: b+c+d b+c b a
466##
467.type _vpaes_schedule_192_smear,\@abi-omnipotent
468.align 16
469_vpaes_schedule_192_smear:
470 _CET_ENDBR
471 pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
472 pxor %xmm0, %xmm6 # -> c+d c 0 0
473 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
474 pxor %xmm0, %xmm6 # -> b+c+d b+c b a
475 movdqa %xmm6, %xmm0 # full smear result returned in %xmm0
476 pxor %xmm1, %xmm1 # zero source for movhlps below
477 movhlps %xmm1, %xmm6 # clobber low side with zeros
478 ret
479.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
480
481##
482## .aes_schedule_round
483##
484## Runs one main round of the key schedule on %xmm0, %xmm7
485##
486## Specifically, runs subbytes on the high dword of %xmm0
487## then rotates it by one byte and xors into the low dword of
488## %xmm7.
489##
490## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
491## next rcon.
492##
493## Smears the dwords of %xmm7 by xoring the low into the
494## second low, result into third, result into highest.
495##
496## Returns results in %xmm7 = %xmm0.
497## Clobbers %xmm1-%xmm4, %r11.
## NOTE(review): %r11 is not touched anywhere in this routine — the
## "%r11" clobber claim above appears stale; confirm before relying on it.
##
## Relies on the vector-permute lookup tables being preloaded into
## %xmm9-%xmm13 (presumably by the schedule-side preheat, which is not
## part of this routine — confirm against _vpaes_schedule_core).
498##
499.type _vpaes_schedule_round,\@abi-omnipotent
500.align 16
501_vpaes_schedule_round:
502 _CET_ENDBR
503 # extract rcon from xmm8
504 pxor %xmm1, %xmm1
505 palignr \$15, %xmm8, %xmm1 # isolate low rcon byte into high lane of xmm1
506 palignr \$15, %xmm8, %xmm8 # rotate xmm8 ready for the next round's rcon
507 pxor %xmm1, %xmm7
508
509 # rotate
510 pshufd \$0xFF, %xmm0, %xmm0 # broadcast high dword
511 palignr \$1, %xmm0, %xmm0 # byte-rotate it (RotWord)
512
513 # fall through...
514
515 # low round: same as high round, but no rotation and no rcon.
# Alternate entry point used by the 192/256-bit schedules; shares the
# smear + subbytes tail below with the full round above.
516_vpaes_schedule_low_round:
517 # smear xmm7
518 movdqa %xmm7, %xmm1
519 pslldq \$4, %xmm7
520 pxor %xmm1, %xmm7
521 movdqa %xmm7, %xmm1
522 pslldq \$8, %xmm7
523 pxor %xmm1, %xmm7
524 pxor .Lk_s63(%rip), %xmm7 # fold in transformed 0x63 constant
525
526 # subbytes (vector-permute S-box: split nibbles, GF(2^4) inversions)
527 movdqa %xmm9, %xmm1
528 pandn %xmm0, %xmm1
529 psrld \$4, %xmm1 # 1 = i
530 pand %xmm9, %xmm0 # 0 = k
531 movdqa %xmm11, %xmm2 # 2 : a/k
532 pshufb %xmm0, %xmm2 # 2 = a/k
533 pxor %xmm1, %xmm0 # 0 = j
534 movdqa %xmm10, %xmm3 # 3 : 1/i
535 pshufb %xmm1, %xmm3 # 3 = 1/i
536 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
537 movdqa %xmm10, %xmm4 # 4 : 1/j
538 pshufb %xmm0, %xmm4 # 4 = 1/j
539 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
540 movdqa %xmm10, %xmm2 # 2 : 1/iak
541 pshufb %xmm3, %xmm2 # 2 = 1/iak
542 pxor %xmm0, %xmm2 # 2 = io
543 movdqa %xmm10, %xmm3 # 3 : 1/jak
544 pshufb %xmm4, %xmm3 # 3 = 1/jak
545 pxor %xmm1, %xmm3 # 3 = jo
546 movdqa %xmm13, %xmm4 # 4 : sbou
547 pshufb %xmm2, %xmm4 # 4 = sbou
548 movdqa %xmm12, %xmm0 # 0 : sbot
549 pshufb %xmm3, %xmm0 # 0 = sb1t
550 pxor %xmm4, %xmm0 # 0 = sbox output
551
552 # add in smeared stuff
553 pxor %xmm7, %xmm0
554 movdqa %xmm0, %xmm7 # return result in both %xmm7 and %xmm0
555 ret
556.size _vpaes_schedule_round,.-_vpaes_schedule_round
557
558##
559## .aes_schedule_transform
560##
561## Linear-transform %xmm0 according to tables at (%r11)
562##
## The tables are a pair of 16-byte pshufb lookups: (%r11) indexed by
## the low nibble of each byte, 16(%r11) by the high nibble; the two
## lookups are xored to give the transformed byte.
##
563## Requires that %xmm9 = 0x0F0F... as in preheat
564## Output in %xmm0
565## Clobbers %xmm1, %xmm2
566##
567.type _vpaes_schedule_transform,\@abi-omnipotent
568.align 16
569_vpaes_schedule_transform:
570 _CET_ENDBR
571 movdqa %xmm9, %xmm1
572 pandn %xmm0, %xmm1 # xmm1 = high nibbles (still shifted)
573 psrld \$4, %xmm1 # xmm1 = high nibbles
574 pand %xmm9, %xmm0 # xmm0 = low nibbles
575 movdqa (%r11), %xmm2 # lo
576 pshufb %xmm0, %xmm2 # table lookup on low nibbles
577 movdqa 16(%r11), %xmm0 # hi
578 pshufb %xmm1, %xmm0 # table lookup on high nibbles
579 pxor %xmm2, %xmm0 # combine: transformed result
580 ret
581.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
582
583##
584## .aes_schedule_mangle
585##
586## Mangle xmm0 from (basis-transformed) standard version
587## to our version.
588##
589## On encrypt,
590## xor with 0x63
591## multiply by circulant 0,1,1,1
592## apply shiftrows transform
593##
594## On decrypt,
595## xor with 0x63
596## multiply by "inverse mixcolumns" circulant E,B,D,9
597## deskew
598## apply shiftrows transform
599##
600##
601## Writes out to (%rdx), and increments or decrements it
602## Keeps track of round number mod 4 in %r8
603## Preserves xmm0
604## Clobbers xmm1-xmm5
605##
## %rcx selects the mode: zero = encrypt, non-zero = decrypt (see the
## test/jnz below). %r10 is expected to hold a base address such that
## (%r8,%r10) indexes the .Lk_sr shiftrows permutations — set by the
## caller; confirm against _vpaes_schedule_core.
606.type _vpaes_schedule_mangle,\@abi-omnipotent
607.align 16
608_vpaes_schedule_mangle:
609 _CET_ENDBR
610 movdqa %xmm0, %xmm4 # save xmm0 for later
611 movdqa .Lk_mc_forward(%rip),%xmm5 # column-rotation permute
612 test %rcx, %rcx
613 jnz .Lschedule_mangle_dec
614
615 # encrypting
616 add \$16, %rdx # keys written forward
617 pxor .Lk_s63(%rip),%xmm4 # xor with transformed 0x63
618 pshufb %xmm5, %xmm4 # rotate columns once
619 movdqa %xmm4, %xmm3
620 pshufb %xmm5, %xmm4 # rotate again
621 pxor %xmm4, %xmm3 # accumulate: circulant 0,1,1,1
622 pshufb %xmm5, %xmm4 # rotate a third time
623 pxor %xmm4, %xmm3
624
625 jmp .Lschedule_mangle_both
626.align 16
627.Lschedule_mangle_dec:
628 # inverse mix columns
629 lea .Lk_dksd(%rip),%r11 # dksd/dksb/dkse/dks9 table block
630 movdqa %xmm9, %xmm1
631 pandn %xmm4, %xmm1
632 psrld \$4, %xmm1 # 1 = hi
633 pand %xmm9, %xmm4 # 4 = lo
634
635 movdqa 0x00(%r11), %xmm2 # .Lk_dksd lo
636 pshufb %xmm4, %xmm2
637 movdqa 0x10(%r11), %xmm3 # .Lk_dksd hi
638 pshufb %xmm1, %xmm3
639 pxor %xmm2, %xmm3
640 pshufb %xmm5, %xmm3
641
642 movdqa 0x20(%r11), %xmm2 # .Lk_dksb lo
643 pshufb %xmm4, %xmm2
644 pxor %xmm3, %xmm2
645 movdqa 0x30(%r11), %xmm3 # .Lk_dksb hi
646 pshufb %xmm1, %xmm3
647 pxor %xmm2, %xmm3
648 pshufb %xmm5, %xmm3
649
650 movdqa 0x40(%r11), %xmm2 # .Lk_dkse lo
651 pshufb %xmm4, %xmm2
652 pxor %xmm3, %xmm2
653 movdqa 0x50(%r11), %xmm3 # .Lk_dkse hi
654 pshufb %xmm1, %xmm3
655 pxor %xmm2, %xmm3
656 pshufb %xmm5, %xmm3
657
658 movdqa 0x60(%r11), %xmm2 # .Lk_dks9 lo
659 pshufb %xmm4, %xmm2
660 pxor %xmm3, %xmm2
661 movdqa 0x70(%r11), %xmm3 # .Lk_dks9 hi
662 pshufb %xmm1, %xmm3
663 pxor %xmm2, %xmm3
664
665 add \$-16, %rdx # keys written backward when decrypting
666
667.Lschedule_mangle_both:
668 movdqa (%r8,%r10),%xmm1 # shiftrows permutation for this round
669 pshufb %xmm1,%xmm3
670 add \$-16, %r8 # advance round counter ...
671 and \$0x30, %r8 # ... cycling 0x30 -> 0x20 -> 0x10 -> 0x00
672 movdqu %xmm3, (%rdx) # store mangled round key
673 ret
674.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
675
676#
677# Interface to OpenSSL
678#
# int ${PREFIX}_set_encrypt_key(const unsigned char *userKey, int bits,
#                               AES_KEY *key);
# Arguments arrive in %rdi/%esi/%rdx per the unix x86_64 ABI; the key
# pointer use is grounded in the 240(%rdx) rounds store below, the key
# material pointer is presumably consumed by _vpaes_schedule_core
# (not visible here — confirm).
679.globl ${PREFIX}_set_encrypt_key
680.type ${PREFIX}_set_encrypt_key,\@function,3
681.align 16
682${PREFIX}_set_encrypt_key:
683 _CET_ENDBR
684___
685$code.=<<___ if ($win64);
# Win64 ABI: xmm6-xmm15 are callee-saved, so spill them around the call.
686 lea -0xb8(%rsp),%rsp
687 movaps %xmm6,0x10(%rsp)
688 movaps %xmm7,0x20(%rsp)
689 movaps %xmm8,0x30(%rsp)
690 movaps %xmm9,0x40(%rsp)
691 movaps %xmm10,0x50(%rsp)
692 movaps %xmm11,0x60(%rsp)
693 movaps %xmm12,0x70(%rsp)
694 movaps %xmm13,0x80(%rsp)
695 movaps %xmm14,0x90(%rsp)
696 movaps %xmm15,0xa0(%rsp)
697.Lenc_key_body:
698___
699$code.=<<___;
700 mov %esi,%eax
701 shr \$5,%eax
702 add \$5,%eax
703 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
704
705 mov \$0,%ecx # rcx=0: encrypt-direction schedule
706 mov \$0x30,%r8d # initial round-counter / .Lk_sr offset
707 call _vpaes_schedule_core
708___
709$code.=<<___ if ($win64);
710 movaps 0x10(%rsp),%xmm6
711 movaps 0x20(%rsp),%xmm7
712 movaps 0x30(%rsp),%xmm8
713 movaps 0x40(%rsp),%xmm9
714 movaps 0x50(%rsp),%xmm10
715 movaps 0x60(%rsp),%xmm11
716 movaps 0x70(%rsp),%xmm12
717 movaps 0x80(%rsp),%xmm13
718 movaps 0x90(%rsp),%xmm14
719 movaps 0xa0(%rsp),%xmm15
720 lea 0xb8(%rsp),%rsp
721.Lenc_key_epilogue:
722___
$code.=<<___;
723 xor %eax,%eax # always returns 0 (success)
724 ret
725.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
727
# int ${PREFIX}_set_decrypt_key(const unsigned char *userKey, int bits,
#                               AES_KEY *key);
# Same shape as set_encrypt_key, but the schedule is generated in
# decrypt order: %rdx is advanced to the END of the key schedule
# (rounds*16 + 16) so round keys are written backward.
728.globl ${PREFIX}_set_decrypt_key
729.type ${PREFIX}_set_decrypt_key,\@function,3
730.align 16
731${PREFIX}_set_decrypt_key:
732 _CET_ENDBR
733___
734$code.=<<___ if ($win64);
# Win64 ABI: xmm6-xmm15 are callee-saved, so spill them around the call.
735 lea -0xb8(%rsp),%rsp
736 movaps %xmm6,0x10(%rsp)
737 movaps %xmm7,0x20(%rsp)
738 movaps %xmm8,0x30(%rsp)
739 movaps %xmm9,0x40(%rsp)
740 movaps %xmm10,0x50(%rsp)
741 movaps %xmm11,0x60(%rsp)
742 movaps %xmm12,0x70(%rsp)
743 movaps %xmm13,0x80(%rsp)
744 movaps %xmm14,0x90(%rsp)
745 movaps %xmm15,0xa0(%rsp)
746.Ldec_key_body:
747___
748$code.=<<___;
749 mov %esi,%eax
750 shr \$5,%eax
751 add \$5,%eax
752 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
753 shl \$4,%eax # rounds * 16 bytes
754 lea 16(%rdx,%rax),%rdx # point at last round key slot
755
756 mov \$1,%ecx # rcx=1: decrypt-direction schedule
757 mov %esi,%r8d
758 shr \$1,%r8d
759 and \$32,%r8d
760 xor \$32,%r8d # nbits==192?0:32
761 call _vpaes_schedule_core
762___
763$code.=<<___ if ($win64);
764 movaps 0x10(%rsp),%xmm6
765 movaps 0x20(%rsp),%xmm7
766 movaps 0x30(%rsp),%xmm8
767 movaps 0x40(%rsp),%xmm9
768 movaps 0x50(%rsp),%xmm10
769 movaps 0x60(%rsp),%xmm11
770 movaps 0x70(%rsp),%xmm12
771 movaps 0x80(%rsp),%xmm13
772 movaps 0x90(%rsp),%xmm14
773 movaps 0xa0(%rsp),%xmm15
774 lea 0xb8(%rsp),%rsp
775.Ldec_key_epilogue:
776___
777$code.=<<___;
778 xor %eax,%eax # always returns 0 (success)
779 ret
780.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
781
# void ${PREFIX}_encrypt(const unsigned char *in /*rdi*/,
#                        unsigned char *out /*rsi*/, const AES_KEY *key);
# Encrypts one 16-byte block. The key argument (%rdx) is presumably
# consumed by _vpaes_encrypt_core — not visible here; confirm.
782.globl ${PREFIX}_encrypt
783.type ${PREFIX}_encrypt,\@function,3
784.align 16
785${PREFIX}_encrypt:
786 _CET_ENDBR
787___
788$code.=<<___ if ($win64);
# Win64 ABI: xmm6-xmm15 are callee-saved, so spill them around the call.
789 lea -0xb8(%rsp),%rsp
790 movaps %xmm6,0x10(%rsp)
791 movaps %xmm7,0x20(%rsp)
792 movaps %xmm8,0x30(%rsp)
793 movaps %xmm9,0x40(%rsp)
794 movaps %xmm10,0x50(%rsp)
795 movaps %xmm11,0x60(%rsp)
796 movaps %xmm12,0x70(%rsp)
797 movaps %xmm13,0x80(%rsp)
798 movaps %xmm14,0x90(%rsp)
799 movaps %xmm15,0xa0(%rsp)
800.Lenc_body:
801___
802$code.=<<___;
803 movdqu (%rdi),%xmm0 # load one plaintext block (unaligned ok)
804 call _vpaes_preheat # load lookup tables into %xmm9-%xmm15
805 call _vpaes_encrypt_core
806 movdqu %xmm0,(%rsi) # store ciphertext block
807___
808$code.=<<___ if ($win64);
809 movaps 0x10(%rsp),%xmm6
810 movaps 0x20(%rsp),%xmm7
811 movaps 0x30(%rsp),%xmm8
812 movaps 0x40(%rsp),%xmm9
813 movaps 0x50(%rsp),%xmm10
814 movaps 0x60(%rsp),%xmm11
815 movaps 0x70(%rsp),%xmm12
816 movaps 0x80(%rsp),%xmm13
817 movaps 0x90(%rsp),%xmm14
818 movaps 0xa0(%rsp),%xmm15
819 lea 0xb8(%rsp),%rsp
820.Lenc_epilogue:
821___
822$code.=<<___;
823 ret
824.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
825
# void ${PREFIX}_decrypt(const unsigned char *in /*rdi*/,
#                        unsigned char *out /*rsi*/, const AES_KEY *key);
# Decrypts one 16-byte block. The key argument (%rdx) is presumably
# consumed by _vpaes_decrypt_core — not visible here; confirm.
826.globl ${PREFIX}_decrypt
827.type ${PREFIX}_decrypt,\@function,3
828.align 16
829${PREFIX}_decrypt:
830 _CET_ENDBR
831___
832$code.=<<___ if ($win64);
# Win64 ABI: xmm6-xmm15 are callee-saved, so spill them around the call.
833 lea -0xb8(%rsp),%rsp
834 movaps %xmm6,0x10(%rsp)
835 movaps %xmm7,0x20(%rsp)
836 movaps %xmm8,0x30(%rsp)
837 movaps %xmm9,0x40(%rsp)
838 movaps %xmm10,0x50(%rsp)
839 movaps %xmm11,0x60(%rsp)
840 movaps %xmm12,0x70(%rsp)
841 movaps %xmm13,0x80(%rsp)
842 movaps %xmm14,0x90(%rsp)
843 movaps %xmm15,0xa0(%rsp)
844.Ldec_body:
845___
846$code.=<<___;
847 movdqu (%rdi),%xmm0 # load one ciphertext block (unaligned ok)
848 call _vpaes_preheat # load lookup tables into %xmm9-%xmm15
849 call _vpaes_decrypt_core
850 movdqu %xmm0,(%rsi) # store plaintext block
851___
852$code.=<<___ if ($win64);
853 movaps 0x10(%rsp),%xmm6
854 movaps 0x20(%rsp),%xmm7
855 movaps 0x30(%rsp),%xmm8
856 movaps 0x40(%rsp),%xmm9
857 movaps 0x50(%rsp),%xmm10
858 movaps 0x60(%rsp),%xmm11
859 movaps 0x70(%rsp),%xmm12
860 movaps 0x80(%rsp),%xmm13
861 movaps 0x90(%rsp),%xmm14
862 movaps 0xa0(%rsp),%xmm15
863 lea 0xb8(%rsp),%rsp
864.Ldec_epilogue:
865___
866$code.=<<___;
867 ret
868.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
869___
870{
871my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
872# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
873# size_t length, const AES_KEY *key,
874# unsigned char *ivp,const int enc);
875$code.=<<___;
876.globl ${PREFIX}_cbc_encrypt
877.type ${PREFIX}_cbc_encrypt,\@function,6
878.align 16
879${PREFIX}_cbc_encrypt:
880 _CET_ENDBR
881 xchg $key,$len
___
# The runtime xchg above swaps the registers holding key and length;
# swapping the Perl variables here keeps $len/$key naming the correct
# registers for the rest of the function.
883($len,$key)=($key,$len);
$code.=<<___;
885 sub \$16,$len
886 jc .Lcbc_abort # fewer than 16 bytes: nothing to do
887___
888$code.=<<___ if ($win64);
# Win64 ABI: xmm6-xmm15 are callee-saved, so spill them around the loops.
889 lea -0xb8(%rsp),%rsp
890 movaps %xmm6,0x10(%rsp)
891 movaps %xmm7,0x20(%rsp)
892 movaps %xmm8,0x30(%rsp)
893 movaps %xmm9,0x40(%rsp)
894 movaps %xmm10,0x50(%rsp)
895 movaps %xmm11,0x60(%rsp)
896 movaps %xmm12,0x70(%rsp)
897 movaps %xmm13,0x80(%rsp)
898 movaps %xmm14,0x90(%rsp)
899 movaps %xmm15,0xa0(%rsp)
900.Lcbc_body:
901___
902$code.=<<___;
903 movdqu ($ivp),%xmm6 # load IV
904 sub $inp,$out # out addressed as base+inp below
905 call _vpaes_preheat
906 cmp \$0,${enc}d
907 je .Lcbc_dec_loop
908 jmp .Lcbc_enc_loop
909.align 16
910.Lcbc_enc_loop:
911 movdqu ($inp),%xmm0
912 pxor %xmm6,%xmm0 # xor plaintext with IV/previous ciphertext
913 call _vpaes_encrypt_core
914 movdqa %xmm0,%xmm6 # ciphertext becomes next chaining value
915 movdqu %xmm0,($out,$inp)
916 lea 16($inp),$inp
917 sub \$16,$len
918 jnc .Lcbc_enc_loop # processes floor(length/16) blocks
919 jmp .Lcbc_done
920.align 16
921.Lcbc_dec_loop:
922 movdqu ($inp),%xmm0
923 movdqa %xmm0,%xmm7 # keep ciphertext for next chaining value
924 call _vpaes_decrypt_core
925 pxor %xmm6,%xmm0 # xor with IV/previous ciphertext
926 movdqa %xmm7,%xmm6
927 movdqu %xmm0,($out,$inp)
928 lea 16($inp),$inp
929 sub \$16,$len
930 jnc .Lcbc_dec_loop
931.Lcbc_done:
932 movdqu %xmm6,($ivp) # save IV
933___
934$code.=<<___ if ($win64);
935 movaps 0x10(%rsp),%xmm6
936 movaps 0x20(%rsp),%xmm7
937 movaps 0x30(%rsp),%xmm8
938 movaps 0x40(%rsp),%xmm9
939 movaps 0x50(%rsp),%xmm10
940 movaps 0x60(%rsp),%xmm11
941 movaps 0x70(%rsp),%xmm12
942 movaps 0x80(%rsp),%xmm13
943 movaps 0x90(%rsp),%xmm14
944 movaps 0xa0(%rsp),%xmm15
945 lea 0xb8(%rsp),%rsp
946.Lcbc_epilogue:
947___
948$code.=<<___;
949.Lcbc_abort:
950 ret
951.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
952___
953}
954$code.=<<___;
955##
956## _vpaes_preheat
957##
958## Fills register %r10 -> .aes_consts (so you can -fPIC)
959## and %xmm9-%xmm15 as specified below.
960##
## Loads the constant tables used by the encrypt/decrypt cores:
## inv/inva (GF(2^4) inversion), the 0x0F nibble mask, and the sb1/sb2
## S-box output tables.
961.type _vpaes_preheat,\@abi-omnipotent
962.align 16
963_vpaes_preheat:
964 _CET_ENDBR
965 lea .Lk_s0F(%rip), %r10 # PIC base into the constant block
966 movdqa -0x20(%r10), %xmm10 # .Lk_inv
967 movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
968 movdqa 0x00(%r10), %xmm9 # .Lk_s0F
969 movdqa 0x30(%r10), %xmm13 # .Lk_sb1
970 movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
971 movdqa 0x50(%r10), %xmm15 # .Lk_sb2
972 movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
973 ret
974.size _vpaes_preheat,.-_vpaes_preheat
975########################################################
976## ##
977## Constants ##
978## ##
979########################################################
## All tables are 16-byte pshufb lookups (lo-nibble table followed by
## hi-nibble table) unless noted otherwise.
980.section .rodata
981.type _vpaes_consts,\@object
982.align 64
983_vpaes_consts:
984.Lk_inv: # inv, inva
985 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
986 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
987
988.Lk_s0F: # s0F
989 .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
990
991.Lk_ipt: # input transform (lo, hi)
992 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
993 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
994
995.Lk_sb1: # sb1u, sb1t
996 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
997 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
998.Lk_sb2: # sb2u, sb2t
999 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
1000 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
1001.Lk_sbo: # sbou, sbot
1002 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
1003 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
1004
1005.Lk_mc_forward: # mc_forward
1006 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
1007 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
1008 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
1009 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
1010
1011.Lk_mc_backward:# mc_backward
1012 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
1013 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
1014 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
1015 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
1016
1017.Lk_sr: # sr (shiftrows permutations, indexed by round mod 4 * 16)
1018 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
1019 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
1020 .quad 0x0F060D040B020900, 0x070E050C030A0108
1021 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
1022
1023.Lk_rcon: # rcon
1024 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
1025
1026.Lk_s63: # s63: all equal to 0x63 transformed
1027 .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
1028
1029.Lk_opt: # output transform
1030 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
1031 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
1032
1033.Lk_deskew: # deskew tables: inverts the sbox's "skew"
1034 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
1035 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
1036
1037##
1038## Decryption stuff
1039## Key schedule constants
1040##
1041.Lk_dksd: # decryption key schedule: invskew x*D
1042 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
1043 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
1044.Lk_dksb: # decryption key schedule: invskew x*B
1045 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
1046 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
1047.Lk_dkse: # decryption key schedule: invskew x*E + 0x63
1048 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
1049 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
1050.Lk_dks9: # decryption key schedule: invskew x*9
1051 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
1052 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
1053
1054##
1055## Decryption stuff
1056## Round function constants
1057##
1058.Lk_dipt: # decryption input transform
1059 .quad 0x0F505B040B545F00, 0x154A411E114E451A
1060 .quad 0x86E383E660056500, 0x12771772F491F194
1061
1062.Lk_dsb9: # decryption sbox output *9*u, *9*t
1063 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
1064 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
1065.Lk_dsbd: # decryption sbox output *D*u, *D*t
1066 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
1067 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
1068.Lk_dsbb: # decryption sbox output *B*u, *B*t
1069 .quad 0xD022649296B44200, 0x602646F6B0F2D404
1070 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
1071.Lk_dsbe: # decryption sbox output *E*u, *E*t
1072 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
1073 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
1074.Lk_dsbo: # decryption sbox final output
1075 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
1076 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
1077.align 64
1078.size _vpaes_consts,.-_vpaes_consts
1079.text
1080___
1081
1082if ($win64) {
1083# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1084# CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception handler: if an exception hits between a
# function's body and epilogue labels (HandlerData[0]/[1]), restore the
# xmm6-xmm15 spill area and the stack pointer so unwinding can proceed.
1085$rec="%rcx";
1086$frame="%rdx";
1087$context="%r8";
1088$disp="%r9";
1089
1090$code.=<<___;
1091.extern __imp_RtlVirtualUnwind
1092.type se_handler,\@abi-omnipotent
1093.align 16
1094se_handler:
1095 _CET_ENDBR
1096 push %rsi
1097 push %rdi
1098 push %rbx
1099 push %rbp
1100 push %r12
1101 push %r13
1102 push %r14
1103 push %r15
1104 pushfq
1105 sub \$64,%rsp
1106
1107 mov 120($context),%rax # pull context->Rax
1108 mov 248($context),%rbx # pull context->Rip
1109
1110 mov 8($disp),%rsi # disp->ImageBase
1111 mov 56($disp),%r11 # disp->HandlerData
1112
1113 mov 0(%r11),%r10d # HandlerData[0]
1114 lea (%rsi,%r10),%r10 # prologue label
1115 cmp %r10,%rbx # context->Rip<prologue label
1116 jb .Lin_prologue
1117
1118 mov 152($context),%rax # pull context->Rsp
1119
1120 mov 4(%r11),%r10d # HandlerData[1]
1121 lea (%rsi,%r10),%r10 # epilogue label
1122 cmp %r10,%rbx # context->Rip>=epilogue label
1123 jae .Lin_prologue
1124
# Inside the body: copy the saved xmm6-xmm15 back into the CONTEXT
# record and pop the 0xb8-byte spill frame.
1125 lea 16(%rax),%rsi # %xmm save area
1126 lea 512($context),%rdi # &context.Xmm6
1127 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1128 .long 0xa548f3fc # cld; rep movsq
1129 lea 0xb8(%rax),%rax # adjust stack pointer
1130
1131.Lin_prologue:
1132 mov 8(%rax),%rdi
1133 mov 16(%rax),%rsi
1134 mov %rax,152($context) # restore context->Rsp
1135 mov %rsi,168($context) # restore context->Rsi
1136 mov %rdi,176($context) # restore context->Rdi
1137
1138 mov 40($disp),%rdi # disp->ContextRecord
1139 mov $context,%rsi # context
1140 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1141 .long 0xa548f3fc # cld; rep movsq
1142
1143 mov $disp,%rsi
1144 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1145 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1146 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1147 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1148 mov 40(%rsi),%r10 # disp->ContextRecord
1149 lea 56(%rsi),%r11 # &disp->HandlerData
1150 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1151 mov %r10,32(%rsp) # arg5
1152 mov %r11,40(%rsp) # arg6
1153 mov %r12,48(%rsp) # arg7
1154 mov %rcx,56(%rsp) # arg8, (NULL)
1155 call *__imp_RtlVirtualUnwind(%rip)
1156
1157 mov \$1,%eax # ExceptionContinueSearch
1158 add \$64,%rsp
1159 popfq
1160 pop %r15
1161 pop %r14
1162 pop %r13
1163 pop %r12
1164 pop %rbp
1165 pop %rbx
1166 pop %rdi
1167 pop %rsi
1168 ret
1169.size se_handler,.-se_handler
1170
# .pdata: one begin/end/unwind-info triple per exported function.
1171.section .pdata
1172.align 4
1173 .rva .LSEH_begin_${PREFIX}_set_encrypt_key
1174 .rva .LSEH_end_${PREFIX}_set_encrypt_key
1175 .rva .LSEH_info_${PREFIX}_set_encrypt_key
1176
1177 .rva .LSEH_begin_${PREFIX}_set_decrypt_key
1178 .rva .LSEH_end_${PREFIX}_set_decrypt_key
1179 .rva .LSEH_info_${PREFIX}_set_decrypt_key
1180
1181 .rva .LSEH_begin_${PREFIX}_encrypt
1182 .rva .LSEH_end_${PREFIX}_encrypt
1183 .rva .LSEH_info_${PREFIX}_encrypt
1184
1185 .rva .LSEH_begin_${PREFIX}_decrypt
1186 .rva .LSEH_end_${PREFIX}_decrypt
1187 .rva .LSEH_info_${PREFIX}_decrypt
1188
1189 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
1190 .rva .LSEH_end_${PREFIX}_cbc_encrypt
1191 .rva .LSEH_info_${PREFIX}_cbc_encrypt
1192
# .xdata: unwind info; type 9 = language-specific handler (se_handler),
# HandlerData = the body/epilogue label pair it checks against.
1193.section .xdata
1194.align 8
1195.LSEH_info_${PREFIX}_set_encrypt_key:
1196 .byte 9,0,0,0
1197 .rva se_handler
1198 .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
1199.LSEH_info_${PREFIX}_set_decrypt_key:
1200 .byte 9,0,0,0
1201 .rva se_handler
1202 .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
1203.LSEH_info_${PREFIX}_encrypt:
1204 .byte 9,0,0,0
1205 .rva se_handler
1206 .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
1207.LSEH_info_${PREFIX}_decrypt:
1208 .byte 9,0,0,0
1209 .rva se_handler
1210 .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
1211.LSEH_info_${PREFIX}_cbc_encrypt:
1212 .byte 9,0,0,0
1213 .rva se_handler
1214 .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
1215___
1216}
1217
# Evaluate backtick-quoted arithmetic (e.g. `1232/8`) embedded in the
# assembly template before emitting it.
1218$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1219
1220print $code;
1221
# Check the final flush: a silently failed close (full disk, broken
# pipe) would otherwise produce a truncated assembly file that might
# still assemble.
1222close STDOUT or die "error closing STDOUT: $!";