Diffstat (limited to 'src/lib/libcrypto/aes')
-rw-r--r--  src/lib/libcrypto/aes/README                        3
-rw-r--r--  src/lib/libcrypto/aes/aes.h                       126
-rw-r--r--  src/lib/libcrypto/aes/aes_cbc.c                    65
-rw-r--r--  src/lib/libcrypto/aes/aes_cfb.c                    84
-rw-r--r--  src/lib/libcrypto/aes/aes_core.c                 1374
-rw-r--r--  src/lib/libcrypto/aes/aes_ctr.c                    62
-rw-r--r--  src/lib/libcrypto/aes/aes_ecb.c                    69
-rw-r--r--  src/lib/libcrypto/aes/aes_ige.c                   194
-rw-r--r--  src/lib/libcrypto/aes/aes_locl.h                   79
-rw-r--r--  src/lib/libcrypto/aes/aes_misc.c                   65
-rw-r--r--  src/lib/libcrypto/aes/aes_ofb.c                    61
-rw-r--r--  src/lib/libcrypto/aes/aes_wrap.c                  133
-rw-r--r--  src/lib/libcrypto/aes/aes_x86core.c              1080
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-586.pl             2980
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-armv4.pl           1134
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-ia64.S             1123
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-mips.pl            1613
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-parisc.pl          1028
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-ppc.pl             1365
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-s390x.pl           2237
-rwxr-xr-x  src/lib/libcrypto/aes/asm/aes-sparcv9.pl         1182
-rwxr-xr-x  src/lib/libcrypto/aes/asm/aes-x86_64.pl          2819
-rw-r--r--  src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl   1232
-rw-r--r--  src/lib/libcrypto/aes/asm/aesni-x86.pl           2189
-rw-r--r--  src/lib/libcrypto/aes/asm/aesni-x86_64.pl        3041
-rw-r--r--  src/lib/libcrypto/aes/asm/bsaes-x86_64.pl        3108
-rw-r--r--  src/lib/libcrypto/aes/asm/vpaes-x86.pl            903
-rw-r--r--  src/lib/libcrypto/aes/asm/vpaes-x86_64.pl        1207
28 files changed, 0 insertions(+), 30556 deletions(-)
diff --git a/src/lib/libcrypto/aes/README b/src/lib/libcrypto/aes/README
deleted file mode 100644
index 0f9620a80e..0000000000
--- a/src/lib/libcrypto/aes/README
+++ /dev/null
@@ -1,3 +0,0 @@
This is an OpenSSL-compatible version of AES (also called Rijndael).
aes_core.c is basically the same as rijndael-alg-fst.c but with an
API that looks like the rest of the OpenSSL symmetric cipher suite.
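
For reference, that OpenSSL-compatible API (declared in aes.h below) is used
along these lines. A minimal sketch, not part of the deleted tree; the key and
message values are illustrative only:

    #include <string.h>
    #include <openssl/aes.h>

    static int
    aes_roundtrip(void)
    {
        static const unsigned char key[16] = "0123456789abcdef"; /* illustrative */
        unsigned char in[AES_BLOCK_SIZE] = "one 16-byte blk";
        unsigned char ct[AES_BLOCK_SIZE], pt[AES_BLOCK_SIZE];
        AES_KEY enc, dec;

        if (AES_set_encrypt_key(key, 128, &enc) != 0)
            return -1;
        if (AES_set_decrypt_key(key, 128, &dec) != 0)
            return -1;
        AES_encrypt(in, ct, &enc);      /* one block in, one block out */
        AES_decrypt(ct, pt, &dec);
        return memcmp(in, pt, AES_BLOCK_SIZE) == 0 ? 0 : -1;
    }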
diff --git a/src/lib/libcrypto/aes/aes.h b/src/lib/libcrypto/aes/aes.h
deleted file mode 100644
index c904485d8f..0000000000
--- a/src/lib/libcrypto/aes/aes.h
+++ /dev/null
@@ -1,126 +0,0 @@
/* $OpenBSD: aes.h,v 1.14 2014/07/09 09:10:07 miod Exp $ */
/* ====================================================================
 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 *
 */

#ifndef HEADER_AES_H
#define HEADER_AES_H

#include <openssl/opensslconf.h>

#ifdef OPENSSL_NO_AES
#error AES is disabled.
#endif

#include <stddef.h>

#define AES_ENCRYPT 1
#define AES_DECRYPT 0

/* Because array size can't be a const in C, the following two are macros.
   AES_MAXNR is the maximum number of rounds; AES_BLOCK_SIZE is in bytes. */
#define AES_MAXNR 14
#define AES_BLOCK_SIZE 16

#ifdef __cplusplus
extern "C" {
#endif

/* This should be a hidden type, but EVP requires that the size be known */
struct aes_key_st {
    unsigned int rd_key[4 *(AES_MAXNR + 1)];
    int rounds;
};
typedef struct aes_key_st AES_KEY;

const char *AES_options(void);

int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
    AES_KEY *key);
int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
    AES_KEY *key);

void AES_encrypt(const unsigned char *in, unsigned char *out,
    const AES_KEY *key);
void AES_decrypt(const unsigned char *in, unsigned char *out,
    const AES_KEY *key);

void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
    const AES_KEY *key, const int enc);
void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char *ivec, const int enc);
void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char *ivec, int *num,
    const int enc);
void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char *ivec, int *num,
    const int enc);
void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char *ivec, int *num,
    const int enc);
void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char *ivec, int *num);
void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char ivec[AES_BLOCK_SIZE],
    unsigned char ecount_buf[AES_BLOCK_SIZE], unsigned int *num);
/* NB: the IV is _two_ blocks long */
void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
    size_t length, const AES_KEY *key, unsigned char *ivec, const int enc);

int AES_wrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out,
    const unsigned char *in, unsigned int inlen);
int AES_unwrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out,
    const unsigned char *in, unsigned int inlen);


#ifdef __cplusplus
}
#endif

#endif /* !HEADER_AES_H */
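
AES_ctr128_encrypt() above keeps the keystream position in the caller-supplied
ecount_buf/num pair rather than in the key. A usage sketch (key setup and the
in/out buffers are assumed; ecount is conventionally zeroed and num starts at
0 before the first call):

    unsigned char ivec[AES_BLOCK_SIZE] = {0};   /* initial counter block */
    unsigned char ecount[AES_BLOCK_SIZE] = {0}; /* keystream scratch */
    unsigned int num = 0;

    AES_ctr128_encrypt(in, out, in_len, &enc_key, ivec, ecount, &num);
    /* further calls continue the same keystream; ivec and num carry state */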
diff --git a/src/lib/libcrypto/aes/aes_cbc.c b/src/lib/libcrypto/aes/aes_cbc.c
deleted file mode 100644
index 5e76f6ea01..0000000000
--- a/src/lib/libcrypto/aes/aes_cbc.c
+++ /dev/null
@@ -1,65 +0,0 @@
/* $OpenBSD: aes_cbc.c,v 1.12 2014/06/12 15:49:27 deraadt Exp $ */
/* ====================================================================
 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 *
 */

#include <openssl/aes.h>
#include <openssl/modes.h>

void
AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
    size_t len, const AES_KEY *key, unsigned char *ivec, const int enc)
{
    if (enc)
        CRYPTO_cbc128_encrypt(in, out, len, key, ivec,
            (block128_f)AES_encrypt);
    else
        CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
            (block128_f)AES_decrypt);
}
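
Since CRYPTO_cbc128_encrypt() writes each final ciphertext block back into
ivec, a long message can be fed to AES_cbc_encrypt() in chained calls. A
sketch, with orig_iv, the chunk buffers and enc_key assumed, and chunk
lengths kept multiples of AES_BLOCK_SIZE:

    unsigned char iv[AES_BLOCK_SIZE];

    memcpy(iv, orig_iv, sizeof(iv));    /* keep orig_iv intact for the decryptor */
    AES_cbc_encrypt(chunk1, out1, chunk1_len, &enc_key, iv, AES_ENCRYPT);
    AES_cbc_encrypt(chunk2, out2, chunk2_len, &enc_key, iv, AES_ENCRYPT);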
diff --git a/src/lib/libcrypto/aes/aes_cfb.c b/src/lib/libcrypto/aes/aes_cfb.c
deleted file mode 100644
index a6384f944d..0000000000
--- a/src/lib/libcrypto/aes/aes_cfb.c
+++ /dev/null
@@ -1,84 +0,0 @@
/* $OpenBSD: aes_cfb.c,v 1.8 2014/06/12 15:49:27 deraadt Exp $ */
/* ====================================================================
 * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 *
 */

#include <openssl/aes.h>
#include <openssl/modes.h>

/* The input and output are encrypted as though 128-bit CFB mode is
 * being used. The extra state information recording how much of the
 * 128-bit block we have used is contained in *num.
 */

void
AES_cfb128_encrypt(const unsigned char *in, unsigned char *out, size_t length,
    const AES_KEY *key, unsigned char *ivec, int *num, const int enc)
{
    CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc,
        (block128_f)AES_encrypt);
}

/* N.B. This expects the input to be packed, MS bit first */
void
AES_cfb1_encrypt(const unsigned char *in, unsigned char *out, size_t length,
    const AES_KEY *key, unsigned char *ivec, int *num, const int enc)
{
    CRYPTO_cfb128_1_encrypt(in, out, length, key, ivec, num, enc,
        (block128_f)AES_encrypt);
}

void
AES_cfb8_encrypt(const unsigned char *in, unsigned char *out, size_t length,
    const AES_KEY *key, unsigned char *ivec, int *num, const int enc)
{
    CRYPTO_cfb128_8_encrypt(in, out, length, key, ivec, num, enc,
        (block128_f)AES_encrypt);
}

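Because *num records how far into the current keystream block the stream has
advanced, CFB input may be split at arbitrary byte boundaries across calls.
Note also that all three wrappers pass AES_encrypt as the block function, so
decryption uses the encryption key schedule too. A sketch (key, orig_iv and
buffers assumed):

    int num = 0;
    unsigned char iv[AES_BLOCK_SIZE];

    memcpy(iv, orig_iv, sizeof(iv));
    AES_cfb128_encrypt(msg, out, 5, &enc_key, iv, &num, AES_ENCRYPT);
    AES_cfb128_encrypt(msg + 5, out + 5, msg_len - 5, &enc_key, iv, &num,
        AES_ENCRYPT);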
diff --git a/src/lib/libcrypto/aes/aes_core.c b/src/lib/libcrypto/aes/aes_core.c
deleted file mode 100644
index 1b8a24c714..0000000000
--- a/src/lib/libcrypto/aes/aes_core.c
+++ /dev/null
@@ -1,1374 +0,0 @@
/* $OpenBSD: aes_core.c,v 1.13 2015/11/05 21:59:13 miod Exp $ */
/**
 * rijndael-alg-fst.c
 *
 * @version 3.0 (December 2000)
 *
 * Optimised ANSI C code for the Rijndael cipher (now AES)
 *
 * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
 * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
 * @author Paulo Barreto <paulo.barreto@terra.com.br>
 *
 * This code is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Note: rewritten a little bit to provide error control and an OpenSSL-
   compatible API */

#ifndef AES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif

#include <stdlib.h>
#include <openssl/aes.h>
#include "aes_locl.h"

#ifndef AES_ASM
/*
Te0[x] = S [x].[02, 01, 01, 03];
Te1[x] = S [x].[03, 02, 01, 01];
Te2[x] = S [x].[01, 03, 02, 01];
Te3[x] = S [x].[01, 01, 03, 02];

Td0[x] = Si[x].[0e, 09, 0d, 0b];
Td1[x] = Si[x].[0b, 0e, 09, 0d];
Td2[x] = Si[x].[0d, 0b, 0e, 09];
Td3[x] = Si[x].[09, 0d, 0b, 0e];
Td4[x] = Si[x].[01];
*/

static const u32 Te0[256] = {
    0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
    0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
    0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
    0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
    0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
    0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
    0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
    0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
    0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
    0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
    0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
    0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
    0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
    0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
    0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
    0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
    0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
    0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
    0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
    0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
    0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
    0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
    0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
    0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
    0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
    0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
    0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
    0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
    0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
    0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
    0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
    0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
    0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
    0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
    0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
    0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
    0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
    0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
    0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
    0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
    0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
    0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
    0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
    0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
    0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
    0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
    0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
    0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
    0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
    0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
    0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
    0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
    0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
    0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
    0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
    0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
    0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
    0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
    0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
    0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
    0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
    0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
    0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
    0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
};
static const u32 Te1[256] = {
    0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
    0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
    0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
    0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
    0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
    0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
    0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
    0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
    0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
    0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
    0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
    0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
    0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
    0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
    0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
    0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
    0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
    0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
    0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
    0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
    0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
    0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
    0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
    0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
    0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
    0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
    0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
    0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
    0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
    0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
    0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
    0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
    0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
    0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
    0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
    0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
    0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
    0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
    0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
    0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
    0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
    0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
    0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
    0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
    0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
    0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
    0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
    0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
    0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
    0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
    0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
    0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
    0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
    0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
    0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
    0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
    0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
    0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
    0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
    0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
    0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
    0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
    0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
    0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
};
static const u32 Te2[256] = {
    0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
    0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
    0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
    0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
    0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
    0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
    0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
    0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
    0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
    0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
    0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
    0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
    0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
    0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
    0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
    0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
    0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
    0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
    0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
    0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
    0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
    0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
    0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
    0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
    0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
    0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
    0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
    0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
    0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
    0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
    0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
    0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
    0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
    0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
    0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
    0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
    0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
    0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
    0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
    0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
    0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
    0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
    0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
    0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
    0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
    0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
    0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
    0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
    0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
    0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
    0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
    0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
    0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
    0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
    0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
    0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
    0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
    0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
    0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
    0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
    0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
    0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
    0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
    0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
};
static const u32 Te3[256] = {
    0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
    0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
    0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
    0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
    0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
    0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
    0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
    0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
    0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
    0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
    0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
    0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
    0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
    0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
    0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
    0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
    0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
    0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
    0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
    0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
    0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
    0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
    0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
    0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
    0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
    0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
    0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
    0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
    0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
    0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
    0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
    0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
    0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
    0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
    0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
    0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
    0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
    0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
    0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
    0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
    0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
    0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
    0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
    0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
    0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
    0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
    0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
    0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
    0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
    0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
    0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
    0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
    0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
    0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
    0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
    0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
    0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
    0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
    0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
    0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
    0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
    0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
    0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
    0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
};

static const u32 Td0[256] = {
    0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
    0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
    0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
    0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
    0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
    0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
    0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
    0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
    0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
    0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
    0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
    0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
    0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
    0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
    0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
    0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
    0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
    0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
    0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
    0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
    0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
    0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
    0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
    0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
    0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
    0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
    0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
    0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
    0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
    0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
    0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
    0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
    0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
    0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
    0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
    0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
    0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
    0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
    0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
    0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
    0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
    0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
    0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
    0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
    0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
    0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
    0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
    0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
    0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
    0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
    0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
    0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
    0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
    0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
    0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
    0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
    0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
    0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
    0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
    0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
    0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
    0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
    0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
    0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
};
static const u32 Td1[256] = {
    0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
    0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
    0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
    0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
    0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
    0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
    0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
    0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
    0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
    0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
    0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
    0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
    0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
    0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
    0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
    0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
    0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
    0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
    0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
    0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
    0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
    0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
    0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
    0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
    0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
    0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
    0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
    0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
    0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
    0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
    0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
    0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
    0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
    0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
    0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
    0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
    0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
    0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
    0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
    0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
    0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
    0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
    0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
    0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
    0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
    0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
    0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
    0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
    0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
    0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
    0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
    0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
    0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
    0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
    0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
    0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
    0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
    0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
    0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
    0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
    0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
    0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
    0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
    0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
};
static const u32 Td2[256] = {
    0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
    0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
    0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
    0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
    0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
    0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
    0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
    0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
    0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
    0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
    0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
    0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
    0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
    0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
    0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
    0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
    0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
    0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
    0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
    0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
    0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
    0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
    0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
    0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
    0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
    0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
    0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
    0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
    0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
    0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
    0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
    0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
    0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
    0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
    0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
    0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
    0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
    0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
    0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
    0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
    0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
    0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
    0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
    0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
    0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
    0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
    0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
    0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
    0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
    0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
    0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
    0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
    0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
    0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
    0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
    0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
    0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
    0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
    0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
    0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
    0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
    0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
    0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
    0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
};
static const u32 Td3[256] = {
    0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
    0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
    0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
    0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
    0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
    0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
    0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
    0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
    0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
    0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
    0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
    0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
    0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
    0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
    0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
    0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
    0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
    0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
    0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
    0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
    0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
    0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
    0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
    0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
    0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
    0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
    0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
    0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
    0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
    0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
    0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
    0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
    0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
    0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
    0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
    0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
    0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
    0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
    0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
    0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
    0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
    0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
    0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
    0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
    0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
    0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
    0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
    0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
    0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
    0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
    0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
    0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
    0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
    0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
    0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
    0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
    0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
    0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
    0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
    0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
    0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
    0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
    0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
    0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
};
static const u8 Td4[256] = {
    0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
    0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
    0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
    0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
    0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
    0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
    0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
    0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
    0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
    0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
    0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
    0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
    0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
    0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
    0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
    0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
    0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
    0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
    0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
    0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
    0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
    0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
    0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
    0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
    0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
    0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
    0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
    0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
    0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
    0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
    0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
    0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU,
};
static const u32 rcon[] = {
    0x01000000, 0x02000000, 0x04000000, 0x08000000,
    0x10000000, 0x20000000, 0x40000000, 0x80000000,
    0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};

/**
 * Expand the cipher key into the encryption key schedule.
 */
int
AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
{
    u32 *rk;
    int i = 0;
    u32 temp;

    if (!userKey || !key)
        return -1;
    if (bits != 128 && bits != 192 && bits != 256)
        return -2;

    rk = key->rd_key;

    if (bits == 128)
        key->rounds = 10;
    else if (bits == 192)
        key->rounds = 12;
    else
        key->rounds = 14;

    rk[0] = GETU32(userKey);
    rk[1] = GETU32(userKey + 4);
    rk[2] = GETU32(userKey + 8);
    rk[3] = GETU32(userKey + 12);
    if (bits == 128) {
        while (1) {
            temp = rk[3];
            rk[4] = rk[0] ^
                (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
                (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
                (Te0[(temp) & 0xff] & 0x0000ff00) ^
                (Te1[(temp >> 24)] & 0x000000ff) ^
                rcon[i];
            rk[5] = rk[1] ^ rk[4];
            rk[6] = rk[2] ^ rk[5];
            rk[7] = rk[3] ^ rk[6];
            if (++i == 10) {
                return 0;
            }
            rk += 4;
        }
    }
    rk[4] = GETU32(userKey + 16);
    rk[5] = GETU32(userKey + 20);
    if (bits == 192) {
        while (1) {
            temp = rk[5];
            rk[6] = rk[ 0] ^
                (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
                (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
                (Te0[(temp) & 0xff] & 0x0000ff00) ^
                (Te1[(temp >> 24)] & 0x000000ff) ^
                rcon[i];
            rk[7] = rk[1] ^ rk[6];
            rk[8] = rk[2] ^ rk[7];
            rk[9] = rk[3] ^ rk[8];
            if (++i == 8) {
                return 0;
            }
            rk[10] = rk[4] ^ rk[9];
            rk[11] = rk[5] ^ rk[10];
            rk += 6;
        }
    }
    rk[6] = GETU32(userKey + 24);
    rk[7] = GETU32(userKey + 28);
    if (bits == 256) {
        while (1) {
            temp = rk[7];
            rk[8] = rk[0] ^
                (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
                (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
                (Te0[(temp) & 0xff] & 0x0000ff00) ^
                (Te1[(temp >> 24)] & 0x000000ff) ^
                rcon[i];
            rk[9] = rk[1] ^ rk[8];
            rk[10] = rk[2] ^ rk[9];
            rk[11] = rk[3] ^ rk[10];
            if (++i == 7) {
                return 0;
            }
            temp = rk[11];
            rk[12] = rk[4] ^
                (Te2[(temp >> 24)] & 0xff000000) ^
                (Te3[(temp >> 16) & 0xff] & 0x00ff0000) ^
                (Te0[(temp >> 8) & 0xff] & 0x0000ff00) ^
                (Te1[(temp) & 0xff] & 0x000000ff);
            rk[13] = rk[5] ^ rk[12];
            rk[14] = rk[6] ^ rk[13];
            rk[15] = rk[7] ^ rk[14];

            rk += 8;
        }
    }
    return 0;
}
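
The masked Te lookups in the loops above synthesize FIPS-197's
SubWord(RotWord(temp)): each Te table carries a 01*S[x] byte in a known
position, so the four masks assemble the plain S-box bytes of the rotated
word. An equivalent but slower formulation of the rk[4] step, as a sketch
assuming a plain S-box array sbox[256] that this file does not contain:

    u32 rot = (temp << 8) | (temp >> 24);       /* RotWord: rotate left one byte */
    u32 sub = ((u32)sbox[rot >> 24] << 24) |    /* SubWord: S-box each byte */
        ((u32)sbox[(rot >> 16) & 0xff] << 16) |
        ((u32)sbox[(rot >> 8) & 0xff] << 8) |
        (u32)sbox[rot & 0xff];
    rk[4] = rk[0] ^ sub ^ rcon[i];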

/**
 * Expand the cipher key into the decryption key schedule.
 */
int
AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
{
    u32 *rk;
    int i, j, status;
    u32 temp;

    /* first, start with an encryption schedule */
    status = AES_set_encrypt_key(userKey, bits, key);
    if (status < 0)
        return status;

    rk = key->rd_key;

    /* invert the order of the round keys: */
    for (i = 0, j = 4 * (key->rounds); i < j; i += 4, j -= 4) {
        temp = rk[i];
        rk[i] = rk[j];
        rk[j] = temp;
        temp = rk[i + 1];
        rk[i + 1] = rk[j + 1];
        rk[j + 1] = temp;
        temp = rk[i + 2];
        rk[i + 2] = rk[j + 2];
        rk[j + 2] = temp;
        temp = rk[i + 3];
        rk[i + 3] = rk[j + 3];
        rk[j + 3] = temp;
    }
    /* apply the inverse MixColumn transform to all round keys but the first and the last: */
    for (i = 1; i < (key->rounds); i++) {
        rk += 4;
        rk[0] =
            Td0[Te1[(rk[0] >> 24)] & 0xff] ^
            Td1[Te1[(rk[0] >> 16) & 0xff] & 0xff] ^
            Td2[Te1[(rk[0] >> 8) & 0xff] & 0xff] ^
            Td3[Te1[(rk[0]) & 0xff] & 0xff];
        rk[1] =
            Td0[Te1[(rk[1] >> 24)] & 0xff] ^
            Td1[Te1[(rk[1] >> 16) & 0xff] & 0xff] ^
            Td2[Te1[(rk[1] >> 8) & 0xff] & 0xff] ^
            Td3[Te1[(rk[1]) & 0xff] & 0xff];
        rk[2] =
            Td0[Te1[(rk[2] >> 24)] & 0xff] ^
            Td1[Te1[(rk[2] >> 16) & 0xff] & 0xff] ^
            Td2[Te1[(rk[2] >> 8) & 0xff] & 0xff] ^
            Td3[Te1[(rk[2]) & 0xff] & 0xff];
        rk[3] =
            Td0[Te1[(rk[3] >> 24)] & 0xff] ^
            Td1[Te1[(rk[3] >> 16) & 0xff] & 0xff] ^
            Td2[Te1[(rk[3] >> 8) & 0xff] & 0xff] ^
            Td3[Te1[(rk[3]) & 0xff] & 0xff];
    }
    return 0;
}
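
The Td0[Te1[x] & 0xff] composition above is how InvMixColumns is applied
without a dedicated table: Te1[x] & 0xff is the plain S-box value S[x] (its
01-multiple byte), and Td0[S[x]] = Si[S[x]].{0e,09,0d,0b} = x.{0e,09,0d,0b}.
A self-check sketch of that identity; gmul is a helper written here, not part
of this file, and u8/u32 come from aes_locl.h:

    static u8
    gmul(u8 a, u8 b)        /* multiply in GF(2^8) mod x^8+x^4+x^3+x+1 */
    {
        u8 p = 0;

        while (b != 0) {
            if (b & 1)
                p ^= a;
            a = (a << 1) ^ ((a & 0x80) ? 0x1b : 0);
            b >>= 1;
        }
        return p;
    }

    /* expect: Td0[Te1[x] & 0xff] == the {0e,09,0d,0b} column pattern of x */
    static int
    invmix_identity_holds(int x)
    {
        u32 want = ((u32)gmul(x, 0x0e) << 24) | ((u32)gmul(x, 0x09) << 16) |
            ((u32)gmul(x, 0x0d) << 8) | (u32)gmul(x, 0x0b);

        return Td0[Te1[x] & 0xff] == want;
    }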
783
784/*
785 * Encrypt a single block
786 * in and out can overlap
787 */
788void
789AES_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
790{
791 const u32 *rk;
792 u32 s0, s1, s2, s3, t0, t1, t2, t3;
793#ifndef FULL_UNROLL
794 int r;
795#endif /* ?FULL_UNROLL */
796
797 rk = key->rd_key;
798
799 /*
800 * map byte array block to cipher state
801 * and add initial round key:
802 */
803 s0 = GETU32(in ) ^ rk[0];
804 s1 = GETU32(in + 4) ^ rk[1];
805 s2 = GETU32(in + 8) ^ rk[2];
806 s3 = GETU32(in + 12) ^ rk[3];
807#ifdef FULL_UNROLL
808 /* round 1: */
809 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
810 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
811 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
812 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
813 /* round 2: */
814 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
815 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
816 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
817 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
818 /* round 3: */
819 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
820 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
821 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
822 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
823 /* round 4: */
824 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
825 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
826 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
827 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
828 /* round 5: */
829 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
830 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
831 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
832 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
833 /* round 6: */
834 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
835 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
836 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
837 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
838 /* round 7: */
839 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
840 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
841 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
842 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
843 /* round 8: */
844 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
845 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
846 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
847 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
848 /* round 9: */
849 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
850 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
851 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
852 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
853 if (key->rounds > 10) {
854 /* round 10: */
855 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
856 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
857 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
858 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
859 /* round 11: */
860 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
861 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
862 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
863 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
864 if (key->rounds > 12) {
865 /* round 12: */
866 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
867 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
868 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
869 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
870 /* round 13: */
871 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
872 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
873 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
874 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
875 }
876 }
877 rk += key->rounds << 2;
878#else /* !FULL_UNROLL */
879 /*
880 * Nr - 1 full rounds:
881 */
882 r = key->rounds >> 1;
883 for (;;) {
884 t0 =
885 Te0[(s0 >> 24)] ^
886 Te1[(s1 >> 16) & 0xff] ^
887 Te2[(s2 >> 8) & 0xff] ^
888 Te3[(s3) & 0xff] ^
889 rk[4];
890 t1 =
891 Te0[(s1 >> 24)] ^
892 Te1[(s2 >> 16) & 0xff] ^
893 Te2[(s3 >> 8) & 0xff] ^
894 Te3[(s0) & 0xff] ^
895 rk[5];
896 t2 =
897 Te0[(s2 >> 24)] ^
898 Te1[(s3 >> 16) & 0xff] ^
899 Te2[(s0 >> 8) & 0xff] ^
900 Te3[(s1) & 0xff] ^
901 rk[6];
902 t3 =
903 Te0[(s3 >> 24)] ^
904 Te1[(s0 >> 16) & 0xff] ^
905 Te2[(s1 >> 8) & 0xff] ^
906 Te3[(s2) & 0xff] ^
907 rk[7];
908
909 rk += 8;
910 if (--r == 0) {
911 break;
912 }
913
914 s0 =
915 Te0[(t0 >> 24)] ^
916 Te1[(t1 >> 16) & 0xff] ^
917 Te2[(t2 >> 8) & 0xff] ^
918 Te3[(t3) & 0xff] ^
919 rk[0];
920 s1 =
921 Te0[(t1 >> 24)] ^
922 Te1[(t2 >> 16) & 0xff] ^
923 Te2[(t3 >> 8) & 0xff] ^
924 Te3[(t0) & 0xff] ^
925 rk[1];
926 s2 =
927 Te0[(t2 >> 24)] ^
928 Te1[(t3 >> 16) & 0xff] ^
929 Te2[(t0 >> 8) & 0xff] ^
930 Te3[(t1) & 0xff] ^
931 rk[2];
932 s3 =
933 Te0[(t3 >> 24)] ^
934 Te1[(t0 >> 16) & 0xff] ^
935 Te2[(t1 >> 8) & 0xff] ^
936 Te3[(t2) & 0xff] ^
937 rk[3];
938 }
939#endif /* ?FULL_UNROLL */
940 /*
941 * apply last round and
942 * map cipher state to byte array block:
943 */
944 s0 =
945 (Te2[(t0 >> 24)] & 0xff000000) ^
946 (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^
947 (Te0[(t2 >> 8) & 0xff] & 0x0000ff00) ^
948 (Te1[(t3) & 0xff] & 0x000000ff) ^
949 rk[0];
950 PUTU32(out, s0);
951 s1 =
952 (Te2[(t1 >> 24)] & 0xff000000) ^
953 (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^
954 (Te0[(t3 >> 8) & 0xff] & 0x0000ff00) ^
955 (Te1[(t0) & 0xff] & 0x000000ff) ^
956 rk[1];
957 PUTU32(out + 4, s1);
958 s2 =
959 (Te2[(t2 >> 24)] & 0xff000000) ^
960 (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^
961 (Te0[(t0 >> 8) & 0xff] & 0x0000ff00) ^
962 (Te1[(t1) & 0xff] & 0x000000ff) ^
963 rk[2];
964 PUTU32(out + 8, s2);
965 s3 =
966 (Te2[(t3 >> 24)] & 0xff000000) ^
967 (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^
968 (Te0[(t1 >> 8) & 0xff] & 0x0000ff00) ^
969 (Te1[(t2) & 0xff] & 0x000000ff) ^
970 rk[3];
971 PUTU32(out + 12, s3);
972}
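
For reference, a minimal standalone sketch of what each Te0 entry used in the rounds above precomputes: SubBytes followed by one MixColumns column, packed big-endian. This assumes the standard rijndael-alg-fst table layout this file derives from; xtime() and te0_from_sbox() are illustrative names, not part of the deleted source.

#include <stdint.h>
#include <stdio.h>

/* GF(2^8) doubling modulo the AES polynomial x^8+x^4+x^3+x+1 (0x11b) */
static uint8_t
xtime(uint8_t a)
{
	return (uint8_t)((a << 1) ^ ((a >> 7) * 0x1b));
}

/* for s = SubBytes(x), Te0[x] packs the MixColumns column [2s, s, s, 3s];
 * Te1..Te3 hold the same word rotated right by 8, 16 and 24 bits */
static uint32_t
te0_from_sbox(uint8_t s)
{
	uint8_t s2 = xtime(s);

	return ((uint32_t)s2 << 24) | ((uint32_t)s << 16) |
	    ((uint32_t)s << 8) | (uint32_t)(s2 ^ s);
}

int
main(void)
{
	/* the S-box maps 0x00 to 0x63, so this prints 0xc66363a5,
	 * the classic first Te0 entry */
	printf("0x%08x\n", (unsigned)te0_from_sbox(0x63));
	return 0;
}
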
973
974/*
975 * Decrypt a single block
976 * in and out can overlap
977 */
978void
979AES_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
980{
981 const u32 *rk;
982 u32 s0, s1, s2, s3, t0, t1, t2, t3;
983#ifndef FULL_UNROLL
984 int r;
985#endif /* ?FULL_UNROLL */
986
987 rk = key->rd_key;
988
989 /*
990 * map byte array block to cipher state
991 * and add initial round key:
992 */
993 s0 = GETU32(in) ^ rk[0];
994 s1 = GETU32(in + 4) ^ rk[1];
995 s2 = GETU32(in + 8) ^ rk[2];
996 s3 = GETU32(in + 12) ^ rk[3];
997#ifdef FULL_UNROLL
998 /* round 1: */
999 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
1000 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
1001 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
1002 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
1003 /* round 2: */
1004 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
1005 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
1006 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
1007 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
1008 /* round 3: */
1009 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
1010 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
1011 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
1012 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
1013 /* round 4: */
1014 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
1015 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
1016 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
1017 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
1018 /* round 5: */
1019 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
1020 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
1021 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
1022 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
1023 /* round 6: */
1024 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
1025 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
1026 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
1027 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
1028 /* round 7: */
1029 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
1030 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
1031 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
1032 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
1033 /* round 8: */
1034 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
1035 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
1036 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
1037 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
1038 /* round 9: */
1039 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
1040 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
1041 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
1042 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
1043 if (key->rounds > 10) {
1044 /* round 10: */
1045 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
1046 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
1047 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
1048 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
1049 /* round 11: */
1050 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
1051 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
1052 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
1053 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
1054 if (key->rounds > 12) {
1055 /* round 12: */
1056 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
1057 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
1058 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
1059 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
1060 /* round 13: */
1061 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
1062 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
1063 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
1064 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
1065 }
1066 }
1067 rk += key->rounds << 2;
1068#else /* !FULL_UNROLL */
1069 /*
1070 * Nr - 1 full rounds:
1071 */
1072 r = key->rounds >> 1;
1073 for (;;) {
1074 t0 =
1075 Td0[(s0 >> 24)] ^
1076 Td1[(s3 >> 16) & 0xff] ^
1077 Td2[(s2 >> 8) & 0xff] ^
1078 Td3[(s1) & 0xff] ^
1079 rk[4];
1080 t1 =
1081 Td0[(s1 >> 24)] ^
1082 Td1[(s0 >> 16) & 0xff] ^
1083 Td2[(s3 >> 8) & 0xff] ^
1084 Td3[(s2) & 0xff] ^
1085 rk[5];
1086 t2 =
1087 Td0[(s2 >> 24)] ^
1088 Td1[(s1 >> 16) & 0xff] ^
1089 Td2[(s0 >> 8) & 0xff] ^
1090 Td3[(s3) & 0xff] ^
1091 rk[6];
1092 t3 =
1093 Td0[(s3 >> 24)] ^
1094 Td1[(s2 >> 16) & 0xff] ^
1095 Td2[(s1 >> 8) & 0xff] ^
1096 Td3[(s0) & 0xff] ^
1097 rk[7];
1098
1099 rk += 8;
1100 if (--r == 0) {
1101 break;
1102 }
1103
1104 s0 =
1105 Td0[(t0 >> 24)] ^
1106 Td1[(t3 >> 16) & 0xff] ^
1107 Td2[(t2 >> 8) & 0xff] ^
1108 Td3[(t1) & 0xff] ^
1109 rk[0];
1110 s1 =
1111 Td0[(t1 >> 24)] ^
1112 Td1[(t0 >> 16) & 0xff] ^
1113 Td2[(t3 >> 8) & 0xff] ^
1114 Td3[(t2) & 0xff] ^
1115 rk[1];
1116 s2 =
1117 Td0[(t2 >> 24)] ^
1118 Td1[(t1 >> 16) & 0xff] ^
1119 Td2[(t0 >> 8) & 0xff] ^
1120 Td3[(t3) & 0xff] ^
1121 rk[2];
1122 s3 =
1123 Td0[(t3 >> 24)] ^
1124 Td1[(t2 >> 16) & 0xff] ^
1125 Td2[(t1 >> 8) & 0xff] ^
1126 Td3[(t0) & 0xff] ^
1127 rk[3];
1128 }
1129#endif /* ?FULL_UNROLL */
1130 /*
1131 * apply last round and
1132 * map cipher state to byte array block:
1133 */
1134 s0 =
1135 (((uint32_t)Td4[(t0 >> 24)]) << 24) ^
1136 (Td4[(t3 >> 16) & 0xff] << 16) ^
1137 (Td4[(t2 >> 8) & 0xff] << 8) ^
1138 (Td4[(t1) & 0xff]) ^
1139 rk[0];
1140 PUTU32(out, s0);
1141 s1 =
1142 (((uint32_t)Td4[(t1 >> 24)]) << 24) ^
1143 (Td4[(t0 >> 16) & 0xff] << 16) ^
1144 (Td4[(t3 >> 8) & 0xff] << 8) ^
1145 (Td4[(t2) & 0xff]) ^
1146 rk[1];
1147 PUTU32(out + 4, s1);
1148 s2 =
1149 (((uint32_t)Td4[(t2 >> 24)]) << 24) ^
1150 (Td4[(t1 >> 16) & 0xff] << 16) ^
1151 (Td4[(t0 >> 8) & 0xff] << 8) ^
1152 (Td4[(t3) & 0xff]) ^
1153 rk[2];
1154 PUTU32(out + 8, s2);
1155 s3 =
1156 (((uint32_t)Td4[(t3 >> 24)]) << 24) ^
1157 (Td4[(t2 >> 16) & 0xff] << 16) ^
1158 (Td4[(t1 >> 8) & 0xff] << 8) ^
1159 (Td4[(t0) & 0xff]) ^
1160 rk[3];
1161 PUTU32(out + 12, s3);
1162}
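
A minimal round-trip sketch for the two block routines above, checked against the FIPS-197 Appendix C.1 AES-128 test vector; it assumes nothing beyond the public <openssl/aes.h> interface removed by this commit.

#include <string.h>
#include <openssl/aes.h>

int
main(void)
{
	static const unsigned char key[16] = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	};
	static const unsigned char pt[16] = {
		0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
		0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff
	};
	/* expected ciphertext from FIPS-197 Appendix C.1 */
	static const unsigned char expect[16] = {
		0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
		0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a
	};
	unsigned char ct[16], back[16];
	AES_KEY ek, dk;

	AES_set_encrypt_key(key, 128, &ek);
	AES_set_decrypt_key(key, 128, &dk);
	AES_encrypt(pt, ct, &ek);
	AES_decrypt(ct, back, &dk);
	return memcmp(ct, expect, 16) != 0 || memcmp(back, pt, 16) != 0;
}
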
1163
1164#else /* AES_ASM */
1165
1166static const u8 Te4[256] = {
1167 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
1168 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
1169 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
1170 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
1171 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
1172 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
1173 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
1174 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
1175 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
1176 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
1177 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
1178 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
1179 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
1180 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
1181 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
1182 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
1183 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
1184 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
1185 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
1186 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
1187 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
1188 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
1189 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
1190 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
1191 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
1192 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
1193 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
1194 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
1195 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
1196 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
1197 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
1198 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
1199};
1200static const u32 rcon[] = {
1201 0x01000000, 0x02000000, 0x04000000, 0x08000000,
1202 0x10000000, 0x20000000, 0x40000000, 0x80000000,
1203 0x1B000000, 0x36000000,
1204 /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
1205};
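
The rcon[] values above follow from repeated doubling in GF(2^8); a small illustrative sketch (not part of the original file) that regenerates all ten:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t rc = 0x01000000;	/* rcon[0] */
	int i;

	for (i = 0; i < 10; i++) {
		printf("0x%08x\n", (unsigned)rc);
		/* double the top byte in GF(2^8): shift left, then
		 * reduce by 0x1b when the old high bit was set */
		rc = ((rc << 1) & 0xff000000) ^
		    ((rc & 0x80000000) ? 0x1b000000 : 0);
	}
	return 0;
}
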
1206
1207/**
1208 * Expand the cipher key into the encryption key schedule.
1209 */
1210int
1211AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
1212{
1213 u32 *rk;
1214 int i = 0;
1215 u32 temp;
1216
1217 if (!userKey || !key)
1218 return -1;
1219 if (bits != 128 && bits != 192 && bits != 256)
1220 return -2;
1221
1222 rk = key->rd_key;
1223
1224 if (bits == 128)
1225 key->rounds = 10;
1226 else if (bits == 192)
1227 key->rounds = 12;
1228 else
1229 key->rounds = 14;
1230
1231 rk[0] = GETU32(userKey);
1232 rk[1] = GETU32(userKey + 4);
1233 rk[2] = GETU32(userKey + 8);
1234 rk[3] = GETU32(userKey + 12);
1235 if (bits == 128) {
1236 while (1) {
1237 temp = rk[3];
1238 rk[4] = rk[0] ^
1239 ((u32)Te4[(temp >> 16) & 0xff] << 24) ^
1240 (Te4[(temp >> 8) & 0xff] << 16) ^
1241 (Te4[(temp) & 0xff] << 8) ^
1242 (Te4[(temp >> 24)]) ^
1243 rcon[i];
1244 rk[5] = rk[1] ^ rk[4];
1245 rk[6] = rk[2] ^ rk[5];
1246 rk[7] = rk[3] ^ rk[6];
1247 if (++i == 10) {
1248 return 0;
1249 }
1250 rk += 4;
1251 }
1252 }
1253 rk[4] = GETU32(userKey + 16);
1254 rk[5] = GETU32(userKey + 20);
1255 if (bits == 192) {
1256 while (1) {
1257 temp = rk[5];
1258 rk[6] = rk[0] ^
1259 ((u32)Te4[(temp >> 16) & 0xff] << 24) ^
1260 (Te4[(temp >> 8) & 0xff] << 16) ^
1261 (Te4[(temp) & 0xff] << 8) ^
1262 (Te4[(temp >> 24)]) ^
1263 rcon[i];
1264 rk[7] = rk[1] ^ rk[6];
1265 rk[8] = rk[2] ^ rk[7];
1266 rk[9] = rk[3] ^ rk[8];
1267 if (++i == 8) {
1268 return 0;
1269 }
1270 rk[10] = rk[4] ^ rk[9];
1271 rk[11] = rk[5] ^ rk[10];
1272 rk += 6;
1273 }
1274 }
1275 rk[6] = GETU32(userKey + 24);
1276 rk[7] = GETU32(userKey + 28);
1277 if (bits == 256) {
1278 while (1) {
1279 temp = rk[7];
1280 rk[8] = rk[0] ^
1281 ((u32)Te4[(temp >> 16) & 0xff] << 24) ^
1282 (Te4[(temp >> 8) & 0xff] << 16) ^
1283 (Te4[(temp) & 0xff] << 8) ^
1284 (Te4[(temp >> 24)]) ^
1285 rcon[i];
1286 rk[9] = rk[1] ^ rk[8];
1287 rk[10] = rk[2] ^ rk[9];
1288 rk[11] = rk[3] ^ rk[10];
1289 if (++i == 7) {
1290 return 0;
1291 }
1292 temp = rk[11];
1293 rk[12] = rk[4] ^
1294 ((u32)Te4[(temp >> 24)] << 24) ^
1295 (Te4[(temp >> 16) & 0xff] << 16) ^
1296 (Te4[(temp >> 8) & 0xff] << 8) ^
1297 (Te4[(temp) & 0xff]);
1298 rk[13] = rk[5] ^ rk[12];
1299 rk[14] = rk[6] ^ rk[13];
1300 rk[15] = rk[7] ^ rk[14];
1301
1302 rk += 8;
1303 }
1304 }
1305 return 0;
1306}
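
A quick sanity sketch of the rounds selection above: 10, 12 and 14 rounds give 4*(rounds+1) = 44, 52 and 60 schedule words, all of which fit the rd_key array.

#include <assert.h>
#include <openssl/aes.h>

int
main(void)
{
	static const unsigned char k[32];	/* all-zero demo key */
	AES_KEY key;

	assert(AES_set_encrypt_key(k, 128, &key) == 0 && key.rounds == 10);
	assert(AES_set_encrypt_key(k, 192, &key) == 0 && key.rounds == 12);
	assert(AES_set_encrypt_key(k, 256, &key) == 0 && key.rounds == 14);
	return 0;
}
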
1307
1308/**
1309 * Expand the cipher key into the decryption key schedule.
1310 */
1311int
1312AES_set_decrypt_key(const unsigned char *userKey, const int bits,
1313 AES_KEY *key)
1314{
1315 u32 *rk;
1316 int i, j, status;
1317 u32 temp;
1318
1319 /* first, start with an encryption schedule */
1320 status = AES_set_encrypt_key(userKey, bits, key);
1321 if (status < 0)
1322 return status;
1323
1324 rk = key->rd_key;
1325
1326 /* invert the order of the round keys: */
1327 for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
1328 temp = rk[i];
1329 rk[i] = rk[j];
1330 rk[j] = temp;
1331 temp = rk[i + 1];
1332 rk[i + 1] = rk[j + 1];
1333 rk[j + 1] = temp;
1334 temp = rk[i + 2];
1335 rk[i + 2] = rk[j + 2];
1336 rk[j + 2] = temp;
1337 temp = rk[i + 3];
1338 rk[i + 3] = rk[j + 3];
1339 rk[j + 3] = temp;
1340 }
1341 /* apply the inverse MixColumn transform to all round keys but the first and the last: */
1342 for (i = 1; i < (key->rounds); i++) {
1343 rk += 4;
1344 for (j = 0; j < 4; j++) {
1345 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
1346
1347 tp1 = rk[j];
1348 m = tp1 & 0x80808080;
1349 tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
1350 ((m - (m >> 7)) & 0x1b1b1b1b);
1351 m = tp2 & 0x80808080;
1352 tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
1353 ((m - (m >> 7)) & 0x1b1b1b1b);
1354 m = tp4 & 0x80808080;
1355 tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
1356 ((m - (m >> 7)) & 0x1b1b1b1b);
1357 tp9 = tp8 ^ tp1;
1358 tpb = tp9 ^ tp2;
1359 tpd = tp9 ^ tp4;
1360 tpe = tp8 ^ tp4 ^ tp2;
1361#if defined(ROTATE)
1362 rk[j] = tpe ^ ROTATE(tpd, 16) ^
1363 ROTATE(tp9, 24) ^ ROTATE(tpb, 8);
1364#else
1365 rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
1366 (tp9 >> 8) ^ (tp9 << 24) ^
1367 (tpb >> 24) ^ (tpb << 8);
1368#endif
1369 }
1370 }
1371 return 0;
1372}
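
The tp2/tp4/tp8 expressions above double four GF(2^8) bytes at once within a single 32-bit word. A standalone sketch of the mask trick (xtime and xtime4 are illustrative names): m isolates each byte's high bit, and (m - (m >> 7)) expands every set high bit into a byte-wide selector for the 0x1b reduction.

#include <assert.h>
#include <stdint.h>

/* per-byte GF(2^8) doubling, the scalar reference */
static uint8_t
xtime(uint8_t a)
{
	return (uint8_t)((a << 1) ^ ((a >> 7) * 0x1b));
}

/* the four-bytes-at-once doubling used in AES_set_decrypt_key above */
static uint32_t
xtime4(uint32_t tp1)
{
	uint32_t m = tp1 & 0x80808080;

	return ((tp1 & 0x7f7f7f7f) << 1) ^ ((m - (m >> 7)) & 0x1b1b1b1b);
}

int
main(void)
{
	uint32_t r = xtime4(0x0180ff7f);

	assert(((r >> 24) & 0xff) == xtime(0x01));
	assert(((r >> 16) & 0xff) == xtime(0x80));
	assert(((r >> 8) & 0xff) == xtime(0xff));
	assert((r & 0xff) == xtime(0x7f));
	return 0;
}
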
1373
1374#endif /* AES_ASM */
diff --git a/src/lib/libcrypto/aes/aes_ctr.c b/src/lib/libcrypto/aes/aes_ctr.c
deleted file mode 100644
index 607914599b..0000000000
--- a/src/lib/libcrypto/aes/aes_ctr.c
+++ /dev/null
@@ -1,62 +0,0 @@
1/* $OpenBSD: aes_ctr.c,v 1.9 2014/06/12 15:49:27 deraadt Exp $ */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/modes.h>
54
55void
56AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
57 size_t length, const AES_KEY *key, unsigned char ivec[AES_BLOCK_SIZE],
58 unsigned char ecount_buf[AES_BLOCK_SIZE], unsigned int *num)
59{
60 CRYPTO_ctr128_encrypt(in, out, length, key, ivec, ecount_buf, num,
61 (block128_f)AES_encrypt);
62}
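
A minimal usage sketch for the CTR wrapper above, with zero-filled demo key and counter; since CTR just XORs a keystream, decryption is the same call replayed with the counter state reset, and only the encryption key schedule is ever needed.

#include <string.h>
#include <openssl/aes.h>

int
main(void)
{
	static const unsigned char k[16] = { 0 };	/* demo key only */
	unsigned char ivec[AES_BLOCK_SIZE] = { 0 };	/* counter block */
	unsigned char ecount[AES_BLOCK_SIZE] = { 0 };	/* keystream cache */
	unsigned int num = 0;		/* consumed bytes of ecount */
	unsigned char msg[] = "attack at dawn";
	unsigned char ct[sizeof(msg)], pt[sizeof(msg)];
	AES_KEY key;

	AES_set_encrypt_key(k, 128, &key);
	AES_ctr128_encrypt(msg, ct, sizeof(msg), &key, ivec, ecount, &num);

	memset(ivec, 0, sizeof(ivec));	/* reset the counter state */
	memset(ecount, 0, sizeof(ecount));
	num = 0;
	AES_ctr128_encrypt(ct, pt, sizeof(ct), &key, ivec, ecount, &num);
	return memcmp(pt, msg, sizeof(msg)) != 0;
}
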
diff --git a/src/lib/libcrypto/aes/aes_ecb.c b/src/lib/libcrypto/aes/aes_ecb.c
deleted file mode 100644
index b05e53994b..0000000000
--- a/src/lib/libcrypto/aes/aes_ecb.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/* $OpenBSD: aes_ecb.c,v 1.6 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#ifndef AES_DEBUG
53# ifndef NDEBUG
54# define NDEBUG
55# endif
56#endif
57
58#include <openssl/aes.h>
59#include "aes_locl.h"
60
61void
62AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
63 const AES_KEY *key, const int enc)
64{
65 if (AES_ENCRYPT == enc)
66 AES_encrypt(in, out, key);
67 else
68 AES_decrypt(in, out, key);
69}
diff --git a/src/lib/libcrypto/aes/aes_ige.c b/src/lib/libcrypto/aes/aes_ige.c
deleted file mode 100644
index 16ef5612eb..0000000000
--- a/src/lib/libcrypto/aes/aes_ige.c
+++ /dev/null
@@ -1,194 +0,0 @@
1/* $OpenBSD: aes_ige.c,v 1.7 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/crypto.h>
54
55#include "aes_locl.h"
56
57#define N_WORDS (AES_BLOCK_SIZE / sizeof(unsigned long))
58typedef struct {
59 unsigned long data[N_WORDS];
60} aes_block_t;
61
62/* XXX: probably some better way to do this */
63#if defined(__i386__) || defined(__x86_64__)
64#define UNALIGNED_MEMOPS_ARE_FAST 1
65#else
66#define UNALIGNED_MEMOPS_ARE_FAST 0
67#endif
68
69#if UNALIGNED_MEMOPS_ARE_FAST
70#define load_block(d, s) (d) = *(const aes_block_t *)(s)
71#define store_block(d, s) *(aes_block_t *)(d) = (s)
72#else
73#define load_block(d, s) memcpy((d).data, (s), AES_BLOCK_SIZE)
74#define store_block(d, s) memcpy((d), (s).data, AES_BLOCK_SIZE)
75#endif
76
77/* N.B. The IV for this mode is _twice_ the block size */
78
79void
80AES_ige_encrypt(const unsigned char *in, unsigned char *out, size_t length,
81 const AES_KEY *key, unsigned char *ivec, const int enc)
82{
83 size_t n;
84 size_t len;
85
86 OPENSSL_assert((length % AES_BLOCK_SIZE) == 0);
87
88 len = length / AES_BLOCK_SIZE;
89
90 if (AES_ENCRYPT == enc) {
91 if (in != out && (UNALIGNED_MEMOPS_ARE_FAST ||
92 ((size_t)in|(size_t)out|(size_t)ivec) %
93 sizeof(long) == 0)) {
94 aes_block_t *ivp = (aes_block_t *)ivec;
95 aes_block_t *iv2p = (aes_block_t *)(ivec + AES_BLOCK_SIZE);
96
97 while (len) {
98 aes_block_t *inp = (aes_block_t *)in;
99 aes_block_t *outp = (aes_block_t *)out;
100
101 for (n = 0; n < N_WORDS; ++n)
102 outp->data[n] = inp->data[n] ^ ivp->data[n];
103 AES_encrypt((unsigned char *)outp->data, (unsigned char *)outp->data, key);
104 for (n = 0; n < N_WORDS; ++n)
105 outp->data[n] ^= iv2p->data[n];
106 ivp = outp;
107 iv2p = inp;
108 --len;
109 in += AES_BLOCK_SIZE;
110 out += AES_BLOCK_SIZE;
111 }
112 memcpy(ivec, ivp->data, AES_BLOCK_SIZE);
113 memcpy(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
114 } else {
115 aes_block_t tmp, tmp2;
116 aes_block_t iv;
117 aes_block_t iv2;
118
119 load_block(iv, ivec);
120 load_block(iv2, ivec + AES_BLOCK_SIZE);
121
122 while (len) {
123 load_block(tmp, in);
124 for (n = 0; n < N_WORDS; ++n)
125 tmp2.data[n] = tmp.data[n] ^ iv.data[n];
126 AES_encrypt((unsigned char *)tmp2.data,
127 (unsigned char *)tmp2.data, key);
128 for (n = 0; n < N_WORDS; ++n)
129 tmp2.data[n] ^= iv2.data[n];
130 store_block(out, tmp2);
131 iv = tmp2;
132 iv2 = tmp;
133 --len;
134 in += AES_BLOCK_SIZE;
135 out += AES_BLOCK_SIZE;
136 }
137 memcpy(ivec, iv.data, AES_BLOCK_SIZE);
138 memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
139 }
140 } else {
141 if (in != out && (UNALIGNED_MEMOPS_ARE_FAST ||
142 ((size_t)in|(size_t)out|(size_t)ivec) %
143 sizeof(long) == 0)) {
144 aes_block_t *ivp = (aes_block_t *)ivec;
145 aes_block_t *iv2p = (aes_block_t *)(ivec + AES_BLOCK_SIZE);
146
147 while (len) {
148 aes_block_t tmp;
149 aes_block_t *inp = (aes_block_t *)in;
150 aes_block_t *outp = (aes_block_t *)out;
151
152 for (n = 0; n < N_WORDS; ++n)
153 tmp.data[n] = inp->data[n] ^ iv2p->data[n];
154 AES_decrypt((unsigned char *)tmp.data,
155 (unsigned char *)outp->data, key);
156 for (n = 0; n < N_WORDS; ++n)
157 outp->data[n] ^= ivp->data[n];
158 ivp = inp;
159 iv2p = outp;
160 --len;
161 in += AES_BLOCK_SIZE;
162 out += AES_BLOCK_SIZE;
163 }
164 memcpy(ivec, ivp->data, AES_BLOCK_SIZE);
165 memcpy(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
166 } else {
167 aes_block_t tmp, tmp2;
168 aes_block_t iv;
169 aes_block_t iv2;
170
171 load_block(iv, ivec);
172 load_block(iv2, ivec + AES_BLOCK_SIZE);
173
174 while (len) {
175 load_block(tmp, in);
176 tmp2 = tmp;
177 for (n = 0; n < N_WORDS; ++n)
178 tmp.data[n] ^= iv2.data[n];
179 AES_decrypt((unsigned char *)tmp.data,
180 (unsigned char *)tmp.data, key);
181 for (n = 0; n < N_WORDS; ++n)
182 tmp.data[n] ^= iv.data[n];
183 store_block(out, tmp);
184 iv = tmp2;
185 iv2 = tmp;
186 --len;
187 in += AES_BLOCK_SIZE;
188 out += AES_BLOCK_SIZE;
189 }
190 memcpy(ivec, iv.data, AES_BLOCK_SIZE);
191 memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
192 }
193 }
194}
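
In IGE mode each block chains on both neighbours, c[i] = E(p[i] ^ c[i-1]) ^ p[i-1], with c[0] and p[0] supplied by the double-length IV noted above. A round-trip sketch with demo values (the call updates ivec in place, so the decrypt side gets a fresh copy):

#include <string.h>
#include <openssl/aes.h>

int
main(void)
{
	static const unsigned char k[16] = { 0 };	/* demo key only */
	unsigned char iv[2 * AES_BLOCK_SIZE] = { 0 };	/* c0 || p0 */
	unsigned char iv2[2 * AES_BLOCK_SIZE];
	unsigned char pt[32] = "two blocks of demo plaintext...";
	unsigned char ct[32], back[32];
	AES_KEY ek, dk;

	memcpy(iv2, iv, sizeof(iv));	/* pristine IV for decryption */

	AES_set_encrypt_key(k, 128, &ek);
	AES_set_decrypt_key(k, 128, &dk);	/* decrypt path uses AES_decrypt */
	AES_ige_encrypt(pt, ct, sizeof(pt), &ek, iv, AES_ENCRYPT);
	AES_ige_encrypt(ct, back, sizeof(ct), &dk, iv2, AES_DECRYPT);
	return memcmp(back, pt, sizeof(pt)) != 0;
}
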
diff --git a/src/lib/libcrypto/aes/aes_locl.h b/src/lib/libcrypto/aes/aes_locl.h
deleted file mode 100644
index 83b20b5f5b..0000000000
--- a/src/lib/libcrypto/aes/aes_locl.h
+++ /dev/null
@@ -1,79 +0,0 @@
1/* $OpenBSD: aes_locl.h,v 1.10 2014/06/12 15:49:27 deraadt Exp $ */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#ifndef HEADER_AES_LOCL_H
53#define HEADER_AES_LOCL_H
54
55#include <openssl/opensslconf.h>
56
57#ifdef OPENSSL_NO_AES
58#error AES is disabled.
59#endif
60
61#include <stdio.h>
62#include <stdlib.h>
63#include <string.h>
64
65#define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3]))
66#define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); }
67
68typedef unsigned int u32;
69typedef unsigned short u16;
70typedef unsigned char u8;
71
72#define MAXKC (256/32)
73#define MAXKB (256/8)
74#define MAXNR 14
75
76/* This controls loop-unrolling in aes_core.c */
77#undef FULL_UNROLL
78
79#endif /* !HEADER_AES_LOCL_H */
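
The GETU32/PUTU32 macros above read and write 32-bit words most-significant byte first regardless of host endianness. A tiny self-contained check (the macros and typedefs are restated so the sketch compiles on its own):

#include <assert.h>

typedef unsigned int u32;
typedef unsigned char u8;

#define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3]))
#define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); }

int
main(void)
{
	u8 buf[4];
	u32 v = 0x01020304;

	PUTU32(buf, v);
	assert(buf[0] == 0x01 && buf[3] == 0x04);	/* big-endian layout */
	assert(GETU32(buf) == v);
	return 0;
}
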
diff --git a/src/lib/libcrypto/aes/aes_misc.c b/src/lib/libcrypto/aes/aes_misc.c
deleted file mode 100644
index 6c1506dd79..0000000000
--- a/src/lib/libcrypto/aes/aes_misc.c
+++ /dev/null
@@ -1,65 +0,0 @@
1/* $OpenBSD: aes_misc.c,v 1.10 2014/07/09 11:10:50 bcook Exp $ */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/opensslv.h>
53#include <openssl/crypto.h>
54#include <openssl/aes.h>
55#include "aes_locl.h"
56
57const char *
58AES_options(void)
59{
60#ifdef FULL_UNROLL
61 return "aes(full)";
62#else
63 return "aes(partial)";
64#endif
65}
diff --git a/src/lib/libcrypto/aes/aes_ofb.c b/src/lib/libcrypto/aes/aes_ofb.c
deleted file mode 100644
index f8dc03a26e..0000000000
--- a/src/lib/libcrypto/aes/aes_ofb.c
+++ /dev/null
@@ -1,61 +0,0 @@
1/* $OpenBSD: aes_ofb.c,v 1.6 2014/06/12 15:49:27 deraadt Exp $ */
2/* ====================================================================
3 * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/modes.h>
54
55void
56AES_ofb128_encrypt(const unsigned char *in, unsigned char *out, size_t length,
57 const AES_KEY *key, unsigned char *ivec, int *num)
58{
59 CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num,
60 (block128_f)AES_encrypt);
61}
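
OFB is another XOR-with-keystream mode: decryption replays the same call from the initial IV, and only the encryption key schedule is used. A minimal sketch with demo values:

#include <string.h>
#include <openssl/aes.h>

int
main(void)
{
	static const unsigned char k[16] = { 0 };	/* demo key only */
	unsigned char iv[AES_BLOCK_SIZE] = { 0 };
	unsigned char msg[] = "ofb is symmetric";
	unsigned char ct[sizeof(msg)], pt[sizeof(msg)];
	int num = 0;
	AES_KEY key;

	AES_set_encrypt_key(k, 128, &key);
	AES_ofb128_encrypt(msg, ct, sizeof(msg), &key, iv, &num);

	memset(iv, 0, sizeof(iv));	/* decrypt = replay from the IV */
	num = 0;
	AES_ofb128_encrypt(ct, pt, sizeof(ct), &key, iv, &num);
	return memcmp(pt, msg, sizeof(msg)) != 0;
}
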
diff --git a/src/lib/libcrypto/aes/aes_wrap.c b/src/lib/libcrypto/aes/aes_wrap.c
deleted file mode 100644
index ac2f83a993..0000000000
--- a/src/lib/libcrypto/aes/aes_wrap.c
+++ /dev/null
@@ -1,133 +0,0 @@
1/* $OpenBSD: aes_wrap.c,v 1.10 2015/09/10 15:56:24 jsing Exp $ */
2/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
3 * project.
4 */
5/* ====================================================================
6 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. All advertising materials mentioning features or use of this
21 * software must display the following acknowledgment:
22 * "This product includes software developed by the OpenSSL Project
23 * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24 *
25 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26 * endorse or promote products derived from this software without
27 * prior written permission. For written permission, please contact
28 * licensing@OpenSSL.org.
29 *
30 * 5. Products derived from this software may not be called "OpenSSL"
31 * nor may "OpenSSL" appear in their names without prior written
32 * permission of the OpenSSL Project.
33 *
34 * 6. Redistributions of any form whatsoever must retain the following
35 * acknowledgment:
36 * "This product includes software developed by the OpenSSL Project
37 * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50 * OF THE POSSIBILITY OF SUCH DAMAGE.
51 * ====================================================================
52 */
53
54#include <string.h>
55
56#include <openssl/aes.h>
57#include <openssl/bio.h>
58
59static const unsigned char default_iv[] = {
60 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
61};
62
63int
64AES_wrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out,
65 const unsigned char *in, unsigned int inlen)
66{
67 unsigned char *A, B[16], *R;
68 unsigned int i, j, t;
69 if ((inlen & 0x7) || (inlen < 8))
70 return -1;
71 A = B;
72 t = 1;
73 memcpy(out + 8, in, inlen);
74 if (!iv)
75 iv = default_iv;
76
77 memcpy(A, iv, 8);
78
79 for (j = 0; j < 6; j++) {
80 R = out + 8;
81 for (i = 0; i < inlen; i += 8, t++, R += 8) {
82 memcpy(B + 8, R, 8);
83 AES_encrypt(B, B, key);
84 A[7] ^= (unsigned char)(t & 0xff);
85 if (t > 0xff) {
86 A[6] ^= (unsigned char)((t >> 8) & 0xff);
87 A[5] ^= (unsigned char)((t >> 16) & 0xff);
88 A[4] ^= (unsigned char)((t >> 24) & 0xff);
89 }
90 memcpy(R, B + 8, 8);
91 }
92 }
93 memcpy(out, A, 8);
94 return inlen + 8;
95}
96
97int
98AES_unwrap_key(AES_KEY *key, const unsigned char *iv, unsigned char *out,
99 const unsigned char *in, unsigned int inlen)
100{
101 unsigned char *A, B[16], *R;
102 unsigned int i, j, t;
103 if (inlen & 0x7)
104 return -1;
105 if (inlen < 16)
106 return -1;
107 inlen -= 8; /* checked above, cannot wrap */
108 A = B;
109 t = 6 * (inlen >> 3);
110 memcpy(A, in, 8);
111 memcpy(out, in + 8, inlen);
112 for (j = 0; j < 6; j++) {
113 R = out + inlen - 8;
114 for (i = 0; i < inlen; i += 8, t--, R -= 8) {
115 A[7] ^= (unsigned char)(t & 0xff);
116 if (t > 0xff) {
117 A[6] ^= (unsigned char)((t >> 8) & 0xff);
118 A[5] ^= (unsigned char)((t >> 16) & 0xff);
119 A[4] ^= (unsigned char)((t >> 24) & 0xff);
120 }
121 memcpy(B + 8, R, 8);
122 AES_decrypt(B, B, key);
123 memcpy(R, B + 8, 8);
124 }
125 }
126 if (!iv)
127 iv = default_iv;
128 if (memcmp(A, iv, 8)) {
129 explicit_bzero(out, inlen);
130 return 0;
131 }
132 return inlen;
133}
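
A round-trip sketch for the RFC 3394 key wrap above: wrapping an n-byte key produces n + 8 bytes, a NULL iv selects the default 0xA6 constant, and a failed integrity check makes AES_unwrap_key zero the output and return 0. Demo values only:

#include <string.h>
#include <openssl/aes.h>

int
main(void)
{
	static const unsigned char kek[16] = { 0 };	/* demo KEK only */
	static const unsigned char cek[16] = {		/* key to wrap */
		0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
		0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff
	};
	unsigned char wrapped[sizeof(cek) + 8], out[sizeof(cek)];
	AES_KEY ek, dk;

	AES_set_encrypt_key(kek, 128, &ek);	/* wrap encrypts */
	AES_set_decrypt_key(kek, 128, &dk);	/* unwrap decrypts */

	if (AES_wrap_key(&ek, NULL, wrapped, cek, sizeof(cek)) !=
	    sizeof(wrapped))
		return 1;
	if (AES_unwrap_key(&dk, NULL, out, wrapped, sizeof(wrapped)) !=
	    sizeof(cek))
		return 1;
	return memcmp(out, cek, sizeof(cek)) != 0;
}
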
diff --git a/src/lib/libcrypto/aes/aes_x86core.c b/src/lib/libcrypto/aes/aes_x86core.c
deleted file mode 100644
index c604fa876f..0000000000
--- a/src/lib/libcrypto/aes/aes_x86core.c
+++ /dev/null
@@ -1,1080 +0,0 @@
1/* $OpenBSD: aes_x86core.c,v 1.8 2015/02/10 09:46:30 miod Exp $ */
2/**
3 * rijndael-alg-fst.c
4 *
5 * @version 3.0 (December 2000)
6 *
7 * Optimised ANSI C code for the Rijndael cipher (now AES)
8 *
9 * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10 * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11 * @author Paulo Barreto <paulo.barreto@terra.com.br>
12 *
13 * This code is hereby placed in the public domain.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * This is an experimental x86[_64] derivative. It assumes little-endian
30 * byte order and expects the CPU to sustain unaligned memory references.
31 * It is used as playground for cache-time attack mitigations and
32 * serves as reference C implementation for x86[_64] assembler.
33 *
34 * <appro@fy.chalmers.se>
35 */
36
37
38#ifndef AES_DEBUG
39# ifndef NDEBUG
40# define NDEBUG
41# endif
42#endif
43
44#include <stdlib.h>
45#include <openssl/aes.h>
46#include "aes_locl.h"
47
48/*
49 * These two parameters control which table, 256-byte or 2KB, is
50 * referenced in the outer and inner rounds, respectively.
51 */
52#define AES_COMPACT_IN_OUTER_ROUNDS
53#ifdef AES_COMPACT_IN_OUTER_ROUNDS
54/* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
55 * adding AES_COMPACT_IN_INNER_ROUNDS reduces benchmark results
56 * further, by a factor of ~2. */
57# undef AES_COMPACT_IN_INNER_ROUNDS
58#endif
59
60#if 1
61static void
62prefetch256(const void *table)
63{
64 volatile unsigned long *t = (void *)table, ret;
65 unsigned long sum;
66 int i;
67
68 /* 32 bytes is a common minimum cache-line size */
69 for (sum = 0, i = 0; i < 256/sizeof(t[0]); i += 32 / sizeof(t[0]))
70 sum ^= t[i];
71
72 ret = sum;
73}
74#else
75# define prefetch256(t)
76#endif
77
78#undef GETU32
79#define GETU32(p) (*((u32*)(p)))
80
81#if defined(_LP64)
82typedef unsigned long u64;
83#define U64(C) C##UL
84#else
85typedef unsigned long long u64;
86#define U64(C) C##ULL
87#endif
88
89#undef ROTATE
90#if defined(__GNUC__) && __GNUC__>=2
91# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
92# define ROTATE(a,n) ({ unsigned int ret; \
93 asm ( \
94 "roll %1,%0" \
95 : "=r"(ret) \
96 : "I"(n), "0"(a) \
97 : "cc"); \
98 ret; \
99 })
100# endif
101#endif
102/*
103Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
104Te0[x] = S [x].[02, 01, 01, 03];
105Te1[x] = S [x].[03, 02, 01, 01];
106Te2[x] = S [x].[01, 03, 02, 01];
107Te3[x] = S [x].[01, 01, 03, 02];
108*/
109#define Te0 (u32)((u64*)((u8*)Te+0))
110#define Te1 (u32)((u64*)((u8*)Te+3))
111#define Te2 (u32)((u64*)((u8*)Te+2))
112#define Te3 (u32)((u64*)((u8*)Te+1))
113/*
114Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
115Td0[x] = Si[x].[0e, 09, 0d, 0b];
116Td1[x] = Si[x].[0b, 0e, 09, 0d];
117Td2[x] = Si[x].[0d, 0b, 0e, 09];
118Td3[x] = Si[x].[09, 0d, 0b, 0e];
119Td4[x] = Si[x].[01];
120*/
121#define Td0 (u32)((u64*)((u8*)Td+0))
122#define Td1 (u32)((u64*)((u8*)Td+3))
123#define Td2 (u32)((u64*)((u8*)Td+2))
124#define Td3 (u32)((u64*)((u8*)Td+1))
125
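
Each Te[]/Td[] entry below stores its 32-bit value twice inside a u64 so that, on the little-endian machines this file targets, the Te1/Te2/Te3 and Td1/Td2/Td3 macros above can read pre-rotated words at byte offsets 1 through 3. A standalone sketch of the trick, with memcpy standing in for the unaligned loads the file assumes are cheap:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main(void)
{
	/* the first duplicated entry of Te[] below */
	uint64_t entry = 0xa56363c6a56363c6ULL;
	uint32_t v0, v1;

	memcpy(&v0, (uint8_t *)&entry + 0, sizeof(v0));
	memcpy(&v1, (uint8_t *)&entry + 1, sizeof(v1));
	/* little-endian: the view at byte offset k equals the offset-0
	 * word rotated right by 8*k bits */
	assert(v1 == ((v0 >> 8) | (v0 << 24)));
	return 0;
}
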
126static const u64 Te[256] = {
127 U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
128 U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
129 U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
130 U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
131 U64(0x5030306050303060), U64(0x0301010203010102),
132 U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
133 U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
134 U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
135 U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
136 U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
137 U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
138 U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
139 U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
140 U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
141 U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
142 U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
143 U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
144 U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
145 U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
146 U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
147 U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
148 U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
149 U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
150 U64(0x5331316253313162), U64(0x3f15152a3f15152a),
151 U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
152 U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
153 U64(0x2818183028181830), U64(0xa1969637a1969637),
154 U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
155 U64(0x0907070e0907070e), U64(0x3612122436121224),
156 U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
157 U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
158 U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
159 U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
160 U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
161 U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
162 U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
163 U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
164 U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
165 U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
166 U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
167 U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
168 U64(0x0000000000000000), U64(0x2cededc12cededc1),
169 U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
170 U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
171 U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
172 U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
173 U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
174 U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
175 U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
176 U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
177 U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
178 U64(0x5533336655333366), U64(0x9485851194858511),
179 U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
180 U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
181 U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
182 U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
183 U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
184 U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
185 U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
186 U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
187 U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
188 U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
189 U64(0x3010102030101020), U64(0x1affffe51affffe5),
190 U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
191 U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
192 U64(0x3513132635131326), U64(0x2fececc32fececc3),
193 U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
194 U64(0xcc444488cc444488), U64(0x3917172e3917172e),
195 U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
196 U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
197 U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
198 U64(0x2b1919322b191932), U64(0x957373e6957373e6),
199 U64(0xa06060c0a06060c0), U64(0x9881811998818119),
200 U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
201 U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
202 U64(0xab90903bab90903b), U64(0x8388880b8388880b),
203 U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
204 U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
205 U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
206 U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
207 U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
208 U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
209 U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
210 U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
211 U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
212 U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
213 U64(0xa8919139a8919139), U64(0xa4959531a4959531),
214 U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
215 U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
216 U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
217 U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
218 U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
219 U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
220 U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
221 U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
222 U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
223 U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
224 U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
225 U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
226 U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
227 U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
228 U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
229 U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
230 U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
231 U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
232 U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
233 U64(0xd8484890d8484890), U64(0x0503030605030306),
234 U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
235 U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
236 U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
237 U64(0x9186861791868617), U64(0x58c1c19958c1c199),
238 U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
239 U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
240 U64(0xb398982bb398982b), U64(0x3311112233111122),
241 U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
242 U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
243 U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
244 U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
245 U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
246 U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
247 U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
248 U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
249 U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
250 U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
251 U64(0xc3414182c3414182), U64(0xb0999929b0999929),
252 U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
253 U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
254 U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
255};
256
257static const u8 Te4[256] = {
258 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
259 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
260 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
261 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
262 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
263 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
264 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
265 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
266 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
267 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
268 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
269 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
270 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
271 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
272 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
273 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
274 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
275 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
276 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
277 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
278 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
279 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
280 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
281 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
282 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
283 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
284 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
285 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
286 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
287 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
288 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
289 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
290};
291
292static const u64 Td[256] = {
293 U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
294 U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
295 U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
296 U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
297 U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
298 U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
299 U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
300 U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
301 U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
302 U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
303 U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
304 U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
305 U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
306 U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
307 U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
308 U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
309 U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
310 U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
311 U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
312 U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
313 U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
314 U64(0x6033519760335197), U64(0x457f5362457f5362),
315 U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
316 U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
317 U64(0x5868487058684870), U64(0x19fd458f19fd458f),
318 U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
319 U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
320 U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
321 U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
322 U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
323 U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
324 U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
325 U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
326 U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
327 U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
328 U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
329 U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
330 U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
331 U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
332 U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
333 U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
334 U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
335 U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
336 U64(0x6fd406046fd40604), U64(0xff155060ff155060),
337 U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
338 U64(0xcc434089cc434089), U64(0x779ed967779ed967),
339 U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
340 U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
341 U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
342 U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
343 U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
344 U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
345 U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
346 U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
347 U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
348 U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
349 U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
350 U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
351 U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
352 U64(0x694b775a694b775a), U64(0x161a121c161a121c),
353 U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
354 U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
355 U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
356 U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
357 U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
358 U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
359 U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
360 U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
361 U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
362 U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
363 U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
364 U64(0x4022971340229713), U64(0x2011c6842011c684),
365 U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
366 U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
367 U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
368 U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
369 U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
370 U64(0xfa489411fa489411), U64(0x2264e9472264e947),
371 U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
372 U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
373 U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
374 U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
375 U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
376 U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
377 U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
378 U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
379 U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
380 U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
381 U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
382 U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
383 U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
384 U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
385 U64(0x097826cd097826cd), U64(0xf418596ef418596e),
386 U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
387 U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
388 U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
389 U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
390 U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
391 U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
392 U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
393 U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
394 U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
395 U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
396 U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
397 U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
398 U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
399 U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
400 U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
401 U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
402 U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
403 U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
404 U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
405 U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
406 U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
407 U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
408 U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
409 U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
410 U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
411 U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
412 U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
413 U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
414 U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
415 U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
416 U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
417 U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
418 U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
419 U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
420 U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
421};
422static const u8 Td4[256] = {
423 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
424 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
425 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
426 0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
427 0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
428 0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
429 0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
430 0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
431 0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
432 0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
433 0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
434 0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
435 0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
436 0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
437 0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
438 0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
439 0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
440 0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
441 0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
442 0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
443 0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
444 0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
445 0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
446 0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
447 0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
448 0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
449 0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
450 0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
451 0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
452 0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
453 0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
454 0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
455};
456
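Td4 above is the inverse of the Te4 S-box, so composing the two lookups
must give the identity. A quick self-check sketch, assuming both tables
are in scope (not part of the original file):

/* Verify that Td4 inverts Te4 byte-for-byte (and vice versa). */
static int
sbox_tables_consistent(void)
{
	int x;

	for (x = 0; x < 256; x++)
		if (Td4[Te4[x]] != x || Te4[Td4[x]] != x)
			return 0;
	return 1;
}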
457static const u32 rcon[] = {
458 0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
459 0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
460 0x0000001bU, 0x00000036U,
461 /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
462};
463
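The round constants are the successive powers of x (0x02) in GF(2^8),
reduced modulo the AES polynomial x^8 + x^4 + x^3 + x + 1. A standalone
sketch that reproduces the table above:

#include <stdio.h>

/* Print the 10 AES round constants: rcon[i] = x^i in GF(2^8). */
int
main(void)
{
	unsigned int rc = 1;
	int i;

	for (i = 0; i < 10; i++) {
		printf("0x%08xU\n", rc);
		/* multiply by x, reducing modulo x^8 + x^4 + x^3 + x + 1 */
		rc = (rc & 0x80) ? (rc << 1) ^ 0x11b : rc << 1;
	}
	return 0;
}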
464/**
465 * Expand the cipher key into the encryption key schedule.
466 */
467int
468AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
{
469 u32 *rk;
470 int i = 0;
471 u32 temp;
472
473 if (!userKey || !key)
474 return -1;
475 if (bits != 128 && bits != 192 && bits != 256)
476 return -2;
477
478 rk = key->rd_key;
479
480 if (bits == 128)
481 key->rounds = 10;
482 else if (bits == 192)
483 key->rounds = 12;
484 else
485 key->rounds = 14;
486
487 rk[0] = GETU32(userKey);
488 rk[1] = GETU32(userKey + 4);
489 rk[2] = GETU32(userKey + 8);
490 rk[3] = GETU32(userKey + 12);
491 if (bits == 128) {
492 while (1) {
493 temp = rk[3];
494 rk[4] = rk[0] ^
495 (Te4[(temp >> 8) & 0xff]) ^
496 (Te4[(temp >> 16) & 0xff] << 8) ^
497 (Te4[(temp >> 24)] << 16) ^
498 (Te4[(temp) & 0xff] << 24) ^
499 rcon[i];
500 rk[5] = rk[1] ^ rk[4];
501 rk[6] = rk[2] ^ rk[5];
502 rk[7] = rk[3] ^ rk[6];
503 if (++i == 10) {
504 return 0;
505 }
506 rk += 4;
507 }
508 }
509 rk[4] = GETU32(userKey + 16);
510 rk[5] = GETU32(userKey + 20);
511 if (bits == 192) {
512 while (1) {
513 temp = rk[5];
514 rk[6] = rk[ 0] ^
515 (Te4[(temp >> 8) & 0xff]) ^
516 (Te4[(temp >> 16) & 0xff] << 8) ^
517 (Te4[(temp >> 24)] << 16) ^
518 (Te4[(temp) & 0xff] << 24) ^
519 rcon[i];
520 rk[7] = rk[1] ^ rk[6];
521 rk[8] = rk[2] ^ rk[7];
522 rk[9] = rk[3] ^ rk[8];
523 if (++i == 8) {
524 return 0;
525 }
526 rk[10] = rk[4] ^ rk[9];
527 rk[11] = rk[5] ^ rk[10];
528 rk += 6;
529 }
530 }
531 rk[6] = GETU32(userKey + 24);
532 rk[7] = GETU32(userKey + 28);
533 if (bits == 256) {
534 while (1) {
535 temp = rk[7];
536 rk[8] = rk[0] ^
537 (Te4[(temp >> 8) & 0xff]) ^
538 (Te4[(temp >> 16) & 0xff] << 8) ^
539 (Te4[(temp >> 24)] << 16) ^
540 (Te4[(temp) & 0xff] << 24) ^
541 rcon[i];
542 rk[9] = rk[1] ^ rk[8];
543 rk[10] = rk[2] ^ rk[9];
544 rk[11] = rk[3] ^ rk[10];
545 if (++i == 7) {
546 return 0;
547 }
548 temp = rk[11];
549 rk[12] = rk[4] ^
550 (Te4[(temp) & 0xff]) ^
551 (Te4[(temp >> 8) & 0xff] << 8) ^
552 (Te4[(temp >> 16) & 0xff] << 16) ^
553 (Te4[(temp >> 24)] << 24);
554 rk[13] = rk[5] ^ rk[12];
555 rk[14] = rk[6] ^ rk[13];
556 rk[15] = rk[7] ^ rk[14];
557
558 rk += 8;
559 }
560 }
561 return 0;
562}
563
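The expanded schedule holds 4*(rounds + 1) 32-bit words (44, 52 or 60).
For reference, a typical caller pairs the two key-schedule routines with
AES_encrypt()/AES_decrypt() defined below; a minimal usage sketch,
assuming the public <openssl/aes.h> declarations:

#include <string.h>
#include <openssl/aes.h>

/* One-block encrypt/decrypt round trip under a 128-bit key. */
static int
aes128_roundtrip(const unsigned char key_bytes[16],
    const unsigned char in[16], unsigned char out[16])
{
	AES_KEY enc, dec;
	unsigned char tmp[16];

	if (AES_set_encrypt_key(key_bytes, 128, &enc) != 0 ||
	    AES_set_decrypt_key(key_bytes, 128, &dec) != 0)
		return -1;
	AES_encrypt(in, out, &enc);
	AES_decrypt(out, tmp, &dec);
	return memcmp(tmp, in, 16) == 0 ? 0 : -1;
}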
564/**
565 * Expand the cipher key into the decryption key schedule.
566 */
567int
568AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
569{
570 u32 *rk;
571 int i, j, status;
572 u32 temp;
573
574 /* first, start with an encryption schedule */
575 status = AES_set_encrypt_key(userKey, bits, key);
576 if (status < 0)
577 return status;
578
579 rk = key->rd_key;
580
581 /* invert the order of the round keys: */
582 for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
583 temp = rk[i];
584 rk[i] = rk[j];
585 rk[j] = temp;
586 temp = rk[i + 1];
587 rk[i + 1] = rk[j + 1];
588 rk[j + 1] = temp;
589 temp = rk[i + 2];
590 rk[i + 2] = rk[j + 2];
591 rk[j + 2] = temp;
592 temp = rk[i + 3];
593 rk[i + 3] = rk[j + 3];
594 rk[j + 3] = temp;
595 }
596	/* apply the inverse MixColumns transform to all round keys but the first and the last: */
597 for (i = 1; i < (key->rounds); i++) {
598 rk += 4;
599#if 1
600 for (j = 0; j < 4; j++) {
601 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
602
603 tp1 = rk[j];
604 m = tp1 & 0x80808080;
605 tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
606 ((m - (m >> 7)) & 0x1b1b1b1b);
607 m = tp2 & 0x80808080;
608 tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
609 ((m - (m >> 7)) & 0x1b1b1b1b);
610 m = tp4 & 0x80808080;
611 tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
612 ((m - (m >> 7)) & 0x1b1b1b1b);
613 tp9 = tp8 ^ tp1;
614 tpb = tp9 ^ tp2;
615 tpd = tp9 ^ tp4;
616 tpe = tp8 ^ tp4 ^ tp2;
617#if defined(ROTATE)
618 rk[j] = tpe ^ ROTATE(tpd, 16) ^
619 ROTATE(tp9, 8) ^ ROTATE(tpb, 24);
620#else
621 rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
622 (tp9 >> 24) ^ (tp9 << 8) ^
623 (tpb >> 8) ^ (tpb << 24);
624#endif
625 }
626#else
627 rk[0] =
628 Td0[Te2[(rk[0]) & 0xff] & 0xff] ^
629 Td1[Te2[(rk[0] >> 8) & 0xff] & 0xff] ^
630 Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
631 Td3[Te2[(rk[0] >> 24)] & 0xff];
632 rk[1] =
633 Td0[Te2[(rk[1]) & 0xff] & 0xff] ^
634 Td1[Te2[(rk[1] >> 8) & 0xff] & 0xff] ^
635 Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
636 Td3[Te2[(rk[1] >> 24)] & 0xff];
637 rk[2] =
638 Td0[Te2[(rk[2]) & 0xff] & 0xff] ^
639 Td1[Te2[(rk[2] >> 8) & 0xff] & 0xff] ^
640 Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
641 Td3[Te2[(rk[2] >> 24)] & 0xff];
642 rk[3] =
643 Td0[Te2[(rk[3]) & 0xff] & 0xff] ^
644 Td1[Te2[(rk[3] >> 8) & 0xff] & 0xff] ^
645 Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
646 Td3[Te2[(rk[3] >> 24)] & 0xff];
647#endif
648 }
649 return 0;
650}
651
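The tp2/tp4/tp8 computation in the loop above multiplies all four bytes
of a round-key word by 2 in GF(2^8) at once; tp9, tpb, tpd and tpe then
follow as XOR combinations (9 = 8+1, b = 8+2+1, d = 8+4+1, e = 8+4+2),
giving the InvMixColumns coefficients. A sketch of the byte-parallel
doubling step in isolation:

#include <stdint.h>

/* Multiply each byte of a 32-bit word by 2 in GF(2^8) ("xtime",
 * applied lane-wise), exactly as the loop above does. */
static uint32_t
xtime_word(uint32_t x)
{
	uint32_t m = x & 0x80808080;	/* top bit of every byte */

	/* (m - (m >> 7)) & 0x1b1b1b1b puts 0x1b in each byte whose
	 * top bit was set: the conditional polynomial reduction */
	return ((x & 0x7f7f7f7f) << 1) ^ ((m - (m >> 7)) & 0x1b1b1b1b);
}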
652/*
653 * Encrypt a single block
654 * in and out can overlap
655 */
656void
657AES_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
658{
659 const u32 *rk;
660 u32 s0, s1, s2, s3, t[4];
661 int r;
662
663 rk = key->rd_key;
664
665 /*
666 * map byte array block to cipher state
667 * and add initial round key:
668 */
669 s0 = GETU32(in) ^ rk[0];
670 s1 = GETU32(in + 4) ^ rk[1];
671 s2 = GETU32(in + 8) ^ rk[2];
672 s3 = GETU32(in + 12) ^ rk[3];
673
674#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
675 prefetch256(Te4);
676
677 t[0] = Te4[(s0) & 0xff] ^
678 Te4[(s1 >> 8) & 0xff] << 8 ^
679 Te4[(s2 >> 16) & 0xff] << 16 ^
680 Te4[(s3 >> 24)] << 24;
681 t[1] = Te4[(s1) & 0xff] ^
682 Te4[(s2 >> 8) & 0xff] << 8 ^
683 Te4[(s3 >> 16) & 0xff] << 16 ^
684 Te4[(s0 >> 24)] << 24;
685 t[2] = Te4[(s2) & 0xff] ^
686 Te4[(s3 >> 8) & 0xff] << 8 ^
687 Te4[(s0 >> 16) & 0xff] << 16 ^
688 Te4[(s1 >> 24)] << 24;
689 t[3] = Te4[(s3) & 0xff] ^
690 Te4[(s0 >> 8) & 0xff] << 8 ^
691 Te4[(s1 >> 16) & 0xff] << 16 ^
692 Te4[(s2 >> 24)] << 24;
693
694 /* now do the linear transform using words */
695 {
696 int i;
697 u32 r0, r1, r2;
698
699 for (i = 0; i < 4; i++) {
700 r0 = t[i];
701 r1 = r0 & 0x80808080;
702 r2 = ((r0 & 0x7f7f7f7f) << 1) ^
703 ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
704#if defined(ROTATE)
705 t[i] = r2 ^ ROTATE(r2, 24) ^ ROTATE(r0, 24) ^
706 ROTATE(r0, 16) ^ ROTATE(r0, 8);
707#else
708 t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
709 (r0 << 16) ^ (r0 >> 16) ^ (r0 << 8) ^ (r0 >> 24);
710#endif
711 t[i] ^= rk[4 + i];
712 }
713 }
714#else
715 t[0] = Te0[(s0) & 0xff] ^
716 Te1[(s1 >> 8) & 0xff] ^
717 Te2[(s2 >> 16) & 0xff] ^
718 Te3[(s3 >> 24)] ^
719 rk[4];
720 t[1] = Te0[(s1) & 0xff] ^
721 Te1[(s2 >> 8) & 0xff] ^
722 Te2[(s3 >> 16) & 0xff] ^
723 Te3[(s0 >> 24)] ^
724 rk[5];
725 t[2] = Te0[(s2) & 0xff] ^
726 Te1[(s3 >> 8) & 0xff] ^
727 Te2[(s0 >> 16) & 0xff] ^
728 Te3[(s1 >> 24)] ^
729 rk[6];
730 t[3] = Te0[(s3) & 0xff] ^
731 Te1[(s0 >> 8) & 0xff] ^
732 Te2[(s1 >> 16) & 0xff] ^
733 Te3[(s2 >> 24)] ^
734 rk[7];
735#endif
736 s0 = t[0];
737 s1 = t[1];
738 s2 = t[2];
739 s3 = t[3];
740
741 /*
742 * Nr - 2 full rounds:
743 */
744 for (rk += 8, r = key->rounds - 2; r > 0; rk += 4, r--) {
745#if defined(AES_COMPACT_IN_INNER_ROUNDS)
746 t[0] = Te4[(s0) & 0xff] ^
747 Te4[(s1 >> 8) & 0xff] << 8 ^
748 Te4[(s2 >> 16) & 0xff] << 16 ^
749 Te4[(s3 >> 24)] << 24;
750 t[1] = Te4[(s1) & 0xff] ^
751 Te4[(s2 >> 8) & 0xff] << 8 ^
752 Te4[(s3 >> 16) & 0xff] << 16 ^
753 Te4[(s0 >> 24)] << 24;
754 t[2] = Te4[(s2) & 0xff] ^
755 Te4[(s3 >> 8) & 0xff] << 8 ^
756 Te4[(s0 >> 16) & 0xff] << 16 ^
757 Te4[(s1 >> 24)] << 24;
758 t[3] = Te4[(s3) & 0xff] ^
759 Te4[(s0 >> 8) & 0xff] << 8 ^
760 Te4[(s1 >> 16) & 0xff] << 16 ^
761 Te4[(s2 >> 24)] << 24;
762
763 /* now do the linear transform using words */
764 {
765 int i;
766 u32 r0, r1, r2;
767
768 for (i = 0; i < 4; i++) {
769 r0 = t[i];
770 r1 = r0 & 0x80808080;
771 r2 = ((r0 & 0x7f7f7f7f) << 1) ^
772 ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
773#if defined(ROTATE)
774 t[i] = r2 ^ ROTATE(r2, 24) ^ ROTATE(r0, 24) ^
775 ROTATE(r0, 16) ^ ROTATE(r0, 8);
776#else
777 t[i] = r2 ^ ((r2 ^ r0) << 24) ^
778 ((r2 ^ r0) >> 8) ^
779 (r0 << 16) ^ (r0 >> 16) ^
780 (r0 << 8) ^ (r0 >> 24);
781#endif
782 t[i] ^= rk[i];
783 }
784 }
785#else
786 t[0] = Te0[(s0) & 0xff] ^
787 Te1[(s1 >> 8) & 0xff] ^
788 Te2[(s2 >> 16) & 0xff] ^
789 Te3[(s3 >> 24)] ^
790 rk[0];
791 t[1] = Te0[(s1) & 0xff] ^
792 Te1[(s2 >> 8) & 0xff] ^
793 Te2[(s3 >> 16) & 0xff] ^
794 Te3[(s0 >> 24)] ^
795 rk[1];
796 t[2] = Te0[(s2) & 0xff] ^
797 Te1[(s3 >> 8) & 0xff] ^
798 Te2[(s0 >> 16) & 0xff] ^
799 Te3[(s1 >> 24)] ^
800 rk[2];
801 t[3] = Te0[(s3) & 0xff] ^
802 Te1[(s0 >> 8) & 0xff] ^
803 Te2[(s1 >> 16) & 0xff] ^
804 Te3[(s2 >> 24)] ^
805 rk[3];
806#endif
807 s0 = t[0];
808 s1 = t[1];
809 s2 = t[2];
810 s3 = t[3];
811 }
812 /*
813 * apply last round and
814 * map cipher state to byte array block:
815 */
816#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
817 prefetch256(Te4);
818
819 *(u32*)(out + 0) =
820 Te4[(s0) & 0xff] ^
821 Te4[(s1 >> 8) & 0xff] << 8 ^
822 Te4[(s2 >> 16) & 0xff] << 16 ^
823 Te4[(s3 >> 24)] << 24 ^
824 rk[0];
825 *(u32*)(out + 4) =
826 Te4[(s1) & 0xff] ^
827 Te4[(s2 >> 8) & 0xff] << 8 ^
828 Te4[(s3 >> 16) & 0xff] << 16 ^
829 Te4[(s0 >> 24)] << 24 ^
830 rk[1];
831 *(u32*)(out + 8) =
832 Te4[(s2) & 0xff] ^
833 Te4[(s3 >> 8) & 0xff] << 8 ^
834 Te4[(s0 >> 16) & 0xff] << 16 ^
835 Te4[(s1 >> 24)] << 24 ^
836 rk[2];
837 *(u32*)(out + 12) =
838 Te4[(s3) & 0xff] ^
839 Te4[(s0 >> 8) & 0xff] << 8 ^
840 Te4[(s1 >> 16) & 0xff] << 16 ^
841 Te4[(s2 >> 24)] << 24 ^
842 rk[3];
843#else
844 *(u32*)(out + 0) =
845 (Te2[(s0) & 0xff] & 0x000000ffU) ^
846 (Te3[(s1 >> 8) & 0xff] & 0x0000ff00U) ^
847 (Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
848 (Te1[(s3 >> 24)] & 0xff000000U) ^
849 rk[0];
850 *(u32*)(out + 4) =
851 (Te2[(s1) & 0xff] & 0x000000ffU) ^
852 (Te3[(s2 >> 8) & 0xff] & 0x0000ff00U) ^
853 (Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
854 (Te1[(s0 >> 24)] & 0xff000000U) ^
855 rk[1];
856 *(u32*)(out + 8) =
857 (Te2[(s2) & 0xff] & 0x000000ffU) ^
858 (Te3[(s3 >> 8) & 0xff] & 0x0000ff00U) ^
859 (Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
860 (Te1[(s1 >> 24)] & 0xff000000U) ^
861 rk[2];
862 *(u32*)(out + 12) =
863 (Te2[(s3) & 0xff] & 0x000000ffU) ^
864 (Te3[(s0 >> 8) & 0xff] & 0x0000ff00U) ^
865 (Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
866 (Te1[(s2 >> 24)] & 0xff000000U) ^
867 rk[3];
868#endif
869}
870
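prefetch256() used above is defined earlier in the file; the idea is to
touch every cache line of a 256-byte table so that the timing of later
S-box lookups does not depend on the data. A rough sketch of the idea
(the 32-byte stride here is an assumption; the real definition may
differ):

/* Read one byte per assumed 32-byte cache line of a 256-byte table. */
static void
prefetch256_sketch(const volatile unsigned char *table)
{
	unsigned char sum = 0;
	int i;

	for (i = 0; i < 256; i += 32)
		sum ^= table[i];
	(void)sum;	/* silence unused-variable warnings */
}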
871/*
872 * Decrypt a single block
873 * in and out can overlap
874 */
875void
876AES_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
877{
878 const u32 *rk;
879 u32 s0, s1, s2, s3, t[4];
880 int r;
881
882 rk = key->rd_key;
883
884 /*
885 * map byte array block to cipher state
886 * and add initial round key:
887 */
888 s0 = GETU32(in) ^ rk[0];
889 s1 = GETU32(in + 4) ^ rk[1];
890 s2 = GETU32(in + 8) ^ rk[2];
891 s3 = GETU32(in + 12) ^ rk[3];
892
893#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
894 prefetch256(Td4);
895
896 t[0] = Td4[(s0) & 0xff] ^
897 Td4[(s3 >> 8) & 0xff] << 8 ^
898 Td4[(s2 >> 16) & 0xff] << 16 ^
899 Td4[(s1 >> 24)] << 24;
900 t[1] = Td4[(s1) & 0xff] ^
901 Td4[(s0 >> 8) & 0xff] << 8 ^
902 Td4[(s3 >> 16) & 0xff] << 16 ^
903 Td4[(s2 >> 24)] << 24;
904 t[2] = Td4[(s2) & 0xff] ^
905 Td4[(s1 >> 8) & 0xff] << 8 ^
906 Td4[(s0 >> 16) & 0xff] << 16 ^
907 Td4[(s3 >> 24)] << 24;
908 t[3] = Td4[(s3) & 0xff] ^
909 Td4[(s2 >> 8) & 0xff] << 8 ^
910 Td4[(s1 >> 16) & 0xff] << 16 ^
911 Td4[(s0 >> 24)] << 24;
912
913 /* now do the linear transform using words */
914 {
915 int i;
916 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
917
918 for (i = 0; i < 4; i++) {
919 tp1 = t[i];
920 m = tp1 & 0x80808080;
921 tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
922 ((m - (m >> 7)) & 0x1b1b1b1b);
923 m = tp2 & 0x80808080;
924 tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
925 ((m - (m >> 7)) & 0x1b1b1b1b);
926 m = tp4 & 0x80808080;
927 tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
928 ((m - (m >> 7)) & 0x1b1b1b1b);
929 tp9 = tp8 ^ tp1;
930 tpb = tp9 ^ tp2;
931 tpd = tp9 ^ tp4;
932 tpe = tp8 ^ tp4 ^ tp2;
933#if defined(ROTATE)
934 t[i] = tpe ^ ROTATE(tpd, 16) ^
935 ROTATE(tp9, 8) ^ ROTATE(tpb, 24);
936#else
937 t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
938 (tp9 >> 24) ^ (tp9 << 8) ^ (tpb >> 8) ^ (tpb << 24);
939#endif
940 t[i] ^= rk[4 + i];
941 }
942 }
943#else
944 t[0] = Td0[(s0) & 0xff] ^
945 Td1[(s3 >> 8) & 0xff] ^
946 Td2[(s2 >> 16) & 0xff] ^
947 Td3[(s1 >> 24)] ^
948 rk[4];
949 t[1] = Td0[(s1) & 0xff] ^
950 Td1[(s0 >> 8) & 0xff] ^
951 Td2[(s3 >> 16) & 0xff] ^
952 Td3[(s2 >> 24)] ^
953 rk[5];
954 t[2] = Td0[(s2) & 0xff] ^
955 Td1[(s1 >> 8) & 0xff] ^
956 Td2[(s0 >> 16) & 0xff] ^
957 Td3[(s3 >> 24)] ^
958 rk[6];
959 t[3] = Td0[(s3) & 0xff] ^
960 Td1[(s2 >> 8) & 0xff] ^
961 Td2[(s1 >> 16) & 0xff] ^
962 Td3[(s0 >> 24)] ^
963 rk[7];
964#endif
965 s0 = t[0];
966 s1 = t[1];
967 s2 = t[2];
968 s3 = t[3];
969
970 /*
971 * Nr - 2 full rounds:
972 */
973 for (rk += 8, r = key->rounds - 2; r > 0; rk += 4, r--) {
974#if defined(AES_COMPACT_IN_INNER_ROUNDS)
975 t[0] = Td4[(s0) & 0xff] ^
976 Td4[(s3 >> 8) & 0xff] << 8 ^
977 Td4[(s2 >> 16) & 0xff] << 16 ^
978 Td4[(s1 >> 24)] << 24;
979 t[1] = Td4[(s1) & 0xff] ^
980 Td4[(s0 >> 8) & 0xff] << 8 ^
981 Td4[(s3 >> 16) & 0xff] << 16 ^
982 Td4[(s2 >> 24)] << 24;
983 t[2] = Td4[(s2) & 0xff] ^
984 Td4[(s1 >> 8) & 0xff] << 8 ^
985 Td4[(s0 >> 16) & 0xff] << 16 ^
986 Td4[(s3 >> 24)] << 24;
987 t[3] = Td4[(s3) & 0xff] ^
988 Td4[(s2 >> 8) & 0xff] << 8 ^
989 Td4[(s1 >> 16) & 0xff] << 16 ^
990 Td4[(s0 >> 24)] << 24;
991
992 /* now do the linear transform using words */
993 {
994 int i;
995 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
996
997 for (i = 0; i < 4; i++) {
998 tp1 = t[i];
999 m = tp1 & 0x80808080;
1000 tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
1001 ((m - (m >> 7)) & 0x1b1b1b1b);
1002 m = tp2 & 0x80808080;
1003 tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
1004 ((m - (m >> 7)) & 0x1b1b1b1b);
1005 m = tp4 & 0x80808080;
1006 tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
1007 ((m - (m >> 7)) & 0x1b1b1b1b);
1008 tp9 = tp8 ^ tp1;
1009 tpb = tp9 ^ tp2;
1010 tpd = tp9 ^ tp4;
1011 tpe = tp8 ^ tp4 ^ tp2;
1012#if defined(ROTATE)
1013 t[i] = tpe ^ ROTATE(tpd, 16) ^
1014 ROTATE(tp9, 8) ^ ROTATE(tpb, 24);
1015#else
1016 t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
1017 (tp9 >> 24) ^ (tp9 << 8) ^
1018 (tpb >> 8) ^ (tpb << 24);
1019#endif
1020 t[i] ^= rk[i];
1021 }
1022 }
1023#else
1024 t[0] = Td0[(s0) & 0xff] ^
1025 Td1[(s3 >> 8) & 0xff] ^
1026 Td2[(s2 >> 16) & 0xff] ^
1027 Td3[(s1 >> 24)] ^
1028 rk[0];
1029 t[1] = Td0[(s1) & 0xff] ^
1030 Td1[(s0 >> 8) & 0xff] ^
1031 Td2[(s3 >> 16) & 0xff] ^
1032 Td3[(s2 >> 24)] ^
1033 rk[1];
1034 t[2] = Td0[(s2) & 0xff] ^
1035 Td1[(s1 >> 8) & 0xff] ^
1036 Td2[(s0 >> 16) & 0xff] ^
1037 Td3[(s3 >> 24)] ^
1038 rk[2];
1039 t[3] = Td0[(s3) & 0xff] ^
1040 Td1[(s2 >> 8) & 0xff] ^
1041 Td2[(s1 >> 16) & 0xff] ^
1042 Td3[(s0 >> 24)] ^
1043 rk[3];
1044#endif
1045 s0 = t[0];
1046 s1 = t[1];
1047 s2 = t[2];
1048 s3 = t[3];
1049 }
1050 /*
1051 * apply last round and
1052 * map cipher state to byte array block:
1053 */
1054 prefetch256(Td4);
1055
1056 *(u32*)(out + 0) =
1057 (Td4[(s0) & 0xff]) ^
1058 (Td4[(s3 >> 8) & 0xff] << 8) ^
1059 (Td4[(s2 >> 16) & 0xff] << 16) ^
1060 (Td4[(s1 >> 24)] << 24) ^
1061 rk[0];
1062 *(u32*)(out + 4) =
1063 (Td4[(s1) & 0xff]) ^
1064 (Td4[(s0 >> 8) & 0xff] << 8) ^
1065 (Td4[(s3 >> 16) & 0xff] << 16) ^
1066 (Td4[(s2 >> 24)] << 24) ^
1067 rk[1];
1068 *(u32*)(out + 8) =
1069 (Td4[(s2) & 0xff]) ^
1070 (Td4[(s1 >> 8) & 0xff] << 8) ^
1071 (Td4[(s0 >> 16) & 0xff] << 16) ^
1072 (Td4[(s3 >> 24)] << 24) ^
1073 rk[2];
1074 *(u32*)(out + 12) =
1075 (Td4[(s3) & 0xff]) ^
1076 (Td4[(s2 >> 8) & 0xff] << 8) ^
1077 (Td4[(s1 >> 16) & 0xff] << 16) ^
1078 (Td4[(s0 >> 24)] << 24) ^
1079 rk[3];
1080}
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl
deleted file mode 100644
index aab40e6f1c..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-586.pl
+++ /dev/null
@@ -1,2980 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Version 4.3.
11#
12# You might fail to appreciate this module's performance from the first
13# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
14# to be *the* best Intel C compiler without -KPIC, performance appears
15# to be virtually identical... But try to re-configure with shared
16# library support... Aha! Intel compiler "suddenly" lags behind by 30%
17# [on P4, more on others]:-) And if compared to position-independent
18# code generated by GNU C, this code performs *more* than *twice* as
19# fast! Yes, all this buzz about PIC means that unlike other hand-
20# coded implementations, this one was explicitly designed to be safe
21# to use even in shared library context... This also means that this
22# code isn't necessarily absolutely fastest "ever," because in order
23# to achieve position independence an extra register has to be
24# off-loaded to stack, which affects the benchmark result.
25#
26# Special note about instruction choice. Do you recall RC4_INT code
27# performing poorly on P4? It might be time to figure out why.
28# RC4_INT code implies effective address calculations in base+offset*4
29# form. Trouble is that it seems that offset scaling turned out to be
30# critical path... At least eliminating scaling resulted in 2.8x RC4
31# performance improvement [as you might recall]. As AES code is hungry
32# for scaling too, I [try to] avoid the latter by favoring off-by-2
33# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
34#
35# As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
36# out void. The performance improvement with off-by-2 shifts was observed
37# on an intermediate implementation, which was spilling yet another register
38# to stack... Final offset*4 code below runs just a tad faster on P4,
39# but exhibits up to 10% improvement on other cores.
40#
41# Second version is "monolithic" replacement for aes_core.c, which in
42# addition to AES_[de|en]crypt implements AES_set_[de|en]crypt_key.
43# This made it possible to implement little-endian variant of the
44# algorithm without modifying the base C code. Motivating factor for
45# the undertaken effort was that it appeared that in tight IA-32
46# register window little-endian flavor could achieve slightly higher
47# Instruction Level Parallelism, and it indeed resulted in up to 15%
48# better performance on most recent µ-archs...
49#
50# Third version adds AES_cbc_encrypt implementation, which resulted in
51# up to 40% performance improvement of CBC benchmark results. 40% was
52# observed on the P4 core, where the "overall" improvement coefficient, i.e.
53# compared to PIC code generated by GCC in CBC mode, was observed to be
54# as large as 4x:-) CBC performance is virtually identical to ECB now
55# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
56# Opteron, because certain function prologues and epilogues are
57# effectively taken out of the loop...
58#
59# Version 3.2 implements compressed tables and prefetch of these tables
60# in CBC[!] mode. The former means that 3/4 of table references are now
61# misaligned, which unfortunately has a negative impact on older IA-32
62# implementations: Pentium suffered a 30% penalty, PIII 10%.
63#
64# Version 3.3 avoids L1 cache aliasing between the stack frame and
65# S-boxes, and 3.4 avoids L1 cache aliasing even with the key schedule.
66# The latter is achieved by copying the key schedule to a controlled
67# place on the stack. This unfortunately has a rather strong impact on
68# small-block CBC performance: ~2x deterioration on 16-byte blocks compared to 3.3.
69#
70# Version 3.5 checks if there is L1 cache aliasing between the user-
71# supplied key schedule and the S-boxes, and abstains from copying the
72# former if there is none. This allows the end user to consciously retain
73# small-block performance by aligning the key schedule in a specific manner.
74#
75# Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
76#
77# Current ECB performance numbers for 128-bit key in CPU cycles per
78# processed byte [measure commonly used by AES benchmarkers] are:
79#
80# small footprint fully unrolled
81# P4 24 22
82# AMD K8 20 19
83# PIII 25 23
84# Pentium 81 78
85#
86# Version 3.7 reimplements the outer rounds as "compact," meaning that
87# the first and last rounds reference the compact 256-byte S-box. This
88# means that the first round consumes a lot more CPU cycles and that
89# encrypt and decrypt performance becomes asymmetric. Encrypt performance
90# drops by 10-12%, while decrypt drops by 20-25%:-( The 256-byte S-box is
91# aggressively prefetched.
92#
93# Version 4.0 effectively rolls back to 3.6 and instead implements an
94# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
95# which use the 256-byte S-box exclusively. These functions are to be
96# called in modes not concealing plain text, such as ECB, or when
97# we're asked to process a smaller amount of data [or unconditionally
98# on hyper-threading CPUs]. Currently they are called unconditionally from
99# AES_[en|de]crypt, which affects all modes but CBC. The CBC routine
100# still needs to be modified to switch between the slower and faster
101# modes when appropriate... But in either case the benchmark landscape
102# changes dramatically and the numbers below are CPU cycles per processed
103# byte for a 128-bit key.
104#
105# ECB encrypt ECB decrypt CBC large chunk
106# P4 56[60] 84[100] 23
107# AMD K8 48[44] 70[79] 18
108# PIII 41[50] 61[91] 24
109# Core 2 32[38] 45[70] 18.5
110# Pentium 120 160 77
111#
112# Version 4.1 switches to compact S-box even in key schedule setup.
113#
114# Version 4.2 prefetches the compact S-box in every SSE round; in other
115# words, every cache line is *guaranteed* to be accessed within a ~50-
116# cycle window. Why just SSE? Because it's needed on hyper-threading
117# CPUs! Which is also why it's prefetched with a 64-byte stride. The best
118# part is that it has no negative effect on performance:-)
119#
120# Version 4.3 implements a switch between the compact and non-compact block
121# functions in AES_cbc_encrypt depending on how much data was asked
122# to be processed in one stroke.
123#
124######################################################################
125# Timing attacks are classified in two classes: synchronous, when the
126# attacker consciously initiates a cryptographic operation and collects
127# timing data of various kinds afterwards, and asynchronous, when
128# malicious code is executed on the same CPU simultaneously with AES,
129# instruments itself and performs statistical analysis of this data.
130#
131# As far as synchronous attacks go, the root of the AES timing
132# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
133# are referred to in a single 128-bit block operation. Well, in a C
134# implementation with 4 distinct tables it's actually as little as 40
135# references per 256-element table, but anyway... Secondly, even
136# though S-box elements are clustered into a smaller number of cache
137# lines, smaller than 160 and even 40, it turned out that for certain
138# plain-text pattern[s], or simply put for chosen plain text and a given
139# key, a few cache lines remain unaccessed during the block operation.
140# Now, if the attacker can figure out this access pattern, he can deduce
141# the key [or at least part of it]. The natural way to mitigate this kind
142# of attack is to minimize the number of cache lines in the S-box and/or
143# prefetch them to ensure that every one is accessed, for more uniform
144# timing. But note that *if* the plain text was concealed in such a way
145# that the input to the block function is distributed *uniformly*, then
146# the attack wouldn't apply. Now note that some encryption modes, most
147# notably CBC, do mask the plain text in this exact way [secure cipher
148# output is distributed uniformly]. Yes, one still might find input that
149# would reveal information about a given key, but if the number of
150# candidate inputs to be tried is larger than the number of possible key
151# combinations then the attack becomes infeasible. This is why the revised
152# AES_cbc_encrypt "dares" to switch to the larger S-box when a larger chunk
153# of data is to be processed in one stroke. The current size limit of
154# 512 bytes is chosen to provide the same [diminishingly low] probability
155# for a cache line to remain untouched in a large chunk operation with
156# the large S-box as for a single block operation with the compact S-box,
157# and surely needs more careful consideration...
158#
159# As for asynchronous attacks, there are two flavours: attacker code
160# being interleaved with AES on a hyper-threading CPU at *instruction*
161# level, and two processes time-sharing a single core. For the latter
162# there are two vectors. 1. Given that the attacker process has higher
163# priority, yield execution to the process performing AES just before
164# the timer fires off the scheduler, immediately regain control of the
165# CPU and analyze the cache state. For this attack to be efficient the
166# attacker would have to effectively slow down the operation by several
167# *orders* of magnitude, by the ratio of the time slice to the duration
168# of a handful of AES rounds, which is unlikely to remain unnoticed. Not
169# to mention that this also means he would spend correspondingly more
170# time collecting enough statistical data to mount the attack. It's
171# probably appropriate to say that if an adversary reckons this attack
172# is beneficial and risks being noticed, you have larger problems than
173# his mere opportunity. In other words, the suggested code design expects
174# you to preclude/mitigate this attack by overall system security design.
175# 2. The attacker manages to make his code interrupt-driven. For this
176# kind of attack to be feasible, the interrupt rate has to be high
177# enough, again comparable to the duration of a handful of AES rounds.
178# But is there an interrupt source of such rate? Hardly; not even a
179# 1Gbps NIC generates interrupts at such a raging rate...
180#
181# And now back to the former: a hyper-threading CPU, or more specifically
182# Intel P4. Recall that an asynchronous attack implies that the malicious
183# code instruments itself, and naturally the instrumentation granularity
184# has to be noticeably finer than the duration of the codepath accessing
185# the S-box; given, that is, that all cache lines are accessed during
186# that time. The current implementation accesses *all* cache lines within
187# a ~50-cycle window, which is actually *less* than RDTSC latency on Intel P4!
188
189$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
190push(@INC,"${dir}","${dir}../../perlasm");
191require "x86asm.pl";
192
193&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
194&static_label("AES_Te");
195&static_label("AES_Td");
196
197$s0="eax";
198$s1="ebx";
199$s2="ecx";
200$s3="edx";
201$key="edi";
202$acc="esi";
203$tbl="ebp";
204
205# stack frame layout in _[x86|sse]_AES_* routines; the frame is
206# allocated by the caller
207$__ra=&DWP(0,"esp"); # return address
208$__s0=&DWP(4,"esp"); # s0 backing store
209$__s1=&DWP(8,"esp"); # s1 backing store
210$__s2=&DWP(12,"esp"); # s2 backing store
211$__s3=&DWP(16,"esp"); # s3 backing store
212$__key=&DWP(20,"esp"); # pointer to key schedule
213$__end=&DWP(24,"esp"); # pointer to end of key schedule
214$__tbl=&DWP(28,"esp"); # %ebp backing store
215
216# stack frame layout in AES_[en|de]crypt routines, which differs from the
217# above by 4 and overlaps with the %ebp backing store
218$_tbl=&DWP(24,"esp");
219$_esp=&DWP(28,"esp");
220
221sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } # emit each word twice [8-byte table stride]
222
223$speed_limit=512; # chunks smaller than $speed_limit are
224 # processed with compact routine in CBC mode
225$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
226 # recent µ-archs], but ~5 times smaller!
227 # I favor compact code to minimize cache
228			# contention and in the hope of "collecting"
229			# the 5% back in real-life applications...
230
231$vertical_spin=0;	# "vertical" shift defaults to 0, because of
232 # its proof-of-concept status...
233# Note that there is no decvert(), and that the last encryption round is
234# performed with "horizontal" shifts. This is because this "vertical"
235# implementation [one which groups shifts on a given $s[i] to form a
236# "column," unlike the "horizontal" one, which groups shifts on different
237# $s[i] to form a "row"] is work in progress. It was observed to run a
238# few percent faster on Intel cores, but not on AMD. On the AMD K8 core
239# it's a whole 12% slower:-( So we face a trade-off... Shall it be
240# resolved some day? Till then the code is considered experimental and
241# by default remains dormant...
242
243sub encvert()
244{ my ($te,@s) = @_;
245	my ($v0, $v1) = ($acc, $key);
246
247 &mov ($v0,$s[3]); # copy s3
248 &mov (&DWP(4,"esp"),$s[2]); # save s2
249 &mov ($v1,$s[0]); # copy s0
250 &mov (&DWP(8,"esp"),$s[1]); # save s1
251
252 &movz ($s[2],&HB($s[0]));
253 &and ($s[0],0xFF);
254 &mov ($s[0],&DWP(0,$te,$s[0],8)); # s0>>0
255 &shr ($v1,16);
256 &mov ($s[3],&DWP(3,$te,$s[2],8)); # s0>>8
257 &movz ($s[1],&HB($v1));
258 &and ($v1,0xFF);
259 &mov ($s[2],&DWP(2,$te,$v1,8)); # s0>>16
260 &mov ($v1,$v0);
261 &mov ($s[1],&DWP(1,$te,$s[1],8)); # s0>>24
262
263 &and ($v0,0xFF);
264 &xor ($s[3],&DWP(0,$te,$v0,8)); # s3>>0
265 &movz ($v0,&HB($v1));
266 &shr ($v1,16);
267 &xor ($s[2],&DWP(3,$te,$v0,8)); # s3>>8
268 &movz ($v0,&HB($v1));
269 &and ($v1,0xFF);
270 &xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16
271 &mov ($v1,&DWP(4,"esp")); # restore s2
272 &xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24
273
274 &mov ($v0,$v1);
275 &and ($v1,0xFF);
276 &xor ($s[2],&DWP(0,$te,$v1,8)); # s2>>0
277 &movz ($v1,&HB($v0));
278 &shr ($v0,16);
279 &xor ($s[1],&DWP(3,$te,$v1,8)); # s2>>8
280 &movz ($v1,&HB($v0));
281 &and ($v0,0xFF);
282 &xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16
283 &mov ($v0,&DWP(8,"esp")); # restore s1
284 &xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24
285
286 &mov ($v1,$v0);
287 &and ($v0,0xFF);
288 &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0
289 &movz ($v0,&HB($v1));
290 &shr ($v1,16);
291 &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8
292 &movz ($v0,&HB($v1));
293 &and ($v1,0xFF);
294 &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
295 &mov ($key,$__key); # reincarnate v1 as key
296 &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
297}
298
299# Another experimental routine, which features "horizontal spin," but
300# eliminates one reference to the stack. Strangely enough, it runs slower...
301sub enchoriz()
302{ my ($v0, $v1) = ($key, $acc);
303
304 &movz ($v0,&LB($s0)); # 3, 2, 1, 0*
305 &rotr ($s2,8); # 8,11,10, 9
306 &mov ($v1,&DWP(0,$te,$v0,8)); # 0
307 &movz ($v0,&HB($s1)); # 7, 6, 5*, 4
308 &rotr ($s3,16); # 13,12,15,14
309 &xor ($v1,&DWP(3,$te,$v0,8)); # 5
310 &movz ($v0,&HB($s2)); # 8,11,10*, 9
311 &rotr ($s0,16); # 1, 0, 3, 2
312 &xor ($v1,&DWP(2,$te,$v0,8)); # 10
313 &movz ($v0,&HB($s3)); # 13,12,15*,14
314 &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected
315 &mov ($__s0,$v1); # t[0] saved
316
317 &movz ($v0,&LB($s1)); # 7, 6, 5, 4*
318 &shr ($s1,16); # -, -, 7, 6
319 &mov ($v1,&DWP(0,$te,$v0,8)); # 4
320 &movz ($v0,&LB($s3)); # 13,12,15,14*
321 &xor ($v1,&DWP(2,$te,$v0,8)); # 14
322 &movz ($v0,&HB($s0)); # 1, 0, 3*, 2
323 &and ($s3,0xffff0000); # 13,12, -, -
324 &xor ($v1,&DWP(1,$te,$v0,8)); # 3
325 &movz ($v0,&LB($s2)); # 8,11,10, 9*
326 &or ($s3,$s1); # 13,12, 7, 6
327 &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected
328 &mov ($s1,$v1); # s[1]=t[1]
329
330 &movz ($v0,&LB($s0)); # 1, 0, 3, 2*
331 &shr ($s2,16); # -, -, 8,11
332 &mov ($v1,&DWP(2,$te,$v0,8)); # 2
333 &movz ($v0,&HB($s3)); # 13,12, 7*, 6
334 &xor ($v1,&DWP(1,$te,$v0,8)); # 7
335 &movz ($v0,&HB($s2)); # -, -, 8*,11
336 &xor ($v1,&DWP(0,$te,$v0,8)); # 8
337 &mov ($v0,$s3);
338 &shr ($v0,24); # 13
339 &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected
340
341 &movz ($v0,&LB($s2)); # -, -, 8,11*
342 &shr ($s0,24); # 1*
343 &mov ($s2,&DWP(1,$te,$v0,8)); # 11
344 &xor ($s2,&DWP(3,$te,$s0,8)); # 1
345 &mov ($s0,$__s0); # s[0]=t[0]
346 &movz ($v0,&LB($s3)); # 13,12, 7, 6*
347 &shr ($s3,16); # , ,13,12
348 &xor ($s2,&DWP(2,$te,$v0,8)); # 6
349 &mov ($key,$__key); # reincarnate v0 as key
350 &and ($s3,0xff); # , ,13,12*
351 &mov ($s3,&DWP(0,$te,$s3,8)); # 12
352 &xor ($s3,$s2); # s[2]=t[3] collected
353 &mov ($s2,$v1); # s[2]=t[2]
354}
355
356# More experimental code... SSE one... Even though this one eliminates
357# *all* references to the stack, it's not faster...
358sub sse_encbody()
359{
360 &movz ($acc,&LB("eax")); # 0
361 &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0
362 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
363 &movz ("edx",&HB("eax")); # 1
364 &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1
365 &shr ("eax",16); # 5, 4
366
367 &movz ($acc,&LB("ebx")); # 10
368 &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10
369 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
370 &movz ($acc,&HB("ebx")); # 11
371 &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11
372 &shr ("ebx",16); # 15,14
373
374 &movz ($acc,&HB("eax")); # 5
375 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5
376 &movq ("mm3",QWP(16,$key));
377 &movz ($acc,&HB("ebx")); # 15
378 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15
379 &movd ("mm0","ecx"); # t[0] collected
380
381 &movz ($acc,&LB("eax")); # 4
382 &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4
383 &movd ("eax","mm2"); # 7, 6, 3, 2
384 &movz ($acc,&LB("ebx")); # 14
385 &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14
386 &movd ("ebx","mm6"); # 13,12, 9, 8
387
388 &movz ($acc,&HB("eax")); # 3
389 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3
390 &movz ($acc,&HB("ebx")); # 9
391 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9
392 &movd ("mm1","ecx"); # t[1] collected
393
394 &movz ($acc,&LB("eax")); # 2
395 &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2
396 &shr ("eax",16); # 7, 6
397 &punpckldq ("mm0","mm1"); # t[0,1] collected
398 &movz ($acc,&LB("ebx")); # 8
399 &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8
400 &shr ("ebx",16); # 13,12
401
402 &movz ($acc,&HB("eax")); # 7
403 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7
404 &pxor ("mm0","mm3");
405 &movz ("eax",&LB("eax")); # 6
406 &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6
407 &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
408 &movz ($acc,&HB("ebx")); # 13
409 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13
410 &xor ("ecx",&DWP(24,$key)); # t[2]
411 &movd ("mm4","ecx"); # t[2] collected
412 &movz ("ebx",&LB("ebx")); # 12
413 &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12
414 &shr ("ecx",16);
415 &movd ("eax","mm1"); # 5, 4, 1, 0
416 &mov ("ebx",&DWP(28,$key)); # t[3]
417 &xor ("ebx","edx");
418 &movd ("mm5","ebx"); # t[3] collected
419 &and ("ebx",0xffff0000);
420 &or ("ebx","ecx");
421
422 &punpckldq ("mm4","mm5"); # t[2,3] collected
423}
424
425######################################################################
426# "Compact" block function
427######################################################################
428
429sub enccompact()
430{ my $Fn = \&mov;
431 while ($#_>5) { pop(@_); $Fn=sub{}; }
432 my ($i,$te,@s)=@_;
433 my $tmp = $key;
434 my $out = $i==3?$s[0]:$acc;
435
436	# $Fn is used in the first compact round and its purpose is to
437	# void the restoration of some values from the stack, so that after
438	# 4x enccompact with the extra argument the $key value is left there...
439 if ($i==3) { &$Fn ($key,$__key); }##%edx
440 else { &mov ($out,$s[0]); }
441 &and ($out,0xFF);
442 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
443 if ($i==2) { &shr ($s[0],24); }#%ecx[2]
444 &movz ($out,&BP(-128,$te,$out,1));
445
446 if ($i==3) { $tmp=$s[1]; }##%eax
447 &movz ($tmp,&HB($s[1]));
448 &movz ($tmp,&BP(-128,$te,$tmp,1));
449 &shl ($tmp,8);
450 &xor ($out,$tmp);
451
452 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
453 else { &mov ($tmp,$s[2]);
454 &shr ($tmp,16); }
455 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
456 &and ($tmp,0xFF);
457 &movz ($tmp,&BP(-128,$te,$tmp,1));
458 &shl ($tmp,16);
459 &xor ($out,$tmp);
460
461 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
462 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
463 else { &mov ($tmp,$s[3]);
464 &shr ($tmp,24); }
465 &movz ($tmp,&BP(-128,$te,$tmp,1));
466 &shl ($tmp,24);
467 &xor ($out,$tmp);
468 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
469 if ($i==3) { &mov ($s[3],$acc); }
470 &comment();
471}
472
473sub enctransform()
474{ my @s = ($s0,$s1,$s2,$s3);
475 my $i = shift;
476 my $tmp = $tbl;
477 my $r2 = $key ;
478
479 &mov ($acc,$s[$i]);
480 &and ($acc,0x80808080);
481 &mov ($tmp,$acc);
482 &shr ($tmp,7);
483 &lea ($r2,&DWP(0,$s[$i],$s[$i]));
484 &sub ($acc,$tmp);
485 &and ($r2,0xfefefefe);
486 &and ($acc,0x1b1b1b1b);
487 &mov ($tmp,$s[$i]);
488 &xor ($acc,$r2); # r2
489
490 &xor ($s[$i],$acc); # r0 ^ r2
491 &rotl ($s[$i],24);
492	&xor	($s[$i],$acc);		# ROTATE(r2^r0,24) ^ r2
493 &rotr ($tmp,16);
494 &xor ($s[$i],$tmp);
495 &rotr ($tmp,8);
496 &xor ($s[$i],$tmp);
497}
498
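enctransform() is the assembly counterpart of the per-word MixColumns
transform in the compact C path of aes_x86core.c earlier in this diff:
r2 = xtime(r0), then rotated copies are combined. The same computation in
C (a sketch; ROTATE is a left rotation, matching the non-ROTATE fallback
branch in the C file):

#include <stdint.h>

#define ROTATE(a, n) (((a) << (n)) | ((a) >> (32 - (n))))

/* MixColumns on one state word, as enctransform() computes it. */
static uint32_t
enctransform_word(uint32_t r0)
{
	uint32_t m = r0 & 0x80808080;
	uint32_t r2 = ((r0 & 0x7f7f7f7f) << 1) ^
	    ((m - (m >> 7)) & 0x1b1b1b1b);

	return r2 ^ ROTATE(r2 ^ r0, 24) ^ ROTATE(r0, 16) ^ ROTATE(r0, 8);
}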
499&function_begin_B("_x86_AES_encrypt_compact");
500 # note that caller is expected to allocate stack frame for me!
501 &mov ($__key,$key); # save key
502
503 &xor ($s0,&DWP(0,$key)); # xor with key
504 &xor ($s1,&DWP(4,$key));
505 &xor ($s2,&DWP(8,$key));
506 &xor ($s3,&DWP(12,$key));
507
508 &mov ($acc,&DWP(240,$key)); # load key->rounds
509	&lea	($acc,&DWP(-2,$acc,$acc));	# acc = 2*rounds - 2
510	&lea	($acc,&DWP(0,$key,$acc,8));	# acc = key + (rounds-1)*16
511 &mov ($__end,$acc); # end of key schedule
512
513 # prefetch Te4
514 &mov ($key,&DWP(0-128,$tbl));
515 &mov ($acc,&DWP(32-128,$tbl));
516 &mov ($key,&DWP(64-128,$tbl));
517 &mov ($acc,&DWP(96-128,$tbl));
518 &mov ($key,&DWP(128-128,$tbl));
519 &mov ($acc,&DWP(160-128,$tbl));
520 &mov ($key,&DWP(192-128,$tbl));
521 &mov ($acc,&DWP(224-128,$tbl));
522
523 &set_label("loop",16);
524
525 &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
526 &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
527 &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
528 &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
529 &enctransform(2);
530 &enctransform(3);
531 &enctransform(0);
532 &enctransform(1);
533 &mov ($key,$__key);
534 &mov ($tbl,$__tbl);
535 &add ($key,16); # advance rd_key
536 &xor ($s0,&DWP(0,$key));
537 &xor ($s1,&DWP(4,$key));
538 &xor ($s2,&DWP(8,$key));
539 &xor ($s3,&DWP(12,$key));
540
541 &cmp ($key,$__end);
542 &mov ($__key,$key);
543 &jb (&label("loop"));
544
545 &enccompact(0,$tbl,$s0,$s1,$s2,$s3);
546 &enccompact(1,$tbl,$s1,$s2,$s3,$s0);
547 &enccompact(2,$tbl,$s2,$s3,$s0,$s1);
548 &enccompact(3,$tbl,$s3,$s0,$s1,$s2);
549
550 &xor ($s0,&DWP(16,$key));
551 &xor ($s1,&DWP(20,$key));
552 &xor ($s2,&DWP(24,$key));
553 &xor ($s3,&DWP(28,$key));
554
555 &ret ();
556&function_end_B("_x86_AES_encrypt_compact");
557
558######################################################################
559# "Compact" SSE block function.
560######################################################################
561#
562# Performance is not actually extraordinary in comparison to pure
563# x86 code. In particular encrypt performance is virtually the same.
564# Decrypt performance on the other hand is 15-20% better on newer
565# µ-archs [but we're thankful for *any* improvement here], and ~50%
566# better on PIII:-) Additionally, on the plus side, this code
567# eliminates redundant references to the stack and thus relieves/
568# minimizes the pressure on the memory bus.
569#
570# MMX register layout lsb
571# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
572# | mm4 | mm0 |
573# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
574# | s3 | s2 | s1 | s0 |
575# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
576# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
577# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
578#
579# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
580# In these terms the encryption and decryption "compact" permutation
581# matrices can be depicted as follows:
582#
583# encryption lsb # decryption lsb
584# +----++----+----+----+----+ # +----++----+----+----+----+
585# | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
586# +----++----+----+----+----+ # +----++----+----+----+----+
587# | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
588# +----++----+----+----+----+ # +----++----+----+----+----+
589# | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
590# +----++----+----+----+----+ # +----++----+----+----+----+
591# | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
592# +----++----+----+----+----+ # +----++----+----+----+----+
593#
594######################################################################
595# Why not xmm registers? Short answer: it was actually tested and
596# was not any faster, but on the contrary slower, most notably on Intel CPUs.
597# Longer answer: the main advantage of using mm registers is that movd
598# latency is lower, especially on Intel P4. While the arithmetic
599# instructions are twice as many, they can be scheduled every cycle,
600# and not every second one as when they operate on xmm registers,
601# so that "arithmetic throughput" remains virtually the same. And
602# finally the code can be executed even on older SSE-only CPUs:-)
603
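To make the diagram concrete: index N selects s[N/4] >> (8*(N%4)), so for
example entry 14 in the encryption t1 row is (s3 >> 16) & 0xff. A tiny
helper expressing the translation (illustrative only):

#include <stdint.h>

/* Byte N of the 128-bit state, per the numbering in the diagram above. */
static unsigned char
state_byte(const uint32_t s[4], int n)
{
	return (s[n / 4] >> (8 * (n % 4))) & 0xff;
}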
604sub sse_enccompact()
605{
606 &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
607 &pshufw ("mm5","mm4",0x0d); # 15,14,11,10
608 &movd ("eax","mm1"); # 5, 4, 1, 0
609 &movd ("ebx","mm5"); # 15,14,11,10
610
611 &movz ($acc,&LB("eax")); # 0
612 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
613 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
614 &movz ("edx",&HB("eax")); # 1
615 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
616 &shl ("edx",8); # 1
617 &shr ("eax",16); # 5, 4
618
619 &movz ($acc,&LB("ebx")); # 10
620 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
621 &shl ($acc,16); # 10
622 &or ("ecx",$acc); # 10
623 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
624 &movz ($acc,&HB("ebx")); # 11
625 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
626 &shl ($acc,24); # 11
627 &or ("edx",$acc); # 11
628 &shr ("ebx",16); # 15,14
629
630 &movz ($acc,&HB("eax")); # 5
631 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
632 &shl ($acc,8); # 5
633 &or ("ecx",$acc); # 5
634 &movz ($acc,&HB("ebx")); # 15
635 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
636 &shl ($acc,24); # 15
637 &or ("ecx",$acc); # 15
638 &movd ("mm0","ecx"); # t[0] collected
639
640 &movz ($acc,&LB("eax")); # 4
641 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
642 &movd ("eax","mm2"); # 7, 6, 3, 2
643 &movz ($acc,&LB("ebx")); # 14
644 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
645 &shl ($acc,16); # 14
646 &or ("ecx",$acc); # 14
647
648 &movd ("ebx","mm6"); # 13,12, 9, 8
649 &movz ($acc,&HB("eax")); # 3
650 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
651 &shl ($acc,24); # 3
652 &or ("ecx",$acc); # 3
653 &movz ($acc,&HB("ebx")); # 9
654 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
655 &shl ($acc,8); # 9
656 &or ("ecx",$acc); # 9
657 &movd ("mm1","ecx"); # t[1] collected
658
659 &movz ($acc,&LB("ebx")); # 8
660 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
661 &shr ("ebx",16); # 13,12
662 &movz ($acc,&LB("eax")); # 2
663 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
664 &shl ($acc,16); # 2
665 &or ("ecx",$acc); # 2
666 &shr ("eax",16); # 7, 6
667
668 &punpckldq ("mm0","mm1"); # t[0,1] collected
669
670 &movz ($acc,&HB("eax")); # 7
671 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
672 &shl ($acc,24); # 7
673 &or ("ecx",$acc); # 7
674 &and ("eax",0xff); # 6
675 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
676 &shl ("eax",16); # 6
677 &or ("edx","eax"); # 6
678 &movz ($acc,&HB("ebx")); # 13
679 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
680 &shl ($acc,8); # 13
681 &or ("ecx",$acc); # 13
682 &movd ("mm4","ecx"); # t[2] collected
683 &and ("ebx",0xff); # 12
684 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
685 &or ("edx","ebx"); # 12
686 &movd ("mm5","edx"); # t[3] collected
687
688 &punpckldq ("mm4","mm5"); # t[2,3] collected
689}
690
691 if (!$x86only) {
692&function_begin_B("_sse_AES_encrypt_compact");
693 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
694 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
695
696 # note that caller is expected to allocate stack frame for me!
697 &mov ($acc,&DWP(240,$key)); # load key->rounds
698 &lea ($acc,&DWP(-2,$acc,$acc));
699 &lea ($acc,&DWP(0,$key,$acc,8));
700 &mov ($__end,$acc); # end of key schedule
701
702	&mov	($s0,0x1b1b1b1b);	# AES reduction polynomial byte 0x1b in every lane
703 &mov (&DWP(8,"esp"),$s0);
704 &mov (&DWP(12,"esp"),$s0);
705
706 # prefetch Te4
707 &mov ($s0,&DWP(0-128,$tbl));
708 &mov ($s1,&DWP(32-128,$tbl));
709 &mov ($s2,&DWP(64-128,$tbl));
710 &mov ($s3,&DWP(96-128,$tbl));
711 &mov ($s0,&DWP(128-128,$tbl));
712 &mov ($s1,&DWP(160-128,$tbl));
713 &mov ($s2,&DWP(192-128,$tbl));
714 &mov ($s3,&DWP(224-128,$tbl));
715
716 &set_label("loop",16);
717 &sse_enccompact();
718 &add ($key,16);
719 &cmp ($key,$__end);
720 &ja (&label("out"));
721
722 &movq ("mm2",&QWP(8,"esp"));
723 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
724 &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0
725 &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4");
726 &pand ("mm3","mm2"); &pand ("mm7","mm2");
727 &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
728 &paddb ("mm0","mm0"); &paddb ("mm4","mm4");
729 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2
730 &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0
731 &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2
732 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16)
733
734 &movq ("mm2","mm3"); &movq ("mm6","mm7");
735 &pslld ("mm3",8); &pslld ("mm7",8);
736 &psrld ("mm2",24); &psrld ("mm6",24);
737 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8
738 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24
739
740 &movq ("mm3","mm1"); &movq ("mm7","mm5");
741 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
742 &psrld ("mm1",8); &psrld ("mm5",8);
743 &mov ($s0,&DWP(0-128,$tbl));
744 &pslld ("mm3",24); &pslld ("mm7",24);
745 &mov ($s1,&DWP(64-128,$tbl));
746 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8
747 &mov ($s2,&DWP(128-128,$tbl));
748 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24
749 &mov ($s3,&DWP(192-128,$tbl));
750
751 &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
752 &jmp (&label("loop"));
753
754 &set_label("out",16);
755 &pxor ("mm0",&QWP(0,$key));
756 &pxor ("mm4",&QWP(8,$key));
757
758 &ret ();
759&function_end_B("_sse_AES_encrypt_compact");
760 }
761
762######################################################################
763# Vanilla block function.
764######################################################################
765
766sub encstep()
767{ my ($i,$te,@s) = @_;
768 my $tmp = $key;
769 my $out = $i==3?$s[0]:$acc;
770
771 # lines marked with #%e?x[i] denote "reordered" instructions...
772 if ($i==3) { &mov ($key,$__key); }##%edx
773 else { &mov ($out,$s[0]);
774 &and ($out,0xFF); }
775 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
776 if ($i==2) { &shr ($s[0],24); }#%ecx[2]
777 &mov ($out,&DWP(0,$te,$out,8));
778
779 if ($i==3) { $tmp=$s[1]; }##%eax
780 &movz ($tmp,&HB($s[1]));
781 &xor ($out,&DWP(3,$te,$tmp,8));
782
783 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
784 else { &mov ($tmp,$s[2]);
785 &shr ($tmp,16); }
786 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
787 &and ($tmp,0xFF);
788 &xor ($out,&DWP(2,$te,$tmp,8));
789
790 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
791 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
792 else { &mov ($tmp,$s[3]);
793 &shr ($tmp,24) }
794 &xor ($out,&DWP(1,$te,$tmp,8));
795 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
796 if ($i==3) { &mov ($s[3],$acc); }
797 &comment();
798}
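# For reference: _data_word further down stores every table value twice,
# so with the 8-byte index scale a dword load at byte offset k (the 0..3
# displacements used above) returns the base word byte-rotated by k
# positions.  One 2KB table thus stands in for all four rotated tables
# Te0..Te3 of the reference code, and per column encstep computes
#	t = Te0[s0&0xff] ^ Te1[(s1>>8)&0xff] ^ Te2[(s2>>16)&0xff]
#	  ^ Te3[s3>>24]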
799
800sub enclast()
801{ my ($i,$te,@s)=@_;
802 my $tmp = $key;
803 my $out = $i==3?$s[0]:$acc;
804
805 if ($i==3) { &mov ($key,$__key); }##%edx
806 else { &mov ($out,$s[0]); }
807 &and ($out,0xFF);
808 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
809 if ($i==2) { &shr ($s[0],24); }#%ecx[2]
810 &mov ($out,&DWP(2,$te,$out,8));
811 &and ($out,0x000000ff);
812
813 if ($i==3) { $tmp=$s[1]; }##%eax
814 &movz ($tmp,&HB($s[1]));
815 &mov ($tmp,&DWP(0,$te,$tmp,8));
816 &and ($tmp,0x0000ff00);
817 &xor ($out,$tmp);
818
819 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
820 else { &mov ($tmp,$s[2]);
821 &shr ($tmp,16); }
822 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
823 &and ($tmp,0xFF);
824 &mov ($tmp,&DWP(0,$te,$tmp,8));
825 &and ($tmp,0x00ff0000);
826 &xor ($out,$tmp);
827
828 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
829 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
830 else { &mov ($tmp,$s[3]);
831 &shr ($tmp,24); }
832 &mov ($tmp,&DWP(2,$te,$tmp,8));
833 &and ($tmp,0xff000000);
834 &xor ($out,$tmp);
835 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
836 if ($i==3) { &mov ($s[3],$acc); }
837}
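# The last round wants raw S-box output rather than MixColumns products,
# and enclast gets it from the same doubled Te entries: each offset/mask
# pair above, (2,0x000000ff), (0,0x0000ff00), (0,0x00ff0000) and
# (2,0xff000000), selects a dword whose masked byte lane holds the plain
# S[x] byte, so no separate S-box table is touched on this path.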
838
839&function_begin_B("_x86_AES_encrypt");
840 if ($vertical_spin) {
841 # I need high parts of volatile registers to be accessible...
842 &exch ($s1="edi",$key="ebx");
843 &mov ($s2="esi",$acc="ecx");
844 }
845
846 # note that caller is expected to allocate stack frame for me!
847 &mov ($__key,$key); # save key
848
849 &xor ($s0,&DWP(0,$key)); # xor with key
850 &xor ($s1,&DWP(4,$key));
851 &xor ($s2,&DWP(8,$key));
852 &xor ($s3,&DWP(12,$key));
853
854 &mov ($acc,&DWP(240,$key)); # load key->rounds
855
856 if ($small_footprint) {
857 &lea ($acc,&DWP(-2,$acc,$acc));
858 &lea ($acc,&DWP(0,$key,$acc,8));
859 &mov ($__end,$acc); # end of key schedule
860
861 &set_label("loop",16);
862 if ($vertical_spin) {
863 &encvert($tbl,$s0,$s1,$s2,$s3);
864 } else {
865 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
866 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
867 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
868 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
869 }
870 &add ($key,16); # advance rd_key
871 &xor ($s0,&DWP(0,$key));
872 &xor ($s1,&DWP(4,$key));
873 &xor ($s2,&DWP(8,$key));
874 &xor ($s3,&DWP(12,$key));
875 &cmp ($key,$__end);
876 &mov ($__key,$key);
877 &jb (&label("loop"));
878 }
879 else {
880 &cmp ($acc,10);
881 &jle (&label("10rounds"));
882 &cmp ($acc,12);
883 &jle (&label("12rounds"));
884
885 &set_label("14rounds",4);
886 for ($i=1;$i<3;$i++) {
887 if ($vertical_spin) {
888 &encvert($tbl,$s0,$s1,$s2,$s3);
889 } else {
890 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
891 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
892 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
893 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
894 }
895 &xor ($s0,&DWP(16*$i+0,$key));
896 &xor ($s1,&DWP(16*$i+4,$key));
897 &xor ($s2,&DWP(16*$i+8,$key));
898 &xor ($s3,&DWP(16*$i+12,$key));
899 }
900 &add ($key,32);
901 &mov ($__key,$key); # advance rd_key
902 &set_label("12rounds",4);
903 for ($i=1;$i<3;$i++) {
904 if ($vertical_spin) {
905 &encvert($tbl,$s0,$s1,$s2,$s3);
906 } else {
907 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
908 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
909 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
910 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
911 }
912 &xor ($s0,&DWP(16*$i+0,$key));
913 &xor ($s1,&DWP(16*$i+4,$key));
914 &xor ($s2,&DWP(16*$i+8,$key));
915 &xor ($s3,&DWP(16*$i+12,$key));
916 }
917 &add ($key,32);
918 &mov ($__key,$key); # advance rd_key
919 &set_label("10rounds",4);
920 for ($i=1;$i<10;$i++) {
921 if ($vertical_spin) {
922 &encvert($tbl,$s0,$s1,$s2,$s3);
923 } else {
924 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
925 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
926 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
927 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
928 }
929 &xor ($s0,&DWP(16*$i+0,$key));
930 &xor ($s1,&DWP(16*$i+4,$key));
931 &xor ($s2,&DWP(16*$i+8,$key));
932 &xor ($s3,&DWP(16*$i+12,$key));
933 }
934 }
935
936 if ($vertical_spin) {
937 # "reincarnate" some registers for "horizontal" spin...
938 &mov ($s1="ebx",$key="edi");
939 &mov ($s2="ecx",$acc="esi");
940 }
941 &enclast(0,$tbl,$s0,$s1,$s2,$s3);
942 &enclast(1,$tbl,$s1,$s2,$s3,$s0);
943 &enclast(2,$tbl,$s2,$s3,$s0,$s1);
944 &enclast(3,$tbl,$s3,$s0,$s1,$s2);
945
946 &add ($key,$small_footprint?16:160);
947 &xor ($s0,&DWP(0,$key));
948 &xor ($s1,&DWP(4,$key));
949 &xor ($s2,&DWP(8,$key));
950 &xor ($s3,&DWP(12,$key));
951
952 &ret ();
953
954&set_label("AES_Te",64); # Yes! I keep it in the code segment!
955 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
956 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
957 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
958 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
959 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
960 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
961 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
962 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
963 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
964 &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
965 &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
966 &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
967 &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
968 &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
969 &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
970 &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
971 &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
972 &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
973 &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
974 &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
975 &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
976 &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
977 &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
978 &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
979 &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
980 &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
981 &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
982 &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
983 &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
984 &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
985 &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
986 &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
987 &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
988 &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
989 &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
990 &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
991 &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
992 &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
993 &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
994 &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
995 &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
996 &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
997 &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
998 &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
999 &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
1000 &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
1001 &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
1002 &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
1003 &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
1004 &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
1005 &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
1006 &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
1007 &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
1008 &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
1009 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
1010 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
1011 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
1012 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
1013 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
1014 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
1015 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
1016 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
1017 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
1018 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
1019
1020#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
1021 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1022 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1023 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1024 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1025 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1026 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1027 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1028 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1029 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1030 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1031 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1032 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1033 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1034 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1035 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1036 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1037 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1038 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1039 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1040 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1041 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1042 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1043 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1044 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1045 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1046 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1047 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1048 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1049 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1050 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1051 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1052 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1053
1054 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1055 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1056 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1057 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1058 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1059 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1060 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1061 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1062 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1063 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1064 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1065 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1066 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1067 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1068 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1069 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1070 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1071 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1072 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1073 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1074 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1075 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1076 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1077 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1078 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1079 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1080 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1081 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1082 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1083 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1084 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1085 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1086
1087 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1088 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1089 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1090 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1091 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1092 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1093 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1094 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1095 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1096 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1097 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1098 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1099 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1100 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1101 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1102 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1103 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1104 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1105 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1106 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1107 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1108 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1109 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1110 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1111 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1112 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1113 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1114 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1115 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1116 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1117 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1118 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1119
1120 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1121 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1122 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1123 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1124 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1125 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1126 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1127 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1128 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1129 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1130 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1131 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1132 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1133 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1134 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1135 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1136 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1137 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1138 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1139 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1140 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1141 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1142 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1143 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1144 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1145 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1146 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1147 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1148 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1149 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1150 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1151 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1152#rcon:
1153 &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
1154 &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
1155 &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
1156 &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
1157&function_end_B("_x86_AES_encrypt");
1158
1159# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
1160&function_begin("AES_encrypt");
1161 &mov ($acc,&wparam(0)); # load inp
1162 &mov ($key,&wparam(2)); # load key
1163
1164 &mov ($s0,"esp");
1165 &sub ("esp",36);
1166 &and ("esp",-64); # align to cache-line
1167
1168 # place stack frame just "above" the key schedule
1169 &lea ($s1,&DWP(-64-63,$key));
1170 &sub ($s1,"esp");
1171 &neg ($s1);
1172 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1173 &sub ("esp",$s1);
1174 &add ("esp",4); # 4 is reserved for caller's return address
1175 &mov ($_esp,$s0); # save stack pointer
1176
1177 &call (&label("pic_point")); # make it PIC!
1178 &set_label("pic_point");
1179 &blindpop($tbl);
1180 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
1181 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
1182
1183 # pick Te4 copy which can't "overlap" with stack frame or key schedule
1184 &lea ($s1,&DWP(768-4,"esp"));
1185 &sub ($s1,$tbl);
1186 &and ($s1,0x300);
1187 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
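	# $s1 & 0x300 selects one of the four 256-byte Te4 copies so that
	# its low address bits differ from the stack frame's; the extra
	# +128 bias lets the later BP(-128,...) byte lookups use one-byte
	# displacements.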
1188
1189 if (!$x86only) {
1190 &bt (&DWP(0,$s0),25); # check for SSE bit
1191 &jnc (&label("x86"));
1192
1193 &movq ("mm0",&QWP(0,$acc));
1194 &movq ("mm4",&QWP(8,$acc));
1195 &call ("_sse_AES_encrypt_compact");
1196 &mov ("esp",$_esp); # restore stack pointer
1197 &mov ($acc,&wparam(1)); # load out
1198 &movq (&QWP(0,$acc),"mm0"); # write output data
1199 &movq (&QWP(8,$acc),"mm4");
1200 &emms ();
1201 &function_end_A();
1202 }
1203 &set_label("x86",16);
1204 &mov ($_tbl,$tbl);
1205 &mov ($s0,&DWP(0,$acc)); # load input data
1206 &mov ($s1,&DWP(4,$acc));
1207 &mov ($s2,&DWP(8,$acc));
1208 &mov ($s3,&DWP(12,$acc));
1209 &call ("_x86_AES_encrypt_compact");
1210 &mov ("esp",$_esp); # restore stack pointer
1211 &mov ($acc,&wparam(1)); # load out
1212 &mov (&DWP(0,$acc),$s0); # write output data
1213 &mov (&DWP(4,$acc),$s1);
1214 &mov (&DWP(8,$acc),$s2);
1215 &mov (&DWP(12,$acc),$s3);
1216&function_end("AES_encrypt");
1217
1218#--------------------------------------------------------------------#
1219
1220######################################################################
1221# "Compact" block function
1222######################################################################
1223
1224sub deccompact()
1225{ my $Fn = \&mov;
1226 while ($#_>5) { pop(@_); $Fn=sub{}; }
1227 my ($i,$td,@s)=@_;
1228 my $tmp = $key;
1229 my $out = $i==3?$s[0]:$acc;
1230
1231	# $Fn is used in the first compact rounds and its purpose is to
1232	# avoid restoring some values from the stack, so that after a
1233	# 4x deccompact pass with the extra argument the $key, $s0 and $s1
1234	# values are left there...
1235 if($i==3) { &$Fn ($key,$__key); }
1236 else { &mov ($out,$s[0]); }
1237 &and ($out,0xFF);
1238 &movz ($out,&BP(-128,$td,$out,1));
1239
1240 if ($i==3) { $tmp=$s[1]; }
1241 &movz ($tmp,&HB($s[1]));
1242 &movz ($tmp,&BP(-128,$td,$tmp,1));
1243 &shl ($tmp,8);
1244 &xor ($out,$tmp);
1245
1246 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1247	else	{ &mov	($tmp,$s[2]); }
1248 &shr ($tmp,16);
1249 &and ($tmp,0xFF);
1250 &movz ($tmp,&BP(-128,$td,$tmp,1));
1251 &shl ($tmp,16);
1252 &xor ($out,$tmp);
1253
1254 if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
1255 else { &mov ($tmp,$s[3]); }
1256 &shr ($tmp,24);
1257 &movz ($tmp,&BP(-128,$td,$tmp,1));
1258 &shl ($tmp,24);
1259 &xor ($out,$tmp);
1260 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1261 if ($i==3) { &$Fn ($s[3],$__s0); }
1262}
1263
1264# must be called with 2,3,0,1 as argument sequence!!!
1265sub dectransform()
1266{ my @s = ($s0,$s1,$s2,$s3);
1267 my $i = shift;
1268 my $tmp = $key;
1269 my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1);
1270 my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
1271 my $tp8 = $tbl;
1272
1273 &mov ($acc,$s[$i]);
1274 &and ($acc,0x80808080);
1275 &mov ($tmp,$acc);
1276 &shr ($tmp,7);
1277 &lea ($tp2,&DWP(0,$s[$i],$s[$i]));
1278 &sub ($acc,$tmp);
1279 &and ($tp2,0xfefefefe);
1280 &and ($acc,0x1b1b1b1b);
1281 &xor ($acc,$tp2);
1282 &mov ($tp2,$acc);
1283
1284 &and ($acc,0x80808080);
1285 &mov ($tmp,$acc);
1286 &shr ($tmp,7);
1287 &lea ($tp4,&DWP(0,$tp2,$tp2));
1288 &sub ($acc,$tmp);
1289 &and ($tp4,0xfefefefe);
1290 &and ($acc,0x1b1b1b1b);
1291 &xor ($tp2,$s[$i]); # tp2^tp1
1292 &xor ($acc,$tp4);
1293 &mov ($tp4,$acc);
1294
1295 &and ($acc,0x80808080);
1296 &mov ($tmp,$acc);
1297 &shr ($tmp,7);
1298 &lea ($tp8,&DWP(0,$tp4,$tp4));
1299 &sub ($acc,$tmp);
1300 &and ($tp8,0xfefefefe);
1301 &and ($acc,0x1b1b1b1b);
1302 &xor ($tp4,$s[$i]); # tp4^tp1
1303 &rotl ($s[$i],8); # = ROTATE(tp1,8)
1304 &xor ($tp8,$acc);
1305
1306 &xor ($s[$i],$tp2);
1307 &xor ($tp2,$tp8);
1308 &rotl ($tp2,24);
1309 &xor ($s[$i],$tp4);
1310 &xor ($tp4,$tp8);
1311 &rotl ($tp4,16);
1312 &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
1313 &rotl ($tp8,8);
1314 &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
1315 &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
1316 &mov ($s[0],$__s0) if($i==2); #prefetch $s0
1317 &mov ($s[1],$__s1) if($i==3); #prefetch $s1
1318 &mov ($s[2],$__s2) if($i==1);
1319 &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
1320
1321 &mov ($s[3],$__s3) if($i==1);
1322 &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
1323}
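
# dectransform is InvMixColumns on one packed column: three doublings
# (tp2, tp4, tp8) plus the rotate identities noted in its comments.
# A minimal sketch built on the hypothetical xtime4 helper sketched
# earlier (again an illustration, not part of the original file):
sub rotl32 { my ($x,$n) = @_; (($x << $n) | ($x >> (32 - $n))) & 0xffffffff }
sub inv_mix_column {
	my ($tp1) = @_;
	my $tp2 = xtime4($tp1);				# 2*tp1 per byte
	my $tp4 = xtime4($tp2);				# 4*tp1
	my $tp8 = xtime4($tp4);				# 8*tp1
	return ($tp8 ^ $tp4 ^ $tp2)			# 0x0e coefficients
	     ^ rotl32($tp8 ^ $tp1,          8)		# 0x09
	     ^ rotl32($tp8 ^ $tp4 ^ $tp1,  16)		# 0x0d
	     ^ rotl32($tp8 ^ $tp2 ^ $tp1,  24);		# 0x0b
}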
1324
1325&function_begin_B("_x86_AES_decrypt_compact");
1326 # note that caller is expected to allocate stack frame for me!
1327 &mov ($__key,$key); # save key
1328
1329 &xor ($s0,&DWP(0,$key)); # xor with key
1330 &xor ($s1,&DWP(4,$key));
1331 &xor ($s2,&DWP(8,$key));
1332 &xor ($s3,&DWP(12,$key));
1333
1334 &mov ($acc,&DWP(240,$key)); # load key->rounds
1335
1336 &lea ($acc,&DWP(-2,$acc,$acc));
1337 &lea ($acc,&DWP(0,$key,$acc,8));
1338 &mov ($__end,$acc); # end of key schedule
1339
1340 # prefetch Td4
1341 &mov ($key,&DWP(0-128,$tbl));
1342 &mov ($acc,&DWP(32-128,$tbl));
1343 &mov ($key,&DWP(64-128,$tbl));
1344 &mov ($acc,&DWP(96-128,$tbl));
1345 &mov ($key,&DWP(128-128,$tbl));
1346 &mov ($acc,&DWP(160-128,$tbl));
1347 &mov ($key,&DWP(192-128,$tbl));
1348 &mov ($acc,&DWP(224-128,$tbl));
1349
1350 &set_label("loop",16);
1351
1352 &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);
1353 &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
1354 &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
1355 &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
1356 &dectransform(2);
1357 &dectransform(3);
1358 &dectransform(0);
1359 &dectransform(1);
1360 &mov ($key,$__key);
1361 &mov ($tbl,$__tbl);
1362 &add ($key,16); # advance rd_key
1363 &xor ($s0,&DWP(0,$key));
1364 &xor ($s1,&DWP(4,$key));
1365 &xor ($s2,&DWP(8,$key));
1366 &xor ($s3,&DWP(12,$key));
1367
1368 &cmp ($key,$__end);
1369 &mov ($__key,$key);
1370 &jb (&label("loop"));
1371
1372 &deccompact(0,$tbl,$s0,$s3,$s2,$s1);
1373 &deccompact(1,$tbl,$s1,$s0,$s3,$s2);
1374 &deccompact(2,$tbl,$s2,$s1,$s0,$s3);
1375 &deccompact(3,$tbl,$s3,$s2,$s1,$s0);
1376
1377 &xor ($s0,&DWP(16,$key));
1378 &xor ($s1,&DWP(20,$key));
1379 &xor ($s2,&DWP(24,$key));
1380 &xor ($s3,&DWP(28,$key));
1381
1382 &ret ();
1383&function_end_B("_x86_AES_decrypt_compact");
1384
1385######################################################################
1386# "Compact" SSE block function.
1387######################################################################
1388
1389sub sse_deccompact()
1390{
1391 &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
1392 &movd ("eax","mm1"); # 7, 6, 1, 0
1393
1394 &pshufw ("mm5","mm4",0x09); # 13,12,11,10
1395 &movz ($acc,&LB("eax")); # 0
1396 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
1397 &movd ("ebx","mm5"); # 13,12,11,10
1398 &movz ("edx",&HB("eax")); # 1
1399 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
1400 &shl ("edx",8); # 1
1401
1402 &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
1403 &movz ($acc,&LB("ebx")); # 10
1404 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
1405 &shl ($acc,16); # 10
1406 &or ("ecx",$acc); # 10
1407 &shr ("eax",16); # 7, 6
1408 &movz ($acc,&HB("ebx")); # 11
1409 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
1410 &shl ($acc,24); # 11
1411 &or ("edx",$acc); # 11
1412 &shr ("ebx",16); # 13,12
1413
1414 &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
1415 &movz ($acc,&HB("eax")); # 7
1416 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
1417 &shl ($acc,24); # 7
1418 &or ("ecx",$acc); # 7
1419 &movz ($acc,&HB("ebx")); # 13
1420 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
1421 &shl ($acc,8); # 13
1422 &or ("ecx",$acc); # 13
1423 &movd ("mm0","ecx"); # t[0] collected
1424
1425 &movz ($acc,&LB("eax")); # 6
1426 &movd ("eax","mm2"); # 3, 2, 5, 4
1427 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
1428 &shl ("ecx",16); # 6
1429 &movz ($acc,&LB("ebx")); # 12
1430 &movd ("ebx","mm6"); # 9, 8,15,14
1431 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
1432 &or ("ecx",$acc); # 12
1433
1434 &movz ($acc,&LB("eax")); # 4
1435 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
1436 &or ("edx",$acc); # 4
1437 &movz ($acc,&LB("ebx")); # 14
1438 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
1439 &shl ($acc,16); # 14
1440 &or ("edx",$acc); # 14
1441 &movd ("mm1","edx"); # t[1] collected
1442
1443 &movz ($acc,&HB("eax")); # 5
1444 &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
1445 &shl ("edx",8); # 5
1446 &movz ($acc,&HB("ebx")); # 15
1447 &shr ("eax",16); # 3, 2
1448 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
1449 &shl ($acc,24); # 15
1450 &or ("edx",$acc); # 15
1451 &shr ("ebx",16); # 9, 8
1452
1453 &punpckldq ("mm0","mm1"); # t[0,1] collected
1454
1455 &movz ($acc,&HB("ebx")); # 9
1456 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
1457 &shl ($acc,8); # 9
1458 &or ("ecx",$acc); # 9
1459 &and ("ebx",0xff); # 8
1460 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
1461 &or ("edx","ebx"); # 8
1462 &movz ($acc,&LB("eax")); # 2
1463 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
1464 &shl ($acc,16); # 2
1465 &or ("edx",$acc); # 2
1466 &movd ("mm4","edx"); # t[2] collected
1467 &movz ("eax",&HB("eax")); # 3
1468 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
1469 &shl ("eax",24); # 3
1470 &or ("ecx","eax"); # 3
1471 &movd ("mm5","ecx"); # t[3] collected
1472
1473 &punpckldq ("mm4","mm5"); # t[2,3] collected
1474}
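
# The pshufw shuffles above only reposition 16-bit lanes so that every
# state byte (already in InvShiftRows order) becomes reachable through
# the low or high byte of a general-purpose register; the Td4 lookups
# are then plain byte loads via BP(-128,...), and the results are
# reassembled into mm0/mm4 with movd/punpckldq.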
1475
1476 if (!$x86only) {
1477&function_begin_B("_sse_AES_decrypt_compact");
1478 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
1479 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
1480
1481 # note that caller is expected to allocate stack frame for me!
1482 &mov ($acc,&DWP(240,$key)); # load key->rounds
1483 &lea ($acc,&DWP(-2,$acc,$acc));
1484 &lea ($acc,&DWP(0,$key,$acc,8));
1485 &mov ($__end,$acc); # end of key schedule
1486
1487 &mov ($s0,0x1b1b1b1b); # magic constant
1488 &mov (&DWP(8,"esp"),$s0);
1489 &mov (&DWP(12,"esp"),$s0);
1490
1491 # prefetch Td4
1492 &mov ($s0,&DWP(0-128,$tbl));
1493 &mov ($s1,&DWP(32-128,$tbl));
1494 &mov ($s2,&DWP(64-128,$tbl));
1495 &mov ($s3,&DWP(96-128,$tbl));
1496 &mov ($s0,&DWP(128-128,$tbl));
1497 &mov ($s1,&DWP(160-128,$tbl));
1498 &mov ($s2,&DWP(192-128,$tbl));
1499 &mov ($s3,&DWP(224-128,$tbl));
1500
1501 &set_label("loop",16);
1502 &sse_deccompact();
1503 &add ($key,16);
1504 &cmp ($key,$__end);
1505 &ja (&label("out"));
1506
1507 # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
1508 &movq ("mm3","mm0"); &movq ("mm7","mm4");
1509 &movq ("mm2","mm0",1); &movq ("mm6","mm4",1);
1510 &movq ("mm1","mm0"); &movq ("mm5","mm4");
1511 &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
1512 &pslld ("mm2",8); &pslld ("mm6",8);
1513 &psrld ("mm3",8); &psrld ("mm7",8);
1514 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8
1515 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8
1516 &pslld ("mm2",16); &pslld ("mm6",16);
1517 &psrld ("mm3",16); &psrld ("mm7",16);
1518 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24
1519 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24
1520
1521 &movq ("mm3",&QWP(8,"esp"));
1522 &pxor ("mm2","mm2"); &pxor ("mm6","mm6");
1523 &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5");
1524 &pand ("mm2","mm3"); &pand ("mm6","mm3");
1525 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1526 &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2
1527 &movq ("mm3","mm1"); &movq ("mm7","mm5");
1528 &movq ("mm2","mm1"); &movq ("mm6","mm5");
1529 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2
1530 &pslld ("mm3",24); &pslld ("mm7",24);
1531 &psrld ("mm2",8); &psrld ("mm6",8);
1532 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24
1533 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8
1534
1535 &movq ("mm2",&QWP(8,"esp"));
1536 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1537 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1538 &pand ("mm3","mm2"); &pand ("mm7","mm2");
1539 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1540 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
1541 &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
1542 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
1543 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
1544
1545 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1546 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1547 &pand ("mm3","mm2"); &pand ("mm7","mm2");
1548 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1549 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8
1550 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8
1551 &movq ("mm3","mm1"); &movq ("mm7","mm5");
1552 &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);
1553 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
1554 &pslld ("mm1",8); &pslld ("mm5",8);
1555 &psrld ("mm3",8); &psrld ("mm7",8);
1556 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
1557 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
1558 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
1559 &mov ($s0,&DWP(0-128,$tbl));
1560 &pslld ("mm1",16); &pslld ("mm5",16);
1561 &mov ($s1,&DWP(64-128,$tbl));
1562 &psrld ("mm3",16); &psrld ("mm7",16);
1563 &mov ($s2,&DWP(128-128,$tbl));
1564 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
1565 &mov ($s3,&DWP(192-128,$tbl));
1566 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24
1567
1568 &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
1569 &jmp (&label("loop"));
1570
1571 &set_label("out",16);
1572 &pxor ("mm0",&QWP(0,$key));
1573 &pxor ("mm4",&QWP(8,$key));
1574
1575 &ret ();
1576&function_end_B("_sse_AES_decrypt_compact");
1577 }
1578
1579######################################################################
1580# Vanilla block function.
1581######################################################################
1582
1583sub decstep()
1584{ my ($i,$td,@s) = @_;
1585 my $tmp = $key;
1586 my $out = $i==3?$s[0]:$acc;
1587
1588	# no instructions are reordered, as performance appears
1589	# optimal... or rather all attempts to reorder didn't
1590	# result in better performance [which, by the way, is not a
1591	# bit lower than for encryption].
1592 if($i==3) { &mov ($key,$__key); }
1593 else { &mov ($out,$s[0]); }
1594 &and ($out,0xFF);
1595 &mov ($out,&DWP(0,$td,$out,8));
1596
1597 if ($i==3) { $tmp=$s[1]; }
1598 &movz ($tmp,&HB($s[1]));
1599 &xor ($out,&DWP(3,$td,$tmp,8));
1600
1601 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1602 else { &mov ($tmp,$s[2]); }
1603 &shr ($tmp,16);
1604 &and ($tmp,0xFF);
1605 &xor ($out,&DWP(2,$td,$tmp,8));
1606
1607 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
1608 else { &mov ($tmp,$s[3]); }
1609 &shr ($tmp,24);
1610 &xor ($out,&DWP(1,$td,$tmp,8));
1611 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1612 if ($i==3) { &mov ($s[3],$__s0); }
1613 &comment();
1614}
1615
1616sub declast()
1617{ my ($i,$td,@s)=@_;
1618 my $tmp = $key;
1619 my $out = $i==3?$s[0]:$acc;
1620
1621 if($i==0) { &lea ($td,&DWP(2048+128,$td));
1622 &mov ($tmp,&DWP(0-128,$td));
1623 &mov ($acc,&DWP(32-128,$td));
1624 &mov ($tmp,&DWP(64-128,$td));
1625 &mov ($acc,&DWP(96-128,$td));
1626 &mov ($tmp,&DWP(128-128,$td));
1627 &mov ($acc,&DWP(160-128,$td));
1628 &mov ($tmp,&DWP(192-128,$td));
1629 &mov ($acc,&DWP(224-128,$td));
1630 &lea ($td,&DWP(-128,$td)); }
1631 if($i==3) { &mov ($key,$__key); }
1632 else { &mov ($out,$s[0]); }
1633 &and ($out,0xFF);
1634 &movz ($out,&BP(0,$td,$out,1));
1635
1636 if ($i==3) { $tmp=$s[1]; }
1637 &movz ($tmp,&HB($s[1]));
1638 &movz ($tmp,&BP(0,$td,$tmp,1));
1639 &shl ($tmp,8);
1640 &xor ($out,$tmp);
1641
1642 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1643	else	{ &mov	($tmp,$s[2]); }
1644 &shr ($tmp,16);
1645 &and ($tmp,0xFF);
1646 &movz ($tmp,&BP(0,$td,$tmp,1));
1647 &shl ($tmp,16);
1648 &xor ($out,$tmp);
1649
1650 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
1651 else { &mov ($tmp,$s[3]); }
1652 &shr ($tmp,24);
1653 &movz ($tmp,&BP(0,$td,$tmp,1));
1654 &shl ($tmp,24);
1655 &xor ($out,$tmp);
1656 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1657 if ($i==3) { &mov ($s[3],$__s0);
1658 &lea ($td,&DWP(-2048,$td)); }
1659}
1660
1661&function_begin_B("_x86_AES_decrypt");
1662 # note that caller is expected to allocate stack frame for me!
1663 &mov ($__key,$key); # save key
1664
1665 &xor ($s0,&DWP(0,$key)); # xor with key
1666 &xor ($s1,&DWP(4,$key));
1667 &xor ($s2,&DWP(8,$key));
1668 &xor ($s3,&DWP(12,$key));
1669
1670 &mov ($acc,&DWP(240,$key)); # load key->rounds
1671
1672 if ($small_footprint) {
1673 &lea ($acc,&DWP(-2,$acc,$acc));
1674 &lea ($acc,&DWP(0,$key,$acc,8));
1675 &mov ($__end,$acc); # end of key schedule
1676 &set_label("loop",16);
1677 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1678 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1679 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1680 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1681 &add ($key,16); # advance rd_key
1682 &xor ($s0,&DWP(0,$key));
1683 &xor ($s1,&DWP(4,$key));
1684 &xor ($s2,&DWP(8,$key));
1685 &xor ($s3,&DWP(12,$key));
1686 &cmp ($key,$__end);
1687 &mov ($__key,$key);
1688 &jb (&label("loop"));
1689 }
1690 else {
1691 &cmp ($acc,10);
1692 &jle (&label("10rounds"));
1693 &cmp ($acc,12);
1694 &jle (&label("12rounds"));
1695
1696 &set_label("14rounds",4);
1697 for ($i=1;$i<3;$i++) {
1698 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1699 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1700 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1701 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1702 &xor ($s0,&DWP(16*$i+0,$key));
1703 &xor ($s1,&DWP(16*$i+4,$key));
1704 &xor ($s2,&DWP(16*$i+8,$key));
1705 &xor ($s3,&DWP(16*$i+12,$key));
1706 }
1707 &add ($key,32);
1708 &mov ($__key,$key); # advance rd_key
1709 &set_label("12rounds",4);
1710 for ($i=1;$i<3;$i++) {
1711 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1712 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1713 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1714 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1715 &xor ($s0,&DWP(16*$i+0,$key));
1716 &xor ($s1,&DWP(16*$i+4,$key));
1717 &xor ($s2,&DWP(16*$i+8,$key));
1718 &xor ($s3,&DWP(16*$i+12,$key));
1719 }
1720 &add ($key,32);
1721 &mov ($__key,$key); # advance rd_key
1722 &set_label("10rounds",4);
1723 for ($i=1;$i<10;$i++) {
1724 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1725 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1726 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1727 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1728 &xor ($s0,&DWP(16*$i+0,$key));
1729 &xor ($s1,&DWP(16*$i+4,$key));
1730 &xor ($s2,&DWP(16*$i+8,$key));
1731 &xor ($s3,&DWP(16*$i+12,$key));
1732 }
1733 }
1734
1735 &declast(0,$tbl,$s0,$s3,$s2,$s1);
1736 &declast(1,$tbl,$s1,$s0,$s3,$s2);
1737 &declast(2,$tbl,$s2,$s1,$s0,$s3);
1738 &declast(3,$tbl,$s3,$s2,$s1,$s0);
1739
1740 &add ($key,$small_footprint?16:160);
1741 &xor ($s0,&DWP(0,$key));
1742 &xor ($s1,&DWP(4,$key));
1743 &xor ($s2,&DWP(8,$key));
1744 &xor ($s3,&DWP(12,$key));
1745
1746 &ret ();
1747
1748&set_label("AES_Td",64); # Yes! I keep it in the code segment!
1749 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
1750 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
1751 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
1752 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
1753 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
1754 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
1755 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
1756 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
1757 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
1758 &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
1759 &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
1760 &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
1761 &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
1762 &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
1763 &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
1764 &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
1765 &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
1766 &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
1767 &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
1768 &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
1769 &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
1770 &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
1771 &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
1772 &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
1773 &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
1774 &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
1775 &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
1776 &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
1777 &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
1778 &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
1779 &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
1780 &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
1781 &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
1782 &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
1783 &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
1784 &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
1785 &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
1786 &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
1787 &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
1788 &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
1789 &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
1790 &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
1791 &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
1792 &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
1793 &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
1794 &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
1795 &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
1796 &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
1797 &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
1798 &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
1799 &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
1800 &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
1801 &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
1802 &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
1803 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
1804 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
1805 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
1806 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
1807 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
1808 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
1809 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
1810 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
1811 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
1812 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
1813
1814#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
1815 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1816 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1817 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1818 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1819 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1820 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1821 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1822 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1823 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1824 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1825 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1826 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1827 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1828 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1829 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1830 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1831 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1832 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1833 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1834 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1835 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1836 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1837 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1838 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1839 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1840 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1841 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1842 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1843 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1844 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1845 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1846 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1847
1848 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1849 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1850 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1851 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1852 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1853 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1854 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1855 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1856 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1857 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1858 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1859 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1860 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1861 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1862 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1863 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1864 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1865 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1866 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1867 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1868 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1869 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1870 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1871 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1872 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1873 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1874 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1875 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1876 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1877 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1878 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1879 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1880
1881 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1882 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1883 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1884 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1885 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1886 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1887 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1888 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1889 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1890 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1891 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1892 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1893 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1894 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1895 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1896 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1897 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1898 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1899 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1900 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1901 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1902 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1903 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1904 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1905 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1906 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1907 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1908 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1909 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1910 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1911 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1912 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1913
1914 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1915 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1916 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1917 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1918 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1919 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1920 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1921 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1922 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1923 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1924 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1925 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1926 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1927 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1928 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1929 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1930 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1931 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1932 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1933 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1934 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1935 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1936 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1937 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1938 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1939 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1940 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1941 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1942 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1943 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1944 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1945 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1946&function_end_B("_x86_AES_decrypt");
1947
1948# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
1949&function_begin("AES_decrypt");
1950 &mov ($acc,&wparam(0)); # load inp
1951 &mov ($key,&wparam(2)); # load key
1952
1953 &mov ($s0,"esp");
1954 &sub ("esp",36);
1955 &and ("esp",-64); # align to cache-line
1956
1957 # place stack frame just "above" the key schedule
1958 &lea ($s1,&DWP(-64-63,$key));
1959 &sub ($s1,"esp");
1960 &neg ($s1);
1961 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1962 &sub ("esp",$s1);
1963 &add ("esp",4); # 4 is reserved for caller's return address
1964 &mov ($_esp,$s0); # save stack pointer
1965
1966 &call (&label("pic_point")); # make it PIC!
1967 &set_label("pic_point");
1968 &blindpop($tbl);
1969 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
1970 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
1971
1972 # pick Td4 copy which can't "overlap" with stack frame or key schedule
1973 &lea ($s1,&DWP(768-4,"esp"));
1974 &sub ($s1,$tbl);
1975 &and ($s1,0x300);
1976 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
1977
1978 if (!$x86only) {
1979 &bt (&DWP(0,$s0),25); # check for SSE bit
1980 &jnc (&label("x86"));
1981
1982 &movq ("mm0",&QWP(0,$acc));
1983 &movq ("mm4",&QWP(8,$acc));
1984 &call ("_sse_AES_decrypt_compact");
1985 &mov ("esp",$_esp); # restore stack pointer
1986 &mov ($acc,&wparam(1)); # load out
1987 &movq (&QWP(0,$acc),"mm0"); # write output data
1988 &movq (&QWP(8,$acc),"mm4");
1989 &emms ();
1990 &function_end_A();
1991 }
1992 &set_label("x86",16);
1993 &mov ($_tbl,$tbl);
1994 &mov ($s0,&DWP(0,$acc)); # load input data
1995 &mov ($s1,&DWP(4,$acc));
1996 &mov ($s2,&DWP(8,$acc));
1997 &mov ($s3,&DWP(12,$acc));
1998 &call ("_x86_AES_decrypt_compact");
1999 &mov ("esp",$_esp); # restore stack pointer
2000 &mov ($acc,&wparam(1)); # load out
2001 &mov (&DWP(0,$acc),$s0); # write output data
2002 &mov (&DWP(4,$acc),$s1);
2003 &mov (&DWP(8,$acc),$s2);
2004 &mov (&DWP(12,$acc),$s3);
2005&function_end("AES_decrypt");
2006
2007# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
2008# size_t length, const AES_KEY *key,
2009# unsigned char *ivp,const int enc);
2010{
2011# stack frame layout
2012# -4(%esp) # return address 0(%esp)
2013# 0(%esp) # s0 backing store 4(%esp)
2014# 4(%esp) # s1 backing store 8(%esp)
2015# 8(%esp) # s2 backing store 12(%esp)
2016# 12(%esp) # s3 backing store 16(%esp)
2017# 16(%esp) # key backup 20(%esp)
2018# 20(%esp) # end of key schedule 24(%esp)
2019# 24(%esp) # %ebp backup 28(%esp)
2020# 28(%esp) # %esp backup
2021my $_inp=&DWP(32,"esp"); # copy of wparam(0)
2022my $_out=&DWP(36,"esp"); # copy of wparam(1)
2023my $_len=&DWP(40,"esp"); # copy of wparam(2)
2024my $_key=&DWP(44,"esp"); # copy of wparam(3)
2025my $_ivp=&DWP(48,"esp"); # copy of wparam(4)
2026my $_tmp=&DWP(52,"esp"); # volatile variable
2027#
2028my $ivec=&DWP(60,"esp"); # ivec[16]
2029my $aes_key=&DWP(76,"esp"); # copy of aes_key
2030my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
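
# For reference, the chaining both fast paths below implement (a sketch,
# not part of the original file):
#	encrypt:  c[i] = E_K(p[i] ^ c[i-1]),   c[-1] = ivec
#	decrypt:  p[i] = D_K(c[i]) ^ c[i-1],   c[-1] = ivec
# with the last ciphertext block written back to ivec on the way out.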
2031
2032&function_begin("AES_cbc_encrypt");
2033 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
2034 &cmp ($s2,0);
2035 &je (&label("drop_out"));
2036
2037 &call (&label("pic_point")); # make it PIC!
2038 &set_label("pic_point");
2039 &blindpop($tbl);
2040 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
2041
2042 &cmp (&wparam(5),0);
2043 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
2044 &jne (&label("picked_te"));
2045 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
2046 &set_label("picked_te");
2047
2048	# one can argue whether this is required
2049 &pushf ();
2050 &cld ();
2051
2052 &cmp ($s2,$speed_limit);
2053 &jb (&label("slow_way"));
2054 &test ($s2,15);
2055 &jnz (&label("slow_way"));
2056 if (!$x86only) {
2057 &bt (&DWP(0,$s0),28); # check for hyper-threading bit
2058 &jc (&label("slow_way"));
2059 }
2060 # pre-allocate aligned stack frame...
2061 &lea ($acc,&DWP(-80-244,"esp"));
2062 &and ($acc,-64);
2063
2064 # ... and make sure it doesn't alias with $tbl modulo 4096
2065 &mov ($s0,$tbl);
2066 &lea ($s1,&DWP(2048+256,$tbl));
2067 &mov ($s3,$acc);
2068 &and ($s0,0xfff); # s = %ebp&0xfff
2069 &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
2070 &and ($s3,0xfff); # p = %esp&0xfff
2071
2072	&cmp	($s3,$s1);		# if (p>=e) %esp -= (p-e);
2073 &jb (&label("tbl_break_out"));
2074 &sub ($s3,$s1);
2075 &sub ($acc,$s3);
2076 &jmp (&label("tbl_ok"));
2077 &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz;
2078 &sub ($s3,$s0);
2079 &and ($s3,0xfff);
2080 &add ($s3,384);
2081 &sub ($acc,$s3);
2082 &set_label("tbl_ok",4);
2083
2084 &lea ($s3,&wparam(0)); # obtain pointer to parameter block
2085 &exch ("esp",$acc); # allocate stack frame
2086 &add ("esp",4); # reserve for return address!
2087 &mov ($_tbl,$tbl); # save %ebp
2088 &mov ($_esp,$acc); # save %esp
2089
2090 &mov ($s0,&DWP(0,$s3)); # load inp
2091 &mov ($s1,&DWP(4,$s3)); # load out
2092 #&mov ($s2,&DWP(8,$s3)); # load len
2093 &mov ($key,&DWP(12,$s3)); # load key
2094 &mov ($acc,&DWP(16,$s3)); # load ivp
2095 &mov ($s3,&DWP(20,$s3)); # load enc flag
2096
2097 &mov ($_inp,$s0); # save copy of inp
2098 &mov ($_out,$s1); # save copy of out
2099 &mov ($_len,$s2); # save copy of len
2100 &mov ($_key,$key); # save copy of key
2101 &mov ($_ivp,$acc); # save copy of ivp
2102
2103 &mov ($mark,0); # copy of aes_key->rounds = 0;
2104 # do we copy key schedule to stack?
2105 &mov ($s1 eq "ebx" ? $s1 : "",$key);
2106 &mov ($s2 eq "ecx" ? $s2 : "",244/4);
2107 &sub ($s1,$tbl);
2108 &mov ("esi",$key);
2109 &and ($s1,0xfff);
2110 &lea ("edi",$aes_key);
2111 &cmp ($s1,2048+256);
2112 &jb (&label("do_copy"));
2113 &cmp ($s1,4096-244);
2114 &jb (&label("skip_copy"));
2115 &set_label("do_copy",4);
2116 &mov ($_key,"edi");
2117 &data_word(0xA5F3F689); # rep movsd
2118 &set_label("skip_copy");
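	# The distance checks above copy the 244-byte key schedule onto the
	# stack whenever it could overlap the 2048+256-byte table image
	# modulo 4096 (i.e. share L1 cache lines with the lookups).  The
	# copy also overwrites $mark with the real, non-zero rounds value,
	# which is how the epilogue knows there is a stack copy to zero.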
2119
2120 &mov ($key,16);
2121 &set_label("prefetch_tbl",4);
2122 &mov ($s0,&DWP(0,$tbl));
2123 &mov ($s1,&DWP(32,$tbl));
2124 &mov ($s2,&DWP(64,$tbl));
2125 &mov ($acc,&DWP(96,$tbl));
2126 &lea ($tbl,&DWP(128,$tbl));
2127 &sub ($key,1);
2128 &jnz (&label("prefetch_tbl"));
2129 &sub ($tbl,2048);
2130
2131 &mov ($acc,$_inp);
2132 &mov ($key,$_ivp);
2133
2134 &cmp ($s3,0);
2135 &je (&label("fast_decrypt"));
2136
2137#----------------------------- ENCRYPT -----------------------------#
2138 &mov ($s0,&DWP(0,$key)); # load iv
2139 &mov ($s1,&DWP(4,$key));
2140
2141 &set_label("fast_enc_loop",16);
2142 &mov ($s2,&DWP(8,$key));
2143 &mov ($s3,&DWP(12,$key));
2144
2145 &xor ($s0,&DWP(0,$acc)); # xor input data
2146 &xor ($s1,&DWP(4,$acc));
2147 &xor ($s2,&DWP(8,$acc));
2148 &xor ($s3,&DWP(12,$acc));
2149
2150 &mov ($key,$_key); # load key
2151 &call ("_x86_AES_encrypt");
2152
2153 &mov ($acc,$_inp); # load inp
2154 &mov ($key,$_out); # load out
2155
2156 &mov (&DWP(0,$key),$s0); # save output data
2157 &mov (&DWP(4,$key),$s1);
2158 &mov (&DWP(8,$key),$s2);
2159 &mov (&DWP(12,$key),$s3);
2160
2161 &lea ($acc,&DWP(16,$acc)); # advance inp
2162 &mov ($s2,$_len); # load len
2163 &mov ($_inp,$acc); # save inp
2164 &lea ($s3,&DWP(16,$key)); # advance out
2165 &mov ($_out,$s3); # save out
2166 &sub ($s2,16); # decrease len
2167 &mov ($_len,$s2); # save len
2168 &jnz (&label("fast_enc_loop"));
2169 &mov ($acc,$_ivp); # load ivp
2170 &mov ($s2,&DWP(8,$key)); # restore last 2 dwords
2171 &mov ($s3,&DWP(12,$key));
2172 &mov (&DWP(0,$acc),$s0); # save ivec
2173 &mov (&DWP(4,$acc),$s1);
2174 &mov (&DWP(8,$acc),$s2);
2175 &mov (&DWP(12,$acc),$s3);
2176
2177 &cmp ($mark,0); # was the key schedule copied?
2178 &mov ("edi",$_key);
2179 &je (&label("skip_ezero"));
2180 # zero copy of key schedule
2181 &mov ("ecx",240/4);
2182 &xor ("eax","eax");
2183 &align (4);
2184 &data_word(0xABF3F689); # rep stosd
2185 &set_label("skip_ezero");
2186 &mov ("esp",$_esp);
2187 &popf ();
2188 &set_label("drop_out");
2189 &function_end_A();
2190 &pushf (); # kludge, never executed
2191
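The loop above is plain CBC encryption: each plaintext block is XORed with the previous ciphertext block (the IV for the first block) before being encrypted, and the last ciphertext block is written back as the new IV. A minimal full-blocks-only sketch in terms of the public AES API (partial blocks are handled by the slow path further down):

#include <string.h>
#include <openssl/aes.h>

static void
cbc_encrypt_sketch(const unsigned char *in, unsigned char *out,
    size_t len, const AES_KEY *key, unsigned char ivec[16])
{
	unsigned char blk[16];
	size_t i;

	while (len >= 16) {
		for (i = 0; i < 16; i++)
			blk[i] = in[i] ^ ivec[i];	/* chain with iv */
		AES_encrypt(blk, out, key);
		memcpy(ivec, out, 16);		/* ciphertext becomes iv */
		in += 16;
		out += 16;
		len -= 16;
	}
}
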
2192#----------------------------- DECRYPT -----------------------------#
2193&set_label("fast_decrypt",16);
2194
2195 &cmp ($acc,$_out);
2196 &je (&label("fast_dec_in_place")); # in-place processing...
2197
2198 &mov ($_tmp,$key);
2199
2200 &align (4);
2201 &set_label("fast_dec_loop",16);
2202 &mov ($s0,&DWP(0,$acc)); # read input
2203 &mov ($s1,&DWP(4,$acc));
2204 &mov ($s2,&DWP(8,$acc));
2205 &mov ($s3,&DWP(12,$acc));
2206
2207 &mov ($key,$_key); # load key
2208 &call ("_x86_AES_decrypt");
2209
2210 &mov ($key,$_tmp); # load ivp
2211 &mov ($acc,$_len); # load len
2212 &xor ($s0,&DWP(0,$key)); # xor iv
2213 &xor ($s1,&DWP(4,$key));
2214 &xor ($s2,&DWP(8,$key));
2215 &xor ($s3,&DWP(12,$key));
2216
2217 &mov ($key,$_out); # load out
2218 &mov ($acc,$_inp); # load inp
2219
2220 &mov (&DWP(0,$key),$s0); # write output
2221 &mov (&DWP(4,$key),$s1);
2222 &mov (&DWP(8,$key),$s2);
2223 &mov (&DWP(12,$key),$s3);
2224
2225 &mov ($s2,$_len); # load len
2226 &mov ($_tmp,$acc); # save ivp
2227 &lea ($acc,&DWP(16,$acc)); # advance inp
2228 &mov ($_inp,$acc); # save inp
2229 &lea ($key,&DWP(16,$key)); # advance out
2230 &mov ($_out,$key); # save out
2231 &sub ($s2,16); # decrease len
2232 &mov ($_len,$s2); # save len
2233 &jnz (&label("fast_dec_loop"));
2234 &mov ($key,$_tmp); # load temp ivp
2235 &mov ($acc,$_ivp); # load user ivp
2236 &mov ($s0,&DWP(0,$key)); # load iv
2237 &mov ($s1,&DWP(4,$key));
2238 &mov ($s2,&DWP(8,$key));
2239 &mov ($s3,&DWP(12,$key));
2240 &mov (&DWP(0,$acc),$s0); # copy back to user
2241 &mov (&DWP(4,$acc),$s1);
2242 &mov (&DWP(8,$acc),$s2);
2243 &mov (&DWP(12,$acc),$s3);
2244 &jmp (&label("fast_dec_out"));
2245
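For the out-of-place decrypt loop above, note the pointer trick: instead of copying each ciphertext block into a temporary, the code keeps a pointer ($_tmp) to the previous ciphertext block and XORs straight out of the input buffer, copying an IV back to the caller only once at the end. The same shape in C (a sketch, not the library routine):

#include <string.h>
#include <openssl/aes.h>

static void
cbc_decrypt_sketch(const unsigned char *in, unsigned char *out,
    size_t len, const AES_KEY *key, unsigned char ivec[16])
{
	const unsigned char *iv = ivec;
	size_t i;

	while (len >= 16) {
		AES_decrypt(in, out, key);
		for (i = 0; i < 16; i++)
			out[i] ^= iv[i];	/* undo the chaining */
		iv = in;		/* prev ciphertext is the next iv */
		in += 16;
		out += 16;
		len -= 16;
	}
	memcpy(ivec, iv, 16);		/* hand last ciphertext back */
}
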
2246 &set_label("fast_dec_in_place",16);
2247 &set_label("fast_dec_in_place_loop");
2248 &mov ($s0,&DWP(0,$acc)); # read input
2249 &mov ($s1,&DWP(4,$acc));
2250 &mov ($s2,&DWP(8,$acc));
2251 &mov ($s3,&DWP(12,$acc));
2252
2253 &lea ($key,$ivec);
2254 &mov (&DWP(0,$key),$s0); # copy to temp
2255 &mov (&DWP(4,$key),$s1);
2256 &mov (&DWP(8,$key),$s2);
2257 &mov (&DWP(12,$key),$s3);
2258
2259 &mov ($key,$_key); # load key
2260 &call ("_x86_AES_decrypt");
2261
2262 &mov ($key,$_ivp); # load ivp
2263 &mov ($acc,$_out); # load out
2264 &xor ($s0,&DWP(0,$key)); # xor iv
2265 &xor ($s1,&DWP(4,$key));
2266 &xor ($s2,&DWP(8,$key));
2267 &xor ($s3,&DWP(12,$key));
2268
2269 &mov (&DWP(0,$acc),$s0); # write output
2270 &mov (&DWP(4,$acc),$s1);
2271 &mov (&DWP(8,$acc),$s2);
2272 &mov (&DWP(12,$acc),$s3);
2273
2274 &lea ($acc,&DWP(16,$acc)); # advance out
2275 &mov ($_out,$acc); # save out
2276
2277 &lea ($acc,$ivec);
2278 &mov ($s0,&DWP(0,$acc)); # read temp
2279 &mov ($s1,&DWP(4,$acc));
2280 &mov ($s2,&DWP(8,$acc));
2281 &mov ($s3,&DWP(12,$acc));
2282
2283 &mov (&DWP(0,$key),$s0); # copy iv
2284 &mov (&DWP(4,$key),$s1);
2285 &mov (&DWP(8,$key),$s2);
2286 &mov (&DWP(12,$key),$s3);
2287
2288 &mov ($acc,$_inp); # load inp
2289 &mov ($s2,$_len); # load len
2290 &lea ($acc,&DWP(16,$acc)); # advance inp
2291 &mov ($_inp,$acc); # save inp
2292 &sub ($s2,16); # decrease len
2293 &mov ($_len,$s2); # save len
2294 &jnz (&label("fast_dec_in_place_loop"));
2295
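The in-place variant above cannot use that pointer trick, because decryption overwrites the very ciphertext block the next iteration needs as its IV; hence each block is first stashed in the stack-frame ivec slot. In C, roughly:

#include <string.h>
#include <openssl/aes.h>

static void
cbc_decrypt_in_place_sketch(unsigned char *buf, size_t len,
    const AES_KEY *key, unsigned char ivec[16])
{
	unsigned char tmp[16];
	size_t i;

	while (len >= 16) {
		memcpy(tmp, buf, 16);		/* save the ciphertext */
		AES_decrypt(buf, buf, key);
		for (i = 0; i < 16; i++)
			buf[i] ^= ivec[i];
		memcpy(ivec, tmp, 16);		/* it becomes the iv */
		buf += 16;
		len -= 16;
	}
}
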
2296 &set_label("fast_dec_out",4);
2297 &cmp ($mark,0); # was the key schedule copied?
2298 &mov ("edi",$_key);
2299 &je (&label("skip_dzero"));
2300 # zero copy of key schedule
2301 &mov ("ecx",240/4);
2302 &xor ("eax","eax");
2303 &align (4);
2304 &data_word(0xABF3F689); # rep stosd
2305 &set_label("skip_dzero");
2306 &mov ("esp",$_esp);
2307 &popf ();
2308 &function_end_A();
2309 &pushf (); # kludge, never executed
2310
2311#--------------------------- SLOW ROUTINE ---------------------------#
2312&set_label("slow_way",16);
2313
2314 &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
2315 &mov ($key,&wparam(3)); # load key
2316
2317 # pre-allocate aligned stack frame...
2318 &lea ($acc,&DWP(-80,"esp"));
2319 &and ($acc,-64);
2320
2321 # ... and make sure it doesn't alias with $key modulo 1024
2322 &lea ($s1,&DWP(-80-63,$key));
2323 &sub ($s1,$acc);
2324 &neg ($s1);
2325 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
2326 &sub ($acc,$s1);
2327
2328 # pick S-box copy which can't overlap with stack frame or $key
2329 &lea ($s1,&DWP(768,$acc));
2330 &sub ($s1,$tbl);
2331 &and ($s1,0x300);
2332 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
2333
2334 &lea ($s3,&wparam(0)); # pointer to parameter block
2335
2336 &exch ("esp",$acc);
2337 &add ("esp",4); # reserve for return address!
2338 &mov ($_tbl,$tbl); # save %ebp
2339 &mov ($_esp,$acc); # save %esp
2340 &mov ($_tmp,$s0); # save OPENSSL_ia32cap
2341
2342 &mov ($s0,&DWP(0,$s3)); # load inp
2343 &mov ($s1,&DWP(4,$s3)); # load out
2344 #&mov ($s2,&DWP(8,$s3)); # load len
2345 #&mov ($key,&DWP(12,$s3)); # load key
2346 &mov ($acc,&DWP(16,$s3)); # load ivp
2347 &mov ($s3,&DWP(20,$s3)); # load enc flag
2348
2349 &mov ($_inp,$s0); # save copy of inp
2350 &mov ($_out,$s1); # save copy of out
2351 &mov ($_len,$s2); # save copy of len
2352 &mov ($_key,$key); # save copy of key
2353 &mov ($_ivp,$acc); # save copy of ivp
2354
2355 &mov ($key,$acc);
2356 &mov ($acc,$s0);
2357
2358 &cmp ($s3,0);
2359 &je (&label("slow_decrypt"));
2360
2361#--------------------------- SLOW ENCRYPT ---------------------------#
2362 &cmp ($s2,16);
2363 &mov ($s3,$s1);
2364 &jb (&label("slow_enc_tail"));
2365
2366 if (!$x86only) {
2367 &bt ($_tmp,25); # check for SSE bit
2368 &jnc (&label("slow_enc_x86"));
2369
2370 &movq ("mm0",&QWP(0,$key)); # load iv
2371 &movq ("mm4",&QWP(8,$key));
2372
2373 &set_label("slow_enc_loop_sse",16);
2374 &pxor ("mm0",&QWP(0,$acc)); # xor input data
2375 &pxor ("mm4",&QWP(8,$acc));
2376
2377 &mov ($key,$_key);
2378 &call ("_sse_AES_encrypt_compact");
2379
2380 &mov ($acc,$_inp); # load inp
2381 &mov ($key,$_out); # load out
2382 &mov ($s2,$_len); # load len
2383
2384 &movq (&QWP(0,$key),"mm0"); # save output data
2385 &movq (&QWP(8,$key),"mm4");
2386
2387 &lea ($acc,&DWP(16,$acc)); # advance inp
2388 &mov ($_inp,$acc); # save inp
2389 &lea ($s3,&DWP(16,$key)); # advance out
2390 &mov ($_out,$s3); # save out
2391 &sub ($s2,16); # decrease len
2392 &cmp ($s2,16);
2393 &mov ($_len,$s2); # save len
2394 &jae (&label("slow_enc_loop_sse"));
2395 &test ($s2,15);
2396 &jnz (&label("slow_enc_tail"));
2397 &mov ($acc,$_ivp); # load ivp
2398 &movq (&QWP(0,$acc),"mm0"); # save ivec
2399 &movq (&QWP(8,$acc),"mm4");
2400 &emms ();
2401 &mov ("esp",$_esp);
2402 &popf ();
2403 &function_end_A();
2404 &pushf (); # kludge, never executed
2405 }
2406 &set_label("slow_enc_x86",16);
2407 &mov ($s0,&DWP(0,$key)); # load iv
2408 &mov ($s1,&DWP(4,$key));
2409
2410 &set_label("slow_enc_loop_x86",4);
2411 &mov ($s2,&DWP(8,$key));
2412 &mov ($s3,&DWP(12,$key));
2413
2414 &xor ($s0,&DWP(0,$acc)); # xor input data
2415 &xor ($s1,&DWP(4,$acc));
2416 &xor ($s2,&DWP(8,$acc));
2417 &xor ($s3,&DWP(12,$acc));
2418
2419 &mov ($key,$_key); # load key
2420 &call ("_x86_AES_encrypt_compact");
2421
2422 &mov ($acc,$_inp); # load inp
2423 &mov ($key,$_out); # load out
2424
2425 &mov (&DWP(0,$key),$s0); # save output data
2426 &mov (&DWP(4,$key),$s1);
2427 &mov (&DWP(8,$key),$s2);
2428 &mov (&DWP(12,$key),$s3);
2429
2430 &mov ($s2,$_len); # load len
2431 &lea ($acc,&DWP(16,$acc)); # advance inp
2432 &mov ($_inp,$acc); # save inp
2433 &lea ($s3,&DWP(16,$key)); # advance out
2434 &mov ($_out,$s3); # save out
2435 &sub ($s2,16); # decrease len
2436 &cmp ($s2,16);
2437 &mov ($_len,$s2); # save len
2438 &jae (&label("slow_enc_loop_x86"));
2439 &test ($s2,15);
2440 &jnz (&label("slow_enc_tail"));
2441 &mov ($acc,$_ivp); # load ivp
2442 &mov ($s2,&DWP(8,$key)); # restore last 2 dwords
2443 &mov ($s3,&DWP(12,$key));
2444 &mov (&DWP(0,$acc),$s0); # save ivec
2445 &mov (&DWP(4,$acc),$s1);
2446 &mov (&DWP(8,$acc),$s2);
2447 &mov (&DWP(12,$acc),$s3);
2448
2449 &mov ("esp",$_esp);
2450 &popf ();
2451 &function_end_A();
2452 &pushf (); # kludge, never executed
2453
2454 &set_label("slow_enc_tail",16);
2455 &emms () if (!$x86only);
2456 &mov ($key eq "edi"? $key:"",$s3); # load out to edi
2457 &mov ($s1,16);
2458 &sub ($s1,$s2);
2459 &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp
2460 &je (&label("enc_in_place"));
2461 &align (4);
2462 &data_word(0xA4F3F689); # rep movsb # copy input
2463 &jmp (&label("enc_skip_in_place"));
2464 &set_label("enc_in_place");
2465 &lea ($key,&DWP(0,$key,$s2));
2466 &set_label("enc_skip_in_place");
2467 &mov ($s2,$s1);
2468 &xor ($s0,$s0);
2469 &align (4);
2470 &data_word(0xAAF3F689); # rep stosb # zero tail
2471
2472 &mov ($key,$_ivp); # restore ivp
2473 &mov ($acc,$s3); # output as input
2474 &mov ($s0,&DWP(0,$key));
2475 &mov ($s1,&DWP(4,$key));
2476 &mov ($_len,16); # len=16
2477 &jmp (&label("slow_enc_loop_x86")); # one more spin...
2478
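The tail handling above reflects AES_cbc_encrypt's historical behaviour for a final partial block on the encrypt side: the leftover bytes are copied (or left in place), the block is zero-filled up to 16 bytes, and the loop is run exactly one more time. A sketch of just the padding step:

#include <string.h>

/* rem is the leftover byte count, 0 < rem < 16 */
static void
pad_tail(unsigned char block[16], const unsigned char *in, size_t rem)
{
	memcpy(block, in, rem);
	memset(block + rem, 0, 16 - rem);	/* zero the tail */
	/* ...then 'block' is encrypted exactly like a full block */
}
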
2479#--------------------------- SLOW DECRYPT ---------------------------#
2480&set_label("slow_decrypt",16);
2481 if (!$x86only) {
2482 &bt ($_tmp,25); # check for SSE bit
2483 &jnc (&label("slow_dec_loop_x86"));
2484
2485 &set_label("slow_dec_loop_sse",4);
2486 &movq ("mm0",&QWP(0,$acc)); # read input
2487 &movq ("mm4",&QWP(8,$acc));
2488
2489 &mov ($key,$_key);
2490 &call ("_sse_AES_decrypt_compact");
2491
2492 &mov ($acc,$_inp); # load inp
2493 &lea ($s0,$ivec);
2494 &mov ($s1,$_out); # load out
2495 &mov ($s2,$_len); # load len
2496 &mov ($key,$_ivp); # load ivp
2497
2498 &movq ("mm1",&QWP(0,$acc)); # re-read input
2499 &movq ("mm5",&QWP(8,$acc));
2500
2501 &pxor ("mm0",&QWP(0,$key)); # xor iv
2502 &pxor ("mm4",&QWP(8,$key));
2503
2504 &movq (&QWP(0,$key),"mm1"); # copy input to iv
2505 &movq (&QWP(8,$key),"mm5");
2506
2507 &sub ($s2,16); # decrease len
2508 &jc (&label("slow_dec_partial_sse"));
2509
2510 &movq (&QWP(0,$s1),"mm0"); # write output
2511 &movq (&QWP(8,$s1),"mm4");
2512
2513 &lea ($s1,&DWP(16,$s1)); # advance out
2514 &mov ($_out,$s1); # save out
2515 &lea ($acc,&DWP(16,$acc)); # advance inp
2516 &mov ($_inp,$acc); # save inp
2517 &mov ($_len,$s2); # save len
2518 &jnz (&label("slow_dec_loop_sse"));
2519 &emms ();
2520 &mov ("esp",$_esp);
2521 &popf ();
2522 &function_end_A();
2523 &pushf (); # kludge, never executed
2524
2525 &set_label("slow_dec_partial_sse",16);
2526 &movq (&QWP(0,$s0),"mm0"); # save output to temp
2527 &movq (&QWP(8,$s0),"mm4");
2528 &emms ();
2529
2530 &add ($s2 eq "ecx" ? "ecx":"",16);
2531 &mov ("edi",$s1); # out
2532 &mov ("esi",$s0); # temp
2533 &align (4);
2534 &data_word(0xA4F3F689); # rep movsb # copy partial output
2535
2536 &mov ("esp",$_esp);
2537 &popf ();
2538 &function_end_A();
2539 &pushf (); # kludge, never executed
2540 }
2541 &set_label("slow_dec_loop_x86",16);
2542 &mov ($s0,&DWP(0,$acc)); # read input
2543 &mov ($s1,&DWP(4,$acc));
2544 &mov ($s2,&DWP(8,$acc));
2545 &mov ($s3,&DWP(12,$acc));
2546
2547 &lea ($key,$ivec);
2548 &mov (&DWP(0,$key),$s0); # copy to temp
2549 &mov (&DWP(4,$key),$s1);
2550 &mov (&DWP(8,$key),$s2);
2551 &mov (&DWP(12,$key),$s3);
2552
2553 &mov ($key,$_key); # load key
2554 &call ("_x86_AES_decrypt_compact");
2555
2556 &mov ($key,$_ivp); # load ivp
2557 &mov ($acc,$_len); # load len
2558 &xor ($s0,&DWP(0,$key)); # xor iv
2559 &xor ($s1,&DWP(4,$key));
2560 &xor ($s2,&DWP(8,$key));
2561 &xor ($s3,&DWP(12,$key));
2562
2563 &sub ($acc,16);
2564 &jc (&label("slow_dec_partial_x86"));
2565
2566 &mov ($_len,$acc); # save len
2567 &mov ($acc,$_out); # load out
2568
2569 &mov (&DWP(0,$acc),$s0); # write output
2570 &mov (&DWP(4,$acc),$s1);
2571 &mov (&DWP(8,$acc),$s2);
2572 &mov (&DWP(12,$acc),$s3);
2573
2574 &lea ($acc,&DWP(16,$acc)); # advance out
2575 &mov ($_out,$acc); # save out
2576
2577 &lea ($acc,$ivec);
2578 &mov ($s0,&DWP(0,$acc)); # read temp
2579 &mov ($s1,&DWP(4,$acc));
2580 &mov ($s2,&DWP(8,$acc));
2581 &mov ($s3,&DWP(12,$acc));
2582
2583 &mov (&DWP(0,$key),$s0); # copy it to iv
2584 &mov (&DWP(4,$key),$s1);
2585 &mov (&DWP(8,$key),$s2);
2586 &mov (&DWP(12,$key),$s3);
2587
2588 &mov ($acc,$_inp); # load inp
2589 &lea ($acc,&DWP(16,$acc)); # advance inp
2590 &mov ($_inp,$acc); # save inp
2591 &jnz (&label("slow_dec_loop_x86"));
2592 &mov ("esp",$_esp);
2593 &popf ();
2594 &function_end_A();
2595 &pushf (); # kludge, never executed
2596
2597 &set_label("slow_dec_partial_x86",16);
2598 &lea ($acc,$ivec);
2599 &mov (&DWP(0,$acc),$s0); # save output to temp
2600 &mov (&DWP(4,$acc),$s1);
2601 &mov (&DWP(8,$acc),$s2);
2602 &mov (&DWP(12,$acc),$s3);
2603
2604 &mov ($acc,$_inp);
2605 &mov ($s0,&DWP(0,$acc)); # re-read input
2606 &mov ($s1,&DWP(4,$acc));
2607 &mov ($s2,&DWP(8,$acc));
2608 &mov ($s3,&DWP(12,$acc));
2609
2610 &mov (&DWP(0,$key),$s0); # copy it to iv
2611 &mov (&DWP(4,$key),$s1);
2612 &mov (&DWP(8,$key),$s2);
2613 &mov (&DWP(12,$key),$s3);
2614
2615 &mov ("ecx",$_len);
2616 &mov ("edi",$_out);
2617 &lea ("esi",$ivec);
2618 &align (4);
2619 &data_word(0xA4F3F689); # rep movsb # copy partial output
2620
2621 &mov ("esp",$_esp);
2622 &popf ();
2623&function_end("AES_cbc_encrypt");
2624}
2625
2626#------------------------------------------------------------------#
2627
2628sub enckey()
2629{
2630 &movz ("esi",&LB("edx")); # rk[i]>>0
2631 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2632 &movz ("esi",&HB("edx")); # rk[i]>>8
2633 &shl ("ebx",24);
2634 &xor ("eax","ebx");
2635
2636 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2637 &shr ("edx",16);
2638 &movz ("esi",&LB("edx")); # rk[i]>>16
2639 &xor ("eax","ebx");
2640
2641 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2642 &movz ("esi",&HB("edx")); # rk[i]>>24
2643 &shl ("ebx",8);
2644 &xor ("eax","ebx");
2645
2646 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2647 &shl ("ebx",16);
2648 &xor ("eax","ebx");
2649
2650 &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon
2651}
2652
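enckey is the key-expansion core transform: it rotates the previous round-key word, substitutes every byte through the 256-byte S-box the code indexes at $tbl-128, and XORs in the round constant, all in this file's little-endian word convention. A C sketch of the word transform, with the S-box passed in explicitly for illustration:

#include <stdint.h>

static uint32_t
enckey_word(const uint8_t sbox[256], uint32_t w, uint32_t rcon)
{
	uint32_t r = (w >> 8) | (w << 24);	/* RotWord, LE words */

	return ((uint32_t)sbox[r & 0xff] ^
	    ((uint32_t)sbox[(r >> 8) & 0xff] << 8) ^
	    ((uint32_t)sbox[(r >> 16) & 0xff] << 16) ^
	    ((uint32_t)sbox[(r >> 24) & 0xff] << 24) ^
	    rcon);
}

The 128-bit schedule below then amounts to rk[4] = rk[0] ^ enckey_word(sbox, rk[3], rcon[i]), rk[5] = rk[4] ^ rk[1], and so on, which is the chain of xors in the 10loop.
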
2653&function_begin("_x86_AES_set_encrypt_key");
2654 &mov ("esi",&wparam(1)); # user supplied key
2655 &mov ("edi",&wparam(3)); # private key schedule
2656
2657 &test ("esi",-1);
2658 &jz (&label("badpointer"));
2659 &test ("edi",-1);
2660 &jz (&label("badpointer"));
2661
2662 &call (&label("pic_point"));
2663 &set_label("pic_point");
2664 &blindpop($tbl);
2665 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
2666 &lea ($tbl,&DWP(2048+128,$tbl));
2667
2668 # prefetch Te4
2669 &mov ("eax",&DWP(0-128,$tbl));
2670 &mov ("ebx",&DWP(32-128,$tbl));
2671 &mov ("ecx",&DWP(64-128,$tbl));
2672 &mov ("edx",&DWP(96-128,$tbl));
2673 &mov ("eax",&DWP(128-128,$tbl));
2674 &mov ("ebx",&DWP(160-128,$tbl));
2675 &mov ("ecx",&DWP(192-128,$tbl));
2676 &mov ("edx",&DWP(224-128,$tbl));
2677
2678 &mov ("ecx",&wparam(2)); # number of bits in key
2679 &cmp ("ecx",128);
2680 &je (&label("10rounds"));
2681 &cmp ("ecx",192);
2682 &je (&label("12rounds"));
2683 &cmp ("ecx",256);
2684 &je (&label("14rounds"));
2685 &mov ("eax",-2); # invalid number of bits
2686 &jmp (&label("exit"));
2687
2688 &set_label("10rounds");
2689 &mov ("eax",&DWP(0,"esi")); # copy first 4 dwords
2690 &mov ("ebx",&DWP(4,"esi"));
2691 &mov ("ecx",&DWP(8,"esi"));
2692 &mov ("edx",&DWP(12,"esi"));
2693 &mov (&DWP(0,"edi"),"eax");
2694 &mov (&DWP(4,"edi"),"ebx");
2695 &mov (&DWP(8,"edi"),"ecx");
2696 &mov (&DWP(12,"edi"),"edx");
2697
2698 &xor ("ecx","ecx");
2699 &jmp (&label("10shortcut"));
2700
2701 &align (4);
2702 &set_label("10loop");
2703 &mov ("eax",&DWP(0,"edi")); # rk[0]
2704 &mov ("edx",&DWP(12,"edi")); # rk[3]
2705 &set_label("10shortcut");
2706 &enckey ();
2707
2708 &mov (&DWP(16,"edi"),"eax"); # rk[4]
2709 &xor ("eax",&DWP(4,"edi"));
2710 &mov (&DWP(20,"edi"),"eax"); # rk[5]
2711 &xor ("eax",&DWP(8,"edi"));
2712 &mov (&DWP(24,"edi"),"eax"); # rk[6]
2713 &xor ("eax",&DWP(12,"edi"));
2714 &mov (&DWP(28,"edi"),"eax"); # rk[7]
2715 &inc ("ecx");
2716 &add ("edi",16);
2717 &cmp ("ecx",10);
2718 &jl (&label("10loop"));
2719
2720 &mov (&DWP(80,"edi"),10); # setup number of rounds
2721 &xor ("eax","eax");
2722 &jmp (&label("exit"));
2723
2724 &set_label("12rounds");
2725 &mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
2726 &mov ("ebx",&DWP(4,"esi"));
2727 &mov ("ecx",&DWP(8,"esi"));
2728 &mov ("edx",&DWP(12,"esi"));
2729 &mov (&DWP(0,"edi"),"eax");
2730 &mov (&DWP(4,"edi"),"ebx");
2731 &mov (&DWP(8,"edi"),"ecx");
2732 &mov (&DWP(12,"edi"),"edx");
2733 &mov ("ecx",&DWP(16,"esi"));
2734 &mov ("edx",&DWP(20,"esi"));
2735 &mov (&DWP(16,"edi"),"ecx");
2736 &mov (&DWP(20,"edi"),"edx");
2737
2738 &xor ("ecx","ecx");
2739 &jmp (&label("12shortcut"));
2740
2741 &align (4);
2742 &set_label("12loop");
2743 &mov ("eax",&DWP(0,"edi")); # rk[0]
2744 &mov ("edx",&DWP(20,"edi")); # rk[5]
2745 &set_label("12shortcut");
2746 &enckey ();
2747
2748 &mov (&DWP(24,"edi"),"eax"); # rk[6]
2749 &xor ("eax",&DWP(4,"edi"));
2750 &mov (&DWP(28,"edi"),"eax"); # rk[7]
2751 &xor ("eax",&DWP(8,"edi"));
2752 &mov (&DWP(32,"edi"),"eax"); # rk[8]
2753 &xor ("eax",&DWP(12,"edi"));
2754 &mov (&DWP(36,"edi"),"eax"); # rk[9]
2755
2756 &cmp ("ecx",7);
2757 &je (&label("12break"));
2758 &inc ("ecx");
2759
2760 &xor ("eax",&DWP(16,"edi"));
2761 &mov (&DWP(40,"edi"),"eax"); # rk[10]
2762 &xor ("eax",&DWP(20,"edi"));
2763 &mov (&DWP(44,"edi"),"eax"); # rk[11]
2764
2765 &add ("edi",24);
2766 &jmp (&label("12loop"));
2767
2768 &set_label("12break");
2769 &mov (&DWP(72,"edi"),12); # setup number of rounds
2770 &xor ("eax","eax");
2771 &jmp (&label("exit"));
2772
2773 &set_label("14rounds");
2774 &mov ("eax",&DWP(0,"esi")); # copy first 8 dwords
2775 &mov ("ebx",&DWP(4,"esi"));
2776 &mov ("ecx",&DWP(8,"esi"));
2777 &mov ("edx",&DWP(12,"esi"));
2778 &mov (&DWP(0,"edi"),"eax");
2779 &mov (&DWP(4,"edi"),"ebx");
2780 &mov (&DWP(8,"edi"),"ecx");
2781 &mov (&DWP(12,"edi"),"edx");
2782 &mov ("eax",&DWP(16,"esi"));
2783 &mov ("ebx",&DWP(20,"esi"));
2784 &mov ("ecx",&DWP(24,"esi"));
2785 &mov ("edx",&DWP(28,"esi"));
2786 &mov (&DWP(16,"edi"),"eax");
2787 &mov (&DWP(20,"edi"),"ebx");
2788 &mov (&DWP(24,"edi"),"ecx");
2789 &mov (&DWP(28,"edi"),"edx");
2790
2791 &xor ("ecx","ecx");
2792 &jmp (&label("14shortcut"));
2793
2794 &align (4);
2795 &set_label("14loop");
2796 &mov ("edx",&DWP(28,"edi")); # rk[7]
2797 &set_label("14shortcut");
2798 &mov ("eax",&DWP(0,"edi")); # rk[0]
2799
2800 &enckey ();
2801
2802 &mov (&DWP(32,"edi"),"eax"); # rk[8]
2803 &xor ("eax",&DWP(4,"edi"));
2804 &mov (&DWP(36,"edi"),"eax"); # rk[9]
2805 &xor ("eax",&DWP(8,"edi"));
2806 &mov (&DWP(40,"edi"),"eax"); # rk[10]
2807 &xor ("eax",&DWP(12,"edi"));
2808 &mov (&DWP(44,"edi"),"eax"); # rk[11]
2809
2810 &cmp ("ecx",6);
2811 &je (&label("14break"));
2812 &inc ("ecx");
2813
2814 &mov ("edx","eax");
2815 &mov ("eax",&DWP(16,"edi")); # rk[4]
2816 &movz ("esi",&LB("edx")); # rk[11]>>0
2817 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2818 &movz ("esi",&HB("edx")); # rk[11]>>8
2819 &xor ("eax","ebx");
2820
2821 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2822 &shr ("edx",16);
2823 &shl ("ebx",8);
2824 &movz ("esi",&LB("edx")); # rk[11]>>16
2825 &xor ("eax","ebx");
2826
2827 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2828 &movz ("esi",&HB("edx")); # rk[11]>>24
2829 &shl ("ebx",16);
2830 &xor ("eax","ebx");
2831
2832 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2833 &shl ("ebx",24);
2834 &xor ("eax","ebx");
2835
2836 &mov (&DWP(48,"edi"),"eax"); # rk[12]
2837 &xor ("eax",&DWP(20,"edi"));
2838 &mov (&DWP(52,"edi"),"eax"); # rk[13]
2839 &xor ("eax",&DWP(24,"edi"));
2840 &mov (&DWP(56,"edi"),"eax"); # rk[14]
2841 &xor ("eax",&DWP(28,"edi"));
2842 &mov (&DWP(60,"edi"),"eax"); # rk[15]
2843
2844 &add ("edi",32);
2845 &jmp (&label("14loop"));
2846
2847 &set_label("14break");
2848 &mov (&DWP(48,"edi"),14); # setup number of rounds
2849 &xor ("eax","eax");
2850 &jmp (&label("exit"));
2851
2852 &set_label("badpointer");
2853 &mov ("eax",-1);
2854 &set_label("exit");
2855&function_end("_x86_AES_set_encrypt_key");
2856
2857# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
2858# AES_KEY *key)
2859&function_begin_B("AES_set_encrypt_key");
2860 &call ("_x86_AES_set_encrypt_key");
2861 &ret ();
2862&function_end_B("AES_set_encrypt_key");
2863
2864sub deckey()
2865{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
2866 my $tmp = $tbl;
2867
2868 &mov ($acc,$tp1);
2869 &and ($acc,0x80808080);
2870 &mov ($tmp,$acc);
2871 &shr ($tmp,7);
2872 &lea ($tp2,&DWP(0,$tp1,$tp1));
2873 &sub ($acc,$tmp);
2874 &and ($tp2,0xfefefefe);
2875 &and ($acc,0x1b1b1b1b);
2876 &xor ($acc,$tp2);
2877 &mov ($tp2,$acc);
2878
2879 &and ($acc,0x80808080);
2880 &mov ($tmp,$acc);
2881 &shr ($tmp,7);
2882 &lea ($tp4,&DWP(0,$tp2,$tp2));
2883 &sub ($acc,$tmp);
2884 &and ($tp4,0xfefefefe);
2885 &and ($acc,0x1b1b1b1b);
2886 &xor ($tp2,$tp1); # tp2^tp1
2887 &xor ($acc,$tp4);
2888 &mov ($tp4,$acc);
2889
2890 &and ($acc,0x80808080);
2891 &mov ($tmp,$acc);
2892 &shr ($tmp,7);
2893 &lea ($tp8,&DWP(0,$tp4,$tp4));
2894 &xor ($tp4,$tp1); # tp4^tp1
2895 &sub ($acc,$tmp);
2896 &and ($tp8,0xfefefefe);
2897 &and ($acc,0x1b1b1b1b);
2898 &rotl ($tp1,8); # = ROTATE(tp1,8)
2899 &xor ($tp8,$acc);
2900
2901 &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load
2902
2903 &xor ($tp1,$tp2);
2904 &xor ($tp2,$tp8);
2905 &xor ($tp1,$tp4);
2906 &rotl ($tp2,24);
2907 &xor ($tp4,$tp8);
2908 &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
2909 &rotl ($tp4,16);
2910 &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
2911 &rotl ($tp8,8);
2912 &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
2913 &mov ($tp2,$tmp);
2914 &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
2915
2916 &mov (&DWP(4*$i,$key),$tp1);
2917}
2918
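deckey applies InvMixColumns to one round-key word using three parallel GF(2^8) doublings ("xtime") across all four bytes at once; the 0x80808080/0xfefefefe/0x1b1b1b1b masks implement the conditional reduction by the AES polynomial 0x11b. A C sketch of the same computation (helper names are illustrative):

#include <stdint.h>

static uint32_t
rotl32(uint32_t v, int n)
{
	return ((v << n) | (v >> (32 - n)));
}

/* double four GF(2^8) bytes in parallel, reducing by 0x11b */
static uint32_t
xtime4(uint32_t x)
{
	uint32_t hi = x & 0x80808080;

	return (((x & 0x7f7f7f7f) << 1) ^ ((hi - (hi >> 7)) & 0x1b1b1b1b));
}

static uint32_t
inv_mix_column(uint32_t x)
{
	uint32_t x2 = xtime4(x);	/* 2*x */
	uint32_t x4 = xtime4(x2);	/* 4*x */
	uint32_t x8 = xtime4(x4);	/* 8*x */
	uint32_t x9 = x8 ^ x;		/* 9*x */
	uint32_t xb = x8 ^ x2 ^ x;	/* 0x0b*x */
	uint32_t xd = x8 ^ x4 ^ x;	/* 0x0d*x */
	uint32_t xe = x8 ^ x4 ^ x2;	/* 0x0e*x */

	return (xe ^ rotl32(x9, 8) ^ rotl32(xd, 16) ^ rotl32(xb, 24));
}
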
2919# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
2920# AES_KEY *key)
2921&function_begin_B("AES_set_decrypt_key");
2922 &call ("_x86_AES_set_encrypt_key");
2923 &cmp ("eax",0);
2924 &je (&label("proceed"));
2925 &ret ();
2926
2927 &set_label("proceed");
2928 &push ("ebp");
2929 &push ("ebx");
2930 &push ("esi");
2931 &push ("edi");
2932
2933 &mov ("esi",&wparam(2));
2934 &mov ("ecx",&DWP(240,"esi")); # pull number of rounds
2935 &lea ("ecx",&DWP(0,"","ecx",4));
2936 &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk
2937
2938 &set_label("invert",4); # invert order of chunks
2939 &mov ("eax",&DWP(0,"esi"));
2940 &mov ("ebx",&DWP(4,"esi"));
2941 &mov ("ecx",&DWP(0,"edi"));
2942 &mov ("edx",&DWP(4,"edi"));
2943 &mov (&DWP(0,"edi"),"eax");
2944 &mov (&DWP(4,"edi"),"ebx");
2945 &mov (&DWP(0,"esi"),"ecx");
2946 &mov (&DWP(4,"esi"),"edx");
2947 &mov ("eax",&DWP(8,"esi"));
2948 &mov ("ebx",&DWP(12,"esi"));
2949 &mov ("ecx",&DWP(8,"edi"));
2950 &mov ("edx",&DWP(12,"edi"));
2951 &mov (&DWP(8,"edi"),"eax");
2952 &mov (&DWP(12,"edi"),"ebx");
2953 &mov (&DWP(8,"esi"),"ecx");
2954 &mov (&DWP(12,"esi"),"edx");
2955 &add ("esi",16);
2956 &sub ("edi",16);
2957 &cmp ("esi","edi");
2958 &jne (&label("invert"));
2959
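Putting the two halves of AES_set_decrypt_key together: the routine expands the encryption schedule, reverses the order of the 16-byte round keys in place (the invert loop above), then runs InvMixColumns over every round key except the first and last. A sketch in C, reusing inv_mix_column from the deckey sketch earlier (the rd_key/rounds fields are the AES_KEY layout from aes.h):

#include <stdint.h>
#include <openssl/aes.h>

uint32_t inv_mix_column(uint32_t);	/* from the deckey sketch above */

static int
set_decrypt_key_sketch(const unsigned char *userKey, int bits, AES_KEY *key)
{
	int i, j, k;

	if (AES_set_encrypt_key(userKey, bits, key) != 0)
		return (-1);

	/* invert the order of the 16-byte round-key chunks */
	for (i = 0, j = 4 * key->rounds; i < j; i += 4, j -= 4) {
		for (k = 0; k < 4; k++) {
			unsigned int t = key->rd_key[i + k];

			key->rd_key[i + k] = key->rd_key[j + k];
			key->rd_key[j + k] = t;
		}
	}

	/* InvMixColumns over all but the first and last round keys */
	for (i = 4; i < 4 * key->rounds; i++)
		key->rd_key[i] = inv_mix_column(key->rd_key[i]);

	return (0);
}
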
2960 &mov ($key,&wparam(2));
2961 &mov ($acc,&DWP(240,$key)); # pull number of rounds
2962 &lea ($acc,&DWP(-2,$acc,$acc));
2963 &lea ($acc,&DWP(0,$key,$acc,8));
2964 &mov (&wparam(2),$acc);
2965
2966 &mov ($s0,&DWP(16,$key)); # modulo-scheduled load
2967 &set_label("permute",4); # permute the key schedule
2968 &add ($key,16);
2969 &deckey (0,$key,$s0,$s1,$s2,$s3);
2970 &deckey (1,$key,$s1,$s2,$s3,$s0);
2971 &deckey (2,$key,$s2,$s3,$s0,$s1);
2972 &deckey (3,$key,$s3,$s0,$s1,$s2);
2973 &cmp ($key,&wparam(2));
2974 &jb (&label("permute"));
2975
2976 &xor ("eax","eax"); # return success
2977&function_end("AES_set_decrypt_key");
2978&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
2979
2980&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl
deleted file mode 100644
index 717cc1ed7f..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-armv4.pl
+++ /dev/null
@@ -1,1134 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for ARMv4
11
12# January 2007.
13#
14# Code uses a single 1KB S-box and is >2 times faster than code
15# generated by gcc-3.4.1. This is thanks to a unique feature of the
16# ARMv4 ISA, which allows a logical or arithmetic operation to be
17# merged with a shift or rotate in one instruction, emitting the
18# combined result every cycle. The module is endian-neutral. The
19# performance is ~42 cycles/byte for a 128-bit key [on a single-issue Xscale PXA250 core].
20
21# May 2007.
22#
23# AES_set_[en|de]crypt_key is added.
24
25# July 2010.
26#
27# Rescheduling for the dual-issue pipeline resulted in a 12% improvement
28# on the Cortex A8 core, at ~25 cycles per byte processed with a 128-bit key.
29
30# February 2011.
31#
32# Profiler-assisted and platform-specific optimization resulted in a 16%
33# improvement on the Cortex A8 core, at ~21.5 cycles per byte.
34
35while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
36open STDOUT,">$output";
37
38$s0="r0";
39$s1="r1";
40$s2="r2";
41$s3="r3";
42$t1="r4";
43$t2="r5";
44$t3="r6";
45$i1="r7";
46$i2="r8";
47$i3="r9";
48
49$tbl="r10";
50$key="r11";
51$rounds="r12";
52
53$code=<<___;
54#include "arm_arch.h"
55.text
56.code 32
57
58.type AES_Te,%object
59.align 5
60AES_Te:
61.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
62.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
63.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
64.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
65.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
66.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
67.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
68.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
69.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
70.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
71.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
72.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
73.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
74.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
75.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
76.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
77.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
78.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
79.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
80.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
81.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
82.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
83.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
84.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
85.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
86.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
87.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
88.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
89.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
90.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
91.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
92.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
93.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
94.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
95.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
96.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
97.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
98.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
99.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
100.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
101.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
102.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
103.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
104.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
105.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
106.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
107.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
108.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
109.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
110.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
111.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
112.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
113.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
114.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
115.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
116.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
117.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
118.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
119.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
120.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
121.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
122.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
123.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
124.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
125@ Te4[256]
126.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
127.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
128.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
129.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
130.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
131.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
132.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
133.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
134.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
135.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
136.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
137.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
138.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
139.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
140.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
141.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
142.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
143.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
144.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
145.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
146.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
147.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
148.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
149.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
150.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
151.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
152.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
153.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
154.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
155.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
156.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
157.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
158@ rcon[]
159.word 0x01000000, 0x02000000, 0x04000000, 0x08000000
160.word 0x10000000, 0x20000000, 0x40000000, 0x80000000
161.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
162.size AES_Te,.-AES_Te
163
164@ void AES_encrypt(const unsigned char *in, unsigned char *out,
165@ const AES_KEY *key) {
166.global AES_encrypt
167.type AES_encrypt,%function
168.align 5
169AES_encrypt:
170 sub r3,pc,#8 @ AES_encrypt
171 stmdb sp!,{r1,r4-r12,lr}
172 mov $rounds,r0 @ inp
173 mov $key,r2
174 sub $tbl,r3,#AES_encrypt-AES_Te @ Te
175#if __ARM_ARCH__<7
176 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
177 ldrb $t1,[$rounds,#2] @ manner...
178 ldrb $t2,[$rounds,#1]
179 ldrb $t3,[$rounds,#0]
180 orr $s0,$s0,$t1,lsl#8
181 ldrb $s1,[$rounds,#7]
182 orr $s0,$s0,$t2,lsl#16
183 ldrb $t1,[$rounds,#6]
184 orr $s0,$s0,$t3,lsl#24
185 ldrb $t2,[$rounds,#5]
186 ldrb $t3,[$rounds,#4]
187 orr $s1,$s1,$t1,lsl#8
188 ldrb $s2,[$rounds,#11]
189 orr $s1,$s1,$t2,lsl#16
190 ldrb $t1,[$rounds,#10]
191 orr $s1,$s1,$t3,lsl#24
192 ldrb $t2,[$rounds,#9]
193 ldrb $t3,[$rounds,#8]
194 orr $s2,$s2,$t1,lsl#8
195 ldrb $s3,[$rounds,#15]
196 orr $s2,$s2,$t2,lsl#16
197 ldrb $t1,[$rounds,#14]
198 orr $s2,$s2,$t3,lsl#24
199 ldrb $t2,[$rounds,#13]
200 ldrb $t3,[$rounds,#12]
201 orr $s3,$s3,$t1,lsl#8
202 orr $s3,$s3,$t2,lsl#16
203 orr $s3,$s3,$t3,lsl#24
204#else
205 ldr $s0,[$rounds,#0]
206 ldr $s1,[$rounds,#4]
207 ldr $s2,[$rounds,#8]
208 ldr $s3,[$rounds,#12]
209#ifdef __ARMEL__
210 rev $s0,$s0
211 rev $s1,$s1
212 rev $s2,$s2
213 rev $s3,$s3
214#endif
215#endif
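The #if __ARM_ARCH__<7 path above assembles each 32-bit state word byte by byte, so the routine tolerates unaligned input and works unchanged on big- and little-endian cores; ARMv7 and later can instead do a word load plus 'rev'. The byte order being built is a big-endian load, i.e. in C:

#include <stdint.h>

static uint32_t
load_be32(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 |
	    (uint32_t)p[2] << 8 | (uint32_t)p[3]);
}
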
216 bl _armv4_AES_encrypt
217
218 ldr $rounds,[sp],#4 @ pop out
219#if __ARM_ARCH__>=7
220#ifdef __ARMEL__
221 rev $s0,$s0
222 rev $s1,$s1
223 rev $s2,$s2
224 rev $s3,$s3
225#endif
226 str $s0,[$rounds,#0]
227 str $s1,[$rounds,#4]
228 str $s2,[$rounds,#8]
229 str $s3,[$rounds,#12]
230#else
231 mov $t1,$s0,lsr#24 @ write output in endian-neutral
232 mov $t2,$s0,lsr#16 @ manner...
233 mov $t3,$s0,lsr#8
234 strb $t1,[$rounds,#0]
235 strb $t2,[$rounds,#1]
236 mov $t1,$s1,lsr#24
237 strb $t3,[$rounds,#2]
238 mov $t2,$s1,lsr#16
239 strb $s0,[$rounds,#3]
240 mov $t3,$s1,lsr#8
241 strb $t1,[$rounds,#4]
242 strb $t2,[$rounds,#5]
243 mov $t1,$s2,lsr#24
244 strb $t3,[$rounds,#6]
245 mov $t2,$s2,lsr#16
246 strb $s1,[$rounds,#7]
247 mov $t3,$s2,lsr#8
248 strb $t1,[$rounds,#8]
249 strb $t2,[$rounds,#9]
250 mov $t1,$s3,lsr#24
251 strb $t3,[$rounds,#10]
252 mov $t2,$s3,lsr#16
253 strb $s2,[$rounds,#11]
254 mov $t3,$s3,lsr#8
255 strb $t1,[$rounds,#12]
256 strb $t2,[$rounds,#13]
257 strb $t3,[$rounds,#14]
258 strb $s3,[$rounds,#15]
259#endif
260#if __ARM_ARCH__>=5
261 ldmia sp!,{r4-r12,pc}
262#else
263 ldmia sp!,{r4-r12,lr}
264 tst lr,#1
265 moveq pc,lr @ be binary compatible with V4, yet
266 bx lr @ interoperable with Thumb ISA:-)
267#endif
268.size AES_encrypt,.-AES_encrypt
269
270.type _armv4_AES_encrypt,%function
271.align 2
272_armv4_AES_encrypt:
273 str lr,[sp,#-4]! @ push lr
274 ldmia $key!,{$t1-$i1}
275 eor $s0,$s0,$t1
276 ldr $rounds,[$key,#240-16]
277 eor $s1,$s1,$t2
278 eor $s2,$s2,$t3
279 eor $s3,$s3,$i1
280 sub $rounds,$rounds,#1
281 mov lr,#255
282
283 and $i1,lr,$s0
284 and $i2,lr,$s0,lsr#8
285 and $i3,lr,$s0,lsr#16
286 mov $s0,$s0,lsr#24
287.Lenc_loop:
288 ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0]
289 and $i1,lr,$s1,lsr#16 @ i0
290 ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
291 and $i2,lr,$s1
292 ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
293 and $i3,lr,$s1,lsr#8
294 ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
295 mov $s1,$s1,lsr#24
296
297 ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16]
298 ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0]
299 ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8]
300 eor $s0,$s0,$i1,ror#8
301 ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
302 and $i1,lr,$s2,lsr#8 @ i0
303 eor $t2,$t2,$i2,ror#8
304 and $i2,lr,$s2,lsr#16 @ i1
305 eor $t3,$t3,$i3,ror#8
306 and $i3,lr,$s2
307 ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
308 eor $s1,$s1,$t1,ror#24
309 ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
310 mov $s2,$s2,lsr#24
311
312 ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
313 eor $s0,$s0,$i1,ror#16
314 ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
315 and $i1,lr,$s3 @ i0
316 eor $s1,$s1,$i2,ror#8
317 and $i2,lr,$s3,lsr#8 @ i1
318 eor $t3,$t3,$i3,ror#16
319 and $i3,lr,$s3,lsr#16 @ i2
320 ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
321 eor $s2,$s2,$t2,ror#16
322 ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
323 mov $s3,$s3,lsr#24
324
325 ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
326 eor $s0,$s0,$i1,ror#24
327 ldr $i1,[$key],#16
328 eor $s1,$s1,$i2,ror#16
329 ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
330 eor $s2,$s2,$i3,ror#8
331 ldr $t1,[$key,#-12]
332 eor $s3,$s3,$t3,ror#8
333
334 ldr $t2,[$key,#-8]
335 eor $s0,$s0,$i1
336 ldr $t3,[$key,#-4]
337 and $i1,lr,$s0
338 eor $s1,$s1,$t1
339 and $i2,lr,$s0,lsr#8
340 eor $s2,$s2,$t2
341 and $i3,lr,$s0,lsr#16
342 eor $s3,$s3,$t3
343 mov $s0,$s0,lsr#24
344
345 subs $rounds,$rounds,#1
346 bne .Lenc_loop
347
348 add $tbl,$tbl,#2
349
350 ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0]
351 and $i1,lr,$s1,lsr#16 @ i0
352 ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
353 and $i2,lr,$s1
354 ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
355 and $i3,lr,$s1,lsr#8
356 ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
357 mov $s1,$s1,lsr#24
358
359 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16]
360 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0]
361 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8]
362 eor $s0,$i1,$s0,lsl#8
363 ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
364 and $i1,lr,$s2,lsr#8 @ i0
365 eor $t2,$i2,$t2,lsl#8
366 and $i2,lr,$s2,lsr#16 @ i1
367 eor $t3,$i3,$t3,lsl#8
368 and $i3,lr,$s2
369 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
370 eor $s1,$t1,$s1,lsl#24
371 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
372 mov $s2,$s2,lsr#24
373
374 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
375 eor $s0,$i1,$s0,lsl#8
376 ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
377 and $i1,lr,$s3 @ i0
378 eor $s1,$s1,$i2,lsl#16
379 and $i2,lr,$s3,lsr#8 @ i1
380 eor $t3,$i3,$t3,lsl#8
381 and $i3,lr,$s3,lsr#16 @ i2
382 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
383 eor $s2,$t2,$s2,lsl#24
384 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
385 mov $s3,$s3,lsr#24
386
387 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
388 eor $s0,$i1,$s0,lsl#8
389 ldr $i1,[$key,#0]
390 ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
391 eor $s1,$s1,$i2,lsl#8
392 ldr $t1,[$key,#4]
393 eor $s2,$s2,$i3,lsl#16
394 ldr $t2,[$key,#8]
395 eor $s3,$t3,$s3,lsl#24
396 ldr $t3,[$key,#12]
397
398 eor $s0,$s0,$i1
399 eor $s1,$s1,$t1
400 eor $s2,$s2,$t2
401 eor $s3,$s3,$t3
402
403 sub $tbl,$tbl,#2
404 ldr pc,[sp],#4 @ pop and return
405.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
406
407.global AES_set_encrypt_key
408.type AES_set_encrypt_key,%function
409.align 5
410AES_set_encrypt_key:
411_armv4_AES_set_encrypt_key:
412 sub r3,pc,#8 @ AES_set_encrypt_key
413 teq r0,#0
414 moveq r0,#-1
415 beq .Labrt
416 teq r2,#0
417 moveq r0,#-1
418 beq .Labrt
419
420 teq r1,#128
421 beq .Lok
422 teq r1,#192
423 beq .Lok
424 teq r1,#256
425 movne r0,#-1
426 bne .Labrt
427
428.Lok: stmdb sp!,{r4-r12,lr}
429 sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
430
431 mov $rounds,r0 @ inp
432 mov lr,r1 @ bits
433 mov $key,r2 @ key
434
435#if __ARM_ARCH__<7
436 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
437 ldrb $t1,[$rounds,#2] @ manner...
438 ldrb $t2,[$rounds,#1]
439 ldrb $t3,[$rounds,#0]
440 orr $s0,$s0,$t1,lsl#8
441 ldrb $s1,[$rounds,#7]
442 orr $s0,$s0,$t2,lsl#16
443 ldrb $t1,[$rounds,#6]
444 orr $s0,$s0,$t3,lsl#24
445 ldrb $t2,[$rounds,#5]
446 ldrb $t3,[$rounds,#4]
447 orr $s1,$s1,$t1,lsl#8
448 ldrb $s2,[$rounds,#11]
449 orr $s1,$s1,$t2,lsl#16
450 ldrb $t1,[$rounds,#10]
451 orr $s1,$s1,$t3,lsl#24
452 ldrb $t2,[$rounds,#9]
453 ldrb $t3,[$rounds,#8]
454 orr $s2,$s2,$t1,lsl#8
455 ldrb $s3,[$rounds,#15]
456 orr $s2,$s2,$t2,lsl#16
457 ldrb $t1,[$rounds,#14]
458 orr $s2,$s2,$t3,lsl#24
459 ldrb $t2,[$rounds,#13]
460 ldrb $t3,[$rounds,#12]
461 orr $s3,$s3,$t1,lsl#8
462 str $s0,[$key],#16
463 orr $s3,$s3,$t2,lsl#16
464 str $s1,[$key,#-12]
465 orr $s3,$s3,$t3,lsl#24
466 str $s2,[$key,#-8]
467 str $s3,[$key,#-4]
468#else
469 ldr $s0,[$rounds,#0]
470 ldr $s1,[$rounds,#4]
471 ldr $s2,[$rounds,#8]
472 ldr $s3,[$rounds,#12]
473#ifdef __ARMEL__
474 rev $s0,$s0
475 rev $s1,$s1
476 rev $s2,$s2
477 rev $s3,$s3
478#endif
479 str $s0,[$key],#16
480 str $s1,[$key,#-12]
481 str $s2,[$key,#-8]
482 str $s3,[$key,#-4]
483#endif
484
485 teq lr,#128
486 bne .Lnot128
487 mov $rounds,#10
488 str $rounds,[$key,#240-16]
489 add $t3,$tbl,#256 @ rcon
490 mov lr,#255
491
492.L128_loop:
493 and $t2,lr,$s3,lsr#24
494 and $i1,lr,$s3,lsr#16
495 ldrb $t2,[$tbl,$t2]
496 and $i2,lr,$s3,lsr#8
497 ldrb $i1,[$tbl,$i1]
498 and $i3,lr,$s3
499 ldrb $i2,[$tbl,$i2]
500 orr $t2,$t2,$i1,lsl#24
501 ldrb $i3,[$tbl,$i3]
502 orr $t2,$t2,$i2,lsl#16
503 ldr $t1,[$t3],#4 @ rcon[i++]
504 orr $t2,$t2,$i3,lsl#8
505 eor $t2,$t2,$t1
506 eor $s0,$s0,$t2 @ rk[4]=rk[0]^...
507 eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4]
508 str $s0,[$key],#16
509 eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
510 str $s1,[$key,#-12]
511 eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
512 str $s2,[$key,#-8]
513 subs $rounds,$rounds,#1
514 str $s3,[$key,#-4]
515 bne .L128_loop
516 sub r2,$key,#176
517 b .Ldone
518
519.Lnot128:
520#if __ARM_ARCH__<7
521 ldrb $i2,[$rounds,#19]
522 ldrb $t1,[$rounds,#18]
523 ldrb $t2,[$rounds,#17]
524 ldrb $t3,[$rounds,#16]
525 orr $i2,$i2,$t1,lsl#8
526 ldrb $i3,[$rounds,#23]
527 orr $i2,$i2,$t2,lsl#16
528 ldrb $t1,[$rounds,#22]
529 orr $i2,$i2,$t3,lsl#24
530 ldrb $t2,[$rounds,#21]
531 ldrb $t3,[$rounds,#20]
532 orr $i3,$i3,$t1,lsl#8
533 orr $i3,$i3,$t2,lsl#16
534 str $i2,[$key],#8
535 orr $i3,$i3,$t3,lsl#24
536 str $i3,[$key,#-4]
537#else
538 ldr $i2,[$rounds,#16]
539 ldr $i3,[$rounds,#20]
540#ifdef __ARMEL__
541 rev $i2,$i2
542 rev $i3,$i3
543#endif
544 str $i2,[$key],#8
545 str $i3,[$key,#-4]
546#endif
547
548 teq lr,#192
549 bne .Lnot192
550 mov $rounds,#12
551 str $rounds,[$key,#240-24]
552 add $t3,$tbl,#256 @ rcon
553 mov lr,#255
554 mov $rounds,#8
555
556.L192_loop:
557 and $t2,lr,$i3,lsr#24
558 and $i1,lr,$i3,lsr#16
559 ldrb $t2,[$tbl,$t2]
560 and $i2,lr,$i3,lsr#8
561 ldrb $i1,[$tbl,$i1]
562 and $i3,lr,$i3
563 ldrb $i2,[$tbl,$i2]
564 orr $t2,$t2,$i1,lsl#24
565 ldrb $i3,[$tbl,$i3]
566 orr $t2,$t2,$i2,lsl#16
567 ldr $t1,[$t3],#4 @ rcon[i++]
568 orr $t2,$t2,$i3,lsl#8
569 eor $i3,$t2,$t1
570 eor $s0,$s0,$i3 @ rk[6]=rk[0]^...
571 eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6]
572 str $s0,[$key],#24
573 eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
574 str $s1,[$key,#-20]
575 eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
576 str $s2,[$key,#-16]
577 subs $rounds,$rounds,#1
578 str $s3,[$key,#-12]
579 subeq r2,$key,#216
580 beq .Ldone
581
582 ldr $i1,[$key,#-32]
583 ldr $i2,[$key,#-28]
584 eor $i1,$i1,$s3 @ rk[10]=rk[4]^rk[9]
585 eor $i3,$i2,$i1 @ rk[11]=rk[5]^rk[10]
586 str $i1,[$key,#-8]
587 str $i3,[$key,#-4]
588 b .L192_loop
589
590.Lnot192:
591#if __ARM_ARCH__<7
592 ldrb $i2,[$rounds,#27]
593 ldrb $t1,[$rounds,#26]
594 ldrb $t2,[$rounds,#25]
595 ldrb $t3,[$rounds,#24]
596 orr $i2,$i2,$t1,lsl#8
597 ldrb $i3,[$rounds,#31]
598 orr $i2,$i2,$t2,lsl#16
599 ldrb $t1,[$rounds,#30]
600 orr $i2,$i2,$t3,lsl#24
601 ldrb $t2,[$rounds,#29]
602 ldrb $t3,[$rounds,#28]
603 orr $i3,$i3,$t1,lsl#8
604 orr $i3,$i3,$t2,lsl#16
605 str $i2,[$key],#8
606 orr $i3,$i3,$t3,lsl#24
607 str $i3,[$key,#-4]
608#else
609 ldr $i2,[$rounds,#24]
610 ldr $i3,[$rounds,#28]
611#ifdef __ARMEL__
612 rev $i2,$i2
613 rev $i3,$i3
614#endif
615 str $i2,[$key],#8
616 str $i3,[$key,#-4]
617#endif
618
619 mov $rounds,#14
620 str $rounds,[$key,#240-32]
621 add $t3,$tbl,#256 @ rcon
622 mov lr,#255
623 mov $rounds,#7
624
625.L256_loop:
626 and $t2,lr,$i3,lsr#24
627 and $i1,lr,$i3,lsr#16
628 ldrb $t2,[$tbl,$t2]
629 and $i2,lr,$i3,lsr#8
630 ldrb $i1,[$tbl,$i1]
631 and $i3,lr,$i3
632 ldrb $i2,[$tbl,$i2]
633 orr $t2,$t2,$i1,lsl#24
634 ldrb $i3,[$tbl,$i3]
635 orr $t2,$t2,$i2,lsl#16
636 ldr $t1,[$t3],#4 @ rcon[i++]
637 orr $t2,$t2,$i3,lsl#8
638 eor $i3,$t2,$t1
639 eor $s0,$s0,$i3 @ rk[8]=rk[0]^...
640 eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8]
641 str $s0,[$key],#32
642 eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
643 str $s1,[$key,#-28]
644 eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
645 str $s2,[$key,#-24]
646 subs $rounds,$rounds,#1
647 str $s3,[$key,#-20]
648 subeq r2,$key,#256
649 beq .Ldone
650
651 and $t2,lr,$s3
652 and $i1,lr,$s3,lsr#8
653 ldrb $t2,[$tbl,$t2]
654 and $i2,lr,$s3,lsr#16
655 ldrb $i1,[$tbl,$i1]
656 and $i3,lr,$s3,lsr#24
657 ldrb $i2,[$tbl,$i2]
658 orr $t2,$t2,$i1,lsl#8
659 ldrb $i3,[$tbl,$i3]
660 orr $t2,$t2,$i2,lsl#16
661 ldr $t1,[$key,#-48]
662 orr $t2,$t2,$i3,lsl#24
663
664 ldr $i1,[$key,#-44]
665 ldr $i2,[$key,#-40]
666 eor $t1,$t1,$t2 @ rk[12]=rk[4]^...
667 ldr $i3,[$key,#-36]
668 eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12]
669 str $t1,[$key,#-16]
670 eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
671 str $i1,[$key,#-12]
672 eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
673 str $i2,[$key,#-8]
674 str $i3,[$key,#-4]
675 b .L256_loop
676
677.Ldone: mov r0,#0
678 ldmia sp!,{r4-r12,lr}
679.Labrt: tst lr,#1
680 moveq pc,lr @ be binary compatible with V4, yet
681 bx lr @ interoperable with Thumb ISA:-)
682.size AES_set_encrypt_key,.-AES_set_encrypt_key
683
684.global AES_set_decrypt_key
685.type AES_set_decrypt_key,%function
686.align 5
687AES_set_decrypt_key:
688 str lr,[sp,#-4]! @ push lr
689 bl _armv4_AES_set_encrypt_key
690 teq r0,#0
691 ldrne lr,[sp],#4 @ pop lr
692 bne .Labrt
693
694 stmdb sp!,{r4-r12}
695
696 ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
697 mov $key,r2 @ which is AES_KEY *key
698 mov $i1,r2
699 add $i2,r2,$rounds,lsl#4
700
701.Linv: ldr $s0,[$i1]
702 ldr $s1,[$i1,#4]
703 ldr $s2,[$i1,#8]
704 ldr $s3,[$i1,#12]
705 ldr $t1,[$i2]
706 ldr $t2,[$i2,#4]
707 ldr $t3,[$i2,#8]
708 ldr $i3,[$i2,#12]
709 str $s0,[$i2],#-16
710 str $s1,[$i2,#16+4]
711 str $s2,[$i2,#16+8]
712 str $s3,[$i2,#16+12]
713 str $t1,[$i1],#16
714 str $t2,[$i1,#-12]
715 str $t3,[$i1,#-8]
716 str $i3,[$i1,#-4]
717 teq $i1,$i2
718 bne .Linv
719___
720$mask80=$i1;
721$mask1b=$i2;
722$mask7f=$i3;
723$code.=<<___;
724 ldr $s0,[$key,#16]! @ prefetch tp1
725 mov $mask80,#0x80
726 mov $mask1b,#0x1b
727 orr $mask80,$mask80,#0x8000
728 orr $mask1b,$mask1b,#0x1b00
729 orr $mask80,$mask80,$mask80,lsl#16
730 orr $mask1b,$mask1b,$mask1b,lsl#16
731 sub $rounds,$rounds,#1
732 mvn $mask7f,$mask80
733 mov $rounds,$rounds,lsl#2 @ (rounds-1)*4
734
735.Lmix: and $t1,$s0,$mask80
736 and $s1,$s0,$mask7f
737 sub $t1,$t1,$t1,lsr#7
738 and $t1,$t1,$mask1b
739 eor $s1,$t1,$s1,lsl#1 @ tp2
740
741 and $t1,$s1,$mask80
742 and $s2,$s1,$mask7f
743 sub $t1,$t1,$t1,lsr#7
744 and $t1,$t1,$mask1b
745 eor $s2,$t1,$s2,lsl#1 @ tp4
746
747 and $t1,$s2,$mask80
748 and $s3,$s2,$mask7f
749 sub $t1,$t1,$t1,lsr#7
750 and $t1,$t1,$mask1b
751 eor $s3,$t1,$s3,lsl#1 @ tp8
752
753 eor $t1,$s1,$s2
754 eor $t2,$s0,$s3 @ tp9
755 eor $t1,$t1,$s3 @ tpe
756 eor $t1,$t1,$s1,ror#24
757 eor $t1,$t1,$t2,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8)
758 eor $t1,$t1,$s2,ror#16
759 eor $t1,$t1,$t2,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16)
760 eor $t1,$t1,$t2,ror#8 @ ^= ROTATE(tp9,24)
761
762 ldr $s0,[$key,#4] @ prefetch tp1
763 str $t1,[$key],#4
764 subs $rounds,$rounds,#1
765 bne .Lmix
766
767 mov r0,#0
768#if __ARM_ARCH__>=5
769 ldmia sp!,{r4-r12,pc}
770#else
771 ldmia sp!,{r4-r12,lr}
772 tst lr,#1
773 moveq pc,lr @ be binary compatible with V4, yet
774 bx lr @ interoperable with Thumb ISA:-)
775#endif
776.size AES_set_decrypt_key,.-AES_set_decrypt_key
777
778.type AES_Td,%object
779.align 5
780AES_Td:
781.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
782.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
783.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
784.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
785.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
786.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
787.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
788.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
789.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
790.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
791.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
792.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
793.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
794.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
795.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
796.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
797.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
798.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
799.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
800.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
801.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
802.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
803.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
804.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
805.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
806.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
807.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
808.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
809.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
810.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
811.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
812.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
813.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
814.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
815.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
816.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
817.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
818.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
819.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
820.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
821.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
822.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
823.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
824.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
825.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
826.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
827.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
828.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
829.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
830.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
831.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
832.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
833.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
834.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
835.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
836.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
837.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
838.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
839.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
840.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
841.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
842.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
843.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
844.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
845@ Td4[256]
846.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
847.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
848.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
849.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
850.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
851.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
852.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
853.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
854.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
855.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
856.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
857.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
858.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
859.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
860.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
861.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
862.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
863.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
864.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
865.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
866.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
867.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
868.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
869.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
870.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
871.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
872.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
873.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
874.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
875.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
876.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
877.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
878.size AES_Td,.-AES_Td
879
880@ void AES_decrypt(const unsigned char *in, unsigned char *out,
881@ const AES_KEY *key) {
882.global AES_decrypt
883.type AES_decrypt,%function
884.align 5
885AES_decrypt:
886 sub r3,pc,#8 @ AES_decrypt
887 stmdb sp!,{r1,r4-r12,lr}
888 mov $rounds,r0 @ inp
889 mov $key,r2
890 sub $tbl,r3,#AES_decrypt-AES_Td @ Td
891#if __ARM_ARCH__<7
892 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
893 ldrb $t1,[$rounds,#2] @ manner...
894 ldrb $t2,[$rounds,#1]
895 ldrb $t3,[$rounds,#0]
896 orr $s0,$s0,$t1,lsl#8
897 ldrb $s1,[$rounds,#7]
898 orr $s0,$s0,$t2,lsl#16
899 ldrb $t1,[$rounds,#6]
900 orr $s0,$s0,$t3,lsl#24
901 ldrb $t2,[$rounds,#5]
902 ldrb $t3,[$rounds,#4]
903 orr $s1,$s1,$t1,lsl#8
904 ldrb $s2,[$rounds,#11]
905 orr $s1,$s1,$t2,lsl#16
906 ldrb $t1,[$rounds,#10]
907 orr $s1,$s1,$t3,lsl#24
908 ldrb $t2,[$rounds,#9]
909 ldrb $t3,[$rounds,#8]
910 orr $s2,$s2,$t1,lsl#8
911 ldrb $s3,[$rounds,#15]
912 orr $s2,$s2,$t2,lsl#16
913 ldrb $t1,[$rounds,#14]
914 orr $s2,$s2,$t3,lsl#24
915 ldrb $t2,[$rounds,#13]
916 ldrb $t3,[$rounds,#12]
917 orr $s3,$s3,$t1,lsl#8
918 orr $s3,$s3,$t2,lsl#16
919 orr $s3,$s3,$t3,lsl#24
920#else
921 ldr $s0,[$rounds,#0]
922 ldr $s1,[$rounds,#4]
923 ldr $s2,[$rounds,#8]
924 ldr $s3,[$rounds,#12]
925#ifdef __ARMEL__
926 rev $s0,$s0
927 rev $s1,$s1
928 rev $s2,$s2
929 rev $s3,$s3
930#endif
931#endif
932 bl _armv4_AES_decrypt
933
934 ldr $rounds,[sp],#4 @ pop out
935#if __ARM_ARCH__>=7
936#ifdef __ARMEL__
937 rev $s0,$s0
938 rev $s1,$s1
939 rev $s2,$s2
940 rev $s3,$s3
941#endif
942 str $s0,[$rounds,#0]
943 str $s1,[$rounds,#4]
944 str $s2,[$rounds,#8]
945 str $s3,[$rounds,#12]
946#else
947 mov $t1,$s0,lsr#24 @ write output in endian-neutral
948 mov $t2,$s0,lsr#16 @ manner...
949 mov $t3,$s0,lsr#8
950 strb $t1,[$rounds,#0]
951 strb $t2,[$rounds,#1]
952 mov $t1,$s1,lsr#24
953 strb $t3,[$rounds,#2]
954 mov $t2,$s1,lsr#16
955 strb $s0,[$rounds,#3]
956 mov $t3,$s1,lsr#8
957 strb $t1,[$rounds,#4]
958 strb $t2,[$rounds,#5]
959 mov $t1,$s2,lsr#24
960 strb $t3,[$rounds,#6]
961 mov $t2,$s2,lsr#16
962 strb $s1,[$rounds,#7]
963 mov $t3,$s2,lsr#8
964 strb $t1,[$rounds,#8]
965 strb $t2,[$rounds,#9]
966 mov $t1,$s3,lsr#24
967 strb $t3,[$rounds,#10]
968 mov $t2,$s3,lsr#16
969 strb $s2,[$rounds,#11]
970 mov $t3,$s3,lsr#8
971 strb $t1,[$rounds,#12]
972 strb $t2,[$rounds,#13]
973 strb $t3,[$rounds,#14]
974 strb $s3,[$rounds,#15]
975#endif
976#if __ARM_ARCH__>=5
977 ldmia sp!,{r4-r12,pc}
978#else
979 ldmia sp!,{r4-r12,lr}
980 tst lr,#1
981 moveq pc,lr @ be binary compatible with V4, yet
982 bx lr @ interoperable with Thumb ISA:-)
983#endif
984.size AES_decrypt,.-AES_decrypt
985
986.type _armv4_AES_decrypt,%function
987.align 2
988_armv4_AES_decrypt:
989 str lr,[sp,#-4]! @ push lr
990 ldmia $key!,{$t1-$i1}
991 eor $s0,$s0,$t1
992 ldr $rounds,[$key,#240-16]
993 eor $s1,$s1,$t2
994 eor $s2,$s2,$t3
995 eor $s3,$s3,$i1
996 sub $rounds,$rounds,#1
997 mov lr,#255
998
999 and $i1,lr,$s0,lsr#16
1000 and $i2,lr,$s0,lsr#8
1001 and $i3,lr,$s0
1002 mov $s0,$s0,lsr#24
1003.Ldec_loop:
1004 ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16]
1005 and $i1,lr,$s1 @ i0
1006 ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
1007 and $i2,lr,$s1,lsr#16
1008 ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
1009 and $i3,lr,$s1,lsr#8
1010 ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
1011 mov $s1,$s1,lsr#24
1012
1013 ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0]
1014 ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16]
1015 ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8]
1016 eor $s0,$s0,$i1,ror#24
1017 ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
1018 and $i1,lr,$s2,lsr#8 @ i0
1019 eor $t2,$i2,$t2,ror#8
1020 and $i2,lr,$s2 @ i1
1021 eor $t3,$i3,$t3,ror#8
1022 and $i3,lr,$s2,lsr#16
1023 ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
1024 eor $s1,$s1,$t1,ror#8
1025 ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
1026 mov $s2,$s2,lsr#24
1027
1028 ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
1029 eor $s0,$s0,$i1,ror#16
1030 ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
1031 and $i1,lr,$s3,lsr#16 @ i0
1032 eor $s1,$s1,$i2,ror#24
1033 and $i2,lr,$s3,lsr#8 @ i1
1034 eor $t3,$i3,$t3,ror#8
1035 and $i3,lr,$s3 @ i2
1036 ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
1037 eor $s2,$s2,$t2,ror#8
1038 ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
1039 mov $s3,$s3,lsr#24
1040
1041 ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
1042 eor $s0,$s0,$i1,ror#8
1043 ldr $i1,[$key],#16
1044 eor $s1,$s1,$i2,ror#16
1045 ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
1046 eor $s2,$s2,$i3,ror#24
1047
1048 ldr $t1,[$key,#-12]
1049 eor $s0,$s0,$i1
1050 ldr $t2,[$key,#-8]
1051 eor $s3,$s3,$t3,ror#8
1052 ldr $t3,[$key,#-4]
1053 and $i1,lr,$s0,lsr#16
1054 eor $s1,$s1,$t1
1055 and $i2,lr,$s0,lsr#8
1056 eor $s2,$s2,$t2
1057 and $i3,lr,$s0
1058 eor $s3,$s3,$t3
1059 mov $s0,$s0,lsr#24
1060
1061 subs $rounds,$rounds,#1
1062 bne .Ldec_loop
1063
1064 add $tbl,$tbl,#1024
1065
1066 ldr $t2,[$tbl,#0] @ prefetch Td4
1067 ldr $t3,[$tbl,#32]
1068 ldr $t1,[$tbl,#64]
1069 ldr $t2,[$tbl,#96]
1070 ldr $t3,[$tbl,#128]
1071 ldr $t1,[$tbl,#160]
1072 ldr $t2,[$tbl,#192]
1073 ldr $t3,[$tbl,#224]
1074
1075 ldrb $s0,[$tbl,$s0] @ Td4[s0>>24]
1076 ldrb $t1,[$tbl,$i1] @ Td4[s0>>16]
1077 and $i1,lr,$s1 @ i0
1078 ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
1079 and $i2,lr,$s1,lsr#16
1080 ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
1081 and $i3,lr,$s1,lsr#8
1082
1083 ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
1084 ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
1085 ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
1086 eor $s0,$i1,$s0,lsl#24
1087 ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
1088 eor $s1,$t1,$s1,lsl#8
1089 and $i1,lr,$s2,lsr#8 @ i0
1090 eor $t2,$t2,$i2,lsl#8
1091 and $i2,lr,$s2 @ i1
1092 ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
1093 eor $t3,$t3,$i3,lsl#8
1094 ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
1095 and $i3,lr,$s2,lsr#16
1096
1097 ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
1098 eor $s0,$s0,$i1,lsl#8
1099 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
1100 eor $s1,$i2,$s1,lsl#16
1101 and $i1,lr,$s3,lsr#16 @ i0
1102 eor $s2,$t2,$s2,lsl#16
1103 and $i2,lr,$s3,lsr#8 @ i1
1104 ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
1105 eor $t3,$t3,$i3,lsl#16
1106 ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
1107 and $i3,lr,$s3 @ i2
1108
1109 ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
1110 ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
1111 eor $s0,$s0,$i1,lsl#16
1112 ldr $i1,[$key,#0]
1113 eor $s1,$s1,$i2,lsl#8
1114 ldr $t1,[$key,#4]
1115 eor $s2,$i3,$s2,lsl#8
1116 ldr $t2,[$key,#8]
1117 eor $s3,$t3,$s3,lsl#24
1118 ldr $t3,[$key,#12]
1119
1120 eor $s0,$s0,$i1
1121 eor $s1,$s1,$t1
1122 eor $s2,$s2,$t2
1123 eor $s3,$s3,$t3
1124
1125 sub $tbl,$tbl,#1024
1126 ldr pc,[sp],#4 @ pop and return
1127.size _armv4_AES_decrypt,.-_armv4_AES_decrypt
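_armv4_AES_decrypt above carries only a single 1KB table in $tbl and lets the barrel shifter synthesize the three rotated companion tables: every "eor ...,ror#8/16/24" folds in the looked-up word pre-rotated at no extra cost. A minimal C model of one output column under that scheme; Td0 and the helper names are illustrative, not this file's own code:

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, unsigned n)   /* n in 1..31 */
    {
        return (x >> n) | (x << (32 - n));
    }

    /* one decrypt column from a single 256-entry table plus rotates */
    static uint32_t dec_column(const uint32_t Td0[256], uint32_t s0,
                               uint32_t s1, uint32_t s2, uint32_t s3)
    {
        return Td0[s0 >> 24]
             ^ ror32(Td0[(s3 >> 16) & 0xff],  8)   /* Td1 = Td0 ror 8  */
             ^ ror32(Td0[(s2 >>  8) & 0xff], 16)   /* Td2 = Td0 ror 16 */
             ^ ror32(Td0[ s1        & 0xff], 24);  /* Td3 = Td0 ror 24 */
    }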
1128.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
1129.align 2
1130___
1131
1132 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# 0xe12fff1e is the encoding of "bx lr"; emitting it as .word makes it possible to compile with -march=armv4
1133print $code;
1134close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/aes/asm/aes-ia64.S b/src/lib/libcrypto/aes/asm/aes-ia64.S
deleted file mode 100644
index 7f6c4c3662..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-ia64.S
+++ /dev/null
@@ -1,1123 +0,0 @@
1// ====================================================================
2// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
3// project. Rights for redistribution and usage in source and binary
4// forms are granted according to the OpenSSL license.
5// ====================================================================
6//
7// What's wrong with compiler-generated code? The compiler never uses
8// the variable 'shr', which is pairable with 'extr'/'dep' instructions.
9// Then it uses 'zxt', which is an I-type, but can be replaced with
10// 'and', which in turn can be assigned to an M-port [there are twice
11// as many M-ports as I-ports on Itanium 2]. By sacrificing a few
12// registers for small constants (255, 24 and 16) to be used with
13// 'shr' and 'and' instructions I can achieve better ILP (Instruction
14// Level Parallelism) and performance. This code outperforms GCC 3.3
15// generated code by over a factor of 2 (two), GCC 3.4 by 70% and
16// HP C by 40%. Measured in the best-case scenario, i.e. aligned
17// big-endian input, ECB timing on Itanium 2 is (18 + 13*rounds)
18// ticks per block, or 9.25 CPU cycles per byte for a 128-bit key.
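For a 128-bit key AES runs 10 rounds, so the formula above gives 18 + 13*10 = 148 ticks per 16-byte block, i.e. the quoted 148/16 = 9.25 cycles per byte. A minimal C model of the shift-and-mask extraction the comment argues for, with the small constants held in variables the way the code pins them in registers (helper and names are illustrative):

    #include <stdint.h>

    static inline uint32_t byte_at(uint32_t w, uint32_t shift, uint32_t maskff)
    {
        /* one shift plus one AND per extracted byte, instead of extr/zxt;
         * the constants (255, 24, 16) stay in registers across the loop */
        return (w >> shift) & maskff;
    }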
19
20// Version 1.2 mitigates the hazard of cache-timing attacks by
21// a) compressing the S-boxes from 8KB to 2KB+256B, b) scheduling
22// references to the S-boxes for L2 cache latency, c) prefetching
23// T[ed]4 prior to the last round. As a result performance dropped to
24// (26 + 15*rounds) ticks per block or 11 cycles per byte with a
25// 128-bit key. This is ~16% deterioration. For reference, the Itanium
26// 2 L1 cache has a 64-byte line size and the L2 a 128-byte one...
27
28.ident "aes-ia64.S, version 1.2"
29.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
30.explicit
31.text
32
33rk0=r8; rk1=r9;
34
35pfssave=r2;
36lcsave=r10;
37prsave=r3;
38maskff=r11;
39twenty4=r14;
40sixteen=r15;
41
42te00=r16; te11=r17; te22=r18; te33=r19;
43te01=r20; te12=r21; te23=r22; te30=r23;
44te02=r24; te13=r25; te20=r26; te31=r27;
45te03=r28; te10=r29; te21=r30; te32=r31;
46
47// these are rotating...
48t0=r32; s0=r33;
49t1=r34; s1=r35;
50t2=r36; s2=r37;
51t3=r38; s3=r39;
52
53te0=r40; te1=r41; te2=r42; te3=r43;
54
55#if defined(_HPUX_SOURCE) && !defined(_LP64)
56# define ADDP addp4
57#else
58# define ADDP add
59#endif
60
61// Offsets from Te0
62#define TE0 0
63#define TE2 2
64#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
65#define TE1 3
66#define TE3 1
67#else
68#define TE1 1
69#define TE3 3
70#endif
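The TE1/TE3 swap above exists because every entry of AES_Te/AES_Td further down stores the same 32-bit word twice (the "data4 X,X" pairs): a 4-byte load at byte offset k into the 8-byte entry returns the word rotated by 8*k bits, so one 2KB table replaces four rotated 1KB copies, and the right offsets flip with endianness. A hedged C model, assuming a little-endian host:

    #include <stdint.h>
    #include <string.h>

    /* te: 256 entries of 8 bytes, each holding the same word twice;
     * k: byte offset into the entry (the TE0..TE3 values above) */
    static uint32_t te_lookup(const uint8_t *te, unsigned idx, unsigned k)
    {
        uint32_t w;
        memcpy(&w, te + 8 * idx + k, 4);  /* unaligned 4-byte load */
        return w;  /* little-endian: entry rotated right by 8*k bits */
    }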
71
72// This implies that AES_KEY comprises 32-bit key schedule elements
73// even on LP64 platforms.
74#ifndef KSZ
75# define KSZ 4
76# define LDKEY ld4
77#endif
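KSZ*60 is used below as the offset of AES_KEY->rounds; it follows from the libcrypto AES_KEY layout of 60 32-bit round-key words followed by the round count, which puts "rounds" at byte offset 240 on both ILP32 and LP64. A sketch of that layout, mirroring the aes.h definition:

    #define AES_MAXNR 14

    struct aes_key_st {
        unsigned int rd_key[4 * (AES_MAXNR + 1)]; /* 60 words of schedule */
        int rounds;                               /* byte offset 4*60 == 240 */
    };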
78
79.proc _ia64_AES_encrypt#
80// Input: rk0-rk1
81// te0
82// te3 as AES_KEY->rounds!!!
83// s0-s3
84// maskff,twenty4,sixteen
85// Output: r16,r20,r24,r28 as s0-s3
86// Clobber: r16-r31,rk0-rk1,r32-r43
87.align 32
88_ia64_AES_encrypt:
89 .prologue
90 .altrp b6
91 .body
92{ .mmi; alloc r16=ar.pfs,12,0,0,8
93 LDKEY t0=[rk0],2*KSZ
94 mov pr.rot=1<<16 }
95{ .mmi; LDKEY t1=[rk1],2*KSZ
96 add te1=TE1,te0
97 add te3=-3,te3 };;
98{ .mib; LDKEY t2=[rk0],2*KSZ
99 mov ar.ec=2 }
100{ .mib; LDKEY t3=[rk1],2*KSZ
101 add te2=TE2,te0
102 brp.loop.imp .Le_top,.Le_end-16 };;
103
104{ .mmi; xor s0=s0,t0
105 xor s1=s1,t1
106 mov ar.lc=te3 }
107{ .mmi; xor s2=s2,t2
108 xor s3=s3,t3
109 add te3=TE3,te0 };;
110
111.align 32
112.Le_top:
113{ .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
114 (p0) and te33=s3,maskff // 0/0:s3&0xff
115 (p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
116{ .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
117 (p0) and te30=s0,maskff // 0/1:s0&0xff
118 (p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24
119{ .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
120 (p0) shladd te33=te33,3,te3 // 1/0:te0+s0>>24
121 (p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
122{ .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
123 (p0) shladd te30=te30,3,te3 // 1/1:te3+s0
124 (p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24
125{ .mmi; (p0) ld4 te33=[te33] // 2/0:te3[s3&0xff]
126 (p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff
127 (p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
128{ .mmi; (p0) ld4 te30=[te30] // 2/1:te3[s0]
129 (p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8
130 (p0) shr.u te02=s2,twenty4 };; // 2/2:s2>>24
131{ .mmi; (p0) ld4 te22=[te22] // 3/0:te2[s2>>8]
132 (p0) shladd te20=te20,3,te2 // 3/2:te2+s0>>8
133 (p0) extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
134{ .mmi; (p0) ld4 te23=[te23] // 3/1:te2[s3>>8]
135 (p0) shladd te00=te00,3,te0 // 3/0:te0+s0>>24
136 (p0) shr.u te03=s3,twenty4 };; // 3/3:s3>>24
137{ .mmi; (p0) ld4 te20=[te20] // 4/2:te2[s0>>8]
138 (p0) shladd te21=te21,3,te2 // 4/3:te3+s2
139 (p0) extr.u te11=s1,16,8 } // 4/0:s1>>16&0xff
140{ .mmi; (p0) ld4 te00=[te00] // 4/0:te0[s0>>24]
141 (p0) shladd te01=te01,3,te0 // 4/1:te0+s1>>24
142 (p0) shr.u te13=s3,sixteen };; // 4/2:s3>>16
143{ .mmi; (p0) ld4 te21=[te21] // 5/3:te2[s1>>8]
144 (p0) shladd te11=te11,3,te1 // 5/0:te1+s1>>16
145 (p0) extr.u te12=s2,16,8 } // 5/1:s2>>16&0xff
146{ .mmi; (p0) ld4 te01=[te01] // 5/1:te0[s1>>24]
147 (p0) shladd te02=te02,3,te0 // 5/2:te0+s2>>24
148 (p0) and te31=s1,maskff };; // 5/2:s1&0xff
149{ .mmi; (p0) ld4 te11=[te11] // 6/0:te1[s1>>16]
150 (p0) shladd te12=te12,3,te1 // 6/1:te1+s2>>16
151 (p0) extr.u te10=s0,16,8 } // 6/3:s0>>16&0xff
152{ .mmi; (p0) ld4 te02=[te02] // 6/2:te0[s2>>24]
153 (p0) shladd te03=te03,3,te0 // 6/3:te1+s0>>16
154 (p0) and te32=s2,maskff };; // 6/3:s2&0xff
155
156{ .mmi; (p0) ld4 te12=[te12] // 7/1:te1[s2>>16]
157 (p0) shladd te31=te31,3,te3 // 7/2:te3+s1&0xff
158 (p0) and te13=te13,maskff} // 7/2:s3>>16&0xff
159{ .mmi; (p0) ld4 te03=[te03] // 7/3:te0[s3>>24]
160 (p0) shladd te32=te32,3,te3 // 7/3:te3+s2
161 (p0) xor t0=t0,te33 };; // 7/0:
162{ .mmi; (p0) ld4 te31=[te31] // 8/2:te3[s1]
163 (p0) shladd te13=te13,3,te1 // 8/2:te1+s3>>16
164 (p0) xor t0=t0,te22 } // 8/0:
165{ .mmi; (p0) ld4 te32=[te32] // 8/3:te3[s2]
166 (p0) shladd te10=te10,3,te1 // 8/3:te1+s0>>16
167 (p0) xor t1=t1,te30 };; // 8/1:
168{ .mmi; (p0) ld4 te13=[te13] // 9/2:te1[s3>>16]
169 (p0) ld4 te10=[te10] // 9/3:te1[s0>>16]
170 (p0) xor t0=t0,te00 };; // 9/0: !L2 scheduling
171{ .mmi; (p0) xor t1=t1,te23 // 10[9]/1:
172 (p0) xor t2=t2,te20 // 10[9]/2:
173 (p0) xor t3=t3,te21 };; // 10[9]/3:
174{ .mmi; (p0) xor t0=t0,te11 // 11[10]/0:done!
175 (p0) xor t1=t1,te01 // 11[10]/1:
176 (p0) xor t2=t2,te02 };; // 11[10]/2: !L2 scheduling
177{ .mmi; (p0) xor t3=t3,te03 // 12[10]/3:
178 (p16) cmp.eq p0,p17=r0,r0 };; // 12[10]/clear (p17)
179{ .mmi; (p0) xor t1=t1,te12 // 13[11]/1:done!
180 (p0) xor t2=t2,te31 // 13[11]/2:
181 (p0) xor t3=t3,te32 } // 13[11]/3:
182{ .mmi; (p17) add te0=2048,te0 // 13[11]/
183 (p17) add te1=2048+64-TE1,te1};; // 13[11]/
184{ .mib; (p0) xor t2=t2,te13 // 14[12]/2:done!
185 (p17) add te2=2048+128-TE2,te2} // 14[12]/
186{ .mib; (p0) xor t3=t3,te10 // 14[12]/3:done!
187 (p17) add te3=2048+192-TE3,te3 // 14[12]/
188 br.ctop.sptk .Le_top };;
189.Le_end:
190
191
192{ .mmi;	ld8	te12=[te0]		// prefetch Te4: te0..te3 now point
193	ld8	te31=[te1]	}		// 2048,+64,+128,+192 past Te0, so
194{ .mmi;	ld8	te10=[te2]		// one ld8 per 64-byte L1 line covers
195	ld8	te32=[te3]	}		// the whole 256-byte Te4
196
197{ .mmi; LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
198 and te33=s3,maskff // 0/0:s3&0xff
199 extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
200{ .mmi; LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
201 and te30=s0,maskff // 0/1:s0&0xff
202 shr.u te00=s0,twenty4 };; // 0/0:s0>>24
203{ .mmi; LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
204 add te33=te33,te0 // 1/0:te0+s0>>24
205 extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
206{ .mmi; LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
207 add te30=te30,te0 // 1/1:te0+s0
208 shr.u te01=s1,twenty4 };; // 1/1:s1>>24
209{ .mmi; ld1 te33=[te33] // 2/0:te0[s3&0xff]
210 add te22=te22,te0 // 2/0:te0+s2>>8&0xff
211 extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
212{ .mmi; ld1 te30=[te30] // 2/1:te0[s0]
213 add te23=te23,te0 // 2/1:te0+s3>>8
214 shr.u te02=s2,twenty4 };; // 2/2:s2>>24
215{ .mmi; ld1 te22=[te22] // 3/0:te0[s2>>8]
216 add te20=te20,te0 // 3/2:te0+s0>>8
217 extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
218{ .mmi; ld1 te23=[te23] // 3/1:te0[s3>>8]
219 add te00=te00,te0 // 3/0:te0+s0>>24
220 shr.u te03=s3,twenty4 };; // 3/3:s3>>24
221{ .mmi; ld1 te20=[te20] // 4/2:te0[s0>>8]
222 add te21=te21,te0 // 4/3:te0+s2
223 extr.u te11=s1,16,8 } // 4/0:s1>>16&0xff
224{ .mmi; ld1 te00=[te00] // 4/0:te0[s0>>24]
225 add te01=te01,te0 // 4/1:te0+s1>>24
226 shr.u te13=s3,sixteen };; // 4/2:s3>>16
227{ .mmi; ld1 te21=[te21] // 5/3:te0[s1>>8]
228 add te11=te11,te0 // 5/0:te0+s1>>16
229 extr.u te12=s2,16,8 } // 5/1:s2>>16&0xff
230{ .mmi; ld1 te01=[te01] // 5/1:te0[s1>>24]
231 add te02=te02,te0 // 5/2:te0+s2>>24
232 and te31=s1,maskff };; // 5/2:s1&0xff
233{ .mmi; ld1 te11=[te11] // 6/0:te0[s1>>16]
234 add te12=te12,te0 // 6/1:te0+s2>>16
235 extr.u te10=s0,16,8 } // 6/3:s0>>16&0xff
236{ .mmi; ld1 te02=[te02] // 6/2:te0[s2>>24]
237 add te03=te03,te0 // 6/3:te0+s0>>16
238 and te32=s2,maskff };; // 6/3:s2&0xff
239
240{ .mmi; ld1 te12=[te12] // 7/1:te0[s2>>16]
241 add te31=te31,te0 // 7/2:te0+s1&0xff
242 dep te33=te22,te33,8,8} // 7/0:
243{ .mmi; ld1 te03=[te03] // 7/3:te0[s3>>24]
244 add te32=te32,te0 // 7/3:te0+s2
245 and te13=te13,maskff};; // 7/2:s3>>16&0xff
246{ .mmi; ld1 te31=[te31] // 8/2:te0[s1]
247 add te13=te13,te0 // 8/2:te0+s3>>16
248 dep te30=te23,te30,8,8} // 8/1:
249{ .mmi; ld1 te32=[te32] // 8/3:te0[s2]
250 add te10=te10,te0 // 8/3:te0+s0>>16
251 shl te00=te00,twenty4};; // 8/0:
252{ .mii; ld1 te13=[te13] // 9/2:te0[s3>>16]
253 dep te33=te11,te33,16,8 // 9/0:
254 shl te01=te01,twenty4};; // 9/1:
255{ .mii; ld1 te10=[te10] // 10/3:te0[s0>>16]
256 dep te31=te20,te31,8,8 // 10/2:
257 shl te02=te02,twenty4};; // 10/2:
258{ .mii; xor t0=t0,te33 // 11/0:
259 dep te32=te21,te32,8,8 // 11/3:
260 shl te12=te12,sixteen};; // 11/1:
261{ .mii; xor r16=t0,te00 // 12/0:done!
262 dep te31=te13,te31,16,8 // 12/2:
263 shl te03=te03,twenty4};; // 12/3:
264{ .mmi; xor t1=t1,te01 // 13/1:
265 xor t2=t2,te02 // 13/2:
266 dep te32=te10,te32,16,8};; // 13/3:
267{ .mmi; xor t1=t1,te30 // 14/1:
268 xor r24=t2,te31 // 14/2:done!
269 xor t3=t3,te32 };; // 14/3:
270{ .mib; xor r20=t1,te12 // 15/1:done!
271 xor r28=t3,te03 // 15/3:done!
272 br.ret.sptk b6 };;
273.endp _ia64_AES_encrypt#
274
275// void AES_encrypt (const void *in,void *out,const AES_KEY *key);
276.global AES_encrypt#
277.proc AES_encrypt#
278.align 32
279AES_encrypt:
280 .prologue
281 .save ar.pfs,pfssave
282{ .mmi; alloc pfssave=ar.pfs,3,1,12,0
283 and out0=3,in0
284 mov r3=ip }
285{ .mmi; ADDP in0=0,in0
286 mov loc0=psr.um
287 ADDP out11=KSZ*60,in2 };; // &AES_KEY->rounds
288
289{ .mmi; ld4 out11=[out11] // AES_KEY->rounds
290 add out8=(AES_Te#-AES_encrypt#),r3 // Te0
291 .save pr,prsave
292 mov prsave=pr }
293{ .mmi; rum 1<<3 // clear um.ac
294 .save ar.lc,lcsave
295 mov lcsave=ar.lc };;
296
297 .body
298#if defined(_HPUX_SOURCE) // HPUX is big-endian, cut 15+15 cycles...
299{ .mib; cmp.ne p6,p0=out0,r0
300 add out0=4,in0
301(p6) br.dpnt.many .Le_i_unaligned };;
302
303{ .mmi; ld4 out1=[in0],8 // s0
304 and out9=3,in1
305 mov twenty4=24 }
306{ .mmi; ld4 out3=[out0],8 // s1
307 ADDP rk0=0,in2
308 mov sixteen=16 };;
309{ .mmi; ld4 out5=[in0] // s2
310 cmp.ne p6,p0=out9,r0
311 mov maskff=0xff }
312{ .mmb; ld4 out7=[out0] // s3
313 ADDP rk1=KSZ,in2
314 br.call.sptk.many b6=_ia64_AES_encrypt };;
315
316{ .mib; ADDP in0=4,in1
317 ADDP in1=0,in1
318(p6) br.spnt .Le_o_unaligned };;
319
320{ .mii; mov psr.um=loc0
321 mov ar.pfs=pfssave
322 mov ar.lc=lcsave };;
323{ .mmi; st4 [in1]=r16,8 // s0
324 st4 [in0]=r20,8 // s1
325 mov pr=prsave,0x1ffff };;
326{ .mmb; st4 [in1]=r24 // s2
327 st4 [in0]=r28 // s3
328 br.ret.sptk.many b0 };;
329#endif
330
331.align 32
332.Le_i_unaligned:
333{ .mmi; add out0=1,in0
334 add out2=2,in0
335 add out4=3,in0 };;
336{ .mmi; ld1 r16=[in0],4
337 ld1 r17=[out0],4 }//;;
338{ .mmi; ld1 r18=[out2],4
339 ld1 out1=[out4],4 };; // s0
340{ .mmi; ld1 r20=[in0],4
341 ld1 r21=[out0],4 }//;;
342{ .mmi; ld1 r22=[out2],4
343 ld1 out3=[out4],4 };; // s1
344{ .mmi; ld1 r24=[in0],4
345 ld1 r25=[out0],4 }//;;
346{ .mmi; ld1 r26=[out2],4
347 ld1 out5=[out4],4 };; // s2
348{ .mmi; ld1 r28=[in0]
349 ld1 r29=[out0] }//;;
350{ .mmi; ld1 r30=[out2]
351 ld1 out7=[out4] };; // s3
352
353{ .mii;
354 dep out1=r16,out1,24,8 //;;
355 dep out3=r20,out3,24,8 }//;;
356{ .mii; ADDP rk0=0,in2
357 dep out5=r24,out5,24,8 //;;
358 dep out7=r28,out7,24,8 };;
359{ .mii; ADDP rk1=KSZ,in2
360 dep out1=r17,out1,16,8 //;;
361 dep out3=r21,out3,16,8 }//;;
362{ .mii; mov twenty4=24
363 dep out5=r25,out5,16,8 //;;
364 dep out7=r29,out7,16,8 };;
365{ .mii; mov sixteen=16
366 dep out1=r18,out1,8,8 //;;
367 dep out3=r22,out3,8,8 }//;;
368{ .mii; mov maskff=0xff
369 dep out5=r26,out5,8,8 //;;
370 dep out7=r30,out7,8,8 };;
371
372{ .mib; br.call.sptk.many b6=_ia64_AES_encrypt };;
373
374.Le_o_unaligned:
375{ .mii; ADDP out0=0,in1
376 extr.u r17=r16,8,8 // s0
377 shr.u r19=r16,twenty4 }//;;
378{ .mii; ADDP out1=1,in1
379 extr.u r18=r16,16,8
380 shr.u r23=r20,twenty4 }//;; // s1
381{ .mii; ADDP out2=2,in1
382 extr.u r21=r20,8,8
383 shr.u r22=r20,sixteen }//;;
384{ .mii; ADDP out3=3,in1
385 extr.u r25=r24,8,8 // s2
386 shr.u r27=r24,twenty4 };;
387{ .mii; st1 [out3]=r16,4
388 extr.u r26=r24,16,8
389 shr.u r31=r28,twenty4 }//;; // s3
390{ .mii; st1 [out2]=r17,4
391 extr.u r29=r28,8,8
392 shr.u r30=r28,sixteen }//;;
393
394{ .mmi; st1 [out1]=r18,4
395 st1 [out0]=r19,4 };;
396{ .mmi; st1 [out3]=r20,4
397 st1 [out2]=r21,4 }//;;
398{ .mmi; st1 [out1]=r22,4
399 st1 [out0]=r23,4 };;
400{ .mmi; st1 [out3]=r24,4
401 st1 [out2]=r25,4
402 mov pr=prsave,0x1ffff }//;;
403{ .mmi; st1 [out1]=r26,4
404 st1 [out0]=r27,4
405 mov ar.pfs=pfssave };;
406{ .mmi; st1 [out3]=r28
407 st1 [out2]=r29
408 mov ar.lc=lcsave }//;;
409{ .mmi; st1 [out1]=r30
410 st1 [out0]=r31 }
411{ .mfb; mov psr.um=loc0 // restore user mask
412 br.ret.sptk.many b0 };;
413.endp AES_encrypt#
414
415// *AES_decrypt are autogenerated by the following script:
416#if 0
417#!/usr/bin/env perl
418print "// *AES_decrypt are autogenerated by the following script:\n#if 0\n";
419open(PROG,'<'.$0); while(<PROG>) { print; } close(PROG);
420print "#endif\n";
421while(<>) {
422 $process=1 if (/\.proc\s+_ia64_AES_encrypt/);
423 next if (!$process);
424
425 #s/te00=s0/td00=s0/; s/te00/td00/g;
426 s/te11=s1/td13=s3/; s/te11/td13/g;
427 #s/te22=s2/td22=s2/; s/te22/td22/g;
428 s/te33=s3/td31=s1/; s/te33/td31/g;
429
430 #s/te01=s1/td01=s1/; s/te01/td01/g;
431 s/te12=s2/td10=s0/; s/te12/td10/g;
432 #s/te23=s3/td23=s3/; s/te23/td23/g;
433 s/te30=s0/td32=s2/; s/te30/td32/g;
434
435 #s/te02=s2/td02=s2/; s/te02/td02/g;
436 s/te13=s3/td11=s1/; s/te13/td11/g;
437 #s/te20=s0/td20=s0/; s/te20/td20/g;
438 s/te31=s1/td33=s3/; s/te31/td33/g;
439
440 #s/te03=s3/td03=s3/; s/te03/td03/g;
441 s/te10=s0/td12=s2/; s/te10/td12/g;
442 #s/te21=s1/td21=s1/; s/te21/td21/g;
443 s/te32=s2/td30=s0/; s/te32/td30/g;
444
445 s/td/te/g;
446
447 s/AES_encrypt/AES_decrypt/g;
448 s/\.Le_/.Ld_/g;
449 s/AES_Te#/AES_Td#/g;
450
451 print;
452
453 exit if (/\.endp\s+AES_decrypt/);
454}
455#endif
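Note that the script renames only the teNN register aliases, so in _ia64_AES_decrypt below the per-bundle cycle comments are carried over verbatim from the encrypt path: an annotation such as "0/0:s3&0xff" still describes the encrypt-side data flow even where the renamed instruction now operates on a different state word.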
456.proc _ia64_AES_decrypt#
457// Input: rk0-rk1
458// te0
459// te3 as AES_KEY->rounds!!!
460// s0-s3
461// maskff,twenty4,sixteen
462// Output: r16,r20,r24,r28 as s0-s3
463// Clobber: r16-r31,rk0-rk1,r32-r43
464.align 32
465_ia64_AES_decrypt:
466 .prologue
467 .altrp b6
468 .body
469{ .mmi; alloc r16=ar.pfs,12,0,0,8
470 LDKEY t0=[rk0],2*KSZ
471 mov pr.rot=1<<16 }
472{ .mmi; LDKEY t1=[rk1],2*KSZ
473 add te1=TE1,te0
474 add te3=-3,te3 };;
475{ .mib; LDKEY t2=[rk0],2*KSZ
476 mov ar.ec=2 }
477{ .mib; LDKEY t3=[rk1],2*KSZ
478 add te2=TE2,te0
479 brp.loop.imp .Ld_top,.Ld_end-16 };;
480
481{ .mmi; xor s0=s0,t0
482 xor s1=s1,t1
483 mov ar.lc=te3 }
484{ .mmi; xor s2=s2,t2
485 xor s3=s3,t3
486 add te3=TE3,te0 };;
487
488.align 32
489.Ld_top:
490{ .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
491 (p0) and te31=s1,maskff // 0/0:s3&0xff
492 (p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
493{ .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
494 (p0) and te32=s2,maskff // 0/1:s0&0xff
495 (p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24
496{ .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
497 (p0) shladd te31=te31,3,te3 // 1/0:te0+s0>>24
498 (p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
499{ .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
500 (p0) shladd te32=te32,3,te3 // 1/1:te3+s0
501 (p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24
502{ .mmi; (p0) ld4 te31=[te31] // 2/0:te3[s3&0xff]
503 (p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff
504 (p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
505{ .mmi; (p0) ld4 te32=[te32] // 2/1:te3[s0]
506 (p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8
507 (p0) shr.u te02=s2,twenty4 };; // 2/2:s2>>24
508{ .mmi; (p0) ld4 te22=[te22] // 3/0:te2[s2>>8]
509 (p0) shladd te20=te20,3,te2 // 3/2:te2+s0>>8
510 (p0) extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
511{ .mmi; (p0) ld4 te23=[te23] // 3/1:te2[s3>>8]
512 (p0) shladd te00=te00,3,te0 // 3/0:te0+s0>>24
513 (p0) shr.u te03=s3,twenty4 };; // 3/3:s3>>24
514{ .mmi; (p0) ld4 te20=[te20] // 4/2:te2[s0>>8]
515 (p0) shladd te21=te21,3,te2 // 4/3:te3+s2
516 (p0) extr.u te13=s3,16,8 } // 4/0:s1>>16&0xff
517{ .mmi; (p0) ld4 te00=[te00] // 4/0:te0[s0>>24]
518 (p0) shladd te01=te01,3,te0 // 4/1:te0+s1>>24
519 (p0) shr.u te11=s1,sixteen };; // 4/2:s3>>16
520{ .mmi; (p0) ld4 te21=[te21] // 5/3:te2[s1>>8]
521 (p0) shladd te13=te13,3,te1 // 5/0:te1+s1>>16
522 (p0) extr.u te10=s0,16,8 } // 5/1:s2>>16&0xff
523{ .mmi; (p0) ld4 te01=[te01] // 5/1:te0[s1>>24]
524 (p0) shladd te02=te02,3,te0 // 5/2:te0+s2>>24
525 (p0) and te33=s3,maskff };; // 5/2:s1&0xff
526{ .mmi; (p0) ld4 te13=[te13] // 6/0:te1[s1>>16]
527 (p0) shladd te10=te10,3,te1 // 6/1:te1+s2>>16
528 (p0) extr.u te12=s2,16,8 } // 6/3:s0>>16&0xff
529{ .mmi; (p0) ld4 te02=[te02] // 6/2:te0[s2>>24]
530 (p0) shladd te03=te03,3,te0 // 6/3:te1+s0>>16
531 (p0) and te30=s0,maskff };; // 6/3:s2&0xff
532
533{ .mmi; (p0) ld4 te10=[te10] // 7/1:te1[s2>>16]
534 (p0) shladd te33=te33,3,te3 // 7/2:te3+s1&0xff
535 (p0) and te11=te11,maskff} // 7/2:s3>>16&0xff
536{ .mmi; (p0) ld4 te03=[te03] // 7/3:te0[s3>>24]
537 (p0) shladd te30=te30,3,te3 // 7/3:te3+s2
538 (p0) xor t0=t0,te31 };; // 7/0:
539{ .mmi; (p0) ld4 te33=[te33] // 8/2:te3[s1]
540 (p0) shladd te11=te11,3,te1 // 8/2:te1+s3>>16
541 (p0) xor t0=t0,te22 } // 8/0:
542{ .mmi; (p0) ld4 te30=[te30] // 8/3:te3[s2]
543 (p0) shladd te12=te12,3,te1 // 8/3:te1+s0>>16
544 (p0) xor t1=t1,te32 };; // 8/1:
545{ .mmi; (p0) ld4 te11=[te11] // 9/2:te1[s3>>16]
546 (p0) ld4 te12=[te12] // 9/3:te1[s0>>16]
547 (p0) xor t0=t0,te00 };; // 9/0: !L2 scheduling
548{ .mmi; (p0) xor t1=t1,te23 // 10[9]/1:
549 (p0) xor t2=t2,te20 // 10[9]/2:
550 (p0) xor t3=t3,te21 };; // 10[9]/3:
551{ .mmi; (p0) xor t0=t0,te13 // 11[10]/0:done!
552 (p0) xor t1=t1,te01 // 11[10]/1:
553 (p0) xor t2=t2,te02 };; // 11[10]/2: !L2 scheduling
554{ .mmi; (p0) xor t3=t3,te03 // 12[10]/3:
555 (p16) cmp.eq p0,p17=r0,r0 };; // 12[10]/clear (p17)
556{ .mmi; (p0) xor t1=t1,te10 // 13[11]/1:done!
557 (p0) xor t2=t2,te33 // 13[11]/2:
558 (p0) xor t3=t3,te30 } // 13[11]/3:
559{ .mmi; (p17) add te0=2048,te0 // 13[11]/
560 (p17) add te1=2048+64-TE1,te1};; // 13[11]/
561{ .mib; (p0) xor t2=t2,te11 // 14[12]/2:done!
562 (p17) add te2=2048+128-TE2,te2} // 14[12]/
563{ .mib; (p0) xor t3=t3,te12 // 14[12]/3:done!
564 (p17) add te3=2048+192-TE3,te3 // 14[12]/
565 br.ctop.sptk .Ld_top };;
566.Ld_end:
567
568
569{ .mmi; ld8 te10=[te0] // prefetch Td4
570 ld8 te33=[te1] }
571{ .mmi; ld8 te12=[te2]
572 ld8 te30=[te3] }
573
574{ .mmi; LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
575 and te31=s1,maskff // 0/0:s3&0xff
576 extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
577{ .mmi; LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
578 and te32=s2,maskff // 0/1:s0&0xff
579 shr.u te00=s0,twenty4 };; // 0/0:s0>>24
580{ .mmi; LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
581 add te31=te31,te0 // 1/0:te0+s0>>24
582 extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
583{ .mmi; LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
584 add te32=te32,te0 // 1/1:te0+s0
585 shr.u te01=s1,twenty4 };; // 1/1:s1>>24
586{ .mmi; ld1 te31=[te31] // 2/0:te0[s3&0xff]
587 add te22=te22,te0 // 2/0:te0+s2>>8&0xff
588 extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
589{ .mmi; ld1 te32=[te32] // 2/1:te0[s0]
590 add te23=te23,te0 // 2/1:te0+s3>>8
591 shr.u te02=s2,twenty4 };; // 2/2:s2>>24
592{ .mmi; ld1 te22=[te22] // 3/0:te0[s2>>8]
593 add te20=te20,te0 // 3/2:te0+s0>>8
594 extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
595{ .mmi; ld1 te23=[te23] // 3/1:te0[s3>>8]
596 add te00=te00,te0 // 3/0:te0+s0>>24
597 shr.u te03=s3,twenty4 };; // 3/3:s3>>24
598{ .mmi; ld1 te20=[te20] // 4/2:te0[s0>>8]
599 add te21=te21,te0 // 4/3:te0+s2
600 extr.u te13=s3,16,8 } // 4/0:s1>>16&0xff
601{ .mmi; ld1 te00=[te00] // 4/0:te0[s0>>24]
602 add te01=te01,te0 // 4/1:te0+s1>>24
603 shr.u te11=s1,sixteen };; // 4/2:s3>>16
604{ .mmi; ld1 te21=[te21] // 5/3:te0[s1>>8]
605 add te13=te13,te0 // 5/0:te0+s1>>16
606 extr.u te10=s0,16,8 } // 5/1:s2>>16&0xff
607{ .mmi; ld1 te01=[te01] // 5/1:te0[s1>>24]
608 add te02=te02,te0 // 5/2:te0+s2>>24
609 and te33=s3,maskff };; // 5/2:s1&0xff
610{ .mmi; ld1 te13=[te13] // 6/0:te0[s1>>16]
611 add te10=te10,te0 // 6/1:te0+s2>>16
612 extr.u te12=s2,16,8 } // 6/3:s0>>16&0xff
613{ .mmi; ld1 te02=[te02] // 6/2:te0[s2>>24]
614 add te03=te03,te0 // 6/3:te0+s0>>16
615 and te30=s0,maskff };; // 6/3:s2&0xff
616
617{ .mmi; ld1 te10=[te10] // 7/1:te0[s2>>16]
618 add te33=te33,te0 // 7/2:te0+s1&0xff
619 dep te31=te22,te31,8,8} // 7/0:
620{ .mmi; ld1 te03=[te03] // 7/3:te0[s3>>24]
621 add te30=te30,te0 // 7/3:te0+s2
622 and te11=te11,maskff};; // 7/2:s3>>16&0xff
623{ .mmi; ld1 te33=[te33] // 8/2:te0[s1]
624 add te11=te11,te0 // 8/2:te0+s3>>16
625 dep te32=te23,te32,8,8} // 8/1:
626{ .mmi; ld1 te30=[te30] // 8/3:te0[s2]
627 add te12=te12,te0 // 8/3:te0+s0>>16
628 shl te00=te00,twenty4};; // 8/0:
629{ .mii; ld1 te11=[te11] // 9/2:te0[s3>>16]
630 dep te31=te13,te31,16,8 // 9/0:
631 shl te01=te01,twenty4};; // 9/1:
632{ .mii; ld1 te12=[te12] // 10/3:te0[s0>>16]
633 dep te33=te20,te33,8,8 // 10/2:
634 shl te02=te02,twenty4};; // 10/2:
635{ .mii; xor t0=t0,te31 // 11/0:
636 dep te30=te21,te30,8,8 // 11/3:
637 shl te10=te10,sixteen};; // 11/1:
638{ .mii; xor r16=t0,te00 // 12/0:done!
639 dep te33=te11,te33,16,8 // 12/2:
640 shl te03=te03,twenty4};; // 12/3:
641{ .mmi; xor t1=t1,te01 // 13/1:
642 xor t2=t2,te02 // 13/2:
643 dep te30=te12,te30,16,8};; // 13/3:
644{ .mmi; xor t1=t1,te32 // 14/1:
645 xor r24=t2,te33 // 14/2:done!
646 xor t3=t3,te30 };; // 14/3:
647{ .mib; xor r20=t1,te10 // 15/1:done!
648 xor r28=t3,te03 // 15/3:done!
649 br.ret.sptk b6 };;
650.endp _ia64_AES_decrypt#
651
652// void AES_decrypt (const void *in,void *out,const AES_KEY *key);
653.global AES_decrypt#
654.proc AES_decrypt#
655.align 32
656AES_decrypt:
657 .prologue
658 .save ar.pfs,pfssave
659{ .mmi; alloc pfssave=ar.pfs,3,1,12,0
660 and out0=3,in0
661 mov r3=ip }
662{ .mmi; ADDP in0=0,in0
663 mov loc0=psr.um
664 ADDP out11=KSZ*60,in2 };; // &AES_KEY->rounds
665
666{ .mmi; ld4 out11=[out11] // AES_KEY->rounds
667 	add	out8=(AES_Td#-AES_decrypt#),r3	// Td0
668 .save pr,prsave
669 mov prsave=pr }
670{ .mmi; rum 1<<3 // clear um.ac
671 .save ar.lc,lcsave
672 mov lcsave=ar.lc };;
673
674 .body
675#if defined(_HPUX_SOURCE) // HPUX is big-endian, cut 15+15 cycles...
676{ .mib; cmp.ne p6,p0=out0,r0
677 add out0=4,in0
678(p6) br.dpnt.many .Ld_i_unaligned };;
679
680{ .mmi; ld4 out1=[in0],8 // s0
681 and out9=3,in1
682 mov twenty4=24 }
683{ .mmi; ld4 out3=[out0],8 // s1
684 ADDP rk0=0,in2
685 mov sixteen=16 };;
686{ .mmi; ld4 out5=[in0] // s2
687 cmp.ne p6,p0=out9,r0
688 mov maskff=0xff }
689{ .mmb; ld4 out7=[out0] // s3
690 ADDP rk1=KSZ,in2
691 br.call.sptk.many b6=_ia64_AES_decrypt };;
692
693{ .mib; ADDP in0=4,in1
694 ADDP in1=0,in1
695(p6) br.spnt .Ld_o_unaligned };;
696
697{ .mii; mov psr.um=loc0
698 mov ar.pfs=pfssave
699 mov ar.lc=lcsave };;
700{ .mmi; st4 [in1]=r16,8 // s0
701 st4 [in0]=r20,8 // s1
702 mov pr=prsave,0x1ffff };;
703{ .mmb; st4 [in1]=r24 // s2
704 st4 [in0]=r28 // s3
705 br.ret.sptk.many b0 };;
706#endif
707
708.align 32
709.Ld_i_unaligned:
710{ .mmi; add out0=1,in0
711 add out2=2,in0
712 add out4=3,in0 };;
713{ .mmi; ld1 r16=[in0],4
714 ld1 r17=[out0],4 }//;;
715{ .mmi; ld1 r18=[out2],4
716 ld1 out1=[out4],4 };; // s0
717{ .mmi; ld1 r20=[in0],4
718 ld1 r21=[out0],4 }//;;
719{ .mmi; ld1 r22=[out2],4
720 ld1 out3=[out4],4 };; // s1
721{ .mmi; ld1 r24=[in0],4
722 ld1 r25=[out0],4 }//;;
723{ .mmi; ld1 r26=[out2],4
724 ld1 out5=[out4],4 };; // s2
725{ .mmi; ld1 r28=[in0]
726 ld1 r29=[out0] }//;;
727{ .mmi; ld1 r30=[out2]
728 ld1 out7=[out4] };; // s3
729
730{ .mii;
731 dep out1=r16,out1,24,8 //;;
732 dep out3=r20,out3,24,8 }//;;
733{ .mii; ADDP rk0=0,in2
734 dep out5=r24,out5,24,8 //;;
735 dep out7=r28,out7,24,8 };;
736{ .mii; ADDP rk1=KSZ,in2
737 dep out1=r17,out1,16,8 //;;
738 dep out3=r21,out3,16,8 }//;;
739{ .mii; mov twenty4=24
740 dep out5=r25,out5,16,8 //;;
741 dep out7=r29,out7,16,8 };;
742{ .mii; mov sixteen=16
743 dep out1=r18,out1,8,8 //;;
744 dep out3=r22,out3,8,8 }//;;
745{ .mii; mov maskff=0xff
746 dep out5=r26,out5,8,8 //;;
747 dep out7=r30,out7,8,8 };;
748
749{ .mib; br.call.sptk.many b6=_ia64_AES_decrypt };;
750
751.Ld_o_unaligned:
752{ .mii; ADDP out0=0,in1
753 extr.u r17=r16,8,8 // s0
754 shr.u r19=r16,twenty4 }//;;
755{ .mii; ADDP out1=1,in1
756 extr.u r18=r16,16,8
757 shr.u r23=r20,twenty4 }//;; // s1
758{ .mii; ADDP out2=2,in1
759 extr.u r21=r20,8,8
760 shr.u r22=r20,sixteen }//;;
761{ .mii; ADDP out3=3,in1
762 extr.u r25=r24,8,8 // s2
763 shr.u r27=r24,twenty4 };;
764{ .mii; st1 [out3]=r16,4
765 extr.u r26=r24,16,8
766 shr.u r31=r28,twenty4 }//;; // s3
767{ .mii; st1 [out2]=r17,4
768 extr.u r29=r28,8,8
769 shr.u r30=r28,sixteen }//;;
770
771{ .mmi; st1 [out1]=r18,4
772 st1 [out0]=r19,4 };;
773{ .mmi; st1 [out3]=r20,4
774 st1 [out2]=r21,4 }//;;
775{ .mmi; st1 [out1]=r22,4
776 st1 [out0]=r23,4 };;
777{ .mmi; st1 [out3]=r24,4
778 st1 [out2]=r25,4
779 mov pr=prsave,0x1ffff }//;;
780{ .mmi; st1 [out1]=r26,4
781 st1 [out0]=r27,4
782 mov ar.pfs=pfssave };;
783{ .mmi; st1 [out3]=r28
784 st1 [out2]=r29
785 mov ar.lc=lcsave }//;;
786{ .mmi; st1 [out1]=r30
787 st1 [out0]=r31 }
788{ .mfb; mov psr.um=loc0 // restore user mask
789 br.ret.sptk.many b0 };;
790.endp AES_decrypt#
791
792// leave it in .text segment...
793.align 64
794.global AES_Te#
795.type AES_Te#,@object
796AES_Te: data4 0xc66363a5,0xc66363a5, 0xf87c7c84,0xf87c7c84
797 data4 0xee777799,0xee777799, 0xf67b7b8d,0xf67b7b8d
798 data4 0xfff2f20d,0xfff2f20d, 0xd66b6bbd,0xd66b6bbd
799 data4 0xde6f6fb1,0xde6f6fb1, 0x91c5c554,0x91c5c554
800 data4 0x60303050,0x60303050, 0x02010103,0x02010103
801 data4 0xce6767a9,0xce6767a9, 0x562b2b7d,0x562b2b7d
802 data4 0xe7fefe19,0xe7fefe19, 0xb5d7d762,0xb5d7d762
803 data4 0x4dababe6,0x4dababe6, 0xec76769a,0xec76769a
804 data4 0x8fcaca45,0x8fcaca45, 0x1f82829d,0x1f82829d
805 data4 0x89c9c940,0x89c9c940, 0xfa7d7d87,0xfa7d7d87
806 data4 0xeffafa15,0xeffafa15, 0xb25959eb,0xb25959eb
807 data4 0x8e4747c9,0x8e4747c9, 0xfbf0f00b,0xfbf0f00b
808 data4 0x41adadec,0x41adadec, 0xb3d4d467,0xb3d4d467
809 data4 0x5fa2a2fd,0x5fa2a2fd, 0x45afafea,0x45afafea
810 data4 0x239c9cbf,0x239c9cbf, 0x53a4a4f7,0x53a4a4f7
811 data4 0xe4727296,0xe4727296, 0x9bc0c05b,0x9bc0c05b
812 data4 0x75b7b7c2,0x75b7b7c2, 0xe1fdfd1c,0xe1fdfd1c
813 data4 0x3d9393ae,0x3d9393ae, 0x4c26266a,0x4c26266a
814 data4 0x6c36365a,0x6c36365a, 0x7e3f3f41,0x7e3f3f41
815 data4 0xf5f7f702,0xf5f7f702, 0x83cccc4f,0x83cccc4f
816 data4 0x6834345c,0x6834345c, 0x51a5a5f4,0x51a5a5f4
817 data4 0xd1e5e534,0xd1e5e534, 0xf9f1f108,0xf9f1f108
818 data4 0xe2717193,0xe2717193, 0xabd8d873,0xabd8d873
819 data4 0x62313153,0x62313153, 0x2a15153f,0x2a15153f
820 data4 0x0804040c,0x0804040c, 0x95c7c752,0x95c7c752
821 data4 0x46232365,0x46232365, 0x9dc3c35e,0x9dc3c35e
822 data4 0x30181828,0x30181828, 0x379696a1,0x379696a1
823 data4 0x0a05050f,0x0a05050f, 0x2f9a9ab5,0x2f9a9ab5
824 data4 0x0e070709,0x0e070709, 0x24121236,0x24121236
825 data4 0x1b80809b,0x1b80809b, 0xdfe2e23d,0xdfe2e23d
826 data4 0xcdebeb26,0xcdebeb26, 0x4e272769,0x4e272769
827 data4 0x7fb2b2cd,0x7fb2b2cd, 0xea75759f,0xea75759f
828 data4 0x1209091b,0x1209091b, 0x1d83839e,0x1d83839e
829 data4 0x582c2c74,0x582c2c74, 0x341a1a2e,0x341a1a2e
830 data4 0x361b1b2d,0x361b1b2d, 0xdc6e6eb2,0xdc6e6eb2
831 data4 0xb45a5aee,0xb45a5aee, 0x5ba0a0fb,0x5ba0a0fb
832 data4 0xa45252f6,0xa45252f6, 0x763b3b4d,0x763b3b4d
833 data4 0xb7d6d661,0xb7d6d661, 0x7db3b3ce,0x7db3b3ce
834 data4 0x5229297b,0x5229297b, 0xdde3e33e,0xdde3e33e
835 data4 0x5e2f2f71,0x5e2f2f71, 0x13848497,0x13848497
836 data4 0xa65353f5,0xa65353f5, 0xb9d1d168,0xb9d1d168
837 data4 0x00000000,0x00000000, 0xc1eded2c,0xc1eded2c
838 data4 0x40202060,0x40202060, 0xe3fcfc1f,0xe3fcfc1f
839 data4 0x79b1b1c8,0x79b1b1c8, 0xb65b5bed,0xb65b5bed
840 data4 0xd46a6abe,0xd46a6abe, 0x8dcbcb46,0x8dcbcb46
841 data4 0x67bebed9,0x67bebed9, 0x7239394b,0x7239394b
842 data4 0x944a4ade,0x944a4ade, 0x984c4cd4,0x984c4cd4
843 data4 0xb05858e8,0xb05858e8, 0x85cfcf4a,0x85cfcf4a
844 data4 0xbbd0d06b,0xbbd0d06b, 0xc5efef2a,0xc5efef2a
845 data4 0x4faaaae5,0x4faaaae5, 0xedfbfb16,0xedfbfb16
846 data4 0x864343c5,0x864343c5, 0x9a4d4dd7,0x9a4d4dd7
847 data4 0x66333355,0x66333355, 0x11858594,0x11858594
848 data4 0x8a4545cf,0x8a4545cf, 0xe9f9f910,0xe9f9f910
849 data4 0x04020206,0x04020206, 0xfe7f7f81,0xfe7f7f81
850 data4 0xa05050f0,0xa05050f0, 0x783c3c44,0x783c3c44
851 data4 0x259f9fba,0x259f9fba, 0x4ba8a8e3,0x4ba8a8e3
852 data4 0xa25151f3,0xa25151f3, 0x5da3a3fe,0x5da3a3fe
853 data4 0x804040c0,0x804040c0, 0x058f8f8a,0x058f8f8a
854 data4 0x3f9292ad,0x3f9292ad, 0x219d9dbc,0x219d9dbc
855 data4 0x70383848,0x70383848, 0xf1f5f504,0xf1f5f504
856 data4 0x63bcbcdf,0x63bcbcdf, 0x77b6b6c1,0x77b6b6c1
857 data4 0xafdada75,0xafdada75, 0x42212163,0x42212163
858 data4 0x20101030,0x20101030, 0xe5ffff1a,0xe5ffff1a
859 data4 0xfdf3f30e,0xfdf3f30e, 0xbfd2d26d,0xbfd2d26d
860 data4 0x81cdcd4c,0x81cdcd4c, 0x180c0c14,0x180c0c14
861 data4 0x26131335,0x26131335, 0xc3ecec2f,0xc3ecec2f
862 data4 0xbe5f5fe1,0xbe5f5fe1, 0x359797a2,0x359797a2
863 data4 0x884444cc,0x884444cc, 0x2e171739,0x2e171739
864 data4 0x93c4c457,0x93c4c457, 0x55a7a7f2,0x55a7a7f2
865 data4 0xfc7e7e82,0xfc7e7e82, 0x7a3d3d47,0x7a3d3d47
866 data4 0xc86464ac,0xc86464ac, 0xba5d5de7,0xba5d5de7
867 data4 0x3219192b,0x3219192b, 0xe6737395,0xe6737395
868 data4 0xc06060a0,0xc06060a0, 0x19818198,0x19818198
869 data4 0x9e4f4fd1,0x9e4f4fd1, 0xa3dcdc7f,0xa3dcdc7f
870 data4 0x44222266,0x44222266, 0x542a2a7e,0x542a2a7e
871 data4 0x3b9090ab,0x3b9090ab, 0x0b888883,0x0b888883
872 data4 0x8c4646ca,0x8c4646ca, 0xc7eeee29,0xc7eeee29
873 data4 0x6bb8b8d3,0x6bb8b8d3, 0x2814143c,0x2814143c
874 data4 0xa7dede79,0xa7dede79, 0xbc5e5ee2,0xbc5e5ee2
875 data4 0x160b0b1d,0x160b0b1d, 0xaddbdb76,0xaddbdb76
876 data4 0xdbe0e03b,0xdbe0e03b, 0x64323256,0x64323256
877 data4 0x743a3a4e,0x743a3a4e, 0x140a0a1e,0x140a0a1e
878 data4 0x924949db,0x924949db, 0x0c06060a,0x0c06060a
879 data4 0x4824246c,0x4824246c, 0xb85c5ce4,0xb85c5ce4
880 data4 0x9fc2c25d,0x9fc2c25d, 0xbdd3d36e,0xbdd3d36e
881 data4 0x43acacef,0x43acacef, 0xc46262a6,0xc46262a6
882 data4 0x399191a8,0x399191a8, 0x319595a4,0x319595a4
883 data4 0xd3e4e437,0xd3e4e437, 0xf279798b,0xf279798b
884 data4 0xd5e7e732,0xd5e7e732, 0x8bc8c843,0x8bc8c843
885 data4 0x6e373759,0x6e373759, 0xda6d6db7,0xda6d6db7
886 data4 0x018d8d8c,0x018d8d8c, 0xb1d5d564,0xb1d5d564
887 data4 0x9c4e4ed2,0x9c4e4ed2, 0x49a9a9e0,0x49a9a9e0
888 data4 0xd86c6cb4,0xd86c6cb4, 0xac5656fa,0xac5656fa
889 data4 0xf3f4f407,0xf3f4f407, 0xcfeaea25,0xcfeaea25
890 data4 0xca6565af,0xca6565af, 0xf47a7a8e,0xf47a7a8e
891 data4 0x47aeaee9,0x47aeaee9, 0x10080818,0x10080818
892 data4 0x6fbabad5,0x6fbabad5, 0xf0787888,0xf0787888
893 data4 0x4a25256f,0x4a25256f, 0x5c2e2e72,0x5c2e2e72
894 data4 0x381c1c24,0x381c1c24, 0x57a6a6f1,0x57a6a6f1
895 data4 0x73b4b4c7,0x73b4b4c7, 0x97c6c651,0x97c6c651
896 data4 0xcbe8e823,0xcbe8e823, 0xa1dddd7c,0xa1dddd7c
897 data4 0xe874749c,0xe874749c, 0x3e1f1f21,0x3e1f1f21
898 data4 0x964b4bdd,0x964b4bdd, 0x61bdbddc,0x61bdbddc
899 data4 0x0d8b8b86,0x0d8b8b86, 0x0f8a8a85,0x0f8a8a85
900 data4 0xe0707090,0xe0707090, 0x7c3e3e42,0x7c3e3e42
901 data4 0x71b5b5c4,0x71b5b5c4, 0xcc6666aa,0xcc6666aa
902 data4 0x904848d8,0x904848d8, 0x06030305,0x06030305
903 data4 0xf7f6f601,0xf7f6f601, 0x1c0e0e12,0x1c0e0e12
904 data4 0xc26161a3,0xc26161a3, 0x6a35355f,0x6a35355f
905 data4 0xae5757f9,0xae5757f9, 0x69b9b9d0,0x69b9b9d0
906 data4 0x17868691,0x17868691, 0x99c1c158,0x99c1c158
907 data4 0x3a1d1d27,0x3a1d1d27, 0x279e9eb9,0x279e9eb9
908 data4 0xd9e1e138,0xd9e1e138, 0xebf8f813,0xebf8f813
909 data4 0x2b9898b3,0x2b9898b3, 0x22111133,0x22111133
910 data4 0xd26969bb,0xd26969bb, 0xa9d9d970,0xa9d9d970
911 data4 0x078e8e89,0x078e8e89, 0x339494a7,0x339494a7
912 data4 0x2d9b9bb6,0x2d9b9bb6, 0x3c1e1e22,0x3c1e1e22
913 data4 0x15878792,0x15878792, 0xc9e9e920,0xc9e9e920
914 data4 0x87cece49,0x87cece49, 0xaa5555ff,0xaa5555ff
915 data4 0x50282878,0x50282878, 0xa5dfdf7a,0xa5dfdf7a
916 data4 0x038c8c8f,0x038c8c8f, 0x59a1a1f8,0x59a1a1f8
917 data4 0x09898980,0x09898980, 0x1a0d0d17,0x1a0d0d17
918 data4 0x65bfbfda,0x65bfbfda, 0xd7e6e631,0xd7e6e631
919 data4 0x844242c6,0x844242c6, 0xd06868b8,0xd06868b8
920 data4 0x824141c3,0x824141c3, 0x299999b0,0x299999b0
921 data4 0x5a2d2d77,0x5a2d2d77, 0x1e0f0f11,0x1e0f0f11
922 data4 0x7bb0b0cb,0x7bb0b0cb, 0xa85454fc,0xa85454fc
923 data4 0x6dbbbbd6,0x6dbbbbd6, 0x2c16163a,0x2c16163a
924// Te4:
925 data1 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
926 data1 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
927 data1 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
928 data1 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
929 data1 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
930 data1 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
931 data1 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
932 data1 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
933 data1 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
934 data1 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
935 data1 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
936 data1 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
937 data1 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
938 data1 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
939 data1 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
940 data1 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
941 data1 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
942 data1 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
943 data1 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
944 data1 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
945 data1 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
946 data1 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
947 data1 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
948 data1 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
949 data1 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
950 data1 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
951 data1 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
952 data1 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
953 data1 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
954 data1 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
955 data1 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
956 data1 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
957.size AES_Te#,2048+256 // HP-UX assembler fails to ".-AES_Te#"
958
959.align 64
960.global AES_Td#
961.type AES_Td#,@object
962AES_Td: data4 0x51f4a750,0x51f4a750, 0x7e416553,0x7e416553
963 data4 0x1a17a4c3,0x1a17a4c3, 0x3a275e96,0x3a275e96
964 data4 0x3bab6bcb,0x3bab6bcb, 0x1f9d45f1,0x1f9d45f1
965 data4 0xacfa58ab,0xacfa58ab, 0x4be30393,0x4be30393
966 data4 0x2030fa55,0x2030fa55, 0xad766df6,0xad766df6
967 data4 0x88cc7691,0x88cc7691, 0xf5024c25,0xf5024c25
968 data4 0x4fe5d7fc,0x4fe5d7fc, 0xc52acbd7,0xc52acbd7
969 data4 0x26354480,0x26354480, 0xb562a38f,0xb562a38f
970 data4 0xdeb15a49,0xdeb15a49, 0x25ba1b67,0x25ba1b67
971 data4 0x45ea0e98,0x45ea0e98, 0x5dfec0e1,0x5dfec0e1
972 data4 0xc32f7502,0xc32f7502, 0x814cf012,0x814cf012
973 data4 0x8d4697a3,0x8d4697a3, 0x6bd3f9c6,0x6bd3f9c6
974 data4 0x038f5fe7,0x038f5fe7, 0x15929c95,0x15929c95
975 data4 0xbf6d7aeb,0xbf6d7aeb, 0x955259da,0x955259da
976 data4 0xd4be832d,0xd4be832d, 0x587421d3,0x587421d3
977 data4 0x49e06929,0x49e06929, 0x8ec9c844,0x8ec9c844
978 data4 0x75c2896a,0x75c2896a, 0xf48e7978,0xf48e7978
979 data4 0x99583e6b,0x99583e6b, 0x27b971dd,0x27b971dd
980 data4 0xbee14fb6,0xbee14fb6, 0xf088ad17,0xf088ad17
981 data4 0xc920ac66,0xc920ac66, 0x7dce3ab4,0x7dce3ab4
982 data4 0x63df4a18,0x63df4a18, 0xe51a3182,0xe51a3182
983 data4 0x97513360,0x97513360, 0x62537f45,0x62537f45
984 data4 0xb16477e0,0xb16477e0, 0xbb6bae84,0xbb6bae84
985 data4 0xfe81a01c,0xfe81a01c, 0xf9082b94,0xf9082b94
986 data4 0x70486858,0x70486858, 0x8f45fd19,0x8f45fd19
987 data4 0x94de6c87,0x94de6c87, 0x527bf8b7,0x527bf8b7
988 data4 0xab73d323,0xab73d323, 0x724b02e2,0x724b02e2
989 data4 0xe31f8f57,0xe31f8f57, 0x6655ab2a,0x6655ab2a
990 data4 0xb2eb2807,0xb2eb2807, 0x2fb5c203,0x2fb5c203
991 data4 0x86c57b9a,0x86c57b9a, 0xd33708a5,0xd33708a5
992 data4 0x302887f2,0x302887f2, 0x23bfa5b2,0x23bfa5b2
993 data4 0x02036aba,0x02036aba, 0xed16825c,0xed16825c
994 data4 0x8acf1c2b,0x8acf1c2b, 0xa779b492,0xa779b492
995 data4 0xf307f2f0,0xf307f2f0, 0x4e69e2a1,0x4e69e2a1
996 data4 0x65daf4cd,0x65daf4cd, 0x0605bed5,0x0605bed5
997 data4 0xd134621f,0xd134621f, 0xc4a6fe8a,0xc4a6fe8a
998 data4 0x342e539d,0x342e539d, 0xa2f355a0,0xa2f355a0
999 data4 0x058ae132,0x058ae132, 0xa4f6eb75,0xa4f6eb75
1000 data4 0x0b83ec39,0x0b83ec39, 0x4060efaa,0x4060efaa
1001 data4 0x5e719f06,0x5e719f06, 0xbd6e1051,0xbd6e1051
1002 data4 0x3e218af9,0x3e218af9, 0x96dd063d,0x96dd063d
1003 data4 0xdd3e05ae,0xdd3e05ae, 0x4de6bd46,0x4de6bd46
1004 data4 0x91548db5,0x91548db5, 0x71c45d05,0x71c45d05
1005 data4 0x0406d46f,0x0406d46f, 0x605015ff,0x605015ff
1006 data4 0x1998fb24,0x1998fb24, 0xd6bde997,0xd6bde997
1007 data4 0x894043cc,0x894043cc, 0x67d99e77,0x67d99e77
1008 data4 0xb0e842bd,0xb0e842bd, 0x07898b88,0x07898b88
1009 data4 0xe7195b38,0xe7195b38, 0x79c8eedb,0x79c8eedb
1010 data4 0xa17c0a47,0xa17c0a47, 0x7c420fe9,0x7c420fe9
1011 data4 0xf8841ec9,0xf8841ec9, 0x00000000,0x00000000
1012 data4 0x09808683,0x09808683, 0x322bed48,0x322bed48
1013 data4 0x1e1170ac,0x1e1170ac, 0x6c5a724e,0x6c5a724e
1014 data4 0xfd0efffb,0xfd0efffb, 0x0f853856,0x0f853856
1015 data4 0x3daed51e,0x3daed51e, 0x362d3927,0x362d3927
1016 data4 0x0a0fd964,0x0a0fd964, 0x685ca621,0x685ca621
1017 data4 0x9b5b54d1,0x9b5b54d1, 0x24362e3a,0x24362e3a
1018 data4 0x0c0a67b1,0x0c0a67b1, 0x9357e70f,0x9357e70f
1019 data4 0xb4ee96d2,0xb4ee96d2, 0x1b9b919e,0x1b9b919e
1020 data4 0x80c0c54f,0x80c0c54f, 0x61dc20a2,0x61dc20a2
1021 data4 0x5a774b69,0x5a774b69, 0x1c121a16,0x1c121a16
1022 data4 0xe293ba0a,0xe293ba0a, 0xc0a02ae5,0xc0a02ae5
1023 data4 0x3c22e043,0x3c22e043, 0x121b171d,0x121b171d
1024 data4 0x0e090d0b,0x0e090d0b, 0xf28bc7ad,0xf28bc7ad
1025 data4 0x2db6a8b9,0x2db6a8b9, 0x141ea9c8,0x141ea9c8
1026 data4 0x57f11985,0x57f11985, 0xaf75074c,0xaf75074c
1027 data4 0xee99ddbb,0xee99ddbb, 0xa37f60fd,0xa37f60fd
1028 data4 0xf701269f,0xf701269f, 0x5c72f5bc,0x5c72f5bc
1029 data4 0x44663bc5,0x44663bc5, 0x5bfb7e34,0x5bfb7e34
1030 data4 0x8b432976,0x8b432976, 0xcb23c6dc,0xcb23c6dc
1031 data4 0xb6edfc68,0xb6edfc68, 0xb8e4f163,0xb8e4f163
1032 data4 0xd731dcca,0xd731dcca, 0x42638510,0x42638510
1033 data4 0x13972240,0x13972240, 0x84c61120,0x84c61120
1034 data4 0x854a247d,0x854a247d, 0xd2bb3df8,0xd2bb3df8
1035 data4 0xaef93211,0xaef93211, 0xc729a16d,0xc729a16d
1036 data4 0x1d9e2f4b,0x1d9e2f4b, 0xdcb230f3,0xdcb230f3
1037 data4 0x0d8652ec,0x0d8652ec, 0x77c1e3d0,0x77c1e3d0
1038 data4 0x2bb3166c,0x2bb3166c, 0xa970b999,0xa970b999
1039 data4 0x119448fa,0x119448fa, 0x47e96422,0x47e96422
1040 data4 0xa8fc8cc4,0xa8fc8cc4, 0xa0f03f1a,0xa0f03f1a
1041 data4 0x567d2cd8,0x567d2cd8, 0x223390ef,0x223390ef
1042 data4 0x87494ec7,0x87494ec7, 0xd938d1c1,0xd938d1c1
1043 data4 0x8ccaa2fe,0x8ccaa2fe, 0x98d40b36,0x98d40b36
1044 data4 0xa6f581cf,0xa6f581cf, 0xa57ade28,0xa57ade28
1045 data4 0xdab78e26,0xdab78e26, 0x3fadbfa4,0x3fadbfa4
1046 data4 0x2c3a9de4,0x2c3a9de4, 0x5078920d,0x5078920d
1047 data4 0x6a5fcc9b,0x6a5fcc9b, 0x547e4662,0x547e4662
1048 data4 0xf68d13c2,0xf68d13c2, 0x90d8b8e8,0x90d8b8e8
1049 data4 0x2e39f75e,0x2e39f75e, 0x82c3aff5,0x82c3aff5
1050 data4 0x9f5d80be,0x9f5d80be, 0x69d0937c,0x69d0937c
1051 data4 0x6fd52da9,0x6fd52da9, 0xcf2512b3,0xcf2512b3
1052 data4 0xc8ac993b,0xc8ac993b, 0x10187da7,0x10187da7
1053 data4 0xe89c636e,0xe89c636e, 0xdb3bbb7b,0xdb3bbb7b
1054 data4 0xcd267809,0xcd267809, 0x6e5918f4,0x6e5918f4
1055 data4 0xec9ab701,0xec9ab701, 0x834f9aa8,0x834f9aa8
1056 data4 0xe6956e65,0xe6956e65, 0xaaffe67e,0xaaffe67e
1057 data4 0x21bccf08,0x21bccf08, 0xef15e8e6,0xef15e8e6
1058 data4 0xbae79bd9,0xbae79bd9, 0x4a6f36ce,0x4a6f36ce
1059 data4 0xea9f09d4,0xea9f09d4, 0x29b07cd6,0x29b07cd6
1060 data4 0x31a4b2af,0x31a4b2af, 0x2a3f2331,0x2a3f2331
1061 data4 0xc6a59430,0xc6a59430, 0x35a266c0,0x35a266c0
1062 data4 0x744ebc37,0x744ebc37, 0xfc82caa6,0xfc82caa6
1063 data4 0xe090d0b0,0xe090d0b0, 0x33a7d815,0x33a7d815
1064 data4 0xf104984a,0xf104984a, 0x41ecdaf7,0x41ecdaf7
1065 data4 0x7fcd500e,0x7fcd500e, 0x1791f62f,0x1791f62f
1066 data4 0x764dd68d,0x764dd68d, 0x43efb04d,0x43efb04d
1067 data4 0xccaa4d54,0xccaa4d54, 0xe49604df,0xe49604df
1068 data4 0x9ed1b5e3,0x9ed1b5e3, 0x4c6a881b,0x4c6a881b
1069 data4 0xc12c1fb8,0xc12c1fb8, 0x4665517f,0x4665517f
1070 data4 0x9d5eea04,0x9d5eea04, 0x018c355d,0x018c355d
1071 data4 0xfa877473,0xfa877473, 0xfb0b412e,0xfb0b412e
1072 data4 0xb3671d5a,0xb3671d5a, 0x92dbd252,0x92dbd252
1073 data4 0xe9105633,0xe9105633, 0x6dd64713,0x6dd64713
1074 data4 0x9ad7618c,0x9ad7618c, 0x37a10c7a,0x37a10c7a
1075 data4 0x59f8148e,0x59f8148e, 0xeb133c89,0xeb133c89
1076 data4 0xcea927ee,0xcea927ee, 0xb761c935,0xb761c935
1077 data4 0xe11ce5ed,0xe11ce5ed, 0x7a47b13c,0x7a47b13c
1078 data4 0x9cd2df59,0x9cd2df59, 0x55f2733f,0x55f2733f
1079 data4 0x1814ce79,0x1814ce79, 0x73c737bf,0x73c737bf
1080 data4 0x53f7cdea,0x53f7cdea, 0x5ffdaa5b,0x5ffdaa5b
1081 data4 0xdf3d6f14,0xdf3d6f14, 0x7844db86,0x7844db86
1082 data4 0xcaaff381,0xcaaff381, 0xb968c43e,0xb968c43e
1083 data4 0x3824342c,0x3824342c, 0xc2a3405f,0xc2a3405f
1084 data4 0x161dc372,0x161dc372, 0xbce2250c,0xbce2250c
1085 data4 0x283c498b,0x283c498b, 0xff0d9541,0xff0d9541
1086 data4 0x39a80171,0x39a80171, 0x080cb3de,0x080cb3de
1087 data4 0xd8b4e49c,0xd8b4e49c, 0x6456c190,0x6456c190
1088 data4 0x7bcb8461,0x7bcb8461, 0xd532b670,0xd532b670
1089 data4 0x486c5c74,0x486c5c74, 0xd0b85742,0xd0b85742
1090// Td4:
1091 data1 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
1092 data1 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
1093 data1 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
1094 data1 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
1095 data1 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
1096 data1 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
1097 data1 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
1098 data1 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
1099 data1 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
1100 data1 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
1101 data1 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
1102 data1 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
1103 data1 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
1104 data1 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
1105 data1 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
1106 data1 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
1107 data1 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
1108 data1 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
1109 data1 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
1110 data1 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
1111 data1 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
1112 data1 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
1113 data1 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
1114 data1 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
1115 data1 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
1116 data1 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
1117 data1 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1118 data1 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1119 data1 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1120 data1 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1121 data1 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1122 data1 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1123.size AES_Td#,2048+256 // HP-UX assembler fails to ".-AES_Td#"
diff --git a/src/lib/libcrypto/aes/asm/aes-mips.pl b/src/lib/libcrypto/aes/asm/aes-mips.pl
deleted file mode 100644
index 2f6ff74ffe..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-mips.pl
+++ /dev/null
@@ -1,1613 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for MIPS
11
12# October 2010
13#
14# Code uses a 1K[+256B] S-box and on a single-issue core [such as the
15# R5000] spends ~68 cycles per byte processed with a 128-bit key. This
16# is ~16% faster than gcc-generated code, which is not very impressive.
17# But recall that the compressed S-box requires extra processing,
18# namely additional rotations. Rotations are implemented with lwl/lwr
19# pairs, which are normally used for loading unaligned data. Another
20# cool thing about this module is its endian neutrality, which means
21# that it processes data without ever changing byte order...
22
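One detail worth calling out before the code: the extraction helper is used as "_xtr $i0,$s1,16-2" followed by "and $i0,0x3fc", extracting the byte with the shift reduced by 2 and masking with 0xff<<2, so the result is the byte already multiplied by 4, a ready-made offset into a table of 32-bit words. A C equivalent (helper name hypothetical; _xtr itself is a macro expanded later by this script, which also handles the "0-2" rotate case):

    #include <stdint.h>

    /* same value as ((s >> bitpos) & 0xff) * 4, valid for bitpos >= 2 */
    static inline uint32_t scaled_index(uint32_t s, unsigned bitpos)
    {
        return (s >> (bitpos - 2)) & 0x3fc;
    }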
23######################################################################
24# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
25# widely used. Then there is a new contender: NUBI. It appears that if
26# one picks the latter, it's possible to arrange code in an ABI-neutral
27# manner. Therefore let's stick to the NUBI register layout:
28#
29($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
30($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
31($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
32($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
33#
34# The return value is placed in $a0. The following coding rules
35# facilitate interoperability:
36#
37# - never ever touch $tp, "thread pointer", former $gp;
38# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
39# old code];
40# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
41#
42# For reference here is register layout for N32/64 MIPS ABIs:
43#
44# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
45# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
46# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
47# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
48# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
49#
50$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
51
52if ($flavour =~ /64/i) {
53 $LA="dla";
54} else {
55 $LA="la";
56}
57
58if ($flavour =~ /64|n32/i) {
59 $PTR_ADD="dadd"; # incidentally works even on n32
60 $PTR_SUB="dsub"; # incidentally works even on n32
61 $REG_S="sd";
62 $REG_L="ld";
63 $PTR_SLL="dsll"; # incidentally works even on n32
64 $SZREG=8;
65} else {
66 $PTR_ADD="add";
67 $PTR_SUB="sub";
68 $REG_S="sw";
69 $REG_L="lw";
70 $PTR_SLL="sll";
71 $SZREG=4;
72}
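As the argument handling above and below shows, the flavour is the first command-line word and the output file is whichever remaining argument looks like a filename, so a typical invocation is "perl aes-mips.pl o32 aes-mips.S", with n32, 64, nubi32 or nubi64 in place of o32.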
73$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
74#
75# <appro@openssl.org>
76#
77######################################################################
78
79$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
80
81for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
82open STDOUT,">$output";
83
84if (!defined($big_endian))
85{ $big_endian=(unpack('L',pack('N',1))==1); }
86
87while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
88open STDOUT,">$output";
89
90my ($MSB,$LSB)=(0,3); # automatically converted to little-endian
91
92$code.=<<___;
93.text
94#if !defined(__vxworks) || defined(__pic__)
95.option pic2
96#endif
97.set noat
98___
99
100{{{
101my $FRAMESIZE=16*$SZREG;
102my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
103
104my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
105my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
106my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23));
107my ($key0,$cnt)=($gp,$fp);
108
109# instruction ordering is "stolen" from the output of the MIPSpro
110# assembler invoked with -mips3 -O3 arguments...
111$code.=<<___;
112.align 5
113.ent _mips_AES_encrypt
114_mips_AES_encrypt:
115 .frame $sp,0,$ra
116 .set reorder
117 lw $t0,0($key)
118 lw $t1,4($key)
119 lw $t2,8($key)
120 lw $t3,12($key)
121 lw $cnt,240($key)
122 $PTR_ADD $key0,$key,16
123
124 xor $s0,$t0
125 xor $s1,$t1
126 xor $s2,$t2
127 xor $s3,$t3
128
129 sub $cnt,1
130 _xtr $i0,$s1,16-2
131.Loop_enc:
132 _xtr $i1,$s2,16-2
133 _xtr $i2,$s3,16-2
134 _xtr $i3,$s0,16-2
135 and $i0,0x3fc
136 and $i1,0x3fc
137 and $i2,0x3fc
138 and $i3,0x3fc
139 $PTR_ADD $i0,$Tbl
140 $PTR_ADD $i1,$Tbl
141 $PTR_ADD $i2,$Tbl
142 $PTR_ADD $i3,$Tbl
143 lwl $t0,3($i0) # Te1[s1>>16]
144 lwl $t1,3($i1) # Te1[s2>>16]
145 lwl $t2,3($i2) # Te1[s3>>16]
146 lwl $t3,3($i3) # Te1[s0>>16]
147 lwr $t0,2($i0) # Te1[s1>>16]
148 lwr $t1,2($i1) # Te1[s2>>16]
149 lwr $t2,2($i2) # Te1[s3>>16]
150 lwr $t3,2($i3) # Te1[s0>>16]
151
152 _xtr $i0,$s2,8-2
153 _xtr $i1,$s3,8-2
154 _xtr $i2,$s0,8-2
155 _xtr $i3,$s1,8-2
156 and $i0,0x3fc
157 and $i1,0x3fc
158 and $i2,0x3fc
159 and $i3,0x3fc
160 $PTR_ADD $i0,$Tbl
161 $PTR_ADD $i1,$Tbl
162 $PTR_ADD $i2,$Tbl
163 $PTR_ADD $i3,$Tbl
164 lwl $t4,2($i0) # Te2[s2>>8]
165 lwl $t5,2($i1) # Te2[s3>>8]
166 lwl $t6,2($i2) # Te2[s0>>8]
167 lwl $t7,2($i3) # Te2[s1>>8]
168 lwr $t4,1($i0) # Te2[s2>>8]
169 lwr $t5,1($i1) # Te2[s3>>8]
170 lwr $t6,1($i2) # Te2[s0>>8]
171 lwr $t7,1($i3) # Te2[s1>>8]
172
173 _xtr $i0,$s3,0-2
174 _xtr $i1,$s0,0-2
175 _xtr $i2,$s1,0-2
176 _xtr $i3,$s2,0-2
177 and $i0,0x3fc
178 and $i1,0x3fc
179 and $i2,0x3fc
180 and $i3,0x3fc
181 $PTR_ADD $i0,$Tbl
182 $PTR_ADD $i1,$Tbl
183 $PTR_ADD $i2,$Tbl
184 $PTR_ADD $i3,$Tbl
185 lwl $t8,1($i0) # Te3[s3]
186 lwl $t9,1($i1) # Te3[s0]
187 lwl $t10,1($i2) # Te3[s1]
188 lwl $t11,1($i3) # Te3[s2]
189 lwr $t8,0($i0) # Te3[s3]
190 lwr $t9,0($i1) # Te3[s0]
191 lwr $t10,0($i2) # Te3[s1]
192 lwr $t11,0($i3) # Te3[s2]
193
194 _xtr $i0,$s0,24-2
195 _xtr $i1,$s1,24-2
196 _xtr $i2,$s2,24-2
197 _xtr $i3,$s3,24-2
198 and $i0,0x3fc
199 and $i1,0x3fc
200 and $i2,0x3fc
201 and $i3,0x3fc
202 $PTR_ADD $i0,$Tbl
203 $PTR_ADD $i1,$Tbl
204 $PTR_ADD $i2,$Tbl
205 $PTR_ADD $i3,$Tbl
206 xor $t0,$t4
207 xor $t1,$t5
208 xor $t2,$t6
209 xor $t3,$t7
210 lw $t4,0($i0) # Te0[s0>>24]
211 lw $t5,0($i1) # Te0[s1>>24]
212 lw $t6,0($i2) # Te0[s2>>24]
213 lw $t7,0($i3) # Te0[s3>>24]
214
215 lw $s0,0($key0)
216 lw $s1,4($key0)
217 lw $s2,8($key0)
218 lw $s3,12($key0)
219
220 xor $t0,$t8
221 xor $t1,$t9
222 xor $t2,$t10
223 xor $t3,$t11
224
225 xor $t0,$t4
226 xor $t1,$t5
227 xor $t2,$t6
228 xor $t3,$t7
229
230 sub $cnt,1
231 $PTR_ADD $key0,16
232 xor $s0,$t0
233 xor $s1,$t1
234 xor $s2,$t2
235 xor $s3,$t3
236 .set noreorder
237 bnez $cnt,.Loop_enc
238 _xtr $i0,$s1,16-2
239
240 .set reorder
241 _xtr $i1,$s2,16-2
242 _xtr $i2,$s3,16-2
243 _xtr $i3,$s0,16-2
244 and $i0,0x3fc
245 and $i1,0x3fc
246 and $i2,0x3fc
247 and $i3,0x3fc
248 $PTR_ADD $i0,$Tbl
249 $PTR_ADD $i1,$Tbl
250 $PTR_ADD $i2,$Tbl
251 $PTR_ADD $i3,$Tbl
252 lbu $t0,2($i0) # Te4[s1>>16]
253 lbu $t1,2($i1) # Te4[s2>>16]
254 lbu $t2,2($i2) # Te4[s3>>16]
255 lbu $t3,2($i3) # Te4[s0>>16]
256
257 _xtr $i0,$s2,8-2
258 _xtr $i1,$s3,8-2
259 _xtr $i2,$s0,8-2
260 _xtr $i3,$s1,8-2
261 and $i0,0x3fc
262 and $i1,0x3fc
263 and $i2,0x3fc
264 and $i3,0x3fc
265 $PTR_ADD $i0,$Tbl
266 $PTR_ADD $i1,$Tbl
267 $PTR_ADD $i2,$Tbl
268 $PTR_ADD $i3,$Tbl
269 lbu $t4,2($i0) # Te4[s2>>8]
270 lbu $t5,2($i1) # Te4[s3>>8]
271 lbu $t6,2($i2) # Te4[s0>>8]
272 lbu $t7,2($i3) # Te4[s1>>8]
273
274 _xtr $i0,$s0,24-2
275 _xtr $i1,$s1,24-2
276 _xtr $i2,$s2,24-2
277 _xtr $i3,$s3,24-2
278 and $i0,0x3fc
279 and $i1,0x3fc
280 and $i2,0x3fc
281 and $i3,0x3fc
282 $PTR_ADD $i0,$Tbl
283 $PTR_ADD $i1,$Tbl
284 $PTR_ADD $i2,$Tbl
285 $PTR_ADD $i3,$Tbl
286 lbu $t8,2($i0) # Te4[s0>>24]
287 lbu $t9,2($i1) # Te4[s1>>24]
288 lbu $t10,2($i2) # Te4[s2>>24]
289 lbu $t11,2($i3) # Te4[s3>>24]
290
291 _xtr $i0,$s3,0-2
292 _xtr $i1,$s0,0-2
293 _xtr $i2,$s1,0-2
294 _xtr $i3,$s2,0-2
295 and $i0,0x3fc
296 and $i1,0x3fc
297 and $i2,0x3fc
298 and $i3,0x3fc
299
300 _ins $t0,16
301 _ins $t1,16
302 _ins $t2,16
303 _ins $t3,16
304
305 _ins $t4,8
306 _ins $t5,8
307 _ins $t6,8
308 _ins $t7,8
309
310 xor $t0,$t4
311 xor $t1,$t5
312 xor $t2,$t6
313 xor $t3,$t7
314
315 $PTR_ADD $i0,$Tbl
316 $PTR_ADD $i1,$Tbl
317 $PTR_ADD $i2,$Tbl
318 $PTR_ADD $i3,$Tbl
319 lbu $t4,2($i0) # Te4[s3]
320 lbu $t5,2($i1) # Te4[s0]
321 lbu $t6,2($i2) # Te4[s1]
322 lbu $t7,2($i3) # Te4[s2]
323
324 _ins $t8,24
325 _ins $t9,24
326 _ins $t10,24
327 _ins $t11,24
328
329 lw $s0,0($key0)
330 lw $s1,4($key0)
331 lw $s2,8($key0)
332 lw $s3,12($key0)
333
334 xor $t0,$t8
335 xor $t1,$t9
336 xor $t2,$t10
337 xor $t3,$t11
338
339 _ins $t4,0
340 _ins $t5,0
341 _ins $t6,0
342 _ins $t7,0
343
344 xor $t0,$t4
345 xor $t1,$t5
346 xor $t2,$t6
347 xor $t3,$t7
348
349 xor $s0,$t0
350 xor $s1,$t1
351 xor $s2,$t2
352 xor $s3,$t3
353
354 jr $ra
355.end _mips_AES_encrypt
356
357.align 5
358.globl AES_encrypt
359.ent AES_encrypt
360AES_encrypt:
361 .frame $sp,$FRAMESIZE,$ra
362 .mask $SAVED_REGS_MASK,-$SZREG
363 .set noreorder
364___
365$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
366 .cpload $pf
367___
368$code.=<<___;
369 $PTR_SUB $sp,$FRAMESIZE
370 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
371 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
372 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
373 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
374 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
375 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
376 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
377 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
378 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
379 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
380___
381$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
382 $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
383 $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
384 $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
385 $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
386 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
387___
388$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
389 .cplocal $Tbl
390 .cpsetup $pf,$zero,AES_encrypt
391___
392$code.=<<___;
393 .set reorder
394 $LA $Tbl,AES_Te # PIC-ified 'load address'
395
396 lwl $s0,0+$MSB($inp)
397 lwl $s1,4+$MSB($inp)
398 lwl $s2,8+$MSB($inp)
399 lwl $s3,12+$MSB($inp)
400 lwr $s0,0+$LSB($inp)
401 lwr $s1,4+$LSB($inp)
402 lwr $s2,8+$LSB($inp)
403 lwr $s3,12+$LSB($inp)
404
405 bal _mips_AES_encrypt
406
407 swr $s0,0+$LSB($out)
408 swr $s1,4+$LSB($out)
409 swr $s2,8+$LSB($out)
410 swr $s3,12+$LSB($out)
411 swl $s0,0+$MSB($out)
412 swl $s1,4+$MSB($out)
413 swl $s2,8+$MSB($out)
414 swl $s3,12+$MSB($out)
415
416 .set noreorder
417 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
418 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
419 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
420 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
421 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
422 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
423 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
424 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
425 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
426 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
427___
428$code.=<<___ if ($flavour =~ /nubi/i);
429 $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
430 $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
431 $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
432 $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
433 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
434___
435$code.=<<___;
436 jr $ra
437 $PTR_ADD $sp,$FRAMESIZE
438.end AES_encrypt
439___
440
441$code.=<<___;
442.align 5
443.ent _mips_AES_decrypt
444_mips_AES_decrypt:
445 .frame $sp,0,$ra
446 .set reorder
447 lw $t0,0($key)
448 lw $t1,4($key)
449 lw $t2,8($key)
450 lw $t3,12($key)
451 lw $cnt,240($key)
452 $PTR_ADD $key0,$key,16
453
454 xor $s0,$t0
455 xor $s1,$t1
456 xor $s2,$t2
457 xor $s3,$t3
458
459 sub $cnt,1
460 _xtr $i0,$s3,16-2
461.Loop_dec:
462 _xtr $i1,$s0,16-2
463 _xtr $i2,$s1,16-2
464 _xtr $i3,$s2,16-2
465 and $i0,0x3fc
466 and $i1,0x3fc
467 and $i2,0x3fc
468 and $i3,0x3fc
469 $PTR_ADD $i0,$Tbl
470 $PTR_ADD $i1,$Tbl
471 $PTR_ADD $i2,$Tbl
472 $PTR_ADD $i3,$Tbl
473 lwl $t0,3($i0) # Td1[s3>>16]
474 lwl $t1,3($i1) # Td1[s0>>16]
475 lwl $t2,3($i2) # Td1[s1>>16]
476 lwl $t3,3($i3) # Td1[s2>>16]
477 lwr $t0,2($i0) # Td1[s3>>16]
478 lwr $t1,2($i1) # Td1[s0>>16]
479 lwr $t2,2($i2) # Td1[s1>>16]
480 lwr $t3,2($i3) # Td1[s2>>16]
481
482 _xtr $i0,$s2,8-2
483 _xtr $i1,$s3,8-2
484 _xtr $i2,$s0,8-2
485 _xtr $i3,$s1,8-2
486 and $i0,0x3fc
487 and $i1,0x3fc
488 and $i2,0x3fc
489 and $i3,0x3fc
490 $PTR_ADD $i0,$Tbl
491 $PTR_ADD $i1,$Tbl
492 $PTR_ADD $i2,$Tbl
493 $PTR_ADD $i3,$Tbl
494 lwl $t4,2($i0) # Td2[s2>>8]
495 lwl $t5,2($i1) # Td2[s3>>8]
496 lwl $t6,2($i2) # Td2[s0>>8]
497 lwl $t7,2($i3) # Td2[s1>>8]
498 lwr $t4,1($i0) # Td2[s2>>8]
499 lwr $t5,1($i1) # Td2[s3>>8]
500 lwr $t6,1($i2) # Td2[s0>>8]
501 lwr $t7,1($i3) # Td2[s1>>8]
502
503 _xtr $i0,$s1,0-2
504 _xtr $i1,$s2,0-2
505 _xtr $i2,$s3,0-2
506 _xtr $i3,$s0,0-2
507 and $i0,0x3fc
508 and $i1,0x3fc
509 and $i2,0x3fc
510 and $i3,0x3fc
511 $PTR_ADD $i0,$Tbl
512 $PTR_ADD $i1,$Tbl
513 $PTR_ADD $i2,$Tbl
514 $PTR_ADD $i3,$Tbl
515 lwl $t8,1($i0) # Td3[s1]
516 lwl $t9,1($i1) # Td3[s2]
517 lwl $t10,1($i2) # Td3[s3]
518 lwl $t11,1($i3) # Td3[s0]
519 lwr $t8,0($i0) # Td3[s1]
520 lwr $t9,0($i1) # Td3[s2]
521 lwr $t10,0($i2) # Td3[s3]
522 lwr $t11,0($i3) # Td3[s0]
523
524 _xtr $i0,$s0,24-2
525 _xtr $i1,$s1,24-2
526 _xtr $i2,$s2,24-2
527 _xtr $i3,$s3,24-2
528 and $i0,0x3fc
529 and $i1,0x3fc
530 and $i2,0x3fc
531 and $i3,0x3fc
532 $PTR_ADD $i0,$Tbl
533 $PTR_ADD $i1,$Tbl
534 $PTR_ADD $i2,$Tbl
535 $PTR_ADD $i3,$Tbl
536
537 xor $t0,$t4
538 xor $t1,$t5
539 xor $t2,$t6
540 xor $t3,$t7
541
542
543 lw $t4,0($i0) # Td0[s0>>24]
544 lw $t5,0($i1) # Td0[s1>>24]
545 lw $t6,0($i2) # Td0[s2>>24]
546 lw $t7,0($i3) # Td0[s3>>24]
547
548 lw $s0,0($key0)
549 lw $s1,4($key0)
550 lw $s2,8($key0)
551 lw $s3,12($key0)
552
553 xor $t0,$t8
554 xor $t1,$t9
555 xor $t2,$t10
556 xor $t3,$t11
557
558 xor $t0,$t4
559 xor $t1,$t5
560 xor $t2,$t6
561 xor $t3,$t7
562
563 sub $cnt,1
564 $PTR_ADD $key0,16
565 xor $s0,$t0
566 xor $s1,$t1
567 xor $s2,$t2
568 xor $s3,$t3
569 .set noreorder
570 bnez $cnt,.Loop_dec
571 _xtr $i0,$s3,16-2
572
573 .set reorder
574 lw $t4,1024($Tbl) # prefetch Td4
575 lw $t5,1024+32($Tbl)
576 lw $t6,1024+64($Tbl)
577 lw $t7,1024+96($Tbl)
578 lw $t8,1024+128($Tbl)
579 lw $t9,1024+160($Tbl)
580 lw $t10,1024+192($Tbl)
581 lw $t11,1024+224($Tbl)
582
583 _xtr $i0,$s3,16
584 _xtr $i1,$s0,16
585 _xtr $i2,$s1,16
586 _xtr $i3,$s2,16
587 and $i0,0xff
588 and $i1,0xff
589 and $i2,0xff
590 and $i3,0xff
591 $PTR_ADD $i0,$Tbl
592 $PTR_ADD $i1,$Tbl
593 $PTR_ADD $i2,$Tbl
594 $PTR_ADD $i3,$Tbl
595 lbu $t0,1024($i0) # Td4[s3>>16]
596 lbu $t1,1024($i1) # Td4[s0>>16]
597 lbu $t2,1024($i2) # Td4[s1>>16]
598 lbu $t3,1024($i3) # Td4[s2>>16]
599
600 _xtr $i0,$s2,8
601 _xtr $i1,$s3,8
602 _xtr $i2,$s0,8
603 _xtr $i3,$s1,8
604 and $i0,0xff
605 and $i1,0xff
606 and $i2,0xff
607 and $i3,0xff
608 $PTR_ADD $i0,$Tbl
609 $PTR_ADD $i1,$Tbl
610 $PTR_ADD $i2,$Tbl
611 $PTR_ADD $i3,$Tbl
612 lbu $t4,1024($i0) # Td4[s2>>8]
613 lbu $t5,1024($i1) # Td4[s3>>8]
614 lbu $t6,1024($i2) # Td4[s0>>8]
615 lbu $t7,1024($i3) # Td4[s1>>8]
616
617 _xtr $i0,$s0,24
618 _xtr $i1,$s1,24
619 _xtr $i2,$s2,24
620 _xtr $i3,$s3,24
621 $PTR_ADD $i0,$Tbl
622 $PTR_ADD $i1,$Tbl
623 $PTR_ADD $i2,$Tbl
624 $PTR_ADD $i3,$Tbl
625 lbu $t8,1024($i0) # Td4[s0>>24]
626 lbu $t9,1024($i1) # Td4[s1>>24]
627 lbu $t10,1024($i2) # Td4[s2>>24]
628 lbu $t11,1024($i3) # Td4[s3>>24]
629
630 _xtr $i0,$s1,0
631 _xtr $i1,$s2,0
632 _xtr $i2,$s3,0
633 _xtr $i3,$s0,0
634
635 _ins $t0,16
636 _ins $t1,16
637 _ins $t2,16
638 _ins $t3,16
639
640 _ins $t4,8
641 _ins $t5,8
642 _ins $t6,8
643 _ins $t7,8
644
645 xor $t0,$t4
646 xor $t1,$t5
647 xor $t2,$t6
648 xor $t3,$t7
649
650 $PTR_ADD $i0,$Tbl
651 $PTR_ADD $i1,$Tbl
652 $PTR_ADD $i2,$Tbl
653 $PTR_ADD $i3,$Tbl
654 lbu $t4,1024($i0) # Td4[s1]
655 lbu $t5,1024($i1) # Td4[s2]
656 lbu $t6,1024($i2) # Td4[s3]
657 lbu $t7,1024($i3) # Td4[s0]
658
659 _ins $t8,24
660 _ins $t9,24
661 _ins $t10,24
662 _ins $t11,24
663
664 lw $s0,0($key0)
665 lw $s1,4($key0)
666 lw $s2,8($key0)
667 lw $s3,12($key0)
668
669 _ins $t4,0
670 _ins $t5,0
671 _ins $t6,0
672 _ins $t7,0
673
674
675 xor $t0,$t8
676 xor $t1,$t9
677 xor $t2,$t10
678 xor $t3,$t11
679
680 xor $t0,$t4
681 xor $t1,$t5
682 xor $t2,$t6
683 xor $t3,$t7
684
685 xor $s0,$t0
686 xor $s1,$t1
687 xor $s2,$t2
688 xor $s3,$t3
689
690 jr $ra
691.end _mips_AES_decrypt
692
693.align 5
694.globl AES_decrypt
695.ent AES_decrypt
696AES_decrypt:
697 .frame $sp,$FRAMESIZE,$ra
698 .mask $SAVED_REGS_MASK,-$SZREG
699 .set noreorder
700___
701$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
702 .cpload $pf
703___
704$code.=<<___;
705 $PTR_SUB $sp,$FRAMESIZE
706 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
707 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
708 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
709 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
710 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
711 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
712 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
713 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
714 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
715 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
716___
717$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
718 $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
719 $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
720 $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
721 $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
722 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
723___
724$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
725 .cplocal $Tbl
726 .cpsetup $pf,$zero,AES_decrypt
727___
728$code.=<<___;
729 .set reorder
730 $LA $Tbl,AES_Td # PIC-ified 'load address'
731
732 lwl $s0,0+$MSB($inp)
733 lwl $s1,4+$MSB($inp)
734 lwl $s2,8+$MSB($inp)
735 lwl $s3,12+$MSB($inp)
736 lwr $s0,0+$LSB($inp)
737 lwr $s1,4+$LSB($inp)
738 lwr $s2,8+$LSB($inp)
739 lwr $s3,12+$LSB($inp)
740
741 bal _mips_AES_decrypt
742
743 swr $s0,0+$LSB($out)
744 swr $s1,4+$LSB($out)
745 swr $s2,8+$LSB($out)
746 swr $s3,12+$LSB($out)
747 swl $s0,0+$MSB($out)
748 swl $s1,4+$MSB($out)
749 swl $s2,8+$MSB($out)
750 swl $s3,12+$MSB($out)
751
752 .set noreorder
753 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
754 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
755 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
756 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
757 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
758 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
759 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
760 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
761 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
762 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
763___
764$code.=<<___ if ($flavour =~ /nubi/i);
765 $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
766 $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
767 $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
768 $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
769 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
770___
771$code.=<<___;
772 jr $ra
773 $PTR_ADD $sp,$FRAMESIZE
774.end AES_decrypt
775___
776}}}
777
778{{{
779my $FRAMESIZE=8*$SZREG;
780my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000;
781
782my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);
783my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
784my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
785my ($rcon,$cnt)=($gp,$fp);
786
787$code.=<<___;
788.align 5
789.ent _mips_AES_set_encrypt_key
790_mips_AES_set_encrypt_key:
791 .frame $sp,0,$ra
792 .set noreorder
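	# bail out with -1 in $t0 if either pointer argument is NULL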
793 beqz $inp,.Lekey_done
794 li $t0,-1
795 beqz $key,.Lekey_done
796 $PTR_ADD $rcon,$Tbl,1024+256
797
798 .set reorder
799 lwl $rk0,0+$MSB($inp) # load 128 bits
800 lwl $rk1,4+$MSB($inp)
801 lwl $rk2,8+$MSB($inp)
802 lwl $rk3,12+$MSB($inp)
803 li $at,128
804 lwr $rk0,0+$LSB($inp)
805 lwr $rk1,4+$LSB($inp)
806 lwr $rk2,8+$LSB($inp)
807 lwr $rk3,12+$LSB($inp)
808 .set noreorder
809 beq $bits,$at,.L128bits
810 li $cnt,10
811
812 .set reorder
813 lwl $rk4,16+$MSB($inp) # load 192 bits
814 lwl $rk5,20+$MSB($inp)
815 li $at,192
816 lwr $rk4,16+$LSB($inp)
817 lwr $rk5,20+$LSB($inp)
818 .set noreorder
819 beq $bits,$at,.L192bits
820 li $cnt,8
821
822 .set reorder
823 lwl $rk6,24+$MSB($inp) # load 256 bits
824 lwl $rk7,28+$MSB($inp)
825 li $at,256
826 lwr $rk6,24+$LSB($inp)
827 lwr $rk7,28+$LSB($inp)
828 .set noreorder
829 beq $bits,$at,.L256bits
830 li $cnt,7
831
832 b .Lekey_done
833 li $t0,-2
834
835.align 4
836.L128bits:
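	# each iteration: rk0 ^= SubWord(RotWord(rk3)) ^ rcon, then rk1..rk3 chain on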
837 .set reorder
838 srl $i0,$rk3,16
839 srl $i1,$rk3,8
840 and $i0,0xff
841 and $i1,0xff
842 and $i2,$rk3,0xff
843 srl $i3,$rk3,24
844 $PTR_ADD $i0,$Tbl
845 $PTR_ADD $i1,$Tbl
846 $PTR_ADD $i2,$Tbl
847 $PTR_ADD $i3,$Tbl
848 lbu $i0,1024($i0)
849 lbu $i1,1024($i1)
850 lbu $i2,1024($i2)
851 lbu $i3,1024($i3)
852
853 sw $rk0,0($key)
854 sw $rk1,4($key)
855 sw $rk2,8($key)
856 sw $rk3,12($key)
857 sub $cnt,1
858 $PTR_ADD $key,16
859
860 _bias $i0,24
861 _bias $i1,16
862 _bias $i2,8
863 _bias $i3,0
864
865 xor $rk0,$i0
866 lw $i0,0($rcon)
867 xor $rk0,$i1
868 xor $rk0,$i2
869 xor $rk0,$i3
870 xor $rk0,$i0
871
872 xor $rk1,$rk0
873 xor $rk2,$rk1
874 xor $rk3,$rk2
875
876 .set noreorder
877 bnez $cnt,.L128bits
878 $PTR_ADD $rcon,4
879
880 sw $rk0,0($key)
881 sw $rk1,4($key)
882 sw $rk2,8($key)
883 li $cnt,10
884 sw $rk3,12($key)
885 li $t0,0
886 sw $cnt,80($key)
887 b .Lekey_done
888 $PTR_SUB $key,10*16
889
890.align 4
891.L192bits:
892 .set reorder
893 srl $i0,$rk5,16
894 srl $i1,$rk5,8
895 and $i0,0xff
896 and $i1,0xff
897 and $i2,$rk5,0xff
898 srl $i3,$rk5,24
899 $PTR_ADD $i0,$Tbl
900 $PTR_ADD $i1,$Tbl
901 $PTR_ADD $i2,$Tbl
902 $PTR_ADD $i3,$Tbl
903 lbu $i0,1024($i0)
904 lbu $i1,1024($i1)
905 lbu $i2,1024($i2)
906 lbu $i3,1024($i3)
907
908 sw $rk0,0($key)
909 sw $rk1,4($key)
910 sw $rk2,8($key)
911 sw $rk3,12($key)
912 sw $rk4,16($key)
913 sw $rk5,20($key)
914 sub $cnt,1
915 $PTR_ADD $key,24
916
917 _bias $i0,24
918 _bias $i1,16
919 _bias $i2,8
920 _bias $i3,0
921
922 xor $rk0,$i0
923 lw $i0,0($rcon)
924 xor $rk0,$i1
925 xor $rk0,$i2
926 xor $rk0,$i3
927 xor $rk0,$i0
928
929 xor $rk1,$rk0
930 xor $rk2,$rk1
931 xor $rk3,$rk2
932 xor $rk4,$rk3
933 xor $rk5,$rk4
934
935 .set noreorder
936 bnez $cnt,.L192bits
937 $PTR_ADD $rcon,4
938
939 sw $rk0,0($key)
940 sw $rk1,4($key)
941 sw $rk2,8($key)
942 li $cnt,12
943 sw $rk3,12($key)
944 li $t0,0
945 sw $cnt,48($key)
946 b .Lekey_done
947 $PTR_SUB $key,12*16
948
949.align 4
950.L256bits:
951 .set reorder
952 srl $i0,$rk7,16
953 srl $i1,$rk7,8
954 and $i0,0xff
955 and $i1,0xff
956 and $i2,$rk7,0xff
957 srl $i3,$rk7,24
958 $PTR_ADD $i0,$Tbl
959 $PTR_ADD $i1,$Tbl
960 $PTR_ADD $i2,$Tbl
961 $PTR_ADD $i3,$Tbl
962 lbu $i0,1024($i0)
963 lbu $i1,1024($i1)
964 lbu $i2,1024($i2)
965 lbu $i3,1024($i3)
966
967 sw $rk0,0($key)
968 sw $rk1,4($key)
969 sw $rk2,8($key)
970 sw $rk3,12($key)
971 sw $rk4,16($key)
972 sw $rk5,20($key)
973 sw $rk6,24($key)
974 sw $rk7,28($key)
975 sub $cnt,1
976
977 _bias $i0,24
978 _bias $i1,16
979 _bias $i2,8
980 _bias $i3,0
981
982 xor $rk0,$i0
983 lw $i0,0($rcon)
984 xor $rk0,$i1
985 xor $rk0,$i2
986 xor $rk0,$i3
987 xor $rk0,$i0
988
989 xor $rk1,$rk0
990 xor $rk2,$rk1
991 xor $rk3,$rk2
992 beqz $cnt,.L256bits_done
993
994 srl $i0,$rk3,24
995 srl $i1,$rk3,16
996 srl $i2,$rk3,8
997 and $i3,$rk3,0xff
998 and $i1,0xff
999 and $i2,0xff
1000 $PTR_ADD $i0,$Tbl
1001 $PTR_ADD $i1,$Tbl
1002 $PTR_ADD $i2,$Tbl
1003 $PTR_ADD $i3,$Tbl
1004 lbu $i0,1024($i0)
1005 lbu $i1,1024($i1)
1006 lbu $i2,1024($i2)
1007 lbu $i3,1024($i3)
1008 sll $i0,24
1009 sll $i1,16
1010 sll $i2,8
1011
1012 xor $rk4,$i0
1013 xor $rk4,$i1
1014 xor $rk4,$i2
1015 xor $rk4,$i3
1016
1017 xor $rk5,$rk4
1018 xor $rk6,$rk5
1019 xor $rk7,$rk6
1020
1021 $PTR_ADD $key,32
1022 .set noreorder
1023 b .L256bits
1024 $PTR_ADD $rcon,4
1025
1026.L256bits_done:
1027 sw $rk0,32($key)
1028 sw $rk1,36($key)
1029 sw $rk2,40($key)
1030 li $cnt,14
1031 sw $rk3,44($key)
1032 li $t0,0
1033 sw $cnt,48($key)
1034 $PTR_SUB $key,12*16
1035
1036.Lekey_done:
1037 jr $ra
1038 nop
1039.end _mips_AES_set_encrypt_key
1040
1041.globl AES_set_encrypt_key
1042.ent AES_set_encrypt_key
1043AES_set_encrypt_key:
1044 .frame $sp,$FRAMESIZE,$ra
1045 .mask $SAVED_REGS_MASK,-$SZREG
1046 .set noreorder
1047___
1048$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
1049 .cpload $pf
1050___
1051$code.=<<___;
1052 $PTR_SUB $sp,$FRAMESIZE
1053 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
1054 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
1055___
1056$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1057 $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
1058 $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
1059 $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
1060 $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
1061 $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
1062___
1063$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
1064 .cplocal $Tbl
1065 .cpsetup $pf,$zero,AES_set_encrypt_key
1066___
1067$code.=<<___;
1068 .set reorder
1069 $LA $Tbl,AES_Te # PIC-ified 'load address'
1070
1071 bal _mips_AES_set_encrypt_key
1072
1073 .set noreorder
1074 move $a0,$t0
1075 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
1076 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
1077___
1078$code.=<<___ if ($flavour =~ /nubi/i);
1079 $REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
1080 $REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
1081 $REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
1082 $REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
1083 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
1084___
1085$code.=<<___;
1086 jr $ra
1087 $PTR_ADD $sp,$FRAMESIZE
1088.end AES_set_encrypt_key
1089___
1090
1091my ($head,$tail)=($inp,$bits);
1092my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
1093my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
1094$code.=<<___;
1095.align 5
1096.globl AES_set_decrypt_key
1097.ent AES_set_decrypt_key
1098AES_set_decrypt_key:
1099 .frame $sp,$FRAMESIZE,$ra
1100 .mask $SAVED_REGS_MASK,-$SZREG
1101 .set noreorder
1102___
1103$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
1104 .cpload $pf
1105___
1106$code.=<<___;
1107 $PTR_SUB $sp,$FRAMESIZE
1108 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
1109 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
1110___
1111$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1112 $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
1113 $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
1114 $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
1115 $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
1116 $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
1117___
1118$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
1119 .cplocal $Tbl
1120 .cpsetup $pf,$zero,AES_set_decrypt_key
1121___
1122$code.=<<___;
1123 .set reorder
1124 $LA $Tbl,AES_Te # PIC-ified 'load address'
1125
1126 bal _mips_AES_set_encrypt_key
1127
1128 bltz $t0,.Ldkey_done
1129
1130 sll $at,$cnt,4
1131 $PTR_ADD $head,$key,0
1132 $PTR_ADD $tail,$key,$at
1133.align 4
1134.Lswap:
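	# reverse the order of the round keys; decryption then reads them forward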
1135 lw $rk0,0($head)
1136 lw $rk1,4($head)
1137 lw $rk2,8($head)
1138 lw $rk3,12($head)
1139 lw $rk4,0($tail)
1140 lw $rk5,4($tail)
1141 lw $rk6,8($tail)
1142 lw $rk7,12($tail)
1143 sw $rk0,0($tail)
1144 sw $rk1,4($tail)
1145 sw $rk2,8($tail)
1146 sw $rk3,12($tail)
1147 $PTR_ADD $head,16
1148 $PTR_SUB $tail,16
1149 sw $rk4,-16($head)
1150 sw $rk5,-12($head)
1151 sw $rk6,-8($head)
1152 sw $rk7,-4($head)
1153 bne $head,$tail,.Lswap
1154
1155 lw $tp1,16($key) # modulo-scheduled
1156 lui $x80808080,0x8080
1157 sub $cnt,1
1158 or $x80808080,0x8080
1159 sll $cnt,2
1160 $PTR_ADD $key,16
1161 lui $x1b1b1b1b,0x1b1b
1162 nor $x7f7f7f7f,$zero,$x80808080
1163 or $x1b1b1b1b,0x1b1b
1164.align 4
1165.Lmix:
1166 and $m,$tp1,$x80808080
1167 and $tp2,$tp1,$x7f7f7f7f
1168 srl $tp4,$m,7
1169 addu $tp2,$tp2 # tp2<<1
1170 subu $m,$tp4
1171 and $m,$x1b1b1b1b
1172 xor $tp2,$m
1173
1174 and $m,$tp2,$x80808080
1175 and $tp4,$tp2,$x7f7f7f7f
1176 srl $tp8,$m,7
1177 addu $tp4,$tp4 # tp4<<1
1178 subu $m,$tp8
1179 and $m,$x1b1b1b1b
1180 xor $tp4,$m
1181
1182 and $m,$tp4,$x80808080
1183 and $tp8,$tp4,$x7f7f7f7f
1184 srl $tp9,$m,7
1185 addu $tp8,$tp8 # tp8<<1
1186 subu $m,$tp9
1187 and $m,$x1b1b1b1b
1188 xor $tp8,$m
1189
1190 xor $tp9,$tp8,$tp1
1191 xor $tpe,$tp8,$tp4
1192 xor $tpb,$tp9,$tp2
1193 xor $tpd,$tp9,$tp4
1194
1195 _ror $tp1,$tpd,16
1196 xor $tpe,$tp2
1197 _ror $tp2,$tpd,-16
1198 xor $tpe,$tp1
1199 _ror $tp1,$tp9,8
1200 xor $tpe,$tp2
1201 _ror $tp2,$tp9,-24
1202 xor $tpe,$tp1
1203 _ror $tp1,$tpb,24
1204 xor $tpe,$tp2
1205 _ror $tp2,$tpb,-8
1206 xor $tpe,$tp1
1207 lw $tp1,4($key) # modulo-scheduled
1208 xor $tpe,$tp2
1209 sub $cnt,1
1210 sw $tpe,0($key)
1211 $PTR_ADD $key,4
1212 bnez $cnt,.Lmix
1213
1214 li $t0,0
1215.Ldkey_done:
1216 .set noreorder
1217 move $a0,$t0
1218 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
1219 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
1220___
1221$code.=<<___ if ($flavour =~ /nubi/i);
1222 $REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
1223 $REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
1224 $REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
1225 $REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
1226 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
1227___
1228$code.=<<___;
1229 jr $ra
1230 $PTR_ADD $sp,$FRAMESIZE
1231.end AES_set_decrypt_key
1232___
1233}}}
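# The .Lmix loop above applies InvMixColumns to each round-key word using
# packed GF(2^8) arithmetic on all four bytes at once. A minimal sketch of
# its doubling step in plain Perl (_xtime32 is an illustrative name, not
# part of the module):
sub _xtime32 {
	my $w = shift;
	my $m = $w & 0x80808080;		# high bit of every byte
	my $t = ($w & 0x7f7f7f7f) << 1;		# double the low seven bits
	$m = ($m - ($m >> 7)) & 0x1b1b1b1b;	# each 0x80 becomes 0x1b
	return $t ^ $m;				# reduce mod x^8+x^4+x^3+x+1
}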
1234
1235######################################################################
1236# Tables are kept in an endian-neutral manner
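# A quick illustration (hypothetical values, nothing is emitted here): the
# same byte image read as a big- vs little-endian word differs only by byte
# reversal, which the _xtr/_ins rewrites below compensate for.
my $te0_bytes = pack("C4", 0xc6, 0x63, 0x63, 0xa5);	# first Te0 entry
my $te0_be = unpack("N", $te0_bytes);			# 0xc66363a5
my $te0_le = unpack("V", $te0_bytes);			# 0xa56363c6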
1237$code.=<<___;
1238.rdata
1239.align 6
1240AES_Te:
1241.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0
1242.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
1243.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
1244.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
1245.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
1246.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
1247.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
1248.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
1249.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
1250.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
1251.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
1252.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
1253.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
1254.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
1255.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
1256.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
1257.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
1258.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
1259.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
1260.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
1261.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
1262.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
1263.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
1264.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
1265.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
1266.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
1267.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
1268.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
1269.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
1270.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
1271.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
1272.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
1273.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e
1274.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e
1275.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2
1276.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb
1277.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d
1278.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce
1279.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e
1280.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97
1281.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68
1282.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c
1283.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f
1284.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed
1285.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46
1286.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b
1287.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4
1288.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a
1289.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a
1290.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16
1291.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7
1292.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94
1293.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10
1294.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81
1295.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44
1296.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3
1297.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe
1298.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a
1299.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc
1300.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04
1301.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1
1302.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63
1303.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a
1304.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d
1305.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14
1306.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f
1307.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2
1308.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39
1309.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2
1310.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47
1311.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7
1312.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95
1313.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98
1314.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f
1315.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e
1316.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83
1317.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29
1318.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c
1319.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2
1320.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76
1321.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56
1322.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e
1323.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a
1324.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4
1325.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e
1326.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6
1327.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4
1328.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b
1329.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43
1330.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7
1331.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64
1332.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0
1333.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa
1334.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25
1335.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e
1336.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18
1337.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88
1338.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72
1339.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1
1340.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51
1341.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c
1342.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21
1343.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc
1344.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85
1345.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42
1346.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa
1347.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05
1348.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12
1349.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f
1350.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0
1351.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58
1352.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9
1353.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13
1354.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33
1355.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70
1356.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7
1357.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22
1358.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20
1359.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff
1360.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a
1361.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8
1362.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17
1363.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31
1364.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8
1365.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0
1366.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11
1367.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
1368.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
1369
1370.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4
1371.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
1372.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
1373.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
1374.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
1375.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
1376.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
1377.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
1378.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
1379.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
1380.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
1381.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
1382.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
1383.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
1384.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
1385.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
1386.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
1387.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
1388.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
1389.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
1390.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
1391.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
1392.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
1393.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
1394.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
1395.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
1396.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
1397.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
1398.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
1399.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
1400.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
1401.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
1402
1403.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon
1404.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
1405.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
1406.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
1407.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
1408
1409.align 6
1410AES_Td:
1411.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0
1412.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
1413.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1
1414.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93
1415.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6
1416.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25
1417.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7
1418.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f
1419.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67
1420.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1
1421.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12
1422.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6
1423.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95
1424.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda
1425.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3
1426.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44
1427.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78
1428.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd
1429.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17
1430.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4
1431.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82
1432.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45
1433.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84
1434.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94
1435.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19
1436.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7
1437.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2
1438.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a
1439.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03
1440.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5
1441.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2
1442.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c
1443.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92
1444.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1
1445.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5
1446.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a
1447.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0
1448.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75
1449.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa
1450.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51
1451.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d
1452.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46
1453.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05
1454.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff
1455.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97
1456.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77
1457.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88
1458.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb
1459.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9
1460.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00
1461.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48
1462.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e
1463.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56
1464.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27
1465.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21
1466.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a
1467.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f
1468.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e
1469.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2
1470.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16
1471.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5
1472.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d
1473.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad
1474.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8
1475.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c
1476.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd
1477.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc
1478.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34
1479.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc
1480.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63
1481.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10
1482.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20
1483.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8
1484.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d
1485.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3
1486.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0
1487.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99
1488.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22
1489.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a
1490.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef
1491.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1
1492.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36
1493.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28
1494.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4
1495.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d
1496.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62
1497.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8
1498.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5
1499.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c
1500.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3
1501.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7
1502.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b
1503.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4
1504.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8
1505.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e
1506.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6
1507.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce
1508.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6
1509.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31
1510.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0
1511.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6
1512.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15
1513.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7
1514.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f
1515.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d
1516.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf
1517.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b
1518.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f
1519.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d
1520.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e
1521.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52
1522.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13
1523.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a
1524.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89
1525.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35
1526.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c
1527.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f
1528.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf
1529.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b
1530.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86
1531.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e
1532.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f
1533.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c
1534.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41
1535.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde
1536.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90
1537.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70
1538.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42
1539
1540.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4
1541.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
1542.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
1543.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
1544.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
1545.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
1546.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
1547.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
1548.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
1549.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
1550.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
1551.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
1552.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
1553.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
1554.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
1555.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
1556.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
1557.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
1558.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
1559.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
1560.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
1561.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
1562.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
1563.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
1564.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
1565.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
1566.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1567.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1568.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1569.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1570.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1571.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1572___
1573
1574foreach (split("\n",$code)) {
1575 s/\`([^\`]*)\`/eval $1/ge;
1576
1577	# The made-up instructions _xtr, _ins, _ror and _bias cope
1578	# with byte order dependencies; see the worked example below.
1579 if (/^\s+_/) {
1580 s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/;
1581
1582 s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/
1583 sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
1584 : eval("24-$3"))/e or
1585 s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
1586 sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
1587 : eval("24-$3"))/e or
1588 s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/
1589 sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
1590 : eval("$3*-1"))/e or
1591 s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
1592 sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
1593 : eval("($3-16)&31"))/e;
1594
1595 s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/
1596 sprintf("sll\t$1,$2,$3")/e or
1597 s/srl\s+(\$[0-9]+),(\$[0-9]+),0/
1598 sprintf("and\t$1,$2,0xff")/e or
1599 s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/;
1600 }
1601
1602 # convert lwl/lwr and swr/swl to little-endian order
1603 if (!$big_endian && /^\s+[sl]w[lr]\s+/) {
1604 s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/
1605 sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or
1606 s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/
1607 sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e;
1608 }
1609
1610 print $_,"\n";
1611}
1612
1613close STDOUT;
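# Worked example of the rewrites above (illustrative only; it prints to
# STDERR since STDOUT is closed at this point): how _xtr expands on each
# byte order.
foreach my $be (1, 0) {
	my $t = "\t_xtr\t\$8,\$9,24-2";
	$t =~ s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/
		sprintf("srl\t$1,$2,%d",$be ? eval($3) : eval("24-$3"))/e;
	$t =~ s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/
		sprintf("sll\t$1,$2,$3")/e;
	print STDERR "$t\n";	# big-endian: srl $8,$9,22 / little: sll $8,$9,2
}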
diff --git a/src/lib/libcrypto/aes/asm/aes-parisc.pl b/src/lib/libcrypto/aes/asm/aes-parisc.pl
deleted file mode 100644
index f12a1c18ec..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-parisc.pl
+++ /dev/null
@@ -1,1028 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for PA-RISC.
11#
12# June 2009.
13#
14# The module is a mechanical transliteration of aes-sparcv9.pl, but with
15# a twist: the S-boxes are compressed even further, down to 1K+256B. On
16# PA-7100LC performance is ~40% better than gcc 3.2 generated code, at
17# about 33 cycles per byte processed with a 128-bit key; newer CPUs
18# manage 16 cycles per byte. It's not faster than code generated by the
19# vendor compiler, but recall that it uses compressed S-boxes, which
20# require extra processing.
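# (The 1K+256B figure: a single 1KB table of 256 32-bit Te0 words, whose
# rotated variants are derived at run time, plus a 256-byte S-box for the
# last round: 256*4 + 256 = 1280 bytes in all.)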
21#
22# Special thanks to polarhome.com for providing an HP-UX account.
23
24$flavour = shift;
25$output = shift;
26open STDOUT,">$output";
27
28if ($flavour =~ /64/) {
29 $LEVEL ="2.0W";
30 $SIZE_T =8;
31 $FRAME_MARKER =80;
32 $SAVED_RP =16;
33 $PUSH ="std";
34 $PUSHMA ="std,ma";
35 $POP ="ldd";
36 $POPMB ="ldd,mb";
37} else {
38 $LEVEL ="1.0";
39 $SIZE_T =4;
40 $FRAME_MARKER =48;
41 $SAVED_RP =20;
42 $PUSH ="stw";
43 $PUSHMA ="stwm";
44 $POP ="ldw";
45 $POPMB ="ldwm";
46}
47
48$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
49 # [+ argument transfer]
50$inp="%r26"; # arg0
51$out="%r25"; # arg1
52$key="%r24"; # arg2
53
54($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4");
55($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8");
56
57($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7,
58 $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) =
59("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16",
60"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26");
61
62$tbl="%r28";
63$rounds="%r29";
64
65$code=<<___;
66 .LEVEL $LEVEL
67#if 0
68 .SPACE \$TEXT\$
69 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
70#else
71 .text
72#endif
73
74 .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
75 .ALIGN 64
76AES_encrypt
77 .PROC
78 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
79 .ENTRY
80 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
81 $PUSHMA %r3,$FRAME(%sp)
82 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
83 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
84 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
85 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
86 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
87 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
88 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
89 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
90 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
91 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
92 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
93 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
94 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
95 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
96 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
97
98 blr %r0,$tbl
99 ldi 3,$t0
100L\$enc_pic
101 andcm $tbl,$t0,$tbl
102 ldo L\$AES_Te-L\$enc_pic($tbl),$tbl
103
104 and $inp,$t0,$t0
105 sub $inp,$t0,$inp
106 ldw 0($inp),$s0
107 ldw 4($inp),$s1
108 ldw 8($inp),$s2
109 comib,= 0,$t0,L\$enc_inp_aligned
110 ldw 12($inp),$s3
111
112 sh3addl $t0,%r0,$t0
113 subi 32,$t0,$t0
114 mtctl $t0,%cr11
115 ldw 16($inp),$t1
116 vshd $s0,$s1,$s0
117 vshd $s1,$s2,$s1
118 vshd $s2,$s3,$s2
119 vshd $s3,$t1,$s3
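	; unaligned input: vshd funnel-shifts the five words loaded from the
	; rounded-down address back into four aligned state words (via %cr11)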
120
121L\$enc_inp_aligned
122 bl _parisc_AES_encrypt,%r31
123 nop
124
125 extru,<> $out,31,2,%r0
126 b L\$enc_out_aligned
127 nop
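	; if the two low output address bits are non-zero the branch above is
	; nullified and the result is stored byte by byte below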
128
129 _srm $s0,24,$acc0
130 _srm $s0,16,$acc1
131 stb $acc0,0($out)
132 _srm $s0,8,$acc2
133 stb $acc1,1($out)
134 _srm $s1,24,$acc4
135 stb $acc2,2($out)
136 _srm $s1,16,$acc5
137 stb $s0,3($out)
138 _srm $s1,8,$acc6
139 stb $acc4,4($out)
140 _srm $s2,24,$acc0
141 stb $acc5,5($out)
142 _srm $s2,16,$acc1
143 stb $acc6,6($out)
144 _srm $s2,8,$acc2
145 stb $s1,7($out)
146 _srm $s3,24,$acc4
147 stb $acc0,8($out)
148 _srm $s3,16,$acc5
149 stb $acc1,9($out)
150 _srm $s3,8,$acc6
151 stb $acc2,10($out)
152 stb $s2,11($out)
153 stb $acc4,12($out)
154 stb $acc5,13($out)
155 stb $acc6,14($out)
156 b L\$enc_done
157 stb $s3,15($out)
158
159L\$enc_out_aligned
160 stw $s0,0($out)
161 stw $s1,4($out)
162 stw $s2,8($out)
163 stw $s3,12($out)
164
165L\$enc_done
166 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
167 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
168 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
169 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
170 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
171 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
172 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
173 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
174 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
175 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
176 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
177 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
178 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
179 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
180 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
181 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
182 bv (%r2)
183 .EXIT
184 $POPMB -$FRAME(%sp),%r3
185 .PROCEND
186
187 .ALIGN 16
188_parisc_AES_encrypt
189 .PROC
190 .CALLINFO MILLICODE
191 .ENTRY
192 ldw 240($key),$rounds
193 ldw 0($key),$t0
194 ldw 4($key),$t1
195 ldw 8($key),$t2
196 _srm $rounds,1,$rounds
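	; halve the round count: the loop below does two rounds per iteration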
197 xor $t0,$s0,$s0
198 ldw 12($key),$t3
199 _srm $s0,24,$acc0
200 xor $t1,$s1,$s1
201 ldw 16($key),$t0
202 _srm $s1,16,$acc1
203 xor $t2,$s2,$s2
204 ldw 20($key),$t1
205 xor $t3,$s3,$s3
206 ldw 24($key),$t2
207 ldw 28($key),$t3
208L\$enc_loop
209 _srm $s2,8,$acc2
210 ldwx,s $acc0($tbl),$acc0
211 _srm $s3,0,$acc3
212 ldwx,s $acc1($tbl),$acc1
213 _srm $s1,24,$acc4
214 ldwx,s $acc2($tbl),$acc2
215 _srm $s2,16,$acc5
216 ldwx,s $acc3($tbl),$acc3
217 _srm $s3,8,$acc6
218 ldwx,s $acc4($tbl),$acc4
219 _srm $s0,0,$acc7
220 ldwx,s $acc5($tbl),$acc5
221 _srm $s2,24,$acc8
222 ldwx,s $acc6($tbl),$acc6
223 _srm $s3,16,$acc9
224 ldwx,s $acc7($tbl),$acc7
225 _srm $s0,8,$acc10
226 ldwx,s $acc8($tbl),$acc8
227 _srm $s1,0,$acc11
228 ldwx,s $acc9($tbl),$acc9
229 _srm $s3,24,$acc12
230 ldwx,s $acc10($tbl),$acc10
231 _srm $s0,16,$acc13
232 ldwx,s $acc11($tbl),$acc11
233 _srm $s1,8,$acc14
234 ldwx,s $acc12($tbl),$acc12
235 _srm $s2,0,$acc15
236 ldwx,s $acc13($tbl),$acc13
237 ldwx,s $acc14($tbl),$acc14
238 ldwx,s $acc15($tbl),$acc15
239 addib,= -1,$rounds,L\$enc_last
240 ldo 32($key),$key
241
242 _ror $acc1,8,$acc1
243 xor $acc0,$t0,$t0
244 ldw 0($key),$s0
245 _ror $acc2,16,$acc2
246 xor $acc1,$t0,$t0
247 ldw 4($key),$s1
248 _ror $acc3,24,$acc3
249 xor $acc2,$t0,$t0
250 ldw 8($key),$s2
251 _ror $acc5,8,$acc5
252 xor $acc3,$t0,$t0
253 ldw 12($key),$s3
254 _ror $acc6,16,$acc6
255 xor $acc4,$t1,$t1
256 _ror $acc7,24,$acc7
257 xor $acc5,$t1,$t1
258 _ror $acc9,8,$acc9
259 xor $acc6,$t1,$t1
260 _ror $acc10,16,$acc10
261 xor $acc7,$t1,$t1
262 _ror $acc11,24,$acc11
263 xor $acc8,$t2,$t2
264 _ror $acc13,8,$acc13
265 xor $acc9,$t2,$t2
266 _ror $acc14,16,$acc14
267 xor $acc10,$t2,$t2
268 _ror $acc15,24,$acc15
269 xor $acc11,$t2,$t2
270 xor $acc12,$acc14,$acc14
271 xor $acc13,$t3,$t3
272 _srm $t0,24,$acc0
273 xor $acc14,$t3,$t3
274 _srm $t1,16,$acc1
275 xor $acc15,$t3,$t3
276
277 _srm $t2,8,$acc2
278 ldwx,s $acc0($tbl),$acc0
279 _srm $t3,0,$acc3
280 ldwx,s $acc1($tbl),$acc1
281 _srm $t1,24,$acc4
282 ldwx,s $acc2($tbl),$acc2
283 _srm $t2,16,$acc5
284 ldwx,s $acc3($tbl),$acc3
285 _srm $t3,8,$acc6
286 ldwx,s $acc4($tbl),$acc4
287 _srm $t0,0,$acc7
288 ldwx,s $acc5($tbl),$acc5
289 _srm $t2,24,$acc8
290 ldwx,s $acc6($tbl),$acc6
291 _srm $t3,16,$acc9
292 ldwx,s $acc7($tbl),$acc7
293 _srm $t0,8,$acc10
294 ldwx,s $acc8($tbl),$acc8
295 _srm $t1,0,$acc11
296 ldwx,s $acc9($tbl),$acc9
297 _srm $t3,24,$acc12
298 ldwx,s $acc10($tbl),$acc10
299 _srm $t0,16,$acc13
300 ldwx,s $acc11($tbl),$acc11
301 _srm $t1,8,$acc14
302 ldwx,s $acc12($tbl),$acc12
303 _srm $t2,0,$acc15
304 ldwx,s $acc13($tbl),$acc13
305 _ror $acc1,8,$acc1
306 ldwx,s $acc14($tbl),$acc14
307
308 _ror $acc2,16,$acc2
309 xor $acc0,$s0,$s0
310 ldwx,s $acc15($tbl),$acc15
311 _ror $acc3,24,$acc3
312 xor $acc1,$s0,$s0
313 ldw 16($key),$t0
314 _ror $acc5,8,$acc5
315 xor $acc2,$s0,$s0
316 ldw 20($key),$t1
317 _ror $acc6,16,$acc6
318 xor $acc3,$s0,$s0
319 ldw 24($key),$t2
320 _ror $acc7,24,$acc7
321 xor $acc4,$s1,$s1
322 ldw 28($key),$t3
323 _ror $acc9,8,$acc9
324 xor $acc5,$s1,$s1
325 ldw 1024+0($tbl),%r0 ; prefetch te4
326 _ror $acc10,16,$acc10
327 xor $acc6,$s1,$s1
328 ldw 1024+32($tbl),%r0 ; prefetch te4
329 _ror $acc11,24,$acc11
330 xor $acc7,$s1,$s1
331 ldw 1024+64($tbl),%r0 ; prefetch te4
332 _ror $acc13,8,$acc13
333 xor $acc8,$s2,$s2
334 ldw 1024+96($tbl),%r0 ; prefetch te4
335 _ror $acc14,16,$acc14
336 xor $acc9,$s2,$s2
337 ldw 1024+128($tbl),%r0 ; prefetch te4
338 _ror $acc15,24,$acc15
339 xor $acc10,$s2,$s2
340 ldw 1024+160($tbl),%r0 ; prefetch te4
341 _srm $s0,24,$acc0
342 xor $acc11,$s2,$s2
343 ldw 1024+192($tbl),%r0 ; prefetch te4
344 xor $acc12,$acc14,$acc14
345 xor $acc13,$s3,$s3
346 ldw 1024+224($tbl),%r0 ; prefetch te4
347 _srm $s1,16,$acc1
348 xor $acc14,$s3,$s3
349 b L\$enc_loop
350 xor $acc15,$s3,$s3
351
352 .ALIGN 16
353L\$enc_last
354 ldo 1024($tbl),$rounds
355 _ror $acc1,8,$acc1
356 xor $acc0,$t0,$t0
357 ldw 0($key),$s0
358 _ror $acc2,16,$acc2
359 xor $acc1,$t0,$t0
360 ldw 4($key),$s1
361 _ror $acc3,24,$acc3
362 xor $acc2,$t0,$t0
363 ldw 8($key),$s2
364 _ror $acc5,8,$acc5
365 xor $acc3,$t0,$t0
366 ldw 12($key),$s3
367 _ror $acc6,16,$acc6
368 xor $acc4,$t1,$t1
369 _ror $acc7,24,$acc7
370 xor $acc5,$t1,$t1
371 _ror $acc9,8,$acc9
372 xor $acc6,$t1,$t1
373 _ror $acc10,16,$acc10
374 xor $acc7,$t1,$t1
375 _ror $acc11,24,$acc11
376 xor $acc8,$t2,$t2
377 _ror $acc13,8,$acc13
378 xor $acc9,$t2,$t2
379 _ror $acc14,16,$acc14
380 xor $acc10,$t2,$t2
381 _ror $acc15,24,$acc15
382 xor $acc11,$t2,$t2
383 xor $acc12,$acc14,$acc14
384 xor $acc13,$t3,$t3
385 _srm $t0,24,$acc0
386 xor $acc14,$t3,$t3
387 _srm $t1,16,$acc1
388 xor $acc15,$t3,$t3
389
390 _srm $t2,8,$acc2
391 ldbx $acc0($rounds),$acc0
392 _srm $t1,24,$acc4
393 ldbx $acc1($rounds),$acc1
394 _srm $t2,16,$acc5
395 _srm $t3,0,$acc3
396 ldbx $acc2($rounds),$acc2
397 ldbx $acc3($rounds),$acc3
398 _srm $t3,8,$acc6
399 ldbx $acc4($rounds),$acc4
400 _srm $t2,24,$acc8
401 ldbx $acc5($rounds),$acc5
402 _srm $t3,16,$acc9
403 _srm $t0,0,$acc7
404 ldbx $acc6($rounds),$acc6
405 ldbx $acc7($rounds),$acc7
406 _srm $t0,8,$acc10
407 ldbx $acc8($rounds),$acc8
408 _srm $t3,24,$acc12
409 ldbx $acc9($rounds),$acc9
410 _srm $t0,16,$acc13
411 _srm $t1,0,$acc11
412 ldbx $acc10($rounds),$acc10
413 _srm $t1,8,$acc14
414 ldbx $acc11($rounds),$acc11
415 ldbx $acc12($rounds),$acc12
416 ldbx $acc13($rounds),$acc13
417 _srm $t2,0,$acc15
418 ldbx $acc14($rounds),$acc14
419
420 dep $acc0,7,8,$acc3
421 ldbx $acc15($rounds),$acc15
422 dep $acc4,7,8,$acc7
423 dep $acc1,15,8,$acc3
424 dep $acc5,15,8,$acc7
425 dep $acc2,23,8,$acc3
426 dep $acc6,23,8,$acc7
427 xor $acc3,$s0,$s0
428 xor $acc7,$s1,$s1
429 dep $acc8,7,8,$acc11
430 dep $acc12,7,8,$acc15
431 dep $acc9,15,8,$acc11
432 dep $acc13,15,8,$acc15
433 dep $acc10,23,8,$acc11
434 dep $acc14,23,8,$acc15
435 xor $acc11,$s2,$s2
436
437 bv (%r31)
438 .EXIT
439 xor $acc15,$s3,$s3
440 .PROCEND
441
442 .ALIGN 64
443L\$AES_Te
444 .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
445 .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
446 .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
447 .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
448 .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
449 .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
450 .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
451 .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
452 .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
453 .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
454 .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
455 .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
456 .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
457 .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
458 .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
459 .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
460 .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
461 .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
462 .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
463 .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
464 .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
465 .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
466 .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
467 .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
468 .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
469 .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
470 .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
471 .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
472 .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
473 .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
474 .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
475 .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
476 .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
477 .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
478 .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
479 .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
480 .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
481 .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
482 .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
483 .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
484 .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
485 .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
486 .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
487 .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
488 .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
489 .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
490 .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
491 .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
492 .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
493 .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
494 .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
495 .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
496 .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
497 .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
498 .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
499 .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
500 .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
501 .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
502 .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
503 .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
504 .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
505 .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
506 .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
507 .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
508 .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
509 .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
510 .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
511 .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
512 .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
513 .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
514 .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
515 .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
516 .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
517 .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
518 .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
519 .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
520 .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
521 .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
522 .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
523 .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
524 .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
525 .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
526 .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
527 .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
528 .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
529 .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
530 .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
531 .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
532 .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
533 .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
534 .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
535 .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
536 .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
537 .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
538 .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
539 .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
540___
541
542$code.=<<___;
543 .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
544 .ALIGN 16
545AES_decrypt
546 .PROC
547 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
548 .ENTRY
549 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
550 $PUSHMA %r3,$FRAME(%sp)
551 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
552 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
553 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
554 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
555 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
556 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
557 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
558 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
559 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
560 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
561 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
562 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
563 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
564 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
565 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
566
567 blr %r0,$tbl
568 ldi 3,$t0
569L\$dec_pic
570 andcm $tbl,$t0,$tbl
571 ldo L\$AES_Td-L\$dec_pic($tbl),$tbl
572
573 and $inp,$t0,$t0
574 sub $inp,$t0,$inp
575 ldw 0($inp),$s0
576 ldw 4($inp),$s1
577 ldw 8($inp),$s2
578 comib,= 0,$t0,L\$dec_inp_aligned
579 ldw 12($inp),$s3
580
581 sh3addl $t0,%r0,$t0
582 subi 32,$t0,$t0
583 mtctl $t0,%cr11
584 ldw 16($inp),$t1
585 vshd $s0,$s1,$s0
586 vshd $s1,$s2,$s1
587 vshd $s2,$s3,$s2
588 vshd $s3,$t1,$s3
589
590L\$dec_inp_aligned
591 bl _parisc_AES_decrypt,%r31
592 nop
593
594 extru,<> $out,31,2,%r0
595 b L\$dec_out_aligned
596 nop
597
598 _srm $s0,24,$acc0
599 _srm $s0,16,$acc1
600 stb $acc0,0($out)
601 _srm $s0,8,$acc2
602 stb $acc1,1($out)
603 _srm $s1,24,$acc4
604 stb $acc2,2($out)
605 _srm $s1,16,$acc5
606 stb $s0,3($out)
607 _srm $s1,8,$acc6
608 stb $acc4,4($out)
609 _srm $s2,24,$acc0
610 stb $acc5,5($out)
611 _srm $s2,16,$acc1
612 stb $acc6,6($out)
613 _srm $s2,8,$acc2
614 stb $s1,7($out)
615 _srm $s3,24,$acc4
616 stb $acc0,8($out)
617 _srm $s3,16,$acc5
618 stb $acc1,9($out)
619 _srm $s3,8,$acc6
620 stb $acc2,10($out)
621 stb $s2,11($out)
622 stb $acc4,12($out)
623 stb $acc5,13($out)
624 stb $acc6,14($out)
625 b L\$dec_done
626 stb $s3,15($out)
627
628L\$dec_out_aligned
629 stw $s0,0($out)
630 stw $s1,4($out)
631 stw $s2,8($out)
632 stw $s3,12($out)
633
634L\$dec_done
635 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
636 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
637 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
638 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
639 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
640 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
641 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
642 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
643 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
644 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
645 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
646 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
647 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
648 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
649 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
650 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
651 bv (%r2)
652 .EXIT
653 $POPMB -$FRAME(%sp),%r3
654 .PROCEND
655
656 .ALIGN 16
657_parisc_AES_decrypt
658 .PROC
659 .CALLINFO MILLICODE
660 .ENTRY
661 ldw 240($key),$rounds
662 ldw 0($key),$t0
663 ldw 4($key),$t1
664 ldw 8($key),$t2
665 ldw 12($key),$t3
666 _srm $rounds,1,$rounds
667 xor $t0,$s0,$s0
668 ldw 16($key),$t0
669 xor $t1,$s1,$s1
670 ldw 20($key),$t1
671 _srm $s0,24,$acc0
672 xor $t2,$s2,$s2
673 ldw 24($key),$t2
674 xor $t3,$s3,$s3
675 ldw 28($key),$t3
676 _srm $s3,16,$acc1
677L\$dec_loop
678 _srm $s2,8,$acc2
679 ldwx,s $acc0($tbl),$acc0
680 _srm $s1,0,$acc3
681 ldwx,s $acc1($tbl),$acc1
682 _srm $s1,24,$acc4
683 ldwx,s $acc2($tbl),$acc2
684 _srm $s0,16,$acc5
685 ldwx,s $acc3($tbl),$acc3
686 _srm $s3,8,$acc6
687 ldwx,s $acc4($tbl),$acc4
688 _srm $s2,0,$acc7
689 ldwx,s $acc5($tbl),$acc5
690 _srm $s2,24,$acc8
691 ldwx,s $acc6($tbl),$acc6
692 _srm $s1,16,$acc9
693 ldwx,s $acc7($tbl),$acc7
694 _srm $s0,8,$acc10
695 ldwx,s $acc8($tbl),$acc8
696 _srm $s3,0,$acc11
697 ldwx,s $acc9($tbl),$acc9
698 _srm $s3,24,$acc12
699 ldwx,s $acc10($tbl),$acc10
700 _srm $s2,16,$acc13
701 ldwx,s $acc11($tbl),$acc11
702 _srm $s1,8,$acc14
703 ldwx,s $acc12($tbl),$acc12
704 _srm $s0,0,$acc15
705 ldwx,s $acc13($tbl),$acc13
706 ldwx,s $acc14($tbl),$acc14
707 ldwx,s $acc15($tbl),$acc15
708 addib,= -1,$rounds,L\$dec_last
709 ldo 32($key),$key
710
711 _ror $acc1,8,$acc1
712 xor $acc0,$t0,$t0
713 ldw 0($key),$s0
714 _ror $acc2,16,$acc2
715 xor $acc1,$t0,$t0
716 ldw 4($key),$s1
717 _ror $acc3,24,$acc3
718 xor $acc2,$t0,$t0
719 ldw 8($key),$s2
720 _ror $acc5,8,$acc5
721 xor $acc3,$t0,$t0
722 ldw 12($key),$s3
723 _ror $acc6,16,$acc6
724 xor $acc4,$t1,$t1
725 _ror $acc7,24,$acc7
726 xor $acc5,$t1,$t1
727 _ror $acc9,8,$acc9
728 xor $acc6,$t1,$t1
729 _ror $acc10,16,$acc10
730 xor $acc7,$t1,$t1
731 _ror $acc11,24,$acc11
732 xor $acc8,$t2,$t2
733 _ror $acc13,8,$acc13
734 xor $acc9,$t2,$t2
735 _ror $acc14,16,$acc14
736 xor $acc10,$t2,$t2
737 _ror $acc15,24,$acc15
738 xor $acc11,$t2,$t2
739 xor $acc12,$acc14,$acc14
740 xor $acc13,$t3,$t3
741 _srm $t0,24,$acc0
742 xor $acc14,$t3,$t3
743 xor $acc15,$t3,$t3
744 _srm $t3,16,$acc1
745
746 _srm $t2,8,$acc2
747 ldwx,s $acc0($tbl),$acc0
748 _srm $t1,0,$acc3
749 ldwx,s $acc1($tbl),$acc1
750 _srm $t1,24,$acc4
751 ldwx,s $acc2($tbl),$acc2
752 _srm $t0,16,$acc5
753 ldwx,s $acc3($tbl),$acc3
754 _srm $t3,8,$acc6
755 ldwx,s $acc4($tbl),$acc4
756 _srm $t2,0,$acc7
757 ldwx,s $acc5($tbl),$acc5
758 _srm $t2,24,$acc8
759 ldwx,s $acc6($tbl),$acc6
760 _srm $t1,16,$acc9
761 ldwx,s $acc7($tbl),$acc7
762 _srm $t0,8,$acc10
763 ldwx,s $acc8($tbl),$acc8
764 _srm $t3,0,$acc11
765 ldwx,s $acc9($tbl),$acc9
766 _srm $t3,24,$acc12
767 ldwx,s $acc10($tbl),$acc10
768 _srm $t2,16,$acc13
769 ldwx,s $acc11($tbl),$acc11
770 _srm $t1,8,$acc14
771 ldwx,s $acc12($tbl),$acc12
772 _srm $t0,0,$acc15
773 ldwx,s $acc13($tbl),$acc13
774 _ror $acc1,8,$acc1
775 ldwx,s $acc14($tbl),$acc14
776
777 _ror $acc2,16,$acc2
778 xor $acc0,$s0,$s0
779 ldwx,s $acc15($tbl),$acc15
780 _ror $acc3,24,$acc3
781 xor $acc1,$s0,$s0
782 ldw 16($key),$t0
783 _ror $acc5,8,$acc5
784 xor $acc2,$s0,$s0
785 ldw 20($key),$t1
786 _ror $acc6,16,$acc6
787 xor $acc3,$s0,$s0
788 ldw 24($key),$t2
789 _ror $acc7,24,$acc7
790 xor $acc4,$s1,$s1
791 ldw 28($key),$t3
792 _ror $acc9,8,$acc9
793 xor $acc5,$s1,$s1
794 ldw 1024+0($tbl),%r0 ; prefetch td4
795 _ror $acc10,16,$acc10
796 xor $acc6,$s1,$s1
797 ldw 1024+32($tbl),%r0 ; prefetch td4
798 _ror $acc11,24,$acc11
799 xor $acc7,$s1,$s1
800 ldw 1024+64($tbl),%r0 ; prefetch td4
801 _ror $acc13,8,$acc13
802 xor $acc8,$s2,$s2
803 ldw 1024+96($tbl),%r0 ; prefetch td4
804 _ror $acc14,16,$acc14
805 xor $acc9,$s2,$s2
806 ldw 1024+128($tbl),%r0 ; prefetch td4
807 _ror $acc15,24,$acc15
808 xor $acc10,$s2,$s2
809 ldw 1024+160($tbl),%r0 ; prefetch td4
810 _srm $s0,24,$acc0
811 xor $acc11,$s2,$s2
812 ldw 1024+192($tbl),%r0 ; prefetch td4
813 xor $acc12,$acc14,$acc14
814 xor $acc13,$s3,$s3
815 ldw 1024+224($tbl),%r0 ; prefetch td4
816 xor $acc14,$s3,$s3
817 xor $acc15,$s3,$s3
818 b L\$dec_loop
819 _srm $s3,16,$acc1
820
821 .ALIGN 16
822L\$dec_last
823 ldo 1024($tbl),$rounds
824 _ror $acc1,8,$acc1
825 xor $acc0,$t0,$t0
826 ldw 0($key),$s0
827 _ror $acc2,16,$acc2
828 xor $acc1,$t0,$t0
829 ldw 4($key),$s1
830 _ror $acc3,24,$acc3
831 xor $acc2,$t0,$t0
832 ldw 8($key),$s2
833 _ror $acc5,8,$acc5
834 xor $acc3,$t0,$t0
835 ldw 12($key),$s3
836 _ror $acc6,16,$acc6
837 xor $acc4,$t1,$t1
838 _ror $acc7,24,$acc7
839 xor $acc5,$t1,$t1
840 _ror $acc9,8,$acc9
841 xor $acc6,$t1,$t1
842 _ror $acc10,16,$acc10
843 xor $acc7,$t1,$t1
844 _ror $acc11,24,$acc11
845 xor $acc8,$t2,$t2
846 _ror $acc13,8,$acc13
847 xor $acc9,$t2,$t2
848 _ror $acc14,16,$acc14
849 xor $acc10,$t2,$t2
850 _ror $acc15,24,$acc15
851 xor $acc11,$t2,$t2
852 xor $acc12,$acc14,$acc14
853 xor $acc13,$t3,$t3
854 _srm $t0,24,$acc0
855 xor $acc14,$t3,$t3
856 xor $acc15,$t3,$t3
857 _srm $t3,16,$acc1
858
859 _srm $t2,8,$acc2
860 ldbx $acc0($rounds),$acc0
861 _srm $t1,24,$acc4
862 ldbx $acc1($rounds),$acc1
863 _srm $t0,16,$acc5
864 _srm $t1,0,$acc3
865 ldbx $acc2($rounds),$acc2
866 ldbx $acc3($rounds),$acc3
867 _srm $t3,8,$acc6
868 ldbx $acc4($rounds),$acc4
869 _srm $t2,24,$acc8
870 ldbx $acc5($rounds),$acc5
871 _srm $t1,16,$acc9
872 _srm $t2,0,$acc7
873 ldbx $acc6($rounds),$acc6
874 ldbx $acc7($rounds),$acc7
875 _srm $t0,8,$acc10
876 ldbx $acc8($rounds),$acc8
877 _srm $t3,24,$acc12
878 ldbx $acc9($rounds),$acc9
879 _srm $t2,16,$acc13
880 _srm $t3,0,$acc11
881 ldbx $acc10($rounds),$acc10
882 _srm $t1,8,$acc14
883 ldbx $acc11($rounds),$acc11
884 ldbx $acc12($rounds),$acc12
885 ldbx $acc13($rounds),$acc13
886 _srm $t0,0,$acc15
887 ldbx $acc14($rounds),$acc14
888
889 dep $acc0,7,8,$acc3
890 ldbx $acc15($rounds),$acc15
891 dep $acc4,7,8,$acc7
892 dep $acc1,15,8,$acc3
893 dep $acc5,15,8,$acc7
894 dep $acc2,23,8,$acc3
895 dep $acc6,23,8,$acc7
896 xor $acc3,$s0,$s0
897 xor $acc7,$s1,$s1
898 dep $acc8,7,8,$acc11
899 dep $acc12,7,8,$acc15
900 dep $acc9,15,8,$acc11
901 dep $acc13,15,8,$acc15
902 dep $acc10,23,8,$acc11
903 dep $acc14,23,8,$acc15
904 xor $acc11,$s2,$s2
905
906 bv (%r31)
907 .EXIT
908 xor $acc15,$s3,$s3
909 .PROCEND
910
911 .ALIGN 64
912L\$AES_Td
913 .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
914 .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
915 .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
916 .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
917 .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
918 .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
919 .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
920 .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
921 .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
922 .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
923 .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
924 .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
925 .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
926 .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
927 .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
928 .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
929 .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
930 .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
931 .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
932 .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
933 .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
934 .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
935 .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
936 .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
937 .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
938 .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
939 .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
940 .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
941 .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
942 .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
943 .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
944 .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
945 .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
946 .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
947 .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
948 .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
949 .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
950 .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
951 .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
952 .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
953 .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
954 .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
955 .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
956 .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
957 .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
958 .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
959 .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
960 .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
961 .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
962 .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
963 .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
964 .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
965 .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
966 .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
967 .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
968 .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
969 .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
970 .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
971 .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
972 .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
973 .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
974 .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
975 .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
976 .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
977 .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
978 .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
979 .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
980 .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
981 .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
982 .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
983 .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
984 .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
985 .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
986 .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
987 .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
988 .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
989 .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
990 .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
991 .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
992 .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
993 .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
994 .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
995 .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
996 .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
997 .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
998 .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
999 .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
1000 .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
1001 .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
1002 .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
1003 .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1004 .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1005 .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1006 .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1007 .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1008 .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1009
1010 .data
1011 .STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
1012___
1013
1014foreach (split("\n",$code)) {
1015 s/\`([^\`]*)\`/eval $1/ge;
1016
1017	# translate made-up instructions: _ror, _srm
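	# e.g. "_srm %r9,24,%r5" becomes "extru %r9,7,8,%r5" in 32-bit
	# builds ("extrd,u %r9,39,8,%r5" in 64-bit ones), and
	# "_ror %r17,8,%r17" becomes "shd %r17,%r17,8,%r17"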
1018 s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or
1019
1020 s/_srm(\s+%r[0-9]+),([0-9]+),/
1021 $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
1022 : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
1023
1024 s/,\*/,/ if ($SIZE_T==4);
1025 s/\bbv\b(.*\(%r2\))/bve$1/ if ($SIZE_T==8);
1026 print $_,"\n";
1027}
1028close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl
deleted file mode 100644
index 7c52cbe5f9..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-ppc.pl
+++ /dev/null
@@ -1,1365 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Needs more work: key setup, CBC routine...
11#
12# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
13# 128-bit key, which is ~40% better than 64-bit code generated by gcc
14# 4.0. But these are not the ones currently used! Their "compact"
15# counterparts are, for security reasons. ppc_AES_encrypt_compact runs
16# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact runs
17# at 1/3 of ppc_AES_decrypt speed.
18
19# February 2010
20#
21# Rescheduling instructions to favour Power6 pipeline gave 10%
22# performance improvement on the platform in question (and marginal
23# improvement even on others). It should be noted that Power6 fails
24# to process a byte in 18 cycles, only in 23, because it fails to issue
25# 4 load instructions in two cycles, only in 3. As a result, non-compact
26# block subroutines are 25% slower than one would expect. Compact
27# functions scale better, because they have pure computational part,
28# which scales perfectly with clock frequency. To be specific
29# ppc_AES_encrypt_compact operates at 42 cycles per byte, while
30# ppc_AES_decrypt_compact - at 55 (in 64-bit build).
31
32$flavour = shift;
33
34if ($flavour =~ /64/) {
35 $SIZE_T =8;
36 $LRSAVE =2*$SIZE_T;
37 $STU ="stdu";
38 $POP ="ld";
39 $PUSH ="std";
40} elsif ($flavour =~ /32/) {
41 $SIZE_T =4;
42 $LRSAVE =$SIZE_T;
43 $STU ="stwu";
44 $POP ="lwz";
45 $PUSH ="stw";
46} else { die "nonsense $flavour"; }
47
48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
50( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
51die "can't locate ppc-xlate.pl";
52
53open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
54
55$FRAME=32*$SIZE_T;
56
57sub _data_word()
58{ my $i;
59 while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
60}
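# Each value is emitted twice per entry, e.g. &_data_word(0xc66363a5)
# produces ".long 0xc66363a5,0xc66363a5". The resulting 8-byte stride
# lets the lookup code below address the tables at byte offsets 0..3
# and read each word pre-rotated by whole bytes without a rotate.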
61
62$sp="r1";
63$toc="r2";
64$inp="r3";
65$out="r4";
66$key="r5";
67
68$Tbl0="r3";
69$Tbl1="r6";
70$Tbl2="r7";
71$Tbl3="r2";
72
73$s0="r8";
74$s1="r9";
75$s2="r10";
76$s3="r11";
77
78$t0="r12";
79$t1="r13";
80$t2="r14";
81$t3="r15";
82
83$acc00="r16";
84$acc01="r17";
85$acc02="r18";
86$acc03="r19";
87
88$acc04="r20";
89$acc05="r21";
90$acc06="r22";
91$acc07="r23";
92
93$acc08="r24";
94$acc09="r25";
95$acc10="r26";
96$acc11="r27";
97
98$acc12="r28";
99$acc13="r29";
100$acc14="r30";
101$acc15="r31";
102
103# stay away from TLS pointer
104if ($SIZE_T==8) { die if ($t1 ne "r13"); $t1="r0"; }
105else { die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0"; }
106$mask80=$Tbl2;
107$mask1b=$Tbl3;
108
109$code.=<<___;
110.machine "any"
111.text
112
113.align 7
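# bcl 20,31,\$+4 below is the usual PPC read-the-PC idiom: an
# always-taken branch-and-link to the next instruction, whose address
# is then pulled out of the link register to locate the tables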
114LAES_Te:
115 mflr r0
116 bcl 20,31,\$+4
117 mflr $Tbl0 ; vvvvv "distance" between . and 1st data entry
118 addi $Tbl0,$Tbl0,`128-8`
119 mtlr r0
120 blr
121 .long 0
122 .byte 0,12,0x14,0,0,0,0,0
123 .space `64-9*4`
124LAES_Td:
125 mflr r0
126 bcl 20,31,\$+4
127 mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
128 addi $Tbl0,$Tbl0,`128-64-8+2048+256`
129 mtlr r0
130 blr
131 .long 0
132 .byte 0,12,0x14,0,0,0,0,0
133 .space `128-64-9*4`
134___
135&_data_word(
136 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
137 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
138 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
139 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
140 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
141 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
142 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
143 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
144 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
145 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
146 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
147 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
148 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
149 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
150 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
151 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
152 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
153 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
154 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
155 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
156 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
157 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
158 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
159 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
160 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
161 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
162 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
163 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
164 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
165 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
166 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
167 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
168 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
169 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
170 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
171 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
172 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
173 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
174 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
175 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
176 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
177 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
178 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
179 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
180 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
181 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
182 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
183 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
184 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
185 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
186 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
187 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
188 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
189 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
190 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
191 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
192 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
193 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
194 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
195 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
196 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
197 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
198 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
199 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
200$code.=<<___;
201.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
202.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
203.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
204.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
205.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
206.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
207.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
208.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
209.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
210.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
211.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
212.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
213.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
214.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
215.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
216.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
217.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
218.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
219.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
220.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
221.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
222.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
223.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
224.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
225.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
226.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
227.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
228.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
229.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
230.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
231.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
232.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
233___
234&_data_word(
235 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
236 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
237 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
238 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
239 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
240 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
241 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
242 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
243 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
244 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
245 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
246 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
247 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
248 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
249 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
250 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
251 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
252 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
253 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
254 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
255 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
256 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
257 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
258 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
259 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
260 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
261 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
262 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
263 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
264 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
265 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
266 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
267 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
268 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
269 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
270 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
271 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
272 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
273 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
274 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
275 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
276 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
277 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
278 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
279 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
280 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
281 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
282 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
283 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
284 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
285 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
286 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
287 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
288 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
289 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
290 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
291 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
292 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
293 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
294 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
295 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
296 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
297 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
298 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
299$code.=<<___;
300.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
301.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
302.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
303.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
304.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
305.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
306.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
307.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
308.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
309.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
310.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
311.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
312.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
313.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
314.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
315.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
316.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
317.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
318.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
319.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
320.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
321.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
322.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
323.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
324.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
325.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
326.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
327.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
328.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
329.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
330.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
331.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
332
333
334.globl .AES_encrypt
335.align 7
336.AES_encrypt:
337 $STU $sp,-$FRAME($sp)
338 mflr r0
339
340 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
341 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
342 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
343 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
344 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
345 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
346 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
347 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
348 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
349 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
350 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
351 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
352 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
353 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
354 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
355 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
356 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
357 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
358 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
359 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
360 $PUSH r0,`$FRAME+$LRSAVE`($sp)
361
362 andi. $t0,$inp,3
363 andi. $t1,$out,3
364 or. $t0,$t0,$t1
365 bne Lenc_unaligned
366
367Lenc_unaligned_ok:
368 lwz $s0,0($inp)
369 lwz $s1,4($inp)
370 lwz $s2,8($inp)
371 lwz $s3,12($inp)
372 bl LAES_Te
373 bl Lppc_AES_encrypt_compact
374 stw $s0,0($out)
375 stw $s1,4($out)
376 stw $s2,8($out)
377 stw $s3,12($out)
378 b Lenc_done
379
380Lenc_unaligned:
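	# unaligned word access is still safe as long as the 16-byte block
	# stays within one 4KB page; only blocks that would cross a page
	# boundary take the byte-by-byte Lenc_xpage path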
381 subfic $t0,$inp,4096
382 subfic $t1,$out,4096
383 andi. $t0,$t0,4096-16
384 beq Lenc_xpage
385 andi. $t1,$t1,4096-16
386 bne Lenc_unaligned_ok
387
388Lenc_xpage:
389 lbz $acc00,0($inp)
390 lbz $acc01,1($inp)
391 lbz $acc02,2($inp)
392 lbz $s0,3($inp)
393 lbz $acc04,4($inp)
394 lbz $acc05,5($inp)
395 lbz $acc06,6($inp)
396 lbz $s1,7($inp)
397 lbz $acc08,8($inp)
398 lbz $acc09,9($inp)
399 lbz $acc10,10($inp)
400 insrwi $s0,$acc00,8,0
401 lbz $s2,11($inp)
402 insrwi $s1,$acc04,8,0
403 lbz $acc12,12($inp)
404 insrwi $s0,$acc01,8,8
405 lbz $acc13,13($inp)
406 insrwi $s1,$acc05,8,8
407 lbz $acc14,14($inp)
408 insrwi $s0,$acc02,8,16
409 lbz $s3,15($inp)
410 insrwi $s1,$acc06,8,16
411 insrwi $s2,$acc08,8,0
412 insrwi $s3,$acc12,8,0
413 insrwi $s2,$acc09,8,8
414 insrwi $s3,$acc13,8,8
415 insrwi $s2,$acc10,8,16
416 insrwi $s3,$acc14,8,16
417
418 bl LAES_Te
419 bl Lppc_AES_encrypt_compact
420
421 extrwi $acc00,$s0,8,0
422 extrwi $acc01,$s0,8,8
423 stb $acc00,0($out)
424 extrwi $acc02,$s0,8,16
425 stb $acc01,1($out)
426 stb $acc02,2($out)
427 extrwi $acc04,$s1,8,0
428 stb $s0,3($out)
429 extrwi $acc05,$s1,8,8
430 stb $acc04,4($out)
431 extrwi $acc06,$s1,8,16
432 stb $acc05,5($out)
433 stb $acc06,6($out)
434 extrwi $acc08,$s2,8,0
435 stb $s1,7($out)
436 extrwi $acc09,$s2,8,8
437 stb $acc08,8($out)
438 extrwi $acc10,$s2,8,16
439 stb $acc09,9($out)
440 stb $acc10,10($out)
441 extrwi $acc12,$s3,8,0
442 stb $s2,11($out)
443 extrwi $acc13,$s3,8,8
444 stb $acc12,12($out)
445 extrwi $acc14,$s3,8,16
446 stb $acc13,13($out)
447 stb $acc14,14($out)
448 stb $s3,15($out)
449
450Lenc_done:
451 $POP r0,`$FRAME+$LRSAVE`($sp)
452 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
453 $POP r13,`$FRAME-$SIZE_T*19`($sp)
454 $POP r14,`$FRAME-$SIZE_T*18`($sp)
455 $POP r15,`$FRAME-$SIZE_T*17`($sp)
456 $POP r16,`$FRAME-$SIZE_T*16`($sp)
457 $POP r17,`$FRAME-$SIZE_T*15`($sp)
458 $POP r18,`$FRAME-$SIZE_T*14`($sp)
459 $POP r19,`$FRAME-$SIZE_T*13`($sp)
460 $POP r20,`$FRAME-$SIZE_T*12`($sp)
461 $POP r21,`$FRAME-$SIZE_T*11`($sp)
462 $POP r22,`$FRAME-$SIZE_T*10`($sp)
463 $POP r23,`$FRAME-$SIZE_T*9`($sp)
464 $POP r24,`$FRAME-$SIZE_T*8`($sp)
465 $POP r25,`$FRAME-$SIZE_T*7`($sp)
466 $POP r26,`$FRAME-$SIZE_T*6`($sp)
467 $POP r27,`$FRAME-$SIZE_T*5`($sp)
468 $POP r28,`$FRAME-$SIZE_T*4`($sp)
469 $POP r29,`$FRAME-$SIZE_T*3`($sp)
470 $POP r30,`$FRAME-$SIZE_T*2`($sp)
471 $POP r31,`$FRAME-$SIZE_T*1`($sp)
472 mtlr r0
473 addi $sp,$sp,$FRAME
474 blr
475 .long 0
476 .byte 0,12,4,1,0x80,18,3,0
477 .long 0
478
479.align 5
480Lppc_AES_encrypt:
481 lwz $acc00,240($key)
482 addi $Tbl1,$Tbl0,3
483 lwz $t0,0($key)
484 addi $Tbl2,$Tbl0,2
485 lwz $t1,4($key)
486 addi $Tbl3,$Tbl0,1
487 lwz $t2,8($key)
488 addi $acc00,$acc00,-1
489 lwz $t3,12($key)
490 addi $key,$key,16
491 xor $s0,$s0,$t0
492 xor $s1,$s1,$t1
493 xor $s2,$s2,$t2
494 xor $s3,$s3,$t3
495 mtctr $acc00
496.align 4
497Lenc_loop:
498 rlwinm $acc00,$s0,`32-24+3`,21,28
499 rlwinm $acc01,$s1,`32-24+3`,21,28
500 rlwinm $acc02,$s2,`32-24+3`,21,28
501 rlwinm $acc03,$s3,`32-24+3`,21,28
502 lwz $t0,0($key)
503 rlwinm $acc04,$s1,`32-16+3`,21,28
504 lwz $t1,4($key)
505 rlwinm $acc05,$s2,`32-16+3`,21,28
506 lwz $t2,8($key)
507 rlwinm $acc06,$s3,`32-16+3`,21,28
508 lwz $t3,12($key)
509 rlwinm $acc07,$s0,`32-16+3`,21,28
510 lwzx $acc00,$Tbl0,$acc00
511 rlwinm $acc08,$s2,`32-8+3`,21,28
512 lwzx $acc01,$Tbl0,$acc01
513 rlwinm $acc09,$s3,`32-8+3`,21,28
514 lwzx $acc02,$Tbl0,$acc02
515 rlwinm $acc10,$s0,`32-8+3`,21,28
516 lwzx $acc03,$Tbl0,$acc03
517 rlwinm $acc11,$s1,`32-8+3`,21,28
518 lwzx $acc04,$Tbl1,$acc04
519 rlwinm $acc12,$s3,`0+3`,21,28
520 lwzx $acc05,$Tbl1,$acc05
521 rlwinm $acc13,$s0,`0+3`,21,28
522 lwzx $acc06,$Tbl1,$acc06
523 rlwinm $acc14,$s1,`0+3`,21,28
524 lwzx $acc07,$Tbl1,$acc07
525 rlwinm $acc15,$s2,`0+3`,21,28
526 lwzx $acc08,$Tbl2,$acc08
527 xor $t0,$t0,$acc00
528 lwzx $acc09,$Tbl2,$acc09
529 xor $t1,$t1,$acc01
530 lwzx $acc10,$Tbl2,$acc10
531 xor $t2,$t2,$acc02
532 lwzx $acc11,$Tbl2,$acc11
533 xor $t3,$t3,$acc03
534 lwzx $acc12,$Tbl3,$acc12
535 xor $t0,$t0,$acc04
536 lwzx $acc13,$Tbl3,$acc13
537 xor $t1,$t1,$acc05
538 lwzx $acc14,$Tbl3,$acc14
539 xor $t2,$t2,$acc06
540 lwzx $acc15,$Tbl3,$acc15
541 xor $t3,$t3,$acc07
542 xor $t0,$t0,$acc08
543 xor $t1,$t1,$acc09
544 xor $t2,$t2,$acc10
545 xor $t3,$t3,$acc11
546 xor $s0,$t0,$acc12
547 xor $s1,$t1,$acc13
548 xor $s2,$t2,$acc14
549 xor $s3,$t3,$acc15
550 addi $key,$key,16
551 bdnz- Lenc_loop
552
553 addi $Tbl2,$Tbl0,2048
554 nop
555 lwz $t0,0($key)
556 rlwinm $acc00,$s0,`32-24`,24,31
557 lwz $t1,4($key)
558 rlwinm $acc01,$s1,`32-24`,24,31
559 lwz $t2,8($key)
560 rlwinm $acc02,$s2,`32-24`,24,31
561 lwz $t3,12($key)
562 rlwinm $acc03,$s3,`32-24`,24,31
563 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
564 rlwinm $acc04,$s1,`32-16`,24,31
565 lwz $acc09,`2048+32`($Tbl0)
566 rlwinm $acc05,$s2,`32-16`,24,31
567 lwz $acc10,`2048+64`($Tbl0)
568 rlwinm $acc06,$s3,`32-16`,24,31
569 lwz $acc11,`2048+96`($Tbl0)
570 rlwinm $acc07,$s0,`32-16`,24,31
571 lwz $acc12,`2048+128`($Tbl0)
572 rlwinm $acc08,$s2,`32-8`,24,31
573 lwz $acc13,`2048+160`($Tbl0)
574 rlwinm $acc09,$s3,`32-8`,24,31
575 lwz $acc14,`2048+192`($Tbl0)
576 rlwinm $acc10,$s0,`32-8`,24,31
577 lwz $acc15,`2048+224`($Tbl0)
578 rlwinm $acc11,$s1,`32-8`,24,31
579 lbzx $acc00,$Tbl2,$acc00
580 rlwinm $acc12,$s3,`0`,24,31
581 lbzx $acc01,$Tbl2,$acc01
582 rlwinm $acc13,$s0,`0`,24,31
583 lbzx $acc02,$Tbl2,$acc02
584 rlwinm $acc14,$s1,`0`,24,31
585 lbzx $acc03,$Tbl2,$acc03
586 rlwinm $acc15,$s2,`0`,24,31
587 lbzx $acc04,$Tbl2,$acc04
588 rlwinm $s0,$acc00,24,0,7
589 lbzx $acc05,$Tbl2,$acc05
590 rlwinm $s1,$acc01,24,0,7
591 lbzx $acc06,$Tbl2,$acc06
592 rlwinm $s2,$acc02,24,0,7
593 lbzx $acc07,$Tbl2,$acc07
594 rlwinm $s3,$acc03,24,0,7
595 lbzx $acc08,$Tbl2,$acc08
596 rlwimi $s0,$acc04,16,8,15
597 lbzx $acc09,$Tbl2,$acc09
598 rlwimi $s1,$acc05,16,8,15
599 lbzx $acc10,$Tbl2,$acc10
600 rlwimi $s2,$acc06,16,8,15
601 lbzx $acc11,$Tbl2,$acc11
602 rlwimi $s3,$acc07,16,8,15
603 lbzx $acc12,$Tbl2,$acc12
604 rlwimi $s0,$acc08,8,16,23
605 lbzx $acc13,$Tbl2,$acc13
606 rlwimi $s1,$acc09,8,16,23
607 lbzx $acc14,$Tbl2,$acc14
608 rlwimi $s2,$acc10,8,16,23
609 lbzx $acc15,$Tbl2,$acc15
610 rlwimi $s3,$acc11,8,16,23
611 or $s0,$s0,$acc12
612 or $s1,$s1,$acc13
613 or $s2,$s2,$acc14
614 or $s3,$s3,$acc15
615 xor $s0,$s0,$t0
616 xor $s1,$s1,$t1
617 xor $s2,$s2,$t2
618 xor $s3,$s3,$t3
619 blr
620 .long 0
621 .byte 0,12,0x14,0,0,0,0,0
622
623.align 4
624Lppc_AES_encrypt_compact:
625 lwz $acc00,240($key)
626 addi $Tbl1,$Tbl0,2048
627 lwz $t0,0($key)
628 lis $mask80,0x8080
629 lwz $t1,4($key)
630 lis $mask1b,0x1b1b
631 lwz $t2,8($key)
632 ori $mask80,$mask80,0x8080
633 lwz $t3,12($key)
634 ori $mask1b,$mask1b,0x1b1b
635 addi $key,$key,16
636 mtctr $acc00
637.align 4
638Lenc_compact_loop:
639 xor $s0,$s0,$t0
640 xor $s1,$s1,$t1
641 rlwinm $acc00,$s0,`32-24`,24,31
642 xor $s2,$s2,$t2
643 rlwinm $acc01,$s1,`32-24`,24,31
644 xor $s3,$s3,$t3
645 rlwinm $acc02,$s2,`32-24`,24,31
646 rlwinm $acc03,$s3,`32-24`,24,31
647 rlwinm $acc04,$s1,`32-16`,24,31
648 rlwinm $acc05,$s2,`32-16`,24,31
649 rlwinm $acc06,$s3,`32-16`,24,31
650 rlwinm $acc07,$s0,`32-16`,24,31
651 lbzx $acc00,$Tbl1,$acc00
652 rlwinm $acc08,$s2,`32-8`,24,31
653 lbzx $acc01,$Tbl1,$acc01
654 rlwinm $acc09,$s3,`32-8`,24,31
655 lbzx $acc02,$Tbl1,$acc02
656 rlwinm $acc10,$s0,`32-8`,24,31
657 lbzx $acc03,$Tbl1,$acc03
658 rlwinm $acc11,$s1,`32-8`,24,31
659 lbzx $acc04,$Tbl1,$acc04
660 rlwinm $acc12,$s3,`0`,24,31
661 lbzx $acc05,$Tbl1,$acc05
662 rlwinm $acc13,$s0,`0`,24,31
663 lbzx $acc06,$Tbl1,$acc06
664 rlwinm $acc14,$s1,`0`,24,31
665 lbzx $acc07,$Tbl1,$acc07
666 rlwinm $acc15,$s2,`0`,24,31
667 lbzx $acc08,$Tbl1,$acc08
668 rlwinm $s0,$acc00,24,0,7
669 lbzx $acc09,$Tbl1,$acc09
670 rlwinm $s1,$acc01,24,0,7
671 lbzx $acc10,$Tbl1,$acc10
672 rlwinm $s2,$acc02,24,0,7
673 lbzx $acc11,$Tbl1,$acc11
674 rlwinm $s3,$acc03,24,0,7
675 lbzx $acc12,$Tbl1,$acc12
676 rlwimi $s0,$acc04,16,8,15
677 lbzx $acc13,$Tbl1,$acc13
678 rlwimi $s1,$acc05,16,8,15
679 lbzx $acc14,$Tbl1,$acc14
680 rlwimi $s2,$acc06,16,8,15
681 lbzx $acc15,$Tbl1,$acc15
682 rlwimi $s3,$acc07,16,8,15
683 rlwimi $s0,$acc08,8,16,23
684 rlwimi $s1,$acc09,8,16,23
685 rlwimi $s2,$acc10,8,16,23
686 rlwimi $s3,$acc11,8,16,23
687 lwz $t0,0($key)
688 or $s0,$s0,$acc12
689 lwz $t1,4($key)
690 or $s1,$s1,$acc13
691 lwz $t2,8($key)
692 or $s2,$s2,$acc14
693 lwz $t3,12($key)
694 or $s3,$s3,$acc15
695
696 addi $key,$key,16
697 bdz Lenc_compact_done
698
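	# GF(2^8) doubling of all four state bytes at once (SWAR "xtime"):
	#   hi = s & 0x80808080
	#   2*s = ((s & 0x7f7f7f7f) << 1) ^ ((hi - (hi >> 7)) & 0x1b1b1b1b)
	# where (hi - (hi >> 7)) & 0x1b1b1b1b leaves 0x1b, the AES reduction
	# polynomial, in each byte whose top bit was set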
699 and $acc00,$s0,$mask80 # r1=r0&0x80808080
700 and $acc01,$s1,$mask80
701 and $acc02,$s2,$mask80
702 and $acc03,$s3,$mask80
703 srwi $acc04,$acc00,7 # r1>>7
704 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
705 srwi $acc05,$acc01,7
706 andc $acc09,$s1,$mask80
707 srwi $acc06,$acc02,7
708 andc $acc10,$s2,$mask80
709 srwi $acc07,$acc03,7
710 andc $acc11,$s3,$mask80
711 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
712 sub $acc01,$acc01,$acc05
713 sub $acc02,$acc02,$acc06
714 sub $acc03,$acc03,$acc07
715 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
716 add $acc09,$acc09,$acc09
717 add $acc10,$acc10,$acc10
718 add $acc11,$acc11,$acc11
719 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
720 and $acc01,$acc01,$mask1b
721 and $acc02,$acc02,$mask1b
722 and $acc03,$acc03,$mask1b
723 xor $acc00,$acc00,$acc08 # r2
724 xor $acc01,$acc01,$acc09
725 rotlwi $acc12,$s0,16 # ROTATE(r0,16)
726 xor $acc02,$acc02,$acc10
727 rotlwi $acc13,$s1,16
728 xor $acc03,$acc03,$acc11
729 rotlwi $acc14,$s2,16
730
731 xor $s0,$s0,$acc00 # r0^r2
732 rotlwi $acc15,$s3,16
733 xor $s1,$s1,$acc01
734 rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
735 xor $s2,$s2,$acc02
736 rotrwi $s1,$s1,24
737 xor $s3,$s3,$acc03
738 rotrwi $s2,$s2,24
739 xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
740 rotrwi $s3,$s3,24
741 xor $s1,$s1,$acc01
742 xor $s2,$s2,$acc02
743 xor $s3,$s3,$acc03
744 rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
745 xor $s0,$s0,$acc12 #
746 rotlwi $acc09,$acc13,8
747 xor $s1,$s1,$acc13
748 rotlwi $acc10,$acc14,8
749 xor $s2,$s2,$acc14
750 rotlwi $acc11,$acc15,8
751 xor $s3,$s3,$acc15
752 xor $s0,$s0,$acc08 #
753 xor $s1,$s1,$acc09
754 xor $s2,$s2,$acc10
755 xor $s3,$s3,$acc11
756
757 b Lenc_compact_loop
758.align 4
759Lenc_compact_done:
760 xor $s0,$s0,$t0
761 xor $s1,$s1,$t1
762 xor $s2,$s2,$t2
763 xor $s3,$s3,$t3
764 blr
765 .long 0
766 .byte 0,12,0x14,0,0,0,0,0
767
768.globl .AES_decrypt
769.align 7
770.AES_decrypt:
771 $STU $sp,-$FRAME($sp)
772 mflr r0
773
774 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
775 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
776 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
777 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
778 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
779 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
780 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
781 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
782 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
783 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
784 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
785 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
786 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
787 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
788 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
789 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
790 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
791 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
792 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
793 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
794 $PUSH r0,`$FRAME+$LRSAVE`($sp)
795
796 andi. $t0,$inp,3
797 andi. $t1,$out,3
798 or. $t0,$t0,$t1
799 bne Ldec_unaligned
800
801Ldec_unaligned_ok:
802 lwz $s0,0($inp)
803 lwz $s1,4($inp)
804 lwz $s2,8($inp)
805 lwz $s3,12($inp)
806 bl LAES_Td
807 bl Lppc_AES_decrypt_compact
808 stw $s0,0($out)
809 stw $s1,4($out)
810 stw $s2,8($out)
811 stw $s3,12($out)
812 b Ldec_done
813
814Ldec_unaligned:
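	# same page-crossing test as on the encrypt path: unaligned word
	# access is used unless the 16-byte block would cross a 4KB page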
815 subfic $t0,$inp,4096
816 subfic $t1,$out,4096
817 andi. $t0,$t0,4096-16
818 beq Ldec_xpage
819 andi. $t1,$t1,4096-16
820 bne Ldec_unaligned_ok
821
822Ldec_xpage:
823 lbz $acc00,0($inp)
824 lbz $acc01,1($inp)
825 lbz $acc02,2($inp)
826 lbz $s0,3($inp)
827 lbz $acc04,4($inp)
828 lbz $acc05,5($inp)
829 lbz $acc06,6($inp)
830 lbz $s1,7($inp)
831 lbz $acc08,8($inp)
832 lbz $acc09,9($inp)
833 lbz $acc10,10($inp)
834 insrwi $s0,$acc00,8,0
835 lbz $s2,11($inp)
836 insrwi $s1,$acc04,8,0
837 lbz $acc12,12($inp)
838 insrwi $s0,$acc01,8,8
839 lbz $acc13,13($inp)
840 insrwi $s1,$acc05,8,8
841 lbz $acc14,14($inp)
842 insrwi $s0,$acc02,8,16
843 lbz $s3,15($inp)
844 insrwi $s1,$acc06,8,16
845 insrwi $s2,$acc08,8,0
846 insrwi $s3,$acc12,8,0
847 insrwi $s2,$acc09,8,8
848 insrwi $s3,$acc13,8,8
849 insrwi $s2,$acc10,8,16
850 insrwi $s3,$acc14,8,16
851
852 bl LAES_Td
853 bl Lppc_AES_decrypt_compact
854
855 extrwi $acc00,$s0,8,0
856 extrwi $acc01,$s0,8,8
857 stb $acc00,0($out)
858 extrwi $acc02,$s0,8,16
859 stb $acc01,1($out)
860 stb $acc02,2($out)
861 extrwi $acc04,$s1,8,0
862 stb $s0,3($out)
863 extrwi $acc05,$s1,8,8
864 stb $acc04,4($out)
865 extrwi $acc06,$s1,8,16
866 stb $acc05,5($out)
867 stb $acc06,6($out)
868 extrwi $acc08,$s2,8,0
869 stb $s1,7($out)
870 extrwi $acc09,$s2,8,8
871 stb $acc08,8($out)
872 extrwi $acc10,$s2,8,16
873 stb $acc09,9($out)
874 stb $acc10,10($out)
875 extrwi $acc12,$s3,8,0
876 stb $s2,11($out)
877 extrwi $acc13,$s3,8,8
878 stb $acc12,12($out)
879 extrwi $acc14,$s3,8,16
880 stb $acc13,13($out)
881 stb $acc14,14($out)
882 stb $s3,15($out)
883
884Ldec_done:
885 $POP r0,`$FRAME+$LRSAVE`($sp)
886 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
887 $POP r13,`$FRAME-$SIZE_T*19`($sp)
888 $POP r14,`$FRAME-$SIZE_T*18`($sp)
889 $POP r15,`$FRAME-$SIZE_T*17`($sp)
890 $POP r16,`$FRAME-$SIZE_T*16`($sp)
891 $POP r17,`$FRAME-$SIZE_T*15`($sp)
892 $POP r18,`$FRAME-$SIZE_T*14`($sp)
893 $POP r19,`$FRAME-$SIZE_T*13`($sp)
894 $POP r20,`$FRAME-$SIZE_T*12`($sp)
895 $POP r21,`$FRAME-$SIZE_T*11`($sp)
896 $POP r22,`$FRAME-$SIZE_T*10`($sp)
897 $POP r23,`$FRAME-$SIZE_T*9`($sp)
898 $POP r24,`$FRAME-$SIZE_T*8`($sp)
899 $POP r25,`$FRAME-$SIZE_T*7`($sp)
900 $POP r26,`$FRAME-$SIZE_T*6`($sp)
901 $POP r27,`$FRAME-$SIZE_T*5`($sp)
902 $POP r28,`$FRAME-$SIZE_T*4`($sp)
903 $POP r29,`$FRAME-$SIZE_T*3`($sp)
904 $POP r30,`$FRAME-$SIZE_T*2`($sp)
905 $POP r31,`$FRAME-$SIZE_T*1`($sp)
906 mtlr r0
907 addi $sp,$sp,$FRAME
908 blr
909 .long 0
910 .byte 0,12,4,1,0x80,18,3,0
911 .long 0
912
913.align 5
914Lppc_AES_decrypt:
915 lwz $acc00,240($key)
916 addi $Tbl1,$Tbl0,3
917 lwz $t0,0($key)
918 addi $Tbl2,$Tbl0,2
919 lwz $t1,4($key)
920 addi $Tbl3,$Tbl0,1
921 lwz $t2,8($key)
922 addi $acc00,$acc00,-1
923 lwz $t3,12($key)
924 addi $key,$key,16
925 xor $s0,$s0,$t0
926 xor $s1,$s1,$t1
927 xor $s2,$s2,$t2
928 xor $s3,$s3,$t3
929 mtctr $acc00
930.align 4
931Ldec_loop:
932 rlwinm $acc00,$s0,`32-24+3`,21,28
933 rlwinm $acc01,$s1,`32-24+3`,21,28
934 rlwinm $acc02,$s2,`32-24+3`,21,28
935 rlwinm $acc03,$s3,`32-24+3`,21,28
936 lwz $t0,0($key)
937 rlwinm $acc04,$s3,`32-16+3`,21,28
938 lwz $t1,4($key)
939 rlwinm $acc05,$s0,`32-16+3`,21,28
940 lwz $t2,8($key)
941 rlwinm $acc06,$s1,`32-16+3`,21,28
942 lwz $t3,12($key)
943 rlwinm $acc07,$s2,`32-16+3`,21,28
944 lwzx $acc00,$Tbl0,$acc00
945 rlwinm $acc08,$s2,`32-8+3`,21,28
946 lwzx $acc01,$Tbl0,$acc01
947 rlwinm $acc09,$s3,`32-8+3`,21,28
948 lwzx $acc02,$Tbl0,$acc02
949 rlwinm $acc10,$s0,`32-8+3`,21,28
950 lwzx $acc03,$Tbl0,$acc03
951 rlwinm $acc11,$s1,`32-8+3`,21,28
952 lwzx $acc04,$Tbl1,$acc04
953 rlwinm $acc12,$s1,`0+3`,21,28
954 lwzx $acc05,$Tbl1,$acc05
955 rlwinm $acc13,$s2,`0+3`,21,28
956 lwzx $acc06,$Tbl1,$acc06
957 rlwinm $acc14,$s3,`0+3`,21,28
958 lwzx $acc07,$Tbl1,$acc07
959 rlwinm $acc15,$s0,`0+3`,21,28
960 lwzx $acc08,$Tbl2,$acc08
961 xor $t0,$t0,$acc00
962 lwzx $acc09,$Tbl2,$acc09
963 xor $t1,$t1,$acc01
964 lwzx $acc10,$Tbl2,$acc10
965 xor $t2,$t2,$acc02
966 lwzx $acc11,$Tbl2,$acc11
967 xor $t3,$t3,$acc03
968 lwzx $acc12,$Tbl3,$acc12
969 xor $t0,$t0,$acc04
970 lwzx $acc13,$Tbl3,$acc13
971 xor $t1,$t1,$acc05
972 lwzx $acc14,$Tbl3,$acc14
973 xor $t2,$t2,$acc06
974 lwzx $acc15,$Tbl3,$acc15
975 xor $t3,$t3,$acc07
976 xor $t0,$t0,$acc08
977 xor $t1,$t1,$acc09
978 xor $t2,$t2,$acc10
979 xor $t3,$t3,$acc11
980 xor $s0,$t0,$acc12
981 xor $s1,$t1,$acc13
982 xor $s2,$t2,$acc14
983 xor $s3,$t3,$acc15
984 addi $key,$key,16
985 bdnz- Ldec_loop
986
987 addi $Tbl2,$Tbl0,2048
988 nop
989 lwz $t0,0($key)
990 rlwinm $acc00,$s0,`32-24`,24,31
991 lwz $t1,4($key)
992 rlwinm $acc01,$s1,`32-24`,24,31
993 lwz $t2,8($key)
994 rlwinm $acc02,$s2,`32-24`,24,31
995 lwz $t3,12($key)
996 rlwinm $acc03,$s3,`32-24`,24,31
997 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
998 rlwinm $acc04,$s3,`32-16`,24,31
999 lwz $acc09,`2048+32`($Tbl0)
1000 rlwinm $acc05,$s0,`32-16`,24,31
1001 lwz $acc10,`2048+64`($Tbl0)
1002 lbzx $acc00,$Tbl2,$acc00
1003 lwz $acc11,`2048+96`($Tbl0)
1004 lbzx $acc01,$Tbl2,$acc01
1005 lwz $acc12,`2048+128`($Tbl0)
1006 rlwinm $acc06,$s1,`32-16`,24,31
1007 lwz $acc13,`2048+160`($Tbl0)
1008 rlwinm $acc07,$s2,`32-16`,24,31
1009 lwz $acc14,`2048+192`($Tbl0)
1010 rlwinm $acc08,$s2,`32-8`,24,31
1011 lwz $acc15,`2048+224`($Tbl0)
1012 rlwinm $acc09,$s3,`32-8`,24,31
1013 lbzx $acc02,$Tbl2,$acc02
1014 rlwinm $acc10,$s0,`32-8`,24,31
1015 lbzx $acc03,$Tbl2,$acc03
1016 rlwinm $acc11,$s1,`32-8`,24,31
1017 lbzx $acc04,$Tbl2,$acc04
1018 rlwinm $acc12,$s1,`0`,24,31
1019 lbzx $acc05,$Tbl2,$acc05
1020 rlwinm $acc13,$s2,`0`,24,31
1021 lbzx $acc06,$Tbl2,$acc06
1022 rlwinm $acc14,$s3,`0`,24,31
1023 lbzx $acc07,$Tbl2,$acc07
1024 rlwinm $acc15,$s0,`0`,24,31
1025 lbzx $acc08,$Tbl2,$acc08
1026 rlwinm $s0,$acc00,24,0,7
1027 lbzx $acc09,$Tbl2,$acc09
1028 rlwinm $s1,$acc01,24,0,7
1029 lbzx $acc10,$Tbl2,$acc10
1030 rlwinm $s2,$acc02,24,0,7
1031 lbzx $acc11,$Tbl2,$acc11
1032 rlwinm $s3,$acc03,24,0,7
1033 lbzx $acc12,$Tbl2,$acc12
1034 rlwimi $s0,$acc04,16,8,15
1035 lbzx $acc13,$Tbl2,$acc13
1036 rlwimi $s1,$acc05,16,8,15
1037 lbzx $acc14,$Tbl2,$acc14
1038 rlwimi $s2,$acc06,16,8,15
1039 lbzx $acc15,$Tbl2,$acc15
1040 rlwimi $s3,$acc07,16,8,15
1041 rlwimi $s0,$acc08,8,16,23
1042 rlwimi $s1,$acc09,8,16,23
1043 rlwimi $s2,$acc10,8,16,23
1044 rlwimi $s3,$acc11,8,16,23
1045 or $s0,$s0,$acc12
1046 or $s1,$s1,$acc13
1047 or $s2,$s2,$acc14
1048 or $s3,$s3,$acc15
1049 xor $s0,$s0,$t0
1050 xor $s1,$s1,$t1
1051 xor $s2,$s2,$t2
1052 xor $s3,$s3,$t3
1053 blr
1054 .long 0
1055 .byte 0,12,0x14,0,0,0,0,0
1056
1057.align 4
1058Lppc_AES_decrypt_compact:
1059 lwz $acc00,240($key)
1060 addi $Tbl1,$Tbl0,2048
1061 lwz $t0,0($key)
1062 lis $mask80,0x8080
1063 lwz $t1,4($key)
1064 lis $mask1b,0x1b1b
1065 lwz $t2,8($key)
1066 ori $mask80,$mask80,0x8080
1067 lwz $t3,12($key)
1068 ori $mask1b,$mask1b,0x1b1b
1069 addi $key,$key,16
1070___
1071$code.=<<___ if ($SIZE_T==8);
1072 insrdi $mask80,$mask80,32,0
1073 insrdi $mask1b,$mask1b,32,0
1074___
1075$code.=<<___;
1076 mtctr $acc00
1077.align 4
1078Ldec_compact_loop:
1079 xor $s0,$s0,$t0
1080 xor $s1,$s1,$t1
1081 rlwinm $acc00,$s0,`32-24`,24,31
1082 xor $s2,$s2,$t2
1083 rlwinm $acc01,$s1,`32-24`,24,31
1084 xor $s3,$s3,$t3
1085 rlwinm $acc02,$s2,`32-24`,24,31
1086 rlwinm $acc03,$s3,`32-24`,24,31
1087 rlwinm $acc04,$s3,`32-16`,24,31
1088 rlwinm $acc05,$s0,`32-16`,24,31
1089 rlwinm $acc06,$s1,`32-16`,24,31
1090 rlwinm $acc07,$s2,`32-16`,24,31
1091 lbzx $acc00,$Tbl1,$acc00
1092 rlwinm $acc08,$s2,`32-8`,24,31
1093 lbzx $acc01,$Tbl1,$acc01
1094 rlwinm $acc09,$s3,`32-8`,24,31
1095 lbzx $acc02,$Tbl1,$acc02
1096 rlwinm $acc10,$s0,`32-8`,24,31
1097 lbzx $acc03,$Tbl1,$acc03
1098 rlwinm $acc11,$s1,`32-8`,24,31
1099 lbzx $acc04,$Tbl1,$acc04
1100 rlwinm $acc12,$s1,`0`,24,31
1101 lbzx $acc05,$Tbl1,$acc05
1102 rlwinm $acc13,$s2,`0`,24,31
1103 lbzx $acc06,$Tbl1,$acc06
1104 rlwinm $acc14,$s3,`0`,24,31
1105 lbzx $acc07,$Tbl1,$acc07
1106 rlwinm $acc15,$s0,`0`,24,31
1107 lbzx $acc08,$Tbl1,$acc08
1108 rlwinm $s0,$acc00,24,0,7
1109 lbzx $acc09,$Tbl1,$acc09
1110 rlwinm $s1,$acc01,24,0,7
1111 lbzx $acc10,$Tbl1,$acc10
1112 rlwinm $s2,$acc02,24,0,7
1113 lbzx $acc11,$Tbl1,$acc11
1114 rlwinm $s3,$acc03,24,0,7
1115 lbzx $acc12,$Tbl1,$acc12
1116 rlwimi $s0,$acc04,16,8,15
1117 lbzx $acc13,$Tbl1,$acc13
1118 rlwimi $s1,$acc05,16,8,15
1119 lbzx $acc14,$Tbl1,$acc14
1120 rlwimi $s2,$acc06,16,8,15
1121 lbzx $acc15,$Tbl1,$acc15
1122 rlwimi $s3,$acc07,16,8,15
1123 rlwimi $s0,$acc08,8,16,23
1124 rlwimi $s1,$acc09,8,16,23
1125 rlwimi $s2,$acc10,8,16,23
1126 rlwimi $s3,$acc11,8,16,23
1127 lwz $t0,0($key)
1128 or $s0,$s0,$acc12
1129 lwz $t1,4($key)
1130 or $s1,$s1,$acc13
1131 lwz $t2,8($key)
1132 or $s2,$s2,$acc14
1133 lwz $t3,12($key)
1134 or $s3,$s3,$acc15
1135
1136 addi $key,$key,16
1137 bdz Ldec_compact_done
1138___
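# The mask-and-xor rounds below compute r2=2*r0, r4=2*r2 and r8=2*r4 in
# GF(2^8) across the packed bytes; InvMixColumns is then assembled from
# r0, r2, r4, r8 and byte rotations.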
1139$code.=<<___ if ($SIZE_T==8);
1140 # vectorized permutation improves decrypt performance by 10%
1141 insrdi $s0,$s1,32,0
1142 insrdi $s2,$s3,32,0
1143
1144 and $acc00,$s0,$mask80 # r1=r0&0x80808080
1145 and $acc02,$s2,$mask80
1146 srdi $acc04,$acc00,7 # r1>>7
1147 srdi $acc06,$acc02,7
1148 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
1149 andc $acc10,$s2,$mask80
1150 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
1151 sub $acc02,$acc02,$acc06
1152 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
1153 add $acc10,$acc10,$acc10
1154 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1155 and $acc02,$acc02,$mask1b
1156 xor $acc00,$acc00,$acc08 # r2
1157 xor $acc02,$acc02,$acc10
1158
1159 and $acc04,$acc00,$mask80 # r1=r2&0x80808080
1160 and $acc06,$acc02,$mask80
1161 srdi $acc08,$acc04,7 # r1>>7
1162 srdi $acc10,$acc06,7
1163 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
1164 andc $acc14,$acc02,$mask80
1165 sub $acc04,$acc04,$acc08 # r1-(r1>>7)
1166 sub $acc06,$acc06,$acc10
1167 add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
1168 add $acc14,$acc14,$acc14
1169 and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1170 and $acc06,$acc06,$mask1b
1171 xor $acc04,$acc04,$acc12 # r4
1172 xor $acc06,$acc06,$acc14
1173
1174 and $acc08,$acc04,$mask80 # r1=r4&0x80808080
1175 and $acc10,$acc06,$mask80
1176 srdi $acc12,$acc08,7 # r1>>7
1177 srdi $acc14,$acc10,7
1178 sub $acc08,$acc08,$acc12 # r1-(r1>>7)
1179 sub $acc10,$acc10,$acc14
1180 andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
1181 andc $acc14,$acc06,$mask80
1182 add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
1183 add $acc14,$acc14,$acc14
1184 and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1185 and $acc10,$acc10,$mask1b
1186 xor $acc08,$acc08,$acc12 # r8
1187 xor $acc10,$acc10,$acc14
1188
1189 xor $acc00,$acc00,$s0 # r2^r0
1190 xor $acc02,$acc02,$s2
1191 xor $acc04,$acc04,$s0 # r4^r0
1192 xor $acc06,$acc06,$s2
1193
1194 extrdi $acc01,$acc00,32,0
1195 extrdi $acc03,$acc02,32,0
1196 extrdi $acc05,$acc04,32,0
1197 extrdi $acc07,$acc06,32,0
1198 extrdi $acc09,$acc08,32,0
1199 extrdi $acc11,$acc10,32,0
1200___
1201$code.=<<___ if ($SIZE_T==4);
1202 and $acc00,$s0,$mask80 # r1=r0&0x80808080
1203 and $acc01,$s1,$mask80
1204 and $acc02,$s2,$mask80
1205 and $acc03,$s3,$mask80
1206 srwi $acc04,$acc00,7 # r1>>7
1207 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
1208 srwi $acc05,$acc01,7
1209 andc $acc09,$s1,$mask80
1210 srwi $acc06,$acc02,7
1211 andc $acc10,$s2,$mask80
1212 srwi $acc07,$acc03,7
1213 andc $acc11,$s3,$mask80
1214 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
1215 sub $acc01,$acc01,$acc05
1216 sub $acc02,$acc02,$acc06
1217 sub $acc03,$acc03,$acc07
1218 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
1219 add $acc09,$acc09,$acc09
1220 add $acc10,$acc10,$acc10
1221 add $acc11,$acc11,$acc11
1222 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1223 and $acc01,$acc01,$mask1b
1224 and $acc02,$acc02,$mask1b
1225 and $acc03,$acc03,$mask1b
1226 xor $acc00,$acc00,$acc08 # r2
1227 xor $acc01,$acc01,$acc09
1228 xor $acc02,$acc02,$acc10
1229 xor $acc03,$acc03,$acc11
1230
1231 and $acc04,$acc00,$mask80 # r1=r2&0x80808080
1232 and $acc05,$acc01,$mask80
1233 and $acc06,$acc02,$mask80
1234 and $acc07,$acc03,$mask80
1235 srwi $acc08,$acc04,7 # r1>>7
1236 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
1237 srwi $acc09,$acc05,7
1238 andc $acc13,$acc01,$mask80
1239 srwi $acc10,$acc06,7
1240 andc $acc14,$acc02,$mask80
1241 srwi $acc11,$acc07,7
1242 andc $acc15,$acc03,$mask80
1243 sub $acc04,$acc04,$acc08 # r1-(r1>>7)
1244 sub $acc05,$acc05,$acc09
1245 sub $acc06,$acc06,$acc10
1246 sub $acc07,$acc07,$acc11
1247 add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
1248 add $acc13,$acc13,$acc13
1249 add $acc14,$acc14,$acc14
1250 add $acc15,$acc15,$acc15
1251 and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1252 and $acc05,$acc05,$mask1b
1253 and $acc06,$acc06,$mask1b
1254 and $acc07,$acc07,$mask1b
1255 xor $acc04,$acc04,$acc12 # r4
1256 xor $acc05,$acc05,$acc13
1257 xor $acc06,$acc06,$acc14
1258 xor $acc07,$acc07,$acc15
1259
1260 and $acc08,$acc04,$mask80 # r1=r4&0x80808080
1261 and $acc09,$acc05,$mask80
1262 srwi $acc12,$acc08,7 # r1>>7
1263 and $acc10,$acc06,$mask80
1264 srwi $acc13,$acc09,7
1265 and $acc11,$acc07,$mask80
1266 srwi $acc14,$acc10,7
1267 sub $acc08,$acc08,$acc12 # r1-(r1>>7)
1268 srwi $acc15,$acc11,7
1269 sub $acc09,$acc09,$acc13
1270 sub $acc10,$acc10,$acc14
1271 sub $acc11,$acc11,$acc15
1272 andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
1273 andc $acc13,$acc05,$mask80
1274 andc $acc14,$acc06,$mask80
1275 andc $acc15,$acc07,$mask80
1276 add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
1277 add $acc13,$acc13,$acc13
1278 add $acc14,$acc14,$acc14
1279 add $acc15,$acc15,$acc15
1280 and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1281 and $acc09,$acc09,$mask1b
1282 and $acc10,$acc10,$mask1b
1283 and $acc11,$acc11,$mask1b
1284 xor $acc08,$acc08,$acc12 # r8
1285 xor $acc09,$acc09,$acc13
1286 xor $acc10,$acc10,$acc14
1287 xor $acc11,$acc11,$acc15
1288
1289 xor $acc00,$acc00,$s0 # r2^r0
1290 xor $acc01,$acc01,$s1
1291 xor $acc02,$acc02,$s2
1292 xor $acc03,$acc03,$s3
1293 xor $acc04,$acc04,$s0 # r4^r0
1294 xor $acc05,$acc05,$s1
1295 xor $acc06,$acc06,$s2
1296 xor $acc07,$acc07,$s3
1297___
1298$code.=<<___;
1299 rotrwi $s0,$s0,8 # = ROTATE(r0,8)
1300 rotrwi $s1,$s1,8
1301 xor $s0,$s0,$acc00 # ^= r2^r0
1302 rotrwi $s2,$s2,8
1303 xor $s1,$s1,$acc01
1304 rotrwi $s3,$s3,8
1305 xor $s2,$s2,$acc02
1306 xor $s3,$s3,$acc03
1307 xor $acc00,$acc00,$acc08
1308 xor $acc01,$acc01,$acc09
1309 xor $acc02,$acc02,$acc10
1310 xor $acc03,$acc03,$acc11
1311 xor $s0,$s0,$acc04 # ^= r4^r0
1312 rotrwi $acc00,$acc00,24
1313 xor $s1,$s1,$acc05
1314 rotrwi $acc01,$acc01,24
1315 xor $s2,$s2,$acc06
1316 rotrwi $acc02,$acc02,24
1317 xor $s3,$s3,$acc07
1318 rotrwi $acc03,$acc03,24
1319 xor $acc04,$acc04,$acc08
1320 xor $acc05,$acc05,$acc09
1321 xor $acc06,$acc06,$acc10
1322 xor $acc07,$acc07,$acc11
1323 xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
1324 rotrwi $acc04,$acc04,16
1325 xor $s1,$s1,$acc09
1326 rotrwi $acc05,$acc05,16
1327 xor $s2,$s2,$acc10
1328 rotrwi $acc06,$acc06,16
1329 xor $s3,$s3,$acc11
1330 rotrwi $acc07,$acc07,16
1331 xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
1332 rotrwi $acc08,$acc08,8
1333 xor $s1,$s1,$acc01
1334 rotrwi $acc09,$acc09,8
1335 xor $s2,$s2,$acc02
1336 rotrwi $acc10,$acc10,8
1337 xor $s3,$s3,$acc03
1338 rotrwi $acc11,$acc11,8
1339 xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
1340 xor $s1,$s1,$acc05
1341 xor $s2,$s2,$acc06
1342 xor $s3,$s3,$acc07
1343 xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
1344 xor $s1,$s1,$acc09
1345 xor $s2,$s2,$acc10
1346 xor $s3,$s3,$acc11
1347
1348 b Ldec_compact_loop
1349.align 4
1350Ldec_compact_done:
1351 xor $s0,$s0,$t0
1352 xor $s1,$s1,$t1
1353 xor $s2,$s2,$t2
1354 xor $s3,$s3,$t3
1355 blr
1356 .long 0
1357 .byte 0,12,0x14,0,0,0,0,0
1358
1359.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
1360.align 7
1361___
1362
1363$code =~ s/\`([^\`]*)\`/eval $1/gem;
1364print $code;
1365close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl
deleted file mode 100644
index 71d5b55077..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-s390x.pl
+++ /dev/null
@@ -1,2237 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for s390x.
11
12# April 2007.
13#
14# Software performance improvement over gcc-generated code is ~70% and
15# in absolute terms is ~73 cycles per byte processed with 128-bit key.
16# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
17# *strictly* in-order execution and an issued instruction [in this case
18# the load from memory is critical] has to complete before execution
19# flow proceeds. S-boxes are compressed to 2KB[+256B].
20#
21# As for hardware acceleration support: it's basically a "teaser," as
22# it can and should be improved in several ways. Most notably support
23# for CBC is not utilized, nor are multiple blocks ever processed.
24# Then software key schedule can be postponed till hardware support
25# detection... Performance improvement over assembler is reportedly
26# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
27# support is implemented.
28
29# May 2007.
30#
31# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
32# for 128-bit keys, if hardware support is detected.
33
34# January 2009.
35#
36# Add support for hardware AES192/256 and reschedule instructions to
37# minimize/avoid Address Generation Interlock hazard and to favour
38# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
39# almost 50% on z9. The gain is smaller on z10, because being dual-
40# issue z10 makes it impossible to eliminate the interlock condition:
41# critical path is not long enough. Yet it spends ~24 cycles per byte
42# processed with 128-bit key.
43#
44# Unlike in the previous version, hardware support detection takes place only
45# at the moment of key schedule setup, which is denoted in key->rounds.
46# This is done because deferred key setup can't be made MT-safe, not
47# for keys longer than 128 bits.
48#
49# Add AES_cbc_encrypt, which gives incredible performance improvement,
50# it was measured to be ~6.6x. It's less than the previously mentioned 8x
51# because the software implementation was optimized.
52
53# May 2010.
54#
55# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
56# performance improvement over "generic" counter mode routine relying
57# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
58# to the fact that exact throughput value depends on current stack
59# frame alignment within a 4KB page. In the worst case you get ~75% of
60# the maximum, but *on average* it would be as much as ~98%, meaning the
61# worst case is unlikely; it's like hitting a ravine on a plateau.
62
63# November 2010.
64#
65# Adapt for -m31 build. If kernel supports what's called "highgprs"
66# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
67# instructions and achieve "64-bit" performance even in 31-bit legacy
68# application context. The feature is not specific to any particular
 69# processor, as long as it's a "z-CPU". The latter implies that the code
70# remains z/Architecture specific. On z990 it was measured to perform
71# 2x better than code generated by gcc 4.3.
72
73# December 2010.
74#
75# Add support for z196 "cipher message with counter" instruction.
76# Note however that it's disengaged, because it was measured to
77# perform ~12% worse than vanilla km-based code...
78
79# February 2011.
80#
81# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
82# instructions, which deliver ~70% improvement at 8KB block size over
 83# vanilla km-based code, and 37% at 512-byte block size.
84
85$flavour = shift;
86
87if ($flavour =~ /3[12]/) {
88 $SIZE_T=4;
89 $g="";
90} else {
91 $SIZE_T=8;
92 $g="g";
93}
94
95while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
96open STDOUT,">$output";
97
98$softonly=0; # allow hardware support
99
100$t0="%r0"; $mask="%r0";
101$t1="%r1";
102$t2="%r2"; $inp="%r2";
103$t3="%r3"; $out="%r3"; $bits="%r3";
104$key="%r4";
105$i1="%r5";
106$i2="%r6";
107$i3="%r7";
108$s0="%r8";
109$s1="%r9";
110$s2="%r10";
111$s3="%r11";
112$tbl="%r12";
113$rounds="%r13";
114$ra="%r14";
115$sp="%r15";
116
117$stdframe=16*$SIZE_T+4*8;
118
119sub _data_word()
120{ my $i;
121 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
122}
123
124$code=<<___;
125.text
126
127.type AES_Te,\@object
128.align 256
129AES_Te:
130___
131&_data_word(
132 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
133 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
134 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
135 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
136 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
137 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
138 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
139 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
140 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
141 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
142 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
143 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
144 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
145 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
146 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
147 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
148 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
149 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
150 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
151 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
152 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
153 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
154 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
155 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
156 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
157 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
158 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
159 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
160 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
161 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
162 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
163 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
164 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
165 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
166 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
167 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
168 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
169 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
170 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
171 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
172 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
173 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
174 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
175 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
176 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
177 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
178 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
179 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
180 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
181 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
182 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
183 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
184 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
185 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
186 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
187 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
188 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
189 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
190 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
191 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
192 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
193 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
194 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
195 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
196$code.=<<___;
197# Te4[256]
198.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
199.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
200.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
201.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
202.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
203.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
204.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
205.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
206.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
207.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
208.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
209.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
210.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
211.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
212.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
213.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
214.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
215.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
216.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
217.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
218.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
219.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
220.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
221.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
222.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
223.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
224.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
225.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
226.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
227.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
228.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
229.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
230# rcon[]
231.long 0x01000000, 0x02000000, 0x04000000, 0x08000000
232.long 0x10000000, 0x20000000, 0x40000000, 0x80000000
233.long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
234.align 256
235.size AES_Te,.-AES_Te
236
237# void AES_encrypt(const unsigned char *inp, unsigned char *out,
238# const AES_KEY *key) {
239.globl AES_encrypt
240.type AES_encrypt,\@function
241AES_encrypt:
242___
243$code.=<<___ if (!$softonly);
244 l %r0,240($key)
245 lhi %r1,16
246 clr %r0,%r1
247 jl .Lesoft
248
249 la %r1,0($key)
250 #la %r2,0($inp)
251 la %r4,0($out)
252 lghi %r3,16 # single block length
253 .long 0xb92e0042 # km %r4,%r2
254 brc 1,.-4 # can this happen?
255 br %r14
256.align 64
257.Lesoft:
258___
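# A hedged C sketch of the dispatch above (the convention, per the key
# setup further below: 240(key) holds a km function code 18/19/20 when
# hardware support was detected, or a round count 10/12/14 otherwise, so
# comparing it against 16 separates the two cases; the names here are
# illustrative only, not a real API):
#
#	if (key->rounds >= 16)		/* km code: hardware available	*/
#		km(key->rounds, key->rd_key, out, in, 16);
#	else				/* round count: table-driven	*/
#		/* fall through to .Lesoft */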
259$code.=<<___;
260 stm${g} %r3,$ra,3*$SIZE_T($sp)
261
262 llgf $s0,0($inp)
263 llgf $s1,4($inp)
264 llgf $s2,8($inp)
265 llgf $s3,12($inp)
266
267 larl $tbl,AES_Te
268 bras $ra,_s390x_AES_encrypt
269
270 l${g} $out,3*$SIZE_T($sp)
271 st $s0,0($out)
272 st $s1,4($out)
273 st $s2,8($out)
274 st $s3,12($out)
275
276 lm${g} %r6,$ra,6*$SIZE_T($sp)
277 br $ra
278.size AES_encrypt,.-AES_encrypt
279
280.type _s390x_AES_encrypt,\@function
281.align 16
282_s390x_AES_encrypt:
283 st${g} $ra,15*$SIZE_T($sp)
284 x $s0,0($key)
285 x $s1,4($key)
286 x $s2,8($key)
287 x $s3,12($key)
288 l $rounds,240($key)
289 llill $mask,`0xff<<3`
290 aghi $rounds,-1
291 j .Lenc_loop
292.align 16
293.Lenc_loop:
294 sllg $t1,$s0,`0+3`
295 srlg $t2,$s0,`8-3`
296 srlg $t3,$s0,`16-3`
297 srl $s0,`24-3`
298 nr $s0,$mask
299 ngr $t1,$mask
300 nr $t2,$mask
301 nr $t3,$mask
302
303 srlg $i1,$s1,`16-3` # i0
304 sllg $i2,$s1,`0+3`
305 srlg $i3,$s1,`8-3`
306 srl $s1,`24-3`
307 nr $i1,$mask
308 nr $s1,$mask
309 ngr $i2,$mask
310 nr $i3,$mask
311
312 l $s0,0($s0,$tbl) # Te0[s0>>24]
313 l $t1,1($t1,$tbl) # Te3[s0>>0]
314 l $t2,2($t2,$tbl) # Te2[s0>>8]
315 l $t3,3($t3,$tbl) # Te1[s0>>16]
316
317 x $s0,3($i1,$tbl) # Te1[s1>>16]
318 l $s1,0($s1,$tbl) # Te0[s1>>24]
319 x $t2,1($i2,$tbl) # Te3[s1>>0]
320 x $t3,2($i3,$tbl) # Te2[s1>>8]
321
322 srlg $i1,$s2,`8-3` # i0
323 srlg $i2,$s2,`16-3` # i1
324 nr $i1,$mask
325 nr $i2,$mask
326 sllg $i3,$s2,`0+3`
327 srl $s2,`24-3`
328 nr $s2,$mask
329 ngr $i3,$mask
330
331 xr $s1,$t1
332 srlg $ra,$s3,`8-3` # i1
333 sllg $t1,$s3,`0+3` # i0
334 nr $ra,$mask
335 la $key,16($key)
336 ngr $t1,$mask
337
338 x $s0,2($i1,$tbl) # Te2[s2>>8]
339 x $s1,3($i2,$tbl) # Te1[s2>>16]
340 l $s2,0($s2,$tbl) # Te0[s2>>24]
341 x $t3,1($i3,$tbl) # Te3[s2>>0]
342
343 srlg $i3,$s3,`16-3` # i2
344 xr $s2,$t2
345 srl $s3,`24-3`
346 nr $i3,$mask
347 nr $s3,$mask
348
349 x $s0,0($key)
350 x $s1,4($key)
351 x $s2,8($key)
352 x $t3,12($key)
353
354 x $s0,1($t1,$tbl) # Te3[s3>>0]
355 x $s1,2($ra,$tbl) # Te2[s3>>8]
356 x $s2,3($i3,$tbl) # Te1[s3>>16]
357 l $s3,0($s3,$tbl) # Te0[s3>>24]
358 xr $s3,$t3
359
360 brct $rounds,.Lenc_loop
361 .align 16
362
363 sllg $t1,$s0,`0+3`
364 srlg $t2,$s0,`8-3`
365 ngr $t1,$mask
366 srlg $t3,$s0,`16-3`
367 srl $s0,`24-3`
368 nr $s0,$mask
369 nr $t2,$mask
370 nr $t3,$mask
371
372 srlg $i1,$s1,`16-3` # i0
373 sllg $i2,$s1,`0+3`
374 ngr $i2,$mask
375 srlg $i3,$s1,`8-3`
376 srl $s1,`24-3`
377 nr $i1,$mask
378 nr $s1,$mask
379 nr $i3,$mask
380
381 llgc $s0,2($s0,$tbl) # Te4[s0>>24]
382 llgc $t1,2($t1,$tbl) # Te4[s0>>0]
383 sll $s0,24
384 llgc $t2,2($t2,$tbl) # Te4[s0>>8]
385 llgc $t3,2($t3,$tbl) # Te4[s0>>16]
386 sll $t2,8
387 sll $t3,16
388
389 llgc $i1,2($i1,$tbl) # Te4[s1>>16]
390 llgc $s1,2($s1,$tbl) # Te4[s1>>24]
391 llgc $i2,2($i2,$tbl) # Te4[s1>>0]
392 llgc $i3,2($i3,$tbl) # Te4[s1>>8]
393 sll $i1,16
394 sll $s1,24
395 sll $i3,8
396 or $s0,$i1
397 or $s1,$t1
398 or $t2,$i2
399 or $t3,$i3
400
401 srlg $i1,$s2,`8-3` # i0
402 srlg $i2,$s2,`16-3` # i1
403 nr $i1,$mask
404 nr $i2,$mask
405 sllg $i3,$s2,`0+3`
406 srl $s2,`24-3`
407 ngr $i3,$mask
408 nr $s2,$mask
409
410 sllg $t1,$s3,`0+3` # i0
411 srlg $ra,$s3,`8-3` # i1
412 ngr $t1,$mask
413
414 llgc $i1,2($i1,$tbl) # Te4[s2>>8]
415 llgc $i2,2($i2,$tbl) # Te4[s2>>16]
416 sll $i1,8
417 llgc $s2,2($s2,$tbl) # Te4[s2>>24]
418 llgc $i3,2($i3,$tbl) # Te4[s2>>0]
419 sll $i2,16
420 nr $ra,$mask
421 sll $s2,24
422 or $s0,$i1
423 or $s1,$i2
424 or $s2,$t2
425 or $t3,$i3
426
427 srlg $i3,$s3,`16-3` # i2
428 srl $s3,`24-3`
429 nr $i3,$mask
430 nr $s3,$mask
431
432 l $t0,16($key)
433 l $t2,20($key)
434
435 llgc $i1,2($t1,$tbl) # Te4[s3>>0]
436 llgc $i2,2($ra,$tbl) # Te4[s3>>8]
437 llgc $i3,2($i3,$tbl) # Te4[s3>>16]
438 llgc $s3,2($s3,$tbl) # Te4[s3>>24]
439 sll $i2,8
440 sll $i3,16
441 sll $s3,24
442 or $s0,$i1
443 or $s1,$i2
444 or $s2,$i3
445 or $s3,$t3
446
447 l${g} $ra,15*$SIZE_T($sp)
448 xr $s0,$t0
449 xr $s1,$t2
450 x $s2,24($key)
451 x $s3,28($key)
452
453 br $ra
454.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
455___
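# A note on the addressing scheme in _s390x_AES_encrypt: the backquoted
# expressions, e.g. `0xff<<3` and `8-3`, are evaluated by the
# s/\`...\`/eval/ substitution at the end of this script, so the mask is
# really 0x7f8 and every byte index arrives pre-scaled by 8. AES_Te
# stores each word twice (.long t,t), so a 4-byte load at displacement k
# (0..3) into such a pair returns the word rotated by k bytes. A hedged
# C sketch of the idea (hypothetical helper, big-endian load as on s390x):
#
#	uint32_t te(const unsigned char *tbl, unsigned idx8, unsigned k)
#	{
#		uint32_t w;
#		memcpy(&w, tbl + idx8 + k, 4);	/* idx8 = index<<3	*/
#		return w;			/* word rotated by 8*k	*/
#	}
#
# This is how one 2KB table (plus 256B of Te4) stands in for Te0..Te3.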
456
457$code.=<<___;
458.type AES_Td,\@object
459.align 256
460AES_Td:
461___
462&_data_word(
463 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
464 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
465 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
466 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
467 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
468 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
469 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
470 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
471 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
472 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
473 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
474 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
475 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
476 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
477 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
478 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
479 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
480 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
481 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
482 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
483 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
484 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
485 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
486 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
487 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
488 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
489 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
490 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
491 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
492 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
493 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
494 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
495 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
496 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
497 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
498 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
499 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
500 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
501 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
502 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
503 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
504 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
505 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
506 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
507 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
508 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
509 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
510 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
511 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
512 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
513 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
514 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
515 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
516 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
517 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
518 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
519 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
520 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
521 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
522 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
523 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
524 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
525 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
526 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
527$code.=<<___;
528# Td4[256]
529.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
530.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
531.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
532.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
533.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
534.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
535.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
536.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
537.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
538.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
539.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
540.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
541.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
542.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
543.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
544.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
545.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
546.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
547.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
548.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
549.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
550.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
551.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
552.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
553.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
554.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
555.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
556.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
557.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
558.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
559.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
560.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
561.size AES_Td,.-AES_Td
562
563# void AES_decrypt(const unsigned char *inp, unsigned char *out,
564# const AES_KEY *key) {
565.globl AES_decrypt
566.type AES_decrypt,\@function
567AES_decrypt:
568___
569$code.=<<___ if (!$softonly);
570 l %r0,240($key)
571 lhi %r1,16
572 clr %r0,%r1
573 jl .Ldsoft
574
575 la %r1,0($key)
576 #la %r2,0($inp)
577 la %r4,0($out)
578 lghi %r3,16 # single block length
579 .long 0xb92e0042 # km %r4,%r2
580 brc 1,.-4 # can this happen?
581 br %r14
582.align 64
583.Ldsoft:
584___
585$code.=<<___;
586 stm${g} %r3,$ra,3*$SIZE_T($sp)
587
588 llgf $s0,0($inp)
589 llgf $s1,4($inp)
590 llgf $s2,8($inp)
591 llgf $s3,12($inp)
592
593 larl $tbl,AES_Td
594 bras $ra,_s390x_AES_decrypt
595
596 l${g} $out,3*$SIZE_T($sp)
597 st $s0,0($out)
598 st $s1,4($out)
599 st $s2,8($out)
600 st $s3,12($out)
601
602 lm${g} %r6,$ra,6*$SIZE_T($sp)
603 br $ra
604.size AES_decrypt,.-AES_decrypt
605
606.type _s390x_AES_decrypt,\@function
607.align 16
608_s390x_AES_decrypt:
609 st${g} $ra,15*$SIZE_T($sp)
610 x $s0,0($key)
611 x $s1,4($key)
612 x $s2,8($key)
613 x $s3,12($key)
614 l $rounds,240($key)
615 llill $mask,`0xff<<3`
616 aghi $rounds,-1
617 j .Ldec_loop
618.align 16
619.Ldec_loop:
620 srlg $t1,$s0,`16-3`
621 srlg $t2,$s0,`8-3`
622 sllg $t3,$s0,`0+3`
623 srl $s0,`24-3`
624 nr $s0,$mask
625 nr $t1,$mask
626 nr $t2,$mask
627 ngr $t3,$mask
628
629 sllg $i1,$s1,`0+3` # i0
630 srlg $i2,$s1,`16-3`
631 srlg $i3,$s1,`8-3`
632 srl $s1,`24-3`
633 ngr $i1,$mask
634 nr $s1,$mask
635 nr $i2,$mask
636 nr $i3,$mask
637
638 l $s0,0($s0,$tbl) # Td0[s0>>24]
639 l $t1,3($t1,$tbl) # Td1[s0>>16]
640 l $t2,2($t2,$tbl) # Td2[s0>>8]
641 l $t3,1($t3,$tbl) # Td3[s0>>0]
642
643 x $s0,1($i1,$tbl) # Td3[s1>>0]
644 l $s1,0($s1,$tbl) # Td0[s1>>24]
645 x $t2,3($i2,$tbl) # Td1[s1>>16]
646 x $t3,2($i3,$tbl) # Td2[s1>>8]
647
648 srlg $i1,$s2,`8-3` # i0
649 sllg $i2,$s2,`0+3` # i1
650 srlg $i3,$s2,`16-3`
651 srl $s2,`24-3`
652 nr $i1,$mask
653 ngr $i2,$mask
654 nr $s2,$mask
655 nr $i3,$mask
656
657 xr $s1,$t1
658 srlg $ra,$s3,`8-3` # i1
659 srlg $t1,$s3,`16-3` # i0
660 nr $ra,$mask
661 la $key,16($key)
662 nr $t1,$mask
663
664 x $s0,2($i1,$tbl) # Td2[s2>>8]
665 x $s1,1($i2,$tbl) # Td3[s2>>0]
666 l $s2,0($s2,$tbl) # Td0[s2>>24]
667 x $t3,3($i3,$tbl) # Td1[s2>>16]
668
669 sllg $i3,$s3,`0+3` # i2
670 srl $s3,`24-3`
671 ngr $i3,$mask
672 nr $s3,$mask
673
674 xr $s2,$t2
675 x $s0,0($key)
676 x $s1,4($key)
677 x $s2,8($key)
678 x $t3,12($key)
679
680 x $s0,3($t1,$tbl) # Td1[s3>>16]
681 x $s1,2($ra,$tbl) # Td2[s3>>8]
682 x $s2,1($i3,$tbl) # Td3[s3>>0]
683 l $s3,0($s3,$tbl) # Td0[s3>>24]
684 xr $s3,$t3
685
686 brct $rounds,.Ldec_loop
687 .align 16
688
689 l $t1,`2048+0`($tbl) # prefetch Td4
690 l $t2,`2048+64`($tbl)
691 l $t3,`2048+128`($tbl)
692 l $i1,`2048+192`($tbl)
693 llill $mask,0xff
694
695 srlg $i3,$s0,24 # i0
696 srlg $t1,$s0,16
697 srlg $t2,$s0,8
698 nr $s0,$mask # i3
699 nr $t1,$mask
700
701 srlg $i1,$s1,24
702 nr $t2,$mask
703 srlg $i2,$s1,16
704 srlg $ra,$s1,8
705 nr $s1,$mask # i0
706 nr $i2,$mask
707 nr $ra,$mask
708
709 llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
710 llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
711 llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
712 sll $t1,16
713 llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
714 sllg $s0,$i3,24
715 sll $t2,8
716
717 llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
718 llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
719 llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
720 sll $i1,24
721 llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
722 sll $i2,16
723 sll $i3,8
724 or $s0,$s1
725 or $t1,$i1
726 or $t2,$i2
727 or $t3,$i3
728
729 srlg $i1,$s2,8 # i0
730 srlg $i2,$s2,24
731 srlg $i3,$s2,16
732 nr $s2,$mask # i1
733 nr $i1,$mask
734 nr $i3,$mask
735 llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
736 llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
737 llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
738 llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
739 sll $i1,8
740 sll $i2,24
741 or $s0,$i1
742 sll $i3,16
743 or $t2,$i2
744 or $t3,$i3
745
746 srlg $i1,$s3,16 # i0
747 srlg $i2,$s3,8 # i1
748 srlg $i3,$s3,24
749 nr $s3,$mask # i2
750 nr $i1,$mask
751 nr $i2,$mask
752
753 l${g} $ra,15*$SIZE_T($sp)
754 or $s1,$t1
755 l $t0,16($key)
756 l $t1,20($key)
757
758 llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
759 llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
760 sll $i1,16
761 llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
762 llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
763 sll $i2,8
764 sll $s3,24
765 or $s0,$i1
766 or $s1,$i2
767 or $s2,$t2
768 or $s3,$t3
769
770 xr $s0,$t0
771 xr $s1,$t1
772 x $s2,24($key)
773 x $s3,28($key)
774
775 br $ra
776.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
777___
778
779$code.=<<___;
780# void AES_set_encrypt_key(const unsigned char *in, int bits,
781# AES_KEY *key) {
782.globl AES_set_encrypt_key
783.type AES_set_encrypt_key,\@function
784.align 16
785AES_set_encrypt_key:
786_s390x_AES_set_encrypt_key:
787 lghi $t0,0
788 cl${g}r $inp,$t0
789 je .Lminus1
790 cl${g}r $key,$t0
791 je .Lminus1
792
793 lghi $t0,128
794 clr $bits,$t0
795 je .Lproceed
796 lghi $t0,192
797 clr $bits,$t0
798 je .Lproceed
799 lghi $t0,256
800 clr $bits,$t0
801 je .Lproceed
802 lghi %r2,-2
803 br %r14
804
805.align 16
806.Lproceed:
807___
808$code.=<<___ if (!$softonly);
809 # convert bits to km code, [128,192,256]->[18,19,20]
810 lhi %r5,-128
811 lhi %r0,18
812 ar %r5,$bits
813 srl %r5,6
814 ar %r5,%r0
815
816 larl %r1,OPENSSL_s390xcap_P
817 lg %r0,0(%r1)
818 tmhl %r0,0x4000 # check for message-security assist
819 jz .Lekey_internal
820
821 lghi %r0,0 # query capability vector
822 la %r1,16($sp)
823 .long 0xb92f0042 # kmc %r4,%r2
824
825 llihh %r1,0x8000
826 srlg %r1,%r1,0(%r5)
827 ng %r1,16($sp)
828 jz .Lekey_internal
829
830 lmg %r0,%r1,0($inp) # just copy 128 bits...
831 stmg %r0,%r1,0($key)
832 lhi %r0,192
833 cr $bits,%r0
834 jl 1f
835 lg %r1,16($inp)
836 stg %r1,16($key)
837 je 1f
838 lg %r1,24($inp)
839 stg %r1,24($key)
8401: st $bits,236($key) # save bits [for debugging purposes]
841 lgr $t0,%r5
842 st %r5,240($key) # save km code
843 lghi %r2,0
844 br %r14
845___
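# The bits-to-function-code mapping above is plain arithmetic:
# code = 18 + (bits-128)/64, i.e. 128->18, 192->19, 256->20, matching the
# KM-AES-128/192/256 function codes. Worked example: bits=192 gives
# (192-128)>>6 = 1, and 18+1 = 19. The query (km with %r0 = 0) then tests
# bit "code" of the returned capability vector before the hardware path
# is committed to.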
846$code.=<<___;
847.align 16
848.Lekey_internal:
849 stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
850
851 larl $tbl,AES_Te+2048
852
853 llgf $s0,0($inp)
854 llgf $s1,4($inp)
855 llgf $s2,8($inp)
856 llgf $s3,12($inp)
857 st $s0,0($key)
858 st $s1,4($key)
859 st $s2,8($key)
860 st $s3,12($key)
861 lghi $t0,128
862 cr $bits,$t0
863 jne .Lnot128
864
865 llill $mask,0xff
866 lghi $t3,0 # i=0
867 lghi $rounds,10
868 st $rounds,240($key)
869
870 llgfr $t2,$s3 # temp=rk[3]
871 srlg $i1,$s3,8
872 srlg $i2,$s3,16
873 srlg $i3,$s3,24
874 nr $t2,$mask
875 nr $i1,$mask
876 nr $i2,$mask
877
878.align 16
879.L128_loop:
880 la $t2,0($t2,$tbl)
881 la $i1,0($i1,$tbl)
882 la $i2,0($i2,$tbl)
883 la $i3,0($i3,$tbl)
884 icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
885 icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
886 icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
887 icm $t2,1,0($i3) # Te4[rk[3]>>24]
888 x $t2,256($t3,$tbl) # rcon[i]
889 xr $s0,$t2 # rk[4]=rk[0]^...
890 xr $s1,$s0 # rk[5]=rk[1]^rk[4]
891 xr $s2,$s1 # rk[6]=rk[2]^rk[5]
892 xr $s3,$s2 # rk[7]=rk[3]^rk[6]
893
894 llgfr $t2,$s3 # temp=rk[3]
895 srlg $i1,$s3,8
896 srlg $i2,$s3,16
897 nr $t2,$mask
898 nr $i1,$mask
899 srlg $i3,$s3,24
900 nr $i2,$mask
901
902 st $s0,16($key)
903 st $s1,20($key)
904 st $s2,24($key)
905 st $s3,28($key)
906 la $key,16($key) # key+=4
907 la $t3,4($t3) # i++
908 brct $rounds,.L128_loop
909 lghi $t0,10
910 lghi %r2,0
911 lm${g} %r4,%r13,4*$SIZE_T($sp)
912 br $ra
913
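# The loop above is the FIPS-197 AES-128 schedule verbatim; as a hedged
# C sketch (SubWord/RotWord per the standard, Te4 being the plain S-box):
#
#	rk[4] = rk[0] ^ SubWord(RotWord(rk[3])) ^ rcon[i];
#	rk[5] = rk[1] ^ rk[4];
#	rk[6] = rk[2] ^ rk[5];
#	rk[7] = rk[3] ^ rk[6];
#
# The icm sequence assembles SubWord(RotWord(temp)) one byte at a time,
# which is why the four Te4 look-ups land at shifts 8, 16, 24 and 0.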
914.align 16
915.Lnot128:
916 llgf $t0,16($inp)
917 llgf $t1,20($inp)
918 st $t0,16($key)
919 st $t1,20($key)
920 lghi $t0,192
921 cr $bits,$t0
922 jne .Lnot192
923
924 llill $mask,0xff
925 lghi $t3,0 # i=0
926 lghi $rounds,12
927 st $rounds,240($key)
928 lghi $rounds,8
929
930 srlg $i1,$t1,8
931 srlg $i2,$t1,16
932 srlg $i3,$t1,24
933 nr $t1,$mask
934 nr $i1,$mask
935 nr $i2,$mask
936
937.align 16
938.L192_loop:
939 la $t1,0($t1,$tbl)
940 la $i1,0($i1,$tbl)
941 la $i2,0($i2,$tbl)
942 la $i3,0($i3,$tbl)
943 icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
944 icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
945 icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
946 icm $t1,1,0($i3) # Te4[rk[5]>>24]
947 x $t1,256($t3,$tbl) # rcon[i]
948 xr $s0,$t1 # rk[6]=rk[0]^...
949 xr $s1,$s0 # rk[7]=rk[1]^rk[6]
950 xr $s2,$s1 # rk[8]=rk[2]^rk[7]
951 xr $s3,$s2 # rk[9]=rk[3]^rk[8]
952
953 st $s0,24($key)
954 st $s1,28($key)
955 st $s2,32($key)
956 st $s3,36($key)
957 brct $rounds,.L192_continue
958 lghi $t0,12
959 lghi %r2,0
960 lm${g} %r4,%r13,4*$SIZE_T($sp)
961 br $ra
962
963.align 16
964.L192_continue:
965 lgr $t1,$s3
966 x $t1,16($key) # rk[10]=rk[4]^rk[9]
967 st $t1,40($key)
968 x $t1,20($key) # rk[11]=rk[5]^rk[10]
969 st $t1,44($key)
970
971 srlg $i1,$t1,8
972 srlg $i2,$t1,16
973 srlg $i3,$t1,24
974 nr $t1,$mask
975 nr $i1,$mask
976 nr $i2,$mask
977
978 la $key,24($key) # key+=6
979 la $t3,4($t3) # i++
980 j .L192_loop
981
982.align 16
983.Lnot192:
984 llgf $t0,24($inp)
985 llgf $t1,28($inp)
986 st $t0,24($key)
987 st $t1,28($key)
988 llill $mask,0xff
989 lghi $t3,0 # i=0
990 lghi $rounds,14
991 st $rounds,240($key)
992 lghi $rounds,7
993
994 srlg $i1,$t1,8
995 srlg $i2,$t1,16
996 srlg $i3,$t1,24
997 nr $t1,$mask
998 nr $i1,$mask
999 nr $i2,$mask
1000
1001.align 16
1002.L256_loop:
1003 la $t1,0($t1,$tbl)
1004 la $i1,0($i1,$tbl)
1005 la $i2,0($i2,$tbl)
1006 la $i3,0($i3,$tbl)
1007 icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
1008 icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
1009 icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
1010 icm $t1,1,0($i3) # Te4[rk[7]>>24]
1011 x $t1,256($t3,$tbl) # rcon[i]
1012 xr $s0,$t1 # rk[8]=rk[0]^...
1013 xr $s1,$s0 # rk[9]=rk[1]^rk[8]
1014 xr $s2,$s1 # rk[10]=rk[2]^rk[9]
1015 xr $s3,$s2 # rk[11]=rk[3]^rk[10]
1016 st $s0,32($key)
1017 st $s1,36($key)
1018 st $s2,40($key)
1019 st $s3,44($key)
1020 brct $rounds,.L256_continue
1021 lghi $t0,14
1022 lghi %r2,0
1023 lm${g} %r4,%r13,4*$SIZE_T($sp)
1024 br $ra
1025
1026.align 16
1027.L256_continue:
1028 lgr $t1,$s3 # temp=rk[11]
1029 srlg $i1,$s3,8
1030 srlg $i2,$s3,16
1031 srlg $i3,$s3,24
1032 nr $t1,$mask
1033 nr $i1,$mask
1034 nr $i2,$mask
1035 la $t1,0($t1,$tbl)
1036 la $i1,0($i1,$tbl)
1037 la $i2,0($i2,$tbl)
1038 la $i3,0($i3,$tbl)
1039 llgc $t1,0($t1) # Te4[rk[11]>>0]
1040 icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
1041 icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
1042 icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
1043 x $t1,16($key) # rk[12]=rk[4]^...
1044 st $t1,48($key)
1045 x $t1,20($key) # rk[13]=rk[5]^rk[12]
1046 st $t1,52($key)
1047 x $t1,24($key) # rk[14]=rk[6]^rk[13]
1048 st $t1,56($key)
1049 x $t1,28($key) # rk[15]=rk[7]^rk[14]
1050 st $t1,60($key)
1051
1052 srlg $i1,$t1,8
1053 srlg $i2,$t1,16
1054 srlg $i3,$t1,24
1055 nr $t1,$mask
1056 nr $i1,$mask
1057 nr $i2,$mask
1058
1059 la $key,32($key) # key+=8
1060 la $t3,4($t3) # i++
1061 j .L256_loop
1062
1063.Lminus1:
1064 lghi %r2,-1
1065 br $ra
1066.size AES_set_encrypt_key,.-AES_set_encrypt_key
1067
1068# void AES_set_decrypt_key(const unsigned char *in, int bits,
1069# AES_KEY *key) {
1070.globl AES_set_decrypt_key
1071.type AES_set_decrypt_key,\@function
1072.align 16
1073AES_set_decrypt_key:
1074 #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
1075 st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
1076 bras $ra,_s390x_AES_set_encrypt_key
1077 #l${g} $key,4*$SIZE_T($sp)
1078 l${g} $ra,14*$SIZE_T($sp)
1079 ltgr %r2,%r2
1080 bnzr $ra
1081___
1082$code.=<<___ if (!$softonly);
1083 #l $t0,240($key)
1084 lhi $t1,16
1085 cr $t0,$t1
1086 jl .Lgo
1087 oill $t0,0x80 # set "decrypt" bit
1088 st $t0,240($key)
1089 br $ra
1090___
1091$code.=<<___;
1092.align 16
1093.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
1094 la $i1,0($key)
1095 sllg $i2,$rounds,4
1096 la $i2,0($i2,$key)
1097 srl $rounds,1
1098 lghi $t1,-16
1099
1100.align 16
1101.Linv: lmg $s0,$s1,0($i1)
1102 lmg $s2,$s3,0($i2)
1103 stmg $s0,$s1,0($i2)
1104 stmg $s2,$s3,0($i1)
1105 la $i1,16($i1)
1106 la $i2,0($t1,$i2)
1107 brct $rounds,.Linv
1108___
1109$mask80=$i1;
1110$mask1b=$i2;
1111$maskfe=$i3;
1112$code.=<<___;
1113 llgf $rounds,240($key)
1114 aghi $rounds,-1
1115 sll $rounds,2 # (rounds-1)*4
1116 llilh $mask80,0x8080
1117 llilh $mask1b,0x1b1b
1118 llilh $maskfe,0xfefe
1119 oill $mask80,0x8080
1120 oill $mask1b,0x1b1b
1121 oill $maskfe,0xfefe
1122
1123.align 16
1124.Lmix: l $s0,16($key) # tp1
1125 lr $s1,$s0
1126 ngr $s1,$mask80
1127 srlg $t1,$s1,7
1128 slr $s1,$t1
1129 nr $s1,$mask1b
1130 sllg $t1,$s0,1
1131 nr $t1,$maskfe
1132 xr $s1,$t1 # tp2
1133
1134 lr $s2,$s1
1135 ngr $s2,$mask80
1136 srlg $t1,$s2,7
1137 slr $s2,$t1
1138 nr $s2,$mask1b
1139 sllg $t1,$s1,1
1140 nr $t1,$maskfe
1141 xr $s2,$t1 # tp4
1142
1143 lr $s3,$s2
1144 ngr $s3,$mask80
1145 srlg $t1,$s3,7
1146 slr $s3,$t1
1147 nr $s3,$mask1b
1148 sllg $t1,$s2,1
1149 nr $t1,$maskfe
1150 xr $s3,$t1 # tp8
1151
1152 xr $s1,$s0 # tp2^tp1
1153 xr $s2,$s0 # tp4^tp1
1154 rll $s0,$s0,24 # = ROTATE(tp1,8)
1155 xr $s2,$s3 # ^=tp8
1156 xr $s0,$s1 # ^=tp2^tp1
1157 xr $s1,$s3 # tp2^tp1^tp8
1158 xr $s0,$s2 # ^=tp4^tp1^tp8
1159 rll $s1,$s1,8
1160 rll $s2,$s2,16
1161 xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
1162 rll $s3,$s3,24
1163 xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
1164 xr $s0,$s3 # ^= ROTATE(tp8,8)
1165
1166 st $s0,16($key)
1167 la $key,4($key)
1168 brct $rounds,.Lmix
1169
1170	lm${g}	%r6,%r13,6*$SIZE_T($sp) # as was saved by AES_set_encrypt_key!
1171 lghi %r2,0
1172 br $ra
1173.size AES_set_decrypt_key,.-AES_set_decrypt_key
1174___
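# The .Lmix loop turns encryption round keys into decryption ones by
# applying InvMixColumn to each word in place. The mask triple computes
# xtime (doubling in GF(2^8)) on four packed bytes at once; a hedged C
# sketch of one doubling step:
#
#	uint32_t xtime4(uint32_t x)	/* tp2 = xtime4(tp1), etc.	*/
#	{
#		uint32_t hi  = x & 0x80808080;		/* MSB of each byte */
#		uint32_t rem = (hi - (hi >> 7)) & 0x1b1b1b1b;
#		return ((x << 1) & 0xfefefefe) ^ rem;	/* shift and reduce */
#	}
#
# tp4 and tp8 are two further doublings, and the closing XOR/rotate
# ladder is the usual 0e/0b/0d/09 InvMixColumn decomposition.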
1175
1176########################################################################
1177# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1178# size_t length, const AES_KEY *key,
1179# unsigned char *ivec, const int enc)
1180{
1181my $inp="%r2";
1182my $out="%r4"; # length and out are swapped
1183my $len="%r3";
1184my $key="%r5";
1185my $ivp="%r6";
1186
1187$code.=<<___;
1188.globl AES_cbc_encrypt
1189.type AES_cbc_encrypt,\@function
1190.align 16
1191AES_cbc_encrypt:
1192 xgr %r3,%r4 # flip %r3 and %r4, out and len
1193 xgr %r4,%r3
1194 xgr %r3,%r4
1195___
1196$code.=<<___ if (!$softonly);
1197 lhi %r0,16
1198 cl %r0,240($key)
1199 jh .Lcbc_software
1200
1201 lg %r0,0($ivp) # copy ivec
1202 lg %r1,8($ivp)
1203 stmg %r0,%r1,16($sp)
1204 lmg %r0,%r1,0($key) # copy key, cover 256 bit
1205 stmg %r0,%r1,32($sp)
1206 lmg %r0,%r1,16($key)
1207 stmg %r0,%r1,48($sp)
1208 l %r0,240($key) # load kmc code
1209 lghi $key,15 # res=len%16, len-=res;
1210 ngr $key,$len
1211 sl${g}r $len,$key
1212 la %r1,16($sp) # parameter block - ivec || key
1213 jz .Lkmc_truncated
1214 .long 0xb92f0042 # kmc %r4,%r2
1215 brc 1,.-4 # pay attention to "partial completion"
1216 ltr $key,$key
1217 jnz .Lkmc_truncated
1218.Lkmc_done:
1219 lmg %r0,%r1,16($sp) # copy ivec to caller
1220 stg %r0,0($ivp)
1221 stg %r1,8($ivp)
1222 br $ra
1223.align 16
1224.Lkmc_truncated:
1225 ahi $key,-1 # it's the way it's encoded in mvc
1226 tmll %r0,0x80
1227 jnz .Lkmc_truncated_dec
1228 lghi %r1,0
1229 stg %r1,16*$SIZE_T($sp)
1230 stg %r1,16*$SIZE_T+8($sp)
1231 bras %r1,1f
1232 mvc 16*$SIZE_T(1,$sp),0($inp)
12331: ex $key,0(%r1)
1234 la %r1,16($sp) # restore parameter block
1235 la $inp,16*$SIZE_T($sp)
1236 lghi $len,16
1237 .long 0xb92f0042 # kmc %r4,%r2
1238 j .Lkmc_done
1239.align 16
1240.Lkmc_truncated_dec:
1241 st${g} $out,4*$SIZE_T($sp)
1242 la $out,16*$SIZE_T($sp)
1243 lghi $len,16
1244 .long 0xb92f0042 # kmc %r4,%r2
1245 l${g} $out,4*$SIZE_T($sp)
1246 bras %r1,2f
1247 mvc 0(1,$out),16*$SIZE_T($sp)
12482: ex $key,0(%r1)
1249 j .Lkmc_done
1250.align 16
1251.Lcbc_software:
1252___
1253$code.=<<___;
1254 stm${g} $key,$ra,5*$SIZE_T($sp)
1255 lhi %r0,0
1256 cl %r0,`$stdframe+$SIZE_T-4`($sp)
1257 je .Lcbc_decrypt
1258
1259 larl $tbl,AES_Te
1260
1261 llgf $s0,0($ivp)
1262 llgf $s1,4($ivp)
1263 llgf $s2,8($ivp)
1264 llgf $s3,12($ivp)
1265
1266 lghi $t0,16
1267 sl${g}r $len,$t0
1268 brc 4,.Lcbc_enc_tail # if borrow
1269.Lcbc_enc_loop:
1270 stm${g} $inp,$out,2*$SIZE_T($sp)
1271 x $s0,0($inp)
1272 x $s1,4($inp)
1273 x $s2,8($inp)
1274 x $s3,12($inp)
1275 lgr %r4,$key
1276
1277 bras $ra,_s390x_AES_encrypt
1278
1279 lm${g} $inp,$key,2*$SIZE_T($sp)
1280 st $s0,0($out)
1281 st $s1,4($out)
1282 st $s2,8($out)
1283 st $s3,12($out)
1284
1285 la $inp,16($inp)
1286 la $out,16($out)
1287 lghi $t0,16
1288 lt${g}r $len,$len
1289 jz .Lcbc_enc_done
1290 sl${g}r $len,$t0
1291 brc 4,.Lcbc_enc_tail # if borrow
1292 j .Lcbc_enc_loop
1293.align 16
1294.Lcbc_enc_done:
1295 l${g} $ivp,6*$SIZE_T($sp)
1296 st $s0,0($ivp)
1297 st $s1,4($ivp)
1298 st $s2,8($ivp)
1299 st $s3,12($ivp)
1300
1301 lm${g} %r7,$ra,7*$SIZE_T($sp)
1302 br $ra
1303
1304.align 16
1305.Lcbc_enc_tail:
1306 aghi $len,15
1307 lghi $t0,0
1308 stg $t0,16*$SIZE_T($sp)
1309 stg $t0,16*$SIZE_T+8($sp)
1310 bras $t1,3f
1311 mvc 16*$SIZE_T(1,$sp),0($inp)
13123: ex $len,0($t1)
1313 lghi $len,0
1314 la $inp,16*$SIZE_T($sp)
1315 j .Lcbc_enc_loop
1316
1317.align 16
1318.Lcbc_decrypt:
1319 larl $tbl,AES_Td
1320
1321 lg $t0,0($ivp)
1322 lg $t1,8($ivp)
1323 stmg $t0,$t1,16*$SIZE_T($sp)
1324
1325.Lcbc_dec_loop:
1326 stm${g} $inp,$out,2*$SIZE_T($sp)
1327 llgf $s0,0($inp)
1328 llgf $s1,4($inp)
1329 llgf $s2,8($inp)
1330 llgf $s3,12($inp)
1331 lgr %r4,$key
1332
1333 bras $ra,_s390x_AES_decrypt
1334
1335 lm${g} $inp,$key,2*$SIZE_T($sp)
1336 sllg $s0,$s0,32
1337 sllg $s2,$s2,32
1338 lr $s0,$s1
1339 lr $s2,$s3
1340
1341 lg $t0,0($inp)
1342 lg $t1,8($inp)
1343 xg $s0,16*$SIZE_T($sp)
1344 xg $s2,16*$SIZE_T+8($sp)
1345 lghi $s1,16
1346 sl${g}r $len,$s1
1347 brc 4,.Lcbc_dec_tail # if borrow
1348 brc 2,.Lcbc_dec_done # if zero
1349 stg $s0,0($out)
1350 stg $s2,8($out)
1351 stmg $t0,$t1,16*$SIZE_T($sp)
1352
1353 la $inp,16($inp)
1354 la $out,16($out)
1355 j .Lcbc_dec_loop
1356
1357.Lcbc_dec_done:
1358 stg $s0,0($out)
1359 stg $s2,8($out)
1360.Lcbc_dec_exit:
1361 lm${g} %r6,$ra,6*$SIZE_T($sp)
1362 stmg $t0,$t1,0($ivp)
1363
1364 br $ra
1365
1366.align 16
1367.Lcbc_dec_tail:
1368 aghi $len,15
1369 stg $s0,16*$SIZE_T($sp)
1370 stg $s2,16*$SIZE_T+8($sp)
1371 bras $s1,4f
1372 mvc 0(1,$out),16*$SIZE_T($sp)
13734: ex $len,0($s1)
1374 j .Lcbc_dec_exit
1375.size AES_cbc_encrypt,.-AES_cbc_encrypt
1376___
1377}
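# For reference, the kmc fast path above lays out its parameter block on
# the stack exactly as the stores at 16..63($sp) suggest (a sketch, as
# inferred from the code rather than quoted from the architecture book):
#
#	bytes  0..15 : chaining value (the IV), updated by kmc in place
#	bytes 16..47 : cipher key (16/24/32 bytes are significant)
#
# which is why .Lkmc_done can simply copy 16($sp) back to the caller's
# ivec. Trailing partial blocks are staged through a zero-padded 16-byte
# buffer using an ex-patched mvc, as the truncated paths show.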
1378########################################################################
1379# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1380# size_t blocks, const AES_KEY *key,
1381# const unsigned char *ivec)
1382{
1383my $inp="%r2";
1384my $out="%r4"; # blocks and out are swapped
1385my $len="%r3";
1386my $key="%r5"; my $iv0="%r5";
1387my $ivp="%r6";
1388my $fp ="%r7";
1389
1390$code.=<<___;
1391.globl AES_ctr32_encrypt
1392.type AES_ctr32_encrypt,\@function
1393.align 16
1394AES_ctr32_encrypt:
1395 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1396 xgr %r4,%r3
1397 xgr %r3,%r4
1398 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
1399___
1400$code.=<<___ if (!$softonly);
1401 l %r0,240($key)
1402 lhi %r1,16
1403 clr %r0,%r1
1404 jl .Lctr32_software
1405
1406 stm${g} %r6,$s3,6*$SIZE_T($sp)
1407
1408 slgr $out,$inp
1409 la %r1,0($key) # %r1 is permanent copy of $key
1410 lg $iv0,0($ivp) # load ivec
1411 lg $ivp,8($ivp)
1412
1413 # prepare and allocate stack frame at the top of 4K page
1414 # with 1K reserved for eventual signal handling
1415	lghi	$s0,-1024-256-16 # guarantee at least 256-byte buffer
1416 lghi $s1,-4096
1417 algr $s0,$sp
1418 lgr $fp,$sp
1419 ngr $s0,$s1 # align at page boundary
1420 slgr $fp,$s0 # total buffer size
1421 lgr $s2,$sp
1422 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1423 slgr $fp,$s1 # deduct reservation to get usable buffer size
1424	# buffer size is at least 256 and at most 3072+256-16
1425
1426 la $sp,1024($s0) # alloca
1427 srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
1428 st${g} $s2,0($sp) # back-chain
1429 st${g} $fp,$SIZE_T($sp)
1430
1431 slgr $len,$fp
1432 brc 1,.Lctr32_hw_switch # not zero, no borrow
1433 algr $fp,$len # input is shorter than allocated buffer
1434 lghi $len,0
1435 st${g} $fp,$SIZE_T($sp)
1436
1437.Lctr32_hw_switch:
1438___
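# Restating the frame arithmetic above as a sketch (sp is the incoming
# stack pointer, all quantities in bytes):
#
#	base   = (sp - 1024 - 256 - 16) & ~4095   # floor to the 4K page
#	total  = sp - base
#	usable = total - (1024 + 16)              # at least 256
#	blocks = usable / 16                      # at least 16
#
# i.e. the counter-block buffer is carved from the page the stack already
# occupies, leaving 1K of headroom for eventual signal handling.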
1439$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
1440 larl $s0,OPENSSL_s390xcap_P
1441 lg $s0,8($s0)
1442 tmhh $s0,0x0004 # check for message_security-assist-4
1443 jz .Lctr32_km_loop
1444
1445 llgfr $s0,%r0
1446 lgr $s1,%r1
1447 lghi %r0,0
1448 la %r1,16($sp)
1449 .long 0xb92d2042 # kmctr %r4,%r2,%r2
1450
1451 llihh %r0,0x8000 # check if kmctr supports the function code
1452 srlg %r0,%r0,0($s0)
1453 ng %r0,16($sp)
1454 lgr %r0,$s0
1455 lgr %r1,$s1
1456 jz .Lctr32_km_loop
1457
1458####### kmctr code
1459 algr $out,$inp # restore $out
1460	lgr	$s1,$len	# $s1 takes over $len
1461 j .Lctr32_kmctr_loop
1462.align 16
1463.Lctr32_kmctr_loop:
1464 la $s2,16($sp)
1465 lgr $s3,$fp
1466.Lctr32_kmctr_prepare:
1467 stg $iv0,0($s2)
1468 stg $ivp,8($s2)
1469 la $s2,16($s2)
1470 ahi $ivp,1 # 32-bit increment, preserves upper half
1471 brct $s3,.Lctr32_kmctr_prepare
1472
1473 #la $inp,0($inp) # inp
1474 sllg $len,$fp,4 # len
1475 #la $out,0($out) # out
1476 la $s2,16($sp) # iv
1477 .long 0xb92da042 # kmctr $out,$s2,$inp
1478 brc 1,.-4 # pay attention to "partial completion"
1479
1480 slgr $s1,$fp
1481 brc 1,.Lctr32_kmctr_loop # not zero, no borrow
1482 algr $fp,$s1
1483 lghi $s1,0
1484 brc 4+1,.Lctr32_kmctr_loop # not zero
1485
1486 l${g} $sp,0($sp)
1487 lm${g} %r6,$s3,6*$SIZE_T($sp)
1488 br $ra
1489.align 16
1490___
1491$code.=<<___;
1492.Lctr32_km_loop:
1493 la $s2,16($sp)
1494 lgr $s3,$fp
1495.Lctr32_km_prepare:
1496 stg $iv0,0($s2)
1497 stg $ivp,8($s2)
1498 la $s2,16($s2)
1499 ahi $ivp,1 # 32-bit increment, preserves upper half
1500 brct $s3,.Lctr32_km_prepare
1501
1502 la $s0,16($sp) # inp
1503 sllg $s1,$fp,4 # len
1504 la $s2,16($sp) # out
1505 .long 0xb92e00a8 # km %r10,%r8
1506 brc 1,.-4 # pay attention to "partial completion"
1507
1508 la $s2,16($sp)
1509 lgr $s3,$fp
1510 slgr $s2,$inp
1511.Lctr32_km_xor:
1512 lg $s0,0($inp)
1513 lg $s1,8($inp)
1514 xg $s0,0($s2,$inp)
1515 xg $s1,8($s2,$inp)
1516 stg $s0,0($out,$inp)
1517 stg $s1,8($out,$inp)
1518 la $inp,16($inp)
1519 brct $s3,.Lctr32_km_xor
1520
1521 slgr $len,$fp
1522 brc 1,.Lctr32_km_loop # not zero, no borrow
1523 algr $fp,$len
1524 lghi $len,0
1525 brc 4+1,.Lctr32_km_loop # not zero
1526
1527 l${g} $s0,0($sp)
1528 l${g} $s1,$SIZE_T($sp)
1529 la $s2,16($sp)
1530.Lctr32_km_zap:
1531 stg $s0,0($s2)
1532 stg $s0,8($s2)
1533 la $s2,16($s2)
1534 brct $s1,.Lctr32_km_zap
1535
1536 la $sp,0($s0)
1537 lm${g} %r6,$s3,6*$SIZE_T($sp)
1538 br $ra
1539.align 16
1540.Lctr32_software:
1541___
1542$code.=<<___;
1543 stm${g} $key,$ra,5*$SIZE_T($sp)
1544 sl${g}r $inp,$out
1545 larl $tbl,AES_Te
1546 llgf $t1,12($ivp)
1547
1548.Lctr32_loop:
1549 stm${g} $inp,$out,2*$SIZE_T($sp)
1550 llgf $s0,0($ivp)
1551 llgf $s1,4($ivp)
1552 llgf $s2,8($ivp)
1553 lgr $s3,$t1
1554 st $t1,16*$SIZE_T($sp)
1555 lgr %r4,$key
1556
1557 bras $ra,_s390x_AES_encrypt
1558
1559 lm${g} $inp,$ivp,2*$SIZE_T($sp)
1560 llgf $t1,16*$SIZE_T($sp)
1561 x $s0,0($inp,$out)
1562 x $s1,4($inp,$out)
1563 x $s2,8($inp,$out)
1564 x $s3,12($inp,$out)
1565 stm $s0,$s3,0($out)
1566
1567 la $out,16($out)
1568 ahi $t1,1 # 32-bit increment
1569 brct $len,.Lctr32_loop
1570
1571 lm${g} %r6,$ra,6*$SIZE_T($sp)
1572 br $ra
1573.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
1574___
1575}
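# Both paths above follow the "ctr32" convention: only the low 32 bits of
# the 128-bit counter block are ever incremented (ahi $ivp,1 touches only
# the low word of the register; the software path bumps $t1). A hedged C
# equivalent of that increment, big-endian as on s390x:
#
#	void ctr32_inc(unsigned char ctr[16])
#	{
#		int i;
#		for (i = 15; i >= 12; i--)
#			if (++ctr[i]) break;	/* wraps within low 32 bits */
#	}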
1576
1577########################################################################
1578# void AES_xts_encrypt(const char *inp,char *out,size_t len,
1579# const AES_KEY *key1, const AES_KEY *key2,
1580# const unsigned char iv[16]);
1581#
1582{
1583my $inp="%r2";
1584my $out="%r4"; # len and out are swapped
1585my $len="%r3";
1586my $key1="%r5"; # $i1
1587my $key2="%r6"; # $i2
1588my $fp="%r7"; # $i3
1589my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
1590
1591$code.=<<___;
1592.type _s390x_xts_km,\@function
1593.align 16
1594_s390x_xts_km:
1595___
1596$code.=<<___ if(1);
1597 llgfr $s0,%r0 # put aside the function code
1598 lghi $s1,0x7f
1599 nr $s1,%r0
1600 lghi %r0,0 # query capability vector
1601 la %r1,$tweak-16($sp)
1602 .long 0xb92e0042 # km %r4,%r2
1603 llihh %r1,0x8000
1604 srlg %r1,%r1,32($s1) # check for 32+function code
1605 ng %r1,$tweak-16($sp)
1606 lgr %r0,$s0 # restore the function code
1607 la %r1,0($key1) # restore $key1
1608 jz .Lxts_km_vanilla
1609
1610 lmg $i2,$i3,$tweak($sp) # put aside the tweak value
1611 algr $out,$inp
1612
1613 oill %r0,32 # switch to xts function code
1614 aghi $s1,-18 #
1615 sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
1616 la %r1,$tweak-16($sp)
1617 slgr %r1,$s1 # parameter block position
1618 lmg $s0,$s3,0($key1) # load 256 bits of key material,
1619 stmg $s0,$s3,0(%r1) # and copy it to parameter block.
1620					# yes, it contains junk and overlaps
1621					# with the tweak in the 128-bit case.
1622					# This is done to avoid a conditional
1623					# branch.
1624 stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
1625
1626 .long 0xb92e0042 # km %r4,%r2
1627 brc 1,.-4 # pay attention to "partial completion"
1628
1629 lrvg $s0,$tweak+0($sp) # load the last tweak
1630 lrvg $s1,$tweak+8($sp)
1631 stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
1632
1633 nill %r0,0xffdf # switch back to original function code
1634 la %r1,0($key1) # restore pointer to $key1
1635 slgr $out,$inp
1636
1637 llgc $len,2*$SIZE_T-1($sp)
1638 nill $len,0x0f # $len%=16
1639 br $ra
1640
1641.align 16
1642.Lxts_km_vanilla:
1643___
1644$code.=<<___;
1645 # prepare and allocate stack frame at the top of 4K page
1646 # with 1K reserved for eventual signal handling
1647	lghi	$s0,-1024-256-16 # guarantee at least 256-byte buffer
1648 lghi $s1,-4096
1649 algr $s0,$sp
1650 lgr $fp,$sp
1651 ngr $s0,$s1 # align at page boundary
1652 slgr $fp,$s0 # total buffer size
1653 lgr $s2,$sp
1654 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1655 slgr $fp,$s1 # deduct reservation to get usable buffer size
1656	# buffer size is at least 256 and at most 3072+256-16
1657
1658 la $sp,1024($s0) # alloca
1659 nill $fp,0xfff0 # round to 16*n
1660 st${g} $s2,0($sp) # back-chain
1661 nill $len,0xfff0 # redundant
1662 st${g} $fp,$SIZE_T($sp)
1663
1664 slgr $len,$fp
1665 brc 1,.Lxts_km_go # not zero, no borrow
1666 algr $fp,$len # input is shorter than allocated buffer
1667 lghi $len,0
1668 st${g} $fp,$SIZE_T($sp)
1669
1670.Lxts_km_go:
1671 lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
1672 lrvg $s1,$tweak+8($s2)
1673
1674 la $s2,16($sp) # vector of ascending tweak values
1675 slgr $s2,$inp
1676 srlg $s3,$fp,4
1677 j .Lxts_km_start
1678
1679.Lxts_km_loop:
1680 la $s2,16($sp)
1681 slgr $s2,$inp
1682 srlg $s3,$fp,4
1683.Lxts_km_prepare:
1684 lghi $i1,0x87
1685 srag $i2,$s1,63 # broadcast upper bit
1686 ngr $i1,$i2 # rem
1687 algr $s0,$s0
1688 alcgr $s1,$s1
1689 xgr $s0,$i1
1690.Lxts_km_start:
1691 lrvgr $i1,$s0 # flip byte order
1692 lrvgr $i2,$s1
1693 stg $i1,0($s2,$inp)
1694 stg $i2,8($s2,$inp)
1695 xg $i1,0($inp)
1696 xg $i2,8($inp)
1697 stg $i1,0($out,$inp)
1698 stg $i2,8($out,$inp)
1699 la $inp,16($inp)
1700 brct $s3,.Lxts_km_prepare
1701
1702 slgr $inp,$fp # rewind $inp
1703 la $s2,0($out,$inp)
1704 lgr $s3,$fp
1705 .long 0xb92e00aa # km $s2,$s2
1706 brc 1,.-4 # pay attention to "partial completion"
1707
1708 la $s2,16($sp)
1709 slgr $s2,$inp
1710 srlg $s3,$fp,4
1711.Lxts_km_xor:
1712 lg $i1,0($out,$inp)
1713 lg $i2,8($out,$inp)
1714 xg $i1,0($s2,$inp)
1715 xg $i2,8($s2,$inp)
1716 stg $i1,0($out,$inp)
1717 stg $i2,8($out,$inp)
1718 la $inp,16($inp)
1719 brct $s3,.Lxts_km_xor
1720
1721 slgr $len,$fp
1722 brc 1,.Lxts_km_loop # not zero, no borrow
1723 algr $fp,$len
1724 lghi $len,0
1725 brc 4+1,.Lxts_km_loop # not zero
1726
1727 l${g} $i1,0($sp) # back-chain
1728 llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
1729 la $i2,16($sp)
1730 srlg $fp,$fp,4
1731.Lxts_km_zap:
1732 stg $i1,0($i2)
1733 stg $i1,8($i2)
1734 la $i2,16($i2)
1735 brct $fp,.Lxts_km_zap
1736
1737 la $sp,0($i1)
1738 llgc $len,2*$SIZE_T-1($i1)
1739 nill $len,0x0f # $len%=16
1740 bzr $ra
1741
1742 # generate one more tweak...
1743 lghi $i1,0x87
1744 srag $i2,$s1,63 # broadcast upper bit
1745 ngr $i1,$i2 # rem
1746 algr $s0,$s0
1747 alcgr $s1,$s1
1748 xgr $s0,$i1
1749
1750 ltr $len,$len # clear zero flag
1751 br $ra
1752.size _s390x_xts_km,.-_s390x_xts_km
1753
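# The doubling step in .Lxts_km_prepare is multiplication by x in
# GF(2^128) with the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1,
# whose low coefficients are 0x87. A hedged C sketch over the two
# little-endian 64-bit halves lo and hi:
#
#	carry = (uint64_t)((int64_t)hi >> 63) & 0x87;	/* broadcast MSB */
#	hi    = (hi << 1) | (lo >> 63);
#	lo    = (lo << 1) ^ carry;
#
# srag/ngr compute the conditional 0x87, algr/alcgr shift the 128-bit
# value left through the carry flag, and lrvg/lrvgr supply the byte swaps
# because s390x itself is big-endian.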
1754.globl AES_xts_encrypt
1755.type AES_xts_encrypt,\@function
1756.align 16
1757AES_xts_encrypt:
1758 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1759 xgr %r4,%r3
1760 xgr %r3,%r4
1761___
1762$code.=<<___ if ($SIZE_T==4);
1763 llgfr $len,$len
1764___
1765$code.=<<___;
1766 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1767	srag	$len,$len,4		# formally wrong, because it extends
1768					# the sign bit, but who can afford asking
1769					# to process more than 2^63-1 bytes?
1770					# I use it, because it sets the condition
1771					# code...
1772 bcr 8,$ra # abort if zero (i.e. less than 16)
1773___
1774$code.=<<___ if (!$softonly);
1775 llgf %r0,240($key2)
1776 lhi %r1,16
1777 clr %r0,%r1
1778 jl .Lxts_enc_software
1779
1780 st${g} $ra,5*$SIZE_T($sp)
1781 stm${g} %r6,$s3,6*$SIZE_T($sp)
1782
1783 sllg $len,$len,4 # $len&=~15
1784 slgr $out,$inp
1785
1786 # generate the tweak value
1787 l${g} $s3,$stdframe($sp) # pointer to iv
1788 la $s2,$tweak($sp)
1789 lmg $s0,$s1,0($s3)
1790 lghi $s3,16
1791 stmg $s0,$s1,0($s2)
1792 la %r1,0($key2) # $key2 is not needed anymore
1793 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1794 brc 1,.-4 # can this happen?
1795
1796 l %r0,240($key1)
1797 la %r1,0($key1) # $key1 is not needed anymore
1798 bras $ra,_s390x_xts_km
1799 jz .Lxts_enc_km_done
1800
1801 aghi $inp,-16 # take one step back
1802 la $i3,0($out,$inp) # put aside real $out
1803.Lxts_enc_km_steal:
1804 llgc $i1,16($inp)
1805 llgc $i2,0($out,$inp)
1806 stc $i1,0($out,$inp)
1807 stc $i2,16($out,$inp)
1808 la $inp,1($inp)
1809 brct $len,.Lxts_enc_km_steal
1810
1811 la $s2,0($i3)
1812 lghi $s3,16
1813 lrvgr $i1,$s0 # flip byte order
1814 lrvgr $i2,$s1
1815 xg $i1,0($s2)
1816 xg $i2,8($s2)
1817 stg $i1,0($s2)
1818 stg $i2,8($s2)
1819 .long 0xb92e00aa # km $s2,$s2
1820 brc 1,.-4 # can this happen?
1821 lrvgr $i1,$s0 # flip byte order
1822 lrvgr $i2,$s1
1823 xg $i1,0($i3)
1824 xg $i2,8($i3)
1825 stg $i1,0($i3)
1826 stg $i2,8($i3)
1827
1828.Lxts_enc_km_done:
1829 stg $sp,$tweak+0($sp) # wipe tweak
1830 stg $sp,$tweak+8($sp)
1831 l${g} $ra,5*$SIZE_T($sp)
1832 lm${g} %r6,$s3,6*$SIZE_T($sp)
1833 br $ra
1834.align 16
1835.Lxts_enc_software:
1836___
1837$code.=<<___;
1838 stm${g} %r6,$ra,6*$SIZE_T($sp)
1839
1840 slgr $out,$inp
1841
1842 l${g} $s3,$stdframe($sp) # ivp
1843 llgf $s0,0($s3) # load iv
1844 llgf $s1,4($s3)
1845 llgf $s2,8($s3)
1846 llgf $s3,12($s3)
1847 stm${g} %r2,%r5,2*$SIZE_T($sp)
1848 la $key,0($key2)
1849 larl $tbl,AES_Te
1850 bras $ra,_s390x_AES_encrypt # generate the tweak
1851 lm${g} %r2,%r5,2*$SIZE_T($sp)
1852 stm $s0,$s3,$tweak($sp) # save the tweak
1853 j .Lxts_enc_enter
1854
1855.align 16
1856.Lxts_enc_loop:
1857 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1858 lrvg $s3,$tweak+8($sp)
1859 lghi %r1,0x87
1860 srag %r0,$s3,63 # broadcast upper bit
1861 ngr %r1,%r0 # rem
1862 algr $s1,$s1
1863 alcgr $s3,$s3
1864 xgr $s1,%r1
1865 lrvgr $s1,$s1 # flip byte order
1866 lrvgr $s3,$s3
1867 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1868 stg $s1,$tweak+0($sp) # save the tweak
1869 llgfr $s1,$s1
1870 srlg $s2,$s3,32
1871 stg $s3,$tweak+8($sp)
1872 llgfr $s3,$s3
1873 la $inp,16($inp) # $inp+=16
1874.Lxts_enc_enter:
1875 x $s0,0($inp) # ^=*($inp)
1876 x $s1,4($inp)
1877 x $s2,8($inp)
1878 x $s3,12($inp)
1879 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
1880 la $key,0($key1)
1881 bras $ra,_s390x_AES_encrypt
1882 lm${g} %r2,%r5,2*$SIZE_T($sp)
1883 x $s0,$tweak+0($sp) # ^=tweak
1884 x $s1,$tweak+4($sp)
1885 x $s2,$tweak+8($sp)
1886 x $s3,$tweak+12($sp)
1887 st $s0,0($out,$inp)
1888 st $s1,4($out,$inp)
1889 st $s2,8($out,$inp)
1890 st $s3,12($out,$inp)
1891 brct${g} $len,.Lxts_enc_loop
1892
1893 llgc $len,`2*$SIZE_T-1`($sp)
1894 nill $len,0x0f # $len%16
1895 jz .Lxts_enc_done
1896
1897 la $i3,0($inp,$out) # put aside real $out
1898.Lxts_enc_steal:
1899 llgc %r0,16($inp)
1900 llgc %r1,0($out,$inp)
1901 stc %r0,0($out,$inp)
1902 stc %r1,16($out,$inp)
1903 la $inp,1($inp)
1904 brct $len,.Lxts_enc_steal
1905 la $out,0($i3) # restore real $out
1906
1907 # generate last tweak...
1908 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1909 lrvg $s3,$tweak+8($sp)
1910 lghi %r1,0x87
1911 srag %r0,$s3,63 # broadcast upper bit
1912 ngr %r1,%r0 # rem
1913 algr $s1,$s1
1914 alcgr $s3,$s3
1915 xgr $s1,%r1
1916 lrvgr $s1,$s1 # flip byte order
1917 lrvgr $s3,$s3
1918 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1919 stg $s1,$tweak+0($sp) # save the tweak
1920 llgfr $s1,$s1
1921 srlg $s2,$s3,32
1922 stg $s3,$tweak+8($sp)
1923 llgfr $s3,$s3
1924
1925	x	$s0,0($out)		# ^=*(inp)|stolen cipher-text
1926 x $s1,4($out)
1927 x $s2,8($out)
1928 x $s3,12($out)
1929 st${g} $out,4*$SIZE_T($sp)
1930 la $key,0($key1)
1931 bras $ra,_s390x_AES_encrypt
1932 l${g} $out,4*$SIZE_T($sp)
1933 x $s0,`$tweak+0`($sp) # ^=tweak
1934 x $s1,`$tweak+4`($sp)
1935 x $s2,`$tweak+8`($sp)
1936 x $s3,`$tweak+12`($sp)
1937 st $s0,0($out)
1938 st $s1,4($out)
1939 st $s2,8($out)
1940 st $s3,12($out)
1941
1942.Lxts_enc_done:
1943 stg $sp,$tweak+0($sp) # wipe tweak
1944	stg	$sp,$tweak+8($sp)
1945 lm${g} %r6,$ra,6*$SIZE_T($sp)
1946 br $ra
1947.size AES_xts_encrypt,.-AES_xts_encrypt
1948___
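# The steal loops above implement standard XTS ciphertext stealing: for a
# trailing partial block of r = len%16 bytes, the leading r bytes of the
# last full ciphertext block become the final partial output, the freed
# positions are refilled with the remaining plaintext bytes, and that
# block is encrypted once more under one extra tweak. In hedged pseudo-C:
#
#	memcpy(out_tail, C_last, r);	/* stc into 0($out,$inp)	*/
#	memcpy(C_last, P_tail, r);	/* llgc from 16($inp)		*/
#	C_last = E(C_last ^ T_next) ^ T_next;
#
# which is why the hardware path first takes "one step back" by 16 bytes.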
1949# void AES_xts_decrypt(const char *inp,char *out,size_t len,
1950# const AES_KEY *key1, const AES_KEY *key2,
1951# const unsigned char iv[16]);
1952#
1953$code.=<<___;
1954.globl AES_xts_decrypt
1955.type AES_xts_decrypt,\@function
1956.align 16
1957AES_xts_decrypt:
1958 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1959 xgr %r4,%r3
1960 xgr %r3,%r4
1961___
1962$code.=<<___ if ($SIZE_T==4);
1963 llgfr $len,$len
1964___
1965$code.=<<___;
1966 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1967 aghi $len,-16
1968	bcr	4,$ra			# abort if less than zero. Formally
1969 # wrong, because $len is unsigned,
1970 # but who can afford asking to
1971 # process more than 2^63-1 bytes?
1972 tmll $len,0x0f
1973 jnz .Lxts_dec_proceed
1974 aghi $len,16
1975.Lxts_dec_proceed:
1976___
1977$code.=<<___ if (!$softonly);
1978 llgf %r0,240($key2)
1979 lhi %r1,16
1980 clr %r0,%r1
1981 jl .Lxts_dec_software
1982
1983 st${g} $ra,5*$SIZE_T($sp)
1984 stm${g} %r6,$s3,6*$SIZE_T($sp)
1985
1986 nill $len,0xfff0 # $len&=~15
1987 slgr $out,$inp
1988
1989 # generate the tweak value
1990 l${g} $s3,$stdframe($sp) # pointer to iv
1991 la $s2,$tweak($sp)
1992 lmg $s0,$s1,0($s3)
1993 lghi $s3,16
1994 stmg $s0,$s1,0($s2)
1995 la %r1,0($key2) # $key2 is not needed past this point
1996 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1997 brc 1,.-4 # can this happen?
1998
1999 l %r0,240($key1)
2000 la %r1,0($key1) # $key1 is not needed anymore
2001
2002 ltgr $len,$len
2003 jz .Lxts_dec_km_short
2004 bras $ra,_s390x_xts_km
2005 jz .Lxts_dec_km_done
2006
2007 lrvgr $s2,$s0 # make copy in reverse byte order
2008 lrvgr $s3,$s1
2009 j .Lxts_dec_km_2ndtweak
2010
2011.Lxts_dec_km_short:
2012 llgc $len,`2*$SIZE_T-1`($sp)
2013 nill $len,0x0f # $len%=16
2014 lrvg $s0,$tweak+0($sp) # load the tweak
2015 lrvg $s1,$tweak+8($sp)
2016 lrvgr $s2,$s0 # make copy in reverse byte order
2017 lrvgr $s3,$s1
2018
2019.Lxts_dec_km_2ndtweak:
2020 lghi $i1,0x87
2021 srag $i2,$s1,63 # broadcast upper bit
2022 ngr $i1,$i2 # rem
2023 algr $s0,$s0
2024 alcgr $s1,$s1
2025 xgr $s0,$i1
2026 lrvgr $i1,$s0 # flip byte order
2027 lrvgr $i2,$s1
2028
2029 xg $i1,0($inp)
2030 xg $i2,8($inp)
2031 stg $i1,0($out,$inp)
2032 stg $i2,8($out,$inp)
2033 la $i2,0($out,$inp)
2034 lghi $i3,16
2035 .long 0xb92e0066 # km $i2,$i2
2036 brc 1,.-4 # can this happen?
2037 lrvgr $i1,$s0
2038 lrvgr $i2,$s1
2039 xg $i1,0($out,$inp)
2040 xg $i2,8($out,$inp)
2041 stg $i1,0($out,$inp)
2042 stg $i2,8($out,$inp)
2043
2044 la $i3,0($out,$inp) # put aside real $out
2045.Lxts_dec_km_steal:
2046 llgc $i1,16($inp)
2047 llgc $i2,0($out,$inp)
2048 stc $i1,0($out,$inp)
2049 stc $i2,16($out,$inp)
2050 la $inp,1($inp)
2051 brct $len,.Lxts_dec_km_steal
2052
2053 lgr $s0,$s2
2054 lgr $s1,$s3
2055 xg $s0,0($i3)
2056 xg $s1,8($i3)
2057 stg $s0,0($i3)
2058 stg $s1,8($i3)
2059 la $s0,0($i3)
2060 lghi $s1,16
2061 .long 0xb92e0088 # km $s0,$s0
2062 brc 1,.-4 # can this happen?
2063 xg $s2,0($i3)
2064 xg $s3,8($i3)
2065 stg $s2,0($i3)
2066 stg $s3,8($i3)
2067.Lxts_dec_km_done:
2068 stg $sp,$tweak+0($sp) # wipe tweak
2069 stg $sp,$tweak+8($sp)
2070 l${g} $ra,5*$SIZE_T($sp)
2071 lm${g} %r6,$s3,6*$SIZE_T($sp)
2072 br $ra
2073.align 16
2074.Lxts_dec_software:
2075___
2076$code.=<<___;
2077 stm${g} %r6,$ra,6*$SIZE_T($sp)
2078
2079 srlg $len,$len,4
2080 slgr $out,$inp
2081
2082 l${g} $s3,$stdframe($sp) # ivp
2083 llgf $s0,0($s3) # load iv
2084 llgf $s1,4($s3)
2085 llgf $s2,8($s3)
2086 llgf $s3,12($s3)
2087 stm${g} %r2,%r5,2*$SIZE_T($sp)
2088 la $key,0($key2)
2089 larl $tbl,AES_Te
2090 bras $ra,_s390x_AES_encrypt # generate the tweak
2091 lm${g} %r2,%r5,2*$SIZE_T($sp)
2092 larl $tbl,AES_Td
2093 lt${g}r $len,$len
2094 stm $s0,$s3,$tweak($sp) # save the tweak
2095 jz .Lxts_dec_short
2096 j .Lxts_dec_enter
2097
2098.align 16
2099.Lxts_dec_loop:
2100 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2101 lrvg $s3,$tweak+8($sp)
2102 lghi %r1,0x87
2103 srag %r0,$s3,63 # broadcast upper bit
2104 ngr %r1,%r0 # rem
2105 algr $s1,$s1
2106 alcgr $s3,$s3
2107 xgr $s1,%r1
2108 lrvgr $s1,$s1 # flip byte order
2109 lrvgr $s3,$s3
2110 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2111 stg $s1,$tweak+0($sp) # save the tweak
2112 llgfr $s1,$s1
2113 srlg $s2,$s3,32
2114 stg $s3,$tweak+8($sp)
2115 llgfr $s3,$s3
2116.Lxts_dec_enter:
2117 x $s0,0($inp) # tweak^=*(inp)
2118 x $s1,4($inp)
2119 x $s2,8($inp)
2120 x $s3,12($inp)
2121 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
2122 la $key,0($key1)
2123 bras $ra,_s390x_AES_decrypt
2124 lm${g} %r2,%r5,2*$SIZE_T($sp)
2125 x $s0,$tweak+0($sp) # ^=tweak
2126 x $s1,$tweak+4($sp)
2127 x $s2,$tweak+8($sp)
2128 x $s3,$tweak+12($sp)
2129 st $s0,0($out,$inp)
2130 st $s1,4($out,$inp)
2131 st $s2,8($out,$inp)
2132 st $s3,12($out,$inp)
2133 la $inp,16($inp)
2134 brct${g} $len,.Lxts_dec_loop
2135
2136 llgc $len,`2*$SIZE_T-1`($sp)
2137 nill $len,0x0f # $len%16
2138 jz .Lxts_dec_done
2139
2140 # generate pair of tweaks...
2141 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2142 lrvg $s3,$tweak+8($sp)
2143 lghi %r1,0x87
2144 srag %r0,$s3,63 # broadcast upper bit
2145 ngr %r1,%r0 # rem
2146 algr $s1,$s1
2147 alcgr $s3,$s3
2148 xgr $s1,%r1
2149 lrvgr $i2,$s1 # flip byte order
2150 lrvgr $i3,$s3
2151 stmg $i2,$i3,$tweak($sp) # save the 1st tweak
2152 j .Lxts_dec_2ndtweak
2153
2154.align 16
2155.Lxts_dec_short:
2156 llgc $len,`2*$SIZE_T-1`($sp)
2157 nill $len,0x0f # $len%16
2158 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2159 lrvg $s3,$tweak+8($sp)
2160.Lxts_dec_2ndtweak:
2161 lghi %r1,0x87
2162 srag %r0,$s3,63 # broadcast upper bit
2163 ngr %r1,%r0 # rem
2164 algr $s1,$s1
2165 alcgr $s3,$s3
2166 xgr $s1,%r1
2167 lrvgr $s1,$s1 # flip byte order
2168 lrvgr $s3,$s3
2169 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2170 stg $s1,$tweak-16+0($sp) # save the 2nd tweak
2171 llgfr $s1,$s1
2172 srlg $s2,$s3,32
2173 stg $s3,$tweak-16+8($sp)
2174 llgfr $s3,$s3
2175
2176 x $s0,0($inp) # tweak_the_2nd^=*(inp)
2177 x $s1,4($inp)
2178 x $s2,8($inp)
2179 x $s3,12($inp)
2180 stm${g} %r2,%r3,2*$SIZE_T($sp)
2181 la $key,0($key1)
2182 bras $ra,_s390x_AES_decrypt
2183 lm${g} %r2,%r5,2*$SIZE_T($sp)
2184 x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
2185 x $s1,$tweak-16+4($sp)
2186 x $s2,$tweak-16+8($sp)
2187 x $s3,$tweak-16+12($sp)
2188 st $s0,0($out,$inp)
2189 st $s1,4($out,$inp)
2190 st $s2,8($out,$inp)
2191 st $s3,12($out,$inp)
2192
2193 la $i3,0($out,$inp) # put aside real $out
2194.Lxts_dec_steal:
2195 llgc %r0,16($inp)
2196 llgc %r1,0($out,$inp)
2197 stc %r0,0($out,$inp)
2198 stc %r1,16($out,$inp)
2199 la $inp,1($inp)
2200 brct $len,.Lxts_dec_steal
2201 la $out,0($i3) # restore real $out
2202
2203 lm $s0,$s3,$tweak($sp) # load the 1st tweak
2204 x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
2205 x $s1,4($out)
2206 x $s2,8($out)
2207 x $s3,12($out)
2208 st${g} $out,4*$SIZE_T($sp)
2209 la $key,0($key1)
2210 bras $ra,_s390x_AES_decrypt
2211 l${g} $out,4*$SIZE_T($sp)
2212 x $s0,$tweak+0($sp) # ^=tweak
2213 x $s1,$tweak+4($sp)
2214 x $s2,$tweak+8($sp)
2215 x $s3,$tweak+12($sp)
2216 st $s0,0($out)
2217 st $s1,4($out)
2218 st $s2,8($out)
2219 st $s3,12($out)
2220 stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
2221 stg $sp,$tweak-16+8($sp)
2222.Lxts_dec_done:
2223 stg $sp,$tweak+0($sp) # wipe tweak
2224	stg $sp,$tweak+8($sp)
2225 lm${g} %r6,$ra,6*$SIZE_T($sp)
2226 br $ra
2227.size AES_xts_decrypt,.-AES_xts_decrypt
2228___
2229}
2230$code.=<<___;
2231.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
2232.comm OPENSSL_s390xcap_P,16,8
2233___
2234
2235$code =~ s/\`([^\`]*)\`/eval $1/gem;
2236print $code;
2237close STDOUT; # force flush
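# A note on the tweak arithmetic above: the shift/ngr/xgr sequence at
# .Lxts_dec_loop is multiplication by x in GF(2^128) -- shift the 128-bit
# tweak left by one bit and, if a bit fell off the top, fold it back in
# as 0x87 (the low byte of the XTS polynomial x^128+x^7+x^2+x+1); the
# .Lxts_dec_steal loop is the usual ciphertext-stealing byte swap for a
# trailing partial block. A minimal Perl sketch of the doubling, assuming
# a 64-bit perl and the hypothetical helper name xts_double:
sub xts_double {
	my ($lo, $hi) = @_;			# little-endian 64-bit halves
	my $carry = ($hi >> 63) & 1;		# srag/ngr: "broadcast upper bit"
	$hi = (($hi << 1) | (($lo >> 63) & 1)) & 0xffffffffffffffff;
	$lo = (($lo << 1) ^ ($carry ? 0x87 : 0)) & 0xffffffffffffffff;
	return ($lo, $hi);
}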
diff --git a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
deleted file mode 100755
index 403c4d1290..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
+++ /dev/null
@@ -1,1182 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# Version 1.1
10#
11# The major reason for the undertaken effort was to mitigate the hazard
12# of cache-timing attacks. This is [currently and initially!] addressed
13# in two ways: 1. S-boxes are compressed from 5KB to 2KB+256B each.
14# 2. References to them are scheduled for L2 cache latency, meaning
15# that the tables don't have to reside in the L1 cache. Once again, this
16# is an initial draft and one should expect more countermeasures to
17# be implemented...
18#
19# Version 1.1 prefetches T[ed]4 in order to mitigate the attack on the
20# last round.
21#
22# Even though performance was not the primary goal [on the contrary,
23# extra shifts "induced" by the compressed S-box and the longer loop
24# epilogue "induced" by scheduling for L2 have a negative effect on
25# performance], the code turned out to run at ~23 cycles per processed
26# byte en-/decrypted with a 128-bit key. This is a pretty good result
27# for code with the mentioned qualities on an UltraSPARC core. Compared
28# to Sun C generated code, the encrypt procedure runs just a few percent
29# faster, while the decrypt one is a whole 50% faster [yes, Sun C failed
30# to generate an optimal decrypt procedure]. Compared to GNU C generated
31# code, both procedures are more than 60% faster:-)
32
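# The compression mentioned above works because _data_word (below) emits
# every table word twice: each 256-entry table occupies 2KB of doubled
# 64-bit slots, so indices are scaled by 8 (the "and ...,2040" masks, as
# 2040 == 0xff<<3) and a single ldx followed by a right shift of 0, 8,
# 16 or 24 bits stands in for the four rotated 1KB tables of the classic
# implementation. A rough Perl model, with @Te32 and te_rot as
# assumed/hypothetical names:
my @Te64 = map { ($_ << 32) | $_ } @Te32;	# each word stored twice
sub te_rot {
	my ($byte, $rot) = @_;			# $rot is 0, 8, 16 or 24
	return ($Te64[$byte] >> $rot) & 0xffffffff;	# word rotated right by $rot
}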
33$bits=32;
34for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
35if ($bits==64) { $bias=2047; $frame=192; }
36else { $bias=0; $frame=112; }
37$locals=16;
38
39$acc0="%l0";
40$acc1="%o0";
41$acc2="%o1";
42$acc3="%o2";
43
44$acc4="%l1";
45$acc5="%o3";
46$acc6="%o4";
47$acc7="%o5";
48
49$acc8="%l2";
50$acc9="%o7";
51$acc10="%g1";
52$acc11="%g2";
53
54$acc12="%l3";
55$acc13="%g3";
56$acc14="%g4";
57$acc15="%g5";
58
59$t0="%l4";
60$t1="%l5";
61$t2="%l6";
62$t3="%l7";
63
64$s0="%i0";
65$s1="%i1";
66$s2="%i2";
67$s3="%i3";
68$tbl="%i4";
69$key="%i5";
70$rounds="%i7";	# aliases with the return address, which is off-loaded to the stack
71
72sub _data_word()
73{ my $i;
74 while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
75}
76
77$code.=<<___ if ($bits==64);
78.register %g2,#scratch
79.register %g3,#scratch
80___
81$code.=<<___;
82.section ".text",#alloc,#execinstr
83
84.align 256
85AES_Te:
86___
87&_data_word(
88 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
89 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
90 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
91 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
92 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
93 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
94 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
95 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
96 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
97 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
98 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
99 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
100 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
101 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
102 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
103 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
104 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
105 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
106 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
107 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
108 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
109 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
110 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
111 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
112 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
113 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
114 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
115 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
116 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
117 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
118 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
119 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
120 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
121 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
122 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
123 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
124 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
125 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
126 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
127 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
128 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
129 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
130 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
131 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
132 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
133 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
134 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
135 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
136 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
137 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
138 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
139 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
140 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
141 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
142 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
143 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
144 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
145 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
146 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
147 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
148 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
149 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
150 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
151 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
152$code.=<<___;
153 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
154 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
155 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
156 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
157 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
158 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
159 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
160 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
161 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
162 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
163 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
164 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
165 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
166 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
167 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
168 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
169 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
170 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
171 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
172 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
173 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
174 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
175 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
176 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
177 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
178 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
179 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
180 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
181 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
182 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
183 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
184 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
185.type AES_Te,#object
186.size AES_Te,(.-AES_Te)
187
188.align 64
189.skip 16
190_sparcv9_AES_encrypt:
191 save %sp,-$frame-$locals,%sp
192 stx %i7,[%sp+$bias+$frame+0] ! off-load return address
193 ld [$key+240],$rounds
194 ld [$key+0],$t0
195 ld [$key+4],$t1 !
196 ld [$key+8],$t2
197 srl $rounds,1,$rounds
198 xor $t0,$s0,$s0
199 ld [$key+12],$t3
200 srl $s0,21,$acc0
201 xor $t1,$s1,$s1
202 ld [$key+16],$t0
203 srl $s1,13,$acc1 !
204 xor $t2,$s2,$s2
205 ld [$key+20],$t1
206 xor $t3,$s3,$s3
207 ld [$key+24],$t2
208 and $acc0,2040,$acc0
209 ld [$key+28],$t3
210 nop
211.Lenc_loop:
212 srl $s2,5,$acc2 !
213 and $acc1,2040,$acc1
214 ldx [$tbl+$acc0],$acc0
215 sll $s3,3,$acc3
216 and $acc2,2040,$acc2
217 ldx [$tbl+$acc1],$acc1
218 srl $s1,21,$acc4
219 and $acc3,2040,$acc3
220 ldx [$tbl+$acc2],$acc2 !
221 srl $s2,13,$acc5
222 and $acc4,2040,$acc4
223 ldx [$tbl+$acc3],$acc3
224 srl $s3,5,$acc6
225 and $acc5,2040,$acc5
226 ldx [$tbl+$acc4],$acc4
227 fmovs %f0,%f0
228 sll $s0,3,$acc7 !
229 and $acc6,2040,$acc6
230 ldx [$tbl+$acc5],$acc5
231 srl $s2,21,$acc8
232 and $acc7,2040,$acc7
233 ldx [$tbl+$acc6],$acc6
234 srl $s3,13,$acc9
235 and $acc8,2040,$acc8
236 ldx [$tbl+$acc7],$acc7 !
237 srl $s0,5,$acc10
238 and $acc9,2040,$acc9
239 ldx [$tbl+$acc8],$acc8
240 sll $s1,3,$acc11
241 and $acc10,2040,$acc10
242 ldx [$tbl+$acc9],$acc9
243 fmovs %f0,%f0
244 srl $s3,21,$acc12 !
245 and $acc11,2040,$acc11
246 ldx [$tbl+$acc10],$acc10
247 srl $s0,13,$acc13
248 and $acc12,2040,$acc12
249 ldx [$tbl+$acc11],$acc11
250 srl $s1,5,$acc14
251 and $acc13,2040,$acc13
252 ldx [$tbl+$acc12],$acc12 !
253 sll $s2,3,$acc15
254 and $acc14,2040,$acc14
255 ldx [$tbl+$acc13],$acc13
256 and $acc15,2040,$acc15
257 add $key,32,$key
258 ldx [$tbl+$acc14],$acc14
259 fmovs %f0,%f0
260 subcc $rounds,1,$rounds !
261 ldx [$tbl+$acc15],$acc15
262 bz,a,pn %icc,.Lenc_last
263 add $tbl,2048,$rounds
264
265 srlx $acc1,8,$acc1
266 xor $acc0,$t0,$t0
267 ld [$key+0],$s0
268 fmovs %f0,%f0
269 srlx $acc2,16,$acc2 !
270 xor $acc1,$t0,$t0
271 ld [$key+4],$s1
272 srlx $acc3,24,$acc3
273 xor $acc2,$t0,$t0
274 ld [$key+8],$s2
275 srlx $acc5,8,$acc5
276 xor $acc3,$t0,$t0
277 ld [$key+12],$s3 !
278 srlx $acc6,16,$acc6
279 xor $acc4,$t1,$t1
280 fmovs %f0,%f0
281 srlx $acc7,24,$acc7
282 xor $acc5,$t1,$t1
283 srlx $acc9,8,$acc9
284 xor $acc6,$t1,$t1
285 srlx $acc10,16,$acc10 !
286 xor $acc7,$t1,$t1
287 srlx $acc11,24,$acc11
288 xor $acc8,$t2,$t2
289 srlx $acc13,8,$acc13
290 xor $acc9,$t2,$t2
291 srlx $acc14,16,$acc14
292 xor $acc10,$t2,$t2
293 srlx $acc15,24,$acc15 !
294 xor $acc11,$t2,$t2
295 xor $acc12,$acc14,$acc14
296 xor $acc13,$t3,$t3
297 srl $t0,21,$acc0
298 xor $acc14,$t3,$t3
299 srl $t1,13,$acc1
300 xor $acc15,$t3,$t3
301
302 and $acc0,2040,$acc0 !
303 srl $t2,5,$acc2
304 and $acc1,2040,$acc1
305 ldx [$tbl+$acc0],$acc0
306 sll $t3,3,$acc3
307 and $acc2,2040,$acc2
308 ldx [$tbl+$acc1],$acc1
309 fmovs %f0,%f0
310 srl $t1,21,$acc4 !
311 and $acc3,2040,$acc3
312 ldx [$tbl+$acc2],$acc2
313 srl $t2,13,$acc5
314 and $acc4,2040,$acc4
315 ldx [$tbl+$acc3],$acc3
316 srl $t3,5,$acc6
317 and $acc5,2040,$acc5
318 ldx [$tbl+$acc4],$acc4 !
319 sll $t0,3,$acc7
320 and $acc6,2040,$acc6
321 ldx [$tbl+$acc5],$acc5
322 srl $t2,21,$acc8
323 and $acc7,2040,$acc7
324 ldx [$tbl+$acc6],$acc6
325 fmovs %f0,%f0
326 srl $t3,13,$acc9 !
327 and $acc8,2040,$acc8
328 ldx [$tbl+$acc7],$acc7
329 srl $t0,5,$acc10
330 and $acc9,2040,$acc9
331 ldx [$tbl+$acc8],$acc8
332 sll $t1,3,$acc11
333 and $acc10,2040,$acc10
334 ldx [$tbl+$acc9],$acc9 !
335 srl $t3,21,$acc12
336 and $acc11,2040,$acc11
337 ldx [$tbl+$acc10],$acc10
338 srl $t0,13,$acc13
339 and $acc12,2040,$acc12
340 ldx [$tbl+$acc11],$acc11
341 fmovs %f0,%f0
342 srl $t1,5,$acc14 !
343 and $acc13,2040,$acc13
344 ldx [$tbl+$acc12],$acc12
345 sll $t2,3,$acc15
346 and $acc14,2040,$acc14
347 ldx [$tbl+$acc13],$acc13
348 srlx $acc1,8,$acc1
349 and $acc15,2040,$acc15
350 ldx [$tbl+$acc14],$acc14 !
351
352 srlx $acc2,16,$acc2
353 xor $acc0,$s0,$s0
354 ldx [$tbl+$acc15],$acc15
355 srlx $acc3,24,$acc3
356 xor $acc1,$s0,$s0
357 ld [$key+16],$t0
358 fmovs %f0,%f0
359 srlx $acc5,8,$acc5 !
360 xor $acc2,$s0,$s0
361 ld [$key+20],$t1
362 srlx $acc6,16,$acc6
363 xor $acc3,$s0,$s0
364 ld [$key+24],$t2
365 srlx $acc7,24,$acc7
366 xor $acc4,$s1,$s1
367 ld [$key+28],$t3 !
368 srlx $acc9,8,$acc9
369 xor $acc5,$s1,$s1
370 ldx [$tbl+2048+0],%g0 ! prefetch te4
371 srlx $acc10,16,$acc10
372 xor $acc6,$s1,$s1
373 ldx [$tbl+2048+32],%g0 ! prefetch te4
374 srlx $acc11,24,$acc11
375 xor $acc7,$s1,$s1
376 ldx [$tbl+2048+64],%g0 ! prefetch te4
377 srlx $acc13,8,$acc13
378 xor $acc8,$s2,$s2
379 ldx [$tbl+2048+96],%g0 ! prefetch te4
380 srlx $acc14,16,$acc14 !
381 xor $acc9,$s2,$s2
382 ldx [$tbl+2048+128],%g0 ! prefetch te4
383 srlx $acc15,24,$acc15
384 xor $acc10,$s2,$s2
385 ldx [$tbl+2048+160],%g0 ! prefetch te4
386 srl $s0,21,$acc0
387 xor $acc11,$s2,$s2
388 ldx [$tbl+2048+192],%g0 ! prefetch te4
389 xor $acc12,$acc14,$acc14
390 xor $acc13,$s3,$s3
391 ldx [$tbl+2048+224],%g0 ! prefetch te4
392 srl $s1,13,$acc1 !
393 xor $acc14,$s3,$s3
394 xor $acc15,$s3,$s3
395 ba .Lenc_loop
396 and $acc0,2040,$acc0
397
398.align 32
399.Lenc_last:
400 srlx $acc1,8,$acc1 !
401 xor $acc0,$t0,$t0
402 ld [$key+0],$s0
403 srlx $acc2,16,$acc2
404 xor $acc1,$t0,$t0
405 ld [$key+4],$s1
406 srlx $acc3,24,$acc3
407 xor $acc2,$t0,$t0
408 ld [$key+8],$s2 !
409 srlx $acc5,8,$acc5
410 xor $acc3,$t0,$t0
411 ld [$key+12],$s3
412 srlx $acc6,16,$acc6
413 xor $acc4,$t1,$t1
414 srlx $acc7,24,$acc7
415 xor $acc5,$t1,$t1
416 srlx $acc9,8,$acc9 !
417 xor $acc6,$t1,$t1
418 srlx $acc10,16,$acc10
419 xor $acc7,$t1,$t1
420 srlx $acc11,24,$acc11
421 xor $acc8,$t2,$t2
422 srlx $acc13,8,$acc13
423 xor $acc9,$t2,$t2
424 srlx $acc14,16,$acc14 !
425 xor $acc10,$t2,$t2
426 srlx $acc15,24,$acc15
427 xor $acc11,$t2,$t2
428 xor $acc12,$acc14,$acc14
429 xor $acc13,$t3,$t3
430 srl $t0,24,$acc0
431 xor $acc14,$t3,$t3
432 srl $t1,16,$acc1 !
433 xor $acc15,$t3,$t3
434
435 srl $t2,8,$acc2
436 and $acc1,255,$acc1
437 ldub [$rounds+$acc0],$acc0
438 srl $t1,24,$acc4
439 and $acc2,255,$acc2
440 ldub [$rounds+$acc1],$acc1
441 srl $t2,16,$acc5 !
442 and $t3,255,$acc3
443 ldub [$rounds+$acc2],$acc2
444 ldub [$rounds+$acc3],$acc3
445 srl $t3,8,$acc6
446 and $acc5,255,$acc5
447 ldub [$rounds+$acc4],$acc4
448 fmovs %f0,%f0
449 srl $t2,24,$acc8 !
450 and $acc6,255,$acc6
451 ldub [$rounds+$acc5],$acc5
452 srl $t3,16,$acc9
453 and $t0,255,$acc7
454 ldub [$rounds+$acc6],$acc6
455 ldub [$rounds+$acc7],$acc7
456 fmovs %f0,%f0
457 srl $t0,8,$acc10 !
458 and $acc9,255,$acc9
459 ldub [$rounds+$acc8],$acc8
460 srl $t3,24,$acc12
461 and $acc10,255,$acc10
462 ldub [$rounds+$acc9],$acc9
463 srl $t0,16,$acc13
464 and $t1,255,$acc11
465 ldub [$rounds+$acc10],$acc10 !
466 srl $t1,8,$acc14
467 and $acc13,255,$acc13
468 ldub [$rounds+$acc11],$acc11
469 ldub [$rounds+$acc12],$acc12
470 and $acc14,255,$acc14
471 ldub [$rounds+$acc13],$acc13
472 and $t2,255,$acc15
473 ldub [$rounds+$acc14],$acc14 !
474
475 sll $acc0,24,$acc0
476 xor $acc3,$s0,$s0
477 ldub [$rounds+$acc15],$acc15
478 sll $acc1,16,$acc1
479 xor $acc0,$s0,$s0
480 ldx [%sp+$bias+$frame+0],%i7 ! restore return address
481 fmovs %f0,%f0
482 sll $acc2,8,$acc2 !
483 xor $acc1,$s0,$s0
484 sll $acc4,24,$acc4
485 xor $acc2,$s0,$s0
486 sll $acc5,16,$acc5
487 xor $acc7,$s1,$s1
488 sll $acc6,8,$acc6
489 xor $acc4,$s1,$s1
490 sll $acc8,24,$acc8 !
491 xor $acc5,$s1,$s1
492 sll $acc9,16,$acc9
493 xor $acc11,$s2,$s2
494 sll $acc10,8,$acc10
495 xor $acc6,$s1,$s1
496 sll $acc12,24,$acc12
497 xor $acc8,$s2,$s2
498 sll $acc13,16,$acc13 !
499 xor $acc9,$s2,$s2
500 sll $acc14,8,$acc14
501 xor $acc10,$s2,$s2
502 xor $acc12,$acc14,$acc14
503 xor $acc13,$s3,$s3
504 xor $acc14,$s3,$s3
505 xor $acc15,$s3,$s3
506
507 ret
508 restore
509.type _sparcv9_AES_encrypt,#function
510.size _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
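# A note on the halved loop counter: _sparcv9_AES_encrypt shifts
# key->rounds right by one (srl $rounds,1,$rounds) because each pass
# through .Lenc_loop performs two rounds, ping-ponging the state between
# the $s and $t register sets, and the final half-iteration branches to
# .Lenc_last for the Te4-based last round. A control-flow sketch in
# Perl, with $Nr (10/12/14), round() and last_round() as hypothetical
# stand-ins and @s/@t as the four-word state:
for (my $i = 0; $i < $Nr/2 - 1; $i++) {
	@t = round(@s);			# first half-round, s -> t
	@s = round(@t);			# second half-round, t -> s
}
@t = round(@s);				# penultimate round
@s = last_round(@t);			# .Lenc_last: final round via Te4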
511
512.align 32
513.globl AES_encrypt
514AES_encrypt:
515 or %o0,%o1,%g1
516 andcc %g1,3,%g0
517 bnz,pn %xcc,.Lunaligned_enc
518 save %sp,-$frame,%sp
519
520 ld [%i0+0],%o0
521 ld [%i0+4],%o1
522 ld [%i0+8],%o2
523 ld [%i0+12],%o3
524
5251: call .+8
526 add %o7,AES_Te-1b,%o4
527 call _sparcv9_AES_encrypt
528 mov %i2,%o5
529
530 st %o0,[%i1+0]
531 st %o1,[%i1+4]
532 st %o2,[%i1+8]
533 st %o3,[%i1+12]
534
535 ret
536 restore
537
538.align 32
539.Lunaligned_enc:
540 ldub [%i0+0],%l0
541 ldub [%i0+1],%l1
542 ldub [%i0+2],%l2
543
544 sll %l0,24,%l0
545 ldub [%i0+3],%l3
546 sll %l1,16,%l1
547 ldub [%i0+4],%l4
548 sll %l2,8,%l2
549 or %l1,%l0,%l0
550 ldub [%i0+5],%l5
551 sll %l4,24,%l4
552 or %l3,%l2,%l2
553 ldub [%i0+6],%l6
554 sll %l5,16,%l5
555 or %l0,%l2,%o0
556 ldub [%i0+7],%l7
557
558 sll %l6,8,%l6
559 or %l5,%l4,%l4
560 ldub [%i0+8],%l0
561 or %l7,%l6,%l6
562 ldub [%i0+9],%l1
563 or %l4,%l6,%o1
564 ldub [%i0+10],%l2
565
566 sll %l0,24,%l0
567 ldub [%i0+11],%l3
568 sll %l1,16,%l1
569 ldub [%i0+12],%l4
570 sll %l2,8,%l2
571 or %l1,%l0,%l0
572 ldub [%i0+13],%l5
573 sll %l4,24,%l4
574 or %l3,%l2,%l2
575 ldub [%i0+14],%l6
576 sll %l5,16,%l5
577 or %l0,%l2,%o2
578 ldub [%i0+15],%l7
579
580 sll %l6,8,%l6
581 or %l5,%l4,%l4
582 or %l7,%l6,%l6
583 or %l4,%l6,%o3
584
5851: call .+8
586 add %o7,AES_Te-1b,%o4
587 call _sparcv9_AES_encrypt
588 mov %i2,%o5
589
590 srl %o0,24,%l0
591 srl %o0,16,%l1
592 stb %l0,[%i1+0]
593 srl %o0,8,%l2
594 stb %l1,[%i1+1]
595 stb %l2,[%i1+2]
596 srl %o1,24,%l4
597 stb %o0,[%i1+3]
598
599 srl %o1,16,%l5
600 stb %l4,[%i1+4]
601 srl %o1,8,%l6
602 stb %l5,[%i1+5]
603 stb %l6,[%i1+6]
604 srl %o2,24,%l0
605 stb %o1,[%i1+7]
606
607 srl %o2,16,%l1
608 stb %l0,[%i1+8]
609 srl %o2,8,%l2
610 stb %l1,[%i1+9]
611 stb %l2,[%i1+10]
612 srl %o3,24,%l4
613 stb %o2,[%i1+11]
614
615 srl %o3,16,%l5
616 stb %l4,[%i1+12]
617 srl %o3,8,%l6
618 stb %l5,[%i1+13]
619 stb %l6,[%i1+14]
620 stb %o3,[%i1+15]
621
622 ret
623 restore
624.type AES_encrypt,#function
625.size AES_encrypt,(.-AES_encrypt)
626
627___
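# AES_encrypt above tests the input and output pointers for 4-byte
# alignment (or %o0,%o1,%g1; andcc %g1,3,%g0) and, when unaligned,
# gathers each big-endian word from single-byte loads; the "1: call .+8"
# pair is the standard SPARC idiom for reading the current PC so that
# AES_Te can be addressed position-independently. What .Lunaligned_enc
# computes per word, as a Perl sketch (be32 is a hypothetical helper):
sub be32 {
	my @b = @_;			# four bytes in memory order
	return ($b[0] << 24) | ($b[1] << 16) | ($b[2] << 8) | $b[3];
}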
628
629$code.=<<___;
630.align 256
631AES_Td:
632___
633&_data_word(
634 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
635 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
636 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
637 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
638 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
639 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
640 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
641 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
642 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
643 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
644 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
645 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
646 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
647 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
648 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
649 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
650 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
651 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
652 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
653 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
654 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
655 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
656 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
657 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
658 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
659 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
660 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
661 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
662 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
663 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
664 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
665 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
666 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
667 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
668 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
669 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
670 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
671 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
672 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
673 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
674 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
675 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
676 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
677 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
678 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
679 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
680 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
681 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
682 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
683 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
684 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
685 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
686 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
687 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
688 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
689 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
690 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
691 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
692 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
693 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
694 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
695 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
696 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
697 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
698$code.=<<___;
699 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
700 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
701 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
702 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
703 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
704 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
705 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
706 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
707 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
708 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
709 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
710 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
711 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
712 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
713 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
714 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
715 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
716 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
717 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
718 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
719 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
720 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
721 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
722 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
723 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
724 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
725 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
726 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
727 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
728 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
729 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
730 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
731.type AES_Td,#object
732.size AES_Td,(.-AES_Td)
733
734.align 64
735.skip 16
736_sparcv9_AES_decrypt:
737 save %sp,-$frame-$locals,%sp
738 stx %i7,[%sp+$bias+$frame+0] ! off-load return address
739 ld [$key+240],$rounds
740 ld [$key+0],$t0
741 ld [$key+4],$t1 !
742 ld [$key+8],$t2
743 ld [$key+12],$t3
744 srl $rounds,1,$rounds
745 xor $t0,$s0,$s0
746 ld [$key+16],$t0
747 xor $t1,$s1,$s1
748 ld [$key+20],$t1
749 srl $s0,21,$acc0 !
750 xor $t2,$s2,$s2
751 ld [$key+24],$t2
752 xor $t3,$s3,$s3
753 and $acc0,2040,$acc0
754 ld [$key+28],$t3
755 srl $s3,13,$acc1
756 nop
757.Ldec_loop:
758 srl $s2,5,$acc2 !
759 and $acc1,2040,$acc1
760 ldx [$tbl+$acc0],$acc0
761 sll $s1,3,$acc3
762 and $acc2,2040,$acc2
763 ldx [$tbl+$acc1],$acc1
764 srl $s1,21,$acc4
765 and $acc3,2040,$acc3
766 ldx [$tbl+$acc2],$acc2 !
767 srl $s0,13,$acc5
768 and $acc4,2040,$acc4
769 ldx [$tbl+$acc3],$acc3
770 srl $s3,5,$acc6
771 and $acc5,2040,$acc5
772 ldx [$tbl+$acc4],$acc4
773 fmovs %f0,%f0
774 sll $s2,3,$acc7 !
775 and $acc6,2040,$acc6
776 ldx [$tbl+$acc5],$acc5
777 srl $s2,21,$acc8
778 and $acc7,2040,$acc7
779 ldx [$tbl+$acc6],$acc6
780 srl $s1,13,$acc9
781 and $acc8,2040,$acc8
782 ldx [$tbl+$acc7],$acc7 !
783 srl $s0,5,$acc10
784 and $acc9,2040,$acc9
785 ldx [$tbl+$acc8],$acc8
786 sll $s3,3,$acc11
787 and $acc10,2040,$acc10
788 ldx [$tbl+$acc9],$acc9
789 fmovs %f0,%f0
790 srl $s3,21,$acc12 !
791 and $acc11,2040,$acc11
792 ldx [$tbl+$acc10],$acc10
793 srl $s2,13,$acc13
794 and $acc12,2040,$acc12
795 ldx [$tbl+$acc11],$acc11
796 srl $s1,5,$acc14
797 and $acc13,2040,$acc13
798 ldx [$tbl+$acc12],$acc12 !
799 sll $s0,3,$acc15
800 and $acc14,2040,$acc14
801 ldx [$tbl+$acc13],$acc13
802 and $acc15,2040,$acc15
803 add $key,32,$key
804 ldx [$tbl+$acc14],$acc14
805 fmovs %f0,%f0
806 subcc $rounds,1,$rounds !
807 ldx [$tbl+$acc15],$acc15
808 bz,a,pn %icc,.Ldec_last
809 add $tbl,2048,$rounds
810
811 srlx $acc1,8,$acc1
812 xor $acc0,$t0,$t0
813 ld [$key+0],$s0
814 fmovs %f0,%f0
815 srlx $acc2,16,$acc2 !
816 xor $acc1,$t0,$t0
817 ld [$key+4],$s1
818 srlx $acc3,24,$acc3
819 xor $acc2,$t0,$t0
820 ld [$key+8],$s2
821 srlx $acc5,8,$acc5
822 xor $acc3,$t0,$t0
823 ld [$key+12],$s3 !
824 srlx $acc6,16,$acc6
825 xor $acc4,$t1,$t1
826 fmovs %f0,%f0
827 srlx $acc7,24,$acc7
828 xor $acc5,$t1,$t1
829 srlx $acc9,8,$acc9
830 xor $acc6,$t1,$t1
831 srlx $acc10,16,$acc10 !
832 xor $acc7,$t1,$t1
833 srlx $acc11,24,$acc11
834 xor $acc8,$t2,$t2
835 srlx $acc13,8,$acc13
836 xor $acc9,$t2,$t2
837 srlx $acc14,16,$acc14
838 xor $acc10,$t2,$t2
839 srlx $acc15,24,$acc15 !
840 xor $acc11,$t2,$t2
841 xor $acc12,$acc14,$acc14
842 xor $acc13,$t3,$t3
843 srl $t0,21,$acc0
844 xor $acc14,$t3,$t3
845 xor $acc15,$t3,$t3
846 srl $t3,13,$acc1
847
848 and $acc0,2040,$acc0 !
849 srl $t2,5,$acc2
850 and $acc1,2040,$acc1
851 ldx [$tbl+$acc0],$acc0
852 sll $t1,3,$acc3
853 and $acc2,2040,$acc2
854 ldx [$tbl+$acc1],$acc1
855 fmovs %f0,%f0
856 srl $t1,21,$acc4 !
857 and $acc3,2040,$acc3
858 ldx [$tbl+$acc2],$acc2
859 srl $t0,13,$acc5
860 and $acc4,2040,$acc4
861 ldx [$tbl+$acc3],$acc3
862 srl $t3,5,$acc6
863 and $acc5,2040,$acc5
864 ldx [$tbl+$acc4],$acc4 !
865 sll $t2,3,$acc7
866 and $acc6,2040,$acc6
867 ldx [$tbl+$acc5],$acc5
868 srl $t2,21,$acc8
869 and $acc7,2040,$acc7
870 ldx [$tbl+$acc6],$acc6
871 fmovs %f0,%f0
872 srl $t1,13,$acc9 !
873 and $acc8,2040,$acc8
874 ldx [$tbl+$acc7],$acc7
875 srl $t0,5,$acc10
876 and $acc9,2040,$acc9
877 ldx [$tbl+$acc8],$acc8
878 sll $t3,3,$acc11
879 and $acc10,2040,$acc10
880 ldx [$tbl+$acc9],$acc9 !
881 srl $t3,21,$acc12
882 and $acc11,2040,$acc11
883 ldx [$tbl+$acc10],$acc10
884 srl $t2,13,$acc13
885 and $acc12,2040,$acc12
886 ldx [$tbl+$acc11],$acc11
887 fmovs %f0,%f0
888 srl $t1,5,$acc14 !
889 and $acc13,2040,$acc13
890 ldx [$tbl+$acc12],$acc12
891 sll $t0,3,$acc15
892 and $acc14,2040,$acc14
893 ldx [$tbl+$acc13],$acc13
894 srlx $acc1,8,$acc1
895 and $acc15,2040,$acc15
896 ldx [$tbl+$acc14],$acc14 !
897
898 srlx $acc2,16,$acc2
899 xor $acc0,$s0,$s0
900 ldx [$tbl+$acc15],$acc15
901 srlx $acc3,24,$acc3
902 xor $acc1,$s0,$s0
903 ld [$key+16],$t0
904 fmovs %f0,%f0
905 srlx $acc5,8,$acc5 !
906 xor $acc2,$s0,$s0
907 ld [$key+20],$t1
908 srlx $acc6,16,$acc6
909 xor $acc3,$s0,$s0
910 ld [$key+24],$t2
911 srlx $acc7,24,$acc7
912 xor $acc4,$s1,$s1
913 ld [$key+28],$t3 !
914 srlx $acc9,8,$acc9
915 xor $acc5,$s1,$s1
916 ldx [$tbl+2048+0],%g0 ! prefetch td4
917 srlx $acc10,16,$acc10
918 xor $acc6,$s1,$s1
919 ldx [$tbl+2048+32],%g0 ! prefetch td4
920 srlx $acc11,24,$acc11
921 xor $acc7,$s1,$s1
922 ldx [$tbl+2048+64],%g0 ! prefetch td4
923 srlx $acc13,8,$acc13
924 xor $acc8,$s2,$s2
925 ldx [$tbl+2048+96],%g0 ! prefetch td4
926 srlx $acc14,16,$acc14 !
927 xor $acc9,$s2,$s2
928 ldx [$tbl+2048+128],%g0 ! prefetch td4
929 srlx $acc15,24,$acc15
930 xor $acc10,$s2,$s2
931 ldx [$tbl+2048+160],%g0 ! prefetch td4
932 srl $s0,21,$acc0
933 xor $acc11,$s2,$s2
934 ldx [$tbl+2048+192],%g0 ! prefetch td4
935 xor $acc12,$acc14,$acc14
936 xor $acc13,$s3,$s3
937 ldx [$tbl+2048+224],%g0 ! prefetch td4
938 and $acc0,2040,$acc0 !
939 xor $acc14,$s3,$s3
940 xor $acc15,$s3,$s3
941 ba .Ldec_loop
942 srl $s3,13,$acc1
943
944.align 32
945.Ldec_last:
946 srlx $acc1,8,$acc1 !
947 xor $acc0,$t0,$t0
948 ld [$key+0],$s0
949 srlx $acc2,16,$acc2
950 xor $acc1,$t0,$t0
951 ld [$key+4],$s1
952 srlx $acc3,24,$acc3
953 xor $acc2,$t0,$t0
954 ld [$key+8],$s2 !
955 srlx $acc5,8,$acc5
956 xor $acc3,$t0,$t0
957 ld [$key+12],$s3
958 srlx $acc6,16,$acc6
959 xor $acc4,$t1,$t1
960 srlx $acc7,24,$acc7
961 xor $acc5,$t1,$t1
962 srlx $acc9,8,$acc9 !
963 xor $acc6,$t1,$t1
964 srlx $acc10,16,$acc10
965 xor $acc7,$t1,$t1
966 srlx $acc11,24,$acc11
967 xor $acc8,$t2,$t2
968 srlx $acc13,8,$acc13
969 xor $acc9,$t2,$t2
970 srlx $acc14,16,$acc14 !
971 xor $acc10,$t2,$t2
972 srlx $acc15,24,$acc15
973 xor $acc11,$t2,$t2
974 xor $acc12,$acc14,$acc14
975 xor $acc13,$t3,$t3
976 srl $t0,24,$acc0
977 xor $acc14,$t3,$t3
978 xor $acc15,$t3,$t3 !
979 srl $t3,16,$acc1
980
981 srl $t2,8,$acc2
982 and $acc1,255,$acc1
983 ldub [$rounds+$acc0],$acc0
984 srl $t1,24,$acc4
985 and $acc2,255,$acc2
986 ldub [$rounds+$acc1],$acc1
987 srl $t0,16,$acc5 !
988 and $t1,255,$acc3
989 ldub [$rounds+$acc2],$acc2
990 ldub [$rounds+$acc3],$acc3
991 srl $t3,8,$acc6
992 and $acc5,255,$acc5
993 ldub [$rounds+$acc4],$acc4
994 fmovs %f0,%f0
995 srl $t2,24,$acc8 !
996 and $acc6,255,$acc6
997 ldub [$rounds+$acc5],$acc5
998 srl $t1,16,$acc9
999 and $t2,255,$acc7
1000 ldub [$rounds+$acc6],$acc6
1001 ldub [$rounds+$acc7],$acc7
1002 fmovs %f0,%f0
1003 srl $t0,8,$acc10 !
1004 and $acc9,255,$acc9
1005 ldub [$rounds+$acc8],$acc8
1006 srl $t3,24,$acc12
1007 and $acc10,255,$acc10
1008 ldub [$rounds+$acc9],$acc9
1009 srl $t2,16,$acc13
1010 and $t3,255,$acc11
1011 ldub [$rounds+$acc10],$acc10 !
1012 srl $t1,8,$acc14
1013 and $acc13,255,$acc13
1014 ldub [$rounds+$acc11],$acc11
1015 ldub [$rounds+$acc12],$acc12
1016 and $acc14,255,$acc14
1017 ldub [$rounds+$acc13],$acc13
1018 and $t0,255,$acc15
1019 ldub [$rounds+$acc14],$acc14 !
1020
1021 sll $acc0,24,$acc0
1022 xor $acc3,$s0,$s0
1023 ldub [$rounds+$acc15],$acc15
1024 sll $acc1,16,$acc1
1025 xor $acc0,$s0,$s0
1026 ldx [%sp+$bias+$frame+0],%i7 ! restore return address
1027 fmovs %f0,%f0
1028 sll $acc2,8,$acc2 !
1029 xor $acc1,$s0,$s0
1030 sll $acc4,24,$acc4
1031 xor $acc2,$s0,$s0
1032 sll $acc5,16,$acc5
1033 xor $acc7,$s1,$s1
1034 sll $acc6,8,$acc6
1035 xor $acc4,$s1,$s1
1036 sll $acc8,24,$acc8 !
1037 xor $acc5,$s1,$s1
1038 sll $acc9,16,$acc9
1039 xor $acc11,$s2,$s2
1040 sll $acc10,8,$acc10
1041 xor $acc6,$s1,$s1
1042 sll $acc12,24,$acc12
1043 xor $acc8,$s2,$s2
1044 sll $acc13,16,$acc13 !
1045 xor $acc9,$s2,$s2
1046 sll $acc14,8,$acc14
1047 xor $acc10,$s2,$s2
1048 xor $acc12,$acc14,$acc14
1049 xor $acc13,$s3,$s3
1050 xor $acc14,$s3,$s3
1051 xor $acc15,$s3,$s3
1052
1053 ret
1054 restore
1055.type _sparcv9_AES_decrypt,#function
1056.size _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1057
1058.align 32
1059.globl AES_decrypt
1060AES_decrypt:
1061 or %o0,%o1,%g1
1062 andcc %g1,3,%g0
1063 bnz,pn %xcc,.Lunaligned_dec
1064 save %sp,-$frame,%sp
1065
1066 ld [%i0+0],%o0
1067 ld [%i0+4],%o1
1068 ld [%i0+8],%o2
1069 ld [%i0+12],%o3
1070
10711: call .+8
1072 add %o7,AES_Td-1b,%o4
1073 call _sparcv9_AES_decrypt
1074 mov %i2,%o5
1075
1076 st %o0,[%i1+0]
1077 st %o1,[%i1+4]
1078 st %o2,[%i1+8]
1079 st %o3,[%i1+12]
1080
1081 ret
1082 restore
1083
1084.align 32
1085.Lunaligned_dec:
1086 ldub [%i0+0],%l0
1087 ldub [%i0+1],%l1
1088 ldub [%i0+2],%l2
1089
1090 sll %l0,24,%l0
1091 ldub [%i0+3],%l3
1092 sll %l1,16,%l1
1093 ldub [%i0+4],%l4
1094 sll %l2,8,%l2
1095 or %l1,%l0,%l0
1096 ldub [%i0+5],%l5
1097 sll %l4,24,%l4
1098 or %l3,%l2,%l2
1099 ldub [%i0+6],%l6
1100 sll %l5,16,%l5
1101 or %l0,%l2,%o0
1102 ldub [%i0+7],%l7
1103
1104 sll %l6,8,%l6
1105 or %l5,%l4,%l4
1106 ldub [%i0+8],%l0
1107 or %l7,%l6,%l6
1108 ldub [%i0+9],%l1
1109 or %l4,%l6,%o1
1110 ldub [%i0+10],%l2
1111
1112 sll %l0,24,%l0
1113 ldub [%i0+11],%l3
1114 sll %l1,16,%l1
1115 ldub [%i0+12],%l4
1116 sll %l2,8,%l2
1117 or %l1,%l0,%l0
1118 ldub [%i0+13],%l5
1119 sll %l4,24,%l4
1120 or %l3,%l2,%l2
1121 ldub [%i0+14],%l6
1122 sll %l5,16,%l5
1123 or %l0,%l2,%o2
1124 ldub [%i0+15],%l7
1125
1126 sll %l6,8,%l6
1127 or %l5,%l4,%l4
1128 or %l7,%l6,%l6
1129 or %l4,%l6,%o3
1130
11311: call .+8
1132 add %o7,AES_Td-1b,%o4
1133 call _sparcv9_AES_decrypt
1134 mov %i2,%o5
1135
1136 srl %o0,24,%l0
1137 srl %o0,16,%l1
1138 stb %l0,[%i1+0]
1139 srl %o0,8,%l2
1140 stb %l1,[%i1+1]
1141 stb %l2,[%i1+2]
1142 srl %o1,24,%l4
1143 stb %o0,[%i1+3]
1144
1145 srl %o1,16,%l5
1146 stb %l4,[%i1+4]
1147 srl %o1,8,%l6
1148 stb %l5,[%i1+5]
1149 stb %l6,[%i1+6]
1150 srl %o2,24,%l0
1151 stb %o1,[%i1+7]
1152
1153 srl %o2,16,%l1
1154 stb %l0,[%i1+8]
1155 srl %o2,8,%l2
1156 stb %l1,[%i1+9]
1157 stb %l2,[%i1+10]
1158 srl %o3,24,%l4
1159 stb %o2,[%i1+11]
1160
1161 srl %o3,16,%l5
1162 stb %l4,[%i1+12]
1163 srl %o3,8,%l6
1164 stb %l5,[%i1+13]
1165 stb %l6,[%i1+14]
1166 stb %o3,[%i1+15]
1167
1168 ret
1169 restore
1170.type AES_decrypt,#function
1171.size AES_decrypt,(.-AES_decrypt)
1172___
1173
1174# The fmovs instructions substituting for FP nops were originally added
1175# to meet specific instruction alignment requirements and maximize ILP.
1176# As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have
1177# an undesired effect there, so we simply omit them and sacrifice a
1178# fraction of a percent in performance...
1179$code =~ s/fmovs.*$//gm;
1180
1181print $code;
1182close STDOUT; # ensure flush
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
deleted file mode 100755
index f75e90ba87..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl
+++ /dev/null
@@ -1,2819 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Version 2.1.
11#
12# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
13# an Opteron 240 CPU], plus all the bells-n-whistles from the 32-bit
14# version [you'll notice a lot of resemblance], such as compressed
15# S-boxes in little-endian byte order, prefetching of these tables in
16# CBC mode, avoidance of L1 cache aliasing between the stack frame, the
17# key schedule and the already mentioned tables, a compressed Td4...
18#
19# Performance in number of cycles per processed byte for 128-bit key:
20#
21# ECB encrypt ECB decrypt CBC large chunk
22# AMD64 33 41 13.0
23# EM64T 38 59 18.6(*)
24# Core 2 30 43 14.5(*)
25#
26# (*) with hyper-threading off
27
28$flavour = shift;
29$output = shift;
30if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
31
32$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
33
34$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
35( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
36( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
37die "can't locate x86_64-xlate.pl";
38
39open OUT,"| \"$^X\" $xlate $flavour $output";
40*STDOUT=*OUT;
41
42$verticalspin=1;	# unlike in the 32-bit version, $verticalspin performs
43 # ~15% better on both AMD and Intel cores
44$speed_limit=512; # see aes-586.pl for details
45
46$code=".text\n";
47
48$s0="%eax";
49$s1="%ebx";
50$s2="%ecx";
51$s3="%edx";
52$acc0="%esi"; $mask80="%rsi";
53$acc1="%edi"; $maskfe="%rdi";
54$acc2="%ebp"; $mask1b="%rbp";
55$inp="%r8";
56$out="%r9";
57$t0="%r10d";
58$t1="%r11d";
59$t2="%r12d";
60$rnds="%r13d";
61$sbox="%r14";
62$key="%r15";
63
64sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }	# %eax -> %ah
65sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;	# %eax -> %al
66 $r =~ s/%[er]([sd]i)/%\1l/;	# %esi -> %sil
67 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }	# %r10d -> %r10b
68sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;	# %rax -> %eax
69 $r =~ s/%r([0-9]+)/%r\1d/; $r; }	# %r10 -> %r10d
70sub _data_word()
71{ my $i;
72 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
73}
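# Note the duplication in _data_word: each table word is emitted twice
# into a little-endian 8-byte slot, so the round code's loads at byte
# offsets 1, 2 and 3 (e.g. "xor 3($sbox,$acc0,8)") read the same word
# rotated right by 8, 16 and 24 bits, standing in for the classic
# Te1..Te3 tables. A Perl model of such an offset load (te_at_offset is
# a hypothetical helper name):
sub te_at_offset {
	my ($w, $k) = @_;		# word and byte offset 0..3
	my $r = 8 * $k;
	return $r ? ((($w >> $r) | ($w << (32 - $r))) & 0xffffffff) : $w;
}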
74sub data_word()
75{ my $i;
76 my $last=pop(@_);
77 $code.=".long\t";
78 while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
79 $code.=sprintf"0x%08x\n",$last;
80}
81
82sub data_byte()
83{ my $i;
84 my $last=pop(@_);
85 $code.=".byte\t";
86 while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
87 $code.=sprintf"0x%02x\n",$last&0xff;
88}
89
90sub encvert()
91{ my $t3="%r8d"; # zaps $inp!
92
93$code.=<<___;
94 # favor 3-way issue Opteron pipeline...
95 movzb `&lo("$s0")`,$acc0
96 movzb `&lo("$s1")`,$acc1
97 movzb `&lo("$s2")`,$acc2
98 mov 0($sbox,$acc0,8),$t0
99 mov 0($sbox,$acc1,8),$t1
100 mov 0($sbox,$acc2,8),$t2
101
102 movzb `&hi("$s1")`,$acc0
103 movzb `&hi("$s2")`,$acc1
104 movzb `&lo("$s3")`,$acc2
105 xor 3($sbox,$acc0,8),$t0
106 xor 3($sbox,$acc1,8),$t1
107 mov 0($sbox,$acc2,8),$t3
108
109 movzb `&hi("$s3")`,$acc0
110 shr \$16,$s2
111 movzb `&hi("$s0")`,$acc2
112 xor 3($sbox,$acc0,8),$t2
113 shr \$16,$s3
114 xor 3($sbox,$acc2,8),$t3
115
116 shr \$16,$s1
117 lea 16($key),$key
118 shr \$16,$s0
119
120 movzb `&lo("$s2")`,$acc0
121 movzb `&lo("$s3")`,$acc1
122 movzb `&lo("$s0")`,$acc2
123 xor 2($sbox,$acc0,8),$t0
124 xor 2($sbox,$acc1,8),$t1
125 xor 2($sbox,$acc2,8),$t2
126
127 movzb `&hi("$s3")`,$acc0
128 movzb `&hi("$s0")`,$acc1
129 movzb `&lo("$s1")`,$acc2
130 xor 1($sbox,$acc0,8),$t0
131 xor 1($sbox,$acc1,8),$t1
132 xor 2($sbox,$acc2,8),$t3
133
134 mov 12($key),$s3
135 movzb `&hi("$s1")`,$acc1
136 movzb `&hi("$s2")`,$acc2
137 mov 0($key),$s0
138 xor 1($sbox,$acc1,8),$t2
139 xor 1($sbox,$acc2,8),$t3
140
141 mov 4($key),$s1
142 mov 8($key),$s2
143 xor $t0,$s0
144 xor $t1,$s1
145 xor $t2,$s2
146 xor $t3,$s3
147___
148}
149
150sub enclastvert()
151{ my $t3="%r8d"; # zaps $inp!
152
153$code.=<<___;
154 movzb `&lo("$s0")`,$acc0
155 movzb `&lo("$s1")`,$acc1
156 movzb `&lo("$s2")`,$acc2
157 movzb 2($sbox,$acc0,8),$t0
158 movzb 2($sbox,$acc1,8),$t1
159 movzb 2($sbox,$acc2,8),$t2
160
161 movzb `&lo("$s3")`,$acc0
162 movzb `&hi("$s1")`,$acc1
163 movzb `&hi("$s2")`,$acc2
164 movzb 2($sbox,$acc0,8),$t3
165 mov 0($sbox,$acc1,8),$acc1 #$t0
166 mov 0($sbox,$acc2,8),$acc2 #$t1
167
168 and \$0x0000ff00,$acc1
169 and \$0x0000ff00,$acc2
170
171 xor $acc1,$t0
172 xor $acc2,$t1
173 shr \$16,$s2
174
175 movzb `&hi("$s3")`,$acc0
176 movzb `&hi("$s0")`,$acc1
177 shr \$16,$s3
178 mov 0($sbox,$acc0,8),$acc0 #$t2
179 mov 0($sbox,$acc1,8),$acc1 #$t3
180
181 and \$0x0000ff00,$acc0
182 and \$0x0000ff00,$acc1
183 shr \$16,$s1
184 xor $acc0,$t2
185 xor $acc1,$t3
186 shr \$16,$s0
187
188 movzb `&lo("$s2")`,$acc0
189 movzb `&lo("$s3")`,$acc1
190 movzb `&lo("$s0")`,$acc2
191 mov 0($sbox,$acc0,8),$acc0 #$t0
192 mov 0($sbox,$acc1,8),$acc1 #$t1
193 mov 0($sbox,$acc2,8),$acc2 #$t2
194
195 and \$0x00ff0000,$acc0
196 and \$0x00ff0000,$acc1
197 and \$0x00ff0000,$acc2
198
199 xor $acc0,$t0
200 xor $acc1,$t1
201 xor $acc2,$t2
202
203 movzb `&lo("$s1")`,$acc0
204 movzb `&hi("$s3")`,$acc1
205 movzb `&hi("$s0")`,$acc2
206 mov 0($sbox,$acc0,8),$acc0 #$t3
207 mov 2($sbox,$acc1,8),$acc1 #$t0
208 mov 2($sbox,$acc2,8),$acc2 #$t1
209
210 and \$0x00ff0000,$acc0
211 and \$0xff000000,$acc1
212 and \$0xff000000,$acc2
213
214 xor $acc0,$t3
215 xor $acc1,$t0
216 xor $acc2,$t1
217
218 movzb `&hi("$s1")`,$acc0
219 movzb `&hi("$s2")`,$acc1
220 mov 16+12($key),$s3
221 mov 2($sbox,$acc0,8),$acc0 #$t2
222 mov 2($sbox,$acc1,8),$acc1 #$t3
223 mov 16+0($key),$s0
224
225 and \$0xff000000,$acc0
226 and \$0xff000000,$acc1
227
228 xor $acc0,$t2
229 xor $acc1,$t3
230
231 mov 16+4($key),$s1
232 mov 16+8($key),$s2
233 xor $t0,$s0
234 xor $t1,$s1
235 xor $t2,$s2
236 xor $t3,$s3
237___
238}
239
240sub encstep()
241{ my ($i,@s) = @_;
242 my $tmp0=$acc0;
243 my $tmp1=$acc1;
244 my $tmp2=$acc2;
245 my $out=($t0,$t1,$t2,$s[0])[$i];
246
247 if ($i==3) {
248 $tmp0=$s[1];
249 $tmp1=$s[2];
250 $tmp2=$s[3];
251 }
252 $code.=" movzb ".&lo($s[0]).",$out\n";
253 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
254 $code.=" lea 16($key),$key\n" if ($i==0);
255
256 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
257 $code.=" mov 0($sbox,$out,8),$out\n";
258
259 $code.=" shr \$16,$tmp1\n";
260 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
261 $code.=" xor 3($sbox,$tmp0,8),$out\n";
262
263 $code.=" movzb ".&lo($tmp1).",$tmp1\n";
264 $code.=" shr \$24,$tmp2\n";
265 $code.=" xor 4*$i($key),$out\n";
266
267 $code.=" xor 2($sbox,$tmp1,8),$out\n";
268 $code.=" xor 1($sbox,$tmp2,8),$out\n";
269
270 $code.=" mov $t0,$s[1]\n" if ($i==3);
271 $code.=" mov $t1,$s[2]\n" if ($i==3);
272 $code.=" mov $t2,$s[3]\n" if ($i==3);
273 $code.="\n";
274}
275
276sub enclast()
277{ my ($i,@s)=@_;
278 my $tmp0=$acc0;
279 my $tmp1=$acc1;
280 my $tmp2=$acc2;
281 my $out=($t0,$t1,$t2,$s[0])[$i];
282
283 if ($i==3) {
284 $tmp0=$s[1];
285 $tmp1=$s[2];
286 $tmp2=$s[3];
287 }
288 $code.=" movzb ".&lo($s[0]).",$out\n";
289 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
290
291 $code.=" mov 2($sbox,$out,8),$out\n";
292 $code.=" shr \$16,$tmp1\n";
293 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
294
295 $code.=" and \$0x000000ff,$out\n";
296 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
297 $code.=" movzb ".&lo($tmp1).",$tmp1\n";
298 $code.=" shr \$24,$tmp2\n";
299
300 $code.=" mov 0($sbox,$tmp0,8),$tmp0\n";
301 $code.=" mov 0($sbox,$tmp1,8),$tmp1\n";
302 $code.=" mov 2($sbox,$tmp2,8),$tmp2\n";
303
304 $code.=" and \$0x0000ff00,$tmp0\n";
305 $code.=" and \$0x00ff0000,$tmp1\n";
306 $code.=" and \$0xff000000,$tmp2\n";
307
308 $code.=" xor $tmp0,$out\n";
309 $code.=" mov $t0,$s[1]\n" if ($i==3);
310 $code.=" xor $tmp1,$out\n";
311 $code.=" mov $t1,$s[2]\n" if ($i==3);
312 $code.=" xor $tmp2,$out\n";
313 $code.=" mov $t2,$s[3]\n" if ($i==3);
314 $code.="\n";
315}
316
317$code.=<<___;
318.type _x86_64_AES_encrypt,\@abi-omnipotent
319.align 16
320_x86_64_AES_encrypt:
321 xor 0($key),$s0 # xor with key
322 xor 4($key),$s1
323 xor 8($key),$s2
324 xor 12($key),$s3
325
326 mov 240($key),$rnds # load key->rounds
327 sub \$1,$rnds
328 jmp .Lenc_loop
329.align 16
330.Lenc_loop:
331___
332 if ($verticalspin) { &encvert(); }
333 else { &encstep(0,$s0,$s1,$s2,$s3);
334 &encstep(1,$s1,$s2,$s3,$s0);
335 &encstep(2,$s2,$s3,$s0,$s1);
336 &encstep(3,$s3,$s0,$s1,$s2);
337 }
338$code.=<<___;
339 sub \$1,$rnds
340 jnz .Lenc_loop
341___
342 if ($verticalspin) { &enclastvert(); }
343 else { &enclast(0,$s0,$s1,$s2,$s3);
344 &enclast(1,$s1,$s2,$s3,$s0);
345 &enclast(2,$s2,$s3,$s0,$s1);
346 &enclast(3,$s3,$s0,$s1,$s2);
347 $code.=<<___;
348 xor 16+0($key),$s0 # xor with key
349 xor 16+4($key),$s1
350 xor 16+8($key),$s2
351 xor 16+12($key),$s3
352___
353 }
354$code.=<<___;
355 .byte 0xf3,0xc3 # rep ret
356.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
357___
358
359# it's possible to implement this by shifting tN by 8, filling the least
360# significant byte with a byte load and finally bswap-ing at the end,
361# but such a partial register load kills Core 2...
362sub enccompactvert()
363{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
364
365$code.=<<___;
366 movzb `&lo("$s0")`,$t0
367 movzb `&lo("$s1")`,$t1
368 movzb `&lo("$s2")`,$t2
369 movzb ($sbox,$t0,1),$t0
370 movzb ($sbox,$t1,1),$t1
371 movzb ($sbox,$t2,1),$t2
372
373 movzb `&lo("$s3")`,$t3
374 movzb `&hi("$s1")`,$acc0
375 movzb `&hi("$s2")`,$acc1
376 movzb ($sbox,$t3,1),$t3
377 movzb ($sbox,$acc0,1),$t4 #$t0
378 movzb ($sbox,$acc1,1),$t5 #$t1
379
380 movzb `&hi("$s3")`,$acc2
381 movzb `&hi("$s0")`,$acc0
382 shr \$16,$s2
383 movzb ($sbox,$acc2,1),$acc2 #$t2
384 movzb ($sbox,$acc0,1),$acc0 #$t3
385 shr \$16,$s3
386
387 movzb `&lo("$s2")`,$acc1
388 shl \$8,$t4
389 shl \$8,$t5
390 movzb ($sbox,$acc1,1),$acc1 #$t0
391 xor $t4,$t0
392 xor $t5,$t1
393
394 movzb `&lo("$s3")`,$t4
395 shr \$16,$s0
396 shr \$16,$s1
397 movzb `&lo("$s0")`,$t5
398 shl \$8,$acc2
399 shl \$8,$acc0
400 movzb ($sbox,$t4,1),$t4 #$t1
401 movzb ($sbox,$t5,1),$t5 #$t2
402 xor $acc2,$t2
403 xor $acc0,$t3
404
405 movzb `&lo("$s1")`,$acc2
406 movzb `&hi("$s3")`,$acc0
407 shl \$16,$acc1
408 movzb ($sbox,$acc2,1),$acc2 #$t3
409 movzb ($sbox,$acc0,1),$acc0 #$t0
410 xor $acc1,$t0
411
412 movzb `&hi("$s0")`,$acc1
413 shr \$8,$s2
414 shr \$8,$s1
415 movzb ($sbox,$acc1,1),$acc1 #$t1
416 movzb ($sbox,$s2,1),$s3 #$t3
417 movzb ($sbox,$s1,1),$s2 #$t2
418 shl \$16,$t4
419 shl \$16,$t5
420 shl \$16,$acc2
421 xor $t4,$t1
422 xor $t5,$t2
423 xor $acc2,$t3
424
425 shl \$24,$acc0
426 shl \$24,$acc1
427 shl \$24,$s3
428 xor $acc0,$t0
429 shl \$24,$s2
430 xor $acc1,$t1
431 mov $t0,$s0
432 mov $t1,$s1
433 xor $t2,$s2
434 xor $t3,$s3
435___
436}
437
438sub enctransform_ref()
439{ my $sn = shift;
440 my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
441
442$code.=<<___;
443 mov $sn,$acc
444 and \$0x80808080,$acc
445 mov $acc,$tmp
446 shr \$7,$tmp
447 lea ($sn,$sn),$r2
448 sub $tmp,$acc
449 and \$0xfefefefe,$r2
450 and \$0x1b1b1b1b,$acc
451 mov $sn,$tmp
452 xor $acc,$r2
453
454 xor $r2,$sn
455 rol \$24,$sn
456 xor $r2,$sn
457 ror \$16,$tmp
458 xor $tmp,$sn
459 ror \$8,$tmp
460 xor $tmp,$sn
461___
462}
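# enctransform_ref doubles all four bytes of a word in GF(2^8) at once,
# SIMD-within-a-register style: 0x80808080 picks out each byte's top
# bit, 0xfefefefe masks off the inter-byte carries of the shift, and
# 0x1b1b1b1b folds the AES reduction polynomial back in; the rotate/xor
# tail then combines the xtime results into a MixColumns column. The
# doubling step modelled in Perl (xtime4 is a hypothetical name):
sub xtime4 {
	my $w  = shift;				# four bytes packed in 32 bits
	my $hi = $w & 0x80808080;		# top bit of every byte
	my $r2 = ($w << 1) & 0xfefefefe;	# shift, drop inter-byte carries
	return $r2 ^ (($hi - ($hi >> 7)) & 0x1b1b1b1b);	# xor 0x1b per set byte
}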
463
464# unlike in the decrypt case, it does not pay off to parallelize enctransform
465sub enctransform()
466{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
467
468$code.=<<___;
469 mov $s0,$acc0
470 mov $s1,$acc1
471 and \$0x80808080,$acc0
472 and \$0x80808080,$acc1
473 mov $acc0,$t0
474 mov $acc1,$t1
475 shr \$7,$t0
476 lea ($s0,$s0),$r20
477 shr \$7,$t1
478 lea ($s1,$s1),$r21
479 sub $t0,$acc0
480 sub $t1,$acc1
481 and \$0xfefefefe,$r20
482 and \$0xfefefefe,$r21
483 and \$0x1b1b1b1b,$acc0
484 and \$0x1b1b1b1b,$acc1
485 mov $s0,$t0
486 mov $s1,$t1
487 xor $acc0,$r20
488 xor $acc1,$r21
489
490 xor $r20,$s0
491 xor $r21,$s1
492 mov $s2,$acc0
493 mov $s3,$acc1
494 rol \$24,$s0
495 rol \$24,$s1
496 and \$0x80808080,$acc0
497 and \$0x80808080,$acc1
498 xor $r20,$s0
499 xor $r21,$s1
500 mov $acc0,$t2
501 mov $acc1,$t3
502 ror \$16,$t0
503 ror \$16,$t1
504 shr \$7,$t2
505 lea ($s2,$s2),$r20
506 xor $t0,$s0
507 xor $t1,$s1
508 shr \$7,$t3
509 lea ($s3,$s3),$r21
510 ror \$8,$t0
511 ror \$8,$t1
512 sub $t2,$acc0
513 sub $t3,$acc1
514 xor $t0,$s0
515 xor $t1,$s1
516
517 and \$0xfefefefe,$r20
518 and \$0xfefefefe,$r21
519 and \$0x1b1b1b1b,$acc0
520 and \$0x1b1b1b1b,$acc1
521 mov $s2,$t2
522 mov $s3,$t3
523 xor $acc0,$r20
524 xor $acc1,$r21
525
526 xor $r20,$s2
527 xor $r21,$s3
528 rol \$24,$s2
529 rol \$24,$s3
530 xor $r20,$s2
531 xor $r21,$s3
532 mov 0($sbox),$acc0 # prefetch Te4
533 ror \$16,$t2
534 ror \$16,$t3
535 mov 64($sbox),$acc1
536 xor $t2,$s2
537 xor $t3,$s3
538 mov 128($sbox),$r20
539 ror \$8,$t2
540 ror \$8,$t3
541 mov 192($sbox),$r21
542 xor $t2,$s2
543 xor $t3,$s3
544___
545}
546
547$code.=<<___;
548.type _x86_64_AES_encrypt_compact,\@abi-omnipotent
549.align 16
550_x86_64_AES_encrypt_compact:
551 lea 128($sbox),$inp # size optimization
552 mov 0-128($inp),$acc1 # prefetch Te4
553 mov 32-128($inp),$acc2
554 mov 64-128($inp),$t0
555 mov 96-128($inp),$t1
556 mov 128-128($inp),$acc1
557 mov 160-128($inp),$acc2
558 mov 192-128($inp),$t0
559 mov 224-128($inp),$t1
560 jmp .Lenc_loop_compact
561.align 16
562.Lenc_loop_compact:
563 xor 0($key),$s0 # xor with key
564 xor 4($key),$s1
565 xor 8($key),$s2
566 xor 12($key),$s3
567 lea 16($key),$key
568___
569 &enccompactvert();
570$code.=<<___;
571 cmp 16(%rsp),$key
572 je .Lenc_compact_done
573___
574 &enctransform();
575$code.=<<___;
576 jmp .Lenc_loop_compact
577.align 16
578.Lenc_compact_done:
579 xor 0($key),$s0
580 xor 4($key),$s1
581 xor 8($key),$s2
582 xor 12($key),$s3
583 .byte 0xf3,0xc3 # rep ret
584.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
585___
586
587# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
588$code.=<<___;
589.globl AES_encrypt
590.type AES_encrypt,\@function,3
591.align 16
592.globl asm_AES_encrypt
593.hidden asm_AES_encrypt
594asm_AES_encrypt:
595AES_encrypt:
596 push %rbx
597 push %rbp
598 push %r12
599 push %r13
600 push %r14
601 push %r15
602
603 # allocate frame "above" key schedule
604 mov %rsp,%r10
605 lea -63(%rdx),%rcx # %rdx is key argument
606 and \$-64,%rsp
607 sub %rsp,%rcx
608 neg %rcx
609 and \$0x3c0,%rcx
610 sub %rcx,%rsp
611 sub \$32,%rsp
612
613 mov %rsi,16(%rsp) # save out
614 mov %r10,24(%rsp) # save real stack pointer
615.Lenc_prologue:
616
617 mov %rdx,$key
618 mov 240($key),$rnds # load rounds
619
620 mov 0(%rdi),$s0 # load input vector
621 mov 4(%rdi),$s1
622 mov 8(%rdi),$s2
623 mov 12(%rdi),$s3
624
625 shl \$4,$rnds
626 lea ($key,$rnds),%rbp
627 mov $key,(%rsp) # key schedule
628 mov %rbp,8(%rsp) # end of key schedule
629
630 # pick Te4 copy which can't "overlap" with stack frame or key schedule
631 lea .LAES_Te+2048(%rip),$sbox
632 lea 768(%rsp),%rbp
633 sub $sbox,%rbp
634 and \$0x300,%rbp
635 lea ($sbox,%rbp),$sbox
636
637 call _x86_64_AES_encrypt_compact
638
639 mov 16(%rsp),$out # restore out
640 mov 24(%rsp),%rsi # restore saved stack pointer
641 mov $s0,0($out) # write output vector
642 mov $s1,4($out)
643 mov $s2,8($out)
644 mov $s3,12($out)
645
646 mov (%rsi),%r15
647 mov 8(%rsi),%r14
648 mov 16(%rsi),%r13
649 mov 24(%rsi),%r12
650 mov 32(%rsi),%rbp
651 mov 40(%rsi),%rbx
652 lea 48(%rsi),%rsp
653.Lenc_epilogue:
654 ret
655.size AES_encrypt,.-AES_encrypt
656___
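# The prologue above does more than align the stack: it slides the frame
# by a key-schedule-dependent amount (and \$0x3c0) so that the frame and
# the key schedule map to different L1 cache sets, then picks whichever
# of the four 256-byte Te4 copies stored back to back at .LAES_Te+2048
# does not collide with the frame either. The same arithmetic as a rough
# Perl model ($rsp_in, $key and $aes_te are stand-in integers for the
# runtime addresses):
my $sp  = $rsp_in & ~63;			# cache-line align
$sp    -= (($sp - ($key - 63)) & 0x3c0) + 32;	# de-alias from key schedule
my $te4 = $aes_te + 2048;			# 4 x 256B Te4 copies live here
$te4   += (($sp + 768) - $te4) & 0x300;		# pick a non-colliding copy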
657
658#------------------------------------------------------------------#
659
660sub decvert()
661{ my $t3="%r8d"; # zaps $inp!
662
663$code.=<<___;
664 # favor 3-way issue Opteron pipeline...
665 movzb `&lo("$s0")`,$acc0
666 movzb `&lo("$s1")`,$acc1
667 movzb `&lo("$s2")`,$acc2
668 mov 0($sbox,$acc0,8),$t0
669 mov 0($sbox,$acc1,8),$t1
670 mov 0($sbox,$acc2,8),$t2
671
672 movzb `&hi("$s3")`,$acc0
673 movzb `&hi("$s0")`,$acc1
674 movzb `&lo("$s3")`,$acc2
675 xor 3($sbox,$acc0,8),$t0
676 xor 3($sbox,$acc1,8),$t1
677 mov 0($sbox,$acc2,8),$t3
678
679 movzb `&hi("$s1")`,$acc0
680 shr \$16,$s0
681 movzb `&hi("$s2")`,$acc2
682 xor 3($sbox,$acc0,8),$t2
683 shr \$16,$s3
684 xor 3($sbox,$acc2,8),$t3
685
686 shr \$16,$s1
687 lea 16($key),$key
688 shr \$16,$s2
689
690 movzb `&lo("$s2")`,$acc0
691 movzb `&lo("$s3")`,$acc1
692 movzb `&lo("$s0")`,$acc2
693 xor 2($sbox,$acc0,8),$t0
694 xor 2($sbox,$acc1,8),$t1
695 xor 2($sbox,$acc2,8),$t2
696
697 movzb `&hi("$s1")`,$acc0
698 movzb `&hi("$s2")`,$acc1
699 movzb `&lo("$s1")`,$acc2
700 xor 1($sbox,$acc0,8),$t0
701 xor 1($sbox,$acc1,8),$t1
702 xor 2($sbox,$acc2,8),$t3
703
704 movzb `&hi("$s3")`,$acc0
705 mov 12($key),$s3
706 movzb `&hi("$s0")`,$acc2
707 xor 1($sbox,$acc0,8),$t2
708 mov 0($key),$s0
709 xor 1($sbox,$acc2,8),$t3
710
711 xor $t0,$s0
712 mov 4($key),$s1
713 mov 8($key),$s2
714 xor $t2,$s2
715 xor $t1,$s1
716 xor $t3,$s3
717___
718}
719
720sub declastvert()
721{ my $t3="%r8d"; # zaps $inp!
722
723$code.=<<___;
724 lea 2048($sbox),$sbox # size optimization
725 movzb `&lo("$s0")`,$acc0
726 movzb `&lo("$s1")`,$acc1
727 movzb `&lo("$s2")`,$acc2
728 movzb ($sbox,$acc0,1),$t0
729 movzb ($sbox,$acc1,1),$t1
730 movzb ($sbox,$acc2,1),$t2
731
732 movzb `&lo("$s3")`,$acc0
733 movzb `&hi("$s3")`,$acc1
734 movzb `&hi("$s0")`,$acc2
735 movzb ($sbox,$acc0,1),$t3
736 movzb ($sbox,$acc1,1),$acc1 #$t0
737 movzb ($sbox,$acc2,1),$acc2 #$t1
738
739 shl \$8,$acc1
740 shl \$8,$acc2
741
742 xor $acc1,$t0
743 xor $acc2,$t1
744 shr \$16,$s3
745
746 movzb `&hi("$s1")`,$acc0
747 movzb `&hi("$s2")`,$acc1
748 shr \$16,$s0
749 movzb ($sbox,$acc0,1),$acc0 #$t2
750 movzb ($sbox,$acc1,1),$acc1 #$t3
751
752 shl \$8,$acc0
753 shl \$8,$acc1
754 shr \$16,$s1
755 xor $acc0,$t2
756 xor $acc1,$t3
757 shr \$16,$s2
758
759 movzb `&lo("$s2")`,$acc0
760 movzb `&lo("$s3")`,$acc1
761 movzb `&lo("$s0")`,$acc2
762 movzb ($sbox,$acc0,1),$acc0 #$t0
763 movzb ($sbox,$acc1,1),$acc1 #$t1
764 movzb ($sbox,$acc2,1),$acc2 #$t2
765
766 shl \$16,$acc0
767 shl \$16,$acc1
768 shl \$16,$acc2
769
770 xor $acc0,$t0
771 xor $acc1,$t1
772 xor $acc2,$t2
773
774 movzb `&lo("$s1")`,$acc0
775 movzb `&hi("$s1")`,$acc1
776 movzb `&hi("$s2")`,$acc2
777 movzb ($sbox,$acc0,1),$acc0 #$t3
778 movzb ($sbox,$acc1,1),$acc1 #$t0
779 movzb ($sbox,$acc2,1),$acc2 #$t1
780
781 shl \$16,$acc0
782 shl \$24,$acc1
783 shl \$24,$acc2
784
785 xor $acc0,$t3
786 xor $acc1,$t0
787 xor $acc2,$t1
788
789 movzb `&hi("$s3")`,$acc0
790 movzb `&hi("$s0")`,$acc1
791 mov 16+12($key),$s3
792 movzb ($sbox,$acc0,1),$acc0 #$t2
793 movzb ($sbox,$acc1,1),$acc1 #$t3
794 mov 16+0($key),$s0
795
796 shl \$24,$acc0
797 shl \$24,$acc1
798
799 xor $acc0,$t2
800 xor $acc1,$t3
801
802 mov 16+4($key),$s1
803 mov 16+8($key),$s2
804 lea -2048($sbox),$sbox
805 xor $t0,$s0
806 xor $t1,$s1
807 xor $t2,$s2
808 xor $t3,$s3
809___
810}
811
812sub decstep()
813{ my ($i,@s) = @_;
814 my $tmp0=$acc0;
815 my $tmp1=$acc1;
816 my $tmp2=$acc2;
817 my $out=($t0,$t1,$t2,$s[0])[$i];
818
819 $code.=" mov $s[0],$out\n" if ($i!=3);
820 $tmp1=$s[2] if ($i==3);
821 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
822 $code.=" and \$0xFF,$out\n";
823
824 $code.=" mov 0($sbox,$out,8),$out\n";
825 $code.=" shr \$16,$tmp1\n";
826 $tmp2=$s[3] if ($i==3);
827 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
828
829 $tmp0=$s[1] if ($i==3);
830 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
831 $code.=" and \$0xFF,$tmp1\n";
832 $code.=" shr \$24,$tmp2\n";
833
834 $code.=" xor 3($sbox,$tmp0,8),$out\n";
835 $code.=" xor 2($sbox,$tmp1,8),$out\n";
836 $code.=" xor 1($sbox,$tmp2,8),$out\n";
837
838 $code.=" mov $t2,$s[1]\n" if ($i==3);
839 $code.=" mov $t1,$s[2]\n" if ($i==3);
840 $code.=" mov $t0,$s[3]\n" if ($i==3);
841 $code.="\n";
842}
843
844sub declast()
845{ my ($i,@s)=@_;
846 my $tmp0=$acc0;
847 my $tmp1=$acc1;
848 my $tmp2=$acc2;
849 my $out=($t0,$t1,$t2,$s[0])[$i];
850
851 $code.=" mov $s[0],$out\n" if ($i!=3);
852 $tmp1=$s[2] if ($i==3);
853 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
854 $code.=" and \$0xFF,$out\n";
855
856 $code.=" movzb 2048($sbox,$out,1),$out\n";
857 $code.=" shr \$16,$tmp1\n";
858 $tmp2=$s[3] if ($i==3);
859 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
860
861 $tmp0=$s[1] if ($i==3);
862 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
863 $code.=" and \$0xFF,$tmp1\n";
864 $code.=" shr \$24,$tmp2\n";
865
866 $code.=" movzb 2048($sbox,$tmp0,1),$tmp0\n";
867 $code.=" movzb 2048($sbox,$tmp1,1),$tmp1\n";
868 $code.=" movzb 2048($sbox,$tmp2,1),$tmp2\n";
869
870 $code.=" shl \$8,$tmp0\n";
871 $code.=" shl \$16,$tmp1\n";
872 $code.=" shl \$24,$tmp2\n";
873
874 $code.=" xor $tmp0,$out\n";
875 $code.=" mov $t2,$s[1]\n" if ($i==3);
876 $code.=" xor $tmp1,$out\n";
877 $code.=" mov $t1,$s[2]\n" if ($i==3);
878 $code.=" xor $tmp2,$out\n";
879 $code.=" mov $t0,$s[3]\n" if ($i==3);
880 $code.="\n";
881}
882
883$code.=<<___;
884.type _x86_64_AES_decrypt,\@abi-omnipotent
885.align 16
886_x86_64_AES_decrypt:
887 xor 0($key),$s0 # xor with key
888 xor 4($key),$s1
889 xor 8($key),$s2
890 xor 12($key),$s3
891
892 mov 240($key),$rnds # load key->rounds
893 sub \$1,$rnds
894 jmp .Ldec_loop
895.align 16
896.Ldec_loop:
897___
898 if ($verticalspin) { &decvert(); }
899 else { &decstep(0,$s0,$s3,$s2,$s1);
900 &decstep(1,$s1,$s0,$s3,$s2);
901 &decstep(2,$s2,$s1,$s0,$s3);
902 &decstep(3,$s3,$s2,$s1,$s0);
903 $code.=<<___;
904 lea 16($key),$key
905 xor 0($key),$s0 # xor with key
906 xor 4($key),$s1
907 xor 8($key),$s2
908 xor 12($key),$s3
909___
910 }
911$code.=<<___;
912 sub \$1,$rnds
913 jnz .Ldec_loop
914___
915 if ($verticalspin) { &declastvert(); }
916 else { &declast(0,$s0,$s3,$s2,$s1);
917 &declast(1,$s1,$s0,$s3,$s2);
918 &declast(2,$s2,$s1,$s0,$s3);
919 &declast(3,$s3,$s2,$s1,$s0);
920 $code.=<<___;
921 xor 16+0($key),$s0 # xor with key
922 xor 16+4($key),$s1
923 xor 16+8($key),$s2
924 xor 16+12($key),$s3
925___
926 }
927$code.=<<___;
928 .byte 0xf3,0xc3 # rep ret
929.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
930___
931
932sub deccompactvert()
933{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
934
935$code.=<<___;
936 movzb `&lo("$s0")`,$t0
937 movzb `&lo("$s1")`,$t1
938 movzb `&lo("$s2")`,$t2
939 movzb ($sbox,$t0,1),$t0
940 movzb ($sbox,$t1,1),$t1
941 movzb ($sbox,$t2,1),$t2
942
943 movzb `&lo("$s3")`,$t3
944 movzb `&hi("$s3")`,$acc0
945 movzb `&hi("$s0")`,$acc1
946 movzb ($sbox,$t3,1),$t3
947 movzb ($sbox,$acc0,1),$t4 #$t0
948 movzb ($sbox,$acc1,1),$t5 #$t1
949
950 movzb `&hi("$s1")`,$acc2
951 movzb `&hi("$s2")`,$acc0
952 shr \$16,$s2
953 movzb ($sbox,$acc2,1),$acc2 #$t2
954 movzb ($sbox,$acc0,1),$acc0 #$t3
955 shr \$16,$s3
956
957 movzb `&lo("$s2")`,$acc1
958 shl \$8,$t4
959 shl \$8,$t5
960 movzb ($sbox,$acc1,1),$acc1 #$t0
961 xor $t4,$t0
962 xor $t5,$t1
963
964 movzb `&lo("$s3")`,$t4
965 shr \$16,$s0
966 shr \$16,$s1
967 movzb `&lo("$s0")`,$t5
968 shl \$8,$acc2
969 shl \$8,$acc0
970 movzb ($sbox,$t4,1),$t4 #$t1
971 movzb ($sbox,$t5,1),$t5 #$t2
972 xor $acc2,$t2
973 xor $acc0,$t3
974
975 movzb `&lo("$s1")`,$acc2
976 movzb `&hi("$s1")`,$acc0
977 shl \$16,$acc1
978 movzb ($sbox,$acc2,1),$acc2 #$t3
979 movzb ($sbox,$acc0,1),$acc0 #$t0
980 xor $acc1,$t0
981
982 movzb `&hi("$s2")`,$acc1
983 shl \$16,$t4
984 shl \$16,$t5
985 movzb ($sbox,$acc1,1),$s1 #$t1
986 xor $t4,$t1
987 xor $t5,$t2
988
989 movzb `&hi("$s3")`,$acc1
990 shr \$8,$s0
991 shl \$16,$acc2
992 movzb ($sbox,$acc1,1),$s2 #$t2
993 movzb ($sbox,$s0,1),$s3 #$t3
994 xor $acc2,$t3
995
996 shl \$24,$acc0
997 shl \$24,$s1
998 shl \$24,$s2
999 xor $acc0,$t0
1000 shl \$24,$s3
1001 xor $t1,$s1
1002 mov $t0,$s0
1003 xor $t2,$s2
1004 xor $t3,$s3
1005___
1006}
1007
1008 # parallelized version! input is a pair of 64-bit values: %rax=s1.s0
1009 # and %rcx=s3.s2; output is four 32-bit values: %eax=s0, %ebx=s1,
1010 # %ecx=s2 and %edx=s3.
1011sub dectransform()
1012{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
1013 my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
1014 my $prefetch = shift;
1015
1016$code.=<<___;
1017 mov $tp10,$acc0
1018 mov $tp18,$acc8
1019 and $mask80,$acc0
1020 and $mask80,$acc8
1021 mov $acc0,$tp40
1022 mov $acc8,$tp48
1023 shr \$7,$tp40
1024 lea ($tp10,$tp10),$tp20
1025 shr \$7,$tp48
1026 lea ($tp18,$tp18),$tp28
1027 sub $tp40,$acc0
1028 sub $tp48,$acc8
1029 and $maskfe,$tp20
1030 and $maskfe,$tp28
1031 and $mask1b,$acc0
1032 and $mask1b,$acc8
1033 xor $tp20,$acc0
1034 xor $tp28,$acc8
1035 mov $acc0,$tp20
1036 mov $acc8,$tp28
1037
1038 and $mask80,$acc0
1039 and $mask80,$acc8
1040 mov $acc0,$tp80
1041 mov $acc8,$tp88
1042 shr \$7,$tp80
1043 lea ($tp20,$tp20),$tp40
1044 shr \$7,$tp88
1045 lea ($tp28,$tp28),$tp48
1046 sub $tp80,$acc0
1047 sub $tp88,$acc8
1048 and $maskfe,$tp40
1049 and $maskfe,$tp48
1050 and $mask1b,$acc0
1051 and $mask1b,$acc8
1052 xor $tp40,$acc0
1053 xor $tp48,$acc8
1054 mov $acc0,$tp40
1055 mov $acc8,$tp48
1056
1057 and $mask80,$acc0
1058 and $mask80,$acc8
1059 mov $acc0,$tp80
1060 mov $acc8,$tp88
1061 shr \$7,$tp80
1062 xor $tp10,$tp20 # tp2^=tp1
1063 shr \$7,$tp88
1064 xor $tp18,$tp28 # tp2^=tp1
1065 sub $tp80,$acc0
1066 sub $tp88,$acc8
1067 lea ($tp40,$tp40),$tp80
1068 lea ($tp48,$tp48),$tp88
1069 xor $tp10,$tp40 # tp4^=tp1
1070 xor $tp18,$tp48 # tp4^=tp1
1071 and $maskfe,$tp80
1072 and $maskfe,$tp88
1073 and $mask1b,$acc0
1074 and $mask1b,$acc8
1075 xor $acc0,$tp80
1076 xor $acc8,$tp88
1077
1078 xor $tp80,$tp10 # tp1^=tp8
1079 xor $tp88,$tp18 # tp1^=tp8
1080 xor $tp80,$tp20 # tp2^tp1^=tp8
1081 xor $tp88,$tp28 # tp2^tp1^=tp8
1082 mov $tp10,$acc0
1083 mov $tp18,$acc8
1084 xor $tp80,$tp40 # tp4^tp1^=tp8
1085 xor $tp88,$tp48 # tp4^tp1^=tp8
1086 shr \$32,$acc0
1087 shr \$32,$acc8
1088 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
1089 xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
1090 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
1091 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
1092 xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1093 xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1094
1095 rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
1096 rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
1097 xor `&LO("$tp80")`,`&LO("$tp10")`
1098 xor `&LO("$tp88")`,`&LO("$tp18")`
1099 shr \$32,$tp80
1100 shr \$32,$tp88
1101 xor `&LO("$tp80")`,`&LO("$acc0")`
1102 xor `&LO("$tp88")`,`&LO("$acc8")`
1103
1104 mov $tp20,$tp80
1105 mov $tp28,$tp88
1106 shr \$32,$tp80
1107 shr \$32,$tp88
1108 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
1109 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
1110 rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
1111 rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
1112 xor `&LO("$tp20")`,`&LO("$tp10")`
1113 xor `&LO("$tp28")`,`&LO("$tp18")`
1114 mov $tp40,$tp20
1115 mov $tp48,$tp28
1116 xor `&LO("$tp80")`,`&LO("$acc0")`
1117 xor `&LO("$tp88")`,`&LO("$acc8")`
1118
1119 `"mov 0($sbox),$mask80" if ($prefetch)`
1120 shr \$32,$tp20
1121 shr \$32,$tp28
1122 `"mov 64($sbox),$maskfe" if ($prefetch)`
1123 rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
1124 rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
1125 `"mov 128($sbox),$mask1b" if ($prefetch)`
1126 rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
1127 rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
1128 `"mov 192($sbox),$tp80" if ($prefetch)`
1129 xor `&LO("$tp40")`,`&LO("$tp10")`
1130 xor `&LO("$tp48")`,`&LO("$tp18")`
1131 `"mov 256($sbox),$tp88" if ($prefetch)`
1132 xor `&LO("$tp20")`,`&LO("$acc0")`
1133 xor `&LO("$tp28")`,`&LO("$acc8")`
1134___
1135}
1136
1137$code.=<<___;
1138.type _x86_64_AES_decrypt_compact,\@abi-omnipotent
1139.align 16
1140_x86_64_AES_decrypt_compact:
1141 lea 128($sbox),$inp # size optimization
1142 mov 0-128($inp),$acc1 # prefetch Td4
1143 mov 32-128($inp),$acc2
1144 mov 64-128($inp),$t0
1145 mov 96-128($inp),$t1
1146 mov 128-128($inp),$acc1
1147 mov 160-128($inp),$acc2
1148 mov 192-128($inp),$t0
1149 mov 224-128($inp),$t1
1150 jmp .Ldec_loop_compact
1151
1152.align 16
1153.Ldec_loop_compact:
1154 xor 0($key),$s0 # xor with key
1155 xor 4($key),$s1
1156 xor 8($key),$s2
1157 xor 12($key),$s3
1158 lea 16($key),$key
1159___
1160 &deccompactvert();
1161$code.=<<___;
1162 cmp 16(%rsp),$key
1163 je .Ldec_compact_done
1164
1165 mov 256+0($sbox),$mask80
1166 shl \$32,%rbx
1167 shl \$32,%rdx
1168 mov 256+8($sbox),$maskfe
1169 or %rbx,%rax
1170 or %rdx,%rcx
1171 mov 256+16($sbox),$mask1b
1172___
1173 &dectransform(1);
1174$code.=<<___;
1175 jmp .Ldec_loop_compact
1176.align 16
1177.Ldec_compact_done:
1178 xor 0($key),$s0
1179 xor 4($key),$s1
1180 xor 8($key),$s2
1181 xor 12($key),$s3
1182 .byte 0xf3,0xc3 # rep ret
1183.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
1184___
1185
1186# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
1187$code.=<<___;
1188.globl AES_decrypt
1189.type AES_decrypt,\@function,3
1190.align 16
1191.globl asm_AES_decrypt
1192.hidden asm_AES_decrypt
1193asm_AES_decrypt:
1194AES_decrypt:
1195 push %rbx
1196 push %rbp
1197 push %r12
1198 push %r13
1199 push %r14
1200 push %r15
1201
1202 # allocate frame "above" key schedule
1203 mov %rsp,%r10
1204 lea -63(%rdx),%rcx # %rdx is key argument
1205 and \$-64,%rsp
1206 sub %rsp,%rcx
1207 neg %rcx
1208 and \$0x3c0,%rcx
1209 sub %rcx,%rsp
1210 sub \$32,%rsp
1211
1212 mov %rsi,16(%rsp) # save out
1213 mov %r10,24(%rsp) # save real stack pointer
1214.Ldec_prologue:
1215
1216 mov %rdx,$key
1217 mov 240($key),$rnds # load rounds
1218
1219 mov 0(%rdi),$s0 # load input vector
1220 mov 4(%rdi),$s1
1221 mov 8(%rdi),$s2
1222 mov 12(%rdi),$s3
1223
1224 shl \$4,$rnds
1225 lea ($key,$rnds),%rbp
1226 mov $key,(%rsp) # key schedule
1227 mov %rbp,8(%rsp) # end of key schedule
1228
1229 # pick Td4 copy which can't "overlap" with stack frame or key schedule
1230 lea .LAES_Td+2048(%rip),$sbox
1231 lea 768(%rsp),%rbp
1232 sub $sbox,%rbp
1233 and \$0x300,%rbp
1234 lea ($sbox,%rbp),$sbox
1235 shr \$3,%rbp # recall "magic" constants!
1236 add %rbp,$sbox
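	# ("magic": the Td4 copies are 256+32 = 288 bytes apart, so the
	# 0x300-masked offset is scaled by 9/8, i.e. x + (x>>3))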
1237
1238 call _x86_64_AES_decrypt_compact
1239
1240 mov 16(%rsp),$out # restore out
1241 mov 24(%rsp),%rsi # restore saved stack pointer
1242 mov $s0,0($out) # write output vector
1243 mov $s1,4($out)
1244 mov $s2,8($out)
1245 mov $s3,12($out)
1246
1247 mov (%rsi),%r15
1248 mov 8(%rsi),%r14
1249 mov 16(%rsi),%r13
1250 mov 24(%rsi),%r12
1251 mov 32(%rsi),%rbp
1252 mov 40(%rsi),%rbx
1253 lea 48(%rsi),%rsp
1254.Ldec_epilogue:
1255 ret
1256.size AES_decrypt,.-AES_decrypt
1257___
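
A hedged usage sketch for the public entry points in this file: expand a 128-bit key with AES_set_decrypt_key() (defined further below) and decrypt one 16-byte block in place:

    #include <openssl/aes.h>

    static int
    decrypt_block(const unsigned char key[16], unsigned char block[16])
    {
            AES_KEY dkey;

            if (AES_set_decrypt_key(key, 128, &dkey) != 0)
                    return -1;              /* bad pointer or bad bits */
            AES_decrypt(block, block, &dkey);
            return 0;
    }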
1258#------------------------------------------------------------------#
1259
1260sub enckey()
1261{
1262$code.=<<___;
1263 movz %dl,%esi # rk[i]>>0
1264 movzb -128(%rbp,%rsi),%ebx
1265 movz %dh,%esi # rk[i]>>8
1266 shl \$24,%ebx
1267 xor %ebx,%eax
1268
1269 movzb -128(%rbp,%rsi),%ebx
1270 shr \$16,%edx
1271 movz %dl,%esi # rk[i]>>16
1272 xor %ebx,%eax
1273
1274 movzb -128(%rbp,%rsi),%ebx
1275 movz %dh,%esi # rk[i]>>24
1276 shl \$8,%ebx
1277 xor %ebx,%eax
1278
1279 movzb -128(%rbp,%rsi),%ebx
1280 shl \$16,%ebx
1281 xor %ebx,%eax
1282
1283 xor 1024-128(%rbp,%rcx,4),%eax # rcon
1284___
1285}
1286
1287# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
1288# AES_KEY *key)
1289$code.=<<___;
1290.globl AES_set_encrypt_key
1291.type AES_set_encrypt_key,\@function,3
1292.align 16
1293AES_set_encrypt_key:
1294 push %rbx
1295 push %rbp
1296 push %r12 # redundant, but allows sharing
1297 push %r13 # the exception handler...
1298 push %r14
1299 push %r15
1300 sub \$8,%rsp
1301.Lenc_key_prologue:
1302
1303 call _x86_64_AES_set_encrypt_key
1304
1305 mov 8(%rsp),%r15
1306 mov 16(%rsp),%r14
1307 mov 24(%rsp),%r13
1308 mov 32(%rsp),%r12
1309 mov 40(%rsp),%rbp
1310 mov 48(%rsp),%rbx
1311 add \$56,%rsp
1312.Lenc_key_epilogue:
1313 ret
1314.size AES_set_encrypt_key,.-AES_set_encrypt_key
1315
1316.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
1317.align 16
1318_x86_64_AES_set_encrypt_key:
1319 mov %esi,%ecx # %ecx=bits
1320 mov %rdi,%rsi # %rsi=userKey
1321 mov %rdx,%rdi # %rdi=key
1322
1323 test \$-1,%rsi
1324 jz .Lbadpointer
1325 test \$-1,%rdi
1326 jz .Lbadpointer
1327
1328 lea .LAES_Te(%rip),%rbp
1329 lea 2048+128(%rbp),%rbp
1330
1331 # prefetch Te4
1332 mov 0-128(%rbp),%eax
1333 mov 32-128(%rbp),%ebx
1334 mov 64-128(%rbp),%r8d
1335 mov 96-128(%rbp),%edx
1336 mov 128-128(%rbp),%eax
1337 mov 160-128(%rbp),%ebx
1338 mov 192-128(%rbp),%r8d
1339 mov 224-128(%rbp),%edx
1340
1341 cmp \$128,%ecx
1342 je .L10rounds
1343 cmp \$192,%ecx
1344 je .L12rounds
1345 cmp \$256,%ecx
1346 je .L14rounds
1347 mov \$-2,%rax # invalid number of bits
1348 jmp .Lexit
1349
1350.L10rounds:
1351 mov 0(%rsi),%rax # copy first 4 dwords
1352 mov 8(%rsi),%rdx
1353 mov %rax,0(%rdi)
1354 mov %rdx,8(%rdi)
1355
1356 shr \$32,%rdx
1357 xor %ecx,%ecx
1358 jmp .L10shortcut
1359.align 4
1360.L10loop:
1361 mov 0(%rdi),%eax # rk[0]
1362 mov 12(%rdi),%edx # rk[3]
1363.L10shortcut:
1364___
1365 &enckey ();
1366$code.=<<___;
1367 mov %eax,16(%rdi) # rk[4]
1368 xor 4(%rdi),%eax
1369 mov %eax,20(%rdi) # rk[5]
1370 xor 8(%rdi),%eax
1371 mov %eax,24(%rdi) # rk[6]
1372 xor 12(%rdi),%eax
1373 mov %eax,28(%rdi) # rk[7]
1374 add \$1,%ecx
1375 lea 16(%rdi),%rdi
1376 cmp \$10,%ecx
1377 jl .L10loop
1378
1379 movl \$10,80(%rdi) # setup number of rounds
1380 xor %rax,%rax
1381 jmp .Lexit
1382
1383.L12rounds:
1384 mov 0(%rsi),%rax # copy first 6 dwords
1385 mov 8(%rsi),%rbx
1386 mov 16(%rsi),%rdx
1387 mov %rax,0(%rdi)
1388 mov %rbx,8(%rdi)
1389 mov %rdx,16(%rdi)
1390
1391 shr \$32,%rdx
1392 xor %ecx,%ecx
1393 jmp .L12shortcut
1394.align 4
1395.L12loop:
1396 mov 0(%rdi),%eax # rk[0]
1397 mov 20(%rdi),%edx # rk[5]
1398.L12shortcut:
1399___
1400 &enckey ();
1401$code.=<<___;
1402 mov %eax,24(%rdi) # rk[6]
1403 xor 4(%rdi),%eax
1404 mov %eax,28(%rdi) # rk[7]
1405 xor 8(%rdi),%eax
1406 mov %eax,32(%rdi) # rk[8]
1407 xor 12(%rdi),%eax
1408 mov %eax,36(%rdi) # rk[9]
1409
1410 cmp \$7,%ecx
1411 je .L12break
1412 add \$1,%ecx
1413
1414 xor 16(%rdi),%eax
1415 mov %eax,40(%rdi) # rk[10]
1416 xor 20(%rdi),%eax
1417 mov %eax,44(%rdi) # rk[11]
1418
1419 lea 24(%rdi),%rdi
1420 jmp .L12loop
1421.L12break:
1422 movl \$12,72(%rdi) # setup number of rounds
1423 xor %rax,%rax
1424 jmp .Lexit
1425
1426.L14rounds:
1427 mov 0(%rsi),%rax # copy first 8 dwords
1428 mov 8(%rsi),%rbx
1429 mov 16(%rsi),%rcx
1430 mov 24(%rsi),%rdx
1431 mov %rax,0(%rdi)
1432 mov %rbx,8(%rdi)
1433 mov %rcx,16(%rdi)
1434 mov %rdx,24(%rdi)
1435
1436 shr \$32,%rdx
1437 xor %ecx,%ecx
1438 jmp .L14shortcut
1439.align 4
1440.L14loop:
1441 mov 0(%rdi),%eax # rk[0]
1442 mov 28(%rdi),%edx # rk[7]
1443.L14shortcut:
1444___
1445 &enckey ();
1446$code.=<<___;
1447 mov %eax,32(%rdi) # rk[8]
1448 xor 4(%rdi),%eax
1449 mov %eax,36(%rdi) # rk[9]
1450 xor 8(%rdi),%eax
1451 mov %eax,40(%rdi) # rk[10]
1452 xor 12(%rdi),%eax
1453 mov %eax,44(%rdi) # rk[11]
1454
1455 cmp \$6,%ecx
1456 je .L14break
1457 add \$1,%ecx
1458
1459 mov %eax,%edx
1460 mov 16(%rdi),%eax # rk[4]
1461 movz %dl,%esi # rk[11]>>0
1462 movzb -128(%rbp,%rsi),%ebx
1463 movz %dh,%esi # rk[11]>>8
1464 xor %ebx,%eax
1465
1466 movzb -128(%rbp,%rsi),%ebx
1467 shr \$16,%edx
1468 shl \$8,%ebx
1469 movz %dl,%esi # rk[11]>>16
1470 xor %ebx,%eax
1471
1472 movzb -128(%rbp,%rsi),%ebx
1473 movz %dh,%esi # rk[11]>>24
1474 shl \$16,%ebx
1475 xor %ebx,%eax
1476
1477 movzb -128(%rbp,%rsi),%ebx
1478 shl \$24,%ebx
1479 xor %ebx,%eax
1480
1481 mov %eax,48(%rdi) # rk[12]
1482 xor 20(%rdi),%eax
1483 mov %eax,52(%rdi) # rk[13]
1484 xor 24(%rdi),%eax
1485 mov %eax,56(%rdi) # rk[14]
1486 xor 28(%rdi),%eax
1487 mov %eax,60(%rdi) # rk[15]
1488
1489 lea 32(%rdi),%rdi
1490 jmp .L14loop
1491.L14break:
1492 movl \$14,48(%rdi) # setup number of rounds
1493 xor %rax,%rax
1494 jmp .Lexit
1495
1496.Lbadpointer:
1497 mov \$-1,%rax
1498.Lexit:
1499 .byte 0xf3,0xc3 # rep ret
1500.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
1501___
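
The .L10rounds/.L12rounds/.L14rounds branches encode the usual key-size to round-count mapping, with -2 as the invalid-bits error return. In C terms:

    /* bits -> rounds, as selected by the cmp $128/$192/$256 chain */
    static int
    aes_rounds(int bits)
    {
            switch (bits) {
            case 128: return 10;
            case 192: return 12;
            case 256: return 14;
            default:  return -2;    /* invalid number of bits */
            }
    }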
1502
1503sub deckey_ref()
1504{ my ($i,$ptr,$te,$td) = @_;
1505 my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
1506$code.=<<___;
1507 mov $i($ptr),$tp1
1508 mov $tp1,$acc
1509 and \$0x80808080,$acc
1510 mov $acc,$tp4
1511 shr \$7,$tp4
1512 lea 0($tp1,$tp1),$tp2
1513 sub $tp4,$acc
1514 and \$0xfefefefe,$tp2
1515 and \$0x1b1b1b1b,$acc
1516 xor $tp2,$acc
1517 mov $acc,$tp2
1518
1519 and \$0x80808080,$acc
1520 mov $acc,$tp8
1521 shr \$7,$tp8
1522 lea 0($tp2,$tp2),$tp4
1523 sub $tp8,$acc
1524 and \$0xfefefefe,$tp4
1525 and \$0x1b1b1b1b,$acc
1526 xor $tp1,$tp2 # tp2^tp1
1527 xor $tp4,$acc
1528 mov $acc,$tp4
1529
1530 and \$0x80808080,$acc
1531 mov $acc,$tp8
1532 shr \$7,$tp8
1533 sub $tp8,$acc
1534 lea 0($tp4,$tp4),$tp8
1535 xor $tp1,$tp4 # tp4^tp1
1536 and \$0xfefefefe,$tp8
1537 and \$0x1b1b1b1b,$acc
1538 xor $acc,$tp8
1539
1540 xor $tp8,$tp1 # tp1^tp8
1541 rol \$8,$tp1 # ROTATE(tp1^tp8,8)
1542 xor $tp8,$tp2 # tp2^tp1^tp8
1543 xor $tp8,$tp4 # tp4^tp1^tp8
1544 xor $tp2,$tp8
1545 xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
1546
1547 xor $tp8,$tp1
1548 rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24)
1549 xor $tp2,$tp1
1550 rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16)
1551 xor $tp4,$tp1
1552
1553 mov $tp1,$i($ptr)
1554___
1555}
1556
1557# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
1558# AES_KEY *key)
1559$code.=<<___;
1560.globl AES_set_decrypt_key
1561.type AES_set_decrypt_key,\@function,3
1562.align 16
1563AES_set_decrypt_key:
1564 push %rbx
1565 push %rbp
1566 push %r12
1567 push %r13
1568 push %r14
1569 push %r15
1570 push %rdx # save key schedule
1571.Ldec_key_prologue:
1572
1573 call _x86_64_AES_set_encrypt_key
1574 mov (%rsp),%r8 # restore key schedule
1575 cmp \$0,%eax
1576 jne .Labort
1577
1578 mov 240(%r8),%r14d # pull number of rounds
1579 xor %rdi,%rdi
1580 lea (%rdi,%r14d,4),%rcx
1581 mov %r8,%rsi
1582 lea (%r8,%rcx,4),%rdi # pointer to last chunk
1583.align 4
1584.Linvert:
1585 mov 0(%rsi),%rax
1586 mov 8(%rsi),%rbx
1587 mov 0(%rdi),%rcx
1588 mov 8(%rdi),%rdx
1589 mov %rax,0(%rdi)
1590 mov %rbx,8(%rdi)
1591 mov %rcx,0(%rsi)
1592 mov %rdx,8(%rsi)
1593 lea 16(%rsi),%rsi
1594 lea -16(%rdi),%rdi
1595 cmp %rsi,%rdi
1596 jne .Linvert
1597
1598 lea .LAES_Te+2048+1024(%rip),%rax # rcon
1599
1600 mov 40(%rax),$mask80
1601 mov 48(%rax),$maskfe
1602 mov 56(%rax),$mask1b
1603
1604 mov %r8,$key
1605 sub \$1,%r14d
1606.align 4
1607.Lpermute:
1608 lea 16($key),$key
1609 mov 0($key),%rax
1610 mov 8($key),%rcx
1611___
1612 &dectransform ();
1613$code.=<<___;
1614 mov %eax,0($key)
1615 mov %ebx,4($key)
1616 mov %ecx,8($key)
1617 mov %edx,12($key)
1618 sub \$1,%r14d
1619 jnz .Lpermute
1620
1621 xor %rax,%rax
1622.Labort:
1623 mov 8(%rsp),%r15
1624 mov 16(%rsp),%r14
1625 mov 24(%rsp),%r13
1626 mov 32(%rsp),%r12
1627 mov 40(%rsp),%rbp
1628 mov 48(%rsp),%rbx
1629 add \$56,%rsp
1630.Ldec_key_epilogue:
1631 ret
1632.size AES_set_decrypt_key,.-AES_set_decrypt_key
1633___
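
In outline, AES_set_decrypt_key builds the encryption schedule, reverses the order of the round keys (.Linvert), then runs the inverse MixColumns transform over every round key except the first and last (.Lpermute). A hedged C sketch, with inv_mix_columns() standing in for the dectransform() logic above:

    #include <stdint.h>
    #include <string.h>

    void inv_mix_columns(uint32_t rk[4]);   /* assumed helper */

    static void
    derive_decrypt_schedule(uint32_t *rk, int rounds)
    {
            uint32_t tmp[4];
            int i, j;

            /* swap round key i with round key rounds-i, 16 bytes each */
            for (i = 0, j = rounds; i < j; i++, j--) {
                    memcpy(tmp, rk + 4 * i, 16);
                    memcpy(rk + 4 * i, rk + 4 * j, 16);
                    memcpy(rk + 4 * j, tmp, 16);
            }
            /* apply inverse MixColumns to the middle round keys */
            for (i = 1; i < rounds; i++)
                    inv_mix_columns(rk + 4 * i);
    }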
1634
1635 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
1636# size_t length, const AES_KEY *key,
1637# unsigned char *ivp,const int enc);
1638{
1639# stack frame layout
1640# -8(%rsp) return address
1641my $keyp="0(%rsp)"; # one to pass as $key
1642my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds])
1643my $_rsp="16(%rsp)"; # saved %rsp
1644my $_inp="24(%rsp)"; # copy of 1st parameter, inp
1645my $_out="32(%rsp)"; # copy of 2nd parameter, out
1646my $_len="40(%rsp)"; # copy of 3rd parameter, length
1647my $_key="48(%rsp)"; # copy of 4th parameter, key
1648my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp
1649my $ivec="64(%rsp)"; # ivec[16]
1650my $aes_key="80(%rsp)"; # copy of aes_key
1651my $mark="80+240(%rsp)"; # copy of aes_key->rounds
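
Before the assembly, a hedged C sketch of the chaining the fast encrypt path below performs: XOR each plaintext block with the running IV, encrypt, and carry the ciphertext forward as the next IV (aes_encrypt_block is an assumed stand-in for _x86_64_AES_encrypt; len is assumed to be a multiple of 16 here):

    #include <stddef.h>
    #include <string.h>

    static void
    cbc_encrypt_sketch(const unsigned char *in, unsigned char *out,
        size_t len, const void *key, unsigned char ivec[16],
        void (*aes_encrypt_block)(const unsigned char *,
        unsigned char *, const void *))
    {
            unsigned char buf[16];
            size_t i;

            while (len >= 16) {
                    for (i = 0; i < 16; i++)
                            buf[i] = in[i] ^ ivec[i];
                    aes_encrypt_block(buf, out, key);
                    memcpy(ivec, out, 16);  /* ciphertext is next IV */
                    in += 16; out += 16; len -= 16;
            }
    }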
1652
1653$code.=<<___;
1654.globl AES_cbc_encrypt
1655.type AES_cbc_encrypt,\@function,6
1656.align 16
1657.extern OPENSSL_ia32cap_P
1658.globl asm_AES_cbc_encrypt
1659.hidden asm_AES_cbc_encrypt
1660asm_AES_cbc_encrypt:
1661AES_cbc_encrypt:
1662 cmp \$0,%rdx # check length
1663 je .Lcbc_epilogue
1664 pushfq
1665 push %rbx
1666 push %rbp
1667 push %r12
1668 push %r13
1669 push %r14
1670 push %r15
1671.Lcbc_prologue:
1672
1673 cld
1674 mov %r9d,%r9d # clear upper half of enc
1675
1676 lea .LAES_Te(%rip),$sbox
1677 cmp \$0,%r9
1678 jne .Lcbc_picked_te
1679 lea .LAES_Td(%rip),$sbox
1680.Lcbc_picked_te:
1681
1682 mov OPENSSL_ia32cap_P(%rip),%r10d
1683 cmp \$$speed_limit,%rdx
1684 jb .Lcbc_slow_prologue
1685 test \$15,%rdx
1686 jnz .Lcbc_slow_prologue
1687 bt \$28,%r10d
1688 jc .Lcbc_slow_prologue
1689
1690 # allocate aligned stack frame...
1691 lea -88-248(%rsp),$key
1692 and \$-64,$key
1693
1694 # ... and make sure it doesn't alias with AES_T[ed] modulo 4096
1695 mov $sbox,%r10
1696 lea 2304($sbox),%r11
1697 mov $key,%r12
1698 and \$0xFFF,%r10 # s = $sbox&0xfff
1699 and \$0xFFF,%r11 # e = ($sbox+2304)&0xfff
1700 and \$0xFFF,%r12 # p = %rsp&0xfff
1701
1702 cmp %r11,%r12 # if (p>=e) %rsp -= (p-e);
1703 jb .Lcbc_te_break_out
1704 sub %r11,%r12
1705 sub %r12,$key
1706 jmp .Lcbc_te_ok
1707.Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz
1708 sub %r10,%r12
1709 and \$0xFFF,%r12
1710 add \$320,%r12
1711 sub %r12,$key
1712.align 4
1713.Lcbc_te_ok:
1714
1715 xchg %rsp,$key
1716 #add \$8,%rsp # reserve for return address!
1717 mov $key,$_rsp # save %rsp
1718.Lcbc_fast_body:
1719 mov %rdi,$_inp # save copy of inp
1720 mov %rsi,$_out # save copy of out
1721 mov %rdx,$_len # save copy of len
1722 mov %rcx,$_key # save copy of key
1723 mov %r8,$_ivp # save copy of ivp
1724 movl \$0,$mark # copy of aes_key->rounds = 0;
1725 mov %r8,%rbp # rearrange input arguments
1726 mov %r9,%rbx
1727 mov %rsi,$out
1728 mov %rdi,$inp
1729 mov %rcx,$key
1730
1731 mov 240($key),%eax # key->rounds
1732 # do we copy key schedule to stack?
1733 mov $key,%r10
1734 sub $sbox,%r10
1735 and \$0xfff,%r10
1736 cmp \$2304,%r10
1737 jb .Lcbc_do_ecopy
1738 cmp \$4096-248,%r10
1739 jb .Lcbc_skip_ecopy
1740.align 4
1741.Lcbc_do_ecopy:
1742 mov $key,%rsi
1743 lea $aes_key,%rdi
1744 lea $aes_key,$key
1745 mov \$240/8,%ecx
1746 .long 0x90A548F3 # rep movsq
1747 mov %eax,(%rdi) # copy aes_key->rounds
1748.Lcbc_skip_ecopy:
1749 mov $key,$keyp # save key pointer
1750
1751 mov \$18,%ecx
1752.align 4
1753.Lcbc_prefetch_te:
1754 mov 0($sbox),%r10
1755 mov 32($sbox),%r11
1756 mov 64($sbox),%r12
1757 mov 96($sbox),%r13
1758 lea 128($sbox),$sbox
1759 sub \$1,%ecx
1760 jnz .Lcbc_prefetch_te
1761 lea -2304($sbox),$sbox
1762
1763 cmp \$0,%rbx
1764 je .LFAST_DECRYPT
1765
1766#----------------------------- ENCRYPT -----------------------------#
1767 mov 0(%rbp),$s0 # load iv
1768 mov 4(%rbp),$s1
1769 mov 8(%rbp),$s2
1770 mov 12(%rbp),$s3
1771
1772.align 4
1773.Lcbc_fast_enc_loop:
1774 xor 0($inp),$s0
1775 xor 4($inp),$s1
1776 xor 8($inp),$s2
1777 xor 12($inp),$s3
1778 mov $keyp,$key # restore key
1779 mov $inp,$_inp # if ($verticalspin) save inp
1780
1781 call _x86_64_AES_encrypt
1782
1783 mov $_inp,$inp # if ($verticalspin) restore inp
1784 mov $_len,%r10
1785 mov $s0,0($out)
1786 mov $s1,4($out)
1787 mov $s2,8($out)
1788 mov $s3,12($out)
1789
1790 lea 16($inp),$inp
1791 lea 16($out),$out
1792 sub \$16,%r10
1793 test \$-16,%r10
1794 mov %r10,$_len
1795 jnz .Lcbc_fast_enc_loop
1796 mov $_ivp,%rbp # restore ivp
1797 mov $s0,0(%rbp) # save ivec
1798 mov $s1,4(%rbp)
1799 mov $s2,8(%rbp)
1800 mov $s3,12(%rbp)
1801
1802 jmp .Lcbc_fast_cleanup
1803
1804#----------------------------- DECRYPT -----------------------------#
1805.align 16
1806.LFAST_DECRYPT:
1807 cmp $inp,$out
1808 je .Lcbc_fast_dec_in_place
1809
1810 mov %rbp,$ivec
1811.align 4
1812.Lcbc_fast_dec_loop:
1813 mov 0($inp),$s0 # read input
1814 mov 4($inp),$s1
1815 mov 8($inp),$s2
1816 mov 12($inp),$s3
1817 mov $keyp,$key # restore key
1818 mov $inp,$_inp # if ($verticalspin) save inp
1819
1820 call _x86_64_AES_decrypt
1821
1822 mov $ivec,%rbp # load ivp
1823 mov $_inp,$inp # if ($verticalspin) restore inp
1824 mov $_len,%r10 # load len
1825 xor 0(%rbp),$s0 # xor iv
1826 xor 4(%rbp),$s1
1827 xor 8(%rbp),$s2
1828 xor 12(%rbp),$s3
1829 mov $inp,%rbp # current input, next iv
1830
1831 sub \$16,%r10
1832 mov %r10,$_len # update len
1833 mov %rbp,$ivec # update ivp
1834
1835 mov $s0,0($out) # write output
1836 mov $s1,4($out)
1837 mov $s2,8($out)
1838 mov $s3,12($out)
1839
1840 lea 16($inp),$inp
1841 lea 16($out),$out
1842 jnz .Lcbc_fast_dec_loop
1843 mov $_ivp,%r12 # load user ivp
1844 mov 0(%rbp),%r10 # load iv
1845 mov 8(%rbp),%r11
1846 mov %r10,0(%r12) # copy back to user
1847 mov %r11,8(%r12)
1848 jmp .Lcbc_fast_cleanup
1849
1850.align 16
1851.Lcbc_fast_dec_in_place:
1852 mov 0(%rbp),%r10 # copy iv to stack
1853 mov 8(%rbp),%r11
1854 mov %r10,0+$ivec
1855 mov %r11,8+$ivec
1856.align 4
1857.Lcbc_fast_dec_in_place_loop:
1858 mov 0($inp),$s0 # load input
1859 mov 4($inp),$s1
1860 mov 8($inp),$s2
1861 mov 12($inp),$s3
1862 mov $keyp,$key # restore key
1863 mov $inp,$_inp # if ($verticalspin) save inp
1864
1865 call _x86_64_AES_decrypt
1866
1867 mov $_inp,$inp # if ($verticalspin) restore inp
1868 mov $_len,%r10
1869 xor 0+$ivec,$s0
1870 xor 4+$ivec,$s1
1871 xor 8+$ivec,$s2
1872 xor 12+$ivec,$s3
1873
1874 mov 0($inp),%r11 # load input
1875 mov 8($inp),%r12
1876 sub \$16,%r10
1877 jz .Lcbc_fast_dec_in_place_done
1878
1879 mov %r11,0+$ivec # copy input to iv
1880 mov %r12,8+$ivec
1881
1882 mov $s0,0($out) # save output [zaps input]
1883 mov $s1,4($out)
1884 mov $s2,8($out)
1885 mov $s3,12($out)
1886
1887 lea 16($inp),$inp
1888 lea 16($out),$out
1889 mov %r10,$_len
1890 jmp .Lcbc_fast_dec_in_place_loop
1891.Lcbc_fast_dec_in_place_done:
1892 mov $_ivp,%rdi
1893 mov %r11,0(%rdi) # copy iv back to user
1894 mov %r12,8(%rdi)
1895
1896 mov $s0,0($out) # save output [zaps input]
1897 mov $s1,4($out)
1898 mov $s2,8($out)
1899 mov $s3,12($out)
1900
1901.align 4
1902.Lcbc_fast_cleanup:
1903 cmpl \$0,$mark # was the key schedule copied?
1904 lea $aes_key,%rdi
1905 je .Lcbc_exit
1906 mov \$240/8,%ecx
1907 xor %rax,%rax
1908 .long 0x90AB48F3 # rep stosq
1909
1910 jmp .Lcbc_exit
1911
1912#--------------------------- SLOW ROUTINE ---------------------------#
1913.align 16
1914.Lcbc_slow_prologue:
1915 # allocate aligned stack frame...
1916 lea -88(%rsp),%rbp
1917 and \$-64,%rbp
1918 # ... just "above" key schedule
1919 lea -88-63(%rcx),%r10
1920 sub %rbp,%r10
1921 neg %r10
1922 and \$0x3c0,%r10
1923 sub %r10,%rbp
1924
1925 xchg %rsp,%rbp
1926 #add \$8,%rsp # reserve for return address!
1927 mov %rbp,$_rsp # save %rsp
1928.Lcbc_slow_body:
1929 #mov %rdi,$_inp # save copy of inp
1930 #mov %rsi,$_out # save copy of out
1931 #mov %rdx,$_len # save copy of len
1932 #mov %rcx,$_key # save copy of key
1933 mov %r8,$_ivp # save copy of ivp
1934 mov %r8,%rbp # rearrange input arguments
1935 mov %r9,%rbx
1936 mov %rsi,$out
1937 mov %rdi,$inp
1938 mov %rcx,$key
1939 mov %rdx,%r10
1940
1941 mov 240($key),%eax
1942 mov $key,$keyp # save key pointer
1943 shl \$4,%eax
1944 lea ($key,%rax),%rax
1945 mov %rax,$keyend
1946
1947 # pick Te4 copy which can't "overlap" with stack frame or key schedule
1948 lea 2048($sbox),$sbox
1949 lea 768-8(%rsp),%rax
1950 sub $sbox,%rax
1951 and \$0x300,%rax
1952 lea ($sbox,%rax),$sbox
1953
1954 cmp \$0,%rbx
1955 je .LSLOW_DECRYPT
1956
1957#--------------------------- SLOW ENCRYPT ---------------------------#
1958 test \$-16,%r10 # check length (at least one full block?)
1959 mov 0(%rbp),$s0 # load iv
1960 mov 4(%rbp),$s1
1961 mov 8(%rbp),$s2
1962 mov 12(%rbp),$s3
1963 jz .Lcbc_slow_enc_tail # short input...
1964
1965.align 4
1966.Lcbc_slow_enc_loop:
1967 xor 0($inp),$s0
1968 xor 4($inp),$s1
1969 xor 8($inp),$s2
1970 xor 12($inp),$s3
1971 mov $keyp,$key # restore key
1972 mov $inp,$_inp # save inp
1973 mov $out,$_out # save out
1974 mov %r10,$_len # save len
1975
1976 call _x86_64_AES_encrypt_compact
1977
1978 mov $_inp,$inp # restore inp
1979 mov $_out,$out # restore out
1980 mov $_len,%r10 # restore len
1981 mov $s0,0($out)
1982 mov $s1,4($out)
1983 mov $s2,8($out)
1984 mov $s3,12($out)
1985
1986 lea 16($inp),$inp
1987 lea 16($out),$out
1988 sub \$16,%r10
1989 test \$-16,%r10
1990 jnz .Lcbc_slow_enc_loop
1991 test \$15,%r10
1992 jnz .Lcbc_slow_enc_tail
1993 mov $_ivp,%rbp # restore ivp
1994 mov $s0,0(%rbp) # save ivec
1995 mov $s1,4(%rbp)
1996 mov $s2,8(%rbp)
1997 mov $s3,12(%rbp)
1998
1999 jmp .Lcbc_exit
2000
2001.align 4
2002.Lcbc_slow_enc_tail:
2003 mov %rax,%r11
2004 mov %rcx,%r12
2005 mov %r10,%rcx
2006 mov $inp,%rsi
2007 mov $out,%rdi
2008 .long 0x9066A4F3 # rep movsb
2009 mov \$16,%rcx # zero tail
2010 sub %r10,%rcx
2011 xor %rax,%rax
2012 .long 0x9066AAF3 # rep stosb
2013 mov $out,$inp # this is not a mistake!
2014 mov \$16,%r10 # len=16
2015 mov %r11,%rax
2016 mov %r12,%rcx
2017 jmp .Lcbc_slow_enc_loop # one more spin...
2018#--------------------------- SLOW DECRYPT ---------------------------#
2019.align 16
2020.LSLOW_DECRYPT:
2021 shr \$3,%rax
2022 add %rax,$sbox # recall "magic" constants!
2023
2024 mov 0(%rbp),%r11 # copy iv to stack
2025 mov 8(%rbp),%r12
2026 mov %r11,0+$ivec
2027 mov %r12,8+$ivec
2028
2029.align 4
2030.Lcbc_slow_dec_loop:
2031 mov 0($inp),$s0 # load input
2032 mov 4($inp),$s1
2033 mov 8($inp),$s2
2034 mov 12($inp),$s3
2035 mov $keyp,$key # restore key
2036 mov $inp,$_inp # save inp
2037 mov $out,$_out # save out
2038 mov %r10,$_len # save len
2039
2040 call _x86_64_AES_decrypt_compact
2041
2042 mov $_inp,$inp # restore inp
2043 mov $_out,$out # restore out
2044 mov $_len,%r10
2045 xor 0+$ivec,$s0
2046 xor 4+$ivec,$s1
2047 xor 8+$ivec,$s2
2048 xor 12+$ivec,$s3
2049
2050 mov 0($inp),%r11 # load input
2051 mov 8($inp),%r12
2052 sub \$16,%r10
2053 jc .Lcbc_slow_dec_partial
2054 jz .Lcbc_slow_dec_done
2055
2056 mov %r11,0+$ivec # copy input to iv
2057 mov %r12,8+$ivec
2058
2059 mov $s0,0($out) # save output [can zap input]
2060 mov $s1,4($out)
2061 mov $s2,8($out)
2062 mov $s3,12($out)
2063
2064 lea 16($inp),$inp
2065 lea 16($out),$out
2066 jmp .Lcbc_slow_dec_loop
2067.Lcbc_slow_dec_done:
2068 mov $_ivp,%rdi
2069 mov %r11,0(%rdi) # copy iv back to user
2070 mov %r12,8(%rdi)
2071
2072 mov $s0,0($out) # save output [can zap input]
2073 mov $s1,4($out)
2074 mov $s2,8($out)
2075 mov $s3,12($out)
2076
2077 jmp .Lcbc_exit
2078
2079.align 4
2080.Lcbc_slow_dec_partial:
2081 mov $_ivp,%rdi
2082 mov %r11,0(%rdi) # copy iv back to user
2083 mov %r12,8(%rdi)
2084
2085 mov $s0,0+$ivec # save output to stack
2086 mov $s1,4+$ivec
2087 mov $s2,8+$ivec
2088 mov $s3,12+$ivec
2089
2090 mov $out,%rdi
2091 lea $ivec,%rsi
2092 lea 16(%r10),%rcx
2093 .long 0x9066A4F3 # rep movsb
2094 jmp .Lcbc_exit
2095
2096.align 16
2097.Lcbc_exit:
2098 mov $_rsp,%rsi
2099 mov (%rsi),%r15
2100 mov 8(%rsi),%r14
2101 mov 16(%rsi),%r13
2102 mov 24(%rsi),%r12
2103 mov 32(%rsi),%rbp
2104 mov 40(%rsi),%rbx
2105 lea 48(%rsi),%rsp
2106.Lcbc_popfq:
2107 popfq
2108.Lcbc_epilogue:
2109 ret
2110.size AES_cbc_encrypt,.-AES_cbc_encrypt
2111___
2112}
2113
2114$code.=<<___;
2115.align 64
2116.LAES_Te:
2117___
2118 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
2119 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
2120 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
2121 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
2122 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
2123 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
2124 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
2125 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
2126 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
2127 &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
2128 &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
2129 &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
2130 &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
2131 &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
2132 &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
2133 &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
2134 &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
2135 &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
2136 &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
2137 &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
2138 &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
2139 &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
2140 &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
2141 &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
2142 &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
2143 &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
2144 &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
2145 &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
2146 &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
2147 &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
2148 &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
2149 &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
2150 &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
2151 &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
2152 &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
2153 &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
2154 &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
2155 &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
2156 &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
2157 &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
2158 &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
2159 &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
2160 &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
2161 &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
2162 &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
2163 &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
2164 &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
2165 &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
2166 &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
2167 &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
2168 &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
2169 &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
2170 &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
2171 &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
2172 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
2173 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
2174 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
2175 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
2176 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
2177 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
2178 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
2179 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
2180 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
2181 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
2182
2183#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
2184 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2185 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2186 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2187 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2188 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2189 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2190 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2191 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2192 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2193 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2194 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2195 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2196 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2197 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2198 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2199 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2200 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2201 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2202 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2203 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2204 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2205 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2206 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2207 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2208 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2209 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2210 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2211 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2212 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2213 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2214 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2215 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2216
2217 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2218 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2219 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2220 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2221 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2222 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2223 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2224 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2225 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2226 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2227 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2228 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2229 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2230 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2231 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2232 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2233 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2234 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2235 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2236 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2237 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2238 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2239 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2240 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2241 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2242 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2243 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2244 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2245 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2246 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2247 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2248 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2249
2250 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2251 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2252 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2253 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2254 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2255 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2256 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2257 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2258 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2259 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2260 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2261 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2262 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2263 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2264 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2265 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2266 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2267 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2268 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2269 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2270 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2271 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2272 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2273 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2274 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2275 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2276 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2277 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2278 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2279 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2280 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2281 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2282
2283 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2284 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2285 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2286 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2287 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2288 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2289 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2290 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2291 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2292 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2293 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2294 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2295 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2296 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2297 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2298 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2299 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2300 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2301 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2302 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2303 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2304 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2305 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2306 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2307 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2308 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2309 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2310 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2311 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2312 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2313 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2314 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2315#rcon:
2316$code.=<<___;
2317 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008
2318 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080
2319 .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
2320 .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
2321___
2322$code.=<<___;
2323.align 64
2324.LAES_Td:
2325___
2326 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
2327 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
2328 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
2329 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
2330 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
2331 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
2332 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
2333 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
2334 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
2335 &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
2336 &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
2337 &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
2338 &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
2339 &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
2340 &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
2341 &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
2342 &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
2343 &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
2344 &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
2345 &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
2346 &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
2347 &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
2348 &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
2349 &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
2350 &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
2351 &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
2352 &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
2353 &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
2354 &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
2355 &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
2356 &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
2357 &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
2358 &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
2359 &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
2360 &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
2361 &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
2362 &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
2363 &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
2364 &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
2365 &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
2366 &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
2367 &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
2368 &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
2369 &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
2370 &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
2371 &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
2372 &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
2373 &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
2374 &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
2375 &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
2376 &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
2377 &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
2378 &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
2379 &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
2380 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
2381 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
2382 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
2383 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
2384 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
2385 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
2386 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
2387 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
2388 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
2389 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
2390
2391#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
2392 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2393 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2394 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2395 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2396 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2397 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2398 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2399 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2400 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2401 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2402 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2403 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2404 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2405 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2406 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2407 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2408 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2409 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2410 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2411 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2412 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2413 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2414 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2415 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2416 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2417 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2418 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2419 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2420 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2421 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2422 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2423 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2424$code.=<<___;
2425 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2426 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2427___
2428 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2429 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2430 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2431 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2432 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2433 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2434 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2435 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2436 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2437 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2438 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2439 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2440 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2441 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2442 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2443 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2444 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2445 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2446 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2447 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2448 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2449 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2450 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2451 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2452 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2453 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2454 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2455 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2456 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2457 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2458 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2459 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2460$code.=<<___;
2461 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2462 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2463___
2464 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2465 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2466 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2467 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2468 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2469 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2470 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2471 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2472 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2473 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2474 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2475 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2476 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2477 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2478 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2479 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2480 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2481 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2482 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2483 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2484 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2485 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2486 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2487 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2488 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2489 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2490 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2491 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2492 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2493 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2494 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2495 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2496$code.=<<___;
2497 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2498 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2499___
2500 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2501 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2502 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2503 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2504 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2505 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2506 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2507 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2508 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2509 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2510 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2511 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2512 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2513 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2514 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2515 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2516 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2517 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2518 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2519 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2520 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2521 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2522 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2523 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2524 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2525 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2526 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2527 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2528 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2529 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2530 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2531 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2532$code.=<<___;
2533 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2534 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2535.asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2536.align 64
2537___
2538
2539# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2540# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2541if ($win64) {
2542$rec="%rcx";
2543$frame="%rdx";
2544$context="%r8";
2545$disp="%r9";
2546
2547$code.=<<___;
2548.extern __imp_RtlVirtualUnwind
2549.type block_se_handler,\@abi-omnipotent
2550.align 16
2551block_se_handler:
2552 push %rsi
2553 push %rdi
2554 push %rbx
2555 push %rbp
2556 push %r12
2557 push %r13
2558 push %r14
2559 push %r15
2560 pushfq
2561 sub \$64,%rsp
2562
2563 mov 120($context),%rax # pull context->Rax
2564 mov 248($context),%rbx # pull context->Rip
2565
2566 mov 8($disp),%rsi # disp->ImageBase
2567 mov 56($disp),%r11 # disp->HandlerData
2568
2569 mov 0(%r11),%r10d # HandlerData[0]
2570 lea (%rsi,%r10),%r10 # prologue label
2571 cmp %r10,%rbx # context->Rip<prologue label
2572 jb .Lin_block_prologue
2573
2574 mov 152($context),%rax # pull context->Rsp
2575
2576 mov 4(%r11),%r10d # HandlerData[1]
2577 lea (%rsi,%r10),%r10 # epilogue label
2578 cmp %r10,%rbx # context->Rip>=epilogue label
2579 jae .Lin_block_prologue
2580
2581 mov 24(%rax),%rax # pull saved real stack pointer
2582 lea 48(%rax),%rax # adjust...
2583
2584 mov -8(%rax),%rbx
2585 mov -16(%rax),%rbp
2586 mov -24(%rax),%r12
2587 mov -32(%rax),%r13
2588 mov -40(%rax),%r14
2589 mov -48(%rax),%r15
2590 mov %rbx,144($context) # restore context->Rbx
2591 mov %rbp,160($context) # restore context->Rbp
2592 mov %r12,216($context) # restore context->R12
2593 mov %r13,224($context) # restore context->R13
2594 mov %r14,232($context) # restore context->R14
2595 mov %r15,240($context) # restore context->R15
2596
2597.Lin_block_prologue:
2598 mov 8(%rax),%rdi
2599 mov 16(%rax),%rsi
2600 mov %rax,152($context) # restore context->Rsp
2601 mov %rsi,168($context) # restore context->Rsi
2602 mov %rdi,176($context) # restore context->Rdi
2603
2604 jmp .Lcommon_seh_exit
2605.size block_se_handler,.-block_se_handler
2606
2607.type key_se_handler,\@abi-omnipotent
2608.align 16
2609key_se_handler:
2610 push %rsi
2611 push %rdi
2612 push %rbx
2613 push %rbp
2614 push %r12
2615 push %r13
2616 push %r14
2617 push %r15
2618 pushfq
2619 sub \$64,%rsp
2620
2621 mov 120($context),%rax # pull context->Rax
2622 mov 248($context),%rbx # pull context->Rip
2623
2624 mov 8($disp),%rsi # disp->ImageBase
2625 mov 56($disp),%r11 # disp->HandlerData
2626
2627 mov 0(%r11),%r10d # HandlerData[0]
2628 lea (%rsi,%r10),%r10 # prologue label
2629 cmp %r10,%rbx # context->Rip<prologue label
2630 jb .Lin_key_prologue
2631
2632 mov 152($context),%rax # pull context->Rsp
2633
2634 mov 4(%r11),%r10d # HandlerData[1]
2635 lea (%rsi,%r10),%r10 # epilogue label
2636 cmp %r10,%rbx # context->Rip>=epilogue label
2637 jae .Lin_key_prologue
2638
2639 lea 56(%rax),%rax
2640
2641 mov -8(%rax),%rbx
2642 mov -16(%rax),%rbp
2643 mov -24(%rax),%r12
2644 mov -32(%rax),%r13
2645 mov -40(%rax),%r14
2646 mov -48(%rax),%r15
2647 mov %rbx,144($context) # restore context->Rbx
2648 mov %rbp,160($context) # restore context->Rbp
2649 mov %r12,216($context) # restore context->R12
2650 mov %r13,224($context) # restore context->R13
2651 mov %r14,232($context) # restore context->R14
2652 mov %r15,240($context) # restore context->R15
2653
2654.Lin_key_prologue:
2655 mov 8(%rax),%rdi
2656 mov 16(%rax),%rsi
2657 mov %rax,152($context) # restore context->Rsp
2658 mov %rsi,168($context) # restore context->Rsi
2659 mov %rdi,176($context) # restore context->Rdi
2660
2661 jmp .Lcommon_seh_exit
2662.size key_se_handler,.-key_se_handler
2663
2664.type cbc_se_handler,\@abi-omnipotent
2665.align 16
2666cbc_se_handler:
2667 push %rsi
2668 push %rdi
2669 push %rbx
2670 push %rbp
2671 push %r12
2672 push %r13
2673 push %r14
2674 push %r15
2675 pushfq
2676 sub \$64,%rsp
2677
2678 mov 120($context),%rax # pull context->Rax
2679 mov 248($context),%rbx # pull context->Rip
2680
2681 lea .Lcbc_prologue(%rip),%r10
2682 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
2683 jb .Lin_cbc_prologue
2684
2685 lea .Lcbc_fast_body(%rip),%r10
2686 cmp %r10,%rbx # context->Rip<.Lcbc_fast_body
2687 jb .Lin_cbc_frame_setup
2688
2689 lea .Lcbc_slow_prologue(%rip),%r10
2690 cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue
2691 jb .Lin_cbc_body
2692
2693 lea .Lcbc_slow_body(%rip),%r10
2694 cmp %r10,%rbx # context->Rip<.Lcbc_slow_body
2695 jb .Lin_cbc_frame_setup
2696
2697.Lin_cbc_body:
2698 mov 152($context),%rax # pull context->Rsp
2699
2700 lea .Lcbc_epilogue(%rip),%r10
2701 cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue
2702 jae .Lin_cbc_prologue
2703
2704 lea 8(%rax),%rax
2705
2706 lea .Lcbc_popfq(%rip),%r10
2707 cmp %r10,%rbx # context->Rip>=.Lcbc_popfq
2708 jae .Lin_cbc_prologue
2709
2710 mov `16-8`(%rax),%rax # biased $_rsp
2711 lea 56(%rax),%rax
2712
2713.Lin_cbc_frame_setup:
2714 mov -16(%rax),%rbx
2715 mov -24(%rax),%rbp
2716 mov -32(%rax),%r12
2717 mov -40(%rax),%r13
2718 mov -48(%rax),%r14
2719 mov -56(%rax),%r15
2720 mov %rbx,144($context) # restore context->Rbx
2721 mov %rbp,160($context) # restore context->Rbp
2722 mov %r12,216($context) # restore context->R12
2723 mov %r13,224($context) # restore context->R13
2724 mov %r14,232($context) # restore context->R14
2725 mov %r15,240($context) # restore context->R15
2726
2727.Lin_cbc_prologue:
2728 mov 8(%rax),%rdi
2729 mov 16(%rax),%rsi
2730 mov %rax,152($context) # restore context->Rsp
2731 mov %rsi,168($context) # restore context->Rsi
2732 mov %rdi,176($context) # restore context->Rdi
2733
2734.Lcommon_seh_exit:
2735
2736 mov 40($disp),%rdi # disp->ContextRecord
2737 mov $context,%rsi # context
2738 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2739 .long 0xa548f3fc # cld; rep movsq
2740
2741 mov $disp,%rsi
2742 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2743 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2744 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2745 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2746 mov 40(%rsi),%r10 # disp->ContextRecord
2747 lea 56(%rsi),%r11 # &disp->HandlerData
2748 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2749 mov %r10,32(%rsp) # arg5
2750 mov %r11,40(%rsp) # arg6
2751 mov %r12,48(%rsp) # arg7
2752 mov %rcx,56(%rsp) # arg8, (NULL)
2753 call *__imp_RtlVirtualUnwind(%rip)
2754
2755 mov \$1,%eax # ExceptionContinueSearch
2756 add \$64,%rsp
2757 popfq
2758 pop %r15
2759 pop %r14
2760 pop %r13
2761 pop %r12
2762 pop %rbp
2763 pop %rbx
2764 pop %rdi
2765 pop %rsi
2766 ret
2767.size cbc_se_handler,.-cbc_se_handler
2768
2769.section .pdata
2770.align 4
2771 .rva .LSEH_begin_AES_encrypt
2772 .rva .LSEH_end_AES_encrypt
2773 .rva .LSEH_info_AES_encrypt
2774
2775 .rva .LSEH_begin_AES_decrypt
2776 .rva .LSEH_end_AES_decrypt
2777 .rva .LSEH_info_AES_decrypt
2778
2779 .rva .LSEH_begin_AES_set_encrypt_key
2780 .rva .LSEH_end_AES_set_encrypt_key
2781 .rva .LSEH_info_AES_set_encrypt_key
2782
2783 .rva .LSEH_begin_AES_set_decrypt_key
2784 .rva .LSEH_end_AES_set_decrypt_key
2785 .rva .LSEH_info_AES_set_decrypt_key
2786
2787 .rva .LSEH_begin_AES_cbc_encrypt
2788 .rva .LSEH_end_AES_cbc_encrypt
2789 .rva .LSEH_info_AES_cbc_encrypt
2790
2791.section .xdata
2792.align 8
2793.LSEH_info_AES_encrypt:
2794 .byte 9,0,0,0
2795 .rva block_se_handler
2796 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
2797.LSEH_info_AES_decrypt:
2798 .byte 9,0,0,0
2799 .rva block_se_handler
2800 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
2801.LSEH_info_AES_set_encrypt_key:
2802 .byte 9,0,0,0
2803 .rva key_se_handler
2804 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
2805.LSEH_info_AES_set_decrypt_key:
2806 .byte 9,0,0,0
2807 .rva key_se_handler
2808 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
2809.LSEH_info_AES_cbc_encrypt:
2810 .byte 9,0,0,0
2811 .rva cbc_se_handler
2812___
2813}
2814
2815$code =~ s/\`([^\`]*)\`/eval($1)/gem;
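# Editor's note: the substitution above constant-folds every backquoted
# expression accumulated in $code, e.g. `16-8` in cbc_se_handler becomes
# the literal 8 before the assembly is printed.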
2816
2817print $code;
2818
2819close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl
deleted file mode 100644
index 39b504cbe5..0000000000
--- a/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl
+++ /dev/null
@@ -1,1232 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# June 2011
11#
12# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as
13# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
14# is that since AESNI-CBC encryption exhibits *very* low instruction-level
15# parallelism, interleaving it with another algorithm allows better
16# utilization of processor resources and thus better performance.
17# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and the
18# AESNI code is woven into it. Below are performance numbers in
19# cycles per processed byte (less is better) for standalone AESNI-CBC
20# encrypt, for the sum of the latter and standalone SHA1, and for the
21# "stitched" subroutine:
22#
23# AES-128-CBC +SHA1 stitch gain
24# Westmere 3.77[+5.6] 9.37 6.65 +41%
25# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
26#
27# AES-192-CBC
28# Westmere 4.51 10.11 6.97 +45%
29# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
30#
31# AES-256-CBC
32# Westmere 5.25 10.85 7.25 +50%
33# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
34#
35# (*)	There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
36#	background information. The numbers in parentheses above are SSSE3
37#	results collected on an AVX-capable CPU, i.e. they apply to OSes
38#	that don't support AVX.
39#
40# Needless to say, it makes no sense to implement a "stitched"
41# *decrypt* subroutine: because *both* AESNI-CBC decrypt and SHA1
42# already fully utilize available parallelism, stitching would not
43# give any gain. Well, there might be some, e.g. from better cache
44# locality... For reference, here are performance results for
45# standalone AESNI-CBC decrypt:
46#
47# AES-128-CBC AES-192-CBC AES-256-CBC
48# Westmere 1.31 1.55 1.80
49# Sandy Bridge 0.93 1.06 1.22
50
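# Editor's sketch (not part of the original module): the "stitch"
# spreads 12 aesenc instructions across every 20 SHA1 rounds -- see
# body_00_19 below.  The stand-alone helper that follows illustrates
# that density computation; it is never called and its name is
# invented for illustration only.
sub _editor_demo_stitch_density {
	my @spliced;
	for my $jj (0 .. 19) {				# 20 SHA1 rounds
		my $quota = int(($jj + 1) * 12 / 20);	# cumulative aesenc quota
		my $prev  = int($jj * 12 / 20);
		push(@spliced, $jj) if ($quota > $prev);# this round gets an aesenc
	}
	return @spliced;	# exactly 12 of the 20 round indices
}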
51$flavour = shift;
52$output = shift;
53if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
54
55$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
56
57$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
58( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
59( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
60die "can't locate x86_64-xlate.pl";
61
62$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
63 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
64 $1>=2.19);
65$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
66 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
67 $1>=2.09);
68$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69 `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
70 $1>=10);
71
72open OUT,"| \"$^X\" $xlate $flavour $output";
73*STDOUT=*OUT;
74
75# void aesni_cbc_sha1_enc(const void *inp,
76# void *out,
77# size_t length,
78# const AES_KEY *key,
79# unsigned char *iv,
80# SHA_CTX *ctx,
81# const void *in0);
82
83$code.=<<___;
84.text
85.extern OPENSSL_ia32cap_P
86
87.globl aesni_cbc_sha1_enc
88.type aesni_cbc_sha1_enc,\@abi-omnipotent
89.align 16
90aesni_cbc_sha1_enc:
91 # caller should check for SSSE3 and AES-NI bits
92 mov OPENSSL_ia32cap_P+0(%rip),%r10d
93 mov OPENSSL_ia32cap_P+4(%rip),%r11d
94___
95$code.=<<___ if ($avx);
96 and \$`1<<28`,%r11d # mask AVX bit
97 and \$`1<<30`,%r10d # mask "Intel CPU" bit
98 or %r11d,%r10d
99 cmp \$`1<<28|1<<30`,%r10d
100 je aesni_cbc_sha1_enc_avx
101___
102$code.=<<___;
103 jmp aesni_cbc_sha1_enc_ssse3
104 ret
105.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
106___
107
108my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
109
110my $Xi=4;
111my @X=map("%xmm$_",(4..7,0..3));
112my @Tx=map("%xmm$_",(8..10));
113my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
114my @T=("%esi","%edi");
115my $j=0; my $jj=0; my $r=0; my $sn=0;
116my $K_XX_XX="%r11";
117my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
118my @rndkey=("%xmm14","%xmm15");
119
120sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
121{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
122 my $arg = pop;
123 $arg = "\$$arg" if ($arg*1 eq $arg);
124 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
125}
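# Editor's note (illustration): with the AUTOLOAD thunk above, a call
# such as &movdqa(@X[0],@X[-3&7]) pops the last argument and emits it
# first, producing the AT&T-order line
#	movdqa	%xmm1,%xmm4
# for the initial @X layout (integer arguments get a "$" prefix).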
126
127my $_rol=sub { &rol(@_) };
128my $_ror=sub { &ror(@_) };
129
130$code.=<<___;
131.type aesni_cbc_sha1_enc_ssse3,\@function,6
132.align 16
133aesni_cbc_sha1_enc_ssse3:
134 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
135 #shr \$6,$len # debugging artefact
136 #jz .Lepilogue_ssse3 # debugging artefact
137 push %rbx
138 push %rbp
139 push %r12
140 push %r13
141 push %r14
142 push %r15
143 lea `-104-($win64?10*16:0)`(%rsp),%rsp
144 #mov $in0,$inp # debugging artefact
145 #lea 64(%rsp),$ctx # debugging artefact
146___
147$code.=<<___ if ($win64);
148 movaps %xmm6,96+0(%rsp)
149 movaps %xmm7,96+16(%rsp)
150 movaps %xmm8,96+32(%rsp)
151 movaps %xmm9,96+48(%rsp)
152 movaps %xmm10,96+64(%rsp)
153 movaps %xmm11,96+80(%rsp)
154 movaps %xmm12,96+96(%rsp)
155 movaps %xmm13,96+112(%rsp)
156 movaps %xmm14,96+128(%rsp)
157 movaps %xmm15,96+144(%rsp)
158.Lprologue_ssse3:
159___
160$code.=<<___;
161 mov $in0,%r12 # reassign arguments
162 mov $out,%r13
163 mov $len,%r14
164 mov $key,%r15
165 movdqu ($ivp),$iv # load IV
166 mov $ivp,88(%rsp) # save $ivp
167___
168my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
169my $rounds="${ivp}d";
170$code.=<<___;
171 shl \$6,$len
172 sub $in0,$out
173 mov 240($key),$rounds
174 add $inp,$len # end of input
175
176 lea K_XX_XX(%rip),$K_XX_XX
177 mov 0($ctx),$A # load context
178 mov 4($ctx),$B
179 mov 8($ctx),$C
180 mov 12($ctx),$D
181 mov $B,@T[0] # magic seed
182 mov 16($ctx),$E
183
184 movdqa 64($K_XX_XX),@X[2] # pbswap mask
185 movdqa 0($K_XX_XX),@Tx[1] # K_00_19
186 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
187 movdqu 16($inp),@X[-3&7]
188 movdqu 32($inp),@X[-2&7]
189 movdqu 48($inp),@X[-1&7]
190 pshufb @X[2],@X[-4&7] # byte swap
191 add \$64,$inp
192 pshufb @X[2],@X[-3&7]
193 pshufb @X[2],@X[-2&7]
194 pshufb @X[2],@X[-1&7]
195 paddd @Tx[1],@X[-4&7] # add K_00_19
196 paddd @Tx[1],@X[-3&7]
197 paddd @Tx[1],@X[-2&7]
198 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
199 psubd @Tx[1],@X[-4&7] # restore X[]
200 movdqa @X[-3&7],16(%rsp)
201 psubd @Tx[1],@X[-3&7]
202 movdqa @X[-2&7],32(%rsp)
203 psubd @Tx[1],@X[-2&7]
204 movups ($key),$rndkey0 # $key[0]
205 movups 16($key),$rndkey[0] # forward reference
206 jmp .Loop_ssse3
207___
208
209my $aesenc=sub {
210 use integer;
211 my ($n,$k)=($r/10,$r%10);
212 if ($k==0) {
213 $code.=<<___;
214 movups `16*$n`($in0),$in # load input
215 xorps $rndkey0,$in
216___
217 $code.=<<___ if ($n);
218 movups $iv,`16*($n-1)`($out,$in0) # write output
219___
220 $code.=<<___;
221 xorps $in,$iv
222 aesenc $rndkey[0],$iv
223 movups `32+16*$k`($key),$rndkey[1]
224___
225 } elsif ($k==9) {
226 $sn++;
227 $code.=<<___;
228 cmp \$11,$rounds
229 jb .Laesenclast$sn
230 movups `32+16*($k+0)`($key),$rndkey[1]
231 aesenc $rndkey[0],$iv
232 movups `32+16*($k+1)`($key),$rndkey[0]
233 aesenc $rndkey[1],$iv
234 je .Laesenclast$sn
235 movups `32+16*($k+2)`($key),$rndkey[1]
236 aesenc $rndkey[0],$iv
237 movups `32+16*($k+3)`($key),$rndkey[0]
238 aesenc $rndkey[1],$iv
239.Laesenclast$sn:
240 aesenclast $rndkey[0],$iv
241 movups 16($key),$rndkey[1] # forward reference
242___
243 } else {
244 $code.=<<___;
245 aesenc $rndkey[0],$iv
246 movups `32+16*$k`($key),$rndkey[1]
247___
248 }
249 $r++; unshift(@rndkey,pop(@rndkey));
250};
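# Editor's note (hedged): in the closure above $r counts spliced AES
# slots; each group of 10, ($n,$k)=($r/10,$r%10), covers one 16-byte
# block.  Slot 0 loads and whitens block $n (writing out block $n-1),
# slots 0..8 each issue one aesenc, and slot 9 finishes the block:
# "jb" keeps 10 rounds for AES-128, "je" adds two for AES-192, and
# falling through adds four for AES-256 before aesenclast.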
251
252sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
253{ use integer;
254 my $body = shift;
255 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
256 my ($a,$b,$c,$d,$e);
257
258 &movdqa (@X[0],@X[-3&7]);
259 eval(shift(@insns));
260 eval(shift(@insns));
261 &movdqa (@Tx[0],@X[-1&7]);
262 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
263 eval(shift(@insns));
264 eval(shift(@insns));
265
266 &paddd (@Tx[1],@X[-1&7]);
267 eval(shift(@insns));
268 eval(shift(@insns));
269 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
270 eval(shift(@insns));
271 eval(shift(@insns));
272 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
273 eval(shift(@insns));
274 eval(shift(@insns));
275
276 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
277 eval(shift(@insns));
278 eval(shift(@insns));
279 eval(shift(@insns));
280 eval(shift(@insns));
281
282 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
283 eval(shift(@insns));
284 eval(shift(@insns));
285 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
286 eval(shift(@insns));
287 eval(shift(@insns));
288
289 &movdqa (@Tx[2],@X[0]);
290 &movdqa (@Tx[0],@X[0]);
291 eval(shift(@insns));
292 eval(shift(@insns));
293 eval(shift(@insns));
294 eval(shift(@insns));
295
296 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
297 &paddd (@X[0],@X[0]);
298 eval(shift(@insns));
299 eval(shift(@insns));
300 eval(shift(@insns));
301 eval(shift(@insns));
302
303 &psrld (@Tx[0],31);
304 eval(shift(@insns));
305 eval(shift(@insns));
306 &movdqa (@Tx[1],@Tx[2]);
307 eval(shift(@insns));
308 eval(shift(@insns));
309
310 &psrld (@Tx[2],30);
311 &por (@X[0],@Tx[0]); # "X[0]"<<<=1
312 eval(shift(@insns));
313 eval(shift(@insns));
314 eval(shift(@insns));
315 eval(shift(@insns));
316
317 &pslld (@Tx[1],2);
318 &pxor (@X[0],@Tx[2]);
319 eval(shift(@insns));
320 eval(shift(@insns));
321 &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
322 eval(shift(@insns));
323 eval(shift(@insns));
324
325 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
326
327 foreach (@insns) { eval; } # remaining instructions [if any]
328
329 $Xi++; push(@X,shift(@X)); # "rotate" X[]
330 push(@Tx,shift(@Tx));
331}
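# Editor's cross-check (hedged, never called): the vector code above
# computes four message-schedule words at once; one scalar step of the
# same recurrence is simply
#	W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1
sub _editor_demo_sha1_schedule_step {
	my ($W, $t) = @_;		# $W holds 32-bit words
	my $x = $W->[$t-3] ^ $W->[$t-8] ^ $W->[$t-14] ^ $W->[$t-16];
	return (($x << 1) | ($x >> 31)) & 0xffffffff;	# rotate left 1
}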
332
333sub Xupdate_ssse3_32_79()
334{ use integer;
335 my $body = shift;
336 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
337 my ($a,$b,$c,$d,$e);
338
339 &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
340 eval(shift(@insns)); # body_20_39
341 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
342 &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
343 eval(shift(@insns));
344 eval(shift(@insns));
345 eval(shift(@insns)); # rol
346
347 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
348 eval(shift(@insns));
349 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
350 if ($Xi%5) {
351 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
352 } else { # ... or load next one
353 &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
354 }
355 &paddd (@Tx[1],@X[-1&7]);
356 eval(shift(@insns)); # ror
357 eval(shift(@insns));
358
359 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
360 eval(shift(@insns)); # body_20_39
361 eval(shift(@insns));
362 eval(shift(@insns));
363 eval(shift(@insns)); # rol
364
365 &movdqa (@Tx[0],@X[0]);
366 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
367 eval(shift(@insns));
368 eval(shift(@insns));
369 eval(shift(@insns)); # ror
370 eval(shift(@insns));
371
372 &pslld (@X[0],2);
373 eval(shift(@insns)); # body_20_39
374 eval(shift(@insns));
375 &psrld (@Tx[0],30);
376 eval(shift(@insns));
377 eval(shift(@insns)); # rol
378 eval(shift(@insns));
379 eval(shift(@insns));
380 eval(shift(@insns)); # ror
381 eval(shift(@insns));
382
383 &por (@X[0],@Tx[0]); # "X[0]"<<<=2
384 eval(shift(@insns)); # body_20_39
385 eval(shift(@insns));
386 &movdqa (@Tx[1],@X[0]) if ($Xi<19);
387 eval(shift(@insns));
388 eval(shift(@insns)); # rol
389 eval(shift(@insns));
390 eval(shift(@insns));
391 eval(shift(@insns)); # rol
392 eval(shift(@insns));
393
394 foreach (@insns) { eval; } # remaining instructions
395
396 $Xi++; push(@X,shift(@X)); # "rotate" X[]
397 push(@Tx,shift(@Tx));
398}
399
400sub Xuplast_ssse3_80()
401{ use integer;
402 my $body = shift;
403 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
404 my ($a,$b,$c,$d,$e);
405
406 eval(shift(@insns));
407 &paddd (@Tx[1],@X[-1&7]);
408 eval(shift(@insns));
409 eval(shift(@insns));
410 eval(shift(@insns));
411 eval(shift(@insns));
412
413 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
414
415 foreach (@insns) { eval; } # remaining instructions
416
417 &cmp ($inp,$len);
418 &je (".Ldone_ssse3");
419
420 unshift(@Tx,pop(@Tx));
421
422 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
423 &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
424 &movdqu (@X[-4&7],"0($inp)"); # load input
425 &movdqu (@X[-3&7],"16($inp)");
426 &movdqu (@X[-2&7],"32($inp)");
427 &movdqu (@X[-1&7],"48($inp)");
428 &pshufb (@X[-4&7],@X[2]); # byte swap
429 &add ($inp,64);
430
431 $Xi=0;
432}
433
434sub Xloop_ssse3()
435{ use integer;
436 my $body = shift;
437 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
438 my ($a,$b,$c,$d,$e);
439
440 eval(shift(@insns));
441 eval(shift(@insns));
442 &pshufb (@X[($Xi-3)&7],@X[2]);
443 eval(shift(@insns));
444 eval(shift(@insns));
445 &paddd (@X[($Xi-4)&7],@Tx[1]);
446 eval(shift(@insns));
447 eval(shift(@insns));
448 eval(shift(@insns));
449 eval(shift(@insns));
450 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
451 eval(shift(@insns));
452 eval(shift(@insns));
453 &psubd (@X[($Xi-4)&7],@Tx[1]);
454
455 foreach (@insns) { eval; }
456 $Xi++;
457}
458
459sub Xtail_ssse3()
460{ use integer;
461 my $body = shift;
462 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
463 my ($a,$b,$c,$d,$e);
464
465 foreach (@insns) { eval; }
466}
467
468sub body_00_19 () {
469 use integer;
470 my ($k,$n);
471 my @r=(
472 '($a,$b,$c,$d,$e)=@V;'.
473 '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
474 '&xor ($c,$d);',
475 '&mov (@T[1],$a);', # $b in next round
476 '&$_rol ($a,5);',
477 '&and (@T[0],$c);', # ($b&($c^$d))
478 '&xor ($c,$d);', # restore $c
479 '&xor (@T[0],$d);',
480 '&add ($e,$a);',
481 '&$_ror ($b,$j?7:2);', # $b>>>2
482 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
483 );
484 $n = scalar(@r);
485 $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
486 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
487 $jj++;
488 return @r;
489}
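# Editor's cross-check (hedged, never called): rounds 0..19 use the
# SHA1 "Ch" function, computed above with the masking identity
#	Ch(b,c,d) = (b & c) | (~b & d) = d ^ (b & (c ^ d))
# which saves an instruction over the textbook form.
sub _editor_demo_sha1_ch {
	my ($b, $c, $d) = @_;
	return $d ^ ($b & ($c ^ $d));
}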
490
491sub body_20_39 () {
492 use integer;
493 my ($k,$n);
494 my @r=(
495 '($a,$b,$c,$d,$e)=@V;'.
496 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
497 '&xor (@T[0],$d);', # ($b^$d)
498 '&mov (@T[1],$a);', # $b in next round
499 '&$_rol ($a,5);',
500 '&xor (@T[0],$c);', # ($b^$d^$c)
501 '&add ($e,$a);',
502 '&$_ror ($b,7);', # $b>>>2
503 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
504 );
505 $n = scalar(@r);
506 $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
507 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
508 $jj++;
509 return @r;
510}
511
512sub body_40_59 () {
513 use integer;
514 my ($k,$n);
515 my @r=(
516 '($a,$b,$c,$d,$e)=@V;'.
517 '&mov (@T[1],$c);',
518 '&xor ($c,$d);',
519 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
520 '&and (@T[1],$d);',
521 '&and (@T[0],$c);', # ($b&($c^$d))
522 '&$_ror ($b,7);', # $b>>>2
523 '&add ($e,@T[1]);',
524 '&mov (@T[1],$a);', # $b in next round
525 '&$_rol ($a,5);',
526 '&add ($e,@T[0]);',
527 '&xor ($c,$d);', # restore $c
528 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
529 );
530 $n = scalar(@r);
531 $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
532 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
533 $jj++;
534 return @r;
535}
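# Editor's cross-check (hedged, never called): rounds 40..59 use the
# SHA1 "Maj" function.  The two separate adds above rely on
#	Maj(b,c,d) = (b & (c ^ d)) + (c & d)
# whose terms are bitwise disjoint (c & d is nonzero only where
# c ^ d is zero), so the sum never carries and equals their OR.
sub _editor_demo_sha1_maj {
	my ($b, $c, $d) = @_;
	return ($b & ($c ^ $d)) + ($c & $d);
}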
536$code.=<<___;
537.align 16
538.Loop_ssse3:
539___
540 &Xupdate_ssse3_16_31(\&body_00_19);
541 &Xupdate_ssse3_16_31(\&body_00_19);
542 &Xupdate_ssse3_16_31(\&body_00_19);
543 &Xupdate_ssse3_16_31(\&body_00_19);
544 &Xupdate_ssse3_32_79(\&body_00_19);
545 &Xupdate_ssse3_32_79(\&body_20_39);
546 &Xupdate_ssse3_32_79(\&body_20_39);
547 &Xupdate_ssse3_32_79(\&body_20_39);
548 &Xupdate_ssse3_32_79(\&body_20_39);
549 &Xupdate_ssse3_32_79(\&body_20_39);
550 &Xupdate_ssse3_32_79(\&body_40_59);
551 &Xupdate_ssse3_32_79(\&body_40_59);
552 &Xupdate_ssse3_32_79(\&body_40_59);
553 &Xupdate_ssse3_32_79(\&body_40_59);
554 &Xupdate_ssse3_32_79(\&body_40_59);
555 &Xupdate_ssse3_32_79(\&body_20_39);
556 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
557
558 $saved_j=$j; @saved_V=@V;
559 $saved_r=$r; @saved_rndkey=@rndkey;
560
561 &Xloop_ssse3(\&body_20_39);
562 &Xloop_ssse3(\&body_20_39);
563 &Xloop_ssse3(\&body_20_39);
564
565$code.=<<___;
566 movups $iv,48($out,$in0) # write output
567 lea 64($in0),$in0
568
569 add 0($ctx),$A # update context
570 add 4($ctx),@T[0]
571 add 8($ctx),$C
572 add 12($ctx),$D
573 mov $A,0($ctx)
574 add 16($ctx),$E
575 mov @T[0],4($ctx)
576 mov @T[0],$B # magic seed
577 mov $C,8($ctx)
578 mov $D,12($ctx)
579 mov $E,16($ctx)
580 jmp .Loop_ssse3
581
582.align 16
583.Ldone_ssse3:
584___
585 $jj=$j=$saved_j; @V=@saved_V;
586 $r=$saved_r; @rndkey=@saved_rndkey;
587
588 &Xtail_ssse3(\&body_20_39);
589 &Xtail_ssse3(\&body_20_39);
590 &Xtail_ssse3(\&body_20_39);
591
592$code.=<<___;
593 movups $iv,48($out,$in0) # write output
594 mov 88(%rsp),$ivp # restore $ivp
595
596 add 0($ctx),$A # update context
597 add 4($ctx),@T[0]
598 add 8($ctx),$C
599 mov $A,0($ctx)
600 add 12($ctx),$D
601 mov @T[0],4($ctx)
602 add 16($ctx),$E
603 mov $C,8($ctx)
604 mov $D,12($ctx)
605 mov $E,16($ctx)
606 movups $iv,($ivp) # write IV
607___
608$code.=<<___ if ($win64);
609 movaps 96+0(%rsp),%xmm6
610 movaps 96+16(%rsp),%xmm7
611 movaps 96+32(%rsp),%xmm8
612 movaps 96+48(%rsp),%xmm9
613 movaps 96+64(%rsp),%xmm10
614 movaps 96+80(%rsp),%xmm11
615 movaps 96+96(%rsp),%xmm12
616 movaps 96+112(%rsp),%xmm13
617 movaps 96+128(%rsp),%xmm14
618 movaps 96+144(%rsp),%xmm15
619___
620$code.=<<___;
621 lea `104+($win64?10*16:0)`(%rsp),%rsi
622 mov 0(%rsi),%r15
623 mov 8(%rsi),%r14
624 mov 16(%rsi),%r13
625 mov 24(%rsi),%r12
626 mov 32(%rsi),%rbp
627 mov 40(%rsi),%rbx
628 lea 48(%rsi),%rsp
629.Lepilogue_ssse3:
630 ret
631.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
632___
633
634$j=$jj=$r=$sn=0;
635
636if ($avx) {
637my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
638
639my $Xi=4;
640my @X=map("%xmm$_",(4..7,0..3));
641my @Tx=map("%xmm$_",(8..10));
642my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
643my @T=("%esi","%edi");
644
645my $_rol=sub { &shld(@_[0],@_) };
646my $_ror=sub { &shrd(@_[0],@_) };
647
648$code.=<<___;
649.type aesni_cbc_sha1_enc_avx,\@function,6
650.align 16
651aesni_cbc_sha1_enc_avx:
652 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
653 #shr \$6,$len # debugging artefact
654 #jz .Lepilogue_avx # debugging artefact
655 push %rbx
656 push %rbp
657 push %r12
658 push %r13
659 push %r14
660 push %r15
661 lea `-104-($win64?10*16:0)`(%rsp),%rsp
662 #mov $in0,$inp # debugging artefact
663 #lea 64(%rsp),$ctx # debugging artefact
664___
665$code.=<<___ if ($win64);
666 movaps %xmm6,96+0(%rsp)
667 movaps %xmm7,96+16(%rsp)
668 movaps %xmm8,96+32(%rsp)
669 movaps %xmm9,96+48(%rsp)
670 movaps %xmm10,96+64(%rsp)
671 movaps %xmm11,96+80(%rsp)
672 movaps %xmm12,96+96(%rsp)
673 movaps %xmm13,96+112(%rsp)
674 movaps %xmm14,96+128(%rsp)
675 movaps %xmm15,96+144(%rsp)
676.Lprologue_avx:
677___
678$code.=<<___;
679 vzeroall
680 mov $in0,%r12 # reassign arguments
681 mov $out,%r13
682 mov $len,%r14
683 mov $key,%r15
684 vmovdqu ($ivp),$iv # load IV
685 mov $ivp,88(%rsp) # save $ivp
686___
687my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
688my $rounds="${ivp}d";
689$code.=<<___;
690 shl \$6,$len
691 sub $in0,$out
692 mov 240($key),$rounds
693 add \$112,$key # size optimization
694 add $inp,$len # end of input
695
696 lea K_XX_XX(%rip),$K_XX_XX
697 mov 0($ctx),$A # load context
698 mov 4($ctx),$B
699 mov 8($ctx),$C
700 mov 12($ctx),$D
701 mov $B,@T[0] # magic seed
702 mov 16($ctx),$E
703
704 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
705 vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
706 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
707 vmovdqu 16($inp),@X[-3&7]
708 vmovdqu 32($inp),@X[-2&7]
709 vmovdqu 48($inp),@X[-1&7]
710 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
711 add \$64,$inp
712 vpshufb @X[2],@X[-3&7],@X[-3&7]
713 vpshufb @X[2],@X[-2&7],@X[-2&7]
714 vpshufb @X[2],@X[-1&7],@X[-1&7]
715 vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
716 vpaddd @Tx[1],@X[-3&7],@X[1]
717 vpaddd @Tx[1],@X[-2&7],@X[2]
718 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
719 vmovdqa @X[1],16(%rsp)
720 vmovdqa @X[2],32(%rsp)
721 vmovups -112($key),$rndkey0 # $key[0]
722 vmovups 16-112($key),$rndkey[0] # forward reference
723 jmp .Loop_avx
724___
725
726my $aesenc=sub {
727 use integer;
728 my ($n,$k)=($r/10,$r%10);
729 if ($k==0) {
730 $code.=<<___;
731 vmovups `16*$n`($in0),$in # load input
732 vxorps $rndkey0,$in,$in
733___
734 $code.=<<___ if ($n);
735 vmovups $iv,`16*($n-1)`($out,$in0) # write output
736___
737 $code.=<<___;
738 vxorps $in,$iv,$iv
739 vaesenc $rndkey[0],$iv,$iv
740 vmovups `32+16*$k-112`($key),$rndkey[1]
741___
742 } elsif ($k==9) {
743 $sn++;
744 $code.=<<___;
745 cmp \$11,$rounds
746 jb .Lvaesenclast$sn
747 vaesenc $rndkey[0],$iv,$iv
748 vmovups `32+16*($k+0)-112`($key),$rndkey[1]
749 vaesenc $rndkey[1],$iv,$iv
750 vmovups `32+16*($k+1)-112`($key),$rndkey[0]
751 je .Lvaesenclast$sn
752 vaesenc $rndkey[0],$iv,$iv
753 vmovups `32+16*($k+2)-112`($key),$rndkey[1]
754 vaesenc $rndkey[1],$iv,$iv
755 vmovups `32+16*($k+3)-112`($key),$rndkey[0]
756.Lvaesenclast$sn:
757 vaesenclast $rndkey[0],$iv,$iv
758 vmovups 16-112($key),$rndkey[1] # forward reference
759___
760 } else {
761 $code.=<<___;
762 vaesenc $rndkey[0],$iv,$iv
763 vmovups `32+16*$k-112`($key),$rndkey[1]
764___
765 }
766 $r++; unshift(@rndkey,pop(@rndkey));
767};
768
769sub Xupdate_avx_16_31() # recall that $Xi starts with 4
770{ use integer;
771 my $body = shift;
772 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
773 my ($a,$b,$c,$d,$e);
774
775 eval(shift(@insns));
776 eval(shift(@insns));
777 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
778 eval(shift(@insns));
779 eval(shift(@insns));
780
781 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
782 eval(shift(@insns));
783 eval(shift(@insns));
784 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
785 eval(shift(@insns));
786 eval(shift(@insns));
787 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
788 eval(shift(@insns));
789 eval(shift(@insns));
790
791 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
792 eval(shift(@insns));
793 eval(shift(@insns));
794 eval(shift(@insns));
795 eval(shift(@insns));
796
797 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
798 eval(shift(@insns));
799 eval(shift(@insns));
800 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
801 eval(shift(@insns));
802 eval(shift(@insns));
803
804 &vpsrld (@Tx[0],@X[0],31);
805 eval(shift(@insns));
806 eval(shift(@insns));
807 eval(shift(@insns));
808 eval(shift(@insns));
809
810 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
811 &vpaddd (@X[0],@X[0],@X[0]);
812 eval(shift(@insns));
813 eval(shift(@insns));
814 eval(shift(@insns));
815 eval(shift(@insns));
816
817 &vpsrld (@Tx[1],@Tx[2],30);
818 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
819 eval(shift(@insns));
820 eval(shift(@insns));
821 eval(shift(@insns));
822 eval(shift(@insns));
823
824 &vpslld (@Tx[2],@Tx[2],2);
825 &vpxor (@X[0],@X[0],@Tx[1]);
826 eval(shift(@insns));
827 eval(shift(@insns));
828 eval(shift(@insns));
829 eval(shift(@insns));
830
831 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
832 eval(shift(@insns));
833 eval(shift(@insns));
834 &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
835 eval(shift(@insns));
836 eval(shift(@insns));
837
838
839 foreach (@insns) { eval; } # remaining instructions [if any]
840
841 $Xi++; push(@X,shift(@X)); # "rotate" X[]
842 push(@Tx,shift(@Tx));
843}
844
845sub Xupdate_avx_32_79()
846{ use integer;
847 my $body = shift;
848 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
849 my ($a,$b,$c,$d,$e);
850
851 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
852 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
853 eval(shift(@insns)); # body_20_39
854 eval(shift(@insns));
855 eval(shift(@insns));
856 eval(shift(@insns)); # rol
857
858 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
859 eval(shift(@insns));
860 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
861 if ($Xi%5) {
862 &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
863 } else { # ... or load next one
864 &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
865 }
866 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
867 eval(shift(@insns)); # ror
868 eval(shift(@insns));
869
870 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
871 eval(shift(@insns)); # body_20_39
872 eval(shift(@insns));
873 eval(shift(@insns));
874 eval(shift(@insns)); # rol
875
876 &vpsrld (@Tx[0],@X[0],30);
877 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
878 eval(shift(@insns));
879 eval(shift(@insns));
880 eval(shift(@insns)); # ror
881 eval(shift(@insns));
882
883 &vpslld (@X[0],@X[0],2);
884 eval(shift(@insns)); # body_20_39
885 eval(shift(@insns));
886 eval(shift(@insns));
887 eval(shift(@insns)); # rol
888 eval(shift(@insns));
889 eval(shift(@insns));
890 eval(shift(@insns)); # ror
891 eval(shift(@insns));
892
893 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
894 eval(shift(@insns)); # body_20_39
895 eval(shift(@insns));
896 &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
897 eval(shift(@insns));
898 eval(shift(@insns)); # rol
899 eval(shift(@insns));
900 eval(shift(@insns));
901 eval(shift(@insns)); # rol
902 eval(shift(@insns));
903
904 foreach (@insns) { eval; } # remaining instructions
905
906 $Xi++; push(@X,shift(@X)); # "rotate" X[]
907 push(@Tx,shift(@Tx));
908}
909
910sub Xuplast_avx_80()
911{ use integer;
912 my $body = shift;
913 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
914 my ($a,$b,$c,$d,$e);
915
916 eval(shift(@insns));
917 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
918 eval(shift(@insns));
919 eval(shift(@insns));
920 eval(shift(@insns));
921 eval(shift(@insns));
922
923 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
924
925 foreach (@insns) { eval; } # remaining instructions
926
927 &cmp ($inp,$len);
928 &je (".Ldone_avx");
929
930 unshift(@Tx,pop(@Tx));
931
932 &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
933 &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
934 &vmovdqu(@X[-4&7],"0($inp)"); # load input
935 &vmovdqu(@X[-3&7],"16($inp)");
936 &vmovdqu(@X[-2&7],"32($inp)");
937 &vmovdqu(@X[-1&7],"48($inp)");
938 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
939 &add ($inp,64);
940
941 $Xi=0;
942}
943
944sub Xloop_avx()
945{ use integer;
946 my $body = shift;
947 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
948 my ($a,$b,$c,$d,$e);
949
950 eval(shift(@insns));
951 eval(shift(@insns));
952 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
953 eval(shift(@insns));
954 eval(shift(@insns));
955 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
956 eval(shift(@insns));
957 eval(shift(@insns));
958 eval(shift(@insns));
959 eval(shift(@insns));
960 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
961 eval(shift(@insns));
962 eval(shift(@insns));
963
964 foreach (@insns) { eval; }
965 $Xi++;
966}
967
968sub Xtail_avx()
969{ use integer;
970 my $body = shift;
971 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
972 my ($a,$b,$c,$d,$e);
973
974 foreach (@insns) { eval; }
975}
976
977$code.=<<___;
978.align 16
979.Loop_avx:
980___
981 &Xupdate_avx_16_31(\&body_00_19);
982 &Xupdate_avx_16_31(\&body_00_19);
983 &Xupdate_avx_16_31(\&body_00_19);
984 &Xupdate_avx_16_31(\&body_00_19);
985 &Xupdate_avx_32_79(\&body_00_19);
986 &Xupdate_avx_32_79(\&body_20_39);
987 &Xupdate_avx_32_79(\&body_20_39);
988 &Xupdate_avx_32_79(\&body_20_39);
989 &Xupdate_avx_32_79(\&body_20_39);
990 &Xupdate_avx_32_79(\&body_20_39);
991 &Xupdate_avx_32_79(\&body_40_59);
992 &Xupdate_avx_32_79(\&body_40_59);
993 &Xupdate_avx_32_79(\&body_40_59);
994 &Xupdate_avx_32_79(\&body_40_59);
995 &Xupdate_avx_32_79(\&body_40_59);
996 &Xupdate_avx_32_79(\&body_20_39);
997 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
998
999 $saved_j=$j; @saved_V=@V;
1000 $saved_r=$r; @saved_rndkey=@rndkey;
1001
1002 &Xloop_avx(\&body_20_39);
1003 &Xloop_avx(\&body_20_39);
1004 &Xloop_avx(\&body_20_39);
1005
1006$code.=<<___;
1007 vmovups $iv,48($out,$in0) # write output
1008 lea 64($in0),$in0
1009
1010 add 0($ctx),$A # update context
1011 add 4($ctx),@T[0]
1012 add 8($ctx),$C
1013 add 12($ctx),$D
1014 mov $A,0($ctx)
1015 add 16($ctx),$E
1016 mov @T[0],4($ctx)
1017 mov @T[0],$B # magic seed
1018 mov $C,8($ctx)
1019 mov $D,12($ctx)
1020 mov $E,16($ctx)
1021 jmp .Loop_avx
1022
1023.align 16
1024.Ldone_avx:
1025___
1026 $jj=$j=$saved_j; @V=@saved_V;
1027 $r=$saved_r; @rndkey=@saved_rndkey;
1028
1029 &Xtail_avx(\&body_20_39);
1030 &Xtail_avx(\&body_20_39);
1031 &Xtail_avx(\&body_20_39);
1032
1033$code.=<<___;
1034 vmovups $iv,48($out,$in0) # write output
1035 mov 88(%rsp),$ivp # restore $ivp
1036
1037 add 0($ctx),$A # update context
1038 add 4($ctx),@T[0]
1039 add 8($ctx),$C
1040 mov $A,0($ctx)
1041 add 12($ctx),$D
1042 mov @T[0],4($ctx)
1043 add 16($ctx),$E
1044 mov $C,8($ctx)
1045 mov $D,12($ctx)
1046 mov $E,16($ctx)
1047 vmovups $iv,($ivp) # write IV
1048 vzeroall
1049___
1050$code.=<<___ if ($win64);
1051 movaps 96+0(%rsp),%xmm6
1052 movaps 96+16(%rsp),%xmm7
1053 movaps 96+32(%rsp),%xmm8
1054 movaps 96+48(%rsp),%xmm9
1055 movaps 96+64(%rsp),%xmm10
1056 movaps 96+80(%rsp),%xmm11
1057 movaps 96+96(%rsp),%xmm12
1058 movaps 96+112(%rsp),%xmm13
1059 movaps 96+128(%rsp),%xmm14
1060 movaps 96+144(%rsp),%xmm15
1061___
1062$code.=<<___;
1063 lea `104+($win64?10*16:0)`(%rsp),%rsi
1064 mov 0(%rsi),%r15
1065 mov 8(%rsi),%r14
1066 mov 16(%rsi),%r13
1067 mov 24(%rsi),%r12
1068 mov 32(%rsi),%rbp
1069 mov 40(%rsi),%rbx
1070 lea 48(%rsi),%rsp
1071.Lepilogue_avx:
1072 ret
1073.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
1074___
1075}
1076$code.=<<___;
1077.align 64
1078K_XX_XX:
1079.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1080.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1081.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1082.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1083.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1084
1085.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1086.align 64
1087___
1088
1089# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1090# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1091if ($win64) {
1092$rec="%rcx";
1093$frame="%rdx";
1094$context="%r8";
1095$disp="%r9";
1096
1097$code.=<<___;
1098.extern __imp_RtlVirtualUnwind
1099.type ssse3_handler,\@abi-omnipotent
1100.align 16
1101ssse3_handler:
1102 push %rsi
1103 push %rdi
1104 push %rbx
1105 push %rbp
1106 push %r12
1107 push %r13
1108 push %r14
1109 push %r15
1110 pushfq
1111 sub \$64,%rsp
1112
1113 mov 120($context),%rax # pull context->Rax
1114 mov 248($context),%rbx # pull context->Rip
1115
1116 mov 8($disp),%rsi # disp->ImageBase
1117 mov 56($disp),%r11 # disp->HandlerData
1118
1119 mov 0(%r11),%r10d # HandlerData[0]
1120 lea (%rsi,%r10),%r10 # prologue label
1121 cmp %r10,%rbx # context->Rip<prologue label
1122 jb .Lcommon_seh_tail
1123
1124 mov 152($context),%rax # pull context->Rsp
1125
1126 mov 4(%r11),%r10d # HandlerData[1]
1127 lea (%rsi,%r10),%r10 # epilogue label
1128 cmp %r10,%rbx # context->Rip>=epilogue label
1129 jae .Lcommon_seh_tail
1130
1131 lea 96(%rax),%rsi
1132 lea 512($context),%rdi # &context.Xmm6
1133 mov \$20,%ecx
1134 .long 0xa548f3fc # cld; rep movsq
1135 lea `104+10*16`(%rax),%rax # adjust stack pointer
1136
1137 mov 0(%rax),%r15
1138 mov 8(%rax),%r14
1139 mov 16(%rax),%r13
1140 mov 24(%rax),%r12
1141 mov 32(%rax),%rbp
1142 mov 40(%rax),%rbx
1143 lea 48(%rax),%rax
1144 mov %rbx,144($context) # restore context->Rbx
1145 mov %rbp,160($context) # restore context->Rbp
1146 mov %r12,216($context) # restore context->R12
1147 mov %r13,224($context) # restore context->R13
1148 mov %r14,232($context) # restore context->R14
1149 mov %r15,240($context) # restore context->R15
1150
1151.Lcommon_seh_tail:
1152 mov 8(%rax),%rdi
1153 mov 16(%rax),%rsi
1154 mov %rax,152($context) # restore context->Rsp
1155 mov %rsi,168($context) # restore context->Rsi
1156 mov %rdi,176($context) # restore context->Rdi
1157
1158 mov 40($disp),%rdi # disp->ContextRecord
1159 mov $context,%rsi # context
1160 mov \$154,%ecx # sizeof(CONTEXT)
1161 .long 0xa548f3fc # cld; rep movsq
1162
1163 mov $disp,%rsi
1164 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1165 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1166 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1167 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1168 mov 40(%rsi),%r10 # disp->ContextRecord
1169 lea 56(%rsi),%r11 # &disp->HandlerData
1170 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1171 mov %r10,32(%rsp) # arg5
1172 mov %r11,40(%rsp) # arg6
1173 mov %r12,48(%rsp) # arg7
1174 mov %rcx,56(%rsp) # arg8, (NULL)
1175 call *__imp_RtlVirtualUnwind(%rip)
1176
1177 mov \$1,%eax # ExceptionContinueSearch
1178 add \$64,%rsp
1179 popfq
1180 pop %r15
1181 pop %r14
1182 pop %r13
1183 pop %r12
1184 pop %rbp
1185 pop %rbx
1186 pop %rdi
1187 pop %rsi
1188 ret
1189.size ssse3_handler,.-ssse3_handler
1190
1191.section .pdata
1192.align 4
1193 .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3
1194 .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3
1195 .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3
1196___
1197$code.=<<___ if ($avx);
1198 .rva .LSEH_begin_aesni_cbc_sha1_enc_avx
1199 .rva .LSEH_end_aesni_cbc_sha1_enc_avx
1200 .rva .LSEH_info_aesni_cbc_sha1_enc_avx
1201___
1202$code.=<<___;
1203.section .xdata
1204.align 8
1205.LSEH_info_aesni_cbc_sha1_enc_ssse3:
1206 .byte 9,0,0,0
1207 .rva ssse3_handler
1208 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
1209___
1210$code.=<<___ if ($avx);
1211.LSEH_info_aesni_cbc_sha1_enc_avx:
1212 .byte 9,0,0,0
1213 .rva ssse3_handler
1214 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1215___
1216}
1217
1218####################################################################
1219sub rex {
1220 local *opcode=shift;
1221 my ($dst,$src)=@_;
1222 my $rex=0;
1223
1224 $rex|=0x04 if($dst>=8);
1225 $rex|=0x01 if($src>=8);
1226 push @opcode,$rex|0x40 if($rex);
1227}
1228
1229$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1230
1231print $code;
1232close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86.pl b/src/lib/libcrypto/aes/asm/aesni-x86.pl
deleted file mode 100644
index 8c1d0b5bed..0000000000
--- a/src/lib/libcrypto/aes/asm/aesni-x86.pl
+++ /dev/null
@@ -1,2189 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for the Intel AES-NI extension. In
11# the OpenSSL context it's used with the Intel engine, but it can also
12# be used as a drop-in replacement for crypto/aes/asm/aes-586.pl [see
13# below for details].
14#
15# Performance.
16#
17# To start with see corresponding paragraph in aesni-x86_64.pl...
18# Instead of filling a table similar to the one found there, I've
19# chosen to summarize *comparison* results for raw ECB, CTR and CBC
20# benchmarks. The simplified table below gives 32-bit performance
21# relative to 64-bit at every given point. Ratios vary across
22# encryption modes, hence the interval values.
23#
24# 16-byte 64-byte 256-byte 1-KB 8-KB
25# 53-67% 67-84% 91-94% 95-98% 97-99.5%
26#
27# Lower ratios for smaller block sizes are perfectly understandable,
28# because function call overhead is higher in 32-bit mode. For the
29# largest, 8-KB, blocks performance is virtually the same: 32-bit code
30# is less than 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
31
32# January 2011
33#
34# See aesni-x86_64.pl for details. Unlike the x86_64 version, this
35# module interleaves at most 6 aes[enc|dec] instructions, because
36# there are not enough registers for an 8x interleave [which should be
37# optimal for Sandy Bridge]. Actually, the performance results for the
38# 6x interleave factor presented in aesni-x86_64.pl (except for CTR)
39# are for this module.
40
41# April 2011
42#
43# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
45
46$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
47 # generates drop-in replacement for
48 # crypto/aes/asm/aes-586.pl:-)
49$inline=1; # inline _aesni_[en|de]crypt
50
51$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52push(@INC,"${dir}","${dir}../../perlasm");
53require "x86asm.pl";
54
55&asm_init($ARGV[0],$0);
56
57if ($PREFIX eq "aesni") { $movekey=*movups; }
58else { $movekey=*movups; }
59
60$len="eax";
61$rounds="ecx";
62$key="edx";
63$inp="esi";
64$out="edi";
65$rounds_="ebx"; # backup copy for $rounds
66$key_="ebp"; # backup copy for $key
67
68$rndkey0="xmm0";
69$rndkey1="xmm1";
70$inout0="xmm2";
71$inout1="xmm3";
72$inout2="xmm4";
73$inout3="xmm5"; $in1="xmm5";
74$inout4="xmm6"; $in0="xmm6";
75$inout5="xmm7"; $ivec="xmm7";
76
77# AESNI extension
78sub aeskeygenassist
79{ my($dst,$src,$imm)=@_;
80 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
81 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
82}
83sub aescommon
84{ my($opcodelet,$dst,$src)=@_;
85 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
86 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
87}
88sub aesimc { aescommon(0xdb,@_); }
89sub aesenc { aescommon(0xdc,@_); }
90sub aesenclast { aescommon(0xdd,@_); }
91sub aesdec { aescommon(0xde,@_); }
92sub aesdeclast { aescommon(0xdf,@_); }
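# Editor's note (worked example): the subs above hand-assemble AES-NI
# for assemblers that predate the mnemonics.  &aesenc("xmm2","xmm1")
# emits 0x66 0x0f 0x38 0xdc 0xd1 -- the AESENC prefix/opcode plus
# ModRM 0xc0|(2<<3)|1 = 0xd1 -- which disassembles as
#	aesenc	%xmm1,%xmm2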
93
94# Inline version of internal aesni_[en|de]crypt1
95{ my $sn;
96sub aesni_inline_generate1
97{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
98 $sn++;
99
100 &$movekey ($rndkey0,&QWP(0,$key));
101 &$movekey ($rndkey1,&QWP(16,$key));
102 &xorps ($ivec,$rndkey0) if (defined($ivec));
103 &lea ($key,&DWP(32,$key));
104 &xorps ($inout,$ivec) if (defined($ivec));
105 &xorps ($inout,$rndkey0) if (!defined($ivec));
106 &set_label("${p}1_loop_$sn");
107 eval"&aes${p} ($inout,$rndkey1)";
108 &dec ($rounds);
109 &$movekey ($rndkey1,&QWP(0,$key));
110 &lea ($key,&DWP(16,$key));
111 &jnz (&label("${p}1_loop_$sn"));
112 eval"&aes${p}last ($inout,$rndkey1)";
113}}
114
115sub aesni_generate1 # fully unrolled loop
116{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
117
118 &function_begin_B("_aesni_${p}rypt1");
119 &movups ($rndkey0,&QWP(0,$key));
120 &$movekey ($rndkey1,&QWP(0x10,$key));
121 &xorps ($inout,$rndkey0);
122 &$movekey ($rndkey0,&QWP(0x20,$key));
123 &lea ($key,&DWP(0x30,$key));
124 &cmp ($rounds,11);
125 &jb (&label("${p}128"));
126 &lea ($key,&DWP(0x20,$key));
127 &je (&label("${p}192"));
128 &lea ($key,&DWP(0x20,$key));
129 eval"&aes${p} ($inout,$rndkey1)";
130 &$movekey ($rndkey1,&QWP(-0x40,$key));
131 eval"&aes${p} ($inout,$rndkey0)";
132 &$movekey ($rndkey0,&QWP(-0x30,$key));
133 &set_label("${p}192");
134 eval"&aes${p} ($inout,$rndkey1)";
135 &$movekey ($rndkey1,&QWP(-0x20,$key));
136 eval"&aes${p} ($inout,$rndkey0)";
137 &$movekey ($rndkey0,&QWP(-0x10,$key));
138 &set_label("${p}128");
139 eval"&aes${p} ($inout,$rndkey1)";
140 &$movekey ($rndkey1,&QWP(0,$key));
141 eval"&aes${p} ($inout,$rndkey0)";
142 &$movekey ($rndkey0,&QWP(0x10,$key));
143 eval"&aes${p} ($inout,$rndkey1)";
144 &$movekey ($rndkey1,&QWP(0x20,$key));
145 eval"&aes${p} ($inout,$rndkey0)";
146 &$movekey ($rndkey0,&QWP(0x30,$key));
147 eval"&aes${p} ($inout,$rndkey1)";
148 &$movekey ($rndkey1,&QWP(0x40,$key));
149 eval"&aes${p} ($inout,$rndkey0)";
150 &$movekey ($rndkey0,&QWP(0x50,$key));
151 eval"&aes${p} ($inout,$rndkey1)";
152 &$movekey ($rndkey1,&QWP(0x60,$key));
153 eval"&aes${p} ($inout,$rndkey0)";
154 &$movekey ($rndkey0,&QWP(0x70,$key));
155 eval"&aes${p} ($inout,$rndkey1)";
156 eval"&aes${p}last ($inout,$rndkey0)";
157 &ret();
158 &function_end_B("_aesni_${p}rypt1");
159}
160
161# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
162&aesni_generate1("enc") if (!$inline);
163&function_begin_B("${PREFIX}_encrypt");
164 &mov ("eax",&wparam(0));
165 &mov ($key,&wparam(2));
166 &movups ($inout0,&QWP(0,"eax"));
167 &mov ($rounds,&DWP(240,$key));
168 &mov ("eax",&wparam(1));
169 if ($inline)
170 { &aesni_inline_generate1("enc"); }
171 else
172 { &call ("_aesni_encrypt1"); }
173 &movups (&QWP(0,"eax"),$inout0);
174 &ret ();
175&function_end_B("${PREFIX}_encrypt");
176
177# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
178&aesni_generate1("dec") if(!$inline);
179&function_begin_B("${PREFIX}_decrypt");
180 &mov ("eax",&wparam(0));
181 &mov ($key,&wparam(2));
182 &movups ($inout0,&QWP(0,"eax"));
183 &mov ($rounds,&DWP(240,$key));
184 &mov ("eax",&wparam(1));
185 if ($inline)
186 { &aesni_inline_generate1("dec"); }
187 else
188 { &call ("_aesni_decrypt1"); }
189 &movups (&QWP(0,"eax"),$inout0);
190 &ret ();
191&function_end_B("${PREFIX}_decrypt");
192
193# _aesni_[en|de]cryptN are private interfaces, where N denotes the
194# interleave factor. Why were 3x subroutines originally used in loops?
195# Even though aes[enc|dec] latency was originally 6, an instruction
196# could be scheduled only every *2nd* cycle. Thus 3x interleave was
197# the one providing optimal utilization, i.e. the interleaved
198# subroutine's throughput is virtually the same as that of the
199# non-interleaved one [for up to 3 input blocks]. This is why it makes
200# no sense to implement a 2x subroutine. aes[enc|dec] latency in the
201# next processor generation is 8, but the instructions can be
202# scheduled every cycle. The optimal interleave for the new processor
203# is therefore 8x, but it's infeasible to accommodate that many in the
204# XMM registers addressable in 32-bit mode, so 6x is used instead...
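# Editor's note (hedged arithmetic): with latency 6 and an issue slot
# every 2nd cycle at most 6/2 = 3 operations are in flight, so 3x
# interleave saturates the unit; with latency 8 and issue every cycle
# the ideal depth is 8, which the eight 32-bit-addressable XMM
# registers cannot hold alongside the round keys, hence 6x.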
205
206sub aesni_generate3
207{ my $p=shift;
208
209 &function_begin_B("_aesni_${p}rypt3");
210 &$movekey ($rndkey0,&QWP(0,$key));
211 &shr ($rounds,1);
212 &$movekey ($rndkey1,&QWP(16,$key));
213 &lea ($key,&DWP(32,$key));
214 &xorps ($inout0,$rndkey0);
215 &pxor ($inout1,$rndkey0);
216 &pxor ($inout2,$rndkey0);
217 &$movekey ($rndkey0,&QWP(0,$key));
218
219 &set_label("${p}3_loop");
220 eval"&aes${p} ($inout0,$rndkey1)";
221 eval"&aes${p} ($inout1,$rndkey1)";
222 &dec ($rounds);
223 eval"&aes${p} ($inout2,$rndkey1)";
224 &$movekey ($rndkey1,&QWP(16,$key));
225 eval"&aes${p} ($inout0,$rndkey0)";
226 eval"&aes${p} ($inout1,$rndkey0)";
227 &lea ($key,&DWP(32,$key));
228 eval"&aes${p} ($inout2,$rndkey0)";
229 &$movekey ($rndkey0,&QWP(0,$key));
230 &jnz (&label("${p}3_loop"));
231 eval"&aes${p} ($inout0,$rndkey1)";
232 eval"&aes${p} ($inout1,$rndkey1)";
233 eval"&aes${p} ($inout2,$rndkey1)";
234 eval"&aes${p}last ($inout0,$rndkey0)";
235 eval"&aes${p}last ($inout1,$rndkey0)";
236 eval"&aes${p}last ($inout2,$rndkey0)";
237 &ret();
238 &function_end_B("_aesni_${p}rypt3");
239}
240
241# 4x interleave is implemented to improve small-block performance,
242# most notably [and naturally] the 4-block case, by ~30%. One can
243# argue that 5x should have been implemented as well, but the
244# improvement would be <20%, so it's not worth it...
245sub aesni_generate4
246{ my $p=shift;
247
248 &function_begin_B("_aesni_${p}rypt4");
249 &$movekey ($rndkey0,&QWP(0,$key));
250 &$movekey ($rndkey1,&QWP(16,$key));
251 &shr ($rounds,1);
252 &lea ($key,&DWP(32,$key));
253 &xorps ($inout0,$rndkey0);
254 &pxor ($inout1,$rndkey0);
255 &pxor ($inout2,$rndkey0);
256 &pxor ($inout3,$rndkey0);
257 &$movekey ($rndkey0,&QWP(0,$key));
258
259 &set_label("${p}4_loop");
260 eval"&aes${p} ($inout0,$rndkey1)";
261 eval"&aes${p} ($inout1,$rndkey1)";
262 &dec ($rounds);
263 eval"&aes${p} ($inout2,$rndkey1)";
264 eval"&aes${p} ($inout3,$rndkey1)";
265 &$movekey ($rndkey1,&QWP(16,$key));
266 eval"&aes${p} ($inout0,$rndkey0)";
267 eval"&aes${p} ($inout1,$rndkey0)";
268 &lea ($key,&DWP(32,$key));
269 eval"&aes${p} ($inout2,$rndkey0)";
270 eval"&aes${p} ($inout3,$rndkey0)";
271 &$movekey ($rndkey0,&QWP(0,$key));
272 &jnz (&label("${p}4_loop"));
273
274 eval"&aes${p} ($inout0,$rndkey1)";
275 eval"&aes${p} ($inout1,$rndkey1)";
276 eval"&aes${p} ($inout2,$rndkey1)";
277 eval"&aes${p} ($inout3,$rndkey1)";
278 eval"&aes${p}last ($inout0,$rndkey0)";
279 eval"&aes${p}last ($inout1,$rndkey0)";
280 eval"&aes${p}last ($inout2,$rndkey0)";
281 eval"&aes${p}last ($inout3,$rndkey0)";
282 &ret();
283 &function_end_B("_aesni_${p}rypt4");
284}
285
286sub aesni_generate6
287{ my $p=shift;
288
289 &function_begin_B("_aesni_${p}rypt6");
290 &static_label("_aesni_${p}rypt6_enter");
291 &$movekey ($rndkey0,&QWP(0,$key));
292 &shr ($rounds,1);
293 &$movekey ($rndkey1,&QWP(16,$key));
294 &lea ($key,&DWP(32,$key));
295 &xorps ($inout0,$rndkey0);
296 &pxor ($inout1,$rndkey0); # pxor does better here
297 eval"&aes${p} ($inout0,$rndkey1)";
298 &pxor ($inout2,$rndkey0);
299 eval"&aes${p} ($inout1,$rndkey1)";
300 &pxor ($inout3,$rndkey0);
301 &dec ($rounds);
302 eval"&aes${p} ($inout2,$rndkey1)";
303 &pxor ($inout4,$rndkey0);
304 eval"&aes${p} ($inout3,$rndkey1)";
305 &pxor ($inout5,$rndkey0);
306 eval"&aes${p} ($inout4,$rndkey1)";
307 &$movekey ($rndkey0,&QWP(0,$key));
308 eval"&aes${p} ($inout5,$rndkey1)";
309 &jmp (&label("_aesni_${p}rypt6_enter"));
310
311 &set_label("${p}6_loop",16);
312 eval"&aes${p} ($inout0,$rndkey1)";
313 eval"&aes${p} ($inout1,$rndkey1)";
314 &dec ($rounds);
315 eval"&aes${p} ($inout2,$rndkey1)";
316 eval"&aes${p} ($inout3,$rndkey1)";
317 eval"&aes${p} ($inout4,$rndkey1)";
318 eval"&aes${p} ($inout5,$rndkey1)";
319 &set_label("_aesni_${p}rypt6_enter",16);
320 &$movekey ($rndkey1,&QWP(16,$key));
321 eval"&aes${p} ($inout0,$rndkey0)";
322 eval"&aes${p} ($inout1,$rndkey0)";
323 &lea ($key,&DWP(32,$key));
324 eval"&aes${p} ($inout2,$rndkey0)";
325 eval"&aes${p} ($inout3,$rndkey0)";
326 eval"&aes${p} ($inout4,$rndkey0)";
327 eval"&aes${p} ($inout5,$rndkey0)";
328 &$movekey ($rndkey0,&QWP(0,$key));
329 &jnz (&label("${p}6_loop"));
330
331 eval"&aes${p} ($inout0,$rndkey1)";
332 eval"&aes${p} ($inout1,$rndkey1)";
333 eval"&aes${p} ($inout2,$rndkey1)";
334 eval"&aes${p} ($inout3,$rndkey1)";
335 eval"&aes${p} ($inout4,$rndkey1)";
336 eval"&aes${p} ($inout5,$rndkey1)";
337 eval"&aes${p}last ($inout0,$rndkey0)";
338 eval"&aes${p}last ($inout1,$rndkey0)";
339 eval"&aes${p}last ($inout2,$rndkey0)";
340 eval"&aes${p}last ($inout3,$rndkey0)";
341 eval"&aes${p}last ($inout4,$rndkey0)";
342 eval"&aes${p}last ($inout5,$rndkey0)";
343 &ret();
344 &function_end_B("_aesni_${p}rypt6");
345}
346&aesni_generate3("enc") if ($PREFIX eq "aesni");
347&aesni_generate3("dec");
348&aesni_generate4("enc") if ($PREFIX eq "aesni");
349&aesni_generate4("dec");
350&aesni_generate6("enc") if ($PREFIX eq "aesni");
351&aesni_generate6("dec");
352
353if ($PREFIX eq "aesni") {
354######################################################################
355# void aesni_ecb_encrypt (const void *in, void *out,
356# size_t length, const AES_KEY *key,
357# int enc);
358&function_begin("aesni_ecb_encrypt");
359 &mov ($inp,&wparam(0));
360 &mov ($out,&wparam(1));
361 &mov ($len,&wparam(2));
362 &mov ($key,&wparam(3));
363 &mov ($rounds_,&wparam(4));
364 &and ($len,-16);
365 &jz (&label("ecb_ret"));
366 &mov ($rounds,&DWP(240,$key));
367 &test ($rounds_,$rounds_);
368 &jz (&label("ecb_decrypt"));
369
370 &mov ($key_,$key); # backup $key
371 &mov ($rounds_,$rounds); # backup $rounds
372 &cmp ($len,0x60);
373 &jb (&label("ecb_enc_tail"));
374
375 &movdqu ($inout0,&QWP(0,$inp));
376 &movdqu ($inout1,&QWP(0x10,$inp));
377 &movdqu ($inout2,&QWP(0x20,$inp));
378 &movdqu ($inout3,&QWP(0x30,$inp));
379 &movdqu ($inout4,&QWP(0x40,$inp));
380 &movdqu ($inout5,&QWP(0x50,$inp));
381 &lea ($inp,&DWP(0x60,$inp));
382 &sub ($len,0x60);
383 &jmp (&label("ecb_enc_loop6_enter"));
384
385&set_label("ecb_enc_loop6",16);
386 &movups (&QWP(0,$out),$inout0);
387 &movdqu ($inout0,&QWP(0,$inp));
388 &movups (&QWP(0x10,$out),$inout1);
389 &movdqu ($inout1,&QWP(0x10,$inp));
390 &movups (&QWP(0x20,$out),$inout2);
391 &movdqu ($inout2,&QWP(0x20,$inp));
392 &movups (&QWP(0x30,$out),$inout3);
393 &movdqu ($inout3,&QWP(0x30,$inp));
394 &movups (&QWP(0x40,$out),$inout4);
395 &movdqu ($inout4,&QWP(0x40,$inp));
396 &movups (&QWP(0x50,$out),$inout5);
397 &lea ($out,&DWP(0x60,$out));
398 &movdqu ($inout5,&QWP(0x50,$inp));
399 &lea ($inp,&DWP(0x60,$inp));
400&set_label("ecb_enc_loop6_enter");
401
402 &call ("_aesni_encrypt6");
403
404 &mov ($key,$key_); # restore $key
405 &mov ($rounds,$rounds_); # restore $rounds
406 &sub ($len,0x60);
407 &jnc (&label("ecb_enc_loop6"));
408
409 &movups (&QWP(0,$out),$inout0);
410 &movups (&QWP(0x10,$out),$inout1);
411 &movups (&QWP(0x20,$out),$inout2);
412 &movups (&QWP(0x30,$out),$inout3);
413 &movups (&QWP(0x40,$out),$inout4);
414 &movups (&QWP(0x50,$out),$inout5);
415 &lea ($out,&DWP(0x60,$out));
416 &add ($len,0x60);
417 &jz (&label("ecb_ret"));
418
419&set_label("ecb_enc_tail");
420 &movups ($inout0,&QWP(0,$inp));
421 &cmp ($len,0x20);
422 &jb (&label("ecb_enc_one"));
423 &movups ($inout1,&QWP(0x10,$inp));
424 &je (&label("ecb_enc_two"));
425 &movups ($inout2,&QWP(0x20,$inp));
426 &cmp ($len,0x40);
427 &jb (&label("ecb_enc_three"));
428 &movups ($inout3,&QWP(0x30,$inp));
429 &je (&label("ecb_enc_four"));
430 &movups ($inout4,&QWP(0x40,$inp));
431 &xorps ($inout5,$inout5);
432 &call ("_aesni_encrypt6");
433 &movups (&QWP(0,$out),$inout0);
434 &movups (&QWP(0x10,$out),$inout1);
435 &movups (&QWP(0x20,$out),$inout2);
436 &movups (&QWP(0x30,$out),$inout3);
437 &movups (&QWP(0x40,$out),$inout4);
438	&jmp	(&label("ecb_ret"));
439
440&set_label("ecb_enc_one",16);
441 if ($inline)
442 { &aesni_inline_generate1("enc"); }
443 else
444 { &call ("_aesni_encrypt1"); }
445 &movups (&QWP(0,$out),$inout0);
446 &jmp (&label("ecb_ret"));
447
448&set_label("ecb_enc_two",16);
449 &xorps ($inout2,$inout2);
450 &call ("_aesni_encrypt3");
451 &movups (&QWP(0,$out),$inout0);
452 &movups (&QWP(0x10,$out),$inout1);
453 &jmp (&label("ecb_ret"));
454
455&set_label("ecb_enc_three",16);
456 &call ("_aesni_encrypt3");
457 &movups (&QWP(0,$out),$inout0);
458 &movups (&QWP(0x10,$out),$inout1);
459 &movups (&QWP(0x20,$out),$inout2);
460 &jmp (&label("ecb_ret"));
461
462&set_label("ecb_enc_four",16);
463 &call ("_aesni_encrypt4");
464 &movups (&QWP(0,$out),$inout0);
465 &movups (&QWP(0x10,$out),$inout1);
466 &movups (&QWP(0x20,$out),$inout2);
467 &movups (&QWP(0x30,$out),$inout3);
468 &jmp (&label("ecb_ret"));
469######################################################################
470&set_label("ecb_decrypt",16);
471 &mov ($key_,$key); # backup $key
472 &mov ($rounds_,$rounds); # backup $rounds
473 &cmp ($len,0x60);
474 &jb (&label("ecb_dec_tail"));
475
476 &movdqu ($inout0,&QWP(0,$inp));
477 &movdqu ($inout1,&QWP(0x10,$inp));
478 &movdqu ($inout2,&QWP(0x20,$inp));
479 &movdqu ($inout3,&QWP(0x30,$inp));
480 &movdqu ($inout4,&QWP(0x40,$inp));
481 &movdqu ($inout5,&QWP(0x50,$inp));
482 &lea ($inp,&DWP(0x60,$inp));
483 &sub ($len,0x60);
484 &jmp (&label("ecb_dec_loop6_enter"));
485
486&set_label("ecb_dec_loop6",16);
487 &movups (&QWP(0,$out),$inout0);
488 &movdqu ($inout0,&QWP(0,$inp));
489 &movups (&QWP(0x10,$out),$inout1);
490 &movdqu ($inout1,&QWP(0x10,$inp));
491 &movups (&QWP(0x20,$out),$inout2);
492 &movdqu ($inout2,&QWP(0x20,$inp));
493 &movups (&QWP(0x30,$out),$inout3);
494 &movdqu ($inout3,&QWP(0x30,$inp));
495 &movups (&QWP(0x40,$out),$inout4);
496 &movdqu ($inout4,&QWP(0x40,$inp));
497 &movups (&QWP(0x50,$out),$inout5);
498 &lea ($out,&DWP(0x60,$out));
499 &movdqu ($inout5,&QWP(0x50,$inp));
500 &lea ($inp,&DWP(0x60,$inp));
501&set_label("ecb_dec_loop6_enter");
502
503 &call ("_aesni_decrypt6");
504
505 &mov ($key,$key_); # restore $key
506 &mov ($rounds,$rounds_); # restore $rounds
507 &sub ($len,0x60);
508 &jnc (&label("ecb_dec_loop6"));
509
510 &movups (&QWP(0,$out),$inout0);
511 &movups (&QWP(0x10,$out),$inout1);
512 &movups (&QWP(0x20,$out),$inout2);
513 &movups (&QWP(0x30,$out),$inout3);
514 &movups (&QWP(0x40,$out),$inout4);
515 &movups (&QWP(0x50,$out),$inout5);
516 &lea ($out,&DWP(0x60,$out));
517 &add ($len,0x60);
518 &jz (&label("ecb_ret"));
519
520&set_label("ecb_dec_tail");
521 &movups ($inout0,&QWP(0,$inp));
522 &cmp ($len,0x20);
523 &jb (&label("ecb_dec_one"));
524 &movups ($inout1,&QWP(0x10,$inp));
525 &je (&label("ecb_dec_two"));
526 &movups ($inout2,&QWP(0x20,$inp));
527 &cmp ($len,0x40);
528 &jb (&label("ecb_dec_three"));
529 &movups ($inout3,&QWP(0x30,$inp));
530 &je (&label("ecb_dec_four"));
531 &movups ($inout4,&QWP(0x40,$inp));
532 &xorps ($inout5,$inout5);
533 &call ("_aesni_decrypt6");
534 &movups (&QWP(0,$out),$inout0);
535 &movups (&QWP(0x10,$out),$inout1);
536 &movups (&QWP(0x20,$out),$inout2);
537 &movups (&QWP(0x30,$out),$inout3);
538 &movups (&QWP(0x40,$out),$inout4);
539 &jmp (&label("ecb_ret"));
540
541&set_label("ecb_dec_one",16);
542 if ($inline)
543 { &aesni_inline_generate1("dec"); }
544 else
545 { &call ("_aesni_decrypt1"); }
546 &movups (&QWP(0,$out),$inout0);
547 &jmp (&label("ecb_ret"));
548
549&set_label("ecb_dec_two",16);
550 &xorps ($inout2,$inout2);
551 &call ("_aesni_decrypt3");
552 &movups (&QWP(0,$out),$inout0);
553 &movups (&QWP(0x10,$out),$inout1);
554 &jmp (&label("ecb_ret"));
555
556&set_label("ecb_dec_three",16);
557 &call ("_aesni_decrypt3");
558 &movups (&QWP(0,$out),$inout0);
559 &movups (&QWP(0x10,$out),$inout1);
560 &movups (&QWP(0x20,$out),$inout2);
561 &jmp (&label("ecb_ret"));
562
563&set_label("ecb_dec_four",16);
564 &call ("_aesni_decrypt4");
565 &movups (&QWP(0,$out),$inout0);
566 &movups (&QWP(0x10,$out),$inout1);
567 &movups (&QWP(0x20,$out),$inout2);
568 &movups (&QWP(0x30,$out),$inout3);
569
570&set_label("ecb_ret");
571&function_end("aesni_ecb_encrypt");
572
573######################################################################
574# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
575# size_t blocks, const AES_KEY *key,
576# const char *ivec,char *cmac);
577#
578# Handles only complete blocks, operates on a 64-bit counter and
579# does not update *ivec! Nor does it finalize the CMAC value
580# (see engine/eng_aesni.c for details).
581#
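#
# A hedged usage sketch in C: per the comment above, counter and CMAC
# management stay with the caller (this is only an illustration, not
# code from this module; buffer names are made up):
#
#	/* hypothetical illustration only */
#	unsigned char ctr[16], cmac[16];
#	/* caller formats the initial counter block into ctr and the
#	 * running CBC-MAC state into cmac beforehand */
#	aesni_ccm64_encrypt_blocks(in, out, blocks, &key, ctr, cmac);
#	/* on return cmac holds the updated CBC-MAC state, but ctr is
#	 * untouched: advance its 64-bit counter field by `blocks'
#	 * before any further call, and derive the final tag from
#	 * cmac separately */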
582{ my $cmac=$inout1;
583&function_begin("aesni_ccm64_encrypt_blocks");
584 &mov ($inp,&wparam(0));
585 &mov ($out,&wparam(1));
586 &mov ($len,&wparam(2));
587 &mov ($key,&wparam(3));
588 &mov ($rounds_,&wparam(4));
589 &mov ($rounds,&wparam(5));
590 &mov ($key_,"esp");
591 &sub ("esp",60);
592 &and ("esp",-16); # align stack
593 &mov (&DWP(48,"esp"),$key_);
594
595 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
596 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
597 &mov ($rounds,&DWP(240,$key));
598
599 # compose byte-swap control mask for pshufb on stack
600 &mov (&DWP(0,"esp"),0x0c0d0e0f);
601 &mov (&DWP(4,"esp"),0x08090a0b);
602 &mov (&DWP(8,"esp"),0x04050607);
603 &mov (&DWP(12,"esp"),0x00010203);
604
605 # compose counter increment vector on stack
606 &mov ($rounds_,1);
607 &xor ($key_,$key_);
608 &mov (&DWP(16,"esp"),$rounds_);
609 &mov (&DWP(20,"esp"),$key_);
610 &mov (&DWP(24,"esp"),$key_);
611 &mov (&DWP(28,"esp"),$key_);
612
613 &shr ($rounds,1);
614 &lea ($key_,&DWP(0,$key));
615 &movdqa ($inout3,&QWP(0,"esp"));
616 &movdqa ($inout0,$ivec);
617 &mov ($rounds_,$rounds);
618 &pshufb ($ivec,$inout3);
619
620&set_label("ccm64_enc_outer");
621 &$movekey ($rndkey0,&QWP(0,$key_));
622 &mov ($rounds,$rounds_);
623 &movups ($in0,&QWP(0,$inp));
624
625 &xorps ($inout0,$rndkey0);
626 &$movekey ($rndkey1,&QWP(16,$key_));
627 &xorps ($rndkey0,$in0);
628 &lea ($key,&DWP(32,$key_));
629 &xorps ($cmac,$rndkey0); # cmac^=inp
630 &$movekey ($rndkey0,&QWP(0,$key));
631
632&set_label("ccm64_enc2_loop");
633 &aesenc ($inout0,$rndkey1);
634 &dec ($rounds);
635 &aesenc ($cmac,$rndkey1);
636 &$movekey ($rndkey1,&QWP(16,$key));
637 &aesenc ($inout0,$rndkey0);
638 &lea ($key,&DWP(32,$key));
639 &aesenc ($cmac,$rndkey0);
640 &$movekey ($rndkey0,&QWP(0,$key));
641 &jnz (&label("ccm64_enc2_loop"));
642 &aesenc ($inout0,$rndkey1);
643 &aesenc ($cmac,$rndkey1);
644 &paddq ($ivec,&QWP(16,"esp"));
645 &aesenclast ($inout0,$rndkey0);
646 &aesenclast ($cmac,$rndkey0);
647
648 &dec ($len);
649 &lea ($inp,&DWP(16,$inp));
650 &xorps ($in0,$inout0); # inp^=E(ivec)
651 &movdqa ($inout0,$ivec);
652 &movups (&QWP(0,$out),$in0); # save output
653 &lea ($out,&DWP(16,$out));
654 &pshufb ($inout0,$inout3);
655 &jnz (&label("ccm64_enc_outer"));
656
657 &mov ("esp",&DWP(48,"esp"));
658 &mov ($out,&wparam(5));
659 &movups (&QWP(0,$out),$cmac);
660&function_end("aesni_ccm64_encrypt_blocks");
661
662&function_begin("aesni_ccm64_decrypt_blocks");
663 &mov ($inp,&wparam(0));
664 &mov ($out,&wparam(1));
665 &mov ($len,&wparam(2));
666 &mov ($key,&wparam(3));
667 &mov ($rounds_,&wparam(4));
668 &mov ($rounds,&wparam(5));
669 &mov ($key_,"esp");
670 &sub ("esp",60);
671 &and ("esp",-16); # align stack
672 &mov (&DWP(48,"esp"),$key_);
673
674 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
675 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
676 &mov ($rounds,&DWP(240,$key));
677
678 # compose byte-swap control mask for pshufb on stack
679 &mov (&DWP(0,"esp"),0x0c0d0e0f);
680 &mov (&DWP(4,"esp"),0x08090a0b);
681 &mov (&DWP(8,"esp"),0x04050607);
682 &mov (&DWP(12,"esp"),0x00010203);
683
684 # compose counter increment vector on stack
685 &mov ($rounds_,1);
686 &xor ($key_,$key_);
687 &mov (&DWP(16,"esp"),$rounds_);
688 &mov (&DWP(20,"esp"),$key_);
689 &mov (&DWP(24,"esp"),$key_);
690 &mov (&DWP(28,"esp"),$key_);
691
692 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
693 &movdqa ($inout0,$ivec);
694
695 &mov ($key_,$key);
696 &mov ($rounds_,$rounds);
697
698 &pshufb ($ivec,$inout3);
699 if ($inline)
700 { &aesni_inline_generate1("enc"); }
701 else
702 { &call ("_aesni_encrypt1"); }
703 &movups ($in0,&QWP(0,$inp)); # load inp
704 &paddq ($ivec,&QWP(16,"esp"));
705 &lea ($inp,&QWP(16,$inp));
706 &jmp (&label("ccm64_dec_outer"));
707
708&set_label("ccm64_dec_outer",16);
709 &xorps ($in0,$inout0); # inp ^= E(ivec)
710 &movdqa ($inout0,$ivec);
711 &mov ($rounds,$rounds_);
712 &movups (&QWP(0,$out),$in0); # save output
713 &lea ($out,&DWP(16,$out));
714 &pshufb ($inout0,$inout3);
715
716 &sub ($len,1);
717 &jz (&label("ccm64_dec_break"));
718
719 &$movekey ($rndkey0,&QWP(0,$key_));
720 &shr ($rounds,1);
721 &$movekey ($rndkey1,&QWP(16,$key_));
722 &xorps ($in0,$rndkey0);
723 &lea ($key,&DWP(32,$key_));
724 &xorps ($inout0,$rndkey0);
725 &xorps ($cmac,$in0); # cmac^=out
726 &$movekey ($rndkey0,&QWP(0,$key));
727
728&set_label("ccm64_dec2_loop");
729 &aesenc ($inout0,$rndkey1);
730 &dec ($rounds);
731 &aesenc ($cmac,$rndkey1);
732 &$movekey ($rndkey1,&QWP(16,$key));
733 &aesenc ($inout0,$rndkey0);
734 &lea ($key,&DWP(32,$key));
735 &aesenc ($cmac,$rndkey0);
736 &$movekey ($rndkey0,&QWP(0,$key));
737 &jnz (&label("ccm64_dec2_loop"));
738 &movups ($in0,&QWP(0,$inp)); # load inp
739 &paddq ($ivec,&QWP(16,"esp"));
740 &aesenc ($inout0,$rndkey1);
741 &aesenc ($cmac,$rndkey1);
742 &lea ($inp,&QWP(16,$inp));
743 &aesenclast ($inout0,$rndkey0);
744 &aesenclast ($cmac,$rndkey0);
745 &jmp (&label("ccm64_dec_outer"));
746
747&set_label("ccm64_dec_break",16);
748 &mov ($key,$key_);
749 if ($inline)
750 { &aesni_inline_generate1("enc",$cmac,$in0); }
751 else
752 { &call ("_aesni_encrypt1",$cmac); }
753
754 &mov ("esp",&DWP(48,"esp"));
755 &mov ($out,&wparam(5));
756 &movups (&QWP(0,$out),$cmac);
757&function_end("aesni_ccm64_decrypt_blocks");
758}
759
760######################################################################
761# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
762# size_t blocks, const AES_KEY *key,
763# const char *ivec);
764#
765# Handles only complete blocks, operates on a 32-bit counter and
766# does not update *ivec! (see engine/eng_aesni.c for details)
767#
768# stack layout:
769# 0 pshufb mask
770# 16 vector addend: 0,6,6,6
771# 32 counter-less ivec
772# 48 1st triplet of counter vector
773# 64 2nd triplet of counter vector
774# 80 saved %esp
775
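#
# As the pextrd/bswap below shows, the last four bytes of ivec act as
# a big-endian 32-bit counter, and the advanced value is not written
# back. A hedged helper sketch in C that a caller might use between
# calls (illustrative only, not part of this module):
#
#	/* hypothetical illustration only */
#	static void ctr32_inc(unsigned char ivec[16], uint32_t n)
#	{
#		uint32_t c = ((uint32_t)ivec[12] << 24) |
#		    ((uint32_t)ivec[13] << 16) |
#		    ((uint32_t)ivec[14] << 8) | ivec[15];
#		c += n;			/* wraps modulo 2^32 */
#		ivec[12] = (unsigned char)(c >> 24);
#		ivec[13] = (unsigned char)(c >> 16);
#		ivec[14] = (unsigned char)(c >> 8);
#		ivec[15] = (unsigned char)c;
#	}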
776&function_begin("aesni_ctr32_encrypt_blocks");
777 &mov ($inp,&wparam(0));
778 &mov ($out,&wparam(1));
779 &mov ($len,&wparam(2));
780 &mov ($key,&wparam(3));
781 &mov ($rounds_,&wparam(4));
782 &mov ($key_,"esp");
783 &sub ("esp",88);
784 &and ("esp",-16); # align stack
785 &mov (&DWP(80,"esp"),$key_);
786
787 &cmp ($len,1);
788 &je (&label("ctr32_one_shortcut"));
789
790 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
791
792 # compose byte-swap control mask for pshufb on stack
793 &mov (&DWP(0,"esp"),0x0c0d0e0f);
794 &mov (&DWP(4,"esp"),0x08090a0b);
795 &mov (&DWP(8,"esp"),0x04050607);
796 &mov (&DWP(12,"esp"),0x00010203);
797
798 # compose counter increment vector on stack
799 &mov ($rounds,6);
800 &xor ($key_,$key_);
801 &mov (&DWP(16,"esp"),$rounds);
802 &mov (&DWP(20,"esp"),$rounds);
803 &mov (&DWP(24,"esp"),$rounds);
804 &mov (&DWP(28,"esp"),$key_);
805
806 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
807 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
808
809 &mov ($rounds,&DWP(240,$key)); # key->rounds
810
811 # compose 2 vectors of 3x32-bit counters
812 &bswap ($rounds_);
813 &pxor ($rndkey1,$rndkey1);
814 &pxor ($rndkey0,$rndkey0);
815 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
816 &pinsrd ($rndkey1,$rounds_,0);
817 &lea ($key_,&DWP(3,$rounds_));
818 &pinsrd ($rndkey0,$key_,0);
819 &inc ($rounds_);
820 &pinsrd ($rndkey1,$rounds_,1);
821 &inc ($key_);
822 &pinsrd ($rndkey0,$key_,1);
823 &inc ($rounds_);
824 &pinsrd ($rndkey1,$rounds_,2);
825 &inc ($key_);
826 &pinsrd ($rndkey0,$key_,2);
827 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
828 &pshufb ($rndkey1,$inout0); # byte swap
829 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
830 &pshufb ($rndkey0,$inout0); # byte swap
831
832 &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
833 &pshufd ($inout1,$rndkey1,2<<6);
834 &cmp ($len,6);
835 &jb (&label("ctr32_tail"));
836 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
837 &shr ($rounds,1);
838 &mov ($key_,$key); # backup $key
839 &mov ($rounds_,$rounds); # backup $rounds
840 &sub ($len,6);
841 &jmp (&label("ctr32_loop6"));
842
843&set_label("ctr32_loop6",16);
844 &pshufd ($inout2,$rndkey1,1<<6);
845 &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
846 &pshufd ($inout3,$rndkey0,3<<6);
847 &por ($inout0,$rndkey1); # merge counter-less ivec
848 &pshufd ($inout4,$rndkey0,2<<6);
849 &por ($inout1,$rndkey1);
850 &pshufd ($inout5,$rndkey0,1<<6);
851 &por ($inout2,$rndkey1);
852 &por ($inout3,$rndkey1);
853 &por ($inout4,$rndkey1);
854 &por ($inout5,$rndkey1);
855
856 # inlining _aesni_encrypt6's prologue gives ~4% improvement...
857 &$movekey ($rndkey0,&QWP(0,$key_));
858 &$movekey ($rndkey1,&QWP(16,$key_));
859 &lea ($key,&DWP(32,$key_));
860 &dec ($rounds);
861 &pxor ($inout0,$rndkey0);
862 &pxor ($inout1,$rndkey0);
863 &aesenc ($inout0,$rndkey1);
864 &pxor ($inout2,$rndkey0);
865 &aesenc ($inout1,$rndkey1);
866 &pxor ($inout3,$rndkey0);
867 &aesenc ($inout2,$rndkey1);
868 &pxor ($inout4,$rndkey0);
869 &aesenc ($inout3,$rndkey1);
870 &pxor ($inout5,$rndkey0);
871 &aesenc ($inout4,$rndkey1);
872 &$movekey ($rndkey0,&QWP(0,$key));
873 &aesenc ($inout5,$rndkey1);
874
875 &call (&label("_aesni_encrypt6_enter"));
876
877 &movups ($rndkey1,&QWP(0,$inp));
878 &movups ($rndkey0,&QWP(0x10,$inp));
879 &xorps ($inout0,$rndkey1);
880 &movups ($rndkey1,&QWP(0x20,$inp));
881 &xorps ($inout1,$rndkey0);
882 &movups (&QWP(0,$out),$inout0);
883 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
884 &xorps ($inout2,$rndkey1);
885 &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
886 &movups (&QWP(0x10,$out),$inout1);
887 &movups (&QWP(0x20,$out),$inout2);
888
889 &paddd ($rndkey1,$rndkey0); # 1st triplet increment
890 &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
891 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
892
893 &movups ($inout1,&QWP(0x30,$inp));
894 &movups ($inout2,&QWP(0x40,$inp));
895 &xorps ($inout3,$inout1);
896 &movups ($inout1,&QWP(0x50,$inp));
897 &lea ($inp,&DWP(0x60,$inp));
898 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
899 &pshufb ($rndkey1,$inout0); # byte swap
900 &xorps ($inout4,$inout2);
901 &movups (&QWP(0x30,$out),$inout3);
902 &xorps ($inout5,$inout1);
903 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
904 &pshufb ($rndkey0,$inout0); # byte swap
905 &movups (&QWP(0x40,$out),$inout4);
906 &pshufd ($inout0,$rndkey1,3<<6);
907 &movups (&QWP(0x50,$out),$inout5);
908 &lea ($out,&DWP(0x60,$out));
909
910 &mov ($rounds,$rounds_);
911 &pshufd ($inout1,$rndkey1,2<<6);
912 &sub ($len,6);
913 &jnc (&label("ctr32_loop6"));
914
915 &add ($len,6);
916 &jz (&label("ctr32_ret"));
917 &mov ($key,$key_);
918 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
919	&movdqa	($inout5,&QWP(32,"esp"));	# pull counter-less ivec
920
921&set_label("ctr32_tail");
922 &por ($inout0,$inout5);
923 &cmp ($len,2);
924 &jb (&label("ctr32_one"));
925
926 &pshufd ($inout2,$rndkey1,1<<6);
927 &por ($inout1,$inout5);
928 &je (&label("ctr32_two"));
929
930 &pshufd ($inout3,$rndkey0,3<<6);
931 &por ($inout2,$inout5);
932 &cmp ($len,4);
933 &jb (&label("ctr32_three"));
934
935 &pshufd ($inout4,$rndkey0,2<<6);
936 &por ($inout3,$inout5);
937 &je (&label("ctr32_four"));
938
939 &por ($inout4,$inout5);
940 &call ("_aesni_encrypt6");
941 &movups ($rndkey1,&QWP(0,$inp));
942 &movups ($rndkey0,&QWP(0x10,$inp));
943 &xorps ($inout0,$rndkey1);
944 &movups ($rndkey1,&QWP(0x20,$inp));
945 &xorps ($inout1,$rndkey0);
946 &movups ($rndkey0,&QWP(0x30,$inp));
947 &xorps ($inout2,$rndkey1);
948 &movups ($rndkey1,&QWP(0x40,$inp));
949 &xorps ($inout3,$rndkey0);
950 &movups (&QWP(0,$out),$inout0);
951 &xorps ($inout4,$rndkey1);
952 &movups (&QWP(0x10,$out),$inout1);
953 &movups (&QWP(0x20,$out),$inout2);
954 &movups (&QWP(0x30,$out),$inout3);
955 &movups (&QWP(0x40,$out),$inout4);
956 &jmp (&label("ctr32_ret"));
957
958&set_label("ctr32_one_shortcut",16);
959 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
960 &mov ($rounds,&DWP(240,$key));
961
962&set_label("ctr32_one");
963 if ($inline)
964 { &aesni_inline_generate1("enc"); }
965 else
966 { &call ("_aesni_encrypt1"); }
967 &movups ($in0,&QWP(0,$inp));
968 &xorps ($in0,$inout0);
969 &movups (&QWP(0,$out),$in0);
970 &jmp (&label("ctr32_ret"));
971
972&set_label("ctr32_two",16);
973 &call ("_aesni_encrypt3");
974 &movups ($inout3,&QWP(0,$inp));
975 &movups ($inout4,&QWP(0x10,$inp));
976 &xorps ($inout0,$inout3);
977 &xorps ($inout1,$inout4);
978 &movups (&QWP(0,$out),$inout0);
979 &movups (&QWP(0x10,$out),$inout1);
980 &jmp (&label("ctr32_ret"));
981
982&set_label("ctr32_three",16);
983 &call ("_aesni_encrypt3");
984 &movups ($inout3,&QWP(0,$inp));
985 &movups ($inout4,&QWP(0x10,$inp));
986 &xorps ($inout0,$inout3);
987 &movups ($inout5,&QWP(0x20,$inp));
988 &xorps ($inout1,$inout4);
989 &movups (&QWP(0,$out),$inout0);
990 &xorps ($inout2,$inout5);
991 &movups (&QWP(0x10,$out),$inout1);
992 &movups (&QWP(0x20,$out),$inout2);
993 &jmp (&label("ctr32_ret"));
994
995&set_label("ctr32_four",16);
996 &call ("_aesni_encrypt4");
997 &movups ($inout4,&QWP(0,$inp));
998 &movups ($inout5,&QWP(0x10,$inp));
999 &movups ($rndkey1,&QWP(0x20,$inp));
1000 &xorps ($inout0,$inout4);
1001 &movups ($rndkey0,&QWP(0x30,$inp));
1002 &xorps ($inout1,$inout5);
1003 &movups (&QWP(0,$out),$inout0);
1004 &xorps ($inout2,$rndkey1);
1005 &movups (&QWP(0x10,$out),$inout1);
1006 &xorps ($inout3,$rndkey0);
1007 &movups (&QWP(0x20,$out),$inout2);
1008 &movups (&QWP(0x30,$out),$inout3);
1009
1010&set_label("ctr32_ret");
1011 &mov ("esp",&DWP(80,"esp"));
1012&function_end("aesni_ctr32_encrypt_blocks");
1013
1014######################################################################
1015# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1016# const AES_KEY *key1, const AES_KEY *key2,
1017# const unsigned char iv[16]);
1018#
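# The "magic constant" 0x87 set up below implements the XTS tweak
# update: multiplication of the 128-bit tweak by x in GF(2^128)
# modulo x^128 + x^7 + x^2 + x + 1. A plain byte-wise C rendition of
# the net effect of the pshufd/paddq/pand/pxor sequences (sketch for
# reference only, not code from this module):
#
#	/* hypothetical illustration only */
#	static void xts_double(unsigned char t[16])
#	{
#		unsigned int c = 0, cc, i;
#
#		for (i = 0; i < 16; i++) {
#			cc = t[i] >> 7;
#			t[i] = (unsigned char)((t[i] << 1) | c);
#			c = cc;
#		}
#		if (c)
#			t[0] ^= 0x87;	/* reduce by the XTS polynomial */
#	}
#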
1019{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1020
1021&function_begin("aesni_xts_encrypt");
1022 &mov ($key,&wparam(4)); # key2
1023 &mov ($inp,&wparam(5)); # clear-text tweak
1024
1025 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1026 &movups ($inout0,&QWP(0,$inp));
1027 if ($inline)
1028 { &aesni_inline_generate1("enc"); }
1029 else
1030 { &call ("_aesni_encrypt1"); }
1031
1032 &mov ($inp,&wparam(0));
1033 &mov ($out,&wparam(1));
1034 &mov ($len,&wparam(2));
1035 &mov ($key,&wparam(3)); # key1
1036
1037 &mov ($key_,"esp");
1038 &sub ("esp",16*7+8);
1039 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1040 &and ("esp",-16); # align stack
1041
1042 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1043 &mov (&DWP(16*6+4,"esp"),0);
1044 &mov (&DWP(16*6+8,"esp"),1);
1045 &mov (&DWP(16*6+12,"esp"),0);
1046 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1047 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1048
1049 &movdqa ($tweak,$inout0);
1050 &pxor ($twtmp,$twtmp);
1051 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1052 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1053
1054 &and ($len,-16);
1055 &mov ($key_,$key); # backup $key
1056 &mov ($rounds_,$rounds); # backup $rounds
1057 &sub ($len,16*6);
1058 &jc (&label("xts_enc_short"));
1059
1060 &shr ($rounds,1);
1061 &mov ($rounds_,$rounds);
1062 &jmp (&label("xts_enc_loop6"));
1063
1064&set_label("xts_enc_loop6",16);
1065 for ($i=0;$i<4;$i++) {
1066 &pshufd ($twres,$twtmp,0x13);
1067 &pxor ($twtmp,$twtmp);
1068 &movdqa (&QWP(16*$i,"esp"),$tweak);
1069 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1070 &pand ($twres,$twmask); # isolate carry and residue
1071 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1072 &pxor ($tweak,$twres);
1073 }
1074 &pshufd ($inout5,$twtmp,0x13);
1075 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1076 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1077 &$movekey ($rndkey0,&QWP(0,$key_));
1078 &pand ($inout5,$twmask); # isolate carry and residue
1079 &movups ($inout0,&QWP(0,$inp)); # load input
1080 &pxor ($inout5,$tweak);
1081
1082 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1083 &movdqu ($inout1,&QWP(16*1,$inp));
1084 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1085 &movdqu ($inout2,&QWP(16*2,$inp));
1086 &pxor ($inout1,$rndkey0);
1087 &movdqu ($inout3,&QWP(16*3,$inp));
1088 &pxor ($inout2,$rndkey0);
1089 &movdqu ($inout4,&QWP(16*4,$inp));
1090 &pxor ($inout3,$rndkey0);
1091 &movdqu ($rndkey1,&QWP(16*5,$inp));
1092 &pxor ($inout4,$rndkey0);
1093 &lea ($inp,&DWP(16*6,$inp));
1094 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1095 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1096 &pxor ($inout5,$rndkey1);
1097
1098 &$movekey ($rndkey1,&QWP(16,$key_));
1099 &lea ($key,&DWP(32,$key_));
1100 &pxor ($inout1,&QWP(16*1,"esp"));
1101 &aesenc ($inout0,$rndkey1);
1102 &pxor ($inout2,&QWP(16*2,"esp"));
1103 &aesenc ($inout1,$rndkey1);
1104 &pxor ($inout3,&QWP(16*3,"esp"));
1105 &dec ($rounds);
1106 &aesenc ($inout2,$rndkey1);
1107 &pxor ($inout4,&QWP(16*4,"esp"));
1108 &aesenc ($inout3,$rndkey1);
1109 &pxor ($inout5,$rndkey0);
1110 &aesenc ($inout4,$rndkey1);
1111 &$movekey ($rndkey0,&QWP(0,$key));
1112 &aesenc ($inout5,$rndkey1);
1113 &call (&label("_aesni_encrypt6_enter"));
1114
1115 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1116 &pxor ($twtmp,$twtmp);
1117 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1118 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1119 &xorps ($inout1,&QWP(16*1,"esp"));
1120 &movups (&QWP(16*0,$out),$inout0); # write output
1121 &xorps ($inout2,&QWP(16*2,"esp"));
1122 &movups (&QWP(16*1,$out),$inout1);
1123 &xorps ($inout3,&QWP(16*3,"esp"));
1124 &movups (&QWP(16*2,$out),$inout2);
1125 &xorps ($inout4,&QWP(16*4,"esp"));
1126 &movups (&QWP(16*3,$out),$inout3);
1127 &xorps ($inout5,$tweak);
1128 &movups (&QWP(16*4,$out),$inout4);
1129 &pshufd ($twres,$twtmp,0x13);
1130 &movups (&QWP(16*5,$out),$inout5);
1131 &lea ($out,&DWP(16*6,$out));
1132 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1133
1134 &pxor ($twtmp,$twtmp);
1135 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1136 &pand ($twres,$twmask); # isolate carry and residue
1137 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1138 &mov ($rounds,$rounds_); # restore $rounds
1139 &pxor ($tweak,$twres);
1140
1141 &sub ($len,16*6);
1142 &jnc (&label("xts_enc_loop6"));
1143
1144 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
1145 &mov ($key,$key_); # restore $key
1146 &mov ($rounds_,$rounds);
1147
1148&set_label("xts_enc_short");
1149 &add ($len,16*6);
1150 &jz (&label("xts_enc_done6x"));
1151
1152 &movdqa ($inout3,$tweak); # put aside previous tweak
1153 &cmp ($len,0x20);
1154 &jb (&label("xts_enc_one"));
1155
1156 &pshufd ($twres,$twtmp,0x13);
1157 &pxor ($twtmp,$twtmp);
1158 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1159 &pand ($twres,$twmask); # isolate carry and residue
1160 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1161 &pxor ($tweak,$twres);
1162 &je (&label("xts_enc_two"));
1163
1164 &pshufd ($twres,$twtmp,0x13);
1165 &pxor ($twtmp,$twtmp);
1166 &movdqa ($inout4,$tweak); # put aside previous tweak
1167 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1168 &pand ($twres,$twmask); # isolate carry and residue
1169 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1170 &pxor ($tweak,$twres);
1171 &cmp ($len,0x40);
1172 &jb (&label("xts_enc_three"));
1173
1174 &pshufd ($twres,$twtmp,0x13);
1175 &pxor ($twtmp,$twtmp);
1176 &movdqa ($inout5,$tweak); # put aside previous tweak
1177 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1178 &pand ($twres,$twmask); # isolate carry and residue
1179 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1180 &pxor ($tweak,$twres);
1181 &movdqa (&QWP(16*0,"esp"),$inout3);
1182 &movdqa (&QWP(16*1,"esp"),$inout4);
1183 &je (&label("xts_enc_four"));
1184
1185 &movdqa (&QWP(16*2,"esp"),$inout5);
1186 &pshufd ($inout5,$twtmp,0x13);
1187 &movdqa (&QWP(16*3,"esp"),$tweak);
1188	&paddq	($tweak,$tweak);	# &psllq($tweak,1);
1189 &pand ($inout5,$twmask); # isolate carry and residue
1190 &pxor ($inout5,$tweak);
1191
1192 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1193 &movdqu ($inout1,&QWP(16*1,$inp));
1194 &movdqu ($inout2,&QWP(16*2,$inp));
1195 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1196 &movdqu ($inout3,&QWP(16*3,$inp));
1197 &pxor ($inout1,&QWP(16*1,"esp"));
1198 &movdqu ($inout4,&QWP(16*4,$inp));
1199 &pxor ($inout2,&QWP(16*2,"esp"));
1200 &lea ($inp,&DWP(16*5,$inp));
1201 &pxor ($inout3,&QWP(16*3,"esp"));
1202 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1203 &pxor ($inout4,$inout5);
1204
1205 &call ("_aesni_encrypt6");
1206
1207 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1208 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1209 &xorps ($inout1,&QWP(16*1,"esp"));
1210 &xorps ($inout2,&QWP(16*2,"esp"));
1211 &movups (&QWP(16*0,$out),$inout0); # write output
1212 &xorps ($inout3,&QWP(16*3,"esp"));
1213 &movups (&QWP(16*1,$out),$inout1);
1214 &xorps ($inout4,$tweak);
1215 &movups (&QWP(16*2,$out),$inout2);
1216 &movups (&QWP(16*3,$out),$inout3);
1217 &movups (&QWP(16*4,$out),$inout4);
1218 &lea ($out,&DWP(16*5,$out));
1219 &jmp (&label("xts_enc_done"));
1220
1221&set_label("xts_enc_one",16);
1222 &movups ($inout0,&QWP(16*0,$inp)); # load input
1223 &lea ($inp,&DWP(16*1,$inp));
1224 &xorps ($inout0,$inout3); # input^=tweak
1225 if ($inline)
1226 { &aesni_inline_generate1("enc"); }
1227 else
1228 { &call ("_aesni_encrypt1"); }
1229 &xorps ($inout0,$inout3); # output^=tweak
1230 &movups (&QWP(16*0,$out),$inout0); # write output
1231 &lea ($out,&DWP(16*1,$out));
1232
1233 &movdqa ($tweak,$inout3); # last tweak
1234 &jmp (&label("xts_enc_done"));
1235
1236&set_label("xts_enc_two",16);
1237 &movaps ($inout4,$tweak); # put aside last tweak
1238
1239 &movups ($inout0,&QWP(16*0,$inp)); # load input
1240 &movups ($inout1,&QWP(16*1,$inp));
1241 &lea ($inp,&DWP(16*2,$inp));
1242 &xorps ($inout0,$inout3); # input^=tweak
1243 &xorps ($inout1,$inout4);
1244 &xorps ($inout2,$inout2);
1245
1246 &call ("_aesni_encrypt3");
1247
1248 &xorps ($inout0,$inout3); # output^=tweak
1249 &xorps ($inout1,$inout4);
1250 &movups (&QWP(16*0,$out),$inout0); # write output
1251 &movups (&QWP(16*1,$out),$inout1);
1252 &lea ($out,&DWP(16*2,$out));
1253
1254 &movdqa ($tweak,$inout4); # last tweak
1255 &jmp (&label("xts_enc_done"));
1256
1257&set_label("xts_enc_three",16);
1258 &movaps ($inout5,$tweak); # put aside last tweak
1259 &movups ($inout0,&QWP(16*0,$inp)); # load input
1260 &movups ($inout1,&QWP(16*1,$inp));
1261 &movups ($inout2,&QWP(16*2,$inp));
1262 &lea ($inp,&DWP(16*3,$inp));
1263 &xorps ($inout0,$inout3); # input^=tweak
1264 &xorps ($inout1,$inout4);
1265 &xorps ($inout2,$inout5);
1266
1267 &call ("_aesni_encrypt3");
1268
1269 &xorps ($inout0,$inout3); # output^=tweak
1270 &xorps ($inout1,$inout4);
1271 &xorps ($inout2,$inout5);
1272 &movups (&QWP(16*0,$out),$inout0); # write output
1273 &movups (&QWP(16*1,$out),$inout1);
1274 &movups (&QWP(16*2,$out),$inout2);
1275 &lea ($out,&DWP(16*3,$out));
1276
1277 &movdqa ($tweak,$inout5); # last tweak
1278 &jmp (&label("xts_enc_done"));
1279
1280&set_label("xts_enc_four",16);
1281 &movaps ($inout4,$tweak); # put aside last tweak
1282
1283 &movups ($inout0,&QWP(16*0,$inp)); # load input
1284 &movups ($inout1,&QWP(16*1,$inp));
1285 &movups ($inout2,&QWP(16*2,$inp));
1286 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1287 &movups ($inout3,&QWP(16*3,$inp));
1288 &lea ($inp,&DWP(16*4,$inp));
1289 &xorps ($inout1,&QWP(16*1,"esp"));
1290 &xorps ($inout2,$inout5);
1291 &xorps ($inout3,$inout4);
1292
1293 &call ("_aesni_encrypt4");
1294
1295 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1296 &xorps ($inout1,&QWP(16*1,"esp"));
1297 &xorps ($inout2,$inout5);
1298 &movups (&QWP(16*0,$out),$inout0); # write output
1299 &xorps ($inout3,$inout4);
1300 &movups (&QWP(16*1,$out),$inout1);
1301 &movups (&QWP(16*2,$out),$inout2);
1302 &movups (&QWP(16*3,$out),$inout3);
1303 &lea ($out,&DWP(16*4,$out));
1304
1305 &movdqa ($tweak,$inout4); # last tweak
1306 &jmp (&label("xts_enc_done"));
1307
1308&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1309 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1310 &and ($len,15);
1311 &jz (&label("xts_enc_ret"));
1312 &movdqa ($inout3,$tweak);
1313 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1314 &jmp (&label("xts_enc_steal"));
1315
1316&set_label("xts_enc_done",16);
1317 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1318 &pxor ($twtmp,$twtmp);
1319 &and ($len,15);
1320 &jz (&label("xts_enc_ret"));
1321
1322 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1323 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1324 &pshufd ($inout3,$twtmp,0x13);
1325 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1326 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1327 &pxor ($inout3,$tweak);
1328
1329&set_label("xts_enc_steal");
1330 &movz ($rounds,&BP(0,$inp));
1331 &movz ($key,&BP(-16,$out));
1332 &lea ($inp,&DWP(1,$inp));
1333 &mov (&BP(-16,$out),&LB($rounds));
1334 &mov (&BP(0,$out),&LB($key));
1335 &lea ($out,&DWP(1,$out));
1336 &sub ($len,1);
1337 &jnz (&label("xts_enc_steal"));
1338
1339 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1340 &mov ($key,$key_); # restore $key
1341 &mov ($rounds,$rounds_); # restore $rounds
1342
1343 &movups ($inout0,&QWP(-16,$out)); # load input
1344 &xorps ($inout0,$inout3); # input^=tweak
1345 if ($inline)
1346 { &aesni_inline_generate1("enc"); }
1347 else
1348 { &call ("_aesni_encrypt1"); }
1349 &xorps ($inout0,$inout3); # output^=tweak
1350 &movups (&QWP(-16,$out),$inout0); # write output
1351
1352&set_label("xts_enc_ret");
1353 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1354&function_end("aesni_xts_encrypt");
1355
1356&function_begin("aesni_xts_decrypt");
1357 &mov ($key,&wparam(4)); # key2
1358 &mov ($inp,&wparam(5)); # clear-text tweak
1359
1360 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1361 &movups ($inout0,&QWP(0,$inp));
1362 if ($inline)
1363 { &aesni_inline_generate1("enc"); }
1364 else
1365 { &call ("_aesni_encrypt1"); }
1366
1367 &mov ($inp,&wparam(0));
1368 &mov ($out,&wparam(1));
1369 &mov ($len,&wparam(2));
1370 &mov ($key,&wparam(3)); # key1
1371
1372 &mov ($key_,"esp");
1373 &sub ("esp",16*7+8);
1374 &and ("esp",-16); # align stack
1375
1376 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1377 &test ($len,15);
1378 &setnz (&LB($rounds_));
1379 &shl ($rounds_,4);
1380 &sub ($len,$rounds_);
1381
1382 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1383 &mov (&DWP(16*6+4,"esp"),0);
1384 &mov (&DWP(16*6+8,"esp"),1);
1385 &mov (&DWP(16*6+12,"esp"),0);
1386 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1387 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1388
1389 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1390 &mov ($key_,$key); # backup $key
1391 &mov ($rounds_,$rounds); # backup $rounds
1392
1393 &movdqa ($tweak,$inout0);
1394 &pxor ($twtmp,$twtmp);
1395 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1396 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1397
1398 &and ($len,-16);
1399 &sub ($len,16*6);
1400 &jc (&label("xts_dec_short"));
1401
1402 &shr ($rounds,1);
1403 &mov ($rounds_,$rounds);
1404 &jmp (&label("xts_dec_loop6"));
1405
1406&set_label("xts_dec_loop6",16);
1407 for ($i=0;$i<4;$i++) {
1408 &pshufd ($twres,$twtmp,0x13);
1409 &pxor ($twtmp,$twtmp);
1410 &movdqa (&QWP(16*$i,"esp"),$tweak);
1411 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1412 &pand ($twres,$twmask); # isolate carry and residue
1413 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1414 &pxor ($tweak,$twres);
1415 }
1416 &pshufd ($inout5,$twtmp,0x13);
1417 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1418 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1419 &$movekey ($rndkey0,&QWP(0,$key_));
1420 &pand ($inout5,$twmask); # isolate carry and residue
1421 &movups ($inout0,&QWP(0,$inp)); # load input
1422 &pxor ($inout5,$tweak);
1423
1424 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1425 &movdqu ($inout1,&QWP(16*1,$inp));
1426 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1427 &movdqu ($inout2,&QWP(16*2,$inp));
1428 &pxor ($inout1,$rndkey0);
1429 &movdqu ($inout3,&QWP(16*3,$inp));
1430 &pxor ($inout2,$rndkey0);
1431 &movdqu ($inout4,&QWP(16*4,$inp));
1432 &pxor ($inout3,$rndkey0);
1433 &movdqu ($rndkey1,&QWP(16*5,$inp));
1434 &pxor ($inout4,$rndkey0);
1435 &lea ($inp,&DWP(16*6,$inp));
1436 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1437 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1438 &pxor ($inout5,$rndkey1);
1439
1440 &$movekey ($rndkey1,&QWP(16,$key_));
1441 &lea ($key,&DWP(32,$key_));
1442 &pxor ($inout1,&QWP(16*1,"esp"));
1443 &aesdec ($inout0,$rndkey1);
1444 &pxor ($inout2,&QWP(16*2,"esp"));
1445 &aesdec ($inout1,$rndkey1);
1446 &pxor ($inout3,&QWP(16*3,"esp"));
1447 &dec ($rounds);
1448 &aesdec ($inout2,$rndkey1);
1449 &pxor ($inout4,&QWP(16*4,"esp"));
1450 &aesdec ($inout3,$rndkey1);
1451 &pxor ($inout5,$rndkey0);
1452 &aesdec ($inout4,$rndkey1);
1453 &$movekey ($rndkey0,&QWP(0,$key));
1454 &aesdec ($inout5,$rndkey1);
1455 &call (&label("_aesni_decrypt6_enter"));
1456
1457 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1458 &pxor ($twtmp,$twtmp);
1459 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1460 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1461 &xorps ($inout1,&QWP(16*1,"esp"));
1462 &movups (&QWP(16*0,$out),$inout0); # write output
1463 &xorps ($inout2,&QWP(16*2,"esp"));
1464 &movups (&QWP(16*1,$out),$inout1);
1465 &xorps ($inout3,&QWP(16*3,"esp"));
1466 &movups (&QWP(16*2,$out),$inout2);
1467 &xorps ($inout4,&QWP(16*4,"esp"));
1468 &movups (&QWP(16*3,$out),$inout3);
1469 &xorps ($inout5,$tweak);
1470 &movups (&QWP(16*4,$out),$inout4);
1471 &pshufd ($twres,$twtmp,0x13);
1472 &movups (&QWP(16*5,$out),$inout5);
1473 &lea ($out,&DWP(16*6,$out));
1474 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1475
1476 &pxor ($twtmp,$twtmp);
1477 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1478 &pand ($twres,$twmask); # isolate carry and residue
1479 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1480 &mov ($rounds,$rounds_); # restore $rounds
1481 &pxor ($tweak,$twres);
1482
1483 &sub ($len,16*6);
1484 &jnc (&label("xts_dec_loop6"));
1485
1486 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
1487 &mov ($key,$key_); # restore $key
1488 &mov ($rounds_,$rounds);
1489
1490&set_label("xts_dec_short");
1491 &add ($len,16*6);
1492 &jz (&label("xts_dec_done6x"));
1493
1494 &movdqa ($inout3,$tweak); # put aside previous tweak
1495 &cmp ($len,0x20);
1496 &jb (&label("xts_dec_one"));
1497
1498 &pshufd ($twres,$twtmp,0x13);
1499 &pxor ($twtmp,$twtmp);
1500 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1501 &pand ($twres,$twmask); # isolate carry and residue
1502 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1503 &pxor ($tweak,$twres);
1504 &je (&label("xts_dec_two"));
1505
1506 &pshufd ($twres,$twtmp,0x13);
1507 &pxor ($twtmp,$twtmp);
1508 &movdqa ($inout4,$tweak); # put aside previous tweak
1509 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1510 &pand ($twres,$twmask); # isolate carry and residue
1511 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1512 &pxor ($tweak,$twres);
1513 &cmp ($len,0x40);
1514 &jb (&label("xts_dec_three"));
1515
1516 &pshufd ($twres,$twtmp,0x13);
1517 &pxor ($twtmp,$twtmp);
1518 &movdqa ($inout5,$tweak); # put aside previous tweak
1519 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1520 &pand ($twres,$twmask); # isolate carry and residue
1521 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1522 &pxor ($tweak,$twres);
1523 &movdqa (&QWP(16*0,"esp"),$inout3);
1524 &movdqa (&QWP(16*1,"esp"),$inout4);
1525 &je (&label("xts_dec_four"));
1526
1527 &movdqa (&QWP(16*2,"esp"),$inout5);
1528 &pshufd ($inout5,$twtmp,0x13);
1529 &movdqa (&QWP(16*3,"esp"),$tweak);
1530	&paddq	($tweak,$tweak);	# &psllq($tweak,1);
1531 &pand ($inout5,$twmask); # isolate carry and residue
1532 &pxor ($inout5,$tweak);
1533
1534 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1535 &movdqu ($inout1,&QWP(16*1,$inp));
1536 &movdqu ($inout2,&QWP(16*2,$inp));
1537 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1538 &movdqu ($inout3,&QWP(16*3,$inp));
1539 &pxor ($inout1,&QWP(16*1,"esp"));
1540 &movdqu ($inout4,&QWP(16*4,$inp));
1541 &pxor ($inout2,&QWP(16*2,"esp"));
1542 &lea ($inp,&DWP(16*5,$inp));
1543 &pxor ($inout3,&QWP(16*3,"esp"));
1544 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1545 &pxor ($inout4,$inout5);
1546
1547 &call ("_aesni_decrypt6");
1548
1549 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1550 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1551 &xorps ($inout1,&QWP(16*1,"esp"));
1552 &xorps ($inout2,&QWP(16*2,"esp"));
1553 &movups (&QWP(16*0,$out),$inout0); # write output
1554 &xorps ($inout3,&QWP(16*3,"esp"));
1555 &movups (&QWP(16*1,$out),$inout1);
1556 &xorps ($inout4,$tweak);
1557 &movups (&QWP(16*2,$out),$inout2);
1558 &movups (&QWP(16*3,$out),$inout3);
1559 &movups (&QWP(16*4,$out),$inout4);
1560 &lea ($out,&DWP(16*5,$out));
1561 &jmp (&label("xts_dec_done"));
1562
1563&set_label("xts_dec_one",16);
1564 &movups ($inout0,&QWP(16*0,$inp)); # load input
1565 &lea ($inp,&DWP(16*1,$inp));
1566 &xorps ($inout0,$inout3); # input^=tweak
1567 if ($inline)
1568 { &aesni_inline_generate1("dec"); }
1569 else
1570 { &call ("_aesni_decrypt1"); }
1571 &xorps ($inout0,$inout3); # output^=tweak
1572 &movups (&QWP(16*0,$out),$inout0); # write output
1573 &lea ($out,&DWP(16*1,$out));
1574
1575 &movdqa ($tweak,$inout3); # last tweak
1576 &jmp (&label("xts_dec_done"));
1577
1578&set_label("xts_dec_two",16);
1579 &movaps ($inout4,$tweak); # put aside last tweak
1580
1581 &movups ($inout0,&QWP(16*0,$inp)); # load input
1582 &movups ($inout1,&QWP(16*1,$inp));
1583 &lea ($inp,&DWP(16*2,$inp));
1584 &xorps ($inout0,$inout3); # input^=tweak
1585 &xorps ($inout1,$inout4);
1586
1587 &call ("_aesni_decrypt3");
1588
1589 &xorps ($inout0,$inout3); # output^=tweak
1590 &xorps ($inout1,$inout4);
1591 &movups (&QWP(16*0,$out),$inout0); # write output
1592 &movups (&QWP(16*1,$out),$inout1);
1593 &lea ($out,&DWP(16*2,$out));
1594
1595 &movdqa ($tweak,$inout4); # last tweak
1596 &jmp (&label("xts_dec_done"));
1597
1598&set_label("xts_dec_three",16);
1599 &movaps ($inout5,$tweak); # put aside last tweak
1600 &movups ($inout0,&QWP(16*0,$inp)); # load input
1601 &movups ($inout1,&QWP(16*1,$inp));
1602 &movups ($inout2,&QWP(16*2,$inp));
1603 &lea ($inp,&DWP(16*3,$inp));
1604 &xorps ($inout0,$inout3); # input^=tweak
1605 &xorps ($inout1,$inout4);
1606 &xorps ($inout2,$inout5);
1607
1608 &call ("_aesni_decrypt3");
1609
1610 &xorps ($inout0,$inout3); # output^=tweak
1611 &xorps ($inout1,$inout4);
1612 &xorps ($inout2,$inout5);
1613 &movups (&QWP(16*0,$out),$inout0); # write output
1614 &movups (&QWP(16*1,$out),$inout1);
1615 &movups (&QWP(16*2,$out),$inout2);
1616 &lea ($out,&DWP(16*3,$out));
1617
1618 &movdqa ($tweak,$inout5); # last tweak
1619 &jmp (&label("xts_dec_done"));
1620
1621&set_label("xts_dec_four",16);
1622 &movaps ($inout4,$tweak); # put aside last tweak
1623
1624 &movups ($inout0,&QWP(16*0,$inp)); # load input
1625 &movups ($inout1,&QWP(16*1,$inp));
1626 &movups ($inout2,&QWP(16*2,$inp));
1627 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1628 &movups ($inout3,&QWP(16*3,$inp));
1629 &lea ($inp,&DWP(16*4,$inp));
1630 &xorps ($inout1,&QWP(16*1,"esp"));
1631 &xorps ($inout2,$inout5);
1632 &xorps ($inout3,$inout4);
1633
1634 &call ("_aesni_decrypt4");
1635
1636 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1637 &xorps ($inout1,&QWP(16*1,"esp"));
1638 &xorps ($inout2,$inout5);
1639 &movups (&QWP(16*0,$out),$inout0); # write output
1640 &xorps ($inout3,$inout4);
1641 &movups (&QWP(16*1,$out),$inout1);
1642 &movups (&QWP(16*2,$out),$inout2);
1643 &movups (&QWP(16*3,$out),$inout3);
1644 &lea ($out,&DWP(16*4,$out));
1645
1646 &movdqa ($tweak,$inout4); # last tweak
1647 &jmp (&label("xts_dec_done"));
1648
1649&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1650 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1651 &and ($len,15);
1652 &jz (&label("xts_dec_ret"));
1653 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1654 &jmp (&label("xts_dec_only_one_more"));
1655
1656&set_label("xts_dec_done",16);
1657 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1658 &pxor ($twtmp,$twtmp);
1659 &and ($len,15);
1660 &jz (&label("xts_dec_ret"));
1661
1662 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1663 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1664 &pshufd ($twres,$twtmp,0x13);
1665 &pxor ($twtmp,$twtmp);
1666 &movdqa ($twmask,&QWP(16*6,"esp"));
1667 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1668 &pand ($twres,$twmask); # isolate carry and residue
1669 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1670 &pxor ($tweak,$twres);
1671
1672&set_label("xts_dec_only_one_more");
1673 &pshufd ($inout3,$twtmp,0x13);
1674 &movdqa ($inout4,$tweak); # put aside previous tweak
1675 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1676 &pand ($inout3,$twmask); # isolate carry and residue
1677 &pxor ($inout3,$tweak);
1678
1679 &mov ($key,$key_); # restore $key
1680 &mov ($rounds,$rounds_); # restore $rounds
1681
1682 &movups ($inout0,&QWP(0,$inp)); # load input
1683 &xorps ($inout0,$inout3); # input^=tweak
1684 if ($inline)
1685 { &aesni_inline_generate1("dec"); }
1686 else
1687 { &call ("_aesni_decrypt1"); }
1688 &xorps ($inout0,$inout3); # output^=tweak
1689 &movups (&QWP(0,$out),$inout0); # write output
1690
1691&set_label("xts_dec_steal");
1692 &movz ($rounds,&BP(16,$inp));
1693 &movz ($key,&BP(0,$out));
1694 &lea ($inp,&DWP(1,$inp));
1695 &mov (&BP(0,$out),&LB($rounds));
1696 &mov (&BP(16,$out),&LB($key));
1697 &lea ($out,&DWP(1,$out));
1698 &sub ($len,1);
1699 &jnz (&label("xts_dec_steal"));
1700
1701 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1702 &mov ($key,$key_); # restore $key
1703 &mov ($rounds,$rounds_); # restore $rounds
1704
1705 &movups ($inout0,&QWP(0,$out)); # load input
1706 &xorps ($inout0,$inout4); # input^=tweak
1707 if ($inline)
1708 { &aesni_inline_generate1("dec"); }
1709 else
1710 { &call ("_aesni_decrypt1"); }
1711 &xorps ($inout0,$inout4); # output^=tweak
1712 &movups (&QWP(0,$out),$inout0); # write output
1713
1714&set_label("xts_dec_ret");
1715 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1716&function_end("aesni_xts_decrypt");
1717}
1718}
1719
1720######################################################################
1721# void $PREFIX_cbc_encrypt (const void *inp, void *out,
1722# size_t length, const AES_KEY *key,
1723# unsigned char *ivp,const int enc);
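#
# Unlike the ECB/CTR/CCM entry points above, this routine does update
# the IV: the chaining value is written back through ivp at cbc_ret,
# so back-to-back calls continue the same chain. A hedged caller
# sketch in C, assuming the usual "aesni" value of $PREFIX and
# lengths that are multiples of 16 (illustrative only):
#
#	/* hypothetical illustration only */
#	unsigned char iv[16];
#	/* ... set iv ... */
#	aesni_cbc_encrypt(in,     out,     n, &key, iv, 1);
#	aesni_cbc_encrypt(in + n, out + n, m, &key, iv, 1);
#	/* the first call left the last ciphertext block in iv, so
#	 * the second call continues the same CBC chain */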
1724&function_begin("${PREFIX}_cbc_encrypt");
1725 &mov ($inp,&wparam(0));
1726 &mov ($rounds_,"esp");
1727 &mov ($out,&wparam(1));
1728 &sub ($rounds_,24);
1729 &mov ($len,&wparam(2));
1730 &and ($rounds_,-16);
1731 &mov ($key,&wparam(3));
1732 &mov ($key_,&wparam(4));
1733 &test ($len,$len);
1734 &jz (&label("cbc_abort"));
1735
1736 &cmp (&wparam(5),0);
1737 &xchg ($rounds_,"esp"); # alloca
1738 &movups ($ivec,&QWP(0,$key_)); # load IV
1739 &mov ($rounds,&DWP(240,$key));
1740 &mov ($key_,$key); # backup $key
1741 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
1742 &mov ($rounds_,$rounds); # backup $rounds
1743 &je (&label("cbc_decrypt"));
1744
1745 &movaps ($inout0,$ivec);
1746 &cmp ($len,16);
1747 &jb (&label("cbc_enc_tail"));
1748 &sub ($len,16);
1749 &jmp (&label("cbc_enc_loop"));
1750
1751&set_label("cbc_enc_loop",16);
1752 &movups ($ivec,&QWP(0,$inp)); # input actually
1753 &lea ($inp,&DWP(16,$inp));
1754 if ($inline)
1755 { &aesni_inline_generate1("enc",$inout0,$ivec); }
1756 else
1757 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
1758 &mov ($rounds,$rounds_); # restore $rounds
1759 &mov ($key,$key_); # restore $key
1760 &movups (&QWP(0,$out),$inout0); # store output
1761 &lea ($out,&DWP(16,$out));
1762 &sub ($len,16);
1763 &jnc (&label("cbc_enc_loop"));
1764 &add ($len,16);
1765 &jnz (&label("cbc_enc_tail"));
1766 &movaps ($ivec,$inout0);
1767 &jmp (&label("cbc_ret"));
1768
1769&set_label("cbc_enc_tail");
1770 &mov ("ecx",$len); # zaps $rounds
1771 &data_word(0xA4F3F689); # rep movsb
1772 &mov ("ecx",16); # zero tail
1773 &sub ("ecx",$len);
1774 &xor ("eax","eax"); # zaps $len
1775 &data_word(0xAAF3F689); # rep stosb
1776 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
1777 &mov ($rounds,$rounds_); # restore $rounds
1778 &mov ($inp,$out); # $inp and $out are the same
1779 &mov ($key,$key_); # restore $key
1780 &jmp (&label("cbc_enc_loop"));
1781######################################################################
1782&set_label("cbc_decrypt",16);
1783 &cmp ($len,0x50);
1784 &jbe (&label("cbc_dec_tail"));
1785 &movaps (&QWP(0,"esp"),$ivec); # save IV
1786 &sub ($len,0x50);
1787 &jmp (&label("cbc_dec_loop6_enter"));
1788
1789&set_label("cbc_dec_loop6",16);
1790 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
1791 &movups (&QWP(0,$out),$inout5);
1792 &lea ($out,&DWP(0x10,$out));
1793&set_label("cbc_dec_loop6_enter");
1794 &movdqu ($inout0,&QWP(0,$inp));
1795 &movdqu ($inout1,&QWP(0x10,$inp));
1796 &movdqu ($inout2,&QWP(0x20,$inp));
1797 &movdqu ($inout3,&QWP(0x30,$inp));
1798 &movdqu ($inout4,&QWP(0x40,$inp));
1799 &movdqu ($inout5,&QWP(0x50,$inp));
1800
1801 &call ("_aesni_decrypt6");
1802
1803 &movups ($rndkey1,&QWP(0,$inp));
1804 &movups ($rndkey0,&QWP(0x10,$inp));
1805 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
1806 &xorps ($inout1,$rndkey1);
1807 &movups ($rndkey1,&QWP(0x20,$inp));
1808 &xorps ($inout2,$rndkey0);
1809 &movups ($rndkey0,&QWP(0x30,$inp));
1810 &xorps ($inout3,$rndkey1);
1811 &movups ($rndkey1,&QWP(0x40,$inp));
1812 &xorps ($inout4,$rndkey0);
1813 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
1814 &xorps ($inout5,$rndkey1);
1815 &movups (&QWP(0,$out),$inout0);
1816 &movups (&QWP(0x10,$out),$inout1);
1817 &lea ($inp,&DWP(0x60,$inp));
1818 &movups (&QWP(0x20,$out),$inout2);
1819	&mov	($rounds,$rounds_);	# restore $rounds
1820 &movups (&QWP(0x30,$out),$inout3);
1821 &mov ($key,$key_); # restore $key
1822 &movups (&QWP(0x40,$out),$inout4);
1823 &lea ($out,&DWP(0x50,$out));
1824 &sub ($len,0x60);
1825 &ja (&label("cbc_dec_loop6"));
1826
1827 &movaps ($inout0,$inout5);
1828 &movaps ($ivec,$rndkey0);
1829 &add ($len,0x50);
1830 &jle (&label("cbc_dec_tail_collected"));
1831 &movups (&QWP(0,$out),$inout0);
1832 &lea ($out,&DWP(0x10,$out));
1833&set_label("cbc_dec_tail");
1834 &movups ($inout0,&QWP(0,$inp));
1835 &movaps ($in0,$inout0);
1836 &cmp ($len,0x10);
1837 &jbe (&label("cbc_dec_one"));
1838
1839 &movups ($inout1,&QWP(0x10,$inp));
1840 &movaps ($in1,$inout1);
1841 &cmp ($len,0x20);
1842 &jbe (&label("cbc_dec_two"));
1843
1844 &movups ($inout2,&QWP(0x20,$inp));
1845 &cmp ($len,0x30);
1846 &jbe (&label("cbc_dec_three"));
1847
1848 &movups ($inout3,&QWP(0x30,$inp));
1849 &cmp ($len,0x40);
1850 &jbe (&label("cbc_dec_four"));
1851
1852 &movups ($inout4,&QWP(0x40,$inp));
1853 &movaps (&QWP(0,"esp"),$ivec); # save IV
1854 &movups ($inout0,&QWP(0,$inp));
1855 &xorps ($inout5,$inout5);
1856 &call ("_aesni_decrypt6");
1857 &movups ($rndkey1,&QWP(0,$inp));
1858 &movups ($rndkey0,&QWP(0x10,$inp));
1859 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
1860 &xorps ($inout1,$rndkey1);
1861 &movups ($rndkey1,&QWP(0x20,$inp));
1862 &xorps ($inout2,$rndkey0);
1863 &movups ($rndkey0,&QWP(0x30,$inp));
1864 &xorps ($inout3,$rndkey1);
1865 &movups ($ivec,&QWP(0x40,$inp)); # IV
1866 &xorps ($inout4,$rndkey0);
1867 &movups (&QWP(0,$out),$inout0);
1868 &movups (&QWP(0x10,$out),$inout1);
1869 &movups (&QWP(0x20,$out),$inout2);
1870 &movups (&QWP(0x30,$out),$inout3);
1871 &lea ($out,&DWP(0x40,$out));
1872 &movaps ($inout0,$inout4);
1873 &sub ($len,0x50);
1874 &jmp (&label("cbc_dec_tail_collected"));
1875
1876&set_label("cbc_dec_one",16);
1877 if ($inline)
1878 { &aesni_inline_generate1("dec"); }
1879 else
1880 { &call ("_aesni_decrypt1"); }
1881 &xorps ($inout0,$ivec);
1882 &movaps ($ivec,$in0);
1883 &sub ($len,0x10);
1884 &jmp (&label("cbc_dec_tail_collected"));
1885
1886&set_label("cbc_dec_two",16);
1887 &xorps ($inout2,$inout2);
1888 &call ("_aesni_decrypt3");
1889 &xorps ($inout0,$ivec);
1890 &xorps ($inout1,$in0);
1891 &movups (&QWP(0,$out),$inout0);
1892 &movaps ($inout0,$inout1);
1893 &lea ($out,&DWP(0x10,$out));
1894 &movaps ($ivec,$in1);
1895 &sub ($len,0x20);
1896 &jmp (&label("cbc_dec_tail_collected"));
1897
1898&set_label("cbc_dec_three",16);
1899 &call ("_aesni_decrypt3");
1900 &xorps ($inout0,$ivec);
1901 &xorps ($inout1,$in0);
1902 &xorps ($inout2,$in1);
1903 &movups (&QWP(0,$out),$inout0);
1904 &movaps ($inout0,$inout2);
1905 &movups (&QWP(0x10,$out),$inout1);
1906 &lea ($out,&DWP(0x20,$out));
1907 &movups ($ivec,&QWP(0x20,$inp));
1908 &sub ($len,0x30);
1909 &jmp (&label("cbc_dec_tail_collected"));
1910
1911&set_label("cbc_dec_four",16);
1912 &call ("_aesni_decrypt4");
1913 &movups ($rndkey1,&QWP(0x10,$inp));
1914 &movups ($rndkey0,&QWP(0x20,$inp));
1915 &xorps ($inout0,$ivec);
1916 &movups ($ivec,&QWP(0x30,$inp));
1917 &xorps ($inout1,$in0);
1918 &movups (&QWP(0,$out),$inout0);
1919 &xorps ($inout2,$rndkey1);
1920 &movups (&QWP(0x10,$out),$inout1);
1921 &xorps ($inout3,$rndkey0);
1922 &movups (&QWP(0x20,$out),$inout2);
1923 &lea ($out,&DWP(0x30,$out));
1924 &movaps ($inout0,$inout3);
1925 &sub ($len,0x40);
1926
1927&set_label("cbc_dec_tail_collected");
1928 &and ($len,15);
1929 &jnz (&label("cbc_dec_tail_partial"));
1930 &movups (&QWP(0,$out),$inout0);
1931 &jmp (&label("cbc_ret"));
1932
1933&set_label("cbc_dec_tail_partial",16);
1934 &movaps (&QWP(0,"esp"),$inout0);
1935 &mov ("ecx",16);
1936 &mov ($inp,"esp");
1937 &sub ("ecx",$len);
1938 &data_word(0xA4F3F689); # rep movsb
1939
1940&set_label("cbc_ret");
1941 &mov ("esp",&DWP(16,"esp")); # pull original %esp
1942 &mov ($key_,&wparam(4));
1943 &movups (&QWP(0,$key_),$ivec); # output IV
1944&set_label("cbc_abort");
1945&function_end("${PREFIX}_cbc_encrypt");
1946
1947######################################################################
1948# Mechanical port from aesni-x86_64.pl.
1949#
1950# _aesni_set_encrypt_key is a private interface,
1951# input:
1952# "eax" const unsigned char *userKey
1953# $rounds int bits
1954# $key AES_KEY *key
1955# output:
1956# "eax" return code
1957# $rounds rounds
1958
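# For orientation, each key_128 call below computes one AES-128
# key-schedule step. An equivalent C intrinsics sketch of that step
# (a common formulation, not a transcription of the shufps/xorps
# sequence used here):
#
#	/* hypothetical illustration only */
#	#include <wmmintrin.h>
#	static __m128i key_128_step(__m128i key, __m128i gen)
#	{
#		/* gen = _mm_aeskeygenassist_si128(key, rcon) */
#		gen = _mm_shuffle_epi32(gen, 0xff);
#		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
#		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
#		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
#		return _mm_xor_si128(key, gen);
#	}
#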
1959&function_begin_B("_aesni_set_encrypt_key");
1960 &test ("eax","eax");
1961 &jz (&label("bad_pointer"));
1962 &test ($key,$key);
1963 &jz (&label("bad_pointer"));
1964
1965 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
1966 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
1967 &lea ($key,&DWP(16,$key));
1968 &cmp ($rounds,256);
1969 &je (&label("14rounds"));
1970 &cmp ($rounds,192);
1971 &je (&label("12rounds"));
1972 &cmp ($rounds,128);
1973 &jne (&label("bad_keybits"));
1974
1975&set_label("10rounds",16);
1976 &mov ($rounds,9);
1977 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
1978 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
1979 &call (&label("key_128_cold"));
1980 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
1981 &call (&label("key_128"));
1982 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
1983 &call (&label("key_128"));
1984 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
1985 &call (&label("key_128"));
1986 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
1987 &call (&label("key_128"));
1988 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
1989 &call (&label("key_128"));
1990 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
1991 &call (&label("key_128"));
1992 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
1993 &call (&label("key_128"));
1994 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
1995 &call (&label("key_128"));
1996 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
1997 &call (&label("key_128"));
1998 &$movekey (&QWP(0,$key),"xmm0");
1999 &mov (&DWP(80,$key),$rounds);
2000 &xor ("eax","eax");
2001 &ret();
2002
2003&set_label("key_128",16);
2004 &$movekey (&QWP(0,$key),"xmm0");
2005 &lea ($key,&DWP(16,$key));
2006&set_label("key_128_cold");
2007 &shufps ("xmm4","xmm0",0b00010000);
2008 &xorps ("xmm0","xmm4");
2009 &shufps ("xmm4","xmm0",0b10001100);
2010 &xorps ("xmm0","xmm4");
2011 &shufps ("xmm1","xmm1",0b11111111); # critical path
2012 &xorps ("xmm0","xmm1");
2013 &ret();
2014
2015&set_label("12rounds",16);
2016 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
2017 &mov ($rounds,11);
2018	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
2019 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
2020 &call (&label("key_192a_cold"));
2021 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
2022 &call (&label("key_192b"));
2023 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
2024 &call (&label("key_192a"));
2025 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
2026 &call (&label("key_192b"));
2027 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
2028 &call (&label("key_192a"));
2029 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
2030 &call (&label("key_192b"));
2031 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
2032 &call (&label("key_192a"));
2033 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
2034 &call (&label("key_192b"));
2035 &$movekey (&QWP(0,$key),"xmm0");
2036 &mov (&DWP(48,$key),$rounds);
2037 &xor ("eax","eax");
2038 &ret();
2039
2040&set_label("key_192a",16);
2041 &$movekey (&QWP(0,$key),"xmm0");
2042 &lea ($key,&DWP(16,$key));
2043&set_label("key_192a_cold",16);
2044 &movaps ("xmm5","xmm2");
2045&set_label("key_192b_warm");
2046 &shufps ("xmm4","xmm0",0b00010000);
2047 &movdqa ("xmm3","xmm2");
2048 &xorps ("xmm0","xmm4");
2049 &shufps ("xmm4","xmm0",0b10001100);
2050 &pslldq ("xmm3",4);
2051 &xorps ("xmm0","xmm4");
2052 &pshufd ("xmm1","xmm1",0b01010101); # critical path
2053 &pxor ("xmm2","xmm3");
2054 &pxor ("xmm0","xmm1");
2055 &pshufd ("xmm3","xmm0",0b11111111);
2056 &pxor ("xmm2","xmm3");
2057 &ret();
2058
2059&set_label("key_192b",16);
2060 &movaps ("xmm3","xmm0");
2061 &shufps ("xmm5","xmm0",0b01000100);
2062 &$movekey (&QWP(0,$key),"xmm5");
2063 &shufps ("xmm3","xmm2",0b01001110);
2064 &$movekey (&QWP(16,$key),"xmm3");
2065 &lea ($key,&DWP(32,$key));
2066 &jmp (&label("key_192b_warm"));
2067
2068&set_label("14rounds",16);
2069 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
2070 &mov ($rounds,13);
2071 &lea ($key,&DWP(16,$key));
2072 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
2073 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
2074 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
2075 &call (&label("key_256a_cold"));
2076 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
2077 &call (&label("key_256b"));
2078 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
2079 &call (&label("key_256a"));
2080 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
2081 &call (&label("key_256b"));
2082 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
2083 &call (&label("key_256a"));
2084 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
2085 &call (&label("key_256b"));
2086 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
2087 &call (&label("key_256a"));
2088 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
2089 &call (&label("key_256b"));
2090 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
2091 &call (&label("key_256a"));
2092 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
2093 &call (&label("key_256b"));
2094 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
2095 &call (&label("key_256a"));
2096 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
2097 &call (&label("key_256b"));
2098 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
2099 &call (&label("key_256a"));
2100 &$movekey (&QWP(0,$key),"xmm0");
2101 &mov (&DWP(16,$key),$rounds);
2102 &xor ("eax","eax");
2103 &ret();
2104
2105&set_label("key_256a",16);
2106 &$movekey (&QWP(0,$key),"xmm2");
2107 &lea ($key,&DWP(16,$key));
2108&set_label("key_256a_cold");
2109 &shufps ("xmm4","xmm0",0b00010000);
2110 &xorps ("xmm0","xmm4");
2111 &shufps ("xmm4","xmm0",0b10001100);
2112 &xorps ("xmm0","xmm4");
2113 &shufps ("xmm1","xmm1",0b11111111); # critical path
2114 &xorps ("xmm0","xmm1");
2115 &ret();
2116
2117&set_label("key_256b",16);
2118 &$movekey (&QWP(0,$key),"xmm0");
2119 &lea ($key,&DWP(16,$key));
2120
2121 &shufps ("xmm4","xmm2",0b00010000);
2122 &xorps ("xmm2","xmm4");
2123 &shufps ("xmm4","xmm2",0b10001100);
2124 &xorps ("xmm2","xmm4");
2125 &shufps ("xmm1","xmm1",0b10101010); # critical path
2126 &xorps ("xmm2","xmm1");
2127 &ret();
2128
2129&set_label("bad_pointer",4);
2130 &mov ("eax",-1);
2131 &ret ();
2132&set_label("bad_keybits",4);
2133 &mov ("eax",-2);
2134 &ret ();
2135&function_end_B("_aesni_set_encrypt_key");
2136
2137# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
2138# AES_KEY *key)
2139&function_begin_B("${PREFIX}_set_encrypt_key");
2140 &mov ("eax",&wparam(0));
2141 &mov ($rounds,&wparam(1));
2142 &mov ($key,&wparam(2));
2143 &call ("_aesni_set_encrypt_key");
2144 &ret ();
2145&function_end_B("${PREFIX}_set_encrypt_key");
2146
2147# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
2148# AES_KEY *key)
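# The decrypt schedule is the encrypt schedule in reverse order with
# every round key except the two ends passed through InvMixColumns
# (the aesimc instruction), which is what the swap/aesimc loop below
# does. Roughly, in C intrinsics (illustrative sketch only):
#
#	/* hypothetical illustration only */
#	__m128i enc[15], dec[15];	/* rounds+1 round keys */
#	dec[0] = enc[rounds];
#	for (i = 1; i < rounds; i++)
#		dec[i] = _mm_aesimc_si128(enc[rounds - i]);
#	dec[rounds] = enc[0];
#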
2149&function_begin_B("${PREFIX}_set_decrypt_key");
2150 &mov ("eax",&wparam(0));
2151 &mov ($rounds,&wparam(1));
2152 &mov ($key,&wparam(2));
2153 &call ("_aesni_set_encrypt_key");
2154 &mov ($key,&wparam(2));
2155	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
2156 &test ("eax","eax");
2157 &jnz (&label("dec_key_ret"));
2158 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
2159
2160 &$movekey ("xmm0",&QWP(0,$key)); # just swap
2161 &$movekey ("xmm1",&QWP(0,"eax"));
2162 &$movekey (&QWP(0,"eax"),"xmm0");
2163 &$movekey (&QWP(0,$key),"xmm1");
2164 &lea ($key,&DWP(16,$key));
2165 &lea ("eax",&DWP(-16,"eax"));
2166
2167&set_label("dec_key_inverse");
2168 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
2169 &$movekey ("xmm1",&QWP(0,"eax"));
2170 &aesimc ("xmm0","xmm0");
2171 &aesimc ("xmm1","xmm1");
2172 &lea ($key,&DWP(16,$key));
2173 &lea ("eax",&DWP(-16,"eax"));
2174 &$movekey (&QWP(16,"eax"),"xmm0");
2175 &$movekey (&QWP(-16,$key),"xmm1");
2176 &cmp ("eax",$key);
2177 &ja (&label("dec_key_inverse"));
2178
2179 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
2180 &aesimc ("xmm0","xmm0");
2181 &$movekey (&QWP(0,$key),"xmm0");
2182
2183 &xor ("eax","eax"); # return success
2184&set_label("dec_key_ret");
2185 &ret ();
2186&function_end_B("${PREFIX}_set_decrypt_key");
2187&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
2188
2189&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
deleted file mode 100644
index c073667fcb..0000000000
--- a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
+++ /dev/null
@@ -1,3041 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for Intel AES-NI extension. In
11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
13# details].
14#
15# Performance.
16#
17# Given aes(enc|dec) instructions' latency, asymptotic performance for
18# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
19# processed with 128-bit key. And given their throughput, asymptotic
20# performance for parallelizable modes is 1.25 cycles per byte. Being an
21# asymptotic limit it's not something you commonly achieve in reality,
22# but how close does one get? Below are results collected for
23# different modes and block sizes. Pairs of numbers are for en-/
24# decryption.
25#
26# 16-byte 64-byte 256-byte 1-KB 8-KB
27# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
28# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
29# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
30# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
31# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
32# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
33#
34# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
35# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
36# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
37# The results were collected with specially crafted speed.c benchmark
38# in order to compare them with results reported in "Intel Advanced
39# Encryption Standard (AES) New Instruction Set" White Paper Revision
40# 3.0 dated May 2010. All above results are consistently better. This
41# module also provides better performance for block sizes smaller than
42# 128 bytes in points *not* represented in the above table.
43#
44# Looking at the results for 8-KB buffer.
45#
46# CFB and OFB results are far from the limit, because implementation
47# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
48# single-block aesni_encrypt, which is not the most optimal way to go.
49# CBC encrypt result is unexpectedly high and there is no documented
50# explanation for it. Seemingly there is a small penalty for feeding
51# the result back to AES unit the way it's done in CBC mode. There is
52# nothing one can do and the result appears optimal. CCM result is
53# identical to CBC, because CBC-MAC is essentially CBC encrypt without
54# saving output. CCM CTR "stays invisible," because it's neatly
55# interleaved with CBC-MAC. This provides ~30% improvement over
56# "straightforward" CCM implementation with CTR and CBC-MAC performed
57# disjointly. Parallelizable modes practically achieve the theoretical
58# limit.
59#
60# Looking at how results vary with buffer size.
61#
62# Curves are practically saturated at 1-KB buffer size. In most cases
63# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
64# CTR curve doesn't follow this pattern and is the "slowest"-changing one
65# with "256-byte" result being 87% of "8-KB." This is because overhead
66# in CTR mode is most computationally intensive. Small-block CCM
67# decrypt is slower than encrypt, because first CTR and last CBC-MAC
68# iterations can't be interleaved.
69#
70# Results for 192- and 256-bit keys.
71#
72# EVP-free results were observed to scale perfectly with number of
73# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
74# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
75# are a tad smaller, because the above mentioned penalty biases all
76# results by same constant value. In similar way function call
77# overhead affects small-block performance, as well as OFB and CFB
78# results. Differences are not large, most common coefficients are
79# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
80# observes even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
81
82# January 2011
83#
84# While Westmere processor features 6 cycles latency for aes[enc|dec]
85# instructions, which can be scheduled every second cycle, Sandy
86# Bridge spends 8 cycles per instruction, but it can schedule them
87# every cycle. This means that code targeting Westmere would perform
88# suboptimally on Sandy Bridge. Therefore this update.
89#
90# In addition, non-parallelizable CBC encrypt (as well as CCM) is
91# optimized. Relative improvement might appear modest, 8% on Westmere,
92# but in absolute terms it's 3.77 cycles per byte encrypted with
93# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
94# should be compared to asymptotic limits of 3.75 for Westmere and
95# 5.00 for Sandy Bridge. Actually, the fact that they get this close
96# to asymptotic limits is quite amazing. Indeed, the limit is
97# calculated as latency times number of rounds, 10 for 128-bit key,
98# and divided by 16, the number of bytes in block, or in other words
99# it accounts *solely* for aesenc instructions. But there are extra
100# instructions, and numbers so close to the asymptotic limits mean
101# that it's as if it takes as little as *one* additional cycle to
102# execute all of them. How is it possible? It is possible thanks to
103# out-of-order execution logic, which manages to overlap post-
104# processing of previous block, things like saving the output, with
105# actual encryption of current block, as well as pre-processing of
106# current block, things like fetching input and xor-ing it with
107# 0-round element of the key schedule, with actual encryption of
108# previous block. Keep this in mind...
109#
110# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
111# performance is achieved by interleaving instructions working on
112# independent blocks. In which case asymptotic limit for such modes
113# can be obtained by dividing above mentioned numbers by AES
114# instructions' interleave factor. Westmere can execute at most 3
115# instructions at a time, meaning that optimal interleave factor is 3,
116# and that's where the "magic" number of 1.25 comes from. "Optimal
117# interleave factor" means that increase of interleave factor does
118# not improve performance. The formula has proven to reflect reality
119# pretty well on Westmere... Sandy Bridge on the other hand can
120# execute up to 8 AES instructions at a time, so how does varying
121# interleave factor affect the performance? Here is table for ECB
122# (numbers are cycles per byte processed with 128-bit key):
123#
124# instruction interleave factor 3x 6x 8x
125# theoretical asymptotic limit 1.67 0.83 0.625
126# measured performance for 8KB block 1.05 0.86 0.84
127#
128# "as if" interleave factor 4.7x 5.8x 6.0x
129#
130# Further data for other parallelizable modes:
131#
132# CBC decrypt 1.16 0.93 0.93
133# CTR 1.14 0.91 n/a
134#
135# Well, given 3x column it's probably inappropriate to call the limit
136# asymptotic, if it can be surpassed, isn't it? What happens there?
137# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
138# magic is responsible for this. Processor overlaps not only the
139# additional instructions with AES ones, but even AES instructions
140# processing adjacent triplets of independent blocks. In the 6x case
141# additional instructions still claim disproportionally small amount
142# of additional cycles, but in 8x case number of instructions must be
143# a tad too high for out-of-order logic to cope with, and AES unit
144# remains underutilized... As you can see 8x interleave is hardly
145# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
146# utilizes 6x interleave because of limited register bank capacity.
147#
148# Higher interleave factors do have negative impact on Westmere
149# performance. While for ECB mode it's negligible ~1.5%, other
150# parallelizables perform ~5% worse, which is outweighed by ~25%
151# improvement on Sandy Bridge. To balance regression on Westmere
152# CTR mode was implemented with 6x aesenc interleave factor.
153
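The arithmetic behind these limits is easy to check mechanically. A throwaway sketch, using only the latencies, round counts and interleave factors quoted above:

    #include <stdio.h>

    /* cycles/byte = latency * rounds / 16, divided by interleave factor */
    static double aes_limit(double latency, int rounds, int interleave)
    {
        return latency * rounds / 16.0 / interleave;
    }

    int main(void)
    {
        printf("Westmere CBC encrypt: %.2f\n", aes_limit(6, 10, 1)); /* 3.75 */
        printf("Westmere ECB 3x:      %.2f\n", aes_limit(6, 10, 3)); /* 1.25 */
        printf("Sandy Bridge CBC:     %.2f\n", aes_limit(8, 10, 1)); /* 5.00 */
        printf("Sandy Bridge ECB 3x:  %.2f\n", aes_limit(8, 10, 3)); /* 1.67 */
        printf("Sandy Bridge ECB 8x:  %.3f\n", aes_limit(8, 10, 8)); /* 0.625 */
        return 0;
    }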
154# April 2011
155#
156# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
157# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
158# in CTR mode AES instruction interleave factor was chosen to be 6x.
159
160$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
161 # generates drop-in replacement for
162 # crypto/aes/asm/aes-x86_64.pl:-)
163
164$flavour = shift;
165$output = shift;
166if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
167
168$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
169
170$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
171( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
172( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
173die "can't locate x86_64-xlate.pl";
174
175open OUT,"| \"$^X\" $xlate $flavour $output";
176*STDOUT=*OUT;
177
178$movkey = $PREFIX eq "aesni" ? "movups" : "movups"; # unaligned loads either way
179@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
180 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
181
182$code=".text\n";
183
184$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
185# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
186$inp="%rdi";
187$out="%rsi";
188$len="%rdx";
189$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
190$ivp="%r8"; # cbc, ctr, ...
191
192$rnds_="%r10d"; # backup copy for $rounds
193$key_="%r11"; # backup copy for $key
194
195# %xmm register layout
196$rndkey0="%xmm0"; $rndkey1="%xmm1";
197$inout0="%xmm2"; $inout1="%xmm3";
198$inout2="%xmm4"; $inout3="%xmm5";
199$inout4="%xmm6"; $inout5="%xmm7";
200$inout6="%xmm8"; $inout7="%xmm9";
201
202$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
203$in0="%xmm8"; $iv="%xmm9";
204
205# Inline version of internal aesni_[en|de]crypt1.
206#
207# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
208# cycles which take care of loop variables...
209{ my $sn;
210sub aesni_generate1 {
211my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
212++$sn;
213$code.=<<___;
214 $movkey ($key),$rndkey0
215 $movkey 16($key),$rndkey1
216___
217$code.=<<___ if (defined($ivec));
218 xorps $rndkey0,$ivec
219 lea 32($key),$key
220 xorps $ivec,$inout
221___
222$code.=<<___ if (!defined($ivec));
223 lea 32($key),$key
224 xorps $rndkey0,$inout
225___
226$code.=<<___;
227.Loop_${p}1_$sn:
228 aes${p} $rndkey1,$inout
229 dec $rounds
230 $movkey ($key),$rndkey1
231 lea 16($key),$key
232 jnz .Loop_${p}1_$sn # loop body is 16 bytes
233 aes${p}last $rndkey1,$inout
234___
235}}
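As a mental model, the generated single-block code behaves like the C sketch below (hypothetical, using AES-NI intrinsics; note that 240(key) in this module appears to hold Nr-1, i.e. 9/11/13, as the `lea 1($rounds,$rounds)` restore arithmetic elsewhere in the file implies):

    #include <wmmintrin.h>

    static __m128i model_encrypt1(__m128i in, const __m128i *rk, int rounds)
    {
        __m128i state = _mm_xor_si128(in, rk[0]);   /* 0-round whitening */
        for (int r = 1; r <= rounds; r++)           /* the 16-byte loop body */
            state = _mm_aesenc_si128(state, rk[r]);
        return _mm_aesenclast_si128(state, rk[rounds + 1]);
    }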
236# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
237#
238{ my ($inp,$out,$key) = @_4args;
239
240$code.=<<___;
241.globl ${PREFIX}_encrypt
242.type ${PREFIX}_encrypt,\@abi-omnipotent
243.align 16
244${PREFIX}_encrypt:
245 movups ($inp),$inout0 # load input
246 mov 240($key),$rounds # key->rounds
247___
248 &aesni_generate1("enc",$key,$rounds);
249$code.=<<___;
250 movups $inout0,($out) # output
251 ret
252.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
253
254.globl ${PREFIX}_decrypt
255.type ${PREFIX}_decrypt,\@abi-omnipotent
256.align 16
257${PREFIX}_decrypt:
258 movups ($inp),$inout0 # load input
259 mov 240($key),$rounds # key->rounds
260___
261 &aesni_generate1("dec",$key,$rounds);
262$code.=<<___;
263 movups $inout0,($out) # output
264 ret
265.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
266___
267}
268
269# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
270# factor. Why were 3x subroutines originally used in loops? Even though
271# aes[enc|dec] latency was originally 6, it could be scheduled only
272# every *2nd* cycle. Thus 3x interleave was the one providing optimal
273# utilization, i.e. when the subroutine's throughput is virtually the
274# same as that of a non-interleaved one [for up to 3 input blocks].
275# This is why it makes no sense to implement 2x subroutine.
276# aes[enc|dec] latency in next processor generation is 8, but the
277# instructions can be scheduled every cycle. Optimal interleave for
278# new processor is therefore 8x...
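Put differently, the optimal interleave factor is just latency divided by issue interval; a trivial sanity check with the figures quoted above:

    #include <stdio.h>

    int main(void)
    {
        /* instructions in flight = latency / issue interval */
        printf("Westmere:     %dx\n", 6 / 2);   /* 3x */
        printf("Sandy Bridge: %dx\n", 8 / 1);   /* 8x */
        return 0;
    }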
279sub aesni_generate3 {
280my $dir=shift;
281# As already mentioned it takes in $key and $rounds, which are *not*
282# preserved. $inout[0-2] is cipher/clear text...
283$code.=<<___;
284.type _aesni_${dir}rypt3,\@abi-omnipotent
285.align 16
286_aesni_${dir}rypt3:
287 $movkey ($key),$rndkey0
288 shr \$1,$rounds
289 $movkey 16($key),$rndkey1
290 lea 32($key),$key
291 xorps $rndkey0,$inout0
292 xorps $rndkey0,$inout1
293 xorps $rndkey0,$inout2
294 $movkey ($key),$rndkey0
295
296.L${dir}_loop3:
297 aes${dir} $rndkey1,$inout0
298 aes${dir} $rndkey1,$inout1
299 dec $rounds
300 aes${dir} $rndkey1,$inout2
301 $movkey 16($key),$rndkey1
302 aes${dir} $rndkey0,$inout0
303 aes${dir} $rndkey0,$inout1
304 lea 32($key),$key
305 aes${dir} $rndkey0,$inout2
306 $movkey ($key),$rndkey0
307 jnz .L${dir}_loop3
308
309 aes${dir} $rndkey1,$inout0
310 aes${dir} $rndkey1,$inout1
311 aes${dir} $rndkey1,$inout2
312 aes${dir}last $rndkey0,$inout0
313 aes${dir}last $rndkey0,$inout1
314 aes${dir}last $rndkey0,$inout2
315 ret
316.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
317___
318}
319# 4x interleave is implemented to improve small block performance,
320# most notably [and naturally] 4-block by ~30%. One can argue that one
321# should have implemented 5x as well, but improvement would be <20%,
322# so it's not worth it...
323sub aesni_generate4 {
324my $dir=shift;
325# As already mentioned it takes in $key and $rounds, which are *not*
326# preserved. $inout[0-3] is cipher/clear text...
327$code.=<<___;
328.type _aesni_${dir}rypt4,\@abi-omnipotent
329.align 16
330_aesni_${dir}rypt4:
331 $movkey ($key),$rndkey0
332 shr \$1,$rounds
333 $movkey 16($key),$rndkey1
334 lea 32($key),$key
335 xorps $rndkey0,$inout0
336 xorps $rndkey0,$inout1
337 xorps $rndkey0,$inout2
338 xorps $rndkey0,$inout3
339 $movkey ($key),$rndkey0
340
341.L${dir}_loop4:
342 aes${dir} $rndkey1,$inout0
343 aes${dir} $rndkey1,$inout1
344 dec $rounds
345 aes${dir} $rndkey1,$inout2
346 aes${dir} $rndkey1,$inout3
347 $movkey 16($key),$rndkey1
348 aes${dir} $rndkey0,$inout0
349 aes${dir} $rndkey0,$inout1
350 lea 32($key),$key
351 aes${dir} $rndkey0,$inout2
352 aes${dir} $rndkey0,$inout3
353 $movkey ($key),$rndkey0
354 jnz .L${dir}_loop4
355
356 aes${dir} $rndkey1,$inout0
357 aes${dir} $rndkey1,$inout1
358 aes${dir} $rndkey1,$inout2
359 aes${dir} $rndkey1,$inout3
360 aes${dir}last $rndkey0,$inout0
361 aes${dir}last $rndkey0,$inout1
362 aes${dir}last $rndkey0,$inout2
363 aes${dir}last $rndkey0,$inout3
364 ret
365.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
366___
367}
368sub aesni_generate6 {
369my $dir=shift;
370# As already mentioned it takes in $key and $rounds, which are *not*
371# preserved. $inout[0-5] is cipher/clear text...
372$code.=<<___;
373.type _aesni_${dir}rypt6,\@abi-omnipotent
374.align 16
375_aesni_${dir}rypt6:
376 $movkey ($key),$rndkey0
377 shr \$1,$rounds
378 $movkey 16($key),$rndkey1
379 lea 32($key),$key
380 xorps $rndkey0,$inout0
381 pxor $rndkey0,$inout1
382 aes${dir} $rndkey1,$inout0
383 pxor $rndkey0,$inout2
384 aes${dir} $rndkey1,$inout1
385 pxor $rndkey0,$inout3
386 aes${dir} $rndkey1,$inout2
387 pxor $rndkey0,$inout4
388 aes${dir} $rndkey1,$inout3
389 pxor $rndkey0,$inout5
390 dec $rounds
391 aes${dir} $rndkey1,$inout4
392 $movkey ($key),$rndkey0
393 aes${dir} $rndkey1,$inout5
394 jmp .L${dir}_loop6_enter
395.align 16
396.L${dir}_loop6:
397 aes${dir} $rndkey1,$inout0
398 aes${dir} $rndkey1,$inout1
399 dec $rounds
400 aes${dir} $rndkey1,$inout2
401 aes${dir} $rndkey1,$inout3
402 aes${dir} $rndkey1,$inout4
403 aes${dir} $rndkey1,$inout5
404.L${dir}_loop6_enter: # happens to be 16-byte aligned
405 $movkey 16($key),$rndkey1
406 aes${dir} $rndkey0,$inout0
407 aes${dir} $rndkey0,$inout1
408 lea 32($key),$key
409 aes${dir} $rndkey0,$inout2
410 aes${dir} $rndkey0,$inout3
411 aes${dir} $rndkey0,$inout4
412 aes${dir} $rndkey0,$inout5
413 $movkey ($key),$rndkey0
414 jnz .L${dir}_loop6
415
416 aes${dir} $rndkey1,$inout0
417 aes${dir} $rndkey1,$inout1
418 aes${dir} $rndkey1,$inout2
419 aes${dir} $rndkey1,$inout3
420 aes${dir} $rndkey1,$inout4
421 aes${dir} $rndkey1,$inout5
422 aes${dir}last $rndkey0,$inout0
423 aes${dir}last $rndkey0,$inout1
424 aes${dir}last $rndkey0,$inout2
425 aes${dir}last $rndkey0,$inout3
426 aes${dir}last $rndkey0,$inout4
427 aes${dir}last $rndkey0,$inout5
428 ret
429.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
430___
431}
432sub aesni_generate8 {
433my $dir=shift;
434# As already mentioned it takes in $key and $rounds, which are *not*
435# preserved. $inout[0-7] is cipher/clear text...
436$code.=<<___;
437.type _aesni_${dir}rypt8,\@abi-omnipotent
438.align 16
439_aesni_${dir}rypt8:
440 $movkey ($key),$rndkey0
441 shr \$1,$rounds
442 $movkey 16($key),$rndkey1
443 lea 32($key),$key
444 xorps $rndkey0,$inout0
445 xorps $rndkey0,$inout1
446 aes${dir} $rndkey1,$inout0
447 pxor $rndkey0,$inout2
448 aes${dir} $rndkey1,$inout1
449 pxor $rndkey0,$inout3
450 aes${dir} $rndkey1,$inout2
451 pxor $rndkey0,$inout4
452 aes${dir} $rndkey1,$inout3
453 pxor $rndkey0,$inout5
454 dec $rounds
455 aes${dir} $rndkey1,$inout4
456 pxor $rndkey0,$inout6
457 aes${dir} $rndkey1,$inout5
458 pxor $rndkey0,$inout7
459 $movkey ($key),$rndkey0
460 aes${dir} $rndkey1,$inout6
461 aes${dir} $rndkey1,$inout7
462 $movkey 16($key),$rndkey1
463 jmp .L${dir}_loop8_enter
464.align 16
465.L${dir}_loop8:
466 aes${dir} $rndkey1,$inout0
467 aes${dir} $rndkey1,$inout1
468 dec $rounds
469 aes${dir} $rndkey1,$inout2
470 aes${dir} $rndkey1,$inout3
471 aes${dir} $rndkey1,$inout4
472 aes${dir} $rndkey1,$inout5
473 aes${dir} $rndkey1,$inout6
474 aes${dir} $rndkey1,$inout7
475 $movkey 16($key),$rndkey1
476.L${dir}_loop8_enter: # happens to be 16-byte aligned
477 aes${dir} $rndkey0,$inout0
478 aes${dir} $rndkey0,$inout1
479 lea 32($key),$key
480 aes${dir} $rndkey0,$inout2
481 aes${dir} $rndkey0,$inout3
482 aes${dir} $rndkey0,$inout4
483 aes${dir} $rndkey0,$inout5
484 aes${dir} $rndkey0,$inout6
485 aes${dir} $rndkey0,$inout7
486 $movkey ($key),$rndkey0
487 jnz .L${dir}_loop8
488
489 aes${dir} $rndkey1,$inout0
490 aes${dir} $rndkey1,$inout1
491 aes${dir} $rndkey1,$inout2
492 aes${dir} $rndkey1,$inout3
493 aes${dir} $rndkey1,$inout4
494 aes${dir} $rndkey1,$inout5
495 aes${dir} $rndkey1,$inout6
496 aes${dir} $rndkey1,$inout7
497 aes${dir}last $rndkey0,$inout0
498 aes${dir}last $rndkey0,$inout1
499 aes${dir}last $rndkey0,$inout2
500 aes${dir}last $rndkey0,$inout3
501 aes${dir}last $rndkey0,$inout4
502 aes${dir}last $rndkey0,$inout5
503 aes${dir}last $rndkey0,$inout6
504 aes${dir}last $rndkey0,$inout7
505 ret
506.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
507___
508}
509&aesni_generate3("enc") if ($PREFIX eq "aesni");
510&aesni_generate3("dec");
511&aesni_generate4("enc") if ($PREFIX eq "aesni");
512&aesni_generate4("dec");
513&aesni_generate6("enc") if ($PREFIX eq "aesni");
514&aesni_generate6("dec");
515&aesni_generate8("enc") if ($PREFIX eq "aesni");
516&aesni_generate8("dec");
517
518if ($PREFIX eq "aesni") {
519########################################################################
520# void aesni_ecb_encrypt (const void *in, void *out,
521# size_t length, const AES_KEY *key,
522# int enc);
523$code.=<<___;
524.globl aesni_ecb_encrypt
525.type aesni_ecb_encrypt,\@function,5
526.align 16
527aesni_ecb_encrypt:
528 and \$-16,$len
529 jz .Lecb_ret
530
531 mov 240($key),$rounds # key->rounds
532 $movkey ($key),$rndkey0
533 mov $key,$key_ # backup $key
534 mov $rounds,$rnds_ # backup $rounds
535 test %r8d,%r8d # 5th argument
536 jz .Lecb_decrypt
537#--------------------------- ECB ENCRYPT ------------------------------#
538 cmp \$0x80,$len
539 jb .Lecb_enc_tail
540
541 movdqu ($inp),$inout0
542 movdqu 0x10($inp),$inout1
543 movdqu 0x20($inp),$inout2
544 movdqu 0x30($inp),$inout3
545 movdqu 0x40($inp),$inout4
546 movdqu 0x50($inp),$inout5
547 movdqu 0x60($inp),$inout6
548 movdqu 0x70($inp),$inout7
549 lea 0x80($inp),$inp
550 sub \$0x80,$len
551 jmp .Lecb_enc_loop8_enter
552.align 16
553.Lecb_enc_loop8:
554 movups $inout0,($out)
555 mov $key_,$key # restore $key
556 movdqu ($inp),$inout0
557 mov $rnds_,$rounds # restore $rounds
558 movups $inout1,0x10($out)
559 movdqu 0x10($inp),$inout1
560 movups $inout2,0x20($out)
561 movdqu 0x20($inp),$inout2
562 movups $inout3,0x30($out)
563 movdqu 0x30($inp),$inout3
564 movups $inout4,0x40($out)
565 movdqu 0x40($inp),$inout4
566 movups $inout5,0x50($out)
567 movdqu 0x50($inp),$inout5
568 movups $inout6,0x60($out)
569 movdqu 0x60($inp),$inout6
570 movups $inout7,0x70($out)
571 lea 0x80($out),$out
572 movdqu 0x70($inp),$inout7
573 lea 0x80($inp),$inp
574.Lecb_enc_loop8_enter:
575
576 call _aesni_encrypt8
577
578 sub \$0x80,$len
579 jnc .Lecb_enc_loop8
580
581 movups $inout0,($out)
582 mov $key_,$key # restore $key
583 movups $inout1,0x10($out)
584 mov $rnds_,$rounds # restore $rounds
585 movups $inout2,0x20($out)
586 movups $inout3,0x30($out)
587 movups $inout4,0x40($out)
588 movups $inout5,0x50($out)
589 movups $inout6,0x60($out)
590 movups $inout7,0x70($out)
591 lea 0x80($out),$out
592 add \$0x80,$len
593 jz .Lecb_ret
594
595.Lecb_enc_tail:
596 movups ($inp),$inout0
597 cmp \$0x20,$len
598 jb .Lecb_enc_one
599 movups 0x10($inp),$inout1
600 je .Lecb_enc_two
601 movups 0x20($inp),$inout2
602 cmp \$0x40,$len
603 jb .Lecb_enc_three
604 movups 0x30($inp),$inout3
605 je .Lecb_enc_four
606 movups 0x40($inp),$inout4
607 cmp \$0x60,$len
608 jb .Lecb_enc_five
609 movups 0x50($inp),$inout5
610 je .Lecb_enc_six
611 movdqu 0x60($inp),$inout6
612 call _aesni_encrypt8
613 movups $inout0,($out)
614 movups $inout1,0x10($out)
615 movups $inout2,0x20($out)
616 movups $inout3,0x30($out)
617 movups $inout4,0x40($out)
618 movups $inout5,0x50($out)
619 movups $inout6,0x60($out)
620 jmp .Lecb_ret
621.align 16
622.Lecb_enc_one:
623___
624 &aesni_generate1("enc",$key,$rounds);
625$code.=<<___;
626 movups $inout0,($out)
627 jmp .Lecb_ret
628.align 16
629.Lecb_enc_two:
630 xorps $inout2,$inout2
631 call _aesni_encrypt3
632 movups $inout0,($out)
633 movups $inout1,0x10($out)
634 jmp .Lecb_ret
635.align 16
636.Lecb_enc_three:
637 call _aesni_encrypt3
638 movups $inout0,($out)
639 movups $inout1,0x10($out)
640 movups $inout2,0x20($out)
641 jmp .Lecb_ret
642.align 16
643.Lecb_enc_four:
644 call _aesni_encrypt4
645 movups $inout0,($out)
646 movups $inout1,0x10($out)
647 movups $inout2,0x20($out)
648 movups $inout3,0x30($out)
649 jmp .Lecb_ret
650.align 16
651.Lecb_enc_five:
652 xorps $inout5,$inout5
653 call _aesni_encrypt6
654 movups $inout0,($out)
655 movups $inout1,0x10($out)
656 movups $inout2,0x20($out)
657 movups $inout3,0x30($out)
658 movups $inout4,0x40($out)
659 jmp .Lecb_ret
660.align 16
661.Lecb_enc_six:
662 call _aesni_encrypt6
663 movups $inout0,($out)
664 movups $inout1,0x10($out)
665 movups $inout2,0x20($out)
666 movups $inout3,0x30($out)
667 movups $inout4,0x40($out)
668 movups $inout5,0x50($out)
669 jmp .Lecb_ret
670 #--------------------------- ECB DECRYPT ------------------------------#
671.align 16
672.Lecb_decrypt:
673 cmp \$0x80,$len
674 jb .Lecb_dec_tail
675
676 movdqu ($inp),$inout0
677 movdqu 0x10($inp),$inout1
678 movdqu 0x20($inp),$inout2
679 movdqu 0x30($inp),$inout3
680 movdqu 0x40($inp),$inout4
681 movdqu 0x50($inp),$inout5
682 movdqu 0x60($inp),$inout6
683 movdqu 0x70($inp),$inout7
684 lea 0x80($inp),$inp
685 sub \$0x80,$len
686 jmp .Lecb_dec_loop8_enter
687.align 16
688.Lecb_dec_loop8:
689 movups $inout0,($out)
690 mov $key_,$key # restore $key
691 movdqu ($inp),$inout0
692 mov $rnds_,$rounds # restore $rounds
693 movups $inout1,0x10($out)
694 movdqu 0x10($inp),$inout1
695 movups $inout2,0x20($out)
696 movdqu 0x20($inp),$inout2
697 movups $inout3,0x30($out)
698 movdqu 0x30($inp),$inout3
699 movups $inout4,0x40($out)
700 movdqu 0x40($inp),$inout4
701 movups $inout5,0x50($out)
702 movdqu 0x50($inp),$inout5
703 movups $inout6,0x60($out)
704 movdqu 0x60($inp),$inout6
705 movups $inout7,0x70($out)
706 lea 0x80($out),$out
707 movdqu 0x70($inp),$inout7
708 lea 0x80($inp),$inp
709.Lecb_dec_loop8_enter:
710
711 call _aesni_decrypt8
712
713 $movkey ($key_),$rndkey0
714 sub \$0x80,$len
715 jnc .Lecb_dec_loop8
716
717 movups $inout0,($out)
718 mov $key_,$key # restore $key
719 movups $inout1,0x10($out)
720 mov $rnds_,$rounds # restore $rounds
721 movups $inout2,0x20($out)
722 movups $inout3,0x30($out)
723 movups $inout4,0x40($out)
724 movups $inout5,0x50($out)
725 movups $inout6,0x60($out)
726 movups $inout7,0x70($out)
727 lea 0x80($out),$out
728 add \$0x80,$len
729 jz .Lecb_ret
730
731.Lecb_dec_tail:
732 movups ($inp),$inout0
733 cmp \$0x20,$len
734 jb .Lecb_dec_one
735 movups 0x10($inp),$inout1
736 je .Lecb_dec_two
737 movups 0x20($inp),$inout2
738 cmp \$0x40,$len
739 jb .Lecb_dec_three
740 movups 0x30($inp),$inout3
741 je .Lecb_dec_four
742 movups 0x40($inp),$inout4
743 cmp \$0x60,$len
744 jb .Lecb_dec_five
745 movups 0x50($inp),$inout5
746 je .Lecb_dec_six
747 movups 0x60($inp),$inout6
748 $movkey ($key),$rndkey0
749 call _aesni_decrypt8
750 movups $inout0,($out)
751 movups $inout1,0x10($out)
752 movups $inout2,0x20($out)
753 movups $inout3,0x30($out)
754 movups $inout4,0x40($out)
755 movups $inout5,0x50($out)
756 movups $inout6,0x60($out)
757 jmp .Lecb_ret
758.align 16
759.Lecb_dec_one:
760___
761 &aesni_generate1("dec",$key,$rounds);
762$code.=<<___;
763 movups $inout0,($out)
764 jmp .Lecb_ret
765.align 16
766.Lecb_dec_two:
767 xorps $inout2,$inout2
768 call _aesni_decrypt3
769 movups $inout0,($out)
770 movups $inout1,0x10($out)
771 jmp .Lecb_ret
772.align 16
773.Lecb_dec_three:
774 call _aesni_decrypt3
775 movups $inout0,($out)
776 movups $inout1,0x10($out)
777 movups $inout2,0x20($out)
778 jmp .Lecb_ret
779.align 16
780.Lecb_dec_four:
781 call _aesni_decrypt4
782 movups $inout0,($out)
783 movups $inout1,0x10($out)
784 movups $inout2,0x20($out)
785 movups $inout3,0x30($out)
786 jmp .Lecb_ret
787.align 16
788.Lecb_dec_five:
789 xorps $inout5,$inout5
790 call _aesni_decrypt6
791 movups $inout0,($out)
792 movups $inout1,0x10($out)
793 movups $inout2,0x20($out)
794 movups $inout3,0x30($out)
795 movups $inout4,0x40($out)
796 jmp .Lecb_ret
797.align 16
798.Lecb_dec_six:
799 call _aesni_decrypt6
800 movups $inout0,($out)
801 movups $inout1,0x10($out)
802 movups $inout2,0x20($out)
803 movups $inout3,0x30($out)
804 movups $inout4,0x40($out)
805 movups $inout5,0x50($out)
806
807.Lecb_ret:
808 ret
809.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
810___
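A hypothetical caller, matching the prototypes documented in this file with $PREFIX = "aesni" (length must be a multiple of 16, which the `and \$-16` above enforces by truncation):

    #include <stddef.h>
    #include <openssl/aes.h>

    int  aesni_set_encrypt_key(const unsigned char *userKey, int bits,
                               AES_KEY *key);
    void aesni_ecb_encrypt(const void *in, void *out, size_t length,
                           const AES_KEY *key, int enc);

    static void ecb_example(const unsigned char k[16],
                            const unsigned char in[64], unsigned char out[64])
    {
        AES_KEY ks;

        if (aesni_set_encrypt_key(k, 128, &ks) != 0)
            return;                         /* -1 bad pointer, -2 bad keybits */
        aesni_ecb_encrypt(in, out, 64, &ks, 1);     /* enc != 0: encrypt */
    }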
811
812{
813######################################################################
814# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
815# size_t blocks, const AES_KEY *key,
816# const char *ivec,char *cmac);
817#
818# Handles only complete blocks, operates on 64-bit counter and
819# does not update *ivec! Nor does it finalize CMAC value
820# (see engine/eng_aesni.c for details)
821#
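Because the routine neither advances *ivec nor finalizes the CMAC, both remain the caller's job; a hypothetical wrapper showing the division of labour for complete blocks (tail handling and tag finalization as in engine/eng_aesni.c are left out):

    #include <openssl/aes.h>

    void aesni_ccm64_encrypt_blocks(const void *in, void *out, size_t blocks,
                                    const AES_KEY *key, const char *ivec,
                                    char *cmac);

    static void ccm64_full_blocks(const AES_KEY *ks, unsigned char ivec[16],
                                  unsigned char cmac[16],
                                  const unsigned char *in, unsigned char *out,
                                  size_t len)
    {
        /* Only len/16 complete blocks are processed; *ivec is read but
         * never written back, and *cmac is the running, unencrypted MAC. */
        aesni_ccm64_encrypt_blocks(in, out, len / 16, ks,
                                   (const char *)ivec, (char *)cmac);
    }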
822{
823my $cmac="%r9"; # 6th argument
824
825my $increment="%xmm6";
826my $bswap_mask="%xmm7";
827
828$code.=<<___;
829.globl aesni_ccm64_encrypt_blocks
830.type aesni_ccm64_encrypt_blocks,\@function,6
831.align 16
832aesni_ccm64_encrypt_blocks:
833___
834$code.=<<___ if ($win64);
835 lea -0x58(%rsp),%rsp
836 movaps %xmm6,(%rsp)
837 movaps %xmm7,0x10(%rsp)
838 movaps %xmm8,0x20(%rsp)
839 movaps %xmm9,0x30(%rsp)
840.Lccm64_enc_body:
841___
842$code.=<<___;
843 mov 240($key),$rounds # key->rounds
844 movdqu ($ivp),$iv
845 movdqa .Lincrement64(%rip),$increment
846 movdqa .Lbswap_mask(%rip),$bswap_mask
847
848 shr \$1,$rounds
849 lea 0($key),$key_
850 movdqu ($cmac),$inout1
851 movdqa $iv,$inout0
852 mov $rounds,$rnds_
853 pshufb $bswap_mask,$iv
854 jmp .Lccm64_enc_outer
855.align 16
856.Lccm64_enc_outer:
857 $movkey ($key_),$rndkey0
858 mov $rnds_,$rounds
859 movups ($inp),$in0 # load inp
860
861 xorps $rndkey0,$inout0 # counter
862 $movkey 16($key_),$rndkey1
863 xorps $in0,$rndkey0
864 lea 32($key_),$key
865 xorps $rndkey0,$inout1 # cmac^=inp
866 $movkey ($key),$rndkey0
867
868.Lccm64_enc2_loop:
869 aesenc $rndkey1,$inout0
870 dec $rounds
871 aesenc $rndkey1,$inout1
872 $movkey 16($key),$rndkey1
873 aesenc $rndkey0,$inout0
874 lea 32($key),$key
875 aesenc $rndkey0,$inout1
876 $movkey 0($key),$rndkey0
877 jnz .Lccm64_enc2_loop
878 aesenc $rndkey1,$inout0
879 aesenc $rndkey1,$inout1
880 paddq $increment,$iv
881 aesenclast $rndkey0,$inout0
882 aesenclast $rndkey0,$inout1
883
884 dec $len
885 lea 16($inp),$inp
886 xorps $inout0,$in0 # inp ^= E(iv)
887 movdqa $iv,$inout0
888 movups $in0,($out) # save output
889 lea 16($out),$out
890 pshufb $bswap_mask,$inout0
891 jnz .Lccm64_enc_outer
892
893 movups $inout1,($cmac)
894___
895$code.=<<___ if ($win64);
896 movaps (%rsp),%xmm6
897 movaps 0x10(%rsp),%xmm7
898 movaps 0x20(%rsp),%xmm8
899 movaps 0x30(%rsp),%xmm9
900 lea 0x58(%rsp),%rsp
901.Lccm64_enc_ret:
902___
903$code.=<<___;
904 ret
905.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
906___
907######################################################################
908$code.=<<___;
909.globl aesni_ccm64_decrypt_blocks
910.type aesni_ccm64_decrypt_blocks,\@function,6
911.align 16
912aesni_ccm64_decrypt_blocks:
913___
914$code.=<<___ if ($win64);
915 lea -0x58(%rsp),%rsp
916 movaps %xmm6,(%rsp)
917 movaps %xmm7,0x10(%rsp)
918 movaps %xmm8,0x20(%rsp)
919 movaps %xmm9,0x30(%rsp)
920.Lccm64_dec_body:
921___
922$code.=<<___;
923 mov 240($key),$rounds # key->rounds
924 movups ($ivp),$iv
925 movdqu ($cmac),$inout1
926 movdqa .Lincrement64(%rip),$increment
927 movdqa .Lbswap_mask(%rip),$bswap_mask
928
929 movaps $iv,$inout0
930 mov $rounds,$rnds_
931 mov $key,$key_
932 pshufb $bswap_mask,$iv
933___
934 &aesni_generate1("enc",$key,$rounds);
935$code.=<<___;
936 movups ($inp),$in0 # load inp
937 paddq $increment,$iv
938 lea 16($inp),$inp
939 jmp .Lccm64_dec_outer
940.align 16
941.Lccm64_dec_outer:
942 xorps $inout0,$in0 # inp ^= E(iv)
943 movdqa $iv,$inout0
944 mov $rnds_,$rounds
945 movups $in0,($out) # save output
946 lea 16($out),$out
947 pshufb $bswap_mask,$inout0
948
949 sub \$1,$len
950 jz .Lccm64_dec_break
951
952 $movkey ($key_),$rndkey0
953 shr \$1,$rounds
954 $movkey 16($key_),$rndkey1
955 xorps $rndkey0,$in0
956 lea 32($key_),$key
957 xorps $rndkey0,$inout0
958 xorps $in0,$inout1 # cmac^=out
959 $movkey ($key),$rndkey0
960
961.Lccm64_dec2_loop:
962 aesenc $rndkey1,$inout0
963 dec $rounds
964 aesenc $rndkey1,$inout1
965 $movkey 16($key),$rndkey1
966 aesenc $rndkey0,$inout0
967 lea 32($key),$key
968 aesenc $rndkey0,$inout1
969 $movkey 0($key),$rndkey0
970 jnz .Lccm64_dec2_loop
971 movups ($inp),$in0 # load inp
972 paddq $increment,$iv
973 aesenc $rndkey1,$inout0
974 aesenc $rndkey1,$inout1
975 lea 16($inp),$inp
976 aesenclast $rndkey0,$inout0
977 aesenclast $rndkey0,$inout1
978 jmp .Lccm64_dec_outer
979
980.align 16
981.Lccm64_dec_break:
982 #xorps $in0,$inout1 # cmac^=out
983___
984 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
985$code.=<<___;
986 movups $inout1,($cmac)
987___
988$code.=<<___ if ($win64);
989 movaps (%rsp),%xmm6
990 movaps 0x10(%rsp),%xmm7
991 movaps 0x20(%rsp),%xmm8
992 movaps 0x30(%rsp),%xmm9
993 lea 0x58(%rsp),%rsp
994.Lccm64_dec_ret:
995___
996$code.=<<___;
997 ret
998.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
999___
1000}
1001######################################################################
1002# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1003# size_t blocks, const AES_KEY *key,
1004# const char *ivec);
1005#
1006# Handles only complete blocks, operates on 32-bit counter and
1007# does not update *ivec! (see engine/eng_aesni.c for details)
1008#
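The last four bytes of ivec hold a big-endian 32-bit counter (the pextrd/bswap pair below pulls it out); since *ivec is never written back, a caller spanning several invocations must advance it itself, e.g. with a hypothetical helper like:

    #include <stdint.h>

    /* Advance the big-endian counter in ivec[12..15] by n blocks. */
    static void ctr32_add(unsigned char ivec[16], uint32_t n)
    {
        uint32_t c = (uint32_t)ivec[12] << 24 | (uint32_t)ivec[13] << 16 |
                     (uint32_t)ivec[14] <<  8 | (uint32_t)ivec[15];

        c += n;
        ivec[12] = (unsigned char)(c >> 24);
        ivec[13] = (unsigned char)(c >> 16);
        ivec[14] = (unsigned char)(c >>  8);
        ivec[15] = (unsigned char)c;
    }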
1009{
1010my $reserved = $win64?0:-0x28;
1011my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
1012my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
1013my $bswap_mask="%xmm15";
1014
1015$code.=<<___;
1016.globl aesni_ctr32_encrypt_blocks
1017.type aesni_ctr32_encrypt_blocks,\@function,5
1018.align 16
1019aesni_ctr32_encrypt_blocks:
1020___
1021$code.=<<___ if ($win64);
1022 lea -0xc8(%rsp),%rsp
1023 movaps %xmm6,0x20(%rsp)
1024 movaps %xmm7,0x30(%rsp)
1025 movaps %xmm8,0x40(%rsp)
1026 movaps %xmm9,0x50(%rsp)
1027 movaps %xmm10,0x60(%rsp)
1028 movaps %xmm11,0x70(%rsp)
1029 movaps %xmm12,0x80(%rsp)
1030 movaps %xmm13,0x90(%rsp)
1031 movaps %xmm14,0xa0(%rsp)
1032 movaps %xmm15,0xb0(%rsp)
1033.Lctr32_body:
1034___
1035$code.=<<___;
1036 cmp \$1,$len
1037 je .Lctr32_one_shortcut
1038
1039 movdqu ($ivp),$ivec
1040 movdqa .Lbswap_mask(%rip),$bswap_mask
1041 xor $rounds,$rounds
1042 pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
1043 pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
1044
1045 mov 240($key),$rounds # key->rounds
1046 bswap $rnds_
1047 pxor $iv0,$iv0 # vector of 3 32-bit counters
1048 pxor $iv1,$iv1 # vector of 3 32-bit counters
1049 pinsrd \$0,$rnds_,$iv0
1050 lea 3($rnds_),$key_
1051 pinsrd \$0,$key_,$iv1
1052 inc $rnds_
1053 pinsrd \$1,$rnds_,$iv0
1054 inc $key_
1055 pinsrd \$1,$key_,$iv1
1056 inc $rnds_
1057 pinsrd \$2,$rnds_,$iv0
1058 inc $key_
1059 pinsrd \$2,$key_,$iv1
1060 movdqa $iv0,$reserved(%rsp)
1061 pshufb $bswap_mask,$iv0
1062 movdqa $iv1,`$reserved+0x10`(%rsp)
1063 pshufb $bswap_mask,$iv1
1064
1065 pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
1066 pshufd \$`2<<6`,$iv0,$inout1
1067 pshufd \$`1<<6`,$iv0,$inout2
1068 cmp \$6,$len
1069 jb .Lctr32_tail
1070 shr \$1,$rounds
1071 mov $key,$key_ # backup $key
1072 mov $rounds,$rnds_ # backup $rounds
1073 sub \$6,$len
1074 jmp .Lctr32_loop6
1075
1076.align 16
1077.Lctr32_loop6:
1078 pshufd \$`3<<6`,$iv1,$inout3
1079 por $ivec,$inout0 # merge counter-less ivec
1080 $movkey ($key_),$rndkey0
1081 pshufd \$`2<<6`,$iv1,$inout4
1082 por $ivec,$inout1
1083 $movkey 16($key_),$rndkey1
1084 pshufd \$`1<<6`,$iv1,$inout5
1085 por $ivec,$inout2
1086 por $ivec,$inout3
1087 xorps $rndkey0,$inout0
1088 por $ivec,$inout4
1089 por $ivec,$inout5
1090
1091 # inline _aesni_encrypt6 and interleave last rounds
1092 # with own code...
1093
1094 pxor $rndkey0,$inout1
1095 aesenc $rndkey1,$inout0
1096 lea 32($key_),$key
1097 pxor $rndkey0,$inout2
1098 aesenc $rndkey1,$inout1
1099 movdqa .Lincrement32(%rip),$iv1
1100 pxor $rndkey0,$inout3
1101 aesenc $rndkey1,$inout2
1102 movdqa $reserved(%rsp),$iv0
1103 pxor $rndkey0,$inout4
1104 aesenc $rndkey1,$inout3
1105 pxor $rndkey0,$inout5
1106 $movkey ($key),$rndkey0
1107 dec $rounds
1108 aesenc $rndkey1,$inout4
1109 aesenc $rndkey1,$inout5
1110 jmp .Lctr32_enc_loop6_enter
1111.align 16
1112.Lctr32_enc_loop6:
1113 aesenc $rndkey1,$inout0
1114 aesenc $rndkey1,$inout1
1115 dec $rounds
1116 aesenc $rndkey1,$inout2
1117 aesenc $rndkey1,$inout3
1118 aesenc $rndkey1,$inout4
1119 aesenc $rndkey1,$inout5
1120.Lctr32_enc_loop6_enter:
1121 $movkey 16($key),$rndkey1
1122 aesenc $rndkey0,$inout0
1123 aesenc $rndkey0,$inout1
1124 lea 32($key),$key
1125 aesenc $rndkey0,$inout2
1126 aesenc $rndkey0,$inout3
1127 aesenc $rndkey0,$inout4
1128 aesenc $rndkey0,$inout5
1129 $movkey ($key),$rndkey0
1130 jnz .Lctr32_enc_loop6
1131
1132 aesenc $rndkey1,$inout0
1133 paddd $iv1,$iv0 # increment counter vector
1134 aesenc $rndkey1,$inout1
1135 paddd `$reserved+0x10`(%rsp),$iv1
1136 aesenc $rndkey1,$inout2
1137 movdqa $iv0,$reserved(%rsp) # save counter vector
1138 aesenc $rndkey1,$inout3
1139 movdqa $iv1,`$reserved+0x10`(%rsp)
1140 aesenc $rndkey1,$inout4
1141 pshufb $bswap_mask,$iv0 # byte swap
1142 aesenc $rndkey1,$inout5
1143 pshufb $bswap_mask,$iv1
1144
1145 aesenclast $rndkey0,$inout0
1146 movups ($inp),$in0 # load input
1147 aesenclast $rndkey0,$inout1
1148 movups 0x10($inp),$in1
1149 aesenclast $rndkey0,$inout2
1150 movups 0x20($inp),$in2
1151 aesenclast $rndkey0,$inout3
1152 movups 0x30($inp),$in3
1153 aesenclast $rndkey0,$inout4
1154 movups 0x40($inp),$rndkey1
1155 aesenclast $rndkey0,$inout5
1156 movups 0x50($inp),$rndkey0
1157 lea 0x60($inp),$inp
1158
1159 xorps $inout0,$in0 # xor
1160 pshufd \$`3<<6`,$iv0,$inout0
1161 xorps $inout1,$in1
1162 pshufd \$`2<<6`,$iv0,$inout1
1163 movups $in0,($out) # store output
1164 xorps $inout2,$in2
1165 pshufd \$`1<<6`,$iv0,$inout2
1166 movups $in1,0x10($out)
1167 xorps $inout3,$in3
1168 movups $in2,0x20($out)
1169 xorps $inout4,$rndkey1
1170 movups $in3,0x30($out)
1171 xorps $inout5,$rndkey0
1172 movups $rndkey1,0x40($out)
1173 movups $rndkey0,0x50($out)
1174 lea 0x60($out),$out
1175 mov $rnds_,$rounds
1176 sub \$6,$len
1177 jnc .Lctr32_loop6
1178
1179 add \$6,$len
1180 jz .Lctr32_done
1181 mov $key_,$key # restore $key
1182 lea 1($rounds,$rounds),$rounds # restore original value
1183
1184.Lctr32_tail:
1185 por $ivec,$inout0
1186 movups ($inp),$in0
1187 cmp \$2,$len
1188 jb .Lctr32_one
1189
1190 por $ivec,$inout1
1191 movups 0x10($inp),$in1
1192 je .Lctr32_two
1193
1194 pshufd \$`3<<6`,$iv1,$inout3
1195 por $ivec,$inout2
1196 movups 0x20($inp),$in2
1197 cmp \$4,$len
1198 jb .Lctr32_three
1199
1200 pshufd \$`2<<6`,$iv1,$inout4
1201 por $ivec,$inout3
1202 movups 0x30($inp),$in3
1203 je .Lctr32_four
1204
1205 por $ivec,$inout4
1206 xorps $inout5,$inout5
1207
1208 call _aesni_encrypt6
1209
1210 movups 0x40($inp),$rndkey1
1211 xorps $inout0,$in0
1212 xorps $inout1,$in1
1213 movups $in0,($out)
1214 xorps $inout2,$in2
1215 movups $in1,0x10($out)
1216 xorps $inout3,$in3
1217 movups $in2,0x20($out)
1218 xorps $inout4,$rndkey1
1219 movups $in3,0x30($out)
1220 movups $rndkey1,0x40($out)
1221 jmp .Lctr32_done
1222
1223.align 16
1224.Lctr32_one_shortcut:
1225 movups ($ivp),$inout0
1226 movups ($inp),$in0
1227 mov 240($key),$rounds # key->rounds
1228.Lctr32_one:
1229___
1230 &aesni_generate1("enc",$key,$rounds);
1231$code.=<<___;
1232 xorps $inout0,$in0
1233 movups $in0,($out)
1234 jmp .Lctr32_done
1235
1236.align 16
1237.Lctr32_two:
1238 xorps $inout2,$inout2
1239 call _aesni_encrypt3
1240 xorps $inout0,$in0
1241 xorps $inout1,$in1
1242 movups $in0,($out)
1243 movups $in1,0x10($out)
1244 jmp .Lctr32_done
1245
1246.align 16
1247.Lctr32_three:
1248 call _aesni_encrypt3
1249 xorps $inout0,$in0
1250 xorps $inout1,$in1
1251 movups $in0,($out)
1252 xorps $inout2,$in2
1253 movups $in1,0x10($out)
1254 movups $in2,0x20($out)
1255 jmp .Lctr32_done
1256
1257.align 16
1258.Lctr32_four:
1259 call _aesni_encrypt4
1260 xorps $inout0,$in0
1261 xorps $inout1,$in1
1262 movups $in0,($out)
1263 xorps $inout2,$in2
1264 movups $in1,0x10($out)
1265 xorps $inout3,$in3
1266 movups $in2,0x20($out)
1267 movups $in3,0x30($out)
1268
1269.Lctr32_done:
1270___
1271$code.=<<___ if ($win64);
1272 movaps 0x20(%rsp),%xmm6
1273 movaps 0x30(%rsp),%xmm7
1274 movaps 0x40(%rsp),%xmm8
1275 movaps 0x50(%rsp),%xmm9
1276 movaps 0x60(%rsp),%xmm10
1277 movaps 0x70(%rsp),%xmm11
1278 movaps 0x80(%rsp),%xmm12
1279 movaps 0x90(%rsp),%xmm13
1280 movaps 0xa0(%rsp),%xmm14
1281 movaps 0xb0(%rsp),%xmm15
1282 lea 0xc8(%rsp),%rsp
1283.Lctr32_ret:
1284___
1285$code.=<<___;
1286 ret
1287.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1288___
1289}
1290
1291######################################################################
1292# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1293# const AES_KEY *key1, const AES_KEY *key2
1294# const unsigned char iv[16]);
1295#
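The pshufd/pand/paddq/pxor cluster repeated throughout both functions multiplies the tweak by x in GF(2^128): shift the 128-bit value left one bit and, on carry-out, fold the reduction polynomial back in (the .Lxts_magic constant). A byte-wise C model, tweak stored little-endian as it is in memory:

    /* Multiply the XTS tweak by x in GF(2^128). */
    static void xts_mul_x(unsigned char t[16])
    {
        unsigned int carry = 0;

        for (int i = 0; i < 16; i++) {          /* shift left one bit */
            unsigned int next = t[i] >> 7;
            t[i] = (unsigned char)((t[i] << 1) | carry);
            carry = next;
        }
        if (carry)
            t[0] ^= 0x87;           /* x^128 = x^7 + x^2 + x + 1 */
    }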
1296{
1297my @tweak=map("%xmm$_",(10..15));
1298my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1299my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1300my $frame_size = 0x68 + ($win64?160:0);
1301
1302$code.=<<___;
1303.globl aesni_xts_encrypt
1304.type aesni_xts_encrypt,\@function,6
1305.align 16
1306aesni_xts_encrypt:
1307 lea -$frame_size(%rsp),%rsp
1308___
1309$code.=<<___ if ($win64);
1310 movaps %xmm6,0x60(%rsp)
1311 movaps %xmm7,0x70(%rsp)
1312 movaps %xmm8,0x80(%rsp)
1313 movaps %xmm9,0x90(%rsp)
1314 movaps %xmm10,0xa0(%rsp)
1315 movaps %xmm11,0xb0(%rsp)
1316 movaps %xmm12,0xc0(%rsp)
1317 movaps %xmm13,0xd0(%rsp)
1318 movaps %xmm14,0xe0(%rsp)
1319 movaps %xmm15,0xf0(%rsp)
1320.Lxts_enc_body:
1321___
1322$code.=<<___;
1323 movups ($ivp),@tweak[5] # load clear-text tweak
1324 mov 240(%r8),$rounds # key2->rounds
1325 mov 240($key),$rnds_ # key1->rounds
1326___
1327 # generate the tweak
1328 &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
1329$code.=<<___;
1330 mov $key,$key_ # backup $key
1331 mov $rnds_,$rounds # backup $rounds
1332 mov $len,$len_ # backup $len
1333 and \$-16,$len
1334
1335 movdqa .Lxts_magic(%rip),$twmask
1336 pxor $twtmp,$twtmp
1337 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1338___
1339 for ($i=0;$i<4;$i++) {
1340 $code.=<<___;
1341 pshufd \$0x13,$twtmp,$twres
1342 pxor $twtmp,$twtmp
1343 movdqa @tweak[5],@tweak[$i]
1344 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1345 pand $twmask,$twres # isolate carry and residue
1346	 pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1347 pxor $twres,@tweak[5]
1348___
1349 }
1350$code.=<<___;
1351 sub \$16*6,$len
1352 jc .Lxts_enc_short
1353
1354 shr \$1,$rounds
1355 sub \$1,$rounds
1356 mov $rounds,$rnds_
1357 jmp .Lxts_enc_grandloop
1358
1359.align 16
1360.Lxts_enc_grandloop:
1361 pshufd \$0x13,$twtmp,$twres
1362 movdqa @tweak[5],@tweak[4]
1363 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1364 movdqu `16*0`($inp),$inout0 # load input
1365 pand $twmask,$twres # isolate carry and residue
1366 movdqu `16*1`($inp),$inout1
1367 pxor $twres,@tweak[5]
1368
1369 movdqu `16*2`($inp),$inout2
1370 pxor @tweak[0],$inout0 # input^=tweak
1371 movdqu `16*3`($inp),$inout3
1372 pxor @tweak[1],$inout1
1373 movdqu `16*4`($inp),$inout4
1374 pxor @tweak[2],$inout2
1375 movdqu `16*5`($inp),$inout5
1376 lea `16*6`($inp),$inp
1377 pxor @tweak[3],$inout3
1378 $movkey ($key_),$rndkey0
1379 pxor @tweak[4],$inout4
1380 pxor @tweak[5],$inout5
1381
1382 # inline _aesni_encrypt6 and interleave first and last rounds
1383 # with own code...
1384 $movkey 16($key_),$rndkey1
1385 pxor $rndkey0,$inout0
1386 pxor $rndkey0,$inout1
1387 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
1388 aesenc $rndkey1,$inout0
1389 lea 32($key_),$key
1390 pxor $rndkey0,$inout2
1391 movdqa @tweak[1],`16*1`(%rsp)
1392 aesenc $rndkey1,$inout1
1393 pxor $rndkey0,$inout3
1394 movdqa @tweak[2],`16*2`(%rsp)
1395 aesenc $rndkey1,$inout2
1396 pxor $rndkey0,$inout4
1397 movdqa @tweak[3],`16*3`(%rsp)
1398 aesenc $rndkey1,$inout3
1399 pxor $rndkey0,$inout5
1400 $movkey ($key),$rndkey0
1401 dec $rounds
1402 movdqa @tweak[4],`16*4`(%rsp)
1403 aesenc $rndkey1,$inout4
1404 movdqa @tweak[5],`16*5`(%rsp)
1405 aesenc $rndkey1,$inout5
1406 pxor $twtmp,$twtmp
1407 pcmpgtd @tweak[5],$twtmp
1408 jmp .Lxts_enc_loop6_enter
1409
1410.align 16
1411.Lxts_enc_loop6:
1412 aesenc $rndkey1,$inout0
1413 aesenc $rndkey1,$inout1
1414 dec $rounds
1415 aesenc $rndkey1,$inout2
1416 aesenc $rndkey1,$inout3
1417 aesenc $rndkey1,$inout4
1418 aesenc $rndkey1,$inout5
1419.Lxts_enc_loop6_enter:
1420 $movkey 16($key),$rndkey1
1421 aesenc $rndkey0,$inout0
1422 aesenc $rndkey0,$inout1
1423 lea 32($key),$key
1424 aesenc $rndkey0,$inout2
1425 aesenc $rndkey0,$inout3
1426 aesenc $rndkey0,$inout4
1427 aesenc $rndkey0,$inout5
1428 $movkey ($key),$rndkey0
1429 jnz .Lxts_enc_loop6
1430
1431 pshufd \$0x13,$twtmp,$twres
1432 pxor $twtmp,$twtmp
1433 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1434 aesenc $rndkey1,$inout0
1435 pand $twmask,$twres # isolate carry and residue
1436 aesenc $rndkey1,$inout1
1437 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1438 aesenc $rndkey1,$inout2
1439 pxor $twres,@tweak[5]
1440 aesenc $rndkey1,$inout3
1441 aesenc $rndkey1,$inout4
1442 aesenc $rndkey1,$inout5
1443 $movkey 16($key),$rndkey1
1444
1445 pshufd \$0x13,$twtmp,$twres
1446 pxor $twtmp,$twtmp
1447 movdqa @tweak[5],@tweak[0]
1448 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1449 aesenc $rndkey0,$inout0
1450 pand $twmask,$twres # isolate carry and residue
1451 aesenc $rndkey0,$inout1
1452	 pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1453 aesenc $rndkey0,$inout2
1454 pxor $twres,@tweak[5]
1455 aesenc $rndkey0,$inout3
1456 aesenc $rndkey0,$inout4
1457 aesenc $rndkey0,$inout5
1458 $movkey 32($key),$rndkey0
1459
1460 pshufd \$0x13,$twtmp,$twres
1461 pxor $twtmp,$twtmp
1462 movdqa @tweak[5],@tweak[1]
1463 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1464 aesenc $rndkey1,$inout0
1465 pand $twmask,$twres # isolate carry and residue
1466 aesenc $rndkey1,$inout1
1467	 pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1468 aesenc $rndkey1,$inout2
1469 pxor $twres,@tweak[5]
1470 aesenc $rndkey1,$inout3
1471 aesenc $rndkey1,$inout4
1472 aesenc $rndkey1,$inout5
1473
1474 pshufd \$0x13,$twtmp,$twres
1475 pxor $twtmp,$twtmp
1476 movdqa @tweak[5],@tweak[2]
1477 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1478 aesenclast $rndkey0,$inout0
1479 pand $twmask,$twres # isolate carry and residue
1480 aesenclast $rndkey0,$inout1
1481	 pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1482 aesenclast $rndkey0,$inout2
1483 pxor $twres,@tweak[5]
1484 aesenclast $rndkey0,$inout3
1485 aesenclast $rndkey0,$inout4
1486 aesenclast $rndkey0,$inout5
1487
1488 pshufd \$0x13,$twtmp,$twres
1489 pxor $twtmp,$twtmp
1490 movdqa @tweak[5],@tweak[3]
1491 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1492 xorps `16*0`(%rsp),$inout0 # output^=tweak
1493 pand $twmask,$twres # isolate carry and residue
1494 xorps `16*1`(%rsp),$inout1
1495	 pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1496 pxor $twres,@tweak[5]
1497
1498 xorps `16*2`(%rsp),$inout2
1499 movups $inout0,`16*0`($out) # write output
1500 xorps `16*3`(%rsp),$inout3
1501 movups $inout1,`16*1`($out)
1502 xorps `16*4`(%rsp),$inout4
1503 movups $inout2,`16*2`($out)
1504 xorps `16*5`(%rsp),$inout5
1505 movups $inout3,`16*3`($out)
1506 mov $rnds_,$rounds # restore $rounds
1507 movups $inout4,`16*4`($out)
1508 movups $inout5,`16*5`($out)
1509 lea `16*6`($out),$out
1510 sub \$16*6,$len
1511 jnc .Lxts_enc_grandloop
1512
1513 lea 3($rounds,$rounds),$rounds # restore original value
1514 mov $key_,$key # restore $key
1515 mov $rounds,$rnds_ # backup $rounds
1516
1517.Lxts_enc_short:
1518 add \$16*6,$len
1519 jz .Lxts_enc_done
1520
1521 cmp \$0x20,$len
1522 jb .Lxts_enc_one
1523 je .Lxts_enc_two
1524
1525 cmp \$0x40,$len
1526 jb .Lxts_enc_three
1527 je .Lxts_enc_four
1528
1529 pshufd \$0x13,$twtmp,$twres
1530 movdqa @tweak[5],@tweak[4]
1531 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1532 movdqu ($inp),$inout0
1533 pand $twmask,$twres # isolate carry and residue
1534 movdqu 16*1($inp),$inout1
1535 pxor $twres,@tweak[5]
1536
1537 movdqu 16*2($inp),$inout2
1538 pxor @tweak[0],$inout0
1539 movdqu 16*3($inp),$inout3
1540 pxor @tweak[1],$inout1
1541 movdqu 16*4($inp),$inout4
1542 lea 16*5($inp),$inp
1543 pxor @tweak[2],$inout2
1544 pxor @tweak[3],$inout3
1545 pxor @tweak[4],$inout4
1546
1547 call _aesni_encrypt6
1548
1549 xorps @tweak[0],$inout0
1550 movdqa @tweak[5],@tweak[0]
1551 xorps @tweak[1],$inout1
1552 xorps @tweak[2],$inout2
1553 movdqu $inout0,($out)
1554 xorps @tweak[3],$inout3
1555 movdqu $inout1,16*1($out)
1556 xorps @tweak[4],$inout4
1557 movdqu $inout2,16*2($out)
1558 movdqu $inout3,16*3($out)
1559 movdqu $inout4,16*4($out)
1560 lea 16*5($out),$out
1561 jmp .Lxts_enc_done
1562
1563.align 16
1564.Lxts_enc_one:
1565 movups ($inp),$inout0
1566 lea 16*1($inp),$inp
1567 xorps @tweak[0],$inout0
1568___
1569 &aesni_generate1("enc",$key,$rounds);
1570$code.=<<___;
1571 xorps @tweak[0],$inout0
1572 movdqa @tweak[1],@tweak[0]
1573 movups $inout0,($out)
1574 lea 16*1($out),$out
1575 jmp .Lxts_enc_done
1576
1577.align 16
1578.Lxts_enc_two:
1579 movups ($inp),$inout0
1580 movups 16($inp),$inout1
1581 lea 32($inp),$inp
1582 xorps @tweak[0],$inout0
1583 xorps @tweak[1],$inout1
1584
1585 call _aesni_encrypt3
1586
1587 xorps @tweak[0],$inout0
1588 movdqa @tweak[2],@tweak[0]
1589 xorps @tweak[1],$inout1
1590 movups $inout0,($out)
1591 movups $inout1,16*1($out)
1592 lea 16*2($out),$out
1593 jmp .Lxts_enc_done
1594
1595.align 16
1596.Lxts_enc_three:
1597 movups ($inp),$inout0
1598 movups 16*1($inp),$inout1
1599 movups 16*2($inp),$inout2
1600 lea 16*3($inp),$inp
1601 xorps @tweak[0],$inout0
1602 xorps @tweak[1],$inout1
1603 xorps @tweak[2],$inout2
1604
1605 call _aesni_encrypt3
1606
1607 xorps @tweak[0],$inout0
1608 movdqa @tweak[3],@tweak[0]
1609 xorps @tweak[1],$inout1
1610 xorps @tweak[2],$inout2
1611 movups $inout0,($out)
1612 movups $inout1,16*1($out)
1613 movups $inout2,16*2($out)
1614 lea 16*3($out),$out
1615 jmp .Lxts_enc_done
1616
1617.align 16
1618.Lxts_enc_four:
1619 movups ($inp),$inout0
1620 movups 16*1($inp),$inout1
1621 movups 16*2($inp),$inout2
1622 xorps @tweak[0],$inout0
1623 movups 16*3($inp),$inout3
1624 lea 16*4($inp),$inp
1625 xorps @tweak[1],$inout1
1626 xorps @tweak[2],$inout2
1627 xorps @tweak[3],$inout3
1628
1629 call _aesni_encrypt4
1630
1631 xorps @tweak[0],$inout0
1632 movdqa @tweak[5],@tweak[0]
1633 xorps @tweak[1],$inout1
1634 xorps @tweak[2],$inout2
1635 movups $inout0,($out)
1636 xorps @tweak[3],$inout3
1637 movups $inout1,16*1($out)
1638 movups $inout2,16*2($out)
1639 movups $inout3,16*3($out)
1640 lea 16*4($out),$out
1641 jmp .Lxts_enc_done
1642
1643.align 16
1644.Lxts_enc_done:
1645 and \$15,$len_
1646 jz .Lxts_enc_ret
1647 mov $len_,$len
1648
1649.Lxts_enc_steal:
1650 movzb ($inp),%eax # borrow $rounds ...
1651 movzb -16($out),%ecx # ... and $key
1652 lea 1($inp),$inp
1653 mov %al,-16($out)
1654 mov %cl,0($out)
1655 lea 1($out),$out
1656 sub \$1,$len
1657 jnz .Lxts_enc_steal
1658
1659 sub $len_,$out # rewind $out
1660 mov $key_,$key # restore $key
1661 mov $rnds_,$rounds # restore $rounds
1662
1663 movups -16($out),$inout0
1664 xorps @tweak[0],$inout0
1665___
1666 &aesni_generate1("enc",$key,$rounds);
1667$code.=<<___;
1668 xorps @tweak[0],$inout0
1669 movups $inout0,-16($out)
1670
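The steal loop above is ciphertext stealing for a trailing partial block: each leftover plaintext byte overwrites the corresponding byte of the last full ciphertext block, whose displaced byte becomes the final partial output; the rebuilt block is then encrypted once more under the last tweak. A C model of the byte shuffle (r = len % 16, out points just past the last full output block):

    #include <stddef.h>

    static void xts_steal_enc(const unsigned char *in, unsigned char *out,
                              size_t r)
    {
        unsigned char *last = out - 16;     /* last full ciphertext block */

        for (size_t i = 0; i < r; i++) {
            unsigned char c = last[i];      /* ciphertext byte to steal */
            last[i] = in[i];                /* trailing plaintext in */
            out[i] = c;                     /* stolen ciphertext out */
        }
        /* ...the caller then re-encrypts last[0..15] with @tweak[0] */
    }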
1671.Lxts_enc_ret:
1672___
1673$code.=<<___ if ($win64);
1674 movaps 0x60(%rsp),%xmm6
1675 movaps 0x70(%rsp),%xmm7
1676 movaps 0x80(%rsp),%xmm8
1677 movaps 0x90(%rsp),%xmm9
1678 movaps 0xa0(%rsp),%xmm10
1679 movaps 0xb0(%rsp),%xmm11
1680 movaps 0xc0(%rsp),%xmm12
1681 movaps 0xd0(%rsp),%xmm13
1682 movaps 0xe0(%rsp),%xmm14
1683 movaps 0xf0(%rsp),%xmm15
1684___
1685$code.=<<___;
1686 lea $frame_size(%rsp),%rsp
1687.Lxts_enc_epilogue:
1688 ret
1689.size aesni_xts_encrypt,.-aesni_xts_encrypt
1690___
1691
1692$code.=<<___;
1693.globl aesni_xts_decrypt
1694.type aesni_xts_decrypt,\@function,6
1695.align 16
1696aesni_xts_decrypt:
1697 lea -$frame_size(%rsp),%rsp
1698___
1699$code.=<<___ if ($win64);
1700 movaps %xmm6,0x60(%rsp)
1701 movaps %xmm7,0x70(%rsp)
1702 movaps %xmm8,0x80(%rsp)
1703 movaps %xmm9,0x90(%rsp)
1704 movaps %xmm10,0xa0(%rsp)
1705 movaps %xmm11,0xb0(%rsp)
1706 movaps %xmm12,0xc0(%rsp)
1707 movaps %xmm13,0xd0(%rsp)
1708 movaps %xmm14,0xe0(%rsp)
1709 movaps %xmm15,0xf0(%rsp)
1710.Lxts_dec_body:
1711___
1712$code.=<<___;
1713 movups ($ivp),@tweak[5] # load clear-text tweak
1714 mov 240($key2),$rounds # key2->rounds
1715 mov 240($key),$rnds_ # key1->rounds
1716___
1717 # generate the tweak
1718 &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
1719$code.=<<___;
1720 xor %eax,%eax # if ($len%16) len-=16;
1721 test \$15,$len
1722 setnz %al
1723 shl \$4,%rax
1724 sub %rax,$len
1725
1726 mov $key,$key_ # backup $key
1727 mov $rnds_,$rounds # backup $rounds
1728 mov $len,$len_ # backup $len
1729 and \$-16,$len
1730
1731 movdqa .Lxts_magic(%rip),$twmask
1732 pxor $twtmp,$twtmp
1733 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1734___
1735 for ($i=0;$i<4;$i++) {
1736 $code.=<<___;
1737 pshufd \$0x13,$twtmp,$twres
1738 pxor $twtmp,$twtmp
1739 movdqa @tweak[5],@tweak[$i]
1740 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1741 pand $twmask,$twres # isolate carry and residue
1742	 pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1743 pxor $twres,@tweak[5]
1744___
1745 }
1746$code.=<<___;
1747 sub \$16*6,$len
1748 jc .Lxts_dec_short
1749
1750 shr \$1,$rounds
1751 sub \$1,$rounds
1752 mov $rounds,$rnds_
1753 jmp .Lxts_dec_grandloop
1754
1755.align 16
1756.Lxts_dec_grandloop:
1757 pshufd \$0x13,$twtmp,$twres
1758 movdqa @tweak[5],@tweak[4]
1759 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1760 movdqu `16*0`($inp),$inout0 # load input
1761 pand $twmask,$twres # isolate carry and residue
1762 movdqu `16*1`($inp),$inout1
1763 pxor $twres,@tweak[5]
1764
1765 movdqu `16*2`($inp),$inout2
1766 pxor @tweak[0],$inout0 # input^=tweak
1767 movdqu `16*3`($inp),$inout3
1768 pxor @tweak[1],$inout1
1769 movdqu `16*4`($inp),$inout4
1770 pxor @tweak[2],$inout2
1771 movdqu `16*5`($inp),$inout5
1772 lea `16*6`($inp),$inp
1773 pxor @tweak[3],$inout3
1774 $movkey ($key_),$rndkey0
1775 pxor @tweak[4],$inout4
1776 pxor @tweak[5],$inout5
1777
1778 # inline _aesni_decrypt6 and interleave first and last rounds
1779 # with own code...
1780 $movkey 16($key_),$rndkey1
1781 pxor $rndkey0,$inout0
1782 pxor $rndkey0,$inout1
1783 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
1784 aesdec $rndkey1,$inout0
1785 lea 32($key_),$key
1786 pxor $rndkey0,$inout2
1787 movdqa @tweak[1],`16*1`(%rsp)
1788 aesdec $rndkey1,$inout1
1789 pxor $rndkey0,$inout3
1790 movdqa @tweak[2],`16*2`(%rsp)
1791 aesdec $rndkey1,$inout2
1792 pxor $rndkey0,$inout4
1793 movdqa @tweak[3],`16*3`(%rsp)
1794 aesdec $rndkey1,$inout3
1795 pxor $rndkey0,$inout5
1796 $movkey ($key),$rndkey0
1797 dec $rounds
1798 movdqa @tweak[4],`16*4`(%rsp)
1799 aesdec $rndkey1,$inout4
1800 movdqa @tweak[5],`16*5`(%rsp)
1801 aesdec $rndkey1,$inout5
1802 pxor $twtmp,$twtmp
1803 pcmpgtd @tweak[5],$twtmp
1804 jmp .Lxts_dec_loop6_enter
1805
1806.align 16
1807.Lxts_dec_loop6:
1808 aesdec $rndkey1,$inout0
1809 aesdec $rndkey1,$inout1
1810 dec $rounds
1811 aesdec $rndkey1,$inout2
1812 aesdec $rndkey1,$inout3
1813 aesdec $rndkey1,$inout4
1814 aesdec $rndkey1,$inout5
1815.Lxts_dec_loop6_enter:
1816 $movkey 16($key),$rndkey1
1817 aesdec $rndkey0,$inout0
1818 aesdec $rndkey0,$inout1
1819 lea 32($key),$key
1820 aesdec $rndkey0,$inout2
1821 aesdec $rndkey0,$inout3
1822 aesdec $rndkey0,$inout4
1823 aesdec $rndkey0,$inout5
1824 $movkey ($key),$rndkey0
1825 jnz .Lxts_dec_loop6
1826
1827 pshufd \$0x13,$twtmp,$twres
1828 pxor $twtmp,$twtmp
1829 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1830 aesdec $rndkey1,$inout0
1831 pand $twmask,$twres # isolate carry and residue
1832 aesdec $rndkey1,$inout1
1833 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1834 aesdec $rndkey1,$inout2
1835 pxor $twres,@tweak[5]
1836 aesdec $rndkey1,$inout3
1837 aesdec $rndkey1,$inout4
1838 aesdec $rndkey1,$inout5
1839 $movkey 16($key),$rndkey1
1840
1841 pshufd \$0x13,$twtmp,$twres
1842 pxor $twtmp,$twtmp
1843 movdqa @tweak[5],@tweak[0]
1844 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1845 aesdec $rndkey0,$inout0
1846 pand $twmask,$twres # isolate carry and residue
1847 aesdec $rndkey0,$inout1
1848	 pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1849 aesdec $rndkey0,$inout2
1850 pxor $twres,@tweak[5]
1851 aesdec $rndkey0,$inout3
1852 aesdec $rndkey0,$inout4
1853 aesdec $rndkey0,$inout5
1854 $movkey 32($key),$rndkey0
1855
1856 pshufd \$0x13,$twtmp,$twres
1857 pxor $twtmp,$twtmp
1858 movdqa @tweak[5],@tweak[1]
1859 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1860 aesdec $rndkey1,$inout0
1861 pand $twmask,$twres # isolate carry and residue
1862 aesdec $rndkey1,$inout1
1863	 pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1864 aesdec $rndkey1,$inout2
1865 pxor $twres,@tweak[5]
1866 aesdec $rndkey1,$inout3
1867 aesdec $rndkey1,$inout4
1868 aesdec $rndkey1,$inout5
1869
1870 pshufd \$0x13,$twtmp,$twres
1871 pxor $twtmp,$twtmp
1872 movdqa @tweak[5],@tweak[2]
1873 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1874 aesdeclast $rndkey0,$inout0
1875 pand $twmask,$twres # isolate carry and residue
1876 aesdeclast $rndkey0,$inout1
1877	 pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1878 aesdeclast $rndkey0,$inout2
1879 pxor $twres,@tweak[5]
1880 aesdeclast $rndkey0,$inout3
1881 aesdeclast $rndkey0,$inout4
1882 aesdeclast $rndkey0,$inout5
1883
1884 pshufd \$0x13,$twtmp,$twres
1885 pxor $twtmp,$twtmp
1886 movdqa @tweak[5],@tweak[3]
1887 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1888 xorps `16*0`(%rsp),$inout0 # output^=tweak
1889 pand $twmask,$twres # isolate carry and residue
1890 xorps `16*1`(%rsp),$inout1
1891	 pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1892 pxor $twres,@tweak[5]
1893
1894 xorps `16*2`(%rsp),$inout2
1895 movups $inout0,`16*0`($out) # write output
1896 xorps `16*3`(%rsp),$inout3
1897 movups $inout1,`16*1`($out)
1898 xorps `16*4`(%rsp),$inout4
1899 movups $inout2,`16*2`($out)
1900 xorps `16*5`(%rsp),$inout5
1901 movups $inout3,`16*3`($out)
1902 mov $rnds_,$rounds # restore $rounds
1903 movups $inout4,`16*4`($out)
1904 movups $inout5,`16*5`($out)
1905 lea `16*6`($out),$out
1906 sub \$16*6,$len
1907 jnc .Lxts_dec_grandloop
1908
1909 lea 3($rounds,$rounds),$rounds # restore original value
1910 mov $key_,$key # restore $key
1911 mov $rounds,$rnds_ # backup $rounds
1912
1913.Lxts_dec_short:
1914 add \$16*6,$len
1915 jz .Lxts_dec_done
1916
1917 cmp \$0x20,$len
1918 jb .Lxts_dec_one
1919 je .Lxts_dec_two
1920
1921 cmp \$0x40,$len
1922 jb .Lxts_dec_three
1923 je .Lxts_dec_four
1924
1925 pshufd \$0x13,$twtmp,$twres
1926 movdqa @tweak[5],@tweak[4]
1927 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1928 movdqu ($inp),$inout0
1929 pand $twmask,$twres # isolate carry and residue
1930 movdqu 16*1($inp),$inout1
1931 pxor $twres,@tweak[5]
1932
1933 movdqu 16*2($inp),$inout2
1934 pxor @tweak[0],$inout0
1935 movdqu 16*3($inp),$inout3
1936 pxor @tweak[1],$inout1
1937 movdqu 16*4($inp),$inout4
1938 lea 16*5($inp),$inp
1939 pxor @tweak[2],$inout2
1940 pxor @tweak[3],$inout3
1941 pxor @tweak[4],$inout4
1942
1943 call _aesni_decrypt6
1944
1945 xorps @tweak[0],$inout0
1946 xorps @tweak[1],$inout1
1947 xorps @tweak[2],$inout2
1948 movdqu $inout0,($out)
1949 xorps @tweak[3],$inout3
1950 movdqu $inout1,16*1($out)
1951 xorps @tweak[4],$inout4
1952 movdqu $inout2,16*2($out)
1953 pxor $twtmp,$twtmp
1954 movdqu $inout3,16*3($out)
1955 pcmpgtd @tweak[5],$twtmp
1956 movdqu $inout4,16*4($out)
1957 lea 16*5($out),$out
1958 pshufd \$0x13,$twtmp,@tweak[1] # $twres
1959 and \$15,$len_
1960 jz .Lxts_dec_ret
1961
1962 movdqa @tweak[5],@tweak[0]
1963 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1964 pand $twmask,@tweak[1] # isolate carry and residue
1965 pxor @tweak[5],@tweak[1]
1966 jmp .Lxts_dec_done2
1967
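The recurring pshufd/pcmpgtd/pand/paddq/pxor cluster above advances the XTS tweak: with $twmask loaded from the .Lxts_magic constant (0x87,0,1,0), it multiplies the 128-bit tweak by alpha in GF(2^128), i.e. shifts it left one bit and, when a bit carries out of the top, folds it back in as the reduction polynomial 0x87. A scalar C sketch of the same update, on a little-endian tweak held as two 64-bit halves:

#include <stdint.h>

/* Multiply an XTS tweak by alpha in GF(2^128): a scalar sketch of what
 * the pshufd/pand/pcmpgtd/paddq/pxor cluster computes on one XMM
 * register.  t[0] is the low 64 bits of the little-endian tweak. */
static void xts_mul_alpha(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit shifted out of the top */

	t[1] = (t[1] << 1) | (t[0] >> 63);
	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
}

The vector code gets the same effect without a flags-register carry: pcmpgtd broadcasts the sign bits into a mask, pand isolates the carry and residue, paddq doubles both halves, and pxor applies the reduction.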
1968.align 16
1969.Lxts_dec_one:
1970 movups ($inp),$inout0
1971 lea 16*1($inp),$inp
1972 xorps @tweak[0],$inout0
1973___
1974 &aesni_generate1("dec",$key,$rounds);
1975$code.=<<___;
1976 xorps @tweak[0],$inout0
1977 movdqa @tweak[1],@tweak[0]
1978 movups $inout0,($out)
1979 movdqa @tweak[2],@tweak[1]
1980 lea 16*1($out),$out
1981 jmp .Lxts_dec_done
1982
1983.align 16
1984.Lxts_dec_two:
1985 movups ($inp),$inout0
1986 movups 16($inp),$inout1
1987 lea 32($inp),$inp
1988 xorps @tweak[0],$inout0
1989 xorps @tweak[1],$inout1
1990
1991 call _aesni_decrypt3
1992
1993 xorps @tweak[0],$inout0
1994 movdqa @tweak[2],@tweak[0]
1995 xorps @tweak[1],$inout1
1996 movdqa @tweak[3],@tweak[1]
1997 movups $inout0,($out)
1998 movups $inout1,16*1($out)
1999 lea 16*2($out),$out
2000 jmp .Lxts_dec_done
2001
2002.align 16
2003.Lxts_dec_three:
2004 movups ($inp),$inout0
2005 movups 16*1($inp),$inout1
2006 movups 16*2($inp),$inout2
2007 lea 16*3($inp),$inp
2008 xorps @tweak[0],$inout0
2009 xorps @tweak[1],$inout1
2010 xorps @tweak[2],$inout2
2011
2012 call _aesni_decrypt3
2013
2014 xorps @tweak[0],$inout0
2015 movdqa @tweak[3],@tweak[0]
2016 xorps @tweak[1],$inout1
2017 movdqa @tweak[5],@tweak[1]
2018 xorps @tweak[2],$inout2
2019 movups $inout0,($out)
2020 movups $inout1,16*1($out)
2021 movups $inout2,16*2($out)
2022 lea 16*3($out),$out
2023 jmp .Lxts_dec_done
2024
2025.align 16
2026.Lxts_dec_four:
2027 pshufd \$0x13,$twtmp,$twres
2028 movdqa @tweak[5],@tweak[4]
2029 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2030 movups ($inp),$inout0
2031 pand $twmask,$twres # isolate carry and residue
2032 movups 16*1($inp),$inout1
2033 pxor $twres,@tweak[5]
2034
2035 movups 16*2($inp),$inout2
2036 xorps @tweak[0],$inout0
2037 movups 16*3($inp),$inout3
2038 lea 16*4($inp),$inp
2039 xorps @tweak[1],$inout1
2040 xorps @tweak[2],$inout2
2041 xorps @tweak[3],$inout3
2042
2043 call _aesni_decrypt4
2044
2045 xorps @tweak[0],$inout0
2046 movdqa @tweak[4],@tweak[0]
2047 xorps @tweak[1],$inout1
2048 movdqa @tweak[5],@tweak[1]
2049 xorps @tweak[2],$inout2
2050 movups $inout0,($out)
2051 xorps @tweak[3],$inout3
2052 movups $inout1,16*1($out)
2053 movups $inout2,16*2($out)
2054 movups $inout3,16*3($out)
2055 lea 16*4($out),$out
2056 jmp .Lxts_dec_done
2057
2058.align 16
2059.Lxts_dec_done:
2060 and \$15,$len_
2061 jz .Lxts_dec_ret
2062.Lxts_dec_done2:
2063 mov $len_,$len
2064 mov $key_,$key # restore $key
2065 mov $rnds_,$rounds # restore $rounds
2066
2067 movups ($inp),$inout0
2068 xorps @tweak[1],$inout0
2069___
2070 &aesni_generate1("dec",$key,$rounds);
2071$code.=<<___;
2072 xorps @tweak[1],$inout0
2073 movups $inout0,($out)
2074
2075.Lxts_dec_steal:
2076 movzb 16($inp),%eax # borrow $rounds ...
2077 movzb ($out),%ecx # ... and $key
2078 lea 1($inp),$inp
2079 mov %al,($out)
2080 mov %cl,16($out)
2081 lea 1($out),$out
2082 sub \$1,$len
2083 jnz .Lxts_dec_steal
2084
2085 sub $len_,$out # rewind $out
2086 mov $key_,$key # restore $key
2087 mov $rnds_,$rounds # restore $rounds
2088
2089 movups ($out),$inout0
2090 xorps @tweak[0],$inout0
2091___
2092 &aesni_generate1("dec",$key,$rounds);
2093$code.=<<___;
2094 xorps @tweak[0],$inout0
2095 movups $inout0,($out)
2096
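The .Lxts_dec_steal loop above implements XTS ciphertext stealing for a trailing partial block: the last full ciphertext block has already been decrypted with the next tweak (@tweak[1]) and written to $out; each iteration then moves one byte of the partial ciphertext into that block while stealing the displaced plaintext byte into the output tail, after which the rebuilt block is decrypted with @tweak[0]. A C sketch of the byte-swap step (hypothetical names):

#include <stddef.h>

/* XTS ciphertext stealing, decrypt direction: 'out' points at the
 * just-decrypted last full block, 'in' at the trailing partial
 * ciphertext, 'tail' is its length (1..15).  Mirrors the movzb/mov
 * pairs in .Lxts_dec_steal. */
static void xts_dec_steal(unsigned char *out, const unsigned char *in,
    size_t tail)
{
	while (tail--) {
		unsigned char c = *in++;	/* partial ciphertext byte */

		out[16] = out[0];		/* steal the plaintext byte */
		out[0] = c;
		out++;
	}
	/* the caller rewinds $out and decrypts the rebuilt block with
	 * the preceding tweak, as the code above does with @tweak[0] */
}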
2097.Lxts_dec_ret:
2098___
2099$code.=<<___ if ($win64);
2100 movaps 0x60(%rsp),%xmm6
2101 movaps 0x70(%rsp),%xmm7
2102 movaps 0x80(%rsp),%xmm8
2103 movaps 0x90(%rsp),%xmm9
2104 movaps 0xa0(%rsp),%xmm10
2105 movaps 0xb0(%rsp),%xmm11
2106 movaps 0xc0(%rsp),%xmm12
2107 movaps 0xd0(%rsp),%xmm13
2108 movaps 0xe0(%rsp),%xmm14
2109 movaps 0xf0(%rsp),%xmm15
2110___
2111$code.=<<___;
2112 lea $frame_size(%rsp),%rsp
2113.Lxts_dec_epilogue:
2114 ret
2115.size aesni_xts_decrypt,.-aesni_xts_decrypt
2116___
2117} }}
2118
2119########################################################################
2120# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2121# size_t length, const AES_KEY *key,
2122# unsigned char *ivp,const int enc);
2123{
2124my $reserved = $win64?0x40:-0x18; # used in decrypt
2125$code.=<<___;
2126.globl ${PREFIX}_cbc_encrypt
2127.type ${PREFIX}_cbc_encrypt,\@function,6
2128.align 16
2129${PREFIX}_cbc_encrypt:
2130 test $len,$len # check length
2131 jz .Lcbc_ret
2132
2133 mov 240($key),$rnds_ # key->rounds
2134 mov $key,$key_ # backup $key
2135 test %r9d,%r9d # 6th argument
2136 jz .Lcbc_decrypt
2137#--------------------------- CBC ENCRYPT ------------------------------#
2138 movups ($ivp),$inout0 # load iv as initial state
2139 mov $rnds_,$rounds
2140 cmp \$16,$len
2141 jb .Lcbc_enc_tail
2142 sub \$16,$len
2143 jmp .Lcbc_enc_loop
2144.align 16
2145.Lcbc_enc_loop:
2146 movups ($inp),$inout1 # load input
2147 lea 16($inp),$inp
2148 #xorps $inout1,$inout0
2149___
2150 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
2151$code.=<<___;
2152 mov $rnds_,$rounds # restore $rounds
2153 mov $key_,$key # restore $key
2154 movups $inout0,0($out) # store output
2155 lea 16($out),$out
2156 sub \$16,$len
2157 jnc .Lcbc_enc_loop
2158 add \$16,$len
2159 jnz .Lcbc_enc_tail
2160 movups $inout0,($ivp)
2161 jmp .Lcbc_ret
2162
2163.Lcbc_enc_tail:
2164 mov $len,%rcx # zaps $key
2165 xchg $inp,$out # $inp is %rsi and $out is %rdi now
2166 .long 0x9066A4F3 # rep movsb
2167 mov \$16,%ecx # zero tail
2168 sub $len,%rcx
2169 xor %eax,%eax
2170 .long 0x9066AAF3 # rep stosb
2171 lea -16(%rdi),%rdi # rewind $out by 1 block
2172 mov $rnds_,$rounds # restore $rounds
2173 mov %rdi,%rsi # $inp and $out are the same
2174 mov $key_,$key # restore $key
2175	xor	$len,$len			# len=0: loop below runs exactly once more
2176 jmp .Lcbc_enc_loop # one more spin
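The encrypt path above is a plain CBC chain: aesni_generate1() folds each input block into the running state in $inout0 (which starts as the IV), encrypts it, and the ciphertext block doubles as the next chaining value; a final partial block is copied and zero-padded in place via the rep movsb/rep stosb pair before one last spin through the loop. A hedged C sketch of the same flow, with aes_encrypt_block() as a hypothetical stand-in for the inlined aesenc chain:

#include <stdint.h>
#include <string.h>

/* Hypothetical one-block primitive standing in for aesni_generate1(). */
void aes_encrypt_block(unsigned char blk[16], const void *key);

static void cbc_encrypt_sketch(const unsigned char *in, unsigned char *out,
    size_t len, const void *key, unsigned char iv[16])
{
	unsigned char blk[16];
	size_t i, n;

	while (len > 0) {
		n = len < 16 ? len : 16;
		memset(blk, 0, 16);		/* zero-pad a partial tail */
		memcpy(blk, in, n);
		for (i = 0; i < 16; i++)
			blk[i] ^= iv[i];	/* chain in IV/ciphertext */
		aes_encrypt_block(blk, key);
		memcpy(out, blk, 16);
		memcpy(iv, blk, 16);		/* ciphertext is next IV */
		in += n; out += 16; len -= n;
	}
}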
2177#--------------------------- CBC DECRYPT ------------------------------#
2178.align 16
2179.Lcbc_decrypt:
2180___
2181$code.=<<___ if ($win64);
2182 lea -0x58(%rsp),%rsp
2183 movaps %xmm6,(%rsp)
2184 movaps %xmm7,0x10(%rsp)
2185 movaps %xmm8,0x20(%rsp)
2186 movaps %xmm9,0x30(%rsp)
2187.Lcbc_decrypt_body:
2188___
2189$code.=<<___;
2190 movups ($ivp),$iv
2191 mov $rnds_,$rounds
2192 cmp \$0x70,$len
2193 jbe .Lcbc_dec_tail
2194 shr \$1,$rnds_
2195 sub \$0x70,$len
2196 mov $rnds_,$rounds
2197 movaps $iv,$reserved(%rsp)
2198 jmp .Lcbc_dec_loop8_enter
2199.align 16
2200.Lcbc_dec_loop8:
2201 movaps $rndkey0,$reserved(%rsp) # save IV
2202 movups $inout7,($out)
2203 lea 0x10($out),$out
2204.Lcbc_dec_loop8_enter:
2205 $movkey ($key),$rndkey0
2206 movups ($inp),$inout0 # load input
2207 movups 0x10($inp),$inout1
2208 $movkey 16($key),$rndkey1
2209
2210 lea 32($key),$key
2211 movdqu 0x20($inp),$inout2
2212 xorps $rndkey0,$inout0
2213 movdqu 0x30($inp),$inout3
2214 xorps $rndkey0,$inout1
2215 movdqu 0x40($inp),$inout4
2216 aesdec $rndkey1,$inout0
2217 pxor $rndkey0,$inout2
2218 movdqu 0x50($inp),$inout5
2219 aesdec $rndkey1,$inout1
2220 pxor $rndkey0,$inout3
2221 movdqu 0x60($inp),$inout6
2222 aesdec $rndkey1,$inout2
2223 pxor $rndkey0,$inout4
2224 movdqu 0x70($inp),$inout7
2225 aesdec $rndkey1,$inout3
2226 pxor $rndkey0,$inout5
2227 dec $rounds
2228 aesdec $rndkey1,$inout4
2229 pxor $rndkey0,$inout6
2230 aesdec $rndkey1,$inout5
2231 pxor $rndkey0,$inout7
2232 $movkey ($key),$rndkey0
2233 aesdec $rndkey1,$inout6
2234 aesdec $rndkey1,$inout7
2235 $movkey 16($key),$rndkey1
2236
2237 call .Ldec_loop8_enter
2238
2239 movups ($inp),$rndkey1 # re-load input
2240 movups 0x10($inp),$rndkey0
2241 xorps $reserved(%rsp),$inout0 # ^= IV
2242 xorps $rndkey1,$inout1
2243 movups 0x20($inp),$rndkey1
2244 xorps $rndkey0,$inout2
2245 movups 0x30($inp),$rndkey0
2246 xorps $rndkey1,$inout3
2247 movups 0x40($inp),$rndkey1
2248 xorps $rndkey0,$inout4
2249 movups 0x50($inp),$rndkey0
2250 xorps $rndkey1,$inout5
2251 movups 0x60($inp),$rndkey1
2252 xorps $rndkey0,$inout6
2253 movups 0x70($inp),$rndkey0 # IV
2254 xorps $rndkey1,$inout7
2255 movups $inout0,($out)
2256 movups $inout1,0x10($out)
2257 movups $inout2,0x20($out)
2258 movups $inout3,0x30($out)
2259 mov $rnds_,$rounds # restore $rounds
2260 movups $inout4,0x40($out)
2261 mov $key_,$key # restore $key
2262 movups $inout5,0x50($out)
2263 lea 0x80($inp),$inp
2264 movups $inout6,0x60($out)
2265 lea 0x70($out),$out
2266 sub \$0x80,$len
2267 ja .Lcbc_dec_loop8
2268
2269 movaps $inout7,$inout0
2270 movaps $rndkey0,$iv
2271 add \$0x70,$len
2272 jle .Lcbc_dec_tail_collected
2273 movups $inout0,($out)
2274 lea 1($rnds_,$rnds_),$rounds
2275 lea 0x10($out),$out
2276.Lcbc_dec_tail:
2277 movups ($inp),$inout0
2278 movaps $inout0,$in0
2279 cmp \$0x10,$len
2280 jbe .Lcbc_dec_one
2281
2282 movups 0x10($inp),$inout1
2283 movaps $inout1,$in1
2284 cmp \$0x20,$len
2285 jbe .Lcbc_dec_two
2286
2287 movups 0x20($inp),$inout2
2288 movaps $inout2,$in2
2289 cmp \$0x30,$len
2290 jbe .Lcbc_dec_three
2291
2292 movups 0x30($inp),$inout3
2293 cmp \$0x40,$len
2294 jbe .Lcbc_dec_four
2295
2296 movups 0x40($inp),$inout4
2297 cmp \$0x50,$len
2298 jbe .Lcbc_dec_five
2299
2300 movups 0x50($inp),$inout5
2301 cmp \$0x60,$len
2302 jbe .Lcbc_dec_six
2303
2304 movups 0x60($inp),$inout6
2305 movaps $iv,$reserved(%rsp) # save IV
2306 call _aesni_decrypt8
2307 movups ($inp),$rndkey1
2308 movups 0x10($inp),$rndkey0
2309 xorps $reserved(%rsp),$inout0 # ^= IV
2310 xorps $rndkey1,$inout1
2311 movups 0x20($inp),$rndkey1
2312 xorps $rndkey0,$inout2
2313 movups 0x30($inp),$rndkey0
2314 xorps $rndkey1,$inout3
2315 movups 0x40($inp),$rndkey1
2316 xorps $rndkey0,$inout4
2317 movups 0x50($inp),$rndkey0
2318 xorps $rndkey1,$inout5
2319 movups 0x60($inp),$iv # IV
2320 xorps $rndkey0,$inout6
2321 movups $inout0,($out)
2322 movups $inout1,0x10($out)
2323 movups $inout2,0x20($out)
2324 movups $inout3,0x30($out)
2325 movups $inout4,0x40($out)
2326 movups $inout5,0x50($out)
2327 lea 0x60($out),$out
2328 movaps $inout6,$inout0
2329 sub \$0x70,$len
2330 jmp .Lcbc_dec_tail_collected
2331.align 16
2332.Lcbc_dec_one:
2333___
2334 &aesni_generate1("dec",$key,$rounds);
2335$code.=<<___;
2336 xorps $iv,$inout0
2337 movaps $in0,$iv
2338 sub \$0x10,$len
2339 jmp .Lcbc_dec_tail_collected
2340.align 16
2341.Lcbc_dec_two:
2342 xorps $inout2,$inout2
2343 call _aesni_decrypt3
2344 xorps $iv,$inout0
2345 xorps $in0,$inout1
2346 movups $inout0,($out)
2347 movaps $in1,$iv
2348 movaps $inout1,$inout0
2349 lea 0x10($out),$out
2350 sub \$0x20,$len
2351 jmp .Lcbc_dec_tail_collected
2352.align 16
2353.Lcbc_dec_three:
2354 call _aesni_decrypt3
2355 xorps $iv,$inout0
2356 xorps $in0,$inout1
2357 movups $inout0,($out)
2358 xorps $in1,$inout2
2359 movups $inout1,0x10($out)
2360 movaps $in2,$iv
2361 movaps $inout2,$inout0
2362 lea 0x20($out),$out
2363 sub \$0x30,$len
2364 jmp .Lcbc_dec_tail_collected
2365.align 16
2366.Lcbc_dec_four:
2367 call _aesni_decrypt4
2368 xorps $iv,$inout0
2369 movups 0x30($inp),$iv
2370 xorps $in0,$inout1
2371 movups $inout0,($out)
2372 xorps $in1,$inout2
2373 movups $inout1,0x10($out)
2374 xorps $in2,$inout3
2375 movups $inout2,0x20($out)
2376 movaps $inout3,$inout0
2377 lea 0x30($out),$out
2378 sub \$0x40,$len
2379 jmp .Lcbc_dec_tail_collected
2380.align 16
2381.Lcbc_dec_five:
2382 xorps $inout5,$inout5
2383 call _aesni_decrypt6
2384 movups 0x10($inp),$rndkey1
2385 movups 0x20($inp),$rndkey0
2386 xorps $iv,$inout0
2387 xorps $in0,$inout1
2388 xorps $rndkey1,$inout2
2389 movups 0x30($inp),$rndkey1
2390 xorps $rndkey0,$inout3
2391 movups 0x40($inp),$iv
2392 xorps $rndkey1,$inout4
2393 movups $inout0,($out)
2394 movups $inout1,0x10($out)
2395 movups $inout2,0x20($out)
2396 movups $inout3,0x30($out)
2397 lea 0x40($out),$out
2398 movaps $inout4,$inout0
2399 sub \$0x50,$len
2400 jmp .Lcbc_dec_tail_collected
2401.align 16
2402.Lcbc_dec_six:
2403 call _aesni_decrypt6
2404 movups 0x10($inp),$rndkey1
2405 movups 0x20($inp),$rndkey0
2406 xorps $iv,$inout0
2407 xorps $in0,$inout1
2408 xorps $rndkey1,$inout2
2409 movups 0x30($inp),$rndkey1
2410 xorps $rndkey0,$inout3
2411 movups 0x40($inp),$rndkey0
2412 xorps $rndkey1,$inout4
2413 movups 0x50($inp),$iv
2414 xorps $rndkey0,$inout5
2415 movups $inout0,($out)
2416 movups $inout1,0x10($out)
2417 movups $inout2,0x20($out)
2418 movups $inout3,0x30($out)
2419 movups $inout4,0x40($out)
2420 lea 0x50($out),$out
2421 movaps $inout5,$inout0
2422 sub \$0x60,$len
2423 jmp .Lcbc_dec_tail_collected
2424.align 16
2425.Lcbc_dec_tail_collected:
2426 and \$15,$len
2427 movups $iv,($ivp)
2428 jnz .Lcbc_dec_tail_partial
2429 movups $inout0,($out)
2430 jmp .Lcbc_dec_ret
2431.align 16
2432.Lcbc_dec_tail_partial:
2433 movaps $inout0,$reserved(%rsp)
2434 mov \$16,%rcx
2435 mov $out,%rdi
2436 sub $len,%rcx
2437 lea $reserved(%rsp),%rsi
2438 .long 0x9066A4F3 # rep movsb
2439
2440.Lcbc_dec_ret:
2441___
2442$code.=<<___ if ($win64);
2443 movaps (%rsp),%xmm6
2444 movaps 0x10(%rsp),%xmm7
2445 movaps 0x20(%rsp),%xmm8
2446 movaps 0x30(%rsp),%xmm9
2447 lea 0x58(%rsp),%rsp
2448___
2449$code.=<<___;
2450.Lcbc_ret:
2451 ret
2452.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
2453___
2454}
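The decrypt side has to keep each ciphertext block around until after it has been decrypted, since the block is the chaining value for its successor; that is why the code stages the IV in $reserved(%rsp) and copies the last-loaded input block into $iv (movaps $rndkey0,$iv) before the registers are overwritten, and why a partial final block is parked on the stack and copied out with rep movsb. A C sketch of the chaining logic, again with a hypothetical aes_decrypt_block():

#include <stdint.h>
#include <string.h>

/* Hypothetical one-block primitive standing in for the aesdec chain. */
void aes_decrypt_block(unsigned char blk[16], const void *key);

static void cbc_decrypt_sketch(const unsigned char *in, unsigned char *out,
    size_t len, const void *key, unsigned char iv[16])
{
	unsigned char blk[16], next_iv[16];
	size_t i;

	while (len >= 16) {
		memcpy(next_iv, in, 16);	/* save ciphertext first */
		memcpy(blk, in, 16);
		aes_decrypt_block(blk, key);
		for (i = 0; i < 16; i++)
			blk[i] ^= iv[i];	/* undo the chaining xor */
		memcpy(iv, next_iv, 16);
		memcpy(out, blk, 16);
		in += 16; out += 16; len -= 16;
	}
}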
2455# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
2456# int bits, AES_KEY *key)
2457{ my ($inp,$bits,$key) = @_4args;
2458 $bits =~ s/%r/%e/;
2459
2460$code.=<<___;
2461.globl ${PREFIX}_set_decrypt_key
2462.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
2463.align 16
2464${PREFIX}_set_decrypt_key:
2465 sub \$8,%rsp
2466 call __aesni_set_encrypt_key
2467 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
2468 test %eax,%eax
2469 jnz .Ldec_key_ret
2470 lea 16($key,$bits),$inp # points at the end of key schedule
2471
2472 $movkey ($key),%xmm0 # just swap
2473 $movkey ($inp),%xmm1
2474 $movkey %xmm0,($inp)
2475 $movkey %xmm1,($key)
2476 lea 16($key),$key
2477 lea -16($inp),$inp
2478
2479.Ldec_key_inverse:
2480 $movkey ($key),%xmm0 # swap and inverse
2481 $movkey ($inp),%xmm1
2482 aesimc %xmm0,%xmm0
2483 aesimc %xmm1,%xmm1
2484 lea 16($key),$key
2485 lea -16($inp),$inp
2486 $movkey %xmm0,16($inp)
2487 $movkey %xmm1,-16($key)
2488 cmp $key,$inp
2489 ja .Ldec_key_inverse
2490
2491 $movkey ($key),%xmm0 # inverse middle
2492 aesimc %xmm0,%xmm0
2493 $movkey %xmm0,($inp)
2494.Ldec_key_ret:
2495 add \$8,%rsp
2496 ret
2497.LSEH_end_set_decrypt_key:
2498.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
2499___
2500
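${PREFIX}_set_decrypt_key derives the decryption schedule from the encryption one, as AES-NI's Equivalent Inverse Cipher requires: the round keys are reversed end-for-end, and every key except the outermost two is run through aesimc (InvMixColumns). A C intrinsics sketch of the same transform:

#include <wmmintrin.h>	/* AES-NI intrinsics; compile with -maes */

/* Sketch of the swap-and-invert loop above: reverse the round keys and
 * apply InvMixColumns to all but the first and last. */
static void invert_key_schedule(__m128i *rk, int rounds)
{
	int i, j;

	for (i = 0, j = rounds; i < j; i++, j--) {
		__m128i lo = rk[i], hi = rk[j];

		rk[i] = (i == 0) ? hi : _mm_aesimc_si128(hi);
		rk[j] = (j == rounds) ? lo : _mm_aesimc_si128(lo);
	}
	rk[i] = _mm_aesimc_si128(rk[i]);	/* middle round key */
}

Since rounds is even (10, 12 or 14), the loop always exits with i == j, so the middle key is transformed unconditionally, matching the "inverse middle" step above.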
2501# This is based on a submission by
2502#
2503# Huang Ying <ying.huang@intel.com>
2504# Vinodh Gopal <vinodh.gopal@intel.com>
2505# Kahraman Akdemir
2506#
2507# Aggressively optimized with respect to aeskeygenassist's critical path
2508# and confined to %xmm0-5 to meet the Win64 ABI requirement.
2509#
2510$code.=<<___;
2511.globl ${PREFIX}_set_encrypt_key
2512.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
2513.align 16
2514${PREFIX}_set_encrypt_key:
2515__aesni_set_encrypt_key:
2516 sub \$8,%rsp
2517 mov \$-1,%rax
2518 test $inp,$inp
2519 jz .Lenc_key_ret
2520 test $key,$key
2521 jz .Lenc_key_ret
2522
2523 movups ($inp),%xmm0 # pull first 128 bits of *userKey
2524 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
2525 lea 16($key),%rax
2526 cmp \$256,$bits
2527 je .L14rounds
2528 cmp \$192,$bits
2529 je .L12rounds
2530 cmp \$128,$bits
2531 jne .Lbad_keybits
2532
2533.L10rounds:
2534 mov \$9,$bits # 10 rounds for 128-bit key
2535 $movkey %xmm0,($key) # round 0
2536 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
2537 call .Lkey_expansion_128_cold
2538 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
2539 call .Lkey_expansion_128
2540 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
2541 call .Lkey_expansion_128
2542 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
2543 call .Lkey_expansion_128
2544 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
2545 call .Lkey_expansion_128
2546 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
2547 call .Lkey_expansion_128
2548 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
2549 call .Lkey_expansion_128
2550 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
2551 call .Lkey_expansion_128
2552 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
2553 call .Lkey_expansion_128
2554 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
2555 call .Lkey_expansion_128
2556 $movkey %xmm0,(%rax)
2557 mov $bits,80(%rax) # 240(%rdx)
2558 xor %eax,%eax
2559 jmp .Lenc_key_ret
2560
2561.align 16
2562.L12rounds:
2563 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
2564 mov \$11,$bits # 12 rounds for 192
2565 $movkey %xmm0,($key) # round 0
2566 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
2567 call .Lkey_expansion_192a_cold
2568 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
2569 call .Lkey_expansion_192b
2570 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
2571 call .Lkey_expansion_192a
2572 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
2573 call .Lkey_expansion_192b
2574 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
2575 call .Lkey_expansion_192a
2576 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
2577 call .Lkey_expansion_192b
2578 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
2579 call .Lkey_expansion_192a
2580 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
2581 call .Lkey_expansion_192b
2582 $movkey %xmm0,(%rax)
2583 mov $bits,48(%rax) # 240(%rdx)
2584	xor	%rax,%rax
2585 jmp .Lenc_key_ret
2586
2587.align 16
2588.L14rounds:
2589	movups	16($inp),%xmm2			# remaining half of *userKey
2590 mov \$13,$bits # 14 rounds for 256
2591 lea 16(%rax),%rax
2592 $movkey %xmm0,($key) # round 0
2593 $movkey %xmm2,16($key) # round 1
2594 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
2595 call .Lkey_expansion_256a_cold
2596 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
2597 call .Lkey_expansion_256b
2598 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
2599 call .Lkey_expansion_256a
2600 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
2601 call .Lkey_expansion_256b
2602 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
2603 call .Lkey_expansion_256a
2604 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
2605 call .Lkey_expansion_256b
2606 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
2607 call .Lkey_expansion_256a
2608 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
2609 call .Lkey_expansion_256b
2610 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
2611 call .Lkey_expansion_256a
2612 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
2613 call .Lkey_expansion_256b
2614 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
2615 call .Lkey_expansion_256a
2616 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
2617 call .Lkey_expansion_256b
2618 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
2619 call .Lkey_expansion_256a
2620 $movkey %xmm0,(%rax)
2621 mov $bits,16(%rax) # 240(%rdx)
2622 xor %rax,%rax
2623 jmp .Lenc_key_ret
2624
2625.align 16
2626.Lbad_keybits:
2627 mov \$-2,%rax
2628.Lenc_key_ret:
2629 add \$8,%rsp
2630 ret
2631.LSEH_end_set_encrypt_key:
2632
2633.align 16
2634.Lkey_expansion_128:
2635 $movkey %xmm0,(%rax)
2636 lea 16(%rax),%rax
2637.Lkey_expansion_128_cold:
2638 shufps \$0b00010000,%xmm0,%xmm4
2639 xorps %xmm4, %xmm0
2640 shufps \$0b10001100,%xmm0,%xmm4
2641 xorps %xmm4, %xmm0
2642 shufps \$0b11111111,%xmm1,%xmm1 # critical path
2643 xorps %xmm1,%xmm0
2644 ret
2645
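.Lkey_expansion_128 finishes one AES-128 expansion round: the two shufps/xorps pairs (with %xmm4 as the zeroed helper) accumulate the running xor of the round key's four words, and the 0b11111111 shuffle broadcasts the RotWord/SubWord lane that aeskeygenassist produced. An equivalent C intrinsics sketch, using byte shifts for the running xor:

#include <wmmintrin.h>	/* compile with -maes */

/* One AES-128 key-expansion step: fold the previous round key into
 * itself (the same running xor that the shufps pair above computes)
 * and xor in the broadcast aeskeygenassist lane. */
static __m128i expand_step128(__m128i key, __m128i assist)
{
	assist = _mm_shuffle_epi32(assist, 0xff);	/* broadcast lane 3 */
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	return _mm_xor_si128(key, assist);
}

/* aeskeygenassist requires an immediate round constant, hence a macro: */
#define EXPAND128(k, rcon) \
	expand_step128((k), _mm_aeskeygenassist_si128((k), (rcon)))

Successive steps with round constants 0x01, 0x02, 0x04, ..., 0x1b, 0x36 reproduce the ten rounds emitted in .L10rounds above.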
2646.align 16
2647.Lkey_expansion_192a:
2648 $movkey %xmm0,(%rax)
2649 lea 16(%rax),%rax
2650.Lkey_expansion_192a_cold:
2651 movaps %xmm2, %xmm5
2652.Lkey_expansion_192b_warm:
2653 shufps \$0b00010000,%xmm0,%xmm4
2654 movdqa %xmm2,%xmm3
2655 xorps %xmm4,%xmm0
2656 shufps \$0b10001100,%xmm0,%xmm4
2657 pslldq \$4,%xmm3
2658 xorps %xmm4,%xmm0
2659 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
2660 pxor %xmm3,%xmm2
2661 pxor %xmm1,%xmm0
2662 pshufd \$0b11111111,%xmm0,%xmm3
2663 pxor %xmm3,%xmm2
2664 ret
2665
2666.align 16
2667.Lkey_expansion_192b:
2668 movaps %xmm0,%xmm3
2669 shufps \$0b01000100,%xmm0,%xmm5
2670 $movkey %xmm5,(%rax)
2671 shufps \$0b01001110,%xmm2,%xmm3
2672 $movkey %xmm3,16(%rax)
2673 lea 32(%rax),%rax
2674 jmp .Lkey_expansion_192b_warm
2675
2676.align 16
2677.Lkey_expansion_256a:
2678 $movkey %xmm2,(%rax)
2679 lea 16(%rax),%rax
2680.Lkey_expansion_256a_cold:
2681 shufps \$0b00010000,%xmm0,%xmm4
2682 xorps %xmm4,%xmm0
2683 shufps \$0b10001100,%xmm0,%xmm4
2684 xorps %xmm4,%xmm0
2685 shufps \$0b11111111,%xmm1,%xmm1 # critical path
2686 xorps %xmm1,%xmm0
2687 ret
2688
2689.align 16
2690.Lkey_expansion_256b:
2691 $movkey %xmm0,(%rax)
2692 lea 16(%rax),%rax
2693
2694 shufps \$0b00010000,%xmm2,%xmm4
2695 xorps %xmm4,%xmm2
2696 shufps \$0b10001100,%xmm2,%xmm4
2697 xorps %xmm4,%xmm2
2698 shufps \$0b10101010,%xmm1,%xmm1 # critical path
2699 xorps %xmm1,%xmm2
2700 ret
2701.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
2702.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
2703___
2704}
2705
2706$code.=<<___;
2707.align 64
2708.Lbswap_mask:
2709 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
2710.Lincrement32:
2711 .long 6,6,6,0
2712.Lincrement64:
2713 .long 1,0,0,0
2714.Lxts_magic:
2715 .long 0x87,0,1,0
2716
2717.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
2718.align 64
2719___
2720
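These constants back the counter and tweak arithmetic: .Lbswap_mask byte-reverses a block so the big-endian CTR counter can be bumped with ordinary paddd/paddq arithmetic, .Lincrement32 (.long 6,6,6,0) steps counter lanes by six at a time, matching the six-block interleave of the ctr32 loop, .Lincrement64 bumps CCM's 64-bit counter by one, and .Lxts_magic is the XTS reduction polynomial discussed earlier. A scalar sketch of the counter update that the bswap/paddd pair performs:

#include <stdint.h>

/* Advance the 32-bit big-endian counter in the last four bytes of a
 * CTR block by n; the assembly does the byte-order dance with
 * .Lbswap_mask and paddd inside an XMM register instead. */
static void ctr32_add(unsigned char ivec[16], uint32_t n)
{
	uint32_t c = ((uint32_t)ivec[12] << 24) | ((uint32_t)ivec[13] << 16) |
	    ((uint32_t)ivec[14] << 8) | (uint32_t)ivec[15];

	c += n;
	ivec[12] = (unsigned char)(c >> 24);
	ivec[13] = (unsigned char)(c >> 16);
	ivec[14] = (unsigned char)(c >> 8);
	ivec[15] = (unsigned char)c;
}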
2721# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2722# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2723if ($win64) {
2724$rec="%rcx";
2725$frame="%rdx";
2726$context="%r8";
2727$disp="%r9";
2728
2729$code.=<<___;
2730.extern __imp_RtlVirtualUnwind
2731___
2732$code.=<<___ if ($PREFIX eq "aesni");
2733.type ecb_se_handler,\@abi-omnipotent
2734.align 16
2735ecb_se_handler:
2736 push %rsi
2737 push %rdi
2738 push %rbx
2739 push %rbp
2740 push %r12
2741 push %r13
2742 push %r14
2743 push %r15
2744 pushfq
2745 sub \$64,%rsp
2746
2747 mov 152($context),%rax # pull context->Rsp
2748
2749 jmp .Lcommon_seh_tail
2750.size ecb_se_handler,.-ecb_se_handler
2751
2752.type ccm64_se_handler,\@abi-omnipotent
2753.align 16
2754ccm64_se_handler:
2755 push %rsi
2756 push %rdi
2757 push %rbx
2758 push %rbp
2759 push %r12
2760 push %r13
2761 push %r14
2762 push %r15
2763 pushfq
2764 sub \$64,%rsp
2765
2766 mov 120($context),%rax # pull context->Rax
2767 mov 248($context),%rbx # pull context->Rip
2768
2769 mov 8($disp),%rsi # disp->ImageBase
2770 mov 56($disp),%r11 # disp->HandlerData
2771
2772 mov 0(%r11),%r10d # HandlerData[0]
2773 lea (%rsi,%r10),%r10 # prologue label
2774 cmp %r10,%rbx # context->Rip<prologue label
2775 jb .Lcommon_seh_tail
2776
2777 mov 152($context),%rax # pull context->Rsp
2778
2779 mov 4(%r11),%r10d # HandlerData[1]
2780 lea (%rsi,%r10),%r10 # epilogue label
2781 cmp %r10,%rbx # context->Rip>=epilogue label
2782 jae .Lcommon_seh_tail
2783
2784 lea 0(%rax),%rsi # %xmm save area
2785 lea 512($context),%rdi # &context.Xmm6
2786 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
2787 .long 0xa548f3fc # cld; rep movsq
2788 lea 0x58(%rax),%rax # adjust stack pointer
2789
2790 jmp .Lcommon_seh_tail
2791.size ccm64_se_handler,.-ccm64_se_handler
2792
2793.type ctr32_se_handler,\@abi-omnipotent
2794.align 16
2795ctr32_se_handler:
2796 push %rsi
2797 push %rdi
2798 push %rbx
2799 push %rbp
2800 push %r12
2801 push %r13
2802 push %r14
2803 push %r15
2804 pushfq
2805 sub \$64,%rsp
2806
2807 mov 120($context),%rax # pull context->Rax
2808 mov 248($context),%rbx # pull context->Rip
2809
2810 lea .Lctr32_body(%rip),%r10
2811 cmp %r10,%rbx # context->Rip<"prologue" label
2812 jb .Lcommon_seh_tail
2813
2814 mov 152($context),%rax # pull context->Rsp
2815
2816 lea .Lctr32_ret(%rip),%r10
2817 cmp %r10,%rbx
2818 jae .Lcommon_seh_tail
2819
2820 lea 0x20(%rax),%rsi # %xmm save area
2821 lea 512($context),%rdi # &context.Xmm6
2822 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2823 .long 0xa548f3fc # cld; rep movsq
2824 lea 0xc8(%rax),%rax # adjust stack pointer
2825
2826 jmp .Lcommon_seh_tail
2827.size ctr32_se_handler,.-ctr32_se_handler
2828
2829.type xts_se_handler,\@abi-omnipotent
2830.align 16
2831xts_se_handler:
2832 push %rsi
2833 push %rdi
2834 push %rbx
2835 push %rbp
2836 push %r12
2837 push %r13
2838 push %r14
2839 push %r15
2840 pushfq
2841 sub \$64,%rsp
2842
2843 mov 120($context),%rax # pull context->Rax
2844 mov 248($context),%rbx # pull context->Rip
2845
2846 mov 8($disp),%rsi # disp->ImageBase
2847 mov 56($disp),%r11 # disp->HandlerData
2848
2849 mov 0(%r11),%r10d # HandlerData[0]
2850	lea	(%rsi,%r10),%r10	# prologue label
2851 cmp %r10,%rbx # context->Rip<prologue label
2852 jb .Lcommon_seh_tail
2853
2854 mov 152($context),%rax # pull context->Rsp
2855
2856 mov 4(%r11),%r10d # HandlerData[1]
2857 lea (%rsi,%r10),%r10 # epilogue label
2858 cmp %r10,%rbx # context->Rip>=epilogue label
2859 jae .Lcommon_seh_tail
2860
2861 lea 0x60(%rax),%rsi # %xmm save area
2862	lea	512($context),%rdi	# &context.Xmm6
2863 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2864 .long 0xa548f3fc # cld; rep movsq
2865 lea 0x68+160(%rax),%rax # adjust stack pointer
2866
2867 jmp .Lcommon_seh_tail
2868.size xts_se_handler,.-xts_se_handler
2869___
2870$code.=<<___;
2871.type cbc_se_handler,\@abi-omnipotent
2872.align 16
2873cbc_se_handler:
2874 push %rsi
2875 push %rdi
2876 push %rbx
2877 push %rbp
2878 push %r12
2879 push %r13
2880 push %r14
2881 push %r15
2882 pushfq
2883 sub \$64,%rsp
2884
2885 mov 152($context),%rax # pull context->Rsp
2886 mov 248($context),%rbx # pull context->Rip
2887
2888 lea .Lcbc_decrypt(%rip),%r10
2889 cmp %r10,%rbx # context->Rip<"prologue" label
2890 jb .Lcommon_seh_tail
2891
2892 lea .Lcbc_decrypt_body(%rip),%r10
2893 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
2894 jb .Lrestore_cbc_rax
2895
2896 lea .Lcbc_ret(%rip),%r10
2897 cmp %r10,%rbx # context->Rip>="epilogue" label
2898 jae .Lcommon_seh_tail
2899
2900 lea 0(%rax),%rsi # top of stack
2901 lea 512($context),%rdi # &context.Xmm6
2902 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
2903 .long 0xa548f3fc # cld; rep movsq
2904 lea 0x58(%rax),%rax # adjust stack pointer
2905 jmp .Lcommon_seh_tail
2906
2907.Lrestore_cbc_rax:
2908 mov 120($context),%rax
2909
2910.Lcommon_seh_tail:
2911 mov 8(%rax),%rdi
2912 mov 16(%rax),%rsi
2913 mov %rax,152($context) # restore context->Rsp
2914 mov %rsi,168($context) # restore context->Rsi
2915 mov %rdi,176($context) # restore context->Rdi
2916
2917 mov 40($disp),%rdi # disp->ContextRecord
2918 mov $context,%rsi # context
2919 mov \$154,%ecx # sizeof(CONTEXT)
2920 .long 0xa548f3fc # cld; rep movsq
2921
2922 mov $disp,%rsi
2923 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2924 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2925 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2926 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2927 mov 40(%rsi),%r10 # disp->ContextRecord
2928 lea 56(%rsi),%r11 # &disp->HandlerData
2929 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2930 mov %r10,32(%rsp) # arg5
2931 mov %r11,40(%rsp) # arg6
2932 mov %r12,48(%rsp) # arg7
2933 mov %rcx,56(%rsp) # arg8, (NULL)
2934 call *__imp_RtlVirtualUnwind(%rip)
2935
2936 mov \$1,%eax # ExceptionContinueSearch
2937 add \$64,%rsp
2938 popfq
2939 pop %r15
2940 pop %r14
2941 pop %r13
2942 pop %r12
2943 pop %rbp
2944 pop %rbx
2945 pop %rdi
2946 pop %rsi
2947 ret
2948.size cbc_se_handler,.-cbc_se_handler
2949
2950.section .pdata
2951.align 4
2952___
2953$code.=<<___ if ($PREFIX eq "aesni");
2954 .rva .LSEH_begin_aesni_ecb_encrypt
2955 .rva .LSEH_end_aesni_ecb_encrypt
2956 .rva .LSEH_info_ecb
2957
2958 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
2959 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
2960 .rva .LSEH_info_ccm64_enc
2961
2962 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
2963 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
2964 .rva .LSEH_info_ccm64_dec
2965
2966 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
2967 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
2968 .rva .LSEH_info_ctr32
2969
2970 .rva .LSEH_begin_aesni_xts_encrypt
2971 .rva .LSEH_end_aesni_xts_encrypt
2972 .rva .LSEH_info_xts_enc
2973
2974 .rva .LSEH_begin_aesni_xts_decrypt
2975 .rva .LSEH_end_aesni_xts_decrypt
2976 .rva .LSEH_info_xts_dec
2977___
2978$code.=<<___;
2979 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
2980 .rva .LSEH_end_${PREFIX}_cbc_encrypt
2981 .rva .LSEH_info_cbc
2982
2983 .rva ${PREFIX}_set_decrypt_key
2984 .rva .LSEH_end_set_decrypt_key
2985 .rva .LSEH_info_key
2986
2987 .rva ${PREFIX}_set_encrypt_key
2988 .rva .LSEH_end_set_encrypt_key
2989 .rva .LSEH_info_key
2990.section .xdata
2991.align 8
2992___
2993$code.=<<___ if ($PREFIX eq "aesni");
2994.LSEH_info_ecb:
2995 .byte 9,0,0,0
2996 .rva ecb_se_handler
2997.LSEH_info_ccm64_enc:
2998 .byte 9,0,0,0
2999 .rva ccm64_se_handler
3000 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
3001.LSEH_info_ccm64_dec:
3002 .byte 9,0,0,0
3003 .rva ccm64_se_handler
3004 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
3005.LSEH_info_ctr32:
3006 .byte 9,0,0,0
3007 .rva ctr32_se_handler
3008.LSEH_info_xts_enc:
3009 .byte 9,0,0,0
3010 .rva xts_se_handler
3011 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3012.LSEH_info_xts_dec:
3013 .byte 9,0,0,0
3014 .rva xts_se_handler
3015 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3016___
3017$code.=<<___;
3018.LSEH_info_cbc:
3019 .byte 9,0,0,0
3020 .rva cbc_se_handler
3021.LSEH_info_key:
3022 .byte 0x01,0x04,0x01,0x00
3023 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
3024___
3025}
3026
3027sub rex {
3028 local *opcode=shift;
3029 my ($dst,$src)=@_;
3030 my $rex=0;
3031
3032 $rex|=0x04 if($dst>=8);
3033 $rex|=0x01 if($src>=8);
3034 push @opcode,$rex|0x40 if($rex);
3035}
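The rex() helper computes the optional REX prefix when hand-encoding instructions that reference extended registers: bit 2 (REX.R) extends the ModRM reg field for %xmm8-15 destinations and bit 0 (REX.B) the rm field for sources, and a 0x40-based prefix byte is pushed only when one of them is set. The same computation in C:

/* REX prefix layout is 0100WRXB; only REX.R (reg-field extension) and
 * REX.B (rm-field extension) can be needed here.  Returns 0 when no
 * prefix byte should be emitted. */
static int rex_prefix(int dst, int src)
{
	int rex = ((dst >= 8) << 2) | (src >= 8);

	return rex ? 0x40 | rex : 0;
}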
3036
3037$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3038
3039print $code;
3040
3041close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
deleted file mode 100644
index 41b90f0844..0000000000
--- a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
+++ /dev/null
@@ -1,3108 +0,0 @@
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode] ###
5### bitsliced implementation for Intel Core 2 processors ###
6### requires support of SSE extensions up to SSSE3 ###
7### Author: Emilia Käsper and Peter Schwabe ###
8### Date: 2009-03-19 ###
9### Public domain ###
10### ###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12### further information. ###
13###################################################################
14#
15# September 2011.
16#
17# Started as a transliteration to "perlasm", the original code has
18# undergone the following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop, resulting in a >5x size reduction
22# from 12.5KB to 2.2KB;
23# - the above was possible thanks to a mixcolumns() modification that
24# allowed its output to be fed back to aesenc[last]; this was
25# achieved at the cost of two additional inter-register moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement a key setup subroutine; instead it
28# relies on conversion of the "conventional" key schedule as returned
29# by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which made it
31# possible to skip one shiftrows(), reduce the bit-sliced key schedule
32# and speed up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38# Emilia's this(*) difference
39#
40# Core 2 9.30 8.69 +7%
41# Nehalem(**) 7.63 6.98 +9%
42# Atom 17.1 17.4 -2%(***)
43#
44# (*) The comparison is not completely fair, because "this" is ECB,
45# i.e. no extra processing such as counter value calculation
46# and xor-ing of the input, as in Emilia's CTR implementation,
47# is performed. However, the CTR calculations account for no more
48# than 1% of total time, so the comparison is *rather* fair.
49#
50# (**) Results were collected on Westmere, which is considered to
51# be equivalent to Nehalem for this code.
52#
53# (***) The slowdown on Atom is rather strange per se, because the
54# original implementation has a number of 9+-byte instructions,
55# which are bad for the Atom front-end and which I eliminated
56# completely. In an attempt to address the deterioration, sbox()
57# was tested in the FP SIMD "domain" (movaps instead of movdqa,
58# xorps instead of pxor, etc.). While it resulted in a nominal 4%
59# improvement on Atom, it hurt Westmere by more than a 2x factor.
60#
61# As for the key schedule conversion subroutine: the interface to
62# OpenSSL relies on per-invocation on-the-fly conversion. This
63# naturally has an impact on performance, especially for short inputs.
64# Conversion time in CPU cycles and its ratio to CPU cycles spent in
65# the 8x block function is:
66#
67# conversion conversion/8x block
68# Core 2 240 0.22
69# Nehalem 180 0.20
70# Atom 430 0.19
71#
72# The ratio values mean that 128-byte blocks will be processed
73# 16-18% slower, 256-byte blocks 9-10%, 384-byte blocks 6-7%,
74# etc. Keep in mind also that input sizes not divisible by 128 are
75# *effectively* slower, especially the shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272-byte ones 29%, 400-byte ones 22%, etc. Yet, despite all these
78# "shortcomings" it's still faster than the ["hyper-threading-safe"
79# code path in] aes-x86_64.pl on all lengths above 64 bytes...
80#
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2 9.83
87# Nehalem 7.74
88# Atom 19.0
89#
90# November 2011.
91#
92# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93# suboptimal, but XTS is meant to be used with larger blocks...
94#
95# <appro@openssl.org>
96
97$flavour = shift;
98$output = shift;
99if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100
101$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102
103$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106die "can't locate x86_64-xlate.pl";
107
108open OUT,"| \"$^X\" $xlate $flavour $output";
109*STDOUT=*OUT;
110
111my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
112my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
113my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
114
115{
116my ($key,$rounds,$const)=("%rax","%r10d","%r11");
117
118sub Sbox {
119# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
120# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
121my @b=@_[0..7];
122my @t=@_[8..11];
123my @s=@_[12..15];
124 &InBasisChange (@b);
125 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
126 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
127}
128
129sub InBasisChange {
130# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
131# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
132my @b=@_[0..7];
133$code.=<<___;
134 pxor @b[6], @b[5]
135 pxor @b[1], @b[2]
136 pxor @b[0], @b[3]
137 pxor @b[2], @b[6]
138 pxor @b[0], @b[5]
139
140 pxor @b[3], @b[6]
141 pxor @b[7], @b[3]
142 pxor @b[5], @b[7]
143 pxor @b[4], @b[3]
144 pxor @b[5], @b[4]
145 pxor @b[1], @b[3]
146
147 pxor @b[7], @b[2]
148 pxor @b[5], @b[1]
149___
150}
151
152sub OutBasisChange {
153# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
154# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
155my @b=@_[0..7];
156$code.=<<___;
157 pxor @b[6], @b[0]
158 pxor @b[4], @b[1]
159 pxor @b[0], @b[2]
160 pxor @b[6], @b[4]
161 pxor @b[1], @b[6]
162
163 pxor @b[5], @b[1]
164 pxor @b[3], @b[5]
165 pxor @b[7], @b[3]
166 pxor @b[5], @b[7]
167 pxor @b[5], @b[2]
168
169 pxor @b[7], @b[4]
170___
171}
172
173sub InvSbox {
174# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
175# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
176my @b=@_[0..7];
177my @t=@_[8..11];
178my @s=@_[12..15];
179 &InvInBasisChange (@b);
180 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
181 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
182}
183
184sub InvInBasisChange { # OutBasisChange in reverse
185my @b=@_[5,1,2,6,3,7,0,4];
186$code.=<<___
187 pxor @b[7], @b[4]
188
189 pxor @b[5], @b[7]
190 pxor @b[5], @b[2]
191 pxor @b[7], @b[3]
192 pxor @b[3], @b[5]
193 pxor @b[5], @b[1]
194
195 pxor @b[1], @b[6]
196 pxor @b[0], @b[2]
197 pxor @b[6], @b[4]
198 pxor @b[6], @b[0]
199 pxor @b[4], @b[1]
200___
201}
202
203sub InvOutBasisChange { # InBasisChange in reverse
204my @b=@_[2,5,7,3,6,1,0,4];
205$code.=<<___;
206 pxor @b[5], @b[1]
207 pxor @b[7], @b[2]
208
209 pxor @b[1], @b[3]
210 pxor @b[5], @b[4]
211 pxor @b[5], @b[7]
212 pxor @b[4], @b[3]
213 pxor @b[0], @b[5]
214 pxor @b[7], @b[3]
215 pxor @b[2], @b[6]
216 pxor @b[1], @b[2]
217 pxor @b[3], @b[6]
218
219 pxor @b[0], @b[3]
220 pxor @b[6], @b[5]
221___
222}
223
224sub Mul_GF4 {
225#;*************************************************************
226#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
227#;*************************************************************
228my ($x0,$x1,$y0,$y1,$t0)=@_;
229$code.=<<___;
230 movdqa $y0, $t0
231 pxor $y1, $t0
232 pand $x0, $t0
233 pxor $x1, $x0
234 pand $y0, $x1
235 pand $y1, $x0
236 pxor $x1, $x0
237 pxor $t0, $x1
238___
239}
240
241sub Mul_GF4_N { # not used, see next subroutine
242# multiply and scale by N
243my ($x0,$x1,$y0,$y1,$t0)=@_;
244$code.=<<___;
245 movdqa $y0, $t0
246 pxor $y1, $t0
247 pand $x0, $t0
248 pxor $x1, $x0
249 pand $y0, $x1
250 pand $y1, $x0
251 pxor $x0, $x1
252 pxor $t0, $x0
253___
254}
255
256sub Mul_GF4_N_GF4 {
257# interleaved Mul_GF4_N and Mul_GF4
258my ($x0,$x1,$y0,$y1,$t0,
259 $x2,$x3,$y2,$y3,$t1)=@_;
260$code.=<<___;
261 movdqa $y0, $t0
262 movdqa $y2, $t1
263 pxor $y1, $t0
264 pxor $y3, $t1
265 pand $x0, $t0
266 pand $x2, $t1
267 pxor $x1, $x0
268 pxor $x3, $x2
269 pand $y0, $x1
270 pand $y2, $x3
271 pand $y1, $x0
272 pand $y3, $x2
273 pxor $x0, $x1
274 pxor $x3, $x2
275 pxor $t0, $x0
276 pxor $t1, $x3
277___
278}
279sub Mul_GF16_2 {
280my @x=@_[0..7];
281my @y=@_[8..11];
282my @t=@_[12..15];
283$code.=<<___;
284 movdqa @x[0], @t[0]
285 movdqa @x[1], @t[1]
286___
287 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
288$code.=<<___;
289 pxor @x[2], @t[0]
290 pxor @x[3], @t[1]
291 pxor @y[2], @y[0]
292 pxor @y[3], @y[1]
293___
294 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
295 @x[2], @x[3], @y[2], @y[3], @t[2]);
296$code.=<<___;
297 pxor @t[0], @x[0]
298 pxor @t[0], @x[2]
299 pxor @t[1], @x[1]
300 pxor @t[1], @x[3]
301
302 movdqa @x[4], @t[0]
303 movdqa @x[5], @t[1]
304 pxor @x[6], @t[0]
305 pxor @x[7], @t[1]
306___
307 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
308 @x[6], @x[7], @y[2], @y[3], @t[2]);
309$code.=<<___;
310 pxor @y[2], @y[0]
311 pxor @y[3], @y[1]
312___
313 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
314$code.=<<___;
315 pxor @t[0], @x[4]
316 pxor @t[0], @x[6]
317 pxor @t[1], @x[5]
318 pxor @t[1], @x[7]
319___
320}
321sub Inv_GF256 {
322#;********************************************************************
323#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
324#;********************************************************************
325my @x=@_[0..7];
326my @t=@_[8..11];
327my @s=@_[12..15];
328# direct optimizations from hardware
329$code.=<<___;
330 movdqa @x[4], @t[3]
331 movdqa @x[5], @t[2]
332 movdqa @x[1], @t[1]
333 movdqa @x[7], @s[1]
334 movdqa @x[0], @s[0]
335
336 pxor @x[6], @t[3]
337 pxor @x[7], @t[2]
338 pxor @x[3], @t[1]
339 movdqa @t[3], @s[2]
340 pxor @x[6], @s[1]
341 movdqa @t[2], @t[0]
342 pxor @x[2], @s[0]
343 movdqa @t[3], @s[3]
344
345 por @t[1], @t[2]
346 por @s[0], @t[3]
347 pxor @t[0], @s[3]
348 pand @s[0], @s[2]
349 pxor @t[1], @s[0]
350 pand @t[1], @t[0]
351 pand @s[0], @s[3]
352 movdqa @x[3], @s[0]
353 pxor @x[2], @s[0]
354 pand @s[0], @s[1]
355 pxor @s[1], @t[3]
356 pxor @s[1], @t[2]
357 movdqa @x[4], @s[1]
358 movdqa @x[1], @s[0]
359 pxor @x[5], @s[1]
360 pxor @x[0], @s[0]
361 movdqa @s[1], @t[1]
362 pand @s[0], @s[1]
363 por @s[0], @t[1]
364 pxor @s[1], @t[0]
365 pxor @s[3], @t[3]
366 pxor @s[2], @t[2]
367 pxor @s[3], @t[1]
368 movdqa @x[7], @s[0]
369 pxor @s[2], @t[0]
370 movdqa @x[6], @s[1]
371 pxor @s[2], @t[1]
372 movdqa @x[5], @s[2]
373 pand @x[3], @s[0]
374 movdqa @x[4], @s[3]
375 pand @x[2], @s[1]
376 pand @x[1], @s[2]
377 por @x[0], @s[3]
378 pxor @s[0], @t[3]
379 pxor @s[1], @t[2]
380 pxor @s[2], @t[1]
381 pxor @s[3], @t[0]
382
383 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
384
385 # new smaller inversion
386
387 movdqa @t[3], @s[0]
388 pand @t[1], @t[3]
389 pxor @t[2], @s[0]
390
391 movdqa @t[0], @s[2]
392 movdqa @s[0], @s[3]
393 pxor @t[3], @s[2]
394 pand @s[2], @s[3]
395
396 movdqa @t[1], @s[1]
397 pxor @t[2], @s[3]
398 pxor @t[0], @s[1]
399
400 pxor @t[2], @t[3]
401
402 pand @t[3], @s[1]
403
404 movdqa @s[2], @t[2]
405 pxor @t[0], @s[1]
406
407 pxor @s[1], @t[2]
408 pxor @s[1], @t[1]
409
410 pand @t[0], @t[2]
411
412 pxor @t[2], @s[2]
413 pxor @t[2], @t[1]
414
415 pand @s[3], @s[2]
416
417 pxor @s[0], @s[2]
418___
419# output in s3, s2, s1, t1
420
421# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
422
423# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
424 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
425
426### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
427}
428
429# AES linear components
430
431sub ShiftRows {
432my @x=@_[0..7];
433my $mask=pop;
434$code.=<<___;
435 pxor 0x00($key),@x[0]
436 pxor 0x10($key),@x[1]
437 pshufb $mask,@x[0]
438 pxor 0x20($key),@x[2]
439 pshufb $mask,@x[1]
440 pxor 0x30($key),@x[3]
441 pshufb $mask,@x[2]
442 pxor 0x40($key),@x[4]
443 pshufb $mask,@x[3]
444 pxor 0x50($key),@x[5]
445 pshufb $mask,@x[4]
446 pxor 0x60($key),@x[6]
447 pshufb $mask,@x[5]
448 pxor 0x70($key),@x[7]
449 pshufb $mask,@x[6]
450 lea 0x80($key),$key
451 pshufb $mask,@x[7]
452___
453}
454
455sub MixColumns {
456# modified to emit output in order suitable for feeding back to aesenc[last]
457my @x=@_[0..7];
458my @t=@_[8..15];
459my $inv=@_[16]; # optional
460$code.=<<___;
461 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
462 pshufd \$0x93, @x[1], @t[1]
463 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
464 pshufd \$0x93, @x[2], @t[2]
465 pxor @t[1], @x[1]
466 pshufd \$0x93, @x[3], @t[3]
467 pxor @t[2], @x[2]
468 pshufd \$0x93, @x[4], @t[4]
469 pxor @t[3], @x[3]
470 pshufd \$0x93, @x[5], @t[5]
471 pxor @t[4], @x[4]
472 pshufd \$0x93, @x[6], @t[6]
473 pxor @t[5], @x[5]
474 pshufd \$0x93, @x[7], @t[7]
475 pxor @t[6], @x[6]
476 pxor @t[7], @x[7]
477
478 pxor @x[0], @t[1]
479 pxor @x[7], @t[0]
480 pxor @x[7], @t[1]
481 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
482 pxor @x[1], @t[2]
483 pshufd \$0x4E, @x[1], @x[1]
484 pxor @x[4], @t[5]
485 pxor @t[0], @x[0]
486 pxor @x[5], @t[6]
487 pxor @t[1], @x[1]
488 pxor @x[3], @t[4]
489 pshufd \$0x4E, @x[4], @t[0]
490 pxor @x[6], @t[7]
491 pshufd \$0x4E, @x[5], @t[1]
492 pxor @x[2], @t[3]
493 pshufd \$0x4E, @x[3], @x[4]
494 pxor @x[7], @t[3]
495 pshufd \$0x4E, @x[7], @x[5]
496 pxor @x[7], @t[4]
497 pshufd \$0x4E, @x[6], @x[3]
498 pxor @t[4], @t[0]
499 pshufd \$0x4E, @x[2], @x[6]
500 pxor @t[5], @t[1]
501___
502$code.=<<___ if (!$inv);
503 pxor @t[3], @x[4]
504 pxor @t[7], @x[5]
505 pxor @t[6], @x[3]
506 movdqa @t[0], @x[2]
507 pxor @t[2], @x[6]
508 movdqa @t[1], @x[7]
509___
510$code.=<<___ if ($inv);
511 pxor @x[4], @t[3]
512 pxor @t[7], @x[5]
513 pxor @x[3], @t[6]
514 movdqa @t[0], @x[3]
515 pxor @t[2], @x[6]
516 movdqa @t[6], @x[2]
517 movdqa @t[1], @x[7]
518 movdqa @x[6], @x[4]
519 movdqa @t[3], @x[6]
520___
521}
522
523sub InvMixColumns_orig {
524my @x=@_[0..7];
525my @t=@_[8..15];
526
527$code.=<<___;
528 # multiplication by 0x0e
529 pshufd \$0x93, @x[7], @t[7]
530 movdqa @x[2], @t[2]
531 pxor @x[5], @x[7] # 7 5
532 pxor @x[5], @x[2] # 2 5
533 pshufd \$0x93, @x[0], @t[0]
534 movdqa @x[5], @t[5]
535 pxor @x[0], @x[5] # 5 0 [1]
536 pxor @x[1], @x[0] # 0 1
537 pshufd \$0x93, @x[1], @t[1]
538 pxor @x[2], @x[1] # 1 25
539 pxor @x[6], @x[0] # 01 6 [2]
540 pxor @x[3], @x[1] # 125 3 [4]
541 pshufd \$0x93, @x[3], @t[3]
542 pxor @x[0], @x[2] # 25 016 [3]
543 pxor @x[7], @x[3] # 3 75
544 pxor @x[6], @x[7] # 75 6 [0]
545 pshufd \$0x93, @x[6], @t[6]
546 movdqa @x[4], @t[4]
547 pxor @x[4], @x[6] # 6 4
548 pxor @x[3], @x[4] # 4 375 [6]
549 pxor @x[7], @x[3] # 375 756=36
550 pxor @t[5], @x[6] # 64 5 [7]
551 pxor @t[2], @x[3] # 36 2
552 pxor @t[4], @x[3] # 362 4 [5]
553 pshufd \$0x93, @t[5], @t[5]
554___
555 my @y = @x[7,5,0,2,1,3,4,6];
556$code.=<<___;
557 # multiplication by 0x0b
558 pxor @y[0], @y[1]
559 pxor @t[0], @y[0]
560 pxor @t[1], @y[1]
561 pshufd \$0x93, @t[2], @t[2]
562 pxor @t[5], @y[0]
563 pxor @t[6], @y[1]
564 pxor @t[7], @y[0]
565 pshufd \$0x93, @t[4], @t[4]
566 pxor @t[6], @t[7] # clobber t[7]
567 pxor @y[0], @y[1]
568
569 pxor @t[0], @y[3]
570 pshufd \$0x93, @t[0], @t[0]
571 pxor @t[1], @y[2]
572 pxor @t[1], @y[4]
573 pxor @t[2], @y[2]
574 pshufd \$0x93, @t[1], @t[1]
575 pxor @t[2], @y[3]
576 pxor @t[2], @y[5]
577 pxor @t[7], @y[2]
578 pshufd \$0x93, @t[2], @t[2]
579 pxor @t[3], @y[3]
580 pxor @t[3], @y[6]
581 pxor @t[3], @y[4]
582 pshufd \$0x93, @t[3], @t[3]
583 pxor @t[4], @y[7]
584 pxor @t[4], @y[5]
585 pxor @t[7], @y[7]
586 pxor @t[5], @y[3]
587 pxor @t[4], @y[4]
588 pxor @t[5], @t[7] # clobber t[7] even more
589
590 pxor @t[7], @y[5]
591 pshufd \$0x93, @t[4], @t[4]
592 pxor @t[7], @y[6]
593 pxor @t[7], @y[4]
594
595 pxor @t[5], @t[7]
596 pshufd \$0x93, @t[5], @t[5]
597 pxor @t[6], @t[7] # restore t[7]
598
599 # multiplication by 0x0d
600 pxor @y[7], @y[4]
601 pxor @t[4], @y[7]
602 pshufd \$0x93, @t[6], @t[6]
603 pxor @t[0], @y[2]
604 pxor @t[5], @y[7]
605 pxor @t[2], @y[2]
606 pshufd \$0x93, @t[7], @t[7]
607
608 pxor @y[1], @y[3]
609 pxor @t[1], @y[1]
610 pxor @t[0], @y[0]
611 pxor @t[0], @y[3]
612 pxor @t[5], @y[1]
613 pxor @t[5], @y[0]
614 pxor @t[7], @y[1]
615 pshufd \$0x93, @t[0], @t[0]
616 pxor @t[6], @y[0]
617 pxor @y[1], @y[3]
618 pxor @t[1], @y[4]
619 pshufd \$0x93, @t[1], @t[1]
620
621 pxor @t[7], @y[7]
622 pxor @t[2], @y[4]
623 pxor @t[2], @y[5]
624 pshufd \$0x93, @t[2], @t[2]
625 pxor @t[6], @y[2]
626 pxor @t[3], @t[6] # clobber t[6]
627 pxor @y[7], @y[4]
628 pxor @t[6], @y[3]
629
630 pxor @t[6], @y[6]
631 pxor @t[5], @y[5]
632 pxor @t[4], @y[6]
633 pshufd \$0x93, @t[4], @t[4]
634 pxor @t[6], @y[5]
635 pxor @t[7], @y[6]
636 pxor @t[3], @t[6] # restore t[6]
637
638 pshufd \$0x93, @t[5], @t[5]
639 pshufd \$0x93, @t[6], @t[6]
640 pshufd \$0x93, @t[7], @t[7]
641 pshufd \$0x93, @t[3], @t[3]
642
643 # multiplication by 0x09
644 pxor @y[1], @y[4]
645 pxor @y[1], @t[1] # t[1]=y[1]
646 pxor @t[5], @t[0] # clobber t[0]
647 pxor @t[5], @t[1]
648 pxor @t[0], @y[3]
649 pxor @y[0], @t[0] # t[0]=y[0]
650 pxor @t[6], @t[1]
651 pxor @t[7], @t[6] # clobber t[6]
652 pxor @t[1], @y[4]
653 pxor @t[4], @y[7]
654 pxor @y[4], @t[4] # t[4]=y[4]
655 pxor @t[3], @y[6]
656 pxor @y[3], @t[3] # t[3]=y[3]
657 pxor @t[2], @y[5]
658 pxor @y[2], @t[2] # t[2]=y[2]
659 pxor @t[7], @t[3]
660 pxor @y[5], @t[5] # t[5]=y[5]
661 pxor @t[6], @t[2]
662 pxor @t[6], @t[5]
663 pxor @y[6], @t[6] # t[6]=y[6]
664 pxor @y[7], @t[7] # t[7]=y[7]
665
666 movdqa @t[0],@XMM[0]
667 movdqa @t[1],@XMM[1]
668 movdqa @t[2],@XMM[2]
669 movdqa @t[3],@XMM[3]
670 movdqa @t[4],@XMM[4]
671 movdqa @t[5],@XMM[5]
672 movdqa @t[6],@XMM[6]
673 movdqa @t[7],@XMM[7]
674___
675}
676
677sub InvMixColumns {
678my @x=@_[0..7];
679my @t=@_[8..15];
680
681# Thanks to Jussi Kivilinna for providing a pointer to
682#
683# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
684# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
685# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
686# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
687
688$code.=<<___;
689 # multiplication by 0x05-0x00-0x04-0x00
690 pshufd \$0x4E, @x[0], @t[0]
691 pshufd \$0x4E, @x[6], @t[6]
692 pxor @x[0], @t[0]
693 pshufd \$0x4E, @x[7], @t[7]
694 pxor @x[6], @t[6]
695 pshufd \$0x4E, @x[1], @t[1]
696 pxor @x[7], @t[7]
697 pshufd \$0x4E, @x[2], @t[2]
698 pxor @x[1], @t[1]
699 pshufd \$0x4E, @x[3], @t[3]
700 pxor @x[2], @t[2]
701 pxor @t[6], @x[0]
702 pxor @t[6], @x[1]
703 pshufd \$0x4E, @x[4], @t[4]
704 pxor @x[3], @t[3]
705 pxor @t[0], @x[2]
706 pxor @t[1], @x[3]
707 pshufd \$0x4E, @x[5], @t[5]
708 pxor @x[4], @t[4]
709 pxor @t[7], @x[1]
710 pxor @t[2], @x[4]
711 pxor @x[5], @t[5]
712
713 pxor @t[7], @x[2]
714 pxor @t[6], @x[3]
715 pxor @t[6], @x[4]
716 pxor @t[3], @x[5]
717 pxor @t[4], @x[6]
718 pxor @t[7], @x[4]
719 pxor @t[7], @x[5]
720 pxor @t[5], @x[7]
721___
722 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
723}
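The factorization credited to Jussi Kivilinna above is what lets InvMixColumns end with a call to MixColumns: the inverse 0e/0b/0d/09 circulant splits into the cheap 05/00/04/00 circulant (the pshufd \$0x4E xors above, i.e. 64-bit rotations of each bit plane) followed by the forward 02/03/01/01 matrix. A small C check of the identity over GF(2^8):

#include <assert.h>
#include <stdint.h>

/* GF(2^8) multiply, AES polynomial x^8+x^4+x^3+x+1. */
static uint8_t gmul(uint8_t a, uint8_t b)
{
	uint8_t p = 0;

	while (b) {
		if (b & 1)
			p ^= a;
		a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
		b >>= 1;
	}
	return p;
}

int main(void)
{
	/* first rows of the three circulant matrices in the comment */
	static const uint8_t inv[4] = { 0x0e, 0x0b, 0x0d, 0x09 };
	static const uint8_t fwd[4] = { 0x02, 0x03, 0x01, 0x01 };
	static const uint8_t pre[4] = { 0x05, 0x00, 0x04, 0x00 };
	int i, k;

	for (i = 0; i < 4; i++) {
		uint8_t acc = 0;

		for (k = 0; k < 4; k++)		/* circulant product, row 0 */
			acc ^= gmul(fwd[k], pre[(i - k) & 3]);
		assert(acc == inv[i]);
	}
	return 0;
}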
724
725sub aesenc { # not used
726my @b=@_[0..7];
727my @t=@_[8..15];
728$code.=<<___;
729 movdqa 0x30($const),@t[0] # .LSR
730___
731 &ShiftRows (@b,@t[0]);
732 &Sbox (@b,@t);
733 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
734}
735
736sub aesenclast { # not used
737my @b=@_[0..7];
738my @t=@_[8..15];
739$code.=<<___;
740 movdqa 0x40($const),@t[0] # .LSRM0
741___
742 &ShiftRows (@b,@t[0]);
743 &Sbox (@b,@t);
744$code.=<<___
745 pxor 0x00($key),@b[0]
746 pxor 0x10($key),@b[1]
747 pxor 0x20($key),@b[4]
748 pxor 0x30($key),@b[6]
749 pxor 0x40($key),@b[3]
750 pxor 0x50($key),@b[7]
751 pxor 0x60($key),@b[2]
752 pxor 0x70($key),@b[5]
753___
754}
755
756sub swapmove {
757my ($a,$b,$n,$mask,$t)=@_;
758$code.=<<___;
759 movdqa $b,$t
760 psrlq \$$n,$b
761 pxor $a,$b
762 pand $mask,$b
763 pxor $b,$a
764 psllq \$$n,$b
765 pxor $t,$b
766___
767}
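swapmove() is the classic bit-interchange kernel used to transpose data into bitsliced form: it swaps the mask-selected bits of $a with the bits of $b that sit $n positions higher. A 64-bit C rendering of the seven-instruction sequence:

#include <stdint.h>

/* SWAPMOVE: exchange the bits of *a selected by mask with the bits of
 * *b located n positions above them -- the same dataflow as the
 * movdqa/psrlq/pxor/pand/pxor/psllq/pxor sequence above. */
static void swapmove(uint64_t *a, uint64_t *b, int n, uint64_t mask)
{
	uint64_t t = ((*b >> n) ^ *a) & mask;

	*a ^= t;
	*b ^= t << n;
}

Three layers of these swaps with shift counts 1, 2 and 4 (the .LBS0/.LBS1/.LBS2 masks used by bitslice() below) amount to an 8x8 bit-matrix transpose across the eight state registers.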
768sub swapmove2x {
769my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
770$code.=<<___;
771 movdqa $b0,$t0
772 psrlq \$$n,$b0
773 movdqa $b1,$t1
774 psrlq \$$n,$b1
775 pxor $a0,$b0
776 pxor $a1,$b1
777 pand $mask,$b0
778 pand $mask,$b1
779 pxor $b0,$a0
780 psllq \$$n,$b0
781 pxor $b1,$a1
782 psllq \$$n,$b1
783 pxor $t0,$b0
784 pxor $t1,$b1
785___
786}
787
788sub bitslice {
789my @x=reverse(@_[0..7]);
790my ($t0,$t1,$t2,$t3)=@_[8..11];
791$code.=<<___;
792 movdqa 0x00($const),$t0 # .LBS0
793 movdqa 0x10($const),$t1 # .LBS1
794___
795 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
796 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
797$code.=<<___;
798 movdqa 0x20($const),$t0 # .LBS2
799___
800 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
801 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
802
803 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
804 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
805}
806
807$code.=<<___;
808.text
809
810.extern asm_AES_encrypt
811.extern asm_AES_decrypt
812
813.type _bsaes_encrypt8,\@abi-omnipotent
814.align 64
815_bsaes_encrypt8:
816 lea .LBS0(%rip), $const # constants table
817
818 movdqa ($key), @XMM[9] # round 0 key
819 lea 0x10($key), $key
820 movdqa 0x50($const), @XMM[8] # .LM0SR
821 pxor @XMM[9], @XMM[0] # xor with round0 key
822 pxor @XMM[9], @XMM[1]
823 pshufb @XMM[8], @XMM[0]
824 pxor @XMM[9], @XMM[2]
825 pshufb @XMM[8], @XMM[1]
826 pxor @XMM[9], @XMM[3]
827 pshufb @XMM[8], @XMM[2]
828 pxor @XMM[9], @XMM[4]
829 pshufb @XMM[8], @XMM[3]
830 pxor @XMM[9], @XMM[5]
831 pshufb @XMM[8], @XMM[4]
832 pxor @XMM[9], @XMM[6]
833 pshufb @XMM[8], @XMM[5]
834 pxor @XMM[9], @XMM[7]
835 pshufb @XMM[8], @XMM[6]
836 pshufb @XMM[8], @XMM[7]
837_bsaes_encrypt8_bitslice:
838___
839 &bitslice (@XMM[0..7, 8..11]);
840$code.=<<___;
841 dec $rounds
842 jmp .Lenc_sbox
843.align 16
844.Lenc_loop:
845___
846 &ShiftRows (@XMM[0..7, 8]);
847$code.=".Lenc_sbox:\n";
848 &Sbox (@XMM[0..7, 8..15]);
849$code.=<<___;
850 dec $rounds
851 jl .Lenc_done
852___
853 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
854$code.=<<___;
855 movdqa 0x30($const), @XMM[8] # .LSR
856 jnz .Lenc_loop
857 movdqa 0x40($const), @XMM[8] # .LSRM0
858 jmp .Lenc_loop
859.align 16
860.Lenc_done:
861___
862 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
863 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
864$code.=<<___;
865 movdqa ($key), @XMM[8] # last round key
866 pxor @XMM[8], @XMM[4]
867 pxor @XMM[8], @XMM[6]
868 pxor @XMM[8], @XMM[3]
869 pxor @XMM[8], @XMM[7]
870 pxor @XMM[8], @XMM[2]
871 pxor @XMM[8], @XMM[5]
872 pxor @XMM[8], @XMM[0]
873 pxor @XMM[8], @XMM[1]
874 ret
875.size _bsaes_encrypt8,.-_bsaes_encrypt8
876
877.type _bsaes_decrypt8,\@abi-omnipotent
878.align 64
879_bsaes_decrypt8:
880 lea .LBS0(%rip), $const # constants table
881
882 movdqa ($key), @XMM[9] # round 0 key
883 lea 0x10($key), $key
884 movdqa -0x30($const), @XMM[8] # .LM0ISR
885 pxor @XMM[9], @XMM[0] # xor with round0 key
886 pxor @XMM[9], @XMM[1]
887 pshufb @XMM[8], @XMM[0]
888 pxor @XMM[9], @XMM[2]
889 pshufb @XMM[8], @XMM[1]
890 pxor @XMM[9], @XMM[3]
891 pshufb @XMM[8], @XMM[2]
892 pxor @XMM[9], @XMM[4]
893 pshufb @XMM[8], @XMM[3]
894 pxor @XMM[9], @XMM[5]
895 pshufb @XMM[8], @XMM[4]
896 pxor @XMM[9], @XMM[6]
897 pshufb @XMM[8], @XMM[5]
898 pxor @XMM[9], @XMM[7]
899 pshufb @XMM[8], @XMM[6]
900 pshufb @XMM[8], @XMM[7]
901___
902 &bitslice (@XMM[0..7, 8..11]);
903$code.=<<___;
904 dec $rounds
905 jmp .Ldec_sbox
906.align 16
907.Ldec_loop:
908___
909 &ShiftRows (@XMM[0..7, 8]);
910$code.=".Ldec_sbox:\n";
911 &InvSbox (@XMM[0..7, 8..15]);
912$code.=<<___;
913 dec $rounds
914 jl .Ldec_done
915___
916 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
917$code.=<<___;
918 movdqa -0x10($const), @XMM[8] # .LISR
919 jnz .Ldec_loop
920 movdqa -0x20($const), @XMM[8] # .LISRM0
921 jmp .Ldec_loop
922.align 16
923.Ldec_done:
924___
925 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
926$code.=<<___;
927 movdqa ($key), @XMM[8] # last round key
928 pxor @XMM[8], @XMM[6]
929 pxor @XMM[8], @XMM[4]
930 pxor @XMM[8], @XMM[2]
931 pxor @XMM[8], @XMM[7]
932 pxor @XMM[8], @XMM[3]
933 pxor @XMM[8], @XMM[5]
934 pxor @XMM[8], @XMM[0]
935 pxor @XMM[8], @XMM[1]
936 ret
937.size _bsaes_decrypt8,.-_bsaes_decrypt8
938___
939}
940{
941my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
942
943sub bitslice_key {
944my @x=reverse(@_[0..7]);
945my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
946
947 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
948$code.=<<___;
949 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
950 movdqa @x[0], @x[2]
951 movdqa @x[1], @x[3]
952___
953 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
954
955 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
956$code.=<<___;
957 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
958 movdqa @x[0], @x[4]
959 movdqa @x[2], @x[6]
960 movdqa @x[1], @x[5]
961 movdqa @x[3], @x[7]
962___
963 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
964 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
965}
966
967$code.=<<___;
968.type _bsaes_key_convert,\@abi-omnipotent
969.align 16
970_bsaes_key_convert:
971 lea .Lmasks(%rip), $const
972 movdqu ($inp), %xmm7 # load round 0 key
973 lea 0x10($inp), $inp
974 movdqa 0x00($const), %xmm0 # 0x01...
975 movdqa 0x10($const), %xmm1 # 0x02...
976 movdqa 0x20($const), %xmm2 # 0x04...
977 movdqa 0x30($const), %xmm3 # 0x08...
978 movdqa 0x40($const), %xmm4 # .LM0
979 pcmpeqd %xmm5, %xmm5 # .LNOT
980
981 movdqu ($inp), %xmm6 # load round 1 key
982 movdqa %xmm7, ($out) # save round 0 key
983 lea 0x10($out), $out
984 dec $rounds
985 jmp .Lkey_loop
986.align 16
987.Lkey_loop:
988 pshufb %xmm4, %xmm6 # .LM0
989
990 movdqa %xmm0, %xmm8
991 movdqa %xmm1, %xmm9
992
993 pand %xmm6, %xmm8
994 pand %xmm6, %xmm9
995 movdqa %xmm2, %xmm10
996 pcmpeqb %xmm0, %xmm8
997 psllq \$4, %xmm0 # 0x10...
998 movdqa %xmm3, %xmm11
999 pcmpeqb %xmm1, %xmm9
1000 psllq \$4, %xmm1 # 0x20...
1001
1002 pand %xmm6, %xmm10
1003 pand %xmm6, %xmm11
1004 movdqa %xmm0, %xmm12
1005 pcmpeqb %xmm2, %xmm10
1006 psllq \$4, %xmm2 # 0x40...
1007 movdqa %xmm1, %xmm13
1008 pcmpeqb %xmm3, %xmm11
1009 psllq \$4, %xmm3 # 0x80...
1010
1011 movdqa %xmm2, %xmm14
1012 movdqa %xmm3, %xmm15
1013 pxor %xmm5, %xmm8 # "pnot"
1014 pxor %xmm5, %xmm9
1015
1016 pand %xmm6, %xmm12
1017 pand %xmm6, %xmm13
1018 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1019 pcmpeqb %xmm0, %xmm12
1020 psrlq \$4, %xmm0 # 0x01...
1021 movdqa %xmm9, 0x10($out)
1022 pcmpeqb %xmm1, %xmm13
1023 psrlq \$4, %xmm1 # 0x02...
1024 lea 0x10($inp), $inp
1025
1026 pand %xmm6, %xmm14
1027 pand %xmm6, %xmm15
1028 movdqa %xmm10, 0x20($out)
1029 pcmpeqb %xmm2, %xmm14
1030 psrlq \$4, %xmm2 # 0x04...
1031 movdqa %xmm11, 0x30($out)
1032 pcmpeqb %xmm3, %xmm15
1033 psrlq \$4, %xmm3 # 0x08...
1034 movdqu ($inp), %xmm6 # load next round key
1035
1036 pxor %xmm5, %xmm13 # "pnot"
1037 pxor %xmm5, %xmm14
1038 movdqa %xmm12, 0x40($out)
1039 movdqa %xmm13, 0x50($out)
1040 movdqa %xmm14, 0x60($out)
1041 movdqa %xmm15, 0x70($out)
1042 lea 0x80($out),$out
1043 dec $rounds
1044 jnz .Lkey_loop
1045
1046 movdqa 0x50($const), %xmm7 # .L63
1047 #movdqa %xmm6, ($out) # don't save last round key
1048 ret
1049.size _bsaes_key_convert,.-_bsaes_key_convert
1050___
1051}
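# Hedged sketch of the expansion performed by .Lkey_loop above: every
# round-key byte is tested against the 0x01/0x02/.../0x80 planes from
# .Lmasks, yielding eight 0x00/0xff mask bytes, one per bit plane, so the
# bit-sliced rounds can apply the key with pand/pxor instead of lookups.
# (The real code additionally shuffles with .LM0 and complements some
# planes via the "pnot" xors, matching the S-box input convention.)
sub expand_key_byte {
	my ($byte) = @_;
	return map { ($byte >> $_) & 1 ? 0xff : 0x00 } 0 .. 7;
}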
1052
1053if (0 && !$win64) {	# the following four functions are an unsupported interface

1054 # used for benchmarking...
1055$code.=<<___;
1056.globl bsaes_enc_key_convert
1057.type bsaes_enc_key_convert,\@function,2
1058.align 16
1059bsaes_enc_key_convert:
1060 mov 240($inp),%r10d # pass rounds
1061 mov $inp,%rcx # pass key
1062 mov $out,%rax # pass key schedule
1063 call _bsaes_key_convert
1064 pxor %xmm6,%xmm7 # fix up last round key
1065 movdqa %xmm7,(%rax) # save last round key
1066 ret
1067.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1068
1069.globl bsaes_encrypt_128
1070.type bsaes_encrypt_128,\@function,4
1071.align 16
1072bsaes_encrypt_128:
1073.Lenc128_loop:
1074 movdqu 0x00($inp), @XMM[0] # load input
1075 movdqu 0x10($inp), @XMM[1]
1076 movdqu 0x20($inp), @XMM[2]
1077 movdqu 0x30($inp), @XMM[3]
1078 movdqu 0x40($inp), @XMM[4]
1079 movdqu 0x50($inp), @XMM[5]
1080 movdqu 0x60($inp), @XMM[6]
1081 movdqu 0x70($inp), @XMM[7]
1082 mov $key, %rax # pass the $key
1083 lea 0x80($inp), $inp
1084 mov \$10,%r10d
1085
1086 call _bsaes_encrypt8
1087
1088 movdqu @XMM[0], 0x00($out) # write output
1089 movdqu @XMM[1], 0x10($out)
1090 movdqu @XMM[4], 0x20($out)
1091 movdqu @XMM[6], 0x30($out)
1092 movdqu @XMM[3], 0x40($out)
1093 movdqu @XMM[7], 0x50($out)
1094 movdqu @XMM[2], 0x60($out)
1095 movdqu @XMM[5], 0x70($out)
1096 lea 0x80($out), $out
1097 sub \$0x80,$len
1098 ja .Lenc128_loop
1099 ret
1100.size bsaes_encrypt_128,.-bsaes_encrypt_128
1101
1102.globl bsaes_dec_key_convert
1103.type bsaes_dec_key_convert,\@function,2
1104.align 16
1105bsaes_dec_key_convert:
1106 mov 240($inp),%r10d # pass rounds
1107 mov $inp,%rcx # pass key
1108 mov $out,%rax # pass key schedule
1109 call _bsaes_key_convert
1110 pxor ($out),%xmm7 # fix up round 0 key
1111 movdqa %xmm6,(%rax) # save last round key
1112 movdqa %xmm7,($out)
1113 ret
1114.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1115
1116.globl bsaes_decrypt_128
1117.type bsaes_decrypt_128,\@function,4
1118.align 16
1119bsaes_decrypt_128:
1120.Ldec128_loop:
1121 movdqu 0x00($inp), @XMM[0] # load input
1122 movdqu 0x10($inp), @XMM[1]
1123 movdqu 0x20($inp), @XMM[2]
1124 movdqu 0x30($inp), @XMM[3]
1125 movdqu 0x40($inp), @XMM[4]
1126 movdqu 0x50($inp), @XMM[5]
1127 movdqu 0x60($inp), @XMM[6]
1128 movdqu 0x70($inp), @XMM[7]
1129 mov $key, %rax # pass the $key
1130 lea 0x80($inp), $inp
1131 mov \$10,%r10d
1132
1133 call _bsaes_decrypt8
1134
1135 movdqu @XMM[0], 0x00($out) # write output
1136 movdqu @XMM[1], 0x10($out)
1137 movdqu @XMM[6], 0x20($out)
1138 movdqu @XMM[4], 0x30($out)
1139 movdqu @XMM[2], 0x40($out)
1140 movdqu @XMM[7], 0x50($out)
1141 movdqu @XMM[3], 0x60($out)
1142 movdqu @XMM[5], 0x70($out)
1143 lea 0x80($out), $out
1144 sub \$0x80,$len
1145 ja .Ldec128_loop
1146 ret
1147.size bsaes_decrypt_128,.-bsaes_decrypt_128
1148___
1149}
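# A note on lane order, read off the stores above: _bsaes_encrypt8 and
# _bsaes_decrypt8 return the eight blocks permuted by the bit-slice
# transpose, which is why output block $i is written from @XMM[$order[$i]]
# rather than @XMM[$i]. The two maps, as used by the wrappers above:
my @enc_lane_order = (0, 1, 4, 6, 3, 7, 2, 5);	# output block -> @XMM index
my @dec_lane_order = (0, 1, 6, 4, 2, 7, 3, 5);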
1150{
1151######################################################################
1152#
1153# OpenSSL interface
1154#
1155my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1156 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1157my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1158
1159if ($ecb) {
1160$code.=<<___;
1161.globl bsaes_ecb_encrypt_blocks
1162.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1163.align 16
1164bsaes_ecb_encrypt_blocks:
1165 mov %rsp, %rax
1166.Lecb_enc_prologue:
1167 push %rbp
1168 push %rbx
1169 push %r12
1170 push %r13
1171 push %r14
1172 push %r15
1173 lea -0x48(%rsp),%rsp
1174___
1175$code.=<<___ if ($win64);
1176 lea -0xa0(%rsp), %rsp
1177 movaps %xmm6, 0x40(%rsp)
1178 movaps %xmm7, 0x50(%rsp)
1179 movaps %xmm8, 0x60(%rsp)
1180 movaps %xmm9, 0x70(%rsp)
1181 movaps %xmm10, 0x80(%rsp)
1182 movaps %xmm11, 0x90(%rsp)
1183 movaps %xmm12, 0xa0(%rsp)
1184 movaps %xmm13, 0xb0(%rsp)
1185 movaps %xmm14, 0xc0(%rsp)
1186 movaps %xmm15, 0xd0(%rsp)
1187.Lecb_enc_body:
1188___
1189$code.=<<___;
1190 mov %rsp,%rbp # backup %rsp
1191 mov 240($arg4),%eax # rounds
1192 mov $arg1,$inp # backup arguments
1193 mov $arg2,$out
1194 mov $arg3,$len
1195 mov $arg4,$key
1196 cmp \$8,$arg3
1197 jb .Lecb_enc_short
1198
1199 mov %eax,%ebx # backup rounds
1200 shl \$7,%rax # 128 bytes per inner round key
1201 sub \$`128-32`,%rax # size of bit-sliced key schedule
1202 sub %rax,%rsp
1203 mov %rsp,%rax # pass key schedule
1204 mov $key,%rcx # pass key
1205 mov %ebx,%r10d # pass rounds
1206 call _bsaes_key_convert
1207 pxor %xmm6,%xmm7 # fix up last round key
1208 movdqa %xmm7,(%rax) # save last round key
1209
1210 sub \$8,$len
1211.Lecb_enc_loop:
1212 movdqu 0x00($inp), @XMM[0] # load input
1213 movdqu 0x10($inp), @XMM[1]
1214 movdqu 0x20($inp), @XMM[2]
1215 movdqu 0x30($inp), @XMM[3]
1216 movdqu 0x40($inp), @XMM[4]
1217 movdqu 0x50($inp), @XMM[5]
1218 mov %rsp, %rax # pass key schedule
1219 movdqu 0x60($inp), @XMM[6]
1220 mov %ebx,%r10d # pass rounds
1221 movdqu 0x70($inp), @XMM[7]
1222 lea 0x80($inp), $inp
1223
1224 call _bsaes_encrypt8
1225
1226 movdqu @XMM[0], 0x00($out) # write output
1227 movdqu @XMM[1], 0x10($out)
1228 movdqu @XMM[4], 0x20($out)
1229 movdqu @XMM[6], 0x30($out)
1230 movdqu @XMM[3], 0x40($out)
1231 movdqu @XMM[7], 0x50($out)
1232 movdqu @XMM[2], 0x60($out)
1233 movdqu @XMM[5], 0x70($out)
1234 lea 0x80($out), $out
1235 sub \$8,$len
1236 jnc .Lecb_enc_loop
1237
1238 add \$8,$len
1239 jz .Lecb_enc_done
1240
1241 movdqu 0x00($inp), @XMM[0] # load input
1242 mov %rsp, %rax # pass key schedule
1243 mov %ebx,%r10d # pass rounds
1244 cmp \$2,$len
1245 jb .Lecb_enc_one
1246 movdqu 0x10($inp), @XMM[1]
1247 je .Lecb_enc_two
1248 movdqu 0x20($inp), @XMM[2]
1249 cmp \$4,$len
1250 jb .Lecb_enc_three
1251 movdqu 0x30($inp), @XMM[3]
1252 je .Lecb_enc_four
1253 movdqu 0x40($inp), @XMM[4]
1254 cmp \$6,$len
1255 jb .Lecb_enc_five
1256 movdqu 0x50($inp), @XMM[5]
1257 je .Lecb_enc_six
1258 movdqu 0x60($inp), @XMM[6]
1259 call _bsaes_encrypt8
1260 movdqu @XMM[0], 0x00($out) # write output
1261 movdqu @XMM[1], 0x10($out)
1262 movdqu @XMM[4], 0x20($out)
1263 movdqu @XMM[6], 0x30($out)
1264 movdqu @XMM[3], 0x40($out)
1265 movdqu @XMM[7], 0x50($out)
1266 movdqu @XMM[2], 0x60($out)
1267 jmp .Lecb_enc_done
1268.align 16
1269.Lecb_enc_six:
1270 call _bsaes_encrypt8
1271 movdqu @XMM[0], 0x00($out) # write output
1272 movdqu @XMM[1], 0x10($out)
1273 movdqu @XMM[4], 0x20($out)
1274 movdqu @XMM[6], 0x30($out)
1275 movdqu @XMM[3], 0x40($out)
1276 movdqu @XMM[7], 0x50($out)
1277 jmp .Lecb_enc_done
1278.align 16
1279.Lecb_enc_five:
1280 call _bsaes_encrypt8
1281 movdqu @XMM[0], 0x00($out) # write output
1282 movdqu @XMM[1], 0x10($out)
1283 movdqu @XMM[4], 0x20($out)
1284 movdqu @XMM[6], 0x30($out)
1285 movdqu @XMM[3], 0x40($out)
1286 jmp .Lecb_enc_done
1287.align 16
1288.Lecb_enc_four:
1289 call _bsaes_encrypt8
1290 movdqu @XMM[0], 0x00($out) # write output
1291 movdqu @XMM[1], 0x10($out)
1292 movdqu @XMM[4], 0x20($out)
1293 movdqu @XMM[6], 0x30($out)
1294 jmp .Lecb_enc_done
1295.align 16
1296.Lecb_enc_three:
1297 call _bsaes_encrypt8
1298 movdqu @XMM[0], 0x00($out) # write output
1299 movdqu @XMM[1], 0x10($out)
1300 movdqu @XMM[4], 0x20($out)
1301 jmp .Lecb_enc_done
1302.align 16
1303.Lecb_enc_two:
1304 call _bsaes_encrypt8
1305 movdqu @XMM[0], 0x00($out) # write output
1306 movdqu @XMM[1], 0x10($out)
1307 jmp .Lecb_enc_done
1308.align 16
1309.Lecb_enc_one:
1310 call _bsaes_encrypt8
1311 movdqu @XMM[0], 0x00($out) # write output
1312 jmp .Lecb_enc_done
1313.align 16
1314.Lecb_enc_short:
1315 lea ($inp), $arg1
1316 lea ($out), $arg2
1317 lea ($key), $arg3
1318 call asm_AES_encrypt
1319 lea 16($inp), $inp
1320 lea 16($out), $out
1321 dec $len
1322 jnz .Lecb_enc_short
1323
1324.Lecb_enc_done:
1325 lea (%rsp),%rax
1326 pxor %xmm0, %xmm0
1327.Lecb_enc_bzero: # wipe key schedule [if any]
1328 movdqa %xmm0, 0x00(%rax)
1329 movdqa %xmm0, 0x10(%rax)
1330 lea 0x20(%rax), %rax
1331 cmp %rax, %rbp
1332 jb .Lecb_enc_bzero
1333
1334 lea (%rbp),%rsp # restore %rsp
1335___
1336$code.=<<___ if ($win64);
1337 movaps 0x40(%rbp), %xmm6
1338 movaps 0x50(%rbp), %xmm7
1339 movaps 0x60(%rbp), %xmm8
1340 movaps 0x70(%rbp), %xmm9
1341 movaps 0x80(%rbp), %xmm10
1342 movaps 0x90(%rbp), %xmm11
1343 movaps 0xa0(%rbp), %xmm12
1344 movaps 0xb0(%rbp), %xmm13
1345 movaps 0xc0(%rbp), %xmm14
1346 movaps 0xd0(%rbp), %xmm15
1347 lea 0xa0(%rbp), %rsp
1348___
1349$code.=<<___;
1350 mov 0x48(%rsp), %r15
1351 mov 0x50(%rsp), %r14
1352 mov 0x58(%rsp), %r13
1353 mov 0x60(%rsp), %r12
1354 mov 0x68(%rsp), %rbx
1355 mov 0x70(%rsp), %rax
1356 lea 0x78(%rsp), %rsp
1357 mov %rax, %rbp
1358.Lecb_enc_epilogue:
1359 ret
1360.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1361
1362.globl bsaes_ecb_decrypt_blocks
1363.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1364.align 16
1365bsaes_ecb_decrypt_blocks:
1366 mov %rsp, %rax
1367.Lecb_dec_prologue:
1368 push %rbp
1369 push %rbx
1370 push %r12
1371 push %r13
1372 push %r14
1373 push %r15
1374 lea -0x48(%rsp),%rsp
1375___
1376$code.=<<___ if ($win64);
1377 lea -0xa0(%rsp), %rsp
1378 movaps %xmm6, 0x40(%rsp)
1379 movaps %xmm7, 0x50(%rsp)
1380 movaps %xmm8, 0x60(%rsp)
1381 movaps %xmm9, 0x70(%rsp)
1382 movaps %xmm10, 0x80(%rsp)
1383 movaps %xmm11, 0x90(%rsp)
1384 movaps %xmm12, 0xa0(%rsp)
1385 movaps %xmm13, 0xb0(%rsp)
1386 movaps %xmm14, 0xc0(%rsp)
1387 movaps %xmm15, 0xd0(%rsp)
1388.Lecb_dec_body:
1389___
1390$code.=<<___;
1391 mov %rsp,%rbp # backup %rsp
1392 mov 240($arg4),%eax # rounds
1393 mov $arg1,$inp # backup arguments
1394 mov $arg2,$out
1395 mov $arg3,$len
1396 mov $arg4,$key
1397 cmp \$8,$arg3
1398 jb .Lecb_dec_short
1399
1400 mov %eax,%ebx # backup rounds
1401 shl \$7,%rax # 128 bytes per inner round key
1402 sub \$`128-32`,%rax # size of bit-sliced key schedule
1403 sub %rax,%rsp
1404 mov %rsp,%rax # pass key schedule
1405 mov $key,%rcx # pass key
1406 mov %ebx,%r10d # pass rounds
1407 call _bsaes_key_convert
1408	pxor	(%rsp),%xmm7		# fix up round 0 key
1409 movdqa %xmm6,(%rax) # save last round key
1410 movdqa %xmm7,(%rsp)
1411
1412 sub \$8,$len
1413.Lecb_dec_loop:
1414 movdqu 0x00($inp), @XMM[0] # load input
1415 movdqu 0x10($inp), @XMM[1]
1416 movdqu 0x20($inp), @XMM[2]
1417 movdqu 0x30($inp), @XMM[3]
1418 movdqu 0x40($inp), @XMM[4]
1419 movdqu 0x50($inp), @XMM[5]
1420 mov %rsp, %rax # pass key schedule
1421 movdqu 0x60($inp), @XMM[6]
1422 mov %ebx,%r10d # pass rounds
1423 movdqu 0x70($inp), @XMM[7]
1424 lea 0x80($inp), $inp
1425
1426 call _bsaes_decrypt8
1427
1428 movdqu @XMM[0], 0x00($out) # write output
1429 movdqu @XMM[1], 0x10($out)
1430 movdqu @XMM[6], 0x20($out)
1431 movdqu @XMM[4], 0x30($out)
1432 movdqu @XMM[2], 0x40($out)
1433 movdqu @XMM[7], 0x50($out)
1434 movdqu @XMM[3], 0x60($out)
1435 movdqu @XMM[5], 0x70($out)
1436 lea 0x80($out), $out
1437 sub \$8,$len
1438 jnc .Lecb_dec_loop
1439
1440 add \$8,$len
1441 jz .Lecb_dec_done
1442
1443 movdqu 0x00($inp), @XMM[0] # load input
1444 mov %rsp, %rax # pass key schedule
1445 mov %ebx,%r10d # pass rounds
1446 cmp \$2,$len
1447 jb .Lecb_dec_one
1448 movdqu 0x10($inp), @XMM[1]
1449 je .Lecb_dec_two
1450 movdqu 0x20($inp), @XMM[2]
1451 cmp \$4,$len
1452 jb .Lecb_dec_three
1453 movdqu 0x30($inp), @XMM[3]
1454 je .Lecb_dec_four
1455 movdqu 0x40($inp), @XMM[4]
1456 cmp \$6,$len
1457 jb .Lecb_dec_five
1458 movdqu 0x50($inp), @XMM[5]
1459 je .Lecb_dec_six
1460 movdqu 0x60($inp), @XMM[6]
1461 call _bsaes_decrypt8
1462 movdqu @XMM[0], 0x00($out) # write output
1463 movdqu @XMM[1], 0x10($out)
1464 movdqu @XMM[6], 0x20($out)
1465 movdqu @XMM[4], 0x30($out)
1466 movdqu @XMM[2], 0x40($out)
1467 movdqu @XMM[7], 0x50($out)
1468 movdqu @XMM[3], 0x60($out)
1469 jmp .Lecb_dec_done
1470.align 16
1471.Lecb_dec_six:
1472 call _bsaes_decrypt8
1473 movdqu @XMM[0], 0x00($out) # write output
1474 movdqu @XMM[1], 0x10($out)
1475 movdqu @XMM[6], 0x20($out)
1476 movdqu @XMM[4], 0x30($out)
1477 movdqu @XMM[2], 0x40($out)
1478 movdqu @XMM[7], 0x50($out)
1479 jmp .Lecb_dec_done
1480.align 16
1481.Lecb_dec_five:
1482 call _bsaes_decrypt8
1483 movdqu @XMM[0], 0x00($out) # write output
1484 movdqu @XMM[1], 0x10($out)
1485 movdqu @XMM[6], 0x20($out)
1486 movdqu @XMM[4], 0x30($out)
1487 movdqu @XMM[2], 0x40($out)
1488 jmp .Lecb_dec_done
1489.align 16
1490.Lecb_dec_four:
1491 call _bsaes_decrypt8
1492 movdqu @XMM[0], 0x00($out) # write output
1493 movdqu @XMM[1], 0x10($out)
1494 movdqu @XMM[6], 0x20($out)
1495 movdqu @XMM[4], 0x30($out)
1496 jmp .Lecb_dec_done
1497.align 16
1498.Lecb_dec_three:
1499 call _bsaes_decrypt8
1500 movdqu @XMM[0], 0x00($out) # write output
1501 movdqu @XMM[1], 0x10($out)
1502 movdqu @XMM[6], 0x20($out)
1503 jmp .Lecb_dec_done
1504.align 16
1505.Lecb_dec_two:
1506 call _bsaes_decrypt8
1507 movdqu @XMM[0], 0x00($out) # write output
1508 movdqu @XMM[1], 0x10($out)
1509 jmp .Lecb_dec_done
1510.align 16
1511.Lecb_dec_one:
1512 call _bsaes_decrypt8
1513 movdqu @XMM[0], 0x00($out) # write output
1514 jmp .Lecb_dec_done
1515.align 16
1516.Lecb_dec_short:
1517 lea ($inp), $arg1
1518 lea ($out), $arg2
1519 lea ($key), $arg3
1520 call asm_AES_decrypt
1521 lea 16($inp), $inp
1522 lea 16($out), $out
1523 dec $len
1524 jnz .Lecb_dec_short
1525
1526.Lecb_dec_done:
1527 lea (%rsp),%rax
1528 pxor %xmm0, %xmm0
1529.Lecb_dec_bzero: # wipe key schedule [if any]
1530 movdqa %xmm0, 0x00(%rax)
1531 movdqa %xmm0, 0x10(%rax)
1532 lea 0x20(%rax), %rax
1533 cmp %rax, %rbp
1534 jb .Lecb_dec_bzero
1535
1536 lea (%rbp),%rsp # restore %rsp
1537___
1538$code.=<<___ if ($win64);
1539 movaps 0x40(%rbp), %xmm6
1540 movaps 0x50(%rbp), %xmm7
1541 movaps 0x60(%rbp), %xmm8
1542 movaps 0x70(%rbp), %xmm9
1543 movaps 0x80(%rbp), %xmm10
1544 movaps 0x90(%rbp), %xmm11
1545 movaps 0xa0(%rbp), %xmm12
1546 movaps 0xb0(%rbp), %xmm13
1547 movaps 0xc0(%rbp), %xmm14
1548 movaps 0xd0(%rbp), %xmm15
1549 lea 0xa0(%rbp), %rsp
1550___
1551$code.=<<___;
1552 mov 0x48(%rsp), %r15
1553 mov 0x50(%rsp), %r14
1554 mov 0x58(%rsp), %r13
1555 mov 0x60(%rsp), %r12
1556 mov 0x68(%rsp), %rbx
1557 mov 0x70(%rsp), %rax
1558 lea 0x78(%rsp), %rsp
1559 mov %rax, %rbp
1560.Lecb_dec_epilogue:
1561 ret
1562.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1563___
1564}
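# Before the CBC path below, a hedged scalar model (with a $decrypt
# callback standing in for the AES core; illustrative only) of the
# chaining that bsaes_cbc_encrypt implements: P[i] = Decrypt(C[i]) xor
# C[i-1], with C[-1] = IV, and the last ciphertext block carried out as
# the next IV.
sub cbc_decrypt_model {
	my ($decrypt, $iv, @ct) = @_;
	my @pt;
	for my $c (@ct) {
		push @pt, $decrypt->($c) ^ $iv;	# bitwise xor of 16-byte strings
		$iv = $c;
	}
	return ($iv, @pt);
}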
1565$code.=<<___;
1566.extern asm_AES_cbc_encrypt
1567.globl bsaes_cbc_encrypt
1568.type bsaes_cbc_encrypt,\@abi-omnipotent
1569.align 16
1570bsaes_cbc_encrypt:
1571___
1572$code.=<<___ if ($win64);
1573 mov 48(%rsp),$arg6 # pull direction flag
1574___
1575$code.=<<___;
1576 cmp \$0,$arg6
1577 jne asm_AES_cbc_encrypt
1578 cmp \$128,$arg3
1579 jb asm_AES_cbc_encrypt
1580
1581 mov %rsp, %rax
1582.Lcbc_dec_prologue:
1583 push %rbp
1584 push %rbx
1585 push %r12
1586 push %r13
1587 push %r14
1588 push %r15
1589 lea -0x48(%rsp), %rsp
1590___
1591$code.=<<___ if ($win64);
1592 mov 0xa0(%rsp),$arg5 # pull ivp
1593 lea -0xa0(%rsp), %rsp
1594 movaps %xmm6, 0x40(%rsp)
1595 movaps %xmm7, 0x50(%rsp)
1596 movaps %xmm8, 0x60(%rsp)
1597 movaps %xmm9, 0x70(%rsp)
1598 movaps %xmm10, 0x80(%rsp)
1599 movaps %xmm11, 0x90(%rsp)
1600 movaps %xmm12, 0xa0(%rsp)
1601 movaps %xmm13, 0xb0(%rsp)
1602 movaps %xmm14, 0xc0(%rsp)
1603 movaps %xmm15, 0xd0(%rsp)
1604.Lcbc_dec_body:
1605___
1606$code.=<<___;
1607 mov %rsp, %rbp # backup %rsp
1608 mov 240($arg4), %eax # rounds
1609 mov $arg1, $inp # backup arguments
1610 mov $arg2, $out
1611 mov $arg3, $len
1612 mov $arg4, $key
1613 mov $arg5, %rbx
1614 shr \$4, $len # bytes to blocks
1615
1616 mov %eax, %edx # rounds
1617 shl \$7, %rax # 128 bytes per inner round key
1618 sub \$`128-32`, %rax # size of bit-sliced key schedule
1619 sub %rax, %rsp
1620
1621 mov %rsp, %rax # pass key schedule
1622 mov $key, %rcx # pass key
1623 mov %edx, %r10d # pass rounds
1624 call _bsaes_key_convert
1625	pxor	(%rsp),%xmm7		# fix up round 0 key
1626 movdqa %xmm6,(%rax) # save last round key
1627 movdqa %xmm7,(%rsp)
1628
1629 movdqu (%rbx), @XMM[15] # load IV
1630 sub \$8,$len
1631.Lcbc_dec_loop:
1632 movdqu 0x00($inp), @XMM[0] # load input
1633 movdqu 0x10($inp), @XMM[1]
1634 movdqu 0x20($inp), @XMM[2]
1635 movdqu 0x30($inp), @XMM[3]
1636 movdqu 0x40($inp), @XMM[4]
1637 movdqu 0x50($inp), @XMM[5]
1638 mov %rsp, %rax # pass key schedule
1639 movdqu 0x60($inp), @XMM[6]
1640 mov %edx,%r10d # pass rounds
1641 movdqu 0x70($inp), @XMM[7]
1642 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1643
1644 call _bsaes_decrypt8
1645
1646 pxor 0x20(%rbp), @XMM[0] # ^= IV
1647 movdqu 0x00($inp), @XMM[8] # re-load input
1648 movdqu 0x10($inp), @XMM[9]
1649 pxor @XMM[8], @XMM[1]
1650 movdqu 0x20($inp), @XMM[10]
1651 pxor @XMM[9], @XMM[6]
1652 movdqu 0x30($inp), @XMM[11]
1653 pxor @XMM[10], @XMM[4]
1654 movdqu 0x40($inp), @XMM[12]
1655 pxor @XMM[11], @XMM[2]
1656 movdqu 0x50($inp), @XMM[13]
1657 pxor @XMM[12], @XMM[7]
1658 movdqu 0x60($inp), @XMM[14]
1659 pxor @XMM[13], @XMM[3]
1660 movdqu 0x70($inp), @XMM[15] # IV
1661 pxor @XMM[14], @XMM[5]
1662 movdqu @XMM[0], 0x00($out) # write output
1663 lea 0x80($inp), $inp
1664 movdqu @XMM[1], 0x10($out)
1665 movdqu @XMM[6], 0x20($out)
1666 movdqu @XMM[4], 0x30($out)
1667 movdqu @XMM[2], 0x40($out)
1668 movdqu @XMM[7], 0x50($out)
1669 movdqu @XMM[3], 0x60($out)
1670 movdqu @XMM[5], 0x70($out)
1671 lea 0x80($out), $out
1672 sub \$8,$len
1673 jnc .Lcbc_dec_loop
1674
1675 add \$8,$len
1676 jz .Lcbc_dec_done
1677
1678 movdqu 0x00($inp), @XMM[0] # load input
1679 mov %rsp, %rax # pass key schedule
1680 mov %edx, %r10d # pass rounds
1681 cmp \$2,$len
1682 jb .Lcbc_dec_one
1683 movdqu 0x10($inp), @XMM[1]
1684 je .Lcbc_dec_two
1685 movdqu 0x20($inp), @XMM[2]
1686 cmp \$4,$len
1687 jb .Lcbc_dec_three
1688 movdqu 0x30($inp), @XMM[3]
1689 je .Lcbc_dec_four
1690 movdqu 0x40($inp), @XMM[4]
1691 cmp \$6,$len
1692 jb .Lcbc_dec_five
1693 movdqu 0x50($inp), @XMM[5]
1694 je .Lcbc_dec_six
1695 movdqu 0x60($inp), @XMM[6]
1696 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1697 call _bsaes_decrypt8
1698 pxor 0x20(%rbp), @XMM[0] # ^= IV
1699 movdqu 0x00($inp), @XMM[8] # re-load input
1700 movdqu 0x10($inp), @XMM[9]
1701 pxor @XMM[8], @XMM[1]
1702 movdqu 0x20($inp), @XMM[10]
1703 pxor @XMM[9], @XMM[6]
1704 movdqu 0x30($inp), @XMM[11]
1705 pxor @XMM[10], @XMM[4]
1706 movdqu 0x40($inp), @XMM[12]
1707 pxor @XMM[11], @XMM[2]
1708 movdqu 0x50($inp), @XMM[13]
1709 pxor @XMM[12], @XMM[7]
1710 movdqu 0x60($inp), @XMM[15] # IV
1711 pxor @XMM[13], @XMM[3]
1712 movdqu @XMM[0], 0x00($out) # write output
1713 movdqu @XMM[1], 0x10($out)
1714 movdqu @XMM[6], 0x20($out)
1715 movdqu @XMM[4], 0x30($out)
1716 movdqu @XMM[2], 0x40($out)
1717 movdqu @XMM[7], 0x50($out)
1718 movdqu @XMM[3], 0x60($out)
1719 jmp .Lcbc_dec_done
1720.align 16
1721.Lcbc_dec_six:
1722 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1723 call _bsaes_decrypt8
1724 pxor 0x20(%rbp), @XMM[0] # ^= IV
1725 movdqu 0x00($inp), @XMM[8] # re-load input
1726 movdqu 0x10($inp), @XMM[9]
1727 pxor @XMM[8], @XMM[1]
1728 movdqu 0x20($inp), @XMM[10]
1729 pxor @XMM[9], @XMM[6]
1730 movdqu 0x30($inp), @XMM[11]
1731 pxor @XMM[10], @XMM[4]
1732 movdqu 0x40($inp), @XMM[12]
1733 pxor @XMM[11], @XMM[2]
1734 movdqu 0x50($inp), @XMM[15] # IV
1735 pxor @XMM[12], @XMM[7]
1736 movdqu @XMM[0], 0x00($out) # write output
1737 movdqu @XMM[1], 0x10($out)
1738 movdqu @XMM[6], 0x20($out)
1739 movdqu @XMM[4], 0x30($out)
1740 movdqu @XMM[2], 0x40($out)
1741 movdqu @XMM[7], 0x50($out)
1742 jmp .Lcbc_dec_done
1743.align 16
1744.Lcbc_dec_five:
1745 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1746 call _bsaes_decrypt8
1747 pxor 0x20(%rbp), @XMM[0] # ^= IV
1748 movdqu 0x00($inp), @XMM[8] # re-load input
1749 movdqu 0x10($inp), @XMM[9]
1750 pxor @XMM[8], @XMM[1]
1751 movdqu 0x20($inp), @XMM[10]
1752 pxor @XMM[9], @XMM[6]
1753 movdqu 0x30($inp), @XMM[11]
1754 pxor @XMM[10], @XMM[4]
1755 movdqu 0x40($inp), @XMM[15] # IV
1756 pxor @XMM[11], @XMM[2]
1757 movdqu @XMM[0], 0x00($out) # write output
1758 movdqu @XMM[1], 0x10($out)
1759 movdqu @XMM[6], 0x20($out)
1760 movdqu @XMM[4], 0x30($out)
1761 movdqu @XMM[2], 0x40($out)
1762 jmp .Lcbc_dec_done
1763.align 16
1764.Lcbc_dec_four:
1765 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1766 call _bsaes_decrypt8
1767 pxor 0x20(%rbp), @XMM[0] # ^= IV
1768 movdqu 0x00($inp), @XMM[8] # re-load input
1769 movdqu 0x10($inp), @XMM[9]
1770 pxor @XMM[8], @XMM[1]
1771 movdqu 0x20($inp), @XMM[10]
1772 pxor @XMM[9], @XMM[6]
1773 movdqu 0x30($inp), @XMM[15] # IV
1774 pxor @XMM[10], @XMM[4]
1775 movdqu @XMM[0], 0x00($out) # write output
1776 movdqu @XMM[1], 0x10($out)
1777 movdqu @XMM[6], 0x20($out)
1778 movdqu @XMM[4], 0x30($out)
1779 jmp .Lcbc_dec_done
1780.align 16
1781.Lcbc_dec_three:
1782 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1783 call _bsaes_decrypt8
1784 pxor 0x20(%rbp), @XMM[0] # ^= IV
1785 movdqu 0x00($inp), @XMM[8] # re-load input
1786 movdqu 0x10($inp), @XMM[9]
1787 pxor @XMM[8], @XMM[1]
1788 movdqu 0x20($inp), @XMM[15] # IV
1789 pxor @XMM[9], @XMM[6]
1790 movdqu @XMM[0], 0x00($out) # write output
1791 movdqu @XMM[1], 0x10($out)
1792 movdqu @XMM[6], 0x20($out)
1793 jmp .Lcbc_dec_done
1794.align 16
1795.Lcbc_dec_two:
1796 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1797 call _bsaes_decrypt8
1798 pxor 0x20(%rbp), @XMM[0] # ^= IV
1799 movdqu 0x00($inp), @XMM[8] # re-load input
1800 movdqu 0x10($inp), @XMM[15] # IV
1801 pxor @XMM[8], @XMM[1]
1802 movdqu @XMM[0], 0x00($out) # write output
1803 movdqu @XMM[1], 0x10($out)
1804 jmp .Lcbc_dec_done
1805.align 16
1806.Lcbc_dec_one:
1807 lea ($inp), $arg1
1808 lea 0x20(%rbp), $arg2 # buffer output
1809 lea ($key), $arg3
1810 call asm_AES_decrypt # doesn't touch %xmm
1811 pxor 0x20(%rbp), @XMM[15] # ^= IV
1812 movdqu @XMM[15], ($out) # write output
1813 movdqa @XMM[0], @XMM[15] # IV
1814
1815.Lcbc_dec_done:
1816 movdqu @XMM[15], (%rbx) # return IV
1817 lea (%rsp), %rax
1818 pxor %xmm0, %xmm0
1819.Lcbc_dec_bzero: # wipe key schedule [if any]
1820 movdqa %xmm0, 0x00(%rax)
1821 movdqa %xmm0, 0x10(%rax)
1822 lea 0x20(%rax), %rax
1823 cmp %rax, %rbp
1824 ja .Lcbc_dec_bzero
1825
1826 lea (%rbp),%rsp # restore %rsp
1827___
1828$code.=<<___ if ($win64);
1829 movaps 0x40(%rbp), %xmm6
1830 movaps 0x50(%rbp), %xmm7
1831 movaps 0x60(%rbp), %xmm8
1832 movaps 0x70(%rbp), %xmm9
1833 movaps 0x80(%rbp), %xmm10
1834 movaps 0x90(%rbp), %xmm11
1835 movaps 0xa0(%rbp), %xmm12
1836 movaps 0xb0(%rbp), %xmm13
1837 movaps 0xc0(%rbp), %xmm14
1838 movaps 0xd0(%rbp), %xmm15
1839 lea 0xa0(%rbp), %rsp
1840___
1841$code.=<<___;
1842 mov 0x48(%rsp), %r15
1843 mov 0x50(%rsp), %r14
1844 mov 0x58(%rsp), %r13
1845 mov 0x60(%rsp), %r12
1846 mov 0x68(%rsp), %rbx
1847 mov 0x70(%rsp), %rax
1848 lea 0x78(%rsp), %rsp
1849 mov %rax, %rbp
1850.Lcbc_dec_epilogue:
1851 ret
1852.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1853
1854.globl bsaes_ctr32_encrypt_blocks
1855.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1856.align 16
1857bsaes_ctr32_encrypt_blocks:
1858 mov %rsp, %rax
1859.Lctr_enc_prologue:
1860 push %rbp
1861 push %rbx
1862 push %r12
1863 push %r13
1864 push %r14
1865 push %r15
1866 lea -0x48(%rsp), %rsp
1867___
1868$code.=<<___ if ($win64);
1869 mov 0xa0(%rsp),$arg5 # pull ivp
1870 lea -0xa0(%rsp), %rsp
1871 movaps %xmm6, 0x40(%rsp)
1872 movaps %xmm7, 0x50(%rsp)
1873 movaps %xmm8, 0x60(%rsp)
1874 movaps %xmm9, 0x70(%rsp)
1875 movaps %xmm10, 0x80(%rsp)
1876 movaps %xmm11, 0x90(%rsp)
1877 movaps %xmm12, 0xa0(%rsp)
1878 movaps %xmm13, 0xb0(%rsp)
1879 movaps %xmm14, 0xc0(%rsp)
1880 movaps %xmm15, 0xd0(%rsp)
1881.Lctr_enc_body:
1882___
1883$code.=<<___;
1884 mov %rsp, %rbp # backup %rsp
1885 movdqu ($arg5), %xmm0 # load counter
1886 mov 240($arg4), %eax # rounds
1887 mov $arg1, $inp # backup arguments
1888 mov $arg2, $out
1889 mov $arg3, $len
1890 mov $arg4, $key
1891 movdqa %xmm0, 0x20(%rbp) # copy counter
1892 cmp \$8, $arg3
1893 jb .Lctr_enc_short
1894
1895 mov %eax, %ebx # rounds
1896 shl \$7, %rax # 128 bytes per inner round key
1897 sub \$`128-32`, %rax # size of bit-sliced key schedule
1898 sub %rax, %rsp
1899
1900 mov %rsp, %rax # pass key schedule
1901 mov $key, %rcx # pass key
1902 mov %ebx, %r10d # pass rounds
1903 call _bsaes_key_convert
1904 pxor %xmm6,%xmm7 # fix up last round key
1905 movdqa %xmm7,(%rax) # save last round key
1906
1907 movdqa (%rsp), @XMM[9] # load round0 key
1908 lea .LADD1(%rip), %r11
1909 movdqa 0x20(%rbp), @XMM[0] # counter copy
1910 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1911 pshufb @XMM[8], @XMM[9] # byte swap upper part
1912 pshufb @XMM[8], @XMM[0]
1913 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1914 jmp .Lctr_enc_loop
1915.align 16
1916.Lctr_enc_loop:
1917 movdqa @XMM[0], 0x20(%rbp) # save counter
1918 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1919 movdqa @XMM[0], @XMM[2]
1920 paddd 0x00(%r11), @XMM[1] # .LADD1
1921 movdqa @XMM[0], @XMM[3]
1922 paddd 0x10(%r11), @XMM[2] # .LADD2
1923 movdqa @XMM[0], @XMM[4]
1924 paddd 0x20(%r11), @XMM[3] # .LADD3
1925 movdqa @XMM[0], @XMM[5]
1926 paddd 0x30(%r11), @XMM[4] # .LADD4
1927 movdqa @XMM[0], @XMM[6]
1928 paddd 0x40(%r11), @XMM[5] # .LADD5
1929 movdqa @XMM[0], @XMM[7]
1930 paddd 0x50(%r11), @XMM[6] # .LADD6
1931 paddd 0x60(%r11), @XMM[7] # .LADD7
1932
1933	# Borrow the prologue from _bsaes_encrypt8 to take the opportunity
1934	# to flip the byte order of the 32-bit counter
1935 movdqa (%rsp), @XMM[9] # round 0 key
1936 lea 0x10(%rsp), %rax # pass key schedule
1937 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1938 pxor @XMM[9], @XMM[0] # xor with round0 key
1939 pxor @XMM[9], @XMM[1]
1940 pshufb @XMM[8], @XMM[0]
1941 pxor @XMM[9], @XMM[2]
1942 pshufb @XMM[8], @XMM[1]
1943 pxor @XMM[9], @XMM[3]
1944 pshufb @XMM[8], @XMM[2]
1945 pxor @XMM[9], @XMM[4]
1946 pshufb @XMM[8], @XMM[3]
1947 pxor @XMM[9], @XMM[5]
1948 pshufb @XMM[8], @XMM[4]
1949 pxor @XMM[9], @XMM[6]
1950 pshufb @XMM[8], @XMM[5]
1951 pxor @XMM[9], @XMM[7]
1952 pshufb @XMM[8], @XMM[6]
1953 lea .LBS0(%rip), %r11 # constants table
1954 pshufb @XMM[8], @XMM[7]
1955 mov %ebx,%r10d # pass rounds
1956
1957 call _bsaes_encrypt8_bitslice
1958
1959 sub \$8,$len
1960 jc .Lctr_enc_loop_done
1961
1962 movdqu 0x00($inp), @XMM[8] # load input
1963 movdqu 0x10($inp), @XMM[9]
1964 movdqu 0x20($inp), @XMM[10]
1965 movdqu 0x30($inp), @XMM[11]
1966 movdqu 0x40($inp), @XMM[12]
1967 movdqu 0x50($inp), @XMM[13]
1968 movdqu 0x60($inp), @XMM[14]
1969 movdqu 0x70($inp), @XMM[15]
1970 lea 0x80($inp),$inp
1971 pxor @XMM[0], @XMM[8]
1972 movdqa 0x20(%rbp), @XMM[0] # load counter
1973 pxor @XMM[9], @XMM[1]
1974 movdqu @XMM[8], 0x00($out) # write output
1975 pxor @XMM[10], @XMM[4]
1976 movdqu @XMM[1], 0x10($out)
1977 pxor @XMM[11], @XMM[6]
1978 movdqu @XMM[4], 0x20($out)
1979 pxor @XMM[12], @XMM[3]
1980 movdqu @XMM[6], 0x30($out)
1981 pxor @XMM[13], @XMM[7]
1982 movdqu @XMM[3], 0x40($out)
1983 pxor @XMM[14], @XMM[2]
1984 movdqu @XMM[7], 0x50($out)
1985 pxor @XMM[15], @XMM[5]
1986 movdqu @XMM[2], 0x60($out)
1987 lea .LADD1(%rip), %r11
1988 movdqu @XMM[5], 0x70($out)
1989 lea 0x80($out), $out
1990 paddd 0x70(%r11), @XMM[0] # .LADD8
1991 jnz .Lctr_enc_loop
1992
1993 jmp .Lctr_enc_done
1994.align 16
1995.Lctr_enc_loop_done:
1996 add \$8, $len
1997 movdqu 0x00($inp), @XMM[8] # load input
1998 pxor @XMM[8], @XMM[0]
1999 movdqu @XMM[0], 0x00($out) # write output
2000 cmp \$2,$len
2001 jb .Lctr_enc_done
2002 movdqu 0x10($inp), @XMM[9]
2003 pxor @XMM[9], @XMM[1]
2004 movdqu @XMM[1], 0x10($out)
2005 je .Lctr_enc_done
2006 movdqu 0x20($inp), @XMM[10]
2007 pxor @XMM[10], @XMM[4]
2008 movdqu @XMM[4], 0x20($out)
2009 cmp \$4,$len
2010 jb .Lctr_enc_done
2011 movdqu 0x30($inp), @XMM[11]
2012 pxor @XMM[11], @XMM[6]
2013 movdqu @XMM[6], 0x30($out)
2014 je .Lctr_enc_done
2015 movdqu 0x40($inp), @XMM[12]
2016 pxor @XMM[12], @XMM[3]
2017 movdqu @XMM[3], 0x40($out)
2018 cmp \$6,$len
2019 jb .Lctr_enc_done
2020 movdqu 0x50($inp), @XMM[13]
2021 pxor @XMM[13], @XMM[7]
2022 movdqu @XMM[7], 0x50($out)
2023 je .Lctr_enc_done
2024 movdqu 0x60($inp), @XMM[14]
2025 pxor @XMM[14], @XMM[2]
2026 movdqu @XMM[2], 0x60($out)
2027 jmp .Lctr_enc_done
2028
2029.align 16
2030.Lctr_enc_short:
2031 lea 0x20(%rbp), $arg1
2032 lea 0x30(%rbp), $arg2
2033 lea ($key), $arg3
2034 call asm_AES_encrypt
2035 movdqu ($inp), @XMM[1]
2036 lea 16($inp), $inp
2037 mov 0x2c(%rbp), %eax # load 32-bit counter
2038 bswap %eax
2039 pxor 0x30(%rbp), @XMM[1]
2040 inc %eax # increment
2041 movdqu @XMM[1], ($out)
2042 bswap %eax
2043 lea 16($out), $out
2044	mov	%eax, 0x2c(%rbp)	# save 32-bit counter (in the counter copy, so the increment sticks)
2045 dec $len
2046 jnz .Lctr_enc_short
2047
2048.Lctr_enc_done:
2049 lea (%rsp), %rax
2050 pxor %xmm0, %xmm0
2051.Lctr_enc_bzero: # wipe key schedule [if any]
2052 movdqa %xmm0, 0x00(%rax)
2053 movdqa %xmm0, 0x10(%rax)
2054 lea 0x20(%rax), %rax
2055 cmp %rax, %rbp
2056 ja .Lctr_enc_bzero
2057
2058 lea (%rbp),%rsp # restore %rsp
2059___
2060$code.=<<___ if ($win64);
2061 movaps 0x40(%rbp), %xmm6
2062 movaps 0x50(%rbp), %xmm7
2063 movaps 0x60(%rbp), %xmm8
2064 movaps 0x70(%rbp), %xmm9
2065 movaps 0x80(%rbp), %xmm10
2066 movaps 0x90(%rbp), %xmm11
2067 movaps 0xa0(%rbp), %xmm12
2068 movaps 0xb0(%rbp), %xmm13
2069 movaps 0xc0(%rbp), %xmm14
2070 movaps 0xd0(%rbp), %xmm15
2071 lea 0xa0(%rbp), %rsp
2072___
2073$code.=<<___;
2074 mov 0x48(%rsp), %r15
2075 mov 0x50(%rsp), %r14
2076 mov 0x58(%rsp), %r13
2077 mov 0x60(%rsp), %r12
2078 mov 0x68(%rsp), %rbx
2079 mov 0x70(%rsp), %rax
2080 lea 0x78(%rsp), %rsp
2081 mov %rax, %rbp
2082.Lctr_enc_epilogue:
2083 ret
2084.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2085___
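# Hedged scalar model of the counter handling in bsaes_ctr32_encrypt_blocks
# above: only the last 32 bits of the IV block are treated as a big-endian
# counter (cf. the bswap/inc/bswap in .Lctr_enc_short and the .LADD1..8
# vector constants), wrapping modulo 2^32.
sub ctr32_increment {
	my ($ivblock) = @_;			# 16-byte counter block
	my $ctr = unpack("N", substr($ivblock, 12, 4));
	substr($ivblock, 12, 4) = pack("N", ($ctr + 1) & 0xffffffff);
	return $ivblock;
}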
2086######################################################################
2087# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2088# const AES_KEY *key1, const AES_KEY *key2,
2089# const unsigned char iv[16]);
2090#
2091my ($twmask,$twres,$twtmp)=@XMM[13..15];
2092$arg6=~s/d$//;
2093
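# Hedged scalar model of the tweak update that the pshufd/pand/paddq/pxor
# sequences below perform with the .Lxts_magic constant: multiply the
# 128-bit little-endian tweak by x in GF(2^128) modulo x^128+x^7+x^2+x+1,
# folding the carry back in as 0x87.
sub xts_double_tweak {
	my @t = @_;				# 16 tweak bytes, $t[0] least significant
	my $carry = ($t[15] >> 7) & 1;
	for (my $i = 15; $i > 0; $i--) {
		$t[$i] = (($t[$i] << 1) | ($t[$i - 1] >> 7)) & 0xff;
	}
	$t[0] = (($t[0] << 1) & 0xff) ^ ($carry ? 0x87 : 0);
	return @t;
}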
2094$code.=<<___;
2095.globl bsaes_xts_encrypt
2096.type bsaes_xts_encrypt,\@abi-omnipotent
2097.align 16
2098bsaes_xts_encrypt:
2099 mov %rsp, %rax
2100.Lxts_enc_prologue:
2101 push %rbp
2102 push %rbx
2103 push %r12
2104 push %r13
2105 push %r14
2106 push %r15
2107 lea -0x48(%rsp), %rsp
2108___
2109$code.=<<___ if ($win64);
2110 mov 0xa0(%rsp),$arg5 # pull key2
2111 mov 0xa8(%rsp),$arg6 # pull ivp
2112 lea -0xa0(%rsp), %rsp
2113 movaps %xmm6, 0x40(%rsp)
2114 movaps %xmm7, 0x50(%rsp)
2115 movaps %xmm8, 0x60(%rsp)
2116 movaps %xmm9, 0x70(%rsp)
2117 movaps %xmm10, 0x80(%rsp)
2118 movaps %xmm11, 0x90(%rsp)
2119 movaps %xmm12, 0xa0(%rsp)
2120 movaps %xmm13, 0xb0(%rsp)
2121 movaps %xmm14, 0xc0(%rsp)
2122 movaps %xmm15, 0xd0(%rsp)
2123.Lxts_enc_body:
2124___
2125$code.=<<___;
2126 mov %rsp, %rbp # backup %rsp
2127 mov $arg1, $inp # backup arguments
2128 mov $arg2, $out
2129 mov $arg3, $len
2130 mov $arg4, $key
2131
2132 lea ($arg6), $arg1
2133 lea 0x20(%rbp), $arg2
2134 lea ($arg5), $arg3
2135 call asm_AES_encrypt # generate initial tweak
2136
2137 mov 240($key), %eax # rounds
2138 mov $len, %rbx # backup $len
2139
2140 mov %eax, %edx # rounds
2141 shl \$7, %rax # 128 bytes per inner round key
2142 sub \$`128-32`, %rax # size of bit-sliced key schedule
2143 sub %rax, %rsp
2144
2145 mov %rsp, %rax # pass key schedule
2146 mov $key, %rcx # pass key
2147 mov %edx, %r10d # pass rounds
2148 call _bsaes_key_convert
2149 pxor %xmm6, %xmm7 # fix up last round key
2150 movdqa %xmm7, (%rax) # save last round key
2151
2152 and \$-16, $len
2153 sub \$0x80, %rsp # place for tweak[8]
2154 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2155
2156 pxor $twtmp, $twtmp
2157 movdqa .Lxts_magic(%rip), $twmask
2158 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2159
2160 sub \$0x80, $len
2161 jc .Lxts_enc_short
2162 jmp .Lxts_enc_loop
2163
2164.align 16
2165.Lxts_enc_loop:
2166___
2167 for ($i=0;$i<7;$i++) {
2168 $code.=<<___;
2169 pshufd \$0x13, $twtmp, $twres
2170 pxor $twtmp, $twtmp
2171 movdqa @XMM[7], @XMM[$i]
2172 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2173 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2174 pand $twmask, $twres # isolate carry and residue
2175 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2176 pxor $twres, @XMM[7]
2177___
2178 $code.=<<___ if ($i>=1);
2179 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2180___
2181 $code.=<<___ if ($i>=2);
2182 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2183___
2184 }
2185$code.=<<___;
2186 movdqu 0x60($inp), @XMM[8+6]
2187 pxor @XMM[8+5], @XMM[5]
2188 movdqu 0x70($inp), @XMM[8+7]
2189 lea 0x80($inp), $inp
2190 movdqa @XMM[7], 0x70(%rsp)
2191 pxor @XMM[8+6], @XMM[6]
2192 lea 0x80(%rsp), %rax # pass key schedule
2193 pxor @XMM[8+7], @XMM[7]
2194 mov %edx, %r10d # pass rounds
2195
2196 call _bsaes_encrypt8
2197
2198 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2199 pxor 0x10(%rsp), @XMM[1]
2200 movdqu @XMM[0], 0x00($out) # write output
2201 pxor 0x20(%rsp), @XMM[4]
2202 movdqu @XMM[1], 0x10($out)
2203 pxor 0x30(%rsp), @XMM[6]
2204 movdqu @XMM[4], 0x20($out)
2205 pxor 0x40(%rsp), @XMM[3]
2206 movdqu @XMM[6], 0x30($out)
2207 pxor 0x50(%rsp), @XMM[7]
2208 movdqu @XMM[3], 0x40($out)
2209 pxor 0x60(%rsp), @XMM[2]
2210 movdqu @XMM[7], 0x50($out)
2211 pxor 0x70(%rsp), @XMM[5]
2212 movdqu @XMM[2], 0x60($out)
2213 movdqu @XMM[5], 0x70($out)
2214 lea 0x80($out), $out
2215
2216 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2217 pxor $twtmp, $twtmp
2218 movdqa .Lxts_magic(%rip), $twmask
2219 pcmpgtd @XMM[7], $twtmp
2220 pshufd \$0x13, $twtmp, $twres
2221 pxor $twtmp, $twtmp
2222 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2223 pand $twmask, $twres # isolate carry and residue
2224 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2225 pxor $twres, @XMM[7]
2226
2227 sub \$0x80,$len
2228 jnc .Lxts_enc_loop
2229
2230.Lxts_enc_short:
2231 add \$0x80, $len
2232 jz .Lxts_enc_done
2233___
2234 for ($i=0;$i<7;$i++) {
2235 $code.=<<___;
2236 pshufd \$0x13, $twtmp, $twres
2237 pxor $twtmp, $twtmp
2238 movdqa @XMM[7], @XMM[$i]
2239 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2240 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2241 pand $twmask, $twres # isolate carry and residue
2242 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2243 pxor $twres, @XMM[7]
2244___
2245 $code.=<<___ if ($i>=1);
2246 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2247 cmp \$`0x10*$i`,$len
2248 je .Lxts_enc_$i
2249___
2250 $code.=<<___ if ($i>=2);
2251 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2252___
2253 }
2254$code.=<<___;
2255 movdqu 0x60($inp), @XMM[8+6]
2256 pxor @XMM[8+5], @XMM[5]
2257 movdqa @XMM[7], 0x70(%rsp)
2258 lea 0x70($inp), $inp
2259 pxor @XMM[8+6], @XMM[6]
2260 lea 0x80(%rsp), %rax # pass key schedule
2261 mov %edx, %r10d # pass rounds
2262
2263 call _bsaes_encrypt8
2264
2265 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2266 pxor 0x10(%rsp), @XMM[1]
2267 movdqu @XMM[0], 0x00($out) # write output
2268 pxor 0x20(%rsp), @XMM[4]
2269 movdqu @XMM[1], 0x10($out)
2270 pxor 0x30(%rsp), @XMM[6]
2271 movdqu @XMM[4], 0x20($out)
2272 pxor 0x40(%rsp), @XMM[3]
2273 movdqu @XMM[6], 0x30($out)
2274 pxor 0x50(%rsp), @XMM[7]
2275 movdqu @XMM[3], 0x40($out)
2276 pxor 0x60(%rsp), @XMM[2]
2277 movdqu @XMM[7], 0x50($out)
2278 movdqu @XMM[2], 0x60($out)
2279 lea 0x70($out), $out
2280
2281 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2282 jmp .Lxts_enc_done
2283.align 16
2284.Lxts_enc_6:
2285 pxor @XMM[8+4], @XMM[4]
2286 lea 0x60($inp), $inp
2287 pxor @XMM[8+5], @XMM[5]
2288 lea 0x80(%rsp), %rax # pass key schedule
2289 mov %edx, %r10d # pass rounds
2290
2291 call _bsaes_encrypt8
2292
2293 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2294 pxor 0x10(%rsp), @XMM[1]
2295 movdqu @XMM[0], 0x00($out) # write output
2296 pxor 0x20(%rsp), @XMM[4]
2297 movdqu @XMM[1], 0x10($out)
2298 pxor 0x30(%rsp), @XMM[6]
2299 movdqu @XMM[4], 0x20($out)
2300 pxor 0x40(%rsp), @XMM[3]
2301 movdqu @XMM[6], 0x30($out)
2302 pxor 0x50(%rsp), @XMM[7]
2303 movdqu @XMM[3], 0x40($out)
2304 movdqu @XMM[7], 0x50($out)
2305 lea 0x60($out), $out
2306
2307 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2308 jmp .Lxts_enc_done
2309.align 16
2310.Lxts_enc_5:
2311 pxor @XMM[8+3], @XMM[3]
2312 lea 0x50($inp), $inp
2313 pxor @XMM[8+4], @XMM[4]
2314 lea 0x80(%rsp), %rax # pass key schedule
2315 mov %edx, %r10d # pass rounds
2316
2317 call _bsaes_encrypt8
2318
2319 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2320 pxor 0x10(%rsp), @XMM[1]
2321 movdqu @XMM[0], 0x00($out) # write output
2322 pxor 0x20(%rsp), @XMM[4]
2323 movdqu @XMM[1], 0x10($out)
2324 pxor 0x30(%rsp), @XMM[6]
2325 movdqu @XMM[4], 0x20($out)
2326 pxor 0x40(%rsp), @XMM[3]
2327 movdqu @XMM[6], 0x30($out)
2328 movdqu @XMM[3], 0x40($out)
2329 lea 0x50($out), $out
2330
2331 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2332 jmp .Lxts_enc_done
2333.align 16
2334.Lxts_enc_4:
2335 pxor @XMM[8+2], @XMM[2]
2336 lea 0x40($inp), $inp
2337 pxor @XMM[8+3], @XMM[3]
2338 lea 0x80(%rsp), %rax # pass key schedule
2339 mov %edx, %r10d # pass rounds
2340
2341 call _bsaes_encrypt8
2342
2343 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2344 pxor 0x10(%rsp), @XMM[1]
2345 movdqu @XMM[0], 0x00($out) # write output
2346 pxor 0x20(%rsp), @XMM[4]
2347 movdqu @XMM[1], 0x10($out)
2348 pxor 0x30(%rsp), @XMM[6]
2349 movdqu @XMM[4], 0x20($out)
2350 movdqu @XMM[6], 0x30($out)
2351 lea 0x40($out), $out
2352
2353 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2354 jmp .Lxts_enc_done
2355.align 16
2356.Lxts_enc_3:
2357 pxor @XMM[8+1], @XMM[1]
2358 lea 0x30($inp), $inp
2359 pxor @XMM[8+2], @XMM[2]
2360 lea 0x80(%rsp), %rax # pass key schedule
2361 mov %edx, %r10d # pass rounds
2362
2363 call _bsaes_encrypt8
2364
2365 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2366 pxor 0x10(%rsp), @XMM[1]
2367 movdqu @XMM[0], 0x00($out) # write output
2368 pxor 0x20(%rsp), @XMM[4]
2369 movdqu @XMM[1], 0x10($out)
2370 movdqu @XMM[4], 0x20($out)
2371 lea 0x30($out), $out
2372
2373 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2374 jmp .Lxts_enc_done
2375.align 16
2376.Lxts_enc_2:
2377 pxor @XMM[8+0], @XMM[0]
2378 lea 0x20($inp), $inp
2379 pxor @XMM[8+1], @XMM[1]
2380 lea 0x80(%rsp), %rax # pass key schedule
2381 mov %edx, %r10d # pass rounds
2382
2383 call _bsaes_encrypt8
2384
2385 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2386 pxor 0x10(%rsp), @XMM[1]
2387 movdqu @XMM[0], 0x00($out) # write output
2388 movdqu @XMM[1], 0x10($out)
2389 lea 0x20($out), $out
2390
2391 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2392 jmp .Lxts_enc_done
2393.align 16
2394.Lxts_enc_1:
2395 pxor @XMM[0], @XMM[8]
2396 lea 0x10($inp), $inp
2397 movdqa @XMM[8], 0x20(%rbp)
2398 lea 0x20(%rbp), $arg1
2399 lea 0x20(%rbp), $arg2
2400 lea ($key), $arg3
2401 call asm_AES_encrypt # doesn't touch %xmm
2402 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2403 #pxor @XMM[8], @XMM[0]
2404 #lea 0x80(%rsp), %rax # pass key schedule
2405 #mov %edx, %r10d # pass rounds
2406 #call _bsaes_encrypt8
2407 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2408 movdqu @XMM[0], 0x00($out) # write output
2409 lea 0x10($out), $out
2410
2411 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2412
2413.Lxts_enc_done:
2414 and \$15, %ebx
2415 jz .Lxts_enc_ret
2416 mov $out, %rdx
2417
2418.Lxts_enc_steal:
2419 movzb ($inp), %eax
2420 movzb -16(%rdx), %ecx
2421 lea 1($inp), $inp
2422 mov %al, -16(%rdx)
2423 mov %cl, 0(%rdx)
2424 lea 1(%rdx), %rdx
2425 sub \$1,%ebx
2426 jnz .Lxts_enc_steal
2427
2428 movdqu -16($out), @XMM[0]
2429 lea 0x20(%rbp), $arg1
2430 pxor @XMM[7], @XMM[0]
2431 lea 0x20(%rbp), $arg2
2432 movdqa @XMM[0], 0x20(%rbp)
2433 lea ($key), $arg3
2434 call asm_AES_encrypt # doesn't touch %xmm
2435 pxor 0x20(%rbp), @XMM[7]
2436 movdqu @XMM[7], -16($out)
2437
2438.Lxts_enc_ret:
2439 lea (%rsp), %rax
2440 pxor %xmm0, %xmm0
2441.Lxts_enc_bzero: # wipe key schedule [if any]
2442 movdqa %xmm0, 0x00(%rax)
2443 movdqa %xmm0, 0x10(%rax)
2444 lea 0x20(%rax), %rax
2445 cmp %rax, %rbp
2446 ja .Lxts_enc_bzero
2447
2448 lea (%rbp),%rsp # restore %rsp
2449___
2450$code.=<<___ if ($win64);
2451 movaps 0x40(%rbp), %xmm6
2452 movaps 0x50(%rbp), %xmm7
2453 movaps 0x60(%rbp), %xmm8
2454 movaps 0x70(%rbp), %xmm9
2455 movaps 0x80(%rbp), %xmm10
2456 movaps 0x90(%rbp), %xmm11
2457 movaps 0xa0(%rbp), %xmm12
2458 movaps 0xb0(%rbp), %xmm13
2459 movaps 0xc0(%rbp), %xmm14
2460 movaps 0xd0(%rbp), %xmm15
2461 lea 0xa0(%rbp), %rsp
2462___
2463$code.=<<___;
2464 mov 0x48(%rsp), %r15
2465 mov 0x50(%rsp), %r14
2466 mov 0x58(%rsp), %r13
2467 mov 0x60(%rsp), %r12
2468 mov 0x68(%rsp), %rbx
2469 mov 0x70(%rsp), %rax
2470 lea 0x78(%rsp), %rsp
2471 mov %rax, %rbp
2472.Lxts_enc_epilogue:
2473 ret
2474.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2475
2476.globl bsaes_xts_decrypt
2477.type bsaes_xts_decrypt,\@abi-omnipotent
2478.align 16
2479bsaes_xts_decrypt:
2480 mov %rsp, %rax
2481.Lxts_dec_prologue:
2482 push %rbp
2483 push %rbx
2484 push %r12
2485 push %r13
2486 push %r14
2487 push %r15
2488 lea -0x48(%rsp), %rsp
2489___
2490$code.=<<___ if ($win64);
2491 mov 0xa0(%rsp),$arg5 # pull key2
2492 mov 0xa8(%rsp),$arg6 # pull ivp
2493 lea -0xa0(%rsp), %rsp
2494 movaps %xmm6, 0x40(%rsp)
2495 movaps %xmm7, 0x50(%rsp)
2496 movaps %xmm8, 0x60(%rsp)
2497 movaps %xmm9, 0x70(%rsp)
2498 movaps %xmm10, 0x80(%rsp)
2499 movaps %xmm11, 0x90(%rsp)
2500 movaps %xmm12, 0xa0(%rsp)
2501 movaps %xmm13, 0xb0(%rsp)
2502 movaps %xmm14, 0xc0(%rsp)
2503 movaps %xmm15, 0xd0(%rsp)
2504.Lxts_dec_body:
2505___
2506$code.=<<___;
2507 mov %rsp, %rbp # backup %rsp
2508 mov $arg1, $inp # backup arguments
2509 mov $arg2, $out
2510 mov $arg3, $len
2511 mov $arg4, $key
2512
2513 lea ($arg6), $arg1
2514 lea 0x20(%rbp), $arg2
2515 lea ($arg5), $arg3
2516 call asm_AES_encrypt # generate initial tweak
2517
2518 mov 240($key), %eax # rounds
2519 mov $len, %rbx # backup $len
2520
2521 mov %eax, %edx # rounds
2522 shl \$7, %rax # 128 bytes per inner round key
2523 sub \$`128-32`, %rax # size of bit-sliced key schedule
2524 sub %rax, %rsp
2525
2526 mov %rsp, %rax # pass key schedule
2527 mov $key, %rcx # pass key
2528 mov %edx, %r10d # pass rounds
2529 call _bsaes_key_convert
2530 pxor (%rsp), %xmm7 # fix up round 0 key
2531 movdqa %xmm6, (%rax) # save last round key
2532 movdqa %xmm7, (%rsp)
2533
2534	xor	%eax, %eax		# if ($len%16) $len -= 16: hold back last full block for stealing
2535 and \$-16, $len
2536 test \$15, %ebx
2537 setnz %al
2538 shl \$4, %rax
2539 sub %rax, $len
2540
2541 sub \$0x80, %rsp # place for tweak[8]
2542 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2543
2544 pxor $twtmp, $twtmp
2545 movdqa .Lxts_magic(%rip), $twmask
2546 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2547
2548 sub \$0x80, $len
2549 jc .Lxts_dec_short
2550 jmp .Lxts_dec_loop
2551
2552.align 16
2553.Lxts_dec_loop:
2554___
2555 for ($i=0;$i<7;$i++) {
2556 $code.=<<___;
2557 pshufd \$0x13, $twtmp, $twres
2558 pxor $twtmp, $twtmp
2559 movdqa @XMM[7], @XMM[$i]
2560 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2561 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2562 pand $twmask, $twres # isolate carry and residue
2563 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2564 pxor $twres, @XMM[7]
2565___
2566 $code.=<<___ if ($i>=1);
2567 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2568___
2569 $code.=<<___ if ($i>=2);
2570 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2571___
2572 }
2573$code.=<<___;
2574 movdqu 0x60($inp), @XMM[8+6]
2575 pxor @XMM[8+5], @XMM[5]
2576 movdqu 0x70($inp), @XMM[8+7]
2577 lea 0x80($inp), $inp
2578 movdqa @XMM[7], 0x70(%rsp)
2579 pxor @XMM[8+6], @XMM[6]
2580 lea 0x80(%rsp), %rax # pass key schedule
2581 pxor @XMM[8+7], @XMM[7]
2582 mov %edx, %r10d # pass rounds
2583
2584 call _bsaes_decrypt8
2585
2586 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2587 pxor 0x10(%rsp), @XMM[1]
2588 movdqu @XMM[0], 0x00($out) # write output
2589 pxor 0x20(%rsp), @XMM[6]
2590 movdqu @XMM[1], 0x10($out)
2591 pxor 0x30(%rsp), @XMM[4]
2592 movdqu @XMM[6], 0x20($out)
2593 pxor 0x40(%rsp), @XMM[2]
2594 movdqu @XMM[4], 0x30($out)
2595 pxor 0x50(%rsp), @XMM[7]
2596 movdqu @XMM[2], 0x40($out)
2597 pxor 0x60(%rsp), @XMM[3]
2598 movdqu @XMM[7], 0x50($out)
2599 pxor 0x70(%rsp), @XMM[5]
2600 movdqu @XMM[3], 0x60($out)
2601 movdqu @XMM[5], 0x70($out)
2602 lea 0x80($out), $out
2603
2604 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2605 pxor $twtmp, $twtmp
2606 movdqa .Lxts_magic(%rip), $twmask
2607 pcmpgtd @XMM[7], $twtmp
2608 pshufd \$0x13, $twtmp, $twres
2609 pxor $twtmp, $twtmp
2610 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2611 pand $twmask, $twres # isolate carry and residue
2612 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2613 pxor $twres, @XMM[7]
2614
2615 sub \$0x80,$len
2616 jnc .Lxts_dec_loop
2617
2618.Lxts_dec_short:
2619 add \$0x80, $len
2620 jz .Lxts_dec_done
2621___
2622 for ($i=0;$i<7;$i++) {
2623 $code.=<<___;
2624 pshufd \$0x13, $twtmp, $twres
2625 pxor $twtmp, $twtmp
2626 movdqa @XMM[7], @XMM[$i]
2627 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2628 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2629 pand $twmask, $twres # isolate carry and residue
2630 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2631 pxor $twres, @XMM[7]
2632___
2633 $code.=<<___ if ($i>=1);
2634 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2635 cmp \$`0x10*$i`,$len
2636 je .Lxts_dec_$i
2637___
2638 $code.=<<___ if ($i>=2);
2639 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2640___
2641 }
2642$code.=<<___;
2643 movdqu 0x60($inp), @XMM[8+6]
2644 pxor @XMM[8+5], @XMM[5]
2645 movdqa @XMM[7], 0x70(%rsp)
2646 lea 0x70($inp), $inp
2647 pxor @XMM[8+6], @XMM[6]
2648 lea 0x80(%rsp), %rax # pass key schedule
2649 mov %edx, %r10d # pass rounds
2650
2651 call _bsaes_decrypt8
2652
2653 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2654 pxor 0x10(%rsp), @XMM[1]
2655 movdqu @XMM[0], 0x00($out) # write output
2656 pxor 0x20(%rsp), @XMM[6]
2657 movdqu @XMM[1], 0x10($out)
2658 pxor 0x30(%rsp), @XMM[4]
2659 movdqu @XMM[6], 0x20($out)
2660 pxor 0x40(%rsp), @XMM[2]
2661 movdqu @XMM[4], 0x30($out)
2662 pxor 0x50(%rsp), @XMM[7]
2663 movdqu @XMM[2], 0x40($out)
2664 pxor 0x60(%rsp), @XMM[3]
2665 movdqu @XMM[7], 0x50($out)
2666 movdqu @XMM[3], 0x60($out)
2667 lea 0x70($out), $out
2668
2669 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2670 jmp .Lxts_dec_done
2671.align 16
2672.Lxts_dec_6:
2673 pxor @XMM[8+4], @XMM[4]
2674 lea 0x60($inp), $inp
2675 pxor @XMM[8+5], @XMM[5]
2676 lea 0x80(%rsp), %rax # pass key schedule
2677 mov %edx, %r10d # pass rounds
2678
2679 call _bsaes_decrypt8
2680
2681 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2682 pxor 0x10(%rsp), @XMM[1]
2683 movdqu @XMM[0], 0x00($out) # write output
2684 pxor 0x20(%rsp), @XMM[6]
2685 movdqu @XMM[1], 0x10($out)
2686 pxor 0x30(%rsp), @XMM[4]
2687 movdqu @XMM[6], 0x20($out)
2688 pxor 0x40(%rsp), @XMM[2]
2689 movdqu @XMM[4], 0x30($out)
2690 pxor 0x50(%rsp), @XMM[7]
2691 movdqu @XMM[2], 0x40($out)
2692 movdqu @XMM[7], 0x50($out)
2693 lea 0x60($out), $out
2694
2695 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2696 jmp .Lxts_dec_done
2697.align 16
2698.Lxts_dec_5:
2699 pxor @XMM[8+3], @XMM[3]
2700 lea 0x50($inp), $inp
2701 pxor @XMM[8+4], @XMM[4]
2702 lea 0x80(%rsp), %rax # pass key schedule
2703 mov %edx, %r10d # pass rounds
2704
2705 call _bsaes_decrypt8
2706
2707 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2708 pxor 0x10(%rsp), @XMM[1]
2709 movdqu @XMM[0], 0x00($out) # write output
2710 pxor 0x20(%rsp), @XMM[6]
2711 movdqu @XMM[1], 0x10($out)
2712 pxor 0x30(%rsp), @XMM[4]
2713 movdqu @XMM[6], 0x20($out)
2714 pxor 0x40(%rsp), @XMM[2]
2715 movdqu @XMM[4], 0x30($out)
2716 movdqu @XMM[2], 0x40($out)
2717 lea 0x50($out), $out
2718
2719 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2720 jmp .Lxts_dec_done
2721.align 16
2722.Lxts_dec_4:
2723 pxor @XMM[8+2], @XMM[2]
2724 lea 0x40($inp), $inp
2725 pxor @XMM[8+3], @XMM[3]
2726 lea 0x80(%rsp), %rax # pass key schedule
2727 mov %edx, %r10d # pass rounds
2728
2729 call _bsaes_decrypt8
2730
2731 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2732 pxor 0x10(%rsp), @XMM[1]
2733 movdqu @XMM[0], 0x00($out) # write output
2734 pxor 0x20(%rsp), @XMM[6]
2735 movdqu @XMM[1], 0x10($out)
2736 pxor 0x30(%rsp), @XMM[4]
2737 movdqu @XMM[6], 0x20($out)
2738 movdqu @XMM[4], 0x30($out)
2739 lea 0x40($out), $out
2740
2741 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2742 jmp .Lxts_dec_done
2743.align 16
2744.Lxts_dec_3:
2745 pxor @XMM[8+1], @XMM[1]
2746 lea 0x30($inp), $inp
2747 pxor @XMM[8+2], @XMM[2]
2748 lea 0x80(%rsp), %rax # pass key schedule
2749 mov %edx, %r10d # pass rounds
2750
2751 call _bsaes_decrypt8
2752
2753 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2754 pxor 0x10(%rsp), @XMM[1]
2755 movdqu @XMM[0], 0x00($out) # write output
2756 pxor 0x20(%rsp), @XMM[6]
2757 movdqu @XMM[1], 0x10($out)
2758 movdqu @XMM[6], 0x20($out)
2759 lea 0x30($out), $out
2760
2761 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2762 jmp .Lxts_dec_done
2763.align 16
2764.Lxts_dec_2:
2765 pxor @XMM[8+0], @XMM[0]
2766 lea 0x20($inp), $inp
2767 pxor @XMM[8+1], @XMM[1]
2768 lea 0x80(%rsp), %rax # pass key schedule
2769 mov %edx, %r10d # pass rounds
2770
2771 call _bsaes_decrypt8
2772
2773 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2774 pxor 0x10(%rsp), @XMM[1]
2775 movdqu @XMM[0], 0x00($out) # write output
2776 movdqu @XMM[1], 0x10($out)
2777 lea 0x20($out), $out
2778
2779 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2780 jmp .Lxts_dec_done
2781.align 16
2782.Lxts_dec_1:
2783 pxor @XMM[0], @XMM[8]
2784 lea 0x10($inp), $inp
2785 movdqa @XMM[8], 0x20(%rbp)
2786 lea 0x20(%rbp), $arg1
2787 lea 0x20(%rbp), $arg2
2788 lea ($key), $arg3
2789 call asm_AES_decrypt # doesn't touch %xmm
2790 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2791 #pxor @XMM[8], @XMM[0]
2792 #lea 0x80(%rsp), %rax # pass key schedule
2793 #mov %edx, %r10d # pass rounds
2794 #call _bsaes_decrypt8
2795 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2796 movdqu @XMM[0], 0x00($out) # write output
2797 lea 0x10($out), $out
2798
2799 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2800
2801.Lxts_dec_done:
2802 and \$15, %ebx
2803 jz .Lxts_dec_ret
2804
2805 pxor $twtmp, $twtmp
2806 movdqa .Lxts_magic(%rip), $twmask
2807 pcmpgtd @XMM[7], $twtmp
2808 pshufd \$0x13, $twtmp, $twres
2809 movdqa @XMM[7], @XMM[6]
2810 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2811 pand $twmask, $twres # isolate carry and residue
2812 movdqu ($inp), @XMM[0]
2813 pxor $twres, @XMM[7]
2814
2815 lea 0x20(%rbp), $arg1
2816 pxor @XMM[7], @XMM[0]
2817 lea 0x20(%rbp), $arg2
2818 movdqa @XMM[0], 0x20(%rbp)
2819 lea ($key), $arg3
2820 call asm_AES_decrypt # doesn't touch %xmm
2821 pxor 0x20(%rbp), @XMM[7]
2822 mov $out, %rdx
2823 movdqu @XMM[7], ($out)
2824
2825.Lxts_dec_steal:
2826 movzb 16($inp), %eax
2827 movzb (%rdx), %ecx
2828 lea 1($inp), $inp
2829 mov %al, (%rdx)
2830 mov %cl, 16(%rdx)
2831 lea 1(%rdx), %rdx
2832 sub \$1,%ebx
2833 jnz .Lxts_dec_steal
2834
2835 movdqu ($out), @XMM[0]
2836 lea 0x20(%rbp), $arg1
2837 pxor @XMM[6], @XMM[0]
2838 lea 0x20(%rbp), $arg2
2839 movdqa @XMM[0], 0x20(%rbp)
2840 lea ($key), $arg3
2841 call asm_AES_decrypt # doesn't touch %xmm
2842 pxor 0x20(%rbp), @XMM[6]
2843 movdqu @XMM[6], ($out)
2844
2845.Lxts_dec_ret:
2846 lea (%rsp), %rax
2847 pxor %xmm0, %xmm0
2848.Lxts_dec_bzero: # wipe key schedule [if any]
2849 movdqa %xmm0, 0x00(%rax)
2850 movdqa %xmm0, 0x10(%rax)
2851 lea 0x20(%rax), %rax
2852 cmp %rax, %rbp
2853 ja .Lxts_dec_bzero
2854
2855 lea (%rbp),%rsp # restore %rsp
2856___
2857$code.=<<___ if ($win64);
2858 movaps 0x40(%rbp), %xmm6
2859 movaps 0x50(%rbp), %xmm7
2860 movaps 0x60(%rbp), %xmm8
2861 movaps 0x70(%rbp), %xmm9
2862 movaps 0x80(%rbp), %xmm10
2863 movaps 0x90(%rbp), %xmm11
2864 movaps 0xa0(%rbp), %xmm12
2865 movaps 0xb0(%rbp), %xmm13
2866 movaps 0xc0(%rbp), %xmm14
2867 movaps 0xd0(%rbp), %xmm15
2868 lea 0xa0(%rbp), %rsp
2869___
2870$code.=<<___;
2871 mov 0x48(%rsp), %r15
2872 mov 0x50(%rsp), %r14
2873 mov 0x58(%rsp), %r13
2874 mov 0x60(%rsp), %r12
2875 mov 0x68(%rsp), %rbx
2876 mov 0x70(%rsp), %rax
2877 lea 0x78(%rsp), %rsp
2878 mov %rax, %rbp
2879.Lxts_dec_epilogue:
2880 ret
2881.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2882___
2883}
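# Hedged scalar model of the ciphertext stealing done by .Lxts_enc_steal /
# .Lxts_dec_steal above (tweak masking omitted for brevity; $cipher is an
# illustrative callback over 16-byte blocks): the partial final block
# borrows the tail of the last full ciphertext block and the mixed block
# is enciphered once more, so no padding is ever emitted.
sub xts_steal_model {
	my ($cipher, $last_full_ct, $partial) = @_;
	my $r = length($partial);		# 1..15 trailing bytes
	my $mixed = $partial . substr($last_full_ct, $r);
	return ($cipher->($mixed), substr($last_full_ct, 0, $r));
}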
2884$code.=<<___;
2885.type _bsaes_const,\@object
2886.align 64
2887_bsaes_const:
2888.LM0ISR: # InvShiftRows constants
2889 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2890.LISRM0:
2891 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2892.LISR:
2893 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2894.LBS0: # bit-slice constants
2895 .quad 0x5555555555555555, 0x5555555555555555
2896.LBS1:
2897 .quad 0x3333333333333333, 0x3333333333333333
2898.LBS2:
2899 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2900.LSR: # shiftrows constants
2901 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2902.LSRM0:
2903 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2904.LM0SR:
2905 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2906.LSWPUP: # byte-swap upper dword
2907 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2908.LSWPUPM0SR:
2909 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2910.LADD1: # counter increment constants
2911 .quad 0x0000000000000000, 0x0000000100000000
2912.LADD2:
2913 .quad 0x0000000000000000, 0x0000000200000000
2914.LADD3:
2915 .quad 0x0000000000000000, 0x0000000300000000
2916.LADD4:
2917 .quad 0x0000000000000000, 0x0000000400000000
2918.LADD5:
2919 .quad 0x0000000000000000, 0x0000000500000000
2920.LADD6:
2921 .quad 0x0000000000000000, 0x0000000600000000
2922.LADD7:
2923 .quad 0x0000000000000000, 0x0000000700000000
2924.LADD8:
2925 .quad 0x0000000000000000, 0x0000000800000000
2926.Lxts_magic:
2927 .long 0x87,0,1,0
2928.Lmasks:
2929 .quad 0x0101010101010101, 0x0101010101010101
2930 .quad 0x0202020202020202, 0x0202020202020202
2931 .quad 0x0404040404040404, 0x0404040404040404
2932 .quad 0x0808080808080808, 0x0808080808080808
2933.LM0:
2934 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2935.L63:
2936 .quad 0x6363636363636363, 0x6363636363636363
2937.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2938.align 64
2939.size _bsaes_const,.-_bsaes_const
2940___
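# Hedged helper for reading the shuffle constants above: each pshufb mask
# is stored as two little-endian .quad halves whose bytes are source-byte
# indices; unpacking them this way recovers, e.g., the (bit-slice-order)
# ShiftRows and InvShiftRows permutations. Assumes a 64-bit perl.
sub shuffle_indices {
	my ($lo, $hi) = @_;			# the two .quad halves, low first
	my @idx;
	for my $q ($lo, $hi) {
		push @idx, ($q >> (8 * $_)) & 0xff for 0 .. 7;
	}
	return @idx;
}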
2941
2942# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2943# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2944if ($win64) {
2945$rec="%rcx";
2946$frame="%rdx";
2947$context="%r8";
2948$disp="%r9";
2949
2950$code.=<<___;
2951.extern __imp_RtlVirtualUnwind
2952.type se_handler,\@abi-omnipotent
2953.align 16
2954se_handler:
2955 push %rsi
2956 push %rdi
2957 push %rbx
2958 push %rbp
2959 push %r12
2960 push %r13
2961 push %r14
2962 push %r15
2963 pushfq
2964 sub \$64,%rsp
2965
2966 mov 120($context),%rax # pull context->Rax
2967 mov 248($context),%rbx # pull context->Rip
2968
2969 mov 8($disp),%rsi # disp->ImageBase
2970 mov 56($disp),%r11 # disp->HandlerData
2971
2972 mov 0(%r11),%r10d # HandlerData[0]
2973 lea (%rsi,%r10),%r10 # prologue label
2974 cmp %r10,%rbx # context->Rip<prologue label
2975 jb .Lin_prologue
2976
2977 mov 152($context),%rax # pull context->Rsp
2978
2979 mov 4(%r11),%r10d # HandlerData[1]
2980 lea (%rsi,%r10),%r10 # epilogue label
2981 cmp %r10,%rbx # context->Rip>=epilogue label
2982 jae .Lin_prologue
2983
2984 mov 160($context),%rax # pull context->Rbp
2985
2986 lea 0x40(%rax),%rsi # %xmm save area
2987 lea 512($context),%rdi # &context.Xmm6
2988 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2989 .long 0xa548f3fc # cld; rep movsq
2990 lea 0xa0(%rax),%rax # adjust stack pointer
2991
2992 mov 0x70(%rax),%rbp
2993 mov 0x68(%rax),%rbx
2994 mov 0x60(%rax),%r12
2995 mov 0x58(%rax),%r13
2996 mov 0x50(%rax),%r14
2997 mov 0x48(%rax),%r15
2998 lea 0x78(%rax),%rax # adjust stack pointer
2999 mov %rbx,144($context) # restore context->Rbx
3000 mov %rbp,160($context) # restore context->Rbp
3001 mov %r12,216($context) # restore context->R12
3002 mov %r13,224($context) # restore context->R13
3003 mov %r14,232($context) # restore context->R14
3004 mov %r15,240($context) # restore context->R15
3005
3006.Lin_prologue:
3007 mov %rax,152($context) # restore context->Rsp
3008
3009 mov 40($disp),%rdi # disp->ContextRecord
3010 mov $context,%rsi # context
3011 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3012 .long 0xa548f3fc # cld; rep movsq
3013
3014 mov $disp,%rsi
3015 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3016 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3017 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3018 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3019 mov 40(%rsi),%r10 # disp->ContextRecord
3020 lea 56(%rsi),%r11 # &disp->HandlerData
3021 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3022 mov %r10,32(%rsp) # arg5
3023 mov %r11,40(%rsp) # arg6
3024 mov %r12,48(%rsp) # arg7
3025 mov %rcx,56(%rsp) # arg8, (NULL)
3026 call *__imp_RtlVirtualUnwind(%rip)
3027
3028 mov \$1,%eax # ExceptionContinueSearch
3029 add \$64,%rsp
3030 popfq
3031 pop %r15
3032 pop %r14
3033 pop %r13
3034 pop %r12
3035 pop %rbp
3036 pop %rbx
3037 pop %rdi
3038 pop %rsi
3039 ret
3040.size se_handler,.-se_handler
3041
3042.section .pdata
3043.align 4
3044___
3045$code.=<<___ if ($ecb);
3046 .rva .Lecb_enc_prologue
3047 .rva .Lecb_enc_epilogue
3048 .rva .Lecb_enc_info
3049
3050 .rva .Lecb_dec_prologue
3051 .rva .Lecb_dec_epilogue
3052 .rva .Lecb_dec_info
3053___
3054$code.=<<___;
3055 .rva .Lcbc_dec_prologue
3056 .rva .Lcbc_dec_epilogue
3057 .rva .Lcbc_dec_info
3058
3059 .rva .Lctr_enc_prologue
3060 .rva .Lctr_enc_epilogue
3061 .rva .Lctr_enc_info
3062
3063 .rva .Lxts_enc_prologue
3064 .rva .Lxts_enc_epilogue
3065 .rva .Lxts_enc_info
3066
3067 .rva .Lxts_dec_prologue
3068 .rva .Lxts_dec_epilogue
3069 .rva .Lxts_dec_info
3070
3071.section .xdata
3072.align 8
3073___
3074$code.=<<___ if ($ecb);
3075.Lecb_enc_info:
3076 .byte 9,0,0,0
3077 .rva se_handler
3078 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3079.Lecb_dec_info:
3080 .byte 9,0,0,0
3081 .rva se_handler
3082 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3083___
3084$code.=<<___;
3085.Lcbc_dec_info:
3086 .byte 9,0,0,0
3087 .rva se_handler
3088 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3089.Lctr_enc_info:
3090 .byte 9,0,0,0
3091 .rva se_handler
3092 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3093.Lxts_enc_info:
3094 .byte 9,0,0,0
3095 .rva se_handler
3096 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3097.Lxts_dec_info:
3098 .byte 9,0,0,0
3099 .rva se_handler
3100 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3101___
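# A minimal sketch of how the ".byte 9,0,0,0" UNWIND_INFO header used in
# the *_info entries above decodes (illustrative only; the helper below
# is never called and does not affect the emitted code):
sub _sketch_unwind_header {
	my $hdr = 9;			# first byte of each *_info entry
	my $version = $hdr & 7;		# low 3 bits: format version = 1
	my $flags   = $hdr >> 3;	# high 5 bits: 1 = UNW_FLAG_EHANDLER
	# The two zero bytes that follow mean no recorded prologue and no
	# unwind codes, so unwinding is delegated entirely to se_handler
	# through the HandlerData RVAs (body start, epilogue) that follow.
	return ($version, $flags);	# (1, 1)
}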
3102}
3103
3104$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3105
3106print $code;
3107
3108close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl
deleted file mode 100644
index 1533e2c304..0000000000
--- a/src/lib/libcrypto/aes/asm/vpaes-x86.pl
+++ /dev/null
@@ -1,903 +0,0 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
17# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (doesn't have to if called from
19# EVP only). "Drop-in" implies that this module doesn't share key
20# schedule structure with the original nor does it make assumption
21# about its alignment...
22#
23# Performance summary. aes-586.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with 128-bit key, while the vpaes-x86.pl column
26# lists [also large-block CBC] encrypt/decrypt results.
27#
28# aes-586.pl vpaes-x86.pl
29#
30# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
31# Nehalem 27.9/40.4/18.1 10.3/12.0
32# Atom 102./119./60.1 64.5/85.3(***)
33#
34# (*)	"Hyper-threading" in this context refers to cache shared among
35#	multiple cores rather than specifically to Intel HTT. As the
36#	vast majority of contemporary cores share cache, the slower
37#	code path is commonplace. In other words "with-hyper-threading-off"
38#	results are presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***)	The less impressive improvement on Core 2 and Atom is due
43#	to slow pshufb; it is still a respectable +32%/65% on Core 2
44#	and +58%/40% on Atom (as implied, over the "hyper-threading-safe"
45#	code path).
46#
47# <appro@openssl.org>
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50push(@INC,"${dir}","${dir}../../perlasm");
51require "x86asm.pl";
52
53&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
54
55$PREFIX="vpaes";
56
57my ($round, $base, $magic, $key, $const, $inp, $out)=
58 ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
59
60&static_label("_vpaes_consts");
61&static_label("_vpaes_schedule_low_round");
62
63&set_label("_vpaes_consts",64);
64$k_inv=-0x30; # inv, inva
65 &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
66 &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
67
68$k_s0F=-0x10; # s0F
69 &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
70
71$k_ipt=0x00; # input transform (lo, hi)
72 &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
73 &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
74
75$k_sb1=0x20; # sb1u, sb1t
76 &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
77 &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
78$k_sb2=0x40; # sb2u, sb2t
79 &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
80 &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
81$k_sbo=0x60; # sbou, sbot
82 &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
83 &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
84
85$k_mc_forward=0x80; # mc_forward
86 &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
87 &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
88 &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
89 &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
90
91$k_mc_backward=0xc0; # mc_backward
92 &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
93 &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
94 &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
95 &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
96
97$k_sr=0x100; # sr
98 &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
99 &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
100 &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
101 &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
102
103$k_rcon=0x140; # rcon
104 &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
105
106$k_s63=0x150; # s63: all equal to 0x63 transformed
107 &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
108
109$k_opt=0x160; # output transform
110 &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
111 &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
112
113$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
114 &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
115 &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
116##
117## Decryption stuff
118## Key schedule constants
119##
120$k_dksd=0x1a0; # decryption key schedule: invskew x*D
121 &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
122 &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
123$k_dksb=0x1c0; # decryption key schedule: invskew x*B
124 &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
125 &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
126$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
127 &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
128 &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
129$k_dks9=0x200; # decryption key schedule: invskew x*9
130 &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
131 &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
132
133##
134## Decryption stuff
135## Round function constants
136##
137$k_dipt=0x220; # decryption input transform
138 &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
139 &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
140
141$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
142 &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
143 &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
144$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
145 &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
146 &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
147$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
148 &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
149 &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
150$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
151 &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
152 &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
153$k_dsbo=0x2c0; # decryption sbox final output
154 &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
155 &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
156&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
157&align (64);
158
159&function_begin_B("_vpaes_preheat");
160 &add ($const,&DWP(0,"esp"));
161 &movdqa ("xmm7",&QWP($k_inv,$const));
162 &movdqa ("xmm6",&QWP($k_s0F,$const));
163 &ret ();
164&function_end_B("_vpaes_preheat");
165
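## _vpaes_preheat is entered with $const holding the link-time difference
## _vpaes_consts+0x30 - pic_point (see the callers below), so adding the
## return address found at (%esp), i.e. the address of the pic_point
## label, yields the runtime address of the constant pool. A minimal
## model of that trick (illustrative helper, never called):
sub _sketch_pic_const {
	my ($delta, $ret_addr) = @_;	# $const at entry, &pic_point
	return $delta + $ret_addr;	# == &_vpaes_consts + 0x30 at runtime
}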
166##
167## _aes_encrypt_core
168##
169## AES-encrypt %xmm0.
170##
171## Inputs:
172## %xmm0 = input
173## %xmm6-%xmm7 as in _vpaes_preheat
174## (%edx) = scheduled keys
175##
176## Output in %xmm0
177## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
178##
179##
180&function_begin_B("_vpaes_encrypt_core");
181 &mov ($magic,16);
182 &mov ($round,&DWP(240,$key));
183	&movdqa	("xmm1","xmm6");
184 &movdqa ("xmm2",&QWP($k_ipt,$const));
185 &pandn ("xmm1","xmm0");
186 &movdqu ("xmm5",&QWP(0,$key));
187 &psrld ("xmm1",4);
188 &pand ("xmm0","xmm6");
189 &pshufb ("xmm2","xmm0");
190 &movdqa ("xmm0",&QWP($k_ipt+16,$const));
191 &pshufb ("xmm0","xmm1");
192 &pxor ("xmm2","xmm5");
193 &pxor ("xmm0","xmm2");
194 &add ($key,16);
195 &lea ($base,&DWP($k_mc_backward,$const));
196 &jmp (&label("enc_entry"));
197
198
199&set_label("enc_loop",16);
200 # middle of middle round
201 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
202 &pshufb ("xmm4","xmm2"); # 4 = sb1u
203 &pxor ("xmm4","xmm5"); # 4 = sb1u + k
204 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
205 &pshufb ("xmm0","xmm3"); # 0 = sb1t
206 &pxor ("xmm0","xmm4"); # 0 = A
207 &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
208 &pshufb ("xmm5","xmm2"); # 4 = sb2u
209 &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
210 &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
211 &pshufb ("xmm2","xmm3"); # 2 = sb2t
212 &pxor ("xmm2","xmm5"); # 2 = 2A
213 &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
214 &movdqa ("xmm3","xmm0"); # 3 = A
215 &pshufb ("xmm0","xmm1"); # 0 = B
216 &add ($key,16); # next key
217 &pxor ("xmm0","xmm2"); # 0 = 2A+B
218 &pshufb ("xmm3","xmm4"); # 3 = D
219 &add ($magic,16); # next mc
220 &pxor ("xmm3","xmm0"); # 3 = 2A+B+D
221 &pshufb ("xmm0","xmm1"); # 0 = 2B+C
222 &and ($magic,0x30); # ... mod 4
223 &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
224 &sub ($round,1); # nr--
225
226&set_label("enc_entry");
227 # top of round
228 &movdqa ("xmm1","xmm6"); # 1 : i
229 &pandn ("xmm1","xmm0"); # 1 = i<<4
230 &psrld ("xmm1",4); # 1 = i
231 &pand ("xmm0","xmm6"); # 0 = k
232 &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
233 &pshufb ("xmm5","xmm0"); # 2 = a/k
234 &pxor ("xmm0","xmm1"); # 0 = j
235 &movdqa ("xmm3","xmm7"); # 3 : 1/i
236 &pshufb ("xmm3","xmm1"); # 3 = 1/i
237 &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
238 &movdqa ("xmm4","xmm7"); # 4 : 1/j
239 &pshufb ("xmm4","xmm0"); # 4 = 1/j
240 &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
241 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
242 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
243 &pxor ("xmm2","xmm0"); # 2 = io
244 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
245 &movdqu ("xmm5",&QWP(0,$key));
246 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
247 &pxor ("xmm3","xmm1"); # 3 = jo
248 &jnz (&label("enc_loop"));
249
250 # middle of last round
251 &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
252 &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
253 &pshufb ("xmm4","xmm2"); # 4 = sbou
254 &pxor ("xmm4","xmm5"); # 4 = sb1u + k
255 &pshufb ("xmm0","xmm3"); # 0 = sb1t
256 &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
257 &pxor ("xmm0","xmm4"); # 0 = A
258 &pshufb ("xmm0","xmm1");
259 &ret ();
260&function_end_B("_vpaes_encrypt_core");
261
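## The .Lenc_entry block above splits every byte into its nibbles with
## the pandn/psrld/pand idiom on the 0x0F mask in %xmm6 before the
## inverse-table pshufb lookups. Per byte it computes the following
## (a minimal model; the helper is illustrative and never called):
sub _sketch_nibble_split {
	my $x = shift;
	my $i = ($x & 0xF0) >> 4;	# &pandn("xmm1","xmm0") + &psrld("xmm1",4)
	my $k = $x & 0x0F;		# &pand("xmm0","xmm6")
	return ($i, $k);		# indices for the pshufb table lookups
}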
262##
263## Decryption core
264##
265## Same API as encryption core.
266##
267&function_begin_B("_vpaes_decrypt_core");
268 &mov ($round,&DWP(240,$key));
269 &lea ($base,&DWP($k_dsbd,$const));
270 &movdqa ("xmm1","xmm6");
271 &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
272 &pandn ("xmm1","xmm0");
273 &mov ($magic,$round);
274	&psrld	("xmm1",4);
275 &movdqu ("xmm5",&QWP(0,$key));
276 &shl ($magic,4);
277 &pand ("xmm0","xmm6");
278 &pshufb ("xmm2","xmm0");
279 &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
280 &xor ($magic,0x30);
281 &pshufb ("xmm0","xmm1");
282 &and ($magic,0x30);
283 &pxor ("xmm2","xmm5");
284 &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
285 &pxor ("xmm0","xmm2");
286 &add ($key,16);
287 &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
288 &jmp (&label("dec_entry"));
289
290&set_label("dec_loop",16);
291##
292## Inverse mix columns
293##
294 &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
295 &pshufb ("xmm4","xmm2"); # 4 = sb9u
296 &pxor ("xmm4","xmm0");
297 &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
298 &pshufb ("xmm0","xmm3"); # 0 = sb9t
299 &pxor ("xmm0","xmm4"); # 0 = ch
300 &add ($key,16); # next round key
301
302 &pshufb ("xmm0","xmm5"); # MC ch
303 &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
304 &pshufb ("xmm4","xmm2"); # 4 = sbdu
305 &pxor ("xmm4","xmm0"); # 4 = ch
306 &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
307 &pshufb ("xmm0","xmm3"); # 0 = sbdt
308 &pxor ("xmm0","xmm4"); # 0 = ch
309 &sub ($round,1); # nr--
310
311 &pshufb ("xmm0","xmm5"); # MC ch
312 &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
313 &pshufb ("xmm4","xmm2"); # 4 = sbbu
314 &pxor ("xmm4","xmm0"); # 4 = ch
315 &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
316 &pshufb ("xmm0","xmm3"); # 0 = sbbt
317 &pxor ("xmm0","xmm4"); # 0 = ch
318
319 &pshufb ("xmm0","xmm5"); # MC ch
320 &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
321 &pshufb ("xmm4","xmm2"); # 4 = sbeu
322 &pxor ("xmm4","xmm0"); # 4 = ch
323 &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
324 &pshufb ("xmm0","xmm3"); # 0 = sbet
325 &pxor ("xmm0","xmm4"); # 0 = ch
326
327 &palignr("xmm5","xmm5",12);
328
329&set_label("dec_entry");
330 # top of round
331 &movdqa ("xmm1","xmm6"); # 1 : i
332 &pandn ("xmm1","xmm0"); # 1 = i<<4
333 &psrld ("xmm1",4); # 1 = i
334 &pand ("xmm0","xmm6"); # 0 = k
335 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
336 &pshufb ("xmm2","xmm0"); # 2 = a/k
337 &pxor ("xmm0","xmm1"); # 0 = j
338 &movdqa ("xmm3","xmm7"); # 3 : 1/i
339 &pshufb ("xmm3","xmm1"); # 3 = 1/i
340 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
341 &movdqa ("xmm4","xmm7"); # 4 : 1/j
342 &pshufb ("xmm4","xmm0"); # 4 = 1/j
343 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
344 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
345 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
346 &pxor ("xmm2","xmm0"); # 2 = io
347 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
348 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
349 &pxor ("xmm3","xmm1"); # 3 = jo
350 &movdqu ("xmm0",&QWP(0,$key));
351 &jnz (&label("dec_loop"));
352
353 # middle of last round
354 &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
355 &pshufb ("xmm4","xmm2"); # 4 = sbou
356 &pxor ("xmm4","xmm0"); # 4 = sb1u + k
357 &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
358 &movdqa ("xmm2",&QWP(0,$magic));
359 &pshufb ("xmm0","xmm3"); # 0 = sb1t
360 &pxor ("xmm0","xmm4"); # 0 = A
361 &pshufb ("xmm0","xmm2");
362 &ret ();
363&function_end_B("_vpaes_decrypt_core");
364
365########################################################
366## ##
367## AES key schedule ##
368## ##
369########################################################
370&function_begin_B("_vpaes_schedule_core");
371 &add ($const,&DWP(0,"esp"));
372 &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
373 &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
374
375 # input transform
376 &movdqa ("xmm3","xmm0");
377 &lea ($base,&DWP($k_ipt,$const));
378 &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
379 &call ("_vpaes_schedule_transform");
380 &movdqa ("xmm7","xmm0");
381
382 &test ($out,$out);
383 &jnz (&label("schedule_am_decrypting"));
384
385 # encrypting, output zeroth round key after transform
386 &movdqu (&QWP(0,$key),"xmm0");
387 &jmp (&label("schedule_go"));
388
389&set_label("schedule_am_decrypting");
390 # decrypting, output zeroth round key after shiftrows
391 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
392 &pshufb ("xmm3","xmm1");
393 &movdqu (&QWP(0,$key),"xmm3");
394 &xor ($magic,0x30);
395
396&set_label("schedule_go");
397 &cmp ($round,192);
398 &ja (&label("schedule_256"));
399 &je (&label("schedule_192"));
400	# 128: fall through
401
402##
403## .schedule_128
404##
405## 128-bit specific part of key schedule.
406##
407## This schedule is really simple, because all its parts
408## are accomplished by the subroutines.
409##
410&set_label("schedule_128");
411 &mov ($round,10);
412
413&set_label("loop_schedule_128");
414 &call ("_vpaes_schedule_round");
415 &dec ($round);
416 &jz (&label("schedule_mangle_last"));
417 &call ("_vpaes_schedule_mangle"); # write output
418 &jmp (&label("loop_schedule_128"));
419
420##
421## .aes_schedule_192
422##
423## 192-bit specific part of key schedule.
424##
425## The main body of this schedule is the same as the 128-bit
426## schedule, but with more smearing. The long, high side is
427## stored in %xmm7 as before, and the short, low side is in
428## the high bits of %xmm6.
429##
430## This schedule is somewhat nastier, however, because each
431## round produces 192 bits of key material, or 1.5 round keys.
432## Therefore, on each cycle we do 2 rounds and produce 3 round
433## keys.
434##
435&set_label("schedule_192",16);
436 &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
437 &call ("_vpaes_schedule_transform"); # input transform
438 &movdqa ("xmm6","xmm0"); # save short part
439 &pxor ("xmm4","xmm4"); # clear 4
440 &movhlps("xmm6","xmm4"); # clobber low side with zeros
441 &mov ($round,4);
442
443&set_label("loop_schedule_192");
444 &call ("_vpaes_schedule_round");
445 &palignr("xmm0","xmm6",8);
446 &call ("_vpaes_schedule_mangle"); # save key n
447 &call ("_vpaes_schedule_192_smear");
448 &call ("_vpaes_schedule_mangle"); # save key n+1
449 &call ("_vpaes_schedule_round");
450 &dec ($round);
451 &jz (&label("schedule_mangle_last"));
452 &call ("_vpaes_schedule_mangle"); # save key n+2
453 &call ("_vpaes_schedule_192_smear");
454 &jmp (&label("loop_schedule_192"));
455
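## Bookkeeping for the loop above: every pass runs two schedule rounds
## and writes three round keys (the last pass writes its third via
## schedule_mangle_last), so four passes plus the transformed input key
## produce the 1 + 4*3 = 13 round keys AES-192 needs (Nr = 12). As a
## one-line check (illustrative helper, never called):
sub _sketch_192_key_count {
	my ($passes, $keys_per_pass) = (4, 3);
	return 1 + $passes * $keys_per_pass;	# 13
}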
456##
457## .aes_schedule_256
458##
459## 256-bit specific part of key schedule.
460##
461## The structure here is very similar to the 128-bit
462## schedule, but with an additional "low side" in
463## %xmm6. The low side's rounds are the same as the
464## high side's, except no rcon and no rotation.
465##
466&set_label("schedule_256",16);
467 &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
468 &call ("_vpaes_schedule_transform"); # input transform
469 &mov ($round,7);
470
471&set_label("loop_schedule_256");
472 &call ("_vpaes_schedule_mangle"); # output low result
473 &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
474
475 # high round
476 &call ("_vpaes_schedule_round");
477 &dec ($round);
478 &jz (&label("schedule_mangle_last"));
479 &call ("_vpaes_schedule_mangle");
480
481 # low round. swap xmm7 and xmm6
482 &pshufd ("xmm0","xmm0",0xFF);
483 &movdqa (&QWP(20,"esp"),"xmm7");
484 &movdqa ("xmm7","xmm6");
485 &call ("_vpaes_schedule_low_round");
486 &movdqa ("xmm7",&QWP(20,"esp"));
487
488 &jmp (&label("loop_schedule_256"));
489
490##
491## .aes_schedule_mangle_last
492##
493## Mangler for last round of key schedule
494## Mangles %xmm0
495## when encrypting, outputs out(%xmm0) ^ 0x63
496## when decrypting, outputs unskew(%xmm0)
497##
498## Always called right before return... jumps to cleanup and exits
499##
500&set_label("schedule_mangle_last",16);
501 # schedule last round key from xmm0
502 &lea ($base,&DWP($k_deskew,$const));
503 &test ($out,$out);
504 &jnz (&label("schedule_mangle_last_dec"));
505
506 # encrypting
507 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
508 &pshufb ("xmm0","xmm1"); # output permute
509 &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
510 &add ($key,32);
511
512&set_label("schedule_mangle_last_dec");
513 &add ($key,-16);
514 &pxor ("xmm0",&QWP($k_s63,$const));
515 &call ("_vpaes_schedule_transform"); # output transform
516 &movdqu (&QWP(0,$key),"xmm0"); # save last key
517
518 # cleanup
519 &pxor ("xmm0","xmm0");
520 &pxor ("xmm1","xmm1");
521 &pxor ("xmm2","xmm2");
522 &pxor ("xmm3","xmm3");
523 &pxor ("xmm4","xmm4");
524 &pxor ("xmm5","xmm5");
525 &pxor ("xmm6","xmm6");
526 &pxor ("xmm7","xmm7");
527 &ret ();
528&function_end_B("_vpaes_schedule_core");
529
530##
531## .aes_schedule_192_smear
532##
533## Smear the short, low side in the 192-bit key schedule.
534##
535## Inputs:
536## %xmm7: high side, b a x y
537## %xmm6: low side, d c 0 0
538## (%xmm1 is clobbered, used as a zero scratch)
539##
540## Outputs:
541## %xmm6: b+c+d b+c 0 0
542## %xmm0: b+c+d b+c b a
543##
544&function_begin_B("_vpaes_schedule_192_smear");
545 &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
546 &pxor ("xmm6","xmm0"); # -> c+d c 0 0
547 &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
548 &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
549 &movdqa ("xmm0","xmm6");
550 &pxor ("xmm1","xmm1");
551 &movhlps("xmm6","xmm1"); # clobber low side with zeros
552 &ret ();
553&function_end_B("_vpaes_schedule_192_smear");
554
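## A dword-level model of the smear above, "+" being XOR and dwords
## listed high to low as in the comments (illustrative helper with
## sample values, never called):
sub _sketch_192_smear {
	my ($b, $a) = (0xbb, 0xaa);		# high dwords of %xmm7 = b a x y
	my ($d, $c) = (0xdd, 0xcc);		# high dwords of %xmm6 = d c 0 0
	my @x6 = ($d ^ $c, $c, 0, 0);		# pshufd 0x80 + pxor: c+d c 0 0
	my @sh = ($b, $b, $b, $a);		# pshufd 0xFE of %xmm7: b b b a
	$x6[$_] ^= $sh[$_] for 0 .. 3;		# pxor: b+c+d b+c b a
	my @x0 = @x6;				# movdqa into %xmm0
	@x6[2, 3] = (0, 0);			# movhlps: zero the low qword
	return (\@x0, \@x6);	# %xmm0 = b+c+d b+c b a, %xmm6 = b+c+d b+c 0 0
}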
555##
556## .aes_schedule_round
557##
558## Runs one main round of the key schedule on %xmm0, %xmm7
559##
560## Specifically, runs subbytes on the high dword of %xmm0
561## then rotates it by one byte and xors into the low dword of
562## %xmm7.
563##
564## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
565## next rcon.
566##
567## Smears the dwords of %xmm7 by xoring the low into the
568## second low, result into third, result into highest.
569##
570## Returns results in %xmm7 = %xmm0.
571## Clobbers %xmm1-%xmm5.
572##
573&function_begin_B("_vpaes_schedule_round");
574 # extract rcon from xmm8
575 &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
576 &pxor ("xmm1","xmm1");
577 &palignr("xmm1","xmm2",15);
578 &palignr("xmm2","xmm2",15);
579 &pxor ("xmm7","xmm1");
580
581 # rotate
582 &pshufd ("xmm0","xmm0",0xFF);
583 &palignr("xmm0","xmm0",1);
584
585 # fall through...
586 &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
587
588 # low round: same as high round, but no rotation and no rcon.
589&set_label("_vpaes_schedule_low_round");
590 # smear xmm7
591 &movdqa ("xmm1","xmm7");
592 &pslldq ("xmm7",4);
593 &pxor ("xmm7","xmm1");
594 &movdqa ("xmm1","xmm7");
595 &pslldq ("xmm7",8);
596 &pxor ("xmm7","xmm1");
597 &pxor ("xmm7",&QWP($k_s63,$const));
598
599 # subbyte
600 &movdqa ("xmm4",&QWP($k_s0F,$const));
601 &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
602 &movdqa ("xmm1","xmm4");
603 &pandn ("xmm1","xmm0");
604 &psrld ("xmm1",4); # 1 = i
605 &pand ("xmm0","xmm4"); # 0 = k
606 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
607 &pshufb ("xmm2","xmm0"); # 2 = a/k
608 &pxor ("xmm0","xmm1"); # 0 = j
609 &movdqa ("xmm3","xmm5"); # 3 : 1/i
610 &pshufb ("xmm3","xmm1"); # 3 = 1/i
611 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
612 &movdqa ("xmm4","xmm5"); # 4 : 1/j
613 &pshufb ("xmm4","xmm0"); # 4 = 1/j
614 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
615 &movdqa ("xmm2","xmm5"); # 2 : 1/iak
616 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
617 &pxor ("xmm2","xmm0"); # 2 = io
618 &movdqa ("xmm3","xmm5"); # 3 : 1/jak
619 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
620 &pxor ("xmm3","xmm1"); # 3 = jo
621 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
622 &pshufb ("xmm4","xmm2"); # 4 = sbou
623 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
624 &pshufb ("xmm0","xmm3"); # 0 = sb1t
625 &pxor ("xmm0","xmm4"); # 0 = sbox output
626
627 # add in smeared stuff
628 &pxor ("xmm0","xmm7");
629 &movdqa ("xmm7","xmm0");
630 &ret ();
631&function_end_B("_vpaes_schedule_round");
632
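## The pslldq/pxor pairs above compute the running XOR of the standard
## key expansion: after "t ^= t<<32; t ^= t<<64", dword i of %xmm7 holds
## d0^...^di. Dword-level model (illustrative helper, never called):
sub _sketch_smear_xmm7 {
	my @d = @_;			# (d0, d1, d2, d3), low dword first
	my ($acc, @out) = (0);
	push @out, $acc ^= $_ for @d;	# prefix XOR across the dwords
	return @out;
}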
633##
634## .aes_schedule_transform
635##
636## Linear-transform %xmm0 according to tables at (%ebx)
637##
638## Output in %xmm0
639## Clobbers %xmm1, %xmm2
640##
641&function_begin_B("_vpaes_schedule_transform");
642 &movdqa ("xmm2",&QWP($k_s0F,$const));
643 &movdqa ("xmm1","xmm2");
644 &pandn ("xmm1","xmm0");
645 &psrld ("xmm1",4);
646 &pand ("xmm0","xmm2");
647 &movdqa ("xmm2",&QWP(0,$base));
648 &pshufb ("xmm2","xmm0");
649 &movdqa ("xmm0",&QWP(16,$base));
650 &pshufb ("xmm0","xmm1");
651 &pxor ("xmm0","xmm2");
652 &ret ();
653&function_end_B("_vpaes_schedule_transform");
654
655##
656## .aes_schedule_mangle
657##
658## Mangle xmm0 from (basis-transformed) standard version
659## to our version.
660##
661## On encrypt,
662## xor with 0x63
663## multiply by circulant 0,1,1,1
664## apply shiftrows transform
665##
666## On decrypt,
667## xor with 0x63
668## multiply by "inverse mixcolumns" circulant E,B,D,9
669## deskew
670## apply shiftrows transform
671##
672##
673## Writes out to (%edx), and increments or decrements it
674## Keeps track of round number mod 4 in %ecx
675## Preserves xmm0
676## Clobbers xmm1-xmm5
677##
678&function_begin_B("_vpaes_schedule_mangle");
679 &movdqa ("xmm4","xmm0"); # save xmm0 for later
680 &movdqa ("xmm5",&QWP($k_mc_forward,$const));
681 &test ($out,$out);
682 &jnz (&label("schedule_mangle_dec"));
683
684 # encrypting
685 &add ($key,16);
686 &pxor ("xmm4",&QWP($k_s63,$const));
687 &pshufb ("xmm4","xmm5");
688 &movdqa ("xmm3","xmm4");
689 &pshufb ("xmm4","xmm5");
690 &pxor ("xmm3","xmm4");
691 &pshufb ("xmm4","xmm5");
692 &pxor ("xmm3","xmm4");
693
694 &jmp (&label("schedule_mangle_both"));
695
696&set_label("schedule_mangle_dec",16);
697 # inverse mix columns
698 &movdqa ("xmm2",&QWP($k_s0F,$const));
699 &lea ($inp,&DWP($k_dksd,$const));
700 &movdqa ("xmm1","xmm2");
701 &pandn ("xmm1","xmm4");
702 &psrld ("xmm1",4); # 1 = hi
703 &pand ("xmm4","xmm2"); # 4 = lo
704
705 &movdqa ("xmm2",&QWP(0,$inp));
706 &pshufb ("xmm2","xmm4");
707 &movdqa ("xmm3",&QWP(0x10,$inp));
708 &pshufb ("xmm3","xmm1");
709 &pxor ("xmm3","xmm2");
710 &pshufb ("xmm3","xmm5");
711
712 &movdqa ("xmm2",&QWP(0x20,$inp));
713 &pshufb ("xmm2","xmm4");
714 &pxor ("xmm2","xmm3");
715 &movdqa ("xmm3",&QWP(0x30,$inp));
716 &pshufb ("xmm3","xmm1");
717 &pxor ("xmm3","xmm2");
718 &pshufb ("xmm3","xmm5");
719
720 &movdqa ("xmm2",&QWP(0x40,$inp));
721 &pshufb ("xmm2","xmm4");
722 &pxor ("xmm2","xmm3");
723 &movdqa ("xmm3",&QWP(0x50,$inp));
724 &pshufb ("xmm3","xmm1");
725 &pxor ("xmm3","xmm2");
726 &pshufb ("xmm3","xmm5");
727
728 &movdqa ("xmm2",&QWP(0x60,$inp));
729 &pshufb ("xmm2","xmm4");
730 &pxor ("xmm2","xmm3");
731 &movdqa ("xmm3",&QWP(0x70,$inp));
732 &pshufb ("xmm3","xmm1");
733 &pxor ("xmm3","xmm2");
734
735 &add ($key,-16);
736
737&set_label("schedule_mangle_both");
738 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
739 &pshufb ("xmm3","xmm1");
740 &add ($magic,-16);
741 &and ($magic,0x30);
742 &movdqu (&QWP(0,$key),"xmm3");
743 &ret ();
744&function_end_B("_vpaes_schedule_mangle");
745
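## On the encrypt path above, the three pshufb's with the k_mc_forward
## constant each rotate every 4-byte column by one byte, and the pxor's
## sum the rotations: multiplication by the circulant 0,1,1,1. Per
## column (illustrative helper, never called):
sub _sketch_mangle_circulant {
	my @col = @_;			# one 4-byte column of the round key
	my $rot = sub { my @c = @{$_[0]}; [ @c[1 .. 3], $c[0] ] };
	my $r1 = $rot->(\@col);		# first pshufb
	my $r2 = $rot->($r1);		# second
	my $r3 = $rot->($r2);		# third
	return map { $r1->[$_] ^ $r2->[$_] ^ $r3->[$_] } 0 .. 3;
}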
746#
747# Interface to OpenSSL
748#
749&function_begin("${PREFIX}_set_encrypt_key");
750 &mov ($inp,&wparam(0)); # inp
751 &lea ($base,&DWP(-56,"esp"));
752 &mov ($round,&wparam(1)); # bits
753 &and ($base,-16);
754 &mov ($key,&wparam(2)); # key
755 &xchg ($base,"esp"); # alloca
756 &mov (&DWP(48,"esp"),$base);
757
758 &mov ($base,$round);
759 &shr ($base,5);
760 &add ($base,5);
761 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
762 &mov ($magic,0x30);
763 &mov ($out,0);
764
765 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
766 &call ("_vpaes_schedule_core");
767&set_label("pic_point");
768
769 &mov ("esp",&DWP(48,"esp"));
770 &xor ("eax","eax");
771&function_end("${PREFIX}_set_encrypt_key");
772
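## The rounds value stored at 240($key) above is nbits/32+5, i.e. 9/11/13
## rather than the textbook 10/12/14, because the cores loop that many
## middle rounds and perform the final round separately. A standalone
## check (illustrative helper, never called):
sub _sketch_rounds {
	return map { ($_ >> 5) + 5 } (128, 192, 256);	# (9, 11, 13)
}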
773&function_begin("${PREFIX}_set_decrypt_key");
774 &mov ($inp,&wparam(0)); # inp
775 &lea ($base,&DWP(-56,"esp"));
776 &mov ($round,&wparam(1)); # bits
777 &and ($base,-16);
778 &mov ($key,&wparam(2)); # key
779 &xchg ($base,"esp"); # alloca
780 &mov (&DWP(48,"esp"),$base);
781
782 &mov ($base,$round);
783 &shr ($base,5);
784 &add ($base,5);
785 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
786 &shl ($base,4);
787 &lea ($key,&DWP(16,$key,$base));
788
789 &mov ($out,1);
790 &mov ($magic,$round);
791 &shr ($magic,1);
792 &and ($magic,32);
793	&xor	($magic,32);	# nbits==192?0:32
794
795 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
796 &call ("_vpaes_schedule_core");
797&set_label("pic_point");
798
799 &mov ("esp",&DWP(48,"esp"));
800 &xor ("eax","eax");
801&function_end("${PREFIX}_set_decrypt_key");
802
803&function_begin("${PREFIX}_encrypt");
804 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
805 &call ("_vpaes_preheat");
806&set_label("pic_point");
807 &mov ($inp,&wparam(0)); # inp
808 &lea ($base,&DWP(-56,"esp"));
809 &mov ($out,&wparam(1)); # out
810 &and ($base,-16);
811 &mov ($key,&wparam(2)); # key
812 &xchg ($base,"esp"); # alloca
813 &mov (&DWP(48,"esp"),$base);
814
815 &movdqu ("xmm0",&QWP(0,$inp));
816 &call ("_vpaes_encrypt_core");
817 &movdqu (&QWP(0,$out),"xmm0");
818
819 &mov ("esp",&DWP(48,"esp"));
820&function_end("${PREFIX}_encrypt");
821
822&function_begin("${PREFIX}_decrypt");
823 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
824 &call ("_vpaes_preheat");
825&set_label("pic_point");
826 &mov ($inp,&wparam(0)); # inp
827 &lea ($base,&DWP(-56,"esp"));
828 &mov ($out,&wparam(1)); # out
829 &and ($base,-16);
830 &mov ($key,&wparam(2)); # key
831 &xchg ($base,"esp"); # alloca
832 &mov (&DWP(48,"esp"),$base);
833
834 &movdqu ("xmm0",&QWP(0,$inp));
835 &call ("_vpaes_decrypt_core");
836 &movdqu (&QWP(0,$out),"xmm0");
837
838 &mov ("esp",&DWP(48,"esp"));
839&function_end("${PREFIX}_decrypt");
840
841&function_begin("${PREFIX}_cbc_encrypt");
842 &mov ($inp,&wparam(0)); # inp
843 &mov ($out,&wparam(1)); # out
844 &mov ($round,&wparam(2)); # len
845 &mov ($key,&wparam(3)); # key
846 &sub ($round,16);
847 &jc (&label("cbc_abort"));
848 &lea ($base,&DWP(-56,"esp"));
849 &mov ($const,&wparam(4)); # ivp
850 &and ($base,-16);
851 &mov ($magic,&wparam(5)); # enc
852 &xchg ($base,"esp"); # alloca
853 &movdqu ("xmm1",&QWP(0,$const)); # load IV
854 &sub ($out,$inp);
855 &mov (&DWP(48,"esp"),$base);
856
857 &mov (&DWP(0,"esp"),$out); # save out
858	&mov	(&DWP(4,"esp"),$key);	# save key
859 &mov (&DWP(8,"esp"),$const); # save ivp
860 &mov ($out,$round); # $out works as $len
861
862 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
863 &call ("_vpaes_preheat");
864&set_label("pic_point");
865 &cmp ($magic,0);
866 &je (&label("cbc_dec_loop"));
867 &jmp (&label("cbc_enc_loop"));
868
869&set_label("cbc_enc_loop",16);
870 &movdqu ("xmm0",&QWP(0,$inp)); # load input
871 &pxor ("xmm0","xmm1"); # inp^=iv
872 &call ("_vpaes_encrypt_core");
873 &mov ($base,&DWP(0,"esp")); # restore out
874 &mov ($key,&DWP(4,"esp")); # restore key
875 &movdqa ("xmm1","xmm0");
876 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
877 &lea ($inp,&DWP(16,$inp));
878 &sub ($out,16);
879 &jnc (&label("cbc_enc_loop"));
880 &jmp (&label("cbc_done"));
881
882&set_label("cbc_dec_loop",16);
883 &movdqu ("xmm0",&QWP(0,$inp)); # load input
884 &movdqa (&QWP(16,"esp"),"xmm1"); # save IV
885 &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
886 &call ("_vpaes_decrypt_core");
887 &mov ($base,&DWP(0,"esp")); # restore out
888 &mov ($key,&DWP(4,"esp")); # restore key
889 &pxor ("xmm0",&QWP(16,"esp")); # out^=iv
890 &movdqa ("xmm1",&QWP(32,"esp")); # load next IV
891 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
892 &lea ($inp,&DWP(16,$inp));
893 &sub ($out,16);
894 &jnc (&label("cbc_dec_loop"));
895
896&set_label("cbc_done");
897 &mov ($base,&DWP(8,"esp")); # restore ivp
898 &mov ("esp",&DWP(48,"esp"));
899 &movdqu (&QWP(0,$base),"xmm1"); # write IV
900&set_label("cbc_abort");
901&function_end("${PREFIX}_cbc_encrypt");
902
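## Both loops above implement the textbook CBC recurrences on whole
## 16-byte blocks only (the early "jc cbc_abort" rejects short input).
## A string-level model, "^" being Perl's bitwise string XOR and $cipher
## a caller-supplied 16-byte block permutation (illustrative helper,
## never called):
sub _sketch_cbc {
	my ($cipher, $enc, $iv, @blocks) = @_;
	my @out;
	for my $blk (@blocks) {
		if ($enc) {
			$iv = $cipher->($blk ^ $iv);		# cbc_enc_loop
			push @out, $iv;
		} else {
			push @out, $cipher->($blk) ^ $iv;	# cbc_dec_loop
			$iv = $blk;				# saved future IV
		}
	}
	return ($iv, @out);	# the final IV is written back through ivp
}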
903&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
deleted file mode 100644
index bd7f45b850..0000000000
--- a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
+++ /dev/null
@@ -1,1207 +0,0 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Interface to OpenSSL as "almost" drop-in replacement for
17# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (doesn't have to if called from
19# EVP only). "Drop-in" implies that this module doesn't share key
20# schedule structure with the original, nor does it make assumptions
21# about its alignment...
22#
23# Performance summary. aes-x86_64.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with 128-bit key, while the vpaes-x86_64.pl column
26# lists [also large-block CBC] encrypt/decrypt results.
27#
28# aes-x86_64.pl vpaes-x86_64.pl
29#
30# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
31# Nehalem 30.5/42.2/14.6 9.8/11.8
32# Atom 63.9/79.0/32.1 64.0/84.8(***)
33#
34# (*)	"Hyper-threading" in this context refers to cache shared among
35#	multiple cores rather than specifically to Intel HTT. As the
36#	vast majority of contemporary cores share cache, the slower
37#	code path is commonplace. In other words "with-hyper-threading-off"
38#	results are presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***)	The less impressive improvement on Core 2 and Atom is due
43#	to slow pshufb; it is still a respectable +40%/78% on Core 2
44#	(as implied, over the "hyper-threading-safe" code path).
45#
46# <appro@openssl.org>
47
48$flavour = shift;
49$output = shift;
50if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
51
52$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53
54$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57die "can't locate x86_64-xlate.pl";
58
59open OUT,"| \"$^X\" $xlate $flavour $output";
60*STDOUT=*OUT;
61
62$PREFIX="vpaes";
63
64$code.=<<___;
65.text
66
67##
68## _aes_encrypt_core
69##
70## AES-encrypt %xmm0.
71##
72## Inputs:
73## %xmm0 = input
74## %xmm9-%xmm15 as in _vpaes_preheat
75## (%rdx) = scheduled keys
76##
77## Output in %xmm0
78## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
79## Preserves %xmm6 - %xmm8 so you get some local vectors
80##
81##
82.type _vpaes_encrypt_core,\@abi-omnipotent
83.align 16
84_vpaes_encrypt_core:
85 mov %rdx, %r9
86 mov \$16, %r11
87 mov 240(%rdx),%eax
88 movdqa %xmm9, %xmm1
89 movdqa .Lk_ipt(%rip), %xmm2 # iptlo
90 pandn %xmm0, %xmm1
91 movdqu (%r9), %xmm5 # round0 key
92 psrld \$4, %xmm1
93 pand %xmm9, %xmm0
94 pshufb %xmm0, %xmm2
95 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
96 pshufb %xmm1, %xmm0
97 pxor %xmm5, %xmm2
98 pxor %xmm2, %xmm0
99 add \$16, %r9
100 lea .Lk_mc_backward(%rip),%r10
101 jmp .Lenc_entry
102
103.align 16
104.Lenc_loop:
105 # middle of middle round
106 movdqa %xmm13, %xmm4 # 4 : sb1u
107 pshufb %xmm2, %xmm4 # 4 = sb1u
108 pxor %xmm5, %xmm4 # 4 = sb1u + k
109 movdqa %xmm12, %xmm0 # 0 : sb1t
110 pshufb %xmm3, %xmm0 # 0 = sb1t
111 pxor %xmm4, %xmm0 # 0 = A
112 movdqa %xmm15, %xmm5 # 4 : sb2u
113 pshufb %xmm2, %xmm5 # 4 = sb2u
114 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
115 movdqa %xmm14, %xmm2 # 2 : sb2t
116 pshufb %xmm3, %xmm2 # 2 = sb2t
117 pxor %xmm5, %xmm2 # 2 = 2A
118 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
119 movdqa %xmm0, %xmm3 # 3 = A
120 pshufb %xmm1, %xmm0 # 0 = B
121 add \$16, %r9 # next key
122 pxor %xmm2, %xmm0 # 0 = 2A+B
123 pshufb %xmm4, %xmm3 # 3 = D
124 add \$16, %r11 # next mc
125 pxor %xmm0, %xmm3 # 3 = 2A+B+D
126 pshufb %xmm1, %xmm0 # 0 = 2B+C
127 and \$0x30, %r11 # ... mod 4
128 pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
129 sub \$1,%rax # nr--
130
131.Lenc_entry:
132 # top of round
133 movdqa %xmm9, %xmm1 # 1 : i
134 pandn %xmm0, %xmm1 # 1 = i<<4
135 psrld \$4, %xmm1 # 1 = i
136 pand %xmm9, %xmm0 # 0 = k
137 movdqa %xmm11, %xmm5 # 2 : a/k
138 pshufb %xmm0, %xmm5 # 2 = a/k
139 pxor %xmm1, %xmm0 # 0 = j
140 movdqa %xmm10, %xmm3 # 3 : 1/i
141 pshufb %xmm1, %xmm3 # 3 = 1/i
142 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
143 movdqa %xmm10, %xmm4 # 4 : 1/j
144 pshufb %xmm0, %xmm4 # 4 = 1/j
145 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
146 movdqa %xmm10, %xmm2 # 2 : 1/iak
147 pshufb %xmm3, %xmm2 # 2 = 1/iak
148 pxor %xmm0, %xmm2 # 2 = io
149 movdqa %xmm10, %xmm3 # 3 : 1/jak
150 movdqu (%r9), %xmm5
151 pshufb %xmm4, %xmm3 # 3 = 1/jak
152 pxor %xmm1, %xmm3 # 3 = jo
153 jnz .Lenc_loop
154
155 # middle of last round
156 movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
157 movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
158 pshufb %xmm2, %xmm4 # 4 = sbou
159 pxor %xmm5, %xmm4 # 4 = sb1u + k
160 pshufb %xmm3, %xmm0 # 0 = sb1t
161 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
162 pxor %xmm4, %xmm0 # 0 = A
163 pshufb %xmm1, %xmm0
164 ret
165.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
166
167##
168## Decryption core
169##
170## Same API as encryption core.
171##
172.type _vpaes_decrypt_core,\@abi-omnipotent
173.align 16
174_vpaes_decrypt_core:
175 mov %rdx, %r9 # load key
176 mov 240(%rdx),%eax
177 movdqa %xmm9, %xmm1
178 movdqa .Lk_dipt(%rip), %xmm2 # iptlo
179 pandn %xmm0, %xmm1
180 mov %rax, %r11
181 psrld \$4, %xmm1
182 movdqu (%r9), %xmm5 # round0 key
183 shl \$4, %r11
184 pand %xmm9, %xmm0
185 pshufb %xmm0, %xmm2
186 movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
187 xor \$0x30, %r11
188 lea .Lk_dsbd(%rip),%r10
189 pshufb %xmm1, %xmm0
190 and \$0x30, %r11
191 pxor %xmm5, %xmm2
192 movdqa .Lk_mc_forward+48(%rip), %xmm5
193 pxor %xmm2, %xmm0
194 add \$16, %r9
195 add %r10, %r11
196 jmp .Ldec_entry
197
198.align 16
199.Ldec_loop:
200##
201## Inverse mix columns
202##
203 movdqa -0x20(%r10),%xmm4 # 4 : sb9u
204 pshufb %xmm2, %xmm4 # 4 = sb9u
205 pxor %xmm0, %xmm4
206 movdqa -0x10(%r10),%xmm0 # 0 : sb9t
207 pshufb %xmm3, %xmm0 # 0 = sb9t
208 pxor %xmm4, %xmm0 # 0 = ch
209 add \$16, %r9 # next round key
210
211 pshufb %xmm5, %xmm0 # MC ch
212 movdqa 0x00(%r10),%xmm4 # 4 : sbdu
213 pshufb %xmm2, %xmm4 # 4 = sbdu
214 pxor %xmm0, %xmm4 # 4 = ch
215 movdqa 0x10(%r10),%xmm0 # 0 : sbdt
216 pshufb %xmm3, %xmm0 # 0 = sbdt
217 pxor %xmm4, %xmm0 # 0 = ch
218 sub \$1,%rax # nr--
219
220 pshufb %xmm5, %xmm0 # MC ch
221 movdqa 0x20(%r10),%xmm4 # 4 : sbbu
222 pshufb %xmm2, %xmm4 # 4 = sbbu
223 pxor %xmm0, %xmm4 # 4 = ch
224 movdqa 0x30(%r10),%xmm0 # 0 : sbbt
225 pshufb %xmm3, %xmm0 # 0 = sbbt
226 pxor %xmm4, %xmm0 # 0 = ch
227
228 pshufb %xmm5, %xmm0 # MC ch
229 movdqa 0x40(%r10),%xmm4 # 4 : sbeu
230 pshufb %xmm2, %xmm4 # 4 = sbeu
231 pxor %xmm0, %xmm4 # 4 = ch
232 movdqa 0x50(%r10),%xmm0 # 0 : sbet
233 pshufb %xmm3, %xmm0 # 0 = sbet
234 pxor %xmm4, %xmm0 # 0 = ch
235
236 palignr \$12, %xmm5, %xmm5
237
238.Ldec_entry:
239 # top of round
240 movdqa %xmm9, %xmm1 # 1 : i
241 pandn %xmm0, %xmm1 # 1 = i<<4
242 psrld \$4, %xmm1 # 1 = i
243 pand %xmm9, %xmm0 # 0 = k
244 movdqa %xmm11, %xmm2 # 2 : a/k
245 pshufb %xmm0, %xmm2 # 2 = a/k
246 pxor %xmm1, %xmm0 # 0 = j
247 movdqa %xmm10, %xmm3 # 3 : 1/i
248 pshufb %xmm1, %xmm3 # 3 = 1/i
249 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
250 movdqa %xmm10, %xmm4 # 4 : 1/j
251 pshufb %xmm0, %xmm4 # 4 = 1/j
252 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
253 movdqa %xmm10, %xmm2 # 2 : 1/iak
254 pshufb %xmm3, %xmm2 # 2 = 1/iak
255 pxor %xmm0, %xmm2 # 2 = io
256 movdqa %xmm10, %xmm3 # 3 : 1/jak
257 pshufb %xmm4, %xmm3 # 3 = 1/jak
258 pxor %xmm1, %xmm3 # 3 = jo
259 movdqu (%r9), %xmm0
260 jnz .Ldec_loop
261
262 # middle of last round
263 movdqa 0x60(%r10), %xmm4 # 3 : sbou
264 pshufb %xmm2, %xmm4 # 4 = sbou
265 pxor %xmm0, %xmm4 # 4 = sb1u + k
266 movdqa 0x70(%r10), %xmm0 # 0 : sbot
267 movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
268 pshufb %xmm3, %xmm0 # 0 = sb1t
269 pxor %xmm4, %xmm0 # 0 = A
270 pshufb %xmm2, %xmm0
271 ret
272.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
273
274########################################################
275## ##
276## AES key schedule ##
277## ##
278########################################################
279.type _vpaes_schedule_core,\@abi-omnipotent
280.align 16
281_vpaes_schedule_core:
282 # rdi = key
283 # rsi = size in bits
284 # rdx = buffer
285 # rcx = direction. 0=encrypt, 1=decrypt
286
287 call _vpaes_preheat # load the tables
288 movdqa .Lk_rcon(%rip), %xmm8 # load rcon
289 movdqu (%rdi), %xmm0 # load key (unaligned)
290
291 # input transform
292 movdqa %xmm0, %xmm3
293 lea .Lk_ipt(%rip), %r11
294 call _vpaes_schedule_transform
295 movdqa %xmm0, %xmm7
296
297 lea .Lk_sr(%rip),%r10
298 test %rcx, %rcx
299 jnz .Lschedule_am_decrypting
300
301 # encrypting, output zeroth round key after transform
302 movdqu %xmm0, (%rdx)
303 jmp .Lschedule_go
304
305.Lschedule_am_decrypting:
306 # decrypting, output zeroth round key after shiftrows
307 movdqa (%r8,%r10),%xmm1
308 pshufb %xmm1, %xmm3
309 movdqu %xmm3, (%rdx)
310 xor \$0x30, %r8
311
312.Lschedule_go:
313 cmp \$192, %esi
314 ja .Lschedule_256
315 je .Lschedule_192
316	# 128: fall through
317
318##
319## .schedule_128
320##
321## 128-bit specific part of key schedule.
322##
323## This schedule is really simple, because all its parts
324## are accomplished by the subroutines.
325##
326.Lschedule_128:
327 mov \$10, %esi
328
329.Loop_schedule_128:
330 call _vpaes_schedule_round
331 dec %rsi
332 jz .Lschedule_mangle_last
333 call _vpaes_schedule_mangle # write output
334 jmp .Loop_schedule_128
335
336##
337## .aes_schedule_192
338##
339## 192-bit specific part of key schedule.
340##
341## The main body of this schedule is the same as the 128-bit
342## schedule, but with more smearing. The long, high side is
343## stored in %xmm7 as before, and the short, low side is in
344## the high bits of %xmm6.
345##
346## This schedule is somewhat nastier, however, because each
347## round produces 192 bits of key material, or 1.5 round keys.
348## Therefore, on each cycle we do 2 rounds and produce 3 round
349## keys.
350##
351.align 16
352.Lschedule_192:
353 movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
354 call _vpaes_schedule_transform # input transform
355 movdqa %xmm0, %xmm6 # save short part
356 pxor %xmm4, %xmm4 # clear 4
357 movhlps %xmm4, %xmm6 # clobber low side with zeros
358 mov \$4, %esi
359
360.Loop_schedule_192:
361 call _vpaes_schedule_round
362 palignr \$8,%xmm6,%xmm0
363 call _vpaes_schedule_mangle # save key n
364 call _vpaes_schedule_192_smear
365 call _vpaes_schedule_mangle # save key n+1
366 call _vpaes_schedule_round
367 dec %rsi
368 jz .Lschedule_mangle_last
369 call _vpaes_schedule_mangle # save key n+2
370 call _vpaes_schedule_192_smear
371 jmp .Loop_schedule_192
372
373##
374## .aes_schedule_256
375##
376## 256-bit specific part of key schedule.
377##
378## The structure here is very similar to the 128-bit
379## schedule, but with an additional "low side" in
380## %xmm6. The low side's rounds are the same as the
381## high side's, except no rcon and no rotation.
382##
383.align 16
384.Lschedule_256:
385 movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
386 call _vpaes_schedule_transform # input transform
387 mov \$7, %esi
388
389.Loop_schedule_256:
390 call _vpaes_schedule_mangle # output low result
391 movdqa %xmm0, %xmm6 # save cur_lo in xmm6
392
393 # high round
394 call _vpaes_schedule_round
395 dec %rsi
396 jz .Lschedule_mangle_last
397 call _vpaes_schedule_mangle
398
399 # low round. swap xmm7 and xmm6
400 pshufd \$0xFF, %xmm0, %xmm0
401 movdqa %xmm7, %xmm5
402 movdqa %xmm6, %xmm7
403 call _vpaes_schedule_low_round
404 movdqa %xmm5, %xmm7
405
406 jmp .Loop_schedule_256
407
408
409##
410## .aes_schedule_mangle_last
411##
412## Mangler for last round of key schedule
413## Mangles %xmm0
414## when encrypting, outputs out(%xmm0) ^ 0x63
415## when decrypting, outputs unskew(%xmm0)
416##
417## Always called right before return... jumps to cleanup and exits
418##
419.align 16
420.Lschedule_mangle_last:
421 # schedule last round key from xmm0
422 lea .Lk_deskew(%rip),%r11 # prepare to deskew
423 test %rcx, %rcx
424 jnz .Lschedule_mangle_last_dec
425
426 # encrypting
427 movdqa (%r8,%r10),%xmm1
428 pshufb %xmm1, %xmm0 # output permute
429 lea .Lk_opt(%rip), %r11 # prepare to output transform
430 add \$32, %rdx
431
432.Lschedule_mangle_last_dec:
433 add \$-16, %rdx
434 pxor .Lk_s63(%rip), %xmm0
435 call _vpaes_schedule_transform # output transform
436 movdqu %xmm0, (%rdx) # save last key
437
438 # cleanup
439 pxor %xmm0, %xmm0
440 pxor %xmm1, %xmm1
441 pxor %xmm2, %xmm2
442 pxor %xmm3, %xmm3
443 pxor %xmm4, %xmm4
444 pxor %xmm5, %xmm5
445 pxor %xmm6, %xmm6
446 pxor %xmm7, %xmm7
447 ret
448.size _vpaes_schedule_core,.-_vpaes_schedule_core
449
450##
451## .aes_schedule_192_smear
452##
453## Smear the short, low side in the 192-bit key schedule.
454##
455## Inputs:
456## %xmm7: high side, b a x y
457## %xmm6: low side, d c 0 0
458## (%xmm1 is clobbered, used as a zero scratch)
459##
460## Outputs:
461## %xmm6: b+c+d b+c 0 0
462## %xmm0: b+c+d b+c b a
463##
464.type _vpaes_schedule_192_smear,\@abi-omnipotent
465.align 16
466_vpaes_schedule_192_smear:
467 pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
468 pxor %xmm0, %xmm6 # -> c+d c 0 0
469 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
470 pxor %xmm0, %xmm6 # -> b+c+d b+c b a
471 movdqa %xmm6, %xmm0
472 pxor %xmm1, %xmm1
473 movhlps %xmm1, %xmm6 # clobber low side with zeros
474 ret
475.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
476
477##
478## .aes_schedule_round
479##
480## Runs one main round of the key schedule on %xmm0, %xmm7
481##
482## Specifically, runs subbytes on the high dword of %xmm0
483## then rotates it by one byte and xors into the low dword of
484## %xmm7.
485##
486## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
487## next rcon.
488##
489## Smears the dwords of %xmm7 by xoring the low into the
490## second low, result into third, result into highest.
491##
492## Returns results in %xmm7 = %xmm0.
493## Clobbers %xmm1-%xmm4, %r11.
494##
495.type _vpaes_schedule_round,\@abi-omnipotent
496.align 16
497_vpaes_schedule_round:
498 # extract rcon from xmm8
499 pxor %xmm1, %xmm1
500 palignr \$15, %xmm8, %xmm1
501 palignr \$15, %xmm8, %xmm8
502 pxor %xmm1, %xmm7
503
504 # rotate
505 pshufd \$0xFF, %xmm0, %xmm0
506 palignr \$1, %xmm0, %xmm0
507
508 # fall through...
509
510 # low round: same as high round, but no rotation and no rcon.
511_vpaes_schedule_low_round:
512 # smear xmm7
513 movdqa %xmm7, %xmm1
514 pslldq \$4, %xmm7
515 pxor %xmm1, %xmm7
516 movdqa %xmm7, %xmm1
517 pslldq \$8, %xmm7
518 pxor %xmm1, %xmm7
519 pxor .Lk_s63(%rip), %xmm7
520
521 # subbytes
522 movdqa %xmm9, %xmm1
523 pandn %xmm0, %xmm1
524 psrld \$4, %xmm1 # 1 = i
525 pand %xmm9, %xmm0 # 0 = k
526 movdqa %xmm11, %xmm2 # 2 : a/k
527 pshufb %xmm0, %xmm2 # 2 = a/k
528 pxor %xmm1, %xmm0 # 0 = j
529 movdqa %xmm10, %xmm3 # 3 : 1/i
530 pshufb %xmm1, %xmm3 # 3 = 1/i
531 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
532 movdqa %xmm10, %xmm4 # 4 : 1/j
533 pshufb %xmm0, %xmm4 # 4 = 1/j
534 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
535 movdqa %xmm10, %xmm2 # 2 : 1/iak
536 pshufb %xmm3, %xmm2 # 2 = 1/iak
537 pxor %xmm0, %xmm2 # 2 = io
538 movdqa %xmm10, %xmm3 # 3 : 1/jak
539 pshufb %xmm4, %xmm3 # 3 = 1/jak
540 pxor %xmm1, %xmm3 # 3 = jo
541 movdqa %xmm13, %xmm4 # 4 : sbou
542 pshufb %xmm2, %xmm4 # 4 = sbou
543 movdqa %xmm12, %xmm0 # 0 : sbot
544 pshufb %xmm3, %xmm0 # 0 = sb1t
545 pxor %xmm4, %xmm0 # 0 = sbox output
546
547 # add in smeared stuff
548 pxor %xmm7, %xmm0
549 movdqa %xmm0, %xmm7
550 ret
551.size _vpaes_schedule_round,.-_vpaes_schedule_round
552
553##
554## .aes_schedule_transform
555##
556## Linear-transform %xmm0 according to tables at (%r11)
557##
558## Requires that %xmm9 = 0x0F0F... as in preheat
559## Output in %xmm0
560## Clobbers %xmm1, %xmm2
561##
562.type _vpaes_schedule_transform,\@abi-omnipotent
563.align 16
564_vpaes_schedule_transform:
565 movdqa %xmm9, %xmm1
566 pandn %xmm0, %xmm1
567 psrld \$4, %xmm1
568 pand %xmm9, %xmm0
569 movdqa (%r11), %xmm2 # lo
570 pshufb %xmm0, %xmm2
571 movdqa 16(%r11), %xmm0 # hi
572 pshufb %xmm1, %xmm0
573 pxor %xmm2, %xmm0
574 ret
575.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
576
577##
578## .aes_schedule_mangle
579##
580## Mangle xmm0 from (basis-transformed) standard version
581## to our version.
582##
583## On encrypt,
584## xor with 0x63
585## multiply by circulant 0,1,1,1
586## apply shiftrows transform
587##
588## On decrypt,
589## xor with 0x63
590## multiply by "inverse mixcolumns" circulant E,B,D,9
591## deskew
592## apply shiftrows transform
593##
594##
595## Writes out to (%rdx), and increments or decrements it
596## Keeps track of round number mod 4 in %r8
597## Preserves xmm0
598## Clobbers xmm1-xmm5
599##
600.type _vpaes_schedule_mangle,\@abi-omnipotent
601.align 16
602_vpaes_schedule_mangle:
603 movdqa %xmm0, %xmm4 # save xmm0 for later
604 movdqa .Lk_mc_forward(%rip),%xmm5
605 test %rcx, %rcx
606 jnz .Lschedule_mangle_dec
607
608 # encrypting
609 add \$16, %rdx
610 pxor .Lk_s63(%rip),%xmm4
611 pshufb %xmm5, %xmm4
612 movdqa %xmm4, %xmm3
613 pshufb %xmm5, %xmm4
614 pxor %xmm4, %xmm3
615 pshufb %xmm5, %xmm4
616 pxor %xmm4, %xmm3
617
618 jmp .Lschedule_mangle_both
619.align 16
620.Lschedule_mangle_dec:
621 # inverse mix columns
622 lea .Lk_dksd(%rip),%r11
623 movdqa %xmm9, %xmm1
624 pandn %xmm4, %xmm1
625 psrld \$4, %xmm1 # 1 = hi
626 pand %xmm9, %xmm4 # 4 = lo
627
628 movdqa 0x00(%r11), %xmm2
629 pshufb %xmm4, %xmm2
630 movdqa 0x10(%r11), %xmm3
631 pshufb %xmm1, %xmm3
632 pxor %xmm2, %xmm3
633 pshufb %xmm5, %xmm3
634
635 movdqa 0x20(%r11), %xmm2
636 pshufb %xmm4, %xmm2
637 pxor %xmm3, %xmm2
638 movdqa 0x30(%r11), %xmm3
639 pshufb %xmm1, %xmm3
640 pxor %xmm2, %xmm3
641 pshufb %xmm5, %xmm3
642
643 movdqa 0x40(%r11), %xmm2
644 pshufb %xmm4, %xmm2
645 pxor %xmm3, %xmm2
646 movdqa 0x50(%r11), %xmm3
647 pshufb %xmm1, %xmm3
648 pxor %xmm2, %xmm3
649 pshufb %xmm5, %xmm3
650
651 movdqa 0x60(%r11), %xmm2
652 pshufb %xmm4, %xmm2
653 pxor %xmm3, %xmm2
654 movdqa 0x70(%r11), %xmm3
655 pshufb %xmm1, %xmm3
656 pxor %xmm2, %xmm3
657
658 add \$-16, %rdx
659
660.Lschedule_mangle_both:
661 movdqa (%r8,%r10),%xmm1
662 pshufb %xmm1,%xmm3
663 add \$-16, %r8
664 and \$0x30, %r8
665 movdqu %xmm3, (%rdx)
666 ret
667.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
668
669#
670# Interface to OpenSSL
671#
672.globl ${PREFIX}_set_encrypt_key
673.type ${PREFIX}_set_encrypt_key,\@function,3
674.align 16
675${PREFIX}_set_encrypt_key:
676___
677$code.=<<___ if ($win64);
678 lea -0xb8(%rsp),%rsp
679 movaps %xmm6,0x10(%rsp)
680 movaps %xmm7,0x20(%rsp)
681 movaps %xmm8,0x30(%rsp)
682 movaps %xmm9,0x40(%rsp)
683 movaps %xmm10,0x50(%rsp)
684 movaps %xmm11,0x60(%rsp)
685 movaps %xmm12,0x70(%rsp)
686 movaps %xmm13,0x80(%rsp)
687 movaps %xmm14,0x90(%rsp)
688 movaps %xmm15,0xa0(%rsp)
689.Lenc_key_body:
690___
691$code.=<<___;
692 mov %esi,%eax
693 shr \$5,%eax
694 add \$5,%eax
695 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
696
697 mov \$0,%ecx
698 mov \$0x30,%r8d
699 call _vpaes_schedule_core
700___
701$code.=<<___ if ($win64);
702 movaps 0x10(%rsp),%xmm6
703 movaps 0x20(%rsp),%xmm7
704 movaps 0x30(%rsp),%xmm8
705 movaps 0x40(%rsp),%xmm9
706 movaps 0x50(%rsp),%xmm10
707 movaps 0x60(%rsp),%xmm11
708 movaps 0x70(%rsp),%xmm12
709 movaps 0x80(%rsp),%xmm13
710 movaps 0x90(%rsp),%xmm14
711 movaps 0xa0(%rsp),%xmm15
712 lea 0xb8(%rsp),%rsp
713.Lenc_key_epilogue:
714___
715$code.=<<___;
716 xor %eax,%eax
717 ret
718.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
719
720.globl ${PREFIX}_set_decrypt_key
721.type ${PREFIX}_set_decrypt_key,\@function,3
722.align 16
723${PREFIX}_set_decrypt_key:
724___
725$code.=<<___ if ($win64);
726 lea -0xb8(%rsp),%rsp
727 movaps %xmm6,0x10(%rsp)
728 movaps %xmm7,0x20(%rsp)
729 movaps %xmm8,0x30(%rsp)
730 movaps %xmm9,0x40(%rsp)
731 movaps %xmm10,0x50(%rsp)
732 movaps %xmm11,0x60(%rsp)
733 movaps %xmm12,0x70(%rsp)
734 movaps %xmm13,0x80(%rsp)
735 movaps %xmm14,0x90(%rsp)
736 movaps %xmm15,0xa0(%rsp)
737.Ldec_key_body:
738___
739$code.=<<___;
740 mov %esi,%eax
741 shr \$5,%eax
742 add \$5,%eax
743 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
744 shl \$4,%eax
745 lea 16(%rdx,%rax),%rdx
746
747 mov \$1,%ecx
748 mov %esi,%r8d
749 shr \$1,%r8d
750 and \$32,%r8d
751 xor \$32,%r8d # nbits==192?0:32
752 call _vpaes_schedule_core
753___
754$code.=<<___ if ($win64);
755 movaps 0x10(%rsp),%xmm6
756 movaps 0x20(%rsp),%xmm7
757 movaps 0x30(%rsp),%xmm8
758 movaps 0x40(%rsp),%xmm9
759 movaps 0x50(%rsp),%xmm10
760 movaps 0x60(%rsp),%xmm11
761 movaps 0x70(%rsp),%xmm12
762 movaps 0x80(%rsp),%xmm13
763 movaps 0x90(%rsp),%xmm14
764 movaps 0xa0(%rsp),%xmm15
765 lea 0xb8(%rsp),%rsp
766.Ldec_key_epilogue:
767___
768$code.=<<___;
769 xor %eax,%eax
770 ret
771.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
772
773.globl ${PREFIX}_encrypt
774.type ${PREFIX}_encrypt,\@function,3
775.align 16
776${PREFIX}_encrypt:
777___
778$code.=<<___ if ($win64);
779 lea -0xb8(%rsp),%rsp
780 movaps %xmm6,0x10(%rsp)
781 movaps %xmm7,0x20(%rsp)
782 movaps %xmm8,0x30(%rsp)
783 movaps %xmm9,0x40(%rsp)
784 movaps %xmm10,0x50(%rsp)
785 movaps %xmm11,0x60(%rsp)
786 movaps %xmm12,0x70(%rsp)
787 movaps %xmm13,0x80(%rsp)
788 movaps %xmm14,0x90(%rsp)
789 movaps %xmm15,0xa0(%rsp)
790.Lenc_body:
791___
792$code.=<<___;
793 movdqu (%rdi),%xmm0
794 call _vpaes_preheat
795 call _vpaes_encrypt_core
796 movdqu %xmm0,(%rsi)
797___
798$code.=<<___ if ($win64);
799 movaps 0x10(%rsp),%xmm6
800 movaps 0x20(%rsp),%xmm7
801 movaps 0x30(%rsp),%xmm8
802 movaps 0x40(%rsp),%xmm9
803 movaps 0x50(%rsp),%xmm10
804 movaps 0x60(%rsp),%xmm11
805 movaps 0x70(%rsp),%xmm12
806 movaps 0x80(%rsp),%xmm13
807 movaps 0x90(%rsp),%xmm14
808 movaps 0xa0(%rsp),%xmm15
809 lea 0xb8(%rsp),%rsp
810.Lenc_epilogue:
811___
812$code.=<<___;
813 ret
814.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
815
816.globl ${PREFIX}_decrypt
817.type ${PREFIX}_decrypt,\@function,3
818.align 16
819${PREFIX}_decrypt:
820___
821$code.=<<___ if ($win64);
822 lea -0xb8(%rsp),%rsp
823 movaps %xmm6,0x10(%rsp)
824 movaps %xmm7,0x20(%rsp)
825 movaps %xmm8,0x30(%rsp)
826 movaps %xmm9,0x40(%rsp)
827 movaps %xmm10,0x50(%rsp)
828 movaps %xmm11,0x60(%rsp)
829 movaps %xmm12,0x70(%rsp)
830 movaps %xmm13,0x80(%rsp)
831 movaps %xmm14,0x90(%rsp)
832 movaps %xmm15,0xa0(%rsp)
833.Ldec_body:
834___
835$code.=<<___;
836 movdqu (%rdi),%xmm0
837 call _vpaes_preheat
838 call _vpaes_decrypt_core
839 movdqu %xmm0,(%rsi)
840___
841$code.=<<___ if ($win64);
842 movaps 0x10(%rsp),%xmm6
843 movaps 0x20(%rsp),%xmm7
844 movaps 0x30(%rsp),%xmm8
845 movaps 0x40(%rsp),%xmm9
846 movaps 0x50(%rsp),%xmm10
847 movaps 0x60(%rsp),%xmm11
848 movaps 0x70(%rsp),%xmm12
849 movaps 0x80(%rsp),%xmm13
850 movaps 0x90(%rsp),%xmm14
851 movaps 0xa0(%rsp),%xmm15
852 lea 0xb8(%rsp),%rsp
853.Ldec_epilogue:
854___
855$code.=<<___;
856 ret
857.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
858___
859{
860my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
861# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
862# size_t length, const AES_KEY *key,
863# unsigned char *ivp,const int enc);
864$code.=<<___;
865.globl ${PREFIX}_cbc_encrypt
866.type ${PREFIX}_cbc_encrypt,\@function,6
867.align 16
868${PREFIX}_cbc_encrypt:
869 xchg $key,$len
870___
871($len,$key)=($key,$len);
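# The xchg above moves the key pointer into %rdx, where the _vpaes_*_core
# routines expect it, and the Perl-level swap keeps $len/$key naming the
# right registers in the code that follows. A minimal model of the pairing
# (illustrative helper, never called):
sub _sketch_arg_swap {
	my ($len, $key) = ("%rdx", "%rcx");	# SysV args 3 and 4 on entry
	($len, $key) = ($key, $len);		# after xchg: key in %rdx
	return ($len, $key);
}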
872$code.=<<___;
873 sub \$16,$len
874 jc .Lcbc_abort
875___
876$code.=<<___ if ($win64);
877 lea -0xb8(%rsp),%rsp
878 movaps %xmm6,0x10(%rsp)
879 movaps %xmm7,0x20(%rsp)
880 movaps %xmm8,0x30(%rsp)
881 movaps %xmm9,0x40(%rsp)
882 movaps %xmm10,0x50(%rsp)
883 movaps %xmm11,0x60(%rsp)
884 movaps %xmm12,0x70(%rsp)
885 movaps %xmm13,0x80(%rsp)
886 movaps %xmm14,0x90(%rsp)
887 movaps %xmm15,0xa0(%rsp)
888.Lcbc_body:
889___
890$code.=<<___;
891 movdqu ($ivp),%xmm6 # load IV
892 sub $inp,$out
893 call _vpaes_preheat
894 cmp \$0,${enc}d
895 je .Lcbc_dec_loop
896 jmp .Lcbc_enc_loop
897.align 16
898.Lcbc_enc_loop:
899 movdqu ($inp),%xmm0
900 pxor %xmm6,%xmm0
901 call _vpaes_encrypt_core
902 movdqa %xmm0,%xmm6
903 movdqu %xmm0,($out,$inp)
904 lea 16($inp),$inp
905 sub \$16,$len
906 jnc .Lcbc_enc_loop
907 jmp .Lcbc_done
908.align 16
909.Lcbc_dec_loop:
910 movdqu ($inp),%xmm0
911 movdqa %xmm0,%xmm7
912 call _vpaes_decrypt_core
913 pxor %xmm6,%xmm0
914 movdqa %xmm7,%xmm6
915 movdqu %xmm0,($out,$inp)
916 lea 16($inp),$inp
917 sub \$16,$len
918 jnc .Lcbc_dec_loop
919.Lcbc_done:
920 movdqu %xmm6,($ivp) # save IV
921___
922$code.=<<___ if ($win64);
923 movaps 0x10(%rsp),%xmm6
924 movaps 0x20(%rsp),%xmm7
925 movaps 0x30(%rsp),%xmm8
926 movaps 0x40(%rsp),%xmm9
927 movaps 0x50(%rsp),%xmm10
928 movaps 0x60(%rsp),%xmm11
929 movaps 0x70(%rsp),%xmm12
930 movaps 0x80(%rsp),%xmm13
931 movaps 0x90(%rsp),%xmm14
932 movaps 0xa0(%rsp),%xmm15
933 lea 0xb8(%rsp),%rsp
934.Lcbc_epilogue:
935___
936$code.=<<___;
937.Lcbc_abort:
938 ret
939.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
940___
941}
942$code.=<<___;
943##
944## _aes_preheat
945##
946## Fills register %r10 -> .aes_consts (so you can -fPIC)
947## and %xmm9-%xmm15 as specified below.
948##
949.type _vpaes_preheat,\@abi-omnipotent
950.align 16
951_vpaes_preheat:
952 lea .Lk_s0F(%rip), %r10
953 movdqa -0x20(%r10), %xmm10 # .Lk_inv
954 movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
955 movdqa 0x00(%r10), %xmm9 # .Lk_s0F
956 movdqa 0x30(%r10), %xmm13 # .Lk_sb1
957 movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
958 movdqa 0x50(%r10), %xmm15 # .Lk_sb2
959 movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
960 ret
961.size _vpaes_preheat,.-_vpaes_preheat
962########################################################
963## ##
964## Constants ##
965## ##
966########################################################
967.type _vpaes_consts,\@object
968.align 64
969_vpaes_consts:
970.Lk_inv: # inv, inva
971 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
972 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
973
974.Lk_s0F: # s0F
975 .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
976
977.Lk_ipt: # input transform (lo, hi)
978 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
979 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
980
981.Lk_sb1: # sb1u, sb1t
982 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
983 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
984.Lk_sb2: # sb2u, sb2t
985 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
986 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
987.Lk_sbo: # sbou, sbot
988 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
989 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
990
991.Lk_mc_forward: # mc_forward
992 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
993 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
994 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
995 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
996
997.Lk_mc_backward: # mc_backward
998 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
999 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
1000 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
1001 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
1002
1003.Lk_sr: # sr
1004 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
1005 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
1006 .quad 0x0F060D040B020900, 0x070E050C030A0108
1007 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
1008
1009.Lk_rcon: # rcon
1010 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
1011
1012.Lk_s63: # s63: all equal to 0x63 transformed
1013 .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
1014
1015.Lk_opt: # output transform
1016 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
1017 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
1018
1019.Lk_deskew: # deskew tables: inverts the sbox's "skew"
1020 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
1021 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
1022
1023##
1024## Decryption stuff
1025## Key schedule constants
1026##
1027.Lk_dksd: # decryption key schedule: invskew x*D
1028 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
1029 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
1030.Lk_dksb: # decryption key schedule: invskew x*B
1031 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
1032 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
1033.Lk_dkse: # decryption key schedule: invskew x*E + 0x63
1034 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
1035 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
1036.Lk_dks9: # decryption key schedule: invskew x*9
1037 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
1038 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
1039
1040##
1041## Decryption stuff
1042## Round function constants
1043##
1044.Lk_dipt: # decryption input transform
1045 .quad 0x0F505B040B545F00, 0x154A411E114E451A
1046 .quad 0x86E383E660056500, 0x12771772F491F194
1047
1048.Lk_dsb9: # decryption sbox output *9*u, *9*t
1049 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
1050 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
1051.Lk_dsbd: # decryption sbox output *D*u, *D*t
1052 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
1053 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
1054.Lk_dsbb: # decryption sbox output *B*u, *B*t
1055 .quad 0xD022649296B44200, 0x602646F6B0F2D404
1056 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
1057.Lk_dsbe: # decryption sbox output *E*u, *E*t
1058 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
1059 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
1060.Lk_dsbo: # decryption sbox final output
1061 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
1062 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
1063.asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
1064.align 64
1065.size _vpaes_consts,.-_vpaes_consts
1066___
1067
1068if ($win64) {
1069# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1070# CONTEXT *context,DISPATCHER_CONTEXT *disp)
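#
# If an exception lands between an .L*_body / .L*_epilogue pair, se_handler
# copies the saved %xmm6-%xmm15 back into the CONTEXT record, pops the
# 0xb8-byte frame off context->Rsp, and forwards to RtlVirtualUnwind, finally
# returning ExceptionContinueSearch.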
1071$rec="%rcx";
1072$frame="%rdx";
1073$context="%r8";
1074$disp="%r9";
1075
1076$code.=<<___;
1077.extern __imp_RtlVirtualUnwind
1078.type se_handler,\@abi-omnipotent
1079.align 16
1080se_handler:
1081 push %rsi
1082 push %rdi
1083 push %rbx
1084 push %rbp
1085 push %r12
1086 push %r13
1087 push %r14
1088 push %r15
1089 pushfq
1090 sub \$64,%rsp
1091
1092 mov 120($context),%rax # pull context->Rax
1093 mov 248($context),%rbx # pull context->Rip
1094
1095 mov 8($disp),%rsi # disp->ImageBase
1096 mov 56($disp),%r11 # disp->HandlerData
1097
1098 mov 0(%r11),%r10d # HandlerData[0]
1099 lea (%rsi,%r10),%r10 # prologue label
1100 cmp %r10,%rbx # context->Rip<prologue label
1101 jb .Lin_prologue
1102
1103 mov 152($context),%rax # pull context->Rsp
1104
1105 mov 4(%r11),%r10d # HandlerData[1]
1106 lea (%rsi,%r10),%r10 # epilogue label
1107 cmp %r10,%rbx # context->Rip>=epilogue label
1108 jae .Lin_prologue
1109
1110 lea 16(%rax),%rsi # %xmm save area
1111 lea 512($context),%rdi # &context.Xmm6
1112 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1113 .long 0xa548f3fc # cld; rep movsq
1114 lea 0xb8(%rax),%rax # adjust stack pointer
1115
1116.Lin_prologue:
1117 mov 8(%rax),%rdi
1118 mov 16(%rax),%rsi
1119 mov %rax,152($context) # restore context->Rsp
1120 mov %rsi,168($context) # restore context->Rsi
1121 mov %rdi,176($context) # restore context->Rdi
1122
1123 mov 40($disp),%rdi # disp->ContextRecord
1124 mov $context,%rsi # context
1125	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)/8, count in quadwords
1126 .long 0xa548f3fc # cld; rep movsq
1127
1128 mov $disp,%rsi
1129 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1130 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1131 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1132 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1133 mov 40(%rsi),%r10 # disp->ContextRecord
1134 lea 56(%rsi),%r11 # &disp->HandlerData
1135 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1136 mov %r10,32(%rsp) # arg5
1137 mov %r11,40(%rsp) # arg6
1138 mov %r12,48(%rsp) # arg7
1139 mov %rcx,56(%rsp) # arg8, (NULL)
1140 call *__imp_RtlVirtualUnwind(%rip)
1141
1142 mov \$1,%eax # ExceptionContinueSearch
1143 add \$64,%rsp
1144 popfq
1145 pop %r15
1146 pop %r14
1147 pop %r13
1148 pop %r12
1149 pop %rbp
1150 pop %rbx
1151 pop %rdi
1152 pop %rsi
1153 ret
1154.size se_handler,.-se_handler
1155
1156.section .pdata
1157.align 4
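# one RUNTIME_FUNCTION entry per exported symbol: begin, end and
# unwind-info RVAs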
1158 .rva .LSEH_begin_${PREFIX}_set_encrypt_key
1159 .rva .LSEH_end_${PREFIX}_set_encrypt_key
1160 .rva .LSEH_info_${PREFIX}_set_encrypt_key
1161
1162 .rva .LSEH_begin_${PREFIX}_set_decrypt_key
1163 .rva .LSEH_end_${PREFIX}_set_decrypt_key
1164 .rva .LSEH_info_${PREFIX}_set_decrypt_key
1165
1166 .rva .LSEH_begin_${PREFIX}_encrypt
1167 .rva .LSEH_end_${PREFIX}_encrypt
1168 .rva .LSEH_info_${PREFIX}_encrypt
1169
1170 .rva .LSEH_begin_${PREFIX}_decrypt
1171 .rva .LSEH_end_${PREFIX}_decrypt
1172 .rva .LSEH_info_${PREFIX}_decrypt
1173
1174 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
1175 .rva .LSEH_end_${PREFIX}_cbc_encrypt
1176 .rva .LSEH_info_${PREFIX}_cbc_encrypt
1177
1178.section .xdata
1179.align 8
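# UNWIND_INFO: version 1 with UNW_FLAG_EHANDLER (the 0x09 byte), no prologue
# unwind codes; HandlerData[] carries the body/epilogue labels that se_handler
# compares against context->Rip.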
1180.LSEH_info_${PREFIX}_set_encrypt_key:
1181 .byte 9,0,0,0
1182 .rva se_handler
1183 .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
1184.LSEH_info_${PREFIX}_set_decrypt_key:
1185 .byte 9,0,0,0
1186 .rva se_handler
1187 .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
1188.LSEH_info_${PREFIX}_encrypt:
1189 .byte 9,0,0,0
1190 .rva se_handler
1191 .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
1192.LSEH_info_${PREFIX}_decrypt:
1193 .byte 9,0,0,0
1194 .rva se_handler
1195 .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
1196.LSEH_info_${PREFIX}_cbc_encrypt:
1197 .byte 9,0,0,0
1198 .rva se_handler
1199 .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
1200___
1201}
1202
1203$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1204
1205print $code;
1206
1207close STDOUT;