summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/aes
diff options
context:
space:
mode:
authorcvs2svn <admin@example.com>2014-02-27 21:04:58 +0000
committercvs2svn <admin@example.com>2014-02-27 21:04:58 +0000
commit726818f36b5221c023cd04c4b90bdbc08e94cd96 (patch)
treecf8221f3aa5bf5a578ddf1ecf5677ad08c04d342 /src/lib/libcrypto/aes
parent3b6d92e82b1421b811bcdec7f7fdfb31eeef18de (diff)
downloadopenbsd-OPENBSD_5_5_BASE.tar.gz
openbsd-OPENBSD_5_5_BASE.tar.bz2
openbsd-OPENBSD_5_5_BASE.zip
This commit was manufactured by cvs2git to create tag 'OPENBSD_5_5_BASE'.OPENBSD_5_5_BASE
Diffstat (limited to 'src/lib/libcrypto/aes')
-rw-r--r--src/lib/libcrypto/aes/README3
-rw-r--r--src/lib/libcrypto/aes/aes.h147
-rw-r--r--src/lib/libcrypto/aes/aes_cbc.c63
-rw-r--r--src/lib/libcrypto/aes/aes_cfb.c81
-rw-r--r--src/lib/libcrypto/aes/aes_core.c1358
-rw-r--r--src/lib/libcrypto/aes/aes_ctr.c61
-rw-r--r--src/lib/libcrypto/aes/aes_ecb.c73
-rw-r--r--src/lib/libcrypto/aes/aes_ige.c323
-rw-r--r--src/lib/libcrypto/aes/aes_locl.h89
-rw-r--r--src/lib/libcrypto/aes/aes_misc.c85
-rw-r--r--src/lib/libcrypto/aes/aes_ofb.c60
-rw-r--r--src/lib/libcrypto/aes/aes_wrap.c259
-rw-r--r--src/lib/libcrypto/aes/aes_x86core.c1063
-rw-r--r--src/lib/libcrypto/aes/asm/aes-586.pl2980
-rw-r--r--src/lib/libcrypto/aes/asm/aes-armv4.pl1134
-rw-r--r--src/lib/libcrypto/aes/asm/aes-ia64.S1123
-rw-r--r--src/lib/libcrypto/aes/asm/aes-mips.pl1611
-rw-r--r--src/lib/libcrypto/aes/asm/aes-parisc.pl1021
-rw-r--r--src/lib/libcrypto/aes/asm/aes-ppc.pl1365
-rw-r--r--src/lib/libcrypto/aes/asm/aes-s390x.pl2254
-rwxr-xr-xsrc/lib/libcrypto/aes/asm/aes-sparcv9.pl1182
-rwxr-xr-xsrc/lib/libcrypto/aes/asm/aes-x86_64.pl2818
-rw-r--r--src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl1249
-rw-r--r--src/lib/libcrypto/aes/asm/aesni-x86.pl2189
-rw-r--r--src/lib/libcrypto/aes/asm/aesni-x86_64.pl3068
-rw-r--r--src/lib/libcrypto/aes/asm/bsaes-x86_64.pl3044
-rw-r--r--src/lib/libcrypto/aes/asm/vpaes-x86.pl903
-rw-r--r--src/lib/libcrypto/aes/asm/vpaes-x86_64.pl1206
28 files changed, 0 insertions, 30812 deletions
diff --git a/src/lib/libcrypto/aes/README b/src/lib/libcrypto/aes/README
deleted file mode 100644
index 0f9620a80e..0000000000
--- a/src/lib/libcrypto/aes/README
+++ /dev/null
@@ -1,3 +0,0 @@
1This is an OpenSSL-compatible version of AES (also called Rijndael).
2aes_core.c is basically the same as rijndael-alg-fst.c but with an
3API that looks like the rest of the OpenSSL symmetric cipher suite.
diff --git a/src/lib/libcrypto/aes/aes.h b/src/lib/libcrypto/aes/aes.h
deleted file mode 100644
index 031abf01b5..0000000000
--- a/src/lib/libcrypto/aes/aes.h
+++ /dev/null
@@ -1,147 +0,0 @@
1/* crypto/aes/aes.h -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#ifndef HEADER_AES_H
53#define HEADER_AES_H
54
55#include <openssl/opensslconf.h>
56
57#ifdef OPENSSL_NO_AES
58#error AES is disabled.
59#endif
60
61#include <stddef.h>
62
63#define AES_ENCRYPT 1
64#define AES_DECRYPT 0
65
66/* Because array size can't be a const in C, the following two are macros.
67 Both sizes are in bytes. */
68#define AES_MAXNR 14
69#define AES_BLOCK_SIZE 16
70
71#ifdef __cplusplus
72extern "C" {
73#endif
74
75/* This should be a hidden type, but EVP requires that the size be known */
76struct aes_key_st {
77#ifdef AES_LONG
78 unsigned long rd_key[4 *(AES_MAXNR + 1)];
79#else
80 unsigned int rd_key[4 *(AES_MAXNR + 1)];
81#endif
82 int rounds;
83};
84typedef struct aes_key_st AES_KEY;
85
86const char *AES_options(void);
87
88int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
89 AES_KEY *key);
90int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
91 AES_KEY *key);
92
93int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
94 AES_KEY *key);
95int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
96 AES_KEY *key);
97
98void AES_encrypt(const unsigned char *in, unsigned char *out,
99 const AES_KEY *key);
100void AES_decrypt(const unsigned char *in, unsigned char *out,
101 const AES_KEY *key);
102
103void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
104 const AES_KEY *key, const int enc);
105void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
106 size_t length, const AES_KEY *key,
107 unsigned char *ivec, const int enc);
108void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
109 size_t length, const AES_KEY *key,
110 unsigned char *ivec, int *num, const int enc);
111void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
112 size_t length, const AES_KEY *key,
113 unsigned char *ivec, int *num, const int enc);
114void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
115 size_t length, const AES_KEY *key,
116 unsigned char *ivec, int *num, const int enc);
117void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
118 size_t length, const AES_KEY *key,
119 unsigned char *ivec, int *num);
120void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
121 size_t length, const AES_KEY *key,
122 unsigned char ivec[AES_BLOCK_SIZE],
123 unsigned char ecount_buf[AES_BLOCK_SIZE],
124 unsigned int *num);
125/* NB: the IV is _two_ blocks long */
126void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
127 size_t length, const AES_KEY *key,
128 unsigned char *ivec, const int enc);
129/* NB: the IV is _four_ blocks long */
130void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out,
131 size_t length, const AES_KEY *key,
132 const AES_KEY *key2, const unsigned char *ivec,
133 const int enc);
134
135int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
136 unsigned char *out,
137 const unsigned char *in, unsigned int inlen);
138int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
139 unsigned char *out,
140 const unsigned char *in, unsigned int inlen);
141
142
143#ifdef __cplusplus
144}
145#endif
146
147#endif /* !HEADER_AES_H */
diff --git a/src/lib/libcrypto/aes/aes_cbc.c b/src/lib/libcrypto/aes/aes_cbc.c
deleted file mode 100644
index 227f75625d..0000000000
--- a/src/lib/libcrypto/aes/aes_cbc.c
+++ /dev/null
@@ -1,63 +0,0 @@
1/* crypto/aes/aes_cbc.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/modes.h>
54
55void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
56 size_t len, const AES_KEY *key,
57 unsigned char *ivec, const int enc) {
58
59 if (enc)
60 CRYPTO_cbc128_encrypt(in,out,len,key,ivec,(block128_f)AES_encrypt);
61 else
62 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,(block128_f)AES_decrypt);
63}
diff --git a/src/lib/libcrypto/aes/aes_cfb.c b/src/lib/libcrypto/aes/aes_cfb.c
deleted file mode 100644
index 0c6d058ce7..0000000000
--- a/src/lib/libcrypto/aes/aes_cfb.c
+++ /dev/null
@@ -1,81 +0,0 @@
1/* crypto/aes/aes_cfb.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/modes.h>
54
55/* The input and output encrypted as though 128bit cfb mode is being
56 * used. The extra state information to record how much of the
57 * 128bit block we have used is contained in *num;
58 */
59
60void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
61 size_t length, const AES_KEY *key,
62 unsigned char *ivec, int *num, const int enc) {
63
64 CRYPTO_cfb128_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt);
65}
66
67/* N.B. This expects the input to be packed, MS bit first */
68void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
69 size_t length, const AES_KEY *key,
70 unsigned char *ivec, int *num, const int enc)
71 {
72 CRYPTO_cfb128_1_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt);
73 }
74
75void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
76 size_t length, const AES_KEY *key,
77 unsigned char *ivec, int *num, const int enc)
78 {
79 CRYPTO_cfb128_8_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt);
80 }
81
diff --git a/src/lib/libcrypto/aes/aes_core.c b/src/lib/libcrypto/aes/aes_core.c
deleted file mode 100644
index 8f5210ac70..0000000000
--- a/src/lib/libcrypto/aes/aes_core.c
+++ /dev/null
@@ -1,1358 +0,0 @@
1/* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */
2/**
3 * rijndael-alg-fst.c
4 *
5 * @version 3.0 (December 2000)
6 *
7 * Optimised ANSI C code for the Rijndael cipher (now AES)
8 *
9 * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10 * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11 * @author Paulo Barreto <paulo.barreto@terra.com.br>
12 *
13 * This code is hereby placed in the public domain.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/* Note: rewritten a little bit to provide error control and an OpenSSL-
29 compatible API */
30
31#ifndef AES_DEBUG
32# ifndef NDEBUG
33# define NDEBUG
34# endif
35#endif
36#include <assert.h>
37
38#include <stdlib.h>
39#include <openssl/aes.h>
40#include "aes_locl.h"
41
42#ifndef AES_ASM
43/*
44Te0[x] = S [x].[02, 01, 01, 03];
45Te1[x] = S [x].[03, 02, 01, 01];
46Te2[x] = S [x].[01, 03, 02, 01];
47Te3[x] = S [x].[01, 01, 03, 02];
48
49Td0[x] = Si[x].[0e, 09, 0d, 0b];
50Td1[x] = Si[x].[0b, 0e, 09, 0d];
51Td2[x] = Si[x].[0d, 0b, 0e, 09];
52Td3[x] = Si[x].[09, 0d, 0b, 0e];
53Td4[x] = Si[x].[01];
54*/
55
56static const u32 Te0[256] = {
57 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
58 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
59 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
60 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
61 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
62 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
63 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
64 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
65 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
66 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
67 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
68 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
69 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
70 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
71 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
72 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
73 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
74 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
75 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
76 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
77 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
78 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
79 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
80 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
81 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
82 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
83 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
84 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
85 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
86 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
87 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
88 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
89 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
90 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
91 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
92 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
93 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
94 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
95 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
96 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
97 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
98 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
99 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
100 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
101 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
102 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
103 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
104 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
105 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
106 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
107 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
108 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
109 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
110 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
111 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
112 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
113 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
114 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
115 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
116 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
117 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
118 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
119 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
120 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
121};
122static const u32 Te1[256] = {
123 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
124 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
125 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
126 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
127 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
128 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
129 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
130 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
131 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
132 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
133 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
134 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
135 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
136 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
137 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
138 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
139 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
140 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
141 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
142 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
143 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
144 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
145 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
146 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
147 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
148 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
149 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
150 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
151 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
152 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
153 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
154 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
155 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
156 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
157 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
158 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
159 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
160 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
161 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
162 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
163 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
164 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
165 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
166 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
167 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
168 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
169 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
170 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
171 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
172 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
173 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
174 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
175 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
176 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
177 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
178 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
179 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
180 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
181 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
182 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
183 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
184 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
185 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
186 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
187};
188static const u32 Te2[256] = {
189 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
190 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
191 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
192 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
193 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
194 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
195 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
196 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
197 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
198 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
199 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
200 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
201 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
202 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
203 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
204 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
205 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
206 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
207 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
208 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
209 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
210 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
211 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
212 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
213 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
214 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
215 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
216 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
217 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
218 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
219 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
220 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
221 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
222 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
223 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
224 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
225 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
226 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
227 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
228 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
229 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
230 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
231 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
232 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
233 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
234 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
235 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
236 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
237 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
238 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
239 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
240 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
241 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
242 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
243 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
244 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
245 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
246 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
247 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
248 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
249 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
250 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
251 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
252 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
253};
254static const u32 Te3[256] = {
255 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
256 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
257 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
258 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
259 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
260 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
261 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
262 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
263 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
264 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
265 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
266 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
267 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
268 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
269 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
270 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
271 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
272 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
273 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
274 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
275 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
276 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
277 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
278 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
279 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
280 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
281 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
282 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
283 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
284 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
285 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
286 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
287 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
288 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
289 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
290 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
291 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
292 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
293 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
294 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
295 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
296 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
297 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
298 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
299 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
300 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
301 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
302 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
303 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
304 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
305 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
306 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
307 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
308 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
309 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
310 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
311 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
312 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
313 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
314 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
315 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
316 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
317 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
318 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
319};
320
/*
 * Decryption T-tables: Td0[x] is the inverse-MixColumn image of the
 * inverse S-box entry for x, spread across a 32-bit word; Td1..Td3 are
 * byte rotations of Td0 (compare 0x51f4a750 here with 0x5051f4a7 in
 * Td1).  Used by AES_decrypt and, indexed via the forward S-box, by
 * private_AES_set_decrypt_key to apply InvMixColumn to the round keys.
 */
static const u32 Td0[256] = {
	0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
	0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
	0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
	0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
	0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
	0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
	0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
	0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
	0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
	0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
	0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
	0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
	0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
	0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
	0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
	0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
	0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
	0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
	0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
	0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
	0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
	0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
	0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
	0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
	0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
	0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
	0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
	0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
	0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
	0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
	0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
	0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
	0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
	0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
	0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
	0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
	0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
	0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
	0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
	0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
	0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
	0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
	0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
	0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
	0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
	0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
	0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
	0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
	0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
	0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
	0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
	0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
	0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
	0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
	0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
	0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
	0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
	0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
	0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
	0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
	0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
	0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
	0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
	0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
};
/* Td1[x] = Td0[x] rotated right by one byte (see the note above Td0). */
static const u32 Td1[256] = {
	0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
	0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
	0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
	0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
	0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
	0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
	0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
	0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
	0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
	0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
	0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
	0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
	0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
	0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
	0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
	0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
	0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
	0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
	0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
	0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
	0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
	0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
	0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
	0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
	0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
	0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
	0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
	0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
	0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
	0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
	0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
	0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
	0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
	0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
	0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
	0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
	0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
	0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
	0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
	0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
	0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
	0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
	0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
	0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
	0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
	0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
	0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
	0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
	0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
	0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
	0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
	0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
	0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
	0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
	0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
	0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
	0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
	0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
	0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
	0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
	0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
	0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
	0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
	0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
};
/* Td2[x] = Td0[x] rotated right by two bytes (see the note above Td0). */
static const u32 Td2[256] = {
	0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
	0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
	0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
	0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
	0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
	0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
	0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
	0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
	0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
	0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
	0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
	0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
	0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
	0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
	0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
	0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
	0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
	0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
	0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
	0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
	0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
	0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
	0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
	0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
	0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
	0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
	0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
	0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
	0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
	0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
	0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
	0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
	0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
	0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
	0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
	0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
	0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
	0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
	0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
	0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
	0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
	0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
	0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
	0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
	0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
	0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
	0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
	0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
	0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
	0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
	0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
	0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
	0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
	0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
	0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
	0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
	0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
	0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
	0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
	0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
	0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
	0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
	0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
	0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
};
/* Td3[x] = Td0[x] rotated right by three bytes (see the note above Td0). */
static const u32 Td3[256] = {
	0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
	0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
	0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
	0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
	0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
	0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
	0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
	0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
	0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
	0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
	0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
	0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
	0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
	0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
	0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
	0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
	0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
	0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
	0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
	0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
	0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
	0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
	0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
	0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
	0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
	0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
	0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
	0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
	0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
	0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
	0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
	0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
	0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
	0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
	0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
	0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
	0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
	0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
	0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
	0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
	0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
	0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
	0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
	0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
	0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
	0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
	0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
	0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
	0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
	0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
	0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
	0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
	0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
	0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
	0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
	0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
	0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
	0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
	0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
	0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
	0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
	0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
	0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
	0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
};
/*
 * Td4[x] is the plain inverse S-box (no MixColumn factor); presumably
 * used by the final decryption round, which is outside this chunk —
 * NOTE(review): confirm against AES_decrypt's tail.
 */
static const u8 Td4[256] = {
	0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
	0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
	0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
	0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
	0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
	0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
	0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
	0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
	0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
	0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
	0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
	0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
	0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
	0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
	0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
	0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
	0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
	0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
	0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
	0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
	0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
	0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
	0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
	0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
	0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
	0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
	0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
	0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
	0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
	0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
	0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
	0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU,
};
/*
 * Key-schedule round constants: successive doublings in GF(2^8)
 * (0x80 -> 0x1B -> 0x36 after reduction), placed in the top byte of a
 * big-endian word.  The key expansion below consumes up to rcon[9] for
 * 128-bit keys, rcon[7] for 192-bit and rcon[6] for 256-bit.
 */
static const u32 rcon[] = {
	0x01000000, 0x02000000, 0x04000000, 0x08000000,
	0x10000000, 0x20000000, 0x40000000, 0x80000000,
	0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};
624
625/**
626 * Expand the cipher key into the encryption key schedule.
627 */
628int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
629 AES_KEY *key) {
630
631 u32 *rk;
632 int i = 0;
633 u32 temp;
634
635 if (!userKey || !key)
636 return -1;
637 if (bits != 128 && bits != 192 && bits != 256)
638 return -2;
639
640 rk = key->rd_key;
641
642 if (bits==128)
643 key->rounds = 10;
644 else if (bits==192)
645 key->rounds = 12;
646 else
647 key->rounds = 14;
648
649 rk[0] = GETU32(userKey );
650 rk[1] = GETU32(userKey + 4);
651 rk[2] = GETU32(userKey + 8);
652 rk[3] = GETU32(userKey + 12);
653 if (bits == 128) {
654 while (1) {
655 temp = rk[3];
656 rk[4] = rk[0] ^
657 (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
658 (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
659 (Te0[(temp ) & 0xff] & 0x0000ff00) ^
660 (Te1[(temp >> 24) ] & 0x000000ff) ^
661 rcon[i];
662 rk[5] = rk[1] ^ rk[4];
663 rk[6] = rk[2] ^ rk[5];
664 rk[7] = rk[3] ^ rk[6];
665 if (++i == 10) {
666 return 0;
667 }
668 rk += 4;
669 }
670 }
671 rk[4] = GETU32(userKey + 16);
672 rk[5] = GETU32(userKey + 20);
673 if (bits == 192) {
674 while (1) {
675 temp = rk[ 5];
676 rk[ 6] = rk[ 0] ^
677 (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
678 (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
679 (Te0[(temp ) & 0xff] & 0x0000ff00) ^
680 (Te1[(temp >> 24) ] & 0x000000ff) ^
681 rcon[i];
682 rk[ 7] = rk[ 1] ^ rk[ 6];
683 rk[ 8] = rk[ 2] ^ rk[ 7];
684 rk[ 9] = rk[ 3] ^ rk[ 8];
685 if (++i == 8) {
686 return 0;
687 }
688 rk[10] = rk[ 4] ^ rk[ 9];
689 rk[11] = rk[ 5] ^ rk[10];
690 rk += 6;
691 }
692 }
693 rk[6] = GETU32(userKey + 24);
694 rk[7] = GETU32(userKey + 28);
695 if (bits == 256) {
696 while (1) {
697 temp = rk[ 7];
698 rk[ 8] = rk[ 0] ^
699 (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
700 (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
701 (Te0[(temp ) & 0xff] & 0x0000ff00) ^
702 (Te1[(temp >> 24) ] & 0x000000ff) ^
703 rcon[i];
704 rk[ 9] = rk[ 1] ^ rk[ 8];
705 rk[10] = rk[ 2] ^ rk[ 9];
706 rk[11] = rk[ 3] ^ rk[10];
707 if (++i == 7) {
708 return 0;
709 }
710 temp = rk[11];
711 rk[12] = rk[ 4] ^
712 (Te2[(temp >> 24) ] & 0xff000000) ^
713 (Te3[(temp >> 16) & 0xff] & 0x00ff0000) ^
714 (Te0[(temp >> 8) & 0xff] & 0x0000ff00) ^
715 (Te1[(temp ) & 0xff] & 0x000000ff);
716 rk[13] = rk[ 5] ^ rk[12];
717 rk[14] = rk[ 6] ^ rk[13];
718 rk[15] = rk[ 7] ^ rk[14];
719
720 rk += 8;
721 }
722 }
723 return 0;
724}
725
726/**
727 * Expand the cipher key into the decryption key schedule.
728 */
729int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
730 AES_KEY *key) {
731
732 u32 *rk;
733 int i, j, status;
734 u32 temp;
735
736 /* first, start with an encryption schedule */
737 status = private_AES_set_encrypt_key(userKey, bits, key);
738 if (status < 0)
739 return status;
740
741 rk = key->rd_key;
742
743 /* invert the order of the round keys: */
744 for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
745 temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
746 temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
747 temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
748 temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
749 }
750 /* apply the inverse MixColumn transform to all round keys but the first and the last: */
751 for (i = 1; i < (key->rounds); i++) {
752 rk += 4;
753 rk[0] =
754 Td0[Te1[(rk[0] >> 24) ] & 0xff] ^
755 Td1[Te1[(rk[0] >> 16) & 0xff] & 0xff] ^
756 Td2[Te1[(rk[0] >> 8) & 0xff] & 0xff] ^
757 Td3[Te1[(rk[0] ) & 0xff] & 0xff];
758 rk[1] =
759 Td0[Te1[(rk[1] >> 24) ] & 0xff] ^
760 Td1[Te1[(rk[1] >> 16) & 0xff] & 0xff] ^
761 Td2[Te1[(rk[1] >> 8) & 0xff] & 0xff] ^
762 Td3[Te1[(rk[1] ) & 0xff] & 0xff];
763 rk[2] =
764 Td0[Te1[(rk[2] >> 24) ] & 0xff] ^
765 Td1[Te1[(rk[2] >> 16) & 0xff] & 0xff] ^
766 Td2[Te1[(rk[2] >> 8) & 0xff] & 0xff] ^
767 Td3[Te1[(rk[2] ) & 0xff] & 0xff];
768 rk[3] =
769 Td0[Te1[(rk[3] >> 24) ] & 0xff] ^
770 Td1[Te1[(rk[3] >> 16) & 0xff] & 0xff] ^
771 Td2[Te1[(rk[3] >> 8) & 0xff] & 0xff] ^
772 Td3[Te1[(rk[3] ) & 0xff] & 0xff];
773 }
774 return 0;
775}
776
777/*
778 * Encrypt a single block
779 * in and out can overlap
780 */
781void AES_encrypt(const unsigned char *in, unsigned char *out,
782 const AES_KEY *key) {
783
784 const u32 *rk;
785 u32 s0, s1, s2, s3, t0, t1, t2, t3;
786#ifndef FULL_UNROLL
787 int r;
788#endif /* ?FULL_UNROLL */
789
790 assert(in && out && key);
791 rk = key->rd_key;
792
793 /*
794 * map byte array block to cipher state
795 * and add initial round key:
796 */
797 s0 = GETU32(in ) ^ rk[0];
798 s1 = GETU32(in + 4) ^ rk[1];
799 s2 = GETU32(in + 8) ^ rk[2];
800 s3 = GETU32(in + 12) ^ rk[3];
801#ifdef FULL_UNROLL
802 /* round 1: */
803 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
804 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
805 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
806 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
807 /* round 2: */
808 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
809 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
810 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
811 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
812 /* round 3: */
813 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
814 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
815 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
816 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
817 /* round 4: */
818 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
819 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
820 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
821 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
822 /* round 5: */
823 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
824 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
825 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
826 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
827 /* round 6: */
828 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
829 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
830 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
831 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
832 /* round 7: */
833 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
834 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
835 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
836 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
837 /* round 8: */
838 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
839 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
840 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
841 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
842 /* round 9: */
843 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
844 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
845 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
846 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
847 if (key->rounds > 10) {
848 /* round 10: */
849 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
850 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
851 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
852 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
853 /* round 11: */
854 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
855 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
856 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
857 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
858 if (key->rounds > 12) {
859 /* round 12: */
860 s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
861 s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
862 s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
863 s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
864 /* round 13: */
865 t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
866 t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
867 t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
868 t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
869 }
870 }
871 rk += key->rounds << 2;
872#else /* !FULL_UNROLL */
873 /*
874 * Nr - 1 full rounds:
875 */
876 r = key->rounds >> 1;
877 for (;;) {
878 t0 =
879 Te0[(s0 >> 24) ] ^
880 Te1[(s1 >> 16) & 0xff] ^
881 Te2[(s2 >> 8) & 0xff] ^
882 Te3[(s3 ) & 0xff] ^
883 rk[4];
884 t1 =
885 Te0[(s1 >> 24) ] ^
886 Te1[(s2 >> 16) & 0xff] ^
887 Te2[(s3 >> 8) & 0xff] ^
888 Te3[(s0 ) & 0xff] ^
889 rk[5];
890 t2 =
891 Te0[(s2 >> 24) ] ^
892 Te1[(s3 >> 16) & 0xff] ^
893 Te2[(s0 >> 8) & 0xff] ^
894 Te3[(s1 ) & 0xff] ^
895 rk[6];
896 t3 =
897 Te0[(s3 >> 24) ] ^
898 Te1[(s0 >> 16) & 0xff] ^
899 Te2[(s1 >> 8) & 0xff] ^
900 Te3[(s2 ) & 0xff] ^
901 rk[7];
902
903 rk += 8;
904 if (--r == 0) {
905 break;
906 }
907
908 s0 =
909 Te0[(t0 >> 24) ] ^
910 Te1[(t1 >> 16) & 0xff] ^
911 Te2[(t2 >> 8) & 0xff] ^
912 Te3[(t3 ) & 0xff] ^
913 rk[0];
914 s1 =
915 Te0[(t1 >> 24) ] ^
916 Te1[(t2 >> 16) & 0xff] ^
917 Te2[(t3 >> 8) & 0xff] ^
918 Te3[(t0 ) & 0xff] ^
919 rk[1];
920 s2 =
921 Te0[(t2 >> 24) ] ^
922 Te1[(t3 >> 16) & 0xff] ^
923 Te2[(t0 >> 8) & 0xff] ^
924 Te3[(t1 ) & 0xff] ^
925 rk[2];
926 s3 =
927 Te0[(t3 >> 24) ] ^
928 Te1[(t0 >> 16) & 0xff] ^
929 Te2[(t1 >> 8) & 0xff] ^
930 Te3[(t2 ) & 0xff] ^
931 rk[3];
932 }
933#endif /* ?FULL_UNROLL */
934 /*
935 * apply last round and
936 * map cipher state to byte array block:
937 */
938 s0 =
939 (Te2[(t0 >> 24) ] & 0xff000000) ^
940 (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^
941 (Te0[(t2 >> 8) & 0xff] & 0x0000ff00) ^
942 (Te1[(t3 ) & 0xff] & 0x000000ff) ^
943 rk[0];
944 PUTU32(out , s0);
945 s1 =
946 (Te2[(t1 >> 24) ] & 0xff000000) ^
947 (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^
948 (Te0[(t3 >> 8) & 0xff] & 0x0000ff00) ^
949 (Te1[(t0 ) & 0xff] & 0x000000ff) ^
950 rk[1];
951 PUTU32(out + 4, s1);
952 s2 =
953 (Te2[(t2 >> 24) ] & 0xff000000) ^
954 (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^
955 (Te0[(t0 >> 8) & 0xff] & 0x0000ff00) ^
956 (Te1[(t1 ) & 0xff] & 0x000000ff) ^
957 rk[2];
958 PUTU32(out + 8, s2);
959 s3 =
960 (Te2[(t3 >> 24) ] & 0xff000000) ^
961 (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^
962 (Te0[(t1 >> 8) & 0xff] & 0x0000ff00) ^
963 (Te1[(t2 ) & 0xff] & 0x000000ff) ^
964 rk[3];
965 PUTU32(out + 12, s3);
966}
967
968/*
969 * Decrypt a single block
970 * in and out can overlap
971 */
972void AES_decrypt(const unsigned char *in, unsigned char *out,
973 const AES_KEY *key) {
974
975 const u32 *rk;
976 u32 s0, s1, s2, s3, t0, t1, t2, t3;
977#ifndef FULL_UNROLL
978 int r;
979#endif /* ?FULL_UNROLL */
980
981 assert(in && out && key);
982 rk = key->rd_key;
983
984 /*
985 * map byte array block to cipher state
986 * and add initial round key:
987 */
988 s0 = GETU32(in ) ^ rk[0];
989 s1 = GETU32(in + 4) ^ rk[1];
990 s2 = GETU32(in + 8) ^ rk[2];
991 s3 = GETU32(in + 12) ^ rk[3];
992#ifdef FULL_UNROLL
993 /* round 1: */
994 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
995 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
996 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
997 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
998 /* round 2: */
999 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
1000 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
1001 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
1002 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
1003 /* round 3: */
1004 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
1005 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
1006 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
1007 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
1008 /* round 4: */
1009 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
1010 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
1011 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
1012 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
1013 /* round 5: */
1014 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
1015 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
1016 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
1017 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
1018 /* round 6: */
1019 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
1020 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
1021 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
1022 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
1023 /* round 7: */
1024 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
1025 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
1026 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
1027 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
1028 /* round 8: */
1029 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
1030 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
1031 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
1032 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
1033 /* round 9: */
1034 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
1035 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
1036 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
1037 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
1038 if (key->rounds > 10) {
1039 /* round 10: */
1040 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
1041 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
1042 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
1043 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
1044 /* round 11: */
1045 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
1046 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
1047 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
1048 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
1049 if (key->rounds > 12) {
1050 /* round 12: */
1051 s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
1052 s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
1053 s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
1054 s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
1055 /* round 13: */
1056 t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
1057 t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
1058 t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
1059 t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
1060 }
1061 }
1062 rk += key->rounds << 2;
1063#else /* !FULL_UNROLL */
1064 /*
1065 * Nr - 1 full rounds:
1066 */
1067 r = key->rounds >> 1;
1068 for (;;) {
1069 t0 =
1070 Td0[(s0 >> 24) ] ^
1071 Td1[(s3 >> 16) & 0xff] ^
1072 Td2[(s2 >> 8) & 0xff] ^
1073 Td3[(s1 ) & 0xff] ^
1074 rk[4];
1075 t1 =
1076 Td0[(s1 >> 24) ] ^
1077 Td1[(s0 >> 16) & 0xff] ^
1078 Td2[(s3 >> 8) & 0xff] ^
1079 Td3[(s2 ) & 0xff] ^
1080 rk[5];
1081 t2 =
1082 Td0[(s2 >> 24) ] ^
1083 Td1[(s1 >> 16) & 0xff] ^
1084 Td2[(s0 >> 8) & 0xff] ^
1085 Td3[(s3 ) & 0xff] ^
1086 rk[6];
1087 t3 =
1088 Td0[(s3 >> 24) ] ^
1089 Td1[(s2 >> 16) & 0xff] ^
1090 Td2[(s1 >> 8) & 0xff] ^
1091 Td3[(s0 ) & 0xff] ^
1092 rk[7];
1093
1094 rk += 8;
1095 if (--r == 0) {
1096 break;
1097 }
1098
1099 s0 =
1100 Td0[(t0 >> 24) ] ^
1101 Td1[(t3 >> 16) & 0xff] ^
1102 Td2[(t2 >> 8) & 0xff] ^
1103 Td3[(t1 ) & 0xff] ^
1104 rk[0];
1105 s1 =
1106 Td0[(t1 >> 24) ] ^
1107 Td1[(t0 >> 16) & 0xff] ^
1108 Td2[(t3 >> 8) & 0xff] ^
1109 Td3[(t2 ) & 0xff] ^
1110 rk[1];
1111 s2 =
1112 Td0[(t2 >> 24) ] ^
1113 Td1[(t1 >> 16) & 0xff] ^
1114 Td2[(t0 >> 8) & 0xff] ^
1115 Td3[(t3 ) & 0xff] ^
1116 rk[2];
1117 s3 =
1118 Td0[(t3 >> 24) ] ^
1119 Td1[(t2 >> 16) & 0xff] ^
1120 Td2[(t1 >> 8) & 0xff] ^
1121 Td3[(t0 ) & 0xff] ^
1122 rk[3];
1123 }
1124#endif /* ?FULL_UNROLL */
1125 /*
1126 * apply last round and
1127 * map cipher state to byte array block:
1128 */
1129 s0 =
1130 (Td4[(t0 >> 24) ] << 24) ^
1131 (Td4[(t3 >> 16) & 0xff] << 16) ^
1132 (Td4[(t2 >> 8) & 0xff] << 8) ^
1133 (Td4[(t1 ) & 0xff]) ^
1134 rk[0];
1135 PUTU32(out , s0);
1136 s1 =
1137 (Td4[(t1 >> 24) ] << 24) ^
1138 (Td4[(t0 >> 16) & 0xff] << 16) ^
1139 (Td4[(t3 >> 8) & 0xff] << 8) ^
1140 (Td4[(t2 ) & 0xff]) ^
1141 rk[1];
1142 PUTU32(out + 4, s1);
1143 s2 =
1144 (Td4[(t2 >> 24) ] << 24) ^
1145 (Td4[(t1 >> 16) & 0xff] << 16) ^
1146 (Td4[(t0 >> 8) & 0xff] << 8) ^
1147 (Td4[(t3 ) & 0xff]) ^
1148 rk[2];
1149 PUTU32(out + 8, s2);
1150 s3 =
1151 (Td4[(t3 >> 24) ] << 24) ^
1152 (Td4[(t2 >> 16) & 0xff] << 16) ^
1153 (Td4[(t1 >> 8) & 0xff] << 8) ^
1154 (Td4[(t0 ) & 0xff]) ^
1155 rk[3];
1156 PUTU32(out + 12, s3);
1157}
1158
1159#else /* AES_ASM */
1160
/*
 * Te4: the AES forward S-box as plain bytes.  In this AES_ASM build it is
 * used only by the key-expansion routines below (the bulk cipher rounds
 * are implemented in assembly).
 */
static const u8 Te4[256] = {
	0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
	0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
	0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
	0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
	0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
	0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
	0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
	0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
	0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
	0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
	0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
	0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
	0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
	0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
	0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
	0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
	0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
	0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
	0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
	0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
	0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
	0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
	0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
	0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
	0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
	0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
	0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
	0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
	0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
	0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
	0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
	0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
};
/*
 * Round-constant words for the key schedule, already aligned into the
 * most significant byte of each u32 (XORed into the first word of each
 * expanded key group below).
 */
static const u32 rcon[] = {
	0x01000000, 0x02000000, 0x04000000, 0x08000000,
	0x10000000, 0x20000000, 0x40000000, 0x80000000,
	0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};
1200
/**
 * Expand the cipher key into the encryption key schedule.
 *
 * userKey: bits/8 bytes of key material, read big-endian (GETU32).
 * bits:    key length in bits; must be 128, 192 or 256.
 * key:     receives the round keys in key->rd_key and the round count
 *          in key->rounds (10, 12 or 14 respectively).
 *
 * Returns 0 on success, -1 on a NULL argument, -2 on an unsupported
 * key length.
 */
int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
	AES_KEY *key) {
	u32 *rk;
	int i = 0;
	u32 temp;

	if (!userKey || !key)
		return -1;
	if (bits != 128 && bits != 192 && bits != 256)
		return -2;

	rk = key->rd_key;

	if (bits==128)
		key->rounds = 10;
	else if (bits==192)
		key->rounds = 12;
	else
		key->rounds = 14;

	/* The first Nk round-key words are the raw key itself. */
	rk[0] = GETU32(userKey     );
	rk[1] = GETU32(userKey +  4);
	rk[2] = GETU32(userKey +  8);
	rk[3] = GETU32(userKey + 12);
	if (bits == 128) {
		while (1) {
			temp  = rk[3];
			/*
			 * SubWord(RotWord(temp)) ^ rcon[i]: the byte
			 * routing (>>16 to <<24, etc.) performs the
			 * rotate-by-one-byte while each byte passes
			 * through the Te4 S-box.
			 */
			rk[4] = rk[0] ^
				(Te4[(temp >> 16) & 0xff] << 24) ^
				(Te4[(temp >>  8) & 0xff] << 16) ^
				(Te4[(temp      ) & 0xff] <<  8) ^
				(Te4[(temp >> 24)       ]) ^
				rcon[i];
			/* remaining words are plain XOR chains */
			rk[5] = rk[1] ^ rk[4];
			rk[6] = rk[2] ^ rk[5];
			rk[7] = rk[3] ^ rk[6];
			if (++i == 10) {
				return 0;
			}
			rk += 4;
		}
	}
	rk[4] = GETU32(userKey + 16);
	rk[5] = GETU32(userKey + 20);
	if (bits == 192) {
		while (1) {
			temp = rk[ 5];
			rk[ 6] = rk[ 0] ^
				(Te4[(temp >> 16) & 0xff] << 24) ^
				(Te4[(temp >>  8) & 0xff] << 16) ^
				(Te4[(temp      ) & 0xff] <<  8) ^
				(Te4[(temp >> 24)       ]) ^
				rcon[i];
			rk[ 7] = rk[ 1] ^ rk[ 6];
			rk[ 8] = rk[ 2] ^ rk[ 7];
			rk[ 9] = rk[ 3] ^ rk[ 8];
			/* 8 iterations * 6 words covers 13 round keys */
			if (++i == 8) {
				return 0;
			}
			rk[10] = rk[ 4] ^ rk[ 9];
			rk[11] = rk[ 5] ^ rk[10];
			rk += 6;
		}
	}
	rk[6] = GETU32(userKey + 24);
	rk[7] = GETU32(userKey + 28);
	if (bits == 256) {
		while (1) {
			temp = rk[ 7];
			rk[ 8] = rk[ 0] ^
				(Te4[(temp >> 16) & 0xff] << 24) ^
				(Te4[(temp >>  8) & 0xff] << 16) ^
				(Te4[(temp      ) & 0xff] <<  8) ^
				(Te4[(temp >> 24)       ]) ^
				rcon[i];
			rk[ 9] = rk[ 1] ^ rk[ 8];
			rk[10] = rk[ 2] ^ rk[ 9];
			rk[11] = rk[ 3] ^ rk[10];
			if (++i == 7) {
				return 0;
			}
			temp = rk[11];
			/* 256-bit keys apply a second SubWord (no rotate)
			 * mid-group, per the AES key schedule */
			rk[12] = rk[ 4] ^
				(Te4[(temp >> 24)       ] << 24) ^
				(Te4[(temp >> 16) & 0xff] << 16) ^
				(Te4[(temp >>  8) & 0xff] <<  8) ^
				(Te4[(temp      ) & 0xff]);
			rk[13] = rk[ 5] ^ rk[12];
			rk[14] = rk[ 6] ^ rk[13];
			rk[15] = rk[ 7] ^ rk[14];

			rk += 8;
		}
	}
	return 0;
}
1300
/**
 * Expand the cipher key into the decryption key schedule.
 *
 * Builds the encryption schedule first, then converts it for the
 * equivalent inverse cipher: the round keys are reversed and all but
 * the first and last get the inverse MixColumns transform applied.
 *
 * Returns 0 on success, or the negative status from
 * private_AES_set_encrypt_key() (-1 NULL argument, -2 bad key size).
 */
int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
	AES_KEY *key) {

	u32 *rk;
	int i, j, status;
	u32 temp;

	/* first, start with an encryption schedule */
	status = private_AES_set_encrypt_key(userKey, bits, key);
	if (status < 0)
		return status;

	rk = key->rd_key;

	/* invert the order of the round keys: */
	for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
		temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
		temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
		temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
		temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
	}
	/* apply the inverse MixColumn transform to all round keys but the first and the last: */
	for (i = 1; i < (key->rounds); i++) {
		rk += 4;
		for (j = 0; j < 4; j++) {
			u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;

			/*
			 * SIMD-within-a-register GF(2^8) arithmetic: tp2,
			 * tp4, tp8 are the four bytes of tp1 multiplied by
			 * 2, 4 and 8 in parallel.  Each step masks off the
			 * high bits, shifts, and folds any overflow back
			 * with the AES reduction polynomial (0x1b per
			 * byte); (m - (m >> 7)) turns each set high bit
			 * into a 0xff-style byte mask.
			 */
			tp1 = rk[j];
			m = tp1 & 0x80808080;
			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			m = tp2 & 0x80808080;
			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			m = tp4 & 0x80808080;
			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			/* InvMixColumns coefficients: 9 = 8^1, b = 9^2,
			 * d = 9^4, e = 8^4^2 */
			tp9 = tp8 ^ tp1;
			tpb = tp9 ^ tp2;
			tpd = tp9 ^ tp4;
			tpe = tp8 ^ tp4 ^ tp2;
#if defined(ROTATE)
			rk[j] = tpe ^ ROTATE(tpd,16) ^
				ROTATE(tp9,24) ^ ROTATE(tpb,8);
#else
			/* same rotations spelled out as shift pairs */
			rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
				(tp9 >> 8) ^ (tp9 << 24) ^
				(tpb >> 24) ^ (tpb << 8);
#endif
		}
	}
	return 0;
}
1357
1358#endif /* AES_ASM */
diff --git a/src/lib/libcrypto/aes/aes_ctr.c b/src/lib/libcrypto/aes/aes_ctr.c
deleted file mode 100644
index 7c9d165d8a..0000000000
--- a/src/lib/libcrypto/aes/aes_ctr.c
+++ /dev/null
@@ -1,61 +0,0 @@
1/* crypto/aes/aes_ctr.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/modes.h>
54
55void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
56 size_t length, const AES_KEY *key,
57 unsigned char ivec[AES_BLOCK_SIZE],
58 unsigned char ecount_buf[AES_BLOCK_SIZE],
59 unsigned int *num) {
60 CRYPTO_ctr128_encrypt(in,out,length,key,ivec,ecount_buf,num,(block128_f)AES_encrypt);
61}
diff --git a/src/lib/libcrypto/aes/aes_ecb.c b/src/lib/libcrypto/aes/aes_ecb.c
deleted file mode 100644
index 28aa561c2d..0000000000
--- a/src/lib/libcrypto/aes/aes_ecb.c
+++ /dev/null
@@ -1,73 +0,0 @@
1/* crypto/aes/aes_ecb.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#ifndef AES_DEBUG
53# ifndef NDEBUG
54# define NDEBUG
55# endif
56#endif
57#include <assert.h>
58
59#include <openssl/aes.h>
60#include "aes_locl.h"
61
62void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
63 const AES_KEY *key, const int enc) {
64
65 assert(in && out && key);
66 assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc));
67
68 if (AES_ENCRYPT == enc)
69 AES_encrypt(in, out, key);
70 else
71 AES_decrypt(in, out, key);
72}
73
diff --git a/src/lib/libcrypto/aes/aes_ige.c b/src/lib/libcrypto/aes/aes_ige.c
deleted file mode 100644
index c161351e65..0000000000
--- a/src/lib/libcrypto/aes/aes_ige.c
+++ /dev/null
@@ -1,323 +0,0 @@
1/* crypto/aes/aes_ige.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include "cryptlib.h"
53
54#include <openssl/aes.h>
55#include "aes_locl.h"
56
/* One AES block (AES_BLOCK_SIZE bytes) viewed as an array of machine
 * words, so the IGE loops below can XOR whole blocks a word at a time. */
#define N_WORDS (AES_BLOCK_SIZE / sizeof(unsigned long))
typedef struct {
	unsigned long data[N_WORDS];
} aes_block_t;

/* XXX: probably some better way to do this */
#if defined(__i386__) || defined(__x86_64__)
#define UNALIGNED_MEMOPS_ARE_FAST 1
#else
#define UNALIGNED_MEMOPS_ARE_FAST 0
#endif

/* Whole-block load/store: direct struct assignment where unaligned
 * accesses are cheap, memcpy() everywhere else. */
#if UNALIGNED_MEMOPS_ARE_FAST
#define load_block(d, s) (d) = *(const aes_block_t *)(s)
#define store_block(d, s) *(aes_block_t *)(d) = (s)
#else
#define load_block(d, s) memcpy((d).data, (s), AES_BLOCK_SIZE)
#define store_block(d, s) memcpy((d), (s).data, AES_BLOCK_SIZE)
#endif
76
77/* N.B. The IV for this mode is _twice_ the block size */
78
/*
 * AES in IGE (Infinite Garble Extension) mode.
 *
 * Chaining, as implemented in the loops below:
 *   cipher_i = E(plain_i ^ cipher_{i-1}) ^ plain_{i-1}
 * so the IV is two blocks wide: ivec[0..15] seeds the "previous
 * ciphertext" and ivec[16..31] the "previous plaintext".  Both halves
 * are written back into ivec on return so calls can be chained.
 * length must be a multiple of AES_BLOCK_SIZE (asserted).
 */
void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
	size_t length, const AES_KEY *key,
	unsigned char *ivec, const int enc)
	{
	size_t n;
	size_t len = length;

	OPENSSL_assert(in && out && key && ivec);
	OPENSSL_assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc));
	OPENSSL_assert((length%AES_BLOCK_SIZE) == 0);

	len = length / AES_BLOCK_SIZE;	/* len counts blocks from here on */

	if (AES_ENCRYPT == enc)
		{
		/*
		 * Fast path: distinct buffers that are word-aligned (or a
		 * target where unaligned word access is cheap) are walked
		 * in place as aes_block_t word arrays and chained by
		 * pointer.  NOTE(review): the casts access the buffers as
		 * unsigned long — presumably fine on the platforms this
		 * path is enabled for, but it is a strict-aliasing gray
		 * area; confirm before reusing elsewhere.
		 */
		if (in != out &&
		    (UNALIGNED_MEMOPS_ARE_FAST || ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(long)==0))
			{
			aes_block_t *ivp = (aes_block_t *)ivec;
			aes_block_t *iv2p = (aes_block_t *)(ivec + AES_BLOCK_SIZE);

			while (len)
				{
				aes_block_t *inp = (aes_block_t *)in;
				aes_block_t *outp = (aes_block_t *)out;

				for(n=0 ; n < N_WORDS; ++n)
					outp->data[n] = inp->data[n] ^ ivp->data[n];
				AES_encrypt((unsigned char *)outp->data, (unsigned char *)outp->data, key);
				for(n=0 ; n < N_WORDS; ++n)
					outp->data[n] ^= iv2p->data[n];
				/* chain: this ciphertext / this plaintext */
				ivp = outp;
				iv2p = inp;
				--len;
				in += AES_BLOCK_SIZE;
				out += AES_BLOCK_SIZE;
				}
			/* hand the final chaining values back to the caller */
			memcpy(ivec, ivp->data, AES_BLOCK_SIZE);
			memcpy(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
			}
		else
			{
			/* Slow path (overlapping or misaligned buffers):
			 * copy through local temporaries instead. */
			aes_block_t tmp, tmp2;
			aes_block_t iv;
			aes_block_t iv2;

			load_block(iv, ivec);
			load_block(iv2, ivec + AES_BLOCK_SIZE);

			while (len)
				{
				load_block(tmp, in);
				for(n=0 ; n < N_WORDS; ++n)
					tmp2.data[n] = tmp.data[n] ^ iv.data[n];
				AES_encrypt((unsigned char *)tmp2.data, (unsigned char *)tmp2.data, key);
				for(n=0 ; n < N_WORDS; ++n)
					tmp2.data[n] ^= iv2.data[n];
				store_block(out, tmp2);
				iv = tmp2;	/* this ciphertext */
				iv2 = tmp;	/* this plaintext */
				--len;
				in += AES_BLOCK_SIZE;
				out += AES_BLOCK_SIZE;
				}
			memcpy(ivec, iv.data, AES_BLOCK_SIZE);
			memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
			}
		}
	else
		{
		/* Decryption mirrors the above with the XOR order swapped:
		 * plain_i = D(cipher_i ^ plain_{i-1}) ^ cipher_{i-1}. */
		if (in != out &&
		    (UNALIGNED_MEMOPS_ARE_FAST || ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(long)==0))
			{
			aes_block_t *ivp = (aes_block_t *)ivec;
			aes_block_t *iv2p = (aes_block_t *)(ivec + AES_BLOCK_SIZE);

			while (len)
				{
				aes_block_t tmp;
				aes_block_t *inp = (aes_block_t *)in;
				aes_block_t *outp = (aes_block_t *)out;

				for(n=0 ; n < N_WORDS; ++n)
					tmp.data[n] = inp->data[n] ^ iv2p->data[n];
				AES_decrypt((unsigned char *)tmp.data, (unsigned char *)outp->data, key);
				for(n=0 ; n < N_WORDS; ++n)
					outp->data[n] ^= ivp->data[n];
				ivp = inp;	/* this ciphertext */
				iv2p = outp;	/* recovered plaintext */
				--len;
				in += AES_BLOCK_SIZE;
				out += AES_BLOCK_SIZE;
				}
			memcpy(ivec, ivp->data, AES_BLOCK_SIZE);
			memcpy(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
			}
		else
			{
			aes_block_t tmp, tmp2;
			aes_block_t iv;
			aes_block_t iv2;

			load_block(iv, ivec);
			load_block(iv2, ivec + AES_BLOCK_SIZE);

			while (len)
				{
				load_block(tmp, in);
				tmp2 = tmp;	/* keep ciphertext for chaining */
				for(n=0 ; n < N_WORDS; ++n)
					tmp.data[n] ^= iv2.data[n];
				AES_decrypt((unsigned char *)tmp.data, (unsigned char *)tmp.data, key);
				for(n=0 ; n < N_WORDS; ++n)
					tmp.data[n] ^= iv.data[n];
				store_block(out, tmp);
				iv = tmp2;
				iv2 = tmp;
				--len;
				in += AES_BLOCK_SIZE;
				out += AES_BLOCK_SIZE;
				}
			memcpy(ivec, iv.data, AES_BLOCK_SIZE);
			memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
			}
		}
	}
205
206/*
 * Note that it's effectively impossible to do biIGE in anything other
208 * than a single pass, so no provision is made for chaining.
209 */
210
211/* N.B. The IV for this mode is _four times_ the block size */
212
/*
 * Bidirectional IGE: a forward IGE pass over the buffer followed by a
 * backward pass (decryption runs the passes in reverse order).  The IV
 * is four blocks wide — two chaining blocks per pass.  ivec is not
 * updated (the header comment above notes chaining is impractical).
 *
 * NOTE(review): the key2 parameter is accepted but never referenced in
 * this body — both passes call AES_encrypt/AES_decrypt with `key`, and
 * the assert below does not check key2 either.  This matches the code
 * as shipped; confirm intent before relying on two-key operation.
 */
void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out,
	size_t length, const AES_KEY *key,
	const AES_KEY *key2, const unsigned char *ivec,
	const int enc)
	{
	size_t n;
	size_t len = length;
	unsigned char tmp[AES_BLOCK_SIZE];
	unsigned char tmp2[AES_BLOCK_SIZE];
	unsigned char tmp3[AES_BLOCK_SIZE];
	unsigned char prev[AES_BLOCK_SIZE];
	const unsigned char *iv;
	const unsigned char *iv2;

	OPENSSL_assert(in && out && key && ivec);
	OPENSSL_assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc));
	OPENSSL_assert((length%AES_BLOCK_SIZE) == 0);

	if (AES_ENCRYPT == enc)
		{
		/* XXX: Do a separate case for when in != out (strictly should
		   check for overlap, too) */

		/* First the forward pass */
		iv = ivec;
		iv2 = ivec + AES_BLOCK_SIZE;
		while (len >= AES_BLOCK_SIZE)
			{
			for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
				out[n] = in[n] ^ iv[n];
			AES_encrypt(out, out, key);
			for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
				out[n] ^= iv2[n];
			iv = out;
			/* prev must be copied: in may alias out */
			memcpy(prev, in, AES_BLOCK_SIZE);
			iv2 = prev;
			len -= AES_BLOCK_SIZE;
			in += AES_BLOCK_SIZE;
			out += AES_BLOCK_SIZE;
			}

		/* And now backwards */
		iv = ivec + AES_BLOCK_SIZE*2;
		iv2 = ivec + AES_BLOCK_SIZE*3;
		len = length;
		while(len >= AES_BLOCK_SIZE)
			{
			out -= AES_BLOCK_SIZE;
			/* XXX: reduce copies by alternating between buffers */
			memcpy(tmp, out, AES_BLOCK_SIZE);
			for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
				out[n] ^= iv[n];
			/* hexdump(stdout, "out ^ iv", out, AES_BLOCK_SIZE); */
			AES_encrypt(out, out, key);
			/* hexdump(stdout,"enc", out, AES_BLOCK_SIZE); */
			/* hexdump(stdout,"iv2", iv2, AES_BLOCK_SIZE); */
			for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
				out[n] ^= iv2[n];
			/* hexdump(stdout,"out", out, AES_BLOCK_SIZE); */
			iv = out;
			memcpy(prev, tmp, AES_BLOCK_SIZE);
			iv2 = prev;
			len -= AES_BLOCK_SIZE;
			}
		}
	else
		{
		/* First backwards */
		iv = ivec + AES_BLOCK_SIZE*2;
		iv2 = ivec + AES_BLOCK_SIZE*3;
		in += length;
		out += length;
		while (len >= AES_BLOCK_SIZE)
			{
			in -= AES_BLOCK_SIZE;
			out -= AES_BLOCK_SIZE;
			memcpy(tmp, in, AES_BLOCK_SIZE);
			memcpy(tmp2, in, AES_BLOCK_SIZE);
			for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
				tmp[n] ^= iv2[n];
			AES_decrypt(tmp, out, key);
			for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
				out[n] ^= iv[n];
			memcpy(tmp3, tmp2, AES_BLOCK_SIZE);
			iv = tmp3;
			iv2 = out;
			len -= AES_BLOCK_SIZE;
			}

		/* And now forwards */
		iv = ivec;
		iv2 = ivec + AES_BLOCK_SIZE;
		len = length;
		while (len >= AES_BLOCK_SIZE)
			{
			memcpy(tmp, out, AES_BLOCK_SIZE);
			memcpy(tmp2, out, AES_BLOCK_SIZE);
			for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
				tmp[n] ^= iv2[n];
			AES_decrypt(tmp, out, key);
			for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
				out[n] ^= iv[n];
			memcpy(tmp3, tmp2, AES_BLOCK_SIZE);
			iv = tmp3;
			iv2 = out;
			len -= AES_BLOCK_SIZE;
			in += AES_BLOCK_SIZE;
			out += AES_BLOCK_SIZE;
			}
		}
	}
diff --git a/src/lib/libcrypto/aes/aes_locl.h b/src/lib/libcrypto/aes/aes_locl.h
deleted file mode 100644
index 054b442d41..0000000000
--- a/src/lib/libcrypto/aes/aes_locl.h
+++ /dev/null
@@ -1,89 +0,0 @@
1/* crypto/aes/aes.h -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
#ifndef HEADER_AES_LOCL_H
#define HEADER_AES_LOCL_H

#include <openssl/e_os2.h>

#ifdef OPENSSL_NO_AES
#error AES is disabled.
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * GETU32/PUTU32: load/store a 32-bit word in big-endian byte order.
 * On MSVC/x86 targets this is a rotate-based byte swap applied through
 * a u32 pointer; the portable fallback moves one byte at a time, so it
 * has no alignment requirement.
 */
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
# define SWAP(x) (_lrotl(x, 8) & 0x00ff00ff | _lrotr(x, 8) & 0xff00ff00)
# define GETU32(p) SWAP(*((u32 *)(p)))
# define PUTU32(ct, st) { *((u32 *)(ct)) = SWAP((st)); }
#else
# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3]))
# define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); }
#endif

/* u32 only needs to hold 32 bits; AES_LONG selects unsigned long for
 * platforms that prefer it. */
#ifdef AES_LONG
typedef unsigned long u32;
#else
typedef unsigned int u32;
#endif
typedef unsigned short u16;
typedef unsigned char u8;

/* Maximum key schedule dimensions (256-bit key): 8 key words, 32 key
 * bytes, 14 rounds. */
#define MAXKC (256/32)
#define MAXKB (256/8)
#define MAXNR 14

/* This controls loop-unrolling in aes_core.c */
#undef FULL_UNROLL

#endif /* !HEADER_AES_LOCL_H */
diff --git a/src/lib/libcrypto/aes/aes_misc.c b/src/lib/libcrypto/aes/aes_misc.c
deleted file mode 100644
index f083488ecb..0000000000
--- a/src/lib/libcrypto/aes/aes_misc.c
+++ /dev/null
@@ -1,85 +0,0 @@
1/* crypto/aes/aes_misc.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/opensslv.h>
53#include <openssl/crypto.h>
54#include <openssl/aes.h>
55#include "aes_locl.h"
56
/* Version banner string for the AES module. */
const char AES_version[]="AES" OPENSSL_VERSION_PTEXT;
58
/*
 * Report the compile-time flavour of the AES implementation:
 * "aes(full)" when FULL_UNROLL is defined, "aes(partial)" otherwise.
 */
const char *
AES_options(void)
{
#ifdef FULL_UNROLL
	return "aes(full)";
#else
	return "aes(partial)";
#endif
}
66
67/* FIPS wrapper functions to block low level AES calls in FIPS mode */
68
69int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
70 AES_KEY *key)
71 {
72#ifdef OPENSSL_FIPS
73 fips_cipher_abort(AES);
74#endif
75 return private_AES_set_encrypt_key(userKey, bits, key);
76 }
77
78int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
79 AES_KEY *key)
80 {
81#ifdef OPENSSL_FIPS
82 fips_cipher_abort(AES);
83#endif
84 return private_AES_set_decrypt_key(userKey, bits, key);
85 }
diff --git a/src/lib/libcrypto/aes/aes_ofb.c b/src/lib/libcrypto/aes/aes_ofb.c
deleted file mode 100644
index 50bf0b8325..0000000000
--- a/src/lib/libcrypto/aes/aes_ofb.c
+++ /dev/null
@@ -1,60 +0,0 @@
1/* crypto/aes/aes_ofb.c -*- mode:C; c-file-style: "eay" -*- */
2/* ====================================================================
3 * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/aes.h>
53#include <openssl/modes.h>
54
55void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
56 size_t length, const AES_KEY *key,
57 unsigned char *ivec, int *num)
58{
59 CRYPTO_ofb128_encrypt(in,out,length,key,ivec,num,(block128_f)AES_encrypt);
60}
diff --git a/src/lib/libcrypto/aes/aes_wrap.c b/src/lib/libcrypto/aes/aes_wrap.c
deleted file mode 100644
index e2d73d37ce..0000000000
--- a/src/lib/libcrypto/aes/aes_wrap.c
+++ /dev/null
@@ -1,259 +0,0 @@
1/* crypto/aes/aes_wrap.c */
2/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
3 * project.
4 */
5/* ====================================================================
6 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. All advertising materials mentioning features or use of this
21 * software must display the following acknowledgment:
22 * "This product includes software developed by the OpenSSL Project
23 * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24 *
25 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26 * endorse or promote products derived from this software without
27 * prior written permission. For written permission, please contact
28 * licensing@OpenSSL.org.
29 *
30 * 5. Products derived from this software may not be called "OpenSSL"
31 * nor may "OpenSSL" appear in their names without prior written
32 * permission of the OpenSSL Project.
33 *
34 * 6. Redistributions of any form whatsoever must retain the following
35 * acknowledgment:
36 * "This product includes software developed by the OpenSSL Project
37 * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50 * OF THE POSSIBILITY OF SUCH DAMAGE.
51 * ====================================================================
52 */
53
54#include "cryptlib.h"
55#include <openssl/aes.h>
56#include <openssl/bio.h>
57
/* 8-byte IV used by AES_wrap_key/AES_unwrap_key when the caller passes NULL. */
static const unsigned char default_iv[] = {
  0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
};
61
/*
 * AES key wrap: wraps inlen bytes of key material from in into out,
 * producing inlen + 8 bytes (8-byte integrity value prepended).
 * iv is the 8-byte integrity check value; NULL selects default_iv.
 * inlen must be a non-zero multiple of 8; returns -1 on bad length,
 * otherwise the output length inlen + 8.  out must have room for
 * inlen + 8 bytes.
 */
int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
		unsigned char *out,
		const unsigned char *in, unsigned int inlen)
	{
	unsigned char *A, B[16], *R;
	unsigned int i, j, t;
	/* Require a multiple of 8 and at least one 8-byte block. */
	if ((inlen & 0x7) || (inlen < 8))
		return -1;
	A = B;	/* A aliases the first half of the 16-byte scratch block B */
	t = 1;	/* step counter, folded into A after each encryption */
	memcpy(out + 8, in, inlen);	/* R blocks live in out[8..] */
	if (!iv)
		iv = default_iv;

	memcpy(A, iv, 8);	/* A starts as the IV */

	for (j = 0; j < 6; j++)
		{
		R = out + 8;
		for (i = 0; i < inlen; i += 8, t++, R += 8)
			{
			/* B = AES(A || R); then A = MSB64(B) ^ t, R = LSB64(B) */
			memcpy(B + 8, R, 8);
			AES_encrypt(B, B, key);
			A[7] ^= (unsigned char)(t & 0xff);
			if (t > 0xff)
				{
				A[6] ^= (unsigned char)((t >> 8) & 0xff);
				A[5] ^= (unsigned char)((t >> 16) & 0xff);
				A[4] ^= (unsigned char)((t >> 24) & 0xff);
				}
			memcpy(R, B + 8, 8);
			}
		}
	memcpy(out, A, 8);	/* prepend the final integrity value */
	return inlen + 8;
	}
98
99int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
100 unsigned char *out,
101 const unsigned char *in, unsigned int inlen)
102 {
103 unsigned char *A, B[16], *R;
104 unsigned int i, j, t;
105 inlen -= 8;
106 if (inlen & 0x7)
107 return -1;
108 if (inlen < 8)
109 return -1;
110 A = B;
111 t = 6 * (inlen >> 3);
112 memcpy(A, in, 8);
113 memcpy(out, in + 8, inlen);
114 for (j = 0; j < 6; j++)
115 {
116 R = out + inlen - 8;
117 for (i = 0; i < inlen; i += 8, t--, R -= 8)
118 {
119 A[7] ^= (unsigned char)(t & 0xff);
120 if (t > 0xff)
121 {
122 A[6] ^= (unsigned char)((t >> 8) & 0xff);
123 A[5] ^= (unsigned char)((t >> 16) & 0xff);
124 A[4] ^= (unsigned char)((t >> 24) & 0xff);
125 }
126 memcpy(B + 8, R, 8);
127 AES_decrypt(B, B, key);
128 memcpy(R, B + 8, 8);
129 }
130 }
131 if (!iv)
132 iv = default_iv;
133 if (memcmp(A, iv, 8))
134 {
135 OPENSSL_cleanse(out, inlen);
136 return 0;
137 }
138 return inlen;
139 }
140
141#ifdef AES_WRAP_TEST
142
143int AES_wrap_unwrap_test(const unsigned char *kek, int keybits,
144 const unsigned char *iv,
145 const unsigned char *eout,
146 const unsigned char *key, int keylen)
147 {
148 unsigned char *otmp = NULL, *ptmp = NULL;
149 int r, ret = 0;
150 AES_KEY wctx;
151 otmp = OPENSSL_malloc(keylen + 8);
152 ptmp = OPENSSL_malloc(keylen);
153 if (!otmp || !ptmp)
154 return 0;
155 if (AES_set_encrypt_key(kek, keybits, &wctx))
156 goto err;
157 r = AES_wrap_key(&wctx, iv, otmp, key, keylen);
158 if (r <= 0)
159 goto err;
160
161 if (eout && memcmp(eout, otmp, keylen))
162 goto err;
163
164 if (AES_set_decrypt_key(kek, keybits, &wctx))
165 goto err;
166 r = AES_unwrap_key(&wctx, iv, ptmp, otmp, r);
167
168 if (memcmp(key, ptmp, keylen))
169 goto err;
170
171 ret = 1;
172
173 err:
174 if (otmp)
175 OPENSSL_free(otmp);
176 if (ptmp)
177 OPENSSL_free(ptmp);
178
179 return ret;
180
181 }
182
183
184
/*
 * Self-test driver (built only under AES_WRAP_TEST): runs the
 * known-answer wrap/unwrap vectors e1..e6 for every KEK/key size
 * combination and prints each result to stderr.
 *
 * Fixes vs. original: removed the unused locals (AES_KEY wctx, xctx)
 * and added the missing return value for the non-void main().
 */
int main(int argc, char **argv)
{

static const unsigned char kek[] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
};

static const unsigned char key[] = {
	0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
	0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};

static const unsigned char e1[] = {
	0x1f, 0xa6, 0x8b, 0x0a, 0x81, 0x12, 0xb4, 0x47,
	0xae, 0xf3, 0x4b, 0xd8, 0xfb, 0x5a, 0x7b, 0x82,
	0x9d, 0x3e, 0x86, 0x23, 0x71, 0xd2, 0xcf, 0xe5
};

static const unsigned char e2[] = {
	0x96, 0x77, 0x8b, 0x25, 0xae, 0x6c, 0xa4, 0x35,
	0xf9, 0x2b, 0x5b, 0x97, 0xc0, 0x50, 0xae, 0xd2,
	0x46, 0x8a, 0xb8, 0xa1, 0x7a, 0xd8, 0x4e, 0x5d
};

static const unsigned char e3[] = {
	0x64, 0xe8, 0xc3, 0xf9, 0xce, 0x0f, 0x5b, 0xa2,
	0x63, 0xe9, 0x77, 0x79, 0x05, 0x81, 0x8a, 0x2a,
	0x93, 0xc8, 0x19, 0x1e, 0x7d, 0x6e, 0x8a, 0xe7
};

static const unsigned char e4[] = {
	0x03, 0x1d, 0x33, 0x26, 0x4e, 0x15, 0xd3, 0x32,
	0x68, 0xf2, 0x4e, 0xc2, 0x60, 0x74, 0x3e, 0xdc,
	0xe1, 0xc6, 0xc7, 0xdd, 0xee, 0x72, 0x5a, 0x93,
	0x6b, 0xa8, 0x14, 0x91, 0x5c, 0x67, 0x62, 0xd2
};

static const unsigned char e5[] = {
	0xa8, 0xf9, 0xbc, 0x16, 0x12, 0xc6, 0x8b, 0x3f,
	0xf6, 0xe6, 0xf4, 0xfb, 0xe3, 0x0e, 0x71, 0xe4,
	0x76, 0x9c, 0x8b, 0x80, 0xa3, 0x2c, 0xb8, 0x95,
	0x8c, 0xd5, 0xd1, 0x7d, 0x6b, 0x25, 0x4d, 0xa1
};

static const unsigned char e6[] = {
	0x28, 0xc9, 0xf4, 0x04, 0xc4, 0xb8, 0x10, 0xf4,
	0xcb, 0xcc, 0xb3, 0x5c, 0xfb, 0x87, 0xf8, 0x26,
	0x3f, 0x57, 0x86, 0xe2, 0xd8, 0x0e, 0xd3, 0x26,
	0xcb, 0xc7, 0xf0, 0xe7, 0x1a, 0x99, 0xf4, 0x3b,
	0xfb, 0x98, 0x8b, 0x9b, 0x7a, 0x02, 0xdd, 0x21
};

	int ret;
	ret = AES_wrap_unwrap_test(kek, 128, NULL, e1, key, 16);
	fprintf(stderr, "Key test result %d\n", ret);
	ret = AES_wrap_unwrap_test(kek, 192, NULL, e2, key, 16);
	fprintf(stderr, "Key test result %d\n", ret);
	ret = AES_wrap_unwrap_test(kek, 256, NULL, e3, key, 16);
	fprintf(stderr, "Key test result %d\n", ret);
	ret = AES_wrap_unwrap_test(kek, 192, NULL, e4, key, 24);
	fprintf(stderr, "Key test result %d\n", ret);
	ret = AES_wrap_unwrap_test(kek, 256, NULL, e5, key, 24);
	fprintf(stderr, "Key test result %d\n", ret);
	ret = AES_wrap_unwrap_test(kek, 256, NULL, e6, key, 32);
	fprintf(stderr, "Key test result %d\n", ret);
	return 0;
}
257
258
259#endif
diff --git a/src/lib/libcrypto/aes/aes_x86core.c b/src/lib/libcrypto/aes/aes_x86core.c
deleted file mode 100644
index d323e265c0..0000000000
--- a/src/lib/libcrypto/aes/aes_x86core.c
+++ /dev/null
@@ -1,1063 +0,0 @@
1/* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */
2/**
3 * rijndael-alg-fst.c
4 *
5 * @version 3.0 (December 2000)
6 *
7 * Optimised ANSI C code for the Rijndael cipher (now AES)
8 *
9 * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10 * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11 * @author Paulo Barreto <paulo.barreto@terra.com.br>
12 *
13 * This code is hereby placed in the public domain.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * This is experimental x86[_64] derivative. It assumes little-endian
30 * byte order and expects CPU to sustain unaligned memory references.
31 * It is used as playground for cache-time attack mitigations and
32 * serves as reference C implementation for x86[_64] assembler.
33 *
34 * <appro@fy.chalmers.se>
35 */
36
37
38#ifndef AES_DEBUG
39# ifndef NDEBUG
40# define NDEBUG
41# endif
42#endif
43#include <assert.h>
44
45#include <stdlib.h>
46#include <openssl/aes.h>
47#include "aes_locl.h"
48
49/*
50 * These two parameters control which table, 256-byte or 2KB, is
51 * referenced in outer and respectively inner rounds.
52 */
/* Default: compact (256-byte) tables in the outer rounds only. */
#define AES_COMPACT_IN_OUTER_ROUNDS
#ifdef AES_COMPACT_IN_OUTER_ROUNDS
/* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
 * adding AES_COMPACT_IN_INNER_ROUNDS reduces benchmark *further*
 * by factor of ~2. */
# undef AES_COMPACT_IN_INNER_ROUNDS
#endif
60
61#if 1
/*
 * Touch one word per (assumed 32-byte) cache line across a 256-byte
 * table so the whole table is cache-resident before any secret-
 * dependent lookups happen.  The volatile pointer and sink keep the
 * compiler from discarding the otherwise dead reads.
 */
static void prefetch256(const void *table)
{
	volatile unsigned long *p = (void *)table;
	volatile unsigned long sink;
	unsigned long acc = 0;
	size_t i;

	/* 32 is common least cache-line size */
	for (i = 0; i < 256 / sizeof(p[0]); i += 32 / sizeof(p[0]))
		acc ^= p[i];

	sink = acc;
	(void)sink;
}
73#else
74# define prefetch256(t)
75#endif
76
/*
 * Override aes_locl.h's big-endian byte-assembling GETU32 with a raw
 * native load: this file explicitly assumes a little-endian CPU that
 * tolerates unaligned references (see header comment above).
 */
#undef GETU32
#define GETU32(p) (*((u32*)(p)))

/* 64-bit type and literal suffix, per toolchain. */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
typedef unsigned __int64 u64;
#define U64(C) C##UI64
#elif defined(__arch64__)
typedef unsigned long u64;
#define U64(C) C##UL
#else
typedef unsigned long long u64;
#define U64(C) C##ULL
#endif

/* 32-bit left rotate: compiler intrinsic on MSVC/ICC, inline asm on x86 GCC. */
#undef ROTATE
#if defined(_MSC_VER) || defined(__ICC)
# define ROTATE(a,n) _lrotl(a,n)
#elif defined(__GNUC__) && __GNUC__>=2
# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
# define ROTATE(a,n) ({ register unsigned int ret; \
			asm ( \
			"roll %1,%0" \
			: "=r"(ret) \
			: "I"(n), "0"(a) \
			: "cc"); \
			ret; \
		})
# endif
#endif
/*
Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
Te0[x] = S [x].[02, 01, 01, 03];
Te1[x] = S [x].[03, 02, 01, 01];
Te2[x] = S [x].[01, 03, 02, 01];
Te3[x] = S [x].[01, 01, 03, 02];
*/
/*
 * Byte-offset views into the 64-bit Te/Td entries: Te1..Te3 are the
 * same table read at +3/+2/+1 bytes (unaligned, little-endian).
 * NOTE(review): the expansions cast to (u32), a scalar — presumably
 * only used in contexts where that is re-interpreted appropriately;
 * confirm against the (not visible here) round functions before
 * touching these.
 */
#define Te0 (u32)((u64*)((u8*)Te+0))
#define Te1 (u32)((u64*)((u8*)Te+3))
#define Te2 (u32)((u64*)((u8*)Te+2))
#define Te3 (u32)((u64*)((u8*)Te+1))
/*
Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
Td0[x] = Si[x].[0e, 09, 0d, 0b];
Td1[x] = Si[x].[0b, 0e, 09, 0d];
Td2[x] = Si[x].[0d, 0b, 0e, 09];
Td3[x] = Si[x].[09, 0d, 0b, 0e];
Td4[x] = Si[x].[01];
*/
#define Td0 (u32)((u64*)((u8*)Td+0))
#define Td1 (u32)((u64*)((u8*)Td+3))
#define Td2 (u32)((u64*)((u8*)Td+2))
#define Td3 (u32)((u64*)((u8*)Td+1))
129
/*
 * Forward round table: each 64-bit entry is the 32-bit value
 * S[x].[02,01,01,03] replicated twice, so the Te0..Te3 byte-offset
 * views above can read all four rotations from one table.
 */
static const u64 Te[256] = {
    U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
    U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
    U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
    U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
    U64(0x5030306050303060), U64(0x0301010203010102),
    U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
    U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
    U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
    U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
    U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
    U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
    U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
    U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
    U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
    U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
    U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
    U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
    U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
    U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
    U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
    U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
    U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
    U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
    U64(0x5331316253313162), U64(0x3f15152a3f15152a),
    U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
    U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
    U64(0x2818183028181830), U64(0xa1969637a1969637),
    U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
    U64(0x0907070e0907070e), U64(0x3612122436121224),
    U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
    U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
    U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
    U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
    U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
    U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
    U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
    U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
    U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
    U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
    U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
    U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
    U64(0x0000000000000000), U64(0x2cededc12cededc1),
    U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
    U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
    U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
    U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
    U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
    U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
    U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
    U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
    U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
    U64(0x5533336655333366), U64(0x9485851194858511),
    U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
    U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
    U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
    U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
    U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
    U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
    U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
    U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
    U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
    U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
    U64(0x3010102030101020), U64(0x1affffe51affffe5),
    U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
    U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
    U64(0x3513132635131326), U64(0x2fececc32fececc3),
    U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
    U64(0xcc444488cc444488), U64(0x3917172e3917172e),
    U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
    U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
    U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
    U64(0x2b1919322b191932), U64(0x957373e6957373e6),
    U64(0xa06060c0a06060c0), U64(0x9881811998818119),
    U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
    U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
    U64(0xab90903bab90903b), U64(0x8388880b8388880b),
    U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
    U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
    U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
    U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
    U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
    U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
    U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
    U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
    U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
    U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
    U64(0xa8919139a8919139), U64(0xa4959531a4959531),
    U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
    U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
    U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
    U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
    U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
    U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
    U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
    U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
    U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
    U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
    U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
    U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
    U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
    U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
    U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
    U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
    U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
    U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
    U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
    U64(0xd8484890d8484890), U64(0x0503030605030306),
    U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
    U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
    U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
    U64(0x9186861791868617), U64(0x58c1c19958c1c199),
    U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
    U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
    U64(0xb398982bb398982b), U64(0x3311112233111122),
    U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
    U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
    U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
    U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
    U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
    U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
    U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
    U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
    U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
    U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
    U64(0xc3414182c3414182), U64(0xb0999929b0999929),
    U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
    U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
    U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
};
260
/*
 * Te4[x] = S[x]: the plain forward S-box, one byte per entry — the
 * 256-byte alternative to the 2KB Te table (see the AES_COMPACT_*
 * configuration above).
 */
static const u8 Te4[256] = {
    0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
    0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
    0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
    0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
    0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
    0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
    0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
    0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
    0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
    0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
    0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
    0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
    0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
    0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
    0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
    0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
    0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
    0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
    0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
    0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
    0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
    0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
    0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
    0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
    0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
    0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
    0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
    0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
    0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
    0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
    0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
    0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
};
295
296static const u64 Td[256] = {
297 U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
298 U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
299 U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
300 U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
301 U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
302 U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
303 U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
304 U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
305 U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
306 U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
307 U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
308 U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
309 U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
310 U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
311 U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
312 U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
313 U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
314 U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
315 U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
316 U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
317 U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
318 U64(0x6033519760335197), U64(0x457f5362457f5362),
319 U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
320 U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
321 U64(0x5868487058684870), U64(0x19fd458f19fd458f),
322 U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
323 U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
324 U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
325 U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
326 U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
327 U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
328 U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
329 U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
330 U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
331 U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
332 U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
333 U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
334 U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
335 U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
336 U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
337 U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
338 U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
339 U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
340 U64(0x6fd406046fd40604), U64(0xff155060ff155060),
341 U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
342 U64(0xcc434089cc434089), U64(0x779ed967779ed967),
343 U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
344 U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
345 U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
346 U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
347 U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
348 U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
349 U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
350 U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
351 U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
352 U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
353 U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
354 U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
355 U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
356 U64(0x694b775a694b775a), U64(0x161a121c161a121c),
357 U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
358 U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
359 U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
360 U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
361 U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
362 U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
363 U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
364 U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
365 U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
366 U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
367 U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
368 U64(0x4022971340229713), U64(0x2011c6842011c684),
369 U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
370 U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
371 U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
372 U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
373 U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
374 U64(0xfa489411fa489411), U64(0x2264e9472264e947),
375 U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
376 U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
377 U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
378 U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
379 U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
380 U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
381 U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
382 U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
383 U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
384 U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
385 U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
386 U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
387 U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
388 U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
389 U64(0x097826cd097826cd), U64(0xf418596ef418596e),
390 U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
391 U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
392 U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
393 U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
394 U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
395 U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
396 U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
397 U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
398 U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
399 U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
400 U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
401 U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
402 U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
403 U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
404 U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
405 U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
406 U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
407 U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
408 U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
409 U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
410 U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
411 U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
412 U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
413 U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
414 U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
415 U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
416 U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
417 U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
418 U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
419 U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
420 U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
421 U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
422 U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
423 U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
424 U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
425};
/*
 * Td4[256]: the AES inverse S-box (InvSubBytes), one byte per entry.
 * This is the "compact" decryption table: it is indexed by a single
 * state byte and used in the final decryption round (and in the
 * compact round variants), with prefetch256() touching every cache
 * line to make access timing more uniform.
 */
static const u8 Td4[256] = {
	0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
	0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
	0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
	0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
	0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
	0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
	0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
	0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
	0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
	0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
	0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
	0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
	0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
	0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
	0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
	0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
	0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
	0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
	0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
	0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
	0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
	0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
	0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
	0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
	0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
	0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
	0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
	0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
	0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
	0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
	0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
	0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
};
460
/*
 * Key-schedule round constants, one word per expansion round, stored in
 * the little-endian word layout used throughout this file (constant in
 * the low byte).
 */
static const u32 rcon[] = {
	0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
	0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
	0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};
466
467/**
468 * Expand the cipher key into the encryption key schedule.
469 */
470int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
471 AES_KEY *key) {
472
473 u32 *rk;
474 int i = 0;
475 u32 temp;
476
477 if (!userKey || !key)
478 return -1;
479 if (bits != 128 && bits != 192 && bits != 256)
480 return -2;
481
482 rk = key->rd_key;
483
484 if (bits==128)
485 key->rounds = 10;
486 else if (bits==192)
487 key->rounds = 12;
488 else
489 key->rounds = 14;
490
491 rk[0] = GETU32(userKey );
492 rk[1] = GETU32(userKey + 4);
493 rk[2] = GETU32(userKey + 8);
494 rk[3] = GETU32(userKey + 12);
495 if (bits == 128) {
496 while (1) {
497 temp = rk[3];
498 rk[4] = rk[0] ^
499 (Te4[(temp >> 8) & 0xff] ) ^
500 (Te4[(temp >> 16) & 0xff] << 8) ^
501 (Te4[(temp >> 24) ] << 16) ^
502 (Te4[(temp ) & 0xff] << 24) ^
503 rcon[i];
504 rk[5] = rk[1] ^ rk[4];
505 rk[6] = rk[2] ^ rk[5];
506 rk[7] = rk[3] ^ rk[6];
507 if (++i == 10) {
508 return 0;
509 }
510 rk += 4;
511 }
512 }
513 rk[4] = GETU32(userKey + 16);
514 rk[5] = GETU32(userKey + 20);
515 if (bits == 192) {
516 while (1) {
517 temp = rk[ 5];
518 rk[ 6] = rk[ 0] ^
519 (Te4[(temp >> 8) & 0xff] ) ^
520 (Te4[(temp >> 16) & 0xff] << 8) ^
521 (Te4[(temp >> 24) ] << 16) ^
522 (Te4[(temp ) & 0xff] << 24) ^
523 rcon[i];
524 rk[ 7] = rk[ 1] ^ rk[ 6];
525 rk[ 8] = rk[ 2] ^ rk[ 7];
526 rk[ 9] = rk[ 3] ^ rk[ 8];
527 if (++i == 8) {
528 return 0;
529 }
530 rk[10] = rk[ 4] ^ rk[ 9];
531 rk[11] = rk[ 5] ^ rk[10];
532 rk += 6;
533 }
534 }
535 rk[6] = GETU32(userKey + 24);
536 rk[7] = GETU32(userKey + 28);
537 if (bits == 256) {
538 while (1) {
539 temp = rk[ 7];
540 rk[ 8] = rk[ 0] ^
541 (Te4[(temp >> 8) & 0xff] ) ^
542 (Te4[(temp >> 16) & 0xff] << 8) ^
543 (Te4[(temp >> 24) ] << 16) ^
544 (Te4[(temp ) & 0xff] << 24) ^
545 rcon[i];
546 rk[ 9] = rk[ 1] ^ rk[ 8];
547 rk[10] = rk[ 2] ^ rk[ 9];
548 rk[11] = rk[ 3] ^ rk[10];
549 if (++i == 7) {
550 return 0;
551 }
552 temp = rk[11];
553 rk[12] = rk[ 4] ^
554 (Te4[(temp ) & 0xff] ) ^
555 (Te4[(temp >> 8) & 0xff] << 8) ^
556 (Te4[(temp >> 16) & 0xff] << 16) ^
557 (Te4[(temp >> 24) ] << 24);
558 rk[13] = rk[ 5] ^ rk[12];
559 rk[14] = rk[ 6] ^ rk[13];
560 rk[15] = rk[ 7] ^ rk[14];
561
562 rk += 8;
563 }
564 }
565 return 0;
566}
567
568/**
569 * Expand the cipher key into the decryption key schedule.
570 */
/**
 * Expand the cipher key into the decryption key schedule.
 *
 * @param userKey raw key bytes (16, 24 or 32, depending on bits)
 * @param bits    key length in bits: 128, 192 or 256
 * @param key     receives the decryption round keys and round count
 * @return 0 on success, or the (negative) error from
 *         AES_set_encrypt_key on failure
 *
 * The decryption schedule is derived from the encryption schedule for
 * the equivalent-inverse-cipher form: round keys are reversed and all
 * but the first and last are run through InvMixColumns.
 */
int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
			AES_KEY *key) {

	u32 *rk;
	int i, j, status;
	u32 temp;

	/* first, start with an encryption schedule */
	status = AES_set_encrypt_key(userKey, bits, key);
	if (status < 0)
		return status;

	rk = key->rd_key;

	/* invert the order of the round keys: */
	for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
		temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
		temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
		temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
		temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
	}
	/* apply the inverse MixColumn transform to all round keys but the first and the last: */
	for (i = 1; i < (key->rounds); i++) {
		rk += 4;
#if 1
		/* Word-parallel InvMixColumns: tp2/tp4/tp8 are the byte-wise
		 * GF(2^8) doublings of tp1; tp9 = 9x, tpb = 11x, tpd = 13x,
		 * tpe = 14x are built by XOR, then combined with rotations. */
		for (j = 0; j < 4; j++) {
			u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;

			tp1 = rk[j];
			/* (m - (m >> 7)) turns each high bit 0x80 into 0xff,
			 * so the 0x1b polynomial reduction is applied exactly
			 * to the bytes that overflowed — four xtime()s at once. */
			m = tp1 & 0x80808080;
			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			m = tp2 & 0x80808080;
			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			m = tp4 & 0x80808080;
			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			tp9 = tp8 ^ tp1;
			tpb = tp9 ^ tp2;
			tpd = tp9 ^ tp4;
			tpe = tp8 ^ tp4 ^ tp2;
#if defined(ROTATE)
			rk[j] = tpe ^ ROTATE(tpd,16) ^
				ROTATE(tp9,8) ^ ROTATE(tpb,24);
#else
			/* Same rotations expressed as shift pairs when no
			 * ROTATE macro is available. */
			rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
				(tp9 >> 24) ^ (tp9 << 8) ^
				(tpb >> 8) ^ (tpb << 24);
#endif
		}
#else
		/* Alternative (disabled): route each byte through the big
		 * Td/Te lookup tables instead of computing InvMixColumns
		 * arithmetically. */
		rk[0] =
			Td0[Te2[(rk[0]      ) & 0xff] & 0xff] ^
			Td1[Te2[(rk[0] >>  8) & 0xff] & 0xff] ^
			Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
			Td3[Te2[(rk[0] >> 24)       ] & 0xff];
		rk[1] =
			Td0[Te2[(rk[1]      ) & 0xff] & 0xff] ^
			Td1[Te2[(rk[1] >>  8) & 0xff] & 0xff] ^
			Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
			Td3[Te2[(rk[1] >> 24)       ] & 0xff];
		rk[2] =
			Td0[Te2[(rk[2]      ) & 0xff] & 0xff] ^
			Td1[Te2[(rk[2] >>  8) & 0xff] & 0xff] ^
			Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
			Td3[Te2[(rk[2] >> 24)       ] & 0xff];
		rk[3] =
			Td0[Te2[(rk[3]      ) & 0xff] & 0xff] ^
			Td1[Te2[(rk[3] >>  8) & 0xff] & 0xff] ^
			Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
			Td3[Te2[(rk[3] >> 24)       ] & 0xff];
#endif
	}
	return 0;
}
647
648/*
649 * Encrypt a single block
650 * in and out can overlap
651 */
/*
 * Encrypt a single block
 * in and out can overlap
 *
 * key must hold a schedule produced by AES_set_encrypt_key().  State
 * words use the little-endian layout of this file (GETU32).  With
 * AES_COMPACT_IN_OUTER_ROUNDS the first/last rounds use only the
 * 256-byte Te4 S-box (prefetched for more uniform timing), computing
 * MixColumns arithmetically; otherwise the large Te0..Te3 tables are
 * used throughout.
 */
void AES_encrypt(const unsigned char *in, unsigned char *out,
		 const AES_KEY *key) {

	const u32 *rk;
	u32 s0, s1, s2, s3, t[4];
	int r;

	assert(in && out && key);
	rk = key->rd_key;

	/*
	 * map byte array block to cipher state
	 * and add initial round key:
	 */
	s0 = GETU32(in     ) ^ rk[0];
	s1 = GETU32(in +  4) ^ rk[1];
	s2 = GETU32(in +  8) ^ rk[2];
	s3 = GETU32(in + 12) ^ rk[3];

#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
	prefetch256(Te4);	/* touch every S-box cache line up front */

	/* SubBytes + ShiftRows via the compact S-box. */
	t[0] =	Te4[(s0      ) & 0xff]       ^
		Te4[(s1 >>  8) & 0xff] <<  8 ^
		Te4[(s2 >> 16) & 0xff] << 16 ^
		Te4[(s3 >> 24)       ] << 24;
	t[1] =	Te4[(s1      ) & 0xff]       ^
		Te4[(s2 >>  8) & 0xff] <<  8 ^
		Te4[(s3 >> 16) & 0xff] << 16 ^
		Te4[(s0 >> 24)       ] << 24;
	t[2] =	Te4[(s2      ) & 0xff]       ^
		Te4[(s3 >>  8) & 0xff] <<  8 ^
		Te4[(s0 >> 16) & 0xff] << 16 ^
		Te4[(s1 >> 24)       ] << 24;
	t[3] =	Te4[(s3      ) & 0xff]       ^
		Te4[(s0 >>  8) & 0xff] <<  8 ^
		Te4[(s1 >> 16) & 0xff] << 16 ^
		Te4[(s2 >> 24)       ] << 24;

	/* now do the linear transform using words */
	{	int i;
		u32 r0, r1, r2;

		/* MixColumns per word: r2 = 2*r0 byte-wise in GF(2^8);
		 * (r1 - (r1 >> 7)) expands each 0x80 high bit to 0xff so the
		 * 0x1b reduction hits only overflowing bytes. */
		for (i = 0; i < 4; i++) {
			r0 = t[i];
			r1 = r0 & 0x80808080;
			r2 = ((r0 & 0x7f7f7f7f) << 1) ^
				((r1 - (r1 >> 7)) & 0x1b1b1b1b);
#if defined(ROTATE)
			t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
				ROTATE(r0,16) ^ ROTATE(r0,8);
#else
			/* rotation-free equivalent of the line above */
			t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
				(r0 << 16) ^ (r0 >> 16) ^
				(r0 << 8) ^ (r0 >> 24);
#endif
			t[i] ^= rk[4+i];	/* AddRoundKey for round 1 */
		}
	}
#else
	/* Round 1 via the large tables (SubBytes+ShiftRows+MixColumns
	 * folded into Te0..Te3). */
	t[0] =	Te0[(s0      ) & 0xff] ^
		Te1[(s1 >>  8) & 0xff] ^
		Te2[(s2 >> 16) & 0xff] ^
		Te3[(s3 >> 24)       ] ^
		rk[4];
	t[1] =	Te0[(s1      ) & 0xff] ^
		Te1[(s2 >>  8) & 0xff] ^
		Te2[(s3 >> 16) & 0xff] ^
		Te3[(s0 >> 24)       ] ^
		rk[5];
	t[2] =	Te0[(s2      ) & 0xff] ^
		Te1[(s3 >>  8) & 0xff] ^
		Te2[(s0 >> 16) & 0xff] ^
		Te3[(s1 >> 24)       ] ^
		rk[6];
	t[3] =	Te0[(s3      ) & 0xff] ^
		Te1[(s0 >>  8) & 0xff] ^
		Te2[(s1 >> 16) & 0xff] ^
		Te3[(s2 >> 24)       ] ^
		rk[7];
#endif
	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];

	/*
	 * Nr - 2 full rounds:
	 */
	for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
#if defined(AES_COMPACT_IN_INNER_ROUNDS)
		t[0] =	Te4[(s0      ) & 0xff]       ^
			Te4[(s1 >>  8) & 0xff] <<  8 ^
			Te4[(s2 >> 16) & 0xff] << 16 ^
			Te4[(s3 >> 24)       ] << 24;
		t[1] =	Te4[(s1      ) & 0xff]       ^
			Te4[(s2 >>  8) & 0xff] <<  8 ^
			Te4[(s3 >> 16) & 0xff] << 16 ^
			Te4[(s0 >> 24)       ] << 24;
		t[2] =	Te4[(s2      ) & 0xff]       ^
			Te4[(s3 >>  8) & 0xff] <<  8 ^
			Te4[(s0 >> 16) & 0xff] << 16 ^
			Te4[(s1 >> 24)       ] << 24;
		t[3] =	Te4[(s3      ) & 0xff]       ^
			Te4[(s0 >>  8) & 0xff] <<  8 ^
			Te4[(s1 >> 16) & 0xff] << 16 ^
			Te4[(s2 >> 24)       ] << 24;

		/* now do the linear transform using words */
		{	int i;
			u32 r0, r1, r2;

			/* same word-parallel MixColumns as in the outer round */
			for (i = 0; i < 4; i++) {
				r0 = t[i];
				r1 = r0 & 0x80808080;
				r2 = ((r0 & 0x7f7f7f7f) << 1) ^
					((r1 - (r1 >> 7)) & 0x1b1b1b1b);
#if defined(ROTATE)
				t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
					ROTATE(r0,16) ^ ROTATE(r0,8);
#else
				t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
					(r0 << 16) ^ (r0 >> 16) ^
					(r0 << 8) ^ (r0 >> 24);
#endif
				t[i] ^= rk[i];
			}
		}
#else
		t[0] =	Te0[(s0      ) & 0xff] ^
			Te1[(s1 >>  8) & 0xff] ^
			Te2[(s2 >> 16) & 0xff] ^
			Te3[(s3 >> 24)       ] ^
			rk[0];
		t[1] =	Te0[(s1      ) & 0xff] ^
			Te1[(s2 >>  8) & 0xff] ^
			Te2[(s3 >> 16) & 0xff] ^
			Te3[(s0 >> 24)       ] ^
			rk[1];
		t[2] =	Te0[(s2      ) & 0xff] ^
			Te1[(s3 >>  8) & 0xff] ^
			Te2[(s0 >> 16) & 0xff] ^
			Te3[(s1 >> 24)       ] ^
			rk[2];
		t[3] =	Te0[(s3      ) & 0xff] ^
			Te1[(s0 >>  8) & 0xff] ^
			Te2[(s1 >> 16) & 0xff] ^
			Te3[(s2 >> 24)       ] ^
			rk[3];
#endif
		s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
	}
	/*
	 * apply last round and
	 * map cipher state to byte array block:
	 *
	 * NOTE(review): the direct u32 stores below assume unaligned
	 * little-endian word access is permitted, as is typical for the
	 * x86 targets this file is written for — confirm for new ports.
	 */
#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
	prefetch256(Te4);

	*(u32*)(out+0) =
		   Te4[(s0      ) & 0xff]        ^
		   Te4[(s1 >>  8) & 0xff] <<  8  ^
		   Te4[(s2 >> 16) & 0xff] << 16  ^
		   Te4[(s3 >> 24)       ] << 24  ^
		rk[0];
	*(u32*)(out+4) =
		   Te4[(s1      ) & 0xff]        ^
		   Te4[(s2 >>  8) & 0xff] <<  8  ^
		   Te4[(s3 >> 16) & 0xff] << 16  ^
		   Te4[(s0 >> 24)       ] << 24  ^
		rk[1];
	*(u32*)(out+8) =
		   Te4[(s2      ) & 0xff]        ^
		   Te4[(s3 >>  8) & 0xff] <<  8  ^
		   Te4[(s0 >> 16) & 0xff] << 16  ^
		   Te4[(s1 >> 24)       ] << 24  ^
		rk[2];
	*(u32*)(out+12) =
		   Te4[(s3      ) & 0xff]        ^
		   Te4[(s0 >>  8) & 0xff] <<  8  ^
		   Te4[(s1 >> 16) & 0xff] << 16  ^
		   Te4[(s2 >> 24)       ] << 24  ^
		rk[3];
#else
	/* Last round has no MixColumns: extract the plain S-box byte from
	 * the large tables by masking the lane that holds it unscaled. */
	*(u32*)(out+0) =
		(Te2[(s0      ) & 0xff] & 0x000000ffU) ^
		(Te3[(s1 >>  8) & 0xff] & 0x0000ff00U) ^
		(Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
		(Te1[(s3 >> 24)       ] & 0xff000000U) ^
		rk[0];
	*(u32*)(out+4) =
		(Te2[(s1      ) & 0xff] & 0x000000ffU) ^
		(Te3[(s2 >>  8) & 0xff] & 0x0000ff00U) ^
		(Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
		(Te1[(s0 >> 24)       ] & 0xff000000U) ^
		rk[1];
	*(u32*)(out+8) =
		(Te2[(s2      ) & 0xff] & 0x000000ffU) ^
		(Te3[(s3 >>  8) & 0xff] & 0x0000ff00U) ^
		(Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
		(Te1[(s1 >> 24)       ] & 0xff000000U) ^
		rk[2];
	*(u32*)(out+12) =
		(Te2[(s3      ) & 0xff] & 0x000000ffU) ^
		(Te3[(s0 >>  8) & 0xff] & 0x0000ff00U) ^
		(Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
		(Te1[(s2 >> 24)       ] & 0xff000000U) ^
		rk[3];
#endif
}
859
860/*
861 * Decrypt a single block
862 * in and out can overlap
863 */
/*
 * Decrypt a single block
 * in and out can overlap
 *
 * key must hold a schedule produced by AES_set_decrypt_key().  Mirrors
 * AES_encrypt: inverse ShiftRows here rotates the other way (note the
 * s3/s2/s1 byte sources), and the word-parallel transform computes
 * InvMixColumns (coefficients 14, 11, 13, 9).  The final round always
 * uses the compact Td4 S-box, prefetched for more uniform timing.
 */
void AES_decrypt(const unsigned char *in, unsigned char *out,
		 const AES_KEY *key) {

	const u32 *rk;
	u32 s0, s1, s2, s3, t[4];
	int r;

	assert(in && out && key);
	rk = key->rd_key;

	/*
	 * map byte array block to cipher state
	 * and add initial round key:
	 */
	s0 = GETU32(in     ) ^ rk[0];
	s1 = GETU32(in +  4) ^ rk[1];
	s2 = GETU32(in +  8) ^ rk[2];
	s3 = GETU32(in + 12) ^ rk[3];

#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
	prefetch256(Td4);	/* touch every inverse S-box cache line */

	/* InvSubBytes + InvShiftRows via the compact inverse S-box. */
	t[0] =	Td4[(s0      ) & 0xff]       ^
		Td4[(s3 >>  8) & 0xff] <<  8 ^
		Td4[(s2 >> 16) & 0xff] << 16 ^
		Td4[(s1 >> 24)       ] << 24;
	t[1] =	Td4[(s1      ) & 0xff]       ^
		Td4[(s0 >>  8) & 0xff] <<  8 ^
		Td4[(s3 >> 16) & 0xff] << 16 ^
		Td4[(s2 >> 24)       ] << 24;
	t[2] =	Td4[(s2      ) & 0xff]       ^
		Td4[(s1 >>  8) & 0xff] <<  8 ^
		Td4[(s0 >> 16) & 0xff] << 16 ^
		Td4[(s3 >> 24)       ] << 24;
	t[3] =	Td4[(s3      ) & 0xff]       ^
		Td4[(s2 >>  8) & 0xff] <<  8 ^
		Td4[(s1 >> 16) & 0xff] << 16 ^
		Td4[(s0 >> 24)       ] << 24;

	/* now do the linear transform using words */
	{	int i;
		u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;

		/* Word-parallel InvMixColumns: tp2/tp4/tp8 are byte-wise
		 * GF(2^8) doublings; tp9 = 9x, tpb = 11x, tpd = 13x,
		 * tpe = 14x are then combined with rotations. */
		for (i = 0; i < 4; i++) {
			tp1 = t[i];
			m = tp1 & 0x80808080;
			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			m = tp2 & 0x80808080;
			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			m = tp4 & 0x80808080;
			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			tp9 = tp8 ^ tp1;
			tpb = tp9 ^ tp2;
			tpd = tp9 ^ tp4;
			tpe = tp8 ^ tp4 ^ tp2;
#if defined(ROTATE)
			t[i] = tpe ^ ROTATE(tpd,16) ^
				ROTATE(tp9,8) ^ ROTATE(tpb,24);
#else
			/* rotation-free equivalent of the line above */
			t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
				(tp9 >> 24) ^ (tp9 << 8) ^
				(tpb >> 8) ^ (tpb << 24);
#endif
			t[i] ^= rk[4+i];	/* AddRoundKey for round 1 */
		}
	}
#else
	/* Round 1 via the large inverse tables Td0..Td3. */
	t[0] =	Td0[(s0      ) & 0xff] ^
		Td1[(s3 >>  8) & 0xff] ^
		Td2[(s2 >> 16) & 0xff] ^
		Td3[(s1 >> 24)       ] ^
		rk[4];
	t[1] =	Td0[(s1      ) & 0xff] ^
		Td1[(s0 >>  8) & 0xff] ^
		Td2[(s3 >> 16) & 0xff] ^
		Td3[(s2 >> 24)       ] ^
		rk[5];
	t[2] =	Td0[(s2      ) & 0xff] ^
		Td1[(s1 >>  8) & 0xff] ^
		Td2[(s0 >> 16) & 0xff] ^
		Td3[(s3 >> 24)       ] ^
		rk[6];
	t[3] =	Td0[(s3      ) & 0xff] ^
		Td1[(s2 >>  8) & 0xff] ^
		Td2[(s1 >> 16) & 0xff] ^
		Td3[(s0 >> 24)       ] ^
		rk[7];
#endif
	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];

	/*
	 * Nr - 2 full rounds:
	 */
	for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
#if defined(AES_COMPACT_IN_INNER_ROUNDS)
		t[0] =	Td4[(s0      ) & 0xff]       ^
			Td4[(s3 >>  8) & 0xff] <<  8 ^
			Td4[(s2 >> 16) & 0xff] << 16 ^
			Td4[(s1 >> 24)       ] << 24;
		t[1] =	Td4[(s1      ) & 0xff]       ^
			Td4[(s0 >>  8) & 0xff] <<  8 ^
			Td4[(s3 >> 16) & 0xff] << 16 ^
			Td4[(s2 >> 24)       ] << 24;
		t[2] =	Td4[(s2      ) & 0xff]       ^
			Td4[(s1 >>  8) & 0xff] <<  8 ^
			Td4[(s0 >> 16) & 0xff] << 16 ^
			Td4[(s3 >> 24)       ] << 24;
		t[3] =	Td4[(s3      ) & 0xff]       ^
			Td4[(s2 >>  8) & 0xff] <<  8 ^
			Td4[(s1 >> 16) & 0xff] << 16 ^
			Td4[(s0 >> 24)       ] << 24;

		/* now do the linear transform using words */
		{	int i;
			u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;

			/* same word-parallel InvMixColumns as the outer round */
			for (i = 0; i < 4; i++) {
				tp1 = t[i];
				m = tp1 & 0x80808080;
				tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
					((m - (m >> 7)) & 0x1b1b1b1b);
				m = tp2 & 0x80808080;
				tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
					((m - (m >> 7)) & 0x1b1b1b1b);
				m = tp4 & 0x80808080;
				tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
					((m - (m >> 7)) & 0x1b1b1b1b);
				tp9 = tp8 ^ tp1;
				tpb = tp9 ^ tp2;
				tpd = tp9 ^ tp4;
				tpe = tp8 ^ tp4 ^ tp2;
#if defined(ROTATE)
				t[i] = tpe ^ ROTATE(tpd,16) ^
					ROTATE(tp9,8) ^ ROTATE(tpb,24);
#else
				t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
					(tp9 >> 24) ^ (tp9 << 8) ^
					(tpb >> 8) ^ (tpb << 24);
#endif
				t[i] ^= rk[i];
			}
		}
#else
		t[0] =	Td0[(s0      ) & 0xff] ^
			Td1[(s3 >>  8) & 0xff] ^
			Td2[(s2 >> 16) & 0xff] ^
			Td3[(s1 >> 24)       ] ^
			rk[0];
		t[1] =	Td0[(s1      ) & 0xff] ^
			Td1[(s0 >>  8) & 0xff] ^
			Td2[(s3 >> 16) & 0xff] ^
			Td3[(s2 >> 24)       ] ^
			rk[1];
		t[2] =	Td0[(s2      ) & 0xff] ^
			Td1[(s1 >>  8) & 0xff] ^
			Td2[(s0 >> 16) & 0xff] ^
			Td3[(s3 >> 24)       ] ^
			rk[2];
		t[3] =	Td0[(s3      ) & 0xff] ^
			Td1[(s2 >>  8) & 0xff] ^
			Td2[(s1 >> 16) & 0xff] ^
			Td3[(s0 >> 24)       ] ^
			rk[3];
#endif
		s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
	}
	/*
	 * apply last round and
	 * map cipher state to byte array block:
	 *
	 * NOTE(review): the direct u32 stores below assume unaligned
	 * little-endian word access is permitted, as is typical for the
	 * x86 targets this file is written for — confirm for new ports.
	 */
	prefetch256(Td4);

	*(u32*)(out+0) =
		(Td4[(s0      ) & 0xff])	 ^
		(Td4[(s3 >>  8) & 0xff] <<  8) ^
		(Td4[(s2 >> 16) & 0xff] << 16) ^
		(Td4[(s1 >> 24)       ] << 24) ^
		rk[0];
	*(u32*)(out+4) =
		(Td4[(s1      ) & 0xff])	 ^
		(Td4[(s0 >>  8) & 0xff] <<  8) ^
		(Td4[(s3 >> 16) & 0xff] << 16) ^
		(Td4[(s2 >> 24)       ] << 24) ^
		rk[1];
	*(u32*)(out+8) =
		(Td4[(s2      ) & 0xff])	 ^
		(Td4[(s1 >>  8) & 0xff] <<  8) ^
		(Td4[(s0 >> 16) & 0xff] << 16) ^
		(Td4[(s3 >> 24)       ] << 24) ^
		rk[2];
	*(u32*)(out+12) =
		(Td4[(s3      ) & 0xff])	 ^
		(Td4[(s2 >>  8) & 0xff] <<  8) ^
		(Td4[(s1 >> 16) & 0xff] << 16) ^
		(Td4[(s0 >> 24)       ] << 24) ^
		rk[3];
}
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl
deleted file mode 100644
index 687ed811be..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-586.pl
+++ /dev/null
@@ -1,2980 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Version 4.3.
11#
12# You might fail to appreciate this module performance from the first
13# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
14# to be *the* best Intel C compiler without -KPIC, performance appears
15# to be virtually identical... But try to re-configure with shared
16# library support... Aha! Intel compiler "suddenly" lags behind by 30%
17# [on P4, more on others]:-) And if compared to position-independent
18# code generated by GNU C, this code performs *more* than *twice* as
19# fast! Yes, all this buzz about PIC means that unlike other hand-
20# coded implementations, this one was explicitly designed to be safe
21# to use even in shared library context... This also means that this
22# code isn't necessarily absolutely fastest "ever," because in order
23# to achieve position independence an extra register has to be
24# off-loaded to stack, which affects the benchmark result.
25#
26# Special note about instruction choice. Do you recall RC4_INT code
27# performing poorly on P4? It might be the time to figure out why.
28# RC4_INT code implies effective address calculations in base+offset*4
29# form. Trouble is that it seems that offset scaling turned to be
30# critical path... At least eliminating scaling resulted in 2.8x RC4
31# performance improvement [as you might recall]. As AES code is hungry
32# for scaling too, I [try to] avoid the latter by favoring off-by-2
33# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
34#
35# As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
36# void. Performance improvement with off-by-2 shifts was observed on
37# intermediate implementation, which was spilling yet another register
38# to stack... Final offset*4 code below runs just a tad faster on P4,
39# but exhibits up to 10% improvement on other cores.
40#
41# Second version is "monolithic" replacement for aes_core.c, which in
42# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
43# This made it possible to implement little-endian variant of the
44# algorithm without modifying the base C code. Motivating factor for
45# the undertaken effort was that it appeared that in tight IA-32
46# register window little-endian flavor could achieve slightly higher
47# Instruction Level Parallelism, and it indeed resulted in up to 15%
48# better performance on most recent µ-archs...
49#
# Third version adds AES_cbc_encrypt implementation, which resulted in
# up to 40% performance improvement of CBC benchmark results. 40% was
# observed on P4 core, where "overall" improvement coefficient, i.e. if
53# compared to PIC generated by GCC and in CBC mode, was observed to be
54# as large as 4x:-) CBC performance is virtually identical to ECB now
55# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
56# Opteron, because certain function prologues and epilogues are
57# effectively taken out of the loop...
58#
59# Version 3.2 implements compressed tables and prefetch of these tables
60# in CBC[!] mode. Former means that 3/4 of table references are now
61# misaligned, which unfortunately has negative impact on elder IA-32
62# implementations, Pentium suffered 30% penalty, PIII - 10%.
63#
64# Version 3.3 avoids L1 cache aliasing between stack frame and
65# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
66# latter is achieved by copying the key schedule to controlled place in
67# stack. This unfortunately has rather strong impact on small block CBC
68# performance, ~2x deterioration on 16-byte block if compared to 3.3.
69#
70# Version 3.5 checks if there is L1 cache aliasing between user-supplied
71# key schedule and S-boxes and abstains from copying the former if
72# there is no. This allows end-user to consciously retain small block
73# performance by aligning key schedule in specific manner.
74#
75# Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
76#
77# Current ECB performance numbers for 128-bit key in CPU cycles per
78# processed byte [measure commonly used by AES benchmarkers] are:
79#
80# small footprint fully unrolled
81# P4 24 22
82# AMD K8 20 19
83# PIII 25 23
84# Pentium 81 78
85#
86# Version 3.7 reimplements outer rounds as "compact." Meaning that
87# first and last rounds reference compact 256 bytes S-box. This means
88# that first round consumes a lot more CPU cycles and that encrypt
89# and decrypt performance becomes asymmetric. Encrypt performance
90# drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
91# aggressively pre-fetched.
92#
93# Version 4.0 effectively rolls back to 3.6 and instead implements
94# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
95# which use exclusively 256 byte S-box. These functions are to be
96# called in modes not concealing plain text, such as ECB, or when
97# we're asked to process smaller amount of data [or unconditionally
98# on hyper-threading CPU]. Currently it's called unconditionally from
99# AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
100# still needs to be modified to switch between slower and faster
101# mode when appropriate... But in either case benchmark landscape
102# changes dramatically and below numbers are CPU cycles per processed
103# byte for 128-bit key.
104#
105# ECB encrypt ECB decrypt CBC large chunk
106# P4 56[60] 84[100] 23
107# AMD K8 48[44] 70[79] 18
108# PIII 41[50] 61[91] 24
109# Core 2 32[38] 45[70] 18.5
110# Pentium 120 160 77
111#
112# Version 4.1 switches to compact S-box even in key schedule setup.
113#
114# Version 4.2 prefetches compact S-box in every SSE round or in other
115# words every cache-line is *guaranteed* to be accessed within ~50
116# cycles window. Why just SSE? Because it's needed on hyper-threading
117# CPU! Which is also why it's prefetched with 64 byte stride. Best
118# part is that it has no negative effect on performance:-)
119#
120# Version 4.3 implements switch between compact and non-compact block
121# functions in AES_cbc_encrypt depending on how much data was asked
122# to be processed in one stroke.
123#
124######################################################################
125# Timing attacks are classified in two classes: synchronous when
126# attacker consciously initiates cryptographic operation and collects
127# timing data of various character afterwards, and asynchronous when
128# malicious code is executed on same CPU simultaneously with AES,
129# instruments itself and performs statistical analysis of this data.
130#
131# As far as synchronous attacks go the root to the AES timing
132# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
133# are referred to in single 128-bit block operation. Well, in C
134# implementation with 4 distinct tables it's actually as little as 40
135# references per 256 elements table, but anyway... Secondly, even
136# though S-box elements are clustered into smaller amount of cache-
137# lines, smaller than 160 and even 40, it turned out that for certain
138# plain-text pattern[s] or simply put chosen plain-text and given key
139# few cache-lines remain unaccessed during block operation. Now, if
140# attacker can figure out this access pattern, he can deduct the key
141# [or at least part of it]. The natural way to mitigate this kind of
142# attacks is to minimize the amount of cache-lines in S-box and/or
143# prefetch them to ensure that every one is accessed for more uniform
144# timing. But note that *if* plain-text was concealed in such way that
145# input to block function is distributed *uniformly*, then attack
146# wouldn't apply. Now note that some encryption modes, most notably
147# CBC, do mask the plain-text in this exact way [secure cipher output
148# is distributed uniformly]. Yes, one still might find input that
149# would reveal the information about given key, but if amount of
150# candidate inputs to be tried is larger than amount of possible key
151# combinations then attack becomes infeasible. This is why revised
152# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
153# of data is to be processed in one stroke. The current size limit of
# 512 bytes is chosen to provide same [diminishingly low] probability
155# for cache-line to remain untouched in large chunk operation with
156# large S-box as for single block operation with compact S-box and
157# surely needs more careful consideration...
158#
159# As for asynchronous attacks. There are two flavours: attacker code
160# being interleaved with AES on hyper-threading CPU at *instruction*
161# level, and two processes time sharing single core. As for latter.
162# Two vectors. 1. Given that attacker process has higher priority,
163# yield execution to process performing AES just before timer fires
164# off the scheduler, immediately regain control of CPU and analyze the
165# cache state. For this attack to be efficient attacker would have to
# effectively slow down the operation by several *orders* of magnitude,
167# by ratio of time slice to duration of handful of AES rounds, which
168# unlikely to remain unnoticed. Not to mention that this also means
# that he would spend correspondingly more time to collect enough
# statistical data to mount the attack. It's probably appropriate to
# say that if adversary reckons that this attack is beneficial and
172# risks to be noticed, you probably have larger problems having him
173# mere opportunity. In other words suggested code design expects you
174# to preclude/mitigate this attack by overall system security design.
175# 2. Attacker manages to make his code interrupt driven. In order for
176# this kind of attack to be feasible, interrupt rate has to be high
177# enough, again comparable to duration of handful of AES rounds. But
178# is there interrupt source of such rate? Hardly, not even 1Gbps NIC
179# generates interrupts at such raging rate...
180#
181# And now back to the former, hyper-threading CPU or more specifically
182# Intel P4. Recall that asynchronous attack implies that malicious
183# code instruments itself. And naturally instrumentation granularity
# has to be noticeably lower than duration of codepath accessing S-box.
185# Given that all cache-lines are accessed during that time that is.
186# Current implementation accesses *all* cache-lines within ~50 cycles
187# window, which is actually *less* than RDTSC latency on Intel P4!
188
# Locate this script's own directory so the shared perlasm helpers can
# be loaded relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# "386" as the last argument restricts output to plain i386 code
# (no SSE paths).
&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
&static_label("AES_Te");
&static_label("AES_Td");

# Register assignment: four state words, key pointer, scratch
# accumulator, and table base pointer.
$s0="eax";
$s1="ebx";
$s2="ecx";
$s3="edx";
$key="edi";
$acc="esi";
$tbl="ebp";

# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
# by caller
$__ra=&DWP(0,"esp");	# return address
$__s0=&DWP(4,"esp");	# s0 backing store
$__s1=&DWP(8,"esp");	# s1 backing store
$__s2=&DWP(12,"esp");	# s2 backing store
$__s3=&DWP(16,"esp");	# s3 backing store
$__key=&DWP(20,"esp");	# pointer to key schedule
$__end=&DWP(24,"esp");	# pointer to end of key schedule
$__tbl=&DWP(28,"esp");	# %ebp backing store

# stack frame layout in AES_[en|crypt] routines, which differs from
# above by 4 and overlaps by %ebp backing store
$_tbl=&DWP(24,"esp");
$_esp=&DWP(28,"esp");
221sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
222
# Tunables controlling which code flavors get emitted.
$speed_limit=512;	# chunks smaller than $speed_limit are
			# processed with compact routine in CBC mode
$small_footprint=1;	# $small_footprint=1 code is ~5% slower [on
			# recent µ-archs], but ~5 times smaller!
			# I favor compact code to minimize cache
			# contention and in hope to "collect" 5% back
			# in real-life applications...

$vertical_spin=0;	# shift "vertically" defaults to 0, because of
			# its proof-of-concept status...
# Note that there is no decvert(), as well as last encryption round is
# performed with "horizontal" shifts. This is because this "vertical"
# implementation [one which groups shifts on a given $s[i] to form a
# "column," unlike "horizontal" one, which groups shifts on different
# $s[i] to form a "row"] is work in progress. It was observed to run
# few percents faster on Intel cores, but not AMD. On AMD K8 core it's
# whole 12% slower:-( So we face a trade-off... Shall it be resolved
# some day? Till then the code is considered experimental and by
# default remains dormant...
242
# Experimental "vertical spin" encryption round.  Unlike the
# "horizontal" encstep(), which assembles one destination word at a
# time, this groups the four Te-table lookups driven by a single
# source word into one "column" of work (see the trade-off discussion
# above; dormant by default via $vertical_spin=0).
#   $te - register holding the Te table pointer
#   @s  - the four state registers
# Uses $acc/$key as scratch; $key is reloaded from its stack slot at
# the end.
sub encvert()
{ my ($te,@s) = @_;
  # Two lexical scratch aliases.  The original
  # "my $v0 = $acc, $v1 = $key" declared only $v0 lexically -- the
  # comma binds looser than "=", so $v1 became a package global.
  my ($v0,$v1) = ($acc,$key);

	&mov	($v0,$s[3]);			# copy s3
	&mov	(&DWP(4,"esp"),$s[2]);		# save s2
	&mov	($v1,$s[0]);			# copy s0
	&mov	(&DWP(8,"esp"),$s[1]);		# save s1

	&movz	($s[2],&HB($s[0]));
	&and	($s[0],0xFF);
	&mov	($s[0],&DWP(0,$te,$s[0],8));	# s0>>0
	&shr	($v1,16);
	&mov	($s[3],&DWP(3,$te,$s[2],8));	# s0>>8
	&movz	($s[1],&HB($v1));
	&and	($v1,0xFF);
	&mov	($s[2],&DWP(2,$te,$v1,8));	# s0>>16
	&mov	($v1,$v0);
	&mov	($s[1],&DWP(1,$te,$s[1],8));	# s0>>24

	&and	($v0,0xFF);
	&xor	($s[3],&DWP(0,$te,$v0,8));	# s3>>0
	&movz	($v0,&HB($v1));
	&shr	($v1,16);
	&xor	($s[2],&DWP(3,$te,$v0,8));	# s3>>8
	&movz	($v0,&HB($v1));
	&and	($v1,0xFF);
	&xor	($s[1],&DWP(2,$te,$v1,8));	# s3>>16
	&mov	($v1,&DWP(4,"esp"));		# restore s2
	&xor	($s[0],&DWP(1,$te,$v0,8));	# s3>>24

	&mov	($v0,$v1);
	&and	($v1,0xFF);
	&xor	($s[2],&DWP(0,$te,$v1,8));	# s2>>0
	&movz	($v1,&HB($v0));
	&shr	($v0,16);
	&xor	($s[1],&DWP(3,$te,$v1,8));	# s2>>8
	&movz	($v1,&HB($v0));
	&and	($v0,0xFF);
	&xor	($s[0],&DWP(2,$te,$v0,8));	# s2>>16
	&mov	($v0,&DWP(8,"esp"));		# restore s1
	&xor	($s[3],&DWP(1,$te,$v1,8));	# s2>>24

	&mov	($v1,$v0);
	&and	($v0,0xFF);
	&xor	($s[1],&DWP(0,$te,$v0,8));	# s1>>0
	&movz	($v0,&HB($v1));
	&shr	($v1,16);
	&xor	($s[0],&DWP(3,$te,$v0,8));	# s1>>8
	&movz	($v0,&HB($v1));
	&and	($v1,0xFF);
	&xor	($s[3],&DWP(2,$te,$v1,8));	# s1>>16
	&mov	($key,$__key);			# reincarnate v1 as key
	&xor	($s[2],&DWP(1,$te,$v0,8));	# s1>>24
}
298
# Another experimental routine, which features "horizontal spin," but
# eliminates one reference to stack.  Strangely enough runs slower...
# NOTE(review): operates directly on the globals $s0..$s3 and expects
# the global $te to hold the table pointer -- confirm $te is assigned
# by the caller before re-enabling this dormant path.
sub enchoriz()
{ # Lexical scratch aliases; the original "my $v0 = $key, $v1 = $acc"
  # made $v1 an accidental package global (comma/assignment precedence).
  my ($v0,$v1) = ($key,$acc);

	&movz	($v0,&LB($s0));			#  3, 2, 1, 0*
	&rotr	($s2,8);			#  8,11,10, 9
	&mov	($v1,&DWP(0,$te,$v0,8));	#  0
	&movz	($v0,&HB($s1));			#  7, 6, 5*, 4
	&rotr	($s3,16);			# 13,12,15,14
	&xor	($v1,&DWP(3,$te,$v0,8));	#  5
	&movz	($v0,&HB($s2));			#  8,11,10*, 9
	&rotr	($s0,16);			#  1, 0, 3, 2
	&xor	($v1,&DWP(2,$te,$v0,8));	# 10
	&movz	($v0,&HB($s3));			# 13,12,15*,14
	&xor	($v1,&DWP(1,$te,$v0,8));	# 15, t[0] collected
	&mov	($__s0,$v1);			# t[0] saved

	&movz	($v0,&LB($s1));			#  7, 6, 5, 4*
	&shr	($s1,16);			#  -, -, 7, 6
	&mov	($v1,&DWP(0,$te,$v0,8));	#  4
	&movz	($v0,&LB($s3));			# 13,12,15,14*
	&xor	($v1,&DWP(2,$te,$v0,8));	# 14
	&movz	($v0,&HB($s0));			#  1, 0, 3*, 2
	&and	($s3,0xffff0000);		# 13,12, -, -
	&xor	($v1,&DWP(1,$te,$v0,8));	#  3
	&movz	($v0,&LB($s2));			#  8,11,10, 9*
	&or	($s3,$s1);			# 13,12, 7, 6
	&xor	($v1,&DWP(3,$te,$v0,8));	#  9, t[1] collected
	&mov	($s1,$v1);			#  s[1]=t[1]

	&movz	($v0,&LB($s0));			#  1, 0, 3, 2*
	&shr	($s2,16);			#  -, -, 8,11
	&mov	($v1,&DWP(2,$te,$v0,8));	#  2
	&movz	($v0,&HB($s3));			# 13,12, 7*, 6
	&xor	($v1,&DWP(1,$te,$v0,8));	#  7
	&movz	($v0,&HB($s2));			#  -, -, 8*,11
	&xor	($v1,&DWP(0,$te,$v0,8));	#  8
	&mov	($v0,$s3);
	&shr	($v0,24);			# 13
	&xor	($v1,&DWP(3,$te,$v0,8));	# 13, t[2] collected

	&movz	($v0,&LB($s2));			#  -, -, 8,11*
	&shr	($s0,24);			#  1*
	&mov	($s2,&DWP(1,$te,$v0,8));	# 11
	&xor	($s2,&DWP(3,$te,$s0,8));	#  1
	&mov	($s0,$__s0);			#  s[0]=t[0]
	&movz	($v0,&LB($s3));			# 13,12, 7, 6*
	&shr	($s3,16);			#  , ,13,12
	&xor	($s2,&DWP(2,$te,$v0,8));	#  6
	&mov	($key,$__key);			# reincarnate v0 as key
	&and	($s3,0xff);			#  , ,13,12*
	&mov	($s3,&DWP(0,$te,$s3,8));	# 12
	&xor	($s3,$s2);			#  s[3]=t[3] collected
	&mov	($s2,$v1);			#  s[2]=t[2]
}
355
# More experimental code... SSE one... Even though this one eliminates
# *all* references to stack, it's not faster...
# Emits the body of one full MMX/SSE encryption round: state lives in
# mm0/mm4 (see the register-layout diagram further below); the numeric
# comments are byte indexes into the 16-byte state.  Clobbers
# eax/ebx/ecx/edx and $acc; folds in round-key words at 16..28($key).
sub sse_encbody()
{
	&movz	($acc,&LB("eax"));		#  0
	&mov	("ecx",&DWP(0,$tbl,$acc,8));	#  0
	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
	&movz	("edx",&HB("eax"));		#  1
	&mov	("edx",&DWP(3,$tbl,"edx",8));	#  1
	&shr	("eax",16);			#  5, 4

	&movz	($acc,&LB("ebx"));		# 10
	&xor	("ecx",&DWP(2,$tbl,$acc,8));	# 10
	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
	&movz	($acc,&HB("ebx"));		# 11
	&xor	("edx",&DWP(1,$tbl,$acc,8));	# 11
	&shr	("ebx",16);			# 15,14

	&movz	($acc,&HB("eax"));		#  5
	&xor	("ecx",&DWP(3,$tbl,$acc,8));	#  5
	&movq	("mm3",QWP(16,$key));		# next round key, applied below
	&movz	($acc,&HB("ebx"));		# 15
	&xor	("ecx",&DWP(1,$tbl,$acc,8));	# 15
	&movd	("mm0","ecx");			# t[0] collected

	&movz	($acc,&LB("eax"));		#  4
	&mov	("ecx",&DWP(0,$tbl,$acc,8));	#  4
	&movd	("eax","mm2");			#  7, 6, 3, 2
	&movz	($acc,&LB("ebx"));		# 14
	&xor	("ecx",&DWP(2,$tbl,$acc,8));	# 14
	&movd	("ebx","mm6");			# 13,12, 9, 8

	&movz	($acc,&HB("eax"));		#  3
	&xor	("ecx",&DWP(1,$tbl,$acc,8));	#  3
	&movz	($acc,&HB("ebx"));		#  9
	&xor	("ecx",&DWP(3,$tbl,$acc,8));	#  9
	&movd	("mm1","ecx");			# t[1] collected

	&movz	($acc,&LB("eax"));		#  2
	&mov	("ecx",&DWP(2,$tbl,$acc,8));	#  2
	&shr	("eax",16);			#  7, 6
	&punpckldq	("mm0","mm1");		# t[0,1] collected
	&movz	($acc,&LB("ebx"));		#  8
	&xor	("ecx",&DWP(0,$tbl,$acc,8));	#  8
	&shr	("ebx",16);			# 13,12

	&movz	($acc,&HB("eax"));		#  7
	&xor	("ecx",&DWP(1,$tbl,$acc,8));	#  7
	&pxor	("mm0","mm3");			# apply round key to t[0,1]
	&movz	("eax",&LB("eax"));		#  6
	&xor	("edx",&DWP(2,$tbl,"eax",8));	#  6
	&pshufw	("mm1","mm0",0x08);		#  5, 4, 1, 0
	&movz	($acc,&HB("ebx"));		# 13
	&xor	("ecx",&DWP(3,$tbl,$acc,8));	# 13
	&xor	("ecx",&DWP(24,$key));		# t[2]
	&movd	("mm4","ecx");			# t[2] collected
	&movz	("ebx",&LB("ebx"));		# 12
	&xor	("edx",&DWP(0,$tbl,"ebx",8));	# 12
	&shr	("ecx",16);
	&movd	("eax","mm1");			#  5, 4, 1, 0
	&mov	("ebx",&DWP(28,$key));		# t[3]
	&xor	("ebx","edx");
	&movd	("mm5","ebx");			# t[3] collected
	&and	("ebx",0xffff0000);
	&or	("ebx","ecx");

	&punpckldq	("mm4","mm5");		# t[2,3] collected
}
424
######################################################################
# "Compact" block function
######################################################################

# One quarter of a "compact" encryption round using the 256-byte Te4
# S-box copies at $te-128 (time/space trade-off versus the 1KB Te
# table; the MixColumns part is done separately by enctransform()).
#   ($i,$te,s0,s1,s2,s3[,extra]) - $i selects which destination word
# is built this call.  A 7th argument replaces the first mov with a
# no-op, suppressing the reload of $key from its stack slot in the
# $i==3 step (see comment below).
sub enccompact()
{ my $Fn = mov;
  while ($#_>5) { pop(@_); $Fn=sub{}; }
  my ($i,$te,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# $Fn is used in first compact round and its purpose is to
	# void restoration of some values from stack, so that after
	# 4xenccompact with extra argument $key value is left there...
	if ($i==3)  { &$Fn	($key,$__key); }##%edx
	else        { &mov	($out,$s[0]); }
	&and	($out,0xFF);
	if ($i==1)  { &shr	($s[0],16); }#%ebx[1]
	if ($i==2)  { &shr	($s[0],24); }#%ecx[2]
	&movz	($out,&BP(-128,$te,$out,1));

	if ($i==3)  { $tmp=$s[1]; }##%eax
			&movz	($tmp,&HB($s[1]));
			&movz	($tmp,&BP(-128,$te,$tmp,1));
			&shl	($tmp,8);
	&xor	($out,$tmp);

	if ($i==3)  { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
	else        { &mov	($tmp,$s[2]);
			&shr	($tmp,16); }
	if ($i==2)  { &and	($s[1],0xFF); }#%edx[2]
			&and	($tmp,0xFF);
			&movz	($tmp,&BP(-128,$te,$tmp,1));
			&shl	($tmp,16);
	&xor	($out,$tmp);

	if ($i==3)  { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
	elsif($i==2){ &movz	($tmp,&HB($s[3])); }#%ebx[2]
	else        { &mov	($tmp,$s[3]);
			&shr	($tmp,24); }
			&movz	($tmp,&BP(-128,$te,$tmp,1));
			&shl	($tmp,24);
	&xor	($out,$tmp);
	if ($i<2)   { &mov	(&DWP(4+4*$i,"esp"),$out); }
	if ($i==3)  { &mov	($s[3],$acc); }
	&comment();
}
472
# In-place MixColumns transform of state word $i, done SIMD-within-a-
# register: first computes r2 = xtime(r0) (GF(2^8) doubling of every
# byte, with the 0x80808080/0x1b1b1b1b dance performing the modular
# reduction), then folds in the byte rotations of r0 and r0^r2.
sub enctransform()
{ my @s = ($s0,$s1,$s2,$s3);
  my $i = shift;
  my $tmp = $tbl;
  my $r2  = $key ;

	&mov	($acc,$s[$i]);
	&and	($acc,0x80808080);	# high bit of each byte
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&lea	($r2,&DWP(0,$s[$i],$s[$i]));	# per-byte <<1 (pre-mask)
	&sub	($acc,$tmp);		# 0x80 -> 0x7f in marked bytes
	&and	($r2,0xfefefefe);	# drop bits shifted across lanes
	&and	($acc,0x1b1b1b1b);	# reduction poly where high bit was set
	&mov	($tmp,$s[$i]);
	&xor	($acc,$r2);	# r2

	&xor	($s[$i],$acc);	# r0 ^ r2
	&rotl	($s[$i],24);
	# Original lacked the ";" here, so this statement was parsed as
	# a bitwise AND of this call's result with the next one -- it
	# worked by accident; the separator makes the intent explicit.
	&xor	($s[$i],$acc);	# ROTATE(r2^r0,24) ^ r2
	&rotr	($tmp,16);
	&xor	($s[$i],$tmp);
	&rotr	($tmp,8);
	&xor	($s[$i],$tmp);
}
498
&function_begin_B("_x86_AES_encrypt_compact");
	# note that caller is expected to allocate stack frame for me!
	# In:  $s0..$s3 - plaintext block, $key - key schedule,
	#      $tbl - table pointer (Te4 copies start at $tbl-128).
	# Out: $s0..$s3 - ciphertext block.
	&mov	($__key,$key);			# save key

	&xor	($s0,&DWP(0,$key));		# xor with key
	&xor	($s1,&DWP(4,$key));
	&xor	($s2,&DWP(8,$key));
	&xor	($s3,&DWP(12,$key));

	&mov	($acc,&DWP(240,$key));		# load key->rounds
	&lea	($acc,&DWP(-2,$acc,$acc));	# (rounds-1)*2 ...
	&lea	($acc,&DWP(0,$key,$acc,8));	# ... *8 = last-round key ptr
	&mov	($__end,$acc);			# end of key schedule

	# prefetch Te4: the loaded values are discarded, the loads only
	# pull all of the 256-byte S-box into L1 up front (cache-timing
	# countermeasure discussed at the top of this file)
	&mov	($key,&DWP(0-128,$tbl));
	&mov	($acc,&DWP(32-128,$tbl));
	&mov	($key,&DWP(64-128,$tbl));
	&mov	($acc,&DWP(96-128,$tbl));
	&mov	($key,&DWP(128-128,$tbl));
	&mov	($acc,&DWP(160-128,$tbl));
	&mov	($key,&DWP(192-128,$tbl));
	&mov	($acc,&DWP(224-128,$tbl));

	&set_label("loop",16);

		# SubBytes/ShiftRows (extra arg keeps $key slot intact) ...
		&enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
		&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
		&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
		&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
		# ... then MixColumns per word ...
		&enctransform(2);
		&enctransform(3);
		&enctransform(0);
		&enctransform(1);
		&mov 	($key,$__key);
		&mov	($tbl,$__tbl);
		&add	($key,16);		# advance rd_key
		# ... and AddRoundKey
		&xor	($s0,&DWP(0,$key));
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));

	&cmp	($key,$__end);
	&mov	($__key,$key);
	&jb	(&label("loop"));

	# last round: SubBytes/ShiftRows only, no MixColumns
	&enccompact(0,$tbl,$s0,$s1,$s2,$s3);
	&enccompact(1,$tbl,$s1,$s2,$s3,$s0);
	&enccompact(2,$tbl,$s2,$s3,$s0,$s1);
	&enccompact(3,$tbl,$s3,$s0,$s1,$s2);

	&xor	($s0,&DWP(16,$key));
	&xor	($s1,&DWP(20,$key));
	&xor	($s2,&DWP(24,$key));
	&xor	($s3,&DWP(28,$key));

	&ret	();
&function_end_B("_x86_AES_encrypt_compact");
557
558######################################################################
559# "Compact" SSE block function.
560######################################################################
561#
562# Performance is not actually extraordinary in comparison to pure
563# x86 code. In particular encrypt performance is virtually the same.
564# Decrypt performance on the other hand is 15-20% better on newer
565# µ-archs [but we're thankful for *any* improvement here], and ~50%
566# better on PIII:-) And additionally on the pros side this code
567# eliminates redundant references to stack and thus relieves/
568# minimizes the pressure on the memory bus.
569#
570# MMX register layout lsb
571# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
572# | mm4 | mm0 |
573# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
574# | s3 | s2 | s1 | s0 |
575# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
576# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
577# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
578#
579# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
# In these terms encryption and decryption "compact" permutation
581# matrices can be depicted as following:
582#
583# encryption lsb # decryption lsb
584# +----++----+----+----+----+ # +----++----+----+----+----+
585# | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
586# +----++----+----+----+----+ # +----++----+----+----+----+
587# | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
588# +----++----+----+----+----+ # +----++----+----+----+----+
589# | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
590# +----++----+----+----+----+ # +----++----+----+----+----+
591# | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
592# +----++----+----+----+----+ # +----++----+----+----+----+
593#
594######################################################################
595# Why not xmm registers? Short answer. It was actually tested and
596# was not any faster, but *contrary*, most notably on Intel CPUs.
597# Longer answer. Main advantage of using mm registers is that movd
598# latency is lower, especially on Intel P4. While arithmetic
599# instructions are twice as many, they can be scheduled every cycle
600# and not every second one when they are operating on xmm register,
601# so that "arithmetic throughput" remains virtually the same. And
602# finally the code can be executed even on elder SSE-only CPUs:-)
603
# One full "compact" MMX/SSE encryption round (minus MixColumns, which
# the caller does in MMX): unpacks the state held in mm0/mm4 with
# pshufw, runs sixteen byte lookups through the 256-byte Te4 at
# $tbl-128, and repacks the results into mm0/mm4.  Numeric comments
# are state byte indexes per the permutation tables above.
sub sse_enccompact()
{
	&pshufw	("mm1","mm0",0x08);		#  5, 4, 1, 0
	&pshufw	("mm5","mm4",0x0d);		# 15,14,11,10
	&movd	("eax","mm1");			#  5, 4, 1, 0
	&movd	("ebx","mm5");			# 15,14,11,10

	&movz	($acc,&LB("eax"));		#  0
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
	&movz	("edx",&HB("eax"));		#  1
	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
	&shl	("edx",8);			#  1
	&shr	("eax",16);			#  5, 4

	&movz	($acc,&LB("ebx"));		# 10
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 10
	&shl	($acc,16);			# 10
	&or	("ecx",$acc);			# 10
	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
	&movz	($acc,&HB("ebx"));		# 11
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 11
	&shl	($acc,24);			# 11
	&or	("edx",$acc);			# 11
	&shr	("ebx",16);			# 15,14

	&movz	($acc,&HB("eax"));		#  5
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  5
	&shl	($acc,8);			#  5
	&or	("ecx",$acc);			#  5
	&movz	($acc,&HB("ebx"));		# 15
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 15
	&shl	($acc,24);			# 15
	&or	("ecx",$acc);			# 15
	&movd	("mm0","ecx");			# t[0] collected

	&movz	($acc,&LB("eax"));		#  4
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  4
	&movd	("eax","mm2");			#  7, 6, 3, 2
	&movz	($acc,&LB("ebx"));		# 14
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 14
	&shl	($acc,16);			# 14
	&or	("ecx",$acc);			# 14

	&movd	("ebx","mm6");			# 13,12, 9, 8
	&movz	($acc,&HB("eax"));		#  3
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  3
	&shl	($acc,24);			#  3
	&or	("ecx",$acc);			#  3
	&movz	($acc,&HB("ebx"));		#  9
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  9
	&shl	($acc,8);			#  9
	&or	("ecx",$acc);			#  9
	&movd	("mm1","ecx");			# t[1] collected

	&movz	($acc,&LB("ebx"));		#  8
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  8
	&shr	("ebx",16);			# 13,12
	&movz	($acc,&LB("eax"));		#  2
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  2
	&shl	($acc,16);			#  2
	&or	("ecx",$acc);			#  2
	&shr	("eax",16);			#  7, 6

	&punpckldq	("mm0","mm1");		# t[0,1] collected

	&movz	($acc,&HB("eax"));		#  7
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  7
	&shl	($acc,24);			#  7
	&or	("ecx",$acc);			#  7
	&and	("eax",0xff);			#  6
	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  6
	&shl	("eax",16);			#  6
	&or	("edx","eax");			#  6
	&movz	($acc,&HB("ebx"));		# 13
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 13
	&shl	($acc,8);			# 13
	&or	("ecx",$acc);			# 13
	&movd	("mm4","ecx");			# t[2] collected
	&and	("ebx",0xff);			# 12
	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	# 12
	&or	("edx","ebx");			# 12
	&movd	("mm5","edx");			# t[3] collected

	&punpckldq	("mm4","mm5");		# t[2,3] collected
}
690
 if (!$x86only) {
&function_begin_B("_sse_AES_encrypt_compact");
	# MMX variant of the compact routine: state stays in mm0/mm4,
	# MixColumns is computed with packed-byte MMX arithmetic below.
	&pxor	("mm0",&QWP(0,$key));	#  7, 6, 5, 4, 3, 2, 1, 0
	&pxor	("mm4",&QWP(8,$key));	# 15,14,13,12,11,10, 9, 8

	# note that caller is expected to allocate stack frame for me!
	&mov	($acc,&DWP(240,$key));		# load key->rounds
	&lea	($acc,&DWP(-2,$acc,$acc));
	&lea	($acc,&DWP(0,$key,$acc,8));
	&mov	($__end,$acc);			# end of key schedule

	&mov	($s0,0x1b1b1b1b);		# magic constant (AES reduction poly per byte)
	&mov	(&DWP(8,"esp"),$s0);
	&mov	(&DWP(12,"esp"),$s0);

	# prefetch Te4 (loads discarded; pulls S-box into L1 up front)
	&mov	($s0,&DWP(0-128,$tbl));
	&mov	($s1,&DWP(32-128,$tbl));
	&mov	($s2,&DWP(64-128,$tbl));
	&mov	($s3,&DWP(96-128,$tbl));
	&mov	($s0,&DWP(128-128,$tbl));
	&mov	($s1,&DWP(160-128,$tbl));
	&mov	($s2,&DWP(192-128,$tbl));
	&mov	($s3,&DWP(224-128,$tbl));

	&set_label("loop",16);
		&sse_enccompact();
		&add	($key,16);
		&cmp	($key,$__end);
		&ja	(&label("out"));

		# MMX MixColumns: pcmpgtb extracts each byte's sign bit,
		# masking in 0x1b where the xtime doubling overflows
		&movq	("mm2",&QWP(8,"esp"));
		&pxor	("mm3","mm3");	&pxor	("mm7","mm7");
		&movq	("mm1","mm0");	&movq	("mm5","mm4");	# r0
		&pcmpgtb("mm3","mm0");	&pcmpgtb("mm7","mm4");
		&pand	("mm3","mm2");	&pand	("mm7","mm2");
		&pshufw	("mm2","mm0",0xb1); &pshufw	("mm6","mm4",0xb1);# ROTATE(r0,16)
		&paddb	("mm0","mm0");	&paddb	("mm4","mm4");
		&pxor	("mm0","mm3");	&pxor	("mm4","mm7");	# = r2
		&pshufw	("mm3","mm2",0xb1); &pshufw	("mm7","mm6",0xb1);# r0
		&pxor	("mm1","mm0");	&pxor	("mm5","mm4");	# r0^r2
		&pxor	("mm0","mm2");	&pxor	("mm4","mm6");	# ^= ROTATE(r0,16)

		&movq	("mm2","mm3");	&movq	("mm6","mm7");
		&pslld	("mm3",8);	&pslld	("mm7",8);
		&psrld	("mm2",24);	&psrld	("mm6",24);
		&pxor	("mm0","mm3");	&pxor	("mm4","mm7");	# ^= r0<<8
		&pxor	("mm0","mm2");	&pxor	("mm4","mm6");	# ^= r0>>24

		&movq	("mm3","mm1");	&movq	("mm7","mm5");
		&movq	("mm2",&QWP(0,$key));	&movq	("mm6",&QWP(8,$key));
		&psrld	("mm1",8);	&psrld	("mm5",8);
		&mov	($s0,&DWP(0-128,$tbl));
		&pslld	("mm3",24);	&pslld	("mm7",24);
		&mov	($s1,&DWP(64-128,$tbl));
		&pxor	("mm0","mm1");	&pxor	("mm4","mm5");	# ^= (r2^r0)<<8
		&mov	($s2,&DWP(128-128,$tbl));
		&pxor	("mm0","mm3");	&pxor	("mm4","mm7");	# ^= (r2^r0)>>24
		&mov	($s3,&DWP(192-128,$tbl));

		&pxor	("mm0","mm2");	&pxor	("mm4","mm6");	# AddRoundKey
	&jmp	(&label("loop"));

	&set_label("out",16);
	# final AddRoundKey after the last (MixColumns-free) round
	&pxor	("mm0",&QWP(0,$key));
	&pxor	("mm4",&QWP(8,$key));

	&ret	();
&function_end_B("_sse_AES_encrypt_compact");
 }
761
762######################################################################
763# Vanilla block function.
764######################################################################
765
# One quarter of a standard table-driven encryption round using the
# 1KB Te table: builds destination word $i from four dword lookups,
# each keyed by one byte of the rotated state.  The $i==3 call also
# restores saved words and reloads $key from its stack slot.
sub encstep()
{ my ($i,$te,@s) = @_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# lines marked with #%e?x[i] denote "reordered" instructions...
	if ($i==3)  { &mov	($key,$__key); }##%edx
	else        { &mov	($out,$s[0]);
			&and	($out,0xFF); }
	if ($i==1)  { &shr	($s[0],16); }#%ebx[1]
	if ($i==2)  { &shr	($s[0],24); }#%ecx[2]
	&mov	($out,&DWP(0,$te,$out,8));

	if ($i==3)  { $tmp=$s[1]; }##%eax
			&movz	($tmp,&HB($s[1]));
	&xor	($out,&DWP(3,$te,$tmp,8));

	if ($i==3)  { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
	else        { &mov	($tmp,$s[2]);
			&shr	($tmp,16); }
	if ($i==2)  { &and	($s[1],0xFF); }#%edx[2]
	&and	($tmp,0xFF);
	&xor	($out,&DWP(2,$te,$tmp,8));

	if ($i==3)  { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
	elsif($i==2){ &movz	($tmp,&HB($s[3])); }#%ebx[2]
	else        { &mov	($tmp,$s[3]);
			&shr	($tmp,24) }
	&xor	($out,&DWP(1,$te,$tmp,8));
	if ($i<2)   { &mov	(&DWP(4+4*$i,"esp"),$out); }
	if ($i==3)  { &mov	($s[3],$acc); }
	&comment();
}
799
# Last-round variant of encstep(): no MixColumns, so instead of full
# Te words it masks single S-box bytes out of Te entries (offsets 0/2
# pick byte-rotated copies), avoiding a separate Te4 table.
sub enclast()
{ my ($i,$te,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	if ($i==3)  { &mov	($key,$__key); }##%edx
	else        { &mov	($out,$s[0]); }
	&and	($out,0xFF);
	if ($i==1)  { &shr	($s[0],16); }#%ebx[1]
	if ($i==2)  { &shr	($s[0],24); }#%ecx[2]
	&mov	($out,&DWP(2,$te,$out,8));
	&and	($out,0x000000ff);

	if ($i==3)  { $tmp=$s[1]; }##%eax
			&movz	($tmp,&HB($s[1]));
	&mov	($tmp,&DWP(0,$te,$tmp,8));
	&and	($tmp,0x0000ff00);
	&xor	($out,$tmp);

	if ($i==3)  { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
	else        { &mov	($tmp,$s[2]);
			&shr	($tmp,16); }
	if ($i==2)  { &and	($s[1],0xFF); }#%edx[2]
	&and	($tmp,0xFF);
	&mov	($tmp,&DWP(0,$te,$tmp,8));
	&and	($tmp,0x00ff0000);
	&xor	($out,$tmp);

	if ($i==3)  { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
	elsif($i==2){ &movz	($tmp,&HB($s[3])); }#%ebx[2]
	else        { &mov	($tmp,$s[3]);
			&shr	($tmp,24); }
	&mov	($tmp,&DWP(2,$te,$tmp,8));
	&and	($tmp,0xff000000);
	&xor	($out,$tmp);
	if ($i<2)   { &mov	(&DWP(4+4*$i,"esp"),$out); }
	if ($i==3)  { &mov	($s[3],$acc); }
}
838
&function_begin_B("_x86_AES_encrypt");
	# Full-table encryption body.  In: $s0..$s3 - block, $key - key
	# schedule, $tbl - 1KB Te table; caller allocates the stack frame.
	# Emits either a generic key-length loop ($small_footprint) or
	# unrolled 10/12/14-round paths.
	if ($vertical_spin) {
		# I need high parts of volatile registers to be accessible...
		&exch	($s1="edi",$key="ebx");
		&mov	($s2="esi",$acc="ecx");
	}

	# note that caller is expected to allocate stack frame for me!
	&mov	($__key,$key);			# save key

	&xor	($s0,&DWP(0,$key));		# xor with key
	&xor	($s1,&DWP(4,$key));
	&xor	($s2,&DWP(8,$key));
	&xor	($s3,&DWP(12,$key));

	&mov	($acc,&DWP(240,$key));		# load key->rounds

	if ($small_footprint) {
	    # loop until $key reaches the last-round key:
	    # end = key + (rounds-1)*16
	    &lea	($acc,&DWP(-2,$acc,$acc));
	    &lea	($acc,&DWP(0,$key,$acc,8));
	    &mov	($__end,$acc);		# end of key schedule

	    &set_label("loop",16);
		if ($vertical_spin) {
		    &encvert($tbl,$s0,$s1,$s2,$s3);
		} else {
		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
		}
		&add	($key,16);		# advance rd_key
		&xor	($s0,&DWP(0,$key));
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));
	    &cmp	($key,$__end);
	    &mov	($__key,$key);
	    &jb	(&label("loop"));
	}
	else {
	    # dispatch on rounds, then fall through: 14 -> 12 -> 10
	    &cmp	($acc,10);
	    &jle	(&label("10rounds"));
	    &cmp	($acc,12);
	    &jle	(&label("12rounds"));

	&set_label("14rounds",4);
	    for ($i=1;$i<3;$i++) {
		if ($vertical_spin) {
		    &encvert($tbl,$s0,$s1,$s2,$s3);
		} else {
		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
		}
		&xor	($s0,&DWP(16*$i+0,$key));
		&xor	($s1,&DWP(16*$i+4,$key));
		&xor	($s2,&DWP(16*$i+8,$key));
		&xor	($s3,&DWP(16*$i+12,$key));
	    }
	    &add	($key,32);
	    &mov	($__key,$key);		# advance rd_key
	&set_label("12rounds",4);
	    for ($i=1;$i<3;$i++) {
		if ($vertical_spin) {
		    &encvert($tbl,$s0,$s1,$s2,$s3);
		} else {
		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
		}
		&xor	($s0,&DWP(16*$i+0,$key));
		&xor	($s1,&DWP(16*$i+4,$key));
		&xor	($s2,&DWP(16*$i+8,$key));
		&xor	($s3,&DWP(16*$i+12,$key));
	    }
	    &add	($key,32);
	    &mov	($__key,$key);		# advance rd_key
	&set_label("10rounds",4);
	    for ($i=1;$i<10;$i++) {
		if ($vertical_spin) {
		    &encvert($tbl,$s0,$s1,$s2,$s3);
		} else {
		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
		}
		&xor	($s0,&DWP(16*$i+0,$key));
		&xor	($s1,&DWP(16*$i+4,$key));
		&xor	($s2,&DWP(16*$i+8,$key));
		&xor	($s3,&DWP(16*$i+12,$key));
	    }
	}

	if ($vertical_spin) {
	    # "reincarnate" some registers for "horizontal" spin...
	    &mov	($s1="ebx",$key="edi");
	    &mov	($s2="ecx",$acc="esi");
	}
	# last round without MixColumns, then final AddRoundKey
	&enclast(0,$tbl,$s0,$s1,$s2,$s3);
	&enclast(1,$tbl,$s1,$s2,$s3,$s0);
	&enclast(2,$tbl,$s2,$s3,$s0,$s1);
	&enclast(3,$tbl,$s3,$s0,$s1,$s2);

	&add	($key,$small_footprint?16:160);
	&xor	($s0,&DWP(0,$key));
	&xor	($s1,&DWP(4,$key));
	&xor	($s2,&DWP(8,$key));
	&xor	($s3,&DWP(12,$key));

	&ret	();

953
954&set_label("AES_Te",64); # Yes! I keep it in the code segment!
955 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
956 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
957 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
958 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
959 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
960 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
961 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
962 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
963 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
964 &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
965 &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
966 &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
967 &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
968 &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
969 &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
970 &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
971 &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
972 &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
973 &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
974 &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
975 &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
976 &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
977 &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
978 &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
979 &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
980 &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
981 &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
982 &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
983 &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
984 &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
985 &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
986 &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
987 &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
988 &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
989 &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
990 &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
991 &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
992 &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
993 &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
994 &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
995 &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
996 &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
997 &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
998 &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
999 &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
1000 &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
1001 &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
1002 &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
1003 &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
1004 &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
1005 &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
1006 &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
1007 &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
1008 &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
1009 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
1010 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
1011 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
1012 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
1013 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
1014 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
1015 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
1016 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
1017 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
1018 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
1019
1020#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
1021 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1022 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1023 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1024 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1025 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1026 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1027 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1028 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1029 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1030 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1031 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1032 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1033 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1034 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1035 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1036 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1037 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1038 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1039 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1040 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1041 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1042 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1043 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1044 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1045 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1046 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1047 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1048 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1049 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1050 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1051 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1052 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1053
1054 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1055 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1056 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1057 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1058 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1059 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1060 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1061 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1062 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1063 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1064 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1065 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1066 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1067 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1068 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1069 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1070 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1071 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1072 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1073 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1074 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1075 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1076 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1077 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1078 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1079 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1080 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1081 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1082 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1083 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1084 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1085 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1086
1087 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1088 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1089 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1090 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1091 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1092 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1093 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1094 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1095 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1096 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1097 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1098 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1099 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1100 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1101 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1102 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1103 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1104 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1105 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1106 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1107 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1108 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1109 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1110 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1111 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1112 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1113 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1114 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1115 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1116 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1117 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1118 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1119
1120 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1121 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1122 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1123 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1124 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1125 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1126 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1127 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1128 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1129 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1130 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1131 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1132 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1133 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1134 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1135 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1136 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1137 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1138 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1139 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1140 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1141 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1142 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1143 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1144 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1145 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1146 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1147 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1148 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1149 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1150 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1151 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1152#rcon:
1153 &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
1154 &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
1155 &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
1156 &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
1157&function_end_B("_x86_AES_encrypt");
1158
# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Public entry point. Encrypts the 16-byte block at *inp into *out with
# the expanded key schedule *key. It builds a small cache-line-aligned
# stack frame positioned relative to the key schedule (to control L1
# aliasing), locates the lookup tables position-independently, then
# dispatches to either the SSE or the plain-x86 "compact" block routine
# depending on the CPUID SSE capability bit.
&function_begin("AES_encrypt");
	&mov	($acc,&wparam(0));		# load inp
	&mov	($key,&wparam(2));		# load key

	&mov	($s0,"esp");			# remember caller's stack pointer
	&sub	("esp",36);
	&and	("esp",-64);			# align to cache-line

	# place stack frame just "above" the key schedule
	&lea	($s1,&DWP(-64-63,$key));
	&sub	($s1,"esp");
	&neg	($s1);
	&and	($s1,0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp",$s1);
	&add	("esp",4);	# 4 is reserved for caller's return address
	&mov	($_esp,$s0);			# save stack pointer

	&call   (&label("pic_point"));          # make it PIC!
	&set_label("pic_point");
	&blindpop($tbl);			# pop own address -> $tbl (PIC base)
	&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
	&lea    ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));

	# pick Te4 copy which can't "overlap" with stack frame or key schedule
	&lea	($s1,&DWP(768-4,"esp"));
	&sub	($s1,$tbl);
	&and	($s1,0x300);			# select one of 4 Te4 copies
	&lea	($tbl,&DWP(2048+128,$tbl,$s1));

					if (!$x86only) {
	&bt	(&DWP(0,$s0),25);	# check for SSE bit
	&jnc	(&label("x86"));

	&movq	("mm0",&QWP(0,$acc));		# load input block into mm0:mm4
	&movq	("mm4",&QWP(8,$acc));
	&call	("_sse_AES_encrypt_compact");
	&mov	("esp",$_esp);			# restore stack pointer
	&mov	($acc,&wparam(1));		# load out
	&movq	(&QWP(0,$acc),"mm0");		# write output data
	&movq	(&QWP(8,$acc),"mm4");
	&emms	();				# leave MMX state clean for FPU users
	&function_end_A();
					}
	&set_label("x86",16);
	&mov	($_tbl,$tbl);
	&mov	($s0,&DWP(0,$acc));		# load input data
	&mov	($s1,&DWP(4,$acc));
	&mov	($s2,&DWP(8,$acc));
	&mov	($s3,&DWP(12,$acc));
	&call	("_x86_AES_encrypt_compact");
	&mov	("esp",$_esp);			# restore stack pointer
	&mov	($acc,&wparam(1));		# load out
	&mov	(&DWP(0,$acc),$s0);		# write output data
	&mov	(&DWP(4,$acc),$s1);
	&mov	(&DWP(8,$acc),$s2);
	&mov	(&DWP(12,$acc),$s3);
&function_end("AES_encrypt");
1217
1218#--------------------------------------------------------------------#
1219
1220######################################################################
1221# "Compact" block function
1222######################################################################
1223
# Emit one quarter of a "compact" decryption round for output word $i:
# four byte-wide inverse S-box lookups (table at $td-128) assembled into
# a 32-bit word. Words 0 and 1 are parked on the stack; InvMixColumns is
# applied separately by dectransform(). With the extra 7th argument the
# trailing stack restores are suppressed (used inside the round loop).
sub deccompact()
{ my $Fn = mov;		# NOTE(review): bareword / symbolic name resolved through
			# AUTOLOAD at script run time -- confirm intended vs \&mov
  while ($#_>5) { pop(@_); $Fn=sub{}; }	# 7th arg present: make $Fn a no-op
  my ($i,$td,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

  # $Fn is used in first compact round and its purpose is to
  # void restoration of some values from stack, so that after
  # 4xdeccompact with extra argument $key, $s0 and $s1 values
  # are left there...
  if($i==3)   {	&$Fn	($key,$__key);			}
  else        {	&mov	($out,$s[0]);			}
	&and	($out,0xFF);
	&movz	($out,&BP(-128,$td,$out,1));		# Td4[s0 & 0xff]

	if ($i==3)  { $tmp=$s[1]; }
	&movz	($tmp,&HB($s[1]));
	&movz	($tmp,&BP(-128,$td,$tmp,1));		# Td4[(s1>>8) & 0xff]
	&shl	($tmp,8);
	&xor	($out,$tmp);

	if ($i==3)  { $tmp=$s[2]; &mov ($s[1],$acc); }
	else        { mov ($tmp,$s[2]); }	# NOTE(review): '&' omitted; still
						# dispatched via AUTOLOAD -- verify
	&shr	($tmp,16);
	&and	($tmp,0xFF);
	&movz	($tmp,&BP(-128,$td,$tmp,1));		# Td4[(s2>>16) & 0xff]
	&shl	($tmp,16);
	&xor	($out,$tmp);

	if ($i==3)  { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
	else        { &mov ($tmp,$s[3]); }
	&shr	($tmp,24);
	&movz	($tmp,&BP(-128,$td,$tmp,1));		# Td4[s3>>24]
	&shl	($tmp,24);
	&xor	($out,$tmp);
	if ($i<2)   { &mov (&DWP(4+4*$i,"esp"),$out); }	# park t[0]/t[1] on stack
	if ($i==3)  { &$Fn ($s[3],$__s0); }
}
1263
# must be called with 2,3,0,1 as argument sequence!!!
#
# In-place InvMixColumns of one 32-bit state word $s[$i], processing all
# four packed bytes in parallel. tp2/tp4/tp8 are successive GF(2^8)
# doublings (xtime) of tp1=$s[$i]: the 0x80808080 mask collects each
# byte's high bit, the shr/sub turns it into a per-byte 0x00/0xff mask,
# and 0x1b1b1b1b applies the AES polynomial reduction where needed while
# 0xfefefefe strips inter-byte shift carries. The fixed 2,3,0,1 call
# order makes the tp2/tp4 register aliasing and the $__s* prefetches
# line up across the four calls.
sub dectransform()
{ my @s = ($s0,$s1,$s2,$s3);
  my $i = shift;
  my $tmp = $key;
  my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1);
  my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
  my $tp8 = $tbl;

	# tp2 = xtime(tp1)
	&mov	($acc,$s[$i]);
	&and	($acc,0x80808080);		# high bit of every byte
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&lea	($tp2,&DWP(0,$s[$i],$s[$i]));	# every byte <<1 (with carries)
	&sub	($acc,$tmp);			# 0x80 -> 0xff per-byte mask
	&and	($tp2,0xfefefefe);		# kill inter-byte carries
	&and	($acc,0x1b1b1b1b);		# conditional poly reduction
	&xor	($acc,$tp2);
	&mov	($tp2,$acc);

	# tp4 = xtime(tp2)
	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&lea	($tp4,&DWP(0,$tp2,$tp2));
	&sub	($acc,$tmp);
	&and	($tp4,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	&xor	($tp2,$s[$i]);	# tp2^tp1
	&xor	($acc,$tp4);
	&mov	($tp4,$acc);

	# tp8 = xtime(tp4)
	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&lea	($tp8,&DWP(0,$tp4,$tp4));
	&sub	($acc,$tmp);
	&and	($tp8,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	&xor	($tp4,$s[$i]);	# tp4^tp1
	&rotl	($s[$i],8);	# = ROTATE(tp1,8)
	&xor	($tp8,$acc);

	# combine per the InvMixColumns decomposition
	&xor	($s[$i],$tp2);
	&xor	($tp2,$tp8);
	&rotl	($tp2,24);
	&xor	($s[$i],$tp4);
	&xor	($tp4,$tp8);
	&rotl	($tp4,16);
	&xor	($s[$i],$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
	&rotl	($tp8,8);
	&xor	($s[$i],$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
	&xor	($s[$i],$tp4);	# ^= ROTATE(tp8^tp4^tp1,16)
	&mov	($s[0],$__s0)			if($i==2); #prefetch $s0
	&mov	($s[1],$__s1)			if($i==3); #prefetch $s1
	&mov	($s[2],$__s2)			if($i==1);
	&xor	($s[$i],$tp8);	# ^= ROTATE(tp8,8)

	&mov	($s[3],$__s3)			if($i==1);
	&mov	(&DWP(4+4*$i,"esp"),$s[$i])	if($i>=2);	# spill for next round
}
1324
# Core "compact" AES decryption: state in $s0..$s3, key schedule in $key,
# Td4 table (biased by +128) in $tbl. Runs rounds-1 full rounds of
# deccompact (inverse SubBytes/ShiftRows) + dectransform (InvMixColumns)
# + AddRoundKey, then one final round without column mixing.
&function_begin_B("_x86_AES_decrypt_compact");
	# note that caller is expected to allocate stack frame for me!
	&mov	($__key,$key);			# save key

	&xor	($s0,&DWP(0,$key));		# xor with key
	&xor	($s1,&DWP(4,$key));
	&xor	($s2,&DWP(8,$key));
	&xor	($s3,&DWP(12,$key));

	&mov	($acc,&DWP(240,$key));		# load key->rounds

	# $__end = key + 16*(rounds-1): address of the last-but-one round key
	&lea	($acc,&DWP(-2,$acc,$acc));
	&lea	($acc,&DWP(0,$key,$acc,8));
	&mov	($__end,$acc);			# end of key schedule

	# prefetch Td4 (touch one dword per 32-byte stretch of the 256-byte table)
	&mov	($key,&DWP(0-128,$tbl));
	&mov	($acc,&DWP(32-128,$tbl));
	&mov	($key,&DWP(64-128,$tbl));
	&mov	($acc,&DWP(96-128,$tbl));
	&mov	($key,&DWP(128-128,$tbl));
	&mov	($acc,&DWP(160-128,$tbl));
	&mov	($key,&DWP(192-128,$tbl));
	&mov	($acc,&DWP(224-128,$tbl));

	&set_label("loop",16);

		# inner round: inverse byte substitution (extra arg 1 keeps
		# $s0/$s1 on stack), then InvMixColumns in 2,3,0,1 order
		&deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);
		&deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
		&deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
		&deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
		&dectransform(2);
		&dectransform(3);
		&dectransform(0);
		&dectransform(1);
		&mov 	($key,$__key);
		&mov	($tbl,$__tbl);
		&add	($key,16);		# advance rd_key
		&xor	($s0,&DWP(0,$key));	# AddRoundKey
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));

	&cmp	($key,$__end);
	&mov	($__key,$key);
	&jb	(&label("loop"));

	# final round: substitution only, no InvMixColumns
	&deccompact(0,$tbl,$s0,$s3,$s2,$s1);
	&deccompact(1,$tbl,$s1,$s0,$s3,$s2);
	&deccompact(2,$tbl,$s2,$s1,$s0,$s3);
	&deccompact(3,$tbl,$s3,$s2,$s1,$s0);

	&xor	($s0,&DWP(16,$key));		# final AddRoundKey
	&xor	($s1,&DWP(20,$key));
	&xor	($s2,&DWP(24,$key));
	&xor	($s3,&DWP(28,$key));

	&ret	();
&function_end_B("_x86_AES_decrypt_compact");
1384
1385######################################################################
1386# "Compact" SSE block function.
1387######################################################################
1388
# MMX variant of one "compact" decryption round's byte substitution:
# state bytes 0-7 live in mm0 and 8-15 in mm4. pshufw extracts 16-bit
# byte pairs into eax/ebx, each of the 16 bytes is pushed through the
# byte-wide Td4 table at $tbl-128, and the results are reassembled into
# mm0:mm4 (t[0,1] and t[2,3]). Inline comments track which state byte
# each instruction handles.
sub sse_deccompact()
{
	&pshufw	("mm1","mm0",0x0c);		#  7, 6, 1, 0
	&movd	("eax","mm1");			#  7, 6, 1, 0

	&pshufw	("mm5","mm4",0x09);		# 13,12,11,10
	&movz	($acc,&LB("eax"));		#  0
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
	&movd	("ebx","mm5");			# 13,12,11,10
	&movz	("edx",&HB("eax"));		#  1
	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
	&shl	("edx",8);			#  1

	&pshufw	("mm2","mm0",0x06);		#  3, 2, 5, 4
	&movz	($acc,&LB("ebx"));		# 10
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 10
	&shl	($acc,16);			# 10
	&or	("ecx",$acc);			# 10
	&shr	("eax",16);			#  7, 6
	&movz	($acc,&HB("ebx"));		# 11
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 11
	&shl	($acc,24);			# 11
	&or	("edx",$acc);			# 11
	&shr	("ebx",16);			# 13,12

	&pshufw	("mm6","mm4",0x03);		#  9, 8,15,14
	&movz	($acc,&HB("eax"));		#  7
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  7
	&shl	($acc,24);			#  7
	&or	("ecx",$acc);			#  7
	&movz	($acc,&HB("ebx"));		# 13
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 13
	&shl	($acc,8);			# 13
	&or	("ecx",$acc);			# 13
	&movd	("mm0","ecx");			# t[0] collected

	&movz	($acc,&LB("eax"));		#  6
	&movd	("eax","mm2");			#  3, 2, 5, 4
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  6
	&shl	("ecx",16);			#  6
	&movz	($acc,&LB("ebx"));		# 12
	&movd	("ebx","mm6");			#  9, 8,15,14
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 12
	&or	("ecx",$acc);			# 12

	&movz	($acc,&LB("eax"));		#  4
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  4
	&or	("edx",$acc);			#  4
	&movz	($acc,&LB("ebx"));		# 14
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 14
	&shl	($acc,16);			# 14
	&or	("edx",$acc);			# 14
	&movd	("mm1","edx");			# t[1] collected

	&movz	($acc,&HB("eax"));		#  5
	&movz	("edx",&BP(-128,$tbl,$acc,1));	#  5
	&shl	("edx",8);			#  5
	&movz	($acc,&HB("ebx"));		# 15
	&shr	("eax",16);			#  3, 2
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 15
	&shl	($acc,24);			# 15
	&or	("edx",$acc);			# 15
	&shr	("ebx",16);			#  9, 8

	&punpckldq	("mm0","mm1");		# t[0,1] collected

	&movz	($acc,&HB("ebx"));		#  9
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  9
	&shl	($acc,8);			#  9
	&or	("ecx",$acc);			#  9
	&and	("ebx",0xff);			#  8
	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	#  8
	&or	("edx","ebx");			#  8
	&movz	($acc,&LB("eax"));		#  2
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  2
	&shl	($acc,16);			#  2
	&or	("edx",$acc);			#  2
	&movd	("mm4","edx");			# t[2] collected
	&movz	("eax",&HB("eax"));		#  3
	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  3
	&shl	("eax",24);			#  3
	&or	("ecx","eax");			#  3
	&movd	("mm5","ecx");			# t[3] collected

	&punpckldq	("mm4","mm5");		# t[2,3] collected
}
1475
					if (!$x86only) {
# MMX/SSE "compact" AES decryption: state in mm0:mm4, key schedule in
# $key, Td4 in $tbl. Each loop iteration does the byte substitution via
# sse_deccompact() followed by an MMX implementation of InvMixColumns:
# tp2/tp4/tp8 doublings are computed with pcmpgtb (per-byte sign mask) +
# pand 0x1b.. + paddb, mirroring what dectransform() does with scalar
# masks.
&function_begin_B("_sse_AES_decrypt_compact");
	&pxor	("mm0",&QWP(0,$key));	#  7, 6, 5, 4, 3, 2, 1, 0
	&pxor	("mm4",&QWP(8,$key));	# 15,14,13,12,11,10, 9, 8

	# note that caller is expected to allocate stack frame for me!
	&mov	($acc,&DWP(240,$key));		# load key->rounds
	&lea	($acc,&DWP(-2,$acc,$acc));	# $__end = key + 16*(rounds-1)
	&lea	($acc,&DWP(0,$key,$acc,8));
	&mov	($__end,$acc);			# end of key schedule

	&mov	($s0,0x1b1b1b1b);		# magic constant (AES poly, replicated)
	&mov	(&DWP(8,"esp"),$s0);
	&mov	(&DWP(12,"esp"),$s0);

	# prefetch Td4
	&mov	($s0,&DWP(0-128,$tbl));
	&mov	($s1,&DWP(32-128,$tbl));
	&mov	($s2,&DWP(64-128,$tbl));
	&mov	($s3,&DWP(96-128,$tbl));
	&mov	($s0,&DWP(128-128,$tbl));
	&mov	($s1,&DWP(160-128,$tbl));
	&mov	($s2,&DWP(192-128,$tbl));
	&mov	($s3,&DWP(224-128,$tbl));

	&set_label("loop",16);
		&sse_deccompact();
		&add	($key,16);
		&cmp	($key,$__end);
		&ja	(&label("out"));	# last round: skip InvMixColumns

		# ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
		&movq	("mm3","mm0");		&movq	("mm7","mm4");
		&movq	("mm2","mm0",1);	&movq	("mm6","mm4",1);
		&movq	("mm1","mm0");		&movq	("mm5","mm4");
		&pshufw	("mm0","mm0",0xb1);	&pshufw	("mm4","mm4",0xb1);# = ROTATE(tp0,16)
		&pslld	("mm2",8);		&pslld	("mm6",8);
		&psrld	("mm3",8);		&psrld	("mm7",8);
		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= tp0<<8
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp0>>8
		&pslld	("mm2",16);		&pslld	("mm6",16);
		&psrld	("mm3",16);		&psrld	("mm7",16);
		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= tp0<<24
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp0>>24

		# xtime: pcmpgtb builds 0xff mask for bytes with high bit set,
		# pand selects the 0x1b reduction, paddb doubles each byte
		&movq	("mm3",&QWP(8,"esp"));
		&pxor	("mm2","mm2");		&pxor	("mm6","mm6");
		&pcmpgtb("mm2","mm1");		&pcmpgtb("mm6","mm5");
		&pand	("mm2","mm3");		&pand	("mm6","mm3");
		&paddb	("mm1","mm1");		&paddb	("mm5","mm5");
		&pxor	("mm1","mm2");		&pxor	("mm5","mm6");	# tp2
		&movq	("mm3","mm1");		&movq	("mm7","mm5");
		&movq	("mm2","mm1");		&movq	("mm6","mm5");
		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp2
		&pslld	("mm3",24);		&pslld	("mm7",24);
		&psrld	("mm2",8);		&psrld	("mm6",8);
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp2<<24
		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= tp2>>8

		&movq	("mm2",&QWP(8,"esp"));
		&pxor	("mm3","mm3");		&pxor	("mm7","mm7");
		&pcmpgtb("mm3","mm1");		&pcmpgtb("mm7","mm5");
		&pand	("mm3","mm2");		&pand	("mm7","mm2");
		&paddb	("mm1","mm1");		&paddb	("mm5","mm5");
		&pxor	("mm1","mm3");		&pxor	("mm5","mm7");	# tp4
		&pshufw	("mm3","mm1",0xb1);	&pshufw	("mm7","mm5",0xb1);
		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp4
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= ROTATE(tp4,16)

		&pxor	("mm3","mm3");		&pxor	("mm7","mm7");
		&pcmpgtb("mm3","mm1");		&pcmpgtb("mm7","mm5");
		&pand	("mm3","mm2");		&pand	("mm7","mm2");
		&paddb	("mm1","mm1");		&paddb	("mm5","mm5");
		&pxor	("mm1","mm3");		&pxor	("mm5","mm7");	# tp8
		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp8
		&movq	("mm3","mm1");		&movq	("mm7","mm5");
		&pshufw	("mm2","mm1",0xb1);	&pshufw	("mm6","mm5",0xb1);
		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= ROTATE(tp8,16)
		&pslld	("mm1",8);		&pslld	("mm5",8);
		&psrld	("mm3",8);		&psrld	("mm7",8);
		&movq	("mm2",&QWP(0,$key));	&movq	("mm6",&QWP(8,$key));	# next round key
		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp8<<8
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp8>>8
		&mov	($s0,&DWP(0-128,$tbl));			# re-prefetch Td4
		&pslld	("mm1",16);		&pslld	("mm5",16);
		&mov	($s1,&DWP(64-128,$tbl));
		&psrld	("mm3",16);		&psrld	("mm7",16);
		&mov	($s2,&DWP(128-128,$tbl));
		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp8<<24
		&mov	($s3,&DWP(192-128,$tbl));
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp8>>24

		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# AddRoundKey
	&jmp	(&label("loop"));

	&set_label("out",16);
	&pxor	("mm0",&QWP(0,$key));		# final AddRoundKey
	&pxor	("mm4",&QWP(8,$key));

	&ret	();
&function_end_B("_sse_AES_decrypt_compact");
					}
1578
1579######################################################################
1580# Vanilla block function.
1581######################################################################
1582
# Emit one quarter of a full ("vanilla") decryption round for output
# word $i: four lookups in the 2KB Td table at $td, each entry fetched
# at a different byte offset (0/3/2/1) to realize the four rotations.
# Words 0 and 1 are parked on the stack; the $i==3 call also restores
# $key/$s1/$s0 from the frame.
sub decstep()
{ my ($i,$td,@s) = @_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# no instructions are reordered, as performance appears
	# optimal... or rather that all attempts to reorder didn't
	# result in better performance [which by the way is not a
	# bit lower than encryption].
	if($i==3)   {	&mov	($key,$__key);			}
	else        {	&mov	($out,$s[0]);			}
	&and	($out,0xFF);
	&mov	($out,&DWP(0,$td,$out,8));	# Td0[s0 & 0xff]

	if ($i==3)  {	$tmp=$s[1];			}
	&movz	($tmp,&HB($s[1]));
	&xor	($out,&DWP(3,$td,$tmp,8));	# Td1[(s1>>8) & 0xff]

	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$acc);	}
	else        {	&mov	($tmp,$s[2]);		}
	&shr	($tmp,16);
	&and	($tmp,0xFF);
	&xor	($out,&DWP(2,$td,$tmp,8));	# Td2[(s2>>16) & 0xff]

	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);	}
	else        {	&mov	($tmp,$s[3]);		}
	&shr	($tmp,24);
	&xor	($out,&DWP(1,$td,$tmp,8));	# Td3[s3>>24]
	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
	if ($i==3)  {	&mov	($s[3],$__s0);			}
	&comment();
}
1615
# Emit one quarter of the final decryption round for output word $i,
# using the byte-wide Td4 table that follows the 2KB Td table. The
# $i==0 call prefetches Td4 and leaves $td biased by +2048 so lookups
# use &BP(0,...); the $i==3 call undoes the bias with lea -2048.
sub declast()
{ my ($i,$td,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	if($i==0)   {	&lea	($td,&DWP(2048+128,$td));	# point past Td, +128 bias
			# prefetch Td4 (one dword per 32 bytes)
			&mov	($tmp,&DWP(0-128,$td));
			&mov	($acc,&DWP(32-128,$td));
			&mov	($tmp,&DWP(64-128,$td));
			&mov	($acc,&DWP(96-128,$td));
			&mov	($tmp,&DWP(128-128,$td));
			&mov	($acc,&DWP(160-128,$td));
			&mov	($tmp,&DWP(192-128,$td));
			&mov	($acc,&DWP(224-128,$td));
			&lea	($td,&DWP(-128,$td));		}	# drop the +128 bias
	if($i==3)   {	&mov	($key,$__key);			}
	else        {	&mov	($out,$s[0]);			}
	&and	($out,0xFF);
	&movz	($out,&BP(0,$td,$out,1));		# Td4[s0 & 0xff]

	if ($i==3)  {	$tmp=$s[1];			}
	&movz	($tmp,&HB($s[1]));
	&movz	($tmp,&BP(0,$td,$tmp,1));		# Td4[(s1>>8) & 0xff]
	&shl	($tmp,8);
	&xor	($out,$tmp);

	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$acc);	}
	else        {	mov ($tmp,$s[2]);		}	# NOTE(review): '&' omitted;
								# still dispatched via AUTOLOAD
	&shr	($tmp,16);
	&and	($tmp,0xFF);
	&movz	($tmp,&BP(0,$td,$tmp,1));		# Td4[(s2>>16) & 0xff]
	&shl	($tmp,16);
	&xor	($out,$tmp);

	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);	}
	else        {	&mov	($tmp,$s[3]);		}
	&shr	($tmp,24);
	&movz	($tmp,&BP(0,$td,$tmp,1));		# Td4[s3>>24]
	&shl	($tmp,24);
	&xor	($out,$tmp);
	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}	# park t[0]/t[1]
	if ($i==3)  {	&mov	($s[3],$__s0);
			&lea	($td,&DWP(-2048,$td));	}	# restore $td to Td base
}
1660
1661&function_begin_B("_x86_AES_decrypt");
1662 # note that caller is expected to allocate stack frame for me!
1663 &mov ($__key,$key); # save key
1664
1665 &xor ($s0,&DWP(0,$key)); # xor with key
1666 &xor ($s1,&DWP(4,$key));
1667 &xor ($s2,&DWP(8,$key));
1668 &xor ($s3,&DWP(12,$key));
1669
1670 &mov ($acc,&DWP(240,$key)); # load key->rounds
1671
1672 if ($small_footprint) {
1673 &lea ($acc,&DWP(-2,$acc,$acc));
1674 &lea ($acc,&DWP(0,$key,$acc,8));
1675 &mov ($__end,$acc); # end of key schedule
1676 &set_label("loop",16);
1677 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1678 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1679 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1680 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1681 &add ($key,16); # advance rd_key
1682 &xor ($s0,&DWP(0,$key));
1683 &xor ($s1,&DWP(4,$key));
1684 &xor ($s2,&DWP(8,$key));
1685 &xor ($s3,&DWP(12,$key));
1686 &cmp ($key,$__end);
1687 &mov ($__key,$key);
1688 &jb (&label("loop"));
1689 }
1690 else {
1691 &cmp ($acc,10);
1692 &jle (&label("10rounds"));
1693 &cmp ($acc,12);
1694 &jle (&label("12rounds"));
1695
1696 &set_label("14rounds",4);
1697 for ($i=1;$i<3;$i++) {
1698 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1699 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1700 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1701 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1702 &xor ($s0,&DWP(16*$i+0,$key));
1703 &xor ($s1,&DWP(16*$i+4,$key));
1704 &xor ($s2,&DWP(16*$i+8,$key));
1705 &xor ($s3,&DWP(16*$i+12,$key));
1706 }
1707 &add ($key,32);
1708 &mov ($__key,$key); # advance rd_key
1709 &set_label("12rounds",4);
1710 for ($i=1;$i<3;$i++) {
1711 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1712 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1713 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1714 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1715 &xor ($s0,&DWP(16*$i+0,$key));
1716 &xor ($s1,&DWP(16*$i+4,$key));
1717 &xor ($s2,&DWP(16*$i+8,$key));
1718 &xor ($s3,&DWP(16*$i+12,$key));
1719 }
1720 &add ($key,32);
1721 &mov ($__key,$key); # advance rd_key
1722 &set_label("10rounds",4);
1723 for ($i=1;$i<10;$i++) {
1724 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1725 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1726 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1727 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1728 &xor ($s0,&DWP(16*$i+0,$key));
1729 &xor ($s1,&DWP(16*$i+4,$key));
1730 &xor ($s2,&DWP(16*$i+8,$key));
1731 &xor ($s3,&DWP(16*$i+12,$key));
1732 }
1733 }
1734
1735 &declast(0,$tbl,$s0,$s3,$s2,$s1);
1736 &declast(1,$tbl,$s1,$s0,$s3,$s2);
1737 &declast(2,$tbl,$s2,$s1,$s0,$s3);
1738 &declast(3,$tbl,$s3,$s2,$s1,$s0);
1739
1740 &add ($key,$small_footprint?16:160);
1741 &xor ($s0,&DWP(0,$key));
1742 &xor ($s1,&DWP(4,$key));
1743 &xor ($s2,&DWP(8,$key));
1744 &xor ($s3,&DWP(12,$key));
1745
1746 &ret ();
1747
1748&set_label("AES_Td",64); # Yes! I keep it in the code segment!
1749 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
1750 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
1751 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
1752 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
1753 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
1754 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
1755 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
1756 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
1757 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
1758 &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
1759 &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
1760 &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
1761 &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
1762 &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
1763 &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
1764 &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
1765 &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
1766 &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
1767 &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
1768 &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
1769 &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
1770 &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
1771 &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
1772 &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
1773 &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
1774 &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
1775 &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
1776 &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
1777 &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
1778 &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
1779 &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
1780 &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
1781 &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
1782 &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
1783 &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
1784 &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
1785 &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
1786 &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
1787 &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
1788 &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
1789 &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
1790 &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
1791 &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
1792 &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
1793 &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
1794 &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
1795 &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
1796 &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
1797 &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
1798 &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
1799 &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
1800 &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
1801 &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
1802 &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
1803 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
1804 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
1805 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
1806 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
1807 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
1808 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
1809 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
1810 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
1811 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
1812 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
1813
1814#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
1815 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1816 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1817 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1818 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1819 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1820 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1821 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1822 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1823 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1824 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1825 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1826 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1827 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1828 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1829 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1830 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1831 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1832 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1833 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1834 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1835 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1836 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1837 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1838 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1839 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1840 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1841 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1842 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1843 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1844 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1845 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1846 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1847
1848 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1849 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1850 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1851 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1852 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1853 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1854 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1855 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1856 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1857 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1858 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1859 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1860 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1861 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1862 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1863 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1864 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1865 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1866 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1867 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1868 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1869 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1870 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1871 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1872 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1873 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1874 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1875 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1876 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1877 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1878 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1879 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1880
1881 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1882 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1883 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1884 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1885 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1886 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1887 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1888 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1889 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1890 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1891 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1892 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1893 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1894 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1895 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1896 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1897 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1898 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1899 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1900 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1901 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1902 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1903 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1904 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1905 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1906 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1907 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1908 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1909 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1910 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1911 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1912 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1913
1914 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1915 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1916 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1917 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1918 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1919 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1920 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1921 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1922 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1923 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1924 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1925 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1926 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1927 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1928 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1929 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1930 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1931 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1932 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1933 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1934 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1935 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1936 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1937 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1938 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1939 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1940 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1941 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1942 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1943 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1944 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1945 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1946&function_end_B("_x86_AES_decrypt");
1947
1948# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Public single-block decrypt entry point.  Builds a small cache-line-
# aligned stack frame positioned relative to the key schedule (so frame
# and schedule land in distinct cache sets), selects a Td4 copy that
# cannot alias either, then dispatches to the SSE or plain-x86 compact
# decrypt routine depending on the OPENSSL_ia32cap SSE bit.
1949&function_begin("AES_decrypt");
1950	&mov	($acc,&wparam(0));	# load inp
1951	&mov	($key,&wparam(2));	# load key
1952
1953	&mov	($s0,"esp");
1954	&sub	("esp",36);
1955	&and	("esp",-64);	# align to cache-line
1956
1957	# place stack frame just "above" the key schedule
1958	&lea	($s1,&DWP(-64-63,$key));
1959	&sub	($s1,"esp");
1960	&neg	($s1);
1961	&and	($s1,0x3C0);	# modulo 1024, but aligned to cache-line
1962	&sub	("esp",$s1);
1963	&add	("esp",4);	# 4 is reserved for caller's return address
1964	&mov	($_esp,$s0);	# save stack pointer
1965
1966	&call	(&label("pic_point"));	# make it PIC!
1967	&set_label("pic_point");
1968	&blindpop($tbl);
1969	&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point"))	if(!$x86only);
1970	&lea	($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
1971
1972	# pick Td4 copy which can't "overlap" with stack frame or key schedule
1973	&lea	($s1,&DWP(768-4,"esp"));
1974	&sub	($s1,$tbl);
1975	&and	($s1,0x300);
1976	&lea	($tbl,&DWP(2048+128,$tbl,$s1));
1977
1978	if (!$x86only) {
1979	&bt	(&DWP(0,$s0),25);	# check for SSE bit
1980	&jnc	(&label("x86"));
1981
	# SSE path: state travels in mm0/mm4 (16 bytes total)
1982	&movq	("mm0",&QWP(0,$acc));
1983	&movq	("mm4",&QWP(8,$acc));
1984	&call	("_sse_AES_decrypt_compact");
1985	&mov	("esp",$_esp);	# restore stack pointer
1986	&mov	($acc,&wparam(1));	# load out
1987	&movq	(&QWP(0,$acc),"mm0");	# write output data
1988	&movq	(&QWP(8,$acc),"mm4");
1989	&emms	();
1990	&function_end_A();
1991	}
1992	&set_label("x86",16);
1993	&mov	($_tbl,$tbl);
1994	&mov	($s0,&DWP(0,$acc));	# load input data
1995	&mov	($s1,&DWP(4,$acc));
1996	&mov	($s2,&DWP(8,$acc));
1997	&mov	($s3,&DWP(12,$acc));
1998	&call	("_x86_AES_decrypt_compact");
1999	&mov	("esp",$_esp);	# restore stack pointer
2000	&mov	($acc,&wparam(1));	# load out
2001	&mov	(&DWP(0,$acc),$s0);	# write output data
2002	&mov	(&DWP(4,$acc),$s1);
2003	&mov	(&DWP(8,$acc),$s2);
2004	&mov	(&DWP(12,$acc),$s3);
2005&function_end("AES_decrypt");
2006
2007# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
2008#			size_t length, const AES_KEY *key,
2009#			unsigned char *ivp,const int enc);
#
# CBC entry point.  Two strategies:
#  - "fast" path for long, 16-byte-multiple inputs (and no hyper-threading,
#    as a cache-timing precaution): may copy the key schedule onto the
#    stack and prefetches the whole 2KB table;
#  - "slow" path for short or ragged inputs: uses the compact (smaller,
#    less cache-revealing) encrypt/decrypt routines and handles a partial
#    last block.
2010{
2011# stack frame layout
2012#             -4(%esp)		# return address	 0(%esp)
2013#              0(%esp)		# s0 backing store	 4(%esp)
2014#              4(%esp)		# s1 backing store	 8(%esp)
2015#              8(%esp)		# s2 backing store	12(%esp)
2016#             12(%esp)		# s3 backing store	16(%esp)
2017#             16(%esp)		# key backup		20(%esp)
2018#             20(%esp)		# end of key schedule	24(%esp)
2019#             24(%esp)		# %ebp backup		28(%esp)
2020#             28(%esp)		# %esp backup
2021my $_inp=&DWP(32,"esp");	# copy of wparam(0)
2022my $_out=&DWP(36,"esp");	# copy of wparam(1)
2023my $_len=&DWP(40,"esp");	# copy of wparam(2)
2024my $_key=&DWP(44,"esp");	# copy of wparam(3)
2025my $_ivp=&DWP(48,"esp");	# copy of wparam(4)
2026my $_tmp=&DWP(52,"esp");	# volatile variable
2027#
2028my $ivec=&DWP(60,"esp");	# ivec[16]
2029my $aes_key=&DWP(76,"esp");	# copy of aes_key
2030my $mark=&DWP(76+240,"esp");	# copy of aes_key->rounds
2031
2032&function_begin("AES_cbc_encrypt");
	# "$s2 eq "ecx"? $s2 : """ is a build-time assertion: if $s2 is not
	# ecx the empty register name makes the assembler fail loudly.
2033	&mov	($s2 eq "ecx"? $s2 : "",&wparam(2));	# load len
2034	&cmp	($s2,0);
2035	&je	(&label("drop_out"));
2036
2037	&call	(&label("pic_point"));		# make it PIC!
2038	&set_label("pic_point");
2039	&blindpop($tbl);
2040	&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
2041
2042	&cmp	(&wparam(5),0);
2043	&lea	($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
2044	&jne	(&label("picked_te"));
2045	&lea	($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
2046	&set_label("picked_te");
2047
2048	# one can argue if this is required
2049	&pushf	();
2050	&cld	();
2051
	# fast path only for len >= $speed_limit, len % 16 == 0, and no HT
2052	&cmp	($s2,$speed_limit);
2053	&jb	(&label("slow_way"));
2054	&test	($s2,15);
2055	&jnz	(&label("slow_way"));
2056	if (!$x86only) {
2057	&bt	(&DWP(0,$s0),28);	# check for hyper-threading bit
2058	&jc	(&label("slow_way"));
2059	}
2060	# pre-allocate aligned stack frame...
2061	&lea	($acc,&DWP(-80-244,"esp"));
2062	&and	($acc,-64);
2063
2064	# ... and make sure it doesn't alias with $tbl modulo 4096
2065	&mov	($s0,$tbl);
2066	&lea	($s1,&DWP(2048+256,$tbl));
2067	&mov	($s3,$acc);
2068	&and	($s0,0xfff);		# s = %ebp&0xfff
2069	&and	($s1,0xfff);		# e = (%ebp+2048+256)&0xfff
2070	&and	($s3,0xfff);		# p = %esp&0xfff
2071
2072	&cmp	($s3,$s1);		# if (p>=e) %esp =- (p-e);
2073	&jb	(&label("tbl_break_out"));
2074	&sub	($s3,$s1);
2075	&sub	($acc,$s3);
2076	&jmp	(&label("tbl_ok"));
2077	&set_label("tbl_break_out",4);	# else %esp -= (p-s)&0xfff + framesz;
2078	&sub	($s3,$s0);
2079	&and	($s3,0xfff);
2080	&add	($s3,384);
2081	&sub	($acc,$s3);
2082	&set_label("tbl_ok",4);
2083
2084	&lea	($s3,&wparam(0));	# obtain pointer to parameter block
2085	&exch	("esp",$acc);		# allocate stack frame
2086	&add	("esp",4);		# reserve for return address!
2087	&mov	($_tbl,$tbl);		# save %ebp
2088	&mov	($_esp,$acc);		# save %esp
2089
2090	&mov	($s0,&DWP(0,$s3));	# load inp
2091	&mov	($s1,&DWP(4,$s3));	# load out
2092	#&mov	($s2,&DWP(8,$s3));	# load len
2093	&mov	($key,&DWP(12,$s3));	# load key
2094	&mov	($acc,&DWP(16,$s3));	# load ivp
2095	&mov	($s3,&DWP(20,$s3));	# load enc flag
2096
2097	&mov	($_inp,$s0);		# save copy of inp
2098	&mov	($_out,$s1);		# save copy of out
2099	&mov	($_len,$s2);		# save copy of len
2100	&mov	($_key,$key);		# save copy of key
2101	&mov	($_ivp,$acc);		# save copy of ivp
2102
2103	&mov	($mark,0);		# copy of aes_key->rounds = 0;
2104	# do we copy key schedule to stack?
	# only if key schedule would alias the table modulo 4096
2105	&mov	($s1 eq "ebx" ? $s1 : "",$key);
2106	&mov	($s2 eq "ecx" ? $s2 : "",244/4);
2107	&sub	($s1,$tbl);
2108	&mov	("esi",$key);
2109	&and	($s1,0xfff);
2110	&lea	("edi",$aes_key);
2111	&cmp	($s1,2048+256);
2112	&jb	(&label("do_copy"));
2113	&cmp	($s1,4096-244);
2114	&jb	(&label("skip_copy"));
2115	&set_label("do_copy",4);
2116	&mov	($_key,"edi");
	# 0xA5F3F689 emits the bytes 89 F6 F3 A5, i.e. a two-byte nop
	# (mov esi,esi) followed by "rep movsd" — presumably for alignment;
	# TODO confirm against x86asm.pl's data_word byte order
2117	&data_word(0xA5F3F689);	# rep movsd
2118	&set_label("skip_copy");
2119
	# prefetch whole 2KB table in 128-byte strides (16 iterations)
2120	&mov	($key,16);
2121	&set_label("prefetch_tbl",4);
2122	&mov	($s0,&DWP(0,$tbl));
2123	&mov	($s1,&DWP(32,$tbl));
2124	&mov	($s2,&DWP(64,$tbl));
2125	&mov	($acc,&DWP(96,$tbl));
2126	&lea	($tbl,&DWP(128,$tbl));
2127	&sub	($key,1);
2128	&jnz	(&label("prefetch_tbl"));
2129	&sub	($tbl,2048);
2130
2131	&mov	($acc,$_inp);
2132	&mov	($key,$_ivp);
2133
2134	&cmp	($s3,0);
2135	&je	(&label("fast_decrypt"));
2136
2137#----------------------------- ENCRYPT -----------------------------#
2138	&mov	($s0,&DWP(0,$key));		# load iv
2139	&mov	($s1,&DWP(4,$key));
2140
2141	&set_label("fast_enc_loop",16);
2142	&mov	($s2,&DWP(8,$key));
2143	&mov	($s3,&DWP(12,$key));
2144
2145	&xor	($s0,&DWP(0,$acc));		# xor input data
2146	&xor	($s1,&DWP(4,$acc));
2147	&xor	($s2,&DWP(8,$acc));
2148	&xor	($s3,&DWP(12,$acc));
2149
2150	&mov	($key,$_key);			# load key
2151	&call	("_x86_AES_encrypt");
2152
2153	&mov	($acc,$_inp);			# load inp
2154	&mov	($key,$_out);			# load out
2155
2156	&mov	(&DWP(0,$key),$s0);		# save output data
2157	&mov	(&DWP(4,$key),$s1);
2158	&mov	(&DWP(8,$key),$s2);
2159	&mov	(&DWP(12,$key),$s3);
2160
2161	&lea	($acc,&DWP(16,$acc));		# advance inp
2162	&mov	($s2,$_len);			# load len
2163	&mov	($_inp,$acc);			# save inp
2164	&lea	($s3,&DWP(16,$key));		# advance out
2165	&mov	($_out,$s3);			# save out
2166	&sub	($s2,16);			# decrease len
2167	&mov	($_len,$s2);			# save len
2168	&jnz	(&label("fast_enc_loop"));
2169	&mov	($acc,$_ivp);			# load ivp
2170	&mov	($s2,&DWP(8,$key));		# restore last 2 dwords
2171	&mov	($s3,&DWP(12,$key));
2172	&mov	(&DWP(0,$acc),$s0);		# save ivec
2173	&mov	(&DWP(4,$acc),$s1);
2174	&mov	(&DWP(8,$acc),$s2);
2175	&mov	(&DWP(12,$acc),$s3);
2176
2177	&cmp	($mark,0);			# was the key schedule copied?
2178	&mov	("edi",$_key);
2179	&je	(&label("skip_ezero"));
2180	# zero copy of key schedule
2181	&mov	("ecx",240/4);
2182	&xor	("eax","eax");
2183	&align	(4);
2184	&data_word(0xABF3F689);	# rep stosd
	# NOTE(review): missing ';' after set_label below — Perl parses the
	# following &mov as the right operand of a bitwise '&', so both
	# calls still execute in order and the emitted code is correct.
2185	&set_label("skip_ezero")
2186	&mov	("esp",$_esp);
2187	&popf	();
2188	&set_label("drop_out");
2189	&function_end_A();
2190	&pushf	();			# kludge, never executed
2191
2192#----------------------------- DECRYPT -----------------------------#
2193&set_label("fast_decrypt",16);
2194
2195	&cmp	($acc,$_out);
2196	&je	(&label("fast_dec_in_place"));	# in-place processing...
2197
2198	&mov	($_tmp,$key);
2199
2200	&align	(4);
2201	&set_label("fast_dec_loop",16);
2202	&mov	($s0,&DWP(0,$acc));		# read input
2203	&mov	($s1,&DWP(4,$acc));
2204	&mov	($s2,&DWP(8,$acc));
2205	&mov	($s3,&DWP(12,$acc));
2206
2207	&mov	($key,$_key);			# load key
2208	&call	("_x86_AES_decrypt");
2209
2210	&mov	($key,$_tmp);			# load ivp
2211	&mov	($acc,$_len);			# load len
2212	&xor	($s0,&DWP(0,$key));		# xor iv
2213	&xor	($s1,&DWP(4,$key));
2214	&xor	($s2,&DWP(8,$key));
2215	&xor	($s3,&DWP(12,$key));
2216
2217	&mov	($key,$_out);			# load out
2218	&mov	($acc,$_inp);			# load inp
2219
2220	&mov	(&DWP(0,$key),$s0);		# write output
2221	&mov	(&DWP(4,$key),$s1);
2222	&mov	(&DWP(8,$key),$s2);
2223	&mov	(&DWP(12,$key),$s3);
2224
2225	&mov	($s2,$_len);			# load len
2226	&mov	($_tmp,$acc);			# save ivp
2227	&lea	($acc,&DWP(16,$acc));		# advance inp
2228	&mov	($_inp,$acc);			# save inp
2229	&lea	($key,&DWP(16,$key));		# advance out
2230	&mov	($_out,$key);			# save out
2231	&sub	($s2,16);			# decrease len
2232	&mov	($_len,$s2);			# save len
2233	&jnz	(&label("fast_dec_loop"));
2234	&mov	($key,$_tmp);			# load temp ivp
2235	&mov	($acc,$_ivp);			# load user ivp
2236	&mov	($s0,&DWP(0,$key));		# load iv
2237	&mov	($s1,&DWP(4,$key));
2238	&mov	($s2,&DWP(8,$key));
2239	&mov	($s3,&DWP(12,$key));
2240	&mov	(&DWP(0,$acc),$s0);		# copy back to user
2241	&mov	(&DWP(4,$acc),$s1);
2242	&mov	(&DWP(8,$acc),$s2);
2243	&mov	(&DWP(12,$acc),$s3);
2244	&jmp	(&label("fast_dec_out"));
2245
	# in-place decrypt must stash the ciphertext block in $ivec before
	# overwriting it, since it becomes the next block's IV
2246	&set_label("fast_dec_in_place",16);
2247	&set_label("fast_dec_in_place_loop");
2248	&mov	($s0,&DWP(0,$acc));		# read input
2249	&mov	($s1,&DWP(4,$acc));
2250	&mov	($s2,&DWP(8,$acc));
2251	&mov	($s3,&DWP(12,$acc));
2252
2253	&lea	($key,$ivec);
2254	&mov	(&DWP(0,$key),$s0);		# copy to temp
2255	&mov	(&DWP(4,$key),$s1);
2256	&mov	(&DWP(8,$key),$s2);
2257	&mov	(&DWP(12,$key),$s3);
2258
2259	&mov	($key,$_key);			# load key
2260	&call	("_x86_AES_decrypt");
2261
2262	&mov	($key,$_ivp);			# load ivp
2263	&mov	($acc,$_out);			# load out
2264	&xor	($s0,&DWP(0,$key));		# xor iv
2265	&xor	($s1,&DWP(4,$key));
2266	&xor	($s2,&DWP(8,$key));
2267	&xor	($s3,&DWP(12,$key));
2268
2269	&mov	(&DWP(0,$acc),$s0);		# write output
2270	&mov	(&DWP(4,$acc),$s1);
2271	&mov	(&DWP(8,$acc),$s2);
2272	&mov	(&DWP(12,$acc),$s3);
2273
2274	&lea	($acc,&DWP(16,$acc));		# advance out
2275	&mov	($_out,$acc);			# save out
2276
2277	&lea	($acc,$ivec);
2278	&mov	($s0,&DWP(0,$acc));		# read temp
2279	&mov	($s1,&DWP(4,$acc));
2280	&mov	($s2,&DWP(8,$acc));
2281	&mov	($s3,&DWP(12,$acc));
2282
2283	&mov	(&DWP(0,$key),$s0);		# copy iv
2284	&mov	(&DWP(4,$key),$s1);
2285	&mov	(&DWP(8,$key),$s2);
2286	&mov	(&DWP(12,$key),$s3);
2287
2288	&mov	($acc,$_inp);			# load inp
2289	&mov	($s2,$_len);			# load len
2290	&lea	($acc,&DWP(16,$acc));		# advance inp
2291	&mov	($_inp,$acc);			# save inp
2292	&sub	($s2,16);			# decrease len
2293	&mov	($_len,$s2);			# save len
2294	&jnz	(&label("fast_dec_in_place_loop"));
2295
2296	&set_label("fast_dec_out",4);
2297	&cmp	($mark,0);			# was the key schedule copied?
2298	&mov	("edi",$_key);
2299	&je	(&label("skip_dzero"));
2300	# zero copy of key schedule
2301	&mov	("ecx",240/4);
2302	&xor	("eax","eax");
2303	&align	(4);
2304	&data_word(0xABF3F689);	# rep stosd
	# NOTE(review): same missing-';' quirk as skip_ezero above;
	# harmless, both calls still run.
2305	&set_label("skip_dzero")
2306	&mov	("esp",$_esp);
2307	&popf	();
2308	&function_end_A();
2309	&pushf	();			# kludge, never executed
2310
2311#--------------------------- SLOW ROUTINE ---------------------------#
2312&set_label("slow_way",16);
2313
2314	&mov	($s0,&DWP(0,$s0))	if (!$x86only);# load OPENSSL_ia32cap
2315	&mov	($key,&wparam(3));	# load key
2316
2317	# pre-allocate aligned stack frame...
2318	&lea	($acc,&DWP(-80,"esp"));
2319	&and	($acc,-64);
2320
2321	# ... and make sure it doesn't alias with $key modulo 1024
2322	&lea	($s1,&DWP(-80-63,$key));
2323	&sub	($s1,$acc);
2324	&neg	($s1);
2325	&and	($s1,0x3C0);	# modulo 1024, but aligned to cache-line
2326	&sub	($acc,$s1);
2327
2328	# pick S-box copy which can't overlap with stack frame or $key
2329	&lea	($s1,&DWP(768,$acc));
2330	&sub	($s1,$tbl);
2331	&and	($s1,0x300);
2332	&lea	($tbl,&DWP(2048+128,$tbl,$s1));
2333
2334	&lea	($s3,&wparam(0));	# pointer to parameter block
2335
2336	&exch	("esp",$acc);
2337	&add	("esp",4);		# reserve for return address!
2338	&mov	($_tbl,$tbl);		# save %ebp
2339	&mov	($_esp,$acc);		# save %esp
2340	&mov	($_tmp,$s0);		# save OPENSSL_ia32cap
2341
2342	&mov	($s0,&DWP(0,$s3));	# load inp
2343	&mov	($s1,&DWP(4,$s3));	# load out
2344	#&mov	($s2,&DWP(8,$s3));	# load len
2345	#&mov	($key,&DWP(12,$s3));	# load key
2346	&mov	($acc,&DWP(16,$s3));	# load ivp
2347	&mov	($s3,&DWP(20,$s3));	# load enc flag
2348
2349	&mov	($_inp,$s0);		# save copy of inp
2350	&mov	($_out,$s1);		# save copy of out
2351	&mov	($_len,$s2);		# save copy of len
2352	&mov	($_key,$key);		# save copy of key
2353	&mov	($_ivp,$acc);		# save copy of ivp
2354
2355	&mov	($key,$acc);
2356	&mov	($acc,$s0);
2357
2358	&cmp	($s3,0);
2359	&je	(&label("slow_decrypt"));
2360
2361#--------------------------- SLOW ENCRYPT ---------------------------#
2362	&cmp	($s2,16);
2363	&mov	($s3,$s1);
2364	&jb	(&label("slow_enc_tail"));
2365
2366	if (!$x86only) {
2367	&bt	($_tmp,25);		# check for SSE bit
2368	&jnc	(&label("slow_enc_x86"));
2369
2370	&movq	("mm0",&QWP(0,$key));	# load iv
2371	&movq	("mm4",&QWP(8,$key));
2372
2373	&set_label("slow_enc_loop_sse",16);
2374	&pxor	("mm0",&QWP(0,$acc));	# xor input data
2375	&pxor	("mm4",&QWP(8,$acc));
2376
2377	&mov	($key,$_key);
2378	&call	("_sse_AES_encrypt_compact");
2379
2380	&mov	($acc,$_inp);		# load inp
2381	&mov	($key,$_out);		# load out
2382	&mov	($s2,$_len);		# load len
2383
2384	&movq	(&QWP(0,$key),"mm0");	# save output data
2385	&movq	(&QWP(8,$key),"mm4");
2386
2387	&lea	($acc,&DWP(16,$acc));	# advance inp
2388	&mov	($_inp,$acc);		# save inp
2389	&lea	($s3,&DWP(16,$key));	# advance out
2390	&mov	($_out,$s3);		# save out
2391	&sub	($s2,16);		# decrease len
2392	&cmp	($s2,16);
2393	&mov	($_len,$s2);		# save len
2394	&jae	(&label("slow_enc_loop_sse"));
2395	&test	($s2,15);
2396	&jnz	(&label("slow_enc_tail"));
2397	&mov	($acc,$_ivp);		# load ivp
2398	&movq	(&QWP(0,$acc),"mm0");	# save ivec
2399	&movq	(&QWP(8,$acc),"mm4");
2400	&emms	();
2401	&mov	("esp",$_esp);
2402	&popf	();
2403	&function_end_A();
2404	&pushf	();			# kludge, never executed
2405	}
2406	&set_label("slow_enc_x86",16);
2407	&mov	($s0,&DWP(0,$key));	# load iv
2408	&mov	($s1,&DWP(4,$key));
2409
2410	&set_label("slow_enc_loop_x86",4);
2411	&mov	($s2,&DWP(8,$key));
2412	&mov	($s3,&DWP(12,$key));
2413
2414	&xor	($s0,&DWP(0,$acc));	# xor input data
2415	&xor	($s1,&DWP(4,$acc));
2416	&xor	($s2,&DWP(8,$acc));
2417	&xor	($s3,&DWP(12,$acc));
2418
2419	&mov	($key,$_key);		# load key
2420	&call	("_x86_AES_encrypt_compact");
2421
2422	&mov	($acc,$_inp);		# load inp
2423	&mov	($key,$_out);		# load out
2424
2425	&mov	(&DWP(0,$key),$s0);	# save output data
2426	&mov	(&DWP(4,$key),$s1);
2427	&mov	(&DWP(8,$key),$s2);
2428	&mov	(&DWP(12,$key),$s3);
2429
2430	&mov	($s2,$_len);		# load len
2431	&lea	($acc,&DWP(16,$acc));	# advance inp
2432	&mov	($_inp,$acc);		# save inp
2433	&lea	($s3,&DWP(16,$key));	# advance out
2434	&mov	($_out,$s3);		# save out
2435	&sub	($s2,16);		# decrease len
2436	&cmp	($s2,16);
2437	&mov	($_len,$s2);		# save len
2438	&jae	(&label("slow_enc_loop_x86"));
2439	&test	($s2,15);
2440	&jnz	(&label("slow_enc_tail"));
2441	&mov	($acc,$_ivp);		# load ivp
2442	&mov	($s2,&DWP(8,$key));	# restore last dwords
2443	&mov	($s3,&DWP(12,$key));
2444	&mov	(&DWP(0,$acc),$s0);	# save ivec
2445	&mov	(&DWP(4,$acc),$s1);
2446	&mov	(&DWP(8,$acc),$s2);
2447	&mov	(&DWP(12,$acc),$s3);
2448
2449	&mov	("esp",$_esp);
2450	&popf	();
2451	&function_end_A();
2452	&pushf	();			# kludge, never executed
2453
	# partial last block: copy it to the output buffer, zero-pad to 16
	# bytes, then loop back through the x86 path exactly once more
2454	&set_label("slow_enc_tail",16);
2455	&emms	()	if (!$x86only);
2456	&mov	($key eq "edi"? $key:"",$s3);	# load out to edi
2457	&mov	($s1,16);
2458	&sub	($s1,$s2);
2459	&cmp	($key,$acc eq "esi"? $acc:"");	# compare with inp
2460	&je	(&label("enc_in_place"));
2461	&align	(4);
2462	&data_word(0xA4F3F689);	# rep movsb	# copy input
2463	&jmp	(&label("enc_skip_in_place"));
2464	&set_label("enc_in_place");
2465	&lea	($key,&DWP(0,$key,$s2));
2466	&set_label("enc_skip_in_place");
2467	&mov	($s2,$s1);
2468	&xor	($s0,$s0);
2469	&align	(4);
2470	&data_word(0xAAF3F689);	# rep stosb	# zero tail
2471
2472	&mov	($key,$_ivp);			# restore ivp
2473	&mov	($acc,$s3);			# output as input
2474	&mov	($s0,&DWP(0,$key));
2475	&mov	($s1,&DWP(4,$key));
2476	&mov	($_len,16);			# len=16
2477	&jmp	(&label("slow_enc_loop_x86"));	# one more spin...
2478
2479#--------------------------- SLOW DECRYPT ---------------------------#
2480&set_label("slow_decrypt",16);
2481	if (!$x86only) {
2482	&bt	($_tmp,25);		# check for SSE bit
2483	&jnc	(&label("slow_dec_loop_x86"));
2484
2485	&set_label("slow_dec_loop_sse",4);
2486	&movq	("mm0",&QWP(0,$acc));	# read input
2487	&movq	("mm4",&QWP(8,$acc));
2488
2489	&mov	($key,$_key);
2490	&call	("_sse_AES_decrypt_compact");
2491
2492	&mov	($acc,$_inp);		# load inp
2493	&lea	($s0,$ivec);
2494	&mov	($s1,$_out);		# load out
2495	&mov	($s2,$_len);		# load len
2496	&mov	($key,$_ivp);		# load ivp
2497
2498	&movq	("mm1",&QWP(0,$acc));	# re-read input
2499	&movq	("mm5",&QWP(8,$acc));
2500
2501	&pxor	("mm0",&QWP(0,$key));	# xor iv
2502	&pxor	("mm4",&QWP(8,$key));
2503
2504	&movq	(&QWP(0,$key),"mm1");	# copy input to iv
2505	&movq	(&QWP(8,$key),"mm5");
2506
2507	&sub	($s2,16);		# decrease len
2508	&jc	(&label("slow_dec_partial_sse"));
2509
2510	&movq	(&QWP(0,$s1),"mm0");	# write output
2511	&movq	(&QWP(8,$s1),"mm4");
2512
2513	&lea	($s1,&DWP(16,$s1));	# advance out
2514	&mov	($_out,$s1);		# save out
2515	&lea	($acc,&DWP(16,$acc));	# advance inp
2516	&mov	($_inp,$acc);		# save inp
2517	&mov	($_len,$s2);		# save len
2518	&jnz	(&label("slow_dec_loop_sse"));
2519	&emms	();
2520	&mov	("esp",$_esp);
2521	&popf	();
2522	&function_end_A();
2523	&pushf	();			# kludge, never executed
2524
2525	&set_label("slow_dec_partial_sse",16);
2526	&movq	(&QWP(0,$s0),"mm0");	# save output to temp
2527	&movq	(&QWP(8,$s0),"mm4");
2528	&emms	();
2529
2530	&add	($s2 eq "ecx" ? "ecx":"",16);
2531	&mov	("edi",$s1);		# out
2532	&mov	("esi",$s0);		# temp
2533	&align	(4);
2534	&data_word(0xA4F3F689);	# rep movsb	# copy partial output
2535
2536	&mov	("esp",$_esp);
2537	&popf	();
2538	&function_end_A();
2539	&pushf	();			# kludge, never executed
2540	}
2541	&set_label("slow_dec_loop_x86",16);
2542	&mov	($s0,&DWP(0,$acc));	# read input
2543	&mov	($s1,&DWP(4,$acc));
2544	&mov	($s2,&DWP(8,$acc));
2545	&mov	($s3,&DWP(12,$acc));
2546
2547	&lea	($key,$ivec);
2548	&mov	(&DWP(0,$key),$s0);	# copy to temp
2549	&mov	(&DWP(4,$key),$s1);
2550	&mov	(&DWP(8,$key),$s2);
2551	&mov	(&DWP(12,$key),$s3);
2552
2553	&mov	($key,$_key);		# load key
2554	&call	("_x86_AES_decrypt_compact");
2555
2556	&mov	($key,$_ivp);		# load ivp
2557	&mov	($acc,$_len);		# load len
2558	&xor	($s0,&DWP(0,$key));	# xor iv
2559	&xor	($s1,&DWP(4,$key));
2560	&xor	($s2,&DWP(8,$key));
2561	&xor	($s3,&DWP(12,$key));
2562
2563	&sub	($acc,16);
2564	&jc	(&label("slow_dec_partial_x86"));
2565
2566	&mov	($_len,$acc);		# save len
2567	&mov	($acc,$_out);		# load out
2568
2569	&mov	(&DWP(0,$acc),$s0);	# write output
2570	&mov	(&DWP(4,$acc),$s1);
2571	&mov	(&DWP(8,$acc),$s2);
2572	&mov	(&DWP(12,$acc),$s3);
2573
2574	&lea	($acc,&DWP(16,$acc));	# advance out
2575	&mov	($_out,$acc);		# save out
2576
2577	&lea	($acc,$ivec);
2578	&mov	($s0,&DWP(0,$acc));	# read temp
2579	&mov	($s1,&DWP(4,$acc));
2580	&mov	($s2,&DWP(8,$acc));
2581	&mov	($s3,&DWP(12,$acc));
2582
2583	&mov	(&DWP(0,$key),$s0);	# copy it to iv
2584	&mov	(&DWP(4,$key),$s1);
2585	&mov	(&DWP(8,$key),$s2);
2586	&mov	(&DWP(12,$key),$s3);
2587
2588	&mov	($acc,$_inp);		# load inp
2589	&lea	($acc,&DWP(16,$acc));	# advance inp
2590	&mov	($_inp,$acc);		# save inp
2591	&jnz	(&label("slow_dec_loop_x86"));
2592	&mov	("esp",$_esp);
2593	&popf	();
2594	&function_end_A();
2595	&pushf	();			# kludge, never executed
2596
2597	&set_label("slow_dec_partial_x86",16);
2598	&lea	($acc,$ivec);
2599	&mov	(&DWP(0,$acc),$s0);	# save output to temp
2600	&mov	(&DWP(4,$acc),$s1);
2601	&mov	(&DWP(8,$acc),$s2);
2602	&mov	(&DWP(12,$acc),$s3);
2603
2604	&mov	($acc,$_inp);
2605	&mov	($s0,&DWP(0,$acc));	# re-read input
2606	&mov	($s1,&DWP(4,$acc));
2607	&mov	($s2,&DWP(8,$acc));
2608	&mov	($s3,&DWP(12,$acc));
2609
2610	&mov	(&DWP(0,$key),$s0);	# copy it to iv
2611	&mov	(&DWP(4,$key),$s1);
2612	&mov	(&DWP(8,$key),$s2);
2613	&mov	(&DWP(12,$key),$s3);
2614
2615	&mov	("ecx",$_len);
2616	&mov	("edi",$_out);
2617	&lea	("esi",$ivec);
2618	&align	(4);
2619	&data_word(0xA4F3F689);	# rep movsb	# copy partial output
2620
2621	&mov	("esp",$_esp);
2622	&popf	();
2623&function_end("AES_cbc_encrypt");
2624}
2625
2626#------------------------------------------------------------------#
2627
2628sub enckey()
# Emits one round of the AES key-expansion core.  On entry (by
# convention of the callers below) edx holds the previous round-key
# word, ecx the round counter, and $tbl points 128 bytes past a Te4
# (S-box) copy.  The emitted code builds in eax:
#     SubWord(RotWord(edx)) ^ rcon[ecx]
# by S-boxing each byte of edx and placing it at the rotated position
# (byte 0 -> bits 31..24, byte 1 -> bits 7..0, etc.), then xoring the
# round constant from the rcon table at $tbl+1024-128.
# Clobbers esi and ebx; result accumulates into eax.
2629{
2630	&movz	("esi",&LB("edx"));		# rk[i]>>0
2631	&movz	("ebx",&BP(-128,$tbl,"esi",1));
2632	&movz	("esi",&HB("edx"));		# rk[i]>>8
2633	&shl	("ebx",24);			# rotate low byte into the top: RotWord
2634	&xor	("eax","ebx");
2635
2636	&movz	("ebx",&BP(-128,$tbl,"esi",1));
2637	&shr	("edx",16);
2638	&movz	("esi",&LB("edx"));		# rk[i]>>16
2639	&xor	("eax","ebx");			# S(byte1) lands in bits 7..0
2640
2641	&movz	("ebx",&BP(-128,$tbl,"esi",1));
2642	&movz	("esi",&HB("edx"));		# rk[i]>>24
2643	&shl	("ebx",8);
2644	&xor	("eax","ebx");
2645
2646	&movz	("ebx",&BP(-128,$tbl,"esi",1));
2647	&shl	("ebx",16);
2648	&xor	("eax","ebx");
2649
2650	&xor	("eax",&DWP(1024-128,$tbl,"ecx",4));	# rcon
2651}
2652
# Internal key-schedule routine shared by the public set-key entry
# points.  wparam(1) = user key, wparam(2) = bits (128/192/256),
# wparam(3) = output AES_KEY.  Returns 0 in eax on success, -1 for a
# NULL pointer, -2 for an unsupported key size.  The three expansion
# loops differ only in stride (4/6/8 words per iteration); each ends by
# storing the round count at the current edi offset, which after the
# loop's edi advances is exactly key->rounds.
2653&function_begin("_x86_AES_set_encrypt_key");
2654	&mov	("esi",&wparam(1));		# user supplied key
2655	&mov	("edi",&wparam(3));		# private key schedule
2656
	# "test reg,-1" sets ZF iff reg is zero: NULL-pointer checks
2657	&test	("esi",-1);
2658	&jz	(&label("badpointer"));
2659	&test	("edi",-1);
2660	&jz	(&label("badpointer"));
2661
2662	&call	(&label("pic_point"));
2663	&set_label("pic_point");
2664	&blindpop($tbl);
2665	&lea	($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
2666	&lea	($tbl,&DWP(2048+128,$tbl));
2667
2668	# prefetch Te4
2669	&mov	("eax",&DWP(0-128,$tbl));
2670	&mov	("ebx",&DWP(32-128,$tbl));
2671	&mov	("ecx",&DWP(64-128,$tbl));
2672	&mov	("edx",&DWP(96-128,$tbl));
2673	&mov	("eax",&DWP(128-128,$tbl));
2674	&mov	("ebx",&DWP(160-128,$tbl));
2675	&mov	("ecx",&DWP(192-128,$tbl));
2676	&mov	("edx",&DWP(224-128,$tbl));
2677
2678	&mov	("ecx",&wparam(2));		# number of bits in key
2679	&cmp	("ecx",128);
2680	&je	(&label("10rounds"));
2681	&cmp	("ecx",192);
2682	&je	(&label("12rounds"));
2683	&cmp	("ecx",256);
2684	&je	(&label("14rounds"));
2685	&mov	("eax",-2);			# invalid number of bits
2686	&jmp	(&label("exit"));
2687
	# AES-128: 4-word key, 10 expansion iterations
2688	&set_label("10rounds");
2689	&mov	("eax",&DWP(0,"esi"));		# copy first 4 dwords
2690	&mov	("ebx",&DWP(4,"esi"));
2691	&mov	("ecx",&DWP(8,"esi"));
2692	&mov	("edx",&DWP(12,"esi"));
2693	&mov	(&DWP(0,"edi"),"eax");
2694	&mov	(&DWP(4,"edi"),"ebx");
2695	&mov	(&DWP(8,"edi"),"ecx");
2696	&mov	(&DWP(12,"edi"),"edx");
2697
2698	&xor	("ecx","ecx");
2699	&jmp	(&label("10shortcut"));
2700
2701	&align	(4);
2702	&set_label("10loop");
2703	&mov	("eax",&DWP(0,"edi"));		# rk[0]
2704	&mov	("edx",&DWP(12,"edi"));		# rk[3]
2705	&set_label("10shortcut");
2706	&enckey	();
2707
2708	&mov	(&DWP(16,"edi"),"eax");		# rk[4]
2709	&xor	("eax",&DWP(4,"edi"));
2710	&mov	(&DWP(20,"edi"),"eax");		# rk[5]
2711	&xor	("eax",&DWP(8,"edi"));
2712	&mov	(&DWP(24,"edi"),"eax");		# rk[6]
2713	&xor	("eax",&DWP(12,"edi"));
2714	&mov	(&DWP(28,"edi"),"eax");		# rk[7]
2715	&inc	("ecx");
2716	&add	("edi",16);
2717	&cmp	("ecx",10);
2718	&jl	(&label("10loop"));
2719
	# edi has advanced 10*16; DWP(80,edi) is key->rounds (offset 240)
2720	&mov	(&DWP(80,"edi"),10);		# setup number of rounds
2721	&xor	("eax","eax");
2722	&jmp	(&label("exit"));
2723
	# AES-192: 6-word key, 8 iterations (last one truncated)
2724	&set_label("12rounds");
2725	&mov	("eax",&DWP(0,"esi"));		# copy first 6 dwords
2726	&mov	("ebx",&DWP(4,"esi"));
2727	&mov	("ecx",&DWP(8,"esi"));
2728	&mov	("edx",&DWP(12,"esi"));
2729	&mov	(&DWP(0,"edi"),"eax");
2730	&mov	(&DWP(4,"edi"),"ebx");
2731	&mov	(&DWP(8,"edi"),"ecx");
2732	&mov	(&DWP(12,"edi"),"edx");
2733	&mov	("ecx",&DWP(16,"esi"));
2734	&mov	("edx",&DWP(20,"esi"));
2735	&mov	(&DWP(16,"edi"),"ecx");
2736	&mov	(&DWP(20,"edi"),"edx");
2737
2738	&xor	("ecx","ecx");
2739	&jmp	(&label("12shortcut"));
2740
2741	&align	(4);
2742	&set_label("12loop");
2743	&mov	("eax",&DWP(0,"edi"));		# rk[0]
2744	&mov	("edx",&DWP(20,"edi"));		# rk[5]
2745	&set_label("12shortcut");
2746	&enckey	();
2747
2748	&mov	(&DWP(24,"edi"),"eax");		# rk[6]
2749	&xor	("eax",&DWP(4,"edi"));
2750	&mov	(&DWP(28,"edi"),"eax");		# rk[7]
2751	&xor	("eax",&DWP(8,"edi"));
2752	&mov	(&DWP(32,"edi"),"eax");		# rk[8]
2753	&xor	("eax",&DWP(12,"edi"));
2754	&mov	(&DWP(36,"edi"),"eax");		# rk[9]
2755
2756	&cmp	("ecx",7);
2757	&je	(&label("12break"));
2758	&inc	("ecx");
2759
2760	&xor	("eax",&DWP(16,"edi"));
2761	&mov	(&DWP(40,"edi"),"eax");		# rk[10]
2762	&xor	("eax",&DWP(20,"edi"));
2763	&mov	(&DWP(44,"edi"),"eax");		# rk[11]
2764
2765	&add	("edi",24);
2766	&jmp	(&label("12loop"));
2767
2768	&set_label("12break");
2769	&mov	(&DWP(72,"edi"),12);		# setup number of rounds
2770	&xor	("eax","eax");
2771	&jmp	(&label("exit"));
2772
	# AES-256: 8-word key; each iteration also derives the middle four
	# words with a plain SubWord (no RotWord/rcon), per FIPS-197
2773	&set_label("14rounds");
2774	&mov	("eax",&DWP(0,"esi"));		# copy first 8 dwords
2775	&mov	("ebx",&DWP(4,"esi"));
2776	&mov	("ecx",&DWP(8,"esi"));
2777	&mov	("edx",&DWP(12,"esi"));
2778	&mov	(&DWP(0,"edi"),"eax");
2779	&mov	(&DWP(4,"edi"),"ebx");
2780	&mov	(&DWP(8,"edi"),"ecx");
2781	&mov	(&DWP(12,"edi"),"edx");
2782	&mov	("eax",&DWP(16,"esi"));
2783	&mov	("ebx",&DWP(20,"esi"));
2784	&mov	("ecx",&DWP(24,"esi"));
2785	&mov	("edx",&DWP(28,"esi"));
2786	&mov	(&DWP(16,"edi"),"eax");
2787	&mov	(&DWP(20,"edi"),"ebx");
2788	&mov	(&DWP(24,"edi"),"ecx");
2789	&mov	(&DWP(28,"edi"),"edx");
2790
2791	&xor	("ecx","ecx");
2792	&jmp	(&label("14shortcut"));
2793
2794	&align	(4);
2795	&set_label("14loop");
2796	&mov	("edx",&DWP(28,"edi"));		# rk[7]
2797	&set_label("14shortcut");
2798	&mov	("eax",&DWP(0,"edi"));		# rk[0]
2799
2800	&enckey	();
2801
2802	&mov	(&DWP(32,"edi"),"eax");		# rk[8]
2803	&xor	("eax",&DWP(4,"edi"));
2804	&mov	(&DWP(36,"edi"),"eax");		# rk[9]
2805	&xor	("eax",&DWP(8,"edi"));
2806	&mov	(&DWP(40,"edi"),"eax");		# rk[10]
2807	&xor	("eax",&DWP(12,"edi"));
2808	&mov	(&DWP(44,"edi"),"eax");		# rk[11]
2809
2810	&cmp	("ecx",6);
2811	&je	(&label("14break"));
2812	&inc	("ecx");
2813
	# SubWord(rk[11]) without rotation: bytes keep their positions
2814	&mov	("edx","eax");
2815	&mov	("eax",&DWP(16,"edi"));		# rk[4]
2816	&movz	("esi",&LB("edx"));		# rk[11]>>0
2817	&movz	("ebx",&BP(-128,$tbl,"esi",1));
2818	&movz	("esi",&HB("edx"));		# rk[11]>>8
2819	&xor	("eax","ebx");
2820
2821	&movz	("ebx",&BP(-128,$tbl,"esi",1));
2822	&shr	("edx",16);
2823	&shl	("ebx",8);
2824	&movz	("esi",&LB("edx"));		# rk[11]>>16
2825	&xor	("eax","ebx");
2826
2827	&movz	("ebx",&BP(-128,$tbl,"esi",1));
2828	&movz	("esi",&HB("edx"));		# rk[11]>>24
2829	&shl	("ebx",16);
2830	&xor	("eax","ebx");
2831
2832	&movz	("ebx",&BP(-128,$tbl,"esi",1));
2833	&shl	("ebx",24);
2834	&xor	("eax","ebx");
2835
2836	&mov	(&DWP(48,"edi"),"eax");		# rk[12]
2837	&xor	("eax",&DWP(20,"edi"));
2838	&mov	(&DWP(52,"edi"),"eax");		# rk[13]
2839	&xor	("eax",&DWP(24,"edi"));
2840	&mov	(&DWP(56,"edi"),"eax");		# rk[14]
2841	&xor	("eax",&DWP(28,"edi"));
2842	&mov	(&DWP(60,"edi"),"eax");		# rk[15]
2843
2844	&add	("edi",32);
2845	&jmp	(&label("14loop"));
2846
2847	&set_label("14break");
2848	&mov	(&DWP(48,"edi"),14);		# setup number of rounds
2849	&xor	("eax","eax");
2850	&jmp	(&label("exit"));
2851
2852	&set_label("badpointer");
2853	&mov	("eax",-1);
2854	&set_label("exit");
2855&function_end("_x86_AES_set_encrypt_key");
2856
2857# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
2858# AES_KEY *key)
2859&function_begin_B("private_AES_set_encrypt_key");
2860 &call ("_x86_AES_set_encrypt_key");
2861 &ret ();
2862&function_end_B("private_AES_set_encrypt_key");
2863
2864sub deckey()
2865{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
2866 my $tmp = $tbl;
2867
2868 &mov ($acc,$tp1);
2869 &and ($acc,0x80808080);
2870 &mov ($tmp,$acc);
2871 &shr ($tmp,7);
2872 &lea ($tp2,&DWP(0,$tp1,$tp1));
2873 &sub ($acc,$tmp);
2874 &and ($tp2,0xfefefefe);
2875 &and ($acc,0x1b1b1b1b);
2876 &xor ($acc,$tp2);
2877 &mov ($tp2,$acc);
2878
2879 &and ($acc,0x80808080);
2880 &mov ($tmp,$acc);
2881 &shr ($tmp,7);
2882 &lea ($tp4,&DWP(0,$tp2,$tp2));
2883 &sub ($acc,$tmp);
2884 &and ($tp4,0xfefefefe);
2885 &and ($acc,0x1b1b1b1b);
2886 &xor ($tp2,$tp1); # tp2^tp1
2887 &xor ($acc,$tp4);
2888 &mov ($tp4,$acc);
2889
2890 &and ($acc,0x80808080);
2891 &mov ($tmp,$acc);
2892 &shr ($tmp,7);
2893 &lea ($tp8,&DWP(0,$tp4,$tp4));
2894 &xor ($tp4,$tp1); # tp4^tp1
2895 &sub ($acc,$tmp);
2896 &and ($tp8,0xfefefefe);
2897 &and ($acc,0x1b1b1b1b);
2898 &rotl ($tp1,8); # = ROTATE(tp1,8)
2899 &xor ($tp8,$acc);
2900
2901 &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load
2902
2903 &xor ($tp1,$tp2);
2904 &xor ($tp2,$tp8);
2905 &xor ($tp1,$tp4);
2906 &rotl ($tp2,24);
2907 &xor ($tp4,$tp8);
2908 &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
2909 &rotl ($tp4,16);
2910 &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
2911 &rotl ($tp8,8);
2912 &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
2913 &mov ($tp2,$tmp);
2914 &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
2915
2916 &mov (&DWP(4*$i,$key),$tp1);
2917}
2918
2919# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
2920# AES_KEY *key)
2921&function_begin_B("private_AES_set_decrypt_key");
2922 &call ("_x86_AES_set_encrypt_key");
2923 &cmp ("eax",0);
2924 &je (&label("proceed"));
2925 &ret ();
2926
2927 &set_label("proceed");
2928 &push ("ebp");
2929 &push ("ebx");
2930 &push ("esi");
2931 &push ("edi");
2932
2933 &mov ("esi",&wparam(2));
2934 &mov ("ecx",&DWP(240,"esi")); # pull number of rounds
2935 &lea ("ecx",&DWP(0,"","ecx",4));
2936 &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk
2937
2938 &set_label("invert",4); # invert order of chunks
2939 &mov ("eax",&DWP(0,"esi"));
2940 &mov ("ebx",&DWP(4,"esi"));
2941 &mov ("ecx",&DWP(0,"edi"));
2942 &mov ("edx",&DWP(4,"edi"));
2943 &mov (&DWP(0,"edi"),"eax");
2944 &mov (&DWP(4,"edi"),"ebx");
2945 &mov (&DWP(0,"esi"),"ecx");
2946 &mov (&DWP(4,"esi"),"edx");
2947 &mov ("eax",&DWP(8,"esi"));
2948 &mov ("ebx",&DWP(12,"esi"));
2949 &mov ("ecx",&DWP(8,"edi"));
2950 &mov ("edx",&DWP(12,"edi"));
2951 &mov (&DWP(8,"edi"),"eax");
2952 &mov (&DWP(12,"edi"),"ebx");
2953 &mov (&DWP(8,"esi"),"ecx");
2954 &mov (&DWP(12,"esi"),"edx");
2955 &add ("esi",16);
2956 &sub ("edi",16);
2957 &cmp ("esi","edi");
2958 &jne (&label("invert"));
2959
2960 &mov ($key,&wparam(2));
2961 &mov ($acc,&DWP(240,$key)); # pull number of rounds
2962 &lea ($acc,&DWP(-2,$acc,$acc));
2963 &lea ($acc,&DWP(0,$key,$acc,8));
2964 &mov (&wparam(2),$acc);
2965
2966 &mov ($s0,&DWP(16,$key)); # modulo-scheduled load
2967 &set_label("permute",4); # permute the key schedule
2968 &add ($key,16);
2969 &deckey (0,$key,$s0,$s1,$s2,$s3);
2970 &deckey (1,$key,$s1,$s2,$s3,$s0);
2971 &deckey (2,$key,$s2,$s3,$s0,$s1);
2972 &deckey (3,$key,$s3,$s0,$s1,$s2);
2973 &cmp ($key,&wparam(2));
2974 &jb (&label("permute"));
2975
2976 &xor ("eax","eax"); # return success
2977&function_end("private_AES_set_decrypt_key");
2978&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
2979
2980&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl
deleted file mode 100644
index 86b86c4a0f..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-armv4.pl
+++ /dev/null
@@ -1,1134 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for ARMv4
11
12# January 2007.
13#
14# Code uses single 1K S-box and is >2 times faster than code generated
15# by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which
16# allows to merge logical or arithmetic operation with shift or rotate
17# in one instruction and emit combined result every cycle. The module
18# is endian-neutral. The performance is ~42 cycles/byte for 128-bit
19# key [on single-issue Xscale PXA250 core].
20
21# May 2007.
22#
23# AES_set_[en|de]crypt_key is added.
24
25# July 2010.
26#
27# Rescheduling for dual-issue pipeline resulted in 12% improvement on
28# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
29
30# February 2011.
31#
32# Profiler-assisted and platform-specific optimization resulted in 16%
33# improvement on Cortex A8 core and ~21.5 cycles per byte.
34
35while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
36open STDOUT,">$output";
37
38$s0="r0";
39$s1="r1";
40$s2="r2";
41$s3="r3";
42$t1="r4";
43$t2="r5";
44$t3="r6";
45$i1="r7";
46$i2="r8";
47$i3="r9";
48
49$tbl="r10";
50$key="r11";
51$rounds="r12";
52
53$code=<<___;
54#include "arm_arch.h"
55.text
56.code 32
57
58.type AES_Te,%object
59.align 5
60AES_Te:
61.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
62.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
63.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
64.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
65.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
66.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
67.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
68.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
69.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
70.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
71.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
72.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
73.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
74.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
75.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
76.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
77.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
78.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
79.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
80.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
81.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
82.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
83.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
84.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
85.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
86.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
87.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
88.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
89.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
90.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
91.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
92.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
93.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
94.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
95.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
96.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
97.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
98.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
99.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
100.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
101.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
102.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
103.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
104.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
105.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
106.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
107.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
108.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
109.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
110.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
111.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
112.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
113.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
114.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
115.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
116.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
117.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
118.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
119.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
120.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
121.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
122.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
123.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
124.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
125@ Te4[256]
126.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
127.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
128.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
129.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
130.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
131.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
132.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
133.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
134.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
135.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
136.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
137.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
138.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
139.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
140.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
141.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
142.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
143.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
144.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
145.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
146.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
147.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
148.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
149.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
150.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
151.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
152.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
153.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
154.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
155.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
156.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
157.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
158@ rcon[]
159.word 0x01000000, 0x02000000, 0x04000000, 0x08000000
160.word 0x10000000, 0x20000000, 0x40000000, 0x80000000
161.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
162.size AES_Te,.-AES_Te
163
164@ void AES_encrypt(const unsigned char *in, unsigned char *out,
165@ const AES_KEY *key) {
166.global AES_encrypt
167.type AES_encrypt,%function
168.align 5
169AES_encrypt:
170 sub r3,pc,#8 @ AES_encrypt
171 stmdb sp!,{r1,r4-r12,lr}
172 mov $rounds,r0 @ inp
173 mov $key,r2
174 sub $tbl,r3,#AES_encrypt-AES_Te @ Te
175#if __ARM_ARCH__<7
176 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
177 ldrb $t1,[$rounds,#2] @ manner...
178 ldrb $t2,[$rounds,#1]
179 ldrb $t3,[$rounds,#0]
180 orr $s0,$s0,$t1,lsl#8
181 ldrb $s1,[$rounds,#7]
182 orr $s0,$s0,$t2,lsl#16
183 ldrb $t1,[$rounds,#6]
184 orr $s0,$s0,$t3,lsl#24
185 ldrb $t2,[$rounds,#5]
186 ldrb $t3,[$rounds,#4]
187 orr $s1,$s1,$t1,lsl#8
188 ldrb $s2,[$rounds,#11]
189 orr $s1,$s1,$t2,lsl#16
190 ldrb $t1,[$rounds,#10]
191 orr $s1,$s1,$t3,lsl#24
192 ldrb $t2,[$rounds,#9]
193 ldrb $t3,[$rounds,#8]
194 orr $s2,$s2,$t1,lsl#8
195 ldrb $s3,[$rounds,#15]
196 orr $s2,$s2,$t2,lsl#16
197 ldrb $t1,[$rounds,#14]
198 orr $s2,$s2,$t3,lsl#24
199 ldrb $t2,[$rounds,#13]
200 ldrb $t3,[$rounds,#12]
201 orr $s3,$s3,$t1,lsl#8
202 orr $s3,$s3,$t2,lsl#16
203 orr $s3,$s3,$t3,lsl#24
204#else
205 ldr $s0,[$rounds,#0]
206 ldr $s1,[$rounds,#4]
207 ldr $s2,[$rounds,#8]
208 ldr $s3,[$rounds,#12]
209#ifdef __ARMEL__
210 rev $s0,$s0
211 rev $s1,$s1
212 rev $s2,$s2
213 rev $s3,$s3
214#endif
215#endif
216 bl _armv4_AES_encrypt
217
218 ldr $rounds,[sp],#4 @ pop out
219#if __ARM_ARCH__>=7
220#ifdef __ARMEL__
221 rev $s0,$s0
222 rev $s1,$s1
223 rev $s2,$s2
224 rev $s3,$s3
225#endif
226 str $s0,[$rounds,#0]
227 str $s1,[$rounds,#4]
228 str $s2,[$rounds,#8]
229 str $s3,[$rounds,#12]
230#else
231 mov $t1,$s0,lsr#24 @ write output in endian-neutral
232 mov $t2,$s0,lsr#16 @ manner...
233 mov $t3,$s0,lsr#8
234 strb $t1,[$rounds,#0]
235 strb $t2,[$rounds,#1]
236 mov $t1,$s1,lsr#24
237 strb $t3,[$rounds,#2]
238 mov $t2,$s1,lsr#16
239 strb $s0,[$rounds,#3]
240 mov $t3,$s1,lsr#8
241 strb $t1,[$rounds,#4]
242 strb $t2,[$rounds,#5]
243 mov $t1,$s2,lsr#24
244 strb $t3,[$rounds,#6]
245 mov $t2,$s2,lsr#16
246 strb $s1,[$rounds,#7]
247 mov $t3,$s2,lsr#8
248 strb $t1,[$rounds,#8]
249 strb $t2,[$rounds,#9]
250 mov $t1,$s3,lsr#24
251 strb $t3,[$rounds,#10]
252 mov $t2,$s3,lsr#16
253 strb $s2,[$rounds,#11]
254 mov $t3,$s3,lsr#8
255 strb $t1,[$rounds,#12]
256 strb $t2,[$rounds,#13]
257 strb $t3,[$rounds,#14]
258 strb $s3,[$rounds,#15]
259#endif
260#if __ARM_ARCH__>=5
261 ldmia sp!,{r4-r12,pc}
262#else
263 ldmia sp!,{r4-r12,lr}
264 tst lr,#1
265 moveq pc,lr @ be binary compatible with V4, yet
266 bx lr @ interoperable with Thumb ISA:-)
267#endif
268.size AES_encrypt,.-AES_encrypt
269
270.type _armv4_AES_encrypt,%function
271.align 2
272_armv4_AES_encrypt:
273 str lr,[sp,#-4]! @ push lr
274 ldmia $key!,{$t1-$i1}
275 eor $s0,$s0,$t1
276 ldr $rounds,[$key,#240-16]
277 eor $s1,$s1,$t2
278 eor $s2,$s2,$t3
279 eor $s3,$s3,$i1
280 sub $rounds,$rounds,#1
281 mov lr,#255
282
283 and $i1,lr,$s0
284 and $i2,lr,$s0,lsr#8
285 and $i3,lr,$s0,lsr#16
286 mov $s0,$s0,lsr#24
287.Lenc_loop:
288 ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0]
289 and $i1,lr,$s1,lsr#16 @ i0
290 ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
291 and $i2,lr,$s1
292 ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
293 and $i3,lr,$s1,lsr#8
294 ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
295 mov $s1,$s1,lsr#24
296
297 ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16]
298 ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0]
299 ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8]
300 eor $s0,$s0,$i1,ror#8
301 ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
302 and $i1,lr,$s2,lsr#8 @ i0
303 eor $t2,$t2,$i2,ror#8
304 and $i2,lr,$s2,lsr#16 @ i1
305 eor $t3,$t3,$i3,ror#8
306 and $i3,lr,$s2
307 ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
308 eor $s1,$s1,$t1,ror#24
309 ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
310 mov $s2,$s2,lsr#24
311
312 ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
313 eor $s0,$s0,$i1,ror#16
314 ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
315 and $i1,lr,$s3 @ i0
316 eor $s1,$s1,$i2,ror#8
317 and $i2,lr,$s3,lsr#8 @ i1
318 eor $t3,$t3,$i3,ror#16
319 and $i3,lr,$s3,lsr#16 @ i2
320 ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
321 eor $s2,$s2,$t2,ror#16
322 ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
323 mov $s3,$s3,lsr#24
324
325 ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
326 eor $s0,$s0,$i1,ror#24
327 ldr $i1,[$key],#16
328 eor $s1,$s1,$i2,ror#16
329 ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
330 eor $s2,$s2,$i3,ror#8
331 ldr $t1,[$key,#-12]
332 eor $s3,$s3,$t3,ror#8
333
334 ldr $t2,[$key,#-8]
335 eor $s0,$s0,$i1
336 ldr $t3,[$key,#-4]
337 and $i1,lr,$s0
338 eor $s1,$s1,$t1
339 and $i2,lr,$s0,lsr#8
340 eor $s2,$s2,$t2
341 and $i3,lr,$s0,lsr#16
342 eor $s3,$s3,$t3
343 mov $s0,$s0,lsr#24
344
345 subs $rounds,$rounds,#1
346 bne .Lenc_loop
347
348 add $tbl,$tbl,#2
349
350 ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0]
351 and $i1,lr,$s1,lsr#16 @ i0
352 ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
353 and $i2,lr,$s1
354 ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
355 and $i3,lr,$s1,lsr#8
356 ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
357 mov $s1,$s1,lsr#24
358
359 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16]
360 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0]
361 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8]
362 eor $s0,$i1,$s0,lsl#8
363 ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
364 and $i1,lr,$s2,lsr#8 @ i0
365 eor $t2,$i2,$t2,lsl#8
366 and $i2,lr,$s2,lsr#16 @ i1
367 eor $t3,$i3,$t3,lsl#8
368 and $i3,lr,$s2
369 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
370 eor $s1,$t1,$s1,lsl#24
371 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
372 mov $s2,$s2,lsr#24
373
374 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
375 eor $s0,$i1,$s0,lsl#8
376 ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
377 and $i1,lr,$s3 @ i0
378 eor $s1,$s1,$i2,lsl#16
379 and $i2,lr,$s3,lsr#8 @ i1
380 eor $t3,$i3,$t3,lsl#8
381 and $i3,lr,$s3,lsr#16 @ i2
382 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
383 eor $s2,$t2,$s2,lsl#24
384 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
385 mov $s3,$s3,lsr#24
386
387 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
388 eor $s0,$i1,$s0,lsl#8
389 ldr $i1,[$key,#0]
390 ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
391 eor $s1,$s1,$i2,lsl#8
392 ldr $t1,[$key,#4]
393 eor $s2,$s2,$i3,lsl#16
394 ldr $t2,[$key,#8]
395 eor $s3,$t3,$s3,lsl#24
396 ldr $t3,[$key,#12]
397
398 eor $s0,$s0,$i1
399 eor $s1,$s1,$t1
400 eor $s2,$s2,$t2
401 eor $s3,$s3,$t3
402
403 sub $tbl,$tbl,#2
404 ldr pc,[sp],#4 @ pop and return
405.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
406
407.global private_AES_set_encrypt_key
408.type private_AES_set_encrypt_key,%function
409.align 5
410private_AES_set_encrypt_key:
411_armv4_AES_set_encrypt_key:
412 sub r3,pc,#8 @ AES_set_encrypt_key
413 teq r0,#0
414 moveq r0,#-1
415 beq .Labrt
416 teq r2,#0
417 moveq r0,#-1
418 beq .Labrt
419
420 teq r1,#128
421 beq .Lok
422 teq r1,#192
423 beq .Lok
424 teq r1,#256
425 movne r0,#-1
426 bne .Labrt
427
428.Lok: stmdb sp!,{r4-r12,lr}
429 sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
430
431 mov $rounds,r0 @ inp
432 mov lr,r1 @ bits
433 mov $key,r2 @ key
434
435#if __ARM_ARCH__<7
436 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
437 ldrb $t1,[$rounds,#2] @ manner...
438 ldrb $t2,[$rounds,#1]
439 ldrb $t3,[$rounds,#0]
440 orr $s0,$s0,$t1,lsl#8
441 ldrb $s1,[$rounds,#7]
442 orr $s0,$s0,$t2,lsl#16
443 ldrb $t1,[$rounds,#6]
444 orr $s0,$s0,$t3,lsl#24
445 ldrb $t2,[$rounds,#5]
446 ldrb $t3,[$rounds,#4]
447 orr $s1,$s1,$t1,lsl#8
448 ldrb $s2,[$rounds,#11]
449 orr $s1,$s1,$t2,lsl#16
450 ldrb $t1,[$rounds,#10]
451 orr $s1,$s1,$t3,lsl#24
452 ldrb $t2,[$rounds,#9]
453 ldrb $t3,[$rounds,#8]
454 orr $s2,$s2,$t1,lsl#8
455 ldrb $s3,[$rounds,#15]
456 orr $s2,$s2,$t2,lsl#16
457 ldrb $t1,[$rounds,#14]
458 orr $s2,$s2,$t3,lsl#24
459 ldrb $t2,[$rounds,#13]
460 ldrb $t3,[$rounds,#12]
461 orr $s3,$s3,$t1,lsl#8
462 str $s0,[$key],#16
463 orr $s3,$s3,$t2,lsl#16
464 str $s1,[$key,#-12]
465 orr $s3,$s3,$t3,lsl#24
466 str $s2,[$key,#-8]
467 str $s3,[$key,#-4]
468#else
469 ldr $s0,[$rounds,#0]
470 ldr $s1,[$rounds,#4]
471 ldr $s2,[$rounds,#8]
472 ldr $s3,[$rounds,#12]
473#ifdef __ARMEL__
474 rev $s0,$s0
475 rev $s1,$s1
476 rev $s2,$s2
477 rev $s3,$s3
478#endif
479 str $s0,[$key],#16
480 str $s1,[$key,#-12]
481 str $s2,[$key,#-8]
482 str $s3,[$key,#-4]
483#endif
484
485 teq lr,#128
486 bne .Lnot128
487 mov $rounds,#10
488 str $rounds,[$key,#240-16]
489 add $t3,$tbl,#256 @ rcon
490 mov lr,#255
491
492.L128_loop:
493 and $t2,lr,$s3,lsr#24
494 and $i1,lr,$s3,lsr#16
495 ldrb $t2,[$tbl,$t2]
496 and $i2,lr,$s3,lsr#8
497 ldrb $i1,[$tbl,$i1]
498 and $i3,lr,$s3
499 ldrb $i2,[$tbl,$i2]
500 orr $t2,$t2,$i1,lsl#24
501 ldrb $i3,[$tbl,$i3]
502 orr $t2,$t2,$i2,lsl#16
503 ldr $t1,[$t3],#4 @ rcon[i++]
504 orr $t2,$t2,$i3,lsl#8
505 eor $t2,$t2,$t1
506 eor $s0,$s0,$t2 @ rk[4]=rk[0]^...
507 eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4]
508 str $s0,[$key],#16
509 eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
510 str $s1,[$key,#-12]
511 eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
512 str $s2,[$key,#-8]
513 subs $rounds,$rounds,#1
514 str $s3,[$key,#-4]
515 bne .L128_loop
516 sub r2,$key,#176
517 b .Ldone
518
519.Lnot128:
520#if __ARM_ARCH__<7
521 ldrb $i2,[$rounds,#19]
522 ldrb $t1,[$rounds,#18]
523 ldrb $t2,[$rounds,#17]
524 ldrb $t3,[$rounds,#16]
525 orr $i2,$i2,$t1,lsl#8
526 ldrb $i3,[$rounds,#23]
527 orr $i2,$i2,$t2,lsl#16
528 ldrb $t1,[$rounds,#22]
529 orr $i2,$i2,$t3,lsl#24
530 ldrb $t2,[$rounds,#21]
531 ldrb $t3,[$rounds,#20]
532 orr $i3,$i3,$t1,lsl#8
533 orr $i3,$i3,$t2,lsl#16
534 str $i2,[$key],#8
535 orr $i3,$i3,$t3,lsl#24
536 str $i3,[$key,#-4]
537#else
538 ldr $i2,[$rounds,#16]
539 ldr $i3,[$rounds,#20]
540#ifdef __ARMEL__
541 rev $i2,$i2
542 rev $i3,$i3
543#endif
544 str $i2,[$key],#8
545 str $i3,[$key,#-4]
546#endif
547
548 teq lr,#192
549 bne .Lnot192
550 mov $rounds,#12
551 str $rounds,[$key,#240-24]
552 add $t3,$tbl,#256 @ rcon
553 mov lr,#255
554 mov $rounds,#8
555
556.L192_loop:
557 and $t2,lr,$i3,lsr#24
558 and $i1,lr,$i3,lsr#16
559 ldrb $t2,[$tbl,$t2]
560 and $i2,lr,$i3,lsr#8
561 ldrb $i1,[$tbl,$i1]
562 and $i3,lr,$i3
563 ldrb $i2,[$tbl,$i2]
564 orr $t2,$t2,$i1,lsl#24
565 ldrb $i3,[$tbl,$i3]
566 orr $t2,$t2,$i2,lsl#16
567 ldr $t1,[$t3],#4 @ rcon[i++]
568 orr $t2,$t2,$i3,lsl#8
569 eor $i3,$t2,$t1
570 eor $s0,$s0,$i3 @ rk[6]=rk[0]^...
571 eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6]
572 str $s0,[$key],#24
573 eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
574 str $s1,[$key,#-20]
575 eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
576 str $s2,[$key,#-16]
577 subs $rounds,$rounds,#1
578 str $s3,[$key,#-12]
579 subeq r2,$key,#216
580 beq .Ldone
581
582 ldr $i1,[$key,#-32]
583 ldr $i2,[$key,#-28]
584 eor $i1,$i1,$s3 @ rk[10]=rk[4]^rk[9]
585 eor $i3,$i2,$i1 @ rk[11]=rk[5]^rk[10]
586 str $i1,[$key,#-8]
587 str $i3,[$key,#-4]
588 b .L192_loop
589
590.Lnot192:
591#if __ARM_ARCH__<7
592 ldrb $i2,[$rounds,#27]
593 ldrb $t1,[$rounds,#26]
594 ldrb $t2,[$rounds,#25]
595 ldrb $t3,[$rounds,#24]
596 orr $i2,$i2,$t1,lsl#8
597 ldrb $i3,[$rounds,#31]
598 orr $i2,$i2,$t2,lsl#16
599 ldrb $t1,[$rounds,#30]
600 orr $i2,$i2,$t3,lsl#24
601 ldrb $t2,[$rounds,#29]
602 ldrb $t3,[$rounds,#28]
603 orr $i3,$i3,$t1,lsl#8
604 orr $i3,$i3,$t2,lsl#16
605 str $i2,[$key],#8
606 orr $i3,$i3,$t3,lsl#24
607 str $i3,[$key,#-4]
608#else
609 ldr $i2,[$rounds,#24]
610 ldr $i3,[$rounds,#28]
611#ifdef __ARMEL__
612 rev $i2,$i2
613 rev $i3,$i3
614#endif
615 str $i2,[$key],#8
616 str $i3,[$key,#-4]
617#endif
618
619 mov $rounds,#14
620 str $rounds,[$key,#240-32]
621 add $t3,$tbl,#256 @ rcon
622 mov lr,#255
623 mov $rounds,#7
624
625.L256_loop:
626 and $t2,lr,$i3,lsr#24
627 and $i1,lr,$i3,lsr#16
628 ldrb $t2,[$tbl,$t2]
629 and $i2,lr,$i3,lsr#8
630 ldrb $i1,[$tbl,$i1]
631 and $i3,lr,$i3
632 ldrb $i2,[$tbl,$i2]
633 orr $t2,$t2,$i1,lsl#24
634 ldrb $i3,[$tbl,$i3]
635 orr $t2,$t2,$i2,lsl#16
636 ldr $t1,[$t3],#4 @ rcon[i++]
637 orr $t2,$t2,$i3,lsl#8
638 eor $i3,$t2,$t1
639 eor $s0,$s0,$i3 @ rk[8]=rk[0]^...
640 eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8]
641 str $s0,[$key],#32
642 eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
643 str $s1,[$key,#-28]
644 eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
645 str $s2,[$key,#-24]
646 subs $rounds,$rounds,#1
647 str $s3,[$key,#-20]
648 subeq r2,$key,#256
649 beq .Ldone
650
651 and $t2,lr,$s3
652 and $i1,lr,$s3,lsr#8
653 ldrb $t2,[$tbl,$t2]
654 and $i2,lr,$s3,lsr#16
655 ldrb $i1,[$tbl,$i1]
656 and $i3,lr,$s3,lsr#24
657 ldrb $i2,[$tbl,$i2]
658 orr $t2,$t2,$i1,lsl#8
659 ldrb $i3,[$tbl,$i3]
660 orr $t2,$t2,$i2,lsl#16
661 ldr $t1,[$key,#-48]
662 orr $t2,$t2,$i3,lsl#24
663
664 ldr $i1,[$key,#-44]
665 ldr $i2,[$key,#-40]
666 eor $t1,$t1,$t2 @ rk[12]=rk[4]^...
667 ldr $i3,[$key,#-36]
668 eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12]
669 str $t1,[$key,#-16]
670 eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
671 str $i1,[$key,#-12]
672 eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
673 str $i2,[$key,#-8]
674 str $i3,[$key,#-4]
675 b .L256_loop
676
677.Ldone: mov r0,#0
678 ldmia sp!,{r4-r12,lr}
679.Labrt: tst lr,#1
680 moveq pc,lr @ be binary compatible with V4, yet
681 bx lr @ interoperable with Thumb ISA:-)
682.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
683
684.global private_AES_set_decrypt_key
685.type private_AES_set_decrypt_key,%function
686.align 5
687private_AES_set_decrypt_key:
688 str lr,[sp,#-4]! @ push lr
689 bl _armv4_AES_set_encrypt_key
690 teq r0,#0
691 ldrne lr,[sp],#4 @ pop lr
692 bne .Labrt
693
694 stmdb sp!,{r4-r12}
695
696 ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
697 mov $key,r2 @ which is AES_KEY *key
698 mov $i1,r2
699 add $i2,r2,$rounds,lsl#4
700
701.Linv: ldr $s0,[$i1]
702 ldr $s1,[$i1,#4]
703 ldr $s2,[$i1,#8]
704 ldr $s3,[$i1,#12]
705 ldr $t1,[$i2]
706 ldr $t2,[$i2,#4]
707 ldr $t3,[$i2,#8]
708 ldr $i3,[$i2,#12]
709 str $s0,[$i2],#-16
710 str $s1,[$i2,#16+4]
711 str $s2,[$i2,#16+8]
712 str $s3,[$i2,#16+12]
713 str $t1,[$i1],#16
714 str $t2,[$i1,#-12]
715 str $t3,[$i1,#-8]
716 str $i3,[$i1,#-4]
717 teq $i1,$i2
718 bne .Linv
719___
720$mask80=$i1;
721$mask1b=$i2;
722$mask7f=$i3;
723$code.=<<___;
724 ldr $s0,[$key,#16]! @ prefetch tp1
725 mov $mask80,#0x80
726 mov $mask1b,#0x1b
727 orr $mask80,$mask80,#0x8000
728 orr $mask1b,$mask1b,#0x1b00
729 orr $mask80,$mask80,$mask80,lsl#16
730 orr $mask1b,$mask1b,$mask1b,lsl#16
731 sub $rounds,$rounds,#1
732 mvn $mask7f,$mask80
733 mov $rounds,$rounds,lsl#2 @ (rounds-1)*4
734
735.Lmix: and $t1,$s0,$mask80
736 and $s1,$s0,$mask7f
737 sub $t1,$t1,$t1,lsr#7
738 and $t1,$t1,$mask1b
739 eor $s1,$t1,$s1,lsl#1 @ tp2
740
741 and $t1,$s1,$mask80
742 and $s2,$s1,$mask7f
743 sub $t1,$t1,$t1,lsr#7
744 and $t1,$t1,$mask1b
745 eor $s2,$t1,$s2,lsl#1 @ tp4
746
747 and $t1,$s2,$mask80
748 and $s3,$s2,$mask7f
749 sub $t1,$t1,$t1,lsr#7
750 and $t1,$t1,$mask1b
751 eor $s3,$t1,$s3,lsl#1 @ tp8
752
753 eor $t1,$s1,$s2
754 eor $t2,$s0,$s3 @ tp9
755 eor $t1,$t1,$s3 @ tpe
756 eor $t1,$t1,$s1,ror#24
757 eor $t1,$t1,$t2,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8)
758 eor $t1,$t1,$s2,ror#16
759 eor $t1,$t1,$t2,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16)
760 eor $t1,$t1,$t2,ror#8 @ ^= ROTATE(tp9,24)
761
762 ldr $s0,[$key,#4] @ prefetch tp1
763 str $t1,[$key],#4
764 subs $rounds,$rounds,#1
765 bne .Lmix
766
767 mov r0,#0
768#if __ARM_ARCH__>=5
769 ldmia sp!,{r4-r12,pc}
770#else
771 ldmia sp!,{r4-r12,lr}
772 tst lr,#1
773 moveq pc,lr @ be binary compatible with V4, yet
774 bx lr @ interoperable with Thumb ISA:-)
775#endif
776.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
777
778.type AES_Td,%object
779.align 5
780AES_Td:
781.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
782.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
783.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
784.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
785.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
786.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
787.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
788.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
789.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
790.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
791.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
792.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
793.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
794.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
795.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
796.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
797.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
798.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
799.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
800.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
801.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
802.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
803.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
804.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
805.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
806.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
807.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
808.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
809.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
810.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
811.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
812.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
813.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
814.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
815.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
816.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
817.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
818.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
819.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
820.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
821.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
822.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
823.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
824.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
825.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
826.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
827.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
828.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
829.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
830.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
831.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
832.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
833.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
834.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
835.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
836.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
837.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
838.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
839.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
840.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
841.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
842.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
843.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
844.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
845@ Td4[256]
846.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
847.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
848.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
849.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
850.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
851.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
852.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
853.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
854.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
855.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
856.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
857.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
858.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
859.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
860.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
861.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
862.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
863.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
864.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
865.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
866.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
867.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
868.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
869.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
870.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
871.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
872.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
873.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
874.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
875.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
876.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
877.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
878.size AES_Td,.-AES_Td
879
880@ void AES_decrypt(const unsigned char *in, unsigned char *out,
881@ const AES_KEY *key) {
882.global AES_decrypt
883.type AES_decrypt,%function
884.align 5
885AES_decrypt:
886 sub r3,pc,#8 @ AES_decrypt
887 stmdb sp!,{r1,r4-r12,lr}
888 mov $rounds,r0 @ inp
889 mov $key,r2
890 sub $tbl,r3,#AES_decrypt-AES_Td @ Td
891#if __ARM_ARCH__<7
892 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
893 ldrb $t1,[$rounds,#2] @ manner...
894 ldrb $t2,[$rounds,#1]
895 ldrb $t3,[$rounds,#0]
896 orr $s0,$s0,$t1,lsl#8
897 ldrb $s1,[$rounds,#7]
898 orr $s0,$s0,$t2,lsl#16
899 ldrb $t1,[$rounds,#6]
900 orr $s0,$s0,$t3,lsl#24
901 ldrb $t2,[$rounds,#5]
902 ldrb $t3,[$rounds,#4]
903 orr $s1,$s1,$t1,lsl#8
904 ldrb $s2,[$rounds,#11]
905 orr $s1,$s1,$t2,lsl#16
906 ldrb $t1,[$rounds,#10]
907 orr $s1,$s1,$t3,lsl#24
908 ldrb $t2,[$rounds,#9]
909 ldrb $t3,[$rounds,#8]
910 orr $s2,$s2,$t1,lsl#8
911 ldrb $s3,[$rounds,#15]
912 orr $s2,$s2,$t2,lsl#16
913 ldrb $t1,[$rounds,#14]
914 orr $s2,$s2,$t3,lsl#24
915 ldrb $t2,[$rounds,#13]
916 ldrb $t3,[$rounds,#12]
917 orr $s3,$s3,$t1,lsl#8
918 orr $s3,$s3,$t2,lsl#16
919 orr $s3,$s3,$t3,lsl#24
920#else
921 ldr $s0,[$rounds,#0]
922 ldr $s1,[$rounds,#4]
923 ldr $s2,[$rounds,#8]
924 ldr $s3,[$rounds,#12]
925#ifdef __ARMEL__
926 rev $s0,$s0
927 rev $s1,$s1
928 rev $s2,$s2
929 rev $s3,$s3
930#endif
931#endif
932 bl _armv4_AES_decrypt
933
934 ldr $rounds,[sp],#4 @ pop out
935#if __ARM_ARCH__>=7
936#ifdef __ARMEL__
937 rev $s0,$s0
938 rev $s1,$s1
939 rev $s2,$s2
940 rev $s3,$s3
941#endif
942 str $s0,[$rounds,#0]
943 str $s1,[$rounds,#4]
944 str $s2,[$rounds,#8]
945 str $s3,[$rounds,#12]
946#else
947 mov $t1,$s0,lsr#24 @ write output in endian-neutral
948 mov $t2,$s0,lsr#16 @ manner...
949 mov $t3,$s0,lsr#8
950 strb $t1,[$rounds,#0]
951 strb $t2,[$rounds,#1]
952 mov $t1,$s1,lsr#24
953 strb $t3,[$rounds,#2]
954 mov $t2,$s1,lsr#16
955 strb $s0,[$rounds,#3]
956 mov $t3,$s1,lsr#8
957 strb $t1,[$rounds,#4]
958 strb $t2,[$rounds,#5]
959 mov $t1,$s2,lsr#24
960 strb $t3,[$rounds,#6]
961 mov $t2,$s2,lsr#16
962 strb $s1,[$rounds,#7]
963 mov $t3,$s2,lsr#8
964 strb $t1,[$rounds,#8]
965 strb $t2,[$rounds,#9]
966 mov $t1,$s3,lsr#24
967 strb $t3,[$rounds,#10]
968 mov $t2,$s3,lsr#16
969 strb $s2,[$rounds,#11]
970 mov $t3,$s3,lsr#8
971 strb $t1,[$rounds,#12]
972 strb $t2,[$rounds,#13]
973 strb $t3,[$rounds,#14]
974 strb $s3,[$rounds,#15]
975#endif
976#if __ARM_ARCH__>=5
977 ldmia sp!,{r4-r12,pc}
978#else
979 ldmia sp!,{r4-r12,lr}
980 tst lr,#1
981 moveq pc,lr @ be binary compatible with V4, yet
982 bx lr @ interoperable with Thumb ISA:-)
983#endif
984.size AES_decrypt,.-AES_decrypt
985
986.type _armv4_AES_decrypt,%function
987.align 2
988_armv4_AES_decrypt:
989 str lr,[sp,#-4]! @ push lr
990 ldmia $key!,{$t1-$i1}
991 eor $s0,$s0,$t1
992 ldr $rounds,[$key,#240-16]
993 eor $s1,$s1,$t2
994 eor $s2,$s2,$t3
995 eor $s3,$s3,$i1
996 sub $rounds,$rounds,#1
997 mov lr,#255
998
999 and $i1,lr,$s0,lsr#16
1000 and $i2,lr,$s0,lsr#8
1001 and $i3,lr,$s0
1002 mov $s0,$s0,lsr#24
1003.Ldec_loop:
1004 ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16]
1005 and $i1,lr,$s1 @ i0
1006 ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
1007 and $i2,lr,$s1,lsr#16
1008 ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
1009 and $i3,lr,$s1,lsr#8
1010 ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
1011 mov $s1,$s1,lsr#24
1012
1013 ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0]
1014 ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16]
1015 ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8]
1016 eor $s0,$s0,$i1,ror#24
1017 ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
1018 and $i1,lr,$s2,lsr#8 @ i0
1019 eor $t2,$i2,$t2,ror#8
1020 and $i2,lr,$s2 @ i1
1021 eor $t3,$i3,$t3,ror#8
1022 and $i3,lr,$s2,lsr#16
1023 ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
1024 eor $s1,$s1,$t1,ror#8
1025 ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
1026 mov $s2,$s2,lsr#24
1027
1028 ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
1029 eor $s0,$s0,$i1,ror#16
1030 ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
1031 and $i1,lr,$s3,lsr#16 @ i0
1032 eor $s1,$s1,$i2,ror#24
1033 and $i2,lr,$s3,lsr#8 @ i1
1034 eor $t3,$i3,$t3,ror#8
1035 and $i3,lr,$s3 @ i2
1036 ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
1037 eor $s2,$s2,$t2,ror#8
1038 ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
1039 mov $s3,$s3,lsr#24
1040
1041 ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
1042 eor $s0,$s0,$i1,ror#8
1043 ldr $i1,[$key],#16
1044 eor $s1,$s1,$i2,ror#16
1045 ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
1046 eor $s2,$s2,$i3,ror#24
1047
1048 ldr $t1,[$key,#-12]
1049 eor $s0,$s0,$i1
1050 ldr $t2,[$key,#-8]
1051 eor $s3,$s3,$t3,ror#8
1052 ldr $t3,[$key,#-4]
1053 and $i1,lr,$s0,lsr#16
1054 eor $s1,$s1,$t1
1055 and $i2,lr,$s0,lsr#8
1056 eor $s2,$s2,$t2
1057 and $i3,lr,$s0
1058 eor $s3,$s3,$t3
1059 mov $s0,$s0,lsr#24
1060
1061 subs $rounds,$rounds,#1
1062 bne .Ldec_loop
1063
1064 add $tbl,$tbl,#1024
1065
1066 ldr $t2,[$tbl,#0] @ prefetch Td4
1067 ldr $t3,[$tbl,#32]
1068 ldr $t1,[$tbl,#64]
1069 ldr $t2,[$tbl,#96]
1070 ldr $t3,[$tbl,#128]
1071 ldr $t1,[$tbl,#160]
1072 ldr $t2,[$tbl,#192]
1073 ldr $t3,[$tbl,#224]
1074
1075 ldrb $s0,[$tbl,$s0] @ Td4[s0>>24]
1076 ldrb $t1,[$tbl,$i1] @ Td4[s0>>16]
1077 and $i1,lr,$s1 @ i0
1078 ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
1079 and $i2,lr,$s1,lsr#16
1080 ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
1081 and $i3,lr,$s1,lsr#8
1082
1083 ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
1084 ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
1085 ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
1086 eor $s0,$i1,$s0,lsl#24
1087 ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
1088 eor $s1,$t1,$s1,lsl#8
1089 and $i1,lr,$s2,lsr#8 @ i0
1090 eor $t2,$t2,$i2,lsl#8
1091 and $i2,lr,$s2 @ i1
1092 ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
1093 eor $t3,$t3,$i3,lsl#8
1094 ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
1095 and $i3,lr,$s2,lsr#16
1096
1097 ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
1098 eor $s0,$s0,$i1,lsl#8
1099 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
1100 eor $s1,$i2,$s1,lsl#16
1101 and $i1,lr,$s3,lsr#16 @ i0
1102 eor $s2,$t2,$s2,lsl#16
1103 and $i2,lr,$s3,lsr#8 @ i1
1104 ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
1105 eor $t3,$t3,$i3,lsl#16
1106 ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
1107 and $i3,lr,$s3 @ i2
1108
1109 ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
1110 ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
1111 eor $s0,$s0,$i1,lsl#16
1112 ldr $i1,[$key,#0]
1113 eor $s1,$s1,$i2,lsl#8
1114 ldr $t1,[$key,#4]
1115 eor $s2,$i3,$s2,lsl#8
1116 ldr $t2,[$key,#8]
1117 eor $s3,$t3,$s3,lsl#24
1118 ldr $t3,[$key,#12]
1119
1120 eor $s0,$s0,$i1
1121 eor $s1,$s1,$t1
1122 eor $s2,$s2,$t2
1123 eor $s3,$s3,$t3
1124
1125 sub $tbl,$tbl,#1024
1126 ldr pc,[sp],#4 @ pop and return
1127.size _armv4_AES_decrypt,.-_armv4_AES_decrypt
1128.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
1129.align 2
1130___
1131
1132$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
1133print $code;
1134close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/aes/asm/aes-ia64.S b/src/lib/libcrypto/aes/asm/aes-ia64.S
deleted file mode 100644
index 7f6c4c3662..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-ia64.S
+++ /dev/null
@@ -1,1123 +0,0 @@
1// ====================================================================
2// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
3// project. Rights for redistribution and usage in source and binary
4// forms are granted according to the OpenSSL license.
5// ====================================================================
6//
7// What's wrong with compiler generated code? Compiler never uses
8// variable 'shr' which is pairable with 'extr'/'dep' instructions.
9// Then it uses 'zxt' which is an I-type, but can be replaced with
10// 'and' which in turn can be assigned to M-port [there're double as
11// much M-ports as there're I-ports on Itanium 2]. By sacrificing few
12// registers for small constants (255, 24 and 16) to be used with
13// 'shr' and 'and' instructions I can achieve better ILP, Intruction
14// Level Parallelism, and performance. This code outperforms GCC 3.3
15// generated code by over factor of 2 (two), GCC 3.4 - by 70% and
16// HP C - by 40%. Measured best-case scenario, i.e. aligned
17// big-endian input, ECB timing on Itanium 2 is (18 + 13*rounds)
18// ticks per block, or 9.25 CPU cycles per byte for 128 bit key.
19
20// Version 1.2 mitigates the hazard of cache-timing attacks by
21// a) compressing S-boxes from 8KB to 2KB+256B, b) scheduling
22// references to S-boxes for L2 cache latency, c) prefetching T[ed]4
23// prior last round. As result performance dropped to (26 + 15*rounds)
24// ticks per block or 11 cycles per byte processed with 128-bit key.
25// This is ~16% deterioration. For reference Itanium 2 L1 cache has
26// 64 bytes line size and L2 - 128 bytes...
27
28.ident "aes-ia64.S, version 1.2"
29.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
30.explicit
31.text
32
33rk0=r8; rk1=r9;
34
35pfssave=r2;
36lcsave=r10;
37prsave=r3;
38maskff=r11;
39twenty4=r14;
40sixteen=r15;
41
42te00=r16; te11=r17; te22=r18; te33=r19;
43te01=r20; te12=r21; te23=r22; te30=r23;
44te02=r24; te13=r25; te20=r26; te31=r27;
45te03=r28; te10=r29; te21=r30; te32=r31;
46
47// these are rotating...
48t0=r32; s0=r33;
49t1=r34; s1=r35;
50t2=r36; s2=r37;
51t3=r38; s3=r39;
52
53te0=r40; te1=r41; te2=r42; te3=r43;
54
55#if defined(_HPUX_SOURCE) && !defined(_LP64)
56# define ADDP addp4
57#else
58# define ADDP add
59#endif
60
61// Offsets from Te0
62#define TE0 0
63#define TE2 2
64#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
65#define TE1 3
66#define TE3 1
67#else
68#define TE1 1
69#define TE3 3
70#endif
71
72// This implies that AES_KEY comprises 32-bit key schedule elements
73// even on LP64 platforms.
74#ifndef KSZ
75# define KSZ 4
76# define LDKEY ld4
77#endif
78
79.proc _ia64_AES_encrypt#
80// Input: rk0-rk1
81// te0
82// te3 as AES_KEY->rounds!!!
83// s0-s3
84// maskff,twenty4,sixteen
85// Output: r16,r20,r24,r28 as s0-s3
86// Clobber: r16-r31,rk0-rk1,r32-r43
87.align 32
88_ia64_AES_encrypt:
89 .prologue
90 .altrp b6
91 .body
92{ .mmi; alloc r16=ar.pfs,12,0,0,8
93 LDKEY t0=[rk0],2*KSZ
94 mov pr.rot=1<<16 }
95{ .mmi; LDKEY t1=[rk1],2*KSZ
96 add te1=TE1,te0
97 add te3=-3,te3 };;
98{ .mib; LDKEY t2=[rk0],2*KSZ
99 mov ar.ec=2 }
100{ .mib; LDKEY t3=[rk1],2*KSZ
101 add te2=TE2,te0
102 brp.loop.imp .Le_top,.Le_end-16 };;
103
104{ .mmi; xor s0=s0,t0
105 xor s1=s1,t1
106 mov ar.lc=te3 }
107{ .mmi; xor s2=s2,t2
108 xor s3=s3,t3
109 add te3=TE3,te0 };;
110
111.align 32
112.Le_top:
113{ .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
114 (p0) and te33=s3,maskff // 0/0:s3&0xff
115 (p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
116{ .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
117 (p0) and te30=s0,maskff // 0/1:s0&0xff
118 (p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24
119{ .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
120 (p0) shladd te33=te33,3,te3 // 1/0:te0+s0>>24
121 (p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
122{ .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
123 (p0) shladd te30=te30,3,te3 // 1/1:te3+s0
124 (p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24
125{ .mmi; (p0) ld4 te33=[te33] // 2/0:te3[s3&0xff]
126 (p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff
127 (p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
128{ .mmi; (p0) ld4 te30=[te30] // 2/1:te3[s0]
129 (p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8
130 (p0) shr.u te02=s2,twenty4 };; // 2/2:s2>>24
131{ .mmi; (p0) ld4 te22=[te22] // 3/0:te2[s2>>8]
132 (p0) shladd te20=te20,3,te2 // 3/2:te2+s0>>8
133 (p0) extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
134{ .mmi; (p0) ld4 te23=[te23] // 3/1:te2[s3>>8]
135 (p0) shladd te00=te00,3,te0 // 3/0:te0+s0>>24
136 (p0) shr.u te03=s3,twenty4 };; // 3/3:s3>>24
137{ .mmi; (p0) ld4 te20=[te20] // 4/2:te2[s0>>8]
138 (p0) shladd te21=te21,3,te2 // 4/3:te3+s2
139 (p0) extr.u te11=s1,16,8 } // 4/0:s1>>16&0xff
140{ .mmi; (p0) ld4 te00=[te00] // 4/0:te0[s0>>24]
141 (p0) shladd te01=te01,3,te0 // 4/1:te0+s1>>24
142 (p0) shr.u te13=s3,sixteen };; // 4/2:s3>>16
143{ .mmi; (p0) ld4 te21=[te21] // 5/3:te2[s1>>8]
144 (p0) shladd te11=te11,3,te1 // 5/0:te1+s1>>16
145 (p0) extr.u te12=s2,16,8 } // 5/1:s2>>16&0xff
146{ .mmi; (p0) ld4 te01=[te01] // 5/1:te0[s1>>24]
147 (p0) shladd te02=te02,3,te0 // 5/2:te0+s2>>24
148 (p0) and te31=s1,maskff };; // 5/2:s1&0xff
149{ .mmi; (p0) ld4 te11=[te11] // 6/0:te1[s1>>16]
150 (p0) shladd te12=te12,3,te1 // 6/1:te1+s2>>16
151 (p0) extr.u te10=s0,16,8 } // 6/3:s0>>16&0xff
152{ .mmi; (p0) ld4 te02=[te02] // 6/2:te0[s2>>24]
153 (p0) shladd te03=te03,3,te0 // 6/3:te1+s0>>16
154 (p0) and te32=s2,maskff };; // 6/3:s2&0xff
155
156{ .mmi; (p0) ld4 te12=[te12] // 7/1:te1[s2>>16]
157 (p0) shladd te31=te31,3,te3 // 7/2:te3+s1&0xff
158 (p0) and te13=te13,maskff} // 7/2:s3>>16&0xff
159{ .mmi; (p0) ld4 te03=[te03] // 7/3:te0[s3>>24]
160 (p0) shladd te32=te32,3,te3 // 7/3:te3+s2
161 (p0) xor t0=t0,te33 };; // 7/0:
162{ .mmi; (p0) ld4 te31=[te31] // 8/2:te3[s1]
163 (p0) shladd te13=te13,3,te1 // 8/2:te1+s3>>16
164 (p0) xor t0=t0,te22 } // 8/0:
165{ .mmi; (p0) ld4 te32=[te32] // 8/3:te3[s2]
166 (p0) shladd te10=te10,3,te1 // 8/3:te1+s0>>16
167 (p0) xor t1=t1,te30 };; // 8/1:
168{ .mmi; (p0) ld4 te13=[te13] // 9/2:te1[s3>>16]
169 (p0) ld4 te10=[te10] // 9/3:te1[s0>>16]
170 (p0) xor t0=t0,te00 };; // 9/0: !L2 scheduling
171{ .mmi; (p0) xor t1=t1,te23 // 10[9]/1:
172 (p0) xor t2=t2,te20 // 10[9]/2:
173 (p0) xor t3=t3,te21 };; // 10[9]/3:
174{ .mmi; (p0) xor t0=t0,te11 // 11[10]/0:done!
175 (p0) xor t1=t1,te01 // 11[10]/1:
176 (p0) xor t2=t2,te02 };; // 11[10]/2: !L2 scheduling
177{ .mmi; (p0) xor t3=t3,te03 // 12[10]/3:
178 (p16) cmp.eq p0,p17=r0,r0 };; // 12[10]/clear (p17)
179{ .mmi; (p0) xor t1=t1,te12 // 13[11]/1:done!
180 (p0) xor t2=t2,te31 // 13[11]/2:
181 (p0) xor t3=t3,te32 } // 13[11]/3:
182{ .mmi; (p17) add te0=2048,te0 // 13[11]/
183 (p17) add te1=2048+64-TE1,te1};; // 13[11]/
184{ .mib; (p0) xor t2=t2,te13 // 14[12]/2:done!
185 (p17) add te2=2048+128-TE2,te2} // 14[12]/
186{ .mib; (p0) xor t3=t3,te10 // 14[12]/3:done!
187 (p17) add te3=2048+192-TE3,te3 // 14[12]/
188 br.ctop.sptk .Le_top };;
189.Le_end:
190
191
192{ .mmi; ld8 te12=[te0] // prefetch Te4
193 ld8 te31=[te1] }
194{ .mmi; ld8 te10=[te2]
195 ld8 te32=[te3] }
196
197{ .mmi; LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
198 and te33=s3,maskff // 0/0:s3&0xff
199 extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
200{ .mmi; LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
201 and te30=s0,maskff // 0/1:s0&0xff
202 shr.u te00=s0,twenty4 };; // 0/0:s0>>24
203{ .mmi; LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
204 add te33=te33,te0 // 1/0:te0+s0>>24
205 extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
206{ .mmi; LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
207 add te30=te30,te0 // 1/1:te0+s0
208 shr.u te01=s1,twenty4 };; // 1/1:s1>>24
209{ .mmi; ld1 te33=[te33] // 2/0:te0[s3&0xff]
210 add te22=te22,te0 // 2/0:te0+s2>>8&0xff
211 extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
212{ .mmi; ld1 te30=[te30] // 2/1:te0[s0]
213 add te23=te23,te0 // 2/1:te0+s3>>8
214 shr.u te02=s2,twenty4 };; // 2/2:s2>>24
215{ .mmi; ld1 te22=[te22] // 3/0:te0[s2>>8]
216 add te20=te20,te0 // 3/2:te0+s0>>8
217 extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
218{ .mmi; ld1 te23=[te23] // 3/1:te0[s3>>8]
219 add te00=te00,te0 // 3/0:te0+s0>>24
220 shr.u te03=s3,twenty4 };; // 3/3:s3>>24
221{ .mmi; ld1 te20=[te20] // 4/2:te0[s0>>8]
222 add te21=te21,te0 // 4/3:te0+s2
223 extr.u te11=s1,16,8 } // 4/0:s1>>16&0xff
224{ .mmi; ld1 te00=[te00] // 4/0:te0[s0>>24]
225 add te01=te01,te0 // 4/1:te0+s1>>24
226 shr.u te13=s3,sixteen };; // 4/2:s3>>16
227{ .mmi; ld1 te21=[te21] // 5/3:te0[s1>>8]
228 add te11=te11,te0 // 5/0:te0+s1>>16
229 extr.u te12=s2,16,8 } // 5/1:s2>>16&0xff
230{ .mmi; ld1 te01=[te01] // 5/1:te0[s1>>24]
231 add te02=te02,te0 // 5/2:te0+s2>>24
232 and te31=s1,maskff };; // 5/2:s1&0xff
233{ .mmi; ld1 te11=[te11] // 6/0:te0[s1>>16]
234 add te12=te12,te0 // 6/1:te0+s2>>16
235 extr.u te10=s0,16,8 } // 6/3:s0>>16&0xff
236{ .mmi; ld1 te02=[te02] // 6/2:te0[s2>>24]
237 add te03=te03,te0 // 6/3:te0+s0>>16
238 and te32=s2,maskff };; // 6/3:s2&0xff
239
240{ .mmi; ld1 te12=[te12] // 7/1:te0[s2>>16]
241 add te31=te31,te0 // 7/2:te0+s1&0xff
242 dep te33=te22,te33,8,8} // 7/0:
243{ .mmi; ld1 te03=[te03] // 7/3:te0[s3>>24]
244 add te32=te32,te0 // 7/3:te0+s2
245 and te13=te13,maskff};; // 7/2:s3>>16&0xff
246{ .mmi; ld1 te31=[te31] // 8/2:te0[s1]
247 add te13=te13,te0 // 8/2:te0+s3>>16
248 dep te30=te23,te30,8,8} // 8/1:
249{ .mmi; ld1 te32=[te32] // 8/3:te0[s2]
250 add te10=te10,te0 // 8/3:te0+s0>>16
251 shl te00=te00,twenty4};; // 8/0:
252{ .mii; ld1 te13=[te13] // 9/2:te0[s3>>16]
253 dep te33=te11,te33,16,8 // 9/0:
254 shl te01=te01,twenty4};; // 9/1:
255{ .mii; ld1 te10=[te10] // 10/3:te0[s0>>16]
256 dep te31=te20,te31,8,8 // 10/2:
257 shl te02=te02,twenty4};; // 10/2:
258{ .mii; xor t0=t0,te33 // 11/0:
259 dep te32=te21,te32,8,8 // 11/3:
260 shl te12=te12,sixteen};; // 11/1:
261{ .mii; xor r16=t0,te00 // 12/0:done!
262 dep te31=te13,te31,16,8 // 12/2:
263 shl te03=te03,twenty4};; // 12/3:
264{ .mmi; xor t1=t1,te01 // 13/1:
265 xor t2=t2,te02 // 13/2:
266 dep te32=te10,te32,16,8};; // 13/3:
267{ .mmi; xor t1=t1,te30 // 14/1:
268 xor r24=t2,te31 // 14/2:done!
269 xor t3=t3,te32 };; // 14/3:
270{ .mib; xor r20=t1,te12 // 15/1:done!
271 xor r28=t3,te03 // 15/3:done!
272 br.ret.sptk b6 };;
273.endp _ia64_AES_encrypt#
274
275// void AES_encrypt (const void *in,void *out,const AES_KEY *key);
276.global AES_encrypt#
277.proc AES_encrypt#
278.align 32
279AES_encrypt:
280 .prologue
281 .save ar.pfs,pfssave
282{ .mmi; alloc pfssave=ar.pfs,3,1,12,0
283 and out0=3,in0
284 mov r3=ip }
285{ .mmi; ADDP in0=0,in0
286 mov loc0=psr.um
287 ADDP out11=KSZ*60,in2 };; // &AES_KEY->rounds
288
289{ .mmi; ld4 out11=[out11] // AES_KEY->rounds
290 add out8=(AES_Te#-AES_encrypt#),r3 // Te0
291 .save pr,prsave
292 mov prsave=pr }
293{ .mmi; rum 1<<3 // clear um.ac
294 .save ar.lc,lcsave
295 mov lcsave=ar.lc };;
296
297 .body
298#if defined(_HPUX_SOURCE) // HPUX is big-endian, cut 15+15 cycles...
299{ .mib; cmp.ne p6,p0=out0,r0
300 add out0=4,in0
301(p6) br.dpnt.many .Le_i_unaligned };;
302
303{ .mmi; ld4 out1=[in0],8 // s0
304 and out9=3,in1
305 mov twenty4=24 }
306{ .mmi; ld4 out3=[out0],8 // s1
307 ADDP rk0=0,in2
308 mov sixteen=16 };;
309{ .mmi; ld4 out5=[in0] // s2
310 cmp.ne p6,p0=out9,r0
311 mov maskff=0xff }
312{ .mmb; ld4 out7=[out0] // s3
313 ADDP rk1=KSZ,in2
314 br.call.sptk.many b6=_ia64_AES_encrypt };;
315
316{ .mib; ADDP in0=4,in1
317 ADDP in1=0,in1
318(p6) br.spnt .Le_o_unaligned };;
319
320{ .mii; mov psr.um=loc0
321 mov ar.pfs=pfssave
322 mov ar.lc=lcsave };;
323{ .mmi; st4 [in1]=r16,8 // s0
324 st4 [in0]=r20,8 // s1
325 mov pr=prsave,0x1ffff };;
326{ .mmb; st4 [in1]=r24 // s2
327 st4 [in0]=r28 // s3
328 br.ret.sptk.many b0 };;
329#endif
330
331.align 32
332.Le_i_unaligned:
333{ .mmi; add out0=1,in0
334 add out2=2,in0
335 add out4=3,in0 };;
336{ .mmi; ld1 r16=[in0],4
337 ld1 r17=[out0],4 }//;;
338{ .mmi; ld1 r18=[out2],4
339 ld1 out1=[out4],4 };; // s0
340{ .mmi; ld1 r20=[in0],4
341 ld1 r21=[out0],4 }//;;
342{ .mmi; ld1 r22=[out2],4
343 ld1 out3=[out4],4 };; // s1
344{ .mmi; ld1 r24=[in0],4
345 ld1 r25=[out0],4 }//;;
346{ .mmi; ld1 r26=[out2],4
347 ld1 out5=[out4],4 };; // s2
348{ .mmi; ld1 r28=[in0]
349 ld1 r29=[out0] }//;;
350{ .mmi; ld1 r30=[out2]
351 ld1 out7=[out4] };; // s3
352
353{ .mii;
354 dep out1=r16,out1,24,8 //;;
355 dep out3=r20,out3,24,8 }//;;
356{ .mii; ADDP rk0=0,in2
357 dep out5=r24,out5,24,8 //;;
358 dep out7=r28,out7,24,8 };;
359{ .mii; ADDP rk1=KSZ,in2
360 dep out1=r17,out1,16,8 //;;
361 dep out3=r21,out3,16,8 }//;;
362{ .mii; mov twenty4=24
363 dep out5=r25,out5,16,8 //;;
364 dep out7=r29,out7,16,8 };;
365{ .mii; mov sixteen=16
366 dep out1=r18,out1,8,8 //;;
367 dep out3=r22,out3,8,8 }//;;
368{ .mii; mov maskff=0xff
369 dep out5=r26,out5,8,8 //;;
370 dep out7=r30,out7,8,8 };;
371
372{ .mib; br.call.sptk.many b6=_ia64_AES_encrypt };;
373
374.Le_o_unaligned:
375{ .mii; ADDP out0=0,in1
376 extr.u r17=r16,8,8 // s0
377 shr.u r19=r16,twenty4 }//;;
378{ .mii; ADDP out1=1,in1
379 extr.u r18=r16,16,8
380 shr.u r23=r20,twenty4 }//;; // s1
381{ .mii; ADDP out2=2,in1
382 extr.u r21=r20,8,8
383 shr.u r22=r20,sixteen }//;;
384{ .mii; ADDP out3=3,in1
385 extr.u r25=r24,8,8 // s2
386 shr.u r27=r24,twenty4 };;
387{ .mii; st1 [out3]=r16,4
388 extr.u r26=r24,16,8
389 shr.u r31=r28,twenty4 }//;; // s3
390{ .mii; st1 [out2]=r17,4
391 extr.u r29=r28,8,8
392 shr.u r30=r28,sixteen }//;;
393
394{ .mmi; st1 [out1]=r18,4
395 st1 [out0]=r19,4 };;
396{ .mmi; st1 [out3]=r20,4
397 st1 [out2]=r21,4 }//;;
398{ .mmi; st1 [out1]=r22,4
399 st1 [out0]=r23,4 };;
400{ .mmi; st1 [out3]=r24,4
401 st1 [out2]=r25,4
402 mov pr=prsave,0x1ffff }//;;
403{ .mmi; st1 [out1]=r26,4
404 st1 [out0]=r27,4
405 mov ar.pfs=pfssave };;
406{ .mmi; st1 [out3]=r28
407 st1 [out2]=r29
408 mov ar.lc=lcsave }//;;
409{ .mmi; st1 [out1]=r30
410 st1 [out0]=r31 }
411{ .mfb; mov psr.um=loc0 // restore user mask
412 br.ret.sptk.many b0 };;
413.endp AES_encrypt#
414
415// *AES_decrypt are autogenerated by the following script:
416#if 0
417#!/usr/bin/env perl
418print "// *AES_decrypt are autogenerated by the following script:\n#if 0\n";
419open(PROG,'<'.$0); while(<PROG>) { print; } close(PROG);
420print "#endif\n";
421while(<>) {
422 $process=1 if (/\.proc\s+_ia64_AES_encrypt/);
423 next if (!$process);
424
425 #s/te00=s0/td00=s0/; s/te00/td00/g;
426 s/te11=s1/td13=s3/; s/te11/td13/g;
427 #s/te22=s2/td22=s2/; s/te22/td22/g;
428 s/te33=s3/td31=s1/; s/te33/td31/g;
429
430 #s/te01=s1/td01=s1/; s/te01/td01/g;
431 s/te12=s2/td10=s0/; s/te12/td10/g;
432 #s/te23=s3/td23=s3/; s/te23/td23/g;
433 s/te30=s0/td32=s2/; s/te30/td32/g;
434
435 #s/te02=s2/td02=s2/; s/te02/td02/g;
436 s/te13=s3/td11=s1/; s/te13/td11/g;
437 #s/te20=s0/td20=s0/; s/te20/td20/g;
438 s/te31=s1/td33=s3/; s/te31/td33/g;
439
440 #s/te03=s3/td03=s3/; s/te03/td03/g;
441 s/te10=s0/td12=s2/; s/te10/td12/g;
442 #s/te21=s1/td21=s1/; s/te21/td21/g;
443 s/te32=s2/td30=s0/; s/te32/td30/g;
444
445 s/td/te/g;
446
447 s/AES_encrypt/AES_decrypt/g;
448 s/\.Le_/.Ld_/g;
449 s/AES_Te#/AES_Td#/g;
450
451 print;
452
453 exit if (/\.endp\s+AES_decrypt/);
454}
455#endif
456.proc _ia64_AES_decrypt#
457// Input: rk0-rk1
458// te0
459// te3 as AES_KEY->rounds!!!
460// s0-s3
461// maskff,twenty4,sixteen
462// Output: r16,r20,r24,r28 as s0-s3
463// Clobber: r16-r31,rk0-rk1,r32-r43
464.align 32
465_ia64_AES_decrypt:
466 .prologue
467 .altrp b6
468 .body
469{ .mmi; alloc r16=ar.pfs,12,0,0,8
470 LDKEY t0=[rk0],2*KSZ
471 mov pr.rot=1<<16 }
472{ .mmi; LDKEY t1=[rk1],2*KSZ
473 add te1=TE1,te0
474 add te3=-3,te3 };;
475{ .mib; LDKEY t2=[rk0],2*KSZ
476 mov ar.ec=2 }
477{ .mib; LDKEY t3=[rk1],2*KSZ
478 add te2=TE2,te0
479 brp.loop.imp .Ld_top,.Ld_end-16 };;
480
481{ .mmi; xor s0=s0,t0
482 xor s1=s1,t1
483 mov ar.lc=te3 }
484{ .mmi; xor s2=s2,t2
485 xor s3=s3,t3
486 add te3=TE3,te0 };;
487
488.align 32
489.Ld_top:
490{ .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
491 (p0) and te31=s1,maskff // 0/0:s3&0xff
492 (p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
493{ .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
494 (p0) and te32=s2,maskff // 0/1:s0&0xff
495 (p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24
496{ .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
497 (p0) shladd te31=te31,3,te3 // 1/0:te0+s0>>24
498 (p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
499{ .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
500 (p0) shladd te32=te32,3,te3 // 1/1:te3+s0
501 (p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24
502{ .mmi; (p0) ld4 te31=[te31] // 2/0:te3[s3&0xff]
503 (p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff
504 (p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
505{ .mmi; (p0) ld4 te32=[te32] // 2/1:te3[s0]
506 (p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8
507 (p0) shr.u te02=s2,twenty4 };; // 2/2:s2>>24
508{ .mmi; (p0) ld4 te22=[te22] // 3/0:te2[s2>>8]
509 (p0) shladd te20=te20,3,te2 // 3/2:te2+s0>>8
510 (p0) extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
511{ .mmi; (p0) ld4 te23=[te23] // 3/1:te2[s3>>8]
512 (p0) shladd te00=te00,3,te0 // 3/0:te0+s0>>24
513 (p0) shr.u te03=s3,twenty4 };; // 3/3:s3>>24
514{ .mmi; (p0) ld4 te20=[te20] // 4/2:te2[s0>>8]
515 (p0) shladd te21=te21,3,te2 // 4/3:te3+s2
516 (p0) extr.u te13=s3,16,8 } // 4/0:s1>>16&0xff
517{ .mmi; (p0) ld4 te00=[te00] // 4/0:te0[s0>>24]
518 (p0) shladd te01=te01,3,te0 // 4/1:te0+s1>>24
519 (p0) shr.u te11=s1,sixteen };; // 4/2:s3>>16
520{ .mmi; (p0) ld4 te21=[te21] // 5/3:te2[s1>>8]
521 (p0) shladd te13=te13,3,te1 // 5/0:te1+s1>>16
522 (p0) extr.u te10=s0,16,8 } // 5/1:s2>>16&0xff
523{ .mmi; (p0) ld4 te01=[te01] // 5/1:te0[s1>>24]
524 (p0) shladd te02=te02,3,te0 // 5/2:te0+s2>>24
525 (p0) and te33=s3,maskff };; // 5/2:s1&0xff
526{ .mmi; (p0) ld4 te13=[te13] // 6/0:te1[s1>>16]
527 (p0) shladd te10=te10,3,te1 // 6/1:te1+s2>>16
528 (p0) extr.u te12=s2,16,8 } // 6/3:s0>>16&0xff
529{ .mmi; (p0) ld4 te02=[te02] // 6/2:te0[s2>>24]
530 (p0) shladd te03=te03,3,te0 // 6/3:te1+s0>>16
531 (p0) and te30=s0,maskff };; // 6/3:s2&0xff
532
533{ .mmi; (p0) ld4 te10=[te10] // 7/1:te1[s2>>16]
534 (p0) shladd te33=te33,3,te3 // 7/2:te3+s1&0xff
535 (p0) and te11=te11,maskff} // 7/2:s3>>16&0xff
536{ .mmi; (p0) ld4 te03=[te03] // 7/3:te0[s3>>24]
537 (p0) shladd te30=te30,3,te3 // 7/3:te3+s2
538 (p0) xor t0=t0,te31 };; // 7/0:
539{ .mmi; (p0) ld4 te33=[te33] // 8/2:te3[s1]
540 (p0) shladd te11=te11,3,te1 // 8/2:te1+s3>>16
541 (p0) xor t0=t0,te22 } // 8/0:
542{ .mmi; (p0) ld4 te30=[te30] // 8/3:te3[s2]
543 (p0) shladd te12=te12,3,te1 // 8/3:te1+s0>>16
544 (p0) xor t1=t1,te32 };; // 8/1:
545{ .mmi; (p0) ld4 te11=[te11] // 9/2:te1[s3>>16]
546 (p0) ld4 te12=[te12] // 9/3:te1[s0>>16]
547 (p0) xor t0=t0,te00 };; // 9/0: !L2 scheduling
548{ .mmi; (p0) xor t1=t1,te23 // 10[9]/1:
549 (p0) xor t2=t2,te20 // 10[9]/2:
550 (p0) xor t3=t3,te21 };; // 10[9]/3:
551{ .mmi; (p0) xor t0=t0,te13 // 11[10]/0:done!
552 (p0) xor t1=t1,te01 // 11[10]/1:
553 (p0) xor t2=t2,te02 };; // 11[10]/2: !L2 scheduling
554{ .mmi; (p0) xor t3=t3,te03 // 12[10]/3:
555 (p16) cmp.eq p0,p17=r0,r0 };; // 12[10]/clear (p17)
556{ .mmi; (p0) xor t1=t1,te10 // 13[11]/1:done!
557 (p0) xor t2=t2,te33 // 13[11]/2:
558 (p0) xor t3=t3,te30 } // 13[11]/3:
559{ .mmi; (p17) add te0=2048,te0 // 13[11]/
560 (p17) add te1=2048+64-TE1,te1};; // 13[11]/
561{ .mib; (p0) xor t2=t2,te11 // 14[12]/2:done!
562 (p17) add te2=2048+128-TE2,te2} // 14[12]/
563{ .mib; (p0) xor t3=t3,te12 // 14[12]/3:done!
564 (p17) add te3=2048+192-TE3,te3 // 14[12]/
565 br.ctop.sptk .Ld_top };;
566.Ld_end:
567
568
569{ .mmi; ld8 te10=[te0] // prefetch Td4
570 ld8 te33=[te1] }
571{ .mmi; ld8 te12=[te2]
572 ld8 te30=[te3] }
573
574{ .mmi; LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
575 and te31=s1,maskff // 0/0:s3&0xff
576 extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
577{ .mmi; LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
578 and te32=s2,maskff // 0/1:s0&0xff
579 shr.u te00=s0,twenty4 };; // 0/0:s0>>24
580{ .mmi; LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
581 add te31=te31,te0 // 1/0:te0+s0>>24
582 extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
583{ .mmi; LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
584 add te32=te32,te0 // 1/1:te0+s0
585 shr.u te01=s1,twenty4 };; // 1/1:s1>>24
586{ .mmi; ld1 te31=[te31] // 2/0:te0[s3&0xff]
587 add te22=te22,te0 // 2/0:te0+s2>>8&0xff
588 extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
589{ .mmi; ld1 te32=[te32] // 2/1:te0[s0]
590 add te23=te23,te0 // 2/1:te0+s3>>8
591 shr.u te02=s2,twenty4 };; // 2/2:s2>>24
592{ .mmi; ld1 te22=[te22] // 3/0:te0[s2>>8]
593 add te20=te20,te0 // 3/2:te0+s0>>8
594 extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
595{ .mmi; ld1 te23=[te23] // 3/1:te0[s3>>8]
596 add te00=te00,te0 // 3/0:te0+s0>>24
597 shr.u te03=s3,twenty4 };; // 3/3:s3>>24
598{ .mmi; ld1 te20=[te20] // 4/2:te0[s0>>8]
599 add te21=te21,te0 // 4/3:te0+s2
600 extr.u te13=s3,16,8 } // 4/0:s1>>16&0xff
601{ .mmi; ld1 te00=[te00] // 4/0:te0[s0>>24]
602 add te01=te01,te0 // 4/1:te0+s1>>24
603 shr.u te11=s1,sixteen };; // 4/2:s3>>16
604{ .mmi; ld1 te21=[te21] // 5/3:te0[s1>>8]
605 add te13=te13,te0 // 5/0:te0+s1>>16
606 extr.u te10=s0,16,8 } // 5/1:s2>>16&0xff
607{ .mmi; ld1 te01=[te01] // 5/1:te0[s1>>24]
608 add te02=te02,te0 // 5/2:te0+s2>>24
609 and te33=s3,maskff };; // 5/2:s1&0xff
610{ .mmi; ld1 te13=[te13] // 6/0:te0[s1>>16]
611 add te10=te10,te0 // 6/1:te0+s2>>16
612 extr.u te12=s2,16,8 } // 6/3:s0>>16&0xff
613{ .mmi; ld1 te02=[te02] // 6/2:te0[s2>>24]
614 add te03=te03,te0 // 6/3:te0+s0>>16
615 and te30=s0,maskff };; // 6/3:s2&0xff
616
617{ .mmi; ld1 te10=[te10] // 7/1:te0[s2>>16]
618 add te33=te33,te0 // 7/2:te0+s1&0xff
619 dep te31=te22,te31,8,8} // 7/0:
620{ .mmi; ld1 te03=[te03] // 7/3:te0[s3>>24]
621 add te30=te30,te0 // 7/3:te0+s2
622 and te11=te11,maskff};; // 7/2:s3>>16&0xff
623{ .mmi; ld1 te33=[te33] // 8/2:te0[s1]
624 add te11=te11,te0 // 8/2:te0+s3>>16
625 dep te32=te23,te32,8,8} // 8/1:
626{ .mmi; ld1 te30=[te30] // 8/3:te0[s2]
627 add te12=te12,te0 // 8/3:te0+s0>>16
628 shl te00=te00,twenty4};; // 8/0:
629{ .mii; ld1 te11=[te11] // 9/2:te0[s3>>16]
630 dep te31=te13,te31,16,8 // 9/0:
631 shl te01=te01,twenty4};; // 9/1:
632{ .mii; ld1 te12=[te12] // 10/3:te0[s0>>16]
633 dep te33=te20,te33,8,8 // 10/2:
634 shl te02=te02,twenty4};; // 10/2:
635{ .mii; xor t0=t0,te31 // 11/0:
636 dep te30=te21,te30,8,8 // 11/3:
637 shl te10=te10,sixteen};; // 11/1:
638{ .mii; xor r16=t0,te00 // 12/0:done!
639 dep te33=te11,te33,16,8 // 12/2:
640 shl te03=te03,twenty4};; // 12/3:
641{ .mmi; xor t1=t1,te01 // 13/1:
642 xor t2=t2,te02 // 13/2:
643 dep te30=te12,te30,16,8};; // 13/3:
644{ .mmi; xor t1=t1,te32 // 14/1:
645 xor r24=t2,te33 // 14/2:done!
646 xor t3=t3,te30 };; // 14/3:
647{ .mib; xor r20=t1,te10 // 15/1:done!
648 xor r28=t3,te03 // 15/3:done!
649 br.ret.sptk b6 };;
650.endp _ia64_AES_decrypt#
651
652// void AES_decrypt (const void *in,void *out,const AES_KEY *key);
653.global AES_decrypt#
654.proc AES_decrypt#
655.align 32
656AES_decrypt:
657 .prologue
658 .save ar.pfs,pfssave
659{ .mmi; alloc pfssave=ar.pfs,3,1,12,0
660 and out0=3,in0
661 mov r3=ip }
662{ .mmi; ADDP in0=0,in0
663 mov loc0=psr.um
664 ADDP out11=KSZ*60,in2 };; // &AES_KEY->rounds
665
666{ .mmi; ld4 out11=[out11] // AES_KEY->rounds
667 add out8=(AES_Td#-AES_decrypt#),r3 // Te0
668 .save pr,prsave
669 mov prsave=pr }
670{ .mmi; rum 1<<3 // clear um.ac
671 .save ar.lc,lcsave
672 mov lcsave=ar.lc };;
673
674 .body
675#if defined(_HPUX_SOURCE) // HPUX is big-endian, cut 15+15 cycles...
676{ .mib; cmp.ne p6,p0=out0,r0
677 add out0=4,in0
678(p6) br.dpnt.many .Ld_i_unaligned };;
679
680{ .mmi; ld4 out1=[in0],8 // s0
681 and out9=3,in1
682 mov twenty4=24 }
683{ .mmi; ld4 out3=[out0],8 // s1
684 ADDP rk0=0,in2
685 mov sixteen=16 };;
686{ .mmi; ld4 out5=[in0] // s2
687 cmp.ne p6,p0=out9,r0
688 mov maskff=0xff }
689{ .mmb; ld4 out7=[out0] // s3
690 ADDP rk1=KSZ,in2
691 br.call.sptk.many b6=_ia64_AES_decrypt };;
692
693{ .mib; ADDP in0=4,in1
694 ADDP in1=0,in1
695(p6) br.spnt .Ld_o_unaligned };;
696
697{ .mii; mov psr.um=loc0
698 mov ar.pfs=pfssave
699 mov ar.lc=lcsave };;
700{ .mmi; st4 [in1]=r16,8 // s0
701 st4 [in0]=r20,8 // s1
702 mov pr=prsave,0x1ffff };;
703{ .mmb; st4 [in1]=r24 // s2
704 st4 [in0]=r28 // s3
705 br.ret.sptk.many b0 };;
706#endif
707
708.align 32
709.Ld_i_unaligned:
710{ .mmi; add out0=1,in0
711 add out2=2,in0
712 add out4=3,in0 };;
713{ .mmi; ld1 r16=[in0],4
714 ld1 r17=[out0],4 }//;;
715{ .mmi; ld1 r18=[out2],4
716 ld1 out1=[out4],4 };; // s0
717{ .mmi; ld1 r20=[in0],4
718 ld1 r21=[out0],4 }//;;
719{ .mmi; ld1 r22=[out2],4
720 ld1 out3=[out4],4 };; // s1
721{ .mmi; ld1 r24=[in0],4
722 ld1 r25=[out0],4 }//;;
723{ .mmi; ld1 r26=[out2],4
724 ld1 out5=[out4],4 };; // s2
725{ .mmi; ld1 r28=[in0]
726 ld1 r29=[out0] }//;;
727{ .mmi; ld1 r30=[out2]
728 ld1 out7=[out4] };; // s3
729
730{ .mii;
731 dep out1=r16,out1,24,8 //;;
732 dep out3=r20,out3,24,8 }//;;
733{ .mii; ADDP rk0=0,in2
734 dep out5=r24,out5,24,8 //;;
735 dep out7=r28,out7,24,8 };;
736{ .mii; ADDP rk1=KSZ,in2
737 dep out1=r17,out1,16,8 //;;
738 dep out3=r21,out3,16,8 }//;;
739{ .mii; mov twenty4=24
740 dep out5=r25,out5,16,8 //;;
741 dep out7=r29,out7,16,8 };;
742{ .mii; mov sixteen=16
743 dep out1=r18,out1,8,8 //;;
744 dep out3=r22,out3,8,8 }//;;
745{ .mii; mov maskff=0xff
746 dep out5=r26,out5,8,8 //;;
747 dep out7=r30,out7,8,8 };;
748
749{ .mib; br.call.sptk.many b6=_ia64_AES_decrypt };;
750
751.Ld_o_unaligned:
752{ .mii; ADDP out0=0,in1
753 extr.u r17=r16,8,8 // s0
754 shr.u r19=r16,twenty4 }//;;
755{ .mii; ADDP out1=1,in1
756 extr.u r18=r16,16,8
757 shr.u r23=r20,twenty4 }//;; // s1
758{ .mii; ADDP out2=2,in1
759 extr.u r21=r20,8,8
760 shr.u r22=r20,sixteen }//;;
761{ .mii; ADDP out3=3,in1
762 extr.u r25=r24,8,8 // s2
763 shr.u r27=r24,twenty4 };;
764{ .mii; st1 [out3]=r16,4
765 extr.u r26=r24,16,8
766 shr.u r31=r28,twenty4 }//;; // s3
767{ .mii; st1 [out2]=r17,4
768 extr.u r29=r28,8,8
769 shr.u r30=r28,sixteen }//;;
770
771{ .mmi; st1 [out1]=r18,4
772 st1 [out0]=r19,4 };;
773{ .mmi; st1 [out3]=r20,4
774 st1 [out2]=r21,4 }//;;
775{ .mmi; st1 [out1]=r22,4
776 st1 [out0]=r23,4 };;
777{ .mmi; st1 [out3]=r24,4
778 st1 [out2]=r25,4
779 mov pr=prsave,0x1ffff }//;;
780{ .mmi; st1 [out1]=r26,4
781 st1 [out0]=r27,4
782 mov ar.pfs=pfssave };;
783{ .mmi; st1 [out3]=r28
784 st1 [out2]=r29
785 mov ar.lc=lcsave }//;;
786{ .mmi; st1 [out1]=r30
787 st1 [out0]=r31 }
788{ .mfb; mov psr.um=loc0 // restore user mask
789 br.ret.sptk.many b0 };;
790.endp AES_decrypt#
791
792// leave it in .text segment...
793.align 64
794.global AES_Te#
795.type AES_Te#,@object
796AES_Te: data4 0xc66363a5,0xc66363a5, 0xf87c7c84,0xf87c7c84
797 data4 0xee777799,0xee777799, 0xf67b7b8d,0xf67b7b8d
798 data4 0xfff2f20d,0xfff2f20d, 0xd66b6bbd,0xd66b6bbd
799 data4 0xde6f6fb1,0xde6f6fb1, 0x91c5c554,0x91c5c554
800 data4 0x60303050,0x60303050, 0x02010103,0x02010103
801 data4 0xce6767a9,0xce6767a9, 0x562b2b7d,0x562b2b7d
802 data4 0xe7fefe19,0xe7fefe19, 0xb5d7d762,0xb5d7d762
803 data4 0x4dababe6,0x4dababe6, 0xec76769a,0xec76769a
804 data4 0x8fcaca45,0x8fcaca45, 0x1f82829d,0x1f82829d
805 data4 0x89c9c940,0x89c9c940, 0xfa7d7d87,0xfa7d7d87
806 data4 0xeffafa15,0xeffafa15, 0xb25959eb,0xb25959eb
807 data4 0x8e4747c9,0x8e4747c9, 0xfbf0f00b,0xfbf0f00b
808 data4 0x41adadec,0x41adadec, 0xb3d4d467,0xb3d4d467
809 data4 0x5fa2a2fd,0x5fa2a2fd, 0x45afafea,0x45afafea
810 data4 0x239c9cbf,0x239c9cbf, 0x53a4a4f7,0x53a4a4f7
811 data4 0xe4727296,0xe4727296, 0x9bc0c05b,0x9bc0c05b
812 data4 0x75b7b7c2,0x75b7b7c2, 0xe1fdfd1c,0xe1fdfd1c
813 data4 0x3d9393ae,0x3d9393ae, 0x4c26266a,0x4c26266a
814 data4 0x6c36365a,0x6c36365a, 0x7e3f3f41,0x7e3f3f41
815 data4 0xf5f7f702,0xf5f7f702, 0x83cccc4f,0x83cccc4f
816 data4 0x6834345c,0x6834345c, 0x51a5a5f4,0x51a5a5f4
817 data4 0xd1e5e534,0xd1e5e534, 0xf9f1f108,0xf9f1f108
818 data4 0xe2717193,0xe2717193, 0xabd8d873,0xabd8d873
819 data4 0x62313153,0x62313153, 0x2a15153f,0x2a15153f
820 data4 0x0804040c,0x0804040c, 0x95c7c752,0x95c7c752
821 data4 0x46232365,0x46232365, 0x9dc3c35e,0x9dc3c35e
822 data4 0x30181828,0x30181828, 0x379696a1,0x379696a1
823 data4 0x0a05050f,0x0a05050f, 0x2f9a9ab5,0x2f9a9ab5
824 data4 0x0e070709,0x0e070709, 0x24121236,0x24121236
825 data4 0x1b80809b,0x1b80809b, 0xdfe2e23d,0xdfe2e23d
826 data4 0xcdebeb26,0xcdebeb26, 0x4e272769,0x4e272769
827 data4 0x7fb2b2cd,0x7fb2b2cd, 0xea75759f,0xea75759f
828 data4 0x1209091b,0x1209091b, 0x1d83839e,0x1d83839e
829 data4 0x582c2c74,0x582c2c74, 0x341a1a2e,0x341a1a2e
830 data4 0x361b1b2d,0x361b1b2d, 0xdc6e6eb2,0xdc6e6eb2
831 data4 0xb45a5aee,0xb45a5aee, 0x5ba0a0fb,0x5ba0a0fb
832 data4 0xa45252f6,0xa45252f6, 0x763b3b4d,0x763b3b4d
833 data4 0xb7d6d661,0xb7d6d661, 0x7db3b3ce,0x7db3b3ce
834 data4 0x5229297b,0x5229297b, 0xdde3e33e,0xdde3e33e
835 data4 0x5e2f2f71,0x5e2f2f71, 0x13848497,0x13848497
836 data4 0xa65353f5,0xa65353f5, 0xb9d1d168,0xb9d1d168
837 data4 0x00000000,0x00000000, 0xc1eded2c,0xc1eded2c
838 data4 0x40202060,0x40202060, 0xe3fcfc1f,0xe3fcfc1f
839 data4 0x79b1b1c8,0x79b1b1c8, 0xb65b5bed,0xb65b5bed
840 data4 0xd46a6abe,0xd46a6abe, 0x8dcbcb46,0x8dcbcb46
841 data4 0x67bebed9,0x67bebed9, 0x7239394b,0x7239394b
842 data4 0x944a4ade,0x944a4ade, 0x984c4cd4,0x984c4cd4
843 data4 0xb05858e8,0xb05858e8, 0x85cfcf4a,0x85cfcf4a
844 data4 0xbbd0d06b,0xbbd0d06b, 0xc5efef2a,0xc5efef2a
845 data4 0x4faaaae5,0x4faaaae5, 0xedfbfb16,0xedfbfb16
846 data4 0x864343c5,0x864343c5, 0x9a4d4dd7,0x9a4d4dd7
847 data4 0x66333355,0x66333355, 0x11858594,0x11858594
848 data4 0x8a4545cf,0x8a4545cf, 0xe9f9f910,0xe9f9f910
849 data4 0x04020206,0x04020206, 0xfe7f7f81,0xfe7f7f81
850 data4 0xa05050f0,0xa05050f0, 0x783c3c44,0x783c3c44
851 data4 0x259f9fba,0x259f9fba, 0x4ba8a8e3,0x4ba8a8e3
852 data4 0xa25151f3,0xa25151f3, 0x5da3a3fe,0x5da3a3fe
853 data4 0x804040c0,0x804040c0, 0x058f8f8a,0x058f8f8a
854 data4 0x3f9292ad,0x3f9292ad, 0x219d9dbc,0x219d9dbc
855 data4 0x70383848,0x70383848, 0xf1f5f504,0xf1f5f504
856 data4 0x63bcbcdf,0x63bcbcdf, 0x77b6b6c1,0x77b6b6c1
857 data4 0xafdada75,0xafdada75, 0x42212163,0x42212163
858 data4 0x20101030,0x20101030, 0xe5ffff1a,0xe5ffff1a
859 data4 0xfdf3f30e,0xfdf3f30e, 0xbfd2d26d,0xbfd2d26d
860 data4 0x81cdcd4c,0x81cdcd4c, 0x180c0c14,0x180c0c14
861 data4 0x26131335,0x26131335, 0xc3ecec2f,0xc3ecec2f
862 data4 0xbe5f5fe1,0xbe5f5fe1, 0x359797a2,0x359797a2
863 data4 0x884444cc,0x884444cc, 0x2e171739,0x2e171739
864 data4 0x93c4c457,0x93c4c457, 0x55a7a7f2,0x55a7a7f2
865 data4 0xfc7e7e82,0xfc7e7e82, 0x7a3d3d47,0x7a3d3d47
866 data4 0xc86464ac,0xc86464ac, 0xba5d5de7,0xba5d5de7
867 data4 0x3219192b,0x3219192b, 0xe6737395,0xe6737395
868 data4 0xc06060a0,0xc06060a0, 0x19818198,0x19818198
869 data4 0x9e4f4fd1,0x9e4f4fd1, 0xa3dcdc7f,0xa3dcdc7f
870 data4 0x44222266,0x44222266, 0x542a2a7e,0x542a2a7e
871 data4 0x3b9090ab,0x3b9090ab, 0x0b888883,0x0b888883
872 data4 0x8c4646ca,0x8c4646ca, 0xc7eeee29,0xc7eeee29
873 data4 0x6bb8b8d3,0x6bb8b8d3, 0x2814143c,0x2814143c
874 data4 0xa7dede79,0xa7dede79, 0xbc5e5ee2,0xbc5e5ee2
875 data4 0x160b0b1d,0x160b0b1d, 0xaddbdb76,0xaddbdb76
876 data4 0xdbe0e03b,0xdbe0e03b, 0x64323256,0x64323256
877 data4 0x743a3a4e,0x743a3a4e, 0x140a0a1e,0x140a0a1e
878 data4 0x924949db,0x924949db, 0x0c06060a,0x0c06060a
879 data4 0x4824246c,0x4824246c, 0xb85c5ce4,0xb85c5ce4
880 data4 0x9fc2c25d,0x9fc2c25d, 0xbdd3d36e,0xbdd3d36e
881 data4 0x43acacef,0x43acacef, 0xc46262a6,0xc46262a6
882 data4 0x399191a8,0x399191a8, 0x319595a4,0x319595a4
883 data4 0xd3e4e437,0xd3e4e437, 0xf279798b,0xf279798b
884 data4 0xd5e7e732,0xd5e7e732, 0x8bc8c843,0x8bc8c843
885 data4 0x6e373759,0x6e373759, 0xda6d6db7,0xda6d6db7
886 data4 0x018d8d8c,0x018d8d8c, 0xb1d5d564,0xb1d5d564
887 data4 0x9c4e4ed2,0x9c4e4ed2, 0x49a9a9e0,0x49a9a9e0
888 data4 0xd86c6cb4,0xd86c6cb4, 0xac5656fa,0xac5656fa
889 data4 0xf3f4f407,0xf3f4f407, 0xcfeaea25,0xcfeaea25
890 data4 0xca6565af,0xca6565af, 0xf47a7a8e,0xf47a7a8e
891 data4 0x47aeaee9,0x47aeaee9, 0x10080818,0x10080818
892 data4 0x6fbabad5,0x6fbabad5, 0xf0787888,0xf0787888
893 data4 0x4a25256f,0x4a25256f, 0x5c2e2e72,0x5c2e2e72
894 data4 0x381c1c24,0x381c1c24, 0x57a6a6f1,0x57a6a6f1
895 data4 0x73b4b4c7,0x73b4b4c7, 0x97c6c651,0x97c6c651
896 data4 0xcbe8e823,0xcbe8e823, 0xa1dddd7c,0xa1dddd7c
897 data4 0xe874749c,0xe874749c, 0x3e1f1f21,0x3e1f1f21
898 data4 0x964b4bdd,0x964b4bdd, 0x61bdbddc,0x61bdbddc
899 data4 0x0d8b8b86,0x0d8b8b86, 0x0f8a8a85,0x0f8a8a85
900 data4 0xe0707090,0xe0707090, 0x7c3e3e42,0x7c3e3e42
901 data4 0x71b5b5c4,0x71b5b5c4, 0xcc6666aa,0xcc6666aa
902 data4 0x904848d8,0x904848d8, 0x06030305,0x06030305
903 data4 0xf7f6f601,0xf7f6f601, 0x1c0e0e12,0x1c0e0e12
904 data4 0xc26161a3,0xc26161a3, 0x6a35355f,0x6a35355f
905 data4 0xae5757f9,0xae5757f9, 0x69b9b9d0,0x69b9b9d0
906 data4 0x17868691,0x17868691, 0x99c1c158,0x99c1c158
907 data4 0x3a1d1d27,0x3a1d1d27, 0x279e9eb9,0x279e9eb9
908 data4 0xd9e1e138,0xd9e1e138, 0xebf8f813,0xebf8f813
909 data4 0x2b9898b3,0x2b9898b3, 0x22111133,0x22111133
910 data4 0xd26969bb,0xd26969bb, 0xa9d9d970,0xa9d9d970
911 data4 0x078e8e89,0x078e8e89, 0x339494a7,0x339494a7
912 data4 0x2d9b9bb6,0x2d9b9bb6, 0x3c1e1e22,0x3c1e1e22
913 data4 0x15878792,0x15878792, 0xc9e9e920,0xc9e9e920
914 data4 0x87cece49,0x87cece49, 0xaa5555ff,0xaa5555ff
915 data4 0x50282878,0x50282878, 0xa5dfdf7a,0xa5dfdf7a
916 data4 0x038c8c8f,0x038c8c8f, 0x59a1a1f8,0x59a1a1f8
917 data4 0x09898980,0x09898980, 0x1a0d0d17,0x1a0d0d17
918 data4 0x65bfbfda,0x65bfbfda, 0xd7e6e631,0xd7e6e631
919 data4 0x844242c6,0x844242c6, 0xd06868b8,0xd06868b8
920 data4 0x824141c3,0x824141c3, 0x299999b0,0x299999b0
921 data4 0x5a2d2d77,0x5a2d2d77, 0x1e0f0f11,0x1e0f0f11
922 data4 0x7bb0b0cb,0x7bb0b0cb, 0xa85454fc,0xa85454fc
923 data4 0x6dbbbbd6,0x6dbbbbd6, 0x2c16163a,0x2c16163a
924// Te4:
925 data1 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
926 data1 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
927 data1 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
928 data1 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
929 data1 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
930 data1 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
931 data1 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
932 data1 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
933 data1 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
934 data1 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
935 data1 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
936 data1 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
937 data1 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
938 data1 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
939 data1 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
940 data1 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
941 data1 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
942 data1 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
943 data1 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
944 data1 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
945 data1 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
946 data1 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
947 data1 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
948 data1 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
949 data1 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
950 data1 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
951 data1 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
952 data1 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
953 data1 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
954 data1 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
955 data1 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
956 data1 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
957.size AES_Te#,2048+256 // HP-UX assembler fails to ".-AES_Te#"
958
959.align 64
960.global AES_Td#
961.type AES_Td#,@object
962AES_Td: data4 0x51f4a750,0x51f4a750, 0x7e416553,0x7e416553
963 data4 0x1a17a4c3,0x1a17a4c3, 0x3a275e96,0x3a275e96
964 data4 0x3bab6bcb,0x3bab6bcb, 0x1f9d45f1,0x1f9d45f1
965 data4 0xacfa58ab,0xacfa58ab, 0x4be30393,0x4be30393
966 data4 0x2030fa55,0x2030fa55, 0xad766df6,0xad766df6
967 data4 0x88cc7691,0x88cc7691, 0xf5024c25,0xf5024c25
968 data4 0x4fe5d7fc,0x4fe5d7fc, 0xc52acbd7,0xc52acbd7
969 data4 0x26354480,0x26354480, 0xb562a38f,0xb562a38f
970 data4 0xdeb15a49,0xdeb15a49, 0x25ba1b67,0x25ba1b67
971 data4 0x45ea0e98,0x45ea0e98, 0x5dfec0e1,0x5dfec0e1
972 data4 0xc32f7502,0xc32f7502, 0x814cf012,0x814cf012
973 data4 0x8d4697a3,0x8d4697a3, 0x6bd3f9c6,0x6bd3f9c6
974 data4 0x038f5fe7,0x038f5fe7, 0x15929c95,0x15929c95
975 data4 0xbf6d7aeb,0xbf6d7aeb, 0x955259da,0x955259da
976 data4 0xd4be832d,0xd4be832d, 0x587421d3,0x587421d3
977 data4 0x49e06929,0x49e06929, 0x8ec9c844,0x8ec9c844
978 data4 0x75c2896a,0x75c2896a, 0xf48e7978,0xf48e7978
979 data4 0x99583e6b,0x99583e6b, 0x27b971dd,0x27b971dd
980 data4 0xbee14fb6,0xbee14fb6, 0xf088ad17,0xf088ad17
981 data4 0xc920ac66,0xc920ac66, 0x7dce3ab4,0x7dce3ab4
982 data4 0x63df4a18,0x63df4a18, 0xe51a3182,0xe51a3182
983 data4 0x97513360,0x97513360, 0x62537f45,0x62537f45
984 data4 0xb16477e0,0xb16477e0, 0xbb6bae84,0xbb6bae84
985 data4 0xfe81a01c,0xfe81a01c, 0xf9082b94,0xf9082b94
986 data4 0x70486858,0x70486858, 0x8f45fd19,0x8f45fd19
987 data4 0x94de6c87,0x94de6c87, 0x527bf8b7,0x527bf8b7
988 data4 0xab73d323,0xab73d323, 0x724b02e2,0x724b02e2
989 data4 0xe31f8f57,0xe31f8f57, 0x6655ab2a,0x6655ab2a
990 data4 0xb2eb2807,0xb2eb2807, 0x2fb5c203,0x2fb5c203
991 data4 0x86c57b9a,0x86c57b9a, 0xd33708a5,0xd33708a5
992 data4 0x302887f2,0x302887f2, 0x23bfa5b2,0x23bfa5b2
993 data4 0x02036aba,0x02036aba, 0xed16825c,0xed16825c
994 data4 0x8acf1c2b,0x8acf1c2b, 0xa779b492,0xa779b492
995 data4 0xf307f2f0,0xf307f2f0, 0x4e69e2a1,0x4e69e2a1
996 data4 0x65daf4cd,0x65daf4cd, 0x0605bed5,0x0605bed5
997 data4 0xd134621f,0xd134621f, 0xc4a6fe8a,0xc4a6fe8a
998 data4 0x342e539d,0x342e539d, 0xa2f355a0,0xa2f355a0
999 data4 0x058ae132,0x058ae132, 0xa4f6eb75,0xa4f6eb75
1000 data4 0x0b83ec39,0x0b83ec39, 0x4060efaa,0x4060efaa
1001 data4 0x5e719f06,0x5e719f06, 0xbd6e1051,0xbd6e1051
1002 data4 0x3e218af9,0x3e218af9, 0x96dd063d,0x96dd063d
1003 data4 0xdd3e05ae,0xdd3e05ae, 0x4de6bd46,0x4de6bd46
1004 data4 0x91548db5,0x91548db5, 0x71c45d05,0x71c45d05
1005 data4 0x0406d46f,0x0406d46f, 0x605015ff,0x605015ff
1006 data4 0x1998fb24,0x1998fb24, 0xd6bde997,0xd6bde997
1007 data4 0x894043cc,0x894043cc, 0x67d99e77,0x67d99e77
1008 data4 0xb0e842bd,0xb0e842bd, 0x07898b88,0x07898b88
1009 data4 0xe7195b38,0xe7195b38, 0x79c8eedb,0x79c8eedb
1010 data4 0xa17c0a47,0xa17c0a47, 0x7c420fe9,0x7c420fe9
1011 data4 0xf8841ec9,0xf8841ec9, 0x00000000,0x00000000
1012 data4 0x09808683,0x09808683, 0x322bed48,0x322bed48
1013 data4 0x1e1170ac,0x1e1170ac, 0x6c5a724e,0x6c5a724e
1014 data4 0xfd0efffb,0xfd0efffb, 0x0f853856,0x0f853856
1015 data4 0x3daed51e,0x3daed51e, 0x362d3927,0x362d3927
1016 data4 0x0a0fd964,0x0a0fd964, 0x685ca621,0x685ca621
1017 data4 0x9b5b54d1,0x9b5b54d1, 0x24362e3a,0x24362e3a
1018 data4 0x0c0a67b1,0x0c0a67b1, 0x9357e70f,0x9357e70f
1019 data4 0xb4ee96d2,0xb4ee96d2, 0x1b9b919e,0x1b9b919e
1020 data4 0x80c0c54f,0x80c0c54f, 0x61dc20a2,0x61dc20a2
1021 data4 0x5a774b69,0x5a774b69, 0x1c121a16,0x1c121a16
1022 data4 0xe293ba0a,0xe293ba0a, 0xc0a02ae5,0xc0a02ae5
1023 data4 0x3c22e043,0x3c22e043, 0x121b171d,0x121b171d
1024 data4 0x0e090d0b,0x0e090d0b, 0xf28bc7ad,0xf28bc7ad
1025 data4 0x2db6a8b9,0x2db6a8b9, 0x141ea9c8,0x141ea9c8
1026 data4 0x57f11985,0x57f11985, 0xaf75074c,0xaf75074c
1027 data4 0xee99ddbb,0xee99ddbb, 0xa37f60fd,0xa37f60fd
1028 data4 0xf701269f,0xf701269f, 0x5c72f5bc,0x5c72f5bc
1029 data4 0x44663bc5,0x44663bc5, 0x5bfb7e34,0x5bfb7e34
1030 data4 0x8b432976,0x8b432976, 0xcb23c6dc,0xcb23c6dc
1031 data4 0xb6edfc68,0xb6edfc68, 0xb8e4f163,0xb8e4f163
1032 data4 0xd731dcca,0xd731dcca, 0x42638510,0x42638510
1033 data4 0x13972240,0x13972240, 0x84c61120,0x84c61120
1034 data4 0x854a247d,0x854a247d, 0xd2bb3df8,0xd2bb3df8
1035 data4 0xaef93211,0xaef93211, 0xc729a16d,0xc729a16d
1036 data4 0x1d9e2f4b,0x1d9e2f4b, 0xdcb230f3,0xdcb230f3
1037 data4 0x0d8652ec,0x0d8652ec, 0x77c1e3d0,0x77c1e3d0
1038 data4 0x2bb3166c,0x2bb3166c, 0xa970b999,0xa970b999
1039 data4 0x119448fa,0x119448fa, 0x47e96422,0x47e96422
1040 data4 0xa8fc8cc4,0xa8fc8cc4, 0xa0f03f1a,0xa0f03f1a
1041 data4 0x567d2cd8,0x567d2cd8, 0x223390ef,0x223390ef
1042 data4 0x87494ec7,0x87494ec7, 0xd938d1c1,0xd938d1c1
1043 data4 0x8ccaa2fe,0x8ccaa2fe, 0x98d40b36,0x98d40b36
1044 data4 0xa6f581cf,0xa6f581cf, 0xa57ade28,0xa57ade28
1045 data4 0xdab78e26,0xdab78e26, 0x3fadbfa4,0x3fadbfa4
1046 data4 0x2c3a9de4,0x2c3a9de4, 0x5078920d,0x5078920d
1047 data4 0x6a5fcc9b,0x6a5fcc9b, 0x547e4662,0x547e4662
1048 data4 0xf68d13c2,0xf68d13c2, 0x90d8b8e8,0x90d8b8e8
1049 data4 0x2e39f75e,0x2e39f75e, 0x82c3aff5,0x82c3aff5
1050 data4 0x9f5d80be,0x9f5d80be, 0x69d0937c,0x69d0937c
1051 data4 0x6fd52da9,0x6fd52da9, 0xcf2512b3,0xcf2512b3
1052 data4 0xc8ac993b,0xc8ac993b, 0x10187da7,0x10187da7
1053 data4 0xe89c636e,0xe89c636e, 0xdb3bbb7b,0xdb3bbb7b
1054 data4 0xcd267809,0xcd267809, 0x6e5918f4,0x6e5918f4
1055 data4 0xec9ab701,0xec9ab701, 0x834f9aa8,0x834f9aa8
1056 data4 0xe6956e65,0xe6956e65, 0xaaffe67e,0xaaffe67e
1057 data4 0x21bccf08,0x21bccf08, 0xef15e8e6,0xef15e8e6
1058 data4 0xbae79bd9,0xbae79bd9, 0x4a6f36ce,0x4a6f36ce
1059 data4 0xea9f09d4,0xea9f09d4, 0x29b07cd6,0x29b07cd6
1060 data4 0x31a4b2af,0x31a4b2af, 0x2a3f2331,0x2a3f2331
1061 data4 0xc6a59430,0xc6a59430, 0x35a266c0,0x35a266c0
1062 data4 0x744ebc37,0x744ebc37, 0xfc82caa6,0xfc82caa6
1063 data4 0xe090d0b0,0xe090d0b0, 0x33a7d815,0x33a7d815
1064 data4 0xf104984a,0xf104984a, 0x41ecdaf7,0x41ecdaf7
1065 data4 0x7fcd500e,0x7fcd500e, 0x1791f62f,0x1791f62f
1066 data4 0x764dd68d,0x764dd68d, 0x43efb04d,0x43efb04d
1067 data4 0xccaa4d54,0xccaa4d54, 0xe49604df,0xe49604df
1068 data4 0x9ed1b5e3,0x9ed1b5e3, 0x4c6a881b,0x4c6a881b
1069 data4 0xc12c1fb8,0xc12c1fb8, 0x4665517f,0x4665517f
1070 data4 0x9d5eea04,0x9d5eea04, 0x018c355d,0x018c355d
1071 data4 0xfa877473,0xfa877473, 0xfb0b412e,0xfb0b412e
1072 data4 0xb3671d5a,0xb3671d5a, 0x92dbd252,0x92dbd252
1073 data4 0xe9105633,0xe9105633, 0x6dd64713,0x6dd64713
1074 data4 0x9ad7618c,0x9ad7618c, 0x37a10c7a,0x37a10c7a
1075 data4 0x59f8148e,0x59f8148e, 0xeb133c89,0xeb133c89
1076 data4 0xcea927ee,0xcea927ee, 0xb761c935,0xb761c935
1077 data4 0xe11ce5ed,0xe11ce5ed, 0x7a47b13c,0x7a47b13c
1078 data4 0x9cd2df59,0x9cd2df59, 0x55f2733f,0x55f2733f
1079 data4 0x1814ce79,0x1814ce79, 0x73c737bf,0x73c737bf
1080 data4 0x53f7cdea,0x53f7cdea, 0x5ffdaa5b,0x5ffdaa5b
1081 data4 0xdf3d6f14,0xdf3d6f14, 0x7844db86,0x7844db86
1082 data4 0xcaaff381,0xcaaff381, 0xb968c43e,0xb968c43e
1083 data4 0x3824342c,0x3824342c, 0xc2a3405f,0xc2a3405f
1084 data4 0x161dc372,0x161dc372, 0xbce2250c,0xbce2250c
1085 data4 0x283c498b,0x283c498b, 0xff0d9541,0xff0d9541
1086 data4 0x39a80171,0x39a80171, 0x080cb3de,0x080cb3de
1087 data4 0xd8b4e49c,0xd8b4e49c, 0x6456c190,0x6456c190
1088 data4 0x7bcb8461,0x7bcb8461, 0xd532b670,0xd532b670
1089 data4 0x486c5c74,0x486c5c74, 0xd0b85742,0xd0b85742
1090// Td4:
1091 data1 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
1092 data1 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
1093 data1 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
1094 data1 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
1095 data1 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
1096 data1 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
1097 data1 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
1098 data1 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
1099 data1 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
1100 data1 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
1101 data1 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
1102 data1 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
1103 data1 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
1104 data1 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
1105 data1 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
1106 data1 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
1107 data1 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
1108 data1 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
1109 data1 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
1110 data1 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
1111 data1 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
1112 data1 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
1113 data1 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
1114 data1 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
1115 data1 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
1116 data1 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
1117 data1 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1118 data1 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1119 data1 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1120 data1 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1121 data1 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1122 data1 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1123.size AES_Td#,2048+256 // HP-UX assembler fails to ".-AES_Td#"
diff --git a/src/lib/libcrypto/aes/asm/aes-mips.pl b/src/lib/libcrypto/aes/asm/aes-mips.pl
deleted file mode 100644
index 2ce6deffc8..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-mips.pl
+++ /dev/null
@@ -1,1611 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for MIPS
11
12# October 2010
13#
14# Code uses 1K[+256B] S-box and on single-issue core [such as R5000]
15# spends ~68 cycles per byte processed with 128-bit key. This is ~16%
16# faster than gcc-generated code, which is not very impressive. But
17# recall that compressed S-box requires extra processing, namely
18# additional rotations. Rotations are implemented with lwl/lwr pairs,
19# which is normally used for loading unaligned data. Another cool
20# thing about this module is its endian neutrality, which means that
21# it processes data without ever changing byte order...
22
23######################################################################
24# There is a number of MIPS ABI in use, O32 and N32/64 are most
25# widely used. Then there is a new contender: NUBI. It appears that if
26# one picks the latter, it's possible to arrange code in ABI neutral
27# manner. Therefore let's stick to NUBI register layout:
28#
29($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
30($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
31($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
32($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
33#
34# The return value is placed in $a0. Following coding rules facilitate
35# interoperability:
36#
37# - never ever touch $tp, "thread pointer", former $gp;
38# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
39# old code];
40# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
41#
42# For reference here is register layout for N32/64 MIPS ABIs:
43#
44# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
45# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
46# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
47# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
48# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
49#
50$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
51
52if ($flavour =~ /64|n32/i) {
53 $PTR_ADD="dadd"; # incidentally works even on n32
54 $PTR_SUB="dsub"; # incidentally works even on n32
55 $REG_S="sd";
56 $REG_L="ld";
57 $PTR_SLL="dsll"; # incidentally works even on n32
58 $SZREG=8;
59} else {
60 $PTR_ADD="add";
61 $PTR_SUB="sub";
62 $REG_S="sw";
63 $REG_L="lw";
64 $PTR_SLL="sll";
65 $SZREG=4;
66}
67$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
68#
69# <appro@openssl.org>
70#
71######################################################################
72
73$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
74
75for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
76open STDOUT,">$output";
77
78if (!defined($big_endian))
79{ $big_endian=(unpack('L',pack('N',1))==1); }
80
81while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
82open STDOUT,">$output";
83
84my ($MSB,$LSB)=(0,3); # automatically converted to little-endian
85
86$code.=<<___;
87.text
88#ifdef OPENSSL_FIPSCANISTER
89# include <openssl/fipssyms.h>
90#endif
91
92#if !defined(__vxworks) || defined(__pic__)
93.option pic2
94#endif
95.set noat
96___
97
98{{{
99my $FRAMESIZE=16*$SZREG;
100my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
101
102my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
103my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
104my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23));
105my ($key0,$cnt)=($gp,$fp);
106
107# instuction ordering is "stolen" from output from MIPSpro assembler
108# invoked with -mips3 -O3 arguments...
109$code.=<<___;
110.align 5
111.ent _mips_AES_encrypt
112_mips_AES_encrypt:
113 .frame $sp,0,$ra
114 .set reorder
115 lw $t0,0($key)
116 lw $t1,4($key)
117 lw $t2,8($key)
118 lw $t3,12($key)
119 lw $cnt,240($key)
120 $PTR_ADD $key0,$key,16
121
122 xor $s0,$t0
123 xor $s1,$t1
124 xor $s2,$t2
125 xor $s3,$t3
126
127 sub $cnt,1
128 _xtr $i0,$s1,16-2
129.Loop_enc:
130 _xtr $i1,$s2,16-2
131 _xtr $i2,$s3,16-2
132 _xtr $i3,$s0,16-2
133 and $i0,0x3fc
134 and $i1,0x3fc
135 and $i2,0x3fc
136 and $i3,0x3fc
137 $PTR_ADD $i0,$Tbl
138 $PTR_ADD $i1,$Tbl
139 $PTR_ADD $i2,$Tbl
140 $PTR_ADD $i3,$Tbl
141 lwl $t0,3($i0) # Te1[s1>>16]
142 lwl $t1,3($i1) # Te1[s2>>16]
143 lwl $t2,3($i2) # Te1[s3>>16]
144 lwl $t3,3($i3) # Te1[s0>>16]
145 lwr $t0,2($i0) # Te1[s1>>16]
146 lwr $t1,2($i1) # Te1[s2>>16]
147 lwr $t2,2($i2) # Te1[s3>>16]
148 lwr $t3,2($i3) # Te1[s0>>16]
149
150 _xtr $i0,$s2,8-2
151 _xtr $i1,$s3,8-2
152 _xtr $i2,$s0,8-2
153 _xtr $i3,$s1,8-2
154 and $i0,0x3fc
155 and $i1,0x3fc
156 and $i2,0x3fc
157 and $i3,0x3fc
158 $PTR_ADD $i0,$Tbl
159 $PTR_ADD $i1,$Tbl
160 $PTR_ADD $i2,$Tbl
161 $PTR_ADD $i3,$Tbl
162 lwl $t4,2($i0) # Te2[s2>>8]
163 lwl $t5,2($i1) # Te2[s3>>8]
164 lwl $t6,2($i2) # Te2[s0>>8]
165 lwl $t7,2($i3) # Te2[s1>>8]
166 lwr $t4,1($i0) # Te2[s2>>8]
167 lwr $t5,1($i1) # Te2[s3>>8]
168 lwr $t6,1($i2) # Te2[s0>>8]
169 lwr $t7,1($i3) # Te2[s1>>8]
170
171 _xtr $i0,$s3,0-2
172 _xtr $i1,$s0,0-2
173 _xtr $i2,$s1,0-2
174 _xtr $i3,$s2,0-2
175 and $i0,0x3fc
176 and $i1,0x3fc
177 and $i2,0x3fc
178 and $i3,0x3fc
179 $PTR_ADD $i0,$Tbl
180 $PTR_ADD $i1,$Tbl
181 $PTR_ADD $i2,$Tbl
182 $PTR_ADD $i3,$Tbl
183 lwl $t8,1($i0) # Te3[s3]
184 lwl $t9,1($i1) # Te3[s0]
185 lwl $t10,1($i2) # Te3[s1]
186 lwl $t11,1($i3) # Te3[s2]
187 lwr $t8,0($i0) # Te3[s3]
188 lwr $t9,0($i1) # Te3[s0]
189 lwr $t10,0($i2) # Te3[s1]
190 lwr $t11,0($i3) # Te3[s2]
191
192 _xtr $i0,$s0,24-2
193 _xtr $i1,$s1,24-2
194 _xtr $i2,$s2,24-2
195 _xtr $i3,$s3,24-2
196 and $i0,0x3fc
197 and $i1,0x3fc
198 and $i2,0x3fc
199 and $i3,0x3fc
200 $PTR_ADD $i0,$Tbl
201 $PTR_ADD $i1,$Tbl
202 $PTR_ADD $i2,$Tbl
203 $PTR_ADD $i3,$Tbl
204 xor $t0,$t4
205 xor $t1,$t5
206 xor $t2,$t6
207 xor $t3,$t7
208 lw $t4,0($i0) # Te0[s0>>24]
209 lw $t5,0($i1) # Te0[s1>>24]
210 lw $t6,0($i2) # Te0[s2>>24]
211 lw $t7,0($i3) # Te0[s3>>24]
212
213 lw $s0,0($key0)
214 lw $s1,4($key0)
215 lw $s2,8($key0)
216 lw $s3,12($key0)
217
218 xor $t0,$t8
219 xor $t1,$t9
220 xor $t2,$t10
221 xor $t3,$t11
222
223 xor $t0,$t4
224 xor $t1,$t5
225 xor $t2,$t6
226 xor $t3,$t7
227
228 sub $cnt,1
229 $PTR_ADD $key0,16
230 xor $s0,$t0
231 xor $s1,$t1
232 xor $s2,$t2
233 xor $s3,$t3
234 .set noreorder
235 bnez $cnt,.Loop_enc
236 _xtr $i0,$s1,16-2
237
238 .set reorder
239 _xtr $i1,$s2,16-2
240 _xtr $i2,$s3,16-2
241 _xtr $i3,$s0,16-2
242 and $i0,0x3fc
243 and $i1,0x3fc
244 and $i2,0x3fc
245 and $i3,0x3fc
246 $PTR_ADD $i0,$Tbl
247 $PTR_ADD $i1,$Tbl
248 $PTR_ADD $i2,$Tbl
249 $PTR_ADD $i3,$Tbl
250 lbu $t0,2($i0) # Te4[s1>>16]
251 lbu $t1,2($i1) # Te4[s2>>16]
252 lbu $t2,2($i2) # Te4[s3>>16]
253 lbu $t3,2($i3) # Te4[s0>>16]
254
255 _xtr $i0,$s2,8-2
256 _xtr $i1,$s3,8-2
257 _xtr $i2,$s0,8-2
258 _xtr $i3,$s1,8-2
259 and $i0,0x3fc
260 and $i1,0x3fc
261 and $i2,0x3fc
262 and $i3,0x3fc
263 $PTR_ADD $i0,$Tbl
264 $PTR_ADD $i1,$Tbl
265 $PTR_ADD $i2,$Tbl
266 $PTR_ADD $i3,$Tbl
267 lbu $t4,2($i0) # Te4[s2>>8]
268 lbu $t5,2($i1) # Te4[s3>>8]
269 lbu $t6,2($i2) # Te4[s0>>8]
270 lbu $t7,2($i3) # Te4[s1>>8]
271
272 _xtr $i0,$s0,24-2
273 _xtr $i1,$s1,24-2
274 _xtr $i2,$s2,24-2
275 _xtr $i3,$s3,24-2
276 and $i0,0x3fc
277 and $i1,0x3fc
278 and $i2,0x3fc
279 and $i3,0x3fc
280 $PTR_ADD $i0,$Tbl
281 $PTR_ADD $i1,$Tbl
282 $PTR_ADD $i2,$Tbl
283 $PTR_ADD $i3,$Tbl
284 lbu $t8,2($i0) # Te4[s0>>24]
285 lbu $t9,2($i1) # Te4[s1>>24]
286 lbu $t10,2($i2) # Te4[s2>>24]
287 lbu $t11,2($i3) # Te4[s3>>24]
288
289 _xtr $i0,$s3,0-2
290 _xtr $i1,$s0,0-2
291 _xtr $i2,$s1,0-2
292 _xtr $i3,$s2,0-2
293 and $i0,0x3fc
294 and $i1,0x3fc
295 and $i2,0x3fc
296 and $i3,0x3fc
297
298 _ins $t0,16
299 _ins $t1,16
300 _ins $t2,16
301 _ins $t3,16
302
303 _ins $t4,8
304 _ins $t5,8
305 _ins $t6,8
306 _ins $t7,8
307
308 xor $t0,$t4
309 xor $t1,$t5
310 xor $t2,$t6
311 xor $t3,$t7
312
313 $PTR_ADD $i0,$Tbl
314 $PTR_ADD $i1,$Tbl
315 $PTR_ADD $i2,$Tbl
316 $PTR_ADD $i3,$Tbl
317 lbu $t4,2($i0) # Te4[s3]
318 lbu $t5,2($i1) # Te4[s0]
319 lbu $t6,2($i2) # Te4[s1]
320 lbu $t7,2($i3) # Te4[s2]
321
322 _ins $t8,24
323 _ins $t9,24
324 _ins $t10,24
325 _ins $t11,24
326
327 lw $s0,0($key0)
328 lw $s1,4($key0)
329 lw $s2,8($key0)
330 lw $s3,12($key0)
331
332 xor $t0,$t8
333 xor $t1,$t9
334 xor $t2,$t10
335 xor $t3,$t11
336
337 _ins $t4,0
338 _ins $t5,0
339 _ins $t6,0
340 _ins $t7,0
341
342 xor $t0,$t4
343 xor $t1,$t5
344 xor $t2,$t6
345 xor $t3,$t7
346
347 xor $s0,$t0
348 xor $s1,$t1
349 xor $s2,$t2
350 xor $s3,$t3
351
352 jr $ra
353.end _mips_AES_encrypt
354
355.align 5
356.globl AES_encrypt
357.ent AES_encrypt
358AES_encrypt:
359 .frame $sp,$FRAMESIZE,$ra
360 .mask $SAVED_REGS_MASK,-$SZREG
361 .set noreorder
362___
363$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
364 .cpload $pf
365___
366$code.=<<___;
367 $PTR_SUB $sp,$FRAMESIZE
368 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
369 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
370 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
371 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
372 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
373 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
374 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
375 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
376 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
377 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
378___
379$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
380 $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
381 $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
382 $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
383 $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
384 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
385___
386$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
387 .cplocal $Tbl
388 .cpsetup $pf,$zero,AES_encrypt
389___
390$code.=<<___;
391 .set reorder
392 la $Tbl,AES_Te # PIC-ified 'load address'
393
394 lwl $s0,0+$MSB($inp)
395 lwl $s1,4+$MSB($inp)
396 lwl $s2,8+$MSB($inp)
397 lwl $s3,12+$MSB($inp)
398 lwr $s0,0+$LSB($inp)
399 lwr $s1,4+$LSB($inp)
400 lwr $s2,8+$LSB($inp)
401 lwr $s3,12+$LSB($inp)
402
403 bal _mips_AES_encrypt
404
405 swr $s0,0+$LSB($out)
406 swr $s1,4+$LSB($out)
407 swr $s2,8+$LSB($out)
408 swr $s3,12+$LSB($out)
409 swl $s0,0+$MSB($out)
410 swl $s1,4+$MSB($out)
411 swl $s2,8+$MSB($out)
412 swl $s3,12+$MSB($out)
413
414 .set noreorder
415 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
416 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
417 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
418 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
419 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
420 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
421 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
422 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
423 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
424 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
425___
426$code.=<<___ if ($flavour =~ /nubi/i);
427 $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
428 $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
429 $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
430 $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
431 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
432___
433$code.=<<___;
434 jr $ra
435 $PTR_ADD $sp,$FRAMESIZE
436.end AES_encrypt
437___
438
# _mips_AES_decrypt: decrypt one 16-byte block held in $s0..$s3 in place,
# using the expanded key at $key (round count at offset 240) and the Td
# tables at $Tbl.  The main loop does four table lookups per state word via
# unaligned lwl/lwr pairs; the final round uses the byte-wide Td4 table at
# $Tbl+1024.  The _xtr/_ins pseudo-instructions and the lwl/lwr offsets are
# rewritten for target endianness by the foreach post-processing loop at
# the bottom of this script, so the text below is endian-neutral source.
# The same heredoc also emits the public AES_decrypt prologue up to the
# PIC-ification fragments that follow this statement.
439$code.=<<___;
440.align	5
441.ent	_mips_AES_decrypt
442_mips_AES_decrypt:
443	.frame	$sp,0,$ra
444	.set	reorder
445	lw	$t0,0($key)
446	lw	$t1,4($key)
447	lw	$t2,8($key)
448	lw	$t3,12($key)
449	lw	$cnt,240($key)
450	$PTR_ADD $key0,$key,16
451
452	xor	$s0,$t0
453	xor	$s1,$t1
454	xor	$s2,$t2
455	xor	$s3,$t3
456
457	sub	$cnt,1
458	_xtr	$i0,$s3,16-2
459.Loop_dec:
460	_xtr	$i1,$s0,16-2
461	_xtr	$i2,$s1,16-2
462	_xtr	$i3,$s2,16-2
463	and	$i0,0x3fc
464	and	$i1,0x3fc
465	and	$i2,0x3fc
466	and	$i3,0x3fc
467	$PTR_ADD $i0,$Tbl
468	$PTR_ADD $i1,$Tbl
469	$PTR_ADD $i2,$Tbl
470	$PTR_ADD $i3,$Tbl
471	lwl	$t0,3($i0)		# Td1[s3>>16]
472	lwl	$t1,3($i1)		# Td1[s0>>16]
473	lwl	$t2,3($i2)		# Td1[s1>>16]
474	lwl	$t3,3($i3)		# Td1[s2>>16]
475	lwr	$t0,2($i0)		# Td1[s3>>16]
476	lwr	$t1,2($i1)		# Td1[s0>>16]
477	lwr	$t2,2($i2)		# Td1[s1>>16]
478	lwr	$t3,2($i3)		# Td1[s2>>16]
479
480	_xtr	$i0,$s2,8-2
481	_xtr	$i1,$s3,8-2
482	_xtr	$i2,$s0,8-2
483	_xtr	$i3,$s1,8-2
484	and	$i0,0x3fc
485	and	$i1,0x3fc
486	and	$i2,0x3fc
487	and	$i3,0x3fc
488	$PTR_ADD $i0,$Tbl
489	$PTR_ADD $i1,$Tbl
490	$PTR_ADD $i2,$Tbl
491	$PTR_ADD $i3,$Tbl
492	lwl	$t4,2($i0)		# Td2[s2>>8]
493	lwl	$t5,2($i1)		# Td2[s3>>8]
494	lwl	$t6,2($i2)		# Td2[s0>>8]
495	lwl	$t7,2($i3)		# Td2[s1>>8]
496	lwr	$t4,1($i0)		# Td2[s2>>8]
497	lwr	$t5,1($i1)		# Td2[s3>>8]
498	lwr	$t6,1($i2)		# Td2[s0>>8]
499	lwr	$t7,1($i3)		# Td2[s1>>8]
500
501	_xtr	$i0,$s1,0-2
502	_xtr	$i1,$s2,0-2
503	_xtr	$i2,$s3,0-2
504	_xtr	$i3,$s0,0-2
505	and	$i0,0x3fc
506	and	$i1,0x3fc
507	and	$i2,0x3fc
508	and	$i3,0x3fc
509	$PTR_ADD $i0,$Tbl
510	$PTR_ADD $i1,$Tbl
511	$PTR_ADD $i2,$Tbl
512	$PTR_ADD $i3,$Tbl
513	lwl	$t8,1($i0)		# Td3[s1]
514	lwl	$t9,1($i1)		# Td3[s2]
515	lwl	$t10,1($i2)		# Td3[s3]
516	lwl	$t11,1($i3)		# Td3[s0]
517	lwr	$t8,0($i0)		# Td3[s1]
518	lwr	$t9,0($i1)		# Td3[s2]
519	lwr	$t10,0($i2)		# Td3[s3]
520	lwr	$t11,0($i3)		# Td3[s0]
521
522	_xtr	$i0,$s0,24-2
523	_xtr	$i1,$s1,24-2
524	_xtr	$i2,$s2,24-2
525	_xtr	$i3,$s3,24-2
526	and	$i0,0x3fc
527	and	$i1,0x3fc
528	and	$i2,0x3fc
529	and	$i3,0x3fc
530	$PTR_ADD $i0,$Tbl
531	$PTR_ADD $i1,$Tbl
532	$PTR_ADD $i2,$Tbl
533	$PTR_ADD $i3,$Tbl
534
535	xor	$t0,$t4
536	xor	$t1,$t5
537	xor	$t2,$t6
538	xor	$t3,$t7
539
540
541	lw	$t4,0($i0)		# Td0[s0>>24]
542	lw	$t5,0($i1)		# Td0[s1>>24]
543	lw	$t6,0($i2)		# Td0[s2>>24]
544	lw	$t7,0($i3)		# Td0[s3>>24]
545
546	lw	$s0,0($key0)
547	lw	$s1,4($key0)
548	lw	$s2,8($key0)
549	lw	$s3,12($key0)
550
551	xor	$t0,$t8
552	xor	$t1,$t9
553	xor	$t2,$t10
554	xor	$t3,$t11
555
556	xor	$t0,$t4
557	xor	$t1,$t5
558	xor	$t2,$t6
559	xor	$t3,$t7
560
561	sub	$cnt,1
562	$PTR_ADD $key0,16
563	xor	$s0,$t0
564	xor	$s1,$t1
565	xor	$s2,$t2
566	xor	$s3,$t3
567	.set	noreorder
568	bnez	$cnt,.Loop_dec
569	_xtr	$i0,$s3,16-2
570
571	.set	reorder
572	lw	$t4,1024($Tbl)		# prefetch Td4
573	lw	$t5,1024+32($Tbl)
574	lw	$t6,1024+64($Tbl)
575	lw	$t7,1024+96($Tbl)
576	lw	$t8,1024+128($Tbl)
577	lw	$t9,1024+160($Tbl)
578	lw	$t10,1024+192($Tbl)
579	lw	$t11,1024+224($Tbl)
580
581	_xtr	$i0,$s3,16
582	_xtr	$i1,$s0,16
583	_xtr	$i2,$s1,16
584	_xtr	$i3,$s2,16
585	and	$i0,0xff
586	and	$i1,0xff
587	and	$i2,0xff
588	and	$i3,0xff
589	$PTR_ADD $i0,$Tbl
590	$PTR_ADD $i1,$Tbl
591	$PTR_ADD $i2,$Tbl
592	$PTR_ADD $i3,$Tbl
593	lbu	$t0,1024($i0)		# Td4[s3>>16]
594	lbu	$t1,1024($i1)		# Td4[s0>>16]
595	lbu	$t2,1024($i2)		# Td4[s1>>16]
596	lbu	$t3,1024($i3)		# Td4[s2>>16]
597
598	_xtr	$i0,$s2,8
599	_xtr	$i1,$s3,8
600	_xtr	$i2,$s0,8
601	_xtr	$i3,$s1,8
602	and	$i0,0xff
603	and	$i1,0xff
604	and	$i2,0xff
605	and	$i3,0xff
606	$PTR_ADD $i0,$Tbl
607	$PTR_ADD $i1,$Tbl
608	$PTR_ADD $i2,$Tbl
609	$PTR_ADD $i3,$Tbl
610	lbu	$t4,1024($i0)		# Td4[s2>>8]
611	lbu	$t5,1024($i1)		# Td4[s3>>8]
612	lbu	$t6,1024($i2)		# Td4[s0>>8]
613	lbu	$t7,1024($i3)		# Td4[s1>>8]
614
615	_xtr	$i0,$s0,24
616	_xtr	$i1,$s1,24
617	_xtr	$i2,$s2,24
618	_xtr	$i3,$s3,24
619	$PTR_ADD $i0,$Tbl
620	$PTR_ADD $i1,$Tbl
621	$PTR_ADD $i2,$Tbl
622	$PTR_ADD $i3,$Tbl
623	lbu	$t8,1024($i0)		# Td4[s0>>24]
624	lbu	$t9,1024($i1)		# Td4[s1>>24]
625	lbu	$t10,1024($i2)		# Td4[s2>>24]
626	lbu	$t11,1024($i3)		# Td4[s3>>24]
627
628	_xtr	$i0,$s1,0
629	_xtr	$i1,$s2,0
630	_xtr	$i2,$s3,0
631	_xtr	$i3,$s0,0
632
633	_ins	$t0,16
634	_ins	$t1,16
635	_ins	$t2,16
636	_ins	$t3,16
637
638	_ins	$t4,8
639	_ins	$t5,8
640	_ins	$t6,8
641	_ins	$t7,8
642
643	xor	$t0,$t4
644	xor	$t1,$t5
645	xor	$t2,$t6
646	xor	$t3,$t7
647
648	$PTR_ADD $i0,$Tbl
649	$PTR_ADD $i1,$Tbl
650	$PTR_ADD $i2,$Tbl
651	$PTR_ADD $i3,$Tbl
652	lbu	$t4,1024($i0)		# Td4[s1]
653	lbu	$t5,1024($i1)		# Td4[s2]
654	lbu	$t6,1024($i2)		# Td4[s3]
655	lbu	$t7,1024($i3)		# Td4[s0]
656
657	_ins	$t8,24
658	_ins	$t9,24
659	_ins	$t10,24
660	_ins	$t11,24
661
662	lw	$s0,0($key0)
663	lw	$s1,4($key0)
664	lw	$s2,8($key0)
665	lw	$s3,12($key0)
666
667	_ins	$t4,0
668	_ins	$t5,0
669	_ins	$t6,0
670	_ins	$t7,0
671
672
673	xor	$t0,$t8
674	xor	$t1,$t9
675	xor	$t2,$t10
676	xor	$t3,$t11
677
678	xor	$t0,$t4
679	xor	$t1,$t5
680	xor	$t2,$t6
681	xor	$t3,$t7
682
683	xor	$s0,$t0
684	xor	$s1,$t1
685	xor	$s2,$t2
686	xor	$s3,$t3
687
688	jr	$ra
689.end	_mips_AES_decrypt
690
691.align	5
692.globl	AES_decrypt
693.ent	AES_decrypt
694AES_decrypt:
695	.frame	$sp,$FRAMESIZE,$ra
696	.mask	$SAVED_REGS_MASK,-$SZREG
697	.set	noreorder
698___
# AES_decrypt wrapper body: flavour-dependent PIC setup, callee-save
# spills, unaligned load of the input block, call into the decrypt core,
# unaligned store of the result, and register restore/return.  The
# $MSB/$LSB offsets keep the lwl/lwr and swl/swr pairs endian-neutral;
# the post-processing loop below fixes up offsets for little-endian.
699$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
700	.cpload	$pf
701___
702$code.=<<___;
703	$PTR_SUB $sp,$FRAMESIZE
704	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
705	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
706	$REG_S	$s11,$FRAMESIZE-3*$SZREG($sp)
707	$REG_S	$s10,$FRAMESIZE-4*$SZREG($sp)
708	$REG_S	$s9,$FRAMESIZE-5*$SZREG($sp)
709	$REG_S	$s8,$FRAMESIZE-6*$SZREG($sp)
710	$REG_S	$s7,$FRAMESIZE-7*$SZREG($sp)
711	$REG_S	$s6,$FRAMESIZE-8*$SZREG($sp)
712	$REG_S	$s5,$FRAMESIZE-9*$SZREG($sp)
713	$REG_S	$s4,$FRAMESIZE-10*$SZREG($sp)
714___
# nubi spills five extra registers on top of the common set above.
715$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
716	$REG_S	\$15,$FRAMESIZE-11*$SZREG($sp)
717	$REG_S	\$14,$FRAMESIZE-12*$SZREG($sp)
718	$REG_S	\$13,$FRAMESIZE-13*$SZREG($sp)
719	$REG_S	\$12,$FRAMESIZE-14*$SZREG($sp)
720	$REG_S	$gp,$FRAMESIZE-15*$SZREG($sp)
721___
722$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
723	.cplocal	$Tbl
724	.cpsetup	$pf,$zero,AES_decrypt
725___
726$code.=<<___;
727	.set	reorder
728	la	$Tbl,AES_Td		# PIC-ified 'load address'
729
730	lwl	$s0,0+$MSB($inp)
731	lwl	$s1,4+$MSB($inp)
732	lwl	$s2,8+$MSB($inp)
733	lwl	$s3,12+$MSB($inp)
734	lwr	$s0,0+$LSB($inp)
735	lwr	$s1,4+$LSB($inp)
736	lwr	$s2,8+$LSB($inp)
737	lwr	$s3,12+$LSB($inp)
738
739	bal	_mips_AES_decrypt
740
741	swr	$s0,0+$LSB($out)
742	swr	$s1,4+$LSB($out)
743	swr	$s2,8+$LSB($out)
744	swr	$s3,12+$LSB($out)
745	swl	$s0,0+$MSB($out)
746	swl	$s1,4+$MSB($out)
747	swl	$s2,8+$MSB($out)
748	swl	$s3,12+$MSB($out)
749
750	.set	noreorder
751	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
752	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
753	$REG_L	$s11,$FRAMESIZE-3*$SZREG($sp)
754	$REG_L	$s10,$FRAMESIZE-4*$SZREG($sp)
755	$REG_L	$s9,$FRAMESIZE-5*$SZREG($sp)
756	$REG_L	$s8,$FRAMESIZE-6*$SZREG($sp)
757	$REG_L	$s7,$FRAMESIZE-7*$SZREG($sp)
758	$REG_L	$s6,$FRAMESIZE-8*$SZREG($sp)
759	$REG_L	$s5,$FRAMESIZE-9*$SZREG($sp)
760	$REG_L	$s4,$FRAMESIZE-10*$SZREG($sp)
761___
762$code.=<<___ if ($flavour =~ /nubi/i);
763	$REG_L	\$15,$FRAMESIZE-11*$SZREG($sp)
764	$REG_L	\$14,$FRAMESIZE-12*$SZREG($sp)
765	$REG_L	\$13,$FRAMESIZE-13*$SZREG($sp)
766	$REG_L	\$12,$FRAMESIZE-14*$SZREG($sp)
767	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
768___
# Frame release is in the delay slot of the return branch (noreorder).
769$code.=<<___;
770	jr	$ra
771	$PTR_ADD $sp,$FRAMESIZE
772.end	AES_decrypt
773___
774}}}
775
# Key-schedule section.  The bare {{{ }}} block scopes a fresh register
# map: $rk0..$rk7 hold up to 256 bits of round-key material, $rcon points
# into the rcon table appended after Te4 ($Tbl+1024+256), and $cnt counts
# remaining expansion rounds.
776{{{
777my $FRAMESIZE=8*$SZREG;
778my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000;
779
780my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);
781my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
782my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
783my ($rcon,$cnt)=($gp,$fp);
784
# _mips_AES_set_encrypt_key: expand the user key at $inp ($bits wide) into
# the schedule at $key.  Returns status in $t0: 0 on success, -1 for NULL
# pointers, -2 for an unsupported key length.  Uses the byte-wide Te4
# table at $Tbl+1024 for the SubWord step; _bias is rewritten to the
# endian-appropriate sll by the post-processing loop.
785$code.=<<___;
786.align	5
787.ent	_mips_AES_set_encrypt_key
788_mips_AES_set_encrypt_key:
789	.frame	$sp,0,$ra
790	.set	noreorder
791	beqz	$inp,.Lekey_done
792	li	$t0,-1
793	beqz	$key,.Lekey_done
794	$PTR_ADD $rcon,$Tbl,1024+256
795
796	.set	reorder
797	lwl	$rk0,0+$MSB($inp)	# load 128 bits
798	lwl	$rk1,4+$MSB($inp)
799	lwl	$rk2,8+$MSB($inp)
800	lwl	$rk3,12+$MSB($inp)
801	li	$at,128
802	lwr	$rk0,0+$LSB($inp)
803	lwr	$rk1,4+$LSB($inp)
804	lwr	$rk2,8+$LSB($inp)
805	lwr	$rk3,12+$LSB($inp)
806	.set	noreorder
807	beq	$bits,$at,.L128bits
808	li	$cnt,10
809
810	.set	reorder
811	lwl	$rk4,16+$MSB($inp)	# load 192 bits
812	lwl	$rk5,20+$MSB($inp)
813	li	$at,192
814	lwr	$rk4,16+$LSB($inp)
815	lwr	$rk5,20+$LSB($inp)
816	.set	noreorder
817	beq	$bits,$at,.L192bits
818	li	$cnt,8
819
820	.set	reorder
821	lwl	$rk6,24+$MSB($inp)	# load 256 bits
822	lwl	$rk7,28+$MSB($inp)
823	li	$at,256
824	lwr	$rk6,24+$LSB($inp)
825	lwr	$rk7,28+$LSB($inp)
826	.set	noreorder
827	beq	$bits,$at,.L256bits
828	li	$cnt,7
829
830	b	.Lekey_done
831	li	$t0,-2
832
833.align	4
834.L128bits:
835	.set	reorder
836	srl	$i0,$rk3,16
837	srl	$i1,$rk3,8
838	and	$i0,0xff
839	and	$i1,0xff
840	and	$i2,$rk3,0xff
841	srl	$i3,$rk3,24
842	$PTR_ADD $i0,$Tbl
843	$PTR_ADD $i1,$Tbl
844	$PTR_ADD $i2,$Tbl
845	$PTR_ADD $i3,$Tbl
846	lbu	$i0,1024($i0)
847	lbu	$i1,1024($i1)
848	lbu	$i2,1024($i2)
849	lbu	$i3,1024($i3)
850
851	sw	$rk0,0($key)
852	sw	$rk1,4($key)
853	sw	$rk2,8($key)
854	sw	$rk3,12($key)
855	sub	$cnt,1
856	$PTR_ADD $key,16
857
858	_bias	$i0,24
859	_bias	$i1,16
860	_bias	$i2,8
861	_bias	$i3,0
862
863	xor	$rk0,$i0
864	lw	$i0,0($rcon)
865	xor	$rk0,$i1
866	xor	$rk0,$i2
867	xor	$rk0,$i3
868	xor	$rk0,$i0
869
870	xor	$rk1,$rk0
871	xor	$rk2,$rk1
872	xor	$rk3,$rk2
873
874	.set	noreorder
875	bnez	$cnt,.L128bits
876	$PTR_ADD $rcon,4
877
878	sw	$rk0,0($key)
879	sw	$rk1,4($key)
880	sw	$rk2,8($key)
881	li	$cnt,10
882	sw	$rk3,12($key)
883	li	$t0,0
884	sw	$cnt,80($key)
885	b	.Lekey_done
886	$PTR_SUB $key,10*16
887
888.align	4
889.L192bits:
890	.set	reorder
891	srl	$i0,$rk5,16
892	srl	$i1,$rk5,8
893	and	$i0,0xff
894	and	$i1,0xff
895	and	$i2,$rk5,0xff
896	srl	$i3,$rk5,24
897	$PTR_ADD $i0,$Tbl
898	$PTR_ADD $i1,$Tbl
899	$PTR_ADD $i2,$Tbl
900	$PTR_ADD $i3,$Tbl
901	lbu	$i0,1024($i0)
902	lbu	$i1,1024($i1)
903	lbu	$i2,1024($i2)
904	lbu	$i3,1024($i3)
905
906	sw	$rk0,0($key)
907	sw	$rk1,4($key)
908	sw	$rk2,8($key)
909	sw	$rk3,12($key)
910	sw	$rk4,16($key)
911	sw	$rk5,20($key)
912	sub	$cnt,1
913	$PTR_ADD $key,24
914
915	_bias	$i0,24
916	_bias	$i1,16
917	_bias	$i2,8
918	_bias	$i3,0
919
920	xor	$rk0,$i0
921	lw	$i0,0($rcon)
922	xor	$rk0,$i1
923	xor	$rk0,$i2
924	xor	$rk0,$i3
925	xor	$rk0,$i0
926
927	xor	$rk1,$rk0
928	xor	$rk2,$rk1
929	xor	$rk3,$rk2
930	xor	$rk4,$rk3
931	xor	$rk5,$rk4
932
933	.set	noreorder
934	bnez	$cnt,.L192bits
935	$PTR_ADD $rcon,4
936
937	sw	$rk0,0($key)
938	sw	$rk1,4($key)
939	sw	$rk2,8($key)
940	li	$cnt,12
941	sw	$rk3,12($key)
942	li	$t0,0
943	sw	$cnt,48($key)
944	b	.Lekey_done
945	$PTR_SUB $key,12*16
946
947.align	4
948.L256bits:
949	.set	reorder
950	srl	$i0,$rk7,16
951	srl	$i1,$rk7,8
952	and	$i0,0xff
953	and	$i1,0xff
954	and	$i2,$rk7,0xff
955	srl	$i3,$rk7,24
956	$PTR_ADD $i0,$Tbl
957	$PTR_ADD $i1,$Tbl
958	$PTR_ADD $i2,$Tbl
959	$PTR_ADD $i3,$Tbl
960	lbu	$i0,1024($i0)
961	lbu	$i1,1024($i1)
962	lbu	$i2,1024($i2)
963	lbu	$i3,1024($i3)
964
965	sw	$rk0,0($key)
966	sw	$rk1,4($key)
967	sw	$rk2,8($key)
968	sw	$rk3,12($key)
969	sw	$rk4,16($key)
970	sw	$rk5,20($key)
971	sw	$rk6,24($key)
972	sw	$rk7,28($key)
973	sub	$cnt,1
974
975	_bias	$i0,24
976	_bias	$i1,16
977	_bias	$i2,8
978	_bias	$i3,0
979
980	xor	$rk0,$i0
981	lw	$i0,0($rcon)
982	xor	$rk0,$i1
983	xor	$rk0,$i2
984	xor	$rk0,$i3
985	xor	$rk0,$i0
986
987	xor	$rk1,$rk0
988	xor	$rk2,$rk1
989	xor	$rk3,$rk2
990	beqz	$cnt,.L256bits_done
991
992	srl	$i0,$rk3,24
993	srl	$i1,$rk3,16
994	srl	$i2,$rk3,8
995	and	$i3,$rk3,0xff
996	and	$i1,0xff
997	and	$i2,0xff
998	$PTR_ADD $i0,$Tbl
999	$PTR_ADD $i1,$Tbl
1000	$PTR_ADD $i2,$Tbl
1001	$PTR_ADD $i3,$Tbl
1002	lbu	$i0,1024($i0)
1003	lbu	$i1,1024($i1)
1004	lbu	$i2,1024($i2)
1005	lbu	$i3,1024($i3)
1006	sll	$i0,24
1007	sll	$i1,16
1008	sll	$i2,8
1009
1010	xor	$rk4,$i0
1011	xor	$rk4,$i1
1012	xor	$rk4,$i2
1013	xor	$rk4,$i3
1014
1015	xor	$rk5,$rk4
1016	xor	$rk6,$rk5
1017	xor	$rk7,$rk6
1018
1019	$PTR_ADD $key,32
1020	.set	noreorder
1021	b	.L256bits
1022	$PTR_ADD $rcon,4
1023
1024.L256bits_done:
1025	sw	$rk0,32($key)
1026	sw	$rk1,36($key)
1027	sw	$rk2,40($key)
1028	li	$cnt,14
1029	sw	$rk3,44($key)
1030	li	$t0,0
1031	sw	$cnt,48($key)
1032	$PTR_SUB $key,12*16
1033
1034.Lekey_done:
1035	jr	$ra
1036	nop
1037.end	_mips_AES_set_encrypt_key
1038
1039.globl	AES_set_encrypt_key
1040.ent	AES_set_encrypt_key
1041AES_set_encrypt_key:
1042	.frame	$sp,$FRAMESIZE,$ra
1043	.mask	$SAVED_REGS_MASK,-$SZREG
1044	.set	noreorder
1045___
1046$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
1047	.cpload	$pf
1048___
1049$code.=<<___;
1050	$PTR_SUB $sp,$FRAMESIZE
1051	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
1052	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
1053___
1054$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
1055	$REG_S	$s3,$FRAMESIZE-3*$SZREG($sp)
1056	$REG_S	$s2,$FRAMESIZE-4*$SZREG($sp)
1057	$REG_S	$s1,$FRAMESIZE-5*$SZREG($sp)
1058	$REG_S	$s0,$FRAMESIZE-6*$SZREG($sp)
1059	$REG_S	$gp,$FRAMESIZE-7*$SZREG($sp)
1060___
1061$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
1062	.cplocal	$Tbl
1063	.cpsetup	$pf,$zero,AES_set_encrypt_key
1064___
# NOTE(review): the nubi epilogue below restores from -11..-15*$SZREG while
# the prologue above saved at -3..-7*$SZREG — offsets look inconsistent;
# confirm against upstream aes-mips.pl before relying on the nubi flavour.
1065$code.=<<___;
1066	.set	reorder
1067	la	$Tbl,AES_Te		# PIC-ified 'load address'
1068
1069	bal	_mips_AES_set_encrypt_key
1070
1071	.set	noreorder
1072	move	$a0,$t0
1073	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
1074	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
1075___
1076$code.=<<___ if ($flavour =~ /nubi/i);
1077	$REG_L	$s3,$FRAMESIZE-11*$SZREG($sp)
1078	$REG_L	$s2,$FRAMESIZE-12*$SZREG($sp)
1079	$REG_L	$s1,$FRAMESIZE-13*$SZREG($sp)
1080	$REG_L	$s0,$FRAMESIZE-14*$SZREG($sp)
1081	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
1082___
1083$code.=<<___;
1084	jr	$ra
1085	$PTR_ADD $sp,$FRAMESIZE
1086.end	AES_set_encrypt_key
1087___
1088
# AES_set_decrypt_key: run the encrypt key schedule, then (1) reverse the
# order of the round keys in place (.Lswap) and (2) apply InvMixColumns to
# every round key except the first and last (.Lmix), using the classic
# xtime-by-masking trick with the 0x80808080/0x7f7f7f7f/0x1b1b1b1b masks.
1089my ($head,$tail)=($inp,$bits);
1090my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
1091my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
1092$code.=<<___;
1093.align	5
1094.globl	AES_set_decrypt_key
1095.ent	AES_set_decrypt_key
1096AES_set_decrypt_key:
1097	.frame	$sp,$FRAMESIZE,$ra
1098	.mask	$SAVED_REGS_MASK,-$SZREG
1099	.set	noreorder
1100___
1101$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
1102	.cpload	$pf
1103___
1104$code.=<<___;
1105	$PTR_SUB $sp,$FRAMESIZE
1106	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
1107	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
1108___
1109$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
1110	$REG_S	$s3,$FRAMESIZE-3*$SZREG($sp)
1111	$REG_S	$s2,$FRAMESIZE-4*$SZREG($sp)
1112	$REG_S	$s1,$FRAMESIZE-5*$SZREG($sp)
1113	$REG_S	$s0,$FRAMESIZE-6*$SZREG($sp)
1114	$REG_S	$gp,$FRAMESIZE-7*$SZREG($sp)
1115___
1116$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
1117	.cplocal	$Tbl
1118	.cpsetup	$pf,$zero,AES_set_decrypt_key
1119___
1120$code.=<<___;
1121	.set	reorder
1122	la	$Tbl,AES_Te		# PIC-ified 'load address'
1123
1124	bal	_mips_AES_set_encrypt_key
1125
1126	bltz	$t0,.Ldkey_done
1127
1128	sll	$at,$cnt,4
1129	$PTR_ADD $head,$key,0
1130	$PTR_ADD $tail,$key,$at
1131.align	4
1132.Lswap:
1133	lw	$rk0,0($head)
1134	lw	$rk1,4($head)
1135	lw	$rk2,8($head)
1136	lw	$rk3,12($head)
1137	lw	$rk4,0($tail)
1138	lw	$rk5,4($tail)
1139	lw	$rk6,8($tail)
1140	lw	$rk7,12($tail)
1141	sw	$rk0,0($tail)
1142	sw	$rk1,4($tail)
1143	sw	$rk2,8($tail)
1144	sw	$rk3,12($tail)
1145	$PTR_ADD $head,16
1146	$PTR_SUB $tail,16
1147	sw	$rk4,-16($head)
1148	sw	$rk5,-12($head)
1149	sw	$rk6,-8($head)
1150	sw	$rk7,-4($head)
1151	bne	$head,$tail,.Lswap
1152
1153	lw	$tp1,16($key)		# modulo-scheduled
1154	lui	$x80808080,0x8080
1155	sub	$cnt,1
1156	or	$x80808080,0x8080
1157	sll	$cnt,2
1158	$PTR_ADD $key,16
1159	lui	$x1b1b1b1b,0x1b1b
1160	nor	$x7f7f7f7f,$zero,$x80808080
1161	or	$x1b1b1b1b,0x1b1b
1162.align	4
1163.Lmix:
1164	and	$m,$tp1,$x80808080
1165	and	$tp2,$tp1,$x7f7f7f7f
1166	srl	$tp4,$m,7
1167	addu	$tp2,$tp2		# tp2<<1
1168	subu	$m,$tp4
1169	and	$m,$x1b1b1b1b
1170	xor	$tp2,$m
1171
1172	and	$m,$tp2,$x80808080
1173	and	$tp4,$tp2,$x7f7f7f7f
1174	srl	$tp8,$m,7
1175	addu	$tp4,$tp4		# tp4<<1
1176	subu	$m,$tp8
1177	and	$m,$x1b1b1b1b
1178	xor	$tp4,$m
1179
1180	and	$m,$tp4,$x80808080
1181	and	$tp8,$tp4,$x7f7f7f7f
1182	srl	$tp9,$m,7
1183	addu	$tp8,$tp8		# tp8<<1
1184	subu	$m,$tp9
1185	and	$m,$x1b1b1b1b
1186	xor	$tp8,$m
1187
1188	xor	$tp9,$tp8,$tp1
1189	xor	$tpe,$tp8,$tp4
1190	xor	$tpb,$tp9,$tp2
1191	xor	$tpd,$tp9,$tp4
1192
1193	_ror	$tp1,$tpd,16
1194	xor	$tpe,$tp2
1195	_ror	$tp2,$tpd,-16
1196	xor	$tpe,$tp1
1197	_ror	$tp1,$tp9,8
1198	xor	$tpe,$tp2
1199	_ror	$tp2,$tp9,-24
1200	xor	$tpe,$tp1
1201	_ror	$tp1,$tpb,24
1202	xor	$tpe,$tp2
1203	_ror	$tp2,$tpb,-8
1204	xor	$tpe,$tp1
1205	lw	$tp1,4($key)		# modulo-scheduled
1206	xor	$tpe,$tp2
1207	sub	$cnt,1
1208	sw	$tpe,0($key)
1209	$PTR_ADD $key,4
1210	bnez	$cnt,.Lmix
1211
1212	li	$t0,0
1213.Ldkey_done:
1214	.set	noreorder
1215	move	$a0,$t0
1216	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
1217	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
1218___
1219$code.=<<___ if ($flavour =~ /nubi/i);
1220	$REG_L	$s3,$FRAMESIZE-11*$SZREG($sp)
1221	$REG_L	$s2,$FRAMESIZE-12*$SZREG($sp)
1222	$REG_L	$s1,$FRAMESIZE-13*$SZREG($sp)
1223	$REG_L	$s0,$FRAMESIZE-14*$SZREG($sp)
1224	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
1225___
1226$code.=<<___;
1227	jr	$ra
1228	$PTR_ADD $sp,$FRAMESIZE
1229.end	AES_set_decrypt_key
1230___
1231}}}
1232
1233######################################################################
1234# Tables are kept in endian-neutral manner
1235$code.=<<___;
1236.rdata
1237.align 6
1238AES_Te:
1239.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0
1240.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
1241.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
1242.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
1243.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
1244.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
1245.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
1246.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
1247.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
1248.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
1249.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
1250.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
1251.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
1252.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
1253.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
1254.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
1255.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
1256.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
1257.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
1258.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
1259.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
1260.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
1261.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
1262.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
1263.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
1264.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
1265.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
1266.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
1267.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
1268.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
1269.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
1270.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
1271.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e
1272.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e
1273.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2
1274.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb
1275.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d
1276.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce
1277.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e
1278.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97
1279.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68
1280.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c
1281.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f
1282.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed
1283.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46
1284.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b
1285.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4
1286.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a
1287.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a
1288.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16
1289.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7
1290.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94
1291.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10
1292.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81
1293.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44
1294.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3
1295.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe
1296.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a
1297.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc
1298.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04
1299.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1
1300.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63
1301.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a
1302.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d
1303.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14
1304.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f
1305.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2
1306.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39
1307.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2
1308.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47
1309.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7
1310.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95
1311.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98
1312.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f
1313.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e
1314.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83
1315.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29
1316.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c
1317.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2
1318.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76
1319.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56
1320.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e
1321.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a
1322.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4
1323.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e
1324.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6
1325.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4
1326.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b
1327.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43
1328.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7
1329.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64
1330.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0
1331.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa
1332.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25
1333.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e
1334.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18
1335.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88
1336.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72
1337.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1
1338.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51
1339.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c
1340.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21
1341.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc
1342.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85
1343.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42
1344.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa
1345.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05
1346.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12
1347.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f
1348.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0
1349.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58
1350.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9
1351.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13
1352.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33
1353.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70
1354.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7
1355.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22
1356.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20
1357.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff
1358.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a
1359.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8
1360.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17
1361.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31
1362.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8
1363.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0
1364.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11
1365.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
1366.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
1367
1368.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4
1369.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
1370.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
1371.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
1372.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
1373.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
1374.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
1375.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
1376.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
1377.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
1378.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
1379.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
1380.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
1381.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
1382.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
1383.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
1384.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
1385.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
1386.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
1387.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
1388.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
1389.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
1390.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
1391.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
1392.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
1393.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
1394.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
1395.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
1396.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
1397.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
1398.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
1399.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
1400
1401.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon
1402.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
1403.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
1404.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
1405.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
1406
1407.align 6
1408AES_Td:
1409.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0
1410.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
1411.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1
1412.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93
1413.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6
1414.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25
1415.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7
1416.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f
1417.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67
1418.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1
1419.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12
1420.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6
1421.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95
1422.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda
1423.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3
1424.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44
1425.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78
1426.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd
1427.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17
1428.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4
1429.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82
1430.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45
1431.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84
1432.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94
1433.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19
1434.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7
1435.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2
1436.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a
1437.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03
1438.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5
1439.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2
1440.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c
1441.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92
1442.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1
1443.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5
1444.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a
1445.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0
1446.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75
1447.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa
1448.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51
1449.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d
1450.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46
1451.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05
1452.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff
1453.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97
1454.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77
1455.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88
1456.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb
1457.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9
1458.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00
1459.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48
1460.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e
1461.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56
1462.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27
1463.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21
1464.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a
1465.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f
1466.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e
1467.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2
1468.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16
1469.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5
1470.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d
1471.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad
1472.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8
1473.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c
1474.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd
1475.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc
1476.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34
1477.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc
1478.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63
1479.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10
1480.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20
1481.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8
1482.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d
1483.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3
1484.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0
1485.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99
1486.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22
1487.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a
1488.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef
1489.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1
1490.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36
1491.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28
1492.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4
1493.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d
1494.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62
1495.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8
1496.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5
1497.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c
1498.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3
1499.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7
1500.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b
1501.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4
1502.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8
1503.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e
1504.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6
1505.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce
1506.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6
1507.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31
1508.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0
1509.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6
1510.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15
1511.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7
1512.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f
1513.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d
1514.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf
1515.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b
1516.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f
1517.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d
1518.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e
1519.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52
1520.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13
1521.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a
1522.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89
1523.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35
1524.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c
1525.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f
1526.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf
1527.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b
1528.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86
1529.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e
1530.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f
1531.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c
1532.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41
1533.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde
1534.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90
1535.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70
1536.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42
1537
1538.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4
1539.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
1540.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
1541.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
1542.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
1543.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
1544.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
1545.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
1546.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
1547.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
1548.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
1549.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
1550.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
1551.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
1552.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
1553.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
1554.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
1555.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
1556.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
1557.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
1558.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
1559.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
1560.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
1561.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
1562.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
1563.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
1564.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1565.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1566.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1567.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1568.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1569.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1570___
1571
# Post-process the generated assembly line by line and print it to STDOUT.
1572foreach (split("\n",$code)) {
	# Evaluate any `...` construct embedded in the template as Perl.
1573	s/\`([^\`]*)\`/eval $1/ge;
1574
1575	# made-up _instructions, _xtr, _ins, _ror and _bias, cope
1576	# with byte order dependencies...
1577	if (/^\s+_/) {
	    # Duplicate the destination register as the first source operand,
	    # turning the 2-operand shorthand into 3-operand MIPS form.
1578	    s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/;
1579
	    # _xtr/_ins become srl/sll; on little-endian targets the shift
	    # count is mirrored to 24-n.  _ror becomes srl with the count
	    # negated on little-endian (a negative count is fixed up below).
	    # _bias becomes sll with the count adjusted to (n-16)&31 on
	    # little-endian.
1580	    s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/
1581		sprintf("srl\t$1,$2,%d",$big_endian	?	eval($3)
1582							:	eval("24-$3"))/e or
1583	    s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
1584		sprintf("sll\t$1,$2,%d",$big_endian	?	eval($3)
1585							:	eval("24-$3"))/e or
1586	    s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/
1587		sprintf("srl\t$1,$2,%d",$big_endian	?	eval($3)
1588							:	eval("$3*-1"))/e or
1589	    s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
1590		sprintf("sll\t$1,$2,%d",$big_endian	?	eval($3)
1591							:	eval("($3-16)&31"))/e;
1592
	    # Clean-ups for the synthesized shifts: a negative srl count
	    # really means shift left; srl by 0 degenerates to a byte mask;
	    # sll by 0 is a no-op and is commented out of the output.
1593	    s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/
1594		sprintf("sll\t$1,$2,$3")/e				or
1595	    s/srl\s+(\$[0-9]+),(\$[0-9]+),0/
1596		sprintf("and\t$1,$2,0xff")/e				or
1597	    s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/;
1598	}
1599
1600	# convert lwl/lwr and swr/swl to little-endian order
1601	if (!$big_endian && /^\s+[sl]w[lr]\s+/) {
	    # The templates are written for big-endian; on little-endian the
	    # partial-word offset moves to the other end of the aligned word.
1602	    s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/
1603		sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e	or
1604	    s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/
1605		sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e;
1606	}
1607
1608	print $_,"\n";
1609}
1610
# Flush and close the output file opened at the top of the script.
1611close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-parisc.pl b/src/lib/libcrypto/aes/asm/aes-parisc.pl
deleted file mode 100644
index c36b6a2270..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-parisc.pl
+++ /dev/null
@@ -1,1021 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for PA-RISC.
11#
12# June 2009.
13#
14# The module is mechanical transliteration of aes-sparcv9.pl, but with
15# a twist: S-boxes are compressed even further down to 1K+256B. On
16# PA-7100LC performance is ~40% better than gcc 3.2 generated code and
17# is about 33 cycles per byte processed with 128-bit key. Newer CPUs
18# perform at 16 cycles per byte. It's not faster than code generated
19# by vendor compiler, but recall that it has compressed S-boxes, which
20# requires extra processing.
21#
22# Special thanks to polarhome.com for providing HP-UX account.
23
24$flavour = shift;
25$output = shift;
26open STDOUT,">$output";
27
28if ($flavour =~ /64/) {
29 $LEVEL ="2.0W";
30 $SIZE_T =8;
31 $FRAME_MARKER =80;
32 $SAVED_RP =16;
33 $PUSH ="std";
34 $PUSHMA ="std,ma";
35 $POP ="ldd";
36 $POPMB ="ldd,mb";
37} else {
38 $LEVEL ="1.0";
39 $SIZE_T =4;
40 $FRAME_MARKER =48;
41 $SAVED_RP =20;
42 $PUSH ="stw";
43 $PUSHMA ="stwm";
44 $POP ="ldw";
45 $POPMB ="ldwm";
46}
47
48$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
49 # [+ argument transfer]
50$inp="%r26"; # arg0
51$out="%r25"; # arg1
52$key="%r24"; # arg2
53
54($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4");
55($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8");
56
57($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7,
58 $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) =
59("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16",
60"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26");
61
62$tbl="%r28";
63$rounds="%r29";
64
65$code=<<___;
66 .LEVEL $LEVEL
67 .SPACE \$TEXT\$
68 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
69
70 .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
71 .ALIGN 64
72AES_encrypt
73 .PROC
74 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
75 .ENTRY
76 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
77 $PUSHMA %r3,$FRAME(%sp)
78 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
79 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
80 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
81 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
82 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
83 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
84 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
85 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
86 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
87 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
88 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
89 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
90 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
91 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
92 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
93
94 blr %r0,$tbl
95 ldi 3,$t0
96L\$enc_pic
97 andcm $tbl,$t0,$tbl
98 ldo L\$AES_Te-L\$enc_pic($tbl),$tbl
99
100 and $inp,$t0,$t0
101 sub $inp,$t0,$inp
102 ldw 0($inp),$s0
103 ldw 4($inp),$s1
104 ldw 8($inp),$s2
105 comib,= 0,$t0,L\$enc_inp_aligned
106 ldw 12($inp),$s3
107
108 sh3addl $t0,%r0,$t0
109 subi 32,$t0,$t0
110 mtctl $t0,%cr11
111 ldw 16($inp),$t1
112 vshd $s0,$s1,$s0
113 vshd $s1,$s2,$s1
114 vshd $s2,$s3,$s2
115 vshd $s3,$t1,$s3
116
117L\$enc_inp_aligned
118 bl _parisc_AES_encrypt,%r31
119 nop
120
121 extru,<> $out,31,2,%r0
122 b L\$enc_out_aligned
123 nop
124
125 _srm $s0,24,$acc0
126 _srm $s0,16,$acc1
127 stb $acc0,0($out)
128 _srm $s0,8,$acc2
129 stb $acc1,1($out)
130 _srm $s1,24,$acc4
131 stb $acc2,2($out)
132 _srm $s1,16,$acc5
133 stb $s0,3($out)
134 _srm $s1,8,$acc6
135 stb $acc4,4($out)
136 _srm $s2,24,$acc0
137 stb $acc5,5($out)
138 _srm $s2,16,$acc1
139 stb $acc6,6($out)
140 _srm $s2,8,$acc2
141 stb $s1,7($out)
142 _srm $s3,24,$acc4
143 stb $acc0,8($out)
144 _srm $s3,16,$acc5
145 stb $acc1,9($out)
146 _srm $s3,8,$acc6
147 stb $acc2,10($out)
148 stb $s2,11($out)
149 stb $acc4,12($out)
150 stb $acc5,13($out)
151 stb $acc6,14($out)
152 b L\$enc_done
153 stb $s3,15($out)
154
155L\$enc_out_aligned
156 stw $s0,0($out)
157 stw $s1,4($out)
158 stw $s2,8($out)
159 stw $s3,12($out)
160
161L\$enc_done
162 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
163 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
164 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
165 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
166 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
167 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
168 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
169 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
170 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
171 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
172 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
173 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
174 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
175 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
176 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
177 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
178 bv (%r2)
179 .EXIT
180 $POPMB -$FRAME(%sp),%r3
181 .PROCEND
182
183 .ALIGN 16
184_parisc_AES_encrypt
185 .PROC
186 .CALLINFO MILLICODE
187 .ENTRY
188 ldw 240($key),$rounds
189 ldw 0($key),$t0
190 ldw 4($key),$t1
191 ldw 8($key),$t2
192 _srm $rounds,1,$rounds
193 xor $t0,$s0,$s0
194 ldw 12($key),$t3
195 _srm $s0,24,$acc0
196 xor $t1,$s1,$s1
197 ldw 16($key),$t0
198 _srm $s1,16,$acc1
199 xor $t2,$s2,$s2
200 ldw 20($key),$t1
201 xor $t3,$s3,$s3
202 ldw 24($key),$t2
203 ldw 28($key),$t3
204L\$enc_loop
205 _srm $s2,8,$acc2
206 ldwx,s $acc0($tbl),$acc0
207 _srm $s3,0,$acc3
208 ldwx,s $acc1($tbl),$acc1
209 _srm $s1,24,$acc4
210 ldwx,s $acc2($tbl),$acc2
211 _srm $s2,16,$acc5
212 ldwx,s $acc3($tbl),$acc3
213 _srm $s3,8,$acc6
214 ldwx,s $acc4($tbl),$acc4
215 _srm $s0,0,$acc7
216 ldwx,s $acc5($tbl),$acc5
217 _srm $s2,24,$acc8
218 ldwx,s $acc6($tbl),$acc6
219 _srm $s3,16,$acc9
220 ldwx,s $acc7($tbl),$acc7
221 _srm $s0,8,$acc10
222 ldwx,s $acc8($tbl),$acc8
223 _srm $s1,0,$acc11
224 ldwx,s $acc9($tbl),$acc9
225 _srm $s3,24,$acc12
226 ldwx,s $acc10($tbl),$acc10
227 _srm $s0,16,$acc13
228 ldwx,s $acc11($tbl),$acc11
229 _srm $s1,8,$acc14
230 ldwx,s $acc12($tbl),$acc12
231 _srm $s2,0,$acc15
232 ldwx,s $acc13($tbl),$acc13
233 ldwx,s $acc14($tbl),$acc14
234 ldwx,s $acc15($tbl),$acc15
235 addib,= -1,$rounds,L\$enc_last
236 ldo 32($key),$key
237
238 _ror $acc1,8,$acc1
239 xor $acc0,$t0,$t0
240 ldw 0($key),$s0
241 _ror $acc2,16,$acc2
242 xor $acc1,$t0,$t0
243 ldw 4($key),$s1
244 _ror $acc3,24,$acc3
245 xor $acc2,$t0,$t0
246 ldw 8($key),$s2
247 _ror $acc5,8,$acc5
248 xor $acc3,$t0,$t0
249 ldw 12($key),$s3
250 _ror $acc6,16,$acc6
251 xor $acc4,$t1,$t1
252 _ror $acc7,24,$acc7
253 xor $acc5,$t1,$t1
254 _ror $acc9,8,$acc9
255 xor $acc6,$t1,$t1
256 _ror $acc10,16,$acc10
257 xor $acc7,$t1,$t1
258 _ror $acc11,24,$acc11
259 xor $acc8,$t2,$t2
260 _ror $acc13,8,$acc13
261 xor $acc9,$t2,$t2
262 _ror $acc14,16,$acc14
263 xor $acc10,$t2,$t2
264 _ror $acc15,24,$acc15
265 xor $acc11,$t2,$t2
266 xor $acc12,$acc14,$acc14
267 xor $acc13,$t3,$t3
268 _srm $t0,24,$acc0
269 xor $acc14,$t3,$t3
270 _srm $t1,16,$acc1
271 xor $acc15,$t3,$t3
272
273 _srm $t2,8,$acc2
274 ldwx,s $acc0($tbl),$acc0
275 _srm $t3,0,$acc3
276 ldwx,s $acc1($tbl),$acc1
277 _srm $t1,24,$acc4
278 ldwx,s $acc2($tbl),$acc2
279 _srm $t2,16,$acc5
280 ldwx,s $acc3($tbl),$acc3
281 _srm $t3,8,$acc6
282 ldwx,s $acc4($tbl),$acc4
283 _srm $t0,0,$acc7
284 ldwx,s $acc5($tbl),$acc5
285 _srm $t2,24,$acc8
286 ldwx,s $acc6($tbl),$acc6
287 _srm $t3,16,$acc9
288 ldwx,s $acc7($tbl),$acc7
289 _srm $t0,8,$acc10
290 ldwx,s $acc8($tbl),$acc8
291 _srm $t1,0,$acc11
292 ldwx,s $acc9($tbl),$acc9
293 _srm $t3,24,$acc12
294 ldwx,s $acc10($tbl),$acc10
295 _srm $t0,16,$acc13
296 ldwx,s $acc11($tbl),$acc11
297 _srm $t1,8,$acc14
298 ldwx,s $acc12($tbl),$acc12
299 _srm $t2,0,$acc15
300 ldwx,s $acc13($tbl),$acc13
301 _ror $acc1,8,$acc1
302 ldwx,s $acc14($tbl),$acc14
303
304 _ror $acc2,16,$acc2
305 xor $acc0,$s0,$s0
306 ldwx,s $acc15($tbl),$acc15
307 _ror $acc3,24,$acc3
308 xor $acc1,$s0,$s0
309 ldw 16($key),$t0
310 _ror $acc5,8,$acc5
311 xor $acc2,$s0,$s0
312 ldw 20($key),$t1
313 _ror $acc6,16,$acc6
314 xor $acc3,$s0,$s0
315 ldw 24($key),$t2
316 _ror $acc7,24,$acc7
317 xor $acc4,$s1,$s1
318 ldw 28($key),$t3
319 _ror $acc9,8,$acc9
320 xor $acc5,$s1,$s1
321 ldw 1024+0($tbl),%r0 ; prefetch te4
322 _ror $acc10,16,$acc10
323 xor $acc6,$s1,$s1
324 ldw 1024+32($tbl),%r0 ; prefetch te4
325 _ror $acc11,24,$acc11
326 xor $acc7,$s1,$s1
327 ldw 1024+64($tbl),%r0 ; prefetch te4
328 _ror $acc13,8,$acc13
329 xor $acc8,$s2,$s2
330 ldw 1024+96($tbl),%r0 ; prefetch te4
331 _ror $acc14,16,$acc14
332 xor $acc9,$s2,$s2
333 ldw 1024+128($tbl),%r0 ; prefetch te4
334 _ror $acc15,24,$acc15
335 xor $acc10,$s2,$s2
336 ldw 1024+160($tbl),%r0 ; prefetch te4
337 _srm $s0,24,$acc0
338 xor $acc11,$s2,$s2
339 ldw 1024+192($tbl),%r0 ; prefetch te4
340 xor $acc12,$acc14,$acc14
341 xor $acc13,$s3,$s3
342 ldw 1024+224($tbl),%r0 ; prefetch te4
343 _srm $s1,16,$acc1
344 xor $acc14,$s3,$s3
345 b L\$enc_loop
346 xor $acc15,$s3,$s3
347
348 .ALIGN 16
349L\$enc_last
350 ldo 1024($tbl),$rounds
351 _ror $acc1,8,$acc1
352 xor $acc0,$t0,$t0
353 ldw 0($key),$s0
354 _ror $acc2,16,$acc2
355 xor $acc1,$t0,$t0
356 ldw 4($key),$s1
357 _ror $acc3,24,$acc3
358 xor $acc2,$t0,$t0
359 ldw 8($key),$s2
360 _ror $acc5,8,$acc5
361 xor $acc3,$t0,$t0
362 ldw 12($key),$s3
363 _ror $acc6,16,$acc6
364 xor $acc4,$t1,$t1
365 _ror $acc7,24,$acc7
366 xor $acc5,$t1,$t1
367 _ror $acc9,8,$acc9
368 xor $acc6,$t1,$t1
369 _ror $acc10,16,$acc10
370 xor $acc7,$t1,$t1
371 _ror $acc11,24,$acc11
372 xor $acc8,$t2,$t2
373 _ror $acc13,8,$acc13
374 xor $acc9,$t2,$t2
375 _ror $acc14,16,$acc14
376 xor $acc10,$t2,$t2
377 _ror $acc15,24,$acc15
378 xor $acc11,$t2,$t2
379 xor $acc12,$acc14,$acc14
380 xor $acc13,$t3,$t3
381 _srm $t0,24,$acc0
382 xor $acc14,$t3,$t3
383 _srm $t1,16,$acc1
384 xor $acc15,$t3,$t3
385
386 _srm $t2,8,$acc2
387 ldbx $acc0($rounds),$acc0
388 _srm $t1,24,$acc4
389 ldbx $acc1($rounds),$acc1
390 _srm $t2,16,$acc5
391 _srm $t3,0,$acc3
392 ldbx $acc2($rounds),$acc2
393 ldbx $acc3($rounds),$acc3
394 _srm $t3,8,$acc6
395 ldbx $acc4($rounds),$acc4
396 _srm $t2,24,$acc8
397 ldbx $acc5($rounds),$acc5
398 _srm $t3,16,$acc9
399 _srm $t0,0,$acc7
400 ldbx $acc6($rounds),$acc6
401 ldbx $acc7($rounds),$acc7
402 _srm $t0,8,$acc10
403 ldbx $acc8($rounds),$acc8
404 _srm $t3,24,$acc12
405 ldbx $acc9($rounds),$acc9
406 _srm $t0,16,$acc13
407 _srm $t1,0,$acc11
408 ldbx $acc10($rounds),$acc10
409 _srm $t1,8,$acc14
410 ldbx $acc11($rounds),$acc11
411 ldbx $acc12($rounds),$acc12
412 ldbx $acc13($rounds),$acc13
413 _srm $t2,0,$acc15
414 ldbx $acc14($rounds),$acc14
415
416 dep $acc0,7,8,$acc3
417 ldbx $acc15($rounds),$acc15
418 dep $acc4,7,8,$acc7
419 dep $acc1,15,8,$acc3
420 dep $acc5,15,8,$acc7
421 dep $acc2,23,8,$acc3
422 dep $acc6,23,8,$acc7
423 xor $acc3,$s0,$s0
424 xor $acc7,$s1,$s1
425 dep $acc8,7,8,$acc11
426 dep $acc12,7,8,$acc15
427 dep $acc9,15,8,$acc11
428 dep $acc13,15,8,$acc15
429 dep $acc10,23,8,$acc11
430 dep $acc14,23,8,$acc15
431 xor $acc11,$s2,$s2
432
433 bv (%r31)
434 .EXIT
435 xor $acc15,$s3,$s3
436 .PROCEND
437
438 .ALIGN 64
439L\$AES_Te
440 .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
441 .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
442 .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
443 .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
444 .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
445 .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
446 .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
447 .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
448 .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
449 .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
450 .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
451 .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
452 .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
453 .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
454 .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
455 .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
456 .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
457 .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
458 .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
459 .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
460 .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
461 .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
462 .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
463 .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
464 .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
465 .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
466 .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
467 .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
468 .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
469 .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
470 .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
471 .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
472 .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
473 .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
474 .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
475 .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
476 .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
477 .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
478 .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
479 .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
480 .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
481 .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
482 .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
483 .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
484 .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
485 .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
486 .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
487 .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
488 .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
489 .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
490 .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
491 .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
492 .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
493 .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
494 .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
495 .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
496 .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
497 .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
498 .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
499 .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
500 .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
501 .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
502 .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
503 .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
504 .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
505 .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
506 .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
507 .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
508 .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
509 .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
510 .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
511 .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
512 .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
513 .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
514 .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
515 .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
516 .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
517 .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
518 .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
519 .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
520 .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
521 .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
522 .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
523 .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
524 .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
525 .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
526 .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
527 .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
528 .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
529 .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
530 .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
531 .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
532 .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
533 .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
534 .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
535 .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
536___
537
538$code.=<<___;
539 .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
540 .ALIGN 16
541AES_decrypt
542 .PROC
543 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
544 .ENTRY
545 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
546 $PUSHMA %r3,$FRAME(%sp)
547 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
548 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
549 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
550 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
551 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
552 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
553 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
554 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
555 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
556 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
557 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
558 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
559 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
560 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
561 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
562
563 blr %r0,$tbl
564 ldi 3,$t0
565L\$dec_pic
566 andcm $tbl,$t0,$tbl
567 ldo L\$AES_Td-L\$dec_pic($tbl),$tbl
568
569 and $inp,$t0,$t0
570 sub $inp,$t0,$inp
571 ldw 0($inp),$s0
572 ldw 4($inp),$s1
573 ldw 8($inp),$s2
574 comib,= 0,$t0,L\$dec_inp_aligned
575 ldw 12($inp),$s3
576
577 sh3addl $t0,%r0,$t0
578 subi 32,$t0,$t0
579 mtctl $t0,%cr11
580 ldw 16($inp),$t1
581 vshd $s0,$s1,$s0
582 vshd $s1,$s2,$s1
583 vshd $s2,$s3,$s2
584 vshd $s3,$t1,$s3
585
586L\$dec_inp_aligned
587 bl _parisc_AES_decrypt,%r31
588 nop
589
590 extru,<> $out,31,2,%r0
591 b L\$dec_out_aligned
592 nop
593
594 _srm $s0,24,$acc0
595 _srm $s0,16,$acc1
596 stb $acc0,0($out)
597 _srm $s0,8,$acc2
598 stb $acc1,1($out)
599 _srm $s1,24,$acc4
600 stb $acc2,2($out)
601 _srm $s1,16,$acc5
602 stb $s0,3($out)
603 _srm $s1,8,$acc6
604 stb $acc4,4($out)
605 _srm $s2,24,$acc0
606 stb $acc5,5($out)
607 _srm $s2,16,$acc1
608 stb $acc6,6($out)
609 _srm $s2,8,$acc2
610 stb $s1,7($out)
611 _srm $s3,24,$acc4
612 stb $acc0,8($out)
613 _srm $s3,16,$acc5
614 stb $acc1,9($out)
615 _srm $s3,8,$acc6
616 stb $acc2,10($out)
617 stb $s2,11($out)
618 stb $acc4,12($out)
619 stb $acc5,13($out)
620 stb $acc6,14($out)
621 b L\$dec_done
622 stb $s3,15($out)
623
624L\$dec_out_aligned
625 stw $s0,0($out)
626 stw $s1,4($out)
627 stw $s2,8($out)
628 stw $s3,12($out)
629
630L\$dec_done
631 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
632 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
633 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
634 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
635 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
636 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
637 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
638 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
639 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
640 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
641 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
642 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
643 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
644 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
645 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
646 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
647 bv (%r2)
648 .EXIT
649 $POPMB -$FRAME(%sp),%r3
650 .PROCEND
651
652 .ALIGN 16
653_parisc_AES_decrypt
654 .PROC
655 .CALLINFO MILLICODE
656 .ENTRY
657 ldw 240($key),$rounds
658 ldw 0($key),$t0
659 ldw 4($key),$t1
660 ldw 8($key),$t2
661 ldw 12($key),$t3
662 _srm $rounds,1,$rounds
663 xor $t0,$s0,$s0
664 ldw 16($key),$t0
665 xor $t1,$s1,$s1
666 ldw 20($key),$t1
667 _srm $s0,24,$acc0
668 xor $t2,$s2,$s2
669 ldw 24($key),$t2
670 xor $t3,$s3,$s3
671 ldw 28($key),$t3
672 _srm $s3,16,$acc1
673L\$dec_loop
674 _srm $s2,8,$acc2
675 ldwx,s $acc0($tbl),$acc0
676 _srm $s1,0,$acc3
677 ldwx,s $acc1($tbl),$acc1
678 _srm $s1,24,$acc4
679 ldwx,s $acc2($tbl),$acc2
680 _srm $s0,16,$acc5
681 ldwx,s $acc3($tbl),$acc3
682 _srm $s3,8,$acc6
683 ldwx,s $acc4($tbl),$acc4
684 _srm $s2,0,$acc7
685 ldwx,s $acc5($tbl),$acc5
686 _srm $s2,24,$acc8
687 ldwx,s $acc6($tbl),$acc6
688 _srm $s1,16,$acc9
689 ldwx,s $acc7($tbl),$acc7
690 _srm $s0,8,$acc10
691 ldwx,s $acc8($tbl),$acc8
692 _srm $s3,0,$acc11
693 ldwx,s $acc9($tbl),$acc9
694 _srm $s3,24,$acc12
695 ldwx,s $acc10($tbl),$acc10
696 _srm $s2,16,$acc13
697 ldwx,s $acc11($tbl),$acc11
698 _srm $s1,8,$acc14
699 ldwx,s $acc12($tbl),$acc12
700 _srm $s0,0,$acc15
701 ldwx,s $acc13($tbl),$acc13
702 ldwx,s $acc14($tbl),$acc14
703 ldwx,s $acc15($tbl),$acc15
704 addib,= -1,$rounds,L\$dec_last
705 ldo 32($key),$key
706
707 _ror $acc1,8,$acc1
708 xor $acc0,$t0,$t0
709 ldw 0($key),$s0
710 _ror $acc2,16,$acc2
711 xor $acc1,$t0,$t0
712 ldw 4($key),$s1
713 _ror $acc3,24,$acc3
714 xor $acc2,$t0,$t0
715 ldw 8($key),$s2
716 _ror $acc5,8,$acc5
717 xor $acc3,$t0,$t0
718 ldw 12($key),$s3
719 _ror $acc6,16,$acc6
720 xor $acc4,$t1,$t1
721 _ror $acc7,24,$acc7
722 xor $acc5,$t1,$t1
723 _ror $acc9,8,$acc9
724 xor $acc6,$t1,$t1
725 _ror $acc10,16,$acc10
726 xor $acc7,$t1,$t1
727 _ror $acc11,24,$acc11
728 xor $acc8,$t2,$t2
729 _ror $acc13,8,$acc13
730 xor $acc9,$t2,$t2
731 _ror $acc14,16,$acc14
732 xor $acc10,$t2,$t2
733 _ror $acc15,24,$acc15
734 xor $acc11,$t2,$t2
735 xor $acc12,$acc14,$acc14
736 xor $acc13,$t3,$t3
737 _srm $t0,24,$acc0
738 xor $acc14,$t3,$t3
739 xor $acc15,$t3,$t3
740 _srm $t3,16,$acc1
741
742 _srm $t2,8,$acc2
743 ldwx,s $acc0($tbl),$acc0
744 _srm $t1,0,$acc3
745 ldwx,s $acc1($tbl),$acc1
746 _srm $t1,24,$acc4
747 ldwx,s $acc2($tbl),$acc2
748 _srm $t0,16,$acc5
749 ldwx,s $acc3($tbl),$acc3
750 _srm $t3,8,$acc6
751 ldwx,s $acc4($tbl),$acc4
752 _srm $t2,0,$acc7
753 ldwx,s $acc5($tbl),$acc5
754 _srm $t2,24,$acc8
755 ldwx,s $acc6($tbl),$acc6
756 _srm $t1,16,$acc9
757 ldwx,s $acc7($tbl),$acc7
758 _srm $t0,8,$acc10
759 ldwx,s $acc8($tbl),$acc8
760 _srm $t3,0,$acc11
761 ldwx,s $acc9($tbl),$acc9
762 _srm $t3,24,$acc12
763 ldwx,s $acc10($tbl),$acc10
764 _srm $t2,16,$acc13
765 ldwx,s $acc11($tbl),$acc11
766 _srm $t1,8,$acc14
767 ldwx,s $acc12($tbl),$acc12
768 _srm $t0,0,$acc15
769 ldwx,s $acc13($tbl),$acc13
770 _ror $acc1,8,$acc1
771 ldwx,s $acc14($tbl),$acc14
772
773 _ror $acc2,16,$acc2
774 xor $acc0,$s0,$s0
775 ldwx,s $acc15($tbl),$acc15
776 _ror $acc3,24,$acc3
777 xor $acc1,$s0,$s0
778 ldw 16($key),$t0
779 _ror $acc5,8,$acc5
780 xor $acc2,$s0,$s0
781 ldw 20($key),$t1
782 _ror $acc6,16,$acc6
783 xor $acc3,$s0,$s0
784 ldw 24($key),$t2
785 _ror $acc7,24,$acc7
786 xor $acc4,$s1,$s1
787 ldw 28($key),$t3
788 _ror $acc9,8,$acc9
789 xor $acc5,$s1,$s1
790 ldw 1024+0($tbl),%r0 ; prefetch td4
791 _ror $acc10,16,$acc10
792 xor $acc6,$s1,$s1
793 ldw 1024+32($tbl),%r0 ; prefetch td4
794 _ror $acc11,24,$acc11
795 xor $acc7,$s1,$s1
796 ldw 1024+64($tbl),%r0 ; prefetch td4
797 _ror $acc13,8,$acc13
798 xor $acc8,$s2,$s2
799 ldw 1024+96($tbl),%r0 ; prefetch td4
800 _ror $acc14,16,$acc14
801 xor $acc9,$s2,$s2
802 ldw 1024+128($tbl),%r0 ; prefetch td4
803 _ror $acc15,24,$acc15
804 xor $acc10,$s2,$s2
805 ldw 1024+160($tbl),%r0 ; prefetch td4
806 _srm $s0,24,$acc0
807 xor $acc11,$s2,$s2
808 ldw 1024+192($tbl),%r0 ; prefetch td4
809 xor $acc12,$acc14,$acc14
810 xor $acc13,$s3,$s3
811 ldw 1024+224($tbl),%r0 ; prefetch td4
812 xor $acc14,$s3,$s3
813 xor $acc15,$s3,$s3
814 b L\$dec_loop
815 _srm $s3,16,$acc1
816
817 .ALIGN 16
818L\$dec_last
819 ldo 1024($tbl),$rounds
820 _ror $acc1,8,$acc1
821 xor $acc0,$t0,$t0
822 ldw 0($key),$s0
823 _ror $acc2,16,$acc2
824 xor $acc1,$t0,$t0
825 ldw 4($key),$s1
826 _ror $acc3,24,$acc3
827 xor $acc2,$t0,$t0
828 ldw 8($key),$s2
829 _ror $acc5,8,$acc5
830 xor $acc3,$t0,$t0
831 ldw 12($key),$s3
832 _ror $acc6,16,$acc6
833 xor $acc4,$t1,$t1
834 _ror $acc7,24,$acc7
835 xor $acc5,$t1,$t1
836 _ror $acc9,8,$acc9
837 xor $acc6,$t1,$t1
838 _ror $acc10,16,$acc10
839 xor $acc7,$t1,$t1
840 _ror $acc11,24,$acc11
841 xor $acc8,$t2,$t2
842 _ror $acc13,8,$acc13
843 xor $acc9,$t2,$t2
844 _ror $acc14,16,$acc14
845 xor $acc10,$t2,$t2
846 _ror $acc15,24,$acc15
847 xor $acc11,$t2,$t2
848 xor $acc12,$acc14,$acc14
849 xor $acc13,$t3,$t3
850 _srm $t0,24,$acc0
851 xor $acc14,$t3,$t3
852 xor $acc15,$t3,$t3
853 _srm $t3,16,$acc1
854
855 _srm $t2,8,$acc2
856 ldbx $acc0($rounds),$acc0
857 _srm $t1,24,$acc4
858 ldbx $acc1($rounds),$acc1
859 _srm $t0,16,$acc5
860 _srm $t1,0,$acc3
861 ldbx $acc2($rounds),$acc2
862 ldbx $acc3($rounds),$acc3
863 _srm $t3,8,$acc6
864 ldbx $acc4($rounds),$acc4
865 _srm $t2,24,$acc8
866 ldbx $acc5($rounds),$acc5
867 _srm $t1,16,$acc9
868 _srm $t2,0,$acc7
869 ldbx $acc6($rounds),$acc6
870 ldbx $acc7($rounds),$acc7
871 _srm $t0,8,$acc10
872 ldbx $acc8($rounds),$acc8
873 _srm $t3,24,$acc12
874 ldbx $acc9($rounds),$acc9
875 _srm $t2,16,$acc13
876 _srm $t3,0,$acc11
877 ldbx $acc10($rounds),$acc10
878 _srm $t1,8,$acc14
879 ldbx $acc11($rounds),$acc11
880 ldbx $acc12($rounds),$acc12
881 ldbx $acc13($rounds),$acc13
882 _srm $t0,0,$acc15
883 ldbx $acc14($rounds),$acc14
884
885 dep $acc0,7,8,$acc3
886 ldbx $acc15($rounds),$acc15
887 dep $acc4,7,8,$acc7
888 dep $acc1,15,8,$acc3
889 dep $acc5,15,8,$acc7
890 dep $acc2,23,8,$acc3
891 dep $acc6,23,8,$acc7
892 xor $acc3,$s0,$s0
893 xor $acc7,$s1,$s1
894 dep $acc8,7,8,$acc11
895 dep $acc12,7,8,$acc15
896 dep $acc9,15,8,$acc11
897 dep $acc13,15,8,$acc15
898 dep $acc10,23,8,$acc11
899 dep $acc14,23,8,$acc15
900 xor $acc11,$s2,$s2
901
902 bv (%r31)
903 .EXIT
904 xor $acc15,$s3,$s3
905 .PROCEND
906
907 .ALIGN 64
908L\$AES_Td
909 .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
910 .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
911 .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
912 .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
913 .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
914 .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
915 .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
916 .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
917 .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
918 .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
919 .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
920 .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
921 .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
922 .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
923 .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
924 .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
925 .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
926 .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
927 .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
928 .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
929 .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
930 .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
931 .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
932 .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
933 .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
934 .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
935 .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
936 .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
937 .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
938 .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
939 .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
940 .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
941 .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
942 .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
943 .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
944 .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
945 .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
946 .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
947 .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
948 .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
949 .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
950 .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
951 .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
952 .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
953 .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
954 .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
955 .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
956 .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
957 .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
958 .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
959 .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
960 .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
961 .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
962 .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
963 .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
964 .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
965 .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
966 .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
967 .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
968 .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
969 .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
970 .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
971 .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
972 .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
973 .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
974 .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
975 .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
976 .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
977 .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
978 .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
979 .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
980 .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
981 .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
982 .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
983 .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
984 .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
985 .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
986 .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
987 .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
988 .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
989 .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
990 .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
991 .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
992 .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
993 .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
994 .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
995 .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
996 .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
997 .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
998 .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
999 .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1000 .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1001 .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1002 .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1003 .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1004 .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1005 .STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
1006___
1007
1008foreach (split("\n",$code)) {
1009 s/\`([^\`]*)\`/eval $1/ge;
1010
1011 # translate made up instructons: _ror, _srm
1012 s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or
1013
1014 s/_srm(\s+%r[0-9]+),([0-9]+),/
1015 $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
1016 : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
1017
1018 s/,\*/,/ if ($SIZE_T==4);
1019 print $_,"\n";
1020}
1021close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl
deleted file mode 100644
index 7c52cbe5f9..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-ppc.pl
+++ /dev/null
@@ -1,1365 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Needs more work: key setup, CBC routine...
11#
12# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
13# 128-bit key, which is ~40% better than 64-bit code generated by gcc
14# 4.0. But these are not the ones currently used! Their "compact"
15# counterparts are, for security reason. ppc_AES_encrypt_compact runs
16# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact -
17# at 1/3 of ppc_AES_decrypt.
18
19# February 2010
20#
21# Rescheduling instructions to favour Power6 pipeline gave 10%
22# performance improvement on the platform in question (and marginal
23# improvement even on others). It should be noted that Power6 fails
24# to process a byte in 18 cycles, only in 23, because it fails to issue
25# 4 load instructions in two cycles, only in 3. As a result, non-compact
26# block subroutines are 25% slower than one would expect. Compact
27# functions scale better, because they have pure computational part,
28# which scales perfectly with clock frequency. To be specific
29# ppc_AES_encrypt_compact operates at 42 cycles per byte, while
30# ppc_AES_decrypt_compact - at 55 (in 64-bit build).
31
# First command-line argument selects the ABI "flavour" (e.g. linux32,
# linux64, aix64); it parameterizes pointer size and load/store mnemonics.
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;		# bytes per pointer / register save slot
	$LRSAVE	=2*$SIZE_T;	# link-register save offset in caller's frame
	$STU	="stdu";	# store-with-update (frame allocation)
	$POP	="ld";		# load a register-sized value
	$PUSH	="std";		# store a register-sized value
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }
47
# Locate the ppc-xlate.pl translator: first next to this script, then in
# ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe our output through the translator; the remaining argv element is the
# output target.  Low-precedence "or" is required here: with "|| die" the
# die bound to the (always-true) argument string expression, so a failed
# open() went completely undetected.
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=32*$SIZE_T;	# stack frame: 20 GPR save slots + linkage area
56
# Append each 32-bit constant to $code twice as a ".long" pair, producing
# the doubled-table layout the staggered table-base lookups rely on.
# The former empty "()" prototype was removed: it wrongly declared the sub
# as taking no arguments and was only tolerated because every call site
# uses the prototype-bypassing "&_data_word(...)" form.
sub _data_word
{ my $i;
	while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
}
61
# Symbolic names for the PPC general-purpose registers used by the
# generated code.
$sp="r1";	# stack pointer
$toc="r2";	# TOC pointer (saved/restored around the entry points)
$inp="r3";	# arg 1: input block
$out="r4";	# arg 2: output block
$key="r5";	# arg 3: key schedule

# Staggered copies of the lookup-table base, one per byte lane.
$Tbl0="r3";
$Tbl1="r6";
$Tbl2="r7";
$Tbl3="r2";

# AES state: four 32-bit columns.
$s0="r8";
$s1="r9";
$s2="r10";
$s3="r11";

# Round-key / scratch words.
$t0="r12";
$t1="r13";
$t2="r14";
$t3="r15";

# Sixteen accumulators for the per-byte table lookups.
$acc00="r16";
$acc01="r17";
$acc02="r18";
$acc03="r19";

$acc04="r20";
$acc05="r21";
$acc06="r22";
$acc07="r23";

$acc08="r24";
$acc09="r25";
$acc10="r26";
$acc11="r27";

$acc12="r28";
$acc13="r29";
$acc14="r30";
$acc15="r31";

# stay away from TLS pointer
# (the 64-bit ABI reserves r13, the 32-bit ABI reserves r2; remap whichever
# alias would collide onto r0 / the freed r12 instead)
if ($SIZE_T==8)	{ die if ($t1 ne "r13"); $t1="r0";  }
else		{ die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0"; }
$mask80=$Tbl2;	# 0x80808080 mask for the compact rounds; reuses $Tbl2
$mask1b=$Tbl3;	# 0x1b1b1b1b mask for the compact rounds; reuses $Tbl3
108
# Position-independent table addressing: LAES_Te / LAES_Td return in $Tbl0
# the address of the Te / Td tables laid out after them.  The
# "bcl 20,31,\$+4 ; mflr" pair obtains the current address, to which the
# backquoted compile-time offsets (distance to the first table entry) are
# added.  The heredoc body is emitted verbatim and must not be edited.
$code.=<<___;
.machine	"any"
.text

.align	7
LAES_Te:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$Tbl0	;    vvvvv "distance" between . and 1st data entry
	addi	$Tbl0,$Tbl0,`128-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
LAES_Td:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$Tbl0	; vvvvvvvv "distance" between . and 1st data entry
	addi	$Tbl0,$Tbl0,`128-64-8+2048+256`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`128-64-9*4`
___
135&_data_word(
136 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
137 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
138 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
139 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
140 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
141 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
142 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
143 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
144 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
145 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
146 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
147 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
148 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
149 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
150 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
151 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
152 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
153 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
154 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
155 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
156 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
157 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
158 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
159 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
160 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
161 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
162 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
163 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
164 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
165 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
166 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
167 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
168 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
169 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
170 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
171 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
172 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
173 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
174 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
175 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
176 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
177 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
178 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
179 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
180 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
181 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
182 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
183 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
184 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
185 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
186 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
187 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
188 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
189 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
190 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
191 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
192 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
193 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
194 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
195 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
196 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
197 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
198 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
199 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
200$code.=<<___;
201.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
202.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
203.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
204.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
205.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
206.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
207.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
208.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
209.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
210.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
211.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
212.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
213.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
214.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
215.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
216.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
217.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
218.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
219.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
220.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
221.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
222.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
223.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
224.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
225.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
226.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
227.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
228.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
229.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
230.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
231.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
232.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
233___
234&_data_word(
235 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
236 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
237 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
238 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
239 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
240 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
241 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
242 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
243 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
244 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
245 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
246 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
247 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
248 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
249 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
250 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
251 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
252 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
253 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
254 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
255 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
256 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
257 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
258 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
259 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
260 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
261 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
262 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
263 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
264 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
265 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
266 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
267 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
268 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
269 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
270 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
271 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
272 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
273 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
274 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
275 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
276 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
277 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
278 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
279 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
280 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
281 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
282 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
283 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
284 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
285 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
286 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
287 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
288 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
289 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
290 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
291 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
292 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
293 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
294 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
295 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
296 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
297 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
298 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
299$code.=<<___;
300.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
301.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
302.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
303.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
304.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
305.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
306.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
307.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
308.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
309.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
310.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
311.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
312.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
313.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
314.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
315.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
316.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
317.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
318.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
319.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
320.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
321.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
322.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
323.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
324.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
325.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
326.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
327.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
328.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
329.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
330.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
331.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
332
333
334.globl .AES_encrypt
335.align 7
336.AES_encrypt:
337 $STU $sp,-$FRAME($sp)
338 mflr r0
339
340 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
341 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
342 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
343 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
344 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
345 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
346 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
347 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
348 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
349 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
350 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
351 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
352 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
353 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
354 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
355 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
356 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
357 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
358 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
359 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
360 $PUSH r0,`$FRAME+$LRSAVE`($sp)
361
362 andi. $t0,$inp,3
363 andi. $t1,$out,3
364 or. $t0,$t0,$t1
365 bne Lenc_unaligned
366
367Lenc_unaligned_ok:
368 lwz $s0,0($inp)
369 lwz $s1,4($inp)
370 lwz $s2,8($inp)
371 lwz $s3,12($inp)
372 bl LAES_Te
373 bl Lppc_AES_encrypt_compact
374 stw $s0,0($out)
375 stw $s1,4($out)
376 stw $s2,8($out)
377 stw $s3,12($out)
378 b Lenc_done
379
380Lenc_unaligned:
381 subfic $t0,$inp,4096
382 subfic $t1,$out,4096
383 andi. $t0,$t0,4096-16
384 beq Lenc_xpage
385 andi. $t1,$t1,4096-16
386 bne Lenc_unaligned_ok
387
388Lenc_xpage:
389 lbz $acc00,0($inp)
390 lbz $acc01,1($inp)
391 lbz $acc02,2($inp)
392 lbz $s0,3($inp)
393 lbz $acc04,4($inp)
394 lbz $acc05,5($inp)
395 lbz $acc06,6($inp)
396 lbz $s1,7($inp)
397 lbz $acc08,8($inp)
398 lbz $acc09,9($inp)
399 lbz $acc10,10($inp)
400 insrwi $s0,$acc00,8,0
401 lbz $s2,11($inp)
402 insrwi $s1,$acc04,8,0
403 lbz $acc12,12($inp)
404 insrwi $s0,$acc01,8,8
405 lbz $acc13,13($inp)
406 insrwi $s1,$acc05,8,8
407 lbz $acc14,14($inp)
408 insrwi $s0,$acc02,8,16
409 lbz $s3,15($inp)
410 insrwi $s1,$acc06,8,16
411 insrwi $s2,$acc08,8,0
412 insrwi $s3,$acc12,8,0
413 insrwi $s2,$acc09,8,8
414 insrwi $s3,$acc13,8,8
415 insrwi $s2,$acc10,8,16
416 insrwi $s3,$acc14,8,16
417
418 bl LAES_Te
419 bl Lppc_AES_encrypt_compact
420
421 extrwi $acc00,$s0,8,0
422 extrwi $acc01,$s0,8,8
423 stb $acc00,0($out)
424 extrwi $acc02,$s0,8,16
425 stb $acc01,1($out)
426 stb $acc02,2($out)
427 extrwi $acc04,$s1,8,0
428 stb $s0,3($out)
429 extrwi $acc05,$s1,8,8
430 stb $acc04,4($out)
431 extrwi $acc06,$s1,8,16
432 stb $acc05,5($out)
433 stb $acc06,6($out)
434 extrwi $acc08,$s2,8,0
435 stb $s1,7($out)
436 extrwi $acc09,$s2,8,8
437 stb $acc08,8($out)
438 extrwi $acc10,$s2,8,16
439 stb $acc09,9($out)
440 stb $acc10,10($out)
441 extrwi $acc12,$s3,8,0
442 stb $s2,11($out)
443 extrwi $acc13,$s3,8,8
444 stb $acc12,12($out)
445 extrwi $acc14,$s3,8,16
446 stb $acc13,13($out)
447 stb $acc14,14($out)
448 stb $s3,15($out)
449
450Lenc_done:
451 $POP r0,`$FRAME+$LRSAVE`($sp)
452 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
453 $POP r13,`$FRAME-$SIZE_T*19`($sp)
454 $POP r14,`$FRAME-$SIZE_T*18`($sp)
455 $POP r15,`$FRAME-$SIZE_T*17`($sp)
456 $POP r16,`$FRAME-$SIZE_T*16`($sp)
457 $POP r17,`$FRAME-$SIZE_T*15`($sp)
458 $POP r18,`$FRAME-$SIZE_T*14`($sp)
459 $POP r19,`$FRAME-$SIZE_T*13`($sp)
460 $POP r20,`$FRAME-$SIZE_T*12`($sp)
461 $POP r21,`$FRAME-$SIZE_T*11`($sp)
462 $POP r22,`$FRAME-$SIZE_T*10`($sp)
463 $POP r23,`$FRAME-$SIZE_T*9`($sp)
464 $POP r24,`$FRAME-$SIZE_T*8`($sp)
465 $POP r25,`$FRAME-$SIZE_T*7`($sp)
466 $POP r26,`$FRAME-$SIZE_T*6`($sp)
467 $POP r27,`$FRAME-$SIZE_T*5`($sp)
468 $POP r28,`$FRAME-$SIZE_T*4`($sp)
469 $POP r29,`$FRAME-$SIZE_T*3`($sp)
470 $POP r30,`$FRAME-$SIZE_T*2`($sp)
471 $POP r31,`$FRAME-$SIZE_T*1`($sp)
472 mtlr r0
473 addi $sp,$sp,$FRAME
474 blr
475 .long 0
476 .byte 0,12,4,1,0x80,18,3,0
477 .long 0
478
479.align 5
480Lppc_AES_encrypt:
481 lwz $acc00,240($key)
482 addi $Tbl1,$Tbl0,3
483 lwz $t0,0($key)
484 addi $Tbl2,$Tbl0,2
485 lwz $t1,4($key)
486 addi $Tbl3,$Tbl0,1
487 lwz $t2,8($key)
488 addi $acc00,$acc00,-1
489 lwz $t3,12($key)
490 addi $key,$key,16
491 xor $s0,$s0,$t0
492 xor $s1,$s1,$t1
493 xor $s2,$s2,$t2
494 xor $s3,$s3,$t3
495 mtctr $acc00
496.align 4
497Lenc_loop:
498 rlwinm $acc00,$s0,`32-24+3`,21,28
499 rlwinm $acc01,$s1,`32-24+3`,21,28
500 rlwinm $acc02,$s2,`32-24+3`,21,28
501 rlwinm $acc03,$s3,`32-24+3`,21,28
502 lwz $t0,0($key)
503 rlwinm $acc04,$s1,`32-16+3`,21,28
504 lwz $t1,4($key)
505 rlwinm $acc05,$s2,`32-16+3`,21,28
506 lwz $t2,8($key)
507 rlwinm $acc06,$s3,`32-16+3`,21,28
508 lwz $t3,12($key)
509 rlwinm $acc07,$s0,`32-16+3`,21,28
510 lwzx $acc00,$Tbl0,$acc00
511 rlwinm $acc08,$s2,`32-8+3`,21,28
512 lwzx $acc01,$Tbl0,$acc01
513 rlwinm $acc09,$s3,`32-8+3`,21,28
514 lwzx $acc02,$Tbl0,$acc02
515 rlwinm $acc10,$s0,`32-8+3`,21,28
516 lwzx $acc03,$Tbl0,$acc03
517 rlwinm $acc11,$s1,`32-8+3`,21,28
518 lwzx $acc04,$Tbl1,$acc04
519 rlwinm $acc12,$s3,`0+3`,21,28
520 lwzx $acc05,$Tbl1,$acc05
521 rlwinm $acc13,$s0,`0+3`,21,28
522 lwzx $acc06,$Tbl1,$acc06
523 rlwinm $acc14,$s1,`0+3`,21,28
524 lwzx $acc07,$Tbl1,$acc07
525 rlwinm $acc15,$s2,`0+3`,21,28
526 lwzx $acc08,$Tbl2,$acc08
527 xor $t0,$t0,$acc00
528 lwzx $acc09,$Tbl2,$acc09
529 xor $t1,$t1,$acc01
530 lwzx $acc10,$Tbl2,$acc10
531 xor $t2,$t2,$acc02
532 lwzx $acc11,$Tbl2,$acc11
533 xor $t3,$t3,$acc03
534 lwzx $acc12,$Tbl3,$acc12
535 xor $t0,$t0,$acc04
536 lwzx $acc13,$Tbl3,$acc13
537 xor $t1,$t1,$acc05
538 lwzx $acc14,$Tbl3,$acc14
539 xor $t2,$t2,$acc06
540 lwzx $acc15,$Tbl3,$acc15
541 xor $t3,$t3,$acc07
542 xor $t0,$t0,$acc08
543 xor $t1,$t1,$acc09
544 xor $t2,$t2,$acc10
545 xor $t3,$t3,$acc11
546 xor $s0,$t0,$acc12
547 xor $s1,$t1,$acc13
548 xor $s2,$t2,$acc14
549 xor $s3,$t3,$acc15
550 addi $key,$key,16
551 bdnz- Lenc_loop
552
553 addi $Tbl2,$Tbl0,2048
554 nop
555 lwz $t0,0($key)
556 rlwinm $acc00,$s0,`32-24`,24,31
557 lwz $t1,4($key)
558 rlwinm $acc01,$s1,`32-24`,24,31
559 lwz $t2,8($key)
560 rlwinm $acc02,$s2,`32-24`,24,31
561 lwz $t3,12($key)
562 rlwinm $acc03,$s3,`32-24`,24,31
563 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
564 rlwinm $acc04,$s1,`32-16`,24,31
565 lwz $acc09,`2048+32`($Tbl0)
566 rlwinm $acc05,$s2,`32-16`,24,31
567 lwz $acc10,`2048+64`($Tbl0)
568 rlwinm $acc06,$s3,`32-16`,24,31
569 lwz $acc11,`2048+96`($Tbl0)
570 rlwinm $acc07,$s0,`32-16`,24,31
571 lwz $acc12,`2048+128`($Tbl0)
572 rlwinm $acc08,$s2,`32-8`,24,31
573 lwz $acc13,`2048+160`($Tbl0)
574 rlwinm $acc09,$s3,`32-8`,24,31
575 lwz $acc14,`2048+192`($Tbl0)
576 rlwinm $acc10,$s0,`32-8`,24,31
577 lwz $acc15,`2048+224`($Tbl0)
578 rlwinm $acc11,$s1,`32-8`,24,31
579 lbzx $acc00,$Tbl2,$acc00
580 rlwinm $acc12,$s3,`0`,24,31
581 lbzx $acc01,$Tbl2,$acc01
582 rlwinm $acc13,$s0,`0`,24,31
583 lbzx $acc02,$Tbl2,$acc02
584 rlwinm $acc14,$s1,`0`,24,31
585 lbzx $acc03,$Tbl2,$acc03
586 rlwinm $acc15,$s2,`0`,24,31
587 lbzx $acc04,$Tbl2,$acc04
588 rlwinm $s0,$acc00,24,0,7
589 lbzx $acc05,$Tbl2,$acc05
590 rlwinm $s1,$acc01,24,0,7
591 lbzx $acc06,$Tbl2,$acc06
592 rlwinm $s2,$acc02,24,0,7
593 lbzx $acc07,$Tbl2,$acc07
594 rlwinm $s3,$acc03,24,0,7
595 lbzx $acc08,$Tbl2,$acc08
596 rlwimi $s0,$acc04,16,8,15
597 lbzx $acc09,$Tbl2,$acc09
598 rlwimi $s1,$acc05,16,8,15
599 lbzx $acc10,$Tbl2,$acc10
600 rlwimi $s2,$acc06,16,8,15
601 lbzx $acc11,$Tbl2,$acc11
602 rlwimi $s3,$acc07,16,8,15
603 lbzx $acc12,$Tbl2,$acc12
604 rlwimi $s0,$acc08,8,16,23
605 lbzx $acc13,$Tbl2,$acc13
606 rlwimi $s1,$acc09,8,16,23
607 lbzx $acc14,$Tbl2,$acc14
608 rlwimi $s2,$acc10,8,16,23
609 lbzx $acc15,$Tbl2,$acc15
610 rlwimi $s3,$acc11,8,16,23
611 or $s0,$s0,$acc12
612 or $s1,$s1,$acc13
613 or $s2,$s2,$acc14
614 or $s3,$s3,$acc15
615 xor $s0,$s0,$t0
616 xor $s1,$s1,$t1
617 xor $s2,$s2,$t2
618 xor $s3,$s3,$t3
619 blr
620 .long 0
621 .byte 0,12,0x14,0,0,0,0,0
622
623.align 4
624Lppc_AES_encrypt_compact:
625 lwz $acc00,240($key)
626 addi $Tbl1,$Tbl0,2048
627 lwz $t0,0($key)
628 lis $mask80,0x8080
629 lwz $t1,4($key)
630 lis $mask1b,0x1b1b
631 lwz $t2,8($key)
632 ori $mask80,$mask80,0x8080
633 lwz $t3,12($key)
634 ori $mask1b,$mask1b,0x1b1b
635 addi $key,$key,16
636 mtctr $acc00
637.align 4
638Lenc_compact_loop:
639 xor $s0,$s0,$t0
640 xor $s1,$s1,$t1
641 rlwinm $acc00,$s0,`32-24`,24,31
642 xor $s2,$s2,$t2
643 rlwinm $acc01,$s1,`32-24`,24,31
644 xor $s3,$s3,$t3
645 rlwinm $acc02,$s2,`32-24`,24,31
646 rlwinm $acc03,$s3,`32-24`,24,31
647 rlwinm $acc04,$s1,`32-16`,24,31
648 rlwinm $acc05,$s2,`32-16`,24,31
649 rlwinm $acc06,$s3,`32-16`,24,31
650 rlwinm $acc07,$s0,`32-16`,24,31
651 lbzx $acc00,$Tbl1,$acc00
652 rlwinm $acc08,$s2,`32-8`,24,31
653 lbzx $acc01,$Tbl1,$acc01
654 rlwinm $acc09,$s3,`32-8`,24,31
655 lbzx $acc02,$Tbl1,$acc02
656 rlwinm $acc10,$s0,`32-8`,24,31
657 lbzx $acc03,$Tbl1,$acc03
658 rlwinm $acc11,$s1,`32-8`,24,31
659 lbzx $acc04,$Tbl1,$acc04
660 rlwinm $acc12,$s3,`0`,24,31
661 lbzx $acc05,$Tbl1,$acc05
662 rlwinm $acc13,$s0,`0`,24,31
663 lbzx $acc06,$Tbl1,$acc06
664 rlwinm $acc14,$s1,`0`,24,31
665 lbzx $acc07,$Tbl1,$acc07
666 rlwinm $acc15,$s2,`0`,24,31
667 lbzx $acc08,$Tbl1,$acc08
668 rlwinm $s0,$acc00,24,0,7
669 lbzx $acc09,$Tbl1,$acc09
670 rlwinm $s1,$acc01,24,0,7
671 lbzx $acc10,$Tbl1,$acc10
672 rlwinm $s2,$acc02,24,0,7
673 lbzx $acc11,$Tbl1,$acc11
674 rlwinm $s3,$acc03,24,0,7
675 lbzx $acc12,$Tbl1,$acc12
676 rlwimi $s0,$acc04,16,8,15
677 lbzx $acc13,$Tbl1,$acc13
678 rlwimi $s1,$acc05,16,8,15
679 lbzx $acc14,$Tbl1,$acc14
680 rlwimi $s2,$acc06,16,8,15
681 lbzx $acc15,$Tbl1,$acc15
682 rlwimi $s3,$acc07,16,8,15
683 rlwimi $s0,$acc08,8,16,23
684 rlwimi $s1,$acc09,8,16,23
685 rlwimi $s2,$acc10,8,16,23
686 rlwimi $s3,$acc11,8,16,23
687 lwz $t0,0($key)
688 or $s0,$s0,$acc12
689 lwz $t1,4($key)
690 or $s1,$s1,$acc13
691 lwz $t2,8($key)
692 or $s2,$s2,$acc14
693 lwz $t3,12($key)
694 or $s3,$s3,$acc15
695
696 addi $key,$key,16
697 bdz Lenc_compact_done
698
699 and $acc00,$s0,$mask80 # r1=r0&0x80808080
700 and $acc01,$s1,$mask80
701 and $acc02,$s2,$mask80
702 and $acc03,$s3,$mask80
703 srwi $acc04,$acc00,7 # r1>>7
704 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
705 srwi $acc05,$acc01,7
706 andc $acc09,$s1,$mask80
707 srwi $acc06,$acc02,7
708 andc $acc10,$s2,$mask80
709 srwi $acc07,$acc03,7
710 andc $acc11,$s3,$mask80
711 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
712 sub $acc01,$acc01,$acc05
713 sub $acc02,$acc02,$acc06
714 sub $acc03,$acc03,$acc07
715 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
716 add $acc09,$acc09,$acc09
717 add $acc10,$acc10,$acc10
718 add $acc11,$acc11,$acc11
719 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
720 and $acc01,$acc01,$mask1b
721 and $acc02,$acc02,$mask1b
722 and $acc03,$acc03,$mask1b
723 xor $acc00,$acc00,$acc08 # r2
724 xor $acc01,$acc01,$acc09
725 rotlwi $acc12,$s0,16 # ROTATE(r0,16)
726 xor $acc02,$acc02,$acc10
727 rotlwi $acc13,$s1,16
728 xor $acc03,$acc03,$acc11
729 rotlwi $acc14,$s2,16
730
731 xor $s0,$s0,$acc00 # r0^r2
732 rotlwi $acc15,$s3,16
733 xor $s1,$s1,$acc01
734 rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
735 xor $s2,$s2,$acc02
736 rotrwi $s1,$s1,24
737 xor $s3,$s3,$acc03
738 rotrwi $s2,$s2,24
739 xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
740 rotrwi $s3,$s3,24
741 xor $s1,$s1,$acc01
742 xor $s2,$s2,$acc02
743 xor $s3,$s3,$acc03
744 rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
745 xor $s0,$s0,$acc12 #
746 rotlwi $acc09,$acc13,8
747 xor $s1,$s1,$acc13
748 rotlwi $acc10,$acc14,8
749 xor $s2,$s2,$acc14
750 rotlwi $acc11,$acc15,8
751 xor $s3,$s3,$acc15
752 xor $s0,$s0,$acc08 #
753 xor $s1,$s1,$acc09
754 xor $s2,$s2,$acc10
755 xor $s3,$s3,$acc11
756
757 b Lenc_compact_loop
758.align 4
759Lenc_compact_done:
760 xor $s0,$s0,$t0
761 xor $s1,$s1,$t1
762 xor $s2,$s2,$t2
763 xor $s3,$s3,$t3
764 blr
765 .long 0
766 .byte 0,12,0x14,0,0,0,0,0
767
768.globl .AES_decrypt
769.align 7
770.AES_decrypt:
771 $STU $sp,-$FRAME($sp)
772 mflr r0
773
774 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
775 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
776 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
777 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
778 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
779 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
780 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
781 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
782 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
783 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
784 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
785 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
786 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
787 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
788 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
789 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
790 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
791 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
792 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
793 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
794 $PUSH r0,`$FRAME+$LRSAVE`($sp)
795
796 andi. $t0,$inp,3
797 andi. $t1,$out,3
798 or. $t0,$t0,$t1
799 bne Ldec_unaligned
800
801Ldec_unaligned_ok:
802 lwz $s0,0($inp)
803 lwz $s1,4($inp)
804 lwz $s2,8($inp)
805 lwz $s3,12($inp)
806 bl LAES_Td
807 bl Lppc_AES_decrypt_compact
808 stw $s0,0($out)
809 stw $s1,4($out)
810 stw $s2,8($out)
811 stw $s3,12($out)
812 b Ldec_done
813
814Ldec_unaligned:
815 subfic $t0,$inp,4096
816 subfic $t1,$out,4096
817 andi. $t0,$t0,4096-16
818 beq Ldec_xpage
819 andi. $t1,$t1,4096-16
820 bne Ldec_unaligned_ok
821
822Ldec_xpage:
823 lbz $acc00,0($inp)
824 lbz $acc01,1($inp)
825 lbz $acc02,2($inp)
826 lbz $s0,3($inp)
827 lbz $acc04,4($inp)
828 lbz $acc05,5($inp)
829 lbz $acc06,6($inp)
830 lbz $s1,7($inp)
831 lbz $acc08,8($inp)
832 lbz $acc09,9($inp)
833 lbz $acc10,10($inp)
834 insrwi $s0,$acc00,8,0
835 lbz $s2,11($inp)
836 insrwi $s1,$acc04,8,0
837 lbz $acc12,12($inp)
838 insrwi $s0,$acc01,8,8
839 lbz $acc13,13($inp)
840 insrwi $s1,$acc05,8,8
841 lbz $acc14,14($inp)
842 insrwi $s0,$acc02,8,16
843 lbz $s3,15($inp)
844 insrwi $s1,$acc06,8,16
845 insrwi $s2,$acc08,8,0
846 insrwi $s3,$acc12,8,0
847 insrwi $s2,$acc09,8,8
848 insrwi $s3,$acc13,8,8
849 insrwi $s2,$acc10,8,16
850 insrwi $s3,$acc14,8,16
851
852 bl LAES_Td
853 bl Lppc_AES_decrypt_compact
854
855 extrwi $acc00,$s0,8,0
856 extrwi $acc01,$s0,8,8
857 stb $acc00,0($out)
858 extrwi $acc02,$s0,8,16
859 stb $acc01,1($out)
860 stb $acc02,2($out)
861 extrwi $acc04,$s1,8,0
862 stb $s0,3($out)
863 extrwi $acc05,$s1,8,8
864 stb $acc04,4($out)
865 extrwi $acc06,$s1,8,16
866 stb $acc05,5($out)
867 stb $acc06,6($out)
868 extrwi $acc08,$s2,8,0
869 stb $s1,7($out)
870 extrwi $acc09,$s2,8,8
871 stb $acc08,8($out)
872 extrwi $acc10,$s2,8,16
873 stb $acc09,9($out)
874 stb $acc10,10($out)
875 extrwi $acc12,$s3,8,0
876 stb $s2,11($out)
877 extrwi $acc13,$s3,8,8
878 stb $acc12,12($out)
879 extrwi $acc14,$s3,8,16
880 stb $acc13,13($out)
881 stb $acc14,14($out)
882 stb $s3,15($out)
883
884Ldec_done:
885 $POP r0,`$FRAME+$LRSAVE`($sp)
886 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
887 $POP r13,`$FRAME-$SIZE_T*19`($sp)
888 $POP r14,`$FRAME-$SIZE_T*18`($sp)
889 $POP r15,`$FRAME-$SIZE_T*17`($sp)
890 $POP r16,`$FRAME-$SIZE_T*16`($sp)
891 $POP r17,`$FRAME-$SIZE_T*15`($sp)
892 $POP r18,`$FRAME-$SIZE_T*14`($sp)
893 $POP r19,`$FRAME-$SIZE_T*13`($sp)
894 $POP r20,`$FRAME-$SIZE_T*12`($sp)
895 $POP r21,`$FRAME-$SIZE_T*11`($sp)
896 $POP r22,`$FRAME-$SIZE_T*10`($sp)
897 $POP r23,`$FRAME-$SIZE_T*9`($sp)
898 $POP r24,`$FRAME-$SIZE_T*8`($sp)
899 $POP r25,`$FRAME-$SIZE_T*7`($sp)
900 $POP r26,`$FRAME-$SIZE_T*6`($sp)
901 $POP r27,`$FRAME-$SIZE_T*5`($sp)
902 $POP r28,`$FRAME-$SIZE_T*4`($sp)
903 $POP r29,`$FRAME-$SIZE_T*3`($sp)
904 $POP r30,`$FRAME-$SIZE_T*2`($sp)
905 $POP r31,`$FRAME-$SIZE_T*1`($sp)
906 mtlr r0
907 addi $sp,$sp,$FRAME
908 blr
909 .long 0
910 .byte 0,12,4,1,0x80,18,3,0
911 .long 0
912
913.align 5
914Lppc_AES_decrypt:
915 lwz $acc00,240($key)
916 addi $Tbl1,$Tbl0,3
917 lwz $t0,0($key)
918 addi $Tbl2,$Tbl0,2
919 lwz $t1,4($key)
920 addi $Tbl3,$Tbl0,1
921 lwz $t2,8($key)
922 addi $acc00,$acc00,-1
923 lwz $t3,12($key)
924 addi $key,$key,16
925 xor $s0,$s0,$t0
926 xor $s1,$s1,$t1
927 xor $s2,$s2,$t2
928 xor $s3,$s3,$t3
929 mtctr $acc00
930.align 4
931Ldec_loop:
932 rlwinm $acc00,$s0,`32-24+3`,21,28
933 rlwinm $acc01,$s1,`32-24+3`,21,28
934 rlwinm $acc02,$s2,`32-24+3`,21,28
935 rlwinm $acc03,$s3,`32-24+3`,21,28
936 lwz $t0,0($key)
937 rlwinm $acc04,$s3,`32-16+3`,21,28
938 lwz $t1,4($key)
939 rlwinm $acc05,$s0,`32-16+3`,21,28
940 lwz $t2,8($key)
941 rlwinm $acc06,$s1,`32-16+3`,21,28
942 lwz $t3,12($key)
943 rlwinm $acc07,$s2,`32-16+3`,21,28
944 lwzx $acc00,$Tbl0,$acc00
945 rlwinm $acc08,$s2,`32-8+3`,21,28
946 lwzx $acc01,$Tbl0,$acc01
947 rlwinm $acc09,$s3,`32-8+3`,21,28
948 lwzx $acc02,$Tbl0,$acc02
949 rlwinm $acc10,$s0,`32-8+3`,21,28
950 lwzx $acc03,$Tbl0,$acc03
951 rlwinm $acc11,$s1,`32-8+3`,21,28
952 lwzx $acc04,$Tbl1,$acc04
953 rlwinm $acc12,$s1,`0+3`,21,28
954 lwzx $acc05,$Tbl1,$acc05
955 rlwinm $acc13,$s2,`0+3`,21,28
956 lwzx $acc06,$Tbl1,$acc06
957 rlwinm $acc14,$s3,`0+3`,21,28
958 lwzx $acc07,$Tbl1,$acc07
959 rlwinm $acc15,$s0,`0+3`,21,28
960 lwzx $acc08,$Tbl2,$acc08
961 xor $t0,$t0,$acc00
962 lwzx $acc09,$Tbl2,$acc09
963 xor $t1,$t1,$acc01
964 lwzx $acc10,$Tbl2,$acc10
965 xor $t2,$t2,$acc02
966 lwzx $acc11,$Tbl2,$acc11
967 xor $t3,$t3,$acc03
968 lwzx $acc12,$Tbl3,$acc12
969 xor $t0,$t0,$acc04
970 lwzx $acc13,$Tbl3,$acc13
971 xor $t1,$t1,$acc05
972 lwzx $acc14,$Tbl3,$acc14
973 xor $t2,$t2,$acc06
974 lwzx $acc15,$Tbl3,$acc15
975 xor $t3,$t3,$acc07
976 xor $t0,$t0,$acc08
977 xor $t1,$t1,$acc09
978 xor $t2,$t2,$acc10
979 xor $t3,$t3,$acc11
980 xor $s0,$t0,$acc12
981 xor $s1,$t1,$acc13
982 xor $s2,$t2,$acc14
983 xor $s3,$t3,$acc15
984 addi $key,$key,16
985 bdnz- Ldec_loop
986
987 addi $Tbl2,$Tbl0,2048
988 nop
989 lwz $t0,0($key)
990 rlwinm $acc00,$s0,`32-24`,24,31
991 lwz $t1,4($key)
992 rlwinm $acc01,$s1,`32-24`,24,31
993 lwz $t2,8($key)
994 rlwinm $acc02,$s2,`32-24`,24,31
995 lwz $t3,12($key)
996 rlwinm $acc03,$s3,`32-24`,24,31
997 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
998 rlwinm $acc04,$s3,`32-16`,24,31
999 lwz $acc09,`2048+32`($Tbl0)
1000 rlwinm $acc05,$s0,`32-16`,24,31
1001 lwz $acc10,`2048+64`($Tbl0)
1002 lbzx $acc00,$Tbl2,$acc00
1003 lwz $acc11,`2048+96`($Tbl0)
1004 lbzx $acc01,$Tbl2,$acc01
1005 lwz $acc12,`2048+128`($Tbl0)
1006 rlwinm $acc06,$s1,`32-16`,24,31
1007 lwz $acc13,`2048+160`($Tbl0)
1008 rlwinm $acc07,$s2,`32-16`,24,31
1009 lwz $acc14,`2048+192`($Tbl0)
1010 rlwinm $acc08,$s2,`32-8`,24,31
1011 lwz $acc15,`2048+224`($Tbl0)
1012 rlwinm $acc09,$s3,`32-8`,24,31
1013 lbzx $acc02,$Tbl2,$acc02
1014 rlwinm $acc10,$s0,`32-8`,24,31
1015 lbzx $acc03,$Tbl2,$acc03
1016 rlwinm $acc11,$s1,`32-8`,24,31
1017 lbzx $acc04,$Tbl2,$acc04
1018 rlwinm $acc12,$s1,`0`,24,31
1019 lbzx $acc05,$Tbl2,$acc05
1020 rlwinm $acc13,$s2,`0`,24,31
1021 lbzx $acc06,$Tbl2,$acc06
1022 rlwinm $acc14,$s3,`0`,24,31
1023 lbzx $acc07,$Tbl2,$acc07
1024 rlwinm $acc15,$s0,`0`,24,31
1025 lbzx $acc08,$Tbl2,$acc08
1026 rlwinm $s0,$acc00,24,0,7
1027 lbzx $acc09,$Tbl2,$acc09
1028 rlwinm $s1,$acc01,24,0,7
1029 lbzx $acc10,$Tbl2,$acc10
1030 rlwinm $s2,$acc02,24,0,7
1031 lbzx $acc11,$Tbl2,$acc11
1032 rlwinm $s3,$acc03,24,0,7
1033 lbzx $acc12,$Tbl2,$acc12
1034 rlwimi $s0,$acc04,16,8,15
1035 lbzx $acc13,$Tbl2,$acc13
1036 rlwimi $s1,$acc05,16,8,15
1037 lbzx $acc14,$Tbl2,$acc14
1038 rlwimi $s2,$acc06,16,8,15
1039 lbzx $acc15,$Tbl2,$acc15
1040 rlwimi $s3,$acc07,16,8,15
1041 rlwimi $s0,$acc08,8,16,23
1042 rlwimi $s1,$acc09,8,16,23
1043 rlwimi $s2,$acc10,8,16,23
1044 rlwimi $s3,$acc11,8,16,23
1045 or $s0,$s0,$acc12
1046 or $s1,$s1,$acc13
1047 or $s2,$s2,$acc14
1048 or $s3,$s3,$acc15
1049 xor $s0,$s0,$t0
1050 xor $s1,$s1,$t1
1051 xor $s2,$s2,$t2
1052 xor $s3,$s3,$t3
1053 blr
1054 .long 0
1055 .byte 0,12,0x14,0,0,0,0,0
1056
1057.align 4
1058Lppc_AES_decrypt_compact:
1059 lwz $acc00,240($key)
1060 addi $Tbl1,$Tbl0,2048
1061 lwz $t0,0($key)
1062 lis $mask80,0x8080
1063 lwz $t1,4($key)
1064 lis $mask1b,0x1b1b
1065 lwz $t2,8($key)
1066 ori $mask80,$mask80,0x8080
1067 lwz $t3,12($key)
1068 ori $mask1b,$mask1b,0x1b1b
1069 addi $key,$key,16
1070___
1071$code.=<<___ if ($SIZE_T==8);
1072 insrdi $mask80,$mask80,32,0
1073 insrdi $mask1b,$mask1b,32,0
1074___
1075$code.=<<___;
1076 mtctr $acc00
1077.align 4
1078Ldec_compact_loop:
1079 xor $s0,$s0,$t0
1080 xor $s1,$s1,$t1
1081 rlwinm $acc00,$s0,`32-24`,24,31
1082 xor $s2,$s2,$t2
1083 rlwinm $acc01,$s1,`32-24`,24,31
1084 xor $s3,$s3,$t3
1085 rlwinm $acc02,$s2,`32-24`,24,31
1086 rlwinm $acc03,$s3,`32-24`,24,31
1087 rlwinm $acc04,$s3,`32-16`,24,31
1088 rlwinm $acc05,$s0,`32-16`,24,31
1089 rlwinm $acc06,$s1,`32-16`,24,31
1090 rlwinm $acc07,$s2,`32-16`,24,31
1091 lbzx $acc00,$Tbl1,$acc00
1092 rlwinm $acc08,$s2,`32-8`,24,31
1093 lbzx $acc01,$Tbl1,$acc01
1094 rlwinm $acc09,$s3,`32-8`,24,31
1095 lbzx $acc02,$Tbl1,$acc02
1096 rlwinm $acc10,$s0,`32-8`,24,31
1097 lbzx $acc03,$Tbl1,$acc03
1098 rlwinm $acc11,$s1,`32-8`,24,31
1099 lbzx $acc04,$Tbl1,$acc04
1100 rlwinm $acc12,$s1,`0`,24,31
1101 lbzx $acc05,$Tbl1,$acc05
1102 rlwinm $acc13,$s2,`0`,24,31
1103 lbzx $acc06,$Tbl1,$acc06
1104 rlwinm $acc14,$s3,`0`,24,31
1105 lbzx $acc07,$Tbl1,$acc07
1106 rlwinm $acc15,$s0,`0`,24,31
1107 lbzx $acc08,$Tbl1,$acc08
1108 rlwinm $s0,$acc00,24,0,7
1109 lbzx $acc09,$Tbl1,$acc09
1110 rlwinm $s1,$acc01,24,0,7
1111 lbzx $acc10,$Tbl1,$acc10
1112 rlwinm $s2,$acc02,24,0,7
1113 lbzx $acc11,$Tbl1,$acc11
1114 rlwinm $s3,$acc03,24,0,7
1115 lbzx $acc12,$Tbl1,$acc12
1116 rlwimi $s0,$acc04,16,8,15
1117 lbzx $acc13,$Tbl1,$acc13
1118 rlwimi $s1,$acc05,16,8,15
1119 lbzx $acc14,$Tbl1,$acc14
1120 rlwimi $s2,$acc06,16,8,15
1121 lbzx $acc15,$Tbl1,$acc15
1122 rlwimi $s3,$acc07,16,8,15
1123 rlwimi $s0,$acc08,8,16,23
1124 rlwimi $s1,$acc09,8,16,23
1125 rlwimi $s2,$acc10,8,16,23
1126 rlwimi $s3,$acc11,8,16,23
1127 lwz $t0,0($key)
1128 or $s0,$s0,$acc12
1129 lwz $t1,4($key)
1130 or $s1,$s1,$acc13
1131 lwz $t2,8($key)
1132 or $s2,$s2,$acc14
1133 lwz $t3,12($key)
1134 or $s3,$s3,$acc15
1135
1136 addi $key,$key,16
1137 bdz Ldec_compact_done
1138___
1139$code.=<<___ if ($SIZE_T==8);
1140 # vectorized permutation improves decrypt performance by 10%
1141 insrdi $s0,$s1,32,0
1142 insrdi $s2,$s3,32,0
1143
1144 and $acc00,$s0,$mask80 # r1=r0&0x80808080
1145 and $acc02,$s2,$mask80
1146 srdi $acc04,$acc00,7 # r1>>7
1147 srdi $acc06,$acc02,7
1148 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
1149 andc $acc10,$s2,$mask80
1150 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
1151 sub $acc02,$acc02,$acc06
1152 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
1153 add $acc10,$acc10,$acc10
1154 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1155 and $acc02,$acc02,$mask1b
1156 xor $acc00,$acc00,$acc08 # r2
1157 xor $acc02,$acc02,$acc10
1158
1159 and $acc04,$acc00,$mask80 # r1=r2&0x80808080
1160 and $acc06,$acc02,$mask80
1161 srdi $acc08,$acc04,7 # r1>>7
1162 srdi $acc10,$acc06,7
1163 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
1164 andc $acc14,$acc02,$mask80
1165 sub $acc04,$acc04,$acc08 # r1-(r1>>7)
1166 sub $acc06,$acc06,$acc10
1167 add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
1168 add $acc14,$acc14,$acc14
1169 and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1170 and $acc06,$acc06,$mask1b
1171 xor $acc04,$acc04,$acc12 # r4
1172 xor $acc06,$acc06,$acc14
1173
1174 and $acc08,$acc04,$mask80 # r1=r4&0x80808080
1175 and $acc10,$acc06,$mask80
1176 srdi $acc12,$acc08,7 # r1>>7
1177 srdi $acc14,$acc10,7
1178 sub $acc08,$acc08,$acc12 # r1-(r1>>7)
1179 sub $acc10,$acc10,$acc14
1180 andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
1181 andc $acc14,$acc06,$mask80
1182 add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
1183 add $acc14,$acc14,$acc14
1184 and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1185 and $acc10,$acc10,$mask1b
1186 xor $acc08,$acc08,$acc12 # r8
1187 xor $acc10,$acc10,$acc14
1188
1189 xor $acc00,$acc00,$s0 # r2^r0
1190 xor $acc02,$acc02,$s2
1191 xor $acc04,$acc04,$s0 # r4^r0
1192 xor $acc06,$acc06,$s2
1193
1194 extrdi $acc01,$acc00,32,0
1195 extrdi $acc03,$acc02,32,0
1196 extrdi $acc05,$acc04,32,0
1197 extrdi $acc07,$acc06,32,0
1198 extrdi $acc09,$acc08,32,0
1199 extrdi $acc11,$acc10,32,0
1200___
1201$code.=<<___ if ($SIZE_T==4);
1202 and $acc00,$s0,$mask80 # r1=r0&0x80808080
1203 and $acc01,$s1,$mask80
1204 and $acc02,$s2,$mask80
1205 and $acc03,$s3,$mask80
1206 srwi $acc04,$acc00,7 # r1>>7
1207 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
1208 srwi $acc05,$acc01,7
1209 andc $acc09,$s1,$mask80
1210 srwi $acc06,$acc02,7
1211 andc $acc10,$s2,$mask80
1212 srwi $acc07,$acc03,7
1213 andc $acc11,$s3,$mask80
1214 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
1215 sub $acc01,$acc01,$acc05
1216 sub $acc02,$acc02,$acc06
1217 sub $acc03,$acc03,$acc07
1218 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
1219 add $acc09,$acc09,$acc09
1220 add $acc10,$acc10,$acc10
1221 add $acc11,$acc11,$acc11
1222 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1223 and $acc01,$acc01,$mask1b
1224 and $acc02,$acc02,$mask1b
1225 and $acc03,$acc03,$mask1b
1226 xor $acc00,$acc00,$acc08 # r2
1227 xor $acc01,$acc01,$acc09
1228 xor $acc02,$acc02,$acc10
1229 xor $acc03,$acc03,$acc11
1230
1231 and $acc04,$acc00,$mask80 # r1=r2&0x80808080
1232 and $acc05,$acc01,$mask80
1233 and $acc06,$acc02,$mask80
1234 and $acc07,$acc03,$mask80
1235 srwi $acc08,$acc04,7 # r1>>7
1236 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
1237 srwi $acc09,$acc05,7
1238 andc $acc13,$acc01,$mask80
1239 srwi $acc10,$acc06,7
1240 andc $acc14,$acc02,$mask80
1241 srwi $acc11,$acc07,7
1242 andc $acc15,$acc03,$mask80
1243 sub $acc04,$acc04,$acc08 # r1-(r1>>7)
1244 sub $acc05,$acc05,$acc09
1245 sub $acc06,$acc06,$acc10
1246 sub $acc07,$acc07,$acc11
1247 add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
1248 add $acc13,$acc13,$acc13
1249 add $acc14,$acc14,$acc14
1250 add $acc15,$acc15,$acc15
1251 and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1252 and $acc05,$acc05,$mask1b
1253 and $acc06,$acc06,$mask1b
1254 and $acc07,$acc07,$mask1b
1255 xor $acc04,$acc04,$acc12 # r4
1256 xor $acc05,$acc05,$acc13
1257 xor $acc06,$acc06,$acc14
1258 xor $acc07,$acc07,$acc15
1259
1260 and $acc08,$acc04,$mask80 # r1=r4&0x80808080
1261 and $acc09,$acc05,$mask80
1262 srwi $acc12,$acc08,7 # r1>>7
1263 and $acc10,$acc06,$mask80
1264 srwi $acc13,$acc09,7
1265 and $acc11,$acc07,$mask80
1266 srwi $acc14,$acc10,7
1267 sub $acc08,$acc08,$acc12 # r1-(r1>>7)
1268 srwi $acc15,$acc11,7
1269 sub $acc09,$acc09,$acc13
1270 sub $acc10,$acc10,$acc14
1271 sub $acc11,$acc11,$acc15
1272 andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
1273 andc $acc13,$acc05,$mask80
1274 andc $acc14,$acc06,$mask80
1275 andc $acc15,$acc07,$mask80
1276 add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
1277 add $acc13,$acc13,$acc13
1278 add $acc14,$acc14,$acc14
1279 add $acc15,$acc15,$acc15
1280 and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1281 and $acc09,$acc09,$mask1b
1282 and $acc10,$acc10,$mask1b
1283 and $acc11,$acc11,$mask1b
1284 xor $acc08,$acc08,$acc12 # r8
1285 xor $acc09,$acc09,$acc13
1286 xor $acc10,$acc10,$acc14
1287 xor $acc11,$acc11,$acc15
1288
1289 xor $acc00,$acc00,$s0 # r2^r0
1290 xor $acc01,$acc01,$s1
1291 xor $acc02,$acc02,$s2
1292 xor $acc03,$acc03,$s3
1293 xor $acc04,$acc04,$s0 # r4^r0
1294 xor $acc05,$acc05,$s1
1295 xor $acc06,$acc06,$s2
1296 xor $acc07,$acc07,$s3
1297___
1298$code.=<<___;
1299 rotrwi $s0,$s0,8 # = ROTATE(r0,8)
1300 rotrwi $s1,$s1,8
1301 xor $s0,$s0,$acc00 # ^= r2^r0
1302 rotrwi $s2,$s2,8
1303 xor $s1,$s1,$acc01
1304 rotrwi $s3,$s3,8
1305 xor $s2,$s2,$acc02
1306 xor $s3,$s3,$acc03
1307 xor $acc00,$acc00,$acc08
1308 xor $acc01,$acc01,$acc09
1309 xor $acc02,$acc02,$acc10
1310 xor $acc03,$acc03,$acc11
1311 xor $s0,$s0,$acc04 # ^= r4^r0
1312 rotrwi $acc00,$acc00,24
1313 xor $s1,$s1,$acc05
1314 rotrwi $acc01,$acc01,24
1315 xor $s2,$s2,$acc06
1316 rotrwi $acc02,$acc02,24
1317 xor $s3,$s3,$acc07
1318 rotrwi $acc03,$acc03,24
1319 xor $acc04,$acc04,$acc08
1320 xor $acc05,$acc05,$acc09
1321 xor $acc06,$acc06,$acc10
1322 xor $acc07,$acc07,$acc11
1323 xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
1324 rotrwi $acc04,$acc04,16
1325 xor $s1,$s1,$acc09
1326 rotrwi $acc05,$acc05,16
1327 xor $s2,$s2,$acc10
1328 rotrwi $acc06,$acc06,16
1329 xor $s3,$s3,$acc11
1330 rotrwi $acc07,$acc07,16
1331 xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
1332 rotrwi $acc08,$acc08,8
1333 xor $s1,$s1,$acc01
1334 rotrwi $acc09,$acc09,8
1335 xor $s2,$s2,$acc02
1336 rotrwi $acc10,$acc10,8
1337 xor $s3,$s3,$acc03
1338 rotrwi $acc11,$acc11,8
1339 xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
1340 xor $s1,$s1,$acc05
1341 xor $s2,$s2,$acc06
1342 xor $s3,$s3,$acc07
1343 xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
1344 xor $s1,$s1,$acc09
1345 xor $s2,$s2,$acc10
1346 xor $s3,$s3,$acc11
1347
1348 b Ldec_compact_loop
1349.align 4
1350Ldec_compact_done:
1351 xor $s0,$s0,$t0
1352 xor $s1,$s1,$t1
1353 xor $s2,$s2,$t2
1354 xor $s3,$s3,$t3
1355 blr
1356 .long 0
1357 .byte 0,12,0x14,0,0,0,0,0
1358
1359.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
1360.align 7
1361___
1362
# Post-process the accumulated assembler template: evaluate every
# back-quoted `...` expression (constant arithmetic such as `32-8+3`
# used in shift/rotate operands) at script run time via an /e
# substitution, then print the finished assembler and close STDOUT so
# the output file is flushed.
1363$code =~ s/\`([^\`]*)\`/eval $1/gem;
1364print $code;
1365close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl
deleted file mode 100644
index 445a1e6762..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-s390x.pl
+++ /dev/null
@@ -1,2254 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for s390x.
11
12# April 2007.
13#
14# Software performance improvement over gcc-generated code is ~70% and
15# in absolute terms is ~73 cycles per byte processed with 128-bit key.
16# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
17# *strictly* in-order execution and issued instruction [in this case
18# load value from memory is critical] has to complete before execution
19# flow proceeds. S-boxes are compressed to 2KB[+256B].
20#
21# As for hardware acceleration support. It's basically a "teaser," as
22# it can and should be improved in several ways. Most notably support
23# for CBC is not utilized, nor multiple blocks are ever processed.
24# Then software key schedule can be postponed till hardware support
25# detection... Performance improvement over assembler is reportedly
26# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
27# support is implemented.
28
29# May 2007.
30#
31# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
32# for 128-bit keys, if hardware support is detected.
33
34# January 2009.
35#
36# Add support for hardware AES192/256 and reschedule instructions to
37# minimize/avoid Address Generation Interlock hazard and to favour
38# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
39# almost 50% on z9. The gain is smaller on z10, because being dual-
40# issue z10 makes it impossible to eliminate the interlock condition:
41# critical path is not long enough. Yet it spends ~24 cycles per byte
42# processed with 128-bit key.
43#
44# Unlike previous version hardware support detection takes place only
45# at the moment of key schedule setup, which is denoted in key->rounds.
46# This is done, because deferred key setup can't be made MT-safe, not
47# for keys longer than 128 bits.
48#
49# Add AES_cbc_encrypt, which gives incredible performance improvement,
50# it was measured to be ~6.6x. It's less than previously mentioned 8x,
51# because software implementation was optimized.
52
53# May 2010.
54#
55# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
56# performance improvement over "generic" counter mode routine relying
57# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
58# to the fact that exact throughput value depends on current stack
59# frame alignment within 4KB page. In worst case you get ~75% of the
60# maximum, but *on average* it would be as much as ~98%. Meaning that
61# worst case is unlikely, like hitting a ravine on a plateau.
62
63# November 2010.
64#
65# Adapt for -m31 build. If kernel supports what's called "highgprs"
66# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
67# instructions and achieve "64-bit" performance even in 31-bit legacy
68# application context. The feature is not specific to any particular
69# processor, as long as it's "z-CPU". Latter implies that the code
70# remains z/Architecture specific. On z990 it was measured to perform
71# 2x better than code generated by gcc 4.3.
72
73# December 2010.
74#
75# Add support for z196 "cipher message with counter" instruction.
76# Note however that it's disengaged, because it was measured to
77# perform ~12% worse than vanilla km-based code...
78
79# February 2011.
80#
81# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
82# instructions, which deliver ~70% improvement at 8KB block size over
83# vanilla km-based code, 37% - at most like 512-bytes block size.
84
# Command-line handling.  The first argument selects the ABI flavour:
# a "31"/"32" flavour builds for 31/32-bit mode, anything else for
# 64-bit z/Architecture.  $SIZE_T is the pointer/word size in bytes and
# $g the mnemonic suffix ("" -> lm/stm etc., "g" -> lmg/stmg etc.).
85$flavour = shift;
86
87if ($flavour =~ /3[12]/) {
88	$SIZE_T=4;
89	$g="";
90} else {
91	$SIZE_T=8;
92	$g="g";
93}
94
# Skip arguments until one looks like an output file name (word chars,
# then a dot and extension), and redirect STDOUT to it so all generated
# assembler lands in that file.
95while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
96open STDOUT,">$output";
97
# When non-zero, forces the pure-software AES path; 0 permits use of
# the CPU's hardware AES (km) instruction where detected.
98$softonly=0;	# allow hardware support
99
# Register allocation.  Several names deliberately alias the same
# register ($t0/$mask, $t2/$inp, $t3/$out/$bits): the scratch uses and
# the argument uses never overlap in time within a routine.
100$t0="%r0"; $mask="%r0";
101$t1="%r1";
102$t2="%r2"; $inp="%r2";
103$t3="%r3"; $out="%r3"; $bits="%r3";
104$key="%r4";
105$i1="%r5";
106$i2="%r6";
107$i3="%r7";
108$s0="%r8";
109$s1="%r9";
110$s2="%r10";
111$s3="%r11";
112$tbl="%r12";
113$rounds="%r13";
114$ra="%r14";
115$sp="%r15";
116
# Standard stack-frame size: 16 word-size register slots plus 4 eight-
# byte slots (96 bytes when $SIZE_T==4, 160 bytes when $SIZE_T==8).
117$stdframe=16*$SIZE_T+4*8;
118
# Append each 32-bit argument to the global $code as an assembler
# ".long" directive, emitting every value twice.  The duplicate word
# lets the round loops load the same table entry at byte displacements
# 0..3 to obtain byte-rotated copies (see the 0/1/2/3 offsets on the
# "l"/"x" table loads in _s390x_AES_encrypt/_s390x_AES_decrypt).
119sub _data_word()
120{ my $i;
121 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
122}
123
124$code=<<___;
125.text
126
127.type AES_Te,\@object
128.align 256
129AES_Te:
130___
131&_data_word(
132 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
133 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
134 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
135 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
136 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
137 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
138 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
139 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
140 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
141 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
142 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
143 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
144 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
145 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
146 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
147 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
148 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
149 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
150 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
151 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
152 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
153 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
154 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
155 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
156 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
157 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
158 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
159 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
160 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
161 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
162 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
163 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
164 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
165 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
166 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
167 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
168 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
169 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
170 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
171 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
172 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
173 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
174 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
175 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
176 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
177 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
178 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
179 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
180 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
181 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
182 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
183 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
184 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
185 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
186 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
187 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
188 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
189 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
190 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
191 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
192 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
193 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
194 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
195 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
196$code.=<<___;
197# Te4[256]
198.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
199.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
200.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
201.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
202.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
203.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
204.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
205.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
206.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
207.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
208.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
209.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
210.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
211.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
212.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
213.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
214.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
215.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
216.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
217.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
218.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
219.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
220.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
221.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
222.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
223.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
224.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
225.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
226.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
227.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
228.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
229.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
230# rcon[]
231.long 0x01000000, 0x02000000, 0x04000000, 0x08000000
232.long 0x10000000, 0x20000000, 0x40000000, 0x80000000
233.long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
234.align 256
235.size AES_Te,.-AES_Te
236
237# void AES_encrypt(const unsigned char *inp, unsigned char *out,
238# const AES_KEY *key) {
239.globl AES_encrypt
240.type AES_encrypt,\@function
241AES_encrypt:
242___
243$code.=<<___ if (!$softonly);
244 l %r0,240($key)
245 lhi %r1,16
246 clr %r0,%r1
247 jl .Lesoft
248
249 la %r1,0($key)
250 #la %r2,0($inp)
251 la %r4,0($out)
252 lghi %r3,16 # single block length
253 .long 0xb92e0042 # km %r4,%r2
254 brc 1,.-4 # can this happen?
255 br %r14
256.align 64
257.Lesoft:
258___
259$code.=<<___;
260 stm${g} %r3,$ra,3*$SIZE_T($sp)
261
262 llgf $s0,0($inp)
263 llgf $s1,4($inp)
264 llgf $s2,8($inp)
265 llgf $s3,12($inp)
266
267 larl $tbl,AES_Te
268 bras $ra,_s390x_AES_encrypt
269
270 l${g} $out,3*$SIZE_T($sp)
271 st $s0,0($out)
272 st $s1,4($out)
273 st $s2,8($out)
274 st $s3,12($out)
275
276 lm${g} %r6,$ra,6*$SIZE_T($sp)
277 br $ra
278.size AES_encrypt,.-AES_encrypt
279
280.type _s390x_AES_encrypt,\@function
281.align 16
282_s390x_AES_encrypt:
283 st${g} $ra,15*$SIZE_T($sp)
284 x $s0,0($key)
285 x $s1,4($key)
286 x $s2,8($key)
287 x $s3,12($key)
288 l $rounds,240($key)
289 llill $mask,`0xff<<3`
290 aghi $rounds,-1
291 j .Lenc_loop
292.align 16
293.Lenc_loop:
294 sllg $t1,$s0,`0+3`
295 srlg $t2,$s0,`8-3`
296 srlg $t3,$s0,`16-3`
297 srl $s0,`24-3`
298 nr $s0,$mask
299 ngr $t1,$mask
300 nr $t2,$mask
301 nr $t3,$mask
302
303 srlg $i1,$s1,`16-3` # i0
304 sllg $i2,$s1,`0+3`
305 srlg $i3,$s1,`8-3`
306 srl $s1,`24-3`
307 nr $i1,$mask
308 nr $s1,$mask
309 ngr $i2,$mask
310 nr $i3,$mask
311
312 l $s0,0($s0,$tbl) # Te0[s0>>24]
313 l $t1,1($t1,$tbl) # Te3[s0>>0]
314 l $t2,2($t2,$tbl) # Te2[s0>>8]
315 l $t3,3($t3,$tbl) # Te1[s0>>16]
316
317 x $s0,3($i1,$tbl) # Te1[s1>>16]
318 l $s1,0($s1,$tbl) # Te0[s1>>24]
319 x $t2,1($i2,$tbl) # Te3[s1>>0]
320 x $t3,2($i3,$tbl) # Te2[s1>>8]
321
322 srlg $i1,$s2,`8-3` # i0
323 srlg $i2,$s2,`16-3` # i1
324 nr $i1,$mask
325 nr $i2,$mask
326 sllg $i3,$s2,`0+3`
327 srl $s2,`24-3`
328 nr $s2,$mask
329 ngr $i3,$mask
330
331 xr $s1,$t1
332 srlg $ra,$s3,`8-3` # i1
333 sllg $t1,$s3,`0+3` # i0
334 nr $ra,$mask
335 la $key,16($key)
336 ngr $t1,$mask
337
338 x $s0,2($i1,$tbl) # Te2[s2>>8]
339 x $s1,3($i2,$tbl) # Te1[s2>>16]
340 l $s2,0($s2,$tbl) # Te0[s2>>24]
341 x $t3,1($i3,$tbl) # Te3[s2>>0]
342
343 srlg $i3,$s3,`16-3` # i2
344 xr $s2,$t2
345 srl $s3,`24-3`
346 nr $i3,$mask
347 nr $s3,$mask
348
349 x $s0,0($key)
350 x $s1,4($key)
351 x $s2,8($key)
352 x $t3,12($key)
353
354 x $s0,1($t1,$tbl) # Te3[s3>>0]
355 x $s1,2($ra,$tbl) # Te2[s3>>8]
356 x $s2,3($i3,$tbl) # Te1[s3>>16]
357 l $s3,0($s3,$tbl) # Te0[s3>>24]
358 xr $s3,$t3
359
360 brct $rounds,.Lenc_loop
361 .align 16
362
363 sllg $t1,$s0,`0+3`
364 srlg $t2,$s0,`8-3`
365 ngr $t1,$mask
366 srlg $t3,$s0,`16-3`
367 srl $s0,`24-3`
368 nr $s0,$mask
369 nr $t2,$mask
370 nr $t3,$mask
371
372 srlg $i1,$s1,`16-3` # i0
373 sllg $i2,$s1,`0+3`
374 ngr $i2,$mask
375 srlg $i3,$s1,`8-3`
376 srl $s1,`24-3`
377 nr $i1,$mask
378 nr $s1,$mask
379 nr $i3,$mask
380
381 llgc $s0,2($s0,$tbl) # Te4[s0>>24]
382 llgc $t1,2($t1,$tbl) # Te4[s0>>0]
383 sll $s0,24
384 llgc $t2,2($t2,$tbl) # Te4[s0>>8]
385 llgc $t3,2($t3,$tbl) # Te4[s0>>16]
386 sll $t2,8
387 sll $t3,16
388
389 llgc $i1,2($i1,$tbl) # Te4[s1>>16]
390 llgc $s1,2($s1,$tbl) # Te4[s1>>24]
391 llgc $i2,2($i2,$tbl) # Te4[s1>>0]
392 llgc $i3,2($i3,$tbl) # Te4[s1>>8]
393 sll $i1,16
394 sll $s1,24
395 sll $i3,8
396 or $s0,$i1
397 or $s1,$t1
398 or $t2,$i2
399 or $t3,$i3
400
401 srlg $i1,$s2,`8-3` # i0
402 srlg $i2,$s2,`16-3` # i1
403 nr $i1,$mask
404 nr $i2,$mask
405 sllg $i3,$s2,`0+3`
406 srl $s2,`24-3`
407 ngr $i3,$mask
408 nr $s2,$mask
409
410 sllg $t1,$s3,`0+3` # i0
411 srlg $ra,$s3,`8-3` # i1
412 ngr $t1,$mask
413
414 llgc $i1,2($i1,$tbl) # Te4[s2>>8]
415 llgc $i2,2($i2,$tbl) # Te4[s2>>16]
416 sll $i1,8
417 llgc $s2,2($s2,$tbl) # Te4[s2>>24]
418 llgc $i3,2($i3,$tbl) # Te4[s2>>0]
419 sll $i2,16
420 nr $ra,$mask
421 sll $s2,24
422 or $s0,$i1
423 or $s1,$i2
424 or $s2,$t2
425 or $t3,$i3
426
427 srlg $i3,$s3,`16-3` # i2
428 srl $s3,`24-3`
429 nr $i3,$mask
430 nr $s3,$mask
431
432 l $t0,16($key)
433 l $t2,20($key)
434
435 llgc $i1,2($t1,$tbl) # Te4[s3>>0]
436 llgc $i2,2($ra,$tbl) # Te4[s3>>8]
437 llgc $i3,2($i3,$tbl) # Te4[s3>>16]
438 llgc $s3,2($s3,$tbl) # Te4[s3>>24]
439 sll $i2,8
440 sll $i3,16
441 sll $s3,24
442 or $s0,$i1
443 or $s1,$i2
444 or $s2,$i3
445 or $s3,$t3
446
447 l${g} $ra,15*$SIZE_T($sp)
448 xr $s0,$t0
449 xr $s1,$t2
450 x $s2,24($key)
451 x $s3,28($key)
452
453 br $ra
454.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
455___
456
457$code.=<<___;
458.type AES_Td,\@object
459.align 256
460AES_Td:
461___
462&_data_word(
463 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
464 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
465 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
466 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
467 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
468 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
469 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
470 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
471 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
472 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
473 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
474 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
475 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
476 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
477 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
478 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
479 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
480 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
481 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
482 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
483 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
484 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
485 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
486 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
487 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
488 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
489 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
490 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
491 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
492 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
493 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
494 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
495 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
496 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
497 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
498 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
499 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
500 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
501 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
502 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
503 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
504 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
505 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
506 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
507 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
508 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
509 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
510 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
511 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
512 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
513 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
514 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
515 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
516 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
517 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
518 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
519 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
520 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
521 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
522 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
523 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
524 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
525 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
526 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
527$code.=<<___;
528# Td4[256]
529.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
530.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
531.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
532.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
533.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
534.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
535.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
536.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
537.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
538.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
539.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
540.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
541.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
542.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
543.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
544.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
545.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
546.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
547.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
548.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
549.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
550.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
551.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
552.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
553.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
554.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
555.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
556.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
557.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
558.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
559.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
560.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
561.size AES_Td,.-AES_Td
562
563# void AES_decrypt(const unsigned char *inp, unsigned char *out,
564# const AES_KEY *key) {
565.globl AES_decrypt
566.type AES_decrypt,\@function
567AES_decrypt:
568___
569$code.=<<___ if (!$softonly);
570 l %r0,240($key)
571 lhi %r1,16
572 clr %r0,%r1
573 jl .Ldsoft
574
575 la %r1,0($key)
576 #la %r2,0($inp)
577 la %r4,0($out)
578 lghi %r3,16 # single block length
579 .long 0xb92e0042 # km %r4,%r2
580 brc 1,.-4 # can this happen?
581 br %r14
582.align 64
583.Ldsoft:
584___
# Software AES_decrypt epilogue plus the shared one-block decryption core.
$code.=<<___;
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Td
	bras	$ra,_s390x_AES_decrypt

	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_decrypt,.-AES_decrypt

# Decrypt one block held in $s0..$s3 (big-endian words) using the round
# keys at $key and the tables at $tbl (AES_Td, with Td4 at offset 2048).
# Table indices are pre-scaled by 8 ($mask = 0xff<<3) so that byte offsets
# 0..3 within an 8-byte-strided entry select the four rotated Td views.
$code.=<<___;
.type	_s390x_AES_decrypt,\@function
.align	16
_s390x_AES_decrypt:
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Ldec_loop
.align	16
.Ldec_loop:
	srlg	$t1,$s0,`16-3`
	srlg	$t2,$s0,`8-3`
	sllg	$t3,$s0,`0+3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t1,$mask
	nr	$t2,$mask
	ngr	$t3,$mask

	sllg	$i1,$s1,`0+3`	# i0
	srlg	$i2,$s1,`16-3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	ngr	$i1,$mask
	nr	$s1,$mask
	nr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Td0[s0>>24]
	l	$t1,3($t1,$tbl)	# Td1[s0>>16]
	l	$t2,2($t2,$tbl)	# Td2[s0>>8]
	l	$t3,1($t3,$tbl)	# Td3[s0>>0]

	x	$s0,1($i1,$tbl)	# Td3[s1>>0]
	l	$s1,0($s1,$tbl)	# Td0[s1>>24]
	x	$t2,3($i2,$tbl)	# Td1[s1>>16]
	x	$t3,2($i3,$tbl)	# Td2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	sllg	$i2,$s2,`0+3`	# i1
	srlg	$i3,$s2,`16-3`
	srl	$s2,`24-3`
	nr	$i1,$mask
	ngr	$i2,$mask
	nr	$s2,$mask
	nr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	srlg	$t1,$s3,`16-3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	nr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Td2[s2>>8]
	x	$s1,1($i2,$tbl)	# Td3[s2>>0]
	l	$s2,0($s2,$tbl)	# Td0[s2>>24]
	x	$t3,3($i3,$tbl)	# Td1[s2>>16]

	sllg	$i3,$s3,`0+3`	# i2
	srl	$s3,`24-3`
	ngr	$i3,$mask
	nr	$s3,$mask

	xr	$s2,$t2
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,3($t1,$tbl)	# Td1[s3>>16]
	x	$s1,2($ra,$tbl)	# Td2[s3>>8]
	x	$s2,1($i3,$tbl)	# Td3[s3>>0]
	l	$s3,0($s3,$tbl)	# Td0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Ldec_loop
	.align	16

	# final round uses the byte table Td4 at $tbl+2048
	l	$t1,`2048+0`($tbl)	# prefetch Td4
	l	$t2,`2048+64`($tbl)
	l	$t3,`2048+128`($tbl)
	l	$i1,`2048+192`($tbl)
	llill	$mask,0xff

	srlg	$i3,$s0,24	# i0
	srlg	$t1,$s0,16
	srlg	$t2,$s0,8
	nr	$s0,$mask	# i3
	nr	$t1,$mask

	srlg	$i1,$s1,24
	nr	$t2,$mask
	srlg	$i2,$s1,16
	srlg	$ra,$s1,8
	nr	$s1,$mask	# i0
	nr	$i2,$mask
	nr	$ra,$mask

	llgc	$i3,2048($i3,$tbl)	# Td4[s0>>24]
	llgc	$t1,2048($t1,$tbl)	# Td4[s0>>16]
	llgc	$t2,2048($t2,$tbl)	# Td4[s0>>8]
	sll	$t1,16
	llgc	$t3,2048($s0,$tbl)	# Td4[s0>>0]
	sllg	$s0,$i3,24
	sll	$t2,8

	llgc	$s1,2048($s1,$tbl)	# Td4[s1>>0]
	llgc	$i1,2048($i1,$tbl)	# Td4[s1>>24]
	llgc	$i2,2048($i2,$tbl)	# Td4[s1>>16]
	sll	$i1,24
	llgc	$i3,2048($ra,$tbl)	# Td4[s1>>8]
	sll	$i2,16
	sll	$i3,8
	or	$s0,$s1
	or	$t1,$i1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,8	# i0
	srlg	$i2,$s2,24
	srlg	$i3,$s2,16
	nr	$s2,$mask	# i1
	nr	$i1,$mask
	nr	$i3,$mask
	llgc	$i1,2048($i1,$tbl)	# Td4[s2>>8]
	llgc	$s1,2048($s2,$tbl)	# Td4[s2>>0]
	llgc	$i2,2048($i2,$tbl)	# Td4[s2>>24]
	llgc	$i3,2048($i3,$tbl)	# Td4[s2>>16]
	sll	$i1,8
	sll	$i2,24
	or	$s0,$i1
	sll	$i3,16
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s3,16	# i0
	srlg	$i2,$s3,8	# i1
	srlg	$i3,$s3,24
	nr	$s3,$mask	# i2
	nr	$i1,$mask
	nr	$i2,$mask

	l${g}	$ra,15*$SIZE_T($sp)
	or	$s1,$t1
	l	$t0,16($key)
	l	$t1,20($key)

	llgc	$i1,2048($i1,$tbl)	# Td4[s3>>16]
	llgc	$i2,2048($i2,$tbl)	# Td4[s3>>8]
	sll	$i1,16
	llgc	$s2,2048($s3,$tbl)	# Td4[s3>>0]
	llgc	$s3,2048($i3,$tbl)	# Td4[s3>>24]
	sll	$i2,8
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$s3,$t3

	xr	$s0,$t0
	xr	$s1,$t1
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_decrypt,.-_s390x_AES_decrypt
___
778
# Key-setup entry: reject NULL input/key pointers (return -1) and any key
# length other than 128/192/256 bits (return -2), then fall through to
# .Lproceed.
$code.=<<___;
# void AES_set_encrypt_key(const unsigned char *in, int bits,
#		 AES_KEY *key) {
.globl	private_AES_set_encrypt_key
.type	private_AES_set_encrypt_key,\@function
.align	16
private_AES_set_encrypt_key:
_s390x_AES_set_encrypt_key:
	lghi	$t0,0
	cl${g}r	$inp,$t0
	je	.Lminus1		# NULL input -> -1
	cl${g}r	$key,$t0
	je	.Lminus1		# NULL key -> -1

	lghi	$t0,128
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,192
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,256
	clr	$bits,$t0
	je	.Lproceed
	lghi	%r2,-2			# unsupported key length
	br	%r14

.align	16
.Lproceed:
___
# Hardware key setup: if the CPU's KMC query confirms support for this key
# size, store the raw key material plus the km function code (18/19/20) at
# 240($key) instead of expanding a software key schedule.
$code.=<<___ if (!$softonly);
	# convert bits to km code, [128,192,256]->[18,19,20]
	lhi	%r5,-128
	lhi	%r0,18
	ar	%r5,$bits
	srl	%r5,6
	ar	%r5,%r0

	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,0(%r1)
	tmhl	%r0,0x4000	# check for message-security assist
	jz	.Lekey_internal

	lghi	%r0,0		# query capability vector
	la	%r1,16($sp)
	.long	0xb92f0042	# kmc %r4,%r2

	llihh	%r1,0x8000
	srlg	%r1,%r1,0(%r5)	# bit for this function code
	ng	%r1,16($sp)
	jz	.Lekey_internal	# code not supported -> software schedule

	lmg	%r0,%r1,0($inp)	# just copy 128 bits...
	stmg	%r0,%r1,0($key)
	lhi	%r0,192
	cr	$bits,%r0
	jl	1f
	lg	%r1,16($inp)
	stg	%r1,16($key)
	je	1f
	lg	%r1,24($inp)
	stg	%r1,24($key)
1:	st	$bits,236($key)	# save bits [for debugging purposes]
	lgr	$t0,%r5
	st	%r5,240($key)	# save km code
	lghi	%r2,0
	br	%r14
___
# Software key expansion: classic table-driven schedule using Te4 (byte
# S-box at AES_Te+2048) and the rcon table at AES_Te+2048+256, for 128-,
# 192- and 256-bit keys; also the AES_set_decrypt_key entry which first
# runs the encrypt-key schedule.
$code.=<<___;
.align	16
.Lekey_internal:
	stm${g}	%r4,%r13,4*$SIZE_T($sp)	# all non-volatile regs and $key

	larl	$tbl,AES_Te+2048

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)
	st	$s0,0($key)
	st	$s1,4($key)
	st	$s2,8($key)
	st	$s3,12($key)
	lghi	$t0,128
	cr	$bits,$t0
	jne	.Lnot128

	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,10
	st	$rounds,240($key)

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t2,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L128_loop:
	la	$t2,0($t2,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t2,2,0($t2)		# Te4[rk[3]>>0]<<8
	icm	$t2,4,0($i1)		# Te4[rk[3]>>8]<<16
	icm	$t2,8,0($i2)		# Te4[rk[3]>>16]<<24
	icm	$t2,1,0($i3)		# Te4[rk[3]>>24]
	x	$t2,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t2			# rk[4]=rk[0]^...
	xr	$s1,$s0			# rk[5]=rk[1]^rk[4]
	xr	$s2,$s1			# rk[6]=rk[2]^rk[5]
	xr	$s3,$s2			# rk[7]=rk[3]^rk[6]

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	nr	$t2,$mask
	nr	$i1,$mask
	srlg	$i3,$s3,24
	nr	$i2,$mask

	st	$s0,16($key)
	st	$s1,20($key)
	st	$s2,24($key)
	st	$s3,28($key)
	la	$key,16($key)		# key+=4
	la	$t3,4($t3)		# i++
	brct	$rounds,.L128_loop
	lghi	$t0,10
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.Lnot128:
	llgf	$t0,16($inp)
	llgf	$t1,20($inp)
	st	$t0,16($key)
	st	$t1,20($key)
	lghi	$t0,192
	cr	$bits,$t0
	jne	.Lnot192

	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,12
	st	$rounds,240($key)
	lghi	$rounds,8

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L192_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[5]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[5]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[5]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[5]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[6]=rk[0]^...
	xr	$s1,$s0			# rk[7]=rk[1]^rk[6]
	xr	$s2,$s1			# rk[8]=rk[2]^rk[7]
	xr	$s3,$s2			# rk[9]=rk[3]^rk[8]

	st	$s0,24($key)
	st	$s1,28($key)
	st	$s2,32($key)
	st	$s3,36($key)
	brct	$rounds,.L192_continue
	lghi	$t0,12
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.L192_continue:
	lgr	$t1,$s3
	x	$t1,16($key)		# rk[10]=rk[4]^rk[9]
	st	$t1,40($key)
	x	$t1,20($key)		# rk[11]=rk[5]^rk[10]
	st	$t1,44($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,24($key)		# key+=6
	la	$t3,4($t3)		# i++
	j	.L192_loop

.align	16
.Lnot192:
	llgf	$t0,24($inp)
	llgf	$t1,28($inp)
	st	$t0,24($key)
	st	$t1,28($key)
	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,14
	st	$rounds,240($key)
	lghi	$rounds,7

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L256_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[7]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[7]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[7]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[7]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[8]=rk[0]^...
	xr	$s1,$s0			# rk[9]=rk[1]^rk[8]
	xr	$s2,$s1			# rk[10]=rk[2]^rk[9]
	xr	$s3,$s2			# rk[11]=rk[3]^rk[10]
	st	$s0,32($key)
	st	$s1,36($key)
	st	$s2,40($key)
	st	$s3,44($key)
	brct	$rounds,.L256_continue
	lghi	$t0,14
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.L256_continue:
	lgr	$t1,$s3			# temp=rk[11]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	llgc	$t1,0($t1)		# Te4[rk[11]>>0]
	icm	$t1,2,0($i1)		# Te4[rk[11]>>8]<<8
	icm	$t1,4,0($i2)		# Te4[rk[11]>>16]<<16
	icm	$t1,8,0($i3)		# Te4[rk[11]>>24]<<24
	x	$t1,16($key)		# rk[12]=rk[4]^...
	st	$t1,48($key)
	x	$t1,20($key)		# rk[13]=rk[5]^rk[12]
	st	$t1,52($key)
	x	$t1,24($key)		# rk[14]=rk[6]^rk[13]
	st	$t1,56($key)
	x	$t1,28($key)		# rk[15]=rk[7]^rk[14]
	st	$t1,60($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,32($key)		# key+=8
	la	$t3,4($t3)		# i++
	j	.L256_loop

.Lminus1:
	lghi	%r2,-1
	br	$ra
.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key

# void AES_set_decrypt_key(const unsigned char *in, int bits,
#		 AES_KEY *key) {
.globl	private_AES_set_decrypt_key
.type	private_AES_set_decrypt_key,\@function
.align	16
private_AES_set_decrypt_key:
	#st${g}	$key,4*$SIZE_T($sp)	# I rely on AES_set_encrypt_key to
	st${g}	$ra,14*$SIZE_T($sp)	# save non-volatile registers and $key!
	bras	$ra,_s390x_AES_set_encrypt_key
	#l${g}	$key,4*$SIZE_T($sp)
	l${g}	$ra,14*$SIZE_T($sp)
	ltgr	%r2,%r2
	bnzr	$ra			# propagate error from set_encrypt_key
___
# Hardware decrypt-key shortcut: if set_encrypt_key produced a km function
# code (>=16) in $t0, just OR in the KM "decipher" modifier bit and return;
# no software key-schedule inversion is needed.
$code.=<<___ if (!$softonly);
	#l	$t0,240($key)
	lhi	$t1,16
	cr	$t0,$t1
	jl	.Lgo			# software schedule -> invert it
	oill	$t0,0x80	# set "decrypt" bit
	st	$t0,240($key)
	br	$ra
___
# Reverse the order of the round keys in place: swap 16-byte round-key
# entries from both ends of the schedule toward the middle.
$code.=<<___;
.align	16
.Lgo:	lgr	$rounds,$t0	#llgf	$rounds,240($key)
	la	$i1,0($key)
	sllg	$i2,$rounds,4
	la	$i2,0($i2,$key)	# $i2 -> last round key
	srl	$rounds,1
	lghi	$t1,-16

.align	16
.Linv:	lmg	$s0,$s1,0($i1)
	lmg	$s2,$s3,0($i2)
	stmg	$s0,$s1,0($i2)
	stmg	$s2,$s3,0($i1)
	la	$i1,16($i1)
	la	$i2,0($t1,$i2)	# $i2 -= 16
	brct	$rounds,.Linv
___
# Reuse scratch registers $i1..$i3 as the per-byte GF(2^8) masks needed by
# the InvMixColumns pass below.
$mask80=$i1;
$mask1b=$i2;
$maskfe=$i3;
# Apply InvMixColumns to all round keys except the first and last, one
# 32-bit word at a time, computing xtime chains tp2/tp4/tp8 with the
# 0x80/0x1b/0xfe byte masks (bit-sliced GF(2^8) doubling).
$code.=<<___;
	llgf	$rounds,240($key)
	aghi	$rounds,-1
	sll	$rounds,2	# (rounds-1)*4
	llilh	$mask80,0x8080
	llilh	$mask1b,0x1b1b
	llilh	$maskfe,0xfefe
	oill	$mask80,0x8080
	oill	$mask1b,0x1b1b
	oill	$maskfe,0xfefe

.align	16
.Lmix:	l	$s0,16($key)	# tp1
	lr	$s1,$s0
	ngr	$s1,$mask80
	srlg	$t1,$s1,7
	slr	$s1,$t1
	nr	$s1,$mask1b
	sllg	$t1,$s0,1
	nr	$t1,$maskfe
	xr	$s1,$t1		# tp2

	lr	$s2,$s1
	ngr	$s2,$mask80
	srlg	$t1,$s2,7
	slr	$s2,$t1
	nr	$s2,$mask1b
	sllg	$t1,$s1,1
	nr	$t1,$maskfe
	xr	$s2,$t1		# tp4

	lr	$s3,$s2
	ngr	$s3,$mask80
	srlg	$t1,$s3,7
	slr	$s3,$t1
	nr	$s3,$mask1b
	sllg	$t1,$s2,1
	nr	$t1,$maskfe
	xr	$s3,$t1		# tp8

	xr	$s1,$s0		# tp2^tp1
	xr	$s2,$s0		# tp4^tp1
	rll	$s0,$s0,24	# = ROTATE(tp1,8)
	xr	$s2,$s3		# ^=tp8
	xr	$s0,$s1		# ^=tp2^tp1
	xr	$s1,$s3		# tp2^tp1^tp8
	xr	$s0,$s2		# ^=tp4^tp1^tp8
	rll	$s1,$s1,8
	rll	$s2,$s2,16
	xr	$s0,$s1		# ^= ROTATE(tp8^tp2^tp1,24)
	rll	$s3,$s3,24
	xr	$s0,$s2		# ^= ROTATE(tp8^tp4^tp1,16)
	xr	$s0,$s3		# ^= ROTATE(tp8,8)

	st	$s0,16($key)
	la	$key,4($key)
	brct	$rounds,.Lmix

	lm${g}	%r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
	lghi	%r2,0
	br	$ra
.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___
1175
1176########################################################################
1177# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1178# size_t length, const AES_KEY *key,
1179# unsigned char *ivec, const int enc)
# AES_cbc_encrypt: KMC hardware path when available (with mvc/ex handling
# of a trailing partial block), otherwise software CBC loops built on
# _s390x_AES_encrypt/_s390x_AES_decrypt.
{
my $inp="%r2";
my $out="%r4";	# length and out are swapped
my $len="%r3";
my $key="%r5";
my $ivp="%r6";

$code.=<<___;
.globl	AES_cbc_encrypt
.type	AES_cbc_encrypt,\@function
.align	16
AES_cbc_encrypt:
	xgr	%r3,%r4		# flip %r3 and %r4, out and len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
$code.=<<___ if (!$softonly);
	lhi	%r0,16
	cl	%r0,240($key)
	jh	.Lcbc_software	# software key schedule -> software CBC

	lg	%r0,0($ivp)	# copy ivec
	lg	%r1,8($ivp)
	stmg	%r0,%r1,16($sp)
	lmg	%r0,%r1,0($key)	# copy key, cover 256 bit
	stmg	%r0,%r1,32($sp)
	lmg	%r0,%r1,16($key)
	stmg	%r0,%r1,48($sp)
	l	%r0,240($key)	# load kmc code
	lghi	$key,15		# res=len%16, len-=res;
	ngr	$key,$len
	sl${g}r	$len,$key
	la	%r1,16($sp)	# parameter block - ivec || key
	jz	.Lkmc_truncated
	.long	0xb92f0042	# kmc %r4,%r2
	brc	1,.-4		# pay attention to "partial completion"
	ltr	$key,$key
	jnz	.Lkmc_truncated
.Lkmc_done:
	lmg	%r0,%r1,16($sp)	# copy ivec to caller
	stg	%r0,0($ivp)
	stg	%r1,8($ivp)
	br	$ra
.align	16
.Lkmc_truncated:
	ahi	$key,-1		# it's the way it's encoded in mvc
	tmll	%r0,0x80
	jnz	.Lkmc_truncated_dec
	lghi	%r1,0
	stg	%r1,16*$SIZE_T($sp)
	stg	%r1,16*$SIZE_T+8($sp)
	bras	%r1,1f
	mvc	16*$SIZE_T(1,$sp),0($inp)
1:	ex	$key,0(%r1)	# copy residue to zero-padded buffer
	la	%r1,16($sp)	# restore parameter block
	la	$inp,16*$SIZE_T($sp)
	lghi	$len,16
	.long	0xb92f0042	# kmc %r4,%r2
	j	.Lkmc_done
.align	16
.Lkmc_truncated_dec:
	st${g}	$out,4*$SIZE_T($sp)
	la	$out,16*$SIZE_T($sp)
	lghi	$len,16
	.long	0xb92f0042	# kmc %r4,%r2
	l${g}	$out,4*$SIZE_T($sp)
	bras	%r1,2f
	mvc	0(1,$out),16*$SIZE_T($sp)
2:	ex	$key,0(%r1)	# copy decrypted residue to caller
	j	.Lkmc_done
.align	16
.Lcbc_software:
___
$code.=<<___;
	stm${g}	$key,$ra,5*$SIZE_T($sp)
	lhi	%r0,0
	cl	%r0,`$stdframe+$SIZE_T-4`($sp)	# test the "enc" argument
	je	.Lcbc_decrypt

	larl	$tbl,AES_Te

	llgf	$s0,0($ivp)
	llgf	$s1,4($ivp)
	llgf	$s2,8($ivp)
	llgf	$s3,12($ivp)

	lghi	$t0,16
	sl${g}r	$len,$t0
	brc	4,.Lcbc_enc_tail	# if borrow
.Lcbc_enc_loop:
	stm${g}	$inp,$out,2*$SIZE_T($sp)
	x	$s0,0($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	lgr	%r4,$key

	bras	$ra,_s390x_AES_encrypt

	lm${g}	$inp,$key,2*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	la	$inp,16($inp)
	la	$out,16($out)
	lghi	$t0,16
	lt${g}r	$len,$len
	jz	.Lcbc_enc_done
	sl${g}r	$len,$t0
	brc	4,.Lcbc_enc_tail	# if borrow
	j	.Lcbc_enc_loop
.align	16
.Lcbc_enc_done:
	l${g}	$ivp,6*$SIZE_T($sp)
	st	$s0,0($ivp)
	st	$s1,4($ivp)
	st	$s2,8($ivp)
	st	$s3,12($ivp)

	lm${g}	%r7,$ra,7*$SIZE_T($sp)
	br	$ra

.align	16
.Lcbc_enc_tail:
	aghi	$len,15
	lghi	$t0,0
	stg	$t0,16*$SIZE_T($sp)
	stg	$t0,16*$SIZE_T+8($sp)
	bras	$t1,3f
	mvc	16*$SIZE_T(1,$sp),0($inp)
3:	ex	$len,0($t1)	# zero-pad the residue to a full block
	lghi	$len,0
	la	$inp,16*$SIZE_T($sp)
	j	.Lcbc_enc_loop

.align	16
.Lcbc_decrypt:
	larl	$tbl,AES_Td

	lg	$t0,0($ivp)
	lg	$t1,8($ivp)
	stmg	$t0,$t1,16*$SIZE_T($sp)

.Lcbc_dec_loop:
	stm${g}	$inp,$out,2*$SIZE_T($sp)
	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)
	lgr	%r4,$key

	bras	$ra,_s390x_AES_decrypt

	lm${g}	$inp,$key,2*$SIZE_T($sp)
	sllg	$s0,$s0,32	# merge s0|s1 and s2|s3 into 64-bit pairs
	sllg	$s2,$s2,32
	lr	$s0,$s1
	lr	$s2,$s3

	lg	$t0,0($inp)	# this block's ciphertext = next IV
	lg	$t1,8($inp)
	xg	$s0,16*$SIZE_T($sp)
	xg	$s2,16*$SIZE_T+8($sp)
	lghi	$s1,16
	sl${g}r	$len,$s1
	brc	4,.Lcbc_dec_tail	# if borrow
	brc	2,.Lcbc_dec_done	# if zero
	stg	$s0,0($out)
	stg	$s2,8($out)
	stmg	$t0,$t1,16*$SIZE_T($sp)

	la	$inp,16($inp)
	la	$out,16($out)
	j	.Lcbc_dec_loop

.Lcbc_dec_done:
	stg	$s0,0($out)
	stg	$s2,8($out)
.Lcbc_dec_exit:
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	stmg	$t0,$t1,0($ivp)

	br	$ra

.align	16
.Lcbc_dec_tail:
	aghi	$len,15
	stg	$s0,16*$SIZE_T($sp)
	stg	$s2,16*$SIZE_T+8($sp)
	bras	$s1,4f
	mvc	0(1,$out),16*$SIZE_T($sp)
4:	ex	$len,0($s1)	# copy only the residue bytes to caller
	j	.Lcbc_dec_exit
.size	AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
1378########################################################################
1379# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1380# size_t blocks, const AES_KEY *key,
1381# const unsigned char *ivec)
# AES_ctr32_encrypt: hardware path pre-generates a buffer of ascending
# counter blocks on a freshly alloca'd page-top frame, encrypts them with
# KM and XORs against the input; software path encrypts one counter block
# per iteration with _s390x_AES_encrypt.  (A KMCTR variant exists below
# but is disabled -- measured ~12% slower.)
{
my $inp="%r2";
my $out="%r4";	# blocks and out are swapped
my $len="%r3";
my $key="%r5";	my $iv0="%r5";
my $ivp="%r6";
my $fp ="%r7";

$code.=<<___;
.globl	AES_ctr32_encrypt
.type	AES_ctr32_encrypt,\@function
.align	16
AES_ctr32_encrypt:
	xgr	%r3,%r4		# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
	llgfr	$len,$len	# safe in ctr32 subroutine even in 64-bit case
___
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lctr32_software

	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	slgr	$out,$inp
	la	%r1,0($key)	# %r1 is permanent copy of $key
	lg	$iv0,0($ivp)	# load ivec
	lg	$ivp,8($ivp)

	# prepare and allocate stack frame at the top of 4K page
	# with 1K reserved for eventual signal handling
	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
	lghi	$s1,-4096
	algr	$s0,$sp
	lgr	$fp,$sp
	ngr	$s0,$s1		# align at page boundary
	slgr	$fp,$s0		# total buffer size
	lgr	$s2,$sp
	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
	slgr	$fp,$s1		# deduct reservation to get usable buffer size
	# buffer size is at least 256 and at most 3072+256-16

	la	$sp,1024($s0)	# alloca
	srlg	$fp,$fp,4	# convert bytes to blocks, minimum 16
	st${g}	$s2,0($sp)	# back-chain
	st${g}	$fp,$SIZE_T($sp)

	slgr	$len,$fp
	brc	1,.Lctr32_hw_switch	# not zero, no borrow
	algr	$fp,$len	# input is shorter than allocated buffer
	lghi	$len,0
	st${g}	$fp,$SIZE_T($sp)

.Lctr32_hw_switch:
___
$code.=<<___ if (0);	######### kmctr code was measured to be ~12% slower
	larl	$s0,OPENSSL_s390xcap_P
	lg	$s0,8($s0)
	tmhh	$s0,0x0004	# check for message_security-assist-4
	jz	.Lctr32_km_loop

	llgfr	$s0,%r0
	lgr	$s1,%r1
	lghi	%r0,0
	la	%r1,16($sp)
	.long	0xb92d2042	# kmctr %r4,%r2,%r2

	llihh	%r0,0x8000	# check if kmctr supports the function code
	srlg	%r0,%r0,0($s0)
	ng	%r0,16($sp)
	lgr	%r0,$s0
	lgr	%r1,$s1
	jz	.Lctr32_km_loop

####### kmctr code
	algr	$out,$inp	# restore $out
	lgr	$s1,$len	# $s1 undertakes $len
	j	.Lctr32_kmctr_loop
.align	16
.Lctr32_kmctr_loop:
	la	$s2,16($sp)
	lgr	$s3,$fp
.Lctr32_kmctr_prepare:
	stg	$iv0,0($s2)
	stg	$ivp,8($s2)
	la	$s2,16($s2)
	ahi	$ivp,1		# 32-bit increment, preserves upper half
	brct	$s3,.Lctr32_kmctr_prepare

	#la	$inp,0($inp)	# inp
	sllg	$len,$fp,4	# len
	#la	$out,0($out)	# out
	la	$s2,16($sp)	# iv
	.long	0xb92da042	# kmctr $out,$s2,$inp
	brc	1,.-4		# pay attention to "partial completion"

	slgr	$s1,$fp
	brc	1,.Lctr32_kmctr_loop	# not zero, no borrow
	algr	$fp,$s1
	lghi	$s1,0
	brc	4+1,.Lctr32_kmctr_loop	# not zero

	l${g}	$sp,0($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
___
$code.=<<___;
.Lctr32_km_loop:
	la	$s2,16($sp)
	lgr	$s3,$fp
.Lctr32_km_prepare:
	stg	$iv0,0($s2)
	stg	$ivp,8($s2)
	la	$s2,16($s2)
	ahi	$ivp,1		# 32-bit increment, preserves upper half
	brct	$s3,.Lctr32_km_prepare

	la	$s0,16($sp)	# inp
	sllg	$s1,$fp,4	# len
	la	$s2,16($sp)	# out
	.long	0xb92e00a8	# km %r10,%r8
	brc	1,.-4		# pay attention to "partial completion"

	la	$s2,16($sp)
	lgr	$s3,$fp
	slgr	$s2,$inp
.Lctr32_km_xor:
	lg	$s0,0($inp)
	lg	$s1,8($inp)
	xg	$s0,0($s2,$inp)
	xg	$s1,8($s2,$inp)
	stg	$s0,0($out,$inp)
	stg	$s1,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lctr32_km_xor

	slgr	$len,$fp
	brc	1,.Lctr32_km_loop	# not zero, no borrow
	algr	$fp,$len
	lghi	$len,0
	brc	4+1,.Lctr32_km_loop	# not zero

	l${g}	$s0,0($sp)
	l${g}	$s1,$SIZE_T($sp)
	la	$s2,16($sp)
.Lctr32_km_zap:
	stg	$s0,0($s2)	# wipe key-stream buffer
	stg	$s0,8($s2)
	la	$s2,16($s2)
	brct	$s1,.Lctr32_km_zap

	la	$sp,0($s0)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lctr32_software:
___
$code.=<<___;
	stm${g}	$key,$ra,5*$SIZE_T($sp)
	sl${g}r	$inp,$out
	larl	$tbl,AES_Te
	llgf	$t1,12($ivp)	# 32-bit counter word

.Lctr32_loop:
	stm${g}	$inp,$out,2*$SIZE_T($sp)
	llgf	$s0,0($ivp)
	llgf	$s1,4($ivp)
	llgf	$s2,8($ivp)
	lgr	$s3,$t1
	st	$t1,16*$SIZE_T($sp)
	lgr	%r4,$key

	bras	$ra,_s390x_AES_encrypt

	lm${g}	$inp,$ivp,2*$SIZE_T($sp)
	llgf	$t1,16*$SIZE_T($sp)
	x	$s0,0($inp,$out)
	x	$s1,4($inp,$out)
	x	$s2,8($inp,$out)
	x	$s3,12($inp,$out)
	stm	$s0,$s3,0($out)

	la	$out,16($out)
	ahi	$t1,1		# 32-bit increment
	brct	$len,.Lctr32_loop

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_ctr32_encrypt,.-AES_ctr32_encrypt
___
}
1576
1577########################################################################
1578# void AES_xts_encrypt(const char *inp,char *out,size_t len,
1579# const AES_KEY *key1, const AES_KEY *key2,
1580# const unsigned char iv[16]);
1581#
1582{
1583my $inp="%r2";
1584my $out="%r4"; # len and out are swapped
1585my $len="%r3";
1586my $key1="%r5"; # $i1
1587my $key2="%r6"; # $i2
1588my $fp="%r7"; # $i3
1589my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
1590
# _s390x_xts_km: bulk KM worker shared by the XTS entry points.  First
# probe whether the CPU's KM supports the dedicated XTS function code
# (original code + 32); if so, hand the whole job to KM-XTS, otherwise
# fall through to the "vanilla" per-block tweak loop.
$code.=<<___;
.type	_s390x_xts_km,\@function
.align	16
_s390x_xts_km:
___
$code.=<<___ if(1);
	llgfr	$s0,%r0			# put aside the function code
	lghi	$s1,0x7f
	nr	$s1,%r0
	lghi	%r0,0			# query capability vector
	la	%r1,2*$SIZE_T($sp)
	.long	0xb92e0042		# km %r4,%r2
	llihh	%r1,0x8000
	srlg	%r1,%r1,32($s1)		# check for 32+function code
	ng	%r1,2*$SIZE_T($sp)
	lgr	%r0,$s0			# restore the function code
	la	%r1,0($key1)		# restore $key1
	jz	.Lxts_km_vanilla

	lmg	$i2,$i3,$tweak($sp)	# put aside the tweak value
	algr	$out,$inp

	oill	%r0,32			# switch to xts function code
	aghi	$s1,-18			#
	sllg	$s1,$s1,3		# (function code - 18)*8, 0 or 16
	la	%r1,$tweak-16($sp)
	slgr	%r1,$s1			# parameter block position
	lmg	$s0,$s3,0($key1)	# load 256 bits of key material,
	stmg	$s0,$s3,0(%r1)		# and copy it to parameter block.
					# yes, it contains junk and overlaps
					# with the tweak in 128-bit case.
					# it's done to avoid conditional
					# branch.
	stmg	$i2,$i3,$tweak($sp)	# "re-seat" the tweak value

	.long	0xb92e0042		# km %r4,%r2
	brc	1,.-4			# pay attention to "partial completion"

	lrvg	$s0,$tweak+0($sp)	# load the last tweak
	lrvg	$s1,$tweak+8($sp)
	stmg	%r0,%r3,$tweak-32(%r1)	# wipe copy of the key

	nill	%r0,0xffdf		# switch back to original function code
	la	%r1,0($key1)		# restore pointer to $key1
	slgr	$out,$inp

	llgc	$len,2*$SIZE_T-1($sp)
	nill	$len,0x0f		# $len%=16
	br	$ra

.align	16
.Lxts_km_vanilla:
___
# Vanilla XTS-via-KM path: allocate a page-top buffer, fill it with a
# vector of successive tweaks ($s0:$s1 held little-endian, multiplied by x
# in GF(2^128) with polynomial 0x87), XOR tweaks into the input, run plain
# KM over the batch, then XOR the tweaks back out.  On return, condition
# code is zero iff there is no 16-byte残 residue ($len%16==0); $s0:$s1
# holds the next tweak for ciphertext stealing.  Also emits the
# AES_xts_encrypt entry's argument un-swapping.
$code.=<<___;
	# prepare and allocate stack frame at the top of 4K page
	# with 1K reserved for eventual signal handling
	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
	lghi	$s1,-4096
	algr	$s0,$sp
	lgr	$fp,$sp
	ngr	$s0,$s1			# align at page boundary
	slgr	$fp,$s0			# total buffer size
	lgr	$s2,$sp
	lghi	$s1,1024+16		# sl[g]fi is extended-immediate facility
	slgr	$fp,$s1			# deduct reservation to get usable buffer size
	# buffer size is at least 256 and at most 3072+256-16

	la	$sp,1024($s0)		# alloca
	nill	$fp,0xfff0		# round to 16*n
	st${g}	$s2,0($sp)		# back-chain
	nill	$len,0xfff0		# redundant
	st${g}	$fp,$SIZE_T($sp)

	slgr	$len,$fp
	brc	1,.Lxts_km_go		# not zero, no borrow
	algr	$fp,$len		# input is shorter than allocated buffer
	lghi	$len,0
	st${g}	$fp,$SIZE_T($sp)

.Lxts_km_go:
	lrvg	$s0,$tweak+0($s2)	# load the tweak value in little-endian
	lrvg	$s1,$tweak+8($s2)

	la	$s2,16($sp)		# vector of ascending tweak values
	slgr	$s2,$inp
	srlg	$s3,$fp,4
	j	.Lxts_km_start

.Lxts_km_loop:
	la	$s2,16($sp)
	slgr	$s2,$inp
	srlg	$s3,$fp,4
.Lxts_km_prepare:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	srlg	$i2,$s0,63		# carry bit from lower half
	sllg	$s0,$s0,1
	sllg	$s1,$s1,1
	xgr	$s0,$i1
	ogr	$s1,$i2
.Lxts_km_start:
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	stg	$i1,0($s2,$inp)
	stg	$i2,8($s2,$inp)
	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lxts_km_prepare

	slgr	$inp,$fp		# rewind $inp
	la	$s2,0($out,$inp)
	lgr	$s3,$fp
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# pay attention to "partial completion"

	la	$s2,16($sp)
	slgr	$s2,$inp
	srlg	$s3,$fp,4
.Lxts_km_xor:
	lg	$i1,0($out,$inp)
	lg	$i2,8($out,$inp)
	xg	$i1,0($s2,$inp)
	xg	$i2,8($s2,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lxts_km_xor

	slgr	$len,$fp
	brc	1,.Lxts_km_loop		# not zero, no borrow
	algr	$fp,$len
	lghi	$len,0
	brc	4+1,.Lxts_km_loop	# not zero

	l${g}	$i1,0($sp)		# back-chain
	llgf	$fp,`2*$SIZE_T-4`($sp)	# bytes used
	la	$i2,16($sp)
	srlg	$fp,$fp,4
.Lxts_km_zap:
	stg	$i1,0($i2)		# wipe tweak vector
	stg	$i1,8($i2)
	la	$i2,16($i2)
	brct	$fp,.Lxts_km_zap

	la	$sp,0($i1)
	llgc	$len,2*$SIZE_T-1($i1)
	nill	$len,0x0f		# $len%=16
	bzr	$ra

	# generate one more tweak...
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	srlg	$i2,$s0,63		# carry bit from lower half
	sllg	$s0,$s0,1
	sllg	$s1,$s1,1
	xgr	$s0,$i1
	ogr	$s1,$i2

	ltr	$len,$len		# clear zero flag
	br	$ra
.size	_s390x_xts_km,.-_s390x_xts_km

.globl	AES_xts_encrypt
.type	AES_xts_encrypt,\@function
.align	16
AES_xts_encrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
# Normalize $len: zero-extend in 31-bit builds, keep a copy on the stack,
# convert to a block count, and return early for inputs below one block.
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	srag	$len,$len,4		# formally wrong, because it expands
					# sign byte, but who can afford asking
					# to process more than 2^63-1 bytes?
					# I use it, because it sets condition
					# code...
	bcr	8,$ra			# abort if zero (i.e. less than 16)
___
# AES_xts_encrypt hardware path: generate the tweak by KM-encrypting the
# iv with $key2, run the bulk through _s390x_xts_km with $key1, then do
# ciphertext stealing for a trailing partial block.
#
# Fix: the original epilogue wiped the tweak with two identical
# "st${g} $sp,$tweak($sp)" stores to the SAME offset, so $tweak+8..15 (and
# more in 31-bit builds, where st${g} is a 4-byte st) kept the secret
# tweak on the stack.  Wipe both 8-byte halves with unconditional stg --
# this KM path already uses 64-bit lg/stmg unconditionally.
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_enc_software

	stm${g}	%r6,$s3,6*$SIZE_T($sp)
	st${g}	$ra,14*$SIZE_T($sp)

	sllg	$len,$len,4		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed anymore
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore
	bras	$ra,_s390x_xts_km
	jz	.Lxts_enc_km_done	# no residue -> done

	aghi	$inp,-16		# take one step back
	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_enc_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_km_steal

	# encrypt the stolen block with the extra tweak in $s0:$s1
	la	$s2,0($i3)
	lghi	$s3,16
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($s2)
	xg	$i2,8($s2)
	stg	$i1,0($s2)
	stg	$i2,8($s2)
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($i3)
	xg	$i2,8($i3)
	stg	$i1,0($i3)
	stg	$i2,8($i3)

.Lxts_enc_km_done:
	l${g}	$ra,14*$SIZE_T($sp)
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_enc_software:
___
# AES_xts_encrypt software path: derive the tweak by encrypting the sector
# number with $key2, then per block multiply the tweak by x in GF(2^128)
# (polynomial 0x87), XOR-encrypt-XOR with $key1, and finish with
# ciphertext stealing for a trailing partial block.
#
# Fix: ".Lxts_enc_done" wiped the second half of the tweak with
# "stg $sp,$twesk+8($sp)" -- $twesk is an undefined Perl variable (typo
# for $tweak) which interpolates to nothing, so the store hit 8($sp) and
# the tweak's upper 8 bytes were never cleared from the stack.
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	slgr	$out,$inp

	xgr	$s0,$s0			# clear upper half
	xgr	$s1,$s1
	lrv	$s0,$stdframe+4($sp)	# load secno
	lrv	$s1,$stdframe+0($sp)
	xgr	$s2,$s2
	xgr	$s3,$s3
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	j	.Lxts_enc_enter

.align	16
.Lxts_enc_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	srlg	%r0,$s1,63		# carry bit from lower half
	sllg	$s1,$s1,1
	sllg	$s3,$s3,1
	xgr	$s1,%r1
	ogr	$s3,%r0
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
	la	$inp,16($inp)		# $inp+=16
.Lxts_enc_enter:
	x	$s0,0($inp)		# ^=*($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	brct${g}	$len,.Lxts_enc_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_enc_done

	la	$i3,0($inp,$out)	# put aside real $out
.Lxts_enc_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_steal
	la	$out,0($i3)		# restore real $out

	# generate last tweak...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	srlg	%r0,$s1,63		# carry bit from lower half
	sllg	$s1,$s1,1
	sllg	$s3,$s3,1
	xgr	$s1,%r1
	ogr	$s3,%r0
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($out)		# ^=*(inp)|stolen cipher-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,`$tweak+0`($sp)	# ^=tweak
	x	$s1,`$tweak+4`($sp)
	x	$s2,`$tweak+8`($sp)
	x	$s3,`$tweak+12`($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

.Lxts_enc_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_encrypt,.-AES_xts_encrypt
___
1958# void AES_xts_decrypt(const char *inp,char *out,size_t len,
1959# const AES_KEY *key1, const AES_KEY *key2,u64 secno);
1960#
$code.=<<___;
.globl	AES_xts_decrypt
.type	AES_xts_decrypt,\@function
.align	16
AES_xts_decrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3			# (three-XOR swap, no scratch register)
	xgr	%r3,%r4
___
$code.=<<___ if ($SIZE_T==4);	# 31-bit ABI: zero-extend 32-bit $len to 64 bits
	llgfr	$len,$len
___
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	aghi	$len,-16
	bcr	4,$ra			# abort if less than zero. formally
					# wrong, because $len is unsigned,
					# but who can afford asking to
					# process more than 2^63-1 bytes?
	tmll	$len,0x0f
	jnz	.Lxts_dec_proceed
	aghi	$len,16			# multiple of 16: no stealing, undo the -16
.Lxts_dec_proceed:
___
# Hardware (KM) path for AES_xts_decrypt.  Fix: .Lxts_dec_km_done used to
# store to $tweak($sp) twice with the SAME offset, so the second half of the
# on-stack tweak was never wiped; wipe $tweak+0 and $tweak+8 like the other
# *_done paths do.  (stg is safe here: this path already relies on
# z/Architecture-only instructions such as lghi/xg/lrvgr.)
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)		# <16 means software key schedule:
	lhi	%r1,16			# fall back to the table-driven path
	clr	%r0,%r1
	jl	.Lxts_dec_software

	stm${g}	%r6,$s3,6*$SIZE_T($sp)
	st${g}	$ra,14*$SIZE_T($sp)

	nill	$len,0xfff0		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed past this point
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore

	ltgr	$len,$len
	jz	.Lxts_dec_km_short
	bras	$ra,_s390x_xts_km
	jz	.Lxts_dec_km_done

	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1
	j	.Lxts_dec_km_2ndtweak

.Lxts_dec_km_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%=16
	lrvg	$s0,$tweak+0($sp)	# load the tweak
	lrvg	$s1,$tweak+8($sp)
	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1

.Lxts_dec_km_2ndtweak:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	srlg	$i2,$s0,63		# carry bit from lower half
	sllg	$s0,$s0,1
	sllg	$s1,$s1,1
	xgr	$s0,$i1
	ogr	$s1,$i2
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1

	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$i2,0($out,$inp)
	lghi	$i3,16
	.long	0xb92e0066		# km $i2,$i2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0
	lrvgr	$i2,$s1
	xg	$i1,0($out,$inp)
	xg	$i2,8($out,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_km_steal

	lgr	$s0,$s2
	lgr	$s1,$s3
	xg	$s0,0($i3)
	xg	$s1,8($i3)
	stg	$s0,0($i3)
	stg	$s1,8($i3)
	la	$s0,0($i3)
	lghi	$s1,16
	.long	0xb92e0088		# km $s0,$s0
	brc	1,.-4			# can this happen?
	xg	$s2,0($i3)
	xg	$s3,8($i3)
	stg	$s2,0($i3)
	stg	$s3,8($i3)
.Lxts_dec_km_done:
	l${g}	$ra,14*$SIZE_T($sp)
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_dec_software:
___
# Software path for AES_xts_decrypt: the tweak is generated with the key2
# schedule through _s390x_AES_encrypt, then advanced per block by a GF(2^128)
# multiply-by-x (reduction polynomial 0x87) on its little-endian halves.
# Fix: the final wipe read "$twesk+8" -- an undefined Perl variable that
# interpolates as an empty string, emitting "stg $sp,+8($sp)", which clobbers
# a frame slot and leaves the tweak's second half un-wiped.  Now "$tweak+8".
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	srlg	$len,$len,4
	slgr	$out,$inp

	xgr	$s0,$s0			# clear upper half
	xgr	$s1,$s1
	lrv	$s0,$stdframe+4($sp)	# load secno
	lrv	$s1,$stdframe+0($sp)
	xgr	$s2,$s2
	xgr	$s3,$s3
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	larl	$tbl,AES_Td
	lt${g}r	$len,$len
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	jz	.Lxts_dec_short
	j	.Lxts_dec_enter

.align	16
.Lxts_dec_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	srlg	%r0,$s1,63		# carry bit from lower half
	sllg	$s1,$s1,1
	sllg	$s3,$s3,1
	xgr	$s1,%r1
	ogr	$s3,%r0
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
.Lxts_dec_enter:
	x	$s0,0($inp)		# tweak^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	la	$inp,16($inp)
	brct${g}	$len,.Lxts_dec_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_dec_done

	# generate pair of tweaks...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	srlg	%r0,$s1,63		# carry bit from lower half
	sllg	$s1,$s1,1
	sllg	$s3,$s3,1
	xgr	$s1,%r1
	ogr	$s3,%r0
	lrvgr	$i2,$s1			# flip byte order
	lrvgr	$i3,$s3
	stmg	$i2,$i3,$tweak($sp)	# save the 1st tweak
	j	.Lxts_dec_2ndtweak

.align	16
.Lxts_dec_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
.Lxts_dec_2ndtweak:
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	srlg	%r0,$s1,63		# carry bit from lower half
	sllg	$s1,$s1,1
	sllg	$s3,$s3,1
	xgr	$s1,%r1
	ogr	$s3,%r0
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak-16+0($sp)	# save the 2nd tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak-16+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($inp)		# tweak_the_2nd^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak-16+0($sp)	# ^=tweak_the_2nd
	x	$s1,$tweak-16+4($sp)
	x	$s2,$tweak-16+8($sp)
	x	$s3,$tweak-16+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_steal
	la	$out,0($i3)		# restore real $out

	lm	$s0,$s3,$tweak($sp)	# load the 1st tweak
	x	$s0,0($out)		# tweak^=*(inp)|stolen cipher-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)
	stg	$sp,$tweak-16+0($sp)	# wipe 2nd tweak
	stg	$sp,$tweak-16+8($sp)
.Lxts_dec_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_decrypt,.-AES_xts_decrypt
___
2246}
# Ident string plus the OPENSSL_s390xcap_P common block (16 bytes, 8-aligned).
$code.=<<___;
.string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm	OPENSSL_s390xcap_P,16,8
___
2251
# Evaluate every backquoted span (e.g. `2*$SIZE_T-1`) into a plain constant,
# then emit the finished assembly on stdout.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;	# force flush
diff --git a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
deleted file mode 100755
index 403c4d1290..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
+++ /dev/null
@@ -1,1182 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# Version 1.1
10#
11# The major reason for undertaken effort was to mitigate the hazard of
12# cache-timing attack. This is [currently and initially!] addressed in
13# two ways. 1. S-boxes are compressed from 5KB to 2KB+256B size each.
14# 2. References to them are scheduled for L2 cache latency, meaning
15# that the tables don't have to reside in L1 cache. Once again, this
16# is an initial draft and one should expect more countermeasures to
17# be implemented...
18#
19# Version 1.1 prefetches T[ed]4 in order to mitigate attack on last
20# round.
21#
22# Even though performance was not the primary goal [on the contrary,
23# extra shifts "induced" by compressed S-box and longer loop epilogue
24# "induced" by scheduling for L2 have negative effect on performance],
25# the code turned out to run in ~23 cycles per processed byte en-/
26# decrypted with 128-bit key. This is pretty good result for code
27# with mentioned qualities and UltraSPARC core. Compared to Sun C
28# generated code my encrypt procedure runs just few percents faster,
29# while decrypt one - whole 50% faster [yes, Sun C failed to generate
30# optimal decrypt procedure]. Compared to GNU C generated code both
31# procedures are more than 60% faster:-)
32
# Select the 32- or 64-bit SPARC ABI from the compiler flags on the command
# line; the 64-bit V9 ABI uses a 2047-byte stack bias and a larger frame.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }	# 64-bit: biased stack
else		{ $bias=0; $frame=112; }	# 32-bit: no bias
$locals=16;	# extra scratch bytes reserved below the register frame
38
# Sixteen accumulators hold the 16 T-table lookups of one loop pass; they
# are spread across local, out and global registers.
$acc0="%l0";
$acc1="%o0";
$acc2="%o1";
$acc3="%o2";

$acc4="%l1";
$acc5="%o3";
$acc6="%o4";
$acc7="%o5";

$acc8="%l2";
$acc9="%o7";
$acc10="%g1";
$acc11="%g2";

$acc12="%l3";
$acc13="%g3";
$acc14="%g4";
$acc15="%g5";

# Round temporaries, loaded from the key schedule.
$t0="%l4";
$t1="%l5";
$t2="%l6";
$t3="%l7";

# Incoming arguments in the callee's register window: state words, table
# pointer and key schedule.
$s0="%i0";
$s1="%i1";
$s2="%i2";
$s3="%i3";
$tbl="%i4";
$key="%i5";
$rounds="%i7";	# aliases with return address, which is off-loaded to stack
71
# Emit each 32-bit constant as a ".long" pair with the word duplicated;
# the doubled layout matches the 8-byte ldx table loads in the round code.
sub _data_word()
{
	foreach my $word (@_) {
		last unless defined($word);
		$code .= sprintf "\t.long\t0x%08x,0x%08x\n", $word, $word;
	}
}
76
$code.=<<___ if ($bits==64);	# 64-bit only: declare %g2/%g3 as scratch
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
.section	".text",#alloc,#execinstr

! AES_Te: 256 doubled words (2KB) emitted by _data_word, followed by the
! 256-byte Te4 table; 256-byte aligned.
.align	256
AES_Te:
___
&_data_word(			# 256 Te words; _data_word emits each one twice
	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
$code.=<<___;
! Te4: byte-wide S-box (256 bytes), placed at AES_Te+2048 for the last round
	.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
	.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
	.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
	.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
	.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
	.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
	.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
	.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
	.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
	.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
	.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
	.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
	.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
	.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
	.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
	.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
	.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
	.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
	.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
	.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
	.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
	.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
	.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
	.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
	.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
	.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
	.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
	.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
	.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
	.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
	.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
	.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
.type	AES_Te,#object
.size	AES_Te,(.-AES_Te)
187
.align	64
.skip	16
! _sparcv9_AES_encrypt
!	entry:	%i0-%i3 = four state words, %i4 = AES_Te, %i5 = key schedule
!	exit:	%i0-%i3 = encrypted state
! Two rounds are unrolled per .Lenc_loop pass ($rounds = ld[key+240]/2).
! NOTE(review): the bare trailing "!" marks and the fmovs %f0,%f0 filler
! appear to pace instruction groups for table-load latency (see the file
! header about L2-latency scheduling) -- confirm before reordering anything.
_sparcv9_AES_encrypt:
	save	%sp,-$frame-$locals,%sp
	stx	%i7,[%sp+$bias+$frame+0]	! off-load return address
	ld	[$key+240],$rounds
	ld	[$key+0],$t0
	ld	[$key+4],$t1		!
	ld	[$key+8],$t2
	srl	$rounds,1,$rounds
	xor	$t0,$s0,$s0
	ld	[$key+12],$t3
	srl	$s0,21,$acc0
	xor	$t1,$s1,$s1
	ld	[$key+16],$t0
	srl	$s1,13,$acc1		!
	xor	$t2,$s2,$s2
	ld	[$key+20],$t1
	xor	$t3,$s3,$s3
	ld	[$key+24],$t2
	and	$acc0,2040,$acc0
	ld	[$key+28],$t3
	nop
.Lenc_loop:
	srl	$s2,5,$acc2		!
	and	$acc1,2040,$acc1
	ldx	[$tbl+$acc0],$acc0
	sll	$s3,3,$acc3
	and	$acc2,2040,$acc2
	ldx	[$tbl+$acc1],$acc1
	srl	$s1,21,$acc4
	and	$acc3,2040,$acc3
	ldx	[$tbl+$acc2],$acc2	!
	srl	$s2,13,$acc5
	and	$acc4,2040,$acc4
	ldx	[$tbl+$acc3],$acc3
	srl	$s3,5,$acc6
	and	$acc5,2040,$acc5
	ldx	[$tbl+$acc4],$acc4
	fmovs	%f0,%f0
	sll	$s0,3,$acc7		!
	and	$acc6,2040,$acc6
	ldx	[$tbl+$acc5],$acc5
	srl	$s2,21,$acc8
	and	$acc7,2040,$acc7
	ldx	[$tbl+$acc6],$acc6
	srl	$s3,13,$acc9
	and	$acc8,2040,$acc8
	ldx	[$tbl+$acc7],$acc7	!
	srl	$s0,5,$acc10
	and	$acc9,2040,$acc9
	ldx	[$tbl+$acc8],$acc8
	sll	$s1,3,$acc11
	and	$acc10,2040,$acc10
	ldx	[$tbl+$acc9],$acc9
	fmovs	%f0,%f0
	srl	$s3,21,$acc12		!
	and	$acc11,2040,$acc11
	ldx	[$tbl+$acc10],$acc10
	srl	$s0,13,$acc13
	and	$acc12,2040,$acc12
	ldx	[$tbl+$acc11],$acc11
	srl	$s1,5,$acc14
	and	$acc13,2040,$acc13
	ldx	[$tbl+$acc12],$acc12	!
	sll	$s2,3,$acc15
	and	$acc14,2040,$acc14
	ldx	[$tbl+$acc13],$acc13
	and	$acc15,2040,$acc15
	add	$key,32,$key
	ldx	[$tbl+$acc14],$acc14
	fmovs	%f0,%f0
	subcc	$rounds,1,$rounds	!
	ldx	[$tbl+$acc15],$acc15
	bz,a,pn	%icc,.Lenc_last
	add	$tbl,2048,$rounds

	srlx	$acc1,8,$acc1
	xor	$acc0,$t0,$t0
	ld	[$key+0],$s0
	fmovs	%f0,%f0
	srlx	$acc2,16,$acc2		!
	xor	$acc1,$t0,$t0
	ld	[$key+4],$s1
	srlx	$acc3,24,$acc3
	xor	$acc2,$t0,$t0
	ld	[$key+8],$s2
	srlx	$acc5,8,$acc5
	xor	$acc3,$t0,$t0
	ld	[$key+12],$s3		!
	srlx	$acc6,16,$acc6
	xor	$acc4,$t1,$t1
	fmovs	%f0,%f0
	srlx	$acc7,24,$acc7
	xor	$acc5,$t1,$t1
	srlx	$acc9,8,$acc9
	xor	$acc6,$t1,$t1
	srlx	$acc10,16,$acc10	!
	xor	$acc7,$t1,$t1
	srlx	$acc11,24,$acc11
	xor	$acc8,$t2,$t2
	srlx	$acc13,8,$acc13
	xor	$acc9,$t2,$t2
	srlx	$acc14,16,$acc14
	xor	$acc10,$t2,$t2
	srlx	$acc15,24,$acc15	!
	xor	$acc11,$t2,$t2
	xor	$acc12,$acc14,$acc14
	xor	$acc13,$t3,$t3
	srl	$t0,21,$acc0
	xor	$acc14,$t3,$t3
	srl	$t1,13,$acc1
	xor	$acc15,$t3,$t3

	and	$acc0,2040,$acc0	!
	srl	$t2,5,$acc2
	and	$acc1,2040,$acc1
	ldx	[$tbl+$acc0],$acc0
	sll	$t3,3,$acc3
	and	$acc2,2040,$acc2
	ldx	[$tbl+$acc1],$acc1
	fmovs	%f0,%f0
	srl	$t1,21,$acc4		!
	and	$acc3,2040,$acc3
	ldx	[$tbl+$acc2],$acc2
	srl	$t2,13,$acc5
	and	$acc4,2040,$acc4
	ldx	[$tbl+$acc3],$acc3
	srl	$t3,5,$acc6
	and	$acc5,2040,$acc5
	ldx	[$tbl+$acc4],$acc4	!
	sll	$t0,3,$acc7
	and	$acc6,2040,$acc6
	ldx	[$tbl+$acc5],$acc5
	srl	$t2,21,$acc8
	and	$acc7,2040,$acc7
	ldx	[$tbl+$acc6],$acc6
	fmovs	%f0,%f0
	srl	$t3,13,$acc9		!
	and	$acc8,2040,$acc8
	ldx	[$tbl+$acc7],$acc7
	srl	$t0,5,$acc10
	and	$acc9,2040,$acc9
	ldx	[$tbl+$acc8],$acc8
	sll	$t1,3,$acc11
	and	$acc10,2040,$acc10
	ldx	[$tbl+$acc9],$acc9	!
	srl	$t3,21,$acc12
	and	$acc11,2040,$acc11
	ldx	[$tbl+$acc10],$acc10
	srl	$t0,13,$acc13
	and	$acc12,2040,$acc12
	ldx	[$tbl+$acc11],$acc11
	fmovs	%f0,%f0
	srl	$t1,5,$acc14		!
	and	$acc13,2040,$acc13
	ldx	[$tbl+$acc12],$acc12
	sll	$t2,3,$acc15
	and	$acc14,2040,$acc14
	ldx	[$tbl+$acc13],$acc13
	srlx	$acc1,8,$acc1
	and	$acc15,2040,$acc15
	ldx	[$tbl+$acc14],$acc14	!

	srlx	$acc2,16,$acc2
	xor	$acc0,$s0,$s0
	ldx	[$tbl+$acc15],$acc15
	srlx	$acc3,24,$acc3
	xor	$acc1,$s0,$s0
	ld	[$key+16],$t0
	fmovs	%f0,%f0
	srlx	$acc5,8,$acc5		!
	xor	$acc2,$s0,$s0
	ld	[$key+20],$t1
	srlx	$acc6,16,$acc6
	xor	$acc3,$s0,$s0
	ld	[$key+24],$t2
	srlx	$acc7,24,$acc7
	xor	$acc4,$s1,$s1
	ld	[$key+28],$t3		!
	srlx	$acc9,8,$acc9
	xor	$acc5,$s1,$s1
	ldx	[$tbl+2048+0],%g0	! prefetch te4
	srlx	$acc10,16,$acc10
	xor	$acc6,$s1,$s1
	ldx	[$tbl+2048+32],%g0	! prefetch te4
	srlx	$acc11,24,$acc11
	xor	$acc7,$s1,$s1
	ldx	[$tbl+2048+64],%g0	! prefetch te4
	srlx	$acc13,8,$acc13
	xor	$acc8,$s2,$s2
	ldx	[$tbl+2048+96],%g0	! prefetch te4
	srlx	$acc14,16,$acc14	!
	xor	$acc9,$s2,$s2
	ldx	[$tbl+2048+128],%g0	! prefetch te4
	srlx	$acc15,24,$acc15
	xor	$acc10,$s2,$s2
	ldx	[$tbl+2048+160],%g0	! prefetch te4
	srl	$s0,21,$acc0
	xor	$acc11,$s2,$s2
	ldx	[$tbl+2048+192],%g0	! prefetch te4
	xor	$acc12,$acc14,$acc14
	xor	$acc13,$s3,$s3
	ldx	[$tbl+2048+224],%g0	! prefetch te4
	srl	$s1,13,$acc1		!
	xor	$acc14,$s3,$s3
	xor	$acc15,$s3,$s3
	ba	.Lenc_loop
	and	$acc0,2040,$acc0

.align	32
.Lenc_last:
	srlx	$acc1,8,$acc1		!
	xor	$acc0,$t0,$t0
	ld	[$key+0],$s0
	srlx	$acc2,16,$acc2
	xor	$acc1,$t0,$t0
	ld	[$key+4],$s1
	srlx	$acc3,24,$acc3
	xor	$acc2,$t0,$t0
	ld	[$key+8],$s2		!
	srlx	$acc5,8,$acc5
	xor	$acc3,$t0,$t0
	ld	[$key+12],$s3
	srlx	$acc6,16,$acc6
	xor	$acc4,$t1,$t1
	srlx	$acc7,24,$acc7
	xor	$acc5,$t1,$t1
	srlx	$acc9,8,$acc9		!
	xor	$acc6,$t1,$t1
	srlx	$acc10,16,$acc10
	xor	$acc7,$t1,$t1
	srlx	$acc11,24,$acc11
	xor	$acc8,$t2,$t2
	srlx	$acc13,8,$acc13
	xor	$acc9,$t2,$t2
	srlx	$acc14,16,$acc14	!
	xor	$acc10,$t2,$t2
	srlx	$acc15,24,$acc15
	xor	$acc11,$t2,$t2
	xor	$acc12,$acc14,$acc14
	xor	$acc13,$t3,$t3
	srl	$t0,24,$acc0
	xor	$acc14,$t3,$t3
	srl	$t1,16,$acc1		!
	xor	$acc15,$t3,$t3

	! last round uses the byte-wide Te4 ($rounds now points at tbl+2048)
	srl	$t2,8,$acc2
	and	$acc1,255,$acc1
	ldub	[$rounds+$acc0],$acc0
	srl	$t1,24,$acc4
	and	$acc2,255,$acc2
	ldub	[$rounds+$acc1],$acc1
	srl	$t2,16,$acc5		!
	and	$t3,255,$acc3
	ldub	[$rounds+$acc2],$acc2
	ldub	[$rounds+$acc3],$acc3
	srl	$t3,8,$acc6
	and	$acc5,255,$acc5
	ldub	[$rounds+$acc4],$acc4
	fmovs	%f0,%f0
	srl	$t2,24,$acc8		!
	and	$acc6,255,$acc6
	ldub	[$rounds+$acc5],$acc5
	srl	$t3,16,$acc9
	and	$t0,255,$acc7
	ldub	[$rounds+$acc6],$acc6
	ldub	[$rounds+$acc7],$acc7
	fmovs	%f0,%f0
	srl	$t0,8,$acc10		!
	and	$acc9,255,$acc9
	ldub	[$rounds+$acc8],$acc8
	srl	$t3,24,$acc12
	and	$acc10,255,$acc10
	ldub	[$rounds+$acc9],$acc9
	srl	$t0,16,$acc13
	and	$t1,255,$acc11
	ldub	[$rounds+$acc10],$acc10	!
	srl	$t1,8,$acc14
	and	$acc13,255,$acc13
	ldub	[$rounds+$acc11],$acc11
	ldub	[$rounds+$acc12],$acc12
	and	$acc14,255,$acc14
	ldub	[$rounds+$acc13],$acc13
	and	$t2,255,$acc15
	ldub	[$rounds+$acc14],$acc14	!

	sll	$acc0,24,$acc0
	xor	$acc3,$s0,$s0
	ldub	[$rounds+$acc15],$acc15
	sll	$acc1,16,$acc1
	xor	$acc0,$s0,$s0
	ldx	[%sp+$bias+$frame+0],%i7	! restore return address
	fmovs	%f0,%f0
	sll	$acc2,8,$acc2		!
	xor	$acc1,$s0,$s0
	sll	$acc4,24,$acc4
	xor	$acc2,$s0,$s0
	sll	$acc5,16,$acc5
	xor	$acc7,$s1,$s1
	sll	$acc6,8,$acc6
	xor	$acc4,$s1,$s1
	sll	$acc8,24,$acc8		!
	xor	$acc5,$s1,$s1
	sll	$acc9,16,$acc9
	xor	$acc11,$s2,$s2
	sll	$acc10,8,$acc10
	xor	$acc6,$s1,$s1
	sll	$acc12,24,$acc12
	xor	$acc8,$s2,$s2
	sll	$acc13,16,$acc13	!
	xor	$acc9,$s2,$s2
	sll	$acc14,8,$acc14
	xor	$acc10,$s2,$s2
	xor	$acc12,$acc14,$acc14
	xor	$acc13,$s3,$s3
	xor	$acc14,$s3,$s3
	xor	$acc15,$s3,$s3

	ret
	restore
.type	_sparcv9_AES_encrypt,#function
.size	_sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
511
.align	32
.globl	AES_encrypt
! void AES_encrypt(const unsigned char *in, unsigned char *out,
!		   const AES_KEY *key)
! Dispatches on (in|out)&3: word-aligned buffers use ld/st directly,
! otherwise the block is assembled/disassembled byte by byte.
AES_encrypt:
	or	%o0,%o1,%g1
	andcc	%g1,3,%g0
	bnz,pn	%xcc,.Lunaligned_enc
	save	%sp,-$frame,%sp

	ld	[%i0+0],%o0
	ld	[%i0+4],%o1
	ld	[%i0+8],%o2
	ld	[%i0+12],%o3

1:	call	.+8			! pick up %pc for PIC-safe AES_Te address
	add	%o7,AES_Te-1b,%o4
	call	_sparcv9_AES_encrypt
	mov	%i2,%o5

	st	%o0,[%i1+0]
	st	%o1,[%i1+4]
	st	%o2,[%i1+8]
	st	%o3,[%i1+12]

	ret
	restore

.align	32
.Lunaligned_enc:
	ldub	[%i0+0],%l0
	ldub	[%i0+1],%l1
	ldub	[%i0+2],%l2

	sll	%l0,24,%l0
	ldub	[%i0+3],%l3
	sll	%l1,16,%l1
	ldub	[%i0+4],%l4
	sll	%l2,8,%l2
	or	%l1,%l0,%l0
	ldub	[%i0+5],%l5
	sll	%l4,24,%l4
	or	%l3,%l2,%l2
	ldub	[%i0+6],%l6
	sll	%l5,16,%l5
	or	%l0,%l2,%o0
	ldub	[%i0+7],%l7

	sll	%l6,8,%l6
	or	%l5,%l4,%l4
	ldub	[%i0+8],%l0
	or	%l7,%l6,%l6
	ldub	[%i0+9],%l1
	or	%l4,%l6,%o1
	ldub	[%i0+10],%l2

	sll	%l0,24,%l0
	ldub	[%i0+11],%l3
	sll	%l1,16,%l1
	ldub	[%i0+12],%l4
	sll	%l2,8,%l2
	or	%l1,%l0,%l0
	ldub	[%i0+13],%l5
	sll	%l4,24,%l4
	or	%l3,%l2,%l2
	ldub	[%i0+14],%l6
	sll	%l5,16,%l5
	or	%l0,%l2,%o2
	ldub	[%i0+15],%l7

	sll	%l6,8,%l6
	or	%l5,%l4,%l4
	or	%l7,%l6,%l6
	or	%l4,%l6,%o3

1:	call	.+8			! pick up %pc for PIC-safe AES_Te address
	add	%o7,AES_Te-1b,%o4
	call	_sparcv9_AES_encrypt
	mov	%i2,%o5

	srl	%o0,24,%l0
	srl	%o0,16,%l1
	stb	%l0,[%i1+0]
	srl	%o0,8,%l2
	stb	%l1,[%i1+1]
	stb	%l2,[%i1+2]
	srl	%o1,24,%l4
	stb	%o0,[%i1+3]

	srl	%o1,16,%l5
	stb	%l4,[%i1+4]
	srl	%o1,8,%l6
	stb	%l5,[%i1+5]
	stb	%l6,[%i1+6]
	srl	%o2,24,%l0
	stb	%o1,[%i1+7]

	srl	%o2,16,%l1
	stb	%l0,[%i1+8]
	srl	%o2,8,%l2
	stb	%l1,[%i1+9]
	stb	%l2,[%i1+10]
	srl	%o3,24,%l4
	stb	%o2,[%i1+11]

	srl	%o3,16,%l5
	stb	%l4,[%i1+12]
	srl	%o3,8,%l6
	stb	%l5,[%i1+13]
	stb	%l6,[%i1+14]
	stb	%o3,[%i1+15]

	ret
	restore
.type	AES_encrypt,#function
.size	AES_encrypt,(.-AES_encrypt)
626
627___
628
# Decryption tables, laid out exactly like AES_Te above.
$code.=<<___;
.align	256
AES_Td:
___
&_data_word(			# 256 Td words; _data_word emits each one twice
	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
698$code.=<<___;
699 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
700 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
701 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
702 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
703 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
704 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
705 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
706 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
707 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
708 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
709 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
710 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
711 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
712 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
713 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
714 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
715 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
716 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
717 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
718 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
719 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
720 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
721 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
722 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
723 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
724 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
725 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
726 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
727 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
728 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
729 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
730 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
731.type AES_Td,#object
732.size AES_Td,(.-AES_Td)
733
734.align 64
735.skip 16
736_sparcv9_AES_decrypt:
737 save %sp,-$frame-$locals,%sp
738 stx %i7,[%sp+$bias+$frame+0] ! off-load return address
739 ld [$key+240],$rounds
740 ld [$key+0],$t0
741 ld [$key+4],$t1 !
742 ld [$key+8],$t2
743 ld [$key+12],$t3
744 srl $rounds,1,$rounds
745 xor $t0,$s0,$s0
746 ld [$key+16],$t0
747 xor $t1,$s1,$s1
748 ld [$key+20],$t1
749 srl $s0,21,$acc0 !
750 xor $t2,$s2,$s2
751 ld [$key+24],$t2
752 xor $t3,$s3,$s3
753 and $acc0,2040,$acc0
754 ld [$key+28],$t3
755 srl $s3,13,$acc1
756 nop
757.Ldec_loop:
758 srl $s2,5,$acc2 !
759 and $acc1,2040,$acc1
760 ldx [$tbl+$acc0],$acc0
761 sll $s1,3,$acc3
762 and $acc2,2040,$acc2
763 ldx [$tbl+$acc1],$acc1
764 srl $s1,21,$acc4
765 and $acc3,2040,$acc3
766 ldx [$tbl+$acc2],$acc2 !
767 srl $s0,13,$acc5
768 and $acc4,2040,$acc4
769 ldx [$tbl+$acc3],$acc3
770 srl $s3,5,$acc6
771 and $acc5,2040,$acc5
772 ldx [$tbl+$acc4],$acc4
773 fmovs %f0,%f0
774 sll $s2,3,$acc7 !
775 and $acc6,2040,$acc6
776 ldx [$tbl+$acc5],$acc5
777 srl $s2,21,$acc8
778 and $acc7,2040,$acc7
779 ldx [$tbl+$acc6],$acc6
780 srl $s1,13,$acc9
781 and $acc8,2040,$acc8
782 ldx [$tbl+$acc7],$acc7 !
783 srl $s0,5,$acc10
784 and $acc9,2040,$acc9
785 ldx [$tbl+$acc8],$acc8
786 sll $s3,3,$acc11
787 and $acc10,2040,$acc10
788 ldx [$tbl+$acc9],$acc9
789 fmovs %f0,%f0
790 srl $s3,21,$acc12 !
791 and $acc11,2040,$acc11
792 ldx [$tbl+$acc10],$acc10
793 srl $s2,13,$acc13
794 and $acc12,2040,$acc12
795 ldx [$tbl+$acc11],$acc11
796 srl $s1,5,$acc14
797 and $acc13,2040,$acc13
798 ldx [$tbl+$acc12],$acc12 !
799 sll $s0,3,$acc15
800 and $acc14,2040,$acc14
801 ldx [$tbl+$acc13],$acc13
802 and $acc15,2040,$acc15
803 add $key,32,$key
804 ldx [$tbl+$acc14],$acc14
805 fmovs %f0,%f0
806 subcc $rounds,1,$rounds !
807 ldx [$tbl+$acc15],$acc15
808 bz,a,pn %icc,.Ldec_last
809 add $tbl,2048,$rounds
810
811 srlx $acc1,8,$acc1
812 xor $acc0,$t0,$t0
813 ld [$key+0],$s0
814 fmovs %f0,%f0
815 srlx $acc2,16,$acc2 !
816 xor $acc1,$t0,$t0
817 ld [$key+4],$s1
818 srlx $acc3,24,$acc3
819 xor $acc2,$t0,$t0
820 ld [$key+8],$s2
821 srlx $acc5,8,$acc5
822 xor $acc3,$t0,$t0
823 ld [$key+12],$s3 !
824 srlx $acc6,16,$acc6
825 xor $acc4,$t1,$t1
826 fmovs %f0,%f0
827 srlx $acc7,24,$acc7
828 xor $acc5,$t1,$t1
829 srlx $acc9,8,$acc9
830 xor $acc6,$t1,$t1
831 srlx $acc10,16,$acc10 !
832 xor $acc7,$t1,$t1
833 srlx $acc11,24,$acc11
834 xor $acc8,$t2,$t2
835 srlx $acc13,8,$acc13
836 xor $acc9,$t2,$t2
837 srlx $acc14,16,$acc14
838 xor $acc10,$t2,$t2
839 srlx $acc15,24,$acc15 !
840 xor $acc11,$t2,$t2
841 xor $acc12,$acc14,$acc14
842 xor $acc13,$t3,$t3
843 srl $t0,21,$acc0
844 xor $acc14,$t3,$t3
845 xor $acc15,$t3,$t3
846 srl $t3,13,$acc1
847
848 and $acc0,2040,$acc0 !
849 srl $t2,5,$acc2
850 and $acc1,2040,$acc1
851 ldx [$tbl+$acc0],$acc0
852 sll $t1,3,$acc3
853 and $acc2,2040,$acc2
854 ldx [$tbl+$acc1],$acc1
855 fmovs %f0,%f0
856 srl $t1,21,$acc4 !
857 and $acc3,2040,$acc3
858 ldx [$tbl+$acc2],$acc2
859 srl $t0,13,$acc5
860 and $acc4,2040,$acc4
861 ldx [$tbl+$acc3],$acc3
862 srl $t3,5,$acc6
863 and $acc5,2040,$acc5
864 ldx [$tbl+$acc4],$acc4 !
865 sll $t2,3,$acc7
866 and $acc6,2040,$acc6
867 ldx [$tbl+$acc5],$acc5
868 srl $t2,21,$acc8
869 and $acc7,2040,$acc7
870 ldx [$tbl+$acc6],$acc6
871 fmovs %f0,%f0
872 srl $t1,13,$acc9 !
873 and $acc8,2040,$acc8
874 ldx [$tbl+$acc7],$acc7
875 srl $t0,5,$acc10
876 and $acc9,2040,$acc9
877 ldx [$tbl+$acc8],$acc8
878 sll $t3,3,$acc11
879 and $acc10,2040,$acc10
880 ldx [$tbl+$acc9],$acc9 !
881 srl $t3,21,$acc12
882 and $acc11,2040,$acc11
883 ldx [$tbl+$acc10],$acc10
884 srl $t2,13,$acc13
885 and $acc12,2040,$acc12
886 ldx [$tbl+$acc11],$acc11
887 fmovs %f0,%f0
888 srl $t1,5,$acc14 !
889 and $acc13,2040,$acc13
890 ldx [$tbl+$acc12],$acc12
891 sll $t0,3,$acc15
892 and $acc14,2040,$acc14
893 ldx [$tbl+$acc13],$acc13
894 srlx $acc1,8,$acc1
895 and $acc15,2040,$acc15
896 ldx [$tbl+$acc14],$acc14 !
897
898 srlx $acc2,16,$acc2
899 xor $acc0,$s0,$s0
900 ldx [$tbl+$acc15],$acc15
901 srlx $acc3,24,$acc3
902 xor $acc1,$s0,$s0
903 ld [$key+16],$t0
904 fmovs %f0,%f0
905 srlx $acc5,8,$acc5 !
906 xor $acc2,$s0,$s0
907 ld [$key+20],$t1
908 srlx $acc6,16,$acc6
909 xor $acc3,$s0,$s0
910 ld [$key+24],$t2
911 srlx $acc7,24,$acc7
912 xor $acc4,$s1,$s1
913 ld [$key+28],$t3 !
914 srlx $acc9,8,$acc9
915 xor $acc5,$s1,$s1
916 ldx [$tbl+2048+0],%g0 ! prefetch td4
917 srlx $acc10,16,$acc10
918 xor $acc6,$s1,$s1
919 ldx [$tbl+2048+32],%g0 ! prefetch td4
920 srlx $acc11,24,$acc11
921 xor $acc7,$s1,$s1
922 ldx [$tbl+2048+64],%g0 ! prefetch td4
923 srlx $acc13,8,$acc13
924 xor $acc8,$s2,$s2
925 ldx [$tbl+2048+96],%g0 ! prefetch td4
926 srlx $acc14,16,$acc14 !
927 xor $acc9,$s2,$s2
928 ldx [$tbl+2048+128],%g0 ! prefetch td4
929 srlx $acc15,24,$acc15
930 xor $acc10,$s2,$s2
931 ldx [$tbl+2048+160],%g0 ! prefetch td4
932 srl $s0,21,$acc0
933 xor $acc11,$s2,$s2
934 ldx [$tbl+2048+192],%g0 ! prefetch td4
935 xor $acc12,$acc14,$acc14
936 xor $acc13,$s3,$s3
937 ldx [$tbl+2048+224],%g0 ! prefetch td4
938 and $acc0,2040,$acc0 !
939 xor $acc14,$s3,$s3
940 xor $acc15,$s3,$s3
941 ba .Ldec_loop
942 srl $s3,13,$acc1
943
944.align 32
945.Ldec_last:
946 srlx $acc1,8,$acc1 !
947 xor $acc0,$t0,$t0
948 ld [$key+0],$s0
949 srlx $acc2,16,$acc2
950 xor $acc1,$t0,$t0
951 ld [$key+4],$s1
952 srlx $acc3,24,$acc3
953 xor $acc2,$t0,$t0
954 ld [$key+8],$s2 !
955 srlx $acc5,8,$acc5
956 xor $acc3,$t0,$t0
957 ld [$key+12],$s3
958 srlx $acc6,16,$acc6
959 xor $acc4,$t1,$t1
960 srlx $acc7,24,$acc7
961 xor $acc5,$t1,$t1
962 srlx $acc9,8,$acc9 !
963 xor $acc6,$t1,$t1
964 srlx $acc10,16,$acc10
965 xor $acc7,$t1,$t1
966 srlx $acc11,24,$acc11
967 xor $acc8,$t2,$t2
968 srlx $acc13,8,$acc13
969 xor $acc9,$t2,$t2
970 srlx $acc14,16,$acc14 !
971 xor $acc10,$t2,$t2
972 srlx $acc15,24,$acc15
973 xor $acc11,$t2,$t2
974 xor $acc12,$acc14,$acc14
975 xor $acc13,$t3,$t3
976 srl $t0,24,$acc0
977 xor $acc14,$t3,$t3
978 xor $acc15,$t3,$t3 !
979 srl $t3,16,$acc1
980
981 srl $t2,8,$acc2
982 and $acc1,255,$acc1
983 ldub [$rounds+$acc0],$acc0
984 srl $t1,24,$acc4
985 and $acc2,255,$acc2
986 ldub [$rounds+$acc1],$acc1
987 srl $t0,16,$acc5 !
988 and $t1,255,$acc3
989 ldub [$rounds+$acc2],$acc2
990 ldub [$rounds+$acc3],$acc3
991 srl $t3,8,$acc6
992 and $acc5,255,$acc5
993 ldub [$rounds+$acc4],$acc4
994 fmovs %f0,%f0
995 srl $t2,24,$acc8 !
996 and $acc6,255,$acc6
997 ldub [$rounds+$acc5],$acc5
998 srl $t1,16,$acc9
999 and $t2,255,$acc7
1000 ldub [$rounds+$acc6],$acc6
1001 ldub [$rounds+$acc7],$acc7
1002 fmovs %f0,%f0
1003 srl $t0,8,$acc10 !
1004 and $acc9,255,$acc9
1005 ldub [$rounds+$acc8],$acc8
1006 srl $t3,24,$acc12
1007 and $acc10,255,$acc10
1008 ldub [$rounds+$acc9],$acc9
1009 srl $t2,16,$acc13
1010 and $t3,255,$acc11
1011 ldub [$rounds+$acc10],$acc10 !
1012 srl $t1,8,$acc14
1013 and $acc13,255,$acc13
1014 ldub [$rounds+$acc11],$acc11
1015 ldub [$rounds+$acc12],$acc12
1016 and $acc14,255,$acc14
1017 ldub [$rounds+$acc13],$acc13
1018 and $t0,255,$acc15
1019 ldub [$rounds+$acc14],$acc14 !
1020
1021 sll $acc0,24,$acc0
1022 xor $acc3,$s0,$s0
1023 ldub [$rounds+$acc15],$acc15
1024 sll $acc1,16,$acc1
1025 xor $acc0,$s0,$s0
1026 ldx [%sp+$bias+$frame+0],%i7 ! restore return address
1027 fmovs %f0,%f0
1028 sll $acc2,8,$acc2 !
1029 xor $acc1,$s0,$s0
1030 sll $acc4,24,$acc4
1031 xor $acc2,$s0,$s0
1032 sll $acc5,16,$acc5
1033 xor $acc7,$s1,$s1
1034 sll $acc6,8,$acc6
1035 xor $acc4,$s1,$s1
1036 sll $acc8,24,$acc8 !
1037 xor $acc5,$s1,$s1
1038 sll $acc9,16,$acc9
1039 xor $acc11,$s2,$s2
1040 sll $acc10,8,$acc10
1041 xor $acc6,$s1,$s1
1042 sll $acc12,24,$acc12
1043 xor $acc8,$s2,$s2
1044 sll $acc13,16,$acc13 !
1045 xor $acc9,$s2,$s2
1046 sll $acc14,8,$acc14
1047 xor $acc10,$s2,$s2
1048 xor $acc12,$acc14,$acc14
1049 xor $acc13,$s3,$s3
1050 xor $acc14,$s3,$s3
1051 xor $acc15,$s3,$s3
1052
1053 ret
1054 restore
1055.type _sparcv9_AES_decrypt,#function
1056.size _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1057
1058.align 32
1059.globl AES_decrypt
1060AES_decrypt:
1061 or %o0,%o1,%g1
1062 andcc %g1,3,%g0
1063 bnz,pn %xcc,.Lunaligned_dec
1064 save %sp,-$frame,%sp
1065
1066 ld [%i0+0],%o0
1067 ld [%i0+4],%o1
1068 ld [%i0+8],%o2
1069 ld [%i0+12],%o3
1070
10711: call .+8
1072 add %o7,AES_Td-1b,%o4
1073 call _sparcv9_AES_decrypt
1074 mov %i2,%o5
1075
1076 st %o0,[%i1+0]
1077 st %o1,[%i1+4]
1078 st %o2,[%i1+8]
1079 st %o3,[%i1+12]
1080
1081 ret
1082 restore
1083
1084.align 32
1085.Lunaligned_dec:
1086 ldub [%i0+0],%l0
1087 ldub [%i0+1],%l1
1088 ldub [%i0+2],%l2
1089
1090 sll %l0,24,%l0
1091 ldub [%i0+3],%l3
1092 sll %l1,16,%l1
1093 ldub [%i0+4],%l4
1094 sll %l2,8,%l2
1095 or %l1,%l0,%l0
1096 ldub [%i0+5],%l5
1097 sll %l4,24,%l4
1098 or %l3,%l2,%l2
1099 ldub [%i0+6],%l6
1100 sll %l5,16,%l5
1101 or %l0,%l2,%o0
1102 ldub [%i0+7],%l7
1103
1104 sll %l6,8,%l6
1105 or %l5,%l4,%l4
1106 ldub [%i0+8],%l0
1107 or %l7,%l6,%l6
1108 ldub [%i0+9],%l1
1109 or %l4,%l6,%o1
1110 ldub [%i0+10],%l2
1111
1112 sll %l0,24,%l0
1113 ldub [%i0+11],%l3
1114 sll %l1,16,%l1
1115 ldub [%i0+12],%l4
1116 sll %l2,8,%l2
1117 or %l1,%l0,%l0
1118 ldub [%i0+13],%l5
1119 sll %l4,24,%l4
1120 or %l3,%l2,%l2
1121 ldub [%i0+14],%l6
1122 sll %l5,16,%l5
1123 or %l0,%l2,%o2
1124 ldub [%i0+15],%l7
1125
1126 sll %l6,8,%l6
1127 or %l5,%l4,%l4
1128 or %l7,%l6,%l6
1129 or %l4,%l6,%o3
1130
11311: call .+8
1132 add %o7,AES_Td-1b,%o4
1133 call _sparcv9_AES_decrypt
1134 mov %i2,%o5
1135
1136 srl %o0,24,%l0
1137 srl %o0,16,%l1
1138 stb %l0,[%i1+0]
1139 srl %o0,8,%l2
1140 stb %l1,[%i1+1]
1141 stb %l2,[%i1+2]
1142 srl %o1,24,%l4
1143 stb %o0,[%i1+3]
1144
1145 srl %o1,16,%l5
1146 stb %l4,[%i1+4]
1147 srl %o1,8,%l6
1148 stb %l5,[%i1+5]
1149 stb %l6,[%i1+6]
1150 srl %o2,24,%l0
1151 stb %o1,[%i1+7]
1152
1153 srl %o2,16,%l1
1154 stb %l0,[%i1+8]
1155 srl %o2,8,%l2
1156 stb %l1,[%i1+9]
1157 stb %l2,[%i1+10]
1158 srl %o3,24,%l4
1159 stb %o2,[%i1+11]
1160
1161 srl %o3,16,%l5
1162 stb %l4,[%i1+12]
1163 srl %o3,8,%l6
1164 stb %l5,[%i1+13]
1165 stb %l6,[%i1+14]
1166 stb %o3,[%i1+15]
1167
1168 ret
1169 restore
1170.type AES_decrypt,#function
1171.size AES_decrypt,(.-AES_decrypt)
1172___
1173
1174# fmovs instructions substituting for FP nops were originally added
1175# to meet specific instruction alignment requirements to maximize ILP.
1176# As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have
1177# undesired effect, so just omit them and sacrifice some portion of
1178# percent in performance...
1179$code =~ s/fmovs.*$//gm;
1180
1181print $code;
1182close STDOUT; # ensure flush
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
deleted file mode 100755
index 48fa857d5b..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl
+++ /dev/null
@@ -1,2818 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Version 2.1.
11#
12# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
13# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
14# [you'll notice a lot of resemblance], such as compressed S-boxes
15# in little-endian byte order, prefetch of these tables in CBC mode,
16# as well as avoiding L1 cache aliasing between stack frame and key
17# schedule and already mentioned tables, compressed Td4...
18#
19# Performance in number of cycles per processed byte for 128-bit key:
20#
21# ECB encrypt ECB decrypt CBC large chunk
22# AMD64 33 41 13.0
23# EM64T 38 59 18.6(*)
24# Core 2 30 43 14.5(*)
25#
26# (*) with hyper-threading off
27
28$flavour = shift;
29$output = shift;
30if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
31
32$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
33
34$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
35( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
36( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
37die "can't locate x86_64-xlate.pl";
38
39open STDOUT,"| $^X $xlate $flavour $output";
40
41$verticalspin=1; # unlike 32-bit version $verticalspin performs
42 # ~15% better on both AMD and Intel cores
43$speed_limit=512; # see aes-586.pl for details
44
45$code=".text\n";
46
47$s0="%eax";
48$s1="%ebx";
49$s2="%ecx";
50$s3="%edx";
51$acc0="%esi"; $mask80="%rsi";
52$acc1="%edi"; $maskfe="%rdi";
53$acc2="%ebp"; $mask1b="%rbp";
54$inp="%r8";
55$out="%r9";
56$t0="%r10d";
57$t1="%r11d";
58$t2="%r12d";
59$rnds="%r13d";
60$sbox="%r14";
61$key="%r15";
62
63sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }
64sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
65 $r =~ s/%[er]([sd]i)/%\1l/;
66 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
67sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
68 $r =~ s/%r([0-9]+)/%r\1d/; $r; }
69sub _data_word()
70{ my $i;
71 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
72}
73sub data_word()
74{ my $i;
75 my $last=pop(@_);
76 $code.=".long\t";
77 while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
78 $code.=sprintf"0x%08x\n",$last;
79}
80
81sub data_byte()
82{ my $i;
83 my $last=pop(@_);
84 $code.=".byte\t";
85 while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
86 $code.=sprintf"0x%02x\n",$last&0xff;
87}
88
89sub encvert()
90{ my $t3="%r8d"; # zaps $inp!
91
92$code.=<<___;
93 # favor 3-way issue Opteron pipeline...
94 movzb `&lo("$s0")`,$acc0
95 movzb `&lo("$s1")`,$acc1
96 movzb `&lo("$s2")`,$acc2
97 mov 0($sbox,$acc0,8),$t0
98 mov 0($sbox,$acc1,8),$t1
99 mov 0($sbox,$acc2,8),$t2
100
101 movzb `&hi("$s1")`,$acc0
102 movzb `&hi("$s2")`,$acc1
103 movzb `&lo("$s3")`,$acc2
104 xor 3($sbox,$acc0,8),$t0
105 xor 3($sbox,$acc1,8),$t1
106 mov 0($sbox,$acc2,8),$t3
107
108 movzb `&hi("$s3")`,$acc0
109 shr \$16,$s2
110 movzb `&hi("$s0")`,$acc2
111 xor 3($sbox,$acc0,8),$t2
112 shr \$16,$s3
113 xor 3($sbox,$acc2,8),$t3
114
115 shr \$16,$s1
116 lea 16($key),$key
117 shr \$16,$s0
118
119 movzb `&lo("$s2")`,$acc0
120 movzb `&lo("$s3")`,$acc1
121 movzb `&lo("$s0")`,$acc2
122 xor 2($sbox,$acc0,8),$t0
123 xor 2($sbox,$acc1,8),$t1
124 xor 2($sbox,$acc2,8),$t2
125
126 movzb `&hi("$s3")`,$acc0
127 movzb `&hi("$s0")`,$acc1
128 movzb `&lo("$s1")`,$acc2
129 xor 1($sbox,$acc0,8),$t0
130 xor 1($sbox,$acc1,8),$t1
131 xor 2($sbox,$acc2,8),$t3
132
133 mov 12($key),$s3
134 movzb `&hi("$s1")`,$acc1
135 movzb `&hi("$s2")`,$acc2
136 mov 0($key),$s0
137 xor 1($sbox,$acc1,8),$t2
138 xor 1($sbox,$acc2,8),$t3
139
140 mov 4($key),$s1
141 mov 8($key),$s2
142 xor $t0,$s0
143 xor $t1,$s1
144 xor $t2,$s2
145 xor $t3,$s3
146___
147}
148
149sub enclastvert()
150{ my $t3="%r8d"; # zaps $inp!
151
152$code.=<<___;
153 movzb `&lo("$s0")`,$acc0
154 movzb `&lo("$s1")`,$acc1
155 movzb `&lo("$s2")`,$acc2
156 movzb 2($sbox,$acc0,8),$t0
157 movzb 2($sbox,$acc1,8),$t1
158 movzb 2($sbox,$acc2,8),$t2
159
160 movzb `&lo("$s3")`,$acc0
161 movzb `&hi("$s1")`,$acc1
162 movzb `&hi("$s2")`,$acc2
163 movzb 2($sbox,$acc0,8),$t3
164 mov 0($sbox,$acc1,8),$acc1 #$t0
165 mov 0($sbox,$acc2,8),$acc2 #$t1
166
167 and \$0x0000ff00,$acc1
168 and \$0x0000ff00,$acc2
169
170 xor $acc1,$t0
171 xor $acc2,$t1
172 shr \$16,$s2
173
174 movzb `&hi("$s3")`,$acc0
175 movzb `&hi("$s0")`,$acc1
176 shr \$16,$s3
177 mov 0($sbox,$acc0,8),$acc0 #$t2
178 mov 0($sbox,$acc1,8),$acc1 #$t3
179
180 and \$0x0000ff00,$acc0
181 and \$0x0000ff00,$acc1
182 shr \$16,$s1
183 xor $acc0,$t2
184 xor $acc1,$t3
185 shr \$16,$s0
186
187 movzb `&lo("$s2")`,$acc0
188 movzb `&lo("$s3")`,$acc1
189 movzb `&lo("$s0")`,$acc2
190 mov 0($sbox,$acc0,8),$acc0 #$t0
191 mov 0($sbox,$acc1,8),$acc1 #$t1
192 mov 0($sbox,$acc2,8),$acc2 #$t2
193
194 and \$0x00ff0000,$acc0
195 and \$0x00ff0000,$acc1
196 and \$0x00ff0000,$acc2
197
198 xor $acc0,$t0
199 xor $acc1,$t1
200 xor $acc2,$t2
201
202 movzb `&lo("$s1")`,$acc0
203 movzb `&hi("$s3")`,$acc1
204 movzb `&hi("$s0")`,$acc2
205 mov 0($sbox,$acc0,8),$acc0 #$t3
206 mov 2($sbox,$acc1,8),$acc1 #$t0
207 mov 2($sbox,$acc2,8),$acc2 #$t1
208
209 and \$0x00ff0000,$acc0
210 and \$0xff000000,$acc1
211 and \$0xff000000,$acc2
212
213 xor $acc0,$t3
214 xor $acc1,$t0
215 xor $acc2,$t1
216
217 movzb `&hi("$s1")`,$acc0
218 movzb `&hi("$s2")`,$acc1
219 mov 16+12($key),$s3
220 mov 2($sbox,$acc0,8),$acc0 #$t2
221 mov 2($sbox,$acc1,8),$acc1 #$t3
222 mov 16+0($key),$s0
223
224 and \$0xff000000,$acc0
225 and \$0xff000000,$acc1
226
227 xor $acc0,$t2
228 xor $acc1,$t3
229
230 mov 16+4($key),$s1
231 mov 16+8($key),$s2
232 xor $t0,$s0
233 xor $t1,$s1
234 xor $t2,$s2
235 xor $t3,$s3
236___
237}
238
239sub encstep()
240{ my ($i,@s) = @_;
241 my $tmp0=$acc0;
242 my $tmp1=$acc1;
243 my $tmp2=$acc2;
244 my $out=($t0,$t1,$t2,$s[0])[$i];
245
246 if ($i==3) {
247 $tmp0=$s[1];
248 $tmp1=$s[2];
249 $tmp2=$s[3];
250 }
251 $code.=" movzb ".&lo($s[0]).",$out\n";
252 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
253 $code.=" lea 16($key),$key\n" if ($i==0);
254
255 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
256 $code.=" mov 0($sbox,$out,8),$out\n";
257
258 $code.=" shr \$16,$tmp1\n";
259 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
260 $code.=" xor 3($sbox,$tmp0,8),$out\n";
261
262 $code.=" movzb ".&lo($tmp1).",$tmp1\n";
263 $code.=" shr \$24,$tmp2\n";
264 $code.=" xor 4*$i($key),$out\n";
265
266 $code.=" xor 2($sbox,$tmp1,8),$out\n";
267 $code.=" xor 1($sbox,$tmp2,8),$out\n";
268
269 $code.=" mov $t0,$s[1]\n" if ($i==3);
270 $code.=" mov $t1,$s[2]\n" if ($i==3);
271 $code.=" mov $t2,$s[3]\n" if ($i==3);
272 $code.="\n";
273}
274
275sub enclast()
276{ my ($i,@s)=@_;
277 my $tmp0=$acc0;
278 my $tmp1=$acc1;
279 my $tmp2=$acc2;
280 my $out=($t0,$t1,$t2,$s[0])[$i];
281
282 if ($i==3) {
283 $tmp0=$s[1];
284 $tmp1=$s[2];
285 $tmp2=$s[3];
286 }
287 $code.=" movzb ".&lo($s[0]).",$out\n";
288 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
289
290 $code.=" mov 2($sbox,$out,8),$out\n";
291 $code.=" shr \$16,$tmp1\n";
292 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
293
294 $code.=" and \$0x000000ff,$out\n";
295 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
296 $code.=" movzb ".&lo($tmp1).",$tmp1\n";
297 $code.=" shr \$24,$tmp2\n";
298
299 $code.=" mov 0($sbox,$tmp0,8),$tmp0\n";
300 $code.=" mov 0($sbox,$tmp1,8),$tmp1\n";
301 $code.=" mov 2($sbox,$tmp2,8),$tmp2\n";
302
303 $code.=" and \$0x0000ff00,$tmp0\n";
304 $code.=" and \$0x00ff0000,$tmp1\n";
305 $code.=" and \$0xff000000,$tmp2\n";
306
307 $code.=" xor $tmp0,$out\n";
308 $code.=" mov $t0,$s[1]\n" if ($i==3);
309 $code.=" xor $tmp1,$out\n";
310 $code.=" mov $t1,$s[2]\n" if ($i==3);
311 $code.=" xor $tmp2,$out\n";
312 $code.=" mov $t2,$s[3]\n" if ($i==3);
313 $code.="\n";
314}
315
316$code.=<<___;
317.type _x86_64_AES_encrypt,\@abi-omnipotent
318.align 16
319_x86_64_AES_encrypt:
320 xor 0($key),$s0 # xor with key
321 xor 4($key),$s1
322 xor 8($key),$s2
323 xor 12($key),$s3
324
325 mov 240($key),$rnds # load key->rounds
326 sub \$1,$rnds
327 jmp .Lenc_loop
328.align 16
329.Lenc_loop:
330___
331 if ($verticalspin) { &encvert(); }
332 else { &encstep(0,$s0,$s1,$s2,$s3);
333 &encstep(1,$s1,$s2,$s3,$s0);
334 &encstep(2,$s2,$s3,$s0,$s1);
335 &encstep(3,$s3,$s0,$s1,$s2);
336 }
337$code.=<<___;
338 sub \$1,$rnds
339 jnz .Lenc_loop
340___
341 if ($verticalspin) { &enclastvert(); }
342 else { &enclast(0,$s0,$s1,$s2,$s3);
343 &enclast(1,$s1,$s2,$s3,$s0);
344 &enclast(2,$s2,$s3,$s0,$s1);
345 &enclast(3,$s3,$s0,$s1,$s2);
346 $code.=<<___;
347 xor 16+0($key),$s0 # xor with key
348 xor 16+4($key),$s1
349 xor 16+8($key),$s2
350 xor 16+12($key),$s3
351___
352 }
353$code.=<<___;
354 .byte 0xf3,0xc3 # rep ret
355.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
356___
357
358# it's possible to implement this by shifting tN by 8, filling least
359# significant byte with byte load and finally bswap-ing at the end,
360# but such partial register load kills Core 2...
361sub enccompactvert()
362{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
363
364$code.=<<___;
365 movzb `&lo("$s0")`,$t0
366 movzb `&lo("$s1")`,$t1
367 movzb `&lo("$s2")`,$t2
368 movzb ($sbox,$t0,1),$t0
369 movzb ($sbox,$t1,1),$t1
370 movzb ($sbox,$t2,1),$t2
371
372 movzb `&lo("$s3")`,$t3
373 movzb `&hi("$s1")`,$acc0
374 movzb `&hi("$s2")`,$acc1
375 movzb ($sbox,$t3,1),$t3
376 movzb ($sbox,$acc0,1),$t4 #$t0
377 movzb ($sbox,$acc1,1),$t5 #$t1
378
379 movzb `&hi("$s3")`,$acc2
380 movzb `&hi("$s0")`,$acc0
381 shr \$16,$s2
382 movzb ($sbox,$acc2,1),$acc2 #$t2
383 movzb ($sbox,$acc0,1),$acc0 #$t3
384 shr \$16,$s3
385
386 movzb `&lo("$s2")`,$acc1
387 shl \$8,$t4
388 shl \$8,$t5
389 movzb ($sbox,$acc1,1),$acc1 #$t0
390 xor $t4,$t0
391 xor $t5,$t1
392
393 movzb `&lo("$s3")`,$t4
394 shr \$16,$s0
395 shr \$16,$s1
396 movzb `&lo("$s0")`,$t5
397 shl \$8,$acc2
398 shl \$8,$acc0
399 movzb ($sbox,$t4,1),$t4 #$t1
400 movzb ($sbox,$t5,1),$t5 #$t2
401 xor $acc2,$t2
402 xor $acc0,$t3
403
404 movzb `&lo("$s1")`,$acc2
405 movzb `&hi("$s3")`,$acc0
406 shl \$16,$acc1
407 movzb ($sbox,$acc2,1),$acc2 #$t3
408 movzb ($sbox,$acc0,1),$acc0 #$t0
409 xor $acc1,$t0
410
411 movzb `&hi("$s0")`,$acc1
412 shr \$8,$s2
413 shr \$8,$s1
414 movzb ($sbox,$acc1,1),$acc1 #$t1
415 movzb ($sbox,$s2,1),$s3 #$t3
416 movzb ($sbox,$s1,1),$s2 #$t2
417 shl \$16,$t4
418 shl \$16,$t5
419 shl \$16,$acc2
420 xor $t4,$t1
421 xor $t5,$t2
422 xor $acc2,$t3
423
424 shl \$24,$acc0
425 shl \$24,$acc1
426 shl \$24,$s3
427 xor $acc0,$t0
428 shl \$24,$s2
429 xor $acc1,$t1
430 mov $t0,$s0
431 mov $t1,$s1
432 xor $t2,$s2
433 xor $t3,$s3
434___
435}
436
437sub enctransform_ref()
438{ my $sn = shift;
439 my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
440
441$code.=<<___;
442 mov $sn,$acc
443 and \$0x80808080,$acc
444 mov $acc,$tmp
445 shr \$7,$tmp
446 lea ($sn,$sn),$r2
447 sub $tmp,$acc
448 and \$0xfefefefe,$r2
449 and \$0x1b1b1b1b,$acc
450 mov $sn,$tmp
451 xor $acc,$r2
452
453 xor $r2,$sn
454 rol \$24,$sn
455 xor $r2,$sn
456 ror \$16,$tmp
457 xor $tmp,$sn
458 ror \$8,$tmp
459 xor $tmp,$sn
460___
461}
462
463# unlike decrypt case it does not pay off to parallelize enctransform
464sub enctransform()
465{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
466
467$code.=<<___;
468 mov $s0,$acc0
469 mov $s1,$acc1
470 and \$0x80808080,$acc0
471 and \$0x80808080,$acc1
472 mov $acc0,$t0
473 mov $acc1,$t1
474 shr \$7,$t0
475 lea ($s0,$s0),$r20
476 shr \$7,$t1
477 lea ($s1,$s1),$r21
478 sub $t0,$acc0
479 sub $t1,$acc1
480 and \$0xfefefefe,$r20
481 and \$0xfefefefe,$r21
482 and \$0x1b1b1b1b,$acc0
483 and \$0x1b1b1b1b,$acc1
484 mov $s0,$t0
485 mov $s1,$t1
486 xor $acc0,$r20
487 xor $acc1,$r21
488
489 xor $r20,$s0
490 xor $r21,$s1
491 mov $s2,$acc0
492 mov $s3,$acc1
493 rol \$24,$s0
494 rol \$24,$s1
495 and \$0x80808080,$acc0
496 and \$0x80808080,$acc1
497 xor $r20,$s0
498 xor $r21,$s1
499 mov $acc0,$t2
500 mov $acc1,$t3
501 ror \$16,$t0
502 ror \$16,$t1
503 shr \$7,$t2
504 lea ($s2,$s2),$r20
505 xor $t0,$s0
506 xor $t1,$s1
507 shr \$7,$t3
508 lea ($s3,$s3),$r21
509 ror \$8,$t0
510 ror \$8,$t1
511 sub $t2,$acc0
512 sub $t3,$acc1
513 xor $t0,$s0
514 xor $t1,$s1
515
516 and \$0xfefefefe,$r20
517 and \$0xfefefefe,$r21
518 and \$0x1b1b1b1b,$acc0
519 and \$0x1b1b1b1b,$acc1
520 mov $s2,$t2
521 mov $s3,$t3
522 xor $acc0,$r20
523 xor $acc1,$r21
524
525 xor $r20,$s2
526 xor $r21,$s3
527 rol \$24,$s2
528 rol \$24,$s3
529 xor $r20,$s2
530 xor $r21,$s3
531 mov 0($sbox),$acc0 # prefetch Te4
532 ror \$16,$t2
533 ror \$16,$t3
534 mov 64($sbox),$acc1
535 xor $t2,$s2
536 xor $t3,$s3
537 mov 128($sbox),$r20
538 ror \$8,$t2
539 ror \$8,$t3
540 mov 192($sbox),$r21
541 xor $t2,$s2
542 xor $t3,$s3
543___
544}
545
546$code.=<<___;
547.type _x86_64_AES_encrypt_compact,\@abi-omnipotent
548.align 16
549_x86_64_AES_encrypt_compact:
550 lea 128($sbox),$inp # size optimization
551 mov 0-128($inp),$acc1 # prefetch Te4
552 mov 32-128($inp),$acc2
553 mov 64-128($inp),$t0
554 mov 96-128($inp),$t1
555 mov 128-128($inp),$acc1
556 mov 160-128($inp),$acc2
557 mov 192-128($inp),$t0
558 mov 224-128($inp),$t1
559 jmp .Lenc_loop_compact
560.align 16
561.Lenc_loop_compact:
562 xor 0($key),$s0 # xor with key
563 xor 4($key),$s1
564 xor 8($key),$s2
565 xor 12($key),$s3
566 lea 16($key),$key
567___
568 &enccompactvert();
569$code.=<<___;
570 cmp 16(%rsp),$key
571 je .Lenc_compact_done
572___
573 &enctransform();
574$code.=<<___;
575 jmp .Lenc_loop_compact
576.align 16
577.Lenc_compact_done:
578 xor 0($key),$s0
579 xor 4($key),$s1
580 xor 8($key),$s2
581 xor 12($key),$s3
582 .byte 0xf3,0xc3 # rep ret
583.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
584___
585
586# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
587$code.=<<___;
588.globl AES_encrypt
589.type AES_encrypt,\@function,3
590.align 16
591.globl asm_AES_encrypt
592.hidden asm_AES_encrypt
593asm_AES_encrypt:
594AES_encrypt:
595 push %rbx
596 push %rbp
597 push %r12
598 push %r13
599 push %r14
600 push %r15
601
602 # allocate frame "above" key schedule
603 mov %rsp,%r10
604 lea -63(%rdx),%rcx # %rdx is key argument
605 and \$-64,%rsp
606 sub %rsp,%rcx
607 neg %rcx
608 and \$0x3c0,%rcx
609 sub %rcx,%rsp
610 sub \$32,%rsp
611
612 mov %rsi,16(%rsp) # save out
613 mov %r10,24(%rsp) # save real stack pointer
614.Lenc_prologue:
615
616 mov %rdx,$key
617 mov 240($key),$rnds # load rounds
618
619 mov 0(%rdi),$s0 # load input vector
620 mov 4(%rdi),$s1
621 mov 8(%rdi),$s2
622 mov 12(%rdi),$s3
623
624 shl \$4,$rnds
625 lea ($key,$rnds),%rbp
626 mov $key,(%rsp) # key schedule
627 mov %rbp,8(%rsp) # end of key schedule
628
629 # pick Te4 copy which can't "overlap" with stack frame or key schedule
630 lea .LAES_Te+2048(%rip),$sbox
631 lea 768(%rsp),%rbp
632 sub $sbox,%rbp
633 and \$0x300,%rbp
634 lea ($sbox,%rbp),$sbox
635
636 call _x86_64_AES_encrypt_compact
637
638 mov 16(%rsp),$out # restore out
639 mov 24(%rsp),%rsi # restore saved stack pointer
640 mov $s0,0($out) # write output vector
641 mov $s1,4($out)
642 mov $s2,8($out)
643 mov $s3,12($out)
644
645 mov (%rsi),%r15
646 mov 8(%rsi),%r14
647 mov 16(%rsi),%r13
648 mov 24(%rsi),%r12
649 mov 32(%rsi),%rbp
650 mov 40(%rsi),%rbx
651 lea 48(%rsi),%rsp
652.Lenc_epilogue:
653 ret
654.size AES_encrypt,.-AES_encrypt
655___
656
657#------------------------------------------------------------------#
658
659sub decvert()
660{ my $t3="%r8d"; # zaps $inp!
661
662$code.=<<___;
663 # favor 3-way issue Opteron pipeline...
664 movzb `&lo("$s0")`,$acc0
665 movzb `&lo("$s1")`,$acc1
666 movzb `&lo("$s2")`,$acc2
667 mov 0($sbox,$acc0,8),$t0
668 mov 0($sbox,$acc1,8),$t1
669 mov 0($sbox,$acc2,8),$t2
670
671 movzb `&hi("$s3")`,$acc0
672 movzb `&hi("$s0")`,$acc1
673 movzb `&lo("$s3")`,$acc2
674 xor 3($sbox,$acc0,8),$t0
675 xor 3($sbox,$acc1,8),$t1
676 mov 0($sbox,$acc2,8),$t3
677
678 movzb `&hi("$s1")`,$acc0
679 shr \$16,$s0
680 movzb `&hi("$s2")`,$acc2
681 xor 3($sbox,$acc0,8),$t2
682 shr \$16,$s3
683 xor 3($sbox,$acc2,8),$t3
684
685 shr \$16,$s1
686 lea 16($key),$key
687 shr \$16,$s2
688
689 movzb `&lo("$s2")`,$acc0
690 movzb `&lo("$s3")`,$acc1
691 movzb `&lo("$s0")`,$acc2
692 xor 2($sbox,$acc0,8),$t0
693 xor 2($sbox,$acc1,8),$t1
694 xor 2($sbox,$acc2,8),$t2
695
696 movzb `&hi("$s1")`,$acc0
697 movzb `&hi("$s2")`,$acc1
698 movzb `&lo("$s1")`,$acc2
699 xor 1($sbox,$acc0,8),$t0
700 xor 1($sbox,$acc1,8),$t1
701 xor 2($sbox,$acc2,8),$t3
702
703 movzb `&hi("$s3")`,$acc0
704 mov 12($key),$s3
705 movzb `&hi("$s0")`,$acc2
706 xor 1($sbox,$acc0,8),$t2
707 mov 0($key),$s0
708 xor 1($sbox,$acc2,8),$t3
709
710 xor $t0,$s0
711 mov 4($key),$s1
712 mov 8($key),$s2
713 xor $t2,$s2
714 xor $t1,$s1
715 xor $t3,$s3
716___
717}
718
719sub declastvert()
720{ my $t3="%r8d"; # zaps $inp!
721
722$code.=<<___;
723 lea 2048($sbox),$sbox # size optimization
724 movzb `&lo("$s0")`,$acc0
725 movzb `&lo("$s1")`,$acc1
726 movzb `&lo("$s2")`,$acc2
727 movzb ($sbox,$acc0,1),$t0
728 movzb ($sbox,$acc1,1),$t1
729 movzb ($sbox,$acc2,1),$t2
730
731 movzb `&lo("$s3")`,$acc0
732 movzb `&hi("$s3")`,$acc1
733 movzb `&hi("$s0")`,$acc2
734 movzb ($sbox,$acc0,1),$t3
735 movzb ($sbox,$acc1,1),$acc1 #$t0
736 movzb ($sbox,$acc2,1),$acc2 #$t1
737
738 shl \$8,$acc1
739 shl \$8,$acc2
740
741 xor $acc1,$t0
742 xor $acc2,$t1
743 shr \$16,$s3
744
745 movzb `&hi("$s1")`,$acc0
746 movzb `&hi("$s2")`,$acc1
747 shr \$16,$s0
748 movzb ($sbox,$acc0,1),$acc0 #$t2
749 movzb ($sbox,$acc1,1),$acc1 #$t3
750
751 shl \$8,$acc0
752 shl \$8,$acc1
753 shr \$16,$s1
754 xor $acc0,$t2
755 xor $acc1,$t3
756 shr \$16,$s2
757
758 movzb `&lo("$s2")`,$acc0
759 movzb `&lo("$s3")`,$acc1
760 movzb `&lo("$s0")`,$acc2
761 movzb ($sbox,$acc0,1),$acc0 #$t0
762 movzb ($sbox,$acc1,1),$acc1 #$t1
763 movzb ($sbox,$acc2,1),$acc2 #$t2
764
765 shl \$16,$acc0
766 shl \$16,$acc1
767 shl \$16,$acc2
768
769 xor $acc0,$t0
770 xor $acc1,$t1
771 xor $acc2,$t2
772
773 movzb `&lo("$s1")`,$acc0
774 movzb `&hi("$s1")`,$acc1
775 movzb `&hi("$s2")`,$acc2
776 movzb ($sbox,$acc0,1),$acc0 #$t3
777 movzb ($sbox,$acc1,1),$acc1 #$t0
778 movzb ($sbox,$acc2,1),$acc2 #$t1
779
780 shl \$16,$acc0
781 shl \$24,$acc1
782 shl \$24,$acc2
783
784 xor $acc0,$t3
785 xor $acc1,$t0
786 xor $acc2,$t1
787
788 movzb `&hi("$s3")`,$acc0
789 movzb `&hi("$s0")`,$acc1
790 mov 16+12($key),$s3
791 movzb ($sbox,$acc0,1),$acc0 #$t2
792 movzb ($sbox,$acc1,1),$acc1 #$t3
793 mov 16+0($key),$s0
794
795 shl \$24,$acc0
796 shl \$24,$acc1
797
798 xor $acc0,$t2
799 xor $acc1,$t3
800
801 mov 16+4($key),$s1
802 mov 16+8($key),$s2
803 lea -2048($sbox),$sbox
804 xor $t0,$s0
805 xor $t1,$s1
806 xor $t2,$s2
807 xor $t3,$s3
808___
809}
810
811sub decstep()
812{ my ($i,@s) = @_;
813 my $tmp0=$acc0;
814 my $tmp1=$acc1;
815 my $tmp2=$acc2;
816 my $out=($t0,$t1,$t2,$s[0])[$i];
817
818 $code.=" mov $s[0],$out\n" if ($i!=3);
819 $tmp1=$s[2] if ($i==3);
820 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
821 $code.=" and \$0xFF,$out\n";
822
823 $code.=" mov 0($sbox,$out,8),$out\n";
824 $code.=" shr \$16,$tmp1\n";
825 $tmp2=$s[3] if ($i==3);
826 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
827
828 $tmp0=$s[1] if ($i==3);
829 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
830 $code.=" and \$0xFF,$tmp1\n";
831 $code.=" shr \$24,$tmp2\n";
832
833 $code.=" xor 3($sbox,$tmp0,8),$out\n";
834 $code.=" xor 2($sbox,$tmp1,8),$out\n";
835 $code.=" xor 1($sbox,$tmp2,8),$out\n";
836
837 $code.=" mov $t2,$s[1]\n" if ($i==3);
838 $code.=" mov $t1,$s[2]\n" if ($i==3);
839 $code.=" mov $t0,$s[3]\n" if ($i==3);
840 $code.="\n";
841}
842
843sub declast()
844{ my ($i,@s)=@_;
845 my $tmp0=$acc0;
846 my $tmp1=$acc1;
847 my $tmp2=$acc2;
848 my $out=($t0,$t1,$t2,$s[0])[$i];
849
850 $code.=" mov $s[0],$out\n" if ($i!=3);
851 $tmp1=$s[2] if ($i==3);
852 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
853 $code.=" and \$0xFF,$out\n";
854
855 $code.=" movzb 2048($sbox,$out,1),$out\n";
856 $code.=" shr \$16,$tmp1\n";
857 $tmp2=$s[3] if ($i==3);
858 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
859
860 $tmp0=$s[1] if ($i==3);
861 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
862 $code.=" and \$0xFF,$tmp1\n";
863 $code.=" shr \$24,$tmp2\n";
864
865 $code.=" movzb 2048($sbox,$tmp0,1),$tmp0\n";
866 $code.=" movzb 2048($sbox,$tmp1,1),$tmp1\n";
867 $code.=" movzb 2048($sbox,$tmp2,1),$tmp2\n";
868
869 $code.=" shl \$8,$tmp0\n";
870 $code.=" shl \$16,$tmp1\n";
871 $code.=" shl \$24,$tmp2\n";
872
873 $code.=" xor $tmp0,$out\n";
874 $code.=" mov $t2,$s[1]\n" if ($i==3);
875 $code.=" xor $tmp1,$out\n";
876 $code.=" mov $t1,$s[2]\n" if ($i==3);
877 $code.=" xor $tmp2,$out\n";
878 $code.=" mov $t0,$s[3]\n" if ($i==3);
879 $code.="\n";
880}
881
882$code.=<<___;
883.type _x86_64_AES_decrypt,\@abi-omnipotent
884.align 16
885_x86_64_AES_decrypt:
886 xor 0($key),$s0 # xor with key
887 xor 4($key),$s1
888 xor 8($key),$s2
889 xor 12($key),$s3
890
891 mov 240($key),$rnds # load key->rounds
892 sub \$1,$rnds
893 jmp .Ldec_loop
894.align 16
895.Ldec_loop:
896___
897 if ($verticalspin) { &decvert(); }
898 else { &decstep(0,$s0,$s3,$s2,$s1);
899 &decstep(1,$s1,$s0,$s3,$s2);
900 &decstep(2,$s2,$s1,$s0,$s3);
901 &decstep(3,$s3,$s2,$s1,$s0);
902 $code.=<<___;
903 lea 16($key),$key
904 xor 0($key),$s0 # xor with key
905 xor 4($key),$s1
906 xor 8($key),$s2
907 xor 12($key),$s3
908___
909 }
910$code.=<<___;
911 sub \$1,$rnds
912 jnz .Ldec_loop
913___
914 if ($verticalspin) { &declastvert(); }
915 else { &declast(0,$s0,$s3,$s2,$s1);
916 &declast(1,$s1,$s0,$s3,$s2);
917 &declast(2,$s2,$s1,$s0,$s3);
918 &declast(3,$s3,$s2,$s1,$s0);
919 $code.=<<___;
920 xor 16+0($key),$s0 # xor with key
921 xor 16+4($key),$s1
922 xor 16+8($key),$s2
923 xor 16+12($key),$s3
924___
925 }
926$code.=<<___;
927 .byte 0xf3,0xc3 # rep ret
928.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
929___
930
# deccompactvert() emits one "compact" decryption round body: the
# byte substitution is done with byte-at-a-time movzb lookups into the
# 256-byte table at ($sbox) (the caller prefetches it as Td4), with
# the InvShiftRows permutation folded into which source byte feeds
# which destination word.  State enters in $s0..$s3 and the
# substituted/permuted words end up back in $s0..$s3; $t0..$t2 and
# the $acc* registers are clobbered as scratch.  The instruction
# schedule deliberately interleaves address formation, loads, shifts
# and xors — do not reorder.
931sub deccompactvert()
932{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
933
934$code.=<<___;
935 movzb `&lo("$s0")`,$t0
936 movzb `&lo("$s1")`,$t1
937 movzb `&lo("$s2")`,$t2
938 movzb ($sbox,$t0,1),$t0
939 movzb ($sbox,$t1,1),$t1
940 movzb ($sbox,$t2,1),$t2
941
942 movzb `&lo("$s3")`,$t3
943 movzb `&hi("$s3")`,$acc0
944 movzb `&hi("$s0")`,$acc1
945 movzb ($sbox,$t3,1),$t3
946 movzb ($sbox,$acc0,1),$t4 #$t0
947 movzb ($sbox,$acc1,1),$t5 #$t1
948
949 movzb `&hi("$s1")`,$acc2
950 movzb `&hi("$s2")`,$acc0
951 shr \$16,$s2
952 movzb ($sbox,$acc2,1),$acc2 #$t2
953 movzb ($sbox,$acc0,1),$acc0 #$t3
954 shr \$16,$s3
955
956 movzb `&lo("$s2")`,$acc1
957 shl \$8,$t4
958 shl \$8,$t5
959 movzb ($sbox,$acc1,1),$acc1 #$t0
960 xor $t4,$t0
961 xor $t5,$t1
962
963 movzb `&lo("$s3")`,$t4
964 shr \$16,$s0
965 shr \$16,$s1
966 movzb `&lo("$s0")`,$t5
967 shl \$8,$acc2
968 shl \$8,$acc0
969 movzb ($sbox,$t4,1),$t4 #$t1
970 movzb ($sbox,$t5,1),$t5 #$t2
971 xor $acc2,$t2
972 xor $acc0,$t3
973
974 movzb `&lo("$s1")`,$acc2
975 movzb `&hi("$s1")`,$acc0
976 shl \$16,$acc1
977 movzb ($sbox,$acc2,1),$acc2 #$t3
978 movzb ($sbox,$acc0,1),$acc0 #$t0
979 xor $acc1,$t0
980
981 movzb `&hi("$s2")`,$acc1
982 shl \$16,$t4
983 shl \$16,$t5
984 movzb ($sbox,$acc1,1),$s1 #$t1
985 xor $t4,$t1
986 xor $t5,$t2
987
988 movzb `&hi("$s3")`,$acc1
989 shr \$8,$s0
990 shl \$16,$acc2
991 movzb ($sbox,$acc1,1),$s2 #$t2
992 movzb ($sbox,$s0,1),$s3 #$t3
993 xor $acc2,$t3
994
995 shl \$24,$acc0
996 shl \$24,$s1
997 shl \$24,$s2
998 xor $acc0,$t0
999 shl \$24,$s3
1000 xor $t1,$s1
1001 mov $t0,$s0
1002 xor $t2,$s2
1003 xor $t3,$s3
1004___
1005}
1006
# dectransform() emits the InvMixColumns-style transform used by the
# compact decryptor and by the decrypt key-schedule conversion.  It is
# the two-lane parallelized version: input is a pair of 64-bit values
# (%rax = s1.s0, %rcx = s3.s2) and output is four 32-bit values in
# %eax=s0, %ebx=s1, %ecx=s2 and %edx=s3.  Each of the three
# shr/lea/sub/and/xor groups below is a SIMD-within-a-register
# GF(2^8) doubling ("xtime") across 8 packed bytes, producing tp2,
# tp4 and tp8 from tp1; $mask80/$maskfe/$mask1b hold the packed
# 0x80…/0xfe…/0x1b… constants.  When $prefetch is true, loads from
# ($sbox) are interleaved to pull table cachelines (and reload the
# mask registers) for the next round.
1010sub dectransform()
1011{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
1012 my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
1013 my $prefetch = shift;
1014
1015$code.=<<___;
1016 mov $tp10,$acc0
1017 mov $tp18,$acc8
1018 and $mask80,$acc0
1019 and $mask80,$acc8
1020 mov $acc0,$tp40
1021 mov $acc8,$tp48
1022 shr \$7,$tp40
1023 lea ($tp10,$tp10),$tp20
1024 shr \$7,$tp48
1025 lea ($tp18,$tp18),$tp28
1026 sub $tp40,$acc0
1027 sub $tp48,$acc8
1028 and $maskfe,$tp20
1029 and $maskfe,$tp28
1030 and $mask1b,$acc0
1031 and $mask1b,$acc8
1032 xor $tp20,$acc0
1033 xor $tp28,$acc8
1034 mov $acc0,$tp20
1035 mov $acc8,$tp28
1036
1037 and $mask80,$acc0
1038 and $mask80,$acc8
1039 mov $acc0,$tp80
1040 mov $acc8,$tp88
1041 shr \$7,$tp80
1042 lea ($tp20,$tp20),$tp40
1043 shr \$7,$tp88
1044 lea ($tp28,$tp28),$tp48
1045 sub $tp80,$acc0
1046 sub $tp88,$acc8
1047 and $maskfe,$tp40
1048 and $maskfe,$tp48
1049 and $mask1b,$acc0
1050 and $mask1b,$acc8
1051 xor $tp40,$acc0
1052 xor $tp48,$acc8
1053 mov $acc0,$tp40
1054 mov $acc8,$tp48
1055
1056 and $mask80,$acc0
1057 and $mask80,$acc8
1058 mov $acc0,$tp80
1059 mov $acc8,$tp88
1060 shr \$7,$tp80
1061 xor $tp10,$tp20 # tp2^=tp1
1062 shr \$7,$tp88
1063 xor $tp18,$tp28 # tp2^=tp1
1064 sub $tp80,$acc0
1065 sub $tp88,$acc8
1066 lea ($tp40,$tp40),$tp80
1067 lea ($tp48,$tp48),$tp88
1068 xor $tp10,$tp40 # tp4^=tp1
1069 xor $tp18,$tp48 # tp4^=tp1
1070 and $maskfe,$tp80
1071 and $maskfe,$tp88
1072 and $mask1b,$acc0
1073 and $mask1b,$acc8
1074 xor $acc0,$tp80
1075 xor $acc8,$tp88
1076
1077 xor $tp80,$tp10 # tp1^=tp8
1078 xor $tp88,$tp18 # tp1^=tp8
1079 xor $tp80,$tp20 # tp2^tp1^=tp8
1080 xor $tp88,$tp28 # tp2^tp1^=tp8
1081 mov $tp10,$acc0
1082 mov $tp18,$acc8
1083 xor $tp80,$tp40 # tp4^tp1^=tp8
1084 xor $tp88,$tp48 # tp4^tp1^=tp8
1085 shr \$32,$acc0
1086 shr \$32,$acc8
1087 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
1088 xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
1089 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
1090 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
1091 xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1092 xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1093
1094 rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
1095 rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
1096 xor `&LO("$tp80")`,`&LO("$tp10")`
1097 xor `&LO("$tp88")`,`&LO("$tp18")`
1098 shr \$32,$tp80
1099 shr \$32,$tp88
1100 xor `&LO("$tp80")`,`&LO("$acc0")`
1101 xor `&LO("$tp88")`,`&LO("$acc8")`
1102
1103 mov $tp20,$tp80
1104 mov $tp28,$tp88
1105 shr \$32,$tp80
1106 shr \$32,$tp88
1107 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
1108 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
1109 rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
1110 rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
1111 xor `&LO("$tp20")`,`&LO("$tp10")`
1112 xor `&LO("$tp28")`,`&LO("$tp18")`
1113 mov $tp40,$tp20
1114 mov $tp48,$tp28
1115 xor `&LO("$tp80")`,`&LO("$acc0")`
1116 xor `&LO("$tp88")`,`&LO("$acc8")`
1117
1118 `"mov 0($sbox),$mask80" if ($prefetch)`
1119 shr \$32,$tp20
1120 shr \$32,$tp28
1121 `"mov 64($sbox),$maskfe" if ($prefetch)`
1122 rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
1123 rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
1124 `"mov 128($sbox),$mask1b" if ($prefetch)`
1125 rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
1126 rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
1127 `"mov 192($sbox),$tp80" if ($prefetch)`
1128 xor `&LO("$tp40")`,`&LO("$tp10")`
1129 xor `&LO("$tp48")`,`&LO("$tp18")`
1130 `"mov 256($sbox),$tp88" if ($prefetch)`
1131 xor `&LO("$tp20")`,`&LO("$acc0")`
1132 xor `&LO("$tp28")`,`&LO("$acc8")`
1133___
1134}
1135
# Emit _x86_64_AES_decrypt_compact: the cache-timing-hardened
# decryptor that works from the small Td4 byte table instead of the
# 1KB word tables.  The eight dummy loads below walk the whole
# 256-byte table into L1 before any key-dependent access (the
# destination registers are overwritten — only the cache side effect
# matters).  The loop runs until $key reaches the end-of-schedule
# pointer saved at 16(%rsp) by the caller; between rounds the four
# 32-bit words are repacked into %rax/%rcx and &dectransform(1)
# applies InvMixColumns (with table re-prefetch).
1136$code.=<<___;
1137.type _x86_64_AES_decrypt_compact,\@abi-omnipotent
1138.align 16
1139_x86_64_AES_decrypt_compact:
1140 lea 128($sbox),$inp # size optimization
1141 mov 0-128($inp),$acc1 # prefetch Td4
1142 mov 32-128($inp),$acc2
1143 mov 64-128($inp),$t0
1144 mov 96-128($inp),$t1
1145 mov 128-128($inp),$acc1
1146 mov 160-128($inp),$acc2
1147 mov 192-128($inp),$t0
1148 mov 224-128($inp),$t1
1149 jmp .Ldec_loop_compact
1150
1151.align 16
1152.Ldec_loop_compact:
1153 xor 0($key),$s0 # xor with key
1154 xor 4($key),$s1
1155 xor 8($key),$s2
1156 xor 12($key),$s3
1157 lea 16($key),$key
1158___
# Byte-wise InvSubBytes/InvShiftRows round body.
1159 &deccompactvert();
1160$code.=<<___;
1161 cmp 16(%rsp),$key
1162 je .Ldec_compact_done
1163
1164 mov 256+0($sbox),$mask80
1165 shl \$32,%rbx
1166 shl \$32,%rdx
1167 mov 256+8($sbox),$maskfe
1168 or %rbx,%rax
1169 or %rdx,%rcx
1170 mov 256+16($sbox),$mask1b
1171___
# InvMixColumns on both 64-bit lanes, with table prefetch enabled.
1172 &dectransform(1);
1173$code.=<<___;
1174 jmp .Ldec_loop_compact
1175.align 16
1176.Ldec_compact_done:
1177 xor 0($key),$s0
1178 xor 4($key),$s1
1179 xor 8($key),$s2
1180 xor 12($key),$s3
1181 .byte 0xf3,0xc3 # rep ret
1182.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
1183___
1184
# Emit the public AES_decrypt entry point (and its asm_AES_decrypt
# alias).  It saves the callee-saved registers, carves an aligned
# stack frame positioned relative to the key schedule so frame and
# schedule land in distinct cache lines, stores the key-schedule
# bounds at 0/8(%rsp) for the compact routine, selects one of the
# Td4 copies that cannot alias the frame modulo the L1 set size, and
# then calls _x86_64_AES_decrypt_compact for one block.
1185# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
1186$code.=<<___;
1187.globl AES_decrypt
1188.type AES_decrypt,\@function,3
1189.align 16
1190.globl asm_AES_decrypt
1191.hidden asm_AES_decrypt
1192asm_AES_decrypt:
1193AES_decrypt:
1194 push %rbx
1195 push %rbp
1196 push %r12
1197 push %r13
1198 push %r14
1199 push %r15
1200
1201 # allocate frame "above" key schedule
1202 mov %rsp,%r10
1203 lea -63(%rdx),%rcx # %rdx is key argument
1204 and \$-64,%rsp
1205 sub %rsp,%rcx
1206 neg %rcx
1207 and \$0x3c0,%rcx
1208 sub %rcx,%rsp
1209 sub \$32,%rsp
1210
1211 mov %rsi,16(%rsp) # save out
1212 mov %r10,24(%rsp) # save real stack pointer
1213.Ldec_prologue:
1214
1215 mov %rdx,$key
1216 mov 240($key),$rnds # load rounds
1217
1218 mov 0(%rdi),$s0 # load input vector
1219 mov 4(%rdi),$s1
1220 mov 8(%rdi),$s2
1221 mov 12(%rdi),$s3
1222
1223 shl \$4,$rnds
1224 lea ($key,$rnds),%rbp
1225 mov $key,(%rsp) # key schedule
1226 mov %rbp,8(%rsp) # end of key schedule
1227
1228 # pick Td4 copy which can't "overlap" with stack frame or key schedule
1229 lea .LAES_Td+2048(%rip),$sbox
1230 lea 768(%rsp),%rbp
1231 sub $sbox,%rbp
1232 and \$0x300,%rbp
1233 lea ($sbox,%rbp),$sbox
1234 shr \$3,%rbp # recall "magic" constants!
1235 add %rbp,$sbox
1236
1237 call _x86_64_AES_decrypt_compact
1238
1239 mov 16(%rsp),$out # restore out
1240 mov 24(%rsp),%rsi # restore saved stack pointer
1241 mov $s0,0($out) # write output vector
1242 mov $s1,4($out)
1243 mov $s2,8($out)
1244 mov $s3,12($out)
1245
1246 mov (%rsi),%r15
1247 mov 8(%rsi),%r14
1248 mov 16(%rsi),%r13
1249 mov 24(%rsi),%r12
1250 mov 32(%rsi),%rbp
1251 mov 40(%rsi),%rbx
1252 lea 48(%rsi),%rsp
1253.Ldec_epilogue:
1254 ret
1255.size AES_decrypt,.-AES_decrypt
1256___
1257#------------------------------------------------------------------#
1258
# enckey() emits one key-expansion step shared by the 128/192/256-bit
# paths: it applies RotWord+SubWord to the round-key word in %edx
# using byte lookups into the Te4 table at -128(%rbp), accumulates
# the substituted/rotated bytes into %eax, and finally xors in the
# round constant indexed by %ecx from the rcon array at
# 1024-128(%rbp).  Clobbers %esi and %ebx; %ecx is the loop counter
# maintained by the caller.
1259sub enckey()
1260{
1261$code.=<<___;
1262 movz %dl,%esi # rk[i]>>0
1263 movzb -128(%rbp,%rsi),%ebx
1264 movz %dh,%esi # rk[i]>>8
1265 shl \$24,%ebx
1266 xor %ebx,%eax
1267
1268 movzb -128(%rbp,%rsi),%ebx
1269 shr \$16,%edx
1270 movz %dl,%esi # rk[i]>>16
1271 xor %ebx,%eax
1272
1273 movzb -128(%rbp,%rsi),%ebx
1274 movz %dh,%esi # rk[i]>>24
1275 shl \$8,%ebx
1276 xor %ebx,%eax
1277
1278 movzb -128(%rbp,%rsi),%ebx
1279 shl \$16,%ebx
1280 xor %ebx,%eax
1281
1282 xor 1024-128(%rbp,%rcx,4),%eax # rcon
1283___
1284}
1285
# Emit private_AES_set_encrypt_key (the thin public wrapper) and the
# _x86_64_AES_set_encrypt_key worker it tail-calls into.  The worker
# validates the two pointer arguments, prefetches the Te4 table, and
# dispatches on the bit count to the 10/12/14-round expansion loops,
# each built around the &enckey() step above.  Returns 0 on success,
# -1 for a NULL pointer, -2 for an unsupported bit count (in %rax).
1286# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
1287# AES_KEY *key)
1288$code.=<<___;
1289.globl private_AES_set_encrypt_key
1290.type private_AES_set_encrypt_key,\@function,3
1291.align 16
1292private_AES_set_encrypt_key:
1293 push %rbx
1294 push %rbp
1295 push %r12 # redundant, but allows to share
1296 push %r13 # exception handler...
1297 push %r14
1298 push %r15
1299 sub \$8,%rsp
1300.Lenc_key_prologue:
1301
1302 call _x86_64_AES_set_encrypt_key
1303
1304 mov 8(%rsp),%r15
1305 mov 16(%rsp),%r14
1306 mov 24(%rsp),%r13
1307 mov 32(%rsp),%r12
1308 mov 40(%rsp),%rbp
1309 mov 48(%rsp),%rbx
1310 add \$56,%rsp
1311.Lenc_key_epilogue:
1312 ret
1313.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
1314
1315.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
1316.align 16
1317_x86_64_AES_set_encrypt_key:
1318 mov %esi,%ecx # %ecx=bits
1319 mov %rdi,%rsi # %rsi=userKey
1320 mov %rdx,%rdi # %rdi=key
1321
1322 test \$-1,%rsi
1323 jz .Lbadpointer
1324 test \$-1,%rdi
1325 jz .Lbadpointer
1326
1327 lea .LAES_Te(%rip),%rbp
1328 lea 2048+128(%rbp),%rbp
1329
1330 # prefetch Te4
1331 mov 0-128(%rbp),%eax
1332 mov 32-128(%rbp),%ebx
1333 mov 64-128(%rbp),%r8d
1334 mov 96-128(%rbp),%edx
1335 mov 128-128(%rbp),%eax
1336 mov 160-128(%rbp),%ebx
1337 mov 192-128(%rbp),%r8d
1338 mov 224-128(%rbp),%edx
1339
1340 cmp \$128,%ecx
1341 je .L10rounds
1342 cmp \$192,%ecx
1343 je .L12rounds
1344 cmp \$256,%ecx
1345 je .L14rounds
1346 mov \$-2,%rax # invalid number of bits
1347 jmp .Lexit
1348
1349.L10rounds:
1350 mov 0(%rsi),%rax # copy first 4 dwords
1351 mov 8(%rsi),%rdx
1352 mov %rax,0(%rdi)
1353 mov %rdx,8(%rdi)
1354
1355 shr \$32,%rdx
1356 xor %ecx,%ecx
1357 jmp .L10shortcut
1358.align 4
1359.L10loop:
1360 mov 0(%rdi),%eax # rk[0]
1361 mov 12(%rdi),%edx # rk[3]
1362.L10shortcut:
1363___
# RotWord+SubWord+rcon on the last word of the previous round key.
1364 &enckey ();
1365$code.=<<___;
1366 mov %eax,16(%rdi) # rk[4]
1367 xor 4(%rdi),%eax
1368 mov %eax,20(%rdi) # rk[5]
1369 xor 8(%rdi),%eax
1370 mov %eax,24(%rdi) # rk[6]
1371 xor 12(%rdi),%eax
1372 mov %eax,28(%rdi) # rk[7]
1373 add \$1,%ecx
1374 lea 16(%rdi),%rdi
1375 cmp \$10,%ecx
1376 jl .L10loop
1377
1378 movl \$10,80(%rdi) # setup number of rounds
1379 xor %rax,%rax
1380 jmp .Lexit
1381
1382.L12rounds:
1383 mov 0(%rsi),%rax # copy first 6 dwords
1384 mov 8(%rsi),%rbx
1385 mov 16(%rsi),%rdx
1386 mov %rax,0(%rdi)
1387 mov %rbx,8(%rdi)
1388 mov %rdx,16(%rdi)
1389
1390 shr \$32,%rdx
1391 xor %ecx,%ecx
1392 jmp .L12shortcut
1393.align 4
1394.L12loop:
1395 mov 0(%rdi),%eax # rk[0]
1396 mov 20(%rdi),%edx # rk[5]
1397.L12shortcut:
1398___
1399 &enckey ();
1400$code.=<<___;
1401 mov %eax,24(%rdi) # rk[6]
1402 xor 4(%rdi),%eax
1403 mov %eax,28(%rdi) # rk[7]
1404 xor 8(%rdi),%eax
1405 mov %eax,32(%rdi) # rk[8]
1406 xor 12(%rdi),%eax
1407 mov %eax,36(%rdi) # rk[9]
1408
1409 cmp \$7,%ecx
1410 je .L12break
1411 add \$1,%ecx
1412
1413 xor 16(%rdi),%eax
1414 mov %eax,40(%rdi) # rk[10]
1415 xor 20(%rdi),%eax
1416 mov %eax,44(%rdi) # rk[11]
1417
1418 lea 24(%rdi),%rdi
1419 jmp .L12loop
1420.L12break:
1421 movl \$12,72(%rdi) # setup number of rounds
1422 xor %rax,%rax
1423 jmp .Lexit
1424
1425.L14rounds:
1426 mov 0(%rsi),%rax # copy first 8 dwords
1427 mov 8(%rsi),%rbx
1428 mov 16(%rsi),%rcx
1429 mov 24(%rsi),%rdx
1430 mov %rax,0(%rdi)
1431 mov %rbx,8(%rdi)
1432 mov %rcx,16(%rdi)
1433 mov %rdx,24(%rdi)
1434
1435 shr \$32,%rdx
1436 xor %ecx,%ecx
1437 jmp .L14shortcut
1438.align 4
1439.L14loop:
1440 mov 0(%rdi),%eax # rk[0]
1441 mov 28(%rdi),%edx # rk[4]
1442.L14shortcut:
1443___
# NOTE(review): the "# rk[4]" annotation inside the generated text
# above looks off — 28(%rdi)/4 is word index 7 of the current 32-byte
# round-key block (rk[7]); the assembly itself is correct.
1444 &enckey ();
1445$code.=<<___;
1446 mov %eax,32(%rdi) # rk[8]
1447 xor 4(%rdi),%eax
1448 mov %eax,36(%rdi) # rk[9]
1449 xor 8(%rdi),%eax
1450 mov %eax,40(%rdi) # rk[10]
1451 xor 12(%rdi),%eax
1452 mov %eax,44(%rdi) # rk[11]
1453
1454 cmp \$6,%ecx
1455 je .L14break
1456 add \$1,%ecx
1457
1458 mov %eax,%edx
1459 mov 16(%rdi),%eax # rk[4]
1460 movz %dl,%esi # rk[11]>>0
1461 movzb -128(%rbp,%rsi),%ebx
1462 movz %dh,%esi # rk[11]>>8
1463 xor %ebx,%eax
1464
1465 movzb -128(%rbp,%rsi),%ebx
1466 shr \$16,%edx
1467 shl \$8,%ebx
1468 movz %dl,%esi # rk[11]>>16
1469 xor %ebx,%eax
1470
1471 movzb -128(%rbp,%rsi),%ebx
1472 movz %dh,%esi # rk[11]>>24
1473 shl \$16,%ebx
1474 xor %ebx,%eax
1475
1476 movzb -128(%rbp,%rsi),%ebx
1477 shl \$24,%ebx
1478 xor %ebx,%eax
1479
1480 mov %eax,48(%rdi) # rk[12]
1481 xor 20(%rdi),%eax
1482 mov %eax,52(%rdi) # rk[13]
1483 xor 24(%rdi),%eax
1484 mov %eax,56(%rdi) # rk[14]
1485 xor 28(%rdi),%eax
1486 mov %eax,60(%rdi) # rk[15]
1487
1488 lea 32(%rdi),%rdi
1489 jmp .L14loop
1490.L14break:
1491 movl \$14,48(%rdi) # setup number of rounds
1492 xor %rax,%rax
1493 jmp .Lexit
1494
1495.Lbadpointer:
1496 mov \$-1,%rax
1497.Lexit:
1498 .byte 0xf3,0xc3 # rep ret
1499.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
1500___
1501
# deckey_ref() emits the scalar, single-word reference version of the
# InvMixColumns key transform: load one round-key word from $i($ptr),
# derive tp2/tp4/tp8 with three xtime steps (0x80/0xfe/0x1b masks),
# combine with the three byte rotations, and store the result back in
# place.  NOTE(review): nothing in this window calls deckey_ref —
# private_AES_set_decrypt_key uses the two-lane &dectransform()
# instead; presumably kept as the readable reference.  Confirm before
# removing.
1502sub deckey_ref()
1503{ my ($i,$ptr,$te,$td) = @_;
1504 my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
1505$code.=<<___;
1506 mov $i($ptr),$tp1
1507 mov $tp1,$acc
1508 and \$0x80808080,$acc
1509 mov $acc,$tp4
1510 shr \$7,$tp4
1511 lea 0($tp1,$tp1),$tp2
1512 sub $tp4,$acc
1513 and \$0xfefefefe,$tp2
1514 and \$0x1b1b1b1b,$acc
1515 xor $tp2,$acc
1516 mov $acc,$tp2
1517
1518 and \$0x80808080,$acc
1519 mov $acc,$tp8
1520 shr \$7,$tp8
1521 lea 0($tp2,$tp2),$tp4
1522 sub $tp8,$acc
1523 and \$0xfefefefe,$tp4
1524 and \$0x1b1b1b1b,$acc
1525 xor $tp1,$tp2 # tp2^tp1
1526 xor $tp4,$acc
1527 mov $acc,$tp4
1528
1529 and \$0x80808080,$acc
1530 mov $acc,$tp8
1531 shr \$7,$tp8
1532 sub $tp8,$acc
1533 lea 0($tp4,$tp4),$tp8
1534 xor $tp1,$tp4 # tp4^tp1
1535 and \$0xfefefefe,$tp8
1536 and \$0x1b1b1b1b,$acc
1537 xor $acc,$tp8
1538
1539 xor $tp8,$tp1 # tp1^tp8
1540 rol \$8,$tp1 # ROTATE(tp1^tp8,8)
1541 xor $tp8,$tp2 # tp2^tp1^tp8
1542 xor $tp8,$tp4 # tp4^tp1^tp8
1543 xor $tp2,$tp8
1544 xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
1545
1546 xor $tp8,$tp1
1547 rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24)
1548 xor $tp2,$tp1
1549 rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16)
1550 xor $tp4,$tp1
1551
1552 mov $tp1,$i($ptr)
1553___
1554}
1555
# Emit private_AES_set_decrypt_key: build the encryption schedule via
# _x86_64_AES_set_encrypt_key (propagating its error code on failure),
# then convert it for the equivalent-inverse-cipher decryptor by
# (1) reversing the order of the 16-byte round keys in place
# (.Linvert swaps from both ends) and (2) running &dectransform()
# (InvMixColumns) over every round key except the first and last
# (.Lpermute, rounds-1 iterations).  The packed 0x80/0xfe/0x1b mask
# constants are loaded from fixed offsets past the rcon block in
# .LAES_Te.
1556# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
1557# AES_KEY *key)
1558$code.=<<___;
1559.globl private_AES_set_decrypt_key
1560.type private_AES_set_decrypt_key,\@function,3
1561.align 16
1562private_AES_set_decrypt_key:
1563 push %rbx
1564 push %rbp
1565 push %r12
1566 push %r13
1567 push %r14
1568 push %r15
1569 push %rdx # save key schedule
1570.Ldec_key_prologue:
1571
1572 call _x86_64_AES_set_encrypt_key
1573 mov (%rsp),%r8 # restore key schedule
1574 cmp \$0,%eax
1575 jne .Labort
1576
1577 mov 240(%r8),%r14d # pull number of rounds
1578 xor %rdi,%rdi
1579 lea (%rdi,%r14d,4),%rcx
1580 mov %r8,%rsi
1581 lea (%r8,%rcx,4),%rdi # pointer to last chunk
1582.align 4
1583.Linvert:
1584 mov 0(%rsi),%rax
1585 mov 8(%rsi),%rbx
1586 mov 0(%rdi),%rcx
1587 mov 8(%rdi),%rdx
1588 mov %rax,0(%rdi)
1589 mov %rbx,8(%rdi)
1590 mov %rcx,0(%rsi)
1591 mov %rdx,8(%rsi)
1592 lea 16(%rsi),%rsi
1593 lea -16(%rdi),%rdi
1594 cmp %rsi,%rdi
1595 jne .Linvert
1596
1597 lea .LAES_Te+2048+1024(%rip),%rax # rcon
1598
1599 mov 40(%rax),$mask80
1600 mov 48(%rax),$maskfe
1601 mov 56(%rax),$mask1b
1602
1603 mov %r8,$key
1604 sub \$1,%r14d
1605.align 4
1606.Lpermute:
1607 lea 16($key),$key
1608 mov 0($key),%rax
1609 mov 8($key),%rcx
1610___
# InvMixColumns both halves of the current round key (no prefetch).
1611 &dectransform ();
1612$code.=<<___;
1613 mov %eax,0($key)
1614 mov %ebx,4($key)
1615 mov %ecx,8($key)
1616 mov %edx,12($key)
1617 sub \$1,%r14d
1618 jnz .Lpermute
1619
1620 xor %rax,%rax
1621.Labort:
1622 mov 8(%rsp),%r15
1623 mov 16(%rsp),%r14
1624 mov 24(%rsp),%r13
1625 mov 32(%rsp),%r12
1626 mov 40(%rsp),%rbp
1627 mov 48(%rsp),%rbx
1628 add \$56,%rsp
1629.Ldec_key_epilogue:
1630 ret
1631.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
1632___
1633
# Emit AES_cbc_encrypt.  Two generated paths share this entry point:
# a "fast" path using the big-table _x86_64_AES_encrypt/_decrypt
# (taken for whole-block lengths >= $speed_limit when the
# OPENSSL_ia32cap_P hyper-threading bit 28 is clear, and with the key
# schedule optionally copied onto the stack to avoid L1 aliasing with
# AES_T[ed]), and a "slow" compact-table path that also handles
# partial trailing blocks for encryption and in-place decryption.
# The stack-slot names declared below index a frame built by xchg'ing
# %rsp with an aligned, alias-checked pointer.
1634# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
1635# size_t length, const AES_KEY *key,
1636# unsigned char *ivp,const int enc);
1637{
1638# stack frame layout
1639# -8(%rsp) return address
1640my $keyp="0(%rsp)"; # one to pass as $key
1641my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds])
1642my $_rsp="16(%rsp)"; # saved %rsp
1643my $_inp="24(%rsp)"; # copy of 1st parameter, inp
1644my $_out="32(%rsp)"; # copy of 2nd parameter, out
1645my $_len="40(%rsp)"; # copy of 3rd parameter, length
1646my $_key="48(%rsp)"; # copy of 4th parameter, key
1647my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp
1648my $ivec="64(%rsp)"; # ivec[16]
1649my $aes_key="80(%rsp)"; # copy of aes_key
1650my $mark="80+240(%rsp)"; # copy of aes_key->rounds
1651
1652$code.=<<___;
1653.globl AES_cbc_encrypt
1654.type AES_cbc_encrypt,\@function,6
1655.align 16
1656.extern OPENSSL_ia32cap_P
1657.globl asm_AES_cbc_encrypt
1658.hidden asm_AES_cbc_encrypt
1659asm_AES_cbc_encrypt:
1660AES_cbc_encrypt:
1661 cmp \$0,%rdx # check length
1662 je .Lcbc_epilogue
1663 pushfq
1664 push %rbx
1665 push %rbp
1666 push %r12
1667 push %r13
1668 push %r14
1669 push %r15
1670.Lcbc_prologue:
1671
1672 cld
1673 mov %r9d,%r9d # clear upper half of enc
1674
1675 lea .LAES_Te(%rip),$sbox
1676 cmp \$0,%r9
1677 jne .Lcbc_picked_te
1678 lea .LAES_Td(%rip),$sbox
1679.Lcbc_picked_te:
1680
1681 mov OPENSSL_ia32cap_P(%rip),%r10d
1682 cmp \$$speed_limit,%rdx
1683 jb .Lcbc_slow_prologue
1684 test \$15,%rdx
1685 jnz .Lcbc_slow_prologue
1686 bt \$28,%r10d
1687 jc .Lcbc_slow_prologue
1688
1689 # allocate aligned stack frame...
1690 lea -88-248(%rsp),$key
1691 and \$-64,$key
1692
1693 # ... and make sure it doesn't alias with AES_T[ed] modulo 4096
1694 mov $sbox,%r10
1695 lea 2304($sbox),%r11
1696 mov $key,%r12
1697 and \$0xFFF,%r10 # s = $sbox&0xfff
1698 and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff
1699 and \$0xFFF,%r12 # p = %rsp&0xfff
1700
1701 cmp %r11,%r12 # if (p=>e) %rsp =- (p-e);
1702 jb .Lcbc_te_break_out
1703 sub %r11,%r12
1704 sub %r12,$key
1705 jmp .Lcbc_te_ok
1706.Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz
1707 sub %r10,%r12
1708 and \$0xFFF,%r12
1709 add \$320,%r12
1710 sub %r12,$key
1711.align 4
1712.Lcbc_te_ok:
1713
1714 xchg %rsp,$key
1715 #add \$8,%rsp # reserve for return address!
1716 mov $key,$_rsp # save %rsp
1717.Lcbc_fast_body:
1718 mov %rdi,$_inp # save copy of inp
1719 mov %rsi,$_out # save copy of out
1720 mov %rdx,$_len # save copy of len
1721 mov %rcx,$_key # save copy of key
1722 mov %r8,$_ivp # save copy of ivp
1723 movl \$0,$mark # copy of aes_key->rounds = 0;
1724 mov %r8,%rbp # rearrange input arguments
1725 mov %r9,%rbx
1726 mov %rsi,$out
1727 mov %rdi,$inp
1728 mov %rcx,$key
1729
1730 mov 240($key),%eax # key->rounds
1731 # do we copy key schedule to stack?
1732 mov $key,%r10
1733 sub $sbox,%r10
1734 and \$0xfff,%r10
1735 cmp \$2304,%r10
1736 jb .Lcbc_do_ecopy
1737 cmp \$4096-248,%r10
1738 jb .Lcbc_skip_ecopy
1739.align 4
1740.Lcbc_do_ecopy:
1741 mov $key,%rsi
1742 lea $aes_key,%rdi
1743 lea $aes_key,$key
1744 mov \$240/8,%ecx
1745 .long 0x90A548F3 # rep movsq
1746 mov %eax,(%rdi) # copy aes_key->rounds
1747.Lcbc_skip_ecopy:
1748 mov $key,$keyp # save key pointer
1749
1750 mov \$18,%ecx
1751.align 4
1752.Lcbc_prefetch_te:
1753 mov 0($sbox),%r10
1754 mov 32($sbox),%r11
1755 mov 64($sbox),%r12
1756 mov 96($sbox),%r13
1757 lea 128($sbox),$sbox
1758 sub \$1,%ecx
1759 jnz .Lcbc_prefetch_te
1760 lea -2304($sbox),$sbox
1761
1762 cmp \$0,%rbx
1763 je .LFAST_DECRYPT
1764
1765#----------------------------- ENCRYPT -----------------------------#
1766 mov 0(%rbp),$s0 # load iv
1767 mov 4(%rbp),$s1
1768 mov 8(%rbp),$s2
1769 mov 12(%rbp),$s3
1770
1771.align 4
1772.Lcbc_fast_enc_loop:
1773 xor 0($inp),$s0
1774 xor 4($inp),$s1
1775 xor 8($inp),$s2
1776 xor 12($inp),$s3
1777 mov $keyp,$key # restore key
1778 mov $inp,$_inp # if ($verticalspin) save inp
1779
1780 call _x86_64_AES_encrypt
1781
1782 mov $_inp,$inp # if ($verticalspin) restore inp
1783 mov $_len,%r10
1784 mov $s0,0($out)
1785 mov $s1,4($out)
1786 mov $s2,8($out)
1787 mov $s3,12($out)
1788
1789 lea 16($inp),$inp
1790 lea 16($out),$out
1791 sub \$16,%r10
1792 test \$-16,%r10
1793 mov %r10,$_len
1794 jnz .Lcbc_fast_enc_loop
1795 mov $_ivp,%rbp # restore ivp
1796 mov $s0,0(%rbp) # save ivec
1797 mov $s1,4(%rbp)
1798 mov $s2,8(%rbp)
1799 mov $s3,12(%rbp)
1800
1801 jmp .Lcbc_fast_cleanup
1802
1803#----------------------------- DECRYPT -----------------------------#
1804.align 16
1805.LFAST_DECRYPT:
1806 cmp $inp,$out
1807 je .Lcbc_fast_dec_in_place
1808
1809 mov %rbp,$ivec
1810.align 4
1811.Lcbc_fast_dec_loop:
1812 mov 0($inp),$s0 # read input
1813 mov 4($inp),$s1
1814 mov 8($inp),$s2
1815 mov 12($inp),$s3
1816 mov $keyp,$key # restore key
1817 mov $inp,$_inp # if ($verticalspin) save inp
1818
1819 call _x86_64_AES_decrypt
1820
1821 mov $ivec,%rbp # load ivp
1822 mov $_inp,$inp # if ($verticalspin) restore inp
1823 mov $_len,%r10 # load len
1824 xor 0(%rbp),$s0 # xor iv
1825 xor 4(%rbp),$s1
1826 xor 8(%rbp),$s2
1827 xor 12(%rbp),$s3
1828 mov $inp,%rbp # current input, next iv
1829
1830 sub \$16,%r10
1831 mov %r10,$_len # update len
1832 mov %rbp,$ivec # update ivp
1833
1834 mov $s0,0($out) # write output
1835 mov $s1,4($out)
1836 mov $s2,8($out)
1837 mov $s3,12($out)
1838
1839 lea 16($inp),$inp
1840 lea 16($out),$out
1841 jnz .Lcbc_fast_dec_loop
1842 mov $_ivp,%r12 # load user ivp
1843 mov 0(%rbp),%r10 # load iv
1844 mov 8(%rbp),%r11
1845 mov %r10,0(%r12) # copy back to user
1846 mov %r11,8(%r12)
1847 jmp .Lcbc_fast_cleanup
1848
1849.align 16
1850.Lcbc_fast_dec_in_place:
1851 mov 0(%rbp),%r10 # copy iv to stack
1852 mov 8(%rbp),%r11
1853 mov %r10,0+$ivec
1854 mov %r11,8+$ivec
1855.align 4
1856.Lcbc_fast_dec_in_place_loop:
1857 mov 0($inp),$s0 # load input
1858 mov 4($inp),$s1
1859 mov 8($inp),$s2
1860 mov 12($inp),$s3
1861 mov $keyp,$key # restore key
1862 mov $inp,$_inp # if ($verticalspin) save inp
1863
1864 call _x86_64_AES_decrypt
1865
1866 mov $_inp,$inp # if ($verticalspin) restore inp
1867 mov $_len,%r10
1868 xor 0+$ivec,$s0
1869 xor 4+$ivec,$s1
1870 xor 8+$ivec,$s2
1871 xor 12+$ivec,$s3
1872
1873 mov 0($inp),%r11 # load input
1874 mov 8($inp),%r12
1875 sub \$16,%r10
1876 jz .Lcbc_fast_dec_in_place_done
1877
1878 mov %r11,0+$ivec # copy input to iv
1879 mov %r12,8+$ivec
1880
1881 mov $s0,0($out) # save output [zaps input]
1882 mov $s1,4($out)
1883 mov $s2,8($out)
1884 mov $s3,12($out)
1885
1886 lea 16($inp),$inp
1887 lea 16($out),$out
1888 mov %r10,$_len
1889 jmp .Lcbc_fast_dec_in_place_loop
1890.Lcbc_fast_dec_in_place_done:
1891 mov $_ivp,%rdi
1892 mov %r11,0(%rdi) # copy iv back to user
1893 mov %r12,8(%rdi)
1894
1895 mov $s0,0($out) # save output [zaps input]
1896 mov $s1,4($out)
1897 mov $s2,8($out)
1898 mov $s3,12($out)
1899
1900.align 4
1901.Lcbc_fast_cleanup:
1902 cmpl \$0,$mark # was the key schedule copied?
1903 lea $aes_key,%rdi
1904 je .Lcbc_exit
1905 mov \$240/8,%ecx
1906 xor %rax,%rax
1907 .long 0x90AB48F3 # rep stosq
1908
1909 jmp .Lcbc_exit
1910
1911#--------------------------- SLOW ROUTINE ---------------------------#
1912.align 16
1913.Lcbc_slow_prologue:
1914 # allocate aligned stack frame...
1915 lea -88(%rsp),%rbp
1916 and \$-64,%rbp
1917 # ... just "above" key schedule
1918 lea -88-63(%rcx),%r10
1919 sub %rbp,%r10
1920 neg %r10
1921 and \$0x3c0,%r10
1922 sub %r10,%rbp
1923
1924 xchg %rsp,%rbp
1925 #add \$8,%rsp # reserve for return address!
1926 mov %rbp,$_rsp # save %rsp
1927.Lcbc_slow_body:
1928 #mov %rdi,$_inp # save copy of inp
1929 #mov %rsi,$_out # save copy of out
1930 #mov %rdx,$_len # save copy of len
1931 #mov %rcx,$_key # save copy of key
1932 mov %r8,$_ivp # save copy of ivp
1933 mov %r8,%rbp # rearrange input arguments
1934 mov %r9,%rbx
1935 mov %rsi,$out
1936 mov %rdi,$inp
1937 mov %rcx,$key
1938 mov %rdx,%r10
1939
1940 mov 240($key),%eax
1941 mov $key,$keyp # save key pointer
1942 shl \$4,%eax
1943 lea ($key,%rax),%rax
1944 mov %rax,$keyend
1945
1946 # pick Te4 copy which can't "overlap" with stack frame or key scdedule
1947 lea 2048($sbox),$sbox
1948 lea 768-8(%rsp),%rax
1949 sub $sbox,%rax
1950 and \$0x300,%rax
1951 lea ($sbox,%rax),$sbox
1952
1953 cmp \$0,%rbx
1954 je .LSLOW_DECRYPT
1955
1956#--------------------------- SLOW ENCRYPT ---------------------------#
1957 test \$-16,%r10 # check upon length
1958 mov 0(%rbp),$s0 # load iv
1959 mov 4(%rbp),$s1
1960 mov 8(%rbp),$s2
1961 mov 12(%rbp),$s3
1962 jz .Lcbc_slow_enc_tail # short input...
1963
1964.align 4
1965.Lcbc_slow_enc_loop:
1966 xor 0($inp),$s0
1967 xor 4($inp),$s1
1968 xor 8($inp),$s2
1969 xor 12($inp),$s3
1970 mov $keyp,$key # restore key
1971 mov $inp,$_inp # save inp
1972 mov $out,$_out # save out
1973 mov %r10,$_len # save len
1974
1975 call _x86_64_AES_encrypt_compact
1976
1977 mov $_inp,$inp # restore inp
1978 mov $_out,$out # restore out
1979 mov $_len,%r10 # restore len
1980 mov $s0,0($out)
1981 mov $s1,4($out)
1982 mov $s2,8($out)
1983 mov $s3,12($out)
1984
1985 lea 16($inp),$inp
1986 lea 16($out),$out
1987 sub \$16,%r10
1988 test \$-16,%r10
1989 jnz .Lcbc_slow_enc_loop
1990 test \$15,%r10
1991 jnz .Lcbc_slow_enc_tail
1992 mov $_ivp,%rbp # restore ivp
1993 mov $s0,0(%rbp) # save ivec
1994 mov $s1,4(%rbp)
1995 mov $s2,8(%rbp)
1996 mov $s3,12(%rbp)
1997
1998 jmp .Lcbc_exit
1999
2000.align 4
2001.Lcbc_slow_enc_tail:
2002 mov %rax,%r11
2003 mov %rcx,%r12
2004 mov %r10,%rcx
2005 mov $inp,%rsi
2006 mov $out,%rdi
2007 .long 0x9066A4F3 # rep movsb
2008 mov \$16,%rcx # zero tail
2009 sub %r10,%rcx
2010 xor %rax,%rax
2011 .long 0x9066AAF3 # rep stosb
2012 mov $out,$inp # this is not a mistake!
2013 mov \$16,%r10 # len=16
2014 mov %r11,%rax
2015 mov %r12,%rcx
2016 jmp .Lcbc_slow_enc_loop # one more spin...
2017#--------------------------- SLOW DECRYPT ---------------------------#
2018.align 16
2019.LSLOW_DECRYPT:
2020 shr \$3,%rax
2021 add %rax,$sbox # recall "magic" constants!
2022
2023 mov 0(%rbp),%r11 # copy iv to stack
2024 mov 8(%rbp),%r12
2025 mov %r11,0+$ivec
2026 mov %r12,8+$ivec
2027
2028.align 4
2029.Lcbc_slow_dec_loop:
2030 mov 0($inp),$s0 # load input
2031 mov 4($inp),$s1
2032 mov 8($inp),$s2
2033 mov 12($inp),$s3
2034 mov $keyp,$key # restore key
2035 mov $inp,$_inp # save inp
2036 mov $out,$_out # save out
2037 mov %r10,$_len # save len
2038
2039 call _x86_64_AES_decrypt_compact
2040
2041 mov $_inp,$inp # restore inp
2042 mov $_out,$out # restore out
2043 mov $_len,%r10
2044 xor 0+$ivec,$s0
2045 xor 4+$ivec,$s1
2046 xor 8+$ivec,$s2
2047 xor 12+$ivec,$s3
2048
2049 mov 0($inp),%r11 # load input
2050 mov 8($inp),%r12
2051 sub \$16,%r10
2052 jc .Lcbc_slow_dec_partial
2053 jz .Lcbc_slow_dec_done
2054
2055 mov %r11,0+$ivec # copy input to iv
2056 mov %r12,8+$ivec
2057
2058 mov $s0,0($out) # save output [can zap input]
2059 mov $s1,4($out)
2060 mov $s2,8($out)
2061 mov $s3,12($out)
2062
2063 lea 16($inp),$inp
2064 lea 16($out),$out
2065 jmp .Lcbc_slow_dec_loop
2066.Lcbc_slow_dec_done:
2067 mov $_ivp,%rdi
2068 mov %r11,0(%rdi) # copy iv back to user
2069 mov %r12,8(%rdi)
2070
2071 mov $s0,0($out) # save output [can zap input]
2072 mov $s1,4($out)
2073 mov $s2,8($out)
2074 mov $s3,12($out)
2075
2076 jmp .Lcbc_exit
2077
2078.align 4
2079.Lcbc_slow_dec_partial:
2080 mov $_ivp,%rdi
2081 mov %r11,0(%rdi) # copy iv back to user
2082 mov %r12,8(%rdi)
2083
2084 mov $s0,0+$ivec # save output to stack
2085 mov $s1,4+$ivec
2086 mov $s2,8+$ivec
2087 mov $s3,12+$ivec
2088
2089 mov $out,%rdi
2090 lea $ivec,%rsi
2091 lea 16(%r10),%rcx
2092 .long 0x9066A4F3 # rep movsb
2093 jmp .Lcbc_exit
2094
2095.align 16
2096.Lcbc_exit:
2097 mov $_rsp,%rsi
2098 mov (%rsi),%r15
2099 mov 8(%rsi),%r14
2100 mov 16(%rsi),%r13
2101 mov 24(%rsi),%r12
2102 mov 32(%rsi),%rbp
2103 mov 40(%rsi),%rbx
2104 lea 48(%rsi),%rsp
2105.Lcbc_popfq:
2106 popfq
2107.Lcbc_epilogue:
2108 ret
2109.size AES_cbc_encrypt,.-AES_cbc_encrypt
2110___
2111}
2112
# Emit the .LAES_Te data block: 256 32-bit entries of the main AES
# encryption lookup table, 64-byte aligned.  The code above addresses
# further material at fixed offsets past these words (Te4 byte-table
# copies at +2048, rcon and packed mask constants at +2048+1024), so
# the layout and ordering here must not change.
2113$code.=<<___;
2114.align 64
2115.LAES_Te:
2116___
2117 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
2118 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
2119 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
2120 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
2121 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
2122 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
2123 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
2124 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
2125 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
2126 &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
2127 &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
2128 &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
2129 &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
2130 &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
2131 &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
2132 &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
2133 &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
2134 &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
2135 &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
2136 &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
2137 &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
2138 &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
2139 &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
2140 &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
2141 &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
2142 &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
2143 &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
2144 &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
2145 &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
2146 &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
2147 &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
2148 &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
2149 &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
2150 &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
2151 &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
2152 &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
2153 &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
2154 &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
2155 &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
2156 &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
2157 &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
2158 &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
2159 &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
2160 &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
2161 &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
2162 &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
2163 &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
2164 &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
2165 &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
2166 &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
2167 &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
2168 &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
2169 &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
2170 &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
2171 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
2172 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
2173 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
2174 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
2175 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
2176 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
2177 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
2178 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
2179 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
2180 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
2181
2182#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
2183 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2184 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2185 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2186 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2187 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2188 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2189 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2190 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2191 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2192 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2193 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2194 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2195 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2196 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2197 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2198 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2199 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2200 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2201 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2202 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2203 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2204 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2205 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2206 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2207 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2208 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2209 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2210 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2211 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2212 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2213 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2214 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2215
2216 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2217 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2218 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2219 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2220 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2221 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2222 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2223 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2224 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2225 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2226 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2227 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2228 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2229 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2230 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2231 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2232 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2233 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2234 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2235 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2236 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2237 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2238 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2239 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2240 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2241 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2242 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2243 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2244 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2245 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2246 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2247 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2248
2249 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2250 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2251 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2252 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2253 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2254 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2255 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2256 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2257 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2258 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2259 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2260 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2261 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2262 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2263 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2264 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2265 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2266 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2267 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2268 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2269 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2270 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2271 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2272 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2273 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2274 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2275 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2276 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2277 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2278 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2279 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2280 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2281
2282 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2283 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2284 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2285 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2286 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2287 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2288 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2289 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2290 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2291 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2292 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2293 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2294 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2295 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2296 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2297 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2298 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2299 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2300 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2301 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2302 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2303 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2304 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2305 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2306 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2307 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2308 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2309 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2310 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2311 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2312 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2313 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2314#rcon:
2315$code.=<<___;
2316 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008
2317 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080
2318 .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
2319 .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
2320___
2321$code.=<<___;
2322.align 64
2323.LAES_Td:
2324___
2325 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
2326 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
2327 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
2328 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
2329 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
2330 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
2331 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
2332 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
2333 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
2334 &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
2335 &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
2336 &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
2337 &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
2338 &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
2339 &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
2340 &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
2341 &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
2342 &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
2343 &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
2344 &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
2345 &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
2346 &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
2347 &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
2348 &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
2349 &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
2350 &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
2351 &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
2352 &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
2353 &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
2354 &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
2355 &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
2356 &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
2357 &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
2358 &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
2359 &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
2360 &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
2361 &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
2362 &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
2363 &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
2364 &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
2365 &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
2366 &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
2367 &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
2368 &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
2369 &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
2370 &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
2371 &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
2372 &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
2373 &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
2374 &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
2375 &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
2376 &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
2377 &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
2378 &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
2379 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
2380 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
2381 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
2382 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
2383 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
2384 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
2385 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
2386 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
2387 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
2388 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
2389
2390#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
2391 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2392 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2393 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2394 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2395 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2396 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2397 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2398 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2399 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2400 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2401 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2402 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2403 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2404 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2405 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2406 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2407 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2408 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2409 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2410 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2411 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2412 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2413 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2414 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2415 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2416 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2417 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2418 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2419 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2420 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2421 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2422 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2423$code.=<<___;
2424 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2425 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2426___
2427 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2428 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2429 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2430 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2431 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2432 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2433 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2434 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2435 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2436 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2437 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2438 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2439 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2440 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2441 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2442 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2443 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2444 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2445 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2446 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2447 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2448 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2449 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2450 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2451 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2452 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2453 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2454 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2455 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2456 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2457 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2458 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2459$code.=<<___;
2460 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2461 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2462___
2463 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2464 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2465 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2466 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2467 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2468 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2469 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2470 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2471 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2472 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2473 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2474 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2475 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2476 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2477 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2478 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2479 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2480 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2481 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2482 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2483 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2484 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2485 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2486 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2487 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2488 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2489 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2490 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2491 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2492 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2493 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2494 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2495$code.=<<___;
2496 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2497 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2498___
2499 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2500 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2501 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2502 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2503 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2504 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2505 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2506 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2507 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2508 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2509 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2510 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2511 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2512 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2513 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2514 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2515 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2516 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2517 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2518 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2519 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2520 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2521 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2522 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2523 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2524 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2525 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2526 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2527 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2528 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2529 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2530 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2531$code.=<<___;
2532 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2533 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2534.asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2535.align 64
2536___
2537
2538# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2539# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2540if ($win64) {
2541$rec="%rcx";
2542$frame="%rdx";
2543$context="%r8";
2544$disp="%r9";
2545
2546$code.=<<___;
2547.extern __imp_RtlVirtualUnwind
2548.type block_se_handler,\@abi-omnipotent
2549.align 16
2550block_se_handler:
2551 push %rsi
2552 push %rdi
2553 push %rbx
2554 push %rbp
2555 push %r12
2556 push %r13
2557 push %r14
2558 push %r15
2559 pushfq
2560 sub \$64,%rsp
2561
2562 mov 120($context),%rax # pull context->Rax
2563 mov 248($context),%rbx # pull context->Rip
2564
2565 mov 8($disp),%rsi # disp->ImageBase
2566 mov 56($disp),%r11 # disp->HandlerData
2567
2568 mov 0(%r11),%r10d # HandlerData[0]
2569 lea (%rsi,%r10),%r10 # prologue label
2570 cmp %r10,%rbx # context->Rip<prologue label
2571 jb .Lin_block_prologue
2572
2573 mov 152($context),%rax # pull context->Rsp
2574
2575 mov 4(%r11),%r10d # HandlerData[1]
2576 lea (%rsi,%r10),%r10 # epilogue label
2577 cmp %r10,%rbx # context->Rip>=epilogue label
2578 jae .Lin_block_prologue
2579
2580 mov 24(%rax),%rax # pull saved real stack pointer
2581 lea 48(%rax),%rax # adjust...
2582
2583 mov -8(%rax),%rbx
2584 mov -16(%rax),%rbp
2585 mov -24(%rax),%r12
2586 mov -32(%rax),%r13
2587 mov -40(%rax),%r14
2588 mov -48(%rax),%r15
2589 mov %rbx,144($context) # restore context->Rbx
2590 mov %rbp,160($context) # restore context->Rbp
2591 mov %r12,216($context) # restore context->R12
2592 mov %r13,224($context) # restore context->R13
2593 mov %r14,232($context) # restore context->R14
2594 mov %r15,240($context) # restore context->R15
2595
2596.Lin_block_prologue:
2597 mov 8(%rax),%rdi
2598 mov 16(%rax),%rsi
2599 mov %rax,152($context) # restore context->Rsp
2600 mov %rsi,168($context) # restore context->Rsi
2601 mov %rdi,176($context) # restore context->Rdi
2602
2603 jmp .Lcommon_seh_exit
2604.size block_se_handler,.-block_se_handler
2605
2606.type key_se_handler,\@abi-omnipotent
2607.align 16
2608key_se_handler:
2609 push %rsi
2610 push %rdi
2611 push %rbx
2612 push %rbp
2613 push %r12
2614 push %r13
2615 push %r14
2616 push %r15
2617 pushfq
2618 sub \$64,%rsp
2619
2620 mov 120($context),%rax # pull context->Rax
2621 mov 248($context),%rbx # pull context->Rip
2622
2623 mov 8($disp),%rsi # disp->ImageBase
2624 mov 56($disp),%r11 # disp->HandlerData
2625
2626 mov 0(%r11),%r10d # HandlerData[0]
2627 lea (%rsi,%r10),%r10 # prologue label
2628 cmp %r10,%rbx # context->Rip<prologue label
2629 jb .Lin_key_prologue
2630
2631 mov 152($context),%rax # pull context->Rsp
2632
2633 mov 4(%r11),%r10d # HandlerData[1]
2634 lea (%rsi,%r10),%r10 # epilogue label
2635 cmp %r10,%rbx # context->Rip>=epilogue label
2636 jae .Lin_key_prologue
2637
2638 lea 56(%rax),%rax
2639
2640 mov -8(%rax),%rbx
2641 mov -16(%rax),%rbp
2642 mov -24(%rax),%r12
2643 mov -32(%rax),%r13
2644 mov -40(%rax),%r14
2645 mov -48(%rax),%r15
2646 mov %rbx,144($context) # restore context->Rbx
2647 mov %rbp,160($context) # restore context->Rbp
2648 mov %r12,216($context) # restore context->R12
2649 mov %r13,224($context) # restore context->R13
2650 mov %r14,232($context) # restore context->R14
2651 mov %r15,240($context) # restore context->R15
2652
2653.Lin_key_prologue:
2654 mov 8(%rax),%rdi
2655 mov 16(%rax),%rsi
2656 mov %rax,152($context) # restore context->Rsp
2657 mov %rsi,168($context) # restore context->Rsi
2658 mov %rdi,176($context) # restore context->Rdi
2659
2660 jmp .Lcommon_seh_exit
2661.size key_se_handler,.-key_se_handler
2662
2663.type cbc_se_handler,\@abi-omnipotent
2664.align 16
2665cbc_se_handler:
2666 push %rsi
2667 push %rdi
2668 push %rbx
2669 push %rbp
2670 push %r12
2671 push %r13
2672 push %r14
2673 push %r15
2674 pushfq
2675 sub \$64,%rsp
2676
2677 mov 120($context),%rax # pull context->Rax
2678 mov 248($context),%rbx # pull context->Rip
2679
2680 lea .Lcbc_prologue(%rip),%r10
2681 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
2682 jb .Lin_cbc_prologue
2683
2684 lea .Lcbc_fast_body(%rip),%r10
2685 cmp %r10,%rbx # context->Rip<.Lcbc_fast_body
2686 jb .Lin_cbc_frame_setup
2687
2688 lea .Lcbc_slow_prologue(%rip),%r10
2689 cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue
2690 jb .Lin_cbc_body
2691
2692 lea .Lcbc_slow_body(%rip),%r10
2693 cmp %r10,%rbx # context->Rip<.Lcbc_slow_body
2694 jb .Lin_cbc_frame_setup
2695
2696.Lin_cbc_body:
2697 mov 152($context),%rax # pull context->Rsp
2698
2699 lea .Lcbc_epilogue(%rip),%r10
2700 cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue
2701 jae .Lin_cbc_prologue
2702
2703 lea 8(%rax),%rax
2704
2705 lea .Lcbc_popfq(%rip),%r10
2706 cmp %r10,%rbx # context->Rip>=.Lcbc_popfq
2707 jae .Lin_cbc_prologue
2708
2709 mov `16-8`(%rax),%rax # biased $_rsp
2710 lea 56(%rax),%rax
2711
2712.Lin_cbc_frame_setup:
2713 mov -16(%rax),%rbx
2714 mov -24(%rax),%rbp
2715 mov -32(%rax),%r12
2716 mov -40(%rax),%r13
2717 mov -48(%rax),%r14
2718 mov -56(%rax),%r15
2719 mov %rbx,144($context) # restore context->Rbx
2720 mov %rbp,160($context) # restore context->Rbp
2721 mov %r12,216($context) # restore context->R12
2722 mov %r13,224($context) # restore context->R13
2723 mov %r14,232($context) # restore context->R14
2724 mov %r15,240($context) # restore context->R15
2725
2726.Lin_cbc_prologue:
2727 mov 8(%rax),%rdi
2728 mov 16(%rax),%rsi
2729 mov %rax,152($context) # restore context->Rsp
2730 mov %rsi,168($context) # restore context->Rsi
2731 mov %rdi,176($context) # restore context->Rdi
2732
2733.Lcommon_seh_exit:
2734
2735 mov 40($disp),%rdi # disp->ContextRecord
2736 mov $context,%rsi # context
2737 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2738 .long 0xa548f3fc # cld; rep movsq
2739
2740 mov $disp,%rsi
2741 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2742 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2743 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2744 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2745 mov 40(%rsi),%r10 # disp->ContextRecord
2746 lea 56(%rsi),%r11 # &disp->HandlerData
2747 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2748 mov %r10,32(%rsp) # arg5
2749 mov %r11,40(%rsp) # arg6
2750 mov %r12,48(%rsp) # arg7
2751 mov %rcx,56(%rsp) # arg8, (NULL)
2752 call *__imp_RtlVirtualUnwind(%rip)
2753
2754 mov \$1,%eax # ExceptionContinueSearch
2755 add \$64,%rsp
2756 popfq
2757 pop %r15
2758 pop %r14
2759 pop %r13
2760 pop %r12
2761 pop %rbp
2762 pop %rbx
2763 pop %rdi
2764 pop %rsi
2765 ret
2766.size cbc_se_handler,.-cbc_se_handler
2767
2768.section .pdata
2769.align 4
2770 .rva .LSEH_begin_AES_encrypt
2771 .rva .LSEH_end_AES_encrypt
2772 .rva .LSEH_info_AES_encrypt
2773
2774 .rva .LSEH_begin_AES_decrypt
2775 .rva .LSEH_end_AES_decrypt
2776 .rva .LSEH_info_AES_decrypt
2777
2778 .rva .LSEH_begin_private_AES_set_encrypt_key
2779 .rva .LSEH_end_private_AES_set_encrypt_key
2780 .rva .LSEH_info_private_AES_set_encrypt_key
2781
2782 .rva .LSEH_begin_private_AES_set_decrypt_key
2783 .rva .LSEH_end_private_AES_set_decrypt_key
2784 .rva .LSEH_info_private_AES_set_decrypt_key
2785
2786 .rva .LSEH_begin_AES_cbc_encrypt
2787 .rva .LSEH_end_AES_cbc_encrypt
2788 .rva .LSEH_info_AES_cbc_encrypt
2789
2790.section .xdata
2791.align 8
2792.LSEH_info_AES_encrypt:
2793 .byte 9,0,0,0
2794 .rva block_se_handler
2795 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
2796.LSEH_info_AES_decrypt:
2797 .byte 9,0,0,0
2798 .rva block_se_handler
2799 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
2800.LSEH_info_private_AES_set_encrypt_key:
2801 .byte 9,0,0,0
2802 .rva key_se_handler
2803 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
2804.LSEH_info_private_AES_set_decrypt_key:
2805 .byte 9,0,0,0
2806 .rva key_se_handler
2807 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
2808.LSEH_info_AES_cbc_encrypt:
2809 .byte 9,0,0,0
2810 .rva cbc_se_handler
2811___
2812}
2813
2814$code =~ s/\`([^\`]*)\`/eval($1)/gem;
2815
2816print $code;
2817
2818close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl
deleted file mode 100644
index c6f6b3334a..0000000000
--- a/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl
+++ /dev/null
@@ -1,1249 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# June 2011
11#
12# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
13# in http://download.intel.com/design/intarch/papers/323686.pdf, is
14# that since AESNI-CBC encrypt exhibit *very* low instruction-level
15# parallelism, interleaving it with another algorithm would allow to
16# utilize processor resources better and achieve better performance.
17# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
18# AESNI code is weaved into it. Below are performance numbers in
19# cycles per processed byte, less is better, for standalone AESNI-CBC
20# encrypt, sum of the latter and standalone SHA1, and "stitched"
21# subroutine:
22#
23# AES-128-CBC +SHA1 stitch gain
24# Westmere 3.77[+5.6] 9.37 6.65 +41%
25# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
26#
27# AES-192-CBC
28# Westmere 4.51 10.11 6.97 +45%
29# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
30#
31# AES-256-CBC
32# Westmere 5.25 10.85 7.25 +50%
33# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
34#
35# (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for
36# background information. Above numbers in parentheses are SSSE3
37# results collected on AVX-capable CPU, i.e. apply on OSes that
38# don't support AVX.
39#
40# Needless to mention that it makes no sense to implement "stitched"
41# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
42# fully utilize parallelism, so stitching would not give any gain
43# anyway. Well, there might be some, e.g. because of better cache
44# locality... For reference, here are performance results for
45# standalone AESNI-CBC decrypt:
46#
47# AES-128-CBC AES-192-CBC AES-256-CBC
48# Westmere 1.31 1.55 1.80
49# Sandy Bridge 0.93 1.06 1.22
50
51$flavour = shift;
52$output = shift;
53if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
54
55$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
56
57$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
58( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
59( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
60die "can't locate x86_64-xlate.pl";
61
62$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
63 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
64 $1>=2.19);
65$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
66 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
67 $1>=2.09);
68$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69 `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
70 $1>=10);
71
72open STDOUT,"| $^X $xlate $flavour $output";
73
74# void aesni_cbc_sha1_enc(const void *inp,
75# void *out,
76# size_t length,
77# const AES_KEY *key,
78# unsigned char *iv,
79# SHA_CTX *ctx,
80# const void *in0);
81
82$code.=<<___;
83.text
84.extern OPENSSL_ia32cap_P
85
86.globl aesni_cbc_sha1_enc
87.type aesni_cbc_sha1_enc,\@abi-omnipotent
88.align 16
89aesni_cbc_sha1_enc:
90 # caller should check for SSSE3 and AES-NI bits
91 mov OPENSSL_ia32cap_P+0(%rip),%r10d
92 mov OPENSSL_ia32cap_P+4(%rip),%r11d
93___
94$code.=<<___ if ($avx);
95 and \$`1<<28`,%r11d # mask AVX bit
96 and \$`1<<30`,%r10d # mask "Intel CPU" bit
97 or %r11d,%r10d
98 cmp \$`1<<28|1<<30`,%r10d
99 je aesni_cbc_sha1_enc_avx
100___
101$code.=<<___;
102 jmp aesni_cbc_sha1_enc_ssse3
103 ret
104.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
105___
106
107my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
108
109my $Xi=4;
110my @X=map("%xmm$_",(4..7,0..3));
111my @Tx=map("%xmm$_",(8..10));
112my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
113my @T=("%esi","%edi");
114my $j=0; my $jj=0; my $r=0; my $sn=0;
115my $K_XX_XX="%r11";
116my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
117my @rndkey=("%xmm14","%xmm15");
118
119sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
120{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
121 my $arg = pop;
122 $arg = "\$$arg" if ($arg*1 eq $arg);
123 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
124}
125
126my $_rol=sub { &rol(@_) };
127my $_ror=sub { &ror(@_) };
128
129$code.=<<___;
130.type aesni_cbc_sha1_enc_ssse3,\@function,6
131.align 16
132aesni_cbc_sha1_enc_ssse3:
133 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
134 #shr \$6,$len # debugging artefact
135 #jz .Lepilogue_ssse3 # debugging artefact
136 push %rbx
137 push %rbp
138 push %r12
139 push %r13
140 push %r14
141 push %r15
142 lea `-104-($win64?10*16:0)`(%rsp),%rsp
143 #mov $in0,$inp # debugging artefact
144 #lea 64(%rsp),$ctx # debugging artefact
145___
146$code.=<<___ if ($win64);
147 movaps %xmm6,96+0(%rsp)
148 movaps %xmm7,96+16(%rsp)
149 movaps %xmm8,96+32(%rsp)
150 movaps %xmm9,96+48(%rsp)
151 movaps %xmm10,96+64(%rsp)
152 movaps %xmm11,96+80(%rsp)
153 movaps %xmm12,96+96(%rsp)
154 movaps %xmm13,96+112(%rsp)
155 movaps %xmm14,96+128(%rsp)
156 movaps %xmm15,96+144(%rsp)
157.Lprologue_ssse3:
158___
159$code.=<<___;
160 mov $in0,%r12 # reassign arguments
161 mov $out,%r13
162 mov $len,%r14
163 mov $key,%r15
164 movdqu ($ivp),$iv # load IV
165 mov $ivp,88(%rsp) # save $ivp
166___
167my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
168my $rounds="${ivp}d";
169$code.=<<___;
170 shl \$6,$len
171 sub $in0,$out
172 mov 240($key),$rounds
173 add $inp,$len # end of input
174
175 lea K_XX_XX(%rip),$K_XX_XX
176 mov 0($ctx),$A # load context
177 mov 4($ctx),$B
178 mov 8($ctx),$C
179 mov 12($ctx),$D
180 mov $B,@T[0] # magic seed
181 mov 16($ctx),$E
182
183 movdqa 64($K_XX_XX),@X[2] # pbswap mask
184 movdqa 0($K_XX_XX),@Tx[1] # K_00_19
185 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
186 movdqu 16($inp),@X[-3&7]
187 movdqu 32($inp),@X[-2&7]
188 movdqu 48($inp),@X[-1&7]
189 pshufb @X[2],@X[-4&7] # byte swap
190 add \$64,$inp
191 pshufb @X[2],@X[-3&7]
192 pshufb @X[2],@X[-2&7]
193 pshufb @X[2],@X[-1&7]
194 paddd @Tx[1],@X[-4&7] # add K_00_19
195 paddd @Tx[1],@X[-3&7]
196 paddd @Tx[1],@X[-2&7]
197 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
198 psubd @Tx[1],@X[-4&7] # restore X[]
199 movdqa @X[-3&7],16(%rsp)
200 psubd @Tx[1],@X[-3&7]
201 movdqa @X[-2&7],32(%rsp)
202 psubd @Tx[1],@X[-2&7]
203 movups ($key),$rndkey0 # $key[0]
204 movups 16($key),$rndkey[0] # forward reference
205 jmp .Loop_ssse3
206___
207
208my $aesenc=sub {
209 use integer;
210 my ($n,$k)=($r/10,$r%10);
211 if ($k==0) {
212 $code.=<<___;
213 movups `16*$n`($in0),$in # load input
214 xorps $rndkey0,$in
215___
216 $code.=<<___ if ($n);
217 movups $iv,`16*($n-1)`($out,$in0) # write output
218___
219 $code.=<<___;
220 xorps $in,$iv
221 aesenc $rndkey[0],$iv
222 movups `32+16*$k`($key),$rndkey[1]
223___
224 } elsif ($k==9) {
225 $sn++;
226 $code.=<<___;
227 cmp \$11,$rounds
228 jb .Laesenclast$sn
229 movups `32+16*($k+0)`($key),$rndkey[1]
230 aesenc $rndkey[0],$iv
231 movups `32+16*($k+1)`($key),$rndkey[0]
232 aesenc $rndkey[1],$iv
233 je .Laesenclast$sn
234 movups `32+16*($k+2)`($key),$rndkey[1]
235 aesenc $rndkey[0],$iv
236 movups `32+16*($k+3)`($key),$rndkey[0]
237 aesenc $rndkey[1],$iv
238.Laesenclast$sn:
239 aesenclast $rndkey[0],$iv
240 movups 16($key),$rndkey[1] # forward reference
241___
242 } else {
243 $code.=<<___;
244 aesenc $rndkey[0],$iv
245 movups `32+16*$k`($key),$rndkey[1]
246___
247 }
248 $r++; unshift(@rndkey,pop(@rndkey));
249};
250
251sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
252{ use integer;
253 my $body = shift;
254 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
255 my ($a,$b,$c,$d,$e);
256
257 &movdqa (@X[0],@X[-3&7]);
258 eval(shift(@insns));
259 eval(shift(@insns));
260 &movdqa (@Tx[0],@X[-1&7]);
261 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
262 eval(shift(@insns));
263 eval(shift(@insns));
264
265 &paddd (@Tx[1],@X[-1&7]);
266 eval(shift(@insns));
267 eval(shift(@insns));
268 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
269 eval(shift(@insns));
270 eval(shift(@insns));
271 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
272 eval(shift(@insns));
273 eval(shift(@insns));
274
275 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
276 eval(shift(@insns));
277 eval(shift(@insns));
278 eval(shift(@insns));
279 eval(shift(@insns));
280
281 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
282 eval(shift(@insns));
283 eval(shift(@insns));
284 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
285 eval(shift(@insns));
286 eval(shift(@insns));
287
288 &movdqa (@Tx[2],@X[0]);
289 &movdqa (@Tx[0],@X[0]);
290 eval(shift(@insns));
291 eval(shift(@insns));
292 eval(shift(@insns));
293 eval(shift(@insns));
294
295 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
296 &paddd (@X[0],@X[0]);
297 eval(shift(@insns));
298 eval(shift(@insns));
299 eval(shift(@insns));
300 eval(shift(@insns));
301
302 &psrld (@Tx[0],31);
303 eval(shift(@insns));
304 eval(shift(@insns));
305 &movdqa (@Tx[1],@Tx[2]);
306 eval(shift(@insns));
307 eval(shift(@insns));
308
309 &psrld (@Tx[2],30);
310 &por (@X[0],@Tx[0]); # "X[0]"<<<=1
311 eval(shift(@insns));
312 eval(shift(@insns));
313 eval(shift(@insns));
314 eval(shift(@insns));
315
316 &pslld (@Tx[1],2);
317 &pxor (@X[0],@Tx[2]);
318 eval(shift(@insns));
319 eval(shift(@insns));
320 &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
321 eval(shift(@insns));
322 eval(shift(@insns));
323
324 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
325
326 foreach (@insns) { eval; } # remaining instructions [if any]
327
328 $Xi++; push(@X,shift(@X)); # "rotate" X[]
329 push(@Tx,shift(@Tx));
330}
331
332sub Xupdate_ssse3_32_79()
333{ use integer;
334 my $body = shift;
335 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
336 my ($a,$b,$c,$d,$e);
337
338 &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
339 eval(shift(@insns)); # body_20_39
340 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
341 &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
342 eval(shift(@insns));
343 eval(shift(@insns));
344 eval(shift(@insns)); # rol
345
346 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
347 eval(shift(@insns));
348 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
349 if ($Xi%5) {
350 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
351 } else { # ... or load next one
352 &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
353 }
354 &paddd (@Tx[1],@X[-1&7]);
355 eval(shift(@insns)); # ror
356 eval(shift(@insns));
357
358 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
359 eval(shift(@insns)); # body_20_39
360 eval(shift(@insns));
361 eval(shift(@insns));
362 eval(shift(@insns)); # rol
363
364 &movdqa (@Tx[0],@X[0]);
365 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
366 eval(shift(@insns));
367 eval(shift(@insns));
368 eval(shift(@insns)); # ror
369 eval(shift(@insns));
370
371 &pslld (@X[0],2);
372 eval(shift(@insns)); # body_20_39
373 eval(shift(@insns));
374 &psrld (@Tx[0],30);
375 eval(shift(@insns));
376 eval(shift(@insns)); # rol
377 eval(shift(@insns));
378 eval(shift(@insns));
379 eval(shift(@insns)); # ror
380 eval(shift(@insns));
381
382 &por (@X[0],@Tx[0]); # "X[0]"<<<=2
383 eval(shift(@insns)); # body_20_39
384 eval(shift(@insns));
385 &movdqa (@Tx[1],@X[0]) if ($Xi<19);
386 eval(shift(@insns));
387 eval(shift(@insns)); # rol
388 eval(shift(@insns));
389 eval(shift(@insns));
390 eval(shift(@insns)); # rol
391 eval(shift(@insns));
392
393 foreach (@insns) { eval; } # remaining instructions
394
395 $Xi++; push(@X,shift(@X)); # "rotate" X[]
396 push(@Tx,shift(@Tx));
397}
398
399sub Xuplast_ssse3_80()
400{ use integer;
401 my $body = shift;
402 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
403 my ($a,$b,$c,$d,$e);
404
405 eval(shift(@insns));
406 &paddd (@Tx[1],@X[-1&7]);
407 eval(shift(@insns));
408 eval(shift(@insns));
409 eval(shift(@insns));
410 eval(shift(@insns));
411
412 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
413
414 foreach (@insns) { eval; } # remaining instructions
415
416 &cmp ($inp,$len);
417 &je (".Ldone_ssse3");
418
419 unshift(@Tx,pop(@Tx));
420
421 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
422 &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
423 &movdqu (@X[-4&7],"0($inp)"); # load input
424 &movdqu (@X[-3&7],"16($inp)");
425 &movdqu (@X[-2&7],"32($inp)");
426 &movdqu (@X[-1&7],"48($inp)");
427 &pshufb (@X[-4&7],@X[2]); # byte swap
428 &add ($inp,64);
429
430 $Xi=0;
431}
432
433sub Xloop_ssse3()
434{ use integer;
435 my $body = shift;
436 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
437 my ($a,$b,$c,$d,$e);
438
439 eval(shift(@insns));
440 eval(shift(@insns));
441 &pshufb (@X[($Xi-3)&7],@X[2]);
442 eval(shift(@insns));
443 eval(shift(@insns));
444 &paddd (@X[($Xi-4)&7],@Tx[1]);
445 eval(shift(@insns));
446 eval(shift(@insns));
447 eval(shift(@insns));
448 eval(shift(@insns));
449 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
450 eval(shift(@insns));
451 eval(shift(@insns));
452 &psubd (@X[($Xi-4)&7],@Tx[1]);
453
454 foreach (@insns) { eval; }
455 $Xi++;
456}
457
458sub Xtail_ssse3()
459{ use integer;
460 my $body = shift;
461 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
462 my ($a,$b,$c,$d,$e);
463
464 foreach (@insns) { eval; }
465}
466
467sub body_00_19 () {
468 use integer;
469 my ($k,$n);
470 my @r=(
471 '($a,$b,$c,$d,$e)=@V;'.
472 '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
473 '&xor ($c,$d);',
474 '&mov (@T[1],$a);', # $b in next round
475 '&$_rol ($a,5);',
476 '&and (@T[0],$c);', # ($b&($c^$d))
477 '&xor ($c,$d);', # restore $c
478 '&xor (@T[0],$d);',
479 '&add ($e,$a);',
480 '&$_ror ($b,$j?7:2);', # $b>>>2
481 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
482 );
483 $n = scalar(@r);
484 $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
485 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
486 $jj++;
487 return @r;
488}
489
490sub body_20_39 () {
491 use integer;
492 my ($k,$n);
493 my @r=(
494 '($a,$b,$c,$d,$e)=@V;'.
495 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
496 '&xor (@T[0],$d);', # ($b^$d)
497 '&mov (@T[1],$a);', # $b in next round
498 '&$_rol ($a,5);',
499 '&xor (@T[0],$c);', # ($b^$d^$c)
500 '&add ($e,$a);',
501 '&$_ror ($b,7);', # $b>>>2
502 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
503 );
504 $n = scalar(@r);
505 $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
506 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
507 $jj++;
508 return @r;
509}
510
511sub body_40_59 () {
512 use integer;
513 my ($k,$n);
514 my @r=(
515 '($a,$b,$c,$d,$e)=@V;'.
516 '&mov (@T[1],$c);',
517 '&xor ($c,$d);',
518 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
519 '&and (@T[1],$d);',
520 '&and (@T[0],$c);', # ($b&($c^$d))
521 '&$_ror ($b,7);', # $b>>>2
522 '&add ($e,@T[1]);',
523 '&mov (@T[1],$a);', # $b in next round
524 '&$_rol ($a,5);',
525 '&add ($e,@T[0]);',
526 '&xor ($c,$d);', # restore $c
527 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
528 );
529 $n = scalar(@r);
530 $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
531 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
532 $jj++;
533 return @r;
534}
535$code.=<<___;
536.align 16
537.Loop_ssse3:
538___
539 &Xupdate_ssse3_16_31(\&body_00_19);
540 &Xupdate_ssse3_16_31(\&body_00_19);
541 &Xupdate_ssse3_16_31(\&body_00_19);
542 &Xupdate_ssse3_16_31(\&body_00_19);
543 &Xupdate_ssse3_32_79(\&body_00_19);
544 &Xupdate_ssse3_32_79(\&body_20_39);
545 &Xupdate_ssse3_32_79(\&body_20_39);
546 &Xupdate_ssse3_32_79(\&body_20_39);
547 &Xupdate_ssse3_32_79(\&body_20_39);
548 &Xupdate_ssse3_32_79(\&body_20_39);
549 &Xupdate_ssse3_32_79(\&body_40_59);
550 &Xupdate_ssse3_32_79(\&body_40_59);
551 &Xupdate_ssse3_32_79(\&body_40_59);
552 &Xupdate_ssse3_32_79(\&body_40_59);
553 &Xupdate_ssse3_32_79(\&body_40_59);
554 &Xupdate_ssse3_32_79(\&body_20_39);
555 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
556
557 $saved_j=$j; @saved_V=@V;
558 $saved_r=$r; @saved_rndkey=@rndkey;
559
560 &Xloop_ssse3(\&body_20_39);
561 &Xloop_ssse3(\&body_20_39);
562 &Xloop_ssse3(\&body_20_39);
563
564$code.=<<___;
565 movups $iv,48($out,$in0) # write output
566 lea 64($in0),$in0
567
568 add 0($ctx),$A # update context
569 add 4($ctx),@T[0]
570 add 8($ctx),$C
571 add 12($ctx),$D
572 mov $A,0($ctx)
573 add 16($ctx),$E
574 mov @T[0],4($ctx)
575 mov @T[0],$B # magic seed
576 mov $C,8($ctx)
577 mov $D,12($ctx)
578 mov $E,16($ctx)
579 jmp .Loop_ssse3
580
581.align 16
582.Ldone_ssse3:
583___
584 $jj=$j=$saved_j; @V=@saved_V;
585 $r=$saved_r; @rndkey=@saved_rndkey;
586
587 &Xtail_ssse3(\&body_20_39);
588 &Xtail_ssse3(\&body_20_39);
589 &Xtail_ssse3(\&body_20_39);
590
591$code.=<<___;
592 movups $iv,48($out,$in0) # write output
593 mov 88(%rsp),$ivp # restore $ivp
594
595 add 0($ctx),$A # update context
596 add 4($ctx),@T[0]
597 add 8($ctx),$C
598 mov $A,0($ctx)
599 add 12($ctx),$D
600 mov @T[0],4($ctx)
601 add 16($ctx),$E
602 mov $C,8($ctx)
603 mov $D,12($ctx)
604 mov $E,16($ctx)
605 movups $iv,($ivp) # write IV
606___
607$code.=<<___ if ($win64);
608 movaps 96+0(%rsp),%xmm6
609 movaps 96+16(%rsp),%xmm7
610 movaps 96+32(%rsp),%xmm8
611 movaps 96+48(%rsp),%xmm9
612 movaps 96+64(%rsp),%xmm10
613 movaps 96+80(%rsp),%xmm11
614 movaps 96+96(%rsp),%xmm12
615 movaps 96+112(%rsp),%xmm13
616 movaps 96+128(%rsp),%xmm14
617 movaps 96+144(%rsp),%xmm15
618___
619$code.=<<___;
620 lea `104+($win64?10*16:0)`(%rsp),%rsi
621 mov 0(%rsi),%r15
622 mov 8(%rsi),%r14
623 mov 16(%rsi),%r13
624 mov 24(%rsi),%r12
625 mov 32(%rsi),%rbp
626 mov 40(%rsi),%rbx
627 lea 48(%rsi),%rsp
628.Lepilogue_ssse3:
629 ret
630.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
631___
632
633$j=$jj=$r=$sn=0;
634
635if ($avx) {
636my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
637
638my $Xi=4;
639my @X=map("%xmm$_",(4..7,0..3));
640my @Tx=map("%xmm$_",(8..10));
641my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
642my @T=("%esi","%edi");
643
644my $_rol=sub { &shld(@_[0],@_) };
645my $_ror=sub { &shrd(@_[0],@_) };
646
647$code.=<<___;
648.type aesni_cbc_sha1_enc_avx,\@function,6
649.align 16
650aesni_cbc_sha1_enc_avx:
651 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
652 #shr \$6,$len # debugging artefact
653 #jz .Lepilogue_avx # debugging artefact
654 push %rbx
655 push %rbp
656 push %r12
657 push %r13
658 push %r14
659 push %r15
660 lea `-104-($win64?10*16:0)`(%rsp),%rsp
661 #mov $in0,$inp # debugging artefact
662 #lea 64(%rsp),$ctx # debugging artefact
663___
664$code.=<<___ if ($win64);
665 movaps %xmm6,96+0(%rsp)
666 movaps %xmm7,96+16(%rsp)
667 movaps %xmm8,96+32(%rsp)
668 movaps %xmm9,96+48(%rsp)
669 movaps %xmm10,96+64(%rsp)
670 movaps %xmm11,96+80(%rsp)
671 movaps %xmm12,96+96(%rsp)
672 movaps %xmm13,96+112(%rsp)
673 movaps %xmm14,96+128(%rsp)
674 movaps %xmm15,96+144(%rsp)
675.Lprologue_avx:
676___
677$code.=<<___;
678 vzeroall
679 mov $in0,%r12 # reassign arguments
680 mov $out,%r13
681 mov $len,%r14
682 mov $key,%r15
683 vmovdqu ($ivp),$iv # load IV
684 mov $ivp,88(%rsp) # save $ivp
685___
686my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
687my $rounds="${ivp}d";
688$code.=<<___;
689 shl \$6,$len
690 sub $in0,$out
691 mov 240($key),$rounds
692 add \$112,$key # size optimization
693 add $inp,$len # end of input
694
695 lea K_XX_XX(%rip),$K_XX_XX
696 mov 0($ctx),$A # load context
697 mov 4($ctx),$B
698 mov 8($ctx),$C
699 mov 12($ctx),$D
700 mov $B,@T[0] # magic seed
701 mov 16($ctx),$E
702
703 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
704 vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
705 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
706 vmovdqu 16($inp),@X[-3&7]
707 vmovdqu 32($inp),@X[-2&7]
708 vmovdqu 48($inp),@X[-1&7]
709 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
710 add \$64,$inp
711 vpshufb @X[2],@X[-3&7],@X[-3&7]
712 vpshufb @X[2],@X[-2&7],@X[-2&7]
713 vpshufb @X[2],@X[-1&7],@X[-1&7]
714 vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
715 vpaddd @Tx[1],@X[-3&7],@X[1]
716 vpaddd @Tx[1],@X[-2&7],@X[2]
717 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
718 vmovdqa @X[1],16(%rsp)
719 vmovdqa @X[2],32(%rsp)
720 vmovups -112($key),$rndkey0 # $key[0]
721 vmovups 16-112($key),$rndkey[0] # forward reference
722 jmp .Loop_avx
723___
724
725my $aesenc=sub {
726 use integer;
727 my ($n,$k)=($r/10,$r%10);
728 if ($k==0) {
729 $code.=<<___;
730 vmovups `16*$n`($in0),$in # load input
731 vxorps $rndkey0,$in,$in
732___
733 $code.=<<___ if ($n);
734 vmovups $iv,`16*($n-1)`($out,$in0) # write output
735___
736 $code.=<<___;
737 vxorps $in,$iv,$iv
738 vaesenc $rndkey[0],$iv,$iv
739 vmovups `32+16*$k-112`($key),$rndkey[1]
740___
741 } elsif ($k==9) {
742 $sn++;
743 $code.=<<___;
744 cmp \$11,$rounds
745 jb .Lvaesenclast$sn
746 vaesenc $rndkey[0],$iv,$iv
747 vmovups `32+16*($k+0)-112`($key),$rndkey[1]
748 vaesenc $rndkey[1],$iv,$iv
749 vmovups `32+16*($k+1)-112`($key),$rndkey[0]
750 je .Lvaesenclast$sn
751 vaesenc $rndkey[0],$iv,$iv
752 vmovups `32+16*($k+2)-112`($key),$rndkey[1]
753 vaesenc $rndkey[1],$iv,$iv
754 vmovups `32+16*($k+3)-112`($key),$rndkey[0]
755.Lvaesenclast$sn:
756 vaesenclast $rndkey[0],$iv,$iv
757 vmovups 16-112($key),$rndkey[1] # forward reference
758___
759 } else {
760 $code.=<<___;
761 vaesenc $rndkey[0],$iv,$iv
762 vmovups `32+16*$k-112`($key),$rndkey[1]
763___
764 }
765 $r++; unshift(@rndkey,pop(@rndkey));
766};
767
768sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
769{ use integer;
770 my $body = shift;
771 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
772 my ($a,$b,$c,$d,$e);
773
774 eval(shift(@insns));
775 eval(shift(@insns));
776 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
777 eval(shift(@insns));
778 eval(shift(@insns));
779
780 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
781 eval(shift(@insns));
782 eval(shift(@insns));
783 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
784 eval(shift(@insns));
785 eval(shift(@insns));
786 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
787 eval(shift(@insns));
788 eval(shift(@insns));
789
790 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
791 eval(shift(@insns));
792 eval(shift(@insns));
793 eval(shift(@insns));
794 eval(shift(@insns));
795
796 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
797 eval(shift(@insns));
798 eval(shift(@insns));
799 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
800 eval(shift(@insns));
801 eval(shift(@insns));
802
803 &vpsrld (@Tx[0],@X[0],31);
804 eval(shift(@insns));
805 eval(shift(@insns));
806 eval(shift(@insns));
807 eval(shift(@insns));
808
809 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
810 &vpaddd (@X[0],@X[0],@X[0]);
811 eval(shift(@insns));
812 eval(shift(@insns));
813 eval(shift(@insns));
814 eval(shift(@insns));
815
816 &vpsrld (@Tx[1],@Tx[2],30);
817 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
818 eval(shift(@insns));
819 eval(shift(@insns));
820 eval(shift(@insns));
821 eval(shift(@insns));
822
823 &vpslld (@Tx[2],@Tx[2],2);
824 &vpxor (@X[0],@X[0],@Tx[1]);
825 eval(shift(@insns));
826 eval(shift(@insns));
827 eval(shift(@insns));
828 eval(shift(@insns));
829
830 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
831 eval(shift(@insns));
832 eval(shift(@insns));
833 &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
834 eval(shift(@insns));
835 eval(shift(@insns));
836
837
838 foreach (@insns) { eval; } # remaining instructions [if any]
839
840 $Xi++; push(@X,shift(@X)); # "rotate" X[]
841 push(@Tx,shift(@Tx));
842}
843
844sub Xupdate_avx_32_79()
845{ use integer;
846 my $body = shift;
847 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
848 my ($a,$b,$c,$d,$e);
849
850 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
851 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
852 eval(shift(@insns)); # body_20_39
853 eval(shift(@insns));
854 eval(shift(@insns));
855 eval(shift(@insns)); # rol
856
857 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
858 eval(shift(@insns));
859 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
860 if ($Xi%5) {
861 &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
862 } else { # ... or load next one
863 &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
864 }
865 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
866 eval(shift(@insns)); # ror
867 eval(shift(@insns));
868
869 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
870 eval(shift(@insns)); # body_20_39
871 eval(shift(@insns));
872 eval(shift(@insns));
873 eval(shift(@insns)); # rol
874
875 &vpsrld (@Tx[0],@X[0],30);
876 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
877 eval(shift(@insns));
878 eval(shift(@insns));
879 eval(shift(@insns)); # ror
880 eval(shift(@insns));
881
882 &vpslld (@X[0],@X[0],2);
883 eval(shift(@insns)); # body_20_39
884 eval(shift(@insns));
885 eval(shift(@insns));
886 eval(shift(@insns)); # rol
887 eval(shift(@insns));
888 eval(shift(@insns));
889 eval(shift(@insns)); # ror
890 eval(shift(@insns));
891
892 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
893 eval(shift(@insns)); # body_20_39
894 eval(shift(@insns));
895 &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
896 eval(shift(@insns));
897 eval(shift(@insns)); # rol
898 eval(shift(@insns));
899 eval(shift(@insns));
900 eval(shift(@insns)); # rol
901 eval(shift(@insns));
902
903 foreach (@insns) { eval; } # remaining instructions
904
905 $Xi++; push(@X,shift(@X)); # "rotate" X[]
906 push(@Tx,shift(@Tx));
907}
908
909sub Xuplast_avx_80()
910{ use integer;
911 my $body = shift;
912 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
913 my ($a,$b,$c,$d,$e);
914
915 eval(shift(@insns));
916 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
917 eval(shift(@insns));
918 eval(shift(@insns));
919 eval(shift(@insns));
920 eval(shift(@insns));
921
922 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
923
924 foreach (@insns) { eval; } # remaining instructions
925
926 &cmp ($inp,$len);
927 &je (".Ldone_avx");
928
929 unshift(@Tx,pop(@Tx));
930
931 &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
932 &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
933 &vmovdqu(@X[-4&7],"0($inp)"); # load input
934 &vmovdqu(@X[-3&7],"16($inp)");
935 &vmovdqu(@X[-2&7],"32($inp)");
936 &vmovdqu(@X[-1&7],"48($inp)");
937 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
938 &add ($inp,64);
939
940 $Xi=0;
941}
942
943sub Xloop_avx()
944{ use integer;
945 my $body = shift;
946 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
947 my ($a,$b,$c,$d,$e);
948
949 eval(shift(@insns));
950 eval(shift(@insns));
951 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
952 eval(shift(@insns));
953 eval(shift(@insns));
954 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
955 eval(shift(@insns));
956 eval(shift(@insns));
957 eval(shift(@insns));
958 eval(shift(@insns));
959 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
960 eval(shift(@insns));
961 eval(shift(@insns));
962
963 foreach (@insns) { eval; }
964 $Xi++;
965}
966
967sub Xtail_avx()
968{ use integer;
969 my $body = shift;
970 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
971 my ($a,$b,$c,$d,$e);
972
973 foreach (@insns) { eval; }
974}
975
976$code.=<<___;
977.align 16
978.Loop_avx:
979___
980 &Xupdate_avx_16_31(\&body_00_19);
981 &Xupdate_avx_16_31(\&body_00_19);
982 &Xupdate_avx_16_31(\&body_00_19);
983 &Xupdate_avx_16_31(\&body_00_19);
984 &Xupdate_avx_32_79(\&body_00_19);
985 &Xupdate_avx_32_79(\&body_20_39);
986 &Xupdate_avx_32_79(\&body_20_39);
987 &Xupdate_avx_32_79(\&body_20_39);
988 &Xupdate_avx_32_79(\&body_20_39);
989 &Xupdate_avx_32_79(\&body_20_39);
990 &Xupdate_avx_32_79(\&body_40_59);
991 &Xupdate_avx_32_79(\&body_40_59);
992 &Xupdate_avx_32_79(\&body_40_59);
993 &Xupdate_avx_32_79(\&body_40_59);
994 &Xupdate_avx_32_79(\&body_40_59);
995 &Xupdate_avx_32_79(\&body_20_39);
996 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
997
998 $saved_j=$j; @saved_V=@V;
999 $saved_r=$r; @saved_rndkey=@rndkey;
1000
1001 &Xloop_avx(\&body_20_39);
1002 &Xloop_avx(\&body_20_39);
1003 &Xloop_avx(\&body_20_39);
1004
1005$code.=<<___;
1006 vmovups $iv,48($out,$in0) # write output
1007 lea 64($in0),$in0
1008
1009 add 0($ctx),$A # update context
1010 add 4($ctx),@T[0]
1011 add 8($ctx),$C
1012 add 12($ctx),$D
1013 mov $A,0($ctx)
1014 add 16($ctx),$E
1015 mov @T[0],4($ctx)
1016 mov @T[0],$B # magic seed
1017 mov $C,8($ctx)
1018 mov $D,12($ctx)
1019 mov $E,16($ctx)
1020 jmp .Loop_avx
1021
1022.align 16
1023.Ldone_avx:
1024___
1025 $jj=$j=$saved_j; @V=@saved_V;
1026 $r=$saved_r; @rndkey=@saved_rndkey;
1027
1028 &Xtail_avx(\&body_20_39);
1029 &Xtail_avx(\&body_20_39);
1030 &Xtail_avx(\&body_20_39);
1031
1032$code.=<<___;
1033 vmovups $iv,48($out,$in0) # write output
1034 mov 88(%rsp),$ivp # restore $ivp
1035
1036 add 0($ctx),$A # update context
1037 add 4($ctx),@T[0]
1038 add 8($ctx),$C
1039 mov $A,0($ctx)
1040 add 12($ctx),$D
1041 mov @T[0],4($ctx)
1042 add 16($ctx),$E
1043 mov $C,8($ctx)
1044 mov $D,12($ctx)
1045 mov $E,16($ctx)
1046 vmovups $iv,($ivp) # write IV
1047 vzeroall
1048___
1049$code.=<<___ if ($win64);
1050 movaps 96+0(%rsp),%xmm6
1051 movaps 96+16(%rsp),%xmm7
1052 movaps 96+32(%rsp),%xmm8
1053 movaps 96+48(%rsp),%xmm9
1054 movaps 96+64(%rsp),%xmm10
1055 movaps 96+80(%rsp),%xmm11
1056 movaps 96+96(%rsp),%xmm12
1057 movaps 96+112(%rsp),%xmm13
1058 movaps 96+128(%rsp),%xmm14
1059 movaps 96+144(%rsp),%xmm15
1060___
1061$code.=<<___;
1062 lea `104+($win64?10*16:0)`(%rsp),%rsi
1063 mov 0(%rsi),%r15
1064 mov 8(%rsi),%r14
1065 mov 16(%rsi),%r13
1066 mov 24(%rsi),%r12
1067 mov 32(%rsi),%rbp
1068 mov 40(%rsi),%rbx
1069 lea 48(%rsi),%rsp
1070.Lepilogue_avx:
1071 ret
1072.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
1073___
1074}
1075$code.=<<___;
1076.align 64
1077K_XX_XX:
1078.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1079.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1080.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1081.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1082.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1083
1084.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1085.align 64
1086___
1087
1088# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1089# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1090if ($win64) {
1091$rec="%rcx";
1092$frame="%rdx";
1093$context="%r8";
1094$disp="%r9";
1095
1096$code.=<<___;
1097.extern __imp_RtlVirtualUnwind
1098.type ssse3_handler,\@abi-omnipotent
1099.align 16
1100ssse3_handler:
1101 push %rsi
1102 push %rdi
1103 push %rbx
1104 push %rbp
1105 push %r12
1106 push %r13
1107 push %r14
1108 push %r15
1109 pushfq
1110 sub \$64,%rsp
1111
1112 mov 120($context),%rax # pull context->Rax
1113 mov 248($context),%rbx # pull context->Rip
1114
1115 mov 8($disp),%rsi # disp->ImageBase
1116 mov 56($disp),%r11 # disp->HandlerData
1117
1118 mov 0(%r11),%r10d # HandlerData[0]
1119 lea (%rsi,%r10),%r10 # prologue label
1120 cmp %r10,%rbx # context->Rip<prologue label
1121 jb .Lcommon_seh_tail
1122
1123 mov 152($context),%rax # pull context->Rsp
1124
1125 mov 4(%r11),%r10d # HandlerData[1]
1126 lea (%rsi,%r10),%r10 # epilogue label
1127 cmp %r10,%rbx # context->Rip>=epilogue label
1128 jae .Lcommon_seh_tail
1129
1130 lea 96(%rax),%rsi
1131 lea 512($context),%rdi # &context.Xmm6
1132 mov \$20,%ecx
1133 .long 0xa548f3fc # cld; rep movsq
1134 lea `104+10*16`(%rax),%rax # adjust stack pointer
1135
1136 mov 0(%rax),%r15
1137 mov 8(%rax),%r14
1138 mov 16(%rax),%r13
1139 mov 24(%rax),%r12
1140 mov 32(%rax),%rbp
1141 mov 40(%rax),%rbx
1142 lea 48(%rax),%rax
1143 mov %rbx,144($context) # restore context->Rbx
1144 mov %rbp,160($context) # restore context->Rbp
1145 mov %r12,216($context) # restore context->R12
1146 mov %r13,224($context) # restore context->R13
1147 mov %r14,232($context) # restore context->R14
1148 mov %r15,240($context) # restore context->R15
1149
1150.Lcommon_seh_tail:
1151 mov 8(%rax),%rdi
1152 mov 16(%rax),%rsi
1153 mov %rax,152($context) # restore context->Rsp
1154 mov %rsi,168($context) # restore context->Rsi
1155 mov %rdi,176($context) # restore context->Rdi
1156
1157 mov 40($disp),%rdi # disp->ContextRecord
1158 mov $context,%rsi # context
1159 mov \$154,%ecx # sizeof(CONTEXT)
1160 .long 0xa548f3fc # cld; rep movsq
1161
1162 mov $disp,%rsi
1163 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1164 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1165 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1166 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1167 mov 40(%rsi),%r10 # disp->ContextRecord
1168 lea 56(%rsi),%r11 # &disp->HandlerData
1169 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1170 mov %r10,32(%rsp) # arg5
1171 mov %r11,40(%rsp) # arg6
1172 mov %r12,48(%rsp) # arg7
1173 mov %rcx,56(%rsp) # arg8, (NULL)
1174 call *__imp_RtlVirtualUnwind(%rip)
1175
1176 mov \$1,%eax # ExceptionContinueSearch
1177 add \$64,%rsp
1178 popfq
1179 pop %r15
1180 pop %r14
1181 pop %r13
1182 pop %r12
1183 pop %rbp
1184 pop %rbx
1185 pop %rdi
1186 pop %rsi
1187 ret
1188.size ssse3_handler,.-ssse3_handler
1189
1190.section .pdata
1191.align 4
1192 .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3
1193 .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3
1194 .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3
1195___
1196$code.=<<___ if ($avx);
1197 .rva .LSEH_begin_aesni_cbc_sha1_enc_avx
1198 .rva .LSEH_end_aesni_cbc_sha1_enc_avx
1199 .rva .LSEH_info_aesni_cbc_sha1_enc_avx
1200___
1201$code.=<<___;
1202.section .xdata
1203.align 8
1204.LSEH_info_aesni_cbc_sha1_enc_ssse3:
1205 .byte 9,0,0,0
1206 .rva ssse3_handler
1207 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
1208___
1209$code.=<<___ if ($avx);
1210.LSEH_info_aesni_cbc_sha1_enc_avx:
1211 .byte 9,0,0,0
1212 .rva ssse3_handler
1213 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1214___
1215}
1216
1217####################################################################
1218sub rex {
1219 local *opcode=shift;
1220 my ($dst,$src)=@_;
1221 my $rex=0;
1222
1223 $rex|=0x04 if($dst>=8);
1224 $rex|=0x01 if($src>=8);
1225 push @opcode,$rex|0x40 if($rex);
1226}
1227
1228sub aesni {
1229 my $line=shift;
1230 my @opcode=(0x66);
1231
1232 if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1233 my %opcodelet = (
1234 "aesenc" => 0xdc, "aesenclast" => 0xdd
1235 );
1236 return undef if (!defined($opcodelet{$1}));
1237 rex(\@opcode,$3,$2);
1238 push @opcode,0x0f,0x38,$opcodelet{$1};
1239 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
1240 return ".byte\t".join(',',@opcode);
1241 }
1242 return $line;
1243}
1244
1245$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1246$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
1247
1248print $code;
1249close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86.pl b/src/lib/libcrypto/aes/asm/aesni-x86.pl
deleted file mode 100644
index 3dc345b585..0000000000
--- a/src/lib/libcrypto/aes/asm/aesni-x86.pl
+++ /dev/null
@@ -1,2189 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for Intel AES-NI extension. In
11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13# details].
14#
15# Performance.
16#
17# To start with see corresponding paragraph in aesni-x86_64.pl...
18# Instead of filling table similar to one found there I've chosen to
19# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20# The simplified table below represents 32-bit performance relative
21# to 64-bit one in every given point. Ratios vary for different
22# encryption modes, therefore interval values.
23#
24# 16-byte 64-byte 256-byte 1-KB 8-KB
25# 53-67% 67-84% 91-94% 95-98% 97-99.5%
26#
27# Lower ratios for smaller block sizes are perfectly understandable,
28# because function call overhead is higher in 32-bit mode. Largest
29# 8-KB block performance is virtually same: 32-bit code is less than
30# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
31
32# January 2011
33#
34# See aesni-x86_64.pl for details. Unlike x86_64 version this module
35# interleaves at most 6 aes[enc|dec] instructions, because there are
36# not enough registers for 8x interleave [which should be optimal for
37# Sandy Bridge]. Actually, performance results for 6x interleave
38# factor presented in aesni-x86_64.pl (except for CTR) are for this
39# module.
40
41# April 2011
42#
43# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
45
46$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
47 # generates drop-in replacement for
48 # crypto/aes/asm/aes-586.pl:-)
49$inline=1; # inline _aesni_[en|de]crypt
50
51$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52push(@INC,"${dir}","${dir}../../perlasm");
53require "x86asm.pl";
54
55&asm_init($ARGV[0],$0);
56
57if ($PREFIX eq "aesni") { $movekey=*movups; }
58else { $movekey=*movups; }
59
60$len="eax";
61$rounds="ecx";
62$key="edx";
63$inp="esi";
64$out="edi";
65$rounds_="ebx"; # backup copy for $rounds
66$key_="ebp"; # backup copy for $key
67
68$rndkey0="xmm0";
69$rndkey1="xmm1";
70$inout0="xmm2";
71$inout1="xmm3";
72$inout2="xmm4";
73$inout3="xmm5"; $in1="xmm5";
74$inout4="xmm6"; $in0="xmm6";
75$inout5="xmm7"; $ivec="xmm7";
76
77# AESNI extenstion
78sub aeskeygenassist
79{ my($dst,$src,$imm)=@_;
80 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
81 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
82}
83sub aescommon
84{ my($opcodelet,$dst,$src)=@_;
85 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
86 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
87}
88sub aesimc { aescommon(0xdb,@_); }
89sub aesenc { aescommon(0xdc,@_); }
90sub aesenclast { aescommon(0xdd,@_); }
91sub aesdec { aescommon(0xde,@_); }
92sub aesdeclast { aescommon(0xdf,@_); }
93
94# Inline version of internal aesni_[en|de]crypt1
95{ my $sn;
96sub aesni_inline_generate1
97{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
98 $sn++;
99
100 &$movekey ($rndkey0,&QWP(0,$key));
101 &$movekey ($rndkey1,&QWP(16,$key));
102 &xorps ($ivec,$rndkey0) if (defined($ivec));
103 &lea ($key,&DWP(32,$key));
104 &xorps ($inout,$ivec) if (defined($ivec));
105 &xorps ($inout,$rndkey0) if (!defined($ivec));
106 &set_label("${p}1_loop_$sn");
107 eval"&aes${p} ($inout,$rndkey1)";
108 &dec ($rounds);
109 &$movekey ($rndkey1,&QWP(0,$key));
110 &lea ($key,&DWP(16,$key));
111 &jnz (&label("${p}1_loop_$sn"));
112 eval"&aes${p}last ($inout,$rndkey1)";
113}}
114
115sub aesni_generate1 # fully unrolled loop
116{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
117
118 &function_begin_B("_aesni_${p}rypt1");
119 &movups ($rndkey0,&QWP(0,$key));
120 &$movekey ($rndkey1,&QWP(0x10,$key));
121 &xorps ($inout,$rndkey0);
122 &$movekey ($rndkey0,&QWP(0x20,$key));
123 &lea ($key,&DWP(0x30,$key));
124 &cmp ($rounds,11);
125 &jb (&label("${p}128"));
126 &lea ($key,&DWP(0x20,$key));
127 &je (&label("${p}192"));
128 &lea ($key,&DWP(0x20,$key));
129 eval"&aes${p} ($inout,$rndkey1)";
130 &$movekey ($rndkey1,&QWP(-0x40,$key));
131 eval"&aes${p} ($inout,$rndkey0)";
132 &$movekey ($rndkey0,&QWP(-0x30,$key));
133 &set_label("${p}192");
134 eval"&aes${p} ($inout,$rndkey1)";
135 &$movekey ($rndkey1,&QWP(-0x20,$key));
136 eval"&aes${p} ($inout,$rndkey0)";
137 &$movekey ($rndkey0,&QWP(-0x10,$key));
138 &set_label("${p}128");
139 eval"&aes${p} ($inout,$rndkey1)";
140 &$movekey ($rndkey1,&QWP(0,$key));
141 eval"&aes${p} ($inout,$rndkey0)";
142 &$movekey ($rndkey0,&QWP(0x10,$key));
143 eval"&aes${p} ($inout,$rndkey1)";
144 &$movekey ($rndkey1,&QWP(0x20,$key));
145 eval"&aes${p} ($inout,$rndkey0)";
146 &$movekey ($rndkey0,&QWP(0x30,$key));
147 eval"&aes${p} ($inout,$rndkey1)";
148 &$movekey ($rndkey1,&QWP(0x40,$key));
149 eval"&aes${p} ($inout,$rndkey0)";
150 &$movekey ($rndkey0,&QWP(0x50,$key));
151 eval"&aes${p} ($inout,$rndkey1)";
152 &$movekey ($rndkey1,&QWP(0x60,$key));
153 eval"&aes${p} ($inout,$rndkey0)";
154 &$movekey ($rndkey0,&QWP(0x70,$key));
155 eval"&aes${p} ($inout,$rndkey1)";
156 eval"&aes${p}last ($inout,$rndkey0)";
157 &ret();
158 &function_end_B("_aesni_${p}rypt1");
159}
160
161# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
162&aesni_generate1("enc") if (!$inline);
163&function_begin_B("${PREFIX}_encrypt");
164 &mov ("eax",&wparam(0));
165 &mov ($key,&wparam(2));
166 &movups ($inout0,&QWP(0,"eax"));
167 &mov ($rounds,&DWP(240,$key));
168 &mov ("eax",&wparam(1));
169 if ($inline)
170 { &aesni_inline_generate1("enc"); }
171 else
172 { &call ("_aesni_encrypt1"); }
173 &movups (&QWP(0,"eax"),$inout0);
174 &ret ();
175&function_end_B("${PREFIX}_encrypt");
176
177# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
178&aesni_generate1("dec") if(!$inline);
179&function_begin_B("${PREFIX}_decrypt");
180 &mov ("eax",&wparam(0));
181 &mov ($key,&wparam(2));
182 &movups ($inout0,&QWP(0,"eax"));
183 &mov ($rounds,&DWP(240,$key));
184 &mov ("eax",&wparam(1));
185 if ($inline)
186 { &aesni_inline_generate1("dec"); }
187 else
188 { &call ("_aesni_decrypt1"); }
189 &movups (&QWP(0,"eax"),$inout0);
190 &ret ();
191&function_end_B("${PREFIX}_decrypt");
192
193# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
194# factor. Why 3x subroutine were originally used in loops? Even though
195# aes[enc|dec] latency was originally 6, it could be scheduled only
196# every *2nd* cycle. Thus 3x interleave was the one providing optimal
197# utilization, i.e. when subroutine's throughput is virtually same as
198# of non-interleaved subroutine [for number of input blocks up to 3].
199# This is why it makes no sense to implement 2x subroutine.
200# aes[enc|dec] latency in next processor generation is 8, but the
201# instructions can be scheduled every cycle. Optimal interleave for
202# new processor is therefore 8x, but it's unfeasible to accommodate it
203# in XMM registers addreassable in 32-bit mode and therefore 6x is
204# used instead...
205
206sub aesni_generate3
207{ my $p=shift;
208
209 &function_begin_B("_aesni_${p}rypt3");
210 &$movekey ($rndkey0,&QWP(0,$key));
211 &shr ($rounds,1);
212 &$movekey ($rndkey1,&QWP(16,$key));
213 &lea ($key,&DWP(32,$key));
214 &xorps ($inout0,$rndkey0);
215 &pxor ($inout1,$rndkey0);
216 &pxor ($inout2,$rndkey0);
217 &$movekey ($rndkey0,&QWP(0,$key));
218
219 &set_label("${p}3_loop");
220 eval"&aes${p} ($inout0,$rndkey1)";
221 eval"&aes${p} ($inout1,$rndkey1)";
222 &dec ($rounds);
223 eval"&aes${p} ($inout2,$rndkey1)";
224 &$movekey ($rndkey1,&QWP(16,$key));
225 eval"&aes${p} ($inout0,$rndkey0)";
226 eval"&aes${p} ($inout1,$rndkey0)";
227 &lea ($key,&DWP(32,$key));
228 eval"&aes${p} ($inout2,$rndkey0)";
229 &$movekey ($rndkey0,&QWP(0,$key));
230 &jnz (&label("${p}3_loop"));
231 eval"&aes${p} ($inout0,$rndkey1)";
232 eval"&aes${p} ($inout1,$rndkey1)";
233 eval"&aes${p} ($inout2,$rndkey1)";
234 eval"&aes${p}last ($inout0,$rndkey0)";
235 eval"&aes${p}last ($inout1,$rndkey0)";
236 eval"&aes${p}last ($inout2,$rndkey0)";
237 &ret();
238 &function_end_B("_aesni_${p}rypt3");
239}
240
241# 4x interleave is implemented to improve small block performance,
242# most notably [and naturally] 4 block by ~30%. One can argue that one
243# should have implemented 5x as well, but improvement would be <20%,
244# so it's not worth it...
245sub aesni_generate4
246{ my $p=shift;
247
248 &function_begin_B("_aesni_${p}rypt4");
249 &$movekey ($rndkey0,&QWP(0,$key));
250 &$movekey ($rndkey1,&QWP(16,$key));
251 &shr ($rounds,1);
252 &lea ($key,&DWP(32,$key));
253 &xorps ($inout0,$rndkey0);
254 &pxor ($inout1,$rndkey0);
255 &pxor ($inout2,$rndkey0);
256 &pxor ($inout3,$rndkey0);
257 &$movekey ($rndkey0,&QWP(0,$key));
258
259 &set_label("${p}4_loop");
260 eval"&aes${p} ($inout0,$rndkey1)";
261 eval"&aes${p} ($inout1,$rndkey1)";
262 &dec ($rounds);
263 eval"&aes${p} ($inout2,$rndkey1)";
264 eval"&aes${p} ($inout3,$rndkey1)";
265 &$movekey ($rndkey1,&QWP(16,$key));
266 eval"&aes${p} ($inout0,$rndkey0)";
267 eval"&aes${p} ($inout1,$rndkey0)";
268 &lea ($key,&DWP(32,$key));
269 eval"&aes${p} ($inout2,$rndkey0)";
270 eval"&aes${p} ($inout3,$rndkey0)";
271 &$movekey ($rndkey0,&QWP(0,$key));
272 &jnz (&label("${p}4_loop"));
273
274 eval"&aes${p} ($inout0,$rndkey1)";
275 eval"&aes${p} ($inout1,$rndkey1)";
276 eval"&aes${p} ($inout2,$rndkey1)";
277 eval"&aes${p} ($inout3,$rndkey1)";
278 eval"&aes${p}last ($inout0,$rndkey0)";
279 eval"&aes${p}last ($inout1,$rndkey0)";
280 eval"&aes${p}last ($inout2,$rndkey0)";
281 eval"&aes${p}last ($inout3,$rndkey0)";
282 &ret();
283 &function_end_B("_aesni_${p}rypt4");
284}
285
286sub aesni_generate6
287{ my $p=shift;
288
289 &function_begin_B("_aesni_${p}rypt6");
290 &static_label("_aesni_${p}rypt6_enter");
291 &$movekey ($rndkey0,&QWP(0,$key));
292 &shr ($rounds,1);
293 &$movekey ($rndkey1,&QWP(16,$key));
294 &lea ($key,&DWP(32,$key));
295 &xorps ($inout0,$rndkey0);
296 &pxor ($inout1,$rndkey0); # pxor does better here
297 eval"&aes${p} ($inout0,$rndkey1)";
298 &pxor ($inout2,$rndkey0);
299 eval"&aes${p} ($inout1,$rndkey1)";
300 &pxor ($inout3,$rndkey0);
301 &dec ($rounds);
302 eval"&aes${p} ($inout2,$rndkey1)";
303 &pxor ($inout4,$rndkey0);
304 eval"&aes${p} ($inout3,$rndkey1)";
305 &pxor ($inout5,$rndkey0);
306 eval"&aes${p} ($inout4,$rndkey1)";
307 &$movekey ($rndkey0,&QWP(0,$key));
308 eval"&aes${p} ($inout5,$rndkey1)";
309 &jmp (&label("_aesni_${p}rypt6_enter"));
310
311 &set_label("${p}6_loop",16);
312 eval"&aes${p} ($inout0,$rndkey1)";
313 eval"&aes${p} ($inout1,$rndkey1)";
314 &dec ($rounds);
315 eval"&aes${p} ($inout2,$rndkey1)";
316 eval"&aes${p} ($inout3,$rndkey1)";
317 eval"&aes${p} ($inout4,$rndkey1)";
318 eval"&aes${p} ($inout5,$rndkey1)";
319 &set_label("_aesni_${p}rypt6_enter",16);
320 &$movekey ($rndkey1,&QWP(16,$key));
321 eval"&aes${p} ($inout0,$rndkey0)";
322 eval"&aes${p} ($inout1,$rndkey0)";
323 &lea ($key,&DWP(32,$key));
324 eval"&aes${p} ($inout2,$rndkey0)";
325 eval"&aes${p} ($inout3,$rndkey0)";
326 eval"&aes${p} ($inout4,$rndkey0)";
327 eval"&aes${p} ($inout5,$rndkey0)";
328 &$movekey ($rndkey0,&QWP(0,$key));
329 &jnz (&label("${p}6_loop"));
330
331 eval"&aes${p} ($inout0,$rndkey1)";
332 eval"&aes${p} ($inout1,$rndkey1)";
333 eval"&aes${p} ($inout2,$rndkey1)";
334 eval"&aes${p} ($inout3,$rndkey1)";
335 eval"&aes${p} ($inout4,$rndkey1)";
336 eval"&aes${p} ($inout5,$rndkey1)";
337 eval"&aes${p}last ($inout0,$rndkey0)";
338 eval"&aes${p}last ($inout1,$rndkey0)";
339 eval"&aes${p}last ($inout2,$rndkey0)";
340 eval"&aes${p}last ($inout3,$rndkey0)";
341 eval"&aes${p}last ($inout4,$rndkey0)";
342 eval"&aes${p}last ($inout5,$rndkey0)";
343 &ret();
344 &function_end_B("_aesni_${p}rypt6");
345}
346&aesni_generate3("enc") if ($PREFIX eq "aesni");
347&aesni_generate3("dec");
348&aesni_generate4("enc") if ($PREFIX eq "aesni");
349&aesni_generate4("dec");
350&aesni_generate6("enc") if ($PREFIX eq "aesni");
351&aesni_generate6("dec");
352
353if ($PREFIX eq "aesni") {
354######################################################################
355# void aesni_ecb_encrypt (const void *in, void *out,
356# size_t length, const AES_KEY *key,
357# int enc);
358&function_begin("aesni_ecb_encrypt");
359 &mov ($inp,&wparam(0));
360 &mov ($out,&wparam(1));
361 &mov ($len,&wparam(2));
362 &mov ($key,&wparam(3));
363 &mov ($rounds_,&wparam(4));
364 &and ($len,-16);
365 &jz (&label("ecb_ret"));
366 &mov ($rounds,&DWP(240,$key));
367 &test ($rounds_,$rounds_);
368 &jz (&label("ecb_decrypt"));
369
370 &mov ($key_,$key); # backup $key
371 &mov ($rounds_,$rounds); # backup $rounds
372 &cmp ($len,0x60);
373 &jb (&label("ecb_enc_tail"));
374
375 &movdqu ($inout0,&QWP(0,$inp));
376 &movdqu ($inout1,&QWP(0x10,$inp));
377 &movdqu ($inout2,&QWP(0x20,$inp));
378 &movdqu ($inout3,&QWP(0x30,$inp));
379 &movdqu ($inout4,&QWP(0x40,$inp));
380 &movdqu ($inout5,&QWP(0x50,$inp));
381 &lea ($inp,&DWP(0x60,$inp));
382 &sub ($len,0x60);
383 &jmp (&label("ecb_enc_loop6_enter"));
384
385&set_label("ecb_enc_loop6",16);
386 &movups (&QWP(0,$out),$inout0);
387 &movdqu ($inout0,&QWP(0,$inp));
388 &movups (&QWP(0x10,$out),$inout1);
389 &movdqu ($inout1,&QWP(0x10,$inp));
390 &movups (&QWP(0x20,$out),$inout2);
391 &movdqu ($inout2,&QWP(0x20,$inp));
392 &movups (&QWP(0x30,$out),$inout3);
393 &movdqu ($inout3,&QWP(0x30,$inp));
394 &movups (&QWP(0x40,$out),$inout4);
395 &movdqu ($inout4,&QWP(0x40,$inp));
396 &movups (&QWP(0x50,$out),$inout5);
397 &lea ($out,&DWP(0x60,$out));
398 &movdqu ($inout5,&QWP(0x50,$inp));
399 &lea ($inp,&DWP(0x60,$inp));
400&set_label("ecb_enc_loop6_enter");
401
402 &call ("_aesni_encrypt6");
403
404 &mov ($key,$key_); # restore $key
405 &mov ($rounds,$rounds_); # restore $rounds
406 &sub ($len,0x60);
407 &jnc (&label("ecb_enc_loop6"));
408
409 &movups (&QWP(0,$out),$inout0);
410 &movups (&QWP(0x10,$out),$inout1);
411 &movups (&QWP(0x20,$out),$inout2);
412 &movups (&QWP(0x30,$out),$inout3);
413 &movups (&QWP(0x40,$out),$inout4);
414 &movups (&QWP(0x50,$out),$inout5);
415 &lea ($out,&DWP(0x60,$out));
416 &add ($len,0x60);
417 &jz (&label("ecb_ret"));
418
419&set_label("ecb_enc_tail");
420 &movups ($inout0,&QWP(0,$inp));
421 &cmp ($len,0x20);
422 &jb (&label("ecb_enc_one"));
423 &movups ($inout1,&QWP(0x10,$inp));
424 &je (&label("ecb_enc_two"));
425 &movups ($inout2,&QWP(0x20,$inp));
426 &cmp ($len,0x40);
427 &jb (&label("ecb_enc_three"));
428 &movups ($inout3,&QWP(0x30,$inp));
429 &je (&label("ecb_enc_four"));
430 &movups ($inout4,&QWP(0x40,$inp));
431 &xorps ($inout5,$inout5);
432 &call ("_aesni_encrypt6");
433 &movups (&QWP(0,$out),$inout0);
434 &movups (&QWP(0x10,$out),$inout1);
435 &movups (&QWP(0x20,$out),$inout2);
436 &movups (&QWP(0x30,$out),$inout3);
437 &movups (&QWP(0x40,$out),$inout4);
438 jmp (&label("ecb_ret"));
439
440&set_label("ecb_enc_one",16);
441 if ($inline)
442 { &aesni_inline_generate1("enc"); }
443 else
444 { &call ("_aesni_encrypt1"); }
445 &movups (&QWP(0,$out),$inout0);
446 &jmp (&label("ecb_ret"));
447
448&set_label("ecb_enc_two",16);
449 &xorps ($inout2,$inout2);
450 &call ("_aesni_encrypt3");
451 &movups (&QWP(0,$out),$inout0);
452 &movups (&QWP(0x10,$out),$inout1);
453 &jmp (&label("ecb_ret"));
454
455&set_label("ecb_enc_three",16);
456 &call ("_aesni_encrypt3");
457 &movups (&QWP(0,$out),$inout0);
458 &movups (&QWP(0x10,$out),$inout1);
459 &movups (&QWP(0x20,$out),$inout2);
460 &jmp (&label("ecb_ret"));
461
462&set_label("ecb_enc_four",16);
463 &call ("_aesni_encrypt4");
464 &movups (&QWP(0,$out),$inout0);
465 &movups (&QWP(0x10,$out),$inout1);
466 &movups (&QWP(0x20,$out),$inout2);
467 &movups (&QWP(0x30,$out),$inout3);
468 &jmp (&label("ecb_ret"));
469######################################################################
470&set_label("ecb_decrypt",16);
471 &mov ($key_,$key); # backup $key
472 &mov ($rounds_,$rounds); # backup $rounds
473 &cmp ($len,0x60);
474 &jb (&label("ecb_dec_tail"));
475
476 &movdqu ($inout0,&QWP(0,$inp));
477 &movdqu ($inout1,&QWP(0x10,$inp));
478 &movdqu ($inout2,&QWP(0x20,$inp));
479 &movdqu ($inout3,&QWP(0x30,$inp));
480 &movdqu ($inout4,&QWP(0x40,$inp));
481 &movdqu ($inout5,&QWP(0x50,$inp));
482 &lea ($inp,&DWP(0x60,$inp));
483 &sub ($len,0x60);
484 &jmp (&label("ecb_dec_loop6_enter"));
485
486&set_label("ecb_dec_loop6",16);
487 &movups (&QWP(0,$out),$inout0);
488 &movdqu ($inout0,&QWP(0,$inp));
489 &movups (&QWP(0x10,$out),$inout1);
490 &movdqu ($inout1,&QWP(0x10,$inp));
491 &movups (&QWP(0x20,$out),$inout2);
492 &movdqu ($inout2,&QWP(0x20,$inp));
493 &movups (&QWP(0x30,$out),$inout3);
494 &movdqu ($inout3,&QWP(0x30,$inp));
495 &movups (&QWP(0x40,$out),$inout4);
496 &movdqu ($inout4,&QWP(0x40,$inp));
497 &movups (&QWP(0x50,$out),$inout5);
498 &lea ($out,&DWP(0x60,$out));
499 &movdqu ($inout5,&QWP(0x50,$inp));
500 &lea ($inp,&DWP(0x60,$inp));
501&set_label("ecb_dec_loop6_enter");
502
503 &call ("_aesni_decrypt6");
504
505 &mov ($key,$key_); # restore $key
506 &mov ($rounds,$rounds_); # restore $rounds
507 &sub ($len,0x60);
508 &jnc (&label("ecb_dec_loop6"));
509
510 &movups (&QWP(0,$out),$inout0);
511 &movups (&QWP(0x10,$out),$inout1);
512 &movups (&QWP(0x20,$out),$inout2);
513 &movups (&QWP(0x30,$out),$inout3);
514 &movups (&QWP(0x40,$out),$inout4);
515 &movups (&QWP(0x50,$out),$inout5);
516 &lea ($out,&DWP(0x60,$out));
517 &add ($len,0x60);
518 &jz (&label("ecb_ret"));
519
520&set_label("ecb_dec_tail");
521 &movups ($inout0,&QWP(0,$inp));
522 &cmp ($len,0x20);
523 &jb (&label("ecb_dec_one"));
524 &movups ($inout1,&QWP(0x10,$inp));
525 &je (&label("ecb_dec_two"));
526 &movups ($inout2,&QWP(0x20,$inp));
527 &cmp ($len,0x40);
528 &jb (&label("ecb_dec_three"));
529 &movups ($inout3,&QWP(0x30,$inp));
530 &je (&label("ecb_dec_four"));
531 &movups ($inout4,&QWP(0x40,$inp));
532 &xorps ($inout5,$inout5);
533 &call ("_aesni_decrypt6");
534 &movups (&QWP(0,$out),$inout0);
535 &movups (&QWP(0x10,$out),$inout1);
536 &movups (&QWP(0x20,$out),$inout2);
537 &movups (&QWP(0x30,$out),$inout3);
538 &movups (&QWP(0x40,$out),$inout4);
539 &jmp (&label("ecb_ret"));
540
541&set_label("ecb_dec_one",16);
542 if ($inline)
543 { &aesni_inline_generate1("dec"); }
544 else
545 { &call ("_aesni_decrypt1"); }
546 &movups (&QWP(0,$out),$inout0);
547 &jmp (&label("ecb_ret"));
548
549&set_label("ecb_dec_two",16);
550 &xorps ($inout2,$inout2);
551 &call ("_aesni_decrypt3");
552 &movups (&QWP(0,$out),$inout0);
553 &movups (&QWP(0x10,$out),$inout1);
554 &jmp (&label("ecb_ret"));
555
556&set_label("ecb_dec_three",16);
557 &call ("_aesni_decrypt3");
558 &movups (&QWP(0,$out),$inout0);
559 &movups (&QWP(0x10,$out),$inout1);
560 &movups (&QWP(0x20,$out),$inout2);
561 &jmp (&label("ecb_ret"));
562
563&set_label("ecb_dec_four",16);
564 &call ("_aesni_decrypt4");
565 &movups (&QWP(0,$out),$inout0);
566 &movups (&QWP(0x10,$out),$inout1);
567 &movups (&QWP(0x20,$out),$inout2);
568 &movups (&QWP(0x30,$out),$inout3);
569
570&set_label("ecb_ret");
571&function_end("aesni_ecb_encrypt");
572
573######################################################################
574# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
575# size_t blocks, const AES_KEY *key,
576# const char *ivec,char *cmac);
577#
578# Handles only complete blocks, operates on 64-bit counter and
579# does not update *ivec! Nor does it finalize CMAC value
580# (see engine/eng_aesni.c for details)
581#
582{ my $cmac=$inout1;
583&function_begin("aesni_ccm64_encrypt_blocks");
584 &mov ($inp,&wparam(0));
585 &mov ($out,&wparam(1));
586 &mov ($len,&wparam(2));
587 &mov ($key,&wparam(3));
588 &mov ($rounds_,&wparam(4));
589 &mov ($rounds,&wparam(5));
590 &mov ($key_,"esp");
591 &sub ("esp",60);
592 &and ("esp",-16); # align stack
593 &mov (&DWP(48,"esp"),$key_);
594
595 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
596 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
597 &mov ($rounds,&DWP(240,$key));
598
599 # compose byte-swap control mask for pshufb on stack
600 &mov (&DWP(0,"esp"),0x0c0d0e0f);
601 &mov (&DWP(4,"esp"),0x08090a0b);
602 &mov (&DWP(8,"esp"),0x04050607);
603 &mov (&DWP(12,"esp"),0x00010203);
604
605 # compose counter increment vector on stack
606 &mov ($rounds_,1);
607 &xor ($key_,$key_);
608 &mov (&DWP(16,"esp"),$rounds_);
609 &mov (&DWP(20,"esp"),$key_);
610 &mov (&DWP(24,"esp"),$key_);
611 &mov (&DWP(28,"esp"),$key_);
612
613 &shr ($rounds,1);
614 &lea ($key_,&DWP(0,$key));
615 &movdqa ($inout3,&QWP(0,"esp"));
616 &movdqa ($inout0,$ivec);
617 &mov ($rounds_,$rounds);
618 &pshufb ($ivec,$inout3);
619
620&set_label("ccm64_enc_outer");
621 &$movekey ($rndkey0,&QWP(0,$key_));
622 &mov ($rounds,$rounds_);
623 &movups ($in0,&QWP(0,$inp));
624
625 &xorps ($inout0,$rndkey0);
626 &$movekey ($rndkey1,&QWP(16,$key_));
627 &xorps ($rndkey0,$in0);
628 &lea ($key,&DWP(32,$key_));
629 &xorps ($cmac,$rndkey0); # cmac^=inp
630 &$movekey ($rndkey0,&QWP(0,$key));
631
632&set_label("ccm64_enc2_loop");
633 &aesenc ($inout0,$rndkey1);
634 &dec ($rounds);
635 &aesenc ($cmac,$rndkey1);
636 &$movekey ($rndkey1,&QWP(16,$key));
637 &aesenc ($inout0,$rndkey0);
638 &lea ($key,&DWP(32,$key));
639 &aesenc ($cmac,$rndkey0);
640 &$movekey ($rndkey0,&QWP(0,$key));
641 &jnz (&label("ccm64_enc2_loop"));
642 &aesenc ($inout0,$rndkey1);
643 &aesenc ($cmac,$rndkey1);
644 &paddq ($ivec,&QWP(16,"esp"));
645 &aesenclast ($inout0,$rndkey0);
646 &aesenclast ($cmac,$rndkey0);
647
648 &dec ($len);
649 &lea ($inp,&DWP(16,$inp));
650 &xorps ($in0,$inout0); # inp^=E(ivec)
651 &movdqa ($inout0,$ivec);
652 &movups (&QWP(0,$out),$in0); # save output
653 &lea ($out,&DWP(16,$out));
654 &pshufb ($inout0,$inout3);
655 &jnz (&label("ccm64_enc_outer"));
656
657 &mov ("esp",&DWP(48,"esp"));
658 &mov ($out,&wparam(5));
659 &movups (&QWP(0,$out),$cmac);
660&function_end("aesni_ccm64_encrypt_blocks");
661
662&function_begin("aesni_ccm64_decrypt_blocks");
663 &mov ($inp,&wparam(0));
664 &mov ($out,&wparam(1));
665 &mov ($len,&wparam(2));
666 &mov ($key,&wparam(3));
667 &mov ($rounds_,&wparam(4));
668 &mov ($rounds,&wparam(5));
669 &mov ($key_,"esp");
670 &sub ("esp",60);
671 &and ("esp",-16); # align stack
672 &mov (&DWP(48,"esp"),$key_);
673
674 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
675 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
676 &mov ($rounds,&DWP(240,$key));
677
678 # compose byte-swap control mask for pshufb on stack
679 &mov (&DWP(0,"esp"),0x0c0d0e0f);
680 &mov (&DWP(4,"esp"),0x08090a0b);
681 &mov (&DWP(8,"esp"),0x04050607);
682 &mov (&DWP(12,"esp"),0x00010203);
683
684 # compose counter increment vector on stack
685 &mov ($rounds_,1);
686 &xor ($key_,$key_);
687 &mov (&DWP(16,"esp"),$rounds_);
688 &mov (&DWP(20,"esp"),$key_);
689 &mov (&DWP(24,"esp"),$key_);
690 &mov (&DWP(28,"esp"),$key_);
691
692 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
693 &movdqa ($inout0,$ivec);
694
695 &mov ($key_,$key);
696 &mov ($rounds_,$rounds);
697
698 &pshufb ($ivec,$inout3);
699 if ($inline)
700 { &aesni_inline_generate1("enc"); }
701 else
702 { &call ("_aesni_encrypt1"); }
703 &movups ($in0,&QWP(0,$inp)); # load inp
704 &paddq ($ivec,&QWP(16,"esp"));
705 &lea ($inp,&QWP(16,$inp));
706 &jmp (&label("ccm64_dec_outer"));
707
708&set_label("ccm64_dec_outer",16);
709 &xorps ($in0,$inout0); # inp ^= E(ivec)
710 &movdqa ($inout0,$ivec);
711 &mov ($rounds,$rounds_);
712 &movups (&QWP(0,$out),$in0); # save output
713 &lea ($out,&DWP(16,$out));
714 &pshufb ($inout0,$inout3);
715
716 &sub ($len,1);
717 &jz (&label("ccm64_dec_break"));
718
719 &$movekey ($rndkey0,&QWP(0,$key_));
720 &shr ($rounds,1);
721 &$movekey ($rndkey1,&QWP(16,$key_));
722 &xorps ($in0,$rndkey0);
723 &lea ($key,&DWP(32,$key_));
724 &xorps ($inout0,$rndkey0);
725 &xorps ($cmac,$in0); # cmac^=out
726 &$movekey ($rndkey0,&QWP(0,$key));
727
728&set_label("ccm64_dec2_loop");
729 &aesenc ($inout0,$rndkey1);
730 &dec ($rounds);
731 &aesenc ($cmac,$rndkey1);
732 &$movekey ($rndkey1,&QWP(16,$key));
733 &aesenc ($inout0,$rndkey0);
734 &lea ($key,&DWP(32,$key));
735 &aesenc ($cmac,$rndkey0);
736 &$movekey ($rndkey0,&QWP(0,$key));
737 &jnz (&label("ccm64_dec2_loop"));
738 &movups ($in0,&QWP(0,$inp)); # load inp
739 &paddq ($ivec,&QWP(16,"esp"));
740 &aesenc ($inout0,$rndkey1);
741 &aesenc ($cmac,$rndkey1);
742 &lea ($inp,&QWP(16,$inp));
743 &aesenclast ($inout0,$rndkey0);
744 &aesenclast ($cmac,$rndkey0);
745 &jmp (&label("ccm64_dec_outer"));
746
747&set_label("ccm64_dec_break",16);
748 &mov ($key,$key_);
749 if ($inline)
750 { &aesni_inline_generate1("enc",$cmac,$in0); }
751 else
752 { &call ("_aesni_encrypt1",$cmac); }
753
754 &mov ("esp",&DWP(48,"esp"));
755 &mov ($out,&wparam(5));
756 &movups (&QWP(0,$out),$cmac);
757&function_end("aesni_ccm64_decrypt_blocks");
758}
759
760######################################################################
761# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
762# size_t blocks, const AES_KEY *key,
763# const char *ivec);
764#
765# Handles only complete blocks, operates on 32-bit counter and
766# does not update *ivec! (see engine/eng_aesni.c for details)
767#
768# stack layout:
769# 0 pshufb mask
770# 16 vector addend: 0,6,6,6
771# 32 counter-less ivec
772# 48 1st triplet of counter vector
773# 64 2nd triplet of counter vector
774# 80 saved %esp
775
776&function_begin("aesni_ctr32_encrypt_blocks");
777 &mov ($inp,&wparam(0));
778 &mov ($out,&wparam(1));
779 &mov ($len,&wparam(2));
780 &mov ($key,&wparam(3));
781 &mov ($rounds_,&wparam(4));
782 &mov ($key_,"esp");
783 &sub ("esp",88);
784 &and ("esp",-16); # align stack
785 &mov (&DWP(80,"esp"),$key_);
786
787 &cmp ($len,1);
788 &je (&label("ctr32_one_shortcut"));
789
790 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
791
792 # compose byte-swap control mask for pshufb on stack
793 &mov (&DWP(0,"esp"),0x0c0d0e0f);
794 &mov (&DWP(4,"esp"),0x08090a0b);
795 &mov (&DWP(8,"esp"),0x04050607);
796 &mov (&DWP(12,"esp"),0x00010203);
797
798 # compose counter increment vector on stack
799 &mov ($rounds,6);
800 &xor ($key_,$key_);
801 &mov (&DWP(16,"esp"),$rounds);
802 &mov (&DWP(20,"esp"),$rounds);
803 &mov (&DWP(24,"esp"),$rounds);
804 &mov (&DWP(28,"esp"),$key_);
805
806 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
807 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
808
809 &mov ($rounds,&DWP(240,$key)); # key->rounds
810
811 # compose 2 vectors of 3x32-bit counters
812 &bswap ($rounds_);
813 &pxor ($rndkey1,$rndkey1);
814 &pxor ($rndkey0,$rndkey0);
815 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
816 &pinsrd ($rndkey1,$rounds_,0);
817 &lea ($key_,&DWP(3,$rounds_));
818 &pinsrd ($rndkey0,$key_,0);
819 &inc ($rounds_);
820 &pinsrd ($rndkey1,$rounds_,1);
821 &inc ($key_);
822 &pinsrd ($rndkey0,$key_,1);
823 &inc ($rounds_);
824 &pinsrd ($rndkey1,$rounds_,2);
825 &inc ($key_);
826 &pinsrd ($rndkey0,$key_,2);
827 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
828 &pshufb ($rndkey1,$inout0); # byte swap
829 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
830 &pshufb ($rndkey0,$inout0); # byte swap
831
832 &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
833 &pshufd ($inout1,$rndkey1,2<<6);
834 &cmp ($len,6);
835 &jb (&label("ctr32_tail"));
836 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
837 &shr ($rounds,1);
838 &mov ($key_,$key); # backup $key
839 &mov ($rounds_,$rounds); # backup $rounds
840 &sub ($len,6);
841 &jmp (&label("ctr32_loop6"));
842
843&set_label("ctr32_loop6",16);
844 &pshufd ($inout2,$rndkey1,1<<6);
845 &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
846 &pshufd ($inout3,$rndkey0,3<<6);
847 &por ($inout0,$rndkey1); # merge counter-less ivec
848 &pshufd ($inout4,$rndkey0,2<<6);
849 &por ($inout1,$rndkey1);
850 &pshufd ($inout5,$rndkey0,1<<6);
851 &por ($inout2,$rndkey1);
852 &por ($inout3,$rndkey1);
853 &por ($inout4,$rndkey1);
854 &por ($inout5,$rndkey1);
855
856 # inlining _aesni_encrypt6's prologue gives ~4% improvement...
857 &$movekey ($rndkey0,&QWP(0,$key_));
858 &$movekey ($rndkey1,&QWP(16,$key_));
859 &lea ($key,&DWP(32,$key_));
860 &dec ($rounds);
861 &pxor ($inout0,$rndkey0);
862 &pxor ($inout1,$rndkey0);
863 &aesenc ($inout0,$rndkey1);
864 &pxor ($inout2,$rndkey0);
865 &aesenc ($inout1,$rndkey1);
866 &pxor ($inout3,$rndkey0);
867 &aesenc ($inout2,$rndkey1);
868 &pxor ($inout4,$rndkey0);
869 &aesenc ($inout3,$rndkey1);
870 &pxor ($inout5,$rndkey0);
871 &aesenc ($inout4,$rndkey1);
872 &$movekey ($rndkey0,&QWP(0,$key));
873 &aesenc ($inout5,$rndkey1);
874
875 &call (&label("_aesni_encrypt6_enter"));
876
877 &movups ($rndkey1,&QWP(0,$inp));
878 &movups ($rndkey0,&QWP(0x10,$inp));
879 &xorps ($inout0,$rndkey1);
880 &movups ($rndkey1,&QWP(0x20,$inp));
881 &xorps ($inout1,$rndkey0);
882 &movups (&QWP(0,$out),$inout0);
883 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
884 &xorps ($inout2,$rndkey1);
885 &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
886 &movups (&QWP(0x10,$out),$inout1);
887 &movups (&QWP(0x20,$out),$inout2);
888
889 &paddd ($rndkey1,$rndkey0); # 1st triplet increment
890 &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
891 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
892
893 &movups ($inout1,&QWP(0x30,$inp));
894 &movups ($inout2,&QWP(0x40,$inp));
895 &xorps ($inout3,$inout1);
896 &movups ($inout1,&QWP(0x50,$inp));
897 &lea ($inp,&DWP(0x60,$inp));
898 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
899 &pshufb ($rndkey1,$inout0); # byte swap
900 &xorps ($inout4,$inout2);
901 &movups (&QWP(0x30,$out),$inout3);
902 &xorps ($inout5,$inout1);
903 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
904 &pshufb ($rndkey0,$inout0); # byte swap
905 &movups (&QWP(0x40,$out),$inout4);
906 &pshufd ($inout0,$rndkey1,3<<6);
907 &movups (&QWP(0x50,$out),$inout5);
908 &lea ($out,&DWP(0x60,$out));
909
910 &mov ($rounds,$rounds_);
911 &pshufd ($inout1,$rndkey1,2<<6);
912 &sub ($len,6);
913 &jnc (&label("ctr32_loop6"));
914
915 &add ($len,6);
916 &jz (&label("ctr32_ret"));
917 &mov ($key,$key_);
918 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
919 &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec
920
921&set_label("ctr32_tail");
922 &por ($inout0,$inout5);
923 &cmp ($len,2);
924 &jb (&label("ctr32_one"));
925
926 &pshufd ($inout2,$rndkey1,1<<6);
927 &por ($inout1,$inout5);
928 &je (&label("ctr32_two"));
929
930 &pshufd ($inout3,$rndkey0,3<<6);
931 &por ($inout2,$inout5);
932 &cmp ($len,4);
933 &jb (&label("ctr32_three"));
934
935 &pshufd ($inout4,$rndkey0,2<<6);
936 &por ($inout3,$inout5);
937 &je (&label("ctr32_four"));
938
939 &por ($inout4,$inout5);
940 &call ("_aesni_encrypt6");
941 &movups ($rndkey1,&QWP(0,$inp));
942 &movups ($rndkey0,&QWP(0x10,$inp));
943 &xorps ($inout0,$rndkey1);
944 &movups ($rndkey1,&QWP(0x20,$inp));
945 &xorps ($inout1,$rndkey0);
946 &movups ($rndkey0,&QWP(0x30,$inp));
947 &xorps ($inout2,$rndkey1);
948 &movups ($rndkey1,&QWP(0x40,$inp));
949 &xorps ($inout3,$rndkey0);
950 &movups (&QWP(0,$out),$inout0);
951 &xorps ($inout4,$rndkey1);
952 &movups (&QWP(0x10,$out),$inout1);
953 &movups (&QWP(0x20,$out),$inout2);
954 &movups (&QWP(0x30,$out),$inout3);
955 &movups (&QWP(0x40,$out),$inout4);
956 &jmp (&label("ctr32_ret"));
957
958&set_label("ctr32_one_shortcut",16);
959 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
960 &mov ($rounds,&DWP(240,$key));
961
962&set_label("ctr32_one");
963 if ($inline)
964 { &aesni_inline_generate1("enc"); }
965 else
966 { &call ("_aesni_encrypt1"); }
967 &movups ($in0,&QWP(0,$inp));
968 &xorps ($in0,$inout0);
969 &movups (&QWP(0,$out),$in0);
970 &jmp (&label("ctr32_ret"));
971
972&set_label("ctr32_two",16);
973 &call ("_aesni_encrypt3");
974 &movups ($inout3,&QWP(0,$inp));
975 &movups ($inout4,&QWP(0x10,$inp));
976 &xorps ($inout0,$inout3);
977 &xorps ($inout1,$inout4);
978 &movups (&QWP(0,$out),$inout0);
979 &movups (&QWP(0x10,$out),$inout1);
980 &jmp (&label("ctr32_ret"));
981
982&set_label("ctr32_three",16);
983 &call ("_aesni_encrypt3");
984 &movups ($inout3,&QWP(0,$inp));
985 &movups ($inout4,&QWP(0x10,$inp));
986 &xorps ($inout0,$inout3);
987 &movups ($inout5,&QWP(0x20,$inp));
988 &xorps ($inout1,$inout4);
989 &movups (&QWP(0,$out),$inout0);
990 &xorps ($inout2,$inout5);
991 &movups (&QWP(0x10,$out),$inout1);
992 &movups (&QWP(0x20,$out),$inout2);
993 &jmp (&label("ctr32_ret"));
994
995&set_label("ctr32_four",16);
996 &call ("_aesni_encrypt4");
997 &movups ($inout4,&QWP(0,$inp));
998 &movups ($inout5,&QWP(0x10,$inp));
999 &movups ($rndkey1,&QWP(0x20,$inp));
1000 &xorps ($inout0,$inout4);
1001 &movups ($rndkey0,&QWP(0x30,$inp));
1002 &xorps ($inout1,$inout5);
1003 &movups (&QWP(0,$out),$inout0);
1004 &xorps ($inout2,$rndkey1);
1005 &movups (&QWP(0x10,$out),$inout1);
1006 &xorps ($inout3,$rndkey0);
1007 &movups (&QWP(0x20,$out),$inout2);
1008 &movups (&QWP(0x30,$out),$inout3);
1009
1010&set_label("ctr32_ret");
1011 &mov ("esp",&DWP(80,"esp"));
1012&function_end("aesni_ctr32_encrypt_blocks");
1013
######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#				const AES_KEY *key1, const AES_KEY *key2,
#				const unsigned char iv[16]);
#
1019{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1020
1021&function_begin("aesni_xts_encrypt");
1022 &mov ($key,&wparam(4)); # key2
1023 &mov ($inp,&wparam(5)); # clear-text tweak
1024
1025 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1026 &movups ($inout0,&QWP(0,$inp));
1027 if ($inline)
1028 { &aesni_inline_generate1("enc"); }
1029 else
1030 { &call ("_aesni_encrypt1"); }
1031
1032 &mov ($inp,&wparam(0));
1033 &mov ($out,&wparam(1));
1034 &mov ($len,&wparam(2));
1035 &mov ($key,&wparam(3)); # key1
1036
1037 &mov ($key_,"esp");
1038 &sub ("esp",16*7+8);
1039 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1040 &and ("esp",-16); # align stack
1041
1042 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1043 &mov (&DWP(16*6+4,"esp"),0);
1044 &mov (&DWP(16*6+8,"esp"),1);
1045 &mov (&DWP(16*6+12,"esp"),0);
1046 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1047 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1048
1049 &movdqa ($tweak,$inout0);
1050 &pxor ($twtmp,$twtmp);
1051 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1052 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1053
1054 &and ($len,-16);
1055 &mov ($key_,$key); # backup $key
1056 &mov ($rounds_,$rounds); # backup $rounds
1057 &sub ($len,16*6);
1058 &jc (&label("xts_enc_short"));
1059
1060 &shr ($rounds,1);
1061 &mov ($rounds_,$rounds);
1062 &jmp (&label("xts_enc_loop6"));
1063
1064&set_label("xts_enc_loop6",16);
1065 for ($i=0;$i<4;$i++) {
1066 &pshufd ($twres,$twtmp,0x13);
1067 &pxor ($twtmp,$twtmp);
1068 &movdqa (&QWP(16*$i,"esp"),$tweak);
1069 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1070 &pand ($twres,$twmask); # isolate carry and residue
1071 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1072 &pxor ($tweak,$twres);
1073 }
1074 &pshufd ($inout5,$twtmp,0x13);
1075 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1076 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1077 &$movekey ($rndkey0,&QWP(0,$key_));
1078 &pand ($inout5,$twmask); # isolate carry and residue
1079 &movups ($inout0,&QWP(0,$inp)); # load input
1080 &pxor ($inout5,$tweak);
1081
1082 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1083 &movdqu ($inout1,&QWP(16*1,$inp));
1084 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1085 &movdqu ($inout2,&QWP(16*2,$inp));
1086 &pxor ($inout1,$rndkey0);
1087 &movdqu ($inout3,&QWP(16*3,$inp));
1088 &pxor ($inout2,$rndkey0);
1089 &movdqu ($inout4,&QWP(16*4,$inp));
1090 &pxor ($inout3,$rndkey0);
1091 &movdqu ($rndkey1,&QWP(16*5,$inp));
1092 &pxor ($inout4,$rndkey0);
1093 &lea ($inp,&DWP(16*6,$inp));
1094 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1095 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1096 &pxor ($inout5,$rndkey1);
1097
1098 &$movekey ($rndkey1,&QWP(16,$key_));
1099 &lea ($key,&DWP(32,$key_));
1100 &pxor ($inout1,&QWP(16*1,"esp"));
1101 &aesenc ($inout0,$rndkey1);
1102 &pxor ($inout2,&QWP(16*2,"esp"));
1103 &aesenc ($inout1,$rndkey1);
1104 &pxor ($inout3,&QWP(16*3,"esp"));
1105 &dec ($rounds);
1106 &aesenc ($inout2,$rndkey1);
1107 &pxor ($inout4,&QWP(16*4,"esp"));
1108 &aesenc ($inout3,$rndkey1);
1109 &pxor ($inout5,$rndkey0);
1110 &aesenc ($inout4,$rndkey1);
1111 &$movekey ($rndkey0,&QWP(0,$key));
1112 &aesenc ($inout5,$rndkey1);
1113 &call (&label("_aesni_encrypt6_enter"));
1114
1115 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1116 &pxor ($twtmp,$twtmp);
1117 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1118 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1119 &xorps ($inout1,&QWP(16*1,"esp"));
1120 &movups (&QWP(16*0,$out),$inout0); # write output
1121 &xorps ($inout2,&QWP(16*2,"esp"));
1122 &movups (&QWP(16*1,$out),$inout1);
1123 &xorps ($inout3,&QWP(16*3,"esp"));
1124 &movups (&QWP(16*2,$out),$inout2);
1125 &xorps ($inout4,&QWP(16*4,"esp"));
1126 &movups (&QWP(16*3,$out),$inout3);
1127 &xorps ($inout5,$tweak);
1128 &movups (&QWP(16*4,$out),$inout4);
1129 &pshufd ($twres,$twtmp,0x13);
1130 &movups (&QWP(16*5,$out),$inout5);
1131 &lea ($out,&DWP(16*6,$out));
1132 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1133
1134 &pxor ($twtmp,$twtmp);
1135 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1136 &pand ($twres,$twmask); # isolate carry and residue
1137 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1138 &mov ($rounds,$rounds_); # restore $rounds
1139 &pxor ($tweak,$twres);
1140
1141 &sub ($len,16*6);
1142 &jnc (&label("xts_enc_loop6"));
1143
1144 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
1145 &mov ($key,$key_); # restore $key
1146 &mov ($rounds_,$rounds);
1147
1148&set_label("xts_enc_short");
1149 &add ($len,16*6);
1150 &jz (&label("xts_enc_done6x"));
1151
1152 &movdqa ($inout3,$tweak); # put aside previous tweak
1153 &cmp ($len,0x20);
1154 &jb (&label("xts_enc_one"));
1155
1156 &pshufd ($twres,$twtmp,0x13);
1157 &pxor ($twtmp,$twtmp);
1158 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1159 &pand ($twres,$twmask); # isolate carry and residue
1160 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1161 &pxor ($tweak,$twres);
1162 &je (&label("xts_enc_two"));
1163
1164 &pshufd ($twres,$twtmp,0x13);
1165 &pxor ($twtmp,$twtmp);
1166 &movdqa ($inout4,$tweak); # put aside previous tweak
1167 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1168 &pand ($twres,$twmask); # isolate carry and residue
1169 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1170 &pxor ($tweak,$twres);
1171 &cmp ($len,0x40);
1172 &jb (&label("xts_enc_three"));
1173
1174 &pshufd ($twres,$twtmp,0x13);
1175 &pxor ($twtmp,$twtmp);
1176 &movdqa ($inout5,$tweak); # put aside previous tweak
1177 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1178 &pand ($twres,$twmask); # isolate carry and residue
1179 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1180 &pxor ($tweak,$twres);
1181 &movdqa (&QWP(16*0,"esp"),$inout3);
1182 &movdqa (&QWP(16*1,"esp"),$inout4);
1183 &je (&label("xts_enc_four"));
1184
1185 &movdqa (&QWP(16*2,"esp"),$inout5);
1186 &pshufd ($inout5,$twtmp,0x13);
1187 &movdqa (&QWP(16*3,"esp"),$tweak);
1188 &paddq ($tweak,$tweak); # &psllq($inout0,1);
1189 &pand ($inout5,$twmask); # isolate carry and residue
1190 &pxor ($inout5,$tweak);
1191
1192 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1193 &movdqu ($inout1,&QWP(16*1,$inp));
1194 &movdqu ($inout2,&QWP(16*2,$inp));
1195 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1196 &movdqu ($inout3,&QWP(16*3,$inp));
1197 &pxor ($inout1,&QWP(16*1,"esp"));
1198 &movdqu ($inout4,&QWP(16*4,$inp));
1199 &pxor ($inout2,&QWP(16*2,"esp"));
1200 &lea ($inp,&DWP(16*5,$inp));
1201 &pxor ($inout3,&QWP(16*3,"esp"));
1202 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1203 &pxor ($inout4,$inout5);
1204
1205 &call ("_aesni_encrypt6");
1206
1207 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1208 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1209 &xorps ($inout1,&QWP(16*1,"esp"));
1210 &xorps ($inout2,&QWP(16*2,"esp"));
1211 &movups (&QWP(16*0,$out),$inout0); # write output
1212 &xorps ($inout3,&QWP(16*3,"esp"));
1213 &movups (&QWP(16*1,$out),$inout1);
1214 &xorps ($inout4,$tweak);
1215 &movups (&QWP(16*2,$out),$inout2);
1216 &movups (&QWP(16*3,$out),$inout3);
1217 &movups (&QWP(16*4,$out),$inout4);
1218 &lea ($out,&DWP(16*5,$out));
1219 &jmp (&label("xts_enc_done"));
1220
1221&set_label("xts_enc_one",16);
1222 &movups ($inout0,&QWP(16*0,$inp)); # load input
1223 &lea ($inp,&DWP(16*1,$inp));
1224 &xorps ($inout0,$inout3); # input^=tweak
1225 if ($inline)
1226 { &aesni_inline_generate1("enc"); }
1227 else
1228 { &call ("_aesni_encrypt1"); }
1229 &xorps ($inout0,$inout3); # output^=tweak
1230 &movups (&QWP(16*0,$out),$inout0); # write output
1231 &lea ($out,&DWP(16*1,$out));
1232
1233 &movdqa ($tweak,$inout3); # last tweak
1234 &jmp (&label("xts_enc_done"));
1235
1236&set_label("xts_enc_two",16);
1237 &movaps ($inout4,$tweak); # put aside last tweak
1238
1239 &movups ($inout0,&QWP(16*0,$inp)); # load input
1240 &movups ($inout1,&QWP(16*1,$inp));
1241 &lea ($inp,&DWP(16*2,$inp));
1242 &xorps ($inout0,$inout3); # input^=tweak
1243 &xorps ($inout1,$inout4);
1244 &xorps ($inout2,$inout2);
1245
1246 &call ("_aesni_encrypt3");
1247
1248 &xorps ($inout0,$inout3); # output^=tweak
1249 &xorps ($inout1,$inout4);
1250 &movups (&QWP(16*0,$out),$inout0); # write output
1251 &movups (&QWP(16*1,$out),$inout1);
1252 &lea ($out,&DWP(16*2,$out));
1253
1254 &movdqa ($tweak,$inout4); # last tweak
1255 &jmp (&label("xts_enc_done"));
1256
1257&set_label("xts_enc_three",16);
1258 &movaps ($inout5,$tweak); # put aside last tweak
1259 &movups ($inout0,&QWP(16*0,$inp)); # load input
1260 &movups ($inout1,&QWP(16*1,$inp));
1261 &movups ($inout2,&QWP(16*2,$inp));
1262 &lea ($inp,&DWP(16*3,$inp));
1263 &xorps ($inout0,$inout3); # input^=tweak
1264 &xorps ($inout1,$inout4);
1265 &xorps ($inout2,$inout5);
1266
1267 &call ("_aesni_encrypt3");
1268
1269 &xorps ($inout0,$inout3); # output^=tweak
1270 &xorps ($inout1,$inout4);
1271 &xorps ($inout2,$inout5);
1272 &movups (&QWP(16*0,$out),$inout0); # write output
1273 &movups (&QWP(16*1,$out),$inout1);
1274 &movups (&QWP(16*2,$out),$inout2);
1275 &lea ($out,&DWP(16*3,$out));
1276
1277 &movdqa ($tweak,$inout5); # last tweak
1278 &jmp (&label("xts_enc_done"));
1279
1280&set_label("xts_enc_four",16);
1281 &movaps ($inout4,$tweak); # put aside last tweak
1282
1283 &movups ($inout0,&QWP(16*0,$inp)); # load input
1284 &movups ($inout1,&QWP(16*1,$inp));
1285 &movups ($inout2,&QWP(16*2,$inp));
1286 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1287 &movups ($inout3,&QWP(16*3,$inp));
1288 &lea ($inp,&DWP(16*4,$inp));
1289 &xorps ($inout1,&QWP(16*1,"esp"));
1290 &xorps ($inout2,$inout5);
1291 &xorps ($inout3,$inout4);
1292
1293 &call ("_aesni_encrypt4");
1294
1295 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1296 &xorps ($inout1,&QWP(16*1,"esp"));
1297 &xorps ($inout2,$inout5);
1298 &movups (&QWP(16*0,$out),$inout0); # write output
1299 &xorps ($inout3,$inout4);
1300 &movups (&QWP(16*1,$out),$inout1);
1301 &movups (&QWP(16*2,$out),$inout2);
1302 &movups (&QWP(16*3,$out),$inout3);
1303 &lea ($out,&DWP(16*4,$out));
1304
1305 &movdqa ($tweak,$inout4); # last tweak
1306 &jmp (&label("xts_enc_done"));
1307
1308&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1309 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1310 &and ($len,15);
1311 &jz (&label("xts_enc_ret"));
1312 &movdqa ($inout3,$tweak);
1313 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1314 &jmp (&label("xts_enc_steal"));
1315
1316&set_label("xts_enc_done",16);
1317 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1318 &pxor ($twtmp,$twtmp);
1319 &and ($len,15);
1320 &jz (&label("xts_enc_ret"));
1321
1322 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1323 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1324 &pshufd ($inout3,$twtmp,0x13);
1325 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1326 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1327 &pxor ($inout3,$tweak);
1328
1329&set_label("xts_enc_steal");
1330 &movz ($rounds,&BP(0,$inp));
1331 &movz ($key,&BP(-16,$out));
1332 &lea ($inp,&DWP(1,$inp));
1333 &mov (&BP(-16,$out),&LB($rounds));
1334 &mov (&BP(0,$out),&LB($key));
1335 &lea ($out,&DWP(1,$out));
1336 &sub ($len,1);
1337 &jnz (&label("xts_enc_steal"));
1338
1339 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1340 &mov ($key,$key_); # restore $key
1341 &mov ($rounds,$rounds_); # restore $rounds
1342
1343 &movups ($inout0,&QWP(-16,$out)); # load input
1344 &xorps ($inout0,$inout3); # input^=tweak
1345 if ($inline)
1346 { &aesni_inline_generate1("enc"); }
1347 else
1348 { &call ("_aesni_encrypt1"); }
1349 &xorps ($inout0,$inout3); # output^=tweak
1350 &movups (&QWP(-16,$out),$inout0); # write output
1351
1352&set_label("xts_enc_ret");
1353 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1354&function_end("aesni_xts_encrypt");
1355
1356&function_begin("aesni_xts_decrypt");
1357 &mov ($key,&wparam(4)); # key2
1358 &mov ($inp,&wparam(5)); # clear-text tweak
1359
1360 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1361 &movups ($inout0,&QWP(0,$inp));
1362 if ($inline)
1363 { &aesni_inline_generate1("enc"); }
1364 else
1365 { &call ("_aesni_encrypt1"); }
1366
1367 &mov ($inp,&wparam(0));
1368 &mov ($out,&wparam(1));
1369 &mov ($len,&wparam(2));
1370 &mov ($key,&wparam(3)); # key1
1371
1372 &mov ($key_,"esp");
1373 &sub ("esp",16*7+8);
1374 &and ("esp",-16); # align stack
1375
1376 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1377 &test ($len,15);
1378 &setnz (&LB($rounds_));
1379 &shl ($rounds_,4);
1380 &sub ($len,$rounds_);
1381
1382 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1383 &mov (&DWP(16*6+4,"esp"),0);
1384 &mov (&DWP(16*6+8,"esp"),1);
1385 &mov (&DWP(16*6+12,"esp"),0);
1386 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1387 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1388
1389 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1390 &mov ($key_,$key); # backup $key
1391 &mov ($rounds_,$rounds); # backup $rounds
1392
1393 &movdqa ($tweak,$inout0);
1394 &pxor ($twtmp,$twtmp);
1395 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1396 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1397
1398 &and ($len,-16);
1399 &sub ($len,16*6);
1400 &jc (&label("xts_dec_short"));
1401
1402 &shr ($rounds,1);
1403 &mov ($rounds_,$rounds);
1404 &jmp (&label("xts_dec_loop6"));
1405
1406&set_label("xts_dec_loop6",16);
1407 for ($i=0;$i<4;$i++) {
1408 &pshufd ($twres,$twtmp,0x13);
1409 &pxor ($twtmp,$twtmp);
1410 &movdqa (&QWP(16*$i,"esp"),$tweak);
1411 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1412 &pand ($twres,$twmask); # isolate carry and residue
1413 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1414 &pxor ($tweak,$twres);
1415 }
1416 &pshufd ($inout5,$twtmp,0x13);
1417 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1418 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1419 &$movekey ($rndkey0,&QWP(0,$key_));
1420 &pand ($inout5,$twmask); # isolate carry and residue
1421 &movups ($inout0,&QWP(0,$inp)); # load input
1422 &pxor ($inout5,$tweak);
1423
1424 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1425 &movdqu ($inout1,&QWP(16*1,$inp));
1426 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1427 &movdqu ($inout2,&QWP(16*2,$inp));
1428 &pxor ($inout1,$rndkey0);
1429 &movdqu ($inout3,&QWP(16*3,$inp));
1430 &pxor ($inout2,$rndkey0);
1431 &movdqu ($inout4,&QWP(16*4,$inp));
1432 &pxor ($inout3,$rndkey0);
1433 &movdqu ($rndkey1,&QWP(16*5,$inp));
1434 &pxor ($inout4,$rndkey0);
1435 &lea ($inp,&DWP(16*6,$inp));
1436 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1437 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1438 &pxor ($inout5,$rndkey1);
1439
1440 &$movekey ($rndkey1,&QWP(16,$key_));
1441 &lea ($key,&DWP(32,$key_));
1442 &pxor ($inout1,&QWP(16*1,"esp"));
1443 &aesdec ($inout0,$rndkey1);
1444 &pxor ($inout2,&QWP(16*2,"esp"));
1445 &aesdec ($inout1,$rndkey1);
1446 &pxor ($inout3,&QWP(16*3,"esp"));
1447 &dec ($rounds);
1448 &aesdec ($inout2,$rndkey1);
1449 &pxor ($inout4,&QWP(16*4,"esp"));
1450 &aesdec ($inout3,$rndkey1);
1451 &pxor ($inout5,$rndkey0);
1452 &aesdec ($inout4,$rndkey1);
1453 &$movekey ($rndkey0,&QWP(0,$key));
1454 &aesdec ($inout5,$rndkey1);
1455 &call (&label("_aesni_decrypt6_enter"));
1456
1457 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1458 &pxor ($twtmp,$twtmp);
1459 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1460 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1461 &xorps ($inout1,&QWP(16*1,"esp"));
1462 &movups (&QWP(16*0,$out),$inout0); # write output
1463 &xorps ($inout2,&QWP(16*2,"esp"));
1464 &movups (&QWP(16*1,$out),$inout1);
1465 &xorps ($inout3,&QWP(16*3,"esp"));
1466 &movups (&QWP(16*2,$out),$inout2);
1467 &xorps ($inout4,&QWP(16*4,"esp"));
1468 &movups (&QWP(16*3,$out),$inout3);
1469 &xorps ($inout5,$tweak);
1470 &movups (&QWP(16*4,$out),$inout4);
1471 &pshufd ($twres,$twtmp,0x13);
1472 &movups (&QWP(16*5,$out),$inout5);
1473 &lea ($out,&DWP(16*6,$out));
1474 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1475
1476 &pxor ($twtmp,$twtmp);
1477 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1478 &pand ($twres,$twmask); # isolate carry and residue
1479 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1480 &mov ($rounds,$rounds_); # restore $rounds
1481 &pxor ($tweak,$twres);
1482
1483 &sub ($len,16*6);
1484 &jnc (&label("xts_dec_loop6"));
1485
1486 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
1487 &mov ($key,$key_); # restore $key
1488 &mov ($rounds_,$rounds);
1489
1490&set_label("xts_dec_short");
1491 &add ($len,16*6);
1492 &jz (&label("xts_dec_done6x"));
1493
1494 &movdqa ($inout3,$tweak); # put aside previous tweak
1495 &cmp ($len,0x20);
1496 &jb (&label("xts_dec_one"));
1497
1498 &pshufd ($twres,$twtmp,0x13);
1499 &pxor ($twtmp,$twtmp);
1500 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1501 &pand ($twres,$twmask); # isolate carry and residue
1502 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1503 &pxor ($tweak,$twres);
1504 &je (&label("xts_dec_two"));
1505
1506 &pshufd ($twres,$twtmp,0x13);
1507 &pxor ($twtmp,$twtmp);
1508 &movdqa ($inout4,$tweak); # put aside previous tweak
1509 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1510 &pand ($twres,$twmask); # isolate carry and residue
1511 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1512 &pxor ($tweak,$twres);
1513 &cmp ($len,0x40);
1514 &jb (&label("xts_dec_three"));
1515
1516 &pshufd ($twres,$twtmp,0x13);
1517 &pxor ($twtmp,$twtmp);
1518 &movdqa ($inout5,$tweak); # put aside previous tweak
1519 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1520 &pand ($twres,$twmask); # isolate carry and residue
1521 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1522 &pxor ($tweak,$twres);
1523 &movdqa (&QWP(16*0,"esp"),$inout3);
1524 &movdqa (&QWP(16*1,"esp"),$inout4);
1525 &je (&label("xts_dec_four"));
1526
1527 &movdqa (&QWP(16*2,"esp"),$inout5);
1528 &pshufd ($inout5,$twtmp,0x13);
1529 &movdqa (&QWP(16*3,"esp"),$tweak);
1530 &paddq ($tweak,$tweak); # &psllq($inout0,1);
1531 &pand ($inout5,$twmask); # isolate carry and residue
1532 &pxor ($inout5,$tweak);
1533
1534 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1535 &movdqu ($inout1,&QWP(16*1,$inp));
1536 &movdqu ($inout2,&QWP(16*2,$inp));
1537 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1538 &movdqu ($inout3,&QWP(16*3,$inp));
1539 &pxor ($inout1,&QWP(16*1,"esp"));
1540 &movdqu ($inout4,&QWP(16*4,$inp));
1541 &pxor ($inout2,&QWP(16*2,"esp"));
1542 &lea ($inp,&DWP(16*5,$inp));
1543 &pxor ($inout3,&QWP(16*3,"esp"));
1544 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1545 &pxor ($inout4,$inout5);
1546
1547 &call ("_aesni_decrypt6");
1548
1549 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1550 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1551 &xorps ($inout1,&QWP(16*1,"esp"));
1552 &xorps ($inout2,&QWP(16*2,"esp"));
1553 &movups (&QWP(16*0,$out),$inout0); # write output
1554 &xorps ($inout3,&QWP(16*3,"esp"));
1555 &movups (&QWP(16*1,$out),$inout1);
1556 &xorps ($inout4,$tweak);
1557 &movups (&QWP(16*2,$out),$inout2);
1558 &movups (&QWP(16*3,$out),$inout3);
1559 &movups (&QWP(16*4,$out),$inout4);
1560 &lea ($out,&DWP(16*5,$out));
1561 &jmp (&label("xts_dec_done"));
1562
1563&set_label("xts_dec_one",16);
1564 &movups ($inout0,&QWP(16*0,$inp)); # load input
1565 &lea ($inp,&DWP(16*1,$inp));
1566 &xorps ($inout0,$inout3); # input^=tweak
1567 if ($inline)
1568 { &aesni_inline_generate1("dec"); }
1569 else
1570 { &call ("_aesni_decrypt1"); }
1571 &xorps ($inout0,$inout3); # output^=tweak
1572 &movups (&QWP(16*0,$out),$inout0); # write output
1573 &lea ($out,&DWP(16*1,$out));
1574
1575 &movdqa ($tweak,$inout3); # last tweak
1576 &jmp (&label("xts_dec_done"));
1577
1578&set_label("xts_dec_two",16);
1579 &movaps ($inout4,$tweak); # put aside last tweak
1580
1581 &movups ($inout0,&QWP(16*0,$inp)); # load input
1582 &movups ($inout1,&QWP(16*1,$inp));
1583 &lea ($inp,&DWP(16*2,$inp));
1584 &xorps ($inout0,$inout3); # input^=tweak
1585 &xorps ($inout1,$inout4);
1586
1587 &call ("_aesni_decrypt3");
1588
1589 &xorps ($inout0,$inout3); # output^=tweak
1590 &xorps ($inout1,$inout4);
1591 &movups (&QWP(16*0,$out),$inout0); # write output
1592 &movups (&QWP(16*1,$out),$inout1);
1593 &lea ($out,&DWP(16*2,$out));
1594
1595 &movdqa ($tweak,$inout4); # last tweak
1596 &jmp (&label("xts_dec_done"));
1597
1598&set_label("xts_dec_three",16);
1599 &movaps ($inout5,$tweak); # put aside last tweak
1600 &movups ($inout0,&QWP(16*0,$inp)); # load input
1601 &movups ($inout1,&QWP(16*1,$inp));
1602 &movups ($inout2,&QWP(16*2,$inp));
1603 &lea ($inp,&DWP(16*3,$inp));
1604 &xorps ($inout0,$inout3); # input^=tweak
1605 &xorps ($inout1,$inout4);
1606 &xorps ($inout2,$inout5);
1607
1608 &call ("_aesni_decrypt3");
1609
1610 &xorps ($inout0,$inout3); # output^=tweak
1611 &xorps ($inout1,$inout4);
1612 &xorps ($inout2,$inout5);
1613 &movups (&QWP(16*0,$out),$inout0); # write output
1614 &movups (&QWP(16*1,$out),$inout1);
1615 &movups (&QWP(16*2,$out),$inout2);
1616 &lea ($out,&DWP(16*3,$out));
1617
1618 &movdqa ($tweak,$inout5); # last tweak
1619 &jmp (&label("xts_dec_done"));
1620
1621&set_label("xts_dec_four",16);
1622 &movaps ($inout4,$tweak); # put aside last tweak
1623
1624 &movups ($inout0,&QWP(16*0,$inp)); # load input
1625 &movups ($inout1,&QWP(16*1,$inp));
1626 &movups ($inout2,&QWP(16*2,$inp));
1627 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1628 &movups ($inout3,&QWP(16*3,$inp));
1629 &lea ($inp,&DWP(16*4,$inp));
1630 &xorps ($inout1,&QWP(16*1,"esp"));
1631 &xorps ($inout2,$inout5);
1632 &xorps ($inout3,$inout4);
1633
1634 &call ("_aesni_decrypt4");
1635
1636 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1637 &xorps ($inout1,&QWP(16*1,"esp"));
1638 &xorps ($inout2,$inout5);
1639 &movups (&QWP(16*0,$out),$inout0); # write output
1640 &xorps ($inout3,$inout4);
1641 &movups (&QWP(16*1,$out),$inout1);
1642 &movups (&QWP(16*2,$out),$inout2);
1643 &movups (&QWP(16*3,$out),$inout3);
1644 &lea ($out,&DWP(16*4,$out));
1645
1646 &movdqa ($tweak,$inout4); # last tweak
1647 &jmp (&label("xts_dec_done"));
1648
1649&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1650 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1651 &and ($len,15);
1652 &jz (&label("xts_dec_ret"));
1653 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1654 &jmp (&label("xts_dec_only_one_more"));
1655
1656&set_label("xts_dec_done",16);
1657 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1658 &pxor ($twtmp,$twtmp);
1659 &and ($len,15);
1660 &jz (&label("xts_dec_ret"));
1661
1662 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1663 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1664 &pshufd ($twres,$twtmp,0x13);
1665 &pxor ($twtmp,$twtmp);
1666 &movdqa ($twmask,&QWP(16*6,"esp"));
1667 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1668 &pand ($twres,$twmask); # isolate carry and residue
1669 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1670 &pxor ($tweak,$twres);
1671
1672&set_label("xts_dec_only_one_more");
1673 &pshufd ($inout3,$twtmp,0x13);
1674 &movdqa ($inout4,$tweak); # put aside previous tweak
1675 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1676 &pand ($inout3,$twmask); # isolate carry and residue
1677 &pxor ($inout3,$tweak);
1678
1679 &mov ($key,$key_); # restore $key
1680 &mov ($rounds,$rounds_); # restore $rounds
1681
1682 &movups ($inout0,&QWP(0,$inp)); # load input
1683 &xorps ($inout0,$inout3); # input^=tweak
1684 if ($inline)
1685 { &aesni_inline_generate1("dec"); }
1686 else
1687 { &call ("_aesni_decrypt1"); }
1688 &xorps ($inout0,$inout3); # output^=tweak
1689 &movups (&QWP(0,$out),$inout0); # write output
1690
1691&set_label("xts_dec_steal");
1692 &movz ($rounds,&BP(16,$inp));
1693 &movz ($key,&BP(0,$out));
1694 &lea ($inp,&DWP(1,$inp));
1695 &mov (&BP(0,$out),&LB($rounds));
1696 &mov (&BP(16,$out),&LB($key));
1697 &lea ($out,&DWP(1,$out));
1698 &sub ($len,1);
1699 &jnz (&label("xts_dec_steal"));
1700
1701 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1702 &mov ($key,$key_); # restore $key
1703 &mov ($rounds,$rounds_); # restore $rounds
1704
1705 &movups ($inout0,&QWP(0,$out)); # load input
1706 &xorps ($inout0,$inout4); # input^=tweak
1707 if ($inline)
1708 { &aesni_inline_generate1("dec"); }
1709 else
1710 { &call ("_aesni_decrypt1"); }
1711 &xorps ($inout0,$inout4); # output^=tweak
1712 &movups (&QWP(0,$out),$inout0); # write output
1713
1714&set_label("xts_dec_ret");
1715 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1716&function_end("aesni_xts_decrypt");
1717}
1718}
1719
######################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#			    size_t length, const AES_KEY *key,
#			    unsigned char *ivp,const int enc);
1724&function_begin("${PREFIX}_cbc_encrypt");
1725 &mov ($inp,&wparam(0));
1726 &mov ($rounds_,"esp");
1727 &mov ($out,&wparam(1));
1728 &sub ($rounds_,24);
1729 &mov ($len,&wparam(2));
1730 &and ($rounds_,-16);
1731 &mov ($key,&wparam(3));
1732 &mov ($key_,&wparam(4));
1733 &test ($len,$len);
1734 &jz (&label("cbc_abort"));
1735
1736 &cmp (&wparam(5),0);
1737 &xchg ($rounds_,"esp"); # alloca
1738 &movups ($ivec,&QWP(0,$key_)); # load IV
1739 &mov ($rounds,&DWP(240,$key));
1740 &mov ($key_,$key); # backup $key
1741 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
1742 &mov ($rounds_,$rounds); # backup $rounds
1743 &je (&label("cbc_decrypt"));
1744
1745 &movaps ($inout0,$ivec);
1746 &cmp ($len,16);
1747 &jb (&label("cbc_enc_tail"));
1748 &sub ($len,16);
1749 &jmp (&label("cbc_enc_loop"));
1750
1751&set_label("cbc_enc_loop",16);
1752 &movups ($ivec,&QWP(0,$inp)); # input actually
1753 &lea ($inp,&DWP(16,$inp));
1754 if ($inline)
1755 { &aesni_inline_generate1("enc",$inout0,$ivec); }
1756 else
1757 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
1758 &mov ($rounds,$rounds_); # restore $rounds
1759 &mov ($key,$key_); # restore $key
1760 &movups (&QWP(0,$out),$inout0); # store output
1761 &lea ($out,&DWP(16,$out));
1762 &sub ($len,16);
1763 &jnc (&label("cbc_enc_loop"));
1764 &add ($len,16);
1765 &jnz (&label("cbc_enc_tail"));
1766 &movaps ($ivec,$inout0);
1767 &jmp (&label("cbc_ret"));
1768
1769&set_label("cbc_enc_tail");
1770 &mov ("ecx",$len); # zaps $rounds
1771 &data_word(0xA4F3F689); # rep movsb
1772 &mov ("ecx",16); # zero tail
1773 &sub ("ecx",$len);
1774 &xor ("eax","eax"); # zaps $len
1775 &data_word(0xAAF3F689); # rep stosb
1776 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
1777 &mov ($rounds,$rounds_); # restore $rounds
1778 &mov ($inp,$out); # $inp and $out are the same
1779 &mov ($key,$key_); # restore $key
1780 &jmp (&label("cbc_enc_loop"));
1781######################################################################
1782&set_label("cbc_decrypt",16);
1783 &cmp ($len,0x50);
1784 &jbe (&label("cbc_dec_tail"));
1785 &movaps (&QWP(0,"esp"),$ivec); # save IV
1786 &sub ($len,0x50);
1787 &jmp (&label("cbc_dec_loop6_enter"));
1788
1789&set_label("cbc_dec_loop6",16);
1790 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
1791 &movups (&QWP(0,$out),$inout5);
1792 &lea ($out,&DWP(0x10,$out));
1793&set_label("cbc_dec_loop6_enter");
1794 &movdqu ($inout0,&QWP(0,$inp));
1795 &movdqu ($inout1,&QWP(0x10,$inp));
1796 &movdqu ($inout2,&QWP(0x20,$inp));
1797 &movdqu ($inout3,&QWP(0x30,$inp));
1798 &movdqu ($inout4,&QWP(0x40,$inp));
1799 &movdqu ($inout5,&QWP(0x50,$inp));
1800
1801 &call ("_aesni_decrypt6");
1802
1803 &movups ($rndkey1,&QWP(0,$inp));
1804 &movups ($rndkey0,&QWP(0x10,$inp));
1805 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
1806 &xorps ($inout1,$rndkey1);
1807 &movups ($rndkey1,&QWP(0x20,$inp));
1808 &xorps ($inout2,$rndkey0);
1809 &movups ($rndkey0,&QWP(0x30,$inp));
1810 &xorps ($inout3,$rndkey1);
1811 &movups ($rndkey1,&QWP(0x40,$inp));
1812 &xorps ($inout4,$rndkey0);
1813 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
1814 &xorps ($inout5,$rndkey1);
1815 &movups (&QWP(0,$out),$inout0);
1816 &movups (&QWP(0x10,$out),$inout1);
1817 &lea ($inp,&DWP(0x60,$inp));
1818 &movups (&QWP(0x20,$out),$inout2);
1819 &mov ($rounds,$rounds_) # restore $rounds
1820 &movups (&QWP(0x30,$out),$inout3);
1821 &mov ($key,$key_); # restore $key
1822 &movups (&QWP(0x40,$out),$inout4);
1823 &lea ($out,&DWP(0x50,$out));
1824 &sub ($len,0x60);
1825 &ja (&label("cbc_dec_loop6"));
1826
1827 &movaps ($inout0,$inout5);
1828 &movaps ($ivec,$rndkey0);
1829 &add ($len,0x50);
1830 &jle (&label("cbc_dec_tail_collected"));
1831 &movups (&QWP(0,$out),$inout0);
1832 &lea ($out,&DWP(0x10,$out));
1833&set_label("cbc_dec_tail");
1834 &movups ($inout0,&QWP(0,$inp));
1835 &movaps ($in0,$inout0);
1836 &cmp ($len,0x10);
1837 &jbe (&label("cbc_dec_one"));
1838
1839 &movups ($inout1,&QWP(0x10,$inp));
1840 &movaps ($in1,$inout1);
1841 &cmp ($len,0x20);
1842 &jbe (&label("cbc_dec_two"));
1843
1844 &movups ($inout2,&QWP(0x20,$inp));
1845 &cmp ($len,0x30);
1846 &jbe (&label("cbc_dec_three"));
1847
1848 &movups ($inout3,&QWP(0x30,$inp));
1849 &cmp ($len,0x40);
1850 &jbe (&label("cbc_dec_four"));
1851
1852 &movups ($inout4,&QWP(0x40,$inp));
1853 &movaps (&QWP(0,"esp"),$ivec); # save IV
1854 &movups ($inout0,&QWP(0,$inp));
1855 &xorps ($inout5,$inout5);
1856 &call ("_aesni_decrypt6");
1857 &movups ($rndkey1,&QWP(0,$inp));
1858 &movups ($rndkey0,&QWP(0x10,$inp));
1859 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
1860 &xorps ($inout1,$rndkey1);
1861 &movups ($rndkey1,&QWP(0x20,$inp));
1862 &xorps ($inout2,$rndkey0);
1863 &movups ($rndkey0,&QWP(0x30,$inp));
1864 &xorps ($inout3,$rndkey1);
1865 &movups ($ivec,&QWP(0x40,$inp)); # IV
1866 &xorps ($inout4,$rndkey0);
1867 &movups (&QWP(0,$out),$inout0);
1868 &movups (&QWP(0x10,$out),$inout1);
1869 &movups (&QWP(0x20,$out),$inout2);
1870 &movups (&QWP(0x30,$out),$inout3);
1871 &lea ($out,&DWP(0x40,$out));
1872 &movaps ($inout0,$inout4);
1873 &sub ($len,0x50);
1874 &jmp (&label("cbc_dec_tail_collected"));
1875
1876&set_label("cbc_dec_one",16);
1877 if ($inline)
1878 { &aesni_inline_generate1("dec"); }
1879 else
1880 { &call ("_aesni_decrypt1"); }
1881 &xorps ($inout0,$ivec);
1882 &movaps ($ivec,$in0);
1883 &sub ($len,0x10);
1884 &jmp (&label("cbc_dec_tail_collected"));
1885
1886&set_label("cbc_dec_two",16);
1887 &xorps ($inout2,$inout2);
1888 &call ("_aesni_decrypt3");
1889 &xorps ($inout0,$ivec);
1890 &xorps ($inout1,$in0);
1891 &movups (&QWP(0,$out),$inout0);
1892 &movaps ($inout0,$inout1);
1893 &lea ($out,&DWP(0x10,$out));
1894 &movaps ($ivec,$in1);
1895 &sub ($len,0x20);
1896 &jmp (&label("cbc_dec_tail_collected"));
1897
1898&set_label("cbc_dec_three",16);
1899 &call ("_aesni_decrypt3");
1900 &xorps ($inout0,$ivec);
1901 &xorps ($inout1,$in0);
1902 &xorps ($inout2,$in1);
1903 &movups (&QWP(0,$out),$inout0);
1904 &movaps ($inout0,$inout2);
1905 &movups (&QWP(0x10,$out),$inout1);
1906 &lea ($out,&DWP(0x20,$out));
1907 &movups ($ivec,&QWP(0x20,$inp));
1908 &sub ($len,0x30);
1909 &jmp (&label("cbc_dec_tail_collected"));
1910
1911&set_label("cbc_dec_four",16);
1912 &call ("_aesni_decrypt4");
1913 &movups ($rndkey1,&QWP(0x10,$inp));
1914 &movups ($rndkey0,&QWP(0x20,$inp));
1915 &xorps ($inout0,$ivec);
1916 &movups ($ivec,&QWP(0x30,$inp));
1917 &xorps ($inout1,$in0);
1918 &movups (&QWP(0,$out),$inout0);
1919 &xorps ($inout2,$rndkey1);
1920 &movups (&QWP(0x10,$out),$inout1);
1921 &xorps ($inout3,$rndkey0);
1922 &movups (&QWP(0x20,$out),$inout2);
1923 &lea ($out,&DWP(0x30,$out));
1924 &movaps ($inout0,$inout3);
1925 &sub ($len,0x40);
1926
1927&set_label("cbc_dec_tail_collected");
1928 &and ($len,15);
1929 &jnz (&label("cbc_dec_tail_partial"));
1930 &movups (&QWP(0,$out),$inout0);
1931 &jmp (&label("cbc_ret"));
1932
1933&set_label("cbc_dec_tail_partial",16);
1934 &movaps (&QWP(0,"esp"),$inout0);
1935 &mov ("ecx",16);
1936 &mov ($inp,"esp");
1937 &sub ("ecx",$len);
1938 &data_word(0xA4F3F689); # rep movsb
1939
1940&set_label("cbc_ret");
1941 &mov ("esp",&DWP(16,"esp")); # pull original %esp
1942 &mov ($key_,&wparam(4));
1943 &movups (&QWP(0,$key_),$ivec); # output IV
1944&set_label("cbc_abort");
1945&function_end("${PREFIX}_cbc_encrypt");
1946
1947######################################################################
1948# Mechanical port from aesni-x86_64.pl.
1949#
1950# _aesni_set_encrypt_key is private interface,
1951# input:
1952# "eax" const unsigned char *userKey
1953# $rounds int bits
1954# $key AES_KEY *key
1955# output:
1956# "eax" return code
 1957# $rounds rounds
1958
# _aesni_set_encrypt_key: private key-schedule generator (interface
# documented in the comment block above).  Expands the user key at "eax"
# into the AES_KEY at $key for 128-, 192- or 256-bit keys using
# AESKEYGENASSIST.  Returns in "eax": 0 on success, -1 on a NULL
# pointer, -2 on an unsupported key size.  $rounds ends up as rounds-1.
&function_begin_B("_aesni_set_encrypt_key");
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&lea	($key,&DWP(16,$key));
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	&mov	($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);	# round 1
	&call	(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x02);	# round 2
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);	# round 3
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);	# round 4
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);	# round 5
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);	# round 6
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);	# round 7
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);	# round 8
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);	# round 9
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);	# round 10
	&call	(&label("key_128"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov	(&DWP(80,$key),$rounds);
	&xor	("eax","eax");
	&ret();

&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea	($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps	("xmm4","xmm0",0b00010000);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm4","xmm0",0b10001100);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm1","xmm1",0b11111111);	# critical path
	&xorps	("xmm0","xmm1");
	&ret();

&set_label("12rounds",16);
	&movq	("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&mov	($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);	# round 1,2
	&call	(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);	# round 2,3
	&call	(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);	# round 4,5
	&call	(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);	# round 5,6
	&call	(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);	# round 7,8
	&call	(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);	# round 8,9
	&call	(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);	# round 10,11
	&call	(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);	# round 11,12
	&call	(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov	(&DWP(48,$key),$rounds);
	&xor	("eax","eax");
	&ret();

&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea	($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps	("xmm5","xmm2");
&set_label("key_192b_warm");
	&shufps	("xmm4","xmm0",0b00010000);
	&movdqa	("xmm3","xmm2");
	&xorps	("xmm0","xmm4");
	&shufps	("xmm4","xmm0",0b10001100);
	&pslldq	("xmm3",4);
	&xorps	("xmm0","xmm4");
	&pshufd	("xmm1","xmm1",0b01010101);	# critical path
	&pxor	("xmm2","xmm3");
	&pxor	("xmm0","xmm1");
	&pshufd	("xmm3","xmm0",0b11111111);
	&pxor	("xmm2","xmm3");
	&ret();

&set_label("key_192b",16);
	&movaps	("xmm3","xmm0");
	&shufps	("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps	("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea	($key,&DWP(32,$key));
	&jmp	(&label("key_192b_warm"));

&set_label("14rounds",16);
	&movups	("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&mov	($rounds,13);
	&lea	($key,&DWP(16,$key));
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);	# round 2
	&call	(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);	# round 3
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);	# round 4
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);	# round 5
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);	# round 6
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);	# round 7
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);	# round 8
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);	# round 9
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);	# round 10
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);	# round 11
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);	# round 12
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);	# round 13
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);	# round 14
	&call	(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov	(&DWP(16,$key),$rounds);
	&xor	("eax","eax");
	&ret();

&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea	($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps	("xmm4","xmm0",0b00010000);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm4","xmm0",0b10001100);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm1","xmm1",0b11111111);	# critical path
	&xorps	("xmm0","xmm1");
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea	($key,&DWP(16,$key));

	&shufps	("xmm4","xmm2",0b00010000);
	&xorps	("xmm2","xmm4");
	&shufps	("xmm4","xmm2",0b10001100);
	&xorps	("xmm2","xmm4");
	&shufps	("xmm1","xmm1",0b10101010);	# critical path
	&xorps	("xmm2","xmm1");
	&ret();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&ret	();
&set_label("bad_keybits",4);
	&mov	("eax",-2);
	&ret	();
&function_end_B("_aesni_set_encrypt_key");
2136
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#				AES_KEY *key)
# Public wrapper: pulls the three stack arguments into the registers the
# private routine expects and forwards to _aesni_set_encrypt_key; the
# return code comes back in "eax" untouched.
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");
2146
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#				AES_KEY *key)
# Builds the encrypt schedule first, then converts it in place for the
# equivalent inverse cipher: round keys are reversed end-for-end and
# every inner round key is passed through aesimc (InvMixColumns).
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));
	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea	($key,&DWP(16,$key));
	&lea	("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc	("xmm0","xmm0");
	&aesimc	("xmm1","xmm1");
	&lea	($key,&DWP(16,$key));
	&lea	("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	&cmp	("eax",$key);
	&ja	(&label("dec_key_inverse"));

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc	("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&xor	("eax","eax");	# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");
# Trailer: embed the attribution string and flush the generated assembly.
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
deleted file mode 100644
index 499f3b3f42..0000000000
--- a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
+++ /dev/null
@@ -1,3068 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for Intel AES-NI extension. In
11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
13# details].
14#
15# Performance.
16#
17# Given aes(enc|dec) instructions' latency asymptotic performance for
18# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
19# processed with 128-bit key. And given their throughput asymptotic
20# performance for parallelizable modes is 1.25 cycles per byte. Being
21# asymptotic limit it's not something you commonly achieve in reality,
22# but how close does one get? Below are results collected for
 23# different modes and block sizes. Pairs of numbers are for en-/
24# decryption.
25#
26# 16-byte 64-byte 256-byte 1-KB 8-KB
27# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
28# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
29# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
30# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
31# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
32# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
33#
34# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
35# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
36# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
37# The results were collected with specially crafted speed.c benchmark
38# in order to compare them with results reported in "Intel Advanced
39# Encryption Standard (AES) New Instruction Set" White Paper Revision
40# 3.0 dated May 2010. All above results are consistently better. This
41# module also provides better performance for block sizes smaller than
42# 128 bytes in points *not* represented in the above table.
43#
44# Looking at the results for 8-KB buffer.
45#
46# CFB and OFB results are far from the limit, because implementation
47# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
48# single-block aesni_encrypt, which is not the most optimal way to go.
49# CBC encrypt result is unexpectedly high and there is no documented
50# explanation for it. Seemingly there is a small penalty for feeding
51# the result back to AES unit the way it's done in CBC mode. There is
52# nothing one can do and the result appears optimal. CCM result is
53# identical to CBC, because CBC-MAC is essentially CBC encrypt without
54# saving output. CCM CTR "stays invisible," because it's neatly
 55# interleaved with CBC-MAC. This provides ~30% improvement over
 56# "straightforward" CCM implementation with CTR and CBC-MAC performed
57# disjointly. Parallelizable modes practically achieve the theoretical
58# limit.
59#
60# Looking at how results vary with buffer size.
61#
62# Curves are practically saturated at 1-KB buffer size. In most cases
63# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
64# CTR curve doesn't follow this pattern and is "slowest" changing one
65# with "256-byte" result being 87% of "8-KB." This is because overhead
66# in CTR mode is most computationally intensive. Small-block CCM
67# decrypt is slower than encrypt, because first CTR and last CBC-MAC
68# iterations can't be interleaved.
69#
70# Results for 192- and 256-bit keys.
71#
72# EVP-free results were observed to scale perfectly with number of
73# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
74# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
75# are a tad smaller, because the above mentioned penalty biases all
76# results by same constant value. In similar way function call
77# overhead affects small-block performance, as well as OFB and CFB
78# results. Differences are not large, most common coefficients are
79# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
80# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
81
82# January 2011
83#
84# While Westmere processor features 6 cycles latency for aes[enc|dec]
85# instructions, which can be scheduled every second cycle, Sandy
86# Bridge spends 8 cycles per instruction, but it can schedule them
87# every cycle. This means that code targeting Westmere would perform
88# suboptimally on Sandy Bridge. Therefore this update.
89#
90# In addition, non-parallelizable CBC encrypt (as well as CCM) is
91# optimized. Relative improvement might appear modest, 8% on Westmere,
92# but in absolute terms it's 3.77 cycles per byte encrypted with
93# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
94# should be compared to asymptotic limits of 3.75 for Westmere and
95# 5.00 for Sandy Bridge. Actually, the fact that they get this close
96# to asymptotic limits is quite amazing. Indeed, the limit is
97# calculated as latency times number of rounds, 10 for 128-bit key,
98# and divided by 16, the number of bytes in block, or in other words
99# it accounts *solely* for aesenc instructions. But there are extra
100# instructions, and numbers so close to the asymptotic limits mean
101# that it's as if it takes as little as *one* additional cycle to
102# execute all of them. How is it possible? It is possible thanks to
103# out-of-order execution logic, which manages to overlap post-
104# processing of previous block, things like saving the output, with
105# actual encryption of current block, as well as pre-processing of
106# current block, things like fetching input and xor-ing it with
107# 0-round element of the key schedule, with actual encryption of
108# previous block. Keep this in mind...
109#
110# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
111# performance is achieved by interleaving instructions working on
112# independent blocks. In which case asymptotic limit for such modes
113# can be obtained by dividing above mentioned numbers by AES
114# instructions' interleave factor. Westmere can execute at most 3
115# instructions at a time, meaning that optimal interleave factor is 3,
116# and that's where the "magic" number of 1.25 come from. "Optimal
117# interleave factor" means that increase of interleave factor does
118# not improve performance. The formula has proven to reflect reality
119# pretty well on Westmere... Sandy Bridge on the other hand can
120# execute up to 8 AES instructions at a time, so how does varying
121# interleave factor affect the performance? Here is table for ECB
122# (numbers are cycles per byte processed with 128-bit key):
123#
124# instruction interleave factor 3x 6x 8x
125# theoretical asymptotic limit 1.67 0.83 0.625
126# measured performance for 8KB block 1.05 0.86 0.84
127#
128# "as if" interleave factor 4.7x 5.8x 6.0x
129#
130# Further data for other parallelizable modes:
131#
132# CBC decrypt 1.16 0.93 0.93
133# CTR 1.14 0.91 n/a
134#
135# Well, given 3x column it's probably inappropriate to call the limit
136# asymptotic, if it can be surpassed, isn't it? What happens there?
137# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
138# magic is responsible for this. Processor overlaps not only the
 139# additional instructions with AES ones, but even AES instructions
140# processing adjacent triplets of independent blocks. In the 6x case
141# additional instructions still claim disproportionally small amount
142# of additional cycles, but in 8x case number of instructions must be
143# a tad too high for out-of-order logic to cope with, and AES unit
144# remains underutilized... As you can see 8x interleave is hardly
 145# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
 146# utilizes 6x interleave because of limited register bank capacity.
147#
148# Higher interleave factors do have negative impact on Westmere
149# performance. While for ECB mode it's negligible ~1.5%, other
150# parallelizables perform ~5% worse, which is outweighed by ~25%
151# improvement on Sandy Bridge. To balance regression on Westmere
152# CTR mode was implemented with 6x aesenc interleave factor.
153
154# April 2011
155#
156# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
157# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
158# in CTR mode AES instruction interleave factor was chosen to be 6x.
159
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output";

# NOTE(review): both arms of this ternary are "movups" in this revision,
# so $movkey no longer varies with $PREFIX — kept as-is for fidelity.
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

$in2="%xmm6";	$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";	$iv="%xmm9";
203
# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
{ my $sn;	# sequence number: makes every emitted loop label unique
sub aesni_generate1 {
# Appends a single-block AES pass to $code.  $p is "enc" or "dec";
# $inout defaults to $inout0; when $ivec is supplied it is xor-ed into
# the block before round 0 (CBC-encrypt style).
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Emits the public one-block entry points using the inline single-block
# generator above.
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# output
	ret
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# output
	ret
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}
267
268# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
 269# factor. Why were 3x subroutines originally used in loops? Even though
270# aes[enc|dec] latency was originally 6, it could be scheduled only
271# every *2nd* cycle. Thus 3x interleave was the one providing optimal
272# utilization, i.e. when subroutine's throughput is virtually same as
273# of non-interleaved subroutine [for number of input blocks up to 3].
274# This is why it makes no sense to implement 2x subroutine.
275# aes[enc|dec] latency in next processor generation is 8, but the
276# instructions can be scheduled every cycle. Optimal interleave for
277# new processor is therefore 8x...
sub aesni_generate3 {
# Emits _aesni_${dir}rypt3: three blocks interleaved through one key
# schedule walk.  $dir is "enc" or "dec".
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	($key),$rndkey0

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	dec	$rounds
	aes${dir}	$rndkey1,$inout2
	$movkey	16($key),$rndkey1
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	lea	32($key),$key
	aes${dir}	$rndkey0,$inout2
	$movkey	($key),$rndkey0
	jnz	.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4 {
# Emits _aesni_${dir}rypt4: four blocks interleaved through one key
# schedule walk.  $dir is "enc" or "dec".
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	($key),$rndkey0

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	dec	$rounds
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey	16($key),$rndkey1
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	lea	32($key),$key
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey	($key),$rndkey0
	jnz	.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
sub aesni_generate6 {
# Emits _aesni_${dir}rypt6: six blocks interleaved; the round-0 xors
# are overlapped with the first AES round to hide latency.
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	pxor	$rndkey0,$inout1
	aes${dir}	$rndkey1,$inout0
	pxor	$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout3
	aes${dir}	$rndkey1,$inout2
	pxor	$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout3
	pxor	$rndkey0,$inout5
	dec	$rounds
	aes${dir}	$rndkey1,$inout4
	$movkey	($key),$rndkey0
	aes${dir}	$rndkey1,$inout5
	jmp	.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	dec	$rounds
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
.L${dir}_loop6_enter:				# happens to be 16-byte aligned
	$movkey	16($key),$rndkey1
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	lea	32($key),$key
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey	($key),$rndkey0
	jnz	.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
sub aesni_generate8 {
# Emits _aesni_${dir}rypt8: eight blocks interleaved; round-0 xors are
# overlapped with the first AES round, same trick as the 6x variant.
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	aes${dir}	$rndkey1,$inout0
	pxor	$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout3
	aes${dir}	$rndkey1,$inout2
	pxor	$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout3
	pxor	$rndkey0,$inout5
	dec	$rounds
	aes${dir}	$rndkey1,$inout4
	pxor	$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout5
	pxor	$rndkey0,$inout7
	$movkey	($key),$rndkey0
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	$movkey	16($key),$rndkey1
	jmp	.L${dir}_loop8_enter
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	dec	$rounds
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	$movkey	16($key),$rndkey1
.L${dir}_loop8_enter:				# happens to be 16-byte aligned
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	lea	32($key),$key
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey	($key),$rndkey0
	jnz	.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# Instantiate the interleaved subroutines.  The "enc" variants are only
# needed for the standalone aesni build; the AES drop-in replacement
# reuses other encrypt paths and needs only the decrypt ones.
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");
&aesni_generate8("enc") if ($PREFIX eq "aesni");
&aesni_generate8("dec");
516
517if ($PREFIX eq "aesni") {
518########################################################################
519# void aesni_ecb_encrypt (const void *in, void *out,
520# size_t length, const AES_KEY *key,
521# int enc);
522$code.=<<___;
523.globl aesni_ecb_encrypt
524.type aesni_ecb_encrypt,\@function,5
525.align 16
526aesni_ecb_encrypt:
527 and \$-16,$len
528 jz .Lecb_ret
529
530 mov 240($key),$rounds # key->rounds
531 $movkey ($key),$rndkey0
532 mov $key,$key_ # backup $key
533 mov $rounds,$rnds_ # backup $rounds
534 test %r8d,%r8d # 5th argument
535 jz .Lecb_decrypt
536#--------------------------- ECB ENCRYPT ------------------------------#
537 cmp \$0x80,$len
538 jb .Lecb_enc_tail
539
540 movdqu ($inp),$inout0
541 movdqu 0x10($inp),$inout1
542 movdqu 0x20($inp),$inout2
543 movdqu 0x30($inp),$inout3
544 movdqu 0x40($inp),$inout4
545 movdqu 0x50($inp),$inout5
546 movdqu 0x60($inp),$inout6
547 movdqu 0x70($inp),$inout7
548 lea 0x80($inp),$inp
549 sub \$0x80,$len
550 jmp .Lecb_enc_loop8_enter
551.align 16
552.Lecb_enc_loop8:
553 movups $inout0,($out)
554 mov $key_,$key # restore $key
555 movdqu ($inp),$inout0
556 mov $rnds_,$rounds # restore $rounds
557 movups $inout1,0x10($out)
558 movdqu 0x10($inp),$inout1
559 movups $inout2,0x20($out)
560 movdqu 0x20($inp),$inout2
561 movups $inout3,0x30($out)
562 movdqu 0x30($inp),$inout3
563 movups $inout4,0x40($out)
564 movdqu 0x40($inp),$inout4
565 movups $inout5,0x50($out)
566 movdqu 0x50($inp),$inout5
567 movups $inout6,0x60($out)
568 movdqu 0x60($inp),$inout6
569 movups $inout7,0x70($out)
570 lea 0x80($out),$out
571 movdqu 0x70($inp),$inout7
572 lea 0x80($inp),$inp
573.Lecb_enc_loop8_enter:
574
575 call _aesni_encrypt8
576
577 sub \$0x80,$len
578 jnc .Lecb_enc_loop8
579
580 movups $inout0,($out)
581 mov $key_,$key # restore $key
582 movups $inout1,0x10($out)
583 mov $rnds_,$rounds # restore $rounds
584 movups $inout2,0x20($out)
585 movups $inout3,0x30($out)
586 movups $inout4,0x40($out)
587 movups $inout5,0x50($out)
588 movups $inout6,0x60($out)
589 movups $inout7,0x70($out)
590 lea 0x80($out),$out
591 add \$0x80,$len
592 jz .Lecb_ret
593
594.Lecb_enc_tail:
595 movups ($inp),$inout0
596 cmp \$0x20,$len
597 jb .Lecb_enc_one
598 movups 0x10($inp),$inout1
599 je .Lecb_enc_two
600 movups 0x20($inp),$inout2
601 cmp \$0x40,$len
602 jb .Lecb_enc_three
603 movups 0x30($inp),$inout3
604 je .Lecb_enc_four
605 movups 0x40($inp),$inout4
606 cmp \$0x60,$len
607 jb .Lecb_enc_five
608 movups 0x50($inp),$inout5
609 je .Lecb_enc_six
610 movdqu 0x60($inp),$inout6
611 call _aesni_encrypt8
612 movups $inout0,($out)
613 movups $inout1,0x10($out)
614 movups $inout2,0x20($out)
615 movups $inout3,0x30($out)
616 movups $inout4,0x40($out)
617 movups $inout5,0x50($out)
618 movups $inout6,0x60($out)
619 jmp .Lecb_ret
620.align 16
621.Lecb_enc_one:
622___
623 &aesni_generate1("enc",$key,$rounds);
624$code.=<<___;
625 movups $inout0,($out)
626 jmp .Lecb_ret
627.align 16
628.Lecb_enc_two:
629 xorps $inout2,$inout2
630 call _aesni_encrypt3
631 movups $inout0,($out)
632 movups $inout1,0x10($out)
633 jmp .Lecb_ret
634.align 16
635.Lecb_enc_three:
636 call _aesni_encrypt3
637 movups $inout0,($out)
638 movups $inout1,0x10($out)
639 movups $inout2,0x20($out)
640 jmp .Lecb_ret
641.align 16
642.Lecb_enc_four:
643 call _aesni_encrypt4
644 movups $inout0,($out)
645 movups $inout1,0x10($out)
646 movups $inout2,0x20($out)
647 movups $inout3,0x30($out)
648 jmp .Lecb_ret
649.align 16
650.Lecb_enc_five:
651 xorps $inout5,$inout5
652 call _aesni_encrypt6
653 movups $inout0,($out)
654 movups $inout1,0x10($out)
655 movups $inout2,0x20($out)
656 movups $inout3,0x30($out)
657 movups $inout4,0x40($out)
658 jmp .Lecb_ret
659.align 16
660.Lecb_enc_six:
661 call _aesni_encrypt6
662 movups $inout0,($out)
663 movups $inout1,0x10($out)
664 movups $inout2,0x20($out)
665 movups $inout3,0x30($out)
666 movups $inout4,0x40($out)
667 movups $inout5,0x50($out)
668 jmp .Lecb_ret
669 #--------------------------- ECB DECRYPT ------------------------------#
670.align 16
671.Lecb_decrypt:
672 cmp \$0x80,$len
673 jb .Lecb_dec_tail
674
675 movdqu ($inp),$inout0
676 movdqu 0x10($inp),$inout1
677 movdqu 0x20($inp),$inout2
678 movdqu 0x30($inp),$inout3
679 movdqu 0x40($inp),$inout4
680 movdqu 0x50($inp),$inout5
681 movdqu 0x60($inp),$inout6
682 movdqu 0x70($inp),$inout7
683 lea 0x80($inp),$inp
684 sub \$0x80,$len
685 jmp .Lecb_dec_loop8_enter
686.align 16
687.Lecb_dec_loop8:
688 movups $inout0,($out)
689 mov $key_,$key # restore $key
690 movdqu ($inp),$inout0
691 mov $rnds_,$rounds # restore $rounds
692 movups $inout1,0x10($out)
693 movdqu 0x10($inp),$inout1
694 movups $inout2,0x20($out)
695 movdqu 0x20($inp),$inout2
696 movups $inout3,0x30($out)
697 movdqu 0x30($inp),$inout3
698 movups $inout4,0x40($out)
699 movdqu 0x40($inp),$inout4
700 movups $inout5,0x50($out)
701 movdqu 0x50($inp),$inout5
702 movups $inout6,0x60($out)
703 movdqu 0x60($inp),$inout6
704 movups $inout7,0x70($out)
705 lea 0x80($out),$out
706 movdqu 0x70($inp),$inout7
707 lea 0x80($inp),$inp
708.Lecb_dec_loop8_enter:
709
710 call _aesni_decrypt8
711
712 $movkey ($key_),$rndkey0
713 sub \$0x80,$len
714 jnc .Lecb_dec_loop8
715
716 movups $inout0,($out)
717 mov $key_,$key # restore $key
718 movups $inout1,0x10($out)
719 mov $rnds_,$rounds # restore $rounds
720 movups $inout2,0x20($out)
721 movups $inout3,0x30($out)
722 movups $inout4,0x40($out)
723 movups $inout5,0x50($out)
724 movups $inout6,0x60($out)
725 movups $inout7,0x70($out)
726 lea 0x80($out),$out
727 add \$0x80,$len
728 jz .Lecb_ret
729
730.Lecb_dec_tail:
731 movups ($inp),$inout0
732 cmp \$0x20,$len
733 jb .Lecb_dec_one
734 movups 0x10($inp),$inout1
735 je .Lecb_dec_two
736 movups 0x20($inp),$inout2
737 cmp \$0x40,$len
738 jb .Lecb_dec_three
739 movups 0x30($inp),$inout3
740 je .Lecb_dec_four
741 movups 0x40($inp),$inout4
742 cmp \$0x60,$len
743 jb .Lecb_dec_five
744 movups 0x50($inp),$inout5
745 je .Lecb_dec_six
746 movups 0x60($inp),$inout6
747 $movkey ($key),$rndkey0
748 call _aesni_decrypt8
749 movups $inout0,($out)
750 movups $inout1,0x10($out)
751 movups $inout2,0x20($out)
752 movups $inout3,0x30($out)
753 movups $inout4,0x40($out)
754 movups $inout5,0x50($out)
755 movups $inout6,0x60($out)
756 jmp .Lecb_ret
757.align 16
758.Lecb_dec_one:
759___
760 &aesni_generate1("dec",$key,$rounds);
761$code.=<<___;
762 movups $inout0,($out)
763 jmp .Lecb_ret
764.align 16
765.Lecb_dec_two:
766 xorps $inout2,$inout2
767 call _aesni_decrypt3
768 movups $inout0,($out)
769 movups $inout1,0x10($out)
770 jmp .Lecb_ret
771.align 16
772.Lecb_dec_three:
773 call _aesni_decrypt3
774 movups $inout0,($out)
775 movups $inout1,0x10($out)
776 movups $inout2,0x20($out)
777 jmp .Lecb_ret
778.align 16
779.Lecb_dec_four:
780 call _aesni_decrypt4
781 movups $inout0,($out)
782 movups $inout1,0x10($out)
783 movups $inout2,0x20($out)
784 movups $inout3,0x30($out)
785 jmp .Lecb_ret
786.align 16
787.Lecb_dec_five:
788 xorps $inout5,$inout5
789 call _aesni_decrypt6
790 movups $inout0,($out)
791 movups $inout1,0x10($out)
792 movups $inout2,0x20($out)
793 movups $inout3,0x30($out)
794 movups $inout4,0x40($out)
795 jmp .Lecb_ret
796.align 16
797.Lecb_dec_six:
798 call _aesni_decrypt6
799 movups $inout0,($out)
800 movups $inout1,0x10($out)
801 movups $inout2,0x20($out)
802 movups $inout3,0x30($out)
803 movups $inout4,0x40($out)
804 movups $inout5,0x50($out)
805
806.Lecb_ret:
807 ret
808.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
809___
810
811{
812######################################################################
813# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
814# size_t blocks, const AES_KEY *key,
815# const char *ivec,char *cmac);
816#
817# Handles only complete blocks, operates on 64-bit counter and
818# does not update *ivec! Nor does it finalize CMAC value
819# (see engine/eng_aesni.c for details)
820#
# CCM64 generators. Register roles below: $cmac is the 6th ABI argument
# (CMAC in/out buffer), $increment/$bswap_mask live in xmm6/xmm7 (hence
# the win64 save/restore of xmm6-xmm9 around each function).
821{
822my $cmac="%r9";			# 6th argument
823
824my $increment="%xmm6";
825my $bswap_mask="%xmm7";
826
# aesni_ccm64_encrypt_blocks: processes complete 16-byte blocks only,
# advancing a 64-bit counter; does not write the updated IV back and
# does not finalize the CMAC (see the block comment above this scope).
827$code.=<<___;
828.globl	aesni_ccm64_encrypt_blocks
829.type	aesni_ccm64_encrypt_blocks,\@function,6
830.align	16
831aesni_ccm64_encrypt_blocks:
832___
# Win64 ABI: xmm6-xmm15 are callee-saved, so spill the four xmm
# registers this function clobbers beyond xmm0-xmm5.
833$code.=<<___ if ($win64);
834	lea	-0x58(%rsp),%rsp
835	movaps	%xmm6,(%rsp)
836	movaps	%xmm7,0x10(%rsp)
837	movaps	%xmm8,0x20(%rsp)
838	movaps	%xmm9,0x30(%rsp)
839.Lccm64_enc_body:
840___
# Main body: $rounds is halved (shr 1) because .Lccm64_enc2_loop consumes
# two round keys (32 bytes of key schedule) per iteration, pipelining the
# CTR encryption ($inout0) and the CBC-MAC ($inout1) together.
841$code.=<<___;
842	mov	240($key),$rounds		# key->rounds
843	movdqu	($ivp),$iv
844	movdqa	.Lincrement64(%rip),$increment
845	movdqa	.Lbswap_mask(%rip),$bswap_mask
846
847	shr	\$1,$rounds
848	lea	0($key),$key_
849	movdqu	($cmac),$inout1
850	movdqa	$iv,$inout0
851	mov	$rounds,$rnds_
852	pshufb	$bswap_mask,$iv
853	jmp	.Lccm64_enc_outer
854.align	16
855.Lccm64_enc_outer:
856	$movkey	($key_),$rndkey0
857	mov	$rnds_,$rounds
858	movups	($inp),$in0			# load inp
859
860	xorps	$rndkey0,$inout0		# counter
861	$movkey	16($key_),$rndkey1
862	xorps	$in0,$rndkey0
863	lea	32($key_),$key
864	xorps	$rndkey0,$inout1		# cmac^=inp
865	$movkey	($key),$rndkey0
866
867.Lccm64_enc2_loop:
868	aesenc	$rndkey1,$inout0
869	dec	$rounds
870	aesenc	$rndkey1,$inout1
871	$movkey	16($key),$rndkey1
872	aesenc	$rndkey0,$inout0
873	lea	32($key),$key
874	aesenc	$rndkey0,$inout1
875	$movkey	0($key),$rndkey0
876	jnz	.Lccm64_enc2_loop
877	aesenc	$rndkey1,$inout0
878	aesenc	$rndkey1,$inout1
879	paddq	$increment,$iv
880	aesenclast	$rndkey0,$inout0
881	aesenclast	$rndkey0,$inout1
882
883	dec	$len
884	lea	16($inp),$inp
885	xorps	$inout0,$in0			# inp ^= E(iv)
886	movdqa	$iv,$inout0
887	movups	$in0,($out)			# save output
888	lea	16($out),$out
889	pshufb	$bswap_mask,$inout0
890	jnz	.Lccm64_enc_outer
891
892	movups	$inout1,($cmac)
893___
894$code.=<<___ if ($win64);
895	movaps	(%rsp),%xmm6
896	movaps	0x10(%rsp),%xmm7
897	movaps	0x20(%rsp),%xmm8
898	movaps	0x30(%rsp),%xmm9
899	lea	0x58(%rsp),%rsp
900.Lccm64_enc_ret:
901___
902$code.=<<___;
903	ret
904.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
905___
906######################################################################
# aesni_ccm64_decrypt_blocks: decryption must XOR the *plaintext* into
# the CMAC, so the keystream block for iteration N is computed one step
# ahead (first via aesni_generate1 below, then pipelined with the CMAC
# update inside .Lccm64_dec2_loop).
907$code.=<<___;
908.globl	aesni_ccm64_decrypt_blocks
909.type	aesni_ccm64_decrypt_blocks,\@function,6
910.align	16
911aesni_ccm64_decrypt_blocks:
912___
913$code.=<<___ if ($win64);
914	lea	-0x58(%rsp),%rsp
915	movaps	%xmm6,(%rsp)
916	movaps	%xmm7,0x10(%rsp)
917	movaps	%xmm8,0x20(%rsp)
918	movaps	%xmm9,0x30(%rsp)
919.Lccm64_dec_body:
920___
921$code.=<<___;
922	mov	240($key),$rounds		# key->rounds
923	movups	($ivp),$iv
924	movdqu	($cmac),$inout1
925	movdqa	.Lincrement64(%rip),$increment
926	movdqa	.Lbswap_mask(%rip),$bswap_mask
927
928	movaps	$iv,$inout0
929	mov	$rounds,$rnds_
930	mov	$key,$key_
931	pshufb	$bswap_mask,$iv
932___
# Prime the pipeline: encrypt the first counter block on its own.
933	&aesni_generate1("enc",$key,$rounds);
934$code.=<<___;
935	movups	($inp),$in0			# load inp
936	paddq	$increment,$iv
937	lea	16($inp),$inp
938	jmp	.Lccm64_dec_outer
939.align	16
940.Lccm64_dec_outer:
941	xorps	$inout0,$in0			# inp ^= E(iv)
942	movdqa	$iv,$inout0
943	mov	$rnds_,$rounds
944	movups	$in0,($out)			# save output
945	lea	16($out),$out
946	pshufb	$bswap_mask,$inout0
947
948	sub	\$1,$len
949	jz	.Lccm64_dec_break
950
951	$movkey	($key_),$rndkey0
952	shr	\$1,$rounds
953	$movkey	16($key_),$rndkey1
954	xorps	$rndkey0,$in0
955	lea	32($key_),$key
956	xorps	$rndkey0,$inout0
957	xorps	$in0,$inout1			# cmac^=out
958	$movkey	($key),$rndkey0
959
960.Lccm64_dec2_loop:
961	aesenc	$rndkey1,$inout0
962	dec	$rounds
963	aesenc	$rndkey1,$inout1
964	$movkey	16($key),$rndkey1
965	aesenc	$rndkey0,$inout0
966	lea	32($key),$key
967	aesenc	$rndkey0,$inout1
968	$movkey	0($key),$rndkey0
969	jnz	.Lccm64_dec2_loop
970	movups	($inp),$in0			# load inp
971	paddq	$increment,$iv
972	aesenc	$rndkey1,$inout0
973	aesenc	$rndkey1,$inout1
974	lea	16($inp),$inp
975	aesenclast	$rndkey0,$inout0
976	aesenclast	$rndkey0,$inout1
977	jmp	.Lccm64_dec_outer
978
979.align	16
980.Lccm64_dec_break:
981	#xorps	$in0,$inout1			# cmac^=out
982___
# Final CMAC update for the last block: aesni_generate1 is called with
# the extra ($inout1,$in0) arguments so it XORs $in0 (the last
# plaintext) into $inout1 before running the rounds.
983	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
984$code.=<<___;
985	movups	$inout1,($cmac)
986___
987$code.=<<___ if ($win64);
988	movaps	(%rsp),%xmm6
989	movaps	0x10(%rsp),%xmm7
990	movaps	0x20(%rsp),%xmm8
991	movaps	0x30(%rsp),%xmm9
992	lea	0x58(%rsp),%rsp
993.Lccm64_dec_ret:
994___
995$code.=<<___;
996	ret
997.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
998___
999}
1000######################################################################
1001# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1002# size_t blocks, const AES_KEY *key,
1003# const char *ivec);
1004#
1005# Handles only complete blocks, operates on 32-bit counter and
1006# does not update *ivec! (see engine/eng_aesni.c for details)
1007#
# CTR32 generator. $reserved is the scratch slot for the two counter
# vectors: on win64 it is the red-zone-free area at (%rsp) reserved by
# the prologue below; on SysV it is the 128-byte red zone (-0x28(%rsp)).
1008{
1009my $reserved = $win64?0:-0x28;
1010my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
1011my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
1012my $bswap_mask="%xmm15";
1013
# aesni_ctr32_encrypt_blocks: complete blocks only, 32-bit big-endian
# counter in the last dword of *ivec; the IV is not written back.
# NOTE(review): the counter is advanced with 32-bit adds on six blocks
# at a time; wrap-around of the 32-bit counter inside a batch does not
# appear to be handled here -- confirm callers bound the block count.
1014$code.=<<___;
1015.globl	aesni_ctr32_encrypt_blocks
1016.type	aesni_ctr32_encrypt_blocks,\@function,5
1017.align	16
1018aesni_ctr32_encrypt_blocks:
1019___
# Win64: this function uses xmm6-xmm15, all callee-saved on Windows.
1020$code.=<<___ if ($win64);
1021	lea	-0xc8(%rsp),%rsp
1022	movaps	%xmm6,0x20(%rsp)
1023	movaps	%xmm7,0x30(%rsp)
1024	movaps	%xmm8,0x40(%rsp)
1025	movaps	%xmm9,0x50(%rsp)
1026	movaps	%xmm10,0x60(%rsp)
1027	movaps	%xmm11,0x70(%rsp)
1028	movaps	%xmm12,0x80(%rsp)
1029	movaps	%xmm13,0x90(%rsp)
1030	movaps	%xmm14,0xa0(%rsp)
1031	movaps	%xmm15,0xb0(%rsp)
1032.Lctr32_body:
1033___
# Setup: split the IV into a counter-less $ivec (counter dword wiped)
# plus two little-endian counter vectors $iv0/$iv1 holding ctr..ctr+2
# and ctr+3..ctr+5; they are stashed at $reserved(%rsp) and byte-swapped
# for the in-register copies.
1034$code.=<<___;
1035	cmp	\$1,$len
1036	je	.Lctr32_one_shortcut
1037
1038	movdqu	($ivp),$ivec
1039	movdqa	.Lbswap_mask(%rip),$bswap_mask
1040	xor	$rounds,$rounds
1041	pextrd	\$3,$ivec,$rnds_		# pull 32-bit counter
1042	pinsrd	\$3,$rounds,$ivec		# wipe 32-bit counter
1043
1044	mov	240($key),$rounds		# key->rounds
1045	bswap	$rnds_
1046	pxor	$iv0,$iv0			# vector of 3 32-bit counters
1047	pxor	$iv1,$iv1			# vector of 3 32-bit counters
1048	pinsrd	\$0,$rnds_,$iv0
1049	lea	3($rnds_),$key_
1050	pinsrd	\$0,$key_,$iv1
1051	inc	$rnds_
1052	pinsrd	\$1,$rnds_,$iv0
1053	inc	$key_
1054	pinsrd	\$1,$key_,$iv1
1055	inc	$rnds_
1056	pinsrd	\$2,$rnds_,$iv0
1057	inc	$key_
1058	pinsrd	\$2,$key_,$iv1
1059	movdqa	$iv0,$reserved(%rsp)
1060	pshufb	$bswap_mask,$iv0
1061	movdqa	$iv1,`$reserved+0x10`(%rsp)
1062	pshufb	$bswap_mask,$iv1
1063
1064	pshufd	\$`3<<6`,$iv0,$inout0		# place counter to upper dword
1065	pshufd	\$`2<<6`,$iv0,$inout1
1066	pshufd	\$`1<<6`,$iv0,$inout2
1067	cmp	\$6,$len
1068	jb	.Lctr32_tail
1069	shr	\$1,$rounds
1070	mov	$key,$key_			# backup $key
1071	mov	$rounds,$rnds_			# backup $rounds
1072	sub	\$6,$len
1073	jmp	.Lctr32_loop6

1075.align	16
1076.Lctr32_loop6:
1077	pshufd	\$`3<<6`,$iv1,$inout3
1078	por	$ivec,$inout0			# merge counter-less ivec
1079	$movkey	($key_),$rndkey0
1080	pshufd	\$`2<<6`,$iv1,$inout4
1081	por	$ivec,$inout1
1082	$movkey	16($key_),$rndkey1
1083	pshufd	\$`1<<6`,$iv1,$inout5
1084	por	$ivec,$inout2
1085	por	$ivec,$inout3
1086	xorps	$rndkey0,$inout0
1087	por	$ivec,$inout4
1088	por	$ivec,$inout5

1090	# inline _aesni_encrypt6 and interleave last rounds
1091	# with own code...

1093	pxor	$rndkey0,$inout1
1094	aesenc	$rndkey1,$inout0
1095	lea	32($key_),$key
1096	pxor	$rndkey0,$inout2
1097	aesenc	$rndkey1,$inout1
1098	movdqa	.Lincrement32(%rip),$iv1
1099	pxor	$rndkey0,$inout3
1100	aesenc	$rndkey1,$inout2
1101	movdqa	$reserved(%rsp),$iv0
1102	pxor	$rndkey0,$inout4
1103	aesenc	$rndkey1,$inout3
1104	pxor	$rndkey0,$inout5
1105	$movkey	($key),$rndkey0
1106	dec	$rounds
1107	aesenc	$rndkey1,$inout4
1108	aesenc	$rndkey1,$inout5
1109	jmp	.Lctr32_enc_loop6_enter
1110.align	16
1111.Lctr32_enc_loop6:
1112	aesenc	$rndkey1,$inout0
1113	aesenc	$rndkey1,$inout1
1114	dec	$rounds
1115	aesenc	$rndkey1,$inout2
1116	aesenc	$rndkey1,$inout3
1117	aesenc	$rndkey1,$inout4
1118	aesenc	$rndkey1,$inout5
1119.Lctr32_enc_loop6_enter:
1120	$movkey	16($key),$rndkey1
1121	aesenc	$rndkey0,$inout0
1122	aesenc	$rndkey0,$inout1
1123	lea	32($key),$key
1124	aesenc	$rndkey0,$inout2
1125	aesenc	$rndkey0,$inout3
1126	aesenc	$rndkey0,$inout4
1127	aesenc	$rndkey0,$inout5
1128	$movkey	($key),$rndkey0
1129	jnz	.Lctr32_enc_loop6

1131	aesenc	$rndkey1,$inout0
1132	paddd	$iv1,$iv0			# increment counter vector
1133	aesenc	$rndkey1,$inout1
1134	paddd	`$reserved+0x10`(%rsp),$iv1
1135	aesenc	$rndkey1,$inout2
1136	movdqa	$iv0,$reserved(%rsp)		# save counter vector
1137	aesenc	$rndkey1,$inout3
1138	movdqa	$iv1,`$reserved+0x10`(%rsp)
1139	aesenc	$rndkey1,$inout4
1140	pshufb	$bswap_mask,$iv0		# byte swap
1141	aesenc	$rndkey1,$inout5
1142	pshufb	$bswap_mask,$iv1

1144	aesenclast	$rndkey0,$inout0
1145	movups	($inp),$in0			# load input
1146	aesenclast	$rndkey0,$inout1
1147	movups	0x10($inp),$in1
1148	aesenclast	$rndkey0,$inout2
1149	movups	0x20($inp),$in2
1150	aesenclast	$rndkey0,$inout3
1151	movups	0x30($inp),$in3
1152	aesenclast	$rndkey0,$inout4
1153	movups	0x40($inp),$rndkey1
1154	aesenclast	$rndkey0,$inout5
1155	movups	0x50($inp),$rndkey0
1156	lea	0x60($inp),$inp

1158	xorps	$inout0,$in0			# xor
1159	pshufd	\$`3<<6`,$iv0,$inout0
1160	xorps	$inout1,$in1
1161	pshufd	\$`2<<6`,$iv0,$inout1
1162	movups	$in0,($out)			# store output
1163	xorps	$inout2,$in2
1164	pshufd	\$`1<<6`,$iv0,$inout2
1165	movups	$in1,0x10($out)
1166	xorps	$inout3,$in3
1167	movups	$in2,0x20($out)
1168	xorps	$inout4,$rndkey1
1169	movups	$in3,0x30($out)
1170	xorps	$inout5,$rndkey0
1171	movups	$rndkey1,0x40($out)
1172	movups	$rndkey0,0x50($out)
1173	lea	0x60($out),$out
1174	mov	$rnds_,$rounds
1175	sub	\$6,$len
1176	jnc	.Lctr32_loop6

1178	add	\$6,$len
1179	jz	.Lctr32_done
1180	mov	$key_,$key			# restore $key
1181	lea	1($rounds,$rounds),$rounds	# restore original value

1183.Lctr32_tail:
1184	por	$ivec,$inout0
1185	movups	($inp),$in0
1186	cmp	\$2,$len
1187	jb	.Lctr32_one

1189	por	$ivec,$inout1
1190	movups	0x10($inp),$in1
1191	je	.Lctr32_two

1193	pshufd	\$`3<<6`,$iv1,$inout3
1194	por	$ivec,$inout2
1195	movups	0x20($inp),$in2
1196	cmp	\$4,$len
1197	jb	.Lctr32_three

1199	pshufd	\$`2<<6`,$iv1,$inout4
1200	por	$ivec,$inout3
1201	movups	0x30($inp),$in3
1202	je	.Lctr32_four

1204	por	$ivec,$inout4
1205	xorps	$inout5,$inout5

1207	call	_aesni_encrypt6

1209	movups	0x40($inp),$rndkey1
1210	xorps	$inout0,$in0
1211	xorps	$inout1,$in1
1212	movups	$in0,($out)
1213	xorps	$inout2,$in2
1214	movups	$in1,0x10($out)
1215	xorps	$inout3,$in3
1216	movups	$in2,0x20($out)
1217	xorps	$inout4,$rndkey1
1218	movups	$in3,0x30($out)
1219	movups	$rndkey1,0x40($out)
1220	jmp	.Lctr32_done

1222.align	16
1223.Lctr32_one_shortcut:
1224	movups	($ivp),$inout0
1225	movups	($inp),$in0
1226	mov	240($key),$rounds		# key->rounds
1227.Lctr32_one:
1228___
# Single-block path: inline one-block encrypt of the counter, then XOR.
1229	&aesni_generate1("enc",$key,$rounds);
1230$code.=<<___;
1231	xorps	$inout0,$in0
1232	movups	$in0,($out)
1233	jmp	.Lctr32_done

1235.align	16
1236.Lctr32_two:
1237	xorps	$inout2,$inout2
1238	call	_aesni_encrypt3
1239	xorps	$inout0,$in0
1240	xorps	$inout1,$in1
1241	movups	$in0,($out)
1242	movups	$in1,0x10($out)
1243	jmp	.Lctr32_done

1245.align	16
1246.Lctr32_three:
1247	call	_aesni_encrypt3
1248	xorps	$inout0,$in0
1249	xorps	$inout1,$in1
1250	movups	$in0,($out)
1251	xorps	$inout2,$in2
1252	movups	$in1,0x10($out)
1253	movups	$in2,0x20($out)
1254	jmp	.Lctr32_done

1256.align	16
1257.Lctr32_four:
1258	call	_aesni_encrypt4
1259	xorps	$inout0,$in0
1260	xorps	$inout1,$in1
1261	movups	$in0,($out)
1262	xorps	$inout2,$in2
1263	movups	$in1,0x10($out)
1264	xorps	$inout3,$in3
1265	movups	$in2,0x20($out)
1266	movups	$in3,0x30($out)

1268.Lctr32_done:
1269___
# Win64 epilogue: restore callee-saved xmm registers.
1270$code.=<<___ if ($win64);
1271	movaps	0x20(%rsp),%xmm6
1272	movaps	0x30(%rsp),%xmm7
1273	movaps	0x40(%rsp),%xmm8
1274	movaps	0x50(%rsp),%xmm9
1275	movaps	0x60(%rsp),%xmm10
1276	movaps	0x70(%rsp),%xmm11
1277	movaps	0x80(%rsp),%xmm12
1278	movaps	0x90(%rsp),%xmm13
1279	movaps	0xa0(%rsp),%xmm14
1280	movaps	0xb0(%rsp),%xmm15
1281	lea	0xc8(%rsp),%rsp
1282.Lctr32_ret:
1283___
1284$code.=<<___;
1285	ret
1286.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1287___
1288}
1289
1290######################################################################
1291# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1292# const AES_KEY *key1, const AES_KEY *key2
1293# const unsigned char iv[16]);
1294#
1295{
1296my @tweak=map("%xmm$_",(10..15));
1297my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1298my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1299my $frame_size = 0x68 + ($win64?160:0);
1300
1301$code.=<<___;
1302.globl aesni_xts_encrypt
1303.type aesni_xts_encrypt,\@function,6
1304.align 16
1305aesni_xts_encrypt:
1306 lea -$frame_size(%rsp),%rsp
1307___
1308$code.=<<___ if ($win64);
1309 movaps %xmm6,0x60(%rsp)
1310 movaps %xmm7,0x70(%rsp)
1311 movaps %xmm8,0x80(%rsp)
1312 movaps %xmm9,0x90(%rsp)
1313 movaps %xmm10,0xa0(%rsp)
1314 movaps %xmm11,0xb0(%rsp)
1315 movaps %xmm12,0xc0(%rsp)
1316 movaps %xmm13,0xd0(%rsp)
1317 movaps %xmm14,0xe0(%rsp)
1318 movaps %xmm15,0xf0(%rsp)
1319.Lxts_enc_body:
1320___
1321$code.=<<___;
1322 movups ($ivp),@tweak[5] # load clear-text tweak
1323 mov 240(%r8),$rounds # key2->rounds
1324 mov 240($key),$rnds_ # key1->rounds
1325___
1326 # generate the tweak
1327 &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
1328$code.=<<___;
1329 mov $key,$key_ # backup $key
1330 mov $rnds_,$rounds # backup $rounds
1331 mov $len,$len_ # backup $len
1332 and \$-16,$len
1333
1334 movdqa .Lxts_magic(%rip),$twmask
1335 pxor $twtmp,$twtmp
1336 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1337___
1338 for ($i=0;$i<4;$i++) {
1339 $code.=<<___;
1340 pshufd \$0x13,$twtmp,$twres
1341 pxor $twtmp,$twtmp
1342 movdqa @tweak[5],@tweak[$i]
1343 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1344 pand $twmask,$twres # isolate carry and residue
1345 pcmpgtd @tweak[5],$twtmp # broadcat upper bits
1346 pxor $twres,@tweak[5]
1347___
1348 }
1349$code.=<<___;
1350 sub \$16*6,$len
1351 jc .Lxts_enc_short
1352
1353 shr \$1,$rounds
1354 sub \$1,$rounds
1355 mov $rounds,$rnds_
1356 jmp .Lxts_enc_grandloop
1357
1358.align 16
1359.Lxts_enc_grandloop:
1360 pshufd \$0x13,$twtmp,$twres
1361 movdqa @tweak[5],@tweak[4]
1362 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1363 movdqu `16*0`($inp),$inout0 # load input
1364 pand $twmask,$twres # isolate carry and residue
1365 movdqu `16*1`($inp),$inout1
1366 pxor $twres,@tweak[5]
1367
1368 movdqu `16*2`($inp),$inout2
1369 pxor @tweak[0],$inout0 # input^=tweak
1370 movdqu `16*3`($inp),$inout3
1371 pxor @tweak[1],$inout1
1372 movdqu `16*4`($inp),$inout4
1373 pxor @tweak[2],$inout2
1374 movdqu `16*5`($inp),$inout5
1375 lea `16*6`($inp),$inp
1376 pxor @tweak[3],$inout3
1377 $movkey ($key_),$rndkey0
1378 pxor @tweak[4],$inout4
1379 pxor @tweak[5],$inout5
1380
1381 # inline _aesni_encrypt6 and interleave first and last rounds
1382 # with own code...
1383 $movkey 16($key_),$rndkey1
1384 pxor $rndkey0,$inout0
1385 pxor $rndkey0,$inout1
1386 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
1387 aesenc $rndkey1,$inout0
1388 lea 32($key_),$key
1389 pxor $rndkey0,$inout2
1390 movdqa @tweak[1],`16*1`(%rsp)
1391 aesenc $rndkey1,$inout1
1392 pxor $rndkey0,$inout3
1393 movdqa @tweak[2],`16*2`(%rsp)
1394 aesenc $rndkey1,$inout2
1395 pxor $rndkey0,$inout4
1396 movdqa @tweak[3],`16*3`(%rsp)
1397 aesenc $rndkey1,$inout3
1398 pxor $rndkey0,$inout5
1399 $movkey ($key),$rndkey0
1400 dec $rounds
1401 movdqa @tweak[4],`16*4`(%rsp)
1402 aesenc $rndkey1,$inout4
1403 movdqa @tweak[5],`16*5`(%rsp)
1404 aesenc $rndkey1,$inout5
1405 pxor $twtmp,$twtmp
1406 pcmpgtd @tweak[5],$twtmp
1407 jmp .Lxts_enc_loop6_enter
1408
1409.align 16
1410.Lxts_enc_loop6:
1411 aesenc $rndkey1,$inout0
1412 aesenc $rndkey1,$inout1
1413 dec $rounds
1414 aesenc $rndkey1,$inout2
1415 aesenc $rndkey1,$inout3
1416 aesenc $rndkey1,$inout4
1417 aesenc $rndkey1,$inout5
1418.Lxts_enc_loop6_enter:
1419 $movkey 16($key),$rndkey1
1420 aesenc $rndkey0,$inout0
1421 aesenc $rndkey0,$inout1
1422 lea 32($key),$key
1423 aesenc $rndkey0,$inout2
1424 aesenc $rndkey0,$inout3
1425 aesenc $rndkey0,$inout4
1426 aesenc $rndkey0,$inout5
1427 $movkey ($key),$rndkey0
1428 jnz .Lxts_enc_loop6
1429
1430 pshufd \$0x13,$twtmp,$twres
1431 pxor $twtmp,$twtmp
1432 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1433 aesenc $rndkey1,$inout0
1434 pand $twmask,$twres # isolate carry and residue
1435 aesenc $rndkey1,$inout1
1436 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1437 aesenc $rndkey1,$inout2
1438 pxor $twres,@tweak[5]
1439 aesenc $rndkey1,$inout3
1440 aesenc $rndkey1,$inout4
1441 aesenc $rndkey1,$inout5
1442 $movkey 16($key),$rndkey1
1443
1444 pshufd \$0x13,$twtmp,$twres
1445 pxor $twtmp,$twtmp
1446 movdqa @tweak[5],@tweak[0]
1447 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1448 aesenc $rndkey0,$inout0
1449 pand $twmask,$twres # isolate carry and residue
1450 aesenc $rndkey0,$inout1
1451 pcmpgtd @tweak[5],$twtmp # broadcat upper bits
1452 aesenc $rndkey0,$inout2
1453 pxor $twres,@tweak[5]
1454 aesenc $rndkey0,$inout3
1455 aesenc $rndkey0,$inout4
1456 aesenc $rndkey0,$inout5
1457 $movkey 32($key),$rndkey0
1458
1459 pshufd \$0x13,$twtmp,$twres
1460 pxor $twtmp,$twtmp
1461 movdqa @tweak[5],@tweak[1]
1462 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1463 aesenc $rndkey1,$inout0
1464 pand $twmask,$twres # isolate carry and residue
1465 aesenc $rndkey1,$inout1
1466 pcmpgtd @tweak[5],$twtmp # broadcat upper bits
1467 aesenc $rndkey1,$inout2
1468 pxor $twres,@tweak[5]
1469 aesenc $rndkey1,$inout3
1470 aesenc $rndkey1,$inout4
1471 aesenc $rndkey1,$inout5
1472
1473 pshufd \$0x13,$twtmp,$twres
1474 pxor $twtmp,$twtmp
1475 movdqa @tweak[5],@tweak[2]
1476 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1477 aesenclast $rndkey0,$inout0
1478 pand $twmask,$twres # isolate carry and residue
1479 aesenclast $rndkey0,$inout1
1480 pcmpgtd @tweak[5],$twtmp # broadcat upper bits
1481 aesenclast $rndkey0,$inout2
1482 pxor $twres,@tweak[5]
1483 aesenclast $rndkey0,$inout3
1484 aesenclast $rndkey0,$inout4
1485 aesenclast $rndkey0,$inout5
1486
1487 pshufd \$0x13,$twtmp,$twres
1488 pxor $twtmp,$twtmp
1489 movdqa @tweak[5],@tweak[3]
1490 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1491 xorps `16*0`(%rsp),$inout0 # output^=tweak
1492 pand $twmask,$twres # isolate carry and residue
1493 xorps `16*1`(%rsp),$inout1
1494 pcmpgtd @tweak[5],$twtmp # broadcat upper bits
1495 pxor $twres,@tweak[5]
1496
1497 xorps `16*2`(%rsp),$inout2
1498 movups $inout0,`16*0`($out) # write output
1499 xorps `16*3`(%rsp),$inout3
1500 movups $inout1,`16*1`($out)
1501 xorps `16*4`(%rsp),$inout4
1502 movups $inout2,`16*2`($out)
1503 xorps `16*5`(%rsp),$inout5
1504 movups $inout3,`16*3`($out)
1505 mov $rnds_,$rounds # restore $rounds
1506 movups $inout4,`16*4`($out)
1507 movups $inout5,`16*5`($out)
1508 lea `16*6`($out),$out
1509 sub \$16*6,$len
1510 jnc .Lxts_enc_grandloop
1511
1512 lea 3($rounds,$rounds),$rounds # restore original value
1513 mov $key_,$key # restore $key
1514 mov $rounds,$rnds_ # backup $rounds
1515
1516.Lxts_enc_short:
1517 add \$16*6,$len
1518 jz .Lxts_enc_done
1519
1520 cmp \$0x20,$len
1521 jb .Lxts_enc_one
1522 je .Lxts_enc_two
1523
1524 cmp \$0x40,$len
1525 jb .Lxts_enc_three
1526 je .Lxts_enc_four
1527
1528 pshufd \$0x13,$twtmp,$twres
1529 movdqa @tweak[5],@tweak[4]
1530 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1531 movdqu ($inp),$inout0
1532 pand $twmask,$twres # isolate carry and residue
1533 movdqu 16*1($inp),$inout1
1534 pxor $twres,@tweak[5]
1535
1536 movdqu 16*2($inp),$inout2
1537 pxor @tweak[0],$inout0
1538 movdqu 16*3($inp),$inout3
1539 pxor @tweak[1],$inout1
1540 movdqu 16*4($inp),$inout4
1541 lea 16*5($inp),$inp
1542 pxor @tweak[2],$inout2
1543 pxor @tweak[3],$inout3
1544 pxor @tweak[4],$inout4
1545
1546 call _aesni_encrypt6
1547
1548 xorps @tweak[0],$inout0
1549 movdqa @tweak[5],@tweak[0]
1550 xorps @tweak[1],$inout1
1551 xorps @tweak[2],$inout2
1552 movdqu $inout0,($out)
1553 xorps @tweak[3],$inout3
1554 movdqu $inout1,16*1($out)
1555 xorps @tweak[4],$inout4
1556 movdqu $inout2,16*2($out)
1557 movdqu $inout3,16*3($out)
1558 movdqu $inout4,16*4($out)
1559 lea 16*5($out),$out
1560 jmp .Lxts_enc_done
1561
1562.align 16
1563.Lxts_enc_one:
1564 movups ($inp),$inout0
1565 lea 16*1($inp),$inp
1566 xorps @tweak[0],$inout0
1567___
1568 &aesni_generate1("enc",$key,$rounds);
1569$code.=<<___;
1570 xorps @tweak[0],$inout0
1571 movdqa @tweak[1],@tweak[0]
1572 movups $inout0,($out)
1573 lea 16*1($out),$out
1574 jmp .Lxts_enc_done
1575
1576.align 16
1577.Lxts_enc_two:
1578 movups ($inp),$inout0
1579 movups 16($inp),$inout1
1580 lea 32($inp),$inp
1581 xorps @tweak[0],$inout0
1582 xorps @tweak[1],$inout1
1583
1584 call _aesni_encrypt3
1585
1586 xorps @tweak[0],$inout0
1587 movdqa @tweak[2],@tweak[0]
1588 xorps @tweak[1],$inout1
1589 movups $inout0,($out)
1590 movups $inout1,16*1($out)
1591 lea 16*2($out),$out
1592 jmp .Lxts_enc_done
1593
1594.align 16
1595.Lxts_enc_three:
1596 movups ($inp),$inout0
1597 movups 16*1($inp),$inout1
1598 movups 16*2($inp),$inout2
1599 lea 16*3($inp),$inp
1600 xorps @tweak[0],$inout0
1601 xorps @tweak[1],$inout1
1602 xorps @tweak[2],$inout2
1603
1604 call _aesni_encrypt3
1605
1606 xorps @tweak[0],$inout0
1607 movdqa @tweak[3],@tweak[0]
1608 xorps @tweak[1],$inout1
1609 xorps @tweak[2],$inout2
1610 movups $inout0,($out)
1611 movups $inout1,16*1($out)
1612 movups $inout2,16*2($out)
1613 lea 16*3($out),$out
1614 jmp .Lxts_enc_done
1615
1616.align 16
1617.Lxts_enc_four:
1618 movups ($inp),$inout0
1619 movups 16*1($inp),$inout1
1620 movups 16*2($inp),$inout2
1621 xorps @tweak[0],$inout0
1622 movups 16*3($inp),$inout3
1623 lea 16*4($inp),$inp
1624 xorps @tweak[1],$inout1
1625 xorps @tweak[2],$inout2
1626 xorps @tweak[3],$inout3
1627
1628 call _aesni_encrypt4
1629
1630 xorps @tweak[0],$inout0
1631 movdqa @tweak[5],@tweak[0]
1632 xorps @tweak[1],$inout1
1633 xorps @tweak[2],$inout2
1634 movups $inout0,($out)
1635 xorps @tweak[3],$inout3
1636 movups $inout1,16*1($out)
1637 movups $inout2,16*2($out)
1638 movups $inout3,16*3($out)
1639 lea 16*4($out),$out
1640 jmp .Lxts_enc_done
1641
1642.align 16
1643.Lxts_enc_done:
1644 and \$15,$len_
1645 jz .Lxts_enc_ret
1646 mov $len_,$len
1647
1648.Lxts_enc_steal:
1649 movzb ($inp),%eax # borrow $rounds ...
1650 movzb -16($out),%ecx # ... and $key
1651 lea 1($inp),$inp
1652 mov %al,-16($out)
1653 mov %cl,0($out)
1654 lea 1($out),$out
1655 sub \$1,$len
1656 jnz .Lxts_enc_steal
1657
1658 sub $len_,$out # rewind $out
1659 mov $key_,$key # restore $key
1660 mov $rnds_,$rounds # restore $rounds
1661
1662 movups -16($out),$inout0
1663 xorps @tweak[0],$inout0
1664___
1665 &aesni_generate1("enc",$key,$rounds);
1666$code.=<<___;
1667 xorps @tweak[0],$inout0
1668 movups $inout0,-16($out)
1669
1670.Lxts_enc_ret:
1671___
1672$code.=<<___ if ($win64);
1673 movaps 0x60(%rsp),%xmm6
1674 movaps 0x70(%rsp),%xmm7
1675 movaps 0x80(%rsp),%xmm8
1676 movaps 0x90(%rsp),%xmm9
1677 movaps 0xa0(%rsp),%xmm10
1678 movaps 0xb0(%rsp),%xmm11
1679 movaps 0xc0(%rsp),%xmm12
1680 movaps 0xd0(%rsp),%xmm13
1681 movaps 0xe0(%rsp),%xmm14
1682 movaps 0xf0(%rsp),%xmm15
1683___
1684$code.=<<___;
1685 lea $frame_size(%rsp),%rsp
1686.Lxts_enc_epilogue:
1687 ret
1688.size aesni_xts_encrypt,.-aesni_xts_encrypt
1689___
1690
1691$code.=<<___;
1692.globl aesni_xts_decrypt
1693.type aesni_xts_decrypt,\@function,6
1694.align 16
1695aesni_xts_decrypt:
1696 lea -$frame_size(%rsp),%rsp
1697___
1698$code.=<<___ if ($win64);
1699 movaps %xmm6,0x60(%rsp)
1700 movaps %xmm7,0x70(%rsp)
1701 movaps %xmm8,0x80(%rsp)
1702 movaps %xmm9,0x90(%rsp)
1703 movaps %xmm10,0xa0(%rsp)
1704 movaps %xmm11,0xb0(%rsp)
1705 movaps %xmm12,0xc0(%rsp)
1706 movaps %xmm13,0xd0(%rsp)
1707 movaps %xmm14,0xe0(%rsp)
1708 movaps %xmm15,0xf0(%rsp)
1709.Lxts_dec_body:
1710___
1711$code.=<<___;
1712 movups ($ivp),@tweak[5] # load clear-text tweak
1713 mov 240($key2),$rounds # key2->rounds
1714 mov 240($key),$rnds_ # key1->rounds
1715___
1716 # generate the tweak
1717 &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
1718$code.=<<___;
1719 xor %eax,%eax # if ($len%16) len-=16;
1720 test \$15,$len
1721 setnz %al
1722 shl \$4,%rax
1723 sub %rax,$len
1724
1725 mov $key,$key_ # backup $key
1726 mov $rnds_,$rounds # backup $rounds
1727 mov $len,$len_ # backup $len
1728 and \$-16,$len
1729
1730 movdqa .Lxts_magic(%rip),$twmask
1731 pxor $twtmp,$twtmp
1732 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1733___
1734 for ($i=0;$i<4;$i++) {
1735 $code.=<<___;
1736 pshufd \$0x13,$twtmp,$twres
1737 pxor $twtmp,$twtmp
1738 movdqa @tweak[5],@tweak[$i]
1739 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1740 pand $twmask,$twres # isolate carry and residue
1741 pcmpgtd @tweak[5],$twtmp # broadcat upper bits
1742 pxor $twres,@tweak[5]
1743___
1744 }
1745$code.=<<___;
1746 sub \$16*6,$len
1747 jc .Lxts_dec_short
1748
1749 shr \$1,$rounds
1750 sub \$1,$rounds
1751 mov $rounds,$rnds_
1752 jmp .Lxts_dec_grandloop
1753
1754.align 16
1755.Lxts_dec_grandloop:
1756 pshufd \$0x13,$twtmp,$twres
1757 movdqa @tweak[5],@tweak[4]
1758 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1759 movdqu `16*0`($inp),$inout0 # load input
1760 pand $twmask,$twres # isolate carry and residue
1761 movdqu `16*1`($inp),$inout1
1762 pxor $twres,@tweak[5]
1763
1764 movdqu `16*2`($inp),$inout2
1765 pxor @tweak[0],$inout0 # input^=tweak
1766 movdqu `16*3`($inp),$inout3
1767 pxor @tweak[1],$inout1
1768 movdqu `16*4`($inp),$inout4
1769 pxor @tweak[2],$inout2
1770 movdqu `16*5`($inp),$inout5
1771 lea `16*6`($inp),$inp
1772 pxor @tweak[3],$inout3
1773 $movkey ($key_),$rndkey0
1774 pxor @tweak[4],$inout4
1775 pxor @tweak[5],$inout5
1776
1777 # inline _aesni_decrypt6 and interleave first and last rounds
1778 # with own code...
1779 $movkey 16($key_),$rndkey1
1780 pxor $rndkey0,$inout0
1781 pxor $rndkey0,$inout1
1782 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
1783 aesdec $rndkey1,$inout0
1784 lea 32($key_),$key
1785 pxor $rndkey0,$inout2
1786 movdqa @tweak[1],`16*1`(%rsp)
1787 aesdec $rndkey1,$inout1
1788 pxor $rndkey0,$inout3
1789 movdqa @tweak[2],`16*2`(%rsp)
1790 aesdec $rndkey1,$inout2
1791 pxor $rndkey0,$inout4
1792 movdqa @tweak[3],`16*3`(%rsp)
1793 aesdec $rndkey1,$inout3
1794 pxor $rndkey0,$inout5
1795 $movkey ($key),$rndkey0
1796 dec $rounds
1797 movdqa @tweak[4],`16*4`(%rsp)
1798 aesdec $rndkey1,$inout4
1799 movdqa @tweak[5],`16*5`(%rsp)
1800 aesdec $rndkey1,$inout5
1801 pxor $twtmp,$twtmp
1802 pcmpgtd @tweak[5],$twtmp
1803 jmp .Lxts_dec_loop6_enter
1804
1805.align 16
1806.Lxts_dec_loop6:
1807 aesdec $rndkey1,$inout0
1808 aesdec $rndkey1,$inout1
1809 dec $rounds
1810 aesdec $rndkey1,$inout2
1811 aesdec $rndkey1,$inout3
1812 aesdec $rndkey1,$inout4
1813 aesdec $rndkey1,$inout5
1814.Lxts_dec_loop6_enter:
1815 $movkey 16($key),$rndkey1
1816 aesdec $rndkey0,$inout0
1817 aesdec $rndkey0,$inout1
1818 lea 32($key),$key
1819 aesdec $rndkey0,$inout2
1820 aesdec $rndkey0,$inout3
1821 aesdec $rndkey0,$inout4
1822 aesdec $rndkey0,$inout5
1823 $movkey ($key),$rndkey0
1824 jnz .Lxts_dec_loop6
1825
1826 pshufd \$0x13,$twtmp,$twres
1827 pxor $twtmp,$twtmp
1828 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1829 aesdec $rndkey1,$inout0
1830 pand $twmask,$twres # isolate carry and residue
1831 aesdec $rndkey1,$inout1
1832 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1833 aesdec $rndkey1,$inout2
1834 pxor $twres,@tweak[5]
1835 aesdec $rndkey1,$inout3
1836 aesdec $rndkey1,$inout4
1837 aesdec $rndkey1,$inout5
1838 $movkey 16($key),$rndkey1
1839
1840 pshufd \$0x13,$twtmp,$twres
1841 pxor $twtmp,$twtmp
1842 movdqa @tweak[5],@tweak[0]
1843 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1844 aesdec $rndkey0,$inout0
1845 pand $twmask,$twres # isolate carry and residue
1846 aesdec $rndkey0,$inout1
	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1848 aesdec $rndkey0,$inout2
1849 pxor $twres,@tweak[5]
1850 aesdec $rndkey0,$inout3
1851 aesdec $rndkey0,$inout4
1852 aesdec $rndkey0,$inout5
1853 $movkey 32($key),$rndkey0
1854
1855 pshufd \$0x13,$twtmp,$twres
1856 pxor $twtmp,$twtmp
1857 movdqa @tweak[5],@tweak[1]
1858 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1859 aesdec $rndkey1,$inout0
1860 pand $twmask,$twres # isolate carry and residue
1861 aesdec $rndkey1,$inout1
	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1863 aesdec $rndkey1,$inout2
1864 pxor $twres,@tweak[5]
1865 aesdec $rndkey1,$inout3
1866 aesdec $rndkey1,$inout4
1867 aesdec $rndkey1,$inout5
1868
1869 pshufd \$0x13,$twtmp,$twres
1870 pxor $twtmp,$twtmp
1871 movdqa @tweak[5],@tweak[2]
1872 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1873 aesdeclast $rndkey0,$inout0
1874 pand $twmask,$twres # isolate carry and residue
1875 aesdeclast $rndkey0,$inout1
	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1877 aesdeclast $rndkey0,$inout2
1878 pxor $twres,@tweak[5]
1879 aesdeclast $rndkey0,$inout3
1880 aesdeclast $rndkey0,$inout4
1881 aesdeclast $rndkey0,$inout5
1882
1883 pshufd \$0x13,$twtmp,$twres
1884 pxor $twtmp,$twtmp
1885 movdqa @tweak[5],@tweak[3]
1886 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1887 xorps `16*0`(%rsp),$inout0 # output^=tweak
1888 pand $twmask,$twres # isolate carry and residue
1889 xorps `16*1`(%rsp),$inout1
	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1891 pxor $twres,@tweak[5]
1892
1893 xorps `16*2`(%rsp),$inout2
1894 movups $inout0,`16*0`($out) # write output
1895 xorps `16*3`(%rsp),$inout3
1896 movups $inout1,`16*1`($out)
1897 xorps `16*4`(%rsp),$inout4
1898 movups $inout2,`16*2`($out)
1899 xorps `16*5`(%rsp),$inout5
1900 movups $inout3,`16*3`($out)
1901 mov $rnds_,$rounds # restore $rounds
1902 movups $inout4,`16*4`($out)
1903 movups $inout5,`16*5`($out)
1904 lea `16*6`($out),$out
1905 sub \$16*6,$len
1906 jnc .Lxts_dec_grandloop
1907
1908 lea 3($rounds,$rounds),$rounds # restore original value
1909 mov $key_,$key # restore $key
1910 mov $rounds,$rnds_ # backup $rounds
1911
1912.Lxts_dec_short:
1913 add \$16*6,$len
1914 jz .Lxts_dec_done
1915
1916 cmp \$0x20,$len
1917 jb .Lxts_dec_one
1918 je .Lxts_dec_two
1919
1920 cmp \$0x40,$len
1921 jb .Lxts_dec_three
1922 je .Lxts_dec_four
1923
1924 pshufd \$0x13,$twtmp,$twres
1925 movdqa @tweak[5],@tweak[4]
1926 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1927 movdqu ($inp),$inout0
1928 pand $twmask,$twres # isolate carry and residue
1929 movdqu 16*1($inp),$inout1
1930 pxor $twres,@tweak[5]
1931
1932 movdqu 16*2($inp),$inout2
1933 pxor @tweak[0],$inout0
1934 movdqu 16*3($inp),$inout3
1935 pxor @tweak[1],$inout1
1936 movdqu 16*4($inp),$inout4
1937 lea 16*5($inp),$inp
1938 pxor @tweak[2],$inout2
1939 pxor @tweak[3],$inout3
1940 pxor @tweak[4],$inout4
1941
1942 call _aesni_decrypt6
1943
1944 xorps @tweak[0],$inout0
1945 xorps @tweak[1],$inout1
1946 xorps @tweak[2],$inout2
1947 movdqu $inout0,($out)
1948 xorps @tweak[3],$inout3
1949 movdqu $inout1,16*1($out)
1950 xorps @tweak[4],$inout4
1951 movdqu $inout2,16*2($out)
1952 pxor $twtmp,$twtmp
1953 movdqu $inout3,16*3($out)
1954 pcmpgtd @tweak[5],$twtmp
1955 movdqu $inout4,16*4($out)
1956 lea 16*5($out),$out
1957 pshufd \$0x13,$twtmp,@tweak[1] # $twres
1958 and \$15,$len_
1959 jz .Lxts_dec_ret
1960
1961 movdqa @tweak[5],@tweak[0]
1962 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1963 pand $twmask,@tweak[1] # isolate carry and residue
1964 pxor @tweak[5],@tweak[1]
1965 jmp .Lxts_dec_done2
1966
1967.align 16
1968.Lxts_dec_one:
1969 movups ($inp),$inout0
1970 lea 16*1($inp),$inp
1971 xorps @tweak[0],$inout0
1972___
1973 &aesni_generate1("dec",$key,$rounds);
1974$code.=<<___;
1975 xorps @tweak[0],$inout0
1976 movdqa @tweak[1],@tweak[0]
1977 movups $inout0,($out)
1978 movdqa @tweak[2],@tweak[1]
1979 lea 16*1($out),$out
1980 jmp .Lxts_dec_done
1981
1982.align 16
1983.Lxts_dec_two:
1984 movups ($inp),$inout0
1985 movups 16($inp),$inout1
1986 lea 32($inp),$inp
1987 xorps @tweak[0],$inout0
1988 xorps @tweak[1],$inout1
1989
1990 call _aesni_decrypt3
1991
1992 xorps @tweak[0],$inout0
1993 movdqa @tweak[2],@tweak[0]
1994 xorps @tweak[1],$inout1
1995 movdqa @tweak[3],@tweak[1]
1996 movups $inout0,($out)
1997 movups $inout1,16*1($out)
1998 lea 16*2($out),$out
1999 jmp .Lxts_dec_done
2000
2001.align 16
2002.Lxts_dec_three:
2003 movups ($inp),$inout0
2004 movups 16*1($inp),$inout1
2005 movups 16*2($inp),$inout2
2006 lea 16*3($inp),$inp
2007 xorps @tweak[0],$inout0
2008 xorps @tweak[1],$inout1
2009 xorps @tweak[2],$inout2
2010
2011 call _aesni_decrypt3
2012
2013 xorps @tweak[0],$inout0
2014 movdqa @tweak[3],@tweak[0]
2015 xorps @tweak[1],$inout1
2016 movdqa @tweak[5],@tweak[1]
2017 xorps @tweak[2],$inout2
2018 movups $inout0,($out)
2019 movups $inout1,16*1($out)
2020 movups $inout2,16*2($out)
2021 lea 16*3($out),$out
2022 jmp .Lxts_dec_done
2023
2024.align 16
2025.Lxts_dec_four:
2026 pshufd \$0x13,$twtmp,$twres
2027 movdqa @tweak[5],@tweak[4]
2028 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2029 movups ($inp),$inout0
2030 pand $twmask,$twres # isolate carry and residue
2031 movups 16*1($inp),$inout1
2032 pxor $twres,@tweak[5]
2033
2034 movups 16*2($inp),$inout2
2035 xorps @tweak[0],$inout0
2036 movups 16*3($inp),$inout3
2037 lea 16*4($inp),$inp
2038 xorps @tweak[1],$inout1
2039 xorps @tweak[2],$inout2
2040 xorps @tweak[3],$inout3
2041
2042 call _aesni_decrypt4
2043
2044 xorps @tweak[0],$inout0
2045 movdqa @tweak[4],@tweak[0]
2046 xorps @tweak[1],$inout1
2047 movdqa @tweak[5],@tweak[1]
2048 xorps @tweak[2],$inout2
2049 movups $inout0,($out)
2050 xorps @tweak[3],$inout3
2051 movups $inout1,16*1($out)
2052 movups $inout2,16*2($out)
2053 movups $inout3,16*3($out)
2054 lea 16*4($out),$out
2055 jmp .Lxts_dec_done
2056
2057.align 16
2058.Lxts_dec_done:
2059 and \$15,$len_
2060 jz .Lxts_dec_ret
2061.Lxts_dec_done2:
2062 mov $len_,$len
2063 mov $key_,$key # restore $key
2064 mov $rnds_,$rounds # restore $rounds
2065
2066 movups ($inp),$inout0
2067 xorps @tweak[1],$inout0
2068___
2069 &aesni_generate1("dec",$key,$rounds);
2070$code.=<<___;
2071 xorps @tweak[1],$inout0
2072 movups $inout0,($out)
2073
2074.Lxts_dec_steal:
2075 movzb 16($inp),%eax # borrow $rounds ...
2076 movzb ($out),%ecx # ... and $key
2077 lea 1($inp),$inp
2078 mov %al,($out)
2079 mov %cl,16($out)
2080 lea 1($out),$out
2081 sub \$1,$len
2082 jnz .Lxts_dec_steal
2083
2084 sub $len_,$out # rewind $out
2085 mov $key_,$key # restore $key
2086 mov $rnds_,$rounds # restore $rounds
2087
2088 movups ($out),$inout0
2089 xorps @tweak[0],$inout0
2090___
2091 &aesni_generate1("dec",$key,$rounds);
2092$code.=<<___;
2093 xorps @tweak[0],$inout0
2094 movups $inout0,($out)
2095
2096.Lxts_dec_ret:
2097___
2098$code.=<<___ if ($win64);
2099 movaps 0x60(%rsp),%xmm6
2100 movaps 0x70(%rsp),%xmm7
2101 movaps 0x80(%rsp),%xmm8
2102 movaps 0x90(%rsp),%xmm9
2103 movaps 0xa0(%rsp),%xmm10
2104 movaps 0xb0(%rsp),%xmm11
2105 movaps 0xc0(%rsp),%xmm12
2106 movaps 0xd0(%rsp),%xmm13
2107 movaps 0xe0(%rsp),%xmm14
2108 movaps 0xf0(%rsp),%xmm15
2109___
2110$code.=<<___;
2111 lea $frame_size(%rsp),%rsp
2112.Lxts_dec_epilogue:
2113 ret
2114.size aesni_xts_decrypt,.-aesni_xts_decrypt
2115___
2116} }}
2117
2118########################################################################
2119# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2120# size_t length, const AES_KEY *key,
2121# unsigned char *ivp,const int enc);
2122{
2123my $reserved = $win64?0x40:-0x18; # used in decrypt
2124$code.=<<___;
2125.globl ${PREFIX}_cbc_encrypt
2126.type ${PREFIX}_cbc_encrypt,\@function,6
2127.align 16
2128${PREFIX}_cbc_encrypt:
2129 test $len,$len # check length
2130 jz .Lcbc_ret
2131
2132 mov 240($key),$rnds_ # key->rounds
2133 mov $key,$key_ # backup $key
2134 test %r9d,%r9d # 6th argument
2135 jz .Lcbc_decrypt
2136#--------------------------- CBC ENCRYPT ------------------------------#
2137 movups ($ivp),$inout0 # load iv as initial state
2138 mov $rnds_,$rounds
2139 cmp \$16,$len
2140 jb .Lcbc_enc_tail
2141 sub \$16,$len
2142 jmp .Lcbc_enc_loop
2143.align 16
2144.Lcbc_enc_loop:
2145 movups ($inp),$inout1 # load input
2146 lea 16($inp),$inp
2147 #xorps $inout1,$inout0
2148___
2149 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
2150$code.=<<___;
2151 mov $rnds_,$rounds # restore $rounds
2152 mov $key_,$key # restore $key
2153 movups $inout0,0($out) # store output
2154 lea 16($out),$out
2155 sub \$16,$len
2156 jnc .Lcbc_enc_loop
2157 add \$16,$len
2158 jnz .Lcbc_enc_tail
2159 movups $inout0,($ivp)
2160 jmp .Lcbc_ret
2161
2162.Lcbc_enc_tail:
2163 mov $len,%rcx # zaps $key
2164 xchg $inp,$out # $inp is %rsi and $out is %rdi now
2165 .long 0x9066A4F3 # rep movsb
2166 mov \$16,%ecx # zero tail
2167 sub $len,%rcx
2168 xor %eax,%eax
2169 .long 0x9066AAF3 # rep stosb
2170 lea -16(%rdi),%rdi # rewind $out by 1 block
2171 mov $rnds_,$rounds # restore $rounds
2172 mov %rdi,%rsi # $inp and $out are the same
2173 mov $key_,$key # restore $key
2174 xor $len,$len # len=16
2175 jmp .Lcbc_enc_loop # one more spin
2176 #--------------------------- CBC DECRYPT ------------------------------#
2177.align 16
2178.Lcbc_decrypt:
2179___
2180$code.=<<___ if ($win64);
2181 lea -0x58(%rsp),%rsp
2182 movaps %xmm6,(%rsp)
2183 movaps %xmm7,0x10(%rsp)
2184 movaps %xmm8,0x20(%rsp)
2185 movaps %xmm9,0x30(%rsp)
2186.Lcbc_decrypt_body:
2187___
2188$code.=<<___;
2189 movups ($ivp),$iv
2190 mov $rnds_,$rounds
2191 cmp \$0x70,$len
2192 jbe .Lcbc_dec_tail
2193 shr \$1,$rnds_
2194 sub \$0x70,$len
2195 mov $rnds_,$rounds
2196 movaps $iv,$reserved(%rsp)
2197 jmp .Lcbc_dec_loop8_enter
2198.align 16
2199.Lcbc_dec_loop8:
2200 movaps $rndkey0,$reserved(%rsp) # save IV
2201 movups $inout7,($out)
2202 lea 0x10($out),$out
2203.Lcbc_dec_loop8_enter:
2204 $movkey ($key),$rndkey0
2205 movups ($inp),$inout0 # load input
2206 movups 0x10($inp),$inout1
2207 $movkey 16($key),$rndkey1
2208
2209 lea 32($key),$key
2210 movdqu 0x20($inp),$inout2
2211 xorps $rndkey0,$inout0
2212 movdqu 0x30($inp),$inout3
2213 xorps $rndkey0,$inout1
2214 movdqu 0x40($inp),$inout4
2215 aesdec $rndkey1,$inout0
2216 pxor $rndkey0,$inout2
2217 movdqu 0x50($inp),$inout5
2218 aesdec $rndkey1,$inout1
2219 pxor $rndkey0,$inout3
2220 movdqu 0x60($inp),$inout6
2221 aesdec $rndkey1,$inout2
2222 pxor $rndkey0,$inout4
2223 movdqu 0x70($inp),$inout7
2224 aesdec $rndkey1,$inout3
2225 pxor $rndkey0,$inout5
2226 dec $rounds
2227 aesdec $rndkey1,$inout4
2228 pxor $rndkey0,$inout6
2229 aesdec $rndkey1,$inout5
2230 pxor $rndkey0,$inout7
2231 $movkey ($key),$rndkey0
2232 aesdec $rndkey1,$inout6
2233 aesdec $rndkey1,$inout7
2234 $movkey 16($key),$rndkey1
2235
2236 call .Ldec_loop8_enter
2237
2238 movups ($inp),$rndkey1 # re-load input
2239 movups 0x10($inp),$rndkey0
2240 xorps $reserved(%rsp),$inout0 # ^= IV
2241 xorps $rndkey1,$inout1
2242 movups 0x20($inp),$rndkey1
2243 xorps $rndkey0,$inout2
2244 movups 0x30($inp),$rndkey0
2245 xorps $rndkey1,$inout3
2246 movups 0x40($inp),$rndkey1
2247 xorps $rndkey0,$inout4
2248 movups 0x50($inp),$rndkey0
2249 xorps $rndkey1,$inout5
2250 movups 0x60($inp),$rndkey1
2251 xorps $rndkey0,$inout6
2252 movups 0x70($inp),$rndkey0 # IV
2253 xorps $rndkey1,$inout7
2254 movups $inout0,($out)
2255 movups $inout1,0x10($out)
2256 movups $inout2,0x20($out)
2257 movups $inout3,0x30($out)
2258 mov $rnds_,$rounds # restore $rounds
2259 movups $inout4,0x40($out)
2260 mov $key_,$key # restore $key
2261 movups $inout5,0x50($out)
2262 lea 0x80($inp),$inp
2263 movups $inout6,0x60($out)
2264 lea 0x70($out),$out
2265 sub \$0x80,$len
2266 ja .Lcbc_dec_loop8
2267
2268 movaps $inout7,$inout0
2269 movaps $rndkey0,$iv
2270 add \$0x70,$len
2271 jle .Lcbc_dec_tail_collected
2272 movups $inout0,($out)
2273 lea 1($rnds_,$rnds_),$rounds
2274 lea 0x10($out),$out
2275.Lcbc_dec_tail:
2276 movups ($inp),$inout0
2277 movaps $inout0,$in0
2278 cmp \$0x10,$len
2279 jbe .Lcbc_dec_one
2280
2281 movups 0x10($inp),$inout1
2282 movaps $inout1,$in1
2283 cmp \$0x20,$len
2284 jbe .Lcbc_dec_two
2285
2286 movups 0x20($inp),$inout2
2287 movaps $inout2,$in2
2288 cmp \$0x30,$len
2289 jbe .Lcbc_dec_three
2290
2291 movups 0x30($inp),$inout3
2292 cmp \$0x40,$len
2293 jbe .Lcbc_dec_four
2294
2295 movups 0x40($inp),$inout4
2296 cmp \$0x50,$len
2297 jbe .Lcbc_dec_five
2298
2299 movups 0x50($inp),$inout5
2300 cmp \$0x60,$len
2301 jbe .Lcbc_dec_six
2302
2303 movups 0x60($inp),$inout6
2304 movaps $iv,$reserved(%rsp) # save IV
2305 call _aesni_decrypt8
2306 movups ($inp),$rndkey1
2307 movups 0x10($inp),$rndkey0
2308 xorps $reserved(%rsp),$inout0 # ^= IV
2309 xorps $rndkey1,$inout1
2310 movups 0x20($inp),$rndkey1
2311 xorps $rndkey0,$inout2
2312 movups 0x30($inp),$rndkey0
2313 xorps $rndkey1,$inout3
2314 movups 0x40($inp),$rndkey1
2315 xorps $rndkey0,$inout4
2316 movups 0x50($inp),$rndkey0
2317 xorps $rndkey1,$inout5
2318 movups 0x60($inp),$iv # IV
2319 xorps $rndkey0,$inout6
2320 movups $inout0,($out)
2321 movups $inout1,0x10($out)
2322 movups $inout2,0x20($out)
2323 movups $inout3,0x30($out)
2324 movups $inout4,0x40($out)
2325 movups $inout5,0x50($out)
2326 lea 0x60($out),$out
2327 movaps $inout6,$inout0
2328 sub \$0x70,$len
2329 jmp .Lcbc_dec_tail_collected
2330.align 16
2331.Lcbc_dec_one:
2332___
2333 &aesni_generate1("dec",$key,$rounds);
2334$code.=<<___;
2335 xorps $iv,$inout0
2336 movaps $in0,$iv
2337 sub \$0x10,$len
2338 jmp .Lcbc_dec_tail_collected
2339.align 16
2340.Lcbc_dec_two:
2341 xorps $inout2,$inout2
2342 call _aesni_decrypt3
2343 xorps $iv,$inout0
2344 xorps $in0,$inout1
2345 movups $inout0,($out)
2346 movaps $in1,$iv
2347 movaps $inout1,$inout0
2348 lea 0x10($out),$out
2349 sub \$0x20,$len
2350 jmp .Lcbc_dec_tail_collected
2351.align 16
2352.Lcbc_dec_three:
2353 call _aesni_decrypt3
2354 xorps $iv,$inout0
2355 xorps $in0,$inout1
2356 movups $inout0,($out)
2357 xorps $in1,$inout2
2358 movups $inout1,0x10($out)
2359 movaps $in2,$iv
2360 movaps $inout2,$inout0
2361 lea 0x20($out),$out
2362 sub \$0x30,$len
2363 jmp .Lcbc_dec_tail_collected
2364.align 16
2365.Lcbc_dec_four:
2366 call _aesni_decrypt4
2367 xorps $iv,$inout0
2368 movups 0x30($inp),$iv
2369 xorps $in0,$inout1
2370 movups $inout0,($out)
2371 xorps $in1,$inout2
2372 movups $inout1,0x10($out)
2373 xorps $in2,$inout3
2374 movups $inout2,0x20($out)
2375 movaps $inout3,$inout0
2376 lea 0x30($out),$out
2377 sub \$0x40,$len
2378 jmp .Lcbc_dec_tail_collected
2379.align 16
2380.Lcbc_dec_five:
2381 xorps $inout5,$inout5
2382 call _aesni_decrypt6
2383 movups 0x10($inp),$rndkey1
2384 movups 0x20($inp),$rndkey0
2385 xorps $iv,$inout0
2386 xorps $in0,$inout1
2387 xorps $rndkey1,$inout2
2388 movups 0x30($inp),$rndkey1
2389 xorps $rndkey0,$inout3
2390 movups 0x40($inp),$iv
2391 xorps $rndkey1,$inout4
2392 movups $inout0,($out)
2393 movups $inout1,0x10($out)
2394 movups $inout2,0x20($out)
2395 movups $inout3,0x30($out)
2396 lea 0x40($out),$out
2397 movaps $inout4,$inout0
2398 sub \$0x50,$len
2399 jmp .Lcbc_dec_tail_collected
2400.align 16
2401.Lcbc_dec_six:
2402 call _aesni_decrypt6
2403 movups 0x10($inp),$rndkey1
2404 movups 0x20($inp),$rndkey0
2405 xorps $iv,$inout0
2406 xorps $in0,$inout1
2407 xorps $rndkey1,$inout2
2408 movups 0x30($inp),$rndkey1
2409 xorps $rndkey0,$inout3
2410 movups 0x40($inp),$rndkey0
2411 xorps $rndkey1,$inout4
2412 movups 0x50($inp),$iv
2413 xorps $rndkey0,$inout5
2414 movups $inout0,($out)
2415 movups $inout1,0x10($out)
2416 movups $inout2,0x20($out)
2417 movups $inout3,0x30($out)
2418 movups $inout4,0x40($out)
2419 lea 0x50($out),$out
2420 movaps $inout5,$inout0
2421 sub \$0x60,$len
2422 jmp .Lcbc_dec_tail_collected
2423.align 16
2424.Lcbc_dec_tail_collected:
2425 and \$15,$len
2426 movups $iv,($ivp)
2427 jnz .Lcbc_dec_tail_partial
2428 movups $inout0,($out)
2429 jmp .Lcbc_dec_ret
2430.align 16
2431.Lcbc_dec_tail_partial:
2432 movaps $inout0,$reserved(%rsp)
2433 mov \$16,%rcx
2434 mov $out,%rdi
2435 sub $len,%rcx
2436 lea $reserved(%rsp),%rsi
2437 .long 0x9066A4F3 # rep movsb
2438
2439.Lcbc_dec_ret:
2440___
2441$code.=<<___ if ($win64);
2442 movaps (%rsp),%xmm6
2443 movaps 0x10(%rsp),%xmm7
2444 movaps 0x20(%rsp),%xmm8
2445 movaps 0x30(%rsp),%xmm9
2446 lea 0x58(%rsp),%rsp
2447___
2448$code.=<<___;
2449.Lcbc_ret:
2450 ret
2451.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
2452___
2453}
2454# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
2455# int bits, AES_KEY *key)
2456{ my ($inp,$bits,$key) = @_4args;
2457 $bits =~ s/%r/%e/;
2458
2459$code.=<<___;
2460.globl ${PREFIX}_set_decrypt_key
2461.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
2462.align 16
2463${PREFIX}_set_decrypt_key:
2464 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
2465 call __aesni_set_encrypt_key
2466 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
2467 test %eax,%eax
2468 jnz .Ldec_key_ret
2469 lea 16($key,$bits),$inp # points at the end of key schedule
2470
2471 $movkey ($key),%xmm0 # just swap
2472 $movkey ($inp),%xmm1
2473 $movkey %xmm0,($inp)
2474 $movkey %xmm1,($key)
2475 lea 16($key),$key
2476 lea -16($inp),$inp
2477
2478.Ldec_key_inverse:
2479 $movkey ($key),%xmm0 # swap and inverse
2480 $movkey ($inp),%xmm1
2481 aesimc %xmm0,%xmm0
2482 aesimc %xmm1,%xmm1
2483 lea 16($key),$key
2484 lea -16($inp),$inp
2485 $movkey %xmm0,16($inp)
2486 $movkey %xmm1,-16($key)
2487 cmp $key,$inp
2488 ja .Ldec_key_inverse
2489
2490 $movkey ($key),%xmm0 # inverse middle
2491 aesimc %xmm0,%xmm0
2492 $movkey %xmm0,($inp)
2493.Ldec_key_ret:
2494 add \$8,%rsp
2495 ret
2496.LSEH_end_set_decrypt_key:
2497.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
2498___
2499
2500# This is based on submission by
2501#
2502# Huang Ying <ying.huang@intel.com>
2503# Vinodh Gopal <vinodh.gopal@intel.com>
2504# Kahraman Akdemir
2505#
# Aggressively optimized with respect to aeskeygenassist's critical path
2507# and is contained in %xmm0-5 to meet Win64 ABI requirement.
2508#
2509$code.=<<___;
2510.globl ${PREFIX}_set_encrypt_key
2511.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
2512.align 16
2513${PREFIX}_set_encrypt_key:
2514__aesni_set_encrypt_key:
2515 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
2516 mov \$-1,%rax
2517 test $inp,$inp
2518 jz .Lenc_key_ret
2519 test $key,$key
2520 jz .Lenc_key_ret
2521
2522 movups ($inp),%xmm0 # pull first 128 bits of *userKey
2523 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
2524 lea 16($key),%rax
2525 cmp \$256,$bits
2526 je .L14rounds
2527 cmp \$192,$bits
2528 je .L12rounds
2529 cmp \$128,$bits
2530 jne .Lbad_keybits
2531
2532.L10rounds:
2533 mov \$9,$bits # 10 rounds for 128-bit key
2534 $movkey %xmm0,($key) # round 0
2535 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
2536 call .Lkey_expansion_128_cold
2537 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
2538 call .Lkey_expansion_128
2539 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
2540 call .Lkey_expansion_128
2541 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
2542 call .Lkey_expansion_128
2543 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
2544 call .Lkey_expansion_128
2545 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
2546 call .Lkey_expansion_128
2547 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
2548 call .Lkey_expansion_128
2549 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
2550 call .Lkey_expansion_128
2551 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
2552 call .Lkey_expansion_128
2553 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
2554 call .Lkey_expansion_128
2555 $movkey %xmm0,(%rax)
2556 mov $bits,80(%rax) # 240(%rdx)
2557 xor %eax,%eax
2558 jmp .Lenc_key_ret
2559
2560.align 16
2561.L12rounds:
2562 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
2563 mov \$11,$bits # 12 rounds for 192
2564 $movkey %xmm0,($key) # round 0
2565 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
2566 call .Lkey_expansion_192a_cold
2567 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
2568 call .Lkey_expansion_192b
2569 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
2570 call .Lkey_expansion_192a
2571 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
2572 call .Lkey_expansion_192b
2573 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
2574 call .Lkey_expansion_192a
2575 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
2576 call .Lkey_expansion_192b
2577 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
2578 call .Lkey_expansion_192a
2579 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
2580 call .Lkey_expansion_192b
2581 $movkey %xmm0,(%rax)
2582 mov $bits,48(%rax) # 240(%rdx)
2583 xor %rax, %rax
2584 jmp .Lenc_key_ret
2585
2586.align 16
2587.L14rounds:
	movups	16($inp),%xmm2			# remaining half of *userKey
2589 mov \$13,$bits # 14 rounds for 256
2590 lea 16(%rax),%rax
2591 $movkey %xmm0,($key) # round 0
2592 $movkey %xmm2,16($key) # round 1
2593 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
2594 call .Lkey_expansion_256a_cold
2595 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
2596 call .Lkey_expansion_256b
2597 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
2598 call .Lkey_expansion_256a
2599 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
2600 call .Lkey_expansion_256b
2601 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
2602 call .Lkey_expansion_256a
2603 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
2604 call .Lkey_expansion_256b
2605 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
2606 call .Lkey_expansion_256a
2607 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
2608 call .Lkey_expansion_256b
2609 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
2610 call .Lkey_expansion_256a
2611 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
2612 call .Lkey_expansion_256b
2613 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
2614 call .Lkey_expansion_256a
2615 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
2616 call .Lkey_expansion_256b
2617 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
2618 call .Lkey_expansion_256a
2619 $movkey %xmm0,(%rax)
2620 mov $bits,16(%rax) # 240(%rdx)
2621 xor %rax,%rax
2622 jmp .Lenc_key_ret
2623
2624.align 16
2625.Lbad_keybits:
2626 mov \$-2,%rax
2627.Lenc_key_ret:
2628 add \$8,%rsp
2629 ret
2630.LSEH_end_set_encrypt_key:
2631
2632.align 16
2633.Lkey_expansion_128:
2634 $movkey %xmm0,(%rax)
2635 lea 16(%rax),%rax
2636.Lkey_expansion_128_cold:
2637 shufps \$0b00010000,%xmm0,%xmm4
2638 xorps %xmm4, %xmm0
2639 shufps \$0b10001100,%xmm0,%xmm4
2640 xorps %xmm4, %xmm0
2641 shufps \$0b11111111,%xmm1,%xmm1 # critical path
2642 xorps %xmm1,%xmm0
2643 ret
2644
2645.align 16
2646.Lkey_expansion_192a:
2647 $movkey %xmm0,(%rax)
2648 lea 16(%rax),%rax
2649.Lkey_expansion_192a_cold:
2650 movaps %xmm2, %xmm5
2651.Lkey_expansion_192b_warm:
2652 shufps \$0b00010000,%xmm0,%xmm4
2653 movdqa %xmm2,%xmm3
2654 xorps %xmm4,%xmm0
2655 shufps \$0b10001100,%xmm0,%xmm4
2656 pslldq \$4,%xmm3
2657 xorps %xmm4,%xmm0
2658 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
2659 pxor %xmm3,%xmm2
2660 pxor %xmm1,%xmm0
2661 pshufd \$0b11111111,%xmm0,%xmm3
2662 pxor %xmm3,%xmm2
2663 ret
2664
2665.align 16
2666.Lkey_expansion_192b:
2667 movaps %xmm0,%xmm3
2668 shufps \$0b01000100,%xmm0,%xmm5
2669 $movkey %xmm5,(%rax)
2670 shufps \$0b01001110,%xmm2,%xmm3
2671 $movkey %xmm3,16(%rax)
2672 lea 32(%rax),%rax
2673 jmp .Lkey_expansion_192b_warm
2674
2675.align 16
2676.Lkey_expansion_256a:
2677 $movkey %xmm2,(%rax)
2678 lea 16(%rax),%rax
2679.Lkey_expansion_256a_cold:
2680 shufps \$0b00010000,%xmm0,%xmm4
2681 xorps %xmm4,%xmm0
2682 shufps \$0b10001100,%xmm0,%xmm4
2683 xorps %xmm4,%xmm0
2684 shufps \$0b11111111,%xmm1,%xmm1 # critical path
2685 xorps %xmm1,%xmm0
2686 ret
2687
2688.align 16
2689.Lkey_expansion_256b:
2690 $movkey %xmm0,(%rax)
2691 lea 16(%rax),%rax
2692
2693 shufps \$0b00010000,%xmm2,%xmm4
2694 xorps %xmm4,%xmm2
2695 shufps \$0b10001100,%xmm2,%xmm4
2696 xorps %xmm4,%xmm2
2697 shufps \$0b10101010,%xmm1,%xmm1 # critical path
2698 xorps %xmm1,%xmm2
2699 ret
2700.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
2701.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
2702___
2703}
2704
2705$code.=<<___;
2706.align 64
2707.Lbswap_mask:
2708 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
2709.Lincrement32:
2710 .long 6,6,6,0
2711.Lincrement64:
2712 .long 1,0,0,0
2713.Lxts_magic:
2714 .long 0x87,0,1,0
2715
2716.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
2717.align 64
2718___
2719
2720# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2721# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2722if ($win64) {
2723$rec="%rcx";
2724$frame="%rdx";
2725$context="%r8";
2726$disp="%r9";
2727
2728$code.=<<___;
2729.extern __imp_RtlVirtualUnwind
2730___
2731$code.=<<___ if ($PREFIX eq "aesni");
2732.type ecb_se_handler,\@abi-omnipotent
2733.align 16
2734ecb_se_handler:
2735 push %rsi
2736 push %rdi
2737 push %rbx
2738 push %rbp
2739 push %r12
2740 push %r13
2741 push %r14
2742 push %r15
2743 pushfq
2744 sub \$64,%rsp
2745
2746 mov 152($context),%rax # pull context->Rsp
2747
2748 jmp .Lcommon_seh_tail
2749.size ecb_se_handler,.-ecb_se_handler
2750
2751.type ccm64_se_handler,\@abi-omnipotent
2752.align 16
2753ccm64_se_handler:
2754 push %rsi
2755 push %rdi
2756 push %rbx
2757 push %rbp
2758 push %r12
2759 push %r13
2760 push %r14
2761 push %r15
2762 pushfq
2763 sub \$64,%rsp
2764
2765 mov 120($context),%rax # pull context->Rax
2766 mov 248($context),%rbx # pull context->Rip
2767
2768 mov 8($disp),%rsi # disp->ImageBase
2769 mov 56($disp),%r11 # disp->HandlerData
2770
2771 mov 0(%r11),%r10d # HandlerData[0]
2772 lea (%rsi,%r10),%r10 # prologue label
2773 cmp %r10,%rbx # context->Rip<prologue label
2774 jb .Lcommon_seh_tail
2775
2776 mov 152($context),%rax # pull context->Rsp
2777
2778 mov 4(%r11),%r10d # HandlerData[1]
2779 lea (%rsi,%r10),%r10 # epilogue label
2780 cmp %r10,%rbx # context->Rip>=epilogue label
2781 jae .Lcommon_seh_tail
2782
2783 lea 0(%rax),%rsi # %xmm save area
2784 lea 512($context),%rdi # &context.Xmm6
2785 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
2786 .long 0xa548f3fc # cld; rep movsq
2787 lea 0x58(%rax),%rax # adjust stack pointer
2788
2789 jmp .Lcommon_seh_tail
2790.size ccm64_se_handler,.-ccm64_se_handler
2791
2792.type ctr32_se_handler,\@abi-omnipotent
2793.align 16
2794ctr32_se_handler:
2795 push %rsi
2796 push %rdi
2797 push %rbx
2798 push %rbp
2799 push %r12
2800 push %r13
2801 push %r14
2802 push %r15
2803 pushfq
2804 sub \$64,%rsp
2805
2806 mov 120($context),%rax # pull context->Rax
2807 mov 248($context),%rbx # pull context->Rip
2808
2809 lea .Lctr32_body(%rip),%r10
2810 cmp %r10,%rbx # context->Rip<"prologue" label
2811 jb .Lcommon_seh_tail
2812
2813 mov 152($context),%rax # pull context->Rsp
2814
2815 lea .Lctr32_ret(%rip),%r10
2816 cmp %r10,%rbx
2817 jae .Lcommon_seh_tail
2818
2819 lea 0x20(%rax),%rsi # %xmm save area
2820 lea 512($context),%rdi # &context.Xmm6
2821 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2822 .long 0xa548f3fc # cld; rep movsq
2823 lea 0xc8(%rax),%rax # adjust stack pointer
2824
2825 jmp .Lcommon_seh_tail
2826.size ctr32_se_handler,.-ctr32_se_handler
2827
2828.type xts_se_handler,\@abi-omnipotent
2829.align 16
2830xts_se_handler:
2831 push %rsi
2832 push %rdi
2833 push %rbx
2834 push %rbp
2835 push %r12
2836 push %r13
2837 push %r14
2838 push %r15
2839 pushfq
2840 sub \$64,%rsp
2841
2842 mov 120($context),%rax # pull context->Rax
2843 mov 248($context),%rbx # pull context->Rip
2844
2845 mov 8($disp),%rsi # disp->ImageBase
2846 mov 56($disp),%r11 # disp->HandlerData
2847
2848 mov 0(%r11),%r10d # HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
2850 cmp %r10,%rbx # context->Rip<prologue label
2851 jb .Lcommon_seh_tail
2852
2853 mov 152($context),%rax # pull context->Rsp
2854
2855 mov 4(%r11),%r10d # HandlerData[1]
2856 lea (%rsi,%r10),%r10 # epilogue label
2857 cmp %r10,%rbx # context->Rip>=epilogue label
2858 jae .Lcommon_seh_tail
2859
2860 lea 0x60(%rax),%rsi # %xmm save area
2861 lea 512($context),%rdi # & context.Xmm6
2862 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2863 .long 0xa548f3fc # cld; rep movsq
2864 lea 0x68+160(%rax),%rax # adjust stack pointer
2865
2866 jmp .Lcommon_seh_tail
2867.size xts_se_handler,.-xts_se_handler
2868___
2869$code.=<<___;
2870.type cbc_se_handler,\@abi-omnipotent
2871.align 16
2872cbc_se_handler:
2873 push %rsi
2874 push %rdi
2875 push %rbx
2876 push %rbp
2877 push %r12
2878 push %r13
2879 push %r14
2880 push %r15
2881 pushfq
2882 sub \$64,%rsp
2883
2884 mov 152($context),%rax # pull context->Rsp
2885 mov 248($context),%rbx # pull context->Rip
2886
2887 lea .Lcbc_decrypt(%rip),%r10
2888 cmp %r10,%rbx # context->Rip<"prologue" label
2889 jb .Lcommon_seh_tail
2890
2891 lea .Lcbc_decrypt_body(%rip),%r10
2892 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
2893 jb .Lrestore_cbc_rax
2894
2895 lea .Lcbc_ret(%rip),%r10
2896 cmp %r10,%rbx # context->Rip>="epilogue" label
2897 jae .Lcommon_seh_tail
2898
2899 lea 0(%rax),%rsi # top of stack
2900 lea 512($context),%rdi # &context.Xmm6
2901 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
2902 .long 0xa548f3fc # cld; rep movsq
2903 lea 0x58(%rax),%rax # adjust stack pointer
2904 jmp .Lcommon_seh_tail
2905
2906.Lrestore_cbc_rax:
2907 mov 120($context),%rax
2908
2909.Lcommon_seh_tail:
2910 mov 8(%rax),%rdi
2911 mov 16(%rax),%rsi
2912 mov %rax,152($context) # restore context->Rsp
2913 mov %rsi,168($context) # restore context->Rsi
2914 mov %rdi,176($context) # restore context->Rdi
2915
2916 mov 40($disp),%rdi # disp->ContextRecord
2917 mov $context,%rsi # context
2918 mov \$154,%ecx # sizeof(CONTEXT)
2919 .long 0xa548f3fc # cld; rep movsq
2920
2921 mov $disp,%rsi
2922 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2923 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2924 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2925 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2926 mov 40(%rsi),%r10 # disp->ContextRecord
2927 lea 56(%rsi),%r11 # &disp->HandlerData
2928 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2929 mov %r10,32(%rsp) # arg5
2930 mov %r11,40(%rsp) # arg6
2931 mov %r12,48(%rsp) # arg7
2932 mov %rcx,56(%rsp) # arg8, (NULL)
2933 call *__imp_RtlVirtualUnwind(%rip)
2934
2935 mov \$1,%eax # ExceptionContinueSearch
2936 add \$64,%rsp
2937 popfq
2938 pop %r15
2939 pop %r14
2940 pop %r13
2941 pop %r12
2942 pop %rbp
2943 pop %rbx
2944 pop %rdi
2945 pop %rsi
2946 ret
2947.size cbc_se_handler,.-cbc_se_handler
2948
2949.section .pdata
2950.align 4
2951___
2952$code.=<<___ if ($PREFIX eq "aesni");
2953 .rva .LSEH_begin_aesni_ecb_encrypt
2954 .rva .LSEH_end_aesni_ecb_encrypt
2955 .rva .LSEH_info_ecb
2956
2957 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
2958 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
2959 .rva .LSEH_info_ccm64_enc
2960
2961 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
2962 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
2963 .rva .LSEH_info_ccm64_dec
2964
2965 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
2966 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
2967 .rva .LSEH_info_ctr32
2968
2969 .rva .LSEH_begin_aesni_xts_encrypt
2970 .rva .LSEH_end_aesni_xts_encrypt
2971 .rva .LSEH_info_xts_enc
2972
2973 .rva .LSEH_begin_aesni_xts_decrypt
2974 .rva .LSEH_end_aesni_xts_decrypt
2975 .rva .LSEH_info_xts_dec
2976___
2977$code.=<<___;
2978 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
2979 .rva .LSEH_end_${PREFIX}_cbc_encrypt
2980 .rva .LSEH_info_cbc
2981
2982 .rva ${PREFIX}_set_decrypt_key
2983 .rva .LSEH_end_set_decrypt_key
2984 .rva .LSEH_info_key
2985
2986 .rva ${PREFIX}_set_encrypt_key
2987 .rva .LSEH_end_set_encrypt_key
2988 .rva .LSEH_info_key
2989.section .xdata
2990.align 8
2991___
2992$code.=<<___ if ($PREFIX eq "aesni");
2993.LSEH_info_ecb:
2994 .byte 9,0,0,0
2995 .rva ecb_se_handler
2996.LSEH_info_ccm64_enc:
2997 .byte 9,0,0,0
2998 .rva ccm64_se_handler
2999 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
3000.LSEH_info_ccm64_dec:
3001 .byte 9,0,0,0
3002 .rva ccm64_se_handler
3003 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
3004.LSEH_info_ctr32:
3005 .byte 9,0,0,0
3006 .rva ctr32_se_handler
3007.LSEH_info_xts_enc:
3008 .byte 9,0,0,0
3009 .rva xts_se_handler
3010 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3011.LSEH_info_xts_dec:
3012 .byte 9,0,0,0
3013 .rva xts_se_handler
3014 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3015___
3016$code.=<<___;
3017.LSEH_info_cbc:
3018 .byte 9,0,0,0
3019 .rva cbc_se_handler
3020.LSEH_info_key:
3021 .byte 0x01,0x04,0x01,0x00
3022 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
3023___
3024}
3025
# Compute an optional REX prefix for a two-xmm-register instruction and
# append it to the opcode byte list when either register is %xmm8..%xmm15.
# Arguments: an array reference holding the opcode bytes accumulated so
# far, then the destination and source register numbers.
sub rex {
  my ($bytes, $dst, $src) = @_;
  my $prefix = 0;

  $prefix |= 0x04 if ($dst >= 8);	# REX.R: destination in xmm8-15
  $prefix |= 0x01 if ($src >= 8);	# REX.B: source in xmm8-15
  push @$bytes, $prefix | 0x40 if ($prefix);
}
3035
# Assemble one AES-NI mnemonic into a raw ".byte" directive so the output
# can be consumed by assemblers that predate AES-NI support.  Lines that
# do not match a recognized AES-NI pattern are returned unmodified.
sub aesni {
  my $line = shift;
  my @opcode = (0x66);			# mandatory operand-size prefix

  # three-operand form with immediate: aeskeygenassist $imm,%xmmS,%xmmD
  if ($line =~ /(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
    my ($imm, $src, $dst) = ($2, $3, $4);
    rex(\@opcode, $dst, $src);
    push @opcode, 0x0f, 0x3a, 0xdf;			# escape bytes + AESKEYGENASSIST
    push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);	# ModR/M, register-direct
    push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;	# immediate: hex/octal or decimal
    return ".byte\t" . join(',', @opcode);
  }

  # two-operand forms: aesimc/aesenc/aesenclast/aesdec/aesdeclast %xmmS,%xmmD
  if ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
    my %final_byte = (
      "aesimc" => 0xdb,
      "aesenc" => 0xdc, "aesenclast" => 0xdd,
      "aesdec" => 0xde, "aesdeclast" => 0xdf
    );
    my ($mnemonic, $src, $dst) = ($1, $2, $3);
    return undef unless defined $final_byte{$mnemonic};
    rex(\@opcode, $dst, $src);
    push @opcode, 0x0f, 0x38, $final_byte{$mnemonic};
    push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);	# ModR/M, register-direct
    return ".byte\t" . join(',', @opcode);
  }

  return $line;
}
3062
# Fold backquoted arithmetic (e.g. `16*1`) in the generated text into
# literal constants.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Rewrite AES-NI mnemonics into ".byte" sequences via aesni() so the
# output assembles on toolchains without AES-NI support.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;

close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
deleted file mode 100644
index c9c6312fa7..0000000000
--- a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
+++ /dev/null
@@ -1,3044 +0,0 @@
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode] ###
5### bitsliced implementation for Intel Core 2 processors ###
6### requires support of SSE extensions up to SSSE3 ###
7### Author: Emilia Käsper and Peter Schwabe ###
8### Date: 2009-03-19 ###
9### Public domain ###
10### ###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12### further information. ###
13###################################################################
14#
15# September 2011.
16#
17# Started as transliteration to "perlasm" the original code has
18# undergone following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22# from 12.5KB to 2.2KB;
23# - above was possible thanks to mixcolumns() modification that
24# allowed to feed its output back to aesenc[last], this was
25# achieved at cost of two additional inter-register moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement key setup subroutine, instead it
28# relies on conversion of "conventional" key schedule as returned
29# by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which allowed
31# to skip one shiftrows(), reduce bit-sliced key schedule and
32# speed-up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38# Emilia's this(*) difference
39#
40# Core 2 9.30 8.69 +7%
41# Nehalem(**) 7.63 6.98 +9%
42# Atom 17.1 17.4 -2%(***)
43#
44# (*) Comparison is not completely fair, because "this" is ECB,
45# i.e. no extra processing such as counter values calculation
46# and xor-ing input as in Emilia's CTR implementation is
47# performed. However, the CTR calculations stand for not more
48# than 1% of total time, so comparison is *rather* fair.
49#
50# (**) Results were collected on Westmere, which is considered to
51# be equivalent to Nehalem for this code.
52#
53# (***) Slowdown on Atom is rather strange per se, because original
54# implementation has a number of 9+-bytes instructions, which
55# are bad for Atom front-end, and which I eliminated completely.
56# In attempt to address deterioration sbox() was tested in FP
57# SIMD "domain" (movaps instead of movdqa, xorps instead of
58# pxor, etc.). While it resulted in nominal 4% improvement on
59# Atom, it hurt Westmere by more than 2x factor.
60#
61# As for key schedule conversion subroutine. Interface to OpenSSL
62# relies on per-invocation on-the-fly conversion. This naturally
63# has impact on performance, especially for short inputs. Conversion
64# time in CPU cycles and its ratio to CPU cycles spent in 8x block
65# function is:
66#
67# conversion conversion/8x block
68# Core 2 240 0.22
69# Nehalem 180 0.20
70# Atom 430 0.19
71#
72# The ratio values mean that 128-byte blocks will be processed
73# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74# etc. Then keep in mind that input sizes not divisible by 128 are
75# *effectively* slower, especially shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78# it's still faster than ["hyper-threading-safe" code path in]
79# aes-x86_64.pl on all lengths above 64 bytes...
80#
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2 11.0
87# Nehalem 9.16
88# Atom 20.9
89#
90# November 2011.
91#
92# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93# suboptimal, but XTS is meant to be used with larger blocks...
94#
95# <appro@openssl.org>
96
# Command-line / environment setup: pick the perlasm "flavour" and output
# file, detect Win64 targets, locate the x86_64-xlate.pl translator next
# to this script (or in ../../perlasm), and pipe all generated code
# through it.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output";

# NOTE(review): five names but four initializers — $ivp is undef here;
# presumably rebound in the CBC/CTR/XTS glue further down, confirm there.
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...

{
# registers used by the 8x block core (shadow the outer $key on purpose)
my ($key,$rounds,$const)=("%rax","%r10d","%r11");
116
# Emit the bitsliced AES S-box: a linear input basis change, inversion in
# GF(2^8) (via the tower-field circuit in Inv_GF256), and an output basis
# change.  The index permutations route each step's outputs to the
# registers the next step expects — do not reorder.
sub Sbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}
127
# Linear change of basis applied before GF(2^8) inversion.  The XOR
# sequence is lifted from the hardware-optimized circuit; statement order
# encodes the data dependencies and must not change.
sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}
150
# Linear change of basis applied after GF(2^8) inversion; together with
# InBasisChange it realizes the affine part of the AES S-box.
sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}
171
# Emit the bitsliced inverse AES S-box: inverse basis changes around the
# same GF(2^8) inversion core (inversion is an involution, so Inv_GF256
# is shared with the forward S-box).
sub InvSbox {
# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}
182
# OutBasisChange run backwards; the @b index permutation re-labels the
# registers so the reversed XOR list can be written verbatim.  Note the
# heredoc append has no trailing semicolon — legal as the sub's last
# statement, kept as-is.
sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}
201
# InBasisChange run backwards (see InvInBasisChange for the register
# re-labelling trick).
sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[0], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}
222
# GF(2^2) multiplication of two bitsliced elements (x0:x1) * (y0:y1),
# result in x0:x1.  Clobbers $t0; the y inputs are preserved.
sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}
239
# GF(2^2) multiply combined with scaling by N; differs from Mul_GF4 only
# in the destinations of the last two XORs.  Kept for reference — the
# interleaved Mul_GF4_N_GF4 below is what the generator actually uses.
sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}
254
# One Mul_GF4_N (on x0:x1 by y0:y1) and one Mul_GF4 (on x2:x3 by y2:y3)
# with their instruction streams interleaved to hide latencies.  The two
# computations are independent; the interleaving order itself is a
# scheduling choice, not a dependency.
sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	 movdqa	$y2, $t1
	pxor	$y1, $t0
	 pxor	$y3, $t1
	pand	$x0, $t0
	 pand	$x2, $t1
	pxor	$x1, $x0
	 pxor	$x3, $x2
	pand	$y0, $x1
	 pand	$y2, $x3
	pand	$y1, $x0
	 pand	$y3, $x2
	pxor	$x0, $x1
	 pxor	$x3, $x2
	pxor	$t0, $x0
	 pxor	$t1, $x3
___
}
# Two parallel GF(2^4) multiplications built from GF(2^2) ones
# (Karatsuba-style: shared y-sums, Mul_GF4 on the halves, Mul_GF4_N_GF4
# for the cross terms).  Operates on the eight bitsliced x registers
# using @y as the multiplier shares and @t as scratch.
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
# Bitsliced inversion in GF(2^8) via the tower-field construction: build
# the GF(2^4) "interlude" terms, invert in GF(2^4) with the compact
# circuit below, then multiply back with Mul_GF16_2.  The XOR/AND/OR
# sequence is a direct transcription of a hardware-optimized circuit —
# every statement's position is a data dependency.
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	 movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	 movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	 movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
427
428# AES linear components
429
# AddRoundKey + ShiftRows for all eight bitsliced state registers.
# The round key XORs and pshufb byte rotations are interleaved for
# throughput; $key advances by 0x80 (eight 16-byte slices) per round.
sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pshufb	$mask,@x[0]
	pxor	0x20($key),@x[2]
	pshufb	$mask,@x[1]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[2]
	pxor	0x40($key),@x[4]
	pshufb	$mask,@x[3]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[4]
	pxor	0x60($key),@x[6]
	pshufb	$mask,@x[5]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[6]
	lea	0x80($key),$key
	pshufb	$mask,@x[7]
___
}
453
# Bitsliced MixColumns, using pshufd 0x93/0x4E as 32/64-bit rotates of
# each 128-bit lane.  Modified (see file header) so the output register
# order feeds straight back into the next round — hence the final movdqa
# shuffle into @x[2]/@x[7].
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	 pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	 pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	 pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	 pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	 pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	 pxor	@t[6], @x[6]
	 pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	 pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	 pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	 pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	 pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	 pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	 pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	 pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	 pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	 pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	 pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]

	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	 movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	 movdqa	@t[1], @x[7]
___
}
508
# Bitsliced inverse MixColumns: multiplications of the state by 0x0e,
# 0x0b, 0x0d and 0x09, with heavy register recycling (the "clobber/
# restore" comments track temporaries reused across stages).  The @y
# re-labelling maps the 0x0e-stage output registers to the names the
# later stages expect.  Final movdqa block deposits the result into the
# fixed @XMM registers for the next round.
sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
					my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}
662
# One full bitsliced AES encryption round (ShiftRows+AddRoundKey, Sbox,
# MixColumns) using the .LSR shuffle mask.  Unused by the generator —
# kept as documentation of how the pieces compose.
sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}
673
# Final bitsliced AES round: ShiftRows (with the .LSRM0 mask), Sbox, and
# the last-round-key XOR instead of MixColumns.  Unused by the generator.
# Heredoc append intentionally has no trailing semicolon (last statement).
sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}
693
# Classic bit-matrix transpose step: exchange the bit groups selected by
# $mask between $a and (b >> n), using $t as scratch.  Building block of
# the bitslice conversion.
sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
# Two independent swapmove operations with their instructions interleaved
# pairwise to hide latencies; semantics identical to calling swapmove
# twice.
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	 movdqa	$b1,$t1
	 psrlq	\$$n,$b1
	pxor	$a0,$b0
	 pxor	$a1,$b1
	pand	$mask,$b0
	 pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	 pxor	$b1,$a1
	 psllq	\$$n,$b1
	pxor	$t0,$b0
	 pxor	$t1,$b1
___
}
725
# Convert eight 128-bit state registers to/from bitsliced representation
# (an 8x8 bit-matrix transpose done with swapmove at strides 1, 2, 4
# using the .LBS0/.LBS1/.LBS2 masks).  The transform is an involution,
# so the same routine converts in both directions.
sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
744
# Emit the two 8-block cores.  Each: XOR round-0 key, apply the M0SR /
# M0ISR byte permutation, bitslice, then loop ShiftRows -> (Inv)Sbox ->
# (Inv)MixColumns with the loop entered at the Sbox label so the final
# iteration skips MixColumns; un-bitslice and XOR the last round key.
# The register-order permutations on the &Sbox/&MixColumns calls are the
# S-box's natural output order — see the "output in lsb > ..." comments.
$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
# register assignment for the key-schedule conversion code below
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
880
# Bitslice conversion specialized for round keys: since all eight lanes
# hold the SAME key block, the stride-1 and stride-2 swapmove stages
# collapse to plain register copies (the commented-out swapmove calls
# show what was eliminated); only the stride-4 stage is done in full.
sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}
904
# Emit _bsaes_key_convert: expand a conventional AES key schedule at
# ($inp) into the bit-sliced form at ($out).  Each round key is byte-
# permuted (.LM0), each bit plane extracted by pand/pcmpeqb against the
# 0x01..0x80 masks (shifted up then back down across iterations), with
# alternate planes complemented ("pnot") to fold the S-box's constant.
# On return %xmm7 holds .L63 and %xmm6 the unsaved last round key; the
# callers combine them to fix up the final round key.
$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0,	%xmm8
	movdqa	%xmm1,	%xmm9

	pand	%xmm6, %xmm8
	pand	%xmm6, %xmm9
	movdqa	%xmm2,	%xmm10
	pcmpeqb	%xmm0, %xmm8
	psllq	\$4, %xmm0		# 0x10...
	movdqa	%xmm3,	%xmm11
	pcmpeqb	%xmm1, %xmm9
	psllq	\$4, %xmm1		# 0x20...

	pand	%xmm6, %xmm10
	pand	%xmm6, %xmm11
	movdqa	%xmm0,	%xmm12
	pcmpeqb	%xmm2, %xmm10
	psllq	\$4, %xmm2		# 0x40...
	movdqa	%xmm1,	%xmm13
	pcmpeqb	%xmm3, %xmm11
	psllq	\$4, %xmm3		# 0x80...

	movdqa	%xmm2,	%xmm14
	movdqa	%xmm3,	%xmm15
	pxor	%xmm5, %xmm8		# "pnot"
	pxor	%xmm5, %xmm9

	pand	%xmm6, %xmm12
	pand	%xmm6, %xmm13
	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0, %xmm12
	psrlq	\$4, %xmm0		# 0x01...
	movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1, %xmm13
	psrlq	\$4, %xmm1		# 0x02...
	lea	0x10($inp), $inp

	pand	%xmm6, %xmm14
	pand	%xmm6, %xmm15
	movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2, %xmm14
	psrlq	\$4, %xmm2		# 0x04...
	movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3, %xmm15
	psrlq	\$4, %xmm3		# 0x08...
	movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}
990
# Benchmarking-only entry points (raw 128-bit-key ECB over whole 8-block
# groups, plus the two key-convert wrappers).  Deliberately compiled out
# with "if (0 ...)" — not a supported interface; kept for reference.
# NOTE(review): "\@function,4" here assumes the SysV calling convention,
# hence the !$win64 guard.
if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
1088{
1089######################################################################
1090#
1091# OpenSSL interface
1092#
1093my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1094 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1095my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1096
1097if ($ecb) {
1098$code.=<<___;
1099.globl bsaes_ecb_encrypt_blocks
1100.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1101.align 16
1102bsaes_ecb_encrypt_blocks:
1103 mov %rsp, %rax
1104.Lecb_enc_prologue:
1105 push %rbp
1106 push %rbx
1107 push %r12
1108 push %r13
1109 push %r14
1110 push %r15
1111 lea -0x48(%rsp),%rsp
1112___
1113$code.=<<___ if ($win64);
1114 lea -0xa0(%rsp), %rsp
1115 movaps %xmm6, 0x40(%rsp)
1116 movaps %xmm7, 0x50(%rsp)
1117 movaps %xmm8, 0x60(%rsp)
1118 movaps %xmm9, 0x70(%rsp)
1119 movaps %xmm10, 0x80(%rsp)
1120 movaps %xmm11, 0x90(%rsp)
1121 movaps %xmm12, 0xa0(%rsp)
1122 movaps %xmm13, 0xb0(%rsp)
1123 movaps %xmm14, 0xc0(%rsp)
1124 movaps %xmm15, 0xd0(%rsp)
1125.Lecb_enc_body:
1126___
1127$code.=<<___;
1128 mov %rsp,%rbp # backup %rsp
1129 mov 240($arg4),%eax # rounds
1130 mov $arg1,$inp # backup arguments
1131 mov $arg2,$out
1132 mov $arg3,$len
1133 mov $arg4,$key
1134 cmp \$8,$arg3
1135 jb .Lecb_enc_short
1136
1137 mov %eax,%ebx # backup rounds
1138 shl \$7,%rax # 128 bytes per inner round key
1139 sub \$`128-32`,%rax # size of bit-sliced key schedule
1140 sub %rax,%rsp
1141 mov %rsp,%rax # pass key schedule
1142 mov $key,%rcx # pass key
1143 mov %ebx,%r10d # pass rounds
1144 call _bsaes_key_convert
1145 pxor %xmm6,%xmm7 # fix up last round key
1146 movdqa %xmm7,(%rax) # save last round key
1147
1148 sub \$8,$len
1149.Lecb_enc_loop:
1150 movdqu 0x00($inp), @XMM[0] # load input
1151 movdqu 0x10($inp), @XMM[1]
1152 movdqu 0x20($inp), @XMM[2]
1153 movdqu 0x30($inp), @XMM[3]
1154 movdqu 0x40($inp), @XMM[4]
1155 movdqu 0x50($inp), @XMM[5]
1156 mov %rsp, %rax # pass key schedule
1157 movdqu 0x60($inp), @XMM[6]
1158 mov %ebx,%r10d # pass rounds
1159 movdqu 0x70($inp), @XMM[7]
1160 lea 0x80($inp), $inp
1161
1162 call _bsaes_encrypt8
1163
1164 movdqu @XMM[0], 0x00($out) # write output
1165 movdqu @XMM[1], 0x10($out)
1166 movdqu @XMM[4], 0x20($out)
1167 movdqu @XMM[6], 0x30($out)
1168 movdqu @XMM[3], 0x40($out)
1169 movdqu @XMM[7], 0x50($out)
1170 movdqu @XMM[2], 0x60($out)
1171 movdqu @XMM[5], 0x70($out)
1172 lea 0x80($out), $out
1173 sub \$8,$len
1174 jnc .Lecb_enc_loop
1175
1176 add \$8,$len
1177 jz .Lecb_enc_done
1178
1179 movdqu 0x00($inp), @XMM[0] # load input
1180 mov %rsp, %rax # pass key schedule
1181 mov %ebx,%r10d # pass rounds
1182 cmp \$2,$len
1183 jb .Lecb_enc_one
1184 movdqu 0x10($inp), @XMM[1]
1185 je .Lecb_enc_two
1186 movdqu 0x20($inp), @XMM[2]
1187 cmp \$4,$len
1188 jb .Lecb_enc_three
1189 movdqu 0x30($inp), @XMM[3]
1190 je .Lecb_enc_four
1191 movdqu 0x40($inp), @XMM[4]
1192 cmp \$6,$len
1193 jb .Lecb_enc_five
1194 movdqu 0x50($inp), @XMM[5]
1195 je .Lecb_enc_six
1196 movdqu 0x60($inp), @XMM[6]
1197 call _bsaes_encrypt8
1198 movdqu @XMM[0], 0x00($out) # write output
1199 movdqu @XMM[1], 0x10($out)
1200 movdqu @XMM[4], 0x20($out)
1201 movdqu @XMM[6], 0x30($out)
1202 movdqu @XMM[3], 0x40($out)
1203 movdqu @XMM[7], 0x50($out)
1204 movdqu @XMM[2], 0x60($out)
1205 jmp .Lecb_enc_done
1206.align 16
1207.Lecb_enc_six:
1208 call _bsaes_encrypt8
1209 movdqu @XMM[0], 0x00($out) # write output
1210 movdqu @XMM[1], 0x10($out)
1211 movdqu @XMM[4], 0x20($out)
1212 movdqu @XMM[6], 0x30($out)
1213 movdqu @XMM[3], 0x40($out)
1214 movdqu @XMM[7], 0x50($out)
1215 jmp .Lecb_enc_done
1216.align 16
1217.Lecb_enc_five:
1218 call _bsaes_encrypt8
1219 movdqu @XMM[0], 0x00($out) # write output
1220 movdqu @XMM[1], 0x10($out)
1221 movdqu @XMM[4], 0x20($out)
1222 movdqu @XMM[6], 0x30($out)
1223 movdqu @XMM[3], 0x40($out)
1224 jmp .Lecb_enc_done
1225.align 16
1226.Lecb_enc_four:
1227 call _bsaes_encrypt8
1228 movdqu @XMM[0], 0x00($out) # write output
1229 movdqu @XMM[1], 0x10($out)
1230 movdqu @XMM[4], 0x20($out)
1231 movdqu @XMM[6], 0x30($out)
1232 jmp .Lecb_enc_done
1233.align 16
1234.Lecb_enc_three:
1235 call _bsaes_encrypt8
1236 movdqu @XMM[0], 0x00($out) # write output
1237 movdqu @XMM[1], 0x10($out)
1238 movdqu @XMM[4], 0x20($out)
1239 jmp .Lecb_enc_done
1240.align 16
1241.Lecb_enc_two:
1242 call _bsaes_encrypt8
1243 movdqu @XMM[0], 0x00($out) # write output
1244 movdqu @XMM[1], 0x10($out)
1245 jmp .Lecb_enc_done
1246.align 16
1247.Lecb_enc_one:
1248 call _bsaes_encrypt8
1249 movdqu @XMM[0], 0x00($out) # write output
1250 jmp .Lecb_enc_done
1251.align 16
1252.Lecb_enc_short:
1253 lea ($inp), $arg1
1254 lea ($out), $arg2
1255 lea ($key), $arg3
1256 call asm_AES_encrypt
1257 lea 16($inp), $inp
1258 lea 16($out), $out
1259 dec $len
1260 jnz .Lecb_enc_short
1261
1262.Lecb_enc_done:
1263 lea (%rsp),%rax
1264 pxor %xmm0, %xmm0
1265.Lecb_enc_bzero: # wipe key schedule [if any]
1266 movdqa %xmm0, 0x00(%rax)
1267 movdqa %xmm0, 0x10(%rax)
1268 lea 0x20(%rax), %rax
1269 cmp %rax, %rbp
1270 jb .Lecb_enc_bzero
1271
1272 lea (%rbp),%rsp # restore %rsp
1273___
1274$code.=<<___ if ($win64);
1275 movaps 0x40(%rbp), %xmm6
1276 movaps 0x50(%rbp), %xmm7
1277 movaps 0x60(%rbp), %xmm8
1278 movaps 0x70(%rbp), %xmm9
1279 movaps 0x80(%rbp), %xmm10
1280 movaps 0x90(%rbp), %xmm11
1281 movaps 0xa0(%rbp), %xmm12
1282 movaps 0xb0(%rbp), %xmm13
1283 movaps 0xc0(%rbp), %xmm14
1284 movaps 0xd0(%rbp), %xmm15
1285 lea 0xa0(%rbp), %rsp
1286___
1287$code.=<<___;
1288 mov 0x48(%rsp), %r15
1289 mov 0x50(%rsp), %r14
1290 mov 0x58(%rsp), %r13
1291 mov 0x60(%rsp), %r12
1292 mov 0x68(%rsp), %rbx
1293 mov 0x70(%rsp), %rax
1294 lea 0x78(%rsp), %rsp
1295 mov %rax, %rbp
1296.Lecb_enc_epilogue:
1297 ret
1298.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1299
.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
# Bit-sliced AES-ECB decryption of a whole number of 16-byte blocks:
# arguments are input pointer, output pointer, block count, expanded
# AES key.  Eight blocks are decrypted per _bsaes_decrypt8 call; a tail
# of 1..7 blocks is fed through _bsaes_decrypt8 with garbage in the
# unused registers (only the valid outputs are stored); inputs shorter
# than 8 blocks fall back to one-at-a-time asm_AES_decrypt.
bsaes_ecb_decrypt_blocks:
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp		# allocate key schedule on stack
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	# _bsaes_decrypt8 returns blocks permuted; store order 0,1,6,4,2,7,3,5
	# undoes that permutation (note it differs from the encrypt order).
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	# 1..7 leftover blocks: load what exists, dispatch on count.
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_dec_bzero		# was "jb": that exits after one
					# 32-byte iteration, leaving most of
					# the key schedule unwiped; "ja"
					# matches the cbc/ctr/xts wipe loops

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1501___
1502}
1503$code.=<<___;
.extern	asm_AES_cbc_encrypt
# Bit-sliced AES-CBC.  Encryption (direction flag non-zero) and inputs
# shorter than 128 bytes are delegated wholesale to asm_AES_cbc_encrypt;
# only bulk CBC *decryption* is bit-sliced here, 8 blocks per
# _bsaes_decrypt8 call, with the ciphertext re-loaded after decryption
# to serve as the XOR chaining values.
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
bsaes_cbc_encrypt:
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt	# encryption: not bit-sliced
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt	# short input: not worth setup cost

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx		# ivp kept in %rbx for the duration
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	# CBC unchain: each decrypted block is XORed with the previous
	# ciphertext block, which is why the input is re-loaded here;
	# register order 0,1,6,4,2,7,3,5 undoes the decrypt8 permutation.
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len
	jz	.Lcbc_dec_done

	# 1..7 leftover blocks: load what exists, dispatch on count.
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	# single block: use the table-based routine, buffering the result
	# at 0x20(%rbp) so the chaining value in xmm15 is preserved
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1791
# Bit-sliced AES-CTR with a 32-bit big-endian counter in the last word
# of the IV.  Eight counter values are materialised per iteration via
# the .LADD1..8 tables and encrypted together; the keystream is XORed
# onto the input.  Fewer than 8 blocks total go one-at-a-time through
# asm_AES_encrypt (.Lctr_enc_short).
.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	# Pre-swap the round-0 key and the counter so that paddd can
	# increment the counter while it is in byte-swapped form.
	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice	# round 0 already applied above

	sub	\$8,$len
	jc	.Lctr_enc_loop_done	# fewer than 8 blocks remain

	# XOR keystream (permuted order 0,1,4,6,3,7,2,5) onto the input.
	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8: advance counter by 8
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	# 1..7 blocks of keystream already computed; consume what is needed.
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	# One block at a time: counter block lives at 0x20(%rbp),
	# keystream buffer at 0x30(%rbp).
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter; NOTE(review):
					# stores via %rsp where the load used
					# %rbp - presumably equivalent because
					# no key schedule was allocated on this
					# path so %rsp == %rbp; confirm
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2023___
2024######################################################################
2025# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2026# const AES_KEY *key1, const AES_KEY *key2,
2027# const unsigned char iv[16]);
2028#
2029my ($twmask,$twres,$twtmp)=@XMM[13..15];
2030$code.=<<___;
2031.globl bsaes_xts_encrypt
2032.type bsaes_xts_encrypt,\@abi-omnipotent
2033.align 16
2034bsaes_xts_encrypt:
2035 mov %rsp, %rax
2036.Lxts_enc_prologue:
2037 push %rbp
2038 push %rbx
2039 push %r12
2040 push %r13
2041 push %r14
2042 push %r15
2043 lea -0x48(%rsp), %rsp
2044___
2045$code.=<<___ if ($win64);
2046 mov 0xa0(%rsp),$arg5 # pull key2
2047 mov 0xa8(%rsp),$arg6 # pull ivp
2048 lea -0xa0(%rsp), %rsp
2049 movaps %xmm6, 0x40(%rsp)
2050 movaps %xmm7, 0x50(%rsp)
2051 movaps %xmm8, 0x60(%rsp)
2052 movaps %xmm9, 0x70(%rsp)
2053 movaps %xmm10, 0x80(%rsp)
2054 movaps %xmm11, 0x90(%rsp)
2055 movaps %xmm12, 0xa0(%rsp)
2056 movaps %xmm13, 0xb0(%rsp)
2057 movaps %xmm14, 0xc0(%rsp)
2058 movaps %xmm15, 0xd0(%rsp)
2059.Lxts_enc_body:
2060___
2061$code.=<<___;
2062 mov %rsp, %rbp # backup %rsp
2063 mov $arg1, $inp # backup arguments
2064 mov $arg2, $out
2065 mov $arg3, $len
2066 mov $arg4, $key
2067
2068 lea ($arg6), $arg1
2069 lea 0x20(%rbp), $arg2
2070 lea ($arg5), $arg3
2071 call asm_AES_encrypt # generate initial tweak
2072
2073 mov 240($key), %eax # rounds
2074 mov $len, %rbx # backup $len
2075
2076 mov %eax, %edx # rounds
2077 shl \$7, %rax # 128 bytes per inner round key
2078 sub \$`128-32`, %rax # size of bit-sliced key schedule
2079 sub %rax, %rsp
2080
2081 mov %rsp, %rax # pass key schedule
2082 mov $key, %rcx # pass key
2083 mov %edx, %r10d # pass rounds
2084 call _bsaes_key_convert
2085 pxor %xmm6, %xmm7 # fix up last round key
2086 movdqa %xmm7, (%rax) # save last round key
2087
2088 and \$-16, $len
2089 sub \$0x80, %rsp # place for tweak[8]
2090 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2091
2092 pxor $twtmp, $twtmp
2093 movdqa .Lxts_magic(%rip), $twmask
2094 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2095
2096 sub \$0x80, $len
2097 jc .Lxts_enc_short
2098 jmp .Lxts_enc_loop
2099
2100.align 16
2101.Lxts_enc_loop:
2102___
2103 for ($i=0;$i<7;$i++) {
2104 $code.=<<___;
2105 pshufd \$0x13, $twtmp, $twres
2106 pxor $twtmp, $twtmp
2107 movdqa @XMM[7], @XMM[$i]
2108 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2109 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2110 pand $twmask, $twres # isolate carry and residue
2111 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2112 pxor $twres, @XMM[7]
2113___
2114 $code.=<<___ if ($i>=1);
2115 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2116___
2117 $code.=<<___ if ($i>=2);
2118 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2119___
2120 }
2121$code.=<<___;
2122 movdqu 0x60($inp), @XMM[8+6]
2123 pxor @XMM[8+5], @XMM[5]
2124 movdqu 0x70($inp), @XMM[8+7]
2125 lea 0x80($inp), $inp
2126 movdqa @XMM[7], 0x70(%rsp)
2127 pxor @XMM[8+6], @XMM[6]
2128 lea 0x80(%rsp), %rax # pass key schedule
2129 pxor @XMM[8+7], @XMM[7]
2130 mov %edx, %r10d # pass rounds
2131
2132 call _bsaes_encrypt8
2133
2134 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2135 pxor 0x10(%rsp), @XMM[1]
2136 movdqu @XMM[0], 0x00($out) # write output
2137 pxor 0x20(%rsp), @XMM[4]
2138 movdqu @XMM[1], 0x10($out)
2139 pxor 0x30(%rsp), @XMM[6]
2140 movdqu @XMM[4], 0x20($out)
2141 pxor 0x40(%rsp), @XMM[3]
2142 movdqu @XMM[6], 0x30($out)
2143 pxor 0x50(%rsp), @XMM[7]
2144 movdqu @XMM[3], 0x40($out)
2145 pxor 0x60(%rsp), @XMM[2]
2146 movdqu @XMM[7], 0x50($out)
2147 pxor 0x70(%rsp), @XMM[5]
2148 movdqu @XMM[2], 0x60($out)
2149 movdqu @XMM[5], 0x70($out)
2150 lea 0x80($out), $out
2151
2152 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2153 pxor $twtmp, $twtmp
2154 movdqa .Lxts_magic(%rip), $twmask
2155 pcmpgtd @XMM[7], $twtmp
2156 pshufd \$0x13, $twtmp, $twres
2157 pxor $twtmp, $twtmp
2158 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2159 pand $twmask, $twres # isolate carry and residue
2160 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2161 pxor $twres, @XMM[7]
2162
2163 sub \$0x80,$len
2164 jnc .Lxts_enc_loop
2165
2166.Lxts_enc_short:
2167 add \$0x80, $len
2168 jz .Lxts_enc_done
2169___
2170 for ($i=0;$i<7;$i++) {
2171 $code.=<<___;
2172 pshufd \$0x13, $twtmp, $twres
2173 pxor $twtmp, $twtmp
2174 movdqa @XMM[7], @XMM[$i]
2175 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2176 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2177 pand $twmask, $twres # isolate carry and residue
2178 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2179 pxor $twres, @XMM[7]
2180___
2181 $code.=<<___ if ($i>=1);
2182 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2183 cmp \$`0x10*$i`,$len
2184 je .Lxts_enc_$i
2185___
2186 $code.=<<___ if ($i>=2);
2187 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2188___
2189 }
2190$code.=<<___;
2191 movdqu 0x60($inp), @XMM[8+6]
2192 pxor @XMM[8+5], @XMM[5]
2193 movdqa @XMM[7], 0x70(%rsp)
2194 lea 0x70($inp), $inp
2195 pxor @XMM[8+6], @XMM[6]
2196 lea 0x80(%rsp), %rax # pass key schedule
2197 mov %edx, %r10d # pass rounds
2198
2199 call _bsaes_encrypt8
2200
2201 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2202 pxor 0x10(%rsp), @XMM[1]
2203 movdqu @XMM[0], 0x00($out) # write output
2204 pxor 0x20(%rsp), @XMM[4]
2205 movdqu @XMM[1], 0x10($out)
2206 pxor 0x30(%rsp), @XMM[6]
2207 movdqu @XMM[4], 0x20($out)
2208 pxor 0x40(%rsp), @XMM[3]
2209 movdqu @XMM[6], 0x30($out)
2210 pxor 0x50(%rsp), @XMM[7]
2211 movdqu @XMM[3], 0x40($out)
2212 pxor 0x60(%rsp), @XMM[2]
2213 movdqu @XMM[7], 0x50($out)
2214 movdqu @XMM[2], 0x60($out)
2215 lea 0x70($out), $out
2216
2217 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2218 jmp .Lxts_enc_done
2219.align 16
2220.Lxts_enc_6:
2221 pxor @XMM[8+4], @XMM[4]
2222 lea 0x60($inp), $inp
2223 pxor @XMM[8+5], @XMM[5]
2224 lea 0x80(%rsp), %rax # pass key schedule
2225 mov %edx, %r10d # pass rounds
2226
2227 call _bsaes_encrypt8
2228
2229 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2230 pxor 0x10(%rsp), @XMM[1]
2231 movdqu @XMM[0], 0x00($out) # write output
2232 pxor 0x20(%rsp), @XMM[4]
2233 movdqu @XMM[1], 0x10($out)
2234 pxor 0x30(%rsp), @XMM[6]
2235 movdqu @XMM[4], 0x20($out)
2236 pxor 0x40(%rsp), @XMM[3]
2237 movdqu @XMM[6], 0x30($out)
2238 pxor 0x50(%rsp), @XMM[7]
2239 movdqu @XMM[3], 0x40($out)
2240 movdqu @XMM[7], 0x50($out)
2241 lea 0x60($out), $out
2242
2243 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2244 jmp .Lxts_enc_done
2245.align 16
2246.Lxts_enc_5:
2247 pxor @XMM[8+3], @XMM[3]
2248 lea 0x50($inp), $inp
2249 pxor @XMM[8+4], @XMM[4]
2250 lea 0x80(%rsp), %rax # pass key schedule
2251 mov %edx, %r10d # pass rounds
2252
2253 call _bsaes_encrypt8
2254
2255 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2256 pxor 0x10(%rsp), @XMM[1]
2257 movdqu @XMM[0], 0x00($out) # write output
2258 pxor 0x20(%rsp), @XMM[4]
2259 movdqu @XMM[1], 0x10($out)
2260 pxor 0x30(%rsp), @XMM[6]
2261 movdqu @XMM[4], 0x20($out)
2262 pxor 0x40(%rsp), @XMM[3]
2263 movdqu @XMM[6], 0x30($out)
2264 movdqu @XMM[3], 0x40($out)
2265 lea 0x50($out), $out
2266
2267 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2268 jmp .Lxts_enc_done
2269.align 16
2270.Lxts_enc_4:
2271 pxor @XMM[8+2], @XMM[2]
2272 lea 0x40($inp), $inp
2273 pxor @XMM[8+3], @XMM[3]
2274 lea 0x80(%rsp), %rax # pass key schedule
2275 mov %edx, %r10d # pass rounds
2276
2277 call _bsaes_encrypt8
2278
2279 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2280 pxor 0x10(%rsp), @XMM[1]
2281 movdqu @XMM[0], 0x00($out) # write output
2282 pxor 0x20(%rsp), @XMM[4]
2283 movdqu @XMM[1], 0x10($out)
2284 pxor 0x30(%rsp), @XMM[6]
2285 movdqu @XMM[4], 0x20($out)
2286 movdqu @XMM[6], 0x30($out)
2287 lea 0x40($out), $out
2288
2289 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2290 jmp .Lxts_enc_done
2291.align 16
2292.Lxts_enc_3:
2293 pxor @XMM[8+1], @XMM[1]
2294 lea 0x30($inp), $inp
2295 pxor @XMM[8+2], @XMM[2]
2296 lea 0x80(%rsp), %rax # pass key schedule
2297 mov %edx, %r10d # pass rounds
2298
2299 call _bsaes_encrypt8
2300
2301 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2302 pxor 0x10(%rsp), @XMM[1]
2303 movdqu @XMM[0], 0x00($out) # write output
2304 pxor 0x20(%rsp), @XMM[4]
2305 movdqu @XMM[1], 0x10($out)
2306 movdqu @XMM[4], 0x20($out)
2307 lea 0x30($out), $out
2308
2309 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2310 jmp .Lxts_enc_done
2311.align 16
2312.Lxts_enc_2:
2313 pxor @XMM[8+0], @XMM[0]
2314 lea 0x20($inp), $inp
2315 pxor @XMM[8+1], @XMM[1]
2316 lea 0x80(%rsp), %rax # pass key schedule
2317 mov %edx, %r10d # pass rounds
2318
2319 call _bsaes_encrypt8
2320
2321 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2322 pxor 0x10(%rsp), @XMM[1]
2323 movdqu @XMM[0], 0x00($out) # write output
2324 movdqu @XMM[1], 0x10($out)
2325 lea 0x20($out), $out
2326
2327 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2328 jmp .Lxts_enc_done
2329.align 16
2330.Lxts_enc_1:
2331 pxor @XMM[0], @XMM[8]
2332 lea 0x10($inp), $inp
2333 movdqa @XMM[8], 0x20(%rbp)
2334 lea 0x20(%rbp), $arg1
2335 lea 0x20(%rbp), $arg2
2336 lea ($key), $arg3
2337 call asm_AES_encrypt # doesn't touch %xmm
2338 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2339 #pxor @XMM[8], @XMM[0]
2340 #lea 0x80(%rsp), %rax # pass key schedule
2341 #mov %edx, %r10d # pass rounds
2342 #call _bsaes_encrypt8
2343 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2344 movdqu @XMM[0], 0x00($out) # write output
2345 lea 0x10($out), $out
2346
2347 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2348
2349.Lxts_enc_done:
2350 and \$15, %ebx
2351 jz .Lxts_enc_ret
2352 mov $out, %rdx
2353
2354.Lxts_enc_steal:
2355 movzb ($inp), %eax
2356 movzb -16(%rdx), %ecx
2357 lea 1($inp), $inp
2358 mov %al, -16(%rdx)
2359 mov %cl, 0(%rdx)
2360 lea 1(%rdx), %rdx
2361 sub \$1,%ebx
2362 jnz .Lxts_enc_steal
2363
2364 movdqu -16($out), @XMM[0]
2365 lea 0x20(%rbp), $arg1
2366 pxor @XMM[7], @XMM[0]
2367 lea 0x20(%rbp), $arg2
2368 movdqa @XMM[0], 0x20(%rbp)
2369 lea ($key), $arg3
2370 call asm_AES_encrypt # doesn't touch %xmm
2371 pxor 0x20(%rbp), @XMM[7]
2372 movdqu @XMM[7], -16($out)
2373
2374.Lxts_enc_ret:
2375 lea (%rsp), %rax
2376 pxor %xmm0, %xmm0
2377.Lxts_enc_bzero: # wipe key schedule [if any]
2378 movdqa %xmm0, 0x00(%rax)
2379 movdqa %xmm0, 0x10(%rax)
2380 lea 0x20(%rax), %rax
2381 cmp %rax, %rbp
2382 ja .Lxts_enc_bzero
2383
2384 lea (%rbp),%rsp # restore %rsp
2385___
# Win64 only: restore the non-volatile XMM registers spilled in the prologue
# (they live at fixed offsets from the frame pointer %rbp) and drop the
# 0xa0-byte save area.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
# Common bsaes_xts_encrypt epilogue: reload the callee-saved GPRs pushed in
# the prologue and return; then open bsaes_xts_decrypt with the matching
# prologue (six pushes plus 0x48 bytes of scratch below them).
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
# Win64 only: the 5th/6th arguments (key2, ivp) arrive on the caller's stack;
# pull them, then spill xmm6-xmm15, which are non-volatile on Windows.
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
# Shared setup: compute the initial tweak by encrypting the IV under key2,
# convert key1 to bit-sliced form on the stack (_bsaes_key_convert), round
# $len down to whole blocks — reserving one extra block when the total length
# is not 16-aligned, since decrypt-side ciphertext stealing needs it — and
# reserve 0x80 bytes for the eight per-block tweaks.
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
# Emit the tweak schedule for blocks 0..6 of a full 8-block iteration: each
# pass saves the current tweak to the stack, doubles it in GF(2^128) (left
# shift plus conditional xor with the reduction constant isolated via the
# pcmpgtd/pshufd trick), and interleaves the input loads/xors of the earlier
# blocks to hide memory latency.
foreach my $i (0 .. 6) {
	$code .= <<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
	if ($i >= 1) {
		$code .= <<___;
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
	}
	if ($i >= 2) {
		$code .= <<___;
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
	}
}
# Finish the full 8-block iteration: load/xor the last two blocks, run the
# bit-sliced 8-way decryptor, then xor each result with its saved tweak and
# store.  _bsaes_decrypt8 hands consecutive blocks back in register order
# 0,1,6,4,2,7,3,5 — hence the permuted pxor/movdqu sequence below.  The next
# iteration's tweak (saved at 0x70(%rsp)) is reloaded and doubled once more
# before looping while at least 0x80 bytes remain.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
# Short-path tweak schedule: same GF(2^128) doubling as the main loop, but
# after each input load compare the remaining length and branch to the
# matching .Lxts_dec_$i tail (1..6 whole blocks); falling out of the loop
# handles the 7-block case.
foreach my $i (0 .. 6) {
	$code .= <<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
	if ($i >= 1) {
		$code .= <<___;
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
	}
	if ($i >= 2) {
		$code .= <<___;
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
	}
}
# Tail handling.  Falling through from the loop above means 7 whole blocks
# remain; labels .Lxts_dec_6 .. .Lxts_dec_2 reuse the 8-way bit-sliced core
# with unused slots, and .Lxts_dec_1 calls the scalar AES instead.  After
# .Lxts_dec_done, a trailing partial block is handled by ciphertext stealing:
# unlike encryption, decryption needs tweak N *before* tweak N-1, so the
# current tweak is parked in @XMM[6], the doubled tweak decrypts the last
# full ciphertext block, bytes are swapped in the steal loop, and @XMM[6]
# decrypts the reassembled block.  Finally the stack-resident key schedule
# is wiped (.Lxts_dec_bzero) and %rsp restored.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
# Win64 only: restore non-volatile xmm6-xmm15 from the frame and release
# their save area.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
# Common bsaes_xts_decrypt epilogue: mirror of the prologue pushes.
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
# closes the conditional block (opened earlier in the file) that guards
# generation of the XTS entry points
}
# Read-only constant tables shared by the bit-sliced routines: (Inv)ShiftRows
# and bit-slice permutation masks, CTR-mode counter increments, the XTS
# GF(2^128) reduction constant 0x87, and byte masks/affine constants for the
# key-schedule conversion.  64-byte aligned so each .quad pair loads as one
# aligned xmm value.
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___
2877
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#				CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception-handling support: a single unwind handler shared
# by every entry point, plus the .pdata/.xdata tables that bind each
# function's prologue..epilogue range to it.
if ($win64) {
# Win64 argument registers for the handler's four parameters.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# If the fault lies between a function's .L*_body and .L*_epilogue markers
# (HandlerData[0]/[1]), the handler recovers the saved xmm6-xmm15 and the
# pushed GPRs from the faulting function's frame (%rbp from the CONTEXT),
# writes them back into the CONTEXT, and lets RtlVirtualUnwind continue.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
# .pdata: one RUNTIME_FUNCTION (begin, end, unwind-info RVA) per entry point;
# the ECB pair is only emitted when $ecb is set.
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
# .xdata: UNWIND_INFO records routing everything to se_handler; the leading
# byte 9 is presumably version 1 + UNW_FLAG_EHANDLER (standard perlasm idiom
# — confirm against the Win64 unwind-data layout).
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}
3039
# Replace every `...` placeholder in the accumulated assembly text with the
# result of evaluating it as Perl (compile-time constant arithmetic), then
# emit the finished assembly on stdout.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl
deleted file mode 100644
index 1533e2c304..0000000000
--- a/src/lib/libcrypto/aes/asm/vpaes-x86.pl
+++ /dev/null
@@ -1,903 +0,0 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
17# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (doesn't have to if called from
19# EVP only). "Drop-in" implies that this module doesn't share key
20# schedule structure with the original nor does it make assumption
21# about its alignment...
22#
23# Performance summary. aes-586.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with 128-bit key, and vpaes-x86.pl column - [also
26# large-block CBC] encrypt/decrypt.
27#
28# aes-586.pl vpaes-x86.pl
29#
30# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
31# Nehalem 27.9/40.4/18.1 10.3/12.0
32# Atom 102./119./60.1 64.5/85.3(***)
33#
34# (*) "Hyper-threading" in the context refers rather to cache shared
35# among multiple cores, than to specifically Intel HTT. As vast
36# majority of contemporary cores share cache, slower code path
37# is common place. In other words "with-hyper-threading-off"
38# results are presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***) Less impressive improvement on Core 2 and Atom is due to slow
43# pshufb, yet it's respectable +32%/65% improvement on Core 2
44# and +58%/40% on Atom (as implied, over "hyper-threading-safe"
45# code path).
46#
47# <appro@openssl.org>
48
# Locate the perlasm support directory relative to this script and pull in
# the 32-bit x86 assembler backend.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# First argv selects the output flavour; a trailing "386" restricts code
# generation to plain i386.
&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

$PREFIX="vpaes";

# Register roles used throughout: $round = round counter, $base = table base,
# $magic = rotating table index, $key = key schedule pointer, $const = anchor
# into _vpaes_consts, $inp/$out = data pointers.
my ($round, $base, $magic, $key, $const, $inp, $out)=
	("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
59
# Constant tables for the vector-permute (SSSE3 pshufb-based) AES.  The $k_*
# scalars are signed byte offsets from the anchor address loaded into $const;
# negative offsets ($k_inv, $k_s0F) sit before the _vpaes_consts label.
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

&set_label("_vpaes_consts",64);
$k_inv=-0x30;		# inv, inva
	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);

$k_s0F=-0x10;		# s0F
	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);

$k_ipt=0x00;		# input transform (lo, hi)
	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);

$k_sb1=0x20;		# sb1u, sb1t
	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40;		# sb2u, sb2t
	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60;		# sbou, sbot
	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);

$k_mc_forward=0x80;	# mc_forward
	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);

$k_mc_backward=0xc0;	# mc_backward
	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);

$k_sr=0x100;		# sr
	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);

$k_rcon=0x140;		# rcon
	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);

$k_s63=0x150;		# s63: all equal to 0x63 transformed
	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);

$k_opt=0x160;		# output transform
	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);

$k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
## Decryption stuff
## Key schedule constants
##
$k_dksd=0x1a0;		# decryption key schedule: invskew x*D
	&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
	&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0;		# decryption key schedule: invskew x*B
	&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
	&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0;		# decryption key schedule: invskew x*E + 0x63
	&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
	&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200;		# decryption key schedule: invskew x*9
	&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
	&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);

##
## Decryption stuff
## Round function constants
##
$k_dipt=0x220;		# decryption input transform
	&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
	&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);

$k_dsb9=0x240;		# decryption sbox output *9*u, *9*t
	&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
	&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260;		# decryption sbox output *D*u, *D*t
	&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
	&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280;		# decryption sbox output *B*u, *B*t
	&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
	&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0;		# decryption sbox output *E*u, *E*t
	&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
	&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0;		# decryption sbox final output
	&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
	&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz	("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align	(64);
158
# Preload the lookup constants every core routine expects: xmm7 = inversion
# table ($k_inv), xmm6 = low-nibble mask ($k_s0F).  $const arrives as a
# displacement and is made absolute by adding the value at (%esp) — the
# caller-arranged PIC base (assumption based on this fragment; confirm against
# the callers of _vpaes_preheat).
&function_begin_B("_vpaes_preheat");
	&add	($const,&DWP(0,"esp"));
	&movdqa	("xmm7",&QWP($k_inv,$const));
	&movdqa	("xmm6",&QWP($k_s0F,$const));
	&ret	();
&function_end_B("_vpaes_preheat");
165
##
##  _vpaes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##    (%edx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
&function_begin_B("_vpaes_encrypt_core");
	&mov	($magic,16);
	&mov	($round,&DWP(240,$key));
	# NOTE(review): missing ';' below is harmless — Perl parses the '&' on
	# the following line as bitwise AND of the two calls' return values,
	# so both instructions are still emitted.
	&movdqa	("xmm1","xmm6")
	&movdqa	("xmm2",&QWP($k_ipt,$const));
	&pandn	("xmm1","xmm0");
	&movdqu	("xmm5",&QWP(0,$key));
	&psrld	("xmm1",4);
	&pand	("xmm0","xmm6");
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_ipt+16,$const));
	&pshufb	("xmm0","xmm1");
	&pxor	("xmm2","xmm5");
	&pxor	("xmm0","xmm2");
	&add	($key,16);
	&lea	($base,&DWP($k_mc_backward,$const));
	&jmp	(&label("enc_entry"));


&set_label("enc_loop",16);
	# middle of middle round
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm0","xmm4");		# 0 = A
	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 4 : sb2u
	&pshufb	("xmm5","xmm2");		# 4 = sb2u
	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
	&pshufb	("xmm2","xmm3");		# 2 = sb2t
	&pxor	("xmm2","xmm5");		# 2 = 2A
	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
	&movdqa	("xmm3","xmm0");		# 3 = A
	&pshufb	("xmm0","xmm1");		# 0 = B
	&add	($key,16);			# next key
	&pxor	("xmm0","xmm2");		# 0 = 2A+B
	&pshufb	("xmm3","xmm4");		# 3 = D
	&add	($magic,16);			# next mc
	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
	&and	($magic,0x30);			# ... mod 4
	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D
	&sub	($round,1);			# nr--

&set_label("enc_entry");
	# top of round: split state into low/high nibbles and run the
	# GF(2^4)-based inversion ladder via pshufb lookups
	&movdqa	("xmm1","xmm6");		# 1 : i
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm6");		# 0 = k
	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
	&pshufb	("xmm5","xmm0");		# 2 = a/k
	&pxor	("xmm0","xmm1");		# 0 = j
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&pxor	("xmm2","xmm0");		# 2 = io
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&movdqu	("xmm5",&QWP(0,$key));
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&pxor	("xmm3","xmm1");		# 3 = jo
	&jnz	(&label("enc_loop"));

	# middle of last round: final sbox output plus key, then undo
	# ShiftRows with the .Lk_sr permutation selected by $magic
	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 3 : sbou	.Lk_sbo
	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot	.Lk_sbo+16
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
	&pxor	("xmm0","xmm4");		# 0 = A
	&pshufb	("xmm0","xmm1");
	&ret	();
&function_end_B("_vpaes_encrypt_core");
261
##
## Decryption core
##
## Same API as encryption core: decrypts %xmm0 in place using the schedule
## at (%edx), with %xmm6/%xmm7 preloaded by _vpaes_preheat.
##
&function_begin_B("_vpaes_decrypt_core");
	&mov	($round,&DWP(240,$key));
	&lea	($base,&DWP($k_dsbd,$const));
	&movdqa	("xmm1","xmm6");
	&movdqa	("xmm2",&QWP($k_dipt-$k_dsbd,$base));
	&pandn	("xmm1","xmm0");
	&mov	($magic,$round);
	# NOTE(review): missing ';' below is harmless — parsed as bitwise AND
	# of the two calls' return values; both instructions are emitted.
	&psrld	("xmm1",4)
	&movdqu	("xmm5",&QWP(0,$key));
	&shl	($magic,4);
	&pand	("xmm0","xmm6");
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
	&xor	($magic,0x30);
	&pshufb	("xmm0","xmm1");
	&and	($magic,0x30);
	&pxor	("xmm2","xmm5");
	&movdqa	("xmm5",&QWP($k_mc_forward+48,$const));
	&pxor	("xmm0","xmm2");
	&add	($key,16);
	# $magic now indexes the round-dependent .Lk_sr row used at the end
	&lea	($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
	&jmp	(&label("dec_entry"));

&set_label("dec_loop",16);
##
## Inverse mix columns: accumulate the *9, *D, *B, *E multiples via the
## dsb* tables, rotating the MC permutation in xmm5 between passes.
##
	&movdqa	("xmm4",&QWP(-0x20,$base));	# 4 : sb9u
	&pshufb	("xmm4","xmm2");		# 4 = sb9u
	&pxor	("xmm4","xmm0");
	&movdqa	("xmm0",&QWP(-0x10,$base));	# 0 : sb9t
	&pshufb	("xmm0","xmm3");		# 0 = sb9t
	&pxor	("xmm0","xmm4");		# 0 = ch
	&add	($key,16);			# next round key

	&pshufb	("xmm0","xmm5");		# MC ch
	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
	&pshufb	("xmm4","xmm2");		# 4 = sbdu
	&pxor	("xmm4","xmm0");		# 4 = ch
	&movdqa	("xmm0",&QWP(0x10,$base));	# 0 : sbdt
	&pshufb	("xmm0","xmm3");		# 0 = sbdt
	&pxor	("xmm0","xmm4");		# 0 = ch
	&sub	($round,1);			# nr--

	&pshufb	("xmm0","xmm5");		# MC ch
	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
	&pshufb	("xmm4","xmm2");		# 4 = sbbu
	&pxor	("xmm4","xmm0");		# 4 = ch
	&movdqa	("xmm0",&QWP(0x30,$base));	# 0 : sbbt
	&pshufb	("xmm0","xmm3");		# 0 = sbbt
	&pxor	("xmm0","xmm4");		# 0 = ch

	&pshufb	("xmm0","xmm5");		# MC ch
	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
	&pshufb	("xmm4","xmm2");		# 4 = sbeu
	&pxor	("xmm4","xmm0");		# 4 = ch
	&movdqa	("xmm0",&QWP(0x50,$base));	# 0 : sbet
	&pshufb	("xmm0","xmm3");		# 0 = sbet
	&pxor	("xmm0","xmm4");		# 0 = ch

	&palignr("xmm5","xmm5",12);

&set_label("dec_entry");
	# top of round: same nibble-split inversion ladder as the encrypt core
	&movdqa	("xmm1","xmm6");		# 1 : i
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm6");		# 0 = k
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&pxor	("xmm0","xmm1");		# 0 = j
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&pxor	("xmm2","xmm0");		# 2 = io
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&pxor	("xmm3","xmm1");		# 3 = jo
	&movdqu	("xmm0",&QWP(0,$key));
	&jnz	(&label("dec_loop"));

	# middle of last round: final sbox output plus key, then the
	# round-count-dependent ShiftRows fixup selected via $magic
	&movdqa	("xmm4",&QWP(0x60,$base));	# 3 : sbou
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&pxor	("xmm4","xmm0");		# 4 = sb1u + k
	&movdqa	("xmm0",&QWP(0x70,$base));	# 0 : sbot
	&movdqa	("xmm2",&QWP(0,$magic));
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm0","xmm4");		# 0 = A
	&pshufb	("xmm0","xmm2");
	&ret	();
&function_end_B("_vpaes_decrypt_core");
364
365########################################################
366## ##
367## AES key schedule ##
368## ##
369########################################################
## _vpaes_schedule_core: common AES key-schedule driver.
## Dispatches on $round (key size in bits: 128/192/256) to the
## size-specific loops below, emitting round keys through
## _vpaes_schedule_mangle and finishing via schedule_mangle_last.
## $out selects direction (0 = encrypt schedule, non-0 = decrypt
## schedule, which writes keys in reverse and applies shiftrows).
## All xmm registers are zeroed before returning.
370&function_begin_B("_vpaes_schedule_core");
371 &add ($const,&DWP(0,"esp")); # PIC: adjust $const by offset saved at (%esp)
372 &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
373 &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
374
375 # input transform
376 &movdqa ("xmm3","xmm0");
377 &lea ($base,&DWP($k_ipt,$const));
378 &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8 (stack slot stands in for %xmm8 of the 64-bit version)
379 &call ("_vpaes_schedule_transform");
380 &movdqa ("xmm7","xmm0");
381
382 &test ($out,$out);
383 &jnz (&label("schedule_am_decrypting"));
384
385 # encrypting, output zeroth round key after transform
386 &movdqu (&QWP(0,$key),"xmm0");
387 &jmp (&label("schedule_go"));
388
389&set_label("schedule_am_decrypting");
390 # decrypting, output zeroth round key after shiftrows
391 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
392 &pshufb ("xmm3","xmm1");
393 &movdqu (&QWP(0,$key),"xmm3");
394 &xor ($magic,0x30); # flip shiftrows index for the reverse direction
395
396&set_label("schedule_go");
397 &cmp ($round,192);
398 &ja (&label("schedule_256"));
399 &je (&label("schedule_192"));
400 # 128: fall through
401
402##
403## .schedule_128
404##
405## 128-bit specific part of key schedule.
406##
407## This schedule is really simple, because all its parts
408## are accomplished by the subroutines.
409##
410&set_label("schedule_128");
411 &mov ($round,10); # 10 rounds for AES-128
412
413&set_label("loop_schedule_128");
414 &call ("_vpaes_schedule_round");
415 &dec ($round);
416 &jz (&label("schedule_mangle_last"));
417 &call ("_vpaes_schedule_mangle"); # write output
418 &jmp (&label("loop_schedule_128"));
419
420##
421## .aes_schedule_192
422##
423## 192-bit specific part of key schedule.
424##
425## The main body of this schedule is the same as the 128-bit
426## schedule, but with more smearing. The long, high side is
427## stored in %xmm7 as before, and the short, low side is in
428## the high bits of %xmm6.
429##
430## This schedule is somewhat nastier, however, because each
431## round produces 192 bits of key material, or 1.5 round keys.
432## Therefore, on each cycle we do 2 rounds and produce 3 round
433## keys.
434##
435&set_label("schedule_192",16);
436 &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
437 &call ("_vpaes_schedule_transform"); # input transform
438 &movdqa ("xmm6","xmm0"); # save short part
439 &pxor ("xmm4","xmm4"); # clear 4
440 &movhlps("xmm6","xmm4"); # clobber low side with zeros
441 &mov ($round,4); # 4 cycles x 3 keys = 12+1 round keys
442
443&set_label("loop_schedule_192");
444 &call ("_vpaes_schedule_round");
445 &palignr("xmm0","xmm6",8);
446 &call ("_vpaes_schedule_mangle"); # save key n
447 &call ("_vpaes_schedule_192_smear");
448 &call ("_vpaes_schedule_mangle"); # save key n+1
449 &call ("_vpaes_schedule_round");
450 &dec ($round);
451 &jz (&label("schedule_mangle_last"));
452 &call ("_vpaes_schedule_mangle"); # save key n+2
453 &call ("_vpaes_schedule_192_smear");
454 &jmp (&label("loop_schedule_192"));
455
456##
457## .aes_schedule_256
458##
459## 256-bit specific part of key schedule.
460##
461## The structure here is very similar to the 128-bit
462## schedule, but with an additional "low side" in
463## %xmm6. The low side's rounds are the same as the
464## high side's, except no rcon and no rotation.
465##
466&set_label("schedule_256",16);
467 &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
468 &call ("_vpaes_schedule_transform"); # input transform
469 &mov ($round,7); # 7 double-rounds for AES-256
470
471&set_label("loop_schedule_256");
472 &call ("_vpaes_schedule_mangle"); # output low result
473 &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
474
475 # high round
476 &call ("_vpaes_schedule_round");
477 &dec ($round);
478 &jz (&label("schedule_mangle_last"));
479 &call ("_vpaes_schedule_mangle");
480
481 # low round. swap xmm7 and xmm6
482 &pshufd ("xmm0","xmm0",0xFF);
483 &movdqa (&QWP(20,"esp"),"xmm7"); # spill xmm7 (no spare xmm regs on x86)
484 &movdqa ("xmm7","xmm6");
485 &call ("_vpaes_schedule_low_round");
486 &movdqa ("xmm7",&QWP(20,"esp")); # restore xmm7
487
488 &jmp (&label("loop_schedule_256"));
489
490##
491## .aes_schedule_mangle_last
492##
493## Mangler for last round of key schedule
494## Mangles %xmm0
495## when encrypting, outputs out(%xmm0) ^ 63
496## when decrypting, outputs unskew(%xmm0)
497##
498## Always called right before return... jumps to cleanup and exits
499##
500&set_label("schedule_mangle_last",16);
501 # schedule last round key from xmm0
502 &lea ($base,&DWP($k_deskew,$const));
503 &test ($out,$out);
504 &jnz (&label("schedule_mangle_last_dec"));
505
506 # encrypting
507 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
508 &pshufb ("xmm0","xmm1"); # output permute
509 &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
510 &add ($key,32);
511
512&set_label("schedule_mangle_last_dec");
513 &add ($key,-16);
514 &pxor ("xmm0",&QWP($k_s63,$const));
515 &call ("_vpaes_schedule_transform"); # output transform
516 &movdqu (&QWP(0,$key),"xmm0"); # save last key
517
518 # cleanup: scrub key material from all xmm registers before returning
519 &pxor ("xmm0","xmm0");
520 &pxor ("xmm1","xmm1");
521 &pxor ("xmm2","xmm2");
522 &pxor ("xmm3","xmm3");
523 &pxor ("xmm4","xmm4");
524 &pxor ("xmm5","xmm5");
525 &pxor ("xmm6","xmm6");
526 &pxor ("xmm7","xmm7");
527 &ret ();
528&function_end_B("_vpaes_schedule_core");
529
530##
531## .aes_schedule_192_smear
532##
533## Smear the short, low side in the 192-bit key schedule.
534##
535## Inputs:
536## %xmm7: high side, b a x y
537## %xmm6: low side, d c 0 0
538## %xmm13: 0
539##
540## Outputs:
541## %xmm6: b+c+d b+c 0 0
542## %xmm0: b+c+d b+c b a
543##
## _vpaes_schedule_192_smear: smear the short, low side of the 192-bit
## key schedule (see comment block above).  Combines xmm6 ("d c 0 0")
## with the high side xmm7 to produce xmm0 = "b+c+d b+c b a", and
## leaves xmm6 = "b+c+d b+c 0 0" (low 64 bits re-zeroed).
## Clobbers xmm1.
544&function_begin_B("_vpaes_schedule_192_smear");
545 &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
546 &pxor ("xmm6","xmm0"); # -> c+d c 0 0
547 &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
548 &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
549 &movdqa ("xmm0","xmm6");
550 &pxor ("xmm1","xmm1"); # zero source for movhlps below
551 &movhlps("xmm6","xmm1"); # clobber low side with zeros
552 &ret ();
553&function_end_B("_vpaes_schedule_192_smear");
554
555##
556## .aes_schedule_round
557##
558## Runs one main round of the key schedule on %xmm0, %xmm7
559##
560## Specifically, runs subbytes on the high dword of %xmm0
561## then rotates it by one byte and xors into the low dword of
562## %xmm7.
563##
564## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
565## next rcon.
566##
567## Smears the dwords of %xmm7 by xoring the low into the
568## second low, result into third, result into highest.
569##
570## Returns results in %xmm7 = %xmm0.
571## Clobbers %xmm1-%xmm5.
572##
## _vpaes_schedule_round: one main round of the key schedule on
## xmm0/xmm7 (contract in the comment block above).  On x86 the rcon
## register (%xmm8 in the 64-bit version) lives in the stack slot at
## 8(%esp).  Falls through into _vpaes_schedule_low_round, which is
## the same computation minus the rcon add and byte rotation.
573&function_begin_B("_vpaes_schedule_round");
574 # extract rcon from xmm8
575 &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
576 &pxor ("xmm1","xmm1");
577 &palignr("xmm1","xmm2",15); # isolate rcon byte into xmm1
578 &palignr("xmm2","xmm2",15); # rotate rcon register for next use
579 &pxor ("xmm7","xmm1"); # add rcon into the schedule
580
581 # rotate
582 &pshufd ("xmm0","xmm0",0xFF); # broadcast high dword
583 &palignr("xmm0","xmm0",1); # rotate by one byte
584
585 # fall through...
586 &movdqa (&QWP(8,"esp"),"xmm2"); # store rotated rcon back to the xmm8 slot
587
588 # low round: same as high round, but no rotation and no rcon.
589&set_label("_vpaes_schedule_low_round");
590 # smear xmm7: xor low dword into second, result into third, then highest
591 &movdqa ("xmm1","xmm7");
592 &pslldq ("xmm7",4);
593 &pxor ("xmm7","xmm1");
594 &movdqa ("xmm1","xmm7");
595 &pslldq ("xmm7",8);
596 &pxor ("xmm7","xmm1");
597 &pxor ("xmm7",&QWP($k_s63,$const));
598
599 # subbyte: vector-permute S-box evaluation on xmm0
600 &movdqa ("xmm4",&QWP($k_s0F,$const)); # nibble mask
601 &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
602 &movdqa ("xmm1","xmm4");
603 &pandn ("xmm1","xmm0");
604 &psrld ("xmm1",4); # 1 = i
605 &pand ("xmm0","xmm4"); # 0 = k
606 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
607 &pshufb ("xmm2","xmm0"); # 2 = a/k
608 &pxor ("xmm0","xmm1"); # 0 = j
609 &movdqa ("xmm3","xmm5"); # 3 : 1/i
610 &pshufb ("xmm3","xmm1"); # 3 = 1/i
611 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
612 &movdqa ("xmm4","xmm5"); # 4 : 1/j
613 &pshufb ("xmm4","xmm0"); # 4 = 1/j
614 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
615 &movdqa ("xmm2","xmm5"); # 2 : 1/iak
616 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
617 &pxor ("xmm2","xmm0"); # 2 = io
618 &movdqa ("xmm3","xmm5"); # 3 : 1/jak
619 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
620 &pxor ("xmm3","xmm1"); # 3 = jo
621 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
622 &pshufb ("xmm4","xmm2"); # 4 = sbou
623 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
624 &pshufb ("xmm0","xmm3"); # 0 = sb1t
625 &pxor ("xmm0","xmm4"); # 0 = sbox output
626
627 # add in smeared stuff
628 &pxor ("xmm0","xmm7");
629 &movdqa ("xmm7","xmm0"); # result returned in both xmm7 and xmm0
630 &ret ();
631&function_end_B("_vpaes_schedule_round");
632
633##
634## .aes_schedule_transform
635##
636## Linear-transform %xmm0 according to tables at (%ebx)
637##
638## Output in %xmm0
639## Clobbers %xmm1, %xmm2
640##
## _vpaes_schedule_transform: linear-transform xmm0 via the pair of
## 16-byte lookup tables at ($base): low nibbles through the table at
## 0($base), high nibbles through the table at 16($base), results
## xored together.  Output in xmm0; clobbers xmm1, xmm2.
641&function_begin_B("_vpaes_schedule_transform");
642 &movdqa ("xmm2",&QWP($k_s0F,$const)); # 0x0F nibble mask
643 &movdqa ("xmm1","xmm2");
644 &pandn ("xmm1","xmm0"); # xmm1 = high nibbles << 4
645 &psrld ("xmm1",4); # xmm1 = high nibbles
646 &pand ("xmm0","xmm2"); # xmm0 = low nibbles
647 &movdqa ("xmm2",&QWP(0,$base)); # lo table
648 &pshufb ("xmm2","xmm0");
649 &movdqa ("xmm0",&QWP(16,$base)); # hi table
650 &pshufb ("xmm0","xmm1");
651 &pxor ("xmm0","xmm2"); # combine halves
652 &ret ();
653&function_end_B("_vpaes_schedule_transform");
654
655##
656## .aes_schedule_mangle
657##
658## Mangle xmm0 from (basis-transformed) standard version
659## to our version.
660##
661## On encrypt,
662## xor with 0x63
663## multiply by circulant 0,1,1,1
664## apply shiftrows transform
665##
666## On decrypt,
667## xor with 0x63
668## multiply by "inverse mixcolumns" circulant E,B,D,9
669## deskew
670## apply shiftrows transform
671##
672##
673## Writes out to (%edx), and increments or decrements it
674## Keeps track of round number mod 4 in %ecx
675## Preserves xmm0
676## Clobbers xmm1-xmm5
677##
## _vpaes_schedule_mangle: convert xmm0 from the basis-transformed
## standard round key to this implementation's stored form and write it
## out (contract in the comment block above).  Encrypt path: xor 0x63,
## multiply by circulant (0,1,1,1) via repeated mc_forward permutes.
## Decrypt path: inverse-mixcolumns via the $k_dksd table chain.  Both
## paths finish with the shiftrows permute selected by $magic.
678&function_begin_B("_vpaes_schedule_mangle");
679 &movdqa ("xmm4","xmm0"); # save xmm0 for later
680 &movdqa ("xmm5",&QWP($k_mc_forward,$const));
681 &test ($out,$out);
682 &jnz (&label("schedule_mangle_dec"));
683
684 # encrypting
685 &add ($key,16); # advance output pointer forward
686 &pxor ("xmm4",&QWP($k_s63,$const));
687 &pshufb ("xmm4","xmm5");
688 &movdqa ("xmm3","xmm4");
689 &pshufb ("xmm4","xmm5");
690 &pxor ("xmm3","xmm4");
691 &pshufb ("xmm4","xmm5");
692 &pxor ("xmm3","xmm4"); # xmm3 = circulant(0,1,1,1) * key
693
694 &jmp (&label("schedule_mangle_both"));
695
696&set_label("schedule_mangle_dec",16);
697 # inverse mix columns
698 &movdqa ("xmm2",&QWP($k_s0F,$const));
699 &lea ($inp,&DWP($k_dksd,$const));
700 &movdqa ("xmm1","xmm2");
701 &pandn ("xmm1","xmm4");
702 &psrld ("xmm1",4); # 1 = hi
703 &pand ("xmm4","xmm2"); # 4 = lo
704
705 &movdqa ("xmm2",&QWP(0,$inp));
706 &pshufb ("xmm2","xmm4");
707 &movdqa ("xmm3",&QWP(0x10,$inp));
708 &pshufb ("xmm3","xmm1");
709 &pxor ("xmm3","xmm2");
710 &pshufb ("xmm3","xmm5");
711
712 &movdqa ("xmm2",&QWP(0x20,$inp));
713 &pshufb ("xmm2","xmm4");
714 &pxor ("xmm2","xmm3");
715 &movdqa ("xmm3",&QWP(0x30,$inp));
716 &pshufb ("xmm3","xmm1");
717 &pxor ("xmm3","xmm2");
718 &pshufb ("xmm3","xmm5");
719
720 &movdqa ("xmm2",&QWP(0x40,$inp));
721 &pshufb ("xmm2","xmm4");
722 &pxor ("xmm2","xmm3");
723 &movdqa ("xmm3",&QWP(0x50,$inp));
724 &pshufb ("xmm3","xmm1");
725 &pxor ("xmm3","xmm2");
726 &pshufb ("xmm3","xmm5");
727
728 &movdqa ("xmm2",&QWP(0x60,$inp));
729 &pshufb ("xmm2","xmm4");
730 &pxor ("xmm2","xmm3");
731 &movdqa ("xmm3",&QWP(0x70,$inp));
732 &pshufb ("xmm3","xmm1");
733 &pxor ("xmm3","xmm2");
734
735 &add ($key,-16); # decrypt schedule is written backwards
736
737&set_label("schedule_mangle_both");
738 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
739 &pshufb ("xmm3","xmm1");
740 &add ($magic,-16); # advance round number...
741 &and ($magic,0x30); # ...mod 4 (as a table offset)
742 &movdqu (&QWP(0,$key),"xmm3");
743 &ret ();
744&function_end_B("_vpaes_schedule_mangle");
745
746#
747# Interface to OpenSSL
748#
## ${PREFIX}_set_encrypt_key(userKey, bits, key): C-callable entry
## point.  Aligns a 56-byte scratch frame on the stack (saving the old
## esp at 48(%esp)), stores AES_KEY->rounds = bits/32 + 5, and runs
## _vpaes_schedule_core in encrypt mode ($out = 0).  Returns 0 in %eax.
749&function_begin("${PREFIX}_set_encrypt_key");
750 &mov ($inp,&wparam(0)); # inp
751 &lea ($base,&DWP(-56,"esp"));
752 &mov ($round,&wparam(1)); # bits
753 &and ($base,-16); # 16-byte align the frame
754 &mov ($key,&wparam(2)); # key
755 &xchg ($base,"esp"); # alloca
756 &mov (&DWP(48,"esp"),$base); # save caller's esp
757
758 &mov ($base,$round);
759 &shr ($base,5);
760 &add ($base,5);
761 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
762 &mov ($magic,0x30); # initial shiftrows index
763 &mov ($out,0); # direction = encrypt
764
765 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
766 &call ("_vpaes_schedule_core");
767&set_label("pic_point");
768
769 &mov ("esp",&DWP(48,"esp")); # restore caller's esp
770 &xor ("eax","eax"); # return 0
771&function_end("${PREFIX}_set_encrypt_key");
772
## ${PREFIX}_set_decrypt_key(userKey, bits, key): C-callable entry
## point.  Same frame setup as set_encrypt_key, but points $key past
## the end of the schedule (keys are written backwards), sets
## direction $out = 1, and derives the initial shiftrows index $magic
## from the key size.  Returns 0 in %eax.
773&function_begin("${PREFIX}_set_decrypt_key");
774 &mov ($inp,&wparam(0)); # inp
775 &lea ($base,&DWP(-56,"esp"));
776 &mov ($round,&wparam(1)); # bits
777 &and ($base,-16); # 16-byte align the frame
778 &mov ($key,&wparam(2)); # key
779 &xchg ($base,"esp"); # alloca
780 &mov (&DWP(48,"esp"),$base); # save caller's esp
781
782 &mov ($base,$round);
783 &shr ($base,5);
784 &add ($base,5);
785 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
786 &shl ($base,4);
787 &lea ($key,&DWP(16,$key,$base)); # key += (rounds+1)*16: write backwards
788
789 &mov ($out,1); # direction = decrypt
790 &mov ($magic,$round);
791 &shr ($magic,1);
792 &and ($magic,32);
793 &xor ($magic,32); # nbits==192?0:32;
794
795 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
796 &call ("_vpaes_schedule_core");
797&set_label("pic_point");
798
799 &mov ("esp",&DWP(48,"esp")); # restore caller's esp
800 &xor ("eax","eax"); # return 0
801&function_end("${PREFIX}_set_decrypt_key");
802
## ${PREFIX}_encrypt(in, out, key): encrypt one 16-byte block.
## Preheats the lookup tables, aligns a scratch frame, runs
## _vpaes_encrypt_core on the (possibly unaligned) input block and
## stores the result.  No return value.
803&function_begin("${PREFIX}_encrypt");
804 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
805 &call ("_vpaes_preheat");
806&set_label("pic_point");
807 &mov ($inp,&wparam(0)); # inp
808 &lea ($base,&DWP(-56,"esp"));
809 &mov ($out,&wparam(1)); # out
810 &and ($base,-16); # 16-byte align the frame
811 &mov ($key,&wparam(2)); # key
812 &xchg ($base,"esp"); # alloca
813 &mov (&DWP(48,"esp"),$base); # save caller's esp
814
815 &movdqu ("xmm0",&QWP(0,$inp)); # load block (unaligned ok)
816 &call ("_vpaes_encrypt_core");
817 &movdqu (&QWP(0,$out),"xmm0"); # store ciphertext
818
819 &mov ("esp",&DWP(48,"esp")); # restore caller's esp
820&function_end("${PREFIX}_encrypt");
821
## ${PREFIX}_decrypt(in, out, key): decrypt one 16-byte block.
## Mirror image of ${PREFIX}_encrypt, calling _vpaes_decrypt_core.
822&function_begin("${PREFIX}_decrypt");
823 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
824 &call ("_vpaes_preheat");
825&set_label("pic_point");
826 &mov ($inp,&wparam(0)); # inp
827 &lea ($base,&DWP(-56,"esp"));
828 &mov ($out,&wparam(1)); # out
829 &and ($base,-16); # 16-byte align the frame
830 &mov ($key,&wparam(2)); # key
831 &xchg ($base,"esp"); # alloca
832 &mov (&DWP(48,"esp"),$base); # save caller's esp
833
834 &movdqu ("xmm0",&QWP(0,$inp)); # load block (unaligned ok)
835 &call ("_vpaes_decrypt_core");
836 &movdqu (&QWP(0,$out),"xmm0"); # store plaintext
837
838 &mov ("esp",&DWP(48,"esp")); # restore caller's esp
839&function_end("${PREFIX}_decrypt");
840
## ${PREFIX}_cbc_encrypt(in, out, len, key, ivp, enc): CBC-mode bulk
## entry point.  Processes whole 16-byte blocks only; returns
## immediately (cbc_abort) when len < 16.  enc != 0 selects the
## encrypt loop, enc == 0 the decrypt loop.  The final IV is written
## back through ivp on exit.
##
## Fix: the '&mov (&DWP(4,"esp"),$key)' statement was missing its
## terminating semicolon, which is a Perl syntax error (the generator
## script would not compile).
841&function_begin("${PREFIX}_cbc_encrypt");
842 &mov ($inp,&wparam(0)); # inp
843 &mov ($out,&wparam(1)); # out
844 &mov ($round,&wparam(2)); # len
845 &mov ($key,&wparam(3)); # key
846 &sub ($round,16);
847 &jc (&label("cbc_abort")); # nothing to do for len < 16
848 &lea ($base,&DWP(-56,"esp"));
849 &mov ($const,&wparam(4)); # ivp
850 &and ($base,-16); # 16-byte align the frame
851 &mov ($magic,&wparam(5)); # enc
852 &xchg ($base,"esp"); # alloca
853 &movdqu ("xmm1",&QWP(0,$const)); # load IV
854 &sub ($out,$inp); # keep out as delta from inp
855 &mov (&DWP(48,"esp"),$base); # save caller's esp
856
857 &mov (&DWP(0,"esp"),$out); # save out
858 &mov (&DWP(4,"esp"),$key); # save key
859 &mov (&DWP(8,"esp"),$const); # save ivp
860 &mov ($out,$round); # $out works as $len
861
862 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
863 &call ("_vpaes_preheat");
864&set_label("pic_point");
865 &cmp ($magic,0);
866 &je (&label("cbc_dec_loop"));
867 &jmp (&label("cbc_enc_loop"));
868
869&set_label("cbc_enc_loop",16);
870 &movdqu ("xmm0",&QWP(0,$inp)); # load input
871 &pxor ("xmm0","xmm1"); # inp^=iv
872 &call ("_vpaes_encrypt_core");
873 &mov ($base,&DWP(0,"esp")); # restore out
874 &mov ($key,&DWP(4,"esp")); # restore key
875 &movdqa ("xmm1","xmm0"); # ciphertext becomes next IV
876 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
877 &lea ($inp,&DWP(16,$inp));
878 &sub ($out,16);
879 &jnc (&label("cbc_enc_loop"));
880 &jmp (&label("cbc_done"));
881
882&set_label("cbc_dec_loop",16);
883 &movdqu ("xmm0",&QWP(0,$inp)); # load input
884 &movdqa (&QWP(16,"esp"),"xmm1"); # save IV
885 &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
886 &call ("_vpaes_decrypt_core");
887 &mov ($base,&DWP(0,"esp")); # restore out
888 &mov ($key,&DWP(4,"esp")); # restore key
889 &pxor ("xmm0",&QWP(16,"esp")); # out^=iv
890 &movdqa ("xmm1",&QWP(32,"esp")); # load next IV
891 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
892 &lea ($inp,&DWP(16,$inp));
893 &sub ($out,16);
894 &jnc (&label("cbc_dec_loop"));
895
896&set_label("cbc_done");
897 &mov ($base,&DWP(8,"esp")); # restore ivp
898 &mov ("esp",&DWP(48,"esp")); # restore caller's esp
899 &movdqu (&QWP(0,$base),"xmm1"); # write IV
900&set_label("cbc_abort");
901&function_end("${PREFIX}_cbc_encrypt");
902
903&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
deleted file mode 100644
index 37998db5e1..0000000000
--- a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
+++ /dev/null
@@ -1,1206 +0,0 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Interface to OpenSSL as "almost" drop-in replacement for
17# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (doesn't have to if called from
19# EVP only). "Drop-in" implies that this module doesn't share key
20# schedule structure with the original nor does it make assumption
21# about its alignment...
22#
23# Performance summary. aes-x86_64.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with 128-bit key, and vpaes-x86_64.pl column -
26# [also large-block CBC] encrypt/decrypt.
27#
28# aes-x86_64.pl vpaes-x86_64.pl
29#
30# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
31# Nehalem 30.5/42.2/14.6 9.8/11.8
32# Atom 63.9/79.0/32.1 64.0/84.8(***)
33#
34# (*) "Hyper-threading" in the context refers rather to cache shared
35# among multiple cores, than to specifically Intel HTT. As vast
36# majority of contemporary cores share cache, slower code path
37# is common place. In other words "with-hyper-threading-off"
38# results are presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***) Less impressive improvement on Core 2 and Atom is due to slow
43# pshufb, yet it's respectable +40%/78% improvement on Core 2
44# (as implied, over "hyper-threading-safe" code path).
45#
46# <appro@openssl.org>
47
# Command-line handling and perlasm setup: first argument is the
# flavour (elf/macosx/mingw64/nasm/...), second the output file; a
# flavour containing '.' is actually the output path.  The matching
# x86_64-xlate.pl translator is located relative to this script and
# STDOUT is piped through it so all generated code lands in $output.
48$flavour = shift;
49$output = shift;
50if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
51
52$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53
54$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57die "can't locate x86_64-xlate.pl";
58
59open STDOUT,"| $^X $xlate $flavour $output";
60
61$PREFIX="vpaes";
63$code.=<<___;
64.text
65
66##
67## _aes_encrypt_core
68##
69## AES-encrypt %xmm0.
70##
71## Inputs:
72## %xmm0 = input
73## %xmm9-%xmm15 as in _vpaes_preheat
74## (%rdx) = scheduled keys
75##
76## Output in %xmm0
77## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
78## Preserves %xmm6 - %xmm8 so you get some local vectors
79##
80##
81.type _vpaes_encrypt_core,\@abi-omnipotent
82.align 16
83_vpaes_encrypt_core:
84 mov %rdx, %r9
85 mov \$16, %r11
86 mov 240(%rdx),%eax
87 movdqa %xmm9, %xmm1
88 movdqa .Lk_ipt(%rip), %xmm2 # iptlo
89 pandn %xmm0, %xmm1
90 movdqu (%r9), %xmm5 # round0 key
91 psrld \$4, %xmm1
92 pand %xmm9, %xmm0
93 pshufb %xmm0, %xmm2
94 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
95 pshufb %xmm1, %xmm0
96 pxor %xmm5, %xmm2
97 pxor %xmm2, %xmm0
98 add \$16, %r9
99 lea .Lk_mc_backward(%rip),%r10
100 jmp .Lenc_entry
101
102.align 16
103.Lenc_loop:
104 # middle of middle round
105 movdqa %xmm13, %xmm4 # 4 : sb1u
106 pshufb %xmm2, %xmm4 # 4 = sb1u
107 pxor %xmm5, %xmm4 # 4 = sb1u + k
108 movdqa %xmm12, %xmm0 # 0 : sb1t
109 pshufb %xmm3, %xmm0 # 0 = sb1t
110 pxor %xmm4, %xmm0 # 0 = A
111 movdqa %xmm15, %xmm5 # 4 : sb2u
112 pshufb %xmm2, %xmm5 # 4 = sb2u
113 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
114 movdqa %xmm14, %xmm2 # 2 : sb2t
115 pshufb %xmm3, %xmm2 # 2 = sb2t
116 pxor %xmm5, %xmm2 # 2 = 2A
117 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
118 movdqa %xmm0, %xmm3 # 3 = A
119 pshufb %xmm1, %xmm0 # 0 = B
120 add \$16, %r9 # next key
121 pxor %xmm2, %xmm0 # 0 = 2A+B
122 pshufb %xmm4, %xmm3 # 3 = D
123 add \$16, %r11 # next mc
124 pxor %xmm0, %xmm3 # 3 = 2A+B+D
125 pshufb %xmm1, %xmm0 # 0 = 2B+C
126 and \$0x30, %r11 # ... mod 4
127 pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
128 sub \$1,%rax # nr--
129
130.Lenc_entry:
131 # top of round
132 movdqa %xmm9, %xmm1 # 1 : i
133 pandn %xmm0, %xmm1 # 1 = i<<4
134 psrld \$4, %xmm1 # 1 = i
135 pand %xmm9, %xmm0 # 0 = k
136 movdqa %xmm11, %xmm5 # 2 : a/k
137 pshufb %xmm0, %xmm5 # 2 = a/k
138 pxor %xmm1, %xmm0 # 0 = j
139 movdqa %xmm10, %xmm3 # 3 : 1/i
140 pshufb %xmm1, %xmm3 # 3 = 1/i
141 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
142 movdqa %xmm10, %xmm4 # 4 : 1/j
143 pshufb %xmm0, %xmm4 # 4 = 1/j
144 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
145 movdqa %xmm10, %xmm2 # 2 : 1/iak
146 pshufb %xmm3, %xmm2 # 2 = 1/iak
147 pxor %xmm0, %xmm2 # 2 = io
148 movdqa %xmm10, %xmm3 # 3 : 1/jak
149 movdqu (%r9), %xmm5
150 pshufb %xmm4, %xmm3 # 3 = 1/jak
151 pxor %xmm1, %xmm3 # 3 = jo
152 jnz .Lenc_loop
153
154 # middle of last round
155 movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
156 movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
157 pshufb %xmm2, %xmm4 # 4 = sbou
158 pxor %xmm5, %xmm4 # 4 = sb1u + k
159 pshufb %xmm3, %xmm0 # 0 = sb1t
160 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
161 pxor %xmm4, %xmm0 # 0 = A
162 pshufb %xmm1, %xmm0
163 ret
164.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
165
166##
167## Decryption core
168##
169## Same API as encryption core.
170##
171.type _vpaes_decrypt_core,\@abi-omnipotent
172.align 16
173_vpaes_decrypt_core:
174 mov %rdx, %r9 # load key
175 mov 240(%rdx),%eax
176 movdqa %xmm9, %xmm1
177 movdqa .Lk_dipt(%rip), %xmm2 # iptlo
178 pandn %xmm0, %xmm1
179 mov %rax, %r11
180 psrld \$4, %xmm1
181 movdqu (%r9), %xmm5 # round0 key
182 shl \$4, %r11
183 pand %xmm9, %xmm0
184 pshufb %xmm0, %xmm2
185 movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
186 xor \$0x30, %r11
187 lea .Lk_dsbd(%rip),%r10
188 pshufb %xmm1, %xmm0
189 and \$0x30, %r11
190 pxor %xmm5, %xmm2
191 movdqa .Lk_mc_forward+48(%rip), %xmm5
192 pxor %xmm2, %xmm0
193 add \$16, %r9
194 add %r10, %r11
195 jmp .Ldec_entry
196
197.align 16
198.Ldec_loop:
199##
200## Inverse mix columns
201##
202 movdqa -0x20(%r10),%xmm4 # 4 : sb9u
203 pshufb %xmm2, %xmm4 # 4 = sb9u
204 pxor %xmm0, %xmm4
205 movdqa -0x10(%r10),%xmm0 # 0 : sb9t
206 pshufb %xmm3, %xmm0 # 0 = sb9t
207 pxor %xmm4, %xmm0 # 0 = ch
208 add \$16, %r9 # next round key
209
210 pshufb %xmm5, %xmm0 # MC ch
211 movdqa 0x00(%r10),%xmm4 # 4 : sbdu
212 pshufb %xmm2, %xmm4 # 4 = sbdu
213 pxor %xmm0, %xmm4 # 4 = ch
214 movdqa 0x10(%r10),%xmm0 # 0 : sbdt
215 pshufb %xmm3, %xmm0 # 0 = sbdt
216 pxor %xmm4, %xmm0 # 0 = ch
217 sub \$1,%rax # nr--
218
219 pshufb %xmm5, %xmm0 # MC ch
220 movdqa 0x20(%r10),%xmm4 # 4 : sbbu
221 pshufb %xmm2, %xmm4 # 4 = sbbu
222 pxor %xmm0, %xmm4 # 4 = ch
223 movdqa 0x30(%r10),%xmm0 # 0 : sbbt
224 pshufb %xmm3, %xmm0 # 0 = sbbt
225 pxor %xmm4, %xmm0 # 0 = ch
226
227 pshufb %xmm5, %xmm0 # MC ch
228 movdqa 0x40(%r10),%xmm4 # 4 : sbeu
229 pshufb %xmm2, %xmm4 # 4 = sbeu
230 pxor %xmm0, %xmm4 # 4 = ch
231 movdqa 0x50(%r10),%xmm0 # 0 : sbet
232 pshufb %xmm3, %xmm0 # 0 = sbet
233 pxor %xmm4, %xmm0 # 0 = ch
234
235 palignr \$12, %xmm5, %xmm5
236
237.Ldec_entry:
238 # top of round
239 movdqa %xmm9, %xmm1 # 1 : i
240 pandn %xmm0, %xmm1 # 1 = i<<4
241 psrld \$4, %xmm1 # 1 = i
242 pand %xmm9, %xmm0 # 0 = k
243 movdqa %xmm11, %xmm2 # 2 : a/k
244 pshufb %xmm0, %xmm2 # 2 = a/k
245 pxor %xmm1, %xmm0 # 0 = j
246 movdqa %xmm10, %xmm3 # 3 : 1/i
247 pshufb %xmm1, %xmm3 # 3 = 1/i
248 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
249 movdqa %xmm10, %xmm4 # 4 : 1/j
250 pshufb %xmm0, %xmm4 # 4 = 1/j
251 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
252 movdqa %xmm10, %xmm2 # 2 : 1/iak
253 pshufb %xmm3, %xmm2 # 2 = 1/iak
254 pxor %xmm0, %xmm2 # 2 = io
255 movdqa %xmm10, %xmm3 # 3 : 1/jak
256 pshufb %xmm4, %xmm3 # 3 = 1/jak
257 pxor %xmm1, %xmm3 # 3 = jo
258 movdqu (%r9), %xmm0
259 jnz .Ldec_loop
260
261 # middle of last round
262 movdqa 0x60(%r10), %xmm4 # 3 : sbou
263 pshufb %xmm2, %xmm4 # 4 = sbou
264 pxor %xmm0, %xmm4 # 4 = sb1u + k
265 movdqa 0x70(%r10), %xmm0 # 0 : sbot
266 movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
267 pshufb %xmm3, %xmm0 # 0 = sb1t
268 pxor %xmm4, %xmm0 # 0 = A
269 pshufb %xmm2, %xmm0
270 ret
271.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
272
273########################################################
274## ##
275## AES key schedule ##
276## ##
277########################################################
278.type _vpaes_schedule_core,\@abi-omnipotent
279.align 16
280_vpaes_schedule_core:
281 # rdi = key
282 # rsi = size in bits
283 # rdx = buffer
284 # rcx = direction. 0=encrypt, 1=decrypt
285
286 call _vpaes_preheat # load the tables
287 movdqa .Lk_rcon(%rip), %xmm8 # load rcon
288 movdqu (%rdi), %xmm0 # load key (unaligned)
289
290 # input transform
291 movdqa %xmm0, %xmm3
292 lea .Lk_ipt(%rip), %r11
293 call _vpaes_schedule_transform
294 movdqa %xmm0, %xmm7
295
296 lea .Lk_sr(%rip),%r10
297 test %rcx, %rcx
298 jnz .Lschedule_am_decrypting
299
300 # encrypting, output zeroth round key after transform
301 movdqu %xmm0, (%rdx)
302 jmp .Lschedule_go
303
304.Lschedule_am_decrypting:
305 # decrypting, output zeroth round key after shiftrows
306 movdqa (%r8,%r10),%xmm1
307 pshufb %xmm1, %xmm3
308 movdqu %xmm3, (%rdx)
309 xor \$0x30, %r8
310
311.Lschedule_go:
312 cmp \$192, %esi
313 ja .Lschedule_256
314 je .Lschedule_192
315 # 128: fall though
316
317##
318## .schedule_128
319##
320## 128-bit specific part of key schedule.
321##
322## This schedule is really simple, because all its parts
323## are accomplished by the subroutines.
324##
325.Lschedule_128:
326 mov \$10, %esi
327
328.Loop_schedule_128:
329 call _vpaes_schedule_round
330 dec %rsi
331 jz .Lschedule_mangle_last
332 call _vpaes_schedule_mangle # write output
333 jmp .Loop_schedule_128
334
335##
336## .aes_schedule_192
337##
338## 192-bit specific part of key schedule.
339##
340## The main body of this schedule is the same as the 128-bit
341## schedule, but with more smearing. The long, high side is
342## stored in %xmm7 as before, and the short, low side is in
343## the high bits of %xmm6.
344##
345## This schedule is somewhat nastier, however, because each
346## round produces 192 bits of key material, or 1.5 round keys.
347## Therefore, on each cycle we do 2 rounds and produce 3 round
348## keys.
349##
350.align 16
351.Lschedule_192:
352 movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
353 call _vpaes_schedule_transform # input transform
354 movdqa %xmm0, %xmm6 # save short part
355 pxor %xmm4, %xmm4 # clear 4
356 movhlps %xmm4, %xmm6 # clobber low side with zeros
357 mov \$4, %esi
358
359.Loop_schedule_192:
360 call _vpaes_schedule_round
361 palignr \$8,%xmm6,%xmm0
362 call _vpaes_schedule_mangle # save key n
363 call _vpaes_schedule_192_smear
364 call _vpaes_schedule_mangle # save key n+1
365 call _vpaes_schedule_round
366 dec %rsi
367 jz .Lschedule_mangle_last
368 call _vpaes_schedule_mangle # save key n+2
369 call _vpaes_schedule_192_smear
370 jmp .Loop_schedule_192
371
372##
373## .aes_schedule_256
374##
375## 256-bit specific part of key schedule.
376##
377## The structure here is very similar to the 128-bit
378## schedule, but with an additional "low side" in
379## %xmm6. The low side's rounds are the same as the
380## high side's, except no rcon and no rotation.
381##
382.align 16
383.Lschedule_256:
384 movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
385 call _vpaes_schedule_transform # input transform
386 mov \$7, %esi
387
388.Loop_schedule_256:
389 call _vpaes_schedule_mangle # output low result
390 movdqa %xmm0, %xmm6 # save cur_lo in xmm6
391
392 # high round
393 call _vpaes_schedule_round
394 dec %rsi
395 jz .Lschedule_mangle_last
396 call _vpaes_schedule_mangle
397
398 # low round. swap xmm7 and xmm6
399 pshufd \$0xFF, %xmm0, %xmm0
400 movdqa %xmm7, %xmm5
401 movdqa %xmm6, %xmm7
402 call _vpaes_schedule_low_round
403 movdqa %xmm5, %xmm7
404
405 jmp .Loop_schedule_256
406
407
408##
409## .aes_schedule_mangle_last
410##
411## Mangler for last round of key schedule
412## Mangles %xmm0
413## when encrypting, outputs out(%xmm0) ^ 63
414## when decrypting, outputs unskew(%xmm0)
415##
416## Always called right before return... jumps to cleanup and exits
417##
418.align 16
419.Lschedule_mangle_last:
420 # schedule last round key from xmm0
421 lea .Lk_deskew(%rip),%r11 # prepare to deskew
422 test %rcx, %rcx
423 jnz .Lschedule_mangle_last_dec
424
425 # encrypting
426 movdqa (%r8,%r10),%xmm1
427 pshufb %xmm1, %xmm0 # output permute
428 lea .Lk_opt(%rip), %r11 # prepare to output transform
429 add \$32, %rdx
430
431.Lschedule_mangle_last_dec:
432 add \$-16, %rdx
433 pxor .Lk_s63(%rip), %xmm0
434 call _vpaes_schedule_transform # output transform
435 movdqu %xmm0, (%rdx) # save last key
436
437 # cleanup
438 pxor %xmm0, %xmm0
439 pxor %xmm1, %xmm1
440 pxor %xmm2, %xmm2
441 pxor %xmm3, %xmm3
442 pxor %xmm4, %xmm4
443 pxor %xmm5, %xmm5
444 pxor %xmm6, %xmm6
445 pxor %xmm7, %xmm7
446 ret
447.size _vpaes_schedule_core,.-_vpaes_schedule_core
448
449##
450## .aes_schedule_192_smear
451##
452## Smear the short, low side in the 192-bit key schedule.
453##
454## Inputs:
455## %xmm7: high side, b a x y
456## %xmm6: low side, d c 0 0
457## %xmm13: 0
458##
459## Outputs:
460## %xmm6: b+c+d b+c 0 0
461## %xmm0: b+c+d b+c b a
462##
463.type _vpaes_schedule_192_smear,\@abi-omnipotent
464.align 16
465_vpaes_schedule_192_smear:
466 pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
467 pxor %xmm0, %xmm6 # -> c+d c 0 0
468 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
469 pxor %xmm0, %xmm6 # -> b+c+d b+c b a
470 movdqa %xmm6, %xmm0
471 pxor %xmm1, %xmm1
472 movhlps %xmm1, %xmm6 # clobber low side with zeros
473 ret
474.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
.type	_vpaes_schedule_round,\@abi-omnipotent
.align	16
_vpaes_schedule_round:
	# extract rcon from xmm8
	pxor	%xmm1,	%xmm1
	palignr	\$15,	%xmm8,	%xmm1
	palignr	\$15,	%xmm8,	%xmm8
	pxor	%xmm1,	%xmm7

	# rotate
	pshufd	\$0xFF,	%xmm0,	%xmm0	# broadcast high dword
	palignr	\$1,	%xmm0,	%xmm0	# rotate by one byte

	# fall through...

	# low round: same as high round, but no rotation and no rcon.
	# NOTE(review): also used as a direct entry point -- presumably by the
	# 192/256-bit paths of _vpaes_schedule_core (outside this chunk); confirm.
_vpaes_schedule_low_round:
	# smear xmm7
	movdqa	%xmm7,	%xmm1
	pslldq	\$4,	%xmm7
	pxor	%xmm1,	%xmm7
	movdqa	%xmm7,	%xmm1
	pslldq	\$8,	%xmm7
	pxor	%xmm1,	%xmm7
	pxor	.Lk_s63(%rip), %xmm7	# transformed 0x63 constant

	# subbytes (split-nibble sbox; %xmm9 = 0x0F mask, %xmm10/%xmm11 = .Lk_inv
	# tables per _vpaes_preheat)
	movdqa	%xmm9,	%xmm1
	pandn	%xmm0,	%xmm1	# high nibbles, still shifted
	psrld	\$4,	%xmm1	# 1 = i
	pand	%xmm9,	%xmm0	# 0 = k
	movdqa	%xmm11,	%xmm2	# 2 : a/k
	pshufb	%xmm0,	%xmm2	# 2 = a/k
	pxor	%xmm1,	%xmm0	# 0 = j
	movdqa	%xmm10,	%xmm3	# 3 : 1/i
	pshufb	%xmm1,	%xmm3	# 3 = 1/i
	pxor	%xmm2,	%xmm3	# 3 = iak = 1/i + a/k
	movdqa	%xmm10,	%xmm4	# 4 : 1/j
	pshufb	%xmm0,	%xmm4	# 4 = 1/j
	pxor	%xmm2,	%xmm4	# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2	# 2 : 1/iak
	pshufb	%xmm3,	%xmm2	# 2 = 1/iak
	pxor	%xmm0,	%xmm2	# 2 = io
	movdqa	%xmm10,	%xmm3	# 3 : 1/jak
	pshufb	%xmm4,	%xmm3	# 3 = 1/jak
	pxor	%xmm1,	%xmm3	# 3 = jo
	movdqa	%xmm13,	%xmm4	# 4 : sbou
	pshufb	%xmm2,	%xmm4	# 4 = sbou
	movdqa	%xmm12,	%xmm0	# 0 : sbot
	pshufb	%xmm3,	%xmm0	# 0 = sb1t
	pxor	%xmm4,	%xmm0	# 0 = sbox output

	# add in smeared stuff
	pxor	%xmm7,	%xmm0
	movdqa	%xmm0,	%xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  The two 16-byte tables at (%r11) and 16(%r11) map the low and
##  high nibble of each byte respectively; the results are xored.
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
.type	_vpaes_schedule_transform,\@abi-omnipotent
.align	16
_vpaes_schedule_transform:
	movdqa	%xmm9,	%xmm1
	pandn	%xmm0,	%xmm1
	psrld	\$4,	%xmm1	# 1 = high nibbles
	pand	%xmm9,	%xmm0	# 0 = low nibbles
	movdqa	(%r11), %xmm2	# lo table
	pshufb	%xmm0,	%xmm2
	movdqa	16(%r11), %xmm0	# hi table
	pshufb	%xmm1,	%xmm0
	pxor	%xmm2,	%xmm0	# combine lo/hi halves of the transform
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##	xor with 0x63
##	multiply by circulant 0,1,1,1
##	apply shiftrows transform
##
##  On decrypt,
##	xor with 0x63
##	multiply by "inverse mixcolumns" circulant E,B,D,9
##	deskew
##	apply shiftrows transform
##
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##
.type	_vpaes_schedule_mangle,\@abi-omnipotent
.align	16
_vpaes_schedule_mangle:
	movdqa	%xmm0,	%xmm4	# save xmm0 for later
	movdqa	.Lk_mc_forward(%rip),%xmm5
	test	%rcx, 	%rcx	# %rcx != 0 selects the decrypt-schedule path
	jnz	.Lschedule_mangle_dec

	# encrypting: accumulate the 0,1,1,1 circulant via three
	# successive mc_forward byte rotations
	add	\$16,	%rdx
	pxor	.Lk_s63(%rip),%xmm4
	pshufb	%xmm5,	%xmm4
	movdqa	%xmm4,	%xmm3
	pshufb	%xmm5,	%xmm4
	pxor	%xmm4,	%xmm3
	pshufb	%xmm5,	%xmm4
	pxor	%xmm4,	%xmm3

	jmp	.Lschedule_mangle_both
.align	16
.Lschedule_mangle_dec:
	# inverse mix columns: four split-nibble table pairs
	# (dksd/dksb/dkse/dks9 at 0x00/0x20/0x40/0x60 from %r11)
	lea	.Lk_dksd(%rip),%r11
	movdqa	%xmm9,	%xmm1
	pandn	%xmm4,	%xmm1
	psrld	\$4,	%xmm1	# 1 = hi
	pand	%xmm9,	%xmm4	# 4 = lo

	movdqa	0x00(%r11), %xmm2	# .Lk_dksd lo
	pshufb	%xmm4,	%xmm2
	movdqa	0x10(%r11), %xmm3	# .Lk_dksd hi
	pshufb	%xmm1,	%xmm3
	pxor	%xmm2,	%xmm3
	pshufb	%xmm5,	%xmm3

	movdqa	0x20(%r11), %xmm2	# .Lk_dksb lo
	pshufb	%xmm4,	%xmm2
	pxor	%xmm3,	%xmm2
	movdqa	0x30(%r11), %xmm3	# .Lk_dksb hi
	pshufb	%xmm1,	%xmm3
	pxor	%xmm2,	%xmm3
	pshufb	%xmm5,	%xmm3

	movdqa	0x40(%r11), %xmm2	# .Lk_dkse lo
	pshufb	%xmm4,	%xmm2
	pxor	%xmm3,	%xmm2
	movdqa	0x50(%r11), %xmm3	# .Lk_dkse hi
	pshufb	%xmm1,	%xmm3
	pxor	%xmm2,	%xmm3
	pshufb	%xmm5,	%xmm3

	movdqa	0x60(%r11), %xmm2	# .Lk_dks9 lo
	pshufb	%xmm4,	%xmm2
	pxor	%xmm3,	%xmm2
	movdqa	0x70(%r11), %xmm3	# .Lk_dks9 hi
	pshufb	%xmm1,	%xmm3
	pxor	%xmm2,	%xmm3

	add	\$-16,	%rdx	# decrypt schedule is written backwards

.Lschedule_mangle_both:
	# apply the shiftrows permutation for this round (round# mod 4)
	# NOTE(review): (%r8,%r10) presumably indexes .Lk_sr -- %r10 must be
	# repositioned by the caller, since _vpaes_preheat leaves it at
	# .Lk_s0F; confirm in _vpaes_schedule_core (outside this chunk).
	movdqa	(%r8,%r10),%xmm1
	pshufb	%xmm1,%xmm3
	add	\$-16,	%r8	# step round counter ...
	and	\$0x30,	%r8	# ... cycling through the 4 .Lk_sr variants
	movdqu	%xmm3,	(%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle

#
# Interface to OpenSSL
#
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@function,3
.align	16
${PREFIX}_set_encrypt_key:
___
# Win64 ABI: %xmm6-%xmm15 are callee-saved, so spill them to a local
# frame; se_handler (below) relies on exactly this layout (0xb8 frame,
# xmm save area starting at 0x10).
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Lenc_key_body:
___
$code.=<<___;
	mov	%esi,%eax	# %esi = key length in bits
	shr	\$5,%eax
	add	\$5,%eax
	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	mov	\$0,%ecx	# direction flag: 0 = build encrypt schedule
	mov	\$0x30,%r8d	# initial shiftrows index (round# mod 4, times 16)
	call	_vpaes_schedule_core
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Lenc_key_epilogue:
___
$code.=<<___;
	xor	%eax,%eax	# always return 0 (success)
	ret
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key

.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@function,3
.align	16
${PREFIX}_set_decrypt_key:
___
# Win64 ABI: spill callee-saved %xmm6-%xmm15 (frame layout matches
# se_handler below).
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Ldec_key_body:
___
$code.=<<___;
	mov	%esi,%eax	# %esi = key length in bits
	shr	\$5,%eax
	add	\$5,%eax
	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	shl	\$4,%eax	# rounds*16 bytes of round keys
	lea	16(%rdx,%rax),%rdx	# point at the last round key slot
	# NOTE(review): decrypt schedule is presumably generated back to
	# front (cf. add -16 in _vpaes_schedule_mangle); the generation
	# order itself lives in _vpaes_schedule_core, outside this chunk.

	mov	\$1,%ecx	# direction flag: 1 = build decrypt schedule
	mov	%esi,%r8d
	shr	\$1,%r8d
	and	\$32,%r8d
	xor	\$32,%r8d	# nbits==192?0:32
	call	_vpaes_schedule_core
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Ldec_key_epilogue:
___
$code.=<<___;
	xor	%eax,%eax	# always return 0 (success)
	ret
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key

# Encrypt a single 16-byte block: in = (%rdi), out = (%rsi), key = %rdx
# (key register is consumed by _vpaes_encrypt_core, defined earlier in
# this file).
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@function,3
.align	16
${PREFIX}_encrypt:
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Lenc_body:
___
$code.=<<___;
	movdqu	(%rdi),%xmm0	# load one block (may be unaligned)
	call	_vpaes_preheat
	call	_vpaes_encrypt_core
	movdqu	%xmm0,(%rsi)	# store the ciphertext block
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Lenc_epilogue:
___
$code.=<<___;
	ret
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

# Decrypt a single 16-byte block: in = (%rdi), out = (%rsi), key = %rdx
# (key register is consumed by _vpaes_decrypt_core, defined earlier in
# this file).
.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@function,3
.align	16
${PREFIX}_decrypt:
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Ldec_body:
___
$code.=<<___;
	movdqu	(%rdi),%xmm0	# load one block (may be unaligned)
	call	_vpaes_preheat
	call	_vpaes_decrypt_core
	movdqu	%xmm0,(%rsi)	# store the plaintext block
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Ldec_epilogue:
___
$code.=<<___;
	ret
.size	${PREFIX}_decrypt,.-${PREFIX}_decrypt
{
my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#                       size_t length, const AES_KEY *key,
#                       unsigned char *ivp, const int enc);
# Processes floor(length/16) blocks; lengths < 16 are ignored.
$code.=<<___;
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
.align	16
${PREFIX}_cbc_encrypt:
	xchg	$key,$len	# key arg moves to %rdx (where the cores expect it)
___
($len,$key)=($key,$len);	# rename the Perl variables to match the xchg
$code.=<<___;
	sub	\$16,$len
	jc	.Lcbc_abort	# less than one full block: nothing to do
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Lcbc_body:
___
$code.=<<___;
	movdqu	($ivp),%xmm6	# load IV
	sub	$inp,$out	# keep out as (out - inp) so one index serves both
	call	_vpaes_preheat
	cmp	\$0,${enc}d
	je	.Lcbc_dec_loop
	jmp	.Lcbc_enc_loop
.align	16
.Lcbc_enc_loop:
	movdqu	($inp),%xmm0
	pxor	%xmm6,%xmm0	# xor in IV / previous ciphertext
	call	_vpaes_encrypt_core
	movdqa	%xmm0,%xmm6	# ciphertext becomes the next chaining value
	movdqu	%xmm0,($out,$inp)
	lea	16($inp),$inp
	sub	\$16,$len
	jnc	.Lcbc_enc_loop
	jmp	.Lcbc_done
.align	16
.Lcbc_dec_loop:
	movdqu	($inp),%xmm0
	movdqa	%xmm0,%xmm7	# stash ciphertext: it is the next block's IV
	call	_vpaes_decrypt_core
	pxor	%xmm6,%xmm0	# xor with previous ciphertext / IV
	movdqa	%xmm7,%xmm6
	movdqu	%xmm0,($out,$inp)
	lea	16($inp),$inp
	sub	\$16,$len
	jnc	.Lcbc_dec_loop
.Lcbc_done:
	movdqu	%xmm6,($ivp)	# save IV
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Lcbc_epilogue:
___
$code.=<<___;
.Lcbc_abort:
	ret
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
$code.=<<___;
##
##  _aes_preheat
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
##  (%r10 actually points at .Lk_s0F inside _vpaes_consts; the
##  negative offsets below reach back to .Lk_inv.)
##
.type	_vpaes_preheat,\@abi-omnipotent
.align	16
_vpaes_preheat:
	lea	.Lk_s0F(%rip), %r10
	movdqa	-0x20(%r10), %xmm10	# .Lk_inv
	movdqa	-0x10(%r10), %xmm11	# .Lk_inv+16
	movdqa	0x00(%r10), %xmm9	# .Lk_s0F
	movdqa	0x30(%r10), %xmm13	# .Lk_sb1
	movdqa	0x40(%r10), %xmm12	# .Lk_sb1+16
	movdqa	0x50(%r10), %xmm15	# .Lk_sb2
	movdqa	0x60(%r10), %xmm14	# .Lk_sb2+16
	ret
.size	_vpaes_preheat,.-_vpaes_preheat
########################################################
##                                                    ##
##                     Constants                      ##
##                                                    ##
########################################################
.type	_vpaes_consts,\@object
.align	64
_vpaes_consts:
.Lk_inv:	# inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809

.Lk_s0F:	# s0F
	.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F

.Lk_ipt:	# input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81

.Lk_sb1:	# sb1u, sb1t
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.Lk_sb2:	# sb2u, sb2t
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.Lk_sbo:	# sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA

.Lk_mc_forward:	# mc_forward (4 variants, one per round# mod 4)
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605

.Lk_mc_backward:# mc_backward (4 variants)
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F

.Lk_sr:		# sr (4 shiftrows variants)
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

.Lk_rcon:	# rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_s63:	# s63: all equal to 0x63 transformed
	.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B

.Lk_opt:	# output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0

.Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

##
##  Decryption stuff
##  Key schedule constants
##
.Lk_dksd:	# decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	# decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	# decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

##
##  Decryption stuff
##  Round function constants
##
.Lk_dipt:	# decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194

.Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
.Lk_dsbo:	# decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
# Identification string emitted into the object. NOTE(review): the
# "Permutaion" typo below is in the emitted bytes; fixing it would change
# the binary, so it is deliberately left as-is here.
.asciz	"Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
.align	64
.size	_vpaes_consts,.-_vpaes_consts
___
1066
if ($win64) {
# Win64 structured-exception-handling support: if an exception unwinds
# through one of the functions above while the %xmm spill frame is live,
# se_handler restores the saved %xmm6-%xmm15 and the stack pointer into
# the CONTEXT so unwinding can continue correctly.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	# HandlerData[0]/[1] are the body/epilogue labels recorded in the
	# .xdata entries below; outside that window nothing is spilled yet.
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	# copy the 10 spilled xmm registers from the stack frame back into
	# the CONTEXT, and pop the 0xb8-byte frame
	lea	16(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xb8(%rax),%rax		# adjust stack pointer

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

# Unwind tables: one RUNTIME_FUNCTION per public entry point.
# NOTE(review): the .LSEH_begin/.LSEH_end labels are presumably emitted
# by the perlasm x86_64 translator for \@function symbols -- they are not
# defined in this file; confirm against x86_64-xlate.pl.
.section	.pdata
.align	4
	.rva	.LSEH_begin_${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_${PREFIX}_set_encrypt_key
	.rva	.LSEH_info_${PREFIX}_set_encrypt_key

	.rva	.LSEH_begin_${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_${PREFIX}_set_decrypt_key
	.rva	.LSEH_info_${PREFIX}_set_decrypt_key

	.rva	.LSEH_begin_${PREFIX}_encrypt
	.rva	.LSEH_end_${PREFIX}_encrypt
	.rva	.LSEH_info_${PREFIX}_encrypt

	.rva	.LSEH_begin_${PREFIX}_decrypt
	.rva	.LSEH_end_${PREFIX}_decrypt
	.rva	.LSEH_info_${PREFIX}_decrypt

	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_${PREFIX}_cbc_encrypt

.section	.xdata
.align	8
.LSEH_info_${PREFIX}_set_encrypt_key:
	.byte	9,0,0,0		# presumably version 1 | UNW_FLAG_EHANDLER -- confirm
	.rva	se_handler
	.rva	.Lenc_key_body,.Lenc_key_epilogue	# HandlerData[]
.LSEH_info_${PREFIX}_set_decrypt_key:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec_key_body,.Ldec_key_epilogue	# HandlerData[]
.LSEH_info_${PREFIX}_encrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc_body,.Lenc_epilogue		# HandlerData[]
.LSEH_info_${PREFIX}_decrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec_body,.Ldec_epilogue		# HandlerData[]
.LSEH_info_${PREFIX}_cbc_encrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_body,.Lcbc_epilogue		# HandlerData[]
___
}

# Post-process the generated text: every `...` span embedded in the
# assembly is a Perl expression (e.g. constant arithmetic like `1232/8`);
# evaluate each one and splice the result in, then emit the final source.
$code =~ s{\`([^\`]*)\`}{eval $1}gem;

print STDOUT $code;

close(STDOUT);