From 2b6e09b39ef1d803b50ee024a06d1c250fde442d Mon Sep 17 00:00:00 2001
From: djm <>
Date: Mon, 6 Apr 2009 06:30:10 +0000
Subject: import of OpenSSL 0.9.8k

---
 src/lib/libcrypto/aes/aes_x86core.c           | 1063 ++++++++++++++++++++
 src/lib/libcrypto/aes/asm/aes-armv4.pl        | 1030 +++++++++++++++++++
 src/lib/libcrypto/aes/asm/aes-ppc.pl          | 1176 ++++++++++++++++++++++
 src/lib/libcrypto/aes/asm/aes-s390x.pl        | 1333 +++++++++++++++++++++++++
 src/lib/libcrypto/aes/asm/aes-sparcv9.pl      | 1181 ++++++++++++++++++++++
 src/lib/libcrypto/asn1/a_bytes.c              |    2 +-
 src/lib/libcrypto/asn1/ameth_lib.c            |  446 +++++++++
 src/lib/libcrypto/asn1/asn1.h                 |    3 +
 src/lib/libcrypto/asn1/asn1_err.c             |    2 +
 src/lib/libcrypto/asn1/asn1_locl.h            |  134 +++
 src/lib/libcrypto/asn1/asn1_par.c             |    2 +
 src/lib/libcrypto/asn1/asn_mime.c             |    2 -
 src/lib/libcrypto/asn1/bio_asn1.c             |  495 +++++++++
 src/lib/libcrypto/asn1/bio_ndef.c             |  246 +++++
 src/lib/libcrypto/asn1/t_x509.c               |    2 +-
 src/lib/libcrypto/asn1/tasn_dec.c             |   44 +-
 src/lib/libcrypto/asn1/x_nx509.c              |   72 ++
 src/lib/libcrypto/bio/bss_mem.c               |   22 +-
 src/lib/libcrypto/bio/bss_sock.c              |    5 +
 src/lib/libcrypto/bn/asm/alpha-mont.pl        |  317 ++++++
 src/lib/libcrypto/bn/asm/armv4-mont.pl        |  200 ++++
 src/lib/libcrypto/bn/asm/ppc-mont.pl          |  323 ++++++
 src/lib/libcrypto/bn/asm/ppc64-mont.pl        |  918 +++++++++++++++++
 src/lib/libcrypto/bn/asm/s390x-mont.pl        |  225 +++++
 src/lib/libcrypto/bn/asm/s390x.S              |  678 +++++++++++++
 src/lib/libcrypto/bn/asm/sparcv9-mont.pl      |  606 +++++++++++
 src/lib/libcrypto/bn/asm/sparcv9a-mont.pl     |  882 ++++++++++++++++
 src/lib/libcrypto/bn/asm/via-mont.pl          |  242 +++++
 src/lib/libcrypto/bn/asm/x86-mont.pl          |  591 +++++++++++
 src/lib/libcrypto/camellia/asm/cmll-x86.pl    | 1138 +++++++++++++++++++++
 src/lib/libcrypto/camellia/asm/cmll-x86_64.pl | 1080 ++++++++++++++++++++
 src/lib/libcrypto/cms/cms_smime.c             |    4 +-
 src/lib/libcrypto/des/asm/des_enc.m4          |  345 ++++---
 src/lib/libcrypto/opensslv.h                  |    6 +-
 src/lib/libcrypto/pem/pem.h                   |    4 +
 src/lib/libcrypto/pkcs12/p12_crt.c            |    3 +
 src/lib/libcrypto/pkcs7/pk7_smime.c           |    3 +-
 src/lib/libcrypto/ppccpuid.pl                 |   94 ++
 src/lib/libcrypto/ripemd/README               |    2 +-
 src/lib/libcrypto/s390xcpuid.S                |   90 ++
 src/lib/libcrypto/sha/asm/sha1-ia64.pl        |    1 +
 src/lib/libcrypto/sparcv9cap.c                |  154 +++
 src/lib/libcrypto/ui/ui_lib.c                 |    1 +
 src/lib/libcrypto/x509/x509_cmp.c             |    3 +-
 src/lib/libcrypto/x509/x509_vpm.c             |   16 +-
 src/lib/libcrypto/x509v3/v3_cpols.c           |    7 +-
 src/lib/libcrypto/x509v3/v3_utl.c             |    2 +-
 src/lib/libssl/s3_clnt.c                      |    2 +-
 src/lib/libssl/ssl_ciph.c                     |    2 +-
 src/lib/libssl/ssl_lib.c                      |    2 +
 src/lib/libssl/test/times                     |    2 +-
 51 files changed, 15047 insertions(+), 156 deletions(-)
 create mode 100644 src/lib/libcrypto/aes/aes_x86core.c
 create mode 100644 src/lib/libcrypto/aes/asm/aes-armv4.pl
 create mode 100644 src/lib/libcrypto/aes/asm/aes-ppc.pl
 create mode 100644 src/lib/libcrypto/aes/asm/aes-s390x.pl
 create mode 100755 src/lib/libcrypto/aes/asm/aes-sparcv9.pl
 create mode 100644 src/lib/libcrypto/asn1/ameth_lib.c
 create mode 100644 src/lib/libcrypto/asn1/asn1_locl.h
 create mode 100644 src/lib/libcrypto/asn1/bio_asn1.c
 create mode 100644 src/lib/libcrypto/asn1/bio_ndef.c
 create mode 100644 src/lib/libcrypto/asn1/x_nx509.c
 create mode 100644 src/lib/libcrypto/bn/asm/alpha-mont.pl
 create mode 100644 src/lib/libcrypto/bn/asm/armv4-mont.pl
 create mode 100644 src/lib/libcrypto/bn/asm/ppc-mont.pl
 create mode 100644 src/lib/libcrypto/bn/asm/ppc64-mont.pl
 create mode 100644 src/lib/libcrypto/bn/asm/s390x-mont.pl
 create mode 100755 src/lib/libcrypto/bn/asm/s390x.S
 create mode 100644 src/lib/libcrypto/bn/asm/sparcv9-mont.pl
 create mode 100755 src/lib/libcrypto/bn/asm/sparcv9a-mont.pl
 create mode 100644 src/lib/libcrypto/bn/asm/via-mont.pl
 create mode 100755 src/lib/libcrypto/bn/asm/x86-mont.pl
 create mode 100644 src/lib/libcrypto/camellia/asm/cmll-x86.pl
 create mode 100644 src/lib/libcrypto/camellia/asm/cmll-x86_64.pl
 create mode 100755 src/lib/libcrypto/ppccpuid.pl
 create mode 100644 src/lib/libcrypto/s390xcpuid.S
 create mode 100644 src/lib/libcrypto/sparcv9cap.c


diff --git a/src/lib/libcrypto/aes/aes_x86core.c b/src/lib/libcrypto/aes/aes_x86core.c
new file mode 100644
index 0000000000..d323e265c0
--- /dev/null
+++ b/src/lib/libcrypto/aes/aes_x86core.c
@@ -0,0 +1,1063 @@
+/* crypto/aes/aes_x86core.c -*- mode:C; c-file-style: "eay" -*- */
+/**
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This is an experimental x86[_64] derivative. It assumes little-endian
+ * byte order and expects the CPU to sustain unaligned memory references.
+ * It is used as a playground for cache-timing attack mitigations and
+ * serves as the reference C implementation for the x86[_64] assembler.
+ *
+ *					<appro@fy.chalmers.se>
+ */
+
+
+#ifndef AES_DEBUG
+# ifndef NDEBUG
+#  define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+#include <stdlib.h>
+#include <openssl/aes.h>
+#include "aes_locl.h"
+
+/*
+ * These two parameters control which table, the 256-byte one or the
+ * 2KB one, is referenced in the outer and inner rounds respectively.
+ */
+#define AES_COMPACT_IN_OUTER_ROUNDS
+#ifdef  AES_COMPACT_IN_OUTER_ROUNDS
+/* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, and adding
+ * AES_COMPACT_IN_INNER_ROUNDS on top of it slows the benchmark down
+ * by a further factor of ~2. */
+# undef  AES_COMPACT_IN_INNER_ROUNDS
+#endif
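+
+/*
+ * "Compact" refers to the 256-byte Te4/Td4 tables defined below, as
+ * opposed to the 2KB Te/Td tables.  The small tables fit in a few
+ * cache lines and can be prefetched in full (see prefetch256 below),
+ * shrinking the cache footprint that a timing attacker can probe, at
+ * the cost of recomputing the MixColumns step in plain C.
+ */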
+
+#if 1
+static void prefetch256(const void *table)
+{
+	volatile unsigned long *t=(void *)table,ret;
+	unsigned long sum;
+	int i;
+
+	/* 32 bytes is a common minimum cache-line size */
+	for (sum=0,i=0;i<256/sizeof(t[0]);i+=32/sizeof(t[0]))	sum ^= t[i];
+
+	ret = sum;
+}
+#else
+# define prefetch256(t)
+#endif
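+
+/*
+ * prefetch256() above touches one word per 32-byte cache line across
+ * the whole 256-byte table, pulling every line into cache before the
+ * table is indexed with secret-dependent bytes; the volatile
+ * qualifiers keep the loop from being optimized away.
+ */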
+
+#undef GETU32
+#define GETU32(p) (*((u32*)(p)))
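+
+/* GETU32 is redefined as a plain (possibly unaligned) 32-bit load;
+ * this is only valid under the little-endian, unaligned-access
+ * assumptions stated at the top of this file. */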
+
+#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
+typedef unsigned __int64 u64;
+#define U64(C)	C##UI64
+#elif defined(__arch64__)
+typedef unsigned long u64;
+#define U64(C)	C##UL
+#else
+typedef unsigned long long u64;
+#define U64(C)	C##ULL
+#endif
+
+#undef ROTATE
+#if defined(_MSC_VER) || defined(__ICC)
+# define ROTATE(a,n)	_lrotl(a,n)
+#elif defined(__GNUC__) && __GNUC__>=2
+# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
+#   define ROTATE(a,n)	({ register unsigned int ret;	\
+				asm (			\
+				"roll %1,%0"		\
+				: "=r"(ret)		\
+				: "I"(n), "0"(a)	\
+				: "cc");		\
+			   ret;				\
+			})
+# endif
+#endif
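+
+/* ROTATE(a,n) is a left-rotate by n bits.  On compilers where neither
+ * variant above applies it remains undefined, and the code falls back
+ * to the equivalent pairs of shifts in the !defined(ROTATE) branches
+ * below. */
+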
+/*
+Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
+Te0[x] = S [x].[02, 01, 01, 03];
+Te1[x] = S [x].[03, 02, 01, 01];
+Te2[x] = S [x].[01, 03, 02, 01];
+Te3[x] = S [x].[01, 01, 03, 02];
+*/
+#define Te0 (u32)((u64*)((u8*)Te+0))
+#define Te1 (u32)((u64*)((u8*)Te+3))
+#define Te2 (u32)((u64*)((u8*)Te+2))
+#define Te3 (u32)((u64*)((u8*)Te+1))
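+
+/*
+ * Each 64-bit Te[] entry below holds the same 32-bit word twice, so a
+ * u32 read at byte offset 0, 3, 2 or 1 into an entry yields that word
+ * left-rotated by 0, 8, 16 or 24 bits on a little-endian machine.
+ * Te0..Te3 above thus provide the four rotated lookup tables of a
+ * conventional AES implementation out of one 2KB table.
+ */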
+/*
+Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
+Td0[x] = Si[x].[0e, 09, 0d, 0b];
+Td1[x] = Si[x].[0b, 0e, 09, 0d];
+Td2[x] = Si[x].[0d, 0b, 0e, 09];
+Td3[x] = Si[x].[09, 0d, 0b, 0e];
+Td4[x] = Si[x].[01];
+*/
+#define Td0 (u32)((u64*)((u8*)Td+0))
+#define Td1 (u32)((u64*)((u8*)Td+3))
+#define Td2 (u32)((u64*)((u8*)Td+2))
+#define Td3 (u32)((u64*)((u8*)Td+1))
+
+static const u64 Te[256] = {
+    U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
+    U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
+    U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
+    U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
+    U64(0x5030306050303060), U64(0x0301010203010102),
+    U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
+    U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
+    U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
+    U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
+    U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
+    U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
+    U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
+    U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
+    U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
+    U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
+    U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
+    U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
+    U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
+    U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
+    U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
+    U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
+    U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
+    U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
+    U64(0x5331316253313162), U64(0x3f15152a3f15152a),
+    U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
+    U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
+    U64(0x2818183028181830), U64(0xa1969637a1969637),
+    U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
+    U64(0x0907070e0907070e), U64(0x3612122436121224),
+    U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
+    U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
+    U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
+    U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
+    U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
+    U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
+    U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
+    U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
+    U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
+    U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
+    U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
+    U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
+    U64(0x0000000000000000), U64(0x2cededc12cededc1),
+    U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
+    U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
+    U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
+    U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
+    U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
+    U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
+    U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
+    U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
+    U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
+    U64(0x5533336655333366), U64(0x9485851194858511),
+    U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
+    U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
+    U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
+    U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
+    U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
+    U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
+    U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
+    U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
+    U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
+    U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
+    U64(0x3010102030101020), U64(0x1affffe51affffe5),
+    U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
+    U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
+    U64(0x3513132635131326), U64(0x2fececc32fececc3),
+    U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
+    U64(0xcc444488cc444488), U64(0x3917172e3917172e),
+    U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
+    U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
+    U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
+    U64(0x2b1919322b191932), U64(0x957373e6957373e6),
+    U64(0xa06060c0a06060c0), U64(0x9881811998818119),
+    U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
+    U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
+    U64(0xab90903bab90903b), U64(0x8388880b8388880b),
+    U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
+    U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
+    U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
+    U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
+    U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
+    U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
+    U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
+    U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
+    U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
+    U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
+    U64(0xa8919139a8919139), U64(0xa4959531a4959531),
+    U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
+    U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
+    U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
+    U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
+    U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
+    U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
+    U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
+    U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
+    U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
+    U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
+    U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
+    U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
+    U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
+    U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
+    U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
+    U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
+    U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
+    U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
+    U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
+    U64(0xd8484890d8484890), U64(0x0503030605030306),
+    U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
+    U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
+    U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
+    U64(0x9186861791868617), U64(0x58c1c19958c1c199),
+    U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
+    U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
+    U64(0xb398982bb398982b), U64(0x3311112233111122),
+    U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
+    U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
+    U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
+    U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
+    U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
+    U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
+    U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
+    U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
+    U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
+    U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
+    U64(0xc3414182c3414182), U64(0xb0999929b0999929),
+    U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
+    U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
+    U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
+};
+
+static const u8 Te4[256] = {
+    0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
+    0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
+    0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
+    0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
+    0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
+    0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
+    0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
+    0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
+    0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
+    0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
+    0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
+    0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
+    0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
+    0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
+    0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
+    0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
+    0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
+    0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
+    0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
+    0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
+    0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
+    0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
+    0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
+    0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
+    0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
+    0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
+    0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
+    0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
+    0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
+    0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
+    0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
+    0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
+};
+
+static const u64 Td[256] = {
+    U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
+    U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
+    U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
+    U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
+    U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
+    U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
+    U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
+    U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
+    U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
+    U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
+    U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
+    U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
+    U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
+    U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
+    U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
+    U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
+    U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
+    U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
+    U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
+    U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
+    U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
+    U64(0x6033519760335197), U64(0x457f5362457f5362),
+    U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
+    U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
+    U64(0x5868487058684870), U64(0x19fd458f19fd458f),
+    U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
+    U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
+    U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
+    U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
+    U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
+    U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
+    U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
+    U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
+    U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
+    U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
+    U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
+    U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
+    U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
+    U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
+    U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
+    U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
+    U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
+    U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
+    U64(0x6fd406046fd40604), U64(0xff155060ff155060),
+    U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
+    U64(0xcc434089cc434089), U64(0x779ed967779ed967),
+    U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
+    U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
+    U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
+    U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
+    U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
+    U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
+    U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
+    U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
+    U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
+    U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
+    U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
+    U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
+    U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
+    U64(0x694b775a694b775a), U64(0x161a121c161a121c),
+    U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
+    U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
+    U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
+    U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
+    U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
+    U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
+    U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
+    U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
+    U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
+    U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
+    U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
+    U64(0x4022971340229713), U64(0x2011c6842011c684),
+    U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
+    U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
+    U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
+    U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
+    U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
+    U64(0xfa489411fa489411), U64(0x2264e9472264e947),
+    U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
+    U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
+    U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
+    U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
+    U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
+    U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
+    U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
+    U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
+    U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
+    U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
+    U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
+    U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
+    U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
+    U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
+    U64(0x097826cd097826cd), U64(0xf418596ef418596e),
+    U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
+    U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
+    U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
+    U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
+    U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
+    U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
+    U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
+    U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
+    U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
+    U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
+    U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
+    U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
+    U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
+    U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
+    U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
+    U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
+    U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
+    U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
+    U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
+    U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
+    U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
+    U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
+    U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
+    U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
+    U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
+    U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
+    U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
+    U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
+    U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
+    U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
+    U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
+    U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
+    U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
+    U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
+    U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
+};
+static const u8 Td4[256] = {
+    0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
+    0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
+    0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
+    0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
+    0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
+    0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
+    0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
+    0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
+    0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
+    0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
+    0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
+    0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
+    0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
+    0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
+    0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
+    0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
+    0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
+    0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
+    0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
+    0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
+    0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
+    0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
+    0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
+    0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
+    0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
+    0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
+    0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
+    0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
+    0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
+    0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
+    0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
+    0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
+};
+
+static const u32 rcon[] = {
+    0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
+    0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
+    0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
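+
+/*
+ * The round constants are the successive powers of x (0x02) in
+ * GF(2^8).  They sit in the low byte of each word because GETU32
+ * above loads the key as a raw little-endian word, so the first key
+ * byte of a column lands in the low byte of the corresponding u32.
+ */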
+
+/**
+ * Expand the cipher key into the encryption key schedule.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+			AES_KEY *key) {
+
+	u32 *rk;
+   	int i = 0;
+	u32 temp;
+
+	if (!userKey || !key)
+		return -1;
+	if (bits != 128 && bits != 192 && bits != 256)
+		return -2;
+
+	rk = key->rd_key;
+
+	if (bits==128)
+		key->rounds = 10;
+	else if (bits==192)
+		key->rounds = 12;
+	else
+		key->rounds = 14;
+
+	rk[0] = GETU32(userKey     );
+	rk[1] = GETU32(userKey +  4);
+	rk[2] = GETU32(userKey +  8);
+	rk[3] = GETU32(userKey + 12);
+	if (bits == 128) {
+		while (1) {
+			temp  = rk[3];
+			rk[4] = rk[0] ^
+				(Te4[(temp >>  8) & 0xff]      ) ^
+				(Te4[(temp >> 16) & 0xff] <<  8) ^
+				(Te4[(temp >> 24)       ] << 16) ^
+				(Te4[(temp      ) & 0xff] << 24) ^
+				rcon[i];
+			rk[5] = rk[1] ^ rk[4];
+			rk[6] = rk[2] ^ rk[5];
+			rk[7] = rk[3] ^ rk[6];
+			if (++i == 10) {
+				return 0;
+			}
+			rk += 4;
+		}
+	}
+	rk[4] = GETU32(userKey + 16);
+	rk[5] = GETU32(userKey + 20);
+	if (bits == 192) {
+		while (1) {
+			temp = rk[ 5];
+			rk[ 6] = rk[ 0] ^
+				(Te4[(temp >>  8) & 0xff]      ) ^
+				(Te4[(temp >> 16) & 0xff] <<  8) ^
+				(Te4[(temp >> 24)       ] << 16) ^
+				(Te4[(temp      ) & 0xff] << 24) ^
+				rcon[i];
+			rk[ 7] = rk[ 1] ^ rk[ 6];
+			rk[ 8] = rk[ 2] ^ rk[ 7];
+			rk[ 9] = rk[ 3] ^ rk[ 8];
+			if (++i == 8) {
+				return 0;
+			}
+			rk[10] = rk[ 4] ^ rk[ 9];
+			rk[11] = rk[ 5] ^ rk[10];
+			rk += 6;
+		}
+	}
+	rk[6] = GETU32(userKey + 24);
+	rk[7] = GETU32(userKey + 28);
+	if (bits == 256) {
+		while (1) {
+			temp = rk[ 7];
+			rk[ 8] = rk[ 0] ^
+				(Te4[(temp >>  8) & 0xff]      ) ^
+				(Te4[(temp >> 16) & 0xff] <<  8) ^
+				(Te4[(temp >> 24)       ] << 16) ^
+				(Te4[(temp      ) & 0xff] << 24) ^
+				rcon[i];
+			rk[ 9] = rk[ 1] ^ rk[ 8];
+			rk[10] = rk[ 2] ^ rk[ 9];
+			rk[11] = rk[ 3] ^ rk[10];
+			if (++i == 7) {
+				return 0;
+			}
+			temp = rk[11];
+			rk[12] = rk[ 4] ^
+				(Te4[(temp      ) & 0xff]      ) ^
+				(Te4[(temp >>  8) & 0xff] <<  8) ^
+				(Te4[(temp >> 16) & 0xff] << 16) ^
+				(Te4[(temp >> 24)       ] << 24);
+			rk[13] = rk[ 5] ^ rk[12];
+			rk[14] = rk[ 6] ^ rk[13];
+			rk[15] = rk[ 7] ^ rk[14];
+
+			rk += 8;
+        	}
+	}
+	return 0;
+}
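+
+/*
+ * A minimal caller-side sketch of this API (not part of this file's
+ * logic):
+ *
+ *	AES_KEY ks;
+ *	unsigned char key[16], in[16], out[16];
+ *
+ *	if (AES_set_encrypt_key(key, 128, &ks) == 0)
+ *		AES_encrypt(in, out, &ks);	(one 16-byte block)
+ */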
+
+/**
+ * Expand the cipher key into the decryption key schedule.
+ */
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+			 AES_KEY *key) {
+
+        u32 *rk;
+	int i, j, status;
+	u32 temp;
+
+	/* first, start with an encryption schedule */
+	status = AES_set_encrypt_key(userKey, bits, key);
+	if (status < 0)
+		return status;
+
+	rk = key->rd_key;
+
+	/* invert the order of the round keys: */
+	for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
+		temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
+		temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+		temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+		temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+	}
+	/* apply the inverse MixColumn transform to all round keys but the first and the last: */
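+	/*
+	 * The bit-twiddling below implements this directly: m extracts
+	 * the high bit of each of the four bytes packed in a u32, and
+	 * (m - (m >> 7)) & 0x1b1b1b1b expands each such bit into the
+	 * GF(2^8) reduction constant 0x1b, so every tp2/tp4/tp8 step is
+	 * a branch-free, table-free xtime (multiply by x) applied to all
+	 * four bytes at once.
+	 */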
+	for (i = 1; i < (key->rounds); i++) {
+		rk += 4;
+#if 1
+		for (j = 0; j < 4; j++) {
+			u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
+
+			tp1 = rk[j];
+			m = tp1 & 0x80808080;
+			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
+				((m - (m >> 7)) & 0x1b1b1b1b);
+			m = tp2 & 0x80808080;
+			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
+				((m - (m >> 7)) & 0x1b1b1b1b);
+			m = tp4 & 0x80808080;
+			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
+				((m - (m >> 7)) & 0x1b1b1b1b);
+			tp9 = tp8 ^ tp1;
+			tpb = tp9 ^ tp2;
+			tpd = tp9 ^ tp4;
+			tpe = tp8 ^ tp4 ^ tp2;
+#if defined(ROTATE)
+			rk[j] = tpe ^ ROTATE(tpd,16) ^
+				ROTATE(tp9,8) ^ ROTATE(tpb,24);
+#else
+			rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ 
+				(tp9 >> 24) ^ (tp9 << 8) ^
+				(tpb >> 8) ^ (tpb << 24);
+#endif
+		}
+#else
+		rk[0] =
+			Td0[Te2[(rk[0]      ) & 0xff] & 0xff] ^
+			Td1[Te2[(rk[0] >>  8) & 0xff] & 0xff] ^
+			Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
+			Td3[Te2[(rk[0] >> 24)       ] & 0xff];
+		rk[1] =
+			Td0[Te2[(rk[1]      ) & 0xff] & 0xff] ^
+			Td1[Te2[(rk[1] >>  8) & 0xff] & 0xff] ^
+			Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
+			Td3[Te2[(rk[1] >> 24)       ] & 0xff];
+		rk[2] =
+			Td0[Te2[(rk[2]      ) & 0xff] & 0xff] ^
+			Td1[Te2[(rk[2] >>  8) & 0xff] & 0xff] ^
+			Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
+			Td3[Te2[(rk[2] >> 24)       ] & 0xff];
+		rk[3] =
+			Td0[Te2[(rk[3]      ) & 0xff] & 0xff] ^
+			Td1[Te2[(rk[3] >>  8) & 0xff] & 0xff] ^
+			Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
+			Td3[Te2[(rk[3] >> 24)       ] & 0xff];
+#endif
+	}
+	return 0;
+}
+
+/*
+ * Encrypt a single block
+ * in and out can overlap
+ */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+		 const AES_KEY *key) {
+
+	const u32 *rk;
+	u32 s0, s1, s2, s3, t[4];
+	int r;
+
+	assert(in && out && key);
+	rk = key->rd_key;
+
+	/*
+	 * map byte array block to cipher state
+	 * and add initial round key:
+	 */
+	s0 = GETU32(in     ) ^ rk[0];
+	s1 = GETU32(in +  4) ^ rk[1];
+	s2 = GETU32(in +  8) ^ rk[2];
+	s3 = GETU32(in + 12) ^ rk[3];
+
+#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
+	prefetch256(Te4);
+
+	t[0] =	Te4[(s0      ) & 0xff]       ^
+		Te4[(s1 >>  8) & 0xff] <<  8 ^
+		Te4[(s2 >> 16) & 0xff] << 16 ^
+		Te4[(s3 >> 24)       ] << 24;
+	t[1] =	Te4[(s1      ) & 0xff]       ^
+		Te4[(s2 >>  8) & 0xff] <<  8 ^
+		Te4[(s3 >> 16) & 0xff] << 16 ^
+		Te4[(s0 >> 24)       ] << 24;
+	t[2] =	Te4[(s2      ) & 0xff]       ^
+		Te4[(s3 >>  8) & 0xff] <<  8 ^
+		Te4[(s0 >> 16) & 0xff] << 16 ^
+		Te4[(s1 >> 24)       ] << 24;
+	t[3] =	Te4[(s3      ) & 0xff]       ^
+		Te4[(s0 >>  8) & 0xff] <<  8 ^
+		Te4[(s1 >> 16) & 0xff] << 16 ^
+		Te4[(s2 >> 24)       ] << 24;
+
+	/* now do the linear transform using words */
+	{	int i;
+		u32 r0, r1, r2;
+
+		for (i = 0; i < 4; i++) {
+			r0 = t[i];
+			r1 = r0 & 0x80808080;
+			r2 = ((r0 & 0x7f7f7f7f) << 1) ^
+				((r1 - (r1 >> 7)) & 0x1b1b1b1b);
+#if defined(ROTATE)
+			t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
+				ROTATE(r0,16) ^ ROTATE(r0,8);
+#else
+			t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
+				(r0 << 16) ^ (r0 >> 16) ^
+				(r0 << 8) ^ (r0 >> 24);
+#endif
+			t[i] ^= rk[4+i];
+		}
+	}
+#else
+	t[0] =	Te0[(s0      ) & 0xff] ^
+		Te1[(s1 >>  8) & 0xff] ^
+		Te2[(s2 >> 16) & 0xff] ^
+		Te3[(s3 >> 24)       ] ^
+		rk[4];
+	t[1] =	Te0[(s1      ) & 0xff] ^
+		Te1[(s2 >>  8) & 0xff] ^
+		Te2[(s3 >> 16) & 0xff] ^
+		Te3[(s0 >> 24)       ] ^
+		rk[5];
+	t[2] =	Te0[(s2      ) & 0xff] ^
+		Te1[(s3 >>  8) & 0xff] ^
+		Te2[(s0 >> 16) & 0xff] ^
+		Te3[(s1 >> 24)       ] ^
+		rk[6];
+	t[3] =	Te0[(s3      ) & 0xff] ^
+		Te1[(s0 >>  8) & 0xff] ^
+		Te2[(s1 >> 16) & 0xff] ^
+		Te3[(s2 >> 24)       ] ^
+		rk[7];
+#endif
+	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
+
+    /*
+     * Nr - 2 full rounds:
+     */
+    for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
+#if defined(AES_COMPACT_IN_INNER_ROUNDS)
+	t[0] =	Te4[(s0      ) & 0xff]       ^
+		Te4[(s1 >>  8) & 0xff] <<  8 ^
+		Te4[(s2 >> 16) & 0xff] << 16 ^
+		Te4[(s3 >> 24)       ] << 24;
+	t[1] =	Te4[(s1      ) & 0xff]       ^
+		Te4[(s2 >>  8) & 0xff] <<  8 ^
+		Te4[(s3 >> 16) & 0xff] << 16 ^
+		Te4[(s0 >> 24)       ] << 24;
+	t[2] =	Te4[(s2      ) & 0xff]       ^
+		Te4[(s3 >>  8) & 0xff] <<  8 ^
+		Te4[(s0 >> 16) & 0xff] << 16 ^
+		Te4[(s1 >> 24)       ] << 24;
+	t[3] =	Te4[(s3      ) & 0xff]       ^
+		Te4[(s0 >>  8) & 0xff] <<  8 ^
+		Te4[(s1 >> 16) & 0xff] << 16 ^
+		Te4[(s2 >> 24)       ] << 24;
+
+	/* now do the linear transform using words */
+	{	int i;
+		u32 r0, r1, r2;
+
+		for (i = 0; i < 4; i++) {
+			r0 = t[i];
+			r1 = r0 & 0x80808080;
+			r2 = ((r0 & 0x7f7f7f7f) << 1) ^
+				((r1 - (r1 >> 7)) & 0x1b1b1b1b);
+#if defined(ROTATE)
+			t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
+				ROTATE(r0,16) ^ ROTATE(r0,8);
+#else
+			t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
+				(r0 << 16) ^ (r0 >> 16) ^
+				(r0 << 8) ^ (r0 >> 24);
+#endif
+			t[i] ^= rk[i];
+		}
+	}
+#else
+	t[0] =	Te0[(s0      ) & 0xff] ^
+		Te1[(s1 >>  8) & 0xff] ^
+		Te2[(s2 >> 16) & 0xff] ^
+		Te3[(s3 >> 24)       ] ^
+		rk[0];
+	t[1] =	Te0[(s1      ) & 0xff] ^
+		Te1[(s2 >>  8) & 0xff] ^
+		Te2[(s3 >> 16) & 0xff] ^
+		Te3[(s0 >> 24)       ] ^
+		rk[1];
+	t[2] =	Te0[(s2      ) & 0xff] ^
+		Te1[(s3 >>  8) & 0xff] ^
+		Te2[(s0 >> 16) & 0xff] ^
+		Te3[(s1 >> 24)       ] ^
+		rk[2];
+	t[3] =	Te0[(s3      ) & 0xff] ^
+		Te1[(s0 >>  8) & 0xff] ^
+		Te2[(s1 >> 16) & 0xff] ^
+		Te3[(s2 >> 24)       ] ^
+		rk[3];
+#endif
+	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
+    }
+    /*
+	 * apply last round and
+	 * map cipher state to byte array block:
+	 */
+#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
+	prefetch256(Te4);
+
+	*(u32*)(out+0) =
+		Te4[(s0      ) & 0xff]       ^
+		Te4[(s1 >>  8) & 0xff] <<  8 ^
+		Te4[(s2 >> 16) & 0xff] << 16 ^
+		Te4[(s3 >> 24)       ] << 24 ^
+		rk[0];
+	*(u32*)(out+4) =
+		Te4[(s1      ) & 0xff]       ^
+		Te4[(s2 >>  8) & 0xff] <<  8 ^
+		Te4[(s3 >> 16) & 0xff] << 16 ^
+		Te4[(s0 >> 24)       ] << 24 ^
+		rk[1];
+	*(u32*)(out+8) =
+		Te4[(s2      ) & 0xff]       ^
+		Te4[(s3 >>  8) & 0xff] <<  8 ^
+		Te4[(s0 >> 16) & 0xff] << 16 ^
+		Te4[(s1 >> 24)       ] << 24 ^
+		rk[2];
+	*(u32*)(out+12) =
+		Te4[(s3      ) & 0xff]       ^
+		Te4[(s0 >>  8) & 0xff] <<  8 ^
+		Te4[(s1 >> 16) & 0xff] << 16 ^
+		Te4[(s2 >> 24)       ] << 24 ^
+		rk[3];
+#else
+	*(u32*)(out+0) =
+		(Te2[(s0      ) & 0xff] & 0x000000ffU) ^
+		(Te3[(s1 >>  8) & 0xff] & 0x0000ff00U) ^
+		(Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
+		(Te1[(s3 >> 24)       ] & 0xff000000U) ^
+		rk[0];
+	*(u32*)(out+4) =
+		(Te2[(s1      ) & 0xff] & 0x000000ffU) ^
+		(Te3[(s2 >>  8) & 0xff] & 0x0000ff00U) ^
+		(Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
+		(Te1[(s0 >> 24)       ] & 0xff000000U) ^
+		rk[1];
+	*(u32*)(out+8) =
+		(Te2[(s2      ) & 0xff] & 0x000000ffU) ^
+		(Te3[(s3 >>  8) & 0xff] & 0x0000ff00U) ^
+		(Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
+		(Te1[(s1 >> 24)       ] & 0xff000000U) ^
+		rk[2];
+	*(u32*)(out+12) =
+		(Te2[(s3      ) & 0xff] & 0x000000ffU) ^
+		(Te3[(s0 >>  8) & 0xff] & 0x0000ff00U) ^
+		(Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
+		(Te1[(s2 >> 24)       ] & 0xff000000U) ^
+		rk[3];
+#endif
+}
+
+/*
+ * Decrypt a single block
+ * in and out can overlap
+ */
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+		 const AES_KEY *key) {
+
+	const u32 *rk;
+	u32 s0, s1, s2, s3, t[4];
+	int r;
+
+	assert(in && out && key);
+	rk = key->rd_key;
+
+	/*
+	 * map byte array block to cipher state
+	 * and add initial round key:
+	 */
+	s0 = GETU32(in     ) ^ rk[0];
+	s1 = GETU32(in +  4) ^ rk[1];
+	s2 = GETU32(in +  8) ^ rk[2];
+	s3 = GETU32(in + 12) ^ rk[3];
+
+#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
+	prefetch256(Td4);
+
+        t[0] =	Td4[(s0      ) & 0xff]       ^
+		Td4[(s3 >>  8) & 0xff] <<  8 ^
+		Td4[(s2 >> 16) & 0xff] << 16 ^
+		Td4[(s1 >> 24)       ] << 24;
+        t[1] =	Td4[(s1      ) & 0xff]       ^
+		Td4[(s0 >>  8) & 0xff] <<  8 ^
+		Td4[(s3 >> 16) & 0xff] << 16 ^
+		Td4[(s2 >> 24)       ] << 24;
+        t[2] =	Td4[(s2      ) & 0xff]       ^
+		Td4[(s1 >>  8) & 0xff] <<  8 ^
+		Td4[(s0 >> 16) & 0xff] << 16 ^
+		Td4[(s3 >> 24)       ] << 24;
+        t[3] =	Td4[(s3      ) & 0xff]       ^
+		Td4[(s2 >>  8) & 0xff] <<  8 ^
+		Td4[(s1 >> 16) & 0xff] << 16 ^
+		Td4[(s0 >> 24)       ] << 24;
+
+	/* now do the linear transform using words */ 
+	{	int i;
+		u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
+
+		for (i = 0; i < 4; i++) {
+			tp1 = t[i];
+			m = tp1 & 0x80808080;
+			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
+				((m - (m >> 7)) & 0x1b1b1b1b);
+			m = tp2 & 0x80808080;
+			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
+				((m - (m >> 7)) & 0x1b1b1b1b);
+			m = tp4 & 0x80808080;
+			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
+				((m - (m >> 7)) & 0x1b1b1b1b);
+			tp9 = tp8 ^ tp1;
+			tpb = tp9 ^ tp2;
+			tpd = tp9 ^ tp4;
+			tpe = tp8 ^ tp4 ^ tp2;
+#if defined(ROTATE)
+			t[i] = tpe ^ ROTATE(tpd,16) ^
+				ROTATE(tp9,8) ^ ROTATE(tpb,24);
+#else
+			t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ 
+				(tp9 >> 24) ^ (tp9 << 8) ^
+				(tpb >> 8) ^ (tpb << 24);
+#endif
+			t[i] ^= rk[4+i];
+		}
+	}
+#else
+	t[0] =	Td0[(s0      ) & 0xff] ^
+		Td1[(s3 >>  8) & 0xff] ^
+		Td2[(s2 >> 16) & 0xff] ^
+		Td3[(s1 >> 24)       ] ^
+		rk[4];
+	t[1] =	Td0[(s1      ) & 0xff] ^
+		Td1[(s0 >>  8) & 0xff] ^
+		Td2[(s3 >> 16) & 0xff] ^
+		Td3[(s2 >> 24)       ] ^
+		rk[5];
+	t[2] =	Td0[(s2      ) & 0xff] ^
+		Td1[(s1 >>  8) & 0xff] ^
+		Td2[(s0 >> 16) & 0xff] ^
+		Td3[(s3 >> 24)       ] ^
+		rk[6];
+	t[3] =	Td0[(s3      ) & 0xff] ^
+		Td1[(s2 >>  8) & 0xff] ^
+		Td2[(s1 >> 16) & 0xff] ^
+		Td3[(s0 >> 24)       ] ^
+		rk[7];
+#endif
+	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
+
+    /*
+     * Nr - 2 full rounds:
+     */
+    for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
+#if defined(AES_COMPACT_IN_INNER_ROUNDS)
+        t[0] =	Td4[(s0      ) & 0xff]       ^
+		Td4[(s3 >>  8) & 0xff] <<  8 ^
+		Td4[(s2 >> 16) & 0xff] << 16 ^
+		Td4[(s1 >> 24)       ] << 24;
+        t[1] =	Td4[(s1      ) & 0xff]       ^
+		Td4[(s0 >>  8) & 0xff] <<  8 ^
+		Td4[(s3 >> 16) & 0xff] << 16 ^
+		Td4[(s2 >> 24)       ] << 24;
+        t[2] =	Td4[(s2      ) & 0xff]       ^
+		Td4[(s1 >>  8) & 0xff] <<  8 ^
+		Td4[(s0 >> 16) & 0xff] << 16 ^
+		Td4[(s3 >> 24)       ] << 24;
+        t[3] =	Td4[(s3      ) & 0xff]       ^
+		Td4[(s2 >>  8) & 0xff] <<  8 ^
+		Td4[(s1 >> 16) & 0xff] << 16 ^
+		Td4[(s0 >> 24)       ] << 24;
+
+	/* now do the linear transform using words */ 
+	{	int i;
+		u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
+
+		for (i = 0; i < 4; i++) {
+			tp1 = t[i];
+			m = tp1 & 0x80808080;
+			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
+				((m - (m >> 7)) & 0x1b1b1b1b);
+			m = tp2 & 0x80808080;
+			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
+				((m - (m >> 7)) & 0x1b1b1b1b);
+			m = tp4 & 0x80808080;
+			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
+				((m - (m >> 7)) & 0x1b1b1b1b);
+			tp9 = tp8 ^ tp1;
+			tpb = tp9 ^ tp2;
+			tpd = tp9 ^ tp4;
+			tpe = tp8 ^ tp4 ^ tp2;
+#if defined(ROTATE)
+			t[i] = tpe ^ ROTATE(tpd,16) ^
+				ROTATE(tp9,8) ^ ROTATE(tpb,24);
+#else
+			t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ 
+				(tp9 >> 24) ^ (tp9 << 8) ^
+				(tpb >> 8) ^ (tpb << 24);
+#endif
+			t[i] ^= rk[i];
+		}
+	}
+#else
+	t[0] =	Td0[(s0      ) & 0xff] ^
+		Td1[(s3 >>  8) & 0xff] ^
+		Td2[(s2 >> 16) & 0xff] ^
+		Td3[(s1 >> 24)       ] ^
+		rk[0];
+	t[1] =	Td0[(s1      ) & 0xff] ^
+		Td1[(s0 >>  8) & 0xff] ^
+		Td2[(s3 >> 16) & 0xff] ^
+		Td3[(s2 >> 24)       ] ^
+		rk[1];
+	t[2] =	Td0[(s2      ) & 0xff] ^
+		Td1[(s1 >>  8) & 0xff] ^
+		Td2[(s0 >> 16) & 0xff] ^
+		Td3[(s3 >> 24)       ] ^
+		rk[2];
+	t[3] =	Td0[(s3      ) & 0xff] ^
+		Td1[(s2 >>  8) & 0xff] ^
+		Td2[(s1 >> 16) & 0xff] ^
+		Td3[(s0 >> 24)       ] ^
+		rk[3];
+#endif
+	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
+    }
+    /*
+	 * apply last round and
+	 * map cipher state to byte array block:
+	 */
+	prefetch256(Td4);
+
+	*(u32*)(out+0) =
+		(Td4[(s0      ) & 0xff])	^
+		(Td4[(s3 >>  8) & 0xff] <<  8) ^
+		(Td4[(s2 >> 16) & 0xff] << 16) ^
+		(Td4[(s1 >> 24)       ] << 24) ^
+		rk[0];
+	*(u32*)(out+4) =
+		(Td4[(s1      ) & 0xff])	 ^
+		(Td4[(s0 >>  8) & 0xff] <<  8) ^
+		(Td4[(s3 >> 16) & 0xff] << 16) ^
+		(Td4[(s2 >> 24)       ] << 24) ^
+		rk[1];
+	*(u32*)(out+8) =
+		(Td4[(s2      ) & 0xff])	 ^
+		(Td4[(s1 >>  8) & 0xff] <<  8) ^
+		(Td4[(s0 >> 16) & 0xff] << 16) ^
+		(Td4[(s3 >> 24)       ] << 24) ^
+		rk[2];
+	*(u32*)(out+12) =
+		(Td4[(s3      ) & 0xff])	 ^
+		(Td4[(s2 >>  8) & 0xff] <<  8) ^
+		(Td4[(s1 >> 16) & 0xff] << 16) ^
+		(Td4[(s0 >> 24)       ] << 24) ^
+		rk[3];
+}
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl
new file mode 100644
index 0000000000..15742c1ec5
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-armv4.pl
@@ -0,0 +1,1030 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for ARMv4
+
+# January 2007.
+#
+# The code uses a single 1KB S-box table and is >2 times faster than
+# code generated by gcc-3.4.1. This is thanks to a unique feature of
+# the ARMv4 ISA, which allows a logical or arithmetic operation to be
+# merged with a shift or rotate in a single instruction and the
+# combined result to be emitted every cycle. The module is
+# endian-neutral. The performance is ~42 cycles per byte for a 128-bit
+# key.
+
+# May 2007.
+#
+# AES_set_[en|de]crypt_key is added.
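+#
+# The "merge with shift or rotate" feature mentioned above is the ARM
+# barrel shifter on the second operand: a round-function step such as
+#
+#	eor	$s0,$s0,$i1,ror#8
+#
+# in the code below XORs an already-rotated table word into the state
+# in a single instruction, where most other ISAs would need a separate
+# rotate first.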
+
+$s0="r0";
+$s1="r1";
+$s2="r2";
+$s3="r3";
+$t1="r4";
+$t2="r5";
+$t3="r6";
+$i1="r7";
+$i2="r8";
+$i3="r9";
+
+$tbl="r10";
+$key="r11";
+$rounds="r12";
+
+$code=<<___;
+.text
+.code	32
+
+.type	AES_Te,%object
+.align	5
+AES_Te:
+.word	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
+.word	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
+.word	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
+.word	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
+.word	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
+.word	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
+.word	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
+.word	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
+.word	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
+.word	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
+.word	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
+.word	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
+.word	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
+.word	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
+.word	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
+.word	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
+.word	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
+.word	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
+.word	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
+.word	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
+.word	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
+.word	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
+.word	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
+.word	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
+.word	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
+.word	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
+.word	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
+.word	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
+.word	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
+.word	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
+.word	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
+.word	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
+.word	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
+.word	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
+.word	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
+.word	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
+.word	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
+.word	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
+.word	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
+.word	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
+.word	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
+.word	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
+.word	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
+.word	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
+.word	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
+.word	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
+.word	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
+.word	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
+.word	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
+.word	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
+.word	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
+.word	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
+.word	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
+.word	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
+.word	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
+.word	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
+.word	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
+.word	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
+.word	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
+.word	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
+.word	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
+.word	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
+.word	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
+.word	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
+@ Te4[256]
+.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+@ rcon[]
+.word	0x01000000, 0x02000000, 0x04000000, 0x08000000
+.word	0x10000000, 0x20000000, 0x40000000, 0x80000000
+.word	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
+.size	AES_Te,.-AES_Te
+
+@ void AES_encrypt(const unsigned char *in, unsigned char *out,
+@ 		 const AES_KEY *key) {
+.global AES_encrypt
+.type   AES_encrypt,%function
+.align	5
+AES_encrypt:
+	sub	r3,pc,#8		@ AES_encrypt
+	stmdb   sp!,{r1,r4-r12,lr}
+	mov	$rounds,r0		@ inp
+	mov	$key,r2
+	sub	$tbl,r3,#AES_encrypt-AES_Te	@ Te
+
+	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
+	ldrb	$t1,[$rounds,#2]	@ manner...
+	ldrb	$t2,[$rounds,#1]
+	ldrb	$t3,[$rounds,#0]
+	orr	$s0,$s0,$t1,lsl#8
+	orr	$s0,$s0,$t2,lsl#16
+	orr	$s0,$s0,$t3,lsl#24
+	ldrb	$s1,[$rounds,#7]
+	ldrb	$t1,[$rounds,#6]
+	ldrb	$t2,[$rounds,#5]
+	ldrb	$t3,[$rounds,#4]
+	orr	$s1,$s1,$t1,lsl#8
+	orr	$s1,$s1,$t2,lsl#16
+	orr	$s1,$s1,$t3,lsl#24
+	ldrb	$s2,[$rounds,#11]
+	ldrb	$t1,[$rounds,#10]
+	ldrb	$t2,[$rounds,#9]
+	ldrb	$t3,[$rounds,#8]
+	orr	$s2,$s2,$t1,lsl#8
+	orr	$s2,$s2,$t2,lsl#16
+	orr	$s2,$s2,$t3,lsl#24
+	ldrb	$s3,[$rounds,#15]
+	ldrb	$t1,[$rounds,#14]
+	ldrb	$t2,[$rounds,#13]
+	ldrb	$t3,[$rounds,#12]
+	orr	$s3,$s3,$t1,lsl#8
+	orr	$s3,$s3,$t2,lsl#16
+	orr	$s3,$s3,$t3,lsl#24
+
+	bl	_armv4_AES_encrypt
+
+	ldr	$rounds,[sp],#4		@ pop out
+	mov	$t1,$s0,lsr#24		@ write output in endian-neutral
+	mov	$t2,$s0,lsr#16		@ manner...
+	mov	$t3,$s0,lsr#8
+	strb	$t1,[$rounds,#0]
+	strb	$t2,[$rounds,#1]
+	strb	$t3,[$rounds,#2]
+	strb	$s0,[$rounds,#3]
+	mov	$t1,$s1,lsr#24
+	mov	$t2,$s1,lsr#16
+	mov	$t3,$s1,lsr#8
+	strb	$t1,[$rounds,#4]
+	strb	$t2,[$rounds,#5]
+	strb	$t3,[$rounds,#6]
+	strb	$s1,[$rounds,#7]
+	mov	$t1,$s2,lsr#24
+	mov	$t2,$s2,lsr#16
+	mov	$t3,$s2,lsr#8
+	strb	$t1,[$rounds,#8]
+	strb	$t2,[$rounds,#9]
+	strb	$t3,[$rounds,#10]
+	strb	$s2,[$rounds,#11]
+	mov	$t1,$s3,lsr#24
+	mov	$t2,$s3,lsr#16
+	mov	$t3,$s3,lsr#8
+	strb	$t1,[$rounds,#12]
+	strb	$t2,[$rounds,#13]
+	strb	$t3,[$rounds,#14]
+	strb	$s3,[$rounds,#15]
+
+	ldmia   sp!,{r4-r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+.size	AES_encrypt,.-AES_encrypt
+
+.type   _armv4_AES_encrypt,%function
+.align	2
+_armv4_AES_encrypt:
+	str	lr,[sp,#-4]!		@ push lr
+	ldr	$t1,[$key],#16
+	ldr	$t2,[$key,#-12]
+	ldr	$t3,[$key,#-8]
+	ldr	$i1,[$key,#-4]
+	ldr	$rounds,[$key,#240-16]
+	eor	$s0,$s0,$t1
+	eor	$s1,$s1,$t2
+	eor	$s2,$s2,$t3
+	eor	$s3,$s3,$i1
+	sub	$rounds,$rounds,#1
+	mov	lr,#255
+
+.Lenc_loop:
+	and	$i2,lr,$s0,lsr#8
+	and	$i3,lr,$s0,lsr#16
+	and	$i1,lr,$s0
+	mov	$s0,$s0,lsr#24
+	ldr	$t1,[$tbl,$i1,lsl#2]	@ Te3[s0>>0]
+	ldr	$s0,[$tbl,$s0,lsl#2]	@ Te0[s0>>24]
+	ldr	$t2,[$tbl,$i2,lsl#2]	@ Te2[s0>>8]
+	ldr	$t3,[$tbl,$i3,lsl#2]	@ Te1[s0>>16]
+
+	and	$i1,lr,$s1,lsr#16	@ i0
+	and	$i2,lr,$s1
+	and	$i3,lr,$s1,lsr#8
+	mov	$s1,$s1,lsr#24
+	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te1[s1>>16]
+	ldr	$s1,[$tbl,$s1,lsl#2]	@ Te0[s1>>24]
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te3[s1>>0]
+	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te2[s1>>8]
+	eor	$s0,$s0,$i1,ror#8
+	eor	$s1,$s1,$t1,ror#24
+	eor	$t2,$t2,$i2,ror#8
+	eor	$t3,$t3,$i3,ror#8
+
+	and	$i1,lr,$s2,lsr#8	@ i0
+	and	$i2,lr,$s2,lsr#16	@ i1
+	and	$i3,lr,$s2
+	mov	$s2,$s2,lsr#24
+	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te2[s2>>8]
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te1[s2>>16]
+	ldr	$s2,[$tbl,$s2,lsl#2]	@ Te0[s2>>24]
+	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te3[s2>>0]
+	eor	$s0,$s0,$i1,ror#16
+	eor	$s1,$s1,$i2,ror#8
+	eor	$s2,$s2,$t2,ror#16
+	eor	$t3,$t3,$i3,ror#16
+
+	and	$i1,lr,$s3		@ i0
+	and	$i2,lr,$s3,lsr#8	@ i1
+	and	$i3,lr,$s3,lsr#16	@ i2
+	mov	$s3,$s3,lsr#24
+	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te3[s3>>0]
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te2[s3>>8]
+	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te1[s3>>16]
+	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
+	eor	$s0,$s0,$i1,ror#24
+	eor	$s1,$s1,$i2,ror#16
+	eor	$s2,$s2,$i3,ror#8
+	eor	$s3,$s3,$t3,ror#8
+
+	ldr	$t1,[$key],#16
+	ldr	$t2,[$key,#-12]
+	ldr	$t3,[$key,#-8]
+	ldr	$i1,[$key,#-4]
+	eor	$s0,$s0,$t1
+	eor	$s1,$s1,$t2
+	eor	$s2,$s2,$t3
+	eor	$s3,$s3,$i1
+
+	subs	$rounds,$rounds,#1
+	bne	.Lenc_loop
+
+	add	$tbl,$tbl,#2
+
+	and	$i1,lr,$s0
+	and	$i2,lr,$s0,lsr#8
+	and	$i3,lr,$s0,lsr#16
+	mov	$s0,$s0,lsr#24
+	ldrb	$t1,[$tbl,$i1,lsl#2]	@ Te4[s0>>0]
+	ldrb	$s0,[$tbl,$s0,lsl#2]	@ Te4[s0>>24]
+	ldrb	$t2,[$tbl,$i2,lsl#2]	@ Te4[s0>>8]
+	ldrb	$t3,[$tbl,$i3,lsl#2]	@ Te4[s0>>16]
+
+	and	$i1,lr,$s1,lsr#16	@ i0
+	and	$i2,lr,$s1
+	and	$i3,lr,$s1,lsr#8
+	mov	$s1,$s1,lsr#24
+	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s1>>16]
+	ldrb	$s1,[$tbl,$s1,lsl#2]	@ Te4[s1>>24]
+	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s1>>0]
+	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s1>>8]
+	eor	$s0,$i1,$s0,lsl#8
+	eor	$s1,$t1,$s1,lsl#24
+	eor	$t2,$i2,$t2,lsl#8
+	eor	$t3,$i3,$t3,lsl#8
+
+	and	$i1,lr,$s2,lsr#8	@ i0
+	and	$i2,lr,$s2,lsr#16	@ i1
+	and	$i3,lr,$s2
+	mov	$s2,$s2,lsr#24
+	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s2>>8]
+	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s2>>16]
+	ldrb	$s2,[$tbl,$s2,lsl#2]	@ Te4[s2>>24]
+	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s2>>0]
+	eor	$s0,$i1,$s0,lsl#8
+	eor	$s1,$s1,$i2,lsl#16
+	eor	$s2,$t2,$s2,lsl#24
+	eor	$t3,$i3,$t3,lsl#8
+
+	and	$i1,lr,$s3		@ i0
+	and	$i2,lr,$s3,lsr#8	@ i1
+	and	$i3,lr,$s3,lsr#16	@ i2
+	mov	$s3,$s3,lsr#24
+	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s3>>0]
+	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s3>>8]
+	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s3>>16]
+	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
+	eor	$s0,$i1,$s0,lsl#8
+	eor	$s1,$s1,$i2,lsl#8
+	eor	$s2,$s2,$i3,lsl#16
+	eor	$s3,$t3,$s3,lsl#24
+
+	ldr	lr,[sp],#4		@ pop lr
+	ldr	$t1,[$key,#0]
+	ldr	$t2,[$key,#4]
+	ldr	$t3,[$key,#8]
+	ldr	$i1,[$key,#12]
+	eor	$s0,$s0,$t1
+	eor	$s1,$s1,$t2
+	eor	$s2,$s2,$t3
+	eor	$s3,$s3,$i1
+
+	sub	$tbl,$tbl,#2
+	mov	pc,lr			@ return
+.size	_armv4_AES_encrypt,.-_armv4_AES_encrypt
+
+.global AES_set_encrypt_key
+.type   AES_set_encrypt_key,%function
+.align	5
+AES_set_encrypt_key:
+	sub	r3,pc,#8		@ AES_set_encrypt_key
+	teq	r0,#0
+	moveq	r0,#-1
+	beq	.Labrt
+	teq	r2,#0
+	moveq	r0,#-1
+	beq	.Labrt
+
+	teq	r1,#128
+	beq	.Lok
+	teq	r1,#192
+	beq	.Lok
+	teq	r1,#256
+	movne	r0,#-1
+	bne	.Labrt
+
+.Lok:	stmdb   sp!,{r4-r12,lr}
+	sub	$tbl,r3,#AES_set_encrypt_key-AES_Te-1024	@ Te4
+
+	mov	$rounds,r0		@ inp
+	mov	lr,r1			@ bits
+	mov	$key,r2			@ key
+
+	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
+	ldrb	$t1,[$rounds,#2]	@ manner...
+	ldrb	$t2,[$rounds,#1]
+	ldrb	$t3,[$rounds,#0]
+	orr	$s0,$s0,$t1,lsl#8
+	orr	$s0,$s0,$t2,lsl#16
+	orr	$s0,$s0,$t3,lsl#24
+	ldrb	$s1,[$rounds,#7]
+	ldrb	$t1,[$rounds,#6]
+	ldrb	$t2,[$rounds,#5]
+	ldrb	$t3,[$rounds,#4]
+	orr	$s1,$s1,$t1,lsl#8
+	orr	$s1,$s1,$t2,lsl#16
+	orr	$s1,$s1,$t3,lsl#24
+	ldrb	$s2,[$rounds,#11]
+	ldrb	$t1,[$rounds,#10]
+	ldrb	$t2,[$rounds,#9]
+	ldrb	$t3,[$rounds,#8]
+	orr	$s2,$s2,$t1,lsl#8
+	orr	$s2,$s2,$t2,lsl#16
+	orr	$s2,$s2,$t3,lsl#24
+	ldrb	$s3,[$rounds,#15]
+	ldrb	$t1,[$rounds,#14]
+	ldrb	$t2,[$rounds,#13]
+	ldrb	$t3,[$rounds,#12]
+	orr	$s3,$s3,$t1,lsl#8
+	orr	$s3,$s3,$t2,lsl#16
+	orr	$s3,$s3,$t3,lsl#24
+	str	$s0,[$key],#16
+	str	$s1,[$key,#-12]
+	str	$s2,[$key,#-8]
+	str	$s3,[$key,#-4]
+
+	teq	lr,#128
+	bne	.Lnot128
+	mov	$rounds,#10
+	str	$rounds,[$key,#240-16]
+	add	$t3,$tbl,#256			@ rcon
+	mov	lr,#255
+
+.L128_loop:
+	and	$t2,lr,$s3,lsr#24
+	and	$i1,lr,$s3,lsr#16
+	and	$i2,lr,$s3,lsr#8
+	and	$i3,lr,$s3
+	ldrb	$t2,[$tbl,$t2]
+	ldrb	$i1,[$tbl,$i1]
+	ldrb	$i2,[$tbl,$i2]
+	ldrb	$i3,[$tbl,$i3]
+	ldr	$t1,[$t3],#4			@ rcon[i++]
+	orr	$t2,$t2,$i1,lsl#24
+	orr	$t2,$t2,$i2,lsl#16
+	orr	$t2,$t2,$i3,lsl#8
+	eor	$t2,$t2,$t1
+	eor	$s0,$s0,$t2			@ rk[4]=rk[0]^...
+	eor	$s1,$s1,$s0			@ rk[5]=rk[1]^rk[4]
+	eor	$s2,$s2,$s1			@ rk[6]=rk[2]^rk[5]
+	eor	$s3,$s3,$s2			@ rk[7]=rk[3]^rk[6]
+	str	$s0,[$key],#16
+	str	$s1,[$key,#-12]
+	str	$s2,[$key,#-8]
+	str	$s3,[$key,#-4]
+
+	subs	$rounds,$rounds,#1
+	bne	.L128_loop
+	sub	r2,$key,#176
+	b	.Ldone
+
+.Lnot128:
+	ldrb	$i2,[$rounds,#19]
+	ldrb	$t1,[$rounds,#18]
+	ldrb	$t2,[$rounds,#17]
+	ldrb	$t3,[$rounds,#16]
+	orr	$i2,$i2,$t1,lsl#8
+	orr	$i2,$i2,$t2,lsl#16
+	orr	$i2,$i2,$t3,lsl#24
+	ldrb	$i3,[$rounds,#23]
+	ldrb	$t1,[$rounds,#22]
+	ldrb	$t2,[$rounds,#21]
+	ldrb	$t3,[$rounds,#20]
+	orr	$i3,$i3,$t1,lsl#8
+	orr	$i3,$i3,$t2,lsl#16
+	orr	$i3,$i3,$t3,lsl#24
+	str	$i2,[$key],#8
+	str	$i3,[$key,#-4]
+
+	teq	lr,#192
+	bne	.Lnot192
+	mov	$rounds,#12
+	str	$rounds,[$key,#240-24]
+	add	$t3,$tbl,#256			@ rcon
+	mov	lr,#255
+	mov	$rounds,#8
+
+.L192_loop:
+	and	$t2,lr,$i3,lsr#24
+	and	$i1,lr,$i3,lsr#16
+	and	$i2,lr,$i3,lsr#8
+	and	$i3,lr,$i3
+	ldrb	$t2,[$tbl,$t2]
+	ldrb	$i1,[$tbl,$i1]
+	ldrb	$i2,[$tbl,$i2]
+	ldrb	$i3,[$tbl,$i3]
+	ldr	$t1,[$t3],#4			@ rcon[i++]
+	orr	$t2,$t2,$i1,lsl#24
+	orr	$t2,$t2,$i2,lsl#16
+	orr	$t2,$t2,$i3,lsl#8
+	eor	$i3,$t2,$t1
+	eor	$s0,$s0,$i3			@ rk[6]=rk[0]^...
+	eor	$s1,$s1,$s0			@ rk[7]=rk[1]^rk[6]
+	eor	$s2,$s2,$s1			@ rk[8]=rk[2]^rk[7]
+	eor	$s3,$s3,$s2			@ rk[9]=rk[3]^rk[8]
+	str	$s0,[$key],#24
+	str	$s1,[$key,#-20]
+	str	$s2,[$key,#-16]
+	str	$s3,[$key,#-12]
+
+	subs	$rounds,$rounds,#1
+	subeq	r2,$key,#216
+	beq	.Ldone
+
+	ldr	$i1,[$key,#-32]
+	ldr	$i2,[$key,#-28]
+	eor	$i1,$i1,$s3			@ rk[10]=rk[4]^rk[9]
+	eor	$i3,$i2,$i1			@ rk[11]=rk[5]^rk[10]
+	str	$i1,[$key,#-8]
+	str	$i3,[$key,#-4]
+	b	.L192_loop
+
+.Lnot192:
+	ldrb	$i2,[$rounds,#27]
+	ldrb	$t1,[$rounds,#26]
+	ldrb	$t2,[$rounds,#25]
+	ldrb	$t3,[$rounds,#24]
+	orr	$i2,$i2,$t1,lsl#8
+	orr	$i2,$i2,$t2,lsl#16
+	orr	$i2,$i2,$t3,lsl#24
+	ldrb	$i3,[$rounds,#31]
+	ldrb	$t1,[$rounds,#30]
+	ldrb	$t2,[$rounds,#29]
+	ldrb	$t3,[$rounds,#28]
+	orr	$i3,$i3,$t1,lsl#8
+	orr	$i3,$i3,$t2,lsl#16
+	orr	$i3,$i3,$t3,lsl#24
+	str	$i2,[$key],#8
+	str	$i3,[$key,#-4]
+
+	mov	$rounds,#14
+	str	$rounds,[$key,#240-32]
+	add	$t3,$tbl,#256			@ rcon
+	mov	lr,#255
+	mov	$rounds,#7
+
+.L256_loop:
+	and	$t2,lr,$i3,lsr#24
+	and	$i1,lr,$i3,lsr#16
+	and	$i2,lr,$i3,lsr#8
+	and	$i3,lr,$i3
+	ldrb	$t2,[$tbl,$t2]
+	ldrb	$i1,[$tbl,$i1]
+	ldrb	$i2,[$tbl,$i2]
+	ldrb	$i3,[$tbl,$i3]
+	ldr	$t1,[$t3],#4			@ rcon[i++]
+	orr	$t2,$t2,$i1,lsl#24
+	orr	$t2,$t2,$i2,lsl#16
+	orr	$t2,$t2,$i3,lsl#8
+	eor	$i3,$t2,$t1
+	eor	$s0,$s0,$i3			@ rk[8]=rk[0]^...
+	eor	$s1,$s1,$s0			@ rk[9]=rk[1]^rk[8]
+	eor	$s2,$s2,$s1			@ rk[10]=rk[2]^rk[9]
+	eor	$s3,$s3,$s2			@ rk[11]=rk[3]^rk[10]
+	str	$s0,[$key],#32
+	str	$s1,[$key,#-28]
+	str	$s2,[$key,#-24]
+	str	$s3,[$key,#-20]
+
+	subs	$rounds,$rounds,#1
+	subeq	r2,$key,#256
+	beq	.Ldone
+
+	and	$t2,lr,$s3
+	and	$i1,lr,$s3,lsr#8
+	and	$i2,lr,$s3,lsr#16
+	and	$i3,lr,$s3,lsr#24
+	ldrb	$t2,[$tbl,$t2]
+	ldrb	$i1,[$tbl,$i1]
+	ldrb	$i2,[$tbl,$i2]
+	ldrb	$i3,[$tbl,$i3]
+	orr	$t2,$t2,$i1,lsl#8
+	orr	$t2,$t2,$i2,lsl#16
+	orr	$t2,$t2,$i3,lsl#24
+
+	ldr	$t1,[$key,#-48]
+	ldr	$i1,[$key,#-44]
+	ldr	$i2,[$key,#-40]
+	ldr	$i3,[$key,#-36]
+	eor	$t1,$t1,$t2			@ rk[12]=rk[4]^...
+	eor	$i1,$i1,$t1			@ rk[13]=rk[5]^rk[12]
+	eor	$i2,$i2,$i1			@ rk[14]=rk[6]^rk[13]
+	eor	$i3,$i3,$i2			@ rk[15]=rk[7]^rk[14]
+	str	$t1,[$key,#-16]
+	str	$i1,[$key,#-12]
+	str	$i2,[$key,#-8]
+	str	$i3,[$key,#-4]
+	b	.L256_loop
+
+.Ldone:	mov	r0,#0
+	ldmia   sp!,{r4-r12,lr}
+.Labrt:	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+.size	AES_set_encrypt_key,.-AES_set_encrypt_key
+
+.global AES_set_decrypt_key
+.type   AES_set_decrypt_key,%function
+.align	5
+AES_set_decrypt_key:
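+	@ build the encryption schedule first, then convert it for the
+	@ equivalent inverse cipher: reverse the round-key order (.Linv)
+	@ and apply InvMixColumns to all round keys but the first and the
+	@ last (.Lmix)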
+	str	lr,[sp,#-4]!            @ push lr
+	bl	AES_set_encrypt_key
+	teq	r0,#0
+	ldrne	lr,[sp],#4              @ pop lr
+	bne	.Labrt
+
+	stmdb   sp!,{r4-r12}
+
+	ldr	$rounds,[r2,#240]	@ AES_set_encrypt_key preserves r2,
+	mov	$key,r2			@ which is AES_KEY *key
+	mov	$i1,r2
+	add	$i2,r2,$rounds,lsl#4
+
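+	@ reverse the order of the round keys in place: the first 16-byte
+	@ round key is swapped with the last, the second with the second
+	@ to last, and so on, until the two pointers meet in the middle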
+.Linv:	ldr	$s0,[$i1]
+	ldr	$s1,[$i1,#4]
+	ldr	$s2,[$i1,#8]
+	ldr	$s3,[$i1,#12]
+	ldr	$t1,[$i2]
+	ldr	$t2,[$i2,#4]
+	ldr	$t3,[$i2,#8]
+	ldr	$i3,[$i2,#12]
+	str	$s0,[$i2],#-16
+	str	$s1,[$i2,#16+4]
+	str	$s2,[$i2,#16+8]
+	str	$s3,[$i2,#16+12]
+	str	$t1,[$i1],#16
+	str	$t2,[$i1,#-12]
+	str	$t3,[$i1,#-8]
+	str	$i3,[$i1,#-4]
+	teq	$i1,$i2
+	bne	.Linv
+___
+$mask80=$i1;
+$mask1b=$i2;
+$mask7f=$i3;
+$code.=<<___;
+	ldr	$s0,[$key,#16]!		@ prefetch tp1
+	mov	$mask80,#0x80
+	mov	$mask1b,#0x1b
+	orr	$mask80,$mask80,#0x8000
+	orr	$mask1b,$mask1b,#0x1b00
+	orr	$mask80,$mask80,$mask80,lsl#16
+	orr	$mask1b,$mask1b,$mask1b,lsl#16
+	sub	$rounds,$rounds,#1
+	mvn	$mask7f,$mask80
+	mov	$rounds,$rounds,lsl#2	@ (rounds-1)*4
+
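+@ each and/sub/and/eor group below is a word-wide xtime, multiplying
+@ four packed bytes by 2 in GF(2^8) at once:
+@ tp2 = ((x & 0x7f7f7f7f) << 1) ^ ((m - (m >> 7)) & 0x1b1b1b1b)
+@ where m = x & 0x80808080, i.e. the 0x1b reduction is applied only
+@ to bytes whose top bit was set; tp4 and tp8 iterate the same step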
+.Lmix:	and	$t1,$s0,$mask80
+	and	$s1,$s0,$mask7f
+	sub	$t1,$t1,$t1,lsr#7
+	and	$t1,$t1,$mask1b
+	eor	$s1,$t1,$s1,lsl#1	@ tp2
+
+	and	$t1,$s1,$mask80
+	and	$s2,$s1,$mask7f
+	sub	$t1,$t1,$t1,lsr#7
+	and	$t1,$t1,$mask1b
+	eor	$s2,$t1,$s2,lsl#1	@ tp4
+
+	and	$t1,$s2,$mask80
+	and	$s3,$s2,$mask7f
+	sub	$t1,$t1,$t1,lsr#7
+	and	$t1,$t1,$mask1b
+	eor	$s3,$t1,$s3,lsl#1	@ tp8
+
+	eor	$t1,$s1,$s2
+	eor	$t2,$s0,$s3		@ tp9
+	eor	$t1,$t1,$s3		@ tpe
+	eor	$t1,$t1,$s1,ror#24
+	eor	$t1,$t1,$t2,ror#24	@ ^= ROTATE(tpb=tp9^tp2,8)
+	eor	$t1,$t1,$s2,ror#16
+	eor	$t1,$t1,$t2,ror#16	@ ^= ROTATE(tpd=tp9^tp4,16)
+	eor	$t1,$t1,$t2,ror#8	@ ^= ROTATE(tp9,24)
+
+	ldr	$s0,[$key,#4]		@ prefetch tp1
+	str	$t1,[$key],#4
+	subs	$rounds,$rounds,#1
+	bne	.Lmix
+
+	mov	r0,#0
+	ldmia   sp!,{r4-r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+.size	AES_set_decrypt_key,.-AES_set_decrypt_key
+
+.type	AES_Td,%object
+.align	5
+AES_Td:
+.word	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
+.word	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
+.word	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
+.word	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
+.word	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
+.word	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
+.word	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
+.word	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
+.word	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
+.word	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
+.word	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
+.word	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
+.word	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
+.word	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
+.word	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
+.word	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
+.word	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
+.word	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
+.word	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
+.word	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
+.word	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
+.word	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
+.word	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
+.word	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
+.word	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
+.word	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
+.word	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
+.word	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
+.word	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
+.word	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
+.word	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
+.word	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
+.word	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
+.word	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
+.word	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
+.word	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
+.word	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
+.word	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
+.word	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
+.word	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
+.word	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
+.word	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
+.word	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
+.word	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
+.word	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
+.word	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
+.word	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
+.word	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
+.word	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
+.word	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
+.word	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
+.word	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
+.word	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
+.word	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
+.word	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
+.word	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
+.word	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
+.word	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
+.word	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
+.word	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
+.word	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
+.word	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
+.word	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
+.word	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
+@ Td4[256]
+.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+.size	AES_Td,.-AES_Td
+
+@ void AES_decrypt(const unsigned char *in, unsigned char *out,
+@ 		 const AES_KEY *key) {
+.global AES_decrypt
+.type   AES_decrypt,%function
+.align	5
+AES_decrypt:
+	sub	r3,pc,#8		@ AES_decrypt
+	stmdb   sp!,{r1,r4-r12,lr}
+	mov	$rounds,r0		@ inp
+	mov	$key,r2
+	sub	$tbl,r3,#AES_decrypt-AES_Td		@ Td
+
+	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
+	ldrb	$t1,[$rounds,#2]	@ manner...
+	ldrb	$t2,[$rounds,#1]
+	ldrb	$t3,[$rounds,#0]
+	orr	$s0,$s0,$t1,lsl#8
+	orr	$s0,$s0,$t2,lsl#16
+	orr	$s0,$s0,$t3,lsl#24
+	ldrb	$s1,[$rounds,#7]
+	ldrb	$t1,[$rounds,#6]
+	ldrb	$t2,[$rounds,#5]
+	ldrb	$t3,[$rounds,#4]
+	orr	$s1,$s1,$t1,lsl#8
+	orr	$s1,$s1,$t2,lsl#16
+	orr	$s1,$s1,$t3,lsl#24
+	ldrb	$s2,[$rounds,#11]
+	ldrb	$t1,[$rounds,#10]
+	ldrb	$t2,[$rounds,#9]
+	ldrb	$t3,[$rounds,#8]
+	orr	$s2,$s2,$t1,lsl#8
+	orr	$s2,$s2,$t2,lsl#16
+	orr	$s2,$s2,$t3,lsl#24
+	ldrb	$s3,[$rounds,#15]
+	ldrb	$t1,[$rounds,#14]
+	ldrb	$t2,[$rounds,#13]
+	ldrb	$t3,[$rounds,#12]
+	orr	$s3,$s3,$t1,lsl#8
+	orr	$s3,$s3,$t2,lsl#16
+	orr	$s3,$s3,$t3,lsl#24
+
+	bl	_armv4_AES_decrypt
+
+	ldr	$rounds,[sp],#4		@ pop out
+	mov	$t1,$s0,lsr#24		@ write output in endian-neutral
+	mov	$t2,$s0,lsr#16		@ manner...
+	mov	$t3,$s0,lsr#8
+	strb	$t1,[$rounds,#0]
+	strb	$t2,[$rounds,#1]
+	strb	$t3,[$rounds,#2]
+	strb	$s0,[$rounds,#3]
+	mov	$t1,$s1,lsr#24
+	mov	$t2,$s1,lsr#16
+	mov	$t3,$s1,lsr#8
+	strb	$t1,[$rounds,#4]
+	strb	$t2,[$rounds,#5]
+	strb	$t3,[$rounds,#6]
+	strb	$s1,[$rounds,#7]
+	mov	$t1,$s2,lsr#24
+	mov	$t2,$s2,lsr#16
+	mov	$t3,$s2,lsr#8
+	strb	$t1,[$rounds,#8]
+	strb	$t2,[$rounds,#9]
+	strb	$t3,[$rounds,#10]
+	strb	$s2,[$rounds,#11]
+	mov	$t1,$s3,lsr#24
+	mov	$t2,$s3,lsr#16
+	mov	$t3,$s3,lsr#8
+	strb	$t1,[$rounds,#12]
+	strb	$t2,[$rounds,#13]
+	strb	$t3,[$rounds,#14]
+	strb	$s3,[$rounds,#15]
+
+	ldmia   sp!,{r4-r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+.size	AES_decrypt,.-AES_decrypt
+
+.type   _armv4_AES_decrypt,%function
+.align	2
+_armv4_AES_decrypt:
+	str	lr,[sp,#-4]!		@ push lr
+	ldr	$t1,[$key],#16
+	ldr	$t2,[$key,#-12]
+	ldr	$t3,[$key,#-8]
+	ldr	$i1,[$key,#-4]
+	ldr	$rounds,[$key,#240-16]
+	eor	$s0,$s0,$t1
+	eor	$s1,$s1,$t2
+	eor	$s2,$s2,$t3
+	eor	$s3,$s3,$i1
+	sub	$rounds,$rounds,#1
+	mov	lr,#255
+
+.Ldec_loop:
+	and	$i1,lr,$s0,lsr#16
+	and	$i2,lr,$s0,lsr#8
+	and	$i3,lr,$s0
+	mov	$s0,$s0,lsr#24
+	ldr	$t1,[$tbl,$i1,lsl#2]	@ Td1[s0>>16]
+	ldr	$s0,[$tbl,$s0,lsl#2]	@ Td0[s0>>24]
+	ldr	$t2,[$tbl,$i2,lsl#2]	@ Td2[s0>>8]
+	ldr	$t3,[$tbl,$i3,lsl#2]	@ Td3[s0>>0]
+
+	and	$i1,lr,$s1		@ i0
+	and	$i2,lr,$s1,lsr#16
+	and	$i3,lr,$s1,lsr#8
+	mov	$s1,$s1,lsr#24
+	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td3[s1>>0]
+	ldr	$s1,[$tbl,$s1,lsl#2]	@ Td0[s1>>24]
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td1[s1>>16]
+	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td2[s1>>8]
+	eor	$s0,$s0,$i1,ror#24
+	eor	$s1,$s1,$t1,ror#8
+	eor	$t2,$i2,$t2,ror#8
+	eor	$t3,$i3,$t3,ror#8
+
+	and	$i1,lr,$s2,lsr#8	@ i0
+	and	$i2,lr,$s2		@ i1
+	and	$i3,lr,$s2,lsr#16
+	mov	$s2,$s2,lsr#24
+	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td2[s2>>8]
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td3[s2>>0]
+	ldr	$s2,[$tbl,$s2,lsl#2]	@ Td0[s2>>24]
+	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td1[s2>>16]
+	eor	$s0,$s0,$i1,ror#16
+	eor	$s1,$s1,$i2,ror#24
+	eor	$s2,$s2,$t2,ror#8
+	eor	$t3,$i3,$t3,ror#8
+
+	and	$i1,lr,$s3,lsr#16	@ i0
+	and	$i2,lr,$s3,lsr#8	@ i1
+	and	$i3,lr,$s3		@ i2
+	mov	$s3,$s3,lsr#24
+	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td1[s3>>16]
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td2[s3>>8]
+	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td3[s3>>0]
+	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
+	eor	$s0,$s0,$i1,ror#8
+	eor	$s1,$s1,$i2,ror#16
+	eor	$s2,$s2,$i3,ror#24
+	eor	$s3,$s3,$t3,ror#8
+
+	ldr	$t1,[$key],#16
+	ldr	$t2,[$key,#-12]
+	ldr	$t3,[$key,#-8]
+	ldr	$i1,[$key,#-4]
+	eor	$s0,$s0,$t1
+	eor	$s1,$s1,$t2
+	eor	$s2,$s2,$t3
+	eor	$s3,$s3,$i1
+
+	subs	$rounds,$rounds,#1
+	bne	.Ldec_loop
+
+	add	$tbl,$tbl,#1024
+
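+	@ Td4, the byte-wide inverse S-box, sits 1KB past Td0 (256 word
+	@ entries); the otherwise redundant loads below just pull its
+	@ cache lines in ahead of the byte lookups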
+	ldr	$t1,[$tbl,#0]		@ prefetch Td4
+	ldr	$t2,[$tbl,#32]
+	ldr	$t3,[$tbl,#64]
+	ldr	$i1,[$tbl,#96]
+	ldr	$i2,[$tbl,#128]
+	ldr	$i3,[$tbl,#160]
+	ldr	$t1,[$tbl,#192]
+	ldr	$t2,[$tbl,#224]
+
+	and	$i1,lr,$s0,lsr#16
+	and	$i2,lr,$s0,lsr#8
+	and	$i3,lr,$s0
+	ldrb	$s0,[$tbl,$s0,lsr#24]	@ Td4[s0>>24]
+	ldrb	$t1,[$tbl,$i1]		@ Td4[s0>>16]
+	ldrb	$t2,[$tbl,$i2]		@ Td4[s0>>8]
+	ldrb	$t3,[$tbl,$i3]		@ Td4[s0>>0]
+
+	and	$i1,lr,$s1		@ i0
+	and	$i2,lr,$s1,lsr#16
+	and	$i3,lr,$s1,lsr#8
+	ldrb	$i1,[$tbl,$i1]		@ Td4[s1>>0]
+	ldrb	$s1,[$tbl,$s1,lsr#24]	@ Td4[s1>>24]
+	ldrb	$i2,[$tbl,$i2]		@ Td4[s1>>16]
+	ldrb	$i3,[$tbl,$i3]		@ Td4[s1>>8]
+	eor	$s0,$i1,$s0,lsl#24
+	eor	$s1,$t1,$s1,lsl#8
+	eor	$t2,$t2,$i2,lsl#8
+	eor	$t3,$t3,$i3,lsl#8
+
+	and	$i1,lr,$s2,lsr#8	@ i0
+	and	$i2,lr,$s2		@ i1
+	and	$i3,lr,$s2,lsr#16
+	ldrb	$i1,[$tbl,$i1]		@ Td4[s2>>8]
+	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
+	ldrb	$s2,[$tbl,$s2,lsr#24]	@ Td4[s2>>24]
+	ldrb	$i3,[$tbl,$i3]		@ Td4[s2>>16]
+	eor	$s0,$s0,$i1,lsl#8
+	eor	$s1,$i2,$s1,lsl#16
+	eor	$s2,$t2,$s2,lsl#16
+	eor	$t3,$t3,$i3,lsl#16
+
+	and	$i1,lr,$s3,lsr#16	@ i0
+	and	$i2,lr,$s3,lsr#8	@ i1
+	and	$i3,lr,$s3		@ i2
+	ldrb	$i1,[$tbl,$i1]		@ Td4[s3>>16]
+	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
+	ldrb	$i3,[$tbl,$i3]		@ Td4[s3>>0]
+	ldrb	$s3,[$tbl,$s3,lsr#24]	@ Td4[s3>>24]
+	eor	$s0,$s0,$i1,lsl#16
+	eor	$s1,$s1,$i2,lsl#8
+	eor	$s2,$i3,$s2,lsl#8
+	eor	$s3,$t3,$s3,lsl#24
+
+	ldr	lr,[sp],#4		@ pop lr
+	ldr	$t1,[$key,#0]
+	ldr	$t2,[$key,#4]
+	ldr	$t3,[$key,#8]
+	ldr	$i1,[$key,#12]
+	eor	$s0,$s0,$t1
+	eor	$s1,$s1,$t2
+	eor	$s2,$s2,$t3
+	eor	$s3,$s3,$i1
+
+	sub	$tbl,$tbl,#1024
+	mov	pc,lr			@ return
+.size	_armv4_AES_decrypt,.-_armv4_AES_decrypt
+.asciz	"AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
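+# 0xe12fff1e is the instruction encoding of "bx lr"; substituting the
+# raw word lets the module assemble with -march=armv4, where the "bx"
+# mnemonic is rejected, while v4T and later cores still execute it.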
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+print $code;
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl
new file mode 100644
index 0000000000..ce427655ef
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-ppc.pl
@@ -0,0 +1,1176 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# Needs more work: key setup, page boundaries, CBC routine...
+#
+# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with a
+# 128-bit key, which is ~40% better than 64-bit code generated by gcc
+# 4.0. But these are not the ones currently used! Their "compact"
+# counterparts are, for security reasons. ppc_AES_encrypt_compact runs
+# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact runs
+# at 1/3 of ppc_AES_decrypt.
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+	$SIZE_T	=8;
+	$STU	="stdu";
+	$POP	="ld";
+	$PUSH	="std";
+} elsif ($flavour =~ /32/) {
+	$SIZE_T	=4;
+	$STU	="stwu";
+	$POP	="lwz";
+	$PUSH	="stw";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=32*$SIZE_T;
+
+sub _data_word()
+{ my $i;
+    while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
+}
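+# Each table word is emitted twice, making every entry 8 bytes wide:
+# the round loops index with "byte<<3" and read through four pointers
+# offset by 0, 3, 2 and 1 bytes (see Lppc_AES_encrypt), so a single
+# doubled table yields all four rotated copies of the same word.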
+
+$sp="r1";
+$toc="r2";
+$inp="r3";
+$out="r4";
+$key="r5";
+
+$Tbl0="r3";
+$Tbl1="r6";
+$Tbl2="r7";
+$Tbl3="r2";
+
+$s0="r8";
+$s1="r9";
+$s2="r10";
+$s3="r11";
+
+$t0="r12";
+$t1="r13";
+$t2="r14";
+$t3="r15";
+
+$acc00="r16";
+$acc01="r17";
+$acc02="r18";
+$acc03="r19";
+
+$acc04="r20";
+$acc05="r21";
+$acc06="r22";
+$acc07="r23";
+
+$acc08="r24";
+$acc09="r25";
+$acc10="r26";
+$acc11="r27";
+
+$acc12="r28";
+$acc13="r29";
+$acc14="r30";
+$acc15="r31";
+
+# stay away from TLS pointer
+if ($SIZE_T==8)	{ die if ($t1 ne "r13");  $t1="r0";		}
+else		{ die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0";	}
+$mask80=$Tbl2;
+$mask1b=$Tbl3;
+
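+# LAES_Te/LAES_Td return the address of the corresponding table in
+# $Tbl0: "bcl 20,31,$+4" is a branch-and-link to the very next
+# instruction, so mflr picks up the current address and the addi
+# turns it into a position-independent pointer to the data below.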
+$code.=<<___;
+.machine	"any"
+.text
+
+.align	7
+LAES_Te:
+	mflr	r0
+	bcl	20,31,\$+4
+	mflr	$Tbl0	;    vvvvv "distance" between . and 1st data entry
+	addi	$Tbl0,$Tbl0,`128-8`
+	mtlr	r0
+	blr
+	.space	`32-24`
+LAES_Td:
+	mflr	r0
+	bcl	20,31,\$+4
+	mflr	$Tbl0	;    vvvvvvvv "distance" between . and 1st data entry
+	addi	$Tbl0,$Tbl0,`128-8-32+2048+256`
+	mtlr	r0
+	blr
+	.space	`128-32-24`
+___
+&_data_word(
+	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
+	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
+	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
+	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
+	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
+	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
+	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
+	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
+	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
+	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
+	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
+	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
+	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
+	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
+	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
+	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
+	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
+	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
+	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
+	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
+	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
+	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
+	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
+	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
+	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
+	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
+	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
+	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
+	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
+	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
+	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
+	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
+	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
+	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
+	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
+	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
+	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
+	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
+	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
+	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
+	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
+	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
+	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
+	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
+	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
+	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
+	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
+	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
+	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
+	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
+	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
+	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
+	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
+	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
+	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
+	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
+	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
+	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
+	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
+	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
+	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
+	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
+	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
+	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
+$code.=<<___;
+.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+___
+&_data_word(
+	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
+	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
+	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
+	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
+	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
+	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
+	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
+	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
+	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
+	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
+	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
+	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
+	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
+	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
+	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
+	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
+	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
+	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
+	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
+	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
+	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
+	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
+	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
+	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
+	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
+	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
+	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
+	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
+	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
+	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
+	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
+	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
+	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
+	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
+	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
+	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
+	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
+	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
+	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
+	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
+	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
+	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
+	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
+	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
+	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
+	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
+	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
+	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
+	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
+	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
+	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
+	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
+	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
+	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
+	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
+	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
+	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
+	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
+	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
+	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
+	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
+	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
+	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
+	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
+$code.=<<___;
+.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+
+
+.globl	.AES_encrypt
+.align	7
+.AES_encrypt:
+	mflr	r0
+	$STU	$sp,-$FRAME($sp)
+
+	$PUSH	r0,`$FRAME-$SIZE_T*21`($sp)
+	$PUSH	$toc,`$FRAME-$SIZE_T*20`($sp)
+	$PUSH	r13,`$FRAME-$SIZE_T*19`($sp)
+	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
+	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
+	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
+	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
+	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
+	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
+	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
+	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
+	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
+	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
+	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
+	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
+	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
+	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
+	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
+	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
+	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
+	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
+
+	lwz	$s0,0($inp)
+	lwz	$s1,4($inp)
+	lwz	$s2,8($inp)
+	lwz	$s3,12($inp)
+	bl	LAES_Te
+	bl	Lppc_AES_encrypt_compact
+	stw	$s0,0($out)
+	stw	$s1,4($out)
+	stw	$s2,8($out)
+	stw	$s3,12($out)
+
+	$POP	r0,`$FRAME-$SIZE_T*21`($sp)
+	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
+	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
+	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
+	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
+	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
+	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
+	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
+	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
+	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
+	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
+	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
+	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
+	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
+	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
+	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
+	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
+	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
+	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
+	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
+	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
+	mtlr	r0
+	addi	$sp,$sp,$FRAME
+	blr
+
+.align	4
+Lppc_AES_encrypt:
+	lwz	$acc00,240($key)
+	lwz	$t0,0($key)
+	lwz	$t1,4($key)
+	lwz	$t2,8($key)
+	lwz	$t3,12($key)
+	addi	$Tbl1,$Tbl0,3
+	addi	$Tbl2,$Tbl0,2
+	addi	$Tbl3,$Tbl0,1
+	addi	$acc00,$acc00,-1
+	addi	$key,$key,16
+	xor	$s0,$s0,$t0
+	xor	$s1,$s1,$t1
+	xor	$s2,$s2,$t2
+	xor	$s3,$s3,$t3
+	mtctr	$acc00
+.align	4
+Lenc_loop:
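+	# the rlwinm's below extract one state byte each and scale it by 8
+	# (mask 21..28 keeps "byte<<3") to index the doubled 8-byte table
+	# entries; loads through Tbl0..Tbl3 then pick up the four rotated
+	# copies of the same word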
+	rlwinm	$acc00,$s0,`32-24+3`,21,28
+	rlwinm	$acc01,$s1,`32-24+3`,21,28
+	lwz	$t0,0($key)
+	lwz	$t1,4($key)
+	rlwinm	$acc02,$s2,`32-24+3`,21,28
+	rlwinm	$acc03,$s3,`32-24+3`,21,28
+	lwz	$t2,8($key)
+	lwz	$t3,12($key)
+	rlwinm	$acc04,$s1,`32-16+3`,21,28
+	rlwinm	$acc05,$s2,`32-16+3`,21,28
+	lwzx	$acc00,$Tbl0,$acc00
+	lwzx	$acc01,$Tbl0,$acc01
+	rlwinm	$acc06,$s3,`32-16+3`,21,28
+	rlwinm	$acc07,$s0,`32-16+3`,21,28
+	lwzx	$acc02,$Tbl0,$acc02
+	lwzx	$acc03,$Tbl0,$acc03
+	rlwinm	$acc08,$s2,`32-8+3`,21,28
+	rlwinm	$acc09,$s3,`32-8+3`,21,28
+	lwzx	$acc04,$Tbl1,$acc04
+	lwzx	$acc05,$Tbl1,$acc05
+	rlwinm	$acc10,$s0,`32-8+3`,21,28
+	rlwinm	$acc11,$s1,`32-8+3`,21,28
+	lwzx	$acc06,$Tbl1,$acc06
+	lwzx	$acc07,$Tbl1,$acc07
+	rlwinm	$acc12,$s3,`0+3`,21,28
+	rlwinm	$acc13,$s0,`0+3`,21,28
+	lwzx	$acc08,$Tbl2,$acc08
+	lwzx	$acc09,$Tbl2,$acc09
+	rlwinm	$acc14,$s1,`0+3`,21,28
+	rlwinm	$acc15,$s2,`0+3`,21,28
+	lwzx	$acc10,$Tbl2,$acc10
+	lwzx	$acc11,$Tbl2,$acc11
+	xor	$t0,$t0,$acc00
+	xor	$t1,$t1,$acc01
+	lwzx	$acc12,$Tbl3,$acc12
+	lwzx	$acc13,$Tbl3,$acc13
+	xor	$t2,$t2,$acc02
+	xor	$t3,$t3,$acc03
+	lwzx	$acc14,$Tbl3,$acc14
+	lwzx	$acc15,$Tbl3,$acc15
+	xor	$t0,$t0,$acc04
+	xor	$t1,$t1,$acc05
+	xor	$t2,$t2,$acc06
+	xor	$t3,$t3,$acc07
+	xor	$t0,$t0,$acc08
+	xor	$t1,$t1,$acc09
+	xor	$t2,$t2,$acc10
+	xor	$t3,$t3,$acc11
+	xor	$s0,$t0,$acc12
+	xor	$s1,$t1,$acc13
+	xor	$s2,$t2,$acc14
+	xor	$s3,$t3,$acc15
+	addi	$key,$key,16
+	bdnz-	Lenc_loop
+
+	addi	$Tbl2,$Tbl0,2048
+	nop
+	lwz	$acc08,`2048+0`($Tbl0)	! prefetch Te4
+	lwz	$acc09,`2048+32`($Tbl0)
+	lwz	$acc10,`2048+64`($Tbl0)
+	lwz	$acc11,`2048+96`($Tbl0)
+	lwz	$acc08,`2048+128`($Tbl0)
+	lwz	$acc09,`2048+160`($Tbl0)
+	lwz	$acc10,`2048+192`($Tbl0)
+	lwz	$acc11,`2048+224`($Tbl0)
+	rlwinm	$acc00,$s0,`32-24`,24,31
+	rlwinm	$acc01,$s1,`32-24`,24,31
+	lwz	$t0,0($key)
+	lwz	$t1,4($key)
+	rlwinm	$acc02,$s2,`32-24`,24,31
+	rlwinm	$acc03,$s3,`32-24`,24,31
+	lwz	$t2,8($key)
+	lwz	$t3,12($key)
+	rlwinm	$acc04,$s1,`32-16`,24,31
+	rlwinm	$acc05,$s2,`32-16`,24,31
+	lbzx	$acc00,$Tbl2,$acc00
+	lbzx	$acc01,$Tbl2,$acc01
+	rlwinm	$acc06,$s3,`32-16`,24,31
+	rlwinm	$acc07,$s0,`32-16`,24,31
+	lbzx	$acc02,$Tbl2,$acc02
+	lbzx	$acc03,$Tbl2,$acc03
+	rlwinm	$acc08,$s2,`32-8`,24,31
+	rlwinm	$acc09,$s3,`32-8`,24,31
+	lbzx	$acc04,$Tbl2,$acc04
+	lbzx	$acc05,$Tbl2,$acc05
+	rlwinm	$acc10,$s0,`32-8`,24,31
+	rlwinm	$acc11,$s1,`32-8`,24,31
+	lbzx	$acc06,$Tbl2,$acc06
+	lbzx	$acc07,$Tbl2,$acc07
+	rlwinm	$acc12,$s3,`0`,24,31
+	rlwinm	$acc13,$s0,`0`,24,31
+	lbzx	$acc08,$Tbl2,$acc08
+	lbzx	$acc09,$Tbl2,$acc09
+	rlwinm	$acc14,$s1,`0`,24,31
+	rlwinm	$acc15,$s2,`0`,24,31
+	lbzx	$acc10,$Tbl2,$acc10
+	lbzx	$acc11,$Tbl2,$acc11
+	rlwinm	$s0,$acc00,24,0,7
+	rlwinm	$s1,$acc01,24,0,7
+	lbzx	$acc12,$Tbl2,$acc12
+	lbzx	$acc13,$Tbl2,$acc13
+	rlwinm	$s2,$acc02,24,0,7
+	rlwinm	$s3,$acc03,24,0,7
+	lbzx	$acc14,$Tbl2,$acc14
+	lbzx	$acc15,$Tbl2,$acc15
+	rlwimi	$s0,$acc04,16,8,15
+	rlwimi	$s1,$acc05,16,8,15
+	rlwimi	$s2,$acc06,16,8,15
+	rlwimi	$s3,$acc07,16,8,15
+	rlwimi	$s0,$acc08,8,16,23
+	rlwimi	$s1,$acc09,8,16,23
+	rlwimi	$s2,$acc10,8,16,23
+	rlwimi	$s3,$acc11,8,16,23
+	or	$s0,$s0,$acc12
+	or	$s1,$s1,$acc13
+	or	$s2,$s2,$acc14
+	or	$s3,$s3,$acc15
+	xor	$s0,$s0,$t0
+	xor	$s1,$s1,$t1
+	xor	$s2,$s2,$t2
+	xor	$s3,$s3,$t3
+	blr
+
+.align	4
+Lppc_AES_encrypt_compact:
+	lwz	$acc00,240($key)
+	lwz	$t0,0($key)
+	lwz	$t1,4($key)
+	lwz	$t2,8($key)
+	lwz	$t3,12($key)
+	addi	$Tbl1,$Tbl0,2048
+	lis	$mask80,0x8080
+	lis	$mask1b,0x1b1b
+	addi	$key,$key,16
+	ori	$mask80,$mask80,0x8080
+	ori	$mask1b,$mask1b,0x1b1b
+	mtctr	$acc00
+.align	4
+Lenc_compact_loop:
+	xor	$s0,$s0,$t0
+	xor	$s1,$s1,$t1
+	xor	$s2,$s2,$t2
+	xor	$s3,$s3,$t3
+	rlwinm	$acc00,$s0,`32-24`,24,31
+	rlwinm	$acc01,$s1,`32-24`,24,31
+	rlwinm	$acc02,$s2,`32-24`,24,31
+	rlwinm	$acc03,$s3,`32-24`,24,31
+	lbzx	$acc00,$Tbl1,$acc00
+	lbzx	$acc01,$Tbl1,$acc01
+	rlwinm	$acc04,$s1,`32-16`,24,31
+	rlwinm	$acc05,$s2,`32-16`,24,31
+	lbzx	$acc02,$Tbl1,$acc02
+	lbzx	$acc03,$Tbl1,$acc03
+	rlwinm	$acc06,$s3,`32-16`,24,31
+	rlwinm	$acc07,$s0,`32-16`,24,31
+	lbzx	$acc04,$Tbl1,$acc04
+	lbzx	$acc05,$Tbl1,$acc05
+	rlwinm	$acc08,$s2,`32-8`,24,31
+	rlwinm	$acc09,$s3,`32-8`,24,31
+	lbzx	$acc06,$Tbl1,$acc06
+	lbzx	$acc07,$Tbl1,$acc07
+	rlwinm	$acc10,$s0,`32-8`,24,31
+	rlwinm	$acc11,$s1,`32-8`,24,31
+	lbzx	$acc08,$Tbl1,$acc08
+	lbzx	$acc09,$Tbl1,$acc09
+	rlwinm	$acc12,$s3,`0`,24,31
+	rlwinm	$acc13,$s0,`0`,24,31
+	lbzx	$acc10,$Tbl1,$acc10
+	lbzx	$acc11,$Tbl1,$acc11
+	rlwinm	$acc14,$s1,`0`,24,31
+	rlwinm	$acc15,$s2,`0`,24,31
+	lbzx	$acc12,$Tbl1,$acc12
+	lbzx	$acc13,$Tbl1,$acc13
+	rlwinm	$s0,$acc00,24,0,7
+	rlwinm	$s1,$acc01,24,0,7
+	lbzx	$acc14,$Tbl1,$acc14
+	lbzx	$acc15,$Tbl1,$acc15
+	rlwinm	$s2,$acc02,24,0,7
+	rlwinm	$s3,$acc03,24,0,7
+	rlwimi	$s0,$acc04,16,8,15
+	rlwimi	$s1,$acc05,16,8,15
+	rlwimi	$s2,$acc06,16,8,15
+	rlwimi	$s3,$acc07,16,8,15
+	rlwimi	$s0,$acc08,8,16,23
+	rlwimi	$s1,$acc09,8,16,23
+	rlwimi	$s2,$acc10,8,16,23
+	rlwimi	$s3,$acc11,8,16,23
+	lwz	$t0,0($key)
+	lwz	$t1,4($key)
+	or	$s0,$s0,$acc12
+	or	$s1,$s1,$acc13
+	lwz	$t2,8($key)
+	lwz	$t3,12($key)
+	or	$s2,$s2,$acc14
+	or	$s3,$s3,$acc15
+
+	addi	$key,$key,16
+	bdz	Lenc_compact_done
+
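+	# table-free MixColumns: the block below computes r2 = xtime(r0)
+	# on four packed bytes per register, as
+	# ((r0 & 0x7f7f7f7f) << 1) ^ ((m - (m >> 7)) & 0x1b1b1b1b) with
+	# m = r0 & 0x80808080; the rotate/xor tail then folds r0 and r2
+	# into the MixColumns output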
+	and	$acc00,$s0,$mask80	# r1=r0&0x80808080
+	and	$acc01,$s1,$mask80
+	and	$acc02,$s2,$mask80
+	and	$acc03,$s3,$mask80
+	srwi	$acc04,$acc00,7		# r1>>7
+	srwi	$acc05,$acc01,7
+	srwi	$acc06,$acc02,7
+	srwi	$acc07,$acc03,7
+	andc	$acc08,$s0,$mask80	# r0&0x7f7f7f7f
+	andc	$acc09,$s1,$mask80
+	andc	$acc10,$s2,$mask80
+	andc	$acc11,$s3,$mask80
+	sub	$acc00,$acc00,$acc04	# r1-(r1>>7)
+	sub	$acc01,$acc01,$acc05
+	sub	$acc02,$acc02,$acc06
+	sub	$acc03,$acc03,$acc07
+	add	$acc08,$acc08,$acc08	# (r0&0x7f7f7f7f)<<1
+	add	$acc09,$acc09,$acc09
+	add	$acc10,$acc10,$acc10
+	add	$acc11,$acc11,$acc11
+	and	$acc00,$acc00,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
+	and	$acc01,$acc01,$mask1b
+	and	$acc02,$acc02,$mask1b
+	and	$acc03,$acc03,$mask1b
+	xor	$acc00,$acc00,$acc08	# r2
+	xor	$acc01,$acc01,$acc09
+	xor	$acc02,$acc02,$acc10
+	xor	$acc03,$acc03,$acc11
+
+	rotlwi	$acc12,$s0,16		# ROTATE(r0,16)
+	rotlwi	$acc13,$s1,16
+	rotlwi	$acc14,$s2,16
+	rotlwi	$acc15,$s3,16
+	xor	$s0,$s0,$acc00		# r0^r2
+	xor	$s1,$s1,$acc01
+	xor	$s2,$s2,$acc02
+	xor	$s3,$s3,$acc03
+	rotrwi	$s0,$s0,24		# ROTATE(r2^r0,24)
+	rotrwi	$s1,$s1,24
+	rotrwi	$s2,$s2,24
+	rotrwi	$s3,$s3,24
+	xor	$s0,$s0,$acc00		# ROTATE(r2^r0,24)^r2
+	xor	$s1,$s1,$acc01
+	xor	$s2,$s2,$acc02
+	xor	$s3,$s3,$acc03
+	rotlwi	$acc08,$acc12,8		# ROTATE(r0,24)
+	rotlwi	$acc09,$acc13,8
+	rotlwi	$acc10,$acc14,8
+	rotlwi	$acc11,$acc15,8
+	xor	$s0,$s0,$acc12		#
+	xor	$s1,$s1,$acc13
+	xor	$s2,$s2,$acc14
+	xor	$s3,$s3,$acc15
+	xor	$s0,$s0,$acc08		#
+	xor	$s1,$s1,$acc09
+	xor	$s2,$s2,$acc10
+	xor	$s3,$s3,$acc11
+
+	b	Lenc_compact_loop
+.align	4
+Lenc_compact_done:
+	xor	$s0,$s0,$t0
+	xor	$s1,$s1,$t1
+	xor	$s2,$s2,$t2
+	xor	$s3,$s3,$t3
+	blr
+
+.globl	.AES_decrypt
+.align	7
+.AES_decrypt:
+	mflr	r0
+	$STU	$sp,-$FRAME($sp)
+
+	$PUSH	r0,`$FRAME-$SIZE_T*21`($sp)
+	$PUSH	$toc,`$FRAME-$SIZE_T*20`($sp)
+	$PUSH	r13,`$FRAME-$SIZE_T*19`($sp)
+	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
+	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
+	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
+	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
+	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
+	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
+	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
+	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
+	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
+	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
+	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
+	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
+	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
+	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
+	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
+	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
+	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
+	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
+
+	lwz	$s0,0($inp)
+	lwz	$s1,4($inp)
+	lwz	$s2,8($inp)
+	lwz	$s3,12($inp)
+	bl	LAES_Td
+	bl	Lppc_AES_decrypt_compact
+	stw	$s0,0($out)
+	stw	$s1,4($out)
+	stw	$s2,8($out)
+	stw	$s3,12($out)
+
+	$POP	r0,`$FRAME-$SIZE_T*21`($sp)
+	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
+	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
+	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
+	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
+	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
+	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
+	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
+	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
+	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
+	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
+	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
+	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
+	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
+	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
+	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
+	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
+	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
+	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
+	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
+	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
+	mtlr	r0
+	addi	$sp,$sp,$FRAME
+	blr
+
+.align	4
+Lppc_AES_decrypt:
+	lwz	$acc00,240($key)
+	lwz	$t0,0($key)
+	lwz	$t1,4($key)
+	lwz	$t2,8($key)
+	lwz	$t3,12($key)
+	addi	$Tbl1,$Tbl0,3
+	addi	$Tbl2,$Tbl0,2
+	addi	$Tbl3,$Tbl0,1
+	addi	$acc00,$acc00,-1
+	addi	$key,$key,16
+	xor	$s0,$s0,$t0
+	xor	$s1,$s1,$t1
+	xor	$s2,$s2,$t2
+	xor	$s3,$s3,$t3
+	mtctr	$acc00
+.align	4
+Ldec_loop:
+	rlwinm	$acc00,$s0,`32-24+3`,21,28
+	rlwinm	$acc01,$s1,`32-24+3`,21,28
+	lwz	$t0,0($key)
+	lwz	$t1,4($key)
+	rlwinm	$acc02,$s2,`32-24+3`,21,28
+	rlwinm	$acc03,$s3,`32-24+3`,21,28
+	lwz	$t2,8($key)
+	lwz	$t3,12($key)
+	rlwinm	$acc04,$s3,`32-16+3`,21,28
+	rlwinm	$acc05,$s0,`32-16+3`,21,28
+	lwzx	$acc00,$Tbl0,$acc00
+	lwzx	$acc01,$Tbl0,$acc01
+	rlwinm	$acc06,$s1,`32-16+3`,21,28
+	rlwinm	$acc07,$s2,`32-16+3`,21,28
+	lwzx	$acc02,$Tbl0,$acc02
+	lwzx	$acc03,$Tbl0,$acc03
+	rlwinm	$acc08,$s2,`32-8+3`,21,28
+	rlwinm	$acc09,$s3,`32-8+3`,21,28
+	lwzx	$acc04,$Tbl1,$acc04
+	lwzx	$acc05,$Tbl1,$acc05
+	rlwinm	$acc10,$s0,`32-8+3`,21,28
+	rlwinm	$acc11,$s1,`32-8+3`,21,28
+	lwzx	$acc06,$Tbl1,$acc06
+	lwzx	$acc07,$Tbl1,$acc07
+	rlwinm	$acc12,$s1,`0+3`,21,28
+	rlwinm	$acc13,$s2,`0+3`,21,28
+	lwzx	$acc08,$Tbl2,$acc08
+	lwzx	$acc09,$Tbl2,$acc09
+	rlwinm	$acc14,$s3,`0+3`,21,28
+	rlwinm	$acc15,$s0,`0+3`,21,28
+	lwzx	$acc10,$Tbl2,$acc10
+	lwzx	$acc11,$Tbl2,$acc11
+	xor	$t0,$t0,$acc00
+	xor	$t1,$t1,$acc01
+	lwzx	$acc12,$Tbl3,$acc12
+	lwzx	$acc13,$Tbl3,$acc13
+	xor	$t2,$t2,$acc02
+	xor	$t3,$t3,$acc03
+	lwzx	$acc14,$Tbl3,$acc14
+	lwzx	$acc15,$Tbl3,$acc15
+	xor	$t0,$t0,$acc04
+	xor	$t1,$t1,$acc05
+	xor	$t2,$t2,$acc06
+	xor	$t3,$t3,$acc07
+	xor	$t0,$t0,$acc08
+	xor	$t1,$t1,$acc09
+	xor	$t2,$t2,$acc10
+	xor	$t3,$t3,$acc11
+	xor	$s0,$t0,$acc12
+	xor	$s1,$t1,$acc13
+	xor	$s2,$t2,$acc14
+	xor	$s3,$t3,$acc15
+	addi	$key,$key,16
+	bdnz-	Ldec_loop
+
+	addi	$Tbl2,$Tbl0,2048
+	nop
+	lwz	$acc08,`2048+0`($Tbl0)	! prefetch Td4
+	lwz	$acc09,`2048+32`($Tbl0)
+	lwz	$acc10,`2048+64`($Tbl0)
+	lwz	$acc11,`2048+96`($Tbl0)
+	lwz	$acc08,`2048+128`($Tbl0)
+	lwz	$acc09,`2048+160`($Tbl0)
+	lwz	$acc10,`2048+192`($Tbl0)
+	lwz	$acc11,`2048+224`($Tbl0)
+	rlwinm	$acc00,$s0,`32-24`,24,31
+	rlwinm	$acc01,$s1,`32-24`,24,31
+	lwz	$t0,0($key)
+	lwz	$t1,4($key)
+	rlwinm	$acc02,$s2,`32-24`,24,31
+	rlwinm	$acc03,$s3,`32-24`,24,31
+	lwz	$t2,8($key)
+	lwz	$t3,12($key)
+	rlwinm	$acc04,$s3,`32-16`,24,31
+	rlwinm	$acc05,$s0,`32-16`,24,31
+	lbzx	$acc00,$Tbl2,$acc00
+	lbzx	$acc01,$Tbl2,$acc01
+	rlwinm	$acc06,$s1,`32-16`,24,31
+	rlwinm	$acc07,$s2,`32-16`,24,31
+	lbzx	$acc02,$Tbl2,$acc02
+	lbzx	$acc03,$Tbl2,$acc03
+	rlwinm	$acc08,$s2,`32-8`,24,31
+	rlwinm	$acc09,$s3,`32-8`,24,31
+	lbzx	$acc04,$Tbl2,$acc04
+	lbzx	$acc05,$Tbl2,$acc05
+	rlwinm	$acc10,$s0,`32-8`,24,31
+	rlwinm	$acc11,$s1,`32-8`,24,31
+	lbzx	$acc06,$Tbl2,$acc06
+	lbzx	$acc07,$Tbl2,$acc07
+	rlwinm	$acc12,$s1,`0`,24,31
+	rlwinm	$acc13,$s2,`0`,24,31
+	lbzx	$acc08,$Tbl2,$acc08
+	lbzx	$acc09,$Tbl2,$acc09
+	rlwinm	$acc14,$s3,`0`,24,31
+	rlwinm	$acc15,$s0,`0`,24,31
+	lbzx	$acc10,$Tbl2,$acc10
+	lbzx	$acc11,$Tbl2,$acc11
+	rlwinm	$s0,$acc00,24,0,7
+	rlwinm	$s1,$acc01,24,0,7
+	lbzx	$acc12,$Tbl2,$acc12
+	lbzx	$acc13,$Tbl2,$acc13
+	rlwinm	$s2,$acc02,24,0,7
+	rlwinm	$s3,$acc03,24,0,7
+	lbzx	$acc14,$Tbl2,$acc14
+	lbzx	$acc15,$Tbl2,$acc15
+	rlwimi	$s0,$acc04,16,8,15
+	rlwimi	$s1,$acc05,16,8,15
+	rlwimi	$s2,$acc06,16,8,15
+	rlwimi	$s3,$acc07,16,8,15
+	rlwimi	$s0,$acc08,8,16,23
+	rlwimi	$s1,$acc09,8,16,23
+	rlwimi	$s2,$acc10,8,16,23
+	rlwimi	$s3,$acc11,8,16,23
+	or	$s0,$s0,$acc12
+	or	$s1,$s1,$acc13
+	or	$s2,$s2,$acc14
+	or	$s3,$s3,$acc15
+	xor	$s0,$s0,$t0
+	xor	$s1,$s1,$t1
+	xor	$s2,$s2,$t2
+	xor	$s3,$s3,$t3
+	blr
+
+.align	4
+Lppc_AES_decrypt_compact:
+	lwz	$acc00,240($key)
+	lwz	$t0,0($key)
+	lwz	$t1,4($key)
+	lwz	$t2,8($key)
+	lwz	$t3,12($key)
+	addi	$Tbl1,$Tbl0,2048
+	lis	$mask80,0x8080
+	lis	$mask1b,0x1b1b
+	addi	$key,$key,16
+	ori	$mask80,$mask80,0x8080
+	ori	$mask1b,$mask1b,0x1b1b
+___
+$code.=<<___ if ($SIZE_T==8);
+	insrdi	$mask80,$mask80,32,0
+	insrdi	$mask1b,$mask1b,32,0
+___
+$code.=<<___;
+	mtctr	$acc00
+.align	4
+Ldec_compact_loop:
+	xor	$s0,$s0,$t0
+	xor	$s1,$s1,$t1
+	xor	$s2,$s2,$t2
+	xor	$s3,$s3,$t3
+	rlwinm	$acc00,$s0,`32-24`,24,31
+	rlwinm	$acc01,$s1,`32-24`,24,31
+	rlwinm	$acc02,$s2,`32-24`,24,31
+	rlwinm	$acc03,$s3,`32-24`,24,31
+	lbzx	$acc00,$Tbl1,$acc00
+	lbzx	$acc01,$Tbl1,$acc01
+	rlwinm	$acc04,$s3,`32-16`,24,31
+	rlwinm	$acc05,$s0,`32-16`,24,31
+	lbzx	$acc02,$Tbl1,$acc02
+	lbzx	$acc03,$Tbl1,$acc03
+	rlwinm	$acc06,$s1,`32-16`,24,31
+	rlwinm	$acc07,$s2,`32-16`,24,31
+	lbzx	$acc04,$Tbl1,$acc04
+	lbzx	$acc05,$Tbl1,$acc05
+	rlwinm	$acc08,$s2,`32-8`,24,31
+	rlwinm	$acc09,$s3,`32-8`,24,31
+	lbzx	$acc06,$Tbl1,$acc06
+	lbzx	$acc07,$Tbl1,$acc07
+	rlwinm	$acc10,$s0,`32-8`,24,31
+	rlwinm	$acc11,$s1,`32-8`,24,31
+	lbzx	$acc08,$Tbl1,$acc08
+	lbzx	$acc09,$Tbl1,$acc09
+	rlwinm	$acc12,$s1,`0`,24,31
+	rlwinm	$acc13,$s2,`0`,24,31
+	lbzx	$acc10,$Tbl1,$acc10
+	lbzx	$acc11,$Tbl1,$acc11
+	rlwinm	$acc14,$s3,`0`,24,31
+	rlwinm	$acc15,$s0,`0`,24,31
+	lbzx	$acc12,$Tbl1,$acc12
+	lbzx	$acc13,$Tbl1,$acc13
+	rlwinm	$s0,$acc00,24,0,7
+	rlwinm	$s1,$acc01,24,0,7
+	lbzx	$acc14,$Tbl1,$acc14
+	lbzx	$acc15,$Tbl1,$acc15
+	rlwinm	$s2,$acc02,24,0,7
+	rlwinm	$s3,$acc03,24,0,7
+	rlwimi	$s0,$acc04,16,8,15
+	rlwimi	$s1,$acc05,16,8,15
+	rlwimi	$s2,$acc06,16,8,15
+	rlwimi	$s3,$acc07,16,8,15
+	rlwimi	$s0,$acc08,8,16,23
+	rlwimi	$s1,$acc09,8,16,23
+	rlwimi	$s2,$acc10,8,16,23
+	rlwimi	$s3,$acc11,8,16,23
+	lwz	$t0,0($key)
+	lwz	$t1,4($key)
+	or	$s0,$s0,$acc12
+	or	$s1,$s1,$acc13
+	lwz	$t2,8($key)
+	lwz	$t3,12($key)
+	or	$s2,$s2,$acc14
+	or	$s3,$s3,$acc15
+
+	addi	$key,$key,16
+	bdz	Ldec_compact_done
+___
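+# Table-free InvMixColumns: r2, r4 and r8 below are successive xtime
+# doublings of the state r0; the 0e/0b/0d/09 column multipliers are
+# then assembled from r8, r4^r0, r2^r0 and rotations in the shared
+# rotrwi/xor tail. On 64-bit builds two 32-bit words are packed per
+# register, halving the instruction count.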
+$code.=<<___ if ($SIZE_T==8);
+	# vectorized permutation improves decrypt performance by 10%
+	insrdi	$s0,$s1,32,0
+	insrdi	$s2,$s3,32,0
+
+	and	$acc00,$s0,$mask80	# r1=r0&0x80808080
+	and	$acc02,$s2,$mask80
+	srdi	$acc04,$acc00,7		# r1>>7
+	srdi	$acc06,$acc02,7
+	andc	$acc08,$s0,$mask80	# r0&0x7f7f7f7f
+	andc	$acc10,$s2,$mask80
+	sub	$acc00,$acc00,$acc04	# r1-(r1>>7)
+	sub	$acc02,$acc02,$acc06
+	add	$acc08,$acc08,$acc08	# (r0&0x7f7f7f7f)<<1
+	add	$acc10,$acc10,$acc10
+	and	$acc00,$acc00,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
+	and	$acc02,$acc02,$mask1b
+	xor	$acc00,$acc00,$acc08	# r2
+	xor	$acc02,$acc02,$acc10
+
+	and	$acc04,$acc00,$mask80	# r1=r2&0x80808080
+	and	$acc06,$acc02,$mask80
+	srdi	$acc08,$acc04,7		# r1>>7
+	srdi	$acc10,$acc06,7
+	andc	$acc12,$acc00,$mask80	# r2&0x7f7f7f7f
+	andc	$acc14,$acc02,$mask80
+	sub	$acc04,$acc04,$acc08	# r1-(r1>>7)
+	sub	$acc06,$acc06,$acc10
+	add	$acc12,$acc12,$acc12	# (r2&0x7f7f7f7f)<<1
+	add	$acc14,$acc14,$acc14
+	and	$acc04,$acc04,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
+	and	$acc06,$acc06,$mask1b
+	xor	$acc04,$acc04,$acc12	# r4
+	xor	$acc06,$acc06,$acc14
+
+	and	$acc08,$acc04,$mask80	# r1=r4&0x80808080
+	and	$acc10,$acc06,$mask80
+	srdi	$acc12,$acc08,7		# r1>>7
+	srdi	$acc14,$acc10,7
+	sub	$acc08,$acc08,$acc12	# r1-(r1>>7)
+	sub	$acc10,$acc10,$acc14
+	andc	$acc12,$acc04,$mask80	# r4&0x7f7f7f7f
+	andc	$acc14,$acc06,$mask80
+	add	$acc12,$acc12,$acc12	# (r4&0x7f7f7f7f)<<1
+	add	$acc14,$acc14,$acc14
+	and	$acc08,$acc08,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
+	and	$acc10,$acc10,$mask1b
+	xor	$acc08,$acc08,$acc12	# r8
+	xor	$acc10,$acc10,$acc14
+
+	xor	$acc00,$acc00,$s0	# r2^r0
+	xor	$acc02,$acc02,$s2
+	xor	$acc04,$acc04,$s0	# r4^r0
+	xor	$acc06,$acc06,$s2
+
+	extrdi	$acc01,$acc00,32,0
+	extrdi	$acc03,$acc02,32,0
+	extrdi	$acc05,$acc04,32,0
+	extrdi	$acc07,$acc06,32,0
+	extrdi	$acc09,$acc08,32,0
+	extrdi	$acc11,$acc10,32,0
+___
+$code.=<<___ if ($SIZE_T==4);
+	and	$acc00,$s0,$mask80	# r1=r0&0x80808080
+	and	$acc01,$s1,$mask80
+	and	$acc02,$s2,$mask80
+	and	$acc03,$s3,$mask80
+	srwi	$acc04,$acc00,7		# r1>>7
+	srwi	$acc05,$acc01,7
+	srwi	$acc06,$acc02,7
+	srwi	$acc07,$acc03,7
+	andc	$acc08,$s0,$mask80	# r0&0x7f7f7f7f
+	andc	$acc09,$s1,$mask80
+	andc	$acc10,$s2,$mask80
+	andc	$acc11,$s3,$mask80
+	sub	$acc00,$acc00,$acc04	# r1-(r1>>7)
+	sub	$acc01,$acc01,$acc05
+	sub	$acc02,$acc02,$acc06
+	sub	$acc03,$acc03,$acc07
+	add	$acc08,$acc08,$acc08	# (r0&0x7f7f7f7f)<<1
+	add	$acc09,$acc09,$acc09
+	add	$acc10,$acc10,$acc10
+	add	$acc11,$acc11,$acc11
+	and	$acc00,$acc00,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
+	and	$acc01,$acc01,$mask1b
+	and	$acc02,$acc02,$mask1b
+	and	$acc03,$acc03,$mask1b
+	xor	$acc00,$acc00,$acc08	# r2
+	xor	$acc01,$acc01,$acc09
+	xor	$acc02,$acc02,$acc10
+	xor	$acc03,$acc03,$acc11
+
+	and	$acc04,$acc00,$mask80	# r1=r2&0x80808080
+	and	$acc05,$acc01,$mask80
+	and	$acc06,$acc02,$mask80
+	and	$acc07,$acc03,$mask80
+	srwi	$acc08,$acc04,7		# r1>>7
+	srwi	$acc09,$acc05,7
+	srwi	$acc10,$acc06,7
+	srwi	$acc11,$acc07,7
+	andc	$acc12,$acc00,$mask80	# r2&0x7f7f7f7f
+	andc	$acc13,$acc01,$mask80
+	andc	$acc14,$acc02,$mask80
+	andc	$acc15,$acc03,$mask80
+	sub	$acc04,$acc04,$acc08	# r1-(r1>>7)
+	sub	$acc05,$acc05,$acc09
+	sub	$acc06,$acc06,$acc10
+	sub	$acc07,$acc07,$acc11
+	add	$acc12,$acc12,$acc12	# (r2&0x7f7f7f7f)<<1
+	add	$acc13,$acc13,$acc13
+	add	$acc14,$acc14,$acc14
+	add	$acc15,$acc15,$acc15
+	and	$acc04,$acc04,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
+	and	$acc05,$acc05,$mask1b
+	and	$acc06,$acc06,$mask1b
+	and	$acc07,$acc07,$mask1b
+	xor	$acc04,$acc04,$acc12	# r4
+	xor	$acc05,$acc05,$acc13
+	xor	$acc06,$acc06,$acc14
+	xor	$acc07,$acc07,$acc15
+
+	and	$acc08,$acc04,$mask80	# r1=r4&0x80808080
+	and	$acc09,$acc05,$mask80
+	and	$acc10,$acc06,$mask80
+	and	$acc11,$acc07,$mask80
+	srwi	$acc12,$acc08,7		# r1>>7
+	srwi	$acc13,$acc09,7
+	srwi	$acc14,$acc10,7
+	srwi	$acc15,$acc11,7
+	sub	$acc08,$acc08,$acc12	# r1-(r1>>7)
+	sub	$acc09,$acc09,$acc13
+	sub	$acc10,$acc10,$acc14
+	sub	$acc11,$acc11,$acc15
+	andc	$acc12,$acc04,$mask80	# r4&0x7f7f7f7f
+	andc	$acc13,$acc05,$mask80
+	andc	$acc14,$acc06,$mask80
+	andc	$acc15,$acc07,$mask80
+	add	$acc12,$acc12,$acc12	# (r4&0x7f7f7f7f)<<1
+	add	$acc13,$acc13,$acc13
+	add	$acc14,$acc14,$acc14
+	add	$acc15,$acc15,$acc15
+	and	$acc08,$acc08,$mask1b	# (r1-(r1>>7))&0x1b1b1b1b
+	and	$acc09,$acc09,$mask1b
+	and	$acc10,$acc10,$mask1b
+	and	$acc11,$acc11,$mask1b
+	xor	$acc08,$acc08,$acc12	# r8
+	xor	$acc09,$acc09,$acc13
+	xor	$acc10,$acc10,$acc14
+	xor	$acc11,$acc11,$acc15
+
+	xor	$acc00,$acc00,$s0	# r2^r0
+	xor	$acc01,$acc01,$s1
+	xor	$acc02,$acc02,$s2
+	xor	$acc03,$acc03,$s3
+	xor	$acc04,$acc04,$s0	# r4^r0
+	xor	$acc05,$acc05,$s1
+	xor	$acc06,$acc06,$s2
+	xor	$acc07,$acc07,$s3
+___
+$code.=<<___;
+	rotrwi	$s0,$s0,8		# = ROTATE(r0,8)
+	rotrwi	$s1,$s1,8
+	rotrwi	$s2,$s2,8
+	rotrwi	$s3,$s3,8
+	xor	$s0,$s0,$acc00		# ^= r2^r0
+	xor	$s1,$s1,$acc01
+	xor	$s2,$s2,$acc02
+	xor	$s3,$s3,$acc03
+	xor	$acc00,$acc00,$acc08
+	xor	$acc01,$acc01,$acc09
+	xor	$acc02,$acc02,$acc10
+	xor	$acc03,$acc03,$acc11
+	xor	$s0,$s0,$acc04		# ^= r4^r0
+	xor	$s1,$s1,$acc05
+	xor	$s2,$s2,$acc06
+	xor	$s3,$s3,$acc07
+	rotrwi	$acc00,$acc00,24
+	rotrwi	$acc01,$acc01,24
+	rotrwi	$acc02,$acc02,24
+	rotrwi	$acc03,$acc03,24
+	xor	$acc04,$acc04,$acc08
+	xor	$acc05,$acc05,$acc09
+	xor	$acc06,$acc06,$acc10
+	xor	$acc07,$acc07,$acc11
+	xor	$s0,$s0,$acc08		# ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
+	xor	$s1,$s1,$acc09
+	xor	$s2,$s2,$acc10
+	xor	$s3,$s3,$acc11
+	rotrwi	$acc04,$acc04,16
+	rotrwi	$acc05,$acc05,16
+	rotrwi	$acc06,$acc06,16
+	rotrwi	$acc07,$acc07,16
+	xor	$s0,$s0,$acc00		# ^= ROTATE(r8^r2^r0,24)
+	xor	$s1,$s1,$acc01
+	xor	$s2,$s2,$acc02
+	xor	$s3,$s3,$acc03
+	rotrwi	$acc08,$acc08,8
+	rotrwi	$acc09,$acc09,8
+	rotrwi	$acc10,$acc10,8
+	rotrwi	$acc11,$acc11,8
+	xor	$s0,$s0,$acc04		# ^= ROTATE(r8^r4^r0,16)
+	xor	$s1,$s1,$acc05
+	xor	$s2,$s2,$acc06
+	xor	$s3,$s3,$acc07
+	xor	$s0,$s0,$acc08		# ^= ROTATE(r8,8)
+	xor	$s1,$s1,$acc09
+	xor	$s2,$s2,$acc10
+	xor	$s3,$s3,$acc11
+
+	b	Ldec_compact_loop
+.align	4
+Ldec_compact_done:
+	xor	$s0,$s0,$t0
+	xor	$s1,$s1,$t1
+	xor	$s2,$s2,$t2
+	xor	$s3,$s3,$t3
+	blr
+.long	0
+.asciz	"AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
+.align	7
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl
new file mode 100644
index 0000000000..4b27afd92f
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl
@@ -0,0 +1,1333 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for s390x.
+
+# April 2007.
+#
+# Software performance improvement over gcc-generated code is ~70% and
+# in absolute terms is ~73 cycles per byte processed with a 128-bit
+# key. You're likely to exclaim "why so slow?" Keep in mind that
+# z-CPUs execute *strictly* in order, and an issued instruction [in
+# this case a critical load from memory] has to complete before the
+# execution flow proceeds. S-boxes are compressed to 2KB[+256B].
+#
+# As for hardware acceleration support: it's basically a "teaser," as
+# it can and should be improved in several ways. Most notably, CBC
+# support is not utilized, nor are multiple blocks ever processed.
+# Also, the software key schedule could be postponed until hardware
+# support detection... Performance improvement over assembler is
+# reportedly ~2.5x, but can reach >8x [naturally on larger chunks] if
+# proper support is implemented.
+
+# May 2007.
+#
+# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
+# for 128-bit keys if hardware support is detected.
+
+# January 2009.
+#
+# Add support for hardware AES192/256 and reschedule instructions to
+# minimize/avoid Address Generation Interlock hazard and to favour the
+# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
+# almost 50% on z9. The gain is smaller on z10 because, being dual-
+# issue, z10 makes it impossible to eliminate the interlock condition:
+# the critical path is not long enough. Yet it spends ~24 cycles per
+# byte processed with a 128-bit key.
+#
+# Unlike the previous version, hardware support detection takes place
+# only at the moment of key schedule setup, which is denoted in
+# key->rounds. This is done because deferred key setup can't be made
+# MT-safe, at least not for key lengths longer than 128 bits.
+#
+# Add AES_cbc_encrypt, which gives an incredible performance
+# improvement: it was measured to be ~6.6x. That is less than the
+# previously mentioned 8x because the software implementation was
+# optimized in the meantime.
+
+$softonly=0;	# allow hardware support
+
+$t0="%r0";	$mask="%r0";
+$t1="%r1";
+$t2="%r2";	$inp="%r2";
+$t3="%r3";	$out="%r3";	$bits="%r3";
+$key="%r4";
+$i1="%r5";
+$i2="%r6";
+$i3="%r7";
+$s0="%r8";
+$s1="%r9";
+$s2="%r10";
+$s3="%r11";
+$tbl="%r12";
+$rounds="%r13";
+$ra="%r14";
+$sp="%r15";
+
+sub _data_word()
+{ my $i;
+    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
+}
+
+$code=<<___;
+.text
+
+.type	AES_Te,\@object
+.align	256
+AES_Te:
+___
+&_data_word(
+	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
+	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
+	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
+	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
+	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
+	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
+	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
+	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
+	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
+	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
+	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
+	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
+	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
+	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
+	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
+	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
+	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
+	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
+	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
+	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
+	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
+	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
+	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
+	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
+	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
+	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
+	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
+	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
+	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
+	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
+	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
+	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
+	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
+	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
+	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
+	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
+	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
+	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
+	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
+	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
+	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
+	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
+	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
+	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
+	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
+	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
+	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
+	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
+	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
+	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
+	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
+	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
+	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
+	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
+	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
+	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
+	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
+	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
+	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
+	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
+	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
+	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
+	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
+	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
+$code.=<<___;
+# Te4[256]
+.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+# rcon[]
+.long	0x01000000, 0x02000000, 0x04000000, 0x08000000
+.long	0x10000000, 0x20000000, 0x40000000, 0x80000000
+.long	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
+.align	256
+.size	AES_Te,.-AES_Te
+
+# void AES_encrypt(const unsigned char *inp, unsigned char *out,
+# 		 const AES_KEY *key) {
+.globl	AES_encrypt
+.type	AES_encrypt,\@function
+AES_encrypt:
+___
+$code.=<<___ if (!$softonly);
+	l	%r0,240($key)
+	lhi	%r1,16
+	clr	%r0,%r1
+	jl	.Lesoft
+
+	la	%r1,0($key)
+	#la	%r2,0($inp)
+	la	%r4,0($out)
+	lghi	%r3,16		# single block length
+	.long	0xb92e0042	# km %r4,%r2
+	brc	1,.-4		# can this happen?
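+	# (condition-code mask 1 selects cc3, "partial completion":
+	#  loop back to the km instruction until it finishes)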
+	br	%r14
+.align	64
+.Lesoft:
+___
+$code.=<<___;
+	stmg	%r3,$ra,24($sp)
+
+	llgf	$s0,0($inp)
+	llgf	$s1,4($inp)
+	llgf	$s2,8($inp)
+	llgf	$s3,12($inp)
+
+	larl	$tbl,AES_Te
+	bras	$ra,_s390x_AES_encrypt
+
+	lg	$out,24($sp)
+	st	$s0,0($out)
+	st	$s1,4($out)
+	st	$s2,8($out)
+	st	$s3,12($out)
+
+	lmg	%r6,$ra,48($sp)
+	br	$ra
+.size	AES_encrypt,.-AES_encrypt
+
+.type   _s390x_AES_encrypt,\@function
+.align	16
+_s390x_AES_encrypt:
+	stg	$ra,152($sp)
+	x	$s0,0($key)
+	x	$s1,4($key)
+	x	$s2,8($key)
+	x	$s3,12($key)
+	l	$rounds,240($key)
+	llill	$mask,`0xff<<3`
+	aghi	$rounds,-1
+	j	.Lenc_loop
+.align	16
+.Lenc_loop:
+	sllg	$t1,$s0,`0+3`
+	srlg	$t2,$s0,`8-3`
+	srlg	$t3,$s0,`16-3`
+	srl	$s0,`24-3`
+	nr	$s0,$mask
+	ngr	$t1,$mask
+	nr	$t2,$mask
+	nr	$t3,$mask
+
+	srlg	$i1,$s1,`16-3`	# i0
+	sllg	$i2,$s1,`0+3`
+	srlg	$i3,$s1,`8-3`
+	srl	$s1,`24-3`
+	nr	$i1,$mask
+	nr	$s1,$mask
+	ngr	$i2,$mask
+	nr	$i3,$mask
+
+	l	$s0,0($s0,$tbl)	# Te0[s0>>24]
+	l	$t1,1($t1,$tbl)	# Te3[s0>>0]
+	l	$t2,2($t2,$tbl) # Te2[s0>>8]
+	l	$t3,3($t3,$tbl)	# Te1[s0>>16]
+
+	x	$s0,3($i1,$tbl)	# Te1[s1>>16]
+	l	$s1,0($s1,$tbl)	# Te0[s1>>24]
+	x	$t2,1($i2,$tbl)	# Te3[s1>>0]
+	x	$t3,2($i3,$tbl)	# Te2[s1>>8]
+
+	srlg	$i1,$s2,`8-3`	# i0
+	srlg	$i2,$s2,`16-3`	# i1
+	nr	$i1,$mask
+	nr	$i2,$mask
+	sllg	$i3,$s2,`0+3`
+	srl	$s2,`24-3`
+	nr	$s2,$mask
+	ngr	$i3,$mask
+
+	xr	$s1,$t1
+	srlg	$ra,$s3,`8-3`	# i1
+	sllg	$t1,$s3,`0+3`	# i0
+	nr	$ra,$mask
+	la	$key,16($key)
+	ngr	$t1,$mask
+
+	x	$s0,2($i1,$tbl)	# Te2[s2>>8]
+	x	$s1,3($i2,$tbl)	# Te1[s2>>16]
+	l	$s2,0($s2,$tbl)	# Te0[s2>>24]
+	x	$t3,1($i3,$tbl)	# Te3[s2>>0]
+
+	srlg	$i3,$s3,`16-3`	# i2
+	xr	$s2,$t2
+	srl	$s3,`24-3`
+	nr	$i3,$mask
+	nr	$s3,$mask
+
+	x	$s0,0($key)
+	x	$s1,4($key)
+	x	$s2,8($key)
+	x	$t3,12($key)
+
+	x	$s0,1($t1,$tbl)	# Te3[s3>>0]
+	x	$s1,2($ra,$tbl)	# Te2[s3>>8]
+	x	$s2,3($i3,$tbl)	# Te1[s3>>16]
+	l	$s3,0($s3,$tbl)	# Te0[s3>>24]
+	xr	$s3,$t3
+
+	brct	$rounds,.Lenc_loop
+	.align	16
+
+	sllg	$t1,$s0,`0+3`
+	srlg	$t2,$s0,`8-3`
+	ngr	$t1,$mask
+	srlg	$t3,$s0,`16-3`
+	srl	$s0,`24-3`
+	nr	$s0,$mask
+	nr	$t2,$mask
+	nr	$t3,$mask
+
+	srlg	$i1,$s1,`16-3`	# i0
+	sllg	$i2,$s1,`0+3`
+	ngr	$i2,$mask
+	srlg	$i3,$s1,`8-3`
+	srl	$s1,`24-3`
+	nr	$i1,$mask
+	nr	$s1,$mask
+	nr	$i3,$mask
+
+	llgc	$s0,2($s0,$tbl)	# Te4[s0>>24]
+	llgc	$t1,2($t1,$tbl)	# Te4[s0>>0]
+	sll	$s0,24
+	llgc	$t2,2($t2,$tbl)	# Te4[s0>>8]
+	llgc	$t3,2($t3,$tbl)	# Te4[s0>>16]
+	sll	$t2,8
+	sll	$t3,16
+
+	llgc	$i1,2($i1,$tbl)	# Te4[s1>>16]
+	llgc	$s1,2($s1,$tbl)	# Te4[s1>>24]
+	llgc	$i2,2($i2,$tbl)	# Te4[s1>>0]
+	llgc	$i3,2($i3,$tbl)	# Te4[s1>>8]
+	sll	$i1,16
+	sll	$s1,24
+	sll	$i3,8
+	or	$s0,$i1
+	or	$s1,$t1
+	or	$t2,$i2
+	or	$t3,$i3
+	
+	srlg	$i1,$s2,`8-3`	# i0
+	srlg	$i2,$s2,`16-3`	# i1
+	nr	$i1,$mask
+	nr	$i2,$mask
+	sllg	$i3,$s2,`0+3`
+	srl	$s2,`24-3`
+	ngr	$i3,$mask
+	nr	$s2,$mask
+
+	sllg	$t1,$s3,`0+3`	# i0
+	srlg	$ra,$s3,`8-3`	# i1
+	ngr	$t1,$mask
+
+	llgc	$i1,2($i1,$tbl)	# Te4[s2>>8]
+	llgc	$i2,2($i2,$tbl)	# Te4[s2>>16]
+	sll	$i1,8
+	llgc	$s2,2($s2,$tbl)	# Te4[s2>>24]
+	llgc	$i3,2($i3,$tbl)	# Te4[s2>>0]
+	sll	$i2,16
+	nr	$ra,$mask
+	sll	$s2,24
+	or	$s0,$i1
+	or	$s1,$i2
+	or	$s2,$t2
+	or	$t3,$i3
+
+	srlg	$i3,$s3,`16-3`	# i2
+	srl	$s3,`24-3`
+	nr	$i3,$mask
+	nr	$s3,$mask
+
+	l	$t0,16($key)
+	l	$t2,20($key)
+
+	llgc	$i1,2($t1,$tbl)	# Te4[s3>>0]
+	llgc	$i2,2($ra,$tbl)	# Te4[s3>>8]
+	llgc	$i3,2($i3,$tbl)	# Te4[s3>>16]
+	llgc	$s3,2($s3,$tbl)	# Te4[s3>>24]
+	sll	$i2,8
+	sll	$i3,16
+	sll	$s3,24
+	or	$s0,$i1
+	or	$s1,$i2
+	or	$s2,$i3
+	or	$s3,$t3
+
+	lg	$ra,152($sp)
+	xr	$s0,$t0
+	xr	$s1,$t2
+	x	$s2,24($key)
+	x	$s3,28($key)
+
+	br	$ra	
+.size	_s390x_AES_encrypt,.-_s390x_AES_encrypt
+___
+
+$code.=<<___;
+.type	AES_Td,\@object
+.align	256
+AES_Td:
+___
+&_data_word(
+	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
+	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
+	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
+	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
+	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
+	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
+	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
+	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
+	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
+	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
+	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
+	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
+	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
+	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
+	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
+	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
+	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
+	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
+	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
+	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
+	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
+	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
+	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
+	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
+	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
+	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
+	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
+	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
+	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
+	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
+	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
+	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
+	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
+	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
+	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
+	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
+	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
+	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
+	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
+	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
+	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
+	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
+	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
+	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
+	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
+	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
+	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
+	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
+	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
+	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
+	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
+	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
+	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
+	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
+	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
+	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
+	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
+	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
+	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
+	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
+	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
+	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
+	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
+	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
+$code.=<<___;
+# Td4[256]
+.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+.size	AES_Td,.-AES_Td
+
+# void AES_decrypt(const unsigned char *inp, unsigned char *out,
+# 		 const AES_KEY *key) {
+.globl	AES_decrypt
+.type	AES_decrypt,\@function
+AES_decrypt:
+___
+$code.=<<___ if (!$softonly);
+	l	%r0,240($key)
+	lhi	%r1,16
+	clr	%r0,%r1
+	jl	.Ldsoft
+
+	la	%r1,0($key)
+	#la	%r2,0($inp)
+	la	%r4,0($out)
+	lghi	%r3,16		# single block length
+	.long	0xb92e0042	# km %r4,%r2
+	brc	1,.-4		# can this happen?
+	br	%r14
+.align	64
+.Ldsoft:
+___
+$code.=<<___;
+	stmg	%r3,$ra,24($sp)
+
+	llgf	$s0,0($inp)
+	llgf	$s1,4($inp)
+	llgf	$s2,8($inp)
+	llgf	$s3,12($inp)
+
+	larl	$tbl,AES_Td
+	bras	$ra,_s390x_AES_decrypt
+
+	lg	$out,24($sp)
+	st	$s0,0($out)
+	st	$s1,4($out)
+	st	$s2,8($out)
+	st	$s3,12($out)
+
+	lmg	%r6,$ra,48($sp)
+	br	$ra
+.size	AES_decrypt,.-AES_decrypt
+
+.type   _s390x_AES_decrypt,\@function
+.align	16
+_s390x_AES_decrypt:
+	stg	$ra,152($sp)
+	x	$s0,0($key)
+	x	$s1,4($key)
+	x	$s2,8($key)
+	x	$s3,12($key)
+	l	$rounds,240($key)
+	llill	$mask,`0xff<<3`
+	aghi	$rounds,-1
+	j	.Ldec_loop
+.align	16
+.Ldec_loop:
+	srlg	$t1,$s0,`16-3`
+	srlg	$t2,$s0,`8-3`
+	sllg	$t3,$s0,`0+3`
+	srl	$s0,`24-3`
+	nr	$s0,$mask
+	nr	$t1,$mask
+	nr	$t2,$mask
+	ngr	$t3,$mask
+
+	sllg	$i1,$s1,`0+3`	# i0
+	srlg	$i2,$s1,`16-3`
+	srlg	$i3,$s1,`8-3`
+	srl	$s1,`24-3`
+	ngr	$i1,$mask
+	nr	$s1,$mask
+	nr	$i2,$mask
+	nr	$i3,$mask
+
+	l	$s0,0($s0,$tbl)	# Td0[s0>>24]
+	l	$t1,3($t1,$tbl)	# Td1[s0>>16]
+	l	$t2,2($t2,$tbl)	# Td2[s0>>8]
+	l	$t3,1($t3,$tbl)	# Td3[s0>>0]
+
+	x	$s0,1($i1,$tbl)	# Td3[s1>>0]
+	l	$s1,0($s1,$tbl)	# Td0[s1>>24]
+	x	$t2,3($i2,$tbl)	# Td1[s1>>16]
+	x	$t3,2($i3,$tbl)	# Td2[s1>>8]
+
+	srlg	$i1,$s2,`8-3`	# i0
+	sllg	$i2,$s2,`0+3`	# i1
+	srlg	$i3,$s2,`16-3`
+	srl	$s2,`24-3`
+	nr	$i1,$mask
+	ngr	$i2,$mask
+	nr	$s2,$mask
+	nr	$i3,$mask
+
+	xr	$s1,$t1
+	srlg	$ra,$s3,`8-3`	# i1
+	srlg	$t1,$s3,`16-3`	# i0
+	nr	$ra,$mask
+	la	$key,16($key)
+	nr	$t1,$mask
+
+	x	$s0,2($i1,$tbl)	# Td2[s2>>8]
+	x	$s1,1($i2,$tbl)	# Td3[s2>>0]
+	l	$s2,0($s2,$tbl)	# Td0[s2>>24]
+	x	$t3,3($i3,$tbl)	# Td1[s2>>16]
+
+	sllg	$i3,$s3,`0+3`	# i2
+	srl	$s3,`24-3`
+	ngr	$i3,$mask
+	nr	$s3,$mask
+
+	xr	$s2,$t2
+	x	$s0,0($key)
+	x	$s1,4($key)
+	x	$s2,8($key)
+	x	$t3,12($key)
+
+	x	$s0,3($t1,$tbl)	# Td1[s3>>16]
+	x	$s1,2($ra,$tbl)	# Td2[s3>>8]
+	x	$s2,1($i3,$tbl)	# Td3[s3>>0]
+	l	$s3,0($s3,$tbl)	# Td0[s3>>24]
+	xr	$s3,$t3
+
+	brct	$rounds,.Ldec_loop
+	.align	16
+
+	l	$t1,`2048+0`($tbl)	# prefetch Td4
+	l	$t2,`2048+64`($tbl)
+	l	$t3,`2048+128`($tbl)
+	l	$i1,`2048+192`($tbl)
+	llill	$mask,0xff
+
+	srlg	$i3,$s0,24	# i0
+	srlg	$t1,$s0,16
+	srlg	$t2,$s0,8
+	nr	$s0,$mask	# i3
+	nr	$t1,$mask
+
+	srlg	$i1,$s1,24
+	nr	$t2,$mask
+	srlg	$i2,$s1,16
+	srlg	$ra,$s1,8
+	nr	$s1,$mask	# i0
+	nr	$i2,$mask
+	nr	$ra,$mask
+
+	llgc	$i3,2048($i3,$tbl)	# Td4[s0>>24]
+	llgc	$t1,2048($t1,$tbl)	# Td4[s0>>16]
+	llgc	$t2,2048($t2,$tbl)	# Td4[s0>>8]
+	sll	$t1,16
+	llgc	$t3,2048($s0,$tbl)	# Td4[s0>>0]
+	sllg	$s0,$i3,24
+	sll	$t2,8
+
+	llgc	$s1,2048($s1,$tbl)	# Td4[s1>>0]
+	llgc	$i1,2048($i1,$tbl)	# Td4[s1>>24]
+	llgc	$i2,2048($i2,$tbl)	# Td4[s1>>16]
+	sll	$i1,24
+	llgc	$i3,2048($ra,$tbl)	# Td4[s1>>8]
+	sll	$i2,16
+	sll	$i3,8
+	or	$s0,$s1
+	or	$t1,$i1
+	or	$t2,$i2
+	or	$t3,$i3
+
+	srlg	$i1,$s2,8	# i0
+	srlg	$i2,$s2,24
+	srlg	$i3,$s2,16
+	nr	$s2,$mask	# i1
+	nr	$i1,$mask
+	nr	$i3,$mask
+	llgc	$i1,2048($i1,$tbl)	# Td4[s2>>8]
+	llgc	$s1,2048($s2,$tbl)	# Td4[s2>>0]
+	llgc	$i2,2048($i2,$tbl)	# Td4[s2>>24]
+	llgc	$i3,2048($i3,$tbl)	# Td4[s2>>16]
+	sll	$i1,8
+	sll	$i2,24
+	or	$s0,$i1
+	sll	$i3,16
+	or	$t2,$i2
+	or	$t3,$i3
+
+	srlg	$i1,$s3,16	# i0
+	srlg	$i2,$s3,8	# i1
+	srlg	$i3,$s3,24
+	nr	$s3,$mask	# i2
+	nr	$i1,$mask
+	nr	$i2,$mask
+
+	lg	$ra,152($sp)
+	or	$s1,$t1
+	l	$t0,16($key)
+	l	$t1,20($key)
+
+	llgc	$i1,2048($i1,$tbl)	# Td4[s3>>16]
+	llgc	$i2,2048($i2,$tbl)	# Td4[s3>>8]
+	sll	$i1,16
+	llgc	$s2,2048($s3,$tbl)	# Td4[s3>>0]
+	llgc	$s3,2048($i3,$tbl)	# Td4[s3>>24]
+	sll	$i2,8
+	sll	$s3,24
+	or	$s0,$i1
+	or	$s1,$i2
+	or	$s2,$t2
+	or	$s3,$t3
+
+	xr	$s0,$t0
+	xr	$s1,$t1
+	x	$s2,24($key)
+	x	$s3,28($key)
+
+	br	$ra	
+.size	_s390x_AES_decrypt,.-_s390x_AES_decrypt
+___
+
+$code.=<<___;
+# void AES_set_encrypt_key(const unsigned char *in, int bits,
+# 		 AES_KEY *key) {
+.globl	AES_set_encrypt_key
+.type	AES_set_encrypt_key,\@function
+.align	16
+AES_set_encrypt_key:
+	lghi	$t0,0
+	clgr	$inp,$t0
+	je	.Lminus1
+	clgr	$key,$t0
+	je	.Lminus1
+
+	lghi	$t0,128
+	clr	$bits,$t0
+	je	.Lproceed
+	lghi	$t0,192
+	clr	$bits,$t0
+	je	.Lproceed
+	lghi	$t0,256
+	clr	$bits,$t0
+	je	.Lproceed
+	lghi	%r2,-2
+	br	%r14
+
+.align	16
+.Lproceed:
+___
+$code.=<<___ if (!$softonly);
+	# convert bits to km code, [128,192,256]->[18,19,20]
+	lhi	%r5,-128
+	lhi	%r0,18
+	ar	%r5,$bits
+	srl	%r5,6
+	ar	%r5,%r0
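+	# (e.g. bits=192: (192-128)>>6 = 1 and 18+1 = 19, the KM
+	#  function code for AES-192; 128->18 and 256->20 likewise)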
+
+	lghi	%r0,0		# query capability vector
+	la	%r1,16($sp)
+	.long	0xb92f0042	# kmc %r4,%r2
+
+	llihh	%r1,0x8000
+	srlg	%r1,%r1,0(%r5)
+	ng	%r1,16($sp)
+	jz	.Lekey_internal
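+	# (the query stores a 128-bit status word; the bit numbered by
+	#  the km code indicates hardware support for this key length,
+	#  so a zero result means fall back to the software schedule)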
+
+	lmg	%r0,%r1,0($inp)	# just copy 128 bits...
+	stmg	%r0,%r1,0($key)
+	lhi	%r0,192
+	cr	$bits,%r0
+	jl	1f
+	lg	%r1,16($inp)
+	stg	%r1,16($key)
+	je	1f
+	lg	%r1,24($inp)
+	stg	%r1,24($key)
+1:	st	$bits,236($key)	# save bits
+	st	%r5,240($key)	# save km code
+	lghi	%r2,0
+	br	%r14
+___
+$code.=<<___;
+.align	16
+.Lekey_internal:
+	stmg	%r6,%r13,48($sp)	# all non-volatile regs
+
+	larl	$tbl,AES_Te+2048
+
+	llgf	$s0,0($inp)
+	llgf	$s1,4($inp)
+	llgf	$s2,8($inp)
+	llgf	$s3,12($inp)
+	st	$s0,0($key)
+	st	$s1,4($key)
+	st	$s2,8($key)
+	st	$s3,12($key)
+	lghi	$t0,128
+	cr	$bits,$t0
+	jne	.Lnot128
+
+	llill	$mask,0xff
+	lghi	$t3,0			# i=0
+	lghi	$rounds,10
+	st	$rounds,240($key)
+
+	llgfr	$t2,$s3			# temp=rk[3]
+	srlg	$i1,$s3,8
+	srlg	$i2,$s3,16
+	srlg	$i3,$s3,24
+	nr	$t2,$mask
+	nr	$i1,$mask
+	nr	$i2,$mask
+
+.align	16
+.L128_loop:
+	la	$t2,0($t2,$tbl)
+	la	$i1,0($i1,$tbl)
+	la	$i2,0($i2,$tbl)
+	la	$i3,0($i3,$tbl)
+	icm	$t2,2,0($t2)		# Te4[rk[3]>>0]<<8
+	icm	$t2,4,0($i1)		# Te4[rk[3]>>8]<<16
+	icm	$t2,8,0($i2)		# Te4[rk[3]>>16]<<24
+	icm	$t2,1,0($i3)		# Te4[rk[3]>>24]
+	x	$t2,256($t3,$tbl)	# rcon[i]
+	xr	$s0,$t2			# rk[4]=rk[0]^...
+	xr	$s1,$s0			# rk[5]=rk[1]^rk[4]
+	xr	$s2,$s1			# rk[6]=rk[2]^rk[5]
+	xr	$s3,$s2			# rk[7]=rk[3]^rk[6]
+
+	llgfr	$t2,$s3			# temp=rk[3]
+	srlg	$i1,$s3,8
+	srlg	$i2,$s3,16
+	nr	$t2,$mask
+	nr	$i1,$mask
+	srlg	$i3,$s3,24
+	nr	$i2,$mask
+
+	st	$s0,16($key)
+	st	$s1,20($key)
+	st	$s2,24($key)
+	st	$s3,28($key)
+	la	$key,16($key)		# key+=4
+	la	$t3,4($t3)		# i++
+	brct	$rounds,.L128_loop
+	lghi	%r2,0
+	lmg	%r6,%r13,48($sp)
+	br	$ra
+
+.align	16
+.Lnot128:
+	llgf	$t0,16($inp)
+	llgf	$t1,20($inp)
+	st	$t0,16($key)
+	st	$t1,20($key)
+	lghi	$t0,192
+	cr	$bits,$t0
+	jne	.Lnot192
+
+	llill	$mask,0xff
+	lghi	$t3,0			# i=0
+	lghi	$rounds,12
+	st	$rounds,240($key)
+	lghi	$rounds,8
+
+	srlg	$i1,$t1,8
+	srlg	$i2,$t1,16
+	srlg	$i3,$t1,24
+	nr	$t1,$mask
+	nr	$i1,$mask
+	nr	$i2,$mask
+
+.align	16
+.L192_loop:
+	la	$t1,0($t1,$tbl)
+	la	$i1,0($i1,$tbl)
+	la	$i2,0($i2,$tbl)
+	la	$i3,0($i3,$tbl)
+	icm	$t1,2,0($t1)		# Te4[rk[5]>>0]<<8
+	icm	$t1,4,0($i1)		# Te4[rk[5]>>8]<<16
+	icm	$t1,8,0($i2)		# Te4[rk[5]>>16]<<24
+	icm	$t1,1,0($i3)		# Te4[rk[5]>>24]
+	x	$t1,256($t3,$tbl)	# rcon[i]
+	xr	$s0,$t1			# rk[6]=rk[0]^...
+	xr	$s1,$s0			# rk[7]=rk[1]^rk[6]
+	xr	$s2,$s1			# rk[8]=rk[2]^rk[7]
+	xr	$s3,$s2			# rk[9]=rk[3]^rk[8]
+
+	st	$s0,24($key)
+	st	$s1,28($key)
+	st	$s2,32($key)
+	st	$s3,36($key)
+	brct	$rounds,.L192_continue
+	lghi	%r2,0
+	lmg	%r6,%r13,48($sp)
+	br	$ra
+
+.align	16
+.L192_continue:
+	lgr	$t1,$s3
+	x	$t1,16($key)		# rk[10]=rk[4]^rk[9]
+	st	$t1,40($key)
+	x	$t1,20($key)		# rk[11]=rk[5]^rk[10]
+	st	$t1,44($key)
+
+	srlg	$i1,$t1,8
+	srlg	$i2,$t1,16
+	srlg	$i3,$t1,24
+	nr	$t1,$mask
+	nr	$i1,$mask
+	nr	$i2,$mask
+
+	la	$key,24($key)		# key+=6
+	la	$t3,4($t3)		# i++
+	j	.L192_loop
+
+.align	16
+.Lnot192:
+	llgf	$t0,24($inp)
+	llgf	$t1,28($inp)
+	st	$t0,24($key)
+	st	$t1,28($key)
+	llill	$mask,0xff
+	lghi	$t3,0			# i=0
+	lghi	$rounds,14
+	st	$rounds,240($key)
+	lghi	$rounds,7
+
+	srlg	$i1,$t1,8
+	srlg	$i2,$t1,16
+	srlg	$i3,$t1,24
+	nr	$t1,$mask
+	nr	$i1,$mask
+	nr	$i2,$mask
+
+.align	16
+.L256_loop:
+	la	$t1,0($t1,$tbl)
+	la	$i1,0($i1,$tbl)
+	la	$i2,0($i2,$tbl)
+	la	$i3,0($i3,$tbl)
+	icm	$t1,2,0($t1)		# Te4[rk[7]>>0]<<8
+	icm	$t1,4,0($i1)		# Te4[rk[7]>>8]<<16
+	icm	$t1,8,0($i2)		# Te4[rk[7]>>16]<<24
+	icm	$t1,1,0($i3)		# Te4[rk[7]>>24]
+	x	$t1,256($t3,$tbl)	# rcon[i]
+	xr	$s0,$t1			# rk[8]=rk[0]^...
+	xr	$s1,$s0			# rk[9]=rk[1]^rk[8]
+	xr	$s2,$s1			# rk[10]=rk[2]^rk[9]
+	xr	$s3,$s2			# rk[11]=rk[3]^rk[10]
+	st	$s0,32($key)
+	st	$s1,36($key)
+	st	$s2,40($key)
+	st	$s3,44($key)
+	brct	$rounds,.L256_continue
+	lghi	%r2,0
+	lmg	%r6,%r13,48($sp)
+	br	$ra
+
+.align	16
+.L256_continue:
+	lgr	$t1,$s3			# temp=rk[11]
+	srlg	$i1,$s3,8
+	srlg	$i2,$s3,16
+	srlg	$i3,$s3,24
+	nr	$t1,$mask
+	nr	$i1,$mask
+	nr	$i2,$mask
+	la	$t1,0($t1,$tbl)
+	la	$i1,0($i1,$tbl)
+	la	$i2,0($i2,$tbl)
+	la	$i3,0($i3,$tbl)
+	llgc	$t1,0($t1)		# Te4[rk[11]>>0]
+	icm	$t1,2,0($i1)		# Te4[rk[11]>>8]<<8
+	icm	$t1,4,0($i2)		# Te4[rk[11]>>16]<<16
+	icm	$t1,8,0($i3)		# Te4[rk[11]>>24]<<24
+	x	$t1,16($key)		# rk[12]=rk[4]^...
+	st	$t1,48($key)
+	x	$t1,20($key)		# rk[13]=rk[5]^rk[12]
+	st	$t1,52($key)
+	x	$t1,24($key)		# rk[14]=rk[6]^rk[13]
+	st	$t1,56($key)
+	x	$t1,28($key)		# rk[15]=rk[7]^rk[14]
+	st	$t1,60($key)
+
+	srlg	$i1,$t1,8
+	srlg	$i2,$t1,16
+	srlg	$i3,$t1,24
+	nr	$t1,$mask
+	nr	$i1,$mask
+	nr	$i2,$mask
+
+	la	$key,32($key)		# key+=8
+	la	$t3,4($t3)		# i++
+	j	.L256_loop
+
+.Lminus1:
+	lghi	%r2,-1
+	br	$ra
+.size	AES_set_encrypt_key,.-AES_set_encrypt_key
+
+# void AES_set_decrypt_key(const unsigned char *in, int bits,
+# 		 AES_KEY *key) {
+.globl	AES_set_decrypt_key
+.type	AES_set_decrypt_key,\@function
+.align	16
+AES_set_decrypt_key:
+	stg	$key,32($sp)		# I rely on AES_set_encrypt_key to
+	stg	$ra,112($sp)		# save non-volatile registers!
+	bras	$ra,AES_set_encrypt_key
+	lg	$key,32($sp)
+	lg	$ra,112($sp)
+	ltgr	%r2,%r2
+	bnzr	$ra
+___
+$code.=<<___ if (!$softonly);
+	l	$t0,240($key)
+	lhi	$t1,16
+	cr	$t0,$t1
+	jl	.Lgo
+	oill	$t0,0x80	# set "decrypt" bit
+	st	$t0,240($key)
+	br	$ra
+
+.align	16
+.Ldkey_internal:
+	stg	$key,32($sp)
+	stg	$ra,40($sp)
+	bras	$ra,.Lekey_internal
+	lg	$key,32($sp)
+	lg	$ra,40($sp)
+___
+$code.=<<___;
+
+.Lgo:	llgf	$rounds,240($key)
+	la	$i1,0($key)
+	sllg	$i2,$rounds,4
+	la	$i2,0($i2,$key)
+	srl	$rounds,1
+	lghi	$t1,-16
+
+.align	16
+.Linv:	lmg	$s0,$s1,0($i1)
+	lmg	$s2,$s3,0($i2)
+	stmg	$s0,$s1,0($i2)
+	stmg	$s2,$s3,0($i1)
+	la	$i1,16($i1)
+	la	$i2,0($t1,$i2)
+	brct	$rounds,.Linv
+___
+$mask80=$i1;
+$mask1b=$i2;
+$maskfe=$i3;
+$code.=<<___;
+	llgf	$rounds,240($key)
+	aghi	$rounds,-1
+	sll	$rounds,2	# (rounds-1)*4
+	llilh	$mask80,0x8080
+	llilh	$mask1b,0x1b1b
+	llilh	$maskfe,0xfefe
+	oill	$mask80,0x8080
+	oill	$mask1b,0x1b1b
+	oill	$maskfe,0xfefe
+
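+	# Each of the three doubling steps below computes xtime() on
+	# four packed bytes: a byte with the top bit set contributes
+	# 0x1b (0x80-0x01=0x7f, then masked to 0x1b), xored into
+	# (tp<<1)&0xfefefefe.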
+.align	16
+.Lmix:	l	$s0,16($key)	# tp1
+	lr	$s1,$s0
+	ngr	$s1,$mask80
+	srlg	$t1,$s1,7
+	slr	$s1,$t1
+	nr	$s1,$mask1b
+	sllg	$t1,$s0,1
+	nr	$t1,$maskfe
+	xr	$s1,$t1		# tp2
+
+	lr	$s2,$s1
+	ngr	$s2,$mask80
+	srlg	$t1,$s2,7
+	slr	$s2,$t1
+	nr	$s2,$mask1b
+	sllg	$t1,$s1,1
+	nr	$t1,$maskfe
+	xr	$s2,$t1		# tp4
+
+	lr	$s3,$s2
+	ngr	$s3,$mask80
+	srlg	$t1,$s3,7
+	slr	$s3,$t1
+	nr	$s3,$mask1b
+	sllg	$t1,$s2,1
+	nr	$t1,$maskfe
+	xr	$s3,$t1		# tp8
+
+	xr	$s1,$s0		# tp2^tp1
+	xr	$s2,$s0		# tp4^tp1
+	rll	$s0,$s0,24	# = ROTATE(tp1,8)
+	xr	$s2,$s3		# ^=tp8
+	xr	$s0,$s1		# ^=tp2^tp1
+	xr	$s1,$s3		# tp2^tp1^tp8
+	xr	$s0,$s2		# ^=tp4^tp1^tp8
+	rll	$s1,$s1,8
+	rll	$s2,$s2,16
+	xr	$s0,$s1		# ^= ROTATE(tp8^tp2^tp1,24)
+	rll	$s3,$s3,24
+	xr	$s0,$s2    	# ^= ROTATE(tp8^tp4^tp1,16)
+	xr	$s0,$s3		# ^= ROTATE(tp8,8)
+
+	st	$s0,16($key)
+	la	$key,4($key)
+	brct	$rounds,.Lmix
+
+	lmg	%r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
+	lghi	%r2,0
+	br	$ra
+.size	AES_set_decrypt_key,.-AES_set_decrypt_key
+___
+
+# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+#                     size_t length, const AES_KEY *key,
+#                     unsigned char *ivec, const int enc)
+{
+my $inp="%r2";
+my $out="%r4";	# length and out are swapped
+my $len="%r3";
+my $key="%r5";
+my $ivp="%r6";
+
+$code.=<<___;
+.globl	AES_cbc_encrypt
+.type	AES_cbc_encrypt,\@function
+.align	16
+AES_cbc_encrypt:
+	xgr	%r3,%r4		# flip %r3 and %r4, out and len
+	xgr	%r4,%r3
+	xgr	%r3,%r4
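+	# (three exclusive-ors swap two registers without a scratch one)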
+___
+$code.=<<___ if (!$softonly);
+	lhi	%r0,16
+	cl	%r0,240($key)
+	jh	.Lcbc_software
+
+	lg	%r0,0($ivp)	# copy ivec
+	lg	%r1,8($ivp)
+	stmg	%r0,%r1,16($sp)
+	lmg	%r0,%r1,0($key)	# copy key, cover 256 bit
+	stmg	%r0,%r1,32($sp)
+	lmg	%r0,%r1,16($key)
+	stmg	%r0,%r1,48($sp)
+	l	%r0,240($key)	# load kmc code
+	lghi	$key,15		# res=len%16, len-=res;
+	ngr	$key,$len
+	slgr	$len,$key
+	la	%r1,16($sp)	# parameter block - ivec || key
+	jz	.Lkmc_truncated
+	.long	0xb92f0042	# kmc %r4,%r2
+	brc	1,.-4		# pay attention to "partial completion"
+	ltr	$key,$key
+	jnz	.Lkmc_truncated
+.Lkmc_done:
+	lmg	%r0,%r1,16($sp)	# copy ivec to caller
+	stg	%r0,0($ivp)
+	stg	%r1,8($ivp)
+	br	$ra
+.align	16
+.Lkmc_truncated:
+	ahi	$key,-1		# it's the way it's encoded in mvc
+	tmll	%r0,0x80
+	jnz	.Lkmc_truncated_dec
+	lghi	%r1,0
+	stg	%r1,128($sp)
+	stg	%r1,136($sp)
+	bras	%r1,1f
+	mvc	128(1,$sp),0($inp)
+1:	ex	$key,0(%r1)
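+	# (bras left the address of the mvc template in %r1, and ex
+	#  executes it with its length field ored with res-1, copying
+	#  just the residual bytes into the zeroed stack buffer)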
+	la	%r1,16($sp)	# restore parameter block
+	la	$inp,128($sp)
+	lghi	$len,16
+	.long	0xb92f0042	# kmc %r4,%r2
+	j	.Lkmc_done
+.align	16
+.Lkmc_truncated_dec:
+	stg	$out,64($sp)
+	la	$out,128($sp)
+	lghi	$len,16
+	.long	0xb92f0042	# kmc %r4,%r2
+	lg	$out,64($sp)
+	bras	%r1,2f
+	mvc	0(1,$out),128($sp)
+2:	ex	$key,0(%r1)
+	j	.Lkmc_done
+.align	16
+.Lcbc_software:
+___
+$code.=<<___;
+	stmg	$key,$ra,40($sp)
+	lhi	%r0,0
+	cl	%r0,164($sp)
+	je	.Lcbc_decrypt
+
+	larl	$tbl,AES_Te
+
+	llgf	$s0,0($ivp)
+	llgf	$s1,4($ivp)
+	llgf	$s2,8($ivp)
+	llgf	$s3,12($ivp)
+
+	lghi	$t0,16
+	slgr	$len,$t0
+	brc	4,.Lcbc_enc_tail	# if borrow
+.Lcbc_enc_loop:
+	stmg	$inp,$out,16($sp)
+	x	$s0,0($inp)
+	x	$s1,4($inp)
+	x	$s2,8($inp)
+	x	$s3,12($inp)
+	lgr	%r4,$key
+
+	bras	$ra,_s390x_AES_encrypt
+
+	lmg	$inp,$key,16($sp)
+	st	$s0,0($out)
+	st	$s1,4($out)
+	st	$s2,8($out)
+	st	$s3,12($out)
+
+	la	$inp,16($inp)
+	la	$out,16($out)
+	lghi	$t0,16
+	ltgr	$len,$len
+	jz	.Lcbc_enc_done
+	slgr	$len,$t0
+	brc	4,.Lcbc_enc_tail	# if borrow
+	j	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_done:
+	lg	$ivp,48($sp)
+	st	$s0,0($ivp)
+	st	$s1,4($ivp)	
+	st	$s2,8($ivp)
+	st	$s3,12($ivp)
+
+	lmg	%r7,$ra,56($sp)
+	br	$ra
+
+.align	16
+.Lcbc_enc_tail:
+	aghi	$len,15
+	lghi	$t0,0
+	stg	$t0,128($sp)
+	stg	$t0,136($sp)
+	bras	$t1,3f
+	mvc	128(1,$sp),0($inp)
+3:	ex	$len,0($t1)
+	lghi	$len,0
+	la	$inp,128($sp)
+	j	.Lcbc_enc_loop
+
+.align	16
+.Lcbc_decrypt:
+	larl	$tbl,AES_Td
+
+	lg	$t0,0($ivp)
+	lg	$t1,8($ivp)
+	stmg	$t0,$t1,128($sp)
+
+.Lcbc_dec_loop:
+	stmg	$inp,$out,16($sp)
+	llgf	$s0,0($inp)
+	llgf	$s1,4($inp)
+	llgf	$s2,8($inp)
+	llgf	$s3,12($inp)
+	lgr	%r4,$key
+
+	bras	$ra,_s390x_AES_decrypt
+
+	lmg	$inp,$key,16($sp)
+	sllg	$s0,$s0,32
+	sllg	$s2,$s2,32
+	lr	$s0,$s1
+	lr	$s2,$s3
+
+	lg	$t0,0($inp)
+	lg	$t1,8($inp)
+	xg	$s0,128($sp)
+	xg	$s2,136($sp)
+	lghi	$s1,16
+	slgr	$len,$s1
+	brc	4,.Lcbc_dec_tail	# if borrow
+	brc	2,.Lcbc_dec_done	# if zero
+	stg	$s0,0($out)
+	stg	$s2,8($out)
+	stmg	$t0,$t1,128($sp)
+
+	la	$inp,16($inp)
+	la	$out,16($out)
+	j	.Lcbc_dec_loop
+
+.Lcbc_dec_done:
+	stg	$s0,0($out)
+	stg	$s2,8($out)
+.Lcbc_dec_exit:
+	lmg	$ivp,$ra,48($sp)
+	stmg	$t0,$t1,0($ivp)
+
+	br	$ra
+
+.align	16
+.Lcbc_dec_tail:
+	aghi	$len,15
+	stg	$s0,128($sp)
+	stg	$s2,136($sp)
+	bras	$s1,4f
+	mvc	0(1,$out),128($sp)
+4:	ex	$len,0($s1)
+	j	.Lcbc_dec_exit
+.size	AES_cbc_encrypt,.-AES_cbc_encrypt
+___
+}
+$code.=<<___;
+.string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
diff --git a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
new file mode 100755
index 0000000000..c57b3a2d6d
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
@@ -0,0 +1,1181 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+#
+# Version 1.1
+#
+# The major reason for the undertaken effort was to mitigate the
+# hazard of cache-timing attacks. This is [currently and initially!]
+# addressed in two ways. 1. S-boxes are compressed from 5KB to
+# 2KB+256B each. 2. References to them are scheduled for L2 cache
+# latency, meaning that the tables don't have to reside in L1 cache.
+# Once again, this is an initial draft and one should expect more
+# countermeasures to be implemented...
+#
+# Version 1.1 prefetches T[ed]4 in order to mitigate attack on last
+# round.
+#
+# Even though performance was not the primary goal [on the contrary,
+# extra shifts "induced" by the compressed S-box and the longer loop
+# epilogue "induced" by scheduling for L2 have a negative effect on
+# performance], the code turned out to run at ~23 cycles per processed
+# byte en-/decrypted with a 128-bit key. This is a pretty good result
+# for code with the mentioned qualities on an UltraSPARC core.
+# Compared to Sun C generated code my encrypt procedure runs just a
+# few percent faster, while the decrypt one is a whole 50% faster
+# [yes, Sun C failed to generate an optimal decrypt procedure].
+# Compared to GNU C generated code both procedures are more than 60%
+# faster:-)
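+#
+# The compression trick can be sketched in C [a hedged illustration of
+# the idea, not the generated code; Te stands for the 256-entry table
+# emitted below]:
+#
+#	uint64_t slot = Te[(s0 >> 24) & 0xff];	/* entry is w||w */
+#	uint32_t e0 = (uint32_t)slot;		/* Te0[x] */
+#	uint32_t e1 = (uint32_t)(slot >> 8);	/* Te0[x] ror 8 */
+#
+# i.e. each 8-byte entry holds the same 32-bit word twice, and shifted
+# 64-bit loads stand in for the four separate 1KB tables.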
+
+$bits=32;
+for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64)	{ $bias=2047; $frame=192; }
+else		{ $bias=0;    $frame=112; }
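+# (2047 is the SPARC V9 ABI stack bias; the V8 ABI uses none)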
+$locals=16;
+
+$acc0="%l0";
+$acc1="%o0";
+$acc2="%o1";
+$acc3="%o2";
+
+$acc4="%l1";
+$acc5="%o3";
+$acc6="%o4";
+$acc7="%o5";
+
+$acc8="%l2";
+$acc9="%o7";
+$acc10="%g1";
+$acc11="%g2";
+
+$acc12="%l3";
+$acc13="%g3";
+$acc14="%g4";
+$acc15="%g5";
+
+$t0="%l4";
+$t1="%l5";
+$t2="%l6";
+$t3="%l7";
+
+$s0="%i0";
+$s1="%i1";
+$s2="%i2";
+$s3="%i3";
+$tbl="%i4";
+$key="%i5";
+$rounds="%i7";	# aliases with return address, which is off-loaded to stack
+
+sub _data_word()
+{ my $i;
+    while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
+}
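+# As in the s390x module, each word is stored twice so that a shifted
+# 64-bit load yields a rotated view of the table entry (see the header
+# comment on S-box compression above).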
+
+$code.=<<___ if ($bits==64);
+.register	%g2,#scratch
+.register	%g3,#scratch
+___
+$code.=<<___;
+.section	".text",#alloc,#execinstr
+
+.align	256
+AES_Te:
+___
+&_data_word(
+	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
+	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
+	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
+	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
+	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
+	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
+	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
+	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
+	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
+	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
+	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
+	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
+	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
+	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
+	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
+	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
+	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
+	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
+	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
+	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
+	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
+	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
+	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
+	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
+	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
+	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
+	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
+	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
+	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
+	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
+	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
+	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
+	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
+	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
+	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
+	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
+	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
+	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
+	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
+	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
+	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
+	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
+	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
+	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
+	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
+	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
+	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
+	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
+	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
+	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
+	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
+	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
+	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
+	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
+	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
+	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
+	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
+	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
+	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
+	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
+	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
+	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
+	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
+	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
+$code.=<<___;
+	.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+	.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+	.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+	.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+	.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+	.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+	.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+	.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+	.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+	.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+	.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+	.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+	.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+	.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+	.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+	.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+	.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+	.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+	.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+	.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+	.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+	.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+	.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+	.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+	.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+	.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+	.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+	.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+	.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+	.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+	.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+	.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+.type	AES_Te,#object
+.size	AES_Te,(.-AES_Te)
+
+.align	64
+.skip	16
+_sparcv9_AES_encrypt:
+	save	%sp,-$frame-$locals,%sp
+	stx	%i7,[%sp+$bias+$frame+0]	! off-load return address
+	ld	[$key+240],$rounds
+	ld	[$key+0],$t0
+	ld	[$key+4],$t1			!
+	ld	[$key+8],$t2
+	srl	$rounds,1,$rounds
+	xor	$t0,$s0,$s0
+	ld	[$key+12],$t3
+	srl	$s0,21,$acc0
+	xor	$t1,$s1,$s1
+	ld	[$key+16],$t0
+	srl	$s1,13,$acc1			!
+	xor	$t2,$s2,$s2
+	ld	[$key+20],$t1
+	xor	$t3,$s3,$s3
+	ld	[$key+24],$t2
+	and	$acc0,2040,$acc0
+	ld	[$key+28],$t3
+	nop
+.Lenc_loop:
+	srl	$s2,5,$acc2			!
+	and	$acc1,2040,$acc1
+	ldx	[$tbl+$acc0],$acc0
+	sll	$s3,3,$acc3
+	and	$acc2,2040,$acc2
+	ldx	[$tbl+$acc1],$acc1
+	srl	$s1,21,$acc4
+	and	$acc3,2040,$acc3
+	ldx	[$tbl+$acc2],$acc2		!
+	srl	$s2,13,$acc5
+	and	$acc4,2040,$acc4
+	ldx	[$tbl+$acc3],$acc3
+	srl	$s3,5,$acc6
+	and	$acc5,2040,$acc5
+	ldx	[$tbl+$acc4],$acc4
+	fmovs	%f0,%f0
+	sll	$s0,3,$acc7			!
+	and	$acc6,2040,$acc6
+	ldx	[$tbl+$acc5],$acc5
+	srl	$s2,21,$acc8
+	and	$acc7,2040,$acc7
+	ldx	[$tbl+$acc6],$acc6
+	srl	$s3,13,$acc9
+	and	$acc8,2040,$acc8
+	ldx	[$tbl+$acc7],$acc7		!
+	srl	$s0,5,$acc10
+	and	$acc9,2040,$acc9
+	ldx	[$tbl+$acc8],$acc8
+	sll	$s1,3,$acc11
+	and	$acc10,2040,$acc10
+	ldx	[$tbl+$acc9],$acc9
+	fmovs	%f0,%f0
+	srl	$s3,21,$acc12			!
+	and	$acc11,2040,$acc11
+	ldx	[$tbl+$acc10],$acc10
+	srl	$s0,13,$acc13
+	and	$acc12,2040,$acc12
+	ldx	[$tbl+$acc11],$acc11
+	srl	$s1,5,$acc14
+	and	$acc13,2040,$acc13
+	ldx	[$tbl+$acc12],$acc12		!
+	sll	$s2,3,$acc15
+	and	$acc14,2040,$acc14
+	ldx	[$tbl+$acc13],$acc13
+	and	$acc15,2040,$acc15
+	add	$key,32,$key
+	ldx	[$tbl+$acc14],$acc14
+	fmovs	%f0,%f0
+	subcc	$rounds,1,$rounds		!
+	ldx	[$tbl+$acc15],$acc15
+	bz,a,pn	%icc,.Lenc_last
+	add	$tbl,2048,$rounds
+
+		srlx	$acc1,8,$acc1
+		xor	$acc0,$t0,$t0
+	ld	[$key+0],$s0
+	fmovs	%f0,%f0
+		srlx	$acc2,16,$acc2		!
+		xor	$acc1,$t0,$t0
+	ld	[$key+4],$s1
+		srlx	$acc3,24,$acc3
+		xor	$acc2,$t0,$t0
+	ld	[$key+8],$s2
+		srlx	$acc5,8,$acc5
+		xor	$acc3,$t0,$t0
+	ld	[$key+12],$s3			!
+		srlx	$acc6,16,$acc6
+		xor	$acc4,$t1,$t1
+	fmovs	%f0,%f0
+		srlx	$acc7,24,$acc7
+		xor	$acc5,$t1,$t1
+		srlx	$acc9,8,$acc9
+		xor	$acc6,$t1,$t1
+		srlx	$acc10,16,$acc10	!
+		xor	$acc7,$t1,$t1
+		srlx	$acc11,24,$acc11
+		xor	$acc8,$t2,$t2
+		srlx	$acc13,8,$acc13
+		xor	$acc9,$t2,$t2
+		srlx	$acc14,16,$acc14
+		xor	$acc10,$t2,$t2
+		srlx	$acc15,24,$acc15	!
+		xor	$acc11,$t2,$t2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$t3,$t3
+	srl	$t0,21,$acc0
+		xor	$acc14,$t3,$t3
+	srl	$t1,13,$acc1
+		xor	$acc15,$t3,$t3
+
+	and	$acc0,2040,$acc0		!
+	srl	$t2,5,$acc2
+	and	$acc1,2040,$acc1
+	ldx	[$tbl+$acc0],$acc0
+	sll	$t3,3,$acc3
+	and	$acc2,2040,$acc2
+	ldx	[$tbl+$acc1],$acc1
+	fmovs	%f0,%f0
+	srl	$t1,21,$acc4			!
+	and	$acc3,2040,$acc3
+	ldx	[$tbl+$acc2],$acc2
+	srl	$t2,13,$acc5
+	and	$acc4,2040,$acc4
+	ldx	[$tbl+$acc3],$acc3
+	srl	$t3,5,$acc6
+	and	$acc5,2040,$acc5
+	ldx	[$tbl+$acc4],$acc4		!
+	sll	$t0,3,$acc7
+	and	$acc6,2040,$acc6
+	ldx	[$tbl+$acc5],$acc5
+	srl	$t2,21,$acc8
+	and	$acc7,2040,$acc7
+	ldx	[$tbl+$acc6],$acc6
+	fmovs	%f0,%f0
+	srl	$t3,13,$acc9			!
+	and	$acc8,2040,$acc8
+	ldx	[$tbl+$acc7],$acc7
+	srl	$t0,5,$acc10
+	and	$acc9,2040,$acc9
+	ldx	[$tbl+$acc8],$acc8
+	sll	$t1,3,$acc11
+	and	$acc10,2040,$acc10
+	ldx	[$tbl+$acc9],$acc9		!
+	srl	$t3,21,$acc12
+	and	$acc11,2040,$acc11
+	ldx	[$tbl+$acc10],$acc10
+	srl	$t0,13,$acc13
+	and	$acc12,2040,$acc12
+	ldx	[$tbl+$acc11],$acc11
+	fmovs	%f0,%f0
+	srl	$t1,5,$acc14			!
+	and	$acc13,2040,$acc13
+	ldx	[$tbl+$acc12],$acc12
+	sll	$t2,3,$acc15
+	and	$acc14,2040,$acc14
+	ldx	[$tbl+$acc13],$acc13
+		srlx	$acc1,8,$acc1
+	and	$acc15,2040,$acc15
+	ldx	[$tbl+$acc14],$acc14		!
+
+		srlx	$acc2,16,$acc2
+		xor	$acc0,$s0,$s0
+	ldx	[$tbl+$acc15],$acc15
+		srlx	$acc3,24,$acc3
+		xor	$acc1,$s0,$s0
+	ld	[$key+16],$t0
+	fmovs	%f0,%f0
+		srlx	$acc5,8,$acc5		!
+		xor	$acc2,$s0,$s0
+	ld	[$key+20],$t1
+		srlx	$acc6,16,$acc6
+		xor	$acc3,$s0,$s0
+	ld	[$key+24],$t2
+		srlx	$acc7,24,$acc7
+		xor	$acc4,$s1,$s1
+	ld	[$key+28],$t3			!
+		srlx	$acc9,8,$acc9
+		xor	$acc5,$s1,$s1
+	ldx	[$tbl+2048+0],%g0		! prefetch te4
+		srlx	$acc10,16,$acc10
+		xor	$acc6,$s1,$s1
+	ldx	[$tbl+2048+32],%g0		! prefetch te4
+		srlx	$acc11,24,$acc11
+		xor	$acc7,$s1,$s1
+	ldx	[$tbl+2048+64],%g0		! prefetch te4
+		srlx	$acc13,8,$acc13
+		xor	$acc8,$s2,$s2
+	ldx	[$tbl+2048+96],%g0		! prefetch te4
+		srlx	$acc14,16,$acc14	!
+		xor	$acc9,$s2,$s2
+	ldx	[$tbl+2048+128],%g0		! prefetch te4
+		srlx	$acc15,24,$acc15
+		xor	$acc10,$s2,$s2
+	ldx	[$tbl+2048+160],%g0		! prefetch te4
+	srl	$s0,21,$acc0
+		xor	$acc11,$s2,$s2
+	ldx	[$tbl+2048+192],%g0		! prefetch te4
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$s3,$s3
+	ldx	[$tbl+2048+224],%g0		! prefetch te4
+	srl	$s1,13,$acc1			!
+		xor	$acc14,$s3,$s3
+		xor	$acc15,$s3,$s3
+	ba	.Lenc_loop
+	and	$acc0,2040,$acc0
+
+.align	32
+.Lenc_last:
+		srlx	$acc1,8,$acc1		!
+		xor	$acc0,$t0,$t0
+	ld	[$key+0],$s0
+		srlx	$acc2,16,$acc2
+		xor	$acc1,$t0,$t0
+	ld	[$key+4],$s1
+		srlx	$acc3,24,$acc3
+		xor	$acc2,$t0,$t0
+	ld	[$key+8],$s2			!
+		srlx	$acc5,8,$acc5
+		xor	$acc3,$t0,$t0
+	ld	[$key+12],$s3
+		srlx	$acc6,16,$acc6
+		xor	$acc4,$t1,$t1
+		srlx	$acc7,24,$acc7
+		xor	$acc5,$t1,$t1
+		srlx	$acc9,8,$acc9		!
+		xor	$acc6,$t1,$t1
+		srlx	$acc10,16,$acc10
+		xor	$acc7,$t1,$t1
+		srlx	$acc11,24,$acc11
+		xor	$acc8,$t2,$t2
+		srlx	$acc13,8,$acc13
+		xor	$acc9,$t2,$t2
+		srlx	$acc14,16,$acc14	!
+		xor	$acc10,$t2,$t2
+		srlx	$acc15,24,$acc15
+		xor	$acc11,$t2,$t2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$t3,$t3
+	srl	$t0,24,$acc0
+		xor	$acc14,$t3,$t3
+	srl	$t1,16,$acc1			!
+		xor	$acc15,$t3,$t3
+
+	srl	$t2,8,$acc2
+	and	$acc1,255,$acc1
+	ldub	[$rounds+$acc0],$acc0
+	srl	$t1,24,$acc4
+	and	$acc2,255,$acc2
+	ldub	[$rounds+$acc1],$acc1
+	srl	$t2,16,$acc5			!
+	and	$t3,255,$acc3
+	ldub	[$rounds+$acc2],$acc2
+	ldub	[$rounds+$acc3],$acc3
+	srl	$t3,8,$acc6
+	and	$acc5,255,$acc5
+	ldub	[$rounds+$acc4],$acc4
+	fmovs	%f0,%f0
+	srl	$t2,24,$acc8			!
+	and	$acc6,255,$acc6
+	ldub	[$rounds+$acc5],$acc5
+	srl	$t3,16,$acc9
+	and	$t0,255,$acc7
+	ldub	[$rounds+$acc6],$acc6
+	ldub	[$rounds+$acc7],$acc7
+	fmovs	%f0,%f0
+	srl	$t0,8,$acc10			!
+	and	$acc9,255,$acc9
+	ldub	[$rounds+$acc8],$acc8
+	srl	$t3,24,$acc12
+	and	$acc10,255,$acc10
+	ldub	[$rounds+$acc9],$acc9
+	srl	$t0,16,$acc13
+	and	$t1,255,$acc11
+	ldub	[$rounds+$acc10],$acc10		!
+	srl	$t1,8,$acc14
+	and	$acc13,255,$acc13
+	ldub	[$rounds+$acc11],$acc11
+	ldub	[$rounds+$acc12],$acc12
+	and	$acc14,255,$acc14
+	ldub	[$rounds+$acc13],$acc13
+	and	$t2,255,$acc15
+	ldub	[$rounds+$acc14],$acc14		!
+
+		sll	$acc0,24,$acc0
+		xor	$acc3,$s0,$s0
+	ldub	[$rounds+$acc15],$acc15
+		sll	$acc1,16,$acc1
+		xor	$acc0,$s0,$s0
+	ldx	[%sp+$bias+$frame+0],%i7	! restore return address
+	fmovs	%f0,%f0
+		sll	$acc2,8,$acc2		!
+		xor	$acc1,$s0,$s0
+		sll	$acc4,24,$acc4
+		xor	$acc2,$s0,$s0
+		sll	$acc5,16,$acc5
+		xor	$acc7,$s1,$s1
+		sll	$acc6,8,$acc6
+		xor	$acc4,$s1,$s1
+		sll	$acc8,24,$acc8		!
+		xor	$acc5,$s1,$s1
+		sll	$acc9,16,$acc9
+		xor	$acc11,$s2,$s2
+		sll	$acc10,8,$acc10
+		xor	$acc6,$s1,$s1
+		sll	$acc12,24,$acc12
+		xor	$acc8,$s2,$s2
+		sll	$acc13,16,$acc13	!
+		xor	$acc9,$s2,$s2
+		sll	$acc14,8,$acc14
+		xor	$acc10,$s2,$s2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$s3,$s3
+		xor	$acc14,$s3,$s3
+		xor	$acc15,$s3,$s3
+
+	ret
+	restore
+.type	_sparcv9_AES_encrypt,#function
+.size	_sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
+
+.align	32
+.globl	AES_encrypt
+AES_encrypt:
+	or	%o0,%o1,%g1
+	andcc	%g1,3,%g0
+	bnz,pn	%xcc,.Lunaligned_enc
+	save	%sp,-$frame,%sp
+
+	ld	[%i0+0],%o0
+	ld	[%i0+4],%o1
+	ld	[%i0+8],%o2
+	ld	[%i0+12],%o3
+
+1:	call	.+8
+	add	%o7,AES_Te-1b,%o4
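+	! (the call captures its own address in %o7, so the add in its
+	!  delay slot forms the run-time address of AES_Te in %o4)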
+	call	_sparcv9_AES_encrypt
+	mov	%i2,%o5
+
+	st	%o0,[%i1+0]
+	st	%o1,[%i1+4]
+	st	%o2,[%i1+8]
+	st	%o3,[%i1+12]
+
+	ret
+	restore
+
+.align	32
+.Lunaligned_enc:
+	ldub	[%i0+0],%l0
+	ldub	[%i0+1],%l1
+	ldub	[%i0+2],%l2
+
+	sll	%l0,24,%l0
+	ldub	[%i0+3],%l3
+	sll	%l1,16,%l1
+	ldub	[%i0+4],%l4
+	sll	%l2,8,%l2
+	or	%l1,%l0,%l0
+	ldub	[%i0+5],%l5
+	sll	%l4,24,%l4
+	or	%l3,%l2,%l2
+	ldub	[%i0+6],%l6
+	sll	%l5,16,%l5
+	or	%l0,%l2,%o0
+	ldub	[%i0+7],%l7
+
+	sll	%l6,8,%l6
+	or	%l5,%l4,%l4
+	ldub	[%i0+8],%l0
+	or	%l7,%l6,%l6
+	ldub	[%i0+9],%l1
+	or	%l4,%l6,%o1
+	ldub	[%i0+10],%l2
+
+	sll	%l0,24,%l0
+	ldub	[%i0+11],%l3
+	sll	%l1,16,%l1
+	ldub	[%i0+12],%l4
+	sll	%l2,8,%l2
+	or	%l1,%l0,%l0
+	ldub	[%i0+13],%l5
+	sll	%l4,24,%l4
+	or	%l3,%l2,%l2
+	ldub	[%i0+14],%l6
+	sll	%l5,16,%l5
+	or	%l0,%l2,%o2
+	ldub	[%i0+15],%l7
+
+	sll	%l6,8,%l6
+	or	%l5,%l4,%l4
+	or	%l7,%l6,%l6
+	or	%l4,%l6,%o3
+
+1:	call	.+8
+	add	%o7,AES_Te-1b,%o4
+	call	_sparcv9_AES_encrypt
+	mov	%i2,%o5
+
+	srl	%o0,24,%l0
+	srl	%o0,16,%l1
+	stb	%l0,[%i1+0]
+	srl	%o0,8,%l2
+	stb	%l1,[%i1+1]
+	stb	%l2,[%i1+2]
+	srl	%o1,24,%l4
+	stb	%o0,[%i1+3]
+
+	srl	%o1,16,%l5
+	stb	%l4,[%i1+4]
+	srl	%o1,8,%l6
+	stb	%l5,[%i1+5]
+	stb	%l6,[%i1+6]
+	srl	%o2,24,%l0
+	stb	%o1,[%i1+7]
+
+	srl	%o2,16,%l1
+	stb	%l0,[%i1+8]
+	srl	%o2,8,%l2
+	stb	%l1,[%i1+9]
+	stb	%l2,[%i1+10]
+	srl	%o3,24,%l4
+	stb	%o2,[%i1+11]
+
+	srl	%o3,16,%l5
+	stb	%l4,[%i1+12]
+	srl	%o3,8,%l6
+	stb	%l5,[%i1+13]
+	stb	%l6,[%i1+14]
+	stb	%o3,[%i1+15]
+
+	ret
+	restore
+.type	AES_encrypt,#function
+.size	AES_encrypt,(.-AES_encrypt)
+
+___
+
+$code.=<<___;
+.align	256
+AES_Td:
+___
+&_data_word(
+	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
+	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
+	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
+	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
+	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
+	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
+	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
+	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
+	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
+	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
+	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
+	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
+	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
+	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
+	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
+	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
+	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
+	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
+	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
+	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
+	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
+	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
+	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
+	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
+	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
+	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
+	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
+	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
+	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
+	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
+	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
+	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
+	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
+	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
+	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
+	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
+	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
+	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
+	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
+	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
+	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
+	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
+	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
+	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
+	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
+	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
+	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
+	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
+	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
+	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
+	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
+	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
+	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
+	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
+	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
+	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
+	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
+	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
+	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
+	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
+	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
+	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
+	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
+	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
+$code.=<<___;
+	.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+	.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+	.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+	.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+	.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+	.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+	.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+	.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+	.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+	.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+	.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+	.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+	.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+	.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+	.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+	.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+	.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+	.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+	.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+	.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+	.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+	.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+	.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+	.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+	.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+	.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+	.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+	.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+	.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+	.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+	.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+	.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+.type	AES_Td,#object
+.size	AES_Td,(.-AES_Td)
+
+.align	64
+.skip	16
+_sparcv9_AES_decrypt:
+	save	%sp,-$frame-$locals,%sp
+	stx	%i7,[%sp+$bias+$frame+0]	! off-load return address
+	ld	[$key+240],$rounds
+	ld	[$key+0],$t0
+	ld	[$key+4],$t1			!
+	ld	[$key+8],$t2
+	ld	[$key+12],$t3
+	srl	$rounds,1,$rounds
+	xor	$t0,$s0,$s0
+	ld	[$key+16],$t0
+	xor	$t1,$s1,$s1
+	ld	[$key+20],$t1
+	srl	$s0,21,$acc0			!
+	xor	$t2,$s2,$s2
+	ld	[$key+24],$t2
+	xor	$t3,$s3,$s3
+	and	$acc0,2040,$acc0
+	ld	[$key+28],$t3
+	srl	$s3,13,$acc1
+	nop
+.Ldec_loop:
+	srl	$s2,5,$acc2			!
+	and	$acc1,2040,$acc1
+	ldx	[$tbl+$acc0],$acc0
+	sll	$s1,3,$acc3
+	and	$acc2,2040,$acc2
+	ldx	[$tbl+$acc1],$acc1
+	srl	$s1,21,$acc4
+	and	$acc3,2040,$acc3
+	ldx	[$tbl+$acc2],$acc2		!
+	srl	$s0,13,$acc5
+	and	$acc4,2040,$acc4
+	ldx	[$tbl+$acc3],$acc3
+	srl	$s3,5,$acc6
+	and	$acc5,2040,$acc5
+	ldx	[$tbl+$acc4],$acc4
+	fmovs	%f0,%f0
+	sll	$s2,3,$acc7			!
+	and	$acc6,2040,$acc6
+	ldx	[$tbl+$acc5],$acc5
+	srl	$s2,21,$acc8
+	and	$acc7,2040,$acc7
+	ldx	[$tbl+$acc6],$acc6
+	srl	$s1,13,$acc9
+	and	$acc8,2040,$acc8
+	ldx	[$tbl+$acc7],$acc7		!
+	srl	$s0,5,$acc10
+	and	$acc9,2040,$acc9
+	ldx	[$tbl+$acc8],$acc8
+	sll	$s3,3,$acc11
+	and	$acc10,2040,$acc10
+	ldx	[$tbl+$acc9],$acc9
+	fmovs	%f0,%f0
+	srl	$s3,21,$acc12			!
+	and	$acc11,2040,$acc11
+	ldx	[$tbl+$acc10],$acc10
+	srl	$s2,13,$acc13
+	and	$acc12,2040,$acc12
+	ldx	[$tbl+$acc11],$acc11
+	srl	$s1,5,$acc14
+	and	$acc13,2040,$acc13
+	ldx	[$tbl+$acc12],$acc12		!
+	sll	$s0,3,$acc15
+	and	$acc14,2040,$acc14
+	ldx	[$tbl+$acc13],$acc13
+	and	$acc15,2040,$acc15
+	add	$key,32,$key
+	ldx	[$tbl+$acc14],$acc14
+	fmovs	%f0,%f0
+	subcc	$rounds,1,$rounds		!
+	ldx	[$tbl+$acc15],$acc15
+	bz,a,pn	%icc,.Ldec_last
+	add	$tbl,2048,$rounds
+
+		srlx	$acc1,8,$acc1
+		xor	$acc0,$t0,$t0
+	ld	[$key+0],$s0
+	fmovs	%f0,%f0
+		srlx	$acc2,16,$acc2		!
+		xor	$acc1,$t0,$t0
+	ld	[$key+4],$s1
+		srlx	$acc3,24,$acc3
+		xor	$acc2,$t0,$t0
+	ld	[$key+8],$s2
+		srlx	$acc5,8,$acc5
+		xor	$acc3,$t0,$t0
+	ld	[$key+12],$s3			!
+		srlx	$acc6,16,$acc6
+		xor	$acc4,$t1,$t1
+	fmovs	%f0,%f0
+		srlx	$acc7,24,$acc7
+		xor	$acc5,$t1,$t1
+		srlx	$acc9,8,$acc9
+		xor	$acc6,$t1,$t1
+		srlx	$acc10,16,$acc10	!
+		xor	$acc7,$t1,$t1
+		srlx	$acc11,24,$acc11
+		xor	$acc8,$t2,$t2
+		srlx	$acc13,8,$acc13
+		xor	$acc9,$t2,$t2
+		srlx	$acc14,16,$acc14
+		xor	$acc10,$t2,$t2
+		srlx	$acc15,24,$acc15	!
+		xor	$acc11,$t2,$t2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$t3,$t3
+	srl	$t0,21,$acc0
+		xor	$acc14,$t3,$t3
+		xor	$acc15,$t3,$t3
+	srl	$t3,13,$acc1
+
+	and	$acc0,2040,$acc0		!
+	srl	$t2,5,$acc2
+	and	$acc1,2040,$acc1
+	ldx	[$tbl+$acc0],$acc0
+	sll	$t1,3,$acc3
+	and	$acc2,2040,$acc2
+	ldx	[$tbl+$acc1],$acc1
+	fmovs	%f0,%f0
+	srl	$t1,21,$acc4			!
+	and	$acc3,2040,$acc3
+	ldx	[$tbl+$acc2],$acc2
+	srl	$t0,13,$acc5
+	and	$acc4,2040,$acc4
+	ldx	[$tbl+$acc3],$acc3
+	srl	$t3,5,$acc6
+	and	$acc5,2040,$acc5
+	ldx	[$tbl+$acc4],$acc4		!
+	sll	$t2,3,$acc7
+	and	$acc6,2040,$acc6
+	ldx	[$tbl+$acc5],$acc5
+	srl	$t2,21,$acc8
+	and	$acc7,2040,$acc7
+	ldx	[$tbl+$acc6],$acc6
+	fmovs	%f0,%f0
+	srl	$t1,13,$acc9			!
+	and	$acc8,2040,$acc8
+	ldx	[$tbl+$acc7],$acc7
+	srl	$t0,5,$acc10
+	and	$acc9,2040,$acc9
+	ldx	[$tbl+$acc8],$acc8
+	sll	$t3,3,$acc11
+	and	$acc10,2040,$acc10
+	ldx	[$tbl+$acc9],$acc9		!
+	srl	$t3,21,$acc12
+	and	$acc11,2040,$acc11
+	ldx	[$tbl+$acc10],$acc10
+	srl	$t2,13,$acc13
+	and	$acc12,2040,$acc12
+	ldx	[$tbl+$acc11],$acc11
+	fmovs	%f0,%f0
+	srl	$t1,5,$acc14			!
+	and	$acc13,2040,$acc13
+	ldx	[$tbl+$acc12],$acc12
+	sll	$t0,3,$acc15
+	and	$acc14,2040,$acc14
+	ldx	[$tbl+$acc13],$acc13
+		srlx	$acc1,8,$acc1
+	and	$acc15,2040,$acc15
+	ldx	[$tbl+$acc14],$acc14		!
+
+		srlx	$acc2,16,$acc2
+		xor	$acc0,$s0,$s0
+	ldx	[$tbl+$acc15],$acc15
+		srlx	$acc3,24,$acc3
+		xor	$acc1,$s0,$s0
+	ld	[$key+16],$t0
+	fmovs	%f0,%f0
+		srlx	$acc5,8,$acc5		!
+		xor	$acc2,$s0,$s0
+	ld	[$key+20],$t1
+		srlx	$acc6,16,$acc6
+		xor	$acc3,$s0,$s0
+	ld	[$key+24],$t2
+		srlx	$acc7,24,$acc7
+		xor	$acc4,$s1,$s1
+	ld	[$key+28],$t3			!
+		srlx	$acc9,8,$acc9
+		xor	$acc5,$s1,$s1
+	ldx	[$tbl+2048+0],%g0		! prefetch td4
+		srlx	$acc10,16,$acc10
+		xor	$acc6,$s1,$s1
+	ldx	[$tbl+2048+32],%g0		! prefetch td4
+		srlx	$acc11,24,$acc11
+		xor	$acc7,$s1,$s1
+	ldx	[$tbl+2048+64],%g0		! prefetch td4
+		srlx	$acc13,8,$acc13
+		xor	$acc8,$s2,$s2
+	ldx	[$tbl+2048+96],%g0		! prefetch td4
+		srlx	$acc14,16,$acc14	!
+		xor	$acc9,$s2,$s2
+	ldx	[$tbl+2048+128],%g0		! prefetch td4
+		srlx	$acc15,24,$acc15
+		xor	$acc10,$s2,$s2
+	ldx	[$tbl+2048+160],%g0		! prefetch td4
+	srl	$s0,21,$acc0
+		xor	$acc11,$s2,$s2
+	ldx	[$tbl+2048+192],%g0		! prefetch td4
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$s3,$s3
+	ldx	[$tbl+2048+224],%g0		! prefetch td4
+	and	$acc0,2040,$acc0		!
+		xor	$acc14,$s3,$s3
+		xor	$acc15,$s3,$s3
+	ba	.Ldec_loop
+	srl	$s3,13,$acc1
+
+.align	32
+.Ldec_last:
+		srlx	$acc1,8,$acc1		!
+		xor	$acc0,$t0,$t0
+	ld	[$key+0],$s0
+		srlx	$acc2,16,$acc2
+		xor	$acc1,$t0,$t0
+	ld	[$key+4],$s1
+		srlx	$acc3,24,$acc3
+		xor	$acc2,$t0,$t0
+	ld	[$key+8],$s2			!
+		srlx	$acc5,8,$acc5
+		xor	$acc3,$t0,$t0
+	ld	[$key+12],$s3
+		srlx	$acc6,16,$acc6
+		xor	$acc4,$t1,$t1
+		srlx	$acc7,24,$acc7
+		xor	$acc5,$t1,$t1
+		srlx	$acc9,8,$acc9		!
+		xor	$acc6,$t1,$t1
+		srlx	$acc10,16,$acc10
+		xor	$acc7,$t1,$t1
+		srlx	$acc11,24,$acc11
+		xor	$acc8,$t2,$t2
+		srlx	$acc13,8,$acc13
+		xor	$acc9,$t2,$t2
+		srlx	$acc14,16,$acc14	!
+		xor	$acc10,$t2,$t2
+		srlx	$acc15,24,$acc15
+		xor	$acc11,$t2,$t2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$t3,$t3
+	srl	$t0,24,$acc0
+		xor	$acc14,$t3,$t3
+		xor	$acc15,$t3,$t3		!
+	srl	$t3,16,$acc1
+
+	srl	$t2,8,$acc2
+	and	$acc1,255,$acc1
+	ldub	[$rounds+$acc0],$acc0
+	srl	$t1,24,$acc4
+	and	$acc2,255,$acc2
+	ldub	[$rounds+$acc1],$acc1
+	srl	$t0,16,$acc5			!
+	and	$t1,255,$acc3
+	ldub	[$rounds+$acc2],$acc2
+	ldub	[$rounds+$acc3],$acc3
+	srl	$t3,8,$acc6
+	and	$acc5,255,$acc5
+	ldub	[$rounds+$acc4],$acc4
+	fmovs	%f0,%f0
+	srl	$t2,24,$acc8			!
+	and	$acc6,255,$acc6
+	ldub	[$rounds+$acc5],$acc5
+	srl	$t1,16,$acc9
+	and	$t2,255,$acc7
+	ldub	[$rounds+$acc6],$acc6
+	ldub	[$rounds+$acc7],$acc7
+	fmovs	%f0,%f0
+	srl	$t0,8,$acc10			!
+	and	$acc9,255,$acc9
+	ldub	[$rounds+$acc8],$acc8
+	srl	$t3,24,$acc12
+	and	$acc10,255,$acc10
+	ldub	[$rounds+$acc9],$acc9
+	srl	$t2,16,$acc13
+	and	$t3,255,$acc11
+	ldub	[$rounds+$acc10],$acc10		!
+	srl	$t1,8,$acc14
+	and	$acc13,255,$acc13
+	ldub	[$rounds+$acc11],$acc11
+	ldub	[$rounds+$acc12],$acc12
+	and	$acc14,255,$acc14
+	ldub	[$rounds+$acc13],$acc13
+	and	$t0,255,$acc15
+	ldub	[$rounds+$acc14],$acc14		!
+
+		sll	$acc0,24,$acc0
+		xor	$acc3,$s0,$s0
+	ldub	[$rounds+$acc15],$acc15
+		sll	$acc1,16,$acc1
+		xor	$acc0,$s0,$s0
+	ldx	[%sp+$bias+$frame+0],%i7	! restore return address
+	fmovs	%f0,%f0
+		sll	$acc2,8,$acc2		!
+		xor	$acc1,$s0,$s0
+		sll	$acc4,24,$acc4
+		xor	$acc2,$s0,$s0
+		sll	$acc5,16,$acc5
+		xor	$acc7,$s1,$s1
+		sll	$acc6,8,$acc6
+		xor	$acc4,$s1,$s1
+		sll	$acc8,24,$acc8		!
+		xor	$acc5,$s1,$s1
+		sll	$acc9,16,$acc9
+		xor	$acc11,$s2,$s2
+		sll	$acc10,8,$acc10
+		xor	$acc6,$s1,$s1
+		sll	$acc12,24,$acc12
+		xor	$acc8,$s2,$s2
+		sll	$acc13,16,$acc13	!
+		xor	$acc9,$s2,$s2
+		sll	$acc14,8,$acc14
+		xor	$acc10,$s2,$s2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$s3,$s3
+		xor	$acc14,$s3,$s3
+		xor	$acc15,$s3,$s3
+
+	ret
+	restore
+.type	_sparcv9_AES_decrypt,#function
+.size	_sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
+
+.align	32
+.globl	AES_decrypt
+AES_decrypt:
+	or	%o0,%o1,%g1
+	andcc	%g1,3,%g0
+	bnz,pn	%xcc,.Lunaligned_dec
+	save	%sp,-$frame,%sp
+
+	ld	[%i0+0],%o0
+	ld	[%i0+4],%o1
+	ld	[%i0+8],%o2
+	ld	[%i0+12],%o3
+
+1:	call	.+8
+	add	%o7,AES_Td-1b,%o4
+	call	_sparcv9_AES_decrypt
+	mov	%i2,%o5
+
+	st	%o0,[%i1+0]
+	st	%o1,[%i1+4]
+	st	%o2,[%i1+8]
+	st	%o3,[%i1+12]
+
+	ret
+	restore
+
+.align	32
+.Lunaligned_dec:
+	ldub	[%i0+0],%l0
+	ldub	[%i0+1],%l1
+	ldub	[%i0+2],%l2
+
+	sll	%l0,24,%l0
+	ldub	[%i0+3],%l3
+	sll	%l1,16,%l1
+	ldub	[%i0+4],%l4
+	sll	%l2,8,%l2
+	or	%l1,%l0,%l0
+	ldub	[%i0+5],%l5
+	sll	%l4,24,%l4
+	or	%l3,%l2,%l2
+	ldub	[%i0+6],%l6
+	sll	%l5,16,%l5
+	or	%l0,%l2,%o0
+	ldub	[%i0+7],%l7
+
+	sll	%l6,8,%l6
+	or	%l5,%l4,%l4
+	ldub	[%i0+8],%l0
+	or	%l7,%l6,%l6
+	ldub	[%i0+9],%l1
+	or	%l4,%l6,%o1
+	ldub	[%i0+10],%l2
+
+	sll	%l0,24,%l0
+	ldub	[%i0+11],%l3
+	sll	%l1,16,%l1
+	ldub	[%i0+12],%l4
+	sll	%l2,8,%l2
+	or	%l1,%l0,%l0
+	ldub	[%i0+13],%l5
+	sll	%l4,24,%l4
+	or	%l3,%l2,%l2
+	ldub	[%i0+14],%l6
+	sll	%l5,16,%l5
+	or	%l0,%l2,%o2
+	ldub	[%i0+15],%l7
+
+	sll	%l6,8,%l6
+	or	%l5,%l4,%l4
+	or	%l7,%l6,%l6
+	or	%l4,%l6,%o3
+
+1:	call	.+8
+	add	%o7,AES_Td-1b,%o4
+	call	_sparcv9_AES_decrypt
+	mov	%i2,%o5
+
+	srl	%o0,24,%l0
+	srl	%o0,16,%l1
+	stb	%l0,[%i1+0]
+	srl	%o0,8,%l2
+	stb	%l1,[%i1+1]
+	stb	%l2,[%i1+2]
+	srl	%o1,24,%l4
+	stb	%o0,[%i1+3]
+
+	srl	%o1,16,%l5
+	stb	%l4,[%i1+4]
+	srl	%o1,8,%l6
+	stb	%l5,[%i1+5]
+	stb	%l6,[%i1+6]
+	srl	%o2,24,%l0
+	stb	%o1,[%i1+7]
+
+	srl	%o2,16,%l1
+	stb	%l0,[%i1+8]
+	srl	%o2,8,%l2
+	stb	%l1,[%i1+9]
+	stb	%l2,[%i1+10]
+	srl	%o3,24,%l4
+	stb	%o2,[%i1+11]
+
+	srl	%o3,16,%l5
+	stb	%l4,[%i1+12]
+	srl	%o3,8,%l6
+	stb	%l5,[%i1+13]
+	stb	%l6,[%i1+14]
+	stb	%o3,[%i1+15]
+
+	ret
+	restore
+.type	AES_decrypt,#function
+.size	AES_decrypt,(.-AES_decrypt)
+___
+
+# fmovs instructions substituting for FP nops were originally added
+# to meet specific instruction alignment requirements to maximize ILP.
+# As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have
+# an undesired effect, so just omit them here and sacrifice a small
+# percentage of performance...
+$code =~ s/fmovs.*$//gem;
+
+print $code;
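
The AES_decrypt glue above keeps the standard libcrypto ABI, and the andcc/.Lunaligned_dec path only changes how the 16-byte blocks are loaded and stored. A minimal caller sketch in C; the 32-byte key buffer key32 is illustrative:

#include <openssl/aes.h>

void decrypt_block(const unsigned char key32[32],
		const unsigned char in[16], unsigned char out[16])
	{
	AES_KEY dk;

	/* expand the 256-bit key into decryption round keys */
	if (AES_set_decrypt_key(key32, 256, &dk) != 0)
		return;
	/* no alignment requirement: the unaligned path above falls
	 * back to byte-wise loads and stores */
	AES_decrypt(in, out, &dk);
	}
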
diff --git a/src/lib/libcrypto/asn1/a_bytes.c b/src/lib/libcrypto/asn1/a_bytes.c
index 8d13f9c931..92d630cdba 100644
--- a/src/lib/libcrypto/asn1/a_bytes.c
+++ b/src/lib/libcrypto/asn1/a_bytes.c
@@ -79,7 +79,7 @@ ASN1_STRING *d2i_ASN1_type_bytes(ASN1_STRING **a, const unsigned char **pp,
 
 	if (tag >= 32)
 		{
-		i=ASN1_R_TAG_VALUE_TOO_HIGH;;
+		i=ASN1_R_TAG_VALUE_TOO_HIGH;
 		goto err;
 		}
 	if (!(ASN1_tag2bit(tag) & type))
diff --git a/src/lib/libcrypto/asn1/ameth_lib.c b/src/lib/libcrypto/asn1/ameth_lib.c
new file mode 100644
index 0000000000..18957c669e
--- /dev/null
+++ b/src/lib/libcrypto/asn1/ameth_lib.c
@@ -0,0 +1,446 @@
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2006.
+ */
+/* ====================================================================
+ * Copyright (c) 2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#ifndef OPENSSL_NO_ENGINE
+#include <openssl/engine.h>
+#endif
+#include "asn1_locl.h"
+
+extern const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[];
+extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[];
+extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth;
+extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth;
+extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth;
+
+/* Keep this sorted in type order !! */
+static const EVP_PKEY_ASN1_METHOD *standard_methods[] = 
+	{
+#ifndef OPENSSL_NO_RSA
+	&rsa_asn1_meths[0],
+	&rsa_asn1_meths[1],
+#endif
+#ifndef OPENSSL_NO_DH
+	&dh_asn1_meth,
+#endif
+#ifndef OPENSSL_NO_DSA
+	&dsa_asn1_meths[0],
+	&dsa_asn1_meths[1],
+	&dsa_asn1_meths[2],
+	&dsa_asn1_meths[3],
+	&dsa_asn1_meths[4],
+#endif
+#ifndef OPENSSL_NO_EC
+	&eckey_asn1_meth,
+#endif
+	&hmac_asn1_meth
+	};
+
+typedef int sk_cmp_fn_type(const char * const *a, const char * const *b);
+DECLARE_STACK_OF(EVP_PKEY_ASN1_METHOD)
+static STACK_OF(EVP_PKEY_ASN1_METHOD) *app_methods = NULL;
+
+
+
+#ifdef TEST
+int main(void)
+	{
+	int i;
+	for (i = 0;
+		i < sizeof(standard_methods)/sizeof(EVP_PKEY_ASN1_METHOD *);
+		i++)
+		fprintf(stderr, "Number %d id=%d (%s)\n", i,
+			standard_methods[i]->pkey_id,
+			OBJ_nid2sn(standard_methods[i]->pkey_id));
+	return 0;
+	}
+#endif
+
+DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_ASN1_METHOD *,
+			   const EVP_PKEY_ASN1_METHOD *, ameth);
+
+static int ameth_cmp(const EVP_PKEY_ASN1_METHOD * const *a,
+		     const EVP_PKEY_ASN1_METHOD * const *b)
+	{
+        return ((*a)->pkey_id - (*b)->pkey_id);
+	}
+
+IMPLEMENT_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_ASN1_METHOD *,
+			     const EVP_PKEY_ASN1_METHOD *, ameth);
+
+int EVP_PKEY_asn1_get_count(void)
+	{
+	int num = sizeof(standard_methods)/sizeof(EVP_PKEY_ASN1_METHOD *);
+	if (app_methods)
+		num += sk_EVP_PKEY_ASN1_METHOD_num(app_methods);
+	return num;
+	}
+
+const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_get0(int idx)
+	{
+	int num = sizeof(standard_methods)/sizeof(EVP_PKEY_ASN1_METHOD *);
+	if (idx < 0)
+		return NULL; 
+	if (idx < num)
+		return standard_methods[idx];
+	idx -= num;
+	return sk_EVP_PKEY_ASN1_METHOD_value(app_methods, idx);
+	}
+
+static const EVP_PKEY_ASN1_METHOD *pkey_asn1_find(int type)
+	{
+	EVP_PKEY_ASN1_METHOD tmp;
+	const EVP_PKEY_ASN1_METHOD *t = &tmp, **ret;
+	tmp.pkey_id = type;
+	if (app_methods)
+		{
+		int idx;
+		idx = sk_EVP_PKEY_ASN1_METHOD_find(app_methods, &tmp);
+		if (idx >= 0)
+			return sk_EVP_PKEY_ASN1_METHOD_value(app_methods, idx);
+		}
+	ret = OBJ_bsearch_ameth(&t, standard_methods,
+			  sizeof(standard_methods)
+			  /sizeof(EVP_PKEY_ASN1_METHOD *));
+	if (!ret || !*ret)
+		return NULL;
+	return *ret;
+	}
+
+/* Find an implementation of an ASN1 algorithm. If 'pe' is not NULL,
+ * also search through engines and set *pe to a functional reference
+ * to the engine implementing 'type', or to NULL if no engine
+ * implements it.
+ */
+
+const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find(ENGINE **pe, int type)
+	{
+	const EVP_PKEY_ASN1_METHOD *t;
+	ENGINE *e;
+
+	for (;;)
+		{
+		t = pkey_asn1_find(type);
+		if (!t || !(t->pkey_flags & ASN1_PKEY_ALIAS))
+			break;
+		type = t->pkey_base_id;
+		}
+	if (pe)
+		{
+#ifndef OPENSSL_NO_ENGINE
+		/* type will contain the final unaliased type */
+		e = ENGINE_get_pkey_asn1_meth_engine(type);
+		if (e)
+			{
+			*pe = e;
+			return ENGINE_get_pkey_asn1_meth(e, type);
+			}
+#endif
+		*pe = NULL;
+		}
+	return t;
+	}
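
A usage sketch for the lookup above: aliases are resolved through the table first, an engine implementation is preferred when one is registered, and a non-NULL *pe is a functional reference the caller is expected to release with ENGINE_finish():

ENGINE *e = NULL;
const EVP_PKEY_ASN1_METHOD *ameth;

ameth = EVP_PKEY_asn1_find(&e, EVP_PKEY_RSA);
if (ameth != NULL)
	{
	/* ... inspect ameth, e.g. via EVP_PKEY_asn1_get0_info() ... */
	}
#ifndef OPENSSL_NO_ENGINE
if (e != NULL)
	ENGINE_finish(e);	/* release the functional reference */
#endif
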
+
+const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find_str(ENGINE **pe,
+					const char *str, int len)
+	{
+	int i;
+	const EVP_PKEY_ASN1_METHOD *ameth;
+	if (len == -1)
+		len = strlen(str);
+	if (pe)
+		{
+#ifndef OPENSSL_NO_ENGINE
+		ENGINE *e;
+		ameth = ENGINE_pkey_asn1_find_str(&e, str, len);
+		if (ameth)
+			{
+			/* Convert structural into
+			 * functional reference
+			 */
+			if (!ENGINE_init(e))
+				ameth = NULL;
+			ENGINE_free(e);
+			*pe = e;
+			return ameth;
+			}
+#endif
+		*pe = NULL;
+		}
+	for (i = 0; i < EVP_PKEY_asn1_get_count(); i++)
+		{
+		ameth = EVP_PKEY_asn1_get0(i);
+		if (ameth->pkey_flags & ASN1_PKEY_ALIAS)
+			continue;
+		if (((int)strlen(ameth->pem_str) == len) && 
+			!strncasecmp(ameth->pem_str, str, len))
+			return ameth;
+		}
+	return NULL;
+	}
+
+int EVP_PKEY_asn1_add0(const EVP_PKEY_ASN1_METHOD *ameth)
+	{
+	if (app_methods == NULL)
+		{
+		app_methods = sk_EVP_PKEY_ASN1_METHOD_new(ameth_cmp);
+		if (!app_methods)
+			return 0;
+		}
+	if (!sk_EVP_PKEY_ASN1_METHOD_push(app_methods, ameth))
+		return 0;
+	sk_EVP_PKEY_ASN1_METHOD_sort(app_methods);
+	return 1;
+	}
+
+int EVP_PKEY_asn1_add_alias(int to, int from)
+	{
+	EVP_PKEY_ASN1_METHOD *ameth;
+	ameth = EVP_PKEY_asn1_new(from, ASN1_PKEY_ALIAS, NULL, NULL);
+	if (!ameth)
+		return 0;
+	ameth->pkey_base_id = to;
+	return EVP_PKEY_asn1_add0(ameth);
+	}
+
+int EVP_PKEY_asn1_get0_info(int *ppkey_id, int *ppkey_base_id, int *ppkey_flags,
+				const char **pinfo, const char **ppem_str,
+					const EVP_PKEY_ASN1_METHOD *ameth)
+	{
+	if (!ameth)
+		return 0;
+	if (ppkey_id)
+		*ppkey_id = ameth->pkey_id;
+	if (ppkey_base_id)
+		*ppkey_base_id = ameth->pkey_base_id;
+	if (ppkey_flags)
+		*ppkey_flags = ameth->pkey_flags;
+	if (pinfo)
+		*pinfo = ameth->info;
+	if (ppem_str)
+		*ppem_str = ameth->pem_str;
+	return 1;
+	}
+
+const EVP_PKEY_ASN1_METHOD* EVP_PKEY_get0_asn1(EVP_PKEY *pkey)
+	{
+	return pkey->ameth;
+	}
+
+EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags,
+					const char *pem_str, const char *info)
+	{
+	EVP_PKEY_ASN1_METHOD *ameth;
+	ameth = OPENSSL_malloc(sizeof(EVP_PKEY_ASN1_METHOD));
+	if (!ameth)
+		return NULL;
+
+	ameth->pkey_id = id;
+	ameth->pkey_base_id = id;
+	ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC;
+
+	if (info)
+		{
+		ameth->info = BUF_strdup(info);
+		if (!ameth->info)
+			goto err;
+		}
+
+	if (pem_str)
+		{
+		ameth->pem_str = BUF_strdup(pem_str);
+		if (!ameth->pem_str)
+			goto err;
+		}
+
+	ameth->pub_decode = 0;
+	ameth->pub_encode = 0;
+	ameth->pub_cmp = 0;
+	ameth->pub_print = 0;
+
+	ameth->priv_decode = 0;
+	ameth->priv_encode = 0;
+	ameth->priv_print = 0;
+
+	ameth->old_priv_encode = 0;
+	ameth->old_priv_decode = 0;
+
+	ameth->pkey_size = 0;
+	ameth->pkey_bits = 0;
+
+	ameth->param_decode = 0;
+	ameth->param_encode = 0;
+	ameth->param_missing = 0;
+	ameth->param_copy = 0;
+	ameth->param_cmp = 0;
+	ameth->param_print = 0;
+
+	ameth->pkey_free = 0;
+	ameth->pkey_ctrl = 0;
+
+	return ameth;
+
+	err:
+
+	EVP_PKEY_asn1_free(ameth);
+	return NULL;
+
+	}
+
+void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst, 
+			const EVP_PKEY_ASN1_METHOD *src)
+	{
+
+	dst->pub_decode = src->pub_decode;
+	dst->pub_encode = src->pub_encode;
+	dst->pub_cmp = src->pub_cmp;
+	dst->pub_print = src->pub_print;
+
+	dst->priv_decode = src->priv_decode;
+	dst->priv_encode = src->priv_encode;
+	dst->priv_print = src->priv_print;
+
+	dst->old_priv_encode = src->old_priv_encode;
+	dst->old_priv_decode = src->old_priv_decode;
+
+	dst->pkey_size = src->pkey_size;
+	dst->pkey_bits = src->pkey_bits;
+
+	dst->param_decode = src->param_decode;
+	dst->param_encode = src->param_encode;
+	dst->param_missing = src->param_missing;
+	dst->param_copy = src->param_copy;
+	dst->param_cmp = src->param_cmp;
+	dst->param_print = src->param_print;
+
+	dst->pkey_free = src->pkey_free;
+	dst->pkey_ctrl = src->pkey_ctrl;
+
+	}
+
+void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth)
+	{
+	if (ameth && (ameth->pkey_flags & ASN1_PKEY_DYNAMIC))
+		{
+		if (ameth->pem_str)
+			OPENSSL_free(ameth->pem_str);
+		if (ameth->info)
+			OPENSSL_free(ameth->info);
+		OPENSSL_free(ameth);
+		}
+	}
+
+void EVP_PKEY_asn1_set_public(EVP_PKEY_ASN1_METHOD *ameth,
+		int (*pub_decode)(EVP_PKEY *pk, X509_PUBKEY *pub),
+		int (*pub_encode)(X509_PUBKEY *pub, const EVP_PKEY *pk),
+		int (*pub_cmp)(const EVP_PKEY *a, const EVP_PKEY *b),
+		int (*pub_print)(BIO *out, const EVP_PKEY *pkey, int indent,
+							ASN1_PCTX *pctx),
+		int (*pkey_size)(const EVP_PKEY *pk),
+		int (*pkey_bits)(const EVP_PKEY *pk))
+	{
+	ameth->pub_decode = pub_decode;
+	ameth->pub_encode = pub_encode;
+	ameth->pub_cmp = pub_cmp;
+	ameth->pub_print = pub_print;
+	ameth->pkey_size = pkey_size;
+	ameth->pkey_bits = pkey_bits;
+	}
+
+void EVP_PKEY_asn1_set_private(EVP_PKEY_ASN1_METHOD *ameth,
+		int (*priv_decode)(EVP_PKEY *pk, PKCS8_PRIV_KEY_INFO *p8inf),
+		int (*priv_encode)(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pk),
+		int (*priv_print)(BIO *out, const EVP_PKEY *pkey, int indent,
+							ASN1_PCTX *pctx))
+	{
+	ameth->priv_decode = priv_decode;
+	ameth->priv_encode = priv_encode;
+	ameth->priv_print = priv_print;
+	}
+
+void EVP_PKEY_asn1_set_param(EVP_PKEY_ASN1_METHOD *ameth,
+		int (*param_decode)(EVP_PKEY *pkey,
+				const unsigned char **pder, int derlen),
+		int (*param_encode)(const EVP_PKEY *pkey, unsigned char **pder),
+		int (*param_missing)(const EVP_PKEY *pk),
+		int (*param_copy)(EVP_PKEY *to, const EVP_PKEY *from),
+		int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b),
+		int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent,
+							ASN1_PCTX *pctx))
+	{
+	ameth->param_decode = param_decode;
+	ameth->param_encode = param_encode;
+	ameth->param_missing = param_missing;
+	ameth->param_copy = param_copy;
+	ameth->param_cmp = param_cmp;
+	ameth->param_print = param_print;
+	}
+
+void EVP_PKEY_asn1_set_free(EVP_PKEY_ASN1_METHOD *ameth,
+		void (*pkey_free)(EVP_PKEY *pkey))
+	{
+	ameth->pkey_free = pkey_free;
+	}
+
+void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth,
+		int (*pkey_ctrl)(EVP_PKEY *pkey, int op,
+							long arg1, void *arg2))
+	{
+	ameth->pkey_ctrl = pkey_ctrl;
+	}
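
A sketch of the registration flow these constructors and setters enable; my_nid, my_pkey_free and my_pkey_ctrl are hypothetical application-supplied names:

EVP_PKEY_ASN1_METHOD *am;

am = EVP_PKEY_asn1_new(my_nid, 0, "MYALG", "my algorithm");
if (am != NULL)
	{
	EVP_PKEY_asn1_set_free(am, my_pkey_free);
	EVP_PKEY_asn1_set_ctrl(am, my_pkey_ctrl);
	if (!EVP_PKEY_asn1_add0(am))
		EVP_PKEY_asn1_free(am);	/* add0 frees nothing on failure */
	}
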
diff --git a/src/lib/libcrypto/asn1/asn1.h b/src/lib/libcrypto/asn1/asn1.h
index 424cd348bb..e3385226d4 100644
--- a/src/lib/libcrypto/asn1/asn1.h
+++ b/src/lib/libcrypto/asn1/asn1.h
@@ -612,6 +612,7 @@ typedef struct BIT_STRING_BITNAME_st {
 			B_ASN1_GENERALIZEDTIME
 
 #define B_ASN1_PRINTABLE \
+			B_ASN1_NUMERICSTRING| \
 			B_ASN1_PRINTABLESTRING| \
 			B_ASN1_T61STRING| \
 			B_ASN1_IA5STRING| \
@@ -1217,6 +1218,7 @@ void ERR_load_ASN1_strings(void);
 #define ASN1_R_BAD_OBJECT_HEADER			 102
 #define ASN1_R_BAD_PASSWORD_READ			 103
 #define ASN1_R_BAD_TAG					 104
+#define ASN1_R_BMPSTRING_IS_WRONG_LENGTH		 210
 #define ASN1_R_BN_LIB					 105
 #define ASN1_R_BOOLEAN_IS_WRONG_LENGTH			 106
 #define ASN1_R_BUFFER_TOO_SMALL				 107
@@ -1306,6 +1308,7 @@ void ERR_load_ASN1_strings(void);
 #define ASN1_R_UNABLE_TO_DECODE_RSA_KEY			 157
 #define ASN1_R_UNABLE_TO_DECODE_RSA_PRIVATE_KEY		 158
 #define ASN1_R_UNEXPECTED_EOC				 159
+#define ASN1_R_UNIVERSALSTRING_IS_WRONG_LENGTH		 211
 #define ASN1_R_UNKNOWN_FORMAT				 160
 #define ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM		 161
 #define ASN1_R_UNKNOWN_OBJECT_TYPE			 162
diff --git a/src/lib/libcrypto/asn1/asn1_err.c b/src/lib/libcrypto/asn1/asn1_err.c
index f8a3e2e6cd..5f5de98eed 100644
--- a/src/lib/libcrypto/asn1/asn1_err.c
+++ b/src/lib/libcrypto/asn1/asn1_err.c
@@ -195,6 +195,7 @@ static ERR_STRING_DATA ASN1_str_reasons[]=
 {ERR_REASON(ASN1_R_BAD_OBJECT_HEADER)    ,"bad object header"},
 {ERR_REASON(ASN1_R_BAD_PASSWORD_READ)    ,"bad password read"},
 {ERR_REASON(ASN1_R_BAD_TAG)              ,"bad tag"},
+{ERR_REASON(ASN1_R_BMPSTRING_IS_WRONG_LENGTH),"bmpstring is wrong length"},
 {ERR_REASON(ASN1_R_BN_LIB)               ,"bn lib"},
 {ERR_REASON(ASN1_R_BOOLEAN_IS_WRONG_LENGTH),"boolean is wrong length"},
 {ERR_REASON(ASN1_R_BUFFER_TOO_SMALL)     ,"buffer too small"},
@@ -284,6 +285,7 @@ static ERR_STRING_DATA ASN1_str_reasons[]=
 {ERR_REASON(ASN1_R_UNABLE_TO_DECODE_RSA_KEY),"unable to decode rsa key"},
 {ERR_REASON(ASN1_R_UNABLE_TO_DECODE_RSA_PRIVATE_KEY),"unable to decode rsa private key"},
 {ERR_REASON(ASN1_R_UNEXPECTED_EOC)       ,"unexpected eoc"},
+{ERR_REASON(ASN1_R_UNIVERSALSTRING_IS_WRONG_LENGTH),"universalstring is wrong length"},
 {ERR_REASON(ASN1_R_UNKNOWN_FORMAT)       ,"unknown format"},
 {ERR_REASON(ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM),"unknown message digest algorithm"},
 {ERR_REASON(ASN1_R_UNKNOWN_OBJECT_TYPE)  ,"unknown object type"},
diff --git a/src/lib/libcrypto/asn1/asn1_locl.h b/src/lib/libcrypto/asn1/asn1_locl.h
new file mode 100644
index 0000000000..5aa65e28f5
--- /dev/null
+++ b/src/lib/libcrypto/asn1/asn1_locl.h
@@ -0,0 +1,134 @@
+/* asn1_locl.h */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2006.
+ */
+/* ====================================================================
+ * Copyright (c) 2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+/* Internal ASN1 structures and functions: not for application use */
+
+/* ASN1 print context structure */
+
+struct asn1_pctx_st
+	{
+	unsigned long flags;
+	unsigned long nm_flags;
+	unsigned long cert_flags;
+	unsigned long oid_flags;
+	unsigned long str_flags;
+	} /* ASN1_PCTX */;
+
+/* ASN1 public key method structure */
+
+struct evp_pkey_asn1_method_st
+	{
+	int pkey_id;
+	int pkey_base_id;
+	unsigned long pkey_flags;
+
+	char *pem_str;
+	char *info;
+
+	int (*pub_decode)(EVP_PKEY *pk, X509_PUBKEY *pub);
+	int (*pub_encode)(X509_PUBKEY *pub, const EVP_PKEY *pk);
+	int (*pub_cmp)(const EVP_PKEY *a, const EVP_PKEY *b);
+	int (*pub_print)(BIO *out, const EVP_PKEY *pkey, int indent,
+							ASN1_PCTX *pctx);
+
+	int (*priv_decode)(EVP_PKEY *pk, PKCS8_PRIV_KEY_INFO *p8inf);
+	int (*priv_encode)(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pk);
+	int (*priv_print)(BIO *out, const EVP_PKEY *pkey, int indent,
+							ASN1_PCTX *pctx);
+
+	int (*pkey_size)(const EVP_PKEY *pk);
+	int (*pkey_bits)(const EVP_PKEY *pk);
+
+	int (*param_decode)(EVP_PKEY *pkey,
+				const unsigned char **pder, int derlen);
+	int (*param_encode)(const EVP_PKEY *pkey, unsigned char **pder);
+	int (*param_missing)(const EVP_PKEY *pk);
+	int (*param_copy)(EVP_PKEY *to, const EVP_PKEY *from);
+	int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b);
+	int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent,
+							ASN1_PCTX *pctx);
+
+	void (*pkey_free)(EVP_PKEY *pkey);
+	int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2);
+
+	/* Legacy functions for old PEM */
+
+	int (*old_priv_decode)(EVP_PKEY *pkey,
+				const unsigned char **pder, int derlen);
+	int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder);
+
+	} /* EVP_PKEY_ASN1_METHOD */;
+
+/* Method to handle CRL access.
+ * In general a CRL could be very large (several MB) and can consume large
+ * amounts of resources if stored in memory by multiple processes.
+ * This method allows general CRL operations to be redirected to more
+ * efficient callbacks: for example, a CRL entry database.
+ */
+
+#define X509_CRL_METHOD_DYNAMIC		1
+
+struct x509_crl_method_st
+	{
+	int flags;
+	int (*crl_init)(X509_CRL *crl);
+	int (*crl_free)(X509_CRL *crl);
+	int (*crl_lookup)(X509_CRL *crl, X509_REVOKED **ret,
+				ASN1_INTEGER *ser, X509_NAME *issuer);
+	int (*crl_verify)(X509_CRL *crl, EVP_PKEY *pk);
+	};
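
A layout sketch against the structure above; the structure is internal, so this is purely illustrative, and the db_* callbacks are assumptions:

static int db_crl_init(X509_CRL *crl);
static int db_crl_free(X509_CRL *crl);
static int db_crl_lookup(X509_CRL *crl, X509_REVOKED **ret,
			ASN1_INTEGER *ser, X509_NAME *issuer);
static int db_crl_verify(X509_CRL *crl, EVP_PKEY *pk);

/* lookups go to an external database instead of an in-memory list */
static const struct x509_crl_method_st db_crl_method =
	{
	0,		/* flags: statically allocated */
	db_crl_init,
	db_crl_free,
	db_crl_lookup,
	db_crl_verify,
	};
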
diff --git a/src/lib/libcrypto/asn1/asn1_par.c b/src/lib/libcrypto/asn1/asn1_par.c
index 501b62a4b1..8657f73d66 100644
--- a/src/lib/libcrypto/asn1/asn1_par.c
+++ b/src/lib/libcrypto/asn1/asn1_par.c
@@ -213,6 +213,8 @@ static int asn1_parse2(BIO *bp, const unsigned char **pp, long length, int offse
 				(tag == V_ASN1_T61STRING) ||
 				(tag == V_ASN1_IA5STRING) ||
 				(tag == V_ASN1_VISIBLESTRING) ||
+				(tag == V_ASN1_NUMERICSTRING) ||
+				(tag == V_ASN1_UTF8STRING) ||
 				(tag == V_ASN1_UTCTIME) ||
 				(tag == V_ASN1_GENERALIZEDTIME))
 				{
diff --git a/src/lib/libcrypto/asn1/asn_mime.c b/src/lib/libcrypto/asn1/asn_mime.c
index bc80b20d63..d8d9e76cc0 100644
--- a/src/lib/libcrypto/asn1/asn_mime.c
+++ b/src/lib/libcrypto/asn1/asn_mime.c
@@ -152,7 +152,6 @@ static ASN1_VALUE *b64_read_asn1(BIO *bio, const ASN1_ITEM *it)
 
 static int asn1_write_micalg(BIO *out, STACK_OF(X509_ALGOR) *mdalgs)
 	{
-	const EVP_MD *md;
 	int i, have_unknown = 0, write_comma, md_nid;
 	have_unknown = 0;
 	write_comma = 0;
@@ -162,7 +161,6 @@ static int asn1_write_micalg(BIO *out, STACK_OF(X509_ALGOR) *mdalgs)
 			BIO_write(out, ",", 1);
 		write_comma = 1;
 		md_nid = OBJ_obj2nid(sk_X509_ALGOR_value(mdalgs, i)->algorithm);
-		md = EVP_get_digestbynid(md_nid);
 		switch(md_nid)
 			{
 			case NID_sha1:
diff --git a/src/lib/libcrypto/asn1/bio_asn1.c b/src/lib/libcrypto/asn1/bio_asn1.c
new file mode 100644
index 0000000000..dc7efd551c
--- /dev/null
+++ b/src/lib/libcrypto/asn1/bio_asn1.c
@@ -0,0 +1,495 @@
+/* bio_asn1.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+/* Experimental ASN1 BIO. Data written through it is converted to an
+ * ASN1 string type: the default is OCTET STRING. Additional functions
+ * can be provided to add prefix and suffix data.
+ */
+
+#include <string.h>
+#include <openssl/bio.h>
+#include <openssl/asn1.h>
+
+/* Must be large enough for biggest tag+length */
+#define DEFAULT_ASN1_BUF_SIZE 20
+
+typedef enum 
+	{
+	ASN1_STATE_START,
+	ASN1_STATE_PRE_COPY,
+	ASN1_STATE_HEADER,
+	ASN1_STATE_HEADER_COPY,
+	ASN1_STATE_DATA_COPY,
+	ASN1_STATE_POST_COPY,
+	ASN1_STATE_DONE
+	} asn1_bio_state_t;
+
+typedef struct BIO_ASN1_EX_FUNCS_st
+	{
+	asn1_ps_func	*ex_func;
+	asn1_ps_func	*ex_free_func;
+	} BIO_ASN1_EX_FUNCS;
+
+typedef struct BIO_ASN1_BUF_CTX_t
+	{
+	/* Internal state */
+	asn1_bio_state_t state;
+	/* Internal buffer */
+	unsigned char *buf;
+	/* Size of buffer */
+	int bufsize;
+	/* Current position in buffer */
+	int bufpos;
+	/* Current buffer length */
+	int buflen;
+	/* Amount of data to copy */
+	int copylen;
+	/* Class and tag to use */
+	int asn1_class, asn1_tag;
+	asn1_ps_func *prefix, *prefix_free, *suffix, *suffix_free;
+	/* Extra buffer for prefix and suffix data */
+	unsigned char *ex_buf;
+	int ex_len;
+	int ex_pos;
+	void *ex_arg;
+	} BIO_ASN1_BUF_CTX;
+
+
+static int asn1_bio_write(BIO *h, const char *buf,int num);
+static int asn1_bio_read(BIO *h, char *buf, int size);
+static int asn1_bio_puts(BIO *h, const char *str);
+static int asn1_bio_gets(BIO *h, char *str, int size);
+static long asn1_bio_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int asn1_bio_new(BIO *h);
+static int asn1_bio_free(BIO *data);
+static long asn1_bio_callback_ctrl(BIO *h, int cmd, bio_info_cb *fp);
+
+static int asn1_bio_init(BIO_ASN1_BUF_CTX *ctx, int size);
+static int asn1_bio_flush_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx,
+				asn1_ps_func *cleanup, asn1_bio_state_t next);
+static int asn1_bio_setup_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx,
+				asn1_ps_func *setup, 
+				asn1_bio_state_t ex_state,
+				asn1_bio_state_t other_state);
+
+static BIO_METHOD methods_asn1=
+	{
+	BIO_TYPE_ASN1,
+	"asn1",
+	asn1_bio_write,
+	asn1_bio_read,
+	asn1_bio_puts,
+	asn1_bio_gets,
+	asn1_bio_ctrl,
+	asn1_bio_new,
+	asn1_bio_free,
+	asn1_bio_callback_ctrl,
+	};
+
+BIO_METHOD *BIO_f_asn1(void)
+	{
+	return(&methods_asn1);
+	}
+
+
+static int asn1_bio_new(BIO *b)
+	{
+	BIO_ASN1_BUF_CTX *ctx;
+	ctx = OPENSSL_malloc(sizeof(BIO_ASN1_BUF_CTX));
+	if (!ctx)
+		return 0;
+	if (!asn1_bio_init(ctx, DEFAULT_ASN1_BUF_SIZE))
+		return 0;
+	b->init = 1;
+	b->ptr = (char *)ctx;
+	b->flags = 0;
+	return 1;
+	}
+
+static int asn1_bio_init(BIO_ASN1_BUF_CTX *ctx, int size)
+	{
+	ctx->buf = OPENSSL_malloc(size);
+	if (!ctx->buf)
+		return 0;
+	ctx->bufsize = size;
+	ctx->bufpos = 0;
+	ctx->buflen = 0;
+	ctx->copylen = 0;
+	ctx->asn1_class = V_ASN1_UNIVERSAL;
+	ctx->asn1_tag = V_ASN1_OCTET_STRING;
+	ctx->ex_buf = 0;
+	ctx->ex_pos = 0;
+	ctx->ex_len = 0;
+	ctx->state = ASN1_STATE_START;
+	return 1;
+	}
+
+static int asn1_bio_free(BIO *b)
+	{
+	BIO_ASN1_BUF_CTX *ctx;
+	ctx = (BIO_ASN1_BUF_CTX *) b->ptr;
+	if (ctx == NULL)
+		return 0;
+	if (ctx->buf)
+		OPENSSL_free(ctx->buf);
+	OPENSSL_free(ctx);
+	b->init = 0;
+	b->ptr = NULL;
+	b->flags = 0;
+	return 1;
+	}
+
+static int asn1_bio_write(BIO *b, const char *in , int inl)
+	{
+	BIO_ASN1_BUF_CTX *ctx;
+	int wrmax, wrlen, ret;
+	unsigned char *p;
+	if (!in || (inl < 0) || (b->next_bio == NULL))
+		return 0;
+	ctx = (BIO_ASN1_BUF_CTX *) b->ptr;
+	if (ctx == NULL)
+		return 0;
+
+	wrlen = 0;
+	ret = -1;
+
+	for(;;)
+		{
+		switch (ctx->state)
+			{
+
+			/* Setup prefix data, call it */
+			case ASN1_STATE_START:
+			if (!asn1_bio_setup_ex(b, ctx, ctx->prefix,
+				ASN1_STATE_PRE_COPY, ASN1_STATE_HEADER))
+				return 0;
+			break;
+
+			/* Copy any pre data first */
+			case ASN1_STATE_PRE_COPY:
+
+			ret = asn1_bio_flush_ex(b, ctx, ctx->prefix_free,
+							ASN1_STATE_HEADER);
+
+			if (ret <= 0)
+				goto done;
+
+			break;
+
+			case ASN1_STATE_HEADER:
+			ctx->buflen =
+				ASN1_object_size(0, inl, ctx->asn1_tag) - inl;
+			OPENSSL_assert(ctx->buflen <= ctx->bufsize);
+			p = ctx->buf;
+			ASN1_put_object(&p, 0, inl,
+					ctx->asn1_tag, ctx->asn1_class);
+			ctx->copylen = inl;
+			ctx->state = ASN1_STATE_HEADER_COPY;
+
+			break;
+
+			case ASN1_STATE_HEADER_COPY:	
+			ret = BIO_write(b->next_bio,
+					ctx->buf + ctx->bufpos, ctx->buflen);
+			if (ret <= 0)
+				goto done;
+
+			ctx->buflen -= ret;
+			if (ctx->buflen)
+				ctx->bufpos += ret;
+			else
+				{
+				ctx->bufpos = 0;
+				ctx->state = ASN1_STATE_DATA_COPY;
+				}
+
+			break;
+
+			case ASN1_STATE_DATA_COPY:
+
+			if (inl > ctx->copylen)
+				wrmax = ctx->copylen;
+			else
+				wrmax = inl;
+			ret = BIO_write(b->next_bio, in, wrmax);
+			if (ret <= 0)
+				break;
+			wrlen += ret;
+			ctx->copylen -= ret;
+			in += ret;
+			inl -= ret;
+
+			if (ctx->copylen == 0)
+				ctx->state = ASN1_STATE_HEADER;
+
+			if (inl == 0)
+				goto done;
+
+			break;
+
+			default:
+			BIO_clear_retry_flags(b);
+			return 0;
+
+			}
+
+		}
+
+	done:
+	BIO_clear_retry_flags(b);
+	BIO_copy_next_retry(b);
+
+	return (wrlen > 0) ? wrlen : ret;
+
+	}
+
+static int asn1_bio_flush_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx,
+				asn1_ps_func *cleanup, asn1_bio_state_t next)
+	{
+	int ret;
+	if (ctx->ex_len <= 0)
+		return 1;
+	for(;;)
+		{
+		ret = BIO_write(b->next_bio, ctx->ex_buf + ctx->ex_pos,
+								ctx->ex_len);
+		if (ret <= 0)
+			break;
+		ctx->ex_len -= ret;
+		if (ctx->ex_len > 0)
+			ctx->ex_pos += ret;
+		else
+			{
+			if(cleanup)
+				cleanup(b, &ctx->ex_buf, &ctx->ex_len,
+								&ctx->ex_arg);
+			ctx->state = next;
+			ctx->ex_pos = 0;
+			break;
+			}
+		}
+	return ret;
+	}
+
+static int asn1_bio_setup_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx,
+				asn1_ps_func *setup, 
+				asn1_bio_state_t ex_state,
+				asn1_bio_state_t other_state)
+	{
+	if (setup && !setup(b, &ctx->ex_buf, &ctx->ex_len, &ctx->ex_arg))
+		{
+		BIO_clear_retry_flags(b);
+		return 0;
+		}
+	if (ctx->ex_len > 0)
+		ctx->state = ex_state;
+	else
+		ctx->state = other_state;
+	return 1;
+	}
+
+static int asn1_bio_read(BIO *b, char *in , int inl)
+	{
+	if (!b->next_bio)
+		return 0;
+	return BIO_read(b->next_bio, in , inl);
+	}
+
+static int asn1_bio_puts(BIO *b, const char *str)
+	{
+	return asn1_bio_write(b, str, strlen(str));
+	}
+
+static int asn1_bio_gets(BIO *b, char *str, int size)
+	{
+	if (!b->next_bio)
+		return 0;
+	return BIO_gets(b->next_bio, str , size);
+	}
+
+static long asn1_bio_callback_ctrl(BIO *b, int cmd, bio_info_cb *fp)
+	{
+	if (b->next_bio == NULL) return(0);
+	return BIO_callback_ctrl(b->next_bio,cmd,fp);
+	}
+
+static long asn1_bio_ctrl(BIO *b, int cmd, long arg1, void *arg2)
+	{
+	BIO_ASN1_BUF_CTX *ctx;
+	BIO_ASN1_EX_FUNCS *ex_func;
+	long ret = 1;
+	ctx = (BIO_ASN1_BUF_CTX *) b->ptr;
+	if (ctx == NULL)
+		return 0;
+	switch(cmd)
+		{
+
+		case BIO_C_SET_PREFIX:
+		ex_func = arg2;
+		ctx->prefix  = ex_func->ex_func;
+		ctx->prefix_free  = ex_func->ex_free_func;
+		break;
+
+		case BIO_C_GET_PREFIX:
+		ex_func = arg2;
+		ex_func->ex_func = ctx->prefix;
+		ex_func->ex_free_func = ctx->prefix_free;
+		break;
+
+		case BIO_C_SET_SUFFIX:
+		ex_func = arg2;
+		ctx->suffix  = ex_func->ex_func;
+		ctx->suffix_free  = ex_func->ex_free_func;
+		break;
+
+		case BIO_C_GET_SUFFIX:
+		ex_func = arg2;
+		ex_func->ex_func = ctx->suffix;
+		ex_func->ex_free_func = ctx->suffix_free;
+		break;
+
+		case BIO_C_SET_EX_ARG:
+		ctx->ex_arg = arg2;
+		break;
+
+		case BIO_C_GET_EX_ARG:
+		*(void **)arg2 = ctx->ex_arg;
+		break;
+
+		case BIO_CTRL_FLUSH:
+		if (!b->next_bio)
+			return 0;
+
+		/* Call post function if possible */
+		if (ctx->state == ASN1_STATE_HEADER)
+			{
+			if (!asn1_bio_setup_ex(b, ctx, ctx->suffix,
+				ASN1_STATE_POST_COPY, ASN1_STATE_DONE))
+				return 0;
+			}
+
+		if (ctx->state == ASN1_STATE_POST_COPY)
+			{
+			ret = asn1_bio_flush_ex(b, ctx, ctx->suffix_free,
+							ASN1_STATE_DONE);
+			if (ret <= 0)
+				return ret;
+			}
+
+		if (ctx->state == ASN1_STATE_DONE)
+			return BIO_ctrl(b->next_bio, cmd, arg1, arg2);
+		else
+			{
+			BIO_clear_retry_flags(b);
+			return 0;
+			}
+		break;
+
+
+		default:
+		if (!b->next_bio)
+			return 0;
+		return BIO_ctrl(b->next_bio, cmd, arg1, arg2);
+
+		}
+
+	return ret;
+	}
+
+static int asn1_bio_set_ex(BIO *b, int cmd,
+		asn1_ps_func *ex_func, asn1_ps_func *ex_free_func)
+	{
+	BIO_ASN1_EX_FUNCS extmp;
+	extmp.ex_func = ex_func;
+	extmp.ex_free_func = ex_free_func;
+	return BIO_ctrl(b, cmd, 0, &extmp);
+	}
+
+static int asn1_bio_get_ex(BIO *b, int cmd,
+		asn1_ps_func **ex_func, asn1_ps_func **ex_free_func)
+	{
+	BIO_ASN1_EX_FUNCS extmp;
+	int ret;
+	ret = BIO_ctrl(b, cmd, 0, &extmp);
+	if (ret > 0)
+		{
+		*ex_func = extmp.ex_func;
+		*ex_free_func = extmp.ex_free_func;
+		}
+	return ret;
+	}
+
+int BIO_asn1_set_prefix(BIO *b, asn1_ps_func *prefix, asn1_ps_func *prefix_free)
+	{
+	return asn1_bio_set_ex(b, BIO_C_SET_PREFIX, prefix, prefix_free);
+	}
+
+int BIO_asn1_get_prefix(BIO *b, asn1_ps_func **pprefix, asn1_ps_func **pprefix_free)
+	{
+	return asn1_bio_get_ex(b, BIO_C_GET_PREFIX, pprefix, pprefix_free);
+	}
+
+int BIO_asn1_set_suffix(BIO *b, asn1_ps_func *suffix, asn1_ps_func *suffix_free)
+	{
+	return asn1_bio_set_ex(b, BIO_C_SET_SUFFIX, suffix, suffix_free);
+	}
+
+int BIO_asn1_get_suffix(BIO *b, asn1_ps_func **psuffix, asn1_ps_func **psuffix_free)
+	{
+	return asn1_bio_get_ex(b, BIO_C_GET_SUFFIX, psuffix, psuffix_free);
+	}
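
A minimal sketch of the filter in use; each pass of BIO_write() through the state machine emits its own tag/length header ahead of the data:

BIO *out = BIO_new_fp(stdout, BIO_NOCLOSE);
BIO *asn = BIO_new(BIO_f_asn1());

BIO_push(asn, out);		/* asn1 filter sits above the sink */
BIO_write(asn, "payload", 7);	/* wrapped as an OCTET STRING TLV */
BIO_flush(asn);			/* runs the suffix callback, if any */
BIO_free_all(asn);
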
diff --git a/src/lib/libcrypto/asn1/bio_ndef.c b/src/lib/libcrypto/asn1/bio_ndef.c
new file mode 100644
index 0000000000..370389b1e6
--- /dev/null
+++ b/src/lib/libcrypto/asn1/bio_ndef.c
@@ -0,0 +1,246 @@
+/* bio_ndef.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ */
+
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+#include <openssl/bio.h>
+#include <openssl/err.h>
+
+#ifndef OPENSSL_SYSNAME_NETWARE
+#include <memory.h>
+#endif
+#include <stdio.h>
+
+/* Experimental NDEF ASN1 BIO support routines */
+
+/* The usage is quite simple: initialize an ASN1 structure,
+ * get a BIO from it, and any data written through the BIO
+ * will end up translated to the appropriate format on the fly.
+ * The data is streamed out and does *not* need to be
+ * all held in memory at once.
+ *
+ * When the BIO is flushed the output is finalized and any
+ * signatures etc. written out.
+ *
+ * The BIO is a 'proper' BIO and can handle non-blocking I/O
+ * correctly.
+ *
+ * The usage is simple. The implementation is *not*...
+ */
+
+/* BIO support data stored in the ASN1 BIO ex_arg */
+
+typedef struct ndef_aux_st
+	{
+	/* ASN1 structure this BIO refers to */
+	ASN1_VALUE *val;
+	const ASN1_ITEM *it;
+	/* Top of the BIO chain */
+	BIO *ndef_bio;
+	/* Output BIO */
+	BIO *out;
+	/* Boundary where content is inserted */
+	unsigned char **boundary;
+	/* DER buffer start */
+	unsigned char *derbuf;
+	} NDEF_SUPPORT;
+
+static int ndef_prefix(BIO *b, unsigned char **pbuf, int *plen, void *parg);
+static int ndef_prefix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg);
+static int ndef_suffix(BIO *b, unsigned char **pbuf, int *plen, void *parg);
+static int ndef_suffix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg);
+
+BIO *BIO_new_NDEF(BIO *out, ASN1_VALUE *val, const ASN1_ITEM *it)
+	{
+	NDEF_SUPPORT *ndef_aux = NULL;
+	BIO *asn_bio = NULL;
+	const ASN1_AUX *aux = it->funcs;
+	ASN1_STREAM_ARG sarg;
+
+	if (!aux || !aux->asn1_cb)
+		{
+		ASN1err(ASN1_F_BIO_NEW_NDEF, ASN1_R_STREAMING_NOT_SUPPORTED);
+		return NULL;
+		}
+	ndef_aux = OPENSSL_malloc(sizeof(NDEF_SUPPORT));
+	asn_bio = BIO_new(BIO_f_asn1());
+
+	/* ASN1 bio needs to be next to output BIO */
+
+	out = BIO_push(asn_bio, out);
+
+	if (!ndef_aux || !asn_bio || !out)
+		goto err;
+
+	BIO_asn1_set_prefix(asn_bio, ndef_prefix, ndef_prefix_free);
+	BIO_asn1_set_suffix(asn_bio, ndef_suffix, ndef_suffix_free);
+
+	/* Now let callback prepend any digest, cipher etc BIOs
+	 * ASN1 structure needs.
+	 */
+
+	sarg.out = out;
+	sarg.ndef_bio = NULL;
+	sarg.boundary = NULL;
+
+	if (aux->asn1_cb(ASN1_OP_STREAM_PRE, &val, it, &sarg) <= 0)
+		goto err;
+
+	ndef_aux->val = val;
+	ndef_aux->it = it;
+	ndef_aux->ndef_bio = sarg.ndef_bio;
+	ndef_aux->boundary = sarg.boundary;
+	ndef_aux->out = out;
+
+	BIO_ctrl(asn_bio, BIO_C_SET_EX_ARG, 0, ndef_aux);
+
+	return sarg.ndef_bio;
+
+	err:
+	if (asn_bio)
+		BIO_free(asn_bio);
+	if (ndef_aux)
+		OPENSSL_free(ndef_aux);
+	return NULL;
+	}
+
+static int ndef_prefix(BIO *b, unsigned char **pbuf, int *plen, void *parg)
+	{
+	NDEF_SUPPORT *ndef_aux;
+	unsigned char *p;
+	int derlen;
+
+	if (!parg)
+		return 0;
+
+	ndef_aux = *(NDEF_SUPPORT **)parg;
+
+	derlen = ASN1_item_ndef_i2d(ndef_aux->val, NULL, ndef_aux->it);
+	p = OPENSSL_malloc(derlen);
+	ndef_aux->derbuf = p;
+	*pbuf = p;
+	derlen = ASN1_item_ndef_i2d(ndef_aux->val, &p, ndef_aux->it);
+
+	if (!*ndef_aux->boundary)
+		return 0;
+
+	*plen = *ndef_aux->boundary - *pbuf;
+
+	return 1;
+	}
+
+static int ndef_prefix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg)
+	{
+	NDEF_SUPPORT *ndef_aux;
+
+	if (!parg)
+		return 0;
+
+	ndef_aux = *(NDEF_SUPPORT **)parg;
+
+	if (ndef_aux->derbuf)
+		OPENSSL_free(ndef_aux->derbuf);
+
+	ndef_aux->derbuf = NULL;
+	*pbuf = NULL;
+	*plen = 0;
+	return 1;
+	}
+
+static int ndef_suffix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg)
+	{
+	NDEF_SUPPORT **pndef_aux = (NDEF_SUPPORT **)parg;
+	if (!ndef_prefix_free(b, pbuf, plen, parg))
+		return 0;
+	OPENSSL_free(*pndef_aux);
+	*pndef_aux = NULL;
+	return 1;
+	}
+
+static int ndef_suffix(BIO *b, unsigned char **pbuf, int *plen, void *parg)
+	{
+	NDEF_SUPPORT *ndef_aux;
+	unsigned char *p;
+	int derlen;
+	const ASN1_AUX *aux;
+	ASN1_STREAM_ARG sarg;
+
+	if (!parg)
+		return 0;
+
+	ndef_aux = *(NDEF_SUPPORT **)parg;
+
+	aux = ndef_aux->it->funcs;
+
+	/* Finalize structures */
+	sarg.ndef_bio = ndef_aux->ndef_bio;
+	sarg.out = ndef_aux->out;
+	sarg.boundary = ndef_aux->boundary;
+	if (aux->asn1_cb(ASN1_OP_STREAM_POST,
+				&ndef_aux->val, ndef_aux->it, &sarg) <= 0)
+		return 0;
+
+	derlen = ASN1_item_ndef_i2d(ndef_aux->val, NULL, ndef_aux->it);
+	p = OPENSSL_malloc(derlen);
+	ndef_aux->derbuf = p;
+	*pbuf = p;
+	derlen = ASN1_item_ndef_i2d(ndef_aux->val, &p, ndef_aux->it);
+
+	if (!*ndef_aux->boundary)
+		return 0;
+	*pbuf = *ndef_aux->boundary;
+	*plen = derlen - (*ndef_aux->boundary - ndef_aux->derbuf);
+
+	return 1;
+	}
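
A streaming sketch against BIO_new_NDEF(); val/it stand for an ASN1_VALUE/ASN1_ITEM pair whose asn1_cb implements the ASN1_OP_STREAM_* operations, and they, like the data buffer, are assumptions of the example:

BIO *out = BIO_new_fp(stdout, BIO_NOCLOSE);
BIO *ndef = BIO_new_NDEF(out, val, it);

if (ndef != NULL)
	{
	/* content is streamed, never held in memory in full */
	BIO_write(ndef, buf, buflen);
	BIO_flush(ndef);	/* finalize: the suffix callback runs here */
	}
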
diff --git a/src/lib/libcrypto/asn1/t_x509.c b/src/lib/libcrypto/asn1/t_x509.c
index 8b09e5890f..8f746f9c05 100644
--- a/src/lib/libcrypto/asn1/t_x509.c
+++ b/src/lib/libcrypto/asn1/t_x509.c
@@ -332,7 +332,7 @@ int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
 int ASN1_STRING_print(BIO *bp, ASN1_STRING *v)
 	{
 	int i,n;
-	char buf[80],*p;;
+	char buf[80],*p;
 
 	if (v == NULL) return(0);
 	n=0;
diff --git a/src/lib/libcrypto/asn1/tasn_dec.c b/src/lib/libcrypto/asn1/tasn_dec.c
index ced641698e..48bc1c0d4d 100644
--- a/src/lib/libcrypto/asn1/tasn_dec.c
+++ b/src/lib/libcrypto/asn1/tasn_dec.c
@@ -69,7 +69,7 @@ static int asn1_check_eoc(const unsigned char **in, long len);
 static int asn1_find_end(const unsigned char **in, long len, char inf);
 
 static int asn1_collect(BUF_MEM *buf, const unsigned char **in, long len,
-				char inf, int tag, int aclass);
+			char inf, int tag, int aclass, int depth);
 
 static int collect_data(BUF_MEM *buf, const unsigned char **p, long plen);
 
@@ -611,7 +611,6 @@ static int asn1_template_ex_d2i(ASN1_VALUE **val,
 
 	err:
 	ASN1_template_free(val, tt);
-	*val = NULL;
 	return 0;
 	}
 
@@ -758,7 +757,6 @@ static int asn1_template_noexp_d2i(ASN1_VALUE **val,
 
 	err:
 	ASN1_template_free(val, tt);
-	*val = NULL;
 	return 0;
 	}
 
@@ -878,7 +876,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
 		 * internally irrespective of the type. So instead just check
 		 * for UNIVERSAL class and ignore the tag.
 		 */
-		if (!asn1_collect(&buf, &p, plen, inf, -1, V_ASN1_UNIVERSAL))
+		if (!asn1_collect(&buf, &p, plen, inf, -1, V_ASN1_UNIVERSAL, 0))
 			{
 			free_cont = 1;
 			goto err;
@@ -1012,6 +1010,18 @@ int asn1_ex_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
 		case V_ASN1_SET:
 		case V_ASN1_SEQUENCE:
 		default:
+		if (utype == V_ASN1_BMPSTRING && (len & 1))
+			{
+			ASN1err(ASN1_F_ASN1_EX_C2I,
+					ASN1_R_BMPSTRING_IS_WRONG_LENGTH);
+			goto err;
+			}
+		if (utype == V_ASN1_UNIVERSALSTRING && (len & 3))
+			{
+			ASN1err(ASN1_F_ASN1_EX_C2I,
+					ASN1_R_UNIVERSALSTRING_IS_WRONG_LENGTH);
+			goto err;
+			}
 		/* All based on ASN1_STRING and handled the same */
 		if (!*pval)
 			{
@@ -1128,8 +1138,18 @@ static int asn1_find_end(const unsigned char **in, long len, char inf)
  * if it is indefinite length.
  */
 
+#ifndef ASN1_MAX_STRING_NEST
+/* This determines how many levels of recursion are permitted in ASN1
+ * string types. If it is not limited, stack overflows can occur. If set
+ * to zero, no recursion is allowed at all. Although zero should be
+ * adequate, examples exist that require a value of 1, so 5 should be
+ * more than enough.
+ */
+#define ASN1_MAX_STRING_NEST 5
+#endif
+
+
 static int asn1_collect(BUF_MEM *buf, const unsigned char **in, long len,
-				char inf, int tag, int aclass)
+			char inf, int tag, int aclass, int depth)
 	{
 	const unsigned char *p, *q;
 	long plen;
@@ -1171,13 +1191,15 @@ static int asn1_collect(BUF_MEM *buf, const unsigned char **in, long len,
 		/* If indefinite length constructed update max length */
 		if (cst)
 			{
-#ifdef OPENSSL_ALLOW_NESTED_ASN1_STRINGS
-			if (!asn1_collect(buf, &p, plen, ininf, tag, aclass))
+			if (depth >= ASN1_MAX_STRING_NEST)
+				{
+				ASN1err(ASN1_F_ASN1_COLLECT,
+					ASN1_R_NESTED_ASN1_STRING);
+				return 0;
+				}
+			if (!asn1_collect(buf, &p, plen, ininf, tag, aclass,
+						depth + 1))
 				return 0;
-#else
-			ASN1err(ASN1_F_ASN1_COLLECT, ASN1_R_NESTED_ASN1_STRING);
-			return 0;
-#endif
 			}
 		else if (plen && !collect_data(buf, &p, plen))
 			return 0;
diff --git a/src/lib/libcrypto/asn1/x_nx509.c b/src/lib/libcrypto/asn1/x_nx509.c
new file mode 100644
index 0000000000..fbd9a22db3
--- /dev/null
+++ b/src/lib/libcrypto/asn1/x_nx509.c
@@ -0,0 +1,72 @@
+/* x_nx509.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2005.
+ */
+/* ====================================================================
+ * Copyright (c) 2005 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <stddef.h>
+#include <openssl/x509.h>
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+
+/* Old netscape certificate wrapper format */
+
+ASN1_SEQUENCE(NETSCAPE_X509) = {
+	ASN1_SIMPLE(NETSCAPE_X509, header, ASN1_OCTET_STRING),
+	ASN1_OPT(NETSCAPE_X509, cert, X509)
+} ASN1_SEQUENCE_END(NETSCAPE_X509)
+
+IMPLEMENT_ASN1_FUNCTIONS(NETSCAPE_X509)
+
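
IMPLEMENT_ASN1_FUNCTIONS generates the usual new/free/d2i/i2d quartet for the template above; a round-trip sketch:

NETSCAPE_X509 *nx = NETSCAPE_X509_new();
unsigned char *der = NULL;
int len;

/* ... fill in nx->header and, optionally, nx->cert ... */
len = i2d_NETSCAPE_X509(nx, &der);	/* allocates der on success */
if (len > 0)
	OPENSSL_free(der);
NETSCAPE_X509_free(nx);
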
diff --git a/src/lib/libcrypto/bio/bss_mem.c b/src/lib/libcrypto/bio/bss_mem.c
index a4edb711ae..e7ab9cb3a3 100644
--- a/src/lib/libcrypto/bio/bss_mem.c
+++ b/src/lib/libcrypto/bio/bss_mem.c
@@ -284,6 +284,7 @@ static int mem_gets(BIO *bp, char *buf, int size)
 
 	BIO_clear_retry_flags(bp);
 	j=bm->length;
+	if ((size-1) < j) j=size-1;
 	if (j <= 0)
 		{
 		*buf='\0';
@@ -292,17 +293,18 @@ static int mem_gets(BIO *bp, char *buf, int size)
 	p=bm->data;
 	for (i=0; i<j; i++)
 		{
-		if (p[i] == '\n') break;
-		}
-	if (i == j)
-		{
-		BIO_set_retry_read(bp);
-		/* return(-1);  change the semantics 0.6.6a */ 
+		if (p[i] == '\n')
+			{
+			i++;
+			break;
+			}
 		}
-	else
-		i++;
-	/* i is the max to copy */
-	if ((size-1) < i) i=size-1;
+
+	/*
+	 * i is now the maximum number of bytes to copy, either j or up
+	 * to and including the first newline
+	 */
+
 	i=mem_read(bp,buf,i);
 	if (i > 0) buf[i]='\0';
 	ret=i;
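
A sketch of the corrected semantics: the size limit is now applied before the newline scan, so a short buffer returns a partial line instead of setting the retry flag:

static char data[] = "line one\nline two\n";
BIO *mb = BIO_new_mem_buf(data, -1);
char buf[8];
int n;

n = BIO_gets(mb, buf, sizeof(buf));	/* n == 7, buf == "line on" */
n = BIO_gets(mb, buf, sizeof(buf));	/* n == 2, buf == "e\n" */
BIO_free(mb);
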
diff --git a/src/lib/libcrypto/bio/bss_sock.c b/src/lib/libcrypto/bio/bss_sock.c
index 472dd75821..30c3ceab46 100644
--- a/src/lib/libcrypto/bio/bss_sock.c
+++ b/src/lib/libcrypto/bio/bss_sock.c
@@ -60,6 +60,9 @@
 #include <errno.h>
 #define USE_SOCKETS
 #include "cryptlib.h"
+
+#ifndef OPENSSL_NO_SOCK
+
 #include <openssl/bio.h>
 
 #ifdef WATT32
@@ -300,3 +303,5 @@ int BIO_sock_non_fatal_error(int err)
 		}
 	return(0);
 	}
+
+#endif  /* #ifndef OPENSSL_NO_SOCK */
diff --git a/src/lib/libcrypto/bn/asm/alpha-mont.pl b/src/lib/libcrypto/bn/asm/alpha-mont.pl
new file mode 100644
index 0000000000..7a2cc3173b
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/alpha-mont.pl
@@ -0,0 +1,317 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# On 21264 RSA sign performance improves by 70/35/20/15 percent for
+# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
+# instructed to '-tune host' code with in-line assembler. Other
+# benchmarks improve by 15-20%. To anchor it to something else, the
+# code provides approximately the same performance per GHz as AMD64.
+# That is, if you compare a 1GHz 21264 and a 2GHz Opteron, you'll
+# observe a ~2x difference.
+
+# int bn_mul_mont(
+$rp="a0";	# BN_ULONG *rp,
+$ap="a1";	# const BN_ULONG *ap,
+$bp="a2";	# const BN_ULONG *bp,
+$np="a3";	# const BN_ULONG *np,
+$n0="a4";	# const BN_ULONG *n0,
+$num="a5";	# int num);
+
+$lo0="t0";
+$hi0="t1";
+$lo1="t2";
+$hi1="t3";
+$aj="t4";
+$bi="t5";
+$nj="t6";
+$tp="t7";
+$alo="t8";
+$ahi="t9";
+$nlo="t10";
+$nhi="t11";
+$tj="t12";
+$i="s3";
+$j="s4";
+$m1="s5";
+
+$code=<<___;
+#include <asm.h>
+#include <regdef.h>
+
+.text
+
+.set	noat
+.set	noreorder
+
+.globl	bn_mul_mont
+.align	5
+.ent	bn_mul_mont
+bn_mul_mont:
+	lda	sp,-40(sp)
+	stq	ra,0(sp)
+	stq	s3,8(sp)
+	stq	s4,16(sp)
+	stq	s5,24(sp)
+	stq	fp,32(sp)
+	mov	sp,fp
+	.mask	0x0400f000,-40
+	.frame	fp,40,ra
+	.prologue 0
+
+	.align	4
+	.set	reorder
+	sextl	$num,$num
+	mov	0,v0
+	cmplt	$num,4,AT
+	bne	AT,.Lexit
+
+	ldq	$hi0,0($ap)	# ap[0]
+	s8addq	$num,16,AT
+	ldq	$aj,8($ap)
+	subq	sp,AT,sp
+	ldq	$bi,0($bp)	# bp[0]
+	mov	-4096,AT
+	ldq	$n0,0($n0)
+	and	sp,AT,sp
+
+	mulq	$hi0,$bi,$lo0
+	ldq	$hi1,0($np)	# np[0]
+	umulh	$hi0,$bi,$hi0
+	ldq	$nj,8($np)
+
+	mulq	$lo0,$n0,$m1
+
+	mulq	$hi1,$m1,$lo1
+	umulh	$hi1,$m1,$hi1
+
+	addq	$lo1,$lo0,$lo1
+	cmpult	$lo1,$lo0,AT
+	addq	$hi1,AT,$hi1
+
+	mulq	$aj,$bi,$alo
+	mov	2,$j
+	umulh	$aj,$bi,$ahi
+	mov	sp,$tp
+
+	mulq	$nj,$m1,$nlo
+	s8addq	$j,$ap,$aj
+	umulh	$nj,$m1,$nhi
+	s8addq	$j,$np,$nj
+.align	4
+.L1st:
+	.set	noreorder
+	ldq	$aj,($aj)
+	addl	$j,1,$j
+	ldq	$nj,($nj)
+	lda	$tp,8($tp)
+
+	addq	$alo,$hi0,$lo0
+	mulq	$aj,$bi,$alo
+	cmpult	$lo0,$hi0,AT
+	addq	$nlo,$hi1,$lo1
+
+	mulq	$nj,$m1,$nlo
+	addq	$ahi,AT,$hi0
+	cmpult	$lo1,$hi1,v0
+	cmplt	$j,$num,$tj
+
+	umulh	$aj,$bi,$ahi
+	addq	$nhi,v0,$hi1
+	addq	$lo1,$lo0,$lo1
+	s8addq	$j,$ap,$aj
+
+	umulh	$nj,$m1,$nhi
+	cmpult	$lo1,$lo0,v0
+	addq	$hi1,v0,$hi1
+	s8addq	$j,$np,$nj
+
+	stq	$lo1,-8($tp)
+	nop
+	unop
+	bne	$tj,.L1st
+	.set	reorder
+
+	addq	$alo,$hi0,$lo0
+	addq	$nlo,$hi1,$lo1
+	cmpult	$lo0,$hi0,AT
+	cmpult	$lo1,$hi1,v0
+	addq	$ahi,AT,$hi0
+	addq	$nhi,v0,$hi1
+
+	addq	$lo1,$lo0,$lo1
+	cmpult	$lo1,$lo0,v0
+	addq	$hi1,v0,$hi1
+
+	stq	$lo1,0($tp)
+
+	addq	$hi1,$hi0,$hi1
+	cmpult	$hi1,$hi0,AT
+	stq	$hi1,8($tp)
+	stq	AT,16($tp)
+
+	mov	1,$i
+.align	4
+.Louter:
+	s8addq	$i,$bp,$bi
+	ldq	$hi0,($ap)
+	ldq	$aj,8($ap)
+	ldq	$bi,($bi)
+	ldq	$hi1,($np)
+	ldq	$nj,8($np)
+	ldq	$tj,(sp)
+
+	mulq	$hi0,$bi,$lo0
+	umulh	$hi0,$bi,$hi0
+
+	addq	$lo0,$tj,$lo0
+	cmpult	$lo0,$tj,AT
+	addq	$hi0,AT,$hi0
+
+	mulq	$lo0,$n0,$m1
+
+	mulq	$hi1,$m1,$lo1
+	umulh	$hi1,$m1,$hi1
+
+	addq	$lo1,$lo0,$lo1
+	cmpult	$lo1,$lo0,AT
+	mov	2,$j
+	addq	$hi1,AT,$hi1
+
+	mulq	$aj,$bi,$alo
+	mov	sp,$tp
+	umulh	$aj,$bi,$ahi
+
+	mulq	$nj,$m1,$nlo
+	s8addq	$j,$ap,$aj
+	umulh	$nj,$m1,$nhi
+.align	4
+.Linner:
+	.set	noreorder
+	ldq	$tj,8($tp)	#L0
+	nop			#U1
+	ldq	$aj,($aj)	#L1
+	s8addq	$j,$np,$nj	#U0
+
+	ldq	$nj,($nj)	#L0
+	nop			#U1
+	addq	$alo,$hi0,$lo0	#L1
+	lda	$tp,8($tp)
+
+	mulq	$aj,$bi,$alo	#U1
+	cmpult	$lo0,$hi0,AT	#L0
+	addq	$nlo,$hi1,$lo1	#L1
+	addl	$j,1,$j
+
+	mulq	$nj,$m1,$nlo	#U1
+	addq	$ahi,AT,$hi0	#L0
+	addq	$lo0,$tj,$lo0	#L1
+	cmpult	$lo1,$hi1,v0	#U0
+
+	umulh	$aj,$bi,$ahi	#U1
+	cmpult	$lo0,$tj,AT	#L0
+	addq	$lo1,$lo0,$lo1	#L1
+	addq	$nhi,v0,$hi1	#U0
+
+	umulh	$nj,$m1,$nhi	#U1
+	s8addq	$j,$ap,$aj	#L0
+	cmpult	$lo1,$lo0,v0	#L1
+	cmplt	$j,$num,$tj	#U0	# borrow $tj
+
+	addq	$hi0,AT,$hi0	#L0
+	addq	$hi1,v0,$hi1	#U1
+	stq	$lo1,-8($tp)	#L1
+	bne	$tj,.Linner	#U0
+	.set	reorder
+
+	ldq	$tj,8($tp)
+	addq	$alo,$hi0,$lo0
+	addq	$nlo,$hi1,$lo1
+	cmpult	$lo0,$hi0,AT
+	cmpult	$lo1,$hi1,v0
+	addq	$ahi,AT,$hi0
+	addq	$nhi,v0,$hi1
+
+	addq	$lo0,$tj,$lo0
+	cmpult	$lo0,$tj,AT
+	addq	$hi0,AT,$hi0
+
+	ldq	$tj,16($tp)
+	addq	$lo1,$lo0,$j
+	cmpult	$j,$lo0,v0
+	addq	$hi1,v0,$hi1
+
+	addq	$hi1,$hi0,$lo1
+	stq	$j,($tp)
+	cmpult	$lo1,$hi0,$hi1
+	addq	$lo1,$tj,$lo1
+	cmpult	$lo1,$tj,AT
+	addl	$i,1,$i
+	addq	$hi1,AT,$hi1
+	stq	$lo1,8($tp)
+	cmplt	$i,$num,$tj	# borrow $tj
+	stq	$hi1,16($tp)
+	bne	$tj,.Louter
+
+	s8addq	$num,sp,$tj	# &tp[num]
+	mov	$rp,$bp		# put rp aside
+	mov	sp,$tp
+	mov	sp,$ap
+	mov	0,$hi0		# clear borrow bit
+
+.align	4
+.Lsub:	ldq	$lo0,($tp)
+	ldq	$lo1,($np)
+	lda	$tp,8($tp)
+	lda	$np,8($np)
+	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
+	cmpult	$lo0,$lo1,AT
+	subq	$lo1,$hi0,$lo0
+	cmpult	$lo1,$lo0,$hi0
+	or	$hi0,AT,$hi0
+	stq	$lo0,($rp)
+	cmpult	$tp,$tj,v0
+	lda	$rp,8($rp)
+	bne	v0,.Lsub
+
+	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
+	mov	sp,$tp
+	mov	$bp,$rp		# restore rp
+
+	and	sp,$hi0,$ap
+	bic	$bp,$hi0,$bp
+	bis	$bp,$ap,$ap	# ap=borrow?tp:rp
+
+.align	4
+.Lcopy:	ldq	$aj,($ap)	# copy or in-place refresh
+	lda	$tp,8($tp)
+	lda	$rp,8($rp)
+	lda	$ap,8($ap)
+	stq	zero,-8($tp)	# zap tp
+	cmpult	$tp,$tj,AT
+	stq	$aj,-8($rp)
+	bne	AT,.Lcopy
+	mov	1,v0
+
+.Lexit:
+	.set	noreorder
+	mov	fp,sp
+	/*ldq	ra,0(sp)*/
+	ldq	s3,8(sp)
+	ldq	s4,16(sp)
+	ldq	s5,24(sp)
+	ldq	fp,32(sp)
+	lda	sp,40(sp)
+	ret	(ra)
+.end	bn_mul_mont
+.rdata
+.asciiz	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl
new file mode 100644
index 0000000000..05d5dc1a48
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/armv4-mont.pl
@@ -0,0 +1,200 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2007.
+
+# Montgomery multiplication for ARMv4.
+#
+# Performance improvement naturally varies among CPU implementations
+# and compilers. The code was observed to provide a +65-35% improvement
+# [depending on key length, less for longer keys] on ARM920T, and
+# +115-80% on Intel IXP425. This is compared to the pre-bn_mul_mont
+# code base and compiler-generated code with in-lined umull and even
+# umlal instructions. The latter means that this code didn't really
+# have the "advantage" of utilizing some "secret" instruction.
+#
+# The code is interoperable with the Thumb ISA and is rather compact,
+# less than 1/2KB. A Windows CE port would be trivial, as it's
+# exclusively about decorations; the ABI and instruction syntax are
+# identical.
+
+$num="r0";	# starts as num argument, but holds &tp[num-1]
+$ap="r1";
+$bp="r2"; $bi="r2"; $rp="r2";
+$np="r3";
+$tp="r4";
+$aj="r5";
+$nj="r6";
+$tj="r7";
+$n0="r8";
+###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
+$alo="r10";	# sl, gcc uses it to keep @GOT
+$ahi="r11";	# fp
+$nlo="r12";	# ip
+###########	# r13 is stack pointer
+$nhi="r14";	# lr
+###########	# r15 is program counter
+
+#### argument block layout relative to &tp[num-1], a.k.a. $num
+$_rp="$num,#12*4";
+# ap permanently resides in r1
+$_bp="$num,#13*4";
+# np permanently resides in r3
+$_n0="$num,#14*4";
+$_num="$num,#15*4";	$_bpend=$_num;
+
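+# Sketch (not part of the original import): the .Lsub/.Lcopy tail below
+# picks between tp and rp without branching; $nhi ends up all-ones when
+# the final subtraction borrowed, and and/bic/orr merge the two
+# pointers through that mask. The same idiom in plain Perl:
+sub select_by_mask {
+	my ($mask, $tp, $rp) = @_;	# $mask: 0xffffffff (borrow) or 0
+	return ($tp & $mask) | ($rp & (~$mask & 0xffffffff));
+}
+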
+$code=<<___;
+.text
+
+.global	bn_mul_mont
+.type	bn_mul_mont,%function
+
+.align	2
+bn_mul_mont:
+	stmdb	sp!,{r0,r2}		@ sp points at argument block
+	ldr	$num,[sp,#3*4]		@ load num
+	cmp	$num,#2
+	movlt	r0,#0
+	addlt	sp,sp,#2*4
+	blt	.Labrt
+
+	stmdb	sp!,{r4-r12,lr}		@ save 10 registers
+
+	mov	$num,$num,lsl#2		@ rescale $num for byte count
+	sub	sp,sp,$num		@ alloca(4*num)
+	sub	sp,sp,#4		@ +extra dword
+	sub	$num,$num,#4		@ "num=num-1"
+	add	$tp,$bp,$num		@ &bp[num-1]
+
+	add	$num,sp,$num		@ $num to point at &tp[num-1]
+	ldr	$n0,[$_n0]		@ &n0
+	ldr	$bi,[$bp]		@ bp[0]
+	ldr	$aj,[$ap],#4		@ ap[0],ap++
+	ldr	$nj,[$np],#4		@ np[0],np++
+	ldr	$n0,[$n0]		@ *n0
+	str	$tp,[$_bpend]		@ save &bp[num]
+
+	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
+	str	$n0,[$_n0]		@ save n0 value
+	mul	$n0,$alo,$n0		@ "tp[0]"*n0
+	mov	$nlo,#0
+	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
+	mov	$tp,sp
+
+.L1st:
+	ldr	$aj,[$ap],#4		@ ap[j],ap++
+	mov	$alo,$ahi
+	mov	$ahi,#0
+	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
+	ldr	$nj,[$np],#4		@ np[j],np++
+	mov	$nhi,#0
+	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
+	adds	$nlo,$nlo,$alo
+	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
+	adc	$nlo,$nhi,#0
+	cmp	$tp,$num
+	bne	.L1st
+
+	adds	$nlo,$nlo,$ahi
+	mov	$nhi,#0
+	adc	$nhi,$nhi,#0
+	ldr	$tp,[$_bp]		@ restore bp
+	str	$nlo,[$num]		@ tp[num-1]=
+	ldr	$n0,[$_n0]		@ restore n0
+	str	$nhi,[$num,#4]		@ tp[num]=
+
+.Louter:
+	sub	$tj,$num,sp		@ "original" $num-1 value
+	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
+	sub	$np,$np,$tj		@ "rewind" np to &np[1]
+	ldr	$bi,[$tp,#4]!		@ *(++bp)
+	ldr	$aj,[$ap,#-4]		@ ap[0]
+	ldr	$nj,[$np,#-4]		@ np[0]
+	ldr	$alo,[sp]		@ tp[0]
+	ldr	$tj,[sp,#4]		@ tp[1]
+
+	mov	$ahi,#0
+	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
+	str	$tp,[$_bp]		@ save bp
+	mul	$n0,$alo,$n0
+	mov	$nlo,#0
+	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
+	mov	$tp,sp
+
+.Linner:
+	ldr	$aj,[$ap],#4		@ ap[j],ap++
+	adds	$alo,$ahi,$tj		@ +=tp[j]
+	mov	$ahi,#0
+	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
+	ldr	$nj,[$np],#4		@ np[j],np++
+	mov	$nhi,#0
+	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
+	ldr	$tj,[$tp,#8]		@ tp[j+1]
+	adc	$ahi,$ahi,#0
+	adds	$nlo,$nlo,$alo
+	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
+	adc	$nlo,$nhi,#0
+	cmp	$tp,$num
+	bne	.Linner
+
+	adds	$nlo,$nlo,$ahi
+	mov	$nhi,#0
+	adc	$nhi,$nhi,#0
+	adds	$nlo,$nlo,$tj
+	adc	$nhi,$nhi,#0
+	ldr	$tp,[$_bp]		@ restore bp
+	ldr	$tj,[$_bpend]		@ restore &bp[num]
+	str	$nlo,[$num]		@ tp[num-1]=
+	ldr	$n0,[$_n0]		@ restore n0
+	str	$nhi,[$num,#4]		@ tp[num]=
+
+	cmp	$tp,$tj
+	bne	.Louter
+
+	ldr	$rp,[$_rp]		@ pull rp
+	add	$num,$num,#4		@ $num to point at &tp[num]
+	sub	$aj,$num,sp		@ "original" num value
+	mov	$tp,sp			@ "rewind" $tp
+	mov	$ap,$tp			@ "borrow" $ap
+	sub	$np,$np,$aj		@ "rewind" $np to &np[0]
+
+	subs	$tj,$tj,$tj		@ "clear" carry flag
+.Lsub:	ldr	$tj,[$tp],#4
+	ldr	$nj,[$np],#4
+	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
+	str	$tj,[$rp],#4		@ rp[j]=
+	teq	$tp,$num		@ preserve carry
+	bne	.Lsub
+	sbcs	$nhi,$nhi,#0		@ upmost carry
+	mov	$tp,sp			@ "rewind" $tp
+	sub	$rp,$rp,$aj		@ "rewind" $rp
+
+	and	$ap,$tp,$nhi
+	bic	$np,$rp,$nhi
+	orr	$ap,$ap,$np		@ ap=borrow?tp:rp
+
+.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
+	str	sp,[$tp],#4		@ zap tp
+	str	$tj,[$rp],#4
+	cmp	$tp,$num
+	bne	.Lcopy
+
+	add	sp,$num,#4		@ skip over tp[num+1]
+	ldmia	sp!,{r4-r12,lr}		@ restore registers
+	add	sp,sp,#2*4		@ skip over {r0,r2}
+	mov	r0,#1
+.Labrt:	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+.size	bn_mul_mont,.-bn_mul_mont
+.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl
new file mode 100644
index 0000000000..7849eae959
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/ppc-mont.pl
@@ -0,0 +1,323 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# April 2006
+
+# "Teaser" Montgomery multiplication module for PowerPC. It's possible
+# to gain a bit more by modulo-scheduling the outer loop; a dedicated
+# squaring procedure should then give a further 20%, and the code can
+# be adapted for a 32-bit application running on a 64-bit CPU. As for
+# the latter: it won't be able to achieve "native" 64-bit performance,
+# because in a 32-bit application context every addc instruction has
+# to be expanded into an addc, two right shifts by 32 and finally an
+# adde (sketched below). So far the RSA *sign* performance improvement
+# over pre-bn_mul_mont assembler for a 64-bit application running on a
+# PPC970/G5 is:
+#
+# 512-bit	+65%
+# 1024-bit	+35%
+# 2048-bit	+18%
+# 4096-bit	+4%
+
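+# Sketch of the addc expansion mentioned above (not part of the
+# original import; assumes a 64-bit Perl build so the intermediate
+# sums stay exact). One 64-bit limb addition modelled as 32-bit
+# halves costs an add, a carry extraction and a carry-in add per half:
+sub add64_via_32 {
+	my ($alo, $ahi, $blo, $bhi, $cin) = @_;	# 32-bit halves + carry-in
+	my $lo = $alo + $blo + $cin;
+	my $c  = $lo >> 32; $lo &= 0xffffffff;	# extract carry of low half
+	my $hi = $ahi + $bhi + $c;		# the "adde" equivalent
+	my $cout = $hi >> 32; $hi &= 0xffffffff;
+	return ($lo, $hi, $cout);
+}
+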
+$flavour = shift;
+
+if ($flavour =~ /32/) {
+	$BITS=	32;
+	$BNSZ=	$BITS/8;
+	$SIZE_T=4;
+	$RZONE=	224;
+	$FRAME=	$SIZE_T*16;
+
+	$LD=	"lwz";		# load
+	$LDU=	"lwzu";		# load and update
+	$LDX=	"lwzx";		# load indexed
+	$ST=	"stw";		# store
+	$STU=	"stwu";		# store and update
+	$STX=	"stwx";		# store indexed
+	$STUX=	"stwux";	# store indexed and update
+	$UMULL=	"mullw";	# unsigned multiply low
+	$UMULH=	"mulhwu";	# unsigned multiply high
+	$UCMP=	"cmplw";	# unsigned compare
+	$SHRI=	"srwi";		# unsigned shift right by immediate	
+	$PUSH=	$ST;
+	$POP=	$LD;
+} elsif ($flavour =~ /64/) {
+	$BITS=	64;
+	$BNSZ=	$BITS/8;
+	$SIZE_T=8;
+	$RZONE=	288;
+	$FRAME=	$SIZE_T*16;
+
+	# same as above, but 64-bit mnemonics...
+	$LD=	"ld";		# load
+	$LDU=	"ldu";		# load and update
+	$LDX=	"ldx";		# load indexed
+	$ST=	"std";		# store
+	$STU=	"stdu";		# store and update
+	$STX=	"stdx";		# store indexed
+	$STUX=	"stdux";	# store indexed and update
+	$UMULL=	"mulld";	# unsigned multiply low
+	$UMULH=	"mulhdu";	# unsigned multiply high
+	$UCMP=	"cmpld";	# unsigned compare
+	$SHRI=	"srdi";		# unsigned shift right by immediate	
+	$PUSH=	$ST;
+	$POP=	$LD;
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$sp="r1";
+$toc="r2";
+$rp="r3";	$ovf="r3";
+$ap="r4";
+$bp="r5";
+$np="r6";
+$n0="r7";
+$num="r8";
+$rp="r9";	# $rp is reassigned
+$aj="r10";
+$nj="r11";
+$tj="r12";
+# non-volatile registers
+$i="r14";
+$j="r15";
+$tp="r16";
+$m0="r17";
+$m1="r18";
+$lo0="r19";
+$hi0="r20";
+$lo1="r21";
+$hi1="r22";
+$alo="r23";
+$ahi="r24";
+$nlo="r25";
+#
+$nhi="r0";
+
+$code=<<___;
+.machine "any"
+.text
+
+.globl	.bn_mul_mont
+.align	4
+.bn_mul_mont:
+	cmpwi	$num,4
+	mr	$rp,r3		; $rp is reassigned
+	li	r3,0
+	bltlr
+
+	slwi	$num,$num,`log($BNSZ)/log(2)`
+	li	$tj,-4096
+	addi	$ovf,$num,`$FRAME+$RZONE`
+	subf	$ovf,$ovf,$sp	; $sp-$ovf
+	and	$ovf,$ovf,$tj	; minimize TLB usage
+	subf	$ovf,$sp,$ovf	; $ovf-$sp
+	srwi	$num,$num,`log($BNSZ)/log(2)`
+	$STUX	$sp,$sp,$ovf
+
+	$PUSH	r14,`4*$SIZE_T`($sp)
+	$PUSH	r15,`5*$SIZE_T`($sp)
+	$PUSH	r16,`6*$SIZE_T`($sp)
+	$PUSH	r17,`7*$SIZE_T`($sp)
+	$PUSH	r18,`8*$SIZE_T`($sp)
+	$PUSH	r19,`9*$SIZE_T`($sp)
+	$PUSH	r20,`10*$SIZE_T`($sp)
+	$PUSH	r21,`11*$SIZE_T`($sp)
+	$PUSH	r22,`12*$SIZE_T`($sp)
+	$PUSH	r23,`13*$SIZE_T`($sp)
+	$PUSH	r24,`14*$SIZE_T`($sp)
+	$PUSH	r25,`15*$SIZE_T`($sp)
+
+	$LD	$n0,0($n0)	; pull n0[0] value
+	addi	$num,$num,-2	; adjust $num for counter register
+
+	$LD	$m0,0($bp)	; m0=bp[0]
+	$LD	$aj,0($ap)	; ap[0]
+	addi	$tp,$sp,$FRAME
+	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
+	$UMULH	$hi0,$aj,$m0
+
+	$LD	$aj,$BNSZ($ap)	; ap[1]
+	$LD	$nj,0($np)	; np[0]
+
+	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0
+
+	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
+	$UMULH	$ahi,$aj,$m0
+
+	$UMULL	$lo1,$nj,$m1	; np[0]*m1
+	$UMULH	$hi1,$nj,$m1
+	$LD	$nj,$BNSZ($np)	; np[1]
+	addc	$lo1,$lo1,$lo0
+	addze	$hi1,$hi1
+
+	$UMULL	$nlo,$nj,$m1	; np[1]*m1
+	$UMULH	$nhi,$nj,$m1
+
+	mtctr	$num
+	li	$j,`2*$BNSZ`
+.align	4
+L1st:
+	$LDX	$aj,$ap,$j	; ap[j]
+	addc	$lo0,$alo,$hi0
+	$LDX	$nj,$np,$j	; np[j]
+	addze	$hi0,$ahi
+	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
+	addc	$lo1,$nlo,$hi1
+	$UMULH	$ahi,$aj,$m0
+	addze	$hi1,$nhi
+	$UMULL	$nlo,$nj,$m1	; np[j]*m1
+	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
+	$UMULH	$nhi,$nj,$m1
+	addze	$hi1,$hi1
+	$ST	$lo1,0($tp)	; tp[j-1]
+
+	addi	$j,$j,$BNSZ	; j++
+	addi	$tp,$tp,$BNSZ	; tp++
+	bdnz-	L1st
+;L1st
+	addc	$lo0,$alo,$hi0
+	addze	$hi0,$ahi
+
+	addc	$lo1,$nlo,$hi1
+	addze	$hi1,$nhi
+	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
+	addze	$hi1,$hi1
+	$ST	$lo1,0($tp)	; tp[j-1]
+
+	li	$ovf,0
+	addc	$hi1,$hi1,$hi0
+	addze	$ovf,$ovf	; upmost overflow bit
+	$ST	$hi1,$BNSZ($tp)
+
+	li	$i,$BNSZ
+.align	4
+Louter:
+	$LDX	$m0,$bp,$i	; m0=bp[i]
+	$LD	$aj,0($ap)	; ap[0]
+	addi	$tp,$sp,$FRAME
+	$LD	$tj,$FRAME($sp)	; tp[0]
+	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
+	$UMULH	$hi0,$aj,$m0
+	$LD	$aj,$BNSZ($ap)	; ap[1]
+	$LD	$nj,0($np)	; np[0]
+	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
+	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
+	addze	$hi0,$hi0
+	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
+	$UMULH	$ahi,$aj,$m0
+	$UMULL	$lo1,$nj,$m1	; np[0]*m1
+	$UMULH	$hi1,$nj,$m1
+	$LD	$nj,$BNSZ($np)	; np[1]
+	addc	$lo1,$lo1,$lo0
+	$UMULL	$nlo,$nj,$m1	; np[1]*m1
+	addze	$hi1,$hi1
+	$UMULH	$nhi,$nj,$m1
+
+	mtctr	$num
+	li	$j,`2*$BNSZ`
+.align	4
+Linner:
+	$LDX	$aj,$ap,$j	; ap[j]
+	addc	$lo0,$alo,$hi0
+	$LD	$tj,$BNSZ($tp)	; tp[j]
+	addze	$hi0,$ahi
+	$LDX	$nj,$np,$j	; np[j]
+	addc	$lo1,$nlo,$hi1
+	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
+	addze	$hi1,$nhi
+	$UMULH	$ahi,$aj,$m0
+	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
+	$UMULL	$nlo,$nj,$m1	; np[j]*m1
+	addze	$hi0,$hi0
+	$UMULH	$nhi,$nj,$m1
+	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
+	addi	$j,$j,$BNSZ	; j++
+	addze	$hi1,$hi1
+	$ST	$lo1,0($tp)	; tp[j-1]
+	addi	$tp,$tp,$BNSZ	; tp++
+	bdnz-	Linner
+;Linner
+	$LD	$tj,$BNSZ($tp)	; tp[j]
+	addc	$lo0,$alo,$hi0
+	addze	$hi0,$ahi
+	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
+	addze	$hi0,$hi0
+
+	addc	$lo1,$nlo,$hi1
+	addze	$hi1,$nhi
+	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
+	addze	$hi1,$hi1
+	$ST	$lo1,0($tp)	; tp[j-1]
+
+	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
+	li	$ovf,0
+	adde	$hi1,$hi1,$hi0
+	addze	$ovf,$ovf
+	$ST	$hi1,$BNSZ($tp)
+;
+	slwi	$tj,$num,`log($BNSZ)/log(2)`
+	$UCMP	$i,$tj
+	addi	$i,$i,$BNSZ
+	ble-	Louter
+
+	addi	$num,$num,2	; restore $num
+	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
+	addi	$tp,$sp,$FRAME
+	mtctr	$num
+
+.align	4
+Lsub:	$LDX	$tj,$tp,$j
+	$LDX	$nj,$np,$j
+	subfe	$aj,$nj,$tj	; tp[j]-np[j]
+	$STX	$aj,$rp,$j
+	addi	$j,$j,$BNSZ
+	bdnz-	Lsub
+
+	li	$j,0
+	mtctr	$num
+	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
+	and	$ap,$tp,$ovf
+	andc	$np,$rp,$ovf
+	or	$ap,$ap,$np	; ap=borrow?tp:rp
+
+.align	4
+Lcopy:				; copy or in-place refresh
+	$LDX	$tj,$ap,$j
+	$STX	$tj,$rp,$j
+	$STX	$j,$tp,$j	; zap at once
+	addi	$j,$j,$BNSZ
+	bdnz-	Lcopy
+
+	$POP	r14,`4*$SIZE_T`($sp)
+	$POP	r15,`5*$SIZE_T`($sp)
+	$POP	r16,`6*$SIZE_T`($sp)
+	$POP	r17,`7*$SIZE_T`($sp)
+	$POP	r18,`8*$SIZE_T`($sp)
+	$POP	r19,`9*$SIZE_T`($sp)
+	$POP	r20,`10*$SIZE_T`($sp)
+	$POP	r21,`11*$SIZE_T`($sp)
+	$POP	r22,`12*$SIZE_T`($sp)
+	$POP	r23,`13*$SIZE_T`($sp)
+	$POP	r24,`14*$SIZE_T`($sp)
+	$POP	r25,`15*$SIZE_T`($sp)
+	$POP	$sp,0($sp)
+	li	r3,1
+	blr
+	.long	0
+.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc64-mont.pl b/src/lib/libcrypto/bn/asm/ppc64-mont.pl
new file mode 100644
index 0000000000..3449b35855
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/ppc64-mont.pl
@@ -0,0 +1,918 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# December 2007
+
+# The reason for the undertaken effort is basically the following.
+# Even though the Power 6 CPU operates at an incredible 4.7GHz clock
+# frequency, its PKI performance was observed to be less than
+# impressive, essentially as fast as a 1.8GHz PPC970, or 2.6 times(!)
+# slower than one would hope. Well, it's not surprising that IBM had
+# to make some sacrifices to boost the clock frequency that much, but
+# no overall improvement? Having observed how much difference
+# switching to the FPU made on UltraSPARC, playing the same stunt on
+# Power 6 appeared appropriate... Unfortunately the resulting
+# performance improvement is not as impressive, ~30%, and in absolute
+# terms it is still very far from what one would expect from a 4.7GHz
+# CPU. There is a chance that I'm doing something wrong, but in the
+# absence of assembler-level micro-profiling data, or at least a
+# decent platform guide, I can't tell... Better results might be
+# achieved with VMX... Anyway, this module provides *worse*
+# performance on other PowerPC implementations: ~40-15% slower on
+# PPC970 depending on key length, and ~40% slower on Power 5 for all
+# key lengths. As it's obviously inappropriate as a "best all-round"
+# alternative, it has to be complemented with run-time CPU family
+# detection. It should also be noted that, unlike on other PowerPC
+# implementations, the IALU ppc-mont.pl module performs *suboptimally*
+# on >=1024-bit key lengths on Power 6. Note too that *everything*
+# said so far applies to 64-bit builds! As far as a 32-bit application
+# executed on a 64-bit CPU goes, this module is likely to become the
+# preferred choice, because it's easy to adapt for such a case and it
+# *is* faster than the 32-bit ppc-mont.pl on *all* processors.
+
+# February 2008
+
+# Micro-profiling-assisted optimization results in a ~15% improvement
+# over the original ppc64-mont.pl version, or an overall ~50%
+# improvement over the ppc.pl module on Power 6. Compared to
+# ppc-mont.pl on the same Power 6 CPU, this module is 5-150% faster
+# depending on key length, [hereafter] more for longer keys. But
+# compared to ppc-mont.pl on a 1.8GHz PPC970, it's only 5-55% faster.
+# Still far from impressive in absolute terms, but that's apparently
+# just the way Power 6 is...
+
+$flavour = shift;
+
+if ($flavour =~ /32/) {
+	$SIZE_T=4;
+	$RZONE=	224;
+	$FRAME=	$SIZE_T*12+8*12;
+	$fname=	"bn_mul_mont_ppc64";
+
+	$STUX=	"stwux";	# store indexed and update
+	$PUSH=	"stw";
+	$POP=	"lwz";
+	die "not implemented yet";
+} elsif ($flavour =~ /64/) {
+	$SIZE_T=8;
+	$RZONE=	288;
+	$FRAME=	$SIZE_T*12+8*12;
+	$fname=	"bn_mul_mont";
+
+	# same as above, but 64-bit mnemonics...
+	$STUX=	"stdux";	# store indexed and update
+	$PUSH=	"std";
+	$POP=	"ld";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=($FRAME+63)&~63;
+$TRANSFER=16*8;
+
+$carry="r0";
+$sp="r1";
+$toc="r2";
+$rp="r3";	$ovf="r3";
+$ap="r4";
+$bp="r5";
+$np="r6";
+$n0="r7";
+$num="r8";
+$rp="r9";	# $rp is reassigned
+$tp="r10";
+$j="r11";
+$i="r12";
+# non-volatile registers
+$nap_d="r14";	# interleaved ap and np in double format
+$a0="r15";	# ap[0]
+$t0="r16";	# temporary registers
+$t1="r17";
+$t2="r18";
+$t3="r19";
+$t4="r20";
+$t5="r21";
+$t6="r22";
+$t7="r23";
+
+# PPC offers enough register bank capacity to unroll inner loops twice
+#
+#     ..A3A2A1A0
+#           dcba
+#    -----------
+#            A0a
+#           A0b
+#          A0c
+#         A0d
+#          A1a
+#         A1b
+#        A1c
+#       A1d
+#        A2a
+#       A2b
+#      A2c
+#     A2d
+#      A3a
+#     A3b
+#    A3c
+#   A3d
+#    ..a
+#   ..b
+#
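+# Sketch (not part of the original import; assumes a 64-bit Perl
+# build): the schedule above works because 16-bit limbs stay exact in
+# double precision. Each partial product is below 2^32, so even a
+# column of a few dozen such terms remains far below the 53-bit
+# mantissa limit and fmadd never rounds:
+sub limb_columns {
+	my ($a, $b) = @_;		# two 64-bit words
+	my @al = map { ($a >> 16*$_) & 0xffff } 0..3;
+	my @bl = map { ($b >> 16*$_) & 0xffff } 0..3;
+	my @col = (0) x 7;		# 16-bit-weighted product columns
+	for my $i (0..3) { $col[$i+$_] += $al[$i] * $bl[$_] for 0..3; }
+	return @col;			# max entry < 2^34, hence FP-exact
+}
+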
+$ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
+$na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
+$dota="f8";	$dotb="f9";
+$A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
+$N0="f14";	$N1="f15";	$N2="f16";	$N3="f17";
+$T0a="f18";	$T0b="f19";
+$T1a="f20";	$T1b="f21";
+$T2a="f22";	$T2b="f23";
+$T3a="f24";	$T3b="f25";
+
+# sp----------->+-------------------------------+
+#		| saved sp			|
+#		+-------------------------------+
+#		|				|
+#		+-------------------------------+
+#		| 10 saved gpr, r14-r23		|
+#		.				.
+#		.				.
+#   +12*size_t	+-------------------------------+
+#		| 12 saved fpr, f14-f25		|
+#		.				.
+#		.				.
+#   +12*8	+-------------------------------+
+#		| padding to 64 byte boundary	|
+#		.				.
+#   +X		+-------------------------------+
+#		| 16 gpr<->fpr transfer zone	|
+#		.				.
+#		.				.
+#   +16*8	+-------------------------------+
+#		| __int64 tmp[-1]		|
+#		+-------------------------------+
+#		| __int64 tmp[num]		|
+#		.				.
+#		.				.
+#		.				.
+#   +(num+1)*8	+-------------------------------+
+#		| padding to 64 byte boundary	|
+#		.				.
+#   +X		+-------------------------------+
+#		| double nap_d[4*num]		|
+#		.				.
+#		.				.
+#		.				.
+#		+-------------------------------+
+
+$code=<<___;
+.machine "any"
+.text
+
+.globl	.$fname
+.align	5
+.$fname:
+	cmpwi	$num,4
+	mr	$rp,r3		; $rp is reassigned
+	li	r3,0		; possible "not handled" return code
+	bltlr-
+	andi.	r0,$num,1	; $num has to be even
+	bnelr-
+
+	slwi	$num,$num,3	; num*=8
+	li	$i,-4096
+	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
+	add	$tp,$tp,$num	; place for tp[num+1]
+	addi	$tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
+	subf	$tp,$tp,$sp	; $sp-$tp
+	and	$tp,$tp,$i	; minimize TLB usage
+	subf	$tp,$sp,$tp	; $tp-$sp
+	$STUX	$sp,$sp,$tp	; alloca
+
+	$PUSH	r14,`2*$SIZE_T`($sp)
+	$PUSH	r15,`3*$SIZE_T`($sp)
+	$PUSH	r16,`4*$SIZE_T`($sp)
+	$PUSH	r17,`5*$SIZE_T`($sp)
+	$PUSH	r18,`6*$SIZE_T`($sp)
+	$PUSH	r19,`7*$SIZE_T`($sp)
+	$PUSH	r20,`8*$SIZE_T`($sp)
+	$PUSH	r21,`9*$SIZE_T`($sp)
+	$PUSH	r22,`10*$SIZE_T`($sp)
+	$PUSH	r23,`11*$SIZE_T`($sp)
+	stfd	f14,`12*$SIZE_T+0`($sp)
+	stfd	f15,`12*$SIZE_T+8`($sp)
+	stfd	f16,`12*$SIZE_T+16`($sp)
+	stfd	f17,`12*$SIZE_T+24`($sp)
+	stfd	f18,`12*$SIZE_T+32`($sp)
+	stfd	f19,`12*$SIZE_T+40`($sp)
+	stfd	f20,`12*$SIZE_T+48`($sp)
+	stfd	f21,`12*$SIZE_T+56`($sp)
+	stfd	f22,`12*$SIZE_T+64`($sp)
+	stfd	f23,`12*$SIZE_T+72`($sp)
+	stfd	f24,`12*$SIZE_T+80`($sp)
+	stfd	f25,`12*$SIZE_T+88`($sp)
+
+	ld	$a0,0($ap)	; pull ap[0] value
+	ld	$n0,0($n0)	; pull n0[0] value
+	ld	$t3,0($bp)	; bp[0]
+
+	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
+	li	$i,-64
+	add	$nap_d,$tp,$num
+	and	$nap_d,$nap_d,$i	; align to 64 bytes
+
+	mulld	$t7,$a0,$t3	; ap[0]*bp[0]
+	; nap_d is off by 1, because it's used with stfdu/lfdu
+	addi	$nap_d,$nap_d,-8
+	srwi	$j,$num,`3+1`	; counter register, num/2
+	mulld	$t7,$t7,$n0	; tp[0]*n0
+	addi	$j,$j,-1
+	addi	$tp,$sp,`$FRAME+$TRANSFER-8`
+	li	$carry,0
+	mtctr	$j
+
+	; transfer bp[0] to FPU as 4x16-bit values
+	extrdi	$t0,$t3,16,48
+	extrdi	$t1,$t3,16,32
+	extrdi	$t2,$t3,16,16
+	extrdi	$t3,$t3,16,0
+	std	$t0,`$FRAME+0`($sp)
+	std	$t1,`$FRAME+8`($sp)
+	std	$t2,`$FRAME+16`($sp)
+	std	$t3,`$FRAME+24`($sp)
+	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
+	extrdi	$t4,$t7,16,48
+	extrdi	$t5,$t7,16,32
+	extrdi	$t6,$t7,16,16
+	extrdi	$t7,$t7,16,0
+	std	$t4,`$FRAME+32`($sp)
+	std	$t5,`$FRAME+40`($sp)
+	std	$t6,`$FRAME+48`($sp)
+	std	$t7,`$FRAME+56`($sp)
+	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
+	lwz	$t1,0($ap)
+	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
+	lwz	$t3,8($ap)
+	lwz	$t4,4($np)		; load n[j] as 32-bit word pair
+	lwz	$t5,0($np)
+	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
+	lwz	$t7,8($np)
+	lfd	$ba,`$FRAME+0`($sp)
+	lfd	$bb,`$FRAME+8`($sp)
+	lfd	$bc,`$FRAME+16`($sp)
+	lfd	$bd,`$FRAME+24`($sp)
+	lfd	$na,`$FRAME+32`($sp)
+	lfd	$nb,`$FRAME+40`($sp)
+	lfd	$nc,`$FRAME+48`($sp)
+	lfd	$nd,`$FRAME+56`($sp)
+	std	$t0,`$FRAME+64`($sp)
+	std	$t1,`$FRAME+72`($sp)
+	std	$t2,`$FRAME+80`($sp)
+	std	$t3,`$FRAME+88`($sp)
+	std	$t4,`$FRAME+96`($sp)
+	std	$t5,`$FRAME+104`($sp)
+	std	$t6,`$FRAME+112`($sp)
+	std	$t7,`$FRAME+120`($sp)
+	fcfid	$ba,$ba
+	fcfid	$bb,$bb
+	fcfid	$bc,$bc
+	fcfid	$bd,$bd
+	fcfid	$na,$na
+	fcfid	$nb,$nb
+	fcfid	$nc,$nc
+	fcfid	$nd,$nd
+
+	lfd	$A0,`$FRAME+64`($sp)
+	lfd	$A1,`$FRAME+72`($sp)
+	lfd	$A2,`$FRAME+80`($sp)
+	lfd	$A3,`$FRAME+88`($sp)
+	lfd	$N0,`$FRAME+96`($sp)
+	lfd	$N1,`$FRAME+104`($sp)
+	lfd	$N2,`$FRAME+112`($sp)
+	lfd	$N3,`$FRAME+120`($sp)
+	fcfid	$A0,$A0
+	fcfid	$A1,$A1
+	fcfid	$A2,$A2
+	fcfid	$A3,$A3
+	fcfid	$N0,$N0
+	fcfid	$N1,$N1
+	fcfid	$N2,$N2
+	fcfid	$N3,$N3
+	addi	$ap,$ap,16
+	addi	$np,$np,16
+
+	fmul	$T1a,$A1,$ba
+	fmul	$T1b,$A1,$bb
+	stfd	$A0,8($nap_d)		; save a[j] in double format
+	stfd	$A1,16($nap_d)
+	fmul	$T2a,$A2,$ba
+	fmul	$T2b,$A2,$bb
+	stfd	$A2,24($nap_d)		; save a[j+1] in double format
+	stfd	$A3,32($nap_d)
+	fmul	$T3a,$A3,$ba
+	fmul	$T3b,$A3,$bb
+	stfd	$N0,40($nap_d)		; save n[j] in double format
+	stfd	$N1,48($nap_d)
+	fmul	$T0a,$A0,$ba
+	fmul	$T0b,$A0,$bb
+	stfd	$N2,56($nap_d)		; save n[j+1] in double format
+	stfdu	$N3,64($nap_d)
+
+	fmadd	$T1a,$A0,$bc,$T1a
+	fmadd	$T1b,$A0,$bd,$T1b
+	fmadd	$T2a,$A1,$bc,$T2a
+	fmadd	$T2b,$A1,$bd,$T2b
+	fmadd	$T3a,$A2,$bc,$T3a
+	fmadd	$T3b,$A2,$bd,$T3b
+	fmul	$dota,$A3,$bc
+	fmul	$dotb,$A3,$bd
+
+	fmadd	$T1a,$N1,$na,$T1a
+	fmadd	$T1b,$N1,$nb,$T1b
+	fmadd	$T2a,$N2,$na,$T2a
+	fmadd	$T2b,$N2,$nb,$T2b
+	fmadd	$T3a,$N3,$na,$T3a
+	fmadd	$T3b,$N3,$nb,$T3b
+	fmadd	$T0a,$N0,$na,$T0a
+	fmadd	$T0b,$N0,$nb,$T0b
+
+	fmadd	$T1a,$N0,$nc,$T1a
+	fmadd	$T1b,$N0,$nd,$T1b
+	fmadd	$T2a,$N1,$nc,$T2a
+	fmadd	$T2b,$N1,$nd,$T2b
+	fmadd	$T3a,$N2,$nc,$T3a
+	fmadd	$T3b,$N2,$nd,$T3b
+	fmadd	$dota,$N3,$nc,$dota
+	fmadd	$dotb,$N3,$nd,$dotb
+
+	fctid	$T0a,$T0a
+	fctid	$T0b,$T0b
+	fctid	$T1a,$T1a
+	fctid	$T1b,$T1b
+	fctid	$T2a,$T2a
+	fctid	$T2b,$T2b
+	fctid	$T3a,$T3a
+	fctid	$T3b,$T3b
+
+	stfd	$T0a,`$FRAME+0`($sp)
+	stfd	$T0b,`$FRAME+8`($sp)
+	stfd	$T1a,`$FRAME+16`($sp)
+	stfd	$T1b,`$FRAME+24`($sp)
+	stfd	$T2a,`$FRAME+32`($sp)
+	stfd	$T2b,`$FRAME+40`($sp)
+	stfd	$T3a,`$FRAME+48`($sp)
+	stfd	$T3b,`$FRAME+56`($sp)
+
+.align	5
+L1st:
+	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
+	lwz	$t1,0($ap)
+	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
+	lwz	$t3,8($ap)
+	lwz	$t4,4($np)		; load n[j] as 32-bit word pair
+	lwz	$t5,0($np)
+	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
+	lwz	$t7,8($np)
+	std	$t0,`$FRAME+64`($sp)
+	std	$t1,`$FRAME+72`($sp)
+	std	$t2,`$FRAME+80`($sp)
+	std	$t3,`$FRAME+88`($sp)
+	std	$t4,`$FRAME+96`($sp)
+	std	$t5,`$FRAME+104`($sp)
+	std	$t6,`$FRAME+112`($sp)
+	std	$t7,`$FRAME+120`($sp)
+	ld	$t0,`$FRAME+0`($sp)
+	ld	$t1,`$FRAME+8`($sp)
+	ld	$t2,`$FRAME+16`($sp)
+	ld	$t3,`$FRAME+24`($sp)
+	ld	$t4,`$FRAME+32`($sp)
+	ld	$t5,`$FRAME+40`($sp)
+	ld	$t6,`$FRAME+48`($sp)
+	ld	$t7,`$FRAME+56`($sp)
+	lfd	$A0,`$FRAME+64`($sp)
+	lfd	$A1,`$FRAME+72`($sp)
+	lfd	$A2,`$FRAME+80`($sp)
+	lfd	$A3,`$FRAME+88`($sp)
+	lfd	$N0,`$FRAME+96`($sp)
+	lfd	$N1,`$FRAME+104`($sp)
+	lfd	$N2,`$FRAME+112`($sp)
+	lfd	$N3,`$FRAME+120`($sp)
+	fcfid	$A0,$A0
+	fcfid	$A1,$A1
+	fcfid	$A2,$A2
+	fcfid	$A3,$A3
+	fcfid	$N0,$N0
+	fcfid	$N1,$N1
+	fcfid	$N2,$N2
+	fcfid	$N3,$N3
+	addi	$ap,$ap,16
+	addi	$np,$np,16
+
+	fmul	$T1a,$A1,$ba
+	fmul	$T1b,$A1,$bb
+	fmul	$T2a,$A2,$ba
+	fmul	$T2b,$A2,$bb
+	stfd	$A0,8($nap_d)		; save a[j] in double format
+	stfd	$A1,16($nap_d)
+	fmul	$T3a,$A3,$ba
+	fmul	$T3b,$A3,$bb
+	fmadd	$T0a,$A0,$ba,$dota
+	fmadd	$T0b,$A0,$bb,$dotb
+	stfd	$A2,24($nap_d)		; save a[j+1] in double format
+	stfd	$A3,32($nap_d)
+
+	fmadd	$T1a,$A0,$bc,$T1a
+	fmadd	$T1b,$A0,$bd,$T1b
+	fmadd	$T2a,$A1,$bc,$T2a
+	fmadd	$T2b,$A1,$bd,$T2b
+	stfd	$N0,40($nap_d)		; save n[j] in double format
+	stfd	$N1,48($nap_d)
+	fmadd	$T3a,$A2,$bc,$T3a
+	fmadd	$T3b,$A2,$bd,$T3b
+	 add	$t0,$t0,$carry		; can not overflow
+	fmul	$dota,$A3,$bc
+	fmul	$dotb,$A3,$bd
+	stfd	$N2,56($nap_d)		; save n[j+1] in double format
+	stfdu	$N3,64($nap_d)
+	 srdi	$carry,$t0,16
+	 add	$t1,$t1,$carry
+	 srdi	$carry,$t1,16
+
+	fmadd	$T1a,$N1,$na,$T1a
+	fmadd	$T1b,$N1,$nb,$T1b
+	 insrdi	$t0,$t1,16,32
+	fmadd	$T2a,$N2,$na,$T2a
+	fmadd	$T2b,$N2,$nb,$T2b
+	 add	$t2,$t2,$carry
+	fmadd	$T3a,$N3,$na,$T3a
+	fmadd	$T3b,$N3,$nb,$T3b
+	 srdi	$carry,$t2,16
+	fmadd	$T0a,$N0,$na,$T0a
+	fmadd	$T0b,$N0,$nb,$T0b
+	 insrdi	$t0,$t2,16,16
+	 add	$t3,$t3,$carry
+	 srdi	$carry,$t3,16
+
+	fmadd	$T1a,$N0,$nc,$T1a
+	fmadd	$T1b,$N0,$nd,$T1b
+	 insrdi	$t0,$t3,16,0		; 0..63 bits
+	fmadd	$T2a,$N1,$nc,$T2a
+	fmadd	$T2b,$N1,$nd,$T2b
+	 add	$t4,$t4,$carry
+	fmadd	$T3a,$N2,$nc,$T3a
+	fmadd	$T3b,$N2,$nd,$T3b
+	 srdi	$carry,$t4,16
+	fmadd	$dota,$N3,$nc,$dota
+	fmadd	$dotb,$N3,$nd,$dotb
+	 add	$t5,$t5,$carry
+	 srdi	$carry,$t5,16
+	 insrdi	$t4,$t5,16,32
+
+	fctid	$T0a,$T0a
+	fctid	$T0b,$T0b
+	 add	$t6,$t6,$carry
+	fctid	$T1a,$T1a
+	fctid	$T1b,$T1b
+	 srdi	$carry,$t6,16
+	fctid	$T2a,$T2a
+	fctid	$T2b,$T2b
+	 insrdi	$t4,$t6,16,16
+	fctid	$T3a,$T3a
+	fctid	$T3b,$T3b
+	 add	$t7,$t7,$carry
+	 insrdi	$t4,$t7,16,0		; 64..127 bits
+	 srdi	$carry,$t7,16		; upper 33 bits
+
+	stfd	$T0a,`$FRAME+0`($sp)
+	stfd	$T0b,`$FRAME+8`($sp)
+	stfd	$T1a,`$FRAME+16`($sp)
+	stfd	$T1b,`$FRAME+24`($sp)
+	stfd	$T2a,`$FRAME+32`($sp)
+	stfd	$T2b,`$FRAME+40`($sp)
+	stfd	$T3a,`$FRAME+48`($sp)
+	stfd	$T3b,`$FRAME+56`($sp)
+	 std	$t0,8($tp)		; tp[j-1]
+	 stdu	$t4,16($tp)		; tp[j]
+	bdnz-	L1st
+
+	fctid	$dota,$dota
+	fctid	$dotb,$dotb
+
+	ld	$t0,`$FRAME+0`($sp)
+	ld	$t1,`$FRAME+8`($sp)
+	ld	$t2,`$FRAME+16`($sp)
+	ld	$t3,`$FRAME+24`($sp)
+	ld	$t4,`$FRAME+32`($sp)
+	ld	$t5,`$FRAME+40`($sp)
+	ld	$t6,`$FRAME+48`($sp)
+	ld	$t7,`$FRAME+56`($sp)
+	stfd	$dota,`$FRAME+64`($sp)
+	stfd	$dotb,`$FRAME+72`($sp)
+
+	add	$t0,$t0,$carry		; can not overflow
+	srdi	$carry,$t0,16
+	add	$t1,$t1,$carry
+	srdi	$carry,$t1,16
+	insrdi	$t0,$t1,16,32
+	add	$t2,$t2,$carry
+	srdi	$carry,$t2,16
+	insrdi	$t0,$t2,16,16
+	add	$t3,$t3,$carry
+	srdi	$carry,$t3,16
+	insrdi	$t0,$t3,16,0		; 0..63 bits
+	add	$t4,$t4,$carry
+	srdi	$carry,$t4,16
+	add	$t5,$t5,$carry
+	srdi	$carry,$t5,16
+	insrdi	$t4,$t5,16,32
+	add	$t6,$t6,$carry
+	srdi	$carry,$t6,16
+	insrdi	$t4,$t6,16,16
+	add	$t7,$t7,$carry
+	insrdi	$t4,$t7,16,0		; 64..127 bits
+	srdi	$carry,$t7,16		; upper 33 bits
+	ld	$t6,`$FRAME+64`($sp)
+	ld	$t7,`$FRAME+72`($sp)
+
+	std	$t0,8($tp)		; tp[j-1]
+	stdu	$t4,16($tp)		; tp[j]
+
+	add	$t6,$t6,$carry		; can not overflow
+	srdi	$carry,$t6,16
+	add	$t7,$t7,$carry
+	insrdi	$t6,$t7,48,0
+	srdi	$ovf,$t7,48
+	std	$t6,8($tp)		; tp[num-1]
+
+	slwi	$t7,$num,2
+	subf	$nap_d,$t7,$nap_d	; rewind pointer
+
+	li	$i,8			; i=1
+.align	5
+Louter:
+	ldx	$t3,$bp,$i	; bp[i]
+	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
+	mulld	$t7,$a0,$t3	; ap[0]*bp[i]
+
+	addi	$tp,$sp,`$FRAME+$TRANSFER`
+	add	$t7,$t7,$t6	; ap[0]*bp[i]+tp[0]
+	li	$carry,0
+	mulld	$t7,$t7,$n0	; tp[0]*n0
+	mtctr	$j
+
+	; transfer bp[i] to FPU as 4x16-bit values
+	extrdi	$t0,$t3,16,48
+	extrdi	$t1,$t3,16,32
+	extrdi	$t2,$t3,16,16
+	extrdi	$t3,$t3,16,0
+	std	$t0,`$FRAME+0`($sp)
+	std	$t1,`$FRAME+8`($sp)
+	std	$t2,`$FRAME+16`($sp)
+	std	$t3,`$FRAME+24`($sp)
+	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
+	extrdi	$t4,$t7,16,48
+	extrdi	$t5,$t7,16,32
+	extrdi	$t6,$t7,16,16
+	extrdi	$t7,$t7,16,0
+	std	$t4,`$FRAME+32`($sp)
+	std	$t5,`$FRAME+40`($sp)
+	std	$t6,`$FRAME+48`($sp)
+	std	$t7,`$FRAME+56`($sp)
+
+	lfd	$A0,8($nap_d)		; load a[j] in double format
+	lfd	$A1,16($nap_d)
+	lfd	$A2,24($nap_d)		; load a[j+1] in double format
+	lfd	$A3,32($nap_d)
+	lfd	$N0,40($nap_d)		; load n[j] in double format
+	lfd	$N1,48($nap_d)
+	lfd	$N2,56($nap_d)		; load n[j+1] in double format
+	lfdu	$N3,64($nap_d)
+
+	lfd	$ba,`$FRAME+0`($sp)
+	lfd	$bb,`$FRAME+8`($sp)
+	lfd	$bc,`$FRAME+16`($sp)
+	lfd	$bd,`$FRAME+24`($sp)
+	lfd	$na,`$FRAME+32`($sp)
+	lfd	$nb,`$FRAME+40`($sp)
+	lfd	$nc,`$FRAME+48`($sp)
+	lfd	$nd,`$FRAME+56`($sp)
+
+	fcfid	$ba,$ba
+	fcfid	$bb,$bb
+	fcfid	$bc,$bc
+	fcfid	$bd,$bd
+	fcfid	$na,$na
+	fcfid	$nb,$nb
+	fcfid	$nc,$nc
+	fcfid	$nd,$nd
+
+	fmul	$T1a,$A1,$ba
+	fmul	$T1b,$A1,$bb
+	fmul	$T2a,$A2,$ba
+	fmul	$T2b,$A2,$bb
+	fmul	$T3a,$A3,$ba
+	fmul	$T3b,$A3,$bb
+	fmul	$T0a,$A0,$ba
+	fmul	$T0b,$A0,$bb
+
+	fmadd	$T1a,$A0,$bc,$T1a
+	fmadd	$T1b,$A0,$bd,$T1b
+	fmadd	$T2a,$A1,$bc,$T2a
+	fmadd	$T2b,$A1,$bd,$T2b
+	fmadd	$T3a,$A2,$bc,$T3a
+	fmadd	$T3b,$A2,$bd,$T3b
+	fmul	$dota,$A3,$bc
+	fmul	$dotb,$A3,$bd
+
+	fmadd	$T1a,$N1,$na,$T1a
+	fmadd	$T1b,$N1,$nb,$T1b
+	 lfd	$A0,8($nap_d)		; load a[j] in double format
+	 lfd	$A1,16($nap_d)
+	fmadd	$T2a,$N2,$na,$T2a
+	fmadd	$T2b,$N2,$nb,$T2b
+	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
+	 lfd	$A3,32($nap_d)
+	fmadd	$T3a,$N3,$na,$T3a
+	fmadd	$T3b,$N3,$nb,$T3b
+	fmadd	$T0a,$N0,$na,$T0a
+	fmadd	$T0b,$N0,$nb,$T0b
+
+	fmadd	$T1a,$N0,$nc,$T1a
+	fmadd	$T1b,$N0,$nd,$T1b
+	fmadd	$T2a,$N1,$nc,$T2a
+	fmadd	$T2b,$N1,$nd,$T2b
+	fmadd	$T3a,$N2,$nc,$T3a
+	fmadd	$T3b,$N2,$nd,$T3b
+	fmadd	$dota,$N3,$nc,$dota
+	fmadd	$dotb,$N3,$nd,$dotb
+
+	fctid	$T0a,$T0a
+	fctid	$T0b,$T0b
+	fctid	$T1a,$T1a
+	fctid	$T1b,$T1b
+	fctid	$T2a,$T2a
+	fctid	$T2b,$T2b
+	fctid	$T3a,$T3a
+	fctid	$T3b,$T3b
+
+	stfd	$T0a,`$FRAME+0`($sp)
+	stfd	$T0b,`$FRAME+8`($sp)
+	stfd	$T1a,`$FRAME+16`($sp)
+	stfd	$T1b,`$FRAME+24`($sp)
+	stfd	$T2a,`$FRAME+32`($sp)
+	stfd	$T2b,`$FRAME+40`($sp)
+	stfd	$T3a,`$FRAME+48`($sp)
+	stfd	$T3b,`$FRAME+56`($sp)
+
+.align	5
+Linner:
+	fmul	$T1a,$A1,$ba
+	fmul	$T1b,$A1,$bb
+	fmul	$T2a,$A2,$ba
+	fmul	$T2b,$A2,$bb
+	lfd	$N0,40($nap_d)		; load n[j] in double format
+	lfd	$N1,48($nap_d)
+	fmul	$T3a,$A3,$ba
+	fmul	$T3b,$A3,$bb
+	fmadd	$T0a,$A0,$ba,$dota
+	fmadd	$T0b,$A0,$bb,$dotb
+	lfd	$N2,56($nap_d)		; load n[j+1] in double format
+	lfdu	$N3,64($nap_d)
+
+	fmadd	$T1a,$A0,$bc,$T1a
+	fmadd	$T1b,$A0,$bd,$T1b
+	fmadd	$T2a,$A1,$bc,$T2a
+	fmadd	$T2b,$A1,$bd,$T2b
+	 lfd	$A0,8($nap_d)		; load a[j] in double format
+	 lfd	$A1,16($nap_d)
+	fmadd	$T3a,$A2,$bc,$T3a
+	fmadd	$T3b,$A2,$bd,$T3b
+	fmul	$dota,$A3,$bc
+	fmul	$dotb,$A3,$bd
+	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
+	 lfd	$A3,32($nap_d)
+
+	fmadd	$T1a,$N1,$na,$T1a
+	fmadd	$T1b,$N1,$nb,$T1b
+	 ld	$t0,`$FRAME+0`($sp)
+	 ld	$t1,`$FRAME+8`($sp)
+	fmadd	$T2a,$N2,$na,$T2a
+	fmadd	$T2b,$N2,$nb,$T2b
+	 ld	$t2,`$FRAME+16`($sp)
+	 ld	$t3,`$FRAME+24`($sp)
+	fmadd	$T3a,$N3,$na,$T3a
+	fmadd	$T3b,$N3,$nb,$T3b
+	 add	$t0,$t0,$carry		; can not overflow
+	 ld	$t4,`$FRAME+32`($sp)
+	 ld	$t5,`$FRAME+40`($sp)
+	fmadd	$T0a,$N0,$na,$T0a
+	fmadd	$T0b,$N0,$nb,$T0b
+	 srdi	$carry,$t0,16
+	 add	$t1,$t1,$carry
+	 srdi	$carry,$t1,16
+	 ld	$t6,`$FRAME+48`($sp)
+	 ld	$t7,`$FRAME+56`($sp)
+
+	fmadd	$T1a,$N0,$nc,$T1a
+	fmadd	$T1b,$N0,$nd,$T1b
+	 insrdi	$t0,$t1,16,32
+	 ld	$t1,8($tp)		; tp[j]
+	fmadd	$T2a,$N1,$nc,$T2a
+	fmadd	$T2b,$N1,$nd,$T2b
+	 add	$t2,$t2,$carry
+	fmadd	$T3a,$N2,$nc,$T3a
+	fmadd	$T3b,$N2,$nd,$T3b
+	 srdi	$carry,$t2,16
+	 insrdi	$t0,$t2,16,16
+	fmadd	$dota,$N3,$nc,$dota
+	fmadd	$dotb,$N3,$nd,$dotb
+	 add	$t3,$t3,$carry
+	 ldu	$t2,16($tp)		; tp[j+1]
+	 srdi	$carry,$t3,16
+	 insrdi	$t0,$t3,16,0		; 0..63 bits
+	 add	$t4,$t4,$carry
+
+	fctid	$T0a,$T0a
+	fctid	$T0b,$T0b
+	 srdi	$carry,$t4,16
+	fctid	$T1a,$T1a
+	fctid	$T1b,$T1b
+	 add	$t5,$t5,$carry
+	fctid	$T2a,$T2a
+	fctid	$T2b,$T2b
+	 srdi	$carry,$t5,16
+	 insrdi	$t4,$t5,16,32
+	fctid	$T3a,$T3a
+	fctid	$T3b,$T3b
+	 add	$t6,$t6,$carry
+	 srdi	$carry,$t6,16
+	 insrdi	$t4,$t6,16,16
+
+	stfd	$T0a,`$FRAME+0`($sp)
+	stfd	$T0b,`$FRAME+8`($sp)
+	 add	$t7,$t7,$carry
+	 addc	$t3,$t0,$t1
+	stfd	$T1a,`$FRAME+16`($sp)
+	stfd	$T1b,`$FRAME+24`($sp)
+	 insrdi	$t4,$t7,16,0		; 64..127 bits
+	 srdi	$carry,$t7,16		; upper 33 bits
+	stfd	$T2a,`$FRAME+32`($sp)
+	stfd	$T2b,`$FRAME+40`($sp)
+	 adde	$t5,$t4,$t2
+	stfd	$T3a,`$FRAME+48`($sp)
+	stfd	$T3b,`$FRAME+56`($sp)
+	 addze	$carry,$carry
+	 std	$t3,-16($tp)		; tp[j-1]
+	 std	$t5,-8($tp)		; tp[j]
+	bdnz-	Linner
+
+	fctid	$dota,$dota
+	fctid	$dotb,$dotb
+	ld	$t0,`$FRAME+0`($sp)
+	ld	$t1,`$FRAME+8`($sp)
+	ld	$t2,`$FRAME+16`($sp)
+	ld	$t3,`$FRAME+24`($sp)
+	ld	$t4,`$FRAME+32`($sp)
+	ld	$t5,`$FRAME+40`($sp)
+	ld	$t6,`$FRAME+48`($sp)
+	ld	$t7,`$FRAME+56`($sp)
+	stfd	$dota,`$FRAME+64`($sp)
+	stfd	$dotb,`$FRAME+72`($sp)
+
+	add	$t0,$t0,$carry		; can not overflow
+	srdi	$carry,$t0,16
+	add	$t1,$t1,$carry
+	srdi	$carry,$t1,16
+	insrdi	$t0,$t1,16,32
+	add	$t2,$t2,$carry
+	ld	$t1,8($tp)		; tp[j]
+	srdi	$carry,$t2,16
+	insrdi	$t0,$t2,16,16
+	add	$t3,$t3,$carry
+	ldu	$t2,16($tp)		; tp[j+1]
+	srdi	$carry,$t3,16
+	insrdi	$t0,$t3,16,0		; 0..63 bits
+	add	$t4,$t4,$carry
+	srdi	$carry,$t4,16
+	add	$t5,$t5,$carry
+	srdi	$carry,$t5,16
+	insrdi	$t4,$t5,16,32
+	add	$t6,$t6,$carry
+	srdi	$carry,$t6,16
+	insrdi	$t4,$t6,16,16
+	add	$t7,$t7,$carry
+	insrdi	$t4,$t7,16,0		; 64..127 bits
+	srdi	$carry,$t7,16		; upper 33 bits
+	ld	$t6,`$FRAME+64`($sp)
+	ld	$t7,`$FRAME+72`($sp)
+
+	addc	$t3,$t0,$t1
+	adde	$t5,$t4,$t2
+	addze	$carry,$carry
+
+	std	$t3,-16($tp)		; tp[j-1]
+	std	$t5,-8($tp)		; tp[j]
+
+	add	$carry,$carry,$ovf	; consume upmost overflow
+	add	$t6,$t6,$carry		; can not overflow
+	srdi	$carry,$t6,16
+	add	$t7,$t7,$carry
+	insrdi	$t6,$t7,48,0
+	srdi	$ovf,$t7,48
+	std	$t6,0($tp)		; tp[num-1]
+
+	slwi	$t7,$num,2
+	addi	$i,$i,8
+	subf	$nap_d,$t7,$nap_d	; rewind pointer
+	cmpw	$i,$num
+	blt-	Louter
+
+	subf	$np,$num,$np	; rewind np
+	addi	$j,$j,1		; restore counter
+	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
+	addi	$tp,$sp,`$FRAME+$TRANSFER+8`
+	addi	$t4,$sp,`$FRAME+$TRANSFER+16`
+	addi	$t5,$np,8
+	addi	$t6,$rp,8
+	mtctr	$j
+
+.align	4
+Lsub:	ldx	$t0,$tp,$i
+	ldx	$t1,$np,$i
+	ldx	$t2,$t4,$i
+	ldx	$t3,$t5,$i
+	subfe	$t0,$t1,$t0	; tp[j]-np[j]
+	subfe	$t2,$t3,$t2	; tp[j+1]-np[j+1]
+	stdx	$t0,$rp,$i
+	stdx	$t2,$t6,$i
+	addi	$i,$i,16
+	bdnz-	Lsub
+
+	li	$i,0
+	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
+	and	$ap,$tp,$ovf
+	andc	$np,$rp,$ovf
+	or	$ap,$ap,$np	; ap=borrow?tp:rp
+	addi	$t7,$ap,8
+	mtctr	$j
+
+.align	4
+Lcopy:				; copy or in-place refresh
+	ldx	$t0,$ap,$i
+	ldx	$t1,$t7,$i
+	std	$i,8($nap_d)	; zap nap_d
+	std	$i,16($nap_d)
+	std	$i,24($nap_d)
+	std	$i,32($nap_d)
+	std	$i,40($nap_d)
+	std	$i,48($nap_d)
+	std	$i,56($nap_d)
+	stdu	$i,64($nap_d)
+	stdx	$t0,$rp,$i
+	stdx	$t1,$t6,$i
+	stdx	$i,$tp,$i	; zap tp at once
+	stdx	$i,$t4,$i
+	addi	$i,$i,16
+	bdnz-	Lcopy
+
+	$POP	r14,`2*$SIZE_T`($sp)
+	$POP	r15,`3*$SIZE_T`($sp)
+	$POP	r16,`4*$SIZE_T`($sp)
+	$POP	r17,`5*$SIZE_T`($sp)
+	$POP	r18,`6*$SIZE_T`($sp)
+	$POP	r19,`7*$SIZE_T`($sp)
+	$POP	r20,`8*$SIZE_T`($sp)
+	$POP	r21,`9*$SIZE_T`($sp)
+	$POP	r22,`10*$SIZE_T`($sp)
+	$POP	r23,`11*$SIZE_T`($sp)
+	lfd	f14,`12*$SIZE_T+0`($sp)
+	lfd	f15,`12*$SIZE_T+8`($sp)
+	lfd	f16,`12*$SIZE_T+16`($sp)
+	lfd	f17,`12*$SIZE_T+24`($sp)
+	lfd	f18,`12*$SIZE_T+32`($sp)
+	lfd	f19,`12*$SIZE_T+40`($sp)
+	lfd	f20,`12*$SIZE_T+48`($sp)
+	lfd	f21,`12*$SIZE_T+56`($sp)
+	lfd	f22,`12*$SIZE_T+64`($sp)
+	lfd	f23,`12*$SIZE_T+72`($sp)
+	lfd	f24,`12*$SIZE_T+80`($sp)
+	lfd	f25,`12*$SIZE_T+88`($sp)
+	$POP	$sp,0($sp)
+	li	r3,1	; signal "handled"
+	blr
+	.long	0
+.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl
new file mode 100644
index 0000000000..d23251033b
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/s390x-mont.pl
@@ -0,0 +1,225 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# April 2007.
+#
+# Performance improvement over vanilla C code varies from 85% to 45%
+# depending on key length and benchmark. Unfortunately in this context
+# these are not very impressive results [for code that utilizes "wide"
+# 64x64=128-bit multiplication, which is not commonly available to C
+# programmers]; at least a hand-coded bn_asm.c replacement is known to
+# provide 30-40% better results for the longest keys. Well, on second
+# thought it's not very surprising, because z-CPUs are single-issue
+# with _strictly_ in-order execution, while bn_mul_mont is more or
+# less dependent on the CPU's ability to pipeline instructions and
+# keep several of them "in-flight" at the same time. That is, while
+# other methods, for example Karatsuba, aim to minimize the number of
+# multiplications at the cost of an increase in other operations,
+# bn_mul_mont aims to neatly "overlap" multiplications with the other
+# operations [and on most platforms even minimize the amount of the
+# other operations, in particular references to memory]. But it's
+# possible to improve this module's performance by implementing a
+# dedicated squaring code path and possibly by unrolling loops...
+
+# January 2009.
+#
+# Reschedule to minimize/avoid Address Generation Interlock hazard,
+# make inner loops counter-based.
+
+$mn0="%r0";
+$num="%r1";
+
+# int bn_mul_mont(
+$rp="%r2";		# BN_ULONG *rp,
+$ap="%r3";		# const BN_ULONG *ap,
+$bp="%r4";		# const BN_ULONG *bp,
+$np="%r5";		# const BN_ULONG *np,
+$n0="%r6";		# const BN_ULONG *n0,
+#$num="160(%r15)"	# int num);
+
+$bi="%r2";	# zaps rp
+$j="%r7";
+
+$ahi="%r8";
+$alo="%r9";
+$nhi="%r10";
+$nlo="%r11";
+$AHI="%r12";
+$NHI="%r13";
+$count="%r14";
+$sp="%r15";
+
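+# Sketch (not part of the original import): the n0 word pulled below is
+# the precomputed -np[0]^-1 mod 2^64, set up by the caller's Montgomery
+# context. One way such a word can be derived is Newton/Hensel lifting,
+# which doubles the number of correct low bits per step:
+use Math::BigInt;
+sub n0_word {
+	my ($n) = @_;				# odd modulus word, Math::BigInt
+	my $W = Math::BigInt->new(1)->blsft(64);
+	my $x = Math::BigInt->new(1);		# inverse mod 2
+	$x = ($x * (2 - $n * $x)) % $W for 1..6; # 2,4,...,64 correct bits
+	return (-$x) % $W;			# negate: n*n0 == -1 mod 2^64
+}
+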
+$code.=<<___;
+.text
+.globl	bn_mul_mont
+.type	bn_mul_mont,\@function
+bn_mul_mont:
+	lgf	$num,164($sp)	# pull $num
+	sla	$num,3		# $num to enumerate bytes
+	la	$bp,0($num,$bp)
+
+	stg	%r2,16($sp)
+
+	cghi	$num,16		#
+	lghi	%r2,0		#
+	blr	%r14		# if($num<16) return 0;
+	cghi	$num,128	#
+	bhr	%r14		# if($num>128) return 0;
+
+	stmg	%r3,%r15,24($sp)
+
+	lghi	$rp,-160-8	# leave room for carry bit
+	lcgr	$j,$num		# -$num
+	lgr	%r0,$sp
+	la	$rp,0($rp,$sp)
+	la	$sp,0($j,$rp)	# alloca
+	stg	%r0,0($sp)	# back chain
+
+	sra	$num,3		# restore $num
+	la	$bp,0($j,$bp)	# restore $bp
+	ahi	$num,-1		# adjust $num for inner loop
+	lg	$n0,0($n0)	# pull n0
+
+	lg	$bi,0($bp)
+	lg	$alo,0($ap)
+	mlgr	$ahi,$bi	# ap[0]*bp[0]
+	lgr	$AHI,$ahi
+
+	lgr	$mn0,$alo	# "tp[0]"*n0
+	msgr	$mn0,$n0
+
+	lg	$nlo,0($np)	#
+	mlgr	$nhi,$mn0	# np[0]*m1
+	algr	$nlo,$alo	# +="tp[0]"
+	lghi	$NHI,0
+	alcgr	$NHI,$nhi
+
+	la	$j,8(%r0)	# j=1
+	lr	$count,$num
+
+.align	16
+.L1st:
+	lg	$alo,0($j,$ap)
+	mlgr	$ahi,$bi	# ap[j]*bp[0]
+	algr	$alo,$AHI
+	lghi	$AHI,0
+	alcgr	$AHI,$ahi
+
+	lg	$nlo,0($j,$np)
+	mlgr	$nhi,$mn0	# np[j]*m1
+	algr	$nlo,$NHI
+	lghi	$NHI,0
+	alcgr	$nhi,$NHI	# +="tp[j]"
+	algr	$nlo,$alo
+	alcgr	$NHI,$nhi
+
+	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
+	la	$j,8($j)	# j++
+	brct	$count,.L1st
+
+	algr	$NHI,$AHI
+	lghi	$AHI,0
+	alcgr	$AHI,$AHI	# upmost overflow bit
+	stg	$NHI,160-8($j,$sp)
+	stg	$AHI,160($j,$sp)
+	la	$bp,8($bp)	# bp++
+
+.Louter:
+	lg	$bi,0($bp)	# bp[i]
+	lg	$alo,0($ap)
+	mlgr	$ahi,$bi	# ap[0]*bp[i]
+	alg	$alo,160($sp)	# +=tp[0]
+	lghi	$AHI,0
+	alcgr	$AHI,$ahi
+
+	lgr	$mn0,$alo
+	msgr	$mn0,$n0	# tp[0]*n0
+
+	lg	$nlo,0($np)	# np[0]
+	mlgr	$nhi,$mn0	# np[0]*m1
+	algr	$nlo,$alo	# +="tp[0]"
+	lghi	$NHI,0
+	alcgr	$NHI,$nhi
+
+	la	$j,8(%r0)	# j=1
+	lr	$count,$num
+
+.align	16
+.Linner:
+	lg	$alo,0($j,$ap)
+	mlgr	$ahi,$bi	# ap[j]*bp[i]
+	algr	$alo,$AHI
+	lghi	$AHI,0
+	alcgr	$ahi,$AHI
+	alg	$alo,160($j,$sp)# +=tp[j]
+	alcgr	$AHI,$ahi
+
+	lg	$nlo,0($j,$np)
+	mlgr	$nhi,$mn0	# np[j]*m1
+	algr	$nlo,$NHI
+	lghi	$NHI,0
+	alcgr	$nhi,$NHI
+	algr	$nlo,$alo	# +="tp[j]"
+	alcgr	$NHI,$nhi
+
+	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
+	la	$j,8($j)	# j++
+	brct	$count,.Linner
+
+	algr	$NHI,$AHI
+	lghi	$AHI,0
+	alcgr	$AHI,$AHI
+	alg	$NHI,160($j,$sp)# accumulate previous upmost overflow bit
+	lghi	$ahi,0
+	alcgr	$AHI,$ahi	# new upmost overflow bit
+	stg	$NHI,160-8($j,$sp)
+	stg	$AHI,160($j,$sp)
+
+	la	$bp,8($bp)	# bp++
+	clg	$bp,160+8+32($j,$sp)	# compare to &bp[num]
+	jne	.Louter
+
+	lg	$rp,160+8+16($j,$sp)	# reincarnate rp
+	la	$ap,160($sp)
+	ahi	$num,1		# restore $num, incidentally clears "borrow"
+
+	la	$j,0(%r0)
+	lr	$count,$num
+.Lsub:	lg	$alo,0($j,$ap)
+	slbg	$alo,0($j,$np)
+	stg	$alo,0($j,$rp)
+	la	$j,8($j)
+	brct	$count,.Lsub
+	lghi	$ahi,0
+	slbgr	$AHI,$ahi	# handle upmost carry
+
+	ngr	$ap,$AHI
+	lghi	$np,-1
+	xgr	$np,$AHI
+	ngr	$np,$rp
+	ogr	$ap,$np		# ap=borrow?tp:rp
+
+	la	$j,0(%r0)
+	lgr	$count,$num
+.Lcopy:	lg	$alo,0($j,$ap)	# copy or in-place refresh
+	stg	$j,160($j,$sp)	# zap tp
+	stg	$alo,0($j,$rp)
+	la	$j,8($j)
+	brct	$count,.Lcopy
+
+	la	%r1,160+8+48($j,$sp)
+	lmg	%r6,%r15,0(%r1)
+	lghi	%r2,1		# signal "processed"
+	br	%r14
+.size	bn_mul_mont,.-bn_mul_mont
+.string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/s390x.S b/src/lib/libcrypto/bn/asm/s390x.S
new file mode 100755
index 0000000000..8f45f5d513
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/s390x.S
@@ -0,0 +1,678 @@
+.ident "s390x.S, version 1.0"
+// ====================================================================
+// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+// project.
+//
+// Rights for redistribution and usage in source and binary forms are
+// granted according to the OpenSSL license. Warranty of any kind is
+// disclaimed.
+// ====================================================================
+
+.text
+
+#define zero	%r0
+
+// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
+.globl	bn_mul_add_words
+.type	bn_mul_add_words,@function
+.align	4
+bn_mul_add_words:
+	lghi	zero,0		// zero = 0
+	la	%r1,0(%r2)	// put rp aside
+	lghi	%r2,0		// i=0;
+	ltgfr	%r4,%r4
+	bler	%r14		// if (len<=0) return 0;
+
+	stmg	%r6,%r10,48(%r15)
+	lghi	%r8,0		// carry = 0
+	srag	%r10,%r4,2	// cnt=len/4
+	jz	.Loop1_madd
+
+.Loop4_madd:
+	lg	%r7,0(%r2,%r3)	// ap[i]
+	mlgr	%r6,%r5		// *=w
+	algr	%r7,%r8		// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r2,%r1)	// +=rp[i]
+	alcgr	%r6,zero
+	stg	%r7,0(%r2,%r1)	// rp[i]=
+
+	lg	%r9,8(%r2,%r3)
+	mlgr	%r8,%r5
+	algr	%r9,%r6
+	alcgr	%r8,zero
+	alg	%r9,8(%r2,%r1)
+	alcgr	%r8,zero
+	stg	%r9,8(%r2,%r1)
+
+	lg	%r7,16(%r2,%r3)
+	mlgr	%r6,%r5
+	algr	%r7,%r8
+	alcgr	%r6,zero
+	alg	%r7,16(%r2,%r1)
+	alcgr	%r6,zero
+	stg	%r7,16(%r2,%r1)
+
+	lg	%r9,24(%r2,%r3)
+	mlgr	%r8,%r5
+	algr	%r9,%r6
+	alcgr	%r8,zero
+	alg	%r9,24(%r2,%r1)
+	alcgr	%r8,zero
+	stg	%r9,24(%r2,%r1)
+
+	la	%r2,32(%r2)	// i+=4
+	brct	%r10,.Loop4_madd
+
+	lghi	%r10,3
+	nr	%r4,%r10	// cnt=len%4
+	jz	.Lend_madd
+
+.Loop1_madd:
+	lg	%r7,0(%r2,%r3)	// ap[i]
+	mlgr	%r6,%r5		// *=w
+	algr	%r7,%r8		// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r2,%r1)	// +=rp[i]
+	alcgr	%r6,zero
+	stg	%r7,0(%r2,%r1)	// rp[i]=
+
+	lgr	%r8,%r6
+	la	%r2,8(%r2)	// i++
+	brct	%r4,.Loop1_madd
+
+.Lend_madd:
+	lgr	%r2,%r8
+	lmg	%r6,%r10,48(%r15)
+	br	%r14
+.size	bn_mul_add_words,.-bn_mul_add_words
+
+// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
+.globl	bn_mul_words
+.type	bn_mul_words,@function
+.align	4
+bn_mul_words:
+	lghi	zero,0		// zero = 0
+	la	%r1,0(%r2)	// put rp aside
+	lghi	%r2,0		// i=0;
+	ltgfr	%r4,%r4
+	bler	%r14		// if (len<=0) return 0;
+
+	stmg	%r6,%r10,48(%r15)
+	lghi	%r8,0		// carry = 0
+	srag	%r10,%r4,2	// cnt=len/4
+	jz	.Loop1_mul
+
+.Loop4_mul:
+	lg	%r7,0(%r2,%r3)	// ap[i]
+	mlgr	%r6,%r5		// *=w
+	algr	%r7,%r8		// +=carry
+	alcgr	%r6,zero
+	stg	%r7,0(%r2,%r1)	// rp[i]=
+
+	lg	%r9,8(%r2,%r3)
+	mlgr	%r8,%r5
+	algr	%r9,%r6
+	alcgr	%r8,zero
+	stg	%r9,8(%r2,%r1)
+
+	lg	%r7,16(%r2,%r3)
+	mlgr	%r6,%r5
+	algr	%r7,%r8
+	alcgr	%r6,zero
+	stg	%r7,16(%r2,%r1)
+
+	lg	%r9,24(%r2,%r3)
+	mlgr	%r8,%r5
+	algr	%r9,%r6
+	alcgr	%r8,zero
+	stg	%r9,24(%r2,%r1)
+
+	la	%r2,32(%r2)	// i+=4
+	brct	%r10,.Loop4_mul
+
+	lghi	%r10,3
+	nr	%r4,%r10	// cnt=len%4
+	jz	.Lend_mul
+
+.Loop1_mul:
+	lg	%r7,0(%r2,%r3)	// ap[i]
+	mlgr	%r6,%r5		// *=w
+	algr	%r7,%r8		// +=carry
+	alcgr	%r6,zero
+	stg	%r7,0(%r2,%r1)	// rp[i]=
+
+	lgr	%r8,%r6
+	la	%r2,8(%r2)	// i++
+	brct	%r4,.Loop1_mul
+
+.Lend_mul:
+	lgr	%r2,%r8
+	lmg	%r6,%r10,48(%r15)
+	br	%r14
+.size	bn_mul_words,.-bn_mul_words
+
+// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4)
+.globl	bn_sqr_words
+.type	bn_sqr_words,@function
+.align	4
+bn_sqr_words:
+	ltgfr	%r4,%r4
+	bler	%r14
+
+	stmg	%r6,%r7,48(%r15)
+	srag	%r1,%r4,2	// cnt=len/4
+	jz	.Loop1_sqr
+
+.Loop4_sqr:
+	lg	%r7,0(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,0(%r2)
+	stg	%r6,8(%r2)
+
+	lg	%r7,8(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,16(%r2)
+	stg	%r6,24(%r2)
+
+	lg	%r7,16(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,32(%r2)
+	stg	%r6,40(%r2)
+
+	lg	%r7,24(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,48(%r2)
+	stg	%r6,56(%r2)
+
+	la	%r3,32(%r3)
+	la	%r2,64(%r2)
+	brct	%r1,.Loop4_sqr
+
+	lghi	%r1,3
+	nr	%r4,%r1		// cnt=len%4
+	jz	.Lend_sqr
+
+.Loop1_sqr:
+	lg	%r7,0(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,0(%r2)
+	stg	%r6,8(%r2)
+
+	la	%r3,8(%r3)
+	la	%r2,16(%r2)
+	brct	%r4,.Loop1_sqr
+
+.Lend_sqr:
+	lmg	%r6,%r7,48(%r15)
+	br	%r14
+.size	bn_sqr_words,.-bn_sqr_words
+
+// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
+.globl	bn_div_words
+.type	bn_div_words,@function
+.align	4
+bn_div_words:
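+	// dlgr divides the 128-bit value h:l held in the even/odd pair
+	// %r2:%r3 by %r4: the remainder replaces %r2, the quotient
+	// replaces %r3 and is then copied to %r2 for return.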
+	dlgr	%r2,%r4
+	lgr	%r2,%r3
+	br	%r14
+.size	bn_div_words,.-bn_div_words
+
+// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
+.globl	bn_add_words
+.type	bn_add_words,@function
+.align	4
+bn_add_words:
+	la	%r1,0(%r2)	// put rp aside
+	lghi	%r2,0		// i=0
+	ltgfr	%r5,%r5
+	bler	%r14		// if (len<=0) return 0;
+
+	stg	%r6,48(%r15)
+	lghi	%r6,3
+	nr	%r6,%r5		// len%4
+	sra	%r5,2		// len/4, use sra because it sets condition code
+	jz	.Loop1_add	// carry is incidentally cleared if branch taken
+	algr	%r2,%r2		// clear carry
+
+.Loop4_add:
+	lg	%r0,0(%r2,%r3)
+	alcg	%r0,0(%r2,%r4)
+	stg	%r0,0(%r2,%r1)
+	lg	%r0,8(%r2,%r3)
+	alcg	%r0,8(%r2,%r4)
+	stg	%r0,8(%r2,%r1)
+	lg	%r0,16(%r2,%r3)
+	alcg	%r0,16(%r2,%r4)
+	stg	%r0,16(%r2,%r1)
+	lg	%r0,24(%r2,%r3)
+	alcg	%r0,24(%r2,%r4)
+	stg	%r0,24(%r2,%r1)
+
+	la	%r2,32(%r2)	// i+=4
+	brct	%r5,.Loop4_add
+
+	la	%r6,1(%r6)	// see if len%4 is zero ...
+	brct	%r6,.Loop1_add	// without touching condition code:-)
+
+.Lexit_add:
+	lghi	%r2,0
+	alcgr	%r2,%r2
+	lg	%r6,48(%r15)
+	br	%r14
+
+.Loop1_add:
+	lg	%r0,0(%r2,%r3)
+	alcg	%r0,0(%r2,%r4)
+	stg	%r0,0(%r2,%r1)
+
+	la	%r2,8(%r2)	// i++
+	brct	%r6,.Loop1_add
+
+	j	.Lexit_add
+.size	bn_add_words,.-bn_add_words
+
+// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
+.globl	bn_sub_words
+.type	bn_sub_words,@function
+.align	4
+bn_sub_words:
+	la	%r1,0(%r2)	// put rp aside
+	lghi	%r2,0		// i=0
+	ltgfr	%r5,%r5
+	bler	%r14		// if (len<=0) return 0;
+
+	stg	%r6,48(%r15)
+	lghi	%r6,3
+	nr	%r6,%r5		// len%4
+	sra	%r5,2		// len/4, use sra because it sets condition code
+	jnz	.Loop4_sub	// borrow is incidentally cleared if branch taken
+	slgr	%r2,%r2		// clear borrow
+
+.Loop1_sub:
+	lg	%r0,0(%r2,%r3)
+	slbg	%r0,0(%r2,%r4)
+	stg	%r0,0(%r2,%r1)
+
+	la	%r2,8(%r2)	// i++
+	brct	%r6,.Loop1_sub
+	j	.Lexit_sub
+
+.Loop4_sub:
+	lg	%r0,0(%r2,%r3)
+	slbg	%r0,0(%r2,%r4)
+	stg	%r0,0(%r2,%r1)
+	lg	%r0,8(%r2,%r3)
+	slbg	%r0,8(%r2,%r4)
+	stg	%r0,8(%r2,%r1)
+	lg	%r0,16(%r2,%r3)
+	slbg	%r0,16(%r2,%r4)
+	stg	%r0,16(%r2,%r1)
+	lg	%r0,24(%r2,%r3)
+	slbg	%r0,24(%r2,%r4)
+	stg	%r0,24(%r2,%r1)
+
+	la	%r2,32(%r2)	// i+=4
+	brct	%r5,.Loop4_sub
+
+	la	%r6,1(%r6)	// see if len%4 is zero ...
+	brct	%r6,.Loop1_sub	// without touching condition code:-)
+
+.Lexit_sub:
+	lghi	%r2,0
+	slbgr	%r2,%r2
+	lcgr	%r2,%r2
+	lg	%r6,48(%r15)
+	br	%r14
+.size	bn_sub_words,.-bn_sub_words
+
+#define c1	%r1
+#define c2	%r5
+#define c3	%r8
+
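+// mul_add_c(ai,bi,c1,c2,c3): accumulate the 128-bit product
+// a[ai]*b[bi] into the three-word comba column c3:c2:c1; mlg leaves
+// the product in the even/odd pair %r6:%r7 and the two alcgr's
+// ripple the carry upwards.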
+#define mul_add_c(ai,bi,c1,c2,c3)	\
+	lg	%r7,ai*8(%r3);		\
+	mlg	%r6,bi*8(%r4);		\
+	algr	c1,%r7;			\
+	alcgr	c2,%r6;			\
+	alcgr	c3,zero
+
+// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+.globl	bn_mul_comba8
+.type	bn_mul_comba8,@function
+.align	4
+bn_mul_comba8:
+	stmg	%r6,%r8,48(%r15)
+
+	lghi	c1,0
+	lghi	c2,0
+	lghi	c3,0
+	lghi	zero,0
+
+	mul_add_c(0,0,c1,c2,c3);
+	stg	c1,0*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(0,1,c2,c3,c1);
+	mul_add_c(1,0,c2,c3,c1);
+	stg	c2,1*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(2,0,c3,c1,c2);
+	mul_add_c(1,1,c3,c1,c2);
+	mul_add_c(0,2,c3,c1,c2);
+	stg	c3,2*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(0,3,c1,c2,c3);
+	mul_add_c(1,2,c1,c2,c3);
+	mul_add_c(2,1,c1,c2,c3);
+	mul_add_c(3,0,c1,c2,c3);
+	stg	c1,3*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(4,0,c2,c3,c1);
+	mul_add_c(3,1,c2,c3,c1);
+	mul_add_c(2,2,c2,c3,c1);
+	mul_add_c(1,3,c2,c3,c1);
+	mul_add_c(0,4,c2,c3,c1);
+	stg	c2,4*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(0,5,c3,c1,c2);
+	mul_add_c(1,4,c3,c1,c2);
+	mul_add_c(2,3,c3,c1,c2);
+	mul_add_c(3,2,c3,c1,c2);
+	mul_add_c(4,1,c3,c1,c2);
+	mul_add_c(5,0,c3,c1,c2);
+	stg	c3,5*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(6,0,c1,c2,c3);
+	mul_add_c(5,1,c1,c2,c3);
+	mul_add_c(4,2,c1,c2,c3);
+	mul_add_c(3,3,c1,c2,c3);
+	mul_add_c(2,4,c1,c2,c3);
+	mul_add_c(1,5,c1,c2,c3);
+	mul_add_c(0,6,c1,c2,c3);
+	stg	c1,6*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(0,7,c2,c3,c1);
+	mul_add_c(1,6,c2,c3,c1);
+	mul_add_c(2,5,c2,c3,c1);
+	mul_add_c(3,4,c2,c3,c1);
+	mul_add_c(4,3,c2,c3,c1);
+	mul_add_c(5,2,c2,c3,c1);
+	mul_add_c(6,1,c2,c3,c1);
+	mul_add_c(7,0,c2,c3,c1);
+	stg	c2,7*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(7,1,c3,c1,c2);
+	mul_add_c(6,2,c3,c1,c2);
+	mul_add_c(5,3,c3,c1,c2);
+	mul_add_c(4,4,c3,c1,c2);
+	mul_add_c(3,5,c3,c1,c2);
+	mul_add_c(2,6,c3,c1,c2);
+	mul_add_c(1,7,c3,c1,c2);
+	stg	c3,8*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(2,7,c1,c2,c3);
+	mul_add_c(3,6,c1,c2,c3);
+	mul_add_c(4,5,c1,c2,c3);
+	mul_add_c(5,4,c1,c2,c3);
+	mul_add_c(6,3,c1,c2,c3);
+	mul_add_c(7,2,c1,c2,c3);
+	stg	c1,9*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(7,3,c2,c3,c1);
+	mul_add_c(6,4,c2,c3,c1);
+	mul_add_c(5,5,c2,c3,c1);
+	mul_add_c(4,6,c2,c3,c1);
+	mul_add_c(3,7,c2,c3,c1);
+	stg	c2,10*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(4,7,c3,c1,c2);
+	mul_add_c(5,6,c3,c1,c2);
+	mul_add_c(6,5,c3,c1,c2);
+	mul_add_c(7,4,c3,c1,c2);
+	stg	c3,11*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(7,5,c1,c2,c3);
+	mul_add_c(6,6,c1,c2,c3);
+	mul_add_c(5,7,c1,c2,c3);
+	stg	c1,12*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(6,7,c2,c3,c1);
+	mul_add_c(7,6,c2,c3,c1);
+	stg	c2,13*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(7,7,c3,c1,c2);
+	stg	c3,14*8(%r2)
+	stg	c1,15*8(%r2)
+
+	lmg	%r6,%r8,48(%r15)
+	br	%r14
+.size	bn_mul_comba8,.-bn_mul_comba8
+
+// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+.globl	bn_mul_comba4
+.type	bn_mul_comba4,@function
+.align	4
+bn_mul_comba4:
+	stmg	%r6,%r8,48(%r15)
+
+	lghi	c1,0
+	lghi	c2,0
+	lghi	c3,0
+	lghi	zero,0
+
+	mul_add_c(0,0,c1,c2,c3);
+	stg	c1,0*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(0,1,c2,c3,c1);
+	mul_add_c(1,0,c2,c3,c1);
+	stg	c2,1*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(2,0,c3,c1,c2);
+	mul_add_c(1,1,c3,c1,c2);
+	mul_add_c(0,2,c3,c1,c2);
+	stg	c3,2*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(0,3,c1,c2,c3);
+	mul_add_c(1,2,c1,c2,c3);
+	mul_add_c(2,1,c1,c2,c3);
+	mul_add_c(3,0,c1,c2,c3);
+	stg	c1,3*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(3,1,c2,c3,c1);
+	mul_add_c(2,2,c2,c3,c1);
+	mul_add_c(1,3,c2,c3,c1);
+	stg	c2,4*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(2,3,c3,c1,c2);
+	mul_add_c(3,2,c3,c1,c2);
+	stg	c3,5*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(3,3,c1,c2,c3);
+	stg	c1,6*8(%r2)
+	stg	c2,7*8(%r2)
+
+	lmg	%r6,%r8,48(%r15)
+	br	%r14
+.size	bn_mul_comba4,.-bn_mul_comba4
+
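+// sqr_add_c(ai,...) accumulates the square a[ai]^2 into the comba
+// column c3:c2:c1; sqr_add_c2(ai,aj,...) accumulates the cross
+// product a[ai]*a[aj] twice, as every off-diagonal term occurs twice
+// in a square.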
+#define sqr_add_c(ai,c1,c2,c3)		\
+	lg	%r7,ai*8(%r3);		\
+	mlgr	%r6,%r7;		\
+	algr	c1,%r7;			\
+	alcgr	c2,%r6;			\
+	alcgr	c3,zero
+
+#define sqr_add_c2(ai,aj,c1,c2,c3)	\
+	lg	%r7,ai*8(%r3);		\
+	mlg	%r6,aj*8(%r3);		\
+	algr	c1,%r7;			\
+	alcgr	c2,%r6;			\
+	alcgr	c3,zero;		\
+	algr	c1,%r7;			\
+	alcgr	c2,%r6;			\
+	alcgr	c3,zero
+
+// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
+.globl	bn_sqr_comba8
+.type	bn_sqr_comba8,@function
+.align	4
+bn_sqr_comba8:
+	stmg	%r6,%r8,48(%r15)
+
+	lghi	c1,0
+	lghi	c2,0
+	lghi	c3,0
+	lghi	zero,0
+
+	sqr_add_c(0,c1,c2,c3);
+	stg	c1,0*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c2(1,0,c2,c3,c1);
+	stg	c2,1*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c(1,c3,c1,c2);
+	sqr_add_c2(2,0,c3,c1,c2);
+	stg	c3,2*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c2(3,0,c1,c2,c3);
+	sqr_add_c2(2,1,c1,c2,c3);
+	stg	c1,3*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c(2,c2,c3,c1);
+	sqr_add_c2(3,1,c2,c3,c1);
+	sqr_add_c2(4,0,c2,c3,c1);
+	stg	c2,4*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c2(5,0,c3,c1,c2);
+	sqr_add_c2(4,1,c3,c1,c2);
+	sqr_add_c2(3,2,c3,c1,c2);
+	stg	c3,5*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c(3,c1,c2,c3);
+	sqr_add_c2(4,2,c1,c2,c3);
+	sqr_add_c2(5,1,c1,c2,c3);
+	sqr_add_c2(6,0,c1,c2,c3);
+	stg	c1,6*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c2(7,0,c2,c3,c1);
+	sqr_add_c2(6,1,c2,c3,c1);
+	sqr_add_c2(5,2,c2,c3,c1);
+	sqr_add_c2(4,3,c2,c3,c1);
+	stg	c2,7*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c(4,c3,c1,c2);
+	sqr_add_c2(5,3,c3,c1,c2);
+	sqr_add_c2(6,2,c3,c1,c2);
+	sqr_add_c2(7,1,c3,c1,c2);
+	stg	c3,8*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c2(7,2,c1,c2,c3);
+	sqr_add_c2(6,3,c1,c2,c3);
+	sqr_add_c2(5,4,c1,c2,c3);
+	stg	c1,9*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c(5,c2,c3,c1);
+	sqr_add_c2(6,4,c2,c3,c1);
+	sqr_add_c2(7,3,c2,c3,c1);
+	stg	c2,10*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c2(7,4,c3,c1,c2);
+	sqr_add_c2(6,5,c3,c1,c2);
+	stg	c3,11*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c(6,c1,c2,c3);
+	sqr_add_c2(7,5,c1,c2,c3);
+	stg	c1,12*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c2(7,6,c2,c3,c1);
+	stg	c2,13*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c(7,c3,c1,c2);
+	stg	c3,14*8(%r2)
+	stg	c1,15*8(%r2)
+
+	lmg	%r6,%r8,48(%r15)
+	br	%r14
+.size	bn_sqr_comba8,.-bn_sqr_comba8
+
+// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
+.globl	bn_sqr_comba4
+.type	bn_sqr_comba4,@function
+.align	4
+bn_sqr_comba4:
+	stmg	%r6,%r8,48(%r15)
+
+	lghi	c1,0
+	lghi	c2,0
+	lghi	c3,0
+	lghi	zero,0
+
+	sqr_add_c(0,c1,c2,c3);
+	stg	c1,0*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c2(1,0,c2,c3,c1);
+	stg	c2,1*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c(1,c3,c1,c2);
+	sqr_add_c2(2,0,c3,c1,c2);
+	stg	c3,2*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c2(3,0,c1,c2,c3);
+	sqr_add_c2(2,1,c1,c2,c3);
+	stg	c1,3*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c(2,c2,c3,c1);
+	sqr_add_c2(3,1,c2,c3,c1);
+	stg	c2,4*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c2(3,2,c3,c1,c2);
+	stg	c3,5*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c(3,c1,c2,c3);
+	stg	c1,6*8(%r2)
+	stg	c2,7*8(%r2)
+
+	lmg	%r6,%r8,48(%r15)
+	br	%r14
+.size	bn_sqr_comba4,.-bn_sqr_comba4
diff --git a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl
new file mode 100644
index 0000000000..b8fb1e8a25
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl
@@ -0,0 +1,606 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# December 2005
+#
+# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
+# for undertaking this effort are multiple. First of all, UltraSPARC is
+# not the whole SPARCv9 universe and other VIS-free implementations
+# deserve optimized code just as much. Secondly, the newly introduced
+# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
+# FPU-intensive paths, such as sparcv9a-mont, would simply sink it.
+# Yes, T1 is equipped with several integrated RSA/DSA accelerator
+# circuits accessible through a kernel driver [only(*)], but having a
+# decent user-land software implementation is important too. Finally,
+# there was the desire to experiment with a dedicated squaring
+# procedure. Yes, this module implements one, because it was easiest
+# to draft it in SPARCv9 instructions...
+
+# (*)	An engine accessing the driver in question is on my TODO list.
+#	For reference, the accelerator is estimated to give a 6 to 10
+#	times improvement on single-threaded RSA sign. It should be
+#	noted that a 6-10x improvement coefficient does not actually
+#	mean anything extraordinary in terms of absolute [single-
+#	threaded] performance, as the SPARCv9 instruction set is by all
+#	means the least suitable for high performance crypto among
+#	64-bit platforms. The 6-10x factor simply places T1 in the same
+#	performance domain as, say, AMD64 and IA-64. The improvement on
+#	RSA verify doesn't appear impressive at all, but it's the sign
+#	operation which is far more critical/interesting.
+
+# You might notice that the inner loops are modulo-scheduled:-) This
+# has essentially negligible impact on UltraSPARC performance; it's
+# the Fujitsu SPARC64 V users who should notice and hopefully
+# appreciate the advantage... Currently this module surpasses
+# sparcv9a-mont.pl by ~20% on UltraSPARC-III and later cores, but
+# recall that the sparcv9a module still has hidden potential [see the
+# TODO list there], which is estimated to be larger than 20%...
+
+# int bn_mul_mont(
+$rp="%i0";	# BN_ULONG *rp,
+$ap="%i1";	# const BN_ULONG *ap,
+$bp="%i2";	# const BN_ULONG *bp,
+$np="%i3";	# const BN_ULONG *np,
+$n0="%i4";	# const BN_ULONG *n0,
+$num="%i5";	# int num);
+
+$bits=32;
+for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64)	{ $bias=2047; $frame=192; }
+else		{ $bias=0;    $frame=128; }
+
+$car0="%o0";
+$car1="%o1";
+$car2="%o2";	# 1 bit
+$acc0="%o3";
+$acc1="%o4";
+$mask="%g1";	# 32 bits, what a waste...
+$tmp0="%g4";
+$tmp1="%g5";
+
+$i="%l0";
+$j="%l1";
+$mul0="%l2";
+$mul1="%l3";
+$tp="%l4";
+$apj="%l5";
+$npj="%l6";
+$tpj="%l7";
+
+$fname="bn_mul_mont_int";
+
+$code=<<___;
+.section	".text",#alloc,#execinstr
+
+.global	$fname
+.align	32
+$fname:
+	cmp	%o5,4			! 128 bits minimum
+	bge,pt	%icc,.Lenter
+	sethi	%hi(0xffffffff),$mask
+	retl
+	clr	%o0
+.align	32
+.Lenter:
+	save	%sp,-$frame,%sp
+	sll	$num,2,$num		! num*=4
+	or	$mask,%lo(0xffffffff),$mask
+	ld	[$n0],$n0
+	cmp	$ap,$bp
+	and	$num,$mask,$num
+	ld	[$bp],$mul0		! bp[0]
+	nop
+
+	add	%sp,$bias,%o7		! real top of stack
+	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
+	sub	%o7,$num,%o7
+	ld	[$ap+4],$apj		! ap[1]
+	and	%o7,-1024,%o7
+	ld	[$np],$car1		! np[0]
+	sub	%o7,$bias,%sp		! alloca
+	ld	[$np+4],$npj		! np[1]
+	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
+	mov	12,$j
+
+	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
+	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
+	and	$car0,$mask,$acc0
+	add	%sp,$bias+$frame,$tp
+	ld	[$ap+8],$apj		!prologue!
+
+	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
+	and	$mul1,$mask,$mul1
+
+	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
+	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	ld	[$np+8],$npj		!prologue!
+	srlx	$car1,32,$car1
+	mov	$tmp0,$acc0		!prologue!
+
+.L1st:
+	mulx	$apj,$mul0,$tmp0
+	mulx	$npj,$mul1,$tmp1
+	add	$acc0,$car0,$car0
+	ld	[$ap+$j],$apj		! ap[j]
+	and	$car0,$mask,$acc0
+	add	$acc1,$car1,$car1
+	ld	[$np+$j],$npj		! np[j]
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	add	$j,4,$j			! j++
+	mov	$tmp0,$acc0
+	st	$car1,[$tp]
+	cmp	$j,$num
+	mov	$tmp1,$acc1
+	srlx	$car1,32,$car1
+	bl	%icc,.L1st
+	add	$tp,4,$tp		! tp++
+!.L1st
+
+	mulx	$apj,$mul0,$tmp0	!epilogue!
+	mulx	$npj,$mul1,$tmp1
+	add	$acc0,$car0,$car0
+	and	$car0,$mask,$acc0
+	add	$acc1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+
+	add	$tmp0,$car0,$car0
+	and	$car0,$mask,$acc0
+	add	$tmp1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car1,$car1
+	st	$car1,[$tp+8]
+	srlx	$car1,32,$car2
+
+	mov	4,$i			! i++
+	ld	[$bp+4],$mul0		! bp[1]
+.Louter:
+	add	%sp,$bias+$frame,$tp
+	ld	[$ap],$car0		! ap[0]
+	ld	[$ap+4],$apj		! ap[1]
+	ld	[$np],$car1		! np[0]
+	ld	[$np+4],$npj		! np[1]
+	ld	[$tp],$tmp1		! tp[0]
+	ld	[$tp+4],$tpj		! tp[1]
+	mov	12,$j
+
+	mulx	$car0,$mul0,$car0
+	mulx	$apj,$mul0,$tmp0	!prologue!
+	add	$tmp1,$car0,$car0
+	ld	[$ap+8],$apj		!prologue!
+	and	$car0,$mask,$acc0
+
+	mulx	$n0,$acc0,$mul1
+	and	$mul1,$mask,$mul1
+
+	mulx	$car1,$mul1,$car1
+	mulx	$npj,$mul1,$acc1	!prologue!
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	ld	[$np+8],$npj		!prologue!
+	srlx	$car1,32,$car1
+	mov	$tmp0,$acc0		!prologue!
+
+.Linner:
+	mulx	$apj,$mul0,$tmp0
+	mulx	$npj,$mul1,$tmp1
+	add	$tpj,$car0,$car0
+	ld	[$ap+$j],$apj		! ap[j]
+	add	$acc0,$car0,$car0
+	add	$acc1,$car1,$car1
+	ld	[$np+$j],$npj		! np[j]
+	and	$car0,$mask,$acc0
+	ld	[$tp+8],$tpj		! tp[j]
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	add	$j,4,$j			! j++
+	mov	$tmp0,$acc0
+	st	$car1,[$tp]		! tp[j-1]
+	srlx	$car1,32,$car1
+	mov	$tmp1,$acc1
+	cmp	$j,$num
+	bl	%icc,.Linner
+	add	$tp,4,$tp		! tp++
+!.Linner
+
+	mulx	$apj,$mul0,$tmp0	!epilogue!
+	mulx	$npj,$mul1,$tmp1
+	add	$tpj,$car0,$car0
+	add	$acc0,$car0,$car0
+	ld	[$tp+8],$tpj		! tp[j]
+	and	$car0,$mask,$acc0
+	add	$acc1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]		! tp[j-1]
+	srlx	$car1,32,$car1
+
+	add	$tpj,$car0,$car0
+	add	$tmp0,$car0,$car0
+	and	$car0,$mask,$acc0
+	add	$tmp1,$car1,$car1
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp+4]		! tp[j-1]
+	srlx	$car0,32,$car0
+	add	$i,4,$i			! i++
+	srlx	$car1,32,$car1
+
+	add	$car0,$car1,$car1
+	cmp	$i,$num
+	add	$car2,$car1,$car1
+	st	$car1,[$tp+8]
+
+	srlx	$car1,32,$car2
+	bl,a	%icc,.Louter
+	ld	[$bp+$i],$mul0		! bp[i]
+!.Louter
+
+	add	$tp,12,$tp
+
+.Ltail:
+	add	$np,$num,$np
+	add	$rp,$num,$rp
+	mov	$tp,$ap
+	sub	%g0,$num,%o7		! k=-num
+	ba	.Lsub
+	subcc	%g0,%g0,%g0		! clear %icc.c
+.align	16
+.Lsub:
+	ld	[$tp+%o7],%o0
+	ld	[$np+%o7],%o1
+	subccc	%o0,%o1,%o1		! tp[j]-np[j]
+	add	$rp,%o7,$i
+	add	%o7,4,%o7
+	brnz	%o7,.Lsub
+	st	%o1,[$i]
+	subc	$car2,0,$car2		! handle upmost overflow bit
+	and	$tp,$car2,$ap
+	andn	$rp,$car2,$np
+	or	$ap,$np,$ap
+	sub	%g0,$num,%o7
+
+.Lcopy:
+	ld	[$ap+%o7],%o0		! copy or in-place refresh
+	st	%g0,[$tp+%o7]		! zap tp
+	st	%o0,[$rp+%o7]
+	add	%o7,4,%o7
+	brnz	%o7,.Lcopy
+	nop
+	mov	1,%i0
+	ret
+	restore
+___
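+
+# The .Lsub/.Lcopy tail above selects the result without branching:
+# after the subtraction loop, 'subc' turns $car2 into either 0 or
+# all-ones, so the and/andn/or triplet amounts to (a sketch)
+#
+#	src = (tp & car2) | (rp & ~car2);	/* t < n ? tp : rp */
+#
+# i.e. .Lcopy refreshes rp from the unreduced tp when the subtraction
+# borrowed (t < n), and from the freshly stored difference otherwise,
+# zapping tp along the way.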
+
+########
+######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
+######## code without the dedicated squaring procedure that follows.
+########
+$sbit="%i2";		# re-use $bp!
+
+$code.=<<___;
+.align	32
+.Lbn_sqr_mont:
+	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
+	mulx	$apj,$mul0,$tmp0		!prologue!
+	and	$car0,$mask,$acc0
+	add	%sp,$bias+$frame,$tp
+	ld	[$ap+8],$apj			!prologue!
+
+	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
+	srlx	$car0,32,$car0
+	and	$mul1,$mask,$mul1
+
+	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
+	mulx	$npj,$mul1,$acc1		!prologue!
+	and	$car0,1,$sbit
+	ld	[$np+8],$npj			!prologue!
+	srlx	$car0,1,$car0
+	add	$acc0,$car1,$car1
+	srlx	$car1,32,$car1
+	mov	$tmp0,$acc0			!prologue!
+
+.Lsqr_1st:
+	mulx	$apj,$mul0,$tmp0
+	mulx	$npj,$mul1,$tmp1
+	add	$acc0,$car0,$car0		! ap[j]*a0+c0
+	add	$acc1,$car1,$car1
+	ld	[$ap+$j],$apj			! ap[j]
+	and	$car0,$mask,$acc0
+	ld	[$np+$j],$npj			! np[j]
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	mov	$tmp1,$acc1
+	srlx	$acc0,32,$sbit
+	add	$j,4,$j				! j++
+	and	$acc0,$mask,$acc0
+	cmp	$j,$num
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]
+	mov	$tmp0,$acc0
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_1st
+	add	$tp,4,$tp			! tp++
+!.Lsqr_1st
+
+	mulx	$apj,$mul0,$tmp0		! epilogue
+	mulx	$npj,$mul1,$tmp1
+	add	$acc0,$car0,$car0		! ap[j]*a0+c0
+	add	$acc1,$car1,$car1
+	and	$car0,$mask,$acc0
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+
+	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
+	add	$tmp1,$car1,$car1
+	and	$car0,$mask,$acc0
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car0,$car0
+	or	$sbit,$car0,$car0
+	add	$car0,$car1,$car1
+	st	$car1,[$tp+8]
+	srlx	$car1,32,$car2
+
+	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
+	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
+	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
+	ld	[$ap+4],$mul0			! ap[1]
+	ld	[$ap+8],$apj			! ap[2]
+	ld	[$np],$car1			! np[0]
+	ld	[$np+4],$npj			! np[1]
+	mulx	$n0,$tmp0,$mul1
+
+	mulx	$mul0,$mul0,$car0
+	and	$mul1,$mask,$mul1
+
+	mulx	$car1,$mul1,$car1
+	mulx	$npj,$mul1,$acc1
+	add	$tmp0,$car1,$car1
+	and	$car0,$mask,$acc0
+	ld	[$np+8],$npj			! np[2]
+	srlx	$car1,32,$car1
+	add	$tmp1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	and	$car0,1,$sbit
+	add	$acc1,$car1,$car1
+	srlx	$car0,1,$car0
+	mov	12,$j
+	st	$car1,[%sp+$bias+$frame]	! tp[0]=
+	srlx	$car1,32,$car1
+	add	%sp,$bias+$frame+4,$tp
+
+.Lsqr_2nd:
+	mulx	$apj,$mul0,$acc0
+	mulx	$npj,$mul1,$acc1
+	add	$acc0,$car0,$car0
+	add	$tpj,$car1,$car1
+	ld	[$ap+$j],$apj			! ap[j]
+	and	$car0,$mask,$acc0
+	ld	[$np+$j],$npj			! np[j]
+	srlx	$car0,32,$car0
+	add	$acc1,$car1,$car1
+	ld	[$tp+8],$tpj			! tp[j]
+	add	$acc0,$acc0,$acc0
+	add	$j,4,$j				! j++
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	cmp	$j,$num
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]			! tp[j-1]
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_2nd
+	add	$tp,4,$tp			! tp++
+!.Lsqr_2nd
+
+	mulx	$apj,$mul0,$acc0
+	mulx	$npj,$mul1,$acc1
+	add	$acc0,$car0,$car0
+	add	$tpj,$car1,$car1
+	and	$car0,$mask,$acc0
+	srlx	$car0,32,$car0
+	add	$acc1,$car1,$car1
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]			! tp[j-1]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car0,$car0
+	or	$sbit,$car0,$car0
+	add	$car0,$car1,$car1
+	add	$car2,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car2
+
+	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
+	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
+	ld	[$ap+8],$mul0			! ap[2]
+	ld	[$np],$car1			! np[0]
+	ld	[$np+4],$npj			! np[1]
+	mulx	$n0,$tmp1,$mul1
+	and	$mul1,$mask,$mul1
+	mov	8,$i
+
+	mulx	$mul0,$mul0,$car0
+	mulx	$car1,$mul1,$car1
+	and	$car0,$mask,$acc0
+	add	$tmp1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	%sp,$bias+$frame,$tp
+	srlx	$car1,32,$car1
+	and	$car0,1,$sbit
+	srlx	$car0,1,$car0
+	mov	4,$j
+
+.Lsqr_outer:
+.Lsqr_inner1:
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$j,4,$j
+	ld	[$tp+8],$tpj
+	cmp	$j,$i
+	add	$acc1,$car1,$car1
+	ld	[$np+$j],$npj
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_inner1
+	add	$tp,4,$tp
+!.Lsqr_inner1
+
+	add	$j,4,$j
+	ld	[$ap+$j],$apj			! ap[j]
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	ld	[$np+$j],$npj			! np[j]
+	add	$acc0,$car1,$car1
+	ld	[$tp+8],$tpj			! tp[j]
+	add	$acc1,$car1,$car1
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+
+	add	$j,4,$j
+	cmp	$j,$num
+	be,pn	%icc,.Lsqr_no_inner2
+	add	$tp,4,$tp
+
+.Lsqr_inner2:
+	mulx	$apj,$mul0,$acc0
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$acc0,$car0,$car0
+	ld	[$ap+$j],$apj			! ap[j]
+	and	$car0,$mask,$acc0
+	ld	[$np+$j],$npj			! np[j]
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	ld	[$tp+8],$tpj			! tp[j]
+	or	$sbit,$acc0,$acc0
+	add	$j,4,$j				! j++
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	cmp	$j,$num
+	add	$acc0,$car1,$car1
+	add	$acc1,$car1,$car1
+	st	$car1,[$tp]			! tp[j-1]
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_inner2
+	add	$tp,4,$tp			! tp++
+
+.Lsqr_no_inner2:
+	mulx	$apj,$mul0,$acc0
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$acc0,$car0,$car0
+	and	$car0,$mask,$acc0
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	add	$acc0,$car1,$car1
+	add	$acc1,$car1,$car1
+	st	$car1,[$tp]			! tp[j-1]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car0,$car0
+	or	$sbit,$car0,$car0
+	add	$car0,$car1,$car1
+	add	$car2,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car2
+
+	add	$i,4,$i				! i++
+	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
+	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
+	ld	[$ap+$i],$mul0			! ap[j]
+	ld	[$np],$car1			! np[0]
+	ld	[$np+4],$npj			! np[1]
+	mulx	$n0,$tmp1,$mul1
+	and	$mul1,$mask,$mul1
+	add	$i,4,$tmp0
+
+	mulx	$mul0,$mul0,$car0
+	mulx	$car1,$mul1,$car1
+	and	$car0,$mask,$acc0
+	add	$tmp1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	%sp,$bias+$frame,$tp
+	srlx	$car1,32,$car1
+	and	$car0,1,$sbit
+	srlx	$car0,1,$car0
+
+	cmp	$tmp0,$num			! i<num-1
+	bl	%icc,.Lsqr_outer
+	mov	4,$j
+
+.Lsqr_last:
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$j,4,$j
+	ld	[$tp+8],$tpj
+	cmp	$j,$i
+	add	$acc1,$car1,$car1
+	ld	[$np+$j],$npj
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_last
+	add	$tp,4,$tp
+!.Lsqr_last
+
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$acc0,$car1,$car1
+	add	$acc1,$car1,$car1
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car0,$car0		! recover $car0
+	or	$sbit,$car0,$car0
+	add	$car0,$car1,$car1
+	add	$car2,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car2
+
+	ba	.Ltail
+	add	$tp,8,$tp
+.type	$fname,#function
+.size	$fname,(.-$fname)
+.asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align	32
+___
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl
new file mode 100755
index 0000000000..a14205f2f0
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl
@@ -0,0 +1,882 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
+# Because unlike integer multiplier, which simply stalls whole CPU,
+# FPU is fully pipelined and can effectively emit 48 bit partial
+# product every cycle. Why not blended SPARC v9? One can argue that
+# making this module dependent on UltraSPARC VIS extension limits its
+# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
+# implementations from compatibility matrix. But the rest, whole Sun
+# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
+# VIS extension instructions used in this module. This is considered
+# good enough to not care about HAL SPARC64 users [if any] who have
+# integer-only pure SPARCv9 module to "fall down" to.
+
+# USI&II cores currently exhibit uniform 2x improvement [over pre-
+# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
+# performance improves few percents for shorter keys and worsens few
+# percents for longer keys. This is because USIII integer multiplier
+# is >3x faster than USI&II one, which is harder to match [but see
+# TODO list below]. It should also be noted that SPARC64 V features
+# out-of-order execution, which *might* mean that integer multiplier
+# is pipelined, which in turn *might* be impossible to match... On
+# additional note, SPARC64 V implements FP Multiply-Add instruction,
+# which is perfectly usable in this context... In other words, as far
+# as Fujitsu SPARC64 V goes, talk to the author:-)
+
+# The implementation implies the following "non-natural" limitations on
+# input arguments:
+# - num may not be less than 4;
+# - num has to be even;
+# Failure to meet either condition has no fatal effects; it simply
+# doesn't give any performance gain.
+
+# TODO:
+# - modulo-schedule inner loop for better performance (on in-order
+#   execution core such as UltraSPARC this shall result in further
+#   noticeable(!) improvement);
+# - dedicated squaring procedure[?];
+
+######################################################################
+# November 2006
+#
+# Modulo-scheduled inner loops allow interleaving of floating point and
+# integer instructions and minimize Read-After-Write penalties. This
+# results in a *further* 20-50% performance improvement [depending on
+# key length, more for longer keys] on USI&II cores and 30-80% on
+# USIII&IV.
+
+$fname="bn_mul_mont_fpu";
+$bits=32;
+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+
+if ($bits==64) {
+	$bias=2047;
+	$frame=192;
+} else {
+	$bias=0;
+	$frame=128;	# 96 rounded up to largest known cache-line
+}
+$locals=64;
+
+# In order to provide for 32-/64-bit ABI duality, I keep integers wider
+# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
+# exclusively for pointers, indexes and other small values...
+# int bn_mul_mont(
+$rp="%i0";	# BN_ULONG *rp,
+$ap="%i1";	# const BN_ULONG *ap,
+$bp="%i2";	# const BN_ULONG *bp,
+$np="%i3";	# const BN_ULONG *np,
+$n0="%i4";	# const BN_ULONG *n0,
+$num="%i5";	# int num);
+
+$tp="%l0";	# t[num]
+$ap_l="%l1";	# a[num],n[num] are smashed to 32-bit words and saved
+$ap_h="%l2";	# to these four vectors as double-precision FP values.
+$np_l="%l3";	# This way a bunch of fxtods are eliminated in second
+$np_h="%l4";	# loop and L1-cache aliasing is minimized...
+$i="%l5";
+$j="%l6";
+$mask="%l7";	# 16-bit mask, 0xffff
+
+$n0="%g4";	# reassigned(!) to "64-bit" register
+$carry="%i4";	# %i4 reused(!) for a carry bit
+
+# FP register naming chart
+#
+#     ..HILO
+#       dcba
+#   --------
+#        LOa
+#       LOb
+#      LOc
+#     LOd
+#      HIa
+#     HIb
+#    HIc
+#   HId
+#    ..a
+#   ..b
+$ba="%f0";    $bb="%f2";    $bc="%f4";    $bd="%f6";
+$na="%f8";    $nb="%f10";   $nc="%f12";   $nd="%f14";
+$alo="%f16";  $alo_="%f17"; $ahi="%f18";  $ahi_="%f19";
+$nlo="%f20";  $nlo_="%f21"; $nhi="%f22";  $nhi_="%f23";
+
+$dota="%f24"; $dotb="%f26";
+
+$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
+$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
+$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
+$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
+
+$ASI_FL16_P=0xD2;	# magic ASI value to engage 16-bit FP load
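+
+# The whole scheme rests on 32x16-bit limb products being exact in
+# double precision. A sketch of the invariant (illustrative C only,
+# not part of the module):
+#
+#	#include <stdint.h>
+#
+#	/* what one fmuld/fdtox pair effectively computes */
+#	int64_t mul32x16(uint32_t a32, uint16_t b16)
+#	{
+#		double p = (double)a32 * (double)b16; /* < 2^48, exact */
+#		return (int64_t)p;	/* fdtox: still exact */
+#	}
+#
+# Sums of a handful of such 48-bit products stay well below 2^53, the
+# binary64 mantissa limit, so no rounding can occur before fdtox; the
+# 16-bit carries between limbs are then resolved with integer srlx/add
+# sequences, as seen below.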
+
+$code=<<___;
+.section	".text",#alloc,#execinstr
+
+.global $fname
+.align  32
+$fname:
+	save	%sp,-$frame-$locals,%sp
+
+	cmp	$num,4
+	bl,a,pn %icc,.Lret
+	clr	%i0
+	andcc	$num,1,%g0		! $num has to be even...
+	bnz,a,pn %icc,.Lret
+	clr	%i0			! signal "unsupported input value"
+
+	srl	$num,1,$num
+	sethi	%hi(0xffff),$mask
+	ld	[%i4+0],$n0		! $n0 reassigned, remember?
+	or	$mask,%lo(0xffff),$mask
+	ld	[%i4+4],%o0
+	sllx	%o0,32,%o0
+	or	%o0,$n0,$n0		! $n0=n0[1].n0[0]
+
+	sll	$num,3,$num		! num*=8
+
+	add	%sp,$bias,%o0		! real top of stack
+	sll	$num,2,%o1
+	add	%o1,$num,%o1		! %o1=num*5
+	sub	%o0,%o1,%o0
+	and	%o0,-2048,%o0		! optimize TLB utilization
+	sub	%o0,$bias,%sp		! alloca(5*num*8)
+
+	rd	%asi,%o7		! save %asi
+	add	%sp,$bias+$frame+$locals,$tp
+	add	$tp,$num,$ap_l
+	add	$ap_l,$num,$ap_l	! [an]p_[lh] point at the vectors' ends !
+	add	$ap_l,$num,$ap_h
+	add	$ap_h,$num,$np_l
+	add	$np_l,$num,$np_h
+
+	wr	%g0,$ASI_FL16_P,%asi	! setup %asi for 16-bit FP loads
+
+	add	$rp,$num,$rp		! readjust input pointers to point
+	add	$ap,$num,$ap		! at the ends too...
+	add	$bp,$num,$bp
+	add	$np,$num,$np
+
+	stx	%o7,[%sp+$bias+$frame+48]	! save %asi
+
+	sub	%g0,$num,$i		! i=-num
+	sub	%g0,$num,$j		! j=-num
+
+	add	$ap,$j,%o3
+	add	$bp,$i,%o4
+
+	ld	[%o3+4],%g1		! bp[0]
+	ld	[%o3+0],%o0
+	ld	[%o4+4],%g5		! ap[0]
+	sllx	%g1,32,%g1
+	ld	[%o4+0],%o1
+	sllx	%g5,32,%g5
+	or	%g1,%o0,%o0
+	or	%g5,%o1,%o1
+
+	add	$np,$j,%o5
+
+	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
+	mulx	$n0,%o0,%o0		! ap[0]*bp[0]*n0
+	stx	%o0,[%sp+$bias+$frame+0]
+
+	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
+	fzeros	$alo
+	ld	[%o3+4],$ahi_
+	fzeros	$ahi
+	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
+	fzeros	$nlo
+	ld	[%o5+4],$nhi_
+	fzeros	$nhi
+
+	! transfer b[i] to FPU as 4x16-bit values
+	ldda	[%o4+2]%asi,$ba
+	fxtod	$alo,$alo
+	ldda	[%o4+0]%asi,$bb
+	fxtod	$ahi,$ahi
+	ldda	[%o4+6]%asi,$bc
+	fxtod	$nlo,$nlo
+	ldda	[%o4+4]%asi,$bd
+	fxtod	$nhi,$nhi
+
+	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
+	ldda	[%sp+$bias+$frame+6]%asi,$na
+	fxtod	$ba,$ba
+	ldda	[%sp+$bias+$frame+4]%asi,$nb
+	fxtod	$bb,$bb
+	ldda	[%sp+$bias+$frame+2]%asi,$nc
+	fxtod	$bc,$bc
+	ldda	[%sp+$bias+$frame+0]%asi,$nd
+	fxtod	$bd,$bd
+
+	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
+	fxtod	$na,$na
+	std	$ahi,[$ap_h+$j]
+	fxtod	$nb,$nb
+	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
+	fxtod	$nc,$nc
+	std	$nhi,[$np_h+$j]
+	fxtod	$nd,$nd
+
+		fmuld	$alo,$ba,$aloa
+		fmuld	$nlo,$na,$nloa
+		fmuld	$alo,$bb,$alob
+		fmuld	$nlo,$nb,$nlob
+		fmuld	$alo,$bc,$aloc
+	faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+		fmuld	$alo,$bd,$alod
+	faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+		fmuld	$ahi,$ba,$ahia
+	faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+		fmuld	$ahi,$bb,$ahib
+	faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+		fmuld	$ahi,$bc,$ahic
+	faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+		fmuld	$ahi,$bd,$ahid
+	faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+
+	faddd	$ahic,$nhic,$dota	! $nhic
+	faddd	$ahid,$nhid,$dotb	! $nhid
+
+	faddd	$nloc,$nhia,$nloc
+	faddd	$nlod,$nhib,$nlod
+
+	fdtox	$nloa,$nloa
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	add	$j,8,$j
+	std	$nlob,[%sp+$bias+$frame+8]
+	add	$ap,$j,%o4
+	std	$nloc,[%sp+$bias+$frame+16]
+	add	$np,$j,%o5
+	std	$nlod,[%sp+$bias+$frame+24]
+
+	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
+	fzeros	$alo
+	ld	[%o4+4],$ahi_
+	fzeros	$ahi
+	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
+	fzeros	$nlo
+	ld	[%o5+4],$nhi_
+	fzeros	$nhi
+
+	fxtod	$alo,$alo
+	fxtod	$ahi,$ahi
+	fxtod	$nlo,$nlo
+	fxtod	$nhi,$nhi
+
+	ldx	[%sp+$bias+$frame+0],%o0
+		fmuld	$alo,$ba,$aloa
+	ldx	[%sp+$bias+$frame+8],%o1
+		fmuld	$nlo,$na,$nloa
+	ldx	[%sp+$bias+$frame+16],%o2
+		fmuld	$alo,$bb,$alob
+	ldx	[%sp+$bias+$frame+24],%o3
+		fmuld	$nlo,$nb,$nlob
+
+	srlx	%o0,16,%o7
+	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
+		fmuld	$alo,$bc,$aloc
+	add	%o7,%o1,%o1
+	std	$ahi,[$ap_h+$j]
+		faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+	srlx	%o1,16,%o7
+	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
+		fmuld	$alo,$bd,$alod
+	add	%o7,%o2,%o2
+	std	$nhi,[$np_h+$j]
+		faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+	srlx	%o2,16,%o7
+		fmuld	$ahi,$ba,$ahia
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+		faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+	!and	%o0,$mask,%o0
+	!and	%o1,$mask,%o1
+	!and	%o2,$mask,%o2
+	!sllx	%o1,16,%o1
+	!sllx	%o2,32,%o2
+	!sllx	%o3,48,%o7
+	!or	%o1,%o0,%o0
+	!or	%o2,%o0,%o0
+	!or	%o7,%o0,%o0		! 64-bit result
+	srlx	%o3,16,%g1		! 34-bit carry
+		fmuld	$ahi,$bb,$ahib
+
+	faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+		fmuld	$ahi,$bc,$ahic
+	faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+		fmuld	$ahi,$bd,$ahid
+	faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+
+	faddd	$dota,$nloa,$nloa
+	faddd	$dotb,$nlob,$nlob
+	faddd	$ahic,$nhic,$dota	! $nhic
+	faddd	$ahid,$nhid,$dotb	! $nhid
+
+	faddd	$nloc,$nhia,$nloc
+	faddd	$nlod,$nhib,$nlod
+
+	fdtox	$nloa,$nloa
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	addcc	$j,8,$j
+	std	$nloc,[%sp+$bias+$frame+16]
+	bz,pn	%icc,.L1stskip
+	std	$nlod,[%sp+$bias+$frame+24]
+
+.align	32			! incidentally already aligned !
+.L1st:
+	add	$ap,$j,%o4
+	add	$np,$j,%o5
+	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
+	fzeros	$alo
+	ld	[%o4+4],$ahi_
+	fzeros	$ahi
+	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
+	fzeros	$nlo
+	ld	[%o5+4],$nhi_
+	fzeros	$nhi
+
+	fxtod	$alo,$alo
+	fxtod	$ahi,$ahi
+	fxtod	$nlo,$nlo
+	fxtod	$nhi,$nhi
+
+	ldx	[%sp+$bias+$frame+0],%o0
+		fmuld	$alo,$ba,$aloa
+	ldx	[%sp+$bias+$frame+8],%o1
+		fmuld	$nlo,$na,$nloa
+	ldx	[%sp+$bias+$frame+16],%o2
+		fmuld	$alo,$bb,$alob
+	ldx	[%sp+$bias+$frame+24],%o3
+		fmuld	$nlo,$nb,$nlob
+
+	srlx	%o0,16,%o7
+	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
+		fmuld	$alo,$bc,$aloc
+	add	%o7,%o1,%o1
+	std	$ahi,[$ap_h+$j]
+		faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+	srlx	%o1,16,%o7
+	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
+		fmuld	$alo,$bd,$alod
+	add	%o7,%o2,%o2
+	std	$nhi,[$np_h+$j]
+		faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+	srlx	%o2,16,%o7
+		fmuld	$ahi,$ba,$ahia
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	and	%o0,$mask,%o0
+		faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+		fmuld	$ahi,$bb,$ahib
+	sllx	%o1,16,%o1
+		faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+	sllx	%o2,32,%o2
+		fmuld	$ahi,$bc,$ahic
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+		faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+	or	%o2,%o0,%o0
+		fmuld	$ahi,$bd,$ahid
+	or	%o7,%o0,%o0		! 64-bit result
+		faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+	addcc	%g1,%o0,%o0
+		faddd	$dota,$nloa,$nloa
+	srlx	%o3,16,%g1		! 34-bit carry
+		faddd	$dotb,$nlob,$nlob
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	stx	%o0,[$tp]		! tp[j-1]=
+
+	faddd	$ahic,$nhic,$dota	! $nhic
+	faddd	$ahid,$nhid,$dotb	! $nhid
+
+	faddd	$nloc,$nhia,$nloc
+	faddd	$nlod,$nhib,$nlod
+
+	fdtox	$nloa,$nloa
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	std	$nloc,[%sp+$bias+$frame+16]
+	std	$nlod,[%sp+$bias+$frame+24]
+
+	addcc	$j,8,$j
+	bnz,pt	%icc,.L1st
+	add	$tp,8,$tp
+
+.L1stskip:
+	fdtox	$dota,$dota
+	fdtox	$dotb,$dotb
+
+	ldx	[%sp+$bias+$frame+0],%o0
+	ldx	[%sp+$bias+$frame+8],%o1
+	ldx	[%sp+$bias+$frame+16],%o2
+	ldx	[%sp+$bias+$frame+24],%o3
+
+	srlx	%o0,16,%o7
+	std	$dota,[%sp+$bias+$frame+32]
+	add	%o7,%o1,%o1
+	std	$dotb,[%sp+$bias+$frame+40]
+	srlx	%o1,16,%o7
+	add	%o7,%o2,%o2
+	srlx	%o2,16,%o7
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	and	%o0,$mask,%o0
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+	sllx	%o1,16,%o1
+	sllx	%o2,32,%o2
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+	or	%o2,%o0,%o0
+	or	%o7,%o0,%o0		! 64-bit result
+	ldx	[%sp+$bias+$frame+32],%o4
+	addcc	%g1,%o0,%o0
+	ldx	[%sp+$bias+$frame+40],%o5
+	srlx	%o3,16,%g1		! 34-bit carry
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	stx	%o0,[$tp]		! tp[j-1]=
+	add	$tp,8,$tp
+
+	srlx	%o4,16,%o7
+	add	%o7,%o5,%o5
+	and	%o4,$mask,%o4
+	sllx	%o5,16,%o7
+	or	%o7,%o4,%o4
+	addcc	%g1,%o4,%o4
+	srlx	%o5,48,%g1
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	mov	%g1,$carry
+	stx	%o4,[$tp]		! tp[num-1]=
+
+	ba	.Louter
+	add	$i,8,$i
+.align	32
+.Louter:
+	sub	%g0,$num,$j		! j=-num
+	add	%sp,$bias+$frame+$locals,$tp
+
+	add	$ap,$j,%o3
+	add	$bp,$i,%o4
+
+	ld	[%o3+4],%g1		! bp[i]
+	ld	[%o3+0],%o0
+	ld	[%o4+4],%g5		! ap[0]
+	sllx	%g1,32,%g1
+	ld	[%o4+0],%o1
+	sllx	%g5,32,%g5
+	or	%g1,%o0,%o0
+	or	%g5,%o1,%o1
+
+	ldx	[$tp],%o2		! tp[0]
+	mulx	%o1,%o0,%o0
+	addcc	%o2,%o0,%o0
+	mulx	$n0,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
+	stx	%o0,[%sp+$bias+$frame+0]
+
+	! transfer b[i] to FPU as 4x16-bit values
+	ldda	[%o4+2]%asi,$ba
+	ldda	[%o4+0]%asi,$bb
+	ldda	[%o4+6]%asi,$bc
+	ldda	[%o4+4]%asi,$bd
+
+	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
+	ldda	[%sp+$bias+$frame+6]%asi,$na
+	fxtod	$ba,$ba
+	ldda	[%sp+$bias+$frame+4]%asi,$nb
+	fxtod	$bb,$bb
+	ldda	[%sp+$bias+$frame+2]%asi,$nc
+	fxtod	$bc,$bc
+	ldda	[%sp+$bias+$frame+0]%asi,$nd
+	fxtod	$bd,$bd
+	ldd	[$ap_l+$j],$alo		! load a[j] in double format
+	fxtod	$na,$na
+	ldd	[$ap_h+$j],$ahi
+	fxtod	$nb,$nb
+	ldd	[$np_l+$j],$nlo		! load n[j] in double format
+	fxtod	$nc,$nc
+	ldd	[$np_h+$j],$nhi
+	fxtod	$nd,$nd
+
+		fmuld	$alo,$ba,$aloa
+		fmuld	$nlo,$na,$nloa
+		fmuld	$alo,$bb,$alob
+		fmuld	$nlo,$nb,$nlob
+		fmuld	$alo,$bc,$aloc
+	faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+		fmuld	$alo,$bd,$alod
+	faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+		fmuld	$ahi,$ba,$ahia
+	faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+		fmuld	$ahi,$bb,$ahib
+	faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+		fmuld	$ahi,$bc,$ahic
+	faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+		fmuld	$ahi,$bd,$ahid
+	faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+
+	faddd	$ahic,$nhic,$dota	! $nhic
+	faddd	$ahid,$nhid,$dotb	! $nhid
+
+	faddd	$nloc,$nhia,$nloc
+	faddd	$nlod,$nhib,$nlod
+
+	fdtox	$nloa,$nloa
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	std	$nloc,[%sp+$bias+$frame+16]
+	add	$j,8,$j
+	std	$nlod,[%sp+$bias+$frame+24]
+
+	ldd	[$ap_l+$j],$alo		! load a[j] in double format
+	ldd	[$ap_h+$j],$ahi
+	ldd	[$np_l+$j],$nlo		! load n[j] in double format
+	ldd	[$np_h+$j],$nhi
+
+		fmuld	$alo,$ba,$aloa
+		fmuld	$nlo,$na,$nloa
+		fmuld	$alo,$bb,$alob
+		fmuld	$nlo,$nb,$nlob
+		fmuld	$alo,$bc,$aloc
+	ldx	[%sp+$bias+$frame+0],%o0
+		faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+	ldx	[%sp+$bias+$frame+8],%o1
+		fmuld	$alo,$bd,$alod
+	ldx	[%sp+$bias+$frame+16],%o2
+		faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+	ldx	[%sp+$bias+$frame+24],%o3
+		fmuld	$ahi,$ba,$ahia
+
+	srlx	%o0,16,%o7
+		faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+	add	%o7,%o1,%o1
+		fmuld	$ahi,$bb,$ahib
+	srlx	%o1,16,%o7
+		faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+	add	%o7,%o2,%o2
+		fmuld	$ahi,$bc,$ahic
+	srlx	%o2,16,%o7
+		faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	! why?
+	and	%o0,$mask,%o0
+		fmuld	$ahi,$bd,$ahid
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+		faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+	sllx	%o1,16,%o1
+		faddd	$dota,$nloa,$nloa
+	sllx	%o2,32,%o2
+		faddd	$dotb,$nlob,$nlob
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+		faddd	$ahic,$nhic,$dota	! $nhic
+	or	%o2,%o0,%o0
+		faddd	$ahid,$nhid,$dotb	! $nhid
+	or	%o7,%o0,%o0		! 64-bit result
+	ldx	[$tp],%o7
+		faddd	$nloc,$nhia,$nloc
+	addcc	%o7,%o0,%o0
+	! end-of-why?
+		faddd	$nlod,$nhib,$nlod
+	srlx	%o3,16,%g1		! 34-bit carry
+		fdtox	$nloa,$nloa
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	addcc	$j,8,$j
+	std	$nloc,[%sp+$bias+$frame+16]
+	bz,pn	%icc,.Linnerskip
+	std	$nlod,[%sp+$bias+$frame+24]
+
+	ba	.Linner
+	nop
+.align	32
+.Linner:
+	ldd	[$ap_l+$j],$alo		! load a[j] in double format
+	ldd	[$ap_h+$j],$ahi
+	ldd	[$np_l+$j],$nlo		! load n[j] in double format
+	ldd	[$np_h+$j],$nhi
+
+		fmuld	$alo,$ba,$aloa
+		fmuld	$nlo,$na,$nloa
+		fmuld	$alo,$bb,$alob
+		fmuld	$nlo,$nb,$nlob
+		fmuld	$alo,$bc,$aloc
+	ldx	[%sp+$bias+$frame+0],%o0
+		faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+	ldx	[%sp+$bias+$frame+8],%o1
+		fmuld	$alo,$bd,$alod
+	ldx	[%sp+$bias+$frame+16],%o2
+		faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+	ldx	[%sp+$bias+$frame+24],%o3
+		fmuld	$ahi,$ba,$ahia
+
+	srlx	%o0,16,%o7
+		faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+	add	%o7,%o1,%o1
+		fmuld	$ahi,$bb,$ahib
+	srlx	%o1,16,%o7
+		faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+	add	%o7,%o2,%o2
+		fmuld	$ahi,$bc,$ahic
+	srlx	%o2,16,%o7
+		faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	and	%o0,$mask,%o0
+		fmuld	$ahi,$bd,$ahid
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+		faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+	sllx	%o1,16,%o1
+		faddd	$dota,$nloa,$nloa
+	sllx	%o2,32,%o2
+		faddd	$dotb,$nlob,$nlob
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+		faddd	$ahic,$nhic,$dota	! $nhic
+	or	%o2,%o0,%o0
+		faddd	$ahid,$nhid,$dotb	! $nhid
+	or	%o7,%o0,%o0		! 64-bit result
+		faddd	$nloc,$nhia,$nloc
+	addcc	%g1,%o0,%o0
+	ldx	[$tp+8],%o7		! tp[j]
+		faddd	$nlod,$nhib,$nlod
+	srlx	%o3,16,%g1		! 34-bit carry
+		fdtox	$nloa,$nloa
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+		fdtox	$nlob,$nlob
+	addcc	%o7,%o0,%o0
+		fdtox	$nloc,$nloc
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	stx	%o0,[$tp]		! tp[j-1]
+		fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	std	$nloc,[%sp+$bias+$frame+16]
+	addcc	$j,8,$j
+	std	$nlod,[%sp+$bias+$frame+24]
+	bnz,pt	%icc,.Linner
+	add	$tp,8,$tp
+
+.Linnerskip:
+	fdtox	$dota,$dota
+	fdtox	$dotb,$dotb
+
+	ldx	[%sp+$bias+$frame+0],%o0
+	ldx	[%sp+$bias+$frame+8],%o1
+	ldx	[%sp+$bias+$frame+16],%o2
+	ldx	[%sp+$bias+$frame+24],%o3
+
+	srlx	%o0,16,%o7
+	std	$dota,[%sp+$bias+$frame+32]
+	add	%o7,%o1,%o1
+	std	$dotb,[%sp+$bias+$frame+40]
+	srlx	%o1,16,%o7
+	add	%o7,%o2,%o2
+	srlx	%o2,16,%o7
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	and	%o0,$mask,%o0
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+	sllx	%o1,16,%o1
+	sllx	%o2,32,%o2
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+	or	%o2,%o0,%o0
+	ldx	[%sp+$bias+$frame+32],%o4
+	or	%o7,%o0,%o0		! 64-bit result
+	ldx	[%sp+$bias+$frame+40],%o5
+	addcc	%g1,%o0,%o0
+	ldx	[$tp+8],%o7		! tp[j]
+	srlx	%o3,16,%g1		! 34-bit carry
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	addcc	%o7,%o0,%o0
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	stx	%o0,[$tp]		! tp[j-1]
+	add	$tp,8,$tp
+
+	srlx	%o4,16,%o7
+	add	%o7,%o5,%o5
+	and	%o4,$mask,%o4
+	sllx	%o5,16,%o7
+	or	%o7,%o4,%o4
+	addcc	%g1,%o4,%o4
+	srlx	%o5,48,%g1
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	addcc	$carry,%o4,%o4
+	stx	%o4,[$tp]		! tp[num-1]
+	mov	%g1,$carry
+	bcs,a	%xcc,.+8
+	add	$carry,1,$carry
+
+	addcc	$i,8,$i
+	bnz	%icc,.Louter
+	nop
+
+	add	$tp,8,$tp		! adjust tp to point at the end
+	orn	%g0,%g0,%g4
+	sub	%g0,$num,%o7		! n=-num
+	ba	.Lsub
+	subcc	%g0,%g0,%g0		! clear %icc.c
+
+.align	32
+.Lsub:
+	ldx	[$tp+%o7],%o0
+	add	$np,%o7,%g1
+	ld	[%g1+0],%o2
+	ld	[%g1+4],%o3
+	srlx	%o0,32,%o1
+	subccc	%o0,%o2,%o2
+	add	$rp,%o7,%g1
+	subccc	%o1,%o3,%o3
+	st	%o2,[%g1+0]
+	add	%o7,8,%o7
+	brnz,pt	%o7,.Lsub
+	st	%o3,[%g1+4]
+	subc	$carry,0,%g4
+	sub	%g0,$num,%o7		! n=-num
+	ba	.Lcopy
+	nop
+
+.align	32
+.Lcopy:
+	ldx	[$tp+%o7],%o0
+	add	$rp,%o7,%g1
+	ld	[%g1+0],%o2
+	ld	[%g1+4],%o3
+	stx	%g0,[$tp+%o7]
+	and	%o0,%g4,%o0
+	srlx	%o0,32,%o1
+	andn	%o2,%g4,%o2
+	andn	%o3,%g4,%o3
+	or	%o2,%o0,%o0
+	or	%o3,%o1,%o1
+	st	%o0,[%g1+0]
+	add	%o7,8,%o7
+	brnz,pt	%o7,.Lcopy
+	st	%o1,[%g1+4]
+	sub	%g0,$num,%o7		! n=-num
+
+.Lzap:
+	stx	%g0,[$ap_l+%o7]
+	stx	%g0,[$ap_h+%o7]
+	stx	%g0,[$np_l+%o7]
+	stx	%g0,[$np_h+%o7]
+	add	%o7,8,%o7
+	brnz,pt	%o7,.Lzap
+	nop
+
+	ldx	[%sp+$bias+$frame+48],%o7
+	wr	%g0,%o7,%asi		! restore %asi
+
+	mov	1,%i0
+.Lret:
+	ret
+	restore
+.type   $fname,#function
+.size	$fname,(.-$fname)
+.asciz	"Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
+.align	32
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+# The substitution below makes it possible to compile without demanding
+# VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I
+# dare to do this because VIS capability is detected at run-time now
+# and this routine is not called on CPUs not capable of executing it.
+# Do note that fzeros is not the only VIS dependency! Another dependency
+# is implicit and is just _a_ numerical value loaded to the %asi
+# register, which the assembler can't recognize as VIS-specific...
+$code =~ s/fzeros\s+%f([0-9]+)/
+	   sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
+	  /gem;
+
+print $code;
+# flush
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/via-mont.pl b/src/lib/libcrypto/bn/asm/via-mont.pl
new file mode 100644
index 0000000000..c046a514c8
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/via-mont.pl
@@ -0,0 +1,242 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Wrapper around 'rep montmul', a VIA-specific instruction accessing
+# the PadLock Montgomery Multiplier. The wrapper is designed as a
+# drop-in replacement for OpenSSL bn_mul_mont [first implemented in
+# 0.9.9].
+#
+# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
+# different software configurations on a 1.5GHz VIA Esther processor.
+# Lines marked "software integer" denote the performance of the hand-
+# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
+# refers to the hand-coded SSE2 Montgomery multiplication procedure
+# found in OpenSSL 0.9.9. "Hardware VIA SDK" refers to the padlock_pmm
+# routine from PadLock SDK 2.0.1, available for download from VIA,
+# which naturally utilizes the magic 'repz montmul' instruction. And
+# finally "hardware this" refers to *this* implementation, which also
+# uses 'repz montmul'.
+#
+#                   sign    verify    sign/s verify/s
+# rsa  512 bits 0.001720s 0.000140s    581.4   7149.7	software integer
+# rsa  512 bits 0.000690s 0.000086s   1450.3  11606.0	software SSE2
+# rsa  512 bits 0.006136s 0.000201s    163.0   4974.5	hardware VIA SDK
+# rsa  512 bits 0.000712s 0.000050s   1404.9  19858.5	hardware this
+#
+# rsa 1024 bits 0.008518s 0.000413s    117.4   2420.8	software integer
+# rsa 1024 bits 0.004275s 0.000277s    233.9   3609.7	software SSE2
+# rsa 1024 bits 0.012136s 0.000260s     82.4   3844.5	hardware VIA SDK
+# rsa 1024 bits 0.002522s 0.000116s    396.5   8650.9	hardware this
+#
+# rsa 2048 bits 0.050101s 0.001371s     20.0    729.6	software integer
+# rsa 2048 bits 0.030273s 0.001008s     33.0    991.9	software SSE2
+# rsa 2048 bits 0.030833s 0.000976s     32.4   1025.1	hardware VIA SDK
+# rsa 2048 bits 0.011879s 0.000342s     84.2   2921.7	hardware this
+#
+# rsa 4096 bits 0.327097s 0.004859s      3.1    205.8	software integer
+# rsa 4096 bits 0.229318s 0.003859s      4.4    259.2	software SSE2
+# rsa 4096 bits 0.233953s 0.003274s      4.3    305.4	hardware VIA SDK
+# rsa 4096 bits 0.070493s 0.001166s     14.2    857.6	hardware this
+#
+# dsa  512 bits 0.001342s 0.001651s    745.2    605.7	software integer
+# dsa  512 bits 0.000844s 0.000987s   1185.3   1013.1	software SSE2
+# dsa  512 bits 0.001902s 0.002247s    525.6    444.9	hardware VIA SDK
+# dsa  512 bits 0.000458s 0.000524s   2182.2   1909.1	hardware this
+#
+# dsa 1024 bits 0.003964s 0.004926s    252.3    203.0	software integer
+# dsa 1024 bits 0.002686s 0.003166s    372.3    315.8	software SSE2
+# dsa 1024 bits 0.002397s 0.002823s    417.1    354.3	hardware VIA SDK
+# dsa 1024 bits 0.000978s 0.001170s   1022.2    855.0	hardware this
+#
+# dsa 2048 bits 0.013280s 0.016518s     75.3     60.5	software integer
+# dsa 2048 bits 0.009911s 0.011522s    100.9     86.8	software SSE2
+# dsa 2048 bits 0.009542s 0.011763s    104.8     85.0	hardware VIA SDK
+# dsa 2048 bits 0.002884s 0.003352s    346.8    298.3	hardware this
+#
+# To give you some other reference point, here is output for a 2.4GHz
+# P4 running the hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e.
+# "software SSE2" in the above terms.
+#
+# rsa  512 bits 0.000407s 0.000047s   2454.2  21137.0
+# rsa 1024 bits 0.002426s 0.000141s    412.1   7100.0
+# rsa 2048 bits 0.015046s 0.000491s     66.5   2034.9
+# rsa 4096 bits 0.109770s 0.002379s      9.1    420.3
+# dsa  512 bits 0.000438s 0.000525s   2281.1   1904.1
+# dsa 1024 bits 0.001346s 0.001595s    742.7    627.0
+# dsa 2048 bits 0.004745s 0.005582s    210.7    179.1
+#
+# Conclusions: 
+# - VIA SDK leaves a *lot* of room for improvement (which this
+#   implementation successfully fills:-);
+# - 'rep montmul' gives up to >3x performance improvement depending on
+#   key length;
+# - in terms of absolute performance it delivers approximately as much
+#   as modern out-of-order 32-bit cores [again, for longer keys].
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"via-mont.pl");
+
+# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
+$func="bn_mul_mont_padlock";
+
+$pad=16*1;	# amount of reserved bytes on top of every vector
+
+# stack layout
+$mZeroPrime=&DWP(0,"esp");		# these are specified by VIA
+$A=&DWP(4,"esp");
+$B=&DWP(8,"esp");
+$T=&DWP(12,"esp");
+$M=&DWP(16,"esp");
+$scratch=&DWP(20,"esp");
+$rp=&DWP(24,"esp");			# these are mine
+$sp=&DWP(28,"esp");
+# &DWP(32,"esp")			# 32 byte scratch area
+# &DWP(64+(4*$num+$pad)*0,"esp")	# padded tp[num]
+# &DWP(64+(4*$num+$pad)*1,"esp")	# padded copy of ap[num]
+# &DWP(64+(4*$num+$pad)*2,"esp")	# padded copy of bp[num]
+# &DWP(64+(4*$num+$pad)*3,"esp")	# padded copy of np[num]
+# Note that the SDK suggests unconditionally allocating 2K per vector.
+# This has quite an impact on performance. It naturally depends on key
+# length, but to give an example, 1024-bit private RSA key operations
+# suffer a >30% penalty. I allocate only as much as actually required...
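+#
+# In C terms the control block at %esp is essentially the following
+# (field names are mine, the layout is the one the VIA specification
+# dictates):
+#
+#	struct padlock_mont_block {
+#		uint32_t  m_zero_prime;	/* *n0, i.e. -np[0]^-1 mod 2^32 */
+#		uint32_t *A;		/* padded copy of ap            */
+#		uint32_t *B;		/* padded copy of bp            */
+#		uint32_t *T;		/* padded tp, receives result   */
+#		uint32_t *M;		/* padded copy of np            */
+#		uint32_t *scratch;	/* 32-byte scratch area         */
+#	};
+#
+# 'rep montmul' is then executed with %esi pointing at this block and
+# %ecx holding the modulus size in bits.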
+
+&function_begin($func);
+	&xor	("eax","eax");
+	&mov	("ecx",&wparam(5));	# num
+	# meet VIA's limitations for num [note that the specification
+	# expresses them in bits, while we work with the number of
+	# 32-bit words]
+	&test	("ecx",3);
+	&jnz	(&label("leave"));	# num % 4 != 0
+	&cmp	("ecx",8);
+	&jb	(&label("leave"));	# num < 8
+	&cmp	("ecx",1024);
+	&ja	(&label("leave"));	# num > 1024
+
+	&pushf	();
+	&cld	();
+
+	&mov	("edi",&wparam(0));	# rp
+	&mov	("eax",&wparam(1));	# ap
+	&mov	("ebx",&wparam(2));	# bp
+	&mov	("edx",&wparam(3));	# np
+	&mov	("esi",&wparam(4));	# n0
+	&mov	("esi",&DWP(0,"esi"));	# *n0
+
+	&lea	("ecx",&DWP($pad,"","ecx",4));	# ecx becomes vector size in bytes
+	&lea	("ebp",&DWP(64,"","ecx",4));	# allocate 4 vectors + 64 bytes
+	&neg	("ebp");
+	&add	("ebp","esp");
+	&and	("ebp",-64);		# align to cache-line
+	&xchg	("ebp","esp");		# alloca
+
+	&mov	($rp,"edi");		# save rp
+	&mov	($sp,"ebp");		# save esp
+
+	&mov	($mZeroPrime,"esi");
+	&lea	("esi",&DWP(64,"esp"));	# tp
+	&mov	($T,"esi");
+	&lea	("edi",&DWP(32,"esp"));	# scratch area
+	&mov	($scratch,"edi");
+	&mov	("esi","eax");
+
+	&lea	("ebp",&DWP(-$pad,"ecx"));
+	&shr	("ebp",2);		# restore original num value in ebp
+
+	&xor	("eax","eax");
+
+	&mov	("ecx","ebp");
+	&lea	("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
+	&data_byte(0xf3,0xab);		# rep stosl, bzero
+
+	&mov	("ecx","ebp");
+	&lea	("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
+	&mov	($A,"edi");
+	&data_byte(0xf3,0xa5);		# rep movsl, memcpy
+	&mov	("ecx",$pad/4);
+	&data_byte(0xf3,0xab);		# rep stosl, bzero pad
+	# edi points at the end of padded ap copy...
+
+	&mov	("ecx","ebp");
+	&mov	("esi","ebx");
+	&mov	($B,"edi");
+	&data_byte(0xf3,0xa5);		# rep movsl, memcpy
+	&mov	("ecx",$pad/4);
+	&data_byte(0xf3,0xab);		# rep stosl, bzero pad
+	# edi points at the end of padded bp copy...
+
+	&mov	("ecx","ebp");
+	&mov	("esi","edx");
+	&mov	($M,"edi");
+	&data_byte(0xf3,0xa5);		# rep movsl, memcpy
+	&mov	("ecx",$pad/4);
+	&data_byte(0xf3,0xab);		# rep stosl, bzero pad
+	# edi points at the end of padded np copy...
+
+	# let magic happen...
+	&mov	("ecx","ebp");
+	&mov	("esi","esp");
+	&shl	("ecx",5);		# convert word counter to bit counter
+	&align	(4);
+	&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
+
+	&mov	("ecx","ebp");
+	&lea	("esi",&DWP(64,"esp"));		# tp
+	# edi still points at the end of padded np copy...
+	&neg	("ebp");
+	&lea	("ebp",&DWP(-$pad,"edi","ebp",4));	# so just "rewind"
+	&mov	("edi",$rp);			# restore rp
+	&xor	("edx","edx");			# i=0 and clear CF
+
+&set_label("sub",8);
+	&mov	("eax",&DWP(0,"esi","edx",4));
+	&sbb	("eax",&DWP(0,"ebp","edx",4));
+	&mov	(&DWP(0,"edi","edx",4),"eax");	# rp[i]=tp[i]-np[i]
+	&lea	("edx",&DWP(1,"edx"));		# i++
+	&loop	(&label("sub"));		# doesn't affect CF!
+
+	&mov	("eax",&DWP(0,"esi","edx",4));	# upmost overflow bit
+	&sbb	("eax",0);
+	&and	("esi","eax");
+	&not	("eax");
+	&mov	("ebp","edi");
+	&and	("ebp","eax");
+	&or	("esi","ebp");			# tp=carry?tp:rp
+
+	&mov	("ecx","edx");			# num
+	&xor	("edx","edx");			# i=0
+
+&set_label("copy",8);
+	&mov	("eax",&DWP(0,"esi","edx",4));
+	&mov	(&DWP(64,"esp","edx",4),"ecx");	# zap tp
+	&mov	(&DWP(0,"edi","edx",4),"eax");
+	&lea	("edx",&DWP(1,"edx"));		# i++
+	&loop	(&label("copy"));
+
+	&mov	("ebp",$sp);
+	&xor	("eax","eax");
+
+	&mov	("ecx",64/4);
+	&mov	("edi","esp");		# zap frame including scratch area
+	&data_byte(0xf3,0xab);		# rep stosl, bzero
+
+	# zap copies of ap, bp and np
+	&lea	("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
+	&lea	("ecx",&DWP(3*$pad/4,"edx","edx",2));
+	&data_byte(0xf3,0xab);		# rep stosl, bzero
+
+	&mov	("esp","ebp");
+	&inc	("eax");		# signal "done"
+	&popf	();
+&set_label("leave");
+&function_end($func);
+
+&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/src/lib/libcrypto/bn/asm/x86-mont.pl b/src/lib/libcrypto/bn/asm/x86-mont.pl
new file mode 100755
index 0000000000..5cd3cd2ed5
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86-mont.pl
@@ -0,0 +1,591 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# This is a "teaser" code, as it can be improved in several ways...
+# First of all non-SSE2 path should be implemented (yes, for now it
+# performs Montgomery multiplication/convolution only on SSE2-capable
+# CPUs such as P4, others fall down to original code). Then inner loop
+# can be unrolled and modulo-scheduled to improve ILP and possibly
+# moved to 128-bit XMM register bank (though it would require input
+# rearrangement and/or increase bus bandwidth utilization). Dedicated
+# squaring procedure should give further performance improvement...
+# Yet, for being draft, the code improves rsa512 *sign* benchmark by
+# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
+
+# December 2006
+#
+# Modulo-scheduling the SSE2 loops results in a further 15-20%
+# improvement. The integer-only code [being equipped with a dedicated
+# squaring procedure] gives ~40% on the rsa512 sign benchmark...
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+&function_begin("bn_mul_mont");
+
+$i="edx";
+$j="ecx";
+$ap="esi";	$tp="esi";		# overlapping variables!!!
+$rp="edi";	$bp="edi";		# overlapping variables!!!
+$np="ebp";
+$num="ebx";
+
+$_num=&DWP(4*0,"esp");			# stack top layout
+$_rp=&DWP(4*1,"esp");
+$_ap=&DWP(4*2,"esp");
+$_bp=&DWP(4*3,"esp");
+$_np=&DWP(4*4,"esp");
+$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
+$_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
+$frame=32;				# size of above frame rounded up to 16n
+
+	&xor	("eax","eax");
+	&mov	("edi",&wparam(5));	# int num
+	&cmp	("edi",4);
+	&jl	(&label("just_leave"));
+
+	&lea	("esi",&wparam(0));	# put aside pointer to argument block
+	&lea	("edx",&wparam(1));	# load ap
+	&mov	("ebp","esp");		# saved stack pointer!
+	&add	("edi",2);		# extra two words on top of tp
+	&neg	("edi");
+	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
+	&neg	("edi");
+
+	# minimize cache contention by arranging a 2K window between the
+	# stack pointer and the ap argument [np is also a position-
+	# sensitive vector, but it's assumed to be near ap, as it's
+	# allocated at ~the same time].
+	&mov	("eax","esp");
+	&sub	("eax","edx");
+	&and	("eax",2047);
+	&sub	("esp","eax");		# this aligns sp and ap modulo 2048
+
+	&xor	("edx","esp");
+	&and	("edx",2048);
+	&xor	("edx",2048);
+	&sub	("esp","edx");		# this splits them apart modulo 4096
+
+	&and	("esp",-64);		# align to cache line
+
+	################################# load argument block...
+	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+	#&mov	("edi",&DWP(5*4,"esi"));# int num
+
+	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
+	&mov	($_rp,"eax");		# ... save a copy of argument block
+	&mov	($_ap,"ebx");
+	&mov	($_bp,"ecx");
+	&mov	($_np,"edx");
+	&mov	($_n0,"esi");
+	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
+	#&mov	($_num,$num);		# redundant as $num is not reused
+	&mov	($_sp,"ebp");		# saved stack pointer!
+
+if($sse2) {
+$acc0="mm0";	# mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+
+	&picmeup("eax","OPENSSL_ia32cap_P");
+	&bt	(&DWP(0,"eax"),26);
+	&jnc	(&label("non_sse2"));
+
+	&mov	("eax",-1);
+	&movd	($mask,"eax");		# mask 32 lower bits
+
+	&mov	($ap,$_ap);		# load input pointers
+	&mov	($bp,$_bp);
+	&mov	($np,$_np);
+
+	&xor	($i,$i);		# i=0
+	&xor	($j,$j);		# j=0
+
+	&movd	($mul0,&DWP(0,$bp));		# bp[0]
+	&movd	($mul1,&DWP(0,$ap));		# ap[0]
+	&movd	($car1,&DWP(0,$np));		# np[0]
+
+	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
+	&movq	($car0,$mul1);
+	&movq	($acc0,$mul1);			# I wish movd worked for
+	&pand	($acc0,$mask);			# inter-register transfers
+
+	&pmuludq($mul1,$_n0q);			# *=n0
+
+	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
+	&paddq	($car1,$acc0);
+
+	&movd	($acc1,&DWP(4,$np));		# np[1]
+	&movd	($acc0,&DWP(4,$ap));		# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&inc	($j);				# j++
+&set_label("1st",16);
+	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
+	&pmuludq($acc1,$mul1);			# np[j]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
+	&psrlq	($car1,32);
+
+	&lea	($j,&DWP(1,$j));
+	&cmp	($j,$num);
+	&jl	(&label("1st"));
+
+	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
+	&pmuludq($acc1,$mul1);			# np[num-1]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&paddq	($car1,$car0);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&inc	($i);				# i++
+&set_label("outer");
+	&xor	($j,$j);			# j=0
+
+	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
+	&movd	($mul1,&DWP(0,$ap));		# ap[0]
+	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
+	&movd	($car1,&DWP(0,$np));		# np[0]
+	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
+
+	&paddq	($mul1,$temp);			# +=tp[0]
+	&movq	($acc0,$mul1);
+	&movq	($car0,$mul1);
+	&pand	($acc0,$mask);
+
+	&pmuludq($mul1,$_n0q);			# *=n0
+
+	&pmuludq($car1,$mul1);
+	&paddq	($car1,$acc0);
+
+	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
+	&movd	($acc1,&DWP(4,$np));		# np[1]
+	&movd	($acc0,&DWP(4,$ap));		# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);			# +=tp[1]
+
+	&inc	($j);				# j++
+	&dec	($num);
+&set_label("inner");
+	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
+	&pmuludq($acc1,$mul1);			# np[j]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);			# +=tp[j+1]
+
+	&dec	($num);
+	&lea	($j,&DWP(1,$j));		# j++
+	&jnz	(&label("inner"));
+
+	&mov	($num,$j);
+	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
+	&pmuludq($acc1,$mul1);			# np[num-1]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
+	&paddq	($car1,$car0);
+	&paddq	($car1,$temp);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&lea	($i,&DWP(1,$i));		# i++
+	&cmp	($i,$num);
+	&jle	(&label("outer"));
+
+	&emms	();				# done with mmx bank
+	&jmp	(&label("common_tail"));
+
+&set_label("non_sse2",16);
+}
+
+if (0) {
+	&mov	("esp",$_sp);
+	&xor	("eax","eax");	# signal "not fast enough [yet]"
+	&jmp	(&label("just_leave"));
+	# While the code below provides competitive performance for
+	# all key lengths on modern Intel cores, it's still more
+	# than 10% slower for 4096-bit keys elsewhere:-( "Competitive"
+	# means compared to the original integer-only assembler.
+	# 512-bit RSA sign is better by ~40%, but that's about all
+	# one can say about all CPUs...
+} else {
+$inp="esi";	# integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+
+	&mov	($inp,$_ap);
+	&lea	($carry,&DWP(1,$num));
+	&mov	($word,$_bp);
+	&xor	($j,$j);				# j=0
+	&mov	("edx",$inp);
+	&and	($carry,1);				# see if num is even
+	&sub	("edx",$word);				# see if ap==bp
+	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
+	&or	($carry,"edx");
+	&mov	($word,&DWP(0,$word));			# bp[0]
+	&jz	(&label("bn_sqr_mont"));
+	&mov	($_bpend,"eax");
+	&mov	("eax",&DWP(0,$inp));
+	&xor	("edx","edx");
+
+&set_label("mull",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[0]
+	&add	($carry,"eax");
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("mull"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[0]
+	 &mov	($word,$_n0);
+	&add	("eax",$carry);
+	 &mov	($inp,$_np);
+	&adc	("edx",0);
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
+	&xor	($j,$j);
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&adc	("edx",0);
+	&inc	($j);
+
+	&jmp	(&label("2ndmadd"));
+
+&set_label("1stmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[i]
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("1stmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[i]
+	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	 &mov	($word,$_n0);
+	&adc	("edx",0);
+	 &mov	($inp,$_np);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&xor	($j,$j);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
+	&adc	($j,0);
+	 &mov	("eax",&DWP(0,$inp));			# np[0]
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&adc	("edx",0);
+	&mov	($j,1);
+
+&set_label("2ndmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
+	&jl	(&label("2ndmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&xor	("eax","eax");
+	 &mov	($j,$_bp);				# &bp[i]
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	 &lea	($j,&DWP(4,$j));
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	 &cmp	($j,$_bpend);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(0,$j));			# bp[i+1]
+	&mov	($inp,$_ap);
+	&mov	($_bp,$j);				# &bp[++i]
+	&xor	($j,$j);
+	&xor	("edx","edx");
+	&mov	("eax",&DWP(0,$inp));
+	&jmp	(&label("1stmadd"));
+
+&set_label("bn_sqr_mont",16);
+$sbit=$num;
+	&mov	($_num,$num);
+	&mov	($_bp,$j);				# i=0
+
+	&mov	("eax",$word);				# ap[0]
+	&mul	($word);				# ap[0]*ap[0]
+	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
+	&mov	($sbit,"edx");
+	&shr	("edx",1);
+	&and	($sbit,1);
+	&inc	($j);
+&set_label("sqr",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*ap[0]
+	&add	("eax",$carry);
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	&shr	("eax",31);
+	&cmp	($j,$_num);
+	&mov	($sbit,"eax");
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("sqr"));
+
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*ap[0]
+	&add	("eax",$carry);
+	 &mov	($word,$_n0);
+	&adc	("edx",0);
+	 &mov	($inp,$_np);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+	&shr	("eax",31);
+	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
+
+	&lea	($carry,&DWP(0,"eax","edx",2));
+	 &mov	("eax",&DWP(0,$inp));			# np[0]
+	&shr	("edx",31);
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	($num,$j);
+	&adc	("edx",0);
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&mov	($j,1);
+
+&set_label("3rdmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j+1]*m
+	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
+	&lea	($j,&DWP(2,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("3rdmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&mov	($j,$_bp);				# i
+	&xor	("eax","eax");
+	&mov	($inp,$_ap);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	&cmp	($j,$num);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
+	&lea	($j,&DWP(1,$j));
+	&mov	("eax",$word);
+	&mov	($_bp,$j);				# ++i
+	&mul	($word);				# ap[i]*ap[i]
+	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
+	&adc	("edx",0);
+	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
+	&xor	($carry,$carry);
+	&cmp	($j,$num);
+	&lea	($j,&DWP(1,$j));
+	&je	(&label("sqrlast"));
+
+	&mov	($sbit,"edx");				# zaps $num
+	&shr	("edx",1);
+	&and	($sbit,1);
+&set_label("sqradd",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*ap[i]
+	&add	("eax",$carry);
+	&lea	($carry,&DWP(0,"eax","eax"));
+	&adc	("edx",0);
+	&shr	("eax",31);
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("eax",0);
+	&add	($carry,$sbit);
+	&adc	("eax",0);
+	&cmp	($j,$_num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&mov	($sbit,"eax");
+	&jle	(&label("sqradd"));
+
+	&mov	($carry,"edx");
+	&lea	("edx",&DWP(0,$sbit,"edx",2));
+	&shr	($carry,31);
+&set_label("sqrlast");
+	&mov	($word,$_n0);
+	&mov	($inp,$_np);
+	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&adc	($carry,0);
+	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&lea	($num,&DWP(-1,$j));
+	&adc	("edx",0);
+	&mov	($j,1);
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+
+	&jmp	(&label("3rdmadd"));
+}
+
+&set_label("common_tail",16);
+	&mov	($np,$_np);			# load modulus pointer
+	&mov	($rp,$_rp);			# load result pointer
+	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
+
+	&mov	("eax",&DWP(0,$tp));		# tp[0]
+	&mov	($j,$num);			# j=num-1
+	&xor	($i,$i);			# i=0 and clear CF!
+
+&set_label("sub",16);
+	&sbb	("eax",&DWP(0,$np,$i,4));
+	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
+	&dec	($j);				# doesn't affect CF!
+	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
+	&lea	($i,&DWP(1,$i));		# i++
+	&jge	(&label("sub"));
+
+	&sbb	("eax",0);			# handle upmost overflow bit
+	&and	($tp,"eax");
+	&not	("eax");
+	&mov	($np,$rp);
+	&and	($np,"eax");
+	&or	($tp,$np);			# tp=carry?tp:rp
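+	# Editorial note (not original commentary): the and/not/and/or above
+	# form a branchless pointer select.  After "sbb eax,0", eax is an
+	# all-ones mask exactly when the subtraction borrowed (tp<np):
+	#	tp = (tp & mask) | (rp & ~mask)
+	# so the copy loop below reads the unreduced tp when no reduction
+	# was needed, and rp (=tp-np) when it was, without a data-dependent
+	# branch.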
+
+&set_label("copy",16);				# copy or in-place refresh
+	&mov	("eax",&DWP(0,$tp,$num,4));
+	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
+	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
+	&dec	($num);
+	&jge	(&label("copy"));
+
+	&mov	("esp",$_sp);		# pull saved stack pointer
+	&mov	("eax",1);
+&set_label("just_leave");
+&function_end("bn_mul_mont");
+
+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86.pl b/src/lib/libcrypto/camellia/asm/cmll-x86.pl
new file mode 100644
index 0000000000..0812815bfb
--- /dev/null
+++ b/src/lib/libcrypto/camellia/asm/cmll-x86.pl
@@ -0,0 +1,1138 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
+#
+# This module may be used under the terms of either the GNU General
+# Public License version 2 or later, the GNU Lesser General Public
+# License version 2.1 or later, the Mozilla Public License version
+# 1.1 or the BSD License. The exact terms of either license are
+# distributed along with this module. For further details see
+# http://www.openssl.org/~appro/camellia/.
+# ====================================================================
+
+# Performance in cycles per processed byte (less is better) in
+# 'openssl speed ...' benchmark:
+#
+#			AMD K8	Core2	PIII	P4
+# -evp camellia-128-ecb	21.5	22.8	27.0	28.9
+# + over gcc 3.4.6	+90/11% +70/10%	+53/4%	+160/64%
+# + over icc 8.0	+48/19% +21/15%	+21/17%	+55/37%
+#
+# camellia-128-cbc	17.3	21.1	23.9	25.9
+#
+# 128-bit key setup	196	280	256	240	cycles/key
+# + over gcc 3.4.6	+30/0%	+17/11%	+11/0%	+63/40%
+# + over icc 8.0	+18/3%	+10/0%	+10/3%	+21/10%
+#
+# Pairs of numbers in "+" rows represent performance improvement over
+# compiler-generated code, for position-independent (PIC) and non-PIC
+# builds respectively. PIC results are of greater relevance, as this
+# module is position-independent, i.e. suitable for a shared library
+# or PIE. Position independence "costs" one register, which is why
+# compilers come so close in the non-PIC case: they have an extra
+# register to spare. CBC results are better than ECB ones thanks to
+# the "zero-copy" private _x86_* interface, are ~30-40% better than
+# with compiler-generated cmll_cbc.o, and reach ~80-90% of x86_64
+# performance on the same CPU (where applicable).
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+$OPENSSL=1;
+
+&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");
+
+@T=("eax","ebx","ecx","edx");
+$idx="esi";
+$key="edi";
+$Tbl="ebp";
+
+# stack frame layout in _x86_Camellia_* routines, frame is allocated
+# by caller
+$__ra=&DWP(0,"esp");	# return address
+$__s0=&DWP(4,"esp");	# s0 backing store
+$__s1=&DWP(8,"esp");	# s1 backing store
+$__s2=&DWP(12,"esp");	# s2 backing store
+$__s3=&DWP(16,"esp");	# s3 backing store
+$__end=&DWP(20,"esp");	# pointer to end/start of key schedule
+
+# stack frame layout in Camellia_[en|crypt] routines, which differs from
+# above by 4 and overlaps by pointer to end/start of key schedule
+$_end=&DWP(16,"esp");
+$_esp=&DWP(20,"esp");
+
+# const unsigned int Camellia_SBOX[4][256];
+# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
+# and [2][] - with [3][]. This is done to optimize code size.
+$SBOX1_1110=0;		# Camellia_SBOX[0]
+$SBOX4_4404=4;		# Camellia_SBOX[1]
+$SBOX2_0222=2048;	# Camellia_SBOX[2]
+$SBOX3_3033=2052;	# Camellia_SBOX[3]
+&static_label("Camellia_SIGMA");
+&static_label("Camellia_SBOX");
+
+sub Camellia_Feistel {
+my $i=@_[0];
+my $seed=defined(@_[1])?@_[1]:0;
+my $scale=$seed<0?-8:8;
+my $frame=defined(@_[2])?@_[2]:0;
+my $j=($i&1)*2;
+my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4];
+
+	&xor	($t0,$idx);				# t0^=key[0]
+	&xor	($t1,&DWP($seed+$i*$scale+4,$key));	# t1^=key[1]
+	&movz	($idx,&HB($t0));			# (t0>>8)&0xff
+	&mov	($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t3=SBOX3_3033[0]
+	&movz	($idx,&LB($t0));			# (t0>>0)&0xff
+	&xor	($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t3^=SBOX4_4404[0]
+	&shr	($t0,16);
+	&movz	($idx,&LB($t1));			# (t1>>0)&0xff
+	&mov	($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t2=SBOX1_1110[1]
+	&movz	($idx,&HB($t0));			# (t0>>24)&0xff
+	&xor	($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t3^=SBOX1_1110[0]
+	&movz	($idx,&HB($t1));			# (t1>>8)&0xff
+	&xor	($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t2^=SBOX4_4404[1]
+	&shr	($t1,16);
+	&movz	($t0,&LB($t0));				# (t0>>16)&0xff
+	&xor	($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));	# t3^=SBOX2_0222[0]
+	&movz	($idx,&HB($t1));			# (t1>>24)&0xff
+	&mov	($t0,&DWP($frame+4*(($j+3)%4),"esp"));	# prefetch "s3"
+	&xor	($t2,$t3);				# t2^=t3
+	&rotr	($t3,8);				# t3=RightRotate(t3,8)
+	&xor	($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));	# t2^=SBOX2_0222[1]
+	&movz	($idx,&LB($t1));			# (t1>>16)&0xff
+	&mov	($t1,&DWP($frame+4*(($j+2)%4),"esp"));	# prefetch "s2"
+	&xor	($t3,$t0);				# t3^=s3
+	&xor	($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t2^=SBOX3_3033[1]
+	&mov	($idx,&DWP($seed+($i+1)*$scale,$key));	# prefetch key[i+1]
+	&xor	($t3,$t2);				# t3^=t2
+	&mov	(&DWP($frame+4*(($j+3)%4),"esp"),$t3);	# s3=t3
+	&xor	($t2,$t1);				# t2^=s2
+	&mov	(&DWP($frame+4*(($j+2)%4),"esp"),$t2);	# s2=t2
+}
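+# Editorial interpretation (not original commentary): the eight lookups
+# above compute Camellia's round function F(x,k) = P(S(x^k)) with the
+# linear layer P pre-folded into the replicated-byte tables, so each
+# input byte costs a single table reference, T-table style.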
+
+# void Camellia_EncryptBlock_Rounds(
+#		int grandRounds,
+#		const Byte plaintext[],
+#		const KEY_TABLE_TYPE keyTable,
+#		Byte ciphertext[])
+&function_begin("Camellia_EncryptBlock_Rounds");
+	&mov	("eax",&wparam(0));	# load grandRounds
+	&mov	($idx,&wparam(1));	# load plaintext pointer
+	&mov	($key,&wparam(2));	# load key schedule pointer
+
+	&mov	("ebx","esp");
+	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
+	&and	("esp",-64);
+
+	# place stack frame just "above mod 1024" the key schedule
+	# this ensures that cache associativity of 2 suffices
+	&lea	("ecx",&DWP(-64-63,$key));
+	&sub	("ecx","esp");
+	&neg	("ecx");
+	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
+	&sub	("esp","ecx");
+	&add	("esp",4);	# 4 is reserved for callee's return address
+
+	&shl	("eax",6);
+	&lea	("eax",&DWP(0,$key,"eax"));
+	&mov	($_esp,"ebx");	# save %esp
+	&mov	($_end,"eax");	# save keyEnd
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop($Tbl);
+	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
+
+	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
+	&mov	(@T[1],&DWP(4,$idx));
+	&mov	(@T[2],&DWP(8,$idx));
+	&bswap	(@T[0]);
+	&mov	(@T[3],&DWP(12,$idx));
+	&bswap	(@T[1]);
+	&bswap	(@T[2]);
+	&bswap	(@T[3]);
+
+	&call	("_x86_Camellia_encrypt");
+
+	&mov	("esp",$_esp);
+	&bswap	(@T[0]);
+	&mov	($idx,&wparam(3));	# load ciphertext pointer
+	&bswap	(@T[1]);
+	&bswap	(@T[2]);
+	&bswap	(@T[3]);
+	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
+	&mov	(&DWP(4,$idx),@T[1]);
+	&mov	(&DWP(8,$idx),@T[2]);
+	&mov	(&DWP(12,$idx),@T[3]);
+&function_end("Camellia_EncryptBlock_Rounds");
+# V1.x API
+&function_begin_B("Camellia_EncryptBlock");
+	&mov	("eax",128);
+	&sub	("eax",&wparam(0));	# load keyBitLength
+	&mov	("eax",3);
+	&adc	("eax",0);		# keyBitLength==128?3:4
+	&mov	(&wparam(0),"eax");
+	&jmp	(&label("Camellia_EncryptBlock_Rounds"));
+&function_end_B("Camellia_EncryptBlock");
+
+if ($OPENSSL) {
+# void Camellia_encrypt(
+#		const unsigned char *in,
+#		unsigned char *out,
+#		const CAMELLIA_KEY *key)
+&function_begin("Camellia_encrypt");
+	&mov	($idx,&wparam(0));	# load plaintext pointer
+	&mov	($key,&wparam(2));	# load key schedule pointer
+
+	&mov	("ebx","esp");
+	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
+	&and	("esp",-64);
+	&mov	("eax",&DWP(272,$key));	# load grandRounds counter
+
+	# place stack frame just "above mod 1024" the key schedule
+	# this ensures that cache associativity of 2 suffices
+	&lea	("ecx",&DWP(-64-63,$key));
+	&sub	("ecx","esp");
+	&neg	("ecx");
+	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
+	&sub	("esp","ecx");
+	&add	("esp",4);	# 4 is reserved for callee's return address
+
+	&shl	("eax",6);
+	&lea	("eax",&DWP(0,$key,"eax"));
+	&mov	($_esp,"ebx");	# save %esp
+	&mov	($_end,"eax");	# save keyEnd
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop($Tbl);
+	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
+
+	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
+	&mov	(@T[1],&DWP(4,$idx));
+	&mov	(@T[2],&DWP(8,$idx));
+	&bswap	(@T[0]);
+	&mov	(@T[3],&DWP(12,$idx));
+	&bswap	(@T[1]);
+	&bswap	(@T[2]);
+	&bswap	(@T[3]);
+
+	&call	("_x86_Camellia_encrypt");
+
+	&mov	("esp",$_esp);
+	&bswap	(@T[0]);
+	&mov	($idx,&wparam(1));	# load ciphertext pointer
+	&bswap	(@T[1]);
+	&bswap	(@T[2]);
+	&bswap	(@T[3]);
+	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
+	&mov	(&DWP(4,$idx),@T[1]);
+	&mov	(&DWP(8,$idx),@T[2]);
+	&mov	(&DWP(12,$idx),@T[3]);
+&function_end("Camellia_encrypt");
+}
+
+&function_begin_B("_x86_Camellia_encrypt");
+	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
+	&xor	(@T[1],&DWP(4,$key));
+	&xor	(@T[2],&DWP(8,$key));
+	&xor	(@T[3],&DWP(12,$key));
+	&mov	($idx,&DWP(16,$key));	# prefetch key[4]
+
+	&mov	($__s0,@T[0]);		# save s[0-3]
+	&mov	($__s1,@T[1]);
+	&mov	($__s2,@T[2]);
+	&mov	($__s3,@T[3]);
+
+&set_label("loop",16);
+	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }
+
+	&add	($key,16*4);
+	&cmp	($key,$__end);
+	&je	(&label("done"));
+
+	# @T[0-1] are preloaded, $idx is preloaded with key[0]
+	&and	($idx,@T[0]);
+	 &mov	 (@T[3],$__s3);
+	&rotl	($idx,1);
+	 &mov	 (@T[2],@T[3]);
+	&xor	(@T[1],$idx);
+	 &or	 (@T[2],&DWP(12,$key));
+	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[0],1);
+	 &xor	 (@T[2],$__s2);
+
+	&mov	($idx,&DWP(4,$key));
+	 &mov	 ($__s2,@T[2]);		# s2^=s3|key[3];
+	&or	($idx,@T[1]);
+	 &and	 (@T[2],&DWP(8,$key));
+	&xor	(@T[0],$idx);
+	 &rotl	 (@T[2],1);
+	&mov	($__s0,@T[0]);		# s0^=s1|key[1];
+	 &xor	 (@T[3],@T[2]);
+	&mov	($idx,&DWP(16,$key));		# prefetch key[4]
+	 &mov	 ($__s3,@T[3]);		# s3^=LeftRotate(s2&key[2],1);
+	&jmp	(&label("loop"));
+
+&set_label("done",8);
+	&mov	(@T[2],@T[0]);		# SwapHalf
+	&mov	(@T[3],@T[1]);
+	&mov	(@T[0],$__s2);
+	&mov	(@T[1],$__s3);
+	&xor	(@T[0],$idx);		# $idx is preloaded with key[0]
+	&xor	(@T[1],&DWP(4,$key));
+	&xor	(@T[2],&DWP(8,$key));
+	&xor	(@T[3],&DWP(12,$key));
+	&ret	();
+&function_end_B("_x86_Camellia_encrypt");
+
+# void Camellia_DecryptBlock_Rounds(
+#		int grandRounds,
+#		const Byte ciphertext[],
+#		const KEY_TABLE_TYPE keyTable,
+#		Byte plaintext[])
+&function_begin("Camellia_DecryptBlock_Rounds");
+	&mov	("eax",&wparam(0));	# load grandRounds
+	&mov	($idx,&wparam(1));	# load ciphertext pointer
+	&mov	($key,&wparam(2));	# load key schedule pointer
+
+	&mov	("ebx","esp");
+	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
+	&and	("esp",-64);
+
+	# place stack frame just "above mod 1024" the key schedule
+	# this ensures that cache associativity of 2 suffices
+	&lea	("ecx",&DWP(-64-63,$key));
+	&sub	("ecx","esp");
+	&neg	("ecx");
+	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
+	&sub	("esp","ecx");
+	&add	("esp",4);	# 4 is reserved for callee's return address
+
+	&shl	("eax",6);
+	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
+	&lea	($key,&DWP(0,$key,"eax"));
+	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop($Tbl);
+	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
+
+	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
+	&mov	(@T[1],&DWP(4,$idx));
+	&mov	(@T[2],&DWP(8,$idx));
+	&bswap	(@T[0]);
+	&mov	(@T[3],&DWP(12,$idx));
+	&bswap	(@T[1]);
+	&bswap	(@T[2]);
+	&bswap	(@T[3]);
+
+	&call	("_x86_Camellia_decrypt");
+
+	&mov	("esp",&DWP(5*4,"esp"));
+	&bswap	(@T[0]);
+	&mov	($idx,&wparam(3));	# load plaintext pointer
+	&bswap	(@T[1]);
+	&bswap	(@T[2]);
+	&bswap	(@T[3]);
+	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
+	&mov	(&DWP(4,$idx),@T[1]);
+	&mov	(&DWP(8,$idx),@T[2]);
+	&mov	(&DWP(12,$idx),@T[3]);
+&function_end("Camellia_DecryptBlock_Rounds");
+# V1.x API
+&function_begin_B("Camellia_DecryptBlock");
+	&mov	("eax",128);
+	&sub	("eax",&wparam(0));	# load keyBitLength
+	&mov	("eax",3);
+	&adc	("eax",0);		# keyBitLength==128?3:4
+	&mov	(&wparam(0),"eax");
+	&jmp	(&label("Camellia_DecryptBlock_Rounds"));
+&function_end_B("Camellia_DecryptBlock");
+
+if ($OPENSSL) {
+# void Camellia_decrypt(
+#		const unsigned char *in,
+#		unsigned char *out,
+#		const CAMELLIA_KEY *key)
+&function_begin("Camellia_decrypt");
+	&mov	($idx,&wparam(0));	# load ciphertext pointer
+	&mov	($key,&wparam(2));	# load key schedule pointer
+
+	&mov	("ebx","esp");
+	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
+	&and	("esp",-64);
+	&mov	("eax",&DWP(272,$key));	# load grandRounds counter
+
+	# place stack frame just "above mod 1024" the key schedule
+	# this ensures that cache associativity of 2 suffices
+	&lea	("ecx",&DWP(-64-63,$key));
+	&sub	("ecx","esp");
+	&neg	("ecx");
+	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
+	&sub	("esp","ecx");
+	&add	("esp",4);	# 4 is reserved for callee's return address
+
+	&shl	("eax",6);
+	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
+	&lea	($key,&DWP(0,$key,"eax"));
+	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop($Tbl);
+	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
+
+	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
+	&mov	(@T[1],&DWP(4,$idx));
+	&mov	(@T[2],&DWP(8,$idx));
+	&bswap	(@T[0]);
+	&mov	(@T[3],&DWP(12,$idx));
+	&bswap	(@T[1]);
+	&bswap	(@T[2]);
+	&bswap	(@T[3]);
+
+	&call	("_x86_Camellia_decrypt");
+
+	&mov	("esp",&DWP(5*4,"esp"));
+	&bswap	(@T[0]);
+	&mov	($idx,&wparam(1));	# load plaintext pointer
+	&bswap	(@T[1]);
+	&bswap	(@T[2]);
+	&bswap	(@T[3]);
+	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
+	&mov	(&DWP(4,$idx),@T[1]);
+	&mov	(&DWP(8,$idx),@T[2]);
+	&mov	(&DWP(12,$idx),@T[3]);
+&function_end("Camellia_decrypt");
+}
+
+&function_begin_B("_x86_Camellia_decrypt");
+	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
+	&xor	(@T[1],&DWP(4,$key));
+	&xor	(@T[2],&DWP(8,$key));
+	&xor	(@T[3],&DWP(12,$key));
+	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]
+
+	&mov	($__s0,@T[0]);		# save s[0-3]
+	&mov	($__s1,@T[1]);
+	&mov	($__s2,@T[2]);
+	&mov	($__s3,@T[3]);
+
+&set_label("loop",16);
+	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }
+
+	&sub	($key,16*4);
+	&cmp	($key,$__end);
+	&je	(&label("done"));
+
+	# @T[0-1] are preloaded, $idx is preloaded with key[2]
+	&and	($idx,@T[0]);
+	 &mov	 (@T[3],$__s3);
+	&rotl	($idx,1);
+	 &mov	 (@T[2],@T[3]);
+	&xor	(@T[1],$idx);
+	 &or	 (@T[2],&DWP(4,$key));
+	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[0],1);
+	 &xor	 (@T[2],$__s2);
+
+	&mov	($idx,&DWP(12,$key));
+	 &mov	 ($__s2,@T[2]);		# s2^=s3|key[3];
+	&or	($idx,@T[1]);
+	 &and	 (@T[2],&DWP(0,$key));
+	&xor	(@T[0],$idx);
+	 &rotl	 (@T[2],1);
+	&mov	($__s0,@T[0]);		# s0^=s1|key[1];
+	 &xor	 (@T[3],@T[2]);
+	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]
+	 &mov	 ($__s3,@T[3]);		# s3^=LeftRotate(s2&key[2],1);
+	&jmp	(&label("loop"));
+
+&set_label("done",8);
+	&mov	(@T[2],@T[0]);		# SwapHalf
+	&mov	(@T[3],@T[1]);
+	&mov	(@T[0],$__s2);
+	&mov	(@T[1],$__s3);
+	&xor	(@T[2],$idx);		# $idx is preloaded with key[2]
+	&xor	(@T[3],&DWP(12,$key));
+	&xor	(@T[0],&DWP(0,$key));
+	&xor	(@T[1],&DWP(4,$key));
+	&ret	();
+&function_end_B("_x86_Camellia_decrypt");
+
+# shld is very slow on Intel P4 family. Even on AMD it limits
+# instruction decode rate [because it's VectorPath] and consequently
+# performance. PIII, PM and Core[2] seem to be the only ones which
+# execute this code ~7% faster...
+sub __rotl128 {
+  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
+
+    $rnd *= 2;
+    if ($rot) {
+	&mov	($idx,$i0);
+	&shld	($i0,$i1,$rot);
+	&shld	($i1,$i2,$rot);
+	&shld	($i2,$i3,$rot);
+	&shld	($i3,$idx,$rot);
+    }
+    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
+    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
+    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
+    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
+}
+
+# ... Implementing the 128-bit rotate without shld gives a >3x
+# performance improvement on P4, only ~7% degradation on other Intel
+# CPUs and no loss on AMD. This is therefore the preferred variant.
+sub _rotl128 {
+  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
+
+    $rnd *= 2;
+    if ($rot) {
+	&mov	($Tbl,$i0);
+	&shl	($i0,$rot);
+	&mov	($idx,$i1);
+	&shr	($idx,32-$rot);
+	&shl	($i1,$rot);
+	&or	($i0,$idx);
+	&mov	($idx,$i2);
+	&shl	($i2,$rot);
+	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
+	&shr	($idx,32-$rot);
+	&or	($i1,$idx);
+	&shr	($Tbl,32-$rot);
+	&mov	($idx,$i3);
+	&shr	($idx,32-$rot);
+	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
+	&shl	($i3,$rot);
+	&or	($i2,$idx);
+	&or	($i3,$Tbl);
+	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
+	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
+    } else {
+	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
+	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
+	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
+	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
+    }
+}
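+
+# A pure-Perl reference of the rotate implemented above, included
+# editorially for clarity; _rotl128_ref is a hypothetical helper that
+# the generator never calls.  Each 32-bit word keeps its own bits
+# shifted left and takes the next word's top bits in from the right.
+sub _rotl128_ref {
+  my ($rot,@w)=@_;	# @w holds four 32-bit words, most significant first
+
+    return @w unless $rot;
+    return map { (($w[$_]<<$rot) | ($w[($_+1)%4]>>(32-$rot))) & 0xffffffff }
+		(0..3);
+}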
+
+sub _saveround {
+my ($rnd,$key,@T)=@_;
+my $bias=int(@T[0])?shift(@T):0;
+
+	&mov	(&DWP($bias+$rnd*8+0,$key),@T[0]);
+	&mov	(&DWP($bias+$rnd*8+4,$key),@T[1])	if ($#T>=1);
+	&mov	(&DWP($bias+$rnd*8+8,$key),@T[2])	if ($#T>=2);
+	&mov	(&DWP($bias+$rnd*8+12,$key),@T[3])	if ($#T>=3);
+}
+
+sub _loadround {
+my ($rnd,$key,@T)=@_;
+my $bias=int(@T[0])?shift(@T):0;
+
+	&mov	(@T[0],&DWP($bias+$rnd*8+0,$key));
+	&mov	(@T[1],&DWP($bias+$rnd*8+4,$key))	if ($#T>=1);
+	&mov	(@T[2],&DWP($bias+$rnd*8+8,$key))	if ($#T>=2);
+	&mov	(@T[3],&DWP($bias+$rnd*8+12,$key))	if ($#T>=3);
+}
+
+# void Camellia_Ekeygen(
+#		const int keyBitLength,
+#		const Byte *rawKey,
+#		KEY_TABLE_TYPE keyTable)
+&function_begin("Camellia_Ekeygen");
+{ my $step=0;
+
+	&stack_push(4);				# place for s[0-3]
+
+	&mov	($Tbl,&wparam(0));		# load arguments
+	&mov	($idx,&wparam(1));
+	&mov	($key,&wparam(2));
+
+	&mov	(@T[0],&DWP(0,$idx));		# load 0-127 bits
+	&mov	(@T[1],&DWP(4,$idx));
+	&mov	(@T[2],&DWP(8,$idx));
+	&mov	(@T[3],&DWP(12,$idx));
+
+	&bswap	(@T[0]);
+	&bswap	(@T[1]);
+	&bswap	(@T[2]);
+	&bswap	(@T[3]);
+
+	&_saveround	(0,$key,@T);		# KL<<<0
+
+	&cmp	($Tbl,128);
+	&je	(&label("1st128"));
+
+	&mov	(@T[0],&DWP(16,$idx));		# load 128-191 bits
+	&mov	(@T[1],&DWP(20,$idx));
+	&cmp	($Tbl,192);
+	&je	(&label("1st192"));
+	&mov	(@T[2],&DWP(24,$idx));		# load 192-255 bits
+	&mov	(@T[3],&DWP(28,$idx));
+	&jmp	(&label("1st256"));
+&set_label("1st192",4);
+	&mov	(@T[2],@T[0]);
+	&mov	(@T[3],@T[1]);
+	&not	(@T[2]);
+	&not	(@T[3]);
+&set_label("1st256",4);
+	&bswap	(@T[0]);
+	&bswap	(@T[1]);
+	&bswap	(@T[2]);
+	&bswap	(@T[3]);
+
+	&_saveround	(4,$key,@T);		# temporary storage for KR!
+
+	&xor	(@T[0],&DWP(0*8+0,$key));	# KR^KL
+	&xor	(@T[1],&DWP(0*8+4,$key));
+	&xor	(@T[2],&DWP(1*8+0,$key));
+	&xor	(@T[3],&DWP(1*8+4,$key));
+
+&set_label("1st128",4);
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop($Tbl);
+	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
+	&lea	($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
+
+	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[0]
+	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
+	&mov	(&swtmp(1),@T[1]);
+	&mov	(&swtmp(2),@T[2]);
+	&mov	(&swtmp(3),@T[3]);
+	&Camellia_Feistel($step++);
+	&Camellia_Feistel($step++);
+	&mov	(@T[2],&swtmp(2));
+	&mov	(@T[3],&swtmp(3));
+
+	&mov	($idx,&wparam(2));
+	&xor	(@T[0],&DWP(0*8+0,$idx));	# ^KL
+	&xor	(@T[1],&DWP(0*8+4,$idx));
+	&xor	(@T[2],&DWP(1*8+0,$idx));
+	&xor	(@T[3],&DWP(1*8+4,$idx));
+
+	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[4]
+	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
+	&mov	(&swtmp(1),@T[1]);
+	&mov	(&swtmp(2),@T[2]);
+	&mov	(&swtmp(3),@T[3]);
+	&Camellia_Feistel($step++);
+	&Camellia_Feistel($step++);
+	&mov	(@T[2],&swtmp(2));
+	&mov	(@T[3],&swtmp(3));
+
+	&mov	($idx,&wparam(0));
+	&cmp	($idx,128);
+	&jne	(&label("2nd256"));
+
+	&mov	($key,&wparam(2));
+	&lea	($key,&DWP(128,$key));		# size optimization
+
+	####### process KA
+	&_saveround	(2,$key,-128,@T);	# KA<<<0
+	&_rotl128	(@T,15,6,@T);		# KA<<<15
+	&_rotl128	(@T,15,8,@T);		# KA<<<(15+15=30)
+	&_rotl128	(@T,15,12,@T[0],@T[1]);	# KA<<<(30+15=45)
+	&_rotl128	(@T,15,14,@T);		# KA<<<(45+15=60)
+	push		(@T,shift(@T));		# rotl128(@T,32);
+	&_rotl128	(@T,2,20,@T);		# KA<<<(60+32+2=94)
+	&_rotl128	(@T,17,24,@T);		# KA<<<(94+17=111)
+
+	####### process KL
+	&_loadround	(0,$key,-128,@T);	# load KL
+	&_rotl128	(@T,15,4,@T);		# KL<<<15
+	&_rotl128	(@T,30,10,@T);		# KL<<<(15+30=45)
+	&_rotl128	(@T,15,13,@T[2],@T[3]);	# KL<<<(45+15=60)
+	&_rotl128	(@T,17,16,@T);		# KL<<<(60+17=77)
+	&_rotl128	(@T,17,18,@T);		# KL<<<(77+17=94)
+	&_rotl128	(@T,17,22,@T);		# KL<<<(94+17=111)
+
+	while (@T[0] ne "eax")			# restore order
+	{   unshift	(@T,pop(@T));   }
+
+	&mov	("eax",3);			# 3 grandRounds
+	&jmp	(&label("done"));
+
+&set_label("2nd256",16);
+	&mov	($idx,&wparam(2));
+	&_saveround	(6,$idx,@T);		# temporary storage for KA!
+
+	&xor	(@T[0],&DWP(4*8+0,$idx));	# KA^KR
+	&xor	(@T[1],&DWP(4*8+4,$idx));
+	&xor	(@T[2],&DWP(5*8+0,$idx));
+	&xor	(@T[3],&DWP(5*8+4,$idx));
+
+	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[8]
+	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
+	&mov	(&swtmp(1),@T[1]);
+	&mov	(&swtmp(2),@T[2]);
+	&mov	(&swtmp(3),@T[3]);
+	&Camellia_Feistel($step++);
+	&Camellia_Feistel($step++);
+	&mov	(@T[2],&swtmp(2));
+	&mov	(@T[3],&swtmp(3));
+
+	&mov	($key,&wparam(2));
+	&lea	($key,&DWP(128,$key));		# size optimization
+
+	####### process KB
+	&_saveround	(2,$key,-128,@T);	# KB<<<0
+	&_rotl128	(@T,30,10,@T);		# KB<<<30
+	&_rotl128	(@T,30,20,@T);		# KB<<<(30+30=60)
+	push		(@T,shift(@T));		# rotl128(@T,32);
+	&_rotl128	(@T,19,32,@T);		# KB<<<(60+32+19=111)
+
+	####### process KR
+	&_loadround	(4,$key,-128,@T);	# load KR
+	&_rotl128	(@T,15,4,@T);		# KR<<<15
+	&_rotl128	(@T,15,8,@T);		# KR<<<(15+15=30)
+	&_rotl128	(@T,30,18,@T);		# KR<<<(30+30=60)
+	push		(@T,shift(@T));		# rotl128(@T,32);
+	&_rotl128	(@T,2,26,@T);		# KR<<<(60+32+2=94)
+
+	####### process KA
+	&_loadround	(6,$key,-128,@T);	# load KA
+	&_rotl128	(@T,15,6,@T);		# KA<<<15
+	&_rotl128	(@T,30,14,@T);		# KA<<<(15+30=45)
+	push		(@T,shift(@T));		# rotl128(@T,32);
+	&_rotl128	(@T,0,24,@T);		# KA<<<(45+32+0=77)
+	&_rotl128	(@T,17,28,@T);		# KA<<<(77+17=94)
+
+	####### process KL
+	&_loadround	(0,$key,-128,@T);	# load KL
+	push		(@T,shift(@T));		# rotl128(@T,32);
+	&_rotl128	(@T,13,12,@T);		# KL<<<(32+13=45)
+	&_rotl128	(@T,15,16,@T);		# KL<<<(45+15=60)
+	&_rotl128	(@T,17,22,@T);		# KL<<<(60+17=77)
+	push		(@T,shift(@T));		# rotl128(@T,32);
+	&_rotl128	(@T,2,30,@T);		# KL<<<(77+32+2=111)
+
+	while (@T[0] ne "eax")			# restore order
+	{   unshift	(@T,pop(@T));   }
+
+	&mov	("eax",4);			# 4 grandRounds
+&set_label("done");
+	&lea	("edx",&DWP(272-128,$key));	# end of key schedule
+	&stack_pop(4);
+}
+&function_end("Camellia_Ekeygen");
+
+if ($OPENSSL) {
+# int Camellia_set_key (
+#		const unsigned char *userKey,
+#		int bits,
+#		CAMELLIA_KEY *key)
+&function_begin_B("Camellia_set_key");
+	&push	("ebx");
+	&mov	("ecx",&wparam(0));	# pull arguments
+	&mov	("ebx",&wparam(1));
+	&mov	("edx",&wparam(2));
+
+	&mov	("eax",-1);
+	&test	("ecx","ecx");
+	&jz	(&label("done"));	# userKey==NULL?
+	&test	("edx","edx");
+	&jz	(&label("done"));	# key==NULL?
+
+	&mov	("eax",-2);
+	&cmp	("ebx",256);
+	&je	(&label("arg_ok"));	# bits==256?
+	&cmp	("ebx",192);
+	&je	(&label("arg_ok"));	# bits==192?
+	&cmp	("ebx",128);
+	&jne	(&label("done"));	# bits!=128?
+&set_label("arg_ok",4);
+
+	&push	("edx");		# push arguments
+	&push	("ecx");
+	&push	("ebx");
+	&call	("Camellia_Ekeygen");
+	&stack_pop(3);
+
+	# eax holds grandRounds and edx points at where to put it
+	&mov	(&DWP(0,"edx"),"eax");
+	&xor	("eax","eax");
+&set_label("done",4);
+	&pop	("ebx");
+	&ret	();
+&function_end_B("Camellia_set_key");
+}
+
+@SBOX=(
+112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
+ 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
+134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
+166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
+139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
+223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
+ 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
+254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
+170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
+ 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
+135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
+ 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
+233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
+120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
+114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
+ 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
+
+sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; }
+sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
+sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
+sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }
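+# Editorial worked example (derived from the tables above): @SBOX[0] is
+# 112 = 0x70, so S1110(0) = 0x70707000, S4404(0) = 0x70700070,
+# S0222(0) = 0x00e0e0e0 and S3033(0) = 0x38003838; each variant spreads
+# the (pre- or post-rotated) s-box byte into the lanes its name encodes.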
+
+&set_label("Camellia_SIGMA",64);
+&data_word(
+    0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
+    0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
+    0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
+    0,          0,          0,          0);
+&set_label("Camellia_SBOX",64);
+# tables are interleaved, remember?
+for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
+for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
+
+# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
+#			size_t length, const CAMELLIA_KEY *key,
+#			unsigned char *ivp,const int enc);
+{
+# stack frame layout
+#             -4(%esp)		# return address	 0(%esp)
+#              0(%esp)		# s0			 4(%esp)
+#              4(%esp)		# s1			 8(%esp)
+#              8(%esp)		# s2			12(%esp)
+#             12(%esp)		# s3			16(%esp)
+#             16(%esp)		# end of key schedule	20(%esp)
+#             20(%esp)		# %esp backup
+my $_inp=&DWP(24,"esp");	#copy of wparam(0)
+my $_out=&DWP(28,"esp");	#copy of wparam(1)
+my $_len=&DWP(32,"esp");	#copy of wparam(2)
+my $_key=&DWP(36,"esp");	#copy of wparam(3)
+my $_ivp=&DWP(40,"esp");	#copy of wparam(4)
+my $ivec=&DWP(44,"esp");	#ivec[16]
+my $_tmp=&DWP(44,"esp");	#volatile variable [yes, aliases with ivec]
+my ($s0,$s1,$s2,$s3) = @T;
+
+&function_begin("Camellia_cbc_encrypt");
+	&mov	($s2 eq "ecx"? $s2 : "",&wparam(2));	# load len
+	&cmp	($s2,0);
+	&je	(&label("enc_out"));
+
+	&pushf	();
+	&cld	();
+
+	&mov	($s0,&wparam(0));	# load inp
+	&mov	($s1,&wparam(1));	# load out
+	#&mov	($s2,&wparam(2));	# load len
+	&mov	($s3,&wparam(3));	# load key
+	&mov	($Tbl,&wparam(4));	# load ivp
+
+	# allocate aligned stack frame...
+	&lea	($idx,&DWP(-64,"esp"));
+	&and	($idx,-64);
+
+	# place stack frame just "above mod 1024" the key schedule
+	# this ensures that cache associativity of 2 suffices
+	&lea	($key,&DWP(-64-63,$s3));
+	&sub	($key,$idx);
+	&neg	($key);
+	&and	($key,0x3C0);	# modulo 1024, but aligned to cache-line
+	&sub	($idx,$key);
+
+	&mov	($key,&wparam(5));	# load enc
+
+	&exch	("esp",$idx);
+	&add	("esp",4);		# reserve for return address!
+	&mov	($_esp,$idx);		# save %esp
+
+	&mov	($_inp,$s0);		# save copy of inp
+	&mov	($_out,$s1);		# save copy of out
+	&mov	($_len,$s2);		# save copy of len
+	&mov	($_key,$s3);		# save copy of key
+	&mov	($_ivp,$Tbl);		# save copy of ivp
+
+	&call   (&label("pic_point"));	# make it PIC!
+	&set_label("pic_point");
+	&blindpop($Tbl);
+	&lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
+
+	&mov	($idx,32);
+	&set_label("prefetch_sbox",4);
+		&mov	($s0,&DWP(0,$Tbl));
+		&mov	($s1,&DWP(32,$Tbl));
+		&mov	($s2,&DWP(64,$Tbl));
+		&mov	($s3,&DWP(96,$Tbl));
+		&lea	($Tbl,&DWP(128,$Tbl));
+		&dec	($idx);
+	&jnz	(&label("prefetch_sbox"));
+	&mov	($s0,$_key);
+	&sub	($Tbl,4096);
+	&mov	($idx,$_inp);
+	&mov	($s3,&DWP(272,$s0));		# load grandRounds
+
+	&cmp	($key,0);
+	&je	(&label("DECRYPT"));
+
+	&mov	($s2,$_len);
+	&mov	($key,$_ivp);
+	&shl	($s3,6);
+	&lea	($s3,&DWP(0,$s0,$s3));
+	&mov	($_end,$s3);
+
+	&test	($s2,0xFFFFFFF0);
+	&jz	(&label("enc_tail"));		# short input...
+
+	&mov	($s0,&DWP(0,$key));		# load iv
+	&mov	($s1,&DWP(4,$key));
+
+	&set_label("enc_loop",4);
+		&mov	($s2,&DWP(8,$key));
+		&mov	($s3,&DWP(12,$key));
+
+		&xor	($s0,&DWP(0,$idx));	# xor input data
+		&xor	($s1,&DWP(4,$idx));
+		&xor	($s2,&DWP(8,$idx));
+		&bswap	($s0);
+		&xor	($s3,&DWP(12,$idx));
+		&bswap	($s1);
+		&mov	($key,$_key);		# load key
+		&bswap	($s2);
+		&bswap	($s3);
+
+		&call	("_x86_Camellia_encrypt");
+
+		&mov	($idx,$_inp);		# load inp
+		&mov	($key,$_out);		# load out
+
+		&bswap	($s0);
+		&bswap	($s1);
+		&bswap	($s2);
+		&mov	(&DWP(0,$key),$s0);	# save output data
+		&bswap	($s3);
+		&mov	(&DWP(4,$key),$s1);
+		&mov	(&DWP(8,$key),$s2);
+		&mov	(&DWP(12,$key),$s3);
+
+		&mov	($s2,$_len);		# load len
+
+		&lea	($idx,&DWP(16,$idx));
+		&mov	($_inp,$idx);		# save inp
+
+		&lea	($s3,&DWP(16,$key));
+		&mov	($_out,$s3);		# save out
+
+		&sub	($s2,16);
+		&test	($s2,0xFFFFFFF0);
+		&mov	($_len,$s2);		# save len
+	&jnz	(&label("enc_loop"));
+	&test	($s2,15);
+	&jnz	(&label("enc_tail"));
+	&mov	($idx,$_ivp);		# load ivp
+	&mov	($s2,&DWP(8,$key));	# restore last dwords
+	&mov	($s3,&DWP(12,$key));
+	&mov	(&DWP(0,$idx),$s0);	# save ivec
+	&mov	(&DWP(4,$idx),$s1);
+	&mov	(&DWP(8,$idx),$s2);
+	&mov	(&DWP(12,$idx),$s3);
+
+	&mov	("esp",$_esp);
+	&popf	();
+    &set_label("enc_out");
+	&function_end_A();
+	&pushf	();			# kludge, never executed
+
+    &set_label("enc_tail",4);
+	&mov	($s0,$key eq "edi" ? $key : "");
+	&mov	($key,$_out);			# load out
+	&push	($s0);				# push ivp
+	&mov	($s1,16);
+	&sub	($s1,$s2);
+	&cmp	($key,$idx);			# compare with inp
+	&je	(&label("enc_in_place"));
+	&align	(4);
+	&data_word(0xA4F3F689);	# rep movsb	# copy input
+	&jmp	(&label("enc_skip_in_place"));
+    &set_label("enc_in_place");
+	&lea	($key,&DWP(0,$key,$s2));
+    &set_label("enc_skip_in_place");
+	&mov	($s2,$s1);
+	&xor	($s0,$s0);
+	&align	(4);
+	&data_word(0xAAF3F689);	# rep stosb	# zero tail
+	&pop	($key);				# pop ivp
+
+	&mov	($idx,$_out);			# output as input
+	&mov	($s0,&DWP(0,$key));
+	&mov	($s1,&DWP(4,$key));
+	&mov	($_len,16);			# len=16
+	&jmp	(&label("enc_loop"));		# one more spin...
+
+#----------------------------- DECRYPT -----------------------------#
+&set_label("DECRYPT",16);
+	&shl	($s3,6);
+	&lea	($s3,&DWP(0,$s0,$s3));
+	&mov	($_end,$s0);
+	&mov	($_key,$s3);
+
+	&cmp	($idx,$_out);
+	&je	(&label("dec_in_place"));	# in-place processing...
+
+	&mov	($key,$_ivp);			# load ivp
+	&mov	($_tmp,$key);
+
+	&set_label("dec_loop",4);
+		&mov	($s0,&DWP(0,$idx));	# read input
+		&mov	($s1,&DWP(4,$idx));
+		&mov	($s2,&DWP(8,$idx));
+		&bswap	($s0);
+		&mov	($s3,&DWP(12,$idx));
+		&bswap	($s1);
+		&mov	($key,$_key);		# load key
+		&bswap	($s2);
+		&bswap	($s3);
+
+		&call	("_x86_Camellia_decrypt");
+
+		&mov	($key,$_tmp);		# load ivp
+		&mov	($idx,$_len);		# load len
+
+		&bswap	($s0);
+		&bswap	($s1);
+		&bswap	($s2);
+		&xor	($s0,&DWP(0,$key));	# xor iv
+		&bswap	($s3);
+		&xor	($s1,&DWP(4,$key));
+		&xor	($s2,&DWP(8,$key));
+		&xor	($s3,&DWP(12,$key));
+
+		&sub	($idx,16);
+		&jc	(&label("dec_partial"));
+		&mov	($_len,$idx);		# save len
+		&mov	($idx,$_inp);		# load inp
+		&mov	($key,$_out);		# load out
+
+		&mov	(&DWP(0,$key),$s0);	# write output
+		&mov	(&DWP(4,$key),$s1);
+		&mov	(&DWP(8,$key),$s2);
+		&mov	(&DWP(12,$key),$s3);
+
+		&mov	($_tmp,$idx);		# save ivp
+		&lea	($idx,&DWP(16,$idx));
+		&mov	($_inp,$idx);		# save inp
+
+		&lea	($key,&DWP(16,$key));
+		&mov	($_out,$key);		# save out
+
+	&jnz	(&label("dec_loop"));
+	&mov	($key,$_tmp);		# load temp ivp
+    &set_label("dec_end");
+	&mov	($idx,$_ivp);		# load user ivp
+	&mov	($s0,&DWP(0,$key));	# load iv
+	&mov	($s1,&DWP(4,$key));
+	&mov	($s2,&DWP(8,$key));
+	&mov	($s3,&DWP(12,$key));
+	&mov	(&DWP(0,$idx),$s0);	# copy back to user
+	&mov	(&DWP(4,$idx),$s1);
+	&mov	(&DWP(8,$idx),$s2);
+	&mov	(&DWP(12,$idx),$s3);
+	&jmp	(&label("dec_out"));
+
+    &set_label("dec_partial",4);
+	&lea	($key,$ivec);
+	&mov	(&DWP(0,$key),$s0);	# dump output to stack
+	&mov	(&DWP(4,$key),$s1);
+	&mov	(&DWP(8,$key),$s2);
+	&mov	(&DWP(12,$key),$s3);
+	&lea	($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
+	&mov	($idx eq "esi" ? $idx : "",$key);
+	&mov	($key eq "edi" ? $key : "",$_out);	# load out
+	&data_word(0xA4F3F689);	# rep movsb		# copy output
+	&mov	($key,$_inp);				# use inp as temp ivp
+	&jmp	(&label("dec_end"));
+
+    &set_label("dec_in_place",4);
+	&set_label("dec_in_place_loop");
+		&lea	($key,$ivec);
+		&mov	($s0,&DWP(0,$idx));	# read input
+		&mov	($s1,&DWP(4,$idx));
+		&mov	($s2,&DWP(8,$idx));
+		&mov	($s3,&DWP(12,$idx));
+
+		&mov	(&DWP(0,$key),$s0);	# copy to temp
+		&mov	(&DWP(4,$key),$s1);
+		&mov	(&DWP(8,$key),$s2);
+		&bswap	($s0);
+		&mov	(&DWP(12,$key),$s3);
+		&bswap	($s1);
+		&mov	($key,$_key);		# load key
+		&bswap	($s2);
+		&bswap	($s3);
+
+		&call	("_x86_Camellia_decrypt");
+
+		&mov	($key,$_ivp);		# load ivp
+		&mov	($idx,$_out);		# load out
+
+		&bswap	($s0);
+		&bswap	($s1);
+		&bswap	($s2);
+		&xor	($s0,&DWP(0,$key));	# xor iv
+		&bswap	($s3);
+		&xor	($s1,&DWP(4,$key));
+		&xor	($s2,&DWP(8,$key));
+		&xor	($s3,&DWP(12,$key));
+
+		&mov	(&DWP(0,$idx),$s0);	# write output
+		&mov	(&DWP(4,$idx),$s1);
+		&mov	(&DWP(8,$idx),$s2);
+		&mov	(&DWP(12,$idx),$s3);
+
+		&lea	($idx,&DWP(16,$idx));
+		&mov	($_out,$idx);		# save out
+
+		&lea	($idx,$ivec);
+		&mov	($s0,&DWP(0,$idx));	# read temp
+		&mov	($s1,&DWP(4,$idx));
+		&mov	($s2,&DWP(8,$idx));
+		&mov	($s3,&DWP(12,$idx));
+
+		&mov	(&DWP(0,$key),$s0);	# copy iv
+		&mov	(&DWP(4,$key),$s1);
+		&mov	(&DWP(8,$key),$s2);
+		&mov	(&DWP(12,$key),$s3);
+
+		&mov	($idx,$_inp);		# load inp
+
+		&lea	($idx,&DWP(16,$idx));
+		&mov	($_inp,$idx);		# save inp
+
+		&mov	($s2,$_len);		# load len
+		&sub	($s2,16);
+		&jc	(&label("dec_in_place_partial"));
+		&mov	($_len,$s2);		# save len
+	&jnz	(&label("dec_in_place_loop"));
+	&jmp	(&label("dec_out"));
+
+    &set_label("dec_in_place_partial",4);
+	# one can argue whether this is actually required...
+	&mov	($key eq "edi" ? $key : "",$_out);
+	&lea	($idx eq "esi" ? $idx : "",$ivec);
+	&lea	($key,&DWP(0,$key,$s2));
+	&lea	($idx,&DWP(16,$idx,$s2));
+	&neg	($s2 eq "ecx" ? $s2 : "");
+	&data_word(0xA4F3F689);	# rep movsb	# restore tail
+
+    &set_label("dec_out",4);
+    &mov	("esp",$_esp);
+    &popf	();
+&function_end("Camellia_cbc_encrypt");
+}
+
+&asciz("Camellia for x86 by <appro@openssl.org>");
+
+&asm_finish();
diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl b/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl
new file mode 100644
index 0000000000..c683646ca7
--- /dev/null
+++ b/src/lib/libcrypto/camellia/asm/cmll-x86_64.pl
@@ -0,0 +1,1080 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
+#
+# This module may be used under the terms of either the GNU General
+# Public License version 2 or later, the GNU Lesser General Public
+# License version 2.1 or later, the Mozilla Public License version
+# 1.1 or the BSD License. The exact terms of either license are
+# distributed along with this module. For further details see
+# http://www.openssl.org/~appro/camellia/.
+# ====================================================================
+
+# Performance in cycles per processed byte (less is better) in
+# 'openssl speed ...' benchmark:
+#
+#			AMD64	Core2	EM64T
+# -evp camellia-128-ecb	16.7	21.0	22.7
+# + over gcc 3.4.6	+25%	+5%	0%
+#
+# camellia-128-cbc	15.7	20.4	21.1
+#
+# 128-bit key setup	128	216	205	cycles/key
+# + over gcc 3.4.6	+54%	+39%	+15%
+#
+# Numbers in "+" rows represent performance improvement over
+# compiler-generated code. Key setup timings are impressive on AMD and
+# Core2 thanks to 64-bit operations being covertly deployed.
+# Improvement on EM64T, the pre-Core2 Intel x86_64 CPU, is not as
+# impressive, because it apparently emulates some of the 64-bit
+# operations in [32-bit] microcode.
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/;    $r; }
+sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
+                        $r =~ s/%[er]([sd]i)/%\1l/;
+                        $r =~ s/%(r[0-9]+)[d]?/%\1b/;   $r; }
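+# Editorial examples of the substitutions above (not original
+# commentary): &hi("%eax") yields "%ah", &lo("%eax") yields "%al",
+# &lo("%esi") yields "%sil" and &lo("%r9d") yields "%r9b"; the 32-bit
+# register names used in the rounds are rewritten to byte registers.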
+
+$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
+@S=("%r8d","%r9d","%r10d","%r11d");
+$i0="%esi";
+$i1="%edi";
+$Tbl="%rbp";	# size optimization
+$inp="%r12";
+$out="%r13";
+$key="%r14";
+$keyend="%r15";
+$arg0d=$win64?"%ecx":"%edi";
+
+# const unsigned int Camellia_SBOX[4][256];
+# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
+# and [2][] - with [3][]. This is done to minimize code size.
+$SBOX1_1110=0;		# Camellia_SBOX[0]
+$SBOX4_4404=4;		# Camellia_SBOX[1]
+$SBOX2_0222=2048;	# Camellia_SBOX[2]
+$SBOX3_3033=2052;	# Camellia_SBOX[3]
+
+sub Camellia_Feistel {
+my $i=@_[0];
+my $seed=defined(@_[1])?@_[1]:0;
+my $scale=$seed<0?-8:8;
+my $j=($i&1)*2;
+my $s0=@S[($j)%4],$s1=@S[($j+1)%4],$s2=@S[($j+2)%4],$s3=@S[($j+3)%4];
+
+$code.=<<___;
+	xor	$s0,$t0				# t0^=key[0]
+	xor	$s1,$t1				# t1^=key[1]
+	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
+	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
+	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
+	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
+	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
+	shr	\$16,$t0
+	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
+	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
+	shr	\$16,$t1
+	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
+	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
+	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
+	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
+	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
+	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
+	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
+	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
+	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
+	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
+	mov	`$seed+($i+1)*$scale+4`($key),$t0
+	xor	$t3,$t2				# t2^=t3
+	ror	\$8,$t3				# t3=RightRotate(t3,8)
+	xor	$t2,$s2
+	xor	$t2,$s3
+	xor	$t3,$s3
+___
+}
+
+# void Camellia_EncryptBlock_Rounds(
+#		int grandRounds,
+#		const Byte plaintext[],
+#		const KEY_TABLE_TYPE keyTable,
+#		Byte ciphertext[])
+$code=<<___;
+.text
+
+# V1.x API
+.globl	Camellia_EncryptBlock
+.type	Camellia_EncryptBlock,\@abi-omnipotent
+.align	16
+Camellia_EncryptBlock:
+	movl	\$128,%eax
+	subl	$arg0d,%eax
+	movl	\$3,$arg0d
+	adcl	\$0,$arg0d	# keyBitLength==128?3:4
+	jmp	.Lenc_rounds
+.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
+# V2
+.globl	Camellia_EncryptBlock_Rounds
+.type	Camellia_EncryptBlock_Rounds,\@function,4
+.align	16
+.Lenc_rounds:
+Camellia_EncryptBlock_Rounds:
+	push	%rbx
+	push	%rbp
+	push	%r13
+	push	%r14
+	push	%r15
+.Lenc_prologue:
+
+	#mov	%rsi,$inp		# put away arguments
+	mov	%rcx,$out
+	mov	%rdx,$key
+
+	shl	\$6,%edi		# process grandRounds
+	lea	.LCamellia_SBOX(%rip),$Tbl
+	lea	($key,%rdi),$keyend
+
+	mov	0(%rsi),@S[0]		# load plaintext
+	mov	4(%rsi),@S[1]
+	mov	8(%rsi),@S[2]
+	bswap	@S[0]
+	mov	12(%rsi),@S[3]
+	bswap	@S[1]
+	bswap	@S[2]
+	bswap	@S[3]
+
+	call	_x86_64_Camellia_encrypt
+
+	bswap	@S[0]
+	bswap	@S[1]
+	bswap	@S[2]
+	mov	@S[0],0($out)
+	bswap	@S[3]
+	mov	@S[1],4($out)
+	mov	@S[2],8($out)
+	mov	@S[3],12($out)
+
+	mov	0(%rsp),%r15
+	mov	8(%rsp),%r14
+	mov	16(%rsp),%r13
+	mov	24(%rsp),%rbp
+	mov	32(%rsp),%rbx
+	lea	40(%rsp),%rsp
+.Lenc_epilogue:
+	ret
+.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
+
+.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
+.align	16
+_x86_64_Camellia_encrypt:
+	xor	0($key),@S[1]
+	xor	4($key),@S[0]		# ^=key[0-3]
+	xor	8($key),@S[3]
+	xor	12($key),@S[2]
+.align	16
+.Leloop:
+	mov	16($key),$t1		# prefetch key[4-5]
+	mov	20($key),$t0
+
+___
+	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
+$code.=<<___;
+	lea	16*4($key),$key
+	cmp	$keyend,$key
+	mov	8($key),$t3		# prefetch key[2-3]
+	mov	12($key),$t2
+	je	.Ledone
+
+	and	@S[0],$t0
+	or	@S[3],$t3
+	rol	\$1,$t0
+	xor	$t3,@S[2]		# s2^=s3|key[3];
+	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
+	and	@S[2],$t2
+	or	@S[1],$t1
+	rol	\$1,$t2
+	xor	$t1,@S[0]		# s0^=s1|key[1];
+	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
+	jmp	.Leloop
+
+.align	16
+.Ledone:
+	xor	@S[2],$t0		# SwapHalf
+	xor	@S[3],$t1
+	xor	@S[0],$t2
+	xor	@S[1],$t3
+
+	mov	$t0,@S[0]
+	mov	$t1,@S[1]
+	mov	$t2,@S[2]
+	mov	$t3,@S[3]
+
+	.byte	0xf3,0xc3		# rep ret
+.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
+
+# V1.x API
+.globl	Camellia_DecryptBlock
+.type	Camellia_DecryptBlock,\@abi-omnipotent
+.align	16
+Camellia_DecryptBlock:
+	movl	\$128,%eax
+	subl	$arg0d,%eax
+	movl	\$3,$arg0d
+	adcl	\$0,$arg0d	# keyBitLength==128?3:4
+	jmp	.Ldec_rounds
+.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
+# V2
+.globl	Camellia_DecryptBlock_Rounds
+.type	Camellia_DecryptBlock_Rounds,\@function,4
+.align	16
+.Ldec_rounds:
+Camellia_DecryptBlock_Rounds:
+	push	%rbx
+	push	%rbp
+	push	%r13
+	push	%r14
+	push	%r15
+.Ldec_prologue:
+
+	#mov	%rsi,$inp		# put away arguments
+	mov	%rcx,$out
+	mov	%rdx,$keyend
+
+	shl	\$6,%edi		# process grandRounds
+	lea	.LCamellia_SBOX(%rip),$Tbl
+	lea	($keyend,%rdi),$key
+
+	mov	0(%rsi),@S[0]		# load plaintext
+	mov	4(%rsi),@S[1]
+	mov	8(%rsi),@S[2]
+	bswap	@S[0]
+	mov	12(%rsi),@S[3]
+	bswap	@S[1]
+	bswap	@S[2]
+	bswap	@S[3]
+
+	call	_x86_64_Camellia_decrypt
+
+	bswap	@S[0]
+	bswap	@S[1]
+	bswap	@S[2]
+	mov	@S[0],0($out)
+	bswap	@S[3]
+	mov	@S[1],4($out)
+	mov	@S[2],8($out)
+	mov	@S[3],12($out)
+
+	mov	0(%rsp),%r15
+	mov	8(%rsp),%r14
+	mov	16(%rsp),%r13
+	mov	24(%rsp),%rbp
+	mov	32(%rsp),%rbx
+	lea	40(%rsp),%rsp
+.Ldec_epilogue:
+	ret
+.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
+
+.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
+.align	16
+_x86_64_Camellia_decrypt:
+	xor	0($key),@S[1]
+	xor	4($key),@S[0]		# ^=key[0-3]
+	xor	8($key),@S[3]
+	xor	12($key),@S[2]
+.align	16
+.Ldloop:
+	mov	-8($key),$t1		# prefetch key[4-5]
+	mov	-4($key),$t0
+
+___
+	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
+$code.=<<___;
+	lea	-16*4($key),$key
+	cmp	$keyend,$key
+	mov	0($key),$t3		# prefetch key[2-3]
+	mov	4($key),$t2
+	je	.Lddone
+
+	and	@S[0],$t0
+	or	@S[3],$t3
+	rol	\$1,$t0
+	xor	$t3,@S[2]		# s2^=s3|key[3];
+	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
+	and	@S[2],$t2
+	or	@S[1],$t1
+	rol	\$1,$t2
+	xor	$t1,@S[0]		# s0^=s1|key[1];
+	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
+
+	jmp	.Ldloop
+
+.align	16
+.Lddone:
+	xor	@S[2],$t2
+	xor	@S[3],$t3
+	xor	@S[0],$t0
+	xor	@S[1],$t1
+
+	mov	$t2,@S[0]		# SwapHalf
+	mov	$t3,@S[1]
+	mov	$t0,@S[2]
+	mov	$t1,@S[3]
+
+	.byte	0xf3,0xc3		# rep ret
+.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
+___
+
+sub _saveround {
+my ($rnd,$key,@T)=@_;
+my $bias=int(@T[0])?shift(@T):0;
+
+    if ($#T==3) {
+	$code.=<<___;
+	mov	@T[1],`$bias+$rnd*8+0`($key)
+	mov	@T[0],`$bias+$rnd*8+4`($key)
+	mov	@T[3],`$bias+$rnd*8+8`($key)
+	mov	@T[2],`$bias+$rnd*8+12`($key)
+___
+    } else {
+	$code.="	mov	@T[0],`$bias+$rnd*8+0`($key)\n";
+	$code.="	mov	@T[1],`$bias+$rnd*8+8`($key)\n"	if ($#T>=1);
+    }
+}
+
+sub _loadround {
+my ($rnd,$key,@T)=@_;
+my $bias=int(@T[0])?shift(@T):0;
+
+$code.="	mov	`$bias+$rnd*8+0`($key),@T[0]\n";
+$code.="	mov	`$bias+$rnd*8+8`($key),@T[1]\n"	if ($#T>=1);
+}
+
+# shld is very slow on Intel EM64T family. Even on AMD it limits
+# instruction decode rate [because it's VectorPath] and consequently
+# performance...
+sub __rotl128 {
+my ($i0,$i1,$rot)=@_;
+
+    if ($rot) {
+	$code.=<<___;
+	mov	$i0,%r11
+	shld	\$$rot,$i1,$i0
+	shld	\$$rot,%r11,$i1
+___
+    }
+}
+
+# ... Implementing the 128-bit rotate without shld gives 80% better
+# performance on EM64T, +15% on AMD64 and only ~7% degradation on
+# Core2. This is therefore the preferred variant.
+sub _rotl128 {
+my ($i0,$i1,$rot)=@_;
+
+    if ($rot) {
+	$code.=<<___;
+	mov	$i0,%r11
+	shl	\$$rot,$i0
+	mov	$i1,%r9
+	shr	\$`64-$rot`,%r9
+	shr	\$`64-$rot`,%r11
+	or	%r9,$i0
+	shl	\$$rot,$i1
+	or	%r11,$i1
+___
+    }
+}
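+# Editorial sketch of the effect (not original commentary): for a
+# 128-bit value held as two 64-bit halves ($i0,$i1) the sequence above
+# computes
+#	$i0' = ($i0<<$rot) | ($i1>>(64-$rot))
+#	$i1' = ($i1<<$rot) | ($i0>>(64-$rot))
+# using only shifts and ORs, which is what makes it preferable to shld
+# here.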
+
+{ my $step=0;
+
+$code.=<<___;
+.globl	Camellia_Ekeygen
+.type	Camellia_Ekeygen,\@function,3
+.align	16
+Camellia_Ekeygen:
+	push	%rbx
+	push	%rbp
+	push	%r13
+	push	%r14
+	push	%r15
+.Lkey_prologue:
+
+	mov	%rdi,$keyend		# put away arguments, keyBitLength
+	mov	%rdx,$out		# keyTable
+
+	mov	0(%rsi),@S[0]		# load 0-127 bits
+	mov	4(%rsi),@S[1]
+	mov	8(%rsi),@S[2]
+	mov	12(%rsi),@S[3]
+
+	bswap	@S[0]
+	bswap	@S[1]
+	bswap	@S[2]
+	bswap	@S[3]
+___
+	&_saveround	(0,$out,@S);	# KL<<<0
+$code.=<<___;
+	cmp	\$128,$keyend		# check keyBitLength
+	je	.L1st128
+
+	mov	16(%rsi),@S[0]		# load 128-191 bits
+	mov	20(%rsi),@S[1]
+	cmp	\$192,$keyend
+	je	.L1st192
+	mov	24(%rsi),@S[2]		# load 192-255 bits
+	mov	28(%rsi),@S[3]
+	jmp	.L1st256
+.L1st192:
+	mov	@S[0],@S[2]
+	mov	@S[1],@S[3]
+	not	@S[2]
+	not	@S[3]
+.L1st256:
+	bswap	@S[0]
+	bswap	@S[1]
+	bswap	@S[2]
+	bswap	@S[3]
+___
+	&_saveround	(4,$out,@S);	# temp storage for KR!
+$code.=<<___;
+	xor	0($out),@S[1]		# KR^KL
+	xor	4($out),@S[0]
+	xor	8($out),@S[3]
+	xor	12($out),@S[2]
+
+.L1st128:
+	lea	.LCamellia_SIGMA(%rip),$key
+	lea	.LCamellia_SBOX(%rip),$Tbl
+
+	mov	0($key),$t1
+	mov	4($key),$t0
+___
+	&Camellia_Feistel($step++);
+	&Camellia_Feistel($step++);
+$code.=<<___;
+	xor	0($out),@S[1]		# ^KL
+	xor	4($out),@S[0]
+	xor	8($out),@S[3]
+	xor	12($out),@S[2]
+___
+	&Camellia_Feistel($step++);
+	&Camellia_Feistel($step++);
+$code.=<<___;
+	cmp	\$128,$keyend
+	jne	.L2nd256
+
+	lea	128($out),$out		# size optimization
+	shl	\$32,%r8		# @S[0]||
+	shl	\$32,%r10		# @S[2]||
+	or	%r9,%r8			# ||@S[1]
+	or	%r11,%r10		# ||@S[3]
+___
+	&_loadround	(0,$out,-128,"%rax","%rbx");	# KL
+	&_saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
+	&_rotl128	("%rax","%rbx",15);
+	&_saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
+	&_rotl128	("%r8","%r10",15);
+	&_saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
+	&_rotl128	("%r8","%r10",15);		# 15+15=30
+	&_saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
+	&_rotl128	("%rax","%rbx",30);		# 15+30=45
+	&_saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
+	&_rotl128	("%r8","%r10",15);		# 30+15=45
+	&_saveround	(12,$out,-128,"%r8");		# KA<<<45
+	&_rotl128	("%rax","%rbx",15);		# 45+15=60
+	&_saveround	(13,$out,-128,"%rbx");		# KL<<<60
+	&_rotl128	("%r8","%r10",15);		# 45+15=60
+	&_saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
+	&_rotl128	("%rax","%rbx",17);		# 60+17=77
+	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
+	&_rotl128	("%rax","%rbx",17);		# 77+17=94
+	&_saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
+	&_rotl128	("%r8","%r10",34);		# 60+34=94
+	&_saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
+	&_rotl128	("%rax","%rbx",17);		# 94+17=111
+	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
+	&_rotl128	("%r8","%r10",17);		# 94+17=111
+	&_saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
+$code.=<<___;
+	mov	\$3,%eax
+	jmp	.Ldone
+.align	16
+.L2nd256:
+___
+	&_saveround	(6,$out,@S);	# temp storage for KA!
+$code.=<<___;
+	xor	`4*8+0`($out),@S[1]	# KA^KR
+	xor	`4*8+4`($out),@S[0]
+	xor	`5*8+0`($out),@S[3]
+	xor	`5*8+4`($out),@S[2]
+___
+	&Camellia_Feistel($step++);
+	&Camellia_Feistel($step++);
+
+	&_loadround	(0,$out,"%rax","%rbx");	# KL
+	&_loadround	(4,$out,"%rcx","%rdx");	# KR
+	&_loadround	(6,$out,"%r14","%r15");	# KA
+$code.=<<___;
+	lea	128($out),$out		# size optimization
+	shl	\$32,%r8		# @S[0]||
+	shl	\$32,%r10		# @S[2]||
+	or	%r9,%r8			# ||@S[1]
+	or	%r11,%r10		# ||@S[3]
+___
+	&_saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
+	&_rotl128	("%rcx","%rdx",15);
+	&_saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
+	&_rotl128	("%r14","%r15",15);
+	&_saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
+	&_rotl128	("%rcx","%rdx",15);		# 15+15=30
+	&_saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
+	&_rotl128	("%r8","%r10",30);
+	&_saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30
+	&_rotl128	("%rax","%rbx",45);
+	&_saveround	(12,$out,-128,"%rax","%rbx");	# KL<<<45
+	&_rotl128	("%r14","%r15",30);		# 15+30=45
+	&_saveround	(14,$out,-128,"%r14","%r15");	# KA<<<45
+	&_rotl128	("%rax","%rbx",15);		# 45+15=60
+	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<60
+	&_rotl128	("%rcx","%rdx",30);		# 30+30=60
+	&_saveround	(18,$out,-128,"%rcx","%rdx");	# KR<<<60
+	&_rotl128	("%r8","%r10",30);		# 30+30=60
+	&_saveround	(20,$out,-128,"%r8","%r10");	# KB<<<60
+	&_rotl128	("%rax","%rbx",17);		# 60+17=77
+	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<77
+	&_rotl128	("%r14","%r15",32);		# 45+32=77
+	&_saveround	(24,$out,-128,"%r14","%r15");	# KA<<<77
+	&_rotl128	("%rcx","%rdx",34);		# 60+34=94
+	&_saveround	(26,$out,-128,"%rcx","%rdx");	# KR<<<94
+	&_rotl128	("%r14","%r15",17);		# 77+17=94
+	&_saveround	(28,$out,-128,"%r14","%r15");	# KA<<<94
+	&_rotl128	("%rax","%rbx",34);		# 77+34=111
+	&_saveround	(30,$out,-128,"%rax","%rbx");	# KL<<<111
+	&_rotl128	("%r8","%r10",51);		# 60+51=111
+	&_saveround	(32,$out,-128,"%r8","%r10");	# KB<<<111
+$code.=<<___;
+	mov	\$4,%eax
+.Ldone:
+	mov	0(%rsp),%r15
+	mov	8(%rsp),%r14
+	mov	16(%rsp),%r13
+	mov	24(%rsp),%rbp
+	mov	32(%rsp),%rbx
+	lea	40(%rsp),%rsp
+.Lkey_epilogue:
+	ret
+.size	Camellia_Ekeygen,.-Camellia_Ekeygen
+___
+}
+
+@SBOX=(
+112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
+ 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
+134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
+166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
+139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
+223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
+ 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
+254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
+170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
+ 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
+135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
+ 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
+233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
+120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
+114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
+ 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
+
+sub S1110 { my $i=shift; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
+sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
+sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
+sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }
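+
+# Each digit in the helper names records which Camellia s-box variant
+# (s1..s4, 0 = none) occupies that byte lane of the emitted word: S1110
+# holds s1 in bytes 3..1, S4404 holds s4 in bytes 3,2,0, and so on.
+# Interleaving the S1110/S4404 and S0222/S3033 pairs below keeps both
+# words for a given index adjacent in memory.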
+
+$code.=<<___;
+.align	64
+.LCamellia_SIGMA:
+.long	0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
+.long	0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
+.long	0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
+.long	0,          0,          0,          0
+.LCamellia_SBOX:
+___
+# tables are interleaved, remember?
+sub data_word { $code.=".long\t".join(',',@_)."\n"; }
+for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
+for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
+
+# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
+#			size_t length, const CAMELLIA_KEY *key,
+#			unsigned char *ivp,const int enc);
+{
+$_key="0(%rsp)";
+$_end="8(%rsp)";	# inp+len&~15
+$_res="16(%rsp)";	# len&15
+$ivec="24(%rsp)";
+$_ivp="40(%rsp)";
+$_rsp="48(%rsp)";
+
+$code.=<<___;
+.globl	Camellia_cbc_encrypt
+.type	Camellia_cbc_encrypt,\@function,6
+.align	16
+Camellia_cbc_encrypt:
+	cmp	\$0,%rdx
+	je	.Lcbc_abort
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+.Lcbc_prologue:
+
+	mov	%rsp,%rbp
+	sub	\$64,%rsp
+	and	\$-64,%rsp
+
+	# place the stack frame just "above mod 1024" of the key schedule;
+	# this ensures that cache associativity suffices
+	lea	-64-63(%rcx),%r10
+	sub	%rsp,%r10
+	neg	%r10
+	and	\$0x3C0,%r10
+	sub	%r10,%rsp
+	#add	\$8,%rsp		# 8 is reserved for callee's ra
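+	# (the 0x3C0 mask keeps the adjustment a multiple of 64 and
+	# below 1024, in line with the comment above)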
+
+	mov	%rdi,$inp		# inp argument
+	mov	%rsi,$out		# out argument
+	mov	%r8,%rbx		# ivp argument
+	mov	%rcx,$key		# key argument
+	mov	272(%rcx),$keyend	# grandRounds
+
+	mov	%r8,$_ivp
+	mov	%rbp,$_rsp
+
+.Lcbc_body:
+	lea	.LCamellia_SBOX(%rip),$Tbl
+
+	mov	\$32,%ecx
+.align	4
+.Lcbc_prefetch_sbox:
+	mov	0($Tbl),%rax
+	mov	32($Tbl),%rsi
+	mov	64($Tbl),%rdi
+	mov	96($Tbl),%r11
+	lea	128($Tbl),$Tbl
+	loop	.Lcbc_prefetch_sbox
+	sub	\$4096,$Tbl
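+	# the loop above has touched all 32*128 = 4096 bytes of
+	# .LCamellia_SBOX, pulling the table into cache ahead of any
+	# data-dependent lookups; sub restores the table base pointer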
+	shl	\$6,$keyend
+	mov	%rdx,%rcx		# len argument
+	lea	($key,$keyend),$keyend
+
+	cmp	\$0,%r9d		# enc argument
+	je	.LCBC_DECRYPT
+
+	and	\$-16,%rdx
+	and	\$15,%rcx		# length residue
+	lea	($inp,%rdx),%rdx
+	mov	$key,$_key
+	mov	%rdx,$_end
+	mov	%rcx,$_res
+
+	cmp	$inp,%rdx
+	mov	0(%rbx),@S[0]		# load IV
+	mov	4(%rbx),@S[1]
+	mov	8(%rbx),@S[2]
+	mov	12(%rbx),@S[3]
+	je	.Lcbc_enc_tail
+	jmp	.Lcbc_eloop
+
+.align	16
+.Lcbc_eloop:
+	xor	0($inp),@S[0]
+	xor	4($inp),@S[1]
+	xor	8($inp),@S[2]
+	bswap	@S[0]
+	xor	12($inp),@S[3]
+	bswap	@S[1]
+	bswap	@S[2]
+	bswap	@S[3]
+
+	call	_x86_64_Camellia_encrypt
+
+	mov	$_key,$key		# "rewind" the key
+	bswap	@S[0]
+	mov	$_end,%rdx
+	bswap	@S[1]
+	mov	$_res,%rcx
+	bswap	@S[2]
+	mov	@S[0],0($out)
+	bswap	@S[3]
+	mov	@S[1],4($out)
+	mov	@S[2],8($out)
+	lea	16($inp),$inp
+	mov	@S[3],12($out)
+	cmp	%rdx,$inp
+	lea	16($out),$out
+	jne	.Lcbc_eloop
+
+	cmp	\$0,%rcx
+	jne	.Lcbc_enc_tail
+
+	mov	$_ivp,$out
+	mov	@S[0],0($out)		# write out IV residue
+	mov	@S[1],4($out)
+	mov	@S[2],8($out)
+	mov	@S[3],12($out)
+	jmp	.Lcbc_done
+
+.align	16
+.Lcbc_enc_tail:
+	xor	%rax,%rax
+	mov	%rax,0+$ivec
+	mov	%rax,8+$ivec
+	mov	%rax,$_res
+
+.Lcbc_enc_pushf:
+	pushfq
+	cld
+	mov	$inp,%rsi
+	lea	8+$ivec,%rdi
+	.long	0x9066A4F3		# rep movsb
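+					# (the constant encodes rep movsb
+					# followed by a two-byte nop)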
+	popfq
+.Lcbc_enc_popf:
+
+	lea	$ivec,$inp
+	lea	16+$ivec,%rax
+	mov	%rax,$_end
+	jmp	.Lcbc_eloop		# one more time
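+	# (the 8+ offsets above compensate for the 8 bytes pushfq left
+	# on the stack, so the residue bytes land in the zeroed ivec
+	# buffer and the short final block is zero-padded)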
+
+.align	16
+.LCBC_DECRYPT:
+	xchg	$key,$keyend
+	add	\$15,%rdx
+	and	\$15,%rcx		# length residue
+	and	\$-16,%rdx
+	mov	$key,$_key
+	lea	($inp,%rdx),%rdx
+	mov	%rdx,$_end
+	mov	%rcx,$_res
+
+	mov	(%rbx),%rax		# load IV
+	mov	8(%rbx),%rbx
+	jmp	.Lcbc_dloop
+.align	16
+.Lcbc_dloop:
+	mov	0($inp),@S[0]
+	mov	4($inp),@S[1]
+	mov	8($inp),@S[2]
+	bswap	@S[0]
+	mov	12($inp),@S[3]
+	bswap	@S[1]
+	mov	%rax,0+$ivec		# save IV to temporary storage
+	bswap	@S[2]
+	mov	%rbx,8+$ivec
+	bswap	@S[3]
+
+	call	_x86_64_Camellia_decrypt
+
+	mov	$_key,$key		# "rewind" the key
+	mov	$_end,%rdx
+	mov	$_res,%rcx
+
+	bswap	@S[0]
+	mov	($inp),%rax		# load IV for next iteration
+	bswap	@S[1]
+	mov	8($inp),%rbx
+	bswap	@S[2]
+	xor	0+$ivec,@S[0]
+	bswap	@S[3]
+	xor	4+$ivec,@S[1]
+	xor	8+$ivec,@S[2]
+	lea	16($inp),$inp
+	xor	12+$ivec,@S[3]
+	cmp	%rdx,$inp
+	je	.Lcbc_ddone
+
+	mov	@S[0],0($out)
+	mov	@S[1],4($out)
+	mov	@S[2],8($out)
+	mov	@S[3],12($out)
+
+	lea	16($out),$out
+	jmp	.Lcbc_dloop
+
+.align	16
+.Lcbc_ddone:
+	mov	$_ivp,%rdx
+	cmp	\$0,%rcx
+	jne	.Lcbc_dec_tail
+
+	mov	@S[0],0($out)
+	mov	@S[1],4($out)
+	mov	@S[2],8($out)
+	mov	@S[3],12($out)
+
+	mov	%rax,(%rdx)		# write out IV residue
+	mov	%rbx,8(%rdx)
+	jmp	.Lcbc_done
+.align	16
+.Lcbc_dec_tail:
+	mov	@S[0],0+$ivec
+	mov	@S[1],4+$ivec
+	mov	@S[2],8+$ivec
+	mov	@S[3],12+$ivec
+
+.Lcbc_dec_pushf:
+	pushfq
+	cld
+	lea	8+$ivec,%rsi
+	lea	($out),%rdi
+	.long	0x9066A4F3		# rep movsb
+	popfq
+.Lcbc_dec_popf:
+
+	mov	%rax,(%rdx)		# write out IV residue
+	mov	%rbx,8(%rdx)
+	jmp	.Lcbc_done
+
+.align	16
+.Lcbc_done:
+	mov	$_rsp,%rcx
+	mov	0(%rcx),%r15
+	mov	8(%rcx),%r14
+	mov	16(%rcx),%r13
+	mov	24(%rcx),%r12
+	mov	32(%rcx),%rbp
+	mov	40(%rcx),%rbx
+	lea	48(%rcx),%rsp
+.Lcbc_abort:
+	ret
+.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
+
+.asciz	"Camellia for x86_64 by <appro@openssl.org>"
+___
+}
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	common_se_handler,\@abi-omnipotent
+.align	16
+common_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	lea	-64(%rsp),%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lin_prologue
+
+	lea	40(%rax),%rax
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r13
+	mov	-32(%rax),%r14
+	mov	-40(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	jmp	.Lcommon_seh_exit
+.size	common_se_handler,.-common_se_handler
+
+.type	cbc_se_handler,\@abi-omnipotent
+.align	16
+cbc_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	lea	-64(%rsp),%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lcbc_prologue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
+	jb	.Lin_cbc_prologue
+
+	lea	.Lcbc_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lcbc_body
+	jb	.Lin_cbc_frame_setup
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lcbc_abort(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lcbc_abort
+	jae	.Lin_cbc_prologue
+
+	# handle pushf/popf in Camellia_cbc_encrypt
+	lea	.Lcbc_enc_pushf(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<=.Lcbc_enc_pushf
+	jbe	.Lin_cbc_no_flag
+	lea	8(%rax),%rax
+	lea	.Lcbc_enc_popf(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lcbc_enc_popf
+	jb	.Lin_cbc_no_flag
+	lea	-8(%rax),%rax
+	lea	.Lcbc_dec_pushf(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<=.Lcbc_dec_pushf
+	jbe	.Lin_cbc_no_flag
+	lea	8(%rax),%rax
+	lea	.Lcbc_dec_popf(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lcbc_dec_popf
+	jb	.Lin_cbc_no_flag
+	lea	-8(%rax),%rax
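+	# (between a pushf and its matching popf the saved flags add 8
+	# bytes to the stack, hence the +/-8 biasing of the recovered
+	# stack pointer above)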
+
+.Lin_cbc_no_flag:
+	mov	48(%rax),%rax		# $_rsp
+	lea	48(%rax),%rax
+
+.Lin_cbc_frame_setup:
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_cbc_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+.align	4
+.Lcommon_seh_exit:
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	lea	64(%rsp),%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	cbc_se_handler,.-cbc_se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_Camellia_EncryptBlock_Rounds
+	.rva	.LSEH_end_Camellia_EncryptBlock_Rounds
+	.rva	.LSEH_info_Camellia_EncryptBlock_Rounds
+
+	.rva	.LSEH_begin_Camellia_DecryptBlock_Rounds
+	.rva	.LSEH_end_Camellia_DecryptBlock_Rounds
+	.rva	.LSEH_info_Camellia_DecryptBlock_Rounds
+
+	.rva	.LSEH_begin_Camellia_Ekeygen
+	.rva	.LSEH_end_Camellia_Ekeygen
+	.rva	.LSEH_info_Camellia_Ekeygen
+
+	.rva	.LSEH_begin_Camellia_cbc_encrypt
+	.rva	.LSEH_end_Camellia_cbc_encrypt
+	.rva	.LSEH_info_Camellia_cbc_encrypt
+
+.section	.xdata
+.align	8
+.LSEH_info_Camellia_EncryptBlock_Rounds:
+	.byte	9,0,0,0
+	.rva	common_se_handler
+	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
+.LSEH_info_Camellia_DecryptBlock_Rounds:
+	.byte	9,0,0,0
+	.rva	common_se_handler
+	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
+.LSEH_info_Camellia_Ekeygen:
+	.byte	9,0,0,0
+	.rva	common_se_handler
+	.rva	.Lkey_prologue,.Lkey_epilogue	# HandlerData[]
+.LSEH_info_Camellia_cbc_encrypt:
+	.byte	9,0,0,0
+	.rva	cbc_se_handler
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/cms/cms_smime.c b/src/lib/libcrypto/cms/cms_smime.c
index b9463f9abb..f35883aa22 100644
--- a/src/lib/libcrypto/cms/cms_smime.c
+++ b/src/lib/libcrypto/cms/cms_smime.c
@@ -298,7 +298,7 @@ static int cms_signerinfo_verify_cert(CMS_SignerInfo *si,
 						CMS_R_STORE_INIT_ERROR);
 		goto err;
 		}
-	X509_STORE_CTX_set_purpose(&ctx, X509_PURPOSE_SMIME_SIGN);
+	X509_STORE_CTX_set_default(&ctx, "smime_sign");
 	if (crls)
 		X509_STORE_CTX_set0_crls(&ctx, crls);
 
@@ -425,7 +425,7 @@ int CMS_verify(CMS_ContentInfo *cms, STACK_OF(X509) *certs,
 		for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
 			{
 			si = sk_CMS_SignerInfo_value(sinfos, i);
-			if (!CMS_SignerInfo_verify_content(si, cmsbio))
+			if (CMS_SignerInfo_verify_content(si, cmsbio) <= 0)
 				{
 				CMSerr(CMS_F_CMS_VERIFY,
 					CMS_R_CONTENT_VERIFY_ERROR);
diff --git a/src/lib/libcrypto/des/asm/des_enc.m4 b/src/lib/libcrypto/des/asm/des_enc.m4
index f5b1928f99..f59333a030 100644
--- a/src/lib/libcrypto/des/asm/des_enc.m4
+++ b/src/lib/libcrypto/des/asm/des_enc.m4
@@ -44,6 +44,7 @@
 !
 
 .ident "des_enc.m4 2.1"
+.file  "des_enc-sparc.S"
 
 #if defined(__SUNPRO_C) && defined(__sparcv9)
 # define ABI64  /* They've said -xarch=v9 at command line */
@@ -315,16 +316,16 @@ $4:
 	ld	[global1+local1], local1
 	xor	$2, out1, out1            ! 8642
 	xor	$2, out0, out0            ! 7531
-	fmovs	%f0, %f0                  ! fxor used for alignment
+	! fmovs	%f0, %f0                  ! fxor used for alignment
 
 	srl	out1, 4, local0           ! rotate 4 right
 	and	out0, local5, local3      ! 3
-	fmovs	%f0, %f0
+	! fmovs	%f0, %f0
 
 	ld	[$5+$3*8], local7         ! key 7531 next round
 	srl	local3, 8, local3         ! 3
 	and	local0, 252, local2       ! 2
-	fmovs	%f0, %f0
+	! fmovs	%f0, %f0
 
 	ld	[global3+local3],local3   ! 3
 	sll	out1, 28, out1            ! rotate
@@ -1179,8 +1180,11 @@ DES_encrypt1:
 
 	save	%sp, FRAME, %sp
 
-	call	.PIC.me.up
-	mov	.PIC.me.up-(.-4),out0
+	sethi	%hi(.PIC.DES_SPtrans-1f),global1
+	or	global1,%lo(.PIC.DES_SPtrans-1f),global1
+1:	call	.+8
+	add	%o7,global1,global1
+	sub	global1,.PIC.DES_SPtrans-.des_and,out2
 
 	ld	[in0], in5                ! left
 	cmp	in2, 0                    ! enc
@@ -1237,8 +1241,11 @@ DES_encrypt2:
 
 	save	%sp, FRAME, %sp
 
-	call	.PIC.me.up
-	mov	.PIC.me.up-(.-4),out0
+	sethi	%hi(.PIC.DES_SPtrans-1f),global1
+	or	global1,%lo(.PIC.DES_SPtrans-1f),global1
+1:	call	.+8
+	add	%o7,global1,global1
+	sub	global1,.PIC.DES_SPtrans-.des_and,out2
 
 	! Set sbox address 1 to 6 and rotate halfs 3 left
 	! Errors caught by destest? Yes. Still? *NO*
@@ -1352,8 +1359,11 @@ DES_encrypt3:
 
 	save	%sp, FRAME, %sp
 	
-	call	.PIC.me.up
-	mov	.PIC.me.up-(.-4),out0
+	sethi	%hi(.PIC.DES_SPtrans-1f),global1
+	or	global1,%lo(.PIC.DES_SPtrans-1f),global1
+1:	call	.+8
+	add	%o7,global1,global1
+	sub	global1,.PIC.DES_SPtrans-.des_and,out2
 
 	ld	[in0], in5                ! left
 	add	in2, 120, in4             ! ks2
@@ -1394,8 +1404,11 @@ DES_decrypt3:
 
 	save	%sp, FRAME, %sp
 	
-	call	.PIC.me.up
-	mov	.PIC.me.up-(.-4),out0
+	sethi	%hi(.PIC.DES_SPtrans-1f),global1
+	or	global1,%lo(.PIC.DES_SPtrans-1f),global1
+1:	call	.+8
+	add	%o7,global1,global1
+	sub	global1,.PIC.DES_SPtrans-.des_and,out2
 
 	ld	[in0], in5                ! left
 	add	in3, 120, in4             ! ks3
@@ -1424,105 +1437,6 @@ DES_decrypt3:
 .DES_decrypt3.end:
 	.size	 DES_decrypt3,.DES_decrypt3.end-DES_decrypt3
 
-	.align	256
-	.type	 .des_and,#object
-	.size	 .des_and,284
-
-.des_and:
-
-! This table is used for AND 0xFC when it is known that register
-! bits 8-31 are zero. Makes it possible to do three arithmetic
-! operations in one cycle.
-
-	.byte  0, 0, 0, 0, 4, 4, 4, 4
-	.byte  8, 8, 8, 8, 12, 12, 12, 12
-	.byte  16, 16, 16, 16, 20, 20, 20, 20
-	.byte  24, 24, 24, 24, 28, 28, 28, 28
-	.byte  32, 32, 32, 32, 36, 36, 36, 36
-	.byte  40, 40, 40, 40, 44, 44, 44, 44
-	.byte  48, 48, 48, 48, 52, 52, 52, 52
-	.byte  56, 56, 56, 56, 60, 60, 60, 60
-	.byte  64, 64, 64, 64, 68, 68, 68, 68
-	.byte  72, 72, 72, 72, 76, 76, 76, 76
-	.byte  80, 80, 80, 80, 84, 84, 84, 84
-	.byte  88, 88, 88, 88, 92, 92, 92, 92
-	.byte  96, 96, 96, 96, 100, 100, 100, 100
-	.byte  104, 104, 104, 104, 108, 108, 108, 108
-	.byte  112, 112, 112, 112, 116, 116, 116, 116
-	.byte  120, 120, 120, 120, 124, 124, 124, 124
-	.byte  128, 128, 128, 128, 132, 132, 132, 132
-	.byte  136, 136, 136, 136, 140, 140, 140, 140
-	.byte  144, 144, 144, 144, 148, 148, 148, 148
-	.byte  152, 152, 152, 152, 156, 156, 156, 156
-	.byte  160, 160, 160, 160, 164, 164, 164, 164
-	.byte  168, 168, 168, 168, 172, 172, 172, 172
-	.byte  176, 176, 176, 176, 180, 180, 180, 180
-	.byte  184, 184, 184, 184, 188, 188, 188, 188
-	.byte  192, 192, 192, 192, 196, 196, 196, 196
-	.byte  200, 200, 200, 200, 204, 204, 204, 204
-	.byte  208, 208, 208, 208, 212, 212, 212, 212
-	.byte  216, 216, 216, 216, 220, 220, 220, 220
-	.byte  224, 224, 224, 224, 228, 228, 228, 228
-	.byte  232, 232, 232, 232, 236, 236, 236, 236
-	.byte  240, 240, 240, 240, 244, 244, 244, 244
-	.byte  248, 248, 248, 248, 252, 252, 252, 252
-
-	! 5 numbers for initil/final permutation
-
-	.word   0x0f0f0f0f                ! offset 256
-	.word	0x0000ffff                ! 260
-	.word	0x33333333                ! 264
-	.word	0x00ff00ff                ! 268
-	.word	0x55555555                ! 272
-
-	.word	0                         ! 276
-	.word	LOOPS                     ! 280
-	.word	0x0000FC00                ! 284
-.PIC.DES_SPtrans:
-	.word	%r_disp32(DES_SPtrans)
-
-! input:	out0	offset between .PIC.me.up and caller
-! output:	out0	pointer to .PIC.me.up
-!		out2	pointer to .des_and
-!		global1	pointer to DES_SPtrans
-	.align	32
-.PIC.me.up:
-	add	out0,%o7,out0			! pointer to .PIC.me.up
-#if 1
-	ld	[out0+(.PIC.DES_SPtrans-.PIC.me.up)],global1
-	add	global1,(.PIC.DES_SPtrans-.PIC.me.up),global1
-	add	global1,out0,global1
-#else
-# ifdef OPENSSL_PIC
-	! In case anybody wonders why this code is same for both ABI.
-	! To start with it is not. Do note LDPTR below. But of course
-	! you must be wondering why the rest of it does not contain
-	! things like %hh, %hm and %lm. Well, those are needed only
-	! if OpenSSL library *itself* will become larger than 4GB,
-	! which is not going to happen any time soon. 
-	sethi	%hi(DES_SPtrans),global1
-	or	global1,%lo(DES_SPtrans),global1
-	sethi	%hi(_GLOBAL_OFFSET_TABLE_-(.PIC.me.up-.)),out2
-	add	global1,out0,global1
-	add	out2,%lo(_GLOBAL_OFFSET_TABLE_-(.PIC.me.up-.)),out2
-	LDPTR	[out2+global1],global1
-# elif 0
-	setn	DES_SPtrans,out2,global1	! synthetic instruction !
-# elif defined(ABI64)
-	sethi	%hh(DES_SPtrans),out2
-	or	out2,%hm(DES_SPtrans),out2
-	sethi	%lm(DES_SPtrans),global1
-	or	global1,%lo(DES_SPtrans),global1
-	sllx	out2,32,out2
-	or	out2,global1,global1
-# else
-	sethi	%hi(DES_SPtrans),global1
-	or	global1,%lo(DES_SPtrans),global1
-# endif
-#endif
-	retl
-	add	out0,.des_and-.PIC.me.up,out2
-
 ! void DES_ncbc_encrypt(input, output, length, schedule, ivec, enc)
 ! *****************************************************************
 
@@ -1539,8 +1453,11 @@ DES_ncbc_encrypt:
 	define({OUTPUT}, { [%sp+BIAS+ARG0+1*ARGSZ] })
 	define({IVEC},   { [%sp+BIAS+ARG0+4*ARGSZ] })
 
-	call	.PIC.me.up
-	mov	.PIC.me.up-(.-4),out0
+	sethi	%hi(.PIC.DES_SPtrans-1f),global1
+	or	global1,%lo(.PIC.DES_SPtrans-1f),global1
+1:	call	.+8
+	add	%o7,global1,global1
+	sub	global1,.PIC.DES_SPtrans-.des_and,out2
 
 	cmp	in5, 0                    ! enc   
 
@@ -1761,8 +1678,11 @@ DES_ede3_cbc_encrypt:
 	define({KS2}, { [%sp+BIAS+ARG0+4*ARGSZ] })
 	define({KS3}, { [%sp+BIAS+ARG0+5*ARGSZ] })
 
-	call	.PIC.me.up
-	mov	.PIC.me.up-(.-4),out0
+	sethi	%hi(.PIC.DES_SPtrans-1f),global1
+	or	global1,%lo(.PIC.DES_SPtrans-1f),global1
+1:	call	.+8
+	add	%o7,global1,global1
+	sub	global1,.PIC.DES_SPtrans-.des_and,out2
 
 	LDPTR	[%fp+BIAS+ARG0+7*ARGSZ], local3          ! enc
 	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec
@@ -1978,3 +1898,200 @@ DES_ede3_cbc_encrypt:
 
 .DES_ede3_cbc_encrypt.end:
 	.size	 DES_ede3_cbc_encrypt,.DES_ede3_cbc_encrypt.end-DES_ede3_cbc_encrypt
+
+	.align	256
+	.type	 .des_and,#object
+	.size	 .des_and,284
+
+.des_and:
+
+! This table is used for AND 0xFC when it is known that register
+! bits 8-31 are zero. Makes it possible to do three arithmetic
+! operations in one cycle.
+
+	.byte  0, 0, 0, 0, 4, 4, 4, 4
+	.byte  8, 8, 8, 8, 12, 12, 12, 12
+	.byte  16, 16, 16, 16, 20, 20, 20, 20
+	.byte  24, 24, 24, 24, 28, 28, 28, 28
+	.byte  32, 32, 32, 32, 36, 36, 36, 36
+	.byte  40, 40, 40, 40, 44, 44, 44, 44
+	.byte  48, 48, 48, 48, 52, 52, 52, 52
+	.byte  56, 56, 56, 56, 60, 60, 60, 60
+	.byte  64, 64, 64, 64, 68, 68, 68, 68
+	.byte  72, 72, 72, 72, 76, 76, 76, 76
+	.byte  80, 80, 80, 80, 84, 84, 84, 84
+	.byte  88, 88, 88, 88, 92, 92, 92, 92
+	.byte  96, 96, 96, 96, 100, 100, 100, 100
+	.byte  104, 104, 104, 104, 108, 108, 108, 108
+	.byte  112, 112, 112, 112, 116, 116, 116, 116
+	.byte  120, 120, 120, 120, 124, 124, 124, 124
+	.byte  128, 128, 128, 128, 132, 132, 132, 132
+	.byte  136, 136, 136, 136, 140, 140, 140, 140
+	.byte  144, 144, 144, 144, 148, 148, 148, 148
+	.byte  152, 152, 152, 152, 156, 156, 156, 156
+	.byte  160, 160, 160, 160, 164, 164, 164, 164
+	.byte  168, 168, 168, 168, 172, 172, 172, 172
+	.byte  176, 176, 176, 176, 180, 180, 180, 180
+	.byte  184, 184, 184, 184, 188, 188, 188, 188
+	.byte  192, 192, 192, 192, 196, 196, 196, 196
+	.byte  200, 200, 200, 200, 204, 204, 204, 204
+	.byte  208, 208, 208, 208, 212, 212, 212, 212
+	.byte  216, 216, 216, 216, 220, 220, 220, 220
+	.byte  224, 224, 224, 224, 228, 228, 228, 228
+	.byte  232, 232, 232, 232, 236, 236, 236, 236
+	.byte  240, 240, 240, 240, 244, 244, 244, 244
+	.byte  248, 248, 248, 248, 252, 252, 252, 252
+
+	! 5 numbers for initial/final permutation
+
+	.word   0x0f0f0f0f                ! offset 256
+	.word	0x0000ffff                ! 260
+	.word	0x33333333                ! 264
+	.word	0x00ff00ff                ! 268
+	.word	0x55555555                ! 272
+
+	.word	0                         ! 276
+	.word	LOOPS                     ! 280
+	.word	0x0000FC00                ! 284
+
+	.type	.PIC.DES_SPtrans,#object
+	.size	.PIC.DES_SPtrans,2048
+.align	64
+.PIC.DES_SPtrans:
+	! nibble 0
+	.word	0x02080800, 0x00080000, 0x02000002, 0x02080802
+	.word	0x02000000, 0x00080802, 0x00080002, 0x02000002
+	.word	0x00080802, 0x02080800, 0x02080000, 0x00000802
+	.word	0x02000802, 0x02000000, 0x00000000, 0x00080002
+	.word	0x00080000, 0x00000002, 0x02000800, 0x00080800
+	.word	0x02080802, 0x02080000, 0x00000802, 0x02000800
+	.word	0x00000002, 0x00000800, 0x00080800, 0x02080002
+	.word	0x00000800, 0x02000802, 0x02080002, 0x00000000
+	.word	0x00000000, 0x02080802, 0x02000800, 0x00080002
+	.word	0x02080800, 0x00080000, 0x00000802, 0x02000800
+	.word	0x02080002, 0x00000800, 0x00080800, 0x02000002
+	.word	0x00080802, 0x00000002, 0x02000002, 0x02080000
+	.word	0x02080802, 0x00080800, 0x02080000, 0x02000802
+	.word	0x02000000, 0x00000802, 0x00080002, 0x00000000
+	.word	0x00080000, 0x02000000, 0x02000802, 0x02080800
+	.word	0x00000002, 0x02080002, 0x00000800, 0x00080802
+	! nibble 1
+	.word	0x40108010, 0x00000000, 0x00108000, 0x40100000
+	.word	0x40000010, 0x00008010, 0x40008000, 0x00108000
+	.word	0x00008000, 0x40100010, 0x00000010, 0x40008000
+	.word	0x00100010, 0x40108000, 0x40100000, 0x00000010
+	.word	0x00100000, 0x40008010, 0x40100010, 0x00008000
+	.word	0x00108010, 0x40000000, 0x00000000, 0x00100010
+	.word	0x40008010, 0x00108010, 0x40108000, 0x40000010
+	.word	0x40000000, 0x00100000, 0x00008010, 0x40108010
+	.word	0x00100010, 0x40108000, 0x40008000, 0x00108010
+	.word	0x40108010, 0x00100010, 0x40000010, 0x00000000
+	.word	0x40000000, 0x00008010, 0x00100000, 0x40100010
+	.word	0x00008000, 0x40000000, 0x00108010, 0x40008010
+	.word	0x40108000, 0x00008000, 0x00000000, 0x40000010
+	.word	0x00000010, 0x40108010, 0x00108000, 0x40100000
+	.word	0x40100010, 0x00100000, 0x00008010, 0x40008000
+	.word	0x40008010, 0x00000010, 0x40100000, 0x00108000
+	! nibble 2
+	.word	0x04000001, 0x04040100, 0x00000100, 0x04000101
+	.word	0x00040001, 0x04000000, 0x04000101, 0x00040100
+	.word	0x04000100, 0x00040000, 0x04040000, 0x00000001
+	.word	0x04040101, 0x00000101, 0x00000001, 0x04040001
+	.word	0x00000000, 0x00040001, 0x04040100, 0x00000100
+	.word	0x00000101, 0x04040101, 0x00040000, 0x04000001
+	.word	0x04040001, 0x04000100, 0x00040101, 0x04040000
+	.word	0x00040100, 0x00000000, 0x04000000, 0x00040101
+	.word	0x04040100, 0x00000100, 0x00000001, 0x00040000
+	.word	0x00000101, 0x00040001, 0x04040000, 0x04000101
+	.word	0x00000000, 0x04040100, 0x00040100, 0x04040001
+	.word	0x00040001, 0x04000000, 0x04040101, 0x00000001
+	.word	0x00040101, 0x04000001, 0x04000000, 0x04040101
+	.word	0x00040000, 0x04000100, 0x04000101, 0x00040100
+	.word	0x04000100, 0x00000000, 0x04040001, 0x00000101
+	.word	0x04000001, 0x00040101, 0x00000100, 0x04040000
+	! nibble 3
+	.word	0x00401008, 0x10001000, 0x00000008, 0x10401008
+	.word	0x00000000, 0x10400000, 0x10001008, 0x00400008
+	.word	0x10401000, 0x10000008, 0x10000000, 0x00001008
+	.word	0x10000008, 0x00401008, 0x00400000, 0x10000000
+	.word	0x10400008, 0x00401000, 0x00001000, 0x00000008
+	.word	0x00401000, 0x10001008, 0x10400000, 0x00001000
+	.word	0x00001008, 0x00000000, 0x00400008, 0x10401000
+	.word	0x10001000, 0x10400008, 0x10401008, 0x00400000
+	.word	0x10400008, 0x00001008, 0x00400000, 0x10000008
+	.word	0x00401000, 0x10001000, 0x00000008, 0x10400000
+	.word	0x10001008, 0x00000000, 0x00001000, 0x00400008
+	.word	0x00000000, 0x10400008, 0x10401000, 0x00001000
+	.word	0x10000000, 0x10401008, 0x00401008, 0x00400000
+	.word	0x10401008, 0x00000008, 0x10001000, 0x00401008
+	.word	0x00400008, 0x00401000, 0x10400000, 0x10001008
+	.word	0x00001008, 0x10000000, 0x10000008, 0x10401000
+	! nibble 4
+	.word	0x08000000, 0x00010000, 0x00000400, 0x08010420
+	.word	0x08010020, 0x08000400, 0x00010420, 0x08010000
+	.word	0x00010000, 0x00000020, 0x08000020, 0x00010400
+	.word	0x08000420, 0x08010020, 0x08010400, 0x00000000
+	.word	0x00010400, 0x08000000, 0x00010020, 0x00000420
+	.word	0x08000400, 0x00010420, 0x00000000, 0x08000020
+	.word	0x00000020, 0x08000420, 0x08010420, 0x00010020
+	.word	0x08010000, 0x00000400, 0x00000420, 0x08010400
+	.word	0x08010400, 0x08000420, 0x00010020, 0x08010000
+	.word	0x00010000, 0x00000020, 0x08000020, 0x08000400
+	.word	0x08000000, 0x00010400, 0x08010420, 0x00000000
+	.word	0x00010420, 0x08000000, 0x00000400, 0x00010020
+	.word	0x08000420, 0x00000400, 0x00000000, 0x08010420
+	.word	0x08010020, 0x08010400, 0x00000420, 0x00010000
+	.word	0x00010400, 0x08010020, 0x08000400, 0x00000420
+	.word	0x00000020, 0x00010420, 0x08010000, 0x08000020
+	! nibble 5
+	.word	0x80000040, 0x00200040, 0x00000000, 0x80202000
+	.word	0x00200040, 0x00002000, 0x80002040, 0x00200000
+	.word	0x00002040, 0x80202040, 0x00202000, 0x80000000
+	.word	0x80002000, 0x80000040, 0x80200000, 0x00202040
+	.word	0x00200000, 0x80002040, 0x80200040, 0x00000000
+	.word	0x00002000, 0x00000040, 0x80202000, 0x80200040
+	.word	0x80202040, 0x80200000, 0x80000000, 0x00002040
+	.word	0x00000040, 0x00202000, 0x00202040, 0x80002000
+	.word	0x00002040, 0x80000000, 0x80002000, 0x00202040
+	.word	0x80202000, 0x00200040, 0x00000000, 0x80002000
+	.word	0x80000000, 0x00002000, 0x80200040, 0x00200000
+	.word	0x00200040, 0x80202040, 0x00202000, 0x00000040
+	.word	0x80202040, 0x00202000, 0x00200000, 0x80002040
+	.word	0x80000040, 0x80200000, 0x00202040, 0x00000000
+	.word	0x00002000, 0x80000040, 0x80002040, 0x80202000
+	.word	0x80200000, 0x00002040, 0x00000040, 0x80200040
+	! nibble 6
+	.word	0x00004000, 0x00000200, 0x01000200, 0x01000004
+	.word	0x01004204, 0x00004004, 0x00004200, 0x00000000
+	.word	0x01000000, 0x01000204, 0x00000204, 0x01004000
+	.word	0x00000004, 0x01004200, 0x01004000, 0x00000204
+	.word	0x01000204, 0x00004000, 0x00004004, 0x01004204
+	.word	0x00000000, 0x01000200, 0x01000004, 0x00004200
+	.word	0x01004004, 0x00004204, 0x01004200, 0x00000004
+	.word	0x00004204, 0x01004004, 0x00000200, 0x01000000
+	.word	0x00004204, 0x01004000, 0x01004004, 0x00000204
+	.word	0x00004000, 0x00000200, 0x01000000, 0x01004004
+	.word	0x01000204, 0x00004204, 0x00004200, 0x00000000
+	.word	0x00000200, 0x01000004, 0x00000004, 0x01000200
+	.word	0x00000000, 0x01000204, 0x01000200, 0x00004200
+	.word	0x00000204, 0x00004000, 0x01004204, 0x01000000
+	.word	0x01004200, 0x00000004, 0x00004004, 0x01004204
+	.word	0x01000004, 0x01004200, 0x01004000, 0x00004004
+	! nibble 7
+	.word	0x20800080, 0x20820000, 0x00020080, 0x00000000
+	.word	0x20020000, 0x00800080, 0x20800000, 0x20820080
+	.word	0x00000080, 0x20000000, 0x00820000, 0x00020080
+	.word	0x00820080, 0x20020080, 0x20000080, 0x20800000
+	.word	0x00020000, 0x00820080, 0x00800080, 0x20020000
+	.word	0x20820080, 0x20000080, 0x00000000, 0x00820000
+	.word	0x20000000, 0x00800000, 0x20020080, 0x20800080
+	.word	0x00800000, 0x00020000, 0x20820000, 0x00000080
+	.word	0x00800000, 0x00020000, 0x20000080, 0x20820080
+	.word	0x00020080, 0x20000000, 0x00000000, 0x00820000
+	.word	0x20800080, 0x20020080, 0x20020000, 0x00800080
+	.word	0x20820000, 0x00000080, 0x00800080, 0x20020000
+	.word	0x20820080, 0x00800000, 0x20800000, 0x20000080
+	.word	0x00820000, 0x00020080, 0x20020080, 0x20800000
+	.word	0x00000080, 0x20820000, 0x00820080, 0x00000000
+	.word	0x20000000, 0x20800080, 0x00020000, 0x00820080
+
diff --git a/src/lib/libcrypto/opensslv.h b/src/lib/libcrypto/opensslv.h
index 09687b5136..c6207f76b2 100644
--- a/src/lib/libcrypto/opensslv.h
+++ b/src/lib/libcrypto/opensslv.h
@@ -25,11 +25,11 @@
  * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
  *  major minor fix final patch/beta)
  */
-#define OPENSSL_VERSION_NUMBER	0x009080afL
+#define OPENSSL_VERSION_NUMBER	0x009080bfL
 #ifdef OPENSSL_FIPS
-#define OPENSSL_VERSION_TEXT	"OpenSSL 0.9.8j-fips 07 Jan 2009"
+#define OPENSSL_VERSION_TEXT	"OpenSSL 0.9.8k-fips 25 Mar 2009"
 #else
-#define OPENSSL_VERSION_TEXT	"OpenSSL 0.9.8j 07 Jan 2009"
+#define OPENSSL_VERSION_TEXT	"OpenSSL 0.9.8k 25 Mar 2009"
 #endif
 #define OPENSSL_VERSION_PTEXT	" part of " OPENSSL_VERSION_TEXT
 
diff --git a/src/lib/libcrypto/pem/pem.h b/src/lib/libcrypto/pem/pem.h
index 6f8e01544b..6c193f1cbf 100644
--- a/src/lib/libcrypto/pem/pem.h
+++ b/src/lib/libcrypto/pem/pem.h
@@ -215,7 +215,9 @@ typedef struct pem_ctx_st
 
 #define IMPLEMENT_PEM_read_fp(name, type, str, asn1) /**/
 #define IMPLEMENT_PEM_write_fp(name, type, str, asn1) /**/
+#define IMPLEMENT_PEM_write_fp_const(name, type, str, asn1) /**/
 #define IMPLEMENT_PEM_write_cb_fp(name, type, str, asn1) /**/
+#define IMPLEMENT_PEM_write_cb_fp_const(name, type, str, asn1) /**/
 
 #else
 
@@ -355,6 +357,7 @@ int PEM_write_bio_##name(BIO *bp, type *x, const EVP_CIPHER *enc, \
 
 #define DECLARE_PEM_read_fp(name, type) /**/
 #define DECLARE_PEM_write_fp(name, type) /**/
+#define DECLARE_PEM_write_fp_const(name, type) /**/
 #define DECLARE_PEM_write_cb_fp(name, type) /**/
 
 #else
@@ -392,6 +395,7 @@ int PEM_write_bio_##name(BIO *bp, type *x, const EVP_CIPHER *enc, \
 
 #define DECLARE_PEM_read_bio(name, type) /**/
 #define DECLARE_PEM_write_bio(name, type) /**/
+#define DECLARE_PEM_write_bio_const(name, type) /**/
 #define DECLARE_PEM_write_cb_bio(name, type) /**/
 
 #endif
diff --git a/src/lib/libcrypto/pkcs12/p12_crt.c b/src/lib/libcrypto/pkcs12/p12_crt.c
index e863de52ce..9522342fa5 100644
--- a/src/lib/libcrypto/pkcs12/p12_crt.c
+++ b/src/lib/libcrypto/pkcs12/p12_crt.c
@@ -170,6 +170,9 @@ PKCS12 *PKCS12_create(char *pass, char *name, EVP_PKEY *pkey, X509 *cert,
 
 	p12 = PKCS12_add_safes(safes, 0);
 
+	if (!p12)
+		goto err;
+
 	sk_PKCS7_pop_free(safes, PKCS7_free);
 
 	safes = NULL;
diff --git a/src/lib/libcrypto/pkcs7/pk7_smime.c b/src/lib/libcrypto/pkcs7/pk7_smime.c
index c34db1d6fe..fd18ec3d95 100644
--- a/src/lib/libcrypto/pkcs7/pk7_smime.c
+++ b/src/lib/libcrypto/pkcs7/pk7_smime.c
@@ -229,8 +229,7 @@ int PKCS7_verify(PKCS7 *p7, STACK_OF(X509) *certs, X509_STORE *store,
 				sk_X509_free(signers);
 				return 0;
 				}
-			X509_STORE_CTX_set_purpose(&cert_ctx,
-						X509_PURPOSE_SMIME_SIGN);
+			X509_STORE_CTX_set_default(&cert_ctx, "smime_sign");
 		} else if(!X509_STORE_CTX_init (&cert_ctx, store, signer, NULL)) {
 			PKCS7err(PKCS7_F_PKCS7_VERIFY,ERR_R_X509_LIB);
 			sk_X509_free(signers);
diff --git a/src/lib/libcrypto/ppccpuid.pl b/src/lib/libcrypto/ppccpuid.pl
new file mode 100755
index 0000000000..fe44ff07bc
--- /dev/null
+++ b/src/lib/libcrypto/ppccpuid.pl
@@ -0,0 +1,94 @@
+#!/usr/bin/env perl
+
+$flavour = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+if ($flavour=~/64/) {
+    $CMPLI="cmpldi";
+    $SHRLI="srdi";
+    $SIGNX="extsw";
+} else {
+    $CMPLI="cmplwi";
+    $SHRLI="srwi";
+    $SIGNX="mr";
+}
+
+$code=<<___;
+.machine	"any"
+.text
+
+.globl	.OPENSSL_cpuid_setup
+.align	4
+.OPENSSL_cpuid_setup:
+	blr
+
+.globl	.OPENSSL_wipe_cpu
+.align	4
+.OPENSSL_wipe_cpu:
+	xor	r0,r0,r0
+	mr	r3,r1
+	xor	r4,r4,r4
+	xor	r5,r5,r5
+	xor	r6,r6,r6
+	xor	r7,r7,r7
+	xor	r8,r8,r8
+	xor	r9,r9,r9
+	xor	r10,r10,r10
+	xor	r11,r11,r11
+	xor	r12,r12,r12
+	blr
+
+.globl	.OPENSSL_atomic_add
+.align	4
+.OPENSSL_atomic_add:
+Loop:	lwarx	r5,0,r3
+	add	r0,r4,r5
+	stwcx.	r0,0,r3
+	bne-	Loop
+	$SIGNX	r3,r0
+	blr
+
+.globl	.OPENSSL_rdtsc
+.align	4
+.OPENSSL_rdtsc:
+	mftb	r3
+	mftbu	r4
+	blr
+
+.globl	.OPENSSL_cleanse
+.align	4
+.OPENSSL_cleanse:
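+	# buffers shorter than 7 bytes are cleared bytewise in Little;
+	# longer ones are byte-filled to word alignment in Lot, then
+	# word-filled in Laligned (each bdnz branches back two insns),
+	# and a 1..3 byte tail is finished back in Little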
+	$CMPLI	r4,7
+	li	r0,0
+	bge	Lot
+Little:	mtctr	r4
+	stb	r0,0(r3)
+	addi	r3,r3,1
+	bdnz-	\$-8
+	blr
+Lot:	andi.	r5,r3,3
+	beq	Laligned
+	stb	r0,0(r3)
+	subi	r4,r4,1
+	addi	r3,r3,1
+	b	Lot
+Laligned:
+	$SHRLI	r5,r4,2
+	mtctr	r5
+	stw	r0,0(r3)
+	addi	r3,r3,4
+	bdnz-	\$-8
+	andi.	r4,r4,3
+	bne	Little
+	blr
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/ripemd/README b/src/lib/libcrypto/ripemd/README
index 7097707264..f1ffc8b134 100644
--- a/src/lib/libcrypto/ripemd/README
+++ b/src/lib/libcrypto/ripemd/README
@@ -4,7 +4,7 @@ http://www.esat.kuleuven.ac.be/~bosselae/ripemd160.html
 This is my implementation of RIPEMD-160.  The pentium assember is a little
 off the pace since I only get 1050 cycles, while the best is 1013.
 I have a few ideas for how to get another 20 or so cycles, but at
-this point I will not bother right now.  I belive the trick will be
+this point I will not bother right now.  I believe the trick will be
 to remove my 'copy X array onto stack' until inside the RIP1() finctions the
 first time round.  To do this I need another register and will only have one
 temporary one.  A bit tricky....  I can also cleanup the saving of the 5 words
diff --git a/src/lib/libcrypto/s390xcpuid.S b/src/lib/libcrypto/s390xcpuid.S
new file mode 100644
index 0000000000..8500133ad0
--- /dev/null
+++ b/src/lib/libcrypto/s390xcpuid.S
@@ -0,0 +1,90 @@
+.text
+
+.globl	OPENSSL_cpuid_setup
+.type	OPENSSL_cpuid_setup,@function
+.align	16
+OPENSSL_cpuid_setup:
+	br	%r14		# reserved for future
+.size	OPENSSL_cpuid_setup,.-OPENSSL_cpuid_setup
+
+.globl	OPENSSL_s390x_facilities
+.type	OPENSSL_s390x_facilities,@function
+.align	16
+OPENSSL_s390x_facilities:
+	lghi	%r0,0
+	.long	0xb2b0f010	# stfle	16(%r15)
+	lg	%r2,16(%r15)
+	br	%r14
+.size	OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
+
+.globl	OPENSSL_rdtsc
+.type	OPENSSL_rdtsc,@function
+.align	16
+OPENSSL_rdtsc:
+	stck	16(%r15)
+	lg	%r2,16(%r15)
+	br	%r14
+.size	OPENSSL_rdtsc,.-OPENSSL_rdtsc
+
+.globl	OPENSSL_atomic_add
+.type	OPENSSL_atomic_add,@function
+.align	16
+OPENSSL_atomic_add:
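+	# %r1 holds the last observed value; cs stores %r0 only if memory
+	# still equals %r1 (otherwise it reloads %r1), and brc 4 retries
+	# on that mismatch condition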
+	l	%r1,0(%r2)
+.Lspin:	lr	%r0,%r1
+	ar	%r0,%r3
+	cs	%r1,%r0,0(%r2)
+	brc	4,.Lspin
+	lgfr	%r2,%r0		# OpenSSL expects the new value
+	br	%r14
+.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add
+
+.globl	OPENSSL_wipe_cpu
+.type	OPENSSL_wipe_cpu,@function
+.align	16
+OPENSSL_wipe_cpu:
+	xgr	%r0,%r0
+	xgr	%r1,%r1
+	lgr	%r2,%r15
+	xgr	%r3,%r3
+	xgr	%r4,%r4
+	lzdr	%f0
+	lzdr	%f1
+	lzdr	%f2
+	lzdr	%f3
+	lzdr	%f4
+	lzdr	%f5
+	lzdr	%f6
+	lzdr	%f7
+	br	%r14
+.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+.globl	OPENSSL_cleanse
+.type	OPENSSL_cleanse,@function
+.align	16
+OPENSSL_cleanse:
+	lghi	%r4,15
+	lghi	%r0,0
+	clgr	%r3,%r4
+	jh	.Lot
+.Little:
+	stc	%r0,0(%r2)
+	la	%r2,1(%r2)
+	brctg	%r3,.Little
+	br	%r14
+.align	4
+.Lot:	tmll	%r2,7
+	jz	.Laligned
+	stc	%r0,0(%r2)
+	la	%r2,1(%r2)
+	brctg	%r3,.Lot
+.Laligned:
+	srlg	%r4,%r3,3
+.Loop:	stg	%r0,0(%r2)
+	la	%r2,8(%r2)
+	brctg	%r4,.Loop
+	lghi	%r4,7
+	ngr	%r3,%r4
+	jnz	.Little
+	br	%r14
+.size	OPENSSL_cleanse,.-OPENSSL_cleanse
diff --git a/src/lib/libcrypto/sha/asm/sha1-ia64.pl b/src/lib/libcrypto/sha/asm/sha1-ia64.pl
index aa18c1089b..51c4f47ecb 100644
--- a/src/lib/libcrypto/sha/asm/sha1-ia64.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-ia64.pl
@@ -302,4 +302,5 @@ $code.=<<___;
 stringz	"SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
+$output=shift and open STDOUT,">$output";
 print $code;
diff --git a/src/lib/libcrypto/sparcv9cap.c b/src/lib/libcrypto/sparcv9cap.c
new file mode 100644
index 0000000000..5f31d20bd0
--- /dev/null
+++ b/src/lib/libcrypto/sparcv9cap.c
@@ -0,0 +1,154 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <openssl/bn.h>
+
+#define SPARCV9_TICK_PRIVILEGED	(1<<0)
+#define SPARCV9_PREFER_FPU	(1<<1)
+#define SPARCV9_VIS1		(1<<2)
+#define SPARCV9_VIS2		(1<<3)	/* reserved */
+#define SPARCV9_FMADD		(1<<4)	/* reserved for SPARC64 V */
+static int OPENSSL_sparcv9cap_P=SPARCV9_TICK_PRIVILEGED;
+
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
+	{
+	int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
+	int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
+
+	if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
+		(SPARCV9_PREFER_FPU|SPARCV9_VIS1))
+		return bn_mul_mont_fpu(rp,ap,bp,np,n0,num);
+	else
+		return bn_mul_mont_int(rp,ap,bp,np,n0,num);
+	}
+
+unsigned long OPENSSL_rdtsc(void)
+	{
+	unsigned long _sparcv9_rdtick(void);
+
+	if (OPENSSL_sparcv9cap_P&SPARCV9_TICK_PRIVILEGED)
+#if defined(__sun) && defined(__SVR4)
+		return gethrtime();
+#else
+		return 0;
+#endif
+	else
+		return _sparcv9_rdtick();
+	}
+
+#if defined(__sun) && defined(__SVR4)
+
+#include <dlfcn.h>
+#include <libdevinfo.h>
+#include <sys/systeminfo.h>
+
+typedef di_node_t (*di_init_t)(const char *,uint_t);
+typedef void      (*di_fini_t)(di_node_t);
+typedef char *    (*di_node_name_t)(di_node_t);
+typedef int       (*di_walk_node_t)(di_node_t,uint_t,di_node_name_t,int (*)(di_node_t,di_node_name_t));
+
+#define DLLINK(h,name) (name=(name##_t)dlsym((h),#name))
+
+static int walk_nodename(di_node_t node, di_node_name_t di_node_name)
+	{
+	char *name = (*di_node_name)(node);
+
+	/* This is expected to catch all UltraSPARC flavors prior to T1 */
+	if (!strcmp (name,"SUNW,UltraSPARC") ||
+	    !strncmp(name,"SUNW,UltraSPARC-I",17))  /* covers II,III,IV */
+		{
+		OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU|SPARCV9_VIS1;
+
+		/* %tick is privileged only on UltraSPARC-I/II, but not IIe */
+		if (name[14]!='\0' && name[17]!='\0' && name[18]!='\0')
+			OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
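+		/* e.g. name[18] is 'e' for "SUNW,UltraSPARC-IIe" and 'I'
+		 * for "-III", so both clear the bit; "-I" and "-II" stop
+		 * before name[18] and keep the privileged flag */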
+
+		return DI_WALK_TERMINATE;
+		}
+	/* This is expected to catch remaining UltraSPARCs, such as T1 */
+	else if (!strncmp(name,"SUNW,UltraSPARC",15))
+		{
+		OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
+
+		return DI_WALK_TERMINATE;
+		}
+
+	return DI_WALK_CONTINUE;
+	}
+
+void OPENSSL_cpuid_setup(void)
+	{
+	void *h;
+	char *e,si[256];
+	static int trigger=0;
+
+	if (trigger) return;
+	trigger=1;
+
+	if ((e=getenv("OPENSSL_sparcv9cap")))
+		{
+		OPENSSL_sparcv9cap_P=strtoul(e,NULL,0);
+		return;
+		}
+
+	if (sysinfo(SI_MACHINE,si,sizeof(si))>0)
+		{
+		if (strcmp(si,"sun4v"))
+			/* FPU is preferred for all CPUs except US-T1/2 */
+			OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU;
+		}
+
+	if (sysinfo(SI_ISALIST,si,sizeof(si))>0)
+		{
+		if (strstr(si,"+vis"))
+			OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
+		if (strstr(si,"+vis2"))
+			{
+			OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
+			OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
+			return;
+			}
+		}
+
+	if ((h = dlopen("libdevinfo.so.1",RTLD_LAZY))) do
+		{
+		di_init_t	di_init;
+		di_fini_t	di_fini;
+		di_walk_node_t	di_walk_node;
+		di_node_name_t	di_node_name;
+		di_node_t	root_node;
+
+		if (!DLLINK(h,di_init))		break;
+		if (!DLLINK(h,di_fini))		break;
+		if (!DLLINK(h,di_walk_node))	break;
+		if (!DLLINK(h,di_node_name))	break;
+
+		if ((root_node = (*di_init)("/",DINFOSUBTREE))!=DI_NODE_NIL)
+			{
+			(*di_walk_node)(root_node,DI_WALK_SIBFIRST,
+					di_node_name,walk_nodename);
+			(*di_fini)(root_node);
+			}
+		} while(0);
+
+	if (h) dlclose(h);
+	}
+
+#else
+
+void OPENSSL_cpuid_setup(void)
+	{
+	char *e;
+ 
+	if ((e=getenv("OPENSSL_sparcv9cap")))
+		{
+		OPENSSL_sparcv9cap_P=strtoul(e,NULL,0);
+		return;
+		}
+
+	/* For now we assume that the rest supports UltraSPARC-I* only */
+	OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU|SPARCV9_VIS1;
+	}
+
+#endif
diff --git a/src/lib/libcrypto/ui/ui_lib.c b/src/lib/libcrypto/ui/ui_lib.c
index 7ab249c3be..ac0100808f 100644
--- a/src/lib/libcrypto/ui/ui_lib.c
+++ b/src/lib/libcrypto/ui/ui_lib.c
@@ -90,6 +90,7 @@ UI *UI_new_method(const UI_METHOD *method)
 
 	ret->strings=NULL;
 	ret->user_data=NULL;
+	ret->flags=0;
 	CRYPTO_new_ex_data(CRYPTO_EX_INDEX_UI, ret, &ret->ex_data);
 	return ret;
 	}
diff --git a/src/lib/libcrypto/x509/x509_cmp.c b/src/lib/libcrypto/x509/x509_cmp.c
index e4c682fc44..2faf92514a 100644
--- a/src/lib/libcrypto/x509/x509_cmp.c
+++ b/src/lib/libcrypto/x509/x509_cmp.c
@@ -288,7 +288,8 @@ int X509_NAME_cmp(const X509_NAME *a, const X509_NAME *b)
 			if (!(nabit & STR_TYPE_CMP) ||
 				!(nbbit & STR_TYPE_CMP))
 				return j;
-			j = asn1_string_memcmp(na->value, nb->value);
+			if (!asn1_string_memcmp(na->value, nb->value))
+				j = 0;
 			}
 		else if (na->value->type == V_ASN1_PRINTABLESTRING)
 			j=nocase_spacenorm_cmp(na->value, nb->value);
diff --git a/src/lib/libcrypto/x509/x509_vpm.c b/src/lib/libcrypto/x509/x509_vpm.c
index c92e65936f..2b06718aec 100644
--- a/src/lib/libcrypto/x509/x509_vpm.c
+++ b/src/lib/libcrypto/x509/x509_vpm.c
@@ -74,7 +74,7 @@ static void x509_verify_param_zero(X509_VERIFY_PARAM *param)
 	param->name = NULL;
 	param->purpose = 0;
 	param->trust = 0;
-	param->inh_flags = X509_VP_FLAG_DEFAULT;
+	param->inh_flags = 0;
 	param->flags = 0;
 	param->depth = -1;
 	if (param->policies)
@@ -320,11 +320,21 @@ static const X509_VERIFY_PARAM default_table[] = {
 	0,		/* flags */
 	0,		/* purpose */
 	0,		/* trust */
-	9,		/* depth */
+	100,		/* depth */
 	NULL		/* policies */
 	},
 	{
-	"pkcs7",			/* SSL/TLS client parameters */
+	"pkcs7",			/* S/MIME signing parameters */
+	0,				/* Check time */
+	0,				/* internal flags */
+	0,				/* flags */
+	X509_PURPOSE_SMIME_SIGN,	/* purpose */
+	X509_TRUST_EMAIL,		/* trust */
+	-1,				/* depth */
+	NULL				/* policies */
+	},
+	{
+	"smime_sign",			/* S/MIME signing parameters */
 	0,				/* Check time */
 	0,				/* internal flags */
 	0,				/* flags */
diff --git a/src/lib/libcrypto/x509v3/v3_cpols.c b/src/lib/libcrypto/x509v3/v3_cpols.c
index 95596055ab..ad0506d75c 100644
--- a/src/lib/libcrypto/x509v3/v3_cpols.c
+++ b/src/lib/libcrypto/x509v3/v3_cpols.c
@@ -181,7 +181,11 @@ static STACK_OF(POLICYINFO) *r2i_certpol(X509V3_EXT_METHOD *method,
 			pol = POLICYINFO_new();
 			pol->policyid = pobj;
 		}
-		sk_POLICYINFO_push(pols, pol);
+		if (!sk_POLICYINFO_push(pols, pol)){
+			POLICYINFO_free(pol);
+			X509V3err(X509V3_F_R2I_CERTPOL, ERR_R_MALLOC_FAILURE);
+			goto err;
+		}
 	}
 	sk_CONF_VALUE_pop_free(vals, X509V3_conf_free);
 	return pols;
@@ -447,3 +451,4 @@ void X509_POLICY_NODE_print(BIO *out, X509_POLICY_NODE *node, int indent)
 		BIO_printf(out, "%*sNo Qualifiers\n", indent + 2, "");
 	}
 	
+IMPLEMENT_STACK_OF(X509_POLICY_NODE)
diff --git a/src/lib/libcrypto/x509v3/v3_utl.c b/src/lib/libcrypto/x509v3/v3_utl.c
index a4236bbb6d..7a45216c00 100644
--- a/src/lib/libcrypto/x509v3/v3_utl.c
+++ b/src/lib/libcrypto/x509v3/v3_utl.c
@@ -84,7 +84,7 @@ int X509V3_add_value(const char *name, const char *value,
 	CONF_VALUE *vtmp = NULL;
 	char *tname = NULL, *tvalue = NULL;
 	if(name && !(tname = BUF_strdup(name))) goto err;
-	if(value && !(tvalue = BUF_strdup(value))) goto err;;
+	if(value && !(tvalue = BUF_strdup(value))) goto err;
 	if(!(vtmp = (CONF_VALUE *)OPENSSL_malloc(sizeof(CONF_VALUE)))) goto err;
 	if(!*extlist && !(*extlist = sk_CONF_VALUE_new_null())) goto err;
 	vtmp->section = NULL;
diff --git a/src/lib/libssl/s3_clnt.c b/src/lib/libssl/s3_clnt.c
index 5fd3520caf..50308487aa 100644
--- a/src/lib/libssl/s3_clnt.c
+++ b/src/lib/libssl/s3_clnt.c
@@ -173,7 +173,7 @@ int ssl3_connect(SSL *s)
 	long num1;
 	void (*cb)(const SSL *ssl,int type,int val)=NULL;
 	int ret= -1;
-	int new_state,state,skip=0;;
+	int new_state,state,skip=0;
 
 	RAND_add(&Time,sizeof(Time),0);
 	ERR_clear_error();
diff --git a/src/lib/libssl/ssl_ciph.c b/src/lib/libssl/ssl_ciph.c
index 0c2aa249b4..52f91cfe60 100644
--- a/src/lib/libssl/ssl_ciph.c
+++ b/src/lib/libssl/ssl_ciph.c
@@ -1355,7 +1355,7 @@ int SSL_COMP_add_compression_method(int id, COMP_METHOD *cm)
 	comp->method=cm;
 	load_builtin_compressions();
 	if (ssl_comp_methods
-		&& !sk_SSL_COMP_find(ssl_comp_methods,comp))
+		&& sk_SSL_COMP_find(ssl_comp_methods,comp) >= 0)
 		{
 		OPENSSL_free(comp);
 		MemCheck_on();
diff --git a/src/lib/libssl/ssl_lib.c b/src/lib/libssl/ssl_lib.c
index 68eee77e6f..893abff1f4 100644
--- a/src/lib/libssl/ssl_lib.c
+++ b/src/lib/libssl/ssl_lib.c
@@ -510,6 +510,8 @@ void SSL_free(SSL *s)
 
 	if (s->ctx) SSL_CTX_free(s->ctx);
 #ifndef OPENSSL_NO_TLSEXT
+	if (s->tlsext_hostname)
+		OPENSSL_free(s->tlsext_hostname);
 	if (s->initial_ctx) SSL_CTX_free(s->initial_ctx);
 	if (s->tlsext_ocsp_exts)
 		sk_X509_EXTENSION_pop_free(s->tlsext_ocsp_exts,
diff --git a/src/lib/libssl/test/times b/src/lib/libssl/test/times
index 49aeebf216..738d569b8f 100644
--- a/src/lib/libssl/test/times
+++ b/src/lib/libssl/test/times
@@ -68,7 +68,7 @@ eric (adding numbers to speculation)
 --- Appendix ---
 - The time measured is user time but these number a very rough.
 - Remember this is the cost of both client and server sides of the protocol.
-- The TCP/kernal overhead of connection establishment is normally the
+- The TCP/kernel overhead of connection establishment is normally the
   killer in SSL.  Often delays in the TCP protocol will make session-id
   reuse look slower that new sessions, but this would not be the case on
   a loaded server.
-- 
cgit v1.2.3-55-g6feb