diff options
Diffstat (limited to '')
160 files changed, 48653 insertions, 1420 deletions
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl index c51ee1fbf6..86b86c4a0f 100644 --- a/src/lib/libcrypto/aes/asm/aes-armv4.pl +++ b/src/lib/libcrypto/aes/asm/aes-armv4.pl | |||
| @@ -27,6 +27,11 @@ | |||
| 27 | # Rescheduling for dual-issue pipeline resulted in 12% improvement on | 27 | # Rescheduling for dual-issue pipeline resulted in 12% improvement on |
| 28 | # Cortex A8 core and ~25 cycles per byte processed with 128-bit key. | 28 | # Cortex A8 core and ~25 cycles per byte processed with 128-bit key. |
| 29 | 29 | ||
| 30 | # February 2011. | ||
| 31 | # | ||
| 32 | # Profiler-assisted and platform-specific optimization resulted in 16% | ||
| 33 | # improvement on Cortex A8 core and ~21.5 cycles per byte. | ||
| 34 | |||
| 30 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 35 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| 31 | open STDOUT,">$output"; | 36 | open STDOUT,">$output"; |
| 32 | 37 | ||
| @@ -46,6 +51,7 @@ $key="r11"; | |||
| 46 | $rounds="r12"; | 51 | $rounds="r12"; |
| 47 | 52 | ||
| 48 | $code=<<___; | 53 | $code=<<___; |
| 54 | #include "arm_arch.h" | ||
| 49 | .text | 55 | .text |
| 50 | .code 32 | 56 | .code 32 |
| 51 | 57 | ||
| @@ -166,7 +172,7 @@ AES_encrypt: | |||
| 166 | mov $rounds,r0 @ inp | 172 | mov $rounds,r0 @ inp |
| 167 | mov $key,r2 | 173 | mov $key,r2 |
| 168 | sub $tbl,r3,#AES_encrypt-AES_Te @ Te | 174 | sub $tbl,r3,#AES_encrypt-AES_Te @ Te |
| 169 | 175 | #if __ARM_ARCH__<7 | |
| 170 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral | 176 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral |
| 171 | ldrb $t1,[$rounds,#2] @ manner... | 177 | ldrb $t1,[$rounds,#2] @ manner... |
| 172 | ldrb $t2,[$rounds,#1] | 178 | ldrb $t2,[$rounds,#1] |
| @@ -195,10 +201,33 @@ AES_encrypt: | |||
| 195 | orr $s3,$s3,$t1,lsl#8 | 201 | orr $s3,$s3,$t1,lsl#8 |
| 196 | orr $s3,$s3,$t2,lsl#16 | 202 | orr $s3,$s3,$t2,lsl#16 |
| 197 | orr $s3,$s3,$t3,lsl#24 | 203 | orr $s3,$s3,$t3,lsl#24 |
| 198 | 204 | #else | |
| 205 | ldr $s0,[$rounds,#0] | ||
| 206 | ldr $s1,[$rounds,#4] | ||
| 207 | ldr $s2,[$rounds,#8] | ||
| 208 | ldr $s3,[$rounds,#12] | ||
| 209 | #ifdef __ARMEL__ | ||
| 210 | rev $s0,$s0 | ||
| 211 | rev $s1,$s1 | ||
| 212 | rev $s2,$s2 | ||
| 213 | rev $s3,$s3 | ||
| 214 | #endif | ||
| 215 | #endif | ||
| 199 | bl _armv4_AES_encrypt | 216 | bl _armv4_AES_encrypt |
| 200 | 217 | ||
| 201 | ldr $rounds,[sp],#4 @ pop out | 218 | ldr $rounds,[sp],#4 @ pop out |
| 219 | #if __ARM_ARCH__>=7 | ||
| 220 | #ifdef __ARMEL__ | ||
| 221 | rev $s0,$s0 | ||
| 222 | rev $s1,$s1 | ||
| 223 | rev $s2,$s2 | ||
| 224 | rev $s3,$s3 | ||
| 225 | #endif | ||
| 226 | str $s0,[$rounds,#0] | ||
| 227 | str $s1,[$rounds,#4] | ||
| 228 | str $s2,[$rounds,#8] | ||
| 229 | str $s3,[$rounds,#12] | ||
| 230 | #else | ||
| 202 | mov $t1,$s0,lsr#24 @ write output in endian-neutral | 231 | mov $t1,$s0,lsr#24 @ write output in endian-neutral |
| 203 | mov $t2,$s0,lsr#16 @ manner... | 232 | mov $t2,$s0,lsr#16 @ manner... |
| 204 | mov $t3,$s0,lsr#8 | 233 | mov $t3,$s0,lsr#8 |
| @@ -227,11 +256,15 @@ AES_encrypt: | |||
| 227 | strb $t2,[$rounds,#13] | 256 | strb $t2,[$rounds,#13] |
| 228 | strb $t3,[$rounds,#14] | 257 | strb $t3,[$rounds,#14] |
| 229 | strb $s3,[$rounds,#15] | 258 | strb $s3,[$rounds,#15] |
| 230 | 259 | #endif | |
| 260 | #if __ARM_ARCH__>=5 | ||
| 261 | ldmia sp!,{r4-r12,pc} | ||
| 262 | #else | ||
| 231 | ldmia sp!,{r4-r12,lr} | 263 | ldmia sp!,{r4-r12,lr} |
| 232 | tst lr,#1 | 264 | tst lr,#1 |
| 233 | moveq pc,lr @ be binary compatible with V4, yet | 265 | moveq pc,lr @ be binary compatible with V4, yet |
| 234 | bx lr @ interoperable with Thumb ISA:-) | 266 | bx lr @ interoperable with Thumb ISA:-) |
| 267 | #endif | ||
| 235 | .size AES_encrypt,.-AES_encrypt | 268 | .size AES_encrypt,.-AES_encrypt |
| 236 | 269 | ||
| 237 | .type _armv4_AES_encrypt,%function | 270 | .type _armv4_AES_encrypt,%function |
| @@ -271,11 +304,11 @@ _armv4_AES_encrypt: | |||
| 271 | and $i2,lr,$s2,lsr#16 @ i1 | 304 | and $i2,lr,$s2,lsr#16 @ i1 |
| 272 | eor $t3,$t3,$i3,ror#8 | 305 | eor $t3,$t3,$i3,ror#8 |
| 273 | and $i3,lr,$s2 | 306 | and $i3,lr,$s2 |
| 274 | eor $s1,$s1,$t1,ror#24 | ||
| 275 | ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] | 307 | ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] |
| 308 | eor $s1,$s1,$t1,ror#24 | ||
| 309 | ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] | ||
| 276 | mov $s2,$s2,lsr#24 | 310 | mov $s2,$s2,lsr#24 |
| 277 | 311 | ||
| 278 | ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] | ||
| 279 | ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] | 312 | ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] |
| 280 | eor $s0,$s0,$i1,ror#16 | 313 | eor $s0,$s0,$i1,ror#16 |
| 281 | ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] | 314 | ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] |
| @@ -284,16 +317,16 @@ _armv4_AES_encrypt: | |||
| 284 | and $i2,lr,$s3,lsr#8 @ i1 | 317 | and $i2,lr,$s3,lsr#8 @ i1 |
| 285 | eor $t3,$t3,$i3,ror#16 | 318 | eor $t3,$t3,$i3,ror#16 |
| 286 | and $i3,lr,$s3,lsr#16 @ i2 | 319 | and $i3,lr,$s3,lsr#16 @ i2 |
| 287 | eor $s2,$s2,$t2,ror#16 | ||
| 288 | ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] | 320 | ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] |
| 321 | eor $s2,$s2,$t2,ror#16 | ||
| 322 | ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] | ||
| 289 | mov $s3,$s3,lsr#24 | 323 | mov $s3,$s3,lsr#24 |
| 290 | 324 | ||
| 291 | ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] | ||
| 292 | ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] | 325 | ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] |
| 293 | eor $s0,$s0,$i1,ror#24 | 326 | eor $s0,$s0,$i1,ror#24 |
| 294 | ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24] | ||
| 295 | eor $s1,$s1,$i2,ror#16 | ||
| 296 | ldr $i1,[$key],#16 | 327 | ldr $i1,[$key],#16 |
| 328 | eor $s1,$s1,$i2,ror#16 | ||
| 329 | ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24] | ||
| 297 | eor $s2,$s2,$i3,ror#8 | 330 | eor $s2,$s2,$i3,ror#8 |
| 298 | ldr $t1,[$key,#-12] | 331 | ldr $t1,[$key,#-12] |
| 299 | eor $s3,$s3,$t3,ror#8 | 332 | eor $s3,$s3,$t3,ror#8 |
| @@ -333,11 +366,11 @@ _armv4_AES_encrypt: | |||
| 333 | and $i2,lr,$s2,lsr#16 @ i1 | 366 | and $i2,lr,$s2,lsr#16 @ i1 |
| 334 | eor $t3,$i3,$t3,lsl#8 | 367 | eor $t3,$i3,$t3,lsl#8 |
| 335 | and $i3,lr,$s2 | 368 | and $i3,lr,$s2 |
| 336 | eor $s1,$t1,$s1,lsl#24 | ||
| 337 | ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] | 369 | ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] |
| 370 | eor $s1,$t1,$s1,lsl#24 | ||
| 371 | ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] | ||
| 338 | mov $s2,$s2,lsr#24 | 372 | mov $s2,$s2,lsr#24 |
| 339 | 373 | ||
| 340 | ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] | ||
| 341 | ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] | 374 | ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] |
| 342 | eor $s0,$i1,$s0,lsl#8 | 375 | eor $s0,$i1,$s0,lsl#8 |
| 343 | ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] | 376 | ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] |
| @@ -346,15 +379,15 @@ _armv4_AES_encrypt: | |||
| 346 | and $i2,lr,$s3,lsr#8 @ i1 | 379 | and $i2,lr,$s3,lsr#8 @ i1 |
| 347 | eor $t3,$i3,$t3,lsl#8 | 380 | eor $t3,$i3,$t3,lsl#8 |
| 348 | and $i3,lr,$s3,lsr#16 @ i2 | 381 | and $i3,lr,$s3,lsr#16 @ i2 |
| 349 | eor $s2,$t2,$s2,lsl#24 | ||
| 350 | ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] | 382 | ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] |
| 383 | eor $s2,$t2,$s2,lsl#24 | ||
| 384 | ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] | ||
| 351 | mov $s3,$s3,lsr#24 | 385 | mov $s3,$s3,lsr#24 |
| 352 | 386 | ||
| 353 | ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] | ||
| 354 | ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] | 387 | ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] |
| 355 | eor $s0,$i1,$s0,lsl#8 | 388 | eor $s0,$i1,$s0,lsl#8 |
| 356 | ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24] | ||
| 357 | ldr $i1,[$key,#0] | 389 | ldr $i1,[$key,#0] |
| 390 | ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24] | ||
| 358 | eor $s1,$s1,$i2,lsl#8 | 391 | eor $s1,$s1,$i2,lsl#8 |
| 359 | ldr $t1,[$key,#4] | 392 | ldr $t1,[$key,#4] |
| 360 | eor $s2,$s2,$i3,lsl#16 | 393 | eor $s2,$s2,$i3,lsl#16 |
| @@ -371,10 +404,11 @@ _armv4_AES_encrypt: | |||
| 371 | ldr pc,[sp],#4 @ pop and return | 404 | ldr pc,[sp],#4 @ pop and return |
| 372 | .size _armv4_AES_encrypt,.-_armv4_AES_encrypt | 405 | .size _armv4_AES_encrypt,.-_armv4_AES_encrypt |
| 373 | 406 | ||
| 374 | .global AES_set_encrypt_key | 407 | .global private_AES_set_encrypt_key |
| 375 | .type AES_set_encrypt_key,%function | 408 | .type private_AES_set_encrypt_key,%function |
| 376 | .align 5 | 409 | .align 5 |
| 377 | AES_set_encrypt_key: | 410 | private_AES_set_encrypt_key: |
| 411 | _armv4_AES_set_encrypt_key: | ||
| 378 | sub r3,pc,#8 @ AES_set_encrypt_key | 412 | sub r3,pc,#8 @ AES_set_encrypt_key |
| 379 | teq r0,#0 | 413 | teq r0,#0 |
| 380 | moveq r0,#-1 | 414 | moveq r0,#-1 |
| @@ -392,12 +426,13 @@ AES_set_encrypt_key: | |||
| 392 | bne .Labrt | 426 | bne .Labrt |
| 393 | 427 | ||
| 394 | .Lok: stmdb sp!,{r4-r12,lr} | 428 | .Lok: stmdb sp!,{r4-r12,lr} |
| 395 | sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4 | 429 | sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 |
| 396 | 430 | ||
| 397 | mov $rounds,r0 @ inp | 431 | mov $rounds,r0 @ inp |
| 398 | mov lr,r1 @ bits | 432 | mov lr,r1 @ bits |
| 399 | mov $key,r2 @ key | 433 | mov $key,r2 @ key |
| 400 | 434 | ||
| 435 | #if __ARM_ARCH__<7 | ||
| 401 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral | 436 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral |
| 402 | ldrb $t1,[$rounds,#2] @ manner... | 437 | ldrb $t1,[$rounds,#2] @ manner... |
| 403 | ldrb $t2,[$rounds,#1] | 438 | ldrb $t2,[$rounds,#1] |
| @@ -430,6 +465,22 @@ AES_set_encrypt_key: | |||
| 430 | orr $s3,$s3,$t3,lsl#24 | 465 | orr $s3,$s3,$t3,lsl#24 |
| 431 | str $s2,[$key,#-8] | 466 | str $s2,[$key,#-8] |
| 432 | str $s3,[$key,#-4] | 467 | str $s3,[$key,#-4] |
| 468 | #else | ||
| 469 | ldr $s0,[$rounds,#0] | ||
| 470 | ldr $s1,[$rounds,#4] | ||
| 471 | ldr $s2,[$rounds,#8] | ||
| 472 | ldr $s3,[$rounds,#12] | ||
| 473 | #ifdef __ARMEL__ | ||
| 474 | rev $s0,$s0 | ||
| 475 | rev $s1,$s1 | ||
| 476 | rev $s2,$s2 | ||
| 477 | rev $s3,$s3 | ||
| 478 | #endif | ||
| 479 | str $s0,[$key],#16 | ||
| 480 | str $s1,[$key,#-12] | ||
| 481 | str $s2,[$key,#-8] | ||
| 482 | str $s3,[$key,#-4] | ||
| 483 | #endif | ||
| 433 | 484 | ||
| 434 | teq lr,#128 | 485 | teq lr,#128 |
| 435 | bne .Lnot128 | 486 | bne .Lnot128 |
| @@ -466,6 +517,7 @@ AES_set_encrypt_key: | |||
| 466 | b .Ldone | 517 | b .Ldone |
| 467 | 518 | ||
| 468 | .Lnot128: | 519 | .Lnot128: |
| 520 | #if __ARM_ARCH__<7 | ||
| 469 | ldrb $i2,[$rounds,#19] | 521 | ldrb $i2,[$rounds,#19] |
| 470 | ldrb $t1,[$rounds,#18] | 522 | ldrb $t1,[$rounds,#18] |
| 471 | ldrb $t2,[$rounds,#17] | 523 | ldrb $t2,[$rounds,#17] |
| @@ -482,6 +534,16 @@ AES_set_encrypt_key: | |||
| 482 | str $i2,[$key],#8 | 534 | str $i2,[$key],#8 |
| 483 | orr $i3,$i3,$t3,lsl#24 | 535 | orr $i3,$i3,$t3,lsl#24 |
| 484 | str $i3,[$key,#-4] | 536 | str $i3,[$key,#-4] |
| 537 | #else | ||
| 538 | ldr $i2,[$rounds,#16] | ||
| 539 | ldr $i3,[$rounds,#20] | ||
| 540 | #ifdef __ARMEL__ | ||
| 541 | rev $i2,$i2 | ||
| 542 | rev $i3,$i3 | ||
| 543 | #endif | ||
| 544 | str $i2,[$key],#8 | ||
| 545 | str $i3,[$key,#-4] | ||
| 546 | #endif | ||
| 485 | 547 | ||
| 486 | teq lr,#192 | 548 | teq lr,#192 |
| 487 | bne .Lnot192 | 549 | bne .Lnot192 |
| @@ -526,6 +588,7 @@ AES_set_encrypt_key: | |||
| 526 | b .L192_loop | 588 | b .L192_loop |
| 527 | 589 | ||
| 528 | .Lnot192: | 590 | .Lnot192: |
| 591 | #if __ARM_ARCH__<7 | ||
| 529 | ldrb $i2,[$rounds,#27] | 592 | ldrb $i2,[$rounds,#27] |
| 530 | ldrb $t1,[$rounds,#26] | 593 | ldrb $t1,[$rounds,#26] |
| 531 | ldrb $t2,[$rounds,#25] | 594 | ldrb $t2,[$rounds,#25] |
| @@ -542,6 +605,16 @@ AES_set_encrypt_key: | |||
| 542 | str $i2,[$key],#8 | 605 | str $i2,[$key],#8 |
| 543 | orr $i3,$i3,$t3,lsl#24 | 606 | orr $i3,$i3,$t3,lsl#24 |
| 544 | str $i3,[$key,#-4] | 607 | str $i3,[$key,#-4] |
| 608 | #else | ||
| 609 | ldr $i2,[$rounds,#24] | ||
| 610 | ldr $i3,[$rounds,#28] | ||
| 611 | #ifdef __ARMEL__ | ||
| 612 | rev $i2,$i2 | ||
| 613 | rev $i3,$i3 | ||
| 614 | #endif | ||
| 615 | str $i2,[$key],#8 | ||
| 616 | str $i3,[$key,#-4] | ||
| 617 | #endif | ||
| 545 | 618 | ||
| 546 | mov $rounds,#14 | 619 | mov $rounds,#14 |
| 547 | str $rounds,[$key,#240-32] | 620 | str $rounds,[$key,#240-32] |
| @@ -606,14 +679,14 @@ AES_set_encrypt_key: | |||
| 606 | .Labrt: tst lr,#1 | 679 | .Labrt: tst lr,#1 |
| 607 | moveq pc,lr @ be binary compatible with V4, yet | 680 | moveq pc,lr @ be binary compatible with V4, yet |
| 608 | bx lr @ interoperable with Thumb ISA:-) | 681 | bx lr @ interoperable with Thumb ISA:-) |
| 609 | .size AES_set_encrypt_key,.-AES_set_encrypt_key | 682 | .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key |
| 610 | 683 | ||
| 611 | .global AES_set_decrypt_key | 684 | .global private_AES_set_decrypt_key |
| 612 | .type AES_set_decrypt_key,%function | 685 | .type private_AES_set_decrypt_key,%function |
| 613 | .align 5 | 686 | .align 5 |
| 614 | AES_set_decrypt_key: | 687 | private_AES_set_decrypt_key: |
| 615 | str lr,[sp,#-4]! @ push lr | 688 | str lr,[sp,#-4]! @ push lr |
| 616 | bl AES_set_encrypt_key | 689 | bl _armv4_AES_set_encrypt_key |
| 617 | teq r0,#0 | 690 | teq r0,#0 |
| 618 | ldrne lr,[sp],#4 @ pop lr | 691 | ldrne lr,[sp],#4 @ pop lr |
| 619 | bne .Labrt | 692 | bne .Labrt |
| @@ -692,11 +765,15 @@ $code.=<<___; | |||
| 692 | bne .Lmix | 765 | bne .Lmix |
| 693 | 766 | ||
| 694 | mov r0,#0 | 767 | mov r0,#0 |
| 768 | #if __ARM_ARCH__>=5 | ||
| 769 | ldmia sp!,{r4-r12,pc} | ||
| 770 | #else | ||
| 695 | ldmia sp!,{r4-r12,lr} | 771 | ldmia sp!,{r4-r12,lr} |
| 696 | tst lr,#1 | 772 | tst lr,#1 |
| 697 | moveq pc,lr @ be binary compatible with V4, yet | 773 | moveq pc,lr @ be binary compatible with V4, yet |
| 698 | bx lr @ interoperable with Thumb ISA:-) | 774 | bx lr @ interoperable with Thumb ISA:-) |
| 699 | .size AES_set_decrypt_key,.-AES_set_decrypt_key | 775 | #endif |
| 776 | .size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key | ||
| 700 | 777 | ||
| 701 | .type AES_Td,%object | 778 | .type AES_Td,%object |
| 702 | .align 5 | 779 | .align 5 |
| @@ -811,7 +888,7 @@ AES_decrypt: | |||
| 811 | mov $rounds,r0 @ inp | 888 | mov $rounds,r0 @ inp |
| 812 | mov $key,r2 | 889 | mov $key,r2 |
| 813 | sub $tbl,r3,#AES_decrypt-AES_Td @ Td | 890 | sub $tbl,r3,#AES_decrypt-AES_Td @ Td |
| 814 | 891 | #if __ARM_ARCH__<7 | |
| 815 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral | 892 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral |
| 816 | ldrb $t1,[$rounds,#2] @ manner... | 893 | ldrb $t1,[$rounds,#2] @ manner... |
| 817 | ldrb $t2,[$rounds,#1] | 894 | ldrb $t2,[$rounds,#1] |
| @@ -840,10 +917,33 @@ AES_decrypt: | |||
| 840 | orr $s3,$s3,$t1,lsl#8 | 917 | orr $s3,$s3,$t1,lsl#8 |
| 841 | orr $s3,$s3,$t2,lsl#16 | 918 | orr $s3,$s3,$t2,lsl#16 |
| 842 | orr $s3,$s3,$t3,lsl#24 | 919 | orr $s3,$s3,$t3,lsl#24 |
| 843 | 920 | #else | |
| 921 | ldr $s0,[$rounds,#0] | ||
| 922 | ldr $s1,[$rounds,#4] | ||
| 923 | ldr $s2,[$rounds,#8] | ||
| 924 | ldr $s3,[$rounds,#12] | ||
| 925 | #ifdef __ARMEL__ | ||
| 926 | rev $s0,$s0 | ||
| 927 | rev $s1,$s1 | ||
| 928 | rev $s2,$s2 | ||
| 929 | rev $s3,$s3 | ||
| 930 | #endif | ||
| 931 | #endif | ||
| 844 | bl _armv4_AES_decrypt | 932 | bl _armv4_AES_decrypt |
| 845 | 933 | ||
| 846 | ldr $rounds,[sp],#4 @ pop out | 934 | ldr $rounds,[sp],#4 @ pop out |
| 935 | #if __ARM_ARCH__>=7 | ||
| 936 | #ifdef __ARMEL__ | ||
| 937 | rev $s0,$s0 | ||
| 938 | rev $s1,$s1 | ||
| 939 | rev $s2,$s2 | ||
| 940 | rev $s3,$s3 | ||
| 941 | #endif | ||
| 942 | str $s0,[$rounds,#0] | ||
| 943 | str $s1,[$rounds,#4] | ||
| 944 | str $s2,[$rounds,#8] | ||
| 945 | str $s3,[$rounds,#12] | ||
| 946 | #else | ||
| 847 | mov $t1,$s0,lsr#24 @ write output in endian-neutral | 947 | mov $t1,$s0,lsr#24 @ write output in endian-neutral |
| 848 | mov $t2,$s0,lsr#16 @ manner... | 948 | mov $t2,$s0,lsr#16 @ manner... |
| 849 | mov $t3,$s0,lsr#8 | 949 | mov $t3,$s0,lsr#8 |
| @@ -872,11 +972,15 @@ AES_decrypt: | |||
| 872 | strb $t2,[$rounds,#13] | 972 | strb $t2,[$rounds,#13] |
| 873 | strb $t3,[$rounds,#14] | 973 | strb $t3,[$rounds,#14] |
| 874 | strb $s3,[$rounds,#15] | 974 | strb $s3,[$rounds,#15] |
| 875 | 975 | #endif | |
| 976 | #if __ARM_ARCH__>=5 | ||
| 977 | ldmia sp!,{r4-r12,pc} | ||
| 978 | #else | ||
| 876 | ldmia sp!,{r4-r12,lr} | 979 | ldmia sp!,{r4-r12,lr} |
| 877 | tst lr,#1 | 980 | tst lr,#1 |
| 878 | moveq pc,lr @ be binary compatible with V4, yet | 981 | moveq pc,lr @ be binary compatible with V4, yet |
| 879 | bx lr @ interoperable with Thumb ISA:-) | 982 | bx lr @ interoperable with Thumb ISA:-) |
| 983 | #endif | ||
| 880 | .size AES_decrypt,.-AES_decrypt | 984 | .size AES_decrypt,.-AES_decrypt |
| 881 | 985 | ||
| 882 | .type _armv4_AES_decrypt,%function | 986 | .type _armv4_AES_decrypt,%function |
| @@ -916,11 +1020,11 @@ _armv4_AES_decrypt: | |||
| 916 | and $i2,lr,$s2 @ i1 | 1020 | and $i2,lr,$s2 @ i1 |
| 917 | eor $t3,$i3,$t3,ror#8 | 1021 | eor $t3,$i3,$t3,ror#8 |
| 918 | and $i3,lr,$s2,lsr#16 | 1022 | and $i3,lr,$s2,lsr#16 |
| 919 | eor $s1,$s1,$t1,ror#8 | ||
| 920 | ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] | 1023 | ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] |
| 1024 | eor $s1,$s1,$t1,ror#8 | ||
| 1025 | ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] | ||
| 921 | mov $s2,$s2,lsr#24 | 1026 | mov $s2,$s2,lsr#24 |
| 922 | 1027 | ||
| 923 | ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] | ||
| 924 | ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] | 1028 | ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] |
| 925 | eor $s0,$s0,$i1,ror#16 | 1029 | eor $s0,$s0,$i1,ror#16 |
| 926 | ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] | 1030 | ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] |
| @@ -929,22 +1033,22 @@ _armv4_AES_decrypt: | |||
| 929 | and $i2,lr,$s3,lsr#8 @ i1 | 1033 | and $i2,lr,$s3,lsr#8 @ i1 |
| 930 | eor $t3,$i3,$t3,ror#8 | 1034 | eor $t3,$i3,$t3,ror#8 |
| 931 | and $i3,lr,$s3 @ i2 | 1035 | and $i3,lr,$s3 @ i2 |
| 932 | eor $s2,$s2,$t2,ror#8 | ||
| 933 | ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] | 1036 | ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] |
| 1037 | eor $s2,$s2,$t2,ror#8 | ||
| 1038 | ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] | ||
| 934 | mov $s3,$s3,lsr#24 | 1039 | mov $s3,$s3,lsr#24 |
| 935 | 1040 | ||
| 936 | ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] | ||
| 937 | ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] | 1041 | ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] |
| 938 | eor $s0,$s0,$i1,ror#8 | 1042 | eor $s0,$s0,$i1,ror#8 |
| 939 | ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] | 1043 | ldr $i1,[$key],#16 |
| 940 | eor $s1,$s1,$i2,ror#16 | 1044 | eor $s1,$s1,$i2,ror#16 |
| 1045 | ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] | ||
| 941 | eor $s2,$s2,$i3,ror#24 | 1046 | eor $s2,$s2,$i3,ror#24 |
| 942 | ldr $i1,[$key],#16 | ||
| 943 | eor $s3,$s3,$t3,ror#8 | ||
| 944 | 1047 | ||
| 945 | ldr $t1,[$key,#-12] | 1048 | ldr $t1,[$key,#-12] |
| 946 | ldr $t2,[$key,#-8] | ||
| 947 | eor $s0,$s0,$i1 | 1049 | eor $s0,$s0,$i1 |
| 1050 | ldr $t2,[$key,#-8] | ||
| 1051 | eor $s3,$s3,$t3,ror#8 | ||
| 948 | ldr $t3,[$key,#-4] | 1052 | ldr $t3,[$key,#-4] |
| 949 | and $i1,lr,$s0,lsr#16 | 1053 | and $i1,lr,$s0,lsr#16 |
| 950 | eor $s1,$s1,$t1 | 1054 | eor $s1,$s1,$t1 |
| @@ -985,11 +1089,11 @@ _armv4_AES_decrypt: | |||
| 985 | and $i1,lr,$s2,lsr#8 @ i0 | 1089 | and $i1,lr,$s2,lsr#8 @ i0 |
| 986 | eor $t2,$t2,$i2,lsl#8 | 1090 | eor $t2,$t2,$i2,lsl#8 |
| 987 | and $i2,lr,$s2 @ i1 | 1091 | and $i2,lr,$s2 @ i1 |
| 988 | eor $t3,$t3,$i3,lsl#8 | ||
| 989 | ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] | 1092 | ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] |
| 1093 | eor $t3,$t3,$i3,lsl#8 | ||
| 1094 | ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] | ||
| 990 | and $i3,lr,$s2,lsr#16 | 1095 | and $i3,lr,$s2,lsr#16 |
| 991 | 1096 | ||
| 992 | ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] | ||
| 993 | ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] | 1097 | ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] |
| 994 | eor $s0,$s0,$i1,lsl#8 | 1098 | eor $s0,$s0,$i1,lsl#8 |
| 995 | ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] | 1099 | ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] |
| @@ -997,11 +1101,11 @@ _armv4_AES_decrypt: | |||
| 997 | and $i1,lr,$s3,lsr#16 @ i0 | 1101 | and $i1,lr,$s3,lsr#16 @ i0 |
| 998 | eor $s2,$t2,$s2,lsl#16 | 1102 | eor $s2,$t2,$s2,lsl#16 |
| 999 | and $i2,lr,$s3,lsr#8 @ i1 | 1103 | and $i2,lr,$s3,lsr#8 @ i1 |
| 1000 | eor $t3,$t3,$i3,lsl#16 | ||
| 1001 | ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] | 1104 | ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] |
| 1105 | eor $t3,$t3,$i3,lsl#16 | ||
| 1106 | ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] | ||
| 1002 | and $i3,lr,$s3 @ i2 | 1107 | and $i3,lr,$s3 @ i2 |
| 1003 | 1108 | ||
| 1004 | ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] | ||
| 1005 | ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] | 1109 | ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] |
| 1006 | ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] | 1110 | ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] |
| 1007 | eor $s0,$s0,$i1,lsl#16 | 1111 | eor $s0,$s0,$i1,lsl#16 |
diff --git a/src/lib/libcrypto/aes/asm/aes-mips.pl b/src/lib/libcrypto/aes/asm/aes-mips.pl new file mode 100644 index 0000000000..2ce6deffc8 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aes-mips.pl | |||
| @@ -0,0 +1,1611 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # AES for MIPS | ||
| 11 | |||
| 12 | # October 2010 | ||
| 13 | # | ||
| 14 | # Code uses 1K[+256B] S-box and on single-issue core [such as R5000] | ||
| 15 | # spends ~68 cycles per byte processed with 128-bit key. This is ~16% | ||
| 16 | # faster than gcc-generated code, which is not very impressive. But | ||
| 17 | # recall that compressed S-box requires extra processing, namely | ||
| 18 | # additional rotations. Rotations are implemented with lwl/lwr pairs, | ||
| 19 | # which is normally used for loading unaligned data. Another cool | ||
| 20 | # thing about this module is its endian neutrality, which means that | ||
| 21 | # it processes data without ever changing byte order... | ||
| 22 | |||
| 23 | ###################################################################### | ||
| 24 | # There is a number of MIPS ABI in use, O32 and N32/64 are most | ||
| 25 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
| 26 | # one picks the latter, it's possible to arrange code in ABI neutral | ||
| 27 | # manner. Therefore let's stick to NUBI register layout: | ||
| 28 | # | ||
| 29 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
| 30 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 31 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
| 32 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
| 33 | # | ||
| 34 | # The return value is placed in $a0. Following coding rules facilitate | ||
| 35 | # interoperability: | ||
| 36 | # | ||
| 37 | # - never ever touch $tp, "thread pointer", former $gp; | ||
| 38 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
| 39 | # old code]; | ||
| 40 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
| 41 | # | ||
| 42 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
| 43 | # | ||
| 44 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
| 45 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 46 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
| 47 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
| 48 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
| 49 | # | ||
| 50 | $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
| 51 | |||
| 52 | if ($flavour =~ /64|n32/i) { | ||
| 53 | $PTR_ADD="dadd"; # incidentally works even on n32 | ||
| 54 | $PTR_SUB="dsub"; # incidentally works even on n32 | ||
| 55 | $REG_S="sd"; | ||
| 56 | $REG_L="ld"; | ||
| 57 | $PTR_SLL="dsll"; # incidentally works even on n32 | ||
| 58 | $SZREG=8; | ||
| 59 | } else { | ||
| 60 | $PTR_ADD="add"; | ||
| 61 | $PTR_SUB="sub"; | ||
| 62 | $REG_S="sw"; | ||
| 63 | $REG_L="lw"; | ||
| 64 | $PTR_SLL="sll"; | ||
| 65 | $SZREG=4; | ||
| 66 | } | ||
| 67 | $pf = ($flavour =~ /nubi/i) ? $t0 : $t2; | ||
| 68 | # | ||
| 69 | # <appro@openssl.org> | ||
| 70 | # | ||
| 71 | ###################################################################### | ||
| 72 | |||
| 73 | $big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; | ||
| 74 | |||
| 75 | for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } | ||
| 76 | open STDOUT,">$output"; | ||
| 77 | |||
| 78 | if (!defined($big_endian)) | ||
| 79 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
| 80 | |||
| 81 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 82 | open STDOUT,">$output"; | ||
| 83 | |||
| 84 | my ($MSB,$LSB)=(0,3); # automatically converted to little-endian | ||
| 85 | |||
| 86 | $code.=<<___; | ||
| 87 | .text | ||
| 88 | #ifdef OPENSSL_FIPSCANISTER | ||
| 89 | # include <openssl/fipssyms.h> | ||
| 90 | #endif | ||
| 91 | |||
| 92 | #if !defined(__vxworks) || defined(__pic__) | ||
| 93 | .option pic2 | ||
| 94 | #endif | ||
| 95 | .set noat | ||
| 96 | ___ | ||
| 97 | |||
| 98 | {{{ | ||
| 99 | my $FRAMESIZE=16*$SZREG; | ||
| 100 | my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; | ||
| 101 | |||
| 102 | my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7); | ||
| 103 | my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); | ||
| 104 | my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23)); | ||
| 105 | my ($key0,$cnt)=($gp,$fp); | ||
| 106 | |||
| 107 | # instuction ordering is "stolen" from output from MIPSpro assembler | ||
| 108 | # invoked with -mips3 -O3 arguments... | ||
| 109 | $code.=<<___; | ||
| 110 | .align 5 | ||
| 111 | .ent _mips_AES_encrypt | ||
| 112 | _mips_AES_encrypt: | ||
| 113 | .frame $sp,0,$ra | ||
| 114 | .set reorder | ||
| 115 | lw $t0,0($key) | ||
| 116 | lw $t1,4($key) | ||
| 117 | lw $t2,8($key) | ||
| 118 | lw $t3,12($key) | ||
| 119 | lw $cnt,240($key) | ||
| 120 | $PTR_ADD $key0,$key,16 | ||
| 121 | |||
| 122 | xor $s0,$t0 | ||
| 123 | xor $s1,$t1 | ||
| 124 | xor $s2,$t2 | ||
| 125 | xor $s3,$t3 | ||
| 126 | |||
| 127 | sub $cnt,1 | ||
| 128 | _xtr $i0,$s1,16-2 | ||
| 129 | .Loop_enc: | ||
| 130 | _xtr $i1,$s2,16-2 | ||
| 131 | _xtr $i2,$s3,16-2 | ||
| 132 | _xtr $i3,$s0,16-2 | ||
| 133 | and $i0,0x3fc | ||
| 134 | and $i1,0x3fc | ||
| 135 | and $i2,0x3fc | ||
| 136 | and $i3,0x3fc | ||
| 137 | $PTR_ADD $i0,$Tbl | ||
| 138 | $PTR_ADD $i1,$Tbl | ||
| 139 | $PTR_ADD $i2,$Tbl | ||
| 140 | $PTR_ADD $i3,$Tbl | ||
| 141 | lwl $t0,3($i0) # Te1[s1>>16] | ||
| 142 | lwl $t1,3($i1) # Te1[s2>>16] | ||
| 143 | lwl $t2,3($i2) # Te1[s3>>16] | ||
| 144 | lwl $t3,3($i3) # Te1[s0>>16] | ||
| 145 | lwr $t0,2($i0) # Te1[s1>>16] | ||
| 146 | lwr $t1,2($i1) # Te1[s2>>16] | ||
| 147 | lwr $t2,2($i2) # Te1[s3>>16] | ||
| 148 | lwr $t3,2($i3) # Te1[s0>>16] | ||
| 149 | |||
| 150 | _xtr $i0,$s2,8-2 | ||
| 151 | _xtr $i1,$s3,8-2 | ||
| 152 | _xtr $i2,$s0,8-2 | ||
| 153 | _xtr $i3,$s1,8-2 | ||
| 154 | and $i0,0x3fc | ||
| 155 | and $i1,0x3fc | ||
| 156 | and $i2,0x3fc | ||
| 157 | and $i3,0x3fc | ||
| 158 | $PTR_ADD $i0,$Tbl | ||
| 159 | $PTR_ADD $i1,$Tbl | ||
| 160 | $PTR_ADD $i2,$Tbl | ||
| 161 | $PTR_ADD $i3,$Tbl | ||
| 162 | lwl $t4,2($i0) # Te2[s2>>8] | ||
| 163 | lwl $t5,2($i1) # Te2[s3>>8] | ||
| 164 | lwl $t6,2($i2) # Te2[s0>>8] | ||
| 165 | lwl $t7,2($i3) # Te2[s1>>8] | ||
| 166 | lwr $t4,1($i0) # Te2[s2>>8] | ||
| 167 | lwr $t5,1($i1) # Te2[s3>>8] | ||
| 168 | lwr $t6,1($i2) # Te2[s0>>8] | ||
| 169 | lwr $t7,1($i3) # Te2[s1>>8] | ||
| 170 | |||
| 171 | _xtr $i0,$s3,0-2 | ||
| 172 | _xtr $i1,$s0,0-2 | ||
| 173 | _xtr $i2,$s1,0-2 | ||
| 174 | _xtr $i3,$s2,0-2 | ||
| 175 | and $i0,0x3fc | ||
| 176 | and $i1,0x3fc | ||
| 177 | and $i2,0x3fc | ||
| 178 | and $i3,0x3fc | ||
| 179 | $PTR_ADD $i0,$Tbl | ||
| 180 | $PTR_ADD $i1,$Tbl | ||
| 181 | $PTR_ADD $i2,$Tbl | ||
| 182 | $PTR_ADD $i3,$Tbl | ||
| 183 | lwl $t8,1($i0) # Te3[s3] | ||
| 184 | lwl $t9,1($i1) # Te3[s0] | ||
| 185 | lwl $t10,1($i2) # Te3[s1] | ||
| 186 | lwl $t11,1($i3) # Te3[s2] | ||
| 187 | lwr $t8,0($i0) # Te3[s3] | ||
| 188 | lwr $t9,0($i1) # Te3[s0] | ||
| 189 | lwr $t10,0($i2) # Te3[s1] | ||
| 190 | lwr $t11,0($i3) # Te3[s2] | ||
| 191 | |||
| 192 | _xtr $i0,$s0,24-2 | ||
| 193 | _xtr $i1,$s1,24-2 | ||
| 194 | _xtr $i2,$s2,24-2 | ||
| 195 | _xtr $i3,$s3,24-2 | ||
| 196 | and $i0,0x3fc | ||
| 197 | and $i1,0x3fc | ||
| 198 | and $i2,0x3fc | ||
| 199 | and $i3,0x3fc | ||
| 200 | $PTR_ADD $i0,$Tbl | ||
| 201 | $PTR_ADD $i1,$Tbl | ||
| 202 | $PTR_ADD $i2,$Tbl | ||
| 203 | $PTR_ADD $i3,$Tbl | ||
| 204 | xor $t0,$t4 | ||
| 205 | xor $t1,$t5 | ||
| 206 | xor $t2,$t6 | ||
| 207 | xor $t3,$t7 | ||
| 208 | lw $t4,0($i0) # Te0[s0>>24] | ||
| 209 | lw $t5,0($i1) # Te0[s1>>24] | ||
| 210 | lw $t6,0($i2) # Te0[s2>>24] | ||
| 211 | lw $t7,0($i3) # Te0[s3>>24] | ||
| 212 | |||
| 213 | lw $s0,0($key0) | ||
| 214 | lw $s1,4($key0) | ||
| 215 | lw $s2,8($key0) | ||
| 216 | lw $s3,12($key0) | ||
| 217 | |||
| 218 | xor $t0,$t8 | ||
| 219 | xor $t1,$t9 | ||
| 220 | xor $t2,$t10 | ||
| 221 | xor $t3,$t11 | ||
| 222 | |||
| 223 | xor $t0,$t4 | ||
| 224 | xor $t1,$t5 | ||
| 225 | xor $t2,$t6 | ||
| 226 | xor $t3,$t7 | ||
| 227 | |||
| 228 | sub $cnt,1 | ||
| 229 | $PTR_ADD $key0,16 | ||
| 230 | xor $s0,$t0 | ||
| 231 | xor $s1,$t1 | ||
| 232 | xor $s2,$t2 | ||
| 233 | xor $s3,$t3 | ||
| 234 | .set noreorder | ||
| 235 | bnez $cnt,.Loop_enc | ||
| 236 | _xtr $i0,$s1,16-2 | ||
| 237 | |||
| 238 | .set reorder | ||
| 239 | _xtr $i1,$s2,16-2 | ||
| 240 | _xtr $i2,$s3,16-2 | ||
| 241 | _xtr $i3,$s0,16-2 | ||
| 242 | and $i0,0x3fc | ||
| 243 | and $i1,0x3fc | ||
| 244 | and $i2,0x3fc | ||
| 245 | and $i3,0x3fc | ||
| 246 | $PTR_ADD $i0,$Tbl | ||
| 247 | $PTR_ADD $i1,$Tbl | ||
| 248 | $PTR_ADD $i2,$Tbl | ||
| 249 | $PTR_ADD $i3,$Tbl | ||
| 250 | lbu $t0,2($i0) # Te4[s1>>16] | ||
| 251 | lbu $t1,2($i1) # Te4[s2>>16] | ||
| 252 | lbu $t2,2($i2) # Te4[s3>>16] | ||
| 253 | lbu $t3,2($i3) # Te4[s0>>16] | ||
| 254 | |||
| 255 | _xtr $i0,$s2,8-2 | ||
| 256 | _xtr $i1,$s3,8-2 | ||
| 257 | _xtr $i2,$s0,8-2 | ||
| 258 | _xtr $i3,$s1,8-2 | ||
| 259 | and $i0,0x3fc | ||
| 260 | and $i1,0x3fc | ||
| 261 | and $i2,0x3fc | ||
| 262 | and $i3,0x3fc | ||
| 263 | $PTR_ADD $i0,$Tbl | ||
| 264 | $PTR_ADD $i1,$Tbl | ||
| 265 | $PTR_ADD $i2,$Tbl | ||
| 266 | $PTR_ADD $i3,$Tbl | ||
| 267 | lbu $t4,2($i0) # Te4[s2>>8] | ||
| 268 | lbu $t5,2($i1) # Te4[s3>>8] | ||
| 269 | lbu $t6,2($i2) # Te4[s0>>8] | ||
| 270 | lbu $t7,2($i3) # Te4[s1>>8] | ||
| 271 | |||
| 272 | _xtr $i0,$s0,24-2 | ||
| 273 | _xtr $i1,$s1,24-2 | ||
| 274 | _xtr $i2,$s2,24-2 | ||
| 275 | _xtr $i3,$s3,24-2 | ||
| 276 | and $i0,0x3fc | ||
| 277 | and $i1,0x3fc | ||
| 278 | and $i2,0x3fc | ||
| 279 | and $i3,0x3fc | ||
| 280 | $PTR_ADD $i0,$Tbl | ||
| 281 | $PTR_ADD $i1,$Tbl | ||
| 282 | $PTR_ADD $i2,$Tbl | ||
| 283 | $PTR_ADD $i3,$Tbl | ||
| 284 | lbu $t8,2($i0) # Te4[s0>>24] | ||
| 285 | lbu $t9,2($i1) # Te4[s1>>24] | ||
| 286 | lbu $t10,2($i2) # Te4[s2>>24] | ||
| 287 | lbu $t11,2($i3) # Te4[s3>>24] | ||
| 288 | |||
| 289 | _xtr $i0,$s3,0-2 | ||
| 290 | _xtr $i1,$s0,0-2 | ||
| 291 | _xtr $i2,$s1,0-2 | ||
| 292 | _xtr $i3,$s2,0-2 | ||
| 293 | and $i0,0x3fc | ||
| 294 | and $i1,0x3fc | ||
| 295 | and $i2,0x3fc | ||
| 296 | and $i3,0x3fc | ||
| 297 | |||
| 298 | _ins $t0,16 | ||
| 299 | _ins $t1,16 | ||
| 300 | _ins $t2,16 | ||
| 301 | _ins $t3,16 | ||
| 302 | |||
| 303 | _ins $t4,8 | ||
| 304 | _ins $t5,8 | ||
| 305 | _ins $t6,8 | ||
| 306 | _ins $t7,8 | ||
| 307 | |||
| 308 | xor $t0,$t4 | ||
| 309 | xor $t1,$t5 | ||
| 310 | xor $t2,$t6 | ||
| 311 | xor $t3,$t7 | ||
| 312 | |||
| 313 | $PTR_ADD $i0,$Tbl | ||
| 314 | $PTR_ADD $i1,$Tbl | ||
| 315 | $PTR_ADD $i2,$Tbl | ||
| 316 | $PTR_ADD $i3,$Tbl | ||
| 317 | lbu $t4,2($i0) # Te4[s3] | ||
| 318 | lbu $t5,2($i1) # Te4[s0] | ||
| 319 | lbu $t6,2($i2) # Te4[s1] | ||
| 320 | lbu $t7,2($i3) # Te4[s2] | ||
| 321 | |||
| 322 | _ins $t8,24 | ||
| 323 | _ins $t9,24 | ||
| 324 | _ins $t10,24 | ||
| 325 | _ins $t11,24 | ||
| 326 | |||
| 327 | lw $s0,0($key0) | ||
| 328 | lw $s1,4($key0) | ||
| 329 | lw $s2,8($key0) | ||
| 330 | lw $s3,12($key0) | ||
| 331 | |||
| 332 | xor $t0,$t8 | ||
| 333 | xor $t1,$t9 | ||
| 334 | xor $t2,$t10 | ||
| 335 | xor $t3,$t11 | ||
| 336 | |||
| 337 | _ins $t4,0 | ||
| 338 | _ins $t5,0 | ||
| 339 | _ins $t6,0 | ||
| 340 | _ins $t7,0 | ||
| 341 | |||
| 342 | xor $t0,$t4 | ||
| 343 | xor $t1,$t5 | ||
| 344 | xor $t2,$t6 | ||
| 345 | xor $t3,$t7 | ||
| 346 | |||
| 347 | xor $s0,$t0 | ||
| 348 | xor $s1,$t1 | ||
| 349 | xor $s2,$t2 | ||
| 350 | xor $s3,$t3 | ||
| 351 | |||
| 352 | jr $ra | ||
| 353 | .end _mips_AES_encrypt | ||
| 354 | |||
| 355 | .align 5 | ||
| 356 | .globl AES_encrypt | ||
| 357 | .ent AES_encrypt | ||
| 358 | AES_encrypt: | ||
| 359 | .frame $sp,$FRAMESIZE,$ra | ||
| 360 | .mask $SAVED_REGS_MASK,-$SZREG | ||
| 361 | .set noreorder | ||
| 362 | ___ | ||
| 363 | $code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification | ||
| 364 | .cpload $pf | ||
| 365 | ___ | ||
| 366 | $code.=<<___; | ||
| 367 | $PTR_SUB $sp,$FRAMESIZE | ||
| 368 | $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 369 | $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 370 | $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) | ||
| 371 | $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) | ||
| 372 | $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) | ||
| 373 | $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) | ||
| 374 | $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) | ||
| 375 | $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) | ||
| 376 | $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) | ||
| 377 | $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) | ||
| 378 | ___ | ||
| 379 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
| 380 | $REG_S \$15,$FRAMESIZE-11*$SZREG($sp) | ||
| 381 | $REG_S \$14,$FRAMESIZE-12*$SZREG($sp) | ||
| 382 | $REG_S \$13,$FRAMESIZE-13*$SZREG($sp) | ||
| 383 | $REG_S \$12,$FRAMESIZE-14*$SZREG($sp) | ||
| 384 | $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 385 | ___ | ||
| 386 | $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification | ||
| 387 | .cplocal $Tbl | ||
| 388 | .cpsetup $pf,$zero,AES_encrypt | ||
| 389 | ___ | ||
| 390 | $code.=<<___; | ||
| 391 | .set reorder | ||
| 392 | la $Tbl,AES_Te # PIC-ified 'load address' | ||
| 393 | |||
| 394 | lwl $s0,0+$MSB($inp) | ||
| 395 | lwl $s1,4+$MSB($inp) | ||
| 396 | lwl $s2,8+$MSB($inp) | ||
| 397 | lwl $s3,12+$MSB($inp) | ||
| 398 | lwr $s0,0+$LSB($inp) | ||
| 399 | lwr $s1,4+$LSB($inp) | ||
| 400 | lwr $s2,8+$LSB($inp) | ||
| 401 | lwr $s3,12+$LSB($inp) | ||
| 402 | |||
| 403 | bal _mips_AES_encrypt | ||
| 404 | |||
| 405 | swr $s0,0+$LSB($out) | ||
| 406 | swr $s1,4+$LSB($out) | ||
| 407 | swr $s2,8+$LSB($out) | ||
| 408 | swr $s3,12+$LSB($out) | ||
| 409 | swl $s0,0+$MSB($out) | ||
| 410 | swl $s1,4+$MSB($out) | ||
| 411 | swl $s2,8+$MSB($out) | ||
| 412 | swl $s3,12+$MSB($out) | ||
| 413 | |||
| 414 | .set noreorder | ||
| 415 | $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 416 | $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 417 | $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) | ||
| 418 | $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) | ||
| 419 | $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) | ||
| 420 | $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) | ||
| 421 | $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) | ||
| 422 | $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) | ||
| 423 | $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) | ||
| 424 | $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) | ||
| 425 | ___ | ||
| 426 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 427 | $REG_L \$15,$FRAMESIZE-11*$SZREG($sp) | ||
| 428 | $REG_L \$14,$FRAMESIZE-12*$SZREG($sp) | ||
| 429 | $REG_L \$13,$FRAMESIZE-13*$SZREG($sp) | ||
| 430 | $REG_L \$12,$FRAMESIZE-14*$SZREG($sp) | ||
| 431 | $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 432 | ___ | ||
| 433 | $code.=<<___; | ||
| 434 | jr $ra | ||
| 435 | $PTR_ADD $sp,$FRAMESIZE | ||
| 436 | .end AES_encrypt | ||
| 437 | ___ | ||
| 438 | |||
| 439 | $code.=<<___; | ||
| 440 | .align 5 | ||
| 441 | .ent _mips_AES_decrypt | ||
| 442 | _mips_AES_decrypt: | ||
| 443 | .frame $sp,0,$ra | ||
| 444 | .set reorder | ||
| 445 | lw $t0,0($key) | ||
| 446 | lw $t1,4($key) | ||
| 447 | lw $t2,8($key) | ||
| 448 | lw $t3,12($key) | ||
| 449 | lw $cnt,240($key) | ||
| 450 | $PTR_ADD $key0,$key,16 | ||
| 451 | |||
| 452 | xor $s0,$t0 | ||
| 453 | xor $s1,$t1 | ||
| 454 | xor $s2,$t2 | ||
| 455 | xor $s3,$t3 | ||
| 456 | |||
| 457 | sub $cnt,1 | ||
| 458 | _xtr $i0,$s3,16-2 | ||
| 459 | .Loop_dec: | ||
| 460 | _xtr $i1,$s0,16-2 | ||
| 461 | _xtr $i2,$s1,16-2 | ||
| 462 | _xtr $i3,$s2,16-2 | ||
| 463 | and $i0,0x3fc | ||
| 464 | and $i1,0x3fc | ||
| 465 | and $i2,0x3fc | ||
| 466 | and $i3,0x3fc | ||
| 467 | $PTR_ADD $i0,$Tbl | ||
| 468 | $PTR_ADD $i1,$Tbl | ||
| 469 | $PTR_ADD $i2,$Tbl | ||
| 470 | $PTR_ADD $i3,$Tbl | ||
| 471 | lwl $t0,3($i0) # Td1[s3>>16] | ||
| 472 | lwl $t1,3($i1) # Td1[s0>>16] | ||
| 473 | lwl $t2,3($i2) # Td1[s1>>16] | ||
| 474 | lwl $t3,3($i3) # Td1[s2>>16] | ||
| 475 | lwr $t0,2($i0) # Td1[s3>>16] | ||
| 476 | lwr $t1,2($i1) # Td1[s0>>16] | ||
| 477 | lwr $t2,2($i2) # Td1[s1>>16] | ||
| 478 | lwr $t3,2($i3) # Td1[s2>>16] | ||
| 479 | |||
| 480 | _xtr $i0,$s2,8-2 | ||
| 481 | _xtr $i1,$s3,8-2 | ||
| 482 | _xtr $i2,$s0,8-2 | ||
| 483 | _xtr $i3,$s1,8-2 | ||
| 484 | and $i0,0x3fc | ||
| 485 | and $i1,0x3fc | ||
| 486 | and $i2,0x3fc | ||
| 487 | and $i3,0x3fc | ||
| 488 | $PTR_ADD $i0,$Tbl | ||
| 489 | $PTR_ADD $i1,$Tbl | ||
| 490 | $PTR_ADD $i2,$Tbl | ||
| 491 | $PTR_ADD $i3,$Tbl | ||
| 492 | lwl $t4,2($i0) # Td2[s2>>8] | ||
| 493 | lwl $t5,2($i1) # Td2[s3>>8] | ||
| 494 | lwl $t6,2($i2) # Td2[s0>>8] | ||
| 495 | lwl $t7,2($i3) # Td2[s1>>8] | ||
| 496 | lwr $t4,1($i0) # Td2[s2>>8] | ||
| 497 | lwr $t5,1($i1) # Td2[s3>>8] | ||
| 498 | lwr $t6,1($i2) # Td2[s0>>8] | ||
| 499 | lwr $t7,1($i3) # Td2[s1>>8] | ||
| 500 | |||
| 501 | _xtr $i0,$s1,0-2 | ||
| 502 | _xtr $i1,$s2,0-2 | ||
| 503 | _xtr $i2,$s3,0-2 | ||
| 504 | _xtr $i3,$s0,0-2 | ||
| 505 | and $i0,0x3fc | ||
| 506 | and $i1,0x3fc | ||
| 507 | and $i2,0x3fc | ||
| 508 | and $i3,0x3fc | ||
| 509 | $PTR_ADD $i0,$Tbl | ||
| 510 | $PTR_ADD $i1,$Tbl | ||
| 511 | $PTR_ADD $i2,$Tbl | ||
| 512 | $PTR_ADD $i3,$Tbl | ||
| 513 | lwl $t8,1($i0) # Td3[s1] | ||
| 514 | lwl $t9,1($i1) # Td3[s2] | ||
| 515 | lwl $t10,1($i2) # Td3[s3] | ||
| 516 | lwl $t11,1($i3) # Td3[s0] | ||
| 517 | lwr $t8,0($i0) # Td3[s1] | ||
| 518 | lwr $t9,0($i1) # Td3[s2] | ||
| 519 | lwr $t10,0($i2) # Td3[s3] | ||
| 520 | lwr $t11,0($i3) # Td3[s0] | ||
| 521 | |||
| 522 | _xtr $i0,$s0,24-2 | ||
| 523 | _xtr $i1,$s1,24-2 | ||
| 524 | _xtr $i2,$s2,24-2 | ||
| 525 | _xtr $i3,$s3,24-2 | ||
| 526 | and $i0,0x3fc | ||
| 527 | and $i1,0x3fc | ||
| 528 | and $i2,0x3fc | ||
| 529 | and $i3,0x3fc | ||
| 530 | $PTR_ADD $i0,$Tbl | ||
| 531 | $PTR_ADD $i1,$Tbl | ||
| 532 | $PTR_ADD $i2,$Tbl | ||
| 533 | $PTR_ADD $i3,$Tbl | ||
| 534 | |||
| 535 | xor $t0,$t4 | ||
| 536 | xor $t1,$t5 | ||
| 537 | xor $t2,$t6 | ||
| 538 | xor $t3,$t7 | ||
| 539 | |||
| 540 | |||
| 541 | lw $t4,0($i0) # Td0[s0>>24] | ||
| 542 | lw $t5,0($i1) # Td0[s1>>24] | ||
| 543 | lw $t6,0($i2) # Td0[s2>>24] | ||
| 544 | lw $t7,0($i3) # Td0[s3>>24] | ||
| 545 | |||
| 546 | lw $s0,0($key0) | ||
| 547 | lw $s1,4($key0) | ||
| 548 | lw $s2,8($key0) | ||
| 549 | lw $s3,12($key0) | ||
| 550 | |||
| 551 | xor $t0,$t8 | ||
| 552 | xor $t1,$t9 | ||
| 553 | xor $t2,$t10 | ||
| 554 | xor $t3,$t11 | ||
| 555 | |||
| 556 | xor $t0,$t4 | ||
| 557 | xor $t1,$t5 | ||
| 558 | xor $t2,$t6 | ||
| 559 | xor $t3,$t7 | ||
| 560 | |||
| 561 | sub $cnt,1 | ||
| 562 | $PTR_ADD $key0,16 | ||
| 563 | xor $s0,$t0 | ||
| 564 | xor $s1,$t1 | ||
| 565 | xor $s2,$t2 | ||
| 566 | xor $s3,$t3 | ||
| 567 | .set noreorder | ||
| 568 | bnez $cnt,.Loop_dec | ||
| 569 | _xtr $i0,$s3,16-2 | ||
| 570 | |||
| 571 | .set reorder | ||
| 572 | lw $t4,1024($Tbl) # prefetch Td4 | ||
| 573 | lw $t5,1024+32($Tbl) | ||
| 574 | lw $t6,1024+64($Tbl) | ||
| 575 | lw $t7,1024+96($Tbl) | ||
| 576 | lw $t8,1024+128($Tbl) | ||
| 577 | lw $t9,1024+160($Tbl) | ||
| 578 | lw $t10,1024+192($Tbl) | ||
| 579 | lw $t11,1024+224($Tbl) | ||
| 580 | |||
| 581 | _xtr $i0,$s3,16 | ||
| 582 | _xtr $i1,$s0,16 | ||
| 583 | _xtr $i2,$s1,16 | ||
| 584 | _xtr $i3,$s2,16 | ||
| 585 | and $i0,0xff | ||
| 586 | and $i1,0xff | ||
| 587 | and $i2,0xff | ||
| 588 | and $i3,0xff | ||
| 589 | $PTR_ADD $i0,$Tbl | ||
| 590 | $PTR_ADD $i1,$Tbl | ||
| 591 | $PTR_ADD $i2,$Tbl | ||
| 592 | $PTR_ADD $i3,$Tbl | ||
| 593 | lbu $t0,1024($i0) # Td4[s3>>16] | ||
| 594 | lbu $t1,1024($i1) # Td4[s0>>16] | ||
| 595 | lbu $t2,1024($i2) # Td4[s1>>16] | ||
| 596 | lbu $t3,1024($i3) # Td4[s2>>16] | ||
| 597 | |||
| 598 | _xtr $i0,$s2,8 | ||
| 599 | _xtr $i1,$s3,8 | ||
| 600 | _xtr $i2,$s0,8 | ||
| 601 | _xtr $i3,$s1,8 | ||
| 602 | and $i0,0xff | ||
| 603 | and $i1,0xff | ||
| 604 | and $i2,0xff | ||
| 605 | and $i3,0xff | ||
| 606 | $PTR_ADD $i0,$Tbl | ||
| 607 | $PTR_ADD $i1,$Tbl | ||
| 608 | $PTR_ADD $i2,$Tbl | ||
| 609 | $PTR_ADD $i3,$Tbl | ||
| 610 | lbu $t4,1024($i0) # Td4[s2>>8] | ||
| 611 | lbu $t5,1024($i1) # Td4[s3>>8] | ||
| 612 | lbu $t6,1024($i2) # Td4[s0>>8] | ||
| 613 | lbu $t7,1024($i3) # Td4[s1>>8] | ||
| 614 | |||
| 615 | _xtr $i0,$s0,24 | ||
| 616 | _xtr $i1,$s1,24 | ||
| 617 | _xtr $i2,$s2,24 | ||
| 618 | _xtr $i3,$s3,24 | ||
| 619 | $PTR_ADD $i0,$Tbl | ||
| 620 | $PTR_ADD $i1,$Tbl | ||
| 621 | $PTR_ADD $i2,$Tbl | ||
| 622 | $PTR_ADD $i3,$Tbl | ||
| 623 | lbu $t8,1024($i0) # Td4[s0>>24] | ||
| 624 | lbu $t9,1024($i1) # Td4[s1>>24] | ||
| 625 | lbu $t10,1024($i2) # Td4[s2>>24] | ||
| 626 | lbu $t11,1024($i3) # Td4[s3>>24] | ||
| 627 | |||
| 628 | _xtr $i0,$s1,0 | ||
| 629 | _xtr $i1,$s2,0 | ||
| 630 | _xtr $i2,$s3,0 | ||
| 631 | _xtr $i3,$s0,0 | ||
| 632 | |||
| 633 | _ins $t0,16 | ||
| 634 | _ins $t1,16 | ||
| 635 | _ins $t2,16 | ||
| 636 | _ins $t3,16 | ||
| 637 | |||
| 638 | _ins $t4,8 | ||
| 639 | _ins $t5,8 | ||
| 640 | _ins $t6,8 | ||
| 641 | _ins $t7,8 | ||
| 642 | |||
| 643 | xor $t0,$t4 | ||
| 644 | xor $t1,$t5 | ||
| 645 | xor $t2,$t6 | ||
| 646 | xor $t3,$t7 | ||
| 647 | |||
| 648 | $PTR_ADD $i0,$Tbl | ||
| 649 | $PTR_ADD $i1,$Tbl | ||
| 650 | $PTR_ADD $i2,$Tbl | ||
| 651 | $PTR_ADD $i3,$Tbl | ||
| 652 | lbu $t4,1024($i0) # Td4[s1] | ||
| 653 | lbu $t5,1024($i1) # Td4[s2] | ||
| 654 | lbu $t6,1024($i2) # Td4[s3] | ||
| 655 | lbu $t7,1024($i3) # Td4[s0] | ||
| 656 | |||
| 657 | _ins $t8,24 | ||
| 658 | _ins $t9,24 | ||
| 659 | _ins $t10,24 | ||
| 660 | _ins $t11,24 | ||
| 661 | |||
| 662 | lw $s0,0($key0) | ||
| 663 | lw $s1,4($key0) | ||
| 664 | lw $s2,8($key0) | ||
| 665 | lw $s3,12($key0) | ||
| 666 | |||
| 667 | _ins $t4,0 | ||
| 668 | _ins $t5,0 | ||
| 669 | _ins $t6,0 | ||
| 670 | _ins $t7,0 | ||
| 671 | |||
| 672 | |||
| 673 | xor $t0,$t8 | ||
| 674 | xor $t1,$t9 | ||
| 675 | xor $t2,$t10 | ||
| 676 | xor $t3,$t11 | ||
| 677 | |||
| 678 | xor $t0,$t4 | ||
| 679 | xor $t1,$t5 | ||
| 680 | xor $t2,$t6 | ||
| 681 | xor $t3,$t7 | ||
| 682 | |||
| 683 | xor $s0,$t0 | ||
| 684 | xor $s1,$t1 | ||
| 685 | xor $s2,$t2 | ||
| 686 | xor $s3,$t3 | ||
| 687 | |||
| 688 | jr $ra | ||
| 689 | .end _mips_AES_decrypt | ||
| 690 | |||
| 691 | .align 5 | ||
| 692 | .globl AES_decrypt | ||
| 693 | .ent AES_decrypt | ||
| 694 | AES_decrypt: | ||
| 695 | .frame $sp,$FRAMESIZE,$ra | ||
| 696 | .mask $SAVED_REGS_MASK,-$SZREG | ||
| 697 | .set noreorder | ||
| 698 | ___ | ||
| 699 | $code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification | ||
| 700 | .cpload $pf | ||
| 701 | ___ | ||
| 702 | $code.=<<___; | ||
| 703 | $PTR_SUB $sp,$FRAMESIZE | ||
| 704 | $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 705 | $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 706 | $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) | ||
| 707 | $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) | ||
| 708 | $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) | ||
| 709 | $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) | ||
| 710 | $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) | ||
| 711 | $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) | ||
| 712 | $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) | ||
| 713 | $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) | ||
| 714 | ___ | ||
| 715 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
| 716 | $REG_S \$15,$FRAMESIZE-11*$SZREG($sp) | ||
| 717 | $REG_S \$14,$FRAMESIZE-12*$SZREG($sp) | ||
| 718 | $REG_S \$13,$FRAMESIZE-13*$SZREG($sp) | ||
| 719 | $REG_S \$12,$FRAMESIZE-14*$SZREG($sp) | ||
| 720 | $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 721 | ___ | ||
| 722 | $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification | ||
| 723 | .cplocal $Tbl | ||
| 724 | .cpsetup $pf,$zero,AES_decrypt | ||
| 725 | ___ | ||
| 726 | $code.=<<___; | ||
| 727 | .set reorder | ||
| 728 | la $Tbl,AES_Td # PIC-ified 'load address' | ||
| 729 | |||
| 730 | lwl $s0,0+$MSB($inp) | ||
| 731 | lwl $s1,4+$MSB($inp) | ||
| 732 | lwl $s2,8+$MSB($inp) | ||
| 733 | lwl $s3,12+$MSB($inp) | ||
| 734 | lwr $s0,0+$LSB($inp) | ||
| 735 | lwr $s1,4+$LSB($inp) | ||
| 736 | lwr $s2,8+$LSB($inp) | ||
| 737 | lwr $s3,12+$LSB($inp) | ||
| 738 | |||
| 739 | bal _mips_AES_decrypt | ||
| 740 | |||
| 741 | swr $s0,0+$LSB($out) | ||
| 742 | swr $s1,4+$LSB($out) | ||
| 743 | swr $s2,8+$LSB($out) | ||
| 744 | swr $s3,12+$LSB($out) | ||
| 745 | swl $s0,0+$MSB($out) | ||
| 746 | swl $s1,4+$MSB($out) | ||
| 747 | swl $s2,8+$MSB($out) | ||
| 748 | swl $s3,12+$MSB($out) | ||
| 749 | |||
| 750 | .set noreorder | ||
| 751 | $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 752 | $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 753 | $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) | ||
| 754 | $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) | ||
| 755 | $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) | ||
| 756 | $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) | ||
| 757 | $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) | ||
| 758 | $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) | ||
| 759 | $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) | ||
| 760 | $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) | ||
| 761 | ___ | ||
| 762 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 763 | $REG_L \$15,$FRAMESIZE-11*$SZREG($sp) | ||
| 764 | $REG_L \$14,$FRAMESIZE-12*$SZREG($sp) | ||
| 765 | $REG_L \$13,$FRAMESIZE-13*$SZREG($sp) | ||
| 766 | $REG_L \$12,$FRAMESIZE-14*$SZREG($sp) | ||
| 767 | $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 768 | ___ | ||
| 769 | $code.=<<___; | ||
| 770 | jr $ra | ||
| 771 | $PTR_ADD $sp,$FRAMESIZE | ||
| 772 | .end AES_decrypt | ||
| 773 | ___ | ||
| 774 | }}} | ||
| 775 | |||
| 776 | {{{ | ||
| 777 | my $FRAMESIZE=8*$SZREG; | ||
| 778 | my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000; | ||
| 779 | |||
| 780 | my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3); | ||
| 781 | my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); | ||
| 782 | my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); | ||
| 783 | my ($rcon,$cnt)=($gp,$fp); | ||
| 784 | |||
| 785 | $code.=<<___; | ||
| 786 | .align 5 | ||
| 787 | .ent _mips_AES_set_encrypt_key | ||
| 788 | _mips_AES_set_encrypt_key: | ||
| 789 | .frame $sp,0,$ra | ||
| 790 | .set noreorder | ||
| 791 | beqz $inp,.Lekey_done | ||
| 792 | li $t0,-1 | ||
| 793 | beqz $key,.Lekey_done | ||
| 794 | $PTR_ADD $rcon,$Tbl,1024+256 | ||
| 795 | |||
| 796 | .set reorder | ||
| 797 | lwl $rk0,0+$MSB($inp) # load 128 bits | ||
| 798 | lwl $rk1,4+$MSB($inp) | ||
| 799 | lwl $rk2,8+$MSB($inp) | ||
| 800 | lwl $rk3,12+$MSB($inp) | ||
| 801 | li $at,128 | ||
| 802 | lwr $rk0,0+$LSB($inp) | ||
| 803 | lwr $rk1,4+$LSB($inp) | ||
| 804 | lwr $rk2,8+$LSB($inp) | ||
| 805 | lwr $rk3,12+$LSB($inp) | ||
| 806 | .set noreorder | ||
| 807 | beq $bits,$at,.L128bits | ||
| 808 | li $cnt,10 | ||
| 809 | |||
| 810 | .set reorder | ||
| 811 | lwl $rk4,16+$MSB($inp) # load 192 bits | ||
| 812 | lwl $rk5,20+$MSB($inp) | ||
| 813 | li $at,192 | ||
| 814 | lwr $rk4,16+$LSB($inp) | ||
| 815 | lwr $rk5,20+$LSB($inp) | ||
| 816 | .set noreorder | ||
| 817 | beq $bits,$at,.L192bits | ||
| 818 | li $cnt,8 | ||
| 819 | |||
| 820 | .set reorder | ||
| 821 | lwl $rk6,24+$MSB($inp) # load 256 bits | ||
| 822 | lwl $rk7,28+$MSB($inp) | ||
| 823 | li $at,256 | ||
| 824 | lwr $rk6,24+$LSB($inp) | ||
| 825 | lwr $rk7,28+$LSB($inp) | ||
| 826 | .set noreorder | ||
| 827 | beq $bits,$at,.L256bits | ||
| 828 | li $cnt,7 | ||
| 829 | |||
| 830 | b .Lekey_done | ||
| 831 | li $t0,-2 | ||
| 832 | |||
| 833 | .align 4 | ||
| 834 | .L128bits: | ||
| 835 | .set reorder | ||
| 836 | srl $i0,$rk3,16 | ||
| 837 | srl $i1,$rk3,8 | ||
| 838 | and $i0,0xff | ||
| 839 | and $i1,0xff | ||
| 840 | and $i2,$rk3,0xff | ||
| 841 | srl $i3,$rk3,24 | ||
| 842 | $PTR_ADD $i0,$Tbl | ||
| 843 | $PTR_ADD $i1,$Tbl | ||
| 844 | $PTR_ADD $i2,$Tbl | ||
| 845 | $PTR_ADD $i3,$Tbl | ||
| 846 | lbu $i0,1024($i0) | ||
| 847 | lbu $i1,1024($i1) | ||
| 848 | lbu $i2,1024($i2) | ||
| 849 | lbu $i3,1024($i3) | ||
| 850 | |||
| 851 | sw $rk0,0($key) | ||
| 852 | sw $rk1,4($key) | ||
| 853 | sw $rk2,8($key) | ||
| 854 | sw $rk3,12($key) | ||
| 855 | sub $cnt,1 | ||
| 856 | $PTR_ADD $key,16 | ||
| 857 | |||
| 858 | _bias $i0,24 | ||
| 859 | _bias $i1,16 | ||
| 860 | _bias $i2,8 | ||
| 861 | _bias $i3,0 | ||
| 862 | |||
| 863 | xor $rk0,$i0 | ||
| 864 | lw $i0,0($rcon) | ||
| 865 | xor $rk0,$i1 | ||
| 866 | xor $rk0,$i2 | ||
| 867 | xor $rk0,$i3 | ||
| 868 | xor $rk0,$i0 | ||
| 869 | |||
| 870 | xor $rk1,$rk0 | ||
| 871 | xor $rk2,$rk1 | ||
| 872 | xor $rk3,$rk2 | ||
| 873 | |||
| 874 | .set noreorder | ||
| 875 | bnez $cnt,.L128bits | ||
| 876 | $PTR_ADD $rcon,4 | ||
| 877 | |||
| 878 | sw $rk0,0($key) | ||
| 879 | sw $rk1,4($key) | ||
| 880 | sw $rk2,8($key) | ||
| 881 | li $cnt,10 | ||
| 882 | sw $rk3,12($key) | ||
| 883 | li $t0,0 | ||
| 884 | sw $cnt,80($key) | ||
| 885 | b .Lekey_done | ||
| 886 | $PTR_SUB $key,10*16 | ||
| 887 | |||
| 888 | .align 4 | ||
| 889 | .L192bits: | ||
| 890 | .set reorder | ||
| 891 | srl $i0,$rk5,16 | ||
| 892 | srl $i1,$rk5,8 | ||
| 893 | and $i0,0xff | ||
| 894 | and $i1,0xff | ||
| 895 | and $i2,$rk5,0xff | ||
| 896 | srl $i3,$rk5,24 | ||
| 897 | $PTR_ADD $i0,$Tbl | ||
| 898 | $PTR_ADD $i1,$Tbl | ||
| 899 | $PTR_ADD $i2,$Tbl | ||
| 900 | $PTR_ADD $i3,$Tbl | ||
| 901 | lbu $i0,1024($i0) | ||
| 902 | lbu $i1,1024($i1) | ||
| 903 | lbu $i2,1024($i2) | ||
| 904 | lbu $i3,1024($i3) | ||
| 905 | |||
| 906 | sw $rk0,0($key) | ||
| 907 | sw $rk1,4($key) | ||
| 908 | sw $rk2,8($key) | ||
| 909 | sw $rk3,12($key) | ||
| 910 | sw $rk4,16($key) | ||
| 911 | sw $rk5,20($key) | ||
| 912 | sub $cnt,1 | ||
| 913 | $PTR_ADD $key,24 | ||
| 914 | |||
| 915 | _bias $i0,24 | ||
| 916 | _bias $i1,16 | ||
| 917 | _bias $i2,8 | ||
| 918 | _bias $i3,0 | ||
| 919 | |||
| 920 | xor $rk0,$i0 | ||
| 921 | lw $i0,0($rcon) | ||
| 922 | xor $rk0,$i1 | ||
| 923 | xor $rk0,$i2 | ||
| 924 | xor $rk0,$i3 | ||
| 925 | xor $rk0,$i0 | ||
| 926 | |||
| 927 | xor $rk1,$rk0 | ||
| 928 | xor $rk2,$rk1 | ||
| 929 | xor $rk3,$rk2 | ||
| 930 | xor $rk4,$rk3 | ||
| 931 | xor $rk5,$rk4 | ||
| 932 | |||
| 933 | .set noreorder | ||
| 934 | bnez $cnt,.L192bits | ||
| 935 | $PTR_ADD $rcon,4 | ||
| 936 | |||
| 937 | sw $rk0,0($key) | ||
| 938 | sw $rk1,4($key) | ||
| 939 | sw $rk2,8($key) | ||
| 940 | li $cnt,12 | ||
| 941 | sw $rk3,12($key) | ||
| 942 | li $t0,0 | ||
| 943 | sw $cnt,48($key) | ||
| 944 | b .Lekey_done | ||
| 945 | $PTR_SUB $key,12*16 | ||
| 946 | |||
| 947 | .align 4 | ||
| 948 | .L256bits: | ||
| 949 | .set reorder | ||
| 950 | srl $i0,$rk7,16 | ||
| 951 | srl $i1,$rk7,8 | ||
| 952 | and $i0,0xff | ||
| 953 | and $i1,0xff | ||
| 954 | and $i2,$rk7,0xff | ||
| 955 | srl $i3,$rk7,24 | ||
| 956 | $PTR_ADD $i0,$Tbl | ||
| 957 | $PTR_ADD $i1,$Tbl | ||
| 958 | $PTR_ADD $i2,$Tbl | ||
| 959 | $PTR_ADD $i3,$Tbl | ||
| 960 | lbu $i0,1024($i0) | ||
| 961 | lbu $i1,1024($i1) | ||
| 962 | lbu $i2,1024($i2) | ||
| 963 | lbu $i3,1024($i3) | ||
| 964 | |||
| 965 | sw $rk0,0($key) | ||
| 966 | sw $rk1,4($key) | ||
| 967 | sw $rk2,8($key) | ||
| 968 | sw $rk3,12($key) | ||
| 969 | sw $rk4,16($key) | ||
| 970 | sw $rk5,20($key) | ||
| 971 | sw $rk6,24($key) | ||
| 972 | sw $rk7,28($key) | ||
| 973 | sub $cnt,1 | ||
| 974 | |||
| 975 | _bias $i0,24 | ||
| 976 | _bias $i1,16 | ||
| 977 | _bias $i2,8 | ||
| 978 | _bias $i3,0 | ||
| 979 | |||
| 980 | xor $rk0,$i0 | ||
| 981 | lw $i0,0($rcon) | ||
| 982 | xor $rk0,$i1 | ||
| 983 | xor $rk0,$i2 | ||
| 984 | xor $rk0,$i3 | ||
| 985 | xor $rk0,$i0 | ||
| 986 | |||
| 987 | xor $rk1,$rk0 | ||
| 988 | xor $rk2,$rk1 | ||
| 989 | xor $rk3,$rk2 | ||
| 990 | beqz $cnt,.L256bits_done | ||
| 991 | |||
| 992 | srl $i0,$rk3,24 | ||
| 993 | srl $i1,$rk3,16 | ||
| 994 | srl $i2,$rk3,8 | ||
| 995 | and $i3,$rk3,0xff | ||
| 996 | and $i1,0xff | ||
| 997 | and $i2,0xff | ||
| 998 | $PTR_ADD $i0,$Tbl | ||
| 999 | $PTR_ADD $i1,$Tbl | ||
| 1000 | $PTR_ADD $i2,$Tbl | ||
| 1001 | $PTR_ADD $i3,$Tbl | ||
| 1002 | lbu $i0,1024($i0) | ||
| 1003 | lbu $i1,1024($i1) | ||
| 1004 | lbu $i2,1024($i2) | ||
| 1005 | lbu $i3,1024($i3) | ||
| 1006 | sll $i0,24 | ||
| 1007 | sll $i1,16 | ||
| 1008 | sll $i2,8 | ||
| 1009 | |||
| 1010 | xor $rk4,$i0 | ||
| 1011 | xor $rk4,$i1 | ||
| 1012 | xor $rk4,$i2 | ||
| 1013 | xor $rk4,$i3 | ||
| 1014 | |||
| 1015 | xor $rk5,$rk4 | ||
| 1016 | xor $rk6,$rk5 | ||
| 1017 | xor $rk7,$rk6 | ||
| 1018 | |||
| 1019 | $PTR_ADD $key,32 | ||
| 1020 | .set noreorder | ||
| 1021 | b .L256bits | ||
| 1022 | $PTR_ADD $rcon,4 | ||
| 1023 | |||
| 1024 | .L256bits_done: | ||
| 1025 | sw $rk0,32($key) | ||
| 1026 | sw $rk1,36($key) | ||
| 1027 | sw $rk2,40($key) | ||
| 1028 | li $cnt,14 | ||
| 1029 | sw $rk3,44($key) | ||
| 1030 | li $t0,0 | ||
| 1031 | sw $cnt,48($key) | ||
| 1032 | $PTR_SUB $key,12*16 | ||
| 1033 | |||
| 1034 | .Lekey_done: | ||
| 1035 | jr $ra | ||
| 1036 | nop | ||
| 1037 | .end _mips_AES_set_encrypt_key | ||
| 1038 | |||
| 1039 | .globl AES_set_encrypt_key | ||
| 1040 | .ent AES_set_encrypt_key | ||
| 1041 | AES_set_encrypt_key: | ||
| 1042 | .frame $sp,$FRAMESIZE,$ra | ||
| 1043 | .mask $SAVED_REGS_MASK,-$SZREG | ||
| 1044 | .set noreorder | ||
| 1045 | ___ | ||
| 1046 | $code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification | ||
| 1047 | .cpload $pf | ||
| 1048 | ___ | ||
| 1049 | $code.=<<___; | ||
| 1050 | $PTR_SUB $sp,$FRAMESIZE | ||
| 1051 | $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 1052 | $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 1053 | ___ | ||
| 1054 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
| 1055 | $REG_S $s3,$FRAMESIZE-3*$SZREG($sp) | ||
| 1056 | $REG_S $s2,$FRAMESIZE-4*$SZREG($sp) | ||
| 1057 | $REG_S $s1,$FRAMESIZE-5*$SZREG($sp) | ||
| 1058 | $REG_S $s0,$FRAMESIZE-6*$SZREG($sp) | ||
| 1059 | $REG_S $gp,$FRAMESIZE-7*$SZREG($sp) | ||
| 1060 | ___ | ||
| 1061 | $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification | ||
| 1062 | .cplocal $Tbl | ||
| 1063 | .cpsetup $pf,$zero,AES_set_encrypt_key | ||
| 1064 | ___ | ||
| 1065 | $code.=<<___; | ||
| 1066 | .set reorder | ||
| 1067 | la $Tbl,AES_Te # PIC-ified 'load address' | ||
| 1068 | |||
| 1069 | bal _mips_AES_set_encrypt_key | ||
| 1070 | |||
| 1071 | .set noreorder | ||
| 1072 | move $a0,$t0 | ||
| 1073 | $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 1074 | $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 1075 | ___ | ||
| 1076 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 1077 | $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) | ||
| 1078 | $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) | ||
| 1079 | $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) | ||
| 1080 | $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) | ||
| 1081 | $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 1082 | ___ | ||
| 1083 | $code.=<<___; | ||
| 1084 | jr $ra | ||
| 1085 | $PTR_ADD $sp,$FRAMESIZE | ||
| 1086 | .end AES_set_encrypt_key | ||
| 1087 | ___ | ||
| 1088 | |||
| 1089 | my ($head,$tail)=($inp,$bits); | ||
| 1090 | my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); | ||
| 1091 | my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2); | ||
| 1092 | $code.=<<___; | ||
| 1093 | .align 5 | ||
| 1094 | .globl AES_set_decrypt_key | ||
| 1095 | .ent AES_set_decrypt_key | ||
| 1096 | AES_set_decrypt_key: | ||
| 1097 | .frame $sp,$FRAMESIZE,$ra | ||
| 1098 | .mask $SAVED_REGS_MASK,-$SZREG | ||
| 1099 | .set noreorder | ||
| 1100 | ___ | ||
| 1101 | $code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification | ||
| 1102 | .cpload $pf | ||
| 1103 | ___ | ||
| 1104 | $code.=<<___; | ||
| 1105 | $PTR_SUB $sp,$FRAMESIZE | ||
| 1106 | $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 1107 | $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 1108 | ___ | ||
| 1109 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
| 1110 | $REG_S $s3,$FRAMESIZE-3*$SZREG($sp) | ||
| 1111 | $REG_S $s2,$FRAMESIZE-4*$SZREG($sp) | ||
| 1112 | $REG_S $s1,$FRAMESIZE-5*$SZREG($sp) | ||
| 1113 | $REG_S $s0,$FRAMESIZE-6*$SZREG($sp) | ||
| 1114 | $REG_S $gp,$FRAMESIZE-7*$SZREG($sp) | ||
| 1115 | ___ | ||
| 1116 | $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification | ||
| 1117 | .cplocal $Tbl | ||
| 1118 | .cpsetup $pf,$zero,AES_set_decrypt_key | ||
| 1119 | ___ | ||
| 1120 | $code.=<<___; | ||
| 1121 | .set reorder | ||
| 1122 | la $Tbl,AES_Te # PIC-ified 'load address' | ||
| 1123 | |||
| 1124 | bal _mips_AES_set_encrypt_key | ||
| 1125 | |||
| 1126 | bltz $t0,.Ldkey_done | ||
| 1127 | |||
| 1128 | sll $at,$cnt,4 | ||
| 1129 | $PTR_ADD $head,$key,0 | ||
| 1130 | $PTR_ADD $tail,$key,$at | ||
| 1131 | .align 4 | ||
| 1132 | .Lswap: | ||
| 1133 | lw $rk0,0($head) | ||
| 1134 | lw $rk1,4($head) | ||
| 1135 | lw $rk2,8($head) | ||
| 1136 | lw $rk3,12($head) | ||
| 1137 | lw $rk4,0($tail) | ||
| 1138 | lw $rk5,4($tail) | ||
| 1139 | lw $rk6,8($tail) | ||
| 1140 | lw $rk7,12($tail) | ||
| 1141 | sw $rk0,0($tail) | ||
| 1142 | sw $rk1,4($tail) | ||
| 1143 | sw $rk2,8($tail) | ||
| 1144 | sw $rk3,12($tail) | ||
| 1145 | $PTR_ADD $head,16 | ||
| 1146 | $PTR_SUB $tail,16 | ||
| 1147 | sw $rk4,-16($head) | ||
| 1148 | sw $rk5,-12($head) | ||
| 1149 | sw $rk6,-8($head) | ||
| 1150 | sw $rk7,-4($head) | ||
| 1151 | bne $head,$tail,.Lswap | ||
| 1152 | |||
| 1153 | lw $tp1,16($key) # modulo-scheduled | ||
| 1154 | lui $x80808080,0x8080 | ||
| 1155 | sub $cnt,1 | ||
| 1156 | or $x80808080,0x8080 | ||
| 1157 | sll $cnt,2 | ||
| 1158 | $PTR_ADD $key,16 | ||
| 1159 | lui $x1b1b1b1b,0x1b1b | ||
| 1160 | nor $x7f7f7f7f,$zero,$x80808080 | ||
| 1161 | or $x1b1b1b1b,0x1b1b | ||
| 1162 | .align 4 | ||
| 1163 | .Lmix: | ||
| 1164 | and $m,$tp1,$x80808080 | ||
| 1165 | and $tp2,$tp1,$x7f7f7f7f | ||
| 1166 | srl $tp4,$m,7 | ||
| 1167 | addu $tp2,$tp2 # tp2<<1 | ||
| 1168 | subu $m,$tp4 | ||
| 1169 | and $m,$x1b1b1b1b | ||
| 1170 | xor $tp2,$m | ||
| 1171 | |||
| 1172 | and $m,$tp2,$x80808080 | ||
| 1173 | and $tp4,$tp2,$x7f7f7f7f | ||
| 1174 | srl $tp8,$m,7 | ||
| 1175 | addu $tp4,$tp4 # tp4<<1 | ||
| 1176 | subu $m,$tp8 | ||
| 1177 | and $m,$x1b1b1b1b | ||
| 1178 | xor $tp4,$m | ||
| 1179 | |||
| 1180 | and $m,$tp4,$x80808080 | ||
| 1181 | and $tp8,$tp4,$x7f7f7f7f | ||
| 1182 | srl $tp9,$m,7 | ||
| 1183 | addu $tp8,$tp8 # tp8<<1 | ||
| 1184 | subu $m,$tp9 | ||
| 1185 | and $m,$x1b1b1b1b | ||
| 1186 | xor $tp8,$m | ||
| 1187 | |||
| 1188 | xor $tp9,$tp8,$tp1 | ||
| 1189 | xor $tpe,$tp8,$tp4 | ||
| 1190 | xor $tpb,$tp9,$tp2 | ||
| 1191 | xor $tpd,$tp9,$tp4 | ||
| 1192 | |||
| 1193 | _ror $tp1,$tpd,16 | ||
| 1194 | xor $tpe,$tp2 | ||
| 1195 | _ror $tp2,$tpd,-16 | ||
| 1196 | xor $tpe,$tp1 | ||
| 1197 | _ror $tp1,$tp9,8 | ||
| 1198 | xor $tpe,$tp2 | ||
| 1199 | _ror $tp2,$tp9,-24 | ||
| 1200 | xor $tpe,$tp1 | ||
| 1201 | _ror $tp1,$tpb,24 | ||
| 1202 | xor $tpe,$tp2 | ||
| 1203 | _ror $tp2,$tpb,-8 | ||
| 1204 | xor $tpe,$tp1 | ||
| 1205 | lw $tp1,4($key) # modulo-scheduled | ||
| 1206 | xor $tpe,$tp2 | ||
| 1207 | sub $cnt,1 | ||
| 1208 | sw $tpe,0($key) | ||
| 1209 | $PTR_ADD $key,4 | ||
| 1210 | bnez $cnt,.Lmix | ||
| 1211 | |||
| 1212 | li $t0,0 | ||
| 1213 | .Ldkey_done: | ||
| 1214 | .set noreorder | ||
| 1215 | move $a0,$t0 | ||
| 1216 | $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 1217 | $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 1218 | ___ | ||
| 1219 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 1220 | $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) | ||
| 1221 | $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) | ||
| 1222 | $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) | ||
| 1223 | $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) | ||
| 1224 | $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 1225 | ___ | ||
| 1226 | $code.=<<___; | ||
| 1227 | jr $ra | ||
| 1228 | $PTR_ADD $sp,$FRAMESIZE | ||
| 1229 | .end AES_set_decrypt_key | ||
| 1230 | ___ | ||
| 1231 | }}} | ||
| 1232 | |||
| 1233 | ###################################################################### | ||
| 1234 | # Tables are kept in endian-neutral manner | ||
| 1235 | $code.=<<___; | ||
| 1236 | .rdata | ||
| 1237 | .align 6 | ||
| 1238 | AES_Te: | ||
| 1239 | .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0 | ||
| 1240 | .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d | ||
| 1241 | .byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd | ||
| 1242 | .byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 | ||
| 1243 | .byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 | ||
| 1244 | .byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d | ||
| 1245 | .byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 | ||
| 1246 | .byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a | ||
| 1247 | .byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d | ||
| 1248 | .byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 | ||
| 1249 | .byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb | ||
| 1250 | .byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b | ||
| 1251 | .byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 | ||
| 1252 | .byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea | ||
| 1253 | .byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 | ||
| 1254 | .byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b | ||
| 1255 | .byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c | ||
| 1256 | .byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a | ||
| 1257 | .byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 | ||
| 1258 | .byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f | ||
| 1259 | .byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 | ||
| 1260 | .byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 | ||
| 1261 | .byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 | ||
| 1262 | .byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f | ||
| 1263 | .byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 | ||
| 1264 | .byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e | ||
| 1265 | .byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 | ||
| 1266 | .byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 | ||
| 1267 | .byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 | ||
| 1268 | .byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d | ||
| 1269 | .byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 | ||
| 1270 | .byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f | ||
| 1271 | .byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e | ||
| 1272 | .byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e | ||
| 1273 | .byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 | ||
| 1274 | .byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb | ||
| 1275 | .byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d | ||
| 1276 | .byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce | ||
| 1277 | .byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e | ||
| 1278 | .byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 | ||
| 1279 | .byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 | ||
| 1280 | .byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c | ||
| 1281 | .byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f | ||
| 1282 | .byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed | ||
| 1283 | .byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 | ||
| 1284 | .byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b | ||
| 1285 | .byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 | ||
| 1286 | .byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a | ||
| 1287 | .byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a | ||
| 1288 | .byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 | ||
| 1289 | .byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 | ||
| 1290 | .byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 | ||
| 1291 | .byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 | ||
| 1292 | .byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 | ||
| 1293 | .byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 | ||
| 1294 | .byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 | ||
| 1295 | .byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe | ||
| 1296 | .byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a | ||
| 1297 | .byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc | ||
| 1298 | .byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 | ||
| 1299 | .byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 | ||
| 1300 | .byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 | ||
| 1301 | .byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a | ||
| 1302 | .byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d | ||
| 1303 | .byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 | ||
| 1304 | .byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f | ||
| 1305 | .byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 | ||
| 1306 | .byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 | ||
| 1307 | .byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 | ||
| 1308 | .byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 | ||
| 1309 | .byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 | ||
| 1310 | .byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 | ||
| 1311 | .byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 | ||
| 1312 | .byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f | ||
| 1313 | .byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e | ||
| 1314 | .byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 | ||
| 1315 | .byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 | ||
| 1316 | .byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c | ||
| 1317 | .byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 | ||
| 1318 | .byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 | ||
| 1319 | .byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 | ||
| 1320 | .byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e | ||
| 1321 | .byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a | ||
| 1322 | .byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 | ||
| 1323 | .byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e | ||
| 1324 | .byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 | ||
| 1325 | .byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 | ||
| 1326 | .byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b | ||
| 1327 | .byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 | ||
| 1328 | .byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 | ||
| 1329 | .byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 | ||
| 1330 | .byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 | ||
| 1331 | .byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa | ||
| 1332 | .byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 | ||
| 1333 | .byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e | ||
| 1334 | .byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 | ||
| 1335 | .byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 | ||
| 1336 | .byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 | ||
| 1337 | .byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 | ||
| 1338 | .byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 | ||
| 1339 | .byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c | ||
| 1340 | .byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 | ||
| 1341 | .byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc | ||
| 1342 | .byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 | ||
| 1343 | .byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 | ||
| 1344 | .byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa | ||
| 1345 | .byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 | ||
| 1346 | .byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 | ||
| 1347 | .byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f | ||
| 1348 | .byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 | ||
| 1349 | .byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 | ||
| 1350 | .byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 | ||
| 1351 | .byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 | ||
| 1352 | .byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 | ||
| 1353 | .byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 | ||
| 1354 | .byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 | ||
| 1355 | .byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 | ||
| 1356 | .byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 | ||
| 1357 | .byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff | ||
| 1358 | .byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a | ||
| 1359 | .byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 | ||
| 1360 | .byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 | ||
| 1361 | .byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 | ||
| 1362 | .byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 | ||
| 1363 | .byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 | ||
| 1364 | .byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 | ||
| 1365 | .byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc | ||
| 1366 | .byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a | ||
| 1367 | |||
| 1368 | .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 | ||
| 1369 | .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | ||
| 1370 | .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | ||
| 1371 | .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | ||
| 1372 | .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | ||
| 1373 | .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | ||
| 1374 | .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | ||
| 1375 | .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | ||
| 1376 | .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | ||
| 1377 | .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | ||
| 1378 | .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | ||
| 1379 | .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | ||
| 1380 | .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | ||
| 1381 | .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | ||
| 1382 | .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | ||
| 1383 | .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | ||
| 1384 | .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | ||
| 1385 | .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | ||
| 1386 | .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | ||
| 1387 | .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | ||
| 1388 | .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | ||
| 1389 | .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | ||
| 1390 | .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | ||
| 1391 | .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | ||
| 1392 | .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | ||
| 1393 | .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | ||
| 1394 | .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | ||
| 1395 | .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | ||
| 1396 | .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | ||
| 1397 | .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | ||
| 1398 | .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | ||
| 1399 | .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | ||
| 1400 | |||
| 1401 | .byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon | ||
| 1402 | .byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 | ||
| 1403 | .byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 | ||
| 1404 | .byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 | ||
| 1405 | .byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 | ||
| 1406 | |||
| 1407 | .align 6 | ||
| 1408 | AES_Td: | ||
| 1409 | .byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0 | ||
| 1410 | .byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 | ||
| 1411 | .byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 | ||
| 1412 | .byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 | ||
| 1413 | .byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 | ||
| 1414 | .byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 | ||
| 1415 | .byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 | ||
| 1416 | .byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f | ||
| 1417 | .byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 | ||
| 1418 | .byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 | ||
| 1419 | .byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 | ||
| 1420 | .byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 | ||
| 1421 | .byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 | ||
| 1422 | .byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda | ||
| 1423 | .byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 | ||
| 1424 | .byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 | ||
| 1425 | .byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 | ||
| 1426 | .byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd | ||
| 1427 | .byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 | ||
| 1428 | .byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 | ||
| 1429 | .byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 | ||
| 1430 | .byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 | ||
| 1431 | .byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 | ||
| 1432 | .byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 | ||
| 1433 | .byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 | ||
| 1434 | .byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 | ||
| 1435 | .byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 | ||
| 1436 | .byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a | ||
| 1437 | .byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 | ||
| 1438 | .byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 | ||
| 1439 | .byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 | ||
| 1440 | .byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c | ||
| 1441 | .byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 | ||
| 1442 | .byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 | ||
| 1443 | .byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 | ||
| 1444 | .byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a | ||
| 1445 | .byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 | ||
| 1446 | .byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 | ||
| 1447 | .byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa | ||
| 1448 | .byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 | ||
| 1449 | .byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d | ||
| 1450 | .byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 | ||
| 1451 | .byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 | ||
| 1452 | .byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff | ||
| 1453 | .byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 | ||
| 1454 | .byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 | ||
| 1455 | .byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 | ||
| 1456 | .byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb | ||
| 1457 | .byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 | ||
| 1458 | .byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 | ||
| 1459 | .byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 | ||
| 1460 | .byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e | ||
| 1461 | .byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 | ||
| 1462 | .byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 | ||
| 1463 | .byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 | ||
| 1464 | .byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a | ||
| 1465 | .byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f | ||
| 1466 | .byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e | ||
| 1467 | .byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 | ||
| 1468 | .byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 | ||
| 1469 | .byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 | ||
| 1470 | .byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d | ||
| 1471 | .byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad | ||
| 1472 | .byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 | ||
| 1473 | .byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c | ||
| 1474 | .byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd | ||
| 1475 | .byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc | ||
| 1476 | .byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 | ||
| 1477 | .byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc | ||
| 1478 | .byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 | ||
| 1479 | .byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 | ||
| 1480 | .byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 | ||
| 1481 | .byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 | ||
| 1482 | .byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d | ||
| 1483 | .byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 | ||
| 1484 | .byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 | ||
| 1485 | .byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 | ||
| 1486 | .byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 | ||
| 1487 | .byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a | ||
| 1488 | .byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef | ||
| 1489 | .byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 | ||
| 1490 | .byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 | ||
| 1491 | .byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 | ||
| 1492 | .byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 | ||
| 1493 | .byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d | ||
| 1494 | .byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 | ||
| 1495 | .byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 | ||
| 1496 | .byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 | ||
| 1497 | .byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c | ||
| 1498 | .byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 | ||
| 1499 | .byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 | ||
| 1500 | .byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b | ||
| 1501 | .byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 | ||
| 1502 | .byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 | ||
| 1503 | .byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e | ||
| 1504 | .byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 | ||
| 1505 | .byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce | ||
| 1506 | .byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 | ||
| 1507 | .byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 | ||
| 1508 | .byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 | ||
| 1509 | .byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 | ||
| 1510 | .byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 | ||
| 1511 | .byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 | ||
| 1512 | .byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f | ||
| 1513 | .byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d | ||
| 1514 | .byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf | ||
| 1515 | .byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b | ||
| 1516 | .byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f | ||
| 1517 | .byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d | ||
| 1518 | .byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e | ||
| 1519 | .byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 | ||
| 1520 | .byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 | ||
| 1521 | .byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a | ||
| 1522 | .byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 | ||
| 1523 | .byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 | ||
| 1524 | .byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c | ||
| 1525 | .byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f | ||
| 1526 | .byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf | ||
| 1527 | .byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b | ||
| 1528 | .byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 | ||
| 1529 | .byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e | ||
| 1530 | .byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f | ||
| 1531 | .byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c | ||
| 1532 | .byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 | ||
| 1533 | .byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde | ||
| 1534 | .byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 | ||
| 1535 | .byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 | ||
| 1536 | .byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 | ||
| 1537 | |||
| 1538 | .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4 | ||
| 1539 | .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | ||
| 1540 | .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | ||
| 1541 | .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | ||
| 1542 | .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | ||
| 1543 | .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | ||
| 1544 | .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | ||
| 1545 | .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | ||
| 1546 | .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | ||
| 1547 | .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | ||
| 1548 | .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | ||
| 1549 | .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | ||
| 1550 | .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | ||
| 1551 | .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | ||
| 1552 | .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | ||
| 1553 | .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | ||
| 1554 | .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | ||
| 1555 | .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | ||
| 1556 | .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | ||
| 1557 | .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | ||
| 1558 | .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | ||
| 1559 | .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | ||
| 1560 | .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | ||
| 1561 | .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | ||
| 1562 | .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | ||
| 1563 | .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | ||
| 1564 | .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | ||
| 1565 | .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | ||
| 1566 | .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | ||
| 1567 | .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | ||
| 1568 | .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | ||
| 1569 | .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | ||
| 1570 | ___ | ||
| 1571 | |||
| 1572 | foreach (split("\n",$code)) { | ||
| 1573 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 1574 | |||
| 1575 | # made-up _instructions, _xtr, _ins, _ror and _bias, cope | ||
| 1576 | # with byte order dependencies... | ||
| 1577 | if (/^\s+_/) { | ||
| 1578 | s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/; | ||
| 1579 | |||
| 1580 | s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/ | ||
| 1581 | sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) | ||
| 1582 | : eval("24-$3"))/e or | ||
| 1583 | s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ | ||
| 1584 | sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) | ||
| 1585 | : eval("24-$3"))/e or | ||
| 1586 | s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/ | ||
| 1587 | sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) | ||
| 1588 | : eval("$3*-1"))/e or | ||
| 1589 | s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ | ||
| 1590 | sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) | ||
| 1591 | : eval("($3-16)&31"))/e; | ||
| 1592 | |||
| 1593 | s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/ | ||
| 1594 | sprintf("sll\t$1,$2,$3")/e or | ||
| 1595 | s/srl\s+(\$[0-9]+),(\$[0-9]+),0/ | ||
| 1596 | sprintf("and\t$1,$2,0xff")/e or | ||
| 1597 | s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/; | ||
| 1598 | } | ||
| 1599 | |||
| 1600 | # convert lwl/lwr and swr/swl to little-endian order | ||
| 1601 | if (!$big_endian && /^\s+[sl]w[lr]\s+/) { | ||
| 1602 | s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/ | ||
| 1603 | sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or | ||
| 1604 | s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/ | ||
| 1605 | sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e; | ||
| 1606 | } | ||
| 1607 | |||
| 1608 | print $_,"\n"; | ||
| 1609 | } | ||
| 1610 | |||
| 1611 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/aes/asm/aes-parisc.pl b/src/lib/libcrypto/aes/asm/aes-parisc.pl new file mode 100644 index 0000000000..c36b6a2270 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aes-parisc.pl | |||
| @@ -0,0 +1,1021 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # AES for PA-RISC. | ||
| 11 | # | ||
| 12 | # June 2009. | ||
| 13 | # | ||
| 14 | # The module is mechanical transliteration of aes-sparcv9.pl, but with | ||
| 15 | # a twist: S-boxes are compressed even further down to 1K+256B. On | ||
| 16 | # PA-7100LC performance is ~40% better than gcc 3.2 generated code and | ||
| 17 | # is about 33 cycles per byte processed with 128-bit key. Newer CPUs | ||
| 18 | # perform at 16 cycles per byte. It's not faster than code generated | ||
| 19 | # by vendor compiler, but recall that it has compressed S-boxes, which | ||
| 20 | # requires extra processing. | ||
| 21 | # | ||
| 22 | # Special thanks to polarhome.com for providing HP-UX account. | ||
| 23 | |||
| 24 | $flavour = shift; | ||
| 25 | $output = shift; | ||
| 26 | open STDOUT,">$output"; | ||
| 27 | |||
| 28 | if ($flavour =~ /64/) { | ||
| 29 | $LEVEL ="2.0W"; | ||
| 30 | $SIZE_T =8; | ||
| 31 | $FRAME_MARKER =80; | ||
| 32 | $SAVED_RP =16; | ||
| 33 | $PUSH ="std"; | ||
| 34 | $PUSHMA ="std,ma"; | ||
| 35 | $POP ="ldd"; | ||
| 36 | $POPMB ="ldd,mb"; | ||
| 37 | } else { | ||
| 38 | $LEVEL ="1.0"; | ||
| 39 | $SIZE_T =4; | ||
| 40 | $FRAME_MARKER =48; | ||
| 41 | $SAVED_RP =20; | ||
| 42 | $PUSH ="stw"; | ||
| 43 | $PUSHMA ="stwm"; | ||
| 44 | $POP ="ldw"; | ||
| 45 | $POPMB ="ldwm"; | ||
| 46 | } | ||
| 47 | |||
| 48 | $FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker | ||
| 49 | # [+ argument transfer] | ||
| 50 | $inp="%r26"; # arg0 | ||
| 51 | $out="%r25"; # arg1 | ||
| 52 | $key="%r24"; # arg2 | ||
| 53 | |||
| 54 | ($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4"); | ||
| 55 | ($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8"); | ||
| 56 | |||
| 57 | ($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7, | ||
| 58 | $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) = | ||
| 59 | ("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16", | ||
| 60 | "%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26"); | ||
| 61 | |||
| 62 | $tbl="%r28"; | ||
| 63 | $rounds="%r29"; | ||
| 64 | |||
| 65 | $code=<<___; | ||
| 66 | .LEVEL $LEVEL | ||
| 67 | .SPACE \$TEXT\$ | ||
| 68 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 69 | |||
| 70 | .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
| 71 | .ALIGN 64 | ||
| 72 | AES_encrypt | ||
| 73 | .PROC | ||
| 74 | .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 | ||
| 75 | .ENTRY | ||
| 76 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 77 | $PUSHMA %r3,$FRAME(%sp) | ||
| 78 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 79 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 80 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 81 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 82 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 83 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 84 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 85 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
| 86 | $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) | ||
| 87 | $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) | ||
| 88 | $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) | ||
| 89 | $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) | ||
| 90 | $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) | ||
| 91 | $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) | ||
| 92 | $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) | ||
| 93 | |||
| 94 | blr %r0,$tbl | ||
| 95 | ldi 3,$t0 | ||
| 96 | L\$enc_pic | ||
| 97 | andcm $tbl,$t0,$tbl | ||
| 98 | ldo L\$AES_Te-L\$enc_pic($tbl),$tbl | ||
| 99 | |||
| 100 | and $inp,$t0,$t0 | ||
| 101 | sub $inp,$t0,$inp | ||
| 102 | ldw 0($inp),$s0 | ||
| 103 | ldw 4($inp),$s1 | ||
| 104 | ldw 8($inp),$s2 | ||
| 105 | comib,= 0,$t0,L\$enc_inp_aligned | ||
| 106 | ldw 12($inp),$s3 | ||
| 107 | |||
| 108 | sh3addl $t0,%r0,$t0 | ||
| 109 | subi 32,$t0,$t0 | ||
| 110 | mtctl $t0,%cr11 | ||
| 111 | ldw 16($inp),$t1 | ||
| 112 | vshd $s0,$s1,$s0 | ||
| 113 | vshd $s1,$s2,$s1 | ||
| 114 | vshd $s2,$s3,$s2 | ||
| 115 | vshd $s3,$t1,$s3 | ||
| 116 | |||
| 117 | L\$enc_inp_aligned | ||
| 118 | bl _parisc_AES_encrypt,%r31 | ||
| 119 | nop | ||
| 120 | |||
| 121 | extru,<> $out,31,2,%r0 | ||
| 122 | b L\$enc_out_aligned | ||
| 123 | nop | ||
| 124 | |||
| 125 | _srm $s0,24,$acc0 | ||
| 126 | _srm $s0,16,$acc1 | ||
| 127 | stb $acc0,0($out) | ||
| 128 | _srm $s0,8,$acc2 | ||
| 129 | stb $acc1,1($out) | ||
| 130 | _srm $s1,24,$acc4 | ||
| 131 | stb $acc2,2($out) | ||
| 132 | _srm $s1,16,$acc5 | ||
| 133 | stb $s0,3($out) | ||
| 134 | _srm $s1,8,$acc6 | ||
| 135 | stb $acc4,4($out) | ||
| 136 | _srm $s2,24,$acc0 | ||
| 137 | stb $acc5,5($out) | ||
| 138 | _srm $s2,16,$acc1 | ||
| 139 | stb $acc6,6($out) | ||
| 140 | _srm $s2,8,$acc2 | ||
| 141 | stb $s1,7($out) | ||
| 142 | _srm $s3,24,$acc4 | ||
| 143 | stb $acc0,8($out) | ||
| 144 | _srm $s3,16,$acc5 | ||
| 145 | stb $acc1,9($out) | ||
| 146 | _srm $s3,8,$acc6 | ||
| 147 | stb $acc2,10($out) | ||
| 148 | stb $s2,11($out) | ||
| 149 | stb $acc4,12($out) | ||
| 150 | stb $acc5,13($out) | ||
| 151 | stb $acc6,14($out) | ||
| 152 | b L\$enc_done | ||
| 153 | stb $s3,15($out) | ||
| 154 | |||
| 155 | L\$enc_out_aligned | ||
| 156 | stw $s0,0($out) | ||
| 157 | stw $s1,4($out) | ||
| 158 | stw $s2,8($out) | ||
| 159 | stw $s3,12($out) | ||
| 160 | |||
| 161 | L\$enc_done | ||
| 162 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 163 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 164 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 165 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 166 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 167 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 168 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 169 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 170 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
| 171 | $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 | ||
| 172 | $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 | ||
| 173 | $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 | ||
| 174 | $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 | ||
| 175 | $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 | ||
| 176 | $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 | ||
| 177 | $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 | ||
| 178 | bv (%r2) | ||
| 179 | .EXIT | ||
| 180 | $POPMB -$FRAME(%sp),%r3 | ||
| 181 | .PROCEND | ||
| 182 | |||
| 183 | .ALIGN 16 | ||
| 184 | _parisc_AES_encrypt | ||
| 185 | .PROC | ||
| 186 | .CALLINFO MILLICODE | ||
| 187 | .ENTRY | ||
| 188 | ldw 240($key),$rounds | ||
| 189 | ldw 0($key),$t0 | ||
| 190 | ldw 4($key),$t1 | ||
| 191 | ldw 8($key),$t2 | ||
| 192 | _srm $rounds,1,$rounds | ||
| 193 | xor $t0,$s0,$s0 | ||
| 194 | ldw 12($key),$t3 | ||
| 195 | _srm $s0,24,$acc0 | ||
| 196 | xor $t1,$s1,$s1 | ||
| 197 | ldw 16($key),$t0 | ||
| 198 | _srm $s1,16,$acc1 | ||
| 199 | xor $t2,$s2,$s2 | ||
| 200 | ldw 20($key),$t1 | ||
| 201 | xor $t3,$s3,$s3 | ||
| 202 | ldw 24($key),$t2 | ||
| 203 | ldw 28($key),$t3 | ||
| 204 | L\$enc_loop | ||
| 205 | _srm $s2,8,$acc2 | ||
| 206 | ldwx,s $acc0($tbl),$acc0 | ||
| 207 | _srm $s3,0,$acc3 | ||
| 208 | ldwx,s $acc1($tbl),$acc1 | ||
| 209 | _srm $s1,24,$acc4 | ||
| 210 | ldwx,s $acc2($tbl),$acc2 | ||
| 211 | _srm $s2,16,$acc5 | ||
| 212 | ldwx,s $acc3($tbl),$acc3 | ||
| 213 | _srm $s3,8,$acc6 | ||
| 214 | ldwx,s $acc4($tbl),$acc4 | ||
| 215 | _srm $s0,0,$acc7 | ||
| 216 | ldwx,s $acc5($tbl),$acc5 | ||
| 217 | _srm $s2,24,$acc8 | ||
| 218 | ldwx,s $acc6($tbl),$acc6 | ||
| 219 | _srm $s3,16,$acc9 | ||
| 220 | ldwx,s $acc7($tbl),$acc7 | ||
| 221 | _srm $s0,8,$acc10 | ||
| 222 | ldwx,s $acc8($tbl),$acc8 | ||
| 223 | _srm $s1,0,$acc11 | ||
| 224 | ldwx,s $acc9($tbl),$acc9 | ||
| 225 | _srm $s3,24,$acc12 | ||
| 226 | ldwx,s $acc10($tbl),$acc10 | ||
| 227 | _srm $s0,16,$acc13 | ||
| 228 | ldwx,s $acc11($tbl),$acc11 | ||
| 229 | _srm $s1,8,$acc14 | ||
| 230 | ldwx,s $acc12($tbl),$acc12 | ||
| 231 | _srm $s2,0,$acc15 | ||
| 232 | ldwx,s $acc13($tbl),$acc13 | ||
| 233 | ldwx,s $acc14($tbl),$acc14 | ||
| 234 | ldwx,s $acc15($tbl),$acc15 | ||
| 235 | addib,= -1,$rounds,L\$enc_last | ||
| 236 | ldo 32($key),$key | ||
| 237 | |||
| 238 | _ror $acc1,8,$acc1 | ||
| 239 | xor $acc0,$t0,$t0 | ||
| 240 | ldw 0($key),$s0 | ||
| 241 | _ror $acc2,16,$acc2 | ||
| 242 | xor $acc1,$t0,$t0 | ||
| 243 | ldw 4($key),$s1 | ||
| 244 | _ror $acc3,24,$acc3 | ||
| 245 | xor $acc2,$t0,$t0 | ||
| 246 | ldw 8($key),$s2 | ||
| 247 | _ror $acc5,8,$acc5 | ||
| 248 | xor $acc3,$t0,$t0 | ||
| 249 | ldw 12($key),$s3 | ||
| 250 | _ror $acc6,16,$acc6 | ||
| 251 | xor $acc4,$t1,$t1 | ||
| 252 | _ror $acc7,24,$acc7 | ||
| 253 | xor $acc5,$t1,$t1 | ||
| 254 | _ror $acc9,8,$acc9 | ||
| 255 | xor $acc6,$t1,$t1 | ||
| 256 | _ror $acc10,16,$acc10 | ||
| 257 | xor $acc7,$t1,$t1 | ||
| 258 | _ror $acc11,24,$acc11 | ||
| 259 | xor $acc8,$t2,$t2 | ||
| 260 | _ror $acc13,8,$acc13 | ||
| 261 | xor $acc9,$t2,$t2 | ||
| 262 | _ror $acc14,16,$acc14 | ||
| 263 | xor $acc10,$t2,$t2 | ||
| 264 | _ror $acc15,24,$acc15 | ||
| 265 | xor $acc11,$t2,$t2 | ||
| 266 | xor $acc12,$acc14,$acc14 | ||
| 267 | xor $acc13,$t3,$t3 | ||
| 268 | _srm $t0,24,$acc0 | ||
| 269 | xor $acc14,$t3,$t3 | ||
| 270 | _srm $t1,16,$acc1 | ||
| 271 | xor $acc15,$t3,$t3 | ||
| 272 | |||
| 273 | _srm $t2,8,$acc2 | ||
| 274 | ldwx,s $acc0($tbl),$acc0 | ||
| 275 | _srm $t3,0,$acc3 | ||
| 276 | ldwx,s $acc1($tbl),$acc1 | ||
| 277 | _srm $t1,24,$acc4 | ||
| 278 | ldwx,s $acc2($tbl),$acc2 | ||
| 279 | _srm $t2,16,$acc5 | ||
| 280 | ldwx,s $acc3($tbl),$acc3 | ||
| 281 | _srm $t3,8,$acc6 | ||
| 282 | ldwx,s $acc4($tbl),$acc4 | ||
| 283 | _srm $t0,0,$acc7 | ||
| 284 | ldwx,s $acc5($tbl),$acc5 | ||
| 285 | _srm $t2,24,$acc8 | ||
| 286 | ldwx,s $acc6($tbl),$acc6 | ||
| 287 | _srm $t3,16,$acc9 | ||
| 288 | ldwx,s $acc7($tbl),$acc7 | ||
| 289 | _srm $t0,8,$acc10 | ||
| 290 | ldwx,s $acc8($tbl),$acc8 | ||
| 291 | _srm $t1,0,$acc11 | ||
| 292 | ldwx,s $acc9($tbl),$acc9 | ||
| 293 | _srm $t3,24,$acc12 | ||
| 294 | ldwx,s $acc10($tbl),$acc10 | ||
| 295 | _srm $t0,16,$acc13 | ||
| 296 | ldwx,s $acc11($tbl),$acc11 | ||
| 297 | _srm $t1,8,$acc14 | ||
| 298 | ldwx,s $acc12($tbl),$acc12 | ||
| 299 | _srm $t2,0,$acc15 | ||
| 300 | ldwx,s $acc13($tbl),$acc13 | ||
| 301 | _ror $acc1,8,$acc1 | ||
| 302 | ldwx,s $acc14($tbl),$acc14 | ||
| 303 | |||
| 304 | _ror $acc2,16,$acc2 | ||
| 305 | xor $acc0,$s0,$s0 | ||
| 306 | ldwx,s $acc15($tbl),$acc15 | ||
| 307 | _ror $acc3,24,$acc3 | ||
| 308 | xor $acc1,$s0,$s0 | ||
| 309 | ldw 16($key),$t0 | ||
| 310 | _ror $acc5,8,$acc5 | ||
| 311 | xor $acc2,$s0,$s0 | ||
| 312 | ldw 20($key),$t1 | ||
| 313 | _ror $acc6,16,$acc6 | ||
| 314 | xor $acc3,$s0,$s0 | ||
| 315 | ldw 24($key),$t2 | ||
| 316 | _ror $acc7,24,$acc7 | ||
| 317 | xor $acc4,$s1,$s1 | ||
| 318 | ldw 28($key),$t3 | ||
| 319 | _ror $acc9,8,$acc9 | ||
| 320 | xor $acc5,$s1,$s1 | ||
| 321 | ldw 1024+0($tbl),%r0 ; prefetch te4 | ||
| 322 | _ror $acc10,16,$acc10 | ||
| 323 | xor $acc6,$s1,$s1 | ||
| 324 | ldw 1024+32($tbl),%r0 ; prefetch te4 | ||
| 325 | _ror $acc11,24,$acc11 | ||
| 326 | xor $acc7,$s1,$s1 | ||
| 327 | ldw 1024+64($tbl),%r0 ; prefetch te4 | ||
| 328 | _ror $acc13,8,$acc13 | ||
| 329 | xor $acc8,$s2,$s2 | ||
| 330 | ldw 1024+96($tbl),%r0 ; prefetch te4 | ||
| 331 | _ror $acc14,16,$acc14 | ||
| 332 | xor $acc9,$s2,$s2 | ||
| 333 | ldw 1024+128($tbl),%r0 ; prefetch te4 | ||
| 334 | _ror $acc15,24,$acc15 | ||
| 335 | xor $acc10,$s2,$s2 | ||
| 336 | ldw 1024+160($tbl),%r0 ; prefetch te4 | ||
| 337 | _srm $s0,24,$acc0 | ||
| 338 | xor $acc11,$s2,$s2 | ||
| 339 | ldw 1024+192($tbl),%r0 ; prefetch te4 | ||
| 340 | xor $acc12,$acc14,$acc14 | ||
| 341 | xor $acc13,$s3,$s3 | ||
| 342 | ldw 1024+224($tbl),%r0 ; prefetch te4 | ||
| 343 | _srm $s1,16,$acc1 | ||
| 344 | xor $acc14,$s3,$s3 | ||
| 345 | b L\$enc_loop | ||
| 346 | xor $acc15,$s3,$s3 | ||
| 347 | |||
| 348 | .ALIGN 16 | ||
| 349 | L\$enc_last | ||
| 350 | ldo 1024($tbl),$rounds | ||
| 351 | _ror $acc1,8,$acc1 | ||
| 352 | xor $acc0,$t0,$t0 | ||
| 353 | ldw 0($key),$s0 | ||
| 354 | _ror $acc2,16,$acc2 | ||
| 355 | xor $acc1,$t0,$t0 | ||
| 356 | ldw 4($key),$s1 | ||
| 357 | _ror $acc3,24,$acc3 | ||
| 358 | xor $acc2,$t0,$t0 | ||
| 359 | ldw 8($key),$s2 | ||
| 360 | _ror $acc5,8,$acc5 | ||
| 361 | xor $acc3,$t0,$t0 | ||
| 362 | ldw 12($key),$s3 | ||
| 363 | _ror $acc6,16,$acc6 | ||
| 364 | xor $acc4,$t1,$t1 | ||
| 365 | _ror $acc7,24,$acc7 | ||
| 366 | xor $acc5,$t1,$t1 | ||
| 367 | _ror $acc9,8,$acc9 | ||
| 368 | xor $acc6,$t1,$t1 | ||
| 369 | _ror $acc10,16,$acc10 | ||
| 370 | xor $acc7,$t1,$t1 | ||
| 371 | _ror $acc11,24,$acc11 | ||
| 372 | xor $acc8,$t2,$t2 | ||
| 373 | _ror $acc13,8,$acc13 | ||
| 374 | xor $acc9,$t2,$t2 | ||
| 375 | _ror $acc14,16,$acc14 | ||
| 376 | xor $acc10,$t2,$t2 | ||
| 377 | _ror $acc15,24,$acc15 | ||
| 378 | xor $acc11,$t2,$t2 | ||
| 379 | xor $acc12,$acc14,$acc14 | ||
| 380 | xor $acc13,$t3,$t3 | ||
| 381 | _srm $t0,24,$acc0 | ||
| 382 | xor $acc14,$t3,$t3 | ||
| 383 | _srm $t1,16,$acc1 | ||
| 384 | xor $acc15,$t3,$t3 | ||
| 385 | |||
| 386 | _srm $t2,8,$acc2 | ||
| 387 | ldbx $acc0($rounds),$acc0 | ||
| 388 | _srm $t1,24,$acc4 | ||
| 389 | ldbx $acc1($rounds),$acc1 | ||
| 390 | _srm $t2,16,$acc5 | ||
| 391 | _srm $t3,0,$acc3 | ||
| 392 | ldbx $acc2($rounds),$acc2 | ||
| 393 | ldbx $acc3($rounds),$acc3 | ||
| 394 | _srm $t3,8,$acc6 | ||
| 395 | ldbx $acc4($rounds),$acc4 | ||
| 396 | _srm $t2,24,$acc8 | ||
| 397 | ldbx $acc5($rounds),$acc5 | ||
| 398 | _srm $t3,16,$acc9 | ||
| 399 | _srm $t0,0,$acc7 | ||
| 400 | ldbx $acc6($rounds),$acc6 | ||
| 401 | ldbx $acc7($rounds),$acc7 | ||
| 402 | _srm $t0,8,$acc10 | ||
| 403 | ldbx $acc8($rounds),$acc8 | ||
| 404 | _srm $t3,24,$acc12 | ||
| 405 | ldbx $acc9($rounds),$acc9 | ||
| 406 | _srm $t0,16,$acc13 | ||
| 407 | _srm $t1,0,$acc11 | ||
| 408 | ldbx $acc10($rounds),$acc10 | ||
| 409 | _srm $t1,8,$acc14 | ||
| 410 | ldbx $acc11($rounds),$acc11 | ||
| 411 | ldbx $acc12($rounds),$acc12 | ||
| 412 | ldbx $acc13($rounds),$acc13 | ||
| 413 | _srm $t2,0,$acc15 | ||
| 414 | ldbx $acc14($rounds),$acc14 | ||
| 415 | |||
| 416 | dep $acc0,7,8,$acc3 | ||
| 417 | ldbx $acc15($rounds),$acc15 | ||
| 418 | dep $acc4,7,8,$acc7 | ||
| 419 | dep $acc1,15,8,$acc3 | ||
| 420 | dep $acc5,15,8,$acc7 | ||
| 421 | dep $acc2,23,8,$acc3 | ||
| 422 | dep $acc6,23,8,$acc7 | ||
| 423 | xor $acc3,$s0,$s0 | ||
| 424 | xor $acc7,$s1,$s1 | ||
| 425 | dep $acc8,7,8,$acc11 | ||
| 426 | dep $acc12,7,8,$acc15 | ||
| 427 | dep $acc9,15,8,$acc11 | ||
| 428 | dep $acc13,15,8,$acc15 | ||
| 429 | dep $acc10,23,8,$acc11 | ||
| 430 | dep $acc14,23,8,$acc15 | ||
| 431 | xor $acc11,$s2,$s2 | ||
| 432 | |||
| 433 | bv (%r31) | ||
| 434 | .EXIT | ||
| 435 | xor $acc15,$s3,$s3 | ||
| 436 | .PROCEND | ||
| 437 | |||
| 438 | .ALIGN 64 | ||
| 439 | L\$AES_Te | ||
| 440 | .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d | ||
| 441 | .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 | ||
| 442 | .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d | ||
| 443 | .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a | ||
| 444 | .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 | ||
| 445 | .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b | ||
| 446 | .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea | ||
| 447 | .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b | ||
| 448 | .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a | ||
| 449 | .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f | ||
| 450 | .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 | ||
| 451 | .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f | ||
| 452 | .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e | ||
| 453 | .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 | ||
| 454 | .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d | ||
| 455 | .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f | ||
| 456 | .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e | ||
| 457 | .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb | ||
| 458 | .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce | ||
| 459 | .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 | ||
| 460 | .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c | ||
| 461 | .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed | ||
| 462 | .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b | ||
| 463 | .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a | ||
| 464 | .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 | ||
| 465 | .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 | ||
| 466 | .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 | ||
| 467 | .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 | ||
| 468 | .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a | ||
| 469 | .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 | ||
| 470 | .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 | ||
| 471 | .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d | ||
| 472 | .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f | ||
| 473 | .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 | ||
| 474 | .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 | ||
| 475 | .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 | ||
| 476 | .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f | ||
| 477 | .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 | ||
| 478 | .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c | ||
| 479 | .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 | ||
| 480 | .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e | ||
| 481 | .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 | ||
| 482 | .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 | ||
| 483 | .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b | ||
| 484 | .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 | ||
| 485 | .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 | ||
| 486 | .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 | ||
| 487 | .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 | ||
| 488 | .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 | ||
| 489 | .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 | ||
| 490 | .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 | ||
| 491 | .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 | ||
| 492 | .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa | ||
| 493 | .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 | ||
| 494 | .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 | ||
| 495 | .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 | ||
| 496 | .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 | ||
| 497 | .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 | ||
| 498 | .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 | ||
| 499 | .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a | ||
| 500 | .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 | ||
| 501 | .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 | ||
| 502 | .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 | ||
| 503 | .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a | ||
| 504 | .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 | ||
| 505 | .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | ||
| 506 | .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | ||
| 507 | .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | ||
| 508 | .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | ||
| 509 | .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | ||
| 510 | .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | ||
| 511 | .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | ||
| 512 | .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | ||
| 513 | .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | ||
| 514 | .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | ||
| 515 | .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | ||
| 516 | .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | ||
| 517 | .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | ||
| 518 | .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | ||
| 519 | .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | ||
| 520 | .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | ||
| 521 | .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | ||
| 522 | .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | ||
| 523 | .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | ||
| 524 | .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | ||
| 525 | .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | ||
| 526 | .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | ||
| 527 | .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | ||
| 528 | .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | ||
| 529 | .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | ||
| 530 | .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | ||
| 531 | .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | ||
| 532 | .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | ||
| 533 | .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | ||
| 534 | .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | ||
| 535 | .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | ||
| 536 | ___ | ||
| 537 | |||
| 538 | $code.=<<___; | ||
| 539 | .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
| 540 | .ALIGN 16 | ||
| 541 | AES_decrypt | ||
| 542 | .PROC | ||
| 543 | .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 | ||
| 544 | .ENTRY | ||
| 545 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 546 | $PUSHMA %r3,$FRAME(%sp) | ||
| 547 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 548 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 549 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 550 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 551 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 552 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 553 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 554 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
| 555 | $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) | ||
| 556 | $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) | ||
| 557 | $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) | ||
| 558 | $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) | ||
| 559 | $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) | ||
| 560 | $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) | ||
| 561 | $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) | ||
| 562 | |||
| 563 | blr %r0,$tbl | ||
| 564 | ldi 3,$t0 | ||
| 565 | L\$dec_pic | ||
| 566 | andcm $tbl,$t0,$tbl | ||
| 567 | ldo L\$AES_Td-L\$dec_pic($tbl),$tbl | ||
| 568 | |||
| 569 | and $inp,$t0,$t0 | ||
| 570 | sub $inp,$t0,$inp | ||
| 571 | ldw 0($inp),$s0 | ||
| 572 | ldw 4($inp),$s1 | ||
| 573 | ldw 8($inp),$s2 | ||
| 574 | comib,= 0,$t0,L\$dec_inp_aligned | ||
| 575 | ldw 12($inp),$s3 | ||
| 576 | |||
| 577 | sh3addl $t0,%r0,$t0 | ||
| 578 | subi 32,$t0,$t0 | ||
| 579 | mtctl $t0,%cr11 | ||
| 580 | ldw 16($inp),$t1 | ||
| 581 | vshd $s0,$s1,$s0 | ||
| 582 | vshd $s1,$s2,$s1 | ||
| 583 | vshd $s2,$s3,$s2 | ||
| 584 | vshd $s3,$t1,$s3 | ||
| 585 | |||
| 586 | L\$dec_inp_aligned | ||
| 587 | bl _parisc_AES_decrypt,%r31 | ||
| 588 | nop | ||
| 589 | |||
| 590 | extru,<> $out,31,2,%r0 | ||
| 591 | b L\$dec_out_aligned | ||
| 592 | nop | ||
| 593 | |||
| 594 | _srm $s0,24,$acc0 | ||
| 595 | _srm $s0,16,$acc1 | ||
| 596 | stb $acc0,0($out) | ||
| 597 | _srm $s0,8,$acc2 | ||
| 598 | stb $acc1,1($out) | ||
| 599 | _srm $s1,24,$acc4 | ||
| 600 | stb $acc2,2($out) | ||
| 601 | _srm $s1,16,$acc5 | ||
| 602 | stb $s0,3($out) | ||
| 603 | _srm $s1,8,$acc6 | ||
| 604 | stb $acc4,4($out) | ||
| 605 | _srm $s2,24,$acc0 | ||
| 606 | stb $acc5,5($out) | ||
| 607 | _srm $s2,16,$acc1 | ||
| 608 | stb $acc6,6($out) | ||
| 609 | _srm $s2,8,$acc2 | ||
| 610 | stb $s1,7($out) | ||
| 611 | _srm $s3,24,$acc4 | ||
| 612 | stb $acc0,8($out) | ||
| 613 | _srm $s3,16,$acc5 | ||
| 614 | stb $acc1,9($out) | ||
| 615 | _srm $s3,8,$acc6 | ||
| 616 | stb $acc2,10($out) | ||
| 617 | stb $s2,11($out) | ||
| 618 | stb $acc4,12($out) | ||
| 619 | stb $acc5,13($out) | ||
| 620 | stb $acc6,14($out) | ||
| 621 | b L\$dec_done | ||
| 622 | stb $s3,15($out) | ||
| 623 | |||
| 624 | L\$dec_out_aligned | ||
| 625 | stw $s0,0($out) | ||
| 626 | stw $s1,4($out) | ||
| 627 | stw $s2,8($out) | ||
| 628 | stw $s3,12($out) | ||
| 629 | |||
| 630 | L\$dec_done | ||
| 631 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 632 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 633 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 634 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 635 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 636 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 637 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 638 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 639 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
| 640 | $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 | ||
| 641 | $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 | ||
| 642 | $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 | ||
| 643 | $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 | ||
| 644 | $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 | ||
| 645 | $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 | ||
| 646 | $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 | ||
| 647 | bv (%r2) | ||
| 648 | .EXIT | ||
| 649 | $POPMB -$FRAME(%sp),%r3 | ||
| 650 | .PROCEND | ||
| 651 | |||
| 652 | .ALIGN 16 | ||
| 653 | _parisc_AES_decrypt | ||
| 654 | .PROC | ||
| 655 | .CALLINFO MILLICODE | ||
| 656 | .ENTRY | ||
| 657 | ldw 240($key),$rounds | ||
| 658 | ldw 0($key),$t0 | ||
| 659 | ldw 4($key),$t1 | ||
| 660 | ldw 8($key),$t2 | ||
| 661 | ldw 12($key),$t3 | ||
| 662 | _srm $rounds,1,$rounds | ||
| 663 | xor $t0,$s0,$s0 | ||
| 664 | ldw 16($key),$t0 | ||
| 665 | xor $t1,$s1,$s1 | ||
| 666 | ldw 20($key),$t1 | ||
| 667 | _srm $s0,24,$acc0 | ||
| 668 | xor $t2,$s2,$s2 | ||
| 669 | ldw 24($key),$t2 | ||
| 670 | xor $t3,$s3,$s3 | ||
| 671 | ldw 28($key),$t3 | ||
| 672 | _srm $s3,16,$acc1 | ||
| 673 | L\$dec_loop | ||
| 674 | _srm $s2,8,$acc2 | ||
| 675 | ldwx,s $acc0($tbl),$acc0 | ||
| 676 | _srm $s1,0,$acc3 | ||
| 677 | ldwx,s $acc1($tbl),$acc1 | ||
| 678 | _srm $s1,24,$acc4 | ||
| 679 | ldwx,s $acc2($tbl),$acc2 | ||
| 680 | _srm $s0,16,$acc5 | ||
| 681 | ldwx,s $acc3($tbl),$acc3 | ||
| 682 | _srm $s3,8,$acc6 | ||
| 683 | ldwx,s $acc4($tbl),$acc4 | ||
| 684 | _srm $s2,0,$acc7 | ||
| 685 | ldwx,s $acc5($tbl),$acc5 | ||
| 686 | _srm $s2,24,$acc8 | ||
| 687 | ldwx,s $acc6($tbl),$acc6 | ||
| 688 | _srm $s1,16,$acc9 | ||
| 689 | ldwx,s $acc7($tbl),$acc7 | ||
| 690 | _srm $s0,8,$acc10 | ||
| 691 | ldwx,s $acc8($tbl),$acc8 | ||
| 692 | _srm $s3,0,$acc11 | ||
| 693 | ldwx,s $acc9($tbl),$acc9 | ||
| 694 | _srm $s3,24,$acc12 | ||
| 695 | ldwx,s $acc10($tbl),$acc10 | ||
| 696 | _srm $s2,16,$acc13 | ||
| 697 | ldwx,s $acc11($tbl),$acc11 | ||
| 698 | _srm $s1,8,$acc14 | ||
| 699 | ldwx,s $acc12($tbl),$acc12 | ||
| 700 | _srm $s0,0,$acc15 | ||
| 701 | ldwx,s $acc13($tbl),$acc13 | ||
| 702 | ldwx,s $acc14($tbl),$acc14 | ||
| 703 | ldwx,s $acc15($tbl),$acc15 | ||
| 704 | addib,= -1,$rounds,L\$dec_last | ||
| 705 | ldo 32($key),$key | ||
| 706 | |||
| 707 | _ror $acc1,8,$acc1 | ||
| 708 | xor $acc0,$t0,$t0 | ||
| 709 | ldw 0($key),$s0 | ||
| 710 | _ror $acc2,16,$acc2 | ||
| 711 | xor $acc1,$t0,$t0 | ||
| 712 | ldw 4($key),$s1 | ||
| 713 | _ror $acc3,24,$acc3 | ||
| 714 | xor $acc2,$t0,$t0 | ||
| 715 | ldw 8($key),$s2 | ||
| 716 | _ror $acc5,8,$acc5 | ||
| 717 | xor $acc3,$t0,$t0 | ||
| 718 | ldw 12($key),$s3 | ||
| 719 | _ror $acc6,16,$acc6 | ||
| 720 | xor $acc4,$t1,$t1 | ||
| 721 | _ror $acc7,24,$acc7 | ||
| 722 | xor $acc5,$t1,$t1 | ||
| 723 | _ror $acc9,8,$acc9 | ||
| 724 | xor $acc6,$t1,$t1 | ||
| 725 | _ror $acc10,16,$acc10 | ||
| 726 | xor $acc7,$t1,$t1 | ||
| 727 | _ror $acc11,24,$acc11 | ||
| 728 | xor $acc8,$t2,$t2 | ||
| 729 | _ror $acc13,8,$acc13 | ||
| 730 | xor $acc9,$t2,$t2 | ||
| 731 | _ror $acc14,16,$acc14 | ||
| 732 | xor $acc10,$t2,$t2 | ||
| 733 | _ror $acc15,24,$acc15 | ||
| 734 | xor $acc11,$t2,$t2 | ||
| 735 | xor $acc12,$acc14,$acc14 | ||
| 736 | xor $acc13,$t3,$t3 | ||
| 737 | _srm $t0,24,$acc0 | ||
| 738 | xor $acc14,$t3,$t3 | ||
| 739 | xor $acc15,$t3,$t3 | ||
| 740 | _srm $t3,16,$acc1 | ||
| 741 | |||
| 742 | _srm $t2,8,$acc2 | ||
| 743 | ldwx,s $acc0($tbl),$acc0 | ||
| 744 | _srm $t1,0,$acc3 | ||
| 745 | ldwx,s $acc1($tbl),$acc1 | ||
| 746 | _srm $t1,24,$acc4 | ||
| 747 | ldwx,s $acc2($tbl),$acc2 | ||
| 748 | _srm $t0,16,$acc5 | ||
| 749 | ldwx,s $acc3($tbl),$acc3 | ||
| 750 | _srm $t3,8,$acc6 | ||
| 751 | ldwx,s $acc4($tbl),$acc4 | ||
| 752 | _srm $t2,0,$acc7 | ||
| 753 | ldwx,s $acc5($tbl),$acc5 | ||
| 754 | _srm $t2,24,$acc8 | ||
| 755 | ldwx,s $acc6($tbl),$acc6 | ||
| 756 | _srm $t1,16,$acc9 | ||
| 757 | ldwx,s $acc7($tbl),$acc7 | ||
| 758 | _srm $t0,8,$acc10 | ||
| 759 | ldwx,s $acc8($tbl),$acc8 | ||
| 760 | _srm $t3,0,$acc11 | ||
| 761 | ldwx,s $acc9($tbl),$acc9 | ||
| 762 | _srm $t3,24,$acc12 | ||
| 763 | ldwx,s $acc10($tbl),$acc10 | ||
| 764 | _srm $t2,16,$acc13 | ||
| 765 | ldwx,s $acc11($tbl),$acc11 | ||
| 766 | _srm $t1,8,$acc14 | ||
| 767 | ldwx,s $acc12($tbl),$acc12 | ||
| 768 | _srm $t0,0,$acc15 | ||
| 769 | ldwx,s $acc13($tbl),$acc13 | ||
| 770 | _ror $acc1,8,$acc1 | ||
| 771 | ldwx,s $acc14($tbl),$acc14 | ||
| 772 | |||
| 773 | _ror $acc2,16,$acc2 | ||
| 774 | xor $acc0,$s0,$s0 | ||
| 775 | ldwx,s $acc15($tbl),$acc15 | ||
| 776 | _ror $acc3,24,$acc3 | ||
| 777 | xor $acc1,$s0,$s0 | ||
| 778 | ldw 16($key),$t0 | ||
| 779 | _ror $acc5,8,$acc5 | ||
| 780 | xor $acc2,$s0,$s0 | ||
| 781 | ldw 20($key),$t1 | ||
| 782 | _ror $acc6,16,$acc6 | ||
| 783 | xor $acc3,$s0,$s0 | ||
| 784 | ldw 24($key),$t2 | ||
| 785 | _ror $acc7,24,$acc7 | ||
| 786 | xor $acc4,$s1,$s1 | ||
| 787 | ldw 28($key),$t3 | ||
| 788 | _ror $acc9,8,$acc9 | ||
| 789 | xor $acc5,$s1,$s1 | ||
| 790 | ldw 1024+0($tbl),%r0 ; prefetch td4 | ||
| 791 | _ror $acc10,16,$acc10 | ||
| 792 | xor $acc6,$s1,$s1 | ||
| 793 | ldw 1024+32($tbl),%r0 ; prefetch td4 | ||
| 794 | _ror $acc11,24,$acc11 | ||
| 795 | xor $acc7,$s1,$s1 | ||
| 796 | ldw 1024+64($tbl),%r0 ; prefetch td4 | ||
| 797 | _ror $acc13,8,$acc13 | ||
| 798 | xor $acc8,$s2,$s2 | ||
| 799 | ldw 1024+96($tbl),%r0 ; prefetch td4 | ||
| 800 | _ror $acc14,16,$acc14 | ||
| 801 | xor $acc9,$s2,$s2 | ||
| 802 | ldw 1024+128($tbl),%r0 ; prefetch td4 | ||
| 803 | _ror $acc15,24,$acc15 | ||
| 804 | xor $acc10,$s2,$s2 | ||
| 805 | ldw 1024+160($tbl),%r0 ; prefetch td4 | ||
| 806 | _srm $s0,24,$acc0 | ||
| 807 | xor $acc11,$s2,$s2 | ||
| 808 | ldw 1024+192($tbl),%r0 ; prefetch td4 | ||
| 809 | xor $acc12,$acc14,$acc14 | ||
| 810 | xor $acc13,$s3,$s3 | ||
| 811 | ldw 1024+224($tbl),%r0 ; prefetch td4 | ||
| 812 | xor $acc14,$s3,$s3 | ||
| 813 | xor $acc15,$s3,$s3 | ||
| 814 | b L\$dec_loop | ||
| 815 | _srm $s3,16,$acc1 | ||
| 816 | |||
| 817 | .ALIGN 16 | ||
| 818 | L\$dec_last | ||
| 819 | ldo 1024($tbl),$rounds | ||
| 820 | _ror $acc1,8,$acc1 | ||
| 821 | xor $acc0,$t0,$t0 | ||
| 822 | ldw 0($key),$s0 | ||
| 823 | _ror $acc2,16,$acc2 | ||
| 824 | xor $acc1,$t0,$t0 | ||
| 825 | ldw 4($key),$s1 | ||
| 826 | _ror $acc3,24,$acc3 | ||
| 827 | xor $acc2,$t0,$t0 | ||
| 828 | ldw 8($key),$s2 | ||
| 829 | _ror $acc5,8,$acc5 | ||
| 830 | xor $acc3,$t0,$t0 | ||
| 831 | ldw 12($key),$s3 | ||
| 832 | _ror $acc6,16,$acc6 | ||
| 833 | xor $acc4,$t1,$t1 | ||
| 834 | _ror $acc7,24,$acc7 | ||
| 835 | xor $acc5,$t1,$t1 | ||
| 836 | _ror $acc9,8,$acc9 | ||
| 837 | xor $acc6,$t1,$t1 | ||
| 838 | _ror $acc10,16,$acc10 | ||
| 839 | xor $acc7,$t1,$t1 | ||
| 840 | _ror $acc11,24,$acc11 | ||
| 841 | xor $acc8,$t2,$t2 | ||
| 842 | _ror $acc13,8,$acc13 | ||
| 843 | xor $acc9,$t2,$t2 | ||
| 844 | _ror $acc14,16,$acc14 | ||
| 845 | xor $acc10,$t2,$t2 | ||
| 846 | _ror $acc15,24,$acc15 | ||
| 847 | xor $acc11,$t2,$t2 | ||
| 848 | xor $acc12,$acc14,$acc14 | ||
| 849 | xor $acc13,$t3,$t3 | ||
| 850 | _srm $t0,24,$acc0 | ||
| 851 | xor $acc14,$t3,$t3 | ||
| 852 | xor $acc15,$t3,$t3 | ||
| 853 | _srm $t3,16,$acc1 | ||
| 854 | |||
| 855 | _srm $t2,8,$acc2 | ||
| 856 | ldbx $acc0($rounds),$acc0 | ||
| 857 | _srm $t1,24,$acc4 | ||
| 858 | ldbx $acc1($rounds),$acc1 | ||
| 859 | _srm $t0,16,$acc5 | ||
| 860 | _srm $t1,0,$acc3 | ||
| 861 | ldbx $acc2($rounds),$acc2 | ||
| 862 | ldbx $acc3($rounds),$acc3 | ||
| 863 | _srm $t3,8,$acc6 | ||
| 864 | ldbx $acc4($rounds),$acc4 | ||
| 865 | _srm $t2,24,$acc8 | ||
| 866 | ldbx $acc5($rounds),$acc5 | ||
| 867 | _srm $t1,16,$acc9 | ||
| 868 | _srm $t2,0,$acc7 | ||
| 869 | ldbx $acc6($rounds),$acc6 | ||
| 870 | ldbx $acc7($rounds),$acc7 | ||
| 871 | _srm $t0,8,$acc10 | ||
| 872 | ldbx $acc8($rounds),$acc8 | ||
| 873 | _srm $t3,24,$acc12 | ||
| 874 | ldbx $acc9($rounds),$acc9 | ||
| 875 | _srm $t2,16,$acc13 | ||
| 876 | _srm $t3,0,$acc11 | ||
| 877 | ldbx $acc10($rounds),$acc10 | ||
| 878 | _srm $t1,8,$acc14 | ||
| 879 | ldbx $acc11($rounds),$acc11 | ||
| 880 | ldbx $acc12($rounds),$acc12 | ||
| 881 | ldbx $acc13($rounds),$acc13 | ||
| 882 | _srm $t0,0,$acc15 | ||
| 883 | ldbx $acc14($rounds),$acc14 | ||
| 884 | |||
| 885 | dep $acc0,7,8,$acc3 | ||
| 886 | ldbx $acc15($rounds),$acc15 | ||
| 887 | dep $acc4,7,8,$acc7 | ||
| 888 | dep $acc1,15,8,$acc3 | ||
| 889 | dep $acc5,15,8,$acc7 | ||
| 890 | dep $acc2,23,8,$acc3 | ||
| 891 | dep $acc6,23,8,$acc7 | ||
| 892 | xor $acc3,$s0,$s0 | ||
| 893 | xor $acc7,$s1,$s1 | ||
| 894 | dep $acc8,7,8,$acc11 | ||
| 895 | dep $acc12,7,8,$acc15 | ||
| 896 | dep $acc9,15,8,$acc11 | ||
| 897 | dep $acc13,15,8,$acc15 | ||
| 898 | dep $acc10,23,8,$acc11 | ||
| 899 | dep $acc14,23,8,$acc15 | ||
| 900 | xor $acc11,$s2,$s2 | ||
| 901 | |||
| 902 | bv (%r31) | ||
| 903 | .EXIT | ||
| 904 | xor $acc15,$s3,$s3 | ||
| 905 | .PROCEND | ||
| 906 | |||
| 907 | .ALIGN 64 | ||
| 908 | L\$AES_Td | ||
| 909 | .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 | ||
| 910 | .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 | ||
| 911 | .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 | ||
| 912 | .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f | ||
| 913 | .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 | ||
| 914 | .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 | ||
| 915 | .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da | ||
| 916 | .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 | ||
| 917 | .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd | ||
| 918 | .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 | ||
| 919 | .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 | ||
| 920 | .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 | ||
| 921 | .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 | ||
| 922 | .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a | ||
| 923 | .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 | ||
| 924 | .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c | ||
| 925 | .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 | ||
| 926 | .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a | ||
| 927 | .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 | ||
| 928 | .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 | ||
| 929 | .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 | ||
| 930 | .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff | ||
| 931 | .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 | ||
| 932 | .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb | ||
| 933 | .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 | ||
| 934 | .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e | ||
| 935 | .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 | ||
| 936 | .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a | ||
| 937 | .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e | ||
| 938 | .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 | ||
| 939 | .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d | ||
| 940 | .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 | ||
| 941 | .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd | ||
| 942 | .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 | ||
| 943 | .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 | ||
| 944 | .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 | ||
| 945 | .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d | ||
| 946 | .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 | ||
| 947 | .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 | ||
| 948 | .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef | ||
| 949 | .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 | ||
| 950 | .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 | ||
| 951 | .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 | ||
| 952 | .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 | ||
| 953 | .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 | ||
| 954 | .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b | ||
| 955 | .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 | ||
| 956 | .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 | ||
| 957 | .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 | ||
| 958 | .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 | ||
| 959 | .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 | ||
| 960 | .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f | ||
| 961 | .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df | ||
| 962 | .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f | ||
| 963 | .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e | ||
| 964 | .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 | ||
| 965 | .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 | ||
| 966 | .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c | ||
| 967 | .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf | ||
| 968 | .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 | ||
| 969 | .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f | ||
| 970 | .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 | ||
| 971 | .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 | ||
| 972 | .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 | ||
| 973 | .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 | ||
| 974 | .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | ||
| 975 | .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | ||
| 976 | .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | ||
| 977 | .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | ||
| 978 | .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | ||
| 979 | .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | ||
| 980 | .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | ||
| 981 | .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | ||
| 982 | .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | ||
| 983 | .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | ||
| 984 | .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | ||
| 985 | .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | ||
| 986 | .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | ||
| 987 | .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | ||
| 988 | .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | ||
| 989 | .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | ||
| 990 | .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | ||
| 991 | .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | ||
| 992 | .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | ||
| 993 | .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | ||
| 994 | .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | ||
| 995 | .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | ||
| 996 | .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | ||
| 997 | .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | ||
| 998 | .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | ||
| 999 | .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | ||
| 1000 | .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | ||
| 1001 | .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | ||
| 1002 | .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | ||
| 1003 | .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | ||
| 1004 | .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | ||
| 1005 | .STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 1006 | ___ | ||
| 1007 | |||
| 1008 | foreach (split("\n",$code)) { | ||
| 1009 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 1010 | |||
| 1011 | # translate made up instructons: _ror, _srm | ||
| 1012 | s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or | ||
| 1013 | |||
| 1014 | s/_srm(\s+%r[0-9]+),([0-9]+),/ | ||
| 1015 | $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2) | ||
| 1016 | : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e; | ||
| 1017 | |||
| 1018 | s/,\*/,/ if ($SIZE_T==4); | ||
| 1019 | print $_,"\n"; | ||
| 1020 | } | ||
| 1021 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl index f82c5e1814..7c52cbe5f9 100644 --- a/src/lib/libcrypto/aes/asm/aes-ppc.pl +++ b/src/lib/libcrypto/aes/asm/aes-ppc.pl | |||
| @@ -7,7 +7,7 @@ | |||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | 7 | # details see http://www.openssl.org/~appro/cryptogams/. |
| 8 | # ==================================================================== | 8 | # ==================================================================== |
| 9 | 9 | ||
| 10 | # Needs more work: key setup, page boundaries, CBC routine... | 10 | # Needs more work: key setup, CBC routine... |
| 11 | # | 11 | # |
| 12 | # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with | 12 | # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with |
| 13 | # 128-bit key, which is ~40% better than 64-bit code generated by gcc | 13 | # 128-bit key, which is ~40% better than 64-bit code generated by gcc |
| @@ -18,7 +18,7 @@ | |||
| 18 | 18 | ||
| 19 | # February 2010 | 19 | # February 2010 |
| 20 | # | 20 | # |
| 21 | # Rescheduling instructions to favour Power6 pipeline gives 10% | 21 | # Rescheduling instructions to favour Power6 pipeline gave 10% |
| 22 | # performance improvement on the platfrom in question (and marginal | 22 | # performance improvement on the platfrom in question (and marginal |
| 23 | # improvement even on others). It should be noted that Power6 fails | 23 | # improvement even on others). It should be noted that Power6 fails |
| 24 | # to process byte in 18 cycles, only in 23, because it fails to issue | 24 | # to process byte in 18 cycles, only in 23, because it fails to issue |
| @@ -33,11 +33,13 @@ $flavour = shift; | |||
| 33 | 33 | ||
| 34 | if ($flavour =~ /64/) { | 34 | if ($flavour =~ /64/) { |
| 35 | $SIZE_T =8; | 35 | $SIZE_T =8; |
| 36 | $LRSAVE =2*$SIZE_T; | ||
| 36 | $STU ="stdu"; | 37 | $STU ="stdu"; |
| 37 | $POP ="ld"; | 38 | $POP ="ld"; |
| 38 | $PUSH ="std"; | 39 | $PUSH ="std"; |
| 39 | } elsif ($flavour =~ /32/) { | 40 | } elsif ($flavour =~ /32/) { |
| 40 | $SIZE_T =4; | 41 | $SIZE_T =4; |
| 42 | $LRSAVE =$SIZE_T; | ||
| 41 | $STU ="stwu"; | 43 | $STU ="stwu"; |
| 42 | $POP ="lwz"; | 44 | $POP ="lwz"; |
| 43 | $PUSH ="stw"; | 45 | $PUSH ="stw"; |
| @@ -116,15 +118,19 @@ LAES_Te: | |||
| 116 | addi $Tbl0,$Tbl0,`128-8` | 118 | addi $Tbl0,$Tbl0,`128-8` |
| 117 | mtlr r0 | 119 | mtlr r0 |
| 118 | blr | 120 | blr |
| 119 | .space `32-24` | 121 | .long 0 |
| 122 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 123 | .space `64-9*4` | ||
| 120 | LAES_Td: | 124 | LAES_Td: |
| 121 | mflr r0 | 125 | mflr r0 |
| 122 | bcl 20,31,\$+4 | 126 | bcl 20,31,\$+4 |
| 123 | mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry | 127 | mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry |
| 124 | addi $Tbl0,$Tbl0,`128-8-32+2048+256` | 128 | addi $Tbl0,$Tbl0,`128-64-8+2048+256` |
| 125 | mtlr r0 | 129 | mtlr r0 |
| 126 | blr | 130 | blr |
| 127 | .space `128-32-24` | 131 | .long 0 |
| 132 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 133 | .space `128-64-9*4` | ||
| 128 | ___ | 134 | ___ |
| 129 | &_data_word( | 135 | &_data_word( |
| 130 | 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, | 136 | 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, |
| @@ -328,10 +334,9 @@ $code.=<<___; | |||
| 328 | .globl .AES_encrypt | 334 | .globl .AES_encrypt |
| 329 | .align 7 | 335 | .align 7 |
| 330 | .AES_encrypt: | 336 | .AES_encrypt: |
| 331 | mflr r0 | ||
| 332 | $STU $sp,-$FRAME($sp) | 337 | $STU $sp,-$FRAME($sp) |
| 338 | mflr r0 | ||
| 333 | 339 | ||
| 334 | $PUSH r0,`$FRAME-$SIZE_T*21`($sp) | ||
| 335 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) | 340 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) |
| 336 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) | 341 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) |
| 337 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | 342 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
| @@ -352,7 +357,14 @@ $code.=<<___; | |||
| 352 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | 357 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
| 353 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | 358 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
| 354 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | 359 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
| 360 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | ||
| 361 | |||
| 362 | andi. $t0,$inp,3 | ||
| 363 | andi. $t1,$out,3 | ||
| 364 | or. $t0,$t0,$t1 | ||
| 365 | bne Lenc_unaligned | ||
| 355 | 366 | ||
| 367 | Lenc_unaligned_ok: | ||
| 356 | lwz $s0,0($inp) | 368 | lwz $s0,0($inp) |
| 357 | lwz $s1,4($inp) | 369 | lwz $s1,4($inp) |
| 358 | lwz $s2,8($inp) | 370 | lwz $s2,8($inp) |
| @@ -363,8 +375,80 @@ $code.=<<___; | |||
| 363 | stw $s1,4($out) | 375 | stw $s1,4($out) |
| 364 | stw $s2,8($out) | 376 | stw $s2,8($out) |
| 365 | stw $s3,12($out) | 377 | stw $s3,12($out) |
| 378 | b Lenc_done | ||
| 379 | |||
| 380 | Lenc_unaligned: | ||
| 381 | subfic $t0,$inp,4096 | ||
| 382 | subfic $t1,$out,4096 | ||
| 383 | andi. $t0,$t0,4096-16 | ||
| 384 | beq Lenc_xpage | ||
| 385 | andi. $t1,$t1,4096-16 | ||
| 386 | bne Lenc_unaligned_ok | ||
| 387 | |||
| 388 | Lenc_xpage: | ||
| 389 | lbz $acc00,0($inp) | ||
| 390 | lbz $acc01,1($inp) | ||
| 391 | lbz $acc02,2($inp) | ||
| 392 | lbz $s0,3($inp) | ||
| 393 | lbz $acc04,4($inp) | ||
| 394 | lbz $acc05,5($inp) | ||
| 395 | lbz $acc06,6($inp) | ||
| 396 | lbz $s1,7($inp) | ||
| 397 | lbz $acc08,8($inp) | ||
| 398 | lbz $acc09,9($inp) | ||
| 399 | lbz $acc10,10($inp) | ||
| 400 | insrwi $s0,$acc00,8,0 | ||
| 401 | lbz $s2,11($inp) | ||
| 402 | insrwi $s1,$acc04,8,0 | ||
| 403 | lbz $acc12,12($inp) | ||
| 404 | insrwi $s0,$acc01,8,8 | ||
| 405 | lbz $acc13,13($inp) | ||
| 406 | insrwi $s1,$acc05,8,8 | ||
| 407 | lbz $acc14,14($inp) | ||
| 408 | insrwi $s0,$acc02,8,16 | ||
| 409 | lbz $s3,15($inp) | ||
| 410 | insrwi $s1,$acc06,8,16 | ||
| 411 | insrwi $s2,$acc08,8,0 | ||
| 412 | insrwi $s3,$acc12,8,0 | ||
| 413 | insrwi $s2,$acc09,8,8 | ||
| 414 | insrwi $s3,$acc13,8,8 | ||
| 415 | insrwi $s2,$acc10,8,16 | ||
| 416 | insrwi $s3,$acc14,8,16 | ||
| 417 | |||
| 418 | bl LAES_Te | ||
| 419 | bl Lppc_AES_encrypt_compact | ||
| 420 | |||
| 421 | extrwi $acc00,$s0,8,0 | ||
| 422 | extrwi $acc01,$s0,8,8 | ||
| 423 | stb $acc00,0($out) | ||
| 424 | extrwi $acc02,$s0,8,16 | ||
| 425 | stb $acc01,1($out) | ||
| 426 | stb $acc02,2($out) | ||
| 427 | extrwi $acc04,$s1,8,0 | ||
| 428 | stb $s0,3($out) | ||
| 429 | extrwi $acc05,$s1,8,8 | ||
| 430 | stb $acc04,4($out) | ||
| 431 | extrwi $acc06,$s1,8,16 | ||
| 432 | stb $acc05,5($out) | ||
| 433 | stb $acc06,6($out) | ||
| 434 | extrwi $acc08,$s2,8,0 | ||
| 435 | stb $s1,7($out) | ||
| 436 | extrwi $acc09,$s2,8,8 | ||
| 437 | stb $acc08,8($out) | ||
| 438 | extrwi $acc10,$s2,8,16 | ||
| 439 | stb $acc09,9($out) | ||
| 440 | stb $acc10,10($out) | ||
| 441 | extrwi $acc12,$s3,8,0 | ||
| 442 | stb $s2,11($out) | ||
| 443 | extrwi $acc13,$s3,8,8 | ||
| 444 | stb $acc12,12($out) | ||
| 445 | extrwi $acc14,$s3,8,16 | ||
| 446 | stb $acc13,13($out) | ||
| 447 | stb $acc14,14($out) | ||
| 448 | stb $s3,15($out) | ||
| 366 | 449 | ||
| 367 | $POP r0,`$FRAME-$SIZE_T*21`($sp) | 450 | Lenc_done: |
| 451 | $POP r0,`$FRAME+$LRSAVE`($sp) | ||
| 368 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) | 452 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) |
| 369 | $POP r13,`$FRAME-$SIZE_T*19`($sp) | 453 | $POP r13,`$FRAME-$SIZE_T*19`($sp) |
| 370 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | 454 | $POP r14,`$FRAME-$SIZE_T*18`($sp) |
| @@ -388,18 +472,21 @@ $code.=<<___; | |||
| 388 | mtlr r0 | 472 | mtlr r0 |
| 389 | addi $sp,$sp,$FRAME | 473 | addi $sp,$sp,$FRAME |
| 390 | blr | 474 | blr |
| 475 | .long 0 | ||
| 476 | .byte 0,12,4,1,0x80,18,3,0 | ||
| 477 | .long 0 | ||
| 391 | 478 | ||
| 392 | .align 5 | 479 | .align 5 |
| 393 | Lppc_AES_encrypt: | 480 | Lppc_AES_encrypt: |
| 394 | lwz $acc00,240($key) | 481 | lwz $acc00,240($key) |
| 395 | lwz $t0,0($key) | ||
| 396 | lwz $t1,4($key) | ||
| 397 | lwz $t2,8($key) | ||
| 398 | lwz $t3,12($key) | ||
| 399 | addi $Tbl1,$Tbl0,3 | 482 | addi $Tbl1,$Tbl0,3 |
| 483 | lwz $t0,0($key) | ||
| 400 | addi $Tbl2,$Tbl0,2 | 484 | addi $Tbl2,$Tbl0,2 |
| 485 | lwz $t1,4($key) | ||
| 401 | addi $Tbl3,$Tbl0,1 | 486 | addi $Tbl3,$Tbl0,1 |
| 487 | lwz $t2,8($key) | ||
| 402 | addi $acc00,$acc00,-1 | 488 | addi $acc00,$acc00,-1 |
| 489 | lwz $t3,12($key) | ||
| 403 | addi $key,$key,16 | 490 | addi $key,$key,16 |
| 404 | xor $s0,$s0,$t0 | 491 | xor $s0,$s0,$t0 |
| 405 | xor $s1,$s1,$t1 | 492 | xor $s1,$s1,$t1 |
| @@ -413,44 +500,44 @@ Lenc_loop: | |||
| 413 | rlwinm $acc02,$s2,`32-24+3`,21,28 | 500 | rlwinm $acc02,$s2,`32-24+3`,21,28 |
| 414 | rlwinm $acc03,$s3,`32-24+3`,21,28 | 501 | rlwinm $acc03,$s3,`32-24+3`,21,28 |
| 415 | lwz $t0,0($key) | 502 | lwz $t0,0($key) |
| 416 | lwz $t1,4($key) | ||
| 417 | rlwinm $acc04,$s1,`32-16+3`,21,28 | 503 | rlwinm $acc04,$s1,`32-16+3`,21,28 |
| 504 | lwz $t1,4($key) | ||
| 418 | rlwinm $acc05,$s2,`32-16+3`,21,28 | 505 | rlwinm $acc05,$s2,`32-16+3`,21,28 |
| 419 | lwz $t2,8($key) | 506 | lwz $t2,8($key) |
| 420 | lwz $t3,12($key) | ||
| 421 | rlwinm $acc06,$s3,`32-16+3`,21,28 | 507 | rlwinm $acc06,$s3,`32-16+3`,21,28 |
| 508 | lwz $t3,12($key) | ||
| 422 | rlwinm $acc07,$s0,`32-16+3`,21,28 | 509 | rlwinm $acc07,$s0,`32-16+3`,21,28 |
| 423 | lwzx $acc00,$Tbl0,$acc00 | 510 | lwzx $acc00,$Tbl0,$acc00 |
| 424 | lwzx $acc01,$Tbl0,$acc01 | ||
| 425 | rlwinm $acc08,$s2,`32-8+3`,21,28 | 511 | rlwinm $acc08,$s2,`32-8+3`,21,28 |
| 512 | lwzx $acc01,$Tbl0,$acc01 | ||
| 426 | rlwinm $acc09,$s3,`32-8+3`,21,28 | 513 | rlwinm $acc09,$s3,`32-8+3`,21,28 |
| 427 | lwzx $acc02,$Tbl0,$acc02 | 514 | lwzx $acc02,$Tbl0,$acc02 |
| 428 | lwzx $acc03,$Tbl0,$acc03 | ||
| 429 | rlwinm $acc10,$s0,`32-8+3`,21,28 | 515 | rlwinm $acc10,$s0,`32-8+3`,21,28 |
| 516 | lwzx $acc03,$Tbl0,$acc03 | ||
| 430 | rlwinm $acc11,$s1,`32-8+3`,21,28 | 517 | rlwinm $acc11,$s1,`32-8+3`,21,28 |
| 431 | lwzx $acc04,$Tbl1,$acc04 | 518 | lwzx $acc04,$Tbl1,$acc04 |
| 432 | lwzx $acc05,$Tbl1,$acc05 | ||
| 433 | rlwinm $acc12,$s3,`0+3`,21,28 | 519 | rlwinm $acc12,$s3,`0+3`,21,28 |
| 520 | lwzx $acc05,$Tbl1,$acc05 | ||
| 434 | rlwinm $acc13,$s0,`0+3`,21,28 | 521 | rlwinm $acc13,$s0,`0+3`,21,28 |
| 435 | lwzx $acc06,$Tbl1,$acc06 | 522 | lwzx $acc06,$Tbl1,$acc06 |
| 436 | lwzx $acc07,$Tbl1,$acc07 | ||
| 437 | rlwinm $acc14,$s1,`0+3`,21,28 | 523 | rlwinm $acc14,$s1,`0+3`,21,28 |
| 524 | lwzx $acc07,$Tbl1,$acc07 | ||
| 438 | rlwinm $acc15,$s2,`0+3`,21,28 | 525 | rlwinm $acc15,$s2,`0+3`,21,28 |
| 439 | lwzx $acc08,$Tbl2,$acc08 | 526 | lwzx $acc08,$Tbl2,$acc08 |
| 440 | lwzx $acc09,$Tbl2,$acc09 | ||
| 441 | xor $t0,$t0,$acc00 | 527 | xor $t0,$t0,$acc00 |
| 528 | lwzx $acc09,$Tbl2,$acc09 | ||
| 442 | xor $t1,$t1,$acc01 | 529 | xor $t1,$t1,$acc01 |
| 443 | lwzx $acc10,$Tbl2,$acc10 | 530 | lwzx $acc10,$Tbl2,$acc10 |
| 444 | lwzx $acc11,$Tbl2,$acc11 | ||
| 445 | xor $t2,$t2,$acc02 | 531 | xor $t2,$t2,$acc02 |
| 532 | lwzx $acc11,$Tbl2,$acc11 | ||
| 446 | xor $t3,$t3,$acc03 | 533 | xor $t3,$t3,$acc03 |
| 447 | lwzx $acc12,$Tbl3,$acc12 | 534 | lwzx $acc12,$Tbl3,$acc12 |
| 448 | lwzx $acc13,$Tbl3,$acc13 | ||
| 449 | xor $t0,$t0,$acc04 | 535 | xor $t0,$t0,$acc04 |
| 536 | lwzx $acc13,$Tbl3,$acc13 | ||
| 450 | xor $t1,$t1,$acc05 | 537 | xor $t1,$t1,$acc05 |
| 451 | lwzx $acc14,$Tbl3,$acc14 | 538 | lwzx $acc14,$Tbl3,$acc14 |
| 452 | lwzx $acc15,$Tbl3,$acc15 | ||
| 453 | xor $t2,$t2,$acc06 | 539 | xor $t2,$t2,$acc06 |
| 540 | lwzx $acc15,$Tbl3,$acc15 | ||
| 454 | xor $t3,$t3,$acc07 | 541 | xor $t3,$t3,$acc07 |
| 455 | xor $t0,$t0,$acc08 | 542 | xor $t0,$t0,$acc08 |
| 456 | xor $t1,$t1,$acc09 | 543 | xor $t1,$t1,$acc09 |
| @@ -466,60 +553,60 @@ Lenc_loop: | |||
| 466 | addi $Tbl2,$Tbl0,2048 | 553 | addi $Tbl2,$Tbl0,2048 |
| 467 | nop | 554 | nop |
| 468 | lwz $t0,0($key) | 555 | lwz $t0,0($key) |
| 469 | lwz $t1,4($key) | ||
| 470 | rlwinm $acc00,$s0,`32-24`,24,31 | 556 | rlwinm $acc00,$s0,`32-24`,24,31 |
| 557 | lwz $t1,4($key) | ||
| 471 | rlwinm $acc01,$s1,`32-24`,24,31 | 558 | rlwinm $acc01,$s1,`32-24`,24,31 |
| 472 | lwz $t2,8($key) | 559 | lwz $t2,8($key) |
| 473 | lwz $t3,12($key) | ||
| 474 | rlwinm $acc02,$s2,`32-24`,24,31 | 560 | rlwinm $acc02,$s2,`32-24`,24,31 |
| 561 | lwz $t3,12($key) | ||
| 475 | rlwinm $acc03,$s3,`32-24`,24,31 | 562 | rlwinm $acc03,$s3,`32-24`,24,31 |
| 476 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 | 563 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 |
| 477 | lwz $acc09,`2048+32`($Tbl0) | ||
| 478 | rlwinm $acc04,$s1,`32-16`,24,31 | 564 | rlwinm $acc04,$s1,`32-16`,24,31 |
| 565 | lwz $acc09,`2048+32`($Tbl0) | ||
| 479 | rlwinm $acc05,$s2,`32-16`,24,31 | 566 | rlwinm $acc05,$s2,`32-16`,24,31 |
| 480 | lwz $acc10,`2048+64`($Tbl0) | 567 | lwz $acc10,`2048+64`($Tbl0) |
| 481 | lwz $acc11,`2048+96`($Tbl0) | ||
| 482 | rlwinm $acc06,$s3,`32-16`,24,31 | 568 | rlwinm $acc06,$s3,`32-16`,24,31 |
| 569 | lwz $acc11,`2048+96`($Tbl0) | ||
| 483 | rlwinm $acc07,$s0,`32-16`,24,31 | 570 | rlwinm $acc07,$s0,`32-16`,24,31 |
| 484 | lwz $acc12,`2048+128`($Tbl0) | 571 | lwz $acc12,`2048+128`($Tbl0) |
| 485 | lwz $acc13,`2048+160`($Tbl0) | ||
| 486 | rlwinm $acc08,$s2,`32-8`,24,31 | 572 | rlwinm $acc08,$s2,`32-8`,24,31 |
| 573 | lwz $acc13,`2048+160`($Tbl0) | ||
| 487 | rlwinm $acc09,$s3,`32-8`,24,31 | 574 | rlwinm $acc09,$s3,`32-8`,24,31 |
| 488 | lwz $acc14,`2048+192`($Tbl0) | 575 | lwz $acc14,`2048+192`($Tbl0) |
| 489 | lwz $acc15,`2048+224`($Tbl0) | ||
| 490 | rlwinm $acc10,$s0,`32-8`,24,31 | 576 | rlwinm $acc10,$s0,`32-8`,24,31 |
| 577 | lwz $acc15,`2048+224`($Tbl0) | ||
| 491 | rlwinm $acc11,$s1,`32-8`,24,31 | 578 | rlwinm $acc11,$s1,`32-8`,24,31 |
| 492 | lbzx $acc00,$Tbl2,$acc00 | 579 | lbzx $acc00,$Tbl2,$acc00 |
| 493 | lbzx $acc01,$Tbl2,$acc01 | ||
| 494 | rlwinm $acc12,$s3,`0`,24,31 | 580 | rlwinm $acc12,$s3,`0`,24,31 |
| 581 | lbzx $acc01,$Tbl2,$acc01 | ||
| 495 | rlwinm $acc13,$s0,`0`,24,31 | 582 | rlwinm $acc13,$s0,`0`,24,31 |
| 496 | lbzx $acc02,$Tbl2,$acc02 | 583 | lbzx $acc02,$Tbl2,$acc02 |
| 497 | lbzx $acc03,$Tbl2,$acc03 | ||
| 498 | rlwinm $acc14,$s1,`0`,24,31 | 584 | rlwinm $acc14,$s1,`0`,24,31 |
| 585 | lbzx $acc03,$Tbl2,$acc03 | ||
| 499 | rlwinm $acc15,$s2,`0`,24,31 | 586 | rlwinm $acc15,$s2,`0`,24,31 |
| 500 | lbzx $acc04,$Tbl2,$acc04 | 587 | lbzx $acc04,$Tbl2,$acc04 |
| 501 | lbzx $acc05,$Tbl2,$acc05 | ||
| 502 | rlwinm $s0,$acc00,24,0,7 | 588 | rlwinm $s0,$acc00,24,0,7 |
| 589 | lbzx $acc05,$Tbl2,$acc05 | ||
| 503 | rlwinm $s1,$acc01,24,0,7 | 590 | rlwinm $s1,$acc01,24,0,7 |
| 504 | lbzx $acc06,$Tbl2,$acc06 | 591 | lbzx $acc06,$Tbl2,$acc06 |
| 505 | lbzx $acc07,$Tbl2,$acc07 | ||
| 506 | rlwinm $s2,$acc02,24,0,7 | 592 | rlwinm $s2,$acc02,24,0,7 |
| 593 | lbzx $acc07,$Tbl2,$acc07 | ||
| 507 | rlwinm $s3,$acc03,24,0,7 | 594 | rlwinm $s3,$acc03,24,0,7 |
| 508 | lbzx $acc08,$Tbl2,$acc08 | 595 | lbzx $acc08,$Tbl2,$acc08 |
| 509 | lbzx $acc09,$Tbl2,$acc09 | ||
| 510 | rlwimi $s0,$acc04,16,8,15 | 596 | rlwimi $s0,$acc04,16,8,15 |
| 597 | lbzx $acc09,$Tbl2,$acc09 | ||
| 511 | rlwimi $s1,$acc05,16,8,15 | 598 | rlwimi $s1,$acc05,16,8,15 |
| 512 | lbzx $acc10,$Tbl2,$acc10 | 599 | lbzx $acc10,$Tbl2,$acc10 |
| 513 | lbzx $acc11,$Tbl2,$acc11 | ||
| 514 | rlwimi $s2,$acc06,16,8,15 | 600 | rlwimi $s2,$acc06,16,8,15 |
| 601 | lbzx $acc11,$Tbl2,$acc11 | ||
| 515 | rlwimi $s3,$acc07,16,8,15 | 602 | rlwimi $s3,$acc07,16,8,15 |
| 516 | lbzx $acc12,$Tbl2,$acc12 | 603 | lbzx $acc12,$Tbl2,$acc12 |
| 517 | lbzx $acc13,$Tbl2,$acc13 | ||
| 518 | rlwimi $s0,$acc08,8,16,23 | 604 | rlwimi $s0,$acc08,8,16,23 |
| 605 | lbzx $acc13,$Tbl2,$acc13 | ||
| 519 | rlwimi $s1,$acc09,8,16,23 | 606 | rlwimi $s1,$acc09,8,16,23 |
| 520 | lbzx $acc14,$Tbl2,$acc14 | 607 | lbzx $acc14,$Tbl2,$acc14 |
| 521 | lbzx $acc15,$Tbl2,$acc15 | ||
| 522 | rlwimi $s2,$acc10,8,16,23 | 608 | rlwimi $s2,$acc10,8,16,23 |
| 609 | lbzx $acc15,$Tbl2,$acc15 | ||
| 523 | rlwimi $s3,$acc11,8,16,23 | 610 | rlwimi $s3,$acc11,8,16,23 |
| 524 | or $s0,$s0,$acc12 | 611 | or $s0,$s0,$acc12 |
| 525 | or $s1,$s1,$acc13 | 612 | or $s1,$s1,$acc13 |
| @@ -530,29 +617,31 @@ Lenc_loop: | |||
| 530 | xor $s2,$s2,$t2 | 617 | xor $s2,$s2,$t2 |
| 531 | xor $s3,$s3,$t3 | 618 | xor $s3,$s3,$t3 |
| 532 | blr | 619 | blr |
| 620 | .long 0 | ||
| 621 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 533 | 622 | ||
| 534 | .align 4 | 623 | .align 4 |
| 535 | Lppc_AES_encrypt_compact: | 624 | Lppc_AES_encrypt_compact: |
| 536 | lwz $acc00,240($key) | 625 | lwz $acc00,240($key) |
| 537 | lwz $t0,0($key) | ||
| 538 | lwz $t1,4($key) | ||
| 539 | lwz $t2,8($key) | ||
| 540 | lwz $t3,12($key) | ||
| 541 | addi $Tbl1,$Tbl0,2048 | 626 | addi $Tbl1,$Tbl0,2048 |
| 627 | lwz $t0,0($key) | ||
| 542 | lis $mask80,0x8080 | 628 | lis $mask80,0x8080 |
| 629 | lwz $t1,4($key) | ||
| 543 | lis $mask1b,0x1b1b | 630 | lis $mask1b,0x1b1b |
| 544 | addi $key,$key,16 | 631 | lwz $t2,8($key) |
| 545 | ori $mask80,$mask80,0x8080 | 632 | ori $mask80,$mask80,0x8080 |
| 633 | lwz $t3,12($key) | ||
| 546 | ori $mask1b,$mask1b,0x1b1b | 634 | ori $mask1b,$mask1b,0x1b1b |
| 635 | addi $key,$key,16 | ||
| 547 | mtctr $acc00 | 636 | mtctr $acc00 |
| 548 | .align 4 | 637 | .align 4 |
| 549 | Lenc_compact_loop: | 638 | Lenc_compact_loop: |
| 550 | xor $s0,$s0,$t0 | 639 | xor $s0,$s0,$t0 |
| 551 | xor $s1,$s1,$t1 | 640 | xor $s1,$s1,$t1 |
| 552 | xor $s2,$s2,$t2 | ||
| 553 | xor $s3,$s3,$t3 | ||
| 554 | rlwinm $acc00,$s0,`32-24`,24,31 | 641 | rlwinm $acc00,$s0,`32-24`,24,31 |
| 642 | xor $s2,$s2,$t2 | ||
| 555 | rlwinm $acc01,$s1,`32-24`,24,31 | 643 | rlwinm $acc01,$s1,`32-24`,24,31 |
| 644 | xor $s3,$s3,$t3 | ||
| 556 | rlwinm $acc02,$s2,`32-24`,24,31 | 645 | rlwinm $acc02,$s2,`32-24`,24,31 |
| 557 | rlwinm $acc03,$s3,`32-24`,24,31 | 646 | rlwinm $acc03,$s3,`32-24`,24,31 |
| 558 | rlwinm $acc04,$s1,`32-16`,24,31 | 647 | rlwinm $acc04,$s1,`32-16`,24,31 |
| @@ -560,48 +649,48 @@ Lenc_compact_loop: | |||
| 560 | rlwinm $acc06,$s3,`32-16`,24,31 | 649 | rlwinm $acc06,$s3,`32-16`,24,31 |
| 561 | rlwinm $acc07,$s0,`32-16`,24,31 | 650 | rlwinm $acc07,$s0,`32-16`,24,31 |
| 562 | lbzx $acc00,$Tbl1,$acc00 | 651 | lbzx $acc00,$Tbl1,$acc00 |
| 563 | lbzx $acc01,$Tbl1,$acc01 | ||
| 564 | rlwinm $acc08,$s2,`32-8`,24,31 | 652 | rlwinm $acc08,$s2,`32-8`,24,31 |
| 653 | lbzx $acc01,$Tbl1,$acc01 | ||
| 565 | rlwinm $acc09,$s3,`32-8`,24,31 | 654 | rlwinm $acc09,$s3,`32-8`,24,31 |
| 566 | lbzx $acc02,$Tbl1,$acc02 | 655 | lbzx $acc02,$Tbl1,$acc02 |
| 567 | lbzx $acc03,$Tbl1,$acc03 | ||
| 568 | rlwinm $acc10,$s0,`32-8`,24,31 | 656 | rlwinm $acc10,$s0,`32-8`,24,31 |
| 657 | lbzx $acc03,$Tbl1,$acc03 | ||
| 569 | rlwinm $acc11,$s1,`32-8`,24,31 | 658 | rlwinm $acc11,$s1,`32-8`,24,31 |
| 570 | lbzx $acc04,$Tbl1,$acc04 | 659 | lbzx $acc04,$Tbl1,$acc04 |
| 571 | lbzx $acc05,$Tbl1,$acc05 | ||
| 572 | rlwinm $acc12,$s3,`0`,24,31 | 660 | rlwinm $acc12,$s3,`0`,24,31 |
| 661 | lbzx $acc05,$Tbl1,$acc05 | ||
| 573 | rlwinm $acc13,$s0,`0`,24,31 | 662 | rlwinm $acc13,$s0,`0`,24,31 |
| 574 | lbzx $acc06,$Tbl1,$acc06 | 663 | lbzx $acc06,$Tbl1,$acc06 |
| 575 | lbzx $acc07,$Tbl1,$acc07 | ||
| 576 | rlwinm $acc14,$s1,`0`,24,31 | 664 | rlwinm $acc14,$s1,`0`,24,31 |
| 665 | lbzx $acc07,$Tbl1,$acc07 | ||
| 577 | rlwinm $acc15,$s2,`0`,24,31 | 666 | rlwinm $acc15,$s2,`0`,24,31 |
| 578 | lbzx $acc08,$Tbl1,$acc08 | 667 | lbzx $acc08,$Tbl1,$acc08 |
| 579 | lbzx $acc09,$Tbl1,$acc09 | ||
| 580 | rlwinm $s0,$acc00,24,0,7 | 668 | rlwinm $s0,$acc00,24,0,7 |
| 669 | lbzx $acc09,$Tbl1,$acc09 | ||
| 581 | rlwinm $s1,$acc01,24,0,7 | 670 | rlwinm $s1,$acc01,24,0,7 |
| 582 | lbzx $acc10,$Tbl1,$acc10 | 671 | lbzx $acc10,$Tbl1,$acc10 |
| 583 | lbzx $acc11,$Tbl1,$acc11 | ||
| 584 | rlwinm $s2,$acc02,24,0,7 | 672 | rlwinm $s2,$acc02,24,0,7 |
| 673 | lbzx $acc11,$Tbl1,$acc11 | ||
| 585 | rlwinm $s3,$acc03,24,0,7 | 674 | rlwinm $s3,$acc03,24,0,7 |
| 586 | lbzx $acc12,$Tbl1,$acc12 | 675 | lbzx $acc12,$Tbl1,$acc12 |
| 587 | lbzx $acc13,$Tbl1,$acc13 | ||
| 588 | rlwimi $s0,$acc04,16,8,15 | 676 | rlwimi $s0,$acc04,16,8,15 |
| 677 | lbzx $acc13,$Tbl1,$acc13 | ||
| 589 | rlwimi $s1,$acc05,16,8,15 | 678 | rlwimi $s1,$acc05,16,8,15 |
| 590 | lbzx $acc14,$Tbl1,$acc14 | 679 | lbzx $acc14,$Tbl1,$acc14 |
| 591 | lbzx $acc15,$Tbl1,$acc15 | ||
| 592 | rlwimi $s2,$acc06,16,8,15 | 680 | rlwimi $s2,$acc06,16,8,15 |
| 681 | lbzx $acc15,$Tbl1,$acc15 | ||
| 593 | rlwimi $s3,$acc07,16,8,15 | 682 | rlwimi $s3,$acc07,16,8,15 |
| 594 | rlwimi $s0,$acc08,8,16,23 | 683 | rlwimi $s0,$acc08,8,16,23 |
| 595 | rlwimi $s1,$acc09,8,16,23 | 684 | rlwimi $s1,$acc09,8,16,23 |
| 596 | rlwimi $s2,$acc10,8,16,23 | 685 | rlwimi $s2,$acc10,8,16,23 |
| 597 | rlwimi $s3,$acc11,8,16,23 | 686 | rlwimi $s3,$acc11,8,16,23 |
| 598 | lwz $t0,0($key) | 687 | lwz $t0,0($key) |
| 599 | lwz $t1,4($key) | ||
| 600 | or $s0,$s0,$acc12 | 688 | or $s0,$s0,$acc12 |
| 689 | lwz $t1,4($key) | ||
| 601 | or $s1,$s1,$acc13 | 690 | or $s1,$s1,$acc13 |
| 602 | lwz $t2,8($key) | 691 | lwz $t2,8($key) |
| 603 | lwz $t3,12($key) | ||
| 604 | or $s2,$s2,$acc14 | 692 | or $s2,$s2,$acc14 |
| 693 | lwz $t3,12($key) | ||
| 605 | or $s3,$s3,$acc15 | 694 | or $s3,$s3,$acc15 |
| 606 | 695 | ||
| 607 | addi $key,$key,16 | 696 | addi $key,$key,16 |
| @@ -612,12 +701,12 @@ Lenc_compact_loop: | |||
| 612 | and $acc02,$s2,$mask80 | 701 | and $acc02,$s2,$mask80 |
| 613 | and $acc03,$s3,$mask80 | 702 | and $acc03,$s3,$mask80 |
| 614 | srwi $acc04,$acc00,7 # r1>>7 | 703 | srwi $acc04,$acc00,7 # r1>>7 |
| 615 | srwi $acc05,$acc01,7 | ||
| 616 | srwi $acc06,$acc02,7 | ||
| 617 | srwi $acc07,$acc03,7 | ||
| 618 | andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f | 704 | andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f |
| 705 | srwi $acc05,$acc01,7 | ||
| 619 | andc $acc09,$s1,$mask80 | 706 | andc $acc09,$s1,$mask80 |
| 707 | srwi $acc06,$acc02,7 | ||
| 620 | andc $acc10,$s2,$mask80 | 708 | andc $acc10,$s2,$mask80 |
| 709 | srwi $acc07,$acc03,7 | ||
| 621 | andc $acc11,$s3,$mask80 | 710 | andc $acc11,$s3,$mask80 |
| 622 | sub $acc00,$acc00,$acc04 # r1-(r1>>7) | 711 | sub $acc00,$acc00,$acc04 # r1-(r1>>7) |
| 623 | sub $acc01,$acc01,$acc05 | 712 | sub $acc01,$acc01,$acc05 |
| @@ -633,32 +722,32 @@ Lenc_compact_loop: | |||
| 633 | and $acc03,$acc03,$mask1b | 722 | and $acc03,$acc03,$mask1b |
| 634 | xor $acc00,$acc00,$acc08 # r2 | 723 | xor $acc00,$acc00,$acc08 # r2 |
| 635 | xor $acc01,$acc01,$acc09 | 724 | xor $acc01,$acc01,$acc09 |
| 725 | rotlwi $acc12,$s0,16 # ROTATE(r0,16) | ||
| 636 | xor $acc02,$acc02,$acc10 | 726 | xor $acc02,$acc02,$acc10 |
| 727 | rotlwi $acc13,$s1,16 | ||
| 637 | xor $acc03,$acc03,$acc11 | 728 | xor $acc03,$acc03,$acc11 |
| 729 | rotlwi $acc14,$s2,16 | ||
| 638 | 730 | ||
| 639 | rotlwi $acc12,$s0,16 # ROTATE(r0,16) | ||
| 640 | rotlwi $acc13,$s1,16 | ||
| 641 | rotlwi $acc14,$s2,16 | ||
| 642 | rotlwi $acc15,$s3,16 | ||
| 643 | xor $s0,$s0,$acc00 # r0^r2 | 731 | xor $s0,$s0,$acc00 # r0^r2 |
| 732 | rotlwi $acc15,$s3,16 | ||
| 644 | xor $s1,$s1,$acc01 | 733 | xor $s1,$s1,$acc01 |
| 645 | xor $s2,$s2,$acc02 | ||
| 646 | xor $s3,$s3,$acc03 | ||
| 647 | rotrwi $s0,$s0,24 # ROTATE(r2^r0,24) | 734 | rotrwi $s0,$s0,24 # ROTATE(r2^r0,24) |
| 735 | xor $s2,$s2,$acc02 | ||
| 648 | rotrwi $s1,$s1,24 | 736 | rotrwi $s1,$s1,24 |
| 737 | xor $s3,$s3,$acc03 | ||
| 649 | rotrwi $s2,$s2,24 | 738 | rotrwi $s2,$s2,24 |
| 650 | rotrwi $s3,$s3,24 | ||
| 651 | xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2 | 739 | xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2 |
| 740 | rotrwi $s3,$s3,24 | ||
| 652 | xor $s1,$s1,$acc01 | 741 | xor $s1,$s1,$acc01 |
| 653 | xor $s2,$s2,$acc02 | 742 | xor $s2,$s2,$acc02 |
| 654 | xor $s3,$s3,$acc03 | 743 | xor $s3,$s3,$acc03 |
| 655 | rotlwi $acc08,$acc12,8 # ROTATE(r0,24) | 744 | rotlwi $acc08,$acc12,8 # ROTATE(r0,24) |
| 656 | rotlwi $acc09,$acc13,8 | ||
| 657 | rotlwi $acc10,$acc14,8 | ||
| 658 | rotlwi $acc11,$acc15,8 | ||
| 659 | xor $s0,$s0,$acc12 # | 745 | xor $s0,$s0,$acc12 # |
| 746 | rotlwi $acc09,$acc13,8 | ||
| 660 | xor $s1,$s1,$acc13 | 747 | xor $s1,$s1,$acc13 |
| 748 | rotlwi $acc10,$acc14,8 | ||
| 661 | xor $s2,$s2,$acc14 | 749 | xor $s2,$s2,$acc14 |
| 750 | rotlwi $acc11,$acc15,8 | ||
| 662 | xor $s3,$s3,$acc15 | 751 | xor $s3,$s3,$acc15 |
| 663 | xor $s0,$s0,$acc08 # | 752 | xor $s0,$s0,$acc08 # |
| 664 | xor $s1,$s1,$acc09 | 753 | xor $s1,$s1,$acc09 |
| @@ -673,14 +762,15 @@ Lenc_compact_done: | |||
| 673 | xor $s2,$s2,$t2 | 762 | xor $s2,$s2,$t2 |
| 674 | xor $s3,$s3,$t3 | 763 | xor $s3,$s3,$t3 |
| 675 | blr | 764 | blr |
| 765 | .long 0 | ||
| 766 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 676 | 767 | ||
| 677 | .globl .AES_decrypt | 768 | .globl .AES_decrypt |
| 678 | .align 7 | 769 | .align 7 |
| 679 | .AES_decrypt: | 770 | .AES_decrypt: |
| 680 | mflr r0 | ||
| 681 | $STU $sp,-$FRAME($sp) | 771 | $STU $sp,-$FRAME($sp) |
| 772 | mflr r0 | ||
| 682 | 773 | ||
| 683 | $PUSH r0,`$FRAME-$SIZE_T*21`($sp) | ||
| 684 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) | 774 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) |
| 685 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) | 775 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) |
| 686 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | 776 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
| @@ -701,7 +791,14 @@ Lenc_compact_done: | |||
| 701 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | 791 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
| 702 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | 792 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
| 703 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | 793 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
| 794 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | ||
| 704 | 795 | ||
| 796 | andi. $t0,$inp,3 | ||
| 797 | andi. $t1,$out,3 | ||
| 798 | or. $t0,$t0,$t1 | ||
| 799 | bne Ldec_unaligned | ||
| 800 | |||
| 801 | Ldec_unaligned_ok: | ||
| 705 | lwz $s0,0($inp) | 802 | lwz $s0,0($inp) |
| 706 | lwz $s1,4($inp) | 803 | lwz $s1,4($inp) |
| 707 | lwz $s2,8($inp) | 804 | lwz $s2,8($inp) |
| @@ -712,8 +809,80 @@ Lenc_compact_done: | |||
| 712 | stw $s1,4($out) | 809 | stw $s1,4($out) |
| 713 | stw $s2,8($out) | 810 | stw $s2,8($out) |
| 714 | stw $s3,12($out) | 811 | stw $s3,12($out) |
| 812 | b Ldec_done | ||
| 813 | |||
| 814 | Ldec_unaligned: | ||
| 815 | subfic $t0,$inp,4096 | ||
| 816 | subfic $t1,$out,4096 | ||
| 817 | andi. $t0,$t0,4096-16 | ||
| 818 | beq Ldec_xpage | ||
| 819 | andi. $t1,$t1,4096-16 | ||
| 820 | bne Ldec_unaligned_ok | ||
| 821 | |||
| 822 | Ldec_xpage: | ||
| 823 | lbz $acc00,0($inp) | ||
| 824 | lbz $acc01,1($inp) | ||
| 825 | lbz $acc02,2($inp) | ||
| 826 | lbz $s0,3($inp) | ||
| 827 | lbz $acc04,4($inp) | ||
| 828 | lbz $acc05,5($inp) | ||
| 829 | lbz $acc06,6($inp) | ||
| 830 | lbz $s1,7($inp) | ||
| 831 | lbz $acc08,8($inp) | ||
| 832 | lbz $acc09,9($inp) | ||
| 833 | lbz $acc10,10($inp) | ||
| 834 | insrwi $s0,$acc00,8,0 | ||
| 835 | lbz $s2,11($inp) | ||
| 836 | insrwi $s1,$acc04,8,0 | ||
| 837 | lbz $acc12,12($inp) | ||
| 838 | insrwi $s0,$acc01,8,8 | ||
| 839 | lbz $acc13,13($inp) | ||
| 840 | insrwi $s1,$acc05,8,8 | ||
| 841 | lbz $acc14,14($inp) | ||
| 842 | insrwi $s0,$acc02,8,16 | ||
| 843 | lbz $s3,15($inp) | ||
| 844 | insrwi $s1,$acc06,8,16 | ||
| 845 | insrwi $s2,$acc08,8,0 | ||
| 846 | insrwi $s3,$acc12,8,0 | ||
| 847 | insrwi $s2,$acc09,8,8 | ||
| 848 | insrwi $s3,$acc13,8,8 | ||
| 849 | insrwi $s2,$acc10,8,16 | ||
| 850 | insrwi $s3,$acc14,8,16 | ||
| 851 | |||
| 852 | bl LAES_Td | ||
| 853 | bl Lppc_AES_decrypt_compact | ||
| 715 | 854 | ||
| 716 | $POP r0,`$FRAME-$SIZE_T*21`($sp) | 855 | extrwi $acc00,$s0,8,0 |
| 856 | extrwi $acc01,$s0,8,8 | ||
| 857 | stb $acc00,0($out) | ||
| 858 | extrwi $acc02,$s0,8,16 | ||
| 859 | stb $acc01,1($out) | ||
| 860 | stb $acc02,2($out) | ||
| 861 | extrwi $acc04,$s1,8,0 | ||
| 862 | stb $s0,3($out) | ||
| 863 | extrwi $acc05,$s1,8,8 | ||
| 864 | stb $acc04,4($out) | ||
| 865 | extrwi $acc06,$s1,8,16 | ||
| 866 | stb $acc05,5($out) | ||
| 867 | stb $acc06,6($out) | ||
| 868 | extrwi $acc08,$s2,8,0 | ||
| 869 | stb $s1,7($out) | ||
| 870 | extrwi $acc09,$s2,8,8 | ||
| 871 | stb $acc08,8($out) | ||
| 872 | extrwi $acc10,$s2,8,16 | ||
| 873 | stb $acc09,9($out) | ||
| 874 | stb $acc10,10($out) | ||
| 875 | extrwi $acc12,$s3,8,0 | ||
| 876 | stb $s2,11($out) | ||
| 877 | extrwi $acc13,$s3,8,8 | ||
| 878 | stb $acc12,12($out) | ||
| 879 | extrwi $acc14,$s3,8,16 | ||
| 880 | stb $acc13,13($out) | ||
| 881 | stb $acc14,14($out) | ||
| 882 | stb $s3,15($out) | ||
| 883 | |||
| 884 | Ldec_done: | ||
| 885 | $POP r0,`$FRAME+$LRSAVE`($sp) | ||
| 717 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) | 886 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) |
| 718 | $POP r13,`$FRAME-$SIZE_T*19`($sp) | 887 | $POP r13,`$FRAME-$SIZE_T*19`($sp) |
| 719 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | 888 | $POP r14,`$FRAME-$SIZE_T*18`($sp) |
| @@ -737,18 +906,21 @@ Lenc_compact_done: | |||
| 737 | mtlr r0 | 906 | mtlr r0 |
| 738 | addi $sp,$sp,$FRAME | 907 | addi $sp,$sp,$FRAME |
| 739 | blr | 908 | blr |
| 909 | .long 0 | ||
| 910 | .byte 0,12,4,1,0x80,18,3,0 | ||
| 911 | .long 0 | ||
| 740 | 912 | ||
| 741 | .align 5 | 913 | .align 5 |
| 742 | Lppc_AES_decrypt: | 914 | Lppc_AES_decrypt: |
| 743 | lwz $acc00,240($key) | 915 | lwz $acc00,240($key) |
| 744 | lwz $t0,0($key) | ||
| 745 | lwz $t1,4($key) | ||
| 746 | lwz $t2,8($key) | ||
| 747 | lwz $t3,12($key) | ||
| 748 | addi $Tbl1,$Tbl0,3 | 916 | addi $Tbl1,$Tbl0,3 |
| 917 | lwz $t0,0($key) | ||
| 749 | addi $Tbl2,$Tbl0,2 | 918 | addi $Tbl2,$Tbl0,2 |
| 919 | lwz $t1,4($key) | ||
| 750 | addi $Tbl3,$Tbl0,1 | 920 | addi $Tbl3,$Tbl0,1 |
| 921 | lwz $t2,8($key) | ||
| 751 | addi $acc00,$acc00,-1 | 922 | addi $acc00,$acc00,-1 |
| 923 | lwz $t3,12($key) | ||
| 752 | addi $key,$key,16 | 924 | addi $key,$key,16 |
| 753 | xor $s0,$s0,$t0 | 925 | xor $s0,$s0,$t0 |
| 754 | xor $s1,$s1,$t1 | 926 | xor $s1,$s1,$t1 |
| @@ -762,44 +934,44 @@ Ldec_loop: | |||
| 762 | rlwinm $acc02,$s2,`32-24+3`,21,28 | 934 | rlwinm $acc02,$s2,`32-24+3`,21,28 |
| 763 | rlwinm $acc03,$s3,`32-24+3`,21,28 | 935 | rlwinm $acc03,$s3,`32-24+3`,21,28 |
| 764 | lwz $t0,0($key) | 936 | lwz $t0,0($key) |
| 765 | lwz $t1,4($key) | ||
| 766 | rlwinm $acc04,$s3,`32-16+3`,21,28 | 937 | rlwinm $acc04,$s3,`32-16+3`,21,28 |
| 938 | lwz $t1,4($key) | ||
| 767 | rlwinm $acc05,$s0,`32-16+3`,21,28 | 939 | rlwinm $acc05,$s0,`32-16+3`,21,28 |
| 768 | lwz $t2,8($key) | 940 | lwz $t2,8($key) |
| 769 | lwz $t3,12($key) | ||
| 770 | rlwinm $acc06,$s1,`32-16+3`,21,28 | 941 | rlwinm $acc06,$s1,`32-16+3`,21,28 |
| 942 | lwz $t3,12($key) | ||
| 771 | rlwinm $acc07,$s2,`32-16+3`,21,28 | 943 | rlwinm $acc07,$s2,`32-16+3`,21,28 |
| 772 | lwzx $acc00,$Tbl0,$acc00 | 944 | lwzx $acc00,$Tbl0,$acc00 |
| 773 | lwzx $acc01,$Tbl0,$acc01 | ||
| 774 | rlwinm $acc08,$s2,`32-8+3`,21,28 | 945 | rlwinm $acc08,$s2,`32-8+3`,21,28 |
| 946 | lwzx $acc01,$Tbl0,$acc01 | ||
| 775 | rlwinm $acc09,$s3,`32-8+3`,21,28 | 947 | rlwinm $acc09,$s3,`32-8+3`,21,28 |
| 776 | lwzx $acc02,$Tbl0,$acc02 | 948 | lwzx $acc02,$Tbl0,$acc02 |
| 777 | lwzx $acc03,$Tbl0,$acc03 | ||
| 778 | rlwinm $acc10,$s0,`32-8+3`,21,28 | 949 | rlwinm $acc10,$s0,`32-8+3`,21,28 |
| 950 | lwzx $acc03,$Tbl0,$acc03 | ||
| 779 | rlwinm $acc11,$s1,`32-8+3`,21,28 | 951 | rlwinm $acc11,$s1,`32-8+3`,21,28 |
| 780 | lwzx $acc04,$Tbl1,$acc04 | 952 | lwzx $acc04,$Tbl1,$acc04 |
| 781 | lwzx $acc05,$Tbl1,$acc05 | ||
| 782 | rlwinm $acc12,$s1,`0+3`,21,28 | 953 | rlwinm $acc12,$s1,`0+3`,21,28 |
| 954 | lwzx $acc05,$Tbl1,$acc05 | ||
| 783 | rlwinm $acc13,$s2,`0+3`,21,28 | 955 | rlwinm $acc13,$s2,`0+3`,21,28 |
| 784 | lwzx $acc06,$Tbl1,$acc06 | 956 | lwzx $acc06,$Tbl1,$acc06 |
| 785 | lwzx $acc07,$Tbl1,$acc07 | ||
| 786 | rlwinm $acc14,$s3,`0+3`,21,28 | 957 | rlwinm $acc14,$s3,`0+3`,21,28 |
| 958 | lwzx $acc07,$Tbl1,$acc07 | ||
| 787 | rlwinm $acc15,$s0,`0+3`,21,28 | 959 | rlwinm $acc15,$s0,`0+3`,21,28 |
| 788 | lwzx $acc08,$Tbl2,$acc08 | 960 | lwzx $acc08,$Tbl2,$acc08 |
| 789 | lwzx $acc09,$Tbl2,$acc09 | ||
| 790 | xor $t0,$t0,$acc00 | 961 | xor $t0,$t0,$acc00 |
| 962 | lwzx $acc09,$Tbl2,$acc09 | ||
| 791 | xor $t1,$t1,$acc01 | 963 | xor $t1,$t1,$acc01 |
| 792 | lwzx $acc10,$Tbl2,$acc10 | 964 | lwzx $acc10,$Tbl2,$acc10 |
| 793 | lwzx $acc11,$Tbl2,$acc11 | ||
| 794 | xor $t2,$t2,$acc02 | 965 | xor $t2,$t2,$acc02 |
| 966 | lwzx $acc11,$Tbl2,$acc11 | ||
| 795 | xor $t3,$t3,$acc03 | 967 | xor $t3,$t3,$acc03 |
| 796 | lwzx $acc12,$Tbl3,$acc12 | 968 | lwzx $acc12,$Tbl3,$acc12 |
| 797 | lwzx $acc13,$Tbl3,$acc13 | ||
| 798 | xor $t0,$t0,$acc04 | 969 | xor $t0,$t0,$acc04 |
| 970 | lwzx $acc13,$Tbl3,$acc13 | ||
| 799 | xor $t1,$t1,$acc05 | 971 | xor $t1,$t1,$acc05 |
| 800 | lwzx $acc14,$Tbl3,$acc14 | 972 | lwzx $acc14,$Tbl3,$acc14 |
| 801 | lwzx $acc15,$Tbl3,$acc15 | ||
| 802 | xor $t2,$t2,$acc06 | 973 | xor $t2,$t2,$acc06 |
| 974 | lwzx $acc15,$Tbl3,$acc15 | ||
| 803 | xor $t3,$t3,$acc07 | 975 | xor $t3,$t3,$acc07 |
| 804 | xor $t0,$t0,$acc08 | 976 | xor $t0,$t0,$acc08 |
| 805 | xor $t1,$t1,$acc09 | 977 | xor $t1,$t1,$acc09 |
| @@ -815,56 +987,56 @@ Ldec_loop: | |||
| 815 | addi $Tbl2,$Tbl0,2048 | 987 | addi $Tbl2,$Tbl0,2048 |
| 816 | nop | 988 | nop |
| 817 | lwz $t0,0($key) | 989 | lwz $t0,0($key) |
| 818 | lwz $t1,4($key) | ||
| 819 | rlwinm $acc00,$s0,`32-24`,24,31 | 990 | rlwinm $acc00,$s0,`32-24`,24,31 |
| 991 | lwz $t1,4($key) | ||
| 820 | rlwinm $acc01,$s1,`32-24`,24,31 | 992 | rlwinm $acc01,$s1,`32-24`,24,31 |
| 821 | lwz $t2,8($key) | 993 | lwz $t2,8($key) |
| 822 | lwz $t3,12($key) | ||
| 823 | rlwinm $acc02,$s2,`32-24`,24,31 | 994 | rlwinm $acc02,$s2,`32-24`,24,31 |
| 995 | lwz $t3,12($key) | ||
| 824 | rlwinm $acc03,$s3,`32-24`,24,31 | 996 | rlwinm $acc03,$s3,`32-24`,24,31 |
| 825 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 | 997 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 |
| 826 | lwz $acc09,`2048+32`($Tbl0) | ||
| 827 | rlwinm $acc04,$s3,`32-16`,24,31 | 998 | rlwinm $acc04,$s3,`32-16`,24,31 |
| 999 | lwz $acc09,`2048+32`($Tbl0) | ||
| 828 | rlwinm $acc05,$s0,`32-16`,24,31 | 1000 | rlwinm $acc05,$s0,`32-16`,24,31 |
| 829 | lwz $acc10,`2048+64`($Tbl0) | 1001 | lwz $acc10,`2048+64`($Tbl0) |
| 830 | lwz $acc11,`2048+96`($Tbl0) | ||
| 831 | lbzx $acc00,$Tbl2,$acc00 | 1002 | lbzx $acc00,$Tbl2,$acc00 |
| 1003 | lwz $acc11,`2048+96`($Tbl0) | ||
| 832 | lbzx $acc01,$Tbl2,$acc01 | 1004 | lbzx $acc01,$Tbl2,$acc01 |
| 833 | lwz $acc12,`2048+128`($Tbl0) | 1005 | lwz $acc12,`2048+128`($Tbl0) |
| 834 | lwz $acc13,`2048+160`($Tbl0) | ||
| 835 | rlwinm $acc06,$s1,`32-16`,24,31 | 1006 | rlwinm $acc06,$s1,`32-16`,24,31 |
| 1007 | lwz $acc13,`2048+160`($Tbl0) | ||
| 836 | rlwinm $acc07,$s2,`32-16`,24,31 | 1008 | rlwinm $acc07,$s2,`32-16`,24,31 |
| 837 | lwz $acc14,`2048+192`($Tbl0) | 1009 | lwz $acc14,`2048+192`($Tbl0) |
| 838 | lwz $acc15,`2048+224`($Tbl0) | ||
| 839 | rlwinm $acc08,$s2,`32-8`,24,31 | 1010 | rlwinm $acc08,$s2,`32-8`,24,31 |
| 1011 | lwz $acc15,`2048+224`($Tbl0) | ||
| 840 | rlwinm $acc09,$s3,`32-8`,24,31 | 1012 | rlwinm $acc09,$s3,`32-8`,24,31 |
| 841 | lbzx $acc02,$Tbl2,$acc02 | 1013 | lbzx $acc02,$Tbl2,$acc02 |
| 842 | lbzx $acc03,$Tbl2,$acc03 | ||
| 843 | rlwinm $acc10,$s0,`32-8`,24,31 | 1014 | rlwinm $acc10,$s0,`32-8`,24,31 |
| 1015 | lbzx $acc03,$Tbl2,$acc03 | ||
| 844 | rlwinm $acc11,$s1,`32-8`,24,31 | 1016 | rlwinm $acc11,$s1,`32-8`,24,31 |
| 845 | lbzx $acc04,$Tbl2,$acc04 | 1017 | lbzx $acc04,$Tbl2,$acc04 |
| 846 | lbzx $acc05,$Tbl2,$acc05 | ||
| 847 | rlwinm $acc12,$s1,`0`,24,31 | 1018 | rlwinm $acc12,$s1,`0`,24,31 |
| 1019 | lbzx $acc05,$Tbl2,$acc05 | ||
| 848 | rlwinm $acc13,$s2,`0`,24,31 | 1020 | rlwinm $acc13,$s2,`0`,24,31 |
| 849 | lbzx $acc06,$Tbl2,$acc06 | 1021 | lbzx $acc06,$Tbl2,$acc06 |
| 850 | lbzx $acc07,$Tbl2,$acc07 | ||
| 851 | rlwinm $acc14,$s3,`0`,24,31 | 1022 | rlwinm $acc14,$s3,`0`,24,31 |
| 1023 | lbzx $acc07,$Tbl2,$acc07 | ||
| 852 | rlwinm $acc15,$s0,`0`,24,31 | 1024 | rlwinm $acc15,$s0,`0`,24,31 |
| 853 | lbzx $acc08,$Tbl2,$acc08 | 1025 | lbzx $acc08,$Tbl2,$acc08 |
| 854 | lbzx $acc09,$Tbl2,$acc09 | ||
| 855 | rlwinm $s0,$acc00,24,0,7 | 1026 | rlwinm $s0,$acc00,24,0,7 |
| 1027 | lbzx $acc09,$Tbl2,$acc09 | ||
| 856 | rlwinm $s1,$acc01,24,0,7 | 1028 | rlwinm $s1,$acc01,24,0,7 |
| 857 | lbzx $acc10,$Tbl2,$acc10 | 1029 | lbzx $acc10,$Tbl2,$acc10 |
| 858 | lbzx $acc11,$Tbl2,$acc11 | ||
| 859 | rlwinm $s2,$acc02,24,0,7 | 1030 | rlwinm $s2,$acc02,24,0,7 |
| 1031 | lbzx $acc11,$Tbl2,$acc11 | ||
| 860 | rlwinm $s3,$acc03,24,0,7 | 1032 | rlwinm $s3,$acc03,24,0,7 |
| 861 | lbzx $acc12,$Tbl2,$acc12 | 1033 | lbzx $acc12,$Tbl2,$acc12 |
| 862 | lbzx $acc13,$Tbl2,$acc13 | ||
| 863 | rlwimi $s0,$acc04,16,8,15 | 1034 | rlwimi $s0,$acc04,16,8,15 |
| 1035 | lbzx $acc13,$Tbl2,$acc13 | ||
| 864 | rlwimi $s1,$acc05,16,8,15 | 1036 | rlwimi $s1,$acc05,16,8,15 |
| 865 | lbzx $acc14,$Tbl2,$acc14 | 1037 | lbzx $acc14,$Tbl2,$acc14 |
| 866 | lbzx $acc15,$Tbl2,$acc15 | ||
| 867 | rlwimi $s2,$acc06,16,8,15 | 1038 | rlwimi $s2,$acc06,16,8,15 |
| 1039 | lbzx $acc15,$Tbl2,$acc15 | ||
| 868 | rlwimi $s3,$acc07,16,8,15 | 1040 | rlwimi $s3,$acc07,16,8,15 |
| 869 | rlwimi $s0,$acc08,8,16,23 | 1041 | rlwimi $s0,$acc08,8,16,23 |
| 870 | rlwimi $s1,$acc09,8,16,23 | 1042 | rlwimi $s1,$acc09,8,16,23 |
| @@ -879,20 +1051,22 @@ Ldec_loop: | |||
| 879 | xor $s2,$s2,$t2 | 1051 | xor $s2,$s2,$t2 |
| 880 | xor $s3,$s3,$t3 | 1052 | xor $s3,$s3,$t3 |
| 881 | blr | 1053 | blr |
| 1054 | .long 0 | ||
| 1055 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 882 | 1056 | ||
| 883 | .align 4 | 1057 | .align 4 |
| 884 | Lppc_AES_decrypt_compact: | 1058 | Lppc_AES_decrypt_compact: |
| 885 | lwz $acc00,240($key) | 1059 | lwz $acc00,240($key) |
| 886 | lwz $t0,0($key) | ||
| 887 | lwz $t1,4($key) | ||
| 888 | lwz $t2,8($key) | ||
| 889 | lwz $t3,12($key) | ||
| 890 | addi $Tbl1,$Tbl0,2048 | 1060 | addi $Tbl1,$Tbl0,2048 |
| 1061 | lwz $t0,0($key) | ||
| 891 | lis $mask80,0x8080 | 1062 | lis $mask80,0x8080 |
| 1063 | lwz $t1,4($key) | ||
| 892 | lis $mask1b,0x1b1b | 1064 | lis $mask1b,0x1b1b |
| 893 | addi $key,$key,16 | 1065 | lwz $t2,8($key) |
| 894 | ori $mask80,$mask80,0x8080 | 1066 | ori $mask80,$mask80,0x8080 |
| 1067 | lwz $t3,12($key) | ||
| 895 | ori $mask1b,$mask1b,0x1b1b | 1068 | ori $mask1b,$mask1b,0x1b1b |
| 1069 | addi $key,$key,16 | ||
| 896 | ___ | 1070 | ___ |
| 897 | $code.=<<___ if ($SIZE_T==8); | 1071 | $code.=<<___ if ($SIZE_T==8); |
| 898 | insrdi $mask80,$mask80,32,0 | 1072 | insrdi $mask80,$mask80,32,0 |
| @@ -904,10 +1078,10 @@ $code.=<<___; | |||
| 904 | Ldec_compact_loop: | 1078 | Ldec_compact_loop: |
| 905 | xor $s0,$s0,$t0 | 1079 | xor $s0,$s0,$t0 |
| 906 | xor $s1,$s1,$t1 | 1080 | xor $s1,$s1,$t1 |
| 907 | xor $s2,$s2,$t2 | ||
| 908 | xor $s3,$s3,$t3 | ||
| 909 | rlwinm $acc00,$s0,`32-24`,24,31 | 1081 | rlwinm $acc00,$s0,`32-24`,24,31 |
| 1082 | xor $s2,$s2,$t2 | ||
| 910 | rlwinm $acc01,$s1,`32-24`,24,31 | 1083 | rlwinm $acc01,$s1,`32-24`,24,31 |
| 1084 | xor $s3,$s3,$t3 | ||
| 911 | rlwinm $acc02,$s2,`32-24`,24,31 | 1085 | rlwinm $acc02,$s2,`32-24`,24,31 |
| 912 | rlwinm $acc03,$s3,`32-24`,24,31 | 1086 | rlwinm $acc03,$s3,`32-24`,24,31 |
| 913 | rlwinm $acc04,$s3,`32-16`,24,31 | 1087 | rlwinm $acc04,$s3,`32-16`,24,31 |
| @@ -915,48 +1089,48 @@ Ldec_compact_loop: | |||
| 915 | rlwinm $acc06,$s1,`32-16`,24,31 | 1089 | rlwinm $acc06,$s1,`32-16`,24,31 |
| 916 | rlwinm $acc07,$s2,`32-16`,24,31 | 1090 | rlwinm $acc07,$s2,`32-16`,24,31 |
| 917 | lbzx $acc00,$Tbl1,$acc00 | 1091 | lbzx $acc00,$Tbl1,$acc00 |
| 918 | lbzx $acc01,$Tbl1,$acc01 | ||
| 919 | rlwinm $acc08,$s2,`32-8`,24,31 | 1092 | rlwinm $acc08,$s2,`32-8`,24,31 |
| 1093 | lbzx $acc01,$Tbl1,$acc01 | ||
| 920 | rlwinm $acc09,$s3,`32-8`,24,31 | 1094 | rlwinm $acc09,$s3,`32-8`,24,31 |
| 921 | lbzx $acc02,$Tbl1,$acc02 | 1095 | lbzx $acc02,$Tbl1,$acc02 |
| 922 | lbzx $acc03,$Tbl1,$acc03 | ||
| 923 | rlwinm $acc10,$s0,`32-8`,24,31 | 1096 | rlwinm $acc10,$s0,`32-8`,24,31 |
| 1097 | lbzx $acc03,$Tbl1,$acc03 | ||
| 924 | rlwinm $acc11,$s1,`32-8`,24,31 | 1098 | rlwinm $acc11,$s1,`32-8`,24,31 |
| 925 | lbzx $acc04,$Tbl1,$acc04 | 1099 | lbzx $acc04,$Tbl1,$acc04 |
| 926 | lbzx $acc05,$Tbl1,$acc05 | ||
| 927 | rlwinm $acc12,$s1,`0`,24,31 | 1100 | rlwinm $acc12,$s1,`0`,24,31 |
| 1101 | lbzx $acc05,$Tbl1,$acc05 | ||
| 928 | rlwinm $acc13,$s2,`0`,24,31 | 1102 | rlwinm $acc13,$s2,`0`,24,31 |
| 929 | lbzx $acc06,$Tbl1,$acc06 | 1103 | lbzx $acc06,$Tbl1,$acc06 |
| 930 | lbzx $acc07,$Tbl1,$acc07 | ||
| 931 | rlwinm $acc14,$s3,`0`,24,31 | 1104 | rlwinm $acc14,$s3,`0`,24,31 |
| 1105 | lbzx $acc07,$Tbl1,$acc07 | ||
| 932 | rlwinm $acc15,$s0,`0`,24,31 | 1106 | rlwinm $acc15,$s0,`0`,24,31 |
| 933 | lbzx $acc08,$Tbl1,$acc08 | 1107 | lbzx $acc08,$Tbl1,$acc08 |
| 934 | lbzx $acc09,$Tbl1,$acc09 | ||
| 935 | rlwinm $s0,$acc00,24,0,7 | 1108 | rlwinm $s0,$acc00,24,0,7 |
| 1109 | lbzx $acc09,$Tbl1,$acc09 | ||
| 936 | rlwinm $s1,$acc01,24,0,7 | 1110 | rlwinm $s1,$acc01,24,0,7 |
| 937 | lbzx $acc10,$Tbl1,$acc10 | 1111 | lbzx $acc10,$Tbl1,$acc10 |
| 938 | lbzx $acc11,$Tbl1,$acc11 | ||
| 939 | rlwinm $s2,$acc02,24,0,7 | 1112 | rlwinm $s2,$acc02,24,0,7 |
| 1113 | lbzx $acc11,$Tbl1,$acc11 | ||
| 940 | rlwinm $s3,$acc03,24,0,7 | 1114 | rlwinm $s3,$acc03,24,0,7 |
| 941 | lbzx $acc12,$Tbl1,$acc12 | 1115 | lbzx $acc12,$Tbl1,$acc12 |
| 942 | lbzx $acc13,$Tbl1,$acc13 | ||
| 943 | rlwimi $s0,$acc04,16,8,15 | 1116 | rlwimi $s0,$acc04,16,8,15 |
| 1117 | lbzx $acc13,$Tbl1,$acc13 | ||
| 944 | rlwimi $s1,$acc05,16,8,15 | 1118 | rlwimi $s1,$acc05,16,8,15 |
| 945 | lbzx $acc14,$Tbl1,$acc14 | 1119 | lbzx $acc14,$Tbl1,$acc14 |
| 946 | lbzx $acc15,$Tbl1,$acc15 | ||
| 947 | rlwimi $s2,$acc06,16,8,15 | 1120 | rlwimi $s2,$acc06,16,8,15 |
| 1121 | lbzx $acc15,$Tbl1,$acc15 | ||
| 948 | rlwimi $s3,$acc07,16,8,15 | 1122 | rlwimi $s3,$acc07,16,8,15 |
| 949 | rlwimi $s0,$acc08,8,16,23 | 1123 | rlwimi $s0,$acc08,8,16,23 |
| 950 | rlwimi $s1,$acc09,8,16,23 | 1124 | rlwimi $s1,$acc09,8,16,23 |
| 951 | rlwimi $s2,$acc10,8,16,23 | 1125 | rlwimi $s2,$acc10,8,16,23 |
| 952 | rlwimi $s3,$acc11,8,16,23 | 1126 | rlwimi $s3,$acc11,8,16,23 |
| 953 | lwz $t0,0($key) | 1127 | lwz $t0,0($key) |
| 954 | lwz $t1,4($key) | ||
| 955 | or $s0,$s0,$acc12 | 1128 | or $s0,$s0,$acc12 |
| 1129 | lwz $t1,4($key) | ||
| 956 | or $s1,$s1,$acc13 | 1130 | or $s1,$s1,$acc13 |
| 957 | lwz $t2,8($key) | 1131 | lwz $t2,8($key) |
| 958 | lwz $t3,12($key) | ||
| 959 | or $s2,$s2,$acc14 | 1132 | or $s2,$s2,$acc14 |
| 1133 | lwz $t3,12($key) | ||
| 960 | or $s3,$s3,$acc15 | 1134 | or $s3,$s3,$acc15 |
| 961 | 1135 | ||
| 962 | addi $key,$key,16 | 1136 | addi $key,$key,16 |
| @@ -1030,12 +1204,12 @@ $code.=<<___ if ($SIZE_T==4); | |||
| 1030 | and $acc02,$s2,$mask80 | 1204 | and $acc02,$s2,$mask80 |
| 1031 | and $acc03,$s3,$mask80 | 1205 | and $acc03,$s3,$mask80 |
| 1032 | srwi $acc04,$acc00,7 # r1>>7 | 1206 | srwi $acc04,$acc00,7 # r1>>7 |
| 1033 | srwi $acc05,$acc01,7 | ||
| 1034 | srwi $acc06,$acc02,7 | ||
| 1035 | srwi $acc07,$acc03,7 | ||
| 1036 | andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f | 1207 | andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f |
| 1208 | srwi $acc05,$acc01,7 | ||
| 1037 | andc $acc09,$s1,$mask80 | 1209 | andc $acc09,$s1,$mask80 |
| 1210 | srwi $acc06,$acc02,7 | ||
| 1038 | andc $acc10,$s2,$mask80 | 1211 | andc $acc10,$s2,$mask80 |
| 1212 | srwi $acc07,$acc03,7 | ||
| 1039 | andc $acc11,$s3,$mask80 | 1213 | andc $acc11,$s3,$mask80 |
| 1040 | sub $acc00,$acc00,$acc04 # r1-(r1>>7) | 1214 | sub $acc00,$acc00,$acc04 # r1-(r1>>7) |
| 1041 | sub $acc01,$acc01,$acc05 | 1215 | sub $acc01,$acc01,$acc05 |
| @@ -1059,12 +1233,12 @@ $code.=<<___ if ($SIZE_T==4); | |||
| 1059 | and $acc06,$acc02,$mask80 | 1233 | and $acc06,$acc02,$mask80 |
| 1060 | and $acc07,$acc03,$mask80 | 1234 | and $acc07,$acc03,$mask80 |
| 1061 | srwi $acc08,$acc04,7 # r1>>7 | 1235 | srwi $acc08,$acc04,7 # r1>>7 |
| 1062 | srwi $acc09,$acc05,7 | ||
| 1063 | srwi $acc10,$acc06,7 | ||
| 1064 | srwi $acc11,$acc07,7 | ||
| 1065 | andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f | 1236 | andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f |
| 1237 | srwi $acc09,$acc05,7 | ||
| 1066 | andc $acc13,$acc01,$mask80 | 1238 | andc $acc13,$acc01,$mask80 |
| 1239 | srwi $acc10,$acc06,7 | ||
| 1067 | andc $acc14,$acc02,$mask80 | 1240 | andc $acc14,$acc02,$mask80 |
| 1241 | srwi $acc11,$acc07,7 | ||
| 1068 | andc $acc15,$acc03,$mask80 | 1242 | andc $acc15,$acc03,$mask80 |
| 1069 | sub $acc04,$acc04,$acc08 # r1-(r1>>7) | 1243 | sub $acc04,$acc04,$acc08 # r1-(r1>>7) |
| 1070 | sub $acc05,$acc05,$acc09 | 1244 | sub $acc05,$acc05,$acc09 |
| @@ -1085,13 +1259,13 @@ $code.=<<___ if ($SIZE_T==4); | |||
| 1085 | 1259 | ||
| 1086 | and $acc08,$acc04,$mask80 # r1=r4&0x80808080 | 1260 | and $acc08,$acc04,$mask80 # r1=r4&0x80808080 |
| 1087 | and $acc09,$acc05,$mask80 | 1261 | and $acc09,$acc05,$mask80 |
| 1088 | and $acc10,$acc06,$mask80 | ||
| 1089 | and $acc11,$acc07,$mask80 | ||
| 1090 | srwi $acc12,$acc08,7 # r1>>7 | 1262 | srwi $acc12,$acc08,7 # r1>>7 |
| 1263 | and $acc10,$acc06,$mask80 | ||
| 1091 | srwi $acc13,$acc09,7 | 1264 | srwi $acc13,$acc09,7 |
| 1265 | and $acc11,$acc07,$mask80 | ||
| 1092 | srwi $acc14,$acc10,7 | 1266 | srwi $acc14,$acc10,7 |
| 1093 | srwi $acc15,$acc11,7 | ||
| 1094 | sub $acc08,$acc08,$acc12 # r1-(r1>>7) | 1267 | sub $acc08,$acc08,$acc12 # r1-(r1>>7) |
| 1268 | srwi $acc15,$acc11,7 | ||
| 1095 | sub $acc09,$acc09,$acc13 | 1269 | sub $acc09,$acc09,$acc13 |
| 1096 | sub $acc10,$acc10,$acc14 | 1270 | sub $acc10,$acc10,$acc14 |
| 1097 | sub $acc11,$acc11,$acc15 | 1271 | sub $acc11,$acc11,$acc15 |
| @@ -1124,10 +1298,10 @@ ___ | |||
| 1124 | $code.=<<___; | 1298 | $code.=<<___; |
| 1125 | rotrwi $s0,$s0,8 # = ROTATE(r0,8) | 1299 | rotrwi $s0,$s0,8 # = ROTATE(r0,8) |
| 1126 | rotrwi $s1,$s1,8 | 1300 | rotrwi $s1,$s1,8 |
| 1127 | rotrwi $s2,$s2,8 | ||
| 1128 | rotrwi $s3,$s3,8 | ||
| 1129 | xor $s0,$s0,$acc00 # ^= r2^r0 | 1301 | xor $s0,$s0,$acc00 # ^= r2^r0 |
| 1302 | rotrwi $s2,$s2,8 | ||
| 1130 | xor $s1,$s1,$acc01 | 1303 | xor $s1,$s1,$acc01 |
| 1304 | rotrwi $s3,$s3,8 | ||
| 1131 | xor $s2,$s2,$acc02 | 1305 | xor $s2,$s2,$acc02 |
| 1132 | xor $s3,$s3,$acc03 | 1306 | xor $s3,$s3,$acc03 |
| 1133 | xor $acc00,$acc00,$acc08 | 1307 | xor $acc00,$acc00,$acc08 |
| @@ -1135,32 +1309,32 @@ $code.=<<___; | |||
| 1135 | xor $acc02,$acc02,$acc10 | 1309 | xor $acc02,$acc02,$acc10 |
| 1136 | xor $acc03,$acc03,$acc11 | 1310 | xor $acc03,$acc03,$acc11 |
| 1137 | xor $s0,$s0,$acc04 # ^= r4^r0 | 1311 | xor $s0,$s0,$acc04 # ^= r4^r0 |
| 1138 | xor $s1,$s1,$acc05 | ||
| 1139 | xor $s2,$s2,$acc06 | ||
| 1140 | xor $s3,$s3,$acc07 | ||
| 1141 | rotrwi $acc00,$acc00,24 | 1312 | rotrwi $acc00,$acc00,24 |
| 1313 | xor $s1,$s1,$acc05 | ||
| 1142 | rotrwi $acc01,$acc01,24 | 1314 | rotrwi $acc01,$acc01,24 |
| 1315 | xor $s2,$s2,$acc06 | ||
| 1143 | rotrwi $acc02,$acc02,24 | 1316 | rotrwi $acc02,$acc02,24 |
| 1317 | xor $s3,$s3,$acc07 | ||
| 1144 | rotrwi $acc03,$acc03,24 | 1318 | rotrwi $acc03,$acc03,24 |
| 1145 | xor $acc04,$acc04,$acc08 | 1319 | xor $acc04,$acc04,$acc08 |
| 1146 | xor $acc05,$acc05,$acc09 | 1320 | xor $acc05,$acc05,$acc09 |
| 1147 | xor $acc06,$acc06,$acc10 | 1321 | xor $acc06,$acc06,$acc10 |
| 1148 | xor $acc07,$acc07,$acc11 | 1322 | xor $acc07,$acc07,$acc11 |
| 1149 | xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)] | 1323 | xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)] |
| 1150 | xor $s1,$s1,$acc09 | ||
| 1151 | xor $s2,$s2,$acc10 | ||
| 1152 | xor $s3,$s3,$acc11 | ||
| 1153 | rotrwi $acc04,$acc04,16 | 1324 | rotrwi $acc04,$acc04,16 |
| 1325 | xor $s1,$s1,$acc09 | ||
| 1154 | rotrwi $acc05,$acc05,16 | 1326 | rotrwi $acc05,$acc05,16 |
| 1327 | xor $s2,$s2,$acc10 | ||
| 1155 | rotrwi $acc06,$acc06,16 | 1328 | rotrwi $acc06,$acc06,16 |
| 1329 | xor $s3,$s3,$acc11 | ||
| 1156 | rotrwi $acc07,$acc07,16 | 1330 | rotrwi $acc07,$acc07,16 |
| 1157 | xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24) | 1331 | xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24) |
| 1158 | xor $s1,$s1,$acc01 | ||
| 1159 | xor $s2,$s2,$acc02 | ||
| 1160 | xor $s3,$s3,$acc03 | ||
| 1161 | rotrwi $acc08,$acc08,8 | 1332 | rotrwi $acc08,$acc08,8 |
| 1333 | xor $s1,$s1,$acc01 | ||
| 1162 | rotrwi $acc09,$acc09,8 | 1334 | rotrwi $acc09,$acc09,8 |
| 1335 | xor $s2,$s2,$acc02 | ||
| 1163 | rotrwi $acc10,$acc10,8 | 1336 | rotrwi $acc10,$acc10,8 |
| 1337 | xor $s3,$s3,$acc03 | ||
| 1164 | rotrwi $acc11,$acc11,8 | 1338 | rotrwi $acc11,$acc11,8 |
| 1165 | xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16) | 1339 | xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16) |
| 1166 | xor $s1,$s1,$acc05 | 1340 | xor $s1,$s1,$acc05 |
| @@ -1179,7 +1353,9 @@ Ldec_compact_done: | |||
| 1179 | xor $s2,$s2,$t2 | 1353 | xor $s2,$s2,$t2 |
| 1180 | xor $s3,$s3,$t3 | 1354 | xor $s3,$s3,$t3 |
| 1181 | blr | 1355 | blr |
| 1182 | .long 0 | 1356 | .long 0 |
| 1357 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 1358 | |||
| 1183 | .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" | 1359 | .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" |
| 1184 | .align 7 | 1360 | .align 7 |
| 1185 | ___ | 1361 | ___ |
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl index 7e01889298..445a1e6762 100644 --- a/src/lib/libcrypto/aes/asm/aes-s390x.pl +++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl | |||
| @@ -44,12 +44,57 @@ | |||
| 44 | # Unlike previous version hardware support detection takes place only | 44 | # Unlike previous version hardware support detection takes place only |
| 45 | # at the moment of key schedule setup, which is denoted in key->rounds. | 45 | # at the moment of key schedule setup, which is denoted in key->rounds. |
| 46 | # This is done, because deferred key setup can't be made MT-safe, not | 46 | # This is done, because deferred key setup can't be made MT-safe, not |
| 47 | # for key lengthes longer than 128 bits. | 47 | # for keys longer than 128 bits. |
| 48 | # | 48 | # |
| 49 | # Add AES_cbc_encrypt, which gives incredible performance improvement, | 49 | # Add AES_cbc_encrypt, which gives incredible performance improvement, |
| 50 | # it was measured to be ~6.6x. It's less than previously mentioned 8x, | 50 | # it was measured to be ~6.6x. It's less than previously mentioned 8x, |
| 51 | # because software implementation was optimized. | 51 | # because software implementation was optimized. |
| 52 | 52 | ||
| 53 | # May 2010. | ||
| 54 | # | ||
| 55 | # Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x | ||
| 56 | # performance improvement over "generic" counter mode routine relying | ||
| 57 | # on single-block, also hardware-assisted, AES_encrypt. "Up to" refers | ||
| 58 | # to the fact that exact throughput value depends on current stack | ||
| 59 | # frame alignment within 4KB page. In worst case you get ~75% of the | ||
| 60 | # maximum, but *on average* it would be as much as ~98%. Meaning that | ||
| 61 | # worst case is unlike, it's like hitting ravine on plateau. | ||
| 62 | |||
| 63 | # November 2010. | ||
| 64 | # | ||
| 65 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 66 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 67 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 68 | # application context. The feature is not specific to any particular | ||
| 69 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 70 | # remains z/Architecture specific. On z990 it was measured to perform | ||
| 71 | # 2x better than code generated by gcc 4.3. | ||
| 72 | |||
| 73 | # December 2010. | ||
| 74 | # | ||
| 75 | # Add support for z196 "cipher message with counter" instruction. | ||
| 76 | # Note however that it's disengaged, because it was measured to | ||
| 77 | # perform ~12% worse than vanilla km-based code... | ||
| 78 | |||
| 79 | # February 2011. | ||
| 80 | # | ||
| 81 | # Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes | ||
| 82 | # instructions, which deliver ~70% improvement at 8KB block size over | ||
| 83 | # vanilla km-based code, 37% - at most like 512-bytes block size. | ||
| 84 | |||
| 85 | $flavour = shift; | ||
| 86 | |||
| 87 | if ($flavour =~ /3[12]/) { | ||
| 88 | $SIZE_T=4; | ||
| 89 | $g=""; | ||
| 90 | } else { | ||
| 91 | $SIZE_T=8; | ||
| 92 | $g="g"; | ||
| 93 | } | ||
| 94 | |||
| 95 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 96 | open STDOUT,">$output"; | ||
| 97 | |||
| 53 | $softonly=0; # allow hardware support | 98 | $softonly=0; # allow hardware support |
| 54 | 99 | ||
| 55 | $t0="%r0"; $mask="%r0"; | 100 | $t0="%r0"; $mask="%r0"; |
| @@ -69,6 +114,8 @@ $rounds="%r13"; | |||
| 69 | $ra="%r14"; | 114 | $ra="%r14"; |
| 70 | $sp="%r15"; | 115 | $sp="%r15"; |
| 71 | 116 | ||
| 117 | $stdframe=16*$SIZE_T+4*8; | ||
| 118 | |||
| 72 | sub _data_word() | 119 | sub _data_word() |
| 73 | { my $i; | 120 | { my $i; |
| 74 | while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } | 121 | while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } |
| @@ -210,7 +257,7 @@ $code.=<<___ if (!$softonly); | |||
| 210 | .Lesoft: | 257 | .Lesoft: |
| 211 | ___ | 258 | ___ |
| 212 | $code.=<<___; | 259 | $code.=<<___; |
| 213 | stmg %r3,$ra,24($sp) | 260 | stm${g} %r3,$ra,3*$SIZE_T($sp) |
| 214 | 261 | ||
| 215 | llgf $s0,0($inp) | 262 | llgf $s0,0($inp) |
| 216 | llgf $s1,4($inp) | 263 | llgf $s1,4($inp) |
| @@ -220,20 +267,20 @@ $code.=<<___; | |||
| 220 | larl $tbl,AES_Te | 267 | larl $tbl,AES_Te |
| 221 | bras $ra,_s390x_AES_encrypt | 268 | bras $ra,_s390x_AES_encrypt |
| 222 | 269 | ||
| 223 | lg $out,24($sp) | 270 | l${g} $out,3*$SIZE_T($sp) |
| 224 | st $s0,0($out) | 271 | st $s0,0($out) |
| 225 | st $s1,4($out) | 272 | st $s1,4($out) |
| 226 | st $s2,8($out) | 273 | st $s2,8($out) |
| 227 | st $s3,12($out) | 274 | st $s3,12($out) |
| 228 | 275 | ||
| 229 | lmg %r6,$ra,48($sp) | 276 | lm${g} %r6,$ra,6*$SIZE_T($sp) |
| 230 | br $ra | 277 | br $ra |
| 231 | .size AES_encrypt,.-AES_encrypt | 278 | .size AES_encrypt,.-AES_encrypt |
| 232 | 279 | ||
| 233 | .type _s390x_AES_encrypt,\@function | 280 | .type _s390x_AES_encrypt,\@function |
| 234 | .align 16 | 281 | .align 16 |
| 235 | _s390x_AES_encrypt: | 282 | _s390x_AES_encrypt: |
| 236 | stg $ra,152($sp) | 283 | st${g} $ra,15*$SIZE_T($sp) |
| 237 | x $s0,0($key) | 284 | x $s0,0($key) |
| 238 | x $s1,4($key) | 285 | x $s1,4($key) |
| 239 | x $s2,8($key) | 286 | x $s2,8($key) |
| @@ -397,7 +444,7 @@ _s390x_AES_encrypt: | |||
| 397 | or $s2,$i3 | 444 | or $s2,$i3 |
| 398 | or $s3,$t3 | 445 | or $s3,$t3 |
| 399 | 446 | ||
| 400 | lg $ra,152($sp) | 447 | l${g} $ra,15*$SIZE_T($sp) |
| 401 | xr $s0,$t0 | 448 | xr $s0,$t0 |
| 402 | xr $s1,$t2 | 449 | xr $s1,$t2 |
| 403 | x $s2,24($key) | 450 | x $s2,24($key) |
| @@ -536,7 +583,7 @@ $code.=<<___ if (!$softonly); | |||
| 536 | .Ldsoft: | 583 | .Ldsoft: |
| 537 | ___ | 584 | ___ |
| 538 | $code.=<<___; | 585 | $code.=<<___; |
| 539 | stmg %r3,$ra,24($sp) | 586 | stm${g} %r3,$ra,3*$SIZE_T($sp) |
| 540 | 587 | ||
| 541 | llgf $s0,0($inp) | 588 | llgf $s0,0($inp) |
| 542 | llgf $s1,4($inp) | 589 | llgf $s1,4($inp) |
| @@ -546,20 +593,20 @@ $code.=<<___; | |||
| 546 | larl $tbl,AES_Td | 593 | larl $tbl,AES_Td |
| 547 | bras $ra,_s390x_AES_decrypt | 594 | bras $ra,_s390x_AES_decrypt |
| 548 | 595 | ||
| 549 | lg $out,24($sp) | 596 | l${g} $out,3*$SIZE_T($sp) |
| 550 | st $s0,0($out) | 597 | st $s0,0($out) |
| 551 | st $s1,4($out) | 598 | st $s1,4($out) |
| 552 | st $s2,8($out) | 599 | st $s2,8($out) |
| 553 | st $s3,12($out) | 600 | st $s3,12($out) |
| 554 | 601 | ||
| 555 | lmg %r6,$ra,48($sp) | 602 | lm${g} %r6,$ra,6*$SIZE_T($sp) |
| 556 | br $ra | 603 | br $ra |
| 557 | .size AES_decrypt,.-AES_decrypt | 604 | .size AES_decrypt,.-AES_decrypt |
| 558 | 605 | ||
| 559 | .type _s390x_AES_decrypt,\@function | 606 | .type _s390x_AES_decrypt,\@function |
| 560 | .align 16 | 607 | .align 16 |
| 561 | _s390x_AES_decrypt: | 608 | _s390x_AES_decrypt: |
| 562 | stg $ra,152($sp) | 609 | st${g} $ra,15*$SIZE_T($sp) |
| 563 | x $s0,0($key) | 610 | x $s0,0($key) |
| 564 | x $s1,4($key) | 611 | x $s1,4($key) |
| 565 | x $s2,8($key) | 612 | x $s2,8($key) |
| @@ -703,7 +750,7 @@ _s390x_AES_decrypt: | |||
| 703 | nr $i1,$mask | 750 | nr $i1,$mask |
| 704 | nr $i2,$mask | 751 | nr $i2,$mask |
| 705 | 752 | ||
| 706 | lg $ra,152($sp) | 753 | l${g} $ra,15*$SIZE_T($sp) |
| 707 | or $s1,$t1 | 754 | or $s1,$t1 |
| 708 | l $t0,16($key) | 755 | l $t0,16($key) |
| 709 | l $t1,20($key) | 756 | l $t1,20($key) |
| @@ -732,14 +779,15 @@ ___ | |||
| 732 | $code.=<<___; | 779 | $code.=<<___; |
| 733 | # void AES_set_encrypt_key(const unsigned char *in, int bits, | 780 | # void AES_set_encrypt_key(const unsigned char *in, int bits, |
| 734 | # AES_KEY *key) { | 781 | # AES_KEY *key) { |
| 735 | .globl AES_set_encrypt_key | 782 | .globl private_AES_set_encrypt_key |
| 736 | .type AES_set_encrypt_key,\@function | 783 | .type private_AES_set_encrypt_key,\@function |
| 737 | .align 16 | 784 | .align 16 |
| 738 | AES_set_encrypt_key: | 785 | private_AES_set_encrypt_key: |
| 786 | _s390x_AES_set_encrypt_key: | ||
| 739 | lghi $t0,0 | 787 | lghi $t0,0 |
| 740 | clgr $inp,$t0 | 788 | cl${g}r $inp,$t0 |
| 741 | je .Lminus1 | 789 | je .Lminus1 |
| 742 | clgr $key,$t0 | 790 | cl${g}r $key,$t0 |
| 743 | je .Lminus1 | 791 | je .Lminus1 |
| 744 | 792 | ||
| 745 | lghi $t0,128 | 793 | lghi $t0,128 |
| @@ -789,7 +837,8 @@ $code.=<<___ if (!$softonly); | |||
| 789 | je 1f | 837 | je 1f |
| 790 | lg %r1,24($inp) | 838 | lg %r1,24($inp) |
| 791 | stg %r1,24($key) | 839 | stg %r1,24($key) |
| 792 | 1: st $bits,236($key) # save bits | 840 | 1: st $bits,236($key) # save bits [for debugging purposes] |
| 841 | lgr $t0,%r5 | ||
| 793 | st %r5,240($key) # save km code | 842 | st %r5,240($key) # save km code |
| 794 | lghi %r2,0 | 843 | lghi %r2,0 |
| 795 | br %r14 | 844 | br %r14 |
| @@ -797,7 +846,7 @@ ___ | |||
| 797 | $code.=<<___; | 846 | $code.=<<___; |
| 798 | .align 16 | 847 | .align 16 |
| 799 | .Lekey_internal: | 848 | .Lekey_internal: |
| 800 | stmg %r6,%r13,48($sp) # all non-volatile regs | 849 | stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key |
| 801 | 850 | ||
| 802 | larl $tbl,AES_Te+2048 | 851 | larl $tbl,AES_Te+2048 |
| 803 | 852 | ||
| @@ -857,8 +906,9 @@ $code.=<<___; | |||
| 857 | la $key,16($key) # key+=4 | 906 | la $key,16($key) # key+=4 |
| 858 | la $t3,4($t3) # i++ | 907 | la $t3,4($t3) # i++ |
| 859 | brct $rounds,.L128_loop | 908 | brct $rounds,.L128_loop |
| 909 | lghi $t0,10 | ||
| 860 | lghi %r2,0 | 910 | lghi %r2,0 |
| 861 | lmg %r6,%r13,48($sp) | 911 | lm${g} %r4,%r13,4*$SIZE_T($sp) |
| 862 | br $ra | 912 | br $ra |
| 863 | 913 | ||
| 864 | .align 16 | 914 | .align 16 |
| @@ -905,8 +955,9 @@ $code.=<<___; | |||
| 905 | st $s2,32($key) | 955 | st $s2,32($key) |
| 906 | st $s3,36($key) | 956 | st $s3,36($key) |
| 907 | brct $rounds,.L192_continue | 957 | brct $rounds,.L192_continue |
| 958 | lghi $t0,12 | ||
| 908 | lghi %r2,0 | 959 | lghi %r2,0 |
| 909 | lmg %r6,%r13,48($sp) | 960 | lm${g} %r4,%r13,4*$SIZE_T($sp) |
| 910 | br $ra | 961 | br $ra |
| 911 | 962 | ||
| 912 | .align 16 | 963 | .align 16 |
| @@ -967,8 +1018,9 @@ $code.=<<___; | |||
| 967 | st $s2,40($key) | 1018 | st $s2,40($key) |
| 968 | st $s3,44($key) | 1019 | st $s3,44($key) |
| 969 | brct $rounds,.L256_continue | 1020 | brct $rounds,.L256_continue |
| 1021 | lghi $t0,14 | ||
| 970 | lghi %r2,0 | 1022 | lghi %r2,0 |
| 971 | lmg %r6,%r13,48($sp) | 1023 | lm${g} %r4,%r13,4*$SIZE_T($sp) |
| 972 | br $ra | 1024 | br $ra |
| 973 | 1025 | ||
| 974 | .align 16 | 1026 | .align 16 |
| @@ -1011,42 +1063,34 @@ $code.=<<___; | |||
| 1011 | .Lminus1: | 1063 | .Lminus1: |
| 1012 | lghi %r2,-1 | 1064 | lghi %r2,-1 |
| 1013 | br $ra | 1065 | br $ra |
| 1014 | .size AES_set_encrypt_key,.-AES_set_encrypt_key | 1066 | .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key |
| 1015 | 1067 | ||
| 1016 | # void AES_set_decrypt_key(const unsigned char *in, int bits, | 1068 | # void AES_set_decrypt_key(const unsigned char *in, int bits, |
| 1017 | # AES_KEY *key) { | 1069 | # AES_KEY *key) { |
| 1018 | .globl AES_set_decrypt_key | 1070 | .globl private_AES_set_decrypt_key |
| 1019 | .type AES_set_decrypt_key,\@function | 1071 | .type private_AES_set_decrypt_key,\@function |
| 1020 | .align 16 | 1072 | .align 16 |
| 1021 | AES_set_decrypt_key: | 1073 | private_AES_set_decrypt_key: |
| 1022 | stg $key,32($sp) # I rely on AES_set_encrypt_key to | 1074 | #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to |
| 1023 | stg $ra,112($sp) # save non-volatile registers! | 1075 | st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key! |
| 1024 | bras $ra,AES_set_encrypt_key | 1076 | bras $ra,_s390x_AES_set_encrypt_key |
| 1025 | lg $key,32($sp) | 1077 | #l${g} $key,4*$SIZE_T($sp) |
| 1026 | lg $ra,112($sp) | 1078 | l${g} $ra,14*$SIZE_T($sp) |
| 1027 | ltgr %r2,%r2 | 1079 | ltgr %r2,%r2 |
| 1028 | bnzr $ra | 1080 | bnzr $ra |
| 1029 | ___ | 1081 | ___ |
| 1030 | $code.=<<___ if (!$softonly); | 1082 | $code.=<<___ if (!$softonly); |
| 1031 | l $t0,240($key) | 1083 | #l $t0,240($key) |
| 1032 | lhi $t1,16 | 1084 | lhi $t1,16 |
| 1033 | cr $t0,$t1 | 1085 | cr $t0,$t1 |
| 1034 | jl .Lgo | 1086 | jl .Lgo |
| 1035 | oill $t0,0x80 # set "decrypt" bit | 1087 | oill $t0,0x80 # set "decrypt" bit |
| 1036 | st $t0,240($key) | 1088 | st $t0,240($key) |
| 1037 | br $ra | 1089 | br $ra |
| 1038 | |||
| 1039 | .align 16 | ||
| 1040 | .Ldkey_internal: | ||
| 1041 | stg $key,32($sp) | ||
| 1042 | stg $ra,40($sp) | ||
| 1043 | bras $ra,.Lekey_internal | ||
| 1044 | lg $key,32($sp) | ||
| 1045 | lg $ra,40($sp) | ||
| 1046 | ___ | 1090 | ___ |
| 1047 | $code.=<<___; | 1091 | $code.=<<___; |
| 1048 | 1092 | .align 16 | |
| 1049 | .Lgo: llgf $rounds,240($key) | 1093 | .Lgo: lgr $rounds,$t0 #llgf $rounds,240($key) |
| 1050 | la $i1,0($key) | 1094 | la $i1,0($key) |
| 1051 | sllg $i2,$rounds,4 | 1095 | sllg $i2,$rounds,4 |
| 1052 | la $i2,0($i2,$key) | 1096 | la $i2,0($i2,$key) |
| @@ -1123,13 +1167,14 @@ $code.=<<___; | |||
| 1123 | la $key,4($key) | 1167 | la $key,4($key) |
| 1124 | brct $rounds,.Lmix | 1168 | brct $rounds,.Lmix |
| 1125 | 1169 | ||
| 1126 | lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! | 1170 | lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key! |
| 1127 | lghi %r2,0 | 1171 | lghi %r2,0 |
| 1128 | br $ra | 1172 | br $ra |
| 1129 | .size AES_set_decrypt_key,.-AES_set_decrypt_key | 1173 | .size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key |
| 1130 | ___ | 1174 | ___ |
| 1131 | 1175 | ||
| 1132 | #void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | 1176 | ######################################################################## |
| 1177 | # void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | ||
| 1133 | # size_t length, const AES_KEY *key, | 1178 | # size_t length, const AES_KEY *key, |
| 1134 | # unsigned char *ivec, const int enc) | 1179 | # unsigned char *ivec, const int enc) |
| 1135 | { | 1180 | { |
| @@ -1163,7 +1208,7 @@ $code.=<<___ if (!$softonly); | |||
| 1163 | l %r0,240($key) # load kmc code | 1208 | l %r0,240($key) # load kmc code |
| 1164 | lghi $key,15 # res=len%16, len-=res; | 1209 | lghi $key,15 # res=len%16, len-=res; |
| 1165 | ngr $key,$len | 1210 | ngr $key,$len |
| 1166 | slgr $len,$key | 1211 | sl${g}r $len,$key |
| 1167 | la %r1,16($sp) # parameter block - ivec || key | 1212 | la %r1,16($sp) # parameter block - ivec || key |
| 1168 | jz .Lkmc_truncated | 1213 | jz .Lkmc_truncated |
| 1169 | .long 0xb92f0042 # kmc %r4,%r2 | 1214 | .long 0xb92f0042 # kmc %r4,%r2 |
| @@ -1181,34 +1226,34 @@ $code.=<<___ if (!$softonly); | |||
| 1181 | tmll %r0,0x80 | 1226 | tmll %r0,0x80 |
| 1182 | jnz .Lkmc_truncated_dec | 1227 | jnz .Lkmc_truncated_dec |
| 1183 | lghi %r1,0 | 1228 | lghi %r1,0 |
| 1184 | stg %r1,128($sp) | 1229 | stg %r1,16*$SIZE_T($sp) |
| 1185 | stg %r1,136($sp) | 1230 | stg %r1,16*$SIZE_T+8($sp) |
| 1186 | bras %r1,1f | 1231 | bras %r1,1f |
| 1187 | mvc 128(1,$sp),0($inp) | 1232 | mvc 16*$SIZE_T(1,$sp),0($inp) |
| 1188 | 1: ex $key,0(%r1) | 1233 | 1: ex $key,0(%r1) |
| 1189 | la %r1,16($sp) # restore parameter block | 1234 | la %r1,16($sp) # restore parameter block |
| 1190 | la $inp,128($sp) | 1235 | la $inp,16*$SIZE_T($sp) |
| 1191 | lghi $len,16 | 1236 | lghi $len,16 |
| 1192 | .long 0xb92f0042 # kmc %r4,%r2 | 1237 | .long 0xb92f0042 # kmc %r4,%r2 |
| 1193 | j .Lkmc_done | 1238 | j .Lkmc_done |
| 1194 | .align 16 | 1239 | .align 16 |
| 1195 | .Lkmc_truncated_dec: | 1240 | .Lkmc_truncated_dec: |
| 1196 | stg $out,64($sp) | 1241 | st${g} $out,4*$SIZE_T($sp) |
| 1197 | la $out,128($sp) | 1242 | la $out,16*$SIZE_T($sp) |
| 1198 | lghi $len,16 | 1243 | lghi $len,16 |
| 1199 | .long 0xb92f0042 # kmc %r4,%r2 | 1244 | .long 0xb92f0042 # kmc %r4,%r2 |
| 1200 | lg $out,64($sp) | 1245 | l${g} $out,4*$SIZE_T($sp) |
| 1201 | bras %r1,2f | 1246 | bras %r1,2f |
| 1202 | mvc 0(1,$out),128($sp) | 1247 | mvc 0(1,$out),16*$SIZE_T($sp) |
| 1203 | 2: ex $key,0(%r1) | 1248 | 2: ex $key,0(%r1) |
| 1204 | j .Lkmc_done | 1249 | j .Lkmc_done |
| 1205 | .align 16 | 1250 | .align 16 |
| 1206 | .Lcbc_software: | 1251 | .Lcbc_software: |
| 1207 | ___ | 1252 | ___ |
| 1208 | $code.=<<___; | 1253 | $code.=<<___; |
| 1209 | stmg $key,$ra,40($sp) | 1254 | stm${g} $key,$ra,5*$SIZE_T($sp) |
| 1210 | lhi %r0,0 | 1255 | lhi %r0,0 |
| 1211 | cl %r0,164($sp) | 1256 | cl %r0,`$stdframe+$SIZE_T-4`($sp) |
| 1212 | je .Lcbc_decrypt | 1257 | je .Lcbc_decrypt |
| 1213 | 1258 | ||
| 1214 | larl $tbl,AES_Te | 1259 | larl $tbl,AES_Te |
| @@ -1219,10 +1264,10 @@ $code.=<<___; | |||
| 1219 | llgf $s3,12($ivp) | 1264 | llgf $s3,12($ivp) |
| 1220 | 1265 | ||
| 1221 | lghi $t0,16 | 1266 | lghi $t0,16 |
| 1222 | slgr $len,$t0 | 1267 | sl${g}r $len,$t0 |
| 1223 | brc 4,.Lcbc_enc_tail # if borrow | 1268 | brc 4,.Lcbc_enc_tail # if borrow |
| 1224 | .Lcbc_enc_loop: | 1269 | .Lcbc_enc_loop: |
| 1225 | stmg $inp,$out,16($sp) | 1270 | stm${g} $inp,$out,2*$SIZE_T($sp) |
| 1226 | x $s0,0($inp) | 1271 | x $s0,0($inp) |
| 1227 | x $s1,4($inp) | 1272 | x $s1,4($inp) |
| 1228 | x $s2,8($inp) | 1273 | x $s2,8($inp) |
| @@ -1231,7 +1276,7 @@ $code.=<<___; | |||
| 1231 | 1276 | ||
| 1232 | bras $ra,_s390x_AES_encrypt | 1277 | bras $ra,_s390x_AES_encrypt |
| 1233 | 1278 | ||
| 1234 | lmg $inp,$key,16($sp) | 1279 | lm${g} $inp,$key,2*$SIZE_T($sp) |
| 1235 | st $s0,0($out) | 1280 | st $s0,0($out) |
| 1236 | st $s1,4($out) | 1281 | st $s1,4($out) |
| 1237 | st $s2,8($out) | 1282 | st $s2,8($out) |
| @@ -1240,33 +1285,33 @@ $code.=<<___; | |||
| 1240 | la $inp,16($inp) | 1285 | la $inp,16($inp) |
| 1241 | la $out,16($out) | 1286 | la $out,16($out) |
| 1242 | lghi $t0,16 | 1287 | lghi $t0,16 |
| 1243 | ltgr $len,$len | 1288 | lt${g}r $len,$len |
| 1244 | jz .Lcbc_enc_done | 1289 | jz .Lcbc_enc_done |
| 1245 | slgr $len,$t0 | 1290 | sl${g}r $len,$t0 |
| 1246 | brc 4,.Lcbc_enc_tail # if borrow | 1291 | brc 4,.Lcbc_enc_tail # if borrow |
| 1247 | j .Lcbc_enc_loop | 1292 | j .Lcbc_enc_loop |
| 1248 | .align 16 | 1293 | .align 16 |
| 1249 | .Lcbc_enc_done: | 1294 | .Lcbc_enc_done: |
| 1250 | lg $ivp,48($sp) | 1295 | l${g} $ivp,6*$SIZE_T($sp) |
| 1251 | st $s0,0($ivp) | 1296 | st $s0,0($ivp) |
| 1252 | st $s1,4($ivp) | 1297 | st $s1,4($ivp) |
| 1253 | st $s2,8($ivp) | 1298 | st $s2,8($ivp) |
| 1254 | st $s3,12($ivp) | 1299 | st $s3,12($ivp) |
| 1255 | 1300 | ||
| 1256 | lmg %r7,$ra,56($sp) | 1301 | lm${g} %r7,$ra,7*$SIZE_T($sp) |
| 1257 | br $ra | 1302 | br $ra |
| 1258 | 1303 | ||
| 1259 | .align 16 | 1304 | .align 16 |
| 1260 | .Lcbc_enc_tail: | 1305 | .Lcbc_enc_tail: |
| 1261 | aghi $len,15 | 1306 | aghi $len,15 |
| 1262 | lghi $t0,0 | 1307 | lghi $t0,0 |
| 1263 | stg $t0,128($sp) | 1308 | stg $t0,16*$SIZE_T($sp) |
| 1264 | stg $t0,136($sp) | 1309 | stg $t0,16*$SIZE_T+8($sp) |
| 1265 | bras $t1,3f | 1310 | bras $t1,3f |
| 1266 | mvc 128(1,$sp),0($inp) | 1311 | mvc 16*$SIZE_T(1,$sp),0($inp) |
| 1267 | 3: ex $len,0($t1) | 1312 | 3: ex $len,0($t1) |
| 1268 | lghi $len,0 | 1313 | lghi $len,0 |
| 1269 | la $inp,128($sp) | 1314 | la $inp,16*$SIZE_T($sp) |
| 1270 | j .Lcbc_enc_loop | 1315 | j .Lcbc_enc_loop |
| 1271 | 1316 | ||
| 1272 | .align 16 | 1317 | .align 16 |
| @@ -1275,10 +1320,10 @@ $code.=<<___; | |||
| 1275 | 1320 | ||
| 1276 | lg $t0,0($ivp) | 1321 | lg $t0,0($ivp) |
| 1277 | lg $t1,8($ivp) | 1322 | lg $t1,8($ivp) |
| 1278 | stmg $t0,$t1,128($sp) | 1323 | stmg $t0,$t1,16*$SIZE_T($sp) |
| 1279 | 1324 | ||
| 1280 | .Lcbc_dec_loop: | 1325 | .Lcbc_dec_loop: |
| 1281 | stmg $inp,$out,16($sp) | 1326 | stm${g} $inp,$out,2*$SIZE_T($sp) |
| 1282 | llgf $s0,0($inp) | 1327 | llgf $s0,0($inp) |
| 1283 | llgf $s1,4($inp) | 1328 | llgf $s1,4($inp) |
| 1284 | llgf $s2,8($inp) | 1329 | llgf $s2,8($inp) |
| @@ -1287,7 +1332,7 @@ $code.=<<___; | |||
| 1287 | 1332 | ||
| 1288 | bras $ra,_s390x_AES_decrypt | 1333 | bras $ra,_s390x_AES_decrypt |
| 1289 | 1334 | ||
| 1290 | lmg $inp,$key,16($sp) | 1335 | lm${g} $inp,$key,2*$SIZE_T($sp) |
| 1291 | sllg $s0,$s0,32 | 1336 | sllg $s0,$s0,32 |
| 1292 | sllg $s2,$s2,32 | 1337 | sllg $s2,$s2,32 |
| 1293 | lr $s0,$s1 | 1338 | lr $s0,$s1 |
| @@ -1295,15 +1340,15 @@ $code.=<<___; | |||
| 1295 | 1340 | ||
| 1296 | lg $t0,0($inp) | 1341 | lg $t0,0($inp) |
| 1297 | lg $t1,8($inp) | 1342 | lg $t1,8($inp) |
| 1298 | xg $s0,128($sp) | 1343 | xg $s0,16*$SIZE_T($sp) |
| 1299 | xg $s2,136($sp) | 1344 | xg $s2,16*$SIZE_T+8($sp) |
| 1300 | lghi $s1,16 | 1345 | lghi $s1,16 |
| 1301 | slgr $len,$s1 | 1346 | sl${g}r $len,$s1 |
| 1302 | brc 4,.Lcbc_dec_tail # if borrow | 1347 | brc 4,.Lcbc_dec_tail # if borrow |
| 1303 | brc 2,.Lcbc_dec_done # if zero | 1348 | brc 2,.Lcbc_dec_done # if zero |
| 1304 | stg $s0,0($out) | 1349 | stg $s0,0($out) |
| 1305 | stg $s2,8($out) | 1350 | stg $s2,8($out) |
| 1306 | stmg $t0,$t1,128($sp) | 1351 | stmg $t0,$t1,16*$SIZE_T($sp) |
| 1307 | 1352 | ||
| 1308 | la $inp,16($inp) | 1353 | la $inp,16($inp) |
| 1309 | la $out,16($out) | 1354 | la $out,16($out) |
| @@ -1313,7 +1358,7 @@ $code.=<<___; | |||
| 1313 | stg $s0,0($out) | 1358 | stg $s0,0($out) |
| 1314 | stg $s2,8($out) | 1359 | stg $s2,8($out) |
| 1315 | .Lcbc_dec_exit: | 1360 | .Lcbc_dec_exit: |
| 1316 | lmg $ivp,$ra,48($sp) | 1361 | lm${g} %r6,$ra,6*$SIZE_T($sp) |
| 1317 | stmg $t0,$t1,0($ivp) | 1362 | stmg $t0,$t1,0($ivp) |
| 1318 | 1363 | ||
| 1319 | br $ra | 1364 | br $ra |
| @@ -1321,19 +1366,889 @@ $code.=<<___; | |||
| 1321 | .align 16 | 1366 | .align 16 |
| 1322 | .Lcbc_dec_tail: | 1367 | .Lcbc_dec_tail: |
| 1323 | aghi $len,15 | 1368 | aghi $len,15 |
| 1324 | stg $s0,128($sp) | 1369 | stg $s0,16*$SIZE_T($sp) |
| 1325 | stg $s2,136($sp) | 1370 | stg $s2,16*$SIZE_T+8($sp) |
| 1326 | bras $s1,4f | 1371 | bras $s1,4f |
| 1327 | mvc 0(1,$out),128($sp) | 1372 | mvc 0(1,$out),16*$SIZE_T($sp) |
| 1328 | 4: ex $len,0($s1) | 1373 | 4: ex $len,0($s1) |
| 1329 | j .Lcbc_dec_exit | 1374 | j .Lcbc_dec_exit |
| 1330 | .size AES_cbc_encrypt,.-AES_cbc_encrypt | 1375 | .size AES_cbc_encrypt,.-AES_cbc_encrypt |
| 1331 | .comm OPENSSL_s390xcap_P,8,8 | 1376 | ___ |
| 1377 | } | ||
| 1378 | ######################################################################## | ||
| 1379 | # void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out, | ||
| 1380 | # size_t blocks, const AES_KEY *key, | ||
| 1381 | # const unsigned char *ivec) | ||
| 1382 | { | ||
| 1383 | my $inp="%r2"; | ||
| 1384 | my $out="%r4"; # blocks and out are swapped | ||
| 1385 | my $len="%r3"; | ||
| 1386 | my $key="%r5"; my $iv0="%r5"; | ||
| 1387 | my $ivp="%r6"; | ||
| 1388 | my $fp ="%r7"; | ||
| 1389 | |||
| 1390 | $code.=<<___; | ||
| 1391 | .globl AES_ctr32_encrypt | ||
| 1392 | .type AES_ctr32_encrypt,\@function | ||
| 1393 | .align 16 | ||
| 1394 | AES_ctr32_encrypt: | ||
| 1395 | xgr %r3,%r4 # flip %r3 and %r4, $out and $len | ||
| 1396 | xgr %r4,%r3 | ||
| 1397 | xgr %r3,%r4 | ||
| 1398 | llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case | ||
| 1399 | ___ | ||
| 1400 | $code.=<<___ if (!$softonly); | ||
| 1401 | l %r0,240($key) | ||
| 1402 | lhi %r1,16 | ||
| 1403 | clr %r0,%r1 | ||
| 1404 | jl .Lctr32_software | ||
| 1405 | |||
| 1406 | stm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 1407 | |||
| 1408 | slgr $out,$inp | ||
| 1409 | la %r1,0($key) # %r1 is permanent copy of $key | ||
| 1410 | lg $iv0,0($ivp) # load ivec | ||
| 1411 | lg $ivp,8($ivp) | ||
| 1412 | |||
| 1413 | # prepare and allocate stack frame at the top of 4K page | ||
| 1414 | # with 1K reserved for eventual signal handling | ||
| 1415 | lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer | ||
| 1416 | lghi $s1,-4096 | ||
| 1417 | algr $s0,$sp | ||
| 1418 | lgr $fp,$sp | ||
| 1419 | ngr $s0,$s1 # align at page boundary | ||
| 1420 | slgr $fp,$s0 # total buffer size | ||
| 1421 | lgr $s2,$sp | ||
| 1422 | lghi $s1,1024+16 # sl[g]fi is extended-immediate facility | ||
| 1423 | slgr $fp,$s1 # deduct reservation to get usable buffer size | ||
| 1424 | # buffer size is at lest 256 and at most 3072+256-16 | ||
| 1425 | |||
| 1426 | la $sp,1024($s0) # alloca | ||
| 1427 | srlg $fp,$fp,4 # convert bytes to blocks, minimum 16 | ||
| 1428 | st${g} $s2,0($sp) # back-chain | ||
| 1429 | st${g} $fp,$SIZE_T($sp) | ||
| 1430 | |||
| 1431 | slgr $len,$fp | ||
| 1432 | brc 1,.Lctr32_hw_switch # not zero, no borrow | ||
| 1433 | algr $fp,$len # input is shorter than allocated buffer | ||
| 1434 | lghi $len,0 | ||
| 1435 | st${g} $fp,$SIZE_T($sp) | ||
| 1436 | |||
| 1437 | .Lctr32_hw_switch: | ||
| 1438 | ___ | ||
| 1439 | $code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower | ||
| 1440 | larl $s0,OPENSSL_s390xcap_P | ||
| 1441 | lg $s0,8($s0) | ||
| 1442 | tmhh $s0,0x0004 # check for message_security-assist-4 | ||
| 1443 | jz .Lctr32_km_loop | ||
| 1444 | |||
| 1445 | llgfr $s0,%r0 | ||
| 1446 | lgr $s1,%r1 | ||
| 1447 | lghi %r0,0 | ||
| 1448 | la %r1,16($sp) | ||
| 1449 | .long 0xb92d2042 # kmctr %r4,%r2,%r2 | ||
| 1450 | |||
| 1451 | llihh %r0,0x8000 # check if kmctr supports the function code | ||
| 1452 | srlg %r0,%r0,0($s0) | ||
| 1453 | ng %r0,16($sp) | ||
| 1454 | lgr %r0,$s0 | ||
| 1455 | lgr %r1,$s1 | ||
| 1456 | jz .Lctr32_km_loop | ||
| 1457 | |||
| 1458 | ####### kmctr code | ||
| 1459 | algr $out,$inp # restore $out | ||
| 1460 | lgr $s1,$len # $s1 undertakes $len | ||
| 1461 | j .Lctr32_kmctr_loop | ||
| 1462 | .align 16 | ||
| 1463 | .Lctr32_kmctr_loop: | ||
| 1464 | la $s2,16($sp) | ||
| 1465 | lgr $s3,$fp | ||
| 1466 | .Lctr32_kmctr_prepare: | ||
| 1467 | stg $iv0,0($s2) | ||
| 1468 | stg $ivp,8($s2) | ||
| 1469 | la $s2,16($s2) | ||
| 1470 | ahi $ivp,1 # 32-bit increment, preserves upper half | ||
| 1471 | brct $s3,.Lctr32_kmctr_prepare | ||
| 1472 | |||
| 1473 | #la $inp,0($inp) # inp | ||
| 1474 | sllg $len,$fp,4 # len | ||
| 1475 | #la $out,0($out) # out | ||
| 1476 | la $s2,16($sp) # iv | ||
| 1477 | .long 0xb92da042 # kmctr $out,$s2,$inp | ||
| 1478 | brc 1,.-4 # pay attention to "partial completion" | ||
| 1479 | |||
| 1480 | slgr $s1,$fp | ||
| 1481 | brc 1,.Lctr32_kmctr_loop # not zero, no borrow | ||
| 1482 | algr $fp,$s1 | ||
| 1483 | lghi $s1,0 | ||
| 1484 | brc 4+1,.Lctr32_kmctr_loop # not zero | ||
| 1485 | |||
| 1486 | l${g} $sp,0($sp) | ||
| 1487 | lm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 1488 | br $ra | ||
| 1489 | .align 16 | ||
| 1490 | ___ | ||
| 1491 | $code.=<<___; | ||
| 1492 | .Lctr32_km_loop: | ||
| 1493 | la $s2,16($sp) | ||
| 1494 | lgr $s3,$fp | ||
| 1495 | .Lctr32_km_prepare: | ||
| 1496 | stg $iv0,0($s2) | ||
| 1497 | stg $ivp,8($s2) | ||
| 1498 | la $s2,16($s2) | ||
| 1499 | ahi $ivp,1 # 32-bit increment, preserves upper half | ||
| 1500 | brct $s3,.Lctr32_km_prepare | ||
| 1501 | |||
| 1502 | la $s0,16($sp) # inp | ||
| 1503 | sllg $s1,$fp,4 # len | ||
| 1504 | la $s2,16($sp) # out | ||
| 1505 | .long 0xb92e00a8 # km %r10,%r8 | ||
| 1506 | brc 1,.-4 # pay attention to "partial completion" | ||
| 1507 | |||
| 1508 | la $s2,16($sp) | ||
| 1509 | lgr $s3,$fp | ||
| 1510 | slgr $s2,$inp | ||
| 1511 | .Lctr32_km_xor: | ||
| 1512 | lg $s0,0($inp) | ||
| 1513 | lg $s1,8($inp) | ||
| 1514 | xg $s0,0($s2,$inp) | ||
| 1515 | xg $s1,8($s2,$inp) | ||
| 1516 | stg $s0,0($out,$inp) | ||
| 1517 | stg $s1,8($out,$inp) | ||
| 1518 | la $inp,16($inp) | ||
| 1519 | brct $s3,.Lctr32_km_xor | ||
| 1520 | |||
| 1521 | slgr $len,$fp | ||
| 1522 | brc 1,.Lctr32_km_loop # not zero, no borrow | ||
| 1523 | algr $fp,$len | ||
| 1524 | lghi $len,0 | ||
| 1525 | brc 4+1,.Lctr32_km_loop # not zero | ||
| 1526 | |||
| 1527 | l${g} $s0,0($sp) | ||
| 1528 | l${g} $s1,$SIZE_T($sp) | ||
| 1529 | la $s2,16($sp) | ||
| 1530 | .Lctr32_km_zap: | ||
| 1531 | stg $s0,0($s2) | ||
| 1532 | stg $s0,8($s2) | ||
| 1533 | la $s2,16($s2) | ||
| 1534 | brct $s1,.Lctr32_km_zap | ||
| 1535 | |||
| 1536 | la $sp,0($s0) | ||
| 1537 | lm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 1538 | br $ra | ||
| 1539 | .align 16 | ||
| 1540 | .Lctr32_software: | ||
| 1541 | ___ | ||
| 1542 | $code.=<<___; | ||
| 1543 | stm${g} $key,$ra,5*$SIZE_T($sp) | ||
| 1544 | sl${g}r $inp,$out | ||
| 1545 | larl $tbl,AES_Te | ||
| 1546 | llgf $t1,12($ivp) | ||
| 1547 | |||
| 1548 | .Lctr32_loop: | ||
| 1549 | stm${g} $inp,$out,2*$SIZE_T($sp) | ||
| 1550 | llgf $s0,0($ivp) | ||
| 1551 | llgf $s1,4($ivp) | ||
| 1552 | llgf $s2,8($ivp) | ||
| 1553 | lgr $s3,$t1 | ||
| 1554 | st $t1,16*$SIZE_T($sp) | ||
| 1555 | lgr %r4,$key | ||
| 1556 | |||
| 1557 | bras $ra,_s390x_AES_encrypt | ||
| 1558 | |||
| 1559 | lm${g} $inp,$ivp,2*$SIZE_T($sp) | ||
| 1560 | llgf $t1,16*$SIZE_T($sp) | ||
| 1561 | x $s0,0($inp,$out) | ||
| 1562 | x $s1,4($inp,$out) | ||
| 1563 | x $s2,8($inp,$out) | ||
| 1564 | x $s3,12($inp,$out) | ||
| 1565 | stm $s0,$s3,0($out) | ||
| 1566 | |||
| 1567 | la $out,16($out) | ||
| 1568 | ahi $t1,1 # 32-bit increment | ||
| 1569 | brct $len,.Lctr32_loop | ||
| 1570 | |||
| 1571 | lm${g} %r6,$ra,6*$SIZE_T($sp) | ||
| 1572 | br $ra | ||
| 1573 | .size AES_ctr32_encrypt,.-AES_ctr32_encrypt | ||
| 1574 | ___ | ||
| 1575 | } | ||
| 1576 | |||
| 1577 | ######################################################################## | ||
| 1578 | # void AES_xts_encrypt(const char *inp,char *out,size_t len, | ||
| 1579 | # const AES_KEY *key1, const AES_KEY *key2, | ||
| 1580 | # const unsigned char iv[16]); | ||
| 1581 | # | ||
| 1582 | { | ||
| 1583 | my $inp="%r2"; | ||
| 1584 | my $out="%r4"; # len and out are swapped | ||
| 1585 | my $len="%r3"; | ||
| 1586 | my $key1="%r5"; # $i1 | ||
| 1587 | my $key2="%r6"; # $i2 | ||
| 1588 | my $fp="%r7"; # $i3 | ||
| 1589 | my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame... | ||
| 1590 | |||
| 1591 | $code.=<<___; | ||
| 1592 | .type _s390x_xts_km,\@function | ||
| 1593 | .align 16 | ||
| 1594 | _s390x_xts_km: | ||
| 1595 | ___ | ||
| 1596 | $code.=<<___ if(1); | ||
| 1597 | llgfr $s0,%r0 # put aside the function code | ||
| 1598 | lghi $s1,0x7f | ||
| 1599 | nr $s1,%r0 | ||
| 1600 | lghi %r0,0 # query capability vector | ||
| 1601 | la %r1,2*$SIZE_T($sp) | ||
| 1602 | .long 0xb92e0042 # km %r4,%r2 | ||
| 1603 | llihh %r1,0x8000 | ||
| 1604 | srlg %r1,%r1,32($s1) # check for 32+function code | ||
| 1605 | ng %r1,2*$SIZE_T($sp) | ||
| 1606 | lgr %r0,$s0 # restore the function code | ||
| 1607 | la %r1,0($key1) # restore $key1 | ||
| 1608 | jz .Lxts_km_vanilla | ||
| 1609 | |||
| 1610 | lmg $i2,$i3,$tweak($sp) # put aside the tweak value | ||
| 1611 | algr $out,$inp | ||
| 1612 | |||
| 1613 | oill %r0,32 # switch to xts function code | ||
| 1614 | aghi $s1,-18 # | ||
| 1615 | sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16 | ||
| 1616 | la %r1,$tweak-16($sp) | ||
| 1617 | slgr %r1,$s1 # parameter block position | ||
| 1618 | lmg $s0,$s3,0($key1) # load 256 bits of key material, | ||
| 1619 | stmg $s0,$s3,0(%r1) # and copy it to parameter block. | ||
| 1620 | # yes, it contains junk and overlaps | ||
| 1621 | # with the tweak in 128-bit case. | ||
| 1622 | # it's done to avoid conditional | ||
| 1623 | # branch. | ||
| 1624 | stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value | ||
| 1625 | |||
| 1626 | .long 0xb92e0042 # km %r4,%r2 | ||
| 1627 | brc 1,.-4 # pay attention to "partial completion" | ||
| 1628 | |||
| 1629 | lrvg $s0,$tweak+0($sp) # load the last tweak | ||
| 1630 | lrvg $s1,$tweak+8($sp) | ||
| 1631 | stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key | ||
| 1632 | |||
| 1633 | nill %r0,0xffdf # switch back to original function code | ||
| 1634 | la %r1,0($key1) # restore pointer to $key1 | ||
| 1635 | slgr $out,$inp | ||
| 1636 | |||
| 1637 | llgc $len,2*$SIZE_T-1($sp) | ||
| 1638 | nill $len,0x0f # $len%=16 | ||
| 1639 | br $ra | ||
| 1640 | |||
| 1641 | .align 16 | ||
| 1642 | .Lxts_km_vanilla: | ||
| 1643 | ___ | ||
| 1644 | $code.=<<___; | ||
| 1645 | # prepare and allocate stack frame at the top of 4K page | ||
| 1646 | # with 1K reserved for eventual signal handling | ||
| 1647 | lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer | ||
| 1648 | lghi $s1,-4096 | ||
| 1649 | algr $s0,$sp | ||
| 1650 | lgr $fp,$sp | ||
| 1651 | ngr $s0,$s1 # align at page boundary | ||
| 1652 | slgr $fp,$s0 # total buffer size | ||
| 1653 | lgr $s2,$sp | ||
| 1654 | lghi $s1,1024+16 # sl[g]fi is extended-immediate facility | ||
| 1655 | slgr $fp,$s1 # deduct reservation to get usable buffer size | ||
| 1656 | # buffer size is at lest 256 and at most 3072+256-16 | ||
| 1657 | |||
| 1658 | la $sp,1024($s0) # alloca | ||
| 1659 | nill $fp,0xfff0 # round to 16*n | ||
| 1660 | st${g} $s2,0($sp) # back-chain | ||
| 1661 | nill $len,0xfff0 # redundant | ||
| 1662 | st${g} $fp,$SIZE_T($sp) | ||
| 1663 | |||
| 1664 | slgr $len,$fp | ||
| 1665 | brc 1,.Lxts_km_go # not zero, no borrow | ||
| 1666 | algr $fp,$len # input is shorter than allocated buffer | ||
| 1667 | lghi $len,0 | ||
| 1668 | st${g} $fp,$SIZE_T($sp) | ||
| 1669 | |||
| 1670 | .Lxts_km_go: | ||
| 1671 | lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian | ||
| 1672 | lrvg $s1,$tweak+8($s2) | ||
| 1673 | |||
| 1674 | la $s2,16($sp) # vector of ascending tweak values | ||
| 1675 | slgr $s2,$inp | ||
| 1676 | srlg $s3,$fp,4 | ||
| 1677 | j .Lxts_km_start | ||
| 1678 | |||
| 1679 | .Lxts_km_loop: | ||
| 1680 | la $s2,16($sp) | ||
| 1681 | slgr $s2,$inp | ||
| 1682 | srlg $s3,$fp,4 | ||
| 1683 | .Lxts_km_prepare: | ||
| 1684 | lghi $i1,0x87 | ||
| 1685 | srag $i2,$s1,63 # broadcast upper bit | ||
| 1686 | ngr $i1,$i2 # rem | ||
| 1687 | srlg $i2,$s0,63 # carry bit from lower half | ||
| 1688 | sllg $s0,$s0,1 | ||
| 1689 | sllg $s1,$s1,1 | ||
| 1690 | xgr $s0,$i1 | ||
| 1691 | ogr $s1,$i2 | ||
| 1692 | .Lxts_km_start: | ||
| 1693 | lrvgr $i1,$s0 # flip byte order | ||
| 1694 | lrvgr $i2,$s1 | ||
| 1695 | stg $i1,0($s2,$inp) | ||
| 1696 | stg $i2,8($s2,$inp) | ||
| 1697 | xg $i1,0($inp) | ||
| 1698 | xg $i2,8($inp) | ||
| 1699 | stg $i1,0($out,$inp) | ||
| 1700 | stg $i2,8($out,$inp) | ||
| 1701 | la $inp,16($inp) | ||
| 1702 | brct $s3,.Lxts_km_prepare | ||
| 1703 | |||
| 1704 | slgr $inp,$fp # rewind $inp | ||
| 1705 | la $s2,0($out,$inp) | ||
| 1706 | lgr $s3,$fp | ||
| 1707 | .long 0xb92e00aa # km $s2,$s2 | ||
| 1708 | brc 1,.-4 # pay attention to "partial completion" | ||
| 1709 | |||
| 1710 | la $s2,16($sp) | ||
| 1711 | slgr $s2,$inp | ||
| 1712 | srlg $s3,$fp,4 | ||
| 1713 | .Lxts_km_xor: | ||
| 1714 | lg $i1,0($out,$inp) | ||
| 1715 | lg $i2,8($out,$inp) | ||
| 1716 | xg $i1,0($s2,$inp) | ||
| 1717 | xg $i2,8($s2,$inp) | ||
| 1718 | stg $i1,0($out,$inp) | ||
| 1719 | stg $i2,8($out,$inp) | ||
| 1720 | la $inp,16($inp) | ||
| 1721 | brct $s3,.Lxts_km_xor | ||
| 1722 | |||
| 1723 | slgr $len,$fp | ||
| 1724 | brc 1,.Lxts_km_loop # not zero, no borrow | ||
| 1725 | algr $fp,$len | ||
| 1726 | lghi $len,0 | ||
| 1727 | brc 4+1,.Lxts_km_loop # not zero | ||
| 1728 | |||
| 1729 | l${g} $i1,0($sp) # back-chain | ||
| 1730 | llgf $fp,`2*$SIZE_T-4`($sp) # bytes used | ||
| 1731 | la $i2,16($sp) | ||
| 1732 | srlg $fp,$fp,4 | ||
| 1733 | .Lxts_km_zap: | ||
| 1734 | stg $i1,0($i2) | ||
| 1735 | stg $i1,8($i2) | ||
| 1736 | la $i2,16($i2) | ||
| 1737 | brct $fp,.Lxts_km_zap | ||
| 1738 | |||
| 1739 | la $sp,0($i1) | ||
| 1740 | llgc $len,2*$SIZE_T-1($i1) | ||
| 1741 | nill $len,0x0f # $len%=16 | ||
| 1742 | bzr $ra | ||
| 1743 | |||
| 1744 | # generate one more tweak... | ||
| 1745 | lghi $i1,0x87 | ||
| 1746 | srag $i2,$s1,63 # broadcast upper bit | ||
| 1747 | ngr $i1,$i2 # rem | ||
| 1748 | srlg $i2,$s0,63 # carry bit from lower half | ||
| 1749 | sllg $s0,$s0,1 | ||
| 1750 | sllg $s1,$s1,1 | ||
| 1751 | xgr $s0,$i1 | ||
| 1752 | ogr $s1,$i2 | ||
| 1753 | |||
| 1754 | ltr $len,$len # clear zero flag | ||
| 1755 | br $ra | ||
| 1756 | .size _s390x_xts_km,.-_s390x_xts_km | ||
| 1757 | |||
| 1758 | .globl AES_xts_encrypt | ||
| 1759 | .type AES_xts_encrypt,\@function | ||
| 1760 | .align 16 | ||
| 1761 | AES_xts_encrypt: | ||
| 1762 | xgr %r3,%r4 # flip %r3 and %r4, $out and $len | ||
| 1763 | xgr %r4,%r3 | ||
| 1764 | xgr %r3,%r4 | ||
| 1765 | ___ | ||
| 1766 | $code.=<<___ if ($SIZE_T==4); | ||
| 1767 | llgfr $len,$len | ||
| 1768 | ___ | ||
| 1769 | $code.=<<___; | ||
| 1770 | st${g} $len,1*$SIZE_T($sp) # save copy of $len | ||
| 1771 | srag $len,$len,4 # formally wrong, because it expands | ||
| 1772 | # sign byte, but who can afford asking | ||
| 1773 | # to process more than 2^63-1 bytes? | ||
| 1774 | # I use it, because it sets condition | ||
| 1775 | # code... | ||
| 1776 | bcr 8,$ra # abort if zero (i.e. less than 16) | ||
| 1777 | ___ | ||
| 1778 | $code.=<<___ if (!$softonly); | ||
| 1779 | llgf %r0,240($key2) | ||
| 1780 | lhi %r1,16 | ||
| 1781 | clr %r0,%r1 | ||
| 1782 | jl .Lxts_enc_software | ||
| 1783 | |||
| 1784 | stm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 1785 | st${g} $ra,14*$SIZE_T($sp) | ||
| 1786 | |||
| 1787 | sllg $len,$len,4 # $len&=~15 | ||
| 1788 | slgr $out,$inp | ||
| 1789 | |||
| 1790 | # generate the tweak value | ||
| 1791 | l${g} $s3,$stdframe($sp) # pointer to iv | ||
| 1792 | la $s2,$tweak($sp) | ||
| 1793 | lmg $s0,$s1,0($s3) | ||
| 1794 | lghi $s3,16 | ||
| 1795 | stmg $s0,$s1,0($s2) | ||
| 1796 | la %r1,0($key2) # $key2 is not needed anymore | ||
| 1797 | .long 0xb92e00aa # km $s2,$s2, generate the tweak | ||
| 1798 | brc 1,.-4 # can this happen? | ||
| 1799 | |||
| 1800 | l %r0,240($key1) | ||
| 1801 | la %r1,0($key1) # $key1 is not needed anymore | ||
| 1802 | bras $ra,_s390x_xts_km | ||
| 1803 | jz .Lxts_enc_km_done | ||
| 1804 | |||
| 1805 | aghi $inp,-16 # take one step back | ||
| 1806 | la $i3,0($out,$inp) # put aside real $out | ||
| 1807 | .Lxts_enc_km_steal: | ||
| 1808 | llgc $i1,16($inp) | ||
| 1809 | llgc $i2,0($out,$inp) | ||
| 1810 | stc $i1,0($out,$inp) | ||
| 1811 | stc $i2,16($out,$inp) | ||
| 1812 | la $inp,1($inp) | ||
| 1813 | brct $len,.Lxts_enc_km_steal | ||
| 1814 | |||
| 1815 | la $s2,0($i3) | ||
| 1816 | lghi $s3,16 | ||
| 1817 | lrvgr $i1,$s0 # flip byte order | ||
| 1818 | lrvgr $i2,$s1 | ||
| 1819 | xg $i1,0($s2) | ||
| 1820 | xg $i2,8($s2) | ||
| 1821 | stg $i1,0($s2) | ||
| 1822 | stg $i2,8($s2) | ||
| 1823 | .long 0xb92e00aa # km $s2,$s2 | ||
| 1824 | brc 1,.-4 # can this happen? | ||
| 1825 | lrvgr $i1,$s0 # flip byte order | ||
| 1826 | lrvgr $i2,$s1 | ||
| 1827 | xg $i1,0($i3) | ||
| 1828 | xg $i2,8($i3) | ||
| 1829 | stg $i1,0($i3) | ||
| 1830 | stg $i2,8($i3) | ||
| 1831 | |||
| 1832 | .Lxts_enc_km_done: | ||
| 1833 | l${g} $ra,14*$SIZE_T($sp) | ||
| 1834 | st${g} $sp,$tweak($sp) # wipe tweak | ||
| 1835 | st${g} $sp,$tweak($sp) | ||
| 1836 | lm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 1837 | br $ra | ||
| 1838 | .align 16 | ||
| 1839 | .Lxts_enc_software: | ||
| 1840 | ___ | ||
| 1841 | $code.=<<___; | ||
| 1842 | stm${g} %r6,$ra,6*$SIZE_T($sp) | ||
| 1843 | |||
| 1844 | slgr $out,$inp | ||
| 1845 | |||
| 1846 | xgr $s0,$s0 # clear upper half | ||
| 1847 | xgr $s1,$s1 | ||
| 1848 | lrv $s0,$stdframe+4($sp) # load secno | ||
| 1849 | lrv $s1,$stdframe+0($sp) | ||
| 1850 | xgr $s2,$s2 | ||
| 1851 | xgr $s3,$s3 | ||
| 1852 | stm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 1853 | la $key,0($key2) | ||
| 1854 | larl $tbl,AES_Te | ||
| 1855 | bras $ra,_s390x_AES_encrypt # generate the tweak | ||
| 1856 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 1857 | stm $s0,$s3,$tweak($sp) # save the tweak | ||
| 1858 | j .Lxts_enc_enter | ||
| 1859 | |||
| 1860 | .align 16 | ||
| 1861 | .Lxts_enc_loop: | ||
| 1862 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
| 1863 | lrvg $s3,$tweak+8($sp) | ||
| 1864 | lghi %r1,0x87 | ||
| 1865 | srag %r0,$s3,63 # broadcast upper bit | ||
| 1866 | ngr %r1,%r0 # rem | ||
| 1867 | srlg %r0,$s1,63 # carry bit from lower half | ||
| 1868 | sllg $s1,$s1,1 | ||
| 1869 | sllg $s3,$s3,1 | ||
| 1870 | xgr $s1,%r1 | ||
| 1871 | ogr $s3,%r0 | ||
| 1872 | lrvgr $s1,$s1 # flip byte order | ||
| 1873 | lrvgr $s3,$s3 | ||
| 1874 | srlg $s0,$s1,32 # smash the tweak to 4x32-bits | ||
| 1875 | stg $s1,$tweak+0($sp) # save the tweak | ||
| 1876 | llgfr $s1,$s1 | ||
| 1877 | srlg $s2,$s3,32 | ||
| 1878 | stg $s3,$tweak+8($sp) | ||
| 1879 | llgfr $s3,$s3 | ||
| 1880 | la $inp,16($inp) # $inp+=16 | ||
| 1881 | .Lxts_enc_enter: | ||
| 1882 | x $s0,0($inp) # ^=*($inp) | ||
| 1883 | x $s1,4($inp) | ||
| 1884 | x $s2,8($inp) | ||
| 1885 | x $s3,12($inp) | ||
| 1886 | stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing | ||
| 1887 | la $key,0($key1) | ||
| 1888 | bras $ra,_s390x_AES_encrypt | ||
| 1889 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 1890 | x $s0,$tweak+0($sp) # ^=tweak | ||
| 1891 | x $s1,$tweak+4($sp) | ||
| 1892 | x $s2,$tweak+8($sp) | ||
| 1893 | x $s3,$tweak+12($sp) | ||
| 1894 | st $s0,0($out,$inp) | ||
| 1895 | st $s1,4($out,$inp) | ||
| 1896 | st $s2,8($out,$inp) | ||
| 1897 | st $s3,12($out,$inp) | ||
| 1898 | brct${g} $len,.Lxts_enc_loop | ||
| 1899 | |||
| 1900 | llgc $len,`2*$SIZE_T-1`($sp) | ||
| 1901 | nill $len,0x0f # $len%16 | ||
| 1902 | jz .Lxts_enc_done | ||
| 1903 | |||
| 1904 | la $i3,0($inp,$out) # put aside real $out | ||
| 1905 | .Lxts_enc_steal: | ||
| 1906 | llgc %r0,16($inp) | ||
| 1907 | llgc %r1,0($out,$inp) | ||
| 1908 | stc %r0,0($out,$inp) | ||
| 1909 | stc %r1,16($out,$inp) | ||
| 1910 | la $inp,1($inp) | ||
| 1911 | brct $len,.Lxts_enc_steal | ||
| 1912 | la $out,0($i3) # restore real $out | ||
| 1913 | |||
| 1914 | # generate last tweak... | ||
| 1915 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
| 1916 | lrvg $s3,$tweak+8($sp) | ||
| 1917 | lghi %r1,0x87 | ||
| 1918 | srag %r0,$s3,63 # broadcast upper bit | ||
| 1919 | ngr %r1,%r0 # rem | ||
| 1920 | srlg %r0,$s1,63 # carry bit from lower half | ||
| 1921 | sllg $s1,$s1,1 | ||
| 1922 | sllg $s3,$s3,1 | ||
| 1923 | xgr $s1,%r1 | ||
| 1924 | ogr $s3,%r0 | ||
| 1925 | lrvgr $s1,$s1 # flip byte order | ||
| 1926 | lrvgr $s3,$s3 | ||
| 1927 | srlg $s0,$s1,32 # smash the tweak to 4x32-bits | ||
| 1928 | stg $s1,$tweak+0($sp) # save the tweak | ||
| 1929 | llgfr $s1,$s1 | ||
| 1930 | srlg $s2,$s3,32 | ||
| 1931 | stg $s3,$tweak+8($sp) | ||
| 1932 | llgfr $s3,$s3 | ||
| 1933 | |||
| 1934 | x $s0,0($out) # ^=*(inp)|stolen cipther-text | ||
| 1935 | x $s1,4($out) | ||
| 1936 | x $s2,8($out) | ||
| 1937 | x $s3,12($out) | ||
| 1938 | st${g} $out,4*$SIZE_T($sp) | ||
| 1939 | la $key,0($key1) | ||
| 1940 | bras $ra,_s390x_AES_encrypt | ||
| 1941 | l${g} $out,4*$SIZE_T($sp) | ||
| 1942 | x $s0,`$tweak+0`($sp) # ^=tweak | ||
| 1943 | x $s1,`$tweak+4`($sp) | ||
| 1944 | x $s2,`$tweak+8`($sp) | ||
| 1945 | x $s3,`$tweak+12`($sp) | ||
| 1946 | st $s0,0($out) | ||
| 1947 | st $s1,4($out) | ||
| 1948 | st $s2,8($out) | ||
| 1949 | st $s3,12($out) | ||
| 1950 | |||
| 1951 | .Lxts_enc_done: | ||
| 1952 | stg $sp,$tweak+0($sp) # wipe tweak | ||
| 1953 | stg $sp,$twesk+8($sp) | ||
| 1954 | lm${g} %r6,$ra,6*$SIZE_T($sp) | ||
| 1955 | br $ra | ||
| 1956 | .size AES_xts_encrypt,.-AES_xts_encrypt | ||
| 1957 | ___ | ||
| 1958 | # void AES_xts_decrypt(const char *inp,char *out,size_t len, | ||
| 1959 | # const AES_KEY *key1, const AES_KEY *key2,u64 secno); | ||
| 1960 | # | ||
| 1961 | $code.=<<___; | ||
| 1962 | .globl AES_xts_decrypt | ||
| 1963 | .type AES_xts_decrypt,\@function | ||
| 1964 | .align 16 | ||
| 1965 | AES_xts_decrypt: | ||
| 1966 | xgr %r3,%r4 # flip %r3 and %r4, $out and $len | ||
| 1967 | xgr %r4,%r3 | ||
| 1968 | xgr %r3,%r4 | ||
| 1969 | ___ | ||
| 1970 | $code.=<<___ if ($SIZE_T==4); | ||
| 1971 | llgfr $len,$len | ||
| 1972 | ___ | ||
| 1973 | $code.=<<___; | ||
| 1974 | st${g} $len,1*$SIZE_T($sp) # save copy of $len | ||
| 1975 | aghi $len,-16 | ||
| 1976 | bcr 4,$ra # abort if less than zero. formally | ||
| 1977 | # wrong, because $len is unsigned, | ||
| 1978 | # but who can afford asking to | ||
| 1979 | # process more than 2^63-1 bytes? | ||
| 1980 | tmll $len,0x0f | ||
| 1981 | jnz .Lxts_dec_proceed | ||
| 1982 | aghi $len,16 | ||
| 1983 | .Lxts_dec_proceed: | ||
| 1984 | ___ | ||
| 1985 | $code.=<<___ if (!$softonly); | ||
| 1986 | llgf %r0,240($key2) | ||
| 1987 | lhi %r1,16 | ||
| 1988 | clr %r0,%r1 | ||
| 1989 | jl .Lxts_dec_software | ||
| 1990 | |||
| 1991 | stm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 1992 | st${g} $ra,14*$SIZE_T($sp) | ||
| 1993 | |||
| 1994 | nill $len,0xfff0 # $len&=~15 | ||
| 1995 | slgr $out,$inp | ||
| 1996 | |||
| 1997 | # generate the tweak value | ||
| 1998 | l${g} $s3,$stdframe($sp) # pointer to iv | ||
| 1999 | la $s2,$tweak($sp) | ||
| 2000 | lmg $s0,$s1,0($s3) | ||
| 2001 | lghi $s3,16 | ||
| 2002 | stmg $s0,$s1,0($s2) | ||
| 2003 | la %r1,0($key2) # $key2 is not needed past this point | ||
| 2004 | .long 0xb92e00aa # km $s2,$s2, generate the tweak | ||
| 2005 | brc 1,.-4 # can this happen? | ||
| 2006 | |||
| 2007 | l %r0,240($key1) | ||
| 2008 | la %r1,0($key1) # $key1 is not needed anymore | ||
| 2009 | |||
| 2010 | ltgr $len,$len | ||
| 2011 | jz .Lxts_dec_km_short | ||
| 2012 | bras $ra,_s390x_xts_km | ||
| 2013 | jz .Lxts_dec_km_done | ||
| 2014 | |||
| 2015 | lrvgr $s2,$s0 # make copy in reverse byte order | ||
| 2016 | lrvgr $s3,$s1 | ||
| 2017 | j .Lxts_dec_km_2ndtweak | ||
| 2018 | |||
| 2019 | .Lxts_dec_km_short: | ||
| 2020 | llgc $len,`2*$SIZE_T-1`($sp) | ||
| 2021 | nill $len,0x0f # $len%=16 | ||
| 2022 | lrvg $s0,$tweak+0($sp) # load the tweak | ||
| 2023 | lrvg $s1,$tweak+8($sp) | ||
| 2024 | lrvgr $s2,$s0 # make copy in reverse byte order | ||
| 2025 | lrvgr $s3,$s1 | ||
| 2026 | |||
| 2027 | .Lxts_dec_km_2ndtweak: | ||
| 2028 | lghi $i1,0x87 | ||
| 2029 | srag $i2,$s1,63 # broadcast upper bit | ||
| 2030 | ngr $i1,$i2 # rem | ||
| 2031 | srlg $i2,$s0,63 # carry bit from lower half | ||
| 2032 | sllg $s0,$s0,1 | ||
| 2033 | sllg $s1,$s1,1 | ||
| 2034 | xgr $s0,$i1 | ||
| 2035 | ogr $s1,$i2 | ||
| 2036 | lrvgr $i1,$s0 # flip byte order | ||
| 2037 | lrvgr $i2,$s1 | ||
| 2038 | |||
| 2039 | xg $i1,0($inp) | ||
| 2040 | xg $i2,8($inp) | ||
| 2041 | stg $i1,0($out,$inp) | ||
| 2042 | stg $i2,8($out,$inp) | ||
| 2043 | la $i2,0($out,$inp) | ||
| 2044 | lghi $i3,16 | ||
| 2045 | .long 0xb92e0066 # km $i2,$i2 | ||
| 2046 | brc 1,.-4 # can this happen? | ||
| 2047 | lrvgr $i1,$s0 | ||
| 2048 | lrvgr $i2,$s1 | ||
| 2049 | xg $i1,0($out,$inp) | ||
| 2050 | xg $i2,8($out,$inp) | ||
| 2051 | stg $i1,0($out,$inp) | ||
| 2052 | stg $i2,8($out,$inp) | ||
| 2053 | |||
| 2054 | la $i3,0($out,$inp) # put aside real $out | ||
| 2055 | .Lxts_dec_km_steal: | ||
| 2056 | llgc $i1,16($inp) | ||
| 2057 | llgc $i2,0($out,$inp) | ||
| 2058 | stc $i1,0($out,$inp) | ||
| 2059 | stc $i2,16($out,$inp) | ||
| 2060 | la $inp,1($inp) | ||
| 2061 | brct $len,.Lxts_dec_km_steal | ||
| 2062 | |||
| 2063 | lgr $s0,$s2 | ||
| 2064 | lgr $s1,$s3 | ||
| 2065 | xg $s0,0($i3) | ||
| 2066 | xg $s1,8($i3) | ||
| 2067 | stg $s0,0($i3) | ||
| 2068 | stg $s1,8($i3) | ||
| 2069 | la $s0,0($i3) | ||
| 2070 | lghi $s1,16 | ||
| 2071 | .long 0xb92e0088 # km $s0,$s0 | ||
| 2072 | brc 1,.-4 # can this happen? | ||
| 2073 | xg $s2,0($i3) | ||
| 2074 | xg $s3,8($i3) | ||
| 2075 | stg $s2,0($i3) | ||
| 2076 | stg $s3,8($i3) | ||
| 2077 | .Lxts_dec_km_done: | ||
| 2078 | l${g} $ra,14*$SIZE_T($sp) | ||
| 2079 | st${g} $sp,$tweak($sp) # wipe tweak | ||
| 2080 | st${g} $sp,$tweak($sp) | ||
| 2081 | lm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 2082 | br $ra | ||
| 2083 | .align 16 | ||
| 2084 | .Lxts_dec_software: | ||
| 2085 | ___ | ||
| 2086 | $code.=<<___; | ||
| 2087 | stm${g} %r6,$ra,6*$SIZE_T($sp) | ||
| 2088 | |||
| 2089 | srlg $len,$len,4 | ||
| 2090 | slgr $out,$inp | ||
| 2091 | |||
| 2092 | xgr $s0,$s0 # clear upper half | ||
| 2093 | xgr $s1,$s1 | ||
| 2094 | lrv $s0,$stdframe+4($sp) # load secno | ||
| 2095 | lrv $s1,$stdframe+0($sp) | ||
| 2096 | xgr $s2,$s2 | ||
| 2097 | xgr $s3,$s3 | ||
| 2098 | stm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 2099 | la $key,0($key2) | ||
| 2100 | larl $tbl,AES_Te | ||
| 2101 | bras $ra,_s390x_AES_encrypt # generate the tweak | ||
| 2102 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 2103 | larl $tbl,AES_Td | ||
| 2104 | lt${g}r $len,$len | ||
| 2105 | stm $s0,$s3,$tweak($sp) # save the tweak | ||
| 2106 | jz .Lxts_dec_short | ||
| 2107 | j .Lxts_dec_enter | ||
| 2108 | |||
| 2109 | .align 16 | ||
| 2110 | .Lxts_dec_loop: | ||
| 2111 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
| 2112 | lrvg $s3,$tweak+8($sp) | ||
| 2113 | lghi %r1,0x87 | ||
| 2114 | srag %r0,$s3,63 # broadcast upper bit | ||
| 2115 | ngr %r1,%r0 # rem | ||
| 2116 | srlg %r0,$s1,63 # carry bit from lower half | ||
| 2117 | sllg $s1,$s1,1 | ||
| 2118 | sllg $s3,$s3,1 | ||
| 2119 | xgr $s1,%r1 | ||
| 2120 | ogr $s3,%r0 | ||
| 2121 | lrvgr $s1,$s1 # flip byte order | ||
| 2122 | lrvgr $s3,$s3 | ||
| 2123 | srlg $s0,$s1,32 # smash the tweak to 4x32-bits | ||
| 2124 | stg $s1,$tweak+0($sp) # save the tweak | ||
| 2125 | llgfr $s1,$s1 | ||
| 2126 | srlg $s2,$s3,32 | ||
| 2127 | stg $s3,$tweak+8($sp) | ||
| 2128 | llgfr $s3,$s3 | ||
| 2129 | .Lxts_dec_enter: | ||
| 2130 | x $s0,0($inp) # tweak^=*(inp) | ||
| 2131 | x $s1,4($inp) | ||
| 2132 | x $s2,8($inp) | ||
| 2133 | x $s3,12($inp) | ||
| 2134 | stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing | ||
| 2135 | la $key,0($key1) | ||
| 2136 | bras $ra,_s390x_AES_decrypt | ||
| 2137 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 2138 | x $s0,$tweak+0($sp) # ^=tweak | ||
| 2139 | x $s1,$tweak+4($sp) | ||
| 2140 | x $s2,$tweak+8($sp) | ||
| 2141 | x $s3,$tweak+12($sp) | ||
| 2142 | st $s0,0($out,$inp) | ||
| 2143 | st $s1,4($out,$inp) | ||
| 2144 | st $s2,8($out,$inp) | ||
| 2145 | st $s3,12($out,$inp) | ||
| 2146 | la $inp,16($inp) | ||
| 2147 | brct${g} $len,.Lxts_dec_loop | ||
| 2148 | |||
| 2149 | llgc $len,`2*$SIZE_T-1`($sp) | ||
| 2150 | nill $len,0x0f # $len%16 | ||
| 2151 | jz .Lxts_dec_done | ||
| 2152 | |||
| 2153 | # generate pair of tweaks... | ||
| 2154 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
| 2155 | lrvg $s3,$tweak+8($sp) | ||
| 2156 | lghi %r1,0x87 | ||
| 2157 | srag %r0,$s3,63 # broadcast upper bit | ||
| 2158 | ngr %r1,%r0 # rem | ||
| 2159 | srlg %r0,$s1,63 # carry bit from lower half | ||
| 2160 | sllg $s1,$s1,1 | ||
| 2161 | sllg $s3,$s3,1 | ||
| 2162 | xgr $s1,%r1 | ||
| 2163 | ogr $s3,%r0 | ||
| 2164 | lrvgr $i2,$s1 # flip byte order | ||
| 2165 | lrvgr $i3,$s3 | ||
| 2166 | stmg $i2,$i3,$tweak($sp) # save the 1st tweak | ||
| 2167 | j .Lxts_dec_2ndtweak | ||
| 2168 | |||
| 2169 | .align 16 | ||
| 2170 | .Lxts_dec_short: | ||
| 2171 | llgc $len,`2*$SIZE_T-1`($sp) | ||
| 2172 | nill $len,0x0f # $len%16 | ||
| 2173 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
| 2174 | lrvg $s3,$tweak+8($sp) | ||
| 2175 | .Lxts_dec_2ndtweak: | ||
| 2176 | lghi %r1,0x87 | ||
| 2177 | srag %r0,$s3,63 # broadcast upper bit | ||
| 2178 | ngr %r1,%r0 # rem | ||
| 2179 | srlg %r0,$s1,63 # carry bit from lower half | ||
| 2180 | sllg $s1,$s1,1 | ||
| 2181 | sllg $s3,$s3,1 | ||
| 2182 | xgr $s1,%r1 | ||
| 2183 | ogr $s3,%r0 | ||
| 2184 | lrvgr $s1,$s1 # flip byte order | ||
| 2185 | lrvgr $s3,$s3 | ||
| 2186 | srlg $s0,$s1,32 # smash the tweak to 4x32-bits | ||
| 2187 | stg $s1,$tweak-16+0($sp) # save the 2nd tweak | ||
| 2188 | llgfr $s1,$s1 | ||
| 2189 | srlg $s2,$s3,32 | ||
| 2190 | stg $s3,$tweak-16+8($sp) | ||
| 2191 | llgfr $s3,$s3 | ||
| 2192 | |||
| 2193 | x $s0,0($inp) # tweak_the_2nd^=*(inp) | ||
| 2194 | x $s1,4($inp) | ||
| 2195 | x $s2,8($inp) | ||
| 2196 | x $s3,12($inp) | ||
| 2197 | stm${g} %r2,%r3,2*$SIZE_T($sp) | ||
| 2198 | la $key,0($key1) | ||
| 2199 | bras $ra,_s390x_AES_decrypt | ||
| 2200 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 2201 | x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd | ||
| 2202 | x $s1,$tweak-16+4($sp) | ||
| 2203 | x $s2,$tweak-16+8($sp) | ||
| 2204 | x $s3,$tweak-16+12($sp) | ||
| 2205 | st $s0,0($out,$inp) | ||
| 2206 | st $s1,4($out,$inp) | ||
| 2207 | st $s2,8($out,$inp) | ||
| 2208 | st $s3,12($out,$inp) | ||
| 2209 | |||
| 2210 | la $i3,0($out,$inp) # put aside real $out | ||
| 2211 | .Lxts_dec_steal: | ||
| 2212 | llgc %r0,16($inp) | ||
| 2213 | llgc %r1,0($out,$inp) | ||
| 2214 | stc %r0,0($out,$inp) | ||
| 2215 | stc %r1,16($out,$inp) | ||
| 2216 | la $inp,1($inp) | ||
| 2217 | brct $len,.Lxts_dec_steal | ||
| 2218 | la $out,0($i3) # restore real $out | ||
| 2219 | |||
| 2220 | lm $s0,$s3,$tweak($sp) # load the 1st tweak | ||
| 2221 | x $s0,0($out) # tweak^=*(inp)|stolen cipher-text | ||
| 2222 | x $s1,4($out) | ||
| 2223 | x $s2,8($out) | ||
| 2224 | x $s3,12($out) | ||
| 2225 | st${g} $out,4*$SIZE_T($sp) | ||
| 2226 | la $key,0($key1) | ||
| 2227 | bras $ra,_s390x_AES_decrypt | ||
| 2228 | l${g} $out,4*$SIZE_T($sp) | ||
| 2229 | x $s0,$tweak+0($sp) # ^=tweak | ||
| 2230 | x $s1,$tweak+4($sp) | ||
| 2231 | x $s2,$tweak+8($sp) | ||
| 2232 | x $s3,$tweak+12($sp) | ||
| 2233 | st $s0,0($out) | ||
| 2234 | st $s1,4($out) | ||
| 2235 | st $s2,8($out) | ||
| 2236 | st $s3,12($out) | ||
| 2237 | stg $sp,$tweak-16+0($sp) # wipe 2nd tweak | ||
| 2238 | stg $sp,$tweak-16+8($sp) | ||
| 2239 | .Lxts_dec_done: | ||
| 2240 | stg $sp,$tweak+0($sp) # wipe tweak | ||
| 2241 | stg $sp,$tweak+8($sp) | ||
| 2242 | lm${g} %r6,$ra,6*$SIZE_T($sp) | ||
| 2243 | br $ra | ||
| 2244 | .size AES_xts_decrypt,.-AES_xts_decrypt | ||
| 1332 | ___ | 2245 | ___ |
| 1333 | } | 2246 | } |
| 1334 | $code.=<<___; | 2247 | $code.=<<___; |
| 1335 | .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 2248 | .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
| 2249 | .comm OPENSSL_s390xcap_P,16,8 | ||
| 1336 | ___ | 2250 | ___ |
| 1337 | 2251 | ||
| 1338 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 2252 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
| 1339 | print $code; | 2253 | print $code; |
| 2254 | close STDOUT; # force flush | ||
diff --git a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl index c57b3a2d6d..403c4d1290 100755 --- a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl +++ b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl | |||
| @@ -1176,6 +1176,7 @@ ___ | |||
| 1176 | # As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have | 1176 | # As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have |
| 1177 | # undesired effect, so just omit them and sacrifice some portion of | 1177 | # undesired effect, so just omit them and sacrifice some portion of |
| 1178 | # percent in performance... | 1178 | # percent in performance... |
| 1179 | $code =~ s/fmovs.*$//gem; | 1179 | $code =~ s/fmovs.*$//gm; |
| 1180 | 1180 | ||
| 1181 | print $code; | 1181 | print $code; |
| 1182 | close STDOUT; # ensure flush | ||
diff --git a/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl new file mode 100644 index 0000000000..c6f6b3334a --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl | |||
| @@ -0,0 +1,1249 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # June 2011 | ||
| 11 | # | ||
| 12 | # This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled | ||
| 13 | # in http://download.intel.com/design/intarch/papers/323686.pdf, is | ||
| 14 | # that since AESNI-CBC encrypt exhibit *very* low instruction-level | ||
| 15 | # parallelism, interleaving it with another algorithm would allow to | ||
| 16 | # utilize processor resources better and achieve better performance. | ||
| 17 | # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and | ||
| 18 | # AESNI code is weaved into it. Below are performance numbers in | ||
| 19 | # cycles per processed byte, less is better, for standalone AESNI-CBC | ||
| 20 | # encrypt, sum of the latter and standalone SHA1, and "stitched" | ||
| 21 | # subroutine: | ||
| 22 | # | ||
| 23 | # AES-128-CBC +SHA1 stitch gain | ||
| 24 | # Westmere 3.77[+5.6] 9.37 6.65 +41% | ||
| 25 | # Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%) | ||
| 26 | # | ||
| 27 | # AES-192-CBC | ||
| 28 | # Westmere 4.51 10.11 6.97 +45% | ||
| 29 | # Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%) | ||
| 30 | # | ||
| 31 | # AES-256-CBC | ||
| 32 | # Westmere 5.25 10.85 7.25 +50% | ||
| 33 | # Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%) | ||
| 34 | # | ||
| 35 | # (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for | ||
| 36 | # background information. Above numbers in parentheses are SSSE3 | ||
| 37 | # results collected on AVX-capable CPU, i.e. apply on OSes that | ||
| 38 | # don't support AVX. | ||
| 39 | # | ||
| 40 | # Needless to mention that it makes no sense to implement "stitched" | ||
| 41 | # *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1 | ||
| 42 | # fully utilize parallelism, so stitching would not give any gain | ||
| 43 | # anyway. Well, there might be some, e.g. because of better cache | ||
| 44 | # locality... For reference, here are performance results for | ||
| 45 | # standalone AESNI-CBC decrypt: | ||
| 46 | # | ||
| 47 | # AES-128-CBC AES-192-CBC AES-256-CBC | ||
| 48 | # Westmere 1.31 1.55 1.80 | ||
| 49 | # Sandy Bridge 0.93 1.06 1.22 | ||
| 50 | |||
| 51 | $flavour = shift; | ||
| 52 | $output = shift; | ||
| 53 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 54 | |||
| 55 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 56 | |||
| 57 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 58 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 59 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 60 | die "can't locate x86_64-xlate.pl"; | ||
| 61 | |||
| 62 | $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` | ||
| 63 | =~ /GNU assembler version ([2-9]\.[0-9]+)/ && | ||
| 64 | $1>=2.19); | ||
| 65 | $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | ||
| 66 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && | ||
| 67 | $1>=2.09); | ||
| 68 | $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | ||
| 69 | `ml64 2>&1` =~ /Version ([0-9]+)\./ && | ||
| 70 | $1>=10); | ||
| 71 | |||
| 72 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 73 | |||
| 74 | # void aesni_cbc_sha1_enc(const void *inp, | ||
| 75 | # void *out, | ||
| 76 | # size_t length, | ||
| 77 | # const AES_KEY *key, | ||
| 78 | # unsigned char *iv, | ||
| 79 | # SHA_CTX *ctx, | ||
| 80 | # const void *in0); | ||
| 81 | |||
| 82 | $code.=<<___; | ||
| 83 | .text | ||
| 84 | .extern OPENSSL_ia32cap_P | ||
| 85 | |||
| 86 | .globl aesni_cbc_sha1_enc | ||
| 87 | .type aesni_cbc_sha1_enc,\@abi-omnipotent | ||
| 88 | .align 16 | ||
| 89 | aesni_cbc_sha1_enc: | ||
| 90 | # caller should check for SSSE3 and AES-NI bits | ||
| 91 | mov OPENSSL_ia32cap_P+0(%rip),%r10d | ||
| 92 | mov OPENSSL_ia32cap_P+4(%rip),%r11d | ||
| 93 | ___ | ||
| 94 | $code.=<<___ if ($avx); | ||
| 95 | and \$`1<<28`,%r11d # mask AVX bit | ||
| 96 | and \$`1<<30`,%r10d # mask "Intel CPU" bit | ||
| 97 | or %r11d,%r10d | ||
| 98 | cmp \$`1<<28|1<<30`,%r10d | ||
| 99 | je aesni_cbc_sha1_enc_avx | ||
| 100 | ___ | ||
| 101 | $code.=<<___; | ||
| 102 | jmp aesni_cbc_sha1_enc_ssse3 | ||
| 103 | ret | ||
| 104 | .size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc | ||
| 105 | ___ | ||
| 106 | |||
| 107 | my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); | ||
| 108 | |||
| 109 | my $Xi=4; | ||
| 110 | my @X=map("%xmm$_",(4..7,0..3)); | ||
| 111 | my @Tx=map("%xmm$_",(8..10)); | ||
| 112 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | ||
| 113 | my @T=("%esi","%edi"); | ||
| 114 | my $j=0; my $jj=0; my $r=0; my $sn=0; | ||
| 115 | my $K_XX_XX="%r11"; | ||
| 116 | my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13)); | ||
| 117 | my @rndkey=("%xmm14","%xmm15"); | ||
| 118 | |||
| 119 | sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm | ||
| 120 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; | ||
| 121 | my $arg = pop; | ||
| 122 | $arg = "\$$arg" if ($arg*1 eq $arg); | ||
| 123 | $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; | ||
| 124 | } | ||
| 125 | |||
| 126 | my $_rol=sub { &rol(@_) }; | ||
| 127 | my $_ror=sub { &ror(@_) }; | ||
| 128 | |||
| 129 | $code.=<<___; | ||
| 130 | .type aesni_cbc_sha1_enc_ssse3,\@function,6 | ||
| 131 | .align 16 | ||
| 132 | aesni_cbc_sha1_enc_ssse3: | ||
| 133 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument | ||
| 134 | #shr \$6,$len # debugging artefact | ||
| 135 | #jz .Lepilogue_ssse3 # debugging artefact | ||
| 136 | push %rbx | ||
| 137 | push %rbp | ||
| 138 | push %r12 | ||
| 139 | push %r13 | ||
| 140 | push %r14 | ||
| 141 | push %r15 | ||
| 142 | lea `-104-($win64?10*16:0)`(%rsp),%rsp | ||
| 143 | #mov $in0,$inp # debugging artefact | ||
| 144 | #lea 64(%rsp),$ctx # debugging artefact | ||
| 145 | ___ | ||
| 146 | $code.=<<___ if ($win64); | ||
| 147 | movaps %xmm6,96+0(%rsp) | ||
| 148 | movaps %xmm7,96+16(%rsp) | ||
| 149 | movaps %xmm8,96+32(%rsp) | ||
| 150 | movaps %xmm9,96+48(%rsp) | ||
| 151 | movaps %xmm10,96+64(%rsp) | ||
| 152 | movaps %xmm11,96+80(%rsp) | ||
| 153 | movaps %xmm12,96+96(%rsp) | ||
| 154 | movaps %xmm13,96+112(%rsp) | ||
| 155 | movaps %xmm14,96+128(%rsp) | ||
| 156 | movaps %xmm15,96+144(%rsp) | ||
| 157 | .Lprologue_ssse3: | ||
| 158 | ___ | ||
| 159 | $code.=<<___; | ||
| 160 | mov $in0,%r12 # reassign arguments | ||
| 161 | mov $out,%r13 | ||
| 162 | mov $len,%r14 | ||
| 163 | mov $key,%r15 | ||
| 164 | movdqu ($ivp),$iv # load IV | ||
| 165 | mov $ivp,88(%rsp) # save $ivp | ||
| 166 | ___ | ||
| 167 | my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments | ||
| 168 | my $rounds="${ivp}d"; | ||
| 169 | $code.=<<___; | ||
| 170 | shl \$6,$len | ||
| 171 | sub $in0,$out | ||
| 172 | mov 240($key),$rounds | ||
| 173 | add $inp,$len # end of input | ||
| 174 | |||
| 175 | lea K_XX_XX(%rip),$K_XX_XX | ||
| 176 | mov 0($ctx),$A # load context | ||
| 177 | mov 4($ctx),$B | ||
| 178 | mov 8($ctx),$C | ||
| 179 | mov 12($ctx),$D | ||
| 180 | mov $B,@T[0] # magic seed | ||
| 181 | mov 16($ctx),$E | ||
| 182 | |||
| 183 | movdqa 64($K_XX_XX),@X[2] # pbswap mask | ||
| 184 | movdqa 0($K_XX_XX),@Tx[1] # K_00_19 | ||
| 185 | movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | ||
| 186 | movdqu 16($inp),@X[-3&7] | ||
| 187 | movdqu 32($inp),@X[-2&7] | ||
| 188 | movdqu 48($inp),@X[-1&7] | ||
| 189 | pshufb @X[2],@X[-4&7] # byte swap | ||
| 190 | add \$64,$inp | ||
| 191 | pshufb @X[2],@X[-3&7] | ||
| 192 | pshufb @X[2],@X[-2&7] | ||
| 193 | pshufb @X[2],@X[-1&7] | ||
| 194 | paddd @Tx[1],@X[-4&7] # add K_00_19 | ||
| 195 | paddd @Tx[1],@X[-3&7] | ||
| 196 | paddd @Tx[1],@X[-2&7] | ||
| 197 | movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU | ||
| 198 | psubd @Tx[1],@X[-4&7] # restore X[] | ||
| 199 | movdqa @X[-3&7],16(%rsp) | ||
| 200 | psubd @Tx[1],@X[-3&7] | ||
| 201 | movdqa @X[-2&7],32(%rsp) | ||
| 202 | psubd @Tx[1],@X[-2&7] | ||
| 203 | movups ($key),$rndkey0 # $key[0] | ||
| 204 | movups 16($key),$rndkey[0] # forward reference | ||
| 205 | jmp .Loop_ssse3 | ||
| 206 | ___ | ||
| 207 | |||
| 208 | my $aesenc=sub { | ||
| 209 | use integer; | ||
| 210 | my ($n,$k)=($r/10,$r%10); | ||
| 211 | if ($k==0) { | ||
| 212 | $code.=<<___; | ||
| 213 | movups `16*$n`($in0),$in # load input | ||
| 214 | xorps $rndkey0,$in | ||
| 215 | ___ | ||
| 216 | $code.=<<___ if ($n); | ||
| 217 | movups $iv,`16*($n-1)`($out,$in0) # write output | ||
| 218 | ___ | ||
| 219 | $code.=<<___; | ||
| 220 | xorps $in,$iv | ||
| 221 | aesenc $rndkey[0],$iv | ||
| 222 | movups `32+16*$k`($key),$rndkey[1] | ||
| 223 | ___ | ||
| 224 | } elsif ($k==9) { | ||
| 225 | $sn++; | ||
| 226 | $code.=<<___; | ||
| 227 | cmp \$11,$rounds | ||
| 228 | jb .Laesenclast$sn | ||
| 229 | movups `32+16*($k+0)`($key),$rndkey[1] | ||
| 230 | aesenc $rndkey[0],$iv | ||
| 231 | movups `32+16*($k+1)`($key),$rndkey[0] | ||
| 232 | aesenc $rndkey[1],$iv | ||
| 233 | je .Laesenclast$sn | ||
| 234 | movups `32+16*($k+2)`($key),$rndkey[1] | ||
| 235 | aesenc $rndkey[0],$iv | ||
| 236 | movups `32+16*($k+3)`($key),$rndkey[0] | ||
| 237 | aesenc $rndkey[1],$iv | ||
| 238 | .Laesenclast$sn: | ||
| 239 | aesenclast $rndkey[0],$iv | ||
| 240 | movups 16($key),$rndkey[1] # forward reference | ||
| 241 | ___ | ||
| 242 | } else { | ||
| 243 | $code.=<<___; | ||
| 244 | aesenc $rndkey[0],$iv | ||
| 245 | movups `32+16*$k`($key),$rndkey[1] | ||
| 246 | ___ | ||
| 247 | } | ||
| 248 | $r++; unshift(@rndkey,pop(@rndkey)); | ||
| 249 | }; | ||
| 250 | |||
| 251 | sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 | ||
| 252 | { use integer; | ||
| 253 | my $body = shift; | ||
| 254 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | ||
| 255 | my ($a,$b,$c,$d,$e); | ||
| 256 | |||
| 257 | &movdqa (@X[0],@X[-3&7]); | ||
| 258 | eval(shift(@insns)); | ||
| 259 | eval(shift(@insns)); | ||
| 260 | &movdqa (@Tx[0],@X[-1&7]); | ||
| 261 | &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" | ||
| 262 | eval(shift(@insns)); | ||
| 263 | eval(shift(@insns)); | ||
| 264 | |||
| 265 | &paddd (@Tx[1],@X[-1&7]); | ||
| 266 | eval(shift(@insns)); | ||
| 267 | eval(shift(@insns)); | ||
| 268 | &psrldq (@Tx[0],4); # "X[-3]", 3 dwords | ||
| 269 | eval(shift(@insns)); | ||
| 270 | eval(shift(@insns)); | ||
| 271 | &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | ||
| 272 | eval(shift(@insns)); | ||
| 273 | eval(shift(@insns)); | ||
| 274 | |||
| 275 | &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | ||
| 276 | eval(shift(@insns)); | ||
| 277 | eval(shift(@insns)); | ||
| 278 | eval(shift(@insns)); | ||
| 279 | eval(shift(@insns)); | ||
| 280 | |||
| 281 | &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | ||
| 282 | eval(shift(@insns)); | ||
| 283 | eval(shift(@insns)); | ||
| 284 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 285 | eval(shift(@insns)); | ||
| 286 | eval(shift(@insns)); | ||
| 287 | |||
| 288 | &movdqa (@Tx[2],@X[0]); | ||
| 289 | &movdqa (@Tx[0],@X[0]); | ||
| 290 | eval(shift(@insns)); | ||
| 291 | eval(shift(@insns)); | ||
| 292 | eval(shift(@insns)); | ||
| 293 | eval(shift(@insns)); | ||
| 294 | |||
| 295 | &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword | ||
| 296 | &paddd (@X[0],@X[0]); | ||
| 297 | eval(shift(@insns)); | ||
| 298 | eval(shift(@insns)); | ||
| 299 | eval(shift(@insns)); | ||
| 300 | eval(shift(@insns)); | ||
| 301 | |||
| 302 | &psrld (@Tx[0],31); | ||
| 303 | eval(shift(@insns)); | ||
| 304 | eval(shift(@insns)); | ||
| 305 | &movdqa (@Tx[1],@Tx[2]); | ||
| 306 | eval(shift(@insns)); | ||
| 307 | eval(shift(@insns)); | ||
| 308 | |||
| 309 | &psrld (@Tx[2],30); | ||
| 310 | &por (@X[0],@Tx[0]); # "X[0]"<<<=1 | ||
| 311 | eval(shift(@insns)); | ||
| 312 | eval(shift(@insns)); | ||
| 313 | eval(shift(@insns)); | ||
| 314 | eval(shift(@insns)); | ||
| 315 | |||
| 316 | &pslld (@Tx[1],2); | ||
| 317 | &pxor (@X[0],@Tx[2]); | ||
| 318 | eval(shift(@insns)); | ||
| 319 | eval(shift(@insns)); | ||
| 320 | &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX | ||
| 321 | eval(shift(@insns)); | ||
| 322 | eval(shift(@insns)); | ||
| 323 | |||
| 324 | &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 | ||
| 325 | |||
| 326 | foreach (@insns) { eval; } # remaining instructions [if any] | ||
| 327 | |||
| 328 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 329 | push(@Tx,shift(@Tx)); | ||
| 330 | } | ||
| 331 | |||
| 332 | sub Xupdate_ssse3_32_79() | ||
| 333 | { use integer; | ||
| 334 | my $body = shift; | ||
| 335 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | ||
| 336 | my ($a,$b,$c,$d,$e); | ||
| 337 | |||
| 338 | &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); | ||
| 339 | eval(shift(@insns)); # body_20_39 | ||
| 340 | &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | ||
| 341 | &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" | ||
| 342 | eval(shift(@insns)); | ||
| 343 | eval(shift(@insns)); | ||
| 344 | eval(shift(@insns)); # rol | ||
| 345 | |||
| 346 | &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | ||
| 347 | eval(shift(@insns)); | ||
| 348 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | ||
| 349 | if ($Xi%5) { | ||
| 350 | &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | ||
| 351 | } else { # ... or load next one | ||
| 352 | &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | ||
| 353 | } | ||
| 354 | &paddd (@Tx[1],@X[-1&7]); | ||
| 355 | eval(shift(@insns)); # ror | ||
| 356 | eval(shift(@insns)); | ||
| 357 | |||
| 358 | &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" | ||
| 359 | eval(shift(@insns)); # body_20_39 | ||
| 360 | eval(shift(@insns)); | ||
| 361 | eval(shift(@insns)); | ||
| 362 | eval(shift(@insns)); # rol | ||
| 363 | |||
| 364 | &movdqa (@Tx[0],@X[0]); | ||
| 365 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 366 | eval(shift(@insns)); | ||
| 367 | eval(shift(@insns)); | ||
| 368 | eval(shift(@insns)); # ror | ||
| 369 | eval(shift(@insns)); | ||
| 370 | |||
| 371 | &pslld (@X[0],2); | ||
| 372 | eval(shift(@insns)); # body_20_39 | ||
| 373 | eval(shift(@insns)); | ||
| 374 | &psrld (@Tx[0],30); | ||
| 375 | eval(shift(@insns)); | ||
| 376 | eval(shift(@insns)); # rol | ||
| 377 | eval(shift(@insns)); | ||
| 378 | eval(shift(@insns)); | ||
| 379 | eval(shift(@insns)); # ror | ||
| 380 | eval(shift(@insns)); | ||
| 381 | |||
| 382 | &por (@X[0],@Tx[0]); # "X[0]"<<<=2 | ||
| 383 | eval(shift(@insns)); # body_20_39 | ||
| 384 | eval(shift(@insns)); | ||
| 385 | &movdqa (@Tx[1],@X[0]) if ($Xi<19); | ||
| 386 | eval(shift(@insns)); | ||
| 387 | eval(shift(@insns)); # rol | ||
| 388 | eval(shift(@insns)); | ||
| 389 | eval(shift(@insns)); | ||
| 390 | eval(shift(@insns)); # rol | ||
| 391 | eval(shift(@insns)); | ||
| 392 | |||
| 393 | foreach (@insns) { eval; } # remaining instructions | ||
| 394 | |||
| 395 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 396 | push(@Tx,shift(@Tx)); | ||
| 397 | } | ||
| 398 | |||
| 399 | sub Xuplast_ssse3_80() | ||
| 400 | { use integer; | ||
| 401 | my $body = shift; | ||
| 402 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 403 | my ($a,$b,$c,$d,$e); | ||
| 404 | |||
| 405 | eval(shift(@insns)); | ||
| 406 | &paddd (@Tx[1],@X[-1&7]); | ||
| 407 | eval(shift(@insns)); | ||
| 408 | eval(shift(@insns)); | ||
| 409 | eval(shift(@insns)); | ||
| 410 | eval(shift(@insns)); | ||
| 411 | |||
| 412 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU | ||
| 413 | |||
| 414 | foreach (@insns) { eval; } # remaining instructions | ||
| 415 | |||
| 416 | &cmp ($inp,$len); | ||
| 417 | &je (".Ldone_ssse3"); | ||
| 418 | |||
| 419 | unshift(@Tx,pop(@Tx)); | ||
| 420 | |||
| 421 | &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask | ||
| 422 | &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 | ||
| 423 | &movdqu (@X[-4&7],"0($inp)"); # load input | ||
| 424 | &movdqu (@X[-3&7],"16($inp)"); | ||
| 425 | &movdqu (@X[-2&7],"32($inp)"); | ||
| 426 | &movdqu (@X[-1&7],"48($inp)"); | ||
| 427 | &pshufb (@X[-4&7],@X[2]); # byte swap | ||
| 428 | &add ($inp,64); | ||
| 429 | |||
| 430 | $Xi=0; | ||
| 431 | } | ||
| 432 | |||
| 433 | sub Xloop_ssse3() | ||
| 434 | { use integer; | ||
| 435 | my $body = shift; | ||
| 436 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 437 | my ($a,$b,$c,$d,$e); | ||
| 438 | |||
| 439 | eval(shift(@insns)); | ||
| 440 | eval(shift(@insns)); | ||
| 441 | &pshufb (@X[($Xi-3)&7],@X[2]); | ||
| 442 | eval(shift(@insns)); | ||
| 443 | eval(shift(@insns)); | ||
| 444 | &paddd (@X[($Xi-4)&7],@Tx[1]); | ||
| 445 | eval(shift(@insns)); | ||
| 446 | eval(shift(@insns)); | ||
| 447 | eval(shift(@insns)); | ||
| 448 | eval(shift(@insns)); | ||
| 449 | &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU | ||
| 450 | eval(shift(@insns)); | ||
| 451 | eval(shift(@insns)); | ||
| 452 | &psubd (@X[($Xi-4)&7],@Tx[1]); | ||
| 453 | |||
| 454 | foreach (@insns) { eval; } | ||
| 455 | $Xi++; | ||
| 456 | } | ||
| 457 | |||
| 458 | sub Xtail_ssse3() | ||
| 459 | { use integer; | ||
| 460 | my $body = shift; | ||
| 461 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 462 | my ($a,$b,$c,$d,$e); | ||
| 463 | |||
| 464 | foreach (@insns) { eval; } | ||
| 465 | } | ||
| 466 | |||
| 467 | sub body_00_19 () { | ||
| 468 | use integer; | ||
| 469 | my ($k,$n); | ||
| 470 | my @r=( | ||
| 471 | '($a,$b,$c,$d,$e)=@V;'. | ||
| 472 | '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer | ||
| 473 | '&xor ($c,$d);', | ||
| 474 | '&mov (@T[1],$a);', # $b in next round | ||
| 475 | '&$_rol ($a,5);', | ||
| 476 | '&and (@T[0],$c);', # ($b&($c^$d)) | ||
| 477 | '&xor ($c,$d);', # restore $c | ||
| 478 | '&xor (@T[0],$d);', | ||
| 479 | '&add ($e,$a);', | ||
| 480 | '&$_ror ($b,$j?7:2);', # $b>>>2 | ||
| 481 | '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
| 482 | ); | ||
| 483 | $n = scalar(@r); | ||
| 484 | $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds | ||
| 485 | @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); | ||
| 486 | $jj++; | ||
| 487 | return @r; | ||
| 488 | } | ||
| 489 | |||
| 490 | sub body_20_39 () { | ||
| 491 | use integer; | ||
| 492 | my ($k,$n); | ||
| 493 | my @r=( | ||
| 494 | '($a,$b,$c,$d,$e)=@V;'. | ||
| 495 | '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | ||
| 496 | '&xor (@T[0],$d);', # ($b^$d) | ||
| 497 | '&mov (@T[1],$a);', # $b in next round | ||
| 498 | '&$_rol ($a,5);', | ||
| 499 | '&xor (@T[0],$c);', # ($b^$d^$c) | ||
| 500 | '&add ($e,$a);', | ||
| 501 | '&$_ror ($b,7);', # $b>>>2 | ||
| 502 | '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
| 503 | ); | ||
| 504 | $n = scalar(@r); | ||
| 505 | $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds | ||
| 506 | @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); | ||
| 507 | $jj++; | ||
| 508 | return @r; | ||
| 509 | } | ||
| 510 | |||
| 511 | sub body_40_59 () { | ||
| 512 | use integer; | ||
| 513 | my ($k,$n); | ||
| 514 | my @r=( | ||
| 515 | '($a,$b,$c,$d,$e)=@V;'. | ||
| 516 | '&mov (@T[1],$c);', | ||
| 517 | '&xor ($c,$d);', | ||
| 518 | '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | ||
| 519 | '&and (@T[1],$d);', | ||
| 520 | '&and (@T[0],$c);', # ($b&($c^$d)) | ||
| 521 | '&$_ror ($b,7);', # $b>>>2 | ||
| 522 | '&add ($e,@T[1]);', | ||
| 523 | '&mov (@T[1],$a);', # $b in next round | ||
| 524 | '&$_rol ($a,5);', | ||
| 525 | '&add ($e,@T[0]);', | ||
| 526 | '&xor ($c,$d);', # restore $c | ||
| 527 | '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
| 528 | ); | ||
| 529 | $n = scalar(@r); | ||
| 530 | $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds | ||
| 531 | @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); | ||
| 532 | $jj++; | ||
| 533 | return @r; | ||
| 534 | } | ||
| 535 | $code.=<<___; | ||
| 536 | .align 16 | ||
| 537 | .Loop_ssse3: | ||
| 538 | ___ | ||
| 539 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 540 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 541 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 542 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 543 | &Xupdate_ssse3_32_79(\&body_00_19); | ||
| 544 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 545 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 546 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 547 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 548 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 549 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 550 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 551 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 552 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 553 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 554 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 555 | &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" | ||
| 556 | |||
| 557 | $saved_j=$j; @saved_V=@V; | ||
| 558 | $saved_r=$r; @saved_rndkey=@rndkey; | ||
| 559 | |||
| 560 | &Xloop_ssse3(\&body_20_39); | ||
| 561 | &Xloop_ssse3(\&body_20_39); | ||
| 562 | &Xloop_ssse3(\&body_20_39); | ||
| 563 | |||
| 564 | $code.=<<___; | ||
| 565 | movups $iv,48($out,$in0) # write output | ||
| 566 | lea 64($in0),$in0 | ||
| 567 | |||
| 568 | add 0($ctx),$A # update context | ||
| 569 | add 4($ctx),@T[0] | ||
| 570 | add 8($ctx),$C | ||
| 571 | add 12($ctx),$D | ||
| 572 | mov $A,0($ctx) | ||
| 573 | add 16($ctx),$E | ||
| 574 | mov @T[0],4($ctx) | ||
| 575 | mov @T[0],$B # magic seed | ||
| 576 | mov $C,8($ctx) | ||
| 577 | mov $D,12($ctx) | ||
| 578 | mov $E,16($ctx) | ||
| 579 | jmp .Loop_ssse3 | ||
| 580 | |||
| 581 | .align 16 | ||
| 582 | .Ldone_ssse3: | ||
| 583 | ___ | ||
| 584 | $jj=$j=$saved_j; @V=@saved_V; | ||
| 585 | $r=$saved_r; @rndkey=@saved_rndkey; | ||
| 586 | |||
| 587 | &Xtail_ssse3(\&body_20_39); | ||
| 588 | &Xtail_ssse3(\&body_20_39); | ||
| 589 | &Xtail_ssse3(\&body_20_39); | ||
| 590 | |||
| 591 | $code.=<<___; | ||
| 592 | movups $iv,48($out,$in0) # write output | ||
| 593 | mov 88(%rsp),$ivp # restore $ivp | ||
| 594 | |||
| 595 | add 0($ctx),$A # update context | ||
| 596 | add 4($ctx),@T[0] | ||
| 597 | add 8($ctx),$C | ||
| 598 | mov $A,0($ctx) | ||
| 599 | add 12($ctx),$D | ||
| 600 | mov @T[0],4($ctx) | ||
| 601 | add 16($ctx),$E | ||
| 602 | mov $C,8($ctx) | ||
| 603 | mov $D,12($ctx) | ||
| 604 | mov $E,16($ctx) | ||
| 605 | movups $iv,($ivp) # write IV | ||
| 606 | ___ | ||
| 607 | $code.=<<___ if ($win64); | ||
| 608 | movaps 96+0(%rsp),%xmm6 | ||
| 609 | movaps 96+16(%rsp),%xmm7 | ||
| 610 | movaps 96+32(%rsp),%xmm8 | ||
| 611 | movaps 96+48(%rsp),%xmm9 | ||
| 612 | movaps 96+64(%rsp),%xmm10 | ||
| 613 | movaps 96+80(%rsp),%xmm11 | ||
| 614 | movaps 96+96(%rsp),%xmm12 | ||
| 615 | movaps 96+112(%rsp),%xmm13 | ||
| 616 | movaps 96+128(%rsp),%xmm14 | ||
| 617 | movaps 96+144(%rsp),%xmm15 | ||
| 618 | ___ | ||
| 619 | $code.=<<___; | ||
| 620 | lea `104+($win64?10*16:0)`(%rsp),%rsi | ||
| 621 | mov 0(%rsi),%r15 | ||
| 622 | mov 8(%rsi),%r14 | ||
| 623 | mov 16(%rsi),%r13 | ||
| 624 | mov 24(%rsi),%r12 | ||
| 625 | mov 32(%rsi),%rbp | ||
| 626 | mov 40(%rsi),%rbx | ||
| 627 | lea 48(%rsi),%rsp | ||
| 628 | .Lepilogue_ssse3: | ||
| 629 | ret | ||
| 630 | .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 | ||
| 631 | ___ | ||
| 632 | |||
| 633 | $j=$jj=$r=$sn=0; | ||
| 634 | |||
| 635 | if ($avx) { | ||
| 636 | my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); | ||
| 637 | |||
| 638 | my $Xi=4; | ||
| 639 | my @X=map("%xmm$_",(4..7,0..3)); | ||
| 640 | my @Tx=map("%xmm$_",(8..10)); | ||
| 641 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | ||
| 642 | my @T=("%esi","%edi"); | ||
| 643 | |||
| 644 | my $_rol=sub { &shld(@_[0],@_) }; | ||
| 645 | my $_ror=sub { &shrd(@_[0],@_) }; | ||
| 646 | |||
| 647 | $code.=<<___; | ||
| 648 | .type aesni_cbc_sha1_enc_avx,\@function,6 | ||
| 649 | .align 16 | ||
| 650 | aesni_cbc_sha1_enc_avx: | ||
| 651 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument | ||
| 652 | #shr \$6,$len # debugging artefact | ||
| 653 | #jz .Lepilogue_avx # debugging artefact | ||
| 654 | push %rbx | ||
| 655 | push %rbp | ||
| 656 | push %r12 | ||
| 657 | push %r13 | ||
| 658 | push %r14 | ||
| 659 | push %r15 | ||
| 660 | lea `-104-($win64?10*16:0)`(%rsp),%rsp | ||
| 661 | #mov $in0,$inp # debugging artefact | ||
| 662 | #lea 64(%rsp),$ctx # debugging artefact | ||
| 663 | ___ | ||
| 664 | $code.=<<___ if ($win64); | ||
| 665 | movaps %xmm6,96+0(%rsp) | ||
| 666 | movaps %xmm7,96+16(%rsp) | ||
| 667 | movaps %xmm8,96+32(%rsp) | ||
| 668 | movaps %xmm9,96+48(%rsp) | ||
| 669 | movaps %xmm10,96+64(%rsp) | ||
| 670 | movaps %xmm11,96+80(%rsp) | ||
| 671 | movaps %xmm12,96+96(%rsp) | ||
| 672 | movaps %xmm13,96+112(%rsp) | ||
| 673 | movaps %xmm14,96+128(%rsp) | ||
| 674 | movaps %xmm15,96+144(%rsp) | ||
| 675 | .Lprologue_avx: | ||
| 676 | ___ | ||
| 677 | $code.=<<___; | ||
| 678 | vzeroall | ||
| 679 | mov $in0,%r12 # reassign arguments | ||
| 680 | mov $out,%r13 | ||
| 681 | mov $len,%r14 | ||
| 682 | mov $key,%r15 | ||
| 683 | vmovdqu ($ivp),$iv # load IV | ||
| 684 | mov $ivp,88(%rsp) # save $ivp | ||
| 685 | ___ | ||
| 686 | my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments | ||
| 687 | my $rounds="${ivp}d"; | ||
| 688 | $code.=<<___; | ||
| 689 | shl \$6,$len | ||
| 690 | sub $in0,$out | ||
| 691 | mov 240($key),$rounds | ||
| 692 | add \$112,$key # size optimization | ||
| 693 | add $inp,$len # end of input | ||
| 694 | |||
| 695 | lea K_XX_XX(%rip),$K_XX_XX | ||
| 696 | mov 0($ctx),$A # load context | ||
| 697 | mov 4($ctx),$B | ||
| 698 | mov 8($ctx),$C | ||
| 699 | mov 12($ctx),$D | ||
| 700 | mov $B,@T[0] # magic seed | ||
| 701 | mov 16($ctx),$E | ||
| 702 | |||
| 703 | vmovdqa 64($K_XX_XX),@X[2] # pbswap mask | ||
| 704 | vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 | ||
| 705 | vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | ||
| 706 | vmovdqu 16($inp),@X[-3&7] | ||
| 707 | vmovdqu 32($inp),@X[-2&7] | ||
| 708 | vmovdqu 48($inp),@X[-1&7] | ||
| 709 | vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap | ||
| 710 | add \$64,$inp | ||
| 711 | vpshufb @X[2],@X[-3&7],@X[-3&7] | ||
| 712 | vpshufb @X[2],@X[-2&7],@X[-2&7] | ||
| 713 | vpshufb @X[2],@X[-1&7],@X[-1&7] | ||
| 714 | vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 | ||
| 715 | vpaddd @Tx[1],@X[-3&7],@X[1] | ||
| 716 | vpaddd @Tx[1],@X[-2&7],@X[2] | ||
| 717 | vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU | ||
| 718 | vmovdqa @X[1],16(%rsp) | ||
| 719 | vmovdqa @X[2],32(%rsp) | ||
| 720 | vmovups -112($key),$rndkey0 # $key[0] | ||
| 721 | vmovups 16-112($key),$rndkey[0] # forward reference | ||
| 722 | jmp .Loop_avx | ||
| 723 | ___ | ||
| 724 | |||
| 725 | my $aesenc=sub { | ||
| 726 | use integer; | ||
| 727 | my ($n,$k)=($r/10,$r%10); | ||
| 728 | if ($k==0) { | ||
| 729 | $code.=<<___; | ||
| 730 | vmovups `16*$n`($in0),$in # load input | ||
| 731 | vxorps $rndkey0,$in,$in | ||
| 732 | ___ | ||
| 733 | $code.=<<___ if ($n); | ||
| 734 | vmovups $iv,`16*($n-1)`($out,$in0) # write output | ||
| 735 | ___ | ||
| 736 | $code.=<<___; | ||
| 737 | vxorps $in,$iv,$iv | ||
| 738 | vaesenc $rndkey[0],$iv,$iv | ||
| 739 | vmovups `32+16*$k-112`($key),$rndkey[1] | ||
| 740 | ___ | ||
| 741 | } elsif ($k==9) { | ||
| 742 | $sn++; | ||
| 743 | $code.=<<___; | ||
| 744 | cmp \$11,$rounds | ||
| 745 | jb .Lvaesenclast$sn | ||
| 746 | vaesenc $rndkey[0],$iv,$iv | ||
| 747 | vmovups `32+16*($k+0)-112`($key),$rndkey[1] | ||
| 748 | vaesenc $rndkey[1],$iv,$iv | ||
| 749 | vmovups `32+16*($k+1)-112`($key),$rndkey[0] | ||
| 750 | je .Lvaesenclast$sn | ||
| 751 | vaesenc $rndkey[0],$iv,$iv | ||
| 752 | vmovups `32+16*($k+2)-112`($key),$rndkey[1] | ||
| 753 | vaesenc $rndkey[1],$iv,$iv | ||
| 754 | vmovups `32+16*($k+3)-112`($key),$rndkey[0] | ||
| 755 | .Lvaesenclast$sn: | ||
| 756 | vaesenclast $rndkey[0],$iv,$iv | ||
| 757 | vmovups 16-112($key),$rndkey[1] # forward reference | ||
| 758 | ___ | ||
| 759 | } else { | ||
| 760 | $code.=<<___; | ||
| 761 | vaesenc $rndkey[0],$iv,$iv | ||
| 762 | vmovups `32+16*$k-112`($key),$rndkey[1] | ||
| 763 | ___ | ||
| 764 | } | ||
| 765 | $r++; unshift(@rndkey,pop(@rndkey)); | ||
| 766 | }; | ||
| 767 | |||
| 768 | sub Xupdate_avx_16_31() # recall that $Xi starts with 4 | ||
| 769 | { use integer; | ||
| 770 | my $body = shift; | ||
| 771 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | ||
| 772 | my ($a,$b,$c,$d,$e); | ||
| 773 | |||
| 774 | eval(shift(@insns)); | ||
| 775 | eval(shift(@insns)); | ||
| 776 | &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" | ||
| 777 | eval(shift(@insns)); | ||
| 778 | eval(shift(@insns)); | ||
| 779 | |||
| 780 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
| 781 | eval(shift(@insns)); | ||
| 782 | eval(shift(@insns)); | ||
| 783 | &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords | ||
| 784 | eval(shift(@insns)); | ||
| 785 | eval(shift(@insns)); | ||
| 786 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | ||
| 787 | eval(shift(@insns)); | ||
| 788 | eval(shift(@insns)); | ||
| 789 | |||
| 790 | &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | ||
| 791 | eval(shift(@insns)); | ||
| 792 | eval(shift(@insns)); | ||
| 793 | eval(shift(@insns)); | ||
| 794 | eval(shift(@insns)); | ||
| 795 | |||
| 796 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | ||
| 797 | eval(shift(@insns)); | ||
| 798 | eval(shift(@insns)); | ||
| 799 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 800 | eval(shift(@insns)); | ||
| 801 | eval(shift(@insns)); | ||
| 802 | |||
| 803 | &vpsrld (@Tx[0],@X[0],31); | ||
| 804 | eval(shift(@insns)); | ||
| 805 | eval(shift(@insns)); | ||
| 806 | eval(shift(@insns)); | ||
| 807 | eval(shift(@insns)); | ||
| 808 | |||
| 809 | &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword | ||
| 810 | &vpaddd (@X[0],@X[0],@X[0]); | ||
| 811 | eval(shift(@insns)); | ||
| 812 | eval(shift(@insns)); | ||
| 813 | eval(shift(@insns)); | ||
| 814 | eval(shift(@insns)); | ||
| 815 | |||
| 816 | &vpsrld (@Tx[1],@Tx[2],30); | ||
| 817 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 | ||
| 818 | eval(shift(@insns)); | ||
| 819 | eval(shift(@insns)); | ||
| 820 | eval(shift(@insns)); | ||
| 821 | eval(shift(@insns)); | ||
| 822 | |||
| 823 | &vpslld (@Tx[2],@Tx[2],2); | ||
| 824 | &vpxor (@X[0],@X[0],@Tx[1]); | ||
| 825 | eval(shift(@insns)); | ||
| 826 | eval(shift(@insns)); | ||
| 827 | eval(shift(@insns)); | ||
| 828 | eval(shift(@insns)); | ||
| 829 | |||
| 830 | &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 | ||
| 831 | eval(shift(@insns)); | ||
| 832 | eval(shift(@insns)); | ||
| 833 | &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX | ||
| 834 | eval(shift(@insns)); | ||
| 835 | eval(shift(@insns)); | ||
| 836 | |||
| 837 | |||
| 838 | foreach (@insns) { eval; } # remaining instructions [if any] | ||
| 839 | |||
| 840 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 841 | push(@Tx,shift(@Tx)); | ||
| 842 | } | ||
| 843 | |||
| 844 | sub Xupdate_avx_32_79() | ||
| 845 | { use integer; | ||
| 846 | my $body = shift; | ||
| 847 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | ||
| 848 | my ($a,$b,$c,$d,$e); | ||
| 849 | |||
| 850 | &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" | ||
| 851 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | ||
| 852 | eval(shift(@insns)); # body_20_39 | ||
| 853 | eval(shift(@insns)); | ||
| 854 | eval(shift(@insns)); | ||
| 855 | eval(shift(@insns)); # rol | ||
| 856 | |||
| 857 | &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | ||
| 858 | eval(shift(@insns)); | ||
| 859 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | ||
| 860 | if ($Xi%5) { | ||
| 861 | &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | ||
| 862 | } else { # ... or load next one | ||
| 863 | &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | ||
| 864 | } | ||
| 865 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
| 866 | eval(shift(@insns)); # ror | ||
| 867 | eval(shift(@insns)); | ||
| 868 | |||
| 869 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" | ||
| 870 | eval(shift(@insns)); # body_20_39 | ||
| 871 | eval(shift(@insns)); | ||
| 872 | eval(shift(@insns)); | ||
| 873 | eval(shift(@insns)); # rol | ||
| 874 | |||
| 875 | &vpsrld (@Tx[0],@X[0],30); | ||
| 876 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 877 | eval(shift(@insns)); | ||
| 878 | eval(shift(@insns)); | ||
| 879 | eval(shift(@insns)); # ror | ||
| 880 | eval(shift(@insns)); | ||
| 881 | |||
| 882 | &vpslld (@X[0],@X[0],2); | ||
| 883 | eval(shift(@insns)); # body_20_39 | ||
| 884 | eval(shift(@insns)); | ||
| 885 | eval(shift(@insns)); | ||
| 886 | eval(shift(@insns)); # rol | ||
| 887 | eval(shift(@insns)); | ||
| 888 | eval(shift(@insns)); | ||
| 889 | eval(shift(@insns)); # ror | ||
| 890 | eval(shift(@insns)); | ||
| 891 | |||
| 892 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 | ||
| 893 | eval(shift(@insns)); # body_20_39 | ||
| 894 | eval(shift(@insns)); | ||
| 895 | &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); | ||
| 896 | eval(shift(@insns)); | ||
| 897 | eval(shift(@insns)); # rol | ||
| 898 | eval(shift(@insns)); | ||
| 899 | eval(shift(@insns)); | ||
| 900 | eval(shift(@insns)); # rol | ||
| 901 | eval(shift(@insns)); | ||
| 902 | |||
| 903 | foreach (@insns) { eval; } # remaining instructions | ||
| 904 | |||
| 905 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 906 | push(@Tx,shift(@Tx)); | ||
| 907 | } | ||
| 908 | |||
| 909 | sub Xuplast_avx_80() | ||
| 910 | { use integer; | ||
| 911 | my $body = shift; | ||
| 912 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 913 | my ($a,$b,$c,$d,$e); | ||
| 914 | |||
| 915 | eval(shift(@insns)); | ||
| 916 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
| 917 | eval(shift(@insns)); | ||
| 918 | eval(shift(@insns)); | ||
| 919 | eval(shift(@insns)); | ||
| 920 | eval(shift(@insns)); | ||
| 921 | |||
| 922 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU | ||
| 923 | |||
| 924 | foreach (@insns) { eval; } # remaining instructions | ||
| 925 | |||
| 926 | &cmp ($inp,$len); | ||
| 927 | &je (".Ldone_avx"); | ||
| 928 | |||
| 929 | unshift(@Tx,pop(@Tx)); | ||
| 930 | |||
| 931 | &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask | ||
| 932 | &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 | ||
| 933 | &vmovdqu(@X[-4&7],"0($inp)"); # load input | ||
| 934 | &vmovdqu(@X[-3&7],"16($inp)"); | ||
| 935 | &vmovdqu(@X[-2&7],"32($inp)"); | ||
| 936 | &vmovdqu(@X[-1&7],"48($inp)"); | ||
| 937 | &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap | ||
| 938 | &add ($inp,64); | ||
| 939 | |||
| 940 | $Xi=0; | ||
| 941 | } | ||
| 942 | |||
| 943 | sub Xloop_avx() | ||
| 944 | { use integer; | ||
| 945 | my $body = shift; | ||
| 946 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 947 | my ($a,$b,$c,$d,$e); | ||
| 948 | |||
| 949 | eval(shift(@insns)); | ||
| 950 | eval(shift(@insns)); | ||
| 951 | &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); | ||
| 952 | eval(shift(@insns)); | ||
| 953 | eval(shift(@insns)); | ||
| 954 | &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); | ||
| 955 | eval(shift(@insns)); | ||
| 956 | eval(shift(@insns)); | ||
| 957 | eval(shift(@insns)); | ||
| 958 | eval(shift(@insns)); | ||
| 959 | &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU | ||
| 960 | eval(shift(@insns)); | ||
| 961 | eval(shift(@insns)); | ||
| 962 | |||
| 963 | foreach (@insns) { eval; } | ||
| 964 | $Xi++; | ||
| 965 | } | ||
| 966 | |||
| 967 | sub Xtail_avx() | ||
| 968 | { use integer; | ||
| 969 | my $body = shift; | ||
| 970 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 971 | my ($a,$b,$c,$d,$e); | ||
| 972 | |||
| 973 | foreach (@insns) { eval; } | ||
| 974 | } | ||
| 975 | |||
| 976 | $code.=<<___; | ||
| 977 | .align 16 | ||
| 978 | .Loop_avx: | ||
| 979 | ___ | ||
| 980 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 981 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 982 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 983 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 984 | &Xupdate_avx_32_79(\&body_00_19); | ||
| 985 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 986 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 987 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 988 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 989 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 990 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 991 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 992 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 993 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 994 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 995 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 996 | &Xuplast_avx_80(\&body_20_39); # can jump to "done" | ||
| 997 | |||
| 998 | $saved_j=$j; @saved_V=@V; | ||
| 999 | $saved_r=$r; @saved_rndkey=@rndkey; | ||
| 1000 | |||
| 1001 | &Xloop_avx(\&body_20_39); | ||
| 1002 | &Xloop_avx(\&body_20_39); | ||
| 1003 | &Xloop_avx(\&body_20_39); | ||
| 1004 | |||
| 1005 | $code.=<<___; | ||
| 1006 | vmovups $iv,48($out,$in0) # write output | ||
| 1007 | lea 64($in0),$in0 | ||
| 1008 | |||
| 1009 | add 0($ctx),$A # update context | ||
| 1010 | add 4($ctx),@T[0] | ||
| 1011 | add 8($ctx),$C | ||
| 1012 | add 12($ctx),$D | ||
| 1013 | mov $A,0($ctx) | ||
| 1014 | add 16($ctx),$E | ||
| 1015 | mov @T[0],4($ctx) | ||
| 1016 | mov @T[0],$B # magic seed | ||
| 1017 | mov $C,8($ctx) | ||
| 1018 | mov $D,12($ctx) | ||
| 1019 | mov $E,16($ctx) | ||
| 1020 | jmp .Loop_avx | ||
| 1021 | |||
| 1022 | .align 16 | ||
| 1023 | .Ldone_avx: | ||
| 1024 | ___ | ||
| 1025 | $jj=$j=$saved_j; @V=@saved_V; | ||
| 1026 | $r=$saved_r; @rndkey=@saved_rndkey; | ||
| 1027 | |||
| 1028 | &Xtail_avx(\&body_20_39); | ||
| 1029 | &Xtail_avx(\&body_20_39); | ||
| 1030 | &Xtail_avx(\&body_20_39); | ||
| 1031 | |||
| 1032 | $code.=<<___; | ||
| 1033 | vmovups $iv,48($out,$in0) # write output | ||
| 1034 | mov 88(%rsp),$ivp # restore $ivp | ||
| 1035 | |||
| 1036 | add 0($ctx),$A # update context | ||
| 1037 | add 4($ctx),@T[0] | ||
| 1038 | add 8($ctx),$C | ||
| 1039 | mov $A,0($ctx) | ||
| 1040 | add 12($ctx),$D | ||
| 1041 | mov @T[0],4($ctx) | ||
| 1042 | add 16($ctx),$E | ||
| 1043 | mov $C,8($ctx) | ||
| 1044 | mov $D,12($ctx) | ||
| 1045 | mov $E,16($ctx) | ||
| 1046 | vmovups $iv,($ivp) # write IV | ||
| 1047 | vzeroall | ||
| 1048 | ___ | ||
| 1049 | $code.=<<___ if ($win64); | ||
| 1050 | movaps 96+0(%rsp),%xmm6 | ||
| 1051 | movaps 96+16(%rsp),%xmm7 | ||
| 1052 | movaps 96+32(%rsp),%xmm8 | ||
| 1053 | movaps 96+48(%rsp),%xmm9 | ||
| 1054 | movaps 96+64(%rsp),%xmm10 | ||
| 1055 | movaps 96+80(%rsp),%xmm11 | ||
| 1056 | movaps 96+96(%rsp),%xmm12 | ||
| 1057 | movaps 96+112(%rsp),%xmm13 | ||
| 1058 | movaps 96+128(%rsp),%xmm14 | ||
| 1059 | movaps 96+144(%rsp),%xmm15 | ||
| 1060 | ___ | ||
| 1061 | $code.=<<___; | ||
| 1062 | lea `104+($win64?10*16:0)`(%rsp),%rsi | ||
| 1063 | mov 0(%rsi),%r15 | ||
| 1064 | mov 8(%rsi),%r14 | ||
| 1065 | mov 16(%rsi),%r13 | ||
| 1066 | mov 24(%rsi),%r12 | ||
| 1067 | mov 32(%rsi),%rbp | ||
| 1068 | mov 40(%rsi),%rbx | ||
| 1069 | lea 48(%rsi),%rsp | ||
| 1070 | .Lepilogue_avx: | ||
| 1071 | ret | ||
| 1072 | .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx | ||
| 1073 | ___ | ||
| 1074 | } | ||
| 1075 | $code.=<<___; | ||
| 1076 | .align 64 | ||
| 1077 | K_XX_XX: | ||
| 1078 | .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 | ||
| 1079 | .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 | ||
| 1080 | .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 | ||
| 1081 | .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 | ||
| 1082 | .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask | ||
| 1083 | |||
| 1084 | .asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 1085 | .align 64 | ||
| 1086 | ___ | ||
| 1087 | |||
| 1088 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 1089 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 1090 | if ($win64) { | ||
| 1091 | $rec="%rcx"; | ||
| 1092 | $frame="%rdx"; | ||
| 1093 | $context="%r8"; | ||
| 1094 | $disp="%r9"; | ||
| 1095 | |||
| 1096 | $code.=<<___; | ||
| 1097 | .extern __imp_RtlVirtualUnwind | ||
| 1098 | .type ssse3_handler,\@abi-omnipotent | ||
| 1099 | .align 16 | ||
| 1100 | ssse3_handler: | ||
| 1101 | push %rsi | ||
| 1102 | push %rdi | ||
| 1103 | push %rbx | ||
| 1104 | push %rbp | ||
| 1105 | push %r12 | ||
| 1106 | push %r13 | ||
| 1107 | push %r14 | ||
| 1108 | push %r15 | ||
| 1109 | pushfq | ||
| 1110 | sub \$64,%rsp | ||
| 1111 | |||
| 1112 | mov 120($context),%rax # pull context->Rax | ||
| 1113 | mov 248($context),%rbx # pull context->Rip | ||
| 1114 | |||
| 1115 | mov 8($disp),%rsi # disp->ImageBase | ||
| 1116 | mov 56($disp),%r11 # disp->HandlerData | ||
| 1117 | |||
| 1118 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 1119 | lea (%rsi,%r10),%r10 # prologue label | ||
| 1120 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 1121 | jb .Lcommon_seh_tail | ||
| 1122 | |||
| 1123 | mov 152($context),%rax # pull context->Rsp | ||
| 1124 | |||
| 1125 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 1126 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 1127 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 1128 | jae .Lcommon_seh_tail | ||
| 1129 | |||
| 1130 | lea 96(%rax),%rsi | ||
| 1131 | lea 512($context),%rdi # &context.Xmm6 | ||
| 1132 | mov \$20,%ecx | ||
| 1133 | .long 0xa548f3fc # cld; rep movsq | ||
| 1134 | lea `104+10*16`(%rax),%rax # adjust stack pointer | ||
| 1135 | |||
| 1136 | mov 0(%rax),%r15 | ||
| 1137 | mov 8(%rax),%r14 | ||
| 1138 | mov 16(%rax),%r13 | ||
| 1139 | mov 24(%rax),%r12 | ||
| 1140 | mov 32(%rax),%rbp | ||
| 1141 | mov 40(%rax),%rbx | ||
| 1142 | lea 48(%rax),%rax | ||
| 1143 | mov %rbx,144($context) # restore context->Rbx | ||
| 1144 | mov %rbp,160($context) # restore context->Rbp | ||
| 1145 | mov %r12,216($context) # restore context->R12 | ||
| 1146 | mov %r13,224($context) # restore context->R13 | ||
| 1147 | mov %r14,232($context) # restore context->R14 | ||
| 1148 | mov %r15,240($context) # restore context->R15 | ||
| 1149 | |||
| 1150 | .Lcommon_seh_tail: | ||
| 1151 | mov 8(%rax),%rdi | ||
| 1152 | mov 16(%rax),%rsi | ||
| 1153 | mov %rax,152($context) # restore context->Rsp | ||
| 1154 | mov %rsi,168($context) # restore context->Rsi | ||
| 1155 | mov %rdi,176($context) # restore context->Rdi | ||
| 1156 | |||
| 1157 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 1158 | mov $context,%rsi # context | ||
| 1159 | mov \$154,%ecx # sizeof(CONTEXT) | ||
| 1160 | .long 0xa548f3fc # cld; rep movsq | ||
| 1161 | |||
| 1162 | mov $disp,%rsi | ||
| 1163 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 1164 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 1165 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 1166 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 1167 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 1168 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 1169 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 1170 | mov %r10,32(%rsp) # arg5 | ||
| 1171 | mov %r11,40(%rsp) # arg6 | ||
| 1172 | mov %r12,48(%rsp) # arg7 | ||
| 1173 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 1174 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 1175 | |||
| 1176 | mov \$1,%eax # ExceptionContinueSearch | ||
| 1177 | add \$64,%rsp | ||
| 1178 | popfq | ||
| 1179 | pop %r15 | ||
| 1180 | pop %r14 | ||
| 1181 | pop %r13 | ||
| 1182 | pop %r12 | ||
| 1183 | pop %rbp | ||
| 1184 | pop %rbx | ||
| 1185 | pop %rdi | ||
| 1186 | pop %rsi | ||
| 1187 | ret | ||
| 1188 | .size ssse3_handler,.-ssse3_handler | ||
| 1189 | |||
| 1190 | .section .pdata | ||
| 1191 | .align 4 | ||
| 1192 | .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3 | ||
| 1193 | .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3 | ||
| 1194 | .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3 | ||
| 1195 | ___ | ||
| 1196 | $code.=<<___ if ($avx); | ||
| 1197 | .rva .LSEH_begin_aesni_cbc_sha1_enc_avx | ||
| 1198 | .rva .LSEH_end_aesni_cbc_sha1_enc_avx | ||
| 1199 | .rva .LSEH_info_aesni_cbc_sha1_enc_avx | ||
| 1200 | ___ | ||
| 1201 | $code.=<<___; | ||
| 1202 | .section .xdata | ||
| 1203 | .align 8 | ||
| 1204 | .LSEH_info_aesni_cbc_sha1_enc_ssse3: | ||
| 1205 | .byte 9,0,0,0 | ||
| 1206 | .rva ssse3_handler | ||
| 1207 | .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] | ||
| 1208 | ___ | ||
| 1209 | $code.=<<___ if ($avx); | ||
| 1210 | .LSEH_info_aesni_cbc_sha1_enc_avx: | ||
| 1211 | .byte 9,0,0,0 | ||
| 1212 | .rva ssse3_handler | ||
| 1213 | .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] | ||
| 1214 | ___ | ||
| 1215 | } | ||
| 1216 | |||
| 1217 | #################################################################### | ||
| 1218 | sub rex { | ||
| 1219 | local *opcode=shift; | ||
| 1220 | my ($dst,$src)=@_; | ||
| 1221 | my $rex=0; | ||
| 1222 | |||
| 1223 | $rex|=0x04 if($dst>=8); | ||
| 1224 | $rex|=0x01 if($src>=8); | ||
| 1225 | push @opcode,$rex|0x40 if($rex); | ||
| 1226 | } | ||
| 1227 | |||
| 1228 | sub aesni { | ||
| 1229 | my $line=shift; | ||
| 1230 | my @opcode=(0x66); | ||
| 1231 | |||
| 1232 | if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { | ||
| 1233 | my %opcodelet = ( | ||
| 1234 | "aesenc" => 0xdc, "aesenclast" => 0xdd | ||
| 1235 | ); | ||
| 1236 | return undef if (!defined($opcodelet{$1})); | ||
| 1237 | rex(\@opcode,$3,$2); | ||
| 1238 | push @opcode,0x0f,0x38,$opcodelet{$1}; | ||
| 1239 | push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M | ||
| 1240 | return ".byte\t".join(',',@opcode); | ||
| 1241 | } | ||
| 1242 | return $line; | ||
| 1243 | } | ||
| 1244 | |||
| 1245 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 1246 | $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; | ||
| 1247 | |||
| 1248 | print $code; | ||
| 1249 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86.pl b/src/lib/libcrypto/aes/asm/aesni-x86.pl new file mode 100644 index 0000000000..3dc345b585 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aesni-x86.pl | |||
| @@ -0,0 +1,2189 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # This module implements support for Intel AES-NI extension. In | ||
| 11 | # OpenSSL context it's used with Intel engine, but can also be used as | ||
| 12 | # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for | ||
| 13 | # details]. | ||
| 14 | # | ||
| 15 | # Performance. | ||
| 16 | # | ||
| 17 | # To start with see corresponding paragraph in aesni-x86_64.pl... | ||
| 18 | # Instead of filling table similar to one found there I've chosen to | ||
| 19 | # summarize *comparison* results for raw ECB, CTR and CBC benchmarks. | ||
| 20 | # The simplified table below represents 32-bit performance relative | ||
| 21 | # to 64-bit one in every given point. Ratios vary for different | ||
| 22 | # encryption modes, therefore interval values. | ||
| 23 | # | ||
| 24 | # 16-byte 64-byte 256-byte 1-KB 8-KB | ||
| 25 | # 53-67% 67-84% 91-94% 95-98% 97-99.5% | ||
| 26 | # | ||
| 27 | # Lower ratios for smaller block sizes are perfectly understandable, | ||
| 28 | # because function call overhead is higher in 32-bit mode. Largest | ||
| 29 | # 8-KB block performance is virtually same: 32-bit code is less than | ||
| 30 | # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. | ||
| 31 | |||
| 32 | # January 2011 | ||
| 33 | # | ||
| 34 | # See aesni-x86_64.pl for details. Unlike x86_64 version this module | ||
| 35 | # interleaves at most 6 aes[enc|dec] instructions, because there are | ||
| 36 | # not enough registers for 8x interleave [which should be optimal for | ||
| 37 | # Sandy Bridge]. Actually, performance results for 6x interleave | ||
| 38 | # factor presented in aesni-x86_64.pl (except for CTR) are for this | ||
| 39 | # module. | ||
| 40 | |||
| 41 | # April 2011 | ||
| 42 | # | ||
| 43 | # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing | ||
| 44 | # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. | ||
| 45 | |||
| 46 | $PREFIX="aesni"; # if $PREFIX is set to "AES", the script | ||
| 47 | # generates drop-in replacement for | ||
| 48 | # crypto/aes/asm/aes-586.pl:-) | ||
| 49 | $inline=1; # inline _aesni_[en|de]crypt | ||
| 50 | |||
| 51 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 52 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 53 | require "x86asm.pl"; | ||
| 54 | |||
| 55 | &asm_init($ARGV[0],$0); | ||
| 56 | |||
| 57 | if ($PREFIX eq "aesni") { $movekey=*movups; } | ||
| 58 | else { $movekey=*movups; } | ||
| 59 | |||
| 60 | $len="eax"; | ||
| 61 | $rounds="ecx"; | ||
| 62 | $key="edx"; | ||
| 63 | $inp="esi"; | ||
| 64 | $out="edi"; | ||
| 65 | $rounds_="ebx"; # backup copy for $rounds | ||
| 66 | $key_="ebp"; # backup copy for $key | ||
| 67 | |||
| 68 | $rndkey0="xmm0"; | ||
| 69 | $rndkey1="xmm1"; | ||
| 70 | $inout0="xmm2"; | ||
| 71 | $inout1="xmm3"; | ||
| 72 | $inout2="xmm4"; | ||
| 73 | $inout3="xmm5"; $in1="xmm5"; | ||
| 74 | $inout4="xmm6"; $in0="xmm6"; | ||
| 75 | $inout5="xmm7"; $ivec="xmm7"; | ||
| 76 | |||
| 77 | # AESNI extension | ||
| 78 | sub aeskeygenassist | ||
| 79 | { my($dst,$src,$imm)=@_; | ||
| 80 | if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) | ||
| 81 | { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } | ||
| 82 | } | ||
| 83 | sub aescommon | ||
| 84 | { my($opcodelet,$dst,$src)=@_; | ||
| 85 | if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) | ||
| 86 | { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} | ||
| 87 | } | ||
| 88 | sub aesimc { aescommon(0xdb,@_); } | ||
| 89 | sub aesenc { aescommon(0xdc,@_); } | ||
| 90 | sub aesenclast { aescommon(0xdd,@_); } | ||
| 91 | sub aesdec { aescommon(0xde,@_); } | ||
| 92 | sub aesdeclast { aescommon(0xdf,@_); } | ||
| 93 | |||
| 94 | # Inline version of internal aesni_[en|de]crypt1 | ||
| 95 | { my $sn; | ||
| 96 | sub aesni_inline_generate1 | ||
| 97 | { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); | ||
| 98 | $sn++; | ||
| 99 | |||
| 100 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 101 | &$movekey ($rndkey1,&QWP(16,$key)); | ||
| 102 | &xorps ($ivec,$rndkey0) if (defined($ivec)); | ||
| 103 | &lea ($key,&DWP(32,$key)); | ||
| 104 | &xorps ($inout,$ivec) if (defined($ivec)); | ||
| 105 | &xorps ($inout,$rndkey0) if (!defined($ivec)); | ||
| 106 | &set_label("${p}1_loop_$sn"); | ||
| 107 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 108 | &dec ($rounds); | ||
| 109 | &$movekey ($rndkey1,&QWP(0,$key)); | ||
| 110 | &lea ($key,&DWP(16,$key)); | ||
| 111 | &jnz (&label("${p}1_loop_$sn")); | ||
| 112 | eval"&aes${p}last ($inout,$rndkey1)"; | ||
| 113 | }} | ||
| 114 | |||
| 115 | sub aesni_generate1 # fully unrolled loop | ||
| 116 | { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); | ||
| 117 | |||
| 118 | &function_begin_B("_aesni_${p}rypt1"); | ||
| 119 | &movups ($rndkey0,&QWP(0,$key)); | ||
| 120 | &$movekey ($rndkey1,&QWP(0x10,$key)); | ||
| 121 | &xorps ($inout,$rndkey0); | ||
| 122 | &$movekey ($rndkey0,&QWP(0x20,$key)); | ||
| 123 | &lea ($key,&DWP(0x30,$key)); | ||
| 124 | &cmp ($rounds,11); | ||
| 125 | &jb (&label("${p}128")); | ||
| 126 | &lea ($key,&DWP(0x20,$key)); | ||
| 127 | &je (&label("${p}192")); | ||
| 128 | &lea ($key,&DWP(0x20,$key)); | ||
| 129 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 130 | &$movekey ($rndkey1,&QWP(-0x40,$key)); | ||
| 131 | eval"&aes${p} ($inout,$rndkey0)"; | ||
| 132 | &$movekey ($rndkey0,&QWP(-0x30,$key)); | ||
| 133 | &set_label("${p}192"); | ||
| 134 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 135 | &$movekey ($rndkey1,&QWP(-0x20,$key)); | ||
| 136 | eval"&aes${p} ($inout,$rndkey0)"; | ||
| 137 | &$movekey ($rndkey0,&QWP(-0x10,$key)); | ||
| 138 | &set_label("${p}128"); | ||
| 139 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 140 | &$movekey ($rndkey1,&QWP(0,$key)); | ||
| 141 | eval"&aes${p} ($inout,$rndkey0)"; | ||
| 142 | &$movekey ($rndkey0,&QWP(0x10,$key)); | ||
| 143 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 144 | &$movekey ($rndkey1,&QWP(0x20,$key)); | ||
| 145 | eval"&aes${p} ($inout,$rndkey0)"; | ||
| 146 | &$movekey ($rndkey0,&QWP(0x30,$key)); | ||
| 147 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 148 | &$movekey ($rndkey1,&QWP(0x40,$key)); | ||
| 149 | eval"&aes${p} ($inout,$rndkey0)"; | ||
| 150 | &$movekey ($rndkey0,&QWP(0x50,$key)); | ||
| 151 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 152 | &$movekey ($rndkey1,&QWP(0x60,$key)); | ||
| 153 | eval"&aes${p} ($inout,$rndkey0)"; | ||
| 154 | &$movekey ($rndkey0,&QWP(0x70,$key)); | ||
| 155 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 156 | eval"&aes${p}last ($inout,$rndkey0)"; | ||
| 157 | &ret(); | ||
| 158 | &function_end_B("_aesni_${p}rypt1"); | ||
| 159 | } | ||
| 160 | |||
| 161 | # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); | ||
| 162 | &aesni_generate1("enc") if (!$inline); | ||
| 163 | &function_begin_B("${PREFIX}_encrypt"); | ||
| 164 | &mov ("eax",&wparam(0)); | ||
| 165 | &mov ($key,&wparam(2)); | ||
| 166 | &movups ($inout0,&QWP(0,"eax")); | ||
| 167 | &mov ($rounds,&DWP(240,$key)); | ||
| 168 | &mov ("eax",&wparam(1)); | ||
| 169 | if ($inline) | ||
| 170 | { &aesni_inline_generate1("enc"); } | ||
| 171 | else | ||
| 172 | { &call ("_aesni_encrypt1"); } | ||
| 173 | &movups (&QWP(0,"eax"),$inout0); | ||
| 174 | &ret (); | ||
| 175 | &function_end_B("${PREFIX}_encrypt"); | ||
| 176 | |||
| 177 | # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); | ||
| 178 | &aesni_generate1("dec") if(!$inline); | ||
| 179 | &function_begin_B("${PREFIX}_decrypt"); | ||
| 180 | &mov ("eax",&wparam(0)); | ||
| 181 | &mov ($key,&wparam(2)); | ||
| 182 | &movups ($inout0,&QWP(0,"eax")); | ||
| 183 | &mov ($rounds,&DWP(240,$key)); | ||
| 184 | &mov ("eax",&wparam(1)); | ||
| 185 | if ($inline) | ||
| 186 | { &aesni_inline_generate1("dec"); } | ||
| 187 | else | ||
| 188 | { &call ("_aesni_decrypt1"); } | ||
| 189 | &movups (&QWP(0,"eax"),$inout0); | ||
| 190 | &ret (); | ||
| 191 | &function_end_B("${PREFIX}_decrypt"); | ||
| 192 | |||
| 193 | # _aesni_[en|de]cryptN are private interfaces, N denotes interleave | ||
| 194 | # factor. Why 3x subroutine were originally used in loops? Even though | ||
| 195 | # aes[enc|dec] latency was originally 6, it could be scheduled only | ||
| 196 | # every *2nd* cycle. Thus 3x interleave was the one providing optimal | ||
| 197 | # utilization, i.e. when subroutine's throughput is virtually same as | ||
| 198 | # of non-interleaved subroutine [for number of input blocks up to 3]. | ||
| 199 | # This is why it makes no sense to implement 2x subroutine. | ||
| 200 | # aes[enc|dec] latency in next processor generation is 8, but the | ||
| 201 | # instructions can be scheduled every cycle. Optimal interleave for | ||
| 202 | # new processor is therefore 8x, but it's unfeasible to accommodate it | ||
| 203 | # in XMM registers addressable in 32-bit mode and therefore 6x is | ||
| 204 | # used instead... | ||
| 205 | |||
| 206 | sub aesni_generate3 | ||
| 207 | { my $p=shift; | ||
| 208 | |||
| 209 | &function_begin_B("_aesni_${p}rypt3"); | ||
| 210 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 211 | &shr ($rounds,1); | ||
| 212 | &$movekey ($rndkey1,&QWP(16,$key)); | ||
| 213 | &lea ($key,&DWP(32,$key)); | ||
| 214 | &xorps ($inout0,$rndkey0); | ||
| 215 | &pxor ($inout1,$rndkey0); | ||
| 216 | &pxor ($inout2,$rndkey0); | ||
| 217 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 218 | |||
| 219 | &set_label("${p}3_loop"); | ||
| 220 | eval"&aes${p} ($inout0,$rndkey1)"; | ||
| 221 | eval"&aes${p} ($inout1,$rndkey1)"; | ||
| 222 | &dec ($rounds); | ||
| 223 | eval"&aes${p} ($inout2,$rndkey1)"; | ||
| 224 | &$movekey ($rndkey1,&QWP(16,$key)); | ||
| 225 | eval"&aes${p} ($inout0,$rndkey0)"; | ||
| 226 | eval"&aes${p} ($inout1,$rndkey0)"; | ||
| 227 | &lea ($key,&DWP(32,$key)); | ||
| 228 | eval"&aes${p} ($inout2,$rndkey0)"; | ||
| 229 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 230 | &jnz (&label("${p}3_loop")); | ||
| 231 | eval"&aes${p} ($inout0,$rndkey1)"; | ||
| 232 | eval"&aes${p} ($inout1,$rndkey1)"; | ||
| 233 | eval"&aes${p} ($inout2,$rndkey1)"; | ||
| 234 | eval"&aes${p}last ($inout0,$rndkey0)"; | ||
| 235 | eval"&aes${p}last ($inout1,$rndkey0)"; | ||
| 236 | eval"&aes${p}last ($inout2,$rndkey0)"; | ||
| 237 | &ret(); | ||
| 238 | &function_end_B("_aesni_${p}rypt3"); | ||
| 239 | } | ||
| 240 | |||
| 241 | # 4x interleave is implemented to improve small block performance, | ||
| 242 | # most notably [and naturally] 4 block by ~30%. One can argue that one | ||
| 243 | # should have implemented 5x as well, but improvement would be <20%, | ||
| 244 | # so it's not worth it... | ||
| 245 | sub aesni_generate4 | ||
| 246 | { my $p=shift; | ||
| 247 | |||
| 248 | &function_begin_B("_aesni_${p}rypt4"); | ||
| 249 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 250 | &$movekey ($rndkey1,&QWP(16,$key)); | ||
| 251 | &shr ($rounds,1); | ||
| 252 | &lea ($key,&DWP(32,$key)); | ||
| 253 | &xorps ($inout0,$rndkey0); | ||
| 254 | &pxor ($inout1,$rndkey0); | ||
| 255 | &pxor ($inout2,$rndkey0); | ||
| 256 | &pxor ($inout3,$rndkey0); | ||
| 257 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 258 | |||
| 259 | &set_label("${p}4_loop"); | ||
| 260 | eval"&aes${p} ($inout0,$rndkey1)"; | ||
| 261 | eval"&aes${p} ($inout1,$rndkey1)"; | ||
| 262 | &dec ($rounds); | ||
| 263 | eval"&aes${p} ($inout2,$rndkey1)"; | ||
| 264 | eval"&aes${p} ($inout3,$rndkey1)"; | ||
| 265 | &$movekey ($rndkey1,&QWP(16,$key)); | ||
| 266 | eval"&aes${p} ($inout0,$rndkey0)"; | ||
| 267 | eval"&aes${p} ($inout1,$rndkey0)"; | ||
| 268 | &lea ($key,&DWP(32,$key)); | ||
| 269 | eval"&aes${p} ($inout2,$rndkey0)"; | ||
| 270 | eval"&aes${p} ($inout3,$rndkey0)"; | ||
| 271 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 272 | &jnz (&label("${p}4_loop")); | ||
| 273 | |||
| 274 | eval"&aes${p} ($inout0,$rndkey1)"; | ||
| 275 | eval"&aes${p} ($inout1,$rndkey1)"; | ||
| 276 | eval"&aes${p} ($inout2,$rndkey1)"; | ||
| 277 | eval"&aes${p} ($inout3,$rndkey1)"; | ||
| 278 | eval"&aes${p}last ($inout0,$rndkey0)"; | ||
| 279 | eval"&aes${p}last ($inout1,$rndkey0)"; | ||
| 280 | eval"&aes${p}last ($inout2,$rndkey0)"; | ||
| 281 | eval"&aes${p}last ($inout3,$rndkey0)"; | ||
| 282 | &ret(); | ||
| 283 | &function_end_B("_aesni_${p}rypt4"); | ||
| 284 | } | ||
| 285 | |||
sub aesni_generate6
{ my $p=shift;	# "enc" or "dec" — selects the aesenc/aesdec instruction family

    # Emit _aesni_[en|de]crypt6: process six blocks ($inout0..$inout5) in
    # parallel with the key schedule at $key ($rounds = round count).
    # Six independent AES pipelines are interleaved to hide the latency of
    # the aes* instructions.
    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);		# main loop does two rounds per iteration
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	# Prologue: whiten each block with round key 0, already interleaved
	# with the first aes* round so the pipelines start filling early.
	eval"&aes${p}	($inout0,$rndkey1)";
	&pxor		($inout2,$rndkey0);
	eval"&aes${p}	($inout1,$rndkey1)";
	&pxor		($inout3,$rndkey0);
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	&pxor		($inout4,$rndkey0);
	eval"&aes${p}	($inout3,$rndkey1)";
	&pxor		($inout5,$rndkey0);
	eval"&aes${p}	($inout4,$rndkey1)";
	&$movekey	($rndkey0,&QWP(0,$key));
	eval"&aes${p}	($inout5,$rndkey1)";
	&jmp		(&label("_aesni_${p}rypt6_enter"));

    # Loop body: one aes* round for all six blocks with $rndkey1, then fall
    # through to the _enter point for the matching $rndkey0 round.
    &set_label("${p}6_loop",16);
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	eval"&aes${p}	($inout5,$rndkey1)";
    &set_label("_aesni_${p}rypt6_enter",16);
	&$movekey	($rndkey1,&QWP(16,$key));
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	&lea		($key,&DWP(32,$key));
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	eval"&aes${p}	($inout4,$rndkey0)";
	eval"&aes${p}	($inout5,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label("${p}6_loop"));

	# Epilogue: penultimate round with $rndkey1, then the aes*last round.
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	eval"&aes${p}	($inout5,$rndkey1)";
	eval"&aes${p}last ($inout0,$rndkey0)";
	eval"&aes${p}last ($inout1,$rndkey0)";
	eval"&aes${p}last ($inout2,$rndkey0)";
	eval"&aes${p}last ($inout3,$rndkey0)";
	eval"&aes${p}last ($inout4,$rndkey0)";
	eval"&aes${p}last ($inout5,$rndkey0)";
	&ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Emit the 3-, 4- and 6-block helper routines.  The encrypt flavours are
# only needed in the AES-NI build ($PREFIX eq "aesni"); the decrypt
# flavours are emitted unconditionally.
for my $generator (\&aesni_generate3, \&aesni_generate4, \&aesni_generate6) {
	$generator->("enc") if ($PREFIX eq "aesni");
	$generator->("dec");
}
| 352 | |||
| 353 | if ($PREFIX eq "aesni") { | ||
| 354 | ###################################################################### | ||
| 355 | # void aesni_ecb_encrypt (const void *in, void *out, | ||
| 356 | # size_t length, const AES_KEY *key, | ||
| 357 | # int enc); | ||
# aesni_ecb_encrypt: ECB-[en|de]crypt $len bytes (truncated to whole
# 16-byte blocks) from $inp to $out with the schedule at $key; wparam(4)
# selects encrypt (non-zero) or decrypt (zero).  Six blocks are processed
# per bulk-loop iteration; 1..5-block tails go to the narrower helpers.
#
# Fix: restored the '&' on the jmp after the five-block encrypt tail
# (was bare "jmp", inconsistent with every other perlasm call here).
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);			# whole blocks only
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));	# key->rounds
	&test	($rounds_,$rounds_);		# enc flag
	&jz	(&label("ecb_decrypt"));

	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds
	&cmp	($len,0x60);			# fewer than six blocks?
	&jb	(&label("ecb_enc_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_enc_loop6_enter"));

&set_label("ecb_enc_loop6",16);
	# Interleave stores of the previous six blocks with loads of the next six.
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");

	&call	("_aesni_encrypt6");

	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_enc_loop6"));

	# Flush the last six blocks of the bulk loop.
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);			# undo the loop's final subtraction
	&jz	(&label("ecb_ret"));

&set_label("ecb_enc_tail");
	# 1..5 blocks remain; dispatch on $len.
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_enc_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_enc_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_enc_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_enc_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);		# five blocks: pad pipeline with zero block
	&call	("_aesni_encrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));		# FIX: was "jmp" — leading '&' restored

&set_label("ecb_enc_one",16);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_two",16);
	&xorps	($inout2,$inout2);		# two blocks: pad with zero block
	&call	("_aesni_encrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_three",16);
	&call	("_aesni_encrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_four",16);
	&call	("_aesni_encrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&jmp	(&label("ecb_ret"));
######################################################################
# Decrypt path: same structure as the encrypt path with _aesni_decryptN.
&set_label("ecb_decrypt",16);
	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_dec_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_dec_loop6_enter"));

&set_label("ecb_dec_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");

	&call	("_aesni_decrypt6");

	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_dec_loop6"));

	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

&set_label("ecb_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_dec_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_dec_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_dec_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_dec_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_decrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_two",16);
	&xorps	($inout2,$inout2);
	&call	("_aesni_decrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_three",16);
	&call	("_aesni_decrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	# falls through to ecb_ret

&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
| 572 | |||
| 573 | ###################################################################### | ||
| 574 | # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, | ||
| 575 | # size_t blocks, const AES_KEY *key, | ||
| 576 | # const char *ivec,char *cmac); | ||
| 577 | # | ||
| 578 | # Handles only complete blocks, operates on 64-bit counter and | ||
| 579 | # does not update *ivec! Nor does it finalize CMAC value | ||
| 580 | # (see engine/eng_aesni.c for details) | ||
| 581 | # | ||
{ my $cmac=$inout1;	# the running CMAC block lives in $inout1 throughout

# aesni_ccm64_encrypt_blocks: CCM with 64-bit counter.  Encrypts the
# counter block and the CMAC block side by side (two interleaved AES
# pipelines), XORs plaintext into the CMAC before encryption and into
# E(ivec) to produce ciphertext.  The final CMAC is stored to wparam(5).
&function_begin("aesni_ccm64_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);	# align stack
	&mov	(&DWP(48,"esp"),$key_);		# save original %esp

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&shr	($rounds,1);			# two rounds per inner-loop iteration
	&lea	($key_,&DWP(0,$key));		# $key_ keeps the schedule base
	&movdqa	($inout3,&QWP(0,"esp"));	# $inout3 keeps the bswap mask
	&movdqa	($inout0,$ivec);
	&mov	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);		# $ivec held byte-swapped for paddq

&set_label("ccm64_enc_outer");
	# Per block: whiten counter block, fold input into CMAC, then run the
	# two pipelines through the schedule together.
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov	($rounds,$rounds_);
	&movups	($in0,&QWP(0,$inp));

	&xorps	($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps	($rndkey0,$in0);
	&lea	($key,&DWP(32,$key_));
	&xorps	($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(0,$key));

&set_label("ccm64_enc2_loop");
	&aesenc	($inout0,$rndkey1);
	&dec	($rounds);
	&aesenc	($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&aesenc	($inout0,$rndkey0);
	&lea	($key,&DWP(32,$key));
	&aesenc	($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz	(&label("ccm64_enc2_loop"));
	&aesenc	($inout0,$rndkey1);
	&aesenc	($cmac,$rndkey1);
	&paddq	($ivec,&QWP(16,"esp"));		# bump 64-bit counter meanwhile
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&dec	($len);
	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);		# counter back to big-endian layout
	&jnz	(&label("ccm64_enc_outer"));

	&mov	("esp",&DWP(48,"esp"));		# restore %esp
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);		# store final CMAC
&function_end("aesni_ccm64_encrypt_blocks");

# aesni_ccm64_decrypt_blocks: as above, but the CMAC absorbs the
# *decrypted* output, so E(counter) for the next block is computed while
# the previous CMAC update is still in flight.
&function_begin("aesni_ccm64_decrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);	# align stack
	&mov	(&DWP(48,"esp"),$key_);		# save original %esp

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	&pshufb	($ivec,$inout3);
	# Encrypt the first counter block up front.
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&mov	($rounds,$rounds_);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));	# last block: CMAC update only

	&$movekey	($rndkey0,&QWP(0,$key_));
	&shr	($rounds,1);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps	($in0,$rndkey0);
	&lea	($key,&DWP(32,$key_));
	&xorps	($inout0,$rndkey0);
	&xorps	($cmac,$in0);			# cmac^=out
	&$movekey	($rndkey0,&QWP(0,$key));

&set_label("ccm64_dec2_loop");
	&aesenc	($inout0,$rndkey1);
	&dec	($rounds);
	&aesenc	($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&aesenc	($inout0,$rndkey0);
	&lea	($key,&DWP(32,$key));
	&aesenc	($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz	(&label("ccm64_dec2_loop"));
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&aesenc	($inout0,$rndkey1);
	&aesenc	($cmac,$rndkey1);
	&lea	($inp,&QWP(16,$inp));
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	# Fold the final plaintext block into the CMAC.
	&mov	($key,$key_);
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	else
	{   &call	("_aesni_encrypt1",$cmac);	}

	&mov	("esp",&DWP(48,"esp"));		# restore %esp
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);		# store final CMAC
&function_end("aesni_ccm64_decrypt_blocks");
}
| 759 | |||
| 760 | ###################################################################### | ||
| 761 | # void aesni_ctr32_encrypt_blocks (const void *in, void *out, | ||
| 762 | # size_t blocks, const AES_KEY *key, | ||
| 763 | # const char *ivec); | ||
| 764 | # | ||
| 765 | # Handles only complete blocks, operates on 32-bit counter and | ||
| 766 | # does not update *ivec! (see engine/eng_aesni.c for details) | ||
| 767 | # | ||
| 768 | # stack layout: | ||
| 769 | # 0 pshufb mask | ||
| 770 | # 16 vector addend: 0,6,6,6 | ||
| 771 | # 32 counter-less ivec | ||
| 772 | # 48 1st triplet of counter vector | ||
| 773 | # 64 2nd triplet of counter vector | ||
| 774 | # 80 saved %esp | ||
| 775 | |||
# aesni_ctr32_encrypt_blocks: CTR mode with a 32-bit big-endian counter
# in the last dword of ivec.  Six counter blocks are kept in flight; the
# six 32-bit counters are maintained as two 3x32-bit "triplets" (at
# esp+48 / esp+64) that are bulk-incremented with paddd and byte-swapped
# with pshufb, avoiding six scalar bswap/inc sequences per iteration.
&function_begin("aesni_ctr32_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($key_,"esp");
	&sub	("esp",88);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(80,"esp"),$key_);		# save original %esp

	&cmp	($len,1);
	&je	(&label("ctr32_one_shortcut"));	# single block: skip all setup

	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack (adds 6 to each triplet lane)
	&mov	($rounds,6);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds);
	&mov	(&DWP(20,"esp"),$rounds);
	&mov	(&DWP(24,"esp"),$rounds);
	&mov	(&DWP(28,"esp"),$key_);

	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter

	&mov	($rounds,&DWP(240,$key));	# key->rounds

	# compose 2 vectors of 3x32-bit counters: ctr..ctr+2 and ctr+3..ctr+5
	&bswap	($rounds_);			# counter to host order for arithmetic
	&pxor	($rndkey1,$rndkey1);
	&pxor	($rndkey0,$rndkey0);
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&pinsrd	($rndkey1,$rounds_,0);
	&lea	($key_,&DWP(3,$rounds_));	# ctr+3 starts the 2nd triplet
	&pinsrd	($rndkey0,$key_,0);
	&inc	($rounds_);
	&pinsrd	($rndkey1,$rounds_,1);
	&inc	($key_);
	&pinsrd	($rndkey0,$key_,1);
	&inc	($rounds_);
	&pinsrd	($rndkey1,$rounds_,2);
	&inc	($key_);
	&pinsrd	($rndkey0,$key_,2);
	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
	&pshufb	($rndkey0,$inout0);		# byte swap

	&pshufd	($inout0,$rndkey1,3<<6);	# place counter to upper dword
	&pshufd	($inout1,$rndkey1,2<<6);
	&cmp	($len,6);
	&jb	(&label("ctr32_tail"));
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec
	&shr	($rounds,1);
	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds
	&sub	($len,6);
	&jmp	(&label("ctr32_loop6"));

&set_label("ctr32_loop6",16);
	# Build the six counter blocks: broadcast each triplet lane into the
	# top dword, then OR in the counter-less ivec.
	&pshufd	($inout2,$rndkey1,1<<6);
	&movdqa	($rndkey1,&QWP(32,"esp"));	# pull counter-less ivec
	&pshufd	($inout3,$rndkey0,3<<6);
	&por	($inout0,$rndkey1);		# merge counter-less ivec
	&pshufd	($inout4,$rndkey0,2<<6);
	&por	($inout1,$rndkey1);
	&pshufd	($inout5,$rndkey0,1<<6);
	&por	($inout2,$rndkey1);
	&por	($inout3,$rndkey1);
	&por	($inout4,$rndkey1);
	&por	($inout5,$rndkey1);

	# inlining _aesni_encrypt6's prologue gives ~4% improvement...
	&$movekey	($rndkey0,&QWP(0,$key_));
	&$movekey	($rndkey1,&QWP(16,$key_));
	&lea	($key,&DWP(32,$key_));
	&dec	($rounds);
	&pxor	($inout0,$rndkey0);
	&pxor	($inout1,$rndkey0);
	&aesenc	($inout0,$rndkey1);
	&pxor	($inout2,$rndkey0);
	&aesenc	($inout1,$rndkey1);
	&pxor	($inout3,$rndkey0);
	&aesenc	($inout2,$rndkey1);
	&pxor	($inout4,$rndkey0);
	&aesenc	($inout3,$rndkey1);
	&pxor	($inout5,$rndkey0);
	&aesenc	($inout4,$rndkey1);
	&$movekey	($rndkey0,&QWP(0,$key));
	&aesenc	($inout5,$rndkey1);

	&call	(&label("_aesni_encrypt6_enter"));	# join the shared round loop

	# XOR keystream with input and store, interleaved with the counter
	# bookkeeping for the next iteration.
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&xorps	($inout2,$rndkey1);
	&movdqa	($rndkey1,&QWP(48,"esp"));	# load 1st triplet
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);

	&paddd	($rndkey1,$rndkey0);		# 1st triplet increment
	&paddd	($rndkey0,&QWP(64,"esp"));	# 2nd triplet increment
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask

	&movups	($inout1,&QWP(0x30,$inp));
	&movups	($inout2,&QWP(0x40,$inp));
	&xorps	($inout3,$inout1);
	&movups	($inout1,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&xorps	($inout4,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&xorps	($inout5,$inout1);
	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&movups	(&QWP(0x40,$out),$inout4);
	&pshufd	($inout0,$rndkey1,3<<6);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));

	&mov	($rounds,$rounds_);
	&pshufd	($inout1,$rndkey1,2<<6);
	&sub	($len,6);
	&jnc	(&label("ctr32_loop6"));

	&add	($len,6);
	&jz	(&label("ctr32_ret"));
	&mov	($key,$key_);
	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds (undo shr by 1)
	&movdqa	($inout5,&QWP(32,"esp"));	# pull count-less ivec

&set_label("ctr32_tail");
	# 1..5 blocks left; counter blocks already staged in $inout0..$inout4.
	&por	($inout0,$inout5);
	&cmp	($len,2);
	&jb	(&label("ctr32_one"));

	&pshufd	($inout2,$rndkey1,1<<6);
	&por	($inout1,$inout5);
	&je	(&label("ctr32_two"));

	&pshufd	($inout3,$rndkey0,3<<6);
	&por	($inout2,$inout5);
	&cmp	($len,4);
	&jb	(&label("ctr32_three"));

	&pshufd	($inout4,$rndkey0,2<<6);
	&por	($inout3,$inout5);
	&je	(&label("ctr32_four"));

	&por	($inout4,$inout5);
	&call	("_aesni_encrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout2,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout4,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_one_shortcut",16);
	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
	&mov	($rounds,&DWP(240,$key));

&set_label("ctr32_one");
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));
	&xorps	($in0,$inout0);
	&movups	(&QWP(0,$out),$in0);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_three",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&movups	($inout5,&QWP(0x20,$inp));
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_four",16);
	&call	("_aesni_encrypt4");
	&movups	($inout4,&QWP(0,$inp));
	&movups	($inout5,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout0,$inout4);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout1,$inout5);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	# falls through to ctr32_ret

&set_label("ctr32_ret");
	&mov	("esp",&DWP(80,"esp"));		# restore %esp
&function_end("aesni_ctr32_encrypt_blocks");
| 1013 | |||
| 1014 | ###################################################################### | ||
| 1015 | # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
| 1016 | # const AES_KEY *key1, const AES_KEY *key2 | ||
| 1017 | # const unsigned char iv[16]); | ||
| 1018 | # | ||
| 1019 | { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); | ||
| 1020 | |||
| 1021 | &function_begin("aesni_xts_encrypt"); | ||
| 1022 | &mov ($key,&wparam(4)); # key2 | ||
| 1023 | &mov ($inp,&wparam(5)); # clear-text tweak | ||
| 1024 | |||
| 1025 | &mov ($rounds,&DWP(240,$key)); # key2->rounds | ||
| 1026 | &movups ($inout0,&QWP(0,$inp)); | ||
| 1027 | if ($inline) | ||
| 1028 | { &aesni_inline_generate1("enc"); } | ||
| 1029 | else | ||
| 1030 | { &call ("_aesni_encrypt1"); } | ||
| 1031 | |||
| 1032 | &mov ($inp,&wparam(0)); | ||
| 1033 | &mov ($out,&wparam(1)); | ||
| 1034 | &mov ($len,&wparam(2)); | ||
| 1035 | &mov ($key,&wparam(3)); # key1 | ||
| 1036 | |||
| 1037 | &mov ($key_,"esp"); | ||
| 1038 | &sub ("esp",16*7+8); | ||
| 1039 | &mov ($rounds,&DWP(240,$key)); # key1->rounds | ||
| 1040 | &and ("esp",-16); # align stack | ||
| 1041 | |||
| 1042 | &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant | ||
| 1043 | &mov (&DWP(16*6+4,"esp"),0); | ||
| 1044 | &mov (&DWP(16*6+8,"esp"),1); | ||
| 1045 | &mov (&DWP(16*6+12,"esp"),0); | ||
| 1046 | &mov (&DWP(16*7+0,"esp"),$len); # save original $len | ||
| 1047 | &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp | ||
| 1048 | |||
| 1049 | &movdqa ($tweak,$inout0); | ||
| 1050 | &pxor ($twtmp,$twtmp); | ||
| 1051 | &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 | ||
| 1052 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1053 | |||
| 1054 | &and ($len,-16); | ||
| 1055 | &mov ($key_,$key); # backup $key | ||
| 1056 | &mov ($rounds_,$rounds); # backup $rounds | ||
| 1057 | &sub ($len,16*6); | ||
| 1058 | &jc (&label("xts_enc_short")); | ||
| 1059 | |||
| 1060 | &shr ($rounds,1); | ||
| 1061 | &mov ($rounds_,$rounds); | ||
| 1062 | &jmp (&label("xts_enc_loop6")); | ||
| 1063 | |||
| 1064 | &set_label("xts_enc_loop6",16); | ||
| 1065 | for ($i=0;$i<4;$i++) { | ||
| 1066 | &pshufd ($twres,$twtmp,0x13); | ||
| 1067 | &pxor ($twtmp,$twtmp); | ||
| 1068 | &movdqa (&QWP(16*$i,"esp"),$tweak); | ||
| 1069 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1070 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1071 | &pcmpgtd ($twtmp,$tweak); # broadcast upper bits | ||
| 1072 | &pxor ($tweak,$twres); | ||
| 1073 | } | ||
| 1074 | &pshufd ($inout5,$twtmp,0x13); | ||
| 1075 | &movdqa (&QWP(16*$i++,"esp"),$tweak); | ||
| 1076 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1077 | &$movekey ($rndkey0,&QWP(0,$key_)); | ||
| 1078 | &pand ($inout5,$twmask); # isolate carry and residue | ||
| 1079 | &movups ($inout0,&QWP(0,$inp)); # load input | ||
| 1080 | &pxor ($inout5,$tweak); | ||
| 1081 | |||
| 1082 | # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] | ||
| 1083 | &movdqu ($inout1,&QWP(16*1,$inp)); | ||
| 1084 | &xorps ($inout0,$rndkey0); # input^=rndkey[0] | ||
| 1085 | &movdqu ($inout2,&QWP(16*2,$inp)); | ||
| 1086 | &pxor ($inout1,$rndkey0); | ||
| 1087 | &movdqu ($inout3,&QWP(16*3,$inp)); | ||
| 1088 | &pxor ($inout2,$rndkey0); | ||
| 1089 | &movdqu ($inout4,&QWP(16*4,$inp)); | ||
| 1090 | &pxor ($inout3,$rndkey0); | ||
| 1091 | &movdqu ($rndkey1,&QWP(16*5,$inp)); | ||
| 1092 | &pxor ($inout4,$rndkey0); | ||
| 1093 | &lea ($inp,&DWP(16*6,$inp)); | ||
| 1094 | &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
| 1095 | &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak | ||
| 1096 | &pxor ($inout5,$rndkey1); | ||
| 1097 | |||
| 1098 | &$movekey ($rndkey1,&QWP(16,$key_)); | ||
| 1099 | &lea ($key,&DWP(32,$key_)); | ||
| 1100 | &pxor ($inout1,&QWP(16*1,"esp")); | ||
| 1101 | &aesenc ($inout0,$rndkey1); | ||
| 1102 | &pxor ($inout2,&QWP(16*2,"esp")); | ||
| 1103 | &aesenc ($inout1,$rndkey1); | ||
| 1104 | &pxor ($inout3,&QWP(16*3,"esp")); | ||
| 1105 | &dec ($rounds); | ||
| 1106 | &aesenc ($inout2,$rndkey1); | ||
| 1107 | &pxor ($inout4,&QWP(16*4,"esp")); | ||
| 1108 | &aesenc ($inout3,$rndkey1); | ||
| 1109 | &pxor ($inout5,$rndkey0); | ||
| 1110 | &aesenc ($inout4,$rndkey1); | ||
| 1111 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 1112 | &aesenc ($inout5,$rndkey1); | ||
| 1113 | &call (&label("_aesni_encrypt6_enter")); | ||
| 1114 | |||
| 1115 | &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak | ||
| 1116 | &pxor ($twtmp,$twtmp); | ||
| 1117 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
| 1118 | &pcmpgtd ($twtmp,$tweak); # broadcast upper bits | ||
| 1119 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1120 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1121 | &xorps ($inout2,&QWP(16*2,"esp")); | ||
| 1122 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1123 | &xorps ($inout3,&QWP(16*3,"esp")); | ||
| 1124 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1125 | &xorps ($inout4,&QWP(16*4,"esp")); | ||
| 1126 | &movups (&QWP(16*3,$out),$inout3); | ||
| 1127 | &xorps ($inout5,$tweak); | ||
| 1128 | &movups (&QWP(16*4,$out),$inout4); | ||
| 1129 | &pshufd ($twres,$twtmp,0x13); | ||
| 1130 | &movups (&QWP(16*5,$out),$inout5); | ||
| 1131 | &lea ($out,&DWP(16*6,$out)); | ||
| 1132 | &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 | ||
| 1133 | |||
| 1134 | &pxor ($twtmp,$twtmp); | ||
| 1135 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1136 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1137 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1138 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1139 | &pxor ($tweak,$twres); | ||
| 1140 | |||
| 1141 | &sub ($len,16*6); | ||
| 1142 | &jnc (&label("xts_enc_loop6")); | ||
| 1143 | |||
| 1144 | &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds | ||
| 1145 | &mov ($key,$key_); # restore $key | ||
| 1146 | &mov ($rounds_,$rounds); | ||
| 1147 | |||
| 1148 | &set_label("xts_enc_short"); | ||
| 1149 | &add ($len,16*6); | ||
| 1150 | &jz (&label("xts_enc_done6x")); | ||
| 1151 | |||
| 1152 | &movdqa ($inout3,$tweak); # put aside previous tweak | ||
| 1153 | &cmp ($len,0x20); | ||
| 1154 | &jb (&label("xts_enc_one")); | ||
| 1155 | |||
| 1156 | &pshufd ($twres,$twtmp,0x13); | ||
| 1157 | &pxor ($twtmp,$twtmp); | ||
| 1158 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1159 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1160 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1161 | &pxor ($tweak,$twres); | ||
| 1162 | &je (&label("xts_enc_two")); | ||
| 1163 | |||
| 1164 | &pshufd ($twres,$twtmp,0x13); | ||
| 1165 | &pxor ($twtmp,$twtmp); | ||
| 1166 | &movdqa ($inout4,$tweak); # put aside previous tweak | ||
| 1167 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1168 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1169 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1170 | &pxor ($tweak,$twres); | ||
| 1171 | &cmp ($len,0x40); | ||
| 1172 | &jb (&label("xts_enc_three")); | ||
| 1173 | |||
| 1174 | &pshufd ($twres,$twtmp,0x13); | ||
| 1175 | &pxor ($twtmp,$twtmp); | ||
| 1176 | &movdqa ($inout5,$tweak); # put aside previous tweak | ||
| 1177 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1178 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1179 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1180 | &pxor ($tweak,$twres); | ||
| 1181 | &movdqa (&QWP(16*0,"esp"),$inout3); | ||
| 1182 | &movdqa (&QWP(16*1,"esp"),$inout4); | ||
| 1183 | &je (&label("xts_enc_four")); | ||
| 1184 | |||
| 1185 | &movdqa (&QWP(16*2,"esp"),$inout5); | ||
| 1186 | &pshufd ($inout5,$twtmp,0x13); | ||
| 1187 | &movdqa (&QWP(16*3,"esp"),$tweak); | ||
| 1188 | &paddq ($tweak,$tweak); # &psllq($inout0,1); | ||
| 1189 | &pand ($inout5,$twmask); # isolate carry and residue | ||
| 1190 | &pxor ($inout5,$tweak); | ||
| 1191 | |||
| 1192 | &movdqu ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1193 | &movdqu ($inout1,&QWP(16*1,$inp)); | ||
| 1194 | &movdqu ($inout2,&QWP(16*2,$inp)); | ||
| 1195 | &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
| 1196 | &movdqu ($inout3,&QWP(16*3,$inp)); | ||
| 1197 | &pxor ($inout1,&QWP(16*1,"esp")); | ||
| 1198 | &movdqu ($inout4,&QWP(16*4,$inp)); | ||
| 1199 | &pxor ($inout2,&QWP(16*2,"esp")); | ||
| 1200 | &lea ($inp,&DWP(16*5,$inp)); | ||
| 1201 | &pxor ($inout3,&QWP(16*3,"esp")); | ||
| 1202 | &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak | ||
| 1203 | &pxor ($inout4,$inout5); | ||
| 1204 | |||
| 1205 | &call ("_aesni_encrypt6"); | ||
| 1206 | |||
| 1207 | &movaps ($tweak,&QWP(16*4,"esp")); # last tweak | ||
| 1208 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
| 1209 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1210 | &xorps ($inout2,&QWP(16*2,"esp")); | ||
| 1211 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1212 | &xorps ($inout3,&QWP(16*3,"esp")); | ||
| 1213 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1214 | &xorps ($inout4,$tweak); | ||
| 1215 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1216 | &movups (&QWP(16*3,$out),$inout3); | ||
| 1217 | &movups (&QWP(16*4,$out),$inout4); | ||
| 1218 | &lea ($out,&DWP(16*5,$out)); | ||
| 1219 | &jmp (&label("xts_enc_done")); | ||
| 1220 | |||
| 1221 | &set_label("xts_enc_one",16); | ||
| 1222 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1223 | &lea ($inp,&DWP(16*1,$inp)); | ||
| 1224 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1225 | if ($inline) | ||
| 1226 | { &aesni_inline_generate1("enc"); } | ||
| 1227 | else | ||
| 1228 | { &call ("_aesni_encrypt1"); } | ||
| 1229 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1230 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1231 | &lea ($out,&DWP(16*1,$out)); | ||
| 1232 | |||
| 1233 | &movdqa ($tweak,$inout3); # last tweak | ||
| 1234 | &jmp (&label("xts_enc_done")); | ||
| 1235 | |||
| 1236 | &set_label("xts_enc_two",16); | ||
| 1237 | &movaps ($inout4,$tweak); # put aside last tweak | ||
| 1238 | |||
| 1239 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1240 | &movups ($inout1,&QWP(16*1,$inp)); | ||
| 1241 | &lea ($inp,&DWP(16*2,$inp)); | ||
| 1242 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1243 | &xorps ($inout1,$inout4); | ||
| 1244 | &xorps ($inout2,$inout2); | ||
| 1245 | |||
| 1246 | &call ("_aesni_encrypt3"); | ||
| 1247 | |||
| 1248 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1249 | &xorps ($inout1,$inout4); | ||
| 1250 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1251 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1252 | &lea ($out,&DWP(16*2,$out)); | ||
| 1253 | |||
| 1254 | &movdqa ($tweak,$inout4); # last tweak | ||
| 1255 | &jmp (&label("xts_enc_done")); | ||
| 1256 | |||
| 1257 | &set_label("xts_enc_three",16); | ||
| 1258 | &movaps ($inout5,$tweak); # put aside last tweak | ||
| 1259 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1260 | &movups ($inout1,&QWP(16*1,$inp)); | ||
| 1261 | &movups ($inout2,&QWP(16*2,$inp)); | ||
| 1262 | &lea ($inp,&DWP(16*3,$inp)); | ||
| 1263 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1264 | &xorps ($inout1,$inout4); | ||
| 1265 | &xorps ($inout2,$inout5); | ||
| 1266 | |||
| 1267 | &call ("_aesni_encrypt3"); | ||
| 1268 | |||
| 1269 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1270 | &xorps ($inout1,$inout4); | ||
| 1271 | &xorps ($inout2,$inout5); | ||
| 1272 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1273 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1274 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1275 | &lea ($out,&DWP(16*3,$out)); | ||
| 1276 | |||
| 1277 | &movdqa ($tweak,$inout5); # last tweak | ||
| 1278 | &jmp (&label("xts_enc_done")); | ||
| 1279 | |||
| 1280 | &set_label("xts_enc_four",16); | ||
| 1281 | &movaps ($inout4,$tweak); # put aside last tweak | ||
| 1282 | |||
| 1283 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1284 | &movups ($inout1,&QWP(16*1,$inp)); | ||
| 1285 | &movups ($inout2,&QWP(16*2,$inp)); | ||
| 1286 | &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
| 1287 | &movups ($inout3,&QWP(16*3,$inp)); | ||
| 1288 | &lea ($inp,&DWP(16*4,$inp)); | ||
| 1289 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1290 | &xorps ($inout2,$inout5); | ||
| 1291 | &xorps ($inout3,$inout4); | ||
| 1292 | |||
| 1293 | &call ("_aesni_encrypt4"); | ||
| 1294 | |||
| 1295 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
| 1296 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1297 | &xorps ($inout2,$inout5); | ||
| 1298 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1299 | &xorps ($inout3,$inout4); | ||
| 1300 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1301 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1302 | &movups (&QWP(16*3,$out),$inout3); | ||
| 1303 | &lea ($out,&DWP(16*4,$out)); | ||
| 1304 | |||
| 1305 | &movdqa ($tweak,$inout4); # last tweak | ||
| 1306 | &jmp (&label("xts_enc_done")); | ||
| 1307 | |||
| 1308 | &set_label("xts_enc_done6x",16); # $tweak is pre-calculated | ||
| 1309 | &mov ($len,&DWP(16*7+0,"esp")); # restore original $len | ||
| 1310 | &and ($len,15); | ||
| 1311 | &jz (&label("xts_enc_ret")); | ||
| 1312 | &movdqa ($inout3,$tweak); | ||
| 1313 | &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 | ||
| 1314 | &jmp (&label("xts_enc_steal")); | ||
| 1315 | |||
| 1316 | &set_label("xts_enc_done",16); | ||
| 1317 | &mov ($len,&DWP(16*7+0,"esp")); # restore original $len | ||
| 1318 | &pxor ($twtmp,$twtmp); | ||
| 1319 | &and ($len,15); | ||
| 1320 | &jz (&label("xts_enc_ret")); | ||
| 1321 | |||
| 1322 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1323 | &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 | ||
| 1324 | &pshufd ($inout3,$twtmp,0x13); | ||
| 1325 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1326 | &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue | ||
| 1327 | &pxor ($inout3,$tweak); | ||
| 1328 | |||
| 1329 | &set_label("xts_enc_steal"); | ||
| 1330 | &movz ($rounds,&BP(0,$inp)); | ||
| 1331 | &movz ($key,&BP(-16,$out)); | ||
| 1332 | &lea ($inp,&DWP(1,$inp)); | ||
| 1333 | &mov (&BP(-16,$out),&LB($rounds)); | ||
| 1334 | &mov (&BP(0,$out),&LB($key)); | ||
| 1335 | &lea ($out,&DWP(1,$out)); | ||
| 1336 | &sub ($len,1); | ||
| 1337 | &jnz (&label("xts_enc_steal")); | ||
| 1338 | |||
| 1339 | &sub ($out,&DWP(16*7+0,"esp")); # rewind $out | ||
| 1340 | &mov ($key,$key_); # restore $key | ||
| 1341 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1342 | |||
| 1343 | &movups ($inout0,&QWP(-16,$out)); # load input | ||
| 1344 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1345 | if ($inline) | ||
| 1346 | { &aesni_inline_generate1("enc"); } | ||
| 1347 | else | ||
| 1348 | { &call ("_aesni_encrypt1"); } | ||
| 1349 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1350 | &movups (&QWP(-16,$out),$inout0); # write output | ||
| 1351 | |||
| 1352 | &set_label("xts_enc_ret"); | ||
| 1353 | &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp | ||
| 1354 | &function_end("aesni_xts_encrypt"); | ||
| 1355 | |||
| 1356 | &function_begin("aesni_xts_decrypt"); | ||
| 1357 | &mov ($key,&wparam(4)); # key2 | ||
| 1358 | &mov ($inp,&wparam(5)); # clear-text tweak | ||
| 1359 | |||
| 1360 | &mov ($rounds,&DWP(240,$key)); # key2->rounds | ||
| 1361 | &movups ($inout0,&QWP(0,$inp)); | ||
| 1362 | if ($inline) | ||
| 1363 | { &aesni_inline_generate1("enc"); } | ||
| 1364 | else | ||
| 1365 | { &call ("_aesni_encrypt1"); } | ||
| 1366 | |||
| 1367 | &mov ($inp,&wparam(0)); | ||
| 1368 | &mov ($out,&wparam(1)); | ||
| 1369 | &mov ($len,&wparam(2)); | ||
| 1370 | &mov ($key,&wparam(3)); # key1 | ||
| 1371 | |||
| 1372 | &mov ($key_,"esp"); | ||
| 1373 | &sub ("esp",16*7+8); | ||
| 1374 | &and ("esp",-16); # align stack | ||
| 1375 | |||
| 1376 | &xor ($rounds_,$rounds_); # if(len%16) len-=16; | ||
| 1377 | &test ($len,15); | ||
| 1378 | &setnz (&LB($rounds_)); | ||
| 1379 | &shl ($rounds_,4); | ||
| 1380 | &sub ($len,$rounds_); | ||
| 1381 | |||
| 1382 | &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant | ||
| 1383 | &mov (&DWP(16*6+4,"esp"),0); | ||
| 1384 | &mov (&DWP(16*6+8,"esp"),1); | ||
| 1385 | &mov (&DWP(16*6+12,"esp"),0); | ||
| 1386 | &mov (&DWP(16*7+0,"esp"),$len); # save original $len | ||
| 1387 | &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp | ||
| 1388 | |||
| 1389 | &mov ($rounds,&DWP(240,$key)); # key1->rounds | ||
| 1390 | &mov ($key_,$key); # backup $key | ||
| 1391 | &mov ($rounds_,$rounds); # backup $rounds | ||
| 1392 | |||
| 1393 | &movdqa ($tweak,$inout0); | ||
| 1394 | &pxor ($twtmp,$twtmp); | ||
| 1395 | &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 | ||
| 1396 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1397 | |||
| 1398 | &and ($len,-16); | ||
| 1399 | &sub ($len,16*6); | ||
| 1400 | &jc (&label("xts_dec_short")); | ||
| 1401 | |||
| 1402 | &shr ($rounds,1); | ||
| 1403 | &mov ($rounds_,$rounds); | ||
| 1404 | &jmp (&label("xts_dec_loop6")); | ||
| 1405 | |||
| 1406 | &set_label("xts_dec_loop6",16); | ||
| 1407 | for ($i=0;$i<4;$i++) { | ||
| 1408 | &pshufd ($twres,$twtmp,0x13); | ||
| 1409 | &pxor ($twtmp,$twtmp); | ||
| 1410 | &movdqa (&QWP(16*$i,"esp"),$tweak); | ||
| 1411 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1412 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1413 | &pcmpgtd ($twtmp,$tweak); # broadcast upper bits | ||
| 1414 | &pxor ($tweak,$twres); | ||
| 1415 | } | ||
| 1416 | &pshufd ($inout5,$twtmp,0x13); | ||
| 1417 | &movdqa (&QWP(16*$i++,"esp"),$tweak); | ||
| 1418 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1419 | &$movekey ($rndkey0,&QWP(0,$key_)); | ||
| 1420 | &pand ($inout5,$twmask); # isolate carry and residue | ||
| 1421 | &movups ($inout0,&QWP(0,$inp)); # load input | ||
| 1422 | &pxor ($inout5,$tweak); | ||
| 1423 | |||
| 1424 | # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] | ||
| 1425 | &movdqu ($inout1,&QWP(16*1,$inp)); | ||
| 1426 | &xorps ($inout0,$rndkey0); # input^=rndkey[0] | ||
| 1427 | &movdqu ($inout2,&QWP(16*2,$inp)); | ||
| 1428 | &pxor ($inout1,$rndkey0); | ||
| 1429 | &movdqu ($inout3,&QWP(16*3,$inp)); | ||
| 1430 | &pxor ($inout2,$rndkey0); | ||
| 1431 | &movdqu ($inout4,&QWP(16*4,$inp)); | ||
| 1432 | &pxor ($inout3,$rndkey0); | ||
| 1433 | &movdqu ($rndkey1,&QWP(16*5,$inp)); | ||
| 1434 | &pxor ($inout4,$rndkey0); | ||
| 1435 | &lea ($inp,&DWP(16*6,$inp)); | ||
| 1436 | &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
| 1437 | &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak | ||
| 1438 | &pxor ($inout5,$rndkey1); | ||
| 1439 | |||
| 1440 | &$movekey ($rndkey1,&QWP(16,$key_)); | ||
| 1441 | &lea ($key,&DWP(32,$key_)); | ||
| 1442 | &pxor ($inout1,&QWP(16*1,"esp")); | ||
| 1443 | &aesdec ($inout0,$rndkey1); | ||
| 1444 | &pxor ($inout2,&QWP(16*2,"esp")); | ||
| 1445 | &aesdec ($inout1,$rndkey1); | ||
| 1446 | &pxor ($inout3,&QWP(16*3,"esp")); | ||
| 1447 | &dec ($rounds); | ||
| 1448 | &aesdec ($inout2,$rndkey1); | ||
| 1449 | &pxor ($inout4,&QWP(16*4,"esp")); | ||
| 1450 | &aesdec ($inout3,$rndkey1); | ||
| 1451 | &pxor ($inout5,$rndkey0); | ||
| 1452 | &aesdec ($inout4,$rndkey1); | ||
| 1453 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 1454 | &aesdec ($inout5,$rndkey1); | ||
| 1455 | &call (&label("_aesni_decrypt6_enter")); | ||
| 1456 | |||
| 1457 | &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak | ||
| 1458 | &pxor ($twtmp,$twtmp); | ||
| 1459 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
| 1460 | &pcmpgtd ($twtmp,$tweak); # broadcast upper bits | ||
| 1461 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1462 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1463 | &xorps ($inout2,&QWP(16*2,"esp")); | ||
| 1464 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1465 | &xorps ($inout3,&QWP(16*3,"esp")); | ||
| 1466 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1467 | &xorps ($inout4,&QWP(16*4,"esp")); | ||
| 1468 | &movups (&QWP(16*3,$out),$inout3); | ||
| 1469 | &xorps ($inout5,$tweak); | ||
| 1470 | &movups (&QWP(16*4,$out),$inout4); | ||
| 1471 | &pshufd ($twres,$twtmp,0x13); | ||
| 1472 | &movups (&QWP(16*5,$out),$inout5); | ||
| 1473 | &lea ($out,&DWP(16*6,$out)); | ||
| 1474 | &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 | ||
| 1475 | |||
| 1476 | &pxor ($twtmp,$twtmp); | ||
| 1477 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1478 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1479 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1480 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1481 | &pxor ($tweak,$twres); | ||
| 1482 | |||
| 1483 | &sub ($len,16*6); | ||
| 1484 | &jnc (&label("xts_dec_loop6")); | ||
| 1485 | |||
| 1486 | &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds | ||
| 1487 | &mov ($key,$key_); # restore $key | ||
| 1488 | &mov ($rounds_,$rounds); | ||
| 1489 | |||
| 1490 | &set_label("xts_dec_short"); | ||
| 1491 | &add ($len,16*6); | ||
| 1492 | &jz (&label("xts_dec_done6x")); | ||
| 1493 | |||
| 1494 | &movdqa ($inout3,$tweak); # put aside previous tweak | ||
| 1495 | &cmp ($len,0x20); | ||
| 1496 | &jb (&label("xts_dec_one")); | ||
| 1497 | |||
| 1498 | &pshufd ($twres,$twtmp,0x13); | ||
| 1499 | &pxor ($twtmp,$twtmp); | ||
| 1500 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1501 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1502 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1503 | &pxor ($tweak,$twres); | ||
| 1504 | &je (&label("xts_dec_two")); | ||
| 1505 | |||
| 1506 | &pshufd ($twres,$twtmp,0x13); | ||
| 1507 | &pxor ($twtmp,$twtmp); | ||
| 1508 | &movdqa ($inout4,$tweak); # put aside previous tweak | ||
| 1509 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1510 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1511 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1512 | &pxor ($tweak,$twres); | ||
| 1513 | &cmp ($len,0x40); | ||
| 1514 | &jb (&label("xts_dec_three")); | ||
| 1515 | |||
| 1516 | &pshufd ($twres,$twtmp,0x13); | ||
| 1517 | &pxor ($twtmp,$twtmp); | ||
| 1518 | &movdqa ($inout5,$tweak); # put aside previous tweak | ||
| 1519 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1520 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1521 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1522 | &pxor ($tweak,$twres); | ||
| 1523 | &movdqa (&QWP(16*0,"esp"),$inout3); | ||
| 1524 | &movdqa (&QWP(16*1,"esp"),$inout4); | ||
| 1525 | &je (&label("xts_dec_four")); | ||
| 1526 | |||
| 1527 | &movdqa (&QWP(16*2,"esp"),$inout5); | ||
| 1528 | &pshufd ($inout5,$twtmp,0x13); | ||
| 1529 | &movdqa (&QWP(16*3,"esp"),$tweak); | ||
| 1530 | &paddq ($tweak,$tweak); # &psllq($inout0,1); | ||
| 1531 | &pand ($inout5,$twmask); # isolate carry and residue | ||
| 1532 | &pxor ($inout5,$tweak); | ||
| 1533 | |||
| 1534 | &movdqu ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1535 | &movdqu ($inout1,&QWP(16*1,$inp)); | ||
| 1536 | &movdqu ($inout2,&QWP(16*2,$inp)); | ||
| 1537 | &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
| 1538 | &movdqu ($inout3,&QWP(16*3,$inp)); | ||
| 1539 | &pxor ($inout1,&QWP(16*1,"esp")); | ||
| 1540 | &movdqu ($inout4,&QWP(16*4,$inp)); | ||
| 1541 | &pxor ($inout2,&QWP(16*2,"esp")); | ||
| 1542 | &lea ($inp,&DWP(16*5,$inp)); | ||
| 1543 | &pxor ($inout3,&QWP(16*3,"esp")); | ||
| 1544 | &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak | ||
| 1545 | &pxor ($inout4,$inout5); | ||
| 1546 | |||
| 1547 | &call ("_aesni_decrypt6"); | ||
| 1548 | |||
| 1549 | &movaps ($tweak,&QWP(16*4,"esp")); # last tweak | ||
| 1550 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
| 1551 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1552 | &xorps ($inout2,&QWP(16*2,"esp")); | ||
| 1553 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1554 | &xorps ($inout3,&QWP(16*3,"esp")); | ||
| 1555 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1556 | &xorps ($inout4,$tweak); | ||
| 1557 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1558 | &movups (&QWP(16*3,$out),$inout3); | ||
| 1559 | &movups (&QWP(16*4,$out),$inout4); | ||
| 1560 | &lea ($out,&DWP(16*5,$out)); | ||
| 1561 | &jmp (&label("xts_dec_done")); | ||
| 1562 | |||
| 1563 | &set_label("xts_dec_one",16); | ||
| 1564 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1565 | &lea ($inp,&DWP(16*1,$inp)); | ||
| 1566 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1567 | if ($inline) | ||
| 1568 | { &aesni_inline_generate1("dec"); } | ||
| 1569 | else | ||
| 1570 | { &call ("_aesni_decrypt1"); } | ||
| 1571 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1572 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1573 | &lea ($out,&DWP(16*1,$out)); | ||
| 1574 | |||
| 1575 | &movdqa ($tweak,$inout3); # last tweak | ||
| 1576 | &jmp (&label("xts_dec_done")); | ||
| 1577 | |||
| 1578 | &set_label("xts_dec_two",16); | ||
| 1579 | &movaps ($inout4,$tweak); # put aside last tweak | ||
| 1580 | |||
| 1581 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1582 | &movups ($inout1,&QWP(16*1,$inp)); | ||
| 1583 | &lea ($inp,&DWP(16*2,$inp)); | ||
| 1584 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1585 | &xorps ($inout1,$inout4); | ||
| 1586 | |||
| 1587 | &call ("_aesni_decrypt3"); | ||
| 1588 | |||
| 1589 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1590 | &xorps ($inout1,$inout4); | ||
| 1591 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1592 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1593 | &lea ($out,&DWP(16*2,$out)); | ||
| 1594 | |||
| 1595 | &movdqa ($tweak,$inout4); # last tweak | ||
| 1596 | &jmp (&label("xts_dec_done")); | ||
| 1597 | |||
| 1598 | &set_label("xts_dec_three",16); | ||
| 1599 | &movaps ($inout5,$tweak); # put aside last tweak | ||
| 1600 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1601 | &movups ($inout1,&QWP(16*1,$inp)); | ||
| 1602 | &movups ($inout2,&QWP(16*2,$inp)); | ||
| 1603 | &lea ($inp,&DWP(16*3,$inp)); | ||
| 1604 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1605 | &xorps ($inout1,$inout4); | ||
| 1606 | &xorps ($inout2,$inout5); | ||
| 1607 | |||
| 1608 | &call ("_aesni_decrypt3"); | ||
| 1609 | |||
| 1610 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1611 | &xorps ($inout1,$inout4); | ||
| 1612 | &xorps ($inout2,$inout5); | ||
| 1613 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1614 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1615 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1616 | &lea ($out,&DWP(16*3,$out)); | ||
| 1617 | |||
| 1618 | &movdqa ($tweak,$inout5); # last tweak | ||
| 1619 | &jmp (&label("xts_dec_done")); | ||
| 1620 | |||
| 1621 | &set_label("xts_dec_four",16); | ||
| 1622 | &movaps ($inout4,$tweak); # put aside last tweak | ||
| 1623 | |||
| 1624 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1625 | &movups ($inout1,&QWP(16*1,$inp)); | ||
| 1626 | &movups ($inout2,&QWP(16*2,$inp)); | ||
| 1627 | &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
| 1628 | &movups ($inout3,&QWP(16*3,$inp)); | ||
| 1629 | &lea ($inp,&DWP(16*4,$inp)); | ||
| 1630 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1631 | &xorps ($inout2,$inout5); | ||
| 1632 | &xorps ($inout3,$inout4); | ||
| 1633 | |||
| 1634 | &call ("_aesni_decrypt4"); | ||
| 1635 | |||
| 1636 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
| 1637 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1638 | &xorps ($inout2,$inout5); | ||
| 1639 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1640 | &xorps ($inout3,$inout4); | ||
| 1641 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1642 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1643 | &movups (&QWP(16*3,$out),$inout3); | ||
| 1644 | &lea ($out,&DWP(16*4,$out)); | ||
| 1645 | |||
| 1646 | &movdqa ($tweak,$inout4); # last tweak | ||
| 1647 | &jmp (&label("xts_dec_done")); | ||
| 1648 | |||
| 1649 | &set_label("xts_dec_done6x",16); # $tweak is pre-calculated | ||
| 1650 | &mov ($len,&DWP(16*7+0,"esp")); # restore original $len | ||
| 1651 | &and ($len,15); | ||
| 1652 | &jz (&label("xts_dec_ret")); | ||
| 1653 | &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 | ||
| 1654 | &jmp (&label("xts_dec_only_one_more")); | ||
| 1655 | |||
| 1656 | &set_label("xts_dec_done",16); | ||
| 1657 | &mov ($len,&DWP(16*7+0,"esp")); # restore original $len | ||
| 1658 | &pxor ($twtmp,$twtmp); | ||
| 1659 | &and ($len,15); | ||
| 1660 | &jz (&label("xts_dec_ret")); | ||
| 1661 | |||
| 1662 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1663 | &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 | ||
| 1664 | &pshufd ($twres,$twtmp,0x13); | ||
| 1665 | &pxor ($twtmp,$twtmp); | ||
| 1666 | &movdqa ($twmask,&QWP(16*6,"esp")); | ||
| 1667 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1668 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1669 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1670 | &pxor ($tweak,$twres); | ||
| 1671 | |||
| 1672 | &set_label("xts_dec_only_one_more"); | ||
| 1673 | &pshufd ($inout3,$twtmp,0x13); | ||
| 1674 | &movdqa ($inout4,$tweak); # put aside previous tweak | ||
| 1675 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1676 | &pand ($inout3,$twmask); # isolate carry and residue | ||
| 1677 | &pxor ($inout3,$tweak); | ||
| 1678 | |||
| 1679 | &mov ($key,$key_); # restore $key | ||
| 1680 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1681 | |||
| 1682 | &movups ($inout0,&QWP(0,$inp)); # load input | ||
| 1683 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1684 | if ($inline) | ||
| 1685 | { &aesni_inline_generate1("dec"); } | ||
| 1686 | else | ||
| 1687 | { &call ("_aesni_decrypt1"); } | ||
| 1688 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1689 | &movups (&QWP(0,$out),$inout0); # write output | ||
| 1690 | |||
| 1691 | &set_label("xts_dec_steal"); | ||
| 1692 | &movz ($rounds,&BP(16,$inp)); | ||
| 1693 | &movz ($key,&BP(0,$out)); | ||
| 1694 | &lea ($inp,&DWP(1,$inp)); | ||
| 1695 | &mov (&BP(0,$out),&LB($rounds)); | ||
| 1696 | &mov (&BP(16,$out),&LB($key)); | ||
| 1697 | &lea ($out,&DWP(1,$out)); | ||
| 1698 | &sub ($len,1); | ||
| 1699 | &jnz (&label("xts_dec_steal")); | ||
| 1700 | |||
| 1701 | &sub ($out,&DWP(16*7+0,"esp")); # rewind $out | ||
| 1702 | &mov ($key,$key_); # restore $key | ||
| 1703 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1704 | |||
| 1705 | &movups ($inout0,&QWP(0,$out)); # load input | ||
| 1706 | &xorps ($inout0,$inout4); # input^=tweak | ||
| 1707 | if ($inline) | ||
| 1708 | { &aesni_inline_generate1("dec"); } | ||
| 1709 | else | ||
| 1710 | { &call ("_aesni_decrypt1"); } | ||
| 1711 | &xorps ($inout0,$inout4); # output^=tweak | ||
| 1712 | &movups (&QWP(0,$out),$inout0); # write output | ||
| 1713 | |||
| 1714 | &set_label("xts_dec_ret"); | ||
| 1715 | &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp | ||
| 1716 | &function_end("aesni_xts_decrypt"); | ||
| 1717 | } | ||
| 1718 | } | ||
| 1719 | |||
| 1720 | ###################################################################### | ||
| 1721 | # void $PREFIX_cbc_encrypt (const void *inp, void *out, | ||
| 1722 | # size_t length, const AES_KEY *key, | ||
| 1723 | # unsigned char *ivp,const int enc); | ||
| 1724 | &function_begin("${PREFIX}_cbc_encrypt"); | ||
| 1725 | &mov ($inp,&wparam(0)); | ||
| 1726 | &mov ($rounds_,"esp"); | ||
| 1727 | &mov ($out,&wparam(1)); | ||
| 1728 | &sub ($rounds_,24); | ||
| 1729 | &mov ($len,&wparam(2)); | ||
| 1730 | &and ($rounds_,-16); | ||
| 1731 | &mov ($key,&wparam(3)); | ||
| 1732 | &mov ($key_,&wparam(4)); | ||
| 1733 | &test ($len,$len); | ||
| 1734 | &jz (&label("cbc_abort")); | ||
| 1735 | |||
| 1736 | &cmp (&wparam(5),0); | ||
| 1737 | &xchg ($rounds_,"esp"); # alloca | ||
| 1738 | &movups ($ivec,&QWP(0,$key_)); # load IV | ||
| 1739 | &mov ($rounds,&DWP(240,$key)); | ||
| 1740 | &mov ($key_,$key); # backup $key | ||
| 1741 | &mov (&DWP(16,"esp"),$rounds_); # save original %esp | ||
| 1742 | &mov ($rounds_,$rounds); # backup $rounds | ||
| 1743 | &je (&label("cbc_decrypt")); | ||
| 1744 | |||
| 1745 | &movaps ($inout0,$ivec); | ||
| 1746 | &cmp ($len,16); | ||
| 1747 | &jb (&label("cbc_enc_tail")); | ||
| 1748 | &sub ($len,16); | ||
| 1749 | &jmp (&label("cbc_enc_loop")); | ||
| 1750 | |||
| 1751 | &set_label("cbc_enc_loop",16); | ||
| 1752 | &movups ($ivec,&QWP(0,$inp)); # input actually | ||
| 1753 | &lea ($inp,&DWP(16,$inp)); | ||
| 1754 | if ($inline) | ||
| 1755 | { &aesni_inline_generate1("enc",$inout0,$ivec); } | ||
| 1756 | else | ||
| 1757 | { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } | ||
| 1758 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1759 | &mov ($key,$key_); # restore $key | ||
| 1760 | &movups (&QWP(0,$out),$inout0); # store output | ||
| 1761 | &lea ($out,&DWP(16,$out)); | ||
| 1762 | &sub ($len,16); | ||
| 1763 | &jnc (&label("cbc_enc_loop")); | ||
| 1764 | &add ($len,16); | ||
| 1765 | &jnz (&label("cbc_enc_tail")); | ||
| 1766 | &movaps ($ivec,$inout0); | ||
| 1767 | &jmp (&label("cbc_ret")); | ||
| 1768 | |||
| 1769 | &set_label("cbc_enc_tail"); | ||
| 1770 | &mov ("ecx",$len); # zaps $rounds | ||
| 1771 | &data_word(0xA4F3F689); # rep movsb | ||
| 1772 | &mov ("ecx",16); # zero tail | ||
| 1773 | &sub ("ecx",$len); | ||
| 1774 | &xor ("eax","eax"); # zaps $len | ||
| 1775 | &data_word(0xAAF3F689); # rep stosb | ||
| 1776 | &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block | ||
| 1777 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1778 | &mov ($inp,$out); # $inp and $out are the same | ||
| 1779 | &mov ($key,$key_); # restore $key | ||
| 1780 | &jmp (&label("cbc_enc_loop")); | ||
| 1781 | ###################################################################### | ||
| 1782 | &set_label("cbc_decrypt",16); | ||
| 1783 | &cmp ($len,0x50); | ||
| 1784 | &jbe (&label("cbc_dec_tail")); | ||
| 1785 | &movaps (&QWP(0,"esp"),$ivec); # save IV | ||
| 1786 | &sub ($len,0x50); | ||
| 1787 | &jmp (&label("cbc_dec_loop6_enter")); | ||
| 1788 | |||
| 1789 | &set_label("cbc_dec_loop6",16); | ||
| 1790 | &movaps (&QWP(0,"esp"),$rndkey0); # save IV | ||
| 1791 | &movups (&QWP(0,$out),$inout5); | ||
| 1792 | &lea ($out,&DWP(0x10,$out)); | ||
| 1793 | &set_label("cbc_dec_loop6_enter"); | ||
| 1794 | &movdqu ($inout0,&QWP(0,$inp)); | ||
| 1795 | &movdqu ($inout1,&QWP(0x10,$inp)); | ||
| 1796 | &movdqu ($inout2,&QWP(0x20,$inp)); | ||
| 1797 | &movdqu ($inout3,&QWP(0x30,$inp)); | ||
| 1798 | &movdqu ($inout4,&QWP(0x40,$inp)); | ||
| 1799 | &movdqu ($inout5,&QWP(0x50,$inp)); | ||
| 1800 | |||
| 1801 | &call ("_aesni_decrypt6"); | ||
| 1802 | |||
| 1803 | &movups ($rndkey1,&QWP(0,$inp)); | ||
| 1804 | &movups ($rndkey0,&QWP(0x10,$inp)); | ||
| 1805 | &xorps ($inout0,&QWP(0,"esp")); # ^=IV | ||
| 1806 | &xorps ($inout1,$rndkey1); | ||
| 1807 | &movups ($rndkey1,&QWP(0x20,$inp)); | ||
| 1808 | &xorps ($inout2,$rndkey0); | ||
| 1809 | &movups ($rndkey0,&QWP(0x30,$inp)); | ||
| 1810 | &xorps ($inout3,$rndkey1); | ||
| 1811 | &movups ($rndkey1,&QWP(0x40,$inp)); | ||
| 1812 | &xorps ($inout4,$rndkey0); | ||
| 1813 | &movups ($rndkey0,&QWP(0x50,$inp)); # IV | ||
| 1814 | &xorps ($inout5,$rndkey1); | ||
| 1815 | &movups (&QWP(0,$out),$inout0); | ||
| 1816 | &movups (&QWP(0x10,$out),$inout1); | ||
| 1817 | &lea ($inp,&DWP(0x60,$inp)); | ||
| 1818 | &movups (&QWP(0x20,$out),$inout2); | ||
| 1819 | &mov ($rounds,$rounds_) # restore $rounds | ||
| 1820 | &movups (&QWP(0x30,$out),$inout3); | ||
| 1821 | &mov ($key,$key_); # restore $key | ||
| 1822 | &movups (&QWP(0x40,$out),$inout4); | ||
| 1823 | &lea ($out,&DWP(0x50,$out)); | ||
| 1824 | &sub ($len,0x60); | ||
| 1825 | &ja (&label("cbc_dec_loop6")); | ||
| 1826 | |||
| 1827 | &movaps ($inout0,$inout5); | ||
| 1828 | &movaps ($ivec,$rndkey0); | ||
| 1829 | &add ($len,0x50); | ||
| 1830 | &jle (&label("cbc_dec_tail_collected")); | ||
| 1831 | &movups (&QWP(0,$out),$inout0); | ||
| 1832 | &lea ($out,&DWP(0x10,$out)); | ||
| 1833 | &set_label("cbc_dec_tail"); | ||
| 1834 | &movups ($inout0,&QWP(0,$inp)); | ||
| 1835 | &movaps ($in0,$inout0); | ||
| 1836 | &cmp ($len,0x10); | ||
| 1837 | &jbe (&label("cbc_dec_one")); | ||
| 1838 | |||
| 1839 | &movups ($inout1,&QWP(0x10,$inp)); | ||
| 1840 | &movaps ($in1,$inout1); | ||
| 1841 | &cmp ($len,0x20); | ||
| 1842 | &jbe (&label("cbc_dec_two")); | ||
| 1843 | |||
| 1844 | &movups ($inout2,&QWP(0x20,$inp)); | ||
| 1845 | &cmp ($len,0x30); | ||
| 1846 | &jbe (&label("cbc_dec_three")); | ||
| 1847 | |||
| 1848 | &movups ($inout3,&QWP(0x30,$inp)); | ||
| 1849 | &cmp ($len,0x40); | ||
| 1850 | &jbe (&label("cbc_dec_four")); | ||
| 1851 | |||
| 1852 | &movups ($inout4,&QWP(0x40,$inp)); | ||
| 1853 | &movaps (&QWP(0,"esp"),$ivec); # save IV | ||
| 1854 | &movups ($inout0,&QWP(0,$inp)); | ||
| 1855 | &xorps ($inout5,$inout5); | ||
| 1856 | &call ("_aesni_decrypt6"); | ||
| 1857 | &movups ($rndkey1,&QWP(0,$inp)); | ||
| 1858 | &movups ($rndkey0,&QWP(0x10,$inp)); | ||
| 1859 | &xorps ($inout0,&QWP(0,"esp")); # ^= IV | ||
| 1860 | &xorps ($inout1,$rndkey1); | ||
| 1861 | &movups ($rndkey1,&QWP(0x20,$inp)); | ||
| 1862 | &xorps ($inout2,$rndkey0); | ||
| 1863 | &movups ($rndkey0,&QWP(0x30,$inp)); | ||
| 1864 | &xorps ($inout3,$rndkey1); | ||
| 1865 | &movups ($ivec,&QWP(0x40,$inp)); # IV | ||
| 1866 | &xorps ($inout4,$rndkey0); | ||
| 1867 | &movups (&QWP(0,$out),$inout0); | ||
| 1868 | &movups (&QWP(0x10,$out),$inout1); | ||
| 1869 | &movups (&QWP(0x20,$out),$inout2); | ||
| 1870 | &movups (&QWP(0x30,$out),$inout3); | ||
| 1871 | &lea ($out,&DWP(0x40,$out)); | ||
| 1872 | &movaps ($inout0,$inout4); | ||
| 1873 | &sub ($len,0x50); | ||
| 1874 | &jmp (&label("cbc_dec_tail_collected")); | ||
| 1875 | |||
| 1876 | &set_label("cbc_dec_one",16); | ||
| 1877 | if ($inline) | ||
| 1878 | { &aesni_inline_generate1("dec"); } | ||
| 1879 | else | ||
| 1880 | { &call ("_aesni_decrypt1"); } | ||
| 1881 | &xorps ($inout0,$ivec); | ||
| 1882 | &movaps ($ivec,$in0); | ||
| 1883 | &sub ($len,0x10); | ||
| 1884 | &jmp (&label("cbc_dec_tail_collected")); | ||
| 1885 | |||
| 1886 | &set_label("cbc_dec_two",16); | ||
| 1887 | &xorps ($inout2,$inout2); | ||
| 1888 | &call ("_aesni_decrypt3"); | ||
| 1889 | &xorps ($inout0,$ivec); | ||
| 1890 | &xorps ($inout1,$in0); | ||
| 1891 | &movups (&QWP(0,$out),$inout0); | ||
| 1892 | &movaps ($inout0,$inout1); | ||
| 1893 | &lea ($out,&DWP(0x10,$out)); | ||
| 1894 | &movaps ($ivec,$in1); | ||
| 1895 | &sub ($len,0x20); | ||
| 1896 | &jmp (&label("cbc_dec_tail_collected")); | ||
| 1897 | |||
| 1898 | &set_label("cbc_dec_three",16); | ||
| 1899 | &call ("_aesni_decrypt3"); | ||
| 1900 | &xorps ($inout0,$ivec); | ||
| 1901 | &xorps ($inout1,$in0); | ||
| 1902 | &xorps ($inout2,$in1); | ||
| 1903 | &movups (&QWP(0,$out),$inout0); | ||
| 1904 | &movaps ($inout0,$inout2); | ||
| 1905 | &movups (&QWP(0x10,$out),$inout1); | ||
| 1906 | &lea ($out,&DWP(0x20,$out)); | ||
| 1907 | &movups ($ivec,&QWP(0x20,$inp)); | ||
| 1908 | &sub ($len,0x30); | ||
| 1909 | &jmp (&label("cbc_dec_tail_collected")); | ||
| 1910 | |||
| 1911 | &set_label("cbc_dec_four",16); | ||
| 1912 | &call ("_aesni_decrypt4"); | ||
| 1913 | &movups ($rndkey1,&QWP(0x10,$inp)); | ||
| 1914 | &movups ($rndkey0,&QWP(0x20,$inp)); | ||
| 1915 | &xorps ($inout0,$ivec); | ||
| 1916 | &movups ($ivec,&QWP(0x30,$inp)); | ||
| 1917 | &xorps ($inout1,$in0); | ||
| 1918 | &movups (&QWP(0,$out),$inout0); | ||
| 1919 | &xorps ($inout2,$rndkey1); | ||
| 1920 | &movups (&QWP(0x10,$out),$inout1); | ||
| 1921 | &xorps ($inout3,$rndkey0); | ||
| 1922 | &movups (&QWP(0x20,$out),$inout2); | ||
| 1923 | &lea ($out,&DWP(0x30,$out)); | ||
| 1924 | &movaps ($inout0,$inout3); | ||
| 1925 | &sub ($len,0x40); | ||
| 1926 | |||
| 1927 | &set_label("cbc_dec_tail_collected"); | ||
| 1928 | &and ($len,15); | ||
| 1929 | &jnz (&label("cbc_dec_tail_partial")); | ||
| 1930 | &movups (&QWP(0,$out),$inout0); | ||
| 1931 | &jmp (&label("cbc_ret")); | ||
| 1932 | |||
| 1933 | &set_label("cbc_dec_tail_partial",16); | ||
| 1934 | &movaps (&QWP(0,"esp"),$inout0); | ||
| 1935 | &mov ("ecx",16); | ||
| 1936 | &mov ($inp,"esp"); | ||
| 1937 | &sub ("ecx",$len); | ||
| 1938 | &data_word(0xA4F3F689); # rep movsb | ||
| 1939 | |||
| 1940 | &set_label("cbc_ret"); | ||
| 1941 | &mov ("esp",&DWP(16,"esp")); # pull original %esp | ||
| 1942 | &mov ($key_,&wparam(4)); | ||
| 1943 | &movups (&QWP(0,$key_),$ivec); # output IV | ||
| 1944 | &set_label("cbc_abort"); | ||
| 1945 | &function_end("${PREFIX}_cbc_encrypt"); | ||
| 1946 | |||
| 1947 | ###################################################################### | ||
| 1948 | # Mechanical port from aesni-x86_64.pl. | ||
| 1949 | # | ||
| 1950 | # _aesni_set_encrypt_key is private interface, | ||
| 1951 | # input: | ||
| 1952 | # "eax" const unsigned char *userKey | ||
| 1953 | # $rounds int bits | ||
| 1954 | # $key AES_KEY *key | ||
| 1955 | # output: | ||
| 1956 | # "eax" return code | ||
| 1957 | # $round rounds | ||
| 1958 | |||
| 1959 | &function_begin_B("_aesni_set_encrypt_key"); | ||
| 1960 | &test ("eax","eax"); | ||
| 1961 | &jz (&label("bad_pointer")); | ||
| 1962 | &test ($key,$key); | ||
| 1963 | &jz (&label("bad_pointer")); | ||
| 1964 | |||
| 1965 | &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey | ||
| 1966 | &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 | ||
| 1967 | &lea ($key,&DWP(16,$key)); | ||
| 1968 | &cmp ($rounds,256); | ||
| 1969 | &je (&label("14rounds")); | ||
| 1970 | &cmp ($rounds,192); | ||
| 1971 | &je (&label("12rounds")); | ||
| 1972 | &cmp ($rounds,128); | ||
| 1973 | &jne (&label("bad_keybits")); | ||
| 1974 | |||
| 1975 | &set_label("10rounds",16); | ||
| 1976 | &mov ($rounds,9); | ||
| 1977 | &$movekey (&QWP(-16,$key),"xmm0"); # round 0 | ||
| 1978 | &aeskeygenassist("xmm1","xmm0",0x01); # round 1 | ||
| 1979 | &call (&label("key_128_cold")); | ||
| 1980 | &aeskeygenassist("xmm1","xmm0",0x2); # round 2 | ||
| 1981 | &call (&label("key_128")); | ||
| 1982 | &aeskeygenassist("xmm1","xmm0",0x04); # round 3 | ||
| 1983 | &call (&label("key_128")); | ||
| 1984 | &aeskeygenassist("xmm1","xmm0",0x08); # round 4 | ||
| 1985 | &call (&label("key_128")); | ||
| 1986 | &aeskeygenassist("xmm1","xmm0",0x10); # round 5 | ||
| 1987 | &call (&label("key_128")); | ||
| 1988 | &aeskeygenassist("xmm1","xmm0",0x20); # round 6 | ||
| 1989 | &call (&label("key_128")); | ||
| 1990 | &aeskeygenassist("xmm1","xmm0",0x40); # round 7 | ||
| 1991 | &call (&label("key_128")); | ||
| 1992 | &aeskeygenassist("xmm1","xmm0",0x80); # round 8 | ||
| 1993 | &call (&label("key_128")); | ||
| 1994 | &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 | ||
| 1995 | &call (&label("key_128")); | ||
| 1996 | &aeskeygenassist("xmm1","xmm0",0x36); # round 10 | ||
| 1997 | &call (&label("key_128")); | ||
| 1998 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 1999 | &mov (&DWP(80,$key),$rounds); | ||
| 2000 | &xor ("eax","eax"); | ||
| 2001 | &ret(); | ||
| 2002 | |||
| 2003 | &set_label("key_128",16); | ||
| 2004 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 2005 | &lea ($key,&DWP(16,$key)); | ||
| 2006 | &set_label("key_128_cold"); | ||
| 2007 | &shufps ("xmm4","xmm0",0b00010000); | ||
| 2008 | &xorps ("xmm0","xmm4"); | ||
| 2009 | &shufps ("xmm4","xmm0",0b10001100); | ||
| 2010 | &xorps ("xmm0","xmm4"); | ||
| 2011 | &shufps ("xmm1","xmm1",0b11111111); # critical path | ||
| 2012 | &xorps ("xmm0","xmm1"); | ||
| 2013 | &ret(); | ||
| 2014 | |||
| 2015 | &set_label("12rounds",16); | ||
| 2016 | &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey | ||
| 2017 | &mov ($rounds,11); | ||
| 2018 | &$movekey (&QWP(-16,$key),"xmm0") # round 0 | ||
| 2019 | &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 | ||
| 2020 | &call (&label("key_192a_cold")); | ||
| 2021 | &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 | ||
| 2022 | &call (&label("key_192b")); | ||
| 2023 | &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 | ||
| 2024 | &call (&label("key_192a")); | ||
| 2025 | &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 | ||
| 2026 | &call (&label("key_192b")); | ||
| 2027 | &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 | ||
| 2028 | &call (&label("key_192a")); | ||
| 2029 | &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 | ||
| 2030 | &call (&label("key_192b")); | ||
| 2031 | &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 | ||
| 2032 | &call (&label("key_192a")); | ||
| 2033 | &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 | ||
| 2034 | &call (&label("key_192b")); | ||
| 2035 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 2036 | &mov (&DWP(48,$key),$rounds); | ||
| 2037 | &xor ("eax","eax"); | ||
| 2038 | &ret(); | ||
| 2039 | |||
| 2040 | &set_label("key_192a",16); | ||
| 2041 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 2042 | &lea ($key,&DWP(16,$key)); | ||
| 2043 | &set_label("key_192a_cold",16); | ||
| 2044 | &movaps ("xmm5","xmm2"); | ||
| 2045 | &set_label("key_192b_warm"); | ||
| 2046 | &shufps ("xmm4","xmm0",0b00010000); | ||
| 2047 | &movdqa ("xmm3","xmm2"); | ||
| 2048 | &xorps ("xmm0","xmm4"); | ||
| 2049 | &shufps ("xmm4","xmm0",0b10001100); | ||
| 2050 | &pslldq ("xmm3",4); | ||
| 2051 | &xorps ("xmm0","xmm4"); | ||
| 2052 | &pshufd ("xmm1","xmm1",0b01010101); # critical path | ||
| 2053 | &pxor ("xmm2","xmm3"); | ||
| 2054 | &pxor ("xmm0","xmm1"); | ||
| 2055 | &pshufd ("xmm3","xmm0",0b11111111); | ||
| 2056 | &pxor ("xmm2","xmm3"); | ||
| 2057 | &ret(); | ||
| 2058 | |||
| 2059 | &set_label("key_192b",16); | ||
| 2060 | &movaps ("xmm3","xmm0"); | ||
| 2061 | &shufps ("xmm5","xmm0",0b01000100); | ||
| 2062 | &$movekey (&QWP(0,$key),"xmm5"); | ||
| 2063 | &shufps ("xmm3","xmm2",0b01001110); | ||
| 2064 | &$movekey (&QWP(16,$key),"xmm3"); | ||
| 2065 | &lea ($key,&DWP(32,$key)); | ||
| 2066 | &jmp (&label("key_192b_warm")); | ||
| 2067 | |||
| 2068 | &set_label("14rounds",16); | ||
| 2069 | &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey | ||
| 2070 | &mov ($rounds,13); | ||
| 2071 | &lea ($key,&DWP(16,$key)); | ||
| 2072 | &$movekey (&QWP(-32,$key),"xmm0"); # round 0 | ||
| 2073 | &$movekey (&QWP(-16,$key),"xmm2"); # round 1 | ||
| 2074 | &aeskeygenassist("xmm1","xmm2",0x01); # round 2 | ||
| 2075 | &call (&label("key_256a_cold")); | ||
| 2076 | &aeskeygenassist("xmm1","xmm0",0x01); # round 3 | ||
| 2077 | &call (&label("key_256b")); | ||
| 2078 | &aeskeygenassist("xmm1","xmm2",0x02); # round 4 | ||
| 2079 | &call (&label("key_256a")); | ||
| 2080 | &aeskeygenassist("xmm1","xmm0",0x02); # round 5 | ||
| 2081 | &call (&label("key_256b")); | ||
| 2082 | &aeskeygenassist("xmm1","xmm2",0x04); # round 6 | ||
| 2083 | &call (&label("key_256a")); | ||
| 2084 | &aeskeygenassist("xmm1","xmm0",0x04); # round 7 | ||
| 2085 | &call (&label("key_256b")); | ||
| 2086 | &aeskeygenassist("xmm1","xmm2",0x08); # round 8 | ||
| 2087 | &call (&label("key_256a")); | ||
| 2088 | &aeskeygenassist("xmm1","xmm0",0x08); # round 9 | ||
| 2089 | &call (&label("key_256b")); | ||
| 2090 | &aeskeygenassist("xmm1","xmm2",0x10); # round 10 | ||
| 2091 | &call (&label("key_256a")); | ||
| 2092 | &aeskeygenassist("xmm1","xmm0",0x10); # round 11 | ||
| 2093 | &call (&label("key_256b")); | ||
| 2094 | &aeskeygenassist("xmm1","xmm2",0x20); # round 12 | ||
| 2095 | &call (&label("key_256a")); | ||
| 2096 | &aeskeygenassist("xmm1","xmm0",0x20); # round 13 | ||
| 2097 | &call (&label("key_256b")); | ||
| 2098 | &aeskeygenassist("xmm1","xmm2",0x40); # round 14 | ||
| 2099 | &call (&label("key_256a")); | ||
| 2100 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 2101 | &mov (&DWP(16,$key),$rounds); | ||
| 2102 | &xor ("eax","eax"); | ||
| 2103 | &ret(); | ||
| 2104 | |||
| 2105 | &set_label("key_256a",16); | ||
| 2106 | &$movekey (&QWP(0,$key),"xmm2"); | ||
| 2107 | &lea ($key,&DWP(16,$key)); | ||
| 2108 | &set_label("key_256a_cold"); | ||
| 2109 | &shufps ("xmm4","xmm0",0b00010000); | ||
| 2110 | &xorps ("xmm0","xmm4"); | ||
| 2111 | &shufps ("xmm4","xmm0",0b10001100); | ||
| 2112 | &xorps ("xmm0","xmm4"); | ||
| 2113 | &shufps ("xmm1","xmm1",0b11111111); # critical path | ||
| 2114 | &xorps ("xmm0","xmm1"); | ||
| 2115 | &ret(); | ||
| 2116 | |||
| 2117 | &set_label("key_256b",16); | ||
| 2118 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 2119 | &lea ($key,&DWP(16,$key)); | ||
| 2120 | |||
| 2121 | &shufps ("xmm4","xmm2",0b00010000); | ||
| 2122 | &xorps ("xmm2","xmm4"); | ||
| 2123 | &shufps ("xmm4","xmm2",0b10001100); | ||
| 2124 | &xorps ("xmm2","xmm4"); | ||
| 2125 | &shufps ("xmm1","xmm1",0b10101010); # critical path | ||
| 2126 | &xorps ("xmm2","xmm1"); | ||
| 2127 | &ret(); | ||
| 2128 | |||
| 2129 | &set_label("bad_pointer",4); | ||
| 2130 | &mov ("eax",-1); | ||
| 2131 | &ret (); | ||
| 2132 | &set_label("bad_keybits",4); | ||
| 2133 | &mov ("eax",-2); | ||
| 2134 | &ret (); | ||
| 2135 | &function_end_B("_aesni_set_encrypt_key"); | ||
| 2136 | |||
| 2137 | # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, | ||
| 2138 | # AES_KEY *key) | ||
| 2139 | &function_begin_B("${PREFIX}_set_encrypt_key"); | ||
| 2140 | &mov ("eax",&wparam(0)); | ||
| 2141 | &mov ($rounds,&wparam(1)); | ||
| 2142 | &mov ($key,&wparam(2)); | ||
| 2143 | &call ("_aesni_set_encrypt_key"); | ||
| 2144 | &ret (); | ||
| 2145 | &function_end_B("${PREFIX}_set_encrypt_key"); | ||
| 2146 | |||
| 2147 | # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, | ||
| 2148 | # AES_KEY *key) | ||
| 2149 | &function_begin_B("${PREFIX}_set_decrypt_key"); | ||
| 2150 | &mov ("eax",&wparam(0)); | ||
| 2151 | &mov ($rounds,&wparam(1)); | ||
| 2152 | &mov ($key,&wparam(2)); | ||
| 2153 | &call ("_aesni_set_encrypt_key"); | ||
| 2154 | &mov ($key,&wparam(2)); | ||
| 2155 | &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key | ||
| 2156 | &test ("eax","eax"); | ||
| 2157 | &jnz (&label("dec_key_ret")); | ||
| 2158 | &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule | ||
| 2159 | |||
| 2160 | &$movekey ("xmm0",&QWP(0,$key)); # just swap | ||
| 2161 | &$movekey ("xmm1",&QWP(0,"eax")); | ||
| 2162 | &$movekey (&QWP(0,"eax"),"xmm0"); | ||
| 2163 | &$movekey (&QWP(0,$key),"xmm1"); | ||
| 2164 | &lea ($key,&DWP(16,$key)); | ||
| 2165 | &lea ("eax",&DWP(-16,"eax")); | ||
| 2166 | |||
| 2167 | &set_label("dec_key_inverse"); | ||
| 2168 | &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse | ||
| 2169 | &$movekey ("xmm1",&QWP(0,"eax")); | ||
| 2170 | &aesimc ("xmm0","xmm0"); | ||
| 2171 | &aesimc ("xmm1","xmm1"); | ||
| 2172 | &lea ($key,&DWP(16,$key)); | ||
| 2173 | &lea ("eax",&DWP(-16,"eax")); | ||
| 2174 | &$movekey (&QWP(16,"eax"),"xmm0"); | ||
| 2175 | &$movekey (&QWP(-16,$key),"xmm1"); | ||
| 2176 | &cmp ("eax",$key); | ||
| 2177 | &ja (&label("dec_key_inverse")); | ||
| 2178 | |||
| 2179 | &$movekey ("xmm0",&QWP(0,$key)); # inverse middle | ||
| 2180 | &aesimc ("xmm0","xmm0"); | ||
| 2181 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 2182 | |||
| 2183 | &xor ("eax","eax"); # return success | ||
| 2184 | &set_label("dec_key_ret"); | ||
| 2185 | &ret (); | ||
| 2186 | &function_end_B("${PREFIX}_set_decrypt_key"); | ||
| 2187 | &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 2188 | |||
| 2189 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl new file mode 100644 index 0000000000..c9c6312fa7 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl | |||
| @@ -0,0 +1,3044 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | ################################################################### | ||
| 4 | ### AES-128 [originally in CTR mode] ### | ||
| 5 | ### bitsliced implementation for Intel Core 2 processors ### | ||
| 6 | ### requires support of SSE extensions up to SSSE3 ### | ||
| 7 | ### Author: Emilia Käsper and Peter Schwabe ### | ||
| 8 | ### Date: 2009-03-19 ### | ||
| 9 | ### Public domain ### | ||
| 10 | ### ### | ||
| 11 | ### See http://homes.esat.kuleuven.be/~ekasper/#software for ### | ||
| 12 | ### further information. ### | ||
| 13 | ################################################################### | ||
| 14 | # | ||
| 15 | # September 2011. | ||
| 16 | # | ||
| 17 | # Started as transliteration to "perlasm" the original code has | ||
| 18 | # undergone following changes: | ||
| 19 | # | ||
| 20 | # - code was made position-independent; | ||
| 21 | # - rounds were folded into a loop resulting in >5x size reduction | ||
| 22 | # from 12.5KB to 2.2KB; | ||
| 23 | # - above was possibile thanks to mixcolumns() modification that | ||
| 24 | # allowed to feed its output back to aesenc[last], this was | ||
| 25 | # achieved at cost of two additional inter-registers moves; | ||
| 26 | # - some instruction reordering and interleaving; | ||
| 27 | # - this module doesn't implement key setup subroutine, instead it | ||
| 28 | # relies on conversion of "conventional" key schedule as returned | ||
| 29 | # by AES_set_encrypt_key (see discussion below); | ||
| 30 | # - first and last round keys are treated differently, which allowed | ||
| 31 | # to skip one shiftrows(), reduce bit-sliced key schedule and | ||
| 32 | # speed-up conversion by 22%; | ||
| 33 | # - support for 192- and 256-bit keys was added; | ||
| 34 | # | ||
| 35 | # Resulting performance in CPU cycles spent to encrypt one byte out | ||
| 36 | # of 4096-byte buffer with 128-bit key is: | ||
| 37 | # | ||
| 38 | # Emilia's this(*) difference | ||
| 39 | # | ||
| 40 | # Core 2 9.30 8.69 +7% | ||
| 41 | # Nehalem(**) 7.63 6.98 +9% | ||
| 42 | # Atom 17.1 17.4 -2%(***) | ||
| 43 | # | ||
| 44 | # (*) Comparison is not completely fair, because "this" is ECB, | ||
| 45 | # i.e. no extra processing such as counter values calculation | ||
| 46 | # and xor-ing input as in Emilia's CTR implementation is | ||
| 47 | # performed. However, the CTR calculations stand for not more | ||
| 48 | # than 1% of total time, so comparison is *rather* fair. | ||
| 49 | # | ||
| 50 | # (**) Results were collected on Westmere, which is considered to | ||
| 51 | # be equivalent to Nehalem for this code. | ||
| 52 | # | ||
| 53 | # (***) Slowdown on Atom is rather strange per se, because original | ||
| 54 | # implementation has a number of 9+-bytes instructions, which | ||
| 55 | # are bad for Atom front-end, and which I eliminated completely. | ||
| 56 | # In attempt to address deterioration sbox() was tested in FP | ||
| 57 | # SIMD "domain" (movaps instead of movdqa, xorps instead of | ||
| 58 | # pxor, etc.). While it resulted in nominal 4% improvement on | ||
| 59 | # Atom, it hurted Westmere by more than 2x factor. | ||
| 60 | # | ||
| 61 | # As for key schedule conversion subroutine. Interface to OpenSSL | ||
| 62 | # relies on per-invocation on-the-fly conversion. This naturally | ||
| 63 | # has impact on performance, especially for short inputs. Conversion | ||
| 64 | # time in CPU cycles and its ratio to CPU cycles spent in 8x block | ||
| 65 | # function is: | ||
| 66 | # | ||
| 67 | # conversion conversion/8x block | ||
| 68 | # Core 2 240 0.22 | ||
| 69 | # Nehalem 180 0.20 | ||
| 70 | # Atom 430 0.19 | ||
| 71 | # | ||
| 72 | # The ratio values mean that 128-byte blocks will be processed | ||
| 73 | # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, | ||
| 74 | # etc. Then keep in mind that input sizes not divisible by 128 are | ||
| 75 | # *effectively* slower, especially shortest ones, e.g. consecutive | ||
| 76 | # 144-byte blocks are processed 44% slower than one would expect, | ||
| 77 | # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" | ||
| 78 | # it's still faster than ["hyper-threading-safe" code path in] | ||
| 79 | # aes-x86_64.pl on all lengths above 64 bytes... | ||
| 80 | # | ||
| 81 | # October 2011. | ||
| 82 | # | ||
| 83 | # Add decryption procedure. Performance in CPU cycles spent to decrypt | ||
| 84 | # one byte out of 4096-byte buffer with 128-bit key is: | ||
| 85 | # | ||
| 86 | # Core 2 11.0 | ||
| 87 | # Nehalem 9.16 | ||
| 88 | # Atom 20.9 | ||
| 89 | # | ||
| 90 | # November 2011. | ||
| 91 | # | ||
| 92 | # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is | ||
| 93 | # suboptimal, but XTS is meant to be used with larger blocks... | ||
| 94 | # | ||
| 95 | # <appro@openssl.org> | ||
| 96 | |||
| 97 | $flavour = shift; | ||
| 98 | $output = shift; | ||
| 99 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 100 | |||
| 101 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 102 | |||
| 103 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 104 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 105 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 106 | die "can't locate x86_64-xlate.pl"; | ||
| 107 | |||
| 108 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 109 | |||
| 110 | my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); | ||
| 111 | my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) | ||
| 112 | my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... | ||
| 113 | |||
| 114 | { | ||
| 115 | my ($key,$rounds,$const)=("%rax","%r10d","%r11"); | ||
| 116 | |||
| 117 | sub Sbox { | ||
| 118 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
| 119 | # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb | ||
| 120 | my @b=@_[0..7]; | ||
| 121 | my @t=@_[8..11]; | ||
| 122 | my @s=@_[12..15]; | ||
| 123 | &InBasisChange (@b); | ||
| 124 | &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); | ||
| 125 | &OutBasisChange (@b[7,1,4,2,6,5,0,3]); | ||
| 126 | } | ||
| 127 | |||
| 128 | sub InBasisChange { | ||
| 129 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
| 130 | # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb | ||
| 131 | my @b=@_[0..7]; | ||
| 132 | $code.=<<___; | ||
| 133 | pxor @b[6], @b[5] | ||
| 134 | pxor @b[1], @b[2] | ||
| 135 | pxor @b[0], @b[3] | ||
| 136 | pxor @b[2], @b[6] | ||
| 137 | pxor @b[0], @b[5] | ||
| 138 | |||
| 139 | pxor @b[3], @b[6] | ||
| 140 | pxor @b[7], @b[3] | ||
| 141 | pxor @b[5], @b[7] | ||
| 142 | pxor @b[4], @b[3] | ||
| 143 | pxor @b[5], @b[4] | ||
| 144 | pxor @b[1], @b[3] | ||
| 145 | |||
| 146 | pxor @b[7], @b[2] | ||
| 147 | pxor @b[5], @b[1] | ||
| 148 | ___ | ||
| 149 | } | ||
| 150 | |||
| 151 | sub OutBasisChange { | ||
| 152 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
| 153 | # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb | ||
| 154 | my @b=@_[0..7]; | ||
| 155 | $code.=<<___; | ||
| 156 | pxor @b[6], @b[0] | ||
| 157 | pxor @b[4], @b[1] | ||
| 158 | pxor @b[0], @b[2] | ||
| 159 | pxor @b[6], @b[4] | ||
| 160 | pxor @b[1], @b[6] | ||
| 161 | |||
| 162 | pxor @b[5], @b[1] | ||
| 163 | pxor @b[3], @b[5] | ||
| 164 | pxor @b[7], @b[3] | ||
| 165 | pxor @b[5], @b[7] | ||
| 166 | pxor @b[5], @b[2] | ||
| 167 | |||
| 168 | pxor @b[7], @b[4] | ||
| 169 | ___ | ||
| 170 | } | ||
| 171 | |||
| 172 | sub InvSbox { | ||
| 173 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
| 174 | # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb | ||
| 175 | my @b=@_[0..7]; | ||
| 176 | my @t=@_[8..11]; | ||
| 177 | my @s=@_[12..15]; | ||
| 178 | &InvInBasisChange (@b); | ||
| 179 | &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); | ||
| 180 | &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); | ||
| 181 | } | ||
| 182 | |||
| 183 | sub InvInBasisChange { # OutBasisChange in reverse | ||
| 184 | my @b=@_[5,1,2,6,3,7,0,4]; | ||
| 185 | $code.=<<___ | ||
| 186 | pxor @b[7], @b[4] | ||
| 187 | |||
| 188 | pxor @b[5], @b[7] | ||
| 189 | pxor @b[5], @b[2] | ||
| 190 | pxor @b[7], @b[3] | ||
| 191 | pxor @b[3], @b[5] | ||
| 192 | pxor @b[5], @b[1] | ||
| 193 | |||
| 194 | pxor @b[1], @b[6] | ||
| 195 | pxor @b[0], @b[2] | ||
| 196 | pxor @b[6], @b[4] | ||
| 197 | pxor @b[6], @b[0] | ||
| 198 | pxor @b[4], @b[1] | ||
| 199 | ___ | ||
| 200 | } | ||
| 201 | |||
| 202 | sub InvOutBasisChange { # InBasisChange in reverse | ||
| 203 | my @b=@_[2,5,7,3,6,1,0,4]; | ||
| 204 | $code.=<<___; | ||
| 205 | pxor @b[5], @b[1] | ||
| 206 | pxor @b[7], @b[2] | ||
| 207 | |||
| 208 | pxor @b[1], @b[3] | ||
| 209 | pxor @b[5], @b[4] | ||
| 210 | pxor @b[5], @b[7] | ||
| 211 | pxor @b[4], @b[3] | ||
| 212 | pxor @b[0], @b[5] | ||
| 213 | pxor @b[7], @b[3] | ||
| 214 | pxor @b[2], @b[6] | ||
| 215 | pxor @b[1], @b[2] | ||
| 216 | pxor @b[3], @b[6] | ||
| 217 | |||
| 218 | pxor @b[0], @b[3] | ||
| 219 | pxor @b[6], @b[5] | ||
| 220 | ___ | ||
| 221 | } | ||
| 222 | |||
| 223 | sub Mul_GF4 { | ||
| 224 | #;************************************************************* | ||
| 225 | #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * | ||
| 226 | #;************************************************************* | ||
| 227 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
| 228 | $code.=<<___; | ||
| 229 | movdqa $y0, $t0 | ||
| 230 | pxor $y1, $t0 | ||
| 231 | pand $x0, $t0 | ||
| 232 | pxor $x1, $x0 | ||
| 233 | pand $y0, $x1 | ||
| 234 | pand $y1, $x0 | ||
| 235 | pxor $x1, $x0 | ||
| 236 | pxor $t0, $x1 | ||
| 237 | ___ | ||
| 238 | } | ||
| 239 | |||
| 240 | sub Mul_GF4_N { # not used, see next subroutine | ||
| 241 | # multiply and scale by N | ||
| 242 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
| 243 | $code.=<<___; | ||
| 244 | movdqa $y0, $t0 | ||
| 245 | pxor $y1, $t0 | ||
| 246 | pand $x0, $t0 | ||
| 247 | pxor $x1, $x0 | ||
| 248 | pand $y0, $x1 | ||
| 249 | pand $y1, $x0 | ||
| 250 | pxor $x0, $x1 | ||
| 251 | pxor $t0, $x0 | ||
| 252 | ___ | ||
| 253 | } | ||
| 254 | |||
| 255 | sub Mul_GF4_N_GF4 { | ||
| 256 | # interleaved Mul_GF4_N and Mul_GF4 | ||
| 257 | my ($x0,$x1,$y0,$y1,$t0, | ||
| 258 | $x2,$x3,$y2,$y3,$t1)=@_; | ||
| 259 | $code.=<<___; | ||
| 260 | movdqa $y0, $t0 | ||
| 261 | movdqa $y2, $t1 | ||
| 262 | pxor $y1, $t0 | ||
| 263 | pxor $y3, $t1 | ||
| 264 | pand $x0, $t0 | ||
| 265 | pand $x2, $t1 | ||
| 266 | pxor $x1, $x0 | ||
| 267 | pxor $x3, $x2 | ||
| 268 | pand $y0, $x1 | ||
| 269 | pand $y2, $x3 | ||
| 270 | pand $y1, $x0 | ||
| 271 | pand $y3, $x2 | ||
| 272 | pxor $x0, $x1 | ||
| 273 | pxor $x3, $x2 | ||
| 274 | pxor $t0, $x0 | ||
| 275 | pxor $t1, $x3 | ||
| 276 | ___ | ||
| 277 | } | ||
| 278 | sub Mul_GF16_2 { | ||
| 279 | my @x=@_[0..7]; | ||
| 280 | my @y=@_[8..11]; | ||
| 281 | my @t=@_[12..15]; | ||
| 282 | $code.=<<___; | ||
| 283 | movdqa @x[0], @t[0] | ||
| 284 | movdqa @x[1], @t[1] | ||
| 285 | ___ | ||
| 286 | &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); | ||
| 287 | $code.=<<___; | ||
| 288 | pxor @x[2], @t[0] | ||
| 289 | pxor @x[3], @t[1] | ||
| 290 | pxor @y[2], @y[0] | ||
| 291 | pxor @y[3], @y[1] | ||
| 292 | ___ | ||
| 293 | Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
| 294 | @x[2], @x[3], @y[2], @y[3], @t[2]); | ||
| 295 | $code.=<<___; | ||
| 296 | pxor @t[0], @x[0] | ||
| 297 | pxor @t[0], @x[2] | ||
| 298 | pxor @t[1], @x[1] | ||
| 299 | pxor @t[1], @x[3] | ||
| 300 | |||
| 301 | movdqa @x[4], @t[0] | ||
| 302 | movdqa @x[5], @t[1] | ||
| 303 | pxor @x[6], @t[0] | ||
| 304 | pxor @x[7], @t[1] | ||
| 305 | ___ | ||
| 306 | &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
| 307 | @x[6], @x[7], @y[2], @y[3], @t[2]); | ||
| 308 | $code.=<<___; | ||
| 309 | pxor @y[2], @y[0] | ||
| 310 | pxor @y[3], @y[1] | ||
| 311 | ___ | ||
| 312 | &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); | ||
| 313 | $code.=<<___; | ||
| 314 | pxor @t[0], @x[4] | ||
| 315 | pxor @t[0], @x[6] | ||
| 316 | pxor @t[1], @x[5] | ||
| 317 | pxor @t[1], @x[7] | ||
| 318 | ___ | ||
| 319 | } | ||
| 320 | sub Inv_GF256 { | ||
| 321 | #;******************************************************************** | ||
| 322 | #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * | ||
| 323 | #;******************************************************************** | ||
| 324 | my @x=@_[0..7]; | ||
| 325 | my @t=@_[8..11]; | ||
| 326 | my @s=@_[12..15]; | ||
| 327 | # direct optimizations from hardware | ||
| 328 | $code.=<<___; | ||
| 329 | movdqa @x[4], @t[3] | ||
| 330 | movdqa @x[5], @t[2] | ||
| 331 | movdqa @x[1], @t[1] | ||
| 332 | movdqa @x[7], @s[1] | ||
| 333 | movdqa @x[0], @s[0] | ||
| 334 | |||
| 335 | pxor @x[6], @t[3] | ||
| 336 | pxor @x[7], @t[2] | ||
| 337 | pxor @x[3], @t[1] | ||
| 338 | movdqa @t[3], @s[2] | ||
| 339 | pxor @x[6], @s[1] | ||
| 340 | movdqa @t[2], @t[0] | ||
| 341 | pxor @x[2], @s[0] | ||
| 342 | movdqa @t[3], @s[3] | ||
| 343 | |||
| 344 | por @t[1], @t[2] | ||
| 345 | por @s[0], @t[3] | ||
| 346 | pxor @t[0], @s[3] | ||
| 347 | pand @s[0], @s[2] | ||
| 348 | pxor @t[1], @s[0] | ||
| 349 | pand @t[1], @t[0] | ||
| 350 | pand @s[0], @s[3] | ||
| 351 | movdqa @x[3], @s[0] | ||
| 352 | pxor @x[2], @s[0] | ||
| 353 | pand @s[0], @s[1] | ||
| 354 | pxor @s[1], @t[3] | ||
| 355 | pxor @s[1], @t[2] | ||
| 356 | movdqa @x[4], @s[1] | ||
| 357 | movdqa @x[1], @s[0] | ||
| 358 | pxor @x[5], @s[1] | ||
| 359 | pxor @x[0], @s[0] | ||
| 360 | movdqa @s[1], @t[1] | ||
| 361 | pand @s[0], @s[1] | ||
| 362 | por @s[0], @t[1] | ||
| 363 | pxor @s[1], @t[0] | ||
| 364 | pxor @s[3], @t[3] | ||
| 365 | pxor @s[2], @t[2] | ||
| 366 | pxor @s[3], @t[1] | ||
| 367 | movdqa @x[7], @s[0] | ||
| 368 | pxor @s[2], @t[0] | ||
| 369 | movdqa @x[6], @s[1] | ||
| 370 | pxor @s[2], @t[1] | ||
| 371 | movdqa @x[5], @s[2] | ||
| 372 | pand @x[3], @s[0] | ||
| 373 | movdqa @x[4], @s[3] | ||
| 374 | pand @x[2], @s[1] | ||
| 375 | pand @x[1], @s[2] | ||
| 376 | por @x[0], @s[3] | ||
| 377 | pxor @s[0], @t[3] | ||
| 378 | pxor @s[1], @t[2] | ||
| 379 | pxor @s[2], @t[1] | ||
| 380 | pxor @s[3], @t[0] | ||
| 381 | |||
| 382 | #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 | ||
| 383 | |||
| 384 | # new smaller inversion | ||
| 385 | |||
| 386 | movdqa @t[3], @s[0] | ||
| 387 | pand @t[1], @t[3] | ||
| 388 | pxor @t[2], @s[0] | ||
| 389 | |||
| 390 | movdqa @t[0], @s[2] | ||
| 391 | movdqa @s[0], @s[3] | ||
| 392 | pxor @t[3], @s[2] | ||
| 393 | pand @s[2], @s[3] | ||
| 394 | |||
| 395 | movdqa @t[1], @s[1] | ||
| 396 | pxor @t[2], @s[3] | ||
| 397 | pxor @t[0], @s[1] | ||
| 398 | |||
| 399 | pxor @t[2], @t[3] | ||
| 400 | |||
| 401 | pand @t[3], @s[1] | ||
| 402 | |||
| 403 | movdqa @s[2], @t[2] | ||
| 404 | pxor @t[0], @s[1] | ||
| 405 | |||
| 406 | pxor @s[1], @t[2] | ||
| 407 | pxor @s[1], @t[1] | ||
| 408 | |||
| 409 | pand @t[0], @t[2] | ||
| 410 | |||
| 411 | pxor @t[2], @s[2] | ||
| 412 | pxor @t[2], @t[1] | ||
| 413 | |||
| 414 | pand @s[3], @s[2] | ||
| 415 | |||
| 416 | pxor @s[0], @s[2] | ||
| 417 | ___ | ||
| 418 | # output in s3, s2, s1, t1 | ||
| 419 | |||
| 420 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 | ||
| 421 | |||
| 422 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 | ||
| 423 | &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); | ||
| 424 | |||
| 425 | ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb | ||
| 426 | } | ||
| 427 | |||
| 428 | # AES linear components | ||
| 429 | |||
| 430 | sub ShiftRows { | ||
| 431 | my @x=@_[0..7]; | ||
| 432 | my $mask=pop; | ||
| 433 | $code.=<<___; | ||
| 434 | pxor 0x00($key),@x[0] | ||
| 435 | pxor 0x10($key),@x[1] | ||
| 436 | pshufb $mask,@x[0] | ||
| 437 | pxor 0x20($key),@x[2] | ||
| 438 | pshufb $mask,@x[1] | ||
| 439 | pxor 0x30($key),@x[3] | ||
| 440 | pshufb $mask,@x[2] | ||
| 441 | pxor 0x40($key),@x[4] | ||
| 442 | pshufb $mask,@x[3] | ||
| 443 | pxor 0x50($key),@x[5] | ||
| 444 | pshufb $mask,@x[4] | ||
| 445 | pxor 0x60($key),@x[6] | ||
| 446 | pshufb $mask,@x[5] | ||
| 447 | pxor 0x70($key),@x[7] | ||
| 448 | pshufb $mask,@x[6] | ||
| 449 | lea 0x80($key),$key | ||
| 450 | pshufb $mask,@x[7] | ||
| 451 | ___ | ||
| 452 | } | ||
| 453 | |||
| 454 | sub MixColumns { | ||
| 455 | # modified to emit output in order suitable for feeding back to aesenc[last] | ||
| 456 | my @x=@_[0..7]; | ||
| 457 | my @t=@_[8..15]; | ||
| 458 | $code.=<<___; | ||
| 459 | pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 | ||
| 460 | pshufd \$0x93, @x[1], @t[1] | ||
| 461 | pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) | ||
| 462 | pshufd \$0x93, @x[2], @t[2] | ||
| 463 | pxor @t[1], @x[1] | ||
| 464 | pshufd \$0x93, @x[3], @t[3] | ||
| 465 | pxor @t[2], @x[2] | ||
| 466 | pshufd \$0x93, @x[4], @t[4] | ||
| 467 | pxor @t[3], @x[3] | ||
| 468 | pshufd \$0x93, @x[5], @t[5] | ||
| 469 | pxor @t[4], @x[4] | ||
| 470 | pshufd \$0x93, @x[6], @t[6] | ||
| 471 | pxor @t[5], @x[5] | ||
| 472 | pshufd \$0x93, @x[7], @t[7] | ||
| 473 | pxor @t[6], @x[6] | ||
| 474 | pxor @t[7], @x[7] | ||
| 475 | |||
| 476 | pxor @x[0], @t[1] | ||
| 477 | pxor @x[7], @t[0] | ||
| 478 | pxor @x[7], @t[1] | ||
| 479 | pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) | ||
| 480 | pxor @x[1], @t[2] | ||
| 481 | pshufd \$0x4E, @x[1], @x[1] | ||
| 482 | pxor @x[4], @t[5] | ||
| 483 | pxor @t[0], @x[0] | ||
| 484 | pxor @x[5], @t[6] | ||
| 485 | pxor @t[1], @x[1] | ||
| 486 | pxor @x[3], @t[4] | ||
| 487 | pshufd \$0x4E, @x[4], @t[0] | ||
| 488 | pxor @x[6], @t[7] | ||
| 489 | pshufd \$0x4E, @x[5], @t[1] | ||
| 490 | pxor @x[2], @t[3] | ||
| 491 | pshufd \$0x4E, @x[3], @x[4] | ||
| 492 | pxor @x[7], @t[3] | ||
| 493 | pshufd \$0x4E, @x[7], @x[5] | ||
| 494 | pxor @x[7], @t[4] | ||
| 495 | pshufd \$0x4E, @x[6], @x[3] | ||
| 496 | pxor @t[4], @t[0] | ||
| 497 | pshufd \$0x4E, @x[2], @x[6] | ||
| 498 | pxor @t[5], @t[1] | ||
| 499 | |||
| 500 | pxor @t[3], @x[4] | ||
| 501 | pxor @t[7], @x[5] | ||
| 502 | pxor @t[6], @x[3] | ||
| 503 | movdqa @t[0], @x[2] | ||
| 504 | pxor @t[2], @x[6] | ||
| 505 | movdqa @t[1], @x[7] | ||
| 506 | ___ | ||
| 507 | } | ||
| 508 | |||
| 509 | sub InvMixColumns { | ||
| 510 | my @x=@_[0..7]; | ||
| 511 | my @t=@_[8..15]; | ||
| 512 | |||
| 513 | $code.=<<___; | ||
| 514 | # multiplication by 0x0e | ||
| 515 | pshufd \$0x93, @x[7], @t[7] | ||
| 516 | movdqa @x[2], @t[2] | ||
| 517 | pxor @x[5], @x[7] # 7 5 | ||
| 518 | pxor @x[5], @x[2] # 2 5 | ||
| 519 | pshufd \$0x93, @x[0], @t[0] | ||
| 520 | movdqa @x[5], @t[5] | ||
| 521 | pxor @x[0], @x[5] # 5 0 [1] | ||
| 522 | pxor @x[1], @x[0] # 0 1 | ||
| 523 | pshufd \$0x93, @x[1], @t[1] | ||
| 524 | pxor @x[2], @x[1] # 1 25 | ||
| 525 | pxor @x[6], @x[0] # 01 6 [2] | ||
| 526 | pxor @x[3], @x[1] # 125 3 [4] | ||
| 527 | pshufd \$0x93, @x[3], @t[3] | ||
| 528 | pxor @x[0], @x[2] # 25 016 [3] | ||
| 529 | pxor @x[7], @x[3] # 3 75 | ||
| 530 | pxor @x[6], @x[7] # 75 6 [0] | ||
| 531 | pshufd \$0x93, @x[6], @t[6] | ||
| 532 | movdqa @x[4], @t[4] | ||
| 533 | pxor @x[4], @x[6] # 6 4 | ||
| 534 | pxor @x[3], @x[4] # 4 375 [6] | ||
| 535 | pxor @x[7], @x[3] # 375 756=36 | ||
| 536 | pxor @t[5], @x[6] # 64 5 [7] | ||
| 537 | pxor @t[2], @x[3] # 36 2 | ||
| 538 | pxor @t[4], @x[3] # 362 4 [5] | ||
| 539 | pshufd \$0x93, @t[5], @t[5] | ||
| 540 | ___ | ||
| 541 | my @y = @x[7,5,0,2,1,3,4,6]; | ||
| 542 | $code.=<<___; | ||
| 543 | # multiplication by 0x0b | ||
| 544 | pxor @y[0], @y[1] | ||
| 545 | pxor @t[0], @y[0] | ||
| 546 | pxor @t[1], @y[1] | ||
| 547 | pshufd \$0x93, @t[2], @t[2] | ||
| 548 | pxor @t[5], @y[0] | ||
| 549 | pxor @t[6], @y[1] | ||
| 550 | pxor @t[7], @y[0] | ||
| 551 | pshufd \$0x93, @t[4], @t[4] | ||
| 552 | pxor @t[6], @t[7] # clobber t[7] | ||
| 553 | pxor @y[0], @y[1] | ||
| 554 | |||
| 555 | pxor @t[0], @y[3] | ||
| 556 | pshufd \$0x93, @t[0], @t[0] | ||
| 557 | pxor @t[1], @y[2] | ||
| 558 | pxor @t[1], @y[4] | ||
| 559 | pxor @t[2], @y[2] | ||
| 560 | pshufd \$0x93, @t[1], @t[1] | ||
| 561 | pxor @t[2], @y[3] | ||
| 562 | pxor @t[2], @y[5] | ||
| 563 | pxor @t[7], @y[2] | ||
| 564 | pshufd \$0x93, @t[2], @t[2] | ||
| 565 | pxor @t[3], @y[3] | ||
| 566 | pxor @t[3], @y[6] | ||
| 567 | pxor @t[3], @y[4] | ||
| 568 | pshufd \$0x93, @t[3], @t[3] | ||
| 569 | pxor @t[4], @y[7] | ||
| 570 | pxor @t[4], @y[5] | ||
| 571 | pxor @t[7], @y[7] | ||
| 572 | pxor @t[5], @y[3] | ||
| 573 | pxor @t[4], @y[4] | ||
| 574 | pxor @t[5], @t[7] # clobber t[7] even more | ||
| 575 | |||
| 576 | pxor @t[7], @y[5] | ||
| 577 | pshufd \$0x93, @t[4], @t[4] | ||
| 578 | pxor @t[7], @y[6] | ||
| 579 | pxor @t[7], @y[4] | ||
| 580 | |||
| 581 | pxor @t[5], @t[7] | ||
| 582 | pshufd \$0x93, @t[5], @t[5] | ||
| 583 | pxor @t[6], @t[7] # restore t[7] | ||
| 584 | |||
| 585 | # multiplication by 0x0d | ||
| 586 | pxor @y[7], @y[4] | ||
| 587 | pxor @t[4], @y[7] | ||
| 588 | pshufd \$0x93, @t[6], @t[6] | ||
| 589 | pxor @t[0], @y[2] | ||
| 590 | pxor @t[5], @y[7] | ||
| 591 | pxor @t[2], @y[2] | ||
| 592 | pshufd \$0x93, @t[7], @t[7] | ||
| 593 | |||
| 594 | pxor @y[1], @y[3] | ||
| 595 | pxor @t[1], @y[1] | ||
| 596 | pxor @t[0], @y[0] | ||
| 597 | pxor @t[0], @y[3] | ||
| 598 | pxor @t[5], @y[1] | ||
| 599 | pxor @t[5], @y[0] | ||
| 600 | pxor @t[7], @y[1] | ||
| 601 | pshufd \$0x93, @t[0], @t[0] | ||
| 602 | pxor @t[6], @y[0] | ||
| 603 | pxor @y[1], @y[3] | ||
| 604 | pxor @t[1], @y[4] | ||
| 605 | pshufd \$0x93, @t[1], @t[1] | ||
| 606 | |||
| 607 | pxor @t[7], @y[7] | ||
| 608 | pxor @t[2], @y[4] | ||
| 609 | pxor @t[2], @y[5] | ||
| 610 | pshufd \$0x93, @t[2], @t[2] | ||
| 611 | pxor @t[6], @y[2] | ||
| 612 | pxor @t[3], @t[6] # clobber t[6] | ||
| 613 | pxor @y[7], @y[4] | ||
| 614 | pxor @t[6], @y[3] | ||
| 615 | |||
| 616 | pxor @t[6], @y[6] | ||
| 617 | pxor @t[5], @y[5] | ||
| 618 | pxor @t[4], @y[6] | ||
| 619 | pshufd \$0x93, @t[4], @t[4] | ||
| 620 | pxor @t[6], @y[5] | ||
| 621 | pxor @t[7], @y[6] | ||
| 622 | pxor @t[3], @t[6] # restore t[6] | ||
| 623 | |||
| 624 | pshufd \$0x93, @t[5], @t[5] | ||
| 625 | pshufd \$0x93, @t[6], @t[6] | ||
| 626 | pshufd \$0x93, @t[7], @t[7] | ||
| 627 | pshufd \$0x93, @t[3], @t[3] | ||
| 628 | |||
| 629 | # multiplication by 0x09 | ||
| 630 | pxor @y[1], @y[4] | ||
| 631 | pxor @y[1], @t[1] # t[1]=y[1] | ||
| 632 | pxor @t[5], @t[0] # clobber t[0] | ||
| 633 | pxor @t[5], @t[1] | ||
| 634 | pxor @t[0], @y[3] | ||
| 635 | pxor @y[0], @t[0] # t[0]=y[0] | ||
| 636 | pxor @t[6], @t[1] | ||
| 637 | pxor @t[7], @t[6] # clobber t[6] | ||
| 638 | pxor @t[1], @y[4] | ||
| 639 | pxor @t[4], @y[7] | ||
| 640 | pxor @y[4], @t[4] # t[4]=y[4] | ||
| 641 | pxor @t[3], @y[6] | ||
| 642 | pxor @y[3], @t[3] # t[3]=y[3] | ||
| 643 | pxor @t[2], @y[5] | ||
| 644 | pxor @y[2], @t[2] # t[2]=y[2] | ||
| 645 | pxor @t[7], @t[3] | ||
| 646 | pxor @y[5], @t[5] # t[5]=y[5] | ||
| 647 | pxor @t[6], @t[2] | ||
| 648 | pxor @t[6], @t[5] | ||
| 649 | pxor @y[6], @t[6] # t[6]=y[6] | ||
| 650 | pxor @y[7], @t[7] # t[7]=y[7] | ||
| 651 | |||
| 652 | movdqa @t[0],@XMM[0] | ||
| 653 | movdqa @t[1],@XMM[1] | ||
| 654 | movdqa @t[2],@XMM[2] | ||
| 655 | movdqa @t[3],@XMM[3] | ||
| 656 | movdqa @t[4],@XMM[4] | ||
| 657 | movdqa @t[5],@XMM[5] | ||
| 658 | movdqa @t[6],@XMM[6] | ||
| 659 | movdqa @t[7],@XMM[7] | ||
| 660 | ___ | ||
| 661 | } | ||
| 662 | |||
| 663 | sub aesenc { # not used | ||
| 664 | my @b=@_[0..7]; | ||
| 665 | my @t=@_[8..15]; | ||
| 666 | $code.=<<___; | ||
| 667 | movdqa 0x30($const),@t[0] # .LSR | ||
| 668 | ___ | ||
| 669 | &ShiftRows (@b,@t[0]); | ||
| 670 | &Sbox (@b,@t); | ||
| 671 | &MixColumns (@b[0,1,4,6,3,7,2,5],@t); | ||
| 672 | } | ||
| 673 | |||
| 674 | sub aesenclast { # not used | ||
| 675 | my @b=@_[0..7]; | ||
| 676 | my @t=@_[8..15]; | ||
| 677 | $code.=<<___; | ||
| 678 | movdqa 0x40($const),@t[0] # .LSRM0 | ||
| 679 | ___ | ||
| 680 | &ShiftRows (@b,@t[0]); | ||
| 681 | &Sbox (@b,@t); | ||
| 682 | $code.=<<___ | ||
| 683 | pxor 0x00($key),@b[0] | ||
| 684 | pxor 0x10($key),@b[1] | ||
| 685 | pxor 0x20($key),@b[4] | ||
| 686 | pxor 0x30($key),@b[6] | ||
| 687 | pxor 0x40($key),@b[3] | ||
| 688 | pxor 0x50($key),@b[7] | ||
| 689 | pxor 0x60($key),@b[2] | ||
| 690 | pxor 0x70($key),@b[5] | ||
| 691 | ___ | ||
| 692 | } | ||
| 693 | |||
| 694 | sub swapmove { | ||
| 695 | my ($a,$b,$n,$mask,$t)=@_; | ||
| 696 | $code.=<<___; | ||
| 697 | movdqa $b,$t | ||
| 698 | psrlq \$$n,$b | ||
| 699 | pxor $a,$b | ||
| 700 | pand $mask,$b | ||
| 701 | pxor $b,$a | ||
| 702 | psllq \$$n,$b | ||
| 703 | pxor $t,$b | ||
| 704 | ___ | ||
| 705 | } | ||
| 706 | sub swapmove2x { | ||
| 707 | my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; | ||
| 708 | $code.=<<___; | ||
| 709 | movdqa $b0,$t0 | ||
| 710 | psrlq \$$n,$b0 | ||
| 711 | movdqa $b1,$t1 | ||
| 712 | psrlq \$$n,$b1 | ||
| 713 | pxor $a0,$b0 | ||
| 714 | pxor $a1,$b1 | ||
| 715 | pand $mask,$b0 | ||
| 716 | pand $mask,$b1 | ||
| 717 | pxor $b0,$a0 | ||
| 718 | psllq \$$n,$b0 | ||
| 719 | pxor $b1,$a1 | ||
| 720 | psllq \$$n,$b1 | ||
| 721 | pxor $t0,$b0 | ||
| 722 | pxor $t1,$b1 | ||
| 723 | ___ | ||
| 724 | } | ||
| 725 | |||
| 726 | sub bitslice { | ||
| 727 | my @x=reverse(@_[0..7]); | ||
| 728 | my ($t0,$t1,$t2,$t3)=@_[8..11]; | ||
| 729 | $code.=<<___; | ||
| 730 | movdqa 0x00($const),$t0 # .LBS0 | ||
| 731 | movdqa 0x10($const),$t1 # .LBS1 | ||
| 732 | ___ | ||
| 733 | &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); | ||
| 734 | &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | ||
| 735 | $code.=<<___; | ||
| 736 | movdqa 0x20($const),$t0 # .LBS2 | ||
| 737 | ___ | ||
| 738 | &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); | ||
| 739 | &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | ||
| 740 | |||
| 741 | &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); | ||
| 742 | &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); | ||
| 743 | } | ||
| 744 | |||
| 745 | $code.=<<___; | ||
| 746 | .text | ||
| 747 | |||
| 748 | .extern asm_AES_encrypt | ||
| 749 | .extern asm_AES_decrypt | ||
| 750 | |||
| 751 | .type _bsaes_encrypt8,\@abi-omnipotent | ||
| 752 | .align 64 | ||
| 753 | _bsaes_encrypt8: | ||
| 754 | lea .LBS0(%rip), $const # constants table | ||
| 755 | |||
| 756 | movdqa ($key), @XMM[9] # round 0 key | ||
| 757 | lea 0x10($key), $key | ||
| 758 | movdqa 0x50($const), @XMM[8] # .LM0SR | ||
| 759 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
| 760 | pxor @XMM[9], @XMM[1] | ||
| 761 | pshufb @XMM[8], @XMM[0] | ||
| 762 | pxor @XMM[9], @XMM[2] | ||
| 763 | pshufb @XMM[8], @XMM[1] | ||
| 764 | pxor @XMM[9], @XMM[3] | ||
| 765 | pshufb @XMM[8], @XMM[2] | ||
| 766 | pxor @XMM[9], @XMM[4] | ||
| 767 | pshufb @XMM[8], @XMM[3] | ||
| 768 | pxor @XMM[9], @XMM[5] | ||
| 769 | pshufb @XMM[8], @XMM[4] | ||
| 770 | pxor @XMM[9], @XMM[6] | ||
| 771 | pshufb @XMM[8], @XMM[5] | ||
| 772 | pxor @XMM[9], @XMM[7] | ||
| 773 | pshufb @XMM[8], @XMM[6] | ||
| 774 | pshufb @XMM[8], @XMM[7] | ||
| 775 | _bsaes_encrypt8_bitslice: | ||
| 776 | ___ | ||
| 777 | &bitslice (@XMM[0..7, 8..11]); | ||
| 778 | $code.=<<___; | ||
| 779 | dec $rounds | ||
| 780 | jmp .Lenc_sbox | ||
| 781 | .align 16 | ||
| 782 | .Lenc_loop: | ||
| 783 | ___ | ||
| 784 | &ShiftRows (@XMM[0..7, 8]); | ||
| 785 | $code.=".Lenc_sbox:\n"; | ||
| 786 | &Sbox (@XMM[0..7, 8..15]); | ||
| 787 | $code.=<<___; | ||
| 788 | dec $rounds | ||
| 789 | jl .Lenc_done | ||
| 790 | ___ | ||
| 791 | &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); | ||
| 792 | $code.=<<___; | ||
| 793 | movdqa 0x30($const), @XMM[8] # .LSR | ||
| 794 | jnz .Lenc_loop | ||
| 795 | movdqa 0x40($const), @XMM[8] # .LSRM0 | ||
| 796 | jmp .Lenc_loop | ||
| 797 | .align 16 | ||
| 798 | .Lenc_done: | ||
| 799 | ___ | ||
| 800 | # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb | ||
| 801 | &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); | ||
| 802 | $code.=<<___; | ||
| 803 | movdqa ($key), @XMM[8] # last round key | ||
| 804 | pxor @XMM[8], @XMM[4] | ||
| 805 | pxor @XMM[8], @XMM[6] | ||
| 806 | pxor @XMM[8], @XMM[3] | ||
| 807 | pxor @XMM[8], @XMM[7] | ||
| 808 | pxor @XMM[8], @XMM[2] | ||
| 809 | pxor @XMM[8], @XMM[5] | ||
| 810 | pxor @XMM[8], @XMM[0] | ||
| 811 | pxor @XMM[8], @XMM[1] | ||
| 812 | ret | ||
| 813 | .size _bsaes_encrypt8,.-_bsaes_encrypt8 | ||
| 814 | |||
| 815 | .type _bsaes_decrypt8,\@abi-omnipotent | ||
| 816 | .align 64 | ||
| 817 | _bsaes_decrypt8: | ||
| 818 | lea .LBS0(%rip), $const # constants table | ||
| 819 | |||
| 820 | movdqa ($key), @XMM[9] # round 0 key | ||
| 821 | lea 0x10($key), $key | ||
| 822 | movdqa -0x30($const), @XMM[8] # .LM0ISR | ||
| 823 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
| 824 | pxor @XMM[9], @XMM[1] | ||
| 825 | pshufb @XMM[8], @XMM[0] | ||
| 826 | pxor @XMM[9], @XMM[2] | ||
| 827 | pshufb @XMM[8], @XMM[1] | ||
| 828 | pxor @XMM[9], @XMM[3] | ||
| 829 | pshufb @XMM[8], @XMM[2] | ||
| 830 | pxor @XMM[9], @XMM[4] | ||
| 831 | pshufb @XMM[8], @XMM[3] | ||
| 832 | pxor @XMM[9], @XMM[5] | ||
| 833 | pshufb @XMM[8], @XMM[4] | ||
| 834 | pxor @XMM[9], @XMM[6] | ||
| 835 | pshufb @XMM[8], @XMM[5] | ||
| 836 | pxor @XMM[9], @XMM[7] | ||
| 837 | pshufb @XMM[8], @XMM[6] | ||
| 838 | pshufb @XMM[8], @XMM[7] | ||
| 839 | ___ | ||
| 840 | &bitslice (@XMM[0..7, 8..11]); | ||
| 841 | $code.=<<___; | ||
| 842 | dec $rounds | ||
| 843 | jmp .Ldec_sbox | ||
| 844 | .align 16 | ||
| 845 | .Ldec_loop: | ||
| 846 | ___ | ||
| 847 | &ShiftRows (@XMM[0..7, 8]); | ||
| 848 | $code.=".Ldec_sbox:\n"; | ||
| 849 | &InvSbox (@XMM[0..7, 8..15]); | ||
| 850 | $code.=<<___; | ||
| 851 | dec $rounds | ||
| 852 | jl .Ldec_done | ||
| 853 | ___ | ||
| 854 | &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); | ||
| 855 | $code.=<<___; | ||
| 856 | movdqa -0x10($const), @XMM[8] # .LISR | ||
| 857 | jnz .Ldec_loop | ||
| 858 | movdqa -0x20($const), @XMM[8] # .LISRM0 | ||
| 859 | jmp .Ldec_loop | ||
| 860 | .align 16 | ||
| 861 | .Ldec_done: | ||
| 862 | ___ | ||
| 863 | &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); | ||
| 864 | $code.=<<___; | ||
| 865 | movdqa ($key), @XMM[8] # last round key | ||
| 866 | pxor @XMM[8], @XMM[6] | ||
| 867 | pxor @XMM[8], @XMM[4] | ||
| 868 | pxor @XMM[8], @XMM[2] | ||
| 869 | pxor @XMM[8], @XMM[7] | ||
| 870 | pxor @XMM[8], @XMM[3] | ||
| 871 | pxor @XMM[8], @XMM[5] | ||
| 872 | pxor @XMM[8], @XMM[0] | ||
| 873 | pxor @XMM[8], @XMM[1] | ||
| 874 | ret | ||
| 875 | .size _bsaes_decrypt8,.-_bsaes_decrypt8 | ||
| 876 | ___ | ||
| 877 | } | ||
| 878 | { | ||
| 879 | my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); | ||
| 880 | |||
| 881 | sub bitslice_key { | ||
| 882 | my @x=reverse(@_[0..7]); | ||
| 883 | my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; | ||
| 884 | |||
| 885 | &swapmove (@x[0,1],1,$bs0,$t2,$t3); | ||
| 886 | $code.=<<___; | ||
| 887 | #&swapmove(@x[2,3],1,$t0,$t2,$t3); | ||
| 888 | movdqa @x[0], @x[2] | ||
| 889 | movdqa @x[1], @x[3] | ||
| 890 | ___ | ||
| 891 | #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | ||
| 892 | |||
| 893 | &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); | ||
| 894 | $code.=<<___; | ||
| 895 | #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | ||
| 896 | movdqa @x[0], @x[4] | ||
| 897 | movdqa @x[2], @x[6] | ||
| 898 | movdqa @x[1], @x[5] | ||
| 899 | movdqa @x[3], @x[7] | ||
| 900 | ___ | ||
| 901 | &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); | ||
| 902 | &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); | ||
| 903 | } | ||
| 904 | |||
| 905 | $code.=<<___; | ||
| 906 | .type _bsaes_key_convert,\@abi-omnipotent | ||
| 907 | .align 16 | ||
| 908 | _bsaes_key_convert: | ||
| 909 | lea .Lmasks(%rip), $const | ||
| 910 | movdqu ($inp), %xmm7 # load round 0 key | ||
| 911 | lea 0x10($inp), $inp | ||
| 912 | movdqa 0x00($const), %xmm0 # 0x01... | ||
| 913 | movdqa 0x10($const), %xmm1 # 0x02... | ||
| 914 | movdqa 0x20($const), %xmm2 # 0x04... | ||
| 915 | movdqa 0x30($const), %xmm3 # 0x08... | ||
| 916 | movdqa 0x40($const), %xmm4 # .LM0 | ||
| 917 | pcmpeqd %xmm5, %xmm5 # .LNOT | ||
| 918 | |||
| 919 | movdqu ($inp), %xmm6 # load round 1 key | ||
| 920 | movdqa %xmm7, ($out) # save round 0 key | ||
| 921 | lea 0x10($out), $out | ||
| 922 | dec $rounds | ||
| 923 | jmp .Lkey_loop | ||
| 924 | .align 16 | ||
| 925 | .Lkey_loop: | ||
| 926 | pshufb %xmm4, %xmm6 # .LM0 | ||
| 927 | |||
| 928 | movdqa %xmm0, %xmm8 | ||
| 929 | movdqa %xmm1, %xmm9 | ||
| 930 | |||
| 931 | pand %xmm6, %xmm8 | ||
| 932 | pand %xmm6, %xmm9 | ||
| 933 | movdqa %xmm2, %xmm10 | ||
| 934 | pcmpeqb %xmm0, %xmm8 | ||
| 935 | psllq \$4, %xmm0 # 0x10... | ||
| 936 | movdqa %xmm3, %xmm11 | ||
| 937 | pcmpeqb %xmm1, %xmm9 | ||
| 938 | psllq \$4, %xmm1 # 0x20... | ||
| 939 | |||
| 940 | pand %xmm6, %xmm10 | ||
| 941 | pand %xmm6, %xmm11 | ||
| 942 | movdqa %xmm0, %xmm12 | ||
| 943 | pcmpeqb %xmm2, %xmm10 | ||
| 944 | psllq \$4, %xmm2 # 0x40... | ||
| 945 | movdqa %xmm1, %xmm13 | ||
| 946 | pcmpeqb %xmm3, %xmm11 | ||
| 947 | psllq \$4, %xmm3 # 0x80... | ||
| 948 | |||
| 949 | movdqa %xmm2, %xmm14 | ||
| 950 | movdqa %xmm3, %xmm15 | ||
| 951 | pxor %xmm5, %xmm8 # "pnot" | ||
| 952 | pxor %xmm5, %xmm9 | ||
| 953 | |||
| 954 | pand %xmm6, %xmm12 | ||
| 955 | pand %xmm6, %xmm13 | ||
| 956 | movdqa %xmm8, 0x00($out) # write bit-sliced round key | ||
| 957 | pcmpeqb %xmm0, %xmm12 | ||
| 958 | psrlq \$4, %xmm0 # 0x01... | ||
| 959 | movdqa %xmm9, 0x10($out) | ||
| 960 | pcmpeqb %xmm1, %xmm13 | ||
| 961 | psrlq \$4, %xmm1 # 0x02... | ||
| 962 | lea 0x10($inp), $inp | ||
| 963 | |||
| 964 | pand %xmm6, %xmm14 | ||
| 965 | pand %xmm6, %xmm15 | ||
| 966 | movdqa %xmm10, 0x20($out) | ||
| 967 | pcmpeqb %xmm2, %xmm14 | ||
| 968 | psrlq \$4, %xmm2 # 0x04... | ||
| 969 | movdqa %xmm11, 0x30($out) | ||
| 970 | pcmpeqb %xmm3, %xmm15 | ||
| 971 | psrlq \$4, %xmm3 # 0x08... | ||
| 972 | movdqu ($inp), %xmm6 # load next round key | ||
| 973 | |||
| 974 | pxor %xmm5, %xmm13 # "pnot" | ||
| 975 | pxor %xmm5, %xmm14 | ||
| 976 | movdqa %xmm12, 0x40($out) | ||
| 977 | movdqa %xmm13, 0x50($out) | ||
| 978 | movdqa %xmm14, 0x60($out) | ||
| 979 | movdqa %xmm15, 0x70($out) | ||
| 980 | lea 0x80($out),$out | ||
| 981 | dec $rounds | ||
| 982 | jnz .Lkey_loop | ||
| 983 | |||
| 984 | movdqa 0x50($const), %xmm7 # .L63 | ||
| 985 | #movdqa %xmm6, ($out) # don't save last round key | ||
| 986 | ret | ||
| 987 | .size _bsaes_key_convert,.-_bsaes_key_convert | ||
| 988 | ___ | ||
| 989 | } | ||
| 990 | |||
| 991 | if (0 && !$win64) { # following four functions are unsupported interface | ||
| 992 | # used for benchmarking... | ||
| 993 | $code.=<<___; | ||
| 994 | .globl bsaes_enc_key_convert | ||
| 995 | .type bsaes_enc_key_convert,\@function,2 | ||
| 996 | .align 16 | ||
| 997 | bsaes_enc_key_convert: | ||
| 998 | mov 240($inp),%r10d # pass rounds | ||
| 999 | mov $inp,%rcx # pass key | ||
| 1000 | mov $out,%rax # pass key schedule | ||
| 1001 | call _bsaes_key_convert | ||
| 1002 | pxor %xmm6,%xmm7 # fix up last round key | ||
| 1003 | movdqa %xmm7,(%rax) # save last round key | ||
| 1004 | ret | ||
| 1005 | .size bsaes_enc_key_convert,.-bsaes_enc_key_convert | ||
| 1006 | |||
| 1007 | .globl bsaes_encrypt_128 | ||
| 1008 | .type bsaes_encrypt_128,\@function,4 | ||
| 1009 | .align 16 | ||
| 1010 | bsaes_encrypt_128: | ||
| 1011 | .Lenc128_loop: | ||
| 1012 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1013 | movdqu 0x10($inp), @XMM[1] | ||
| 1014 | movdqu 0x20($inp), @XMM[2] | ||
| 1015 | movdqu 0x30($inp), @XMM[3] | ||
| 1016 | movdqu 0x40($inp), @XMM[4] | ||
| 1017 | movdqu 0x50($inp), @XMM[5] | ||
| 1018 | movdqu 0x60($inp), @XMM[6] | ||
| 1019 | movdqu 0x70($inp), @XMM[7] | ||
| 1020 | mov $key, %rax # pass the $key | ||
| 1021 | lea 0x80($inp), $inp | ||
| 1022 | mov \$10,%r10d | ||
| 1023 | |||
| 1024 | call _bsaes_encrypt8 | ||
| 1025 | |||
| 1026 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1027 | movdqu @XMM[1], 0x10($out) | ||
| 1028 | movdqu @XMM[4], 0x20($out) | ||
| 1029 | movdqu @XMM[6], 0x30($out) | ||
| 1030 | movdqu @XMM[3], 0x40($out) | ||
| 1031 | movdqu @XMM[7], 0x50($out) | ||
| 1032 | movdqu @XMM[2], 0x60($out) | ||
| 1033 | movdqu @XMM[5], 0x70($out) | ||
| 1034 | lea 0x80($out), $out | ||
| 1035 | sub \$0x80,$len | ||
| 1036 | ja .Lenc128_loop | ||
| 1037 | ret | ||
| 1038 | .size bsaes_encrypt_128,.-bsaes_encrypt_128 | ||
| 1039 | |||
| 1040 | .globl bsaes_dec_key_convert | ||
| 1041 | .type bsaes_dec_key_convert,\@function,2 | ||
| 1042 | .align 16 | ||
| 1043 | bsaes_dec_key_convert: | ||
| 1044 | mov 240($inp),%r10d # pass rounds | ||
| 1045 | mov $inp,%rcx # pass key | ||
| 1046 | mov $out,%rax # pass key schedule | ||
| 1047 | call _bsaes_key_convert | ||
| 1048 | pxor ($out),%xmm7 # fix up round 0 key | ||
| 1049 | movdqa %xmm6,(%rax) # save last round key | ||
| 1050 | movdqa %xmm7,($out) | ||
| 1051 | ret | ||
| 1052 | .size bsaes_dec_key_convert,.-bsaes_dec_key_convert | ||
| 1053 | |||
| 1054 | .globl bsaes_decrypt_128 | ||
| 1055 | .type bsaes_decrypt_128,\@function,4 | ||
| 1056 | .align 16 | ||
| 1057 | bsaes_decrypt_128: | ||
| 1058 | .Ldec128_loop: | ||
| 1059 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1060 | movdqu 0x10($inp), @XMM[1] | ||
| 1061 | movdqu 0x20($inp), @XMM[2] | ||
| 1062 | movdqu 0x30($inp), @XMM[3] | ||
| 1063 | movdqu 0x40($inp), @XMM[4] | ||
| 1064 | movdqu 0x50($inp), @XMM[5] | ||
| 1065 | movdqu 0x60($inp), @XMM[6] | ||
| 1066 | movdqu 0x70($inp), @XMM[7] | ||
| 1067 | mov $key, %rax # pass the $key | ||
| 1068 | lea 0x80($inp), $inp | ||
| 1069 | mov \$10,%r10d | ||
| 1070 | |||
| 1071 | call _bsaes_decrypt8 | ||
| 1072 | |||
| 1073 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1074 | movdqu @XMM[1], 0x10($out) | ||
| 1075 | movdqu @XMM[6], 0x20($out) | ||
| 1076 | movdqu @XMM[4], 0x30($out) | ||
| 1077 | movdqu @XMM[2], 0x40($out) | ||
| 1078 | movdqu @XMM[7], 0x50($out) | ||
| 1079 | movdqu @XMM[3], 0x60($out) | ||
| 1080 | movdqu @XMM[5], 0x70($out) | ||
| 1081 | lea 0x80($out), $out | ||
| 1082 | sub \$0x80,$len | ||
| 1083 | ja .Ldec128_loop | ||
| 1084 | ret | ||
| 1085 | .size bsaes_decrypt_128,.-bsaes_decrypt_128 | ||
| 1086 | ___ | ||
| 1087 | } | ||
| 1088 | { | ||
| 1089 | ###################################################################### | ||
| 1090 | # | ||
| 1091 | # OpenSSL interface | ||
| 1092 | # | ||
| 1093 | my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") | ||
| 1094 | : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); | ||
| 1095 | my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); | ||
| 1096 | |||
| 1097 | if ($ecb) { | ||
| 1098 | $code.=<<___; | ||
| 1099 | .globl bsaes_ecb_encrypt_blocks | ||
| 1100 | .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent | ||
| 1101 | .align 16 | ||
| 1102 | bsaes_ecb_encrypt_blocks: | ||
| 1103 | mov %rsp, %rax | ||
| 1104 | .Lecb_enc_prologue: | ||
| 1105 | push %rbp | ||
| 1106 | push %rbx | ||
| 1107 | push %r12 | ||
| 1108 | push %r13 | ||
| 1109 | push %r14 | ||
| 1110 | push %r15 | ||
| 1111 | lea -0x48(%rsp),%rsp | ||
| 1112 | ___ | ||
| 1113 | $code.=<<___ if ($win64); | ||
| 1114 | lea -0xa0(%rsp), %rsp | ||
| 1115 | movaps %xmm6, 0x40(%rsp) | ||
| 1116 | movaps %xmm7, 0x50(%rsp) | ||
| 1117 | movaps %xmm8, 0x60(%rsp) | ||
| 1118 | movaps %xmm9, 0x70(%rsp) | ||
| 1119 | movaps %xmm10, 0x80(%rsp) | ||
| 1120 | movaps %xmm11, 0x90(%rsp) | ||
| 1121 | movaps %xmm12, 0xa0(%rsp) | ||
| 1122 | movaps %xmm13, 0xb0(%rsp) | ||
| 1123 | movaps %xmm14, 0xc0(%rsp) | ||
| 1124 | movaps %xmm15, 0xd0(%rsp) | ||
| 1125 | .Lecb_enc_body: | ||
| 1126 | ___ | ||
| 1127 | $code.=<<___; | ||
| 1128 | mov %rsp,%rbp # backup %rsp | ||
| 1129 | mov 240($arg4),%eax # rounds | ||
| 1130 | mov $arg1,$inp # backup arguments | ||
| 1131 | mov $arg2,$out | ||
| 1132 | mov $arg3,$len | ||
| 1133 | mov $arg4,$key | ||
| 1134 | cmp \$8,$arg3 | ||
| 1135 | jb .Lecb_enc_short | ||
| 1136 | |||
| 1137 | mov %eax,%ebx # backup rounds | ||
| 1138 | shl \$7,%rax # 128 bytes per inner round key | ||
| 1139 | sub \$`128-32`,%rax # size of bit-sliced key schedule | ||
| 1140 | sub %rax,%rsp | ||
| 1141 | mov %rsp,%rax # pass key schedule | ||
| 1142 | mov $key,%rcx # pass key | ||
| 1143 | mov %ebx,%r10d # pass rounds | ||
| 1144 | call _bsaes_key_convert | ||
| 1145 | pxor %xmm6,%xmm7 # fix up last round key | ||
| 1146 | movdqa %xmm7,(%rax) # save last round key | ||
| 1147 | |||
| 1148 | sub \$8,$len | ||
| 1149 | .Lecb_enc_loop: | ||
| 1150 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1151 | movdqu 0x10($inp), @XMM[1] | ||
| 1152 | movdqu 0x20($inp), @XMM[2] | ||
| 1153 | movdqu 0x30($inp), @XMM[3] | ||
| 1154 | movdqu 0x40($inp), @XMM[4] | ||
| 1155 | movdqu 0x50($inp), @XMM[5] | ||
| 1156 | mov %rsp, %rax # pass key schedule | ||
| 1157 | movdqu 0x60($inp), @XMM[6] | ||
| 1158 | mov %ebx,%r10d # pass rounds | ||
| 1159 | movdqu 0x70($inp), @XMM[7] | ||
| 1160 | lea 0x80($inp), $inp | ||
| 1161 | |||
| 1162 | call _bsaes_encrypt8 | ||
| 1163 | |||
| 1164 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1165 | movdqu @XMM[1], 0x10($out) | ||
| 1166 | movdqu @XMM[4], 0x20($out) | ||
| 1167 | movdqu @XMM[6], 0x30($out) | ||
| 1168 | movdqu @XMM[3], 0x40($out) | ||
| 1169 | movdqu @XMM[7], 0x50($out) | ||
| 1170 | movdqu @XMM[2], 0x60($out) | ||
| 1171 | movdqu @XMM[5], 0x70($out) | ||
| 1172 | lea 0x80($out), $out | ||
| 1173 | sub \$8,$len | ||
| 1174 | jnc .Lecb_enc_loop | ||
| 1175 | |||
| 1176 | add \$8,$len | ||
| 1177 | jz .Lecb_enc_done | ||
| 1178 | |||
| 1179 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1180 | mov %rsp, %rax # pass key schedule | ||
| 1181 | mov %ebx,%r10d # pass rounds | ||
| 1182 | cmp \$2,$len | ||
| 1183 | jb .Lecb_enc_one | ||
| 1184 | movdqu 0x10($inp), @XMM[1] | ||
| 1185 | je .Lecb_enc_two | ||
| 1186 | movdqu 0x20($inp), @XMM[2] | ||
| 1187 | cmp \$4,$len | ||
| 1188 | jb .Lecb_enc_three | ||
| 1189 | movdqu 0x30($inp), @XMM[3] | ||
| 1190 | je .Lecb_enc_four | ||
| 1191 | movdqu 0x40($inp), @XMM[4] | ||
| 1192 | cmp \$6,$len | ||
| 1193 | jb .Lecb_enc_five | ||
| 1194 | movdqu 0x50($inp), @XMM[5] | ||
| 1195 | je .Lecb_enc_six | ||
| 1196 | movdqu 0x60($inp), @XMM[6] | ||
| 1197 | call _bsaes_encrypt8 | ||
| 1198 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1199 | movdqu @XMM[1], 0x10($out) | ||
| 1200 | movdqu @XMM[4], 0x20($out) | ||
| 1201 | movdqu @XMM[6], 0x30($out) | ||
| 1202 | movdqu @XMM[3], 0x40($out) | ||
| 1203 | movdqu @XMM[7], 0x50($out) | ||
| 1204 | movdqu @XMM[2], 0x60($out) | ||
| 1205 | jmp .Lecb_enc_done | ||
| 1206 | .align 16 | ||
| 1207 | .Lecb_enc_six: | ||
| 1208 | call _bsaes_encrypt8 | ||
| 1209 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1210 | movdqu @XMM[1], 0x10($out) | ||
| 1211 | movdqu @XMM[4], 0x20($out) | ||
| 1212 | movdqu @XMM[6], 0x30($out) | ||
| 1213 | movdqu @XMM[3], 0x40($out) | ||
| 1214 | movdqu @XMM[7], 0x50($out) | ||
| 1215 | jmp .Lecb_enc_done | ||
| 1216 | .align 16 | ||
| 1217 | .Lecb_enc_five: | ||
| 1218 | call _bsaes_encrypt8 | ||
| 1219 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1220 | movdqu @XMM[1], 0x10($out) | ||
| 1221 | movdqu @XMM[4], 0x20($out) | ||
| 1222 | movdqu @XMM[6], 0x30($out) | ||
| 1223 | movdqu @XMM[3], 0x40($out) | ||
| 1224 | jmp .Lecb_enc_done | ||
| 1225 | .align 16 | ||
| 1226 | .Lecb_enc_four: | ||
| 1227 | call _bsaes_encrypt8 | ||
| 1228 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1229 | movdqu @XMM[1], 0x10($out) | ||
| 1230 | movdqu @XMM[4], 0x20($out) | ||
| 1231 | movdqu @XMM[6], 0x30($out) | ||
| 1232 | jmp .Lecb_enc_done | ||
| 1233 | .align 16 | ||
| 1234 | .Lecb_enc_three: | ||
| 1235 | call _bsaes_encrypt8 | ||
| 1236 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1237 | movdqu @XMM[1], 0x10($out) | ||
| 1238 | movdqu @XMM[4], 0x20($out) | ||
| 1239 | jmp .Lecb_enc_done | ||
| 1240 | .align 16 | ||
| 1241 | .Lecb_enc_two: | ||
| 1242 | call _bsaes_encrypt8 | ||
| 1243 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1244 | movdqu @XMM[1], 0x10($out) | ||
| 1245 | jmp .Lecb_enc_done | ||
| 1246 | .align 16 | ||
| 1247 | .Lecb_enc_one: | ||
| 1248 | call _bsaes_encrypt8 | ||
| 1249 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1250 | jmp .Lecb_enc_done | ||
| 1251 | .align 16 | ||
| 1252 | .Lecb_enc_short: | ||
| 1253 | lea ($inp), $arg1 | ||
| 1254 | lea ($out), $arg2 | ||
| 1255 | lea ($key), $arg3 | ||
| 1256 | call asm_AES_encrypt | ||
| 1257 | lea 16($inp), $inp | ||
| 1258 | lea 16($out), $out | ||
| 1259 | dec $len | ||
| 1260 | jnz .Lecb_enc_short | ||
| 1261 | |||
| 1262 | .Lecb_enc_done: | ||
| 1263 | lea (%rsp),%rax | ||
| 1264 | pxor %xmm0, %xmm0 | ||
| 1265 | .Lecb_enc_bzero: # wipe key schedule [if any] | ||
| 1266 | movdqa %xmm0, 0x00(%rax) | ||
| 1267 | movdqa %xmm0, 0x10(%rax) | ||
| 1268 | lea 0x20(%rax), %rax | ||
| 1269 | cmp %rax, %rbp | ||
| 1270 | jb .Lecb_enc_bzero | ||
| 1271 | |||
| 1272 | lea (%rbp),%rsp # restore %rsp | ||
| 1273 | ___ | ||
| 1274 | $code.=<<___ if ($win64); | ||
| 1275 | movaps 0x40(%rbp), %xmm6 | ||
| 1276 | movaps 0x50(%rbp), %xmm7 | ||
| 1277 | movaps 0x60(%rbp), %xmm8 | ||
| 1278 | movaps 0x70(%rbp), %xmm9 | ||
| 1279 | movaps 0x80(%rbp), %xmm10 | ||
| 1280 | movaps 0x90(%rbp), %xmm11 | ||
| 1281 | movaps 0xa0(%rbp), %xmm12 | ||
| 1282 | movaps 0xb0(%rbp), %xmm13 | ||
| 1283 | movaps 0xc0(%rbp), %xmm14 | ||
| 1284 | movaps 0xd0(%rbp), %xmm15 | ||
| 1285 | lea 0xa0(%rbp), %rsp | ||
| 1286 | ___ | ||
| 1287 | $code.=<<___; | ||
| 1288 | mov 0x48(%rsp), %r15 | ||
| 1289 | mov 0x50(%rsp), %r14 | ||
| 1290 | mov 0x58(%rsp), %r13 | ||
| 1291 | mov 0x60(%rsp), %r12 | ||
| 1292 | mov 0x68(%rsp), %rbx | ||
| 1293 | mov 0x70(%rsp), %rax | ||
| 1294 | lea 0x78(%rsp), %rsp | ||
| 1295 | mov %rax, %rbp | ||
| 1296 | .Lecb_enc_epilogue: | ||
| 1297 | ret | ||
| 1298 | .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks | ||
| 1299 | |||
| 1300 | .globl bsaes_ecb_decrypt_blocks | ||
| 1301 | .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent | ||
| 1302 | .align 16 | ||
| 1303 | bsaes_ecb_decrypt_blocks: | ||
| 1304 | mov %rsp, %rax | ||
| 1305 | .Lecb_dec_prologue: | ||
| 1306 | push %rbp | ||
| 1307 | push %rbx | ||
| 1308 | push %r12 | ||
| 1309 | push %r13 | ||
| 1310 | push %r14 | ||
| 1311 | push %r15 | ||
| 1312 | lea -0x48(%rsp),%rsp | ||
| 1313 | ___ | ||
| 1314 | $code.=<<___ if ($win64); | ||
| 1315 | lea -0xa0(%rsp), %rsp | ||
| 1316 | movaps %xmm6, 0x40(%rsp) | ||
| 1317 | movaps %xmm7, 0x50(%rsp) | ||
| 1318 | movaps %xmm8, 0x60(%rsp) | ||
| 1319 | movaps %xmm9, 0x70(%rsp) | ||
| 1320 | movaps %xmm10, 0x80(%rsp) | ||
| 1321 | movaps %xmm11, 0x90(%rsp) | ||
| 1322 | movaps %xmm12, 0xa0(%rsp) | ||
| 1323 | movaps %xmm13, 0xb0(%rsp) | ||
| 1324 | movaps %xmm14, 0xc0(%rsp) | ||
| 1325 | movaps %xmm15, 0xd0(%rsp) | ||
| 1326 | .Lecb_dec_body: | ||
| 1327 | ___ | ||
| 1328 | $code.=<<___; | ||
| 1329 | mov %rsp,%rbp # backup %rsp | ||
| 1330 | mov 240($arg4),%eax # rounds | ||
| 1331 | mov $arg1,$inp # backup arguments | ||
| 1332 | mov $arg2,$out | ||
| 1333 | mov $arg3,$len | ||
| 1334 | mov $arg4,$key | ||
| 1335 | cmp \$8,$arg3 | ||
| 1336 | jb .Lecb_dec_short | ||
| 1337 | |||
| 1338 | mov %eax,%ebx # backup rounds | ||
| 1339 | shl \$7,%rax # 128 bytes per inner round key | ||
| 1340 | sub \$`128-32`,%rax # size of bit-sliced key schedule | ||
| 1341 | sub %rax,%rsp | ||
| 1342 | mov %rsp,%rax # pass key schedule | ||
| 1343 | mov $key,%rcx # pass key | ||
| 1344 | mov %ebx,%r10d # pass rounds | ||
| 1345 | call _bsaes_key_convert | ||
| 1346 | pxor (%rsp),%xmm7 # fix up 0 round key | ||
| 1347 | movdqa %xmm6,(%rax) # save last round key | ||
| 1348 | movdqa %xmm7,(%rsp) | ||
| 1349 | |||
| 1350 | sub \$8,$len | ||
| 1351 | .Lecb_dec_loop: | ||
| 1352 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1353 | movdqu 0x10($inp), @XMM[1] | ||
| 1354 | movdqu 0x20($inp), @XMM[2] | ||
| 1355 | movdqu 0x30($inp), @XMM[3] | ||
| 1356 | movdqu 0x40($inp), @XMM[4] | ||
| 1357 | movdqu 0x50($inp), @XMM[5] | ||
| 1358 | mov %rsp, %rax # pass key schedule | ||
| 1359 | movdqu 0x60($inp), @XMM[6] | ||
| 1360 | mov %ebx,%r10d # pass rounds | ||
| 1361 | movdqu 0x70($inp), @XMM[7] | ||
| 1362 | lea 0x80($inp), $inp | ||
| 1363 | |||
| 1364 | call _bsaes_decrypt8 | ||
| 1365 | |||
| 1366 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1367 | movdqu @XMM[1], 0x10($out) | ||
| 1368 | movdqu @XMM[6], 0x20($out) | ||
| 1369 | movdqu @XMM[4], 0x30($out) | ||
| 1370 | movdqu @XMM[2], 0x40($out) | ||
| 1371 | movdqu @XMM[7], 0x50($out) | ||
| 1372 | movdqu @XMM[3], 0x60($out) | ||
| 1373 | movdqu @XMM[5], 0x70($out) | ||
| 1374 | lea 0x80($out), $out | ||
| 1375 | sub \$8,$len | ||
| 1376 | jnc .Lecb_dec_loop | ||
| 1377 | |||
| 1378 | add \$8,$len | ||
| 1379 | jz .Lecb_dec_done | ||
| 1380 | |||
| 1381 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1382 | mov %rsp, %rax # pass key schedule | ||
| 1383 | mov %ebx,%r10d # pass rounds | ||
| 1384 | cmp \$2,$len | ||
| 1385 | jb .Lecb_dec_one | ||
| 1386 | movdqu 0x10($inp), @XMM[1] | ||
| 1387 | je .Lecb_dec_two | ||
| 1388 | movdqu 0x20($inp), @XMM[2] | ||
| 1389 | cmp \$4,$len | ||
| 1390 | jb .Lecb_dec_three | ||
| 1391 | movdqu 0x30($inp), @XMM[3] | ||
| 1392 | je .Lecb_dec_four | ||
| 1393 | movdqu 0x40($inp), @XMM[4] | ||
| 1394 | cmp \$6,$len | ||
| 1395 | jb .Lecb_dec_five | ||
| 1396 | movdqu 0x50($inp), @XMM[5] | ||
| 1397 | je .Lecb_dec_six | ||
| 1398 | movdqu 0x60($inp), @XMM[6] | ||
| 1399 | call _bsaes_decrypt8 | ||
| 1400 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1401 | movdqu @XMM[1], 0x10($out) | ||
| 1402 | movdqu @XMM[6], 0x20($out) | ||
| 1403 | movdqu @XMM[4], 0x30($out) | ||
| 1404 | movdqu @XMM[2], 0x40($out) | ||
| 1405 | movdqu @XMM[7], 0x50($out) | ||
| 1406 | movdqu @XMM[3], 0x60($out) | ||
| 1407 | jmp .Lecb_dec_done | ||
| 1408 | .align 16 | ||
| 1409 | .Lecb_dec_six: | ||
| 1410 | call _bsaes_decrypt8 | ||
| 1411 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1412 | movdqu @XMM[1], 0x10($out) | ||
| 1413 | movdqu @XMM[6], 0x20($out) | ||
| 1414 | movdqu @XMM[4], 0x30($out) | ||
| 1415 | movdqu @XMM[2], 0x40($out) | ||
| 1416 | movdqu @XMM[7], 0x50($out) | ||
| 1417 | jmp .Lecb_dec_done | ||
| 1418 | .align 16 | ||
| 1419 | .Lecb_dec_five: | ||
| 1420 | call _bsaes_decrypt8 | ||
| 1421 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1422 | movdqu @XMM[1], 0x10($out) | ||
| 1423 | movdqu @XMM[6], 0x20($out) | ||
| 1424 | movdqu @XMM[4], 0x30($out) | ||
| 1425 | movdqu @XMM[2], 0x40($out) | ||
| 1426 | jmp .Lecb_dec_done | ||
| 1427 | .align 16 | ||
| 1428 | .Lecb_dec_four: | ||
| 1429 | call _bsaes_decrypt8 | ||
| 1430 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1431 | movdqu @XMM[1], 0x10($out) | ||
| 1432 | movdqu @XMM[6], 0x20($out) | ||
| 1433 | movdqu @XMM[4], 0x30($out) | ||
| 1434 | jmp .Lecb_dec_done | ||
| 1435 | .align 16 | ||
| 1436 | .Lecb_dec_three: | ||
| 1437 | call _bsaes_decrypt8 | ||
| 1438 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1439 | movdqu @XMM[1], 0x10($out) | ||
| 1440 | movdqu @XMM[6], 0x20($out) | ||
| 1441 | jmp .Lecb_dec_done | ||
| 1442 | .align 16 | ||
| 1443 | .Lecb_dec_two: | ||
| 1444 | call _bsaes_decrypt8 | ||
| 1445 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1446 | movdqu @XMM[1], 0x10($out) | ||
| 1447 | jmp .Lecb_dec_done | ||
| 1448 | .align 16 | ||
| 1449 | .Lecb_dec_one: | ||
| 1450 | call _bsaes_decrypt8 | ||
| 1451 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1452 | jmp .Lecb_dec_done | ||
| 1453 | .align 16 | ||
| 1454 | .Lecb_dec_short: | ||
| 1455 | lea ($inp), $arg1 | ||
| 1456 | lea ($out), $arg2 | ||
| 1457 | lea ($key), $arg3 | ||
| 1458 | call asm_AES_decrypt | ||
| 1459 | lea 16($inp), $inp | ||
| 1460 | lea 16($out), $out | ||
| 1461 | dec $len | ||
| 1462 | jnz .Lecb_dec_short | ||
| 1463 | |||
| 1464 | .Lecb_dec_done: | ||
| 1465 | lea (%rsp),%rax | ||
| 1466 | pxor %xmm0, %xmm0 | ||
| 1467 | .Lecb_dec_bzero: # wipe key schedule [if any] | ||
| 1468 | movdqa %xmm0, 0x00(%rax) | ||
| 1469 | movdqa %xmm0, 0x10(%rax) | ||
| 1470 | lea 0x20(%rax), %rax | ||
| 1471 | cmp %rax, %rbp | ||
| 1472 | jb .Lecb_dec_bzero | ||
| 1473 | |||
| 1474 | lea (%rbp),%rsp # restore %rsp | ||
| 1475 | ___ | ||
| 1476 | $code.=<<___ if ($win64); | ||
| 1477 | movaps 0x40(%rbp), %xmm6 | ||
| 1478 | movaps 0x50(%rbp), %xmm7 | ||
| 1479 | movaps 0x60(%rbp), %xmm8 | ||
| 1480 | movaps 0x70(%rbp), %xmm9 | ||
| 1481 | movaps 0x80(%rbp), %xmm10 | ||
| 1482 | movaps 0x90(%rbp), %xmm11 | ||
| 1483 | movaps 0xa0(%rbp), %xmm12 | ||
| 1484 | movaps 0xb0(%rbp), %xmm13 | ||
| 1485 | movaps 0xc0(%rbp), %xmm14 | ||
| 1486 | movaps 0xd0(%rbp), %xmm15 | ||
| 1487 | lea 0xa0(%rbp), %rsp | ||
| 1488 | ___ | ||
| 1489 | $code.=<<___; | ||
| 1490 | mov 0x48(%rsp), %r15 | ||
| 1491 | mov 0x50(%rsp), %r14 | ||
| 1492 | mov 0x58(%rsp), %r13 | ||
| 1493 | mov 0x60(%rsp), %r12 | ||
| 1494 | mov 0x68(%rsp), %rbx | ||
| 1495 | mov 0x70(%rsp), %rax | ||
| 1496 | lea 0x78(%rsp), %rsp | ||
| 1497 | mov %rax, %rbp | ||
| 1498 | .Lecb_dec_epilogue: | ||
| 1499 | ret | ||
| 1500 | .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks | ||
| 1501 | ___ | ||
| 1502 | } | ||
| 1503 | $code.=<<___; | ||
| 1504 | .extern asm_AES_cbc_encrypt | ||
| 1505 | .globl bsaes_cbc_encrypt | ||
| 1506 | .type bsaes_cbc_encrypt,\@abi-omnipotent | ||
| 1507 | .align 16 | ||
| 1508 | bsaes_cbc_encrypt: | ||
| 1509 | ___ | ||
| 1510 | $code.=<<___ if ($win64); | ||
| 1511 | mov 48(%rsp),$arg6 # pull direction flag | ||
| 1512 | ___ | ||
| 1513 | $code.=<<___; | ||
| 1514 | cmp \$0,$arg6 | ||
| 1515 | jne asm_AES_cbc_encrypt | ||
| 1516 | cmp \$128,$arg3 | ||
| 1517 | jb asm_AES_cbc_encrypt | ||
| 1518 | |||
| 1519 | mov %rsp, %rax | ||
| 1520 | .Lcbc_dec_prologue: | ||
| 1521 | push %rbp | ||
| 1522 | push %rbx | ||
| 1523 | push %r12 | ||
| 1524 | push %r13 | ||
| 1525 | push %r14 | ||
| 1526 | push %r15 | ||
| 1527 | lea -0x48(%rsp), %rsp | ||
| 1528 | ___ | ||
| 1529 | $code.=<<___ if ($win64); | ||
| 1530 | mov 0xa0(%rsp),$arg5 # pull ivp | ||
| 1531 | lea -0xa0(%rsp), %rsp | ||
| 1532 | movaps %xmm6, 0x40(%rsp) | ||
| 1533 | movaps %xmm7, 0x50(%rsp) | ||
| 1534 | movaps %xmm8, 0x60(%rsp) | ||
| 1535 | movaps %xmm9, 0x70(%rsp) | ||
| 1536 | movaps %xmm10, 0x80(%rsp) | ||
| 1537 | movaps %xmm11, 0x90(%rsp) | ||
| 1538 | movaps %xmm12, 0xa0(%rsp) | ||
| 1539 | movaps %xmm13, 0xb0(%rsp) | ||
| 1540 | movaps %xmm14, 0xc0(%rsp) | ||
| 1541 | movaps %xmm15, 0xd0(%rsp) | ||
| 1542 | .Lcbc_dec_body: | ||
| 1543 | ___ | ||
| 1544 | $code.=<<___; | ||
| 1545 | mov %rsp, %rbp # backup %rsp | ||
| 1546 | mov 240($arg4), %eax # rounds | ||
| 1547 | mov $arg1, $inp # backup arguments | ||
| 1548 | mov $arg2, $out | ||
| 1549 | mov $arg3, $len | ||
| 1550 | mov $arg4, $key | ||
| 1551 | mov $arg5, %rbx | ||
| 1552 | shr \$4, $len # bytes to blocks | ||
| 1553 | |||
| 1554 | mov %eax, %edx # rounds | ||
| 1555 | shl \$7, %rax # 128 bytes per inner round key | ||
| 1556 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
| 1557 | sub %rax, %rsp | ||
| 1558 | |||
| 1559 | mov %rsp, %rax # pass key schedule | ||
| 1560 | mov $key, %rcx # pass key | ||
| 1561 | mov %edx, %r10d # pass rounds | ||
| 1562 | call _bsaes_key_convert | ||
| 1563 | pxor (%rsp),%xmm7 # fix up 0 round key | ||
| 1564 | movdqa %xmm6,(%rax) # save last round key | ||
| 1565 | movdqa %xmm7,(%rsp) | ||
| 1566 | |||
| 1567 | movdqu (%rbx), @XMM[15] # load IV | ||
| 1568 | sub \$8,$len | ||
| 1569 | .Lcbc_dec_loop: | ||
| 1570 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1571 | movdqu 0x10($inp), @XMM[1] | ||
| 1572 | movdqu 0x20($inp), @XMM[2] | ||
| 1573 | movdqu 0x30($inp), @XMM[3] | ||
| 1574 | movdqu 0x40($inp), @XMM[4] | ||
| 1575 | movdqu 0x50($inp), @XMM[5] | ||
| 1576 | mov %rsp, %rax # pass key schedule | ||
| 1577 | movdqu 0x60($inp), @XMM[6] | ||
| 1578 | mov %edx,%r10d # pass rounds | ||
| 1579 | movdqu 0x70($inp), @XMM[7] | ||
| 1580 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1581 | |||
| 1582 | call _bsaes_decrypt8 | ||
| 1583 | |||
| 1584 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1585 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1586 | movdqu 0x10($inp), @XMM[9] | ||
| 1587 | pxor @XMM[8], @XMM[1] | ||
| 1588 | movdqu 0x20($inp), @XMM[10] | ||
| 1589 | pxor @XMM[9], @XMM[6] | ||
| 1590 | movdqu 0x30($inp), @XMM[11] | ||
| 1591 | pxor @XMM[10], @XMM[4] | ||
| 1592 | movdqu 0x40($inp), @XMM[12] | ||
| 1593 | pxor @XMM[11], @XMM[2] | ||
| 1594 | movdqu 0x50($inp), @XMM[13] | ||
| 1595 | pxor @XMM[12], @XMM[7] | ||
| 1596 | movdqu 0x60($inp), @XMM[14] | ||
| 1597 | pxor @XMM[13], @XMM[3] | ||
| 1598 | movdqu 0x70($inp), @XMM[15] # IV | ||
| 1599 | pxor @XMM[14], @XMM[5] | ||
| 1600 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1601 | lea 0x80($inp), $inp | ||
| 1602 | movdqu @XMM[1], 0x10($out) | ||
| 1603 | movdqu @XMM[6], 0x20($out) | ||
| 1604 | movdqu @XMM[4], 0x30($out) | ||
| 1605 | movdqu @XMM[2], 0x40($out) | ||
| 1606 | movdqu @XMM[7], 0x50($out) | ||
| 1607 | movdqu @XMM[3], 0x60($out) | ||
| 1608 | movdqu @XMM[5], 0x70($out) | ||
| 1609 | lea 0x80($out), $out | ||
| 1610 | sub \$8,$len | ||
| 1611 | jnc .Lcbc_dec_loop | ||
| 1612 | |||
| 1613 | add \$8,$len | ||
| 1614 | jz .Lcbc_dec_done | ||
| 1615 | |||
| 1616 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1617 | mov %rsp, %rax # pass key schedule | ||
| 1618 | mov %edx, %r10d # pass rounds | ||
| 1619 | cmp \$2,$len | ||
| 1620 | jb .Lcbc_dec_one | ||
| 1621 | movdqu 0x10($inp), @XMM[1] | ||
| 1622 | je .Lcbc_dec_two | ||
| 1623 | movdqu 0x20($inp), @XMM[2] | ||
| 1624 | cmp \$4,$len | ||
| 1625 | jb .Lcbc_dec_three | ||
| 1626 | movdqu 0x30($inp), @XMM[3] | ||
| 1627 | je .Lcbc_dec_four | ||
| 1628 | movdqu 0x40($inp), @XMM[4] | ||
| 1629 | cmp \$6,$len | ||
| 1630 | jb .Lcbc_dec_five | ||
| 1631 | movdqu 0x50($inp), @XMM[5] | ||
| 1632 | je .Lcbc_dec_six | ||
| 1633 | movdqu 0x60($inp), @XMM[6] | ||
| 1634 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1635 | call _bsaes_decrypt8 | ||
| 1636 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1637 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1638 | movdqu 0x10($inp), @XMM[9] | ||
| 1639 | pxor @XMM[8], @XMM[1] | ||
| 1640 | movdqu 0x20($inp), @XMM[10] | ||
| 1641 | pxor @XMM[9], @XMM[6] | ||
| 1642 | movdqu 0x30($inp), @XMM[11] | ||
| 1643 | pxor @XMM[10], @XMM[4] | ||
| 1644 | movdqu 0x40($inp), @XMM[12] | ||
| 1645 | pxor @XMM[11], @XMM[2] | ||
| 1646 | movdqu 0x50($inp), @XMM[13] | ||
| 1647 | pxor @XMM[12], @XMM[7] | ||
| 1648 | movdqu 0x60($inp), @XMM[15] # IV | ||
| 1649 | pxor @XMM[13], @XMM[3] | ||
| 1650 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1651 | movdqu @XMM[1], 0x10($out) | ||
| 1652 | movdqu @XMM[6], 0x20($out) | ||
| 1653 | movdqu @XMM[4], 0x30($out) | ||
| 1654 | movdqu @XMM[2], 0x40($out) | ||
| 1655 | movdqu @XMM[7], 0x50($out) | ||
| 1656 | movdqu @XMM[3], 0x60($out) | ||
| 1657 | jmp .Lcbc_dec_done | ||
| 1658 | .align 16 | ||
| 1659 | .Lcbc_dec_six: | ||
| 1660 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1661 | call _bsaes_decrypt8 | ||
| 1662 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1663 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1664 | movdqu 0x10($inp), @XMM[9] | ||
| 1665 | pxor @XMM[8], @XMM[1] | ||
| 1666 | movdqu 0x20($inp), @XMM[10] | ||
| 1667 | pxor @XMM[9], @XMM[6] | ||
| 1668 | movdqu 0x30($inp), @XMM[11] | ||
| 1669 | pxor @XMM[10], @XMM[4] | ||
| 1670 | movdqu 0x40($inp), @XMM[12] | ||
| 1671 | pxor @XMM[11], @XMM[2] | ||
| 1672 | movdqu 0x50($inp), @XMM[15] # IV | ||
| 1673 | pxor @XMM[12], @XMM[7] | ||
| 1674 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1675 | movdqu @XMM[1], 0x10($out) | ||
| 1676 | movdqu @XMM[6], 0x20($out) | ||
| 1677 | movdqu @XMM[4], 0x30($out) | ||
| 1678 | movdqu @XMM[2], 0x40($out) | ||
| 1679 | movdqu @XMM[7], 0x50($out) | ||
| 1680 | jmp .Lcbc_dec_done | ||
| 1681 | .align 16 | ||
| 1682 | .Lcbc_dec_five: | ||
| 1683 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1684 | call _bsaes_decrypt8 | ||
| 1685 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1686 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1687 | movdqu 0x10($inp), @XMM[9] | ||
| 1688 | pxor @XMM[8], @XMM[1] | ||
| 1689 | movdqu 0x20($inp), @XMM[10] | ||
| 1690 | pxor @XMM[9], @XMM[6] | ||
| 1691 | movdqu 0x30($inp), @XMM[11] | ||
| 1692 | pxor @XMM[10], @XMM[4] | ||
| 1693 | movdqu 0x40($inp), @XMM[15] # IV | ||
| 1694 | pxor @XMM[11], @XMM[2] | ||
| 1695 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1696 | movdqu @XMM[1], 0x10($out) | ||
| 1697 | movdqu @XMM[6], 0x20($out) | ||
| 1698 | movdqu @XMM[4], 0x30($out) | ||
| 1699 | movdqu @XMM[2], 0x40($out) | ||
| 1700 | jmp .Lcbc_dec_done | ||
| 1701 | .align 16 | ||
| 1702 | .Lcbc_dec_four: | ||
| 1703 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1704 | call _bsaes_decrypt8 | ||
| 1705 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1706 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1707 | movdqu 0x10($inp), @XMM[9] | ||
| 1708 | pxor @XMM[8], @XMM[1] | ||
| 1709 | movdqu 0x20($inp), @XMM[10] | ||
| 1710 | pxor @XMM[9], @XMM[6] | ||
| 1711 | movdqu 0x30($inp), @XMM[15] # IV | ||
| 1712 | pxor @XMM[10], @XMM[4] | ||
| 1713 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1714 | movdqu @XMM[1], 0x10($out) | ||
| 1715 | movdqu @XMM[6], 0x20($out) | ||
| 1716 | movdqu @XMM[4], 0x30($out) | ||
| 1717 | jmp .Lcbc_dec_done | ||
| 1718 | .align 16 | ||
| 1719 | .Lcbc_dec_three: | ||
| 1720 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1721 | call _bsaes_decrypt8 | ||
| 1722 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1723 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1724 | movdqu 0x10($inp), @XMM[9] | ||
| 1725 | pxor @XMM[8], @XMM[1] | ||
| 1726 | movdqu 0x20($inp), @XMM[15] # IV | ||
| 1727 | pxor @XMM[9], @XMM[6] | ||
| 1728 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1729 | movdqu @XMM[1], 0x10($out) | ||
| 1730 | movdqu @XMM[6], 0x20($out) | ||
| 1731 | jmp .Lcbc_dec_done | ||
| 1732 | .align 16 | ||
| 1733 | .Lcbc_dec_two: | ||
| 1734 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1735 | call _bsaes_decrypt8 | ||
| 1736 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1737 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1738 | movdqu 0x10($inp), @XMM[15] # IV | ||
| 1739 | pxor @XMM[8], @XMM[1] | ||
| 1740 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1741 | movdqu @XMM[1], 0x10($out) | ||
| 1742 | jmp .Lcbc_dec_done | ||
| 1743 | .align 16 | ||
| 1744 | .Lcbc_dec_one: | ||
| 1745 | lea ($inp), $arg1 | ||
| 1746 | lea 0x20(%rbp), $arg2 # buffer output | ||
| 1747 | lea ($key), $arg3 | ||
| 1748 | call asm_AES_decrypt # doesn't touch %xmm | ||
| 1749 | pxor 0x20(%rbp), @XMM[15] # ^= IV | ||
| 1750 | movdqu @XMM[15], ($out) # write output | ||
| 1751 | movdqa @XMM[0], @XMM[15] # IV | ||
| 1752 | |||
| 1753 | .Lcbc_dec_done: | ||
| 1754 | movdqu @XMM[15], (%rbx) # return IV | ||
| 1755 | lea (%rsp), %rax | ||
| 1756 | pxor %xmm0, %xmm0 | ||
| 1757 | .Lcbc_dec_bzero: # wipe key schedule [if any] | ||
| 1758 | movdqa %xmm0, 0x00(%rax) | ||
| 1759 | movdqa %xmm0, 0x10(%rax) | ||
| 1760 | lea 0x20(%rax), %rax | ||
| 1761 | cmp %rax, %rbp | ||
| 1762 | ja .Lcbc_dec_bzero | ||
| 1763 | |||
| 1764 | lea (%rbp),%rsp # restore %rsp | ||
| 1765 | ___ | ||
| 1766 | $code.=<<___ if ($win64); | ||
| 1767 | movaps 0x40(%rbp), %xmm6 | ||
| 1768 | movaps 0x50(%rbp), %xmm7 | ||
| 1769 | movaps 0x60(%rbp), %xmm8 | ||
| 1770 | movaps 0x70(%rbp), %xmm9 | ||
| 1771 | movaps 0x80(%rbp), %xmm10 | ||
| 1772 | movaps 0x90(%rbp), %xmm11 | ||
| 1773 | movaps 0xa0(%rbp), %xmm12 | ||
| 1774 | movaps 0xb0(%rbp), %xmm13 | ||
| 1775 | movaps 0xc0(%rbp), %xmm14 | ||
| 1776 | movaps 0xd0(%rbp), %xmm15 | ||
| 1777 | lea 0xa0(%rbp), %rsp | ||
| 1778 | ___ | ||
| 1779 | $code.=<<___; | ||
| 1780 | mov 0x48(%rsp), %r15 | ||
| 1781 | mov 0x50(%rsp), %r14 | ||
| 1782 | mov 0x58(%rsp), %r13 | ||
| 1783 | mov 0x60(%rsp), %r12 | ||
| 1784 | mov 0x68(%rsp), %rbx | ||
| 1785 | mov 0x70(%rsp), %rax | ||
| 1786 | lea 0x78(%rsp), %rsp | ||
| 1787 | mov %rax, %rbp | ||
| 1788 | .Lcbc_dec_epilogue: | ||
| 1789 | ret | ||
| 1790 | .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt | ||
| 1791 | |||
| 1792 | .globl bsaes_ctr32_encrypt_blocks | ||
| 1793 | .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent | ||
| 1794 | .align 16 | ||
| 1795 | bsaes_ctr32_encrypt_blocks: | ||
| 1796 | mov %rsp, %rax | ||
| 1797 | .Lctr_enc_prologue: | ||
| 1798 | push %rbp | ||
| 1799 | push %rbx | ||
| 1800 | push %r12 | ||
| 1801 | push %r13 | ||
| 1802 | push %r14 | ||
| 1803 | push %r15 | ||
| 1804 | lea -0x48(%rsp), %rsp | ||
| 1805 | ___ | ||
| 1806 | $code.=<<___ if ($win64); | ||
| 1807 | mov 0xa0(%rsp),$arg5 # pull ivp | ||
| 1808 | lea -0xa0(%rsp), %rsp | ||
| 1809 | movaps %xmm6, 0x40(%rsp) | ||
| 1810 | movaps %xmm7, 0x50(%rsp) | ||
| 1811 | movaps %xmm8, 0x60(%rsp) | ||
| 1812 | movaps %xmm9, 0x70(%rsp) | ||
| 1813 | movaps %xmm10, 0x80(%rsp) | ||
| 1814 | movaps %xmm11, 0x90(%rsp) | ||
| 1815 | movaps %xmm12, 0xa0(%rsp) | ||
| 1816 | movaps %xmm13, 0xb0(%rsp) | ||
| 1817 | movaps %xmm14, 0xc0(%rsp) | ||
| 1818 | movaps %xmm15, 0xd0(%rsp) | ||
| 1819 | .Lctr_enc_body: | ||
| 1820 | ___ | ||
| 1821 | $code.=<<___; | ||
| 1822 | mov %rsp, %rbp # backup %rsp | ||
| 1823 | movdqu ($arg5), %xmm0 # load counter | ||
| 1824 | mov 240($arg4), %eax # rounds | ||
| 1825 | mov $arg1, $inp # backup arguments | ||
| 1826 | mov $arg2, $out | ||
| 1827 | mov $arg3, $len | ||
| 1828 | mov $arg4, $key | ||
| 1829 | movdqa %xmm0, 0x20(%rbp) # copy counter | ||
| 1830 | cmp \$8, $arg3 | ||
| 1831 | jb .Lctr_enc_short | ||
| 1832 | |||
| 1833 | mov %eax, %ebx # rounds | ||
| 1834 | shl \$7, %rax # 128 bytes per inner round key | ||
| 1835 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
| 1836 | sub %rax, %rsp | ||
| 1837 | |||
| 1838 | mov %rsp, %rax # pass key schedule | ||
| 1839 | mov $key, %rcx # pass key | ||
| 1840 | mov %ebx, %r10d # pass rounds | ||
| 1841 | call _bsaes_key_convert | ||
| 1842 | pxor %xmm6,%xmm7 # fix up last round key | ||
| 1843 | movdqa %xmm7,(%rax) # save last round key | ||
| 1844 | |||
| 1845 | movdqa (%rsp), @XMM[9] # load round0 key | ||
| 1846 | lea .LADD1(%rip), %r11 | ||
| 1847 | movdqa 0x20(%rbp), @XMM[0] # counter copy | ||
| 1848 | movdqa -0x20(%r11), @XMM[8] # .LSWPUP | ||
| 1849 | pshufb @XMM[8], @XMM[9] # byte swap upper part | ||
| 1850 | pshufb @XMM[8], @XMM[0] | ||
| 1851 | movdqa @XMM[9], (%rsp) # save adjusted round0 key | ||
| 1852 | jmp .Lctr_enc_loop | ||
| 1853 | .align 16 | ||
| 1854 | .Lctr_enc_loop: | ||
| 1855 | movdqa @XMM[0], 0x20(%rbp) # save counter | ||
| 1856 | movdqa @XMM[0], @XMM[1] # prepare 8 counter values | ||
| 1857 | movdqa @XMM[0], @XMM[2] | ||
| 1858 | paddd 0x00(%r11), @XMM[1] # .LADD1 | ||
| 1859 | movdqa @XMM[0], @XMM[3] | ||
| 1860 | paddd 0x10(%r11), @XMM[2] # .LADD2 | ||
| 1861 | movdqa @XMM[0], @XMM[4] | ||
| 1862 | paddd 0x20(%r11), @XMM[3] # .LADD3 | ||
| 1863 | movdqa @XMM[0], @XMM[5] | ||
| 1864 | paddd 0x30(%r11), @XMM[4] # .LADD4 | ||
| 1865 | movdqa @XMM[0], @XMM[6] | ||
| 1866 | paddd 0x40(%r11), @XMM[5] # .LADD5 | ||
| 1867 | movdqa @XMM[0], @XMM[7] | ||
| 1868 | paddd 0x50(%r11), @XMM[6] # .LADD6 | ||
| 1869 | paddd 0x60(%r11), @XMM[7] # .LADD7 | ||
| 1870 | |||
| 1871 | # Borrow prologue from _bsaes_encrypt8 to use the opportunity | ||
| 1872 | # to flip byte order in 32-bit counter | ||
| 1873 | movdqa (%rsp), @XMM[9] # round 0 key | ||
| 1874 | lea 0x10(%rsp), %rax # pass key schedule | ||
| 1875 | movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR | ||
| 1876 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
| 1877 | pxor @XMM[9], @XMM[1] | ||
| 1878 | pshufb @XMM[8], @XMM[0] | ||
| 1879 | pxor @XMM[9], @XMM[2] | ||
| 1880 | pshufb @XMM[8], @XMM[1] | ||
| 1881 | pxor @XMM[9], @XMM[3] | ||
| 1882 | pshufb @XMM[8], @XMM[2] | ||
| 1883 | pxor @XMM[9], @XMM[4] | ||
| 1884 | pshufb @XMM[8], @XMM[3] | ||
| 1885 | pxor @XMM[9], @XMM[5] | ||
| 1886 | pshufb @XMM[8], @XMM[4] | ||
| 1887 | pxor @XMM[9], @XMM[6] | ||
| 1888 | pshufb @XMM[8], @XMM[5] | ||
| 1889 | pxor @XMM[9], @XMM[7] | ||
| 1890 | pshufb @XMM[8], @XMM[6] | ||
| 1891 | lea .LBS0(%rip), %r11 # constants table | ||
| 1892 | pshufb @XMM[8], @XMM[7] | ||
| 1893 | mov %ebx,%r10d # pass rounds | ||
| 1894 | |||
| 1895 | call _bsaes_encrypt8_bitslice | ||
| 1896 | |||
| 1897 | sub \$8,$len | ||
| 1898 | jc .Lctr_enc_loop_done | ||
| 1899 | |||
| 1900 | movdqu 0x00($inp), @XMM[8] # load input | ||
| 1901 | movdqu 0x10($inp), @XMM[9] | ||
| 1902 | movdqu 0x20($inp), @XMM[10] | ||
| 1903 | movdqu 0x30($inp), @XMM[11] | ||
| 1904 | movdqu 0x40($inp), @XMM[12] | ||
| 1905 | movdqu 0x50($inp), @XMM[13] | ||
| 1906 | movdqu 0x60($inp), @XMM[14] | ||
| 1907 | movdqu 0x70($inp), @XMM[15] | ||
| 1908 | lea 0x80($inp),$inp | ||
| 1909 | pxor @XMM[0], @XMM[8] | ||
| 1910 | movdqa 0x20(%rbp), @XMM[0] # load counter | ||
| 1911 | pxor @XMM[9], @XMM[1] | ||
| 1912 | movdqu @XMM[8], 0x00($out) # write output | ||
| 1913 | pxor @XMM[10], @XMM[4] | ||
| 1914 | movdqu @XMM[1], 0x10($out) | ||
| 1915 | pxor @XMM[11], @XMM[6] | ||
| 1916 | movdqu @XMM[4], 0x20($out) | ||
| 1917 | pxor @XMM[12], @XMM[3] | ||
| 1918 | movdqu @XMM[6], 0x30($out) | ||
| 1919 | pxor @XMM[13], @XMM[7] | ||
| 1920 | movdqu @XMM[3], 0x40($out) | ||
| 1921 | pxor @XMM[14], @XMM[2] | ||
| 1922 | movdqu @XMM[7], 0x50($out) | ||
| 1923 | pxor @XMM[15], @XMM[5] | ||
| 1924 | movdqu @XMM[2], 0x60($out) | ||
| 1925 | lea .LADD1(%rip), %r11 | ||
| 1926 | movdqu @XMM[5], 0x70($out) | ||
| 1927 | lea 0x80($out), $out | ||
| 1928 | paddd 0x70(%r11), @XMM[0] # .LADD8 | ||
| 1929 | jnz .Lctr_enc_loop | ||
| 1930 | |||
| 1931 | jmp .Lctr_enc_done | ||
| 1932 | .align 16 | ||
| 1933 | .Lctr_enc_loop_done: | ||
| 1934 | add \$8, $len | ||
| 1935 | movdqu 0x00($inp), @XMM[8] # load input | ||
| 1936 | pxor @XMM[8], @XMM[0] | ||
| 1937 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1938 | cmp \$2,$len | ||
| 1939 | jb .Lctr_enc_done | ||
| 1940 | movdqu 0x10($inp), @XMM[9] | ||
| 1941 | pxor @XMM[9], @XMM[1] | ||
| 1942 | movdqu @XMM[1], 0x10($out) | ||
| 1943 | je .Lctr_enc_done | ||
| 1944 | movdqu 0x20($inp), @XMM[10] | ||
| 1945 | pxor @XMM[10], @XMM[4] | ||
| 1946 | movdqu @XMM[4], 0x20($out) | ||
| 1947 | cmp \$4,$len | ||
| 1948 | jb .Lctr_enc_done | ||
| 1949 | movdqu 0x30($inp), @XMM[11] | ||
| 1950 | pxor @XMM[11], @XMM[6] | ||
| 1951 | movdqu @XMM[6], 0x30($out) | ||
| 1952 | je .Lctr_enc_done | ||
| 1953 | movdqu 0x40($inp), @XMM[12] | ||
| 1954 | pxor @XMM[12], @XMM[3] | ||
| 1955 | movdqu @XMM[3], 0x40($out) | ||
| 1956 | cmp \$6,$len | ||
| 1957 | jb .Lctr_enc_done | ||
| 1958 | movdqu 0x50($inp), @XMM[13] | ||
| 1959 | pxor @XMM[13], @XMM[7] | ||
| 1960 | movdqu @XMM[7], 0x50($out) | ||
| 1961 | je .Lctr_enc_done | ||
| 1962 | movdqu 0x60($inp), @XMM[14] | ||
| 1963 | pxor @XMM[14], @XMM[2] | ||
| 1964 | movdqu @XMM[2], 0x60($out) | ||
| 1965 | jmp .Lctr_enc_done | ||
| 1966 | |||
| 1967 | .align 16 | ||
| 1968 | .Lctr_enc_short: | ||
| 1969 | lea 0x20(%rbp), $arg1 | ||
| 1970 | lea 0x30(%rbp), $arg2 | ||
| 1971 | lea ($key), $arg3 | ||
| 1972 | call asm_AES_encrypt | ||
| 1973 | movdqu ($inp), @XMM[1] | ||
| 1974 | lea 16($inp), $inp | ||
| 1975 | mov 0x2c(%rbp), %eax # load 32-bit counter | ||
| 1976 | bswap %eax | ||
| 1977 | pxor 0x30(%rbp), @XMM[1] | ||
| 1978 | inc %eax # increment | ||
| 1979 | movdqu @XMM[1], ($out) | ||
| 1980 | bswap %eax | ||
| 1981 | lea 16($out), $out | ||
| 1982 | mov %eax, 0x2c(%rsp) # save 32-bit counter | ||
| 1983 | dec $len | ||
| 1984 | jnz .Lctr_enc_short | ||
| 1985 | |||
| 1986 | .Lctr_enc_done: | ||
| 1987 | lea (%rsp), %rax | ||
| 1988 | pxor %xmm0, %xmm0 | ||
| 1989 | .Lctr_enc_bzero: # wipe key schedule [if any] | ||
| 1990 | movdqa %xmm0, 0x00(%rax) | ||
| 1991 | movdqa %xmm0, 0x10(%rax) | ||
| 1992 | lea 0x20(%rax), %rax | ||
| 1993 | cmp %rax, %rbp | ||
| 1994 | ja .Lctr_enc_bzero | ||
| 1995 | |||
| 1996 | lea (%rbp),%rsp # restore %rsp | ||
| 1997 | ___ | ||
| 1998 | $code.=<<___ if ($win64); | ||
| 1999 | movaps 0x40(%rbp), %xmm6 | ||
| 2000 | movaps 0x50(%rbp), %xmm7 | ||
| 2001 | movaps 0x60(%rbp), %xmm8 | ||
| 2002 | movaps 0x70(%rbp), %xmm9 | ||
| 2003 | movaps 0x80(%rbp), %xmm10 | ||
| 2004 | movaps 0x90(%rbp), %xmm11 | ||
| 2005 | movaps 0xa0(%rbp), %xmm12 | ||
| 2006 | movaps 0xb0(%rbp), %xmm13 | ||
| 2007 | movaps 0xc0(%rbp), %xmm14 | ||
| 2008 | movaps 0xd0(%rbp), %xmm15 | ||
| 2009 | lea 0xa0(%rbp), %rsp | ||
| 2010 | ___ | ||
| 2011 | $code.=<<___; | ||
| 2012 | mov 0x48(%rsp), %r15 | ||
| 2013 | mov 0x50(%rsp), %r14 | ||
| 2014 | mov 0x58(%rsp), %r13 | ||
| 2015 | mov 0x60(%rsp), %r12 | ||
| 2016 | mov 0x68(%rsp), %rbx | ||
| 2017 | mov 0x70(%rsp), %rax | ||
| 2018 | lea 0x78(%rsp), %rsp | ||
| 2019 | mov %rax, %rbp | ||
| 2020 | .Lctr_enc_epilogue: | ||
| 2021 | ret | ||
| 2022 | .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks | ||
| 2023 | ___ | ||
| 2024 | ###################################################################### | ||
| 2025 | # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
| 2026 | # const AES_KEY *key1, const AES_KEY *key2, | ||
| 2027 | # const unsigned char iv[16]); | ||
| 2028 | # | ||
| 2029 | my ($twmask,$twres,$twtmp)=@XMM[13..15]; | ||
| 2030 | $code.=<<___; | ||
| 2031 | .globl bsaes_xts_encrypt | ||
| 2032 | .type bsaes_xts_encrypt,\@abi-omnipotent | ||
| 2033 | .align 16 | ||
| 2034 | bsaes_xts_encrypt: | ||
| 2035 | mov %rsp, %rax | ||
| 2036 | .Lxts_enc_prologue: | ||
| 2037 | push %rbp | ||
| 2038 | push %rbx | ||
| 2039 | push %r12 | ||
| 2040 | push %r13 | ||
| 2041 | push %r14 | ||
| 2042 | push %r15 | ||
| 2043 | lea -0x48(%rsp), %rsp | ||
| 2044 | ___ | ||
| 2045 | $code.=<<___ if ($win64); | ||
| 2046 | mov 0xa0(%rsp),$arg5 # pull key2 | ||
| 2047 | mov 0xa8(%rsp),$arg6 # pull ivp | ||
| 2048 | lea -0xa0(%rsp), %rsp | ||
| 2049 | movaps %xmm6, 0x40(%rsp) | ||
| 2050 | movaps %xmm7, 0x50(%rsp) | ||
| 2051 | movaps %xmm8, 0x60(%rsp) | ||
| 2052 | movaps %xmm9, 0x70(%rsp) | ||
| 2053 | movaps %xmm10, 0x80(%rsp) | ||
| 2054 | movaps %xmm11, 0x90(%rsp) | ||
| 2055 | movaps %xmm12, 0xa0(%rsp) | ||
| 2056 | movaps %xmm13, 0xb0(%rsp) | ||
| 2057 | movaps %xmm14, 0xc0(%rsp) | ||
| 2058 | movaps %xmm15, 0xd0(%rsp) | ||
| 2059 | .Lxts_enc_body: | ||
| 2060 | ___ | ||
| 2061 | $code.=<<___; | ||
| 2062 | mov %rsp, %rbp # backup %rsp | ||
| 2063 | mov $arg1, $inp # backup arguments | ||
| 2064 | mov $arg2, $out | ||
| 2065 | mov $arg3, $len | ||
| 2066 | mov $arg4, $key | ||
| 2067 | |||
| 2068 | lea ($arg6), $arg1 | ||
| 2069 | lea 0x20(%rbp), $arg2 | ||
| 2070 | lea ($arg5), $arg3 | ||
| 2071 | call asm_AES_encrypt # generate initial tweak | ||
| 2072 | |||
| 2073 | mov 240($key), %eax # rounds | ||
| 2074 | mov $len, %rbx # backup $len | ||
| 2075 | |||
| 2076 | mov %eax, %edx # rounds | ||
| 2077 | shl \$7, %rax # 128 bytes per inner round key | ||
| 2078 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
| 2079 | sub %rax, %rsp | ||
| 2080 | |||
| 2081 | mov %rsp, %rax # pass key schedule | ||
| 2082 | mov $key, %rcx # pass key | ||
| 2083 | mov %edx, %r10d # pass rounds | ||
| 2084 | call _bsaes_key_convert | ||
| 2085 | pxor %xmm6, %xmm7 # fix up last round key | ||
| 2086 | movdqa %xmm7, (%rax) # save last round key | ||
| 2087 | |||
| 2088 | and \$-16, $len | ||
| 2089 | sub \$0x80, %rsp # place for tweak[8] | ||
| 2090 | movdqa 0x20(%rbp), @XMM[7] # initial tweak | ||
| 2091 | |||
| 2092 | pxor $twtmp, $twtmp | ||
| 2093 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2094 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2095 | |||
| 2096 | sub \$0x80, $len | ||
| 2097 | jc .Lxts_enc_short | ||
| 2098 | jmp .Lxts_enc_loop | ||
| 2099 | |||
| 2100 | .align 16 | ||
| 2101 | .Lxts_enc_loop: | ||
| 2102 | ___ | ||
| 2103 | for ($i=0;$i<7;$i++) { | ||
| 2104 | $code.=<<___; | ||
| 2105 | pshufd \$0x13, $twtmp, $twres | ||
| 2106 | pxor $twtmp, $twtmp | ||
| 2107 | movdqa @XMM[7], @XMM[$i] | ||
| 2108 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
| 2109 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2110 | pand $twmask, $twres # isolate carry and residue | ||
| 2111 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2112 | pxor $twres, @XMM[7] | ||
| 2113 | ___ | ||
| 2114 | $code.=<<___ if ($i>=1); | ||
| 2115 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
| 2116 | ___ | ||
| 2117 | $code.=<<___ if ($i>=2); | ||
| 2118 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
| 2119 | ___ | ||
| 2120 | } | ||
| 2121 | $code.=<<___; | ||
| 2122 | movdqu 0x60($inp), @XMM[8+6] | ||
| 2123 | pxor @XMM[8+5], @XMM[5] | ||
| 2124 | movdqu 0x70($inp), @XMM[8+7] | ||
| 2125 | lea 0x80($inp), $inp | ||
| 2126 | movdqa @XMM[7], 0x70(%rsp) | ||
| 2127 | pxor @XMM[8+6], @XMM[6] | ||
| 2128 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2129 | pxor @XMM[8+7], @XMM[7] | ||
| 2130 | mov %edx, %r10d # pass rounds | ||
| 2131 | |||
| 2132 | call _bsaes_encrypt8 | ||
| 2133 | |||
| 2134 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2135 | pxor 0x10(%rsp), @XMM[1] | ||
| 2136 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2137 | pxor 0x20(%rsp), @XMM[4] | ||
| 2138 | movdqu @XMM[1], 0x10($out) | ||
| 2139 | pxor 0x30(%rsp), @XMM[6] | ||
| 2140 | movdqu @XMM[4], 0x20($out) | ||
| 2141 | pxor 0x40(%rsp), @XMM[3] | ||
| 2142 | movdqu @XMM[6], 0x30($out) | ||
| 2143 | pxor 0x50(%rsp), @XMM[7] | ||
| 2144 | movdqu @XMM[3], 0x40($out) | ||
| 2145 | pxor 0x60(%rsp), @XMM[2] | ||
| 2146 | movdqu @XMM[7], 0x50($out) | ||
| 2147 | pxor 0x70(%rsp), @XMM[5] | ||
| 2148 | movdqu @XMM[2], 0x60($out) | ||
| 2149 | movdqu @XMM[5], 0x70($out) | ||
| 2150 | lea 0x80($out), $out | ||
| 2151 | |||
| 2152 | movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak | ||
| 2153 | pxor $twtmp, $twtmp | ||
| 2154 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2155 | pcmpgtd @XMM[7], $twtmp | ||
| 2156 | pshufd \$0x13, $twtmp, $twres | ||
| 2157 | pxor $twtmp, $twtmp | ||
| 2158 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2159 | pand $twmask, $twres # isolate carry and residue | ||
| 2160 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2161 | pxor $twres, @XMM[7] | ||
| 2162 | |||
| 2163 | sub \$0x80,$len | ||
| 2164 | jnc .Lxts_enc_loop | ||
| 2165 | |||
| 2166 | .Lxts_enc_short: | ||
| 2167 | add \$0x80, $len | ||
| 2168 | jz .Lxts_enc_done | ||
| 2169 | ___ | ||
| 2170 | for ($i=0;$i<7;$i++) { | ||
| 2171 | $code.=<<___; | ||
| 2172 | pshufd \$0x13, $twtmp, $twres | ||
| 2173 | pxor $twtmp, $twtmp | ||
| 2174 | movdqa @XMM[7], @XMM[$i] | ||
| 2175 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
| 2176 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2177 | pand $twmask, $twres # isolate carry and residue | ||
| 2178 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2179 | pxor $twres, @XMM[7] | ||
| 2180 | ___ | ||
| 2181 | $code.=<<___ if ($i>=1); | ||
| 2182 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
| 2183 | cmp \$`0x10*$i`,$len | ||
| 2184 | je .Lxts_enc_$i | ||
| 2185 | ___ | ||
| 2186 | $code.=<<___ if ($i>=2); | ||
| 2187 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
| 2188 | ___ | ||
| 2189 | } | ||
| 2190 | $code.=<<___; | ||
| 2191 | movdqu 0x60($inp), @XMM[8+6] | ||
| 2192 | pxor @XMM[8+5], @XMM[5] | ||
| 2193 | movdqa @XMM[7], 0x70(%rsp) | ||
| 2194 | lea 0x70($inp), $inp | ||
| 2195 | pxor @XMM[8+6], @XMM[6] | ||
| 2196 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2197 | mov %edx, %r10d # pass rounds | ||
| 2198 | |||
| 2199 | call _bsaes_encrypt8 | ||
| 2200 | |||
| 2201 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2202 | pxor 0x10(%rsp), @XMM[1] | ||
| 2203 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2204 | pxor 0x20(%rsp), @XMM[4] | ||
| 2205 | movdqu @XMM[1], 0x10($out) | ||
| 2206 | pxor 0x30(%rsp), @XMM[6] | ||
| 2207 | movdqu @XMM[4], 0x20($out) | ||
| 2208 | pxor 0x40(%rsp), @XMM[3] | ||
| 2209 | movdqu @XMM[6], 0x30($out) | ||
| 2210 | pxor 0x50(%rsp), @XMM[7] | ||
| 2211 | movdqu @XMM[3], 0x40($out) | ||
| 2212 | pxor 0x60(%rsp), @XMM[2] | ||
| 2213 | movdqu @XMM[7], 0x50($out) | ||
| 2214 | movdqu @XMM[2], 0x60($out) | ||
| 2215 | lea 0x70($out), $out | ||
| 2216 | |||
| 2217 | movdqa 0x70(%rsp), @XMM[7] # next iteration tweak | ||
| 2218 | jmp .Lxts_enc_done | ||
| 2219 | .align 16 | ||
| 2220 | .Lxts_enc_6: | ||
| 2221 | pxor @XMM[8+4], @XMM[4] | ||
| 2222 | lea 0x60($inp), $inp | ||
| 2223 | pxor @XMM[8+5], @XMM[5] | ||
| 2224 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2225 | mov %edx, %r10d # pass rounds | ||
| 2226 | |||
| 2227 | call _bsaes_encrypt8 | ||
| 2228 | |||
| 2229 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2230 | pxor 0x10(%rsp), @XMM[1] | ||
| 2231 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2232 | pxor 0x20(%rsp), @XMM[4] | ||
| 2233 | movdqu @XMM[1], 0x10($out) | ||
| 2234 | pxor 0x30(%rsp), @XMM[6] | ||
| 2235 | movdqu @XMM[4], 0x20($out) | ||
| 2236 | pxor 0x40(%rsp), @XMM[3] | ||
| 2237 | movdqu @XMM[6], 0x30($out) | ||
| 2238 | pxor 0x50(%rsp), @XMM[7] | ||
| 2239 | movdqu @XMM[3], 0x40($out) | ||
| 2240 | movdqu @XMM[7], 0x50($out) | ||
| 2241 | lea 0x60($out), $out | ||
| 2242 | |||
| 2243 | movdqa 0x60(%rsp), @XMM[7] # next iteration tweak | ||
| 2244 | jmp .Lxts_enc_done | ||
| 2245 | .align 16 | ||
| 2246 | .Lxts_enc_5: | ||
| 2247 | pxor @XMM[8+3], @XMM[3] | ||
| 2248 | lea 0x50($inp), $inp | ||
| 2249 | pxor @XMM[8+4], @XMM[4] | ||
| 2250 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2251 | mov %edx, %r10d # pass rounds | ||
| 2252 | |||
| 2253 | call _bsaes_encrypt8 | ||
| 2254 | |||
| 2255 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2256 | pxor 0x10(%rsp), @XMM[1] | ||
| 2257 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2258 | pxor 0x20(%rsp), @XMM[4] | ||
| 2259 | movdqu @XMM[1], 0x10($out) | ||
| 2260 | pxor 0x30(%rsp), @XMM[6] | ||
| 2261 | movdqu @XMM[4], 0x20($out) | ||
| 2262 | pxor 0x40(%rsp), @XMM[3] | ||
| 2263 | movdqu @XMM[6], 0x30($out) | ||
| 2264 | movdqu @XMM[3], 0x40($out) | ||
| 2265 | lea 0x50($out), $out | ||
| 2266 | |||
| 2267 | movdqa 0x50(%rsp), @XMM[7] # next iteration tweak | ||
| 2268 | jmp .Lxts_enc_done | ||
| 2269 | .align 16 | ||
| 2270 | .Lxts_enc_4: | ||
| 2271 | pxor @XMM[8+2], @XMM[2] | ||
| 2272 | lea 0x40($inp), $inp | ||
| 2273 | pxor @XMM[8+3], @XMM[3] | ||
| 2274 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2275 | mov %edx, %r10d # pass rounds | ||
| 2276 | |||
| 2277 | call _bsaes_encrypt8 | ||
| 2278 | |||
| 2279 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2280 | pxor 0x10(%rsp), @XMM[1] | ||
| 2281 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2282 | pxor 0x20(%rsp), @XMM[4] | ||
| 2283 | movdqu @XMM[1], 0x10($out) | ||
| 2284 | pxor 0x30(%rsp), @XMM[6] | ||
| 2285 | movdqu @XMM[4], 0x20($out) | ||
| 2286 | movdqu @XMM[6], 0x30($out) | ||
| 2287 | lea 0x40($out), $out | ||
| 2288 | |||
| 2289 | movdqa 0x40(%rsp), @XMM[7] # next iteration tweak | ||
| 2290 | jmp .Lxts_enc_done | ||
| 2291 | .align 16 | ||
| 2292 | .Lxts_enc_3: | ||
| 2293 | pxor @XMM[8+1], @XMM[1] | ||
| 2294 | lea 0x30($inp), $inp | ||
| 2295 | pxor @XMM[8+2], @XMM[2] | ||
| 2296 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2297 | mov %edx, %r10d # pass rounds | ||
| 2298 | |||
| 2299 | call _bsaes_encrypt8 | ||
| 2300 | |||
| 2301 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2302 | pxor 0x10(%rsp), @XMM[1] | ||
| 2303 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2304 | pxor 0x20(%rsp), @XMM[4] | ||
| 2305 | movdqu @XMM[1], 0x10($out) | ||
| 2306 | movdqu @XMM[4], 0x20($out) | ||
| 2307 | lea 0x30($out), $out | ||
| 2308 | |||
| 2309 | movdqa 0x30(%rsp), @XMM[7] # next iteration tweak | ||
| 2310 | jmp .Lxts_enc_done | ||
| 2311 | .align 16 | ||
| 2312 | .Lxts_enc_2: | ||
| 2313 | pxor @XMM[8+0], @XMM[0] | ||
| 2314 | lea 0x20($inp), $inp | ||
| 2315 | pxor @XMM[8+1], @XMM[1] | ||
| 2316 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2317 | mov %edx, %r10d # pass rounds | ||
| 2318 | |||
| 2319 | call _bsaes_encrypt8 | ||
| 2320 | |||
| 2321 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2322 | pxor 0x10(%rsp), @XMM[1] | ||
| 2323 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2324 | movdqu @XMM[1], 0x10($out) | ||
| 2325 | lea 0x20($out), $out | ||
| 2326 | |||
| 2327 | movdqa 0x20(%rsp), @XMM[7] # next iteration tweak | ||
| 2328 | jmp .Lxts_enc_done | ||
| 2329 | .align 16 | ||
| 2330 | .Lxts_enc_1: | ||
| 2331 | pxor @XMM[0], @XMM[8] | ||
| 2332 | lea 0x10($inp), $inp | ||
| 2333 | movdqa @XMM[8], 0x20(%rbp) | ||
| 2334 | lea 0x20(%rbp), $arg1 | ||
| 2335 | lea 0x20(%rbp), $arg2 | ||
| 2336 | lea ($key), $arg3 | ||
| 2337 | call asm_AES_encrypt # doesn't touch %xmm | ||
| 2338 | pxor 0x20(%rbp), @XMM[0] # ^= tweak[] | ||
| 2339 | #pxor @XMM[8], @XMM[0] | ||
| 2340 | #lea 0x80(%rsp), %rax # pass key schedule | ||
| 2341 | #mov %edx, %r10d # pass rounds | ||
| 2342 | #call _bsaes_encrypt8 | ||
| 2343 | #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2344 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2345 | lea 0x10($out), $out | ||
| 2346 | |||
| 2347 | movdqa 0x10(%rsp), @XMM[7] # next iteration tweak | ||
| 2348 | |||
| 2349 | .Lxts_enc_done: | ||
| 2350 | and \$15, %ebx | ||
| 2351 | jz .Lxts_enc_ret | ||
| 2352 | mov $out, %rdx | ||
| 2353 | |||
| 2354 | .Lxts_enc_steal: | ||
| 2355 | movzb ($inp), %eax | ||
| 2356 | movzb -16(%rdx), %ecx | ||
| 2357 | lea 1($inp), $inp | ||
| 2358 | mov %al, -16(%rdx) | ||
| 2359 | mov %cl, 0(%rdx) | ||
| 2360 | lea 1(%rdx), %rdx | ||
| 2361 | sub \$1,%ebx | ||
| 2362 | jnz .Lxts_enc_steal | ||
| 2363 | |||
| 2364 | movdqu -16($out), @XMM[0] | ||
| 2365 | lea 0x20(%rbp), $arg1 | ||
| 2366 | pxor @XMM[7], @XMM[0] | ||
| 2367 | lea 0x20(%rbp), $arg2 | ||
| 2368 | movdqa @XMM[0], 0x20(%rbp) | ||
| 2369 | lea ($key), $arg3 | ||
| 2370 | call asm_AES_encrypt # doesn't touch %xmm | ||
| 2371 | pxor 0x20(%rbp), @XMM[7] | ||
| 2372 | movdqu @XMM[7], -16($out) | ||
| 2373 | |||
| 2374 | .Lxts_enc_ret: | ||
| 2375 | lea (%rsp), %rax | ||
| 2376 | pxor %xmm0, %xmm0 | ||
| 2377 | .Lxts_enc_bzero: # wipe key schedule [if any] | ||
| 2378 | movdqa %xmm0, 0x00(%rax) | ||
| 2379 | movdqa %xmm0, 0x10(%rax) | ||
| 2380 | lea 0x20(%rax), %rax | ||
| 2381 | cmp %rax, %rbp | ||
| 2382 | ja .Lxts_enc_bzero | ||
| 2383 | |||
| 2384 | lea (%rbp),%rsp # restore %rsp | ||
| 2385 | ___ | ||
| 2386 | $code.=<<___ if ($win64); | ||
| 2387 | movaps 0x40(%rbp), %xmm6 | ||
| 2388 | movaps 0x50(%rbp), %xmm7 | ||
| 2389 | movaps 0x60(%rbp), %xmm8 | ||
| 2390 | movaps 0x70(%rbp), %xmm9 | ||
| 2391 | movaps 0x80(%rbp), %xmm10 | ||
| 2392 | movaps 0x90(%rbp), %xmm11 | ||
| 2393 | movaps 0xa0(%rbp), %xmm12 | ||
| 2394 | movaps 0xb0(%rbp), %xmm13 | ||
| 2395 | movaps 0xc0(%rbp), %xmm14 | ||
| 2396 | movaps 0xd0(%rbp), %xmm15 | ||
| 2397 | lea 0xa0(%rbp), %rsp | ||
| 2398 | ___ | ||
| 2399 | $code.=<<___; | ||
| 2400 | mov 0x48(%rsp), %r15 | ||
| 2401 | mov 0x50(%rsp), %r14 | ||
| 2402 | mov 0x58(%rsp), %r13 | ||
| 2403 | mov 0x60(%rsp), %r12 | ||
| 2404 | mov 0x68(%rsp), %rbx | ||
| 2405 | mov 0x70(%rsp), %rax | ||
| 2406 | lea 0x78(%rsp), %rsp | ||
| 2407 | mov %rax, %rbp | ||
| 2408 | .Lxts_enc_epilogue: | ||
| 2409 | ret | ||
| 2410 | .size bsaes_xts_encrypt,.-bsaes_xts_encrypt | ||
| 2411 | |||
| 2412 | .globl bsaes_xts_decrypt | ||
| 2413 | .type bsaes_xts_decrypt,\@abi-omnipotent | ||
| 2414 | .align 16 | ||
| 2415 | bsaes_xts_decrypt: | ||
| 2416 | mov %rsp, %rax | ||
| 2417 | .Lxts_dec_prologue: | ||
| 2418 | push %rbp | ||
| 2419 | push %rbx | ||
| 2420 | push %r12 | ||
| 2421 | push %r13 | ||
| 2422 | push %r14 | ||
| 2423 | push %r15 | ||
| 2424 | lea -0x48(%rsp), %rsp | ||
| 2425 | ___ | ||
| 2426 | $code.=<<___ if ($win64); | ||
| 2427 | mov 0xa0(%rsp),$arg5 # pull key2 | ||
| 2428 | mov 0xa8(%rsp),$arg6 # pull ivp | ||
| 2429 | lea -0xa0(%rsp), %rsp | ||
| 2430 | movaps %xmm6, 0x40(%rsp) | ||
| 2431 | movaps %xmm7, 0x50(%rsp) | ||
| 2432 | movaps %xmm8, 0x60(%rsp) | ||
| 2433 | movaps %xmm9, 0x70(%rsp) | ||
| 2434 | movaps %xmm10, 0x80(%rsp) | ||
| 2435 | movaps %xmm11, 0x90(%rsp) | ||
| 2436 | movaps %xmm12, 0xa0(%rsp) | ||
| 2437 | movaps %xmm13, 0xb0(%rsp) | ||
| 2438 | movaps %xmm14, 0xc0(%rsp) | ||
| 2439 | movaps %xmm15, 0xd0(%rsp) | ||
| 2440 | .Lxts_dec_body: | ||
| 2441 | ___ | ||
| 2442 | $code.=<<___; | ||
| 2443 | mov %rsp, %rbp # backup %rsp | ||
| 2444 | mov $arg1, $inp # backup arguments | ||
| 2445 | mov $arg2, $out | ||
| 2446 | mov $arg3, $len | ||
| 2447 | mov $arg4, $key | ||
| 2448 | |||
| 2449 | lea ($arg6), $arg1 | ||
| 2450 | lea 0x20(%rbp), $arg2 | ||
| 2451 | lea ($arg5), $arg3 | ||
| 2452 | call asm_AES_encrypt # generate initial tweak | ||
| 2453 | |||
| 2454 | mov 240($key), %eax # rounds | ||
| 2455 | mov $len, %rbx # backup $len | ||
| 2456 | |||
| 2457 | mov %eax, %edx # rounds | ||
| 2458 | shl \$7, %rax # 128 bytes per inner round key | ||
| 2459 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
| 2460 | sub %rax, %rsp | ||
| 2461 | |||
| 2462 | mov %rsp, %rax # pass key schedule | ||
| 2463 | mov $key, %rcx # pass key | ||
| 2464 | mov %edx, %r10d # pass rounds | ||
| 2465 | call _bsaes_key_convert | ||
| 2466 | pxor (%rsp), %xmm7 # fix up round 0 key | ||
| 2467 | movdqa %xmm6, (%rax) # save last round key | ||
| 2468 | movdqa %xmm7, (%rsp) | ||
| 2469 | |||
| 2470 | xor %eax, %eax # if ($len%16) len-=16; | ||
| 2471 | and \$-16, $len | ||
| 2472 | test \$15, %ebx | ||
| 2473 | setnz %al | ||
| 2474 | shl \$4, %rax | ||
| 2475 | sub %rax, $len | ||
| 2476 | |||
| 2477 | sub \$0x80, %rsp # place for tweak[8] | ||
| 2478 | movdqa 0x20(%rbp), @XMM[7] # initial tweak | ||
| 2479 | |||
| 2480 | pxor $twtmp, $twtmp | ||
| 2481 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2482 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2483 | |||
| 2484 | sub \$0x80, $len | ||
| 2485 | jc .Lxts_dec_short | ||
| 2486 | jmp .Lxts_dec_loop | ||
| 2487 | |||
| 2488 | .align 16 | ||
| 2489 | .Lxts_dec_loop: | ||
| 2490 | ___ | ||
| 2491 | for ($i=0;$i<7;$i++) { | ||
| 2492 | $code.=<<___; | ||
| 2493 | pshufd \$0x13, $twtmp, $twres | ||
| 2494 | pxor $twtmp, $twtmp | ||
| 2495 | movdqa @XMM[7], @XMM[$i] | ||
| 2496 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
| 2497 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2498 | pand $twmask, $twres # isolate carry and residue | ||
| 2499 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2500 | pxor $twres, @XMM[7] | ||
| 2501 | ___ | ||
| 2502 | $code.=<<___ if ($i>=1); | ||
| 2503 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
| 2504 | ___ | ||
| 2505 | $code.=<<___ if ($i>=2); | ||
| 2506 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
| 2507 | ___ | ||
| 2508 | } | ||
| 2509 | $code.=<<___; | ||
| 2510 | movdqu 0x60($inp), @XMM[8+6] | ||
| 2511 | pxor @XMM[8+5], @XMM[5] | ||
| 2512 | movdqu 0x70($inp), @XMM[8+7] | ||
| 2513 | lea 0x80($inp), $inp | ||
| 2514 | movdqa @XMM[7], 0x70(%rsp) | ||
| 2515 | pxor @XMM[8+6], @XMM[6] | ||
| 2516 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2517 | pxor @XMM[8+7], @XMM[7] | ||
| 2518 | mov %edx, %r10d # pass rounds | ||
| 2519 | |||
| 2520 | call _bsaes_decrypt8 | ||
| 2521 | |||
| 2522 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2523 | pxor 0x10(%rsp), @XMM[1] | ||
| 2524 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2525 | pxor 0x20(%rsp), @XMM[6] | ||
| 2526 | movdqu @XMM[1], 0x10($out) | ||
| 2527 | pxor 0x30(%rsp), @XMM[4] | ||
| 2528 | movdqu @XMM[6], 0x20($out) | ||
| 2529 | pxor 0x40(%rsp), @XMM[2] | ||
| 2530 | movdqu @XMM[4], 0x30($out) | ||
| 2531 | pxor 0x50(%rsp), @XMM[7] | ||
| 2532 | movdqu @XMM[2], 0x40($out) | ||
| 2533 | pxor 0x60(%rsp), @XMM[3] | ||
| 2534 | movdqu @XMM[7], 0x50($out) | ||
| 2535 | pxor 0x70(%rsp), @XMM[5] | ||
| 2536 | movdqu @XMM[3], 0x60($out) | ||
| 2537 | movdqu @XMM[5], 0x70($out) | ||
| 2538 | lea 0x80($out), $out | ||
| 2539 | |||
| 2540 | movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak | ||
| 2541 | pxor $twtmp, $twtmp | ||
| 2542 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2543 | pcmpgtd @XMM[7], $twtmp | ||
| 2544 | pshufd \$0x13, $twtmp, $twres | ||
| 2545 | pxor $twtmp, $twtmp | ||
| 2546 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2547 | pand $twmask, $twres # isolate carry and residue | ||
| 2548 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2549 | pxor $twres, @XMM[7] | ||
| 2550 | |||
| 2551 | sub \$0x80,$len | ||
| 2552 | jnc .Lxts_dec_loop | ||
| 2553 | |||
| 2554 | .Lxts_dec_short: | ||
| 2555 | add \$0x80, $len | ||
| 2556 | jz .Lxts_dec_done | ||
| 2557 | ___ | ||
| 2558 | for ($i=0;$i<7;$i++) { | ||
| 2559 | $code.=<<___; | ||
| 2560 | pshufd \$0x13, $twtmp, $twres | ||
| 2561 | pxor $twtmp, $twtmp | ||
| 2562 | movdqa @XMM[7], @XMM[$i] | ||
| 2563 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
| 2564 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2565 | pand $twmask, $twres # isolate carry and residue | ||
| 2566 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2567 | pxor $twres, @XMM[7] | ||
| 2568 | ___ | ||
| 2569 | $code.=<<___ if ($i>=1); | ||
| 2570 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
| 2571 | cmp \$`0x10*$i`,$len | ||
| 2572 | je .Lxts_dec_$i | ||
| 2573 | ___ | ||
| 2574 | $code.=<<___ if ($i>=2); | ||
| 2575 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
| 2576 | ___ | ||
| 2577 | } | ||
| 2578 | $code.=<<___; | ||
| 2579 | movdqu 0x60($inp), @XMM[8+6] | ||
| 2580 | pxor @XMM[8+5], @XMM[5] | ||
| 2581 | movdqa @XMM[7], 0x70(%rsp) | ||
| 2582 | lea 0x70($inp), $inp | ||
| 2583 | pxor @XMM[8+6], @XMM[6] | ||
| 2584 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2585 | mov %edx, %r10d # pass rounds | ||
| 2586 | |||
| 2587 | call _bsaes_decrypt8 | ||
| 2588 | |||
| 2589 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2590 | pxor 0x10(%rsp), @XMM[1] | ||
| 2591 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2592 | pxor 0x20(%rsp), @XMM[6] | ||
| 2593 | movdqu @XMM[1], 0x10($out) | ||
| 2594 | pxor 0x30(%rsp), @XMM[4] | ||
| 2595 | movdqu @XMM[6], 0x20($out) | ||
| 2596 | pxor 0x40(%rsp), @XMM[2] | ||
| 2597 | movdqu @XMM[4], 0x30($out) | ||
| 2598 | pxor 0x50(%rsp), @XMM[7] | ||
| 2599 | movdqu @XMM[2], 0x40($out) | ||
| 2600 | pxor 0x60(%rsp), @XMM[3] | ||
| 2601 | movdqu @XMM[7], 0x50($out) | ||
| 2602 | movdqu @XMM[3], 0x60($out) | ||
| 2603 | lea 0x70($out), $out | ||
| 2604 | |||
| 2605 | movdqa 0x70(%rsp), @XMM[7] # next iteration tweak | ||
| 2606 | jmp .Lxts_dec_done | ||
| 2607 | .align 16 | ||
| 2608 | .Lxts_dec_6: | ||
| 2609 | pxor @XMM[8+4], @XMM[4] | ||
| 2610 | lea 0x60($inp), $inp | ||
| 2611 | pxor @XMM[8+5], @XMM[5] | ||
| 2612 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2613 | mov %edx, %r10d # pass rounds | ||
| 2614 | |||
| 2615 | call _bsaes_decrypt8 | ||
| 2616 | |||
| 2617 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2618 | pxor 0x10(%rsp), @XMM[1] | ||
| 2619 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2620 | pxor 0x20(%rsp), @XMM[6] | ||
| 2621 | movdqu @XMM[1], 0x10($out) | ||
| 2622 | pxor 0x30(%rsp), @XMM[4] | ||
| 2623 | movdqu @XMM[6], 0x20($out) | ||
| 2624 | pxor 0x40(%rsp), @XMM[2] | ||
| 2625 | movdqu @XMM[4], 0x30($out) | ||
| 2626 | pxor 0x50(%rsp), @XMM[7] | ||
| 2627 | movdqu @XMM[2], 0x40($out) | ||
| 2628 | movdqu @XMM[7], 0x50($out) | ||
| 2629 | lea 0x60($out), $out | ||
| 2630 | |||
| 2631 | movdqa 0x60(%rsp), @XMM[7] # next iteration tweak | ||
| 2632 | jmp .Lxts_dec_done | ||
| 2633 | .align 16 | ||
| 2634 | .Lxts_dec_5: | ||
| 2635 | pxor @XMM[8+3], @XMM[3] | ||
| 2636 | lea 0x50($inp), $inp | ||
| 2637 | pxor @XMM[8+4], @XMM[4] | ||
| 2638 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2639 | mov %edx, %r10d # pass rounds | ||
| 2640 | |||
| 2641 | call _bsaes_decrypt8 | ||
| 2642 | |||
| 2643 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2644 | pxor 0x10(%rsp), @XMM[1] | ||
| 2645 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2646 | pxor 0x20(%rsp), @XMM[6] | ||
| 2647 | movdqu @XMM[1], 0x10($out) | ||
| 2648 | pxor 0x30(%rsp), @XMM[4] | ||
| 2649 | movdqu @XMM[6], 0x20($out) | ||
| 2650 | pxor 0x40(%rsp), @XMM[2] | ||
| 2651 | movdqu @XMM[4], 0x30($out) | ||
| 2652 | movdqu @XMM[2], 0x40($out) | ||
| 2653 | lea 0x50($out), $out | ||
| 2654 | |||
| 2655 | movdqa 0x50(%rsp), @XMM[7] # next iteration tweak | ||
| 2656 | jmp .Lxts_dec_done | ||
| 2657 | .align 16 | ||
| 2658 | .Lxts_dec_4: | ||
| 2659 | pxor @XMM[8+2], @XMM[2] | ||
| 2660 | lea 0x40($inp), $inp | ||
| 2661 | pxor @XMM[8+3], @XMM[3] | ||
| 2662 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2663 | mov %edx, %r10d # pass rounds | ||
| 2664 | |||
| 2665 | call _bsaes_decrypt8 | ||
| 2666 | |||
| 2667 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2668 | pxor 0x10(%rsp), @XMM[1] | ||
| 2669 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2670 | pxor 0x20(%rsp), @XMM[6] | ||
| 2671 | movdqu @XMM[1], 0x10($out) | ||
| 2672 | pxor 0x30(%rsp), @XMM[4] | ||
| 2673 | movdqu @XMM[6], 0x20($out) | ||
| 2674 | movdqu @XMM[4], 0x30($out) | ||
| 2675 | lea 0x40($out), $out | ||
| 2676 | |||
| 2677 | movdqa 0x40(%rsp), @XMM[7] # next iteration tweak | ||
| 2678 | jmp .Lxts_dec_done | ||
| 2679 | .align 16 | ||
| 2680 | .Lxts_dec_3: | ||
| 2681 | pxor @XMM[8+1], @XMM[1] | ||
| 2682 | lea 0x30($inp), $inp | ||
| 2683 | pxor @XMM[8+2], @XMM[2] | ||
| 2684 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2685 | mov %edx, %r10d # pass rounds | ||
| 2686 | |||
| 2687 | call _bsaes_decrypt8 | ||
| 2688 | |||
| 2689 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2690 | pxor 0x10(%rsp), @XMM[1] | ||
| 2691 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2692 | pxor 0x20(%rsp), @XMM[6] | ||
| 2693 | movdqu @XMM[1], 0x10($out) | ||
| 2694 | movdqu @XMM[6], 0x20($out) | ||
| 2695 | lea 0x30($out), $out | ||
| 2696 | |||
| 2697 | movdqa 0x30(%rsp), @XMM[7] # next iteration tweak | ||
| 2698 | jmp .Lxts_dec_done | ||
| 2699 | .align 16 | ||
| 2700 | .Lxts_dec_2: | ||
| 2701 | pxor @XMM[8+0], @XMM[0] | ||
| 2702 | lea 0x20($inp), $inp | ||
| 2703 | pxor @XMM[8+1], @XMM[1] | ||
| 2704 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2705 | mov %edx, %r10d # pass rounds | ||
| 2706 | |||
| 2707 | call _bsaes_decrypt8 | ||
| 2708 | |||
| 2709 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2710 | pxor 0x10(%rsp), @XMM[1] | ||
| 2711 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2712 | movdqu @XMM[1], 0x10($out) | ||
| 2713 | lea 0x20($out), $out | ||
| 2714 | |||
| 2715 | movdqa 0x20(%rsp), @XMM[7] # next iteration tweak | ||
| 2716 | jmp .Lxts_dec_done | ||
| 2717 | .align 16 | ||
| 2718 | .Lxts_dec_1: | ||
| 2719 | pxor @XMM[0], @XMM[8] | ||
| 2720 | lea 0x10($inp), $inp | ||
| 2721 | movdqa @XMM[8], 0x20(%rbp) | ||
| 2722 | lea 0x20(%rbp), $arg1 | ||
| 2723 | lea 0x20(%rbp), $arg2 | ||
| 2724 | lea ($key), $arg3 | ||
| 2725 | call asm_AES_decrypt # doesn't touch %xmm | ||
| 2726 | pxor 0x20(%rbp), @XMM[0] # ^= tweak[] | ||
| 2727 | #pxor @XMM[8], @XMM[0] | ||
| 2728 | #lea 0x80(%rsp), %rax # pass key schedule | ||
| 2729 | #mov %edx, %r10d # pass rounds | ||
| 2730 | #call _bsaes_decrypt8 | ||
| 2731 | #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2732 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2733 | lea 0x10($out), $out | ||
| 2734 | |||
| 2735 | movdqa 0x10(%rsp), @XMM[7] # next iteration tweak | ||
| 2736 | |||
| 2737 | .Lxts_dec_done: | ||
| 2738 | and \$15, %ebx | ||
| 2739 | jz .Lxts_dec_ret | ||
| 2740 | |||
| 2741 | pxor $twtmp, $twtmp | ||
| 2742 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2743 | pcmpgtd @XMM[7], $twtmp | ||
| 2744 | pshufd \$0x13, $twtmp, $twres | ||
| 2745 | movdqa @XMM[7], @XMM[6] | ||
| 2746 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2747 | pand $twmask, $twres # isolate carry and residue | ||
| 2748 | movdqu ($inp), @XMM[0] | ||
| 2749 | pxor $twres, @XMM[7] | ||
| 2750 | |||
| 2751 | lea 0x20(%rbp), $arg1 | ||
| 2752 | pxor @XMM[7], @XMM[0] | ||
| 2753 | lea 0x20(%rbp), $arg2 | ||
| 2754 | movdqa @XMM[0], 0x20(%rbp) | ||
| 2755 | lea ($key), $arg3 | ||
| 2756 | call asm_AES_decrypt # doesn't touch %xmm | ||
| 2757 | pxor 0x20(%rbp), @XMM[7] | ||
| 2758 | mov $out, %rdx | ||
| 2759 | movdqu @XMM[7], ($out) | ||
| 2760 | |||
| 2761 | .Lxts_dec_steal: | ||
| 2762 | movzb 16($inp), %eax | ||
| 2763 | movzb (%rdx), %ecx | ||
| 2764 | lea 1($inp), $inp | ||
| 2765 | mov %al, (%rdx) | ||
| 2766 | mov %cl, 16(%rdx) | ||
| 2767 | lea 1(%rdx), %rdx | ||
| 2768 | sub \$1,%ebx | ||
| 2769 | jnz .Lxts_dec_steal | ||
| 2770 | |||
| 2771 | movdqu ($out), @XMM[0] | ||
| 2772 | lea 0x20(%rbp), $arg1 | ||
| 2773 | pxor @XMM[6], @XMM[0] | ||
| 2774 | lea 0x20(%rbp), $arg2 | ||
| 2775 | movdqa @XMM[0], 0x20(%rbp) | ||
| 2776 | lea ($key), $arg3 | ||
| 2777 | call asm_AES_decrypt # doesn't touch %xmm | ||
| 2778 | pxor 0x20(%rbp), @XMM[6] | ||
| 2779 | movdqu @XMM[6], ($out) | ||
| 2780 | |||
| 2781 | .Lxts_dec_ret: | ||
| 2782 | lea (%rsp), %rax | ||
| 2783 | pxor %xmm0, %xmm0 | ||
| 2784 | .Lxts_dec_bzero: # wipe key schedule [if any] | ||
| 2785 | movdqa %xmm0, 0x00(%rax) | ||
| 2786 | movdqa %xmm0, 0x10(%rax) | ||
| 2787 | lea 0x20(%rax), %rax | ||
| 2788 | cmp %rax, %rbp | ||
| 2789 | ja .Lxts_dec_bzero | ||
| 2790 | |||
| 2791 | lea (%rbp),%rsp # restore %rsp | ||
| 2792 | ___ | ||
| 2793 | $code.=<<___ if ($win64); | ||
| 2794 | movaps 0x40(%rbp), %xmm6 | ||
| 2795 | movaps 0x50(%rbp), %xmm7 | ||
| 2796 | movaps 0x60(%rbp), %xmm8 | ||
| 2797 | movaps 0x70(%rbp), %xmm9 | ||
| 2798 | movaps 0x80(%rbp), %xmm10 | ||
| 2799 | movaps 0x90(%rbp), %xmm11 | ||
| 2800 | movaps 0xa0(%rbp), %xmm12 | ||
| 2801 | movaps 0xb0(%rbp), %xmm13 | ||
| 2802 | movaps 0xc0(%rbp), %xmm14 | ||
| 2803 | movaps 0xd0(%rbp), %xmm15 | ||
| 2804 | lea 0xa0(%rbp), %rsp | ||
| 2805 | ___ | ||
| 2806 | $code.=<<___; | ||
| 2807 | mov 0x48(%rsp), %r15 | ||
| 2808 | mov 0x50(%rsp), %r14 | ||
| 2809 | mov 0x58(%rsp), %r13 | ||
| 2810 | mov 0x60(%rsp), %r12 | ||
| 2811 | mov 0x68(%rsp), %rbx | ||
| 2812 | mov 0x70(%rsp), %rax | ||
| 2813 | lea 0x78(%rsp), %rsp | ||
| 2814 | mov %rax, %rbp | ||
| 2815 | .Lxts_dec_epilogue: | ||
| 2816 | ret | ||
| 2817 | .size bsaes_xts_decrypt,.-bsaes_xts_decrypt | ||
| 2818 | ___ | ||
| 2819 | } | ||
| 2820 | $code.=<<___; | ||
| 2821 | .type _bsaes_const,\@object | ||
| 2822 | .align 64 | ||
| 2823 | _bsaes_const: | ||
| 2824 | .LM0ISR: # InvShiftRows constants | ||
| 2825 | .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 | ||
| 2826 | .LISRM0: | ||
| 2827 | .quad 0x01040b0e0205080f, 0x0306090c00070a0d | ||
| 2828 | .LISR: | ||
| 2829 | .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 | ||
| 2830 | .LBS0: # bit-slice constants | ||
| 2831 | .quad 0x5555555555555555, 0x5555555555555555 | ||
| 2832 | .LBS1: | ||
| 2833 | .quad 0x3333333333333333, 0x3333333333333333 | ||
| 2834 | .LBS2: | ||
| 2835 | .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f | ||
| 2836 | .LSR: # shiftrows constants | ||
| 2837 | .quad 0x0504070600030201, 0x0f0e0d0c0a09080b | ||
| 2838 | .LSRM0: | ||
| 2839 | .quad 0x0304090e00050a0f, 0x01060b0c0207080d | ||
| 2840 | .LM0SR: | ||
| 2841 | .quad 0x0a0e02060f03070b, 0x0004080c05090d01 | ||
| 2842 | .LSWPUP: # byte-swap upper dword | ||
| 2843 | .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 | ||
| 2844 | .LSWPUPM0SR: | ||
| 2845 | .quad 0x0a0d02060c03070b, 0x0004080f05090e01 | ||
| 2846 | .LADD1: # counter increment constants | ||
| 2847 | .quad 0x0000000000000000, 0x0000000100000000 | ||
| 2848 | .LADD2: | ||
| 2849 | .quad 0x0000000000000000, 0x0000000200000000 | ||
| 2850 | .LADD3: | ||
| 2851 | .quad 0x0000000000000000, 0x0000000300000000 | ||
| 2852 | .LADD4: | ||
| 2853 | .quad 0x0000000000000000, 0x0000000400000000 | ||
| 2854 | .LADD5: | ||
| 2855 | .quad 0x0000000000000000, 0x0000000500000000 | ||
| 2856 | .LADD6: | ||
| 2857 | .quad 0x0000000000000000, 0x0000000600000000 | ||
| 2858 | .LADD7: | ||
| 2859 | .quad 0x0000000000000000, 0x0000000700000000 | ||
| 2860 | .LADD8: | ||
| 2861 | .quad 0x0000000000000000, 0x0000000800000000 | ||
| 2862 | .Lxts_magic: | ||
| 2863 | .long 0x87,0,1,0 | ||
| 2864 | .Lmasks: | ||
| 2865 | .quad 0x0101010101010101, 0x0101010101010101 | ||
| 2866 | .quad 0x0202020202020202, 0x0202020202020202 | ||
| 2867 | .quad 0x0404040404040404, 0x0404040404040404 | ||
| 2868 | .quad 0x0808080808080808, 0x0808080808080808 | ||
| 2869 | .LM0: | ||
| 2870 | .quad 0x02060a0e03070b0f, 0x0004080c0105090d | ||
| 2871 | .L63: | ||
| 2872 | .quad 0x6363636363636363, 0x6363636363636363 | ||
| 2873 | .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov" | ||
| 2874 | .align 64 | ||
| 2875 | .size _bsaes_const,.-_bsaes_const | ||
| 2876 | ___ | ||
| 2877 | |||
| 2878 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 2879 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 2880 | if ($win64) { | ||
| 2881 | $rec="%rcx"; | ||
| 2882 | $frame="%rdx"; | ||
| 2883 | $context="%r8"; | ||
| 2884 | $disp="%r9"; | ||
| 2885 | |||
| 2886 | $code.=<<___; | ||
| 2887 | .extern __imp_RtlVirtualUnwind | ||
| 2888 | .type se_handler,\@abi-omnipotent | ||
| 2889 | .align 16 | ||
| 2890 | se_handler: | ||
| 2891 | push %rsi | ||
| 2892 | push %rdi | ||
| 2893 | push %rbx | ||
| 2894 | push %rbp | ||
| 2895 | push %r12 | ||
| 2896 | push %r13 | ||
| 2897 | push %r14 | ||
| 2898 | push %r15 | ||
| 2899 | pushfq | ||
| 2900 | sub \$64,%rsp | ||
| 2901 | |||
| 2902 | mov 120($context),%rax # pull context->Rax | ||
| 2903 | mov 248($context),%rbx # pull context->Rip | ||
| 2904 | |||
| 2905 | mov 8($disp),%rsi # disp->ImageBase | ||
| 2906 | mov 56($disp),%r11 # disp->HandlerData | ||
| 2907 | |||
| 2908 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 2909 | lea (%rsi,%r10),%r10 # prologue label | ||
| 2910 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 2911 | jb .Lin_prologue | ||
| 2912 | |||
| 2913 | mov 152($context),%rax # pull context->Rsp | ||
| 2914 | |||
| 2915 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 2916 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 2917 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 2918 | jae .Lin_prologue | ||
| 2919 | |||
| 2920 | mov 160($context),%rax # pull context->Rbp | ||
| 2921 | |||
| 2922 | lea 0x40(%rax),%rsi # %xmm save area | ||
| 2923 | lea 512($context),%rdi # &context.Xmm6 | ||
| 2924 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
| 2925 | .long 0xa548f3fc # cld; rep movsq | ||
| 2926 | lea 0xa0(%rax),%rax # adjust stack pointer | ||
| 2927 | |||
| 2928 | mov 0x70(%rax),%rbp | ||
| 2929 | mov 0x68(%rax),%rbx | ||
| 2930 | mov 0x60(%rax),%r12 | ||
| 2931 | mov 0x58(%rax),%r13 | ||
| 2932 | mov 0x50(%rax),%r14 | ||
| 2933 | mov 0x48(%rax),%r15 | ||
| 2934 | lea 0x78(%rax),%rax # adjust stack pointer | ||
| 2935 | mov %rbx,144($context) # restore context->Rbx | ||
| 2936 | mov %rbp,160($context) # restore context->Rbp | ||
| 2937 | mov %r12,216($context) # restore context->R12 | ||
| 2938 | mov %r13,224($context) # restore context->R13 | ||
| 2939 | mov %r14,232($context) # restore context->R14 | ||
| 2940 | mov %r15,240($context) # restore context->R15 | ||
| 2941 | |||
| 2942 | .Lin_prologue: | ||
| 2943 | mov %rax,152($context) # restore context->Rsp | ||
| 2944 | |||
| 2945 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 2946 | mov $context,%rsi # context | ||
| 2947 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
| 2948 | .long 0xa548f3fc # cld; rep movsq | ||
| 2949 | |||
| 2950 | mov $disp,%rsi | ||
| 2951 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 2952 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 2953 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 2954 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 2955 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 2956 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 2957 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 2958 | mov %r10,32(%rsp) # arg5 | ||
| 2959 | mov %r11,40(%rsp) # arg6 | ||
| 2960 | mov %r12,48(%rsp) # arg7 | ||
| 2961 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 2962 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 2963 | |||
| 2964 | mov \$1,%eax # ExceptionContinueSearch | ||
| 2965 | add \$64,%rsp | ||
| 2966 | popfq | ||
| 2967 | pop %r15 | ||
| 2968 | pop %r14 | ||
| 2969 | pop %r13 | ||
| 2970 | pop %r12 | ||
| 2971 | pop %rbp | ||
| 2972 | pop %rbx | ||
| 2973 | pop %rdi | ||
| 2974 | pop %rsi | ||
| 2975 | ret | ||
| 2976 | .size se_handler,.-se_handler | ||
| 2977 | |||
| 2978 | .section .pdata | ||
| 2979 | .align 4 | ||
| 2980 | ___ | ||
| 2981 | $code.=<<___ if ($ecb); | ||
| 2982 | .rva .Lecb_enc_prologue | ||
| 2983 | .rva .Lecb_enc_epilogue | ||
| 2984 | .rva .Lecb_enc_info | ||
| 2985 | |||
| 2986 | .rva .Lecb_dec_prologue | ||
| 2987 | .rva .Lecb_dec_epilogue | ||
| 2988 | .rva .Lecb_dec_info | ||
| 2989 | ___ | ||
| 2990 | $code.=<<___; | ||
| 2991 | .rva .Lcbc_dec_prologue | ||
| 2992 | .rva .Lcbc_dec_epilogue | ||
| 2993 | .rva .Lcbc_dec_info | ||
| 2994 | |||
| 2995 | .rva .Lctr_enc_prologue | ||
| 2996 | .rva .Lctr_enc_epilogue | ||
| 2997 | .rva .Lctr_enc_info | ||
| 2998 | |||
| 2999 | .rva .Lxts_enc_prologue | ||
| 3000 | .rva .Lxts_enc_epilogue | ||
| 3001 | .rva .Lxts_enc_info | ||
| 3002 | |||
| 3003 | .rva .Lxts_dec_prologue | ||
| 3004 | .rva .Lxts_dec_epilogue | ||
| 3005 | .rva .Lxts_dec_info | ||
| 3006 | |||
| 3007 | .section .xdata | ||
| 3008 | .align 8 | ||
| 3009 | ___ | ||
| 3010 | $code.=<<___ if ($ecb); | ||
| 3011 | .Lecb_enc_info: | ||
| 3012 | .byte 9,0,0,0 | ||
| 3013 | .rva se_handler | ||
| 3014 | .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] | ||
| 3015 | .Lecb_dec_info: | ||
| 3016 | .byte 9,0,0,0 | ||
| 3017 | .rva se_handler | ||
| 3018 | .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] | ||
| 3019 | ___ | ||
| 3020 | $code.=<<___; | ||
| 3021 | .Lcbc_dec_info: | ||
| 3022 | .byte 9,0,0,0 | ||
| 3023 | .rva se_handler | ||
| 3024 | .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] | ||
| 3025 | .Lctr_enc_info: | ||
| 3026 | .byte 9,0,0,0 | ||
| 3027 | .rva se_handler | ||
| 3028 | .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] | ||
| 3029 | .Lxts_enc_info: | ||
| 3030 | .byte 9,0,0,0 | ||
| 3031 | .rva se_handler | ||
| 3032 | .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] | ||
| 3033 | .Lxts_dec_info: | ||
| 3034 | .byte 9,0,0,0 | ||
| 3035 | .rva se_handler | ||
| 3036 | .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] | ||
| 3037 | ___ | ||
| 3038 | } | ||
| 3039 | |||
| 3040 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 3041 | |||
| 3042 | print $code; | ||
| 3043 | |||
| 3044 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl new file mode 100644 index 0000000000..1533e2c304 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/vpaes-x86.pl | |||
| @@ -0,0 +1,903 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | ###################################################################### | ||
| 4 | ## Constant-time SSSE3 AES core implementation. | ||
| 5 | ## version 0.1 | ||
| 6 | ## | ||
| 7 | ## By Mike Hamburg (Stanford University), 2009 | ||
| 8 | ## Public domain. | ||
| 9 | ## | ||
| 10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
| 11 | ## http://crypto.stanford.edu/vpaes/. | ||
| 12 | |||
| 13 | ###################################################################### | ||
| 14 | # September 2011. | ||
| 15 | # | ||
| 16 | # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for | ||
| 17 | # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
| 18 | # doesn't handle partial vectors (doesn't have to if called from | ||
| 19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
| 20 | # schedule structure with the original nor does it make assumption | ||
| 21 | # about its alignment... | ||
| 22 | # | ||
| 23 | # Performance summary. aes-586.pl column lists large-block CBC | ||
| 24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
| 25 | # byte processed with 128-bit key, and vpaes-x86.pl column - [also | ||
| 26 | # large-block CBC] encrypt/decrypt. | ||
| 27 | # | ||
| 28 | # aes-586.pl vpaes-x86.pl | ||
| 29 | # | ||
| 30 | # Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) | ||
| 31 | # Nehalem 27.9/40.4/18.1 10.3/12.0 | ||
| 32 | # Atom 102./119./60.1 64.5/85.3(***) | ||
| 33 | # | ||
| 34 | # (*) "Hyper-threading" in the context refers rather to cache shared | ||
| 35 | # among multiple cores, than to specifically Intel HTT. As vast | ||
| 36 | # majority of contemporary cores share cache, slower code path | ||
| 37 | # is common place. In other words "with-hyper-threading-off" | ||
| 38 | # results are presented mostly for reference purposes. | ||
| 39 | # | ||
| 40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
| 41 | # | ||
| 42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
| 43 | # pshufb, yet it's respectable +32%/65% improvement on Core 2 | ||
| 44 | # and +58%/40% on Atom (as implied, over "hyper-threading-safe" | ||
| 45 | # code path). | ||
| 46 | # | ||
| 47 | # <appro@openssl.org> | ||
| 48 | |||
| 49 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 50 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 51 | require "x86asm.pl"; | ||
| 52 | |||
| 53 | &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); | ||
| 54 | |||
| 55 | $PREFIX="vpaes"; | ||
| 56 | |||
| 57 | my ($round, $base, $magic, $key, $const, $inp, $out)= | ||
| 58 | ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); | ||
| 59 | |||
| 60 | &static_label("_vpaes_consts"); | ||
| 61 | &static_label("_vpaes_schedule_low_round"); | ||
| 62 | |||
| 63 | &set_label("_vpaes_consts",64); | ||
| 64 | $k_inv=-0x30; # inv, inva | ||
| 65 | &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); | ||
| 66 | &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); | ||
| 67 | |||
| 68 | $k_s0F=-0x10; # s0F | ||
| 69 | &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); | ||
| 70 | |||
| 71 | $k_ipt=0x00; # input transform (lo, hi) | ||
| 72 | &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); | ||
| 73 | &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); | ||
| 74 | |||
| 75 | $k_sb1=0x20; # sb1u, sb1t | ||
| 76 | &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); | ||
| 77 | &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); | ||
| 78 | $k_sb2=0x40; # sb2u, sb2t | ||
| 79 | &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); | ||
| 80 | &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); | ||
| 81 | $k_sbo=0x60; # sbou, sbot | ||
| 82 | &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); | ||
| 83 | &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); | ||
| 84 | |||
| 85 | $k_mc_forward=0x80; # mc_forward | ||
| 86 | &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); | ||
| 87 | &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); | ||
| 88 | &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); | ||
| 89 | &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); | ||
| 90 | |||
| 91 | $k_mc_backward=0xc0; # mc_backward | ||
| 92 | &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); | ||
| 93 | &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); | ||
| 94 | &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); | ||
| 95 | &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); | ||
| 96 | |||
| 97 | $k_sr=0x100; # sr | ||
| 98 | &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); | ||
| 99 | &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); | ||
| 100 | &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); | ||
| 101 | &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); | ||
| 102 | |||
| 103 | $k_rcon=0x140; # rcon | ||
| 104 | &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); | ||
| 105 | |||
| 106 | $k_s63=0x150; # s63: all equal to 0x63 transformed | ||
| 107 | &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); | ||
| 108 | |||
| 109 | $k_opt=0x160; # output transform | ||
| 110 | &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); | ||
| 111 | &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); | ||
| 112 | |||
| 113 | $k_deskew=0x180; # deskew tables: inverts the sbox's "skew" | ||
| 114 | &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); | ||
| 115 | &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); | ||
| 116 | ## | ||
| 117 | ## Decryption stuff | ||
| 118 | ## Key schedule constants | ||
| 119 | ## | ||
| 120 | $k_dksd=0x1a0; # decryption key schedule: invskew x*D | ||
| 121 | &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4); | ||
| 122 | &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA); | ||
| 123 | $k_dksb=0x1c0; # decryption key schedule: invskew x*B | ||
| 124 | &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386); | ||
| 125 | &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F); | ||
| 126 | $k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63 | ||
| 127 | &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C); | ||
| 128 | &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A); | ||
| 129 | $k_dks9=0x200; # decryption key schedule: invskew x*9 | ||
| 130 | &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334); | ||
| 131 | &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC); | ||
| 132 | |||
| 133 | ## | ||
| 134 | ## Decryption stuff | ||
| 135 | ## Round function constants | ||
| 136 | ## | ||
| 137 | $k_dipt=0x220; # decryption input transform | ||
| 138 | &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E); | ||
| 139 | &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772); | ||
| 140 | |||
| 141 | $k_dsb9=0x240; # decryption sbox output *9*u, *9*t | ||
| 142 | &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50); | ||
| 143 | &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E); | ||
| 144 | $k_dsbd=0x260; # decryption sbox output *D*u, *D*t | ||
| 145 | &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13); | ||
| 146 | &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D); | ||
| 147 | $k_dsbb=0x280; # decryption sbox output *B*u, *B*t | ||
| 148 | &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6); | ||
| 149 | &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E); | ||
| 150 | $k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t | ||
| 151 | &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004); | ||
| 152 | &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B); | ||
| 153 | $k_dsbo=0x2c0; # decryption sbox final output | ||
| 154 | &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9); | ||
| 155 | &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159); | ||
| 156 | &asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"); | ||
| 157 | &align (64); | ||
| 158 | |||
| 159 | &function_begin_B("_vpaes_preheat"); | ||
| 160 | &add ($const,&DWP(0,"esp")); | ||
| 161 | &movdqa ("xmm7",&QWP($k_inv,$const)); | ||
| 162 | &movdqa ("xmm6",&QWP($k_s0F,$const)); | ||
| 163 | &ret (); | ||
| 164 | &function_end_B("_vpaes_preheat"); | ||
| 165 | |||
| 166 | ## | ||
| 167 | ## _aes_encrypt_core | ||
| 168 | ## | ||
| 169 | ## AES-encrypt %xmm0. | ||
| 170 | ## | ||
| 171 | ## Inputs: | ||
| 172 | ## %xmm0 = input | ||
| 173 | ## %xmm6-%xmm7 as in _vpaes_preheat | ||
| 174 | ## (%edx) = scheduled keys | ||
| 175 | ## | ||
| 176 | ## Output in %xmm0 | ||
| 177 | ## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx | ||
| 178 | ## | ||
| 179 | ## | ||
| 180 | &function_begin_B("_vpaes_encrypt_core"); | ||
| 181 | &mov ($magic,16); | ||
| 182 | &mov ($round,&DWP(240,$key)); | ||
| 183 | &movdqa ("xmm1","xmm6") | ||
| 184 | &movdqa ("xmm2",&QWP($k_ipt,$const)); | ||
| 185 | &pandn ("xmm1","xmm0"); | ||
| 186 | &movdqu ("xmm5",&QWP(0,$key)); | ||
| 187 | &psrld ("xmm1",4); | ||
| 188 | &pand ("xmm0","xmm6"); | ||
| 189 | &pshufb ("xmm2","xmm0"); | ||
| 190 | &movdqa ("xmm0",&QWP($k_ipt+16,$const)); | ||
| 191 | &pshufb ("xmm0","xmm1"); | ||
| 192 | &pxor ("xmm2","xmm5"); | ||
| 193 | &pxor ("xmm0","xmm2"); | ||
| 194 | &add ($key,16); | ||
| 195 | &lea ($base,&DWP($k_mc_backward,$const)); | ||
| 196 | &jmp (&label("enc_entry")); | ||
| 197 | |||
| 198 | |||
| 199 | &set_label("enc_loop",16); | ||
| 200 | # middle of middle round | ||
| 201 | &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u | ||
| 202 | &pshufb ("xmm4","xmm2"); # 4 = sb1u | ||
| 203 | &pxor ("xmm4","xmm5"); # 4 = sb1u + k | ||
| 204 | &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t | ||
| 205 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
| 206 | &pxor ("xmm0","xmm4"); # 0 = A | ||
| 207 | &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u | ||
| 208 | &pshufb ("xmm5","xmm2"); # 4 = sb2u | ||
| 209 | &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] | ||
| 210 | &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t | ||
| 211 | &pshufb ("xmm2","xmm3"); # 2 = sb2t | ||
| 212 | &pxor ("xmm2","xmm5"); # 2 = 2A | ||
| 213 | &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] | ||
| 214 | &movdqa ("xmm3","xmm0"); # 3 = A | ||
| 215 | &pshufb ("xmm0","xmm1"); # 0 = B | ||
| 216 | &add ($key,16); # next key | ||
| 217 | &pxor ("xmm0","xmm2"); # 0 = 2A+B | ||
| 218 | &pshufb ("xmm3","xmm4"); # 3 = D | ||
| 219 | &add ($magic,16); # next mc | ||
| 220 | &pxor ("xmm3","xmm0"); # 3 = 2A+B+D | ||
| 221 | &pshufb ("xmm0","xmm1"); # 0 = 2B+C | ||
| 222 | &and ($magic,0x30); # ... mod 4 | ||
| 223 | &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D | ||
| 224 | &sub ($round,1); # nr-- | ||
| 225 | |||
| 226 | &set_label("enc_entry"); | ||
| 227 | # top of round | ||
| 228 | &movdqa ("xmm1","xmm6"); # 1 : i | ||
| 229 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
| 230 | &psrld ("xmm1",4); # 1 = i | ||
| 231 | &pand ("xmm0","xmm6"); # 0 = k | ||
| 232 | &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k | ||
| 233 | &pshufb ("xmm5","xmm0"); # 2 = a/k | ||
| 234 | &pxor ("xmm0","xmm1"); # 0 = j | ||
| 235 | &movdqa ("xmm3","xmm7"); # 3 : 1/i | ||
| 236 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
| 237 | &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k | ||
| 238 | &movdqa ("xmm4","xmm7"); # 4 : 1/j | ||
| 239 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
| 240 | &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k | ||
| 241 | &movdqa ("xmm2","xmm7"); # 2 : 1/iak | ||
| 242 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
| 243 | &pxor ("xmm2","xmm0"); # 2 = io | ||
| 244 | &movdqa ("xmm3","xmm7"); # 3 : 1/jak | ||
| 245 | &movdqu ("xmm5",&QWP(0,$key)); | ||
| 246 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
| 247 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
| 248 | &jnz (&label("enc_loop")); | ||
| 249 | |||
| 250 | # middle of last round | ||
| 251 | &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo | ||
| 252 | &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16 | ||
| 253 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
| 254 | &pxor ("xmm4","xmm5"); # 4 = sb1u + k | ||
| 255 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
| 256 | &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] | ||
| 257 | &pxor ("xmm0","xmm4"); # 0 = A | ||
| 258 | &pshufb ("xmm0","xmm1"); | ||
| 259 | &ret (); | ||
| 260 | &function_end_B("_vpaes_encrypt_core"); | ||
| 261 | |||
| 262 | ## | ||
| 263 | ## Decryption core | ||
| 264 | ## | ||
| 265 | ## Same API as encryption core. | ||
| 266 | ## | ||
| 267 | &function_begin_B("_vpaes_decrypt_core"); | ||
| 268 | &mov ($round,&DWP(240,$key)); | ||
| 269 | &lea ($base,&DWP($k_dsbd,$const)); | ||
| 270 | &movdqa ("xmm1","xmm6"); | ||
| 271 | &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); | ||
| 272 | &pandn ("xmm1","xmm0"); | ||
| 273 | &mov ($magic,$round); | ||
| 274 | &psrld ("xmm1",4) | ||
| 275 | &movdqu ("xmm5",&QWP(0,$key)); | ||
| 276 | &shl ($magic,4); | ||
| 277 | &pand ("xmm0","xmm6"); | ||
| 278 | &pshufb ("xmm2","xmm0"); | ||
| 279 | &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base)); | ||
| 280 | &xor ($magic,0x30); | ||
| 281 | &pshufb ("xmm0","xmm1"); | ||
| 282 | &and ($magic,0x30); | ||
| 283 | &pxor ("xmm2","xmm5"); | ||
| 284 | &movdqa ("xmm5",&QWP($k_mc_forward+48,$const)); | ||
| 285 | &pxor ("xmm0","xmm2"); | ||
| 286 | &add ($key,16); | ||
| 287 | &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic)); | ||
| 288 | &jmp (&label("dec_entry")); | ||
| 289 | |||
| 290 | &set_label("dec_loop",16); | ||
| 291 | ## | ||
| 292 | ## Inverse mix columns | ||
| 293 | ## | ||
| 294 | &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u | ||
| 295 | &pshufb ("xmm4","xmm2"); # 4 = sb9u | ||
| 296 | &pxor ("xmm4","xmm0"); | ||
| 297 | &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t | ||
| 298 | &pshufb ("xmm0","xmm3"); # 0 = sb9t | ||
| 299 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
| 300 | &add ($key,16); # next round key | ||
| 301 | |||
| 302 | &pshufb ("xmm0","xmm5"); # MC ch | ||
| 303 | &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu | ||
| 304 | &pshufb ("xmm4","xmm2"); # 4 = sbdu | ||
| 305 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
| 306 | &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt | ||
| 307 | &pshufb ("xmm0","xmm3"); # 0 = sbdt | ||
| 308 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
| 309 | &sub ($round,1); # nr-- | ||
| 310 | |||
| 311 | &pshufb ("xmm0","xmm5"); # MC ch | ||
| 312 | &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu | ||
| 313 | &pshufb ("xmm4","xmm2"); # 4 = sbbu | ||
| 314 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
| 315 | &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt | ||
| 316 | &pshufb ("xmm0","xmm3"); # 0 = sbbt | ||
| 317 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
| 318 | |||
| 319 | &pshufb ("xmm0","xmm5"); # MC ch | ||
| 320 | &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu | ||
| 321 | &pshufb ("xmm4","xmm2"); # 4 = sbeu | ||
| 322 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
| 323 | &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet | ||
| 324 | &pshufb ("xmm0","xmm3"); # 0 = sbet | ||
| 325 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
| 326 | |||
| 327 | &palignr("xmm5","xmm5",12); | ||
| 328 | |||
| 329 | &set_label("dec_entry"); | ||
| 330 | # top of round | ||
| 331 | &movdqa ("xmm1","xmm6"); # 1 : i | ||
| 332 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
| 333 | &psrld ("xmm1",4); # 1 = i | ||
| 334 | &pand ("xmm0","xmm6"); # 0 = k | ||
| 335 | &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k | ||
| 336 | &pshufb ("xmm2","xmm0"); # 2 = a/k | ||
| 337 | &pxor ("xmm0","xmm1"); # 0 = j | ||
| 338 | &movdqa ("xmm3","xmm7"); # 3 : 1/i | ||
| 339 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
| 340 | &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k | ||
| 341 | &movdqa ("xmm4","xmm7"); # 4 : 1/j | ||
| 342 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
| 343 | &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k | ||
| 344 | &movdqa ("xmm2","xmm7"); # 2 : 1/iak | ||
| 345 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
| 346 | &pxor ("xmm2","xmm0"); # 2 = io | ||
| 347 | &movdqa ("xmm3","xmm7"); # 3 : 1/jak | ||
| 348 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
| 349 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
| 350 | &movdqu ("xmm0",&QWP(0,$key)); | ||
| 351 | &jnz (&label("dec_loop")); | ||
| 352 | |||
| 353 | # middle of last round | ||
| 354 | &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou | ||
| 355 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
| 356 | &pxor ("xmm4","xmm0"); # 4 = sb1u + k | ||
| 357 | &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot | ||
| 358 | &movdqa ("xmm2",&QWP(0,$magic)); | ||
| 359 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
| 360 | &pxor ("xmm0","xmm4"); # 0 = A | ||
| 361 | &pshufb ("xmm0","xmm2"); | ||
| 362 | &ret (); | ||
| 363 | &function_end_B("_vpaes_decrypt_core"); | ||
| 364 | |||
| 365 | ######################################################## | ||
| 366 | ## ## | ||
| 367 | ## AES key schedule ## | ||
| 368 | ## ## | ||
| 369 | ######################################################## | ||
| 370 | &function_begin_B("_vpaes_schedule_core"); | ||
| 371 | &add ($const,&DWP(0,"esp")); | ||
| 372 | &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned) | ||
| 373 | &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon | ||
| 374 | |||
| 375 | # input transform | ||
| 376 | &movdqa ("xmm3","xmm0"); | ||
| 377 | &lea ($base,&DWP($k_ipt,$const)); | ||
| 378 | &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8 | ||
| 379 | &call ("_vpaes_schedule_transform"); | ||
| 380 | &movdqa ("xmm7","xmm0"); | ||
| 381 | |||
| 382 | &test ($out,$out); | ||
| 383 | &jnz (&label("schedule_am_decrypting")); | ||
| 384 | |||
| 385 | # encrypting, output zeroth round key after transform | ||
| 386 | &movdqu (&QWP(0,$key),"xmm0"); | ||
| 387 | &jmp (&label("schedule_go")); | ||
| 388 | |||
| 389 | &set_label("schedule_am_decrypting"); | ||
| 390 | # decrypting, output zeroth round key after shiftrows | ||
| 391 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
| 392 | &pshufb ("xmm3","xmm1"); | ||
| 393 | &movdqu (&QWP(0,$key),"xmm3"); | ||
| 394 | &xor ($magic,0x30); | ||
| 395 | |||
| 396 | &set_label("schedule_go"); | ||
| 397 | &cmp ($round,192); | ||
| 398 | &ja (&label("schedule_256")); | ||
| 399 | &je (&label("schedule_192")); | ||
| 400 | # 128: fall though | ||
| 401 | |||
| 402 | ## | ||
| 403 | ## .schedule_128 | ||
| 404 | ## | ||
| 405 | ## 128-bit specific part of key schedule. | ||
| 406 | ## | ||
| 407 | ## This schedule is really simple, because all its parts | ||
| 408 | ## are accomplished by the subroutines. | ||
| 409 | ## | ||
| 410 | &set_label("schedule_128"); | ||
| 411 | &mov ($round,10); | ||
| 412 | |||
| 413 | &set_label("loop_schedule_128"); | ||
| 414 | &call ("_vpaes_schedule_round"); | ||
| 415 | &dec ($round); | ||
| 416 | &jz (&label("schedule_mangle_last")); | ||
| 417 | &call ("_vpaes_schedule_mangle"); # write output | ||
| 418 | &jmp (&label("loop_schedule_128")); | ||
| 419 | |||
| 420 | ## | ||
| 421 | ## .aes_schedule_192 | ||
| 422 | ## | ||
| 423 | ## 192-bit specific part of key schedule. | ||
| 424 | ## | ||
| 425 | ## The main body of this schedule is the same as the 128-bit | ||
| 426 | ## schedule, but with more smearing. The long, high side is | ||
| 427 | ## stored in %xmm7 as before, and the short, low side is in | ||
| 428 | ## the high bits of %xmm6. | ||
| 429 | ## | ||
| 430 | ## This schedule is somewhat nastier, however, because each | ||
| 431 | ## round produces 192 bits of key material, or 1.5 round keys. | ||
| 432 | ## Therefore, on each cycle we do 2 rounds and produce 3 round | ||
| 433 | ## keys. | ||
| 434 | ## | ||
| 435 | &set_label("schedule_192",16); | ||
| 436 | &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned) | ||
| 437 | &call ("_vpaes_schedule_transform"); # input transform | ||
| 438 | &movdqa ("xmm6","xmm0"); # save short part | ||
| 439 | &pxor ("xmm4","xmm4"); # clear 4 | ||
| 440 | &movhlps("xmm6","xmm4"); # clobber low side with zeros | ||
| 441 | &mov ($round,4); | ||
| 442 | |||
| 443 | &set_label("loop_schedule_192"); | ||
| 444 | &call ("_vpaes_schedule_round"); | ||
| 445 | &palignr("xmm0","xmm6",8); | ||
| 446 | &call ("_vpaes_schedule_mangle"); # save key n | ||
| 447 | &call ("_vpaes_schedule_192_smear"); | ||
| 448 | &call ("_vpaes_schedule_mangle"); # save key n+1 | ||
| 449 | &call ("_vpaes_schedule_round"); | ||
| 450 | &dec ($round); | ||
| 451 | &jz (&label("schedule_mangle_last")); | ||
| 452 | &call ("_vpaes_schedule_mangle"); # save key n+2 | ||
| 453 | &call ("_vpaes_schedule_192_smear"); | ||
| 454 | &jmp (&label("loop_schedule_192")); | ||
| 455 | |||
| 456 | ## | ||
| 457 | ## .aes_schedule_256 | ||
| 458 | ## | ||
| 459 | ## 256-bit specific part of key schedule. | ||
| 460 | ## | ||
| 461 | ## The structure here is very similar to the 128-bit | ||
| 462 | ## schedule, but with an additional "low side" in | ||
| 463 | ## %xmm6. The low side's rounds are the same as the | ||
| 464 | ## high side's, except no rcon and no rotation. | ||
| 465 | ## | ||
| 466 | &set_label("schedule_256",16); | ||
| 467 | &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) | ||
| 468 | &call ("_vpaes_schedule_transform"); # input transform | ||
| 469 | &mov ($round,7); | ||
| 470 | |||
| 471 | &set_label("loop_schedule_256"); | ||
| 472 | &call ("_vpaes_schedule_mangle"); # output low result | ||
| 473 | &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 | ||
| 474 | |||
| 475 | # high round | ||
| 476 | &call ("_vpaes_schedule_round"); | ||
| 477 | &dec ($round); | ||
| 478 | &jz (&label("schedule_mangle_last")); | ||
| 479 | &call ("_vpaes_schedule_mangle"); | ||
| 480 | |||
| 481 | # low round. swap xmm7 and xmm6 | ||
| 482 | &pshufd ("xmm0","xmm0",0xFF); | ||
| 483 | &movdqa (&QWP(20,"esp"),"xmm7"); | ||
| 484 | &movdqa ("xmm7","xmm6"); | ||
| 485 | &call ("_vpaes_schedule_low_round"); | ||
| 486 | &movdqa ("xmm7",&QWP(20,"esp")); | ||
| 487 | |||
| 488 | &jmp (&label("loop_schedule_256")); | ||
| 489 | |||
| 490 | ## | ||
| 491 | ## .aes_schedule_mangle_last | ||
| 492 | ## | ||
| 493 | ## Mangler for last round of key schedule | ||
| 494 | ## Mangles %xmm0 | ||
| 495 | ## when encrypting, outputs out(%xmm0) ^ 63 | ||
| 496 | ## when decrypting, outputs unskew(%xmm0) | ||
| 497 | ## | ||
| 498 | ## Always called right before return... jumps to cleanup and exits | ||
| 499 | ## | ||
| 500 | &set_label("schedule_mangle_last",16); | ||
| 501 | # schedule last round key from xmm0 | ||
| 502 | &lea ($base,&DWP($k_deskew,$const)); | ||
| 503 | &test ($out,$out); | ||
| 504 | &jnz (&label("schedule_mangle_last_dec")); | ||
| 505 | |||
| 506 | # encrypting | ||
| 507 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
| 508 | &pshufb ("xmm0","xmm1"); # output permute | ||
| 509 | &lea ($base,&DWP($k_opt,$const)); # prepare to output transform | ||
| 510 | &add ($key,32); | ||
| 511 | |||
| 512 | &set_label("schedule_mangle_last_dec"); | ||
| 513 | &add ($key,-16); | ||
| 514 | &pxor ("xmm0",&QWP($k_s63,$const)); | ||
| 515 | &call ("_vpaes_schedule_transform"); # output transform | ||
| 516 | &movdqu (&QWP(0,$key),"xmm0"); # save last key | ||
| 517 | |||
| 518 | # cleanup | ||
| 519 | &pxor ("xmm0","xmm0"); | ||
| 520 | &pxor ("xmm1","xmm1"); | ||
| 521 | &pxor ("xmm2","xmm2"); | ||
| 522 | &pxor ("xmm3","xmm3"); | ||
| 523 | &pxor ("xmm4","xmm4"); | ||
| 524 | &pxor ("xmm5","xmm5"); | ||
| 525 | &pxor ("xmm6","xmm6"); | ||
| 526 | &pxor ("xmm7","xmm7"); | ||
| 527 | &ret (); | ||
| 528 | &function_end_B("_vpaes_schedule_core"); | ||
| 529 | |||
| 530 | ## | ||
| 531 | ## .aes_schedule_192_smear | ||
| 532 | ## | ||
| 533 | ## Smear the short, low side in the 192-bit key schedule. | ||
| 534 | ## | ||
| 535 | ## Inputs: | ||
| 536 | ## %xmm7: high side, b a x y | ||
| 537 | ## %xmm6: low side, d c 0 0 | ||
| 538 | ## %xmm13: 0 | ||
| 539 | ## | ||
| 540 | ## Outputs: | ||
| 541 | ## %xmm6: b+c+d b+c 0 0 | ||
| 542 | ## %xmm0: b+c+d b+c b a | ||
| 543 | ## | ||
| 544 | &function_begin_B("_vpaes_schedule_192_smear"); | ||
| 545 | &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0 | ||
| 546 | &pxor ("xmm6","xmm0"); # -> c+d c 0 0 | ||
| 547 | &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a | ||
| 548 | &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a | ||
| 549 | &movdqa ("xmm0","xmm6"); | ||
| 550 | &pxor ("xmm1","xmm1"); | ||
| 551 | &movhlps("xmm6","xmm1"); # clobber low side with zeros | ||
| 552 | &ret (); | ||
| 553 | &function_end_B("_vpaes_schedule_192_smear"); | ||
| 554 | |||
| 555 | ## | ||
| 556 | ## .aes_schedule_round | ||
| 557 | ## | ||
| 558 | ## Runs one main round of the key schedule on %xmm0, %xmm7 | ||
| 559 | ## | ||
| 560 | ## Specifically, runs subbytes on the high dword of %xmm0 | ||
| 561 | ## then rotates it by one byte and xors into the low dword of | ||
| 562 | ## %xmm7. | ||
| 563 | ## | ||
| 564 | ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for | ||
| 565 | ## next rcon. | ||
| 566 | ## | ||
| 567 | ## Smears the dwords of %xmm7 by xoring the low into the | ||
| 568 | ## second low, result into third, result into highest. | ||
| 569 | ## | ||
| 570 | ## Returns results in %xmm7 = %xmm0. | ||
| 571 | ## Clobbers %xmm1-%xmm5. | ||
| 572 | ## | ||
| 573 | &function_begin_B("_vpaes_schedule_round"); | ||
| 574 | # extract rcon from xmm8 | ||
| 575 | &movdqa ("xmm2",&QWP(8,"esp")); # xmm8 | ||
| 576 | &pxor ("xmm1","xmm1"); | ||
| 577 | &palignr("xmm1","xmm2",15); | ||
| 578 | &palignr("xmm2","xmm2",15); | ||
| 579 | &pxor ("xmm7","xmm1"); | ||
| 580 | |||
| 581 | # rotate | ||
| 582 | &pshufd ("xmm0","xmm0",0xFF); | ||
| 583 | &palignr("xmm0","xmm0",1); | ||
| 584 | |||
| 585 | # fall through... | ||
| 586 | &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8 | ||
| 587 | |||
| 588 | # low round: same as high round, but no rotation and no rcon. | ||
| 589 | &set_label("_vpaes_schedule_low_round"); | ||
| 590 | # smear xmm7 | ||
| 591 | &movdqa ("xmm1","xmm7"); | ||
| 592 | &pslldq ("xmm7",4); | ||
| 593 | &pxor ("xmm7","xmm1"); | ||
| 594 | &movdqa ("xmm1","xmm7"); | ||
| 595 | &pslldq ("xmm7",8); | ||
| 596 | &pxor ("xmm7","xmm1"); | ||
| 597 | &pxor ("xmm7",&QWP($k_s63,$const)); | ||
| 598 | |||
| 599 | # subbyte | ||
| 600 | &movdqa ("xmm4",&QWP($k_s0F,$const)); | ||
| 601 | &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j | ||
| 602 | &movdqa ("xmm1","xmm4"); | ||
| 603 | &pandn ("xmm1","xmm0"); | ||
| 604 | &psrld ("xmm1",4); # 1 = i | ||
| 605 | &pand ("xmm0","xmm4"); # 0 = k | ||
| 606 | &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k | ||
| 607 | &pshufb ("xmm2","xmm0"); # 2 = a/k | ||
| 608 | &pxor ("xmm0","xmm1"); # 0 = j | ||
| 609 | &movdqa ("xmm3","xmm5"); # 3 : 1/i | ||
| 610 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
| 611 | &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k | ||
| 612 | &movdqa ("xmm4","xmm5"); # 4 : 1/j | ||
| 613 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
| 614 | &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k | ||
| 615 | &movdqa ("xmm2","xmm5"); # 2 : 1/iak | ||
| 616 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
| 617 | &pxor ("xmm2","xmm0"); # 2 = io | ||
| 618 | &movdqa ("xmm3","xmm5"); # 3 : 1/jak | ||
| 619 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
| 620 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
| 621 | &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou | ||
| 622 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
| 623 | &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot | ||
| 624 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
| 625 | &pxor ("xmm0","xmm4"); # 0 = sbox output | ||
| 626 | |||
| 627 | # add in smeared stuff | ||
| 628 | &pxor ("xmm0","xmm7"); | ||
| 629 | &movdqa ("xmm7","xmm0"); | ||
| 630 | &ret (); | ||
| 631 | &function_end_B("_vpaes_schedule_round"); | ||
| 632 | |||
| 633 | ## | ||
| 634 | ## .aes_schedule_transform | ||
| 635 | ## | ||
| 636 | ## Linear-transform %xmm0 according to tables at (%ebx) | ||
| 637 | ## | ||
| 638 | ## Output in %xmm0 | ||
| 639 | ## Clobbers %xmm1, %xmm2 | ||
| 640 | ## | ||
| 641 | &function_begin_B("_vpaes_schedule_transform"); | ||
| 642 | &movdqa ("xmm2",&QWP($k_s0F,$const)); | ||
| 643 | &movdqa ("xmm1","xmm2"); | ||
| 644 | &pandn ("xmm1","xmm0"); | ||
| 645 | &psrld ("xmm1",4); | ||
| 646 | &pand ("xmm0","xmm2"); | ||
| 647 | &movdqa ("xmm2",&QWP(0,$base)); | ||
| 648 | &pshufb ("xmm2","xmm0"); | ||
| 649 | &movdqa ("xmm0",&QWP(16,$base)); | ||
| 650 | &pshufb ("xmm0","xmm1"); | ||
| 651 | &pxor ("xmm0","xmm2"); | ||
| 652 | &ret (); | ||
| 653 | &function_end_B("_vpaes_schedule_transform"); | ||
| 654 | |||
| 655 | ## | ||
| 656 | ## .aes_schedule_mangle | ||
| 657 | ## | ||
| 658 | ## Mangle xmm0 from (basis-transformed) standard version | ||
| 659 | ## to our version. | ||
| 660 | ## | ||
| 661 | ## On encrypt, | ||
| 662 | ## xor with 0x63 | ||
| 663 | ## multiply by circulant 0,1,1,1 | ||
| 664 | ## apply shiftrows transform | ||
| 665 | ## | ||
| 666 | ## On decrypt, | ||
| 667 | ## xor with 0x63 | ||
| 668 | ## multiply by "inverse mixcolumns" circulant E,B,D,9 | ||
| 669 | ## deskew | ||
| 670 | ## apply shiftrows transform | ||
| 671 | ## | ||
| 672 | ## | ||
| 673 | ## Writes out to (%edx), and increments or decrements it | ||
| 674 | ## Keeps track of round number mod 4 in %ecx | ||
| 675 | ## Preserves xmm0 | ||
| 676 | ## Clobbers xmm1-xmm5 | ||
| 677 | ## | ||
| 678 | &function_begin_B("_vpaes_schedule_mangle"); | ||
| 679 | &movdqa ("xmm4","xmm0"); # save xmm0 for later | ||
| 680 | &movdqa ("xmm5",&QWP($k_mc_forward,$const)); | ||
| 681 | &test ($out,$out); | ||
| 682 | &jnz (&label("schedule_mangle_dec")); | ||
| 683 | |||
| 684 | # encrypting | ||
| 685 | &add ($key,16); | ||
| 686 | &pxor ("xmm4",&QWP($k_s63,$const)); | ||
| 687 | &pshufb ("xmm4","xmm5"); | ||
| 688 | &movdqa ("xmm3","xmm4"); | ||
| 689 | &pshufb ("xmm4","xmm5"); | ||
| 690 | &pxor ("xmm3","xmm4"); | ||
| 691 | &pshufb ("xmm4","xmm5"); | ||
| 692 | &pxor ("xmm3","xmm4"); | ||
| 693 | |||
| 694 | &jmp (&label("schedule_mangle_both")); | ||
| 695 | |||
| 696 | &set_label("schedule_mangle_dec",16); | ||
| 697 | # inverse mix columns | ||
| 698 | &movdqa ("xmm2",&QWP($k_s0F,$const)); | ||
| 699 | &lea ($inp,&DWP($k_dksd,$const)); | ||
| 700 | &movdqa ("xmm1","xmm2"); | ||
| 701 | &pandn ("xmm1","xmm4"); | ||
| 702 | &psrld ("xmm1",4); # 1 = hi | ||
| 703 | &pand ("xmm4","xmm2"); # 4 = lo | ||
| 704 | |||
| 705 | &movdqa ("xmm2",&QWP(0,$inp)); | ||
| 706 | &pshufb ("xmm2","xmm4"); | ||
| 707 | &movdqa ("xmm3",&QWP(0x10,$inp)); | ||
| 708 | &pshufb ("xmm3","xmm1"); | ||
| 709 | &pxor ("xmm3","xmm2"); | ||
| 710 | &pshufb ("xmm3","xmm5"); | ||
| 711 | |||
| 712 | &movdqa ("xmm2",&QWP(0x20,$inp)); | ||
| 713 | &pshufb ("xmm2","xmm4"); | ||
| 714 | &pxor ("xmm2","xmm3"); | ||
| 715 | &movdqa ("xmm3",&QWP(0x30,$inp)); | ||
| 716 | &pshufb ("xmm3","xmm1"); | ||
| 717 | &pxor ("xmm3","xmm2"); | ||
| 718 | &pshufb ("xmm3","xmm5"); | ||
| 719 | |||
| 720 | &movdqa ("xmm2",&QWP(0x40,$inp)); | ||
| 721 | &pshufb ("xmm2","xmm4"); | ||
| 722 | &pxor ("xmm2","xmm3"); | ||
| 723 | &movdqa ("xmm3",&QWP(0x50,$inp)); | ||
| 724 | &pshufb ("xmm3","xmm1"); | ||
| 725 | &pxor ("xmm3","xmm2"); | ||
| 726 | &pshufb ("xmm3","xmm5"); | ||
| 727 | |||
| 728 | &movdqa ("xmm2",&QWP(0x60,$inp)); | ||
| 729 | &pshufb ("xmm2","xmm4"); | ||
| 730 | &pxor ("xmm2","xmm3"); | ||
| 731 | &movdqa ("xmm3",&QWP(0x70,$inp)); | ||
| 732 | &pshufb ("xmm3","xmm1"); | ||
| 733 | &pxor ("xmm3","xmm2"); | ||
| 734 | |||
| 735 | &add ($key,-16); | ||
| 736 | |||
| 737 | &set_label("schedule_mangle_both"); | ||
| 738 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
| 739 | &pshufb ("xmm3","xmm1"); | ||
| 740 | &add ($magic,-16); | ||
| 741 | &and ($magic,0x30); | ||
| 742 | &movdqu (&QWP(0,$key),"xmm3"); | ||
| 743 | &ret (); | ||
| 744 | &function_end_B("_vpaes_schedule_mangle"); | ||
| 745 | |||
| 746 | # | ||
| 747 | # Interface to OpenSSL | ||
| 748 | # | ||
| 749 | &function_begin("${PREFIX}_set_encrypt_key"); | ||
| 750 | &mov ($inp,&wparam(0)); # inp | ||
| 751 | &lea ($base,&DWP(-56,"esp")); | ||
| 752 | &mov ($round,&wparam(1)); # bits | ||
| 753 | &and ($base,-16); | ||
| 754 | &mov ($key,&wparam(2)); # key | ||
| 755 | &xchg ($base,"esp"); # alloca | ||
| 756 | &mov (&DWP(48,"esp"),$base); | ||
| 757 | |||
| 758 | &mov ($base,$round); | ||
| 759 | &shr ($base,5); | ||
| 760 | &add ($base,5); | ||
| 761 | &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; | ||
| 762 | &mov ($magic,0x30); | ||
| 763 | &mov ($out,0); | ||
| 764 | |||
| 765 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
| 766 | &call ("_vpaes_schedule_core"); | ||
| 767 | &set_label("pic_point"); | ||
| 768 | |||
| 769 | &mov ("esp",&DWP(48,"esp")); | ||
| 770 | &xor ("eax","eax"); | ||
| 771 | &function_end("${PREFIX}_set_encrypt_key"); | ||
| 772 | |||
| 773 | &function_begin("${PREFIX}_set_decrypt_key"); | ||
| 774 | &mov ($inp,&wparam(0)); # inp | ||
| 775 | &lea ($base,&DWP(-56,"esp")); | ||
| 776 | &mov ($round,&wparam(1)); # bits | ||
| 777 | &and ($base,-16); | ||
| 778 | &mov ($key,&wparam(2)); # key | ||
| 779 | &xchg ($base,"esp"); # alloca | ||
| 780 | &mov (&DWP(48,"esp"),$base); | ||
| 781 | |||
| 782 | &mov ($base,$round); | ||
| 783 | &shr ($base,5); | ||
| 784 | &add ($base,5); | ||
| 785 | &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; | ||
| 786 | &shl ($base,4); | ||
| 787 | &lea ($key,&DWP(16,$key,$base)); | ||
| 788 | |||
| 789 | &mov ($out,1); | ||
| 790 | &mov ($magic,$round); | ||
| 791 | &shr ($magic,1); | ||
| 792 | &and ($magic,32); | ||
| 793 | &xor ($magic,32); # nbist==192?0:32; | ||
| 794 | |||
| 795 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
| 796 | &call ("_vpaes_schedule_core"); | ||
| 797 | &set_label("pic_point"); | ||
| 798 | |||
| 799 | &mov ("esp",&DWP(48,"esp")); | ||
| 800 | &xor ("eax","eax"); | ||
| 801 | &function_end("${PREFIX}_set_decrypt_key"); | ||
| 802 | |||
| 803 | &function_begin("${PREFIX}_encrypt"); | ||
| 804 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
| 805 | &call ("_vpaes_preheat"); | ||
| 806 | &set_label("pic_point"); | ||
| 807 | &mov ($inp,&wparam(0)); # inp | ||
| 808 | &lea ($base,&DWP(-56,"esp")); | ||
| 809 | &mov ($out,&wparam(1)); # out | ||
| 810 | &and ($base,-16); | ||
| 811 | &mov ($key,&wparam(2)); # key | ||
| 812 | &xchg ($base,"esp"); # alloca | ||
| 813 | &mov (&DWP(48,"esp"),$base); | ||
| 814 | |||
| 815 | &movdqu ("xmm0",&QWP(0,$inp)); | ||
| 816 | &call ("_vpaes_encrypt_core"); | ||
| 817 | &movdqu (&QWP(0,$out),"xmm0"); | ||
| 818 | |||
| 819 | &mov ("esp",&DWP(48,"esp")); | ||
| 820 | &function_end("${PREFIX}_encrypt"); | ||
| 821 | |||
| 822 | &function_begin("${PREFIX}_decrypt"); | ||
| 823 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
| 824 | &call ("_vpaes_preheat"); | ||
| 825 | &set_label("pic_point"); | ||
| 826 | &mov ($inp,&wparam(0)); # inp | ||
| 827 | &lea ($base,&DWP(-56,"esp")); | ||
| 828 | &mov ($out,&wparam(1)); # out | ||
| 829 | &and ($base,-16); | ||
| 830 | &mov ($key,&wparam(2)); # key | ||
| 831 | &xchg ($base,"esp"); # alloca | ||
| 832 | &mov (&DWP(48,"esp"),$base); | ||
| 833 | |||
| 834 | &movdqu ("xmm0",&QWP(0,$inp)); | ||
| 835 | &call ("_vpaes_decrypt_core"); | ||
| 836 | &movdqu (&QWP(0,$out),"xmm0"); | ||
| 837 | |||
| 838 | &mov ("esp",&DWP(48,"esp")); | ||
| 839 | &function_end("${PREFIX}_decrypt"); | ||
| 840 | |||
| 841 | &function_begin("${PREFIX}_cbc_encrypt"); | ||
| 842 | &mov ($inp,&wparam(0)); # inp | ||
| 843 | &mov ($out,&wparam(1)); # out | ||
| 844 | &mov ($round,&wparam(2)); # len | ||
| 845 | &mov ($key,&wparam(3)); # key | ||
| 846 | &sub ($round,16); | ||
| 847 | &jc (&label("cbc_abort")); | ||
| 848 | &lea ($base,&DWP(-56,"esp")); | ||
| 849 | &mov ($const,&wparam(4)); # ivp | ||
| 850 | &and ($base,-16); | ||
| 851 | &mov ($magic,&wparam(5)); # enc | ||
| 852 | &xchg ($base,"esp"); # alloca | ||
| 853 | &movdqu ("xmm1",&QWP(0,$const)); # load IV | ||
| 854 | &sub ($out,$inp); | ||
| 855 | &mov (&DWP(48,"esp"),$base); | ||
| 856 | |||
| 857 | &mov (&DWP(0,"esp"),$out); # save out | ||
| 858 | &mov (&DWP(4,"esp"),$key) # save key | ||
| 859 | &mov (&DWP(8,"esp"),$const); # save ivp | ||
| 860 | &mov ($out,$round); # $out works as $len | ||
| 861 | |||
| 862 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
| 863 | &call ("_vpaes_preheat"); | ||
| 864 | &set_label("pic_point"); | ||
| 865 | &cmp ($magic,0); | ||
| 866 | &je (&label("cbc_dec_loop")); | ||
| 867 | &jmp (&label("cbc_enc_loop")); | ||
| 868 | |||
| 869 | &set_label("cbc_enc_loop",16); | ||
| 870 | &movdqu ("xmm0",&QWP(0,$inp)); # load input | ||
| 871 | &pxor ("xmm0","xmm1"); # inp^=iv | ||
| 872 | &call ("_vpaes_encrypt_core"); | ||
| 873 | &mov ($base,&DWP(0,"esp")); # restore out | ||
| 874 | &mov ($key,&DWP(4,"esp")); # restore key | ||
| 875 | &movdqa ("xmm1","xmm0"); | ||
| 876 | &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output | ||
| 877 | &lea ($inp,&DWP(16,$inp)); | ||
| 878 | &sub ($out,16); | ||
| 879 | &jnc (&label("cbc_enc_loop")); | ||
| 880 | &jmp (&label("cbc_done")); | ||
| 881 | |||
| 882 | &set_label("cbc_dec_loop",16); | ||
| 883 | &movdqu ("xmm0",&QWP(0,$inp)); # load input | ||
| 884 | &movdqa (&QWP(16,"esp"),"xmm1"); # save IV | ||
| 885 | &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV | ||
| 886 | &call ("_vpaes_decrypt_core"); | ||
| 887 | &mov ($base,&DWP(0,"esp")); # restore out | ||
| 888 | &mov ($key,&DWP(4,"esp")); # restore key | ||
| 889 | &pxor ("xmm0",&QWP(16,"esp")); # out^=iv | ||
| 890 | &movdqa ("xmm1",&QWP(32,"esp")); # load next IV | ||
| 891 | &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output | ||
| 892 | &lea ($inp,&DWP(16,$inp)); | ||
| 893 | &sub ($out,16); | ||
| 894 | &jnc (&label("cbc_dec_loop")); | ||
| 895 | |||
| 896 | &set_label("cbc_done"); | ||
| 897 | &mov ($base,&DWP(8,"esp")); # restore ivp | ||
| 898 | &mov ("esp",&DWP(48,"esp")); | ||
| 899 | &movdqu (&QWP(0,$base),"xmm1"); # write IV | ||
| 900 | &set_label("cbc_abort"); | ||
| 901 | &function_end("${PREFIX}_cbc_encrypt"); | ||
| 902 | |||
| 903 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl new file mode 100644 index 0000000000..37998db5e1 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl | |||
| @@ -0,0 +1,1206 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | ###################################################################### | ||
| 4 | ## Constant-time SSSE3 AES core implementation. | ||
| 5 | ## version 0.1 | ||
| 6 | ## | ||
| 7 | ## By Mike Hamburg (Stanford University), 2009 | ||
| 8 | ## Public domain. | ||
| 9 | ## | ||
| 10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
| 11 | ## http://crypto.stanford.edu/vpaes/. | ||
| 12 | |||
| 13 | ###################################################################### | ||
| 14 | # September 2011. | ||
| 15 | # | ||
| 16 | # Interface to OpenSSL as "almost" drop-in replacement for | ||
| 17 | # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
| 18 | # doesn't handle partial vectors (doesn't have to if called from | ||
| 19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
| 20 | # schedule structure with the original nor does it make assumption | ||
| 21 | # about its alignment... | ||
| 22 | # | ||
| 23 | # Performance summary. aes-x86_64.pl column lists large-block CBC | ||
| 24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
| 25 | # byte processed with 128-bit key, and vpaes-x86_64.pl column - | ||
| 26 | # [also large-block CBC] encrypt/decrypt. | ||
| 27 | # | ||
| 28 | # aes-x86_64.pl vpaes-x86_64.pl | ||
| 29 | # | ||
| 30 | # Core 2(**) 30.5/43.7/14.3 21.8/25.7(***) | ||
| 31 | # Nehalem 30.5/42.2/14.6 9.8/11.8 | ||
| 32 | # Atom 63.9/79.0/32.1 64.0/84.8(***) | ||
| 33 | # | ||
| 34 | # (*) "Hyper-threading" in the context refers rather to cache shared | ||
| 35 | # among multiple cores, than to specifically Intel HTT. As vast | ||
| 36 | # majority of contemporary cores share cache, slower code path | ||
| 37 | # is common place. In other words "with-hyper-threading-off" | ||
| 38 | # results are presented mostly for reference purposes. | ||
| 39 | # | ||
| 40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
| 41 | # | ||
| 42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
| 43 | # pshufb, yet it's respectable +40%/78% improvement on Core 2 | ||
| 44 | # (as implied, over "hyper-threading-safe" code path). | ||
| 45 | # | ||
| 46 | # <appro@openssl.org> | ||
| 47 | |||
| 48 | $flavour = shift; | ||
| 49 | $output = shift; | ||
| 50 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 51 | |||
| 52 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 53 | |||
| 54 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 55 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 56 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 57 | die "can't locate x86_64-xlate.pl"; | ||
| 58 | |||
| 59 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 60 | |||
| 61 | $PREFIX="vpaes"; | ||
| 62 | |||
| 63 | $code.=<<___; | ||
| 64 | .text | ||
| 65 | |||
| 66 | ## | ||
| 67 | ## _aes_encrypt_core | ||
| 68 | ## | ||
| 69 | ## AES-encrypt %xmm0. | ||
| 70 | ## | ||
| 71 | ## Inputs: | ||
| 72 | ## %xmm0 = input | ||
| 73 | ## %xmm9-%xmm15 as in _vpaes_preheat | ||
| 74 | ## (%rdx) = scheduled keys | ||
| 75 | ## | ||
| 76 | ## Output in %xmm0 | ||
| 77 | ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax | ||
| 78 | ## Preserves %xmm6 - %xmm8 so you get some local vectors | ||
| 79 | ## | ||
| 80 | ## | ||
| 81 | .type _vpaes_encrypt_core,\@abi-omnipotent | ||
| 82 | .align 16 | ||
| 83 | _vpaes_encrypt_core: | ||
| 84 | mov %rdx, %r9 | ||
| 85 | mov \$16, %r11 | ||
| 86 | mov 240(%rdx),%eax | ||
| 87 | movdqa %xmm9, %xmm1 | ||
| 88 | movdqa .Lk_ipt(%rip), %xmm2 # iptlo | ||
| 89 | pandn %xmm0, %xmm1 | ||
| 90 | movdqu (%r9), %xmm5 # round0 key | ||
| 91 | psrld \$4, %xmm1 | ||
| 92 | pand %xmm9, %xmm0 | ||
| 93 | pshufb %xmm0, %xmm2 | ||
| 94 | movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi | ||
| 95 | pshufb %xmm1, %xmm0 | ||
| 96 | pxor %xmm5, %xmm2 | ||
| 97 | pxor %xmm2, %xmm0 | ||
| 98 | add \$16, %r9 | ||
| 99 | lea .Lk_mc_backward(%rip),%r10 | ||
| 100 | jmp .Lenc_entry | ||
| 101 | |||
| 102 | .align 16 | ||
| 103 | .Lenc_loop: | ||
| 104 | # middle of middle round | ||
| 105 | movdqa %xmm13, %xmm4 # 4 : sb1u | ||
| 106 | pshufb %xmm2, %xmm4 # 4 = sb1u | ||
| 107 | pxor %xmm5, %xmm4 # 4 = sb1u + k | ||
| 108 | movdqa %xmm12, %xmm0 # 0 : sb1t | ||
| 109 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 110 | pxor %xmm4, %xmm0 # 0 = A | ||
| 111 | movdqa %xmm15, %xmm5 # 4 : sb2u | ||
| 112 | pshufb %xmm2, %xmm5 # 4 = sb2u | ||
| 113 | movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] | ||
| 114 | movdqa %xmm14, %xmm2 # 2 : sb2t | ||
| 115 | pshufb %xmm3, %xmm2 # 2 = sb2t | ||
| 116 | pxor %xmm5, %xmm2 # 2 = 2A | ||
| 117 | movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] | ||
| 118 | movdqa %xmm0, %xmm3 # 3 = A | ||
| 119 | pshufb %xmm1, %xmm0 # 0 = B | ||
| 120 | add \$16, %r9 # next key | ||
| 121 | pxor %xmm2, %xmm0 # 0 = 2A+B | ||
| 122 | pshufb %xmm4, %xmm3 # 3 = D | ||
| 123 | add \$16, %r11 # next mc | ||
| 124 | pxor %xmm0, %xmm3 # 3 = 2A+B+D | ||
| 125 | pshufb %xmm1, %xmm0 # 0 = 2B+C | ||
| 126 | and \$0x30, %r11 # ... mod 4 | ||
| 127 | pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D | ||
| 128 | sub \$1,%rax # nr-- | ||
| 129 | |||
| 130 | .Lenc_entry: | ||
| 131 | # top of round | ||
| 132 | movdqa %xmm9, %xmm1 # 1 : i | ||
| 133 | pandn %xmm0, %xmm1 # 1 = i<<4 | ||
| 134 | psrld \$4, %xmm1 # 1 = i | ||
| 135 | pand %xmm9, %xmm0 # 0 = k | ||
| 136 | movdqa %xmm11, %xmm5 # 2 : a/k | ||
| 137 | pshufb %xmm0, %xmm5 # 2 = a/k | ||
| 138 | pxor %xmm1, %xmm0 # 0 = j | ||
| 139 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
| 140 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
| 141 | pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k | ||
| 142 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
| 143 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
| 144 | pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k | ||
| 145 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
| 146 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
| 147 | pxor %xmm0, %xmm2 # 2 = io | ||
| 148 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
| 149 | movdqu (%r9), %xmm5 | ||
| 150 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
| 151 | pxor %xmm1, %xmm3 # 3 = jo | ||
| 152 | jnz .Lenc_loop | ||
| 153 | |||
| 154 | # middle of last round | ||
| 155 | movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo | ||
| 156 | movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 | ||
| 157 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
| 158 | pxor %xmm5, %xmm4 # 4 = sb1u + k | ||
| 159 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 160 | movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] | ||
| 161 | pxor %xmm4, %xmm0 # 0 = A | ||
| 162 | pshufb %xmm1, %xmm0 | ||
| 163 | ret | ||
| 164 | .size _vpaes_encrypt_core,.-_vpaes_encrypt_core | ||
| 165 | |||
| 166 | ## | ||
| 167 | ## Decryption core | ||
| 168 | ## | ||
| 169 | ## Same API as encryption core. | ||
| 170 | ## | ||
| 171 | .type _vpaes_decrypt_core,\@abi-omnipotent | ||
| 172 | .align 16 | ||
| 173 | _vpaes_decrypt_core: | ||
| 174 | mov %rdx, %r9 # load key | ||
| 175 | mov 240(%rdx),%eax | ||
| 176 | movdqa %xmm9, %xmm1 | ||
| 177 | movdqa .Lk_dipt(%rip), %xmm2 # iptlo | ||
| 178 | pandn %xmm0, %xmm1 | ||
| 179 | mov %rax, %r11 | ||
| 180 | psrld \$4, %xmm1 | ||
| 181 | movdqu (%r9), %xmm5 # round0 key | ||
| 182 | shl \$4, %r11 | ||
| 183 | pand %xmm9, %xmm0 | ||
| 184 | pshufb %xmm0, %xmm2 | ||
| 185 | movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi | ||
| 186 | xor \$0x30, %r11 | ||
| 187 | lea .Lk_dsbd(%rip),%r10 | ||
| 188 | pshufb %xmm1, %xmm0 | ||
| 189 | and \$0x30, %r11 | ||
| 190 | pxor %xmm5, %xmm2 | ||
| 191 | movdqa .Lk_mc_forward+48(%rip), %xmm5 | ||
| 192 | pxor %xmm2, %xmm0 | ||
| 193 | add \$16, %r9 | ||
| 194 | add %r10, %r11 | ||
| 195 | jmp .Ldec_entry | ||
| 196 | |||
| 197 | .align 16 | ||
| 198 | .Ldec_loop: | ||
| 199 | ## | ||
| 200 | ## Inverse mix columns | ||
| 201 | ## | ||
| 202 | movdqa -0x20(%r10),%xmm4 # 4 : sb9u | ||
| 203 | pshufb %xmm2, %xmm4 # 4 = sb9u | ||
| 204 | pxor %xmm0, %xmm4 | ||
| 205 | movdqa -0x10(%r10),%xmm0 # 0 : sb9t | ||
| 206 | pshufb %xmm3, %xmm0 # 0 = sb9t | ||
| 207 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 208 | add \$16, %r9 # next round key | ||
| 209 | |||
| 210 | pshufb %xmm5, %xmm0 # MC ch | ||
| 211 | movdqa 0x00(%r10),%xmm4 # 4 : sbdu | ||
| 212 | pshufb %xmm2, %xmm4 # 4 = sbdu | ||
| 213 | pxor %xmm0, %xmm4 # 4 = ch | ||
| 214 | movdqa 0x10(%r10),%xmm0 # 0 : sbdt | ||
| 215 | pshufb %xmm3, %xmm0 # 0 = sbdt | ||
| 216 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 217 | sub \$1,%rax # nr-- | ||
| 218 | |||
| 219 | pshufb %xmm5, %xmm0 # MC ch | ||
| 220 | movdqa 0x20(%r10),%xmm4 # 4 : sbbu | ||
| 221 | pshufb %xmm2, %xmm4 # 4 = sbbu | ||
| 222 | pxor %xmm0, %xmm4 # 4 = ch | ||
| 223 | movdqa 0x30(%r10),%xmm0 # 0 : sbbt | ||
| 224 | pshufb %xmm3, %xmm0 # 0 = sbbt | ||
| 225 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 226 | |||
| 227 | pshufb %xmm5, %xmm0 # MC ch | ||
| 228 | movdqa 0x40(%r10),%xmm4 # 4 : sbeu | ||
| 229 | pshufb %xmm2, %xmm4 # 4 = sbeu | ||
| 230 | pxor %xmm0, %xmm4 # 4 = ch | ||
| 231 | movdqa 0x50(%r10),%xmm0 # 0 : sbet | ||
| 232 | pshufb %xmm3, %xmm0 # 0 = sbet | ||
| 233 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 234 | |||
| 235 | palignr \$12, %xmm5, %xmm5 | ||
| 236 | |||
| 237 | .Ldec_entry: | ||
| 238 | # top of round | ||
| 239 | movdqa %xmm9, %xmm1 # 1 : i | ||
| 240 | pandn %xmm0, %xmm1 # 1 = i<<4 | ||
| 241 | psrld \$4, %xmm1 # 1 = i | ||
| 242 | pand %xmm9, %xmm0 # 0 = k | ||
| 243 | movdqa %xmm11, %xmm2 # 2 : a/k | ||
| 244 | pshufb %xmm0, %xmm2 # 2 = a/k | ||
| 245 | pxor %xmm1, %xmm0 # 0 = j | ||
| 246 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
| 247 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
| 248 | pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k | ||
| 249 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
| 250 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
| 251 | pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k | ||
| 252 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
| 253 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
| 254 | pxor %xmm0, %xmm2 # 2 = io | ||
| 255 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
| 256 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
| 257 | pxor %xmm1, %xmm3 # 3 = jo | ||
| 258 | movdqu (%r9), %xmm0 | ||
| 259 | jnz .Ldec_loop | ||
| 260 | |||
| 261 | # middle of last round | ||
| 262 | movdqa 0x60(%r10), %xmm4 # 3 : sbou | ||
| 263 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
| 264 | pxor %xmm0, %xmm4 # 4 = sb1u + k | ||
| 265 | movdqa 0x70(%r10), %xmm0 # 0 : sbot | ||
| 266 | movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 | ||
| 267 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 268 | pxor %xmm4, %xmm0 # 0 = A | ||
| 269 | pshufb %xmm2, %xmm0 | ||
| 270 | ret | ||
| 271 | .size _vpaes_decrypt_core,.-_vpaes_decrypt_core | ||
| 272 | |||
| 273 | ######################################################## | ||
| 274 | ## ## | ||
| 275 | ## AES key schedule ## | ||
| 276 | ## ## | ||
| 277 | ######################################################## | ||
| 278 | .type _vpaes_schedule_core,\@abi-omnipotent | ||
| 279 | .align 16 | ||
| 280 | _vpaes_schedule_core: | ||
| 281 | # rdi = key | ||
| 282 | # rsi = size in bits | ||
| 283 | # rdx = buffer | ||
| 284 | # rcx = direction. 0=encrypt, 1=decrypt | ||
| 285 | |||
| 286 | call _vpaes_preheat # load the tables | ||
| 287 | movdqa .Lk_rcon(%rip), %xmm8 # load rcon | ||
| 288 | movdqu (%rdi), %xmm0 # load key (unaligned) | ||
| 289 | |||
| 290 | # input transform | ||
| 291 | movdqa %xmm0, %xmm3 | ||
| 292 | lea .Lk_ipt(%rip), %r11 | ||
| 293 | call _vpaes_schedule_transform | ||
| 294 | movdqa %xmm0, %xmm7 | ||
| 295 | |||
| 296 | lea .Lk_sr(%rip),%r10 | ||
| 297 | test %rcx, %rcx | ||
| 298 | jnz .Lschedule_am_decrypting | ||
| 299 | |||
| 300 | # encrypting, output zeroth round key after transform | ||
| 301 | movdqu %xmm0, (%rdx) | ||
| 302 | jmp .Lschedule_go | ||
| 303 | |||
| 304 | .Lschedule_am_decrypting: | ||
| 305 | # decrypting, output zeroth round key after shiftrows | ||
| 306 | movdqa (%r8,%r10),%xmm1 | ||
| 307 | pshufb %xmm1, %xmm3 | ||
| 308 | movdqu %xmm3, (%rdx) | ||
| 309 | xor \$0x30, %r8 | ||
| 310 | |||
| 311 | .Lschedule_go: | ||
| 312 | cmp \$192, %esi | ||
| 313 | ja .Lschedule_256 | ||
| 314 | je .Lschedule_192 | ||
| 315 | # 128: fall through | ||
| 316 | |||
| 317 | ## | ||
| 318 | ## .schedule_128 | ||
| 319 | ## | ||
| 320 | ## 128-bit specific part of key schedule. | ||
| 321 | ## | ||
| 322 | ## This schedule is really simple, because all its parts | ||
| 323 | ## are accomplished by the subroutines. | ||
| 324 | ## | ||
| 325 | .Lschedule_128: | ||
| 326 | mov \$10, %esi | ||
| 327 | |||
| 328 | .Loop_schedule_128: | ||
| 329 | call _vpaes_schedule_round | ||
| 330 | dec %rsi | ||
| 331 | jz .Lschedule_mangle_last | ||
| 332 | call _vpaes_schedule_mangle # write output | ||
| 333 | jmp .Loop_schedule_128 | ||
| 334 | |||
| 335 | ## | ||
| 336 | ## .aes_schedule_192 | ||
| 337 | ## | ||
| 338 | ## 192-bit specific part of key schedule. | ||
| 339 | ## | ||
| 340 | ## The main body of this schedule is the same as the 128-bit | ||
| 341 | ## schedule, but with more smearing. The long, high side is | ||
| 342 | ## stored in %xmm7 as before, and the short, low side is in | ||
| 343 | ## the high bits of %xmm6. | ||
| 344 | ## | ||
| 345 | ## This schedule is somewhat nastier, however, because each | ||
| 346 | ## round produces 192 bits of key material, or 1.5 round keys. | ||
| 347 | ## Therefore, on each cycle we do 2 rounds and produce 3 round | ||
| 348 | ## keys. | ||
| 349 | ## | ||
| 350 | .align 16 | ||
| 351 | .Lschedule_192: | ||
| 352 | movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) | ||
| 353 | call _vpaes_schedule_transform # input transform | ||
| 354 | movdqa %xmm0, %xmm6 # save short part | ||
| 355 | pxor %xmm4, %xmm4 # clear 4 | ||
| 356 | movhlps %xmm4, %xmm6 # clobber low side with zeros | ||
| 357 | mov \$4, %esi | ||
| 358 | |||
| 359 | .Loop_schedule_192: | ||
| 360 | call _vpaes_schedule_round | ||
| 361 | palignr \$8,%xmm6,%xmm0 | ||
| 362 | call _vpaes_schedule_mangle # save key n | ||
| 363 | call _vpaes_schedule_192_smear | ||
| 364 | call _vpaes_schedule_mangle # save key n+1 | ||
| 365 | call _vpaes_schedule_round | ||
| 366 | dec %rsi | ||
| 367 | jz .Lschedule_mangle_last | ||
| 368 | call _vpaes_schedule_mangle # save key n+2 | ||
| 369 | call _vpaes_schedule_192_smear | ||
| 370 | jmp .Loop_schedule_192 | ||
| 371 | |||
| 372 | ## | ||
| 373 | ## .aes_schedule_256 | ||
| 374 | ## | ||
| 375 | ## 256-bit specific part of key schedule. | ||
| 376 | ## | ||
| 377 | ## The structure here is very similar to the 128-bit | ||
| 378 | ## schedule, but with an additional "low side" in | ||
| 379 | ## %xmm6. The low side's rounds are the same as the | ||
| 380 | ## high side's, except no rcon and no rotation. | ||
| 381 | ## | ||
| 382 | .align 16 | ||
| 383 | .Lschedule_256: | ||
| 384 | movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) | ||
| 385 | call _vpaes_schedule_transform # input transform | ||
| 386 | mov \$7, %esi | ||
| 387 | |||
| 388 | .Loop_schedule_256: | ||
| 389 | call _vpaes_schedule_mangle # output low result | ||
| 390 | movdqa %xmm0, %xmm6 # save cur_lo in xmm6 | ||
| 391 | |||
| 392 | # high round | ||
| 393 | call _vpaes_schedule_round | ||
| 394 | dec %rsi | ||
| 395 | jz .Lschedule_mangle_last | ||
| 396 | call _vpaes_schedule_mangle | ||
| 397 | |||
| 398 | # low round. swap xmm7 and xmm6 | ||
| 399 | pshufd \$0xFF, %xmm0, %xmm0 | ||
| 400 | movdqa %xmm7, %xmm5 | ||
| 401 | movdqa %xmm6, %xmm7 | ||
| 402 | call _vpaes_schedule_low_round | ||
| 403 | movdqa %xmm5, %xmm7 | ||
| 404 | |||
| 405 | jmp .Loop_schedule_256 | ||
| 406 | |||
| 407 | |||
| 408 | ## | ||
| 409 | ## .aes_schedule_mangle_last | ||
| 410 | ## | ||
| 411 | ## Mangler for last round of key schedule | ||
| 412 | ## Mangles %xmm0 | ||
| 413 | ## when encrypting, outputs out(%xmm0) ^ 63 | ||
| 414 | ## when decrypting, outputs unskew(%xmm0) | ||
| 415 | ## | ||
| 416 | ## Always called right before return... jumps to cleanup and exits | ||
| 417 | ## | ||
| 418 | .align 16 | ||
| 419 | .Lschedule_mangle_last: | ||
| 420 | # schedule last round key from xmm0 | ||
| 421 | lea .Lk_deskew(%rip),%r11 # prepare to deskew | ||
| 422 | test %rcx, %rcx | ||
| 423 | jnz .Lschedule_mangle_last_dec | ||
| 424 | |||
| 425 | # encrypting | ||
| 426 | movdqa (%r8,%r10),%xmm1 | ||
| 427 | pshufb %xmm1, %xmm0 # output permute | ||
| 428 | lea .Lk_opt(%rip), %r11 # prepare to output transform | ||
| 429 | add \$32, %rdx | ||
| 430 | |||
| 431 | .Lschedule_mangle_last_dec: | ||
| 432 | add \$-16, %rdx | ||
| 433 | pxor .Lk_s63(%rip), %xmm0 | ||
| 434 | call _vpaes_schedule_transform # output transform | ||
| 435 | movdqu %xmm0, (%rdx) # save last key | ||
| 436 | |||
| 437 | # cleanup | ||
| 438 | pxor %xmm0, %xmm0 | ||
| 439 | pxor %xmm1, %xmm1 | ||
| 440 | pxor %xmm2, %xmm2 | ||
| 441 | pxor %xmm3, %xmm3 | ||
| 442 | pxor %xmm4, %xmm4 | ||
| 443 | pxor %xmm5, %xmm5 | ||
| 444 | pxor %xmm6, %xmm6 | ||
| 445 | pxor %xmm7, %xmm7 | ||
| 446 | ret | ||
| 447 | .size _vpaes_schedule_core,.-_vpaes_schedule_core | ||
| 448 | |||
| 449 | ## | ||
| 450 | ## .aes_schedule_192_smear | ||
| 451 | ## | ||
| 452 | ## Smear the short, low side in the 192-bit key schedule. | ||
| 453 | ## | ||
| 454 | ## Inputs: | ||
| 455 | ## %xmm7: high side, b a x y | ||
| 456 | ## %xmm6: low side, d c 0 0 | ||
| 457 | ## %xmm13: 0 | ||
| 458 | ## | ||
| 459 | ## Outputs: | ||
| 460 | ## %xmm6: b+c+d b+c 0 0 | ||
| 461 | ## %xmm0: b+c+d b+c b a | ||
| 462 | ## | ||
| 463 | .type _vpaes_schedule_192_smear,\@abi-omnipotent | ||
| 464 | .align 16 | ||
| 465 | _vpaes_schedule_192_smear: | ||
| 466 | pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0 | ||
| 467 | pxor %xmm0, %xmm6 # -> c+d c 0 0 | ||
| 468 | pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a | ||
| 469 | pxor %xmm0, %xmm6 # -> b+c+d b+c b a | ||
| 470 | movdqa %xmm6, %xmm0 | ||
| 471 | pxor %xmm1, %xmm1 | ||
| 472 | movhlps %xmm1, %xmm6 # clobber low side with zeros | ||
| 473 | ret | ||
| 474 | .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear | ||
| 475 | |||
| 476 | ## | ||
| 477 | ## .aes_schedule_round | ||
| 478 | ## | ||
| 479 | ## Runs one main round of the key schedule on %xmm0, %xmm7 | ||
| 480 | ## | ||
| 481 | ## Specifically, runs subbytes on the high dword of %xmm0 | ||
| 482 | ## then rotates it by one byte and xors into the low dword of | ||
| 483 | ## %xmm7. | ||
| 484 | ## | ||
| 485 | ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for | ||
| 486 | ## next rcon. | ||
| 487 | ## | ||
| 488 | ## Smears the dwords of %xmm7 by xoring the low into the | ||
| 489 | ## second low, result into third, result into highest. | ||
| 490 | ## | ||
| 491 | ## Returns results in %xmm7 = %xmm0. | ||
| 492 | ## Clobbers %xmm1-%xmm4, %r11. | ||
| 493 | ## | ||
| 494 | .type _vpaes_schedule_round,\@abi-omnipotent | ||
| 495 | .align 16 | ||
| 496 | _vpaes_schedule_round: | ||
| 497 | # extract rcon from xmm8 | ||
| 498 | pxor %xmm1, %xmm1 | ||
| 499 | palignr \$15, %xmm8, %xmm1 | ||
| 500 | palignr \$15, %xmm8, %xmm8 | ||
| 501 | pxor %xmm1, %xmm7 | ||
| 502 | |||
| 503 | # rotate | ||
| 504 | pshufd \$0xFF, %xmm0, %xmm0 | ||
| 505 | palignr \$1, %xmm0, %xmm0 | ||
| 506 | |||
| 507 | # fall through... | ||
| 508 | |||
| 509 | # low round: same as high round, but no rotation and no rcon. | ||
| 510 | _vpaes_schedule_low_round: | ||
| 511 | # smear xmm7 | ||
| 512 | movdqa %xmm7, %xmm1 | ||
| 513 | pslldq \$4, %xmm7 | ||
| 514 | pxor %xmm1, %xmm7 | ||
| 515 | movdqa %xmm7, %xmm1 | ||
| 516 | pslldq \$8, %xmm7 | ||
| 517 | pxor %xmm1, %xmm7 | ||
| 518 | pxor .Lk_s63(%rip), %xmm7 | ||
| 519 | |||
| 520 | # subbytes | ||
| 521 | movdqa %xmm9, %xmm1 | ||
| 522 | pandn %xmm0, %xmm1 | ||
| 523 | psrld \$4, %xmm1 # 1 = i | ||
| 524 | pand %xmm9, %xmm0 # 0 = k | ||
| 525 | movdqa %xmm11, %xmm2 # 2 : a/k | ||
| 526 | pshufb %xmm0, %xmm2 # 2 = a/k | ||
| 527 | pxor %xmm1, %xmm0 # 0 = j | ||
| 528 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
| 529 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
| 530 | pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k | ||
| 531 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
| 532 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
| 533 | pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k | ||
| 534 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
| 535 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
| 536 | pxor %xmm0, %xmm2 # 2 = io | ||
| 537 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
| 538 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
| 539 | pxor %xmm1, %xmm3 # 3 = jo | ||
| 540 | movdqa %xmm13, %xmm4 # 4 : sbou | ||
| 541 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
| 542 | movdqa %xmm12, %xmm0 # 0 : sbot | ||
| 543 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 544 | pxor %xmm4, %xmm0 # 0 = sbox output | ||
| 545 | |||
| 546 | # add in smeared stuff | ||
| 547 | pxor %xmm7, %xmm0 | ||
| 548 | movdqa %xmm0, %xmm7 | ||
| 549 | ret | ||
| 550 | .size _vpaes_schedule_round,.-_vpaes_schedule_round | ||
| 551 | |||
| 552 | ## | ||
| 553 | ## .aes_schedule_transform | ||
| 554 | ## | ||
| 555 | ## Linear-transform %xmm0 according to tables at (%r11) | ||
| 556 | ## | ||
| 557 | ## Requires that %xmm9 = 0x0F0F... as in preheat | ||
| 558 | ## Output in %xmm0 | ||
| 559 | ## Clobbers %xmm1, %xmm2 | ||
| 560 | ## | ||
| 561 | .type _vpaes_schedule_transform,\@abi-omnipotent | ||
| 562 | .align 16 | ||
| 563 | _vpaes_schedule_transform: | ||
| 564 | movdqa %xmm9, %xmm1 | ||
| 565 | pandn %xmm0, %xmm1 | ||
| 566 | psrld \$4, %xmm1 | ||
| 567 | pand %xmm9, %xmm0 | ||
| 568 | movdqa (%r11), %xmm2 # lo | ||
| 569 | pshufb %xmm0, %xmm2 | ||
| 570 | movdqa 16(%r11), %xmm0 # hi | ||
| 571 | pshufb %xmm1, %xmm0 | ||
| 572 | pxor %xmm2, %xmm0 | ||
| 573 | ret | ||
| 574 | .size _vpaes_schedule_transform,.-_vpaes_schedule_transform | ||
| 575 | |||
| 576 | ## | ||
| 577 | ## .aes_schedule_mangle | ||
| 578 | ## | ||
| 579 | ## Mangle xmm0 from (basis-transformed) standard version | ||
| 580 | ## to our version. | ||
| 581 | ## | ||
| 582 | ## On encrypt, | ||
| 583 | ## xor with 0x63 | ||
| 584 | ## multiply by circulant 0,1,1,1 | ||
| 585 | ## apply shiftrows transform | ||
| 586 | ## | ||
| 587 | ## On decrypt, | ||
| 588 | ## xor with 0x63 | ||
| 589 | ## multiply by "inverse mixcolumns" circulant E,B,D,9 | ||
| 590 | ## deskew | ||
| 591 | ## apply shiftrows transform | ||
| 592 | ## | ||
| 593 | ## | ||
| 594 | ## Writes out to (%rdx), and increments or decrements it | ||
| 595 | ## Keeps track of round number mod 4 in %r8 | ||
| 596 | ## Preserves xmm0 | ||
| 597 | ## Clobbers xmm1-xmm5 | ||
| 598 | ## | ||
| 599 | .type _vpaes_schedule_mangle,\@abi-omnipotent | ||
| 600 | .align 16 | ||
| 601 | _vpaes_schedule_mangle: | ||
| 602 | movdqa %xmm0, %xmm4 # save xmm0 for later | ||
| 603 | movdqa .Lk_mc_forward(%rip),%xmm5 | ||
| 604 | test %rcx, %rcx | ||
| 605 | jnz .Lschedule_mangle_dec | ||
| 606 | |||
| 607 | # encrypting | ||
| 608 | add \$16, %rdx | ||
| 609 | pxor .Lk_s63(%rip),%xmm4 | ||
| 610 | pshufb %xmm5, %xmm4 | ||
| 611 | movdqa %xmm4, %xmm3 | ||
| 612 | pshufb %xmm5, %xmm4 | ||
| 613 | pxor %xmm4, %xmm3 | ||
| 614 | pshufb %xmm5, %xmm4 | ||
| 615 | pxor %xmm4, %xmm3 | ||
| 616 | |||
| 617 | jmp .Lschedule_mangle_both | ||
| 618 | .align 16 | ||
| 619 | .Lschedule_mangle_dec: | ||
| 620 | # inverse mix columns | ||
| 621 | lea .Lk_dksd(%rip),%r11 | ||
| 622 | movdqa %xmm9, %xmm1 | ||
| 623 | pandn %xmm4, %xmm1 | ||
| 624 | psrld \$4, %xmm1 # 1 = hi | ||
| 625 | pand %xmm9, %xmm4 # 4 = lo | ||
| 626 | |||
| 627 | movdqa 0x00(%r11), %xmm2 | ||
| 628 | pshufb %xmm4, %xmm2 | ||
| 629 | movdqa 0x10(%r11), %xmm3 | ||
| 630 | pshufb %xmm1, %xmm3 | ||
| 631 | pxor %xmm2, %xmm3 | ||
| 632 | pshufb %xmm5, %xmm3 | ||
| 633 | |||
| 634 | movdqa 0x20(%r11), %xmm2 | ||
| 635 | pshufb %xmm4, %xmm2 | ||
| 636 | pxor %xmm3, %xmm2 | ||
| 637 | movdqa 0x30(%r11), %xmm3 | ||
| 638 | pshufb %xmm1, %xmm3 | ||
| 639 | pxor %xmm2, %xmm3 | ||
| 640 | pshufb %xmm5, %xmm3 | ||
| 641 | |||
| 642 | movdqa 0x40(%r11), %xmm2 | ||
| 643 | pshufb %xmm4, %xmm2 | ||
| 644 | pxor %xmm3, %xmm2 | ||
| 645 | movdqa 0x50(%r11), %xmm3 | ||
| 646 | pshufb %xmm1, %xmm3 | ||
| 647 | pxor %xmm2, %xmm3 | ||
| 648 | pshufb %xmm5, %xmm3 | ||
| 649 | |||
| 650 | movdqa 0x60(%r11), %xmm2 | ||
| 651 | pshufb %xmm4, %xmm2 | ||
| 652 | pxor %xmm3, %xmm2 | ||
| 653 | movdqa 0x70(%r11), %xmm3 | ||
| 654 | pshufb %xmm1, %xmm3 | ||
| 655 | pxor %xmm2, %xmm3 | ||
| 656 | |||
| 657 | add \$-16, %rdx | ||
| 658 | |||
| 659 | .Lschedule_mangle_both: | ||
| 660 | movdqa (%r8,%r10),%xmm1 | ||
| 661 | pshufb %xmm1,%xmm3 | ||
| 662 | add \$-16, %r8 | ||
| 663 | and \$0x30, %r8 | ||
| 664 | movdqu %xmm3, (%rdx) | ||
| 665 | ret | ||
| 666 | .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle | ||
| 667 | |||
| 668 | # | ||
| 669 | # Interface to OpenSSL | ||
| 670 | # | ||
| 671 | .globl ${PREFIX}_set_encrypt_key | ||
| 672 | .type ${PREFIX}_set_encrypt_key,\@function,3 | ||
| 673 | .align 16 | ||
| 674 | ${PREFIX}_set_encrypt_key: | ||
| 675 | ___ | ||
| 676 | $code.=<<___ if ($win64); | ||
| 677 | lea -0xb8(%rsp),%rsp | ||
| 678 | movaps %xmm6,0x10(%rsp) | ||
| 679 | movaps %xmm7,0x20(%rsp) | ||
| 680 | movaps %xmm8,0x30(%rsp) | ||
| 681 | movaps %xmm9,0x40(%rsp) | ||
| 682 | movaps %xmm10,0x50(%rsp) | ||
| 683 | movaps %xmm11,0x60(%rsp) | ||
| 684 | movaps %xmm12,0x70(%rsp) | ||
| 685 | movaps %xmm13,0x80(%rsp) | ||
| 686 | movaps %xmm14,0x90(%rsp) | ||
| 687 | movaps %xmm15,0xa0(%rsp) | ||
| 688 | .Lenc_key_body: | ||
| 689 | ___ | ||
| 690 | $code.=<<___; | ||
| 691 | mov %esi,%eax | ||
| 692 | shr \$5,%eax | ||
| 693 | add \$5,%eax | ||
| 694 | mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; | ||
| 695 | |||
| 696 | mov \$0,%ecx | ||
| 697 | mov \$0x30,%r8d | ||
| 698 | call _vpaes_schedule_core | ||
| 699 | ___ | ||
| 700 | $code.=<<___ if ($win64); | ||
| 701 | movaps 0x10(%rsp),%xmm6 | ||
| 702 | movaps 0x20(%rsp),%xmm7 | ||
| 703 | movaps 0x30(%rsp),%xmm8 | ||
| 704 | movaps 0x40(%rsp),%xmm9 | ||
| 705 | movaps 0x50(%rsp),%xmm10 | ||
| 706 | movaps 0x60(%rsp),%xmm11 | ||
| 707 | movaps 0x70(%rsp),%xmm12 | ||
| 708 | movaps 0x80(%rsp),%xmm13 | ||
| 709 | movaps 0x90(%rsp),%xmm14 | ||
| 710 | movaps 0xa0(%rsp),%xmm15 | ||
| 711 | lea 0xb8(%rsp),%rsp | ||
| 712 | .Lenc_key_epilogue: | ||
| 713 | ___ | ||
| 714 | $code.=<<___; | ||
| 715 | xor %eax,%eax | ||
| 716 | ret | ||
| 717 | .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key | ||
| 718 | |||
| 719 | .globl ${PREFIX}_set_decrypt_key | ||
| 720 | .type ${PREFIX}_set_decrypt_key,\@function,3 | ||
| 721 | .align 16 | ||
| 722 | ${PREFIX}_set_decrypt_key: | ||
| 723 | ___ | ||
| 724 | $code.=<<___ if ($win64); | ||
| 725 | lea -0xb8(%rsp),%rsp | ||
| 726 | movaps %xmm6,0x10(%rsp) | ||
| 727 | movaps %xmm7,0x20(%rsp) | ||
| 728 | movaps %xmm8,0x30(%rsp) | ||
| 729 | movaps %xmm9,0x40(%rsp) | ||
| 730 | movaps %xmm10,0x50(%rsp) | ||
| 731 | movaps %xmm11,0x60(%rsp) | ||
| 732 | movaps %xmm12,0x70(%rsp) | ||
| 733 | movaps %xmm13,0x80(%rsp) | ||
| 734 | movaps %xmm14,0x90(%rsp) | ||
| 735 | movaps %xmm15,0xa0(%rsp) | ||
| 736 | .Ldec_key_body: | ||
| 737 | ___ | ||
| 738 | $code.=<<___; | ||
| 739 | mov %esi,%eax | ||
| 740 | shr \$5,%eax | ||
| 741 | add \$5,%eax | ||
| 742 | mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; | ||
| 743 | shl \$4,%eax | ||
| 744 | lea 16(%rdx,%rax),%rdx | ||
| 745 | |||
| 746 | mov \$1,%ecx | ||
| 747 | mov %esi,%r8d | ||
| 748 | shr \$1,%r8d | ||
| 749 | and \$32,%r8d | ||
| 750 | xor \$32,%r8d # nbits==192?0:32 | ||
| 751 | call _vpaes_schedule_core | ||
| 752 | ___ | ||
| 753 | $code.=<<___ if ($win64); | ||
| 754 | movaps 0x10(%rsp),%xmm6 | ||
| 755 | movaps 0x20(%rsp),%xmm7 | ||
| 756 | movaps 0x30(%rsp),%xmm8 | ||
| 757 | movaps 0x40(%rsp),%xmm9 | ||
| 758 | movaps 0x50(%rsp),%xmm10 | ||
| 759 | movaps 0x60(%rsp),%xmm11 | ||
| 760 | movaps 0x70(%rsp),%xmm12 | ||
| 761 | movaps 0x80(%rsp),%xmm13 | ||
| 762 | movaps 0x90(%rsp),%xmm14 | ||
| 763 | movaps 0xa0(%rsp),%xmm15 | ||
| 764 | lea 0xb8(%rsp),%rsp | ||
| 765 | .Ldec_key_epilogue: | ||
| 766 | ___ | ||
| 767 | $code.=<<___; | ||
| 768 | xor %eax,%eax | ||
| 769 | ret | ||
| 770 | .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key | ||
| 771 | |||
| 772 | .globl ${PREFIX}_encrypt | ||
| 773 | .type ${PREFIX}_encrypt,\@function,3 | ||
| 774 | .align 16 | ||
| 775 | ${PREFIX}_encrypt: | ||
| 776 | ___ | ||
| 777 | $code.=<<___ if ($win64); | ||
| 778 | lea -0xb8(%rsp),%rsp | ||
| 779 | movaps %xmm6,0x10(%rsp) | ||
| 780 | movaps %xmm7,0x20(%rsp) | ||
| 781 | movaps %xmm8,0x30(%rsp) | ||
| 782 | movaps %xmm9,0x40(%rsp) | ||
| 783 | movaps %xmm10,0x50(%rsp) | ||
| 784 | movaps %xmm11,0x60(%rsp) | ||
| 785 | movaps %xmm12,0x70(%rsp) | ||
| 786 | movaps %xmm13,0x80(%rsp) | ||
| 787 | movaps %xmm14,0x90(%rsp) | ||
| 788 | movaps %xmm15,0xa0(%rsp) | ||
| 789 | .Lenc_body: | ||
| 790 | ___ | ||
| 791 | $code.=<<___; | ||
| 792 | movdqu (%rdi),%xmm0 | ||
| 793 | call _vpaes_preheat | ||
| 794 | call _vpaes_encrypt_core | ||
| 795 | movdqu %xmm0,(%rsi) | ||
| 796 | ___ | ||
| 797 | $code.=<<___ if ($win64); | ||
| 798 | movaps 0x10(%rsp),%xmm6 | ||
| 799 | movaps 0x20(%rsp),%xmm7 | ||
| 800 | movaps 0x30(%rsp),%xmm8 | ||
| 801 | movaps 0x40(%rsp),%xmm9 | ||
| 802 | movaps 0x50(%rsp),%xmm10 | ||
| 803 | movaps 0x60(%rsp),%xmm11 | ||
| 804 | movaps 0x70(%rsp),%xmm12 | ||
| 805 | movaps 0x80(%rsp),%xmm13 | ||
| 806 | movaps 0x90(%rsp),%xmm14 | ||
| 807 | movaps 0xa0(%rsp),%xmm15 | ||
| 808 | lea 0xb8(%rsp),%rsp | ||
| 809 | .Lenc_epilogue: | ||
| 810 | ___ | ||
| 811 | $code.=<<___; | ||
| 812 | ret | ||
| 813 | .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt | ||
| 814 | |||
| 815 | .globl ${PREFIX}_decrypt | ||
| 816 | .type ${PREFIX}_decrypt,\@function,3 | ||
| 817 | .align 16 | ||
| 818 | ${PREFIX}_decrypt: | ||
| 819 | ___ | ||
| 820 | $code.=<<___ if ($win64); | ||
| 821 | lea -0xb8(%rsp),%rsp | ||
| 822 | movaps %xmm6,0x10(%rsp) | ||
| 823 | movaps %xmm7,0x20(%rsp) | ||
| 824 | movaps %xmm8,0x30(%rsp) | ||
| 825 | movaps %xmm9,0x40(%rsp) | ||
| 826 | movaps %xmm10,0x50(%rsp) | ||
| 827 | movaps %xmm11,0x60(%rsp) | ||
| 828 | movaps %xmm12,0x70(%rsp) | ||
| 829 | movaps %xmm13,0x80(%rsp) | ||
| 830 | movaps %xmm14,0x90(%rsp) | ||
| 831 | movaps %xmm15,0xa0(%rsp) | ||
| 832 | .Ldec_body: | ||
| 833 | ___ | ||
| 834 | $code.=<<___; | ||
| 835 | movdqu (%rdi),%xmm0 | ||
| 836 | call _vpaes_preheat | ||
| 837 | call _vpaes_decrypt_core | ||
| 838 | movdqu %xmm0,(%rsi) | ||
| 839 | ___ | ||
| 840 | $code.=<<___ if ($win64); | ||
| 841 | movaps 0x10(%rsp),%xmm6 | ||
| 842 | movaps 0x20(%rsp),%xmm7 | ||
| 843 | movaps 0x30(%rsp),%xmm8 | ||
| 844 | movaps 0x40(%rsp),%xmm9 | ||
| 845 | movaps 0x50(%rsp),%xmm10 | ||
| 846 | movaps 0x60(%rsp),%xmm11 | ||
| 847 | movaps 0x70(%rsp),%xmm12 | ||
| 848 | movaps 0x80(%rsp),%xmm13 | ||
| 849 | movaps 0x90(%rsp),%xmm14 | ||
| 850 | movaps 0xa0(%rsp),%xmm15 | ||
| 851 | lea 0xb8(%rsp),%rsp | ||
| 852 | .Ldec_epilogue: | ||
| 853 | ___ | ||
| 854 | $code.=<<___; | ||
| 855 | ret | ||
| 856 | .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt | ||
| 857 | ___ | ||
| 858 | { | ||
| 859 | my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | ||
| 860 | # void AES_cbc_encrypt (const void char *inp, unsigned char *out, | ||
| 861 | # size_t length, const AES_KEY *key, | ||
| 862 | # unsigned char *ivp,const int enc); | ||
| 863 | $code.=<<___; | ||
| 864 | .globl ${PREFIX}_cbc_encrypt | ||
| 865 | .type ${PREFIX}_cbc_encrypt,\@function,6 | ||
| 866 | .align 16 | ||
| 867 | ${PREFIX}_cbc_encrypt: | ||
| 868 | xchg $key,$len | ||
| 869 | ___ | ||
| 870 | ($len,$key)=($key,$len); | ||
| 871 | $code.=<<___; | ||
| 872 | sub \$16,$len | ||
| 873 | jc .Lcbc_abort | ||
| 874 | ___ | ||
| 875 | $code.=<<___ if ($win64); | ||
| 876 | lea -0xb8(%rsp),%rsp | ||
| 877 | movaps %xmm6,0x10(%rsp) | ||
| 878 | movaps %xmm7,0x20(%rsp) | ||
| 879 | movaps %xmm8,0x30(%rsp) | ||
| 880 | movaps %xmm9,0x40(%rsp) | ||
| 881 | movaps %xmm10,0x50(%rsp) | ||
| 882 | movaps %xmm11,0x60(%rsp) | ||
| 883 | movaps %xmm12,0x70(%rsp) | ||
| 884 | movaps %xmm13,0x80(%rsp) | ||
| 885 | movaps %xmm14,0x90(%rsp) | ||
| 886 | movaps %xmm15,0xa0(%rsp) | ||
| 887 | .Lcbc_body: | ||
| 888 | ___ | ||
| 889 | $code.=<<___; | ||
| 890 | movdqu ($ivp),%xmm6 # load IV | ||
| 891 | sub $inp,$out | ||
| 892 | call _vpaes_preheat | ||
| 893 | cmp \$0,${enc}d | ||
| 894 | je .Lcbc_dec_loop | ||
| 895 | jmp .Lcbc_enc_loop | ||
| 896 | .align 16 | ||
| 897 | .Lcbc_enc_loop: | ||
| 898 | movdqu ($inp),%xmm0 | ||
| 899 | pxor %xmm6,%xmm0 | ||
| 900 | call _vpaes_encrypt_core | ||
| 901 | movdqa %xmm0,%xmm6 | ||
| 902 | movdqu %xmm0,($out,$inp) | ||
| 903 | lea 16($inp),$inp | ||
| 904 | sub \$16,$len | ||
| 905 | jnc .Lcbc_enc_loop | ||
| 906 | jmp .Lcbc_done | ||
| 907 | .align 16 | ||
| 908 | .Lcbc_dec_loop: | ||
| 909 | movdqu ($inp),%xmm0 | ||
| 910 | movdqa %xmm0,%xmm7 | ||
| 911 | call _vpaes_decrypt_core | ||
| 912 | pxor %xmm6,%xmm0 | ||
| 913 | movdqa %xmm7,%xmm6 | ||
| 914 | movdqu %xmm0,($out,$inp) | ||
| 915 | lea 16($inp),$inp | ||
| 916 | sub \$16,$len | ||
| 917 | jnc .Lcbc_dec_loop | ||
| 918 | .Lcbc_done: | ||
| 919 | movdqu %xmm6,($ivp) # save IV | ||
| 920 | ___ | ||
| 921 | $code.=<<___ if ($win64); | ||
| 922 | movaps 0x10(%rsp),%xmm6 | ||
| 923 | movaps 0x20(%rsp),%xmm7 | ||
| 924 | movaps 0x30(%rsp),%xmm8 | ||
| 925 | movaps 0x40(%rsp),%xmm9 | ||
| 926 | movaps 0x50(%rsp),%xmm10 | ||
| 927 | movaps 0x60(%rsp),%xmm11 | ||
| 928 | movaps 0x70(%rsp),%xmm12 | ||
| 929 | movaps 0x80(%rsp),%xmm13 | ||
| 930 | movaps 0x90(%rsp),%xmm14 | ||
| 931 | movaps 0xa0(%rsp),%xmm15 | ||
| 932 | lea 0xb8(%rsp),%rsp | ||
| 933 | .Lcbc_epilogue: | ||
| 934 | ___ | ||
| 935 | $code.=<<___; | ||
| 936 | .Lcbc_abort: | ||
| 937 | ret | ||
| 938 | .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt | ||
| 939 | ___ | ||
| 940 | } | ||
| 941 | $code.=<<___; | ||
| 942 | ## | ||
| 943 | ## _aes_preheat | ||
| 944 | ## | ||
| 945 | ## Fills register %r10 -> .aes_consts (so you can -fPIC) | ||
| 946 | ## and %xmm9-%xmm15 as specified below. | ||
| 947 | ## | ||
| 948 | .type _vpaes_preheat,\@abi-omnipotent | ||
| 949 | .align 16 | ||
| 950 | _vpaes_preheat: | ||
| 951 | lea .Lk_s0F(%rip), %r10 | ||
| 952 | movdqa -0x20(%r10), %xmm10 # .Lk_inv | ||
| 953 | movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 | ||
| 954 | movdqa 0x00(%r10), %xmm9 # .Lk_s0F | ||
| 955 | movdqa 0x30(%r10), %xmm13 # .Lk_sb1 | ||
| 956 | movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 | ||
| 957 | movdqa 0x50(%r10), %xmm15 # .Lk_sb2 | ||
| 958 | movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 | ||
| 959 | ret | ||
| 960 | .size _vpaes_preheat,.-_vpaes_preheat | ||
| 961 | ######################################################## | ||
| 962 | ## ## | ||
| 963 | ## Constants ## | ||
| 964 | ## ## | ||
| 965 | ######################################################## | ||
| 966 | .type _vpaes_consts,\@object | ||
| 967 | .align 64 | ||
| 968 | _vpaes_consts: | ||
| 969 | .Lk_inv: # inv, inva | ||
| 970 | .quad 0x0E05060F0D080180, 0x040703090A0B0C02 | ||
| 971 | .quad 0x01040A060F0B0780, 0x030D0E0C02050809 | ||
| 972 | |||
| 973 | .Lk_s0F: # s0F | ||
| 974 | .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F | ||
| 975 | |||
| 976 | .Lk_ipt: # input transform (lo, hi) | ||
| 977 | .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 | ||
| 978 | .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 | ||
| 979 | |||
| 980 | .Lk_sb1: # sb1u, sb1t | ||
| 981 | .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 | ||
| 982 | .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF | ||
| 983 | .Lk_sb2: # sb2u, sb2t | ||
| 984 | .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD | ||
| 985 | .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A | ||
| 986 | .Lk_sbo: # sbou, sbot | ||
| 987 | .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 | ||
| 988 | .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA | ||
| 989 | |||
| 990 | .Lk_mc_forward: # mc_forward | ||
| 991 | .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 | ||
| 992 | .quad 0x080B0A0904070605, 0x000302010C0F0E0D | ||
| 993 | .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 | ||
| 994 | .quad 0x000302010C0F0E0D, 0x080B0A0904070605 | ||
| 995 | |||
| 996 | .Lk_mc_backward:# mc_backward | ||
| 997 | .quad 0x0605040702010003, 0x0E0D0C0F0A09080B | ||
| 998 | .quad 0x020100030E0D0C0F, 0x0A09080B06050407 | ||
| 999 | .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 | ||
| 1000 | .quad 0x0A09080B06050407, 0x020100030E0D0C0F | ||
| 1001 | |||
| 1002 | .Lk_sr: # sr | ||
| 1003 | .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 | ||
| 1004 | .quad 0x030E09040F0A0500, 0x0B06010C07020D08 | ||
| 1005 | .quad 0x0F060D040B020900, 0x070E050C030A0108 | ||
| 1006 | .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 | ||
| 1007 | |||
| 1008 | .Lk_rcon: # rcon | ||
| 1009 | .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 | ||
| 1010 | |||
| 1011 | .Lk_s63: # s63: all equal to 0x63 transformed | ||
| 1012 | .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B | ||
| 1013 | |||
| 1014 | .Lk_opt: # output transform | ||
| 1015 | .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 | ||
| 1016 | .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 | ||
| 1017 | |||
| 1018 | .Lk_deskew: # deskew tables: inverts the sbox's "skew" | ||
| 1019 | .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A | ||
| 1020 | .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 | ||
| 1021 | |||
| 1022 | ## | ||
| 1023 | ## Decryption stuff | ||
| 1024 | ## Key schedule constants | ||
| 1025 | ## | ||
| 1026 | .Lk_dksd: # decryption key schedule: invskew x*D | ||
| 1027 | .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 | ||
| 1028 | .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E | ||
| 1029 | .Lk_dksb: # decryption key schedule: invskew x*B | ||
| 1030 | .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 | ||
| 1031 | .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 | ||
| 1032 | .Lk_dkse: # decryption key schedule: invskew x*E + 0x63 | ||
| 1033 | .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 | ||
| 1034 | .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 | ||
| 1035 | .Lk_dks9: # decryption key schedule: invskew x*9 | ||
| 1036 | .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC | ||
| 1037 | .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE | ||
| 1038 | |||
| 1039 | ## | ||
| 1040 | ## Decryption stuff | ||
| 1041 | ## Round function constants | ||
| 1042 | ## | ||
| 1043 | .Lk_dipt: # decryption input transform | ||
| 1044 | .quad 0x0F505B040B545F00, 0x154A411E114E451A | ||
| 1045 | .quad 0x86E383E660056500, 0x12771772F491F194 | ||
| 1046 | |||
| 1047 | .Lk_dsb9: # decryption sbox output *9*u, *9*t | ||
| 1048 | .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 | ||
| 1049 | .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 | ||
| 1050 | .Lk_dsbd: # decryption sbox output *D*u, *D*t | ||
| 1051 | .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 | ||
| 1052 | .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 | ||
| 1053 | .Lk_dsbb: # decryption sbox output *B*u, *B*t | ||
| 1054 | .quad 0xD022649296B44200, 0x602646F6B0F2D404 | ||
| 1055 | .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B | ||
| 1056 | .Lk_dsbe: # decryption sbox output *E*u, *E*t | ||
| 1057 | .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 | ||
| 1058 | .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 | ||
| 1059 | .Lk_dsbo: # decryption sbox final output | ||
| 1060 | .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D | ||
| 1061 | .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C | ||
| 1062 | .asciz "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)" | ||
| 1063 | .align 64 | ||
| 1064 | .size _vpaes_consts,.-_vpaes_consts | ||
| 1065 | ___ | ||
| 1066 | |||
| 1067 | if ($win64) { | ||
| 1068 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 1069 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 1070 | $rec="%rcx"; | ||
| 1071 | $frame="%rdx"; | ||
| 1072 | $context="%r8"; | ||
| 1073 | $disp="%r9"; | ||
| 1074 | |||
| 1075 | $code.=<<___; | ||
| 1076 | .extern __imp_RtlVirtualUnwind | ||
| 1077 | .type se_handler,\@abi-omnipotent | ||
| 1078 | .align 16 | ||
| 1079 | se_handler: | ||
| 1080 | push %rsi | ||
| 1081 | push %rdi | ||
| 1082 | push %rbx | ||
| 1083 | push %rbp | ||
| 1084 | push %r12 | ||
| 1085 | push %r13 | ||
| 1086 | push %r14 | ||
| 1087 | push %r15 | ||
| 1088 | pushfq | ||
| 1089 | sub \$64,%rsp | ||
| 1090 | |||
| 1091 | mov 120($context),%rax # pull context->Rax | ||
| 1092 | mov 248($context),%rbx # pull context->Rip | ||
| 1093 | |||
| 1094 | mov 8($disp),%rsi # disp->ImageBase | ||
| 1095 | mov 56($disp),%r11 # disp->HandlerData | ||
| 1096 | |||
| 1097 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 1098 | lea (%rsi,%r10),%r10 # prologue label | ||
| 1099 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 1100 | jb .Lin_prologue | ||
| 1101 | |||
| 1102 | mov 152($context),%rax # pull context->Rsp | ||
| 1103 | |||
| 1104 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 1105 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 1106 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 1107 | jae .Lin_prologue | ||
| 1108 | |||
| 1109 | lea 16(%rax),%rsi # %xmm save area | ||
| 1110 | lea 512($context),%rdi # &context.Xmm6 | ||
| 1111 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
| 1112 | .long 0xa548f3fc # cld; rep movsq | ||
| 1113 | lea 0xb8(%rax),%rax # adjust stack pointer | ||
| 1114 | |||
| 1115 | .Lin_prologue: | ||
| 1116 | mov 8(%rax),%rdi | ||
| 1117 | mov 16(%rax),%rsi | ||
| 1118 | mov %rax,152($context) # restore context->Rsp | ||
| 1119 | mov %rsi,168($context) # restore context->Rsi | ||
| 1120 | mov %rdi,176($context) # restore context->Rdi | ||
| 1121 | |||
| 1122 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 1123 | mov $context,%rsi # context | ||
| 1124 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
| 1125 | .long 0xa548f3fc # cld; rep movsq | ||
| 1126 | |||
| 1127 | mov $disp,%rsi | ||
| 1128 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 1129 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 1130 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 1131 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 1132 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 1133 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 1134 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 1135 | mov %r10,32(%rsp) # arg5 | ||
| 1136 | mov %r11,40(%rsp) # arg6 | ||
| 1137 | mov %r12,48(%rsp) # arg7 | ||
| 1138 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 1139 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 1140 | |||
| 1141 | mov \$1,%eax # ExceptionContinueSearch | ||
| 1142 | add \$64,%rsp | ||
| 1143 | popfq | ||
| 1144 | pop %r15 | ||
| 1145 | pop %r14 | ||
| 1146 | pop %r13 | ||
| 1147 | pop %r12 | ||
| 1148 | pop %rbp | ||
| 1149 | pop %rbx | ||
| 1150 | pop %rdi | ||
| 1151 | pop %rsi | ||
| 1152 | ret | ||
| 1153 | .size se_handler,.-se_handler | ||
| 1154 | |||
| 1155 | .section .pdata | ||
| 1156 | .align 4 | ||
| 1157 | .rva .LSEH_begin_${PREFIX}_set_encrypt_key | ||
| 1158 | .rva .LSEH_end_${PREFIX}_set_encrypt_key | ||
| 1159 | .rva .LSEH_info_${PREFIX}_set_encrypt_key | ||
| 1160 | |||
| 1161 | .rva .LSEH_begin_${PREFIX}_set_decrypt_key | ||
| 1162 | .rva .LSEH_end_${PREFIX}_set_decrypt_key | ||
| 1163 | .rva .LSEH_info_${PREFIX}_set_decrypt_key | ||
| 1164 | |||
| 1165 | .rva .LSEH_begin_${PREFIX}_encrypt | ||
| 1166 | .rva .LSEH_end_${PREFIX}_encrypt | ||
| 1167 | .rva .LSEH_info_${PREFIX}_encrypt | ||
| 1168 | |||
| 1169 | .rva .LSEH_begin_${PREFIX}_decrypt | ||
| 1170 | .rva .LSEH_end_${PREFIX}_decrypt | ||
| 1171 | .rva .LSEH_info_${PREFIX}_decrypt | ||
| 1172 | |||
| 1173 | .rva .LSEH_begin_${PREFIX}_cbc_encrypt | ||
| 1174 | .rva .LSEH_end_${PREFIX}_cbc_encrypt | ||
| 1175 | .rva .LSEH_info_${PREFIX}_cbc_encrypt | ||
| 1176 | |||
| 1177 | .section .xdata | ||
| 1178 | .align 8 | ||
| 1179 | .LSEH_info_${PREFIX}_set_encrypt_key: | ||
| 1180 | .byte 9,0,0,0 | ||
| 1181 | .rva se_handler | ||
| 1182 | .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[] | ||
| 1183 | .LSEH_info_${PREFIX}_set_decrypt_key: | ||
| 1184 | .byte 9,0,0,0 | ||
| 1185 | .rva se_handler | ||
| 1186 | .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[] | ||
| 1187 | .LSEH_info_${PREFIX}_encrypt: | ||
| 1188 | .byte 9,0,0,0 | ||
| 1189 | .rva se_handler | ||
| 1190 | .rva .Lenc_body,.Lenc_epilogue # HandlerData[] | ||
| 1191 | .LSEH_info_${PREFIX}_decrypt: | ||
| 1192 | .byte 9,0,0,0 | ||
| 1193 | .rva se_handler | ||
| 1194 | .rva .Ldec_body,.Ldec_epilogue # HandlerData[] | ||
| 1195 | .LSEH_info_${PREFIX}_cbc_encrypt: | ||
| 1196 | .byte 9,0,0,0 | ||
| 1197 | .rva se_handler | ||
| 1198 | .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[] | ||
| 1199 | ___ | ||
| 1200 | } | ||
| 1201 | |||
| 1202 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 1203 | |||
| 1204 | print $code; | ||
| 1205 | |||
| 1206 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/arm_arch.h b/src/lib/libcrypto/arm_arch.h new file mode 100644 index 0000000000..5a83107680 --- /dev/null +++ b/src/lib/libcrypto/arm_arch.h | |||
| @@ -0,0 +1,51 @@ | |||
| 1 | #ifndef __ARM_ARCH_H__ | ||
| 2 | #define __ARM_ARCH_H__ | ||
| 3 | |||
| 4 | #if !defined(__ARM_ARCH__) | ||
| 5 | # if defined(__CC_ARM) | ||
| 6 | # define __ARM_ARCH__ __TARGET_ARCH_ARM | ||
| 7 | # if defined(__BIG_ENDIAN) | ||
| 8 | # define __ARMEB__ | ||
| 9 | # else | ||
| 10 | # define __ARMEL__ | ||
| 11 | # endif | ||
| 12 | # elif defined(__GNUC__) | ||
| 13 | /* | ||
| 14 | * Why doesn't gcc define __ARM_ARCH__? Instead it defines | ||
| 15 | * bunch of below macros. See all_architectires[] table in | ||
| 16 | * gcc/config/arm/arm.c. On a side note it defines | ||
| 17 | * __ARMEL__/__ARMEB__ for little-/big-endian. | ||
| 18 | */ | ||
| 19 | # if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ | ||
| 20 | defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \ | ||
| 21 | defined(__ARM_ARCH_7EM__) | ||
| 22 | # define __ARM_ARCH__ 7 | ||
| 23 | # elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ | ||
| 24 | defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \ | ||
| 25 | defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \ | ||
| 26 | defined(__ARM_ARCH_6T2__) | ||
| 27 | # define __ARM_ARCH__ 6 | ||
| 28 | # elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ | ||
| 29 | defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \ | ||
| 30 | defined(__ARM_ARCH_5TEJ__) | ||
| 31 | # define __ARM_ARCH__ 5 | ||
| 32 | # elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) | ||
| 33 | # define __ARM_ARCH__ 4 | ||
| 34 | # else | ||
| 35 | # error "unsupported ARM architecture" | ||
| 36 | # endif | ||
| 37 | # endif | ||
| 38 | #endif | ||
| 39 | |||
| 40 | #ifdef OPENSSL_FIPSCANISTER | ||
| 41 | #include <openssl/fipssyms.h> | ||
| 42 | #endif | ||
| 43 | |||
| 44 | #if !__ASSEMBLER__ | ||
| 45 | extern unsigned int OPENSSL_armcap_P; | ||
| 46 | |||
| 47 | #define ARMV7_NEON (1<<0) | ||
| 48 | #define ARMV7_TICK (1<<1) | ||
| 49 | #endif | ||
| 50 | |||
| 51 | #endif | ||
diff --git a/src/lib/libcrypto/armcap.c b/src/lib/libcrypto/armcap.c new file mode 100644 index 0000000000..5258d2fbdd --- /dev/null +++ b/src/lib/libcrypto/armcap.c | |||
| @@ -0,0 +1,80 @@ | |||
| 1 | #include <stdio.h> | ||
| 2 | #include <stdlib.h> | ||
| 3 | #include <string.h> | ||
| 4 | #include <setjmp.h> | ||
| 5 | #include <signal.h> | ||
| 6 | #include <crypto.h> | ||
| 7 | |||
| 8 | #include "arm_arch.h" | ||
| 9 | |||
| 10 | unsigned int OPENSSL_armcap_P; | ||
| 11 | |||
| 12 | static sigset_t all_masked; | ||
| 13 | |||
| 14 | static sigjmp_buf ill_jmp; | ||
| 15 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } | ||
| 16 | |||
| 17 | /* | ||
| 18 | * Following subroutines could have been inlined, but it's not all | ||
| 19 | * ARM compilers support inline assembler... | ||
| 20 | */ | ||
| 21 | void _armv7_neon_probe(void); | ||
| 22 | unsigned int _armv7_tick(void); | ||
| 23 | |||
| 24 | unsigned int OPENSSL_rdtsc(void) | ||
| 25 | { | ||
| 26 | if (OPENSSL_armcap_P|ARMV7_TICK) | ||
| 27 | return _armv7_tick(); | ||
| 28 | else | ||
| 29 | return 0; | ||
| 30 | } | ||
| 31 | |||
| 32 | #if defined(__GNUC__) && __GNUC__>=2 | ||
| 33 | void OPENSSL_cpuid_setup(void) __attribute__((constructor)); | ||
| 34 | #endif | ||
| 35 | void OPENSSL_cpuid_setup(void) | ||
| 36 | { | ||
| 37 | char *e; | ||
| 38 | struct sigaction ill_oact,ill_act; | ||
| 39 | sigset_t oset; | ||
| 40 | static int trigger=0; | ||
| 41 | |||
| 42 | if (trigger) return; | ||
| 43 | trigger=1; | ||
| 44 | |||
| 45 | if ((e=getenv("OPENSSL_armcap"))) | ||
| 46 | { | ||
| 47 | OPENSSL_armcap_P=strtoul(e,NULL,0); | ||
| 48 | return; | ||
| 49 | } | ||
| 50 | |||
| 51 | sigfillset(&all_masked); | ||
| 52 | sigdelset(&all_masked,SIGILL); | ||
| 53 | sigdelset(&all_masked,SIGTRAP); | ||
| 54 | sigdelset(&all_masked,SIGFPE); | ||
| 55 | sigdelset(&all_masked,SIGBUS); | ||
| 56 | sigdelset(&all_masked,SIGSEGV); | ||
| 57 | |||
| 58 | OPENSSL_armcap_P = 0; | ||
| 59 | |||
| 60 | memset(&ill_act,0,sizeof(ill_act)); | ||
| 61 | ill_act.sa_handler = ill_handler; | ||
| 62 | ill_act.sa_mask = all_masked; | ||
| 63 | |||
| 64 | sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset); | ||
| 65 | sigaction(SIGILL,&ill_act,&ill_oact); | ||
| 66 | |||
| 67 | if (sigsetjmp(ill_jmp,1) == 0) | ||
| 68 | { | ||
| 69 | _armv7_neon_probe(); | ||
| 70 | OPENSSL_armcap_P |= ARMV7_NEON; | ||
| 71 | } | ||
| 72 | if (sigsetjmp(ill_jmp,1) == 0) | ||
| 73 | { | ||
| 74 | _armv7_tick(); | ||
| 75 | OPENSSL_armcap_P |= ARMV7_TICK; | ||
| 76 | } | ||
| 77 | |||
| 78 | sigaction (SIGILL,&ill_oact,NULL); | ||
| 79 | sigprocmask(SIG_SETMASK,&oset,NULL); | ||
| 80 | } | ||
diff --git a/src/lib/libcrypto/armv4cpuid.S b/src/lib/libcrypto/armv4cpuid.S new file mode 100644 index 0000000000..2d618deaa4 --- /dev/null +++ b/src/lib/libcrypto/armv4cpuid.S | |||
| @@ -0,0 +1,154 @@ | |||
| 1 | #include "arm_arch.h" | ||
| 2 | |||
| 3 | .text | ||
| 4 | .code 32 | ||
| 5 | |||
| 6 | .align 5 | ||
| 7 | .global _armv7_neon_probe | ||
| 8 | .type _armv7_neon_probe,%function | ||
| 9 | _armv7_neon_probe: | ||
| 10 | .word 0xf26ee1fe @ vorr q15,q15,q15 | ||
| 11 | .word 0xe12fff1e @ bx lr | ||
| 12 | .size _armv7_neon_probe,.-_armv7_neon_probe | ||
| 13 | |||
| 14 | .global _armv7_tick | ||
| 15 | .type _armv7_tick,%function | ||
| 16 | _armv7_tick: | ||
| 17 | mrc p15,0,r0,c9,c13,0 | ||
| 18 | .word 0xe12fff1e @ bx lr | ||
| 19 | .size _armv7_tick,.-_armv7_tick | ||
| 20 | |||
| 21 | .global OPENSSL_atomic_add | ||
| 22 | .type OPENSSL_atomic_add,%function | ||
| 23 | OPENSSL_atomic_add: | ||
| 24 | #if __ARM_ARCH__>=6 | ||
| 25 | .Ladd: ldrex r2,[r0] | ||
| 26 | add r3,r2,r1 | ||
| 27 | strex r2,r3,[r0] | ||
| 28 | cmp r2,#0 | ||
| 29 | bne .Ladd | ||
| 30 | mov r0,r3 | ||
| 31 | .word 0xe12fff1e @ bx lr | ||
| 32 | #else | ||
| 33 | stmdb sp!,{r4-r6,lr} | ||
| 34 | ldr r2,.Lspinlock | ||
| 35 | adr r3,.Lspinlock | ||
| 36 | mov r4,r0 | ||
| 37 | mov r5,r1 | ||
| 38 | add r6,r3,r2 @ &spinlock | ||
| 39 | b .+8 | ||
| 40 | .Lspin: bl sched_yield | ||
| 41 | mov r0,#-1 | ||
| 42 | swp r0,r0,[r6] | ||
| 43 | cmp r0,#0 | ||
| 44 | bne .Lspin | ||
| 45 | |||
| 46 | ldr r2,[r4] | ||
| 47 | add r2,r2,r5 | ||
| 48 | str r2,[r4] | ||
| 49 | str r0,[r6] @ release spinlock | ||
| 50 | ldmia sp!,{r4-r6,lr} | ||
| 51 | tst lr,#1 | ||
| 52 | moveq pc,lr | ||
| 53 | .word 0xe12fff1e @ bx lr | ||
| 54 | #endif | ||
| 55 | .size OPENSSL_atomic_add,.-OPENSSL_atomic_add | ||
| 56 | |||
| 57 | .global OPENSSL_cleanse | ||
| 58 | .type OPENSSL_cleanse,%function | ||
| 59 | OPENSSL_cleanse: | ||
| 60 | eor ip,ip,ip | ||
| 61 | cmp r1,#7 | ||
| 62 | subhs r1,r1,#4 | ||
| 63 | bhs .Lot | ||
| 64 | cmp r1,#0 | ||
| 65 | beq .Lcleanse_done | ||
| 66 | .Little: | ||
| 67 | strb ip,[r0],#1 | ||
| 68 | subs r1,r1,#1 | ||
| 69 | bhi .Little | ||
| 70 | b .Lcleanse_done | ||
| 71 | |||
| 72 | .Lot: tst r0,#3 | ||
| 73 | beq .Laligned | ||
| 74 | strb ip,[r0],#1 | ||
| 75 | sub r1,r1,#1 | ||
| 76 | b .Lot | ||
| 77 | .Laligned: | ||
| 78 | str ip,[r0],#4 | ||
| 79 | subs r1,r1,#4 | ||
| 80 | bhs .Laligned | ||
| 81 | adds r1,r1,#4 | ||
| 82 | bne .Little | ||
| 83 | .Lcleanse_done: | ||
| 84 | tst lr,#1 | ||
| 85 | moveq pc,lr | ||
| 86 | .word 0xe12fff1e @ bx lr | ||
| 87 | .size OPENSSL_cleanse,.-OPENSSL_cleanse | ||
| 88 | |||
| 89 | .global OPENSSL_wipe_cpu | ||
| 90 | .type OPENSSL_wipe_cpu,%function | ||
| 91 | OPENSSL_wipe_cpu: | ||
| 92 | ldr r0,.LOPENSSL_armcap | ||
| 93 | adr r1,.LOPENSSL_armcap | ||
| 94 | ldr r0,[r1,r0] | ||
| 95 | eor r2,r2,r2 | ||
| 96 | eor r3,r3,r3 | ||
| 97 | eor ip,ip,ip | ||
| 98 | tst r0,#1 | ||
| 99 | beq .Lwipe_done | ||
| 100 | .word 0xf3000150 @ veor q0, q0, q0 | ||
| 101 | .word 0xf3022152 @ veor q1, q1, q1 | ||
| 102 | .word 0xf3044154 @ veor q2, q2, q2 | ||
| 103 | .word 0xf3066156 @ veor q3, q3, q3 | ||
| 104 | .word 0xf34001f0 @ veor q8, q8, q8 | ||
| 105 | .word 0xf34221f2 @ veor q9, q9, q9 | ||
| 106 | .word 0xf34441f4 @ veor q10, q10, q10 | ||
| 107 | .word 0xf34661f6 @ veor q11, q11, q11 | ||
| 108 | .word 0xf34881f8 @ veor q12, q12, q12 | ||
| 109 | .word 0xf34aa1fa @ veor q13, q13, q13 | ||
| 110 | .word 0xf34cc1fc @ veor q14, q14, q14 | ||
| 111 | .word 0xf34ee1fe @ veor q15, q15, q15 | ||
| 112 | .Lwipe_done: | ||
| 113 | mov r0,sp | ||
| 114 | tst lr,#1 | ||
| 115 | moveq pc,lr | ||
| 116 | .word 0xe12fff1e @ bx lr | ||
| 117 | .size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu | ||
| 118 | |||
| 119 | .global OPENSSL_instrument_bus | ||
| 120 | .type OPENSSL_instrument_bus,%function | ||
| 121 | OPENSSL_instrument_bus: | ||
| 122 | eor r0,r0,r0 | ||
| 123 | tst lr,#1 | ||
| 124 | moveq pc,lr | ||
| 125 | .word 0xe12fff1e @ bx lr | ||
| 126 | .size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus | ||
| 127 | |||
| 128 | .global OPENSSL_instrument_bus2 | ||
| 129 | .type OPENSSL_instrument_bus2,%function | ||
| 130 | OPENSSL_instrument_bus2: | ||
| 131 | eor r0,r0,r0 | ||
| 132 | tst lr,#1 | ||
| 133 | moveq pc,lr | ||
| 134 | .word 0xe12fff1e @ bx lr | ||
| 135 | .size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2 | ||
| 136 | |||
| 137 | .align 5 | ||
| 138 | .LOPENSSL_armcap: | ||
| 139 | .word OPENSSL_armcap_P-.LOPENSSL_armcap | ||
| 140 | #if __ARM_ARCH__>=6 | ||
| 141 | .align 5 | ||
| 142 | #else | ||
| 143 | .Lspinlock: | ||
| 144 | .word atomic_add_spinlock-.Lspinlock | ||
| 145 | .align 5 | ||
| 146 | |||
| 147 | .data | ||
| 148 | .align 2 | ||
| 149 | atomic_add_spinlock: | ||
| 150 | .word 0 | ||
| 151 | #endif | ||
| 152 | |||
| 153 | .comm OPENSSL_armcap_P,4,4 | ||
| 154 | .hidden OPENSSL_armcap_P | ||
diff --git a/src/lib/libcrypto/asn1/ameth_lib.c b/src/lib/libcrypto/asn1/ameth_lib.c index 5a581b90ea..a19e058fca 100644 --- a/src/lib/libcrypto/asn1/ameth_lib.c +++ b/src/lib/libcrypto/asn1/ameth_lib.c | |||
| @@ -69,6 +69,7 @@ extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[]; | |||
| 69 | extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth; | 69 | extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth; |
| 70 | extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth; | 70 | extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth; |
| 71 | extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth; | 71 | extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth; |
| 72 | extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth; | ||
| 72 | 73 | ||
| 73 | /* Keep this sorted in type order !! */ | 74 | /* Keep this sorted in type order !! */ |
| 74 | static const EVP_PKEY_ASN1_METHOD *standard_methods[] = | 75 | static const EVP_PKEY_ASN1_METHOD *standard_methods[] = |
| @@ -90,7 +91,8 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] = | |||
| 90 | #ifndef OPENSSL_NO_EC | 91 | #ifndef OPENSSL_NO_EC |
| 91 | &eckey_asn1_meth, | 92 | &eckey_asn1_meth, |
| 92 | #endif | 93 | #endif |
| 93 | &hmac_asn1_meth | 94 | &hmac_asn1_meth, |
| 95 | &cmac_asn1_meth | ||
| 94 | }; | 96 | }; |
| 95 | 97 | ||
| 96 | typedef int sk_cmp_fn_type(const char * const *a, const char * const *b); | 98 | typedef int sk_cmp_fn_type(const char * const *a, const char * const *b); |
| @@ -291,6 +293,8 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags, | |||
| 291 | if (!ameth) | 293 | if (!ameth) |
| 292 | return NULL; | 294 | return NULL; |
| 293 | 295 | ||
| 296 | memset(ameth, 0, sizeof(EVP_PKEY_ASN1_METHOD)); | ||
| 297 | |||
| 294 | ameth->pkey_id = id; | 298 | ameth->pkey_id = id; |
| 295 | ameth->pkey_base_id = id; | 299 | ameth->pkey_base_id = id; |
| 296 | ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC; | 300 | ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC; |
| @@ -325,6 +329,9 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags, | |||
| 325 | ameth->old_priv_encode = 0; | 329 | ameth->old_priv_encode = 0; |
| 326 | ameth->old_priv_decode = 0; | 330 | ameth->old_priv_decode = 0; |
| 327 | 331 | ||
| 332 | ameth->item_verify = 0; | ||
| 333 | ameth->item_sign = 0; | ||
| 334 | |||
| 328 | ameth->pkey_size = 0; | 335 | ameth->pkey_size = 0; |
| 329 | ameth->pkey_bits = 0; | 336 | ameth->pkey_bits = 0; |
| 330 | 337 | ||
| @@ -376,6 +383,9 @@ void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst, | |||
| 376 | dst->pkey_free = src->pkey_free; | 383 | dst->pkey_free = src->pkey_free; |
| 377 | dst->pkey_ctrl = src->pkey_ctrl; | 384 | dst->pkey_ctrl = src->pkey_ctrl; |
| 378 | 385 | ||
| 386 | dst->item_sign = src->item_sign; | ||
| 387 | dst->item_verify = src->item_verify; | ||
| 388 | |||
| 379 | } | 389 | } |
| 380 | 390 | ||
| 381 | void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth) | 391 | void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth) |
diff --git a/src/lib/libcrypto/asn1/asn1_locl.h b/src/lib/libcrypto/asn1/asn1_locl.h index 5aa65e28f5..9fcf0d9530 100644 --- a/src/lib/libcrypto/asn1/asn1_locl.h +++ b/src/lib/libcrypto/asn1/asn1_locl.h | |||
| @@ -102,6 +102,10 @@ struct evp_pkey_asn1_method_st | |||
| 102 | int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); | 102 | int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); |
| 103 | int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, | 103 | int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, |
| 104 | ASN1_PCTX *pctx); | 104 | ASN1_PCTX *pctx); |
| 105 | int (*sig_print)(BIO *out, | ||
| 106 | const X509_ALGOR *sigalg, const ASN1_STRING *sig, | ||
| 107 | int indent, ASN1_PCTX *pctx); | ||
| 108 | |||
| 105 | 109 | ||
| 106 | void (*pkey_free)(EVP_PKEY *pkey); | 110 | void (*pkey_free)(EVP_PKEY *pkey); |
| 107 | int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2); | 111 | int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2); |
| @@ -111,6 +115,13 @@ struct evp_pkey_asn1_method_st | |||
| 111 | int (*old_priv_decode)(EVP_PKEY *pkey, | 115 | int (*old_priv_decode)(EVP_PKEY *pkey, |
| 112 | const unsigned char **pder, int derlen); | 116 | const unsigned char **pder, int derlen); |
| 113 | int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder); | 117 | int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder); |
| 118 | /* Custom ASN1 signature verification */ | ||
| 119 | int (*item_verify)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, | ||
| 120 | X509_ALGOR *a, ASN1_BIT_STRING *sig, | ||
| 121 | EVP_PKEY *pkey); | ||
| 122 | int (*item_sign)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, | ||
| 123 | X509_ALGOR *alg1, X509_ALGOR *alg2, | ||
| 124 | ASN1_BIT_STRING *sig); | ||
| 114 | 125 | ||
| 115 | } /* EVP_PKEY_ASN1_METHOD */; | 126 | } /* EVP_PKEY_ASN1_METHOD */; |
| 116 | 127 | ||
diff --git a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl new file mode 100644 index 0000000000..c52e0b75b5 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl | |||
| @@ -0,0 +1,278 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication | ||
| 13 | # used in bn_gf2m.c. It's kind of low-hanging mechanical port from | ||
| 14 | # C for the time being... Except that it has two code paths: pure | ||
| 15 | # integer code suitable for any ARMv4 and later CPU and NEON code | ||
| 16 | # suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs | ||
| 17 | # in ~45 cycles on dual-issue core such as Cortex A8, which is ~50% | ||
| 18 | # faster than compiler-generated code. For ECDH and ECDSA verify (but | ||
| 19 | # not for ECDSA sign) it means 25%-45% improvement depending on key | ||
| 20 | # length, more for longer keys. Even though NEON 1x1 multiplication | ||
| 21 | # runs in even less cycles, ~30, improvement is measurable only on | ||
| 22 | # longer keys. One has to optimize code elsewhere to get NEON glow... | ||
| 23 | |||
| 24 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 25 | open STDOUT,">$output"; | ||
| 26 | |||
| 27 | sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } | ||
| 28 | sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } | ||
| 29 | sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } | ||
| 30 | |||
| 31 | $code=<<___; | ||
| 32 | #include "arm_arch.h" | ||
| 33 | |||
| 34 | .text | ||
| 35 | .code 32 | ||
| 36 | |||
| 37 | #if __ARM_ARCH__>=7 | ||
| 38 | .fpu neon | ||
| 39 | |||
| 40 | .type mul_1x1_neon,%function | ||
| 41 | .align 5 | ||
| 42 | mul_1x1_neon: | ||
| 43 | vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a | ||
| 44 | vmull.p8 `&Q("d0")`,d16,d17 @ a·bb | ||
| 45 | vshl.u64 `&Dlo("q2")`,d16,#16 | ||
| 46 | vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb | ||
| 47 | vshl.u64 `&Dlo("q3")`,d16,#24 | ||
| 48 | vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb | ||
| 49 | vshr.u64 `&Dlo("q1")`,#8 | ||
| 50 | vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb | ||
| 51 | vshl.u64 `&Dhi("q1")`,#24 | ||
| 52 | veor d0,`&Dlo("q1")` | ||
| 53 | vshr.u64 `&Dlo("q2")`,#16 | ||
| 54 | veor d0,`&Dhi("q1")` | ||
| 55 | vshl.u64 `&Dhi("q2")`,#16 | ||
| 56 | veor d0,`&Dlo("q2")` | ||
| 57 | vshr.u64 `&Dlo("q3")`,#24 | ||
| 58 | veor d0,`&Dhi("q2")` | ||
| 59 | vshl.u64 `&Dhi("q3")`,#8 | ||
| 60 | veor d0,`&Dlo("q3")` | ||
| 61 | veor d0,`&Dhi("q3")` | ||
| 62 | bx lr | ||
| 63 | .size mul_1x1_neon,.-mul_1x1_neon | ||
| 64 | #endif | ||
| 65 | ___ | ||
| 66 | ################ | ||
| 67 | # private interface to mul_1x1_ialu | ||
| 68 | # | ||
| 69 | $a="r1"; | ||
| 70 | $b="r0"; | ||
| 71 | |||
| 72 | ($a0,$a1,$a2,$a12,$a4,$a14)= | ||
| 73 | ($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12); | ||
| 74 | |||
| 75 | $mask="r12"; | ||
| 76 | |||
| 77 | $code.=<<___; | ||
| 78 | .type mul_1x1_ialu,%function | ||
| 79 | .align 5 | ||
| 80 | mul_1x1_ialu: | ||
| 81 | mov $a0,#0 | ||
| 82 | bic $a1,$a,#3<<30 @ a1=a&0x3fffffff | ||
| 83 | str $a0,[sp,#0] @ tab[0]=0 | ||
| 84 | add $a2,$a1,$a1 @ a2=a1<<1 | ||
| 85 | str $a1,[sp,#4] @ tab[1]=a1 | ||
| 86 | eor $a12,$a1,$a2 @ a1^a2 | ||
| 87 | str $a2,[sp,#8] @ tab[2]=a2 | ||
| 88 | mov $a4,$a1,lsl#2 @ a4=a1<<2 | ||
| 89 | str $a12,[sp,#12] @ tab[3]=a1^a2 | ||
| 90 | eor $a14,$a1,$a4 @ a1^a4 | ||
| 91 | str $a4,[sp,#16] @ tab[4]=a4 | ||
| 92 | eor $a0,$a2,$a4 @ a2^a4 | ||
| 93 | str $a14,[sp,#20] @ tab[5]=a1^a4 | ||
| 94 | eor $a12,$a12,$a4 @ a1^a2^a4 | ||
| 95 | str $a0,[sp,#24] @ tab[6]=a2^a4 | ||
| 96 | and $i0,$mask,$b,lsl#2 | ||
| 97 | str $a12,[sp,#28] @ tab[7]=a1^a2^a4 | ||
| 98 | |||
| 99 | and $i1,$mask,$b,lsr#1 | ||
| 100 | ldr $lo,[sp,$i0] @ tab[b & 0x7] | ||
| 101 | and $i0,$mask,$b,lsr#4 | ||
| 102 | ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7] | ||
| 103 | and $i1,$mask,$b,lsr#7 | ||
| 104 | ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7] | ||
| 105 | eor $lo,$lo,$t1,lsl#3 @ stall | ||
| 106 | mov $hi,$t1,lsr#29 | ||
| 107 | ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7] | ||
| 108 | |||
| 109 | and $i0,$mask,$b,lsr#10 | ||
| 110 | eor $lo,$lo,$t0,lsl#6 | ||
| 111 | eor $hi,$hi,$t0,lsr#26 | ||
| 112 | ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7] | ||
| 113 | |||
| 114 | and $i1,$mask,$b,lsr#13 | ||
| 115 | eor $lo,$lo,$t1,lsl#9 | ||
| 116 | eor $hi,$hi,$t1,lsr#23 | ||
| 117 | ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7] | ||
| 118 | |||
| 119 | and $i0,$mask,$b,lsr#16 | ||
| 120 | eor $lo,$lo,$t0,lsl#12 | ||
| 121 | eor $hi,$hi,$t0,lsr#20 | ||
| 122 | ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7] | ||
| 123 | |||
| 124 | and $i1,$mask,$b,lsr#19 | ||
| 125 | eor $lo,$lo,$t1,lsl#15 | ||
| 126 | eor $hi,$hi,$t1,lsr#17 | ||
| 127 | ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7] | ||
| 128 | |||
| 129 | and $i0,$mask,$b,lsr#22 | ||
| 130 | eor $lo,$lo,$t0,lsl#18 | ||
| 131 | eor $hi,$hi,$t0,lsr#14 | ||
| 132 | ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7] | ||
| 133 | |||
| 134 | and $i1,$mask,$b,lsr#25 | ||
| 135 | eor $lo,$lo,$t1,lsl#21 | ||
| 136 | eor $hi,$hi,$t1,lsr#11 | ||
| 137 | ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7] | ||
| 138 | |||
| 139 | tst $a,#1<<30 | ||
| 140 | and $i0,$mask,$b,lsr#28 | ||
| 141 | eor $lo,$lo,$t0,lsl#24 | ||
| 142 | eor $hi,$hi,$t0,lsr#8 | ||
| 143 | ldr $t0,[sp,$i0] @ tab[b >> 30 ] | ||
| 144 | |||
| 145 | eorne $lo,$lo,$b,lsl#30 | ||
| 146 | eorne $hi,$hi,$b,lsr#2 | ||
| 147 | tst $a,#1<<31 | ||
| 148 | eor $lo,$lo,$t1,lsl#27 | ||
| 149 | eor $hi,$hi,$t1,lsr#5 | ||
| 150 | eorne $lo,$lo,$b,lsl#31 | ||
| 151 | eorne $hi,$hi,$b,lsr#1 | ||
| 152 | eor $lo,$lo,$t0,lsl#30 | ||
| 153 | eor $hi,$hi,$t0,lsr#2 | ||
| 154 | |||
| 155 | mov pc,lr | ||
| 156 | .size mul_1x1_ialu,.-mul_1x1_ialu | ||
| 157 | ___ | ||
| 158 | ################ | ||
| 159 | # void bn_GF2m_mul_2x2(BN_ULONG *r, | ||
| 160 | # BN_ULONG a1,BN_ULONG a0, | ||
| 161 | # BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 | ||
| 162 | |||
| 163 | ($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23)); | ||
| 164 | |||
| 165 | $code.=<<___; | ||
| 166 | .global bn_GF2m_mul_2x2 | ||
| 167 | .type bn_GF2m_mul_2x2,%function | ||
| 168 | .align 5 | ||
| 169 | bn_GF2m_mul_2x2: | ||
| 170 | #if __ARM_ARCH__>=7 | ||
| 171 | ldr r12,.LOPENSSL_armcap | ||
| 172 | .Lpic: ldr r12,[pc,r12] | ||
| 173 | tst r12,#1 | ||
| 174 | beq .Lialu | ||
| 175 | |||
| 176 | veor $A1,$A1 | ||
| 177 | vmov.32 $B1,r3,r3 @ two copies of b1 | ||
| 178 | vmov.32 ${A1}[0],r1 @ a1 | ||
| 179 | |||
| 180 | veor $A0,$A0 | ||
| 181 | vld1.32 ${B0}[],[sp,:32] @ two copies of b0 | ||
| 182 | vmov.32 ${A0}[0],r2 @ a0 | ||
| 183 | mov r12,lr | ||
| 184 | |||
| 185 | vmov d16,$A1 | ||
| 186 | vmov d17,$B1 | ||
| 187 | bl mul_1x1_neon @ a1·b1 | ||
| 188 | vmov $A1B1,d0 | ||
| 189 | |||
| 190 | vmov d16,$A0 | ||
| 191 | vmov d17,$B0 | ||
| 192 | bl mul_1x1_neon @ a0·b0 | ||
| 193 | vmov $A0B0,d0 | ||
| 194 | |||
| 195 | veor d16,$A0,$A1 | ||
| 196 | veor d17,$B0,$B1 | ||
| 197 | veor $A0,$A0B0,$A1B1 | ||
| 198 | bl mul_1x1_neon @ (a0+a1)·(b0+b1) | ||
| 199 | |||
| 200 | veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 | ||
| 201 | vshl.u64 d1,d0,#32 | ||
| 202 | vshr.u64 d0,d0,#32 | ||
| 203 | veor $A0B0,d1 | ||
| 204 | veor $A1B1,d0 | ||
| 205 | vst1.32 {${A0B0}[0]},[r0,:32]! | ||
| 206 | vst1.32 {${A0B0}[1]},[r0,:32]! | ||
| 207 | vst1.32 {${A1B1}[0]},[r0,:32]! | ||
| 208 | vst1.32 {${A1B1}[1]},[r0,:32] | ||
| 209 | bx r12 | ||
| 210 | .align 4 | ||
| 211 | .Lialu: | ||
| 212 | #endif | ||
| 213 | ___ | ||
| 214 | $ret="r10"; # reassigned 1st argument | ||
| 215 | $code.=<<___; | ||
| 216 | stmdb sp!,{r4-r10,lr} | ||
| 217 | mov $ret,r0 @ reassign 1st argument | ||
| 218 | mov $b,r3 @ $b=b1 | ||
| 219 | ldr r3,[sp,#32] @ load b0 | ||
| 220 | mov $mask,#7<<2 | ||
| 221 | sub sp,sp,#32 @ allocate tab[8] | ||
| 222 | |||
| 223 | bl mul_1x1_ialu @ a1·b1 | ||
| 224 | str $lo,[$ret,#8] | ||
| 225 | str $hi,[$ret,#12] | ||
| 226 | |||
| 227 | eor $b,$b,r3 @ flip b0 and b1 | ||
| 228 | eor $a,$a,r2 @ flip a0 and a1 | ||
| 229 | eor r3,r3,$b | ||
| 230 | eor r2,r2,$a | ||
| 231 | eor $b,$b,r3 | ||
| 232 | eor $a,$a,r2 | ||
| 233 | bl mul_1x1_ialu @ a0·b0 | ||
| 234 | str $lo,[$ret] | ||
| 235 | str $hi,[$ret,#4] | ||
| 236 | |||
| 237 | eor $a,$a,r2 | ||
| 238 | eor $b,$b,r3 | ||
| 239 | bl mul_1x1_ialu @ (a1+a0)·(b1+b0) | ||
| 240 | ___ | ||
| 241 | @r=map("r$_",(6..9)); | ||
| 242 | $code.=<<___; | ||
| 243 | ldmia $ret,{@r[0]-@r[3]} | ||
| 244 | eor $lo,$lo,$hi | ||
| 245 | eor $hi,$hi,@r[1] | ||
| 246 | eor $lo,$lo,@r[0] | ||
| 247 | eor $hi,$hi,@r[2] | ||
| 248 | eor $lo,$lo,@r[3] | ||
| 249 | eor $hi,$hi,@r[3] | ||
| 250 | str $hi,[$ret,#8] | ||
| 251 | eor $lo,$lo,$hi | ||
| 252 | add sp,sp,#32 @ destroy tab[8] | ||
| 253 | str $lo,[$ret,#4] | ||
| 254 | |||
| 255 | #if __ARM_ARCH__>=5 | ||
| 256 | ldmia sp!,{r4-r10,pc} | ||
| 257 | #else | ||
| 258 | ldmia sp!,{r4-r10,lr} | ||
| 259 | tst lr,#1 | ||
| 260 | moveq pc,lr @ be binary compatible with V4, yet | ||
| 261 | bx lr @ interoperable with Thumb ISA:-) | ||
| 262 | #endif | ||
| 263 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
| 264 | #if __ARM_ARCH__>=7 | ||
| 265 | .align 5 | ||
| 266 | .LOPENSSL_armcap: | ||
| 267 | .word OPENSSL_armcap_P-(.Lpic+8) | ||
| 268 | #endif | ||
| 269 | .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 270 | .align 5 | ||
| 271 | |||
| 272 | .comm OPENSSL_armcap_P,4,4 | ||
| 273 | ___ | ||
| 274 | |||
| 275 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 276 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | ||
| 277 | print $code; | ||
| 278 | close STDOUT; # enforce flush | ||
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl index 14e0d2d1dd..f78a8b5f0f 100644 --- a/src/lib/libcrypto/bn/asm/armv4-mont.pl +++ b/src/lib/libcrypto/bn/asm/armv4-mont.pl | |||
| @@ -23,6 +23,9 @@ | |||
| 23 | # than 1/2KB. Windows CE port would be trivial, as it's exclusively | 23 | # than 1/2KB. Windows CE port would be trivial, as it's exclusively |
| 24 | # about decorations, ABI and instruction syntax are identical. | 24 | # about decorations, ABI and instruction syntax are identical. |
| 25 | 25 | ||
| 26 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 27 | open STDOUT,">$output"; | ||
| 28 | |||
| 26 | $num="r0"; # starts as num argument, but holds &tp[num-1] | 29 | $num="r0"; # starts as num argument, but holds &tp[num-1] |
| 27 | $ap="r1"; | 30 | $ap="r1"; |
| 28 | $bp="r2"; $bi="r2"; $rp="r2"; | 31 | $bp="r2"; $bi="r2"; $rp="r2"; |
| @@ -89,9 +92,9 @@ bn_mul_mont: | |||
| 89 | .L1st: | 92 | .L1st: |
| 90 | ldr $aj,[$ap],#4 @ ap[j],ap++ | 93 | ldr $aj,[$ap],#4 @ ap[j],ap++ |
| 91 | mov $alo,$ahi | 94 | mov $alo,$ahi |
| 95 | ldr $nj,[$np],#4 @ np[j],np++ | ||
| 92 | mov $ahi,#0 | 96 | mov $ahi,#0 |
| 93 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] | 97 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] |
| 94 | ldr $nj,[$np],#4 @ np[j],np++ | ||
| 95 | mov $nhi,#0 | 98 | mov $nhi,#0 |
| 96 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | 99 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 |
| 97 | adds $nlo,$nlo,$alo | 100 | adds $nlo,$nlo,$alo |
| @@ -101,21 +104,21 @@ bn_mul_mont: | |||
| 101 | bne .L1st | 104 | bne .L1st |
| 102 | 105 | ||
| 103 | adds $nlo,$nlo,$ahi | 106 | adds $nlo,$nlo,$ahi |
| 107 | ldr $tp,[$_bp] @ restore bp | ||
| 104 | mov $nhi,#0 | 108 | mov $nhi,#0 |
| 109 | ldr $n0,[$_n0] @ restore n0 | ||
| 105 | adc $nhi,$nhi,#0 | 110 | adc $nhi,$nhi,#0 |
| 106 | ldr $tp,[$_bp] @ restore bp | ||
| 107 | str $nlo,[$num] @ tp[num-1]= | 111 | str $nlo,[$num] @ tp[num-1]= |
| 108 | ldr $n0,[$_n0] @ restore n0 | ||
| 109 | str $nhi,[$num,#4] @ tp[num]= | 112 | str $nhi,[$num,#4] @ tp[num]= |
| 110 | 113 | ||
| 111 | .Louter: | 114 | .Louter: |
| 112 | sub $tj,$num,sp @ "original" $num-1 value | 115 | sub $tj,$num,sp @ "original" $num-1 value |
| 113 | sub $ap,$ap,$tj @ "rewind" ap to &ap[1] | 116 | sub $ap,$ap,$tj @ "rewind" ap to &ap[1] |
| 114 | sub $np,$np,$tj @ "rewind" np to &np[1] | ||
| 115 | ldr $bi,[$tp,#4]! @ *(++bp) | 117 | ldr $bi,[$tp,#4]! @ *(++bp) |
| 118 | sub $np,$np,$tj @ "rewind" np to &np[1] | ||
| 116 | ldr $aj,[$ap,#-4] @ ap[0] | 119 | ldr $aj,[$ap,#-4] @ ap[0] |
| 117 | ldr $nj,[$np,#-4] @ np[0] | ||
| 118 | ldr $alo,[sp] @ tp[0] | 120 | ldr $alo,[sp] @ tp[0] |
| 121 | ldr $nj,[$np,#-4] @ np[0] | ||
| 119 | ldr $tj,[sp,#4] @ tp[1] | 122 | ldr $tj,[sp,#4] @ tp[1] |
| 120 | 123 | ||
| 121 | mov $ahi,#0 | 124 | mov $ahi,#0 |
| @@ -129,13 +132,13 @@ bn_mul_mont: | |||
| 129 | .Linner: | 132 | .Linner: |
| 130 | ldr $aj,[$ap],#4 @ ap[j],ap++ | 133 | ldr $aj,[$ap],#4 @ ap[j],ap++ |
| 131 | adds $alo,$ahi,$tj @ +=tp[j] | 134 | adds $alo,$ahi,$tj @ +=tp[j] |
| 135 | ldr $nj,[$np],#4 @ np[j],np++ | ||
| 132 | mov $ahi,#0 | 136 | mov $ahi,#0 |
| 133 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] | 137 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] |
| 134 | ldr $nj,[$np],#4 @ np[j],np++ | ||
| 135 | mov $nhi,#0 | 138 | mov $nhi,#0 |
| 136 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | 139 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 |
| 137 | ldr $tj,[$tp,#8] @ tp[j+1] | ||
| 138 | adc $ahi,$ahi,#0 | 140 | adc $ahi,$ahi,#0 |
| 141 | ldr $tj,[$tp,#8] @ tp[j+1] | ||
| 139 | adds $nlo,$nlo,$alo | 142 | adds $nlo,$nlo,$alo |
| 140 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ | 143 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ |
| 141 | adc $nlo,$nhi,#0 | 144 | adc $nlo,$nhi,#0 |
| @@ -144,13 +147,13 @@ bn_mul_mont: | |||
| 144 | 147 | ||
| 145 | adds $nlo,$nlo,$ahi | 148 | adds $nlo,$nlo,$ahi |
| 146 | mov $nhi,#0 | 149 | mov $nhi,#0 |
| 150 | ldr $tp,[$_bp] @ restore bp | ||
| 147 | adc $nhi,$nhi,#0 | 151 | adc $nhi,$nhi,#0 |
| 152 | ldr $n0,[$_n0] @ restore n0 | ||
| 148 | adds $nlo,$nlo,$tj | 153 | adds $nlo,$nlo,$tj |
| 149 | adc $nhi,$nhi,#0 | ||
| 150 | ldr $tp,[$_bp] @ restore bp | ||
| 151 | ldr $tj,[$_bpend] @ restore &bp[num] | 154 | ldr $tj,[$_bpend] @ restore &bp[num] |
| 155 | adc $nhi,$nhi,#0 | ||
| 152 | str $nlo,[$num] @ tp[num-1]= | 156 | str $nlo,[$num] @ tp[num-1]= |
| 153 | ldr $n0,[$_n0] @ restore n0 | ||
| 154 | str $nhi,[$num,#4] @ tp[num]= | 157 | str $nhi,[$num,#4] @ tp[num]= |
| 155 | 158 | ||
| 156 | cmp $tp,$tj | 159 | cmp $tp,$tj |
diff --git a/src/lib/libcrypto/bn/asm/ia64-mont.pl b/src/lib/libcrypto/bn/asm/ia64-mont.pl new file mode 100644 index 0000000000..e258658428 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/ia64-mont.pl | |||
| @@ -0,0 +1,851 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # January 2010 | ||
| 11 | # | ||
| 12 | # "Teaser" Montgomery multiplication module for IA-64. There are | ||
| 13 | # several possibilities for improvement: | ||
| 14 | # | ||
| 15 | # - modulo-scheduling outer loop would eliminate quite a number of | ||
| 16 | # stalls after ldf8, xma and getf.sig outside inner loop and | ||
| 17 | # improve shorter key performance; | ||
| 18 | # - shorter vector support [with input vectors being fetched only | ||
| 19 | # once] should be added; | ||
| 20 | # - 2x unroll with help of n0[1] would make the code scalable on | ||
| 21 | # "wider" IA-64, "wider" than Itanium 2 that is, which is not of | ||
| 22 | # acute interest, because upcoming Tukwila's individual cores are | ||
| 23 | # reportedly based on Itanium 2 design; | ||
| 24 | # - dedicated squaring procedure(?); | ||
| 25 | # | ||
| 26 | # January 2010 | ||
| 27 | # | ||
| 28 | # Shorter vector support is implemented by zero-padding ap and np | ||
| 29 | # vectors up to 8 elements, or 512 bits. This means that 256-bit | ||
| 30 | # inputs will be processed only 2 times faster than 512-bit inputs, | ||
| 31 | # not 4 [as one would expect, because algorithm complexity is n^2]. | ||
| 32 | # The reason for padding is that inputs shorter than 512 bits won't | ||
| 33 | # be processed faster anyway, because minimal critical path of the | ||
| 34 | # core loop happens to match 512-bit timing. Either way, it resulted | ||
| 35 | # in >100% improvement of 512-bit RSA sign benchmark and 50% - of | ||
| 36 | # 1024-bit one [in comparison to original version of *this* module]. | ||
| 37 | # | ||
| 38 | # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with* | ||
| 39 | # this module is: | ||
| 40 | # sign verify sign/s verify/s | ||
| 41 | # rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4 | ||
| 42 | # rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0 | ||
| 43 | # rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0 | ||
| 44 | # rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6 | ||
| 45 | # dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0 | ||
| 46 | # dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4 | ||
| 47 | # dsa 2048 bits 0.001453s 0.001703s 688.1 587.4 | ||
| 48 | # | ||
| 49 | # ... and *without* (but still with ia64.S): | ||
| 50 | # | ||
| 51 | # rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5 | ||
| 52 | # rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3 | ||
| 53 | # rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9 | ||
| 54 | # rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9 | ||
| 55 | # dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6 | ||
| 56 | # dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2 | ||
| 57 | # dsa 2048 bits 0.001894s 0.002179s 528.1 458.9 | ||
| 58 | # | ||
| 59 | # As it can be seen, RSA sign performance improves by 130-30%, | ||
| 60 | # hereafter less for longer keys, while verify - by 74-13%. | ||
| 61 | # DSA performance improves by 115-30%. | ||
| 62 | |||
| 63 | if ($^O eq "hpux") { | ||
| 64 | $ADDP="addp4"; | ||
| 65 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } | ||
| 66 | } else { $ADDP="add"; } | ||
| 67 | |||
| 68 | $code=<<___; | ||
| 69 | .explicit | ||
| 70 | .text | ||
| 71 | |||
| 72 | // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap, | ||
| 73 | // const BN_ULONG *bp,const BN_ULONG *np, | ||
| 74 | // const BN_ULONG *n0p,int num); | ||
| 75 | .align 64 | ||
| 76 | .global bn_mul_mont# | ||
| 77 | .proc bn_mul_mont# | ||
| 78 | bn_mul_mont: | ||
| 79 | .prologue | ||
| 80 | .body | ||
| 81 | { .mmi; cmp4.le p6,p7=2,r37;; | ||
| 82 | (p6) cmp4.lt.unc p8,p9=8,r37 | ||
| 83 | mov ret0=r0 };; | ||
| 84 | { .bbb; | ||
| 85 | (p9) br.cond.dptk.many bn_mul_mont_8 | ||
| 86 | (p8) br.cond.dpnt.many bn_mul_mont_general | ||
| 87 | (p7) br.ret.spnt.many b0 };; | ||
| 88 | .endp bn_mul_mont# | ||
| 89 | |||
| 90 | prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11; | ||
| 91 | |||
| 92 | rptr=r8; aptr=r9; bptr=r14; nptr=r15; | ||
| 93 | tptr=r16; // &tp[0] | ||
| 94 | tp_1=r17; // &tp[-1] | ||
| 95 | num=r18; len=r19; lc=r20; | ||
| 96 | topbit=r21; // carry bit from tmp[num] | ||
| 97 | |||
| 98 | n0=f6; | ||
| 99 | m0=f7; | ||
| 100 | bi=f8; | ||
| 101 | |||
| 102 | .align 64 | ||
| 103 | .local bn_mul_mont_general# | ||
| 104 | .proc bn_mul_mont_general# | ||
| 105 | bn_mul_mont_general: | ||
| 106 | .prologue | ||
| 107 | { .mmi; .save ar.pfs,prevfs | ||
| 108 | alloc prevfs=ar.pfs,6,2,0,8 | ||
| 109 | $ADDP aptr=0,in1 | ||
| 110 | .save ar.lc,prevlc | ||
| 111 | mov prevlc=ar.lc } | ||
| 112 | { .mmi; .vframe prevsp | ||
| 113 | mov prevsp=sp | ||
| 114 | $ADDP bptr=0,in2 | ||
| 115 | .save pr,prevpr | ||
| 116 | mov prevpr=pr };; | ||
| 117 | |||
| 118 | .body | ||
| 119 | .rotf alo[6],nlo[4],ahi[8],nhi[6] | ||
| 120 | .rotr a[3],n[3],t[2] | ||
| 121 | |||
| 122 | { .mmi; ldf8 bi=[bptr],8 // (*bp++) | ||
| 123 | ldf8 alo[4]=[aptr],16 // ap[0] | ||
| 124 | $ADDP r30=8,in1 };; | ||
| 125 | { .mmi; ldf8 alo[3]=[r30],16 // ap[1] | ||
| 126 | ldf8 alo[2]=[aptr],16 // ap[2] | ||
| 127 | $ADDP in4=0,in4 };; | ||
| 128 | { .mmi; ldf8 alo[1]=[r30] // ap[3] | ||
| 129 | ldf8 n0=[in4] // n0 | ||
| 130 | $ADDP rptr=0,in0 } | ||
| 131 | { .mmi; $ADDP nptr=0,in3 | ||
| 132 | mov r31=16 | ||
| 133 | zxt4 num=in5 };; | ||
| 134 | { .mmi; ldf8 nlo[2]=[nptr],8 // np[0] | ||
| 135 | shladd len=num,3,r0 | ||
| 136 | shladd r31=num,3,r31 };; | ||
| 137 | { .mmi; ldf8 nlo[1]=[nptr],8 // np[1] | ||
| 138 | add lc=-5,num | ||
| 139 | sub r31=sp,r31 };; | ||
| 140 | { .mfb; and sp=-16,r31 // alloca | ||
| 141 | xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0] | ||
| 142 | nop.b 0 } | ||
| 143 | { .mfb; nop.m 0 | ||
| 144 | xmpy.lu alo[4]=alo[4],bi | ||
| 145 | brp.loop.imp .L1st_ctop,.L1st_cend-16 | ||
| 146 | };; | ||
| 147 | { .mfi; nop.m 0 | ||
| 148 | xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0] | ||
| 149 | add tp_1=8,sp } | ||
| 150 | { .mfi; nop.m 0 | ||
| 151 | xma.lu alo[3]=alo[3],bi,ahi[2] | ||
| 152 | mov pr.rot=0x20001f<<16 | ||
| 153 | // ------^----- (p40) at first (p23) | ||
| 154 | // ----------^^ p[16:20]=1 | ||
| 155 | };; | ||
| 156 | { .mfi; nop.m 0 | ||
| 157 | xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0 | ||
| 158 | mov ar.lc=lc } | ||
| 159 | { .mfi; nop.m 0 | ||
| 160 | fcvt.fxu.s1 nhi[1]=f0 | ||
| 161 | mov ar.ec=8 };; | ||
| 162 | |||
| 163 | .align 32 | ||
| 164 | .L1st_ctop: | ||
| 165 | .pred.rel "mutex",p40,p42 | ||
| 166 | { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) | ||
| 167 | (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] | ||
| 168 | (p40) add n[2]=n[2],a[2] } // (p23) } | ||
| 169 | { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16) | ||
| 170 | (p18) xma.lu alo[2]=alo[2],bi,ahi[1] | ||
| 171 | (p42) add n[2]=n[2],a[2],1 };; // (p23) | ||
| 172 | { .mfi; (p21) getf.sig a[0]=alo[5] | ||
| 173 | (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] | ||
| 174 | (p42) cmp.leu p41,p39=n[2],a[2] } // (p23) | ||
| 175 | { .mfi; (p23) st8 [tp_1]=n[2],8 | ||
| 176 | (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] | ||
| 177 | (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) | ||
| 178 | { .mmb; (p21) getf.sig n[0]=nlo[3] | ||
| 179 | (p16) nop.m 0 | ||
| 180 | br.ctop.sptk .L1st_ctop };; | ||
| 181 | .L1st_cend: | ||
| 182 | |||
| 183 | { .mmi; getf.sig a[0]=ahi[6] // (p24) | ||
| 184 | getf.sig n[0]=nhi[4] | ||
| 185 | add num=-1,num };; // num-- | ||
| 186 | { .mmi; .pred.rel "mutex",p40,p42 | ||
| 187 | (p40) add n[0]=n[0],a[0] | ||
| 188 | (p42) add n[0]=n[0],a[0],1 | ||
| 189 | sub aptr=aptr,len };; // rewind | ||
| 190 | { .mmi; .pred.rel "mutex",p40,p42 | ||
| 191 | (p40) cmp.ltu p41,p39=n[0],a[0] | ||
| 192 | (p42) cmp.leu p41,p39=n[0],a[0] | ||
| 193 | sub nptr=nptr,len };; | ||
| 194 | { .mmi; .pred.rel "mutex",p39,p41 | ||
| 195 | (p39) add topbit=r0,r0 | ||
| 196 | (p41) add topbit=r0,r0,1 | ||
| 197 | nop.i 0 } | ||
| 198 | { .mmi; st8 [tp_1]=n[0] | ||
| 199 | add tptr=16,sp | ||
| 200 | add tp_1=8,sp };; | ||
| 201 | |||
| 202 | .Louter: | ||
| 203 | { .mmi; ldf8 bi=[bptr],8 // (*bp++) | ||
| 204 | ldf8 ahi[3]=[tptr] // tp[0] | ||
| 205 | add r30=8,aptr };; | ||
| 206 | { .mmi; ldf8 alo[4]=[aptr],16 // ap[0] | ||
| 207 | ldf8 alo[3]=[r30],16 // ap[1] | ||
| 208 | add r31=8,nptr };; | ||
| 209 | { .mfb; ldf8 alo[2]=[aptr],16 // ap[2] | ||
| 210 | xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0] | ||
| 211 | brp.loop.imp .Linner_ctop,.Linner_cend-16 | ||
| 212 | } | ||
| 213 | { .mfb; ldf8 alo[1]=[r30] // ap[3] | ||
| 214 | xma.lu alo[4]=alo[4],bi,ahi[3] | ||
| 215 | clrrrb.pr };; | ||
| 216 | { .mfi; ldf8 nlo[2]=[nptr],16 // np[0] | ||
| 217 | xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i] | ||
| 218 | nop.i 0 } | ||
| 219 | { .mfi; ldf8 nlo[1]=[r31] // np[1] | ||
| 220 | xma.lu alo[3]=alo[3],bi,ahi[2] | ||
| 221 | mov pr.rot=0x20101f<<16 | ||
| 222 | // ------^----- (p40) at first (p23) | ||
| 223 | // --------^--- (p30) at first (p22) | ||
| 224 | // ----------^^ p[16:20]=1 | ||
| 225 | };; | ||
| 226 | { .mfi; st8 [tptr]=r0 // tp[0] is already accounted | ||
| 227 | xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0 | ||
| 228 | mov ar.lc=lc } | ||
| 229 | { .mfi; | ||
| 230 | fcvt.fxu.s1 nhi[1]=f0 | ||
| 231 | mov ar.ec=8 };; | ||
| 232 | |||
| 233 | // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in | ||
| 234 | // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7 | ||
| 235 | // in latter case accounts for two-tick pipeline stall, which means | ||
| 236 | // that its performance would be ~20% lower than optimal one. No | ||
| 237 | // attempt was made to address this, because original Itanium is | ||
| 238 | // hardly represented out in the wild... | ||
| 239 | .align 32 | ||
| 240 | .Linner_ctop: | ||
| 241 | .pred.rel "mutex",p40,p42 | ||
| 242 | .pred.rel "mutex",p30,p32 | ||
| 243 | { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) | ||
| 244 | (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] | ||
| 245 | (p40) add n[2]=n[2],a[2] } // (p23) | ||
| 246 | { .mfi; (p16) nop.m 0 | ||
| 247 | (p18) xma.lu alo[2]=alo[2],bi,ahi[1] | ||
| 248 | (p42) add n[2]=n[2],a[2],1 };; // (p23) | ||
| 249 | { .mfi; (p21) getf.sig a[0]=alo[5] | ||
| 250 | (p16) nop.f 0 | ||
| 251 | (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) | ||
| 252 | { .mfi; (p21) ld8 t[0]=[tptr],8 | ||
| 253 | (p16) nop.f 0 | ||
| 254 | (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23) | ||
| 255 | { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++) | ||
| 256 | (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] | ||
| 257 | (p30) add a[1]=a[1],t[1] } // (p22) | ||
| 258 | { .mfi; (p16) nop.m 0 | ||
| 259 | (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] | ||
| 260 | (p32) add a[1]=a[1],t[1],1 };; // (p22) | ||
| 261 | { .mmi; (p21) getf.sig n[0]=nlo[3] | ||
| 262 | (p16) nop.m 0 | ||
| 263 | (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22) | ||
| 264 | { .mmb; (p23) st8 [tp_1]=n[2],8 | ||
| 265 | (p32) cmp.leu p31,p29=a[1],t[1] // (p22) | ||
| 266 | br.ctop.sptk .Linner_ctop };; | ||
| 267 | .Linner_cend: | ||
| 268 | |||
| 269 | { .mmi; getf.sig a[0]=ahi[6] // (p24) | ||
| 270 | getf.sig n[0]=nhi[4] | ||
| 271 | nop.i 0 };; | ||
| 272 | |||
| 273 | { .mmi; .pred.rel "mutex",p31,p33 | ||
| 274 | (p31) add a[0]=a[0],topbit | ||
| 275 | (p33) add a[0]=a[0],topbit,1 | ||
| 276 | mov topbit=r0 };; | ||
| 277 | { .mfi; .pred.rel "mutex",p31,p33 | ||
| 278 | (p31) cmp.ltu p32,p30=a[0],topbit | ||
| 279 | (p33) cmp.leu p32,p30=a[0],topbit | ||
| 280 | } | ||
| 281 | { .mfi; .pred.rel "mutex",p40,p42 | ||
| 282 | (p40) add n[0]=n[0],a[0] | ||
| 283 | (p42) add n[0]=n[0],a[0],1 | ||
| 284 | };; | ||
| 285 | { .mmi; .pred.rel "mutex",p44,p46 | ||
| 286 | (p40) cmp.ltu p41,p39=n[0],a[0] | ||
| 287 | (p42) cmp.leu p41,p39=n[0],a[0] | ||
| 288 | (p32) add topbit=r0,r0,1 } | ||
| 289 | |||
| 290 | { .mmi; st8 [tp_1]=n[0],8 | ||
| 291 | cmp4.ne p6,p0=1,num | ||
| 292 | sub aptr=aptr,len };; // rewind | ||
| 293 | { .mmi; sub nptr=nptr,len | ||
| 294 | (p41) add topbit=r0,r0,1 | ||
| 295 | add tptr=16,sp } | ||
| 296 | { .mmb; add tp_1=8,sp | ||
| 297 | add num=-1,num // num-- | ||
| 298 | (p6) br.cond.sptk.many .Louter };; | ||
| 299 | |||
| 300 | { .mbb; add lc=4,lc | ||
| 301 | brp.loop.imp .Lsub_ctop,.Lsub_cend-16 | ||
| 302 | clrrrb.pr };; | ||
| 303 | { .mii; nop.m 0 | ||
| 304 | mov pr.rot=0x10001<<16 | ||
| 305 | // ------^---- (p33) at first (p17) | ||
| 306 | mov ar.lc=lc } | ||
| 307 | { .mii; nop.m 0 | ||
| 308 | mov ar.ec=3 | ||
| 309 | nop.i 0 };; | ||
| 310 | |||
| 311 | .Lsub_ctop: | ||
| 312 | .pred.rel "mutex",p33,p35 | ||
| 313 | { .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++) | ||
| 314 | (p16) nop.f 0 | ||
| 315 | (p33) sub n[1]=t[1],n[1] } // (p17) | ||
| 316 | { .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++) | ||
| 317 | (p16) nop.f 0 | ||
| 318 | (p35) sub n[1]=t[1],n[1],1 };; // (p17) | ||
| 319 | { .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r | ||
| 320 | (p33) cmp.gtu p34,p32=n[1],t[1] // (p17) | ||
| 321 | (p18) nop.b 0 } | ||
| 322 | { .mib; (p18) nop.m 0 | ||
| 323 | (p35) cmp.geu p34,p32=n[1],t[1] // (p17) | ||
| 324 | br.ctop.sptk .Lsub_ctop };; | ||
| 325 | .Lsub_cend: | ||
| 326 | |||
| 327 | { .mmb; .pred.rel "mutex",p34,p36 | ||
| 328 | (p34) sub topbit=topbit,r0 // (p19) | ||
| 329 | (p36) sub topbit=topbit,r0,1 | ||
| 330 | brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16 | ||
| 331 | } | ||
| 332 | { .mmb; sub rptr=rptr,len // rewind | ||
| 333 | sub tptr=tptr,len | ||
| 334 | clrrrb.pr };; | ||
| 335 | { .mmi; and aptr=tptr,topbit | ||
| 336 | andcm bptr=rptr,topbit | ||
| 337 | mov pr.rot=1<<16 };; | ||
| 338 | { .mii; or nptr=aptr,bptr | ||
| 339 | mov ar.lc=lc | ||
| 340 | mov ar.ec=3 };; | ||
| 341 | |||
| 342 | .Lcopy_ctop: | ||
| 343 | { .mmb; (p16) ld8 n[0]=[nptr],8 | ||
| 344 | (p18) st8 [tptr]=r0,8 | ||
| 345 | (p16) nop.b 0 } | ||
| 346 | { .mmb; (p16) nop.m 0 | ||
| 347 | (p18) st8 [rptr]=n[2],8 | ||
| 348 | br.ctop.sptk .Lcopy_ctop };; | ||
| 349 | .Lcopy_cend: | ||
| 350 | |||
| 351 | { .mmi; mov ret0=1 // signal "handled" | ||
| 352 | rum 1<<5 // clear um.mfh | ||
| 353 | mov ar.lc=prevlc } | ||
| 354 | { .mib; .restore sp | ||
| 355 | mov sp=prevsp | ||
| 356 | mov pr=prevpr,0x1ffff | ||
| 357 | br.ret.sptk.many b0 };; | ||
| 358 | .endp bn_mul_mont_general# | ||
| 359 | |||
| 360 | a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23; | ||
| 361 | n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31; | ||
| 362 | t0=r15; | ||
| 363 | |||
| 364 | ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15; | ||
| 365 | ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23; | ||
| 366 | |||
| 367 | .align 64 | ||
| 368 | .skip 48 // aligns loop body | ||
| 369 | .local bn_mul_mont_8# | ||
| 370 | .proc bn_mul_mont_8# | ||
| 371 | bn_mul_mont_8: | ||
| 372 | .prologue | ||
| 373 | { .mmi; .save ar.pfs,prevfs | ||
| 374 | alloc prevfs=ar.pfs,6,2,0,8 | ||
| 375 | .vframe prevsp | ||
| 376 | mov prevsp=sp | ||
| 377 | .save ar.lc,prevlc | ||
| 378 | mov prevlc=ar.lc } | ||
| 379 | { .mmi; add r17=-6*16,sp | ||
| 380 | add sp=-7*16,sp | ||
| 381 | .save pr,prevpr | ||
| 382 | mov prevpr=pr };; | ||
| 383 | |||
| 384 | { .mmi; .save.gf 0,0x10 | ||
| 385 | stf.spill [sp]=f16,-16 | ||
| 386 | .save.gf 0,0x20 | ||
| 387 | stf.spill [r17]=f17,32 | ||
| 388 | add r16=-5*16,prevsp};; | ||
| 389 | { .mmi; .save.gf 0,0x40 | ||
| 390 | stf.spill [r16]=f18,32 | ||
| 391 | .save.gf 0,0x80 | ||
| 392 | stf.spill [r17]=f19,32 | ||
| 393 | $ADDP aptr=0,in1 };; | ||
| 394 | { .mmi; .save.gf 0,0x100 | ||
| 395 | stf.spill [r16]=f20,32 | ||
| 396 | .save.gf 0,0x200 | ||
| 397 | stf.spill [r17]=f21,32 | ||
| 398 | $ADDP r29=8,in1 };; | ||
| 399 | { .mmi; .save.gf 0,0x400 | ||
| 400 | stf.spill [r16]=f22 | ||
| 401 | .save.gf 0,0x800 | ||
| 402 | stf.spill [r17]=f23 | ||
| 403 | $ADDP rptr=0,in0 };; | ||
| 404 | |||
| 405 | .body | ||
| 406 | .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10] | ||
| 407 | .rotr t[8] | ||
| 408 | |||
| 409 | // load input vectors padding them to 8 elements | ||
| 410 | { .mmi; ldf8 ai0=[aptr],16 // ap[0] | ||
| 411 | ldf8 ai1=[r29],16 // ap[1] | ||
| 412 | $ADDP bptr=0,in2 } | ||
| 413 | { .mmi; $ADDP r30=8,in2 | ||
| 414 | $ADDP nptr=0,in3 | ||
| 415 | $ADDP r31=8,in3 };; | ||
| 416 | { .mmi; ldf8 bj[7]=[bptr],16 // bp[0] | ||
| 417 | ldf8 bj[6]=[r30],16 // bp[1] | ||
| 418 | cmp4.le p4,p5=3,in5 } | ||
| 419 | { .mmi; ldf8 ni0=[nptr],16 // np[0] | ||
| 420 | ldf8 ni1=[r31],16 // np[1] | ||
| 421 | cmp4.le p6,p7=4,in5 };; | ||
| 422 | |||
| 423 | { .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2] | ||
| 424 | (p5)fcvt.fxu ai2=f0 | ||
| 425 | cmp4.le p8,p9=5,in5 } | ||
| 426 | { .mfi; (p6)ldf8 ai3=[r29],16 // ap[3] | ||
| 427 | (p7)fcvt.fxu ai3=f0 | ||
| 428 | cmp4.le p10,p11=6,in5 } | ||
| 429 | { .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2] | ||
| 430 | (p5)fcvt.fxu bj[5]=f0 | ||
| 431 | cmp4.le p12,p13=7,in5 } | ||
| 432 | { .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3] | ||
| 433 | (p7)fcvt.fxu bj[4]=f0 | ||
| 434 | cmp4.le p14,p15=8,in5 } | ||
| 435 | { .mfi; (p4)ldf8 ni2=[nptr],16 // np[2] | ||
| 436 | (p5)fcvt.fxu ni2=f0 | ||
| 437 | addp4 r28=-1,in5 } | ||
| 438 | { .mfi; (p6)ldf8 ni3=[r31],16 // np[3] | ||
| 439 | (p7)fcvt.fxu ni3=f0 | ||
| 440 | $ADDP in4=0,in4 };; | ||
| 441 | |||
| 442 | { .mfi; ldf8 n0=[in4] | ||
| 443 | fcvt.fxu tf[1]=f0 | ||
| 444 | nop.i 0 } | ||
| 445 | |||
| 446 | { .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4] | ||
| 447 | (p9)fcvt.fxu ai4=f0 | ||
| 448 | mov t[0]=r0 } | ||
| 449 | { .mfi; (p10)ldf8 ai5=[r29],16 // ap[5] | ||
| 450 | (p11)fcvt.fxu ai5=f0 | ||
| 451 | mov t[1]=r0 } | ||
| 452 | { .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4] | ||
| 453 | (p9)fcvt.fxu bj[3]=f0 | ||
| 454 | mov t[2]=r0 } | ||
| 455 | { .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5] | ||
| 456 | (p11)fcvt.fxu bj[2]=f0 | ||
| 457 | mov t[3]=r0 } | ||
| 458 | { .mfi; (p8)ldf8 ni4=[nptr],16 // np[4] | ||
| 459 | (p9)fcvt.fxu ni4=f0 | ||
| 460 | mov t[4]=r0 } | ||
| 461 | { .mfi; (p10)ldf8 ni5=[r31],16 // np[5] | ||
| 462 | (p11)fcvt.fxu ni5=f0 | ||
| 463 | mov t[5]=r0 };; | ||
| 464 | |||
| 465 | { .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6] | ||
| 466 | (p13)fcvt.fxu ai6=f0 | ||
| 467 | mov t[6]=r0 } | ||
| 468 | { .mfi; (p14)ldf8 ai7=[r29],16 // ap[7] | ||
| 469 | (p15)fcvt.fxu ai7=f0 | ||
| 470 | mov t[7]=r0 } | ||
| 471 | { .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6] | ||
| 472 | (p13)fcvt.fxu bj[1]=f0 | ||
| 473 | mov ar.lc=r28 } | ||
| 474 | { .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7] | ||
| 475 | (p15)fcvt.fxu bj[0]=f0 | ||
| 476 | mov ar.ec=1 } | ||
| 477 | { .mfi; (p12)ldf8 ni6=[nptr],16 // np[6] | ||
| 478 | (p13)fcvt.fxu ni6=f0 | ||
| 479 | mov pr.rot=1<<16 } | ||
| 480 | { .mfb; (p14)ldf8 ni7=[r31],16 // np[7] | ||
| 481 | (p15)fcvt.fxu ni7=f0 | ||
| 482 | brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16 | ||
| 483 | };; | ||
| 484 | |||
| 485 | // The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt | ||
| 486 | // to measure with help of Interval Time Counter indicated that the | ||
| 487 | // factor is a tad higher: 33 or 34, if not 35. Exact measurement and | ||
| 488 | // addressing the issue is problematic, because I don't have access | ||
| 489 | // to platform-specific instruction-level profiler. On Itanium it | ||
| 490 | // should run in 56*n ticks, because of higher xma latency... | ||
| 491 | .Louter_8_ctop: | ||
| 492 | .pred.rel "mutex",p40,p42 | ||
| 493 | .pred.rel "mutex",p48,p50 | ||
| 494 | { .mfi; (p16) nop.m 0 // 0: | ||
| 495 | (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0] | ||
| 496 | (p40) add a3=a3,n3 } // (p17) a3+=n3 | ||
| 497 | { .mfi; (p42) add a3=a3,n3,1 | ||
| 498 | (p16) xma.lu alo[0]=ai0,bj[7],tf[1] | ||
| 499 | (p16) nop.i 0 };; | ||
| 500 | { .mii; (p17) getf.sig a7=alo[8] // 1: | ||
| 501 | (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 | ||
| 502 | (p50) add t[6]=t[6],a3,1 };; | ||
| 503 | { .mfi; (p17) getf.sig a8=ahi[8] // 2: | ||
| 504 | (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 | ||
| 505 | (p40) cmp.ltu p43,p41=a3,n3 } | ||
| 506 | { .mfi; (p42) cmp.leu p43,p41=a3,n3 | ||
| 507 | (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] | ||
| 508 | (p16) nop.i 0 };; | ||
| 509 | { .mii; (p17) getf.sig n5=nlo[6] // 3: | ||
| 510 | (p48) cmp.ltu p51,p49=t[6],a3 | ||
| 511 | (p50) cmp.leu p51,p49=t[6],a3 };; | ||
| 512 | .pred.rel "mutex",p41,p43 | ||
| 513 | .pred.rel "mutex",p49,p51 | ||
| 514 | { .mfi; (p16) nop.m 0 // 4: | ||
| 515 | (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i] | ||
| 516 | (p41) add a4=a4,n4 } // (p17) a4+=n4 | ||
| 517 | { .mfi; (p43) add a4=a4,n4,1 | ||
| 518 | (p16) xma.lu alo[1]=ai1,bj[7],ahi[0] | ||
| 519 | (p16) nop.i 0 };; | ||
| 520 | { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 | ||
| 521 | (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0 | ||
| 522 | (p51) add t[5]=t[5],a4,1 };; | ||
| 523 | { .mfi; (p16) nop.m 0 // 6: | ||
| 524 | (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 | ||
| 525 | (p41) cmp.ltu p42,p40=a4,n4 } | ||
| 526 | { .mfi; (p43) cmp.leu p42,p40=a4,n4 | ||
| 527 | (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] | ||
| 528 | (p16) nop.i 0 };; | ||
| 529 | { .mii; (p17) getf.sig n6=nlo[7] // 7: | ||
| 530 | (p49) cmp.ltu p50,p48=t[5],a4 | ||
| 531 | (p51) cmp.leu p50,p48=t[5],a4 };; | ||
| 532 | .pred.rel "mutex",p40,p42 | ||
| 533 | .pred.rel "mutex",p48,p50 | ||
| 534 | { .mfi; (p16) nop.m 0 // 8: | ||
| 535 | (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i] | ||
| 536 | (p40) add a5=a5,n5 } // (p17) a5+=n5 | ||
| 537 | { .mfi; (p42) add a5=a5,n5,1 | ||
| 538 | (p16) xma.lu alo[2]=ai2,bj[7],ahi[1] | ||
| 539 | (p16) nop.i 0 };; | ||
| 540 | { .mii; (p16) getf.sig a1=alo[1] // 9: | ||
| 541 | (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 | ||
| 542 | (p50) add t[4]=t[4],a5,1 };; | ||
| 543 | { .mfi; (p16) nop.m 0 // 10: | ||
| 544 | (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0 | ||
| 545 | (p40) cmp.ltu p43,p41=a5,n5 } | ||
| 546 | { .mfi; (p42) cmp.leu p43,p41=a5,n5 | ||
| 547 | (p16) xma.lu nlo[0]=ni0,mj[0],alo[0] | ||
| 548 | (p16) nop.i 0 };; | ||
| 549 | { .mii; (p17) getf.sig n7=nlo[8] // 11: | ||
| 550 | (p48) cmp.ltu p51,p49=t[4],a5 | ||
| 551 | (p50) cmp.leu p51,p49=t[4],a5 };; | ||
| 552 | .pred.rel "mutex",p41,p43 | ||
| 553 | .pred.rel "mutex",p49,p51 | ||
| 554 | { .mfi; (p17) getf.sig n8=nhi[8] // 12: | ||
| 555 | (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i] | ||
| 556 | (p41) add a6=a6,n6 } // (p17) a6+=n6 | ||
| 557 | { .mfi; (p43) add a6=a6,n6,1 | ||
| 558 | (p16) xma.lu alo[3]=ai3,bj[7],ahi[2] | ||
| 559 | (p16) nop.i 0 };; | ||
| 560 | { .mii; (p16) getf.sig a2=alo[2] // 13: | ||
| 561 | (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 | ||
| 562 | (p51) add t[3]=t[3],a6,1 };; | ||
| 563 | { .mfi; (p16) nop.m 0 // 14: | ||
| 564 | (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0 | ||
| 565 | (p41) cmp.ltu p42,p40=a6,n6 } | ||
| 566 | { .mfi; (p43) cmp.leu p42,p40=a6,n6 | ||
| 567 | (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0] | ||
| 568 | (p16) nop.i 0 };; | ||
| 569 | { .mii; (p16) nop.m 0 // 15: | ||
| 570 | (p49) cmp.ltu p50,p48=t[3],a6 | ||
| 571 | (p51) cmp.leu p50,p48=t[3],a6 };; | ||
| 572 | .pred.rel "mutex",p40,p42 | ||
| 573 | .pred.rel "mutex",p48,p50 | ||
| 574 | { .mfi; (p16) nop.m 0 // 16: | ||
| 575 | (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i] | ||
| 576 | (p40) add a7=a7,n7 } // (p17) a7+=n7 | ||
| 577 | { .mfi; (p42) add a7=a7,n7,1 | ||
| 578 | (p16) xma.lu alo[4]=ai4,bj[7],ahi[3] | ||
| 579 | (p16) nop.i 0 };; | ||
| 580 | { .mii; (p16) getf.sig a3=alo[3] // 17: | ||
| 581 | (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 | ||
| 582 | (p50) add t[2]=t[2],a7,1 };; | ||
| 583 | { .mfi; (p16) nop.m 0 // 18: | ||
| 584 | (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0 | ||
| 585 | (p40) cmp.ltu p43,p41=a7,n7 } | ||
| 586 | { .mfi; (p42) cmp.leu p43,p41=a7,n7 | ||
| 587 | (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1] | ||
| 588 | (p16) nop.i 0 };; | ||
| 589 | { .mii; (p16) getf.sig n1=nlo[1] // 19: | ||
| 590 | (p48) cmp.ltu p51,p49=t[2],a7 | ||
| 591 | (p50) cmp.leu p51,p49=t[2],a7 };; | ||
| 592 | .pred.rel "mutex",p41,p43 | ||
| 593 | .pred.rel "mutex",p49,p51 | ||
| 594 | { .mfi; (p16) nop.m 0 // 20: | ||
| 595 | (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i] | ||
| 596 | (p41) add a8=a8,n8 } // (p17) a8+=n8 | ||
| 597 | { .mfi; (p43) add a8=a8,n8,1 | ||
| 598 | (p16) xma.lu alo[5]=ai5,bj[7],ahi[4] | ||
| 599 | (p16) nop.i 0 };; | ||
| 600 | { .mii; (p16) getf.sig a4=alo[4] // 21: | ||
| 601 | (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 | ||
| 602 | (p51) add t[1]=t[1],a8,1 };; | ||
| 603 | { .mfi; (p16) nop.m 0 // 22: | ||
| 604 | (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0 | ||
| 605 | (p41) cmp.ltu p42,p40=a8,n8 } | ||
| 606 | { .mfi; (p43) cmp.leu p42,p40=a8,n8 | ||
| 607 | (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2] | ||
| 608 | (p16) nop.i 0 };; | ||
| 609 | { .mii; (p16) getf.sig n2=nlo[2] // 23: | ||
| 610 | (p49) cmp.ltu p50,p48=t[1],a8 | ||
| 611 | (p51) cmp.leu p50,p48=t[1],a8 };; | ||
| 612 | { .mfi; (p16) nop.m 0 // 24: | ||
| 613 | (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i] | ||
| 614 | (p16) add a1=a1,n1 } // (p16) a1+=n1 | ||
| 615 | { .mfi; (p16) nop.m 0 | ||
| 616 | (p16) xma.lu alo[6]=ai6,bj[7],ahi[5] | ||
| 617 | (p17) mov t[0]=r0 };; | ||
| 618 | { .mii; (p16) getf.sig a5=alo[5] // 25: | ||
| 619 | (p16) add t0=t[7],a1 // (p16) t[7]+=a1 | ||
| 620 | (p42) add t[0]=t[0],r0,1 };; | ||
| 621 | { .mfi; (p16) setf.sig tf[0]=t0 // 26: | ||
| 622 | (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0 | ||
| 623 | (p50) add t[0]=t[0],r0,1 } | ||
| 624 | { .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1 | ||
| 625 | (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3] | ||
| 626 | (p16) nop.i 0 };; | ||
| 627 | { .mii; (p16) getf.sig n3=nlo[3] // 27: | ||
| 628 | (p16) cmp.ltu.unc p50,p48=t0,a1 | ||
| 629 | (p16) nop.i 0 };; | ||
| 630 | .pred.rel "mutex",p40,p42 | ||
| 631 | .pred.rel "mutex",p48,p50 | ||
| 632 | { .mfi; (p16) nop.m 0 // 28: | ||
| 633 | (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i] | ||
| 634 | (p40) add a2=a2,n2 } // (p16) a2+=n2 | ||
| 635 | { .mfi; (p42) add a2=a2,n2,1 | ||
| 636 | (p16) xma.lu alo[7]=ai7,bj[7],ahi[6] | ||
| 637 | (p16) nop.i 0 };; | ||
| 638 | { .mii; (p16) getf.sig a6=alo[6] // 29: | ||
| 639 | (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2 | ||
| 640 | (p50) add t[6]=t[6],a2,1 };; | ||
| 641 | { .mfi; (p16) nop.m 0 // 30: | ||
| 642 | (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0 | ||
| 643 | (p40) cmp.ltu p41,p39=a2,n2 } | ||
| 644 | { .mfi; (p42) cmp.leu p41,p39=a2,n2 | ||
| 645 | (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4] | ||
| 646 | (p16) nop.i 0 };; | ||
| 647 | { .mfi; (p16) getf.sig n4=nlo[4] // 31: | ||
| 648 | (p16) nop.f 0 | ||
| 649 | (p48) cmp.ltu p49,p47=t[6],a2 } | ||
| 650 | { .mfb; (p50) cmp.leu p49,p47=t[6],a2 | ||
| 651 | (p16) nop.f 0 | ||
| 652 | br.ctop.sptk.many .Louter_8_ctop };; | ||
| 653 | .Louter_8_cend: | ||
| 654 | |||
| 655 | // above loop has to execute one more time, without (p16), which is | ||
| 656 | // replaced with merged move of np[8] to GPR bank | ||
| 657 | .pred.rel "mutex",p40,p42 | ||
| 658 | .pred.rel "mutex",p48,p50 | ||
| 659 | { .mmi; (p0) getf.sig n1=ni0 // 0: | ||
| 660 | (p40) add a3=a3,n3 // (p17) a3+=n3 | ||
| 661 | (p42) add a3=a3,n3,1 };; | ||
| 662 | { .mii; (p17) getf.sig a7=alo[8] // 1: | ||
| 663 | (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 | ||
| 664 | (p50) add t[6]=t[6],a3,1 };; | ||
| 665 | { .mfi; (p17) getf.sig a8=ahi[8] // 2: | ||
| 666 | (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 | ||
| 667 | (p40) cmp.ltu p43,p41=a3,n3 } | ||
| 668 | { .mfi; (p42) cmp.leu p43,p41=a3,n3 | ||
| 669 | (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] | ||
| 670 | (p0) nop.i 0 };; | ||
| 671 | { .mii; (p17) getf.sig n5=nlo[6] // 3: | ||
| 672 | (p48) cmp.ltu p51,p49=t[6],a3 | ||
| 673 | (p50) cmp.leu p51,p49=t[6],a3 };; | ||
| 674 | .pred.rel "mutex",p41,p43 | ||
| 675 | .pred.rel "mutex",p49,p51 | ||
| 676 | { .mmi; (p0) getf.sig n2=ni1 // 4: | ||
| 677 | (p41) add a4=a4,n4 // (p17) a4+=n4 | ||
| 678 | (p43) add a4=a4,n4,1 };; | ||
| 679 | { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 | ||
| 680 | (p0) nop.f 0 | ||
| 681 | (p51) add t[5]=t[5],a4,1 };; | ||
| 682 | { .mfi; (p0) getf.sig n3=ni2 // 6: | ||
| 683 | (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 | ||
| 684 | (p41) cmp.ltu p42,p40=a4,n4 } | ||
| 685 | { .mfi; (p43) cmp.leu p42,p40=a4,n4 | ||
| 686 | (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] | ||
| 687 | (p0) nop.i 0 };; | ||
| 688 | { .mii; (p17) getf.sig n6=nlo[7] // 7: | ||
| 689 | (p49) cmp.ltu p50,p48=t[5],a4 | ||
| 690 | (p51) cmp.leu p50,p48=t[5],a4 };; | ||
| 691 | .pred.rel "mutex",p40,p42 | ||
| 692 | .pred.rel "mutex",p48,p50 | ||
| 693 | { .mii; (p0) getf.sig n4=ni3 // 8: | ||
| 694 | (p40) add a5=a5,n5 // (p17) a5+=n5 | ||
| 695 | (p42) add a5=a5,n5,1 };; | ||
| 696 | { .mii; (p0) nop.m 0 // 9: | ||
| 697 | (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 | ||
| 698 | (p50) add t[4]=t[4],a5,1 };; | ||
| 699 | { .mii; (p0) nop.m 0 // 10: | ||
| 700 | (p40) cmp.ltu p43,p41=a5,n5 | ||
| 701 | (p42) cmp.leu p43,p41=a5,n5 };; | ||
| 702 | { .mii; (p17) getf.sig n7=nlo[8] // 11: | ||
| 703 | (p48) cmp.ltu p51,p49=t[4],a5 | ||
| 704 | (p50) cmp.leu p51,p49=t[4],a5 };; | ||
| 705 | .pred.rel "mutex",p41,p43 | ||
| 706 | .pred.rel "mutex",p49,p51 | ||
| 707 | { .mii; (p17) getf.sig n8=nhi[8] // 12: | ||
| 708 | (p41) add a6=a6,n6 // (p17) a6+=n6 | ||
| 709 | (p43) add a6=a6,n6,1 };; | ||
| 710 | { .mii; (p0) getf.sig n5=ni4 // 13: | ||
| 711 | (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 | ||
| 712 | (p51) add t[3]=t[3],a6,1 };; | ||
| 713 | { .mii; (p0) nop.m 0 // 14: | ||
| 714 | (p41) cmp.ltu p42,p40=a6,n6 | ||
| 715 | (p43) cmp.leu p42,p40=a6,n6 };; | ||
| 716 | { .mii; (p0) getf.sig n6=ni5 // 15: | ||
| 717 | (p49) cmp.ltu p50,p48=t[3],a6 | ||
| 718 | (p51) cmp.leu p50,p48=t[3],a6 };; | ||
| 719 | .pred.rel "mutex",p40,p42 | ||
| 720 | .pred.rel "mutex",p48,p50 | ||
| 721 | { .mii; (p0) nop.m 0 // 16: | ||
| 722 | (p40) add a7=a7,n7 // (p17) a7+=n7 | ||
| 723 | (p42) add a7=a7,n7,1 };; | ||
| 724 | { .mii; (p0) nop.m 0 // 17: | ||
| 725 | (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 | ||
| 726 | (p50) add t[2]=t[2],a7,1 };; | ||
| 727 | { .mii; (p0) nop.m 0 // 18: | ||
| 728 | (p40) cmp.ltu p43,p41=a7,n7 | ||
| 729 | (p42) cmp.leu p43,p41=a7,n7 };; | ||
| 730 | { .mii; (p0) getf.sig n7=ni6 // 19: | ||
| 731 | (p48) cmp.ltu p51,p49=t[2],a7 | ||
| 732 | (p50) cmp.leu p51,p49=t[2],a7 };; | ||
| 733 | .pred.rel "mutex",p41,p43 | ||
| 734 | .pred.rel "mutex",p49,p51 | ||
| 735 | { .mii; (p0) nop.m 0 // 20: | ||
| 736 | (p41) add a8=a8,n8 // (p17) a8+=n8 | ||
| 737 | (p43) add a8=a8,n8,1 };; | ||
| 738 | { .mmi; (p0) nop.m 0 // 21: | ||
| 739 | (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 | ||
| 740 | (p51) add t[1]=t[1],a8,1 } | ||
| 741 | { .mmi; (p17) mov t[0]=r0 | ||
| 742 | (p41) cmp.ltu p42,p40=a8,n8 | ||
| 743 | (p43) cmp.leu p42,p40=a8,n8 };; | ||
| 744 | { .mmi; (p0) getf.sig n8=ni7 // 22: | ||
| 745 | (p49) cmp.ltu p50,p48=t[1],a8 | ||
| 746 | (p51) cmp.leu p50,p48=t[1],a8 } | ||
| 747 | { .mmi; (p42) add t[0]=t[0],r0,1 | ||
| 748 | (p0) add r16=-7*16,prevsp | ||
| 749 | (p0) add r17=-6*16,prevsp };; | ||
| 750 | |||
| 751 | // subtract np[8] from carrybit|tmp[8] | ||
| 752 | // carrybit|tmp[8] layout upon exit from above loop is: | ||
| 753 | // t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant) | ||
| 754 | { .mmi; (p50)add t[0]=t[0],r0,1 | ||
| 755 | add r18=-5*16,prevsp | ||
| 756 | sub n1=t0,n1 };; | ||
| 757 | { .mmi; cmp.gtu p34,p32=n1,t0;; | ||
| 758 | .pred.rel "mutex",p32,p34 | ||
| 759 | (p32)sub n2=t[7],n2 | ||
| 760 | (p34)sub n2=t[7],n2,1 };; | ||
| 761 | { .mii; (p32)cmp.gtu p35,p33=n2,t[7] | ||
| 762 | (p34)cmp.geu p35,p33=n2,t[7];; | ||
| 763 | .pred.rel "mutex",p33,p35 | ||
| 764 | (p33)sub n3=t[6],n3 } | ||
| 765 | { .mmi; (p35)sub n3=t[6],n3,1;; | ||
| 766 | (p33)cmp.gtu p34,p32=n3,t[6] | ||
| 767 | (p35)cmp.geu p34,p32=n3,t[6] };; | ||
| 768 | .pred.rel "mutex",p32,p34 | ||
| 769 | { .mii; (p32)sub n4=t[5],n4 | ||
| 770 | (p34)sub n4=t[5],n4,1;; | ||
| 771 | (p32)cmp.gtu p35,p33=n4,t[5] } | ||
| 772 | { .mmi; (p34)cmp.geu p35,p33=n4,t[5];; | ||
| 773 | .pred.rel "mutex",p33,p35 | ||
| 774 | (p33)sub n5=t[4],n5 | ||
| 775 | (p35)sub n5=t[4],n5,1 };; | ||
| 776 | { .mii; (p33)cmp.gtu p34,p32=n5,t[4] | ||
| 777 | (p35)cmp.geu p34,p32=n5,t[4];; | ||
| 778 | .pred.rel "mutex",p32,p34 | ||
| 779 | (p32)sub n6=t[3],n6 } | ||
| 780 | { .mmi; (p34)sub n6=t[3],n6,1;; | ||
| 781 | (p32)cmp.gtu p35,p33=n6,t[3] | ||
| 782 | (p34)cmp.geu p35,p33=n6,t[3] };; | ||
| 783 | .pred.rel "mutex",p33,p35 | ||
| 784 | { .mii; (p33)sub n7=t[2],n7 | ||
| 785 | (p35)sub n7=t[2],n7,1;; | ||
| 786 | (p33)cmp.gtu p34,p32=n7,t[2] } | ||
| 787 | { .mmi; (p35)cmp.geu p34,p32=n7,t[2];; | ||
| 788 | .pred.rel "mutex",p32,p34 | ||
| 789 | (p32)sub n8=t[1],n8 | ||
| 790 | (p34)sub n8=t[1],n8,1 };; | ||
| 791 | { .mii; (p32)cmp.gtu p35,p33=n8,t[1] | ||
| 792 | (p34)cmp.geu p35,p33=n8,t[1];; | ||
| 793 | .pred.rel "mutex",p33,p35 | ||
| 794 | (p33)sub a8=t[0],r0 } | ||
| 795 | { .mmi; (p35)sub a8=t[0],r0,1;; | ||
| 796 | (p33)cmp.gtu p34,p32=a8,t[0] | ||
| 797 | (p35)cmp.geu p34,p32=a8,t[0] };; | ||
| 798 | |||
| 799 | // save the result, either tmp[num] or tmp[num]-np[num] | ||
| 800 | .pred.rel "mutex",p32,p34 | ||
| 801 | { .mmi; (p32)st8 [rptr]=n1,8 | ||
| 802 | (p34)st8 [rptr]=t0,8 | ||
| 803 | add r19=-4*16,prevsp};; | ||
| 804 | { .mmb; (p32)st8 [rptr]=n2,8 | ||
| 805 | (p34)st8 [rptr]=t[7],8 | ||
| 806 | (p5)br.cond.dpnt.few .Ldone };; | ||
| 807 | { .mmb; (p32)st8 [rptr]=n3,8 | ||
| 808 | (p34)st8 [rptr]=t[6],8 | ||
| 809 | (p7)br.cond.dpnt.few .Ldone };; | ||
| 810 | { .mmb; (p32)st8 [rptr]=n4,8 | ||
| 811 | (p34)st8 [rptr]=t[5],8 | ||
| 812 | (p9)br.cond.dpnt.few .Ldone };; | ||
| 813 | { .mmb; (p32)st8 [rptr]=n5,8 | ||
| 814 | (p34)st8 [rptr]=t[4],8 | ||
| 815 | (p11)br.cond.dpnt.few .Ldone };; | ||
| 816 | { .mmb; (p32)st8 [rptr]=n6,8 | ||
| 817 | (p34)st8 [rptr]=t[3],8 | ||
| 818 | (p13)br.cond.dpnt.few .Ldone };; | ||
| 819 | { .mmb; (p32)st8 [rptr]=n7,8 | ||
| 820 | (p34)st8 [rptr]=t[2],8 | ||
| 821 | (p15)br.cond.dpnt.few .Ldone };; | ||
| 822 | { .mmb; (p32)st8 [rptr]=n8,8 | ||
| 823 | (p34)st8 [rptr]=t[1],8 | ||
| 824 | nop.b 0 };; | ||
| 825 | .Ldone: // epilogue | ||
| 826 | { .mmi; ldf.fill f16=[r16],64 | ||
| 827 | ldf.fill f17=[r17],64 | ||
| 828 | nop.i 0 } | ||
| 829 | { .mmi; ldf.fill f18=[r18],64 | ||
| 830 | ldf.fill f19=[r19],64 | ||
| 831 | mov pr=prevpr,0x1ffff };; | ||
| 832 | { .mmi; ldf.fill f20=[r16] | ||
| 833 | ldf.fill f21=[r17] | ||
| 834 | mov ar.lc=prevlc } | ||
| 835 | { .mmi; ldf.fill f22=[r18] | ||
| 836 | ldf.fill f23=[r19] | ||
| 837 | mov ret0=1 } // signal "handled" | ||
| 838 | { .mib; rum 1<<5 | ||
| 839 | .restore sp | ||
| 840 | mov sp=prevsp | ||
| 841 | br.ret.sptk.many b0 };; | ||
| 842 | .endp bn_mul_mont_8# | ||
| 843 | |||
| 844 | .type copyright#,\@object | ||
| 845 | copyright: | ||
| 846 | stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 847 | ___ | ||
| 848 | |||
| 849 | $output=shift and open STDOUT,">$output"; | ||
| 850 | print $code; | ||
| 851 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/mips-mont.pl b/src/lib/libcrypto/bn/asm/mips-mont.pl new file mode 100644 index 0000000000..b944a12b8e --- /dev/null +++ b/src/lib/libcrypto/bn/asm/mips-mont.pl | |||
| @@ -0,0 +1,426 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # This module doesn't present direct interest for OpenSSL, because it | ||
| 11 | # doesn't provide better performance for longer keys, at least not on | ||
| 12 | # in-order-execution cores. While 512-bit RSA sign operations can be | ||
| 13 | # 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and | ||
| 14 | # 4096-bit ones are up to 15% slower. In 32-bit mode it varies from | ||
| 15 | # 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA | ||
| 16 | # verify:-( All comparisons are against bn_mul_mont-free assembler. | ||
| 17 | # The module might be of interest to embedded system developers, as | ||
| 18 | # the code is smaller than 1KB, yet offers >3x improvement on MIPS64 | ||
| 19 | # and 75-30% [less for longer keys] on MIPS32 over compiler-generated | ||
| 20 | # code. | ||
| 21 | |||
| 22 | ###################################################################### | ||
| 23 | # There is a number of MIPS ABI in use, O32 and N32/64 are most | ||
| 24 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
| 25 | # one picks the latter, it's possible to arrange code in ABI neutral | ||
| 26 | # manner. Therefore let's stick to NUBI register layout: | ||
| 27 | # | ||
| 28 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
| 29 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 30 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
| 31 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
| 32 | # | ||
| 33 | # The return value is placed in $a0. Following coding rules facilitate | ||
| 34 | # interoperability: | ||
| 35 | # | ||
| 36 | # - never ever touch $tp, "thread pointer", former $gp; | ||
| 37 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
| 38 | # old code]; | ||
| 39 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
| 40 | # | ||
| 41 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
| 42 | # | ||
| 43 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
| 44 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 45 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
| 46 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
| 47 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
| 48 | # | ||
| 49 | $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
| 50 | |||
| 51 | if ($flavour =~ /64|n32/i) { | ||
| 52 | $PTR_ADD="dadd"; # incidentally works even on n32 | ||
| 53 | $PTR_SUB="dsub"; # incidentally works even on n32 | ||
| 54 | $REG_S="sd"; | ||
| 55 | $REG_L="ld"; | ||
| 56 | $SZREG=8; | ||
| 57 | } else { | ||
| 58 | $PTR_ADD="add"; | ||
| 59 | $PTR_SUB="sub"; | ||
| 60 | $REG_S="sw"; | ||
| 61 | $REG_L="lw"; | ||
| 62 | $SZREG=4; | ||
| 63 | } | ||
| 64 | $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000; | ||
| 65 | # | ||
| 66 | # <appro@openssl.org> | ||
| 67 | # | ||
| 68 | ###################################################################### | ||
| 69 | |||
| 70 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 71 | open STDOUT,">$output"; | ||
| 72 | |||
| 73 | if ($flavour =~ /64|n32/i) { | ||
| 74 | $LD="ld"; | ||
| 75 | $ST="sd"; | ||
| 76 | $MULTU="dmultu"; | ||
| 77 | $ADDU="daddu"; | ||
| 78 | $SUBU="dsubu"; | ||
| 79 | $BNSZ=8; | ||
| 80 | } else { | ||
| 81 | $LD="lw"; | ||
| 82 | $ST="sw"; | ||
| 83 | $MULTU="multu"; | ||
| 84 | $ADDU="addu"; | ||
| 85 | $SUBU="subu"; | ||
| 86 | $BNSZ=4; | ||
| 87 | } | ||
| 88 | |||
| 89 | # int bn_mul_mont( | ||
| 90 | $rp=$a0; # BN_ULONG *rp, | ||
| 91 | $ap=$a1; # const BN_ULONG *ap, | ||
| 92 | $bp=$a2; # const BN_ULONG *bp, | ||
| 93 | $np=$a3; # const BN_ULONG *np, | ||
| 94 | $n0=$a4; # const BN_ULONG *n0, | ||
| 95 | $num=$a5; # int num); | ||
| 96 | |||
| 97 | $lo0=$a6; | ||
| 98 | $hi0=$a7; | ||
| 99 | $lo1=$t1; | ||
| 100 | $hi1=$t2; | ||
| 101 | $aj=$s0; | ||
| 102 | $bi=$s1; | ||
| 103 | $nj=$s2; | ||
| 104 | $tp=$s3; | ||
| 105 | $alo=$s4; | ||
| 106 | $ahi=$s5; | ||
| 107 | $nlo=$s6; | ||
| 108 | $nhi=$s7; | ||
| 109 | $tj=$s8; | ||
| 110 | $i=$s9; | ||
| 111 | $j=$s10; | ||
| 112 | $m1=$s11; | ||
| 113 | |||
| 114 | $FRAMESIZE=14; | ||
| 115 | |||
| 116 | $code=<<___; | ||
| 117 | .text | ||
| 118 | |||
| 119 | .set noat | ||
| 120 | .set noreorder | ||
| 121 | |||
| 122 | .align 5 | ||
| 123 | .globl bn_mul_mont | ||
| 124 | .ent bn_mul_mont | ||
| 125 | bn_mul_mont: | ||
| 126 | ___ | ||
| 127 | $code.=<<___ if ($flavour =~ /o32/i); | ||
| 128 | lw $n0,16($sp) | ||
| 129 | lw $num,20($sp) | ||
| 130 | ___ | ||
| 131 | $code.=<<___; | ||
| 132 | slt $at,$num,4 | ||
| 133 | bnez $at,1f | ||
| 134 | li $t0,0 | ||
| 135 | slt $at,$num,17 # on in-order CPU | ||
| 136 | bnezl $at,bn_mul_mont_internal | ||
| 137 | nop | ||
| 138 | 1: jr $ra | ||
| 139 | li $a0,0 | ||
| 140 | .end bn_mul_mont | ||
| 141 | |||
| 142 | .align 5 | ||
| 143 | .ent bn_mul_mont_internal | ||
| 144 | bn_mul_mont_internal: | ||
| 145 | .frame $fp,$FRAMESIZE*$SZREG,$ra | ||
| 146 | .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG | ||
| 147 | $PTR_SUB $sp,$FRAMESIZE*$SZREG | ||
| 148 | $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp) | ||
| 149 | $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp) | ||
| 150 | $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp) | ||
| 151 | $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp) | ||
| 152 | $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp) | ||
| 153 | $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp) | ||
| 154 | $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp) | ||
| 155 | $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp) | ||
| 156 | $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp) | ||
| 157 | ___ | ||
| 158 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 159 | $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp) | ||
| 160 | $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp) | ||
| 161 | $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp) | ||
| 162 | $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp) | ||
| 163 | ___ | ||
| 164 | $code.=<<___; | ||
| 165 | move $fp,$sp | ||
| 166 | |||
| 167 | .set reorder | ||
| 168 | $LD $n0,0($n0) | ||
| 169 | $LD $bi,0($bp) # bp[0] | ||
| 170 | $LD $aj,0($ap) # ap[0] | ||
| 171 | $LD $nj,0($np) # np[0] | ||
| 172 | |||
| 173 | $PTR_SUB $sp,2*$BNSZ # place for two extra words | ||
| 174 | sll $num,`log($BNSZ)/log(2)` | ||
| 175 | li $at,-4096 | ||
| 176 | $PTR_SUB $sp,$num | ||
| 177 | and $sp,$at | ||
| 178 | |||
| 179 | $MULTU $aj,$bi | ||
| 180 | $LD $alo,$BNSZ($ap) | ||
| 181 | $LD $nlo,$BNSZ($np) | ||
| 182 | mflo $lo0 | ||
| 183 | mfhi $hi0 | ||
| 184 | $MULTU $lo0,$n0 | ||
| 185 | mflo $m1 | ||
| 186 | |||
| 187 | $MULTU $alo,$bi | ||
| 188 | mflo $alo | ||
| 189 | mfhi $ahi | ||
| 190 | |||
| 191 | $MULTU $nj,$m1 | ||
| 192 | mflo $lo1 | ||
| 193 | mfhi $hi1 | ||
| 194 | $MULTU $nlo,$m1 | ||
| 195 | $ADDU $lo1,$lo0 | ||
| 196 | sltu $at,$lo1,$lo0 | ||
| 197 | $ADDU $hi1,$at | ||
| 198 | mflo $nlo | ||
| 199 | mfhi $nhi | ||
| 200 | |||
| 201 | move $tp,$sp | ||
| 202 | li $j,2*$BNSZ | ||
| 203 | .align 4 | ||
| 204 | .L1st: | ||
| 205 | .set noreorder | ||
| 206 | $PTR_ADD $aj,$ap,$j | ||
| 207 | $PTR_ADD $nj,$np,$j | ||
| 208 | $LD $aj,($aj) | ||
| 209 | $LD $nj,($nj) | ||
| 210 | |||
| 211 | $MULTU $aj,$bi | ||
| 212 | $ADDU $lo0,$alo,$hi0 | ||
| 213 | $ADDU $lo1,$nlo,$hi1 | ||
| 214 | sltu $at,$lo0,$hi0 | ||
| 215 | sltu $t0,$lo1,$hi1 | ||
| 216 | $ADDU $hi0,$ahi,$at | ||
| 217 | $ADDU $hi1,$nhi,$t0 | ||
| 218 | mflo $alo | ||
| 219 | mfhi $ahi | ||
| 220 | |||
| 221 | $ADDU $lo1,$lo0 | ||
| 222 | sltu $at,$lo1,$lo0 | ||
| 223 | $MULTU $nj,$m1 | ||
| 224 | $ADDU $hi1,$at | ||
| 225 | addu $j,$BNSZ | ||
| 226 | $ST $lo1,($tp) | ||
| 227 | sltu $t0,$j,$num | ||
| 228 | mflo $nlo | ||
| 229 | mfhi $nhi | ||
| 230 | |||
| 231 | bnez $t0,.L1st | ||
| 232 | $PTR_ADD $tp,$BNSZ | ||
| 233 | .set reorder | ||
| 234 | |||
| 235 | $ADDU $lo0,$alo,$hi0 | ||
| 236 | sltu $at,$lo0,$hi0 | ||
| 237 | $ADDU $hi0,$ahi,$at | ||
| 238 | |||
| 239 | $ADDU $lo1,$nlo,$hi1 | ||
| 240 | sltu $t0,$lo1,$hi1 | ||
| 241 | $ADDU $hi1,$nhi,$t0 | ||
| 242 | $ADDU $lo1,$lo0 | ||
| 243 | sltu $at,$lo1,$lo0 | ||
| 244 | $ADDU $hi1,$at | ||
| 245 | |||
| 246 | $ST $lo1,($tp) | ||
| 247 | |||
| 248 | $ADDU $hi1,$hi0 | ||
| 249 | sltu $at,$hi1,$hi0 | ||
| 250 | $ST $hi1,$BNSZ($tp) | ||
| 251 | $ST $at,2*$BNSZ($tp) | ||
| 252 | |||
| 253 | li $i,$BNSZ | ||
| 254 | .align 4 | ||
| 255 | .Louter: | ||
| 256 | $PTR_ADD $bi,$bp,$i | ||
| 257 | $LD $bi,($bi) | ||
| 258 | $LD $aj,($ap) | ||
| 259 | $LD $alo,$BNSZ($ap) | ||
| 260 | $LD $tj,($sp) | ||
| 261 | |||
| 262 | $MULTU $aj,$bi | ||
| 263 | $LD $nj,($np) | ||
| 264 | $LD $nlo,$BNSZ($np) | ||
| 265 | mflo $lo0 | ||
| 266 | mfhi $hi0 | ||
| 267 | $ADDU $lo0,$tj | ||
| 268 | $MULTU $lo0,$n0 | ||
| 269 | sltu $at,$lo0,$tj | ||
| 270 | $ADDU $hi0,$at | ||
| 271 | mflo $m1 | ||
| 272 | |||
| 273 | $MULTU $alo,$bi | ||
| 274 | mflo $alo | ||
| 275 | mfhi $ahi | ||
| 276 | |||
| 277 | $MULTU $nj,$m1 | ||
| 278 | mflo $lo1 | ||
| 279 | mfhi $hi1 | ||
| 280 | |||
| 281 | $MULTU $nlo,$m1 | ||
| 282 | $ADDU $lo1,$lo0 | ||
| 283 | sltu $at,$lo1,$lo0 | ||
| 284 | $ADDU $hi1,$at | ||
| 285 | mflo $nlo | ||
| 286 | mfhi $nhi | ||
| 287 | |||
| 288 | move $tp,$sp | ||
| 289 | li $j,2*$BNSZ | ||
| 290 | $LD $tj,$BNSZ($tp) | ||
| 291 | .align 4 | ||
| 292 | .Linner: | ||
| 293 | .set noreorder | ||
| 294 | $PTR_ADD $aj,$ap,$j | ||
| 295 | $PTR_ADD $nj,$np,$j | ||
| 296 | $LD $aj,($aj) | ||
| 297 | $LD $nj,($nj) | ||
| 298 | |||
| 299 | $MULTU $aj,$bi | ||
| 300 | $ADDU $lo0,$alo,$hi0 | ||
| 301 | $ADDU $lo1,$nlo,$hi1 | ||
| 302 | sltu $at,$lo0,$hi0 | ||
| 303 | sltu $t0,$lo1,$hi1 | ||
| 304 | $ADDU $hi0,$ahi,$at | ||
| 305 | $ADDU $hi1,$nhi,$t0 | ||
| 306 | mflo $alo | ||
| 307 | mfhi $ahi | ||
| 308 | |||
| 309 | $ADDU $lo0,$tj | ||
| 310 | addu $j,$BNSZ | ||
| 311 | $MULTU $nj,$m1 | ||
| 312 | sltu $at,$lo0,$tj | ||
| 313 | $ADDU $lo1,$lo0 | ||
| 314 | $ADDU $hi0,$at | ||
| 315 | sltu $t0,$lo1,$lo0 | ||
| 316 | $LD $tj,2*$BNSZ($tp) | ||
| 317 | $ADDU $hi1,$t0 | ||
| 318 | sltu $at,$j,$num | ||
| 319 | mflo $nlo | ||
| 320 | mfhi $nhi | ||
| 321 | $ST $lo1,($tp) | ||
| 322 | bnez $at,.Linner | ||
| 323 | $PTR_ADD $tp,$BNSZ | ||
| 324 | .set reorder | ||
| 325 | |||
| 326 | $ADDU $lo0,$alo,$hi0 | ||
| 327 | sltu $at,$lo0,$hi0 | ||
| 328 | $ADDU $hi0,$ahi,$at | ||
| 329 | $ADDU $lo0,$tj | ||
| 330 | sltu $t0,$lo0,$tj | ||
| 331 | $ADDU $hi0,$t0 | ||
| 332 | |||
| 333 | $LD $tj,2*$BNSZ($tp) | ||
| 334 | $ADDU $lo1,$nlo,$hi1 | ||
| 335 | sltu $at,$lo1,$hi1 | ||
| 336 | $ADDU $hi1,$nhi,$at | ||
| 337 | $ADDU $lo1,$lo0 | ||
| 338 | sltu $t0,$lo1,$lo0 | ||
| 339 | $ADDU $hi1,$t0 | ||
| 340 | $ST $lo1,($tp) | ||
| 341 | |||
| 342 | $ADDU $lo1,$hi1,$hi0 | ||
| 343 | sltu $hi1,$lo1,$hi0 | ||
| 344 | $ADDU $lo1,$tj | ||
| 345 | sltu $at,$lo1,$tj | ||
| 346 | $ADDU $hi1,$at | ||
| 347 | $ST $lo1,$BNSZ($tp) | ||
| 348 | $ST $hi1,2*$BNSZ($tp) | ||
| 349 | |||
| 350 | addu $i,$BNSZ | ||
| 351 | sltu $t0,$i,$num | ||
| 352 | bnez $t0,.Louter | ||
| 353 | |||
| 354 | .set noreorder | ||
| 355 | $PTR_ADD $tj,$sp,$num # &tp[num] | ||
| 356 | move $tp,$sp | ||
| 357 | move $ap,$sp | ||
| 358 | li $hi0,0 # clear borrow bit | ||
| 359 | |||
| 360 | .align 4 | ||
| 361 | .Lsub: $LD $lo0,($tp) | ||
| 362 | $LD $lo1,($np) | ||
| 363 | $PTR_ADD $tp,$BNSZ | ||
| 364 | $PTR_ADD $np,$BNSZ | ||
| 365 | $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i] | ||
| 366 | sgtu $at,$lo1,$lo0 | ||
| 367 | $SUBU $lo0,$lo1,$hi0 | ||
| 368 | sgtu $hi0,$lo0,$lo1 | ||
| 369 | $ST $lo0,($rp) | ||
| 370 | or $hi0,$at | ||
| 371 | sltu $at,$tp,$tj | ||
| 372 | bnez $at,.Lsub | ||
| 373 | $PTR_ADD $rp,$BNSZ | ||
| 374 | |||
| 375 | $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit | ||
| 376 | move $tp,$sp | ||
| 377 | $PTR_SUB $rp,$num # restore rp | ||
| 378 | not $hi1,$hi0 | ||
| 379 | |||
| 380 | and $ap,$hi0,$sp | ||
| 381 | and $bp,$hi1,$rp | ||
| 382 | or $ap,$ap,$bp # ap=borrow?tp:rp | ||
| 383 | |||
| 384 | .align 4 | ||
| 385 | .Lcopy: $LD $aj,($ap) | ||
| 386 | $PTR_ADD $ap,$BNSZ | ||
| 387 | $ST $zero,($tp) | ||
| 388 | $PTR_ADD $tp,$BNSZ | ||
| 389 | sltu $at,$tp,$tj | ||
| 390 | $ST $aj,($rp) | ||
| 391 | bnez $at,.Lcopy | ||
| 392 | $PTR_ADD $rp,$BNSZ | ||
| 393 | |||
| 394 | li $a0,1 | ||
| 395 | li $t0,1 | ||
| 396 | |||
| 397 | .set noreorder | ||
| 398 | move $sp,$fp | ||
| 399 | $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp) | ||
| 400 | $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp) | ||
| 401 | $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp) | ||
| 402 | $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp) | ||
| 403 | $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp) | ||
| 404 | $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp) | ||
| 405 | $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp) | ||
| 406 | $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp) | ||
| 407 | $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp) | ||
| 408 | ___ | ||
| 409 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 410 | $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp) | ||
| 411 | $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp) | ||
| 412 | $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp) | ||
| 413 | $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp) | ||
| 414 | ___ | ||
| 415 | $code.=<<___; | ||
| 416 | jr $ra | ||
| 417 | $PTR_ADD $sp,$FRAMESIZE*$SZREG | ||
| 418 | .end bn_mul_mont_internal | ||
| 419 | .rdata | ||
| 420 | .asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 421 | ___ | ||
| 422 | |||
| 423 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 424 | |||
| 425 | print $code; | ||
| 426 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/mips.pl b/src/lib/libcrypto/bn/asm/mips.pl new file mode 100644 index 0000000000..c162a3ec23 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/mips.pl | |||
| @@ -0,0 +1,2585 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. | ||
| 6 | # | ||
| 7 | # Rights for redistribution and usage in source and binary forms are | ||
| 8 | # granted according to the OpenSSL license. Warranty of any kind is | ||
| 9 | # disclaimed. | ||
| 10 | # ==================================================================== | ||
| 11 | |||
| 12 | |||
| 13 | # July 1999 | ||
| 14 | # | ||
| 15 | # This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. | ||
| 16 | # | ||
| 17 | # The module is designed to work with either of the "new" MIPS ABI(5), | ||
| 18 | # namely N32 or N64, offered by IRIX 6.x. It's not meant to work under | ||
| 19 | # IRIX 5.x not only because it doesn't support new ABIs but also | ||
| 20 | # because 5.x kernels put R4x00 CPU into 32-bit mode and all those | ||
| 21 | # 64-bit instructions (daddu, dmultu, etc.) found below gonna only | ||
| 22 | # cause illegal instruction exception:-( | ||
| 23 | # | ||
| 24 | # In addition the code depends on preprocessor flags set up by MIPSpro | ||
| 25 | # compiler driver (either as or cc) and therefore (probably?) can't be | ||
| 26 | # compiled by the GNU assembler. GNU C driver manages fine though... | ||
| 27 | # I mean as long as -mmips-as is specified or is the default option, | ||
| 28 | # because then it simply invokes /usr/bin/as which in turn takes | ||
| 29 | # perfect care of the preprocessor definitions. Another neat feature | ||
| 30 | # offered by the MIPSpro assembler is an optimization pass. This gave | ||
| 31 | # me the opportunity to have the code looking more regular as all those | ||
| 32 | # architecture dependent instruction rescheduling details were left to | ||
| 33 | # the assembler. Cool, huh? | ||
| 34 | # | ||
| 35 | # Performance improvement is astonishing! 'apps/openssl speed rsa dsa' | ||
| 36 | # goes way over 3 times faster! | ||
| 37 | # | ||
| 38 | # <appro@fy.chalmers.se> | ||
| 39 | |||
| 40 | # October 2010 | ||
| 41 | # | ||
| 42 | # Adapt the module even for 32-bit ABIs and other OSes. The former was | ||
| 43 | # achieved by mechanical replacement of 64-bit arithmetic instructions | ||
| 44 | # such as dmultu, daddu, etc. with their 32-bit counterparts and | ||
| 45 | # adjusting offsets denoting multiples of BN_ULONG. Above mentioned | ||
| 46 | # >3x performance improvement naturally does not apply to 32-bit code | ||
| 47 | # [because there is no instruction 32-bit compiler can't use], one | ||
| 48 | # has to be content with 40-85% improvement depending on benchmark and | ||
| 49 | # key length, more for longer keys. | ||
| 50 | |||
| 51 | $flavour = shift; | ||
| 52 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 53 | open STDOUT,">$output"; | ||
| 54 | |||
| 55 | if ($flavour =~ /64|n32/i) { | ||
| 56 | $LD="ld"; | ||
| 57 | $ST="sd"; | ||
| 58 | $MULTU="dmultu"; | ||
| 59 | $DIVU="ddivu"; | ||
| 60 | $ADDU="daddu"; | ||
| 61 | $SUBU="dsubu"; | ||
| 62 | $SRL="dsrl"; | ||
| 63 | $SLL="dsll"; | ||
| 64 | $BNSZ=8; | ||
| 65 | $PTR_ADD="daddu"; | ||
| 66 | $PTR_SUB="dsubu"; | ||
| 67 | $SZREG=8; | ||
| 68 | $REG_S="sd"; | ||
| 69 | $REG_L="ld"; | ||
| 70 | } else { | ||
| 71 | $LD="lw"; | ||
| 72 | $ST="sw"; | ||
| 73 | $MULTU="multu"; | ||
| 74 | $DIVU="divu"; | ||
| 75 | $ADDU="addu"; | ||
| 76 | $SUBU="subu"; | ||
| 77 | $SRL="srl"; | ||
| 78 | $SLL="sll"; | ||
| 79 | $BNSZ=4; | ||
| 80 | $PTR_ADD="addu"; | ||
| 81 | $PTR_SUB="subu"; | ||
| 82 | $SZREG=4; | ||
| 83 | $REG_S="sw"; | ||
| 84 | $REG_L="lw"; | ||
| 85 | $code=".set mips2\n"; | ||
| 86 | } | ||
| 87 | |||
| 88 | # Below is N32/64 register layout used in the original module. | ||
| 89 | # | ||
| 90 | ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
| 91 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 92 | ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
| 93 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
| 94 | ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
| 95 | ($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7); | ||
| 96 | # | ||
| 97 | # No special adaptation is required for O32. NUBI on the other hand | ||
| 98 | # is treated by saving/restoring ($v1,$t0..$t3). | ||
| 99 | |||
| 100 | $gp=$v1 if ($flavour =~ /nubi/i); | ||
| 101 | |||
| 102 | $minus4=$v1; | ||
| 103 | |||
| 104 | $code.=<<___; | ||
| 105 | .rdata | ||
| 106 | .asciiz "mips3.s, Version 1.2" | ||
| 107 | .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" | ||
| 108 | |||
| 109 | .text | ||
| 110 | .set noat | ||
| 111 | |||
| 112 | .align 5 | ||
| 113 | .globl bn_mul_add_words | ||
| 114 | .ent bn_mul_add_words | ||
| 115 | bn_mul_add_words: | ||
| 116 | .set noreorder | ||
| 117 | bgtz $a2,bn_mul_add_words_internal | ||
| 118 | move $v0,$zero | ||
| 119 | jr $ra | ||
| 120 | move $a0,$v0 | ||
| 121 | .end bn_mul_add_words | ||
| 122 | |||
| 123 | .align 5 | ||
| 124 | .ent bn_mul_add_words_internal | ||
| 125 | bn_mul_add_words_internal: | ||
| 126 | ___ | ||
| 127 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 128 | .frame $sp,6*$SZREG,$ra | ||
| 129 | .mask 0x8000f008,-$SZREG | ||
| 130 | .set noreorder | ||
| 131 | $PTR_SUB $sp,6*$SZREG | ||
| 132 | $REG_S $ra,5*$SZREG($sp) | ||
| 133 | $REG_S $t3,4*$SZREG($sp) | ||
| 134 | $REG_S $t2,3*$SZREG($sp) | ||
| 135 | $REG_S $t1,2*$SZREG($sp) | ||
| 136 | $REG_S $t0,1*$SZREG($sp) | ||
| 137 | $REG_S $gp,0*$SZREG($sp) | ||
| 138 | ___ | ||
| 139 | $code.=<<___; | ||
| 140 | .set reorder | ||
| 141 | li $minus4,-4 | ||
| 142 | and $ta0,$a2,$minus4 | ||
| 143 | $LD $t0,0($a1) | ||
| 144 | beqz $ta0,.L_bn_mul_add_words_tail | ||
| 145 | |||
| 146 | .L_bn_mul_add_words_loop: | ||
| 147 | $MULTU $t0,$a3 | ||
| 148 | $LD $t1,0($a0) | ||
| 149 | $LD $t2,$BNSZ($a1) | ||
| 150 | $LD $t3,$BNSZ($a0) | ||
| 151 | $LD $ta0,2*$BNSZ($a1) | ||
| 152 | $LD $ta1,2*$BNSZ($a0) | ||
| 153 | $ADDU $t1,$v0 | ||
| 154 | sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit | ||
| 155 | # values", but it seems to work fine | ||
| 156 | # even on 64-bit registers. | ||
| 157 | mflo $at | ||
| 158 | mfhi $t0 | ||
| 159 | $ADDU $t1,$at | ||
| 160 | $ADDU $v0,$t0 | ||
| 161 | $MULTU $t2,$a3 | ||
| 162 | sltu $at,$t1,$at | ||
| 163 | $ST $t1,0($a0) | ||
| 164 | $ADDU $v0,$at | ||
| 165 | |||
| 166 | $LD $ta2,3*$BNSZ($a1) | ||
| 167 | $LD $ta3,3*$BNSZ($a0) | ||
| 168 | $ADDU $t3,$v0 | ||
| 169 | sltu $v0,$t3,$v0 | ||
| 170 | mflo $at | ||
| 171 | mfhi $t2 | ||
| 172 | $ADDU $t3,$at | ||
| 173 | $ADDU $v0,$t2 | ||
| 174 | $MULTU $ta0,$a3 | ||
| 175 | sltu $at,$t3,$at | ||
| 176 | $ST $t3,$BNSZ($a0) | ||
| 177 | $ADDU $v0,$at | ||
| 178 | |||
| 179 | subu $a2,4 | ||
| 180 | $PTR_ADD $a0,4*$BNSZ | ||
| 181 | $PTR_ADD $a1,4*$BNSZ | ||
| 182 | $ADDU $ta1,$v0 | ||
| 183 | sltu $v0,$ta1,$v0 | ||
| 184 | mflo $at | ||
| 185 | mfhi $ta0 | ||
| 186 | $ADDU $ta1,$at | ||
| 187 | $ADDU $v0,$ta0 | ||
| 188 | $MULTU $ta2,$a3 | ||
| 189 | sltu $at,$ta1,$at | ||
| 190 | $ST $ta1,-2*$BNSZ($a0) | ||
| 191 | $ADDU $v0,$at | ||
| 192 | |||
| 193 | |||
| 194 | and $ta0,$a2,$minus4 | ||
| 195 | $ADDU $ta3,$v0 | ||
| 196 | sltu $v0,$ta3,$v0 | ||
| 197 | mflo $at | ||
| 198 | mfhi $ta2 | ||
| 199 | $ADDU $ta3,$at | ||
| 200 | $ADDU $v0,$ta2 | ||
| 201 | sltu $at,$ta3,$at | ||
| 202 | $ST $ta3,-$BNSZ($a0) | ||
| 203 | $ADDU $v0,$at | ||
| 204 | .set noreorder | ||
| 205 | bgtzl $ta0,.L_bn_mul_add_words_loop | ||
| 206 | $LD $t0,0($a1) | ||
| 207 | |||
| 208 | beqz $a2,.L_bn_mul_add_words_return | ||
| 209 | nop | ||
| 210 | |||
| 211 | .L_bn_mul_add_words_tail: | ||
| 212 | .set reorder | ||
| 213 | $LD $t0,0($a1) | ||
| 214 | $MULTU $t0,$a3 | ||
| 215 | $LD $t1,0($a0) | ||
| 216 | subu $a2,1 | ||
| 217 | $ADDU $t1,$v0 | ||
| 218 | sltu $v0,$t1,$v0 | ||
| 219 | mflo $at | ||
| 220 | mfhi $t0 | ||
| 221 | $ADDU $t1,$at | ||
| 222 | $ADDU $v0,$t0 | ||
| 223 | sltu $at,$t1,$at | ||
| 224 | $ST $t1,0($a0) | ||
| 225 | $ADDU $v0,$at | ||
| 226 | beqz $a2,.L_bn_mul_add_words_return | ||
| 227 | |||
| 228 | $LD $t0,$BNSZ($a1) | ||
| 229 | $MULTU $t0,$a3 | ||
| 230 | $LD $t1,$BNSZ($a0) | ||
| 231 | subu $a2,1 | ||
| 232 | $ADDU $t1,$v0 | ||
| 233 | sltu $v0,$t1,$v0 | ||
| 234 | mflo $at | ||
| 235 | mfhi $t0 | ||
| 236 | $ADDU $t1,$at | ||
| 237 | $ADDU $v0,$t0 | ||
| 238 | sltu $at,$t1,$at | ||
| 239 | $ST $t1,$BNSZ($a0) | ||
| 240 | $ADDU $v0,$at | ||
| 241 | beqz $a2,.L_bn_mul_add_words_return | ||
| 242 | |||
| 243 | $LD $t0,2*$BNSZ($a1) | ||
| 244 | $MULTU $t0,$a3 | ||
| 245 | $LD $t1,2*$BNSZ($a0) | ||
| 246 | $ADDU $t1,$v0 | ||
| 247 | sltu $v0,$t1,$v0 | ||
| 248 | mflo $at | ||
| 249 | mfhi $t0 | ||
| 250 | $ADDU $t1,$at | ||
| 251 | $ADDU $v0,$t0 | ||
| 252 | sltu $at,$t1,$at | ||
| 253 | $ST $t1,2*$BNSZ($a0) | ||
| 254 | $ADDU $v0,$at | ||
| 255 | |||
| 256 | .L_bn_mul_add_words_return: | ||
| 257 | .set noreorder | ||
| 258 | ___ | ||
| 259 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 260 | $REG_L $t3,4*$SZREG($sp) | ||
| 261 | $REG_L $t2,3*$SZREG($sp) | ||
| 262 | $REG_L $t1,2*$SZREG($sp) | ||
| 263 | $REG_L $t0,1*$SZREG($sp) | ||
| 264 | $REG_L $gp,0*$SZREG($sp) | ||
| 265 | $PTR_ADD $sp,6*$SZREG | ||
| 266 | ___ | ||
| 267 | $code.=<<___; | ||
| 268 | jr $ra | ||
| 269 | move $a0,$v0 | ||
| 270 | .end bn_mul_add_words_internal | ||
| 271 | |||
| 272 | .align 5 | ||
| 273 | .globl bn_mul_words | ||
| 274 | .ent bn_mul_words | ||
| 275 | bn_mul_words: | ||
| 276 | .set noreorder | ||
| 277 | bgtz $a2,bn_mul_words_internal | ||
| 278 | move $v0,$zero | ||
| 279 | jr $ra | ||
| 280 | move $a0,$v0 | ||
| 281 | .end bn_mul_words | ||
| 282 | |||
| 283 | .align 5 | ||
| 284 | .ent bn_mul_words_internal | ||
| 285 | bn_mul_words_internal: | ||
| 286 | ___ | ||
| 287 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 288 | .frame $sp,6*$SZREG,$ra | ||
| 289 | .mask 0x8000f008,-$SZREG | ||
| 290 | .set noreorder | ||
| 291 | $PTR_SUB $sp,6*$SZREG | ||
| 292 | $REG_S $ra,5*$SZREG($sp) | ||
| 293 | $REG_S $t3,4*$SZREG($sp) | ||
| 294 | $REG_S $t2,3*$SZREG($sp) | ||
| 295 | $REG_S $t1,2*$SZREG($sp) | ||
| 296 | $REG_S $t0,1*$SZREG($sp) | ||
| 297 | $REG_S $gp,0*$SZREG($sp) | ||
| 298 | ___ | ||
| 299 | $code.=<<___; | ||
| 300 | .set reorder | ||
| 301 | li $minus4,-4 | ||
| 302 | and $ta0,$a2,$minus4 | ||
| 303 | $LD $t0,0($a1) | ||
| 304 | beqz $ta0,.L_bn_mul_words_tail | ||
| 305 | |||
| 306 | .L_bn_mul_words_loop: | ||
| 307 | $MULTU $t0,$a3 | ||
| 308 | $LD $t2,$BNSZ($a1) | ||
| 309 | $LD $ta0,2*$BNSZ($a1) | ||
| 310 | $LD $ta2,3*$BNSZ($a1) | ||
| 311 | mflo $at | ||
| 312 | mfhi $t0 | ||
| 313 | $ADDU $v0,$at | ||
| 314 | sltu $t1,$v0,$at | ||
| 315 | $MULTU $t2,$a3 | ||
| 316 | $ST $v0,0($a0) | ||
| 317 | $ADDU $v0,$t1,$t0 | ||
| 318 | |||
| 319 | subu $a2,4 | ||
| 320 | $PTR_ADD $a0,4*$BNSZ | ||
| 321 | $PTR_ADD $a1,4*$BNSZ | ||
| 322 | mflo $at | ||
| 323 | mfhi $t2 | ||
| 324 | $ADDU $v0,$at | ||
| 325 | sltu $t3,$v0,$at | ||
| 326 | $MULTU $ta0,$a3 | ||
| 327 | $ST $v0,-3*$BNSZ($a0) | ||
| 328 | $ADDU $v0,$t3,$t2 | ||
| 329 | |||
| 330 | mflo $at | ||
| 331 | mfhi $ta0 | ||
| 332 | $ADDU $v0,$at | ||
| 333 | sltu $ta1,$v0,$at | ||
| 334 | $MULTU $ta2,$a3 | ||
| 335 | $ST $v0,-2*$BNSZ($a0) | ||
| 336 | $ADDU $v0,$ta1,$ta0 | ||
| 337 | |||
| 338 | and $ta0,$a2,$minus4 | ||
| 339 | mflo $at | ||
| 340 | mfhi $ta2 | ||
| 341 | $ADDU $v0,$at | ||
| 342 | sltu $ta3,$v0,$at | ||
| 343 | $ST $v0,-$BNSZ($a0) | ||
| 344 | $ADDU $v0,$ta3,$ta2 | ||
| 345 | .set noreorder | ||
| 346 | bgtzl $ta0,.L_bn_mul_words_loop | ||
| 347 | $LD $t0,0($a1) | ||
| 348 | |||
| 349 | beqz $a2,.L_bn_mul_words_return | ||
| 350 | nop | ||
| 351 | |||
| 352 | .L_bn_mul_words_tail: | ||
| 353 | .set reorder | ||
| 354 | $LD $t0,0($a1) | ||
| 355 | $MULTU $t0,$a3 | ||
| 356 | subu $a2,1 | ||
| 357 | mflo $at | ||
| 358 | mfhi $t0 | ||
| 359 | $ADDU $v0,$at | ||
| 360 | sltu $t1,$v0,$at | ||
| 361 | $ST $v0,0($a0) | ||
| 362 | $ADDU $v0,$t1,$t0 | ||
| 363 | beqz $a2,.L_bn_mul_words_return | ||
| 364 | |||
| 365 | $LD $t0,$BNSZ($a1) | ||
| 366 | $MULTU $t0,$a3 | ||
| 367 | subu $a2,1 | ||
| 368 | mflo $at | ||
| 369 | mfhi $t0 | ||
| 370 | $ADDU $v0,$at | ||
| 371 | sltu $t1,$v0,$at | ||
| 372 | $ST $v0,$BNSZ($a0) | ||
| 373 | $ADDU $v0,$t1,$t0 | ||
| 374 | beqz $a2,.L_bn_mul_words_return | ||
| 375 | |||
| 376 | $LD $t0,2*$BNSZ($a1) | ||
| 377 | $MULTU $t0,$a3 | ||
| 378 | mflo $at | ||
| 379 | mfhi $t0 | ||
| 380 | $ADDU $v0,$at | ||
| 381 | sltu $t1,$v0,$at | ||
| 382 | $ST $v0,2*$BNSZ($a0) | ||
| 383 | $ADDU $v0,$t1,$t0 | ||
| 384 | |||
| 385 | .L_bn_mul_words_return: | ||
| 386 | .set noreorder | ||
| 387 | ___ | ||
| 388 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 389 | $REG_L $t3,4*$SZREG($sp) | ||
| 390 | $REG_L $t2,3*$SZREG($sp) | ||
| 391 | $REG_L $t1,2*$SZREG($sp) | ||
| 392 | $REG_L $t0,1*$SZREG($sp) | ||
| 393 | $REG_L $gp,0*$SZREG($sp) | ||
| 394 | $PTR_ADD $sp,6*$SZREG | ||
| 395 | ___ | ||
| 396 | $code.=<<___; | ||
| 397 | jr $ra | ||
| 398 | move $a0,$v0 | ||
| 399 | .end bn_mul_words_internal | ||
| 400 | |||
| 401 | .align 5 | ||
| 402 | .globl bn_sqr_words | ||
| 403 | .ent bn_sqr_words | ||
| 404 | bn_sqr_words: | ||
| 405 | .set noreorder | ||
| 406 | bgtz $a2,bn_sqr_words_internal | ||
| 407 | move $v0,$zero | ||
| 408 | jr $ra | ||
| 409 | move $a0,$v0 | ||
| 410 | .end bn_sqr_words | ||
| 411 | |||
| 412 | .align 5 | ||
| 413 | .ent bn_sqr_words_internal | ||
| 414 | bn_sqr_words_internal: | ||
| 415 | ___ | ||
| 416 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 417 | .frame $sp,6*$SZREG,$ra | ||
| 418 | .mask 0x8000f008,-$SZREG | ||
| 419 | .set noreorder | ||
| 420 | $PTR_SUB $sp,6*$SZREG | ||
| 421 | $REG_S $ra,5*$SZREG($sp) | ||
| 422 | $REG_S $t3,4*$SZREG($sp) | ||
| 423 | $REG_S $t2,3*$SZREG($sp) | ||
| 424 | $REG_S $t1,2*$SZREG($sp) | ||
| 425 | $REG_S $t0,1*$SZREG($sp) | ||
| 426 | $REG_S $gp,0*$SZREG($sp) | ||
| 427 | ___ | ||
| 428 | $code.=<<___; | ||
| 429 | .set reorder | ||
| 430 | li $minus4,-4 | ||
| 431 | and $ta0,$a2,$minus4 | ||
| 432 | $LD $t0,0($a1) | ||
| 433 | beqz $ta0,.L_bn_sqr_words_tail | ||
| 434 | |||
| 435 | .L_bn_sqr_words_loop: | ||
| 436 | $MULTU $t0,$t0 | ||
| 437 | $LD $t2,$BNSZ($a1) | ||
| 438 | $LD $ta0,2*$BNSZ($a1) | ||
| 439 | $LD $ta2,3*$BNSZ($a1) | ||
| 440 | mflo $t1 | ||
| 441 | mfhi $t0 | ||
| 442 | $ST $t1,0($a0) | ||
| 443 | $ST $t0,$BNSZ($a0) | ||
| 444 | |||
| 445 | $MULTU $t2,$t2 | ||
| 446 | subu $a2,4 | ||
| 447 | $PTR_ADD $a0,8*$BNSZ | ||
| 448 | $PTR_ADD $a1,4*$BNSZ | ||
| 449 | mflo $t3 | ||
| 450 | mfhi $t2 | ||
| 451 | $ST $t3,-6*$BNSZ($a0) | ||
| 452 | $ST $t2,-5*$BNSZ($a0) | ||
| 453 | |||
| 454 | $MULTU $ta0,$ta0 | ||
| 455 | mflo $ta1 | ||
| 456 | mfhi $ta0 | ||
| 457 | $ST $ta1,-4*$BNSZ($a0) | ||
| 458 | $ST $ta0,-3*$BNSZ($a0) | ||
| 459 | |||
| 460 | |||
| 461 | $MULTU $ta2,$ta2 | ||
| 462 | and $ta0,$a2,$minus4 | ||
| 463 | mflo $ta3 | ||
| 464 | mfhi $ta2 | ||
| 465 | $ST $ta3,-2*$BNSZ($a0) | ||
| 466 | $ST $ta2,-$BNSZ($a0) | ||
| 467 | |||
| 468 | .set noreorder | ||
| 469 | bgtzl $ta0,.L_bn_sqr_words_loop | ||
| 470 | $LD $t0,0($a1) | ||
| 471 | |||
| 472 | beqz $a2,.L_bn_sqr_words_return | ||
| 473 | nop | ||
| 474 | |||
| 475 | .L_bn_sqr_words_tail: | ||
| 476 | .set reorder | ||
| 477 | $LD $t0,0($a1) | ||
| 478 | $MULTU $t0,$t0 | ||
| 479 | subu $a2,1 | ||
| 480 | mflo $t1 | ||
| 481 | mfhi $t0 | ||
| 482 | $ST $t1,0($a0) | ||
| 483 | $ST $t0,$BNSZ($a0) | ||
| 484 | beqz $a2,.L_bn_sqr_words_return | ||
| 485 | |||
| 486 | $LD $t0,$BNSZ($a1) | ||
| 487 | $MULTU $t0,$t0 | ||
| 488 | subu $a2,1 | ||
| 489 | mflo $t1 | ||
| 490 | mfhi $t0 | ||
| 491 | $ST $t1,2*$BNSZ($a0) | ||
| 492 | $ST $t0,3*$BNSZ($a0) | ||
| 493 | beqz $a2,.L_bn_sqr_words_return | ||
| 494 | |||
| 495 | $LD $t0,2*$BNSZ($a1) | ||
| 496 | $MULTU $t0,$t0 | ||
| 497 | mflo $t1 | ||
| 498 | mfhi $t0 | ||
| 499 | $ST $t1,4*$BNSZ($a0) | ||
| 500 | $ST $t0,5*$BNSZ($a0) | ||
| 501 | |||
| 502 | .L_bn_sqr_words_return: | ||
| 503 | .set noreorder | ||
| 504 | ___ | ||
| 505 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 506 | $REG_L $t3,4*$SZREG($sp) | ||
| 507 | $REG_L $t2,3*$SZREG($sp) | ||
| 508 | $REG_L $t1,2*$SZREG($sp) | ||
| 509 | $REG_L $t0,1*$SZREG($sp) | ||
| 510 | $REG_L $gp,0*$SZREG($sp) | ||
| 511 | $PTR_ADD $sp,6*$SZREG | ||
| 512 | ___ | ||
| 513 | $code.=<<___; | ||
| 514 | jr $ra | ||
| 515 | move $a0,$v0 | ||
| 516 | |||
| 517 | .end bn_sqr_words_internal | ||
| 518 | |||
| 519 | .align 5 | ||
| 520 | .globl bn_add_words | ||
| 521 | .ent bn_add_words | ||
| 522 | bn_add_words: | ||
| 523 | .set noreorder | ||
| 524 | bgtz $a3,bn_add_words_internal | ||
| 525 | move $v0,$zero | ||
| 526 | jr $ra | ||
| 527 | move $a0,$v0 | ||
| 528 | .end bn_add_words | ||
| 529 | |||
| 530 | .align 5 | ||
| 531 | .ent bn_add_words_internal | ||
| 532 | bn_add_words_internal: | ||
| 533 | ___ | ||
| 534 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 535 | .frame $sp,6*$SZREG,$ra | ||
| 536 | .mask 0x8000f008,-$SZREG | ||
| 537 | .set noreorder | ||
| 538 | $PTR_SUB $sp,6*$SZREG | ||
| 539 | $REG_S $ra,5*$SZREG($sp) | ||
| 540 | $REG_S $t3,4*$SZREG($sp) | ||
| 541 | $REG_S $t2,3*$SZREG($sp) | ||
| 542 | $REG_S $t1,2*$SZREG($sp) | ||
| 543 | $REG_S $t0,1*$SZREG($sp) | ||
| 544 | $REG_S $gp,0*$SZREG($sp) | ||
| 545 | ___ | ||
| 546 | $code.=<<___; | ||
| 547 | .set reorder | ||
| 548 | li $minus4,-4 | ||
| 549 | and $at,$a3,$minus4 | ||
| 550 | $LD $t0,0($a1) | ||
| 551 | beqz $at,.L_bn_add_words_tail | ||
| 552 | |||
| 553 | .L_bn_add_words_loop: | ||
| 554 | $LD $ta0,0($a2) | ||
| 555 | subu $a3,4 | ||
| 556 | $LD $t1,$BNSZ($a1) | ||
| 557 | and $at,$a3,$minus4 | ||
| 558 | $LD $t2,2*$BNSZ($a1) | ||
| 559 | $PTR_ADD $a2,4*$BNSZ | ||
| 560 | $LD $t3,3*$BNSZ($a1) | ||
| 561 | $PTR_ADD $a0,4*$BNSZ | ||
| 562 | $LD $ta1,-3*$BNSZ($a2) | ||
| 563 | $PTR_ADD $a1,4*$BNSZ | ||
| 564 | $LD $ta2,-2*$BNSZ($a2) | ||
| 565 | $LD $ta3,-$BNSZ($a2) | ||
| 566 | $ADDU $ta0,$t0 | ||
| 567 | sltu $t8,$ta0,$t0 | ||
| 568 | $ADDU $t0,$ta0,$v0 | ||
| 569 | sltu $v0,$t0,$ta0 | ||
| 570 | $ST $t0,-4*$BNSZ($a0) | ||
| 571 | $ADDU $v0,$t8 | ||
| 572 | |||
| 573 | $ADDU $ta1,$t1 | ||
| 574 | sltu $t9,$ta1,$t1 | ||
| 575 | $ADDU $t1,$ta1,$v0 | ||
| 576 | sltu $v0,$t1,$ta1 | ||
| 577 | $ST $t1,-3*$BNSZ($a0) | ||
| 578 | $ADDU $v0,$t9 | ||
| 579 | |||
| 580 | $ADDU $ta2,$t2 | ||
| 581 | sltu $t8,$ta2,$t2 | ||
| 582 | $ADDU $t2,$ta2,$v0 | ||
| 583 | sltu $v0,$t2,$ta2 | ||
| 584 | $ST $t2,-2*$BNSZ($a0) | ||
| 585 | $ADDU $v0,$t8 | ||
| 586 | |||
| 587 | $ADDU $ta3,$t3 | ||
| 588 | sltu $t9,$ta3,$t3 | ||
| 589 | $ADDU $t3,$ta3,$v0 | ||
| 590 | sltu $v0,$t3,$ta3 | ||
| 591 | $ST $t3,-$BNSZ($a0) | ||
| 592 | $ADDU $v0,$t9 | ||
| 593 | |||
| 594 | .set noreorder | ||
| 595 | bgtzl $at,.L_bn_add_words_loop | ||
| 596 | $LD $t0,0($a1) | ||
| 597 | |||
| 598 | beqz $a3,.L_bn_add_words_return | ||
| 599 | nop | ||
| 600 | |||
| 601 | .L_bn_add_words_tail: | ||
| 602 | .set reorder | ||
| 603 | $LD $t0,0($a1) | ||
| 604 | $LD $ta0,0($a2) | ||
| 605 | $ADDU $ta0,$t0 | ||
| 606 | subu $a3,1 | ||
| 607 | sltu $t8,$ta0,$t0 | ||
| 608 | $ADDU $t0,$ta0,$v0 | ||
| 609 | sltu $v0,$t0,$ta0 | ||
| 610 | $ST $t0,0($a0) | ||
| 611 | $ADDU $v0,$t8 | ||
| 612 | beqz $a3,.L_bn_add_words_return | ||
| 613 | |||
| 614 | $LD $t1,$BNSZ($a1) | ||
| 615 | $LD $ta1,$BNSZ($a2) | ||
| 616 | $ADDU $ta1,$t1 | ||
| 617 | subu $a3,1 | ||
| 618 | sltu $t9,$ta1,$t1 | ||
| 619 | $ADDU $t1,$ta1,$v0 | ||
| 620 | sltu $v0,$t1,$ta1 | ||
| 621 | $ST $t1,$BNSZ($a0) | ||
| 622 | $ADDU $v0,$t9 | ||
| 623 | beqz $a3,.L_bn_add_words_return | ||
| 624 | |||
| 625 | $LD $t2,2*$BNSZ($a1) | ||
| 626 | $LD $ta2,2*$BNSZ($a2) | ||
| 627 | $ADDU $ta2,$t2 | ||
| 628 | sltu $t8,$ta2,$t2 | ||
| 629 | $ADDU $t2,$ta2,$v0 | ||
| 630 | sltu $v0,$t2,$ta2 | ||
| 631 | $ST $t2,2*$BNSZ($a0) | ||
| 632 | $ADDU $v0,$t8 | ||
| 633 | |||
| 634 | .L_bn_add_words_return: | ||
| 635 | .set noreorder | ||
| 636 | ___ | ||
| 637 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 638 | $REG_L $t3,4*$SZREG($sp) | ||
| 639 | $REG_L $t2,3*$SZREG($sp) | ||
| 640 | $REG_L $t1,2*$SZREG($sp) | ||
| 641 | $REG_L $t0,1*$SZREG($sp) | ||
| 642 | $REG_L $gp,0*$SZREG($sp) | ||
| 643 | $PTR_ADD $sp,6*$SZREG | ||
| 644 | ___ | ||
| 645 | $code.=<<___; | ||
| 646 | jr $ra | ||
| 647 | move $a0,$v0 | ||
| 648 | |||
| 649 | .end bn_add_words_internal | ||
| 650 | |||
| 651 | .align 5 | ||
| 652 | .globl bn_sub_words | ||
| 653 | .ent bn_sub_words | ||
| 654 | bn_sub_words: | ||
| 655 | .set noreorder | ||
| 656 | bgtz $a3,bn_sub_words_internal | ||
| 657 | move $v0,$zero | ||
| 658 | jr $ra | ||
| 659 | move $a0,$zero | ||
| 660 | .end bn_sub_words | ||
| 661 | |||
| 662 | .align 5 | ||
| 663 | .ent bn_sub_words_internal | ||
| 664 | bn_sub_words_internal: | ||
| 665 | ___ | ||
| 666 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 667 | .frame $sp,6*$SZREG,$ra | ||
| 668 | .mask 0x8000f008,-$SZREG | ||
| 669 | .set noreorder | ||
| 670 | $PTR_SUB $sp,6*$SZREG | ||
| 671 | $REG_S $ra,5*$SZREG($sp) | ||
| 672 | $REG_S $t3,4*$SZREG($sp) | ||
| 673 | $REG_S $t2,3*$SZREG($sp) | ||
| 674 | $REG_S $t1,2*$SZREG($sp) | ||
| 675 | $REG_S $t0,1*$SZREG($sp) | ||
| 676 | $REG_S $gp,0*$SZREG($sp) | ||
| 677 | ___ | ||
| 678 | $code.=<<___; | ||
| 679 | .set reorder | ||
| 680 | li $minus4,-4 | ||
| 681 | and $at,$a3,$minus4 | ||
| 682 | $LD $t0,0($a1) | ||
| 683 | beqz $at,.L_bn_sub_words_tail | ||
| 684 | |||
| 685 | .L_bn_sub_words_loop: | ||
| 686 | $LD $ta0,0($a2) | ||
| 687 | subu $a3,4 | ||
| 688 | $LD $t1,$BNSZ($a1) | ||
| 689 | and $at,$a3,$minus4 | ||
| 690 | $LD $t2,2*$BNSZ($a1) | ||
| 691 | $PTR_ADD $a2,4*$BNSZ | ||
| 692 | $LD $t3,3*$BNSZ($a1) | ||
| 693 | $PTR_ADD $a0,4*$BNSZ | ||
| 694 | $LD $ta1,-3*$BNSZ($a2) | ||
| 695 | $PTR_ADD $a1,4*$BNSZ | ||
| 696 | $LD $ta2,-2*$BNSZ($a2) | ||
| 697 | $LD $ta3,-$BNSZ($a2) | ||
| 698 | sltu $t8,$t0,$ta0 | ||
| 699 | $SUBU $ta0,$t0,$ta0 | ||
| 700 | $SUBU $t0,$ta0,$v0 | ||
| 701 | sgtu $v0,$t0,$ta0 | ||
| 702 | $ST $t0,-4*$BNSZ($a0) | ||
| 703 | $ADDU $v0,$t8 | ||
| 704 | |||
| 705 | sltu $t9,$t1,$ta1 | ||
| 706 | $SUBU $ta1,$t1,$ta1 | ||
| 707 | $SUBU $t1,$ta1,$v0 | ||
| 708 | sgtu $v0,$t1,$ta1 | ||
| 709 | $ST $t1,-3*$BNSZ($a0) | ||
| 710 | $ADDU $v0,$t9 | ||
| 711 | |||
| 712 | |||
| 713 | sltu $t8,$t2,$ta2 | ||
| 714 | $SUBU $ta2,$t2,$ta2 | ||
| 715 | $SUBU $t2,$ta2,$v0 | ||
| 716 | sgtu $v0,$t2,$ta2 | ||
| 717 | $ST $t2,-2*$BNSZ($a0) | ||
| 718 | $ADDU $v0,$t8 | ||
| 719 | |||
| 720 | sltu $t9,$t3,$ta3 | ||
| 721 | $SUBU $ta3,$t3,$ta3 | ||
| 722 | $SUBU $t3,$ta3,$v0 | ||
| 723 | sgtu $v0,$t3,$ta3 | ||
| 724 | $ST $t3,-$BNSZ($a0) | ||
| 725 | $ADDU $v0,$t9 | ||
| 726 | |||
| 727 | .set noreorder | ||
| 728 | bgtzl $at,.L_bn_sub_words_loop | ||
| 729 | $LD $t0,0($a1) | ||
| 730 | |||
| 731 | beqz $a3,.L_bn_sub_words_return | ||
| 732 | nop | ||
| 733 | |||
| 734 | .L_bn_sub_words_tail: | ||
| 735 | .set reorder | ||
| 736 | $LD $t0,0($a1) | ||
| 737 | $LD $ta0,0($a2) | ||
| 738 | subu $a3,1 | ||
| 739 | sltu $t8,$t0,$ta0 | ||
| 740 | $SUBU $ta0,$t0,$ta0 | ||
| 741 | $SUBU $t0,$ta0,$v0 | ||
| 742 | sgtu $v0,$t0,$ta0 | ||
| 743 | $ST $t0,0($a0) | ||
| 744 | $ADDU $v0,$t8 | ||
| 745 | beqz $a3,.L_bn_sub_words_return | ||
| 746 | |||
| 747 | $LD $t1,$BNSZ($a1) | ||
| 748 | subu $a3,1 | ||
| 749 | $LD $ta1,$BNSZ($a2) | ||
| 750 | sltu $t9,$t1,$ta1 | ||
| 751 | $SUBU $ta1,$t1,$ta1 | ||
| 752 | $SUBU $t1,$ta1,$v0 | ||
| 753 | sgtu $v0,$t1,$ta1 | ||
| 754 | $ST $t1,$BNSZ($a0) | ||
| 755 | $ADDU $v0,$t9 | ||
| 756 | beqz $a3,.L_bn_sub_words_return | ||
| 757 | |||
| 758 | $LD $t2,2*$BNSZ($a1) | ||
| 759 | $LD $ta2,2*$BNSZ($a2) | ||
| 760 | sltu $t8,$t2,$ta2 | ||
| 761 | $SUBU $ta2,$t2,$ta2 | ||
| 762 | $SUBU $t2,$ta2,$v0 | ||
| 763 | sgtu $v0,$t2,$ta2 | ||
| 764 | $ST $t2,2*$BNSZ($a0) | ||
| 765 | $ADDU $v0,$t8 | ||
| 766 | |||
| 767 | .L_bn_sub_words_return: | ||
| 768 | .set noreorder | ||
| 769 | ___ | ||
| 770 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 771 | $REG_L $t3,4*$SZREG($sp) | ||
| 772 | $REG_L $t2,3*$SZREG($sp) | ||
| 773 | $REG_L $t1,2*$SZREG($sp) | ||
| 774 | $REG_L $t0,1*$SZREG($sp) | ||
| 775 | $REG_L $gp,0*$SZREG($sp) | ||
| 776 | $PTR_ADD $sp,6*$SZREG | ||
| 777 | ___ | ||
| 778 | $code.=<<___; | ||
| 779 | jr $ra | ||
| 780 | move $a0,$v0 | ||
| 781 | .end bn_sub_words_internal | ||
| 782 | |||
| 783 | .align 5 | ||
| 784 | .globl bn_div_3_words | ||
| 785 | .ent bn_div_3_words | ||
| 786 | bn_div_3_words: | ||
| 787 | .set noreorder | ||
| 788 | move $a3,$a0 # we know that bn_div_words does not | ||
| 789 | # touch $a3, $ta2, $ta3 and preserves $a2 | ||
| 790 | # so that we can save two arguments | ||
| 791 | # and return address in registers | ||
| 792 | # instead of stack:-) | ||
| 793 | |||
| 794 | $LD $a0,($a3) | ||
| 795 | move $ta2,$a1 | ||
| 796 | bne $a0,$a2,bn_div_3_words_internal | ||
| 797 | $LD $a1,-$BNSZ($a3) | ||
| 798 | li $v0,-1 | ||
| 799 | jr $ra | ||
| 800 | move $a0,$v0 | ||
| 801 | .end bn_div_3_words | ||
| 802 | |||
| 803 | .align 5 | ||
| 804 | .ent bn_div_3_words_internal | ||
| 805 | bn_div_3_words_internal: | ||
| 806 | ___ | ||
| 807 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 808 | .frame $sp,6*$SZREG,$ra | ||
| 809 | .mask 0x8000f008,-$SZREG | ||
| 810 | .set noreorder | ||
| 811 | $PTR_SUB $sp,6*$SZREG | ||
| 812 | $REG_S $ra,5*$SZREG($sp) | ||
| 813 | $REG_S $t3,4*$SZREG($sp) | ||
| 814 | $REG_S $t2,3*$SZREG($sp) | ||
| 815 | $REG_S $t1,2*$SZREG($sp) | ||
| 816 | $REG_S $t0,1*$SZREG($sp) | ||
| 817 | $REG_S $gp,0*$SZREG($sp) | ||
| 818 | ___ | ||
| 819 | $code.=<<___; | ||
| 820 | .set reorder | ||
| 821 | move $ta3,$ra | ||
| 822 | bal bn_div_words | ||
| 823 | move $ra,$ta3 | ||
| 824 | $MULTU $ta2,$v0 | ||
| 825 | $LD $t2,-2*$BNSZ($a3) | ||
| 826 | move $ta0,$zero | ||
| 827 | mfhi $t1 | ||
| 828 | mflo $t0 | ||
| 829 | sltu $t8,$t1,$a1 | ||
| 830 | .L_bn_div_3_words_inner_loop: | ||
| 831 | bnez $t8,.L_bn_div_3_words_inner_loop_done | ||
| 832 | sgeu $at,$t2,$t0 | ||
| 833 | seq $t9,$t1,$a1 | ||
| 834 | and $at,$t9 | ||
| 835 | sltu $t3,$t0,$ta2 | ||
| 836 | $ADDU $a1,$a2 | ||
| 837 | $SUBU $t1,$t3 | ||
| 838 | $SUBU $t0,$ta2 | ||
| 839 | sltu $t8,$t1,$a1 | ||
| 840 | sltu $ta0,$a1,$a2 | ||
| 841 | or $t8,$ta0 | ||
| 842 | .set noreorder | ||
| 843 | beqzl $at,.L_bn_div_3_words_inner_loop | ||
| 844 | $SUBU $v0,1 | ||
| 845 | .set reorder | ||
| 846 | .L_bn_div_3_words_inner_loop_done: | ||
| 847 | .set noreorder | ||
| 848 | ___ | ||
| 849 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 850 | $REG_L $t3,4*$SZREG($sp) | ||
| 851 | $REG_L $t2,3*$SZREG($sp) | ||
| 852 | $REG_L $t1,2*$SZREG($sp) | ||
| 853 | $REG_L $t0,1*$SZREG($sp) | ||
| 854 | $REG_L $gp,0*$SZREG($sp) | ||
| 855 | $PTR_ADD $sp,6*$SZREG | ||
| 856 | ___ | ||
| 857 | $code.=<<___; | ||
| 858 | jr $ra | ||
| 859 | move $a0,$v0 | ||
| 860 | .end bn_div_3_words_internal | ||
| 861 | |||
| 862 | .align 5 | ||
| 863 | .globl bn_div_words | ||
| 864 | .ent bn_div_words | ||
| 865 | bn_div_words: | ||
| 866 | .set noreorder | ||
| 867 | bnez $a2,bn_div_words_internal | ||
| 868 | li $v0,-1 # I would rather signal div-by-zero | ||
| 869 | # which can be done with 'break 7' | ||
| 870 | jr $ra | ||
| 871 | move $a0,$v0 | ||
| 872 | .end bn_div_words | ||
| 873 | |||
| 874 | .align 5 | ||
| 875 | .ent bn_div_words_internal | ||
| 876 | bn_div_words_internal: | ||
| 877 | ___ | ||
| 878 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 879 | .frame $sp,6*$SZREG,$ra | ||
| 880 | .mask 0x8000f008,-$SZREG | ||
| 881 | .set noreorder | ||
| 882 | $PTR_SUB $sp,6*$SZREG | ||
| 883 | $REG_S $ra,5*$SZREG($sp) | ||
| 884 | $REG_S $t3,4*$SZREG($sp) | ||
| 885 | $REG_S $t2,3*$SZREG($sp) | ||
| 886 | $REG_S $t1,2*$SZREG($sp) | ||
| 887 | $REG_S $t0,1*$SZREG($sp) | ||
| 888 | $REG_S $gp,0*$SZREG($sp) | ||
| 889 | ___ | ||
| 890 | $code.=<<___; | ||
| 891 | move $v1,$zero | ||
| 892 | bltz $a2,.L_bn_div_words_body | ||
| 893 | move $t9,$v1 | ||
| 894 | $SLL $a2,1 | ||
| 895 | bgtz $a2,.-4 | ||
| 896 | addu $t9,1 | ||
| 897 | |||
| 898 | .set reorder | ||
| 899 | negu $t1,$t9 | ||
| 900 | li $t2,-1 | ||
| 901 | $SLL $t2,$t1 | ||
| 902 | and $t2,$a0 | ||
| 903 | $SRL $at,$a1,$t1 | ||
| 904 | .set noreorder | ||
| 905 | bnezl $t2,.+8 | ||
| 906 | break 6 # signal overflow | ||
| 907 | .set reorder | ||
| 908 | $SLL $a0,$t9 | ||
| 909 | $SLL $a1,$t9 | ||
| 910 | or $a0,$at | ||
| 911 | ___ | ||
| 912 | $QT=$ta0; | ||
| 913 | $HH=$ta1; | ||
| 914 | $DH=$v1; | ||
| 915 | $code.=<<___; | ||
| 916 | .L_bn_div_words_body: | ||
| 917 | $SRL $DH,$a2,4*$BNSZ # bits | ||
| 918 | sgeu $at,$a0,$a2 | ||
| 919 | .set noreorder | ||
| 920 | bnezl $at,.+8 | ||
| 921 | $SUBU $a0,$a2 | ||
| 922 | .set reorder | ||
| 923 | |||
| 924 | li $QT,-1 | ||
| 925 | $SRL $HH,$a0,4*$BNSZ # bits | ||
| 926 | $SRL $QT,4*$BNSZ # q=0xffffffff | ||
| 927 | beq $DH,$HH,.L_bn_div_words_skip_div1 | ||
| 928 | $DIVU $zero,$a0,$DH | ||
| 929 | mflo $QT | ||
| 930 | .L_bn_div_words_skip_div1: | ||
| 931 | $MULTU $a2,$QT | ||
| 932 | $SLL $t3,$a0,4*$BNSZ # bits | ||
| 933 | $SRL $at,$a1,4*$BNSZ # bits | ||
| 934 | or $t3,$at | ||
| 935 | mflo $t0 | ||
| 936 | mfhi $t1 | ||
| 937 | .L_bn_div_words_inner_loop1: | ||
| 938 | sltu $t2,$t3,$t0 | ||
| 939 | seq $t8,$HH,$t1 | ||
| 940 | sltu $at,$HH,$t1 | ||
| 941 | and $t2,$t8 | ||
| 942 | sltu $v0,$t0,$a2 | ||
| 943 | or $at,$t2 | ||
| 944 | .set noreorder | ||
| 945 | beqz $at,.L_bn_div_words_inner_loop1_done | ||
| 946 | $SUBU $t1,$v0 | ||
| 947 | $SUBU $t0,$a2 | ||
| 948 | b .L_bn_div_words_inner_loop1 | ||
| 949 | $SUBU $QT,1 | ||
| 950 | .set reorder | ||
| 951 | .L_bn_div_words_inner_loop1_done: | ||
| 952 | |||
| 953 | $SLL $a1,4*$BNSZ # bits | ||
| 954 | $SUBU $a0,$t3,$t0 | ||
| 955 | $SLL $v0,$QT,4*$BNSZ # bits | ||
| 956 | |||
| 957 | li $QT,-1 | ||
| 958 | $SRL $HH,$a0,4*$BNSZ # bits | ||
| 959 | $SRL $QT,4*$BNSZ # q=0xffffffff | ||
| 960 | beq $DH,$HH,.L_bn_div_words_skip_div2 | ||
| 961 | $DIVU $zero,$a0,$DH | ||
| 962 | mflo $QT | ||
| 963 | .L_bn_div_words_skip_div2: | ||
| 964 | $MULTU $a2,$QT | ||
| 965 | $SLL $t3,$a0,4*$BNSZ # bits | ||
| 966 | $SRL $at,$a1,4*$BNSZ # bits | ||
| 967 | or $t3,$at | ||
| 968 | mflo $t0 | ||
| 969 | mfhi $t1 | ||
| 970 | .L_bn_div_words_inner_loop2: | ||
| 971 | sltu $t2,$t3,$t0 | ||
| 972 | seq $t8,$HH,$t1 | ||
| 973 | sltu $at,$HH,$t1 | ||
| 974 | and $t2,$t8 | ||
| 975 | sltu $v1,$t0,$a2 | ||
| 976 | or $at,$t2 | ||
| 977 | .set noreorder | ||
| 978 | beqz $at,.L_bn_div_words_inner_loop2_done | ||
| 979 | $SUBU $t1,$v1 | ||
| 980 | $SUBU $t0,$a2 | ||
| 981 | b .L_bn_div_words_inner_loop2 | ||
| 982 | $SUBU $QT,1 | ||
| 983 | .set reorder | ||
| 984 | .L_bn_div_words_inner_loop2_done: | ||
| 985 | |||
| 986 | $SUBU $a0,$t3,$t0 | ||
| 987 | or $v0,$QT | ||
| 988 | $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it | ||
| 989 | $SRL $a2,$t9 # restore $a2 | ||
| 990 | |||
| 991 | .set noreorder | ||
| 992 | move $a1,$v1 | ||
| 993 | ___ | ||
| 994 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 995 | $REG_L $t3,4*$SZREG($sp) | ||
| 996 | $REG_L $t2,3*$SZREG($sp) | ||
| 997 | $REG_L $t1,2*$SZREG($sp) | ||
| 998 | $REG_L $t0,1*$SZREG($sp) | ||
| 999 | $REG_L $gp,0*$SZREG($sp) | ||
| 1000 | $PTR_ADD $sp,6*$SZREG | ||
| 1001 | ___ | ||
| 1002 | $code.=<<___; | ||
| 1003 | jr $ra | ||
| 1004 | move $a0,$v0 | ||
| 1005 | .end bn_div_words_internal | ||
| 1006 | ___ | ||
| 1007 | undef $HH; undef $QT; undef $DH; | ||
| 1008 | |||
# Register map for the comba multiplication/squaring routines below.
# Operand words a[0..7]/b[0..7] and the rotating three-word accumulator
# (c1,c2,c3) are pinned to MIPS registers; $t_1/$t_2 receive the lo/hi
# halves of each product via mflo/mfhi.
$a_0 = $t0;  $a_1 = $t1;  $a_2 = $t2;  $a_3 = $t3;
$b_0 = $ta0; $b_1 = $ta1; $b_2 = $ta2; $b_3 = $ta3;

$a_4 = $s0; $a_5 = $s2; $a_6 = $s4; $a_7 = $a1;	# once we load a[7], no use for $a1
$b_4 = $s1; $b_5 = $s3; $b_6 = $s5; $b_7 = $a2;	# once we load b[7], no use for $a2

$t_1 = $t8; $t_2 = $t9;			# lo/hi product words
$c_1 = $v0; $c_2 = $v1; $c_3 = $a3;	# accumulator columns
# bn_mul_comba8(r,a,b): r[0..15] = a[0..7] * b[0..7], fully unrolled
# column-wise (comba) multiplication.  Each partial product is produced
# by MULTU and folded into the rotating three-word accumulator
# (c1,c2,c3) with explicit sltu-based carry propagation; the next MULTU
# is issued as early as possible to hide multiplier latency.
$code.=<<___;

.align	5
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
___
# "nubi" ABI treats $t0-$t3 as callee-saved, hence the larger frame and
# extra register saves on that flavour.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,12*$SZREG,$ra
	.mask	0x803ff008,-$SZREG
	$PTR_SUB $sp,12*$SZREG
	$REG_S	$ra,11*$SZREG($sp)
	$REG_S	$s5,10*$SZREG($sp)
	$REG_S	$s4,9*$SZREG($sp)
	$REG_S	$s3,8*$SZREG($sp)
	$REG_S	$s2,7*$SZREG($sp)
	$REG_S	$s1,6*$SZREG($sp)
	$REG_S	$s0,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Standard o32/n32/n64 flavours only need $s0-$s5 saved.
$code.=<<___ if ($flavour !~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x003f0000,-$SZREG
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$s5,5*$SZREG($sp)
	$REG_S	$s4,4*$SZREG($sp)
	$REG_S	$s3,3*$SZREG($sp)
	$REG_S	$s2,2*$SZREG($sp)
	$REG_S	$s1,1*$SZREG($sp)
	$REG_S	$s0,0*$SZREG($sp)
___
# Main body: loads of a[]/b[] are interleaved with the first multiplies.
$code.=<<___;

	.set	reorder
	$LD	$a_0,0($a1)	# If compiled with -mips3 option on
				# R5000 box assembler barks on this
				# line with "should not have mult/div
				# as last instruction in bb (R10K
				# bug)" warning. If anybody out there
				# has a clue about how to circumvent
				# this do send me a note.
				# <appro\@fy.chalmers.se>

	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2

	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	$LD	$b_4,4*$BNSZ($a2)
	$LD	$b_5,5*$BNSZ($a2)
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	$LD	$b_6,6*$BNSZ($a2)
	$LD	$b_7,7*$BNSZ($a2)
	$ST	$c_1,0($a0)	# r[0]=c1;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_0		# mul_add_c(a[4],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$b_4		# mul_add_c(a[0],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$b_5		# mul_add_c(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_4		# mul_add_c(a[1],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_1		# mul_add_c(a[4],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_0		# mul_add_c(a[5],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_0		# mul_add_c(a[6],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_1		# mul_add_c(a[5],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_2		# mul_add_c(a[4],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_4		# mul_add_c(a[2],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_5		# mul_add_c(a[1],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$b_6		# mul_add_c(a[0],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$b_7		# mul_add_c(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_6		# mul_add_c(a[1],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_5		# mul_add_c(a[2],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_4		# mul_add_c(a[3],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_3		# mul_add_c(a[4],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_2		# mul_add_c(a[5],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_1		# mul_add_c(a[6],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_0		# mul_add_c(a[7],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_1		# mul_add_c(a[7],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_2		# mul_add_c(a[6],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_3		# mul_add_c(a[5],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_4		# mul_add_c(a[4],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_5		# mul_add_c(a[3],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_6		# mul_add_c(a[2],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_7		# mul_add_c(a[1],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_7		# mul_add_c(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_6		# mul_add_c(a[3],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_5		# mul_add_c(a[4],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_4		# mul_add_c(a[5],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_3		# mul_add_c(a[6],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_7,$b_2		# mul_add_c(a[7],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_7,$b_3		# mul_add_c(a[7],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_4		# mul_add_c(a[6],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_5		# mul_add_c(a[5],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_6		# mul_add_c(a[4],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_7		# mul_add_c(a[3],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_7		# mul_add_c(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_6		# mul_add_c(a[5],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_5		# mul_add_c(a[6],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_7,$b_4		# mul_add_c(a[7],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_7,$b_5		# mul_add_c(a[7],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_6		# mul_add_c(a[6],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_7		# mul_add_c(a[5],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_7		# mul_add_c(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_6		# mul_add_c(a[7],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_7		# mul_add_c(a[7],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
# Epilogue, nubi flavour: restore $s*/$t*/$gp and pop the 12-slot frame
# (frame pop rides in the jr delay slot).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
# Epilogue, standard flavours: only $s0-$s5 were saved.
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
# Close bn_mul_comba8, then bn_mul_comba4(r,a,b): r[0..7] = a[0..3] *
# b[0..3], the 4-word comba multiply.  Same sltu-based carry chain as
# bn_mul_comba8 but small enough to need no saved registers on the
# standard ABIs ($a_0..$b_3 live in caller-saved registers).
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
# nubi flavour must still preserve $t0-$t3 (callee-saved there).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# nubi epilogue: restore $t0-$t3/$gp and pop the frame ($ra was saved
# but never clobbered by this leaf routine).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___
| 1874 | |||
# For the squaring routines there is only one input vector, so the
# registers that held b[0..3] are recycled as a[4..7].
$a_4 = $b_0;
$a_5 = $b_1;
$a_6 = $b_2;
$a_7 = $b_3;
| 1876 | |||
| 1877 | $code.=<<___; | ||
| 1878 | |||
| 1879 | .align 5 | ||
| 1880 | .globl bn_sqr_comba8 | ||
| 1881 | .ent bn_sqr_comba8 | ||
| 1882 | bn_sqr_comba8: | ||
| 1883 | ___ | ||
| 1884 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 1885 | .frame $sp,6*$SZREG,$ra | ||
| 1886 | .mask 0x8000f008,-$SZREG | ||
| 1887 | .set noreorder | ||
| 1888 | $PTR_SUB $sp,6*$SZREG | ||
| 1889 | $REG_S $ra,5*$SZREG($sp) | ||
| 1890 | $REG_S $t3,4*$SZREG($sp) | ||
| 1891 | $REG_S $t2,3*$SZREG($sp) | ||
| 1892 | $REG_S $t1,2*$SZREG($sp) | ||
| 1893 | $REG_S $t0,1*$SZREG($sp) | ||
| 1894 | $REG_S $gp,0*$SZREG($sp) | ||
| 1895 | ___ | ||
| 1896 | $code.=<<___; | ||
| 1897 | .set reorder | ||
| 1898 | $LD $a_0,0($a1) | ||
| 1899 | $LD $a_1,$BNSZ($a1) | ||
| 1900 | $LD $a_2,2*$BNSZ($a1) | ||
| 1901 | $LD $a_3,3*$BNSZ($a1) | ||
| 1902 | |||
| 1903 | $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); | ||
| 1904 | $LD $a_4,4*$BNSZ($a1) | ||
| 1905 | $LD $a_5,5*$BNSZ($a1) | ||
| 1906 | $LD $a_6,6*$BNSZ($a1) | ||
| 1907 | $LD $a_7,7*$BNSZ($a1) | ||
| 1908 | mflo $c_1 | ||
| 1909 | mfhi $c_2 | ||
| 1910 | $ST $c_1,0($a0) | ||
| 1911 | |||
| 1912 | $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); | ||
| 1913 | mflo $t_1 | ||
| 1914 | mfhi $t_2 | ||
| 1915 | slt $c_1,$t_2,$zero | ||
| 1916 | $SLL $t_2,1 | ||
| 1917 | $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); | ||
| 1918 | slt $a2,$t_1,$zero | ||
| 1919 | $ADDU $t_2,$a2 | ||
| 1920 | $SLL $t_1,1 | ||
| 1921 | $ADDU $c_2,$t_1 | ||
| 1922 | sltu $at,$c_2,$t_1 | ||
| 1923 | $ADDU $c_3,$t_2,$at | ||
| 1924 | $ST $c_2,$BNSZ($a0) | ||
| 1925 | |||
| 1926 | mflo $t_1 | ||
| 1927 | mfhi $t_2 | ||
| 1928 | slt $c_2,$t_2,$zero | ||
| 1929 | $SLL $t_2,1 | ||
| 1930 | $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); | ||
| 1931 | slt $a2,$t_1,$zero | ||
| 1932 | $ADDU $t_2,$a2 | ||
| 1933 | $SLL $t_1,1 | ||
| 1934 | $ADDU $c_3,$t_1 | ||
| 1935 | sltu $at,$c_3,$t_1 | ||
| 1936 | $ADDU $t_2,$at | ||
| 1937 | $ADDU $c_1,$t_2 | ||
| 1938 | sltu $at,$c_1,$t_2 | ||
| 1939 | $ADDU $c_2,$at | ||
| 1940 | mflo $t_1 | ||
| 1941 | mfhi $t_2 | ||
| 1942 | $ADDU $c_3,$t_1 | ||
| 1943 | sltu $at,$c_3,$t_1 | ||
| 1944 | $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); | ||
| 1945 | $ADDU $t_2,$at | ||
| 1946 | $ADDU $c_1,$t_2 | ||
| 1947 | sltu $at,$c_1,$t_2 | ||
| 1948 | $ADDU $c_2,$at | ||
| 1949 | $ST $c_3,2*$BNSZ($a0) | ||
| 1950 | |||
| 1951 | mflo $t_1 | ||
| 1952 | mfhi $t_2 | ||
| 1953 | slt $c_3,$t_2,$zero | ||
| 1954 | $SLL $t_2,1 | ||
| 1955 | $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3); | ||
| 1956 | slt $a2,$t_1,$zero | ||
| 1957 | $ADDU $t_2,$a2 | ||
| 1958 | $SLL $t_1,1 | ||
| 1959 | $ADDU $c_1,$t_1 | ||
| 1960 | sltu $at,$c_1,$t_1 | ||
| 1961 | $ADDU $t_2,$at | ||
| 1962 | $ADDU $c_2,$t_2 | ||
| 1963 | sltu $at,$c_2,$t_2 | ||
| 1964 | $ADDU $c_3,$at | ||
| 1965 | mflo $t_1 | ||
| 1966 | mfhi $t_2 | ||
| 1967 | slt $at,$t_2,$zero | ||
| 1968 | $ADDU $c_3,$at | ||
| 1969 | $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1); | ||
| 1970 | $SLL $t_2,1 | ||
| 1971 | slt $a2,$t_1,$zero | ||
| 1972 | $ADDU $t_2,$a2 | ||
| 1973 | $SLL $t_1,1 | ||
| 1974 | $ADDU $c_1,$t_1 | ||
| 1975 | sltu $at,$c_1,$t_1 | ||
| 1976 | $ADDU $t_2,$at | ||
| 1977 | $ADDU $c_2,$t_2 | ||
| 1978 | sltu $at,$c_2,$t_2 | ||
| 1979 | $ADDU $c_3,$at | ||
| 1980 | $ST $c_1,3*$BNSZ($a0) | ||
| 1981 | |||
| 1982 | mflo $t_1 | ||
| 1983 | mfhi $t_2 | ||
| 1984 | slt $c_1,$t_2,$zero | ||
| 1985 | $SLL $t_2,1 | ||
| 1986 | $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); | ||
| 1987 | slt $a2,$t_1,$zero | ||
| 1988 | $ADDU $t_2,$a2 | ||
| 1989 | $SLL $t_1,1 | ||
| 1990 | $ADDU $c_2,$t_1 | ||
| 1991 | sltu $at,$c_2,$t_1 | ||
| 1992 | $ADDU $t_2,$at | ||
| 1993 | $ADDU $c_3,$t_2 | ||
| 1994 | sltu $at,$c_3,$t_2 | ||
| 1995 | $ADDU $c_1,$at | ||
| 1996 | mflo $t_1 | ||
| 1997 | mfhi $t_2 | ||
| 1998 | slt $at,$t_2,$zero | ||
| 1999 | $ADDU $c_1,$at | ||
| 2000 | $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); | ||
| 2001 | $SLL $t_2,1 | ||
| 2002 | slt $a2,$t_1,$zero | ||
| 2003 | $ADDU $t_2,$a2 | ||
| 2004 | $SLL $t_1,1 | ||
| 2005 | $ADDU $c_2,$t_1 | ||
| 2006 | sltu $at,$c_2,$t_1 | ||
| 2007 | $ADDU $t_2,$at | ||
| 2008 | $ADDU $c_3,$t_2 | ||
| 2009 | sltu $at,$c_3,$t_2 | ||
| 2010 | $ADDU $c_1,$at | ||
| 2011 | mflo $t_1 | ||
| 2012 | mfhi $t_2 | ||
| 2013 | $ADDU $c_2,$t_1 | ||
| 2014 | sltu $at,$c_2,$t_1 | ||
| 2015 | $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2); | ||
| 2016 | $ADDU $t_2,$at | ||
| 2017 | $ADDU $c_3,$t_2 | ||
| 2018 | sltu $at,$c_3,$t_2 | ||
| 2019 | $ADDU $c_1,$at | ||
| 2020 | $ST $c_2,4*$BNSZ($a0) | ||
| 2021 | |||
| 2022 | mflo $t_1 | ||
| 2023 | mfhi $t_2 | ||
| 2024 | slt $c_2,$t_2,$zero | ||
| 2025 | $SLL $t_2,1 | ||
| 2026 | $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2); | ||
| 2027 | slt $a2,$t_1,$zero | ||
| 2028 | $ADDU $t_2,$a2 | ||
| 2029 | $SLL $t_1,1 | ||
| 2030 | $ADDU $c_3,$t_1 | ||
| 2031 | sltu $at,$c_3,$t_1 | ||
| 2032 | $ADDU $t_2,$at | ||
| 2033 | $ADDU $c_1,$t_2 | ||
| 2034 | sltu $at,$c_1,$t_2 | ||
| 2035 | $ADDU $c_2,$at | ||
| 2036 | mflo $t_1 | ||
| 2037 | mfhi $t_2 | ||
| 2038 | slt $at,$t_2,$zero | ||
| 2039 | $ADDU $c_2,$at | ||
| 2040 | $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); | ||
| 2041 | $SLL $t_2,1 | ||
| 2042 | slt $a2,$t_1,$zero | ||
| 2043 | $ADDU $t_2,$a2 | ||
| 2044 | $SLL $t_1,1 | ||
| 2045 | $ADDU $c_3,$t_1 | ||
| 2046 | sltu $at,$c_3,$t_1 | ||
| 2047 | $ADDU $t_2,$at | ||
| 2048 | $ADDU $c_1,$t_2 | ||
| 2049 | sltu $at,$c_1,$t_2 | ||
| 2050 | $ADDU $c_2,$at | ||
| 2051 | mflo $t_1 | ||
| 2052 | mfhi $t_2 | ||
| 2053 | slt $at,$t_2,$zero | ||
| 2054 | $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3); | ||
| 2055 | $ADDU $c_2,$at | ||
| 2056 | $SLL $t_2,1 | ||
| 2057 | slt $a2,$t_1,$zero | ||
| 2058 | $ADDU $t_2,$a2 | ||
| 2059 | $SLL $t_1,1 | ||
| 2060 | $ADDU $c_3,$t_1 | ||
| 2061 | sltu $at,$c_3,$t_1 | ||
| 2062 | $ADDU $t_2,$at | ||
| 2063 | $ADDU $c_1,$t_2 | ||
| 2064 | sltu $at,$c_1,$t_2 | ||
| 2065 | $ADDU $c_2,$at | ||
| 2066 | $ST $c_3,5*$BNSZ($a0) | ||
| 2067 | |||
| 2068 | mflo $t_1 | ||
| 2069 | mfhi $t_2 | ||
| 2070 | slt $c_3,$t_2,$zero | ||
| 2071 | $SLL $t_2,1 | ||
| 2072 | $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3); | ||
| 2073 | slt $a2,$t_1,$zero | ||
| 2074 | $ADDU $t_2,$a2 | ||
| 2075 | $SLL $t_1,1 | ||
| 2076 | $ADDU $c_1,$t_1 | ||
| 2077 | sltu $at,$c_1,$t_1 | ||
| 2078 | $ADDU $t_2,$at | ||
| 2079 | $ADDU $c_2,$t_2 | ||
| 2080 | sltu $at,$c_2,$t_2 | ||
| 2081 | $ADDU $c_3,$at | ||
| 2082 | mflo $t_1 | ||
| 2083 | mfhi $t_2 | ||
| 2084 | slt $at,$t_2,$zero | ||
| 2085 | $ADDU $c_3,$at | ||
| 2086 | $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3); | ||
| 2087 | $SLL $t_2,1 | ||
| 2088 | slt $a2,$t_1,$zero | ||
| 2089 | $ADDU $t_2,$a2 | ||
| 2090 | $SLL $t_1,1 | ||
| 2091 | $ADDU $c_1,$t_1 | ||
| 2092 | sltu $at,$c_1,$t_1 | ||
| 2093 | $ADDU $t_2,$at | ||
| 2094 | $ADDU $c_2,$t_2 | ||
| 2095 | sltu $at,$c_2,$t_2 | ||
| 2096 | $ADDU $c_3,$at | ||
| 2097 | mflo $t_1 | ||
| 2098 | mfhi $t_2 | ||
| 2099 | slt $at,$t_2,$zero | ||
| 2100 | $ADDU $c_3,$at | ||
| 2101 | $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); | ||
| 2102 | $SLL $t_2,1 | ||
| 2103 | slt $a2,$t_1,$zero | ||
| 2104 | $ADDU $t_2,$a2 | ||
| 2105 | $SLL $t_1,1 | ||
| 2106 | $ADDU $c_1,$t_1 | ||
| 2107 | sltu $at,$c_1,$t_1 | ||
| 2108 | $ADDU $t_2,$at | ||
| 2109 | $ADDU $c_2,$t_2 | ||
| 2110 | sltu $at,$c_2,$t_2 | ||
| 2111 | $ADDU $c_3,$at | ||
| 2112 | mflo $t_1 | ||
| 2113 | mfhi $t_2 | ||
| 2114 | $ADDU $c_1,$t_1 | ||
| 2115 | sltu $at,$c_1,$t_1 | ||
| 2116 | $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1); | ||
| 2117 | $ADDU $t_2,$at | ||
| 2118 | $ADDU $c_2,$t_2 | ||
| 2119 | sltu $at,$c_2,$t_2 | ||
| 2120 | $ADDU $c_3,$at | ||
| 2121 | $ST $c_1,6*$BNSZ($a0) | ||
| 2122 | |||
| 2123 | mflo $t_1 | ||
| 2124 | mfhi $t_2 | ||
| 2125 | slt $c_1,$t_2,$zero | ||
| 2126 | $SLL $t_2,1 | ||
| 2127 | $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1); | ||
| 2128 | slt $a2,$t_1,$zero | ||
| 2129 | $ADDU $t_2,$a2 | ||
| 2130 | $SLL $t_1,1 | ||
| 2131 | $ADDU $c_2,$t_1 | ||
| 2132 | sltu $at,$c_2,$t_1 | ||
| 2133 | $ADDU $t_2,$at | ||
| 2134 | $ADDU $c_3,$t_2 | ||
| 2135 | sltu $at,$c_3,$t_2 | ||
| 2136 | $ADDU $c_1,$at | ||
| 2137 | mflo $t_1 | ||
| 2138 | mfhi $t_2 | ||
| 2139 | slt $at,$t_2,$zero | ||
| 2140 | $ADDU $c_1,$at | ||
| 2141 | $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1); | ||
| 2142 | $SLL $t_2,1 | ||
| 2143 | slt $a2,$t_1,$zero | ||
| 2144 | $ADDU $t_2,$a2 | ||
| 2145 | $SLL $t_1,1 | ||
| 2146 | $ADDU $c_2,$t_1 | ||
| 2147 | sltu $at,$c_2,$t_1 | ||
| 2148 | $ADDU $t_2,$at | ||
| 2149 | $ADDU $c_3,$t_2 | ||
| 2150 | sltu $at,$c_3,$t_2 | ||
| 2151 | $ADDU $c_1,$at | ||
| 2152 | mflo $t_1 | ||
| 2153 | mfhi $t_2 | ||
| 2154 | slt $at,$t_2,$zero | ||
| 2155 | $ADDU $c_1,$at | ||
| 2156 | $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1); | ||
| 2157 | $SLL $t_2,1 | ||
| 2158 | slt $a2,$t_1,$zero | ||
| 2159 | $ADDU $t_2,$a2 | ||
| 2160 | $SLL $t_1,1 | ||
| 2161 | $ADDU $c_2,$t_1 | ||
| 2162 | sltu $at,$c_2,$t_1 | ||
| 2163 | $ADDU $t_2,$at | ||
| 2164 | $ADDU $c_3,$t_2 | ||
| 2165 | sltu $at,$c_3,$t_2 | ||
| 2166 | $ADDU $c_1,$at | ||
| 2167 | mflo $t_1 | ||
| 2168 | mfhi $t_2 | ||
| 2169 | slt $at,$t_2,$zero | ||
| 2170 | $ADDU $c_1,$at | ||
| 2171 | $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2); | ||
| 2172 | $SLL $t_2,1 | ||
| 2173 | slt $a2,$t_1,$zero | ||
| 2174 | $ADDU $t_2,$a2 | ||
| 2175 | $SLL $t_1,1 | ||
| 2176 | $ADDU $c_2,$t_1 | ||
| 2177 | sltu $at,$c_2,$t_1 | ||
| 2178 | $ADDU $t_2,$at | ||
| 2179 | $ADDU $c_3,$t_2 | ||
| 2180 | sltu $at,$c_3,$t_2 | ||
| 2181 | $ADDU $c_1,$at | ||
| 2182 | $ST $c_2,7*$BNSZ($a0) | ||
| 2183 | |||
| 2184 | mflo $t_1 | ||
| 2185 | mfhi $t_2 | ||
| 2186 | slt $c_2,$t_2,$zero | ||
| 2187 | $SLL $t_2,1 | ||
| 2188 | $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2); | ||
| 2189 | slt $a2,$t_1,$zero | ||
| 2190 | $ADDU $t_2,$a2 | ||
| 2191 | $SLL $t_1,1 | ||
| 2192 | $ADDU $c_3,$t_1 | ||
| 2193 | sltu $at,$c_3,$t_1 | ||
| 2194 | $ADDU $t_2,$at | ||
| 2195 | $ADDU $c_1,$t_2 | ||
| 2196 | sltu $at,$c_1,$t_2 | ||
| 2197 | $ADDU $c_2,$at | ||
| 2198 | mflo $t_1 | ||
| 2199 | mfhi $t_2 | ||
| 2200 | slt $at,$t_2,$zero | ||
| 2201 | $ADDU $c_2,$at | ||
| 2202 | $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2); | ||
| 2203 | $SLL $t_2,1 | ||
| 2204 | slt $a2,$t_1,$zero | ||
| 2205 | $ADDU $t_2,$a2 | ||
| 2206 | $SLL $t_1,1 | ||
| 2207 | $ADDU $c_3,$t_1 | ||
| 2208 | sltu $at,$c_3,$t_1 | ||
| 2209 | $ADDU $t_2,$at | ||
| 2210 | $ADDU $c_1,$t_2 | ||
| 2211 | sltu $at,$c_1,$t_2 | ||
| 2212 | $ADDU $c_2,$at | ||
| 2213 | mflo $t_1 | ||
| 2214 | mfhi $t_2 | ||
| 2215 | slt $at,$t_2,$zero | ||
| 2216 | $ADDU $c_2,$at | ||
| 2217 | $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2); | ||
| 2218 | $SLL $t_2,1 | ||
| 2219 | slt $a2,$t_1,$zero | ||
| 2220 | $ADDU $t_2,$a2 | ||
| 2221 | $SLL $t_1,1 | ||
| 2222 | $ADDU $c_3,$t_1 | ||
| 2223 | sltu $at,$c_3,$t_1 | ||
| 2224 | $ADDU $t_2,$at | ||
| 2225 | $ADDU $c_1,$t_2 | ||
| 2226 | sltu $at,$c_1,$t_2 | ||
| 2227 | $ADDU $c_2,$at | ||
| 2228 | mflo $t_1 | ||
| 2229 | mfhi $t_2 | ||
| 2230 | $ADDU $c_3,$t_1 | ||
| 2231 | sltu $at,$c_3,$t_1 | ||
| 2232 | $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3); | ||
| 2233 | $ADDU $t_2,$at | ||
| 2234 | $ADDU $c_1,$t_2 | ||
| 2235 | sltu $at,$c_1,$t_2 | ||
| 2236 | $ADDU $c_2,$at | ||
| 2237 | $ST $c_3,8*$BNSZ($a0) | ||
| 2238 | |||
| 2239 | mflo $t_1 | ||
| 2240 | mfhi $t_2 | ||
| 2241 | slt $c_3,$t_2,$zero | ||
| 2242 | $SLL $t_2,1 | ||
| 2243 | $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3); | ||
| 2244 | slt $a2,$t_1,$zero | ||
| 2245 | $ADDU $t_2,$a2 | ||
| 2246 | $SLL $t_1,1 | ||
| 2247 | $ADDU $c_1,$t_1 | ||
| 2248 | sltu $at,$c_1,$t_1 | ||
| 2249 | $ADDU $t_2,$at | ||
| 2250 | $ADDU $c_2,$t_2 | ||
| 2251 | sltu $at,$c_2,$t_2 | ||
| 2252 | $ADDU $c_3,$at | ||
| 2253 | mflo $t_1 | ||
| 2254 | mfhi $t_2 | ||
| 2255 | slt $at,$t_2,$zero | ||
| 2256 | $ADDU $c_3,$at | ||
| 2257 | $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3); | ||
| 2258 | $SLL $t_2,1 | ||
| 2259 | slt $a2,$t_1,$zero | ||
| 2260 | $ADDU $t_2,$a2 | ||
| 2261 | $SLL $t_1,1 | ||
| 2262 | $ADDU $c_1,$t_1 | ||
| 2263 | sltu $at,$c_1,$t_1 | ||
| 2264 | $ADDU $t_2,$at | ||
| 2265 | $ADDU $c_2,$t_2 | ||
| 2266 | sltu $at,$c_2,$t_2 | ||
| 2267 | $ADDU $c_3,$at | ||
| 2268 | mflo $t_1 | ||
| 2269 | mfhi $t_2 | ||
| 2270 | slt $at,$t_2,$zero | ||
| 2271 | $ADDU $c_3,$at | ||
| 2272 | $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1); | ||
| 2273 | $SLL $t_2,1 | ||
| 2274 | slt $a2,$t_1,$zero | ||
| 2275 | $ADDU $t_2,$a2 | ||
| 2276 | $SLL $t_1,1 | ||
| 2277 | $ADDU $c_1,$t_1 | ||
| 2278 | sltu $at,$c_1,$t_1 | ||
| 2279 | $ADDU $t_2,$at | ||
| 2280 | $ADDU $c_2,$t_2 | ||
| 2281 | sltu $at,$c_2,$t_2 | ||
| 2282 | $ADDU $c_3,$at | ||
| 2283 | $ST $c_1,9*$BNSZ($a0) | ||
| 2284 | |||
| 2285 | mflo $t_1 | ||
| 2286 | mfhi $t_2 | ||
| 2287 | slt $c_1,$t_2,$zero | ||
| 2288 | $SLL $t_2,1 | ||
| 2289 | $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1); | ||
| 2290 | slt $a2,$t_1,$zero | ||
| 2291 | $ADDU $t_2,$a2 | ||
| 2292 | $SLL $t_1,1 | ||
| 2293 | $ADDU $c_2,$t_1 | ||
| 2294 | sltu $at,$c_2,$t_1 | ||
| 2295 | $ADDU $t_2,$at | ||
| 2296 | $ADDU $c_3,$t_2 | ||
| 2297 | sltu $at,$c_3,$t_2 | ||
| 2298 | $ADDU $c_1,$at | ||
| 2299 | mflo $t_1 | ||
| 2300 | mfhi $t_2 | ||
| 2301 | slt $at,$t_2,$zero | ||
| 2302 | $ADDU $c_1,$at | ||
| 2303 | $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1); | ||
| 2304 | $SLL $t_2,1 | ||
| 2305 | slt $a2,$t_1,$zero | ||
| 2306 | $ADDU $t_2,$a2 | ||
| 2307 | $SLL $t_1,1 | ||
| 2308 | $ADDU $c_2,$t_1 | ||
| 2309 | sltu $at,$c_2,$t_1 | ||
| 2310 | $ADDU $t_2,$at | ||
| 2311 | $ADDU $c_3,$t_2 | ||
| 2312 | sltu $at,$c_3,$t_2 | ||
| 2313 | $ADDU $c_1,$at | ||
| 2314 | mflo $t_1 | ||
| 2315 | mfhi $t_2 | ||
| 2316 | $ADDU $c_2,$t_1 | ||
| 2317 | sltu $at,$c_2,$t_1 | ||
| 2318 | $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2); | ||
| 2319 | $ADDU $t_2,$at | ||
| 2320 | $ADDU $c_3,$t_2 | ||
| 2321 | sltu $at,$c_3,$t_2 | ||
| 2322 | $ADDU $c_1,$at | ||
| 2323 | $ST $c_2,10*$BNSZ($a0) | ||
| 2324 | |||
| 2325 | mflo $t_1 | ||
| 2326 | mfhi $t_2 | ||
| 2327 | slt $c_2,$t_2,$zero | ||
| 2328 | $SLL $t_2,1 | ||
| 2329 | $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2); | ||
| 2330 | slt $a2,$t_1,$zero | ||
| 2331 | $ADDU $t_2,$a2 | ||
| 2332 | $SLL $t_1,1 | ||
| 2333 | $ADDU $c_3,$t_1 | ||
| 2334 | sltu $at,$c_3,$t_1 | ||
| 2335 | $ADDU $t_2,$at | ||
| 2336 | $ADDU $c_1,$t_2 | ||
| 2337 | sltu $at,$c_1,$t_2 | ||
| 2338 | $ADDU $c_2,$at | ||
| 2339 | mflo $t_1 | ||
| 2340 | mfhi $t_2 | ||
| 2341 | slt $at,$t_2,$zero | ||
| 2342 | $ADDU $c_2,$at | ||
| 2343 | $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3); | ||
| 2344 | $SLL $t_2,1 | ||
| 2345 | slt $a2,$t_1,$zero | ||
| 2346 | $ADDU $t_2,$a2 | ||
| 2347 | $SLL $t_1,1 | ||
| 2348 | $ADDU $c_3,$t_1 | ||
| 2349 | sltu $at,$c_3,$t_1 | ||
| 2350 | $ADDU $t_2,$at | ||
| 2351 | $ADDU $c_1,$t_2 | ||
| 2352 | sltu $at,$c_1,$t_2 | ||
| 2353 | $ADDU $c_2,$at | ||
| 2354 | $ST $c_3,11*$BNSZ($a0) | ||
| 2355 | |||
| 2356 | mflo $t_1 | ||
| 2357 | mfhi $t_2 | ||
| 2358 | slt $c_3,$t_2,$zero | ||
| 2359 | $SLL $t_2,1 | ||
| 2360 | $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3); | ||
| 2361 | slt $a2,$t_1,$zero | ||
| 2362 | $ADDU $t_2,$a2 | ||
| 2363 | $SLL $t_1,1 | ||
| 2364 | $ADDU $c_1,$t_1 | ||
| 2365 | sltu $at,$c_1,$t_1 | ||
| 2366 | $ADDU $t_2,$at | ||
| 2367 | $ADDU $c_2,$t_2 | ||
| 2368 | sltu $at,$c_2,$t_2 | ||
| 2369 | $ADDU $c_3,$at | ||
| 2370 | mflo $t_1 | ||
| 2371 | mfhi $t_2 | ||
| 2372 | $ADDU $c_1,$t_1 | ||
| 2373 | sltu $at,$c_1,$t_1 | ||
| 2374 | $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1); | ||
| 2375 | $ADDU $t_2,$at | ||
| 2376 | $ADDU $c_2,$t_2 | ||
| 2377 | sltu $at,$c_2,$t_2 | ||
| 2378 | $ADDU $c_3,$at | ||
| 2379 | $ST $c_1,12*$BNSZ($a0) | ||
| 2380 | |||
| 2381 | mflo $t_1 | ||
| 2382 | mfhi $t_2 | ||
| 2383 | slt $c_1,$t_2,$zero | ||
| 2384 | $SLL $t_2,1 | ||
| 2385 | $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2); | ||
| 2386 | slt $a2,$t_1,$zero | ||
| 2387 | $ADDU $t_2,$a2 | ||
| 2388 | $SLL $t_1,1 | ||
| 2389 | $ADDU $c_2,$t_1 | ||
| 2390 | sltu $at,$c_2,$t_1 | ||
| 2391 | $ADDU $t_2,$at | ||
| 2392 | $ADDU $c_3,$t_2 | ||
| 2393 | sltu $at,$c_3,$t_2 | ||
| 2394 | $ADDU $c_1,$at | ||
| 2395 | $ST $c_2,13*$BNSZ($a0) | ||
| 2396 | |||
| 2397 | mflo $t_1 | ||
| 2398 | mfhi $t_2 | ||
| 2399 | $ADDU $c_3,$t_1 | ||
| 2400 | sltu $at,$c_3,$t_1 | ||
| 2401 | $ADDU $t_2,$at | ||
| 2402 | $ADDU $c_1,$t_2 | ||
| 2403 | $ST $c_3,14*$BNSZ($a0) | ||
| 2404 | $ST $c_1,15*$BNSZ($a0) | ||
| 2405 | |||
| 2406 | .set noreorder | ||
| 2407 | ___ | ||
# Epilogue of bn_sqr_comba8 for "nubi" flavours: restore the registers
# ($t0-$t3, $gp) saved by the matching prologue and release the save area.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return from bn_sqr_comba8, then open the bn_sqr_comba4 entry point
# (comba-style squaring of a 4-word input).
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
# Prologue for "nubi" flavours: reserve a 6-slot save area and preserve
# $ra, $t0-$t3 and $gp, per that calling convention's .mask above.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_sqr_comba4 body: 4x4 comba squaring, r[0..7] = a[0..3]^2.
# Diagonal terms a[i]*a[i] are accumulated once (mul_add_c); off-diagonal
# terms a[i]*a[j], i!=j, are doubled (mul_add_c2): each "slt reg,x,$zero"
# captures the sign bit that the following "$SLL x,1" shifts out, so the
# exact 2*(hi:lo) value is folded into the rotating three-word
# accumulator (c1,c2,c3).  The next $MULTU is issued early, before the
# previous product is fully accumulated, to hide multiplier latency.
# Fix: the annotation for the a[1]*a[2] step previously read
# "mul_add_c(a2[1],b[2],...)"; the code doubles the product, so it is
# mul_add_c2(a[1],b[2],...).
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# Epilogue for bn_sqr_comba4: restore the "nubi" callee-saved registers
# if that calling convention is in use (saved by the matching prologue).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___
# Emit the accumulated assembly on STDOUT.  Check close(): STDOUT is
# buffered (and possibly a pipe), so write errors only surface here;
# the previous unchecked close silently dropped them.
print $code;
close STDOUT or die "error closing STDOUT: $!";
diff --git a/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl new file mode 100644 index 0000000000..54aeb01921 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl | |||
| @@ -0,0 +1,1496 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # Copyright (c) 2010-2011 Intel Corp. | ||
| 4 | # Author: Vinodh.Gopal@intel.com | ||
| 5 | # Jim Guilford | ||
| 6 | # Erdinc.Ozturk@intel.com | ||
| 7 | # Maxim.Perminov@intel.com | ||
| 8 | # | ||
| 9 | # More information about algorithm used can be found at: | ||
| 10 | # http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf | ||
| 11 | # | ||
| 12 | # ==================================================================== | ||
| 13 | # Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
| 14 | # | ||
| 15 | # Redistribution and use in source and binary forms, with or without | ||
| 16 | # modification, are permitted provided that the following conditions | ||
| 17 | # are met: | ||
| 18 | # | ||
| 19 | # 1. Redistributions of source code must retain the above copyright | ||
| 20 | # notice, this list of conditions and the following disclaimer. | ||
| 21 | # | ||
| 22 | # 2. Redistributions in binary form must reproduce the above copyright | ||
| 23 | # notice, this list of conditions and the following disclaimer in | ||
| 24 | # the documentation and/or other materials provided with the | ||
| 25 | # distribution. | ||
| 26 | # | ||
| 27 | # 3. All advertising materials mentioning features or use of this | ||
| 28 | # software must display the following acknowledgment: | ||
| 29 | # "This product includes software developed by the OpenSSL Project | ||
| 30 | # for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 31 | # | ||
| 32 | # 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 33 | # endorse or promote products derived from this software without | ||
| 34 | # prior written permission. For written permission, please contact | ||
| 35 | # licensing@OpenSSL.org. | ||
| 36 | # | ||
| 37 | # 5. Products derived from this software may not be called "OpenSSL" | ||
| 38 | # nor may "OpenSSL" appear in their names without prior written | ||
| 39 | # permission of the OpenSSL Project. | ||
| 40 | # | ||
| 41 | # 6. Redistributions of any form whatsoever must retain the following | ||
| 42 | # acknowledgment: | ||
| 43 | # "This product includes software developed by the OpenSSL Project | ||
| 44 | # for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 45 | # | ||
| 46 | # THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 47 | # EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 48 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 49 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 50 | # ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 51 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 52 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 53 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 54 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 55 | # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 56 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 57 | # OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 58 | # ==================================================================== | ||
| 59 | |||
# Command-line handling: the script accepts "flavour output" or just
# "output" — an argument containing a dot is taken to be the output file.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 needs different calling-convention handling in the translator;
# detect it from either the flavour name or the output suffix.
my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script's own path.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe all generated code through the translator.  Check the open: the
# previous unchecked open silently discarded all output if the child
# interpreter could not be started.
open STDOUT,"| $^X $xlate $flavour $output"
	or die "can't call $xlate: $!";

use strict;
my $code=".text\n\n";
my $m=0;	# unique-label counter for the swizzle/unswizzle loop macros
| 77 | # | ||
| 78 | # Define x512 macros | ||
| 79 | # | ||
| 80 | |||
| 81 | #MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2 | ||
| 82 | # | ||
| 83 | # uses rax, rdx, and args | ||
# Emit one 64x512-bit multiply-accumulate step with an extra 512-bit
# addend: computes OP * SRC2[0..7] + ASRC[0..7] into the eight running
# registers named in @$x, storing the lowest result qword to $DST and
# leaving the outgoing high word in X[0].  @$x is copied so the caller's
# own rotation of the register list is not disturbed.  The emitted code
# clobbers rax and rdx (mul) and uses $TMP to carry rdx between limbs.
sub MULSTEP_512_ADD
{
 my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
 my @X=@$x;	# make a copy
$code.=<<___;
	mov	(+8*0)($SRC2), %rax
	mul	$OP			# rdx:rax = %OP * [0]
	mov	($ASRC), $X[0]
	add	%rax, $X[0]
	adc	\$0, %rdx
	mov	$X[0], $DST
___
 # Limbs 1..7: product + addend limb + carried-in rdx from previous limb.
 for(my $i=1;$i<8;$i++) {
$code.=<<___;
	mov	%rdx, $TMP

	mov	(+8*$i)($SRC2), %rax
	mul	$OP			# rdx:rax = %OP * [$i]
	mov	(+8*$i)($ASRC), $X[$i]
	add	%rax, $X[$i]
	adc	\$0, %rdx
	add	$TMP, $X[$i]
	adc	\$0, %rdx
___
 }
 # Final carry-out becomes the new X[0] (the 9th, high word).
$code.=<<___;
	mov	%rdx, $X[0]
___
}
| 113 | |||
| 114 | #MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp | ||
| 115 | # | ||
| 116 | # uses rax, rdx, and args | ||
# Emit one 64x512-bit multiply-accumulate step: accumulates
# OP * SRC2[0..7] into the eight running registers named in @$x,
# storing the lowest result qword to $DST and leaving the outgoing high
# word in X[0].  Same as MULSTEP_512_ADD but with no extra memory
# addend.  @$x is copied so the caller's rotation of the register list
# is not disturbed.  The emitted code clobbers rax and rdx.
sub MULSTEP_512
{
 my ($x, $DST, $SRC2, $OP, $TMP)=@_;
 my @X=@$x;	# make a copy
$code.=<<___;
	mov	(+8*0)($SRC2), %rax
	mul	$OP			# rdx:rax = %OP * [0]
	add	%rax, $X[0]
	adc	\$0, %rdx
	mov	$X[0], $DST
___
 # Limbs 1..7: product + carried-in rdx ($TMP) from the previous limb.
 for(my $i=1;$i<8;$i++) {
$code.=<<___;
	mov	%rdx, $TMP

	mov	(+8*$i)($SRC2), %rax
	mul	$OP			# rdx:rax = %OP * [$i]
	add	%rax, $X[$i]
	adc	\$0, %rdx
	add	$TMP, $X[$i]
	adc	\$0, %rdx
___
 }
 # Final carry-out becomes the new X[0] (the 9th, high word).
$code.=<<___;
	mov	%rdx, $X[0]
___
}
| 144 | |||
| 145 | # | ||
| 146 | # Swizzle Macros | ||
| 147 | # | ||
| 148 | |||
| 149 | # macro to copy data from flat space to swizzled table | ||
| 150 | #MACRO swizzle pDst, pSrc, tmp1, tmp2 | ||
| 151 | # pDst and pSrc are modified | ||
# Emit a loop copying 8 qwords from flat space ($pSrc) into the swizzled
# table ($pDst): each qword is split into four 16-bit words stored 64
# bytes apart (columns of the 64-byte-stride table).  Both pointer
# registers are advanced (clobbered).  $m numbers the loop label so each
# macro expansion gets a unique "loop_N".
sub swizzle
{
 my ($pDst, $pSrc, $cnt, $d0)=@_;
$code.=<<___;
	mov	\$8, $cnt
loop_$m:
	mov	($pSrc), $d0
	mov	$d0#w, ($pDst)
	shr	\$16, $d0
	mov	$d0#w, (+64*1)($pDst)
	shr	\$16, $d0
	mov	$d0#w, (+64*2)($pDst)
	shr	\$16, $d0
	mov	$d0#w, (+64*3)($pDst)
	lea	8($pSrc), $pSrc
	lea	64*4($pDst), $pDst
	dec	$cnt
	jnz	loop_$m
___

 $m++;	# next expansion gets a fresh label
}
| 174 | |||
| 175 | # macro to copy data from swizzled table to flat space | ||
| 176 | #MACRO unswizzle pDst, pSrc, tmp*3 | ||
# Emit a loop copying data back from the swizzled table ($pSrc) to flat
# space ($pDst): inverse of swizzle().  Each iteration reassembles two
# qwords ($d0, $d1) from four 16-bit words spaced 64 bytes apart,
# building each qword high-word-first via shl-by-16 then inserting the
# next 16-bit word.  Both pointer registers are advanced (clobbered).
# $m numbers the loop label so each expansion gets a unique "loop_N".
sub unswizzle
{
 my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
$code.=<<___;
	mov	\$4, $cnt
loop_$m:
	movzxw	(+64*3+256*0)($pSrc), $d0
	movzxw	(+64*3+256*1)($pSrc), $d1
	shl	\$16, $d0
	shl	\$16, $d1
	mov	(+64*2+256*0)($pSrc), $d0#w
	mov	(+64*2+256*1)($pSrc), $d1#w
	shl	\$16, $d0
	shl	\$16, $d1
	mov	(+64*1+256*0)($pSrc), $d0#w
	mov	(+64*1+256*1)($pSrc), $d1#w
	shl	\$16, $d0
	shl	\$16, $d1
	mov	(+64*0+256*0)($pSrc), $d0#w
	mov	(+64*0+256*1)($pSrc), $d1#w
	mov	$d0, (+8*0)($pDst)
	mov	$d1, (+8*1)($pDst)
	lea	256*2($pSrc), $pSrc
	lea	8*2($pDst), $pDst
	sub	\$1, $cnt
	jnz	loop_$m
___

 $m++;	# next expansion gets a fresh label
}
| 207 | |||
| 208 | # | ||
| 209 | # Data Structures | ||
| 210 | # | ||
| 211 | |||
| 212 | # Reduce Data | ||
| 213 | # | ||
| 214 | # | ||
| 215 | # Offset Value | ||
| 216 | # 0C0 Carries | ||
| 217 | # 0B8 X2[10] | ||
| 218 | # 0B0 X2[9] | ||
| 219 | # 0A8 X2[8] | ||
| 220 | # 0A0 X2[7] | ||
| 221 | # 098 X2[6] | ||
| 222 | # 090 X2[5] | ||
| 223 | # 088 X2[4] | ||
| 224 | # 080 X2[3] | ||
| 225 | # 078 X2[2] | ||
| 226 | # 070 X2[1] | ||
| 227 | # 068 X2[0] | ||
| 228 | # 060 X1[12] P[10] | ||
| 229 | # 058 X1[11] P[9] Z[8] | ||
| 230 | # 050 X1[10] P[8] Z[7] | ||
| 231 | # 048 X1[9] P[7] Z[6] | ||
| 232 | # 040 X1[8] P[6] Z[5] | ||
| 233 | # 038 X1[7] P[5] Z[4] | ||
| 234 | # 030 X1[6] P[4] Z[3] | ||
| 235 | # 028 X1[5] P[3] Z[2] | ||
| 236 | # 020 X1[4] P[2] Z[1] | ||
| 237 | # 018 X1[3] P[1] Z[0] | ||
| 238 | # 010 X1[2] P[0] Y[2] | ||
| 239 | # 008 X1[1] Q[1] Y[1] | ||
| 240 | # 000 X1[0] Q[0] Y[0] | ||
| 241 | |||
# Byte offsets of the named regions inside the on-stack "Reduce Data"
# area (layout diagram above).  Note the X1/X2 layout and the Q/P and
# Y/Z layouts are overlays of the same memory, so several offsets
# deliberately restart at 0.
my $X1_offset		=  0;				# 13 qwords
my $X2_offset		=  $X1_offset + 13*8;		# 11 qwords
my $Carries_offset	=  $X2_offset + 11*8;		# 1 qword
my $Q_offset		=  0;				# 2 qwords
my $P_offset		=  $Q_offset + 2*8;		# 11 qwords
my $Y_offset		=  0;				# 3 qwords
my $Z_offset		=  $Y_offset + 3*8;		# 9 qwords

# Total size of the area: X1 + X2 + Carries.
my $Red_Data_Size	=  $Carries_offset + 1*8;	# (25 qwords)
| 261 | # 278 tmp16[15] | ||
| 262 | # ... ... | ||
| 263 | # 200 tmp16[0] | ||
| 264 | |||
| 265 | # 1F8 tmp[7] | ||
| 266 | # ... ... | ||
| 267 | # 1C0 tmp[0] | ||
| 268 | |||
| 269 | # 1B8 GT[7] | ||
| 270 | # ... ... | ||
| 271 | # 180 GT[0] | ||
| 272 | |||
| 273 | # 178 Reduce Data | ||
| 274 | # ... ... | ||
| 275 | # 0B8 Reduce Data | ||
| 276 | # 0B0 reserved | ||
| 277 | # 0A8 reserved | ||
| 278 | # 0A0 reserved | ||
| 279 | # 098 reserved | ||
| 280 | # 090 reserved | ||
| 281 | # 088 reduce result addr | ||
| 282 | # 080 exp[8] | ||
| 283 | |||
| 284 | # ... | ||
| 285 | # 048 exp[1] | ||
| 286 | # 040 exp[0] | ||
| 287 | |||
| 288 | # 038 reserved | ||
| 289 | # 030 loop_idx | ||
| 290 | # 028 pg | ||
| 291 | # 020 i | ||
| 292 | # 018 pData ; arg 4 | ||
| 293 | # 010 pG ; arg 2 | ||
| 294 | # 008 pResult ; arg 1 | ||
| 295 | # 000 rsp ; stack pointer before subtract | ||
| 296 | |||
# Byte offsets of the named slots inside the stack frame (layout diagram
# above); each offset is derived by adding the previous slot's size, so
# inserting a slot only requires editing one line.
my $rsp_offset		= 0;
my $pResult_offset	= 8*1 + $rsp_offset;
my $pG_offset		= 8*1 + $pResult_offset;
my $pData_offset	= 8*1 + $pG_offset;
my $i_offset		= 8*1 + $pData_offset;
my $pg_offset		= 8*1 + $i_offset;
my $loop_idx_offset	= 8*1 + $pg_offset;
my $reserved1_offset	= 8*1 + $loop_idx_offset;
my $exp_offset		= 8*1 + $reserved1_offset;
my $red_result_addr_offset= 8*9 + $exp_offset;
my $reserved2_offset	= 8*1 + $red_result_addr_offset;
my $Reduce_Data_offset	= 8*5 + $reserved2_offset;
my $GT_offset		= $Red_Data_Size + $Reduce_Data_offset;
my $tmp_offset		= 8*8 + $GT_offset;
my $tmp16_offset	= 8*8 + $tmp_offset;
my $garray_offset	= 8*16 + $tmp16_offset;
my $mem_size		= 8*8*32 + $garray_offset;

#
# Offsets within Reduce Data
#
#
# struct MODF_2FOLD_MONT_512_C1_DATA {
#	UINT64 t[8][8];
#	UINT64 m[8];
#	UINT64 m1[8]; /* 2^768 % m */
#	UINT64 m2[8]; /* 2^640 % m */
#	UINT64 k1[2]; /* (- 1/m) % 2^128 */
# };

# Byte offsets of the struct members above (each member is 8*8 = 64 bytes).
my $T	= 0;
my $M	= 512;	# = 8 * 8 * 8
my $M1	= 576;	# = 8 * 8 * 9  /* += 8 * 8 */
my $M2	= 640;	# = 8 * 8 * 10 /* += 8 * 8 */
my $K1	= 704;	# = 8 * 8 * 11 /* += 8 * 8 */
| 332 | |||
| 333 | # | ||
| 334 | # FUNCTIONS | ||
| 335 | # | ||
| 336 | |||
{{{
#
# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
#                  and add 512-bits (8 qwords)
#                  to get 640 bits (10 qwords)
# Input: 128-bit mul source: [rdi+8*1], rbp
#        512-bit mul source: [rsi+8*n]
#        512-bit add source: r15, r14, ..., r9, r8
# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
# Clobbers all regs except: rcx, rsi, rdi
$code.=<<___;
.type	MULADD_128x512,\@abi-omnipotent
.align	16
MULADD_128x512:
___
# First 64-bit limb (already in rbp): accumulate into r8..r15 with r8 low.
&MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
$code.=<<___;
	mov	(+8*1)(%rdi), %rbp
___
# Second limb: register list rotated by one (9..15,8) so the carry word
# produced by the first step becomes the low accumulator of this one.
&MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
$code.=<<___;
	ret
.size	MULADD_128x512,.-MULADD_128x512
___
}}}
| 362 | |||
| 363 | {{{ | ||
| 364 | #MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0 | ||
| 365 | # | ||
| 366 | # Inputs: pDst: Destination (768 bits, 12 qwords) | ||
| 367 | # pA: Multiplicand (1024 bits, 16 qwords) | ||
| 368 | # pB: Multiplicand (512 bits, 8 qwords) | ||
| 369 | # Dst = Ah * B + Al | ||
| 370 | # where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits) | ||
| 371 | # Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0] | ||
| 372 | # Uses registers: arguments, RAX, RDX | ||
| 373 | sub MULADD_256x512 | ||
| 374 | { | ||
| 375 | my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_; | ||
| 376 | $code.=<<___; | ||
| 377 | mov (+8*12)($pA), $OP | ||
| 378 | ___ | ||
| 379 | &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP); | ||
| 380 | push(@$X,shift(@$X)); | ||
| 381 | |||
| 382 | $code.=<<___; | ||
| 383 | mov (+8*13)($pA), $OP | ||
| 384 | ___ | ||
| 385 | &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP); | ||
| 386 | push(@$X,shift(@$X)); | ||
| 387 | |||
| 388 | $code.=<<___; | ||
| 389 | mov (+8*14)($pA), $OP | ||
| 390 | ___ | ||
| 391 | &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP); | ||
| 392 | push(@$X,shift(@$X)); | ||
| 393 | |||
| 394 | $code.=<<___; | ||
| 395 | mov (+8*15)($pA), $OP | ||
| 396 | ___ | ||
| 397 | &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP); | ||
| 398 | push(@$X,shift(@$X)); | ||
| 399 | } | ||
| 400 | |||
| 401 | # | ||
| 402 | # mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */ | ||
| 403 | # UINT64 *m, /* 512 bits, 8 qwords */ | ||
| 404 | # MODF_2FOLD_MONT_512_C1_DATA *data, | ||
| 405 | # UINT64 *r) /* 512 bits, 8 qwords */ | ||
| 406 | # Input: x (number to be reduced): tmp16 (Implicit) | ||
| 407 | # m (modulus): [pM] (Implicit) | ||
| 408 | # data (reduce data): [pData] (Implicit) | ||
| 409 | # Output: r (result): Address in [red_res_addr] | ||
| 410 | # result also in: r9, r8, r15, r14, r13, r12, r11, r10 | ||
| 411 | |||
| 412 | my @X=map("%r$_",(8..15)); | ||
| 413 | |||
| 414 | $code.=<<___; | ||
| 415 | .type mont_reduce,\@abi-omnipotent | ||
| 416 | .align 16 | ||
| 417 | mont_reduce: | ||
| 418 | ___ | ||
| 419 | |||
| 420 | my $STACK_DEPTH = 8; | ||
| 421 | # | ||
| 422 | # X1 = Xh * M1 + Xl | ||
| 423 | $code.=<<___; | ||
| 424 | lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords | ||
| 425 | mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords | ||
| 426 | add \$$M1, %rsi | ||
| 427 | lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords | ||
| 428 | |||
| 429 | ___ | ||
| 430 | |||
| 431 | &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times | ||
| 432 | # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0] | ||
| 433 | |||
| 434 | $code.=<<___; | ||
| 435 | xor %rax, %rax | ||
| 436 | # X1 += xl | ||
| 437 | add (+8*8)(%rcx), $X[4] | ||
| 438 | adc (+8*9)(%rcx), $X[5] | ||
| 439 | adc (+8*10)(%rcx), $X[6] | ||
| 440 | adc (+8*11)(%rcx), $X[7] | ||
| 441 | adc \$0, %rax | ||
| 442 | # X1 is now rax, r11-r8, r15-r12, tmp16[3:0] | ||
| 443 | |||
| 444 | # | ||
| 445 | # check for carry ;; carry stored in rax | ||
| 446 | mov $X[4], (+8*8)(%rdi) # rdi points to X1 | ||
| 447 | mov $X[5], (+8*9)(%rdi) | ||
| 448 | mov $X[6], %rbp | ||
| 449 | mov $X[7], (+8*11)(%rdi) | ||
| 450 | |||
| 451 | mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp) | ||
| 452 | |||
| 453 | mov (+8*0)(%rdi), $X[4] | ||
| 454 | mov (+8*1)(%rdi), $X[5] | ||
| 455 | mov (+8*2)(%rdi), $X[6] | ||
| 456 | mov (+8*3)(%rdi), $X[7] | ||
| 457 | |||
| 458 | # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8 | ||
| 459 | # rdi -> X1 | ||
| 460 | # rsi -> M1 | ||
| 461 | |||
| 462 | # | ||
| 463 | # X2 = Xh * M2 + Xl | ||
| 464 | # do first part (X2 = Xh * M2) | ||
| 465 | add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords | ||
| 466 | # Xh is actually { [rdi+8*1], rbp } | ||
| 467 | add \$`$M2-$M1`, %rsi # rsi -> M2 | ||
| 468 | lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords | ||
| 469 | ___ | ||
| 470 | unshift(@X,pop(@X)); unshift(@X,pop(@X)); | ||
| 471 | $code.=<<___; | ||
| 472 | |||
| 473 | call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8 | ||
| 474 | # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0] | ||
| 475 | mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax | ||
| 476 | |||
| 477 | # X2 += Xl | ||
| 478 | add (+8*8-8*10)(%rdi), $X[6] # (-8*10) is to adjust rdi -> Xh to Xl | ||
| 479 | adc (+8*9-8*10)(%rdi), $X[7] | ||
| 480 | mov $X[6], (+8*8)(%rcx) | ||
| 481 | mov $X[7], (+8*9)(%rcx) | ||
| 482 | |||
| 483 | adc %rax, %rax | ||
| 484 | mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp) | ||
| 485 | |||
| 486 | lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords | ||
| 487 | add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords | ||
| 488 | |||
| 489 | # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half) | ||
| 490 | # B1:B0 = rsi[1:0] = K1[1:0] | ||
| 491 | # A1:A0 = rcx[1:0] = X2[1:0] | ||
| 492 | # Result = rdi[1],rbp = Q[1],rbp | ||
| 493 | mov (%rsi), %r8 # B0 | ||
| 494 | mov (+8*1)(%rsi), %rbx # B1 | ||
| 495 | |||
| 496 | mov (%rcx), %rax # A0 | ||
| 497 | mul %r8 # B0 | ||
| 498 | mov %rax, %rbp | ||
| 499 | mov %rdx, %r9 | ||
| 500 | |||
| 501 | mov (+8*1)(%rcx), %rax # A1 | ||
| 502 | mul %r8 # B0 | ||
| 503 | add %rax, %r9 | ||
| 504 | |||
| 505 | mov (%rcx), %rax # A0 | ||
| 506 | mul %rbx # B1 | ||
| 507 | add %rax, %r9 | ||
| 508 | |||
| 509 | mov %r9, (+8*1)(%rdi) | ||
| 510 | # end MUL_128x128t128 | ||
| 511 | |||
| 512 | sub \$`$K1-$M`, %rsi | ||
| 513 | |||
| 514 | mov (%rcx), $X[6] | ||
| 515 | mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0] | ||
| 516 | |||
| 517 | call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8 | ||
| 518 | # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0] | ||
| 519 | |||
| 520 | # load first half of m to rdx, rdi, rbx, rax | ||
| 521 | # moved this here for efficiency | ||
| 522 | mov (+8*0)(%rsi), %rax | ||
| 523 | mov (+8*1)(%rsi), %rbx | ||
| 524 | mov (+8*2)(%rsi), %rdi | ||
| 525 | mov (+8*3)(%rsi), %rdx | ||
| 526 | |||
| 527 | # continue with reduction | ||
| 528 | mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp | ||
| 529 | |||
| 530 | add (+8*8)(%rcx), $X[6] | ||
| 531 | adc (+8*9)(%rcx), $X[7] | ||
| 532 | |||
| 533 | #accumulate the final carry to rbp | ||
| 534 | adc %rbp, %rbp | ||
| 535 | |||
| 536 | # Add in overflow corrections: R = (X2>>128) += T[overflow] | ||
| 537 | # R = {r9, r8, r15, r14, ..., r10} | ||
| 538 | shl \$3, %rbp | ||
| 539 | mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rcx # rsi -> Data (and points to T) | ||
| 540 | add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out | ||
| 541 | |||
| 542 | # rsi will be used to generate a mask after the addition | ||
| 543 | xor %rsi, %rsi | ||
| 544 | |||
| 545 | add (+8*8*0)(%rbp), $X[0] | ||
| 546 | adc (+8*8*1)(%rbp), $X[1] | ||
| 547 | adc (+8*8*2)(%rbp), $X[2] | ||
| 548 | adc (+8*8*3)(%rbp), $X[3] | ||
| 549 | adc (+8*8*4)(%rbp), $X[4] | ||
| 550 | adc (+8*8*5)(%rbp), $X[5] | ||
| 551 | adc (+8*8*6)(%rbp), $X[6] | ||
| 552 | adc (+8*8*7)(%rbp), $X[7] | ||
| 553 | |||
| 554 | # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF | ||
| 555 | # if carry is clear: rsi = 0x0000000000000000 | ||
| 556 | sbb \$0, %rsi | ||
| 557 | |||
| 558 | # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m | ||
| 559 | and %rsi, %rax | ||
| 560 | and %rsi, %rbx | ||
| 561 | and %rsi, %rdi | ||
| 562 | and %rsi, %rdx | ||
| 563 | |||
| 564 | mov \$1, %rbp | ||
| 565 | sub %rax, $X[0] | ||
| 566 | sbb %rbx, $X[1] | ||
| 567 | sbb %rdi, $X[2] | ||
| 568 | sbb %rdx, $X[3] | ||
| 569 | |||
| 570 | # if there is a borrow: rbp = 0 | ||
| 571 | # if there is no borrow: rbp = 1 | ||
| 572 | # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m | ||
| 573 | sbb \$0, %rbp | ||
| 574 | |||
| 575 | #load second half of m to rdx, rdi, rbx, rax | ||
| 576 | |||
| 577 | add \$$M, %rcx | ||
| 578 | mov (+8*4)(%rcx), %rax | ||
| 579 | mov (+8*5)(%rcx), %rbx | ||
| 580 | mov (+8*6)(%rcx), %rdi | ||
| 581 | mov (+8*7)(%rcx), %rdx | ||
| 582 | |||
| 583 | # use the rsi mask as before | ||
| 584 | # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m | ||
| 585 | and %rsi, %rax | ||
| 586 | and %rsi, %rbx | ||
| 587 | and %rsi, %rdi | ||
| 588 | and %rsi, %rdx | ||
| 589 | |||
| 590 | # if rbp = 0, there was a borrow before, it is moved to the carry flag | ||
| 591 | # if rbp = 1, there was not a borrow before, carry flag is cleared | ||
| 592 | sub \$1, %rbp | ||
| 593 | |||
| 594 | sbb %rax, $X[4] | ||
| 595 | sbb %rbx, $X[5] | ||
| 596 | sbb %rdi, $X[6] | ||
| 597 | sbb %rdx, $X[7] | ||
| 598 | |||
| 599 | # write R back to memory | ||
| 600 | |||
| 601 | mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi | ||
| 602 | mov $X[0], (+8*0)(%rsi) | ||
| 603 | mov $X[1], (+8*1)(%rsi) | ||
| 604 | mov $X[2], (+8*2)(%rsi) | ||
| 605 | mov $X[3], (+8*3)(%rsi) | ||
| 606 | mov $X[4], (+8*4)(%rsi) | ||
| 607 | mov $X[5], (+8*5)(%rsi) | ||
| 608 | mov $X[6], (+8*6)(%rsi) | ||
| 609 | mov $X[7], (+8*7)(%rsi) | ||
| 610 | |||
| 611 | ret | ||
| 612 | .size mont_reduce,.-mont_reduce | ||
| 613 | ___ | ||
| 614 | }}} | ||
| 615 | |||
| 616 | {{{ | ||
| 617 | #MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2 | ||
| 618 | # | ||
| 619 | # Inputs: pDst: Destination (1024 bits, 16 qwords) | ||
| 620 | # pA: Multiplicand (512 bits, 8 qwords) | ||
| 621 | # pB: Multiplicand (512 bits, 8 qwords) | ||
| 622 | # Uses registers rax, rdx, args | ||
| 623 | # B operand in [pB] and also in x7...x0 | ||
| 624 | sub MUL_512x512 | ||
| 625 | { | ||
| 626 | my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_; | ||
| 627 | my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/); | ||
| 628 | my @X=@$x; # make a copy | ||
| 629 | |||
| 630 | $code.=<<___; | ||
| 631 | mov (+8*0)($pA), $OP | ||
| 632 | |||
| 633 | mov $X[0], %rax | ||
| 634 | mul $OP # rdx:rax = %OP * [0] | ||
| 635 | mov %rax, (+$pDst_o+8*0)($pDst) | ||
| 636 | mov %rdx, $X[0] | ||
| 637 | ___ | ||
| 638 | for(my $i=1;$i<8;$i++) { | ||
| 639 | $code.=<<___; | ||
| 640 | mov $X[$i], %rax | ||
| 641 | mul $OP # rdx:rax = %OP * [$i] | ||
| 642 | add %rax, $X[$i-1] | ||
| 643 | adc \$0, %rdx | ||
| 644 | mov %rdx, $X[$i] | ||
| 645 | ___ | ||
| 646 | } | ||
| 647 | |||
| 648 | for(my $i=1;$i<8;$i++) { | ||
| 649 | $code.=<<___; | ||
| 650 | mov (+8*$i)($pA), $OP | ||
| 651 | ___ | ||
| 652 | |||
| 653 | &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP); | ||
| 654 | push(@X,shift(@X)); | ||
| 655 | } | ||
| 656 | |||
| 657 | $code.=<<___; | ||
| 658 | mov $X[0], (+$pDst_o+8*8)($pDst) | ||
| 659 | mov $X[1], (+$pDst_o+8*9)($pDst) | ||
| 660 | mov $X[2], (+$pDst_o+8*10)($pDst) | ||
| 661 | mov $X[3], (+$pDst_o+8*11)($pDst) | ||
| 662 | mov $X[4], (+$pDst_o+8*12)($pDst) | ||
| 663 | mov $X[5], (+$pDst_o+8*13)($pDst) | ||
| 664 | mov $X[6], (+$pDst_o+8*14)($pDst) | ||
| 665 | mov $X[7], (+$pDst_o+8*15)($pDst) | ||
| 666 | ___ | ||
| 667 | } | ||
| 668 | |||
| 669 | # | ||
| 670 | # mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits) | ||
| 671 | # Input: src1: Address of source 1: rdi | ||
| 672 | # src2: Address of source 2: rsi | ||
| 673 | # Output: dst: Address of destination: [red_res_addr] | ||
| 674 | # src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10 | ||
| 675 | # Temp: Clobbers [tmp16], all registers | ||
| 676 | $code.=<<___; | ||
| 677 | .type mont_mul_a3b,\@abi-omnipotent | ||
| 678 | .align 16 | ||
| 679 | mont_mul_a3b: | ||
| 680 | # | ||
| 681 | # multiply tmp = src1 * src2 | ||
| 682 | # For multiply: dst = rcx, src1 = rdi, src2 = rsi | ||
| 683 | # stack depth is extra 8 from call | ||
| 684 | ___ | ||
| 685 | &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx"); | ||
| 686 | $code.=<<___; | ||
| 687 | # | ||
| 688 | # Dst = tmp % m | ||
| 689 | # Call reduce(tmp, m, data, dst) | ||
| 690 | |||
| 691 | # tail recursion optimization: jmp to mont_reduce and return from there | ||
| 692 | jmp mont_reduce | ||
| 693 | # call mont_reduce | ||
| 694 | # ret | ||
| 695 | .size mont_mul_a3b,.-mont_mul_a3b | ||
| 696 | ___ | ||
| 697 | }}} | ||
| 698 | |||
| 699 | {{{ | ||
| 700 | #SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4 | ||
| 701 | # | ||
| 702 | # Input in memory [pA] and also in x7...x0 | ||
| 703 | # Uses all argument registers plus rax and rdx | ||
| 704 | # | ||
| 705 | # This version computes all of the off-diagonal terms into memory, | ||
| 706 | # and then it adds in the diagonal terms | ||
| 707 | |||
| 708 | sub SQR_512 | ||
| 709 | { | ||
| 710 | my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_; | ||
| 711 | my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/); | ||
| 712 | my @X=@$x; # make a copy | ||
| 713 | $code.=<<___; | ||
| 714 | # ------------------ | ||
| 715 | # first pass 01...07 | ||
| 716 | # ------------------ | ||
| 717 | mov $X[0], $A | ||
| 718 | |||
| 719 | mov $X[1],%rax | ||
| 720 | mul $A | ||
| 721 | mov %rax, (+$pDst_o+8*1)($pDst) | ||
| 722 | ___ | ||
| 723 | for(my $i=2;$i<8;$i++) { | ||
| 724 | $code.=<<___; | ||
| 725 | mov %rdx, $X[$i-2] | ||
| 726 | mov $X[$i],%rax | ||
| 727 | mul $A | ||
| 728 | add %rax, $X[$i-2] | ||
| 729 | adc \$0, %rdx | ||
| 730 | ___ | ||
| 731 | } | ||
| 732 | $code.=<<___; | ||
| 733 | mov %rdx, $x7 | ||
| 734 | |||
| 735 | mov $X[0], (+$pDst_o+8*2)($pDst) | ||
| 736 | |||
| 737 | # ------------------ | ||
| 738 | # second pass 12...17 | ||
| 739 | # ------------------ | ||
| 740 | |||
| 741 | mov (+8*1)($pA), $A | ||
| 742 | |||
| 743 | mov (+8*2)($pA),%rax | ||
| 744 | mul $A | ||
| 745 | add %rax, $X[1] | ||
| 746 | adc \$0, %rdx | ||
| 747 | mov $X[1], (+$pDst_o+8*3)($pDst) | ||
| 748 | |||
| 749 | mov %rdx, $X[0] | ||
| 750 | mov (+8*3)($pA),%rax | ||
| 751 | mul $A | ||
| 752 | add %rax, $X[2] | ||
| 753 | adc \$0, %rdx | ||
| 754 | add $X[0], $X[2] | ||
| 755 | adc \$0, %rdx | ||
| 756 | mov $X[2], (+$pDst_o+8*4)($pDst) | ||
| 757 | |||
| 758 | mov %rdx, $X[0] | ||
| 759 | mov (+8*4)($pA),%rax | ||
| 760 | mul $A | ||
| 761 | add %rax, $X[3] | ||
| 762 | adc \$0, %rdx | ||
| 763 | add $X[0], $X[3] | ||
| 764 | adc \$0, %rdx | ||
| 765 | |||
| 766 | mov %rdx, $X[0] | ||
| 767 | mov (+8*5)($pA),%rax | ||
| 768 | mul $A | ||
| 769 | add %rax, $X[4] | ||
| 770 | adc \$0, %rdx | ||
| 771 | add $X[0], $X[4] | ||
| 772 | adc \$0, %rdx | ||
| 773 | |||
| 774 | mov %rdx, $X[0] | ||
| 775 | mov $X[6],%rax | ||
| 776 | mul $A | ||
| 777 | add %rax, $X[5] | ||
| 778 | adc \$0, %rdx | ||
| 779 | add $X[0], $X[5] | ||
| 780 | adc \$0, %rdx | ||
| 781 | |||
| 782 | mov %rdx, $X[0] | ||
| 783 | mov $X[7],%rax | ||
| 784 | mul $A | ||
| 785 | add %rax, $x7 | ||
| 786 | adc \$0, %rdx | ||
| 787 | add $X[0], $x7 | ||
| 788 | adc \$0, %rdx | ||
| 789 | |||
| 790 | mov %rdx, $X[1] | ||
| 791 | |||
| 792 | # ------------------ | ||
| 793 | # third pass 23...27 | ||
| 794 | # ------------------ | ||
| 795 | mov (+8*2)($pA), $A | ||
| 796 | |||
| 797 | mov (+8*3)($pA),%rax | ||
| 798 | mul $A | ||
| 799 | add %rax, $X[3] | ||
| 800 | adc \$0, %rdx | ||
| 801 | mov $X[3], (+$pDst_o+8*5)($pDst) | ||
| 802 | |||
| 803 | mov %rdx, $X[0] | ||
| 804 | mov (+8*4)($pA),%rax | ||
| 805 | mul $A | ||
| 806 | add %rax, $X[4] | ||
| 807 | adc \$0, %rdx | ||
| 808 | add $X[0], $X[4] | ||
| 809 | adc \$0, %rdx | ||
| 810 | mov $X[4], (+$pDst_o+8*6)($pDst) | ||
| 811 | |||
| 812 | mov %rdx, $X[0] | ||
| 813 | mov (+8*5)($pA),%rax | ||
| 814 | mul $A | ||
| 815 | add %rax, $X[5] | ||
| 816 | adc \$0, %rdx | ||
| 817 | add $X[0], $X[5] | ||
| 818 | adc \$0, %rdx | ||
| 819 | |||
| 820 | mov %rdx, $X[0] | ||
| 821 | mov $X[6],%rax | ||
| 822 | mul $A | ||
| 823 | add %rax, $x7 | ||
| 824 | adc \$0, %rdx | ||
| 825 | add $X[0], $x7 | ||
| 826 | adc \$0, %rdx | ||
| 827 | |||
| 828 | mov %rdx, $X[0] | ||
| 829 | mov $X[7],%rax | ||
| 830 | mul $A | ||
| 831 | add %rax, $X[1] | ||
| 832 | adc \$0, %rdx | ||
| 833 | add $X[0], $X[1] | ||
| 834 | adc \$0, %rdx | ||
| 835 | |||
| 836 | mov %rdx, $X[2] | ||
| 837 | |||
| 838 | # ------------------ | ||
| 839 | # fourth pass 34...37 | ||
| 840 | # ------------------ | ||
| 841 | |||
| 842 | mov (+8*3)($pA), $A | ||
| 843 | |||
| 844 | mov (+8*4)($pA),%rax | ||
| 845 | mul $A | ||
| 846 | add %rax, $X[5] | ||
| 847 | adc \$0, %rdx | ||
| 848 | mov $X[5], (+$pDst_o+8*7)($pDst) | ||
| 849 | |||
| 850 | mov %rdx, $X[0] | ||
| 851 | mov (+8*5)($pA),%rax | ||
| 852 | mul $A | ||
| 853 | add %rax, $x7 | ||
| 854 | adc \$0, %rdx | ||
| 855 | add $X[0], $x7 | ||
| 856 | adc \$0, %rdx | ||
| 857 | mov $x7, (+$pDst_o+8*8)($pDst) | ||
| 858 | |||
| 859 | mov %rdx, $X[0] | ||
| 860 | mov $X[6],%rax | ||
| 861 | mul $A | ||
| 862 | add %rax, $X[1] | ||
| 863 | adc \$0, %rdx | ||
| 864 | add $X[0], $X[1] | ||
| 865 | adc \$0, %rdx | ||
| 866 | |||
| 867 | mov %rdx, $X[0] | ||
| 868 | mov $X[7],%rax | ||
| 869 | mul $A | ||
| 870 | add %rax, $X[2] | ||
| 871 | adc \$0, %rdx | ||
| 872 | add $X[0], $X[2] | ||
| 873 | adc \$0, %rdx | ||
| 874 | |||
| 875 | mov %rdx, $X[5] | ||
| 876 | |||
| 877 | # ------------------ | ||
| 878 | # fifth pass 45...47 | ||
| 879 | # ------------------ | ||
| 880 | mov (+8*4)($pA), $A | ||
| 881 | |||
| 882 | mov (+8*5)($pA),%rax | ||
| 883 | mul $A | ||
| 884 | add %rax, $X[1] | ||
| 885 | adc \$0, %rdx | ||
| 886 | mov $X[1], (+$pDst_o+8*9)($pDst) | ||
| 887 | |||
| 888 | mov %rdx, $X[0] | ||
| 889 | mov $X[6],%rax | ||
| 890 | mul $A | ||
| 891 | add %rax, $X[2] | ||
| 892 | adc \$0, %rdx | ||
| 893 | add $X[0], $X[2] | ||
| 894 | adc \$0, %rdx | ||
| 895 | mov $X[2], (+$pDst_o+8*10)($pDst) | ||
| 896 | |||
| 897 | mov %rdx, $X[0] | ||
| 898 | mov $X[7],%rax | ||
| 899 | mul $A | ||
| 900 | add %rax, $X[5] | ||
| 901 | adc \$0, %rdx | ||
| 902 | add $X[0], $X[5] | ||
| 903 | adc \$0, %rdx | ||
| 904 | |||
| 905 | mov %rdx, $X[1] | ||
| 906 | |||
| 907 | # ------------------ | ||
| 908 | # sixth pass 56...57 | ||
| 909 | # ------------------ | ||
| 910 | mov (+8*5)($pA), $A | ||
| 911 | |||
| 912 | mov $X[6],%rax | ||
| 913 | mul $A | ||
| 914 | add %rax, $X[5] | ||
| 915 | adc \$0, %rdx | ||
| 916 | mov $X[5], (+$pDst_o+8*11)($pDst) | ||
| 917 | |||
| 918 | mov %rdx, $X[0] | ||
| 919 | mov $X[7],%rax | ||
| 920 | mul $A | ||
| 921 | add %rax, $X[1] | ||
| 922 | adc \$0, %rdx | ||
| 923 | add $X[0], $X[1] | ||
| 924 | adc \$0, %rdx | ||
| 925 | mov $X[1], (+$pDst_o+8*12)($pDst) | ||
| 926 | |||
| 927 | mov %rdx, $X[2] | ||
| 928 | |||
| 929 | # ------------------ | ||
| 930 | # seventh pass 67 | ||
| 931 | # ------------------ | ||
| 932 | mov $X[6], $A | ||
| 933 | |||
| 934 | mov $X[7],%rax | ||
| 935 | mul $A | ||
| 936 | add %rax, $X[2] | ||
| 937 | adc \$0, %rdx | ||
| 938 | mov $X[2], (+$pDst_o+8*13)($pDst) | ||
| 939 | |||
| 940 | mov %rdx, (+$pDst_o+8*14)($pDst) | ||
| 941 | |||
| 942 | # start finalize (add in squares, and double off-terms) | ||
| 943 | mov (+$pDst_o+8*1)($pDst), $X[0] | ||
| 944 | mov (+$pDst_o+8*2)($pDst), $X[1] | ||
| 945 | mov (+$pDst_o+8*3)($pDst), $X[2] | ||
| 946 | mov (+$pDst_o+8*4)($pDst), $X[3] | ||
| 947 | mov (+$pDst_o+8*5)($pDst), $X[4] | ||
| 948 | mov (+$pDst_o+8*6)($pDst), $X[5] | ||
| 949 | |||
| 950 | mov (+8*3)($pA), %rax | ||
| 951 | mul %rax | ||
| 952 | mov %rax, $x6 | ||
| 953 | mov %rdx, $X[6] | ||
| 954 | |||
| 955 | add $X[0], $X[0] | ||
| 956 | adc $X[1], $X[1] | ||
| 957 | adc $X[2], $X[2] | ||
| 958 | adc $X[3], $X[3] | ||
| 959 | adc $X[4], $X[4] | ||
| 960 | adc $X[5], $X[5] | ||
| 961 | adc \$0, $X[6] | ||
| 962 | |||
| 963 | mov (+8*0)($pA), %rax | ||
| 964 | mul %rax | ||
| 965 | mov %rax, (+$pDst_o+8*0)($pDst) | ||
| 966 | mov %rdx, $A | ||
| 967 | |||
| 968 | mov (+8*1)($pA), %rax | ||
| 969 | mul %rax | ||
| 970 | |||
| 971 | add $A, $X[0] | ||
| 972 | adc %rax, $X[1] | ||
| 973 | adc \$0, %rdx | ||
| 974 | |||
| 975 | mov %rdx, $A | ||
| 976 | mov $X[0], (+$pDst_o+8*1)($pDst) | ||
| 977 | mov $X[1], (+$pDst_o+8*2)($pDst) | ||
| 978 | |||
| 979 | mov (+8*2)($pA), %rax | ||
| 980 | mul %rax | ||
| 981 | |||
| 982 | add $A, $X[2] | ||
| 983 | adc %rax, $X[3] | ||
| 984 | adc \$0, %rdx | ||
| 985 | |||
| 986 | mov %rdx, $A | ||
| 987 | |||
| 988 | mov $X[2], (+$pDst_o+8*3)($pDst) | ||
| 989 | mov $X[3], (+$pDst_o+8*4)($pDst) | ||
| 990 | |||
| 991 | xor $tmp, $tmp | ||
| 992 | add $A, $X[4] | ||
| 993 | adc $x6, $X[5] | ||
| 994 | adc \$0, $tmp | ||
| 995 | |||
| 996 | mov $X[4], (+$pDst_o+8*5)($pDst) | ||
| 997 | mov $X[5], (+$pDst_o+8*6)($pDst) | ||
| 998 | |||
| 999 | # %%tmp has 0/1 in column 7 | ||
| 1000 | # %%A6 has a full value in column 7 | ||
| 1001 | |||
| 1002 | mov (+$pDst_o+8*7)($pDst), $X[0] | ||
| 1003 | mov (+$pDst_o+8*8)($pDst), $X[1] | ||
| 1004 | mov (+$pDst_o+8*9)($pDst), $X[2] | ||
| 1005 | mov (+$pDst_o+8*10)($pDst), $X[3] | ||
| 1006 | mov (+$pDst_o+8*11)($pDst), $X[4] | ||
| 1007 | mov (+$pDst_o+8*12)($pDst), $X[5] | ||
| 1008 | mov (+$pDst_o+8*13)($pDst), $x6 | ||
| 1009 | mov (+$pDst_o+8*14)($pDst), $x7 | ||
| 1010 | |||
| 1011 | mov $X[7], %rax | ||
| 1012 | mul %rax | ||
| 1013 | mov %rax, $X[7] | ||
| 1014 | mov %rdx, $A | ||
| 1015 | |||
| 1016 | add $X[0], $X[0] | ||
| 1017 | adc $X[1], $X[1] | ||
| 1018 | adc $X[2], $X[2] | ||
| 1019 | adc $X[3], $X[3] | ||
| 1020 | adc $X[4], $X[4] | ||
| 1021 | adc $X[5], $X[5] | ||
| 1022 | adc $x6, $x6 | ||
| 1023 | adc $x7, $x7 | ||
| 1024 | adc \$0, $A | ||
| 1025 | |||
| 1026 | add $tmp, $X[0] | ||
| 1027 | |||
| 1028 | mov (+8*4)($pA), %rax | ||
| 1029 | mul %rax | ||
| 1030 | |||
| 1031 | add $X[6], $X[0] | ||
| 1032 | adc %rax, $X[1] | ||
| 1033 | adc \$0, %rdx | ||
| 1034 | |||
| 1035 | mov %rdx, $tmp | ||
| 1036 | |||
| 1037 | mov $X[0], (+$pDst_o+8*7)($pDst) | ||
| 1038 | mov $X[1], (+$pDst_o+8*8)($pDst) | ||
| 1039 | |||
| 1040 | mov (+8*5)($pA), %rax | ||
| 1041 | mul %rax | ||
| 1042 | |||
| 1043 | add $tmp, $X[2] | ||
| 1044 | adc %rax, $X[3] | ||
| 1045 | adc \$0, %rdx | ||
| 1046 | |||
| 1047 | mov %rdx, $tmp | ||
| 1048 | |||
| 1049 | mov $X[2], (+$pDst_o+8*9)($pDst) | ||
| 1050 | mov $X[3], (+$pDst_o+8*10)($pDst) | ||
| 1051 | |||
| 1052 | mov (+8*6)($pA), %rax | ||
| 1053 | mul %rax | ||
| 1054 | |||
| 1055 | add $tmp, $X[4] | ||
| 1056 | adc %rax, $X[5] | ||
| 1057 | adc \$0, %rdx | ||
| 1058 | |||
| 1059 | mov $X[4], (+$pDst_o+8*11)($pDst) | ||
| 1060 | mov $X[5], (+$pDst_o+8*12)($pDst) | ||
| 1061 | |||
| 1062 | add %rdx, $x6 | ||
| 1063 | adc $X[7], $x7 | ||
| 1064 | adc \$0, $A | ||
| 1065 | |||
| 1066 | mov $x6, (+$pDst_o+8*13)($pDst) | ||
| 1067 | mov $x7, (+$pDst_o+8*14)($pDst) | ||
| 1068 | mov $A, (+$pDst_o+8*15)($pDst) | ||
| 1069 | ___ | ||
| 1070 | } | ||
| 1071 | |||
| 1072 | # | ||
| 1073 | # sqr_reduce: subroutine to compute Result = reduce(Result * Result) | ||
| 1074 | # | ||
| 1075 | # input and result also in: r9, r8, r15, r14, r13, r12, r11, r10 | ||
| 1076 | # | ||
| 1077 | $code.=<<___; | ||
| 1078 | .type sqr_reduce,\@abi-omnipotent | ||
| 1079 | .align 16 | ||
| 1080 | sqr_reduce: | ||
| 1081 | mov (+$pResult_offset+8)(%rsp), %rcx | ||
| 1082 | ___ | ||
| 1083 | &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi"); | ||
| 1084 | $code.=<<___; | ||
| 1085 | # tail recursion optimization: jmp to mont_reduce and return from there | ||
| 1086 | jmp mont_reduce | ||
| 1087 | # call mont_reduce | ||
| 1088 | # ret | ||
| 1089 | .size sqr_reduce,.-sqr_reduce | ||
| 1090 | ___ | ||
| 1091 | }}} | ||
| 1092 | |||
| 1093 | # | ||
| 1094 | # MAIN FUNCTION | ||
| 1095 | # | ||
| 1096 | |||
| 1097 | #mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */ | ||
| 1098 | # UINT64 *g, /* 512 bits, 8 qwords */ | ||
| 1099 | # UINT64 *exp, /* 512 bits, 8 qwords */ | ||
| 1100 | # struct mod_ctx_512 *data) | ||
| 1101 | |||
| 1102 | # window size = 5 | ||
| 1103 | # table size = 2^5 = 32 | ||
| 1104 | #table_entries equ 32 | ||
| 1105 | #table_size equ table_entries * 8 | ||
| 1106 | $code.=<<___; | ||
| 1107 | .globl mod_exp_512 | ||
| 1108 | .type mod_exp_512,\@function,4 | ||
| 1109 | mod_exp_512: | ||
| 1110 | push %rbp | ||
| 1111 | push %rbx | ||
| 1112 | push %r12 | ||
| 1113 | push %r13 | ||
| 1114 | push %r14 | ||
| 1115 | push %r15 | ||
| 1116 | |||
| 1117 | # adjust stack down and then align it with cache boundary | ||
| 1118 | mov %rsp, %r8 | ||
| 1119 | sub \$$mem_size, %rsp | ||
| 1120 | and \$-64, %rsp | ||
| 1121 | |||
| 1122 | # store previous stack pointer and arguments | ||
| 1123 | mov %r8, (+$rsp_offset)(%rsp) | ||
| 1124 | mov %rdi, (+$pResult_offset)(%rsp) | ||
| 1125 | mov %rsi, (+$pG_offset)(%rsp) | ||
| 1126 | mov %rcx, (+$pData_offset)(%rsp) | ||
| 1127 | .Lbody: | ||
| 1128 | # transform g into montgomery space | ||
| 1129 | # GT = reduce(g * C2) = reduce(g * (2^256)) | ||
| 1130 | # reduce expects to have the input in [tmp16] | ||
| 1131 | pxor %xmm4, %xmm4 | ||
| 1132 | movdqu (+16*0)(%rsi), %xmm0 | ||
| 1133 | movdqu (+16*1)(%rsi), %xmm1 | ||
| 1134 | movdqu (+16*2)(%rsi), %xmm2 | ||
| 1135 | movdqu (+16*3)(%rsi), %xmm3 | ||
| 1136 | movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp) | ||
| 1137 | movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp) | ||
| 1138 | movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp) | ||
| 1139 | movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp) | ||
| 1140 | movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp) | ||
| 1141 | movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp) | ||
| 1142 | movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp) | ||
| 1143 | movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp) | ||
| 1144 | |||
| 1145 | # load pExp before rdx gets blown away | ||
| 1146 | movdqu (+16*0)(%rdx), %xmm0 | ||
| 1147 | movdqu (+16*1)(%rdx), %xmm1 | ||
| 1148 | movdqu (+16*2)(%rdx), %xmm2 | ||
| 1149 | movdqu (+16*3)(%rdx), %xmm3 | ||
| 1150 | |||
| 1151 | lea (+$GT_offset)(%rsp), %rbx | ||
| 1152 | mov %rbx, (+$red_result_addr_offset)(%rsp) | ||
| 1153 | call mont_reduce | ||
| 1154 | |||
| 1155 | # Initialize tmp = C | ||
| 1156 | lea (+$tmp_offset)(%rsp), %rcx | ||
| 1157 | xor %rax, %rax | ||
| 1158 | mov %rax, (+8*0)(%rcx) | ||
| 1159 | mov %rax, (+8*1)(%rcx) | ||
| 1160 | mov %rax, (+8*3)(%rcx) | ||
| 1161 | mov %rax, (+8*4)(%rcx) | ||
| 1162 | mov %rax, (+8*5)(%rcx) | ||
| 1163 | mov %rax, (+8*6)(%rcx) | ||
| 1164 | mov %rax, (+8*7)(%rcx) | ||
| 1165 | mov %rax, (+$exp_offset+8*8)(%rsp) | ||
| 1166 | movq \$1, (+8*2)(%rcx) | ||
| 1167 | |||
| 1168 | lea (+$garray_offset)(%rsp), %rbp | ||
| 1169 | mov %rcx, %rsi # pTmp | ||
| 1170 | mov %rbp, %rdi # Garray[][0] | ||
| 1171 | ___ | ||
| 1172 | |||
| 1173 | &swizzle("%rdi", "%rcx", "%rax", "%rbx"); | ||
| 1174 | |||
| 1175 | # for (rax = 31; rax != 0; rax--) { | ||
| 1176 | # tmp = reduce(tmp * G) | ||
| 1177 | # swizzle(pg, tmp); | ||
| 1178 | # pg += 2; } | ||
| 1179 | $code.=<<___; | ||
| 1180 | mov \$31, %rax | ||
| 1181 | mov %rax, (+$i_offset)(%rsp) | ||
| 1182 | mov %rbp, (+$pg_offset)(%rsp) | ||
| 1183 | # rsi -> pTmp | ||
| 1184 | mov %rsi, (+$red_result_addr_offset)(%rsp) | ||
| 1185 | mov (+8*0)(%rsi), %r10 | ||
| 1186 | mov (+8*1)(%rsi), %r11 | ||
| 1187 | mov (+8*2)(%rsi), %r12 | ||
| 1188 | mov (+8*3)(%rsi), %r13 | ||
| 1189 | mov (+8*4)(%rsi), %r14 | ||
| 1190 | mov (+8*5)(%rsi), %r15 | ||
| 1191 | mov (+8*6)(%rsi), %r8 | ||
| 1192 | mov (+8*7)(%rsi), %r9 | ||
| 1193 | init_loop: | ||
| 1194 | lea (+$GT_offset)(%rsp), %rdi | ||
| 1195 | call mont_mul_a3b | ||
| 1196 | lea (+$tmp_offset)(%rsp), %rsi | ||
| 1197 | mov (+$pg_offset)(%rsp), %rbp | ||
| 1198 | add \$2, %rbp | ||
| 1199 | mov %rbp, (+$pg_offset)(%rsp) | ||
| 1200 | mov %rsi, %rcx # rcx = rsi = addr of tmp | ||
| 1201 | ___ | ||
| 1202 | |||
| 1203 | &swizzle("%rbp", "%rcx", "%rax", "%rbx"); | ||
| 1204 | $code.=<<___; | ||
| 1205 | mov (+$i_offset)(%rsp), %rax | ||
| 1206 | sub \$1, %rax | ||
| 1207 | mov %rax, (+$i_offset)(%rsp) | ||
| 1208 | jne init_loop | ||
| 1209 | |||
| 1210 | # | ||
| 1211 | # Copy exponent onto stack | ||
| 1212 | movdqa %xmm0, (+$exp_offset+16*0)(%rsp) | ||
| 1213 | movdqa %xmm1, (+$exp_offset+16*1)(%rsp) | ||
| 1214 | movdqa %xmm2, (+$exp_offset+16*2)(%rsp) | ||
| 1215 | movdqa %xmm3, (+$exp_offset+16*3)(%rsp) | ||
| 1216 | |||
| 1217 | |||
| 1218 | # | ||
| 1219 | # Do exponentiation | ||
| 1220 | # Initialize result to G[exp{511:507}] | ||
| 1221 | mov (+$exp_offset+62)(%rsp), %eax | ||
| 1222 | mov %rax, %rdx | ||
| 1223 | shr \$11, %rax | ||
| 1224 | and \$0x07FF, %edx | ||
| 1225 | mov %edx, (+$exp_offset+62)(%rsp) | ||
| 1226 | lea (+$garray_offset)(%rsp,%rax,2), %rsi | ||
| 1227 | mov (+$pResult_offset)(%rsp), %rdx | ||
| 1228 | ___ | ||
| 1229 | |||
| 1230 | &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax"); | ||
| 1231 | |||
| 1232 | # | ||
| 1233 | # Loop variables | ||
| 1234 | # rcx = [loop_idx] = index: 510-5 to 0 by 5 | ||
| 1235 | $code.=<<___; | ||
| 1236 | movq \$505, (+$loop_idx_offset)(%rsp) | ||
| 1237 | |||
| 1238 | mov (+$pResult_offset)(%rsp), %rcx | ||
| 1239 | mov %rcx, (+$red_result_addr_offset)(%rsp) | ||
| 1240 | mov (+8*0)(%rcx), %r10 | ||
| 1241 | mov (+8*1)(%rcx), %r11 | ||
| 1242 | mov (+8*2)(%rcx), %r12 | ||
| 1243 | mov (+8*3)(%rcx), %r13 | ||
| 1244 | mov (+8*4)(%rcx), %r14 | ||
| 1245 | mov (+8*5)(%rcx), %r15 | ||
| 1246 | mov (+8*6)(%rcx), %r8 | ||
| 1247 | mov (+8*7)(%rcx), %r9 | ||
| 1248 | jmp sqr_2 | ||
| 1249 | |||
| 1250 | main_loop_a3b: | ||
| 1251 | call sqr_reduce | ||
| 1252 | call sqr_reduce | ||
| 1253 | call sqr_reduce | ||
| 1254 | sqr_2: | ||
| 1255 | call sqr_reduce | ||
| 1256 | call sqr_reduce | ||
| 1257 | |||
| 1258 | # | ||
| 1259 | # Do multiply, first look up proper value in Garray | ||
| 1260 | mov (+$loop_idx_offset)(%rsp), %rcx # bit index | ||
| 1261 | mov %rcx, %rax | ||
| 1262 | shr \$4, %rax # rax is word pointer | ||
| 1263 | mov (+$exp_offset)(%rsp,%rax,2), %edx | ||
| 1264 | and \$15, %rcx | ||
| 1265 | shrq %cl, %rdx | ||
| 1266 | and \$0x1F, %rdx | ||
| 1267 | |||
| 1268 | lea (+$garray_offset)(%rsp,%rdx,2), %rsi | ||
| 1269 | lea (+$tmp_offset)(%rsp), %rdx | ||
| 1270 | mov %rdx, %rdi | ||
| 1271 | ___ | ||
| 1272 | |||
| 1273 | &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax"); | ||
| 1274 | # rdi = tmp = pG | ||
| 1275 | |||
| 1276 | # | ||
| 1277 | # Call mod_mul_a1(pDst, pSrc1, pSrc2, pM, pData) | ||
| 1278 | # result result pG M Data | ||
# Emit: multiply running result by the selected power of G, close the
# exponent-scanning loop, convert out of Montgomery form, do the final
# conditional subtraction of the modulus, and restore callee-saved
# registers / return.  (NOTE(review): heredoc text is the generated
# assembly and is reproduced verbatim.)
$code.=<<___;
	mov	(+$pResult_offset)(%rsp), %rsi
	call	mont_mul_a3b

	#
	# finish loop
	mov	(+$loop_idx_offset)(%rsp), %rcx
	sub	\$5, %rcx
	mov	%rcx, (+$loop_idx_offset)(%rsp)
	jge	main_loop_a3b

	#

end_main_loop_a3b:
	# transform result out of Montgomery space
	# result = reduce(result)
	mov	(+$pResult_offset)(%rsp), %rdx
	pxor	%xmm4, %xmm4
	movdqu	(+16*0)(%rdx), %xmm0
	movdqu	(+16*1)(%rdx), %xmm1
	movdqu	(+16*2)(%rdx), %xmm2
	movdqu	(+16*3)(%rdx), %xmm3
	movdqa	%xmm4, (+$tmp16_offset+16*4)(%rsp)
	movdqa	%xmm4, (+$tmp16_offset+16*5)(%rsp)
	movdqa	%xmm4, (+$tmp16_offset+16*6)(%rsp)
	movdqa	%xmm4, (+$tmp16_offset+16*7)(%rsp)
	movdqa	%xmm0, (+$tmp16_offset+16*0)(%rsp)
	movdqa	%xmm1, (+$tmp16_offset+16*1)(%rsp)
	movdqa	%xmm2, (+$tmp16_offset+16*2)(%rsp)
	movdqa	%xmm3, (+$tmp16_offset+16*3)(%rsp)
	call	mont_reduce

	# If result > m, subract m
	# load result into r15:r8
	mov	(+$pResult_offset)(%rsp), %rax
	mov	(+8*0)(%rax), %r8
	mov	(+8*1)(%rax), %r9
	mov	(+8*2)(%rax), %r10
	mov	(+8*3)(%rax), %r11
	mov	(+8*4)(%rax), %r12
	mov	(+8*5)(%rax), %r13
	mov	(+8*6)(%rax), %r14
	mov	(+8*7)(%rax), %r15

	# subtract m
	mov	(+$pData_offset)(%rsp), %rbx
	add	\$$M, %rbx

	sub	(+8*0)(%rbx), %r8
	sbb	(+8*1)(%rbx), %r9
	sbb	(+8*2)(%rbx), %r10
	sbb	(+8*3)(%rbx), %r11
	sbb	(+8*4)(%rbx), %r12
	sbb	(+8*5)(%rbx), %r13
	sbb	(+8*6)(%rbx), %r14
	sbb	(+8*7)(%rbx), %r15

	# if Carry is clear, replace result with difference
	mov	(+8*0)(%rax), %rsi
	mov	(+8*1)(%rax), %rdi
	mov	(+8*2)(%rax), %rcx
	mov	(+8*3)(%rax), %rdx
	cmovnc	%r8, %rsi
	cmovnc	%r9, %rdi
	cmovnc	%r10, %rcx
	cmovnc	%r11, %rdx
	mov	%rsi, (+8*0)(%rax)
	mov	%rdi, (+8*1)(%rax)
	mov	%rcx, (+8*2)(%rax)
	mov	%rdx, (+8*3)(%rax)

	mov	(+8*4)(%rax), %rsi
	mov	(+8*5)(%rax), %rdi
	mov	(+8*6)(%rax), %rcx
	mov	(+8*7)(%rax), %rdx
	cmovnc	%r12, %rsi
	cmovnc	%r13, %rdi
	cmovnc	%r14, %rcx
	cmovnc	%r15, %rdx
	mov	%rsi, (+8*4)(%rax)
	mov	%rdi, (+8*5)(%rax)
	mov	%rcx, (+8*6)(%rax)
	mov	%rdx, (+8*7)(%rax)

	mov	(+$rsp_offset)(%rsp), %rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbx
	mov	40(%rsi),%rbp
	lea	48(%rsi),%rsp
.Lepilogue:
	ret
.size mod_exp_512, . - mod_exp_512
___
| 1375 | |||
if ($win64) {
	# On Win64, emit a structured-exception handler plus .pdata/.xdata
	# entries so the OS can unwind through mod_exp_512's custom frame.
	# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
	#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
	my $rec="%rcx";
	my $frame="%rdx";
	my $context="%r8";
	my $disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mod_exp_512_se_handler,\@abi-omnipotent
.align	16
mod_exp_512_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lbody(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	$rsp_offset(%rax),%rax	# pull saved Rsp

	mov	32(%rax),%rbx
	mov	40(%rax),%rbp
	mov	24(%rax),%r12
	mov	16(%rax),%r13
	mov	8(%rax),%r14
	mov	0(%rax),%r15
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size mod_exp_512_se_handler,.-mod_exp_512_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_mod_exp_512
	.rva	.LSEH_end_mod_exp_512
	.rva	.LSEH_info_mod_exp_512

.section	.xdata
.align	8
.LSEH_info_mod_exp_512:
	.byte	9,0,0,0
	.rva	mod_exp_512_se_handler
___
}
| 1482 | |||
# Map a 64-bit register name to its byte ("b"), word ("w") or dword
# ("d") sub-register: numbered registers (%r8..%r15) just take the
# suffix, legacy registers (%rax, %rsi, ...) are rewritten by pattern.
# Unknown suffixes leave the name unchanged.
sub reg_part {
    my ($name, $size) = @_;

    # %r8..%r15: append the size letter directly (%r10 + "d" -> %r10d).
    return $name . $size if $name =~ /%r[0-9]+/;

    if ($size eq "b") {
        $name =~ s/%[er]([^x]+)x?/%$1l/;   # %rax -> %al, %rsi -> %sil
    }
    elsif ($size eq "w") {
        $name =~ s/%[er](.+)/%$1/;         # %rax -> %ax
    }
    elsif ($size eq "d") {
        $name =~ s/%[er](.+)/%e$1/;        # %rax -> %eax
    }
    return $name;
}
| 1491 | |||
# Post-process the accumulated assembly text and write it out:
#  - expand "%reg#b/w/d" shorthands into properly sized registers,
#  - evaluate `...` back-tick arithmetic escapes,
#  - fold "(+expr)" displacement expressions into constants.
$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/(\(\+[^)]+\))/eval $1/gem;
print $code;
# STDOUT is a buffered write handle (redirected to the .s file earlier);
# an unchecked close would silently discard buffered-write errors.
close STDOUT or die "error closing STDOUT: $!";
diff --git a/src/lib/libcrypto/bn/asm/parisc-mont.pl b/src/lib/libcrypto/bn/asm/parisc-mont.pl new file mode 100644 index 0000000000..4a766a87fb --- /dev/null +++ b/src/lib/libcrypto/bn/asm/parisc-mont.pl | |||
| @@ -0,0 +1,993 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # On PA-7100LC this module performs ~90-50% better, less for longer | ||
| 11 | # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means | ||
| 12 | # that compiler utilized xmpyu instruction to perform 32x32=64-bit | ||
| 13 | # multiplication, which in turn means that "baseline" performance was | ||
| 14 | # optimal in respect to instruction set capabilities. Fair comparison | ||
| 15 | # with vendor compiler is problematic, because OpenSSL doesn't define | ||
| 16 | # BN_LLONG [presumably] for historical reasons, which drives compiler | ||
# toward 4 times 16x16=32-bit multiplications [plus complementary
| 18 | # shifts and additions] instead. This means that you should observe | ||
| 19 | # several times improvement over code generated by vendor compiler | ||
| 20 | # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual | ||
| 21 | # improvement coefficient was never collected on PA-7100LC, or any | ||
| 22 | # other 1.1 CPU, because I don't have access to such machine with | ||
| 23 | # vendor compiler. But to give you a taste, PA-RISC 1.1 code path | ||
| 24 | # reportedly outperformed code generated by cc +DA1.1 +O3 by factor | ||
| 25 | # of ~5x on PA-8600. | ||
| 26 | # | ||
| 27 | # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is | ||
| 28 | # reportedly ~2x faster than vendor compiler generated code [according | ||
| 29 | # to comment in pa-risc2[W].s]. Here comes a catch. Execution core of | ||
| 30 | # this implementation is actually 32-bit one, in the sense that it | ||
| 31 | # operates on 32-bit values. But pa-risc2[W].s operates on arrays of | ||
| 32 | # 64-bit BN_LONGs... How do they interoperate then? No problem. This | ||
| 33 | # module picks halves of 64-bit values in reverse order and pretends | ||
| 34 | # they were 32-bit BN_LONGs. But can 32-bit core compete with "pure" | ||
| 35 | # 64-bit code such as pa-risc2[W].s then? Well, the thing is that | ||
| 36 | # 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do, | ||
| 37 | # i.e. there is no "wider" multiplication like on most other 64-bit | ||
| 38 | # platforms. This means that even being effectively 32-bit, this | ||
| 39 | # implementation performs "64-bit" computational task in same amount | ||
| 40 | # of arithmetic operations, most notably multiplications. It requires | ||
| 41 | # more memory references, most notably to tp[num], but this doesn't | ||
| 42 | # seem to exhaust memory port capacity. And indeed, dedicated PA-RISC | ||
| 43 | # 2.0 code path, provides virtually same performance as pa-risc2[W].s: | ||
| 44 | # it's ~10% better for shortest key length and ~10% worse for longest | ||
| 45 | # one. | ||
| 46 | # | ||
| 47 | # In case it wasn't clear. The module has two distinct code paths: | ||
| 48 | # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit | ||
| 49 | # additions and 64-bit integer loads, not to mention specific | ||
| 50 | # instruction scheduling. In 64-bit build naturally only 2.0 code path | ||
| 51 | # is assembled. In 32-bit application context both code paths are | ||
| 52 | # assembled, PA-RISC 2.0 CPU is detected at run-time and proper path | ||
| 53 | # is taken automatically. Also, in 32-bit build the module imposes | ||
# a couple of limitations: vector lengths have to be even and vector
# addresses have to be 64-bit aligned. Normally neither is a problem:
| 56 | # most common key lengths are even and vectors are commonly malloc-ed, | ||
| 57 | # which ensures alignment. | ||
| 58 | # | ||
| 59 | # Special thanks to polarhome.com for providing HP-UX account on | ||
| 60 | # PA-RISC 1.1 machine, and to correspondent who chose to remain | ||
| 61 | # anonymous for testing the code on PA-RISC 2.0 machine. | ||
| 62 | |||
# Locate the directory this script resides in (used below to find
# ../../opensslconf.h); $dir stays undef if $0 has no path component.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;

$flavour = shift;	# "...64..." selects the 64-bit (2.0W) ABI
$output  = shift;	# path of the assembly file to generate

# Redirect all emitted code to the output file.  Use checked 3-arg open:
# the historical 2-arg `open STDOUT,">$output"` is mode-injection-prone
# and fails silently, leaving an empty/partial .s file.
open STDOUT, '>', $output or die "can't open $output: $!";

if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$BN_SZ		=$SIZE_T;
} else {
	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$BN_SZ		=$SIZE_T;
	# 32-bit build with 64-bit BN_ULONG: probe opensslconf.h for
	# SIXTY_FOUR_BIT and switch to the 2.0 code model if found.
	# (lexical 3-arg open instead of the bareword 2-arg original)
	if (open my $conf, '<', "${dir}../../opensslconf.h") {
		while (<$conf>) {
			if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
				$BN_SZ=8;
				$LEVEL="2.0";
				last;
			}
		}
		close $conf;
	}
}
| 101 | |||
# Stack-frame layout.
$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
				# [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;	# offset of local area within the frame
$FRAME+=32;			# local variables

# Caller-saved temporaries / loop bookkeeping.
$tp="%r31";			# pointer into tp[] scratch vector
$ti1="%r29";
$ti0="%r28";

# Incoming arguments: bn_mul_mont(rp, ap, bp, np, n0, num).
$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22";			# passed through stack in 32-bit
$num="%r21";			# passed through stack in 32-bit
$idx="%r20";			# byte index j, counts up toward 0
$arrsz="%r19";			# vector size in bytes

# Callee-saved accumulators (saved in the prologue).
$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";

$fp="%r3";			# frame pointer
$hi1="%r2";
$hi0="%r1";

$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s

# FP registers used for 32x32=64-bit xmpyu multiplies.
$fm0="%fr4"; $fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
| 136 | |||
# Emit the bn_mul_mont entry: assembler directives, standard prologue
# (save rp and callee-saved %r3-%r10, establish $fp), stack-argument
# pickup, input sanity checks, tp[] allocation and the first
# ap[0,1]*bp[0] / np[0,1]*m multiplies that prime the pipeline.
# Heredoc text (the generated assembly) is reproduced verbatim.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
bn_mul_mont
	.PROC
	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	ldo	-$FRAME(%sp),$fp
___
# 32-bit ABI: 5th/6th arguments (n0, num) arrive on the caller's stack.
$code.=<<___ if ($SIZE_T==4);
	ldw	`-$FRAME_MARKER-4`($fp),$n0
	ldw	`-$FRAME_MARKER-8`($fp),$num
	nop
	nop					; alignment
___
# 32-bit BN_ULONG: reject short vectors, odd num or unaligned ap/np —
# returning 0 in %r28 signals "unhandled" to the C fallback.
$code.=<<___ if ($BN_SZ==4);
	comiclr,<=	6,$num,%r0		; are vectors long enough?
	b		L\$abort
	ldi		0,%r28			; signal "unhandled"
	add,ev		%r0,$num,$num		; is $num even?
	b		L\$abort
	nop
	or		$ap,$np,$ti1
	extru,=		$ti1,31,3,%r0		; are ap and np 64-bit aligned?
	b		L\$abort
	nop
	nop					; alignment
	nop

	fldws		0($n0),${fn0}
	fldws,ma	4($bp),${fbi}		; bp[0]
___
# 64-bit BN_ULONG: this core still works on 32-bit halves (in flipped
# word order), so num is doubled here.
$code.=<<___ if ($BN_SZ==8);
	comib,>		3,$num,L\$abort		; are vectors long enough?
	ldi		0,%r28			; signal "unhandled"
	addl		$num,$num,$num		; I operate on 32-bit values

	fldws		4($n0),${fn0}		; only low part of n0
	fldws		4($bp),${fbi}		; bp[0] in flipped word order
___
$code.=<<___;
	fldds		0($ap),${fai}		; ap[0,1]
	fldds		0($np),${fni}		; np[0,1]

	sh2addl		$num,%r0,$arrsz
	ldi		31,$hi0
	ldo		36($arrsz),$hi1		; space for tp[num+1]
	andcm		$hi1,$hi0,$hi1		; align
	addl		$hi1,%sp,%sp
	$PUSH		$fp,-$SIZE_T(%sp)

	ldo		`$LOCALS+16`($fp),$xfer
	ldo		`$LOCALS+32+4`($fp),$tp

	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
	xmpyu		${fn0},${fab0}R,${fm0}

	addl		$arrsz,$ap,$ap		; point at the end
	addl		$arrsz,$np,$np
	subi		0,$arrsz,$idx		; j=0
	ldo		8($idx),$idx		; j++++

	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)
	fstds		${fab1},0($xfer)
	fstds		${fnm1},8($xfer)
	flddx		$idx($ap),${fai}	; ap[2,3]
	flddx		$idx($np),${fni}	; np[2,3]
___
# 32-bit build only: run-time CPU dispatch.  extrd,u is a 2.0-only
# instruction that 1.x CPUs trap on as a nullifying no-op — presumably
# how 2.0 is detected here; on pre-2.0 parts control branches to the
# L\$parisc11 path emitted later.  (NOTE(review): dispatch trick
# inferred from the inline comments — confirm against PA-RISC manuals.)
$code.=<<___ if ($BN_SZ==4);
	mtctl		$hi0,%cr11		; $hi0 still holds 31
	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
	b		L\$parisc11
	nop
___
# PA-RISC 2.0 first pass (i=0): compute tp[] = ap[]*bp[0] + np[]*m,
# software-pipelined two words per iteration through $xfer.
$code.=<<___;		# PA-RISC 2.0 code-path
	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)

	extrd,u		$ab0,31,32,$hi0
	extrd,u		$ab0,63,32,$ab0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	ldo		8($idx),$idx		; j++++
	addl		$ab0,$nm0,$nm0		; low part is discarded
	extrd,u		$nm0,31,32,$hi1

L\$1st
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,63,32,$ab1
	addl		$hi1,$nm1,$nm1
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	flddx		$idx($np),${fni}	; np[j,j+1]
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)
	addl		$hi0,$ab0,$ab0
	extrd,u		$ab0,31,32,$hi0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	stw		$nm1,-4($tp)		; tp[j-1]
	addl		$ab0,$nm0,$nm0
	stw,ma		$nm0,8($tp)		; tp[j-1]
	addib,<>	8,$idx,L\$1st		; j++++
	extrd,u		$nm0,31,32,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,63,32,$ab1
	addl		$hi1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	addl		$ab1,$nm1,$nm1
	ldd		-8($xfer),$nm0
	extrd,u		$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	extrd,u		$ab0,31,32,$hi0
	stw		$nm1,-4($tp)		; tp[j-1]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	extrd,u		$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)		; tp[j-1]

	ldo		-1($num),$num		; i--
	subi		0,$arrsz,$idx		; j=0
___
# Load bp[1]; 64-bit BN_ULONG builds read the halves in flipped word
# order, so the second half sits at offset 0 (no post-increment here).
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}		; bp[1]
___
$code.=<<___ if ($BN_SZ==8);
	fldws		0($bp),${fbi}		; bp[1] in flipped word order
___
# Finish the first pass (store tp[num-1] and the top two carry words),
# then enter the i-loop: each L\$outer iteration accumulates
# ap[]*bp[i] + np[]*m into tp[], with the inner j-loop (L\$inner)
# again pipelined two words per turn through $xfer.
$code.=<<___;
	flddx		$idx($ap),${fai}	; ap[0,1]
	flddx		$idx($np),${fni}	; np[0,1]
	fldws		8($xfer),${fti}R	; tp[0]
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	ldo		8($idx),$idx		; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	stw		$nm1,-4($tp)		; tp[j-1]

	fcpy,sgl	%fr0,${fti}L		; zero high part
	fcpy,sgl	%fr0,${fab0}L
	addl		$hi1,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
	xmpyu		${fn0},${fab0}R,${fm0}
	ldo		`$LOCALS+32+4`($fp),$tp
L\$outer
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)	; 33-bit value
	fstds		${fnm0},-8($xfer)
	flddx		$idx($ap),${fai}	; ap[2]
	flddx		$idx($np),${fni}	; np[2]
	ldo		8($idx),$idx		; j++++
	ldd		-16($xfer),$ab0		; 33-bit value
	ldd		-8($xfer),$nm0
	ldw		0($xfer),$hi0		; high part

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	extrd,u		$ab0,31,32,$ti0		; carry bit
	extrd,u		$ab0,63,32,$ab0
	fstds		${fab1},0($xfer)
	addl		$ti0,$hi0,$hi0		; account carry bit
	fstds		${fnm1},8($xfer)
	addl		$ab0,$nm0,$nm0		; low part is discarded
	ldw		0($tp),$ti1		; tp[1]
	extrd,u		$nm0,31,32,$hi1
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)

L\$inner
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ti1,$ti1
	addl		$ti1,$ab1,$ab1
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	flddx		$idx($np),${fni}	; np[j,j+1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	ldw		4($tp),$ti0		; tp[j]
	stw		$nm1,-4($tp)		; tp[j-1]

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)
	addl		$hi0,$ti0,$ti0
	addl		$ti0,$ab0,$ab0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	extrd,u		$ab0,31,32,$hi0
	extrd,u		$nm1,31,32,$hi1
	ldw		8($tp),$ti1		; tp[j]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	addl		$ab0,$nm0,$nm0
	stw,ma		$nm0,8($tp)		; tp[j-1]
	addib,<>	8,$idx,L\$inner		; j++++
	extrd,u		$nm0,31,32,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ti1,$ti1
	addl		$ti1,$ab1,$ab1
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	ldw		4($tp),$ti0		; tp[j]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	ldd		-8($xfer),$nm0
	extrd,u		$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	addl		$ti0,$ab0,$ab0
	stw		$nm1,-4($tp)		; tp[j-1]
	extrd,u		$ab0,31,32,$hi0
	ldw		8($tp),$ti1		; tp[j]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	extrd,u		$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)		; tp[j-1]

	addib,=		-1,$num,L\$outerdone	; i--
	subi		0,$arrsz,$idx		; j=0
___
# Advance to bp[i+1]; the 64-bit-BN_ULONG variant walks the halves of
# each bp word in flipped order, alternating offsets +12/-4.
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}		; bp[i]
___
$code.=<<___ if ($BN_SZ==8);
	ldi		12,$ti0			; bp[i] in flipped word order
	addl,ev		%r0,$num,$num
	ldi		-4,$ti0
	addl		$ti0,$bp,$bp
	fldws		0($bp),${fbi}
___
# Outer-loop tail: flush the last two tp words plus carries, compute
# the next m = tp[0]*n0 via double-precision add (the 33-bit trick),
# and loop back; after the last i, fall into L\$outerdone.
$code.=<<___;
	flddx		$idx($ap),${fai}	; ap[0]
	addl		$hi0,$ab1,$ab1
	flddx		$idx($np),${fni}	; np[0]
	fldws		8($xfer),${fti}R	; tp[0]
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldo		8($idx),$idx		; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
	ldw		4($tp),$ti0		; tp[j]

	addl		$hi1,$nm1,$nm1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	fcpy,sgl	%fr0,${fti}L		; zero high part
	fcpy,sgl	%fr0,${fab0}L
	stw		$nm1,-4($tp)		; tp[j-1]

	fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	addl		$hi1,$hi0,$hi0
	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)
	xmpyu		${fn0},${fab0}R,${fm0}

	b		L\$outer
	ldo		`$LOCALS+32+4`($fp),$tp

L\$outerdone
	addl		$hi0,$ab1,$ab1
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldw		4($tp),$ti0		; tp[j]

	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	stw		$nm1,-4($tp)		; tp[j-1]

	addl		$hi1,$hi0,$hi0
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	ldo		`$LOCALS+32`($fp),$tp
	sub		%r0,%r0,%r0		; clear borrow
___
# Final reduction: compute tp - np with borrow propagation.
$code.=<<___ if ($BN_SZ==4);
	ldws,ma		4($tp),$ti0
	extru,=		$rp,31,3,%r0		; is rp 64-bit aligned?
	b		L\$sub_pa11
	addl		$tp,$arrsz,$tp
L\$sub
	ldwx		$idx($np),$hi0
	subb		$ti0,$hi0,$hi1
	ldwx		$idx($tp),$ti0
	addib,<>	4,$idx,L\$sub
	stws,ma		$hi1,4($rp)

	subb		$ti0,%r0,$hi1
	ldo		-4($tp),$tp
___
$code.=<<___ if ($BN_SZ==8);
	ldd,ma		8($tp),$ti0
L\$sub
	ldd		$idx($np),$hi0
	shrpd		$ti0,$ti0,32,$ti0	; flip word order
	std		$ti0,-8($tp)		; save flipped value
	sub,db		$ti0,$hi0,$hi1
	ldd,ma		8($tp),$ti0
	addib,<>	8,$idx,L\$sub
	std,ma		$hi1,8($rp)

	extrd,u		$ti0,31,32,$ti0		; carry in flipped word order
	sub,db		$ti0,%r0,$hi1
	ldo		-8($tp),$tp
___
# Branch-free select: final borrow ($hi1 is all-ones/all-zeros mask)
# picks tp or rp as the source, then copy out and wipe tp[].
$code.=<<___;
	and		$tp,$hi1,$ap
	andcm		$rp,$hi1,$bp
	or		$ap,$bp,$np

	sub		$rp,$arrsz,$rp		; rewind rp
	subi		0,$arrsz,$idx
	ldo		`$LOCALS+32`($fp),$tp
L\$copy
	ldd		$idx($np),$hi0
	std,ma		%r0,8($tp)
	addib,<>	8,$idx,.-8		; L\$copy
	std,ma		$hi0,8($rp)
___
| 544 | |||
| 545 | if ($BN_SZ==4) { # PA-RISC 1.1 code-path | ||
| 546 | $ablo=$ab0; | ||
| 547 | $abhi=$ab1; | ||
| 548 | $nmlo0=$nm0; | ||
| 549 | $nmhi0=$nm1; | ||
| 550 | $nmlo1="%r9"; | ||
| 551 | $nmhi1="%r8"; | ||
| 552 | |||
| 553 | $code.=<<___; | ||
| 554 | b L\$done | ||
| 555 | nop | ||
| 556 | |||
| 557 | .ALIGN 8 | ||
| 558 | L\$parisc11 | ||
| 559 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
| 560 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 561 | ldw -12($xfer),$ablo | ||
| 562 | ldw -16($xfer),$hi0 | ||
| 563 | ldw -4($xfer),$nmlo0 | ||
| 564 | ldw -8($xfer),$nmhi0 | ||
| 565 | fstds ${fab0},-16($xfer) | ||
| 566 | fstds ${fnm0},-8($xfer) | ||
| 567 | |||
| 568 | ldo 8($idx),$idx ; j++++ | ||
| 569 | add $ablo,$nmlo0,$nmlo0 ; discarded | ||
| 570 | addc %r0,$nmhi0,$hi1 | ||
| 571 | ldw 4($xfer),$ablo | ||
| 572 | ldw 0($xfer),$abhi | ||
| 573 | nop | ||
| 574 | |||
| 575 | L\$1st_pa11 | ||
| 576 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] | ||
| 577 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
| 578 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
| 579 | flddx $idx($np),${fni} ; np[j,j+1] | ||
| 580 | add $hi0,$ablo,$ablo | ||
| 581 | ldw 12($xfer),$nmlo1 | ||
| 582 | addc %r0,$abhi,$hi0 | ||
| 583 | ldw 8($xfer),$nmhi1 | ||
| 584 | add $ablo,$nmlo1,$nmlo1 | ||
| 585 | fstds ${fab1},0($xfer) | ||
| 586 | addc %r0,$nmhi1,$nmhi1 | ||
| 587 | fstds ${fnm1},8($xfer) | ||
| 588 | add $hi1,$nmlo1,$nmlo1 | ||
| 589 | ldw -12($xfer),$ablo | ||
| 590 | addc %r0,$nmhi1,$hi1 | ||
| 591 | ldw -16($xfer),$abhi | ||
| 592 | |||
| 593 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
| 594 | ldw -4($xfer),$nmlo0 | ||
| 595 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 596 | ldw -8($xfer),$nmhi0 | ||
| 597 | add $hi0,$ablo,$ablo | ||
| 598 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 599 | addc %r0,$abhi,$hi0 | ||
| 600 | fstds ${fab0},-16($xfer) | ||
| 601 | add $ablo,$nmlo0,$nmlo0 | ||
| 602 | fstds ${fnm0},-8($xfer) | ||
| 603 | addc %r0,$nmhi0,$nmhi0 | ||
| 604 | ldw 0($xfer),$abhi | ||
| 605 | add $hi1,$nmlo0,$nmlo0 | ||
| 606 | ldw 4($xfer),$ablo | ||
| 607 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
| 608 | addib,<> 8,$idx,L\$1st_pa11 ; j++++ | ||
| 609 | addc %r0,$nmhi0,$hi1 | ||
| 610 | |||
| 611 | ldw 8($xfer),$nmhi1 | ||
| 612 | ldw 12($xfer),$nmlo1 | ||
| 613 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] | ||
| 614 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
| 615 | add $hi0,$ablo,$ablo | ||
| 616 | fstds ${fab1},0($xfer) | ||
| 617 | addc %r0,$abhi,$hi0 | ||
| 618 | fstds ${fnm1},8($xfer) | ||
| 619 | add $ablo,$nmlo1,$nmlo1 | ||
| 620 | ldw -16($xfer),$abhi | ||
| 621 | addc %r0,$nmhi1,$nmhi1 | ||
| 622 | ldw -12($xfer),$ablo | ||
| 623 | add $hi1,$nmlo1,$nmlo1 | ||
| 624 | ldw -8($xfer),$nmhi0 | ||
| 625 | addc %r0,$nmhi1,$hi1 | ||
| 626 | ldw -4($xfer),$nmlo0 | ||
| 627 | |||
| 628 | add $hi0,$ablo,$ablo | ||
| 629 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 630 | addc %r0,$abhi,$hi0 | ||
| 631 | ldw 0($xfer),$abhi | ||
| 632 | add $ablo,$nmlo0,$nmlo0 | ||
| 633 | ldw 4($xfer),$ablo | ||
| 634 | addc %r0,$nmhi0,$nmhi0 | ||
| 635 | ldws,mb 8($xfer),$nmhi1 | ||
| 636 | add $hi1,$nmlo0,$nmlo0 | ||
| 637 | ldw 4($xfer),$nmlo1 | ||
| 638 | addc %r0,$nmhi0,$hi1 | ||
| 639 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
| 640 | |||
| 641 | ldo -1($num),$num ; i-- | ||
| 642 | subi 0,$arrsz,$idx ; j=0 | ||
| 643 | |||
| 644 | fldws,ma 4($bp),${fbi} ; bp[1] | ||
| 645 | flddx $idx($ap),${fai} ; ap[0,1] | ||
| 646 | flddx $idx($np),${fni} ; np[0,1] | ||
| 647 | fldws 8($xfer),${fti}R ; tp[0] | ||
| 648 | add $hi0,$ablo,$ablo | ||
| 649 | addc %r0,$abhi,$hi0 | ||
| 650 | ldo 8($idx),$idx ; j++++ | ||
| 651 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] | ||
| 652 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] | ||
| 653 | add $hi1,$nmlo1,$nmlo1 | ||
| 654 | addc %r0,$nmhi1,$nmhi1 | ||
| 655 | add $ablo,$nmlo1,$nmlo1 | ||
| 656 | addc %r0,$nmhi1,$hi1 | ||
| 657 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
| 658 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 659 | |||
| 660 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
| 661 | fcpy,sgl %fr0,${fab0}L | ||
| 662 | add $hi1,$hi0,$hi0 | ||
| 663 | addc %r0,%r0,$hi1 | ||
| 664 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
| 665 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
| 666 | stw $hi0,0($tp) | ||
| 667 | stw $hi1,4($tp) | ||
| 668 | |||
| 669 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
| 670 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
| 671 | xmpyu ${fn0},${fab0}R,${fm0} | ||
| 672 | ldo `$LOCALS+32+4`($fp),$tp | ||
| 673 | L\$outer_pa11 | ||
| 674 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m | ||
| 675 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m | ||
| 676 | fstds ${fab0},-16($xfer) ; 33-bit value | ||
| 677 | fstds ${fnm0},-8($xfer) | ||
| 678 | flddx $idx($ap),${fai} ; ap[2,3] | ||
| 679 | flddx $idx($np),${fni} ; np[2,3] | ||
| 680 | ldw -16($xfer),$abhi ; carry bit actually | ||
| 681 | ldo 8($idx),$idx ; j++++ | ||
| 682 | ldw -12($xfer),$ablo | ||
| 683 | ldw -8($xfer),$nmhi0 | ||
| 684 | ldw -4($xfer),$nmlo0 | ||
| 685 | ldw 0($xfer),$hi0 ; high part | ||
| 686 | |||
| 687 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
| 688 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 689 | fstds ${fab1},0($xfer) | ||
| 690 | addl $abhi,$hi0,$hi0 ; account carry bit | ||
| 691 | fstds ${fnm1},8($xfer) | ||
| 692 | add $ablo,$nmlo0,$nmlo0 ; discarded | ||
| 693 | ldw 0($tp),$ti1 ; tp[1] | ||
| 694 | addc %r0,$nmhi0,$hi1 | ||
| 695 | fstds ${fab0},-16($xfer) | ||
| 696 | fstds ${fnm0},-8($xfer) | ||
| 697 | ldw 4($xfer),$ablo | ||
| 698 | ldw 0($xfer),$abhi | ||
| 699 | |||
| 700 | L\$inner_pa11 | ||
| 701 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] | ||
| 702 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
| 703 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
| 704 | flddx $idx($np),${fni} ; np[j,j+1] | ||
| 705 | add $hi0,$ablo,$ablo | ||
| 706 | ldw 4($tp),$ti0 ; tp[j] | ||
| 707 | addc %r0,$abhi,$abhi | ||
| 708 | ldw 12($xfer),$nmlo1 | ||
| 709 | add $ti1,$ablo,$ablo | ||
| 710 | ldw 8($xfer),$nmhi1 | ||
| 711 | addc %r0,$abhi,$hi0 | ||
| 712 | fstds ${fab1},0($xfer) | ||
| 713 | add $ablo,$nmlo1,$nmlo1 | ||
| 714 | fstds ${fnm1},8($xfer) | ||
| 715 | addc %r0,$nmhi1,$nmhi1 | ||
| 716 | ldw -12($xfer),$ablo | ||
| 717 | add $hi1,$nmlo1,$nmlo1 | ||
| 718 | ldw -16($xfer),$abhi | ||
| 719 | addc %r0,$nmhi1,$hi1 | ||
| 720 | |||
| 721 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
| 722 | ldw 8($tp),$ti1 ; tp[j] | ||
| 723 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 724 | ldw -4($xfer),$nmlo0 | ||
| 725 | add $hi0,$ablo,$ablo | ||
| 726 | ldw -8($xfer),$nmhi0 | ||
| 727 | addc %r0,$abhi,$abhi | ||
| 728 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 729 | add $ti0,$ablo,$ablo | ||
| 730 | fstds ${fab0},-16($xfer) | ||
| 731 | addc %r0,$abhi,$hi0 | ||
| 732 | fstds ${fnm0},-8($xfer) | ||
| 733 | add $ablo,$nmlo0,$nmlo0 | ||
| 734 | ldw 4($xfer),$ablo | ||
| 735 | addc %r0,$nmhi0,$nmhi0 | ||
| 736 | ldw 0($xfer),$abhi | ||
| 737 | add $hi1,$nmlo0,$nmlo0 | ||
| 738 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
| 739 | addib,<> 8,$idx,L\$inner_pa11 ; j++++ | ||
| 740 | addc %r0,$nmhi0,$hi1 | ||
| 741 | |||
| 742 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] | ||
| 743 | ldw 12($xfer),$nmlo1 | ||
| 744 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
| 745 | ldw 8($xfer),$nmhi1 | ||
| 746 | add $hi0,$ablo,$ablo | ||
| 747 | ldw 4($tp),$ti0 ; tp[j] | ||
| 748 | addc %r0,$abhi,$abhi | ||
| 749 | fstds ${fab1},0($xfer) | ||
| 750 | add $ti1,$ablo,$ablo | ||
| 751 | fstds ${fnm1},8($xfer) | ||
| 752 | addc %r0,$abhi,$hi0 | ||
| 753 | ldw -16($xfer),$abhi | ||
| 754 | add $ablo,$nmlo1,$nmlo1 | ||
| 755 | ldw -12($xfer),$ablo | ||
| 756 | addc %r0,$nmhi1,$nmhi1 | ||
| 757 | ldw -8($xfer),$nmhi0 | ||
| 758 | add $hi1,$nmlo1,$nmlo1 | ||
| 759 | ldw -4($xfer),$nmlo0 | ||
| 760 | addc %r0,$nmhi1,$hi1 | ||
| 761 | |||
| 762 | add $hi0,$ablo,$ablo | ||
| 763 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 764 | addc %r0,$abhi,$abhi | ||
| 765 | add $ti0,$ablo,$ablo | ||
| 766 | ldw 8($tp),$ti1 ; tp[j] | ||
| 767 | addc %r0,$abhi,$hi0 | ||
| 768 | ldw 0($xfer),$abhi | ||
| 769 | add $ablo,$nmlo0,$nmlo0 | ||
| 770 | ldw 4($xfer),$ablo | ||
| 771 | addc %r0,$nmhi0,$nmhi0 | ||
| 772 | ldws,mb 8($xfer),$nmhi1 | ||
| 773 | add $hi1,$nmlo0,$nmlo0 | ||
| 774 | ldw 4($xfer),$nmlo1 | ||
| 775 | addc %r0,$nmhi0,$hi1 | ||
| 776 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
| 777 | |||
| 778 | addib,= -1,$num,L\$outerdone_pa11; i-- | ||
| 779 | subi 0,$arrsz,$idx ; j=0 | ||
| 780 | |||
| 781 | fldws,ma 4($bp),${fbi} ; bp[i] | ||
| 782 | flddx $idx($ap),${fai} ; ap[0] | ||
| 783 | add $hi0,$ablo,$ablo | ||
| 784 | addc %r0,$abhi,$abhi | ||
| 785 | flddx $idx($np),${fni} ; np[0] | ||
| 786 | fldws 8($xfer),${fti}R ; tp[0] | ||
| 787 | add $ti1,$ablo,$ablo | ||
| 788 | addc %r0,$abhi,$hi0 | ||
| 789 | |||
| 790 | ldo 8($idx),$idx ; j++++ | ||
| 791 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] | ||
| 792 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] | ||
| 793 | ldw 4($tp),$ti0 ; tp[j] | ||
| 794 | |||
| 795 | add $hi1,$nmlo1,$nmlo1 | ||
| 796 | addc %r0,$nmhi1,$nmhi1 | ||
| 797 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
| 798 | add $ablo,$nmlo1,$nmlo1 | ||
| 799 | addc %r0,$nmhi1,$hi1 | ||
| 800 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
| 801 | fcpy,sgl %fr0,${fab0}L | ||
| 802 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 803 | |||
| 804 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
| 805 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
| 806 | add $hi1,$hi0,$hi0 | ||
| 807 | addc %r0,%r0,$hi1 | ||
| 808 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
| 809 | add $ti0,$hi0,$hi0 | ||
| 810 | addc %r0,$hi1,$hi1 | ||
| 811 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
| 812 | stw $hi0,0($tp) | ||
| 813 | stw $hi1,4($tp) | ||
| 814 | xmpyu ${fn0},${fab0}R,${fm0} | ||
| 815 | |||
| 816 | b L\$outer_pa11 | ||
| 817 | ldo `$LOCALS+32+4`($fp),$tp | ||
| 818 | |||
| 819 | L\$outerdone_pa11 | ||
| 820 | add $hi0,$ablo,$ablo | ||
| 821 | addc %r0,$abhi,$abhi | ||
| 822 | add $ti1,$ablo,$ablo | ||
| 823 | addc %r0,$abhi,$hi0 | ||
| 824 | |||
| 825 | ldw 4($tp),$ti0 ; tp[j] | ||
| 826 | |||
| 827 | add $hi1,$nmlo1,$nmlo1 | ||
| 828 | addc %r0,$nmhi1,$nmhi1 | ||
| 829 | add $ablo,$nmlo1,$nmlo1 | ||
| 830 | addc %r0,$nmhi1,$hi1 | ||
| 831 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 832 | |||
| 833 | add $hi1,$hi0,$hi0 | ||
| 834 | addc %r0,%r0,$hi1 | ||
| 835 | add $ti0,$hi0,$hi0 | ||
| 836 | addc %r0,$hi1,$hi1 | ||
| 837 | stw $hi0,0($tp) | ||
| 838 | stw $hi1,4($tp) | ||
| 839 | |||
| 840 | ldo `$LOCALS+32+4`($fp),$tp | ||
| 841 | sub %r0,%r0,%r0 ; clear borrow | ||
| 842 | ldw -4($tp),$ti0 | ||
| 843 | addl $tp,$arrsz,$tp | ||
| 844 | L\$sub_pa11 | ||
| 845 | ldwx $idx($np),$hi0 | ||
| 846 | subb $ti0,$hi0,$hi1 | ||
| 847 | ldwx $idx($tp),$ti0 | ||
| 848 | addib,<> 4,$idx,L\$sub_pa11 | ||
| 849 | stws,ma $hi1,4($rp) | ||
| 850 | |||
| 851 | subb $ti0,%r0,$hi1 | ||
| 852 | ldo -4($tp),$tp | ||
| 853 | and $tp,$hi1,$ap | ||
| 854 | andcm $rp,$hi1,$bp | ||
| 855 | or $ap,$bp,$np | ||
| 856 | |||
| 857 | sub $rp,$arrsz,$rp ; rewind rp | ||
| 858 | subi 0,$arrsz,$idx | ||
| 859 | ldo `$LOCALS+32`($fp),$tp | ||
| 860 | L\$copy_pa11 | ||
| 861 | ldwx $idx($np),$hi0 | ||
| 862 | stws,ma %r0,4($tp) | ||
| 863 | addib,<> 4,$idx,L\$copy_pa11 | ||
| 864 | stws,ma $hi0,4($rp) | ||
| 865 | |||
| 866 | nop ; alignment | ||
| 867 | L\$done | ||
| 868 | ___ | ||
| 869 | } | ||
| 870 | |||
| 871 | $code.=<<___; | ||
| 872 | ldi 1,%r28 ; signal "handled" | ||
| 873 | ldo $FRAME($fp),%sp ; destroy tp[num+1] | ||
| 874 | |||
| 875 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 876 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 877 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 878 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 879 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 880 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 881 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 882 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 883 | L\$abort | ||
| 884 | bv (%r2) | ||
| 885 | .EXIT | ||
| 886 | $POPMB -$FRAME(%sp),%r3 | ||
| 887 | .PROCEND | ||
| 888 | .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 889 | ___ | ||
| 890 | |||
| 891 | # Explicitly encode PA-RISC 2.0 instructions used in this module, so | ||
| 892 | # that it can be compiled with .LEVEL 1.0. It should be noted that I | ||
| 893 | # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 | ||
| 894 | # directive... | ||
| 895 | |||
| 896 | my $ldd = sub { | ||
| 897 | my ($mod,$args) = @_; | ||
| 898 | my $orig = "ldd$mod\t$args"; | ||
| 899 | |||
| 900 | if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 | ||
| 901 | { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; | ||
| 902 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 903 | } | ||
| 904 | elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 | ||
| 905 | { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; | ||
| 906 | $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset | ||
| 907 | $opcode|=(1<<5) if ($mod =~ /^,m/); | ||
| 908 | $opcode|=(1<<13) if ($mod =~ /^,mb/); | ||
| 909 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 910 | } | ||
| 911 | else { "\t".$orig; } | ||
| 912 | }; | ||
| 913 | |||
| 914 | my $std = sub { | ||
| 915 | my ($mod,$args) = @_; | ||
| 916 | my $orig = "std$mod\t$args"; | ||
| 917 | |||
| 918 | if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6 | ||
| 919 | { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6); | ||
| 920 | $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset | ||
| 921 | $opcode|=(1<<5) if ($mod =~ /^,m/); | ||
| 922 | $opcode|=(1<<13) if ($mod =~ /^,mb/); | ||
| 923 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 924 | } | ||
| 925 | else { "\t".$orig; } | ||
| 926 | }; | ||
| 927 | |||
| 928 | my $extrd = sub { | ||
| 929 | my ($mod,$args) = @_; | ||
| 930 | my $orig = "extrd$mod\t$args"; | ||
| 931 | |||
| 932 | # I only have ",u" completer, it's implicitly encoded... | ||
| 933 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 | ||
| 934 | { my $opcode=(0x36<<26)|($1<<21)|($4<<16); | ||
| 935 | my $len=32-$3; | ||
| 936 | $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos | ||
| 937 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
| 938 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 939 | } | ||
| 940 | elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 | ||
| 941 | { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); | ||
| 942 | my $len=32-$2; | ||
| 943 | $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len | ||
| 944 | $opcode |= (1<<13) if ($mod =~ /,\**=/); | ||
| 945 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 946 | } | ||
| 947 | else { "\t".$orig; } | ||
| 948 | }; | ||
| 949 | |||
| 950 | my $shrpd = sub { | ||
| 951 | my ($mod,$args) = @_; | ||
| 952 | my $orig = "shrpd$mod\t$args"; | ||
| 953 | |||
| 954 | if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 | ||
| 955 | { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; | ||
| 956 | my $cpos=63-$3; | ||
| 957 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa | ||
| 958 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 959 | } | ||
| 960 | else { "\t".$orig; } | ||
| 961 | }; | ||
| 962 | |||
| 963 | my $sub = sub { | ||
| 964 | my ($mod,$args) = @_; | ||
| 965 | my $orig = "sub$mod\t$args"; | ||
| 966 | |||
| 967 | if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) { | ||
| 968 | my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3; | ||
| 969 | $opcode|=(1<<10); # e1 | ||
| 970 | $opcode|=(1<<8); # e2 | ||
| 971 | $opcode|=(1<<5); # d | ||
| 972 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig | ||
| 973 | } | ||
| 974 | else { "\t".$orig; } | ||
| 975 | }; | ||
| 976 | |||
| 977 | sub assemble { | ||
| 978 | my ($mnemonic,$mod,$args)=@_; | ||
| 979 | my $opcode = eval("\$$mnemonic"); | ||
| 980 | |||
| 981 | ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; | ||
| 982 | } | ||
| 983 | |||
| 984 | foreach (split("\n",$code)) { | ||
| 985 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 986 | # flip word order in 64-bit mode... | ||
| 987 | s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8); | ||
| 988 | # assemble 2.0 instructions in 32-bit mode... | ||
| 989 | s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4); | ||
| 990 | |||
| 991 | print $_,"\n"; | ||
| 992 | } | ||
| 993 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl index 7849eae959..f9b6992ccc 100644 --- a/src/lib/libcrypto/bn/asm/ppc-mont.pl +++ b/src/lib/libcrypto/bn/asm/ppc-mont.pl | |||
| @@ -31,7 +31,6 @@ if ($flavour =~ /32/) { | |||
| 31 | $BNSZ= $BITS/8; | 31 | $BNSZ= $BITS/8; |
| 32 | $SIZE_T=4; | 32 | $SIZE_T=4; |
| 33 | $RZONE= 224; | 33 | $RZONE= 224; |
| 34 | $FRAME= $SIZE_T*16; | ||
| 35 | 34 | ||
| 36 | $LD= "lwz"; # load | 35 | $LD= "lwz"; # load |
| 37 | $LDU= "lwzu"; # load and update | 36 | $LDU= "lwzu"; # load and update |
| @@ -51,7 +50,6 @@ if ($flavour =~ /32/) { | |||
| 51 | $BNSZ= $BITS/8; | 50 | $BNSZ= $BITS/8; |
| 52 | $SIZE_T=8; | 51 | $SIZE_T=8; |
| 53 | $RZONE= 288; | 52 | $RZONE= 288; |
| 54 | $FRAME= $SIZE_T*16; | ||
| 55 | 53 | ||
| 56 | # same as above, but 64-bit mnemonics... | 54 | # same as above, but 64-bit mnemonics... |
| 57 | $LD= "ld"; # load | 55 | $LD= "ld"; # load |
| @@ -69,6 +67,9 @@ if ($flavour =~ /32/) { | |||
| 69 | $POP= $LD; | 67 | $POP= $LD; |
| 70 | } else { die "nonsense $flavour"; } | 68 | } else { die "nonsense $flavour"; } |
| 71 | 69 | ||
| 70 | $FRAME=8*$SIZE_T+$RZONE; | ||
| 71 | $LOCALS=8*$SIZE_T; | ||
| 72 | |||
| 72 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | 73 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 73 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | 74 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or |
| 74 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | 75 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or |
| @@ -89,18 +90,18 @@ $aj="r10"; | |||
| 89 | $nj="r11"; | 90 | $nj="r11"; |
| 90 | $tj="r12"; | 91 | $tj="r12"; |
| 91 | # non-volatile registers | 92 | # non-volatile registers |
| 92 | $i="r14"; | 93 | $i="r20"; |
| 93 | $j="r15"; | 94 | $j="r21"; |
| 94 | $tp="r16"; | 95 | $tp="r22"; |
| 95 | $m0="r17"; | 96 | $m0="r23"; |
| 96 | $m1="r18"; | 97 | $m1="r24"; |
| 97 | $lo0="r19"; | 98 | $lo0="r25"; |
| 98 | $hi0="r20"; | 99 | $hi0="r26"; |
| 99 | $lo1="r21"; | 100 | $lo1="r27"; |
| 100 | $hi1="r22"; | 101 | $hi1="r28"; |
| 101 | $alo="r23"; | 102 | $alo="r29"; |
| 102 | $ahi="r24"; | 103 | $ahi="r30"; |
| 103 | $nlo="r25"; | 104 | $nlo="r31"; |
| 104 | # | 105 | # |
| 105 | $nhi="r0"; | 106 | $nhi="r0"; |
| 106 | 107 | ||
| @@ -108,42 +109,48 @@ $code=<<___; | |||
| 108 | .machine "any" | 109 | .machine "any" |
| 109 | .text | 110 | .text |
| 110 | 111 | ||
| 111 | .globl .bn_mul_mont | 112 | .globl .bn_mul_mont_int |
| 112 | .align 4 | 113 | .align 4 |
| 113 | .bn_mul_mont: | 114 | .bn_mul_mont_int: |
| 114 | cmpwi $num,4 | 115 | cmpwi $num,4 |
| 115 | mr $rp,r3 ; $rp is reassigned | 116 | mr $rp,r3 ; $rp is reassigned |
| 116 | li r3,0 | 117 | li r3,0 |
| 117 | bltlr | 118 | bltlr |
| 118 | 119 | ___ | |
| 120 | $code.=<<___ if ($BNSZ==4); | ||
| 121 | cmpwi $num,32 ; longer key performance is not better | ||
| 122 | bgelr | ||
| 123 | ___ | ||
| 124 | $code.=<<___; | ||
| 119 | slwi $num,$num,`log($BNSZ)/log(2)` | 125 | slwi $num,$num,`log($BNSZ)/log(2)` |
| 120 | li $tj,-4096 | 126 | li $tj,-4096 |
| 121 | addi $ovf,$num,`$FRAME+$RZONE` | 127 | addi $ovf,$num,$FRAME |
| 122 | subf $ovf,$ovf,$sp ; $sp-$ovf | 128 | subf $ovf,$ovf,$sp ; $sp-$ovf |
| 123 | and $ovf,$ovf,$tj ; minimize TLB usage | 129 | and $ovf,$ovf,$tj ; minimize TLB usage |
| 124 | subf $ovf,$sp,$ovf ; $ovf-$sp | 130 | subf $ovf,$sp,$ovf ; $ovf-$sp |
| 131 | mr $tj,$sp | ||
| 125 | srwi $num,$num,`log($BNSZ)/log(2)` | 132 | srwi $num,$num,`log($BNSZ)/log(2)` |
| 126 | $STUX $sp,$sp,$ovf | 133 | $STUX $sp,$sp,$ovf |
| 127 | 134 | ||
| 128 | $PUSH r14,`4*$SIZE_T`($sp) | 135 | $PUSH r20,`-12*$SIZE_T`($tj) |
| 129 | $PUSH r15,`5*$SIZE_T`($sp) | 136 | $PUSH r21,`-11*$SIZE_T`($tj) |
| 130 | $PUSH r16,`6*$SIZE_T`($sp) | 137 | $PUSH r22,`-10*$SIZE_T`($tj) |
| 131 | $PUSH r17,`7*$SIZE_T`($sp) | 138 | $PUSH r23,`-9*$SIZE_T`($tj) |
| 132 | $PUSH r18,`8*$SIZE_T`($sp) | 139 | $PUSH r24,`-8*$SIZE_T`($tj) |
| 133 | $PUSH r19,`9*$SIZE_T`($sp) | 140 | $PUSH r25,`-7*$SIZE_T`($tj) |
| 134 | $PUSH r20,`10*$SIZE_T`($sp) | 141 | $PUSH r26,`-6*$SIZE_T`($tj) |
| 135 | $PUSH r21,`11*$SIZE_T`($sp) | 142 | $PUSH r27,`-5*$SIZE_T`($tj) |
| 136 | $PUSH r22,`12*$SIZE_T`($sp) | 143 | $PUSH r28,`-4*$SIZE_T`($tj) |
| 137 | $PUSH r23,`13*$SIZE_T`($sp) | 144 | $PUSH r29,`-3*$SIZE_T`($tj) |
| 138 | $PUSH r24,`14*$SIZE_T`($sp) | 145 | $PUSH r30,`-2*$SIZE_T`($tj) |
| 139 | $PUSH r25,`15*$SIZE_T`($sp) | 146 | $PUSH r31,`-1*$SIZE_T`($tj) |
| 140 | 147 | ||
| 141 | $LD $n0,0($n0) ; pull n0[0] value | 148 | $LD $n0,0($n0) ; pull n0[0] value |
| 142 | addi $num,$num,-2 ; adjust $num for counter register | 149 | addi $num,$num,-2 ; adjust $num for counter register |
| 143 | 150 | ||
| 144 | $LD $m0,0($bp) ; m0=bp[0] | 151 | $LD $m0,0($bp) ; m0=bp[0] |
| 145 | $LD $aj,0($ap) ; ap[0] | 152 | $LD $aj,0($ap) ; ap[0] |
| 146 | addi $tp,$sp,$FRAME | 153 | addi $tp,$sp,$LOCALS |
| 147 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] | 154 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] |
| 148 | $UMULH $hi0,$aj,$m0 | 155 | $UMULH $hi0,$aj,$m0 |
| 149 | 156 | ||
| @@ -205,8 +212,8 @@ L1st: | |||
| 205 | Louter: | 212 | Louter: |
| 206 | $LDX $m0,$bp,$i ; m0=bp[i] | 213 | $LDX $m0,$bp,$i ; m0=bp[i] |
| 207 | $LD $aj,0($ap) ; ap[0] | 214 | $LD $aj,0($ap) ; ap[0] |
| 208 | addi $tp,$sp,$FRAME | 215 | addi $tp,$sp,$LOCALS |
| 209 | $LD $tj,$FRAME($sp) ; tp[0] | 216 | $LD $tj,$LOCALS($sp); tp[0] |
| 210 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] | 217 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] |
| 211 | $UMULH $hi0,$aj,$m0 | 218 | $UMULH $hi0,$aj,$m0 |
| 212 | $LD $aj,$BNSZ($ap) ; ap[1] | 219 | $LD $aj,$BNSZ($ap) ; ap[1] |
| @@ -273,7 +280,7 @@ Linner: | |||
| 273 | 280 | ||
| 274 | addi $num,$num,2 ; restore $num | 281 | addi $num,$num,2 ; restore $num |
| 275 | subfc $j,$j,$j ; j=0 and "clear" XER[CA] | 282 | subfc $j,$j,$j ; j=0 and "clear" XER[CA] |
| 276 | addi $tp,$sp,$FRAME | 283 | addi $tp,$sp,$LOCALS |
| 277 | mtctr $num | 284 | mtctr $num |
| 278 | 285 | ||
| 279 | .align 4 | 286 | .align 4 |
| @@ -299,23 +306,27 @@ Lcopy: ; copy or in-place refresh | |||
| 299 | addi $j,$j,$BNSZ | 306 | addi $j,$j,$BNSZ |
| 300 | bdnz- Lcopy | 307 | bdnz- Lcopy |
| 301 | 308 | ||
| 302 | $POP r14,`4*$SIZE_T`($sp) | 309 | $POP $tj,0($sp) |
| 303 | $POP r15,`5*$SIZE_T`($sp) | ||
| 304 | $POP r16,`6*$SIZE_T`($sp) | ||
| 305 | $POP r17,`7*$SIZE_T`($sp) | ||
| 306 | $POP r18,`8*$SIZE_T`($sp) | ||
| 307 | $POP r19,`9*$SIZE_T`($sp) | ||
| 308 | $POP r20,`10*$SIZE_T`($sp) | ||
| 309 | $POP r21,`11*$SIZE_T`($sp) | ||
| 310 | $POP r22,`12*$SIZE_T`($sp) | ||
| 311 | $POP r23,`13*$SIZE_T`($sp) | ||
| 312 | $POP r24,`14*$SIZE_T`($sp) | ||
| 313 | $POP r25,`15*$SIZE_T`($sp) | ||
| 314 | $POP $sp,0($sp) | ||
| 315 | li r3,1 | 310 | li r3,1 |
| 311 | $POP r20,`-12*$SIZE_T`($tj) | ||
| 312 | $POP r21,`-11*$SIZE_T`($tj) | ||
| 313 | $POP r22,`-10*$SIZE_T`($tj) | ||
| 314 | $POP r23,`-9*$SIZE_T`($tj) | ||
| 315 | $POP r24,`-8*$SIZE_T`($tj) | ||
| 316 | $POP r25,`-7*$SIZE_T`($tj) | ||
| 317 | $POP r26,`-6*$SIZE_T`($tj) | ||
| 318 | $POP r27,`-5*$SIZE_T`($tj) | ||
| 319 | $POP r28,`-4*$SIZE_T`($tj) | ||
| 320 | $POP r29,`-3*$SIZE_T`($tj) | ||
| 321 | $POP r30,`-2*$SIZE_T`($tj) | ||
| 322 | $POP r31,`-1*$SIZE_T`($tj) | ||
| 323 | mr $sp,$tj | ||
| 316 | blr | 324 | blr |
| 317 | .long 0 | 325 | .long 0 |
| 318 | .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" | 326 | .byte 0,12,4,0,0x80,12,6,0 |
| 327 | .long 0 | ||
| 328 | |||
| 329 | .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 319 | ___ | 330 | ___ |
| 320 | 331 | ||
| 321 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 332 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/bn/asm/ppc.pl b/src/lib/libcrypto/bn/asm/ppc.pl index f4093177e6..1249ce2299 100644 --- a/src/lib/libcrypto/bn/asm/ppc.pl +++ b/src/lib/libcrypto/bn/asm/ppc.pl | |||
| @@ -389,7 +389,9 @@ $data=<<EOF; | |||
| 389 | $ST r9,`6*$BNSZ`(r3) #r[6]=c1 | 389 | $ST r9,`6*$BNSZ`(r3) #r[6]=c1 |
| 390 | $ST r10,`7*$BNSZ`(r3) #r[7]=c2 | 390 | $ST r10,`7*$BNSZ`(r3) #r[7]=c2 |
| 391 | blr | 391 | blr |
| 392 | .long 0x00000000 | 392 | .long 0 |
| 393 | .byte 0,12,0x14,0,0,0,2,0 | ||
| 394 | .long 0 | ||
| 393 | 395 | ||
| 394 | # | 396 | # |
| 395 | # NOTE: The following label name should be changed to | 397 | # NOTE: The following label name should be changed to |
| @@ -814,8 +816,9 @@ $data=<<EOF; | |||
| 814 | 816 | ||
| 815 | 817 | ||
| 816 | blr | 818 | blr |
| 817 | 819 | .long 0 | |
| 818 | .long 0x00000000 | 820 | .byte 0,12,0x14,0,0,0,2,0 |
| 821 | .long 0 | ||
| 819 | 822 | ||
| 820 | # | 823 | # |
| 821 | # NOTE: The following label name should be changed to | 824 | # NOTE: The following label name should be changed to |
| @@ -966,7 +969,9 @@ $data=<<EOF; | |||
| 966 | $ST r10,`6*$BNSZ`(r3) #r[6]=c1 | 969 | $ST r10,`6*$BNSZ`(r3) #r[6]=c1 |
| 967 | $ST r11,`7*$BNSZ`(r3) #r[7]=c2 | 970 | $ST r11,`7*$BNSZ`(r3) #r[7]=c2 |
| 968 | blr | 971 | blr |
| 969 | .long 0x00000000 | 972 | .long 0 |
| 973 | .byte 0,12,0x14,0,0,0,3,0 | ||
| 974 | .long 0 | ||
| 970 | 975 | ||
| 971 | # | 976 | # |
| 972 | # NOTE: The following label name should be changed to | 977 | # NOTE: The following label name should be changed to |
| @@ -1502,7 +1507,9 @@ $data=<<EOF; | |||
| 1502 | $ST r12,`14*$BNSZ`(r3) #r[14]=c3; | 1507 | $ST r12,`14*$BNSZ`(r3) #r[14]=c3; |
| 1503 | $ST r10,`15*$BNSZ`(r3) #r[15]=c1; | 1508 | $ST r10,`15*$BNSZ`(r3) #r[15]=c1; |
| 1504 | blr | 1509 | blr |
| 1505 | .long 0x00000000 | 1510 | .long 0 |
| 1511 | .byte 0,12,0x14,0,0,0,3,0 | ||
| 1512 | .long 0 | ||
| 1506 | 1513 | ||
| 1507 | # | 1514 | # |
| 1508 | # NOTE: The following label name should be changed to | 1515 | # NOTE: The following label name should be changed to |
| @@ -1550,8 +1557,9 @@ Lppcasm_sub_adios: | |||
| 1550 | subfze r3,r0 # if carry bit is set then r3 = 0 else -1 | 1557 | subfze r3,r0 # if carry bit is set then r3 = 0 else -1 |
| 1551 | andi. r3,r3,1 # keep only last bit. | 1558 | andi. r3,r3,1 # keep only last bit. |
| 1552 | blr | 1559 | blr |
| 1553 | .long 0x00000000 | 1560 | .long 0 |
| 1554 | 1561 | .byte 0,12,0x14,0,0,0,4,0 | |
| 1562 | .long 0 | ||
| 1555 | 1563 | ||
| 1556 | # | 1564 | # |
| 1557 | # NOTE: The following label name should be changed to | 1565 | # NOTE: The following label name should be changed to |
| @@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop: | |||
| 1594 | Lppcasm_add_adios: | 1602 | Lppcasm_add_adios: |
| 1595 | addze r3,r0 #return carry bit. | 1603 | addze r3,r0 #return carry bit. |
| 1596 | blr | 1604 | blr |
| 1597 | .long 0x00000000 | 1605 | .long 0 |
| 1606 | .byte 0,12,0x14,0,0,0,4,0 | ||
| 1607 | .long 0 | ||
| 1598 | 1608 | ||
| 1599 | # | 1609 | # |
| 1600 | # NOTE: The following label name should be changed to | 1610 | # NOTE: The following label name should be changed to |
| @@ -1707,7 +1717,9 @@ Lppcasm_div8: | |||
| 1707 | Lppcasm_div9: | 1717 | Lppcasm_div9: |
| 1708 | or r3,r8,r0 | 1718 | or r3,r8,r0 |
| 1709 | blr | 1719 | blr |
| 1710 | .long 0x00000000 | 1720 | .long 0 |
| 1721 | .byte 0,12,0x14,0,0,0,3,0 | ||
| 1722 | .long 0 | ||
| 1711 | 1723 | ||
| 1712 | # | 1724 | # |
| 1713 | # NOTE: The following label name should be changed to | 1725 | # NOTE: The following label name should be changed to |
| @@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop: | |||
| 1746 | bdnz- Lppcasm_sqr_mainloop | 1758 | bdnz- Lppcasm_sqr_mainloop |
| 1747 | Lppcasm_sqr_adios: | 1759 | Lppcasm_sqr_adios: |
| 1748 | blr | 1760 | blr |
| 1749 | .long 0x00000000 | 1761 | .long 0 |
| 1750 | 1762 | .byte 0,12,0x14,0,0,0,3,0 | |
| 1763 | .long 0 | ||
| 1751 | 1764 | ||
| 1752 | # | 1765 | # |
| 1753 | # NOTE: The following label name should be changed to | 1766 | # NOTE: The following label name should be changed to |
| @@ -1850,7 +1863,9 @@ Lppcasm_mw_REM: | |||
| 1850 | Lppcasm_mw_OVER: | 1863 | Lppcasm_mw_OVER: |
| 1851 | addi r3,r12,0 | 1864 | addi r3,r12,0 |
| 1852 | blr | 1865 | blr |
| 1853 | .long 0x00000000 | 1866 | .long 0 |
| 1867 | .byte 0,12,0x14,0,0,0,4,0 | ||
| 1868 | .long 0 | ||
| 1854 | 1869 | ||
| 1855 | # | 1870 | # |
| 1856 | # NOTE: The following label name should be changed to | 1871 | # NOTE: The following label name should be changed to |
| @@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover: | |||
| 1973 | Lppcasm_maw_adios: | 1988 | Lppcasm_maw_adios: |
| 1974 | addi r3,r12,0 | 1989 | addi r3,r12,0 |
| 1975 | blr | 1990 | blr |
| 1976 | .long 0x00000000 | 1991 | .long 0 |
| 1992 | .byte 0,12,0x14,0,0,0,4,0 | ||
| 1993 | .long 0 | ||
| 1977 | .align 4 | 1994 | .align 4 |
| 1978 | EOF | 1995 | EOF |
| 1979 | $data =~ s/\`([^\`]*)\`/eval $1/gem; | 1996 | $data =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/bn/asm/ppc64-mont.pl b/src/lib/libcrypto/bn/asm/ppc64-mont.pl index 3449b35855..a14e769ad0 100644 --- a/src/lib/libcrypto/bn/asm/ppc64-mont.pl +++ b/src/lib/libcrypto/bn/asm/ppc64-mont.pl | |||
| @@ -45,23 +45,40 @@ | |||
| 45 | # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive | 45 | # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive |
| 46 | # in absolute terms, but it's apparently the way Power 6 is... | 46 | # in absolute terms, but it's apparently the way Power 6 is... |
| 47 | 47 | ||
| 48 | # December 2009 | ||
| 49 | |||
| 50 | # Adapted for 32-bit build this module delivers 25-120%, yes, more | ||
| 51 | # than *twice* for longer keys, performance improvement over 32-bit | ||
| 52 | # ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes | ||
| 53 | # even 64-bit integer operations and the trouble is that most PPC | ||
| 54 | # operating systems don't preserve upper halves of general purpose | ||
| 55 | # registers upon 32-bit signal delivery. They do preserve them upon | ||
| 56 | # context switch, but not signalling:-( This means that asynchronous | ||
| 57 | # signals have to be blocked upon entry to this subroutine. Signal | ||
| 58 | # masking (and of course complementary unmasking) has quite an impact | ||
| 59 | # on performance, naturally larger for shorter keys. It's so severe | ||
| 60 | # that 512-bit key performance can be as low as 1/3 of expected one. | ||
| 61 | # This is why this routine can be engaged for longer key operations | ||
| 62 | # only on these OSes, see crypto/ppccap.c for further details. MacOS X | ||
| 63 | # is an exception from this and doesn't require signal masking, and | ||
| 64 | # that's where above improvement coefficients were collected. For | ||
| 65 | # others alternative would be to break dependence on upper halves of | ||
| 66 | # GPRs by sticking to 32-bit integer operations... | ||
| 67 | |||
| 48 | $flavour = shift; | 68 | $flavour = shift; |
| 49 | 69 | ||
| 50 | if ($flavour =~ /32/) { | 70 | if ($flavour =~ /32/) { |
| 51 | $SIZE_T=4; | 71 | $SIZE_T=4; |
| 52 | $RZONE= 224; | 72 | $RZONE= 224; |
| 53 | $FRAME= $SIZE_T*12+8*12; | 73 | $fname= "bn_mul_mont_fpu64"; |
| 54 | $fname= "bn_mul_mont_ppc64"; | ||
| 55 | 74 | ||
| 56 | $STUX= "stwux"; # store indexed and update | 75 | $STUX= "stwux"; # store indexed and update |
| 57 | $PUSH= "stw"; | 76 | $PUSH= "stw"; |
| 58 | $POP= "lwz"; | 77 | $POP= "lwz"; |
| 59 | die "not implemented yet"; | ||
| 60 | } elsif ($flavour =~ /64/) { | 78 | } elsif ($flavour =~ /64/) { |
| 61 | $SIZE_T=8; | 79 | $SIZE_T=8; |
| 62 | $RZONE= 288; | 80 | $RZONE= 288; |
| 63 | $FRAME= $SIZE_T*12+8*12; | 81 | $fname= "bn_mul_mont_fpu64"; |
| 64 | $fname= "bn_mul_mont"; | ||
| 65 | 82 | ||
| 66 | # same as above, but 64-bit mnemonics... | 83 | # same as above, but 64-bit mnemonics... |
| 67 | $STUX= "stdux"; # store indexed and update | 84 | $STUX= "stdux"; # store indexed and update |
| @@ -76,7 +93,7 @@ die "can't locate ppc-xlate.pl"; | |||
| 76 | 93 | ||
| 77 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; | 94 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; |
| 78 | 95 | ||
| 79 | $FRAME=($FRAME+63)&~63; | 96 | $FRAME=64; # padded frame header |
| 80 | $TRANSFER=16*8; | 97 | $TRANSFER=16*8; |
| 81 | 98 | ||
| 82 | $carry="r0"; | 99 | $carry="r0"; |
| @@ -93,16 +110,16 @@ $tp="r10"; | |||
| 93 | $j="r11"; | 110 | $j="r11"; |
| 94 | $i="r12"; | 111 | $i="r12"; |
| 95 | # non-volatile registers | 112 | # non-volatile registers |
| 96 | $nap_d="r14"; # interleaved ap and np in double format | 113 | $nap_d="r22"; # interleaved ap and np in double format |
| 97 | $a0="r15"; # ap[0] | 114 | $a0="r23"; # ap[0] |
| 98 | $t0="r16"; # temporary registers | 115 | $t0="r24"; # temporary registers |
| 99 | $t1="r17"; | 116 | $t1="r25"; |
| 100 | $t2="r18"; | 117 | $t2="r26"; |
| 101 | $t3="r19"; | 118 | $t3="r27"; |
| 102 | $t4="r20"; | 119 | $t4="r28"; |
| 103 | $t5="r21"; | 120 | $t5="r29"; |
| 104 | $t6="r22"; | 121 | $t6="r30"; |
| 105 | $t7="r23"; | 122 | $t7="r31"; |
| 106 | 123 | ||
| 107 | # PPC offers enough register bank capacity to unroll inner loops twice | 124 | # PPC offers enough register bank capacity to unroll inner loops twice |
| 108 | # | 125 | # |
| @@ -132,28 +149,17 @@ $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; | |||
| 132 | $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; | 149 | $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; |
| 133 | $dota="f8"; $dotb="f9"; | 150 | $dota="f8"; $dotb="f9"; |
| 134 | $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; | 151 | $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; |
| 135 | $N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; | 152 | $N0="f20"; $N1="f21"; $N2="f22"; $N3="f23"; |
| 136 | $T0a="f18"; $T0b="f19"; | 153 | $T0a="f24"; $T0b="f25"; |
| 137 | $T1a="f20"; $T1b="f21"; | 154 | $T1a="f26"; $T1b="f27"; |
| 138 | $T2a="f22"; $T2b="f23"; | 155 | $T2a="f28"; $T2b="f29"; |
| 139 | $T3a="f24"; $T3b="f25"; | 156 | $T3a="f30"; $T3b="f31"; |
| 140 | 157 | ||
| 141 | # sp----------->+-------------------------------+ | 158 | # sp----------->+-------------------------------+ |
| 142 | # | saved sp | | 159 | # | saved sp | |
| 143 | # +-------------------------------+ | 160 | # +-------------------------------+ |
| 144 | # | | | ||
| 145 | # +-------------------------------+ | ||
| 146 | # | 10 saved gpr, r14-r23 | | ||
| 147 | # . . | ||
| 148 | # . . | ||
| 149 | # +12*size_t +-------------------------------+ | ||
| 150 | # | 12 saved fpr, f14-f25 | | ||
| 151 | # . . | 161 | # . . |
| 152 | # . . | 162 | # +64 +-------------------------------+ |
| 153 | # +12*8 +-------------------------------+ | ||
| 154 | # | padding to 64 byte boundary | | ||
| 155 | # . . | ||
| 156 | # +X +-------------------------------+ | ||
| 157 | # | 16 gpr<->fpr transfer zone | | 163 | # | 16 gpr<->fpr transfer zone | |
| 158 | # . . | 164 | # . . |
| 159 | # . . | 165 | # . . |
| @@ -173,6 +179,16 @@ $T3a="f24"; $T3b="f25"; | |||
| 173 | # . . | 179 | # . . |
| 174 | # . . | 180 | # . . |
| 175 | # +-------------------------------+ | 181 | # +-------------------------------+ |
| 182 | # . . | ||
| 183 | # -12*size_t +-------------------------------+ | ||
| 184 | # | 10 saved gpr, r22-r31 | | ||
| 185 | # . . | ||
| 186 | # . . | ||
| 187 | # -12*8 +-------------------------------+ | ||
| 188 | # | 12 saved fpr, f20-f31 | | ||
| 189 | # . . | ||
| 190 | # . . | ||
| 191 | # +-------------------------------+ | ||
| 176 | 192 | ||
| 177 | $code=<<___; | 193 | $code=<<___; |
| 178 | .machine "any" | 194 | .machine "any" |
| @@ -181,14 +197,14 @@ $code=<<___; | |||
| 181 | .globl .$fname | 197 | .globl .$fname |
| 182 | .align 5 | 198 | .align 5 |
| 183 | .$fname: | 199 | .$fname: |
| 184 | cmpwi $num,4 | 200 | cmpwi $num,`3*8/$SIZE_T` |
| 185 | mr $rp,r3 ; $rp is reassigned | 201 | mr $rp,r3 ; $rp is reassigned |
| 186 | li r3,0 ; possible "not handled" return code | 202 | li r3,0 ; possible "not handled" return code |
| 187 | bltlr- | 203 | bltlr- |
| 188 | andi. r0,$num,1 ; $num has to be even | 204 | andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even" |
| 189 | bnelr- | 205 | bnelr- |
| 190 | 206 | ||
| 191 | slwi $num,$num,3 ; num*=8 | 207 | slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG) |
| 192 | li $i,-4096 | 208 | li $i,-4096 |
| 193 | slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num | 209 | slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num |
| 194 | add $tp,$tp,$num ; place for tp[num+1] | 210 | add $tp,$tp,$num ; place for tp[num+1] |
| @@ -196,35 +212,50 @@ $code=<<___; | |||
| 196 | subf $tp,$tp,$sp ; $sp-$tp | 212 | subf $tp,$tp,$sp ; $sp-$tp |
| 197 | and $tp,$tp,$i ; minimize TLB usage | 213 | and $tp,$tp,$i ; minimize TLB usage |
| 198 | subf $tp,$sp,$tp ; $tp-$sp | 214 | subf $tp,$sp,$tp ; $tp-$sp |
| 215 | mr $i,$sp | ||
| 199 | $STUX $sp,$sp,$tp ; alloca | 216 | $STUX $sp,$sp,$tp ; alloca |
| 200 | 217 | ||
| 201 | $PUSH r14,`2*$SIZE_T`($sp) | 218 | $PUSH r22,`-12*8-10*$SIZE_T`($i) |
| 202 | $PUSH r15,`3*$SIZE_T`($sp) | 219 | $PUSH r23,`-12*8-9*$SIZE_T`($i) |
| 203 | $PUSH r16,`4*$SIZE_T`($sp) | 220 | $PUSH r24,`-12*8-8*$SIZE_T`($i) |
| 204 | $PUSH r17,`5*$SIZE_T`($sp) | 221 | $PUSH r25,`-12*8-7*$SIZE_T`($i) |
| 205 | $PUSH r18,`6*$SIZE_T`($sp) | 222 | $PUSH r26,`-12*8-6*$SIZE_T`($i) |
| 206 | $PUSH r19,`7*$SIZE_T`($sp) | 223 | $PUSH r27,`-12*8-5*$SIZE_T`($i) |
| 207 | $PUSH r20,`8*$SIZE_T`($sp) | 224 | $PUSH r28,`-12*8-4*$SIZE_T`($i) |
| 208 | $PUSH r21,`9*$SIZE_T`($sp) | 225 | $PUSH r29,`-12*8-3*$SIZE_T`($i) |
| 209 | $PUSH r22,`10*$SIZE_T`($sp) | 226 | $PUSH r30,`-12*8-2*$SIZE_T`($i) |
| 210 | $PUSH r23,`11*$SIZE_T`($sp) | 227 | $PUSH r31,`-12*8-1*$SIZE_T`($i) |
| 211 | stfd f14,`12*$SIZE_T+0`($sp) | 228 | stfd f20,`-12*8`($i) |
| 212 | stfd f15,`12*$SIZE_T+8`($sp) | 229 | stfd f21,`-11*8`($i) |
| 213 | stfd f16,`12*$SIZE_T+16`($sp) | 230 | stfd f22,`-10*8`($i) |
| 214 | stfd f17,`12*$SIZE_T+24`($sp) | 231 | stfd f23,`-9*8`($i) |
| 215 | stfd f18,`12*$SIZE_T+32`($sp) | 232 | stfd f24,`-8*8`($i) |
| 216 | stfd f19,`12*$SIZE_T+40`($sp) | 233 | stfd f25,`-7*8`($i) |
| 217 | stfd f20,`12*$SIZE_T+48`($sp) | 234 | stfd f26,`-6*8`($i) |
| 218 | stfd f21,`12*$SIZE_T+56`($sp) | 235 | stfd f27,`-5*8`($i) |
| 219 | stfd f22,`12*$SIZE_T+64`($sp) | 236 | stfd f28,`-4*8`($i) |
| 220 | stfd f23,`12*$SIZE_T+72`($sp) | 237 | stfd f29,`-3*8`($i) |
| 221 | stfd f24,`12*$SIZE_T+80`($sp) | 238 | stfd f30,`-2*8`($i) |
| 222 | stfd f25,`12*$SIZE_T+88`($sp) | 239 | stfd f31,`-1*8`($i) |
| 223 | 240 | ___ | |
| 241 | $code.=<<___ if ($SIZE_T==8); | ||
| 224 | ld $a0,0($ap) ; pull ap[0] value | 242 | ld $a0,0($ap) ; pull ap[0] value |
| 225 | ld $n0,0($n0) ; pull n0[0] value | 243 | ld $n0,0($n0) ; pull n0[0] value |
| 226 | ld $t3,0($bp) ; bp[0] | 244 | ld $t3,0($bp) ; bp[0] |
| 227 | 245 | ___ | |
| 246 | $code.=<<___ if ($SIZE_T==4); | ||
| 247 | mr $t1,$n0 | ||
| 248 | lwz $a0,0($ap) ; pull ap[0,1] value | ||
| 249 | lwz $t0,4($ap) | ||
| 250 | lwz $n0,0($t1) ; pull n0[0,1] value | ||
| 251 | lwz $t1,4($t1) | ||
| 252 | lwz $t3,0($bp) ; bp[0,1] | ||
| 253 | lwz $t2,4($bp) | ||
| 254 | insrdi $a0,$t0,32,0 | ||
| 255 | insrdi $n0,$t1,32,0 | ||
| 256 | insrdi $t3,$t2,32,0 | ||
| 257 | ___ | ||
| 258 | $code.=<<___; | ||
| 228 | addi $tp,$sp,`$FRAME+$TRANSFER+8+64` | 259 | addi $tp,$sp,`$FRAME+$TRANSFER+8+64` |
| 229 | li $i,-64 | 260 | li $i,-64 |
| 230 | add $nap_d,$tp,$num | 261 | add $nap_d,$tp,$num |
| @@ -258,6 +289,8 @@ $code=<<___; | |||
| 258 | std $t5,`$FRAME+40`($sp) | 289 | std $t5,`$FRAME+40`($sp) |
| 259 | std $t6,`$FRAME+48`($sp) | 290 | std $t6,`$FRAME+48`($sp) |
| 260 | std $t7,`$FRAME+56`($sp) | 291 | std $t7,`$FRAME+56`($sp) |
| 292 | ___ | ||
| 293 | $code.=<<___ if ($SIZE_T==8); | ||
| 261 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair | 294 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair |
| 262 | lwz $t1,0($ap) | 295 | lwz $t1,0($ap) |
| 263 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | 296 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair |
| @@ -266,6 +299,18 @@ $code=<<___; | |||
| 266 | lwz $t5,0($np) | 299 | lwz $t5,0($np) |
| 267 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | 300 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair |
| 268 | lwz $t7,8($np) | 301 | lwz $t7,8($np) |
| 302 | ___ | ||
| 303 | $code.=<<___ if ($SIZE_T==4); | ||
| 304 | lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs | ||
| 305 | lwz $t1,4($ap) | ||
| 306 | lwz $t2,8($ap) | ||
| 307 | lwz $t3,12($ap) | ||
| 308 | lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs | ||
| 309 | lwz $t5,4($np) | ||
| 310 | lwz $t6,8($np) | ||
| 311 | lwz $t7,12($np) | ||
| 312 | ___ | ||
| 313 | $code.=<<___; | ||
| 269 | lfd $ba,`$FRAME+0`($sp) | 314 | lfd $ba,`$FRAME+0`($sp) |
| 270 | lfd $bb,`$FRAME+8`($sp) | 315 | lfd $bb,`$FRAME+8`($sp) |
| 271 | lfd $bc,`$FRAME+16`($sp) | 316 | lfd $bc,`$FRAME+16`($sp) |
| @@ -374,6 +419,8 @@ $code=<<___; | |||
| 374 | 419 | ||
| 375 | .align 5 | 420 | .align 5 |
| 376 | L1st: | 421 | L1st: |
| 422 | ___ | ||
| 423 | $code.=<<___ if ($SIZE_T==8); | ||
| 377 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair | 424 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair |
| 378 | lwz $t1,0($ap) | 425 | lwz $t1,0($ap) |
| 379 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | 426 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair |
| @@ -382,6 +429,18 @@ L1st: | |||
| 382 | lwz $t5,0($np) | 429 | lwz $t5,0($np) |
| 383 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | 430 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair |
| 384 | lwz $t7,8($np) | 431 | lwz $t7,8($np) |
| 432 | ___ | ||
| 433 | $code.=<<___ if ($SIZE_T==4); | ||
| 434 | lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs | ||
| 435 | lwz $t1,4($ap) | ||
| 436 | lwz $t2,8($ap) | ||
| 437 | lwz $t3,12($ap) | ||
| 438 | lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs | ||
| 439 | lwz $t5,4($np) | ||
| 440 | lwz $t6,8($np) | ||
| 441 | lwz $t7,12($np) | ||
| 442 | ___ | ||
| 443 | $code.=<<___; | ||
| 385 | std $t0,`$FRAME+64`($sp) | 444 | std $t0,`$FRAME+64`($sp) |
| 386 | std $t1,`$FRAME+72`($sp) | 445 | std $t1,`$FRAME+72`($sp) |
| 387 | std $t2,`$FRAME+80`($sp) | 446 | std $t2,`$FRAME+80`($sp) |
| @@ -559,7 +618,17 @@ L1st: | |||
| 559 | li $i,8 ; i=1 | 618 | li $i,8 ; i=1 |
| 560 | .align 5 | 619 | .align 5 |
| 561 | Louter: | 620 | Louter: |
| 621 | ___ | ||
| 622 | $code.=<<___ if ($SIZE_T==8); | ||
| 562 | ldx $t3,$bp,$i ; bp[i] | 623 | ldx $t3,$bp,$i ; bp[i] |
| 624 | ___ | ||
| 625 | $code.=<<___ if ($SIZE_T==4); | ||
| 626 | add $t0,$bp,$i | ||
| 627 | lwz $t3,0($t0) ; bp[i,i+1] | ||
| 628 | lwz $t0,4($t0) | ||
| 629 | insrdi $t3,$t0,32,0 | ||
| 630 | ___ | ||
| 631 | $code.=<<___; | ||
| 563 | ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] | 632 | ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] |
| 564 | mulld $t7,$a0,$t3 ; ap[0]*bp[i] | 633 | mulld $t7,$a0,$t3 ; ap[0]*bp[i] |
| 565 | 634 | ||
| @@ -761,6 +830,13 @@ Linner: | |||
| 761 | stfd $T0b,`$FRAME+8`($sp) | 830 | stfd $T0b,`$FRAME+8`($sp) |
| 762 | add $t7,$t7,$carry | 831 | add $t7,$t7,$carry |
| 763 | addc $t3,$t0,$t1 | 832 | addc $t3,$t0,$t1 |
| 833 | ___ | ||
| 834 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
| 835 | extrdi $t0,$t0,32,0 | ||
| 836 | extrdi $t1,$t1,32,0 | ||
| 837 | adde $t0,$t0,$t1 | ||
| 838 | ___ | ||
| 839 | $code.=<<___; | ||
| 764 | stfd $T1a,`$FRAME+16`($sp) | 840 | stfd $T1a,`$FRAME+16`($sp) |
| 765 | stfd $T1b,`$FRAME+24`($sp) | 841 | stfd $T1b,`$FRAME+24`($sp) |
| 766 | insrdi $t4,$t7,16,0 ; 64..127 bits | 842 | insrdi $t4,$t7,16,0 ; 64..127 bits |
| @@ -768,6 +844,13 @@ Linner: | |||
| 768 | stfd $T2a,`$FRAME+32`($sp) | 844 | stfd $T2a,`$FRAME+32`($sp) |
| 769 | stfd $T2b,`$FRAME+40`($sp) | 845 | stfd $T2b,`$FRAME+40`($sp) |
| 770 | adde $t5,$t4,$t2 | 846 | adde $t5,$t4,$t2 |
| 847 | ___ | ||
| 848 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
| 849 | extrdi $t4,$t4,32,0 | ||
| 850 | extrdi $t2,$t2,32,0 | ||
| 851 | adde $t4,$t4,$t2 | ||
| 852 | ___ | ||
| 853 | $code.=<<___; | ||
| 771 | stfd $T3a,`$FRAME+48`($sp) | 854 | stfd $T3a,`$FRAME+48`($sp) |
| 772 | stfd $T3b,`$FRAME+56`($sp) | 855 | stfd $T3b,`$FRAME+56`($sp) |
| 773 | addze $carry,$carry | 856 | addze $carry,$carry |
| @@ -816,7 +899,21 @@ Linner: | |||
| 816 | ld $t7,`$FRAME+72`($sp) | 899 | ld $t7,`$FRAME+72`($sp) |
| 817 | 900 | ||
| 818 | addc $t3,$t0,$t1 | 901 | addc $t3,$t0,$t1 |
| 902 | ___ | ||
| 903 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
| 904 | extrdi $t0,$t0,32,0 | ||
| 905 | extrdi $t1,$t1,32,0 | ||
| 906 | adde $t0,$t0,$t1 | ||
| 907 | ___ | ||
| 908 | $code.=<<___; | ||
| 819 | adde $t5,$t4,$t2 | 909 | adde $t5,$t4,$t2 |
| 910 | ___ | ||
| 911 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
| 912 | extrdi $t4,$t4,32,0 | ||
| 913 | extrdi $t2,$t2,32,0 | ||
| 914 | adde $t4,$t4,$t2 | ||
| 915 | ___ | ||
| 916 | $code.=<<___; | ||
| 820 | addze $carry,$carry | 917 | addze $carry,$carry |
| 821 | 918 | ||
| 822 | std $t3,-16($tp) ; tp[j-1] | 919 | std $t3,-16($tp) ; tp[j-1] |
| @@ -835,7 +932,9 @@ Linner: | |||
| 835 | subf $nap_d,$t7,$nap_d ; rewind pointer | 932 | subf $nap_d,$t7,$nap_d ; rewind pointer |
| 836 | cmpw $i,$num | 933 | cmpw $i,$num |
| 837 | blt- Louter | 934 | blt- Louter |
| 935 | ___ | ||
| 838 | 936 | ||
| 937 | $code.=<<___ if ($SIZE_T==8); | ||
| 839 | subf $np,$num,$np ; rewind np | 938 | subf $np,$num,$np ; rewind np |
| 840 | addi $j,$j,1 ; restore counter | 939 | addi $j,$j,1 ; restore counter |
| 841 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] | 940 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] |
| @@ -883,34 +982,105 @@ Lcopy: ; copy or in-place refresh | |||
| 883 | stdx $i,$t4,$i | 982 | stdx $i,$t4,$i |
| 884 | addi $i,$i,16 | 983 | addi $i,$i,16 |
| 885 | bdnz- Lcopy | 984 | bdnz- Lcopy |
| 985 | ___ | ||
| 986 | $code.=<<___ if ($SIZE_T==4); | ||
| 987 | subf $np,$num,$np ; rewind np | ||
| 988 | addi $j,$j,1 ; restore counter | ||
| 989 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] | ||
| 990 | addi $tp,$sp,`$FRAME+$TRANSFER` | ||
| 991 | addi $np,$np,-4 | ||
| 992 | addi $rp,$rp,-4 | ||
| 993 | addi $ap,$sp,`$FRAME+$TRANSFER+4` | ||
| 994 | mtctr $j | ||
| 995 | |||
| 996 | .align 4 | ||
| 997 | Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order | ||
| 998 | ldu $t2,16($tp) | ||
| 999 | lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order | ||
| 1000 | lwz $t5,8($np) | ||
| 1001 | lwz $t6,12($np) | ||
| 1002 | lwzu $t7,16($np) | ||
| 1003 | extrdi $t1,$t0,32,0 | ||
| 1004 | extrdi $t3,$t2,32,0 | ||
| 1005 | subfe $t4,$t4,$t0 ; tp[j]-np[j] | ||
| 1006 | stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order | ||
| 1007 | subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] | ||
| 1008 | stw $t1,8($ap) | ||
| 1009 | subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2] | ||
| 1010 | stw $t2,12($ap) | ||
| 1011 | subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3] | ||
| 1012 | stwu $t3,16($ap) | ||
| 1013 | stw $t4,4($rp) | ||
| 1014 | stw $t5,8($rp) | ||
| 1015 | stw $t6,12($rp) | ||
| 1016 | stwu $t7,16($rp) | ||
| 1017 | bdnz- Lsub | ||
| 1018 | |||
| 1019 | li $i,0 | ||
| 1020 | subfe $ovf,$i,$ovf ; handle upmost overflow bit | ||
| 1021 | addi $tp,$sp,`$FRAME+$TRANSFER+4` | ||
| 1022 | subf $rp,$num,$rp ; rewind rp | ||
| 1023 | and $ap,$tp,$ovf | ||
| 1024 | andc $np,$rp,$ovf | ||
| 1025 | or $ap,$ap,$np ; ap=borrow?tp:rp | ||
| 1026 | addi $tp,$sp,`$FRAME+$TRANSFER` | ||
| 1027 | mtctr $j | ||
| 1028 | |||
| 1029 | .align 4 | ||
| 1030 | Lcopy: ; copy or in-place refresh | ||
| 1031 | lwz $t0,4($ap) | ||
| 1032 | lwz $t1,8($ap) | ||
| 1033 | lwz $t2,12($ap) | ||
| 1034 | lwzu $t3,16($ap) | ||
| 1035 | std $i,8($nap_d) ; zap nap_d | ||
| 1036 | std $i,16($nap_d) | ||
| 1037 | std $i,24($nap_d) | ||
| 1038 | std $i,32($nap_d) | ||
| 1039 | std $i,40($nap_d) | ||
| 1040 | std $i,48($nap_d) | ||
| 1041 | std $i,56($nap_d) | ||
| 1042 | stdu $i,64($nap_d) | ||
| 1043 | stw $t0,4($rp) | ||
| 1044 | stw $t1,8($rp) | ||
| 1045 | stw $t2,12($rp) | ||
| 1046 | stwu $t3,16($rp) | ||
| 1047 | std $i,8($tp) ; zap tp at once | ||
| 1048 | stdu $i,16($tp) | ||
| 1049 | bdnz- Lcopy | ||
| 1050 | ___ | ||
| 886 | 1051 | ||
| 887 | $POP r14,`2*$SIZE_T`($sp) | 1052 | $code.=<<___; |
| 888 | $POP r15,`3*$SIZE_T`($sp) | 1053 | $POP $i,0($sp) |
| 889 | $POP r16,`4*$SIZE_T`($sp) | ||
| 890 | $POP r17,`5*$SIZE_T`($sp) | ||
| 891 | $POP r18,`6*$SIZE_T`($sp) | ||
| 892 | $POP r19,`7*$SIZE_T`($sp) | ||
| 893 | $POP r20,`8*$SIZE_T`($sp) | ||
| 894 | $POP r21,`9*$SIZE_T`($sp) | ||
| 895 | $POP r22,`10*$SIZE_T`($sp) | ||
| 896 | $POP r23,`11*$SIZE_T`($sp) | ||
| 897 | lfd f14,`12*$SIZE_T+0`($sp) | ||
| 898 | lfd f15,`12*$SIZE_T+8`($sp) | ||
| 899 | lfd f16,`12*$SIZE_T+16`($sp) | ||
| 900 | lfd f17,`12*$SIZE_T+24`($sp) | ||
| 901 | lfd f18,`12*$SIZE_T+32`($sp) | ||
| 902 | lfd f19,`12*$SIZE_T+40`($sp) | ||
| 903 | lfd f20,`12*$SIZE_T+48`($sp) | ||
| 904 | lfd f21,`12*$SIZE_T+56`($sp) | ||
| 905 | lfd f22,`12*$SIZE_T+64`($sp) | ||
| 906 | lfd f23,`12*$SIZE_T+72`($sp) | ||
| 907 | lfd f24,`12*$SIZE_T+80`($sp) | ||
| 908 | lfd f25,`12*$SIZE_T+88`($sp) | ||
| 909 | $POP $sp,0($sp) | ||
| 910 | li r3,1 ; signal "handled" | 1054 | li r3,1 ; signal "handled" |
| 1055 | $POP r22,`-12*8-10*$SIZE_T`($i) | ||
| 1056 | $POP r23,`-12*8-9*$SIZE_T`($i) | ||
| 1057 | $POP r24,`-12*8-8*$SIZE_T`($i) | ||
| 1058 | $POP r25,`-12*8-7*$SIZE_T`($i) | ||
| 1059 | $POP r26,`-12*8-6*$SIZE_T`($i) | ||
| 1060 | $POP r27,`-12*8-5*$SIZE_T`($i) | ||
| 1061 | $POP r28,`-12*8-4*$SIZE_T`($i) | ||
| 1062 | $POP r29,`-12*8-3*$SIZE_T`($i) | ||
| 1063 | $POP r30,`-12*8-2*$SIZE_T`($i) | ||
| 1064 | $POP r31,`-12*8-1*$SIZE_T`($i) | ||
| 1065 | lfd f20,`-12*8`($i) | ||
| 1066 | lfd f21,`-11*8`($i) | ||
| 1067 | lfd f22,`-10*8`($i) | ||
| 1068 | lfd f23,`-9*8`($i) | ||
| 1069 | lfd f24,`-8*8`($i) | ||
| 1070 | lfd f25,`-7*8`($i) | ||
| 1071 | lfd f26,`-6*8`($i) | ||
| 1072 | lfd f27,`-5*8`($i) | ||
| 1073 | lfd f28,`-4*8`($i) | ||
| 1074 | lfd f29,`-3*8`($i) | ||
| 1075 | lfd f30,`-2*8`($i) | ||
| 1076 | lfd f31,`-1*8`($i) | ||
| 1077 | mr $sp,$i | ||
| 911 | blr | 1078 | blr |
| 912 | .long 0 | 1079 | .long 0 |
| 913 | .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" | 1080 | .byte 0,12,4,0,0x8c,10,6,0 |
| 1081 | .long 0 | ||
| 1082 | |||
| 1083 | .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 914 | ___ | 1084 | ___ |
| 915 | 1085 | ||
| 916 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 1086 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl new file mode 100644 index 0000000000..cd9f13eca2 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl | |||
| @@ -0,0 +1,221 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
| 13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
| 14 | # the time being... gcc 4.3 appeared to generate poor code, therefore | ||
| 15 | # the effort. And indeed, the module delivers 55%-90%(*) improvement | ||
| 16 | # on haviest ECDSA verify and ECDH benchmarks for 163- and 571-bit | ||
| 17 | # key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196. | ||
| 18 | # This is for 64-bit build. In 32-bit "highgprs" case improvement is | ||
| 19 | # even higher, for example on z990 it was measured 80%-150%. ECDSA | ||
| 20 | # sign is modest 9%-12% faster. Keep in mind that these coefficients | ||
| 21 | # are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is | ||
| 22 | # burnt in it... | ||
| 23 | # | ||
| 24 | # (*) gcc 4.1 was observed to deliver better results than gcc 4.3, | ||
| 25 | # so that improvement coefficients can vary from one specific | ||
| 26 | # setup to another. | ||
| 27 | |||
| 28 | $flavour = shift; | ||
| 29 | |||
| 30 | if ($flavour =~ /3[12]/) { | ||
| 31 | $SIZE_T=4; | ||
| 32 | $g=""; | ||
| 33 | } else { | ||
| 34 | $SIZE_T=8; | ||
| 35 | $g="g"; | ||
| 36 | } | ||
| 37 | |||
| 38 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 39 | open STDOUT,">$output"; | ||
| 40 | |||
| 41 | $stdframe=16*$SIZE_T+4*8; | ||
| 42 | |||
| 43 | $rp="%r2"; | ||
| 44 | $a1="%r3"; | ||
| 45 | $a0="%r4"; | ||
| 46 | $b1="%r5"; | ||
| 47 | $b0="%r6"; | ||
| 48 | |||
| 49 | $ra="%r14"; | ||
| 50 | $sp="%r15"; | ||
| 51 | |||
| 52 | @T=("%r0","%r1"); | ||
| 53 | @i=("%r12","%r13"); | ||
| 54 | |||
| 55 | ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11)); | ||
| 56 | ($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8; | ||
| 57 | |||
| 58 | $code.=<<___; | ||
| 59 | .text | ||
| 60 | |||
| 61 | .type _mul_1x1,\@function | ||
| 62 | .align 16 | ||
| 63 | _mul_1x1: | ||
| 64 | lgr $a1,$a | ||
| 65 | sllg $a2,$a,1 | ||
| 66 | sllg $a4,$a,2 | ||
| 67 | sllg $a8,$a,3 | ||
| 68 | |||
| 69 | srag $lo,$a1,63 # broadcast 63rd bit | ||
| 70 | nihh $a1,0x1fff | ||
| 71 | srag @i[0],$a2,63 # broadcast 62nd bit | ||
| 72 | nihh $a2,0x3fff | ||
| 73 | srag @i[1],$a4,63 # broadcast 61st bit | ||
| 74 | nihh $a4,0x7fff | ||
| 75 | ngr $lo,$b | ||
| 76 | ngr @i[0],$b | ||
| 77 | ngr @i[1],$b | ||
| 78 | |||
| 79 | lghi @T[0],0 | ||
| 80 | lgr $a12,$a1 | ||
| 81 | stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0 | ||
| 82 | xgr $a12,$a2 | ||
| 83 | stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1 | ||
| 84 | lgr $a48,$a4 | ||
| 85 | stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2 | ||
| 86 | xgr $a48,$a8 | ||
| 87 | stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2 | ||
| 88 | xgr $a1,$a4 | ||
| 89 | |||
| 90 | stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4 | ||
| 91 | xgr $a2,$a4 | ||
| 92 | stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4 | ||
| 93 | xgr $a12,$a4 | ||
| 94 | stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4 | ||
| 95 | xgr $a1,$a48 | ||
| 96 | stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4 | ||
| 97 | xgr $a2,$a48 | ||
| 98 | |||
| 99 | stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8 | ||
| 100 | xgr $a12,$a48 | ||
| 101 | stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8 | ||
| 102 | xgr $a1,$a4 | ||
| 103 | stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8 | ||
| 104 | xgr $a2,$a4 | ||
| 105 | stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8 | ||
| 106 | |||
| 107 | xgr $a12,$a4 | ||
| 108 | stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8 | ||
| 109 | srlg $hi,$lo,1 | ||
| 110 | stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8 | ||
| 111 | sllg $lo,$lo,63 | ||
| 112 | stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8 | ||
| 113 | srlg @T[0],@i[0],2 | ||
| 114 | stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8 | ||
| 115 | |||
| 116 | lghi $mask,`0xf<<3` | ||
| 117 | sllg $a1,@i[0],62 | ||
| 118 | sllg @i[0],$b,3 | ||
| 119 | srlg @T[1],@i[1],3 | ||
| 120 | ngr @i[0],$mask | ||
| 121 | sllg $a2,@i[1],61 | ||
| 122 | srlg @i[1],$b,4-3 | ||
| 123 | xgr $hi,@T[0] | ||
| 124 | ngr @i[1],$mask | ||
| 125 | xgr $lo,$a1 | ||
| 126 | xgr $hi,@T[1] | ||
| 127 | xgr $lo,$a2 | ||
| 128 | |||
| 129 | xg $lo,$stdframe(@i[0],$sp) | ||
| 130 | srlg @i[0],$b,8-3 | ||
| 131 | ngr @i[0],$mask | ||
| 132 | ___ | ||
| 133 | for($n=1;$n<14;$n++) { | ||
| 134 | $code.=<<___; | ||
| 135 | lg @T[1],$stdframe(@i[1],$sp) | ||
| 136 | srlg @i[1],$b,`($n+2)*4`-3 | ||
| 137 | sllg @T[0],@T[1],`$n*4` | ||
| 138 | ngr @i[1],$mask | ||
| 139 | srlg @T[1],@T[1],`64-$n*4` | ||
| 140 | xgr $lo,@T[0] | ||
| 141 | xgr $hi,@T[1] | ||
| 142 | ___ | ||
| 143 | push(@i,shift(@i)); push(@T,shift(@T)); | ||
| 144 | } | ||
| 145 | $code.=<<___; | ||
| 146 | lg @T[1],$stdframe(@i[1],$sp) | ||
| 147 | sllg @T[0],@T[1],`$n*4` | ||
| 148 | srlg @T[1],@T[1],`64-$n*4` | ||
| 149 | xgr $lo,@T[0] | ||
| 150 | xgr $hi,@T[1] | ||
| 151 | |||
| 152 | lg @T[0],$stdframe(@i[0],$sp) | ||
| 153 | sllg @T[1],@T[0],`($n+1)*4` | ||
| 154 | srlg @T[0],@T[0],`64-($n+1)*4` | ||
| 155 | xgr $lo,@T[1] | ||
| 156 | xgr $hi,@T[0] | ||
| 157 | |||
| 158 | br $ra | ||
| 159 | .size _mul_1x1,.-_mul_1x1 | ||
| 160 | |||
| 161 | .globl bn_GF2m_mul_2x2 | ||
| 162 | .type bn_GF2m_mul_2x2,\@function | ||
| 163 | .align 16 | ||
| 164 | bn_GF2m_mul_2x2: | ||
| 165 | stm${g} %r3,%r15,3*$SIZE_T($sp) | ||
| 166 | |||
| 167 | lghi %r1,-$stdframe-128 | ||
| 168 | la %r0,0($sp) | ||
| 169 | la $sp,0(%r1,$sp) # alloca | ||
| 170 | st${g} %r0,0($sp) # back chain | ||
| 171 | ___ | ||
| 172 | if ($SIZE_T==8) { | ||
| 173 | my @r=map("%r$_",(6..9)); | ||
| 174 | $code.=<<___; | ||
| 175 | bras $ra,_mul_1x1 # a1·b1 | ||
| 176 | stmg $lo,$hi,16($rp) | ||
| 177 | |||
| 178 | lg $a,`$stdframe+128+4*$SIZE_T`($sp) | ||
| 179 | lg $b,`$stdframe+128+6*$SIZE_T`($sp) | ||
| 180 | bras $ra,_mul_1x1 # a0·b0 | ||
| 181 | stmg $lo,$hi,0($rp) | ||
| 182 | |||
| 183 | lg $a,`$stdframe+128+3*$SIZE_T`($sp) | ||
| 184 | lg $b,`$stdframe+128+5*$SIZE_T`($sp) | ||
| 185 | xg $a,`$stdframe+128+4*$SIZE_T`($sp) | ||
| 186 | xg $b,`$stdframe+128+6*$SIZE_T`($sp) | ||
| 187 | bras $ra,_mul_1x1 # (a0+a1)·(b0+b1) | ||
| 188 | lmg @r[0],@r[3],0($rp) | ||
| 189 | |||
| 190 | xgr $lo,$hi | ||
| 191 | xgr $hi,@r[1] | ||
| 192 | xgr $lo,@r[0] | ||
| 193 | xgr $hi,@r[2] | ||
| 194 | xgr $lo,@r[3] | ||
| 195 | xgr $hi,@r[3] | ||
| 196 | xgr $lo,$hi | ||
| 197 | stg $hi,16($rp) | ||
| 198 | stg $lo,8($rp) | ||
| 199 | ___ | ||
| 200 | } else { | ||
| 201 | $code.=<<___; | ||
| 202 | sllg %r3,%r3,32 | ||
| 203 | sllg %r5,%r5,32 | ||
| 204 | or %r3,%r4 | ||
| 205 | or %r5,%r6 | ||
| 206 | bras $ra,_mul_1x1 | ||
| 207 | rllg $lo,$lo,32 | ||
| 208 | rllg $hi,$hi,32 | ||
| 209 | stmg $lo,$hi,0($rp) | ||
| 210 | ___ | ||
| 211 | } | ||
| 212 | $code.=<<___; | ||
| 213 | lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp) | ||
| 214 | br $ra | ||
| 215 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
| 216 | .string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 217 | ___ | ||
| 218 | |||
| 219 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 220 | print $code; | ||
| 221 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl index f61246f5b6..9fd64e81ee 100644 --- a/src/lib/libcrypto/bn/asm/s390x-mont.pl +++ b/src/lib/libcrypto/bn/asm/s390x-mont.pl | |||
| @@ -32,6 +32,33 @@ | |||
| 32 | # Reschedule to minimize/avoid Address Generation Interlock hazard, | 32 | # Reschedule to minimize/avoid Address Generation Interlock hazard, |
| 33 | # make inner loops counter-based. | 33 | # make inner loops counter-based. |
| 34 | 34 | ||
| 35 | # November 2010. | ||
| 36 | # | ||
| 37 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 38 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 39 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 40 | # application context. The feature is not specific to any particular | ||
| 41 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 42 | # remains z/Architecture specific. Compatibility with 32-bit BN_ULONG | ||
| 43 | # is achieved by swapping words after 64-bit loads, follow _dswap-s. | ||
| 44 | # On z990 it was measured to perform 2.6-2.2 times better than | ||
| 45 | # compiler-generated code, less for longer keys... | ||
| 46 | |||
| 47 | $flavour = shift; | ||
| 48 | |||
| 49 | if ($flavour =~ /3[12]/) { | ||
| 50 | $SIZE_T=4; | ||
| 51 | $g=""; | ||
| 52 | } else { | ||
| 53 | $SIZE_T=8; | ||
| 54 | $g="g"; | ||
| 55 | } | ||
| 56 | |||
| 57 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 58 | open STDOUT,">$output"; | ||
| 59 | |||
| 60 | $stdframe=16*$SIZE_T+4*8; | ||
| 61 | |||
| 35 | $mn0="%r0"; | 62 | $mn0="%r0"; |
| 36 | $num="%r1"; | 63 | $num="%r1"; |
| 37 | 64 | ||
| @@ -60,34 +87,44 @@ $code.=<<___; | |||
| 60 | .globl bn_mul_mont | 87 | .globl bn_mul_mont |
| 61 | .type bn_mul_mont,\@function | 88 | .type bn_mul_mont,\@function |
| 62 | bn_mul_mont: | 89 | bn_mul_mont: |
| 63 | lgf $num,164($sp) # pull $num | 90 | lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num |
| 64 | sla $num,3 # $num to enumerate bytes | 91 | sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes |
| 65 | la $bp,0($num,$bp) | 92 | la $bp,0($num,$bp) |
| 66 | 93 | ||
| 67 | stg %r2,16($sp) | 94 | st${g} %r2,2*$SIZE_T($sp) |
| 68 | 95 | ||
| 69 | cghi $num,16 # | 96 | cghi $num,16 # |
| 70 | lghi %r2,0 # | 97 | lghi %r2,0 # |
| 71 | blr %r14 # if($num<16) return 0; | 98 | blr %r14 # if($num<16) return 0; |
| 99 | ___ | ||
| 100 | $code.=<<___ if ($flavour =~ /3[12]/); | ||
| 101 | tmll $num,4 | ||
| 102 | bnzr %r14 # if ($num&1) return 0; | ||
| 103 | ___ | ||
| 104 | $code.=<<___ if ($flavour !~ /3[12]/); | ||
| 72 | cghi $num,96 # | 105 | cghi $num,96 # |
| 73 | bhr %r14 # if($num>96) return 0; | 106 | bhr %r14 # if($num>96) return 0; |
| 107 | ___ | ||
| 108 | $code.=<<___; | ||
| 109 | stm${g} %r3,%r15,3*$SIZE_T($sp) | ||
| 74 | 110 | ||
| 75 | stmg %r3,%r15,24($sp) | 111 | lghi $rp,-$stdframe-8 # leave room for carry bit |
| 76 | |||
| 77 | lghi $rp,-160-8 # leave room for carry bit | ||
| 78 | lcgr $j,$num # -$num | 112 | lcgr $j,$num # -$num |
| 79 | lgr %r0,$sp | 113 | lgr %r0,$sp |
| 80 | la $rp,0($rp,$sp) | 114 | la $rp,0($rp,$sp) |
| 81 | la $sp,0($j,$rp) # alloca | 115 | la $sp,0($j,$rp) # alloca |
| 82 | stg %r0,0($sp) # back chain | 116 | st${g} %r0,0($sp) # back chain |
| 83 | 117 | ||
| 84 | sra $num,3 # restore $num | 118 | sra $num,3 # restore $num |
| 85 | la $bp,0($j,$bp) # restore $bp | 119 | la $bp,0($j,$bp) # restore $bp |
| 86 | ahi $num,-1 # adjust $num for inner loop | 120 | ahi $num,-1 # adjust $num for inner loop |
| 87 | lg $n0,0($n0) # pull n0 | 121 | lg $n0,0($n0) # pull n0 |
| 122 | _dswap $n0 | ||
| 88 | 123 | ||
| 89 | lg $bi,0($bp) | 124 | lg $bi,0($bp) |
| 125 | _dswap $bi | ||
| 90 | lg $alo,0($ap) | 126 | lg $alo,0($ap) |
| 127 | _dswap $alo | ||
| 91 | mlgr $ahi,$bi # ap[0]*bp[0] | 128 | mlgr $ahi,$bi # ap[0]*bp[0] |
| 92 | lgr $AHI,$ahi | 129 | lgr $AHI,$ahi |
| 93 | 130 | ||
| @@ -95,6 +132,7 @@ bn_mul_mont: | |||
| 95 | msgr $mn0,$n0 | 132 | msgr $mn0,$n0 |
| 96 | 133 | ||
| 97 | lg $nlo,0($np) # | 134 | lg $nlo,0($np) # |
| 135 | _dswap $nlo | ||
| 98 | mlgr $nhi,$mn0 # np[0]*m1 | 136 | mlgr $nhi,$mn0 # np[0]*m1 |
| 99 | algr $nlo,$alo # +="tp[0]" | 137 | algr $nlo,$alo # +="tp[0]" |
| 100 | lghi $NHI,0 | 138 | lghi $NHI,0 |
| @@ -106,12 +144,14 @@ bn_mul_mont: | |||
| 106 | .align 16 | 144 | .align 16 |
| 107 | .L1st: | 145 | .L1st: |
| 108 | lg $alo,0($j,$ap) | 146 | lg $alo,0($j,$ap) |
| 147 | _dswap $alo | ||
| 109 | mlgr $ahi,$bi # ap[j]*bp[0] | 148 | mlgr $ahi,$bi # ap[j]*bp[0] |
| 110 | algr $alo,$AHI | 149 | algr $alo,$AHI |
| 111 | lghi $AHI,0 | 150 | lghi $AHI,0 |
| 112 | alcgr $AHI,$ahi | 151 | alcgr $AHI,$ahi |
| 113 | 152 | ||
| 114 | lg $nlo,0($j,$np) | 153 | lg $nlo,0($j,$np) |
| 154 | _dswap $nlo | ||
| 115 | mlgr $nhi,$mn0 # np[j]*m1 | 155 | mlgr $nhi,$mn0 # np[j]*m1 |
| 116 | algr $nlo,$NHI | 156 | algr $nlo,$NHI |
| 117 | lghi $NHI,0 | 157 | lghi $NHI,0 |
| @@ -119,22 +159,24 @@ bn_mul_mont: | |||
| 119 | algr $nlo,$alo | 159 | algr $nlo,$alo |
| 120 | alcgr $NHI,$nhi | 160 | alcgr $NHI,$nhi |
| 121 | 161 | ||
| 122 | stg $nlo,160-8($j,$sp) # tp[j-1]= | 162 | stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= |
| 123 | la $j,8($j) # j++ | 163 | la $j,8($j) # j++ |
| 124 | brct $count,.L1st | 164 | brct $count,.L1st |
| 125 | 165 | ||
| 126 | algr $NHI,$AHI | 166 | algr $NHI,$AHI |
| 127 | lghi $AHI,0 | 167 | lghi $AHI,0 |
| 128 | alcgr $AHI,$AHI # upmost overflow bit | 168 | alcgr $AHI,$AHI # upmost overflow bit |
| 129 | stg $NHI,160-8($j,$sp) | 169 | stg $NHI,$stdframe-8($j,$sp) |
| 130 | stg $AHI,160($j,$sp) | 170 | stg $AHI,$stdframe($j,$sp) |
| 131 | la $bp,8($bp) # bp++ | 171 | la $bp,8($bp) # bp++ |
| 132 | 172 | ||
| 133 | .Louter: | 173 | .Louter: |
| 134 | lg $bi,0($bp) # bp[i] | 174 | lg $bi,0($bp) # bp[i] |
| 175 | _dswap $bi | ||
| 135 | lg $alo,0($ap) | 176 | lg $alo,0($ap) |
| 177 | _dswap $alo | ||
| 136 | mlgr $ahi,$bi # ap[0]*bp[i] | 178 | mlgr $ahi,$bi # ap[0]*bp[i] |
| 137 | alg $alo,160($sp) # +=tp[0] | 179 | alg $alo,$stdframe($sp) # +=tp[0] |
| 138 | lghi $AHI,0 | 180 | lghi $AHI,0 |
| 139 | alcgr $AHI,$ahi | 181 | alcgr $AHI,$ahi |
| 140 | 182 | ||
| @@ -142,6 +184,7 @@ bn_mul_mont: | |||
| 142 | msgr $mn0,$n0 # tp[0]*n0 | 184 | msgr $mn0,$n0 # tp[0]*n0 |
| 143 | 185 | ||
| 144 | lg $nlo,0($np) # np[0] | 186 | lg $nlo,0($np) # np[0] |
| 187 | _dswap $nlo | ||
| 145 | mlgr $nhi,$mn0 # np[0]*m1 | 188 | mlgr $nhi,$mn0 # np[0]*m1 |
| 146 | algr $nlo,$alo # +="tp[0]" | 189 | algr $nlo,$alo # +="tp[0]" |
| 147 | lghi $NHI,0 | 190 | lghi $NHI,0 |
| @@ -153,14 +196,16 @@ bn_mul_mont: | |||
| 153 | .align 16 | 196 | .align 16 |
| 154 | .Linner: | 197 | .Linner: |
| 155 | lg $alo,0($j,$ap) | 198 | lg $alo,0($j,$ap) |
| 199 | _dswap $alo | ||
| 156 | mlgr $ahi,$bi # ap[j]*bp[i] | 200 | mlgr $ahi,$bi # ap[j]*bp[i] |
| 157 | algr $alo,$AHI | 201 | algr $alo,$AHI |
| 158 | lghi $AHI,0 | 202 | lghi $AHI,0 |
| 159 | alcgr $ahi,$AHI | 203 | alcgr $ahi,$AHI |
| 160 | alg $alo,160($j,$sp)# +=tp[j] | 204 | alg $alo,$stdframe($j,$sp)# +=tp[j] |
| 161 | alcgr $AHI,$ahi | 205 | alcgr $AHI,$ahi |
| 162 | 206 | ||
| 163 | lg $nlo,0($j,$np) | 207 | lg $nlo,0($j,$np) |
| 208 | _dswap $nlo | ||
| 164 | mlgr $nhi,$mn0 # np[j]*m1 | 209 | mlgr $nhi,$mn0 # np[j]*m1 |
| 165 | algr $nlo,$NHI | 210 | algr $nlo,$NHI |
| 166 | lghi $NHI,0 | 211 | lghi $NHI,0 |
| @@ -168,31 +213,33 @@ bn_mul_mont: | |||
| 168 | algr $nlo,$alo # +="tp[j]" | 213 | algr $nlo,$alo # +="tp[j]" |
| 169 | alcgr $NHI,$nhi | 214 | alcgr $NHI,$nhi |
| 170 | 215 | ||
| 171 | stg $nlo,160-8($j,$sp) # tp[j-1]= | 216 | stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= |
| 172 | la $j,8($j) # j++ | 217 | la $j,8($j) # j++ |
| 173 | brct $count,.Linner | 218 | brct $count,.Linner |
| 174 | 219 | ||
| 175 | algr $NHI,$AHI | 220 | algr $NHI,$AHI |
| 176 | lghi $AHI,0 | 221 | lghi $AHI,0 |
| 177 | alcgr $AHI,$AHI | 222 | alcgr $AHI,$AHI |
| 178 | alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit | 223 | alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit |
| 179 | lghi $ahi,0 | 224 | lghi $ahi,0 |
| 180 | alcgr $AHI,$ahi # new upmost overflow bit | 225 | alcgr $AHI,$ahi # new upmost overflow bit |
| 181 | stg $NHI,160-8($j,$sp) | 226 | stg $NHI,$stdframe-8($j,$sp) |
| 182 | stg $AHI,160($j,$sp) | 227 | stg $AHI,$stdframe($j,$sp) |
| 183 | 228 | ||
| 184 | la $bp,8($bp) # bp++ | 229 | la $bp,8($bp) # bp++ |
| 185 | clg $bp,160+8+32($j,$sp) # compare to &bp[num] | 230 | cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num] |
| 186 | jne .Louter | 231 | jne .Louter |
| 187 | 232 | ||
| 188 | lg $rp,160+8+16($j,$sp) # reincarnate rp | 233 | l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp |
| 189 | la $ap,160($sp) | 234 | la $ap,$stdframe($sp) |
| 190 | ahi $num,1 # restore $num, incidentally clears "borrow" | 235 | ahi $num,1 # restore $num, incidentally clears "borrow" |
| 191 | 236 | ||
| 192 | la $j,0(%r0) | 237 | la $j,0(%r0) |
| 193 | lr $count,$num | 238 | lr $count,$num |
| 194 | .Lsub: lg $alo,0($j,$ap) | 239 | .Lsub: lg $alo,0($j,$ap) |
| 195 | slbg $alo,0($j,$np) | 240 | lg $nlo,0($j,$np) |
| 241 | _dswap $nlo | ||
| 242 | slbgr $alo,$nlo | ||
| 196 | stg $alo,0($j,$rp) | 243 | stg $alo,0($j,$rp) |
| 197 | la $j,8($j) | 244 | la $j,8($j) |
| 198 | brct $count,.Lsub | 245 | brct $count,.Lsub |
| @@ -207,19 +254,24 @@ bn_mul_mont: | |||
| 207 | 254 | ||
| 208 | la $j,0(%r0) | 255 | la $j,0(%r0) |
| 209 | lgr $count,$num | 256 | lgr $count,$num |
| 210 | .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh | 257 | .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh |
| 211 | stg $j,160($j,$sp) # zap tp | 258 | _dswap $alo |
| 259 | stg $j,$stdframe($j,$sp) # zap tp | ||
| 212 | stg $alo,0($j,$rp) | 260 | stg $alo,0($j,$rp) |
| 213 | la $j,8($j) | 261 | la $j,8($j) |
| 214 | brct $count,.Lcopy | 262 | brct $count,.Lcopy |
| 215 | 263 | ||
| 216 | la %r1,160+8+48($j,$sp) | 264 | la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp) |
| 217 | lmg %r6,%r15,0(%r1) | 265 | lm${g} %r6,%r15,0(%r1) |
| 218 | lghi %r2,1 # signal "processed" | 266 | lghi %r2,1 # signal "processed" |
| 219 | br %r14 | 267 | br %r14 |
| 220 | .size bn_mul_mont,.-bn_mul_mont | 268 | .size bn_mul_mont,.-bn_mul_mont |
| 221 | .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 269 | .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
| 222 | ___ | 270 | ___ |
| 223 | 271 | ||
| 224 | print $code; | 272 | foreach (split("\n",$code)) { |
| 273 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 274 | s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e; | ||
| 275 | print $_,"\n"; | ||
| 276 | } | ||
| 225 | close STDOUT; | 277 | close STDOUT; |
diff --git a/src/lib/libcrypto/bn/asm/x86-gf2m.pl b/src/lib/libcrypto/bn/asm/x86-gf2m.pl new file mode 100644 index 0000000000..808a1e5969 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86-gf2m.pl | |||
| @@ -0,0 +1,313 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
| 13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
| 14 | # the time being... Except that it has three code paths: pure integer | ||
| 15 | # code suitable for any x86 CPU, MMX code suitable for PIII and later | ||
| 16 | # and PCLMULQDQ suitable for Westmere and later. Improvement varies | ||
| 17 | # from one benchmark and µ-arch to another. Below are interval values | ||
| 18 | # for 163- and 571-bit ECDH benchmarks relative to compiler-generated | ||
| 19 | # code: | ||
| 20 | # | ||
| 21 | # PIII 16%-30% | ||
| 22 | # P4 12%-12% | ||
| 23 | # Opteron 18%-40% | ||
| 24 | # Core2 19%-44% | ||
| 25 | # Atom 38%-64% | ||
| 26 | # Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX) | ||
| 27 | # Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX) | ||
| 28 | # | ||
| 29 | # Note that above improvement coefficients are not coefficients for | ||
| 30 | # bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result | ||
| 31 | # of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark | ||
| 32 | # is more and more dominated by other subroutines, most notably by | ||
| 33 | # BN_GF2m_mod[_mul]_arr... | ||
| 34 | |||
| 35 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 36 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 37 | require "x86asm.pl"; | ||
| 38 | |||
| 39 | &asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386"); | ||
| 40 | |||
| 41 | $sse2=0; | ||
| 42 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
| 43 | |||
| 44 | &external_label("OPENSSL_ia32cap_P") if ($sse2); | ||
| 45 | |||
| 46 | $a="eax"; | ||
| 47 | $b="ebx"; | ||
| 48 | ($a1,$a2,$a4)=("ecx","edx","ebp"); | ||
| 49 | |||
| 50 | $R="mm0"; | ||
| 51 | @T=("mm1","mm2"); | ||
| 52 | ($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5"); | ||
| 53 | @i=("esi","edi"); | ||
| 54 | |||
| 55 | if (!$x86only) { | ||
| 56 | &function_begin_B("_mul_1x1_mmx"); | ||
| 57 | &sub ("esp",32+4); | ||
| 58 | &mov ($a1,$a); | ||
| 59 | &lea ($a2,&DWP(0,$a,$a)); | ||
| 60 | &and ($a1,0x3fffffff); | ||
| 61 | &lea ($a4,&DWP(0,$a2,$a2)); | ||
| 62 | &mov (&DWP(0*4,"esp"),0); | ||
| 63 | &and ($a2,0x7fffffff); | ||
| 64 | &movd ($A,$a); | ||
| 65 | &movd ($B,$b); | ||
| 66 | &mov (&DWP(1*4,"esp"),$a1); # a1 | ||
| 67 | &xor ($a1,$a2); # a1^a2 | ||
| 68 | &pxor ($B31,$B31); | ||
| 69 | &pxor ($B30,$B30); | ||
| 70 | &mov (&DWP(2*4,"esp"),$a2); # a2 | ||
| 71 | &xor ($a2,$a4); # a2^a4 | ||
| 72 | &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | ||
| 73 | &pcmpgtd($B31,$A); # broadcast 31st bit | ||
| 74 | &paddd ($A,$A); # $A<<=1 | ||
| 75 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | ||
| 76 | &mov (&DWP(4*4,"esp"),$a4); # a4 | ||
| 77 | &xor ($a4,$a2); # a2=a4^a2^a4 | ||
| 78 | &pand ($B31,$B); | ||
| 79 | &pcmpgtd($B30,$A); # broadcast 30th bit | ||
| 80 | &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | ||
| 81 | &xor ($a4,$a1); # a1^a2^a4 | ||
| 82 | &psllq ($B31,31); | ||
| 83 | &pand ($B30,$B); | ||
| 84 | &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | ||
| 85 | &mov (@i[0],0x7); | ||
| 86 | &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | ||
| 87 | &mov ($a4,@i[0]); | ||
| 88 | &and (@i[0],$b); | ||
| 89 | &shr ($b,3); | ||
| 90 | &mov (@i[1],$a4); | ||
| 91 | &psllq ($B30,30); | ||
| 92 | &and (@i[1],$b); | ||
| 93 | &shr ($b,3); | ||
| 94 | &movd ($R,&DWP(0,"esp",@i[0],4)); | ||
| 95 | &mov (@i[0],$a4); | ||
| 96 | &and (@i[0],$b); | ||
| 97 | &shr ($b,3); | ||
| 98 | for($n=1;$n<9;$n++) { | ||
| 99 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | ||
| 100 | &mov (@i[1],$a4); | ||
| 101 | &psllq (@T[1],3*$n); | ||
| 102 | &and (@i[1],$b); | ||
| 103 | &shr ($b,3); | ||
| 104 | &pxor ($R,@T[1]); | ||
| 105 | |||
| 106 | push(@i,shift(@i)); push(@T,shift(@T)); | ||
| 107 | } | ||
| 108 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | ||
| 109 | &pxor ($R,$B30); | ||
| 110 | &psllq (@T[1],3*$n++); | ||
| 111 | &pxor ($R,@T[1]); | ||
| 112 | |||
| 113 | &movd (@T[0],&DWP(0,"esp",@i[0],4)); | ||
| 114 | &pxor ($R,$B31); | ||
| 115 | &psllq (@T[0],3*$n); | ||
| 116 | &add ("esp",32+4); | ||
| 117 | &pxor ($R,@T[0]); | ||
| 118 | &ret (); | ||
| 119 | &function_end_B("_mul_1x1_mmx"); | ||
| 120 | } | ||
| 121 | |||
| 122 | ($lo,$hi)=("eax","edx"); | ||
| 123 | @T=("ecx","ebp"); | ||
| 124 | |||
| 125 | &function_begin_B("_mul_1x1_ialu"); | ||
| 126 | &sub ("esp",32+4); | ||
| 127 | &mov ($a1,$a); | ||
| 128 | &lea ($a2,&DWP(0,$a,$a)); | ||
| 129 | &lea ($a4,&DWP(0,"",$a,4)); | ||
| 130 | &and ($a1,0x3fffffff); | ||
| 131 | &lea (@i[1],&DWP(0,$lo,$lo)); | ||
| 132 | &sar ($lo,31); # broadcast 31st bit | ||
| 133 | &mov (&DWP(0*4,"esp"),0); | ||
| 134 | &and ($a2,0x7fffffff); | ||
| 135 | &mov (&DWP(1*4,"esp"),$a1); # a1 | ||
| 136 | &xor ($a1,$a2); # a1^a2 | ||
| 137 | &mov (&DWP(2*4,"esp"),$a2); # a2 | ||
| 138 | &xor ($a2,$a4); # a2^a4 | ||
| 139 | &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | ||
| 140 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | ||
| 141 | &mov (&DWP(4*4,"esp"),$a4); # a4 | ||
| 142 | &xor ($a4,$a2); # a2=a4^a2^a4 | ||
| 143 | &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | ||
| 144 | &xor ($a4,$a1); # a1^a2^a4 | ||
| 145 | &sar (@i[1],31); # broardcast 30th bit | ||
| 146 | &and ($lo,$b); | ||
| 147 | &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | ||
| 148 | &and (@i[1],$b); | ||
| 149 | &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | ||
| 150 | &mov ($hi,$lo); | ||
| 151 | &shl ($lo,31); | ||
| 152 | &mov (@T[0],@i[1]); | ||
| 153 | &shr ($hi,1); | ||
| 154 | |||
| 155 | &mov (@i[0],0x7); | ||
| 156 | &shl (@i[1],30); | ||
| 157 | &and (@i[0],$b); | ||
| 158 | &shr (@T[0],2); | ||
| 159 | &xor ($lo,@i[1]); | ||
| 160 | |||
| 161 | &shr ($b,3); | ||
| 162 | &mov (@i[1],0x7); # 5-byte instruction!? | ||
| 163 | &and (@i[1],$b); | ||
| 164 | &shr ($b,3); | ||
| 165 | &xor ($hi,@T[0]); | ||
| 166 | &xor ($lo,&DWP(0,"esp",@i[0],4)); | ||
| 167 | &mov (@i[0],0x7); | ||
| 168 | &and (@i[0],$b); | ||
| 169 | &shr ($b,3); | ||
| 170 | for($n=1;$n<9;$n++) { | ||
| 171 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | ||
| 172 | &mov (@i[1],0x7); | ||
| 173 | &mov (@T[0],@T[1]); | ||
| 174 | &shl (@T[1],3*$n); | ||
| 175 | &and (@i[1],$b); | ||
| 176 | &shr (@T[0],32-3*$n); | ||
| 177 | &xor ($lo,@T[1]); | ||
| 178 | &shr ($b,3); | ||
| 179 | &xor ($hi,@T[0]); | ||
| 180 | |||
| 181 | push(@i,shift(@i)); push(@T,shift(@T)); | ||
| 182 | } | ||
| 183 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | ||
| 184 | &mov (@T[0],@T[1]); | ||
| 185 | &shl (@T[1],3*$n); | ||
| 186 | &mov (@i[1],&DWP(0,"esp",@i[0],4)); | ||
| 187 | &shr (@T[0],32-3*$n); $n++; | ||
| 188 | &mov (@i[0],@i[1]); | ||
| 189 | &xor ($lo,@T[1]); | ||
| 190 | &shl (@i[1],3*$n); | ||
| 191 | &xor ($hi,@T[0]); | ||
| 192 | &shr (@i[0],32-3*$n); | ||
| 193 | &xor ($lo,@i[1]); | ||
| 194 | &xor ($hi,@i[0]); | ||
| 195 | |||
| 196 | &add ("esp",32+4); | ||
| 197 | &ret (); | ||
| 198 | &function_end_B("_mul_1x1_ialu"); | ||
| 199 | |||
| 200 | # void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); | ||
| 201 | &function_begin_B("bn_GF2m_mul_2x2"); | ||
| 202 | if (!$x86only) { | ||
| 203 | &picmeup("edx","OPENSSL_ia32cap_P"); | ||
| 204 | &mov ("eax",&DWP(0,"edx")); | ||
| 205 | &mov ("edx",&DWP(4,"edx")); | ||
| 206 | &test ("eax",1<<23); # check MMX bit | ||
| 207 | &jz (&label("ialu")); | ||
| 208 | if ($sse2) { | ||
| 209 | &test ("eax",1<<24); # check FXSR bit | ||
| 210 | &jz (&label("mmx")); | ||
| 211 | &test ("edx",1<<1); # check PCLMULQDQ bit | ||
| 212 | &jz (&label("mmx")); | ||
| 213 | |||
| 214 | &movups ("xmm0",&QWP(8,"esp")); | ||
| 215 | &shufps ("xmm0","xmm0",0b10110001); | ||
| 216 | &pclmulqdq ("xmm0","xmm0",1); | ||
| 217 | &mov ("eax",&DWP(4,"esp")); | ||
| 218 | &movups (&QWP(0,"eax"),"xmm0"); | ||
| 219 | &ret (); | ||
| 220 | |||
| 221 | &set_label("mmx",16); | ||
| 222 | } | ||
| 223 | &push ("ebp"); | ||
| 224 | &push ("ebx"); | ||
| 225 | &push ("esi"); | ||
| 226 | &push ("edi"); | ||
| 227 | &mov ($a,&wparam(1)); | ||
| 228 | &mov ($b,&wparam(3)); | ||
| 229 | &call ("_mul_1x1_mmx"); # a1·b1 | ||
| 230 | &movq ("mm7",$R); | ||
| 231 | |||
| 232 | &mov ($a,&wparam(2)); | ||
| 233 | &mov ($b,&wparam(4)); | ||
| 234 | &call ("_mul_1x1_mmx"); # a0·b0 | ||
| 235 | &movq ("mm6",$R); | ||
| 236 | |||
| 237 | &mov ($a,&wparam(1)); | ||
| 238 | &mov ($b,&wparam(3)); | ||
| 239 | &xor ($a,&wparam(2)); | ||
| 240 | &xor ($b,&wparam(4)); | ||
| 241 | &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) | ||
| 242 | &pxor ($R,"mm7"); | ||
| 243 | &mov ($a,&wparam(0)); | ||
| 244 | &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0 | ||
| 245 | |||
| 246 | &movq ($A,$R); | ||
| 247 | &psllq ($R,32); | ||
| 248 | &pop ("edi"); | ||
| 249 | &psrlq ($A,32); | ||
| 250 | &pop ("esi"); | ||
| 251 | &pxor ($R,"mm6"); | ||
| 252 | &pop ("ebx"); | ||
| 253 | &pxor ($A,"mm7"); | ||
| 254 | &movq (&QWP(0,$a),$R); | ||
| 255 | &pop ("ebp"); | ||
| 256 | &movq (&QWP(8,$a),$A); | ||
| 257 | &emms (); | ||
| 258 | &ret (); | ||
| 259 | &set_label("ialu",16); | ||
| 260 | } | ||
| 261 | &push ("ebp"); | ||
| 262 | &push ("ebx"); | ||
| 263 | &push ("esi"); | ||
| 264 | &push ("edi"); | ||
| 265 | &stack_push(4+1); | ||
| 266 | |||
| 267 | &mov ($a,&wparam(1)); | ||
| 268 | &mov ($b,&wparam(3)); | ||
| 269 | &call ("_mul_1x1_ialu"); # a1·b1 | ||
| 270 | &mov (&DWP(8,"esp"),$lo); | ||
| 271 | &mov (&DWP(12,"esp"),$hi); | ||
| 272 | |||
| 273 | &mov ($a,&wparam(2)); | ||
| 274 | &mov ($b,&wparam(4)); | ||
| 275 | &call ("_mul_1x1_ialu"); # a0·b0 | ||
| 276 | &mov (&DWP(0,"esp"),$lo); | ||
| 277 | &mov (&DWP(4,"esp"),$hi); | ||
| 278 | |||
| 279 | &mov ($a,&wparam(1)); | ||
| 280 | &mov ($b,&wparam(3)); | ||
| 281 | &xor ($a,&wparam(2)); | ||
| 282 | &xor ($b,&wparam(4)); | ||
| 283 | &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) | ||
| 284 | |||
| 285 | &mov ("ebp",&wparam(0)); | ||
| 286 | @r=("ebx","ecx","edi","esi"); | ||
| 287 | &mov (@r[0],&DWP(0,"esp")); | ||
| 288 | &mov (@r[1],&DWP(4,"esp")); | ||
| 289 | &mov (@r[2],&DWP(8,"esp")); | ||
| 290 | &mov (@r[3],&DWP(12,"esp")); | ||
| 291 | |||
| 292 | &xor ($lo,$hi); | ||
| 293 | &xor ($hi,@r[1]); | ||
| 294 | &xor ($lo,@r[0]); | ||
| 295 | &mov (&DWP(0,"ebp"),@r[0]); | ||
| 296 | &xor ($hi,@r[2]); | ||
| 297 | &mov (&DWP(12,"ebp"),@r[3]); | ||
| 298 | &xor ($lo,@r[3]); | ||
| 299 | &stack_pop(4+1); | ||
| 300 | &xor ($hi,@r[3]); | ||
| 301 | &pop ("edi"); | ||
| 302 | &xor ($lo,$hi); | ||
| 303 | &pop ("esi"); | ||
| 304 | &mov (&DWP(8,"ebp"),$hi); | ||
| 305 | &pop ("ebx"); | ||
| 306 | &mov (&DWP(4,"ebp"),$lo); | ||
| 307 | &pop ("ebp"); | ||
| 308 | &ret (); | ||
| 309 | &function_end_B("bn_GF2m_mul_2x2"); | ||
| 310 | |||
| 311 | &asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 312 | |||
| 313 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl new file mode 100644 index 0000000000..1658acbbdd --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl | |||
| @@ -0,0 +1,389 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
| 13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
| 14 | # the time being... Except that it has two code paths: code suitable | ||
| 15 | # for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and | ||
| 16 | # later. Improvement varies from one benchmark and µ-arch to another. | ||
| 17 | # Vanilla code path is at most 20% faster than compiler-generated code | ||
| 18 | # [not very impressive], while PCLMULQDQ - whole 85%-160% better on | ||
| 19 | # 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that | ||
| 20 | # these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not | ||
| 21 | # all CPU time is burnt in it... | ||
| 22 | |||
| 23 | $flavour = shift; | ||
| 24 | $output = shift; | ||
| 25 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 26 | |||
| 27 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 28 | |||
| 29 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 30 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 31 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 32 | die "can't locate x86_64-xlate.pl"; | ||
| 33 | |||
| 34 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 35 | |||
| 36 | ($lo,$hi)=("%rax","%rdx"); $a=$lo; | ||
| 37 | ($i0,$i1)=("%rsi","%rdi"); | ||
| 38 | ($t0,$t1)=("%rbx","%rcx"); | ||
| 39 | ($b,$mask)=("%rbp","%r8"); | ||
| 40 | ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15)); | ||
| 41 | ($R,$Tx)=("%xmm0","%xmm1"); | ||
| 42 | |||
| 43 | $code.=<<___; | ||
| 44 | .text | ||
| 45 | |||
| 46 | .type _mul_1x1,\@abi-omnipotent | ||
| 47 | .align 16 | ||
| 48 | _mul_1x1: | ||
| 49 | sub \$128+8,%rsp | ||
| 50 | mov \$-1,$a1 | ||
| 51 | lea ($a,$a),$i0 | ||
| 52 | shr \$3,$a1 | ||
| 53 | lea (,$a,4),$i1 | ||
| 54 | and $a,$a1 # a1=a&0x1fffffffffffffff | ||
| 55 | lea (,$a,8),$a8 | ||
| 56 | sar \$63,$a # broadcast 63rd bit | ||
| 57 | lea ($a1,$a1),$a2 | ||
| 58 | sar \$63,$i0 # broadcast 62nd bit | ||
| 59 | lea (,$a1,4),$a4 | ||
| 60 | and $b,$a | ||
| 61 | sar \$63,$i1 # boardcast 61st bit | ||
| 62 | mov $a,$hi # $a is $lo | ||
| 63 | shl \$63,$lo | ||
| 64 | and $b,$i0 | ||
| 65 | shr \$1,$hi | ||
| 66 | mov $i0,$t1 | ||
| 67 | shl \$62,$i0 | ||
| 68 | and $b,$i1 | ||
| 69 | shr \$2,$t1 | ||
| 70 | xor $i0,$lo | ||
| 71 | mov $i1,$t0 | ||
| 72 | shl \$61,$i1 | ||
| 73 | xor $t1,$hi | ||
| 74 | shr \$3,$t0 | ||
| 75 | xor $i1,$lo | ||
| 76 | xor $t0,$hi | ||
| 77 | |||
| 78 | mov $a1,$a12 | ||
| 79 | movq \$0,0(%rsp) # tab[0]=0 | ||
| 80 | xor $a2,$a12 # a1^a2 | ||
| 81 | mov $a1,8(%rsp) # tab[1]=a1 | ||
| 82 | mov $a4,$a48 | ||
| 83 | mov $a2,16(%rsp) # tab[2]=a2 | ||
| 84 | xor $a8,$a48 # a4^a8 | ||
| 85 | mov $a12,24(%rsp) # tab[3]=a1^a2 | ||
| 86 | |||
| 87 | xor $a4,$a1 | ||
| 88 | mov $a4,32(%rsp) # tab[4]=a4 | ||
| 89 | xor $a4,$a2 | ||
| 90 | mov $a1,40(%rsp) # tab[5]=a1^a4 | ||
| 91 | xor $a4,$a12 | ||
| 92 | mov $a2,48(%rsp) # tab[6]=a2^a4 | ||
| 93 | xor $a48,$a1 # a1^a4^a4^a8=a1^a8 | ||
| 94 | mov $a12,56(%rsp) # tab[7]=a1^a2^a4 | ||
| 95 | xor $a48,$a2 # a2^a4^a4^a8=a1^a8 | ||
| 96 | |||
| 97 | mov $a8,64(%rsp) # tab[8]=a8 | ||
| 98 | xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8 | ||
| 99 | mov $a1,72(%rsp) # tab[9]=a1^a8 | ||
| 100 | xor $a4,$a1 # a1^a8^a4 | ||
| 101 | mov $a2,80(%rsp) # tab[10]=a2^a8 | ||
| 102 | xor $a4,$a2 # a2^a8^a4 | ||
| 103 | mov $a12,88(%rsp) # tab[11]=a1^a2^a8 | ||
| 104 | |||
| 105 | xor $a4,$a12 # a1^a2^a8^a4 | ||
| 106 | mov $a48,96(%rsp) # tab[12]=a4^a8 | ||
| 107 | mov $mask,$i0 | ||
| 108 | mov $a1,104(%rsp) # tab[13]=a1^a4^a8 | ||
| 109 | and $b,$i0 | ||
| 110 | mov $a2,112(%rsp) # tab[14]=a2^a4^a8 | ||
| 111 | shr \$4,$b | ||
| 112 | mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8 | ||
| 113 | mov $mask,$i1 | ||
| 114 | and $b,$i1 | ||
| 115 | shr \$4,$b | ||
| 116 | |||
| 117 | movq (%rsp,$i0,8),$R # half of calculations is done in SSE2 | ||
| 118 | mov $mask,$i0 | ||
| 119 | and $b,$i0 | ||
| 120 | shr \$4,$b | ||
| 121 | ___ | ||
| 122 | for ($n=1;$n<8;$n++) { | ||
| 123 | $code.=<<___; | ||
| 124 | mov (%rsp,$i1,8),$t1 | ||
| 125 | mov $mask,$i1 | ||
| 126 | mov $t1,$t0 | ||
| 127 | shl \$`8*$n-4`,$t1 | ||
| 128 | and $b,$i1 | ||
| 129 | movq (%rsp,$i0,8),$Tx | ||
| 130 | shr \$`64-(8*$n-4)`,$t0 | ||
| 131 | xor $t1,$lo | ||
| 132 | pslldq \$$n,$Tx | ||
| 133 | mov $mask,$i0 | ||
| 134 | shr \$4,$b | ||
| 135 | xor $t0,$hi | ||
| 136 | and $b,$i0 | ||
| 137 | shr \$4,$b | ||
| 138 | pxor $Tx,$R | ||
| 139 | ___ | ||
| 140 | } | ||
| 141 | $code.=<<___; | ||
| 142 | mov (%rsp,$i1,8),$t1 | ||
| 143 | mov $t1,$t0 | ||
| 144 | shl \$`8*$n-4`,$t1 | ||
| 145 | movq $R,$i0 | ||
| 146 | shr \$`64-(8*$n-4)`,$t0 | ||
| 147 | xor $t1,$lo | ||
| 148 | psrldq \$8,$R | ||
| 149 | xor $t0,$hi | ||
| 150 | movq $R,$i1 | ||
| 151 | xor $i0,$lo | ||
| 152 | xor $i1,$hi | ||
| 153 | |||
| 154 | add \$128+8,%rsp | ||
| 155 | ret | ||
| 156 | .Lend_mul_1x1: | ||
| 157 | .size _mul_1x1,.-_mul_1x1 | ||
| 158 | ___ | ||
| 159 | |||
| 160 | ($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order | ||
| 161 | ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order | ||
| 162 | |||
| 163 | $code.=<<___; | ||
| 164 | .extern OPENSSL_ia32cap_P | ||
| 165 | .globl bn_GF2m_mul_2x2 | ||
| 166 | .type bn_GF2m_mul_2x2,\@abi-omnipotent | ||
| 167 | .align 16 | ||
| 168 | bn_GF2m_mul_2x2: | ||
| 169 | mov OPENSSL_ia32cap_P(%rip),%rax | ||
| 170 | bt \$33,%rax | ||
| 171 | jnc .Lvanilla_mul_2x2 | ||
| 172 | |||
| 173 | movq $a1,%xmm0 | ||
| 174 | movq $b1,%xmm1 | ||
| 175 | movq $a0,%xmm2 | ||
| 176 | ___ | ||
| 177 | $code.=<<___ if ($win64); | ||
| 178 | movq 40(%rsp),%xmm3 | ||
| 179 | ___ | ||
| 180 | $code.=<<___ if (!$win64); | ||
| 181 | movq $b0,%xmm3 | ||
| 182 | ___ | ||
| 183 | $code.=<<___; | ||
| 184 | movdqa %xmm0,%xmm4 | ||
| 185 | movdqa %xmm1,%xmm5 | ||
| 186 | pclmulqdq \$0,%xmm1,%xmm0 # a1·b1 | ||
| 187 | pxor %xmm2,%xmm4 | ||
| 188 | pxor %xmm3,%xmm5 | ||
| 189 | pclmulqdq \$0,%xmm3,%xmm2 # a0·b0 | ||
| 190 | pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1) | ||
| 191 | xorps %xmm0,%xmm4 | ||
| 192 | xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1 | ||
| 193 | movdqa %xmm4,%xmm5 | ||
| 194 | pslldq \$8,%xmm4 | ||
| 195 | psrldq \$8,%xmm5 | ||
| 196 | pxor %xmm4,%xmm2 | ||
| 197 | pxor %xmm5,%xmm0 | ||
| 198 | movdqu %xmm2,0($rp) | ||
| 199 | movdqu %xmm0,16($rp) | ||
| 200 | ret | ||
| 201 | |||
| 202 | .align 16 | ||
| 203 | .Lvanilla_mul_2x2: | ||
| 204 | lea -8*17(%rsp),%rsp | ||
| 205 | ___ | ||
| 206 | $code.=<<___ if ($win64); | ||
| 207 | mov `8*17+40`(%rsp),$b0 | ||
| 208 | mov %rdi,8*15(%rsp) | ||
| 209 | mov %rsi,8*16(%rsp) | ||
| 210 | ___ | ||
| 211 | $code.=<<___; | ||
| 212 | mov %r14,8*10(%rsp) | ||
| 213 | mov %r13,8*11(%rsp) | ||
| 214 | mov %r12,8*12(%rsp) | ||
| 215 | mov %rbp,8*13(%rsp) | ||
| 216 | mov %rbx,8*14(%rsp) | ||
| 217 | .Lbody_mul_2x2: | ||
| 218 | mov $rp,32(%rsp) # save the arguments | ||
| 219 | mov $a1,40(%rsp) | ||
| 220 | mov $a0,48(%rsp) | ||
| 221 | mov $b1,56(%rsp) | ||
| 222 | mov $b0,64(%rsp) | ||
| 223 | |||
| 224 | mov \$0xf,$mask | ||
| 225 | mov $a1,$a | ||
| 226 | mov $b1,$b | ||
| 227 | call _mul_1x1 # a1·b1 | ||
| 228 | mov $lo,16(%rsp) | ||
| 229 | mov $hi,24(%rsp) | ||
| 230 | |||
| 231 | mov 48(%rsp),$a | ||
| 232 | mov 64(%rsp),$b | ||
| 233 | call _mul_1x1 # a0·b0 | ||
| 234 | mov $lo,0(%rsp) | ||
| 235 | mov $hi,8(%rsp) | ||
| 236 | |||
| 237 | mov 40(%rsp),$a | ||
| 238 | mov 56(%rsp),$b | ||
| 239 | xor 48(%rsp),$a | ||
| 240 | xor 64(%rsp),$b | ||
| 241 | call _mul_1x1 # (a0+a1)·(b0+b1) | ||
| 242 | ___ | ||
| 243 | @r=("%rbx","%rcx","%rdi","%rsi"); | ||
| 244 | $code.=<<___; | ||
| 245 | mov 0(%rsp),@r[0] | ||
| 246 | mov 8(%rsp),@r[1] | ||
| 247 | mov 16(%rsp),@r[2] | ||
| 248 | mov 24(%rsp),@r[3] | ||
| 249 | mov 32(%rsp),%rbp | ||
| 250 | |||
| 251 | xor $hi,$lo | ||
| 252 | xor @r[1],$hi | ||
| 253 | xor @r[0],$lo | ||
| 254 | mov @r[0],0(%rbp) | ||
| 255 | xor @r[2],$hi | ||
| 256 | mov @r[3],24(%rbp) | ||
| 257 | xor @r[3],$lo | ||
| 258 | xor @r[3],$hi | ||
| 259 | xor $hi,$lo | ||
| 260 | mov $hi,16(%rbp) | ||
| 261 | mov $lo,8(%rbp) | ||
| 262 | |||
| 263 | mov 8*10(%rsp),%r14 | ||
| 264 | mov 8*11(%rsp),%r13 | ||
| 265 | mov 8*12(%rsp),%r12 | ||
| 266 | mov 8*13(%rsp),%rbp | ||
| 267 | mov 8*14(%rsp),%rbx | ||
| 268 | ___ | ||
| 269 | $code.=<<___ if ($win64); | ||
| 270 | mov 8*15(%rsp),%rdi | ||
| 271 | mov 8*16(%rsp),%rsi | ||
| 272 | ___ | ||
| 273 | $code.=<<___; | ||
| 274 | lea 8*17(%rsp),%rsp | ||
| 275 | ret | ||
| 276 | .Lend_mul_2x2: | ||
| 277 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
| 278 | .asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 279 | .align 16 | ||
| 280 | ___ | ||
| 281 | |||
| 282 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 283 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 284 | if ($win64) { | ||
| 285 | $rec="%rcx"; | ||
| 286 | $frame="%rdx"; | ||
| 287 | $context="%r8"; | ||
| 288 | $disp="%r9"; | ||
| 289 | |||
| 290 | $code.=<<___; | ||
| 291 | .extern __imp_RtlVirtualUnwind | ||
| 292 | |||
| 293 | .type se_handler,\@abi-omnipotent | ||
| 294 | .align 16 | ||
| 295 | se_handler: | ||
| 296 | push %rsi | ||
| 297 | push %rdi | ||
| 298 | push %rbx | ||
| 299 | push %rbp | ||
| 300 | push %r12 | ||
| 301 | push %r13 | ||
| 302 | push %r14 | ||
| 303 | push %r15 | ||
| 304 | pushfq | ||
| 305 | sub \$64,%rsp | ||
| 306 | |||
| 307 | mov 152($context),%rax # pull context->Rsp | ||
| 308 | mov 248($context),%rbx # pull context->Rip | ||
| 309 | |||
| 310 | lea .Lbody_mul_2x2(%rip),%r10 | ||
| 311 | cmp %r10,%rbx # context->Rip<"prologue" label | ||
| 312 | jb .Lin_prologue | ||
| 313 | |||
| 314 | mov 8*10(%rax),%r14 # mimic epilogue | ||
| 315 | mov 8*11(%rax),%r13 | ||
| 316 | mov 8*12(%rax),%r12 | ||
| 317 | mov 8*13(%rax),%rbp | ||
| 318 | mov 8*14(%rax),%rbx | ||
| 319 | mov 8*15(%rax),%rdi | ||
| 320 | mov 8*16(%rax),%rsi | ||
| 321 | |||
| 322 | mov %rbx,144($context) # restore context->Rbx | ||
| 323 | mov %rbp,160($context) # restore context->Rbp | ||
| 324 | mov %rsi,168($context) # restore context->Rsi | ||
| 325 | mov %rdi,176($context) # restore context->Rdi | ||
| 326 | mov %r12,216($context) # restore context->R12 | ||
| 327 | mov %r13,224($context) # restore context->R13 | ||
| 328 | mov %r14,232($context) # restore context->R14 | ||
| 329 | |||
| 330 | .Lin_prologue: | ||
| 331 | lea 8*17(%rax),%rax | ||
| 332 | mov %rax,152($context) # restore context->Rsp | ||
| 333 | |||
| 334 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 335 | mov $context,%rsi # context | ||
| 336 | mov \$154,%ecx # sizeof(CONTEXT) | ||
| 337 | .long 0xa548f3fc # cld; rep movsq | ||
| 338 | |||
| 339 | mov $disp,%rsi | ||
| 340 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 341 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 342 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 343 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 344 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 345 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 346 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 347 | mov %r10,32(%rsp) # arg5 | ||
| 348 | mov %r11,40(%rsp) # arg6 | ||
| 349 | mov %r12,48(%rsp) # arg7 | ||
| 350 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 351 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 352 | |||
| 353 | mov \$1,%eax # ExceptionContinueSearch | ||
| 354 | add \$64,%rsp | ||
| 355 | popfq | ||
| 356 | pop %r15 | ||
| 357 | pop %r14 | ||
| 358 | pop %r13 | ||
| 359 | pop %r12 | ||
| 360 | pop %rbp | ||
| 361 | pop %rbx | ||
| 362 | pop %rdi | ||
| 363 | pop %rsi | ||
| 364 | ret | ||
| 365 | .size se_handler,.-se_handler | ||
| 366 | |||
| 367 | .section .pdata | ||
| 368 | .align 4 | ||
| 369 | .rva _mul_1x1 | ||
| 370 | .rva .Lend_mul_1x1 | ||
| 371 | .rva .LSEH_info_1x1 | ||
| 372 | |||
| 373 | .rva .Lvanilla_mul_2x2 | ||
| 374 | .rva .Lend_mul_2x2 | ||
| 375 | .rva .LSEH_info_2x2 | ||
| 376 | .section .xdata | ||
| 377 | .align 8 | ||
| 378 | .LSEH_info_1x1: | ||
| 379 | .byte 0x01,0x07,0x02,0x00 | ||
| 380 | .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8 | ||
| 381 | .LSEH_info_2x2: | ||
| 382 | .byte 9,0,0,0 | ||
| 383 | .rva se_handler | ||
| 384 | ___ | ||
| 385 | } | ||
| 386 | |||
| 387 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 388 | print $code; | ||
| 389 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl index 3b7a6f243f..5d79b35e1c 100755 --- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl +++ b/src/lib/libcrypto/bn/asm/x86_64-mont.pl | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | #!/usr/bin/env perl | 1 | #!/usr/bin/env perl |
| 2 | 2 | ||
| 3 | # ==================================================================== | 3 | # ==================================================================== |
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 5 | # project. The module is, however, dual licensed under OpenSSL and | 5 | # project. The module is, however, dual licensed under OpenSSL and |
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | 7 | # details see http://www.openssl.org/~appro/cryptogams/. |
| @@ -15,6 +15,20 @@ | |||
| 15 | # respectful 50%. It remains to be seen if loop unrolling and | 15 | # respectful 50%. It remains to be seen if loop unrolling and |
| 16 | # dedicated squaring routine can provide further improvement... | 16 | # dedicated squaring routine can provide further improvement... |
| 17 | 17 | ||
| 18 | # July 2011. | ||
| 19 | # | ||
| 20 | # Add dedicated squaring procedure. Performance improvement varies | ||
| 21 | # from platform to platform, but in average it's ~5%/15%/25%/33% | ||
| 22 | # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. | ||
| 23 | |||
| 24 | # August 2011. | ||
| 25 | # | ||
| 26 | # Unroll and modulo-schedule inner loops in such manner that they | ||
| 27 | # are "fallen through" for input lengths of 8, which is critical for | ||
| 28 | # 1024-bit RSA *sign*. Average performance improvement in comparison | ||
| 29 | # to *initial* version of this module from 2005 is ~0%/30%/40%/45% | ||
| 30 | # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. | ||
| 31 | |||
| 18 | $flavour = shift; | 32 | $flavour = shift; |
| 19 | $output = shift; | 33 | $output = shift; |
| 20 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | 34 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| @@ -37,7 +51,6 @@ $n0="%r8"; # const BN_ULONG *n0, | |||
| 37 | $num="%r9"; # int num); | 51 | $num="%r9"; # int num); |
| 38 | $lo0="%r10"; | 52 | $lo0="%r10"; |
| 39 | $hi0="%r11"; | 53 | $hi0="%r11"; |
| 40 | $bp="%r12"; # reassign $bp | ||
| 41 | $hi1="%r13"; | 54 | $hi1="%r13"; |
| 42 | $i="%r14"; | 55 | $i="%r14"; |
| 43 | $j="%r15"; | 56 | $j="%r15"; |
| @@ -51,6 +64,16 @@ $code=<<___; | |||
| 51 | .type bn_mul_mont,\@function,6 | 64 | .type bn_mul_mont,\@function,6 |
| 52 | .align 16 | 65 | .align 16 |
| 53 | bn_mul_mont: | 66 | bn_mul_mont: |
| 67 | test \$3,${num}d | ||
| 68 | jnz .Lmul_enter | ||
| 69 | cmp \$8,${num}d | ||
| 70 | jb .Lmul_enter | ||
| 71 | cmp $ap,$bp | ||
| 72 | jne .Lmul4x_enter | ||
| 73 | jmp .Lsqr4x_enter | ||
| 74 | |||
| 75 | .align 16 | ||
| 76 | .Lmul_enter: | ||
| 54 | push %rbx | 77 | push %rbx |
| 55 | push %rbp | 78 | push %rbp |
| 56 | push %r12 | 79 | push %r12 |
| @@ -66,48 +89,66 @@ bn_mul_mont: | |||
| 66 | and \$-1024,%rsp # minimize TLB usage | 89 | and \$-1024,%rsp # minimize TLB usage |
| 67 | 90 | ||
| 68 | mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp | 91 | mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp |
| 69 | .Lprologue: | 92 | .Lmul_body: |
| 70 | mov %rdx,$bp # $bp reassigned, remember? | 93 | mov $bp,%r12 # reassign $bp |
| 71 | 94 | ___ | |
| 95 | $bp="%r12"; | ||
| 96 | $code.=<<___; | ||
| 72 | mov ($n0),$n0 # pull n0[0] value | 97 | mov ($n0),$n0 # pull n0[0] value |
| 98 | mov ($bp),$m0 # m0=bp[0] | ||
| 99 | mov ($ap),%rax | ||
| 73 | 100 | ||
| 74 | xor $i,$i # i=0 | 101 | xor $i,$i # i=0 |
| 75 | xor $j,$j # j=0 | 102 | xor $j,$j # j=0 |
| 76 | 103 | ||
| 77 | mov ($bp),$m0 # m0=bp[0] | 104 | mov $n0,$m1 |
| 78 | mov ($ap),%rax | ||
| 79 | mulq $m0 # ap[0]*bp[0] | 105 | mulq $m0 # ap[0]*bp[0] |
| 80 | mov %rax,$lo0 | 106 | mov %rax,$lo0 |
| 81 | mov %rdx,$hi0 | 107 | mov ($np),%rax |
| 82 | 108 | ||
| 83 | imulq $n0,%rax # "tp[0]"*n0 | 109 | imulq $lo0,$m1 # "tp[0]"*n0 |
| 84 | mov %rax,$m1 | 110 | mov %rdx,$hi0 |
| 85 | 111 | ||
| 86 | mulq ($np) # np[0]*m1 | 112 | mulq $m1 # np[0]*m1 |
| 87 | add $lo0,%rax # discarded | 113 | add %rax,$lo0 # discarded |
| 114 | mov 8($ap),%rax | ||
| 88 | adc \$0,%rdx | 115 | adc \$0,%rdx |
| 89 | mov %rdx,$hi1 | 116 | mov %rdx,$hi1 |
| 90 | 117 | ||
| 91 | lea 1($j),$j # j++ | 118 | lea 1($j),$j # j++ |
| 119 | jmp .L1st_enter | ||
| 120 | |||
| 121 | .align 16 | ||
| 92 | .L1st: | 122 | .L1st: |
| 123 | add %rax,$hi1 | ||
| 93 | mov ($ap,$j,8),%rax | 124 | mov ($ap,$j,8),%rax |
| 94 | mulq $m0 # ap[j]*bp[0] | ||
| 95 | add $hi0,%rax | ||
| 96 | adc \$0,%rdx | 125 | adc \$0,%rdx |
| 97 | mov %rax,$lo0 | 126 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] |
| 127 | mov $lo0,$hi0 | ||
| 128 | adc \$0,%rdx | ||
| 129 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
| 130 | mov %rdx,$hi1 | ||
| 131 | |||
| 132 | .L1st_enter: | ||
| 133 | mulq $m0 # ap[j]*bp[0] | ||
| 134 | add %rax,$hi0 | ||
| 98 | mov ($np,$j,8),%rax | 135 | mov ($np,$j,8),%rax |
| 99 | mov %rdx,$hi0 | 136 | adc \$0,%rdx |
| 137 | lea 1($j),$j # j++ | ||
| 138 | mov %rdx,$lo0 | ||
| 100 | 139 | ||
| 101 | mulq $m1 # np[j]*m1 | 140 | mulq $m1 # np[j]*m1 |
| 102 | add $hi1,%rax | 141 | cmp $num,$j |
| 103 | lea 1($j),$j # j++ | 142 | jne .L1st |
| 143 | |||
| 144 | add %rax,$hi1 | ||
| 145 | mov ($ap),%rax # ap[0] | ||
| 104 | adc \$0,%rdx | 146 | adc \$0,%rdx |
| 105 | add $lo0,%rax # np[j]*m1+ap[j]*bp[0] | 147 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] |
| 106 | adc \$0,%rdx | 148 | adc \$0,%rdx |
| 107 | mov %rax,-16(%rsp,$j,8) # tp[j-1] | 149 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] |
| 108 | cmp $num,$j | ||
| 109 | mov %rdx,$hi1 | 150 | mov %rdx,$hi1 |
| 110 | jl .L1st | 151 | mov $lo0,$hi0 |
| 111 | 152 | ||
| 112 | xor %rdx,%rdx | 153 | xor %rdx,%rdx |
| 113 | add $hi0,$hi1 | 154 | add $hi0,$hi1 |
| @@ -116,50 +157,64 @@ bn_mul_mont: | |||
| 116 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | 157 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit |
| 117 | 158 | ||
| 118 | lea 1($i),$i # i++ | 159 | lea 1($i),$i # i++ |
| 119 | .align 4 | 160 | jmp .Louter |
| 161 | .align 16 | ||
| 120 | .Louter: | 162 | .Louter: |
| 121 | xor $j,$j # j=0 | ||
| 122 | |||
| 123 | mov ($bp,$i,8),$m0 # m0=bp[i] | 163 | mov ($bp,$i,8),$m0 # m0=bp[i] |
| 124 | mov ($ap),%rax # ap[0] | 164 | xor $j,$j # j=0 |
| 165 | mov $n0,$m1 | ||
| 166 | mov (%rsp),$lo0 | ||
| 125 | mulq $m0 # ap[0]*bp[i] | 167 | mulq $m0 # ap[0]*bp[i] |
| 126 | add (%rsp),%rax # ap[0]*bp[i]+tp[0] | 168 | add %rax,$lo0 # ap[0]*bp[i]+tp[0] |
| 169 | mov ($np),%rax | ||
| 127 | adc \$0,%rdx | 170 | adc \$0,%rdx |
| 128 | mov %rax,$lo0 | ||
| 129 | mov %rdx,$hi0 | ||
| 130 | 171 | ||
| 131 | imulq $n0,%rax # tp[0]*n0 | 172 | imulq $lo0,$m1 # tp[0]*n0 |
| 132 | mov %rax,$m1 | 173 | mov %rdx,$hi0 |
| 133 | 174 | ||
| 134 | mulq ($np,$j,8) # np[0]*m1 | 175 | mulq $m1 # np[0]*m1 |
| 135 | add $lo0,%rax # discarded | 176 | add %rax,$lo0 # discarded |
| 136 | mov 8(%rsp),$lo0 # tp[1] | 177 | mov 8($ap),%rax |
| 137 | adc \$0,%rdx | 178 | adc \$0,%rdx |
| 179 | mov 8(%rsp),$lo0 # tp[1] | ||
| 138 | mov %rdx,$hi1 | 180 | mov %rdx,$hi1 |
| 139 | 181 | ||
| 140 | lea 1($j),$j # j++ | 182 | lea 1($j),$j # j++ |
| 141 | .align 4 | 183 | jmp .Linner_enter |
| 184 | |||
| 185 | .align 16 | ||
| 142 | .Linner: | 186 | .Linner: |
| 187 | add %rax,$hi1 | ||
| 143 | mov ($ap,$j,8),%rax | 188 | mov ($ap,$j,8),%rax |
| 144 | mulq $m0 # ap[j]*bp[i] | ||
| 145 | add $hi0,%rax | ||
| 146 | adc \$0,%rdx | 189 | adc \$0,%rdx |
| 147 | add %rax,$lo0 # ap[j]*bp[i]+tp[j] | 190 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] |
| 191 | mov (%rsp,$j,8),$lo0 | ||
| 192 | adc \$0,%rdx | ||
| 193 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
| 194 | mov %rdx,$hi1 | ||
| 195 | |||
| 196 | .Linner_enter: | ||
| 197 | mulq $m0 # ap[j]*bp[i] | ||
| 198 | add %rax,$hi0 | ||
| 148 | mov ($np,$j,8),%rax | 199 | mov ($np,$j,8),%rax |
| 149 | adc \$0,%rdx | 200 | adc \$0,%rdx |
| 201 | add $hi0,$lo0 # ap[j]*bp[i]+tp[j] | ||
| 150 | mov %rdx,$hi0 | 202 | mov %rdx,$hi0 |
| 203 | adc \$0,$hi0 | ||
| 204 | lea 1($j),$j # j++ | ||
| 151 | 205 | ||
| 152 | mulq $m1 # np[j]*m1 | 206 | mulq $m1 # np[j]*m1 |
| 153 | add $hi1,%rax | 207 | cmp $num,$j |
| 154 | lea 1($j),$j # j++ | 208 | jne .Linner |
| 155 | adc \$0,%rdx | 209 | |
| 156 | add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j] | 210 | add %rax,$hi1 |
| 211 | mov ($ap),%rax # ap[0] | ||
| 157 | adc \$0,%rdx | 212 | adc \$0,%rdx |
| 213 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 158 | mov (%rsp,$j,8),$lo0 | 214 | mov (%rsp,$j,8),$lo0 |
| 159 | cmp $num,$j | 215 | adc \$0,%rdx |
| 160 | mov %rax,-16(%rsp,$j,8) # tp[j-1] | 216 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] |
| 161 | mov %rdx,$hi1 | 217 | mov %rdx,$hi1 |
| 162 | jl .Linner | ||
| 163 | 218 | ||
| 164 | xor %rdx,%rdx | 219 | xor %rdx,%rdx |
| 165 | add $hi0,$hi1 | 220 | add $hi0,$hi1 |
| @@ -173,35 +228,449 @@ bn_mul_mont: | |||
| 173 | cmp $num,$i | 228 | cmp $num,$i |
| 174 | jl .Louter | 229 | jl .Louter |
| 175 | 230 | ||
| 176 | lea (%rsp),$ap # borrow ap for tp | ||
| 177 | lea -1($num),$j # j=num-1 | ||
| 178 | |||
| 179 | mov ($ap),%rax # tp[0] | ||
| 180 | xor $i,$i # i=0 and clear CF! | 231 | xor $i,$i # i=0 and clear CF! |
| 232 | mov (%rsp),%rax # tp[0] | ||
| 233 | lea (%rsp),$ap # borrow ap for tp | ||
| 234 | mov $num,$j # j=num | ||
| 181 | jmp .Lsub | 235 | jmp .Lsub |
| 182 | .align 16 | 236 | .align 16 |
| 183 | .Lsub: sbb ($np,$i,8),%rax | 237 | .Lsub: sbb ($np,$i,8),%rax |
| 184 | mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] | 238 | mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] |
| 185 | dec $j # doesn't affect CF! | ||
| 186 | mov 8($ap,$i,8),%rax # tp[i+1] | 239 | mov 8($ap,$i,8),%rax # tp[i+1] |
| 187 | lea 1($i),$i # i++ | 240 | lea 1($i),$i # i++ |
| 188 | jge .Lsub | 241 | dec $j # doesnn't affect CF! |
| 242 | jnz .Lsub | ||
| 189 | 243 | ||
| 190 | sbb \$0,%rax # handle upmost overflow bit | 244 | sbb \$0,%rax # handle upmost overflow bit |
| 245 | xor $i,$i | ||
| 191 | and %rax,$ap | 246 | and %rax,$ap |
| 192 | not %rax | 247 | not %rax |
| 193 | mov $rp,$np | 248 | mov $rp,$np |
| 194 | and %rax,$np | 249 | and %rax,$np |
| 195 | lea -1($num),$j | 250 | mov $num,$j # j=num |
| 196 | or $np,$ap # ap=borrow?tp:rp | 251 | or $np,$ap # ap=borrow?tp:rp |
| 197 | .align 16 | 252 | .align 16 |
| 198 | .Lcopy: # copy or in-place refresh | 253 | .Lcopy: # copy or in-place refresh |
| 254 | mov ($ap,$i,8),%rax | ||
| 255 | mov $i,(%rsp,$i,8) # zap temporary vector | ||
| 256 | mov %rax,($rp,$i,8) # rp[i]=tp[i] | ||
| 257 | lea 1($i),$i | ||
| 258 | sub \$1,$j | ||
| 259 | jnz .Lcopy | ||
| 260 | |||
| 261 | mov 8(%rsp,$num,8),%rsi # restore %rsp | ||
| 262 | mov \$1,%rax | ||
| 263 | mov (%rsi),%r15 | ||
| 264 | mov 8(%rsi),%r14 | ||
| 265 | mov 16(%rsi),%r13 | ||
| 266 | mov 24(%rsi),%r12 | ||
| 267 | mov 32(%rsi),%rbp | ||
| 268 | mov 40(%rsi),%rbx | ||
| 269 | lea 48(%rsi),%rsp | ||
| 270 | .Lmul_epilogue: | ||
| 271 | ret | ||
| 272 | .size bn_mul_mont,.-bn_mul_mont | ||
| 273 | ___ | ||
| 274 | {{{ | ||
| 275 | my @A=("%r10","%r11"); | ||
| 276 | my @N=("%r13","%rdi"); | ||
| 277 | $code.=<<___; | ||
| 278 | .type bn_mul4x_mont,\@function,6 | ||
| 279 | .align 16 | ||
| 280 | bn_mul4x_mont: | ||
| 281 | .Lmul4x_enter: | ||
| 282 | push %rbx | ||
| 283 | push %rbp | ||
| 284 | push %r12 | ||
| 285 | push %r13 | ||
| 286 | push %r14 | ||
| 287 | push %r15 | ||
| 288 | |||
| 289 | mov ${num}d,${num}d | ||
| 290 | lea 4($num),%r10 | ||
| 291 | mov %rsp,%r11 | ||
| 292 | neg %r10 | ||
| 293 | lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4)) | ||
| 294 | and \$-1024,%rsp # minimize TLB usage | ||
| 295 | |||
| 296 | mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp | ||
| 297 | .Lmul4x_body: | ||
| 298 | mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp | ||
| 299 | mov %rdx,%r12 # reassign $bp | ||
| 300 | ___ | ||
| 301 | $bp="%r12"; | ||
| 302 | $code.=<<___; | ||
| 303 | mov ($n0),$n0 # pull n0[0] value | ||
| 304 | mov ($bp),$m0 # m0=bp[0] | ||
| 305 | mov ($ap),%rax | ||
| 306 | |||
| 307 | xor $i,$i # i=0 | ||
| 308 | xor $j,$j # j=0 | ||
| 309 | |||
| 310 | mov $n0,$m1 | ||
| 311 | mulq $m0 # ap[0]*bp[0] | ||
| 312 | mov %rax,$A[0] | ||
| 313 | mov ($np),%rax | ||
| 314 | |||
| 315 | imulq $A[0],$m1 # "tp[0]"*n0 | ||
| 316 | mov %rdx,$A[1] | ||
| 317 | |||
| 318 | mulq $m1 # np[0]*m1 | ||
| 319 | add %rax,$A[0] # discarded | ||
| 320 | mov 8($ap),%rax | ||
| 321 | adc \$0,%rdx | ||
| 322 | mov %rdx,$N[1] | ||
| 323 | |||
| 324 | mulq $m0 | ||
| 325 | add %rax,$A[1] | ||
| 326 | mov 8($np),%rax | ||
| 327 | adc \$0,%rdx | ||
| 328 | mov %rdx,$A[0] | ||
| 329 | |||
| 330 | mulq $m1 | ||
| 331 | add %rax,$N[1] | ||
| 332 | mov 16($ap),%rax | ||
| 333 | adc \$0,%rdx | ||
| 334 | add $A[1],$N[1] | ||
| 335 | lea 4($j),$j # j++ | ||
| 336 | adc \$0,%rdx | ||
| 337 | mov $N[1],(%rsp) | ||
| 338 | mov %rdx,$N[0] | ||
| 339 | jmp .L1st4x | ||
| 340 | .align 16 | ||
| 341 | .L1st4x: | ||
| 342 | mulq $m0 # ap[j]*bp[0] | ||
| 343 | add %rax,$A[0] | ||
| 344 | mov -16($np,$j,8),%rax | ||
| 345 | adc \$0,%rdx | ||
| 346 | mov %rdx,$A[1] | ||
| 347 | |||
| 348 | mulq $m1 # np[j]*m1 | ||
| 349 | add %rax,$N[0] | ||
| 350 | mov -8($ap,$j,8),%rax | ||
| 351 | adc \$0,%rdx | ||
| 352 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
| 353 | adc \$0,%rdx | ||
| 354 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 355 | mov %rdx,$N[1] | ||
| 356 | |||
| 357 | mulq $m0 # ap[j]*bp[0] | ||
| 358 | add %rax,$A[1] | ||
| 359 | mov -8($np,$j,8),%rax | ||
| 360 | adc \$0,%rdx | ||
| 361 | mov %rdx,$A[0] | ||
| 362 | |||
| 363 | mulq $m1 # np[j]*m1 | ||
| 364 | add %rax,$N[1] | ||
| 199 | mov ($ap,$j,8),%rax | 365 | mov ($ap,$j,8),%rax |
| 200 | mov %rax,($rp,$j,8) # rp[i]=tp[i] | 366 | adc \$0,%rdx |
| 201 | mov $i,(%rsp,$j,8) # zap temporary vector | 367 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] |
| 368 | adc \$0,%rdx | ||
| 369 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 370 | mov %rdx,$N[0] | ||
| 371 | |||
| 372 | mulq $m0 # ap[j]*bp[0] | ||
| 373 | add %rax,$A[0] | ||
| 374 | mov ($np,$j,8),%rax | ||
| 375 | adc \$0,%rdx | ||
| 376 | mov %rdx,$A[1] | ||
| 377 | |||
| 378 | mulq $m1 # np[j]*m1 | ||
| 379 | add %rax,$N[0] | ||
| 380 | mov 8($ap,$j,8),%rax | ||
| 381 | adc \$0,%rdx | ||
| 382 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
| 383 | adc \$0,%rdx | ||
| 384 | mov $N[0],-8(%rsp,$j,8) # tp[j-1] | ||
| 385 | mov %rdx,$N[1] | ||
| 386 | |||
| 387 | mulq $m0 # ap[j]*bp[0] | ||
| 388 | add %rax,$A[1] | ||
| 389 | mov 8($np,$j,8),%rax | ||
| 390 | adc \$0,%rdx | ||
| 391 | lea 4($j),$j # j++ | ||
| 392 | mov %rdx,$A[0] | ||
| 393 | |||
| 394 | mulq $m1 # np[j]*m1 | ||
| 395 | add %rax,$N[1] | ||
| 396 | mov -16($ap,$j,8),%rax | ||
| 397 | adc \$0,%rdx | ||
| 398 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
| 399 | adc \$0,%rdx | ||
| 400 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
| 401 | mov %rdx,$N[0] | ||
| 402 | cmp $num,$j | ||
| 403 | jl .L1st4x | ||
| 404 | |||
| 405 | mulq $m0 # ap[j]*bp[0] | ||
| 406 | add %rax,$A[0] | ||
| 407 | mov -16($np,$j,8),%rax | ||
| 408 | adc \$0,%rdx | ||
| 409 | mov %rdx,$A[1] | ||
| 410 | |||
| 411 | mulq $m1 # np[j]*m1 | ||
| 412 | add %rax,$N[0] | ||
| 413 | mov -8($ap,$j,8),%rax | ||
| 414 | adc \$0,%rdx | ||
| 415 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
| 416 | adc \$0,%rdx | ||
| 417 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 418 | mov %rdx,$N[1] | ||
| 419 | |||
| 420 | mulq $m0 # ap[j]*bp[0] | ||
| 421 | add %rax,$A[1] | ||
| 422 | mov -8($np,$j,8),%rax | ||
| 423 | adc \$0,%rdx | ||
| 424 | mov %rdx,$A[0] | ||
| 425 | |||
| 426 | mulq $m1 # np[j]*m1 | ||
| 427 | add %rax,$N[1] | ||
| 428 | mov ($ap),%rax # ap[0] | ||
| 429 | adc \$0,%rdx | ||
| 430 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
| 431 | adc \$0,%rdx | ||
| 432 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 433 | mov %rdx,$N[0] | ||
| 434 | |||
| 435 | xor $N[1],$N[1] | ||
| 436 | add $A[0],$N[0] | ||
| 437 | adc \$0,$N[1] | ||
| 438 | mov $N[0],-8(%rsp,$j,8) | ||
| 439 | mov $N[1],(%rsp,$j,8) # store upmost overflow bit | ||
| 440 | |||
| 441 | lea 1($i),$i # i++ | ||
| 442 | .align 4 | ||
| 443 | .Louter4x: | ||
| 444 | mov ($bp,$i,8),$m0 # m0=bp[i] | ||
| 445 | xor $j,$j # j=0 | ||
| 446 | mov (%rsp),$A[0] | ||
| 447 | mov $n0,$m1 | ||
| 448 | mulq $m0 # ap[0]*bp[i] | ||
| 449 | add %rax,$A[0] # ap[0]*bp[i]+tp[0] | ||
| 450 | mov ($np),%rax | ||
| 451 | adc \$0,%rdx | ||
| 452 | |||
| 453 | imulq $A[0],$m1 # tp[0]*n0 | ||
| 454 | mov %rdx,$A[1] | ||
| 455 | |||
| 456 | mulq $m1 # np[0]*m1 | ||
| 457 | add %rax,$A[0] # "$N[0]", discarded | ||
| 458 | mov 8($ap),%rax | ||
| 459 | adc \$0,%rdx | ||
| 460 | mov %rdx,$N[1] | ||
| 461 | |||
| 462 | mulq $m0 # ap[j]*bp[i] | ||
| 463 | add %rax,$A[1] | ||
| 464 | mov 8($np),%rax | ||
| 465 | adc \$0,%rdx | ||
| 466 | add 8(%rsp),$A[1] # +tp[1] | ||
| 467 | adc \$0,%rdx | ||
| 468 | mov %rdx,$A[0] | ||
| 469 | |||
| 470 | mulq $m1 # np[j]*m1 | ||
| 471 | add %rax,$N[1] | ||
| 472 | mov 16($ap),%rax | ||
| 473 | adc \$0,%rdx | ||
| 474 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 475 | lea 4($j),$j # j+=2 | ||
| 476 | adc \$0,%rdx | ||
| 477 | mov $N[1],(%rsp) # tp[j-1] | ||
| 478 | mov %rdx,$N[0] | ||
| 479 | jmp .Linner4x | ||
| 480 | .align 16 | ||
| 481 | .Linner4x: | ||
| 482 | mulq $m0 # ap[j]*bp[i] | ||
| 483 | add %rax,$A[0] | ||
| 484 | mov -16($np,$j,8),%rax | ||
| 485 | adc \$0,%rdx | ||
| 486 | add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
| 487 | adc \$0,%rdx | ||
| 488 | mov %rdx,$A[1] | ||
| 489 | |||
| 490 | mulq $m1 # np[j]*m1 | ||
| 491 | add %rax,$N[0] | ||
| 492 | mov -8($ap,$j,8),%rax | ||
| 493 | adc \$0,%rdx | ||
| 494 | add $A[0],$N[0] | ||
| 495 | adc \$0,%rdx | ||
| 496 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 497 | mov %rdx,$N[1] | ||
| 498 | |||
| 499 | mulq $m0 # ap[j]*bp[i] | ||
| 500 | add %rax,$A[1] | ||
| 501 | mov -8($np,$j,8),%rax | ||
| 502 | adc \$0,%rdx | ||
| 503 | add -8(%rsp,$j,8),$A[1] | ||
| 504 | adc \$0,%rdx | ||
| 505 | mov %rdx,$A[0] | ||
| 506 | |||
| 507 | mulq $m1 # np[j]*m1 | ||
| 508 | add %rax,$N[1] | ||
| 509 | mov ($ap,$j,8),%rax | ||
| 510 | adc \$0,%rdx | ||
| 511 | add $A[1],$N[1] | ||
| 512 | adc \$0,%rdx | ||
| 513 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 514 | mov %rdx,$N[0] | ||
| 515 | |||
| 516 | mulq $m0 # ap[j]*bp[i] | ||
| 517 | add %rax,$A[0] | ||
| 518 | mov ($np,$j,8),%rax | ||
| 519 | adc \$0,%rdx | ||
| 520 | add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
| 521 | adc \$0,%rdx | ||
| 522 | mov %rdx,$A[1] | ||
| 523 | |||
| 524 | mulq $m1 # np[j]*m1 | ||
| 525 | add %rax,$N[0] | ||
| 526 | mov 8($ap,$j,8),%rax | ||
| 527 | adc \$0,%rdx | ||
| 528 | add $A[0],$N[0] | ||
| 529 | adc \$0,%rdx | ||
| 530 | mov $N[0],-8(%rsp,$j,8) # tp[j-1] | ||
| 531 | mov %rdx,$N[1] | ||
| 532 | |||
| 533 | mulq $m0 # ap[j]*bp[i] | ||
| 534 | add %rax,$A[1] | ||
| 535 | mov 8($np,$j,8),%rax | ||
| 536 | adc \$0,%rdx | ||
| 537 | add 8(%rsp,$j,8),$A[1] | ||
| 538 | adc \$0,%rdx | ||
| 539 | lea 4($j),$j # j++ | ||
| 540 | mov %rdx,$A[0] | ||
| 541 | |||
| 542 | mulq $m1 # np[j]*m1 | ||
| 543 | add %rax,$N[1] | ||
| 544 | mov -16($ap,$j,8),%rax | ||
| 545 | adc \$0,%rdx | ||
| 546 | add $A[1],$N[1] | ||
| 547 | adc \$0,%rdx | ||
| 548 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
| 549 | mov %rdx,$N[0] | ||
| 550 | cmp $num,$j | ||
| 551 | jl .Linner4x | ||
| 552 | |||
| 553 | mulq $m0 # ap[j]*bp[i] | ||
| 554 | add %rax,$A[0] | ||
| 555 | mov -16($np,$j,8),%rax | ||
| 556 | adc \$0,%rdx | ||
| 557 | add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
| 558 | adc \$0,%rdx | ||
| 559 | mov %rdx,$A[1] | ||
| 560 | |||
| 561 | mulq $m1 # np[j]*m1 | ||
| 562 | add %rax,$N[0] | ||
| 563 | mov -8($ap,$j,8),%rax | ||
| 564 | adc \$0,%rdx | ||
| 565 | add $A[0],$N[0] | ||
| 566 | adc \$0,%rdx | ||
| 567 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 568 | mov %rdx,$N[1] | ||
| 569 | |||
| 570 | mulq $m0 # ap[j]*bp[i] | ||
| 571 | add %rax,$A[1] | ||
| 572 | mov -8($np,$j,8),%rax | ||
| 573 | adc \$0,%rdx | ||
| 574 | add -8(%rsp,$j,8),$A[1] | ||
| 575 | adc \$0,%rdx | ||
| 576 | lea 1($i),$i # i++ | ||
| 577 | mov %rdx,$A[0] | ||
| 578 | |||
| 579 | mulq $m1 # np[j]*m1 | ||
| 580 | add %rax,$N[1] | ||
| 581 | mov ($ap),%rax # ap[0] | ||
| 582 | adc \$0,%rdx | ||
| 583 | add $A[1],$N[1] | ||
| 584 | adc \$0,%rdx | ||
| 585 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 586 | mov %rdx,$N[0] | ||
| 587 | |||
| 588 | xor $N[1],$N[1] | ||
| 589 | add $A[0],$N[0] | ||
| 590 | adc \$0,$N[1] | ||
| 591 | add (%rsp,$num,8),$N[0] # pull upmost overflow bit | ||
| 592 | adc \$0,$N[1] | ||
| 593 | mov $N[0],-8(%rsp,$j,8) | ||
| 594 | mov $N[1],(%rsp,$j,8) # store upmost overflow bit | ||
| 595 | |||
| 596 | cmp $num,$i | ||
| 597 | jl .Louter4x | ||
| 598 | ___ | ||
| 599 | { | ||
| 600 | my @ri=("%rax","%rdx",$m0,$m1); | ||
| 601 | $code.=<<___; | ||
| 602 | mov 16(%rsp,$num,8),$rp # restore $rp | ||
| 603 | mov 0(%rsp),@ri[0] # tp[0] | ||
| 604 | pxor %xmm0,%xmm0 | ||
| 605 | mov 8(%rsp),@ri[1] # tp[1] | ||
| 606 | shr \$2,$num # num/=4 | ||
| 607 | lea (%rsp),$ap # borrow ap for tp | ||
| 608 | xor $i,$i # i=0 and clear CF! | ||
| 609 | |||
| 610 | sub 0($np),@ri[0] | ||
| 611 | mov 16($ap),@ri[2] # tp[2] | ||
| 612 | mov 24($ap),@ri[3] # tp[3] | ||
| 613 | sbb 8($np),@ri[1] | ||
| 614 | lea -1($num),$j # j=num/4-1 | ||
| 615 | jmp .Lsub4x | ||
| 616 | .align 16 | ||
| 617 | .Lsub4x: | ||
| 618 | mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 619 | mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 620 | sbb 16($np,$i,8),@ri[2] | ||
| 621 | mov 32($ap,$i,8),@ri[0] # tp[i+1] | ||
| 622 | mov 40($ap,$i,8),@ri[1] | ||
| 623 | sbb 24($np,$i,8),@ri[3] | ||
| 624 | mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 625 | mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 626 | sbb 32($np,$i,8),@ri[0] | ||
| 627 | mov 48($ap,$i,8),@ri[2] | ||
| 628 | mov 56($ap,$i,8),@ri[3] | ||
| 629 | sbb 40($np,$i,8),@ri[1] | ||
| 630 | lea 4($i),$i # i++ | ||
| 631 | dec $j # doesnn't affect CF! | ||
| 632 | jnz .Lsub4x | ||
| 633 | |||
| 634 | mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 635 | mov 32($ap,$i,8),@ri[0] # load overflow bit | ||
| 636 | sbb 16($np,$i,8),@ri[2] | ||
| 637 | mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 638 | sbb 24($np,$i,8),@ri[3] | ||
| 639 | mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 640 | |||
| 641 | sbb \$0,@ri[0] # handle upmost overflow bit | ||
| 642 | mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 643 | xor $i,$i # i=0 | ||
| 644 | and @ri[0],$ap | ||
| 645 | not @ri[0] | ||
| 646 | mov $rp,$np | ||
| 647 | and @ri[0],$np | ||
| 648 | lea -1($num),$j | ||
| 649 | or $np,$ap # ap=borrow?tp:rp | ||
| 650 | |||
| 651 | movdqu ($ap),%xmm1 | ||
| 652 | movdqa %xmm0,(%rsp) | ||
| 653 | movdqu %xmm1,($rp) | ||
| 654 | jmp .Lcopy4x | ||
| 655 | .align 16 | ||
| 656 | .Lcopy4x: # copy or in-place refresh | ||
| 657 | movdqu 16($ap,$i),%xmm2 | ||
| 658 | movdqu 32($ap,$i),%xmm1 | ||
| 659 | movdqa %xmm0,16(%rsp,$i) | ||
| 660 | movdqu %xmm2,16($rp,$i) | ||
| 661 | movdqa %xmm0,32(%rsp,$i) | ||
| 662 | movdqu %xmm1,32($rp,$i) | ||
| 663 | lea 32($i),$i | ||
| 202 | dec $j | 664 | dec $j |
| 203 | jge .Lcopy | 665 | jnz .Lcopy4x |
| 204 | 666 | ||
| 667 | shl \$2,$num | ||
| 668 | movdqu 16($ap,$i),%xmm2 | ||
| 669 | movdqa %xmm0,16(%rsp,$i) | ||
| 670 | movdqu %xmm2,16($rp,$i) | ||
| 671 | ___ | ||
| 672 | } | ||
| 673 | $code.=<<___; | ||
| 205 | mov 8(%rsp,$num,8),%rsi # restore %rsp | 674 | mov 8(%rsp,$num,8),%rsi # restore %rsp |
| 206 | mov \$1,%rax | 675 | mov \$1,%rax |
| 207 | mov (%rsi),%r15 | 676 | mov (%rsi),%r15 |
| @@ -211,9 +680,823 @@ bn_mul_mont: | |||
| 211 | mov 32(%rsi),%rbp | 680 | mov 32(%rsi),%rbp |
| 212 | mov 40(%rsi),%rbx | 681 | mov 40(%rsi),%rbx |
| 213 | lea 48(%rsi),%rsp | 682 | lea 48(%rsi),%rsp |
| 214 | .Lepilogue: | 683 | .Lmul4x_epilogue: |
| 215 | ret | 684 | ret |
| 216 | .size bn_mul_mont,.-bn_mul_mont | 685 | .size bn_mul4x_mont,.-bn_mul4x_mont |
| 686 | ___ | ||
| 687 | }}} | ||
| 688 | {{{ | ||
| 689 | ###################################################################### | ||
| 690 | # void bn_sqr4x_mont( | ||
| 691 | my $rptr="%rdi"; # const BN_ULONG *rptr, | ||
| 692 | my $aptr="%rsi"; # const BN_ULONG *aptr, | ||
| 693 | my $bptr="%rdx"; # not used | ||
| 694 | my $nptr="%rcx"; # const BN_ULONG *nptr, | ||
| 695 | my $n0 ="%r8"; # const BN_ULONG *n0); | ||
| 696 | my $num ="%r9"; # int num, has to be divisible by 4 and | ||
| 697 | # not less than 8 | ||
| 698 | |||
| 699 | my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); | ||
| 700 | my @A0=("%r10","%r11"); | ||
| 701 | my @A1=("%r12","%r13"); | ||
| 702 | my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); | ||
| 703 | |||
| 704 | $code.=<<___; | ||
| 705 | .type bn_sqr4x_mont,\@function,6 | ||
| 706 | .align 16 | ||
| 707 | bn_sqr4x_mont: | ||
| 708 | .Lsqr4x_enter: | ||
| 709 | push %rbx | ||
| 710 | push %rbp | ||
| 711 | push %r12 | ||
| 712 | push %r13 | ||
| 713 | push %r14 | ||
| 714 | push %r15 | ||
| 715 | |||
| 716 | shl \$3,${num}d # convert $num to bytes | ||
| 717 | xor %r10,%r10 | ||
| 718 | mov %rsp,%r11 # put aside %rsp | ||
| 719 | sub $num,%r10 # -$num | ||
| 720 | mov ($n0),$n0 # *n0 | ||
| 721 | lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num) | ||
| 722 | and \$-1024,%rsp # minimize TLB usage | ||
| 723 | ############################################################## | ||
| 724 | # Stack layout | ||
| 725 | # | ||
| 726 | # +0 saved $num, used in reduction section | ||
| 727 | # +8 &t[2*$num], used in reduction section | ||
| 728 | # +32 saved $rptr | ||
| 729 | # +40 saved $nptr | ||
| 730 | # +48 saved *n0 | ||
| 731 | # +56 saved %rsp | ||
| 732 | # +64 t[2*$num] | ||
| 733 | # | ||
| 734 | mov $rptr,32(%rsp) # save $rptr | ||
| 735 | mov $nptr,40(%rsp) | ||
| 736 | mov $n0, 48(%rsp) | ||
| 737 | mov %r11, 56(%rsp) # save original %rsp | ||
| 738 | .Lsqr4x_body: | ||
| 739 | ############################################################## | ||
| 740 | # Squaring part: | ||
| 741 | # | ||
| 742 | # a) multiply-n-add everything but a[i]*a[i]; | ||
| 743 | # b) shift result of a) by 1 to the left and accumulate | ||
| 744 | # a[i]*a[i] products; | ||
| 745 | # | ||
| 746 | lea 32(%r10),$i # $i=-($num-32) | ||
| 747 | lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] | ||
| 748 | |||
| 749 | mov $num,$j # $j=$num | ||
| 750 | |||
| 751 | # comments apply to $num==8 case | ||
| 752 | mov -32($aptr,$i),$a0 # a[0] | ||
| 753 | lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] | ||
| 754 | mov -24($aptr,$i),%rax # a[1] | ||
| 755 | lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] | ||
| 756 | mov -16($aptr,$i),$ai # a[2] | ||
| 757 | mov %rax,$a1 | ||
| 758 | |||
| 759 | mul $a0 # a[1]*a[0] | ||
| 760 | mov %rax,$A0[0] # a[1]*a[0] | ||
| 761 | mov $ai,%rax # a[2] | ||
| 762 | mov %rdx,$A0[1] | ||
| 763 | mov $A0[0],-24($tptr,$i) # t[1] | ||
| 764 | |||
| 765 | xor $A0[0],$A0[0] | ||
| 766 | mul $a0 # a[2]*a[0] | ||
| 767 | add %rax,$A0[1] | ||
| 768 | mov $ai,%rax | ||
| 769 | adc %rdx,$A0[0] | ||
| 770 | mov $A0[1],-16($tptr,$i) # t[2] | ||
| 771 | |||
| 772 | lea -16($i),$j # j=-16 | ||
| 773 | |||
| 774 | |||
| 775 | mov 8($aptr,$j),$ai # a[3] | ||
| 776 | mul $a1 # a[2]*a[1] | ||
| 777 | mov %rax,$A1[0] # a[2]*a[1]+t[3] | ||
| 778 | mov $ai,%rax | ||
| 779 | mov %rdx,$A1[1] | ||
| 780 | |||
| 781 | xor $A0[1],$A0[1] | ||
| 782 | add $A1[0],$A0[0] | ||
| 783 | lea 16($j),$j | ||
| 784 | adc \$0,$A0[1] | ||
| 785 | mul $a0 # a[3]*a[0] | ||
| 786 | add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] | ||
| 787 | mov $ai,%rax | ||
| 788 | adc %rdx,$A0[1] | ||
| 789 | mov $A0[0],-8($tptr,$j) # t[3] | ||
| 790 | jmp .Lsqr4x_1st | ||
| 791 | |||
| 792 | .align 16 | ||
| 793 | .Lsqr4x_1st: | ||
| 794 | mov ($aptr,$j),$ai # a[4] | ||
| 795 | xor $A1[0],$A1[0] | ||
| 796 | mul $a1 # a[3]*a[1] | ||
| 797 | add %rax,$A1[1] # a[3]*a[1]+t[4] | ||
| 798 | mov $ai,%rax | ||
| 799 | adc %rdx,$A1[0] | ||
| 800 | |||
| 801 | xor $A0[0],$A0[0] | ||
| 802 | add $A1[1],$A0[1] | ||
| 803 | adc \$0,$A0[0] | ||
| 804 | mul $a0 # a[4]*a[0] | ||
| 805 | add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] | ||
| 806 | mov $ai,%rax # a[3] | ||
| 807 | adc %rdx,$A0[0] | ||
| 808 | mov $A0[1],($tptr,$j) # t[4] | ||
| 809 | |||
| 810 | |||
| 811 | mov 8($aptr,$j),$ai # a[5] | ||
| 812 | xor $A1[1],$A1[1] | ||
| 813 | mul $a1 # a[4]*a[3] | ||
| 814 | add %rax,$A1[0] # a[4]*a[3]+t[5] | ||
| 815 | mov $ai,%rax | ||
| 816 | adc %rdx,$A1[1] | ||
| 817 | |||
| 818 | xor $A0[1],$A0[1] | ||
| 819 | add $A1[0],$A0[0] | ||
| 820 | adc \$0,$A0[1] | ||
| 821 | mul $a0 # a[5]*a[2] | ||
| 822 | add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] | ||
| 823 | mov $ai,%rax | ||
| 824 | adc %rdx,$A0[1] | ||
| 825 | mov $A0[0],8($tptr,$j) # t[5] | ||
| 826 | |||
| 827 | mov 16($aptr,$j),$ai # a[6] | ||
| 828 | xor $A1[0],$A1[0] | ||
| 829 | mul $a1 # a[5]*a[3] | ||
| 830 | add %rax,$A1[1] # a[5]*a[3]+t[6] | ||
| 831 | mov $ai,%rax | ||
| 832 | adc %rdx,$A1[0] | ||
| 833 | |||
| 834 | xor $A0[0],$A0[0] | ||
| 835 | add $A1[1],$A0[1] | ||
| 836 | adc \$0,$A0[0] | ||
| 837 | mul $a0 # a[6]*a[2] | ||
| 838 | add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] | ||
| 839 | mov $ai,%rax # a[3] | ||
| 840 | adc %rdx,$A0[0] | ||
| 841 | mov $A0[1],16($tptr,$j) # t[6] | ||
| 842 | |||
| 843 | |||
| 844 | mov 24($aptr,$j),$ai # a[7] | ||
| 845 | xor $A1[1],$A1[1] | ||
| 846 | mul $a1 # a[6]*a[5] | ||
| 847 | add %rax,$A1[0] # a[6]*a[5]+t[7] | ||
| 848 | mov $ai,%rax | ||
| 849 | adc %rdx,$A1[1] | ||
| 850 | |||
| 851 | xor $A0[1],$A0[1] | ||
| 852 | add $A1[0],$A0[0] | ||
| 853 | lea 32($j),$j | ||
| 854 | adc \$0,$A0[1] | ||
| 855 | mul $a0 # a[7]*a[4] | ||
| 856 | add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] | ||
| 857 | mov $ai,%rax | ||
| 858 | adc %rdx,$A0[1] | ||
| 859 | mov $A0[0],-8($tptr,$j) # t[7] | ||
| 860 | |||
| 861 | cmp \$0,$j | ||
| 862 | jne .Lsqr4x_1st | ||
| 863 | |||
| 864 | xor $A1[0],$A1[0] | ||
| 865 | add $A0[1],$A1[1] | ||
| 866 | adc \$0,$A1[0] | ||
| 867 | mul $a1 # a[7]*a[5] | ||
| 868 | add %rax,$A1[1] | ||
| 869 | adc %rdx,$A1[0] | ||
| 870 | |||
| 871 | mov $A1[1],($tptr) # t[8] | ||
| 872 | lea 16($i),$i | ||
| 873 | mov $A1[0],8($tptr) # t[9] | ||
| 874 | jmp .Lsqr4x_outer | ||
| 875 | |||
| 876 | .align 16 | ||
| 877 | .Lsqr4x_outer: # comments apply to $num==6 case | ||
| 878 | mov -32($aptr,$i),$a0 # a[0] | ||
| 879 | lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] | ||
| 880 | mov -24($aptr,$i),%rax # a[1] | ||
| 881 | lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] | ||
| 882 | mov -16($aptr,$i),$ai # a[2] | ||
| 883 | mov %rax,$a1 | ||
| 884 | |||
| 885 | mov -24($tptr,$i),$A0[0] # t[1] | ||
| 886 | xor $A0[1],$A0[1] | ||
| 887 | mul $a0 # a[1]*a[0] | ||
| 888 | add %rax,$A0[0] # a[1]*a[0]+t[1] | ||
| 889 | mov $ai,%rax # a[2] | ||
| 890 | adc %rdx,$A0[1] | ||
| 891 | mov $A0[0],-24($tptr,$i) # t[1] | ||
| 892 | |||
| 893 | xor $A0[0],$A0[0] | ||
| 894 | add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] | ||
| 895 | adc \$0,$A0[0] | ||
| 896 | mul $a0 # a[2]*a[0] | ||
| 897 | add %rax,$A0[1] | ||
| 898 | mov $ai,%rax | ||
| 899 | adc %rdx,$A0[0] | ||
| 900 | mov $A0[1],-16($tptr,$i) # t[2] | ||
| 901 | |||
| 902 | lea -16($i),$j # j=-16 | ||
| 903 | xor $A1[0],$A1[0] | ||
| 904 | |||
| 905 | |||
| 906 | mov 8($aptr,$j),$ai # a[3] | ||
| 907 | xor $A1[1],$A1[1] | ||
| 908 | add 8($tptr,$j),$A1[0] | ||
| 909 | adc \$0,$A1[1] | ||
| 910 | mul $a1 # a[2]*a[1] | ||
| 911 | add %rax,$A1[0] # a[2]*a[1]+t[3] | ||
| 912 | mov $ai,%rax | ||
| 913 | adc %rdx,$A1[1] | ||
| 914 | |||
| 915 | xor $A0[1],$A0[1] | ||
| 916 | add $A1[0],$A0[0] | ||
| 917 | adc \$0,$A0[1] | ||
| 918 | mul $a0 # a[3]*a[0] | ||
| 919 | add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] | ||
| 920 | mov $ai,%rax | ||
| 921 | adc %rdx,$A0[1] | ||
| 922 | mov $A0[0],8($tptr,$j) # t[3] | ||
| 923 | |||
| 924 | lea 16($j),$j | ||
| 925 | jmp .Lsqr4x_inner | ||
| 926 | |||
| 927 | .align 16 | ||
| 928 | .Lsqr4x_inner: | ||
| 929 | mov ($aptr,$j),$ai # a[4] | ||
| 930 | xor $A1[0],$A1[0] | ||
| 931 | add ($tptr,$j),$A1[1] | ||
| 932 | adc \$0,$A1[0] | ||
| 933 | mul $a1 # a[3]*a[1] | ||
| 934 | add %rax,$A1[1] # a[3]*a[1]+t[4] | ||
| 935 | mov $ai,%rax | ||
| 936 | adc %rdx,$A1[0] | ||
| 937 | |||
| 938 | xor $A0[0],$A0[0] | ||
| 939 | add $A1[1],$A0[1] | ||
| 940 | adc \$0,$A0[0] | ||
| 941 | mul $a0 # a[4]*a[0] | ||
| 942 | add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] | ||
| 943 | mov $ai,%rax # a[3] | ||
| 944 | adc %rdx,$A0[0] | ||
| 945 | mov $A0[1],($tptr,$j) # t[4] | ||
| 946 | |||
| 947 | mov 8($aptr,$j),$ai # a[5] | ||
| 948 | xor $A1[1],$A1[1] | ||
| 949 | add 8($tptr,$j),$A1[0] | ||
| 950 | adc \$0,$A1[1] | ||
| 951 | mul $a1 # a[4]*a[3] | ||
| 952 | add %rax,$A1[0] # a[4]*a[3]+t[5] | ||
| 953 | mov $ai,%rax | ||
| 954 | adc %rdx,$A1[1] | ||
| 955 | |||
| 956 | xor $A0[1],$A0[1] | ||
| 957 | add $A1[0],$A0[0] | ||
| 958 | lea 16($j),$j # j++ | ||
| 959 | adc \$0,$A0[1] | ||
| 960 | mul $a0 # a[5]*a[2] | ||
| 961 | add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] | ||
| 962 | mov $ai,%rax | ||
| 963 | adc %rdx,$A0[1] | ||
| 964 | mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below | ||
| 965 | |||
| 966 | cmp \$0,$j | ||
| 967 | jne .Lsqr4x_inner | ||
| 968 | |||
| 969 | xor $A1[0],$A1[0] | ||
| 970 | add $A0[1],$A1[1] | ||
| 971 | adc \$0,$A1[0] | ||
| 972 | mul $a1 # a[5]*a[3] | ||
| 973 | add %rax,$A1[1] | ||
| 974 | adc %rdx,$A1[0] | ||
| 975 | |||
| 976 | mov $A1[1],($tptr) # t[6], "preloaded t[2]" below | ||
| 977 | mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below | ||
| 978 | |||
| 979 | add \$16,$i | ||
| 980 | jnz .Lsqr4x_outer | ||
| 981 | |||
| 982 | # comments apply to $num==4 case | ||
| 983 | mov -32($aptr),$a0 # a[0] | ||
| 984 | lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] | ||
| 985 | mov -24($aptr),%rax # a[1] | ||
| 986 | lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] | ||
| 987 | mov -16($aptr),$ai # a[2] | ||
| 988 | mov %rax,$a1 | ||
| 989 | |||
| 990 | xor $A0[1],$A0[1] | ||
| 991 | mul $a0 # a[1]*a[0] | ||
| 992 | add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] | ||
| 993 | mov $ai,%rax # a[2] | ||
| 994 | adc %rdx,$A0[1] | ||
| 995 | mov $A0[0],-24($tptr) # t[1] | ||
| 996 | |||
| 997 | xor $A0[0],$A0[0] | ||
| 998 | add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] | ||
| 999 | adc \$0,$A0[0] | ||
| 1000 | mul $a0 # a[2]*a[0] | ||
| 1001 | add %rax,$A0[1] | ||
| 1002 | mov $ai,%rax | ||
| 1003 | adc %rdx,$A0[0] | ||
| 1004 | mov $A0[1],-16($tptr) # t[2] | ||
| 1005 | |||
| 1006 | mov -8($aptr),$ai # a[3] | ||
| 1007 | mul $a1 # a[2]*a[1] | ||
| 1008 | add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] | ||
| 1009 | mov $ai,%rax | ||
| 1010 | adc \$0,%rdx | ||
| 1011 | |||
| 1012 | xor $A0[1],$A0[1] | ||
| 1013 | add $A1[0],$A0[0] | ||
| 1014 | mov %rdx,$A1[1] | ||
| 1015 | adc \$0,$A0[1] | ||
| 1016 | mul $a0 # a[3]*a[0] | ||
| 1017 | add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] | ||
| 1018 | mov $ai,%rax | ||
| 1019 | adc %rdx,$A0[1] | ||
| 1020 | mov $A0[0],-8($tptr) # t[3] | ||
| 1021 | |||
| 1022 | xor $A1[0],$A1[0] | ||
| 1023 | add $A0[1],$A1[1] | ||
| 1024 | adc \$0,$A1[0] | ||
| 1025 | mul $a1 # a[3]*a[1] | ||
| 1026 | add %rax,$A1[1] | ||
| 1027 | mov -16($aptr),%rax # a[2] | ||
| 1028 | adc %rdx,$A1[0] | ||
| 1029 | |||
| 1030 | mov $A1[1],($tptr) # t[4] | ||
| 1031 | mov $A1[0],8($tptr) # t[5] | ||
| 1032 | |||
| 1033 | mul $ai # a[2]*a[3] | ||
| 1034 | ___ | ||
| 1035 | { | ||
| 1036 | my ($shift,$carry)=($a0,$a1); | ||
| 1037 | my @S=(@A1,$ai,$n0); | ||
| 1038 | $code.=<<___; | ||
| 1039 | add \$16,$i | ||
| 1040 | xor $shift,$shift | ||
| 1041 | sub $num,$i # $i=16-$num | ||
| 1042 | xor $carry,$carry | ||
| 1043 | |||
| 1044 | add $A1[0],%rax # t[5] | ||
| 1045 | adc \$0,%rdx | ||
| 1046 | mov %rax,8($tptr) # t[5] | ||
| 1047 | mov %rdx,16($tptr) # t[6] | ||
| 1048 | mov $carry,24($tptr) # t[7] | ||
| 1049 | |||
| 1050 | mov -16($aptr,$i),%rax # a[0] | ||
| 1051 | lea 64(%rsp,$num,2),$tptr | ||
| 1052 | xor $A0[0],$A0[0] # t[0] | ||
| 1053 | mov -24($tptr,$i,2),$A0[1] # t[1] | ||
| 1054 | |||
| 1055 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | ||
| 1056 | shr \$63,$A0[0] | ||
| 1057 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | ||
| 1058 | shr \$63,$A0[1] | ||
| 1059 | or $A0[0],$S[1] # | t[2*i]>>63 | ||
| 1060 | mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
| 1061 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1062 | mul %rax # a[i]*a[i] | ||
| 1063 | neg $carry # mov $carry,cf | ||
| 1064 | mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1065 | adc %rax,$S[0] | ||
| 1066 | mov -8($aptr,$i),%rax # a[i+1] # prefetch | ||
| 1067 | mov $S[0],-32($tptr,$i,2) | ||
| 1068 | adc %rdx,$S[1] | ||
| 1069 | |||
| 1070 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift | ||
| 1071 | mov $S[1],-24($tptr,$i,2) | ||
| 1072 | sbb $carry,$carry # mov cf,$carry | ||
| 1073 | shr \$63,$A0[0] | ||
| 1074 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | ||
| 1075 | shr \$63,$A0[1] | ||
| 1076 | or $A0[0],$S[3] # | t[2*i]>>63 | ||
| 1077 | mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
| 1078 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1079 | mul %rax # a[i]*a[i] | ||
| 1080 | neg $carry # mov $carry,cf | ||
| 1081 | mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1082 | adc %rax,$S[2] | ||
| 1083 | mov 0($aptr,$i),%rax # a[i+1] # prefetch | ||
| 1084 | mov $S[2],-16($tptr,$i,2) | ||
| 1085 | adc %rdx,$S[3] | ||
| 1086 | lea 16($i),$i | ||
| 1087 | mov $S[3],-40($tptr,$i,2) | ||
| 1088 | sbb $carry,$carry # mov cf,$carry | ||
| 1089 | jmp .Lsqr4x_shift_n_add | ||
| 1090 | |||
| 1091 | .align 16 | ||
| 1092 | .Lsqr4x_shift_n_add: | ||
| 1093 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | ||
| 1094 | shr \$63,$A0[0] | ||
| 1095 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | ||
| 1096 | shr \$63,$A0[1] | ||
| 1097 | or $A0[0],$S[1] # | t[2*i]>>63 | ||
| 1098 | mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
| 1099 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1100 | mul %rax # a[i]*a[i] | ||
| 1101 | neg $carry # mov $carry,cf | ||
| 1102 | mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1103 | adc %rax,$S[0] | ||
| 1104 | mov -8($aptr,$i),%rax # a[i+1] # prefetch | ||
| 1105 | mov $S[0],-32($tptr,$i,2) | ||
| 1106 | adc %rdx,$S[1] | ||
| 1107 | |||
| 1108 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift | ||
| 1109 | mov $S[1],-24($tptr,$i,2) | ||
| 1110 | sbb $carry,$carry # mov cf,$carry | ||
| 1111 | shr \$63,$A0[0] | ||
| 1112 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | ||
| 1113 | shr \$63,$A0[1] | ||
| 1114 | or $A0[0],$S[3] # | t[2*i]>>63 | ||
| 1115 | mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
| 1116 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1117 | mul %rax # a[i]*a[i] | ||
| 1118 | neg $carry # mov $carry,cf | ||
| 1119 | mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1120 | adc %rax,$S[2] | ||
| 1121 | mov 0($aptr,$i),%rax # a[i+1] # prefetch | ||
| 1122 | mov $S[2],-16($tptr,$i,2) | ||
| 1123 | adc %rdx,$S[3] | ||
| 1124 | |||
| 1125 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | ||
| 1126 | mov $S[3],-8($tptr,$i,2) | ||
| 1127 | sbb $carry,$carry # mov cf,$carry | ||
| 1128 | shr \$63,$A0[0] | ||
| 1129 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | ||
| 1130 | shr \$63,$A0[1] | ||
| 1131 | or $A0[0],$S[1] # | t[2*i]>>63 | ||
| 1132 | mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
| 1133 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1134 | mul %rax # a[i]*a[i] | ||
| 1135 | neg $carry # mov $carry,cf | ||
| 1136 | mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1137 | adc %rax,$S[0] | ||
| 1138 | mov 8($aptr,$i),%rax # a[i+1] # prefetch | ||
| 1139 | mov $S[0],0($tptr,$i,2) | ||
| 1140 | adc %rdx,$S[1] | ||
| 1141 | |||
| 1142 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift | ||
| 1143 | mov $S[1],8($tptr,$i,2) | ||
| 1144 | sbb $carry,$carry # mov cf,$carry | ||
| 1145 | shr \$63,$A0[0] | ||
| 1146 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | ||
| 1147 | shr \$63,$A0[1] | ||
| 1148 | or $A0[0],$S[3] # | t[2*i]>>63 | ||
| 1149 | mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
| 1150 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1151 | mul %rax # a[i]*a[i] | ||
| 1152 | neg $carry # mov $carry,cf | ||
| 1153 | mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1154 | adc %rax,$S[2] | ||
| 1155 | mov 16($aptr,$i),%rax # a[i+1] # prefetch | ||
| 1156 | mov $S[2],16($tptr,$i,2) | ||
| 1157 | adc %rdx,$S[3] | ||
| 1158 | mov $S[3],24($tptr,$i,2) | ||
| 1159 | sbb $carry,$carry # mov cf,$carry | ||
| 1160 | add \$32,$i | ||
| 1161 | jnz .Lsqr4x_shift_n_add | ||
| 1162 | |||
| 1163 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | ||
| 1164 | shr \$63,$A0[0] | ||
| 1165 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | ||
| 1166 | shr \$63,$A0[1] | ||
| 1167 | or $A0[0],$S[1] # | t[2*i]>>63 | ||
| 1168 | mov -16($tptr),$A0[0] # t[2*i+2] # prefetch | ||
| 1169 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1170 | mul %rax # a[i]*a[i] | ||
| 1171 | neg $carry # mov $carry,cf | ||
| 1172 | mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1173 | adc %rax,$S[0] | ||
| 1174 | mov -8($aptr),%rax # a[i+1] # prefetch | ||
| 1175 | mov $S[0],-32($tptr) | ||
| 1176 | adc %rdx,$S[1] | ||
| 1177 | |||
| 1178 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift | ||
| 1179 | mov $S[1],-24($tptr) | ||
| 1180 | sbb $carry,$carry # mov cf,$carry | ||
| 1181 | shr \$63,$A0[0] | ||
| 1182 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | ||
| 1183 | shr \$63,$A0[1] | ||
| 1184 | or $A0[0],$S[3] # | t[2*i]>>63 | ||
| 1185 | mul %rax # a[i]*a[i] | ||
| 1186 | neg $carry # mov $carry,cf | ||
| 1187 | adc %rax,$S[2] | ||
| 1188 | adc %rdx,$S[3] | ||
| 1189 | mov $S[2],-16($tptr) | ||
| 1190 | mov $S[3],-8($tptr) | ||
| 1191 | ___ | ||
| 1192 | } | ||
| 1193 | ############################################################## | ||
| 1194 | # Montgomery reduction part, "word-by-word" algorithm. | ||
| 1195 | # | ||
| 1196 | { | ||
| 1197 | my ($topbit,$nptr)=("%rbp",$aptr); | ||
| 1198 | my ($m0,$m1)=($a0,$a1); | ||
| 1199 | my @Ni=("%rbx","%r9"); | ||
| 1200 | $code.=<<___; | ||
| 1201 | mov 40(%rsp),$nptr # restore $nptr | ||
| 1202 | mov 48(%rsp),$n0 # restore *n0 | ||
| 1203 | xor $j,$j | ||
| 1204 | mov $num,0(%rsp) # save $num | ||
| 1205 | sub $num,$j # $j=-$num | ||
| 1206 | mov 64(%rsp),$A0[0] # t[0] # modsched # | ||
| 1207 | mov $n0,$m0 # # modsched # | ||
| 1208 | lea 64(%rsp,$num,2),%rax # end of t[] buffer | ||
| 1209 | lea 64(%rsp,$num),$tptr # end of t[] window | ||
| 1210 | mov %rax,8(%rsp) # save end of t[] buffer | ||
| 1211 | lea ($nptr,$num),$nptr # end of n[] buffer | ||
| 1212 | xor $topbit,$topbit # $topbit=0 | ||
| 1213 | |||
| 1214 | mov 0($nptr,$j),%rax # n[0] # modsched # | ||
| 1215 | mov 8($nptr,$j),$Ni[1] # n[1] # modsched # | ||
| 1216 | imulq $A0[0],$m0 # m0=t[0]*n0 # modsched # | ||
| 1217 | mov %rax,$Ni[0] # # modsched # | ||
| 1218 | jmp .Lsqr4x_mont_outer | ||
| 1219 | |||
| 1220 | .align 16 | ||
| 1221 | .Lsqr4x_mont_outer: | ||
| 1222 | xor $A0[1],$A0[1] | ||
| 1223 | mul $m0 # n[0]*m0 | ||
| 1224 | add %rax,$A0[0] # n[0]*m0+t[0] | ||
| 1225 | mov $Ni[1],%rax | ||
| 1226 | adc %rdx,$A0[1] | ||
| 1227 | mov $n0,$m1 | ||
| 1228 | |||
| 1229 | xor $A0[0],$A0[0] | ||
| 1230 | add 8($tptr,$j),$A0[1] | ||
| 1231 | adc \$0,$A0[0] | ||
| 1232 | mul $m0 # n[1]*m0 | ||
| 1233 | add %rax,$A0[1] # n[1]*m0+t[1] | ||
| 1234 | mov $Ni[0],%rax | ||
| 1235 | adc %rdx,$A0[0] | ||
| 1236 | |||
| 1237 | imulq $A0[1],$m1 | ||
| 1238 | |||
| 1239 | mov 16($nptr,$j),$Ni[0] # n[2] | ||
| 1240 | xor $A1[1],$A1[1] | ||
| 1241 | add $A0[1],$A1[0] | ||
| 1242 | adc \$0,$A1[1] | ||
| 1243 | mul $m1 # n[0]*m1 | ||
| 1244 | add %rax,$A1[0] # n[0]*m1+"t[1]" | ||
| 1245 | mov $Ni[0],%rax | ||
| 1246 | adc %rdx,$A1[1] | ||
| 1247 | mov $A1[0],8($tptr,$j) # "t[1]" | ||
| 1248 | |||
| 1249 | xor $A0[1],$A0[1] | ||
| 1250 | add 16($tptr,$j),$A0[0] | ||
| 1251 | adc \$0,$A0[1] | ||
| 1252 | mul $m0 # n[2]*m0 | ||
| 1253 | add %rax,$A0[0] # n[2]*m0+t[2] | ||
| 1254 | mov $Ni[1],%rax | ||
| 1255 | adc %rdx,$A0[1] | ||
| 1256 | |||
| 1257 | mov 24($nptr,$j),$Ni[1] # n[3] | ||
| 1258 | xor $A1[0],$A1[0] | ||
| 1259 | add $A0[0],$A1[1] | ||
| 1260 | adc \$0,$A1[0] | ||
| 1261 | mul $m1 # n[1]*m1 | ||
| 1262 | add %rax,$A1[1] # n[1]*m1+"t[2]" | ||
| 1263 | mov $Ni[1],%rax | ||
| 1264 | adc %rdx,$A1[0] | ||
| 1265 | mov $A1[1],16($tptr,$j) # "t[2]" | ||
| 1266 | |||
| 1267 | xor $A0[0],$A0[0] | ||
| 1268 | add 24($tptr,$j),$A0[1] | ||
| 1269 | lea 32($j),$j | ||
| 1270 | adc \$0,$A0[0] | ||
| 1271 | mul $m0 # n[3]*m0 | ||
| 1272 | add %rax,$A0[1] # n[3]*m0+t[3] | ||
| 1273 | mov $Ni[0],%rax | ||
| 1274 | adc %rdx,$A0[0] | ||
| 1275 | jmp .Lsqr4x_mont_inner | ||
| 1276 | |||
| 1277 | .align 16 | ||
| 1278 | .Lsqr4x_mont_inner: | ||
| 1279 | mov ($nptr,$j),$Ni[0] # n[4] | ||
| 1280 | xor $A1[1],$A1[1] | ||
| 1281 | add $A0[1],$A1[0] | ||
| 1282 | adc \$0,$A1[1] | ||
| 1283 | mul $m1 # n[2]*m1 | ||
| 1284 | add %rax,$A1[0] # n[2]*m1+"t[3]" | ||
| 1285 | mov $Ni[0],%rax | ||
| 1286 | adc %rdx,$A1[1] | ||
| 1287 | mov $A1[0],-8($tptr,$j) # "t[3]" | ||
| 1288 | |||
| 1289 | xor $A0[1],$A0[1] | ||
| 1290 | add ($tptr,$j),$A0[0] | ||
| 1291 | adc \$0,$A0[1] | ||
| 1292 | mul $m0 # n[4]*m0 | ||
| 1293 | add %rax,$A0[0] # n[4]*m0+t[4] | ||
| 1294 | mov $Ni[1],%rax | ||
| 1295 | adc %rdx,$A0[1] | ||
| 1296 | |||
| 1297 | mov 8($nptr,$j),$Ni[1] # n[5] | ||
| 1298 | xor $A1[0],$A1[0] | ||
| 1299 | add $A0[0],$A1[1] | ||
| 1300 | adc \$0,$A1[0] | ||
| 1301 | mul $m1 # n[3]*m1 | ||
| 1302 | add %rax,$A1[1] # n[3]*m1+"t[4]" | ||
| 1303 | mov $Ni[1],%rax | ||
| 1304 | adc %rdx,$A1[0] | ||
| 1305 | mov $A1[1],($tptr,$j) # "t[4]" | ||
| 1306 | |||
| 1307 | xor $A0[0],$A0[0] | ||
| 1308 | add 8($tptr,$j),$A0[1] | ||
| 1309 | adc \$0,$A0[0] | ||
| 1310 | mul $m0 # n[5]*m0 | ||
| 1311 | add %rax,$A0[1] # n[5]*m0+t[5] | ||
| 1312 | mov $Ni[0],%rax | ||
| 1313 | adc %rdx,$A0[0] | ||
| 1314 | |||
| 1315 | |||
| 1316 | mov 16($nptr,$j),$Ni[0] # n[6] | ||
| 1317 | xor $A1[1],$A1[1] | ||
| 1318 | add $A0[1],$A1[0] | ||
| 1319 | adc \$0,$A1[1] | ||
| 1320 | mul $m1 # n[4]*m1 | ||
| 1321 | add %rax,$A1[0] # n[4]*m1+"t[5]" | ||
| 1322 | mov $Ni[0],%rax | ||
| 1323 | adc %rdx,$A1[1] | ||
| 1324 | mov $A1[0],8($tptr,$j) # "t[5]" | ||
| 1325 | |||
| 1326 | xor $A0[1],$A0[1] | ||
| 1327 | add 16($tptr,$j),$A0[0] | ||
| 1328 | adc \$0,$A0[1] | ||
| 1329 | mul $m0 # n[6]*m0 | ||
| 1330 | add %rax,$A0[0] # n[6]*m0+t[6] | ||
| 1331 | mov $Ni[1],%rax | ||
| 1332 | adc %rdx,$A0[1] | ||
| 1333 | |||
| 1334 | mov 24($nptr,$j),$Ni[1] # n[7] | ||
| 1335 | xor $A1[0],$A1[0] | ||
| 1336 | add $A0[0],$A1[1] | ||
| 1337 | adc \$0,$A1[0] | ||
| 1338 | mul $m1 # n[5]*m1 | ||
| 1339 | add %rax,$A1[1] # n[5]*m1+"t[6]" | ||
| 1340 | mov $Ni[1],%rax | ||
| 1341 | adc %rdx,$A1[0] | ||
| 1342 | mov $A1[1],16($tptr,$j) # "t[6]" | ||
| 1343 | |||
| 1344 | xor $A0[0],$A0[0] | ||
| 1345 | add 24($tptr,$j),$A0[1] | ||
| 1346 | lea 32($j),$j | ||
| 1347 | adc \$0,$A0[0] | ||
| 1348 | mul $m0 # n[7]*m0 | ||
| 1349 | add %rax,$A0[1] # n[7]*m0+t[7] | ||
| 1350 | mov $Ni[0],%rax | ||
| 1351 | adc %rdx,$A0[0] | ||
| 1352 | cmp \$0,$j | ||
| 1353 | jne .Lsqr4x_mont_inner | ||
| 1354 | |||
| 1355 | sub 0(%rsp),$j # $j=-$num # modsched # | ||
| 1356 | mov $n0,$m0 # # modsched # | ||
| 1357 | |||
| 1358 | xor $A1[1],$A1[1] | ||
| 1359 | add $A0[1],$A1[0] | ||
| 1360 | adc \$0,$A1[1] | ||
| 1361 | mul $m1 # n[6]*m1 | ||
| 1362 | add %rax,$A1[0] # n[6]*m1+"t[7]" | ||
| 1363 | mov $Ni[1],%rax | ||
| 1364 | adc %rdx,$A1[1] | ||
| 1365 | mov $A1[0],-8($tptr) # "t[7]" | ||
| 1366 | |||
| 1367 | xor $A0[1],$A0[1] | ||
| 1368 | add ($tptr),$A0[0] # +t[8] | ||
| 1369 | adc \$0,$A0[1] | ||
| 1370 | mov 0($nptr,$j),$Ni[0] # n[0] # modsched # | ||
| 1371 | add $topbit,$A0[0] | ||
| 1372 | adc \$0,$A0[1] | ||
| 1373 | |||
| 1374 | imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched # | ||
| 1375 | xor $A1[0],$A1[0] | ||
| 1376 | mov 8($nptr,$j),$Ni[1] # n[1] # modsched # | ||
| 1377 | add $A0[0],$A1[1] | ||
| 1378 | mov 16($tptr,$j),$A0[0] # t[0] # modsched # | ||
| 1379 | adc \$0,$A1[0] | ||
| 1380 | mul $m1 # n[7]*m1 | ||
| 1381 | add %rax,$A1[1] # n[7]*m1+"t[8]" | ||
| 1382 | mov $Ni[0],%rax # # modsched # | ||
| 1383 | adc %rdx,$A1[0] | ||
| 1384 | mov $A1[1],($tptr) # "t[8]" | ||
| 1385 | |||
| 1386 | xor $topbit,$topbit | ||
| 1387 | add 8($tptr),$A1[0] # +t[9] | ||
| 1388 | adc $topbit,$topbit | ||
| 1389 | add $A0[1],$A1[0] | ||
| 1390 | lea 16($tptr),$tptr # "t[$num]>>128" | ||
| 1391 | adc \$0,$topbit | ||
| 1392 | mov $A1[0],-8($tptr) # "t[9]" | ||
| 1393 | cmp 8(%rsp),$tptr # are we done? | ||
| 1394 | jb .Lsqr4x_mont_outer | ||
| 1395 | |||
| 1396 | mov 0(%rsp),$num # restore $num | ||
| 1397 | mov $topbit,($tptr) # save $topbit | ||
| 1398 | ___ | ||
| 1399 | } | ||
| 1400 | ############################################################## | ||
| 1401 | # Post-condition, 4x unrolled copy from bn_mul_mont | ||
| 1402 | # | ||
| 1403 | { | ||
| 1404 | my ($tptr,$nptr)=("%rbx",$aptr); | ||
| 1405 | my @ri=("%rax","%rdx","%r10","%r11"); | ||
| 1406 | $code.=<<___; | ||
| 1407 | mov 64(%rsp,$num),@ri[0] # tp[0] | ||
| 1408 | lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result | ||
| 1409 | mov 40(%rsp),$nptr # restore $nptr | ||
| 1410 | shr \$5,$num # num/4 | ||
| 1411 | mov 8($tptr),@ri[1] # t[1] | ||
| 1412 | xor $i,$i # i=0 and clear CF! | ||
| 1413 | |||
| 1414 | mov 32(%rsp),$rptr # restore $rptr | ||
| 1415 | sub 0($nptr),@ri[0] | ||
| 1416 | mov 16($tptr),@ri[2] # t[2] | ||
| 1417 | mov 24($tptr),@ri[3] # t[3] | ||
| 1418 | sbb 8($nptr),@ri[1] | ||
| 1419 | lea -1($num),$j # j=num/4-1 | ||
| 1420 | jmp .Lsqr4x_sub | ||
| 1421 | .align 16 | ||
| 1422 | .Lsqr4x_sub: | ||
| 1423 | mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1424 | mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1425 | sbb 16($nptr,$i,8),@ri[2] | ||
| 1426 | mov 32($tptr,$i,8),@ri[0] # tp[i+1] | ||
| 1427 | mov 40($tptr,$i,8),@ri[1] | ||
| 1428 | sbb 24($nptr,$i,8),@ri[3] | ||
| 1429 | mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1430 | mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1431 | sbb 32($nptr,$i,8),@ri[0] | ||
| 1432 | mov 48($tptr,$i,8),@ri[2] | ||
| 1433 | mov 56($tptr,$i,8),@ri[3] | ||
| 1434 | sbb 40($nptr,$i,8),@ri[1] | ||
| 1435 | lea 4($i),$i # i++ | ||
| 1436 | dec $j # doesn't affect CF! | ||
| 1437 | jnz .Lsqr4x_sub | ||
| 1438 | |||
| 1439 | mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1440 | mov 32($tptr,$i,8),@ri[0] # load overflow bit | ||
| 1441 | sbb 16($nptr,$i,8),@ri[2] | ||
| 1442 | mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1443 | sbb 24($nptr,$i,8),@ri[3] | ||
| 1444 | mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1445 | |||
| 1446 | sbb \$0,@ri[0] # handle upmost overflow bit | ||
| 1447 | mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1448 | xor $i,$i # i=0 | ||
| 1449 | and @ri[0],$tptr | ||
| 1450 | not @ri[0] | ||
| 1451 | mov $rptr,$nptr | ||
| 1452 | and @ri[0],$nptr | ||
| 1453 | lea -1($num),$j | ||
| 1454 | or $nptr,$tptr # tp=borrow?tp:rp | ||
| 1455 | |||
| 1456 | pxor %xmm0,%xmm0 | ||
| 1457 | lea 64(%rsp,$num,8),$nptr | ||
| 1458 | movdqu ($tptr),%xmm1 | ||
| 1459 | lea ($nptr,$num,8),$nptr | ||
| 1460 | movdqa %xmm0,64(%rsp) # zap lower half of temporary vector | ||
| 1461 | movdqa %xmm0,($nptr) # zap upper half of temporary vector | ||
| 1462 | movdqu %xmm1,($rptr) | ||
| 1463 | jmp .Lsqr4x_copy | ||
| 1464 | .align 16 | ||
| 1465 | .Lsqr4x_copy: # copy or in-place refresh | ||
| 1466 | movdqu 16($tptr,$i),%xmm2 | ||
| 1467 | movdqu 32($tptr,$i),%xmm1 | ||
| 1468 | movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector | ||
| 1469 | movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector | ||
| 1470 | movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector | ||
| 1471 | movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector | ||
| 1472 | movdqu %xmm2,16($rptr,$i) | ||
| 1473 | movdqu %xmm1,32($rptr,$i) | ||
| 1474 | lea 32($i),$i | ||
| 1475 | dec $j | ||
| 1476 | jnz .Lsqr4x_copy | ||
| 1477 | |||
| 1478 | movdqu 16($tptr,$i),%xmm2 | ||
| 1479 | movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector | ||
| 1480 | movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector | ||
| 1481 | movdqu %xmm2,16($rptr,$i) | ||
| 1482 | ___ | ||
| 1483 | } | ||
| 1484 | $code.=<<___; | ||
| 1485 | mov 56(%rsp),%rsi # restore %rsp | ||
| 1486 | mov \$1,%rax | ||
| 1487 | mov 0(%rsi),%r15 | ||
| 1488 | mov 8(%rsi),%r14 | ||
| 1489 | mov 16(%rsi),%r13 | ||
| 1490 | mov 24(%rsi),%r12 | ||
| 1491 | mov 32(%rsi),%rbp | ||
| 1492 | mov 40(%rsi),%rbx | ||
| 1493 | lea 48(%rsi),%rsp | ||
| 1494 | .Lsqr4x_epilogue: | ||
| 1495 | ret | ||
| 1496 | .size bn_sqr4x_mont,.-bn_sqr4x_mont | ||
| 1497 | ___ | ||
| 1498 | }}} | ||
| 1499 | $code.=<<___; | ||
| 217 | .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | 1500 | .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| 218 | .align 16 | 1501 | .align 16 |
| 219 | ___ | 1502 | ___ |
| @@ -228,9 +1511,9 @@ $disp="%r9"; | |||
| 228 | 1511 | ||
| 229 | $code.=<<___; | 1512 | $code.=<<___; |
| 230 | .extern __imp_RtlVirtualUnwind | 1513 | .extern __imp_RtlVirtualUnwind |
| 231 | .type se_handler,\@abi-omnipotent | 1514 | .type mul_handler,\@abi-omnipotent |
| 232 | .align 16 | 1515 | .align 16 |
| 233 | se_handler: | 1516 | mul_handler: |
| 234 | push %rsi | 1517 | push %rsi |
| 235 | push %rdi | 1518 | push %rdi |
| 236 | push %rbx | 1519 | push %rbx |
| @@ -245,15 +1528,20 @@ se_handler: | |||
| 245 | mov 120($context),%rax # pull context->Rax | 1528 | mov 120($context),%rax # pull context->Rax |
| 246 | mov 248($context),%rbx # pull context->Rip | 1529 | mov 248($context),%rbx # pull context->Rip |
| 247 | 1530 | ||
| 248 | lea .Lprologue(%rip),%r10 | 1531 | mov 8($disp),%rsi # disp->ImageBase |
| 249 | cmp %r10,%rbx # context->Rip<.Lprologue | 1532 | mov 56($disp),%r11 # disp->HandlerData |
| 250 | jb .Lin_prologue | 1533 | |
| 1534 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 1535 | lea (%rsi,%r10),%r10 # end of prologue label | ||
| 1536 | cmp %r10,%rbx # context->Rip<end of prologue label | ||
| 1537 | jb .Lcommon_seh_tail | ||
| 251 | 1538 | ||
| 252 | mov 152($context),%rax # pull context->Rsp | 1539 | mov 152($context),%rax # pull context->Rsp |
| 253 | 1540 | ||
| 254 | lea .Lepilogue(%rip),%r10 | 1541 | mov 4(%r11),%r10d # HandlerData[1] |
| 255 | cmp %r10,%rbx # context->Rip>=.Lepilogue | 1542 | lea (%rsi,%r10),%r10 # epilogue label |
| 256 | jae .Lin_prologue | 1543 | cmp %r10,%rbx # context->Rip>=epilogue label |
| 1544 | jae .Lcommon_seh_tail | ||
| 257 | 1545 | ||
| 258 | mov 192($context),%r10 # pull $num | 1546 | mov 192($context),%r10 # pull $num |
| 259 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer | 1547 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer |
| @@ -272,7 +1560,53 @@ se_handler: | |||
| 272 | mov %r14,232($context) # restore context->R14 | 1560 | mov %r14,232($context) # restore context->R14 |
| 273 | mov %r15,240($context) # restore context->R15 | 1561 | mov %r15,240($context) # restore context->R15 |
| 274 | 1562 | ||
| 275 | .Lin_prologue: | 1563 | jmp .Lcommon_seh_tail |
| 1564 | .size mul_handler,.-mul_handler | ||
| 1565 | |||
| 1566 | .type sqr_handler,\@abi-omnipotent | ||
| 1567 | .align 16 | ||
| 1568 | sqr_handler: | ||
| 1569 | push %rsi | ||
| 1570 | push %rdi | ||
| 1571 | push %rbx | ||
| 1572 | push %rbp | ||
| 1573 | push %r12 | ||
| 1574 | push %r13 | ||
| 1575 | push %r14 | ||
| 1576 | push %r15 | ||
| 1577 | pushfq | ||
| 1578 | sub \$64,%rsp | ||
| 1579 | |||
| 1580 | mov 120($context),%rax # pull context->Rax | ||
| 1581 | mov 248($context),%rbx # pull context->Rip | ||
| 1582 | |||
| 1583 | lea .Lsqr4x_body(%rip),%r10 | ||
| 1584 | cmp %r10,%rbx # context->Rip<.Lsqr_body | ||
| 1585 | jb .Lcommon_seh_tail | ||
| 1586 | |||
| 1587 | mov 152($context),%rax # pull context->Rsp | ||
| 1588 | |||
| 1589 | lea .Lsqr4x_epilogue(%rip),%r10 | ||
| 1590 | cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue | ||
| 1591 | jae .Lcommon_seh_tail | ||
| 1592 | |||
| 1593 | mov 56(%rax),%rax # pull saved stack pointer | ||
| 1594 | lea 48(%rax),%rax | ||
| 1595 | |||
| 1596 | mov -8(%rax),%rbx | ||
| 1597 | mov -16(%rax),%rbp | ||
| 1598 | mov -24(%rax),%r12 | ||
| 1599 | mov -32(%rax),%r13 | ||
| 1600 | mov -40(%rax),%r14 | ||
| 1601 | mov -48(%rax),%r15 | ||
| 1602 | mov %rbx,144($context) # restore context->Rbx | ||
| 1603 | mov %rbp,160($context) # restore context->Rbp | ||
| 1604 | mov %r12,216($context) # restore context->R12 | ||
| 1605 | mov %r13,224($context) # restore context->R13 | ||
| 1606 | mov %r14,232($context) # restore context->R14 | ||
| 1607 | mov %r15,240($context) # restore context->R15 | ||
| 1608 | |||
| 1609 | .Lcommon_seh_tail: | ||
| 276 | mov 8(%rax),%rdi | 1610 | mov 8(%rax),%rdi |
| 277 | mov 16(%rax),%rsi | 1611 | mov 16(%rax),%rsi |
| 278 | mov %rax,152($context) # restore context->Rsp | 1612 | mov %rax,152($context) # restore context->Rsp |
| @@ -310,7 +1644,7 @@ se_handler: | |||
| 310 | pop %rdi | 1644 | pop %rdi |
| 311 | pop %rsi | 1645 | pop %rsi |
| 312 | ret | 1646 | ret |
| 313 | .size se_handler,.-se_handler | 1647 | .size sqr_handler,.-sqr_handler |
| 314 | 1648 | ||
| 315 | .section .pdata | 1649 | .section .pdata |
| 316 | .align 4 | 1650 | .align 4 |
| @@ -318,11 +1652,27 @@ se_handler: | |||
| 318 | .rva .LSEH_end_bn_mul_mont | 1652 | .rva .LSEH_end_bn_mul_mont |
| 319 | .rva .LSEH_info_bn_mul_mont | 1653 | .rva .LSEH_info_bn_mul_mont |
| 320 | 1654 | ||
| 1655 | .rva .LSEH_begin_bn_mul4x_mont | ||
| 1656 | .rva .LSEH_end_bn_mul4x_mont | ||
| 1657 | .rva .LSEH_info_bn_mul4x_mont | ||
| 1658 | |||
| 1659 | .rva .LSEH_begin_bn_sqr4x_mont | ||
| 1660 | .rva .LSEH_end_bn_sqr4x_mont | ||
| 1661 | .rva .LSEH_info_bn_sqr4x_mont | ||
| 1662 | |||
| 321 | .section .xdata | 1663 | .section .xdata |
| 322 | .align 8 | 1664 | .align 8 |
| 323 | .LSEH_info_bn_mul_mont: | 1665 | .LSEH_info_bn_mul_mont: |
| 324 | .byte 9,0,0,0 | 1666 | .byte 9,0,0,0 |
| 325 | .rva se_handler | 1667 | .rva mul_handler |
| 1668 | .rva .Lmul_body,.Lmul_epilogue # HandlerData[] | ||
| 1669 | .LSEH_info_bn_mul4x_mont: | ||
| 1670 | .byte 9,0,0,0 | ||
| 1671 | .rva mul_handler | ||
| 1672 | .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] | ||
| 1673 | .LSEH_info_bn_sqr4x_mont: | ||
| 1674 | .byte 9,0,0,0 | ||
| 1675 | .rva sqr_handler | ||
| 326 | ___ | 1676 | ___ |
| 327 | } | 1677 | } |
| 328 | 1678 | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl new file mode 100755 index 0000000000..057cda28aa --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl | |||
| @@ -0,0 +1,1070 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # August 2011. | ||
| 11 | # | ||
| 12 | # Companion to x86_64-mont.pl that optimizes cache-timing attack | ||
| 13 | # countermeasures. The subroutines are produced by replacing bp[i] | ||
| 14 | # references in their x86_64-mont.pl counterparts with cache-neutral | ||
| 15 | # references to powers table computed in BN_mod_exp_mont_consttime. | ||
| 16 | # In addition subroutine that scatters elements of the powers table | ||
| 17 | # is implemented, so that scatter-/gathering can be tuned without | ||
| 18 | # bn_exp.c modifications. | ||
| 19 | |||
| 20 | $flavour = shift; | ||
| 21 | $output = shift; | ||
| 22 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 23 | |||
| 24 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 25 | |||
| 26 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 27 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 28 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 29 | die "can't locate x86_64-xlate.pl"; | ||
| 30 | |||
| 31 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 32 | |||
| 33 | # int bn_mul_mont_gather5( | ||
| 34 | $rp="%rdi"; # BN_ULONG *rp, | ||
| 35 | $ap="%rsi"; # const BN_ULONG *ap, | ||
| 36 | $bp="%rdx"; # const BN_ULONG *bp, | ||
| 37 | $np="%rcx"; # const BN_ULONG *np, | ||
| 38 | $n0="%r8"; # const BN_ULONG *n0, | ||
| 39 | $num="%r9"; # int num, | ||
| 40 | # int idx); # 0 to 2^5-1, "index" in $bp holding | ||
| 41 | # pre-computed powers of a', interlaced | ||
| 42 | # in such manner that b[0] is $bp[idx], | ||
| 43 | # b[1] is [2^5+idx], etc. | ||
| 44 | $lo0="%r10"; | ||
| 45 | $hi0="%r11"; | ||
| 46 | $hi1="%r13"; | ||
| 47 | $i="%r14"; | ||
| 48 | $j="%r15"; | ||
| 49 | $m0="%rbx"; | ||
| 50 | $m1="%rbp"; | ||
| 51 | |||
| 52 | $code=<<___; | ||
| 53 | .text | ||
| 54 | |||
| 55 | .globl bn_mul_mont_gather5 | ||
| 56 | .type bn_mul_mont_gather5,\@function,6 | ||
| 57 | .align 64 | ||
| 58 | bn_mul_mont_gather5: | ||
| 59 | test \$3,${num}d | ||
| 60 | jnz .Lmul_enter | ||
| 61 | cmp \$8,${num}d | ||
| 62 | jb .Lmul_enter | ||
| 63 | jmp .Lmul4x_enter | ||
| 64 | |||
| 65 | .align 16 | ||
| 66 | .Lmul_enter: | ||
| 67 | mov ${num}d,${num}d | ||
| 68 | mov `($win64?56:8)`(%rsp),%r10d # load 7th argument | ||
| 69 | push %rbx | ||
| 70 | push %rbp | ||
| 71 | push %r12 | ||
| 72 | push %r13 | ||
| 73 | push %r14 | ||
| 74 | push %r15 | ||
| 75 | ___ | ||
| 76 | $code.=<<___ if ($win64); | ||
| 77 | lea -0x28(%rsp),%rsp | ||
| 78 | movaps %xmm6,(%rsp) | ||
| 79 | movaps %xmm7,0x10(%rsp) | ||
| 80 | .Lmul_alloca: | ||
| 81 | ___ | ||
| 82 | $code.=<<___; | ||
| 83 | mov %rsp,%rax | ||
| 84 | lea 2($num),%r11 | ||
| 85 | neg %r11 | ||
| 86 | lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) | ||
| 87 | and \$-1024,%rsp # minimize TLB usage | ||
| 88 | |||
| 89 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp | ||
| 90 | .Lmul_body: | ||
| 91 | mov $bp,%r12 # reassign $bp | ||
| 92 | ___ | ||
| 93 | $bp="%r12"; | ||
| 94 | $STRIDE=2**5*8; # 5 is "window size" | ||
| 95 | $N=$STRIDE/4; # should match cache line size | ||
| 96 | $code.=<<___; | ||
| 97 | mov %r10,%r11 | ||
| 98 | shr \$`log($N/8)/log(2)`,%r10 | ||
| 99 | and \$`$N/8-1`,%r11 | ||
| 100 | not %r10 | ||
| 101 | lea .Lmagic_masks(%rip),%rax | ||
| 102 | and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" | ||
| 103 | lea 96($bp,%r11,8),$bp # pointer within 1st cache line | ||
| 104 | movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which | ||
| 105 | movq 8(%rax,%r10,8),%xmm5 # cache line contains element | ||
| 106 | movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument | ||
| 107 | movq 24(%rax,%r10,8),%xmm7 | ||
| 108 | |||
| 109 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 110 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 111 | pand %xmm4,%xmm0 | ||
| 112 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 113 | pand %xmm5,%xmm1 | ||
| 114 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 115 | pand %xmm6,%xmm2 | ||
| 116 | por %xmm1,%xmm0 | ||
| 117 | pand %xmm7,%xmm3 | ||
| 118 | por %xmm2,%xmm0 | ||
| 119 | lea $STRIDE($bp),$bp | ||
| 120 | por %xmm3,%xmm0 | ||
| 121 | |||
| 122 | movq %xmm0,$m0 # m0=bp[0] | ||
| 123 | |||
| 124 | mov ($n0),$n0 # pull n0[0] value | ||
| 125 | mov ($ap),%rax | ||
| 126 | |||
| 127 | xor $i,$i # i=0 | ||
| 128 | xor $j,$j # j=0 | ||
| 129 | |||
| 130 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 131 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 132 | pand %xmm4,%xmm0 | ||
| 133 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 134 | pand %xmm5,%xmm1 | ||
| 135 | |||
| 136 | mov $n0,$m1 | ||
| 137 | mulq $m0 # ap[0]*bp[0] | ||
| 138 | mov %rax,$lo0 | ||
| 139 | mov ($np),%rax | ||
| 140 | |||
| 141 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 142 | pand %xmm6,%xmm2 | ||
| 143 | por %xmm1,%xmm0 | ||
| 144 | pand %xmm7,%xmm3 | ||
| 145 | |||
| 146 | imulq $lo0,$m1 # "tp[0]"*n0 | ||
| 147 | mov %rdx,$hi0 | ||
| 148 | |||
| 149 | por %xmm2,%xmm0 | ||
| 150 | lea $STRIDE($bp),$bp | ||
| 151 | por %xmm3,%xmm0 | ||
| 152 | |||
| 153 | mulq $m1 # np[0]*m1 | ||
| 154 | add %rax,$lo0 # discarded | ||
| 155 | mov 8($ap),%rax | ||
| 156 | adc \$0,%rdx | ||
| 157 | mov %rdx,$hi1 | ||
| 158 | |||
| 159 | lea 1($j),$j # j++ | ||
| 160 | jmp .L1st_enter | ||
| 161 | |||
| 162 | .align 16 | ||
| 163 | .L1st: | ||
| 164 | add %rax,$hi1 | ||
| 165 | mov ($ap,$j,8),%rax | ||
| 166 | adc \$0,%rdx | ||
| 167 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] | ||
| 168 | mov $lo0,$hi0 | ||
| 169 | adc \$0,%rdx | ||
| 170 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
| 171 | mov %rdx,$hi1 | ||
| 172 | |||
| 173 | .L1st_enter: | ||
| 174 | mulq $m0 # ap[j]*bp[0] | ||
| 175 | add %rax,$hi0 | ||
| 176 | mov ($np,$j,8),%rax | ||
| 177 | adc \$0,%rdx | ||
| 178 | lea 1($j),$j # j++ | ||
| 179 | mov %rdx,$lo0 | ||
| 180 | |||
| 181 | mulq $m1 # np[j]*m1 | ||
| 182 | cmp $num,$j | ||
| 183 | jne .L1st | ||
| 184 | |||
| 185 | movq %xmm0,$m0 # bp[1] | ||
| 186 | |||
| 187 | add %rax,$hi1 | ||
| 188 | mov ($ap),%rax # ap[0] | ||
| 189 | adc \$0,%rdx | ||
| 190 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] | ||
| 191 | adc \$0,%rdx | ||
| 192 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
| 193 | mov %rdx,$hi1 | ||
| 194 | mov $lo0,$hi0 | ||
| 195 | |||
| 196 | xor %rdx,%rdx | ||
| 197 | add $hi0,$hi1 | ||
| 198 | adc \$0,%rdx | ||
| 199 | mov $hi1,-8(%rsp,$num,8) | ||
| 200 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | ||
| 201 | |||
| 202 | lea 1($i),$i # i++ | ||
| 203 | jmp .Louter | ||
| 204 | .align 16 | ||
| 205 | .Louter: | ||
| 206 | xor $j,$j # j=0 | ||
| 207 | mov $n0,$m1 | ||
| 208 | mov (%rsp),$lo0 | ||
| 209 | |||
| 210 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 211 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 212 | pand %xmm4,%xmm0 | ||
| 213 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 214 | pand %xmm5,%xmm1 | ||
| 215 | |||
| 216 | mulq $m0 # ap[0]*bp[i] | ||
| 217 | add %rax,$lo0 # ap[0]*bp[i]+tp[0] | ||
| 218 | mov ($np),%rax | ||
| 219 | adc \$0,%rdx | ||
| 220 | |||
| 221 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 222 | pand %xmm6,%xmm2 | ||
| 223 | por %xmm1,%xmm0 | ||
| 224 | pand %xmm7,%xmm3 | ||
| 225 | |||
| 226 | imulq $lo0,$m1 # tp[0]*n0 | ||
| 227 | mov %rdx,$hi0 | ||
| 228 | |||
| 229 | por %xmm2,%xmm0 | ||
| 230 | lea $STRIDE($bp),$bp | ||
| 231 | por %xmm3,%xmm0 | ||
| 232 | |||
| 233 | mulq $m1 # np[0]*m1 | ||
| 234 | add %rax,$lo0 # discarded | ||
| 235 | mov 8($ap),%rax | ||
| 236 | adc \$0,%rdx | ||
| 237 | mov 8(%rsp),$lo0 # tp[1] | ||
| 238 | mov %rdx,$hi1 | ||
| 239 | |||
| 240 | lea 1($j),$j # j++ | ||
| 241 | jmp .Linner_enter | ||
| 242 | |||
| 243 | .align 16 | ||
| 244 | .Linner: | ||
| 245 | add %rax,$hi1 | ||
| 246 | mov ($ap,$j,8),%rax | ||
| 247 | adc \$0,%rdx | ||
| 248 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 249 | mov (%rsp,$j,8),$lo0 | ||
| 250 | adc \$0,%rdx | ||
| 251 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
| 252 | mov %rdx,$hi1 | ||
| 253 | |||
| 254 | .Linner_enter: | ||
| 255 | mulq $m0 # ap[j]*bp[i] | ||
| 256 | add %rax,$hi0 | ||
| 257 | mov ($np,$j,8),%rax | ||
| 258 | adc \$0,%rdx | ||
| 259 | add $hi0,$lo0 # ap[j]*bp[i]+tp[j] | ||
| 260 | mov %rdx,$hi0 | ||
| 261 | adc \$0,$hi0 | ||
| 262 | lea 1($j),$j # j++ | ||
| 263 | |||
| 264 | mulq $m1 # np[j]*m1 | ||
| 265 | cmp $num,$j | ||
| 266 | jne .Linner | ||
| 267 | |||
| 268 | movq %xmm0,$m0 # bp[i+1] | ||
| 269 | |||
| 270 | add %rax,$hi1 | ||
| 271 | mov ($ap),%rax # ap[0] | ||
| 272 | adc \$0,%rdx | ||
| 273 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 274 | mov (%rsp,$j,8),$lo0 | ||
| 275 | adc \$0,%rdx | ||
| 276 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
| 277 | mov %rdx,$hi1 | ||
| 278 | |||
| 279 | xor %rdx,%rdx | ||
| 280 | add $hi0,$hi1 | ||
| 281 | adc \$0,%rdx | ||
| 282 | add $lo0,$hi1 # pull upmost overflow bit | ||
| 283 | adc \$0,%rdx | ||
| 284 | mov $hi1,-8(%rsp,$num,8) | ||
| 285 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | ||
| 286 | |||
| 287 | lea 1($i),$i # i++ | ||
| 288 | cmp $num,$i | ||
| 289 | jl .Louter | ||
| 290 | |||
| 291 | xor $i,$i # i=0 and clear CF! | ||
| 292 | mov (%rsp),%rax # tp[0] | ||
| 293 | lea (%rsp),$ap # borrow ap for tp | ||
| 294 | mov $num,$j # j=num | ||
| 295 | jmp .Lsub | ||
| 296 | .align 16 | ||
| 297 | .Lsub: sbb ($np,$i,8),%rax | ||
| 298 | mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 299 | mov 8($ap,$i,8),%rax # tp[i+1] | ||
| 300 | lea 1($i),$i # i++ | ||
| 301 | dec $j # doesnn't affect CF! | ||
| 302 | jnz .Lsub | ||
| 303 | |||
| 304 | sbb \$0,%rax # handle upmost overflow bit | ||
| 305 | xor $i,$i | ||
| 306 | and %rax,$ap | ||
| 307 | not %rax | ||
| 308 | mov $rp,$np | ||
| 309 | and %rax,$np | ||
| 310 | mov $num,$j # j=num | ||
| 311 | or $np,$ap # ap=borrow?tp:rp | ||
| 312 | .align 16 | ||
| 313 | .Lcopy: # copy or in-place refresh | ||
| 314 | mov ($ap,$i,8),%rax | ||
| 315 | mov $i,(%rsp,$i,8) # zap temporary vector | ||
| 316 | mov %rax,($rp,$i,8) # rp[i]=tp[i] | ||
| 317 | lea 1($i),$i | ||
| 318 | sub \$1,$j | ||
| 319 | jnz .Lcopy | ||
| 320 | |||
| 321 | mov 8(%rsp,$num,8),%rsi # restore %rsp | ||
| 322 | mov \$1,%rax | ||
| 323 | ___ | ||
| 324 | $code.=<<___ if ($win64); | ||
| 325 | movaps (%rsi),%xmm6 | ||
| 326 | movaps 0x10(%rsi),%xmm7 | ||
| 327 | lea 0x28(%rsi),%rsi | ||
| 328 | ___ | ||
| 329 | $code.=<<___; | ||
| 330 | mov (%rsi),%r15 | ||
| 331 | mov 8(%rsi),%r14 | ||
| 332 | mov 16(%rsi),%r13 | ||
| 333 | mov 24(%rsi),%r12 | ||
| 334 | mov 32(%rsi),%rbp | ||
| 335 | mov 40(%rsi),%rbx | ||
| 336 | lea 48(%rsi),%rsp | ||
| 337 | .Lmul_epilogue: | ||
| 338 | ret | ||
| 339 | .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 | ||
| 340 | ___ | ||
| 341 | {{{ | ||
| 342 | my @A=("%r10","%r11"); | ||
| 343 | my @N=("%r13","%rdi"); | ||
| 344 | $code.=<<___; | ||
| 345 | .type bn_mul4x_mont_gather5,\@function,6 | ||
| 346 | .align 16 | ||
| 347 | bn_mul4x_mont_gather5: | ||
| 348 | .Lmul4x_enter: | ||
| 349 | mov ${num}d,${num}d | ||
| 350 | mov `($win64?56:8)`(%rsp),%r10d # load 7th argument | ||
| 351 | push %rbx | ||
| 352 | push %rbp | ||
| 353 | push %r12 | ||
| 354 | push %r13 | ||
| 355 | push %r14 | ||
| 356 | push %r15 | ||
| 357 | ___ | ||
| 358 | $code.=<<___ if ($win64); | ||
| 359 | lea -0x28(%rsp),%rsp | ||
| 360 | movaps %xmm6,(%rsp) | ||
| 361 | movaps %xmm7,0x10(%rsp) | ||
| 362 | .Lmul4x_alloca: | ||
| 363 | ___ | ||
| 364 | $code.=<<___; | ||
| 365 | mov %rsp,%rax | ||
| 366 | lea 4($num),%r11 | ||
| 367 | neg %r11 | ||
| 368 | lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) | ||
| 369 | and \$-1024,%rsp # minimize TLB usage | ||
| 370 | |||
| 371 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp | ||
| 372 | .Lmul4x_body: | ||
| 373 | mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp | ||
| 374 | mov %rdx,%r12 # reassign $bp | ||
| 375 | ___ | ||
| 376 | $bp="%r12"; | ||
| 377 | $STRIDE=2**5*8; # 5 is "window size" | ||
| 378 | $N=$STRIDE/4; # should match cache line size | ||
| 379 | $code.=<<___; | ||
| 380 | mov %r10,%r11 | ||
| 381 | shr \$`log($N/8)/log(2)`,%r10 | ||
| 382 | and \$`$N/8-1`,%r11 | ||
| 383 | not %r10 | ||
| 384 | lea .Lmagic_masks(%rip),%rax | ||
| 385 | and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" | ||
| 386 | lea 96($bp,%r11,8),$bp # pointer within 1st cache line | ||
| 387 | movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which | ||
| 388 | movq 8(%rax,%r10,8),%xmm5 # cache line contains element | ||
| 389 | movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument | ||
| 390 | movq 24(%rax,%r10,8),%xmm7 | ||
| 391 | |||
| 392 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 393 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 394 | pand %xmm4,%xmm0 | ||
| 395 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 396 | pand %xmm5,%xmm1 | ||
| 397 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 398 | pand %xmm6,%xmm2 | ||
| 399 | por %xmm1,%xmm0 | ||
| 400 | pand %xmm7,%xmm3 | ||
| 401 | por %xmm2,%xmm0 | ||
| 402 | lea $STRIDE($bp),$bp | ||
| 403 | por %xmm3,%xmm0 | ||
| 404 | |||
| 405 | movq %xmm0,$m0 # m0=bp[0] | ||
| 406 | mov ($n0),$n0 # pull n0[0] value | ||
| 407 | mov ($ap),%rax | ||
| 408 | |||
| 409 | xor $i,$i # i=0 | ||
| 410 | xor $j,$j # j=0 | ||
| 411 | |||
| 412 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 413 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 414 | pand %xmm4,%xmm0 | ||
| 415 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 416 | pand %xmm5,%xmm1 | ||
| 417 | |||
| 418 | mov $n0,$m1 | ||
| 419 | mulq $m0 # ap[0]*bp[0] | ||
| 420 | mov %rax,$A[0] | ||
| 421 | mov ($np),%rax | ||
| 422 | |||
| 423 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 424 | pand %xmm6,%xmm2 | ||
| 425 | por %xmm1,%xmm0 | ||
| 426 | pand %xmm7,%xmm3 | ||
| 427 | |||
| 428 | imulq $A[0],$m1 # "tp[0]"*n0 | ||
| 429 | mov %rdx,$A[1] | ||
| 430 | |||
| 431 | por %xmm2,%xmm0 | ||
| 432 | lea $STRIDE($bp),$bp | ||
| 433 | por %xmm3,%xmm0 | ||
| 434 | |||
| 435 | mulq $m1 # np[0]*m1 | ||
| 436 | add %rax,$A[0] # discarded | ||
| 437 | mov 8($ap),%rax | ||
| 438 | adc \$0,%rdx | ||
| 439 | mov %rdx,$N[1] | ||
| 440 | |||
| 441 | mulq $m0 | ||
| 442 | add %rax,$A[1] | ||
| 443 | mov 8($np),%rax | ||
| 444 | adc \$0,%rdx | ||
| 445 | mov %rdx,$A[0] | ||
| 446 | |||
| 447 | mulq $m1 | ||
| 448 | add %rax,$N[1] | ||
| 449 | mov 16($ap),%rax | ||
| 450 | adc \$0,%rdx | ||
| 451 | add $A[1],$N[1] | ||
| 452 | lea 4($j),$j # j++ | ||
| 453 | adc \$0,%rdx | ||
| 454 | mov $N[1],(%rsp) | ||
| 455 | mov %rdx,$N[0] | ||
| 456 | jmp .L1st4x | ||
| 457 | .align 16 | ||
| 458 | .L1st4x: | ||
| 459 | mulq $m0 # ap[j]*bp[0] | ||
| 460 | add %rax,$A[0] | ||
| 461 | mov -16($np,$j,8),%rax | ||
| 462 | adc \$0,%rdx | ||
| 463 | mov %rdx,$A[1] | ||
| 464 | |||
| 465 | mulq $m1 # np[j]*m1 | ||
| 466 | add %rax,$N[0] | ||
| 467 | mov -8($ap,$j,8),%rax | ||
| 468 | adc \$0,%rdx | ||
| 469 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
| 470 | adc \$0,%rdx | ||
| 471 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 472 | mov %rdx,$N[1] | ||
| 473 | |||
| 474 | mulq $m0 # ap[j]*bp[0] | ||
| 475 | add %rax,$A[1] | ||
| 476 | mov -8($np,$j,8),%rax | ||
| 477 | adc \$0,%rdx | ||
| 478 | mov %rdx,$A[0] | ||
| 479 | |||
| 480 | mulq $m1 # np[j]*m1 | ||
| 481 | add %rax,$N[1] | ||
| 482 | mov ($ap,$j,8),%rax | ||
| 483 | adc \$0,%rdx | ||
| 484 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
| 485 | adc \$0,%rdx | ||
| 486 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 487 | mov %rdx,$N[0] | ||
| 488 | |||
| 489 | mulq $m0 # ap[j]*bp[0] | ||
| 490 | add %rax,$A[0] | ||
| 491 | mov ($np,$j,8),%rax | ||
| 492 | adc \$0,%rdx | ||
| 493 | mov %rdx,$A[1] | ||
| 494 | |||
| 495 | mulq $m1 # np[j]*m1 | ||
| 496 | add %rax,$N[0] | ||
| 497 | mov 8($ap,$j,8),%rax | ||
| 498 | adc \$0,%rdx | ||
| 499 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
| 500 | adc \$0,%rdx | ||
| 501 | mov $N[0],-8(%rsp,$j,8) # tp[j-1] | ||
| 502 | mov %rdx,$N[1] | ||
| 503 | |||
| 504 | mulq $m0 # ap[j]*bp[0] | ||
| 505 | add %rax,$A[1] | ||
| 506 | mov 8($np,$j,8),%rax | ||
| 507 | adc \$0,%rdx | ||
| 508 | lea 4($j),$j # j++ | ||
| 509 | mov %rdx,$A[0] | ||
| 510 | |||
| 511 | mulq $m1 # np[j]*m1 | ||
| 512 | add %rax,$N[1] | ||
| 513 | mov -16($ap,$j,8),%rax | ||
| 514 | adc \$0,%rdx | ||
| 515 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
| 516 | adc \$0,%rdx | ||
| 517 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
| 518 | mov %rdx,$N[0] | ||
| 519 | cmp $num,$j | ||
| 520 | jl .L1st4x | ||
| 521 | |||
| 522 | mulq $m0 # ap[j]*bp[0] | ||
| 523 | add %rax,$A[0] | ||
| 524 | mov -16($np,$j,8),%rax | ||
| 525 | adc \$0,%rdx | ||
| 526 | mov %rdx,$A[1] | ||
| 527 | |||
| 528 | mulq $m1 # np[j]*m1 | ||
| 529 | add %rax,$N[0] | ||
| 530 | mov -8($ap,$j,8),%rax | ||
| 531 | adc \$0,%rdx | ||
| 532 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
| 533 | adc \$0,%rdx | ||
| 534 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 535 | mov %rdx,$N[1] | ||
| 536 | |||
| 537 | mulq $m0 # ap[j]*bp[0] | ||
| 538 | add %rax,$A[1] | ||
| 539 | mov -8($np,$j,8),%rax | ||
| 540 | adc \$0,%rdx | ||
| 541 | mov %rdx,$A[0] | ||
| 542 | |||
| 543 | mulq $m1 # np[j]*m1 | ||
| 544 | add %rax,$N[1] | ||
| 545 | mov ($ap),%rax # ap[0] | ||
| 546 | adc \$0,%rdx | ||
| 547 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
| 548 | adc \$0,%rdx | ||
| 549 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 550 | mov %rdx,$N[0] | ||
| 551 | |||
| 552 | movq %xmm0,$m0 # bp[1] | ||
| 553 | |||
| 554 | xor $N[1],$N[1] | ||
| 555 | add $A[0],$N[0] | ||
| 556 | adc \$0,$N[1] | ||
| 557 | mov $N[0],-8(%rsp,$j,8) | ||
| 558 | mov $N[1],(%rsp,$j,8) # store upmost overflow bit | ||
| 559 | |||
| 560 | lea 1($i),$i # i++ | ||
| 561 | .align 4 | ||
| 562 | .Louter4x: | ||
| 563 | xor $j,$j # j=0 | ||
| 564 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 565 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 566 | pand %xmm4,%xmm0 | ||
| 567 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 568 | pand %xmm5,%xmm1 | ||
| 569 | |||
| 570 | mov (%rsp),$A[0] | ||
| 571 | mov $n0,$m1 | ||
| 572 | mulq $m0 # ap[0]*bp[i] | ||
| 573 | add %rax,$A[0] # ap[0]*bp[i]+tp[0] | ||
| 574 | mov ($np),%rax | ||
| 575 | adc \$0,%rdx | ||
| 576 | |||
| 577 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 578 | pand %xmm6,%xmm2 | ||
| 579 | por %xmm1,%xmm0 | ||
| 580 | pand %xmm7,%xmm3 | ||
| 581 | |||
| 582 | imulq $A[0],$m1 # tp[0]*n0 | ||
| 583 | mov %rdx,$A[1] | ||
| 584 | |||
| 585 | por %xmm2,%xmm0 | ||
| 586 | lea $STRIDE($bp),$bp | ||
| 587 | por %xmm3,%xmm0 | ||
| 588 | |||
| 589 | mulq $m1 # np[0]*m1 | ||
| 590 | add %rax,$A[0] # "$N[0]", discarded | ||
| 591 | mov 8($ap),%rax | ||
| 592 | adc \$0,%rdx | ||
| 593 | mov %rdx,$N[1] | ||
| 594 | |||
| 595 | mulq $m0 # ap[j]*bp[i] | ||
| 596 | add %rax,$A[1] | ||
| 597 | mov 8($np),%rax | ||
| 598 | adc \$0,%rdx | ||
| 599 | add 8(%rsp),$A[1] # +tp[1] | ||
| 600 | adc \$0,%rdx | ||
| 601 | mov %rdx,$A[0] | ||
| 602 | |||
| 603 | mulq $m1 # np[j]*m1 | ||
| 604 | add %rax,$N[1] | ||
| 605 | mov 16($ap),%rax | ||
| 606 | adc \$0,%rdx | ||
| 607 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 608 | lea 4($j),$j # j+=2 | ||
| 609 | adc \$0,%rdx | ||
| 610 | mov %rdx,$N[0] | ||
| 611 | jmp .Linner4x | ||
| 612 | .align 16 | ||
| 613 | .Linner4x: | ||
| 614 | mulq $m0 # ap[j]*bp[i] | ||
| 615 | add %rax,$A[0] | ||
| 616 | mov -16($np,$j,8),%rax | ||
| 617 | adc \$0,%rdx | ||
| 618 | add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
| 619 | adc \$0,%rdx | ||
| 620 | mov %rdx,$A[1] | ||
| 621 | |||
| 622 | mulq $m1 # np[j]*m1 | ||
| 623 | add %rax,$N[0] | ||
| 624 | mov -8($ap,$j,8),%rax | ||
| 625 | adc \$0,%rdx | ||
| 626 | add $A[0],$N[0] | ||
| 627 | adc \$0,%rdx | ||
| 628 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
| 629 | mov %rdx,$N[1] | ||
| 630 | |||
| 631 | mulq $m0 # ap[j]*bp[i] | ||
| 632 | add %rax,$A[1] | ||
| 633 | mov -8($np,$j,8),%rax | ||
| 634 | adc \$0,%rdx | ||
| 635 | add -8(%rsp,$j,8),$A[1] | ||
| 636 | adc \$0,%rdx | ||
| 637 | mov %rdx,$A[0] | ||
| 638 | |||
| 639 | mulq $m1 # np[j]*m1 | ||
| 640 | add %rax,$N[1] | ||
| 641 | mov ($ap,$j,8),%rax | ||
| 642 | adc \$0,%rdx | ||
| 643 | add $A[1],$N[1] | ||
| 644 | adc \$0,%rdx | ||
| 645 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 646 | mov %rdx,$N[0] | ||
| 647 | |||
| 648 | mulq $m0 # ap[j]*bp[i] | ||
| 649 | add %rax,$A[0] | ||
| 650 | mov ($np,$j,8),%rax | ||
| 651 | adc \$0,%rdx | ||
| 652 | add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
| 653 | adc \$0,%rdx | ||
| 654 | mov %rdx,$A[1] | ||
| 655 | |||
| 656 | mulq $m1 # np[j]*m1 | ||
| 657 | add %rax,$N[0] | ||
| 658 | mov 8($ap,$j,8),%rax | ||
| 659 | adc \$0,%rdx | ||
| 660 | add $A[0],$N[0] | ||
| 661 | adc \$0,%rdx | ||
| 662 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 663 | mov %rdx,$N[1] | ||
| 664 | |||
| 665 | mulq $m0 # ap[j]*bp[i] | ||
| 666 | add %rax,$A[1] | ||
| 667 | mov 8($np,$j,8),%rax | ||
| 668 | adc \$0,%rdx | ||
| 669 | add 8(%rsp,$j,8),$A[1] | ||
| 670 | adc \$0,%rdx | ||
| 671 | lea 4($j),$j # j++ | ||
| 672 | mov %rdx,$A[0] | ||
| 673 | |||
| 674 | mulq $m1 # np[j]*m1 | ||
| 675 | add %rax,$N[1] | ||
| 676 | mov -16($ap,$j,8),%rax | ||
| 677 | adc \$0,%rdx | ||
| 678 | add $A[1],$N[1] | ||
| 679 | adc \$0,%rdx | ||
| 680 | mov $N[0],-40(%rsp,$j,8) # tp[j-1] | ||
| 681 | mov %rdx,$N[0] | ||
| 682 | cmp $num,$j | ||
| 683 | jl .Linner4x | ||
| 684 | |||
| 685 | mulq $m0 # ap[j]*bp[i] | ||
| 686 | add %rax,$A[0] | ||
| 687 | mov -16($np,$j,8),%rax | ||
| 688 | adc \$0,%rdx | ||
| 689 | add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
| 690 | adc \$0,%rdx | ||
| 691 | mov %rdx,$A[1] | ||
| 692 | |||
| 693 | mulq $m1 # np[j]*m1 | ||
| 694 | add %rax,$N[0] | ||
| 695 | mov -8($ap,$j,8),%rax | ||
| 696 | adc \$0,%rdx | ||
| 697 | add $A[0],$N[0] | ||
| 698 | adc \$0,%rdx | ||
| 699 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
| 700 | mov %rdx,$N[1] | ||
| 701 | |||
| 702 | mulq $m0 # ap[j]*bp[i] | ||
| 703 | add %rax,$A[1] | ||
| 704 | mov -8($np,$j,8),%rax | ||
| 705 | adc \$0,%rdx | ||
| 706 | add -8(%rsp,$j,8),$A[1] | ||
| 707 | adc \$0,%rdx | ||
| 708 | lea 1($i),$i # i++ | ||
| 709 | mov %rdx,$A[0] | ||
| 710 | |||
| 711 | mulq $m1 # np[j]*m1 | ||
| 712 | add %rax,$N[1] | ||
| 713 | mov ($ap),%rax # ap[0] | ||
| 714 | adc \$0,%rdx | ||
| 715 | add $A[1],$N[1] | ||
| 716 | adc \$0,%rdx | ||
| 717 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 718 | mov %rdx,$N[0] | ||
| 719 | |||
| 720 | movq %xmm0,$m0 # bp[i+1] | ||
| 721 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 722 | |||
| 723 | xor $N[1],$N[1] | ||
| 724 | add $A[0],$N[0] | ||
| 725 | adc \$0,$N[1] | ||
| 726 | add (%rsp,$num,8),$N[0] # pull upmost overflow bit | ||
| 727 | adc \$0,$N[1] | ||
| 728 | mov $N[0],-8(%rsp,$j,8) | ||
| 729 | mov $N[1],(%rsp,$j,8) # store upmost overflow bit | ||
| 730 | |||
| 731 | cmp $num,$i | ||
| 732 | jl .Louter4x | ||
| 733 | ___ | ||
| 734 | { | ||
| 735 | my @ri=("%rax","%rdx",$m0,$m1); | ||
| 736 | $code.=<<___; | ||
| 737 | mov 16(%rsp,$num,8),$rp # restore $rp | ||
| 738 | mov 0(%rsp),@ri[0] # tp[0] | ||
| 739 | pxor %xmm0,%xmm0 | ||
| 740 | mov 8(%rsp),@ri[1] # tp[1] | ||
| 741 | shr \$2,$num # num/=4 | ||
| 742 | lea (%rsp),$ap # borrow ap for tp | ||
| 743 | xor $i,$i # i=0 and clear CF! | ||
| 744 | |||
| 745 | sub 0($np),@ri[0] | ||
| 746 | mov 16($ap),@ri[2] # tp[2] | ||
| 747 | mov 24($ap),@ri[3] # tp[3] | ||
| 748 | sbb 8($np),@ri[1] | ||
| 749 | lea -1($num),$j # j=num/4-1 | ||
| 750 | jmp .Lsub4x | ||
| 751 | .align 16 | ||
| 752 | .Lsub4x: | ||
| 753 | mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 754 | mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 755 | sbb 16($np,$i,8),@ri[2] | ||
| 756 | mov 32($ap,$i,8),@ri[0] # tp[i+1] | ||
| 757 | mov 40($ap,$i,8),@ri[1] | ||
| 758 | sbb 24($np,$i,8),@ri[3] | ||
| 759 | mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 760 | mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 761 | sbb 32($np,$i,8),@ri[0] | ||
| 762 | mov 48($ap,$i,8),@ri[2] | ||
| 763 | mov 56($ap,$i,8),@ri[3] | ||
| 764 | sbb 40($np,$i,8),@ri[1] | ||
| 765 | lea 4($i),$i # i++ | ||
| 766 | dec $j # doesnn't affect CF! | ||
| 767 | jnz .Lsub4x | ||
| 768 | |||
| 769 | mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 770 | mov 32($ap,$i,8),@ri[0] # load overflow bit | ||
| 771 | sbb 16($np,$i,8),@ri[2] | ||
| 772 | mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 773 | sbb 24($np,$i,8),@ri[3] | ||
| 774 | mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 775 | |||
| 776 | sbb \$0,@ri[0] # handle upmost overflow bit | ||
| 777 | mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 778 | xor $i,$i # i=0 | ||
| 779 | and @ri[0],$ap | ||
| 780 | not @ri[0] | ||
| 781 | mov $rp,$np | ||
| 782 | and @ri[0],$np | ||
| 783 | lea -1($num),$j | ||
| 784 | or $np,$ap # ap=borrow?tp:rp | ||
| 785 | |||
| 786 | movdqu ($ap),%xmm1 | ||
| 787 | movdqa %xmm0,(%rsp) | ||
| 788 | movdqu %xmm1,($rp) | ||
| 789 | jmp .Lcopy4x | ||
| 790 | .align 16 | ||
| 791 | .Lcopy4x: # copy or in-place refresh | ||
| 792 | movdqu 16($ap,$i),%xmm2 | ||
| 793 | movdqu 32($ap,$i),%xmm1 | ||
| 794 | movdqa %xmm0,16(%rsp,$i) | ||
| 795 | movdqu %xmm2,16($rp,$i) | ||
| 796 | movdqa %xmm0,32(%rsp,$i) | ||
| 797 | movdqu %xmm1,32($rp,$i) | ||
| 798 | lea 32($i),$i | ||
| 799 | dec $j | ||
| 800 | jnz .Lcopy4x | ||
| 801 | |||
| 802 | shl \$2,$num | ||
| 803 | movdqu 16($ap,$i),%xmm2 | ||
| 804 | movdqa %xmm0,16(%rsp,$i) | ||
| 805 | movdqu %xmm2,16($rp,$i) | ||
| 806 | ___ | ||
| 807 | } | ||
| 808 | $code.=<<___; | ||
| 809 | mov 8(%rsp,$num,8),%rsi # restore %rsp | ||
| 810 | mov \$1,%rax | ||
| 811 | ___ | ||
| 812 | $code.=<<___ if ($win64); | ||
| 813 | movaps (%rsi),%xmm6 | ||
| 814 | movaps 0x10(%rsi),%xmm7 | ||
| 815 | lea 0x28(%rsi),%rsi | ||
| 816 | ___ | ||
| 817 | $code.=<<___; | ||
| 818 | mov (%rsi),%r15 | ||
| 819 | mov 8(%rsi),%r14 | ||
| 820 | mov 16(%rsi),%r13 | ||
| 821 | mov 24(%rsi),%r12 | ||
| 822 | mov 32(%rsi),%rbp | ||
| 823 | mov 40(%rsi),%rbx | ||
| 824 | lea 48(%rsi),%rsp | ||
| 825 | .Lmul4x_epilogue: | ||
| 826 | ret | ||
| 827 | .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 | ||
| 828 | ___ | ||
| 829 | }}} | ||
| 830 | |||
| 831 | { | ||
| 832 | my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order | ||
| 833 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | ||
| 834 | my $out=$inp; | ||
| 835 | my $STRIDE=2**5*8; | ||
| 836 | my $N=$STRIDE/4; | ||
| 837 | |||
| 838 | $code.=<<___; | ||
| 839 | .globl bn_scatter5 | ||
| 840 | .type bn_scatter5,\@abi-omnipotent | ||
| 841 | .align 16 | ||
| 842 | bn_scatter5: | ||
| 843 | cmp \$0, $num | ||
| 844 | jz .Lscatter_epilogue | ||
| 845 | lea ($tbl,$idx,8),$tbl | ||
| 846 | .Lscatter: | ||
| 847 | mov ($inp),%rax | ||
| 848 | lea 8($inp),$inp | ||
| 849 | mov %rax,($tbl) | ||
| 850 | lea 32*8($tbl),$tbl | ||
| 851 | sub \$1,$num | ||
| 852 | jnz .Lscatter | ||
| 853 | .Lscatter_epilogue: | ||
| 854 | ret | ||
| 855 | .size bn_scatter5,.-bn_scatter5 | ||
| 856 | |||
| 857 | .globl bn_gather5 | ||
| 858 | .type bn_gather5,\@abi-omnipotent | ||
| 859 | .align 16 | ||
| 860 | bn_gather5: | ||
| 861 | ___ | ||
| 862 | $code.=<<___ if ($win64); | ||
| 863 | .LSEH_begin_bn_gather5: | ||
| 864 | # I can't trust assembler to use specific encoding:-( | ||
| 865 | .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp | ||
| 866 | .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) | ||
| 867 | .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) | ||
| 868 | ___ | ||
| 869 | $code.=<<___; | ||
| 870 | mov $idx,%r11 | ||
| 871 | shr \$`log($N/8)/log(2)`,$idx | ||
| 872 | and \$`$N/8-1`,%r11 | ||
| 873 | not $idx | ||
| 874 | lea .Lmagic_masks(%rip),%rax | ||
| 875 | and \$`2**5/($N/8)-1`,$idx # 5 is "window size" | ||
| 876 | lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line | ||
| 877 | movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which | ||
| 878 | movq 8(%rax,$idx,8),%xmm5 # cache line contains element | ||
| 879 | movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument | ||
| 880 | movq 24(%rax,$idx,8),%xmm7 | ||
| 881 | jmp .Lgather | ||
| 882 | .align 16 | ||
| 883 | .Lgather: | ||
| 884 | movq `0*$STRIDE/4-96`($tbl),%xmm0 | ||
| 885 | movq `1*$STRIDE/4-96`($tbl),%xmm1 | ||
| 886 | pand %xmm4,%xmm0 | ||
| 887 | movq `2*$STRIDE/4-96`($tbl),%xmm2 | ||
| 888 | pand %xmm5,%xmm1 | ||
| 889 | movq `3*$STRIDE/4-96`($tbl),%xmm3 | ||
| 890 | pand %xmm6,%xmm2 | ||
| 891 | por %xmm1,%xmm0 | ||
| 892 | pand %xmm7,%xmm3 | ||
| 893 | por %xmm2,%xmm0 | ||
| 894 | lea $STRIDE($tbl),$tbl | ||
| 895 | por %xmm3,%xmm0 | ||
| 896 | |||
| 897 | movq %xmm0,($out) # m0=bp[0] | ||
| 898 | lea 8($out),$out | ||
| 899 | sub \$1,$num | ||
| 900 | jnz .Lgather | ||
| 901 | ___ | ||
| 902 | $code.=<<___ if ($win64); | ||
| 903 | movaps %xmm6,(%rsp) | ||
| 904 | movaps %xmm7,0x10(%rsp) | ||
| 905 | lea 0x28(%rsp),%rsp | ||
| 906 | ___ | ||
| 907 | $code.=<<___; | ||
| 908 | ret | ||
| 909 | .LSEH_end_bn_gather5: | ||
| 910 | .size bn_gather5,.-bn_gather5 | ||
| 911 | ___ | ||
| 912 | } | ||
| 913 | $code.=<<___; | ||
| 914 | .align 64 | ||
| 915 | .Lmagic_masks: | ||
| 916 | .long 0,0, 0,0, 0,0, -1,-1 | ||
| 917 | .long 0,0, 0,0, 0,0, 0,0 | ||
| 918 | .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 919 | ___ | ||
| 920 | |||
| 921 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 922 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 923 | if ($win64) { | ||
| 924 | $rec="%rcx"; | ||
| 925 | $frame="%rdx"; | ||
| 926 | $context="%r8"; | ||
| 927 | $disp="%r9"; | ||
| 928 | |||
| 929 | $code.=<<___; | ||
| 930 | .extern __imp_RtlVirtualUnwind | ||
| 931 | .type mul_handler,\@abi-omnipotent | ||
| 932 | .align 16 | ||
| 933 | mul_handler: | ||
| 934 | push %rsi | ||
| 935 | push %rdi | ||
| 936 | push %rbx | ||
| 937 | push %rbp | ||
| 938 | push %r12 | ||
| 939 | push %r13 | ||
| 940 | push %r14 | ||
| 941 | push %r15 | ||
| 942 | pushfq | ||
| 943 | sub \$64,%rsp | ||
| 944 | |||
| 945 | mov 120($context),%rax # pull context->Rax | ||
| 946 | mov 248($context),%rbx # pull context->Rip | ||
| 947 | |||
| 948 | mov 8($disp),%rsi # disp->ImageBase | ||
| 949 | mov 56($disp),%r11 # disp->HandlerData | ||
| 950 | |||
| 951 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 952 | lea (%rsi,%r10),%r10 # end of prologue label | ||
| 953 | cmp %r10,%rbx # context->Rip<end of prologue label | ||
| 954 | jb .Lcommon_seh_tail | ||
| 955 | |||
| 956 | lea `40+48`(%rax),%rax | ||
| 957 | |||
| 958 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 959 | lea (%rsi,%r10),%r10 # end of alloca label | ||
| 960 | cmp %r10,%rbx # context->Rip<end of alloca label | ||
| 961 | jb .Lcommon_seh_tail | ||
| 962 | |||
| 963 | mov 152($context),%rax # pull context->Rsp | ||
| 964 | |||
| 965 | mov 8(%r11),%r10d # HandlerData[2] | ||
| 966 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 967 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 968 | jae .Lcommon_seh_tail | ||
| 969 | |||
| 970 | mov 192($context),%r10 # pull $num | ||
| 971 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer | ||
| 972 | |||
| 973 | movaps (%rax),%xmm0 | ||
| 974 | movaps 16(%rax),%xmm1 | ||
| 975 | lea `40+48`(%rax),%rax | ||
| 976 | |||
| 977 | mov -8(%rax),%rbx | ||
| 978 | mov -16(%rax),%rbp | ||
| 979 | mov -24(%rax),%r12 | ||
| 980 | mov -32(%rax),%r13 | ||
| 981 | mov -40(%rax),%r14 | ||
| 982 | mov -48(%rax),%r15 | ||
| 983 | mov %rbx,144($context) # restore context->Rbx | ||
| 984 | mov %rbp,160($context) # restore context->Rbp | ||
| 985 | mov %r12,216($context) # restore context->R12 | ||
| 986 | mov %r13,224($context) # restore context->R13 | ||
| 987 | mov %r14,232($context) # restore context->R14 | ||
| 988 | mov %r15,240($context) # restore context->R15 | ||
| 989 | movups %xmm0,512($context) # restore context->Xmm6 | ||
| 990 | movups %xmm1,528($context) # restore context->Xmm7 | ||
| 991 | |||
| 992 | .Lcommon_seh_tail: | ||
| 993 | mov 8(%rax),%rdi | ||
| 994 | mov 16(%rax),%rsi | ||
| 995 | mov %rax,152($context) # restore context->Rsp | ||
| 996 | mov %rsi,168($context) # restore context->Rsi | ||
| 997 | mov %rdi,176($context) # restore context->Rdi | ||
| 998 | |||
| 999 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 1000 | mov $context,%rsi # context | ||
| 1001 | mov \$154,%ecx # sizeof(CONTEXT) | ||
| 1002 | .long 0xa548f3fc # cld; rep movsq | ||
| 1003 | |||
| 1004 | mov $disp,%rsi | ||
| 1005 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 1006 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 1007 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 1008 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 1009 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 1010 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 1011 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 1012 | mov %r10,32(%rsp) # arg5 | ||
| 1013 | mov %r11,40(%rsp) # arg6 | ||
| 1014 | mov %r12,48(%rsp) # arg7 | ||
| 1015 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 1016 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 1017 | |||
| 1018 | mov \$1,%eax # ExceptionContinueSearch | ||
| 1019 | add \$64,%rsp | ||
| 1020 | popfq | ||
| 1021 | pop %r15 | ||
| 1022 | pop %r14 | ||
| 1023 | pop %r13 | ||
| 1024 | pop %r12 | ||
| 1025 | pop %rbp | ||
| 1026 | pop %rbx | ||
| 1027 | pop %rdi | ||
| 1028 | pop %rsi | ||
| 1029 | ret | ||
| 1030 | .size mul_handler,.-mul_handler | ||
| 1031 | |||
| 1032 | .section .pdata | ||
| 1033 | .align 4 | ||
| 1034 | .rva .LSEH_begin_bn_mul_mont_gather5 | ||
| 1035 | .rva .LSEH_end_bn_mul_mont_gather5 | ||
| 1036 | .rva .LSEH_info_bn_mul_mont_gather5 | ||
| 1037 | |||
| 1038 | .rva .LSEH_begin_bn_mul4x_mont_gather5 | ||
| 1039 | .rva .LSEH_end_bn_mul4x_mont_gather5 | ||
| 1040 | .rva .LSEH_info_bn_mul4x_mont_gather5 | ||
| 1041 | |||
| 1042 | .rva .LSEH_begin_bn_gather5 | ||
| 1043 | .rva .LSEH_end_bn_gather5 | ||
| 1044 | .rva .LSEH_info_bn_gather5 | ||
| 1045 | |||
| 1046 | .section .xdata | ||
| 1047 | .align 8 | ||
| 1048 | .LSEH_info_bn_mul_mont_gather5: | ||
| 1049 | .byte 9,0,0,0 | ||
| 1050 | .rva mul_handler | ||
| 1051 | .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[] | ||
| 1052 | .align 8 | ||
| 1053 | .LSEH_info_bn_mul4x_mont_gather5: | ||
| 1054 | .byte 9,0,0,0 | ||
| 1055 | .rva mul_handler | ||
| 1056 | .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] | ||
| 1057 | .align 8 | ||
| 1058 | .LSEH_info_bn_gather5: | ||
| 1059 | .byte 0x01,0x0d,0x05,0x00 | ||
| 1060 | .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 | ||
| 1061 | .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 | ||
| 1062 | .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 | ||
| 1063 | .align 8 | ||
| 1064 | ___ | ||
| 1065 | } | ||
| 1066 | |||
| 1067 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 1068 | |||
| 1069 | print $code; | ||
| 1070 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86.pl b/src/lib/libcrypto/camellia/asm/cmll-x86.pl index 027302ac86..c314d62312 100644 --- a/src/lib/libcrypto/camellia/asm/cmll-x86.pl +++ b/src/lib/libcrypto/camellia/asm/cmll-x86.pl | |||
| @@ -723,11 +723,11 @@ my $bias=int(@T[0])?shift(@T):0; | |||
| 723 | &function_end("Camellia_Ekeygen"); | 723 | &function_end("Camellia_Ekeygen"); |
| 724 | 724 | ||
| 725 | if ($OPENSSL) { | 725 | if ($OPENSSL) { |
| 726 | # int Camellia_set_key ( | 726 | # int private_Camellia_set_key ( |
| 727 | # const unsigned char *userKey, | 727 | # const unsigned char *userKey, |
| 728 | # int bits, | 728 | # int bits, |
| 729 | # CAMELLIA_KEY *key) | 729 | # CAMELLIA_KEY *key) |
| 730 | &function_begin_B("Camellia_set_key"); | 730 | &function_begin_B("private_Camellia_set_key"); |
| 731 | &push ("ebx"); | 731 | &push ("ebx"); |
| 732 | &mov ("ecx",&wparam(0)); # pull arguments | 732 | &mov ("ecx",&wparam(0)); # pull arguments |
| 733 | &mov ("ebx",&wparam(1)); | 733 | &mov ("ebx",&wparam(1)); |
| @@ -760,7 +760,7 @@ if ($OPENSSL) { | |||
| 760 | &set_label("done",4); | 760 | &set_label("done",4); |
| 761 | &pop ("ebx"); | 761 | &pop ("ebx"); |
| 762 | &ret (); | 762 | &ret (); |
| 763 | &function_end_B("Camellia_set_key"); | 763 | &function_end_B("private_Camellia_set_key"); |
| 764 | } | 764 | } |
| 765 | 765 | ||
| 766 | @SBOX=( | 766 | @SBOX=( |
diff --git a/src/lib/libcrypto/camellia/camellia.h b/src/lib/libcrypto/camellia/camellia.h index cf0457dd97..67911e0adf 100644 --- a/src/lib/libcrypto/camellia/camellia.h +++ b/src/lib/libcrypto/camellia/camellia.h | |||
| @@ -88,6 +88,10 @@ struct camellia_key_st | |||
| 88 | }; | 88 | }; |
| 89 | typedef struct camellia_key_st CAMELLIA_KEY; | 89 | typedef struct camellia_key_st CAMELLIA_KEY; |
| 90 | 90 | ||
| 91 | #ifdef OPENSSL_FIPS | ||
| 92 | int private_Camellia_set_key(const unsigned char *userKey, const int bits, | ||
| 93 | CAMELLIA_KEY *key); | ||
| 94 | #endif | ||
| 91 | int Camellia_set_key(const unsigned char *userKey, const int bits, | 95 | int Camellia_set_key(const unsigned char *userKey, const int bits, |
| 92 | CAMELLIA_KEY *key); | 96 | CAMELLIA_KEY *key); |
| 93 | 97 | ||
diff --git a/src/lib/libcrypto/camellia/cmll_locl.h b/src/lib/libcrypto/camellia/cmll_locl.h index 4a4d880d16..246b6ce1d8 100644 --- a/src/lib/libcrypto/camellia/cmll_locl.h +++ b/src/lib/libcrypto/camellia/cmll_locl.h | |||
| @@ -71,7 +71,8 @@ | |||
| 71 | typedef unsigned int u32; | 71 | typedef unsigned int u32; |
| 72 | typedef unsigned char u8; | 72 | typedef unsigned char u8; |
| 73 | 73 | ||
| 74 | int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey, KEY_TABLE_TYPE keyTable); | 74 | int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey, |
| 75 | KEY_TABLE_TYPE keyTable); | ||
| 75 | void Camellia_EncryptBlock_Rounds(int grandRounds, const u8 plaintext[], | 76 | void Camellia_EncryptBlock_Rounds(int grandRounds, const u8 plaintext[], |
| 76 | const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); | 77 | const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); |
| 77 | void Camellia_DecryptBlock_Rounds(int grandRounds, const u8 ciphertext[], | 78 | void Camellia_DecryptBlock_Rounds(int grandRounds, const u8 ciphertext[], |
| @@ -80,4 +81,6 @@ void Camellia_EncryptBlock(int keyBitLength, const u8 plaintext[], | |||
| 80 | const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); | 81 | const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); |
| 81 | void Camellia_DecryptBlock(int keyBitLength, const u8 ciphertext[], | 82 | void Camellia_DecryptBlock(int keyBitLength, const u8 ciphertext[], |
| 82 | const KEY_TABLE_TYPE keyTable, u8 plaintext[]); | 83 | const KEY_TABLE_TYPE keyTable, u8 plaintext[]); |
| 84 | int private_Camellia_set_key(const unsigned char *userKey, const int bits, | ||
| 85 | CAMELLIA_KEY *key); | ||
| 83 | #endif /* #ifndef HEADER_CAMELLIA_LOCL_H */ | 86 | #endif /* #ifndef HEADER_CAMELLIA_LOCL_H */ |
diff --git a/src/lib/libcrypto/camellia/cmll_misc.c b/src/lib/libcrypto/camellia/cmll_misc.c index f44689124b..f44d48564c 100644 --- a/src/lib/libcrypto/camellia/cmll_misc.c +++ b/src/lib/libcrypto/camellia/cmll_misc.c | |||
| @@ -50,12 +50,13 @@ | |||
| 50 | */ | 50 | */ |
| 51 | 51 | ||
| 52 | #include <openssl/opensslv.h> | 52 | #include <openssl/opensslv.h> |
| 53 | #include <openssl/crypto.h> | ||
| 53 | #include <openssl/camellia.h> | 54 | #include <openssl/camellia.h> |
| 54 | #include "cmll_locl.h" | 55 | #include "cmll_locl.h" |
| 55 | 56 | ||
| 56 | const char CAMELLIA_version[]="CAMELLIA" OPENSSL_VERSION_PTEXT; | 57 | const char CAMELLIA_version[]="CAMELLIA" OPENSSL_VERSION_PTEXT; |
| 57 | 58 | ||
| 58 | int Camellia_set_key(const unsigned char *userKey, const int bits, | 59 | int private_Camellia_set_key(const unsigned char *userKey, const int bits, |
| 59 | CAMELLIA_KEY *key) | 60 | CAMELLIA_KEY *key) |
| 60 | { | 61 | { |
| 61 | if(!userKey || !key) | 62 | if(!userKey || !key) |
diff --git a/src/lib/libcrypto/cmac/cm_ameth.c b/src/lib/libcrypto/cmac/cm_ameth.c new file mode 100644 index 0000000000..0b8e5670b0 --- /dev/null +++ b/src/lib/libcrypto/cmac/cm_ameth.c | |||
| @@ -0,0 +1,97 @@ | |||
| 1 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
| 2 | * project 2010. | ||
| 3 | */ | ||
| 4 | /* ==================================================================== | ||
| 5 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
| 6 | * | ||
| 7 | * Redistribution and use in source and binary forms, with or without | ||
| 8 | * modification, are permitted provided that the following conditions | ||
| 9 | * are met: | ||
| 10 | * | ||
| 11 | * 1. Redistributions of source code must retain the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer. | ||
| 13 | * | ||
| 14 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 15 | * notice, this list of conditions and the following disclaimer in | ||
| 16 | * the documentation and/or other materials provided with the | ||
| 17 | * distribution. | ||
| 18 | * | ||
| 19 | * 3. All advertising materials mentioning features or use of this | ||
| 20 | * software must display the following acknowledgment: | ||
| 21 | * "This product includes software developed by the OpenSSL Project | ||
| 22 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 23 | * | ||
| 24 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 25 | * endorse or promote products derived from this software without | ||
| 26 | * prior written permission. For written permission, please contact | ||
| 27 | * licensing@OpenSSL.org. | ||
| 28 | * | ||
| 29 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 30 | * nor may "OpenSSL" appear in their names without prior written | ||
| 31 | * permission of the OpenSSL Project. | ||
| 32 | * | ||
| 33 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 34 | * acknowledgment: | ||
| 35 | * "This product includes software developed by the OpenSSL Project | ||
| 36 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 37 | * | ||
| 38 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 39 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 40 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 41 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 42 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 43 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 44 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 45 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 46 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 47 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 48 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 49 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 50 | * ==================================================================== | ||
| 51 | */ | ||
| 52 | |||
| 53 | #include <stdio.h> | ||
| 54 | #include "cryptlib.h" | ||
| 55 | #include <openssl/evp.h> | ||
| 56 | #include <openssl/cmac.h> | ||
| 57 | #include "asn1_locl.h" | ||
| 58 | |||
| 59 | /* CMAC "ASN1" method. This is just here to indicate the | ||
| 60 | * maximum CMAC output length and to free up a CMAC | ||
| 61 | * key. | ||
| 62 | */ | ||
| 63 | |||
/* pkey_size callback: report the maximum CMAC output length,
 * which is at most one cipher block (see file comment above). */
static int cmac_size(const EVP_PKEY *pkey)
	{
	return EVP_MAX_BLOCK_LENGTH;
	}
| 68 | |||
| 69 | static void cmac_key_free(EVP_PKEY *pkey) | ||
| 70 | { | ||
| 71 | CMAC_CTX *cmctx = (CMAC_CTX *)pkey->pkey.ptr; | ||
| 72 | if (cmctx) | ||
| 73 | CMAC_CTX_free(cmctx); | ||
| 74 | } | ||
| 75 | |||
/* CMAC "ASN1" method table. CMAC keys have no encoded form, so nearly
 * every slot is unused: only the size callback (maximum MAC length)
 * and the key-free callback are provided.
 * NOTE(review): slot positions follow EVP_PKEY_ASN1_METHOD in
 * asn1_locl.h — confirm the zeroed slots against that header. */
const EVP_PKEY_ASN1_METHOD cmac_asn1_meth = 
	{
	EVP_PKEY_CMAC,
	EVP_PKEY_CMAC,
	0,

	"CMAC",
	"OpenSSL CMAC method",

	0,0,0,0,

	0,0,0,

	cmac_size,	/* maximum MAC output length */
	0,
	0,0,0,0,0,0,0,

	cmac_key_free,	/* frees the stored CMAC_CTX */
	0,
	0,0
	};
| 97 | |||
diff --git a/src/lib/libcrypto/cmac/cm_pmeth.c b/src/lib/libcrypto/cmac/cm_pmeth.c new file mode 100644 index 0000000000..072228ec7f --- /dev/null +++ b/src/lib/libcrypto/cmac/cm_pmeth.c | |||
| @@ -0,0 +1,224 @@ | |||
| 1 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
| 2 | * project 2010. | ||
| 3 | */ | ||
| 4 | /* ==================================================================== | ||
| 5 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
| 6 | * | ||
| 7 | * Redistribution and use in source and binary forms, with or without | ||
| 8 | * modification, are permitted provided that the following conditions | ||
| 9 | * are met: | ||
| 10 | * | ||
| 11 | * 1. Redistributions of source code must retain the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer. | ||
| 13 | * | ||
| 14 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 15 | * notice, this list of conditions and the following disclaimer in | ||
| 16 | * the documentation and/or other materials provided with the | ||
| 17 | * distribution. | ||
| 18 | * | ||
| 19 | * 3. All advertising materials mentioning features or use of this | ||
| 20 | * software must display the following acknowledgment: | ||
| 21 | * "This product includes software developed by the OpenSSL Project | ||
| 22 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 23 | * | ||
| 24 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 25 | * endorse or promote products derived from this software without | ||
| 26 | * prior written permission. For written permission, please contact | ||
| 27 | * licensing@OpenSSL.org. | ||
| 28 | * | ||
| 29 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 30 | * nor may "OpenSSL" appear in their names without prior written | ||
| 31 | * permission of the OpenSSL Project. | ||
| 32 | * | ||
| 33 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 34 | * acknowledgment: | ||
| 35 | * "This product includes software developed by the OpenSSL Project | ||
| 36 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 37 | * | ||
| 38 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 39 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 40 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 41 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 42 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 43 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 44 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 45 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 46 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 47 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 48 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 49 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 50 | * ==================================================================== | ||
| 51 | */ | ||
| 52 | |||
| 53 | #include <stdio.h> | ||
| 54 | #include "cryptlib.h" | ||
| 55 | #include <openssl/x509.h> | ||
| 56 | #include <openssl/x509v3.h> | ||
| 57 | #include <openssl/evp.h> | ||
| 58 | #include <openssl/cmac.h> | ||
| 59 | #include "evp_locl.h" | ||
| 60 | |||
| 61 | /* The context structure and "key" is simply a CMAC_CTX */ | ||
| 62 | |||
| 63 | static int pkey_cmac_init(EVP_PKEY_CTX *ctx) | ||
| 64 | { | ||
| 65 | ctx->data = CMAC_CTX_new(); | ||
| 66 | if (!ctx->data) | ||
| 67 | return 0; | ||
| 68 | ctx->keygen_info_count = 0; | ||
| 69 | return 1; | ||
| 70 | } | ||
| 71 | |||
| 72 | static int pkey_cmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) | ||
| 73 | { | ||
| 74 | if (!pkey_cmac_init(dst)) | ||
| 75 | return 0; | ||
| 76 | if (!CMAC_CTX_copy(dst->data, src->data)) | ||
| 77 | return 0; | ||
| 78 | return 1; | ||
| 79 | } | ||
| 80 | |||
/* Free the CMAC_CTX attached to this EVP_PKEY_CTX by pkey_cmac_init(). */
static void pkey_cmac_cleanup(EVP_PKEY_CTX *ctx)
	{
	CMAC_CTX_free(ctx->data);
	}
| 85 | |||
| 86 | static int pkey_cmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) | ||
| 87 | { | ||
| 88 | CMAC_CTX *cmkey = CMAC_CTX_new(); | ||
| 89 | CMAC_CTX *cmctx = ctx->data; | ||
| 90 | if (!cmkey) | ||
| 91 | return 0; | ||
| 92 | if (!CMAC_CTX_copy(cmkey, cmctx)) | ||
| 93 | { | ||
| 94 | CMAC_CTX_free(cmkey); | ||
| 95 | return 0; | ||
| 96 | } | ||
| 97 | EVP_PKEY_assign(pkey, EVP_PKEY_CMAC, cmkey); | ||
| 98 | |||
| 99 | return 1; | ||
| 100 | } | ||
| 101 | |||
| 102 | static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) | ||
| 103 | { | ||
| 104 | if (!CMAC_Update(ctx->pctx->data, data, count)) | ||
| 105 | return 0; | ||
| 106 | return 1; | ||
| 107 | } | ||
| 108 | |||
| 109 | static int cmac_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx) | ||
| 110 | { | ||
| 111 | EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT); | ||
| 112 | mctx->update = int_update; | ||
| 113 | return 1; | ||
| 114 | } | ||
| 115 | |||
| 116 | static int cmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | ||
| 117 | EVP_MD_CTX *mctx) | ||
| 118 | { | ||
| 119 | return CMAC_Final(ctx->data, sig, siglen); | ||
| 120 | } | ||
| 121 | |||
/* Ctrl dispatcher for the CMAC pkey method.
 * Returns 1 on success, 0 on error, -2 for an unsupported ctrl. */
static int pkey_cmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
	{
	CMAC_CTX *cmctx = ctx->data;
	switch (type)
		{

		/* p2 is the raw key, p1 its length in bytes */
		case EVP_PKEY_CTRL_SET_MAC_KEY:
		if (!p2 || p1 < 0)
			return 0;
		if (!CMAC_Init(cmctx, p2, p1, NULL, NULL))
			return 0;
		break;

		/* p2 is the EVP_CIPHER to MAC with; the key comes separately */
		case EVP_PKEY_CTRL_CIPHER:
		if (!CMAC_Init(cmctx, NULL, 0, p2, ctx->engine))
			return 0;
		break;

		/* Pull keyed state from the EVP_PKEY (if set), then restart
		 * the MAC ready for new data (all-NULL CMAC_Init = restart) */
		case EVP_PKEY_CTRL_MD:
		if (ctx->pkey && !CMAC_CTX_copy(ctx->data,
					(CMAC_CTX *)ctx->pkey->pkey.ptr))
			return 0;
		if (!CMAC_Init(cmctx, NULL, 0, NULL, NULL))
			return 0;
		break;

		default:
		return -2;

		}
	return 1;
	}
| 154 | |||
| 155 | static int pkey_cmac_ctrl_str(EVP_PKEY_CTX *ctx, | ||
| 156 | const char *type, const char *value) | ||
| 157 | { | ||
| 158 | if (!value) | ||
| 159 | { | ||
| 160 | return 0; | ||
| 161 | } | ||
| 162 | if (!strcmp(type, "key")) | ||
| 163 | { | ||
| 164 | void *p = (void *)value; | ||
| 165 | return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, | ||
| 166 | strlen(p), p); | ||
| 167 | } | ||
| 168 | if (!strcmp(type, "cipher")) | ||
| 169 | { | ||
| 170 | const EVP_CIPHER *c; | ||
| 171 | c = EVP_get_cipherbyname(value); | ||
| 172 | if (!c) | ||
| 173 | return 0; | ||
| 174 | return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_CIPHER, -1, (void *)c); | ||
| 175 | } | ||
| 176 | if (!strcmp(type, "hexkey")) | ||
| 177 | { | ||
| 178 | unsigned char *key; | ||
| 179 | int r; | ||
| 180 | long keylen; | ||
| 181 | key = string_to_hex(value, &keylen); | ||
| 182 | if (!key) | ||
| 183 | return 0; | ||
| 184 | r = pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key); | ||
| 185 | OPENSSL_free(key); | ||
| 186 | return r; | ||
| 187 | } | ||
| 188 | return -2; | ||
| 189 | } | ||
| 190 | |||
/* CMAC EVP_PKEY method table. EVP_PKEY_FLAG_SIGCTX_CUSTOM routes all
 * signing data through the signctx callbacks below instead of a
 * conventional digest. Zeroed slots are operations CMAC does not
 * support (sign/verify/encrypt/decrypt/derive and their inits).
 * NOTE(review): slot order follows EVP_PKEY_METHOD in evp_locl.h —
 * confirm positions against that header. */
const EVP_PKEY_METHOD cmac_pkey_meth = 
	{
	EVP_PKEY_CMAC,
	EVP_PKEY_FLAG_SIGCTX_CUSTOM,
	pkey_cmac_init,
	pkey_cmac_copy,
	pkey_cmac_cleanup,

	0, 0,

	0,
	pkey_cmac_keygen,

	0, 0,

	0, 0,

	0,0,

	cmac_signctx_init,
	cmac_signctx,

	0,0,

	0,0,

	0,0,

	0,0,

	pkey_cmac_ctrl,
	pkey_cmac_ctrl_str

	};
diff --git a/src/lib/libcrypto/cmac/cmac.c b/src/lib/libcrypto/cmac/cmac.c new file mode 100644 index 0000000000..8b72b09681 --- /dev/null +++ b/src/lib/libcrypto/cmac/cmac.c | |||
| @@ -0,0 +1,308 @@ | |||
| 1 | /* crypto/cmac/cmac.c */ | ||
| 2 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
| 3 | * project. | ||
| 4 | */ | ||
| 5 | /* ==================================================================== | ||
| 6 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
| 7 | * | ||
| 8 | * Redistribution and use in source and binary forms, with or without | ||
| 9 | * modification, are permitted provided that the following conditions | ||
| 10 | * are met: | ||
| 11 | * | ||
| 12 | * 1. Redistributions of source code must retain the above copyright | ||
| 13 | * notice, this list of conditions and the following disclaimer. | ||
| 14 | * | ||
| 15 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 16 | * notice, this list of conditions and the following disclaimer in | ||
| 17 | * the documentation and/or other materials provided with the | ||
| 18 | * distribution. | ||
| 19 | * | ||
| 20 | * 3. All advertising materials mentioning features or use of this | ||
| 21 | * software must display the following acknowledgment: | ||
| 22 | * "This product includes software developed by the OpenSSL Project | ||
| 23 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 24 | * | ||
| 25 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 26 | * endorse or promote products derived from this software without | ||
| 27 | * prior written permission. For written permission, please contact | ||
| 28 | * licensing@OpenSSL.org. | ||
| 29 | * | ||
| 30 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 31 | * nor may "OpenSSL" appear in their names without prior written | ||
| 32 | * permission of the OpenSSL Project. | ||
| 33 | * | ||
| 34 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 35 | * acknowledgment: | ||
| 36 | * "This product includes software developed by the OpenSSL Project | ||
| 37 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 38 | * | ||
| 39 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 40 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 42 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 43 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 44 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 45 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 46 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 47 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 48 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 49 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 50 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 51 | * ==================================================================== | ||
| 52 | */ | ||
| 53 | |||
| 54 | #include <stdio.h> | ||
| 55 | #include <stdlib.h> | ||
| 56 | #include <string.h> | ||
| 57 | #include "cryptlib.h" | ||
| 58 | #include <openssl/cmac.h> | ||
| 59 | |||
| 60 | #ifdef OPENSSL_FIPS | ||
| 61 | #include <openssl/fips.h> | ||
| 62 | #endif | ||
| 63 | |||
struct CMAC_CTX_st
	{
	/* Cipher context used for the CBC-MAC chain */
	EVP_CIPHER_CTX cctx;
	/* Subkeys: k1 masks a complete final block, k2 a padded one
	 * (see CMAC_Final) */
	unsigned char k1[EVP_MAX_BLOCK_LENGTH];
	unsigned char k2[EVP_MAX_BLOCK_LENGTH];
	/* Temporary block: holds the last fully encrypted block, which
	 * also serves as the resume IV (see CMAC_resume) */
	unsigned char tbl[EVP_MAX_BLOCK_LENGTH];
	/* Last (possibly partial) input block, held back because the
	 * final block gets special treatment in CMAC_Final */
	unsigned char last_block[EVP_MAX_BLOCK_LENGTH];
	/* Number of bytes in last block: -1 means context not initialised */
	int nlast_block;
	};
| 78 | |||
| 79 | |||
/* Make temporary keys K1 and K2 */

/*
 * Derive a CMAC subkey per NIST SP 800-38B: shift the bl-byte input
 * block l left by one bit; if the input's most significant bit was
 * set, XOR the last byte with the field constant Rb (0x87 for 128-bit
 * blocks, 0x1b for 64-bit blocks).
 *
 * Rewritten to run in constant time: the original branched on
 * key-derived bits (l[i+1] & 0x80 and l[0] & 0x80), leaking subkey
 * material through timing. Here the carry and the Rb fix-up are
 * folded into data-independent mask arithmetic.
 */
static void make_kn(unsigned char *k1, unsigned char *l, int bl)
	{
	int i;
	unsigned char c = l[0], carry = c >> 7, cnext;

	/* Shift block left, pulling each byte's carry from its successor */
	for (i = 0; i < bl - 1; i++, c = cnext)
		k1[i] = (c << 1) | ((cnext = l[i + 1]) >> 7);

	/* (0 - carry) is 0x00 or 0xFF..: applies Rb only when MSB was set */
	k1[i] = (c << 1) ^ ((0 - carry) & (bl == 16 ? 0x87 : 0x1b));
	}
| 96 | |||
| 97 | CMAC_CTX *CMAC_CTX_new(void) | ||
| 98 | { | ||
| 99 | CMAC_CTX *ctx; | ||
| 100 | ctx = OPENSSL_malloc(sizeof(CMAC_CTX)); | ||
| 101 | if (!ctx) | ||
| 102 | return NULL; | ||
| 103 | EVP_CIPHER_CTX_init(&ctx->cctx); | ||
| 104 | ctx->nlast_block = -1; | ||
| 105 | return ctx; | ||
| 106 | } | ||
| 107 | |||
| 108 | void CMAC_CTX_cleanup(CMAC_CTX *ctx) | ||
| 109 | { | ||
| 110 | #ifdef OPENSSL_FIPS | ||
| 111 | if (FIPS_mode() && !ctx->cctx.engine) | ||
| 112 | { | ||
| 113 | FIPS_cmac_ctx_cleanup(ctx); | ||
| 114 | return; | ||
| 115 | } | ||
| 116 | #endif | ||
| 117 | EVP_CIPHER_CTX_cleanup(&ctx->cctx); | ||
| 118 | OPENSSL_cleanse(ctx->tbl, EVP_MAX_BLOCK_LENGTH); | ||
| 119 | OPENSSL_cleanse(ctx->k1, EVP_MAX_BLOCK_LENGTH); | ||
| 120 | OPENSSL_cleanse(ctx->k2, EVP_MAX_BLOCK_LENGTH); | ||
| 121 | OPENSSL_cleanse(ctx->last_block, EVP_MAX_BLOCK_LENGTH); | ||
| 122 | ctx->nlast_block = -1; | ||
| 123 | } | ||
| 124 | |||
/* Return the embedded cipher context. The pointer refers to a member
 * of ctx, so the caller must not free it and it is only valid for the
 * lifetime of ctx. */
EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx)
	{
	return &ctx->cctx;
	}
| 129 | |||
| 130 | void CMAC_CTX_free(CMAC_CTX *ctx) | ||
| 131 | { | ||
| 132 | CMAC_CTX_cleanup(ctx); | ||
| 133 | OPENSSL_free(ctx); | ||
| 134 | } | ||
| 135 | |||
/* Duplicate a keyed CMAC context: cipher state, subkeys and any
 * buffered partial block. Fails (returns 0) if "in" was never keyed. */
int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in)
	{
	int bl;
	if (in->nlast_block == -1)
		return 0;
	if (!EVP_CIPHER_CTX_copy(&out->cctx, &in->cctx))
		return 0;
	bl = EVP_CIPHER_CTX_block_size(&in->cctx);
	/* Only the first block-size bytes of each buffer are live */
	memcpy(out->k1, in->k1, bl);
	memcpy(out->k2, in->k2, bl);
	memcpy(out->tbl, in->tbl, bl);
	memcpy(out->last_block, in->last_block, bl);
	out->nlast_block = in->nlast_block;
	return 1;
	}
| 151 | |||
/* Initialise, restart or partially configure a CMAC context:
 * - key and cipher set: full initialisation, derives subkeys K1/K2;
 * - everything NULL/0: restart the MAC with the existing key;
 * - cipher only: select the cipher now, key in a later call.
 * Returns 1 on success, 0 on error. */
int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
			const EVP_CIPHER *cipher, ENGINE *impl)
	{
	static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH];
#ifdef OPENSSL_FIPS
	if (FIPS_mode())
		{
		/* If we have an ENGINE need to allow non FIPS */
		if ((impl || ctx->cctx.engine)
			&& !(ctx->cctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW))

			{
			EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS);
			return 0;
			}
		/* Other algorithm blocking will be done in FIPS_cmac_init,
		 * via FIPS_cipherinit().
		 */
		if (!impl && !ctx->cctx.engine)
			return FIPS_cmac_init(ctx, key, keylen, cipher, NULL);
		}
#endif
	/* All zeros means restart */
	if (!key && !cipher && !impl && keylen == 0)
		{
		/* Not initialised */
		if (ctx->nlast_block == -1)
			return 0;
		/* Reset CBC chain to the zero IV and clear the resume block */
		if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
			return 0;
		memset(ctx->tbl, 0, EVP_CIPHER_CTX_block_size(&ctx->cctx));
		ctx->nlast_block = 0;
		return 1;
		}
	/* Initialise context */
	if (cipher && !EVP_EncryptInit_ex(&ctx->cctx, cipher, impl, NULL, NULL))
		return 0;
	/* Non-NULL key means initialisation complete */
	if (key)
		{
		int bl;
		if (!EVP_CIPHER_CTX_cipher(&ctx->cctx))
			return 0;
		if (!EVP_CIPHER_CTX_set_key_length(&ctx->cctx, keylen))
			return 0;
		if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, key, zero_iv))
			return 0;
		bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
		/* tbl = E(K, 0^bl): the basis block for subkey derivation */
		if (!EVP_Cipher(&ctx->cctx, ctx->tbl, zero_iv, bl))
			return 0;
		make_kn(ctx->k1, ctx->tbl, bl);
		make_kn(ctx->k2, ctx->k1, bl);
		OPENSSL_cleanse(ctx->tbl, bl);
		/* Reset context again ready for first data block */
		if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
			return 0;
		/* Zero tbl so resume works */
		memset(ctx->tbl, 0, bl);
		ctx->nlast_block = 0;
		}
	return 1;
	}
| 214 | |||
/* Absorb dlen bytes of message data. The final (possibly partial)
 * block is always withheld in last_block, since CMAC_Final must XOR
 * it with K1 or K2 before the last encryption. Returns 1 on success,
 * 0 if the context is unkeyed or a cipher call fails. */
int CMAC_Update(CMAC_CTX *ctx, const void *in, size_t dlen)
	{
	const unsigned char *data = in;
	size_t bl;
#ifdef OPENSSL_FIPS
	if (FIPS_mode() && !ctx->cctx.engine)
		return FIPS_cmac_update(ctx, in, dlen);
#endif
	if (ctx->nlast_block == -1)
		return 0;
	if (dlen == 0)
		return 1;
	bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
	/* Copy into partial block if we need to */
	if (ctx->nlast_block > 0)
		{
		size_t nleft;
		nleft = bl - ctx->nlast_block;
		if (dlen < nleft)
			nleft = dlen;
		memcpy(ctx->last_block + ctx->nlast_block, data, nleft);
		dlen -= nleft;
		ctx->nlast_block += nleft;
		/* If no more to process return */
		if (dlen == 0)
			return 1;
		data += nleft;
		/* Else not final block so encrypt it */
		if (!EVP_Cipher(&ctx->cctx, ctx->tbl, ctx->last_block,bl))
			return 0;
		}
	/* Encrypt all but one of the complete blocks left */
	while(dlen > bl)
		{
		if (!EVP_Cipher(&ctx->cctx, ctx->tbl, data, bl))
			return 0;
		dlen -= bl;
		data += bl;
		}
	/* Copy any data left to last block buffer (1..bl bytes remain,
	 * because the loop above stops while dlen > bl) */
	memcpy(ctx->last_block, data, dlen);
	ctx->nlast_block = dlen;
	return 1;

	}
| 260 | |||
/* Produce the MAC. A complete final block is XORed with K1; an
 * incomplete one is padded with 0x80 00... and XORed with K2, then
 * the result is encrypted in place. *poutlen is always set to the
 * block size; passing out == NULL just queries the length.
 * Returns 1 on success. */
int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen)
	{
	int i, bl, lb;
#ifdef OPENSSL_FIPS
	if (FIPS_mode() && !ctx->cctx.engine)
		return FIPS_cmac_final(ctx, out, poutlen);
#endif
	if (ctx->nlast_block == -1)
		return 0;
	bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
	*poutlen = (size_t)bl;
	if (!out)
		return 1;
	lb = ctx->nlast_block;
	/* Is last block complete? */
	if (lb == bl)
		{
		for (i = 0; i < bl; i++)
			out[i] = ctx->last_block[i] ^ ctx->k1[i];
		}
	else
		{
		/* 10...0 padding, then mask with K2 */
		ctx->last_block[lb] = 0x80;
		if (bl - lb > 1)
			memset(ctx->last_block + lb + 1, 0, bl - lb - 1);
		for (i = 0; i < bl; i++)
			out[i] = ctx->last_block[i] ^ ctx->k2[i];
		}
	if (!EVP_Cipher(&ctx->cctx, out, out, bl))
		{
		/* Don't leak a partially computed MAC on failure */
		OPENSSL_cleanse(out, bl);
		return 0;
		}
	return 1;
	}
| 296 | |||
/* Allow the MAC computation to continue after CMAC_Final() has been
 * called. Returns 1 on success, 0 if the context was never keyed. */
int CMAC_resume(CMAC_CTX *ctx)
	{
	if (ctx->nlast_block == -1)
		return 0;
	/* The buffer "tbl" contains the last fully encrypted block
	 * which is the last IV (or all zeroes if no last encrypted block).
	 * The last block has not been modified since CMAC_Final().
	 * So reinitialising using the last encrypted block will allow
	 * CMAC to continue after calling CMAC_Final().
	 */
	return EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, ctx->tbl);
	}
diff --git a/src/lib/libcrypto/cmac/cmac.h b/src/lib/libcrypto/cmac/cmac.h new file mode 100644 index 0000000000..712e92dced --- /dev/null +++ b/src/lib/libcrypto/cmac/cmac.h | |||
| @@ -0,0 +1,82 @@ | |||
| 1 | /* crypto/cmac/cmac.h */ | ||
| 2 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
| 3 | * project. | ||
| 4 | */ | ||
| 5 | /* ==================================================================== | ||
| 6 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
| 7 | * | ||
| 8 | * Redistribution and use in source and binary forms, with or without | ||
| 9 | * modification, are permitted provided that the following conditions | ||
| 10 | * are met: | ||
| 11 | * | ||
| 12 | * 1. Redistributions of source code must retain the above copyright | ||
| 13 | * notice, this list of conditions and the following disclaimer. | ||
| 14 | * | ||
| 15 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 16 | * notice, this list of conditions and the following disclaimer in | ||
| 17 | * the documentation and/or other materials provided with the | ||
| 18 | * distribution. | ||
| 19 | * | ||
| 20 | * 3. All advertising materials mentioning features or use of this | ||
| 21 | * software must display the following acknowledgment: | ||
| 22 | * "This product includes software developed by the OpenSSL Project | ||
| 23 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 24 | * | ||
| 25 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 26 | * endorse or promote products derived from this software without | ||
| 27 | * prior written permission. For written permission, please contact | ||
| 28 | * licensing@OpenSSL.org. | ||
| 29 | * | ||
| 30 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 31 | * nor may "OpenSSL" appear in their names without prior written | ||
| 32 | * permission of the OpenSSL Project. | ||
| 33 | * | ||
| 34 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 35 | * acknowledgment: | ||
| 36 | * "This product includes software developed by the OpenSSL Project | ||
| 37 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 38 | * | ||
| 39 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 40 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 42 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 43 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 44 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 45 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 46 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 47 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 48 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 49 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 50 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 51 | * ==================================================================== | ||
| 52 | */ | ||
| 53 | |||
| 54 | |||
#ifndef HEADER_CMAC_H
#define HEADER_CMAC_H

#ifdef __cplusplus
extern "C" {
#endif

#include <openssl/evp.h>

/* Opaque */
typedef struct CMAC_CTX_st CMAC_CTX;

/* Allocate a new, unkeyed CMAC context (NULL on malloc failure). */
CMAC_CTX *CMAC_CTX_new(void);
/* Erase all key material and return the context to the unkeyed state. */
void CMAC_CTX_cleanup(CMAC_CTX *ctx);
/* Cleanse and release a context allocated by CMAC_CTX_new(). */
void CMAC_CTX_free(CMAC_CTX *ctx);
/* Return the embedded cipher context; ownership stays with ctx. */
EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx);
/* Duplicate an initialised context; fails if "in" was never keyed. */
int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in);

/* Key the context (key + cipher), or restart it (all args NULL/0). */
int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
			const EVP_CIPHER *cipher, ENGINE *impl);
/* Absorb dlen bytes of message data. */
int CMAC_Update(CMAC_CTX *ctx, const void *data, size_t dlen);
/* Write the MAC (block-size bytes) to out; NULL out queries length. */
int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen);
/* Continue MAC computation after CMAC_Final(). */
int CMAC_resume(CMAC_CTX *ctx);

#ifdef __cplusplus
}
#endif
#endif
diff --git a/src/lib/libcrypto/cms/cms.h b/src/lib/libcrypto/cms/cms.h index 09c45d0412..36994fa6a2 100644 --- a/src/lib/libcrypto/cms/cms.h +++ b/src/lib/libcrypto/cms/cms.h | |||
| @@ -111,6 +111,7 @@ DECLARE_ASN1_PRINT_FUNCTION(CMS_ContentInfo) | |||
| 111 | #define CMS_PARTIAL 0x4000 | 111 | #define CMS_PARTIAL 0x4000 |
| 112 | #define CMS_REUSE_DIGEST 0x8000 | 112 | #define CMS_REUSE_DIGEST 0x8000 |
| 113 | #define CMS_USE_KEYID 0x10000 | 113 | #define CMS_USE_KEYID 0x10000 |
| 114 | #define CMS_DEBUG_DECRYPT 0x20000 | ||
| 114 | 115 | ||
| 115 | const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms); | 116 | const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms); |
| 116 | 117 | ||
| @@ -184,6 +185,8 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert); | |||
| 184 | int CMS_decrypt_set1_key(CMS_ContentInfo *cms, | 185 | int CMS_decrypt_set1_key(CMS_ContentInfo *cms, |
| 185 | unsigned char *key, size_t keylen, | 186 | unsigned char *key, size_t keylen, |
| 186 | unsigned char *id, size_t idlen); | 187 | unsigned char *id, size_t idlen); |
| 188 | int CMS_decrypt_set1_password(CMS_ContentInfo *cms, | ||
| 189 | unsigned char *pass, ossl_ssize_t passlen); | ||
| 187 | 190 | ||
| 188 | STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms); | 191 | STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms); |
| 189 | int CMS_RecipientInfo_type(CMS_RecipientInfo *ri); | 192 | int CMS_RecipientInfo_type(CMS_RecipientInfo *ri); |
| @@ -219,6 +222,16 @@ int CMS_RecipientInfo_set0_key(CMS_RecipientInfo *ri, | |||
| 219 | int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri, | 222 | int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri, |
| 220 | const unsigned char *id, size_t idlen); | 223 | const unsigned char *id, size_t idlen); |
| 221 | 224 | ||
| 225 | int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri, | ||
| 226 | unsigned char *pass, | ||
| 227 | ossl_ssize_t passlen); | ||
| 228 | |||
| 229 | CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms, | ||
| 230 | int iter, int wrap_nid, int pbe_nid, | ||
| 231 | unsigned char *pass, | ||
| 232 | ossl_ssize_t passlen, | ||
| 233 | const EVP_CIPHER *kekciph); | ||
| 234 | |||
| 222 | int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri); | 235 | int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri); |
| 223 | 236 | ||
| 224 | int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out, | 237 | int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out, |
| @@ -330,6 +343,7 @@ void ERR_load_CMS_strings(void); | |||
| 330 | #define CMS_F_CHECK_CONTENT 99 | 343 | #define CMS_F_CHECK_CONTENT 99 |
| 331 | #define CMS_F_CMS_ADD0_CERT 164 | 344 | #define CMS_F_CMS_ADD0_CERT 164 |
| 332 | #define CMS_F_CMS_ADD0_RECIPIENT_KEY 100 | 345 | #define CMS_F_CMS_ADD0_RECIPIENT_KEY 100 |
| 346 | #define CMS_F_CMS_ADD0_RECIPIENT_PASSWORD 165 | ||
| 333 | #define CMS_F_CMS_ADD1_RECEIPTREQUEST 158 | 347 | #define CMS_F_CMS_ADD1_RECEIPTREQUEST 158 |
| 334 | #define CMS_F_CMS_ADD1_RECIPIENT_CERT 101 | 348 | #define CMS_F_CMS_ADD1_RECIPIENT_CERT 101 |
| 335 | #define CMS_F_CMS_ADD1_SIGNER 102 | 349 | #define CMS_F_CMS_ADD1_SIGNER 102 |
| @@ -344,6 +358,7 @@ void ERR_load_CMS_strings(void); | |||
| 344 | #define CMS_F_CMS_DATAINIT 111 | 358 | #define CMS_F_CMS_DATAINIT 111 |
| 345 | #define CMS_F_CMS_DECRYPT 112 | 359 | #define CMS_F_CMS_DECRYPT 112 |
| 346 | #define CMS_F_CMS_DECRYPT_SET1_KEY 113 | 360 | #define CMS_F_CMS_DECRYPT_SET1_KEY 113 |
| 361 | #define CMS_F_CMS_DECRYPT_SET1_PASSWORD 166 | ||
| 347 | #define CMS_F_CMS_DECRYPT_SET1_PKEY 114 | 362 | #define CMS_F_CMS_DECRYPT_SET1_PKEY 114 |
| 348 | #define CMS_F_CMS_DIGESTALGORITHM_FIND_CTX 115 | 363 | #define CMS_F_CMS_DIGESTALGORITHM_FIND_CTX 115 |
| 349 | #define CMS_F_CMS_DIGESTALGORITHM_INIT_BIO 116 | 364 | #define CMS_F_CMS_DIGESTALGORITHM_INIT_BIO 116 |
| @@ -378,7 +393,9 @@ void ERR_load_CMS_strings(void); | |||
| 378 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT 141 | 393 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT 141 |
| 379 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS 142 | 394 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS 142 |
| 380 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID 143 | 395 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID 143 |
| 396 | #define CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT 167 | ||
| 381 | #define CMS_F_CMS_RECIPIENTINFO_SET0_KEY 144 | 397 | #define CMS_F_CMS_RECIPIENTINFO_SET0_KEY 144 |
| 398 | #define CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD 168 | ||
| 382 | #define CMS_F_CMS_RECIPIENTINFO_SET0_PKEY 145 | 399 | #define CMS_F_CMS_RECIPIENTINFO_SET0_PKEY 145 |
| 383 | #define CMS_F_CMS_SET1_SIGNERIDENTIFIER 146 | 400 | #define CMS_F_CMS_SET1_SIGNERIDENTIFIER 146 |
| 384 | #define CMS_F_CMS_SET_DETACHED 147 | 401 | #define CMS_F_CMS_SET_DETACHED 147 |
| @@ -419,6 +436,7 @@ void ERR_load_CMS_strings(void); | |||
| 419 | #define CMS_R_ERROR_SETTING_KEY 115 | 436 | #define CMS_R_ERROR_SETTING_KEY 115 |
| 420 | #define CMS_R_ERROR_SETTING_RECIPIENTINFO 116 | 437 | #define CMS_R_ERROR_SETTING_RECIPIENTINFO 116 |
| 421 | #define CMS_R_INVALID_ENCRYPTED_KEY_LENGTH 117 | 438 | #define CMS_R_INVALID_ENCRYPTED_KEY_LENGTH 117 |
| 439 | #define CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER 176 | ||
| 422 | #define CMS_R_INVALID_KEY_LENGTH 118 | 440 | #define CMS_R_INVALID_KEY_LENGTH 118 |
| 423 | #define CMS_R_MD_BIO_INIT_ERROR 119 | 441 | #define CMS_R_MD_BIO_INIT_ERROR 119 |
| 424 | #define CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH 120 | 442 | #define CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH 120 |
| @@ -431,6 +449,7 @@ void ERR_load_CMS_strings(void); | |||
| 431 | #define CMS_R_NOT_ENCRYPTED_DATA 122 | 449 | #define CMS_R_NOT_ENCRYPTED_DATA 122 |
| 432 | #define CMS_R_NOT_KEK 123 | 450 | #define CMS_R_NOT_KEK 123 |
| 433 | #define CMS_R_NOT_KEY_TRANSPORT 124 | 451 | #define CMS_R_NOT_KEY_TRANSPORT 124 |
| 452 | #define CMS_R_NOT_PWRI 177 | ||
| 434 | #define CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE 125 | 453 | #define CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE 125 |
| 435 | #define CMS_R_NO_CIPHER 126 | 454 | #define CMS_R_NO_CIPHER 126 |
| 436 | #define CMS_R_NO_CONTENT 127 | 455 | #define CMS_R_NO_CONTENT 127 |
| @@ -443,6 +462,7 @@ void ERR_load_CMS_strings(void); | |||
| 443 | #define CMS_R_NO_MATCHING_RECIPIENT 132 | 462 | #define CMS_R_NO_MATCHING_RECIPIENT 132 |
| 444 | #define CMS_R_NO_MATCHING_SIGNATURE 166 | 463 | #define CMS_R_NO_MATCHING_SIGNATURE 166 |
| 445 | #define CMS_R_NO_MSGSIGDIGEST 167 | 464 | #define CMS_R_NO_MSGSIGDIGEST 167 |
| 465 | #define CMS_R_NO_PASSWORD 178 | ||
| 446 | #define CMS_R_NO_PRIVATE_KEY 133 | 466 | #define CMS_R_NO_PRIVATE_KEY 133 |
| 447 | #define CMS_R_NO_PUBLIC_KEY 134 | 467 | #define CMS_R_NO_PUBLIC_KEY 134 |
| 448 | #define CMS_R_NO_RECEIPT_REQUEST 168 | 468 | #define CMS_R_NO_RECEIPT_REQUEST 168 |
| @@ -466,10 +486,12 @@ void ERR_load_CMS_strings(void); | |||
| 466 | #define CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM 151 | 486 | #define CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM 151 |
| 467 | #define CMS_R_UNSUPPORTED_CONTENT_TYPE 152 | 487 | #define CMS_R_UNSUPPORTED_CONTENT_TYPE 152 |
| 468 | #define CMS_R_UNSUPPORTED_KEK_ALGORITHM 153 | 488 | #define CMS_R_UNSUPPORTED_KEK_ALGORITHM 153 |
| 489 | #define CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM 179 | ||
| 469 | #define CMS_R_UNSUPPORTED_RECIPIENT_TYPE 154 | 490 | #define CMS_R_UNSUPPORTED_RECIPIENT_TYPE 154 |
| 470 | #define CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE 155 | 491 | #define CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE 155 |
| 471 | #define CMS_R_UNSUPPORTED_TYPE 156 | 492 | #define CMS_R_UNSUPPORTED_TYPE 156 |
| 472 | #define CMS_R_UNWRAP_ERROR 157 | 493 | #define CMS_R_UNWRAP_ERROR 157 |
| 494 | #define CMS_R_UNWRAP_FAILURE 180 | ||
| 473 | #define CMS_R_VERIFICATION_FAILURE 158 | 495 | #define CMS_R_VERIFICATION_FAILURE 158 |
| 474 | #define CMS_R_WRAP_ERROR 159 | 496 | #define CMS_R_WRAP_ERROR 159 |
| 475 | 497 | ||
diff --git a/src/lib/libcrypto/cms/cms_asn1.c b/src/lib/libcrypto/cms/cms_asn1.c index fcba4dcbcc..cfe67fb6c1 100644 --- a/src/lib/libcrypto/cms/cms_asn1.c +++ b/src/lib/libcrypto/cms/cms_asn1.c | |||
| @@ -237,6 +237,15 @@ static int cms_ri_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it, | |||
| 237 | OPENSSL_free(kekri->key); | 237 | OPENSSL_free(kekri->key); |
| 238 | } | 238 | } |
| 239 | } | 239 | } |
| 240 | else if (ri->type == CMS_RECIPINFO_PASS) | ||
| 241 | { | ||
| 242 | CMS_PasswordRecipientInfo *pwri = ri->d.pwri; | ||
| 243 | if (pwri->pass) | ||
| 244 | { | ||
| 245 | OPENSSL_cleanse(pwri->pass, pwri->passlen); | ||
| 246 | OPENSSL_free(pwri->pass); | ||
| 247 | } | ||
| 248 | } | ||
| 240 | } | 249 | } |
| 241 | return 1; | 250 | return 1; |
| 242 | } | 251 | } |
diff --git a/src/lib/libcrypto/cms/cms_enc.c b/src/lib/libcrypto/cms/cms_enc.c index bab26235bd..f873ce3794 100644 --- a/src/lib/libcrypto/cms/cms_enc.c +++ b/src/lib/libcrypto/cms/cms_enc.c | |||
| @@ -73,6 +73,8 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) | |||
| 73 | const EVP_CIPHER *ciph; | 73 | const EVP_CIPHER *ciph; |
| 74 | X509_ALGOR *calg = ec->contentEncryptionAlgorithm; | 74 | X509_ALGOR *calg = ec->contentEncryptionAlgorithm; |
| 75 | unsigned char iv[EVP_MAX_IV_LENGTH], *piv = NULL; | 75 | unsigned char iv[EVP_MAX_IV_LENGTH], *piv = NULL; |
| 76 | unsigned char *tkey = NULL; | ||
| 77 | size_t tkeylen; | ||
| 76 | 78 | ||
| 77 | int ok = 0; | 79 | int ok = 0; |
| 78 | 80 | ||
| @@ -137,32 +139,57 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) | |||
| 137 | CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); | 139 | CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); |
| 138 | goto err; | 140 | goto err; |
| 139 | } | 141 | } |
| 140 | 142 | tkeylen = EVP_CIPHER_CTX_key_length(ctx); | |
| 141 | 143 | /* Generate random session key */ | |
| 142 | if (enc && !ec->key) | 144 | if (!enc || !ec->key) |
| 143 | { | 145 | { |
| 144 | /* Generate random key */ | 146 | tkey = OPENSSL_malloc(tkeylen); |
| 145 | if (!ec->keylen) | 147 | if (!tkey) |
| 146 | ec->keylen = EVP_CIPHER_CTX_key_length(ctx); | ||
| 147 | ec->key = OPENSSL_malloc(ec->keylen); | ||
| 148 | if (!ec->key) | ||
| 149 | { | 148 | { |
| 150 | CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, | 149 | CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, |
| 151 | ERR_R_MALLOC_FAILURE); | 150 | ERR_R_MALLOC_FAILURE); |
| 152 | goto err; | 151 | goto err; |
| 153 | } | 152 | } |
| 154 | if (EVP_CIPHER_CTX_rand_key(ctx, ec->key) <= 0) | 153 | if (EVP_CIPHER_CTX_rand_key(ctx, tkey) <= 0) |
| 155 | goto err; | 154 | goto err; |
| 156 | keep_key = 1; | ||
| 157 | } | 155 | } |
| 158 | else if (ec->keylen != (unsigned int)EVP_CIPHER_CTX_key_length(ctx)) | 156 | |
| 157 | if (!ec->key) | ||
| 158 | { | ||
| 159 | ec->key = tkey; | ||
| 160 | ec->keylen = tkeylen; | ||
| 161 | tkey = NULL; | ||
| 162 | if (enc) | ||
| 163 | keep_key = 1; | ||
| 164 | else | ||
| 165 | ERR_clear_error(); | ||
| 166 | |||
| 167 | } | ||
| 168 | |||
| 169 | if (ec->keylen != tkeylen) | ||
| 159 | { | 170 | { |
| 160 | /* If necessary set key length */ | 171 | /* If necessary set key length */ |
| 161 | if (EVP_CIPHER_CTX_set_key_length(ctx, ec->keylen) <= 0) | 172 | if (EVP_CIPHER_CTX_set_key_length(ctx, ec->keylen) <= 0) |
| 162 | { | 173 | { |
| 163 | CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, | 174 | /* Only reveal failure if debugging so we don't |
| 164 | CMS_R_INVALID_KEY_LENGTH); | 175 | * leak information which may be useful in MMA. |
| 165 | goto err; | 176 | */ |
| 177 | if (enc || ec->debug) | ||
| 178 | { | ||
| 179 | CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, | ||
| 180 | CMS_R_INVALID_KEY_LENGTH); | ||
| 181 | goto err; | ||
| 182 | } | ||
| 183 | else | ||
| 184 | { | ||
| 185 | /* Use random key */ | ||
| 186 | OPENSSL_cleanse(ec->key, ec->keylen); | ||
| 187 | OPENSSL_free(ec->key); | ||
| 188 | ec->key = tkey; | ||
| 189 | ec->keylen = tkeylen; | ||
| 190 | tkey = NULL; | ||
| 191 | ERR_clear_error(); | ||
| 192 | } | ||
| 166 | } | 193 | } |
| 167 | } | 194 | } |
| 168 | 195 | ||
| @@ -198,6 +225,11 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) | |||
| 198 | OPENSSL_free(ec->key); | 225 | OPENSSL_free(ec->key); |
| 199 | ec->key = NULL; | 226 | ec->key = NULL; |
| 200 | } | 227 | } |
| 228 | if (tkey) | ||
| 229 | { | ||
| 230 | OPENSSL_cleanse(tkey, tkeylen); | ||
| 231 | OPENSSL_free(tkey); | ||
| 232 | } | ||
| 201 | if (ok) | 233 | if (ok) |
| 202 | return b; | 234 | return b; |
| 203 | BIO_free(b); | 235 | BIO_free(b); |
diff --git a/src/lib/libcrypto/cms/cms_env.c b/src/lib/libcrypto/cms/cms_env.c index b3237d4b94..be20b1c024 100644 --- a/src/lib/libcrypto/cms/cms_env.c +++ b/src/lib/libcrypto/cms/cms_env.c | |||
| @@ -65,14 +65,13 @@ | |||
| 65 | /* CMS EnvelopedData Utilities */ | 65 | /* CMS EnvelopedData Utilities */ |
| 66 | 66 | ||
| 67 | DECLARE_ASN1_ITEM(CMS_EnvelopedData) | 67 | DECLARE_ASN1_ITEM(CMS_EnvelopedData) |
| 68 | DECLARE_ASN1_ITEM(CMS_RecipientInfo) | ||
| 69 | DECLARE_ASN1_ITEM(CMS_KeyTransRecipientInfo) | 68 | DECLARE_ASN1_ITEM(CMS_KeyTransRecipientInfo) |
| 70 | DECLARE_ASN1_ITEM(CMS_KEKRecipientInfo) | 69 | DECLARE_ASN1_ITEM(CMS_KEKRecipientInfo) |
| 71 | DECLARE_ASN1_ITEM(CMS_OtherKeyAttribute) | 70 | DECLARE_ASN1_ITEM(CMS_OtherKeyAttribute) |
| 72 | 71 | ||
| 73 | DECLARE_STACK_OF(CMS_RecipientInfo) | 72 | DECLARE_STACK_OF(CMS_RecipientInfo) |
| 74 | 73 | ||
| 75 | static CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms) | 74 | CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms) |
| 76 | { | 75 | { |
| 77 | if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_enveloped) | 76 | if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_enveloped) |
| 78 | { | 77 | { |
| @@ -371,6 +370,8 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms, | |||
| 371 | unsigned char *ek = NULL; | 370 | unsigned char *ek = NULL; |
| 372 | size_t eklen; | 371 | size_t eklen; |
| 373 | int ret = 0; | 372 | int ret = 0; |
| 373 | CMS_EncryptedContentInfo *ec; | ||
| 374 | ec = cms->d.envelopedData->encryptedContentInfo; | ||
| 374 | 375 | ||
| 375 | if (ktri->pkey == NULL) | 376 | if (ktri->pkey == NULL) |
| 376 | { | 377 | { |
| @@ -417,8 +418,14 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms, | |||
| 417 | 418 | ||
| 418 | ret = 1; | 419 | ret = 1; |
| 419 | 420 | ||
| 420 | cms->d.envelopedData->encryptedContentInfo->key = ek; | 421 | if (ec->key) |
| 421 | cms->d.envelopedData->encryptedContentInfo->keylen = eklen; | 422 | { |
| 423 | OPENSSL_cleanse(ec->key, ec->keylen); | ||
| 424 | OPENSSL_free(ec->key); | ||
| 425 | } | ||
| 426 | |||
| 427 | ec->key = ek; | ||
| 428 | ec->keylen = eklen; | ||
| 422 | 429 | ||
| 423 | err: | 430 | err: |
| 424 | if (pctx) | 431 | if (pctx) |
| @@ -786,6 +793,9 @@ int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri) | |||
| 786 | case CMS_RECIPINFO_KEK: | 793 | case CMS_RECIPINFO_KEK: |
| 787 | return cms_RecipientInfo_kekri_decrypt(cms, ri); | 794 | return cms_RecipientInfo_kekri_decrypt(cms, ri); |
| 788 | 795 | ||
| 796 | case CMS_RECIPINFO_PASS: | ||
| 797 | return cms_RecipientInfo_pwri_crypt(cms, ri, 0); | ||
| 798 | |||
| 789 | default: | 799 | default: |
| 790 | CMSerr(CMS_F_CMS_RECIPIENTINFO_DECRYPT, | 800 | CMSerr(CMS_F_CMS_RECIPIENTINFO_DECRYPT, |
| 791 | CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE); | 801 | CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE); |
| @@ -829,6 +839,10 @@ BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms) | |||
| 829 | r = cms_RecipientInfo_kekri_encrypt(cms, ri); | 839 | r = cms_RecipientInfo_kekri_encrypt(cms, ri); |
| 830 | break; | 840 | break; |
| 831 | 841 | ||
| 842 | case CMS_RECIPINFO_PASS: | ||
| 843 | r = cms_RecipientInfo_pwri_crypt(cms, ri, 1); | ||
| 844 | break; | ||
| 845 | |||
| 832 | default: | 846 | default: |
| 833 | CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO, | 847 | CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO, |
| 834 | CMS_R_UNSUPPORTED_RECIPIENT_TYPE); | 848 | CMS_R_UNSUPPORTED_RECIPIENT_TYPE); |
diff --git a/src/lib/libcrypto/cms/cms_err.c b/src/lib/libcrypto/cms/cms_err.c index ff7b0309e5..8330ead7ed 100644 --- a/src/lib/libcrypto/cms/cms_err.c +++ b/src/lib/libcrypto/cms/cms_err.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* crypto/cms/cms_err.c */ | 1 | /* crypto/cms/cms_err.c */ |
| 2 | /* ==================================================================== | 2 | /* ==================================================================== |
| 3 | * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
| @@ -73,6 +73,7 @@ static ERR_STRING_DATA CMS_str_functs[]= | |||
| 73 | {ERR_FUNC(CMS_F_CHECK_CONTENT), "CHECK_CONTENT"}, | 73 | {ERR_FUNC(CMS_F_CHECK_CONTENT), "CHECK_CONTENT"}, |
| 74 | {ERR_FUNC(CMS_F_CMS_ADD0_CERT), "CMS_add0_cert"}, | 74 | {ERR_FUNC(CMS_F_CMS_ADD0_CERT), "CMS_add0_cert"}, |
| 75 | {ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_KEY), "CMS_add0_recipient_key"}, | 75 | {ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_KEY), "CMS_add0_recipient_key"}, |
| 76 | {ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD), "CMS_add0_recipient_password"}, | ||
| 76 | {ERR_FUNC(CMS_F_CMS_ADD1_RECEIPTREQUEST), "CMS_add1_ReceiptRequest"}, | 77 | {ERR_FUNC(CMS_F_CMS_ADD1_RECEIPTREQUEST), "CMS_add1_ReceiptRequest"}, |
| 77 | {ERR_FUNC(CMS_F_CMS_ADD1_RECIPIENT_CERT), "CMS_add1_recipient_cert"}, | 78 | {ERR_FUNC(CMS_F_CMS_ADD1_RECIPIENT_CERT), "CMS_add1_recipient_cert"}, |
| 78 | {ERR_FUNC(CMS_F_CMS_ADD1_SIGNER), "CMS_add1_signer"}, | 79 | {ERR_FUNC(CMS_F_CMS_ADD1_SIGNER), "CMS_add1_signer"}, |
| @@ -87,6 +88,7 @@ static ERR_STRING_DATA CMS_str_functs[]= | |||
| 87 | {ERR_FUNC(CMS_F_CMS_DATAINIT), "CMS_dataInit"}, | 88 | {ERR_FUNC(CMS_F_CMS_DATAINIT), "CMS_dataInit"}, |
| 88 | {ERR_FUNC(CMS_F_CMS_DECRYPT), "CMS_decrypt"}, | 89 | {ERR_FUNC(CMS_F_CMS_DECRYPT), "CMS_decrypt"}, |
| 89 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_KEY), "CMS_decrypt_set1_key"}, | 90 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_KEY), "CMS_decrypt_set1_key"}, |
| 91 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PASSWORD), "CMS_decrypt_set1_password"}, | ||
| 90 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PKEY), "CMS_decrypt_set1_pkey"}, | 92 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PKEY), "CMS_decrypt_set1_pkey"}, |
| 91 | {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX), "cms_DigestAlgorithm_find_ctx"}, | 93 | {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX), "cms_DigestAlgorithm_find_ctx"}, |
| 92 | {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO), "cms_DigestAlgorithm_init_bio"}, | 94 | {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO), "cms_DigestAlgorithm_init_bio"}, |
| @@ -105,7 +107,7 @@ static ERR_STRING_DATA CMS_str_functs[]= | |||
| 105 | {ERR_FUNC(CMS_F_CMS_GET0_CERTIFICATE_CHOICES), "CMS_GET0_CERTIFICATE_CHOICES"}, | 107 | {ERR_FUNC(CMS_F_CMS_GET0_CERTIFICATE_CHOICES), "CMS_GET0_CERTIFICATE_CHOICES"}, |
| 106 | {ERR_FUNC(CMS_F_CMS_GET0_CONTENT), "CMS_get0_content"}, | 108 | {ERR_FUNC(CMS_F_CMS_GET0_CONTENT), "CMS_get0_content"}, |
| 107 | {ERR_FUNC(CMS_F_CMS_GET0_ECONTENT_TYPE), "CMS_GET0_ECONTENT_TYPE"}, | 109 | {ERR_FUNC(CMS_F_CMS_GET0_ECONTENT_TYPE), "CMS_GET0_ECONTENT_TYPE"}, |
| 108 | {ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED), "CMS_GET0_ENVELOPED"}, | 110 | {ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED), "cms_get0_enveloped"}, |
| 109 | {ERR_FUNC(CMS_F_CMS_GET0_REVOCATION_CHOICES), "CMS_GET0_REVOCATION_CHOICES"}, | 111 | {ERR_FUNC(CMS_F_CMS_GET0_REVOCATION_CHOICES), "CMS_GET0_REVOCATION_CHOICES"}, |
| 110 | {ERR_FUNC(CMS_F_CMS_GET0_SIGNED), "CMS_GET0_SIGNED"}, | 112 | {ERR_FUNC(CMS_F_CMS_GET0_SIGNED), "CMS_GET0_SIGNED"}, |
| 111 | {ERR_FUNC(CMS_F_CMS_MSGSIGDIGEST_ADD1), "cms_msgSigDigest_add1"}, | 113 | {ERR_FUNC(CMS_F_CMS_MSGSIGDIGEST_ADD1), "cms_msgSigDigest_add1"}, |
| @@ -121,7 +123,9 @@ static ERR_STRING_DATA CMS_str_functs[]= | |||
| 121 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT), "CMS_RECIPIENTINFO_KTRI_ENCRYPT"}, | 123 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT), "CMS_RECIPIENTINFO_KTRI_ENCRYPT"}, |
| 122 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS), "CMS_RecipientInfo_ktri_get0_algs"}, | 124 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS), "CMS_RecipientInfo_ktri_get0_algs"}, |
| 123 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID), "CMS_RecipientInfo_ktri_get0_signer_id"}, | 125 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID), "CMS_RecipientInfo_ktri_get0_signer_id"}, |
| 126 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT), "cms_RecipientInfo_pwri_crypt"}, | ||
| 124 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_KEY), "CMS_RecipientInfo_set0_key"}, | 127 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_KEY), "CMS_RecipientInfo_set0_key"}, |
| 128 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD), "CMS_RecipientInfo_set0_password"}, | ||
| 125 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY), "CMS_RecipientInfo_set0_pkey"}, | 129 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY), "CMS_RecipientInfo_set0_pkey"}, |
| 126 | {ERR_FUNC(CMS_F_CMS_SET1_SIGNERIDENTIFIER), "cms_set1_SignerIdentifier"}, | 130 | {ERR_FUNC(CMS_F_CMS_SET1_SIGNERIDENTIFIER), "cms_set1_SignerIdentifier"}, |
| 127 | {ERR_FUNC(CMS_F_CMS_SET_DETACHED), "CMS_set_detached"}, | 131 | {ERR_FUNC(CMS_F_CMS_SET_DETACHED), "CMS_set_detached"}, |
| @@ -165,6 +169,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= | |||
| 165 | {ERR_REASON(CMS_R_ERROR_SETTING_KEY) ,"error setting key"}, | 169 | {ERR_REASON(CMS_R_ERROR_SETTING_KEY) ,"error setting key"}, |
| 166 | {ERR_REASON(CMS_R_ERROR_SETTING_RECIPIENTINFO),"error setting recipientinfo"}, | 170 | {ERR_REASON(CMS_R_ERROR_SETTING_RECIPIENTINFO),"error setting recipientinfo"}, |
| 167 | {ERR_REASON(CMS_R_INVALID_ENCRYPTED_KEY_LENGTH),"invalid encrypted key length"}, | 171 | {ERR_REASON(CMS_R_INVALID_ENCRYPTED_KEY_LENGTH),"invalid encrypted key length"}, |
| 172 | {ERR_REASON(CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER),"invalid key encryption parameter"}, | ||
| 168 | {ERR_REASON(CMS_R_INVALID_KEY_LENGTH) ,"invalid key length"}, | 173 | {ERR_REASON(CMS_R_INVALID_KEY_LENGTH) ,"invalid key length"}, |
| 169 | {ERR_REASON(CMS_R_MD_BIO_INIT_ERROR) ,"md bio init error"}, | 174 | {ERR_REASON(CMS_R_MD_BIO_INIT_ERROR) ,"md bio init error"}, |
| 170 | {ERR_REASON(CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH),"messagedigest attribute wrong length"}, | 175 | {ERR_REASON(CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH),"messagedigest attribute wrong length"}, |
| @@ -177,6 +182,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= | |||
| 177 | {ERR_REASON(CMS_R_NOT_ENCRYPTED_DATA) ,"not encrypted data"}, | 182 | {ERR_REASON(CMS_R_NOT_ENCRYPTED_DATA) ,"not encrypted data"}, |
| 178 | {ERR_REASON(CMS_R_NOT_KEK) ,"not kek"}, | 183 | {ERR_REASON(CMS_R_NOT_KEK) ,"not kek"}, |
| 179 | {ERR_REASON(CMS_R_NOT_KEY_TRANSPORT) ,"not key transport"}, | 184 | {ERR_REASON(CMS_R_NOT_KEY_TRANSPORT) ,"not key transport"}, |
| 185 | {ERR_REASON(CMS_R_NOT_PWRI) ,"not pwri"}, | ||
| 180 | {ERR_REASON(CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE),"not supported for this key type"}, | 186 | {ERR_REASON(CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE),"not supported for this key type"}, |
| 181 | {ERR_REASON(CMS_R_NO_CIPHER) ,"no cipher"}, | 187 | {ERR_REASON(CMS_R_NO_CIPHER) ,"no cipher"}, |
| 182 | {ERR_REASON(CMS_R_NO_CONTENT) ,"no content"}, | 188 | {ERR_REASON(CMS_R_NO_CONTENT) ,"no content"}, |
| @@ -189,6 +195,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= | |||
| 189 | {ERR_REASON(CMS_R_NO_MATCHING_RECIPIENT) ,"no matching recipient"}, | 195 | {ERR_REASON(CMS_R_NO_MATCHING_RECIPIENT) ,"no matching recipient"}, |
| 190 | {ERR_REASON(CMS_R_NO_MATCHING_SIGNATURE) ,"no matching signature"}, | 196 | {ERR_REASON(CMS_R_NO_MATCHING_SIGNATURE) ,"no matching signature"}, |
| 191 | {ERR_REASON(CMS_R_NO_MSGSIGDIGEST) ,"no msgsigdigest"}, | 197 | {ERR_REASON(CMS_R_NO_MSGSIGDIGEST) ,"no msgsigdigest"}, |
| 198 | {ERR_REASON(CMS_R_NO_PASSWORD) ,"no password"}, | ||
| 192 | {ERR_REASON(CMS_R_NO_PRIVATE_KEY) ,"no private key"}, | 199 | {ERR_REASON(CMS_R_NO_PRIVATE_KEY) ,"no private key"}, |
| 193 | {ERR_REASON(CMS_R_NO_PUBLIC_KEY) ,"no public key"}, | 200 | {ERR_REASON(CMS_R_NO_PUBLIC_KEY) ,"no public key"}, |
| 194 | {ERR_REASON(CMS_R_NO_RECEIPT_REQUEST) ,"no receipt request"}, | 201 | {ERR_REASON(CMS_R_NO_RECEIPT_REQUEST) ,"no receipt request"}, |
| @@ -212,10 +219,12 @@ static ERR_STRING_DATA CMS_str_reasons[]= | |||
| 212 | {ERR_REASON(CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM),"unsupported compression algorithm"}, | 219 | {ERR_REASON(CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM),"unsupported compression algorithm"}, |
| 213 | {ERR_REASON(CMS_R_UNSUPPORTED_CONTENT_TYPE),"unsupported content type"}, | 220 | {ERR_REASON(CMS_R_UNSUPPORTED_CONTENT_TYPE),"unsupported content type"}, |
| 214 | {ERR_REASON(CMS_R_UNSUPPORTED_KEK_ALGORITHM),"unsupported kek algorithm"}, | 221 | {ERR_REASON(CMS_R_UNSUPPORTED_KEK_ALGORITHM),"unsupported kek algorithm"}, |
| 222 | {ERR_REASON(CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM),"unsupported key encryption algorithm"}, | ||
| 215 | {ERR_REASON(CMS_R_UNSUPPORTED_RECIPIENT_TYPE),"unsupported recipient type"}, | 223 | {ERR_REASON(CMS_R_UNSUPPORTED_RECIPIENT_TYPE),"unsupported recipient type"}, |
| 216 | {ERR_REASON(CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE),"unsupported recpientinfo type"}, | 224 | {ERR_REASON(CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE),"unsupported recpientinfo type"}, |
| 217 | {ERR_REASON(CMS_R_UNSUPPORTED_TYPE) ,"unsupported type"}, | 225 | {ERR_REASON(CMS_R_UNSUPPORTED_TYPE) ,"unsupported type"}, |
| 218 | {ERR_REASON(CMS_R_UNWRAP_ERROR) ,"unwrap error"}, | 226 | {ERR_REASON(CMS_R_UNWRAP_ERROR) ,"unwrap error"}, |
| 227 | {ERR_REASON(CMS_R_UNWRAP_FAILURE) ,"unwrap failure"}, | ||
| 219 | {ERR_REASON(CMS_R_VERIFICATION_FAILURE) ,"verification failure"}, | 228 | {ERR_REASON(CMS_R_VERIFICATION_FAILURE) ,"verification failure"}, |
| 220 | {ERR_REASON(CMS_R_WRAP_ERROR) ,"wrap error"}, | 229 | {ERR_REASON(CMS_R_WRAP_ERROR) ,"wrap error"}, |
| 221 | {0,NULL} | 230 | {0,NULL} |
diff --git a/src/lib/libcrypto/cms/cms_lcl.h b/src/lib/libcrypto/cms/cms_lcl.h index c8ecfa724a..a9f9730157 100644 --- a/src/lib/libcrypto/cms/cms_lcl.h +++ b/src/lib/libcrypto/cms/cms_lcl.h | |||
| @@ -175,6 +175,8 @@ struct CMS_EncryptedContentInfo_st | |||
| 175 | const EVP_CIPHER *cipher; | 175 | const EVP_CIPHER *cipher; |
| 176 | unsigned char *key; | 176 | unsigned char *key; |
| 177 | size_t keylen; | 177 | size_t keylen; |
| 178 | /* Set to 1 if we are debugging decrypt and don't fake keys for MMA */ | ||
| 179 | int debug; | ||
| 178 | }; | 180 | }; |
| 179 | 181 | ||
| 180 | struct CMS_RecipientInfo_st | 182 | struct CMS_RecipientInfo_st |
| @@ -273,6 +275,9 @@ struct CMS_PasswordRecipientInfo_st | |||
| 273 | X509_ALGOR *keyDerivationAlgorithm; | 275 | X509_ALGOR *keyDerivationAlgorithm; |
| 274 | X509_ALGOR *keyEncryptionAlgorithm; | 276 | X509_ALGOR *keyEncryptionAlgorithm; |
| 275 | ASN1_OCTET_STRING *encryptedKey; | 277 | ASN1_OCTET_STRING *encryptedKey; |
| 278 | /* Extra info: password to use */ | ||
| 279 | unsigned char *pass; | ||
| 280 | size_t passlen; | ||
| 276 | }; | 281 | }; |
| 277 | 282 | ||
| 278 | struct CMS_OtherRecipientInfo_st | 283 | struct CMS_OtherRecipientInfo_st |
| @@ -411,6 +416,8 @@ DECLARE_ASN1_ITEM(CMS_SignerInfo) | |||
| 411 | DECLARE_ASN1_ITEM(CMS_IssuerAndSerialNumber) | 416 | DECLARE_ASN1_ITEM(CMS_IssuerAndSerialNumber) |
| 412 | DECLARE_ASN1_ITEM(CMS_Attributes_Sign) | 417 | DECLARE_ASN1_ITEM(CMS_Attributes_Sign) |
| 413 | DECLARE_ASN1_ITEM(CMS_Attributes_Verify) | 418 | DECLARE_ASN1_ITEM(CMS_Attributes_Verify) |
| 419 | DECLARE_ASN1_ITEM(CMS_RecipientInfo) | ||
| 420 | DECLARE_ASN1_ITEM(CMS_PasswordRecipientInfo) | ||
| 414 | DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_IssuerAndSerialNumber) | 421 | DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_IssuerAndSerialNumber) |
| 415 | 422 | ||
| 416 | #define CMS_SIGNERINFO_ISSUER_SERIAL 0 | 423 | #define CMS_SIGNERINFO_ISSUER_SERIAL 0 |
| @@ -454,6 +461,11 @@ int cms_msgSigDigest_add1(CMS_SignerInfo *dest, CMS_SignerInfo *src); | |||
| 454 | ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si); | 461 | ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si); |
| 455 | 462 | ||
| 456 | BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms); | 463 | BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms); |
| 464 | CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms); | ||
| 465 | |||
| 466 | /* PWRI routines */ | ||
| 467 | int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri, | ||
| 468 | int en_de); | ||
| 457 | 469 | ||
| 458 | #ifdef __cplusplus | 470 | #ifdef __cplusplus |
| 459 | } | 471 | } |
diff --git a/src/lib/libcrypto/cms/cms_lib.c b/src/lib/libcrypto/cms/cms_lib.c index d00fe0f87b..f88e8f3b52 100644 --- a/src/lib/libcrypto/cms/cms_lib.c +++ b/src/lib/libcrypto/cms/cms_lib.c | |||
| @@ -412,8 +412,7 @@ int cms_DigestAlgorithm_find_ctx(EVP_MD_CTX *mctx, BIO *chain, | |||
| 412 | */ | 412 | */ |
| 413 | || EVP_MD_pkey_type(EVP_MD_CTX_md(mtmp)) == nid) | 413 | || EVP_MD_pkey_type(EVP_MD_CTX_md(mtmp)) == nid) |
| 414 | { | 414 | { |
| 415 | EVP_MD_CTX_copy_ex(mctx, mtmp); | 415 | return EVP_MD_CTX_copy_ex(mctx, mtmp); |
| 416 | return 1; | ||
| 417 | } | 416 | } |
| 418 | chain = BIO_next(chain); | 417 | chain = BIO_next(chain); |
| 419 | } | 418 | } |
diff --git a/src/lib/libcrypto/cms/cms_pwri.c b/src/lib/libcrypto/cms/cms_pwri.c new file mode 100644 index 0000000000..b79612a12d --- /dev/null +++ b/src/lib/libcrypto/cms/cms_pwri.c | |||
| @@ -0,0 +1,454 @@ | |||
| 1 | /* crypto/cms/cms_pwri.c */ | ||
| 2 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
| 3 | * project. | ||
| 4 | */ | ||
| 5 | /* ==================================================================== | ||
| 6 | * Copyright (c) 2009 The OpenSSL Project. All rights reserved. | ||
| 7 | * | ||
| 8 | * Redistribution and use in source and binary forms, with or without | ||
| 9 | * modification, are permitted provided that the following conditions | ||
| 10 | * are met: | ||
| 11 | * | ||
| 12 | * 1. Redistributions of source code must retain the above copyright | ||
| 13 | * notice, this list of conditions and the following disclaimer. | ||
| 14 | * | ||
| 15 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 16 | * notice, this list of conditions and the following disclaimer in | ||
| 17 | * the documentation and/or other materials provided with the | ||
| 18 | * distribution. | ||
| 19 | * | ||
| 20 | * 3. All advertising materials mentioning features or use of this | ||
| 21 | * software must display the following acknowledgment: | ||
| 22 | * "This product includes software developed by the OpenSSL Project | ||
| 23 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 24 | * | ||
| 25 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 26 | * endorse or promote products derived from this software without | ||
| 27 | * prior written permission. For written permission, please contact | ||
| 28 | * licensing@OpenSSL.org. | ||
| 29 | * | ||
| 30 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 31 | * nor may "OpenSSL" appear in their names without prior written | ||
| 32 | * permission of the OpenSSL Project. | ||
| 33 | * | ||
| 34 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 35 | * acknowledgment: | ||
| 36 | * "This product includes software developed by the OpenSSL Project | ||
| 37 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 38 | * | ||
| 39 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 40 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 42 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 43 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 44 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 45 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 46 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 47 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 48 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 49 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 50 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 51 | * ==================================================================== | ||
| 52 | */ | ||
| 53 | |||
| 54 | #include "cryptlib.h" | ||
| 55 | #include <openssl/asn1t.h> | ||
| 56 | #include <openssl/pem.h> | ||
| 57 | #include <openssl/x509v3.h> | ||
| 58 | #include <openssl/err.h> | ||
| 59 | #include <openssl/cms.h> | ||
| 60 | #include <openssl/rand.h> | ||
| 61 | #include <openssl/aes.h> | ||
| 62 | #include "cms_lcl.h" | ||
| 63 | #include "asn1_locl.h" | ||
| 64 | |||
| 65 | int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri, | ||
| 66 | unsigned char *pass, ossl_ssize_t passlen) | ||
| 67 | { | ||
| 68 | CMS_PasswordRecipientInfo *pwri; | ||
| 69 | if (ri->type != CMS_RECIPINFO_PASS) | ||
| 70 | { | ||
| 71 | CMSerr(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD, CMS_R_NOT_PWRI); | ||
| 72 | return 0; | ||
| 73 | } | ||
| 74 | |||
| 75 | pwri = ri->d.pwri; | ||
| 76 | pwri->pass = pass; | ||
| 77 | if (pass && passlen < 0) | ||
| 78 | passlen = strlen((char *)pass); | ||
| 79 | pwri->passlen = passlen; | ||
| 80 | return 1; | ||
| 81 | } | ||
| 82 | |||
| 83 | CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms, | ||
| 84 | int iter, int wrap_nid, int pbe_nid, | ||
| 85 | unsigned char *pass, | ||
| 86 | ossl_ssize_t passlen, | ||
| 87 | const EVP_CIPHER *kekciph) | ||
| 88 | { | ||
| 89 | CMS_RecipientInfo *ri = NULL; | ||
| 90 | CMS_EnvelopedData *env; | ||
| 91 | CMS_PasswordRecipientInfo *pwri; | ||
| 92 | EVP_CIPHER_CTX ctx; | ||
| 93 | X509_ALGOR *encalg = NULL; | ||
| 94 | unsigned char iv[EVP_MAX_IV_LENGTH]; | ||
| 95 | int ivlen; | ||
| 96 | env = cms_get0_enveloped(cms); | ||
| 97 | if (!env) | ||
| 98 | goto err; | ||
| 99 | |||
| 100 | if (wrap_nid <= 0) | ||
| 101 | wrap_nid = NID_id_alg_PWRI_KEK; | ||
| 102 | |||
| 103 | if (pbe_nid <= 0) | ||
| 104 | pbe_nid = NID_id_pbkdf2; | ||
| 105 | |||
| 106 | /* Get from enveloped data */ | ||
| 107 | if (kekciph == NULL) | ||
| 108 | kekciph = env->encryptedContentInfo->cipher; | ||
| 109 | |||
| 110 | if (kekciph == NULL) | ||
| 111 | { | ||
| 112 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, CMS_R_NO_CIPHER); | ||
| 113 | return NULL; | ||
| 114 | } | ||
| 115 | if (wrap_nid != NID_id_alg_PWRI_KEK) | ||
| 116 | { | ||
| 117 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, | ||
| 118 | CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM); | ||
| 119 | return NULL; | ||
| 120 | } | ||
| 121 | |||
| 122 | /* Setup algorithm identifier for cipher */ | ||
| 123 | encalg = X509_ALGOR_new(); | ||
| 124 | EVP_CIPHER_CTX_init(&ctx); | ||
| 125 | |||
| 126 | if (EVP_EncryptInit_ex(&ctx, kekciph, NULL, NULL, NULL) <= 0) | ||
| 127 | { | ||
| 128 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_EVP_LIB); | ||
| 129 | goto err; | ||
| 130 | } | ||
| 131 | |||
| 132 | ivlen = EVP_CIPHER_CTX_iv_length(&ctx); | ||
| 133 | |||
| 134 | if (ivlen > 0) | ||
| 135 | { | ||
| 136 | if (RAND_pseudo_bytes(iv, ivlen) <= 0) | ||
| 137 | goto err; | ||
| 138 | if (EVP_EncryptInit_ex(&ctx, NULL, NULL, NULL, iv) <= 0) | ||
| 139 | { | ||
| 140 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, | ||
| 141 | ERR_R_EVP_LIB); | ||
| 142 | goto err; | ||
| 143 | } | ||
| 144 | encalg->parameter = ASN1_TYPE_new(); | ||
| 145 | if (!encalg->parameter) | ||
| 146 | { | ||
| 147 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, | ||
| 148 | ERR_R_MALLOC_FAILURE); | ||
| 149 | goto err; | ||
| 150 | } | ||
| 151 | if (EVP_CIPHER_param_to_asn1(&ctx, encalg->parameter) <= 0) | ||
| 152 | { | ||
| 153 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, | ||
| 154 | CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); | ||
| 155 | goto err; | ||
| 156 | } | ||
| 157 | } | ||
| 158 | |||
| 159 | |||
| 160 | encalg->algorithm = OBJ_nid2obj(EVP_CIPHER_CTX_type(&ctx)); | ||
| 161 | |||
| 162 | EVP_CIPHER_CTX_cleanup(&ctx); | ||
| 163 | |||
| 164 | /* Initialize recipient info */ | ||
| 165 | ri = M_ASN1_new_of(CMS_RecipientInfo); | ||
| 166 | if (!ri) | ||
| 167 | goto merr; | ||
| 168 | |||
| 169 | ri->d.pwri = M_ASN1_new_of(CMS_PasswordRecipientInfo); | ||
| 170 | if (!ri->d.pwri) | ||
| 171 | goto merr; | ||
| 172 | ri->type = CMS_RECIPINFO_PASS; | ||
| 173 | |||
| 174 | pwri = ri->d.pwri; | ||
| 175 | /* Since this is overwritten, free up empty structure already there */ | ||
| 176 | X509_ALGOR_free(pwri->keyEncryptionAlgorithm); | ||
| 177 | pwri->keyEncryptionAlgorithm = X509_ALGOR_new(); | ||
| 178 | if (!pwri->keyEncryptionAlgorithm) | ||
| 179 | goto merr; | ||
| 180 | pwri->keyEncryptionAlgorithm->algorithm = OBJ_nid2obj(wrap_nid); | ||
| 181 | pwri->keyEncryptionAlgorithm->parameter = ASN1_TYPE_new(); | ||
| 182 | if (!pwri->keyEncryptionAlgorithm->parameter) | ||
| 183 | goto merr; | ||
| 184 | |||
| 185 | if(!ASN1_item_pack(encalg, ASN1_ITEM_rptr(X509_ALGOR), | ||
| 186 | &pwri->keyEncryptionAlgorithm->parameter->value.sequence)) | ||
| 187 | goto merr; | ||
| 188 | pwri->keyEncryptionAlgorithm->parameter->type = V_ASN1_SEQUENCE; | ||
| 189 | |||
| 190 | X509_ALGOR_free(encalg); | ||
| 191 | encalg = NULL; | ||
| 192 | |||
| 193 | /* Setup PBE algorithm */ | ||
| 194 | |||
| 195 | pwri->keyDerivationAlgorithm = PKCS5_pbkdf2_set(iter, NULL, 0, -1, -1); | ||
| 196 | |||
| 197 | if (!pwri->keyDerivationAlgorithm) | ||
| 198 | goto err; | ||
| 199 | |||
| 200 | CMS_RecipientInfo_set0_password(ri, pass, passlen); | ||
| 201 | pwri->version = 0; | ||
| 202 | |||
| 203 | if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri)) | ||
| 204 | goto merr; | ||
| 205 | |||
| 206 | return ri; | ||
| 207 | |||
| 208 | merr: | ||
| 209 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_MALLOC_FAILURE); | ||
| 210 | err: | ||
| 211 | EVP_CIPHER_CTX_cleanup(&ctx); | ||
| 212 | if (ri) | ||
| 213 | M_ASN1_free_of(ri, CMS_RecipientInfo); | ||
| 214 | if (encalg) | ||
| 215 | X509_ALGOR_free(encalg); | ||
| 216 | return NULL; | ||
| 217 | |||
| 218 | } | ||
| 219 | |||
| 220 | /* This is an implementation of the key wrapping mechanism in RFC3211, | ||
| 221 | * at some point this should go into EVP. | ||
| 222 | */ | ||
| 223 | |||
| 224 | static int kek_unwrap_key(unsigned char *out, size_t *outlen, | ||
| 225 | const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx) | ||
| 226 | { | ||
| 227 | size_t blocklen = EVP_CIPHER_CTX_block_size(ctx); | ||
| 228 | unsigned char *tmp; | ||
| 229 | int outl, rv = 0; | ||
| 230 | if (inlen < 2 * blocklen) | ||
| 231 | { | ||
| 232 | /* too small */ | ||
| 233 | return 0; | ||
| 234 | } | ||
| 235 | if (inlen % blocklen) | ||
| 236 | { | ||
| 237 | /* Invalid size */ | ||
| 238 | return 0; | ||
| 239 | } | ||
| 240 | tmp = OPENSSL_malloc(inlen); | ||
| 241 | /* setup IV by decrypting last two blocks */ | ||
| 242 | EVP_DecryptUpdate(ctx, tmp + inlen - 2 * blocklen, &outl, | ||
| 243 | in + inlen - 2 * blocklen, blocklen * 2); | ||
| 244 | /* Do a decrypt of last decrypted block to set IV to correct value | ||
| 245 | * output it to start of buffer so we don't corrupt decrypted block | ||
| 246 | * this works because buffer is at least two block lengths long. | ||
| 247 | */ | ||
| 248 | EVP_DecryptUpdate(ctx, tmp, &outl, | ||
| 249 | tmp + inlen - blocklen, blocklen); | ||
| 250 | /* Can now decrypt first n - 1 blocks */ | ||
| 251 | EVP_DecryptUpdate(ctx, tmp, &outl, in, inlen - blocklen); | ||
| 252 | |||
| 253 | /* Reset IV to original value */ | ||
| 254 | EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, NULL); | ||
| 255 | /* Decrypt again */ | ||
| 256 | EVP_DecryptUpdate(ctx, tmp, &outl, tmp, inlen); | ||
| 257 | /* Check check bytes */ | ||
| 258 | if (((tmp[1] ^ tmp[4]) & (tmp[2] ^ tmp[5]) & (tmp[3] ^ tmp[6])) != 0xff) | ||
| 259 | { | ||
| 260 | /* Check byte failure */ | ||
| 261 | goto err; | ||
| 262 | } | ||
| 263 | if (inlen < (size_t)(tmp[0] - 4 )) | ||
| 264 | { | ||
| 265 | /* Invalid length value */ | ||
| 266 | goto err; | ||
| 267 | } | ||
| 268 | *outlen = (size_t)tmp[0]; | ||
| 269 | memcpy(out, tmp + 4, *outlen); | ||
| 270 | rv = 1; | ||
| 271 | err: | ||
| 272 | OPENSSL_cleanse(tmp, inlen); | ||
| 273 | OPENSSL_free(tmp); | ||
| 274 | return rv; | ||
| 275 | |||
| 276 | } | ||
| 277 | |||
| 278 | static int kek_wrap_key(unsigned char *out, size_t *outlen, | ||
| 279 | const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx) | ||
| 280 | { | ||
| 281 | size_t blocklen = EVP_CIPHER_CTX_block_size(ctx); | ||
| 282 | size_t olen; | ||
| 283 | int dummy; | ||
| 284 | /* First decide length of output buffer: need header and round up to | ||
| 285 | * multiple of block length. | ||
| 286 | */ | ||
| 287 | olen = (inlen + 4 + blocklen - 1)/blocklen; | ||
| 288 | olen *= blocklen; | ||
| 289 | if (olen < 2 * blocklen) | ||
| 290 | { | ||
| 291 | /* Key too small */ | ||
| 292 | return 0; | ||
| 293 | } | ||
| 294 | if (inlen > 0xFF) | ||
| 295 | { | ||
| 296 | /* Key too large */ | ||
| 297 | return 0; | ||
| 298 | } | ||
| 299 | if (out) | ||
| 300 | { | ||
| 301 | /* Set header */ | ||
| 302 | out[0] = (unsigned char)inlen; | ||
| 303 | out[1] = in[0] ^ 0xFF; | ||
| 304 | out[2] = in[1] ^ 0xFF; | ||
| 305 | out[3] = in[2] ^ 0xFF; | ||
| 306 | memcpy(out + 4, in, inlen); | ||
| 307 | /* Add random padding to end */ | ||
| 308 | if (olen > inlen + 4) | ||
| 309 | RAND_pseudo_bytes(out + 4 + inlen, olen - 4 - inlen); | ||
| 310 | /* Encrypt twice */ | ||
| 311 | EVP_EncryptUpdate(ctx, out, &dummy, out, olen); | ||
| 312 | EVP_EncryptUpdate(ctx, out, &dummy, out, olen); | ||
| 313 | } | ||
| 314 | |||
| 315 | *outlen = olen; | ||
| 316 | |||
| 317 | return 1; | ||
| 318 | } | ||
| 319 | |||
| 320 | /* Encrypt/Decrypt content key in PWRI recipient info */ | ||
| 321 | |||
| 322 | int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri, | ||
| 323 | int en_de) | ||
| 324 | { | ||
| 325 | CMS_EncryptedContentInfo *ec; | ||
| 326 | CMS_PasswordRecipientInfo *pwri; | ||
| 327 | const unsigned char *p = NULL; | ||
| 328 | int plen; | ||
| 329 | int r = 0; | ||
| 330 | X509_ALGOR *algtmp, *kekalg = NULL; | ||
| 331 | EVP_CIPHER_CTX kekctx; | ||
| 332 | const EVP_CIPHER *kekcipher; | ||
| 333 | unsigned char *key = NULL; | ||
| 334 | size_t keylen; | ||
| 335 | |||
| 336 | ec = cms->d.envelopedData->encryptedContentInfo; | ||
| 337 | |||
| 338 | pwri = ri->d.pwri; | ||
| 339 | EVP_CIPHER_CTX_init(&kekctx); | ||
| 340 | |||
| 341 | if (!pwri->pass) | ||
| 342 | { | ||
| 343 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, CMS_R_NO_PASSWORD); | ||
| 344 | return 0; | ||
| 345 | } | ||
| 346 | algtmp = pwri->keyEncryptionAlgorithm; | ||
| 347 | |||
| 348 | if (!algtmp || OBJ_obj2nid(algtmp->algorithm) != NID_id_alg_PWRI_KEK) | ||
| 349 | { | ||
| 350 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
| 351 | CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM); | ||
| 352 | return 0; | ||
| 353 | } | ||
| 354 | |||
| 355 | if (algtmp->parameter->type == V_ASN1_SEQUENCE) | ||
| 356 | { | ||
| 357 | p = algtmp->parameter->value.sequence->data; | ||
| 358 | plen = algtmp->parameter->value.sequence->length; | ||
| 359 | kekalg = d2i_X509_ALGOR(NULL, &p, plen); | ||
| 360 | } | ||
| 361 | if (kekalg == NULL) | ||
| 362 | { | ||
| 363 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
| 364 | CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER); | ||
| 365 | return 0; | ||
| 366 | } | ||
| 367 | |||
| 368 | kekcipher = EVP_get_cipherbyobj(kekalg->algorithm); | ||
| 369 | |||
| 370 | if(!kekcipher) | ||
| 371 | { | ||
| 372 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
| 373 | CMS_R_UNKNOWN_CIPHER); | ||
| 374 | goto err; | ||
| 375 | } | ||
| 376 | |||
| 377 | /* Fixup cipher based on AlgorithmIdentifier to set IV etc */ | ||
| 378 | if (!EVP_CipherInit_ex(&kekctx, kekcipher, NULL, NULL, NULL, en_de)) | ||
| 379 | goto err; | ||
| 380 | EVP_CIPHER_CTX_set_padding(&kekctx, 0); | ||
| 381 | if(EVP_CIPHER_asn1_to_param(&kekctx, kekalg->parameter) < 0) | ||
| 382 | { | ||
| 383 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
| 384 | CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); | ||
| 385 | goto err; | ||
| 386 | } | ||
| 387 | |||
| 388 | algtmp = pwri->keyDerivationAlgorithm; | ||
| 389 | |||
| 390 | /* Finish password based key derivation to setup key in "ctx" */ | ||
| 391 | |||
| 392 | if (EVP_PBE_CipherInit(algtmp->algorithm, | ||
| 393 | (char *)pwri->pass, pwri->passlen, | ||
| 394 | algtmp->parameter, &kekctx, en_de) < 0) | ||
| 395 | { | ||
| 396 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, ERR_R_EVP_LIB); | ||
| 397 | goto err; | ||
| 398 | } | ||
| 399 | |||
| 400 | /* Finally wrap/unwrap the key */ | ||
| 401 | |||
| 402 | if (en_de) | ||
| 403 | { | ||
| 404 | |||
| 405 | if (!kek_wrap_key(NULL, &keylen, ec->key, ec->keylen, &kekctx)) | ||
| 406 | goto err; | ||
| 407 | |||
| 408 | key = OPENSSL_malloc(keylen); | ||
| 409 | |||
| 410 | if (!key) | ||
| 411 | goto err; | ||
| 412 | |||
| 413 | if (!kek_wrap_key(key, &keylen, ec->key, ec->keylen, &kekctx)) | ||
| 414 | goto err; | ||
| 415 | pwri->encryptedKey->data = key; | ||
| 416 | pwri->encryptedKey->length = keylen; | ||
| 417 | } | ||
| 418 | else | ||
| 419 | { | ||
| 420 | key = OPENSSL_malloc(pwri->encryptedKey->length); | ||
| 421 | |||
| 422 | if (!key) | ||
| 423 | { | ||
| 424 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
| 425 | ERR_R_MALLOC_FAILURE); | ||
| 426 | goto err; | ||
| 427 | } | ||
| 428 | if (!kek_unwrap_key(key, &keylen, | ||
| 429 | pwri->encryptedKey->data, | ||
| 430 | pwri->encryptedKey->length, &kekctx)) | ||
| 431 | { | ||
| 432 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
| 433 | CMS_R_UNWRAP_FAILURE); | ||
| 434 | goto err; | ||
| 435 | } | ||
| 436 | |||
| 437 | ec->key = key; | ||
| 438 | ec->keylen = keylen; | ||
| 439 | |||
| 440 | } | ||
| 441 | |||
| 442 | r = 1; | ||
| 443 | |||
| 444 | err: | ||
| 445 | |||
| 446 | EVP_CIPHER_CTX_cleanup(&kekctx); | ||
| 447 | |||
| 448 | if (!r && key) | ||
| 449 | OPENSSL_free(key); | ||
| 450 | X509_ALGOR_free(kekalg); | ||
| 451 | |||
| 452 | return r; | ||
| 453 | |||
| 454 | } | ||
diff --git a/src/lib/libcrypto/cms/cms_sd.c b/src/lib/libcrypto/cms/cms_sd.c index e3192b9c57..77fbd13596 100644 --- a/src/lib/libcrypto/cms/cms_sd.c +++ b/src/lib/libcrypto/cms/cms_sd.c | |||
| @@ -641,7 +641,8 @@ static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms, | |||
| 641 | cms->d.signedData->encapContentInfo->eContentType; | 641 | cms->d.signedData->encapContentInfo->eContentType; |
| 642 | unsigned char md[EVP_MAX_MD_SIZE]; | 642 | unsigned char md[EVP_MAX_MD_SIZE]; |
| 643 | unsigned int mdlen; | 643 | unsigned int mdlen; |
| 644 | EVP_DigestFinal_ex(&mctx, md, &mdlen); | 644 | if (!EVP_DigestFinal_ex(&mctx, md, &mdlen)) |
| 645 | goto err; | ||
| 645 | if (!CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest, | 646 | if (!CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest, |
| 646 | V_ASN1_OCTET_STRING, | 647 | V_ASN1_OCTET_STRING, |
| 647 | md, mdlen)) | 648 | md, mdlen)) |
diff --git a/src/lib/libcrypto/dh/dh_ameth.c b/src/lib/libcrypto/dh/dh_ameth.c index 377caf96c9..02ec2d47b4 100644 --- a/src/lib/libcrypto/dh/dh_ameth.c +++ b/src/lib/libcrypto/dh/dh_ameth.c | |||
| @@ -493,6 +493,7 @@ const EVP_PKEY_ASN1_METHOD dh_asn1_meth = | |||
| 493 | dh_copy_parameters, | 493 | dh_copy_parameters, |
| 494 | dh_cmp_parameters, | 494 | dh_cmp_parameters, |
| 495 | dh_param_print, | 495 | dh_param_print, |
| 496 | 0, | ||
| 496 | 497 | ||
| 497 | int_dh_free, | 498 | int_dh_free, |
| 498 | 0 | 499 | 0 |
diff --git a/src/lib/libcrypto/dsa/dsa_ameth.c b/src/lib/libcrypto/dsa/dsa_ameth.c index 6413aae46e..376156ec5e 100644 --- a/src/lib/libcrypto/dsa/dsa_ameth.c +++ b/src/lib/libcrypto/dsa/dsa_ameth.c | |||
| @@ -542,6 +542,52 @@ static int old_dsa_priv_encode(const EVP_PKEY *pkey, unsigned char **pder) | |||
| 542 | return i2d_DSAPrivateKey(pkey->pkey.dsa, pder); | 542 | return i2d_DSAPrivateKey(pkey->pkey.dsa, pder); |
| 543 | } | 543 | } |
| 544 | 544 | ||
| 545 | static int dsa_sig_print(BIO *bp, const X509_ALGOR *sigalg, | ||
| 546 | const ASN1_STRING *sig, | ||
| 547 | int indent, ASN1_PCTX *pctx) | ||
| 548 | { | ||
| 549 | DSA_SIG *dsa_sig; | ||
| 550 | const unsigned char *p; | ||
| 551 | if (!sig) | ||
| 552 | { | ||
| 553 | if (BIO_puts(bp, "\n") <= 0) | ||
| 554 | return 0; | ||
| 555 | else | ||
| 556 | return 1; | ||
| 557 | } | ||
| 558 | p = sig->data; | ||
| 559 | dsa_sig = d2i_DSA_SIG(NULL, &p, sig->length); | ||
| 560 | if (dsa_sig) | ||
| 561 | { | ||
| 562 | int rv = 0; | ||
| 563 | size_t buf_len = 0; | ||
| 564 | unsigned char *m=NULL; | ||
| 565 | update_buflen(dsa_sig->r, &buf_len); | ||
| 566 | update_buflen(dsa_sig->s, &buf_len); | ||
| 567 | m = OPENSSL_malloc(buf_len+10); | ||
| 568 | if (m == NULL) | ||
| 569 | { | ||
| 570 | DSAerr(DSA_F_DSA_SIG_PRINT,ERR_R_MALLOC_FAILURE); | ||
| 571 | goto err; | ||
| 572 | } | ||
| 573 | |||
| 574 | if (BIO_write(bp, "\n", 1) != 1) | ||
| 575 | goto err; | ||
| 576 | |||
| 577 | if (!ASN1_bn_print(bp,"r: ",dsa_sig->r,m,indent)) | ||
| 578 | goto err; | ||
| 579 | if (!ASN1_bn_print(bp,"s: ",dsa_sig->s,m,indent)) | ||
| 580 | goto err; | ||
| 581 | rv = 1; | ||
| 582 | err: | ||
| 583 | if (m) | ||
| 584 | OPENSSL_free(m); | ||
| 585 | DSA_SIG_free(dsa_sig); | ||
| 586 | return rv; | ||
| 587 | } | ||
| 588 | return X509_signature_dump(bp, sig, indent); | ||
| 589 | } | ||
| 590 | |||
| 545 | static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) | 591 | static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) |
| 546 | { | 592 | { |
| 547 | switch (op) | 593 | switch (op) |
| @@ -647,6 +693,7 @@ const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[] = | |||
| 647 | dsa_copy_parameters, | 693 | dsa_copy_parameters, |
| 648 | dsa_cmp_parameters, | 694 | dsa_cmp_parameters, |
| 649 | dsa_param_print, | 695 | dsa_param_print, |
| 696 | dsa_sig_print, | ||
| 650 | 697 | ||
| 651 | int_dsa_free, | 698 | int_dsa_free, |
| 652 | dsa_pkey_ctrl, | 699 | dsa_pkey_ctrl, |
diff --git a/src/lib/libcrypto/dsa/dsa_locl.h b/src/lib/libcrypto/dsa/dsa_locl.h index 2b8cfee3db..21e2e45242 100644 --- a/src/lib/libcrypto/dsa/dsa_locl.h +++ b/src/lib/libcrypto/dsa/dsa_locl.h | |||
| @@ -56,4 +56,5 @@ | |||
| 56 | 56 | ||
| 57 | int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits, | 57 | int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits, |
| 58 | const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len, | 58 | const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len, |
| 59 | unsigned char *seed_out, | ||
| 59 | int *counter_ret, unsigned long *h_ret, BN_GENCB *cb); | 60 | int *counter_ret, unsigned long *h_ret, BN_GENCB *cb); |
diff --git a/src/lib/libcrypto/dsa/dsa_pmeth.c b/src/lib/libcrypto/dsa/dsa_pmeth.c index e2df54fec6..715d8d675b 100644 --- a/src/lib/libcrypto/dsa/dsa_pmeth.c +++ b/src/lib/libcrypto/dsa/dsa_pmeth.c | |||
| @@ -189,7 +189,9 @@ static int pkey_dsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
| 189 | EVP_MD_type((const EVP_MD *)p2) != NID_dsa && | 189 | EVP_MD_type((const EVP_MD *)p2) != NID_dsa && |
| 190 | EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA && | 190 | EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA && |
| 191 | EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && | 191 | EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && |
| 192 | EVP_MD_type((const EVP_MD *)p2) != NID_sha256) | 192 | EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && |
| 193 | EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && | ||
| 194 | EVP_MD_type((const EVP_MD *)p2) != NID_sha512) | ||
| 193 | { | 195 | { |
| 194 | DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE); | 196 | DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE); |
| 195 | return 0; | 197 | return 0; |
| @@ -253,7 +255,7 @@ static int pkey_dsa_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) | |||
| 253 | if (!dsa) | 255 | if (!dsa) |
| 254 | return 0; | 256 | return 0; |
| 255 | ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd, | 257 | ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd, |
| 256 | NULL, 0, NULL, NULL, pcb); | 258 | NULL, 0, NULL, NULL, NULL, pcb); |
| 257 | if (ret) | 259 | if (ret) |
| 258 | EVP_PKEY_assign_DSA(pkey, dsa); | 260 | EVP_PKEY_assign_DSA(pkey, dsa); |
| 259 | else | 261 | else |
diff --git a/src/lib/libcrypto/ec/ec2_mult.c b/src/lib/libcrypto/ec/ec2_mult.c index e12b9b284a..26f4a783fc 100644 --- a/src/lib/libcrypto/ec/ec2_mult.c +++ b/src/lib/libcrypto/ec/ec2_mult.c | |||
| @@ -71,6 +71,8 @@ | |||
| 71 | 71 | ||
| 72 | #include "ec_lcl.h" | 72 | #include "ec_lcl.h" |
| 73 | 73 | ||
| 74 | #ifndef OPENSSL_NO_EC2M | ||
| 75 | |||
| 74 | 76 | ||
| 75 | /* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective | 77 | /* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective |
| 76 | * coordinates. | 78 | * coordinates. |
| @@ -384,3 +386,5 @@ int ec_GF2m_have_precompute_mult(const EC_GROUP *group) | |||
| 384 | { | 386 | { |
| 385 | return ec_wNAF_have_precompute_mult(group); | 387 | return ec_wNAF_have_precompute_mult(group); |
| 386 | } | 388 | } |
| 389 | |||
| 390 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ec2_oct.c b/src/lib/libcrypto/ec/ec2_oct.c new file mode 100644 index 0000000000..f1d75e5ddf --- /dev/null +++ b/src/lib/libcrypto/ec/ec2_oct.c | |||
| @@ -0,0 +1,407 @@ | |||
| 1 | /* crypto/ec/ec2_oct.c */ | ||
| 2 | /* ==================================================================== | ||
| 3 | * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. | ||
| 4 | * | ||
| 5 | * The Elliptic Curve Public-Key Crypto Library (ECC Code) included | ||
| 6 | * herein is developed by SUN MICROSYSTEMS, INC., and is contributed | ||
| 7 | * to the OpenSSL project. | ||
| 8 | * | ||
| 9 | * The ECC Code is licensed pursuant to the OpenSSL open source | ||
| 10 | * license provided below. | ||
| 11 | * | ||
| 12 | * The software is originally written by Sheueling Chang Shantz and | ||
| 13 | * Douglas Stebila of Sun Microsystems Laboratories. | ||
| 14 | * | ||
| 15 | */ | ||
| 16 | /* ==================================================================== | ||
| 17 | * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. | ||
| 18 | * | ||
| 19 | * Redistribution and use in source and binary forms, with or without | ||
| 20 | * modification, are permitted provided that the following conditions | ||
| 21 | * are met: | ||
| 22 | * | ||
| 23 | * 1. Redistributions of source code must retain the above copyright | ||
| 24 | * notice, this list of conditions and the following disclaimer. | ||
| 25 | * | ||
| 26 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer in | ||
| 28 | * the documentation and/or other materials provided with the | ||
| 29 | * distribution. | ||
| 30 | * | ||
| 31 | * 3. All advertising materials mentioning features or use of this | ||
| 32 | * software must display the following acknowledgment: | ||
| 33 | * "This product includes software developed by the OpenSSL Project | ||
| 34 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 35 | * | ||
| 36 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 37 | * endorse or promote products derived from this software without | ||
| 38 | * prior written permission. For written permission, please contact | ||
| 39 | * openssl-core@openssl.org. | ||
| 40 | * | ||
| 41 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 42 | * nor may "OpenSSL" appear in their names without prior written | ||
| 43 | * permission of the OpenSSL Project. | ||
| 44 | * | ||
| 45 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 46 | * acknowledgment: | ||
| 47 | * "This product includes software developed by the OpenSSL Project | ||
| 48 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 49 | * | ||
| 50 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 51 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 52 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 53 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 54 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 55 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 56 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 57 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 58 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 59 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 60 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 61 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 62 | * ==================================================================== | ||
| 63 | * | ||
| 64 | * This product includes cryptographic software written by Eric Young | ||
| 65 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
| 66 | * Hudson (tjh@cryptsoft.com). | ||
| 67 | * | ||
| 68 | */ | ||
| 69 | |||
| 70 | #include <openssl/err.h> | ||
| 71 | |||
| 72 | #include "ec_lcl.h" | ||
| 73 | |||
| 74 | #ifndef OPENSSL_NO_EC2M | ||
| 75 | |||
| 76 | /* Calculates and sets the affine coordinates of an EC_POINT from the given | ||
| 77 | * compressed coordinates. Uses algorithm 2.3.4 of SEC 1. | ||
| 78 | * Note that the simple implementation only uses affine coordinates. | ||
| 79 | * | ||
| 80 | * The method is from the following publication: | ||
| 81 | * | ||
| 82 | * Harper, Menezes, Vanstone: | ||
| 83 | * "Public-Key Cryptosystems with Very Small Key Lengths", | ||
| 84 | * EUROCRYPT '92, Springer-Verlag LNCS 658, | ||
| 85 | * published February 1993 | ||
| 86 | * | ||
| 87 | * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe | ||
| 88 | * the same method, but claim no priority date earlier than July 29, 1994 | ||
| 89 | * (and additionally fail to cite the EUROCRYPT '92 publication as prior art). | ||
| 90 | */ | ||
| 91 | int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point, | ||
| 92 | const BIGNUM *x_, int y_bit, BN_CTX *ctx) | ||
| 93 | { | ||
| 94 | BN_CTX *new_ctx = NULL; | ||
| 95 | BIGNUM *tmp, *x, *y, *z; | ||
| 96 | int ret = 0, z0; | ||
| 97 | |||
| 98 | /* clear error queue */ | ||
| 99 | ERR_clear_error(); | ||
| 100 | |||
| 101 | if (ctx == NULL) | ||
| 102 | { | ||
| 103 | ctx = new_ctx = BN_CTX_new(); | ||
| 104 | if (ctx == NULL) | ||
| 105 | return 0; | ||
| 106 | } | ||
| 107 | |||
| 108 | y_bit = (y_bit != 0) ? 1 : 0; | ||
| 109 | |||
| 110 | BN_CTX_start(ctx); | ||
| 111 | tmp = BN_CTX_get(ctx); | ||
| 112 | x = BN_CTX_get(ctx); | ||
| 113 | y = BN_CTX_get(ctx); | ||
| 114 | z = BN_CTX_get(ctx); | ||
| 115 | if (z == NULL) goto err; | ||
| 116 | |||
| 117 | if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err; | ||
| 118 | if (BN_is_zero(x)) | ||
| 119 | { | ||
| 120 | if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err; | ||
| 121 | } | ||
| 122 | else | ||
| 123 | { | ||
| 124 | if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err; | ||
| 125 | if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err; | ||
| 126 | if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err; | ||
| 127 | if (!BN_GF2m_add(tmp, x, tmp)) goto err; | ||
| 128 | if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx)) | ||
| 129 | { | ||
| 130 | unsigned long err = ERR_peek_last_error(); | ||
| 131 | |||
| 132 | if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION) | ||
| 133 | { | ||
| 134 | ERR_clear_error(); | ||
| 135 | ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); | ||
| 136 | } | ||
| 137 | else | ||
| 138 | ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB); | ||
| 139 | goto err; | ||
| 140 | } | ||
| 141 | z0 = (BN_is_odd(z)) ? 1 : 0; | ||
| 142 | if (!group->meth->field_mul(group, y, x, z, ctx)) goto err; | ||
| 143 | if (z0 != y_bit) | ||
| 144 | { | ||
| 145 | if (!BN_GF2m_add(y, y, x)) goto err; | ||
| 146 | } | ||
| 147 | } | ||
| 148 | |||
| 149 | if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; | ||
| 150 | |||
| 151 | ret = 1; | ||
| 152 | |||
| 153 | err: | ||
| 154 | BN_CTX_end(ctx); | ||
| 155 | if (new_ctx != NULL) | ||
| 156 | BN_CTX_free(new_ctx); | ||
| 157 | return ret; | ||
| 158 | } | ||
| 159 | |||
| 160 | |||
| 161 | /* Converts an EC_POINT to an octet string. | ||
| 162 | * If buf is NULL, the encoded length will be returned. | ||
| 163 | * If the length len of buf is smaller than required an error will be returned. | ||
| 164 | */ | ||
| 165 | size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, | ||
| 166 | unsigned char *buf, size_t len, BN_CTX *ctx) | ||
| 167 | { | ||
| 168 | size_t ret; | ||
| 169 | BN_CTX *new_ctx = NULL; | ||
| 170 | int used_ctx = 0; | ||
| 171 | BIGNUM *x, *y, *yxi; | ||
| 172 | size_t field_len, i, skip; | ||
| 173 | |||
| 174 | if ((form != POINT_CONVERSION_COMPRESSED) | ||
| 175 | && (form != POINT_CONVERSION_UNCOMPRESSED) | ||
| 176 | && (form != POINT_CONVERSION_HYBRID)) | ||
| 177 | { | ||
| 178 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM); | ||
| 179 | goto err; | ||
| 180 | } | ||
| 181 | |||
| 182 | if (EC_POINT_is_at_infinity(group, point)) | ||
| 183 | { | ||
| 184 | /* encodes to a single 0 octet */ | ||
| 185 | if (buf != NULL) | ||
| 186 | { | ||
| 187 | if (len < 1) | ||
| 188 | { | ||
| 189 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); | ||
| 190 | return 0; | ||
| 191 | } | ||
| 192 | buf[0] = 0; | ||
| 193 | } | ||
| 194 | return 1; | ||
| 195 | } | ||
| 196 | |||
| 197 | |||
| 198 | /* ret := required output buffer length */ | ||
| 199 | field_len = (EC_GROUP_get_degree(group) + 7) / 8; | ||
| 200 | ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; | ||
| 201 | |||
| 202 | /* if 'buf' is NULL, just return required length */ | ||
| 203 | if (buf != NULL) | ||
| 204 | { | ||
| 205 | if (len < ret) | ||
| 206 | { | ||
| 207 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); | ||
| 208 | goto err; | ||
| 209 | } | ||
| 210 | |||
| 211 | if (ctx == NULL) | ||
| 212 | { | ||
| 213 | ctx = new_ctx = BN_CTX_new(); | ||
| 214 | if (ctx == NULL) | ||
| 215 | return 0; | ||
| 216 | } | ||
| 217 | |||
| 218 | BN_CTX_start(ctx); | ||
| 219 | used_ctx = 1; | ||
| 220 | x = BN_CTX_get(ctx); | ||
| 221 | y = BN_CTX_get(ctx); | ||
| 222 | yxi = BN_CTX_get(ctx); | ||
| 223 | if (yxi == NULL) goto err; | ||
| 224 | |||
| 225 | if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; | ||
| 226 | |||
| 227 | buf[0] = form; | ||
| 228 | if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x)) | ||
| 229 | { | ||
| 230 | if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err; | ||
| 231 | if (BN_is_odd(yxi)) buf[0]++; | ||
| 232 | } | ||
| 233 | |||
| 234 | i = 1; | ||
| 235 | |||
| 236 | skip = field_len - BN_num_bytes(x); | ||
| 237 | if (skip > field_len) | ||
| 238 | { | ||
| 239 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
| 240 | goto err; | ||
| 241 | } | ||
| 242 | while (skip > 0) | ||
| 243 | { | ||
| 244 | buf[i++] = 0; | ||
| 245 | skip--; | ||
| 246 | } | ||
| 247 | skip = BN_bn2bin(x, buf + i); | ||
| 248 | i += skip; | ||
| 249 | if (i != 1 + field_len) | ||
| 250 | { | ||
| 251 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
| 252 | goto err; | ||
| 253 | } | ||
| 254 | |||
| 255 | if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID) | ||
| 256 | { | ||
| 257 | skip = field_len - BN_num_bytes(y); | ||
| 258 | if (skip > field_len) | ||
| 259 | { | ||
| 260 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
| 261 | goto err; | ||
| 262 | } | ||
| 263 | while (skip > 0) | ||
| 264 | { | ||
| 265 | buf[i++] = 0; | ||
| 266 | skip--; | ||
| 267 | } | ||
| 268 | skip = BN_bn2bin(y, buf + i); | ||
| 269 | i += skip; | ||
| 270 | } | ||
| 271 | |||
| 272 | if (i != ret) | ||
| 273 | { | ||
| 274 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
| 275 | goto err; | ||
| 276 | } | ||
| 277 | } | ||
| 278 | |||
| 279 | if (used_ctx) | ||
| 280 | BN_CTX_end(ctx); | ||
| 281 | if (new_ctx != NULL) | ||
| 282 | BN_CTX_free(new_ctx); | ||
| 283 | return ret; | ||
| 284 | |||
| 285 | err: | ||
| 286 | if (used_ctx) | ||
| 287 | BN_CTX_end(ctx); | ||
| 288 | if (new_ctx != NULL) | ||
| 289 | BN_CTX_free(new_ctx); | ||
| 290 | return 0; | ||
| 291 | } | ||
| 292 | |||
| 293 | |||
| 294 | /* Converts an octet string representation to an EC_POINT. | ||
| 295 | * Note that the simple implementation only uses affine coordinates. | ||
| 296 | */ | ||
| 297 | int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point, | ||
| 298 | const unsigned char *buf, size_t len, BN_CTX *ctx) | ||
| 299 | { | ||
| 300 | point_conversion_form_t form; | ||
| 301 | int y_bit; | ||
| 302 | BN_CTX *new_ctx = NULL; | ||
| 303 | BIGNUM *x, *y, *yxi; | ||
| 304 | size_t field_len, enc_len; | ||
| 305 | int ret = 0; | ||
| 306 | |||
| 307 | if (len == 0) | ||
| 308 | { | ||
| 309 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL); | ||
| 310 | return 0; | ||
| 311 | } | ||
| 312 | form = buf[0]; | ||
| 313 | y_bit = form & 1; | ||
| 314 | form = form & ~1U; | ||
| 315 | if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED) | ||
| 316 | && (form != POINT_CONVERSION_UNCOMPRESSED) | ||
| 317 | && (form != POINT_CONVERSION_HYBRID)) | ||
| 318 | { | ||
| 319 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 320 | return 0; | ||
| 321 | } | ||
| 322 | if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit) | ||
| 323 | { | ||
| 324 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 325 | return 0; | ||
| 326 | } | ||
| 327 | |||
| 328 | if (form == 0) | ||
| 329 | { | ||
| 330 | if (len != 1) | ||
| 331 | { | ||
| 332 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 333 | return 0; | ||
| 334 | } | ||
| 335 | |||
| 336 | return EC_POINT_set_to_infinity(group, point); | ||
| 337 | } | ||
| 338 | |||
| 339 | field_len = (EC_GROUP_get_degree(group) + 7) / 8; | ||
| 340 | enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; | ||
| 341 | |||
| 342 | if (len != enc_len) | ||
| 343 | { | ||
| 344 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 345 | return 0; | ||
| 346 | } | ||
| 347 | |||
| 348 | if (ctx == NULL) | ||
| 349 | { | ||
| 350 | ctx = new_ctx = BN_CTX_new(); | ||
| 351 | if (ctx == NULL) | ||
| 352 | return 0; | ||
| 353 | } | ||
| 354 | |||
| 355 | BN_CTX_start(ctx); | ||
| 356 | x = BN_CTX_get(ctx); | ||
| 357 | y = BN_CTX_get(ctx); | ||
| 358 | yxi = BN_CTX_get(ctx); | ||
| 359 | if (yxi == NULL) goto err; | ||
| 360 | |||
| 361 | if (!BN_bin2bn(buf + 1, field_len, x)) goto err; | ||
| 362 | if (BN_ucmp(x, &group->field) >= 0) | ||
| 363 | { | ||
| 364 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 365 | goto err; | ||
| 366 | } | ||
| 367 | |||
| 368 | if (form == POINT_CONVERSION_COMPRESSED) | ||
| 369 | { | ||
| 370 | if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err; | ||
| 371 | } | ||
| 372 | else | ||
| 373 | { | ||
| 374 | if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err; | ||
| 375 | if (BN_ucmp(y, &group->field) >= 0) | ||
| 376 | { | ||
| 377 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 378 | goto err; | ||
| 379 | } | ||
| 380 | if (form == POINT_CONVERSION_HYBRID) | ||
| 381 | { | ||
| 382 | if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err; | ||
| 383 | if (y_bit != BN_is_odd(yxi)) | ||
| 384 | { | ||
| 385 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 386 | goto err; | ||
| 387 | } | ||
| 388 | } | ||
| 389 | |||
| 390 | if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; | ||
| 391 | } | ||
| 392 | |||
| 393 | if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */ | ||
| 394 | { | ||
| 395 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE); | ||
| 396 | goto err; | ||
| 397 | } | ||
| 398 | |||
| 399 | ret = 1; | ||
| 400 | |||
| 401 | err: | ||
| 402 | BN_CTX_end(ctx); | ||
| 403 | if (new_ctx != NULL) | ||
| 404 | BN_CTX_free(new_ctx); | ||
| 405 | return ret; | ||
| 406 | } | ||
| 407 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ec_ameth.c b/src/lib/libcrypto/ec/ec_ameth.c index c00f7d746c..83909c1853 100644 --- a/src/lib/libcrypto/ec/ec_ameth.c +++ b/src/lib/libcrypto/ec/ec_ameth.c | |||
| @@ -651,6 +651,7 @@ const EVP_PKEY_ASN1_METHOD eckey_asn1_meth = | |||
| 651 | ec_copy_parameters, | 651 | ec_copy_parameters, |
| 652 | ec_cmp_parameters, | 652 | ec_cmp_parameters, |
| 653 | eckey_param_print, | 653 | eckey_param_print, |
| 654 | 0, | ||
| 654 | 655 | ||
| 655 | int_ec_free, | 656 | int_ec_free, |
| 656 | ec_pkey_ctrl, | 657 | ec_pkey_ctrl, |
diff --git a/src/lib/libcrypto/ec/ec_asn1.c b/src/lib/libcrypto/ec/ec_asn1.c index ae55539859..175eec5342 100644 --- a/src/lib/libcrypto/ec/ec_asn1.c +++ b/src/lib/libcrypto/ec/ec_asn1.c | |||
| @@ -83,7 +83,7 @@ int EC_GROUP_get_basis_type(const EC_GROUP *group) | |||
| 83 | /* everything else is currently not supported */ | 83 | /* everything else is currently not supported */ |
| 84 | return 0; | 84 | return 0; |
| 85 | } | 85 | } |
| 86 | 86 | #ifndef OPENSSL_NO_EC2M | |
| 87 | int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) | 87 | int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) |
| 88 | { | 88 | { |
| 89 | if (group == NULL) | 89 | if (group == NULL) |
| @@ -101,7 +101,6 @@ int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) | |||
| 101 | 101 | ||
| 102 | return 1; | 102 | return 1; |
| 103 | } | 103 | } |
| 104 | |||
| 105 | int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, | 104 | int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, |
| 106 | unsigned int *k2, unsigned int *k3) | 105 | unsigned int *k2, unsigned int *k3) |
| 107 | { | 106 | { |
| @@ -124,7 +123,7 @@ int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, | |||
| 124 | 123 | ||
| 125 | return 1; | 124 | return 1; |
| 126 | } | 125 | } |
| 127 | 126 | #endif | |
| 128 | 127 | ||
| 129 | 128 | ||
| 130 | /* some structures needed for the asn1 encoding */ | 129 | /* some structures needed for the asn1 encoding */ |
| @@ -340,6 +339,12 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field) | |||
| 340 | } | 339 | } |
| 341 | } | 340 | } |
| 342 | else /* nid == NID_X9_62_characteristic_two_field */ | 341 | else /* nid == NID_X9_62_characteristic_two_field */ |
| 342 | #ifdef OPENSSL_NO_EC2M | ||
| 343 | { | ||
| 344 | ECerr(EC_F_EC_ASN1_GROUP2FIELDID, EC_R_GF2M_NOT_SUPPORTED); | ||
| 345 | goto err; | ||
| 346 | } | ||
| 347 | #else | ||
| 343 | { | 348 | { |
| 344 | int field_type; | 349 | int field_type; |
| 345 | X9_62_CHARACTERISTIC_TWO *char_two; | 350 | X9_62_CHARACTERISTIC_TWO *char_two; |
| @@ -419,6 +424,7 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field) | |||
| 419 | } | 424 | } |
| 420 | } | 425 | } |
| 421 | } | 426 | } |
| 427 | #endif | ||
| 422 | 428 | ||
| 423 | ok = 1; | 429 | ok = 1; |
| 424 | 430 | ||
| @@ -456,6 +462,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve) | |||
| 456 | goto err; | 462 | goto err; |
| 457 | } | 463 | } |
| 458 | } | 464 | } |
| 465 | #ifndef OPENSSL_NO_EC2M | ||
| 459 | else /* nid == NID_X9_62_characteristic_two_field */ | 466 | else /* nid == NID_X9_62_characteristic_two_field */ |
| 460 | { | 467 | { |
| 461 | if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL)) | 468 | if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL)) |
| @@ -464,7 +471,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve) | |||
| 464 | goto err; | 471 | goto err; |
| 465 | } | 472 | } |
| 466 | } | 473 | } |
| 467 | 474 | #endif | |
| 468 | len_1 = (size_t)BN_num_bytes(tmp_1); | 475 | len_1 = (size_t)BN_num_bytes(tmp_1); |
| 469 | len_2 = (size_t)BN_num_bytes(tmp_2); | 476 | len_2 = (size_t)BN_num_bytes(tmp_2); |
| 470 | 477 | ||
| @@ -775,8 +782,13 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params) | |||
| 775 | 782 | ||
| 776 | /* get the field parameters */ | 783 | /* get the field parameters */ |
| 777 | tmp = OBJ_obj2nid(params->fieldID->fieldType); | 784 | tmp = OBJ_obj2nid(params->fieldID->fieldType); |
| 778 | |||
| 779 | if (tmp == NID_X9_62_characteristic_two_field) | 785 | if (tmp == NID_X9_62_characteristic_two_field) |
| 786 | #ifdef OPENSSL_NO_EC2M | ||
| 787 | { | ||
| 788 | ECerr(EC_F_EC_ASN1_PARAMETERS2GROUP, EC_R_GF2M_NOT_SUPPORTED); | ||
| 789 | goto err; | ||
| 790 | } | ||
| 791 | #else | ||
| 780 | { | 792 | { |
| 781 | X9_62_CHARACTERISTIC_TWO *char_two; | 793 | X9_62_CHARACTERISTIC_TWO *char_two; |
| 782 | 794 | ||
| @@ -862,6 +874,7 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params) | |||
| 862 | /* create the EC_GROUP structure */ | 874 | /* create the EC_GROUP structure */ |
| 863 | ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL); | 875 | ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL); |
| 864 | } | 876 | } |
| 877 | #endif | ||
| 865 | else if (tmp == NID_X9_62_prime_field) | 878 | else if (tmp == NID_X9_62_prime_field) |
| 866 | { | 879 | { |
| 867 | /* we have a curve over a prime field */ | 880 | /* we have a curve over a prime field */ |
| @@ -1065,6 +1078,7 @@ EC_GROUP *d2i_ECPKParameters(EC_GROUP **a, const unsigned char **in, long len) | |||
| 1065 | if ((group = ec_asn1_pkparameters2group(params)) == NULL) | 1078 | if ((group = ec_asn1_pkparameters2group(params)) == NULL) |
| 1066 | { | 1079 | { |
| 1067 | ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE); | 1080 | ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE); |
| 1081 | ECPKPARAMETERS_free(params); | ||
| 1068 | return NULL; | 1082 | return NULL; |
| 1069 | } | 1083 | } |
| 1070 | 1084 | ||
diff --git a/src/lib/libcrypto/ec/ec_curve.c b/src/lib/libcrypto/ec/ec_curve.c index 23274e4031..c72fb2697c 100644 --- a/src/lib/libcrypto/ec/ec_curve.c +++ b/src/lib/libcrypto/ec/ec_curve.c | |||
| @@ -3,7 +3,7 @@ | |||
| 3 | * Written by Nils Larsch for the OpenSSL project. | 3 | * Written by Nils Larsch for the OpenSSL project. |
| 4 | */ | 4 | */ |
| 5 | /* ==================================================================== | 5 | /* ==================================================================== |
| 6 | * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved. | 6 | * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved. |
| 7 | * | 7 | * |
| 8 | * Redistribution and use in source and binary forms, with or without | 8 | * Redistribution and use in source and binary forms, with or without |
| 9 | * modification, are permitted provided that the following conditions | 9 | * modification, are permitted provided that the following conditions |
| @@ -72,6 +72,7 @@ | |||
| 72 | #include "ec_lcl.h" | 72 | #include "ec_lcl.h" |
| 73 | #include <openssl/err.h> | 73 | #include <openssl/err.h> |
| 74 | #include <openssl/obj_mac.h> | 74 | #include <openssl/obj_mac.h> |
| 75 | #include <openssl/opensslconf.h> | ||
| 75 | 76 | ||
| 76 | typedef struct { | 77 | typedef struct { |
| 77 | int field_type, /* either NID_X9_62_prime_field or | 78 | int field_type, /* either NID_X9_62_prime_field or |
| @@ -703,6 +704,8 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+28*6]; } | |||
| 703 | 0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D } | 704 | 0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D } |
| 704 | }; | 705 | }; |
| 705 | 706 | ||
| 707 | #ifndef OPENSSL_NO_EC2M | ||
| 708 | |||
| 706 | /* characteristic two curves */ | 709 | /* characteristic two curves */ |
| 707 | static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; } | 710 | static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; } |
| 708 | _EC_SECG_CHAR2_113R1 = { | 711 | _EC_SECG_CHAR2_113R1 = { |
| @@ -1300,7 +1303,7 @@ static const struct { EC_CURVE_DATA h; unsigned char data[20+21*6]; } | |||
| 1300 | { 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76, /* seed */ | 1303 | { 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76, /* seed */ |
| 1301 | 0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD, | 1304 | 0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD, |
| 1302 | 1305 | ||
| 1303 | 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */ | 1306 | 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */ |
| 1304 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, | 1307 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, |
| 1305 | 0x07, | 1308 | 0x07, |
| 1306 | 0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9, /* a */ | 1309 | 0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9, /* a */ |
| @@ -1817,103 +1820,128 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+24*6]; } | |||
| 1817 | 0xBA,0xFC,0xA7,0x5E } | 1820 | 0xBA,0xFC,0xA7,0x5E } |
| 1818 | }; | 1821 | }; |
| 1819 | 1822 | ||
| 1823 | #endif | ||
| 1824 | |||
| 1820 | typedef struct _ec_list_element_st { | 1825 | typedef struct _ec_list_element_st { |
| 1821 | int nid; | 1826 | int nid; |
| 1822 | const EC_CURVE_DATA *data; | 1827 | const EC_CURVE_DATA *data; |
| 1828 | const EC_METHOD *(*meth)(void); | ||
| 1823 | const char *comment; | 1829 | const char *comment; |
| 1824 | } ec_list_element; | 1830 | } ec_list_element; |
| 1825 | 1831 | ||
| 1826 | static const ec_list_element curve_list[] = { | 1832 | static const ec_list_element curve_list[] = { |
| 1827 | /* prime field curves */ | 1833 | /* prime field curves */ |
| 1828 | /* secg curves */ | 1834 | /* secg curves */ |
| 1829 | { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"}, | 1835 | { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" }, |
| 1830 | { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, "SECG curve over a 112 bit prime field"}, | 1836 | { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, 0, "SECG curve over a 112 bit prime field" }, |
| 1831 | { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, "SECG curve over a 128 bit prime field"}, | 1837 | { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, 0, "SECG curve over a 128 bit prime field" }, |
| 1832 | { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, "SECG curve over a 128 bit prime field"}, | 1838 | { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, 0, "SECG curve over a 128 bit prime field" }, |
| 1833 | { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, "SECG curve over a 160 bit prime field"}, | 1839 | { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, 0, "SECG curve over a 160 bit prime field" }, |
| 1834 | { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, "SECG curve over a 160 bit prime field"}, | 1840 | { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, 0, "SECG curve over a 160 bit prime field" }, |
| 1835 | { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"}, | 1841 | { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" }, |
| 1836 | /* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */ | 1842 | /* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */ |
| 1837 | { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, "SECG curve over a 192 bit prime field"}, | 1843 | { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, 0, "SECG curve over a 192 bit prime field" }, |
| 1838 | { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, "SECG curve over a 224 bit prime field"}, | 1844 | { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, 0, "SECG curve over a 224 bit prime field" }, |
| 1839 | { NID_secp224r1, &_EC_NIST_PRIME_224.h, "NIST/SECG curve over a 224 bit prime field"}, | 1845 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 |
| 1840 | { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, "SECG curve over a 256 bit prime field"}, | 1846 | { NID_secp224r1, &_EC_NIST_PRIME_224.h, EC_GFp_nistp224_method, "NIST/SECG curve over a 224 bit prime field" }, |
| 1847 | #else | ||
| 1848 | { NID_secp224r1, &_EC_NIST_PRIME_224.h, 0, "NIST/SECG curve over a 224 bit prime field" }, | ||
| 1849 | #endif | ||
| 1850 | { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, 0, "SECG curve over a 256 bit prime field" }, | ||
| 1841 | /* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */ | 1851 | /* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */ |
| 1842 | { NID_secp384r1, &_EC_NIST_PRIME_384.h, "NIST/SECG curve over a 384 bit prime field"}, | 1852 | { NID_secp384r1, &_EC_NIST_PRIME_384.h, 0, "NIST/SECG curve over a 384 bit prime field" }, |
| 1843 | { NID_secp521r1, &_EC_NIST_PRIME_521.h, "NIST/SECG curve over a 521 bit prime field"}, | 1853 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 |
| 1854 | { NID_secp521r1, &_EC_NIST_PRIME_521.h, EC_GFp_nistp521_method, "NIST/SECG curve over a 521 bit prime field" }, | ||
| 1855 | #else | ||
| 1856 | { NID_secp521r1, &_EC_NIST_PRIME_521.h, 0, "NIST/SECG curve over a 521 bit prime field" }, | ||
| 1857 | #endif | ||
| 1844 | /* X9.62 curves */ | 1858 | /* X9.62 curves */ |
| 1845 | { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, "NIST/X9.62/SECG curve over a 192 bit prime field"}, | 1859 | { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, 0, "NIST/X9.62/SECG curve over a 192 bit prime field" }, |
| 1846 | { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, "X9.62 curve over a 192 bit prime field"}, | 1860 | { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, 0, "X9.62 curve over a 192 bit prime field" }, |
| 1847 | { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, "X9.62 curve over a 192 bit prime field"}, | 1861 | { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, 0, "X9.62 curve over a 192 bit prime field" }, |
| 1848 | { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, "X9.62 curve over a 239 bit prime field"}, | 1862 | { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, 0, "X9.62 curve over a 239 bit prime field" }, |
| 1849 | { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, "X9.62 curve over a 239 bit prime field"}, | 1863 | { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, 0, "X9.62 curve over a 239 bit prime field" }, |
| 1850 | { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, "X9.62 curve over a 239 bit prime field"}, | 1864 | { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0, "X9.62 curve over a 239 bit prime field" }, |
| 1851 | { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, "X9.62/SECG curve over a 256 bit prime field"}, | 1865 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 |
| 1866 | { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method, "X9.62/SECG curve over a 256 bit prime field" }, | ||
| 1867 | #else | ||
| 1868 | { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0, "X9.62/SECG curve over a 256 bit prime field" }, | ||
| 1869 | #endif | ||
| 1870 | #ifndef OPENSSL_NO_EC2M | ||
| 1852 | /* characteristic two field curves */ | 1871 | /* characteristic two field curves */ |
| 1853 | /* NIST/SECG curves */ | 1872 | /* NIST/SECG curves */ |
| 1854 | { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"}, | 1873 | { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" }, |
| 1855 | { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, "SECG curve over a 113 bit binary field"}, | 1874 | { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, 0, "SECG curve over a 113 bit binary field" }, |
| 1856 | { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, "SECG/WTLS curve over a 131 bit binary field"}, | 1875 | { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, 0, "SECG/WTLS curve over a 131 bit binary field" }, |
| 1857 | { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, "SECG curve over a 131 bit binary field"}, | 1876 | { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, 0, "SECG curve over a 131 bit binary field" }, |
| 1858 | { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field" }, | 1877 | { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" }, |
| 1859 | { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, "SECG curve over a 163 bit binary field"}, | 1878 | { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, 0, "SECG curve over a 163 bit binary field" }, |
| 1860 | { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, "NIST/SECG curve over a 163 bit binary field" }, | 1879 | { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, 0, "NIST/SECG curve over a 163 bit binary field" }, |
| 1861 | { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, "SECG curve over a 193 bit binary field"}, | 1880 | { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, 0, "SECG curve over a 193 bit binary field" }, |
| 1862 | { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, "SECG curve over a 193 bit binary field"}, | 1881 | { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, 0, "SECG curve over a 193 bit binary field" }, |
| 1863 | { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field" }, | 1882 | { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, |
| 1864 | { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field" }, | 1883 | { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, |
| 1865 | { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, "SECG curve over a 239 bit binary field"}, | 1884 | { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, 0, "SECG curve over a 239 bit binary field" }, |
| 1866 | { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, "NIST/SECG curve over a 283 bit binary field" }, | 1885 | { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, 0, "NIST/SECG curve over a 283 bit binary field" }, |
| 1867 | { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, "NIST/SECG curve over a 283 bit binary field" }, | 1886 | { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, 0, "NIST/SECG curve over a 283 bit binary field" }, |
| 1868 | { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, "NIST/SECG curve over a 409 bit binary field" }, | 1887 | { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, 0, "NIST/SECG curve over a 409 bit binary field" }, |
| 1869 | { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, "NIST/SECG curve over a 409 bit binary field" }, | 1888 | { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, 0, "NIST/SECG curve over a 409 bit binary field" }, |
| 1870 | { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, "NIST/SECG curve over a 571 bit binary field" }, | 1889 | { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, 0, "NIST/SECG curve over a 571 bit binary field" }, |
| 1871 | { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, "NIST/SECG curve over a 571 bit binary field" }, | 1890 | { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, 0, "NIST/SECG curve over a 571 bit binary field" }, |
| 1872 | /* X9.62 curves */ | 1891 | /* X9.62 curves */ |
| 1873 | { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"}, | 1892 | { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" }, |
| 1874 | { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, "X9.62 curve over a 163 bit binary field"}, | 1893 | { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, 0, "X9.62 curve over a 163 bit binary field" }, |
| 1875 | { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, "X9.62 curve over a 163 bit binary field"}, | 1894 | { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, 0, "X9.62 curve over a 163 bit binary field" }, |
| 1876 | { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, "X9.62 curve over a 176 bit binary field"}, | 1895 | { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, 0, "X9.62 curve over a 176 bit binary field" }, |
| 1877 | { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, "X9.62 curve over a 191 bit binary field"}, | 1896 | { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, 0, "X9.62 curve over a 191 bit binary field" }, |
| 1878 | { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, "X9.62 curve over a 191 bit binary field"}, | 1897 | { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, 0, "X9.62 curve over a 191 bit binary field" }, |
| 1879 | { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, "X9.62 curve over a 191 bit binary field"}, | 1898 | { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, 0, "X9.62 curve over a 191 bit binary field" }, |
| 1880 | { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, "X9.62 curve over a 208 bit binary field"}, | 1899 | { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, 0, "X9.62 curve over a 208 bit binary field" }, |
| 1881 | { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, "X9.62 curve over a 239 bit binary field"}, | 1900 | { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, 0, "X9.62 curve over a 239 bit binary field" }, |
| 1882 | { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, "X9.62 curve over a 239 bit binary field"}, | 1901 | { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, 0, "X9.62 curve over a 239 bit binary field" }, |
| 1883 | { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, "X9.62 curve over a 239 bit binary field"}, | 1902 | { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, 0, "X9.62 curve over a 239 bit binary field" }, |
| 1884 | { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, "X9.62 curve over a 272 bit binary field"}, | 1903 | { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, 0, "X9.62 curve over a 272 bit binary field" }, |
| 1885 | { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, "X9.62 curve over a 304 bit binary field"}, | 1904 | { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, 0, "X9.62 curve over a 304 bit binary field" }, |
| 1886 | { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, "X9.62 curve over a 359 bit binary field"}, | 1905 | { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, 0, "X9.62 curve over a 359 bit binary field" }, |
| 1887 | { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, "X9.62 curve over a 368 bit binary field"}, | 1906 | { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, 0, "X9.62 curve over a 368 bit binary field" }, |
| 1888 | { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, "X9.62 curve over a 431 bit binary field"}, | 1907 | { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, 0, "X9.62 curve over a 431 bit binary field" }, |
| 1889 | /* the WAP/WTLS curves | 1908 | /* the WAP/WTLS curves |
| 1890 | * [unlike SECG, spec has its own OIDs for curves from X9.62] */ | 1909 | * [unlike SECG, spec has its own OIDs for curves from X9.62] */ |
| 1891 | { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, "WTLS curve over a 113 bit binary field"}, | 1910 | { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, 0, "WTLS curve over a 113 bit binary field" }, |
| 1892 | { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field"}, | 1911 | { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" }, |
| 1893 | { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"}, | 1912 | { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" }, |
| 1894 | { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"}, | 1913 | { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" }, |
| 1895 | { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"}, | 1914 | #endif |
| 1896 | { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"}, | 1915 | { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" }, |
| 1897 | { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, "WTLS curve over a 112 bit prime field"}, | 1916 | { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" }, |
| 1898 | { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, "WTLS curve over a 160 bit prime field" }, | 1917 | { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, 0, "WTLS curve over a 112 bit prime field" }, |
| 1899 | { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field"}, | 1918 | { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, 0, "WTLS curve over a 160 bit prime field" }, |
| 1900 | { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field"}, | 1919 | #ifndef OPENSSL_NO_EC2M |
| 1901 | { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, "WTLS curvs over a 224 bit prime field"}, | 1920 | { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, |
| 1921 | { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, | ||
| 1922 | #endif | ||
| 1923 | { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, 0, "WTLS curvs over a 224 bit prime field" }, | ||
| 1924 | #ifndef OPENSSL_NO_EC2M | ||
| 1902 | /* IPSec curves */ | 1925 | /* IPSec curves */ |
| 1903 | { NID_ipsec3, &_EC_IPSEC_155_ID3.h, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"}, | 1926 | { NID_ipsec3, &_EC_IPSEC_155_ID3.h, 0, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n" |
| 1904 | { NID_ipsec4, &_EC_IPSEC_185_ID4.h, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"}, | 1927 | "\tNot suitable for ECDSA.\n\tQuestionable extension field!" }, |
| 1928 | { NID_ipsec4, &_EC_IPSEC_185_ID4.h, 0, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n" | ||
| 1929 | "\tNot suitable for ECDSA.\n\tQuestionable extension field!" }, | ||
| 1930 | #endif | ||
| 1905 | }; | 1931 | }; |
| 1906 | 1932 | ||
| 1907 | #define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element)) | 1933 | #define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element)) |
| 1908 | 1934 | ||
| 1909 | static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | 1935 | static EC_GROUP *ec_group_new_from_data(const ec_list_element curve) |
| 1910 | { | 1936 | { |
| 1911 | EC_GROUP *group=NULL; | 1937 | EC_GROUP *group=NULL; |
| 1912 | EC_POINT *P=NULL; | 1938 | EC_POINT *P=NULL; |
| 1913 | BN_CTX *ctx=NULL; | 1939 | BN_CTX *ctx=NULL; |
| 1914 | BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL; | 1940 | BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL; |
| 1915 | int ok=0; | 1941 | int ok=0; |
| 1916 | int seed_len,param_len; | 1942 | int seed_len,param_len; |
| 1943 | const EC_METHOD *meth; | ||
| 1944 | const EC_CURVE_DATA *data; | ||
| 1917 | const unsigned char *params; | 1945 | const unsigned char *params; |
| 1918 | 1946 | ||
| 1919 | if ((ctx = BN_CTX_new()) == NULL) | 1947 | if ((ctx = BN_CTX_new()) == NULL) |
| @@ -1922,10 +1950,11 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | |||
| 1922 | goto err; | 1950 | goto err; |
| 1923 | } | 1951 | } |
| 1924 | 1952 | ||
| 1953 | data = curve.data; | ||
| 1925 | seed_len = data->seed_len; | 1954 | seed_len = data->seed_len; |
| 1926 | param_len = data->param_len; | 1955 | param_len = data->param_len; |
| 1927 | params = (const unsigned char *)(data+1); /* skip header */ | 1956 | params = (const unsigned char *)(data+1); /* skip header */ |
| 1928 | params += seed_len; /* skip seed */ | 1957 | params += seed_len; /* skip seed */ |
| 1929 | 1958 | ||
| 1930 | if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL)) | 1959 | if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL)) |
| 1931 | || !(a = BN_bin2bn(params+1*param_len, param_len, NULL)) | 1960 | || !(a = BN_bin2bn(params+1*param_len, param_len, NULL)) |
| @@ -1935,7 +1964,17 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | |||
| 1935 | goto err; | 1964 | goto err; |
| 1936 | } | 1965 | } |
| 1937 | 1966 | ||
| 1938 | if (data->field_type == NID_X9_62_prime_field) | 1967 | if (curve.meth != 0) |
| 1968 | { | ||
| 1969 | meth = curve.meth(); | ||
| 1970 | if (((group = EC_GROUP_new(meth)) == NULL) || | ||
| 1971 | (!(group->meth->group_set_curve(group, p, a, b, ctx)))) | ||
| 1972 | { | ||
| 1973 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); | ||
| 1974 | goto err; | ||
| 1975 | } | ||
| 1976 | } | ||
| 1977 | else if (data->field_type == NID_X9_62_prime_field) | ||
| 1939 | { | 1978 | { |
| 1940 | if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL) | 1979 | if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL) |
| 1941 | { | 1980 | { |
| @@ -1943,6 +1982,7 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | |||
| 1943 | goto err; | 1982 | goto err; |
| 1944 | } | 1983 | } |
| 1945 | } | 1984 | } |
| 1985 | #ifndef OPENSSL_NO_EC2M | ||
| 1946 | else /* field_type == NID_X9_62_characteristic_two_field */ | 1986 | else /* field_type == NID_X9_62_characteristic_two_field */ |
| 1947 | { | 1987 | { |
| 1948 | if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL) | 1988 | if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL) |
| @@ -1951,20 +1991,21 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | |||
| 1951 | goto err; | 1991 | goto err; |
| 1952 | } | 1992 | } |
| 1953 | } | 1993 | } |
| 1994 | #endif | ||
| 1954 | 1995 | ||
| 1955 | if ((P = EC_POINT_new(group)) == NULL) | 1996 | if ((P = EC_POINT_new(group)) == NULL) |
| 1956 | { | 1997 | { |
| 1957 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); | 1998 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); |
| 1958 | goto err; | 1999 | goto err; |
| 1959 | } | 2000 | } |
| 1960 | 2001 | ||
| 1961 | if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL)) | 2002 | if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL)) |
| 1962 | || !(y = BN_bin2bn(params+4*param_len, param_len, NULL))) | 2003 | || !(y = BN_bin2bn(params+4*param_len, param_len, NULL))) |
| 1963 | { | 2004 | { |
| 1964 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB); | 2005 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB); |
| 1965 | goto err; | 2006 | goto err; |
| 1966 | } | 2007 | } |
| 1967 | if (!EC_POINT_set_affine_coordinates_GF2m(group, P, x, y, ctx)) | 2008 | if (!EC_POINT_set_affine_coordinates_GFp(group, P, x, y, ctx)) |
| 1968 | { | 2009 | { |
| 1969 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); | 2010 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); |
| 1970 | goto err; | 2011 | goto err; |
| @@ -2025,7 +2066,7 @@ EC_GROUP *EC_GROUP_new_by_curve_name(int nid) | |||
| 2025 | for (i=0; i<curve_list_length; i++) | 2066 | for (i=0; i<curve_list_length; i++) |
| 2026 | if (curve_list[i].nid == nid) | 2067 | if (curve_list[i].nid == nid) |
| 2027 | { | 2068 | { |
| 2028 | ret = ec_group_new_from_data(curve_list[i].data); | 2069 | ret = ec_group_new_from_data(curve_list[i]); |
| 2029 | break; | 2070 | break; |
| 2030 | } | 2071 | } |
| 2031 | 2072 | ||
diff --git a/src/lib/libcrypto/ec/ec_key.c b/src/lib/libcrypto/ec/ec_key.c index 522802c07a..bf9fd2dc2c 100644 --- a/src/lib/libcrypto/ec/ec_key.c +++ b/src/lib/libcrypto/ec/ec_key.c | |||
| @@ -64,7 +64,9 @@ | |||
| 64 | #include <string.h> | 64 | #include <string.h> |
| 65 | #include "ec_lcl.h" | 65 | #include "ec_lcl.h" |
| 66 | #include <openssl/err.h> | 66 | #include <openssl/err.h> |
| 67 | #include <string.h> | 67 | #ifdef OPENSSL_FIPS |
| 68 | #include <openssl/fips.h> | ||
| 69 | #endif | ||
| 68 | 70 | ||
| 69 | EC_KEY *EC_KEY_new(void) | 71 | EC_KEY *EC_KEY_new(void) |
| 70 | { | 72 | { |
| @@ -78,6 +80,7 @@ EC_KEY *EC_KEY_new(void) | |||
| 78 | } | 80 | } |
| 79 | 81 | ||
| 80 | ret->version = 1; | 82 | ret->version = 1; |
| 83 | ret->flags = 0; | ||
| 81 | ret->group = NULL; | 84 | ret->group = NULL; |
| 82 | ret->pub_key = NULL; | 85 | ret->pub_key = NULL; |
| 83 | ret->priv_key= NULL; | 86 | ret->priv_key= NULL; |
| @@ -197,6 +200,7 @@ EC_KEY *EC_KEY_copy(EC_KEY *dest, const EC_KEY *src) | |||
| 197 | dest->enc_flag = src->enc_flag; | 200 | dest->enc_flag = src->enc_flag; |
| 198 | dest->conv_form = src->conv_form; | 201 | dest->conv_form = src->conv_form; |
| 199 | dest->version = src->version; | 202 | dest->version = src->version; |
| 203 | dest->flags = src->flags; | ||
| 200 | 204 | ||
| 201 | return dest; | 205 | return dest; |
| 202 | } | 206 | } |
| @@ -237,6 +241,11 @@ int EC_KEY_generate_key(EC_KEY *eckey) | |||
| 237 | BIGNUM *priv_key = NULL, *order = NULL; | 241 | BIGNUM *priv_key = NULL, *order = NULL; |
| 238 | EC_POINT *pub_key = NULL; | 242 | EC_POINT *pub_key = NULL; |
| 239 | 243 | ||
| 244 | #ifdef OPENSSL_FIPS | ||
| 245 | if (FIPS_mode()) | ||
| 246 | return FIPS_ec_key_generate_key(eckey); | ||
| 247 | #endif | ||
| 248 | |||
| 240 | if (!eckey || !eckey->group) | 249 | if (!eckey || !eckey->group) |
| 241 | { | 250 | { |
| 242 | ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER); | 251 | ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER); |
| @@ -371,6 +380,82 @@ err: | |||
| 371 | return(ok); | 380 | return(ok); |
| 372 | } | 381 | } |
| 373 | 382 | ||
| 383 | int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y) | ||
| 384 | { | ||
| 385 | BN_CTX *ctx = NULL; | ||
| 386 | BIGNUM *tx, *ty; | ||
| 387 | EC_POINT *point = NULL; | ||
| 388 | int ok = 0, tmp_nid, is_char_two = 0; | ||
| 389 | |||
| 390 | if (!key || !key->group || !x || !y) | ||
| 391 | { | ||
| 392 | ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES, | ||
| 393 | ERR_R_PASSED_NULL_PARAMETER); | ||
| 394 | return 0; | ||
| 395 | } | ||
| 396 | ctx = BN_CTX_new(); | ||
| 397 | if (!ctx) | ||
| 398 | goto err; | ||
| 399 | |||
| 400 | point = EC_POINT_new(key->group); | ||
| 401 | |||
| 402 | if (!point) | ||
| 403 | goto err; | ||
| 404 | |||
| 405 | tmp_nid = EC_METHOD_get_field_type(EC_GROUP_method_of(key->group)); | ||
| 406 | |||
| 407 | if (tmp_nid == NID_X9_62_characteristic_two_field) | ||
| 408 | is_char_two = 1; | ||
| 409 | |||
| 410 | tx = BN_CTX_get(ctx); | ||
| 411 | ty = BN_CTX_get(ctx); | ||
| 412 | #ifndef OPENSSL_NO_EC2M | ||
| 413 | if (is_char_two) | ||
| 414 | { | ||
| 415 | if (!EC_POINT_set_affine_coordinates_GF2m(key->group, point, | ||
| 416 | x, y, ctx)) | ||
| 417 | goto err; | ||
| 418 | if (!EC_POINT_get_affine_coordinates_GF2m(key->group, point, | ||
| 419 | tx, ty, ctx)) | ||
| 420 | goto err; | ||
| 421 | } | ||
| 422 | else | ||
| 423 | #endif | ||
| 424 | { | ||
| 425 | if (!EC_POINT_set_affine_coordinates_GFp(key->group, point, | ||
| 426 | x, y, ctx)) | ||
| 427 | goto err; | ||
| 428 | if (!EC_POINT_get_affine_coordinates_GFp(key->group, point, | ||
| 429 | tx, ty, ctx)) | ||
| 430 | goto err; | ||
| 431 | } | ||
| 432 | /* Check if retrieved coordinates match originals: if not values | ||
| 433 | * are out of range. | ||
| 434 | */ | ||
| 435 | if (BN_cmp(x, tx) || BN_cmp(y, ty)) | ||
| 436 | { | ||
| 437 | ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES, | ||
| 438 | EC_R_COORDINATES_OUT_OF_RANGE); | ||
| 439 | goto err; | ||
| 440 | } | ||
| 441 | |||
| 442 | if (!EC_KEY_set_public_key(key, point)) | ||
| 443 | goto err; | ||
| 444 | |||
| 445 | if (EC_KEY_check_key(key) == 0) | ||
| 446 | goto err; | ||
| 447 | |||
| 448 | ok = 1; | ||
| 449 | |||
| 450 | err: | ||
| 451 | if (ctx) | ||
| 452 | BN_CTX_free(ctx); | ||
| 453 | if (point) | ||
| 454 | EC_POINT_free(point); | ||
| 455 | return ok; | ||
| 456 | |||
| 457 | } | ||
| 458 | |||
| 374 | const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key) | 459 | const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key) |
| 375 | { | 460 | { |
| 376 | return key->group; | 461 | return key->group; |
| @@ -461,3 +546,18 @@ int EC_KEY_precompute_mult(EC_KEY *key, BN_CTX *ctx) | |||
| 461 | return 0; | 546 | return 0; |
| 462 | return EC_GROUP_precompute_mult(key->group, ctx); | 547 | return EC_GROUP_precompute_mult(key->group, ctx); |
| 463 | } | 548 | } |
| 549 | |||
| 550 | int EC_KEY_get_flags(const EC_KEY *key) | ||
| 551 | { | ||
| 552 | return key->flags; | ||
| 553 | } | ||
| 554 | |||
| 555 | void EC_KEY_set_flags(EC_KEY *key, int flags) | ||
| 556 | { | ||
| 557 | key->flags |= flags; | ||
| 558 | } | ||
| 559 | |||
| 560 | void EC_KEY_clear_flags(EC_KEY *key, int flags) | ||
| 561 | { | ||
| 562 | key->flags &= ~flags; | ||
| 563 | } | ||
diff --git a/src/lib/libcrypto/ec/ec_oct.c b/src/lib/libcrypto/ec/ec_oct.c new file mode 100644 index 0000000000..fd9db0798d --- /dev/null +++ b/src/lib/libcrypto/ec/ec_oct.c | |||
| @@ -0,0 +1,199 @@ | |||
| 1 | /* crypto/ec/ec_lib.c */ | ||
| 2 | /* | ||
| 3 | * Originally written by Bodo Moeller for the OpenSSL project. | ||
| 4 | */ | ||
| 5 | /* ==================================================================== | ||
| 6 | * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved. | ||
| 7 | * | ||
| 8 | * Redistribution and use in source and binary forms, with or without | ||
| 9 | * modification, are permitted provided that the following conditions | ||
| 10 | * are met: | ||
| 11 | * | ||
| 12 | * 1. Redistributions of source code must retain the above copyright | ||
| 13 | * notice, this list of conditions and the following disclaimer. | ||
| 14 | * | ||
| 15 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 16 | * notice, this list of conditions and the following disclaimer in | ||
| 17 | * the documentation and/or other materials provided with the | ||
| 18 | * distribution. | ||
| 19 | * | ||
| 20 | * 3. All advertising materials mentioning features or use of this | ||
| 21 | * software must display the following acknowledgment: | ||
| 22 | * "This product includes software developed by the OpenSSL Project | ||
| 23 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 24 | * | ||
| 25 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 26 | * endorse or promote products derived from this software without | ||
| 27 | * prior written permission. For written permission, please contact | ||
| 28 | * openssl-core@openssl.org. | ||
| 29 | * | ||
| 30 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 31 | * nor may "OpenSSL" appear in their names without prior written | ||
| 32 | * permission of the OpenSSL Project. | ||
| 33 | * | ||
| 34 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 35 | * acknowledgment: | ||
| 36 | * "This product includes software developed by the OpenSSL Project | ||
| 37 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 38 | * | ||
| 39 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 40 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 42 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 43 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 44 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 45 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 46 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 47 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 48 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 49 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 50 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 51 | * ==================================================================== | ||
| 52 | * | ||
| 53 | * This product includes cryptographic software written by Eric Young | ||
| 54 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
| 55 | * Hudson (tjh@cryptsoft.com). | ||
| 56 | * | ||
| 57 | */ | ||
| 58 | /* ==================================================================== | ||
| 59 | * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. | ||
| 60 | * Binary polynomial ECC support in OpenSSL originally developed by | ||
| 61 | * SUN MICROSYSTEMS, INC., and contributed to the OpenSSL project. | ||
| 62 | */ | ||
| 63 | |||
| 64 | #include <string.h> | ||
| 65 | |||
| 66 | #include <openssl/err.h> | ||
| 67 | #include <openssl/opensslv.h> | ||
| 68 | |||
| 69 | #include "ec_lcl.h" | ||
| 70 | |||
| 71 | int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point, | ||
| 72 | const BIGNUM *x, int y_bit, BN_CTX *ctx) | ||
| 73 | { | ||
| 74 | if (group->meth->point_set_compressed_coordinates == 0 | ||
| 75 | && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) | ||
| 76 | { | ||
| 77 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); | ||
| 78 | return 0; | ||
| 79 | } | ||
| 80 | if (group->meth != point->meth) | ||
| 81 | { | ||
| 82 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS); | ||
| 83 | return 0; | ||
| 84 | } | ||
| 85 | if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) | ||
| 86 | { | ||
| 87 | if (group->meth->field_type == NID_X9_62_prime_field) | ||
| 88 | return ec_GFp_simple_set_compressed_coordinates( | ||
| 89 | group, point, x, y_bit, ctx); | ||
| 90 | else | ||
| 91 | #ifdef OPENSSL_NO_EC2M | ||
| 92 | { | ||
| 93 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_GF2M_NOT_SUPPORTED); | ||
| 94 | return 0; | ||
| 95 | } | ||
| 96 | #else | ||
| 97 | return ec_GF2m_simple_set_compressed_coordinates( | ||
| 98 | group, point, x, y_bit, ctx); | ||
| 99 | #endif | ||
| 100 | } | ||
| 101 | return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx); | ||
| 102 | } | ||
| 103 | |||
| 104 | #ifndef OPENSSL_NO_EC2M | ||
| 105 | int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point, | ||
| 106 | const BIGNUM *x, int y_bit, BN_CTX *ctx) | ||
| 107 | { | ||
| 108 | if (group->meth->point_set_compressed_coordinates == 0 | ||
| 109 | && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) | ||
| 110 | { | ||
| 111 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); | ||
| 112 | return 0; | ||
| 113 | } | ||
| 114 | if (group->meth != point->meth) | ||
| 115 | { | ||
| 116 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS); | ||
| 117 | return 0; | ||
| 118 | } | ||
| 119 | if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) | ||
| 120 | { | ||
| 121 | if (group->meth->field_type == NID_X9_62_prime_field) | ||
| 122 | return ec_GFp_simple_set_compressed_coordinates( | ||
| 123 | group, point, x, y_bit, ctx); | ||
| 124 | else | ||
| 125 | return ec_GF2m_simple_set_compressed_coordinates( | ||
| 126 | group, point, x, y_bit, ctx); | ||
| 127 | } | ||
| 128 | return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx); | ||
| 129 | } | ||
| 130 | #endif | ||
| 131 | |||
| 132 | size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, | ||
| 133 | unsigned char *buf, size_t len, BN_CTX *ctx) | ||
| 134 | { | ||
| 135 | if (group->meth->point2oct == 0 | ||
| 136 | && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) | ||
| 137 | { | ||
| 138 | ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); | ||
| 139 | return 0; | ||
| 140 | } | ||
| 141 | if (group->meth != point->meth) | ||
| 142 | { | ||
| 143 | ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS); | ||
| 144 | return 0; | ||
| 145 | } | ||
| 146 | if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) | ||
| 147 | { | ||
| 148 | if (group->meth->field_type == NID_X9_62_prime_field) | ||
| 149 | return ec_GFp_simple_point2oct(group, point, | ||
| 150 | form, buf, len, ctx); | ||
| 151 | else | ||
| 152 | #ifdef OPENSSL_NO_EC2M | ||
| 153 | { | ||
| 154 | ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_GF2M_NOT_SUPPORTED); | ||
| 155 | return 0; | ||
| 156 | } | ||
| 157 | #else | ||
| 158 | return ec_GF2m_simple_point2oct(group, point, | ||
| 159 | form, buf, len, ctx); | ||
| 160 | #endif | ||
| 161 | } | ||
| 162 | |||
| 163 | return group->meth->point2oct(group, point, form, buf, len, ctx); | ||
| 164 | } | ||
| 165 | |||
| 166 | |||
| 167 | int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point, | ||
| 168 | const unsigned char *buf, size_t len, BN_CTX *ctx) | ||
| 169 | { | ||
| 170 | if (group->meth->oct2point == 0 | ||
| 171 | && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) | ||
| 172 | { | ||
| 173 | ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); | ||
| 174 | return 0; | ||
| 175 | } | ||
| 176 | if (group->meth != point->meth) | ||
| 177 | { | ||
| 178 | ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS); | ||
| 179 | return 0; | ||
| 180 | } | ||
| 181 | if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) | ||
| 182 | { | ||
| 183 | if (group->meth->field_type == NID_X9_62_prime_field) | ||
| 184 | return ec_GFp_simple_oct2point(group, point, | ||
| 185 | buf, len, ctx); | ||
| 186 | else | ||
| 187 | #ifdef OPENSSL_NO_EC2M | ||
| 188 | { | ||
| 189 | ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_GF2M_NOT_SUPPORTED); | ||
| 190 | return 0; | ||
| 191 | } | ||
| 192 | #else | ||
| 193 | return ec_GF2m_simple_oct2point(group, point, | ||
| 194 | buf, len, ctx); | ||
| 195 | #endif | ||
| 196 | } | ||
| 197 | return group->meth->oct2point(group, point, buf, len, ctx); | ||
| 198 | } | ||
| 199 | |||
diff --git a/src/lib/libcrypto/ec/ec_pmeth.c b/src/lib/libcrypto/ec/ec_pmeth.c index f433076ca1..d1ed66c37e 100644 --- a/src/lib/libcrypto/ec/ec_pmeth.c +++ b/src/lib/libcrypto/ec/ec_pmeth.c | |||
| @@ -221,6 +221,7 @@ static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
| 221 | 221 | ||
| 222 | case EVP_PKEY_CTRL_MD: | 222 | case EVP_PKEY_CTRL_MD: |
| 223 | if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 && | 223 | if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 && |
| 224 | EVP_MD_type((const EVP_MD *)p2) != NID_ecdsa_with_SHA1 && | ||
| 224 | EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && | 225 | EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && |
| 225 | EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && | 226 | EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && |
| 226 | EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && | 227 | EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && |
diff --git a/src/lib/libcrypto/ec/eck_prn.c b/src/lib/libcrypto/ec/eck_prn.c index 7d3e175ae7..06de8f3959 100644 --- a/src/lib/libcrypto/ec/eck_prn.c +++ b/src/lib/libcrypto/ec/eck_prn.c | |||
| @@ -207,7 +207,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off) | |||
| 207 | reason = ERR_R_MALLOC_FAILURE; | 207 | reason = ERR_R_MALLOC_FAILURE; |
| 208 | goto err; | 208 | goto err; |
| 209 | } | 209 | } |
| 210 | 210 | #ifndef OPENSSL_NO_EC2M | |
| 211 | if (is_char_two) | 211 | if (is_char_two) |
| 212 | { | 212 | { |
| 213 | if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx)) | 213 | if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx)) |
| @@ -217,6 +217,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off) | |||
| 217 | } | 217 | } |
| 218 | } | 218 | } |
| 219 | else /* prime field */ | 219 | else /* prime field */ |
| 220 | #endif | ||
| 220 | { | 221 | { |
| 221 | if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx)) | 222 | if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx)) |
| 222 | { | 223 | { |
diff --git a/src/lib/libcrypto/ec/ecp_nistp224.c b/src/lib/libcrypto/ec/ecp_nistp224.c new file mode 100644 index 0000000000..b5ff56c252 --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistp224.c | |||
| @@ -0,0 +1,1658 @@ | |||
| 1 | /* crypto/ec/ecp_nistp224.c */ | ||
| 2 | /* | ||
| 3 | * Written by Emilia Kasper (Google) for the OpenSSL project. | ||
| 4 | */ | ||
| 5 | /* Copyright 2011 Google Inc. | ||
| 6 | * | ||
| 7 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| 8 | * | ||
| 9 | * you may not use this file except in compliance with the License. | ||
| 10 | * You may obtain a copy of the License at | ||
| 11 | * | ||
| 12 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
| 13 | * | ||
| 14 | * Unless required by applicable law or agreed to in writing, software | ||
| 15 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
| 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| 17 | * See the License for the specific language governing permissions and | ||
| 18 | * limitations under the License. | ||
| 19 | */ | ||
| 20 | |||
| 21 | /* | ||
| 22 | * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication | ||
| 23 | * | ||
| 24 | * Inspired by Daniel J. Bernstein's public domain nistp224 implementation | ||
| 25 | * and Adam Langley's public domain 64-bit C implementation of curve25519 | ||
| 26 | */ | ||
| 27 | |||
| 28 | #include <openssl/opensslconf.h> | ||
| 29 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 | ||
| 30 | |||
| 31 | #ifndef OPENSSL_SYS_VMS | ||
| 32 | #include <stdint.h> | ||
| 33 | #else | ||
| 34 | #include <inttypes.h> | ||
| 35 | #endif | ||
| 36 | |||
| 37 | #include <string.h> | ||
| 38 | #include <openssl/err.h> | ||
| 39 | #include "ec_lcl.h" | ||
| 40 | |||
| 41 | #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) | ||
| 42 | /* even with gcc, the typedef won't work for 32-bit platforms */ | ||
| 43 | typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ | ||
| 44 | #else | ||
| 45 | #error "Need GCC 3.1 or later to define type uint128_t" | ||
| 46 | #endif | ||
| 47 | |||
| 48 | typedef uint8_t u8; | ||
| 49 | typedef uint64_t u64; | ||
| 50 | typedef int64_t s64; | ||
| 51 | |||
| 52 | |||
| 53 | /******************************************************************************/ | ||
| 54 | /* INTERNAL REPRESENTATION OF FIELD ELEMENTS | ||
| 55 | * | ||
| 56 | * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3 | ||
| 57 | * using 64-bit coefficients called 'limbs', | ||
| 58 | * and sometimes (for multiplication results) as | ||
| 59 | * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6 | ||
| 60 | * using 128-bit coefficients called 'widelimbs'. | ||
| 61 | * A 4-limb representation is an 'felem'; | ||
| 62 | * a 7-widelimb representation is a 'widefelem'. | ||
| 63 | * Even within felems, bits of adjacent limbs overlap, and we don't always | ||
| 64 | * reduce the representations: we ensure that inputs to each felem | ||
| 65 | * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60, | ||
| 66 | * and fit into a 128-bit word without overflow. The coefficients are then | ||
| 67 | * again partially reduced to obtain an felem satisfying a_i < 2^57. | ||
| 68 | * We only reduce to the unique minimal representation at the end of the | ||
| 69 | * computation. | ||
| 70 | */ | ||
| 71 | |||
| 72 | typedef uint64_t limb; | ||
| 73 | typedef uint128_t widelimb; | ||
| 74 | |||
| 75 | typedef limb felem[4]; | ||
| 76 | typedef widelimb widefelem[7]; | ||
| 77 | |||
| 78 | /* Field element represented as a byte arrary. | ||
| 79 | * 28*8 = 224 bits is also the group order size for the elliptic curve, | ||
| 80 | * and we also use this type for scalars for point multiplication. | ||
| 81 | */ | ||
| 82 | typedef u8 felem_bytearray[28]; | ||
| 83 | |||
| 84 | static const felem_bytearray nistp224_curve_params[5] = { | ||
| 85 | {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, /* p */ | ||
| 86 | 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00, | ||
| 87 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01}, | ||
| 88 | {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, /* a */ | ||
| 89 | 0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFF,0xFF, | ||
| 90 | 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE}, | ||
| 91 | {0xB4,0x05,0x0A,0x85,0x0C,0x04,0xB3,0xAB,0xF5,0x41, /* b */ | ||
| 92 | 0x32,0x56,0x50,0x44,0xB0,0xB7,0xD7,0xBF,0xD8,0xBA, | ||
| 93 | 0x27,0x0B,0x39,0x43,0x23,0x55,0xFF,0xB4}, | ||
| 94 | {0xB7,0x0E,0x0C,0xBD,0x6B,0xB4,0xBF,0x7F,0x32,0x13, /* x */ | ||
| 95 | 0x90,0xB9,0x4A,0x03,0xC1,0xD3,0x56,0xC2,0x11,0x22, | ||
| 96 | 0x34,0x32,0x80,0xD6,0x11,0x5C,0x1D,0x21}, | ||
| 97 | {0xbd,0x37,0x63,0x88,0xb5,0xf7,0x23,0xfb,0x4c,0x22, /* y */ | ||
| 98 | 0xdf,0xe6,0xcd,0x43,0x75,0xa0,0x5a,0x07,0x47,0x64, | ||
| 99 | 0x44,0xd5,0x81,0x99,0x85,0x00,0x7e,0x34} | ||
| 100 | }; | ||
| 101 | |||
| 102 | /* Precomputed multiples of the standard generator | ||
| 103 | * Points are given in coordinates (X, Y, Z) where Z normally is 1 | ||
| 104 | * (0 for the point at infinity). | ||
| 105 | * For each field element, slice a_0 is word 0, etc. | ||
| 106 | * | ||
| 107 | * The table has 2 * 16 elements, starting with the following: | ||
| 108 | * index | bits | point | ||
| 109 | * ------+---------+------------------------------ | ||
| 110 | * 0 | 0 0 0 0 | 0G | ||
| 111 | * 1 | 0 0 0 1 | 1G | ||
| 112 | * 2 | 0 0 1 0 | 2^56G | ||
| 113 | * 3 | 0 0 1 1 | (2^56 + 1)G | ||
| 114 | * 4 | 0 1 0 0 | 2^112G | ||
| 115 | * 5 | 0 1 0 1 | (2^112 + 1)G | ||
| 116 | * 6 | 0 1 1 0 | (2^112 + 2^56)G | ||
| 117 | * 7 | 0 1 1 1 | (2^112 + 2^56 + 1)G | ||
| 118 | * 8 | 1 0 0 0 | 2^168G | ||
| 119 | * 9 | 1 0 0 1 | (2^168 + 1)G | ||
| 120 | * 10 | 1 0 1 0 | (2^168 + 2^56)G | ||
| 121 | * 11 | 1 0 1 1 | (2^168 + 2^56 + 1)G | ||
| 122 | * 12 | 1 1 0 0 | (2^168 + 2^112)G | ||
| 123 | * 13 | 1 1 0 1 | (2^168 + 2^112 + 1)G | ||
| 124 | * 14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G | ||
| 125 | * 15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G | ||
| 126 | * followed by a copy of this with each element multiplied by 2^28. | ||
| 127 | * | ||
| 128 | * The reason for this is so that we can clock bits into four different | ||
| 129 | * locations when doing simple scalar multiplies against the base point, | ||
| 130 | * and then another four locations using the second 16 elements. | ||
| 131 | */ | ||
| 132 | static const felem gmul[2][16][3] = | ||
| 133 | {{{{0, 0, 0, 0}, | ||
| 134 | {0, 0, 0, 0}, | ||
| 135 | {0, 0, 0, 0}}, | ||
| 136 | {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf}, | ||
| 137 | {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723}, | ||
| 138 | {1, 0, 0, 0}}, | ||
| 139 | {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5}, | ||
| 140 | {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321}, | ||
| 141 | {1, 0, 0, 0}}, | ||
| 142 | {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748}, | ||
| 143 | {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17}, | ||
| 144 | {1, 0, 0, 0}}, | ||
| 145 | {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe}, | ||
| 146 | {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b}, | ||
| 147 | {1, 0, 0, 0}}, | ||
| 148 | {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3}, | ||
| 149 | {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a}, | ||
| 150 | {1, 0, 0, 0}}, | ||
| 151 | {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c}, | ||
| 152 | {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244}, | ||
| 153 | {1, 0, 0, 0}}, | ||
| 154 | {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849}, | ||
| 155 | {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112}, | ||
| 156 | {1, 0, 0, 0}}, | ||
| 157 | {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47}, | ||
| 158 | {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394}, | ||
| 159 | {1, 0, 0, 0}}, | ||
| 160 | {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d}, | ||
| 161 | {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7}, | ||
| 162 | {1, 0, 0, 0}}, | ||
| 163 | {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24}, | ||
| 164 | {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881}, | ||
| 165 | {1, 0, 0, 0}}, | ||
| 166 | {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984}, | ||
| 167 | {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369}, | ||
| 168 | {1, 0, 0, 0}}, | ||
| 169 | {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3}, | ||
| 170 | {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60}, | ||
| 171 | {1, 0, 0, 0}}, | ||
| 172 | {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057}, | ||
| 173 | {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9}, | ||
| 174 | {1, 0, 0, 0}}, | ||
| 175 | {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9}, | ||
| 176 | {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc}, | ||
| 177 | {1, 0, 0, 0}}, | ||
| 178 | {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58}, | ||
| 179 | {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558}, | ||
| 180 | {1, 0, 0, 0}}}, | ||
| 181 | {{{0, 0, 0, 0}, | ||
| 182 | {0, 0, 0, 0}, | ||
| 183 | {0, 0, 0, 0}}, | ||
| 184 | {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31}, | ||
| 185 | {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d}, | ||
| 186 | {1, 0, 0, 0}}, | ||
| 187 | {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3}, | ||
| 188 | {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a}, | ||
| 189 | {1, 0, 0, 0}}, | ||
| 190 | {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33}, | ||
| 191 | {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100}, | ||
| 192 | {1, 0, 0, 0}}, | ||
| 193 | {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5}, | ||
| 194 | {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea}, | ||
| 195 | {1, 0, 0, 0}}, | ||
| 196 | {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be}, | ||
| 197 | {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51}, | ||
| 198 | {1, 0, 0, 0}}, | ||
| 199 | {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1}, | ||
| 200 | {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb}, | ||
| 201 | {1, 0, 0, 0}}, | ||
| 202 | {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233}, | ||
| 203 | {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def}, | ||
| 204 | {1, 0, 0, 0}}, | ||
| 205 | {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae}, | ||
| 206 | {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45}, | ||
| 207 | {1, 0, 0, 0}}, | ||
| 208 | {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e}, | ||
| 209 | {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb}, | ||
| 210 | {1, 0, 0, 0}}, | ||
| 211 | {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de}, | ||
| 212 | {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3}, | ||
| 213 | {1, 0, 0, 0}}, | ||
| 214 | {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05}, | ||
| 215 | {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58}, | ||
| 216 | {1, 0, 0, 0}}, | ||
| 217 | {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb}, | ||
| 218 | {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0}, | ||
| 219 | {1, 0, 0, 0}}, | ||
| 220 | {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9}, | ||
| 221 | {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea}, | ||
| 222 | {1, 0, 0, 0}}, | ||
| 223 | {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba}, | ||
| 224 | {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405}, | ||
| 225 | {1, 0, 0, 0}}, | ||
| 226 | {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e}, | ||
| 227 | {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e}, | ||
| 228 | {1, 0, 0, 0}}}}; | ||
| 229 | |||
| 230 | /* Precomputation for the group generator. */ | ||
| 231 | typedef struct { | ||
| 232 | felem g_pre_comp[2][16][3]; | ||
| 233 | int references; | ||
| 234 | } NISTP224_PRE_COMP; | ||
| 235 | |||
| 236 | const EC_METHOD *EC_GFp_nistp224_method(void) | ||
| 237 | { | ||
| 238 | static const EC_METHOD ret = { | ||
| 239 | EC_FLAGS_DEFAULT_OCT, | ||
| 240 | NID_X9_62_prime_field, | ||
| 241 | ec_GFp_nistp224_group_init, | ||
| 242 | ec_GFp_simple_group_finish, | ||
| 243 | ec_GFp_simple_group_clear_finish, | ||
| 244 | ec_GFp_nist_group_copy, | ||
| 245 | ec_GFp_nistp224_group_set_curve, | ||
| 246 | ec_GFp_simple_group_get_curve, | ||
| 247 | ec_GFp_simple_group_get_degree, | ||
| 248 | ec_GFp_simple_group_check_discriminant, | ||
| 249 | ec_GFp_simple_point_init, | ||
| 250 | ec_GFp_simple_point_finish, | ||
| 251 | ec_GFp_simple_point_clear_finish, | ||
| 252 | ec_GFp_simple_point_copy, | ||
| 253 | ec_GFp_simple_point_set_to_infinity, | ||
| 254 | ec_GFp_simple_set_Jprojective_coordinates_GFp, | ||
| 255 | ec_GFp_simple_get_Jprojective_coordinates_GFp, | ||
| 256 | ec_GFp_simple_point_set_affine_coordinates, | ||
| 257 | ec_GFp_nistp224_point_get_affine_coordinates, | ||
| 258 | 0 /* point_set_compressed_coordinates */, | ||
| 259 | 0 /* point2oct */, | ||
| 260 | 0 /* oct2point */, | ||
| 261 | ec_GFp_simple_add, | ||
| 262 | ec_GFp_simple_dbl, | ||
| 263 | ec_GFp_simple_invert, | ||
| 264 | ec_GFp_simple_is_at_infinity, | ||
| 265 | ec_GFp_simple_is_on_curve, | ||
| 266 | ec_GFp_simple_cmp, | ||
| 267 | ec_GFp_simple_make_affine, | ||
| 268 | ec_GFp_simple_points_make_affine, | ||
| 269 | ec_GFp_nistp224_points_mul, | ||
| 270 | ec_GFp_nistp224_precompute_mult, | ||
| 271 | ec_GFp_nistp224_have_precompute_mult, | ||
| 272 | ec_GFp_nist_field_mul, | ||
| 273 | ec_GFp_nist_field_sqr, | ||
| 274 | 0 /* field_div */, | ||
| 275 | 0 /* field_encode */, | ||
| 276 | 0 /* field_decode */, | ||
| 277 | 0 /* field_set_to_one */ }; | ||
| 278 | |||
| 279 | return &ret; | ||
| 280 | } | ||
| 281 | |||
| 282 | /* Helper functions to convert field elements to/from internal representation */ | ||
| 283 | static void bin28_to_felem(felem out, const u8 in[28]) | ||
| 284 | { | ||
| 285 | out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff; | ||
| 286 | out[1] = (*((const uint64_t *)(in+7))) & 0x00ffffffffffffff; | ||
| 287 | out[2] = (*((const uint64_t *)(in+14))) & 0x00ffffffffffffff; | ||
| 288 | out[3] = (*((const uint64_t *)(in+21))) & 0x00ffffffffffffff; | ||
| 289 | } | ||
| 290 | |||
| 291 | static void felem_to_bin28(u8 out[28], const felem in) | ||
| 292 | { | ||
| 293 | unsigned i; | ||
| 294 | for (i = 0; i < 7; ++i) | ||
| 295 | { | ||
| 296 | out[i] = in[0]>>(8*i); | ||
| 297 | out[i+7] = in[1]>>(8*i); | ||
| 298 | out[i+14] = in[2]>>(8*i); | ||
| 299 | out[i+21] = in[3]>>(8*i); | ||
| 300 | } | ||
| 301 | } | ||
| 302 | |||
| 303 | /* To preserve endianness when using BN_bn2bin and BN_bin2bn */ | ||
| 304 | static void flip_endian(u8 *out, const u8 *in, unsigned len) | ||
| 305 | { | ||
| 306 | unsigned i; | ||
| 307 | for (i = 0; i < len; ++i) | ||
| 308 | out[i] = in[len-1-i]; | ||
| 309 | } | ||
| 310 | |||
/* From OpenSSL BIGNUM to internal representation */
/*
 * Convert |bn| into four 56-bit limbs in |out|.
 * |bn| must be non-negative and at most 28 bytes (i.e. < 2^224);
 * otherwise EC_R_BIGNUM_OUT_OF_RANGE is raised.
 * Returns 1 on success, 0 on failure.
 */
static int BN_to_felem(felem out, const BIGNUM *bn)
	{
	felem_bytearray b_in;
	felem_bytearray b_out;
	unsigned num_bytes;

	/* BN_bn2bin eats leading zeroes */
	memset(b_out, 0, sizeof b_out);
	num_bytes = BN_num_bytes(bn);
	if (num_bytes > sizeof b_out)
		{
		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
		return 0;
		}
	if (BN_is_negative(bn))
		{
		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
		return 0;
		}
	num_bytes = BN_bn2bin(bn, b_in);
	/* BN_bn2bin is big-endian; convert to the little-endian layout */
	flip_endian(b_out, b_in, num_bytes);
	bin28_to_felem(out, b_out);
	return 1;
	}
| 336 | |||
/* From internal representation to OpenSSL BIGNUM */
/*
 * Returns the result of BN_bin2bn: |out| on success, NULL on error.
 */
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
	{
	felem_bytearray b_in, b_out;
	felem_to_bin28(b_in, in);
	/* BN_bin2bn expects big-endian input */
	flip_endian(b_out, b_in, sizeof b_out);
	return BN_bin2bn(b_out, sizeof b_out, out);
	}
| 345 | |||
| 346 | /******************************************************************************/ | ||
| 347 | /* FIELD OPERATIONS | ||
| 348 | * | ||
| 349 | * Field operations, using the internal representation of field elements. | ||
| 350 | * NB! These operations are specific to our point multiplication and cannot be | ||
| 351 | * expected to be correct in general - e.g., multiplication with a large scalar | ||
| 352 | * will cause an overflow. | ||
| 353 | * | ||
| 354 | */ | ||
| 355 | |||
| 356 | static void felem_one(felem out) | ||
| 357 | { | ||
| 358 | out[0] = 1; | ||
| 359 | out[1] = 0; | ||
| 360 | out[2] = 0; | ||
| 361 | out[3] = 0; | ||
| 362 | } | ||
| 363 | |||
| 364 | static void felem_assign(felem out, const felem in) | ||
| 365 | { | ||
| 366 | out[0] = in[0]; | ||
| 367 | out[1] = in[1]; | ||
| 368 | out[2] = in[2]; | ||
| 369 | out[3] = in[3]; | ||
| 370 | } | ||
| 371 | |||
| 372 | /* Sum two field elements: out += in */ | ||
| 373 | static void felem_sum(felem out, const felem in) | ||
| 374 | { | ||
| 375 | out[0] += in[0]; | ||
| 376 | out[1] += in[1]; | ||
| 377 | out[2] += in[2]; | ||
| 378 | out[3] += in[3]; | ||
| 379 | } | ||
| 380 | |||
/* Get negative value: out = -in */
/* Assumes in[i] < 2^57 */
static void felem_neg(felem out, const felem in)
	{
	/* These constants are the limbs of a representation of
	 * 0 mod 2^224-2^96+1, chosen large enough that subtracting any
	 * in[i] < 2^57 never underflows. */
	static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
	static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
	static const limb two58m42m2 = (((limb) 1) << 58) -
		(((limb) 1) << 42) - (((limb) 1) << 2);

	/* Set to 0 mod 2^224-2^96+1 to ensure out > in */
	out[0] = two58p2 - in[0];
	out[1] = two58m42m2 - in[1];
	out[2] = two58m2 - in[2];
	out[3] = two58m2 - in[3];
	}
| 396 | |||
/* Subtract field elements: out -= in */
/* Assumes in[i] < 2^57 */
static void felem_diff(felem out, const felem in)
	{
	/* Limbs of a representation of 0 mod 2^224-2^96+1 (see felem_neg);
	 * added first so the per-limb subtractions cannot underflow. */
	static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
	static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
	static const limb two58m42m2 = (((limb) 1) << 58) -
		(((limb) 1) << 42) - (((limb) 1) << 2);

	/* Add 0 mod 2^224-2^96+1 to ensure out > in */
	out[0] += two58p2;
	out[1] += two58m42m2;
	out[2] += two58m2;
	out[3] += two58m2;

	out[0] -= in[0];
	out[1] -= in[1];
	out[2] -= in[2];
	out[3] -= in[3];
	}
| 417 | |||
/* Subtract in unreduced 128-bit mode: out -= in */
/* Assumes in[i] < 2^119 */
static void widefelem_diff(widefelem out, const widefelem in)
	{
	/* Limbs of a representation of 0 mod 2^224-2^96+1 in the 7-limb
	 * wide (unreduced) form, large enough that subtracting
	 * in[i] < 2^119 never underflows. */
	static const widelimb two120 = ((widelimb) 1) << 120;
	static const widelimb two120m64 = (((widelimb) 1) << 120) -
		(((widelimb) 1) << 64);
	static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
		(((widelimb) 1) << 104) - (((widelimb) 1) << 64);

	/* Add 0 mod 2^224-2^96+1 to ensure out > in */
	out[0] += two120;
	out[1] += two120m64;
	out[2] += two120m64;
	out[3] += two120;
	out[4] += two120m104m64;
	out[5] += two120m64;
	out[6] += two120m64;

	out[0] -= in[0];
	out[1] -= in[1];
	out[2] -= in[2];
	out[3] -= in[3];
	out[4] -= in[4];
	out[5] -= in[5];
	out[6] -= in[6];
	}
| 445 | |||
/* Subtract in mixed mode: out128 -= in64 */
/* in[i] < 2^63 */
static void felem_diff_128_64(widefelem out, const felem in)
	{
	/* Limbs of a representation of 0 mod 2^224-2^96+1, sized so that
	 * subtracting in[i] < 2^63 from the wide limbs never underflows. */
	static const widelimb two64p8 = (((widelimb) 1) << 64) +
		(((widelimb) 1) << 8);
	static const widelimb two64m8 = (((widelimb) 1) << 64) -
		(((widelimb) 1) << 8);
	static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
		(((widelimb) 1) << 48) - (((widelimb) 1) << 8);

	/* Add 0 mod 2^224-2^96+1 to ensure out > in */
	out[0] += two64p8;
	out[1] += two64m48m8;
	out[2] += two64m8;
	out[3] += two64m8;

	out[0] -= in[0];
	out[1] -= in[1];
	out[2] -= in[2];
	out[3] -= in[3];
	}
| 468 | |||
| 469 | /* Multiply a field element by a scalar: out = out * scalar | ||
| 470 | * The scalars we actually use are small, so results fit without overflow */ | ||
| 471 | static void felem_scalar(felem out, const limb scalar) | ||
| 472 | { | ||
| 473 | out[0] *= scalar; | ||
| 474 | out[1] *= scalar; | ||
| 475 | out[2] *= scalar; | ||
| 476 | out[3] *= scalar; | ||
| 477 | } | ||
| 478 | |||
| 479 | /* Multiply an unreduced field element by a scalar: out = out * scalar | ||
| 480 | * The scalars we actually use are small, so results fit without overflow */ | ||
| 481 | static void widefelem_scalar(widefelem out, const widelimb scalar) | ||
| 482 | { | ||
| 483 | out[0] *= scalar; | ||
| 484 | out[1] *= scalar; | ||
| 485 | out[2] *= scalar; | ||
| 486 | out[3] *= scalar; | ||
| 487 | out[4] *= scalar; | ||
| 488 | out[5] *= scalar; | ||
| 489 | out[6] *= scalar; | ||
| 490 | } | ||
| 491 | |||
/* Square a field element: out = in^2 */
/* Schoolbook squaring into seven unreduced 128-bit coefficients;
 * tmp0..tmp2 pre-double the limbs so each symmetric cross product
 * in[i]*in[j] (i != j) is counted twice with a single multiply. */
static void felem_square(widefelem out, const felem in)
	{
	limb tmp0, tmp1, tmp2;
	tmp0 = 2 * in[0]; tmp1 = 2 * in[1]; tmp2 = 2 * in[2];
	out[0] = ((widelimb) in[0]) * in[0];
	out[1] = ((widelimb) in[0]) * tmp1;
	out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
	out[3] = ((widelimb) in[3]) * tmp0 +
		((widelimb) in[1]) * tmp2;
	out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
	out[5] = ((widelimb) in[3]) * tmp2;
	out[6] = ((widelimb) in[3]) * in[3];
	}
| 506 | |||
/* Multiply two field elements: out = in1 * in2 */
/* Schoolbook 4x4 multiplication into seven unreduced 128-bit
 * coefficients; out[k] collects every product in1[i]*in2[j] with
 * i + j == k. Caller is expected to felem_reduce the result. */
static void felem_mul(widefelem out, const felem in1, const felem in2)
	{
	out[0] = ((widelimb) in1[0]) * in2[0];
	out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
	out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
		((widelimb) in1[2]) * in2[0];
	out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
		((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
	out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
		((widelimb) in1[3]) * in2[1];
	out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
	out[6] = ((widelimb) in1[3]) * in2[3];
	}
| 521 | |||
/* Reduce seven 128-bit coefficients to four 64-bit coefficients.
 * Requires in[i] < 2^126,
 * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
/* The high limbs are folded down using 2^224 = 2^96 - 1 (mod p) for
 * p = 2^224 - 2^96 + 1: with 56-bit limbs that identity turns a
 * coefficient at position k+4 into "+ at position k+1 (shifted by 40
 * bits, i.e. 2^96) and - at position k". */
static void felem_reduce(felem out, const widefelem in)
	{
	static const widelimb two127p15 = (((widelimb) 1) << 127) +
		(((widelimb) 1) << 15);
	static const widelimb two127m71 = (((widelimb) 1) << 127) -
		(((widelimb) 1) << 71);
	static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
		(((widelimb) 1) << 71) - (((widelimb) 1) << 55);
	widelimb output[5];

	/* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
	output[0] = in[0] + two127p15;
	output[1] = in[1] + two127m71m55;
	output[2] = in[2] + two127m71;
	output[3] = in[3];
	output[4] = in[4];

	/* Eliminate in[4], in[5], in[6] */
	output[4] += in[6] >> 16;
	output[3] += (in[6] & 0xffff) << 40;
	output[2] -= in[6];

	output[3] += in[5] >> 16;
	output[2] += (in[5] & 0xffff) << 40;
	output[1] -= in[5];

	output[2] += output[4] >> 16;
	output[1] += (output[4] & 0xffff) << 40;
	output[0] -= output[4];

	/* Carry 2 -> 3 -> 4 */
	output[3] += output[2] >> 56;
	output[2] &= 0x00ffffffffffffff;

	output[4] = output[3] >> 56;
	output[3] &= 0x00ffffffffffffff;

	/* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */

	/* Eliminate output[4] */
	output[2] += output[4] >> 16;
	/* output[2] < 2^56 + 2^56 = 2^57 */
	output[1] += (output[4] & 0xffff) << 40;
	output[0] -= output[4];

	/* Carry 0 -> 1 -> 2 -> 3 */
	output[1] += output[0] >> 56;
	out[0] = output[0] & 0x00ffffffffffffff;

	output[2] += output[1] >> 56;
	/* output[2] < 2^57 + 2^72 */
	out[1] = output[1] & 0x00ffffffffffffff;
	output[3] += output[2] >> 56;
	/* output[3] <= 2^56 + 2^16 */
	out[2] = output[2] & 0x00ffffffffffffff;

	/* out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
	 * out[3] <= 2^56 + 2^16 (due to final carry),
	 * so out < 2*p */
	out[3] = output[3];
	}
| 586 | |||
| 587 | static void felem_square_reduce(felem out, const felem in) | ||
| 588 | { | ||
| 589 | widefelem tmp; | ||
| 590 | felem_square(tmp, in); | ||
| 591 | felem_reduce(out, tmp); | ||
| 592 | } | ||
| 593 | |||
| 594 | static void felem_mul_reduce(felem out, const felem in1, const felem in2) | ||
| 595 | { | ||
| 596 | widefelem tmp; | ||
| 597 | felem_mul(tmp, in1, in2); | ||
| 598 | felem_reduce(out, tmp); | ||
| 599 | } | ||
| 600 | |||
/* Reduce to unique minimal representation.
 * Requires 0 <= in < 2*p (always call felem_reduce first) */
/* Runs in constant time: the conditional subtraction of p is done
 * with arithmetic masks rather than branches. */
static void felem_contract(felem out, const felem in)
	{
	static const int64_t two56 = ((limb) 1) << 56;
	/* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
	/* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
	int64_t tmp[4], a;
	tmp[0] = in[0];
	tmp[1] = in[1];
	tmp[2] = in[2];
	tmp[3] = in[3];
	/* Case 1: a = 1 iff in >= 2^224 */
	a = (in[3] >> 56);
	tmp[0] -= a;
	tmp[1] += a << 40;
	tmp[3] &= 0x00ffffffffffffff;
	/* Case 2: a = 0 iff p <= in < 2^224, i.e.,
	 * the high 128 bits are all 1 and the lower part is non-zero */
	a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
		(((int64_t)(in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
	a &= 0x00ffffffffffffff;
	/* turn a into an all-one mask (if a = 0) or an all-zero mask */
	a = (a - 1) >> 63;
	/* subtract 2^224 - 2^96 + 1 if a is all-one*/
	tmp[3] &= a ^ 0xffffffffffffffff;
	tmp[2] &= a ^ 0xffffffffffffffff;
	tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
	tmp[0] -= 1 & a;

	/* eliminate negative coefficients: if tmp[0] is negative, tmp[1] must
	 * be non-zero, so we only need one step */
	a = tmp[0] >> 63;
	tmp[0] += two56 & a;
	tmp[1] -= 1 & a;

	/* carry 1 -> 2 -> 3 */
	tmp[2] += tmp[1] >> 56;
	tmp[1] &= 0x00ffffffffffffff;

	tmp[3] += tmp[2] >> 56;
	tmp[2] &= 0x00ffffffffffffff;

	/* Now 0 <= out < p */
	out[0] = tmp[0];
	out[1] = tmp[1];
	out[2] = tmp[2];
	out[3] = tmp[3];
	}
| 650 | |||
/* Zero-check: returns 1 if input is 0, and 0 otherwise.
 * We know that field elements are reduced to in < 2^225,
 * so we only need to check three cases: 0, 2^224 - 2^96 + 1,
 * and 2^225 - 2^97 + 2 */
/* Constant time: each candidate is compared by XOR-ing its limbs and
 * collapsing "all limbs equal" into a single bit via (x - 1) >> 63. */
static limb felem_is_zero(const felem in)
	{
	limb zero, two224m96p1, two225m97p2;

	zero = in[0] | in[1] | in[2] | in[3];
	zero = (((int64_t)(zero) - 1) >> 63) & 1;
	/* limbs of p = 2^224 - 2^96 + 1 */
	two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
		| (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
	two224m96p1 = (((int64_t)(two224m96p1) - 1) >> 63) & 1;
	/* limbs of 2*p = 2^225 - 2^97 + 2 */
	two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
		| (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
	two225m97p2 = (((int64_t)(two225m97p2) - 1) >> 63) & 1;
	return (zero | two224m96p1 | two225m97p2);
	}
| 669 | |||
| 670 | static limb felem_is_zero_int(const felem in) | ||
| 671 | { | ||
| 672 | return (int) (felem_is_zero(in) & ((limb)1)); | ||
| 673 | } | ||
| 674 | |||
/* Invert a field element */
/* Computation chain copied from djb's code */
/* Computes out = in^(p-2) mod p, with p-2 = 2^224 - 2^96 - 1 (see the
 * final step's annotation); by Fermat's little theorem this is the
 * inverse for non-zero in. The trailing comments track the exponent
 * accumulated so far. Fixed sequence of operations: constant time. */
static void felem_inv(felem out, const felem in)
	{
	felem ftmp, ftmp2, ftmp3, ftmp4;
	widefelem tmp;
	unsigned i;

	felem_square(tmp, in); felem_reduce(ftmp, tmp);		/* 2 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^2 - 1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp2, tmp);	/* 2^4 - 2 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);	/* 2^5 - 4 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);	/* 2^6 - 8 */
	felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp, tmp);	/* 2^6 - 1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp2, tmp);	/* 2^7 - 2 */
	for (i = 0; i < 5; ++i)	/* 2^12 - 2^6 */
		{
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
		}
	felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp2, tmp);	/* 2^12 - 1 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^13 - 2 */
	for (i = 0; i < 11; ++i)	/* 2^24 - 2^12 */
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp2, tmp);	/* 2^24 - 1 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^25 - 2 */
	for (i = 0; i < 23; ++i)	/* 2^48 - 2^24 */
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^48 - 1 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp4, tmp);	/* 2^49 - 2 */
	for (i = 0; i < 47; ++i)	/* 2^96 - 2^48 */
		{
		felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);
		}
	felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp);	/* 2^96 - 1 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp4, tmp);	/* 2^97 - 2 */
	for (i = 0; i < 23; ++i)	/* 2^120 - 2^24 */
		{
		felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);
		}
	felem_mul(tmp, ftmp2, ftmp4); felem_reduce(ftmp2, tmp);	/* 2^120 - 1 */
	for (i = 0; i < 6; ++i)	/* 2^126 - 2^6 */
		{
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
		}
	felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp, tmp);	/* 2^126 - 1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^127 - 2 */
	felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp);	/* 2^127 - 1 */
	for (i = 0; i < 97; ++i)	/* 2^224 - 2^97 */
		{
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
		}
	felem_mul(tmp, ftmp, ftmp3); felem_reduce(out, tmp);	/* 2^224 - 2^96 - 1 */
	}
| 734 | |||
| 735 | /* Copy in constant time: | ||
| 736 | * if icopy == 1, copy in to out, | ||
| 737 | * if icopy == 0, copy out to itself. */ | ||
| 738 | static void | ||
| 739 | copy_conditional(felem out, const felem in, limb icopy) | ||
| 740 | { | ||
| 741 | unsigned i; | ||
| 742 | /* icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one */ | ||
| 743 | const limb copy = -icopy; | ||
| 744 | for (i = 0; i < 4; ++i) | ||
| 745 | { | ||
| 746 | const limb tmp = copy & (in[i] ^ out[i]); | ||
| 747 | out[i] ^= tmp; | ||
| 748 | } | ||
| 749 | } | ||
| 750 | |||
| 751 | /******************************************************************************/ | ||
| 752 | /* ELLIPTIC CURVE POINT OPERATIONS | ||
| 753 | * | ||
| 754 | * Points are represented in Jacobian projective coordinates: | ||
| 755 | * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3), | ||
| 756 | * or to the point at infinity if Z == 0. | ||
| 757 | * | ||
| 758 | */ | ||
| 759 | |||
/* Double an elliptic curve point:
 * (X', Y', Z') = 2 * (X, Y, Z), where
 * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2
 * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested). */
/* The interleaved bound comments track limb magnitudes to show each
 * intermediate stays within what felem_reduce accepts (< 2^126). */
static void
point_double(felem x_out, felem y_out, felem z_out,
	     const felem x_in, const felem y_in, const felem z_in)
	{
	widefelem tmp, tmp2;
	felem delta, gamma, beta, alpha, ftmp, ftmp2;

	felem_assign(ftmp, x_in);
	felem_assign(ftmp2, x_in);

	/* delta = z^2 */
	felem_square(tmp, z_in);
	felem_reduce(delta, tmp);

	/* gamma = y^2 */
	felem_square(tmp, y_in);
	felem_reduce(gamma, tmp);

	/* beta = x*gamma */
	felem_mul(tmp, x_in, gamma);
	felem_reduce(beta, tmp);

	/* alpha = 3*(x-delta)*(x+delta) */
	felem_diff(ftmp, delta);
	/* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
	felem_sum(ftmp2, delta);
	/* ftmp2[i] < 2^57 + 2^57 = 2^58 */
	felem_scalar(ftmp2, 3);
	/* ftmp2[i] < 3 * 2^58 < 2^60 */
	felem_mul(tmp, ftmp, ftmp2);
	/* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
	felem_reduce(alpha, tmp);

	/* x' = alpha^2 - 8*beta */
	felem_square(tmp, alpha);
	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
	felem_assign(ftmp, beta);
	felem_scalar(ftmp, 8);
	/* ftmp[i] < 8 * 2^57 = 2^60 */
	felem_diff_128_64(tmp, ftmp);
	/* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
	felem_reduce(x_out, tmp);

	/* z' = (y + z)^2 - gamma - delta */
	felem_sum(delta, gamma);
	/* delta[i] < 2^57 + 2^57 = 2^58 */
	felem_assign(ftmp, y_in);
	felem_sum(ftmp, z_in);
	/* ftmp[i] < 2^57 + 2^57 = 2^58 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
	felem_diff_128_64(tmp, delta);
	/* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
	felem_reduce(z_out, tmp);

	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
	felem_scalar(beta, 4);
	/* beta[i] < 4 * 2^57 = 2^59 */
	felem_diff(beta, x_out);
	/* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
	felem_mul(tmp, alpha, beta);
	/* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
	felem_square(tmp2, gamma);
	/* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
	widefelem_scalar(tmp2, 8);
	/* tmp2[i] < 8 * 2^116 = 2^119 */
	widefelem_diff(tmp, tmp2);
	/* tmp[i] < 2^119 + 2^120 < 2^121 */
	felem_reduce(y_out, tmp);
	}
| 837 | |||
/* Add two elliptic curve points:
 * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 * Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 *
 * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 */

/* This function is not entirely constant-time:
 * it includes a branch for checking whether the two input points are equal,
 * (while not equal to the point at infinity).
 * This case never happens during single point multiplication,
 * so there is no timing leak for ECDH or ECDSA signing. */
static void point_add(felem x3, felem y3, felem z3,
	const felem x1, const felem y1, const felem z1,
	const int mixed, const felem x2, const felem y2, const felem z2)
	{
	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
	widefelem tmp, tmp2;
	limb z1_is_zero, z2_is_zero, x_equal, y_equal;

	if (!mixed)
		{
		/* ftmp2 = z2^2 */
		felem_square(tmp, z2);
		felem_reduce(ftmp2, tmp);

		/* ftmp4 = z2^3 */
		felem_mul(tmp, ftmp2, z2);
		felem_reduce(ftmp4, tmp);

		/* ftmp4 = z2^3*y1 */
		felem_mul(tmp2, ftmp4, y1);
		felem_reduce(ftmp4, tmp2);

		/* ftmp2 = z2^2*x1 */
		felem_mul(tmp2, ftmp2, x1);
		felem_reduce(ftmp2, tmp2);
		}
	else
		{
		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */

		/* ftmp4 = z2^3*y1 */
		felem_assign(ftmp4, y1);

		/* ftmp2 = z2^2*x1 */
		felem_assign(ftmp2, x1);
		}

	/* ftmp = z1^2 */
	felem_square(tmp, z1);
	felem_reduce(ftmp, tmp);

	/* ftmp3 = z1^3 */
	felem_mul(tmp, ftmp, z1);
	felem_reduce(ftmp3, tmp);

	/* tmp = z1^3*y2 */
	felem_mul(tmp, ftmp3, y2);
	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */

	/* ftmp3 = z1^3*y2 - z2^3*y1 */
	felem_diff_128_64(tmp, ftmp4);
	/* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
	felem_reduce(ftmp3, tmp);

	/* tmp = z1^2*x2 */
	felem_mul(tmp, ftmp, x2);
	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */

	/* ftmp = z1^2*x2 - z2^2*x1 */
	felem_diff_128_64(tmp, ftmp2);
	/* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
	felem_reduce(ftmp, tmp);

	/* the formulae are incorrect if the points are equal
	 * so we check for this and do doubling if this happens */
	x_equal = felem_is_zero(ftmp);
	y_equal = felem_is_zero(ftmp3);
	z1_is_zero = felem_is_zero(z1);
	z2_is_zero = felem_is_zero(z2);
	/* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
		{
		point_double(x3, y3, z3, x1, y1, z1);
		return;
		}

	/* ftmp5 = z1*z2 */
	if (!mixed)
		{
		felem_mul(tmp, z1, z2);
		felem_reduce(ftmp5, tmp);
		}
	else
		{
		/* special case z2 = 0 is handled later */
		felem_assign(ftmp5, z1);
		}

	/* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
	felem_mul(tmp, ftmp, ftmp5);
	felem_reduce(z_out, tmp);

	/* ftmp = (z1^2*x2 - z2^2*x1)^2 */
	felem_assign(ftmp5, ftmp);
	felem_square(tmp, ftmp);
	felem_reduce(ftmp, tmp);

	/* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
	felem_mul(tmp, ftmp, ftmp5);
	felem_reduce(ftmp5, tmp);

	/* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
	felem_mul(tmp, ftmp2, ftmp);
	felem_reduce(ftmp2, tmp);

	/* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
	felem_mul(tmp, ftmp4, ftmp5);
	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */

	/* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
	felem_square(tmp2, ftmp3);
	/* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */

	/* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
	felem_diff_128_64(tmp2, ftmp5);
	/* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */

	/* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
	felem_assign(ftmp5, ftmp2);
	felem_scalar(ftmp5, 2);
	/* ftmp5[i] < 2 * 2^57 = 2^58 */

	/* x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
	   2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
	felem_diff_128_64(tmp2, ftmp5);
	/* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
	felem_reduce(x_out, tmp2);

	/* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
	felem_diff(ftmp2, x_out);
	/* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */

	/* tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) */
	felem_mul(tmp2, ftmp3, ftmp2);
	/* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */

	/* y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
	   z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
	widefelem_diff(tmp2, tmp);
	/* tmp2[i] < 2^118 + 2^120 < 2^121 */
	felem_reduce(y_out, tmp2);

	/* the result (x_out, y_out, z_out) is incorrect if one of the inputs is
	 * the point at infinity, so we need to check for this separately */

	/* if point 1 is at infinity, copy point 2 to output, and vice versa */
	copy_conditional(x_out, x2, z1_is_zero);
	copy_conditional(x_out, x1, z2_is_zero);
	copy_conditional(y_out, y2, z1_is_zero);
	copy_conditional(y_out, y1, z2_is_zero);
	copy_conditional(z_out, z2, z1_is_zero);
	copy_conditional(z_out, z1, z2_is_zero);
	felem_assign(x3, x_out);
	felem_assign(y3, y_out);
	felem_assign(z3, z_out);
	}
| 1010 | |||
/* select_point selects the |idx|th point from a precomputation table and
 * copies it to out. */
/* Constant time: every table entry is read and OR-ed in under a mask,
 * so the memory access pattern does not depend on |idx|. */
static void select_point(const u64 idx, unsigned int size, const felem pre_comp[/*size*/][3], felem out[3])
	{
	unsigned i, j;
	limb *outlimbs = &out[0][0];
	memset(outlimbs, 0, 3 * sizeof(felem));

	for (i = 0; i < size; i++)
		{
		const limb *inlimbs = &pre_comp[i][0][0];
		/* fold i ^ idx down to bit 0, then decrement: mask becomes
		 * all-ones iff i == idx and all-zeros otherwise */
		u64 mask = i ^ idx;
		mask |= mask >> 4;
		mask |= mask >> 2;
		mask |= mask >> 1;
		mask &= 1;
		mask--;
		for (j = 0; j < 4 * 3; j++)
			outlimbs[j] |= inlimbs[j] & mask;
		}
	}
| 1032 | |||
| 1033 | /* get_bit returns the |i|th bit in |in| */ | ||
| 1034 | static char get_bit(const felem_bytearray in, unsigned i) | ||
| 1035 | { | ||
| 1036 | if (i >= 224) | ||
| 1037 | return 0; | ||
| 1038 | return (in[i >> 3] >> (i & 7)) & 1; | ||
| 1039 | } | ||
| 1040 | |||
/* Interleaved point multiplication using precomputed point multiples:
 * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
static void batch_mul(felem x_out, felem y_out, felem z_out,
	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[2][16][3])
	{
	int i, skip;
	unsigned num;
	unsigned gen_mul = (g_scalar != NULL);
	felem nq[3], tmp[4];
	u64 bits;
	u8 sign, digit;

	/* set nq to the point at infinity */
	memset(nq, 0, 3 * sizeof(felem));

	/* Loop over all scalars msb-to-lsb, interleaving additions
	 * of multiples of the generator (two in each of the last 28 rounds)
	 * and additions of other points multiples (every 5th round).
	 */
	skip = 1; /* save two point operations in the first round */
	for (i = (num_points ? 220 : 27); i >= 0; --i)
		{
		/* double */
		if (!skip)
			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

		/* add multiples of the generator */
		if (gen_mul && (i <= 27))
			{
			/* first, look 28 bits upwards: gather one bit from
			 * each 56-bit half-row of the scalar (comb method) */
			bits = get_bit(g_scalar, i + 196) << 3;
			bits |= get_bit(g_scalar, i + 140) << 2;
			bits |= get_bit(g_scalar, i + 84) << 1;
			bits |= get_bit(g_scalar, i + 28);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp[1], tmp);

			if (!skip)
				{
				point_add(nq[0], nq[1], nq[2],
					nq[0], nq[1], nq[2],
					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
				}
			else
				{
				/* nq is still infinity: take the selected
				 * point directly instead of adding to it */
				memcpy(nq, tmp, 3 * sizeof(felem));
				skip = 0;
				}

			/* second, look at the current position */
			bits = get_bit(g_scalar, i + 168) << 3;
			bits |= get_bit(g_scalar, i + 112) << 2;
			bits |= get_bit(g_scalar, i + 56) << 1;
			bits |= get_bit(g_scalar, i);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp[0], tmp);
			point_add(nq[0], nq[1], nq[2],
				nq[0], nq[1], nq[2],
				1 /* mixed */, tmp[0], tmp[1], tmp[2]);
			}

		/* do other additions every 5 doublings */
		if (num_points && (i % 5 == 0))
			{
			/* loop over all scalars */
			for (num = 0; num < num_points; ++num)
				{
				/* gather a signed 5-bit window; for i == 0,
				 * get_bit(..., i - 1) wraps to a huge unsigned
				 * index and safely returns 0 */
				bits = get_bit(scalars[num], i + 4) << 5;
				bits |= get_bit(scalars[num], i + 3) << 4;
				bits |= get_bit(scalars[num], i + 2) << 3;
				bits |= get_bit(scalars[num], i + 1) << 2;
				bits |= get_bit(scalars[num], i) << 1;
				bits |= get_bit(scalars[num], i - 1);
				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

				/* select the point to add or subtract */
				select_point(digit, 17, pre_comp[num], tmp);
				felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
				copy_conditional(tmp[1], tmp[3], sign);

				if (!skip)
					{
					point_add(nq[0], nq[1], nq[2],
						nq[0], nq[1], nq[2],
						mixed, tmp[0], tmp[1], tmp[2]);
					}
				else
					{
					memcpy(nq, tmp, 3 * sizeof(felem));
					skip = 0;
					}
				}
			}
		}
	felem_assign(x_out, nq[0]);
	felem_assign(y_out, nq[1]);
	felem_assign(z_out, nq[2]);
	}
| 1143 | |||
| 1144 | /******************************************************************************/ | ||
| 1145 | /* FUNCTIONS TO MANAGE PRECOMPUTATION | ||
| 1146 | */ | ||
| 1147 | |||
| 1148 | static NISTP224_PRE_COMP *nistp224_pre_comp_new() | ||
| 1149 | { | ||
| 1150 | NISTP224_PRE_COMP *ret = NULL; | ||
| 1151 | ret = (NISTP224_PRE_COMP *) OPENSSL_malloc(sizeof *ret); | ||
| 1152 | if (!ret) | ||
| 1153 | { | ||
| 1154 | ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); | ||
| 1155 | return ret; | ||
| 1156 | } | ||
| 1157 | memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); | ||
| 1158 | ret->references = 1; | ||
| 1159 | return ret; | ||
| 1160 | } | ||
| 1161 | |||
| 1162 | static void *nistp224_pre_comp_dup(void *src_) | ||
| 1163 | { | ||
| 1164 | NISTP224_PRE_COMP *src = src_; | ||
| 1165 | |||
| 1166 | /* no need to actually copy, these objects never change! */ | ||
| 1167 | CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1168 | |||
| 1169 | return src_; | ||
| 1170 | } | ||
| 1171 | |||
| 1172 | static void nistp224_pre_comp_free(void *pre_) | ||
| 1173 | { | ||
| 1174 | int i; | ||
| 1175 | NISTP224_PRE_COMP *pre = pre_; | ||
| 1176 | |||
| 1177 | if (!pre) | ||
| 1178 | return; | ||
| 1179 | |||
| 1180 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1181 | if (i > 0) | ||
| 1182 | return; | ||
| 1183 | |||
| 1184 | OPENSSL_free(pre); | ||
| 1185 | } | ||
| 1186 | |||
| 1187 | static void nistp224_pre_comp_clear_free(void *pre_) | ||
| 1188 | { | ||
| 1189 | int i; | ||
| 1190 | NISTP224_PRE_COMP *pre = pre_; | ||
| 1191 | |||
| 1192 | if (!pre) | ||
| 1193 | return; | ||
| 1194 | |||
| 1195 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1196 | if (i > 0) | ||
| 1197 | return; | ||
| 1198 | |||
| 1199 | OPENSSL_cleanse(pre, sizeof *pre); | ||
| 1200 | OPENSSL_free(pre); | ||
| 1201 | } | ||
| 1202 | |||
| 1203 | /******************************************************************************/ | ||
| 1204 | /* OPENSSL EC_METHOD FUNCTIONS | ||
| 1205 | */ | ||
| 1206 | |||
| 1207 | int ec_GFp_nistp224_group_init(EC_GROUP *group) | ||
| 1208 | { | ||
| 1209 | int ret; | ||
| 1210 | ret = ec_GFp_simple_group_init(group); | ||
| 1211 | group->a_is_minus3 = 1; | ||
| 1212 | return ret; | ||
| 1213 | } | ||
| 1214 | |||
| 1215 | int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, | ||
| 1216 | const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) | ||
| 1217 | { | ||
| 1218 | int ret = 0; | ||
| 1219 | BN_CTX *new_ctx = NULL; | ||
| 1220 | BIGNUM *curve_p, *curve_a, *curve_b; | ||
| 1221 | |||
| 1222 | if (ctx == NULL) | ||
| 1223 | if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; | ||
| 1224 | BN_CTX_start(ctx); | ||
| 1225 | if (((curve_p = BN_CTX_get(ctx)) == NULL) || | ||
| 1226 | ((curve_a = BN_CTX_get(ctx)) == NULL) || | ||
| 1227 | ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err; | ||
| 1228 | BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p); | ||
| 1229 | BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a); | ||
| 1230 | BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b); | ||
| 1231 | if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || | ||
| 1232 | (BN_cmp(curve_b, b))) | ||
| 1233 | { | ||
| 1234 | ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE, | ||
| 1235 | EC_R_WRONG_CURVE_PARAMETERS); | ||
| 1236 | goto err; | ||
| 1237 | } | ||
| 1238 | group->field_mod_func = BN_nist_mod_224; | ||
| 1239 | ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); | ||
| 1240 | err: | ||
| 1241 | BN_CTX_end(ctx); | ||
| 1242 | if (new_ctx != NULL) | ||
| 1243 | BN_CTX_free(new_ctx); | ||
| 1244 | return ret; | ||
| 1245 | } | ||
| 1246 | |||
| 1247 | /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns | ||
| 1248 | * (X', Y') = (X/Z^2, Y/Z^3) */ | ||
| 1249 | int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group, | ||
| 1250 | const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx) | ||
| 1251 | { | ||
| 1252 | felem z1, z2, x_in, y_in, x_out, y_out; | ||
| 1253 | widefelem tmp; | ||
| 1254 | |||
| 1255 | if (EC_POINT_is_at_infinity(group, point)) | ||
| 1256 | { | ||
| 1257 | ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES, | ||
| 1258 | EC_R_POINT_AT_INFINITY); | ||
| 1259 | return 0; | ||
| 1260 | } | ||
| 1261 | if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) || | ||
| 1262 | (!BN_to_felem(z1, &point->Z))) return 0; | ||
| 1263 | felem_inv(z2, z1); | ||
| 1264 | felem_square(tmp, z2); felem_reduce(z1, tmp); | ||
| 1265 | felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp); | ||
| 1266 | felem_contract(x_out, x_in); | ||
| 1267 | if (x != NULL) | ||
| 1268 | { | ||
| 1269 | if (!felem_to_BN(x, x_out)) { | ||
| 1270 | ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES, | ||
| 1271 | ERR_R_BN_LIB); | ||
| 1272 | return 0; | ||
| 1273 | } | ||
| 1274 | } | ||
| 1275 | felem_mul(tmp, z1, z2); felem_reduce(z1, tmp); | ||
| 1276 | felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp); | ||
| 1277 | felem_contract(y_out, y_in); | ||
| 1278 | if (y != NULL) | ||
| 1279 | { | ||
| 1280 | if (!felem_to_BN(y, y_out)) { | ||
| 1281 | ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES, | ||
| 1282 | ERR_R_BN_LIB); | ||
| 1283 | return 0; | ||
| 1284 | } | ||
| 1285 | } | ||
| 1286 | return 1; | ||
| 1287 | } | ||
| 1288 | |||
| 1289 | static void make_points_affine(size_t num, felem points[/*num*/][3], felem tmp_felems[/*num+1*/]) | ||
| 1290 | { | ||
| 1291 | /* Runs in constant time, unless an input is the point at infinity | ||
| 1292 | * (which normally shouldn't happen). */ | ||
| 1293 | ec_GFp_nistp_points_make_affine_internal( | ||
| 1294 | num, | ||
| 1295 | points, | ||
| 1296 | sizeof(felem), | ||
| 1297 | tmp_felems, | ||
| 1298 | (void (*)(void *)) felem_one, | ||
| 1299 | (int (*)(const void *)) felem_is_zero_int, | ||
| 1300 | (void (*)(void *, const void *)) felem_assign, | ||
| 1301 | (void (*)(void *, const void *)) felem_square_reduce, | ||
| 1302 | (void (*)(void *, const void *, const void *)) felem_mul_reduce, | ||
| 1303 | (void (*)(void *, const void *)) felem_inv, | ||
| 1304 | (void (*)(void *, const void *)) felem_contract); | ||
| 1305 | } | ||
| 1306 | |||
| 1307 | /* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values | ||
| 1308 | * Result is stored in r (r can equal one of the inputs). */ | ||
| 1309 | int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r, | ||
| 1310 | const BIGNUM *scalar, size_t num, const EC_POINT *points[], | ||
| 1311 | const BIGNUM *scalars[], BN_CTX *ctx) | ||
| 1312 | { | ||
| 1313 | int ret = 0; | ||
| 1314 | int j; | ||
| 1315 | unsigned i; | ||
| 1316 | int mixed = 0; | ||
| 1317 | BN_CTX *new_ctx = NULL; | ||
| 1318 | BIGNUM *x, *y, *z, *tmp_scalar; | ||
| 1319 | felem_bytearray g_secret; | ||
| 1320 | felem_bytearray *secrets = NULL; | ||
| 1321 | felem (*pre_comp)[17][3] = NULL; | ||
| 1322 | felem *tmp_felems = NULL; | ||
| 1323 | felem_bytearray tmp; | ||
| 1324 | unsigned num_bytes; | ||
| 1325 | int have_pre_comp = 0; | ||
| 1326 | size_t num_points = num; | ||
| 1327 | felem x_in, y_in, z_in, x_out, y_out, z_out; | ||
| 1328 | NISTP224_PRE_COMP *pre = NULL; | ||
| 1329 | const felem (*g_pre_comp)[16][3] = NULL; | ||
| 1330 | EC_POINT *generator = NULL; | ||
| 1331 | const EC_POINT *p = NULL; | ||
| 1332 | const BIGNUM *p_scalar = NULL; | ||
| 1333 | |||
| 1334 | if (ctx == NULL) | ||
| 1335 | if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; | ||
| 1336 | BN_CTX_start(ctx); | ||
| 1337 | if (((x = BN_CTX_get(ctx)) == NULL) || | ||
| 1338 | ((y = BN_CTX_get(ctx)) == NULL) || | ||
| 1339 | ((z = BN_CTX_get(ctx)) == NULL) || | ||
| 1340 | ((tmp_scalar = BN_CTX_get(ctx)) == NULL)) | ||
| 1341 | goto err; | ||
| 1342 | |||
| 1343 | if (scalar != NULL) | ||
| 1344 | { | ||
| 1345 | pre = EC_EX_DATA_get_data(group->extra_data, | ||
| 1346 | nistp224_pre_comp_dup, nistp224_pre_comp_free, | ||
| 1347 | nistp224_pre_comp_clear_free); | ||
| 1348 | if (pre) | ||
| 1349 | /* we have precomputation, try to use it */ | ||
| 1350 | g_pre_comp = (const felem (*)[16][3]) pre->g_pre_comp; | ||
| 1351 | else | ||
| 1352 | /* try to use the standard precomputation */ | ||
| 1353 | g_pre_comp = &gmul[0]; | ||
| 1354 | generator = EC_POINT_new(group); | ||
| 1355 | if (generator == NULL) | ||
| 1356 | goto err; | ||
| 1357 | /* get the generator from precomputation */ | ||
| 1358 | if (!felem_to_BN(x, g_pre_comp[0][1][0]) || | ||
| 1359 | !felem_to_BN(y, g_pre_comp[0][1][1]) || | ||
| 1360 | !felem_to_BN(z, g_pre_comp[0][1][2])) | ||
| 1361 | { | ||
| 1362 | ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1363 | goto err; | ||
| 1364 | } | ||
| 1365 | if (!EC_POINT_set_Jprojective_coordinates_GFp(group, | ||
| 1366 | generator, x, y, z, ctx)) | ||
| 1367 | goto err; | ||
| 1368 | if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) | ||
| 1369 | /* precomputation matches generator */ | ||
| 1370 | have_pre_comp = 1; | ||
| 1371 | else | ||
| 1372 | /* we don't have valid precomputation: | ||
| 1373 | * treat the generator as a random point */ | ||
| 1374 | num_points = num_points + 1; | ||
| 1375 | } | ||
| 1376 | |||
| 1377 | if (num_points > 0) | ||
| 1378 | { | ||
| 1379 | if (num_points >= 3) | ||
| 1380 | { | ||
| 1381 | /* unless we precompute multiples for just one or two points, | ||
| 1382 | * converting those into affine form is time well spent */ | ||
| 1383 | mixed = 1; | ||
| 1384 | } | ||
| 1385 | secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray)); | ||
| 1386 | pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem)); | ||
| 1387 | if (mixed) | ||
| 1388 | tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem)); | ||
| 1389 | if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL))) | ||
| 1390 | { | ||
| 1391 | ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE); | ||
| 1392 | goto err; | ||
| 1393 | } | ||
| 1394 | |||
| 1395 | /* we treat NULL scalars as 0, and NULL points as points at infinity, | ||
| 1396 | * i.e., they contribute nothing to the linear combination */ | ||
| 1397 | memset(secrets, 0, num_points * sizeof(felem_bytearray)); | ||
| 1398 | memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem)); | ||
| 1399 | for (i = 0; i < num_points; ++i) | ||
| 1400 | { | ||
| 1401 | if (i == num) | ||
| 1402 | /* the generator */ | ||
| 1403 | { | ||
| 1404 | p = EC_GROUP_get0_generator(group); | ||
| 1405 | p_scalar = scalar; | ||
| 1406 | } | ||
| 1407 | else | ||
| 1408 | /* the i^th point */ | ||
| 1409 | { | ||
| 1410 | p = points[i]; | ||
| 1411 | p_scalar = scalars[i]; | ||
| 1412 | } | ||
| 1413 | if ((p_scalar != NULL) && (p != NULL)) | ||
| 1414 | { | ||
| 1415 | /* reduce scalar to 0 <= scalar < 2^224 */ | ||
| 1416 | if ((BN_num_bits(p_scalar) > 224) || (BN_is_negative(p_scalar))) | ||
| 1417 | { | ||
| 1418 | /* this is an unusual input, and we don't guarantee | ||
| 1419 | * constant-timeness */ | ||
| 1420 | if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) | ||
| 1421 | { | ||
| 1422 | ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1423 | goto err; | ||
| 1424 | } | ||
| 1425 | num_bytes = BN_bn2bin(tmp_scalar, tmp); | ||
| 1426 | } | ||
| 1427 | else | ||
| 1428 | num_bytes = BN_bn2bin(p_scalar, tmp); | ||
| 1429 | flip_endian(secrets[i], tmp, num_bytes); | ||
| 1430 | /* precompute multiples */ | ||
| 1431 | if ((!BN_to_felem(x_out, &p->X)) || | ||
| 1432 | (!BN_to_felem(y_out, &p->Y)) || | ||
| 1433 | (!BN_to_felem(z_out, &p->Z))) goto err; | ||
| 1434 | felem_assign(pre_comp[i][1][0], x_out); | ||
| 1435 | felem_assign(pre_comp[i][1][1], y_out); | ||
| 1436 | felem_assign(pre_comp[i][1][2], z_out); | ||
| 1437 | for (j = 2; j <= 16; ++j) | ||
| 1438 | { | ||
| 1439 | if (j & 1) | ||
| 1440 | { | ||
| 1441 | point_add( | ||
| 1442 | pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], | ||
| 1443 | pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], | ||
| 1444 | 0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]); | ||
| 1445 | } | ||
| 1446 | else | ||
| 1447 | { | ||
| 1448 | point_double( | ||
| 1449 | pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], | ||
| 1450 | pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]); | ||
| 1451 | } | ||
| 1452 | } | ||
| 1453 | } | ||
| 1454 | } | ||
| 1455 | if (mixed) | ||
| 1456 | make_points_affine(num_points * 17, pre_comp[0], tmp_felems); | ||
| 1457 | } | ||
| 1458 | |||
| 1459 | /* the scalar for the generator */ | ||
| 1460 | if ((scalar != NULL) && (have_pre_comp)) | ||
| 1461 | { | ||
| 1462 | memset(g_secret, 0, sizeof g_secret); | ||
| 1463 | /* reduce scalar to 0 <= scalar < 2^224 */ | ||
| 1464 | if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) | ||
| 1465 | { | ||
| 1466 | /* this is an unusual input, and we don't guarantee | ||
| 1467 | * constant-timeness */ | ||
| 1468 | if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) | ||
| 1469 | { | ||
| 1470 | ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1471 | goto err; | ||
| 1472 | } | ||
| 1473 | num_bytes = BN_bn2bin(tmp_scalar, tmp); | ||
| 1474 | } | ||
| 1475 | else | ||
| 1476 | num_bytes = BN_bn2bin(scalar, tmp); | ||
| 1477 | flip_endian(g_secret, tmp, num_bytes); | ||
| 1478 | /* do the multiplication with generator precomputation*/ | ||
| 1479 | batch_mul(x_out, y_out, z_out, | ||
| 1480 | (const felem_bytearray (*)) secrets, num_points, | ||
| 1481 | g_secret, | ||
| 1482 | mixed, (const felem (*)[17][3]) pre_comp, | ||
| 1483 | g_pre_comp); | ||
| 1484 | } | ||
| 1485 | else | ||
| 1486 | /* do the multiplication without generator precomputation */ | ||
| 1487 | batch_mul(x_out, y_out, z_out, | ||
| 1488 | (const felem_bytearray (*)) secrets, num_points, | ||
| 1489 | NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL); | ||
| 1490 | /* reduce the output to its unique minimal representation */ | ||
| 1491 | felem_contract(x_in, x_out); | ||
| 1492 | felem_contract(y_in, y_out); | ||
| 1493 | felem_contract(z_in, z_out); | ||
| 1494 | if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || | ||
| 1495 | (!felem_to_BN(z, z_in))) | ||
| 1496 | { | ||
| 1497 | ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1498 | goto err; | ||
| 1499 | } | ||
| 1500 | ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx); | ||
| 1501 | |||
| 1502 | err: | ||
| 1503 | BN_CTX_end(ctx); | ||
| 1504 | if (generator != NULL) | ||
| 1505 | EC_POINT_free(generator); | ||
| 1506 | if (new_ctx != NULL) | ||
| 1507 | BN_CTX_free(new_ctx); | ||
| 1508 | if (secrets != NULL) | ||
| 1509 | OPENSSL_free(secrets); | ||
| 1510 | if (pre_comp != NULL) | ||
| 1511 | OPENSSL_free(pre_comp); | ||
| 1512 | if (tmp_felems != NULL) | ||
| 1513 | OPENSSL_free(tmp_felems); | ||
| 1514 | return ret; | ||
| 1515 | } | ||
| 1516 | |||
| 1517 | int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx) | ||
| 1518 | { | ||
| 1519 | int ret = 0; | ||
| 1520 | NISTP224_PRE_COMP *pre = NULL; | ||
| 1521 | int i, j; | ||
| 1522 | BN_CTX *new_ctx = NULL; | ||
| 1523 | BIGNUM *x, *y; | ||
| 1524 | EC_POINT *generator = NULL; | ||
| 1525 | felem tmp_felems[32]; | ||
| 1526 | |||
| 1527 | /* throw away old precomputation */ | ||
| 1528 | EC_EX_DATA_free_data(&group->extra_data, nistp224_pre_comp_dup, | ||
| 1529 | nistp224_pre_comp_free, nistp224_pre_comp_clear_free); | ||
| 1530 | if (ctx == NULL) | ||
| 1531 | if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; | ||
| 1532 | BN_CTX_start(ctx); | ||
| 1533 | if (((x = BN_CTX_get(ctx)) == NULL) || | ||
| 1534 | ((y = BN_CTX_get(ctx)) == NULL)) | ||
| 1535 | goto err; | ||
| 1536 | /* get the generator */ | ||
| 1537 | if (group->generator == NULL) goto err; | ||
| 1538 | generator = EC_POINT_new(group); | ||
| 1539 | if (generator == NULL) | ||
| 1540 | goto err; | ||
| 1541 | BN_bin2bn(nistp224_curve_params[3], sizeof (felem_bytearray), x); | ||
| 1542 | BN_bin2bn(nistp224_curve_params[4], sizeof (felem_bytearray), y); | ||
| 1543 | if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx)) | ||
| 1544 | goto err; | ||
| 1545 | if ((pre = nistp224_pre_comp_new()) == NULL) | ||
| 1546 | goto err; | ||
| 1547 | /* if the generator is the standard one, use built-in precomputation */ | ||
| 1548 | if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) | ||
| 1549 | { | ||
| 1550 | memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); | ||
| 1551 | ret = 1; | ||
| 1552 | goto err; | ||
| 1553 | } | ||
| 1554 | if ((!BN_to_felem(pre->g_pre_comp[0][1][0], &group->generator->X)) || | ||
| 1555 | (!BN_to_felem(pre->g_pre_comp[0][1][1], &group->generator->Y)) || | ||
| 1556 | (!BN_to_felem(pre->g_pre_comp[0][1][2], &group->generator->Z))) | ||
| 1557 | goto err; | ||
| 1558 | /* compute 2^56*G, 2^112*G, 2^168*G for the first table, | ||
| 1559 | * 2^28*G, 2^84*G, 2^140*G, 2^196*G for the second one | ||
| 1560 | */ | ||
| 1561 | for (i = 1; i <= 8; i <<= 1) | ||
| 1562 | { | ||
| 1563 | point_double( | ||
| 1564 | pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2], | ||
| 1565 | pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]); | ||
| 1566 | for (j = 0; j < 27; ++j) | ||
| 1567 | { | ||
| 1568 | point_double( | ||
| 1569 | pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2], | ||
| 1570 | pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]); | ||
| 1571 | } | ||
| 1572 | if (i == 8) | ||
| 1573 | break; | ||
| 1574 | point_double( | ||
| 1575 | pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2], | ||
| 1576 | pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]); | ||
| 1577 | for (j = 0; j < 27; ++j) | ||
| 1578 | { | ||
| 1579 | point_double( | ||
| 1580 | pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2], | ||
| 1581 | pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]); | ||
| 1582 | } | ||
| 1583 | } | ||
| 1584 | for (i = 0; i < 2; i++) | ||
| 1585 | { | ||
| 1586 | /* g_pre_comp[i][0] is the point at infinity */ | ||
| 1587 | memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0])); | ||
| 1588 | /* the remaining multiples */ | ||
| 1589 | /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */ | ||
| 1590 | point_add( | ||
| 1591 | pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], | ||
| 1592 | pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0], | ||
| 1593 | pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2], | ||
| 1594 | 0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], | ||
| 1595 | pre->g_pre_comp[i][2][2]); | ||
| 1596 | /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */ | ||
| 1597 | point_add( | ||
| 1598 | pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], | ||
| 1599 | pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0], | ||
| 1600 | pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], | ||
| 1601 | 0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], | ||
| 1602 | pre->g_pre_comp[i][2][2]); | ||
| 1603 | /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */ | ||
| 1604 | point_add( | ||
| 1605 | pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], | ||
| 1606 | pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0], | ||
| 1607 | pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], | ||
| 1608 | 0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], | ||
| 1609 | pre->g_pre_comp[i][4][2]); | ||
| 1610 | /* 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G */ | ||
| 1611 | point_add( | ||
| 1612 | pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], | ||
| 1613 | pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0], | ||
| 1614 | pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2], | ||
| 1615 | 0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], | ||
| 1616 | pre->g_pre_comp[i][2][2]); | ||
| 1617 | for (j = 1; j < 8; ++j) | ||
| 1618 | { | ||
| 1619 | /* odd multiples: add G resp. 2^28*G */ | ||
| 1620 | point_add( | ||
| 1621 | pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1], | ||
| 1622 | pre->g_pre_comp[i][2*j+1][2], pre->g_pre_comp[i][2*j][0], | ||
| 1623 | pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2], | ||
| 1624 | 0, pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1], | ||
| 1625 | pre->g_pre_comp[i][1][2]); | ||
| 1626 | } | ||
| 1627 | } | ||
| 1628 | make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems); | ||
| 1629 | |||
| 1630 | if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup, | ||
| 1631 | nistp224_pre_comp_free, nistp224_pre_comp_clear_free)) | ||
| 1632 | goto err; | ||
| 1633 | ret = 1; | ||
| 1634 | pre = NULL; | ||
| 1635 | err: | ||
| 1636 | BN_CTX_end(ctx); | ||
| 1637 | if (generator != NULL) | ||
| 1638 | EC_POINT_free(generator); | ||
| 1639 | if (new_ctx != NULL) | ||
| 1640 | BN_CTX_free(new_ctx); | ||
| 1641 | if (pre) | ||
| 1642 | nistp224_pre_comp_free(pre); | ||
| 1643 | return ret; | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group) | ||
| 1647 | { | ||
| 1648 | if (EC_EX_DATA_get_data(group->extra_data, nistp224_pre_comp_dup, | ||
| 1649 | nistp224_pre_comp_free, nistp224_pre_comp_clear_free) | ||
| 1650 | != NULL) | ||
| 1651 | return 1; | ||
| 1652 | else | ||
| 1653 | return 0; | ||
| 1654 | } | ||
| 1655 | |||
| 1656 | #else | ||
| 1657 | static void *dummy=&dummy; | ||
| 1658 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ecp_nistp256.c b/src/lib/libcrypto/ec/ecp_nistp256.c new file mode 100644 index 0000000000..4bc0f5dce0 --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistp256.c | |||
| @@ -0,0 +1,2171 @@ | |||
| 1 | /* crypto/ec/ecp_nistp256.c */ | ||
| 2 | /* | ||
| 3 | * Written by Adam Langley (Google) for the OpenSSL project | ||
| 4 | */ | ||
| 5 | /* Copyright 2011 Google Inc. | ||
| 6 | * | ||
| 7 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| 8 | * | ||
| 9 | * you may not use this file except in compliance with the License. | ||
| 10 | * You may obtain a copy of the License at | ||
| 11 | * | ||
| 12 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
| 13 | * | ||
| 14 | * Unless required by applicable law or agreed to in writing, software | ||
| 15 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
| 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| 17 | * See the License for the specific language governing permissions and | ||
| 18 | * limitations under the License. | ||
| 19 | */ | ||
| 20 | |||
| 21 | /* | ||
| 22 | * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication | ||
| 23 | * | ||
| 24 | * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c. | ||
| 25 | * Otherwise based on Emilia's P224 work, which was inspired by my curve25519 | ||
| 26 | * work which got its smarts from Daniel J. Bernstein's work on the same. | ||
| 27 | */ | ||
| 28 | |||
| 29 | #include <openssl/opensslconf.h> | ||
| 30 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 | ||
| 31 | |||
| 32 | #ifndef OPENSSL_SYS_VMS | ||
| 33 | #include <stdint.h> | ||
| 34 | #else | ||
| 35 | #include <inttypes.h> | ||
| 36 | #endif | ||
| 37 | |||
| 38 | #include <string.h> | ||
| 39 | #include <openssl/err.h> | ||
| 40 | #include "ec_lcl.h" | ||
| 41 | |||
| 42 | #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) | ||
| 43 | /* even with gcc, the typedef won't work for 32-bit platforms */ | ||
| 44 | typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ | ||
| 45 | typedef __int128_t int128_t; | ||
| 46 | #else | ||
| 47 | #error "Need GCC 3.1 or later to define type uint128_t" | ||
| 48 | #endif | ||
| 49 | |||
| 50 | typedef uint8_t u8; | ||
| 51 | typedef uint32_t u32; | ||
| 52 | typedef uint64_t u64; | ||
| 53 | typedef int64_t s64; | ||
| 54 | |||
| 55 | /* The underlying field. | ||
| 56 | * | ||
| 57 | * P256 operates over GF(2^256-2^224+2^192+2^96-1). We can serialise an element | ||
| 58 | * of this field into 32 bytes. We call this an felem_bytearray. */ | ||
| 59 | |||
| 60 | typedef u8 felem_bytearray[32]; | ||
| 61 | |||
| 62 | /* These are the parameters of P256, taken from FIPS 186-3, page 86. These | ||
| 63 | * values are big-endian. */ | ||
| 64 | static const felem_bytearray nistp256_curve_params[5] = { | ||
| 65 | {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */ | ||
| 66 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
| 67 | 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, | ||
| 68 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, | ||
| 69 | {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */ | ||
| 70 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
| 71 | 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, | ||
| 72 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */ | ||
| 73 | {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, | ||
| 74 | 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc, | ||
| 75 | 0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6, | ||
| 76 | 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b}, | ||
| 77 | {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */ | ||
| 78 | 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2, | ||
| 79 | 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0, | ||
| 80 | 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96}, | ||
| 81 | {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */ | ||
| 82 | 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16, | ||
| 83 | 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce, | ||
| 84 | 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5} | ||
| 85 | }; | ||
| 86 | |||
| 87 | /* The representation of field elements. | ||
| 88 | * ------------------------------------ | ||
| 89 | * | ||
| 90 | * We represent field elements with either four 128-bit values, eight 128-bit | ||
| 91 | * values, or four 64-bit values. The field element represented is: | ||
| 92 | * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192 (mod p) | ||
| 93 | * or: | ||
| 94 | * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512 (mod p) | ||
| 95 | * | ||
| 96 | * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits | ||
| 97 | * apart, but are 128-bits wide, the most significant bits of each limb overlap | ||
| 98 | * with the least significant bits of the next. | ||
| 99 | * | ||
| 100 | * A field element with four limbs is an 'felem'. One with eight limbs is a | ||
| 101 | * 'longfelem' | ||
| 102 | * | ||
| 103 | * A field element with four, 64-bit values is called a 'smallfelem'. Small | ||
| 104 | * values are used as intermediate values before multiplication. | ||
| 105 | */ | ||
| 106 | |||
| 107 | #define NLIMBS 4 | ||
| 108 | |||
| 109 | typedef uint128_t limb; | ||
| 110 | typedef limb felem[NLIMBS]; | ||
| 111 | typedef limb longfelem[NLIMBS * 2]; | ||
| 112 | typedef u64 smallfelem[NLIMBS]; | ||
| 113 | |||
| 114 | /* This is the value of the prime as four 64-bit words, little-endian. */ | ||
| 115 | static const u64 kPrime[4] = { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul }; | ||
| 116 | static const limb bottom32bits = 0xffffffff; | ||
| 117 | static const u64 bottom63bits = 0x7ffffffffffffffful; | ||
| 118 | |||
| 119 | /* bin32_to_felem takes a little-endian byte array and converts it into felem | ||
| 120 | * form. This assumes that the CPU is little-endian. */ | ||
| 121 | static void bin32_to_felem(felem out, const u8 in[32]) | ||
| 122 | { | ||
| 123 | out[0] = *((u64*) &in[0]); | ||
| 124 | out[1] = *((u64*) &in[8]); | ||
| 125 | out[2] = *((u64*) &in[16]); | ||
| 126 | out[3] = *((u64*) &in[24]); | ||
| 127 | } | ||
| 128 | |||
| 129 | /* smallfelem_to_bin32 takes a smallfelem and serialises into a little endian, | ||
| 130 | * 32 byte array. This assumes that the CPU is little-endian. */ | ||
| 131 | static void smallfelem_to_bin32(u8 out[32], const smallfelem in) | ||
| 132 | { | ||
| 133 | *((u64*) &out[0]) = in[0]; | ||
| 134 | *((u64*) &out[8]) = in[1]; | ||
| 135 | *((u64*) &out[16]) = in[2]; | ||
| 136 | *((u64*) &out[24]) = in[3]; | ||
| 137 | } | ||
| 138 | |||
| 139 | /* To preserve endianness when using BN_bn2bin and BN_bin2bn */ | ||
| 140 | static void flip_endian(u8 *out, const u8 *in, unsigned len) | ||
| 141 | { | ||
| 142 | unsigned i; | ||
| 143 | for (i = 0; i < len; ++i) | ||
| 144 | out[i] = in[len-1-i]; | ||
| 145 | } | ||
| 146 | |||
| 147 | /* BN_to_felem converts an OpenSSL BIGNUM into an felem */ | ||
| 148 | static int BN_to_felem(felem out, const BIGNUM *bn) | ||
| 149 | { | ||
| 150 | felem_bytearray b_in; | ||
| 151 | felem_bytearray b_out; | ||
| 152 | unsigned num_bytes; | ||
| 153 | |||
| 154 | /* BN_bn2bin eats leading zeroes */ | ||
| 155 | memset(b_out, 0, sizeof b_out); | ||
| 156 | num_bytes = BN_num_bytes(bn); | ||
| 157 | if (num_bytes > sizeof b_out) | ||
| 158 | { | ||
| 159 | ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); | ||
| 160 | return 0; | ||
| 161 | } | ||
| 162 | if (BN_is_negative(bn)) | ||
| 163 | { | ||
| 164 | ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); | ||
| 165 | return 0; | ||
| 166 | } | ||
| 167 | num_bytes = BN_bn2bin(bn, b_in); | ||
| 168 | flip_endian(b_out, b_in, num_bytes); | ||
| 169 | bin32_to_felem(out, b_out); | ||
| 170 | return 1; | ||
| 171 | } | ||
| 172 | |||
| 173 | /* felem_to_BN converts an felem into an OpenSSL BIGNUM */ | ||
| 174 | static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in) | ||
| 175 | { | ||
| 176 | felem_bytearray b_in, b_out; | ||
| 177 | smallfelem_to_bin32(b_in, in); | ||
| 178 | flip_endian(b_out, b_in, sizeof b_out); | ||
| 179 | return BN_bin2bn(b_out, sizeof b_out, out); | ||
| 180 | } | ||
| 181 | |||
| 182 | |||
| 183 | /* Field operations | ||
| 184 | * ---------------- */ | ||
| 185 | |||
| 186 | static void smallfelem_one(smallfelem out) | ||
| 187 | { | ||
| 188 | out[0] = 1; | ||
| 189 | out[1] = 0; | ||
| 190 | out[2] = 0; | ||
| 191 | out[3] = 0; | ||
| 192 | } | ||
| 193 | |||
| 194 | static void smallfelem_assign(smallfelem out, const smallfelem in) | ||
| 195 | { | ||
| 196 | out[0] = in[0]; | ||
| 197 | out[1] = in[1]; | ||
| 198 | out[2] = in[2]; | ||
| 199 | out[3] = in[3]; | ||
| 200 | } | ||
| 201 | |||
| 202 | static void felem_assign(felem out, const felem in) | ||
| 203 | { | ||
| 204 | out[0] = in[0]; | ||
| 205 | out[1] = in[1]; | ||
| 206 | out[2] = in[2]; | ||
| 207 | out[3] = in[3]; | ||
| 208 | } | ||
| 209 | |||
| 210 | /* felem_sum sets out = out + in. */ | ||
| 211 | static void felem_sum(felem out, const felem in) | ||
| 212 | { | ||
| 213 | out[0] += in[0]; | ||
| 214 | out[1] += in[1]; | ||
| 215 | out[2] += in[2]; | ||
| 216 | out[3] += in[3]; | ||
| 217 | } | ||
| 218 | |||
| 219 | /* felem_small_sum sets out = out + in. */ | ||
| 220 | static void felem_small_sum(felem out, const smallfelem in) | ||
| 221 | { | ||
| 222 | out[0] += in[0]; | ||
| 223 | out[1] += in[1]; | ||
| 224 | out[2] += in[2]; | ||
| 225 | out[3] += in[3]; | ||
| 226 | } | ||
| 227 | |||
| 228 | /* felem_scalar sets out = out * scalar */ | ||
| 229 | static void felem_scalar(felem out, const u64 scalar) | ||
| 230 | { | ||
| 231 | out[0] *= scalar; | ||
| 232 | out[1] *= scalar; | ||
| 233 | out[2] *= scalar; | ||
| 234 | out[3] *= scalar; | ||
| 235 | } | ||
| 236 | |||
| 237 | /* longfelem_scalar sets out = out * scalar */ | ||
| 238 | static void longfelem_scalar(longfelem out, const u64 scalar) | ||
| 239 | { | ||
| 240 | out[0] *= scalar; | ||
| 241 | out[1] *= scalar; | ||
| 242 | out[2] *= scalar; | ||
| 243 | out[3] *= scalar; | ||
| 244 | out[4] *= scalar; | ||
| 245 | out[5] *= scalar; | ||
| 246 | out[6] *= scalar; | ||
| 247 | out[7] *= scalar; | ||
| 248 | } | ||
| 249 | |||
| 250 | #define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9) | ||
| 251 | #define two105 (((limb)1) << 105) | ||
| 252 | #define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9) | ||
| 253 | |||
| 254 | /* zero105 is 0 mod p */ | ||
| 255 | static const felem zero105 = { two105m41m9, two105, two105m41p9, two105m41p9 }; | ||
| 256 | |||
| 257 | /* smallfelem_neg sets |out| to |-small| | ||
| 258 | * On exit: | ||
| 259 | * out[i] < out[i] + 2^105 | ||
| 260 | */ | ||
| 261 | static void smallfelem_neg(felem out, const smallfelem small) | ||
| 262 | { | ||
| 263 | /* In order to prevent underflow, we subtract from 0 mod p. */ | ||
| 264 | out[0] = zero105[0] - small[0]; | ||
| 265 | out[1] = zero105[1] - small[1]; | ||
| 266 | out[2] = zero105[2] - small[2]; | ||
| 267 | out[3] = zero105[3] - small[3]; | ||
| 268 | } | ||
| 269 | |||
| 270 | /* felem_diff subtracts |in| from |out| | ||
| 271 | * On entry: | ||
| 272 | * in[i] < 2^104 | ||
| 273 | * On exit: | ||
| 274 | * out[i] < out[i] + 2^105 | ||
| 275 | */ | ||
| 276 | static void felem_diff(felem out, const felem in) | ||
| 277 | { | ||
| 278 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
| 279 | out[0] += zero105[0]; | ||
| 280 | out[1] += zero105[1]; | ||
| 281 | out[2] += zero105[2]; | ||
| 282 | out[3] += zero105[3]; | ||
| 283 | |||
| 284 | out[0] -= in[0]; | ||
| 285 | out[1] -= in[1]; | ||
| 286 | out[2] -= in[2]; | ||
| 287 | out[3] -= in[3]; | ||
| 288 | } | ||
| 289 | |||
| 290 | #define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11) | ||
| 291 | #define two107 (((limb)1) << 107) | ||
| 292 | #define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11) | ||
| 293 | |||
| 294 | /* zero107 is 0 mod p */ | ||
| 295 | static const felem zero107 = { two107m43m11, two107, two107m43p11, two107m43p11 }; | ||
| 296 | |||
| 297 | /* An alternative felem_diff for larger inputs |in| | ||
| 298 | * felem_diff_zero107 subtracts |in| from |out| | ||
| 299 | * On entry: | ||
| 300 | * in[i] < 2^106 | ||
| 301 | * On exit: | ||
| 302 | * out[i] < out[i] + 2^107 | ||
| 303 | */ | ||
| 304 | static void felem_diff_zero107(felem out, const felem in) | ||
| 305 | { | ||
| 306 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
| 307 | out[0] += zero107[0]; | ||
| 308 | out[1] += zero107[1]; | ||
| 309 | out[2] += zero107[2]; | ||
| 310 | out[3] += zero107[3]; | ||
| 311 | |||
| 312 | out[0] -= in[0]; | ||
| 313 | out[1] -= in[1]; | ||
| 314 | out[2] -= in[2]; | ||
| 315 | out[3] -= in[3]; | ||
| 316 | } | ||
| 317 | |||
| 318 | /* longfelem_diff subtracts |in| from |out| | ||
| 319 | * On entry: | ||
| 320 | * in[i] < 7*2^67 | ||
| 321 | * On exit: | ||
| 322 | * out[i] < out[i] + 2^70 + 2^40 | ||
| 323 | */ | ||
| 324 | static void longfelem_diff(longfelem out, const longfelem in) | ||
| 325 | { | ||
| 326 | static const limb two70m8p6 = (((limb)1) << 70) - (((limb)1) << 8) + (((limb)1) << 6); | ||
| 327 | static const limb two70p40 = (((limb)1) << 70) + (((limb)1) << 40); | ||
| 328 | static const limb two70 = (((limb)1) << 70); | ||
| 329 | static const limb two70m40m38p6 = (((limb)1) << 70) - (((limb)1) << 40) - (((limb)1) << 38) + (((limb)1) << 6); | ||
| 330 | static const limb two70m6 = (((limb)1) << 70) - (((limb)1) << 6); | ||
| 331 | |||
| 332 | /* add 0 mod p to avoid underflow */ | ||
| 333 | out[0] += two70m8p6; | ||
| 334 | out[1] += two70p40; | ||
| 335 | out[2] += two70; | ||
| 336 | out[3] += two70m40m38p6; | ||
| 337 | out[4] += two70m6; | ||
| 338 | out[5] += two70m6; | ||
| 339 | out[6] += two70m6; | ||
| 340 | out[7] += two70m6; | ||
| 341 | |||
| 342 | /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */ | ||
| 343 | out[0] -= in[0]; | ||
| 344 | out[1] -= in[1]; | ||
| 345 | out[2] -= in[2]; | ||
| 346 | out[3] -= in[3]; | ||
| 347 | out[4] -= in[4]; | ||
| 348 | out[5] -= in[5]; | ||
| 349 | out[6] -= in[6]; | ||
| 350 | out[7] -= in[7]; | ||
| 351 | } | ||
| 352 | |||
| 353 | #define two64m0 (((limb)1) << 64) - 1 | ||
| 354 | #define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1 | ||
| 355 | #define two64m46 (((limb)1) << 64) - (((limb)1) << 46) | ||
| 356 | #define two64m32 (((limb)1) << 64) - (((limb)1) << 32) | ||
| 357 | |||
| 358 | /* zero110 is 0 mod p */ | ||
| 359 | static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 }; | ||
| 360 | |||
/* felem_shrink converts an felem into a smallfelem. The result isn't quite
 * minimal as the value may be greater than p.
 *
 * Constant-time: the conditional subtraction of kPrime is performed with
 * masks, never branches.
 *
 * On entry:
 *   in[i] < 2^109
 * On exit:
 *   out[i] < 2^64
 */
static void felem_shrink(smallfelem out, const felem in)
	{
	felem tmp;
	u64 a, b, mask;
	s64 high, low;
	static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */

	/* Carry 2->3 */
	tmp[3] = zero110[3] + in[3] + ((u64) (in[2] >> 64));
	/* tmp[3] < 2^110 */

	tmp[2] = zero110[2] + (u64) in[2];
	tmp[0] = zero110[0] + in[0];
	tmp[1] = zero110[1] + in[1];
	/* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */

	/* We perform two partial reductions where we eliminate the
	 * high-word of tmp[3]. We don't update the other words till the end.
	 */
	a = tmp[3] >> 64; /* a < 2^46 */
	tmp[3] = (u64) tmp[3];
	tmp[3] -= a;
	tmp[3] += ((limb)a) << 32;
	/* tmp[3] < 2^79 */

	b = a;
	a = tmp[3] >> 64; /* a < 2^15 */
	b += a; /* b < 2^46 + 2^15 < 2^47 */
	tmp[3] = (u64) tmp[3];
	tmp[3] -= a;
	tmp[3] += ((limb)a) << 32;
	/* tmp[3] < 2^64 + 2^47 */

	/* This adjusts the other two words to complete the two partial
	 * reductions. */
	tmp[0] += b;
	tmp[1] -= (((limb)b) << 32);

	/* In order to make space in tmp[3] for the carry from 2 -> 3, we
	 * conditionally subtract kPrime if tmp[3] is large enough. */
	high = tmp[3] >> 64;
	/* As tmp[3] < 2^65, high is either 1 or 0 */
	high <<= 63;
	high >>= 63;
	/* high is:
	 *   all ones   if the high word of tmp[3] is 1
	 *   all zeros  if the high word of tmp[3] is 0 */
	low = tmp[3];
	mask = low >> 63;
	/* mask is:
	 *   all ones   if the MSB of low is 1
	 *   all zeros  if the MSB of low is 0 */
	low &= bottom63bits;
	low -= kPrime3Test;
	/* if low was greater than kPrime3Test then the MSB is zero */
	low = ~low;
	low >>= 63;
	/* low is:
	 *   all ones   if low was > kPrime3Test
	 *   all zeros  if low was <= kPrime3Test */
	mask = (mask & low) | high;
	tmp[0] -= mask & kPrime[0];
	tmp[1] -= mask & kPrime[1];
	/* kPrime[2] is zero, so omitted */
	tmp[3] -= mask & kPrime[3];
	/* tmp[3] < 2**64 - 2**32 + 1 */

	/* Propagate carries so every limb fits in 64 bits. */
	tmp[1] += ((u64) (tmp[0] >> 64)); tmp[0] = (u64) tmp[0];
	tmp[2] += ((u64) (tmp[1] >> 64)); tmp[1] = (u64) tmp[1];
	tmp[3] += ((u64) (tmp[2] >> 64)); tmp[2] = (u64) tmp[2];
	/* tmp[i] < 2^64 */

	out[0] = tmp[0];
	out[1] = tmp[1];
	out[2] = tmp[2];
	out[3] = tmp[3];
	}
| 446 | |||
| 447 | /* smallfelem_expand converts a smallfelem to an felem */ | ||
| 448 | static void smallfelem_expand(felem out, const smallfelem in) | ||
| 449 | { | ||
| 450 | out[0] = in[0]; | ||
| 451 | out[1] = in[1]; | ||
| 452 | out[2] = in[2]; | ||
| 453 | out[3] = in[3]; | ||
| 454 | } | ||
| 455 | |||
| 456 | /* smallfelem_square sets |out| = |small|^2 | ||
| 457 | * On entry: | ||
| 458 | * small[i] < 2^64 | ||
| 459 | * On exit: | ||
| 460 | * out[i] < 7 * 2^64 < 2^67 | ||
| 461 | */ | ||
| 462 | static void smallfelem_square(longfelem out, const smallfelem small) | ||
| 463 | { | ||
| 464 | limb a; | ||
| 465 | u64 high, low; | ||
| 466 | |||
| 467 | a = ((uint128_t) small[0]) * small[0]; | ||
| 468 | low = a; | ||
| 469 | high = a >> 64; | ||
| 470 | out[0] = low; | ||
| 471 | out[1] = high; | ||
| 472 | |||
| 473 | a = ((uint128_t) small[0]) * small[1]; | ||
| 474 | low = a; | ||
| 475 | high = a >> 64; | ||
| 476 | out[1] += low; | ||
| 477 | out[1] += low; | ||
| 478 | out[2] = high; | ||
| 479 | |||
| 480 | a = ((uint128_t) small[0]) * small[2]; | ||
| 481 | low = a; | ||
| 482 | high = a >> 64; | ||
| 483 | out[2] += low; | ||
| 484 | out[2] *= 2; | ||
| 485 | out[3] = high; | ||
| 486 | |||
| 487 | a = ((uint128_t) small[0]) * small[3]; | ||
| 488 | low = a; | ||
| 489 | high = a >> 64; | ||
| 490 | out[3] += low; | ||
| 491 | out[4] = high; | ||
| 492 | |||
| 493 | a = ((uint128_t) small[1]) * small[2]; | ||
| 494 | low = a; | ||
| 495 | high = a >> 64; | ||
| 496 | out[3] += low; | ||
| 497 | out[3] *= 2; | ||
| 498 | out[4] += high; | ||
| 499 | |||
| 500 | a = ((uint128_t) small[1]) * small[1]; | ||
| 501 | low = a; | ||
| 502 | high = a >> 64; | ||
| 503 | out[2] += low; | ||
| 504 | out[3] += high; | ||
| 505 | |||
| 506 | a = ((uint128_t) small[1]) * small[3]; | ||
| 507 | low = a; | ||
| 508 | high = a >> 64; | ||
| 509 | out[4] += low; | ||
| 510 | out[4] *= 2; | ||
| 511 | out[5] = high; | ||
| 512 | |||
| 513 | a = ((uint128_t) small[2]) * small[3]; | ||
| 514 | low = a; | ||
| 515 | high = a >> 64; | ||
| 516 | out[5] += low; | ||
| 517 | out[5] *= 2; | ||
| 518 | out[6] = high; | ||
| 519 | out[6] += high; | ||
| 520 | |||
| 521 | a = ((uint128_t) small[2]) * small[2]; | ||
| 522 | low = a; | ||
| 523 | high = a >> 64; | ||
| 524 | out[4] += low; | ||
| 525 | out[5] += high; | ||
| 526 | |||
| 527 | a = ((uint128_t) small[3]) * small[3]; | ||
| 528 | low = a; | ||
| 529 | high = a >> 64; | ||
| 530 | out[6] += low; | ||
| 531 | out[7] = high; | ||
| 532 | } | ||
| 533 | |||
| 534 | /* felem_square sets |out| = |in|^2 | ||
| 535 | * On entry: | ||
| 536 | * in[i] < 2^109 | ||
| 537 | * On exit: | ||
| 538 | * out[i] < 7 * 2^64 < 2^67 | ||
| 539 | */ | ||
| 540 | static void felem_square(longfelem out, const felem in) | ||
| 541 | { | ||
| 542 | u64 small[4]; | ||
| 543 | felem_shrink(small, in); | ||
| 544 | smallfelem_square(out, small); | ||
| 545 | } | ||
| 546 | |||
| 547 | /* smallfelem_mul sets |out| = |small1| * |small2| | ||
| 548 | * On entry: | ||
| 549 | * small1[i] < 2^64 | ||
| 550 | * small2[i] < 2^64 | ||
| 551 | * On exit: | ||
| 552 | * out[i] < 7 * 2^64 < 2^67 | ||
| 553 | */ | ||
| 554 | static void smallfelem_mul(longfelem out, const smallfelem small1, const smallfelem small2) | ||
| 555 | { | ||
| 556 | limb a; | ||
| 557 | u64 high, low; | ||
| 558 | |||
| 559 | a = ((uint128_t) small1[0]) * small2[0]; | ||
| 560 | low = a; | ||
| 561 | high = a >> 64; | ||
| 562 | out[0] = low; | ||
| 563 | out[1] = high; | ||
| 564 | |||
| 565 | |||
| 566 | a = ((uint128_t) small1[0]) * small2[1]; | ||
| 567 | low = a; | ||
| 568 | high = a >> 64; | ||
| 569 | out[1] += low; | ||
| 570 | out[2] = high; | ||
| 571 | |||
| 572 | a = ((uint128_t) small1[1]) * small2[0]; | ||
| 573 | low = a; | ||
| 574 | high = a >> 64; | ||
| 575 | out[1] += low; | ||
| 576 | out[2] += high; | ||
| 577 | |||
| 578 | |||
| 579 | a = ((uint128_t) small1[0]) * small2[2]; | ||
| 580 | low = a; | ||
| 581 | high = a >> 64; | ||
| 582 | out[2] += low; | ||
| 583 | out[3] = high; | ||
| 584 | |||
| 585 | a = ((uint128_t) small1[1]) * small2[1]; | ||
| 586 | low = a; | ||
| 587 | high = a >> 64; | ||
| 588 | out[2] += low; | ||
| 589 | out[3] += high; | ||
| 590 | |||
| 591 | a = ((uint128_t) small1[2]) * small2[0]; | ||
| 592 | low = a; | ||
| 593 | high = a >> 64; | ||
| 594 | out[2] += low; | ||
| 595 | out[3] += high; | ||
| 596 | |||
| 597 | |||
| 598 | a = ((uint128_t) small1[0]) * small2[3]; | ||
| 599 | low = a; | ||
| 600 | high = a >> 64; | ||
| 601 | out[3] += low; | ||
| 602 | out[4] = high; | ||
| 603 | |||
| 604 | a = ((uint128_t) small1[1]) * small2[2]; | ||
| 605 | low = a; | ||
| 606 | high = a >> 64; | ||
| 607 | out[3] += low; | ||
| 608 | out[4] += high; | ||
| 609 | |||
| 610 | a = ((uint128_t) small1[2]) * small2[1]; | ||
| 611 | low = a; | ||
| 612 | high = a >> 64; | ||
| 613 | out[3] += low; | ||
| 614 | out[4] += high; | ||
| 615 | |||
| 616 | a = ((uint128_t) small1[3]) * small2[0]; | ||
| 617 | low = a; | ||
| 618 | high = a >> 64; | ||
| 619 | out[3] += low; | ||
| 620 | out[4] += high; | ||
| 621 | |||
| 622 | |||
| 623 | a = ((uint128_t) small1[1]) * small2[3]; | ||
| 624 | low = a; | ||
| 625 | high = a >> 64; | ||
| 626 | out[4] += low; | ||
| 627 | out[5] = high; | ||
| 628 | |||
| 629 | a = ((uint128_t) small1[2]) * small2[2]; | ||
| 630 | low = a; | ||
| 631 | high = a >> 64; | ||
| 632 | out[4] += low; | ||
| 633 | out[5] += high; | ||
| 634 | |||
| 635 | a = ((uint128_t) small1[3]) * small2[1]; | ||
| 636 | low = a; | ||
| 637 | high = a >> 64; | ||
| 638 | out[4] += low; | ||
| 639 | out[5] += high; | ||
| 640 | |||
| 641 | |||
| 642 | a = ((uint128_t) small1[2]) * small2[3]; | ||
| 643 | low = a; | ||
| 644 | high = a >> 64; | ||
| 645 | out[5] += low; | ||
| 646 | out[6] = high; | ||
| 647 | |||
| 648 | a = ((uint128_t) small1[3]) * small2[2]; | ||
| 649 | low = a; | ||
| 650 | high = a >> 64; | ||
| 651 | out[5] += low; | ||
| 652 | out[6] += high; | ||
| 653 | |||
| 654 | |||
| 655 | a = ((uint128_t) small1[3]) * small2[3]; | ||
| 656 | low = a; | ||
| 657 | high = a >> 64; | ||
| 658 | out[6] += low; | ||
| 659 | out[7] = high; | ||
| 660 | } | ||
| 661 | |||
| 662 | /* felem_mul sets |out| = |in1| * |in2| | ||
| 663 | * On entry: | ||
| 664 | * in1[i] < 2^109 | ||
| 665 | * in2[i] < 2^109 | ||
| 666 | * On exit: | ||
| 667 | * out[i] < 7 * 2^64 < 2^67 | ||
| 668 | */ | ||
| 669 | static void felem_mul(longfelem out, const felem in1, const felem in2) | ||
| 670 | { | ||
| 671 | smallfelem small1, small2; | ||
| 672 | felem_shrink(small1, in1); | ||
| 673 | felem_shrink(small2, in2); | ||
| 674 | smallfelem_mul(out, small1, small2); | ||
| 675 | } | ||
| 676 | |||
| 677 | /* felem_small_mul sets |out| = |small1| * |in2| | ||
| 678 | * On entry: | ||
| 679 | * small1[i] < 2^64 | ||
| 680 | * in2[i] < 2^109 | ||
| 681 | * On exit: | ||
| 682 | * out[i] < 7 * 2^64 < 2^67 | ||
| 683 | */ | ||
| 684 | static void felem_small_mul(longfelem out, const smallfelem small1, const felem in2) | ||
| 685 | { | ||
| 686 | smallfelem small2; | ||
| 687 | felem_shrink(small2, in2); | ||
| 688 | smallfelem_mul(out, small1, small2); | ||
| 689 | } | ||
| 690 | |||
| 691 | #define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4) | ||
| 692 | #define two100 (((limb)1) << 100) | ||
| 693 | #define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4) | ||
| 694 | /* zero100 is 0 mod p */ | ||
| 695 | static const felem zero100 = { two100m36m4, two100, two100m36p4, two100m36p4 }; | ||
| 696 | |||
/* Internal function for the different flavours of felem_reduce.
 * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 *
 * The bracketed lists below, e.g. [(0,1),(96,-1),(192,-1),(224,1)], record
 * how the power of two carried by each high limb rewrites mod p: each pair
 * is (bit position, coefficient) of a replacement term spread over the four
 * 64-bit output positions.  Arithmetic is signed 128-bit; callers pre-load
 * |out| with a multiple of p large enough that no limb goes negative.
 *
 * On entry:
 *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 *   out[1] >= in[7] + 2^32*in[4]
 *   out[2] >= in[5] + 2^32*in[5]
 *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 * On exit:
 *   out[0] <= out[0] + in[4] + 2^32*in[5]
 *   out[1] <= out[1] + in[5] + 2^33*in[6]
 *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 */
static void felem_reduce_(felem out, const longfelem in)
	{
	int128_t c;
	/* combine common terms from below */
	c = in[4] + (in[5] << 32);
	out[0] += c;
	out[3] -= c;

	c = in[5] - in[7];
	out[1] += c;
	out[2] -= c;

	/* the remaining terms */
	/* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
	out[1] -= (in[4] << 32);
	out[3] += (in[4] << 32);

	/* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
	out[2] -= (in[5] << 32);

	/* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
	out[0] -= in[6];
	out[0] -= (in[6] << 32);
	out[1] += (in[6] << 33);
	out[2] += (in[6] * 2);
	out[3] -= (in[6] << 32);

	/* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
	out[0] -= in[7];
	out[0] -= (in[7] << 32);
	out[2] += (in[7] << 33);
	out[3] += (in[7] * 3);
	}
| 743 | |||
/* felem_reduce converts a longfelem into an felem.
 * To be called directly after felem_square or felem_mul.
 * On entry:
 *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2^64
 * On exit:
 *   out[i] < 2^101
 */
static void felem_reduce(felem out, const longfelem in)
	{
	/* Seed each output limb with 0 mod p (zero100) so the signed
	 * adjustments in felem_reduce_ cannot drive any limb negative. */
	out[0] = zero100[0] + in[0];
	out[1] = zero100[1] + in[1];
	out[2] = zero100[2] + in[2];
	out[3] = zero100[3] + in[3];

	felem_reduce_(out, in);

	/* out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
	 * out[1] > 2^100 - 2^64 - 7*2^96 > 0
	 * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
	 * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
	 *
	 * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
	 * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
	 * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
	 * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
	 */
	}
| 772 | |||
/* felem_reduce_zero105 converts a larger longfelem into an felem.
 * On entry:
 *   in[0] < 2^71 (NOTE(review): the bound analysis below applies 2^71 to
 *   every in[i], so presumably all limbs satisfy it — confirm at call sites)
 * On exit:
 *   out[i] < 2^106
 */
static void felem_reduce_zero105(felem out, const longfelem in)
	{
	/* Seed each output limb with 0 mod p (zero105) so the signed
	 * adjustments in felem_reduce_ cannot drive any limb negative. */
	out[0] = zero105[0] + in[0];
	out[1] = zero105[1] + in[1];
	out[2] = zero105[2] + in[2];
	out[3] = zero105[3] + in[3];

	felem_reduce_(out, in);

	/* out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
	 * out[1] > 2^105 - 2^71 - 2^103 > 0
	 * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
	 * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
	 *
	 * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
	 * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
	 * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
	 * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
	 */
	}
| 799 | |||
| 800 | /* subtract_u64 sets *result = *result - v and *carry to one if the subtraction | ||
| 801 | * underflowed. */ | ||
| 802 | static void subtract_u64(u64* result, u64* carry, u64 v) | ||
| 803 | { | ||
| 804 | uint128_t r = *result; | ||
| 805 | r -= v; | ||
| 806 | *carry = (r >> 64) & 1; | ||
| 807 | *result = (u64) r; | ||
| 808 | } | ||
| 809 | |||
/* felem_contract converts |in| to its unique, minimal representation
 * (fully reduced below p), in constant time.
 * On entry:
 *   in[i] < 2^109
 */
static void felem_contract(smallfelem out, const felem in)
	{
	unsigned i;
	u64 all_equal_so_far = 0, result = 0, carry;

	felem_shrink(out, in);
	/* small is minimal except that the value might be > p */

	all_equal_so_far--;	/* wraps to all-ones: "equal so far" mask */
	/* We are doing a constant time test if out >= kPrime. We need to
	 * compare each u64, from most-significant to least significant. For
	 * each one, if all words so far have been equal (m is all ones) then a
	 * non-equal result is the answer. Otherwise we continue. */
	for (i = 3; i < 4; i--)	/* i runs 3,2,1,0; unsigned wrap ends loop */
		{
		u64 equal;
		uint128_t a = ((uint128_t) kPrime[i]) - out[i];
		/* if out[i] > kPrime[i] then a will underflow and the high
		 * 64-bits will all be set. */
		result |= all_equal_so_far & ((u64) (a >> 64));

		/* if kPrime[i] == out[i] then |equal| will be all zeros and
		 * the decrement will make it all ones. */
		equal = kPrime[i] ^ out[i];
		equal--;
		/* smear the top bit of |equal| down through every bit */
		equal &= equal << 32;
		equal &= equal << 16;
		equal &= equal << 8;
		equal &= equal << 4;
		equal &= equal << 2;
		equal &= equal << 1;
		equal = ((s64) equal) >> 63;

		all_equal_so_far &= equal;
		}

	/* if all_equal_so_far is still all ones then the two values are equal
	 * and so out >= kPrime is true. */
	result |= all_equal_so_far;

	/* if out >= kPrime then we subtract kPrime (mask-selected), rippling
	 * the borrow through the higher limbs after each masked subtract. */
	subtract_u64(&out[0], &carry, result & kPrime[0]);
	subtract_u64(&out[1], &carry, carry);
	subtract_u64(&out[2], &carry, carry);
	subtract_u64(&out[3], &carry, carry);

	subtract_u64(&out[1], &carry, result & kPrime[1]);
	subtract_u64(&out[2], &carry, carry);
	subtract_u64(&out[3], &carry, carry);

	subtract_u64(&out[2], &carry, result & kPrime[2]);
	subtract_u64(&out[3], &carry, carry);

	subtract_u64(&out[3], &carry, result & kPrime[3]);
	}
| 869 | |||
| 870 | static void smallfelem_square_contract(smallfelem out, const smallfelem in) | ||
| 871 | { | ||
| 872 | longfelem longtmp; | ||
| 873 | felem tmp; | ||
| 874 | |||
| 875 | smallfelem_square(longtmp, in); | ||
| 876 | felem_reduce(tmp, longtmp); | ||
| 877 | felem_contract(out, tmp); | ||
| 878 | } | ||
| 879 | |||
| 880 | static void smallfelem_mul_contract(smallfelem out, const smallfelem in1, const smallfelem in2) | ||
| 881 | { | ||
| 882 | longfelem longtmp; | ||
| 883 | felem tmp; | ||
| 884 | |||
| 885 | smallfelem_mul(longtmp, in1, in2); | ||
| 886 | felem_reduce(tmp, longtmp); | ||
| 887 | felem_contract(out, tmp); | ||
| 888 | } | ||
| 889 | |||
/* smallfelem_is_zero returns a limb with all bits set if |small| == 0 (mod p)
 * and 0 otherwise, in constant time.  Both representations of zero are
 * detected: the value 0 and the value p itself (felem_shrink may leave
 * either).
 * On entry:
 *   small[i] < 2^64
 */
static limb smallfelem_is_zero(const smallfelem small)
	{
	limb result;
	u64 is_p;

	u64 is_zero = small[0] | small[1] | small[2] | small[3];
	/* is_zero is 0 iff all limbs were 0; the decrement then makes it
	 * all-ones, and the smear below turns "any bit clear" into 0. */
	is_zero--;
	is_zero &= is_zero << 32;
	is_zero &= is_zero << 16;
	is_zero &= is_zero << 8;
	is_zero &= is_zero << 4;
	is_zero &= is_zero << 2;
	is_zero &= is_zero << 1;
	is_zero = ((s64) is_zero) >> 63;	/* arithmetic smear of top bit */

	/* Same trick, testing small == kPrime (p, the other zero form). */
	is_p = (small[0] ^ kPrime[0]) |
	    (small[1] ^ kPrime[1]) |
	    (small[2] ^ kPrime[2]) |
	    (small[3] ^ kPrime[3]);
	is_p--;
	is_p &= is_p << 32;
	is_p &= is_p << 16;
	is_p &= is_p << 8;
	is_p &= is_p << 4;
	is_p &= is_p << 2;
	is_p &= is_p << 1;
	is_p = ((s64) is_p) >> 63;

	is_zero |= is_p;

	/* Duplicate the 64-bit mask into both halves of the 128-bit limb. */
	result = is_zero;
	result |= ((limb) is_zero) << 64;
	return result;
	}
| 929 | |||
| 930 | static int smallfelem_is_zero_int(const smallfelem small) | ||
| 931 | { | ||
| 932 | return (int) (smallfelem_is_zero(small) & ((limb)1)); | ||
| 933 | } | ||
| 934 | |||
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The fixed square-and-multiply addition chain below raises |in| to
 * p - 2 = 2^256 - 2^224 + 2^192 + 2^96 - 3 (see the final comment).  Each
 * trailing comment records the exponent accumulated so far; the statement
 * order is load-bearing and must not change.
 */
static void felem_inv(felem out, const felem in)
	{
	felem ftmp, ftmp2;
	/* each e_I will hold |in|^{2^I - 1} */
	felem e2, e4, e8, e16, e32, e64;
	longfelem tmp;
	unsigned i;

	felem_square(tmp, in); felem_reduce(ftmp, tmp);			/* 2^1 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);		/* 2^2 - 2^0 */
	felem_assign(e2, ftmp);
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^3 - 2^1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^4 - 2^2 */
	felem_mul(tmp, ftmp, e2); felem_reduce(ftmp, tmp);		/* 2^4 - 2^0 */
	felem_assign(e4, ftmp);
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^5 - 2^1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^6 - 2^2 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^7 - 2^3 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^8 - 2^4 */
	felem_mul(tmp, ftmp, e4); felem_reduce(ftmp, tmp);		/* 2^8 - 2^0 */
	felem_assign(e8, ftmp);
	for (i = 0; i < 8; i++) {
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}								/* 2^16 - 2^8 */
	felem_mul(tmp, ftmp, e8); felem_reduce(ftmp, tmp);		/* 2^16 - 2^0 */
	felem_assign(e16, ftmp);
	for (i = 0; i < 16; i++) {
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}								/* 2^32 - 2^16 */
	felem_mul(tmp, ftmp, e16); felem_reduce(ftmp, tmp);		/* 2^32 - 2^0 */
	felem_assign(e32, ftmp);
	for (i = 0; i < 32; i++) {
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}								/* 2^64 - 2^32 */
	felem_assign(e64, ftmp);
	felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp);		/* 2^64 - 2^32 + 2^0 */
	for (i = 0; i < 192; i++) {
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}								/* 2^256 - 2^224 + 2^192 */

	/* Build the low part of the exponent (2^96 - 3) in ftmp2. */
	felem_mul(tmp, e64, e32); felem_reduce(ftmp2, tmp);		/* 2^64 - 2^0 */
	for (i = 0; i < 16; i++) {
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
	}								/* 2^80 - 2^16 */
	felem_mul(tmp, ftmp2, e16); felem_reduce(ftmp2, tmp);		/* 2^80 - 2^0 */
	for (i = 0; i < 8; i++) {
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
	}								/* 2^88 - 2^8 */
	felem_mul(tmp, ftmp2, e8); felem_reduce(ftmp2, tmp);		/* 2^88 - 2^0 */
	for (i = 0; i < 4; i++) {
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
	}								/* 2^92 - 2^4 */
	felem_mul(tmp, ftmp2, e4); felem_reduce(ftmp2, tmp);		/* 2^92 - 2^0 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^93 - 2^1 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^94 - 2^2 */
	felem_mul(tmp, ftmp2, e2); felem_reduce(ftmp2, tmp);		/* 2^94 - 2^0 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^95 - 2^1 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^96 - 2^2 */
	felem_mul(tmp, ftmp2, in); felem_reduce(ftmp2, tmp);		/* 2^96 - 3 */

	felem_mul(tmp, ftmp2, ftmp); felem_reduce(out, tmp);		/* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
	}
| 1004 | |||
| 1005 | static void smallfelem_inv_contract(smallfelem out, const smallfelem in) | ||
| 1006 | { | ||
| 1007 | felem tmp; | ||
| 1008 | |||
| 1009 | smallfelem_expand(tmp, in); | ||
| 1010 | felem_inv(tmp, tmp); | ||
| 1011 | felem_contract(out, tmp); | ||
| 1012 | } | ||
| 1013 | |||
| 1014 | /* Group operations | ||
| 1015 | * ---------------- | ||
| 1016 | * | ||
| 1017 | * Building on top of the field operations we have the operations on the | ||
| 1018 | * elliptic curve group itself. Points on the curve are represented in Jacobian | ||
| 1019 | * coordinates */ | ||
| 1020 | |||
| 1021 | /* point_double calculates 2*(x_in, y_in, z_in) | ||
| 1022 | * | ||
| 1023 | * The method is taken from: | ||
| 1024 | * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b | ||
| 1025 | * | ||
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested). */
static void
point_double(felem x_out, felem y_out, felem z_out,
	const felem x_in, const felem y_in, const felem z_in)
	{
	/* The bracketed comments below track per-limb magnitude bounds;
	 * they justify that no intermediate overflows before the next
	 * felem_reduce/felem_shrink. Do not reorder operations without
	 * re-deriving them. */
	longfelem tmp, tmp2;
	felem delta, gamma, beta, alpha, ftmp, ftmp2;
	smallfelem small1, small2;

	felem_assign(ftmp, x_in);
	/* ftmp[i] < 2^106 */
	felem_assign(ftmp2, x_in);
	/* ftmp2[i] < 2^106 */

	/* delta = z^2 */
	felem_square(tmp, z_in);
	felem_reduce(delta, tmp);
	/* delta[i] < 2^101 */

	/* gamma = y^2 */
	felem_square(tmp, y_in);
	felem_reduce(gamma, tmp);
	/* gamma[i] < 2^101 */
	felem_shrink(small1, gamma);

	/* beta = x*gamma */
	felem_small_mul(tmp, small1, x_in);
	felem_reduce(beta, tmp);
	/* beta[i] < 2^101 */

	/* alpha = 3*(x-delta)*(x+delta)
	 * (this shortcut for 3*x^2 + a*z^4 is valid because the curve
	 * coefficient a is -3; see the dbl-2001-b reference above) */
	felem_diff(ftmp, delta);
	/* ftmp[i] < 2^105 + 2^106 < 2^107 */
	felem_sum(ftmp2, delta);
	/* ftmp2[i] < 2^105 + 2^106 < 2^107 */
	felem_scalar(ftmp2, 3);
	/* ftmp2[i] < 3 * 2^107 < 2^109 */
	felem_mul(tmp, ftmp, ftmp2);
	felem_reduce(alpha, tmp);
	/* alpha[i] < 2^101 */
	felem_shrink(small2, alpha);

	/* x' = alpha^2 - 8*beta */
	smallfelem_square(tmp, small2);
	felem_reduce(x_out, tmp);
	felem_assign(ftmp, beta);
	felem_scalar(ftmp, 8);
	/* ftmp[i] < 8 * 2^101 = 2^104 */
	felem_diff(x_out, ftmp);
	/* x_out[i] < 2^105 + 2^101 < 2^106 */

	/* z' = (y + z)^2 - gamma - delta */
	felem_sum(delta, gamma);
	/* delta[i] < 2^101 + 2^101 = 2^102 */
	felem_assign(ftmp, y_in);
	felem_sum(ftmp, z_in);
	/* ftmp[i] < 2^106 + 2^106 = 2^107 */
	felem_square(tmp, ftmp);
	felem_reduce(z_out, tmp);
	felem_diff(z_out, delta);
	/* z_out[i] < 2^105 + 2^101 < 2^106 */

	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
	felem_scalar(beta, 4);
	/* beta[i] < 4 * 2^101 = 2^103 */
	felem_diff_zero107(beta, x_out);
	/* beta[i] < 2^107 + 2^103 < 2^108 */
	felem_small_mul(tmp, small2, beta);
	/* tmp[i] < 7 * 2^64 < 2^67 */
	smallfelem_square(tmp2, small1);
	/* tmp2[i] < 7 * 2^64 */
	longfelem_scalar(tmp2, 8);
	/* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
	longfelem_diff(tmp, tmp2);
	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
	felem_reduce_zero105(y_out, tmp);
	/* y_out[i] < 2^106 */
	}
| 1105 | |||
| 1106 | /* point_double_small is the same as point_double, except that it operates on | ||
| 1107 | * smallfelems */ | ||
| 1108 | static void | ||
| 1109 | point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out, | ||
| 1110 | const smallfelem x_in, const smallfelem y_in, const smallfelem z_in) | ||
| 1111 | { | ||
| 1112 | felem felem_x_out, felem_y_out, felem_z_out; | ||
| 1113 | felem felem_x_in, felem_y_in, felem_z_in; | ||
| 1114 | |||
| 1115 | smallfelem_expand(felem_x_in, x_in); | ||
| 1116 | smallfelem_expand(felem_y_in, y_in); | ||
| 1117 | smallfelem_expand(felem_z_in, z_in); | ||
| 1118 | point_double(felem_x_out, felem_y_out, felem_z_out, | ||
| 1119 | felem_x_in, felem_y_in, felem_z_in); | ||
| 1120 | felem_shrink(x_out, felem_x_out); | ||
| 1121 | felem_shrink(y_out, felem_y_out); | ||
| 1122 | felem_shrink(z_out, felem_z_out); | ||
| 1123 | } | ||
| 1124 | |||
| 1125 | /* copy_conditional copies in to out iff mask is all ones. */ | ||
| 1126 | static void | ||
| 1127 | copy_conditional(felem out, const felem in, limb mask) | ||
| 1128 | { | ||
| 1129 | unsigned i; | ||
| 1130 | for (i = 0; i < NLIMBS; ++i) | ||
| 1131 | { | ||
| 1132 | const limb tmp = mask & (in[i] ^ out[i]); | ||
| 1133 | out[i] ^= tmp; | ||
| 1134 | } | ||
| 1135 | } | ||
| 1136 | |||
| 1137 | /* copy_small_conditional copies in to out iff mask is all ones. */ | ||
| 1138 | static void | ||
| 1139 | copy_small_conditional(felem out, const smallfelem in, limb mask) | ||
| 1140 | { | ||
| 1141 | unsigned i; | ||
| 1142 | const u64 mask64 = mask; | ||
| 1143 | for (i = 0; i < NLIMBS; ++i) | ||
| 1144 | { | ||
| 1145 | out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask); | ||
| 1146 | } | ||
| 1147 | } | ||
| 1148 | |||
/* point_add calculates (x1, y1, z1) + (x2, y2, z2)
| 1150 | * | ||
| 1151 | * The method is taken from: | ||
| 1152 | * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, | ||
| 1153 | * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). | ||
| 1154 | * | ||
| 1155 | * This function includes a branch for checking whether the two input points | ||
| 1156 | * are equal, (while not equal to the point at infinity). This case never | ||
| 1157 | * happens during single point multiplication, so there is no timing leak for | ||
| 1158 | * ECDH or ECDSA signing. */ | ||
static void point_add(felem x3, felem y3, felem z3,
	const felem x1, const felem y1, const felem z1,
	const int mixed, const smallfelem x2, const smallfelem y2, const smallfelem z2)
	{
	/* add-2007-bl with the second point in small form. The bracketed
	 * comments track per-limb magnitude bounds between reductions. */
	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
	longfelem tmp, tmp2;
	smallfelem small1, small2, small3, small4, small5;
	limb x_equal, y_equal, z1_is_zero, z2_is_zero;

	felem_shrink(small3, z1);

	/* infinity flags for the two inputs, consumed by the masked
	 * copies at the end of the function */
	z1_is_zero = smallfelem_is_zero(small3);
	z2_is_zero = smallfelem_is_zero(z2);

	/* ftmp = z1z1 = z1**2 */
	smallfelem_square(tmp, small3);
	felem_reduce(ftmp, tmp);
	/* ftmp[i] < 2^101 */
	felem_shrink(small1, ftmp);

	if(!mixed)
		{
		/* ftmp2 = z2z2 = z2**2 */
		smallfelem_square(tmp, z2);
		felem_reduce(ftmp2, tmp);
		/* ftmp2[i] < 2^101 */
		felem_shrink(small2, ftmp2);

		felem_shrink(small5, x1);

		/* u1 = ftmp3 = x1*z2z2 */
		smallfelem_mul(tmp, small5, small2);
		felem_reduce(ftmp3, tmp);
		/* ftmp3[i] < 2^101 */

		/* ftmp5 = z1 + z2 */
		felem_assign(ftmp5, z1);
		felem_small_sum(ftmp5, z2);
		/* ftmp5[i] < 2^107 */

		/* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
		felem_square(tmp, ftmp5);
		felem_reduce(ftmp5, tmp);
		/* ftmp2 = z2z2 + z1z1 */
		felem_sum(ftmp2, ftmp);
		/* ftmp2[i] < 2^101 + 2^101 = 2^102 */
		felem_diff(ftmp5, ftmp2);
		/* ftmp5[i] < 2^105 + 2^101 < 2^106 */

		/* ftmp2 = z2 * z2z2 */
		smallfelem_mul(tmp, small2, z2);
		felem_reduce(ftmp2, tmp);

		/* s1 = ftmp2 = y1 * z2**3 */
		felem_mul(tmp, y1, ftmp2);
		felem_reduce(ftmp6, tmp);
		/* ftmp6[i] < 2^101 */
		}
	else
		{
		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */

		/* u1 = ftmp3 = x1*z2z2 */
		felem_assign(ftmp3, x1);
		/* ftmp3[i] < 2^106 */

		/* ftmp5 = 2z1z2 */
		felem_assign(ftmp5, z1);
		felem_scalar(ftmp5, 2);
		/* ftmp5[i] < 2*2^106 = 2^107 */

		/* s1 = ftmp2 = y1 * z2**3 */
		felem_assign(ftmp6, y1);
		/* ftmp6[i] < 2^106 */
		}

	/* u2 = x2*z1z1 */
	smallfelem_mul(tmp, x2, small1);
	felem_reduce(ftmp4, tmp);

	/* h = ftmp4 = u2 - u1 */
	felem_diff_zero107(ftmp4, ftmp3);
	/* ftmp4[i] < 2^107 + 2^101 < 2^108 */
	felem_shrink(small4, ftmp4);

	x_equal = smallfelem_is_zero(small4);

	/* z_out = ftmp5 * h */
	felem_small_mul(tmp, small4, ftmp5);
	felem_reduce(z_out, tmp);
	/* z_out[i] < 2^101 */

	/* ftmp = z1 * z1z1 */
	smallfelem_mul(tmp, small1, small3);
	felem_reduce(ftmp, tmp);

	/* s2 = tmp = y2 * z1**3 */
	felem_small_mul(tmp, y2, ftmp);
	felem_reduce(ftmp5, tmp);

	/* r = ftmp5 = (s2 - s1)*2 */
	felem_diff_zero107(ftmp5, ftmp6);
	/* ftmp5[i] < 2^107 + 2^107 = 2^108*/
	felem_scalar(ftmp5, 2);
	/* ftmp5[i] < 2^109 */
	felem_shrink(small1, ftmp5);
	y_equal = smallfelem_is_zero(small1);

	/* NOTE(review): this branch dispatches to the doubling formula
	 * when both inputs are the same finite point. Per the comment
	 * above the function, it is unreachable during single-scalar
	 * multiplication; confirm that callers performing multi-scalar
	 * multiplication cannot reach it with secret-dependent timing. */
	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
		{
		point_double(x3, y3, z3, x1, y1, z1);
		return;
		}

	/* I = ftmp = (2h)**2 */
	felem_assign(ftmp, ftmp4);
	felem_scalar(ftmp, 2);
	/* ftmp[i] < 2*2^108 = 2^109 */
	felem_square(tmp, ftmp);
	felem_reduce(ftmp, tmp);

	/* J = ftmp2 = h * I */
	felem_mul(tmp, ftmp4, ftmp);
	felem_reduce(ftmp2, tmp);

	/* V = ftmp4 = U1 * I */
	felem_mul(tmp, ftmp3, ftmp);
	felem_reduce(ftmp4, tmp);

	/* x_out = r**2 - J - 2V */
	smallfelem_square(tmp, small1);
	felem_reduce(x_out, tmp);
	felem_assign(ftmp3, ftmp4);
	felem_scalar(ftmp4, 2);
	felem_sum(ftmp4, ftmp2);
	/* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
	felem_diff(x_out, ftmp4);
	/* x_out[i] < 2^105 + 2^101 */

	/* y_out = r(V-x_out) - 2 * s1 * J */
	felem_diff_zero107(ftmp3, x_out);
	/* ftmp3[i] < 2^107 + 2^101 < 2^108 */
	felem_small_mul(tmp, small1, ftmp3);
	felem_mul(tmp2, ftmp6, ftmp2);
	longfelem_scalar(tmp2, 2);
	/* tmp2[i] < 2*2^67 = 2^68 */
	longfelem_diff(tmp, tmp2);
	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
	felem_reduce_zero105(y_out, tmp);
	/* y_out[i] < 2^106 */

	/* Handle the points at infinity with constant-time masked copies:
	 * if z1 == 0 the result is (x2, y2, z2); if z2 == 0 it is
	 * (x1, y1, z1). */
	copy_small_conditional(x_out, x2, z1_is_zero);
	copy_conditional(x_out, x1, z2_is_zero);
	copy_small_conditional(y_out, y2, z1_is_zero);
	copy_conditional(y_out, y1, z2_is_zero);
	copy_small_conditional(z_out, z2, z1_is_zero);
	copy_conditional(z_out, z1, z2_is_zero);
	felem_assign(x3, x_out);
	felem_assign(y3, y_out);
	felem_assign(z3, z_out);
	}
| 1320 | |||
| 1321 | /* point_add_small is the same as point_add, except that it operates on | ||
| 1322 | * smallfelems */ | ||
| 1323 | static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3, | ||
| 1324 | smallfelem x1, smallfelem y1, smallfelem z1, | ||
| 1325 | smallfelem x2, smallfelem y2, smallfelem z2) | ||
| 1326 | { | ||
| 1327 | felem felem_x3, felem_y3, felem_z3; | ||
| 1328 | felem felem_x1, felem_y1, felem_z1; | ||
| 1329 | smallfelem_expand(felem_x1, x1); | ||
| 1330 | smallfelem_expand(felem_y1, y1); | ||
| 1331 | smallfelem_expand(felem_z1, z1); | ||
| 1332 | point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0, x2, y2, z2); | ||
| 1333 | felem_shrink(x3, felem_x3); | ||
| 1334 | felem_shrink(y3, felem_y3); | ||
| 1335 | felem_shrink(z3, felem_z3); | ||
| 1336 | } | ||
| 1337 | |||
| 1338 | /* Base point pre computation | ||
| 1339 | * -------------------------- | ||
| 1340 | * | ||
| 1341 | * Two different sorts of precomputed tables are used in the following code. | ||
 * Each contains various points on the curve, where each point is three field
| 1343 | * elements (x, y, z). | ||
| 1344 | * | ||
| 1345 | * For the base point table, z is usually 1 (0 for the point at infinity). | ||
| 1346 | * This table has 2 * 16 elements, starting with the following: | ||
| 1347 | * index | bits | point | ||
| 1348 | * ------+---------+------------------------------ | ||
| 1349 | * 0 | 0 0 0 0 | 0G | ||
| 1350 | * 1 | 0 0 0 1 | 1G | ||
| 1351 | * 2 | 0 0 1 0 | 2^64G | ||
| 1352 | * 3 | 0 0 1 1 | (2^64 + 1)G | ||
| 1353 | * 4 | 0 1 0 0 | 2^128G | ||
| 1354 | * 5 | 0 1 0 1 | (2^128 + 1)G | ||
| 1355 | * 6 | 0 1 1 0 | (2^128 + 2^64)G | ||
| 1356 | * 7 | 0 1 1 1 | (2^128 + 2^64 + 1)G | ||
| 1357 | * 8 | 1 0 0 0 | 2^192G | ||
| 1358 | * 9 | 1 0 0 1 | (2^192 + 1)G | ||
| 1359 | * 10 | 1 0 1 0 | (2^192 + 2^64)G | ||
| 1360 | * 11 | 1 0 1 1 | (2^192 + 2^64 + 1)G | ||
| 1361 | * 12 | 1 1 0 0 | (2^192 + 2^128)G | ||
| 1362 | * 13 | 1 1 0 1 | (2^192 + 2^128 + 1)G | ||
| 1363 | * 14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G | ||
| 1364 | * 15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G | ||
| 1365 | * followed by a copy of this with each element multiplied by 2^32. | ||
| 1366 | * | ||
| 1367 | * The reason for this is so that we can clock bits into four different | ||
| 1368 | * locations when doing simple scalar multiplies against the base point, | ||
| 1369 | * and then another four locations using the second 16 elements. | ||
| 1370 | * | ||
| 1371 | * Tables for other points have table[i] = iG for i in 0 .. 16. */ | ||
| 1372 | |||
| 1373 | /* gmul is the table of precomputed base points */ | ||
static const smallfelem gmul[2][16][3] =
/* gmul[0]: multiples of G indexed by bits at positions 0/64/128/192 of
 * the scalar (see table above); entry 0 is the point at infinity and
 * each point is (x, y, z) with z = 1 for finite points. Limbs are
 * little-endian (least significant limb first). */
{{{{0, 0, 0, 0},
   {0, 0, 0, 0},
   {0, 0, 0, 0}},
  {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2, 0x6b17d1f2e12c4247},
   {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16, 0x4fe342e2fe1a7f9b},
   {1, 0, 0, 0}},
  {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de, 0x0fa822bc2811aaa5},
   {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b, 0xbff44ae8f5dba80d},
   {1, 0, 0, 0}},
  {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789, 0x300a4bbc89d6726f},
   {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f, 0x72aac7e0d09b4644},
   {1, 0, 0, 0}},
  {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e, 0x447d739beedb5e67},
   {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7, 0x2d4825ab834131ee},
   {1, 0, 0, 0}},
  {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60, 0xef9519328a9c72ff},
   {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c, 0x611e9fc37dbb2c9b},
   {1, 0, 0, 0}},
  {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf, 0x550663797b51f5d8},
   {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5, 0x157164848aecb851},
   {1, 0, 0, 0}},
  {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391, 0xeb5d7745b21141ea},
   {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee, 0xeafd72ebdbecc17b},
   {1, 0, 0, 0}},
  {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5, 0xa6d39677a7849276},
   {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf, 0x674f84749b0b8816},
   {1, 0, 0, 0}},
  {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb, 0x4e769e7672c9ddad},
   {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281, 0x42b99082de830663},
   {1, 0, 0, 0}},
  {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478, 0x78878ef61c6ce04d},
   {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def, 0xb6cb3f5d7b72c321},
   {1, 0, 0, 0}},
  {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae, 0x0c88bc4d716b1287},
   {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa, 0xdd5ddea3f3901dc6},
   {1, 0, 0, 0}},
  {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3, 0x68f344af6b317466},
   {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3, 0x31b9c405f8540a20},
   {1, 0, 0, 0}},
  {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0, 0x4052bf4b6f461db9},
   {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8, 0xfecf4d5190b0fc61},
   {1, 0, 0, 0}},
  {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a, 0x1eddbae2c802e41a},
   {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0, 0x43104d86560ebcfc},
   {1, 0, 0, 0}},
  {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a, 0xb48e26b484f7a21c},
   {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668, 0xfac015404d4d3dab},
   {1, 0, 0, 0}}},
/* gmul[1]: the same multiples scaled by 2^32, used for bits at
 * positions 32/96/160/224 of the scalar. */
 {{{0, 0, 0, 0},
   {0, 0, 0, 0},
   {0, 0, 0, 0}},
  {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da, 0x7fe36b40af22af89},
   {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1, 0xe697d45825b63624},
   {1, 0, 0, 0}},
  {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902, 0x4a5b506612a677a6},
   {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40, 0xeb13461ceac089f1},
   {1, 0, 0, 0}},
  {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857, 0x0781b8291c6a220a},
   {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434, 0x690cde8df0151593},
   {1, 0, 0, 0}},
  {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326, 0x8a535f566ec73617},
   {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf, 0x0455c08468b08bd7},
   {1, 0, 0, 0}},
  {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279, 0x06bada7ab77f8276},
   {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70, 0x5b476dfd0e6cb18a},
   {1, 0, 0, 0}},
  {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8, 0x3e29864e8a2ec908},
   {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed, 0x239b90ea3dc31e7e},
   {1, 0, 0, 0}},
  {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4, 0x820f4dd949f72ff7},
   {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3, 0x140406ec783a05ec},
   {1, 0, 0, 0}},
  {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe, 0x68f6b8542783dfee},
   {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028, 0xcbe1feba92e40ce6},
   {1, 0, 0, 0}},
  {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927, 0xd0b2f94d2f420109},
   {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a, 0x971459828b0719e5},
   {1, 0, 0, 0}},
  {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687, 0x961610004a866aba},
   {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c, 0x7acb9fadcee75e44},
   {1, 0, 0, 0}},
  {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea, 0x24eb9acca333bf5b},
   {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d, 0x69f891c5acd079cc},
   {1, 0, 0, 0}},
  {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514, 0xe51f547c5972a107},
   {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06, 0x1c309a2b25bb1387},
   {1, 0, 0, 0}},
  {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828, 0x20b87b8aa2c4e503},
   {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044, 0xf5c6fa49919776be},
   {1, 0, 0, 0}},
  {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56, 0x1ed7d1b9332010b9},
   {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24, 0x3a2b03f03217257a},
   {1, 0, 0, 0}},
  {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b, 0x15fee545c78dd9f6},
   {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb, 0x4ab5b6b2b8753f81},
   {1, 0, 0, 0}}}};
| 1471 | |||
| 1472 | /* select_point selects the |idx|th point from a precomputation table and | ||
| 1473 | * copies it to out. */ | ||
| 1474 | static void select_point(const u64 idx, unsigned int size, const smallfelem pre_comp[16][3], smallfelem out[3]) | ||
| 1475 | { | ||
| 1476 | unsigned i, j; | ||
| 1477 | u64 *outlimbs = &out[0][0]; | ||
| 1478 | memset(outlimbs, 0, 3 * sizeof(smallfelem)); | ||
| 1479 | |||
| 1480 | for (i = 0; i < size; i++) | ||
| 1481 | { | ||
| 1482 | const u64 *inlimbs = (u64*) &pre_comp[i][0][0]; | ||
| 1483 | u64 mask = i ^ idx; | ||
| 1484 | mask |= mask >> 4; | ||
| 1485 | mask |= mask >> 2; | ||
| 1486 | mask |= mask >> 1; | ||
| 1487 | mask &= 1; | ||
| 1488 | mask--; | ||
| 1489 | for (j = 0; j < NLIMBS * 3; j++) | ||
| 1490 | outlimbs[j] |= inlimbs[j] & mask; | ||
| 1491 | } | ||
| 1492 | } | ||
| 1493 | |||
| 1494 | /* get_bit returns the |i|th bit in |in| */ | ||
| 1495 | static char get_bit(const felem_bytearray in, int i) | ||
| 1496 | { | ||
| 1497 | if ((i < 0) || (i >= 256)) | ||
| 1498 | return 0; | ||
| 1499 | return (in[i >> 3] >> (i & 7)) & 1; | ||
| 1500 | } | ||
| 1501 | |||
| 1502 | /* Interleaved point multiplication using precomputed point multiples: | ||
| 1503 | * The small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], | ||
| 1504 | * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple | ||
| 1505 | * of the generator, using certain (large) precomputed multiples in g_pre_comp. | ||
| 1506 | * Output point (X, Y, Z) is stored in x_out, y_out, z_out */ | ||
static void batch_mul(felem x_out, felem y_out, felem z_out,
	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
	const int mixed, const smallfelem pre_comp[][17][3], const smallfelem g_pre_comp[2][16][3])
	{
	int i, skip;
	unsigned num, gen_mul = (g_scalar != NULL);
	felem nq[3], ftmp;
	smallfelem tmp[3];
	u64 bits;
	u8 sign, digit;

	/* set nq to the point at infinity */
	memset(nq, 0, 3 * sizeof(felem));

	/* Loop over all scalars msb-to-lsb, interleaving additions
	 * of multiples of the generator (two in each of the last 32 rounds)
	 * and additions of other points multiples (every 5th round).
	 */
	skip = 1; /* save two point operations in the first round */
	/* Only 32 iterations are needed when there is just the generator
	 * scalar: its table lookups cover 8 bit positions per round. */
	for (i = (num_points ? 255 : 31); i >= 0; --i)
		{
		/* double */
		if (!skip)
			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

		/* add multiples of the generator */
		if (gen_mul && (i <= 31))
			{
			/* first, look 32 bits upwards: one bit from each of
			 * four 64-bit words of the scalar, matching the
			 * layout of g_pre_comp[1] (multiples scaled by 2^32) */
			bits = get_bit(g_scalar, i + 224) << 3;
			bits |= get_bit(g_scalar, i + 160) << 2;
			bits |= get_bit(g_scalar, i + 96) << 1;
			bits |= get_bit(g_scalar, i + 32);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp[1], tmp);

			if (!skip)
				{
				point_add(nq[0], nq[1], nq[2],
					nq[0], nq[1], nq[2],
					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
				}
			else
				{
				/* first addition: the accumulator is still at
				 * infinity, so just load the selected point */
				smallfelem_expand(nq[0], tmp[0]);
				smallfelem_expand(nq[1], tmp[1]);
				smallfelem_expand(nq[2], tmp[2]);
				skip = 0;
				}

			/* second, look at the current position */
			bits = get_bit(g_scalar, i + 192) << 3;
			bits |= get_bit(g_scalar, i + 128) << 2;
			bits |= get_bit(g_scalar, i + 64) << 1;
			bits |= get_bit(g_scalar, i);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp[0], tmp);
			point_add(nq[0], nq[1], nq[2],
				nq[0], nq[1], nq[2],
				1 /* mixed */, tmp[0], tmp[1], tmp[2]);
			}

		/* do other additions every 5 doublings */
		if (num_points && (i % 5 == 0))
			{
			/* loop over all scalars */
			for (num = 0; num < num_points; ++num)
				{
				/* 6-bit signed window: bits i-1 .. i+4 of the
				 * scalar, recoded into a sign and a digit in
				 * 0..16 by ec_GFp_nistp_recode_scalar_bits */
				bits = get_bit(scalars[num], i + 4) << 5;
				bits |= get_bit(scalars[num], i + 3) << 4;
				bits |= get_bit(scalars[num], i + 2) << 3;
				bits |= get_bit(scalars[num], i + 1) << 2;
				bits |= get_bit(scalars[num], i) << 1;
				bits |= get_bit(scalars[num], i - 1);
				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

				/* select the point to add or subtract, in constant time */
				select_point(digit, 17, pre_comp[num], tmp);
				smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative point */
				copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
				felem_contract(tmp[1], ftmp);

				if (!skip)
					{
					point_add(nq[0], nq[1], nq[2],
						nq[0], nq[1], nq[2],
						mixed, tmp[0], tmp[1], tmp[2]);
					}
				else
					{
					smallfelem_expand(nq[0], tmp[0]);
					smallfelem_expand(nq[1], tmp[1]);
					smallfelem_expand(nq[2], tmp[2]);
					skip = 0;
					}
				}
			}
		}
	felem_assign(x_out, nq[0]);
	felem_assign(y_out, nq[1]);
	felem_assign(z_out, nq[2]);
	}
| 1609 | |||
| 1610 | /* Precomputation for the group generator. */ | ||
typedef struct {
	/* two tables of 16 precomputed generator multiples; see the
	 * layout description above gmul */
	smallfelem g_pre_comp[2][16][3];
	/* reference count, managed with CRYPTO_add under
	 * CRYPTO_LOCK_EC_PRE_COMP */
	int references;
} NISTP256_PRE_COMP;
| 1615 | |||
const EC_METHOD *EC_GFp_nistp256_method(void)
	{
	/* Method table for the constant-time NIST P-256 implementation:
	 * nistp256-specific entries for group init, curve setup, affine
	 * coordinate recovery and scalar multiplication; all remaining
	 * slots fall back to the generic ec_GFp_simple_* / ec_GFp_nist_*
	 * implementations. */
	static const EC_METHOD ret = {
		EC_FLAGS_DEFAULT_OCT,
		NID_X9_62_prime_field,
		ec_GFp_nistp256_group_init,
		ec_GFp_simple_group_finish,
		ec_GFp_simple_group_clear_finish,
		ec_GFp_nist_group_copy,
		ec_GFp_nistp256_group_set_curve,
		ec_GFp_simple_group_get_curve,
		ec_GFp_simple_group_get_degree,
		ec_GFp_simple_group_check_discriminant,
		ec_GFp_simple_point_init,
		ec_GFp_simple_point_finish,
		ec_GFp_simple_point_clear_finish,
		ec_GFp_simple_point_copy,
		ec_GFp_simple_point_set_to_infinity,
		ec_GFp_simple_set_Jprojective_coordinates_GFp,
		ec_GFp_simple_get_Jprojective_coordinates_GFp,
		ec_GFp_simple_point_set_affine_coordinates,
		ec_GFp_nistp256_point_get_affine_coordinates,
		0 /* point_set_compressed_coordinates */,
		0 /* point2oct */,
		0 /* oct2point */,
		ec_GFp_simple_add,
		ec_GFp_simple_dbl,
		ec_GFp_simple_invert,
		ec_GFp_simple_is_at_infinity,
		ec_GFp_simple_is_on_curve,
		ec_GFp_simple_cmp,
		ec_GFp_simple_make_affine,
		ec_GFp_simple_points_make_affine,
		ec_GFp_nistp256_points_mul,
		ec_GFp_nistp256_precompute_mult,
		ec_GFp_nistp256_have_precompute_mult,
		ec_GFp_nist_field_mul,
		ec_GFp_nist_field_sqr,
		0 /* field_div */,
		0 /* field_encode */,
		0 /* field_decode */,
		0 /* field_set_to_one */ };

	return &ret;
	}
| 1661 | |||
| 1662 | /******************************************************************************/ | ||
| 1663 | /* FUNCTIONS TO MANAGE PRECOMPUTATION | ||
| 1664 | */ | ||
| 1665 | |||
| 1666 | static NISTP256_PRE_COMP *nistp256_pre_comp_new() | ||
| 1667 | { | ||
| 1668 | NISTP256_PRE_COMP *ret = NULL; | ||
| 1669 | ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret); | ||
| 1670 | if (!ret) | ||
| 1671 | { | ||
| 1672 | ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); | ||
| 1673 | return ret; | ||
| 1674 | } | ||
| 1675 | memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); | ||
| 1676 | ret->references = 1; | ||
| 1677 | return ret; | ||
| 1678 | } | ||
| 1679 | |||
| 1680 | static void *nistp256_pre_comp_dup(void *src_) | ||
| 1681 | { | ||
| 1682 | NISTP256_PRE_COMP *src = src_; | ||
| 1683 | |||
| 1684 | /* no need to actually copy, these objects never change! */ | ||
| 1685 | CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1686 | |||
| 1687 | return src_; | ||
| 1688 | } | ||
| 1689 | |||
| 1690 | static void nistp256_pre_comp_free(void *pre_) | ||
| 1691 | { | ||
| 1692 | int i; | ||
| 1693 | NISTP256_PRE_COMP *pre = pre_; | ||
| 1694 | |||
| 1695 | if (!pre) | ||
| 1696 | return; | ||
| 1697 | |||
| 1698 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1699 | if (i > 0) | ||
| 1700 | return; | ||
| 1701 | |||
| 1702 | OPENSSL_free(pre); | ||
| 1703 | } | ||
| 1704 | |||
| 1705 | static void nistp256_pre_comp_clear_free(void *pre_) | ||
| 1706 | { | ||
| 1707 | int i; | ||
| 1708 | NISTP256_PRE_COMP *pre = pre_; | ||
| 1709 | |||
| 1710 | if (!pre) | ||
| 1711 | return; | ||
| 1712 | |||
| 1713 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1714 | if (i > 0) | ||
| 1715 | return; | ||
| 1716 | |||
| 1717 | OPENSSL_cleanse(pre, sizeof *pre); | ||
| 1718 | OPENSSL_free(pre); | ||
| 1719 | } | ||
| 1720 | |||
| 1721 | /******************************************************************************/ | ||
| 1722 | /* OPENSSL EC_METHOD FUNCTIONS | ||
| 1723 | */ | ||
| 1724 | |||
| 1725 | int ec_GFp_nistp256_group_init(EC_GROUP *group) | ||
| 1726 | { | ||
| 1727 | int ret; | ||
| 1728 | ret = ec_GFp_simple_group_init(group); | ||
| 1729 | group->a_is_minus3 = 1; | ||
| 1730 | return ret; | ||
| 1731 | } | ||
| 1732 | |||
/* Sets the curve parameters, accepting only the exact NIST P-256 values
 * baked into nistp256_curve_params; any other (p, a, b) is rejected with
 * EC_R_WRONG_CURVE_PARAMETERS. Returns 1 on success, 0 on failure. */
int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
	const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
	{
	int ret = 0;
	BN_CTX *new_ctx = NULL;
	BIGNUM *curve_p, *curve_a, *curve_b;

	/* allocate a context of our own if the caller did not supply one */
	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
		((curve_a = BN_CTX_get(ctx)) == NULL) ||
		((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
	BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
	BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
	BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
	if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
		(BN_cmp(curve_b, b)))
		{
		ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
			EC_R_WRONG_CURVE_PARAMETERS);
		goto err;
		}
	/* install the specialised NIST reduction for this field */
	group->field_mod_func = BN_nist_mod_256;
	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
err:
	/* BN_CTX_end pairs with the BN_CTX_start above; the context is
	 * only freed if we allocated it ourselves */
	BN_CTX_end(ctx);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	return ret;
	}
| 1764 | |||
/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
 * (X', Y') = (X/Z^2, Y/Z^3).
 * Either of |x| and |y| may be NULL if that coordinate is not wanted.
 * Returns 1 on success; 0 (with an EC error pushed) if |point| is the
 * point at infinity or a conversion fails. */
int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
	{
	felem z1, z2, x_in, y_in;
	smallfelem x_out, y_out;
	longfelem tmp;

	/* the point at infinity has no affine representation */
	if (EC_POINT_is_at_infinity(group, point))
		{
		ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
			EC_R_POINT_AT_INFINITY);
		return 0;
		}
	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
		(!BN_to_felem(z1, &point->Z))) return 0;
	/* z2 = Z^-1 */
	felem_inv(z2, z1);
	/* z1 = Z^-2; x_in = X * Z^-2 */
	felem_square(tmp, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
	/* contract to the unique minimal representation before export */
	felem_contract(x_out, x_in);
	if (x != NULL)
		{
		if (!smallfelem_to_BN(x, x_out)) {
			ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
				ERR_R_BN_LIB);
			return 0;
		}
		}
	/* z1 = Z^-3; y_in = Y * Z^-3 */
	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
	felem_contract(y_out, y_in);
	if (y != NULL)
		{
		if (!smallfelem_to_BN(y, y_out))
			{
			ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
				ERR_R_BN_LIB);
			return 0;
			}
		}
	return 1;
	}
| 1808 | |||
/* make_points_affine converts |num| Jacobian points (in place) to affine
 * form by delegating to the shared nistp batch helper, passing the
 * smallfelem primitives it needs as callbacks.  |tmp_smallfelems| must
 * provide num+1 elements of scratch space (see the parameter comment).
 * NOTE(review): presumably the helper amortises a single field inversion
 * across all points — confirm against ec_GFp_nistp_points_make_affine_internal. */
static void make_points_affine(size_t num, smallfelem points[/* num */][3], smallfelem tmp_smallfelems[/* num+1 */])
	{
	/* Runs in constant time, unless an input is the point at infinity
	 * (which normally shouldn't happen). */
	ec_GFp_nistp_points_make_affine_internal(
		num,
		points,
		sizeof(smallfelem),
		tmp_smallfelems,
		(void (*)(void *)) smallfelem_one,
		(int (*)(const void *)) smallfelem_is_zero_int,
		(void (*)(void *, const void *)) smallfelem_assign,
		(void (*)(void *, const void *)) smallfelem_square_contract,
		(void (*)(void *, const void *, const void *)) smallfelem_mul_contract,
		(void (*)(void *, const void *)) smallfelem_inv_contract,
		(void (*)(void *, const void *)) smallfelem_assign /* nothing to contract */);
	}
| 1826 | |||
/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
 * Result is stored in r (r can equal one of the inputs).
 *
 * Returns 1 on success, 0 on error.  For each non-NULL (point, scalar) pair
 * a table pre_comp[i][j] of the multiples j*P_i for j = 0..16 is built
 * (index 0, the point at infinity, is the all-zero encoding left by the
 * memset below); the combined multiplication itself happens in batch_mul.
 * Scalars outside [0, 2^256) are first reduced mod the group order, at the
 * cost of constant-timeness. */
int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
	const BIGNUM *scalars[], BN_CTX *ctx)
	{
	int ret = 0;
	int j;
	int mixed = 0;			/* nonzero => tables converted to affine */
	BN_CTX *new_ctx = NULL;		/* only set if we allocated ctx ourselves */
	BIGNUM *x, *y, *z, *tmp_scalar;
	felem_bytearray g_secret;	/* little-endian scalar for the generator */
	felem_bytearray *secrets = NULL; /* little-endian scalars, one per point */
	smallfelem (*pre_comp)[17][3] = NULL; /* per-point multiples 0..16 */
	smallfelem *tmp_smallfelems = NULL;   /* scratch for make_points_affine */
	felem_bytearray tmp;
	unsigned i, num_bytes;
	int have_pre_comp = 0;
	size_t num_points = num;
	smallfelem x_in, y_in, z_in;
	felem x_out, y_out, z_out;
	NISTP256_PRE_COMP *pre = NULL;
	const smallfelem (*g_pre_comp)[16][3] = NULL;
	EC_POINT *generator = NULL;
	const EC_POINT *p = NULL;
	const BIGNUM *p_scalar = NULL;

	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL) ||
		((z = BN_CTX_get(ctx)) == NULL) ||
		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
		goto err;

	if (scalar != NULL)
		{
		pre = EC_EX_DATA_get_data(group->extra_data,
			nistp256_pre_comp_dup, nistp256_pre_comp_free,
			nistp256_pre_comp_clear_free);
		if (pre)
			/* we have precomputation, try to use it */
			g_pre_comp = (const smallfelem (*)[16][3]) pre->g_pre_comp;
		else
			/* try to use the standard precomputation */
			g_pre_comp = &gmul[0];
		generator = EC_POINT_new(group);
		if (generator == NULL)
			goto err;
		/* get the generator from precomputation */
		if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
			!smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
			!smallfelem_to_BN(z, g_pre_comp[0][1][2]))
			{
			ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
			goto err;
			}
		if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
			generator, x, y, z, ctx))
			goto err;
		if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
			/* precomputation matches generator */
			have_pre_comp = 1;
		else
			/* we don't have valid precomputation:
			 * treat the generator as a random point */
			num_points++;
		}
	if (num_points > 0)
		{
		if (num_points >= 3)
			{
			/* unless we precompute multiples for just one or two points,
			 * converting those into affine form is time well spent */
			mixed = 1;
			}
		secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
		pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
		if (mixed)
			tmp_smallfelems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
		if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_smallfelems == NULL)))
			{
			ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
			goto err;
			}

		/* we treat NULL scalars as 0, and NULL points as points at infinity,
		 * i.e., they contribute nothing to the linear combination */
		memset(secrets, 0, num_points * sizeof(felem_bytearray));
		memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
		for (i = 0; i < num_points; ++i)
			{
			if (i == num)
				/* we didn't have a valid precomputation, so we pick
				 * the generator */
				{
				p = EC_GROUP_get0_generator(group);
				p_scalar = scalar;
				}
			else
				/* the i^th point */
				{
				p = points[i];
				p_scalar = scalars[i];
				}
			if ((p_scalar != NULL) && (p != NULL))
				{
				/* reduce scalar to 0 <= scalar < 2^256 */
				if ((BN_num_bits(p_scalar) > 256) || (BN_is_negative(p_scalar)))
					{
					/* this is an unusual input, and we don't guarantee
					 * constant-timeness */
					if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
						{
						ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
						goto err;
						}
					num_bytes = BN_bn2bin(tmp_scalar, tmp);
					}
				else
					num_bytes = BN_bn2bin(p_scalar, tmp);
				flip_endian(secrets[i], tmp, num_bytes);
				/* precompute multiples: [1] = P, even j by doubling
				 * j/2, odd j by adding P to j-1 */
				if ((!BN_to_felem(x_out, &p->X)) ||
					(!BN_to_felem(y_out, &p->Y)) ||
					(!BN_to_felem(z_out, &p->Z))) goto err;
				felem_shrink(pre_comp[i][1][0], x_out);
				felem_shrink(pre_comp[i][1][1], y_out);
				felem_shrink(pre_comp[i][1][2], z_out);
				for (j = 2; j <= 16; ++j)
					{
					if (j & 1)
						{
						point_add_small(
							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
							pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
							pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
						}
					else
						{
						point_double_small(
							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
							pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
						}
					}
				}
			}
		if (mixed)
			make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
		}

	/* the scalar for the generator */
	if ((scalar != NULL) && (have_pre_comp))
		{
		memset(g_secret, 0, sizeof(g_secret));
		/* reduce scalar to 0 <= scalar < 2^256 */
		if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar)))
			{
			/* this is an unusual input, and we don't guarantee
			 * constant-timeness */
			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
				{
				ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
				goto err;
				}
			num_bytes = BN_bn2bin(tmp_scalar, tmp);
			}
		else
			num_bytes = BN_bn2bin(scalar, tmp);
		flip_endian(g_secret, tmp, num_bytes);
		/* do the multiplication with generator precomputation*/
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			g_secret,
			mixed, (const smallfelem (*)[17][3]) pre_comp,
			g_pre_comp);
		}
	else
		/* do the multiplication without generator precomputation */
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			NULL, mixed, (const smallfelem (*)[17][3]) pre_comp, NULL);
	/* reduce the output to its unique minimal representation */
	felem_contract(x_in, x_out);
	felem_contract(y_in, y_out);
	felem_contract(z_in, z_out);
	if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
		(!smallfelem_to_BN(z, z_in)))
		{
		ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
		goto err;
		}
	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);

err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (secrets != NULL)
		OPENSSL_free(secrets);
	if (pre_comp != NULL)
		OPENSSL_free(pre_comp);
	if (tmp_smallfelems != NULL)
		OPENSSL_free(tmp_smallfelems);
	return ret;
	}
| 2036 | |||
/* ec_GFp_nistp256_precompute_mult builds and attaches generator
 * precomputation for |group|.  Two tables are filled:
 *   g_pre_comp[0] holds combinations of G, 2^64*G, 2^128*G, 2^192*G,
 *   g_pre_comp[1] holds combinations of 2^32*G, 2^96*G, 2^160*G, 2^224*G
 * (see the loop comments below).  If the group uses the standard generator
 * the built-in gmul table is simply copied.  Returns 1 on success, 0 on
 * error.  Ownership of |pre| passes to the group on success. */
int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
	{
	int ret = 0;
	NISTP256_PRE_COMP *pre = NULL;
	int i, j;
	BN_CTX *new_ctx = NULL;
	BIGNUM *x, *y;
	EC_POINT *generator = NULL;
	smallfelem tmp_smallfelems[32];
	felem x_tmp, y_tmp, z_tmp;

	/* throw away old precomputation */
	EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
		nistp256_pre_comp_free, nistp256_pre_comp_clear_free);
	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL))
		goto err;
	/* get the generator */
	if (group->generator == NULL) goto err;
	generator = EC_POINT_new(group);
	if (generator == NULL)
		goto err;
	/* build the standard generator point from the hard-coded params */
	BN_bin2bn(nistp256_curve_params[3], sizeof (felem_bytearray), x);
	BN_bin2bn(nistp256_curve_params[4], sizeof (felem_bytearray), y);
	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
		goto err;
	if ((pre = nistp256_pre_comp_new()) == NULL)
		goto err;
	/* if the generator is the standard one, use built-in precomputation */
	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
		{
		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
		ret = 1;
		/* err: is also the success exit; pre is handed over below */
		goto err;
		}
	if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
		(!BN_to_felem(y_tmp, &group->generator->Y)) ||
		(!BN_to_felem(z_tmp, &group->generator->Z)))
		goto err;
	felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
	felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
	felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
	/* compute 2^64*G, 2^128*G, 2^192*G for the first table,
	 * 2^32*G, 2^96*G, 2^160*G, 2^224*G for the second one
	 */
	for (i = 1; i <= 8; i <<= 1)
		{
		/* table[1][i] = 2^32 * table[0][i]: one doubling here plus
		 * 31 more in the inner loop */
		point_double_small(
			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
			pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
		for (j = 0; j < 31; ++j)
			{
			point_double_small(
				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
			}
		if (i == 8)
			break;
		/* table[0][2i] = 2^32 * table[1][i] (i.e. 2^64 * table[0][i]) */
		point_double_small(
			pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
		for (j = 0; j < 31; ++j)
			{
			point_double_small(
				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]);
			}
		}
	for (i = 0; i < 2; i++)
		{
		/* g_pre_comp[i][0] is the point at infinity */
		memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
		/* the remaining multiples */
		/* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
		point_add_small(
			pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], pre->g_pre_comp[i][6][2],
			pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
		/* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
		point_add_small(
			pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], pre->g_pre_comp[i][10][2],
			pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
		/* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
		point_add_small(
			pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
			pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
			pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2]);
		/* 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G */
		point_add_small(
			pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], pre->g_pre_comp[i][14][2],
			pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
		for (j = 1; j < 8; ++j)
			{
			/* odd multiples: add G resp. 2^32*G */
			point_add_small(
				pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1], pre->g_pre_comp[i][2*j+1][2],
				pre->g_pre_comp[i][2*j][0], pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2],
				pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1], pre->g_pre_comp[i][1][2]);
			}
		}
	/* 2 tables * 16 entries, minus the two infinity entries = 31 points */
	make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);

	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
		nistp256_pre_comp_free, nistp256_pre_comp_clear_free))
		goto err;
	ret = 1;
	pre = NULL;	/* ownership transferred to group->extra_data */
 err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (pre)
		nistp256_pre_comp_free(pre);
	return ret;
	}
| 2159 | |||
| 2160 | int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group) | ||
| 2161 | { | ||
| 2162 | if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup, | ||
| 2163 | nistp256_pre_comp_free, nistp256_pre_comp_clear_free) | ||
| 2164 | != NULL) | ||
| 2165 | return 1; | ||
| 2166 | else | ||
| 2167 | return 0; | ||
| 2168 | } | ||
| 2169 | #else | ||
| 2170 | static void *dummy=&dummy; | ||
| 2171 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ecp_nistp521.c b/src/lib/libcrypto/ec/ecp_nistp521.c new file mode 100644 index 0000000000..178b655f7f --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistp521.c | |||
| @@ -0,0 +1,2025 @@ | |||
| 1 | /* crypto/ec/ecp_nistp521.c */ | ||
| 2 | /* | ||
| 3 | * Written by Adam Langley (Google) for the OpenSSL project | ||
| 4 | */ | ||
| 5 | /* Copyright 2011 Google Inc. | ||
| 6 | * | ||
| 7 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| 8 | * | ||
| 9 | * you may not use this file except in compliance with the License. | ||
| 10 | * You may obtain a copy of the License at | ||
| 11 | * | ||
| 12 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
| 13 | * | ||
| 14 | * Unless required by applicable law or agreed to in writing, software | ||
| 15 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
| 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| 17 | * See the License for the specific language governing permissions and | ||
| 18 | * limitations under the License. | ||
| 19 | */ | ||
| 20 | |||
| 21 | /* | ||
| 22 | * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication | ||
| 23 | * | ||
| 24 | * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c. | ||
| 25 | * Otherwise based on Emilia's P224 work, which was inspired by my curve25519 | ||
| 26 | * work which got its smarts from Daniel J. Bernstein's work on the same. | ||
| 27 | */ | ||
| 28 | |||
| 29 | #include <openssl/opensslconf.h> | ||
| 30 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 | ||
| 31 | |||
| 32 | #ifndef OPENSSL_SYS_VMS | ||
| 33 | #include <stdint.h> | ||
| 34 | #else | ||
| 35 | #include <inttypes.h> | ||
| 36 | #endif | ||
| 37 | |||
| 38 | #include <string.h> | ||
| 39 | #include <openssl/err.h> | ||
| 40 | #include "ec_lcl.h" | ||
| 41 | |||
| 42 | #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) | ||
| 43 | /* even with gcc, the typedef won't work for 32-bit platforms */ | ||
| 44 | typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ | ||
| 45 | #else | ||
| 46 | #error "Need GCC 3.1 or later to define type uint128_t" | ||
| 47 | #endif | ||
| 48 | |||
| 49 | typedef uint8_t u8; | ||
| 50 | typedef uint64_t u64; | ||
| 51 | typedef int64_t s64; | ||
| 52 | |||
| 53 | /* The underlying field. | ||
| 54 | * | ||
| 55 | * P521 operates over GF(2^521-1). We can serialise an element of this field | ||
| 56 | * into 66 bytes where the most significant byte contains only a single bit. We | ||
| 57 | * call this an felem_bytearray. */ | ||
| 58 | |||
| 59 | typedef u8 felem_bytearray[66]; | ||
| 60 | |||
| 61 | /* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5. | ||
| 62 | * These values are big-endian. */ | ||
| 63 | static const felem_bytearray nistp521_curve_params[5] = | ||
| 64 | { | ||
| 65 | {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */ | ||
| 66 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 67 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 68 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 69 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 70 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 71 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 72 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 73 | 0xff, 0xff}, | ||
| 74 | {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */ | ||
| 75 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 76 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 77 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 78 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 79 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 80 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 81 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 82 | 0xff, 0xfc}, | ||
| 83 | {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */ | ||
| 84 | 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85, | ||
| 85 | 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3, | ||
| 86 | 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1, | ||
| 87 | 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e, | ||
| 88 | 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1, | ||
| 89 | 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c, | ||
| 90 | 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50, | ||
| 91 | 0x3f, 0x00}, | ||
| 92 | {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */ | ||
| 93 | 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95, | ||
| 94 | 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f, | ||
| 95 | 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d, | ||
| 96 | 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7, | ||
| 97 | 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff, | ||
| 98 | 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a, | ||
| 99 | 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5, | ||
| 100 | 0xbd, 0x66}, | ||
| 101 | {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */ | ||
| 102 | 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d, | ||
| 103 | 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b, | ||
| 104 | 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e, | ||
| 105 | 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4, | ||
| 106 | 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad, | ||
| 107 | 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72, | ||
| 108 | 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1, | ||
| 109 | 0x66, 0x50} | ||
| 110 | }; | ||
| 111 | |||
| 112 | /* The representation of field elements. | ||
| 113 | * ------------------------------------ | ||
| 114 | * | ||
| 115 | * We represent field elements with nine values. These values are either 64 or | ||
| 116 | * 128 bits and the field element represented is: | ||
| 117 | * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p) | ||
| 118 | * Each of the nine values is called a 'limb'. Since the limbs are spaced only | ||
| 119 | * 58 bits apart, but are greater than 58 bits in length, the most significant | ||
| 120 | * bits of each limb overlap with the least significant bits of the next. | ||
| 121 | * | ||
| 122 | * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a | ||
| 123 | * 'largefelem' */ | ||
| 124 | |||
| 125 | #define NLIMBS 9 | ||
| 126 | |||
| 127 | typedef uint64_t limb; | ||
| 128 | typedef limb felem[NLIMBS]; | ||
| 129 | typedef uint128_t largefelem[NLIMBS]; | ||
| 130 | |||
| 131 | static const limb bottom57bits = 0x1ffffffffffffff; | ||
| 132 | static const limb bottom58bits = 0x3ffffffffffffff; | ||
| 133 | |||
| 134 | /* bin66_to_felem takes a little-endian byte array and converts it into felem | ||
| 135 | * form. This assumes that the CPU is little-endian. */ | ||
| 136 | static void bin66_to_felem(felem out, const u8 in[66]) | ||
| 137 | { | ||
| 138 | out[0] = (*((limb*) &in[0])) & bottom58bits; | ||
| 139 | out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits; | ||
| 140 | out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits; | ||
| 141 | out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits; | ||
| 142 | out[4] = (*((limb*) &in[29])) & bottom58bits; | ||
| 143 | out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits; | ||
| 144 | out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits; | ||
| 145 | out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits; | ||
| 146 | out[8] = (*((limb*) &in[58])) & bottom57bits; | ||
| 147 | } | ||
| 148 | |||
| 149 | /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte | ||
| 150 | * array. This assumes that the CPU is little-endian. */ | ||
| 151 | static void felem_to_bin66(u8 out[66], const felem in) | ||
| 152 | { | ||
| 153 | memset(out, 0, 66); | ||
| 154 | (*((limb*) &out[0])) = in[0]; | ||
| 155 | (*((limb*) &out[7])) |= in[1] << 2; | ||
| 156 | (*((limb*) &out[14])) |= in[2] << 4; | ||
| 157 | (*((limb*) &out[21])) |= in[3] << 6; | ||
| 158 | (*((limb*) &out[29])) = in[4]; | ||
| 159 | (*((limb*) &out[36])) |= in[5] << 2; | ||
| 160 | (*((limb*) &out[43])) |= in[6] << 4; | ||
| 161 | (*((limb*) &out[50])) |= in[7] << 6; | ||
| 162 | (*((limb*) &out[58])) = in[8]; | ||
| 163 | } | ||
| 164 | |||
| 165 | /* To preserve endianness when using BN_bn2bin and BN_bin2bn */ | ||
| 166 | static void flip_endian(u8 *out, const u8 *in, unsigned len) | ||
| 167 | { | ||
| 168 | unsigned i; | ||
| 169 | for (i = 0; i < len; ++i) | ||
| 170 | out[i] = in[len-1-i]; | ||
| 171 | } | ||
| 172 | |||
/* BN_to_felem converts an OpenSSL BIGNUM into an felem.
 * Returns 1 on success; 0 (with EC_R_BIGNUM_OUT_OF_RANGE pushed) if |bn|
 * is negative or needs more than 66 bytes. */
static int BN_to_felem(felem out, const BIGNUM *bn)
	{
	felem_bytearray b_in;
	felem_bytearray b_out;
	unsigned num_bytes;

	/* BN_bn2bin eats leading zeroes */
	memset(b_out, 0, sizeof b_out);
	num_bytes = BN_num_bytes(bn);
	if (num_bytes > sizeof b_out)
		{
		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
		return 0;
		}
	if (BN_is_negative(bn))
		{
		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
		return 0;
		}
	/* big-endian from BN_bn2bin -> little-endian for bin66_to_felem */
	num_bytes = BN_bn2bin(bn, b_in);
	flip_endian(b_out, b_in, num_bytes);
	bin66_to_felem(out, b_out);
	return 1;
	}
| 198 | |||
/* felem_to_BN converts an felem into an OpenSSL BIGNUM.
 * Returns |out| on success, NULL on BN_bin2bn failure. */
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
	{
	felem_bytearray b_in, b_out;
	/* serialise little-endian, then reverse for big-endian BN_bin2bn */
	felem_to_bin66(b_in, in);
	flip_endian(b_out, b_in, sizeof b_out);
	return BN_bin2bn(b_out, sizeof b_out, out);
	}
| 207 | |||
| 208 | |||
| 209 | /* Field operations | ||
| 210 | * ---------------- */ | ||
| 211 | |||
| 212 | static void felem_one(felem out) | ||
| 213 | { | ||
| 214 | out[0] = 1; | ||
| 215 | out[1] = 0; | ||
| 216 | out[2] = 0; | ||
| 217 | out[3] = 0; | ||
| 218 | out[4] = 0; | ||
| 219 | out[5] = 0; | ||
| 220 | out[6] = 0; | ||
| 221 | out[7] = 0; | ||
| 222 | out[8] = 0; | ||
| 223 | } | ||
| 224 | |||
| 225 | static void felem_assign(felem out, const felem in) | ||
| 226 | { | ||
| 227 | out[0] = in[0]; | ||
| 228 | out[1] = in[1]; | ||
| 229 | out[2] = in[2]; | ||
| 230 | out[3] = in[3]; | ||
| 231 | out[4] = in[4]; | ||
| 232 | out[5] = in[5]; | ||
| 233 | out[6] = in[6]; | ||
| 234 | out[7] = in[7]; | ||
| 235 | out[8] = in[8]; | ||
| 236 | } | ||
| 237 | |||
| 238 | /* felem_sum64 sets out = out + in. */ | ||
| 239 | static void felem_sum64(felem out, const felem in) | ||
| 240 | { | ||
| 241 | out[0] += in[0]; | ||
| 242 | out[1] += in[1]; | ||
| 243 | out[2] += in[2]; | ||
| 244 | out[3] += in[3]; | ||
| 245 | out[4] += in[4]; | ||
| 246 | out[5] += in[5]; | ||
| 247 | out[6] += in[6]; | ||
| 248 | out[7] += in[7]; | ||
| 249 | out[8] += in[8]; | ||
| 250 | } | ||
| 251 | |||
| 252 | /* felem_scalar sets out = in * scalar */ | ||
| 253 | static void felem_scalar(felem out, const felem in, limb scalar) | ||
| 254 | { | ||
| 255 | out[0] = in[0] * scalar; | ||
| 256 | out[1] = in[1] * scalar; | ||
| 257 | out[2] = in[2] * scalar; | ||
| 258 | out[3] = in[3] * scalar; | ||
| 259 | out[4] = in[4] * scalar; | ||
| 260 | out[5] = in[5] * scalar; | ||
| 261 | out[6] = in[6] * scalar; | ||
| 262 | out[7] = in[7] * scalar; | ||
| 263 | out[8] = in[8] * scalar; | ||
| 264 | } | ||
| 265 | |||
| 266 | /* felem_scalar64 sets out = out * scalar */ | ||
| 267 | static void felem_scalar64(felem out, limb scalar) | ||
| 268 | { | ||
| 269 | out[0] *= scalar; | ||
| 270 | out[1] *= scalar; | ||
| 271 | out[2] *= scalar; | ||
| 272 | out[3] *= scalar; | ||
| 273 | out[4] *= scalar; | ||
| 274 | out[5] *= scalar; | ||
| 275 | out[6] *= scalar; | ||
| 276 | out[7] *= scalar; | ||
| 277 | out[8] *= scalar; | ||
| 278 | } | ||
| 279 | |||
| 280 | /* felem_scalar128 sets out = out * scalar */ | ||
| 281 | static void felem_scalar128(largefelem out, limb scalar) | ||
| 282 | { | ||
| 283 | out[0] *= scalar; | ||
| 284 | out[1] *= scalar; | ||
| 285 | out[2] *= scalar; | ||
| 286 | out[3] *= scalar; | ||
| 287 | out[4] *= scalar; | ||
| 288 | out[5] *= scalar; | ||
| 289 | out[6] *= scalar; | ||
| 290 | out[7] *= scalar; | ||
| 291 | out[8] *= scalar; | ||
| 292 | } | ||
| 293 | |||
| 294 | /* felem_neg sets |out| to |-in| | ||
| 295 | * On entry: | ||
| 296 | * in[i] < 2^59 + 2^14 | ||
| 297 | * On exit: | ||
| 298 | * out[i] < 2^62 | ||
| 299 | */ | ||
| 300 | static void felem_neg(felem out, const felem in) | ||
| 301 | { | ||
| 302 | /* In order to prevent underflow, we subtract from 0 mod p. */ | ||
| 303 | static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5); | ||
| 304 | static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4); | ||
| 305 | |||
| 306 | out[0] = two62m3 - in[0]; | ||
| 307 | out[1] = two62m2 - in[1]; | ||
| 308 | out[2] = two62m2 - in[2]; | ||
| 309 | out[3] = two62m2 - in[3]; | ||
| 310 | out[4] = two62m2 - in[4]; | ||
| 311 | out[5] = two62m2 - in[5]; | ||
| 312 | out[6] = two62m2 - in[6]; | ||
| 313 | out[7] = two62m2 - in[7]; | ||
| 314 | out[8] = two62m2 - in[8]; | ||
| 315 | } | ||
| 316 | |||
| 317 | /* felem_diff64 subtracts |in| from |out| | ||
| 318 | * On entry: | ||
| 319 | * in[i] < 2^59 + 2^14 | ||
| 320 | * On exit: | ||
| 321 | * out[i] < out[i] + 2^62 | ||
| 322 | */ | ||
| 323 | static void felem_diff64(felem out, const felem in) | ||
| 324 | { | ||
| 325 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
| 326 | static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5); | ||
| 327 | static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4); | ||
| 328 | |||
| 329 | out[0] += two62m3 - in[0]; | ||
| 330 | out[1] += two62m2 - in[1]; | ||
| 331 | out[2] += two62m2 - in[2]; | ||
| 332 | out[3] += two62m2 - in[3]; | ||
| 333 | out[4] += two62m2 - in[4]; | ||
| 334 | out[5] += two62m2 - in[5]; | ||
| 335 | out[6] += two62m2 - in[6]; | ||
| 336 | out[7] += two62m2 - in[7]; | ||
| 337 | out[8] += two62m2 - in[8]; | ||
| 338 | } | ||
| 339 | |||
| 340 | /* felem_diff_128_64 subtracts |in| from |out| | ||
| 341 | * On entry: | ||
| 342 | * in[i] < 2^62 + 2^17 | ||
| 343 | * On exit: | ||
| 344 | * out[i] < out[i] + 2^63 | ||
| 345 | */ | ||
| 346 | static void felem_diff_128_64(largefelem out, const felem in) | ||
| 347 | { | ||
| 348 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
| 349 | static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5); | ||
| 350 | static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4); | ||
| 351 | |||
| 352 | out[0] += two63m6 - in[0]; | ||
| 353 | out[1] += two63m5 - in[1]; | ||
| 354 | out[2] += two63m5 - in[2]; | ||
| 355 | out[3] += two63m5 - in[3]; | ||
| 356 | out[4] += two63m5 - in[4]; | ||
| 357 | out[5] += two63m5 - in[5]; | ||
| 358 | out[6] += two63m5 - in[6]; | ||
| 359 | out[7] += two63m5 - in[7]; | ||
| 360 | out[8] += two63m5 - in[8]; | ||
| 361 | } | ||
| 362 | |||
/* felem_diff128 subtracts |in| from |out|
 * (comment previously misnamed this function felem_diff_128_64)
 * On entry:
 *   in[i] < 2^126
 * On exit:
 *   out[i] < out[i] + 2^127 - 2^69
 */
static void felem_diff128(largefelem out, const largefelem in)
{
	/* In order to prevent underflow, we add 0 mod p before subtracting. */
	static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
	static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);

	out[0] += (two127m70 - in[0]);
	out[1] += (two127m69 - in[1]);
	out[2] += (two127m69 - in[2]);
	out[3] += (two127m69 - in[3]);
	out[4] += (two127m69 - in[4]);
	out[5] += (two127m69 - in[5]);
	out[6] += (two127m69 - in[6]);
	out[7] += (two127m69 - in[7]);
	out[8] += (two127m69 - in[8]);
}
| 385 | |||
/* felem_square sets |out| = |in|^2
 * On entry:
 *   in[i] < 2^62
 * On exit:
 *   out[i] < 17 * max(in[i]) * max(in[i])
 */
static void felem_square(largefelem out, const felem in)
{
	felem inx2, inx4;
	felem_scalar(inx2, in, 2);
	felem_scalar(inx4, in, 4);

	/* We have many cases where we want to do
	 *   in[x] * in[y] +
	 *   in[y] * in[x]
	 * This is obviously just
	 *   2 * in[x] * in[y]
	 * However, rather than do the doubling on the 128 bit result, we
	 * double one of the inputs to the multiplication by reading from
	 * |inx2| */

	/* Limb k of the product collects the cross terms in[i]*in[j] with
	 * i + j == k (squared terms appear once, mixed terms via inx2). */
	out[0] = ((uint128_t) in[0]) * in[0];
	out[1] = ((uint128_t) in[0]) * inx2[1];
	out[2] = ((uint128_t) in[0]) * inx2[2] +
		 ((uint128_t) in[1]) * in[1];
	out[3] = ((uint128_t) in[0]) * inx2[3] +
		 ((uint128_t) in[1]) * inx2[2];
	out[4] = ((uint128_t) in[0]) * inx2[4] +
		 ((uint128_t) in[1]) * inx2[3] +
		 ((uint128_t) in[2]) * in[2];
	out[5] = ((uint128_t) in[0]) * inx2[5] +
		 ((uint128_t) in[1]) * inx2[4] +
		 ((uint128_t) in[2]) * inx2[3];
	out[6] = ((uint128_t) in[0]) * inx2[6] +
		 ((uint128_t) in[1]) * inx2[5] +
		 ((uint128_t) in[2]) * inx2[4] +
		 ((uint128_t) in[3]) * in[3];
	out[7] = ((uint128_t) in[0]) * inx2[7] +
		 ((uint128_t) in[1]) * inx2[6] +
		 ((uint128_t) in[2]) * inx2[5] +
		 ((uint128_t) in[3]) * inx2[4];
	out[8] = ((uint128_t) in[0]) * inx2[8] +
		 ((uint128_t) in[1]) * inx2[7] +
		 ((uint128_t) in[2]) * inx2[6] +
		 ((uint128_t) in[3]) * inx2[5] +
		 ((uint128_t) in[4]) * in[4];

	/* The remaining limbs fall above 2^521, with the first falling at
	 * 2^522. They correspond to locations one bit up from the limbs
	 * produced above so we would have to multiply by two to align them.
	 * Again, rather than operate on the 128-bit result, we double one of
	 * the inputs to the multiplication. If we want to double for both this
	 * reason, and the reason above, then we end up multiplying by four. */

	/* 9 */
	out[0] += ((uint128_t) in[1]) * inx4[8] +
		  ((uint128_t) in[2]) * inx4[7] +
		  ((uint128_t) in[3]) * inx4[6] +
		  ((uint128_t) in[4]) * inx4[5];

	/* 10 */
	out[1] += ((uint128_t) in[2]) * inx4[8] +
		  ((uint128_t) in[3]) * inx4[7] +
		  ((uint128_t) in[4]) * inx4[6] +
		  ((uint128_t) in[5]) * inx2[5];

	/* 11 */
	out[2] += ((uint128_t) in[3]) * inx4[8] +
		  ((uint128_t) in[4]) * inx4[7] +
		  ((uint128_t) in[5]) * inx4[6];

	/* 12 */
	out[3] += ((uint128_t) in[4]) * inx4[8] +
		  ((uint128_t) in[5]) * inx4[7] +
		  ((uint128_t) in[6]) * inx2[6];

	/* 13 */
	out[4] += ((uint128_t) in[5]) * inx4[8] +
		  ((uint128_t) in[6]) * inx4[7];

	/* 14 */
	out[5] += ((uint128_t) in[6]) * inx4[8] +
		  ((uint128_t) in[7]) * inx2[7];

	/* 15 */
	out[6] += ((uint128_t) in[7]) * inx4[8];

	/* 16 */
	out[7] += ((uint128_t) in[8]) * inx2[8];
}
| 476 | |||
/* felem_mul sets |out| = |in1| * |in2|
 * On entry:
 *   in1[i] < 2^64
 *   in2[i] < 2^63
 * On exit:
 *   out[i] < 17 * max(in1[i]) * max(in2[i])
 */
static void felem_mul(largefelem out, const felem in1, const felem in2)
{
	felem in2x2;
	felem_scalar(in2x2, in2, 2);

	/* Limb k of the product collects the terms in1[i]*in2[j] with
	 * i + j == k. */
	out[0] = ((uint128_t) in1[0]) * in2[0];

	out[1] = ((uint128_t) in1[0]) * in2[1] +
		 ((uint128_t) in1[1]) * in2[0];

	out[2] = ((uint128_t) in1[0]) * in2[2] +
		 ((uint128_t) in1[1]) * in2[1] +
		 ((uint128_t) in1[2]) * in2[0];

	out[3] = ((uint128_t) in1[0]) * in2[3] +
		 ((uint128_t) in1[1]) * in2[2] +
		 ((uint128_t) in1[2]) * in2[1] +
		 ((uint128_t) in1[3]) * in2[0];

	out[4] = ((uint128_t) in1[0]) * in2[4] +
		 ((uint128_t) in1[1]) * in2[3] +
		 ((uint128_t) in1[2]) * in2[2] +
		 ((uint128_t) in1[3]) * in2[1] +
		 ((uint128_t) in1[4]) * in2[0];

	out[5] = ((uint128_t) in1[0]) * in2[5] +
		 ((uint128_t) in1[1]) * in2[4] +
		 ((uint128_t) in1[2]) * in2[3] +
		 ((uint128_t) in1[3]) * in2[2] +
		 ((uint128_t) in1[4]) * in2[1] +
		 ((uint128_t) in1[5]) * in2[0];

	out[6] = ((uint128_t) in1[0]) * in2[6] +
		 ((uint128_t) in1[1]) * in2[5] +
		 ((uint128_t) in1[2]) * in2[4] +
		 ((uint128_t) in1[3]) * in2[3] +
		 ((uint128_t) in1[4]) * in2[2] +
		 ((uint128_t) in1[5]) * in2[1] +
		 ((uint128_t) in1[6]) * in2[0];

	out[7] = ((uint128_t) in1[0]) * in2[7] +
		 ((uint128_t) in1[1]) * in2[6] +
		 ((uint128_t) in1[2]) * in2[5] +
		 ((uint128_t) in1[3]) * in2[4] +
		 ((uint128_t) in1[4]) * in2[3] +
		 ((uint128_t) in1[5]) * in2[2] +
		 ((uint128_t) in1[6]) * in2[1] +
		 ((uint128_t) in1[7]) * in2[0];

	out[8] = ((uint128_t) in1[0]) * in2[8] +
		 ((uint128_t) in1[1]) * in2[7] +
		 ((uint128_t) in1[2]) * in2[6] +
		 ((uint128_t) in1[3]) * in2[5] +
		 ((uint128_t) in1[4]) * in2[4] +
		 ((uint128_t) in1[5]) * in2[3] +
		 ((uint128_t) in1[6]) * in2[2] +
		 ((uint128_t) in1[7]) * in2[1] +
		 ((uint128_t) in1[8]) * in2[0];

	/* See comment in felem_square about the use of in2x2 here: these
	 * terms fall one bit above 2^521 per limb, so one input is doubled
	 * to fold them back. */

	out[0] += ((uint128_t) in1[1]) * in2x2[8] +
		  ((uint128_t) in1[2]) * in2x2[7] +
		  ((uint128_t) in1[3]) * in2x2[6] +
		  ((uint128_t) in1[4]) * in2x2[5] +
		  ((uint128_t) in1[5]) * in2x2[4] +
		  ((uint128_t) in1[6]) * in2x2[3] +
		  ((uint128_t) in1[7]) * in2x2[2] +
		  ((uint128_t) in1[8]) * in2x2[1];

	out[1] += ((uint128_t) in1[2]) * in2x2[8] +
		  ((uint128_t) in1[3]) * in2x2[7] +
		  ((uint128_t) in1[4]) * in2x2[6] +
		  ((uint128_t) in1[5]) * in2x2[5] +
		  ((uint128_t) in1[6]) * in2x2[4] +
		  ((uint128_t) in1[7]) * in2x2[3] +
		  ((uint128_t) in1[8]) * in2x2[2];

	out[2] += ((uint128_t) in1[3]) * in2x2[8] +
		  ((uint128_t) in1[4]) * in2x2[7] +
		  ((uint128_t) in1[5]) * in2x2[6] +
		  ((uint128_t) in1[6]) * in2x2[5] +
		  ((uint128_t) in1[7]) * in2x2[4] +
		  ((uint128_t) in1[8]) * in2x2[3];

	out[3] += ((uint128_t) in1[4]) * in2x2[8] +
		  ((uint128_t) in1[5]) * in2x2[7] +
		  ((uint128_t) in1[6]) * in2x2[6] +
		  ((uint128_t) in1[7]) * in2x2[5] +
		  ((uint128_t) in1[8]) * in2x2[4];

	out[4] += ((uint128_t) in1[5]) * in2x2[8] +
		  ((uint128_t) in1[6]) * in2x2[7] +
		  ((uint128_t) in1[7]) * in2x2[6] +
		  ((uint128_t) in1[8]) * in2x2[5];

	out[5] += ((uint128_t) in1[6]) * in2x2[8] +
		  ((uint128_t) in1[7]) * in2x2[7] +
		  ((uint128_t) in1[8]) * in2x2[6];

	out[6] += ((uint128_t) in1[7]) * in2x2[8] +
		  ((uint128_t) in1[8]) * in2x2[7];

	out[7] += ((uint128_t) in1[8]) * in2x2[8];
}
| 589 | |||
/* bottom52bits masks the least-significant 52 bits of a limb (2^52 - 1). */
static const limb bottom52bits = 0xfffffffffffff;
| 591 | |||
/* felem_reduce converts a largefelem to an felem.
 * On entry:
 *   in[i] < 2^128
 * On exit:
 *   out[i] < 2^59 + 2^14
 */
static void felem_reduce(felem out, const largefelem in)
{
	u64 overflow1, overflow2;

	/* Keep the bottom 58 bits of each 128-bit limb ... */
	out[0] = ((limb) in[0]) & bottom58bits;
	out[1] = ((limb) in[1]) & bottom58bits;
	out[2] = ((limb) in[2]) & bottom58bits;
	out[3] = ((limb) in[3]) & bottom58bits;
	out[4] = ((limb) in[4]) & bottom58bits;
	out[5] = ((limb) in[5]) & bottom58bits;
	out[6] = ((limb) in[6]) & bottom58bits;
	out[7] = ((limb) in[7]) & bottom58bits;
	out[8] = ((limb) in[8]) & bottom58bits;

	/* out[i] < 2^58 */

	/* ... and carry the remaining bits of in[i] (bits 58..127) into the
	 * next two output limbs: bits 58..63 plus the low 52 bits of the
	 * upper word go one limb up, the top 12 bits go two limbs up. */
	out[1] += ((limb) in[0]) >> 58;
	out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
	/* out[1] < 2^58 + 2^6 + 2^58
	 *        = 2^59 + 2^6 */
	out[2] += ((limb) (in[0] >> 64)) >> 52;

	out[2] += ((limb) in[1]) >> 58;
	out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
	out[3] += ((limb) (in[1] >> 64)) >> 52;

	out[3] += ((limb) in[2]) >> 58;
	out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
	out[4] += ((limb) (in[2] >> 64)) >> 52;

	out[4] += ((limb) in[3]) >> 58;
	out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
	out[5] += ((limb) (in[3] >> 64)) >> 52;

	out[5] += ((limb) in[4]) >> 58;
	out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
	out[6] += ((limb) (in[4] >> 64)) >> 52;

	out[6] += ((limb) in[5]) >> 58;
	out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
	out[7] += ((limb) (in[5] >> 64)) >> 52;

	out[7] += ((limb) in[6]) >> 58;
	out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
	out[8] += ((limb) (in[6] >> 64)) >> 52;

	out[8] += ((limb) in[7]) >> 58;
	out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
	/* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
	 *            < 2^59 + 2^13 */
	overflow1 = ((limb) (in[7] >> 64)) >> 52;

	overflow1 += ((limb) in[8]) >> 58;
	overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
	overflow2 = ((limb) (in[8] >> 64)) >> 52;

	/* Bits at and above position 521 wrap around: 2^522 == 2 (mod p),
	 * hence the doubling when folding the overflow back into out[0]
	 * and out[1]. */
	overflow1 <<= 1;	/* overflow1 < 2^13 + 2^7 + 2^59 */
	overflow2 <<= 1;	/* overflow2 < 2^13 */

	out[0] += overflow1;	/* out[0] < 2^60 */
	out[1] += overflow2;	/* out[1] < 2^59 + 2^6 + 2^13 */

	out[1] += out[0] >> 58; out[0] &= bottom58bits;
	/* out[0] < 2^58
	 * out[1] < 2^59 + 2^6 + 2^13 + 2^2
	 *        < 2^59 + 2^14 */
}
| 665 | |||
| 666 | static void felem_square_reduce(felem out, const felem in) | ||
| 667 | { | ||
| 668 | largefelem tmp; | ||
| 669 | felem_square(tmp, in); | ||
| 670 | felem_reduce(out, tmp); | ||
| 671 | } | ||
| 672 | |||
| 673 | static void felem_mul_reduce(felem out, const felem in1, const felem in2) | ||
| 674 | { | ||
| 675 | largefelem tmp; | ||
| 676 | felem_mul(tmp, in1, in2); | ||
| 677 | felem_reduce(out, tmp); | ||
| 678 | } | ||
| 679 | |||
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The addition chain below computes in^(2^521 - 3) = in^(p - 2); each
 * comment records the exponent held after that step. */
static void felem_inv(felem out, const felem in)
{
	felem ftmp, ftmp2, ftmp3, ftmp4;
	largefelem tmp;
	unsigned i;

	felem_square(tmp, in); felem_reduce(ftmp, tmp);		/* 2^1 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^2 - 2^0 */
	felem_assign(ftmp2, ftmp);
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2^1 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2^0 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^4 - 2^1 */

	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^3 - 2^1 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^4 - 2^2 */
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^4 - 2^0 */

	felem_assign(ftmp2, ftmp3);
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^5 - 2^1 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^6 - 2^2 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^7 - 2^3 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^8 - 2^4 */
	/* NOTE(review): this assignment is immediately overwritten by the
	 * felem_reduce into ftmp4 on the next line; it appears redundant. */
	felem_assign(ftmp4, ftmp3);
	felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp);	/* 2^8 - 2^1 */
	felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);	/* 2^9 - 2^2 */
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^8 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 8; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^16 - 2^8 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^16 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 16; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^32 - 2^16 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^32 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 32; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^64 - 2^32 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^64 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 64; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^128 - 2^64 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^128 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 128; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^256 - 2^128 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^256 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 256; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^256 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^0 */

	for (i = 0; i < 9; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^521 - 2^9 */
		}
	/* (2^521 - 2^9) + (2^9 - 2^2) = 2^521 - 2^2; the original comments
	 * said 2^512 here, which does not match the arithmetic. */
	felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp);	/* 2^521 - 2^2 */
	felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp);	/* 2^521 - 3 */
}
| 763 | |||
/* This is 2^521-1, expressed as an felem: eight limbs of 2^58 - 1
 * (0x03ffffffffffffff) followed by one limb of 2^57 - 1. */
static const felem kPrime =
	{
	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
	0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
	};
| 771 | |||
/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 * otherwise.  Runs without data-dependent branches (constant time).
 * On entry:
 *   in[i] < 2^59 + 2^14
 */
static limb felem_is_zero(const felem in)
{
	felem ftmp;
	limb is_zero, is_p;
	felem_assign(ftmp, in);

	/* Propagate carries so every limb fits its nominal width. */
	ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
	/* ftmp[8] < 2^57 */
	ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
	ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
	ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
	ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
	ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
	ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
	ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
	ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
	/* ftmp[8] < 2^57 + 4 */

	/* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
	 * greater than our bound for ftmp[8]. Therefore we only have to check
	 * if the value is zero or 2^521-1. */

	/* OR all limbs together: the result is 0 iff every limb is 0. */
	is_zero = 0;
	is_zero |= ftmp[0];
	is_zero |= ftmp[1];
	is_zero |= ftmp[2];
	is_zero |= ftmp[3];
	is_zero |= ftmp[4];
	is_zero |= ftmp[5];
	is_zero |= ftmp[6];
	is_zero |= ftmp[7];
	is_zero |= ftmp[8];

	is_zero--;
	/* We know that ftmp[i] < 2^63, therefore the only way that the top bit
	 * can be set is if is_zero was 0 before the decrement. */
	/* Arithmetic shift smears the top bit into an all-ones/all-zeros mask. */
	is_zero = ((s64) is_zero) >> 63;

	/* Same trick for equality with p: XOR is 0 iff limbs match exactly. */
	is_p = ftmp[0] ^ kPrime[0];
	is_p |= ftmp[1] ^ kPrime[1];
	is_p |= ftmp[2] ^ kPrime[2];
	is_p |= ftmp[3] ^ kPrime[3];
	is_p |= ftmp[4] ^ kPrime[4];
	is_p |= ftmp[5] ^ kPrime[5];
	is_p |= ftmp[6] ^ kPrime[6];
	is_p |= ftmp[7] ^ kPrime[7];
	is_p |= ftmp[8] ^ kPrime[8];

	is_p--;
	is_p = ((s64) is_p) >> 63;

	is_zero |= is_p;
	return is_zero;
}
| 831 | |||
| 832 | static int felem_is_zero_int(const felem in) | ||
| 833 | { | ||
| 834 | return (int) (felem_is_zero(in) & ((limb)1)); | ||
| 835 | } | ||
| 836 | |||
/* felem_contract converts |in| to its unique, minimal representation.
 * On entry:
 *   in[i] < 2^59 + 2^14
 */
static void felem_contract(felem out, const felem in)
{
	limb is_p, is_greater, sign;
	static const limb two58 = ((limb)1) << 58;

	felem_assign(out, in);

	/* Propagate carries so every limb fits its nominal width. */
	out[0] += out[8] >> 57; out[8] &= bottom57bits;
	/* out[8] < 2^57 */
	out[1] += out[0] >> 58; out[0] &= bottom58bits;
	out[2] += out[1] >> 58; out[1] &= bottom58bits;
	out[3] += out[2] >> 58; out[2] &= bottom58bits;
	out[4] += out[3] >> 58; out[3] &= bottom58bits;
	out[5] += out[4] >> 58; out[4] &= bottom58bits;
	out[6] += out[5] >> 58; out[5] &= bottom58bits;
	out[7] += out[6] >> 58; out[6] &= bottom58bits;
	out[8] += out[7] >> 58; out[7] &= bottom58bits;
	/* out[8] < 2^57 + 4 */

	/* If the value is greater than 2^521-1 then we have to subtract
	 * 2^521-1 out. See the comments in felem_is_zero regarding why we
	 * don't test for other multiples of the prime. */

	/* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */

	is_p = out[0] ^ kPrime[0];
	is_p |= out[1] ^ kPrime[1];
	is_p |= out[2] ^ kPrime[2];
	is_p |= out[3] ^ kPrime[3];
	is_p |= out[4] ^ kPrime[4];
	is_p |= out[5] ^ kPrime[5];
	is_p |= out[6] ^ kPrime[6];
	is_p |= out[7] ^ kPrime[7];
	is_p |= out[8] ^ kPrime[8];

	/* Constant-time: fold "any bit set" down into the top bit, then
	 * smear it into a full mask with an arithmetic shift. */
	is_p--;
	is_p &= is_p << 32;
	is_p &= is_p << 16;
	is_p &= is_p << 8;
	is_p &= is_p << 4;
	is_p &= is_p << 2;
	is_p &= is_p << 1;
	is_p = ((s64) is_p) >> 63;
	is_p = ~is_p;

	/* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */

	out[0] &= is_p;
	out[1] &= is_p;
	out[2] &= is_p;
	out[3] &= is_p;
	out[4] &= is_p;
	out[5] &= is_p;
	out[6] &= is_p;
	out[7] &= is_p;
	out[8] &= is_p;

	/* In order to test that |out| >= 2^521-1 we need only test if out[8]
	 * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */
	is_greater = out[8] >> 57;
	is_greater |= is_greater << 32;
	is_greater |= is_greater << 16;
	is_greater |= is_greater << 8;
	is_greater |= is_greater << 4;
	is_greater |= is_greater << 2;
	is_greater |= is_greater << 1;
	is_greater = ((s64) is_greater) >> 63;

	out[0] -= kPrime[0] & is_greater;
	out[1] -= kPrime[1] & is_greater;
	out[2] -= kPrime[2] & is_greater;
	out[3] -= kPrime[3] & is_greater;
	out[4] -= kPrime[4] & is_greater;
	out[5] -= kPrime[5] & is_greater;
	out[6] -= kPrime[6] & is_greater;
	out[7] -= kPrime[7] & is_greater;
	out[8] -= kPrime[8] & is_greater;

	/* Eliminate negative coefficients: if a limb went negative (top bit
	 * set), borrow 2^58 into it from the next limb up. */
	sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign);
	sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign);
	sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign);
	sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign);
	sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign);
	/* NOTE(review): out[0] below breaks the out[i] pattern and looks like
	 * a typo for out[5]; the second pass over limbs 5-7 further down
	 * appears to compensate, but this should be confirmed against
	 * upstream before changing anything. */
	sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
	sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
	sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
	sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
	sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
	sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
}
| 932 | |||
| 933 | /* Group operations | ||
| 934 | * ---------------- | ||
| 935 | * | ||
| 936 | * Building on top of the field operations we have the operations on the | ||
| 937 | * elliptic curve group itself. Points on the curve are represented in Jacobian | ||
| 938 | * coordinates */ | ||
| 939 | |||
/* point_double calculates 2*(x_in, y_in, z_in)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
 * while x_out == y_in is not (maybe this works, but it's not tested). */
static void
point_double(felem x_out, felem y_out, felem z_out,
	     const felem x_in, const felem y_in, const felem z_in)
{
	largefelem tmp, tmp2;
	felem delta, gamma, beta, alpha, ftmp, ftmp2;

	felem_assign(ftmp, x_in);
	felem_assign(ftmp2, x_in);

	/* delta = z^2 */
	felem_square(tmp, z_in);
	felem_reduce(delta, tmp);	/* delta[i] < 2^59 + 2^14 */

	/* gamma = y^2 */
	felem_square(tmp, y_in);
	felem_reduce(gamma, tmp);	/* gamma[i] < 2^59 + 2^14 */

	/* beta = x*gamma */
	felem_mul(tmp, x_in, gamma);
	felem_reduce(beta, tmp);	/* beta[i] < 2^59 + 2^14 */

	/* alpha = 3*(x-delta)*(x+delta) */
	felem_diff64(ftmp, delta);
	/* ftmp[i] < 2^61 */
	felem_sum64(ftmp2, delta);
	/* ftmp2[i] < 2^60 + 2^15 */
	felem_scalar64(ftmp2, 3);
	/* ftmp2[i] < 3*2^60 + 3*2^15 */
	felem_mul(tmp, ftmp, ftmp2);
	/* tmp[i] < 17(3*2^121 + 3*2^76)
	 *        = 61*2^121 + 61*2^76
	 *        < 64*2^121 + 64*2^76
	 *        = 2^127 + 2^82
	 *        < 2^128 */
	felem_reduce(alpha, tmp);

	/* x' = alpha^2 - 8*beta */
	felem_square(tmp, alpha);
	/* tmp[i] < 17*2^120
	 *        < 2^125 */
	felem_assign(ftmp, beta);
	felem_scalar64(ftmp, 8);
	/* ftmp[i] < 2^62 + 2^17 */
	felem_diff_128_64(tmp, ftmp);
	/* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
	felem_reduce(x_out, tmp);

	/* z' = (y + z)^2 - gamma - delta */
	felem_sum64(delta, gamma);
	/* delta[i] < 2^60 + 2^15 */
	felem_assign(ftmp, y_in);
	felem_sum64(ftmp, z_in);
	/* ftmp[i] < 2^60 + 2^15 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 17(2^122)
	 *        < 2^127 */
	/* delta already holds gamma + delta, so one subtraction covers both. */
	felem_diff_128_64(tmp, delta);
	/* tmp[i] < 2^127 + 2^63 */
	felem_reduce(z_out, tmp);

	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
	felem_scalar64(beta, 4);
	/* beta[i] < 2^61 + 2^16 */
	felem_diff64(beta, x_out);
	/* beta[i] < 2^61 + 2^60 + 2^16 */
	felem_mul(tmp, alpha, beta);
	/* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
	 *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
	 *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
	 *        < 2^128 */
	felem_square(tmp2, gamma);
	/* tmp2[i] < 17*(2^59 + 2^14)^2
	 *         = 17*(2^118 + 2^74 + 2^28) */
	felem_scalar128(tmp2, 8);
	/* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
	 *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
	 *         < 2^126 */
	felem_diff128(tmp, tmp2);
	/* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
	 *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
	 *          2^74 + 2^69 + 2^34 + 2^30
	 *        < 2^128 */
	felem_reduce(y_out, tmp);
}
| 1032 | |||
| 1033 | /* copy_conditional copies in to out iff mask is all ones. */ | ||
| 1034 | static void | ||
| 1035 | copy_conditional(felem out, const felem in, limb mask) | ||
| 1036 | { | ||
| 1037 | unsigned i; | ||
| 1038 | for (i = 0; i < NLIMBS; ++i) | ||
| 1039 | { | ||
| 1040 | const limb tmp = mask & (in[i] ^ out[i]); | ||
| 1041 | out[i] ^= tmp; | ||
| 1042 | } | ||
| 1043 | } | ||
| 1044 | |||
| 1045 | /* point_add calculates (x1, y1, z1) + (x2, y2, z2) | ||
| 1046 | * | ||
| 1047 | * The method is taken from | ||
| 1048 | * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, | ||
| 1049 | * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). | ||
| 1050 | * | ||
| 1051 | * This function includes a branch for checking whether the two input points | ||
| 1052 | * are equal (while not equal to the point at infinity). This case never | ||
| 1053 | * happens during single point multiplication, so there is no timing leak for | ||
| 1054 | * ECDH or ECDSA signing. */ | ||
static void point_add(felem x3, felem y3, felem z3,
	const felem x1, const felem y1, const felem z1,
	const int mixed, const felem x2, const felem y2, const felem z2)
	{
	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
	largefelem tmp, tmp2;
	limb x_equal, y_equal, z1_is_zero, z2_is_zero;

	/* The point at infinity is encoded with z == 0; remember whether
	 * either input is infinity for the conditional copies at the end. */
	z1_is_zero = felem_is_zero(z1);
	z2_is_zero = felem_is_zero(z2);

	/* ftmp = z1z1 = z1**2 */
	felem_square(tmp, z1);
	felem_reduce(ftmp, tmp);

	if (!mixed)
		{
		/* ftmp2 = z2z2 = z2**2 */
		felem_square(tmp, z2);
		felem_reduce(ftmp2, tmp);

		/* u1 = ftmp3 = x1*z2z2 */
		felem_mul(tmp, x1, ftmp2);
		felem_reduce(ftmp3, tmp);

		/* ftmp5 = z1 + z2 */
		felem_assign(ftmp5, z1);
		felem_sum64(ftmp5, z2);
		/* ftmp5[i] < 2^61 */

		/* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
		felem_square(tmp, ftmp5);
		/* tmp[i] < 17*2^122 */
		felem_diff_128_64(tmp, ftmp);
		/* tmp[i] < 17*2^122 + 2^63 */
		felem_diff_128_64(tmp, ftmp2);
		/* tmp[i] < 17*2^122 + 2^64 */
		felem_reduce(ftmp5, tmp);

		/* ftmp2 = z2 * z2z2 */
		felem_mul(tmp, ftmp2, z2);
		felem_reduce(ftmp2, tmp);

		/* s1 = ftmp6 = y1 * z2**3 */
		felem_mul(tmp, y1, ftmp2);
		felem_reduce(ftmp6, tmp);
		}
	else
		{
		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */

		/* u1 = ftmp3 = x1*z2z2 */
		felem_assign(ftmp3, x1);

		/* ftmp5 = 2*z1z2 */
		felem_scalar(ftmp5, z1, 2);

		/* s1 = ftmp6 = y1 * z2**3 */
		felem_assign(ftmp6, y1);
		}

	/* u2 = x2*z1z1 */
	felem_mul(tmp, x2, ftmp);
	/* tmp[i] < 17*2^120 */

	/* h = ftmp4 = u2 - u1 */
	felem_diff_128_64(tmp, ftmp3);
	/* tmp[i] < 17*2^120 + 2^63 */
	felem_reduce(ftmp4, tmp);

	x_equal = felem_is_zero(ftmp4);

	/* z_out = ftmp5 * h */
	felem_mul(tmp, ftmp5, ftmp4);
	felem_reduce(z_out, tmp);

	/* ftmp = z1 * z1z1 */
	felem_mul(tmp, ftmp, z1);
	felem_reduce(ftmp, tmp);

	/* s2 = tmp = y2 * z1**3 */
	felem_mul(tmp, y2, ftmp);
	/* tmp[i] < 17*2^120 */

	/* r = ftmp5 = (s2 - s1)*2 */
	felem_diff_128_64(tmp, ftmp6);
	/* tmp[i] < 17*2^120 + 2^63 */
	felem_reduce(ftmp5, tmp);
	y_equal = felem_is_zero(ftmp5);
	felem_scalar64(ftmp5, 2);
	/* ftmp5[i] < 2^61 */

	/* Doubling case: the addition formula below degenerates when the
	 * two (finite) inputs are equal, so branch to point_double.  As
	 * noted in the header comment, this branch is never taken during
	 * single point multiplication, so it does not leak secret data
	 * there. */
	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
		{
		point_double(x3, y3, z3, x1, y1, z1);
		return;
		}

	/* I = ftmp = (2h)**2 */
	felem_assign(ftmp, ftmp4);
	felem_scalar64(ftmp, 2);
	/* ftmp[i] < 2^61 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 17*2^122 */
	felem_reduce(ftmp, tmp);

	/* J = ftmp2 = h * I */
	felem_mul(tmp, ftmp4, ftmp);
	felem_reduce(ftmp2, tmp);

	/* V = ftmp4 = U1 * I */
	felem_mul(tmp, ftmp3, ftmp);
	felem_reduce(ftmp4, tmp);

	/* x_out = r**2 - J - 2V */
	felem_square(tmp, ftmp5);
	/* tmp[i] < 17*2^122 */
	felem_diff_128_64(tmp, ftmp2);
	/* tmp[i] < 17*2^122 + 2^63 */
	felem_assign(ftmp3, ftmp4);
	felem_scalar64(ftmp4, 2);
	/* ftmp4[i] < 2^61 */
	felem_diff_128_64(tmp, ftmp4);
	/* tmp[i] < 17*2^122 + 2^64 */
	felem_reduce(x_out, tmp);

	/* y_out = r(V-x_out) - 2 * s1 * J */
	felem_diff64(ftmp3, x_out);
	/* ftmp3[i] < 2^60 + 2^60
	 *          = 2^61 */
	felem_mul(tmp, ftmp5, ftmp3);
	/* tmp[i] < 17*2^122 */
	felem_mul(tmp2, ftmp6, ftmp2);
	/* tmp2[i] < 17*2^120 */
	felem_scalar128(tmp2, 2);
	/* tmp2[i] < 17*2^121 */
	felem_diff128(tmp, tmp2);
	/* tmp[i] < 2^127 - 2^69 + 17*2^122
	 *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
	 *        < 2^127 */
	felem_reduce(y_out, tmp);

	/* If either input was the point at infinity, the formulas above do
	 * not apply; constant-time copy the *other* input as the result. */
	copy_conditional(x_out, x2, z1_is_zero);
	copy_conditional(x_out, x1, z2_is_zero);
	copy_conditional(y_out, y2, z1_is_zero);
	copy_conditional(y_out, y1, z2_is_zero);
	copy_conditional(z_out, z2, z1_is_zero);
	copy_conditional(z_out, z1, z2_is_zero);
	felem_assign(x3, x_out);
	felem_assign(y3, y_out);
	felem_assign(z3, z_out);
	}
| 1207 | |||
| 1208 | /* Base point pre computation | ||
| 1209 | * -------------------------- | ||
| 1210 | * | ||
| 1211 | * Two different sorts of precomputed tables are used in the following code. | ||
 * Each contains various points on the curve, where each point is three field
| 1213 | * elements (x, y, z). | ||
| 1214 | * | ||
| 1215 | * For the base point table, z is usually 1 (0 for the point at infinity). | ||
| 1216 | * This table has 16 elements: | ||
| 1217 | * index | bits | point | ||
| 1218 | * ------+---------+------------------------------ | ||
| 1219 | * 0 | 0 0 0 0 | 0G | ||
| 1220 | * 1 | 0 0 0 1 | 1G | ||
| 1221 | * 2 | 0 0 1 0 | 2^130G | ||
| 1222 | * 3 | 0 0 1 1 | (2^130 + 1)G | ||
| 1223 | * 4 | 0 1 0 0 | 2^260G | ||
| 1224 | * 5 | 0 1 0 1 | (2^260 + 1)G | ||
| 1225 | * 6 | 0 1 1 0 | (2^260 + 2^130)G | ||
| 1226 | * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G | ||
| 1227 | * 8 | 1 0 0 0 | 2^390G | ||
| 1228 | * 9 | 1 0 0 1 | (2^390 + 1)G | ||
| 1229 | * 10 | 1 0 1 0 | (2^390 + 2^130)G | ||
| 1230 | * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G | ||
| 1231 | * 12 | 1 1 0 0 | (2^390 + 2^260)G | ||
| 1232 | * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G | ||
| 1233 | * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G | ||
| 1234 | * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G | ||
| 1235 | * | ||
| 1236 | * The reason for this is so that we can clock bits into four different | ||
| 1237 | * locations when doing simple scalar multiplies against the base point. | ||
| 1238 | * | ||
| 1239 | * Tables for other points have table[i] = iG for i in 0 .. 16. */ | ||
| 1240 | |||
/* gmul is the table of precomputed base points */
/* Indices follow the 4-bit pattern documented above: bit k of the index
 * selects a 2^(130*k) multiple of the generator G.  Affine entries have
 * z = {1, 0, ...}; index 0 is the point at infinity (z all-zero). */
static const felem gmul[16][3] =
	/* 0 : point at infinity */
	{{{0, 0, 0, 0, 0, 0, 0, 0, 0},
	  {0, 0, 0, 0, 0, 0, 0, 0, 0},
	  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 1 */
	 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
	   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
	   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
	  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
	   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
	   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 2 */
	 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
	   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
	   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
	  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
	   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
	   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 3 */
	 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
	   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
	   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
	  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
	   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
	   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 4 */
	 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
	   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
	   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
	  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
	   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
	   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 5 */
	 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
	   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
	   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
	  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
	   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
	   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 6 */
	 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
	   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
	   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
	  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
	   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
	   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 7 */
	 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
	   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
	   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
	  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
	   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
	   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 8 */
	 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
	   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
	   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
	  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
	   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
	   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 9 */
	 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
	   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
	   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
	  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
	   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
	   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 10 */
	 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
	   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
	   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
	  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
	   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
	   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 11 */
	 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
	   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
	   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
	  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
	   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
	   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 12 */
	 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
	   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
	   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
	  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
	   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
	   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 13 */
	 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
	   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
	   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
	  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
	   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
	   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 14 */
	 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
	   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
	   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
	  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
	   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
	   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 15 */
	 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
	   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
	   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
	  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
	   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
	   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
| 1351 | |||
| 1352 | /* select_point selects the |idx|th point from a precomputation table and | ||
| 1353 | * copies it to out. */ | ||
| 1354 | static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3], | ||
| 1355 | felem out[3]) | ||
| 1356 | { | ||
| 1357 | unsigned i, j; | ||
| 1358 | limb *outlimbs = &out[0][0]; | ||
| 1359 | memset(outlimbs, 0, 3 * sizeof(felem)); | ||
| 1360 | |||
| 1361 | for (i = 0; i < size; i++) | ||
| 1362 | { | ||
| 1363 | const limb *inlimbs = &pre_comp[i][0][0]; | ||
| 1364 | limb mask = i ^ idx; | ||
| 1365 | mask |= mask >> 4; | ||
| 1366 | mask |= mask >> 2; | ||
| 1367 | mask |= mask >> 1; | ||
| 1368 | mask &= 1; | ||
| 1369 | mask--; | ||
| 1370 | for (j = 0; j < NLIMBS * 3; j++) | ||
| 1371 | outlimbs[j] |= inlimbs[j] & mask; | ||
| 1372 | } | ||
| 1373 | } | ||
| 1374 | |||
| 1375 | /* get_bit returns the |i|th bit in |in| */ | ||
| 1376 | static char get_bit(const felem_bytearray in, int i) | ||
| 1377 | { | ||
| 1378 | if (i < 0) | ||
| 1379 | return 0; | ||
| 1380 | return (in[i >> 3] >> (i & 7)) & 1; | ||
| 1381 | } | ||
| 1382 | |||
/* Interleaved point multiplication using precomputed point multiples:
 * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
static void batch_mul(felem x_out, felem y_out, felem z_out,
	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
	{
	int i, skip;
	unsigned num, gen_mul = (g_scalar != NULL);
	felem nq[3], tmp[4];	/* tmp[3] holds the negated point for signed digits */
	limb bits;
	u8 sign, digit;

	/* set nq to the point at infinity */
	memset(nq, 0, 3 * sizeof(felem));

	/* Loop over all scalars msb-to-lsb, interleaving additions
	 * of multiples of the generator (last quarter of rounds)
	 * and additions of other points multiples (every 5th round).
	 */
	skip = 1; /* save two point operations in the first round */
	for (i = (num_points ? 520 : 130); i >= 0; --i)
		{
		/* double */
		if (!skip)
			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

		/* add multiples of the generator */
		if (gen_mul && (i <= 130))
			{
			/* assemble a 4-bit index from bits i, i+130, i+260,
			 * i+390 of g_scalar, matching the gmul table layout */
			bits = get_bit(g_scalar, i + 390) << 3;
			if (i < 130)
				{
				bits |= get_bit(g_scalar, i + 260) << 2;
				bits |= get_bit(g_scalar, i + 130) << 1;
				bits |= get_bit(g_scalar, i);
				}
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp, tmp);
			if (!skip)
				{
				point_add(nq[0], nq[1], nq[2],
					nq[0], nq[1], nq[2],
					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
				}
			else
				{
				/* first addition: nq is still infinity, so
				 * just take the selected point directly */
				memcpy(nq, tmp, 3 * sizeof(felem));
				skip = 0;
				}
			}

		/* do other additions every 5 doublings */
		if (num_points && (i % 5 == 0))
			{
			/* loop over all scalars */
			for (num = 0; num < num_points; ++num)
				{
				/* gather a 6-bit window (bits i-1 .. i+4) and
				 * recode it into a signed digit in 0..16 */
				bits = get_bit(scalars[num], i + 4) << 5;
				bits |= get_bit(scalars[num], i + 3) << 4;
				bits |= get_bit(scalars[num], i + 2) << 3;
				bits |= get_bit(scalars[num], i + 1) << 2;
				bits |= get_bit(scalars[num], i) << 1;
				bits |= get_bit(scalars[num], i - 1);
				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

				/* select the point to add or subtract, in constant time */
				select_point(digit, 17, pre_comp[num], tmp);
				felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
				copy_conditional(tmp[1], tmp[3], (-(limb) sign));

				if (!skip)
					{
					point_add(nq[0], nq[1], nq[2],
						nq[0], nq[1], nq[2],
						mixed, tmp[0], tmp[1], tmp[2]);
					}
				else
					{
					memcpy(nq, tmp, 3 * sizeof(felem));
					skip = 0;
					}
				}
			}
		}
	felem_assign(x_out, nq[0]);
	felem_assign(y_out, nq[1]);
	felem_assign(z_out, nq[2]);
	}
| 1474 | |||
| 1475 | |||
/* Precomputation for the group generator. */
typedef struct {
	/* 16 precomputed multiples of the generator, each stored as
	 * Jacobian (x, y, z) field elements */
	felem g_pre_comp[16][3];
	/* reference count, adjusted via CRYPTO_add */
	int references;
} NISTP521_PRE_COMP;
| 1481 | |||
/* EC_GFp_nistp521_method returns the method table for this 64-bit
 * constant-time NIST P-521 implementation.  Entries set to 0 are not
 * provided here (presumably handled generically by callers — confirm
 * against the EC_METHOD dispatch code). */
const EC_METHOD *EC_GFp_nistp521_method(void)
	{
	static const EC_METHOD ret = {
		EC_FLAGS_DEFAULT_OCT,
		NID_X9_62_prime_field,
		ec_GFp_nistp521_group_init,
		ec_GFp_simple_group_finish,
		ec_GFp_simple_group_clear_finish,
		ec_GFp_nist_group_copy,
		ec_GFp_nistp521_group_set_curve,
		ec_GFp_simple_group_get_curve,
		ec_GFp_simple_group_get_degree,
		ec_GFp_simple_group_check_discriminant,
		ec_GFp_simple_point_init,
		ec_GFp_simple_point_finish,
		ec_GFp_simple_point_clear_finish,
		ec_GFp_simple_point_copy,
		ec_GFp_simple_point_set_to_infinity,
		ec_GFp_simple_set_Jprojective_coordinates_GFp,
		ec_GFp_simple_get_Jprojective_coordinates_GFp,
		ec_GFp_simple_point_set_affine_coordinates,
		ec_GFp_nistp521_point_get_affine_coordinates,
		0 /* point_set_compressed_coordinates */,
		0 /* point2oct */,
		0 /* oct2point */,
		ec_GFp_simple_add,
		ec_GFp_simple_dbl,
		ec_GFp_simple_invert,
		ec_GFp_simple_is_at_infinity,
		ec_GFp_simple_is_on_curve,
		ec_GFp_simple_cmp,
		ec_GFp_simple_make_affine,
		ec_GFp_simple_points_make_affine,
		ec_GFp_nistp521_points_mul,
		ec_GFp_nistp521_precompute_mult,
		ec_GFp_nistp521_have_precompute_mult,
		ec_GFp_nist_field_mul,
		ec_GFp_nist_field_sqr,
		0 /* field_div */,
		0 /* field_encode */,
		0 /* field_decode */,
		0 /* field_set_to_one */ };

	return &ret;
	}
| 1527 | |||
| 1528 | |||
| 1529 | /******************************************************************************/ | ||
| 1530 | /* FUNCTIONS TO MANAGE PRECOMPUTATION | ||
| 1531 | */ | ||
| 1532 | |||
| 1533 | static NISTP521_PRE_COMP *nistp521_pre_comp_new() | ||
| 1534 | { | ||
| 1535 | NISTP521_PRE_COMP *ret = NULL; | ||
| 1536 | ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP)); | ||
| 1537 | if (!ret) | ||
| 1538 | { | ||
| 1539 | ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); | ||
| 1540 | return ret; | ||
| 1541 | } | ||
| 1542 | memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); | ||
| 1543 | ret->references = 1; | ||
| 1544 | return ret; | ||
| 1545 | } | ||
| 1546 | |||
| 1547 | static void *nistp521_pre_comp_dup(void *src_) | ||
| 1548 | { | ||
| 1549 | NISTP521_PRE_COMP *src = src_; | ||
| 1550 | |||
| 1551 | /* no need to actually copy, these objects never change! */ | ||
| 1552 | CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1553 | |||
| 1554 | return src_; | ||
| 1555 | } | ||
| 1556 | |||
| 1557 | static void nistp521_pre_comp_free(void *pre_) | ||
| 1558 | { | ||
| 1559 | int i; | ||
| 1560 | NISTP521_PRE_COMP *pre = pre_; | ||
| 1561 | |||
| 1562 | if (!pre) | ||
| 1563 | return; | ||
| 1564 | |||
| 1565 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1566 | if (i > 0) | ||
| 1567 | return; | ||
| 1568 | |||
| 1569 | OPENSSL_free(pre); | ||
| 1570 | } | ||
| 1571 | |||
| 1572 | static void nistp521_pre_comp_clear_free(void *pre_) | ||
| 1573 | { | ||
| 1574 | int i; | ||
| 1575 | NISTP521_PRE_COMP *pre = pre_; | ||
| 1576 | |||
| 1577 | if (!pre) | ||
| 1578 | return; | ||
| 1579 | |||
| 1580 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1581 | if (i > 0) | ||
| 1582 | return; | ||
| 1583 | |||
| 1584 | OPENSSL_cleanse(pre, sizeof(*pre)); | ||
| 1585 | OPENSSL_free(pre); | ||
| 1586 | } | ||
| 1587 | |||
| 1588 | /******************************************************************************/ | ||
| 1589 | /* OPENSSL EC_METHOD FUNCTIONS | ||
| 1590 | */ | ||
| 1591 | |||
| 1592 | int ec_GFp_nistp521_group_init(EC_GROUP *group) | ||
| 1593 | { | ||
| 1594 | int ret; | ||
| 1595 | ret = ec_GFp_simple_group_init(group); | ||
| 1596 | group->a_is_minus3 = 1; | ||
| 1597 | return ret; | ||
| 1598 | } | ||
| 1599 | |||
/* ec_GFp_nistp521_group_set_curve validates that (p, a, b) are exactly the
 * NIST P-521 parameters (this implementation is hard-wired to that curve),
 * installs the fast NIST reduction, and defers to the generic setter.
 * Returns 1 on success, 0 on error (wrong parameters or BN failure). */
int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
	const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
	{
	int ret = 0;
	BN_CTX *new_ctx = NULL;
	BIGNUM *curve_p, *curve_a, *curve_b;

	/* create a scratch BN_CTX if the caller did not supply one */
	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
		((curve_a = BN_CTX_get(ctx)) == NULL) ||
		((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
	/* load the canonical P-521 parameters for comparison */
	BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
	BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
	BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
	if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
		(BN_cmp(curve_b, b)))
		{
		ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
			EC_R_WRONG_CURVE_PARAMETERS);
		goto err;
		}
	group->field_mod_func = BN_nist_mod_521;
	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
err:
	BN_CTX_end(ctx);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	return ret;
	}
| 1631 | |||
/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
 * (X', Y') = (X/Z^2, Y/Z^3) */
/* Either of |x| and |y| may be NULL, in which case that coordinate is not
 * reported.  Returns 1 on success, 0 on error (point at infinity, or a
 * BN conversion failure). */
int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
	{
	felem z1, z2, x_in, y_in, x_out, y_out;
	largefelem tmp;

	/* infinity has no affine representation */
	if (EC_POINT_is_at_infinity(group, point))
		{
		ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
			EC_R_POINT_AT_INFINITY);
		return 0;
		}
	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
		(!BN_to_felem(z1, &point->Z))) return 0;
	/* z2 = 1/Z; then z1 = 1/Z^2 and x = X/Z^2 */
	felem_inv(z2, z1);
	felem_square(tmp, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
	felem_contract(x_out, x_in);
	if (x != NULL)
		{
		if (!felem_to_BN(x, x_out))
			{
			ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
			return 0;
			}
		}
	/* z1 = 1/Z^3 and y = Y/Z^3 */
	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
	felem_contract(y_out, y_in);
	if (y != NULL)
		{
		if (!felem_to_BN(y, y_out))
			{
			ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
			return 0;
			}
		}
	return 1;
	}
| 1673 | |||
/* make_points_affine converts |num| Jacobian points in |points| to affine
 * form in place, using |tmp_felems| (num+1 scratch elements) and the shared
 * batch-inversion helper. */
static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */])
	{
	/* Runs in constant time, unless an input is the point at infinity
	 * (which normally shouldn't happen). */
	/* NOTE(review): the function-pointer casts below are technically
	 * undefined behaviour in ISO C when the pointee types differ; this
	 * follows the established internal API usage — confirm against
	 * ec_GFp_nistp_points_make_affine_internal's contract. */
	ec_GFp_nistp_points_make_affine_internal(
		num,
		points,
		sizeof(felem),
		tmp_felems,
		(void (*)(void *)) felem_one,
		(int (*)(const void *)) felem_is_zero_int,
		(void (*)(void *, const void *)) felem_assign,
		(void (*)(void *, const void *)) felem_square_reduce,
		(void (*)(void *, const void *, const void *)) felem_mul_reduce,
		(void (*)(void *, const void *)) felem_inv,
		(void (*)(void *, const void *)) felem_contract);
	}
| 1691 | |||
| 1692 | /* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values | ||
| 1693 | * Result is stored in r (r can equal one of the inputs). */ | ||
| 1694 | int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r, | ||
| 1695 | const BIGNUM *scalar, size_t num, const EC_POINT *points[], | ||
| 1696 | const BIGNUM *scalars[], BN_CTX *ctx) | ||
| 1697 | { | ||
| 1698 | int ret = 0; | ||
| 1699 | int j; | ||
| 1700 | int mixed = 0; | ||
| 1701 | BN_CTX *new_ctx = NULL; | ||
| 1702 | BIGNUM *x, *y, *z, *tmp_scalar; | ||
| 1703 | felem_bytearray g_secret; | ||
| 1704 | felem_bytearray *secrets = NULL; | ||
| 1705 | felem (*pre_comp)[17][3] = NULL; | ||
| 1706 | felem *tmp_felems = NULL; | ||
| 1707 | felem_bytearray tmp; | ||
| 1708 | unsigned i, num_bytes; | ||
| 1709 | int have_pre_comp = 0; | ||
| 1710 | size_t num_points = num; | ||
| 1711 | felem x_in, y_in, z_in, x_out, y_out, z_out; | ||
| 1712 | NISTP521_PRE_COMP *pre = NULL; | ||
| 1713 | felem (*g_pre_comp)[3] = NULL; | ||
| 1714 | EC_POINT *generator = NULL; | ||
| 1715 | const EC_POINT *p = NULL; | ||
| 1716 | const BIGNUM *p_scalar = NULL; | ||
| 1717 | |||
| 1718 | if (ctx == NULL) | ||
| 1719 | if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; | ||
| 1720 | BN_CTX_start(ctx); | ||
| 1721 | if (((x = BN_CTX_get(ctx)) == NULL) || | ||
| 1722 | ((y = BN_CTX_get(ctx)) == NULL) || | ||
| 1723 | ((z = BN_CTX_get(ctx)) == NULL) || | ||
| 1724 | ((tmp_scalar = BN_CTX_get(ctx)) == NULL)) | ||
| 1725 | goto err; | ||
| 1726 | |||
| 1727 | if (scalar != NULL) | ||
| 1728 | { | ||
| 1729 | pre = EC_EX_DATA_get_data(group->extra_data, | ||
| 1730 | nistp521_pre_comp_dup, nistp521_pre_comp_free, | ||
| 1731 | nistp521_pre_comp_clear_free); | ||
| 1732 | if (pre) | ||
| 1733 | /* we have precomputation, try to use it */ | ||
| 1734 | g_pre_comp = &pre->g_pre_comp[0]; | ||
| 1735 | else | ||
| 1736 | /* try to use the standard precomputation */ | ||
| 1737 | g_pre_comp = (felem (*)[3]) gmul; | ||
| 1738 | generator = EC_POINT_new(group); | ||
| 1739 | if (generator == NULL) | ||
| 1740 | goto err; | ||
| 1741 | /* get the generator from precomputation */ | ||
| 1742 | if (!felem_to_BN(x, g_pre_comp[1][0]) || | ||
| 1743 | !felem_to_BN(y, g_pre_comp[1][1]) || | ||
| 1744 | !felem_to_BN(z, g_pre_comp[1][2])) | ||
| 1745 | { | ||
| 1746 | ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1747 | goto err; | ||
| 1748 | } | ||
| 1749 | if (!EC_POINT_set_Jprojective_coordinates_GFp(group, | ||
| 1750 | generator, x, y, z, ctx)) | ||
| 1751 | goto err; | ||
| 1752 | if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) | ||
| 1753 | /* precomputation matches generator */ | ||
| 1754 | have_pre_comp = 1; | ||
| 1755 | else | ||
| 1756 | /* we don't have valid precomputation: | ||
| 1757 | * treat the generator as a random point */ | ||
| 1758 | num_points++; | ||
| 1759 | } | ||
| 1760 | |||
| 1761 | if (num_points > 0) | ||
| 1762 | { | ||
| 1763 | if (num_points >= 2) | ||
| 1764 | { | ||
| 1765 | /* unless we precompute multiples for just one point, | ||
| 1766 | * converting those into affine form is time well spent */ | ||
| 1767 | mixed = 1; | ||
| 1768 | } | ||
| 1769 | secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray)); | ||
| 1770 | pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem)); | ||
| 1771 | if (mixed) | ||
| 1772 | tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem)); | ||
| 1773 | if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL))) | ||
| 1774 | { | ||
| 1775 | ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE); | ||
| 1776 | goto err; | ||
| 1777 | } | ||
| 1778 | |||
| 1779 | /* we treat NULL scalars as 0, and NULL points as points at infinity, | ||
| 1780 | * i.e., they contribute nothing to the linear combination */ | ||
| 1781 | memset(secrets, 0, num_points * sizeof(felem_bytearray)); | ||
| 1782 | memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem)); | ||
| 1783 | for (i = 0; i < num_points; ++i) | ||
| 1784 | { | ||
| 1785 | if (i == num) | ||
| 1786 | /* we didn't have a valid precomputation, so we pick | ||
| 1787 | * the generator */ | ||
| 1788 | { | ||
| 1789 | p = EC_GROUP_get0_generator(group); | ||
| 1790 | p_scalar = scalar; | ||
| 1791 | } | ||
| 1792 | else | ||
| 1793 | /* the i^th point */ | ||
| 1794 | { | ||
| 1795 | p = points[i]; | ||
| 1796 | p_scalar = scalars[i]; | ||
| 1797 | } | ||
| 1798 | if ((p_scalar != NULL) && (p != NULL)) | ||
| 1799 | { | ||
| 1800 | /* reduce scalar to 0 <= scalar < 2^521 */ | ||
| 1801 | if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar))) | ||
| 1802 | { | ||
| 1803 | /* this is an unusual input, and we don't guarantee | ||
| 1804 | * constant-timeness */ | ||
| 1805 | if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) | ||
| 1806 | { | ||
| 1807 | ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1808 | goto err; | ||
| 1809 | } | ||
| 1810 | num_bytes = BN_bn2bin(tmp_scalar, tmp); | ||
| 1811 | } | ||
| 1812 | else | ||
| 1813 | num_bytes = BN_bn2bin(p_scalar, tmp); | ||
| 1814 | flip_endian(secrets[i], tmp, num_bytes); | ||
| 1815 | /* precompute multiples */ | ||
| 1816 | if ((!BN_to_felem(x_out, &p->X)) || | ||
| 1817 | (!BN_to_felem(y_out, &p->Y)) || | ||
| 1818 | (!BN_to_felem(z_out, &p->Z))) goto err; | ||
| 1819 | memcpy(pre_comp[i][1][0], x_out, sizeof(felem)); | ||
| 1820 | memcpy(pre_comp[i][1][1], y_out, sizeof(felem)); | ||
| 1821 | memcpy(pre_comp[i][1][2], z_out, sizeof(felem)); | ||
| 1822 | for (j = 2; j <= 16; ++j) | ||
| 1823 | { | ||
| 1824 | if (j & 1) | ||
| 1825 | { | ||
| 1826 | point_add( | ||
| 1827 | pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], | ||
| 1828 | pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], | ||
| 1829 | 0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]); | ||
| 1830 | } | ||
| 1831 | else | ||
| 1832 | { | ||
| 1833 | point_double( | ||
| 1834 | pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], | ||
| 1835 | pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]); | ||
| 1836 | } | ||
| 1837 | } | ||
| 1838 | } | ||
| 1839 | } | ||
| 1840 | if (mixed) | ||
| 1841 | make_points_affine(num_points * 17, pre_comp[0], tmp_felems); | ||
| 1842 | } | ||
| 1843 | |||
| 1844 | /* the scalar for the generator */ | ||
| 1845 | if ((scalar != NULL) && (have_pre_comp)) | ||
| 1846 | { | ||
| 1847 | memset(g_secret, 0, sizeof(g_secret)); | ||
| 1848 | /* reduce scalar to 0 <= scalar < 2^521 */ | ||
| 1849 | if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) | ||
| 1850 | { | ||
| 1851 | /* this is an unusual input, and we don't guarantee | ||
| 1852 | * constant-timeness */ | ||
| 1853 | if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) | ||
| 1854 | { | ||
| 1855 | ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1856 | goto err; | ||
| 1857 | } | ||
| 1858 | num_bytes = BN_bn2bin(tmp_scalar, tmp); | ||
| 1859 | } | ||
| 1860 | else | ||
| 1861 | num_bytes = BN_bn2bin(scalar, tmp); | ||
| 1862 | flip_endian(g_secret, tmp, num_bytes); | ||
| 1863 | /* do the multiplication with generator precomputation*/ | ||
| 1864 | batch_mul(x_out, y_out, z_out, | ||
| 1865 | (const felem_bytearray (*)) secrets, num_points, | ||
| 1866 | g_secret, | ||
| 1867 | mixed, (const felem (*)[17][3]) pre_comp, | ||
| 1868 | (const felem (*)[3]) g_pre_comp); | ||
| 1869 | } | ||
| 1870 | else | ||
| 1871 | /* do the multiplication without generator precomputation */ | ||
| 1872 | batch_mul(x_out, y_out, z_out, | ||
| 1873 | (const felem_bytearray (*)) secrets, num_points, | ||
| 1874 | NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL); | ||
| 1875 | /* reduce the output to its unique minimal representation */ | ||
| 1876 | felem_contract(x_in, x_out); | ||
| 1877 | felem_contract(y_in, y_out); | ||
| 1878 | felem_contract(z_in, z_out); | ||
| 1879 | if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || | ||
| 1880 | (!felem_to_BN(z, z_in))) | ||
| 1881 | { | ||
| 1882 | ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1883 | goto err; | ||
| 1884 | } | ||
| 1885 | ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx); | ||
| 1886 | |||
| 1887 | err: | ||
| 1888 | BN_CTX_end(ctx); | ||
| 1889 | if (generator != NULL) | ||
| 1890 | EC_POINT_free(generator); | ||
| 1891 | if (new_ctx != NULL) | ||
| 1892 | BN_CTX_free(new_ctx); | ||
| 1893 | if (secrets != NULL) | ||
| 1894 | OPENSSL_free(secrets); | ||
| 1895 | if (pre_comp != NULL) | ||
| 1896 | OPENSSL_free(pre_comp); | ||
| 1897 | if (tmp_felems != NULL) | ||
| 1898 | OPENSSL_free(tmp_felems); | ||
| 1899 | return ret; | ||
| 1900 | } | ||
| 1901 | |||
/* Precompute multiples of the group generator for later use by
 * ec_GFp_nistp521_points_mul().  For a non-standard generator, the 16
 * combinations  e0*G + e1*2^130*G + e2*2^260*G + e3*2^390*G  (e_i in {0,1})
 * are computed, converted to affine form, and attached to the group's
 * extra data.  For the standard NIST P-521 generator the compiled-in table
 * 'gmul' is available instead.
 * Returns 1 on success, 0 on error. */
int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
	{
	int ret = 0;
	NISTP521_PRE_COMP *pre = NULL;
	int i, j;
	BN_CTX *new_ctx = NULL;
	BIGNUM *x, *y;
	EC_POINT *generator = NULL;
	felem tmp_felems[16];	/* scratch for make_points_affine(15, ...): needs num+1 elements */

	/* throw away old precomputation */
	EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
		nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL))
		goto err;
	/* get the generator */
	if (group->generator == NULL) goto err;
	generator = EC_POINT_new(group);
	if (generator == NULL)
		goto err;
	/* build a point holding the standard NIST P-521 generator coordinates,
	 * to compare against the group's actual generator below */
	BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x);
	BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y);
	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
		goto err;
	if ((pre = nistp521_pre_comp_new()) == NULL)
		goto err;
	/* if the generator is the standard one, use built-in precomputation */
	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
		{
		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
		ret = 1;
		/* 'err' doubles as the success exit path here.
		 * NOTE(review): 'pre' is not stored in extra_data on this path and
		 * is freed below, so the memcpy appears redundant — presumably the
		 * multiplication code falls back to the static table directly;
		 * confirm against ec_GFp_nistp521_points_mul(). */
		goto err;
		}
	if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
		(!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
		(!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
		goto err;
	/* compute 2^130*G, 2^260*G, 2^390*G */
	/* i = 1, 2, 4 fills g_pre_comp[2], [4], [8]; each iteration applies
	 * 130 doublings (1 below + 129 in the inner loop) to the previous entry */
	for (i = 1; i <= 4; i <<= 1)
		{
		point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
			pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
			pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
		for (j = 0; j < 129; ++j)
			{
			point_double(pre->g_pre_comp[2*i][0],
				pre->g_pre_comp[2*i][1],
				pre->g_pre_comp[2*i][2],
				pre->g_pre_comp[2*i][0],
				pre->g_pre_comp[2*i][1],
				pre->g_pre_comp[2*i][2]);
			}
		}
	/* g_pre_comp[0] is the point at infinity */
	memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
	/* the remaining multiples */
	/* 2^130*G + 2^260*G */
	point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
		pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
		pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
		pre->g_pre_comp[2][2]);
	/* 2^130*G + 2^390*G */
	point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
		pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
		pre->g_pre_comp[2][2]);
	/* 2^260*G + 2^390*G */
	point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
		pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
		0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
		pre->g_pre_comp[4][2]);
	/* 2^130*G + 2^260*G + 2^390*G */
	point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
		pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
		pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
		pre->g_pre_comp[2][2]);
	for (i = 1; i < 8; ++i)
		{
		/* odd multiples: add G */
		point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
			pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
			pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
			0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
			pre->g_pre_comp[1][2]);
		}
	/* convert entries 1..15 to affine form in one batch (entry 0 is infinity) */
	make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);

	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
		nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
		goto err;
	ret = 1;
	pre = NULL;	/* ownership transferred to extra_data; don't free below */
 err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (pre)
		nistp521_pre_comp_free(pre);
	return ret;
	}
| 2012 | |||
| 2013 | int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group) | ||
| 2014 | { | ||
| 2015 | if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup, | ||
| 2016 | nistp521_pre_comp_free, nistp521_pre_comp_clear_free) | ||
| 2017 | != NULL) | ||
| 2018 | return 1; | ||
| 2019 | else | ||
| 2020 | return 0; | ||
| 2021 | } | ||
| 2022 | |||
| 2023 | #else | ||
| 2024 | static void *dummy=&dummy; | ||
| 2025 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ecp_nistputil.c b/src/lib/libcrypto/ec/ecp_nistputil.c new file mode 100644 index 0000000000..c8140c807f --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistputil.c | |||
| @@ -0,0 +1,197 @@ | |||
| 1 | /* crypto/ec/ecp_nistputil.c */ | ||
| 2 | /* | ||
| 3 | * Written by Bodo Moeller for the OpenSSL project. | ||
| 4 | */ | ||
| 5 | /* Copyright 2011 Google Inc. | ||
| 6 | * | ||
| 7 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| 8 | * | ||
| 9 | * you may not use this file except in compliance with the License. | ||
| 10 | * You may obtain a copy of the License at | ||
| 11 | * | ||
| 12 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
| 13 | * | ||
| 14 | * Unless required by applicable law or agreed to in writing, software | ||
| 15 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
| 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| 17 | * See the License for the specific language governing permissions and | ||
| 18 | * limitations under the License. | ||
| 19 | */ | ||
| 20 | |||
| 21 | #include <openssl/opensslconf.h> | ||
| 22 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 | ||
| 23 | |||
| 24 | /* | ||
| 25 | * Common utility functions for ecp_nistp224.c, ecp_nistp256.c, ecp_nistp521.c. | ||
| 26 | */ | ||
| 27 | |||
| 28 | #include <stddef.h> | ||
| 29 | #include "ec_lcl.h" | ||
| 30 | |||
| 31 | /* Convert an array of points into affine coordinates. | ||
| 32 | * (If the point at infinity is found (Z = 0), it remains unchanged.) | ||
| 33 | * This function is essentially an equivalent to EC_POINTs_make_affine(), but | ||
| 34 | * works with the internal representation of points as used by ecp_nistp###.c | ||
| 35 | * rather than with (BIGNUM-based) EC_POINT data structures. | ||
| 36 | * | ||
| 37 | * point_array is the input/output buffer ('num' points in projective form, | ||
| 38 | * i.e. three coordinates each), based on an internal representation of | ||
| 39 | * field elements of size 'felem_size'. | ||
| 40 | * | ||
| 41 | * tmp_felems needs to point to a temporary array of 'num'+1 field elements | ||
| 42 | * for storage of intermediate values. | ||
| 43 | */ | ||
void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
	size_t felem_size, void *tmp_felems,
	void (*felem_one)(void *out),
	int (*felem_is_zero)(const void *in),
	void (*felem_assign)(void *out, const void *in),
	void (*felem_square)(void *out, const void *in),
	void (*felem_mul)(void *out, const void *in1, const void *in2),
	void (*felem_inv)(void *out, const void *in),
	void (*felem_contract)(void *out, const void *in))
	{
	int i = 0;

	/* Byte-offset accessors into the caller's opaque arrays: tmp_felem(I) is
	 * the I-th scratch element; X/Y/Z(I) are the three consecutive
	 * coordinates of the I-th point. */
#define tmp_felem(I) (&((char *)tmp_felems)[(I) * felem_size])
#define X(I) (&((char *)point_array)[3*(I) * felem_size])
#define Y(I) (&((char *)point_array)[(3*(I) + 1) * felem_size])
#define Z(I) (&((char *)point_array)[(3*(I) + 2) * felem_size])

	/* Simultaneous-inversion trick: build running products of the Z values
	 * so that only a single (expensive) felem_inv() call is needed for all
	 * 'num' points. */
	if (!felem_is_zero(Z(0)))
		felem_assign(tmp_felem(0), Z(0));
	else
		felem_one(tmp_felem(0));
	for (i = 1; i < (int)num; i++)
		{
		if (!felem_is_zero(Z(i)))
			felem_mul(tmp_felem(i), tmp_felem(i-1), Z(i));
		else
			felem_assign(tmp_felem(i), tmp_felem(i-1));
		}
	/* Now each tmp_felem(i) is the product of Z(0) .. Z(i), skipping any zero-valued factors:
	 * if Z(i) = 0, we essentially pretend that Z(i) = 1 */

	/* invert the total product once, then peel off one factor per iteration
	 * of the backwards loop below */
	felem_inv(tmp_felem(num-1), tmp_felem(num-1));
	for (i = num - 1; i >= 0; i--)
		{
		if (i > 0)
			/* tmp_felem(i-1) is the product of Z(0) .. Z(i-1),
			 * tmp_felem(i) is the inverse of the product of Z(0) .. Z(i)
			 */
			felem_mul(tmp_felem(num), tmp_felem(i-1), tmp_felem(i)); /* 1/Z(i) */
		else
			felem_assign(tmp_felem(num), tmp_felem(0)); /* 1/Z(0) */

		if (!felem_is_zero(Z(i)))
			{
			if (i > 0)
				/* For next iteration, replace tmp_felem(i-1) by its inverse */
				felem_mul(tmp_felem(i-1), tmp_felem(i), Z(i));

			/* Convert point (X, Y, Z) into affine form (X/(Z^2), Y/(Z^3), 1) */
			felem_square(Z(i), tmp_felem(num)); /* 1/(Z^2) */
			felem_mul(X(i), X(i), Z(i)); /* X/(Z^2) */
			felem_mul(Z(i), Z(i), tmp_felem(num)); /* 1/(Z^3) */
			felem_mul(Y(i), Y(i), Z(i)); /* Y/(Z^3) */
			felem_contract(X(i), X(i));
			felem_contract(Y(i), Y(i));
			felem_one(Z(i));
			}
		else
			{
			/* point at infinity (Z = 0): leave it unchanged */
			if (i > 0)
				/* For next iteration, replace tmp_felem(i-1) by its inverse */
				felem_assign(tmp_felem(i-1), tmp_felem(i));
			}
		}
	}
| 109 | |||
| 110 | /* | ||
| 111 | * This function looks at 5+1 scalar bits (5 current, 1 adjacent less | ||
| 112 | * significant bit), and recodes them into a signed digit for use in fast point | ||
| 113 | * multiplication: the use of signed rather than unsigned digits means that | ||
| 114 | * fewer points need to be precomputed, given that point inversion is easy | ||
| 115 | * (a precomputed point dP makes -dP available as well). | ||
| 116 | * | ||
| 117 | * BACKGROUND: | ||
| 118 | * | ||
| 119 | * Signed digits for multiplication were introduced by Booth ("A signed binary | ||
| 120 | * multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, | ||
| 121 | * pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. | ||
| 122 | * Booth's original encoding did not generally improve the density of nonzero | ||
| 123 | * digits over the binary representation, and was merely meant to simplify the | ||
| 124 | * handling of signed factors given in two's complement; but it has since been | ||
| 125 | * shown to be the basis of various signed-digit representations that do have | ||
| 126 | * further advantages, including the wNAF, using the following general approach: | ||
| 127 | * | ||
| 128 | * (1) Given a binary representation | ||
| 129 | * | ||
| 130 | * b_k ... b_2 b_1 b_0, | ||
| 131 | * | ||
| 132 | * of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 | ||
| 133 | * by using bit-wise subtraction as follows: | ||
| 134 | * | ||
| 135 | * b_k b_(k-1) ... b_2 b_1 b_0 | ||
| 136 | * - b_k ... b_3 b_2 b_1 b_0 | ||
| 137 | * ------------------------------------- | ||
| 138 | * s_k b_(k-1) ... s_3 s_2 s_1 s_0 | ||
| 139 | * | ||
| 140 | * A left-shift followed by subtraction of the original value yields a new | ||
| 141 | * representation of the same value, using signed bits s_i = b_(i+1) - b_i. | ||
| 142 | * This representation from Booth's paper has since appeared in the | ||
| 143 | * literature under a variety of different names including "reversed binary | ||
| 144 | * form", "alternating greedy expansion", "mutual opposite form", and | ||
| 145 | * "sign-alternating {+-1}-representation". | ||
| 146 | * | ||
| 147 | * An interesting property is that among the nonzero bits, values 1 and -1 | ||
| 148 | * strictly alternate. | ||
| 149 | * | ||
| 150 | * (2) Various window schemes can be applied to the Booth representation of | ||
| 151 | * integers: for example, right-to-left sliding windows yield the wNAF | ||
| 152 | * (a signed-digit encoding independently discovered by various researchers | ||
| 153 | * in the 1990s), and left-to-right sliding windows yield a left-to-right | ||
| 154 | * equivalent of the wNAF (independently discovered by various researchers | ||
| 155 | * around 2004). | ||
| 156 | * | ||
| 157 | * To prevent leaking information through side channels in point multiplication, | ||
| 158 | * we need to recode the given integer into a regular pattern: sliding windows | ||
| 159 | * as in wNAFs won't do, we need their fixed-window equivalent -- which is a few | ||
| 160 | * decades older: we'll be using the so-called "modified Booth encoding" due to | ||
| 161 | * MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 | ||
| 162 | * (1961), pp. 67-91), in a radix-2^5 setting. That is, we always combine five | ||
| 163 | * signed bits into a signed digit: | ||
| 164 | * | ||
| 165 | * s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j) | ||
| 166 | * | ||
| 167 | * The sign-alternating property implies that the resulting digit values are | ||
| 168 | * integers from -16 to 16. | ||
| 169 | * | ||
| 170 | * Of course, we don't actually need to compute the signed digits s_i as an | ||
| 171 | * intermediate step (that's just a nice way to see how this scheme relates | ||
| 172 | * to the wNAF): a direct computation obtains the recoded digit from the | ||
| 173 | * six bits b_(5j + 4) ... b_(5j - 1). | ||
| 174 | * | ||
| 175 | * This function takes those six bits as an integer (0 .. 63), writing the | ||
| 176 | * recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute | ||
| 177 | * value, in the range 0 .. 16). Note that this integer essentially provides the | ||
| 178 | * input bits "shifted to the left" by one position: for example, the input to | ||
| 179 | * compute the least significant recoded digit, given that there's no bit b_-1, | ||
| 180 | * has to be b_4 b_3 b_2 b_1 b_0 0. | ||
| 181 | * | ||
| 182 | */ | ||
/* Recode six scalar bits (value 0 .. 63, i.e. the window's five bits followed
 * by the adjacent less significant bit) into a signed digit, writing the sign
 * (0 = positive, 1 = negative) to *sign and the magnitude (0 .. 16) to *digit.
 * All operations are branch-free selections on bit masks. */
void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in)
	{
	unsigned char neg_mask, magnitude;

	/* neg_mask is 0xff when the top (sixth) bit of 'in' is set, else 0x00 */
	neg_mask = ~((in >> 5) - 1);
	/* magnitude of the negated value: 64 - in - 1 (i.e. ~in within 6 bits) */
	magnitude = (1 << 6) - in - 1;
	/* constant-time select: negated value when negative, 'in' otherwise */
	magnitude = (magnitude & neg_mask) | (in & ~neg_mask);
	/* halve, rounding up, to fold the duplicated adjacent bit back in */
	magnitude = (magnitude >> 1) + (magnitude & 1);

	*sign = neg_mask & 1;
	*digit = magnitude;
	}
| 195 | #else | ||
| 196 | static void *dummy=&dummy; | ||
| 197 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ecp_oct.c b/src/lib/libcrypto/ec/ecp_oct.c new file mode 100644 index 0000000000..374a0ee731 --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_oct.c | |||
| @@ -0,0 +1,433 @@ | |||
| 1 | /* crypto/ec/ecp_oct.c */ | ||
| 2 | /* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de> | ||
| 3 | * for the OpenSSL project. | ||
| 4 | * Includes code written by Bodo Moeller for the OpenSSL project. | ||
| 5 | */ | ||
| 6 | /* ==================================================================== | ||
| 7 | * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. | ||
| 8 | * | ||
| 9 | * Redistribution and use in source and binary forms, with or without | ||
| 10 | * modification, are permitted provided that the following conditions | ||
| 11 | * are met: | ||
| 12 | * | ||
| 13 | * 1. Redistributions of source code must retain the above copyright | ||
| 14 | * notice, this list of conditions and the following disclaimer. | ||
| 15 | * | ||
| 16 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 17 | * notice, this list of conditions and the following disclaimer in | ||
| 18 | * the documentation and/or other materials provided with the | ||
| 19 | * distribution. | ||
| 20 | * | ||
| 21 | * 3. All advertising materials mentioning features or use of this | ||
| 22 | * software must display the following acknowledgment: | ||
| 23 | * "This product includes software developed by the OpenSSL Project | ||
| 24 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 25 | * | ||
| 26 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 27 | * endorse or promote products derived from this software without | ||
| 28 | * prior written permission. For written permission, please contact | ||
| 29 | * openssl-core@openssl.org. | ||
| 30 | * | ||
| 31 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 32 | * nor may "OpenSSL" appear in their names without prior written | ||
| 33 | * permission of the OpenSSL Project. | ||
| 34 | * | ||
| 35 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 36 | * acknowledgment: | ||
| 37 | * "This product includes software developed by the OpenSSL Project | ||
| 38 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 39 | * | ||
| 40 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 41 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 42 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 43 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 44 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 45 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 46 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 47 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 49 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 50 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 51 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 52 | * ==================================================================== | ||
| 53 | * | ||
| 54 | * This product includes cryptographic software written by Eric Young | ||
| 55 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
| 56 | * Hudson (tjh@cryptsoft.com). | ||
| 57 | * | ||
| 58 | */ | ||
| 59 | /* ==================================================================== | ||
| 60 | * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. | ||
| 61 | * Portions of this software developed by SUN MICROSYSTEMS, INC., | ||
| 62 | * and contributed to the OpenSSL project. | ||
| 63 | */ | ||
| 64 | |||
| 65 | #include <openssl/err.h> | ||
| 66 | #include <openssl/symhacks.h> | ||
| 67 | |||
| 68 | #include "ec_lcl.h" | ||
| 69 | |||
/* Set 'point' from a compressed representation: the affine x-coordinate 'x_'
 * plus one bit ('y_bit') selecting which of the two candidate square roots
 * of x^3 + a*x + b to use as the y-coordinate (chosen by parity).
 * Returns 1 on success, 0 on error (invalid compressed point, BN failure). */
int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
	const BIGNUM *x_, int y_bit, BN_CTX *ctx)
	{
	BN_CTX *new_ctx = NULL;
	BIGNUM *tmp1, *tmp2, *x, *y;
	int ret = 0;

	/* clear error queue (we inspect the last error after BN_mod_sqrt below) */
	ERR_clear_error();

	if (ctx == NULL)
		{
		ctx = new_ctx = BN_CTX_new();
		if (ctx == NULL)
			return 0;
		}

	/* normalize to 0/1 */
	y_bit = (y_bit != 0);

	BN_CTX_start(ctx);
	tmp1 = BN_CTX_get(ctx);
	tmp2 = BN_CTX_get(ctx);
	x = BN_CTX_get(ctx);
	y = BN_CTX_get(ctx);
	/* a failed BN_CTX_get propagates NULL to later calls, so checking the
	 * last one covers all four */
	if (y == NULL) goto err;

	/* Recover y. We have a Weierstrass equation
	 * y^2 = x^3 + a*x + b,
	 * so y is one of the square roots of x^3 + a*x + b.
	 */

	/* tmp1 := x^3 */
	if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
	if (group->meth->field_decode == 0)
		{
		/* field_{sqr,mul} work on standard representation */
		if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
		if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
		}
	else
		{
		if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
		if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
		}

	/* tmp1 := tmp1 + a*x */
	if (group->a_is_minus3)
		{
		/* a = -3: compute a*x as -(2x + x) without a multiplication */
		if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
		if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
		if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
		}
	else
		{
		if (group->meth->field_decode)
			{
			/* 'a' is stored encoded; decode before the plain modular multiply */
			if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
			if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
			}
		else
			{
			/* field_mul works on standard representation */
			if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
			}

		if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
		}

	/* tmp1 := tmp1 + b */
	if (group->meth->field_decode)
		{
		if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
		if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
		}
	else
		{
		if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
		}

	/* y := sqrt(tmp1) mod p; failure may mean "not a square", i.e. an
	 * invalid compressed point rather than an internal BN error */
	if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
		{
		unsigned long err = ERR_peek_last_error();

		if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
			{
			/* translate the BN error into an EC "invalid point" error */
			ERR_clear_error();
			ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
			}
		else
			ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
		goto err;
		}

	/* if the root's parity doesn't match y_bit, use the other root (p - y) */
	if (y_bit != BN_is_odd(y))
		{
		if (BN_is_zero(y))
			{
			/* y == 0 has no "other root"; a set y_bit is then only
			 * explainable as a bad compression bit or bad point */
			int kron;

			kron = BN_kronecker(x, &group->field, ctx);
			if (kron == -2) goto err;

			if (kron == 1)
				ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
			else
				/* BN_mod_sqrt() should have caught this error (not a square) */
				ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
			goto err;
			}
		if (!BN_usub(y, &group->field, y)) goto err;
		}
	/* sanity check: after the optional negation the parity must match */
	if (y_bit != BN_is_odd(y))
		{
		ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
		goto err;
		}

	if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;

	ret = 1;

 err:
	BN_CTX_end(ctx);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	return ret;
	}
| 197 | |||
| 198 | |||
/* Encode 'point' into the octet format selected by 'form' (compressed,
 * uncompressed, or hybrid): a header octet (form, with the low bit set to
 * the parity of y for compressed/hybrid), followed by x and, for
 * uncompressed/hybrid, y — each zero-padded to the field length.
 * The point at infinity encodes as a single 0 octet.
 * If 'buf' is NULL only the required length is returned.
 * Returns the encoding length, or 0 on error. */
size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
	unsigned char *buf, size_t len, BN_CTX *ctx)
	{
	size_t ret;
	BN_CTX *new_ctx = NULL;
	int used_ctx = 0;
	BIGNUM *x, *y;
	size_t field_len, i, skip;

	if ((form != POINT_CONVERSION_COMPRESSED)
		&& (form != POINT_CONVERSION_UNCOMPRESSED)
		&& (form != POINT_CONVERSION_HYBRID))
		{
		ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
		goto err;
		}

	if (EC_POINT_is_at_infinity(group, point))
		{
		/* encodes to a single 0 octet */
		if (buf != NULL)
			{
			if (len < 1)
				{
				ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
				return 0;
				}
			buf[0] = 0;
			}
		return 1;
		}


	/* ret := required output buffer length */
	field_len = BN_num_bytes(&group->field);
	ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;

	/* if 'buf' is NULL, just return required length */
	if (buf != NULL)
		{
		if (len < ret)
			{
			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
			goto err;
			}

		if (ctx == NULL)
			{
			ctx = new_ctx = BN_CTX_new();
			if (ctx == NULL)
				return 0;
			}

		BN_CTX_start(ctx);
		used_ctx = 1;
		x = BN_CTX_get(ctx);
		y = BN_CTX_get(ctx);
		/* a failed BN_CTX_get propagates NULL; checking the last covers both */
		if (y == NULL) goto err;

		if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;

		/* header octet: form, with low bit = parity of y for
		 * compressed/hybrid encodings */
		if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
			buf[0] = form + 1;
		else
			buf[0] = form;

		i = 1;

		/* write x, left-padded with zeros to exactly field_len bytes */
		skip = field_len - BN_num_bytes(x);
		if (skip > field_len)
			{
			/* size_t underflow: x longer than the field — can't happen
			 * for a valid coordinate */
			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
			goto err;
			}
		while (skip > 0)
			{
			buf[i++] = 0;
			skip--;
			}
		skip = BN_bn2bin(x, buf + i);
		i += skip;
		if (i != 1 + field_len)
			{
			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
			goto err;
			}

		if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
			{
			/* write y, padded the same way */
			skip = field_len - BN_num_bytes(y);
			if (skip > field_len)
				{
				ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
				goto err;
				}
			while (skip > 0)
				{
				buf[i++] = 0;
				skip--;
				}
			skip = BN_bn2bin(y, buf + i);
			i += skip;
			}

		/* final consistency check: we must have filled exactly 'ret' bytes */
		if (i != ret)
			{
			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
			goto err;
			}
		}

	if (used_ctx)
		BN_CTX_end(ctx);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	return ret;

 err:
	if (used_ctx)
		BN_CTX_end(ctx);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	return 0;
	}
| 323 | |||
| 324 | |||
| 325 | int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point, | ||
| 326 | const unsigned char *buf, size_t len, BN_CTX *ctx) | ||
| 327 | { | ||
| 328 | point_conversion_form_t form; | ||
| 329 | int y_bit; | ||
| 330 | BN_CTX *new_ctx = NULL; | ||
| 331 | BIGNUM *x, *y; | ||
| 332 | size_t field_len, enc_len; | ||
| 333 | int ret = 0; | ||
| 334 | |||
| 335 | if (len == 0) | ||
| 336 | { | ||
| 337 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL); | ||
| 338 | return 0; | ||
| 339 | } | ||
| 340 | form = buf[0]; | ||
| 341 | y_bit = form & 1; | ||
| 342 | form = form & ~1U; | ||
| 343 | if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED) | ||
| 344 | && (form != POINT_CONVERSION_UNCOMPRESSED) | ||
| 345 | && (form != POINT_CONVERSION_HYBRID)) | ||
| 346 | { | ||
| 347 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 348 | return 0; | ||
| 349 | } | ||
| 350 | if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit) | ||
| 351 | { | ||
| 352 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 353 | return 0; | ||
| 354 | } | ||
| 355 | |||
| 356 | if (form == 0) | ||
| 357 | { | ||
| 358 | if (len != 1) | ||
| 359 | { | ||
| 360 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 361 | return 0; | ||
| 362 | } | ||
| 363 | |||
| 364 | return EC_POINT_set_to_infinity(group, point); | ||
| 365 | } | ||
| 366 | |||
| 367 | field_len = BN_num_bytes(&group->field); | ||
| 368 | enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; | ||
| 369 | |||
| 370 | if (len != enc_len) | ||
| 371 | { | ||
| 372 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 373 | return 0; | ||
| 374 | } | ||
| 375 | |||
| 376 | if (ctx == NULL) | ||
| 377 | { | ||
| 378 | ctx = new_ctx = BN_CTX_new(); | ||
| 379 | if (ctx == NULL) | ||
| 380 | return 0; | ||
| 381 | } | ||
| 382 | |||
| 383 | BN_CTX_start(ctx); | ||
| 384 | x = BN_CTX_get(ctx); | ||
| 385 | y = BN_CTX_get(ctx); | ||
| 386 | if (y == NULL) goto err; | ||
| 387 | |||
| 388 | if (!BN_bin2bn(buf + 1, field_len, x)) goto err; | ||
| 389 | if (BN_ucmp(x, &group->field) >= 0) | ||
| 390 | { | ||
| 391 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 392 | goto err; | ||
| 393 | } | ||
| 394 | |||
| 395 | if (form == POINT_CONVERSION_COMPRESSED) | ||
| 396 | { | ||
| 397 | if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err; | ||
| 398 | } | ||
| 399 | else | ||
| 400 | { | ||
| 401 | if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err; | ||
| 402 | if (BN_ucmp(y, &group->field) >= 0) | ||
| 403 | { | ||
| 404 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 405 | goto err; | ||
| 406 | } | ||
| 407 | if (form == POINT_CONVERSION_HYBRID) | ||
| 408 | { | ||
| 409 | if (y_bit != BN_is_odd(y)) | ||
| 410 | { | ||
| 411 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 412 | goto err; | ||
| 413 | } | ||
| 414 | } | ||
| 415 | |||
| 416 | if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; | ||
| 417 | } | ||
| 418 | |||
| 419 | if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */ | ||
| 420 | { | ||
| 421 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE); | ||
| 422 | goto err; | ||
| 423 | } | ||
| 424 | |||
| 425 | ret = 1; | ||
| 426 | |||
| 427 | err: | ||
| 428 | BN_CTX_end(ctx); | ||
| 429 | if (new_ctx != NULL) | ||
| 430 | BN_CTX_free(new_ctx); | ||
| 431 | return ret; | ||
| 432 | } | ||
| 433 | |||
diff --git a/src/lib/libcrypto/ecdh/ecdh.h b/src/lib/libcrypto/ecdh/ecdh.h index b4b58ee65b..8887102c0b 100644 --- a/src/lib/libcrypto/ecdh/ecdh.h +++ b/src/lib/libcrypto/ecdh/ecdh.h | |||
| @@ -109,11 +109,13 @@ void ERR_load_ECDH_strings(void); | |||
| 109 | /* Error codes for the ECDH functions. */ | 109 | /* Error codes for the ECDH functions. */ |
| 110 | 110 | ||
| 111 | /* Function codes. */ | 111 | /* Function codes. */ |
| 112 | #define ECDH_F_ECDH_CHECK 102 | ||
| 112 | #define ECDH_F_ECDH_COMPUTE_KEY 100 | 113 | #define ECDH_F_ECDH_COMPUTE_KEY 100 |
| 113 | #define ECDH_F_ECDH_DATA_NEW_METHOD 101 | 114 | #define ECDH_F_ECDH_DATA_NEW_METHOD 101 |
| 114 | 115 | ||
| 115 | /* Reason codes. */ | 116 | /* Reason codes. */ |
| 116 | #define ECDH_R_KDF_FAILED 102 | 117 | #define ECDH_R_KDF_FAILED 102 |
| 118 | #define ECDH_R_NON_FIPS_METHOD 103 | ||
| 117 | #define ECDH_R_NO_PRIVATE_VALUE 100 | 119 | #define ECDH_R_NO_PRIVATE_VALUE 100 |
| 118 | #define ECDH_R_POINT_ARITHMETIC_FAILURE 101 | 120 | #define ECDH_R_POINT_ARITHMETIC_FAILURE 101 |
| 119 | 121 | ||
diff --git a/src/lib/libcrypto/ecdh/ech_err.c b/src/lib/libcrypto/ecdh/ech_err.c index 6f4b0c9953..3bd247398d 100644 --- a/src/lib/libcrypto/ecdh/ech_err.c +++ b/src/lib/libcrypto/ecdh/ech_err.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* crypto/ecdh/ech_err.c */ | 1 | /* crypto/ecdh/ech_err.c */ |
| 2 | /* ==================================================================== | 2 | /* ==================================================================== |
| 3 | * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
| @@ -70,6 +70,7 @@ | |||
| 70 | 70 | ||
| 71 | static ERR_STRING_DATA ECDH_str_functs[]= | 71 | static ERR_STRING_DATA ECDH_str_functs[]= |
| 72 | { | 72 | { |
| 73 | {ERR_FUNC(ECDH_F_ECDH_CHECK), "ECDH_CHECK"}, | ||
| 73 | {ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY), "ECDH_compute_key"}, | 74 | {ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY), "ECDH_compute_key"}, |
| 74 | {ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD), "ECDH_DATA_new_method"}, | 75 | {ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD), "ECDH_DATA_new_method"}, |
| 75 | {0,NULL} | 76 | {0,NULL} |
| @@ -78,6 +79,7 @@ static ERR_STRING_DATA ECDH_str_functs[]= | |||
| 78 | static ERR_STRING_DATA ECDH_str_reasons[]= | 79 | static ERR_STRING_DATA ECDH_str_reasons[]= |
| 79 | { | 80 | { |
| 80 | {ERR_REASON(ECDH_R_KDF_FAILED) ,"KDF failed"}, | 81 | {ERR_REASON(ECDH_R_KDF_FAILED) ,"KDF failed"}, |
| 82 | {ERR_REASON(ECDH_R_NON_FIPS_METHOD) ,"non fips method"}, | ||
| 81 | {ERR_REASON(ECDH_R_NO_PRIVATE_VALUE) ,"no private value"}, | 83 | {ERR_REASON(ECDH_R_NO_PRIVATE_VALUE) ,"no private value"}, |
| 82 | {ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"}, | 84 | {ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"}, |
| 83 | {0,NULL} | 85 | {0,NULL} |
diff --git a/src/lib/libcrypto/ecdh/ech_lib.c b/src/lib/libcrypto/ecdh/ech_lib.c index 4d8ea03d3d..dadbfd3c49 100644 --- a/src/lib/libcrypto/ecdh/ech_lib.c +++ b/src/lib/libcrypto/ecdh/ech_lib.c | |||
| @@ -73,6 +73,9 @@ | |||
| 73 | #include <openssl/engine.h> | 73 | #include <openssl/engine.h> |
| 74 | #endif | 74 | #endif |
| 75 | #include <openssl/err.h> | 75 | #include <openssl/err.h> |
| 76 | #ifdef OPENSSL_FIPS | ||
| 77 | #include <openssl/fips.h> | ||
| 78 | #endif | ||
| 76 | 79 | ||
| 77 | const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT; | 80 | const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT; |
| 78 | 81 | ||
| @@ -90,7 +93,16 @@ void ECDH_set_default_method(const ECDH_METHOD *meth) | |||
| 90 | const ECDH_METHOD *ECDH_get_default_method(void) | 93 | const ECDH_METHOD *ECDH_get_default_method(void) |
| 91 | { | 94 | { |
| 92 | if(!default_ECDH_method) | 95 | if(!default_ECDH_method) |
| 96 | { | ||
| 97 | #ifdef OPENSSL_FIPS | ||
| 98 | if (FIPS_mode()) | ||
| 99 | return FIPS_ecdh_openssl(); | ||
| 100 | else | ||
| 101 | return ECDH_OpenSSL(); | ||
| 102 | #else | ||
| 93 | default_ECDH_method = ECDH_OpenSSL(); | 103 | default_ECDH_method = ECDH_OpenSSL(); |
| 104 | #endif | ||
| 105 | } | ||
| 94 | return default_ECDH_method; | 106 | return default_ECDH_method; |
| 95 | } | 107 | } |
| 96 | 108 | ||
| @@ -215,6 +227,14 @@ ECDH_DATA *ecdh_check(EC_KEY *key) | |||
| 215 | } | 227 | } |
| 216 | else | 228 | else |
| 217 | ecdh_data = (ECDH_DATA *)data; | 229 | ecdh_data = (ECDH_DATA *)data; |
| 230 | #ifdef OPENSSL_FIPS | ||
| 231 | if (FIPS_mode() && !(ecdh_data->flags & ECDH_FLAG_FIPS_METHOD) | ||
| 232 | && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW)) | ||
| 233 | { | ||
| 234 | ECDHerr(ECDH_F_ECDH_CHECK, ECDH_R_NON_FIPS_METHOD); | ||
| 235 | return NULL; | ||
| 236 | } | ||
| 237 | #endif | ||
| 218 | 238 | ||
| 219 | 239 | ||
| 220 | return ecdh_data; | 240 | return ecdh_data; |
diff --git a/src/lib/libcrypto/ecdh/ech_locl.h b/src/lib/libcrypto/ecdh/ech_locl.h index f658526a7e..f6cad6a894 100644 --- a/src/lib/libcrypto/ecdh/ech_locl.h +++ b/src/lib/libcrypto/ecdh/ech_locl.h | |||
| @@ -75,6 +75,14 @@ struct ecdh_method | |||
| 75 | char *app_data; | 75 | char *app_data; |
| 76 | }; | 76 | }; |
| 77 | 77 | ||
| 78 | /* If this flag is set the ECDH method is FIPS compliant and can be used | ||
| 79 | * in FIPS mode. This is set in the validated module method. If an | ||
| 80 | * application sets this flag in its own methods it is its responsibility | ||
| 81 | * to ensure the result is compliant. | ||
| 82 | */ | ||
| 83 | |||
| 84 | #define ECDH_FLAG_FIPS_METHOD 0x1 | ||
| 85 | |||
| 78 | typedef struct ecdh_data_st { | 86 | typedef struct ecdh_data_st { |
| 79 | /* EC_KEY_METH_DATA part */ | 87 | /* EC_KEY_METH_DATA part */ |
| 80 | int (*init)(EC_KEY *); | 88 | int (*init)(EC_KEY *); |
diff --git a/src/lib/libcrypto/ecdsa/ecdsa.h b/src/lib/libcrypto/ecdsa/ecdsa.h index e61c539812..7fb5254b62 100644 --- a/src/lib/libcrypto/ecdsa/ecdsa.h +++ b/src/lib/libcrypto/ecdsa/ecdsa.h | |||
| @@ -238,6 +238,7 @@ void ERR_load_ECDSA_strings(void); | |||
| 238 | /* Error codes for the ECDSA functions. */ | 238 | /* Error codes for the ECDSA functions. */ |
| 239 | 239 | ||
| 240 | /* Function codes. */ | 240 | /* Function codes. */ |
| 241 | #define ECDSA_F_ECDSA_CHECK 104 | ||
| 241 | #define ECDSA_F_ECDSA_DATA_NEW_METHOD 100 | 242 | #define ECDSA_F_ECDSA_DATA_NEW_METHOD 100 |
| 242 | #define ECDSA_F_ECDSA_DO_SIGN 101 | 243 | #define ECDSA_F_ECDSA_DO_SIGN 101 |
| 243 | #define ECDSA_F_ECDSA_DO_VERIFY 102 | 244 | #define ECDSA_F_ECDSA_DO_VERIFY 102 |
| @@ -249,6 +250,7 @@ void ERR_load_ECDSA_strings(void); | |||
| 249 | #define ECDSA_R_ERR_EC_LIB 102 | 250 | #define ECDSA_R_ERR_EC_LIB 102 |
| 250 | #define ECDSA_R_MISSING_PARAMETERS 103 | 251 | #define ECDSA_R_MISSING_PARAMETERS 103 |
| 251 | #define ECDSA_R_NEED_NEW_SETUP_VALUES 106 | 252 | #define ECDSA_R_NEED_NEW_SETUP_VALUES 106 |
| 253 | #define ECDSA_R_NON_FIPS_METHOD 107 | ||
| 252 | #define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED 104 | 254 | #define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED 104 |
| 253 | #define ECDSA_R_SIGNATURE_MALLOC_FAILED 105 | 255 | #define ECDSA_R_SIGNATURE_MALLOC_FAILED 105 |
| 254 | 256 | ||
diff --git a/src/lib/libcrypto/ecdsa/ecs_err.c b/src/lib/libcrypto/ecdsa/ecs_err.c index 98e38d537f..81542e6d15 100644 --- a/src/lib/libcrypto/ecdsa/ecs_err.c +++ b/src/lib/libcrypto/ecdsa/ecs_err.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* crypto/ecdsa/ecs_err.c */ | 1 | /* crypto/ecdsa/ecs_err.c */ |
| 2 | /* ==================================================================== | 2 | /* ==================================================================== |
| 3 | * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
| @@ -70,6 +70,7 @@ | |||
| 70 | 70 | ||
| 71 | static ERR_STRING_DATA ECDSA_str_functs[]= | 71 | static ERR_STRING_DATA ECDSA_str_functs[]= |
| 72 | { | 72 | { |
| 73 | {ERR_FUNC(ECDSA_F_ECDSA_CHECK), "ECDSA_CHECK"}, | ||
| 73 | {ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"}, | 74 | {ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"}, |
| 74 | {ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"}, | 75 | {ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"}, |
| 75 | {ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"}, | 76 | {ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"}, |
| @@ -84,6 +85,7 @@ static ERR_STRING_DATA ECDSA_str_reasons[]= | |||
| 84 | {ERR_REASON(ECDSA_R_ERR_EC_LIB) ,"err ec lib"}, | 85 | {ERR_REASON(ECDSA_R_ERR_EC_LIB) ,"err ec lib"}, |
| 85 | {ERR_REASON(ECDSA_R_MISSING_PARAMETERS) ,"missing parameters"}, | 86 | {ERR_REASON(ECDSA_R_MISSING_PARAMETERS) ,"missing parameters"}, |
| 86 | {ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"}, | 87 | {ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"}, |
| 88 | {ERR_REASON(ECDSA_R_NON_FIPS_METHOD) ,"non fips method"}, | ||
| 87 | {ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"}, | 89 | {ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"}, |
| 88 | {ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"}, | 90 | {ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"}, |
| 89 | {0,NULL} | 91 | {0,NULL} |
diff --git a/src/lib/libcrypto/ecdsa/ecs_lib.c b/src/lib/libcrypto/ecdsa/ecs_lib.c index 2ebae3aa27..e477da430b 100644 --- a/src/lib/libcrypto/ecdsa/ecs_lib.c +++ b/src/lib/libcrypto/ecdsa/ecs_lib.c | |||
| @@ -60,6 +60,9 @@ | |||
| 60 | #endif | 60 | #endif |
| 61 | #include <openssl/err.h> | 61 | #include <openssl/err.h> |
| 62 | #include <openssl/bn.h> | 62 | #include <openssl/bn.h> |
| 63 | #ifdef OPENSSL_FIPS | ||
| 64 | #include <openssl/fips.h> | ||
| 65 | #endif | ||
| 63 | 66 | ||
| 64 | const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT; | 67 | const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT; |
| 65 | 68 | ||
| @@ -77,7 +80,16 @@ void ECDSA_set_default_method(const ECDSA_METHOD *meth) | |||
| 77 | const ECDSA_METHOD *ECDSA_get_default_method(void) | 80 | const ECDSA_METHOD *ECDSA_get_default_method(void) |
| 78 | { | 81 | { |
| 79 | if(!default_ECDSA_method) | 82 | if(!default_ECDSA_method) |
| 83 | { | ||
| 84 | #ifdef OPENSSL_FIPS | ||
| 85 | if (FIPS_mode()) | ||
| 86 | return FIPS_ecdsa_openssl(); | ||
| 87 | else | ||
| 88 | return ECDSA_OpenSSL(); | ||
| 89 | #else | ||
| 80 | default_ECDSA_method = ECDSA_OpenSSL(); | 90 | default_ECDSA_method = ECDSA_OpenSSL(); |
| 91 | #endif | ||
| 92 | } | ||
| 81 | return default_ECDSA_method; | 93 | return default_ECDSA_method; |
| 82 | } | 94 | } |
| 83 | 95 | ||
| @@ -193,7 +205,14 @@ ECDSA_DATA *ecdsa_check(EC_KEY *key) | |||
| 193 | } | 205 | } |
| 194 | else | 206 | else |
| 195 | ecdsa_data = (ECDSA_DATA *)data; | 207 | ecdsa_data = (ECDSA_DATA *)data; |
| 196 | 208 | #ifdef OPENSSL_FIPS | |
| 209 | if (FIPS_mode() && !(ecdsa_data->flags & ECDSA_FLAG_FIPS_METHOD) | ||
| 210 | && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW)) | ||
| 211 | { | ||
| 212 | ECDSAerr(ECDSA_F_ECDSA_CHECK, ECDSA_R_NON_FIPS_METHOD); | ||
| 213 | return NULL; | ||
| 214 | } | ||
| 215 | #endif | ||
| 197 | 216 | ||
| 198 | return ecdsa_data; | 217 | return ecdsa_data; |
| 199 | } | 218 | } |
diff --git a/src/lib/libcrypto/ecdsa/ecs_locl.h b/src/lib/libcrypto/ecdsa/ecs_locl.h index 3a69a840e2..cb3be13cfc 100644 --- a/src/lib/libcrypto/ecdsa/ecs_locl.h +++ b/src/lib/libcrypto/ecdsa/ecs_locl.h | |||
| @@ -82,6 +82,14 @@ struct ecdsa_method | |||
| 82 | char *app_data; | 82 | char *app_data; |
| 83 | }; | 83 | }; |
| 84 | 84 | ||
| 85 | /* If this flag is set the ECDSA method is FIPS compliant and can be used | ||
| 86 | * in FIPS mode. This is set in the validated module method. If an | ||
| 87 | * application sets this flag in its own methods it is its responsibility | ||
| 88 | * to ensure the result is compliant. | ||
| 89 | */ | ||
| 90 | |||
| 91 | #define ECDSA_FLAG_FIPS_METHOD 0x1 | ||
| 92 | |||
| 85 | typedef struct ecdsa_data_st { | 93 | typedef struct ecdsa_data_st { |
| 86 | /* EC_KEY_METH_DATA part */ | 94 | /* EC_KEY_METH_DATA part */ |
| 87 | int (*init)(EC_KEY *); | 95 | int (*init)(EC_KEY *); |
diff --git a/src/lib/libcrypto/ecdsa/ecs_ossl.c b/src/lib/libcrypto/ecdsa/ecs_ossl.c index 1bbf328de5..7725935610 100644 --- a/src/lib/libcrypto/ecdsa/ecs_ossl.c +++ b/src/lib/libcrypto/ecdsa/ecs_ossl.c | |||
| @@ -167,6 +167,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp, | |||
| 167 | goto err; | 167 | goto err; |
| 168 | } | 168 | } |
| 169 | } | 169 | } |
| 170 | #ifndef OPENSSL_NO_EC2M | ||
| 170 | else /* NID_X9_62_characteristic_two_field */ | 171 | else /* NID_X9_62_characteristic_two_field */ |
| 171 | { | 172 | { |
| 172 | if (!EC_POINT_get_affine_coordinates_GF2m(group, | 173 | if (!EC_POINT_get_affine_coordinates_GF2m(group, |
| @@ -176,6 +177,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp, | |||
| 176 | goto err; | 177 | goto err; |
| 177 | } | 178 | } |
| 178 | } | 179 | } |
| 180 | #endif | ||
| 179 | if (!BN_nnmod(r, X, order, ctx)) | 181 | if (!BN_nnmod(r, X, order, ctx)) |
| 180 | { | 182 | { |
| 181 | ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB); | 183 | ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB); |
| @@ -454,6 +456,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len, | |||
| 454 | goto err; | 456 | goto err; |
| 455 | } | 457 | } |
| 456 | } | 458 | } |
| 459 | #ifndef OPENSSL_NO_EC2M | ||
| 457 | else /* NID_X9_62_characteristic_two_field */ | 460 | else /* NID_X9_62_characteristic_two_field */ |
| 458 | { | 461 | { |
| 459 | if (!EC_POINT_get_affine_coordinates_GF2m(group, | 462 | if (!EC_POINT_get_affine_coordinates_GF2m(group, |
| @@ -463,7 +466,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len, | |||
| 463 | goto err; | 466 | goto err; |
| 464 | } | 467 | } |
| 465 | } | 468 | } |
| 466 | 469 | #endif | |
| 467 | if (!BN_nnmod(u1, X, order, ctx)) | 470 | if (!BN_nnmod(u1, X, order, ctx)) |
| 468 | { | 471 | { |
| 469 | ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB); | 472 | ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB); |
diff --git a/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c b/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c new file mode 100644 index 0000000000..710fb79baf --- /dev/null +++ b/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c | |||
| @@ -0,0 +1,406 @@ | |||
| 1 | /* ==================================================================== | ||
| 2 | * Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
| 3 | * | ||
| 4 | * Redistribution and use in source and binary forms, with or without | ||
| 5 | * modification, are permitted provided that the following conditions | ||
| 6 | * are met: | ||
| 7 | * | ||
| 8 | * 1. Redistributions of source code must retain the above copyright | ||
| 9 | * notice, this list of conditions and the following disclaimer. | ||
| 10 | * | ||
| 11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer in | ||
| 13 | * the documentation and/or other materials provided with the | ||
| 14 | * distribution. | ||
| 15 | * | ||
| 16 | * 3. All advertising materials mentioning features or use of this | ||
| 17 | * software must display the following acknowledgment: | ||
| 18 | * "This product includes software developed by the OpenSSL Project | ||
| 19 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 20 | * | ||
| 21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 22 | * endorse or promote products derived from this software without | ||
| 23 | * prior written permission. For written permission, please contact | ||
| 24 | * licensing@OpenSSL.org. | ||
| 25 | * | ||
| 26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 27 | * nor may "OpenSSL" appear in their names without prior written | ||
| 28 | * permission of the OpenSSL Project. | ||
| 29 | * | ||
| 30 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 31 | * acknowledgment: | ||
| 32 | * "This product includes software developed by the OpenSSL Project | ||
| 33 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 34 | * | ||
| 35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 47 | * ==================================================================== | ||
| 48 | */ | ||
| 49 | |||
| 50 | #include <openssl/opensslconf.h> | ||
| 51 | |||
| 52 | #include <stdio.h> | ||
| 53 | #include <string.h> | ||
| 54 | |||
| 55 | #if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1) | ||
| 56 | |||
| 57 | #include <openssl/evp.h> | ||
| 58 | #include <openssl/objects.h> | ||
| 59 | #include <openssl/aes.h> | ||
| 60 | #include <openssl/sha.h> | ||
| 61 | #include "evp_locl.h" | ||
| 62 | |||
| 63 | #ifndef EVP_CIPH_FLAG_AEAD_CIPHER | ||
| 64 | #define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000 | ||
| 65 | #define EVP_CTRL_AEAD_TLS1_AAD 0x16 | ||
| 66 | #define EVP_CTRL_AEAD_SET_MAC_KEY 0x17 | ||
| 67 | #endif | ||
| 68 | |||
| 69 | #if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1) | ||
| 70 | #define EVP_CIPH_FLAG_DEFAULT_ASN1 0 | ||
| 71 | #endif | ||
| 72 | |||
| 73 | #define TLS1_1_VERSION 0x0302 | ||
| 74 | |||
| 75 | typedef struct | ||
| 76 | { | ||
| 77 | AES_KEY ks; | ||
| 78 | SHA_CTX head,tail,md; | ||
| 79 | size_t payload_length; /* AAD length in decrypt case */ | ||
| 80 | union { | ||
| 81 | unsigned int tls_ver; | ||
| 82 | unsigned char tls_aad[16]; /* 13 used */ | ||
| 83 | } aux; | ||
| 84 | } EVP_AES_HMAC_SHA1; | ||
| 85 | |||
| 86 | #define NO_PAYLOAD_LENGTH ((size_t)-1) | ||
| 87 | |||
| 88 | #if defined(AES_ASM) && ( \ | ||
| 89 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 90 | defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 91 | defined(__INTEL__) ) | ||
| 92 | |||
| 93 | extern unsigned int OPENSSL_ia32cap_P[2]; | ||
| 94 | #define AESNI_CAPABLE (1<<(57-32)) | ||
| 95 | |||
| 96 | int aesni_set_encrypt_key(const unsigned char *userKey, int bits, | ||
| 97 | AES_KEY *key); | ||
| 98 | int aesni_set_decrypt_key(const unsigned char *userKey, int bits, | ||
| 99 | AES_KEY *key); | ||
| 100 | |||
| 101 | void aesni_cbc_encrypt(const unsigned char *in, | ||
| 102 | unsigned char *out, | ||
| 103 | size_t length, | ||
| 104 | const AES_KEY *key, | ||
| 105 | unsigned char *ivec, int enc); | ||
| 106 | |||
| 107 | void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks, | ||
| 108 | const AES_KEY *key, unsigned char iv[16], | ||
| 109 | SHA_CTX *ctx,const void *in0); | ||
| 110 | |||
| 111 | #define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data) | ||
| 112 | |||
| 113 | static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx, | ||
| 114 | const unsigned char *inkey, | ||
| 115 | const unsigned char *iv, int enc) | ||
| 116 | { | ||
| 117 | EVP_AES_HMAC_SHA1 *key = data(ctx); | ||
| 118 | int ret; | ||
| 119 | |||
| 120 | if (enc) | ||
| 121 | ret=aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks); | ||
| 122 | else | ||
| 123 | ret=aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks); | ||
| 124 | |||
| 125 | SHA1_Init(&key->head); /* handy when benchmarking */ | ||
| 126 | key->tail = key->head; | ||
| 127 | key->md = key->head; | ||
| 128 | |||
| 129 | key->payload_length = NO_PAYLOAD_LENGTH; | ||
| 130 | |||
| 131 | return ret<0?0:1; | ||
| 132 | } | ||
| 133 | |||
| 134 | #define STITCHED_CALL | ||
| 135 | |||
| 136 | #if !defined(STITCHED_CALL) | ||
| 137 | #define aes_off 0 | ||
| 138 | #endif | ||
| 139 | |||
| 140 | void sha1_block_data_order (void *c,const void *p,size_t len); | ||
| 141 | |||
| 142 | static void sha1_update(SHA_CTX *c,const void *data,size_t len) | ||
| 143 | { const unsigned char *ptr = data; | ||
| 144 | size_t res; | ||
| 145 | |||
| 146 | if ((res = c->num)) { | ||
| 147 | res = SHA_CBLOCK-res; | ||
| 148 | if (len<res) res=len; | ||
| 149 | SHA1_Update (c,ptr,res); | ||
| 150 | ptr += res; | ||
| 151 | len -= res; | ||
| 152 | } | ||
| 153 | |||
| 154 | res = len % SHA_CBLOCK; | ||
| 155 | len -= res; | ||
| 156 | |||
| 157 | if (len) { | ||
| 158 | sha1_block_data_order(c,ptr,len/SHA_CBLOCK); | ||
| 159 | |||
| 160 | ptr += len; | ||
| 161 | c->Nh += len>>29; | ||
| 162 | c->Nl += len<<=3; | ||
| 163 | if (c->Nl<(unsigned int)len) c->Nh++; | ||
| 164 | } | ||
| 165 | |||
| 166 | if (res) | ||
| 167 | SHA1_Update(c,ptr,res); | ||
| 168 | } | ||
| 169 | |||
| 170 | #define SHA1_Update sha1_update | ||
| 171 | |||
| 172 | static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, | ||
| 173 | const unsigned char *in, size_t len) | ||
| 174 | { | ||
| 175 | EVP_AES_HMAC_SHA1 *key = data(ctx); | ||
| 176 | unsigned int l; | ||
| 177 | size_t plen = key->payload_length, | ||
| 178 | iv = 0, /* explicit IV in TLS 1.1 and later */ | ||
| 179 | sha_off = 0; | ||
| 180 | #if defined(STITCHED_CALL) | ||
| 181 | size_t aes_off = 0, | ||
| 182 | blocks; | ||
| 183 | |||
| 184 | sha_off = SHA_CBLOCK-key->md.num; | ||
| 185 | #endif | ||
| 186 | |||
| 187 | if (len%AES_BLOCK_SIZE) return 0; | ||
| 188 | |||
| 189 | if (ctx->encrypt) { | ||
| 190 | if (plen==NO_PAYLOAD_LENGTH) | ||
| 191 | plen = len; | ||
| 192 | else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE)) | ||
| 193 | return 0; | ||
| 194 | else if (key->aux.tls_ver >= TLS1_1_VERSION) | ||
| 195 | iv = AES_BLOCK_SIZE; | ||
| 196 | |||
| 197 | #if defined(STITCHED_CALL) | ||
| 198 | if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) { | ||
| 199 | SHA1_Update(&key->md,in+iv,sha_off); | ||
| 200 | |||
| 201 | aesni_cbc_sha1_enc(in,out,blocks,&key->ks, | ||
| 202 | ctx->iv,&key->md,in+iv+sha_off); | ||
| 203 | blocks *= SHA_CBLOCK; | ||
| 204 | aes_off += blocks; | ||
| 205 | sha_off += blocks; | ||
| 206 | key->md.Nh += blocks>>29; | ||
| 207 | key->md.Nl += blocks<<=3; | ||
| 208 | if (key->md.Nl<(unsigned int)blocks) key->md.Nh++; | ||
| 209 | } else { | ||
| 210 | sha_off = 0; | ||
| 211 | } | ||
| 212 | #endif | ||
| 213 | sha_off += iv; | ||
| 214 | SHA1_Update(&key->md,in+sha_off,plen-sha_off); | ||
| 215 | |||
| 216 | if (plen!=len) { /* "TLS" mode of operation */ | ||
| 217 | if (in!=out) | ||
| 218 | memcpy(out+aes_off,in+aes_off,plen-aes_off); | ||
| 219 | |||
| 220 | /* calculate HMAC and append it to payload */ | ||
| 221 | SHA1_Final(out+plen,&key->md); | ||
| 222 | key->md = key->tail; | ||
| 223 | SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH); | ||
| 224 | SHA1_Final(out+plen,&key->md); | ||
| 225 | |||
| 226 | /* pad the payload|hmac */ | ||
| 227 | plen += SHA_DIGEST_LENGTH; | ||
| 228 | for (l=len-plen-1;plen<len;plen++) out[plen]=l; | ||
| 229 | /* encrypt HMAC|padding at once */ | ||
| 230 | aesni_cbc_encrypt(out+aes_off,out+aes_off,len-aes_off, | ||
| 231 | &key->ks,ctx->iv,1); | ||
| 232 | } else { | ||
| 233 | aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off, | ||
| 234 | &key->ks,ctx->iv,1); | ||
| 235 | } | ||
| 236 | } else { | ||
| 237 | unsigned char mac[SHA_DIGEST_LENGTH]; | ||
| 238 | |||
| 239 | /* decrypt HMAC|padding at once */ | ||
| 240 | aesni_cbc_encrypt(in,out,len, | ||
| 241 | &key->ks,ctx->iv,0); | ||
| 242 | |||
| 243 | if (plen) { /* "TLS" mode of operation */ | ||
| 244 | /* figure out payload length */ | ||
| 245 | if (len<(size_t)(out[len-1]+1+SHA_DIGEST_LENGTH)) | ||
| 246 | return 0; | ||
| 247 | |||
| 248 | len -= (out[len-1]+1+SHA_DIGEST_LENGTH); | ||
| 249 | |||
| 250 | if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3]) | ||
| 251 | >= TLS1_1_VERSION) { | ||
| 252 | len -= AES_BLOCK_SIZE; | ||
| 253 | iv = AES_BLOCK_SIZE; | ||
| 254 | } | ||
| 255 | |||
| 256 | key->aux.tls_aad[plen-2] = len>>8; | ||
| 257 | key->aux.tls_aad[plen-1] = len; | ||
| 258 | |||
| 259 | /* calculate HMAC and verify it */ | ||
| 260 | key->md = key->head; | ||
| 261 | SHA1_Update(&key->md,key->aux.tls_aad,plen); | ||
| 262 | SHA1_Update(&key->md,out+iv,len); | ||
| 263 | SHA1_Final(mac,&key->md); | ||
| 264 | |||
| 265 | key->md = key->tail; | ||
| 266 | SHA1_Update(&key->md,mac,SHA_DIGEST_LENGTH); | ||
| 267 | SHA1_Final(mac,&key->md); | ||
| 268 | |||
| 269 | if (memcmp(out+iv+len,mac,SHA_DIGEST_LENGTH)) | ||
| 270 | return 0; | ||
| 271 | } else { | ||
| 272 | SHA1_Update(&key->md,out,len); | ||
| 273 | } | ||
| 274 | } | ||
| 275 | |||
| 276 | key->payload_length = NO_PAYLOAD_LENGTH; | ||
| 277 | |||
| 278 | return 1; | ||
| 279 | } | ||
| 280 | |||
| 281 | static int aesni_cbc_hmac_sha1_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) | ||
| 282 | { | ||
| 283 | EVP_AES_HMAC_SHA1 *key = data(ctx); | ||
| 284 | |||
| 285 | switch (type) | ||
| 286 | { | ||
| 287 | case EVP_CTRL_AEAD_SET_MAC_KEY: | ||
| 288 | { | ||
| 289 | unsigned int i; | ||
| 290 | unsigned char hmac_key[64]; | ||
| 291 | |||
| 292 | memset (hmac_key,0,sizeof(hmac_key)); | ||
| 293 | |||
| 294 | if (arg > (int)sizeof(hmac_key)) { | ||
| 295 | SHA1_Init(&key->head); | ||
| 296 | SHA1_Update(&key->head,ptr,arg); | ||
| 297 | SHA1_Final(hmac_key,&key->head); | ||
| 298 | } else { | ||
| 299 | memcpy(hmac_key,ptr,arg); | ||
| 300 | } | ||
| 301 | |||
| 302 | for (i=0;i<sizeof(hmac_key);i++) | ||
| 303 | hmac_key[i] ^= 0x36; /* ipad */ | ||
| 304 | SHA1_Init(&key->head); | ||
| 305 | SHA1_Update(&key->head,hmac_key,sizeof(hmac_key)); | ||
| 306 | |||
| 307 | for (i=0;i<sizeof(hmac_key);i++) | ||
| 308 | hmac_key[i] ^= 0x36^0x5c; /* opad */ | ||
| 309 | SHA1_Init(&key->tail); | ||
| 310 | SHA1_Update(&key->tail,hmac_key,sizeof(hmac_key)); | ||
| 311 | |||
| 312 | return 1; | ||
| 313 | } | ||
| 314 | case EVP_CTRL_AEAD_TLS1_AAD: | ||
| 315 | { | ||
| 316 | unsigned char *p=ptr; | ||
| 317 | unsigned int len=p[arg-2]<<8|p[arg-1]; | ||
| 318 | |||
| 319 | if (ctx->encrypt) | ||
| 320 | { | ||
| 321 | key->payload_length = len; | ||
| 322 | if ((key->aux.tls_ver=p[arg-4]<<8|p[arg-3]) >= TLS1_1_VERSION) { | ||
| 323 | len -= AES_BLOCK_SIZE; | ||
| 324 | p[arg-2] = len>>8; | ||
| 325 | p[arg-1] = len; | ||
| 326 | } | ||
| 327 | key->md = key->head; | ||
| 328 | SHA1_Update(&key->md,p,arg); | ||
| 329 | |||
| 330 | return (int)(((len+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE) | ||
| 331 | - len); | ||
| 332 | } | ||
| 333 | else | ||
| 334 | { | ||
| 335 | if (arg>13) arg = 13; | ||
| 336 | memcpy(key->aux.tls_aad,ptr,arg); | ||
| 337 | key->payload_length = arg; | ||
| 338 | |||
| 339 | return SHA_DIGEST_LENGTH; | ||
| 340 | } | ||
| 341 | } | ||
| 342 | default: | ||
| 343 | return -1; | ||
| 344 | } | ||
| 345 | } | ||
| 346 | |||
| 347 | static EVP_CIPHER aesni_128_cbc_hmac_sha1_cipher = | ||
| 348 | { | ||
| 349 | #ifdef NID_aes_128_cbc_hmac_sha1 | ||
| 350 | NID_aes_128_cbc_hmac_sha1, | ||
| 351 | #else | ||
| 352 | NID_undef, | ||
| 353 | #endif | ||
| 354 | 16,16,16, | ||
| 355 | EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER, | ||
| 356 | aesni_cbc_hmac_sha1_init_key, | ||
| 357 | aesni_cbc_hmac_sha1_cipher, | ||
| 358 | NULL, | ||
| 359 | sizeof(EVP_AES_HMAC_SHA1), | ||
| 360 | EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv, | ||
| 361 | EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv, | ||
| 362 | aesni_cbc_hmac_sha1_ctrl, | ||
| 363 | NULL | ||
| 364 | }; | ||
| 365 | |||
| 366 | static EVP_CIPHER aesni_256_cbc_hmac_sha1_cipher = | ||
| 367 | { | ||
| 368 | #ifdef NID_aes_256_cbc_hmac_sha1 | ||
| 369 | NID_aes_256_cbc_hmac_sha1, | ||
| 370 | #else | ||
| 371 | NID_undef, | ||
| 372 | #endif | ||
| 373 | 16,32,16, | ||
| 374 | EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER, | ||
| 375 | aesni_cbc_hmac_sha1_init_key, | ||
| 376 | aesni_cbc_hmac_sha1_cipher, | ||
| 377 | NULL, | ||
| 378 | sizeof(EVP_AES_HMAC_SHA1), | ||
| 379 | EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv, | ||
| 380 | EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv, | ||
| 381 | aesni_cbc_hmac_sha1_ctrl, | ||
| 382 | NULL | ||
| 383 | }; | ||
| 384 | |||
| 385 | const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void) | ||
| 386 | { | ||
| 387 | return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE? | ||
| 388 | &aesni_128_cbc_hmac_sha1_cipher:NULL); | ||
| 389 | } | ||
| 390 | |||
| 391 | const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void) | ||
| 392 | { | ||
| 393 | return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE? | ||
| 394 | &aesni_256_cbc_hmac_sha1_cipher:NULL); | ||
| 395 | } | ||
| 396 | #else | ||
| 397 | const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void) | ||
| 398 | { | ||
| 399 | return NULL; | ||
| 400 | } | ||
| 401 | const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void) | ||
| 402 | { | ||
| 403 | return NULL; | ||
| 404 | } | ||
| 405 | #endif | ||
| 406 | #endif | ||
diff --git a/src/lib/libcrypto/evp/e_rc4_hmac_md5.c b/src/lib/libcrypto/evp/e_rc4_hmac_md5.c new file mode 100644 index 0000000000..56563191ba --- /dev/null +++ b/src/lib/libcrypto/evp/e_rc4_hmac_md5.c | |||
| @@ -0,0 +1,298 @@ | |||
| 1 | /* ==================================================================== | ||
| 2 | * Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
| 3 | * | ||
| 4 | * Redistribution and use in source and binary forms, with or without | ||
| 5 | * modification, are permitted provided that the following conditions | ||
| 6 | * are met: | ||
| 7 | * | ||
| 8 | * 1. Redistributions of source code must retain the above copyright | ||
| 9 | * notice, this list of conditions and the following disclaimer. | ||
| 10 | * | ||
| 11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer in | ||
| 13 | * the documentation and/or other materials provided with the | ||
| 14 | * distribution. | ||
| 15 | * | ||
| 16 | * 3. All advertising materials mentioning features or use of this | ||
| 17 | * software must display the following acknowledgment: | ||
| 18 | * "This product includes software developed by the OpenSSL Project | ||
| 19 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 20 | * | ||
| 21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 22 | * endorse or promote products derived from this software without | ||
| 23 | * prior written permission. For written permission, please contact | ||
| 24 | * licensing@OpenSSL.org. | ||
| 25 | * | ||
| 26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 27 | * nor may "OpenSSL" appear in their names without prior written | ||
| 28 | * permission of the OpenSSL Project. | ||
| 29 | * | ||
| 30 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 31 | * acknowledgment: | ||
| 32 | * "This product includes software developed by the OpenSSL Project | ||
| 33 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 34 | * | ||
| 35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 47 | * ==================================================================== | ||
| 48 | */ | ||
| 49 | |||
| 50 | #include <openssl/opensslconf.h> | ||
| 51 | |||
| 52 | #include <stdio.h> | ||
| 53 | #include <string.h> | ||
| 54 | |||
| 55 | #if !defined(OPENSSL_NO_RC4) && !defined(OPENSSL_NO_MD5) | ||
| 56 | |||
| 57 | #include <openssl/evp.h> | ||
| 58 | #include <openssl/objects.h> | ||
| 59 | #include <openssl/rc4.h> | ||
| 60 | #include <openssl/md5.h> | ||
| 61 | |||
| 62 | #ifndef EVP_CIPH_FLAG_AEAD_CIPHER | ||
| 63 | #define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000 | ||
| 64 | #define EVP_CTRL_AEAD_TLS1_AAD 0x16 | ||
| 65 | #define EVP_CTRL_AEAD_SET_MAC_KEY 0x17 | ||
| 66 | #endif | ||
| 67 | |||
| 68 | /* FIXME: surely this is available elsewhere? */ | ||
| 69 | #define EVP_RC4_KEY_SIZE 16 | ||
| 70 | |||
| 71 | typedef struct | ||
| 72 | { | ||
| 73 | RC4_KEY ks; | ||
| 74 | MD5_CTX head,tail,md; | ||
| 75 | size_t payload_length; | ||
| 76 | } EVP_RC4_HMAC_MD5; | ||
| 77 | |||
| 78 | #define NO_PAYLOAD_LENGTH ((size_t)-1) | ||
| 79 | |||
| 80 | void rc4_md5_enc (RC4_KEY *key, const void *in0, void *out, | ||
| 81 | MD5_CTX *ctx,const void *inp,size_t blocks); | ||
| 82 | |||
| 83 | #define data(ctx) ((EVP_RC4_HMAC_MD5 *)(ctx)->cipher_data) | ||
| 84 | |||
| 85 | static int rc4_hmac_md5_init_key(EVP_CIPHER_CTX *ctx, | ||
| 86 | const unsigned char *inkey, | ||
| 87 | const unsigned char *iv, int enc) | ||
| 88 | { | ||
| 89 | EVP_RC4_HMAC_MD5 *key = data(ctx); | ||
| 90 | |||
| 91 | RC4_set_key(&key->ks,EVP_CIPHER_CTX_key_length(ctx), | ||
| 92 | inkey); | ||
| 93 | |||
| 94 | MD5_Init(&key->head); /* handy when benchmarking */ | ||
| 95 | key->tail = key->head; | ||
| 96 | key->md = key->head; | ||
| 97 | |||
| 98 | key->payload_length = NO_PAYLOAD_LENGTH; | ||
| 99 | |||
| 100 | return 1; | ||
| 101 | } | ||
| 102 | |||
| 103 | #if !defined(OPENSSL_NO_ASM) && ( \ | ||
| 104 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 105 | defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 106 | defined(__INTEL__) ) && \ | ||
| 107 | !(defined(__APPLE__) && defined(__MACH__)) | ||
| 108 | #define STITCHED_CALL | ||
| 109 | #endif | ||
| 110 | |||
| 111 | #if !defined(STITCHED_CALL) | ||
| 112 | #define rc4_off 0 | ||
| 113 | #define md5_off 0 | ||
| 114 | #endif | ||
| 115 | |||
| 116 | static int rc4_hmac_md5_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, | ||
| 117 | const unsigned char *in, size_t len) | ||
| 118 | { | ||
| 119 | EVP_RC4_HMAC_MD5 *key = data(ctx); | ||
| 120 | #if defined(STITCHED_CALL) | ||
| 121 | size_t rc4_off = 32-1-(key->ks.x&(32-1)), /* 32 is $MOD from rc4_md5-x86_64.pl */ | ||
| 122 | md5_off = MD5_CBLOCK-key->md.num, | ||
| 123 | blocks; | ||
| 124 | unsigned int l; | ||
| 125 | extern unsigned int OPENSSL_ia32cap_P[]; | ||
| 126 | #endif | ||
| 127 | size_t plen = key->payload_length; | ||
| 128 | |||
| 129 | if (plen!=NO_PAYLOAD_LENGTH && len!=(plen+MD5_DIGEST_LENGTH)) return 0; | ||
| 130 | |||
| 131 | if (ctx->encrypt) { | ||
| 132 | if (plen==NO_PAYLOAD_LENGTH) plen = len; | ||
| 133 | #if defined(STITCHED_CALL) | ||
| 134 | /* cipher has to "fall behind" */ | ||
| 135 | if (rc4_off>md5_off) md5_off+=MD5_CBLOCK; | ||
| 136 | |||
| 137 | if (plen>md5_off && (blocks=(plen-md5_off)/MD5_CBLOCK) && | ||
| 138 | (OPENSSL_ia32cap_P[0]&(1<<20))==0) { | ||
| 139 | MD5_Update(&key->md,in,md5_off); | ||
| 140 | RC4(&key->ks,rc4_off,in,out); | ||
| 141 | |||
| 142 | rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off, | ||
| 143 | &key->md,in+md5_off,blocks); | ||
| 144 | blocks *= MD5_CBLOCK; | ||
| 145 | rc4_off += blocks; | ||
| 146 | md5_off += blocks; | ||
| 147 | key->md.Nh += blocks>>29; | ||
| 148 | key->md.Nl += blocks<<=3; | ||
| 149 | if (key->md.Nl<(unsigned int)blocks) key->md.Nh++; | ||
| 150 | } else { | ||
| 151 | rc4_off = 0; | ||
| 152 | md5_off = 0; | ||
| 153 | } | ||
| 154 | #endif | ||
| 155 | MD5_Update(&key->md,in+md5_off,plen-md5_off); | ||
| 156 | |||
| 157 | if (plen!=len) { /* "TLS" mode of operation */ | ||
| 158 | if (in!=out) | ||
| 159 | memcpy(out+rc4_off,in+rc4_off,plen-rc4_off); | ||
| 160 | |||
| 161 | /* calculate HMAC and append it to payload */ | ||
| 162 | MD5_Final(out+plen,&key->md); | ||
| 163 | key->md = key->tail; | ||
| 164 | MD5_Update(&key->md,out+plen,MD5_DIGEST_LENGTH); | ||
| 165 | MD5_Final(out+plen,&key->md); | ||
| 166 | /* encrypt HMAC at once */ | ||
| 167 | RC4(&key->ks,len-rc4_off,out+rc4_off,out+rc4_off); | ||
| 168 | } else { | ||
| 169 | RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off); | ||
| 170 | } | ||
| 171 | } else { | ||
| 172 | unsigned char mac[MD5_DIGEST_LENGTH]; | ||
| 173 | #if defined(STITCHED_CALL) | ||
| 174 | /* digest has to "fall behind" */ | ||
| 175 | if (md5_off>rc4_off) rc4_off += 2*MD5_CBLOCK; | ||
| 176 | else rc4_off += MD5_CBLOCK; | ||
| 177 | |||
| 178 | if (len>rc4_off && (blocks=(len-rc4_off)/MD5_CBLOCK) && | ||
| 179 | (OPENSSL_ia32cap_P[0]&(1<<20))==0) { | ||
| 180 | RC4(&key->ks,rc4_off,in,out); | ||
| 181 | MD5_Update(&key->md,out,md5_off); | ||
| 182 | |||
| 183 | rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off, | ||
| 184 | &key->md,out+md5_off,blocks); | ||
| 185 | blocks *= MD5_CBLOCK; | ||
| 186 | rc4_off += blocks; | ||
| 187 | md5_off += blocks; | ||
| 188 | l = (key->md.Nl+(blocks<<3))&0xffffffffU; | ||
| 189 | if (l<key->md.Nl) key->md.Nh++; | ||
| 190 | key->md.Nl = l; | ||
| 191 | key->md.Nh += blocks>>29; | ||
| 192 | } else { | ||
| 193 | md5_off=0; | ||
| 194 | rc4_off=0; | ||
| 195 | } | ||
| 196 | #endif | ||
| 197 | /* decrypt HMAC at once */ | ||
| 198 | RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off); | ||
| 199 | if (plen!=NO_PAYLOAD_LENGTH) { /* "TLS" mode of operation */ | ||
| 200 | MD5_Update(&key->md,out+md5_off,plen-md5_off); | ||
| 201 | |||
| 202 | /* calculate HMAC and verify it */ | ||
| 203 | MD5_Final(mac,&key->md); | ||
| 204 | key->md = key->tail; | ||
| 205 | MD5_Update(&key->md,mac,MD5_DIGEST_LENGTH); | ||
| 206 | MD5_Final(mac,&key->md); | ||
| 207 | |||
| 208 | if (memcmp(out+plen,mac,MD5_DIGEST_LENGTH)) | ||
| 209 | return 0; | ||
| 210 | } else { | ||
| 211 | MD5_Update(&key->md,out+md5_off,len-md5_off); | ||
| 212 | } | ||
| 213 | } | ||
| 214 | |||
| 215 | key->payload_length = NO_PAYLOAD_LENGTH; | ||
| 216 | |||
| 217 | return 1; | ||
| 218 | } | ||
| 219 | |||
| 220 | static int rc4_hmac_md5_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) | ||
| 221 | { | ||
| 222 | EVP_RC4_HMAC_MD5 *key = data(ctx); | ||
| 223 | |||
| 224 | switch (type) | ||
| 225 | { | ||
| 226 | case EVP_CTRL_AEAD_SET_MAC_KEY: | ||
| 227 | { | ||
| 228 | unsigned int i; | ||
| 229 | unsigned char hmac_key[64]; | ||
| 230 | |||
| 231 | memset (hmac_key,0,sizeof(hmac_key)); | ||
| 232 | |||
| 233 | if (arg > (int)sizeof(hmac_key)) { | ||
| 234 | MD5_Init(&key->head); | ||
| 235 | MD5_Update(&key->head,ptr,arg); | ||
| 236 | MD5_Final(hmac_key,&key->head); | ||
| 237 | } else { | ||
| 238 | memcpy(hmac_key,ptr,arg); | ||
| 239 | } | ||
| 240 | |||
| 241 | for (i=0;i<sizeof(hmac_key);i++) | ||
| 242 | hmac_key[i] ^= 0x36; /* ipad */ | ||
| 243 | MD5_Init(&key->head); | ||
| 244 | MD5_Update(&key->head,hmac_key,sizeof(hmac_key)); | ||
| 245 | |||
| 246 | for (i=0;i<sizeof(hmac_key);i++) | ||
| 247 | hmac_key[i] ^= 0x36^0x5c; /* opad */ | ||
| 248 | MD5_Init(&key->tail); | ||
| 249 | MD5_Update(&key->tail,hmac_key,sizeof(hmac_key)); | ||
| 250 | |||
| 251 | return 1; | ||
| 252 | } | ||
| 253 | case EVP_CTRL_AEAD_TLS1_AAD: | ||
| 254 | { | ||
| 255 | unsigned char *p=ptr; | ||
| 256 | unsigned int len=p[arg-2]<<8|p[arg-1]; | ||
| 257 | |||
| 258 | if (!ctx->encrypt) | ||
| 259 | { | ||
| 260 | len -= MD5_DIGEST_LENGTH; | ||
| 261 | p[arg-2] = len>>8; | ||
| 262 | p[arg-1] = len; | ||
| 263 | } | ||
| 264 | key->payload_length=len; | ||
| 265 | key->md = key->head; | ||
| 266 | MD5_Update(&key->md,p,arg); | ||
| 267 | |||
| 268 | return MD5_DIGEST_LENGTH; | ||
| 269 | } | ||
| 270 | default: | ||
| 271 | return -1; | ||
| 272 | } | ||
| 273 | } | ||
| 274 | |||
| 275 | static EVP_CIPHER r4_hmac_md5_cipher= | ||
| 276 | { | ||
| 277 | #ifdef NID_rc4_hmac_md5 | ||
| 278 | NID_rc4_hmac_md5, | ||
| 279 | #else | ||
| 280 | NID_undef, | ||
| 281 | #endif | ||
| 282 | 1,EVP_RC4_KEY_SIZE,0, | ||
| 283 | EVP_CIPH_STREAM_CIPHER|EVP_CIPH_VARIABLE_LENGTH|EVP_CIPH_FLAG_AEAD_CIPHER, | ||
| 284 | rc4_hmac_md5_init_key, | ||
| 285 | rc4_hmac_md5_cipher, | ||
| 286 | NULL, | ||
| 287 | sizeof(EVP_RC4_HMAC_MD5), | ||
| 288 | NULL, | ||
| 289 | NULL, | ||
| 290 | rc4_hmac_md5_ctrl, | ||
| 291 | NULL | ||
| 292 | }; | ||
| 293 | |||
| 294 | const EVP_CIPHER *EVP_rc4_hmac_md5(void) | ||
| 295 | { | ||
| 296 | return(&r4_hmac_md5_cipher); | ||
| 297 | } | ||
| 298 | #endif | ||
diff --git a/src/lib/libcrypto/evp/m_ecdsa.c b/src/lib/libcrypto/evp/m_ecdsa.c index 8d87a49ebe..4b15fb0f6c 100644 --- a/src/lib/libcrypto/evp/m_ecdsa.c +++ b/src/lib/libcrypto/evp/m_ecdsa.c | |||
| @@ -116,6 +116,8 @@ | |||
| 116 | #include <openssl/x509.h> | 116 | #include <openssl/x509.h> |
| 117 | 117 | ||
| 118 | #ifndef OPENSSL_NO_SHA | 118 | #ifndef OPENSSL_NO_SHA |
| 119 | #ifndef OPENSSL_FIPS | ||
| 120 | |||
| 119 | static int init(EVP_MD_CTX *ctx) | 121 | static int init(EVP_MD_CTX *ctx) |
| 120 | { return SHA1_Init(ctx->md_data); } | 122 | { return SHA1_Init(ctx->md_data); } |
| 121 | 123 | ||
| @@ -146,3 +148,4 @@ const EVP_MD *EVP_ecdsa(void) | |||
| 146 | return(&ecdsa_md); | 148 | return(&ecdsa_md); |
| 147 | } | 149 | } |
| 148 | #endif | 150 | #endif |
| 151 | #endif | ||
diff --git a/src/lib/libcrypto/evp/m_wp.c b/src/lib/libcrypto/evp/m_wp.c index 1ce47c040b..c51bc2d5d1 100644 --- a/src/lib/libcrypto/evp/m_wp.c +++ b/src/lib/libcrypto/evp/m_wp.c | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | #include <openssl/objects.h> | 9 | #include <openssl/objects.h> |
| 10 | #include <openssl/x509.h> | 10 | #include <openssl/x509.h> |
| 11 | #include <openssl/whrlpool.h> | 11 | #include <openssl/whrlpool.h> |
| 12 | #include "evp_locl.h" | ||
| 12 | 13 | ||
| 13 | static int init(EVP_MD_CTX *ctx) | 14 | static int init(EVP_MD_CTX *ctx) |
| 14 | { return WHIRLPOOL_Init(ctx->md_data); } | 15 | { return WHIRLPOOL_Init(ctx->md_data); } |
diff --git a/src/lib/libcrypto/evp/pmeth_gn.c b/src/lib/libcrypto/evp/pmeth_gn.c index 5d74161a09..4651c81370 100644 --- a/src/lib/libcrypto/evp/pmeth_gn.c +++ b/src/lib/libcrypto/evp/pmeth_gn.c | |||
| @@ -199,7 +199,7 @@ int EVP_PKEY_CTX_get_keygen_info(EVP_PKEY_CTX *ctx, int idx) | |||
| 199 | } | 199 | } |
| 200 | 200 | ||
| 201 | EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, | 201 | EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, |
| 202 | unsigned char *key, int keylen) | 202 | const unsigned char *key, int keylen) |
| 203 | { | 203 | { |
| 204 | EVP_PKEY_CTX *mac_ctx = NULL; | 204 | EVP_PKEY_CTX *mac_ctx = NULL; |
| 205 | EVP_PKEY *mac_key = NULL; | 205 | EVP_PKEY *mac_key = NULL; |
| @@ -209,7 +209,8 @@ EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, | |||
| 209 | if (EVP_PKEY_keygen_init(mac_ctx) <= 0) | 209 | if (EVP_PKEY_keygen_init(mac_ctx) <= 0) |
| 210 | goto merr; | 210 | goto merr; |
| 211 | if (EVP_PKEY_CTX_ctrl(mac_ctx, -1, EVP_PKEY_OP_KEYGEN, | 211 | if (EVP_PKEY_CTX_ctrl(mac_ctx, -1, EVP_PKEY_OP_KEYGEN, |
| 212 | EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key) <= 0) | 212 | EVP_PKEY_CTRL_SET_MAC_KEY, |
| 213 | keylen, (void *)key) <= 0) | ||
| 213 | goto merr; | 214 | goto merr; |
| 214 | if (EVP_PKEY_keygen(mac_ctx, &mac_key) <= 0) | 215 | if (EVP_PKEY_keygen(mac_ctx, &mac_key) <= 0) |
| 215 | goto merr; | 216 | goto merr; |
diff --git a/src/lib/libcrypto/evp/pmeth_lib.c b/src/lib/libcrypto/evp/pmeth_lib.c index 5481d4b8a5..acfa7b6f87 100644 --- a/src/lib/libcrypto/evp/pmeth_lib.c +++ b/src/lib/libcrypto/evp/pmeth_lib.c | |||
| @@ -73,7 +73,7 @@ DECLARE_STACK_OF(EVP_PKEY_METHOD) | |||
| 73 | STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL; | 73 | STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL; |
| 74 | 74 | ||
| 75 | extern const EVP_PKEY_METHOD rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth; | 75 | extern const EVP_PKEY_METHOD rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth; |
| 76 | extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth; | 76 | extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth, cmac_pkey_meth; |
| 77 | 77 | ||
| 78 | static const EVP_PKEY_METHOD *standard_methods[] = | 78 | static const EVP_PKEY_METHOD *standard_methods[] = |
| 79 | { | 79 | { |
| @@ -90,6 +90,7 @@ static const EVP_PKEY_METHOD *standard_methods[] = | |||
| 90 | &ec_pkey_meth, | 90 | &ec_pkey_meth, |
| 91 | #endif | 91 | #endif |
| 92 | &hmac_pkey_meth, | 92 | &hmac_pkey_meth, |
| 93 | &cmac_pkey_meth | ||
| 93 | }; | 94 | }; |
| 94 | 95 | ||
| 95 | DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *, | 96 | DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *, |
| @@ -203,6 +204,8 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags) | |||
| 203 | if (!pmeth) | 204 | if (!pmeth) |
| 204 | return NULL; | 205 | return NULL; |
| 205 | 206 | ||
| 207 | memset(pmeth, 0, sizeof(EVP_PKEY_METHOD)); | ||
| 208 | |||
| 206 | pmeth->pkey_id = id; | 209 | pmeth->pkey_id = id; |
| 207 | pmeth->flags = flags | EVP_PKEY_FLAG_DYNAMIC; | 210 | pmeth->flags = flags | EVP_PKEY_FLAG_DYNAMIC; |
| 208 | 211 | ||
| @@ -235,6 +238,56 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags) | |||
| 235 | return pmeth; | 238 | return pmeth; |
| 236 | } | 239 | } |
| 237 | 240 | ||
| 241 | void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags, | ||
| 242 | const EVP_PKEY_METHOD *meth) | ||
| 243 | { | ||
| 244 | if (ppkey_id) | ||
| 245 | *ppkey_id = meth->pkey_id; | ||
| 246 | if (pflags) | ||
| 247 | *pflags = meth->flags; | ||
| 248 | } | ||
| 249 | |||
| 250 | void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src) | ||
| 251 | { | ||
| 252 | |||
| 253 | dst->init = src->init; | ||
| 254 | dst->copy = src->copy; | ||
| 255 | dst->cleanup = src->cleanup; | ||
| 256 | |||
| 257 | dst->paramgen_init = src->paramgen_init; | ||
| 258 | dst->paramgen = src->paramgen; | ||
| 259 | |||
| 260 | dst->keygen_init = src->keygen_init; | ||
| 261 | dst->keygen = src->keygen; | ||
| 262 | |||
| 263 | dst->sign_init = src->sign_init; | ||
| 264 | dst->sign = src->sign; | ||
| 265 | |||
| 266 | dst->verify_init = src->verify_init; | ||
| 267 | dst->verify = src->verify; | ||
| 268 | |||
| 269 | dst->verify_recover_init = src->verify_recover_init; | ||
| 270 | dst->verify_recover = src->verify_recover; | ||
| 271 | |||
| 272 | dst->signctx_init = src->signctx_init; | ||
| 273 | dst->signctx = src->signctx; | ||
| 274 | |||
| 275 | dst->verifyctx_init = src->verifyctx_init; | ||
| 276 | dst->verifyctx = src->verifyctx; | ||
| 277 | |||
| 278 | dst->encrypt_init = src->encrypt_init; | ||
| 279 | dst->encrypt = src->encrypt; | ||
| 280 | |||
| 281 | dst->decrypt_init = src->decrypt_init; | ||
| 282 | dst->decrypt = src->decrypt; | ||
| 283 | |||
| 284 | dst->derive_init = src->derive_init; | ||
| 285 | dst->derive = src->derive; | ||
| 286 | |||
| 287 | dst->ctrl = src->ctrl; | ||
| 288 | dst->ctrl_str = src->ctrl_str; | ||
| 289 | } | ||
| 290 | |||
| 238 | void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth) | 291 | void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth) |
| 239 | { | 292 | { |
| 240 | if (pmeth && (pmeth->flags & EVP_PKEY_FLAG_DYNAMIC)) | 293 | if (pmeth && (pmeth->flags & EVP_PKEY_FLAG_DYNAMIC)) |
diff --git a/src/lib/libcrypto/hmac/hm_ameth.c b/src/lib/libcrypto/hmac/hm_ameth.c index 6d8a89149e..e03f24aeda 100644 --- a/src/lib/libcrypto/hmac/hm_ameth.c +++ b/src/lib/libcrypto/hmac/hm_ameth.c | |||
| @@ -153,7 +153,7 @@ const EVP_PKEY_ASN1_METHOD hmac_asn1_meth = | |||
| 153 | 153 | ||
| 154 | hmac_size, | 154 | hmac_size, |
| 155 | 0, | 155 | 0, |
| 156 | 0,0,0,0,0,0, | 156 | 0,0,0,0,0,0,0, |
| 157 | 157 | ||
| 158 | hmac_key_free, | 158 | hmac_key_free, |
| 159 | hmac_pkey_ctrl, | 159 | hmac_pkey_ctrl, |
diff --git a/src/lib/libcrypto/hmac/hm_pmeth.c b/src/lib/libcrypto/hmac/hm_pmeth.c index 71e8567a14..0daa44511d 100644 --- a/src/lib/libcrypto/hmac/hm_pmeth.c +++ b/src/lib/libcrypto/hmac/hm_pmeth.c | |||
| @@ -100,7 +100,8 @@ static int pkey_hmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) | |||
| 100 | dctx = dst->data; | 100 | dctx = dst->data; |
| 101 | dctx->md = sctx->md; | 101 | dctx->md = sctx->md; |
| 102 | HMAC_CTX_init(&dctx->ctx); | 102 | HMAC_CTX_init(&dctx->ctx); |
| 103 | HMAC_CTX_copy(&dctx->ctx, &sctx->ctx); | 103 | if (!HMAC_CTX_copy(&dctx->ctx, &sctx->ctx)) |
| 104 | return 0; | ||
| 104 | if (sctx->ktmp.data) | 105 | if (sctx->ktmp.data) |
| 105 | { | 106 | { |
| 106 | if (!ASN1_OCTET_STRING_set(&dctx->ktmp, | 107 | if (!ASN1_OCTET_STRING_set(&dctx->ktmp, |
| @@ -141,7 +142,8 @@ static int pkey_hmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) | |||
| 141 | static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) | 142 | static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) |
| 142 | { | 143 | { |
| 143 | HMAC_PKEY_CTX *hctx = ctx->pctx->data; | 144 | HMAC_PKEY_CTX *hctx = ctx->pctx->data; |
| 144 | HMAC_Update(&hctx->ctx, data, count); | 145 | if (!HMAC_Update(&hctx->ctx, data, count)) |
| 146 | return 0; | ||
| 145 | return 1; | 147 | return 1; |
| 146 | } | 148 | } |
| 147 | 149 | ||
| @@ -167,7 +169,8 @@ static int hmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | |||
| 167 | if (!sig) | 169 | if (!sig) |
| 168 | return 1; | 170 | return 1; |
| 169 | 171 | ||
| 170 | HMAC_Final(&hctx->ctx, sig, &hlen); | 172 | if (!HMAC_Final(&hctx->ctx, sig, &hlen)) |
| 173 | return 0; | ||
| 171 | *siglen = (size_t)hlen; | 174 | *siglen = (size_t)hlen; |
| 172 | return 1; | 175 | return 1; |
| 173 | } | 176 | } |
| @@ -192,8 +195,9 @@ static int pkey_hmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
| 192 | 195 | ||
| 193 | case EVP_PKEY_CTRL_DIGESTINIT: | 196 | case EVP_PKEY_CTRL_DIGESTINIT: |
| 194 | key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr; | 197 | key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr; |
| 195 | HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md, | 198 | if (!HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md, |
| 196 | ctx->engine); | 199 | ctx->engine)) |
| 200 | return 0; | ||
| 197 | break; | 201 | break; |
| 198 | 202 | ||
| 199 | default: | 203 | default: |
diff --git a/src/lib/libcrypto/ia64cpuid.S b/src/lib/libcrypto/ia64cpuid.S index d705fff7ee..7832b9b640 100644 --- a/src/lib/libcrypto/ia64cpuid.S +++ b/src/lib/libcrypto/ia64cpuid.S | |||
| @@ -26,7 +26,7 @@ OPENSSL_atomic_add: | |||
| 26 | { .mii; mov ar.ccv=r2 | 26 | { .mii; mov ar.ccv=r2 |
| 27 | add r8=r2,r33 | 27 | add r8=r2,r33 |
| 28 | mov r3=r2 };; | 28 | mov r3=r2 };; |
| 29 | { .mmi; mf | 29 | { .mmi; mf;; |
| 30 | cmpxchg4.acq r2=[r32],r8,ar.ccv | 30 | cmpxchg4.acq r2=[r32],r8,ar.ccv |
| 31 | nop.i 0 };; | 31 | nop.i 0 };; |
| 32 | { .mib; cmp.ne p6,p0=r2,r3 | 32 | { .mib; cmp.ne p6,p0=r2,r3 |
diff --git a/src/lib/libcrypto/idea/i_cbc.c b/src/lib/libcrypto/idea/i_cbc.c new file mode 100644 index 0000000000..ecb9cb8b83 --- /dev/null +++ b/src/lib/libcrypto/idea/i_cbc.c | |||
| @@ -0,0 +1,168 @@ | |||
| 1 | /* crypto/idea/i_cbc.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | #include <openssl/idea.h> | ||
| 60 | #include "idea_lcl.h" | ||
| 61 | |||
| 62 | void idea_cbc_encrypt(const unsigned char *in, unsigned char *out, long length, | ||
| 63 | IDEA_KEY_SCHEDULE *ks, unsigned char *iv, int encrypt) | ||
| 64 | { | ||
| 65 | register unsigned long tin0,tin1; | ||
| 66 | register unsigned long tout0,tout1,xor0,xor1; | ||
| 67 | register long l=length; | ||
| 68 | unsigned long tin[2]; | ||
| 69 | |||
| 70 | if (encrypt) | ||
| 71 | { | ||
| 72 | n2l(iv,tout0); | ||
| 73 | n2l(iv,tout1); | ||
| 74 | iv-=8; | ||
| 75 | for (l-=8; l>=0; l-=8) | ||
| 76 | { | ||
| 77 | n2l(in,tin0); | ||
| 78 | n2l(in,tin1); | ||
| 79 | tin0^=tout0; | ||
| 80 | tin1^=tout1; | ||
| 81 | tin[0]=tin0; | ||
| 82 | tin[1]=tin1; | ||
| 83 | idea_encrypt(tin,ks); | ||
| 84 | tout0=tin[0]; l2n(tout0,out); | ||
| 85 | tout1=tin[1]; l2n(tout1,out); | ||
| 86 | } | ||
| 87 | if (l != -8) | ||
| 88 | { | ||
| 89 | n2ln(in,tin0,tin1,l+8); | ||
| 90 | tin0^=tout0; | ||
| 91 | tin1^=tout1; | ||
| 92 | tin[0]=tin0; | ||
| 93 | tin[1]=tin1; | ||
| 94 | idea_encrypt(tin,ks); | ||
| 95 | tout0=tin[0]; l2n(tout0,out); | ||
| 96 | tout1=tin[1]; l2n(tout1,out); | ||
| 97 | } | ||
| 98 | l2n(tout0,iv); | ||
| 99 | l2n(tout1,iv); | ||
| 100 | } | ||
| 101 | else | ||
| 102 | { | ||
| 103 | n2l(iv,xor0); | ||
| 104 | n2l(iv,xor1); | ||
| 105 | iv-=8; | ||
| 106 | for (l-=8; l>=0; l-=8) | ||
| 107 | { | ||
| 108 | n2l(in,tin0); tin[0]=tin0; | ||
| 109 | n2l(in,tin1); tin[1]=tin1; | ||
| 110 | idea_encrypt(tin,ks); | ||
| 111 | tout0=tin[0]^xor0; | ||
| 112 | tout1=tin[1]^xor1; | ||
| 113 | l2n(tout0,out); | ||
| 114 | l2n(tout1,out); | ||
| 115 | xor0=tin0; | ||
| 116 | xor1=tin1; | ||
| 117 | } | ||
| 118 | if (l != -8) | ||
| 119 | { | ||
| 120 | n2l(in,tin0); tin[0]=tin0; | ||
| 121 | n2l(in,tin1); tin[1]=tin1; | ||
| 122 | idea_encrypt(tin,ks); | ||
| 123 | tout0=tin[0]^xor0; | ||
| 124 | tout1=tin[1]^xor1; | ||
| 125 | l2nn(tout0,tout1,out,l+8); | ||
| 126 | xor0=tin0; | ||
| 127 | xor1=tin1; | ||
| 128 | } | ||
| 129 | l2n(xor0,iv); | ||
| 130 | l2n(xor1,iv); | ||
| 131 | } | ||
| 132 | tin0=tin1=tout0=tout1=xor0=xor1=0; | ||
| 133 | tin[0]=tin[1]=0; | ||
| 134 | } | ||
| 135 | |||
| 136 | void idea_encrypt(unsigned long *d, IDEA_KEY_SCHEDULE *key) | ||
| 137 | { | ||
| 138 | register IDEA_INT *p; | ||
| 139 | register unsigned long x1,x2,x3,x4,t0,t1,ul; | ||
| 140 | |||
| 141 | x2=d[0]; | ||
| 142 | x1=(x2>>16); | ||
| 143 | x4=d[1]; | ||
| 144 | x3=(x4>>16); | ||
| 145 | |||
| 146 | p= &(key->data[0][0]); | ||
| 147 | |||
| 148 | E_IDEA(0); | ||
| 149 | E_IDEA(1); | ||
| 150 | E_IDEA(2); | ||
| 151 | E_IDEA(3); | ||
| 152 | E_IDEA(4); | ||
| 153 | E_IDEA(5); | ||
| 154 | E_IDEA(6); | ||
| 155 | E_IDEA(7); | ||
| 156 | |||
| 157 | x1&=0xffff; | ||
| 158 | idea_mul(x1,x1,*p,ul); p++; | ||
| 159 | |||
| 160 | t0= x3+ *(p++); | ||
| 161 | t1= x2+ *(p++); | ||
| 162 | |||
| 163 | x4&=0xffff; | ||
| 164 | idea_mul(x4,x4,*p,ul); | ||
| 165 | |||
| 166 | d[0]=(t0&0xffff)|((x1&0xffff)<<16); | ||
| 167 | d[1]=(x4&0xffff)|((t1&0xffff)<<16); | ||
| 168 | } | ||
diff --git a/src/lib/libcrypto/idea/i_cfb64.c b/src/lib/libcrypto/idea/i_cfb64.c new file mode 100644 index 0000000000..66d49d520e --- /dev/null +++ b/src/lib/libcrypto/idea/i_cfb64.c | |||
| @@ -0,0 +1,122 @@ | |||
| 1 | /* crypto/idea/i_cfb64.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | #include <openssl/idea.h> | ||
| 60 | #include "idea_lcl.h" | ||
| 61 | |||
| 62 | /* The input and output encrypted as though 64bit cfb mode is being | ||
| 63 | * used. The extra state information to record how much of the | ||
| 64 | * 64bit block we have used is contained in *num; | ||
| 65 | */ | ||
| 66 | |||
| 67 | void idea_cfb64_encrypt(const unsigned char *in, unsigned char *out, | ||
| 68 | long length, IDEA_KEY_SCHEDULE *schedule, | ||
| 69 | unsigned char *ivec, int *num, int encrypt) | ||
| 70 | { | ||
| 71 | register unsigned long v0,v1,t; | ||
| 72 | register int n= *num; | ||
| 73 | register long l=length; | ||
| 74 | unsigned long ti[2]; | ||
| 75 | unsigned char *iv,c,cc; | ||
| 76 | |||
| 77 | iv=(unsigned char *)ivec; | ||
| 78 | if (encrypt) | ||
| 79 | { | ||
| 80 | while (l--) | ||
| 81 | { | ||
| 82 | if (n == 0) | ||
| 83 | { | ||
| 84 | n2l(iv,v0); ti[0]=v0; | ||
| 85 | n2l(iv,v1); ti[1]=v1; | ||
| 86 | idea_encrypt((unsigned long *)ti,schedule); | ||
| 87 | iv=(unsigned char *)ivec; | ||
| 88 | t=ti[0]; l2n(t,iv); | ||
| 89 | t=ti[1]; l2n(t,iv); | ||
| 90 | iv=(unsigned char *)ivec; | ||
| 91 | } | ||
| 92 | c= *(in++)^iv[n]; | ||
| 93 | *(out++)=c; | ||
| 94 | iv[n]=c; | ||
| 95 | n=(n+1)&0x07; | ||
| 96 | } | ||
| 97 | } | ||
| 98 | else | ||
| 99 | { | ||
| 100 | while (l--) | ||
| 101 | { | ||
| 102 | if (n == 0) | ||
| 103 | { | ||
| 104 | n2l(iv,v0); ti[0]=v0; | ||
| 105 | n2l(iv,v1); ti[1]=v1; | ||
| 106 | idea_encrypt((unsigned long *)ti,schedule); | ||
| 107 | iv=(unsigned char *)ivec; | ||
| 108 | t=ti[0]; l2n(t,iv); | ||
| 109 | t=ti[1]; l2n(t,iv); | ||
| 110 | iv=(unsigned char *)ivec; | ||
| 111 | } | ||
| 112 | cc= *(in++); | ||
| 113 | c=iv[n]; | ||
| 114 | iv[n]=cc; | ||
| 115 | *(out++)=c^cc; | ||
| 116 | n=(n+1)&0x07; | ||
| 117 | } | ||
| 118 | } | ||
| 119 | v0=v1=ti[0]=ti[1]=t=c=cc=0; | ||
| 120 | *num=n; | ||
| 121 | } | ||
| 122 | |||
diff --git a/src/lib/libcrypto/idea/i_ecb.c b/src/lib/libcrypto/idea/i_ecb.c new file mode 100644 index 0000000000..fef38230a7 --- /dev/null +++ b/src/lib/libcrypto/idea/i_ecb.c | |||
| @@ -0,0 +1,85 @@ | |||
| 1 | /* crypto/idea/i_ecb.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | #include <openssl/idea.h> | ||
| 60 | #include "idea_lcl.h" | ||
| 61 | #include <openssl/opensslv.h> | ||
| 62 | |||
| 63 | const char IDEA_version[]="IDEA" OPENSSL_VERSION_PTEXT; | ||
| 64 | |||
| 65 | const char *idea_options(void) | ||
| 66 | { | ||
| 67 | if (sizeof(short) != sizeof(IDEA_INT)) | ||
| 68 | return("idea(int)"); | ||
| 69 | else | ||
| 70 | return("idea(short)"); | ||
| 71 | } | ||
| 72 | |||
| 73 | void idea_ecb_encrypt(const unsigned char *in, unsigned char *out, | ||
| 74 | IDEA_KEY_SCHEDULE *ks) | ||
| 75 | { | ||
| 76 | unsigned long l0,l1,d[2]; | ||
| 77 | |||
| 78 | n2l(in,l0); d[0]=l0; | ||
| 79 | n2l(in,l1); d[1]=l1; | ||
| 80 | idea_encrypt(d,ks); | ||
| 81 | l0=d[0]; l2n(l0,out); | ||
| 82 | l1=d[1]; l2n(l1,out); | ||
| 83 | l0=l1=d[0]=d[1]=0; | ||
| 84 | } | ||
| 85 | |||
diff --git a/src/lib/libcrypto/idea/i_ofb64.c b/src/lib/libcrypto/idea/i_ofb64.c new file mode 100644 index 0000000000..e749e88e34 --- /dev/null +++ b/src/lib/libcrypto/idea/i_ofb64.c | |||
| @@ -0,0 +1,111 @@ | |||
| 1 | /* crypto/idea/i_ofb64.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | #include <openssl/idea.h> | ||
| 60 | #include "idea_lcl.h" | ||
| 61 | |||
| 62 | /* The input and output encrypted as though 64bit ofb mode is being | ||
| 63 | * used. The extra state information to record how much of the | ||
| 64 | * 64bit block we have used is contained in *num; | ||
| 65 | */ | ||
| 66 | void idea_ofb64_encrypt(const unsigned char *in, unsigned char *out, | ||
| 67 | long length, IDEA_KEY_SCHEDULE *schedule, | ||
| 68 | unsigned char *ivec, int *num) | ||
| 69 | { | ||
| 70 | register unsigned long v0,v1,t; | ||
| 71 | register int n= *num; | ||
| 72 | register long l=length; | ||
| 73 | unsigned char d[8]; | ||
| 74 | register char *dp; | ||
| 75 | unsigned long ti[2]; | ||
| 76 | unsigned char *iv; | ||
| 77 | int save=0; | ||
| 78 | |||
| 79 | iv=(unsigned char *)ivec; | ||
| 80 | n2l(iv,v0); | ||
| 81 | n2l(iv,v1); | ||
| 82 | ti[0]=v0; | ||
| 83 | ti[1]=v1; | ||
| 84 | dp=(char *)d; | ||
| 85 | l2n(v0,dp); | ||
| 86 | l2n(v1,dp); | ||
| 87 | while (l--) | ||
| 88 | { | ||
| 89 | if (n == 0) | ||
| 90 | { | ||
| 91 | idea_encrypt((unsigned long *)ti,schedule); | ||
| 92 | dp=(char *)d; | ||
| 93 | t=ti[0]; l2n(t,dp); | ||
| 94 | t=ti[1]; l2n(t,dp); | ||
| 95 | save++; | ||
| 96 | } | ||
| 97 | *(out++)= *(in++)^d[n]; | ||
| 98 | n=(n+1)&0x07; | ||
| 99 | } | ||
| 100 | if (save) | ||
| 101 | { | ||
| 102 | v0=ti[0]; | ||
| 103 | v1=ti[1]; | ||
| 104 | iv=(unsigned char *)ivec; | ||
| 105 | l2n(v0,iv); | ||
| 106 | l2n(v1,iv); | ||
| 107 | } | ||
| 108 | t=v0=v1=ti[0]=ti[1]=0; | ||
| 109 | *num=n; | ||
| 110 | } | ||
| 111 | |||
diff --git a/src/lib/libcrypto/idea/i_skey.c b/src/lib/libcrypto/idea/i_skey.c new file mode 100644 index 0000000000..afb830964d --- /dev/null +++ b/src/lib/libcrypto/idea/i_skey.c | |||
| @@ -0,0 +1,164 @@ | |||
| 1 | /* crypto/idea/i_skey.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | #include <openssl/crypto.h> | ||
| 60 | #include <openssl/idea.h> | ||
| 61 | #include "idea_lcl.h" | ||
| 62 | |||
| 63 | static IDEA_INT inverse(unsigned int xin); | ||
| 64 | void idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks) | ||
| 65 | #ifdef OPENSSL_FIPS | ||
| 66 | { | ||
| 67 | fips_cipher_abort(IDEA); | ||
| 68 | private_idea_set_encrypt_key(key, ks); | ||
| 69 | } | ||
| 70 | void private_idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks) | ||
| 71 | #endif | ||
| 72 | { | ||
| 73 | int i; | ||
| 74 | register IDEA_INT *kt,*kf,r0,r1,r2; | ||
| 75 | |||
| 76 | kt= &(ks->data[0][0]); | ||
| 77 | n2s(key,kt[0]); n2s(key,kt[1]); n2s(key,kt[2]); n2s(key,kt[3]); | ||
| 78 | n2s(key,kt[4]); n2s(key,kt[5]); n2s(key,kt[6]); n2s(key,kt[7]); | ||
| 79 | |||
| 80 | kf=kt; | ||
| 81 | kt+=8; | ||
| 82 | for (i=0; i<6; i++) | ||
| 83 | { | ||
| 84 | r2= kf[1]; | ||
| 85 | r1= kf[2]; | ||
| 86 | *(kt++)= ((r2<<9) | (r1>>7))&0xffff; | ||
| 87 | r0= kf[3]; | ||
| 88 | *(kt++)= ((r1<<9) | (r0>>7))&0xffff; | ||
| 89 | r1= kf[4]; | ||
| 90 | *(kt++)= ((r0<<9) | (r1>>7))&0xffff; | ||
| 91 | r0= kf[5]; | ||
| 92 | *(kt++)= ((r1<<9) | (r0>>7))&0xffff; | ||
| 93 | r1= kf[6]; | ||
| 94 | *(kt++)= ((r0<<9) | (r1>>7))&0xffff; | ||
| 95 | r0= kf[7]; | ||
| 96 | *(kt++)= ((r1<<9) | (r0>>7))&0xffff; | ||
| 97 | r1= kf[0]; | ||
| 98 | if (i >= 5) break; | ||
| 99 | *(kt++)= ((r0<<9) | (r1>>7))&0xffff; | ||
| 100 | *(kt++)= ((r1<<9) | (r2>>7))&0xffff; | ||
| 101 | kf+=8; | ||
| 102 | } | ||
| 103 | } | ||
| 104 | |||
| 105 | void idea_set_decrypt_key(IDEA_KEY_SCHEDULE *ek, IDEA_KEY_SCHEDULE *dk) | ||
| 106 | { | ||
| 107 | int r; | ||
| 108 | register IDEA_INT *fp,*tp,t; | ||
| 109 | |||
| 110 | tp= &(dk->data[0][0]); | ||
| 111 | fp= &(ek->data[8][0]); | ||
| 112 | for (r=0; r<9; r++) | ||
| 113 | { | ||
| 114 | *(tp++)=inverse(fp[0]); | ||
| 115 | *(tp++)=((int)(0x10000L-fp[2])&0xffff); | ||
| 116 | *(tp++)=((int)(0x10000L-fp[1])&0xffff); | ||
| 117 | *(tp++)=inverse(fp[3]); | ||
| 118 | if (r == 8) break; | ||
| 119 | fp-=6; | ||
| 120 | *(tp++)=fp[4]; | ||
| 121 | *(tp++)=fp[5]; | ||
| 122 | } | ||
| 123 | |||
| 124 | tp= &(dk->data[0][0]); | ||
| 125 | t=tp[1]; | ||
| 126 | tp[1]=tp[2]; | ||
| 127 | tp[2]=t; | ||
| 128 | |||
| 129 | t=tp[49]; | ||
| 130 | tp[49]=tp[50]; | ||
| 131 | tp[50]=t; | ||
| 132 | } | ||
| 133 | |||
| 134 | /* taken directly from the 'paper' I'll have a look at it later */ | ||
| 135 | static IDEA_INT inverse(unsigned int xin) | ||
| 136 | { | ||
| 137 | long n1,n2,q,r,b1,b2,t; | ||
| 138 | |||
| 139 | if (xin == 0) | ||
| 140 | b2=0; | ||
| 141 | else | ||
| 142 | { | ||
| 143 | n1=0x10001; | ||
| 144 | n2=xin; | ||
| 145 | b2=1; | ||
| 146 | b1=0; | ||
| 147 | |||
| 148 | do { | ||
| 149 | r=(n1%n2); | ||
| 150 | q=(n1-r)/n2; | ||
| 151 | if (r == 0) | ||
| 152 | { if (b2 < 0) b2=0x10001+b2; } | ||
| 153 | else | ||
| 154 | { | ||
| 155 | n1=n2; | ||
| 156 | n2=r; | ||
| 157 | t=b2; | ||
| 158 | b2=b1-q*b2; | ||
| 159 | b1=t; | ||
| 160 | } | ||
| 161 | } while (r != 0); | ||
| 162 | } | ||
| 163 | return((IDEA_INT)b2); | ||
| 164 | } | ||
diff --git a/src/lib/libcrypto/idea/idea_lcl.h b/src/lib/libcrypto/idea/idea_lcl.h new file mode 100644 index 0000000000..f3dbfa67e9 --- /dev/null +++ b/src/lib/libcrypto/idea/idea_lcl.h | |||
| @@ -0,0 +1,215 @@ | |||
| 1 | /* crypto/idea/idea_lcl.h */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | /* The new form of this macro (check if the a*b == 0) was suggested by | ||
| 60 | * Colin Plumb <colin@nyx10.cs.du.edu> */ | ||
| 61 | /* Removal of the inner if from from Wei Dai 24/4/96 */ | ||
| 62 | #define idea_mul(r,a,b,ul) \ | ||
| 63 | ul=(unsigned long)a*b; \ | ||
| 64 | if (ul != 0) \ | ||
| 65 | { \ | ||
| 66 | r=(ul&0xffff)-(ul>>16); \ | ||
| 67 | r-=((r)>>16); \ | ||
| 68 | } \ | ||
| 69 | else \ | ||
| 70 | r=(-(int)a-b+1); /* assuming a or b is 0 and in range */ | ||
| 71 | |||
| 72 | #ifdef undef | ||
| 73 | #define idea_mul(r,a,b,ul,sl) \ | ||
| 74 | if (a == 0) r=(0x10001-b)&0xffff; \ | ||
| 75 | else if (b == 0) r=(0x10001-a)&0xffff; \ | ||
| 76 | else { \ | ||
| 77 | ul=(unsigned long)a*b; \ | ||
| 78 | sl=(ul&0xffff)-(ul>>16); \ | ||
| 79 | if (sl <= 0) sl+=0x10001; \ | ||
| 80 | r=sl; \ | ||
| 81 | } | ||
| 82 | #endif | ||
| 83 | |||
| 84 | /* 7/12/95 - Many thanks to Rhys Weatherley <rweather@us.oracle.com> | ||
| 85 | * for pointing out that I was assuming little endian | ||
| 86 | * byte order for all quantities what idea | ||
| 87 | * actually used bigendian. No where in the spec does it mention | ||
| 88 | * this, it is all in terms of 16 bit numbers and even the example | ||
| 89 | * does not use byte streams for the input example :-(. | ||
| 90 | * If you byte swap each pair of input, keys and iv, the functions | ||
| 91 | * would produce the output as the old version :-(. | ||
| 92 | */ | ||
| 93 | |||
| 94 | /* NOTE - c is not incremented as per n2l */ | ||
| 95 | #define n2ln(c,l1,l2,n) { \ | ||
| 96 | c+=n; \ | ||
| 97 | l1=l2=0; \ | ||
| 98 | switch (n) { \ | ||
| 99 | case 8: l2 =((unsigned long)(*(--(c)))) ; \ | ||
| 100 | case 7: l2|=((unsigned long)(*(--(c))))<< 8; \ | ||
| 101 | case 6: l2|=((unsigned long)(*(--(c))))<<16; \ | ||
| 102 | case 5: l2|=((unsigned long)(*(--(c))))<<24; \ | ||
| 103 | case 4: l1 =((unsigned long)(*(--(c)))) ; \ | ||
| 104 | case 3: l1|=((unsigned long)(*(--(c))))<< 8; \ | ||
| 105 | case 2: l1|=((unsigned long)(*(--(c))))<<16; \ | ||
| 106 | case 1: l1|=((unsigned long)(*(--(c))))<<24; \ | ||
| 107 | } \ | ||
| 108 | } | ||
| 109 | |||
| 110 | /* NOTE - c is not incremented as per l2n */ | ||
| 111 | #define l2nn(l1,l2,c,n) { \ | ||
| 112 | c+=n; \ | ||
| 113 | switch (n) { \ | ||
| 114 | case 8: *(--(c))=(unsigned char)(((l2) )&0xff); \ | ||
| 115 | case 7: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \ | ||
| 116 | case 6: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \ | ||
| 117 | case 5: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \ | ||
| 118 | case 4: *(--(c))=(unsigned char)(((l1) )&0xff); \ | ||
| 119 | case 3: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \ | ||
| 120 | case 2: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \ | ||
| 121 | case 1: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \ | ||
| 122 | } \ | ||
| 123 | } | ||
| 124 | |||
| 125 | #undef n2l | ||
| 126 | #define n2l(c,l) (l =((unsigned long)(*((c)++)))<<24L, \ | ||
| 127 | l|=((unsigned long)(*((c)++)))<<16L, \ | ||
| 128 | l|=((unsigned long)(*((c)++)))<< 8L, \ | ||
| 129 | l|=((unsigned long)(*((c)++)))) | ||
| 130 | |||
| 131 | #undef l2n | ||
| 132 | #define l2n(l,c) (*((c)++)=(unsigned char)(((l)>>24L)&0xff), \ | ||
| 133 | *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ | ||
| 134 | *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ | ||
| 135 | *((c)++)=(unsigned char)(((l) )&0xff)) | ||
| 136 | |||
| 137 | #undef s2n | ||
| 138 | #define s2n(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ | ||
| 139 | *((c)++)=(unsigned char)(((l)>> 8L)&0xff)) | ||
| 140 | |||
| 141 | #undef n2s | ||
| 142 | #define n2s(c,l) (l =((IDEA_INT)(*((c)++)))<< 8L, \ | ||
| 143 | l|=((IDEA_INT)(*((c)++))) ) | ||
| 144 | |||
| 145 | #ifdef undef | ||
| 146 | /* NOTE - c is not incremented as per c2l */ | ||
| 147 | #define c2ln(c,l1,l2,n) { \ | ||
| 148 | c+=n; \ | ||
| 149 | l1=l2=0; \ | ||
| 150 | switch (n) { \ | ||
| 151 | case 8: l2 =((unsigned long)(*(--(c))))<<24; \ | ||
| 152 | case 7: l2|=((unsigned long)(*(--(c))))<<16; \ | ||
| 153 | case 6: l2|=((unsigned long)(*(--(c))))<< 8; \ | ||
| 154 | case 5: l2|=((unsigned long)(*(--(c)))); \ | ||
| 155 | case 4: l1 =((unsigned long)(*(--(c))))<<24; \ | ||
| 156 | case 3: l1|=((unsigned long)(*(--(c))))<<16; \ | ||
| 157 | case 2: l1|=((unsigned long)(*(--(c))))<< 8; \ | ||
| 158 | case 1: l1|=((unsigned long)(*(--(c)))); \ | ||
| 159 | } \ | ||
| 160 | } | ||
| 161 | |||
| 162 | /* NOTE - c is not incremented as per l2c */ | ||
| 163 | #define l2cn(l1,l2,c,n) { \ | ||
| 164 | c+=n; \ | ||
| 165 | switch (n) { \ | ||
| 166 | case 8: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \ | ||
| 167 | case 7: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \ | ||
| 168 | case 6: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \ | ||
| 169 | case 5: *(--(c))=(unsigned char)(((l2) )&0xff); \ | ||
| 170 | case 4: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \ | ||
| 171 | case 3: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \ | ||
| 172 | case 2: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \ | ||
| 173 | case 1: *(--(c))=(unsigned char)(((l1) )&0xff); \ | ||
| 174 | } \ | ||
| 175 | } | ||
| 176 | |||
| 177 | #undef c2s | ||
| 178 | #define c2s(c,l) (l =((unsigned long)(*((c)++))) , \ | ||
| 179 | l|=((unsigned long)(*((c)++)))<< 8L) | ||
| 180 | |||
| 181 | #undef s2c | ||
| 182 | #define s2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ | ||
| 183 | *((c)++)=(unsigned char)(((l)>> 8L)&0xff)) | ||
| 184 | |||
| 185 | #undef c2l | ||
| 186 | #define c2l(c,l) (l =((unsigned long)(*((c)++))) , \ | ||
| 187 | l|=((unsigned long)(*((c)++)))<< 8L, \ | ||
| 188 | l|=((unsigned long)(*((c)++)))<<16L, \ | ||
| 189 | l|=((unsigned long)(*((c)++)))<<24L) | ||
| 190 | |||
| 191 | #undef l2c | ||
| 192 | #define l2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ | ||
| 193 | *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ | ||
| 194 | *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ | ||
| 195 | *((c)++)=(unsigned char)(((l)>>24L)&0xff)) | ||
| 196 | #endif | ||
| 197 | |||
| 198 | #define E_IDEA(num) \ | ||
| 199 | x1&=0xffff; \ | ||
| 200 | idea_mul(x1,x1,*p,ul); p++; \ | ||
| 201 | x2+= *(p++); \ | ||
| 202 | x3+= *(p++); \ | ||
| 203 | x4&=0xffff; \ | ||
| 204 | idea_mul(x4,x4,*p,ul); p++; \ | ||
| 205 | t0=(x1^x3)&0xffff; \ | ||
| 206 | idea_mul(t0,t0,*p,ul); p++; \ | ||
| 207 | t1=(t0+(x2^x4))&0xffff; \ | ||
| 208 | idea_mul(t1,t1,*p,ul); p++; \ | ||
| 209 | t0+=t1; \ | ||
| 210 | x1^=t1; \ | ||
| 211 | x4^=t0; \ | ||
| 212 | ul=x2^t0; /* do the swap to x3 */ \ | ||
| 213 | x2=x3^t1; \ | ||
| 214 | x3=ul; | ||
| 215 | |||
diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl new file mode 100644 index 0000000000..6358b2750f --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-alpha.pl | |||
| @@ -0,0 +1,451 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # March 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+128 bytes shared table]. Even though | ||
| 15 | # loops are aggressively modulo-scheduled in respect to references to | ||
| 16 | # Htbl and Z.hi updates for 8 cycles per byte, measured performance is | ||
| 17 | # ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic | ||
| 18 | # scheduling "glitch," because uprofile(1) indicates uniform sample | ||
| 19 | # distribution, as if all instruction bundles execute in 1.5 cycles. | ||
| 20 | # Meaning that it could have been even faster, yet 12 cycles is ~60% | ||
| 21 | # better than gcc-generated code and ~80% than code generated by vendor | ||
| 22 | # compiler. | ||
| 23 | |||
| 24 | $cnt="v0"; # $0 | ||
| 25 | $t0="t0"; | ||
| 26 | $t1="t1"; | ||
| 27 | $t2="t2"; | ||
| 28 | $Thi0="t3"; # $4 | ||
| 29 | $Tlo0="t4"; | ||
| 30 | $Thi1="t5"; | ||
| 31 | $Tlo1="t6"; | ||
| 32 | $rem="t7"; # $8 | ||
| 33 | ################# | ||
| 34 | $Xi="a0"; # $16, input argument block | ||
| 35 | $Htbl="a1"; | ||
| 36 | $inp="a2"; | ||
| 37 | $len="a3"; | ||
| 38 | $nlo="a4"; # $20 | ||
| 39 | $nhi="a5"; | ||
| 40 | $Zhi="t8"; | ||
| 41 | $Zlo="t9"; | ||
| 42 | $Xhi="t10"; # $24 | ||
| 43 | $Xlo="t11"; | ||
| 44 | $remp="t12"; | ||
| 45 | $rem_4bit="AT"; # $28 | ||
| 46 | |||
| 47 | { my $N; | ||
| 48 | sub loop() { | ||
| 49 | |||
| 50 | $N++; | ||
| 51 | $code.=<<___; | ||
| 52 | .align 4 | ||
| 53 | extbl $Xlo,7,$nlo | ||
| 54 | and $nlo,0xf0,$nhi | ||
| 55 | sll $nlo,4,$nlo | ||
| 56 | and $nlo,0xf0,$nlo | ||
| 57 | |||
| 58 | addq $nlo,$Htbl,$nlo | ||
| 59 | ldq $Zlo,8($nlo) | ||
| 60 | addq $nhi,$Htbl,$nhi | ||
| 61 | ldq $Zhi,0($nlo) | ||
| 62 | |||
| 63 | and $Zlo,0x0f,$remp | ||
| 64 | sll $Zhi,60,$t0 | ||
| 65 | lda $cnt,6(zero) | ||
| 66 | extbl $Xlo,6,$nlo | ||
| 67 | |||
| 68 | ldq $Tlo1,8($nhi) | ||
| 69 | s8addq $remp,$rem_4bit,$remp | ||
| 70 | ldq $Thi1,0($nhi) | ||
| 71 | srl $Zlo,4,$Zlo | ||
| 72 | |||
| 73 | ldq $rem,0($remp) | ||
| 74 | srl $Zhi,4,$Zhi | ||
| 75 | xor $t0,$Zlo,$Zlo | ||
| 76 | and $nlo,0xf0,$nhi | ||
| 77 | |||
| 78 | xor $Tlo1,$Zlo,$Zlo | ||
| 79 | sll $nlo,4,$nlo | ||
| 80 | xor $Thi1,$Zhi,$Zhi | ||
| 81 | and $nlo,0xf0,$nlo | ||
| 82 | |||
| 83 | addq $nlo,$Htbl,$nlo | ||
| 84 | ldq $Tlo0,8($nlo) | ||
| 85 | addq $nhi,$Htbl,$nhi | ||
| 86 | ldq $Thi0,0($nlo) | ||
| 87 | |||
| 88 | .Looplo$N: | ||
| 89 | and $Zlo,0x0f,$remp | ||
| 90 | sll $Zhi,60,$t0 | ||
| 91 | subq $cnt,1,$cnt | ||
| 92 | srl $Zlo,4,$Zlo | ||
| 93 | |||
| 94 | ldq $Tlo1,8($nhi) | ||
| 95 | xor $rem,$Zhi,$Zhi | ||
| 96 | ldq $Thi1,0($nhi) | ||
| 97 | s8addq $remp,$rem_4bit,$remp | ||
| 98 | |||
| 99 | ldq $rem,0($remp) | ||
| 100 | srl $Zhi,4,$Zhi | ||
| 101 | xor $t0,$Zlo,$Zlo | ||
| 102 | extbl $Xlo,$cnt,$nlo | ||
| 103 | |||
| 104 | and $nlo,0xf0,$nhi | ||
| 105 | xor $Thi0,$Zhi,$Zhi | ||
| 106 | xor $Tlo0,$Zlo,$Zlo | ||
| 107 | sll $nlo,4,$nlo | ||
| 108 | |||
| 109 | |||
| 110 | and $Zlo,0x0f,$remp | ||
| 111 | sll $Zhi,60,$t0 | ||
| 112 | and $nlo,0xf0,$nlo | ||
| 113 | srl $Zlo,4,$Zlo | ||
| 114 | |||
| 115 | s8addq $remp,$rem_4bit,$remp | ||
| 116 | xor $rem,$Zhi,$Zhi | ||
| 117 | addq $nlo,$Htbl,$nlo | ||
| 118 | addq $nhi,$Htbl,$nhi | ||
| 119 | |||
| 120 | ldq $rem,0($remp) | ||
| 121 | srl $Zhi,4,$Zhi | ||
| 122 | ldq $Tlo0,8($nlo) | ||
| 123 | xor $t0,$Zlo,$Zlo | ||
| 124 | |||
| 125 | xor $Tlo1,$Zlo,$Zlo | ||
| 126 | xor $Thi1,$Zhi,$Zhi | ||
| 127 | ldq $Thi0,0($nlo) | ||
| 128 | bne $cnt,.Looplo$N | ||
| 129 | |||
| 130 | |||
| 131 | and $Zlo,0x0f,$remp | ||
| 132 | sll $Zhi,60,$t0 | ||
| 133 | lda $cnt,7(zero) | ||
| 134 | srl $Zlo,4,$Zlo | ||
| 135 | |||
| 136 | ldq $Tlo1,8($nhi) | ||
| 137 | xor $rem,$Zhi,$Zhi | ||
| 138 | ldq $Thi1,0($nhi) | ||
| 139 | s8addq $remp,$rem_4bit,$remp | ||
| 140 | |||
| 141 | ldq $rem,0($remp) | ||
| 142 | srl $Zhi,4,$Zhi | ||
| 143 | xor $t0,$Zlo,$Zlo | ||
| 144 | extbl $Xhi,$cnt,$nlo | ||
| 145 | |||
| 146 | and $nlo,0xf0,$nhi | ||
| 147 | xor $Thi0,$Zhi,$Zhi | ||
| 148 | xor $Tlo0,$Zlo,$Zlo | ||
| 149 | sll $nlo,4,$nlo | ||
| 150 | |||
| 151 | and $Zlo,0x0f,$remp | ||
| 152 | sll $Zhi,60,$t0 | ||
| 153 | and $nlo,0xf0,$nlo | ||
| 154 | srl $Zlo,4,$Zlo | ||
| 155 | |||
| 156 | s8addq $remp,$rem_4bit,$remp | ||
| 157 | xor $rem,$Zhi,$Zhi | ||
| 158 | addq $nlo,$Htbl,$nlo | ||
| 159 | addq $nhi,$Htbl,$nhi | ||
| 160 | |||
| 161 | ldq $rem,0($remp) | ||
| 162 | srl $Zhi,4,$Zhi | ||
| 163 | ldq $Tlo0,8($nlo) | ||
| 164 | xor $t0,$Zlo,$Zlo | ||
| 165 | |||
| 166 | xor $Tlo1,$Zlo,$Zlo | ||
| 167 | xor $Thi1,$Zhi,$Zhi | ||
| 168 | ldq $Thi0,0($nlo) | ||
| 169 | unop | ||
| 170 | |||
| 171 | |||
| 172 | .Loophi$N: | ||
| 173 | and $Zlo,0x0f,$remp | ||
| 174 | sll $Zhi,60,$t0 | ||
| 175 | subq $cnt,1,$cnt | ||
| 176 | srl $Zlo,4,$Zlo | ||
| 177 | |||
| 178 | ldq $Tlo1,8($nhi) | ||
| 179 | xor $rem,$Zhi,$Zhi | ||
| 180 | ldq $Thi1,0($nhi) | ||
| 181 | s8addq $remp,$rem_4bit,$remp | ||
| 182 | |||
| 183 | ldq $rem,0($remp) | ||
| 184 | srl $Zhi,4,$Zhi | ||
| 185 | xor $t0,$Zlo,$Zlo | ||
| 186 | extbl $Xhi,$cnt,$nlo | ||
| 187 | |||
| 188 | and $nlo,0xf0,$nhi | ||
| 189 | xor $Thi0,$Zhi,$Zhi | ||
| 190 | xor $Tlo0,$Zlo,$Zlo | ||
| 191 | sll $nlo,4,$nlo | ||
| 192 | |||
| 193 | |||
| 194 | and $Zlo,0x0f,$remp | ||
| 195 | sll $Zhi,60,$t0 | ||
| 196 | and $nlo,0xf0,$nlo | ||
| 197 | srl $Zlo,4,$Zlo | ||
| 198 | |||
| 199 | s8addq $remp,$rem_4bit,$remp | ||
| 200 | xor $rem,$Zhi,$Zhi | ||
| 201 | addq $nlo,$Htbl,$nlo | ||
| 202 | addq $nhi,$Htbl,$nhi | ||
| 203 | |||
| 204 | ldq $rem,0($remp) | ||
| 205 | srl $Zhi,4,$Zhi | ||
| 206 | ldq $Tlo0,8($nlo) | ||
| 207 | xor $t0,$Zlo,$Zlo | ||
| 208 | |||
| 209 | xor $Tlo1,$Zlo,$Zlo | ||
| 210 | xor $Thi1,$Zhi,$Zhi | ||
| 211 | ldq $Thi0,0($nlo) | ||
| 212 | bne $cnt,.Loophi$N | ||
| 213 | |||
| 214 | |||
| 215 | and $Zlo,0x0f,$remp | ||
| 216 | sll $Zhi,60,$t0 | ||
| 217 | srl $Zlo,4,$Zlo | ||
| 218 | |||
| 219 | ldq $Tlo1,8($nhi) | ||
| 220 | xor $rem,$Zhi,$Zhi | ||
| 221 | ldq $Thi1,0($nhi) | ||
| 222 | s8addq $remp,$rem_4bit,$remp | ||
| 223 | |||
| 224 | ldq $rem,0($remp) | ||
| 225 | srl $Zhi,4,$Zhi | ||
| 226 | xor $t0,$Zlo,$Zlo | ||
| 227 | |||
| 228 | xor $Tlo0,$Zlo,$Zlo | ||
| 229 | xor $Thi0,$Zhi,$Zhi | ||
| 230 | |||
| 231 | and $Zlo,0x0f,$remp | ||
| 232 | sll $Zhi,60,$t0 | ||
| 233 | srl $Zlo,4,$Zlo | ||
| 234 | |||
| 235 | s8addq $remp,$rem_4bit,$remp | ||
| 236 | xor $rem,$Zhi,$Zhi | ||
| 237 | |||
| 238 | ldq $rem,0($remp) | ||
| 239 | srl $Zhi,4,$Zhi | ||
| 240 | xor $Tlo1,$Zlo,$Zlo | ||
| 241 | xor $Thi1,$Zhi,$Zhi | ||
| 242 | xor $t0,$Zlo,$Zlo | ||
| 243 | xor $rem,$Zhi,$Zhi | ||
| 244 | ___ | ||
| 245 | }} | ||
| 246 | |||
| 247 | $code=<<___; | ||
| 248 | #ifdef __linux__ | ||
| 249 | #include <asm/regdef.h> | ||
| 250 | #else | ||
| 251 | #include <asm.h> | ||
| 252 | #include <regdef.h> | ||
| 253 | #endif | ||
| 254 | |||
| 255 | .text | ||
| 256 | |||
| 257 | .set noat | ||
| 258 | .set noreorder | ||
| 259 | .globl gcm_gmult_4bit | ||
| 260 | .align 4 | ||
| 261 | .ent gcm_gmult_4bit | ||
| 262 | gcm_gmult_4bit: | ||
| 263 | .frame sp,0,ra | ||
| 264 | .prologue 0 | ||
| 265 | |||
| 266 | ldq $Xlo,8($Xi) | ||
| 267 | ldq $Xhi,0($Xi) | ||
| 268 | |||
| 269 | br $rem_4bit,.Lpic1 | ||
| 270 | .Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit) | ||
| 271 | ___ | ||
| 272 | |||
| 273 | &loop(); | ||
| 274 | |||
| 275 | $code.=<<___; | ||
| 276 | srl $Zlo,24,$t0 # byte swap | ||
| 277 | srl $Zlo,8,$t1 | ||
| 278 | |||
| 279 | sll $Zlo,8,$t2 | ||
| 280 | sll $Zlo,24,$Zlo | ||
| 281 | zapnot $t0,0x11,$t0 | ||
| 282 | zapnot $t1,0x22,$t1 | ||
| 283 | |||
| 284 | zapnot $Zlo,0x88,$Zlo | ||
| 285 | or $t0,$t1,$t0 | ||
| 286 | zapnot $t2,0x44,$t2 | ||
| 287 | |||
| 288 | or $Zlo,$t0,$Zlo | ||
| 289 | srl $Zhi,24,$t0 | ||
| 290 | srl $Zhi,8,$t1 | ||
| 291 | |||
| 292 | or $Zlo,$t2,$Zlo | ||
| 293 | sll $Zhi,8,$t2 | ||
| 294 | sll $Zhi,24,$Zhi | ||
| 295 | |||
| 296 | srl $Zlo,32,$Xlo | ||
| 297 | sll $Zlo,32,$Zlo | ||
| 298 | |||
| 299 | zapnot $t0,0x11,$t0 | ||
| 300 | zapnot $t1,0x22,$t1 | ||
| 301 | or $Zlo,$Xlo,$Xlo | ||
| 302 | |||
| 303 | zapnot $Zhi,0x88,$Zhi | ||
| 304 | or $t0,$t1,$t0 | ||
| 305 | zapnot $t2,0x44,$t2 | ||
| 306 | |||
| 307 | or $Zhi,$t0,$Zhi | ||
| 308 | or $Zhi,$t2,$Zhi | ||
| 309 | |||
| 310 | srl $Zhi,32,$Xhi | ||
| 311 | sll $Zhi,32,$Zhi | ||
| 312 | |||
| 313 | or $Zhi,$Xhi,$Xhi | ||
| 314 | stq $Xlo,8($Xi) | ||
| 315 | stq $Xhi,0($Xi) | ||
| 316 | |||
| 317 | ret (ra) | ||
| 318 | .end gcm_gmult_4bit | ||
| 319 | ___ | ||
| 320 | |||
| 321 | $inhi="s0"; | ||
| 322 | $inlo="s1"; | ||
| 323 | |||
| 324 | $code.=<<___; | ||
| 325 | .globl gcm_ghash_4bit | ||
| 326 | .align 4 | ||
| 327 | .ent gcm_ghash_4bit | ||
| 328 | gcm_ghash_4bit: | ||
| 329 | lda sp,-32(sp) | ||
| 330 | stq ra,0(sp) | ||
| 331 | stq s0,8(sp) | ||
| 332 | stq s1,16(sp) | ||
| 333 | .mask 0x04000600,-32 | ||
| 334 | .frame sp,32,ra | ||
| 335 | .prologue 0 | ||
| 336 | |||
| 337 | ldq_u $inhi,0($inp) | ||
| 338 | ldq_u $Thi0,7($inp) | ||
| 339 | ldq_u $inlo,8($inp) | ||
| 340 | ldq_u $Tlo0,15($inp) | ||
| 341 | ldq $Xhi,0($Xi) | ||
| 342 | ldq $Xlo,8($Xi) | ||
| 343 | |||
| 344 | br $rem_4bit,.Lpic2 | ||
| 345 | .Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit) | ||
| 346 | |||
| 347 | .Louter: | ||
| 348 | extql $inhi,$inp,$inhi | ||
| 349 | extqh $Thi0,$inp,$Thi0 | ||
| 350 | or $inhi,$Thi0,$inhi | ||
| 351 | lda $inp,16($inp) | ||
| 352 | |||
| 353 | extql $inlo,$inp,$inlo | ||
| 354 | extqh $Tlo0,$inp,$Tlo0 | ||
| 355 | or $inlo,$Tlo0,$inlo | ||
| 356 | subq $len,16,$len | ||
| 357 | |||
| 358 | xor $Xlo,$inlo,$Xlo | ||
| 359 | xor $Xhi,$inhi,$Xhi | ||
| 360 | ___ | ||
| 361 | |||
| 362 | &loop(); | ||
| 363 | |||
| 364 | $code.=<<___; | ||
| 365 | srl $Zlo,24,$t0 # byte swap | ||
| 366 | srl $Zlo,8,$t1 | ||
| 367 | |||
| 368 | sll $Zlo,8,$t2 | ||
| 369 | sll $Zlo,24,$Zlo | ||
| 370 | zapnot $t0,0x11,$t0 | ||
| 371 | zapnot $t1,0x22,$t1 | ||
| 372 | |||
| 373 | zapnot $Zlo,0x88,$Zlo | ||
| 374 | or $t0,$t1,$t0 | ||
| 375 | zapnot $t2,0x44,$t2 | ||
| 376 | |||
| 377 | or $Zlo,$t0,$Zlo | ||
| 378 | srl $Zhi,24,$t0 | ||
| 379 | srl $Zhi,8,$t1 | ||
| 380 | |||
| 381 | or $Zlo,$t2,$Zlo | ||
| 382 | sll $Zhi,8,$t2 | ||
| 383 | sll $Zhi,24,$Zhi | ||
| 384 | |||
| 385 | srl $Zlo,32,$Xlo | ||
| 386 | sll $Zlo,32,$Zlo | ||
| 387 | beq $len,.Ldone | ||
| 388 | |||
| 389 | zapnot $t0,0x11,$t0 | ||
| 390 | zapnot $t1,0x22,$t1 | ||
| 391 | or $Zlo,$Xlo,$Xlo | ||
| 392 | ldq_u $inhi,0($inp) | ||
| 393 | |||
| 394 | zapnot $Zhi,0x88,$Zhi | ||
| 395 | or $t0,$t1,$t0 | ||
| 396 | zapnot $t2,0x44,$t2 | ||
| 397 | ldq_u $Thi0,7($inp) | ||
| 398 | |||
| 399 | or $Zhi,$t0,$Zhi | ||
| 400 | or $Zhi,$t2,$Zhi | ||
| 401 | ldq_u $inlo,8($inp) | ||
| 402 | ldq_u $Tlo0,15($inp) | ||
| 403 | |||
| 404 | srl $Zhi,32,$Xhi | ||
| 405 | sll $Zhi,32,$Zhi | ||
| 406 | |||
| 407 | or $Zhi,$Xhi,$Xhi | ||
| 408 | br zero,.Louter | ||
| 409 | |||
| 410 | .Ldone: | ||
| 411 | zapnot $t0,0x11,$t0 | ||
| 412 | zapnot $t1,0x22,$t1 | ||
| 413 | or $Zlo,$Xlo,$Xlo | ||
| 414 | |||
| 415 | zapnot $Zhi,0x88,$Zhi | ||
| 416 | or $t0,$t1,$t0 | ||
| 417 | zapnot $t2,0x44,$t2 | ||
| 418 | |||
| 419 | or $Zhi,$t0,$Zhi | ||
| 420 | or $Zhi,$t2,$Zhi | ||
| 421 | |||
| 422 | srl $Zhi,32,$Xhi | ||
| 423 | sll $Zhi,32,$Zhi | ||
| 424 | |||
| 425 | or $Zhi,$Xhi,$Xhi | ||
| 426 | |||
| 427 | stq $Xlo,8($Xi) | ||
| 428 | stq $Xhi,0($Xi) | ||
| 429 | |||
| 430 | .set noreorder | ||
| 431 | /*ldq ra,0(sp)*/ | ||
| 432 | ldq s0,8(sp) | ||
| 433 | ldq s1,16(sp) | ||
| 434 | lda sp,32(sp) | ||
| 435 | ret (ra) | ||
| 436 | .end gcm_ghash_4bit | ||
| 437 | |||
| 438 | .align 4 | ||
| 439 | rem_4bit: | ||
| 440 | .quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 | ||
| 441 | .quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 | ||
| 442 | .quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 | ||
| 443 | .quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 | ||
| 444 | .ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 445 | .align 4 | ||
| 446 | |||
| 447 | ___ | ||
| 448 | $output=shift and open STDOUT,">$output"; | ||
| 449 | print $code; | ||
| 450 | close STDOUT; | ||
| 451 | |||
diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl new file mode 100644 index 0000000000..d91586ee29 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-armv4.pl | |||
| @@ -0,0 +1,429 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # April 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+32 bytes shared table]. There is no | ||
| 15 | # experimental performance data available yet. The only approximation | ||
| 16 | # that can be made at this point is based on code size. Inner loop is | ||
| 17 | # 32 instructions long and on single-issue core should execute in <40 | ||
| 18 | # cycles. Having verified that gcc 3.4 didn't unroll corresponding | ||
| 19 | # loop, this assembler loop body was found to be ~3x smaller than | ||
| 20 | # compiler-generated one... | ||
| 21 | # | ||
| 22 | # July 2010 | ||
| 23 | # | ||
| 24 | # Rescheduling for dual-issue pipeline resulted in 8.5% improvement on | ||
| 25 | # Cortex A8 core and ~25 cycles per processed byte (which was observed | ||
| 26 | # to be ~3 times faster than gcc-generated code:-) | ||
| 27 | # | ||
| 28 | # February 2011 | ||
| 29 | # | ||
| 30 | # Profiler-assisted and platform-specific optimization resulted in 7% | ||
| 31 | # improvement on Cortex A8 core and ~23.5 cycles per byte. | ||
| 32 | # | ||
| 33 | # March 2011 | ||
| 34 | # | ||
| 35 | # Add NEON implementation featuring polynomial multiplication, i.e. no | ||
| 36 | # lookup tables involved. On Cortex A8 it was measured to process one | ||
| 37 | # byte in 15 cycles or 55% faster than integer-only code. | ||
| 38 | |||
| 39 | # ==================================================================== | ||
| 40 | # Note about "528B" variant. In ARM case it makes lesser sense to | ||
| 41 | # implement it for following reasons: | ||
| 42 | # | ||
| 43 | # - performance improvement won't be anywhere near 50%, because 128- | ||
| 44 | # bit shift operation is neatly fused with 128-bit xor here, and | ||
| 45 | # "538B" variant would eliminate only 4-5 instructions out of 32 | ||
| 46 | # in the inner loop (meaning that estimated improvement is ~15%); | ||
| 47 | # - ARM-based systems are often embedded ones and extra memory | ||
| 48 | # consumption might be unappreciated (for so little improvement); | ||
| 49 | # | ||
| 50 | # Byte order [in]dependence. ========================================= | ||
| 51 | # | ||
| 52 | # Caller is expected to maintain specific *dword* order in Htable, | ||
| 53 | # namely with *least* significant dword of 128-bit value at *lower* | ||
| 54 | # address. This differs completely from C code and has everything to | ||
| 55 | # do with ldm instruction and order in which dwords are "consumed" by | ||
| 56 | # algorithm. *Byte* order within these dwords in turn is whatever | ||
| 57 | # *native* byte order on current platform. See gcm128.c for working | ||
| 58 | # example... | ||
| 59 | |||
| 60 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 61 | open STDOUT,">$output"; | ||
| 62 | |||
| 63 | $Xi="r0"; # argument block | ||
| 64 | $Htbl="r1"; | ||
| 65 | $inp="r2"; | ||
| 66 | $len="r3"; | ||
| 67 | |||
| 68 | $Zll="r4"; # variables | ||
| 69 | $Zlh="r5"; | ||
| 70 | $Zhl="r6"; | ||
| 71 | $Zhh="r7"; | ||
| 72 | $Tll="r8"; | ||
| 73 | $Tlh="r9"; | ||
| 74 | $Thl="r10"; | ||
| 75 | $Thh="r11"; | ||
| 76 | $nlo="r12"; | ||
| 77 | ################# r13 is stack pointer | ||
| 78 | $nhi="r14"; | ||
| 79 | ################# r15 is program counter | ||
| 80 | |||
| 81 | $rem_4bit=$inp; # used in gcm_gmult_4bit | ||
| 82 | $cnt=$len; | ||
| 83 | |||
| 84 | sub Zsmash() { | ||
| 85 | my $i=12; | ||
| 86 | my @args=@_; | ||
| 87 | for ($Zll,$Zlh,$Zhl,$Zhh) { | ||
| 88 | $code.=<<___; | ||
| 89 | #if __ARM_ARCH__>=7 && defined(__ARMEL__) | ||
| 90 | rev $_,$_ | ||
| 91 | str $_,[$Xi,#$i] | ||
| 92 | #elif defined(__ARMEB__) | ||
| 93 | str $_,[$Xi,#$i] | ||
| 94 | #else | ||
| 95 | mov $Tlh,$_,lsr#8 | ||
| 96 | strb $_,[$Xi,#$i+3] | ||
| 97 | mov $Thl,$_,lsr#16 | ||
| 98 | strb $Tlh,[$Xi,#$i+2] | ||
| 99 | mov $Thh,$_,lsr#24 | ||
| 100 | strb $Thl,[$Xi,#$i+1] | ||
| 101 | strb $Thh,[$Xi,#$i] | ||
| 102 | #endif | ||
| 103 | ___ | ||
| 104 | $code.="\t".shift(@args)."\n"; | ||
| 105 | $i-=4; | ||
| 106 | } | ||
| 107 | } | ||
| 108 | |||
| 109 | $code=<<___; | ||
| 110 | #include "arm_arch.h" | ||
| 111 | |||
| 112 | .text | ||
| 113 | .code 32 | ||
| 114 | |||
| 115 | .type rem_4bit,%object | ||
| 116 | .align 5 | ||
| 117 | rem_4bit: | ||
| 118 | .short 0x0000,0x1C20,0x3840,0x2460 | ||
| 119 | .short 0x7080,0x6CA0,0x48C0,0x54E0 | ||
| 120 | .short 0xE100,0xFD20,0xD940,0xC560 | ||
| 121 | .short 0x9180,0x8DA0,0xA9C0,0xB5E0 | ||
| 122 | .size rem_4bit,.-rem_4bit | ||
| 123 | |||
| 124 | .type rem_4bit_get,%function | ||
| 125 | rem_4bit_get: | ||
| 126 | sub $rem_4bit,pc,#8 | ||
| 127 | sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit | ||
| 128 | b .Lrem_4bit_got | ||
| 129 | nop | ||
| 130 | .size rem_4bit_get,.-rem_4bit_get | ||
| 131 | |||
| 132 | .global gcm_ghash_4bit | ||
| 133 | .type gcm_ghash_4bit,%function | ||
| 134 | gcm_ghash_4bit: | ||
| 135 | sub r12,pc,#8 | ||
| 136 | add $len,$inp,$len @ $len to point at the end | ||
| 137 | stmdb sp!,{r3-r11,lr} @ save $len/end too | ||
| 138 | sub r12,r12,#48 @ &rem_4bit | ||
| 139 | |||
| 140 | ldmia r12,{r4-r11} @ copy rem_4bit ... | ||
| 141 | stmdb sp!,{r4-r11} @ ... to stack | ||
| 142 | |||
| 143 | ldrb $nlo,[$inp,#15] | ||
| 144 | ldrb $nhi,[$Xi,#15] | ||
| 145 | .Louter: | ||
| 146 | eor $nlo,$nlo,$nhi | ||
| 147 | and $nhi,$nlo,#0xf0 | ||
| 148 | and $nlo,$nlo,#0x0f | ||
| 149 | mov $cnt,#14 | ||
| 150 | |||
| 151 | add $Zhh,$Htbl,$nlo,lsl#4 | ||
| 152 | ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] | ||
| 153 | add $Thh,$Htbl,$nhi | ||
| 154 | ldrb $nlo,[$inp,#14] | ||
| 155 | |||
| 156 | and $nhi,$Zll,#0xf @ rem | ||
| 157 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
| 158 | add $nhi,$nhi,$nhi | ||
| 159 | eor $Zll,$Tll,$Zll,lsr#4 | ||
| 160 | ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] | ||
| 161 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
| 162 | ldrb $nhi,[$Xi,#14] | ||
| 163 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
| 164 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
| 165 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
| 166 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
| 167 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
| 168 | eor $nlo,$nlo,$nhi | ||
| 169 | and $nhi,$nlo,#0xf0 | ||
| 170 | and $nlo,$nlo,#0x0f | ||
| 171 | eor $Zhh,$Zhh,$Tll,lsl#16 | ||
| 172 | |||
| 173 | .Linner: | ||
| 174 | add $Thh,$Htbl,$nlo,lsl#4 | ||
| 175 | and $nlo,$Zll,#0xf @ rem | ||
| 176 | subs $cnt,$cnt,#1 | ||
| 177 | add $nlo,$nlo,$nlo | ||
| 178 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] | ||
| 179 | eor $Zll,$Tll,$Zll,lsr#4 | ||
| 180 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
| 181 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
| 182 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
| 183 | ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] | ||
| 184 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
| 185 | ldrplb $nlo,[$inp,$cnt] | ||
| 186 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
| 187 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
| 188 | |||
| 189 | add $Thh,$Htbl,$nhi | ||
| 190 | and $nhi,$Zll,#0xf @ rem | ||
| 191 | eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | ||
| 192 | add $nhi,$nhi,$nhi | ||
| 193 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
| 194 | eor $Zll,$Tll,$Zll,lsr#4 | ||
| 195 | ldrplb $Tll,[$Xi,$cnt] | ||
| 196 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
| 197 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
| 198 | ldrh $Tlh,[sp,$nhi] | ||
| 199 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
| 200 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
| 201 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
| 202 | eorpl $nlo,$nlo,$Tll | ||
| 203 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
| 204 | andpl $nhi,$nlo,#0xf0 | ||
| 205 | andpl $nlo,$nlo,#0x0f | ||
| 206 | eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] | ||
| 207 | bpl .Linner | ||
| 208 | |||
| 209 | ldr $len,[sp,#32] @ re-load $len/end | ||
| 210 | add $inp,$inp,#16 | ||
| 211 | mov $nhi,$Zll | ||
| 212 | ___ | ||
| 213 | &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]"); | ||
| 214 | $code.=<<___; | ||
| 215 | bne .Louter | ||
| 216 | |||
| 217 | add sp,sp,#36 | ||
| 218 | #if __ARM_ARCH__>=5 | ||
| 219 | ldmia sp!,{r4-r11,pc} | ||
| 220 | #else | ||
| 221 | ldmia sp!,{r4-r11,lr} | ||
| 222 | tst lr,#1 | ||
| 223 | moveq pc,lr @ be binary compatible with V4, yet | ||
| 224 | bx lr @ interoperable with Thumb ISA:-) | ||
| 225 | #endif | ||
| 226 | .size gcm_ghash_4bit,.-gcm_ghash_4bit | ||
| 227 | |||
| 228 | .global gcm_gmult_4bit | ||
| 229 | .type gcm_gmult_4bit,%function | ||
| 230 | gcm_gmult_4bit: | ||
| 231 | stmdb sp!,{r4-r11,lr} | ||
| 232 | ldrb $nlo,[$Xi,#15] | ||
| 233 | b rem_4bit_get | ||
| 234 | .Lrem_4bit_got: | ||
| 235 | and $nhi,$nlo,#0xf0 | ||
| 236 | and $nlo,$nlo,#0x0f | ||
| 237 | mov $cnt,#14 | ||
| 238 | |||
| 239 | add $Zhh,$Htbl,$nlo,lsl#4 | ||
| 240 | ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] | ||
| 241 | ldrb $nlo,[$Xi,#14] | ||
| 242 | |||
| 243 | add $Thh,$Htbl,$nhi | ||
| 244 | and $nhi,$Zll,#0xf @ rem | ||
| 245 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
| 246 | add $nhi,$nhi,$nhi | ||
| 247 | eor $Zll,$Tll,$Zll,lsr#4 | ||
| 248 | ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] | ||
| 249 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
| 250 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
| 251 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
| 252 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
| 253 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
| 254 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
| 255 | and $nhi,$nlo,#0xf0 | ||
| 256 | eor $Zhh,$Zhh,$Tll,lsl#16 | ||
| 257 | and $nlo,$nlo,#0x0f | ||
| 258 | |||
| 259 | .Loop: | ||
| 260 | add $Thh,$Htbl,$nlo,lsl#4 | ||
| 261 | and $nlo,$Zll,#0xf @ rem | ||
| 262 | subs $cnt,$cnt,#1 | ||
| 263 | add $nlo,$nlo,$nlo | ||
| 264 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] | ||
| 265 | eor $Zll,$Tll,$Zll,lsr#4 | ||
| 266 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
| 267 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
| 268 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
| 269 | ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] | ||
| 270 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
| 271 | ldrplb $nlo,[$Xi,$cnt] | ||
| 272 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
| 273 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
| 274 | |||
| 275 | add $Thh,$Htbl,$nhi | ||
| 276 | and $nhi,$Zll,#0xf @ rem | ||
| 277 | eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | ||
| 278 | add $nhi,$nhi,$nhi | ||
| 279 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
| 280 | eor $Zll,$Tll,$Zll,lsr#4 | ||
| 281 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
| 282 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
| 283 | ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] | ||
| 284 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
| 285 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
| 286 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
| 287 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
| 288 | andpl $nhi,$nlo,#0xf0 | ||
| 289 | andpl $nlo,$nlo,#0x0f | ||
| 290 | eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | ||
| 291 | bpl .Loop | ||
| 292 | ___ | ||
| 293 | &Zsmash(); | ||
| 294 | $code.=<<___; | ||
| 295 | #if __ARM_ARCH__>=5 | ||
| 296 | ldmia sp!,{r4-r11,pc} | ||
| 297 | #else | ||
| 298 | ldmia sp!,{r4-r11,lr} | ||
| 299 | tst lr,#1 | ||
| 300 | moveq pc,lr @ be binary compatible with V4, yet | ||
| 301 | bx lr @ interoperable with Thumb ISA:-) | ||
| 302 | #endif | ||
| 303 | .size gcm_gmult_4bit,.-gcm_gmult_4bit | ||
| 304 | ___ | ||
| 305 | { | ||
| 306 | my $cnt=$Htbl; # $Htbl is used once in the very beginning | ||
| 307 | |||
| 308 | my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7)); | ||
| 309 | my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15)); | ||
| 310 | |||
| 311 | # Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit | ||
| 312 | # in Zo. Or should I say "top bit", because GHASH is specified in | ||
| 313 | # reverse bit order? Otherwise straightforward 128-bt H by one input | ||
| 314 | # byte multiplication and modulo-reduction, times 16. | ||
| 315 | |||
| 316 | sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } | ||
| 317 | sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } | ||
| 318 | sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } | ||
| 319 | |||
| 320 | $code.=<<___; | ||
| 321 | #if __ARM_ARCH__>=7 | ||
| 322 | .fpu neon | ||
| 323 | |||
| 324 | .global gcm_gmult_neon | ||
| 325 | .type gcm_gmult_neon,%function | ||
| 326 | .align 4 | ||
| 327 | gcm_gmult_neon: | ||
| 328 | sub $Htbl,#16 @ point at H in GCM128_CTX | ||
| 329 | vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi | ||
| 330 | vmov.i32 $mod,#0xe1 @ our irreducible polynomial | ||
| 331 | vld1.64 `&Dlo("$IN")`,[$Xi,:64]! | ||
| 332 | vshr.u64 $mod,#32 | ||
| 333 | vldmia $Htbl,{$Hhi-$Hlo} @ load H | ||
| 334 | veor $zero,$zero | ||
| 335 | #ifdef __ARMEL__ | ||
| 336 | vrev64.8 $IN,$IN | ||
| 337 | #endif | ||
| 338 | veor $Qpost,$Qpost | ||
| 339 | veor $R,$R | ||
| 340 | mov $cnt,#16 | ||
| 341 | veor $Z,$Z | ||
| 342 | mov $len,#16 | ||
| 343 | veor $Zo,$Zo | ||
| 344 | vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte | ||
| 345 | b .Linner_neon | ||
| 346 | .size gcm_gmult_neon,.-gcm_gmult_neon | ||
| 347 | |||
| 348 | .global gcm_ghash_neon | ||
| 349 | .type gcm_ghash_neon,%function | ||
| 350 | .align 4 | ||
| 351 | gcm_ghash_neon: | ||
| 352 | vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi | ||
| 353 | vmov.i32 $mod,#0xe1 @ our irreducible polynomial | ||
| 354 | vld1.64 `&Dlo("$Z")`,[$Xi,:64]! | ||
| 355 | vshr.u64 $mod,#32 | ||
| 356 | vldmia $Xi,{$Hhi-$Hlo} @ load H | ||
| 357 | veor $zero,$zero | ||
| 358 | nop | ||
| 359 | #ifdef __ARMEL__ | ||
| 360 | vrev64.8 $Z,$Z | ||
| 361 | #endif | ||
| 362 | .Louter_neon: | ||
| 363 | vld1.64 `&Dhi($IN)`,[$inp]! @ load inp | ||
| 364 | veor $Qpost,$Qpost | ||
| 365 | vld1.64 `&Dlo($IN)`,[$inp]! | ||
| 366 | veor $R,$R | ||
| 367 | mov $cnt,#16 | ||
| 368 | #ifdef __ARMEL__ | ||
| 369 | vrev64.8 $IN,$IN | ||
| 370 | #endif | ||
| 371 | veor $Zo,$Zo | ||
| 372 | veor $IN,$Z @ inp^=Xi | ||
| 373 | veor $Z,$Z | ||
| 374 | vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte | ||
| 375 | .Linner_neon: | ||
| 376 | subs $cnt,$cnt,#1 | ||
| 377 | vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i] | ||
| 378 | vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i] | ||
| 379 | vext.8 $IN,$zero,#1 @ IN>>=8 | ||
| 380 | |||
| 381 | veor $Z,$Qpost @ modulo-scheduled part | ||
| 382 | vshl.i64 `&Dlo("$R")`,#48 | ||
| 383 | vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte | ||
| 384 | veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")` | ||
| 385 | |||
| 386 | veor `&Dhi("$Z")`,`&Dlo("$R")` | ||
| 387 | vuzp.8 $Qlo,$Qhi | ||
| 388 | vsli.8 $Zo,$T,#1 @ compose the "carry" byte | ||
| 389 | vext.8 $Z,$zero,#1 @ Z>>=8 | ||
| 390 | |||
| 391 | vmull.p8 $R,$Zo,$mod @ "carry"·0xe1 | ||
| 392 | vshr.u8 $Zo,$T,#7 @ save Z's bottom bit | ||
| 393 | vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8 | ||
| 394 | veor $Z,$Qhi | ||
| 395 | bne .Linner_neon | ||
| 396 | |||
| 397 | veor $Z,$Qpost @ modulo-scheduled artefact | ||
| 398 | vshl.i64 `&Dlo("$R")`,#48 | ||
| 399 | veor `&Dhi("$Z")`,`&Dlo("$R")` | ||
| 400 | |||
| 401 | @ finalization, normalize Z:Zo | ||
| 402 | vand $Zo,$mod @ suffices to mask the bit | ||
| 403 | vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63 | ||
| 404 | vshl.i64 $Z,#1 | ||
| 405 | subs $len,#16 | ||
| 406 | vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1 | ||
| 407 | bne .Louter_neon | ||
| 408 | |||
| 409 | #ifdef __ARMEL__ | ||
| 410 | vrev64.8 $Z,$Z | ||
| 411 | #endif | ||
| 412 | sub $Xi,#16 | ||
| 413 | vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi | ||
| 414 | vst1.64 `&Dlo("$Z")`,[$Xi,:64] | ||
| 415 | |||
| 416 | bx lr | ||
| 417 | .size gcm_ghash_neon,.-gcm_ghash_neon | ||
| 418 | #endif | ||
| 419 | ___ | ||
| 420 | } | ||
| 421 | $code.=<<___; | ||
| 422 | .asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 423 | .align 2 | ||
| 424 | ___ | ||
| 425 | |||
| 426 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 427 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | ||
| 428 | print $code; | ||
| 429 | close STDOUT; # enforce flush | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-ia64.pl b/src/lib/libcrypto/modes/asm/ghash-ia64.pl new file mode 100755 index 0000000000..0354c95444 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-ia64.pl | |||
| @@ -0,0 +1,463 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # March 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+128 bytes shared table]. Streamed | ||
| 15 | # GHASH performance was measured to be 6.67 cycles per processed byte | ||
| 16 | # on Itanium 2, which is >90% better than Microsoft compiler generated | ||
| 17 | # code. To anchor to something else sha1-ia64.pl module processes one | ||
| 18 | # byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per | ||
| 19 | # byte. | ||
| 20 | |||
| 21 | # September 2010 | ||
| 22 | # | ||
| 23 | # It was originally thought that it makes lesser sense to implement | ||
| 24 | # "528B" variant on Itanium 2 for following reason. Because number of | ||
| 25 | # functional units is naturally limited, it appeared impossible to | ||
| 26 | # implement "528B" loop in 4 cycles, only in 5. This would mean that | ||
| 27 | # theoretically performance improvement couldn't be more than 20%. | ||
| 28 | # But occasionally you prove yourself wrong:-) I figured out a way to | ||
| 29 | # fold couple of instructions and having freed yet another instruction | ||
| 30 | # slot by unrolling the loop... Resulting performance is 4.45 cycles | ||
| 31 | # per processed byte and 50% better than "256B" version. On original | ||
| 32 | # Itanium performance should remain the same as the "256B" version, | ||
| 33 | # i.e. ~8.5 cycles. | ||
| 34 | |||
| 35 | $output=shift and (open STDOUT,">$output" or die "can't open $output: $!"); | ||
| 36 | |||
| 37 | if ($^O eq "hpux") { | ||
| 38 | $ADDP="addp4"; | ||
| 39 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } | ||
| 40 | } else { $ADDP="add"; } | ||
| 41 | for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); | ||
| 42 | $big_endian=0 if (/\-DL_ENDIAN/); } | ||
| 43 | if (!defined($big_endian)) | ||
| 44 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
| 45 | |||
| 46 | sub loop() { | ||
| 47 | my $label=shift; | ||
| 48 | my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp | ||
| 49 | |||
| 50 | # Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e. | ||
| 51 | # in scalable manner;-) Naturally assuming data in L1 cache... | ||
| 52 | # Special note about 'dep' instruction, which is used to construct | ||
| 53 | # &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128 | ||
| 54 | # bytes boundary and lower 7 bits of its address are guaranteed to | ||
| 55 | # be zero. | ||
| 56 | $code.=<<___; | ||
| 57 | $label: | ||
| 58 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | ||
| 59 | (p19) dep rem=Zlo,rem_4bitp,3,4 } | ||
| 60 | { .mfi; (p19) xor Zhi=Zhi,Hhi | ||
| 61 | ($p17) xor xi[1]=xi[1],in[1] };; | ||
| 62 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | ||
| 63 | (p19) shrp Zlo=Zhi,Zlo,4 } | ||
| 64 | { .mfi; (p19) ld8 rem=[rem] | ||
| 65 | (p18) and Hi[1]=mask0xf0,xi[2] };; | ||
| 66 | { .mmi; ($p16) ld1 in[0]=[inp],-1 | ||
| 67 | (p18) xor Zlo=Zlo,Hlo | ||
| 68 | (p19) shr.u Zhi=Zhi,4 } | ||
| 69 | { .mib; (p19) xor Hhi=Hhi,rem | ||
| 70 | (p18) add Hi[1]=Htbl,Hi[1] };; | ||
| 71 | |||
| 72 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | ||
| 73 | (p18) dep rem=Zlo,rem_4bitp,3,4 } | ||
| 74 | { .mfi; (p17) shladd Hi[0]=xi[1],4,r0 | ||
| 75 | (p18) xor Zhi=Zhi,Hhi };; | ||
| 76 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | ||
| 77 | (p18) shrp Zlo=Zhi,Zlo,4 } | ||
| 78 | { .mfi; (p18) ld8 rem=[rem] | ||
| 79 | (p17) and Hi[0]=mask0xf0,Hi[0] };; | ||
| 80 | { .mmi; (p16) ld1 xi[0]=[Xi],-1 | ||
| 81 | (p18) xor Zlo=Zlo,Hlo | ||
| 82 | (p18) shr.u Zhi=Zhi,4 } | ||
| 83 | { .mib; (p18) xor Hhi=Hhi,rem | ||
| 84 | (p17) add Hi[0]=Htbl,Hi[0] | ||
| 85 | br.ctop.sptk $label };; | ||
| 86 | ___ | ||
| 87 | } | ||
| 88 | |||
| 89 | $code=<<___; | ||
| 90 | .explicit | ||
| 91 | .text | ||
| 92 | |||
| 93 | prevfs=r2; prevlc=r3; prevpr=r8; | ||
| 94 | mask0xf0=r21; | ||
| 95 | rem=r22; rem_4bitp=r23; | ||
| 96 | Xi=r24; Htbl=r25; | ||
| 97 | inp=r26; end=r27; | ||
| 98 | Hhi=r28; Hlo=r29; | ||
| 99 | Zhi=r30; Zlo=r31; | ||
| 100 | |||
| 101 | .align 128 | ||
| 102 | .skip 16 // aligns loop body | ||
| 103 | .global gcm_gmult_4bit# | ||
| 104 | .proc gcm_gmult_4bit# | ||
| 105 | gcm_gmult_4bit: | ||
| 106 | .prologue | ||
| 107 | { .mmi; .save ar.pfs,prevfs | ||
| 108 | alloc prevfs=ar.pfs,2,6,0,8 | ||
| 109 | $ADDP Xi=15,in0 // &Xi[15] | ||
| 110 | mov rem_4bitp=ip } | ||
| 111 | { .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo | ||
| 112 | .save ar.lc,prevlc | ||
| 113 | mov prevlc=ar.lc | ||
| 114 | .save pr,prevpr | ||
| 115 | mov prevpr=pr };; | ||
| 116 | |||
| 117 | .body | ||
| 118 | .rotr in[3],xi[3],Hi[2] | ||
| 119 | |||
| 120 | { .mib; ld1 xi[2]=[Xi],-1 // Xi[15] | ||
| 121 | mov mask0xf0=0xf0 | ||
| 122 | brp.loop.imp .Loop1,.Lend1-16};; | ||
| 123 | { .mmi; ld1 xi[1]=[Xi],-1 // Xi[14] | ||
| 124 | };; | ||
| 125 | { .mii; shladd Hi[1]=xi[2],4,r0 | ||
| 126 | mov pr.rot=0x7<<16 | ||
| 127 | mov ar.lc=13 };; | ||
| 128 | { .mii; and Hi[1]=mask0xf0,Hi[1] | ||
| 129 | mov ar.ec=3 | ||
| 130 | xor Zlo=Zlo,Zlo };; | ||
| 131 | { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo | ||
| 132 | add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp | ||
| 133 | xor Zhi=Zhi,Zhi };; | ||
| 134 | ___ | ||
| 135 | &loop (".Loop1",1); | ||
| 136 | $code.=<<___; | ||
| 137 | .Lend1: | ||
| 138 | { .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact | ||
| 139 | { .mib; mux1 Zlo=Zlo,\@rev };; | ||
| 140 | { .mib; mux1 Zhi=Zhi,\@rev };; | ||
| 141 | { .mmi; add Hlo=9,Xi;; // ;; is here to prevent | ||
| 142 | add Hhi=1,Xi };; // pipeline flush on Itanium | ||
| 143 | { .mib; st8 [Hlo]=Zlo | ||
| 144 | mov pr=prevpr,0x1ffff };; | ||
| 145 | { .mib; st8 [Hhi]=Zhi | ||
| 146 | mov ar.lc=prevlc | ||
| 147 | br.ret.sptk.many b0 };; | ||
| 148 | .endp gcm_gmult_4bit# | ||
| 149 | ___ | ||
| 150 | |||
| 151 | ###################################################################### | ||
| 152 | # "528B" (well, "512B" actualy) streamed GHASH | ||
| 153 | # | ||
| 154 | $Xip="in0"; | ||
| 155 | $Htbl="in1"; | ||
| 156 | $inp="in2"; | ||
| 157 | $len="in3"; | ||
| 158 | $rem_8bit="loc0"; | ||
| 159 | $mask0xff="loc1"; | ||
| 160 | ($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum"); | ||
| 161 | |||
| 162 | sub load_htable() { | ||
| 163 | for (my $i=0;$i<8;$i++) { | ||
| 164 | $code.=<<___; | ||
| 165 | { .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi | ||
| 166 | ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo | ||
| 167 | { .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi | ||
| 168 | ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo | ||
| 169 | ___ | ||
| 170 | $code.=shift if (($i+$#_)==7); | ||
| 171 | $code.="\t};;\n" | ||
| 172 | } | ||
| 173 | } | ||
| 174 | |||
| 175 | $code.=<<___; | ||
| 176 | prevsp=r3; | ||
| 177 | |||
| 178 | .align 32 | ||
| 179 | .skip 16 // aligns loop body | ||
| 180 | .global gcm_ghash_4bit# | ||
| 181 | .proc gcm_ghash_4bit# | ||
| 182 | gcm_ghash_4bit: | ||
| 183 | .prologue | ||
| 184 | { .mmi; .save ar.pfs,prevfs | ||
| 185 | alloc prevfs=ar.pfs,4,2,0,0 | ||
| 186 | .vframe prevsp | ||
| 187 | mov prevsp=sp | ||
| 188 | mov $rem_8bit=ip };; | ||
| 189 | .body | ||
| 190 | { .mfi; $ADDP r8=0+0,$Htbl | ||
| 191 | $ADDP r9=0+8,$Htbl } | ||
| 192 | { .mfi; $ADDP r10=128+0,$Htbl | ||
| 193 | $ADDP r11=128+8,$Htbl };; | ||
| 194 | ___ | ||
| 195 | &load_htable( | ||
| 196 | " $ADDP $Xip=15,$Xip", # &Xi[15] | ||
| 197 | " $ADDP $len=$len,$inp", # &inp[len] | ||
| 198 | " $ADDP $inp=15,$inp", # &inp[15] | ||
| 199 | " mov $mask0xff=0xff", | ||
| 200 | " add sp=-512,sp", | ||
| 201 | " andcm sp=sp,$mask0xff", # align stack frame | ||
| 202 | " add r14=0,sp", | ||
| 203 | " add r15=8,sp"); | ||
| 204 | $code.=<<___; | ||
| 205 | { .mmi; $sum 1<<1 // go big-endian | ||
| 206 | add r8=256+0,sp | ||
| 207 | add r9=256+8,sp } | ||
| 208 | { .mmi; add r10=256+128+0,sp | ||
| 209 | add r11=256+128+8,sp | ||
| 210 | add $len=-17,$len };; | ||
| 211 | ___ | ||
| 212 | for($i=0;$i<8;$i++) { # generate first half of Hshr4[] | ||
| 213 | my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1)); | ||
| 214 | $code.=<<___; | ||
| 215 | { .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo | ||
| 216 | st8 [r9]=$rhi,16 // Htable[$i].hi | ||
| 217 | shrp $rlo=$rhi,$rlo,4 }//;; | ||
| 218 | { .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo | ||
| 219 | stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi | ||
| 220 | shr.u $rhi=$rhi,4 };; | ||
| 221 | { .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4 | ||
| 222 | st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4 | ||
| 223 | ___ | ||
| 224 | } | ||
| 225 | $code.=<<___; | ||
| 226 | { .mmi; ld8 r16=[r8],16 // Htable[8].lo | ||
| 227 | ld8 r17=[r9],16 };; // Htable[8].hi | ||
| 228 | { .mmi; ld8 r18=[r8],16 // Htable[9].lo | ||
| 229 | ld8 r19=[r9],16 } // Htable[9].hi | ||
| 230 | { .mmi; rum 1<<5 // clear um.mfh | ||
| 231 | shrp r16=r17,r16,4 };; | ||
| 232 | ___ | ||
| 233 | for($i=0;$i<6;$i++) { # generate second half of Hshr4[] | ||
| 234 | $code.=<<___; | ||
| 235 | { .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo | ||
| 236 | ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi | ||
| 237 | shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; | ||
| 238 | { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 | ||
| 239 | st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 | ||
| 240 | shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } | ||
| 241 | ___ | ||
| 242 | } | ||
| 243 | $code.=<<___; | ||
| 244 | { .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; | ||
| 245 | { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 | ||
| 246 | st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 | ||
| 247 | shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } | ||
| 248 | { .mmi; add $Htbl=256,sp // &Htable[0] | ||
| 249 | add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit | ||
| 250 | shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };; | ||
| 251 | { .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4 | ||
| 252 | st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4 | ||
| 253 | ___ | ||
| 254 | |||
| 255 | $in="r15"; | ||
| 256 | @xi=("r16","r17"); | ||
| 257 | @rem=("r18","r19"); | ||
| 258 | ($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25"); | ||
| 259 | ($Atbl,$Btbl)=("r26","r27"); | ||
| 260 | |||
| 261 | $code.=<<___; # (p16) | ||
| 262 | { .mmi; ld1 $in=[$inp],-1 //(p16) *inp-- | ||
| 263 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
| 264 | cmp.eq p0,p6=r0,r0 };; // clear p6 | ||
| 265 | ___ | ||
| 266 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 267 | |||
| 268 | $code.=<<___; # (p16),(p17) | ||
| 269 | { .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
| 270 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
| 271 | { .mii; ld1 $in=[$inp],-1 //(p16) *inp-- | ||
| 272 | dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo | ||
| 273 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
| 274 | .align 32 | ||
| 275 | .LOOP: | ||
| 276 | { .mmi; | ||
| 277 | (p6) st8 [$Xip]=$Zhi,13 | ||
| 278 | xor $Zlo=$Zlo,$Zlo | ||
| 279 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo | ||
| 280 | ___ | ||
| 281 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 282 | |||
| 283 | $code.=<<___; # (p16),(p17),(p18) | ||
| 284 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
| 285 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
| 286 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
| 287 | { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
| 288 | dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo | ||
| 289 | { .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
| 290 | xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo | ||
| 291 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
| 292 | ld1 $in=[$inp],-1 } //(p16) *inp-- | ||
| 293 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
| 294 | mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
| 295 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
| 296 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
| 297 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
| 298 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
| 299 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
| 300 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
| 301 | ___ | ||
| 302 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 303 | |||
| 304 | for ($i=1;$i<14;$i++) { | ||
| 305 | # Above and below fragments are derived from this one by removing | ||
| 306 | # unsuitable (p??) instructions. | ||
| 307 | $code.=<<___; # (p16),(p17),(p18),(p19) | ||
| 308 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
| 309 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
| 310 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
| 311 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
| 312 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
| 313 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
| 314 | { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
| 315 | ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
| 316 | dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo | ||
| 317 | { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
| 318 | xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo | ||
| 319 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
| 320 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
| 321 | ld1 $in=[$inp],-1 //(p16) *inp-- | ||
| 322 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
| 323 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
| 324 | xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
| 325 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
| 326 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
| 327 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
| 328 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
| 329 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
| 330 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
| 331 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
| 332 | ___ | ||
| 333 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 334 | } | ||
| 335 | |||
| 336 | $code.=<<___; # (p17),(p18),(p19) | ||
| 337 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
| 338 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
| 339 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
| 340 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
| 341 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
| 342 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
| 343 | { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
| 344 | ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
| 345 | dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo | ||
| 346 | { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
| 347 | xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo | ||
| 348 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
| 349 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
| 350 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
| 351 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
| 352 | xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
| 353 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
| 354 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
| 355 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
| 356 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
| 357 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
| 358 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
| 359 | ___ | ||
| 360 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 361 | |||
| 362 | $code.=<<___; # (p18),(p19) | ||
| 363 | { .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
| 364 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
| 365 | { .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
| 366 | xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo | ||
| 367 | { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
| 368 | xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo | ||
| 369 | { .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
| 370 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
| 371 | { .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi | ||
| 372 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
| 373 | { .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4 | ||
| 374 | xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi | ||
| 375 | { .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi | ||
| 376 | shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4) | ||
| 377 | { .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
| 378 | xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
| 379 | ___ | ||
| 380 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 381 | |||
| 382 | $code.=<<___; # (p19) | ||
| 383 | { .mmi; cmp.ltu p6,p0=$inp,$len | ||
| 384 | add $inp=32,$inp | ||
| 385 | shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4 | ||
| 386 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
| 387 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
| 388 | add $Xip=9,$Xip };; // &Xi.lo | ||
| 389 | { .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
| 390 | (p6) ld1 $in=[$inp],-1 //[p16] *inp-- | ||
| 391 | (p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14] | ||
| 392 | { .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi | ||
| 393 | (p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15] | ||
| 394 | { .mmi; st8 [$Xip]=$Zlo,-8 | ||
| 395 | (p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i] | ||
| 396 | shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48 | ||
| 397 | { .mmi; | ||
| 398 | (p6) ld1 $in=[$inp],-1 //[p16] *inp-- | ||
| 399 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
| 400 | (p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo | ||
| 401 | { .mib; | ||
| 402 | (p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0 | ||
| 403 | (p6) br.cond.dptk.many .LOOP };; | ||
| 404 | |||
| 405 | { .mib; st8 [$Xip]=$Zhi };; | ||
| 406 | { .mib; $rum 1<<1 // return to little-endian | ||
| 407 | .restore sp | ||
| 408 | mov sp=prevsp | ||
| 409 | br.ret.sptk.many b0 };; | ||
| 410 | .endp gcm_ghash_4bit# | ||
| 411 | ___ | ||
| 412 | $code.=<<___; | ||
| 413 | .align 128 | ||
| 414 | .type rem_4bit#,\@object | ||
| 415 | rem_4bit: | ||
| 416 | data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 | ||
| 417 | data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 | ||
| 418 | data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 | ||
| 419 | data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 | ||
| 420 | .size rem_4bit#,128 | ||
| 421 | .type rem_8bit#,\@object | ||
| 422 | rem_8bit: | ||
| 423 | data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E | ||
| 424 | data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E | ||
| 425 | data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E | ||
| 426 | data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E | ||
| 427 | data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E | ||
| 428 | data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E | ||
| 429 | data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E | ||
| 430 | data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E | ||
| 431 | data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE | ||
| 432 | data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE | ||
| 433 | data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE | ||
| 434 | data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE | ||
| 435 | data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E | ||
| 436 | data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E | ||
| 437 | data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE | ||
| 438 | data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE | ||
| 439 | data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E | ||
| 440 | data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E | ||
| 441 | data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E | ||
| 442 | data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E | ||
| 443 | data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E | ||
| 444 | data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E | ||
| 445 | data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E | ||
| 446 | data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E | ||
| 447 | data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE | ||
| 448 | data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE | ||
| 449 | data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE | ||
| 450 | data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE | ||
| 451 | data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E | ||
| 452 | data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E | ||
| 453 | data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE | ||
| 454 | data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE | ||
| 455 | .size rem_8bit#,512 | ||
| 456 | stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 457 | ___ | ||
| 458 | |||
| 459 | $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian); | ||
| 460 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 461 | |||
| 462 | print $code; | ||
| 463 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-parisc.pl b/src/lib/libcrypto/modes/asm/ghash-parisc.pl new file mode 100644 index 0000000000..8c7454ee93 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-parisc.pl | |||
| @@ -0,0 +1,730 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # April 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC | ||
| 15 | # it processes one byte in 19.6 cycles, which is more than twice as | ||
| 16 | # fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for | ||
| 17 | # 8 cycles, but measured performance on PA-8600 system is ~9 cycles per | ||
| 18 | # processed byte. This is ~2.2x faster than 64-bit code generated by | ||
| 19 | # vendor compiler (which used to be very hard to beat:-). | ||
| 20 | # | ||
| 21 | # Special thanks to polarhome.com for providing HP-UX account. | ||
| 22 | |||
| 23 | $flavour = shift; | ||
| 24 | $output = shift; | ||
| 25 | open STDOUT,">$output"; | ||
| 26 | |||
| 27 | if ($flavour =~ /64/) { | ||
| 28 | $LEVEL ="2.0W"; | ||
| 29 | $SIZE_T =8; | ||
| 30 | $FRAME_MARKER =80; | ||
| 31 | $SAVED_RP =16; | ||
| 32 | $PUSH ="std"; | ||
| 33 | $PUSHMA ="std,ma"; | ||
| 34 | $POP ="ldd"; | ||
| 35 | $POPMB ="ldd,mb"; | ||
| 36 | $NREGS =6; | ||
| 37 | } else { | ||
| 38 | $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0"; | ||
| 39 | $SIZE_T =4; | ||
| 40 | $FRAME_MARKER =48; | ||
| 41 | $SAVED_RP =20; | ||
| 42 | $PUSH ="stw"; | ||
| 43 | $PUSHMA ="stwm"; | ||
| 44 | $POP ="ldw"; | ||
| 45 | $POPMB ="ldwm"; | ||
| 46 | $NREGS =11; | ||
| 47 | } | ||
| 48 | |||
| 49 | $FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker | ||
| 50 | # [+ argument transfer] | ||
| 51 | |||
| 52 | ################# volatile registers | ||
| 53 | $Xi="%r26"; # argument block | ||
| 54 | $Htbl="%r25"; | ||
| 55 | $inp="%r24"; | ||
| 56 | $len="%r23"; | ||
| 57 | $Hhh=$Htbl; # variables | ||
| 58 | $Hll="%r22"; | ||
| 59 | $Zhh="%r21"; | ||
| 60 | $Zll="%r20"; | ||
| 61 | $cnt="%r19"; | ||
| 62 | $rem_4bit="%r28"; | ||
| 63 | $rem="%r29"; | ||
| 64 | $mask0xf0="%r31"; | ||
| 65 | |||
| 66 | ################# preserved registers | ||
| 67 | $Thh="%r1"; | ||
| 68 | $Tll="%r2"; | ||
| 69 | $nlo="%r3"; | ||
| 70 | $nhi="%r4"; | ||
| 71 | $byte="%r5"; | ||
| 72 | if ($SIZE_T==4) { | ||
| 73 | $Zhl="%r6"; | ||
| 74 | $Zlh="%r7"; | ||
| 75 | $Hhl="%r8"; | ||
| 76 | $Hlh="%r9"; | ||
| 77 | $Thl="%r10"; | ||
| 78 | $Tlh="%r11"; | ||
| 79 | } | ||
| 80 | $rem2="%r6"; # used in PA-RISC 2.0 code | ||
| 81 | |||
| 82 | $code.=<<___; | ||
| 83 | .LEVEL $LEVEL | ||
| 84 | .SPACE \$TEXT\$ | ||
| 85 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 86 | |||
| 87 | .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR | ||
| 88 | .ALIGN 64 | ||
| 89 | gcm_gmult_4bit | ||
| 90 | .PROC | ||
| 91 | .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS | ||
| 92 | .ENTRY | ||
| 93 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 94 | $PUSHMA %r3,$FRAME(%sp) | ||
| 95 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 96 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 97 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 98 | ___ | ||
| 99 | $code.=<<___ if ($SIZE_T==4); | ||
| 100 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 101 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 102 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 103 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 104 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
| 105 | ___ | ||
| 106 | $code.=<<___; | ||
| 107 | blr %r0,$rem_4bit | ||
| 108 | ldi 3,$rem | ||
| 109 | L\$pic_gmult | ||
| 110 | andcm $rem_4bit,$rem,$rem_4bit | ||
| 111 | addl $inp,$len,$len | ||
| 112 | ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit | ||
| 113 | ldi 0xf0,$mask0xf0 | ||
| 114 | ___ | ||
| 115 | $code.=<<___ if ($SIZE_T==4); | ||
| 116 | ldi 31,$rem | ||
| 117 | mtctl $rem,%cr11 | ||
| 118 | extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0 | ||
| 119 | b L\$parisc1_gmult | ||
| 120 | nop | ||
| 121 | ___ | ||
| 122 | |||
| 123 | $code.=<<___; | ||
| 124 | ldb 15($Xi),$nlo | ||
| 125 | ldo 8($Htbl),$Hll | ||
| 126 | |||
| 127 | and $mask0xf0,$nlo,$nhi | ||
| 128 | depd,z $nlo,59,4,$nlo | ||
| 129 | |||
| 130 | ldd $nlo($Hll),$Zll | ||
| 131 | ldd $nlo($Hhh),$Zhh | ||
| 132 | |||
| 133 | depd,z $Zll,60,4,$rem | ||
| 134 | shrpd $Zhh,$Zll,4,$Zll | ||
| 135 | extrd,u $Zhh,59,60,$Zhh | ||
| 136 | ldb 14($Xi),$nlo | ||
| 137 | |||
| 138 | ldd $nhi($Hll),$Tll | ||
| 139 | ldd $nhi($Hhh),$Thh | ||
| 140 | and $mask0xf0,$nlo,$nhi | ||
| 141 | depd,z $nlo,59,4,$nlo | ||
| 142 | |||
| 143 | xor $Tll,$Zll,$Zll | ||
| 144 | xor $Thh,$Zhh,$Zhh | ||
| 145 | ldd $rem($rem_4bit),$rem | ||
| 146 | b L\$oop_gmult_pa2 | ||
| 147 | ldi 13,$cnt | ||
| 148 | |||
| 149 | .ALIGN 8 | ||
| 150 | L\$oop_gmult_pa2 | ||
| 151 | xor $rem,$Zhh,$Zhh ; moved here to work around gas bug | ||
| 152 | depd,z $Zll,60,4,$rem | ||
| 153 | |||
| 154 | shrpd $Zhh,$Zll,4,$Zll | ||
| 155 | extrd,u $Zhh,59,60,$Zhh | ||
| 156 | ldd $nlo($Hll),$Tll | ||
| 157 | ldd $nlo($Hhh),$Thh | ||
| 158 | |||
| 159 | xor $Tll,$Zll,$Zll | ||
| 160 | xor $Thh,$Zhh,$Zhh | ||
| 161 | ldd $rem($rem_4bit),$rem | ||
| 162 | |||
| 163 | xor $rem,$Zhh,$Zhh | ||
| 164 | depd,z $Zll,60,4,$rem | ||
| 165 | ldbx $cnt($Xi),$nlo | ||
| 166 | |||
| 167 | shrpd $Zhh,$Zll,4,$Zll | ||
| 168 | extrd,u $Zhh,59,60,$Zhh | ||
| 169 | ldd $nhi($Hll),$Tll | ||
| 170 | ldd $nhi($Hhh),$Thh | ||
| 171 | |||
| 172 | and $mask0xf0,$nlo,$nhi | ||
| 173 | depd,z $nlo,59,4,$nlo | ||
| 174 | ldd $rem($rem_4bit),$rem | ||
| 175 | |||
| 176 | xor $Tll,$Zll,$Zll | ||
| 177 | addib,uv -1,$cnt,L\$oop_gmult_pa2 | ||
| 178 | xor $Thh,$Zhh,$Zhh | ||
| 179 | |||
| 180 | xor $rem,$Zhh,$Zhh | ||
| 181 | depd,z $Zll,60,4,$rem | ||
| 182 | |||
| 183 | shrpd $Zhh,$Zll,4,$Zll | ||
| 184 | extrd,u $Zhh,59,60,$Zhh | ||
| 185 | ldd $nlo($Hll),$Tll | ||
| 186 | ldd $nlo($Hhh),$Thh | ||
| 187 | |||
| 188 | xor $Tll,$Zll,$Zll | ||
| 189 | xor $Thh,$Zhh,$Zhh | ||
| 190 | ldd $rem($rem_4bit),$rem | ||
| 191 | |||
| 192 | xor $rem,$Zhh,$Zhh | ||
| 193 | depd,z $Zll,60,4,$rem | ||
| 194 | |||
| 195 | shrpd $Zhh,$Zll,4,$Zll | ||
| 196 | extrd,u $Zhh,59,60,$Zhh | ||
| 197 | ldd $nhi($Hll),$Tll | ||
| 198 | ldd $nhi($Hhh),$Thh | ||
| 199 | |||
| 200 | xor $Tll,$Zll,$Zll | ||
| 201 | xor $Thh,$Zhh,$Zhh | ||
| 202 | ldd $rem($rem_4bit),$rem | ||
| 203 | |||
| 204 | xor $rem,$Zhh,$Zhh | ||
| 205 | std $Zll,8($Xi) | ||
| 206 | std $Zhh,0($Xi) | ||
| 207 | ___ | ||
| 208 | |||
| 209 | $code.=<<___ if ($SIZE_T==4); | ||
| 210 | b L\$done_gmult | ||
| 211 | nop | ||
| 212 | |||
| 213 | L\$parisc1_gmult | ||
| 214 | ldb 15($Xi),$nlo | ||
| 215 | ldo 12($Htbl),$Hll | ||
| 216 | ldo 8($Htbl),$Hlh | ||
| 217 | ldo 4($Htbl),$Hhl | ||
| 218 | |||
| 219 | and $mask0xf0,$nlo,$nhi | ||
| 220 | zdep $nlo,27,4,$nlo | ||
| 221 | |||
| 222 | ldwx $nlo($Hll),$Zll | ||
| 223 | ldwx $nlo($Hlh),$Zlh | ||
| 224 | ldwx $nlo($Hhl),$Zhl | ||
| 225 | ldwx $nlo($Hhh),$Zhh | ||
| 226 | zdep $Zll,28,4,$rem | ||
| 227 | ldb 14($Xi),$nlo | ||
| 228 | ldwx $rem($rem_4bit),$rem | ||
| 229 | shrpw $Zlh,$Zll,4,$Zll | ||
| 230 | ldwx $nhi($Hll),$Tll | ||
| 231 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 232 | ldwx $nhi($Hlh),$Tlh | ||
| 233 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 234 | ldwx $nhi($Hhl),$Thl | ||
| 235 | extru $Zhh,27,28,$Zhh | ||
| 236 | ldwx $nhi($Hhh),$Thh | ||
| 237 | xor $rem,$Zhh,$Zhh | ||
| 238 | and $mask0xf0,$nlo,$nhi | ||
| 239 | zdep $nlo,27,4,$nlo | ||
| 240 | |||
| 241 | xor $Tll,$Zll,$Zll | ||
| 242 | ldwx $nlo($Hll),$Tll | ||
| 243 | xor $Tlh,$Zlh,$Zlh | ||
| 244 | ldwx $nlo($Hlh),$Tlh | ||
| 245 | xor $Thl,$Zhl,$Zhl | ||
| 246 | b L\$oop_gmult_pa1 | ||
| 247 | ldi 13,$cnt | ||
| 248 | |||
| 249 | .ALIGN 8 | ||
| 250 | L\$oop_gmult_pa1 | ||
| 251 | zdep $Zll,28,4,$rem | ||
| 252 | ldwx $nlo($Hhl),$Thl | ||
| 253 | xor $Thh,$Zhh,$Zhh | ||
| 254 | ldwx $rem($rem_4bit),$rem | ||
| 255 | shrpw $Zlh,$Zll,4,$Zll | ||
| 256 | ldwx $nlo($Hhh),$Thh | ||
| 257 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 258 | ldbx $cnt($Xi),$nlo | ||
| 259 | xor $Tll,$Zll,$Zll | ||
| 260 | ldwx $nhi($Hll),$Tll | ||
| 261 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 262 | xor $Tlh,$Zlh,$Zlh | ||
| 263 | ldwx $nhi($Hlh),$Tlh | ||
| 264 | extru $Zhh,27,28,$Zhh | ||
| 265 | xor $Thl,$Zhl,$Zhl | ||
| 266 | ldwx $nhi($Hhl),$Thl | ||
| 267 | xor $rem,$Zhh,$Zhh | ||
| 268 | zdep $Zll,28,4,$rem | ||
| 269 | xor $Thh,$Zhh,$Zhh | ||
| 270 | ldwx $nhi($Hhh),$Thh | ||
| 271 | shrpw $Zlh,$Zll,4,$Zll | ||
| 272 | ldwx $rem($rem_4bit),$rem | ||
| 273 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 274 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 275 | and $mask0xf0,$nlo,$nhi | ||
| 276 | extru $Zhh,27,28,$Zhh | ||
| 277 | zdep $nlo,27,4,$nlo | ||
| 278 | xor $Tll,$Zll,$Zll | ||
| 279 | ldwx $nlo($Hll),$Tll | ||
| 280 | xor $Tlh,$Zlh,$Zlh | ||
| 281 | ldwx $nlo($Hlh),$Tlh | ||
| 282 | xor $rem,$Zhh,$Zhh | ||
| 283 | addib,uv -1,$cnt,L\$oop_gmult_pa1 | ||
| 284 | xor $Thl,$Zhl,$Zhl | ||
| 285 | |||
| 286 | zdep $Zll,28,4,$rem | ||
| 287 | ldwx $nlo($Hhl),$Thl | ||
| 288 | xor $Thh,$Zhh,$Zhh | ||
| 289 | ldwx $rem($rem_4bit),$rem | ||
| 290 | shrpw $Zlh,$Zll,4,$Zll | ||
| 291 | ldwx $nlo($Hhh),$Thh | ||
| 292 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 293 | xor $Tll,$Zll,$Zll | ||
| 294 | ldwx $nhi($Hll),$Tll | ||
| 295 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 296 | xor $Tlh,$Zlh,$Zlh | ||
| 297 | ldwx $nhi($Hlh),$Tlh | ||
| 298 | extru $Zhh,27,28,$Zhh | ||
| 299 | xor $rem,$Zhh,$Zhh | ||
| 300 | xor $Thl,$Zhl,$Zhl | ||
| 301 | ldwx $nhi($Hhl),$Thl | ||
| 302 | xor $Thh,$Zhh,$Zhh | ||
| 303 | ldwx $nhi($Hhh),$Thh | ||
| 304 | zdep $Zll,28,4,$rem | ||
| 305 | ldwx $rem($rem_4bit),$rem | ||
| 306 | shrpw $Zlh,$Zll,4,$Zll | ||
| 307 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 308 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 309 | extru $Zhh,27,28,$Zhh | ||
| 310 | xor $Tll,$Zll,$Zll | ||
| 311 | xor $Tlh,$Zlh,$Zlh | ||
| 312 | xor $rem,$Zhh,$Zhh | ||
| 313 | stw $Zll,12($Xi) | ||
| 314 | xor $Thl,$Zhl,$Zhl | ||
| 315 | stw $Zlh,8($Xi) | ||
| 316 | xor $Thh,$Zhh,$Zhh | ||
| 317 | stw $Zhl,4($Xi) | ||
| 318 | stw $Zhh,0($Xi) | ||
| 319 | ___ | ||
| 320 | $code.=<<___; | ||
| 321 | L\$done_gmult | ||
| 322 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 323 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 324 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 325 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 326 | ___ | ||
| 327 | $code.=<<___ if ($SIZE_T==4); | ||
| 328 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 329 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 330 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 331 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 332 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
| 333 | ___ | ||
| 334 | $code.=<<___; | ||
| 335 | bv (%r2) | ||
| 336 | .EXIT | ||
| 337 | $POPMB -$FRAME(%sp),%r3 | ||
| 338 | .PROCEND | ||
| 339 | |||
| 340 | .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR | ||
| 341 | .ALIGN 64 | ||
| 342 | gcm_ghash_4bit | ||
| 343 | .PROC | ||
| 344 | .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11 | ||
| 345 | .ENTRY | ||
| 346 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 347 | $PUSHMA %r3,$FRAME(%sp) | ||
| 348 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 349 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 350 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 351 | ___ | ||
| 352 | $code.=<<___ if ($SIZE_T==4); | ||
| 353 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 354 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 355 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 356 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 357 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
| 358 | ___ | ||
| 359 | $code.=<<___; | ||
| 360 | blr %r0,$rem_4bit | ||
| 361 | ldi 3,$rem | ||
| 362 | L\$pic_ghash | ||
| 363 | andcm $rem_4bit,$rem,$rem_4bit | ||
| 364 | addl $inp,$len,$len | ||
| 365 | ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit | ||
| 366 | ldi 0xf0,$mask0xf0 | ||
| 367 | ___ | ||
| 368 | $code.=<<___ if ($SIZE_T==4); | ||
| 369 | ldi 31,$rem | ||
| 370 | mtctl $rem,%cr11 | ||
| 371 | extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0 | ||
| 372 | b L\$parisc1_ghash | ||
| 373 | nop | ||
| 374 | ___ | ||
| 375 | |||
| 376 | $code.=<<___; | ||
| 377 | ldb 15($Xi),$nlo | ||
| 378 | ldo 8($Htbl),$Hll | ||
| 379 | |||
| 380 | L\$outer_ghash_pa2 | ||
| 381 | ldb 15($inp),$nhi | ||
| 382 | xor $nhi,$nlo,$nlo | ||
| 383 | and $mask0xf0,$nlo,$nhi | ||
| 384 | depd,z $nlo,59,4,$nlo | ||
| 385 | |||
| 386 | ldd $nlo($Hll),$Zll | ||
| 387 | ldd $nlo($Hhh),$Zhh | ||
| 388 | |||
| 389 | depd,z $Zll,60,4,$rem | ||
| 390 | shrpd $Zhh,$Zll,4,$Zll | ||
| 391 | extrd,u $Zhh,59,60,$Zhh | ||
| 392 | ldb 14($Xi),$nlo | ||
| 393 | ldb 14($inp),$byte | ||
| 394 | |||
| 395 | ldd $nhi($Hll),$Tll | ||
| 396 | ldd $nhi($Hhh),$Thh | ||
| 397 | xor $byte,$nlo,$nlo | ||
| 398 | and $mask0xf0,$nlo,$nhi | ||
| 399 | depd,z $nlo,59,4,$nlo | ||
| 400 | |||
| 401 | xor $Tll,$Zll,$Zll | ||
| 402 | xor $Thh,$Zhh,$Zhh | ||
| 403 | ldd $rem($rem_4bit),$rem | ||
| 404 | b L\$oop_ghash_pa2 | ||
| 405 | ldi 13,$cnt | ||
| 406 | |||
| 407 | .ALIGN 8 | ||
| 408 | L\$oop_ghash_pa2 | ||
| 409 | xor $rem,$Zhh,$Zhh ; moved here to work around gas bug | ||
| 410 | depd,z $Zll,60,4,$rem2 | ||
| 411 | |||
| 412 | shrpd $Zhh,$Zll,4,$Zll | ||
| 413 | extrd,u $Zhh,59,60,$Zhh | ||
| 414 | ldd $nlo($Hll),$Tll | ||
| 415 | ldd $nlo($Hhh),$Thh | ||
| 416 | |||
| 417 | xor $Tll,$Zll,$Zll | ||
| 418 | xor $Thh,$Zhh,$Zhh | ||
| 419 | ldbx $cnt($Xi),$nlo | ||
| 420 | ldbx $cnt($inp),$byte | ||
| 421 | |||
| 422 | depd,z $Zll,60,4,$rem | ||
| 423 | shrpd $Zhh,$Zll,4,$Zll | ||
| 424 | ldd $rem2($rem_4bit),$rem2 | ||
| 425 | |||
| 426 | xor $rem2,$Zhh,$Zhh | ||
| 427 | xor $byte,$nlo,$nlo | ||
| 428 | ldd $nhi($Hll),$Tll | ||
| 429 | ldd $nhi($Hhh),$Thh | ||
| 430 | |||
| 431 | and $mask0xf0,$nlo,$nhi | ||
| 432 | depd,z $nlo,59,4,$nlo | ||
| 433 | |||
| 434 | extrd,u $Zhh,59,60,$Zhh | ||
| 435 | xor $Tll,$Zll,$Zll | ||
| 436 | |||
| 437 | ldd $rem($rem_4bit),$rem | ||
| 438 | addib,uv -1,$cnt,L\$oop_ghash_pa2 | ||
| 439 | xor $Thh,$Zhh,$Zhh | ||
| 440 | |||
| 441 | xor $rem,$Zhh,$Zhh | ||
| 442 | depd,z $Zll,60,4,$rem2 | ||
| 443 | |||
| 444 | shrpd $Zhh,$Zll,4,$Zll | ||
| 445 | extrd,u $Zhh,59,60,$Zhh | ||
| 446 | ldd $nlo($Hll),$Tll | ||
| 447 | ldd $nlo($Hhh),$Thh | ||
| 448 | |||
| 449 | xor $Tll,$Zll,$Zll | ||
| 450 | xor $Thh,$Zhh,$Zhh | ||
| 451 | |||
| 452 | depd,z $Zll,60,4,$rem | ||
| 453 | shrpd $Zhh,$Zll,4,$Zll | ||
| 454 | ldd $rem2($rem_4bit),$rem2 | ||
| 455 | |||
| 456 | xor $rem2,$Zhh,$Zhh | ||
| 457 | ldd $nhi($Hll),$Tll | ||
| 458 | ldd $nhi($Hhh),$Thh | ||
| 459 | |||
| 460 | extrd,u $Zhh,59,60,$Zhh | ||
| 461 | xor $Tll,$Zll,$Zll | ||
| 462 | xor $Thh,$Zhh,$Zhh | ||
| 463 | ldd $rem($rem_4bit),$rem | ||
| 464 | |||
| 465 | xor $rem,$Zhh,$Zhh | ||
| 466 | std $Zll,8($Xi) | ||
| 467 | ldo 16($inp),$inp | ||
| 468 | std $Zhh,0($Xi) | ||
| 469 | cmpb,*<> $inp,$len,L\$outer_ghash_pa2 | ||
| 470 | copy $Zll,$nlo | ||
| 471 | ___ | ||
| 472 | |||
| 473 | $code.=<<___ if ($SIZE_T==4); | ||
| 474 | b L\$done_ghash | ||
| 475 | nop | ||
| 476 | |||
| 477 | L\$parisc1_ghash | ||
| 478 | ldb 15($Xi),$nlo | ||
| 479 | ldo 12($Htbl),$Hll | ||
| 480 | ldo 8($Htbl),$Hlh | ||
| 481 | ldo 4($Htbl),$Hhl | ||
| 482 | |||
| 483 | L\$outer_ghash_pa1 | ||
| 484 | ldb 15($inp),$byte | ||
| 485 | xor $byte,$nlo,$nlo | ||
| 486 | and $mask0xf0,$nlo,$nhi | ||
| 487 | zdep $nlo,27,4,$nlo | ||
| 488 | |||
| 489 | ldwx $nlo($Hll),$Zll | ||
| 490 | ldwx $nlo($Hlh),$Zlh | ||
| 491 | ldwx $nlo($Hhl),$Zhl | ||
| 492 | ldwx $nlo($Hhh),$Zhh | ||
| 493 | zdep $Zll,28,4,$rem | ||
| 494 | ldb 14($Xi),$nlo | ||
| 495 | ldb 14($inp),$byte | ||
| 496 | ldwx $rem($rem_4bit),$rem | ||
| 497 | shrpw $Zlh,$Zll,4,$Zll | ||
| 498 | ldwx $nhi($Hll),$Tll | ||
| 499 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 500 | ldwx $nhi($Hlh),$Tlh | ||
| 501 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 502 | ldwx $nhi($Hhl),$Thl | ||
| 503 | extru $Zhh,27,28,$Zhh | ||
| 504 | ldwx $nhi($Hhh),$Thh | ||
| 505 | xor $byte,$nlo,$nlo | ||
| 506 | xor $rem,$Zhh,$Zhh | ||
| 507 | and $mask0xf0,$nlo,$nhi | ||
| 508 | zdep $nlo,27,4,$nlo | ||
| 509 | |||
| 510 | xor $Tll,$Zll,$Zll | ||
| 511 | ldwx $nlo($Hll),$Tll | ||
| 512 | xor $Tlh,$Zlh,$Zlh | ||
| 513 | ldwx $nlo($Hlh),$Tlh | ||
| 514 | xor $Thl,$Zhl,$Zhl | ||
| 515 | b L\$oop_ghash_pa1 | ||
| 516 | ldi 13,$cnt | ||
| 517 | |||
| 518 | .ALIGN 8 | ||
| 519 | L\$oop_ghash_pa1 | ||
| 520 | zdep $Zll,28,4,$rem | ||
| 521 | ldwx $nlo($Hhl),$Thl | ||
| 522 | xor $Thh,$Zhh,$Zhh | ||
| 523 | ldwx $rem($rem_4bit),$rem | ||
| 524 | shrpw $Zlh,$Zll,4,$Zll | ||
| 525 | ldwx $nlo($Hhh),$Thh | ||
| 526 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 527 | ldbx $cnt($Xi),$nlo | ||
| 528 | xor $Tll,$Zll,$Zll | ||
| 529 | ldwx $nhi($Hll),$Tll | ||
| 530 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 531 | ldbx $cnt($inp),$byte | ||
| 532 | xor $Tlh,$Zlh,$Zlh | ||
| 533 | ldwx $nhi($Hlh),$Tlh | ||
| 534 | extru $Zhh,27,28,$Zhh | ||
| 535 | xor $Thl,$Zhl,$Zhl | ||
| 536 | ldwx $nhi($Hhl),$Thl | ||
| 537 | xor $rem,$Zhh,$Zhh | ||
| 538 | zdep $Zll,28,4,$rem | ||
| 539 | xor $Thh,$Zhh,$Zhh | ||
| 540 | ldwx $nhi($Hhh),$Thh | ||
| 541 | shrpw $Zlh,$Zll,4,$Zll | ||
| 542 | ldwx $rem($rem_4bit),$rem | ||
| 543 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 544 | xor $byte,$nlo,$nlo | ||
| 545 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 546 | and $mask0xf0,$nlo,$nhi | ||
| 547 | extru $Zhh,27,28,$Zhh | ||
| 548 | zdep $nlo,27,4,$nlo | ||
| 549 | xor $Tll,$Zll,$Zll | ||
| 550 | ldwx $nlo($Hll),$Tll | ||
| 551 | xor $Tlh,$Zlh,$Zlh | ||
| 552 | ldwx $nlo($Hlh),$Tlh | ||
| 553 | xor $rem,$Zhh,$Zhh | ||
| 554 | addib,uv -1,$cnt,L\$oop_ghash_pa1 | ||
| 555 | xor $Thl,$Zhl,$Zhl | ||
| 556 | |||
| 557 | zdep $Zll,28,4,$rem | ||
| 558 | ldwx $nlo($Hhl),$Thl | ||
| 559 | xor $Thh,$Zhh,$Zhh | ||
| 560 | ldwx $rem($rem_4bit),$rem | ||
| 561 | shrpw $Zlh,$Zll,4,$Zll | ||
| 562 | ldwx $nlo($Hhh),$Thh | ||
| 563 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 564 | xor $Tll,$Zll,$Zll | ||
| 565 | ldwx $nhi($Hll),$Tll | ||
| 566 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 567 | xor $Tlh,$Zlh,$Zlh | ||
| 568 | ldwx $nhi($Hlh),$Tlh | ||
| 569 | extru $Zhh,27,28,$Zhh | ||
| 570 | xor $rem,$Zhh,$Zhh | ||
| 571 | xor $Thl,$Zhl,$Zhl | ||
| 572 | ldwx $nhi($Hhl),$Thl | ||
| 573 | xor $Thh,$Zhh,$Zhh | ||
| 574 | ldwx $nhi($Hhh),$Thh | ||
| 575 | zdep $Zll,28,4,$rem | ||
| 576 | ldwx $rem($rem_4bit),$rem | ||
| 577 | shrpw $Zlh,$Zll,4,$Zll | ||
| 578 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 579 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 580 | extru $Zhh,27,28,$Zhh | ||
| 581 | xor $Tll,$Zll,$Zll | ||
| 582 | xor $Tlh,$Zlh,$Zlh | ||
| 583 | xor $rem,$Zhh,$Zhh | ||
| 584 | stw $Zll,12($Xi) | ||
| 585 | xor $Thl,$Zhl,$Zhl | ||
| 586 | stw $Zlh,8($Xi) | ||
| 587 | xor $Thh,$Zhh,$Zhh | ||
| 588 | stw $Zhl,4($Xi) | ||
| 589 | ldo 16($inp),$inp | ||
| 590 | stw $Zhh,0($Xi) | ||
| 591 | comb,<> $inp,$len,L\$outer_ghash_pa1 | ||
| 592 | copy $Zll,$nlo | ||
| 593 | ___ | ||
| 594 | $code.=<<___; | ||
| 595 | L\$done_ghash | ||
| 596 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 597 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 598 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 599 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 600 | ___ | ||
| 601 | $code.=<<___ if ($SIZE_T==4); | ||
| 602 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 603 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 604 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 605 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 606 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
| 607 | ___ | ||
| 608 | $code.=<<___; | ||
| 609 | bv (%r2) | ||
| 610 | .EXIT | ||
| 611 | $POPMB -$FRAME(%sp),%r3 | ||
| 612 | .PROCEND | ||
| 613 | |||
| 614 | .ALIGN 64 | ||
| 615 | L\$rem_4bit | ||
| 616 | .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 | ||
| 617 | .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 | ||
| 618 | .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 | ||
| 619 | .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 | ||
| 620 | .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>" | ||
| 621 | .ALIGN 64 | ||
| 622 | ___ | ||
| 623 | |||
| 624 | # Explicitly encode PA-RISC 2.0 instructions used in this module, so | ||
| 625 | # that it can be compiled with .LEVEL 1.0. It should be noted that I | ||
| 626 | # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 | ||
| 627 | # directive... | ||
| 628 | |||
| 629 | my $ldd = sub { | ||
| 630 | my ($mod,$args) = @_; | ||
| 631 | my $orig = "ldd$mod\t$args"; | ||
| 632 | |||
| 633 | if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 | ||
| 634 | { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; | ||
| 635 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 636 | } | ||
| 637 | elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 | ||
| 638 | { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; | ||
| 639 | $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset | ||
| 640 | $opcode|=(1<<5) if ($mod =~ /^,m/); | ||
| 641 | $opcode|=(1<<13) if ($mod =~ /^,mb/); | ||
| 642 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 643 | } | ||
| 644 | else { "\t".$orig; } | ||
| 645 | }; | ||
| 646 | |||
| 647 | my $std = sub { | ||
| 648 | my ($mod,$args) = @_; | ||
| 649 | my $orig = "std$mod\t$args"; | ||
| 650 | |||
| 651 | if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices | ||
| 652 | { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); | ||
| 653 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 654 | } | ||
| 655 | else { "\t".$orig; } | ||
| 656 | }; | ||
| 657 | |||
| 658 | my $extrd = sub { | ||
| 659 | my ($mod,$args) = @_; | ||
| 660 | my $orig = "extrd$mod\t$args"; | ||
| 661 | |||
| 662 | # I only have ",u" completer, it's implicitly encoded... | ||
| 663 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 | ||
| 664 | { my $opcode=(0x36<<26)|($1<<21)|($4<<16); | ||
| 665 | my $len=32-$3; | ||
| 666 | $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos | ||
| 667 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
| 668 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 669 | } | ||
| 670 | elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 | ||
| 671 | { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); | ||
| 672 | my $len=32-$2; | ||
| 673 | $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len | ||
| 674 | $opcode |= (1<<13) if ($mod =~ /,\**=/); | ||
| 675 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 676 | } | ||
| 677 | else { "\t".$orig; } | ||
| 678 | }; | ||
| 679 | |||
| 680 | my $shrpd = sub { | ||
| 681 | my ($mod,$args) = @_; | ||
| 682 | my $orig = "shrpd$mod\t$args"; | ||
| 683 | |||
| 684 | if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 | ||
| 685 | { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; | ||
| 686 | my $cpos=63-$3; | ||
| 687 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa | ||
| 688 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 689 | } | ||
| 690 | elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 | ||
| 691 | { sprintf "\t.WORD\t0x%08x\t; %s", | ||
| 692 | (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; | ||
| 693 | } | ||
| 694 | else { "\t".$orig; } | ||
| 695 | }; | ||
| 696 | |||
| 697 | my $depd = sub { | ||
| 698 | my ($mod,$args) = @_; | ||
| 699 | my $orig = "depd$mod\t$args"; | ||
| 700 | |||
| 701 | # I only have ",z" completer, it's impicitly encoded... | ||
| 702 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16 | ||
| 703 | { my $opcode=(0x3c<<26)|($4<<21)|($1<<16); | ||
| 704 | my $cpos=63-$2; | ||
| 705 | my $len=32-$3; | ||
| 706 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos | ||
| 707 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
| 708 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 709 | } | ||
| 710 | else { "\t".$orig; } | ||
| 711 | }; | ||
| 712 | |||
| 713 | sub assemble { | ||
| 714 | my ($mnemonic,$mod,$args)=@_; | ||
| 715 | my $opcode = eval("\$$mnemonic"); | ||
| 716 | |||
| 717 | ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; | ||
| 718 | } | ||
| 719 | |||
| 720 | foreach (split("\n",$code)) { | ||
| 721 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 722 | if ($SIZE_T==4) { | ||
| 723 | s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e; | ||
| 724 | s/cmpb,\*/comb,/; | ||
| 725 | s/,\*/,/; | ||
| 726 | } | ||
| 727 | print $_,"\n"; | ||
| 728 | } | ||
| 729 | |||
| 730 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-s390x.pl b/src/lib/libcrypto/modes/asm/ghash-s390x.pl new file mode 100644 index 0000000000..6a40d5d89c --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-s390x.pl | |||
| @@ -0,0 +1,262 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # September 2010. | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+128 bytes shared table]. Performance | ||
| 15 | # was measured to be ~18 cycles per processed byte on z10, which is | ||
| 16 | # almost 40% better than gcc-generated code. It should be noted that | ||
| 17 | # 18 cycles is worse result than expected: loop is scheduled for 12 | ||
| 18 | # and the result should be close to 12. In the lack of instruction- | ||
| 19 | # level profiling data it's impossible to tell why... | ||
| 20 | |||
| 21 | # November 2010. | ||
| 22 | # | ||
| 23 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 24 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 25 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 26 | # application context. The feature is not specific to any particular | ||
| 27 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 28 | # remains z/Architecture specific. On z990 it was measured to perform | ||
| 29 | # 2.8x better than 32-bit code generated by gcc 4.3. | ||
| 30 | |||
| 31 | # March 2011. | ||
| 32 | # | ||
| 33 | # Support for hardware KIMD-GHASH is verified to produce correct | ||
| 34 | # result and therefore is engaged. On z196 it was measured to process | ||
| 35 | # 8KB buffer ~7 faster than software implementation. It's not as | ||
| 36 | # impressive for smaller buffer sizes and for smallest 16-bytes buffer | ||
| 37 | # it's actually almost 2 times slower. Which is the reason why | ||
| 38 | # KIMD-GHASH is not used in gcm_gmult_4bit. | ||
| 39 | |||
| 40 | $flavour = shift; | ||
| 41 | |||
| 42 | if ($flavour =~ /3[12]/) { | ||
| 43 | $SIZE_T=4; | ||
| 44 | $g=""; | ||
| 45 | } else { | ||
| 46 | $SIZE_T=8; | ||
| 47 | $g="g"; | ||
| 48 | } | ||
| 49 | |||
| 50 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 51 | open STDOUT,">$output"; | ||
| 52 | |||
| 53 | $softonly=0; | ||
| 54 | |||
| 55 | $Zhi="%r0"; | ||
| 56 | $Zlo="%r1"; | ||
| 57 | |||
| 58 | $Xi="%r2"; # argument block | ||
| 59 | $Htbl="%r3"; | ||
| 60 | $inp="%r4"; | ||
| 61 | $len="%r5"; | ||
| 62 | |||
| 63 | $rem0="%r6"; # variables | ||
| 64 | $rem1="%r7"; | ||
| 65 | $nlo="%r8"; | ||
| 66 | $nhi="%r9"; | ||
| 67 | $xi="%r10"; | ||
| 68 | $cnt="%r11"; | ||
| 69 | $tmp="%r12"; | ||
| 70 | $x78="%r13"; | ||
| 71 | $rem_4bit="%r14"; | ||
| 72 | |||
| 73 | $sp="%r15"; | ||
| 74 | |||
| 75 | $code.=<<___; | ||
| 76 | .text | ||
| 77 | |||
| 78 | .globl gcm_gmult_4bit | ||
| 79 | .align 32 | ||
| 80 | gcm_gmult_4bit: | ||
| 81 | ___ | ||
| 82 | $code.=<<___ if(!$softonly && 0); # hardware is slow for single block... | ||
| 83 | larl %r1,OPENSSL_s390xcap_P | ||
| 84 | lg %r0,0(%r1) | ||
| 85 | tmhl %r0,0x4000 # check for message-security-assist | ||
| 86 | jz .Lsoft_gmult | ||
| 87 | lghi %r0,0 | ||
| 88 | la %r1,16($sp) | ||
| 89 | .long 0xb93e0004 # kimd %r0,%r4 | ||
| 90 | lg %r1,24($sp) | ||
| 91 | tmhh %r1,0x4000 # check for function 65 | ||
| 92 | jz .Lsoft_gmult | ||
| 93 | stg %r0,16($sp) # arrange 16 bytes of zero input | ||
| 94 | stg %r0,24($sp) | ||
| 95 | lghi %r0,65 # function 65 | ||
| 96 | la %r1,0($Xi) # H lies right after Xi in gcm128_context | ||
| 97 | la $inp,16($sp) | ||
| 98 | lghi $len,16 | ||
| 99 | .long 0xb93e0004 # kimd %r0,$inp | ||
| 100 | brc 1,.-4 # pay attention to "partial completion" | ||
| 101 | br %r14 | ||
| 102 | .align 32 | ||
| 103 | .Lsoft_gmult: | ||
| 104 | ___ | ||
| 105 | $code.=<<___; | ||
| 106 | stm${g} %r6,%r14,6*$SIZE_T($sp) | ||
| 107 | |||
| 108 | aghi $Xi,-1 | ||
| 109 | lghi $len,1 | ||
| 110 | lghi $x78,`0xf<<3` | ||
| 111 | larl $rem_4bit,rem_4bit | ||
| 112 | |||
| 113 | lg $Zlo,8+1($Xi) # Xi | ||
| 114 | j .Lgmult_shortcut | ||
| 115 | .type gcm_gmult_4bit,\@function | ||
| 116 | .size gcm_gmult_4bit,(.-gcm_gmult_4bit) | ||
| 117 | |||
| 118 | .globl gcm_ghash_4bit | ||
| 119 | .align 32 | ||
| 120 | gcm_ghash_4bit: | ||
| 121 | ___ | ||
| 122 | $code.=<<___ if(!$softonly); | ||
| 123 | larl %r1,OPENSSL_s390xcap_P | ||
| 124 | lg %r0,0(%r1) | ||
| 125 | tmhl %r0,0x4000 # check for message-security-assist | ||
| 126 | jz .Lsoft_ghash | ||
| 127 | lghi %r0,0 | ||
| 128 | la %r1,16($sp) | ||
| 129 | .long 0xb93e0004 # kimd %r0,%r4 | ||
| 130 | lg %r1,24($sp) | ||
| 131 | tmhh %r1,0x4000 # check for function 65 | ||
| 132 | jz .Lsoft_ghash | ||
| 133 | lghi %r0,65 # function 65 | ||
| 134 | la %r1,0($Xi) # H lies right after Xi in gcm128_context | ||
| 135 | .long 0xb93e0004 # kimd %r0,$inp | ||
| 136 | brc 1,.-4 # pay attention to "partial completion" | ||
| 137 | br %r14 | ||
| 138 | .align 32 | ||
| 139 | .Lsoft_ghash: | ||
| 140 | ___ | ||
| 141 | $code.=<<___ if ($flavour =~ /3[12]/); | ||
| 142 | llgfr $len,$len | ||
| 143 | ___ | ||
| 144 | $code.=<<___; | ||
| 145 | stm${g} %r6,%r14,6*$SIZE_T($sp) | ||
| 146 | |||
| 147 | aghi $Xi,-1 | ||
| 148 | srlg $len,$len,4 | ||
| 149 | lghi $x78,`0xf<<3` | ||
| 150 | larl $rem_4bit,rem_4bit | ||
| 151 | |||
| 152 | lg $Zlo,8+1($Xi) # Xi | ||
| 153 | lg $Zhi,0+1($Xi) | ||
| 154 | lghi $tmp,0 | ||
| 155 | .Louter: | ||
| 156 | xg $Zhi,0($inp) # Xi ^= inp | ||
| 157 | xg $Zlo,8($inp) | ||
| 158 | xgr $Zhi,$tmp | ||
| 159 | stg $Zlo,8+1($Xi) | ||
| 160 | stg $Zhi,0+1($Xi) | ||
| 161 | |||
| 162 | .Lgmult_shortcut: | ||
| 163 | lghi $tmp,0xf0 | ||
| 164 | sllg $nlo,$Zlo,4 | ||
| 165 | srlg $xi,$Zlo,8 # extract second byte | ||
| 166 | ngr $nlo,$tmp | ||
| 167 | lgr $nhi,$Zlo | ||
| 168 | lghi $cnt,14 | ||
| 169 | ngr $nhi,$tmp | ||
| 170 | |||
| 171 | lg $Zlo,8($nlo,$Htbl) | ||
| 172 | lg $Zhi,0($nlo,$Htbl) | ||
| 173 | |||
| 174 | sllg $nlo,$xi,4 | ||
| 175 | sllg $rem0,$Zlo,3 | ||
| 176 | ngr $nlo,$tmp | ||
| 177 | ngr $rem0,$x78 | ||
| 178 | ngr $xi,$tmp | ||
| 179 | |||
| 180 | sllg $tmp,$Zhi,60 | ||
| 181 | srlg $Zlo,$Zlo,4 | ||
| 182 | srlg $Zhi,$Zhi,4 | ||
| 183 | xg $Zlo,8($nhi,$Htbl) | ||
| 184 | xg $Zhi,0($nhi,$Htbl) | ||
| 185 | lgr $nhi,$xi | ||
| 186 | sllg $rem1,$Zlo,3 | ||
| 187 | xgr $Zlo,$tmp | ||
| 188 | ngr $rem1,$x78 | ||
| 189 | j .Lghash_inner | ||
| 190 | .align 16 | ||
| 191 | .Lghash_inner: | ||
| 192 | srlg $Zlo,$Zlo,4 | ||
| 193 | sllg $tmp,$Zhi,60 | ||
| 194 | xg $Zlo,8($nlo,$Htbl) | ||
| 195 | srlg $Zhi,$Zhi,4 | ||
| 196 | llgc $xi,0($cnt,$Xi) | ||
| 197 | xg $Zhi,0($nlo,$Htbl) | ||
| 198 | sllg $nlo,$xi,4 | ||
| 199 | xg $Zhi,0($rem0,$rem_4bit) | ||
| 200 | nill $nlo,0xf0 | ||
| 201 | sllg $rem0,$Zlo,3 | ||
| 202 | xgr $Zlo,$tmp | ||
| 203 | ngr $rem0,$x78 | ||
| 204 | nill $xi,0xf0 | ||
| 205 | |||
| 206 | sllg $tmp,$Zhi,60 | ||
| 207 | srlg $Zlo,$Zlo,4 | ||
| 208 | srlg $Zhi,$Zhi,4 | ||
| 209 | xg $Zlo,8($nhi,$Htbl) | ||
| 210 | xg $Zhi,0($nhi,$Htbl) | ||
| 211 | lgr $nhi,$xi | ||
| 212 | xg $Zhi,0($rem1,$rem_4bit) | ||
| 213 | sllg $rem1,$Zlo,3 | ||
| 214 | xgr $Zlo,$tmp | ||
| 215 | ngr $rem1,$x78 | ||
| 216 | brct $cnt,.Lghash_inner | ||
| 217 | |||
| 218 | sllg $tmp,$Zhi,60 | ||
| 219 | srlg $Zlo,$Zlo,4 | ||
| 220 | srlg $Zhi,$Zhi,4 | ||
| 221 | xg $Zlo,8($nlo,$Htbl) | ||
| 222 | xg $Zhi,0($nlo,$Htbl) | ||
| 223 | sllg $xi,$Zlo,3 | ||
| 224 | xg $Zhi,0($rem0,$rem_4bit) | ||
| 225 | xgr $Zlo,$tmp | ||
| 226 | ngr $xi,$x78 | ||
| 227 | |||
| 228 | sllg $tmp,$Zhi,60 | ||
| 229 | srlg $Zlo,$Zlo,4 | ||
| 230 | srlg $Zhi,$Zhi,4 | ||
| 231 | xg $Zlo,8($nhi,$Htbl) | ||
| 232 | xg $Zhi,0($nhi,$Htbl) | ||
| 233 | xgr $Zlo,$tmp | ||
| 234 | xg $Zhi,0($rem1,$rem_4bit) | ||
| 235 | |||
| 236 | lg $tmp,0($xi,$rem_4bit) | ||
| 237 | la $inp,16($inp) | ||
| 238 | sllg $tmp,$tmp,4 # correct last rem_4bit[rem] | ||
| 239 | brctg $len,.Louter | ||
| 240 | |||
| 241 | xgr $Zhi,$tmp | ||
| 242 | stg $Zlo,8+1($Xi) | ||
| 243 | stg $Zhi,0+1($Xi) | ||
| 244 | lm${g} %r6,%r14,6*$SIZE_T($sp) | ||
| 245 | br %r14 | ||
| 246 | .type gcm_ghash_4bit,\@function | ||
| 247 | .size gcm_ghash_4bit,(.-gcm_ghash_4bit) | ||
| 248 | |||
| 249 | .align 64 | ||
| 250 | rem_4bit: | ||
| 251 | .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0 | ||
| 252 | .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0 | ||
| 253 | .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0 | ||
| 254 | .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0 | ||
| 255 | .type rem_4bit,\@object | ||
| 256 | .size rem_4bit,(.-rem_4bit) | ||
| 257 | .string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 258 | ___ | ||
| 259 | |||
| 260 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 261 | print $code; | ||
| 262 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl new file mode 100644 index 0000000000..70e7b044a3 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl | |||
| @@ -0,0 +1,330 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # March 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+128 bytes shared table]. Performance | ||
| 15 | # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU | ||
| 16 | # and are expressed in cycles per processed byte, less is better: | ||
| 17 | # | ||
| 18 | # gcc 3.3.x cc 5.2 this assembler | ||
| 19 | # | ||
| 20 | # 32-bit build 81.4 43.3 12.6 (+546%/+244%) | ||
| 21 | # 64-bit build 20.2 21.2 12.6 (+60%/+68%) | ||
| 22 | # | ||
| 23 | # Here is data collected on UltraSPARC T1 system running Linux: | ||
| 24 | # | ||
| 25 | # gcc 4.4.1 this assembler | ||
| 26 | # | ||
| 27 | # 32-bit build 566 50 (+1000%) | ||
| 28 | # 64-bit build 56 50 (+12%) | ||
| 29 | # | ||
| 30 | # I don't quite understand why difference between 32-bit and 64-bit | ||
| 31 | # compiler-generated code is so big. Compilers *were* instructed to | ||
| 32 | # generate code for UltraSPARC and should have used 64-bit registers | ||
| 33 | # for Z vector (see C code) even in 32-bit build... Oh well, it only | ||
| 34 | # means more impressive improvement coefficients for this assembler | ||
| 35 | # module;-) Loops are aggressively modulo-scheduled in respect to | ||
| 36 | # references to input data and Z.hi updates to achieve 12 cycles | ||
| 37 | # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6 | ||
| 38 | # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1. | ||
| 39 | |||
| 40 | $bits=32; | ||
| 41 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | ||
| 42 | if ($bits==64) { $bias=2047; $frame=192; } | ||
| 43 | else { $bias=0; $frame=112; } | ||
| 44 | |||
| 45 | $output=shift; | ||
| 46 | open STDOUT,">$output"; | ||
| 47 | |||
| 48 | $Zhi="%o0"; # 64-bit values | ||
| 49 | $Zlo="%o1"; | ||
| 50 | $Thi="%o2"; | ||
| 51 | $Tlo="%o3"; | ||
| 52 | $rem="%o4"; | ||
| 53 | $tmp="%o5"; | ||
| 54 | |||
| 55 | $nhi="%l0"; # small values and pointers | ||
| 56 | $nlo="%l1"; | ||
| 57 | $xi0="%l2"; | ||
| 58 | $xi1="%l3"; | ||
| 59 | $rem_4bit="%l4"; | ||
| 60 | $remi="%l5"; | ||
| 61 | $Htblo="%l6"; | ||
| 62 | $cnt="%l7"; | ||
| 63 | |||
| 64 | $Xi="%i0"; # input argument block | ||
| 65 | $Htbl="%i1"; | ||
| 66 | $inp="%i2"; | ||
| 67 | $len="%i3"; | ||
| 68 | |||
| 69 | $code.=<<___; | ||
| 70 | .section ".text",#alloc,#execinstr | ||
| 71 | |||
| 72 | .align 64 | ||
| 73 | rem_4bit: | ||
| 74 | .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 | ||
| 75 | .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 | ||
| 76 | .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 | ||
| 77 | .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 | ||
| 78 | .type rem_4bit,#object | ||
| 79 | .size rem_4bit,(.-rem_4bit) | ||
| 80 | |||
| 81 | .globl gcm_ghash_4bit | ||
| 82 | .align 32 | ||
| 83 | gcm_ghash_4bit: | ||
| 84 | save %sp,-$frame,%sp | ||
| 85 | ldub [$inp+15],$nlo | ||
| 86 | ldub [$Xi+15],$xi0 | ||
| 87 | ldub [$Xi+14],$xi1 | ||
| 88 | add $len,$inp,$len | ||
| 89 | add $Htbl,8,$Htblo | ||
| 90 | |||
| 91 | 1: call .+8 | ||
| 92 | add %o7,rem_4bit-1b,$rem_4bit | ||
| 93 | |||
| 94 | .Louter: | ||
| 95 | xor $xi0,$nlo,$nlo | ||
| 96 | and $nlo,0xf0,$nhi | ||
| 97 | and $nlo,0x0f,$nlo | ||
| 98 | sll $nlo,4,$nlo | ||
| 99 | ldx [$Htblo+$nlo],$Zlo | ||
| 100 | ldx [$Htbl+$nlo],$Zhi | ||
| 101 | |||
| 102 | ldub [$inp+14],$nlo | ||
| 103 | |||
| 104 | ldx [$Htblo+$nhi],$Tlo | ||
| 105 | and $Zlo,0xf,$remi | ||
| 106 | ldx [$Htbl+$nhi],$Thi | ||
| 107 | sll $remi,3,$remi | ||
| 108 | ldx [$rem_4bit+$remi],$rem | ||
| 109 | srlx $Zlo,4,$Zlo | ||
| 110 | mov 13,$cnt | ||
| 111 | sllx $Zhi,60,$tmp | ||
| 112 | xor $Tlo,$Zlo,$Zlo | ||
| 113 | srlx $Zhi,4,$Zhi | ||
| 114 | xor $Zlo,$tmp,$Zlo | ||
| 115 | |||
| 116 | xor $xi1,$nlo,$nlo | ||
| 117 | and $Zlo,0xf,$remi | ||
| 118 | and $nlo,0xf0,$nhi | ||
| 119 | and $nlo,0x0f,$nlo | ||
| 120 | ba .Lghash_inner | ||
| 121 | sll $nlo,4,$nlo | ||
| 122 | .align 32 | ||
| 123 | .Lghash_inner: | ||
| 124 | ldx [$Htblo+$nlo],$Tlo | ||
| 125 | sll $remi,3,$remi | ||
| 126 | xor $Thi,$Zhi,$Zhi | ||
| 127 | ldx [$Htbl+$nlo],$Thi | ||
| 128 | srlx $Zlo,4,$Zlo | ||
| 129 | xor $rem,$Zhi,$Zhi | ||
| 130 | ldx [$rem_4bit+$remi],$rem | ||
| 131 | sllx $Zhi,60,$tmp | ||
| 132 | xor $Tlo,$Zlo,$Zlo | ||
| 133 | ldub [$inp+$cnt],$nlo | ||
| 134 | srlx $Zhi,4,$Zhi | ||
| 135 | xor $Zlo,$tmp,$Zlo | ||
| 136 | ldub [$Xi+$cnt],$xi1 | ||
| 137 | xor $Thi,$Zhi,$Zhi | ||
| 138 | and $Zlo,0xf,$remi | ||
| 139 | |||
| 140 | ldx [$Htblo+$nhi],$Tlo | ||
| 141 | sll $remi,3,$remi | ||
| 142 | xor $rem,$Zhi,$Zhi | ||
| 143 | ldx [$Htbl+$nhi],$Thi | ||
| 144 | srlx $Zlo,4,$Zlo | ||
| 145 | ldx [$rem_4bit+$remi],$rem | ||
| 146 | sllx $Zhi,60,$tmp | ||
| 147 | xor $xi1,$nlo,$nlo | ||
| 148 | srlx $Zhi,4,$Zhi | ||
| 149 | and $nlo,0xf0,$nhi | ||
| 150 | addcc $cnt,-1,$cnt | ||
| 151 | xor $Zlo,$tmp,$Zlo | ||
| 152 | and $nlo,0x0f,$nlo | ||
| 153 | xor $Tlo,$Zlo,$Zlo | ||
| 154 | sll $nlo,4,$nlo | ||
| 155 | blu .Lghash_inner | ||
| 156 | and $Zlo,0xf,$remi | ||
| 157 | |||
| 158 | ldx [$Htblo+$nlo],$Tlo | ||
| 159 | sll $remi,3,$remi | ||
| 160 | xor $Thi,$Zhi,$Zhi | ||
| 161 | ldx [$Htbl+$nlo],$Thi | ||
| 162 | srlx $Zlo,4,$Zlo | ||
| 163 | xor $rem,$Zhi,$Zhi | ||
| 164 | ldx [$rem_4bit+$remi],$rem | ||
| 165 | sllx $Zhi,60,$tmp | ||
| 166 | xor $Tlo,$Zlo,$Zlo | ||
| 167 | srlx $Zhi,4,$Zhi | ||
| 168 | xor $Zlo,$tmp,$Zlo | ||
| 169 | xor $Thi,$Zhi,$Zhi | ||
| 170 | |||
| 171 | add $inp,16,$inp | ||
| 172 | cmp $inp,$len | ||
| 173 | be,pn `$bits==64?"%xcc":"%icc"`,.Ldone | ||
| 174 | and $Zlo,0xf,$remi | ||
| 175 | |||
| 176 | ldx [$Htblo+$nhi],$Tlo | ||
| 177 | sll $remi,3,$remi | ||
| 178 | xor $rem,$Zhi,$Zhi | ||
| 179 | ldx [$Htbl+$nhi],$Thi | ||
| 180 | srlx $Zlo,4,$Zlo | ||
| 181 | ldx [$rem_4bit+$remi],$rem | ||
| 182 | sllx $Zhi,60,$tmp | ||
| 183 | xor $Tlo,$Zlo,$Zlo | ||
| 184 | ldub [$inp+15],$nlo | ||
| 185 | srlx $Zhi,4,$Zhi | ||
| 186 | xor $Zlo,$tmp,$Zlo | ||
| 187 | xor $Thi,$Zhi,$Zhi | ||
| 188 | stx $Zlo,[$Xi+8] | ||
| 189 | xor $rem,$Zhi,$Zhi | ||
| 190 | stx $Zhi,[$Xi] | ||
| 191 | srl $Zlo,8,$xi1 | ||
| 192 | and $Zlo,0xff,$xi0 | ||
| 193 | ba .Louter | ||
| 194 | and $xi1,0xff,$xi1 | ||
| 195 | .align 32 | ||
| 196 | .Ldone: | ||
| 197 | ldx [$Htblo+$nhi],$Tlo | ||
| 198 | sll $remi,3,$remi | ||
| 199 | xor $rem,$Zhi,$Zhi | ||
| 200 | ldx [$Htbl+$nhi],$Thi | ||
| 201 | srlx $Zlo,4,$Zlo | ||
| 202 | ldx [$rem_4bit+$remi],$rem | ||
| 203 | sllx $Zhi,60,$tmp | ||
| 204 | xor $Tlo,$Zlo,$Zlo | ||
| 205 | srlx $Zhi,4,$Zhi | ||
| 206 | xor $Zlo,$tmp,$Zlo | ||
| 207 | xor $Thi,$Zhi,$Zhi | ||
| 208 | stx $Zlo,[$Xi+8] | ||
| 209 | xor $rem,$Zhi,$Zhi | ||
| 210 | stx $Zhi,[$Xi] | ||
| 211 | |||
| 212 | ret | ||
| 213 | restore | ||
| 214 | .type gcm_ghash_4bit,#function | ||
| 215 | .size gcm_ghash_4bit,(.-gcm_ghash_4bit) | ||
| 216 | ___ | ||
| 217 | |||
| 218 | undef $inp; | ||
| 219 | undef $len; | ||
| 220 | |||
| 221 | $code.=<<___; | ||
| 222 | .globl gcm_gmult_4bit | ||
| 223 | .align 32 | ||
| 224 | gcm_gmult_4bit: | ||
| 225 | save %sp,-$frame,%sp | ||
| 226 | ldub [$Xi+15],$nlo | ||
| 227 | add $Htbl,8,$Htblo | ||
| 228 | |||
| 229 | 1: call .+8 | ||
| 230 | add %o7,rem_4bit-1b,$rem_4bit | ||
| 231 | |||
| 232 | and $nlo,0xf0,$nhi | ||
| 233 | and $nlo,0x0f,$nlo | ||
| 234 | sll $nlo,4,$nlo | ||
| 235 | ldx [$Htblo+$nlo],$Zlo | ||
| 236 | ldx [$Htbl+$nlo],$Zhi | ||
| 237 | |||
| 238 | ldub [$Xi+14],$nlo | ||
| 239 | |||
| 240 | ldx [$Htblo+$nhi],$Tlo | ||
| 241 | and $Zlo,0xf,$remi | ||
| 242 | ldx [$Htbl+$nhi],$Thi | ||
| 243 | sll $remi,3,$remi | ||
| 244 | ldx [$rem_4bit+$remi],$rem | ||
| 245 | srlx $Zlo,4,$Zlo | ||
| 246 | mov 13,$cnt | ||
| 247 | sllx $Zhi,60,$tmp | ||
| 248 | xor $Tlo,$Zlo,$Zlo | ||
| 249 | srlx $Zhi,4,$Zhi | ||
| 250 | xor $Zlo,$tmp,$Zlo | ||
| 251 | |||
| 252 | and $Zlo,0xf,$remi | ||
| 253 | and $nlo,0xf0,$nhi | ||
| 254 | and $nlo,0x0f,$nlo | ||
| 255 | ba .Lgmult_inner | ||
| 256 | sll $nlo,4,$nlo | ||
| 257 | .align 32 | ||
| 258 | .Lgmult_inner: | ||
| 259 | ldx [$Htblo+$nlo],$Tlo | ||
| 260 | sll $remi,3,$remi | ||
| 261 | xor $Thi,$Zhi,$Zhi | ||
| 262 | ldx [$Htbl+$nlo],$Thi | ||
| 263 | srlx $Zlo,4,$Zlo | ||
| 264 | xor $rem,$Zhi,$Zhi | ||
| 265 | ldx [$rem_4bit+$remi],$rem | ||
| 266 | sllx $Zhi,60,$tmp | ||
| 267 | xor $Tlo,$Zlo,$Zlo | ||
| 268 | ldub [$Xi+$cnt],$nlo | ||
| 269 | srlx $Zhi,4,$Zhi | ||
| 270 | xor $Zlo,$tmp,$Zlo | ||
| 271 | xor $Thi,$Zhi,$Zhi | ||
| 272 | and $Zlo,0xf,$remi | ||
| 273 | |||
| 274 | ldx [$Htblo+$nhi],$Tlo | ||
| 275 | sll $remi,3,$remi | ||
| 276 | xor $rem,$Zhi,$Zhi | ||
| 277 | ldx [$Htbl+$nhi],$Thi | ||
| 278 | srlx $Zlo,4,$Zlo | ||
| 279 | ldx [$rem_4bit+$remi],$rem | ||
| 280 | sllx $Zhi,60,$tmp | ||
| 281 | srlx $Zhi,4,$Zhi | ||
| 282 | and $nlo,0xf0,$nhi | ||
| 283 | addcc $cnt,-1,$cnt | ||
| 284 | xor $Zlo,$tmp,$Zlo | ||
| 285 | and $nlo,0x0f,$nlo | ||
| 286 | xor $Tlo,$Zlo,$Zlo | ||
| 287 | sll $nlo,4,$nlo | ||
| 288 | blu .Lgmult_inner | ||
| 289 | and $Zlo,0xf,$remi | ||
| 290 | |||
| 291 | ldx [$Htblo+$nlo],$Tlo | ||
| 292 | sll $remi,3,$remi | ||
| 293 | xor $Thi,$Zhi,$Zhi | ||
| 294 | ldx [$Htbl+$nlo],$Thi | ||
| 295 | srlx $Zlo,4,$Zlo | ||
| 296 | xor $rem,$Zhi,$Zhi | ||
| 297 | ldx [$rem_4bit+$remi],$rem | ||
| 298 | sllx $Zhi,60,$tmp | ||
| 299 | xor $Tlo,$Zlo,$Zlo | ||
| 300 | srlx $Zhi,4,$Zhi | ||
| 301 | xor $Zlo,$tmp,$Zlo | ||
| 302 | xor $Thi,$Zhi,$Zhi | ||
| 303 | and $Zlo,0xf,$remi | ||
| 304 | |||
| 305 | ldx [$Htblo+$nhi],$Tlo | ||
| 306 | sll $remi,3,$remi | ||
| 307 | xor $rem,$Zhi,$Zhi | ||
| 308 | ldx [$Htbl+$nhi],$Thi | ||
| 309 | srlx $Zlo,4,$Zlo | ||
| 310 | ldx [$rem_4bit+$remi],$rem | ||
| 311 | sllx $Zhi,60,$tmp | ||
| 312 | xor $Tlo,$Zlo,$Zlo | ||
| 313 | srlx $Zhi,4,$Zhi | ||
| 314 | xor $Zlo,$tmp,$Zlo | ||
| 315 | xor $Thi,$Zhi,$Zhi | ||
| 316 | stx $Zlo,[$Xi+8] | ||
| 317 | xor $rem,$Zhi,$Zhi | ||
| 318 | stx $Zhi,[$Xi] | ||
| 319 | |||
| 320 | ret | ||
| 321 | restore | ||
| 322 | .type gcm_gmult_4bit,#function | ||
| 323 | .size gcm_gmult_4bit,(.-gcm_gmult_4bit) | ||
| 324 | .asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 325 | .align 4 | ||
| 326 | ___ | ||
| 327 | |||
| 328 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 329 | print $code; | ||
| 330 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl new file mode 100644 index 0000000000..6b09669d47 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-x86.pl | |||
| @@ -0,0 +1,1342 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # March, May, June 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two | ||
| 15 | # code paths: vanilla x86 and vanilla MMX. Former will be executed on | ||
| 16 | # 486 and Pentium, latter on all others. MMX GHASH features so called | ||
| 17 | # "528B" variant of "4-bit" method utilizing additional 256+16 bytes | ||
| 18 | # of per-key storage [+512 bytes shared table]. Performance results | ||
| 19 | # are for streamed GHASH subroutine and are expressed in cycles per | ||
| 20 | # processed byte, less is better: | ||
| 21 | # | ||
| 22 | # gcc 2.95.3(*) MMX assembler x86 assembler | ||
| 23 | # | ||
| 24 | # Pentium 105/111(**) - 50 | ||
| 25 | # PIII 68 /75 12.2 24 | ||
| 26 | # P4 125/125 17.8 84(***) | ||
| 27 | # Opteron 66 /70 10.1 30 | ||
| 28 | # Core2 54 /67 8.4 18 | ||
| 29 | # | ||
| 30 | # (*) gcc 3.4.x was observed to generate few percent slower code, | ||
| 31 | # which is one of reasons why 2.95.3 results were chosen, | ||
| 32 | # another reason is lack of 3.4.x results for older CPUs; | ||
| 33 | # comparison with MMX results is not completely fair, because C | ||
| 34 | # results are for vanilla "256B" implementation, while | ||
| 35 | # assembler results are for "528B";-) | ||
| 36 | # (**) second number is result for code compiled with -fPIC flag, | ||
| 37 | # which is actually more relevant, because assembler code is | ||
| 38 | # position-independent; | ||
| 39 | # (***) see comment in non-MMX routine for further details; | ||
| 40 | # | ||
| 41 | # To summarize, it's >2-5 times faster than gcc-generated code. To | ||
| 42 | # anchor it to something else SHA1 assembler processes one byte in | ||
| 43 | # 11-13 cycles on contemporary x86 cores. As for choice of MMX in | ||
| 44 | # particular, see comment at the end of the file... | ||
| 45 | |||
| 46 | # May 2010 | ||
| 47 | # | ||
| 48 | # Add PCLMULQDQ version performing at 2.10 cycles per processed byte. | ||
| 49 | # The question is how close is it to theoretical limit? The pclmulqdq | ||
| 50 | # instruction latency appears to be 14 cycles and there can't be more | ||
| 51 | # than 2 of them executing at any given time. This means that single | ||
| 52 | # Karatsuba multiplication would take 28 cycles *plus* few cycles for | ||
| 53 | # pre- and post-processing. Then multiplication has to be followed by | ||
| 54 | # modulo-reduction. Given that aggregated reduction method [see | ||
| 55 | # "Carry-less Multiplication and Its Usage for Computing the GCM Mode" | ||
| 56 | # white paper by Intel] allows you to perform reduction only once in | ||
| 57 | # a while we can assume that asymptotic performance can be estimated | ||
| 58 | # as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction | ||
| 59 | # and Naggr is the aggregation factor. | ||
| 60 | # | ||
| 61 | # Before we proceed to this implementation let's have closer look at | ||
| 62 | # the best-performing code suggested by Intel in their white paper. | ||
| 63 | # By tracing inter-register dependencies Tmod is estimated as ~19 | ||
| 64 | # cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per | ||
| 65 | # processed byte. As implied, this is quite optimistic estimate, | ||
| 66 | # because it does not account for Karatsuba pre- and post-processing, | ||
| 67 | # which for a single multiplication is ~5 cycles. Unfortunately Intel | ||
| 68 | # does not provide performance data for GHASH alone. But benchmarking | ||
| 69 | # AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt | ||
| 70 | # alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that | ||
| 71 | # the result accounts even for pre-computing of degrees of the hash | ||
| 72 | # key H, but its portion is negligible at 16KB buffer size. | ||
| 73 | # | ||
| 74 | # Moving on to the implementation in question. Tmod is estimated as | ||
| 75 | # ~13 cycles and Naggr is 2, giving asymptotic performance of ... | ||
| 76 | # 2.16. How is it possible that measured performance is better than | ||
| 77 | # optimistic theoretical estimate? There is one thing Intel failed | ||
| 78 | # to recognize. By serializing GHASH with CTR in same subroutine | ||
| 79 | # former's performance is really limited to above (Tmul + Tmod/Naggr) | ||
| 80 | # equation. But if GHASH procedure is detached, the modulo-reduction | ||
| 81 | # can be interleaved with Naggr-1 multiplications at instruction level | ||
| 82 | # and under ideal conditions even disappear from the equation. So that | ||
| 83 | # optimistic theoretical estimate for this implementation is ... | ||
| 84 | # 28/16=1.75, and not 2.16. Well, it's probably way too optimistic, | ||
| 85 | # at least for such small Naggr. I'd argue that (28+Tproc/Naggr), | ||
| 86 | # where Tproc is time required for Karatsuba pre- and post-processing, | ||
| 87 | # is more realistic estimate. In this case it gives ... 1.91 cycles. | ||
| 88 | # Or in other words, depending on how well we can interleave reduction | ||
| 89 | # and one of the two multiplications the performance should be betwen | ||
| 90 | # 1.91 and 2.16. As already mentioned, this implementation processes | ||
| 91 | # one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart | ||
| 92 | # - in 2.02. x86_64 performance is better, because larger register | ||
| 93 | # bank allows to interleave reduction and multiplication better. | ||
| 94 | # | ||
| 95 | # Does it make sense to increase Naggr? To start with it's virtually | ||
| 96 | # impossible in 32-bit mode, because of limited register bank | ||
| 97 | # capacity. Otherwise improvement has to be weighed agiainst slower | ||
| 98 | # setup, as well as code size and complexity increase. As even | ||
| 99 | # optimistic estimate doesn't promise 30% performance improvement, | ||
| 100 | # there are currently no plans to increase Naggr. | ||
| 101 | # | ||
| 102 | # Special thanks to David Woodhouse <dwmw2@infradead.org> for | ||
| 103 | # providing access to a Westmere-based system on behalf of Intel | ||
| 104 | # Open Source Technology Centre. | ||
| 105 | |||
| 106 | # January 2010 | ||
| 107 | # | ||
| 108 | # Tweaked to optimize transitions between integer and FP operations | ||
| 109 | # on same XMM register, PCLMULQDQ subroutine was measured to process | ||
| 110 | # one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere. | ||
| 111 | # The minor regression on Westmere is outweighed by ~15% improvement | ||
| 112 | # on Sandy Bridge. Strangely enough attempt to modify 64-bit code in | ||
| 113 | # similar manner resulted in almost 20% degradation on Sandy Bridge, | ||
| 114 | # where original 64-bit code processes one byte in 1.95 cycles. | ||
| 115 | |||
| 116 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 117 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 118 | require "x86asm.pl"; | ||
| 119 | |||
| 120 | &asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); | ||
| 121 | |||
| 122 | $sse2=0; | ||
| 123 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
| 124 | |||
| 125 | ($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx"); | ||
| 126 | $inp = "edi"; | ||
| 127 | $Htbl = "esi"; | ||
| 128 | |||
| 129 | $unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse | ||
| 130 | # than unrolled, which has to be weighted against | ||
| 131 | # 2.5x x86-specific code size reduction. | ||
| 132 | |||
| 133 | sub x86_loop { | ||
| 134 | my $off = shift; | ||
| 135 | my $rem = "eax"; | ||
| 136 | |||
| 137 | &mov ($Zhh,&DWP(4,$Htbl,$Zll)); | ||
| 138 | &mov ($Zhl,&DWP(0,$Htbl,$Zll)); | ||
| 139 | &mov ($Zlh,&DWP(12,$Htbl,$Zll)); | ||
| 140 | &mov ($Zll,&DWP(8,$Htbl,$Zll)); | ||
| 141 | &xor ($rem,$rem); # avoid partial register stalls on PIII | ||
| 142 | |||
| 143 | # shrd practically kills P4, 2.5x deterioration, but P4 has | ||
| 144 | # MMX code-path to execute. shrd runs tad faster [than twice | ||
| 145 | # the shifts, move's and or's] on pre-MMX Pentium (as well as | ||
| 146 | # PIII and Core2), *but* minimizes code size, spares register | ||
| 147 | # and thus allows to fold the loop... | ||
| 148 | if (!$unroll) { | ||
| 149 | my $cnt = $inp; | ||
| 150 | &mov ($cnt,15); | ||
| 151 | &jmp (&label("x86_loop")); | ||
| 152 | &set_label("x86_loop",16); | ||
| 153 | for($i=1;$i<=2;$i++) { | ||
| 154 | &mov (&LB($rem),&LB($Zll)); | ||
| 155 | &shrd ($Zll,$Zlh,4); | ||
| 156 | &and (&LB($rem),0xf); | ||
| 157 | &shrd ($Zlh,$Zhl,4); | ||
| 158 | &shrd ($Zhl,$Zhh,4); | ||
| 159 | &shr ($Zhh,4); | ||
| 160 | &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); | ||
| 161 | |||
| 162 | &mov (&LB($rem),&BP($off,"esp",$cnt)); | ||
| 163 | if ($i&1) { | ||
| 164 | &and (&LB($rem),0xf0); | ||
| 165 | } else { | ||
| 166 | &shl (&LB($rem),4); | ||
| 167 | } | ||
| 168 | |||
| 169 | &xor ($Zll,&DWP(8,$Htbl,$rem)); | ||
| 170 | &xor ($Zlh,&DWP(12,$Htbl,$rem)); | ||
| 171 | &xor ($Zhl,&DWP(0,$Htbl,$rem)); | ||
| 172 | &xor ($Zhh,&DWP(4,$Htbl,$rem)); | ||
| 173 | |||
| 174 | if ($i&1) { | ||
| 175 | &dec ($cnt); | ||
| 176 | &js (&label("x86_break")); | ||
| 177 | } else { | ||
| 178 | &jmp (&label("x86_loop")); | ||
| 179 | } | ||
| 180 | } | ||
| 181 | &set_label("x86_break",16); | ||
| 182 | } else { | ||
| 183 | for($i=1;$i<32;$i++) { | ||
| 184 | &comment($i); | ||
| 185 | &mov (&LB($rem),&LB($Zll)); | ||
| 186 | &shrd ($Zll,$Zlh,4); | ||
| 187 | &and (&LB($rem),0xf); | ||
| 188 | &shrd ($Zlh,$Zhl,4); | ||
| 189 | &shrd ($Zhl,$Zhh,4); | ||
| 190 | &shr ($Zhh,4); | ||
| 191 | &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); | ||
| 192 | |||
| 193 | if ($i&1) { | ||
| 194 | &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); | ||
| 195 | &and (&LB($rem),0xf0); | ||
| 196 | } else { | ||
| 197 | &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); | ||
| 198 | &shl (&LB($rem),4); | ||
| 199 | } | ||
| 200 | |||
| 201 | &xor ($Zll,&DWP(8,$Htbl,$rem)); | ||
| 202 | &xor ($Zlh,&DWP(12,$Htbl,$rem)); | ||
| 203 | &xor ($Zhl,&DWP(0,$Htbl,$rem)); | ||
| 204 | &xor ($Zhh,&DWP(4,$Htbl,$rem)); | ||
| 205 | } | ||
| 206 | } | ||
| 207 | &bswap ($Zll); | ||
| 208 | &bswap ($Zlh); | ||
| 209 | &bswap ($Zhl); | ||
| 210 | if (!$x86only) { | ||
| 211 | &bswap ($Zhh); | ||
| 212 | } else { | ||
| 213 | &mov ("eax",$Zhh); | ||
| 214 | &bswap ("eax"); | ||
| 215 | &mov ($Zhh,"eax"); | ||
| 216 | } | ||
| 217 | } | ||
| 218 | |||
| 219 | if ($unroll) { | ||
| 220 | &function_begin_B("_x86_gmult_4bit_inner"); | ||
| 221 | &x86_loop(4); | ||
| 222 | &ret (); | ||
| 223 | &function_end_B("_x86_gmult_4bit_inner"); | ||
| 224 | } | ||
| 225 | |||
| 226 | sub deposit_rem_4bit { | ||
| 227 | my $bias = shift; | ||
| 228 | |||
| 229 | &mov (&DWP($bias+0, "esp"),0x0000<<16); | ||
| 230 | &mov (&DWP($bias+4, "esp"),0x1C20<<16); | ||
| 231 | &mov (&DWP($bias+8, "esp"),0x3840<<16); | ||
| 232 | &mov (&DWP($bias+12,"esp"),0x2460<<16); | ||
| 233 | &mov (&DWP($bias+16,"esp"),0x7080<<16); | ||
| 234 | &mov (&DWP($bias+20,"esp"),0x6CA0<<16); | ||
| 235 | &mov (&DWP($bias+24,"esp"),0x48C0<<16); | ||
| 236 | &mov (&DWP($bias+28,"esp"),0x54E0<<16); | ||
| 237 | &mov (&DWP($bias+32,"esp"),0xE100<<16); | ||
| 238 | &mov (&DWP($bias+36,"esp"),0xFD20<<16); | ||
| 239 | &mov (&DWP($bias+40,"esp"),0xD940<<16); | ||
| 240 | &mov (&DWP($bias+44,"esp"),0xC560<<16); | ||
| 241 | &mov (&DWP($bias+48,"esp"),0x9180<<16); | ||
| 242 | &mov (&DWP($bias+52,"esp"),0x8DA0<<16); | ||
| 243 | &mov (&DWP($bias+56,"esp"),0xA9C0<<16); | ||
| 244 | &mov (&DWP($bias+60,"esp"),0xB5E0<<16); | ||
| 245 | } | ||
| 246 | |||
| 247 | $suffix = $x86only ? "" : "_x86"; | ||
| 248 | |||
| 249 | &function_begin("gcm_gmult_4bit".$suffix); | ||
| 250 | &stack_push(16+4+1); # +1 for stack alignment | ||
| 251 | &mov ($inp,&wparam(0)); # load Xi | ||
| 252 | &mov ($Htbl,&wparam(1)); # load Htable | ||
| 253 | |||
| 254 | &mov ($Zhh,&DWP(0,$inp)); # load Xi[16] | ||
| 255 | &mov ($Zhl,&DWP(4,$inp)); | ||
| 256 | &mov ($Zlh,&DWP(8,$inp)); | ||
| 257 | &mov ($Zll,&DWP(12,$inp)); | ||
| 258 | |||
| 259 | &deposit_rem_4bit(16); | ||
| 260 | |||
| 261 | &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack | ||
| 262 | &mov (&DWP(4,"esp"),$Zhl); | ||
| 263 | &mov (&DWP(8,"esp"),$Zlh); | ||
| 264 | &mov (&DWP(12,"esp"),$Zll); | ||
| 265 | &shr ($Zll,20); | ||
| 266 | &and ($Zll,0xf0); | ||
| 267 | |||
| 268 | if ($unroll) { | ||
| 269 | &call ("_x86_gmult_4bit_inner"); | ||
| 270 | } else { | ||
| 271 | &x86_loop(0); | ||
| 272 | &mov ($inp,&wparam(0)); | ||
| 273 | } | ||
| 274 | |||
| 275 | &mov (&DWP(12,$inp),$Zll); | ||
| 276 | &mov (&DWP(8,$inp),$Zlh); | ||
| 277 | &mov (&DWP(4,$inp),$Zhl); | ||
| 278 | &mov (&DWP(0,$inp),$Zhh); | ||
| 279 | &stack_pop(16+4+1); | ||
| 280 | &function_end("gcm_gmult_4bit".$suffix); | ||
| 281 | |||
| 282 | &function_begin("gcm_ghash_4bit".$suffix); | ||
| 283 | &stack_push(16+4+1); # +1 for 64-bit alignment | ||
| 284 | &mov ($Zll,&wparam(0)); # load Xi | ||
| 285 | &mov ($Htbl,&wparam(1)); # load Htable | ||
| 286 | &mov ($inp,&wparam(2)); # load in | ||
| 287 | &mov ("ecx",&wparam(3)); # load len | ||
| 288 | &add ("ecx",$inp); | ||
| 289 | &mov (&wparam(3),"ecx"); | ||
| 290 | |||
| 291 | &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] | ||
| 292 | &mov ($Zhl,&DWP(4,$Zll)); | ||
| 293 | &mov ($Zlh,&DWP(8,$Zll)); | ||
| 294 | &mov ($Zll,&DWP(12,$Zll)); | ||
| 295 | |||
| 296 | &deposit_rem_4bit(16); | ||
| 297 | |||
| 298 | &set_label("x86_outer_loop",16); | ||
| 299 | &xor ($Zll,&DWP(12,$inp)); # xor with input | ||
| 300 | &xor ($Zlh,&DWP(8,$inp)); | ||
| 301 | &xor ($Zhl,&DWP(4,$inp)); | ||
| 302 | &xor ($Zhh,&DWP(0,$inp)); | ||
| 303 | &mov (&DWP(12,"esp"),$Zll); # dump it on stack | ||
| 304 | &mov (&DWP(8,"esp"),$Zlh); | ||
| 305 | &mov (&DWP(4,"esp"),$Zhl); | ||
| 306 | &mov (&DWP(0,"esp"),$Zhh); | ||
| 307 | |||
| 308 | &shr ($Zll,20); | ||
| 309 | &and ($Zll,0xf0); | ||
| 310 | |||
| 311 | if ($unroll) { | ||
| 312 | &call ("_x86_gmult_4bit_inner"); | ||
| 313 | } else { | ||
| 314 | &x86_loop(0); | ||
| 315 | &mov ($inp,&wparam(2)); | ||
| 316 | } | ||
| 317 | &lea ($inp,&DWP(16,$inp)); | ||
| 318 | &cmp ($inp,&wparam(3)); | ||
| 319 | &mov (&wparam(2),$inp) if (!$unroll); | ||
| 320 | &jb (&label("x86_outer_loop")); | ||
| 321 | |||
| 322 | &mov ($inp,&wparam(0)); # load Xi | ||
| 323 | &mov (&DWP(12,$inp),$Zll); | ||
| 324 | &mov (&DWP(8,$inp),$Zlh); | ||
| 325 | &mov (&DWP(4,$inp),$Zhl); | ||
| 326 | &mov (&DWP(0,$inp),$Zhh); | ||
| 327 | &stack_pop(16+4+1); | ||
| 328 | &function_end("gcm_ghash_4bit".$suffix); | ||
| 329 | |||
| 330 | if (!$x86only) {{{ | ||
| 331 | |||
| 332 | &static_label("rem_4bit"); | ||
| 333 | |||
| 334 | if (!$sse2) {{ # pure-MMX "May" version... | ||
| 335 | |||
| 336 | $S=12; # shift factor for rem_4bit | ||
| 337 | |||
| 338 | &function_begin_B("_mmx_gmult_4bit_inner"); | ||
| 339 | # MMX version performs 3.5 times better on P4 (see comment in non-MMX | ||
| 340 | # routine for further details), 100% better on Opteron, ~70% better | ||
| 341 | # on Core2 and PIII... In other words effort is considered to be well | ||
| 342 | # spent... Since initial release the loop was unrolled in order to | ||
| 343 | # "liberate" register previously used as loop counter. Instead it's | ||
| 344 | # used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'. | ||
| 345 | # The path involves move of Z.lo from MMX to integer register, | ||
| 346 | # effective address calculation and finally merge of value to Z.hi. | ||
| 347 | # Reference to rem_4bit is scheduled so late that I had to >>4 | ||
| 348 | # rem_4bit elements. This resulted in 20-45% procent improvement | ||
| 349 | # on contemporary µ-archs. | ||
| 350 | { | ||
| 351 | my $cnt; | ||
| 352 | my $rem_4bit = "eax"; | ||
| 353 | my @rem = ($Zhh,$Zll); | ||
| 354 | my $nhi = $Zhl; | ||
| 355 | my $nlo = $Zlh; | ||
| 356 | |||
| 357 | my ($Zlo,$Zhi) = ("mm0","mm1"); | ||
| 358 | my $tmp = "mm2"; | ||
| 359 | |||
| 360 | &xor ($nlo,$nlo); # avoid partial register stalls on PIII | ||
| 361 | &mov ($nhi,$Zll); | ||
| 362 | &mov (&LB($nlo),&LB($nhi)); | ||
| 363 | &shl (&LB($nlo),4); | ||
| 364 | &and ($nhi,0xf0); | ||
| 365 | &movq ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
| 366 | &movq ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
| 367 | &movd ($rem[0],$Zlo); | ||
| 368 | |||
| 369 | for ($cnt=28;$cnt>=-2;$cnt--) { | ||
| 370 | my $odd = $cnt&1; | ||
| 371 | my $nix = $odd ? $nlo : $nhi; | ||
| 372 | |||
| 373 | &shl (&LB($nlo),4) if ($odd); | ||
| 374 | &psrlq ($Zlo,4); | ||
| 375 | &movq ($tmp,$Zhi); | ||
| 376 | &psrlq ($Zhi,4); | ||
| 377 | &pxor ($Zlo,&QWP(8,$Htbl,$nix)); | ||
| 378 | &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0); | ||
| 379 | &psllq ($tmp,60); | ||
| 380 | &and ($nhi,0xf0) if ($odd); | ||
| 381 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28); | ||
| 382 | &and ($rem[0],0xf); | ||
| 383 | &pxor ($Zhi,&QWP(0,$Htbl,$nix)); | ||
| 384 | &mov ($nhi,$nlo) if (!$odd && $cnt>=0); | ||
| 385 | &movd ($rem[1],$Zlo); | ||
| 386 | &pxor ($Zlo,$tmp); | ||
| 387 | |||
| 388 | push (@rem,shift(@rem)); # "rotate" registers | ||
| 389 | } | ||
| 390 | |||
| 391 | &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem] | ||
| 392 | |||
| 393 | &psrlq ($Zlo,32); # lower part of Zlo is already there | ||
| 394 | &movd ($Zhl,$Zhi); | ||
| 395 | &psrlq ($Zhi,32); | ||
| 396 | &movd ($Zlh,$Zlo); | ||
| 397 | &movd ($Zhh,$Zhi); | ||
| 398 | &shl ($inp,4); # compensate for rem_4bit[i] being >>4 | ||
| 399 | |||
| 400 | &bswap ($Zll); | ||
| 401 | &bswap ($Zhl); | ||
| 402 | &bswap ($Zlh); | ||
| 403 | &xor ($Zhh,$inp); | ||
| 404 | &bswap ($Zhh); | ||
| 405 | |||
| 406 | &ret (); | ||
| 407 | } | ||
| 408 | &function_end_B("_mmx_gmult_4bit_inner"); | ||
| 409 | |||
| 410 | &function_begin("gcm_gmult_4bit_mmx"); | ||
| 411 | &mov ($inp,&wparam(0)); # load Xi | ||
| 412 | &mov ($Htbl,&wparam(1)); # load Htable | ||
| 413 | |||
| 414 | &call (&label("pic_point")); | ||
| 415 | &set_label("pic_point"); | ||
| 416 | &blindpop("eax"); | ||
| 417 | &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); | ||
| 418 | |||
| 419 | &movz ($Zll,&BP(15,$inp)); | ||
| 420 | |||
| 421 | &call ("_mmx_gmult_4bit_inner"); | ||
| 422 | |||
| 423 | &mov ($inp,&wparam(0)); # load Xi | ||
| 424 | &emms (); | ||
| 425 | &mov (&DWP(12,$inp),$Zll); | ||
| 426 | &mov (&DWP(4,$inp),$Zhl); | ||
| 427 | &mov (&DWP(8,$inp),$Zlh); | ||
| 428 | &mov (&DWP(0,$inp),$Zhh); | ||
| 429 | &function_end("gcm_gmult_4bit_mmx"); | ||
| 430 | |||
| 431 | # Streamed version performs 20% better on P4, 7% on Opteron, | ||
| 432 | # 10% on Core2 and PIII... | ||
| 433 | &function_begin("gcm_ghash_4bit_mmx"); | ||
| 434 | &mov ($Zhh,&wparam(0)); # load Xi | ||
| 435 | &mov ($Htbl,&wparam(1)); # load Htable | ||
| 436 | &mov ($inp,&wparam(2)); # load in | ||
| 437 | &mov ($Zlh,&wparam(3)); # load len | ||
| 438 | |||
| 439 | &call (&label("pic_point")); | ||
| 440 | &set_label("pic_point"); | ||
| 441 | &blindpop("eax"); | ||
| 442 | &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); | ||
| 443 | |||
| 444 | &add ($Zlh,$inp); | ||
| 445 | &mov (&wparam(3),$Zlh); # len to point at the end of input | ||
| 446 | &stack_push(4+1); # +1 for stack alignment | ||
| 447 | |||
| 448 | &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] | ||
| 449 | &mov ($Zhl,&DWP(4,$Zhh)); | ||
| 450 | &mov ($Zlh,&DWP(8,$Zhh)); | ||
| 451 | &mov ($Zhh,&DWP(0,$Zhh)); | ||
| 452 | &jmp (&label("mmx_outer_loop")); | ||
| 453 | |||
| 454 | &set_label("mmx_outer_loop",16); | ||
| 455 | &xor ($Zll,&DWP(12,$inp)); | ||
| 456 | &xor ($Zhl,&DWP(4,$inp)); | ||
| 457 | &xor ($Zlh,&DWP(8,$inp)); | ||
| 458 | &xor ($Zhh,&DWP(0,$inp)); | ||
| 459 | &mov (&wparam(2),$inp); | ||
| 460 | &mov (&DWP(12,"esp"),$Zll); | ||
| 461 | &mov (&DWP(4,"esp"),$Zhl); | ||
| 462 | &mov (&DWP(8,"esp"),$Zlh); | ||
| 463 | &mov (&DWP(0,"esp"),$Zhh); | ||
| 464 | |||
| 465 | &mov ($inp,"esp"); | ||
| 466 | &shr ($Zll,24); | ||
| 467 | |||
| 468 | &call ("_mmx_gmult_4bit_inner"); | ||
| 469 | |||
| 470 | &mov ($inp,&wparam(2)); | ||
| 471 | &lea ($inp,&DWP(16,$inp)); | ||
| 472 | &cmp ($inp,&wparam(3)); | ||
| 473 | &jb (&label("mmx_outer_loop")); | ||
| 474 | |||
| 475 | &mov ($inp,&wparam(0)); # load Xi | ||
| 476 | &emms (); | ||
| 477 | &mov (&DWP(12,$inp),$Zll); | ||
| 478 | &mov (&DWP(4,$inp),$Zhl); | ||
| 479 | &mov (&DWP(8,$inp),$Zlh); | ||
| 480 | &mov (&DWP(0,$inp),$Zhh); | ||
| 481 | |||
| 482 | &stack_pop(4+1); | ||
| 483 | &function_end("gcm_ghash_4bit_mmx"); | ||
| 484 | |||
| 485 | }} else {{ # "June" MMX version... | ||
| 486 | # ... has slower "April" gcm_gmult_4bit_mmx with folded | ||
| 487 | # loop. This is done to conserve code size... | ||
| 488 | $S=16; # shift factor for rem_4bit | ||
| 489 | |||
| 490 | sub mmx_loop() { | ||
| 491 | # MMX version performs 2.8 times better on P4 (see comment in non-MMX | ||
| 492 | # routine for further details), 40% better on Opteron and Core2, 50% | ||
| 493 | # better on PIII... In other words effort is considered to be well | ||
| 494 | # spent... | ||
| 495 | my $inp = shift; | ||
| 496 | my $rem_4bit = shift; | ||
| 497 | my $cnt = $Zhh; | ||
| 498 | my $nhi = $Zhl; | ||
| 499 | my $nlo = $Zlh; | ||
| 500 | my $rem = $Zll; | ||
| 501 | |||
| 502 | my ($Zlo,$Zhi) = ("mm0","mm1"); | ||
| 503 | my $tmp = "mm2"; | ||
| 504 | |||
| 505 | &xor ($nlo,$nlo); # avoid partial register stalls on PIII | ||
| 506 | &mov ($nhi,$Zll); | ||
| 507 | &mov (&LB($nlo),&LB($nhi)); | ||
| 508 | &mov ($cnt,14); | ||
| 509 | &shl (&LB($nlo),4); | ||
| 510 | &and ($nhi,0xf0); | ||
| 511 | &movq ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
| 512 | &movq ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
| 513 | &movd ($rem,$Zlo); | ||
| 514 | &jmp (&label("mmx_loop")); | ||
| 515 | |||
| 516 | &set_label("mmx_loop",16); | ||
| 517 | &psrlq ($Zlo,4); | ||
| 518 | &and ($rem,0xf); | ||
| 519 | &movq ($tmp,$Zhi); | ||
| 520 | &psrlq ($Zhi,4); | ||
| 521 | &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); | ||
| 522 | &mov (&LB($nlo),&BP(0,$inp,$cnt)); | ||
| 523 | &psllq ($tmp,60); | ||
| 524 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
| 525 | &dec ($cnt); | ||
| 526 | &movd ($rem,$Zlo); | ||
| 527 | &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); | ||
| 528 | &mov ($nhi,$nlo); | ||
| 529 | &pxor ($Zlo,$tmp); | ||
| 530 | &js (&label("mmx_break")); | ||
| 531 | |||
| 532 | &shl (&LB($nlo),4); | ||
| 533 | &and ($rem,0xf); | ||
| 534 | &psrlq ($Zlo,4); | ||
| 535 | &and ($nhi,0xf0); | ||
| 536 | &movq ($tmp,$Zhi); | ||
| 537 | &psrlq ($Zhi,4); | ||
| 538 | &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
| 539 | &psllq ($tmp,60); | ||
| 540 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
| 541 | &movd ($rem,$Zlo); | ||
| 542 | &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
| 543 | &pxor ($Zlo,$tmp); | ||
| 544 | &jmp (&label("mmx_loop")); | ||
| 545 | |||
| 546 | &set_label("mmx_break",16); | ||
| 547 | &shl (&LB($nlo),4); | ||
| 548 | &and ($rem,0xf); | ||
| 549 | &psrlq ($Zlo,4); | ||
| 550 | &and ($nhi,0xf0); | ||
| 551 | &movq ($tmp,$Zhi); | ||
| 552 | &psrlq ($Zhi,4); | ||
| 553 | &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
| 554 | &psllq ($tmp,60); | ||
| 555 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
| 556 | &movd ($rem,$Zlo); | ||
| 557 | &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
| 558 | &pxor ($Zlo,$tmp); | ||
| 559 | |||
| 560 | &psrlq ($Zlo,4); | ||
| 561 | &and ($rem,0xf); | ||
| 562 | &movq ($tmp,$Zhi); | ||
| 563 | &psrlq ($Zhi,4); | ||
| 564 | &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); | ||
| 565 | &psllq ($tmp,60); | ||
| 566 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
| 567 | &movd ($rem,$Zlo); | ||
| 568 | &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); | ||
| 569 | &pxor ($Zlo,$tmp); | ||
| 570 | |||
| 571 | &psrlq ($Zlo,32); # lower part of Zlo is already there | ||
| 572 | &movd ($Zhl,$Zhi); | ||
| 573 | &psrlq ($Zhi,32); | ||
| 574 | &movd ($Zlh,$Zlo); | ||
| 575 | &movd ($Zhh,$Zhi); | ||
| 576 | |||
| 577 | &bswap ($Zll); | ||
| 578 | &bswap ($Zhl); | ||
| 579 | &bswap ($Zlh); | ||
| 580 | &bswap ($Zhh); | ||
| 581 | } | ||
| 582 | |||
| 583 | &function_begin("gcm_gmult_4bit_mmx"); | ||
| 584 | &mov ($inp,&wparam(0)); # load Xi | ||
| 585 | &mov ($Htbl,&wparam(1)); # load Htable | ||
| 586 | |||
| 587 | &call (&label("pic_point")); | ||
| 588 | &set_label("pic_point"); | ||
| 589 | &blindpop("eax"); | ||
| 590 | &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); | ||
| 591 | |||
| 592 | &movz ($Zll,&BP(15,$inp)); | ||
| 593 | |||
| 594 | &mmx_loop($inp,"eax"); | ||
| 595 | |||
| 596 | &emms (); | ||
| 597 | &mov (&DWP(12,$inp),$Zll); | ||
| 598 | &mov (&DWP(4,$inp),$Zhl); | ||
| 599 | &mov (&DWP(8,$inp),$Zlh); | ||
| 600 | &mov (&DWP(0,$inp),$Zhh); | ||
| 601 | &function_end("gcm_gmult_4bit_mmx"); | ||
| 602 | |||
| 603 | ###################################################################### | ||
| 604 | # Below subroutine is "528B" variant of "4-bit" GCM GHASH function | ||
| 605 | # (see gcm128.c for details). It provides further 20-40% performance | ||
| 606 | # improvement over above mentioned "May" version. | ||
| 607 | |||
| 608 | &static_label("rem_8bit"); | ||
| 609 | |||
| 610 | &function_begin("gcm_ghash_4bit_mmx"); | ||
| 611 | { my ($Zlo,$Zhi) = ("mm7","mm6"); | ||
| 612 | my $rem_8bit = "esi"; | ||
| 613 | my $Htbl = "ebx"; | ||
| 614 | |||
| 615 | # parameter block | ||
| 616 | &mov ("eax",&wparam(0)); # Xi | ||
| 617 | &mov ("ebx",&wparam(1)); # Htable | ||
| 618 | &mov ("ecx",&wparam(2)); # inp | ||
| 619 | &mov ("edx",&wparam(3)); # len | ||
| 620 | &mov ("ebp","esp"); # original %esp | ||
| 621 | &call (&label("pic_point")); | ||
| 622 | &set_label ("pic_point"); | ||
| 623 | &blindpop ($rem_8bit); | ||
| 624 | &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit)); | ||
| 625 | |||
| 626 | &sub ("esp",512+16+16); # allocate stack frame... | ||
| 627 | &and ("esp",-64); # ...and align it | ||
| 628 | &sub ("esp",16); # place for (u8)(H[]<<4) | ||
| 629 | |||
| 630 | &add ("edx","ecx"); # pointer to the end of input | ||
| 631 | &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi | ||
| 632 | &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len | ||
| 633 | &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp | ||
| 634 | |||
| 635 | { my @lo = ("mm0","mm1","mm2"); | ||
| 636 | my @hi = ("mm3","mm4","mm5"); | ||
| 637 | my @tmp = ("mm6","mm7"); | ||
| 638 | my $off1=0,$off2=0,$i; | ||
| 639 | |||
| 640 | &add ($Htbl,128); # optimize for size | ||
| 641 | &lea ("edi",&DWP(16+128,"esp")); | ||
| 642 | &lea ("ebp",&DWP(16+256+128,"esp")); | ||
| 643 | |||
| 644 | # decompose Htable (low and high parts are kept separately), | ||
| 645 | # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack... | ||
| 646 | for ($i=0;$i<18;$i++) { | ||
| 647 | |||
| 648 | &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16); | ||
| 649 | &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16); | ||
| 650 | &psllq ($tmp[1],60) if ($i>1); | ||
| 651 | &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16); | ||
| 652 | &por ($lo[2],$tmp[1]) if ($i>1); | ||
| 653 | &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17); | ||
| 654 | &psrlq ($lo[1],4) if ($i>0 && $i<17); | ||
| 655 | &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17); | ||
| 656 | &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17); | ||
| 657 | &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1); | ||
| 658 | &psrlq ($hi[1],4) if ($i>0 && $i<17); | ||
| 659 | &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1); | ||
| 660 | &shl ("edx",4) if ($i<16); | ||
| 661 | &mov (&BP($i,"esp"),&LB("edx")) if ($i<16); | ||
| 662 | |||
| 663 | unshift (@lo,pop(@lo)); # "rotate" registers | ||
| 664 | unshift (@hi,pop(@hi)); | ||
| 665 | unshift (@tmp,pop(@tmp)); | ||
| 666 | $off1 += 8 if ($i>0); | ||
| 667 | $off2 += 8 if ($i>1); | ||
| 668 | } | ||
| 669 | } | ||
| 670 | |||
| 671 | &movq ($Zhi,&QWP(0,"eax")); | ||
| 672 | &mov ("ebx",&DWP(8,"eax")); | ||
| 673 | &mov ("edx",&DWP(12,"eax")); # load Xi | ||
| 674 | |||
| 675 | &set_label("outer",16); | ||
| 676 | { my $nlo = "eax"; | ||
| 677 | my $dat = "edx"; | ||
| 678 | my @nhi = ("edi","ebp"); | ||
| 679 | my @rem = ("ebx","ecx"); | ||
| 680 | my @red = ("mm0","mm1","mm2"); | ||
| 681 | my $tmp = "mm3"; | ||
| 682 | |||
| 683 | &xor ($dat,&DWP(12,"ecx")); # merge input data | ||
| 684 | &xor ("ebx",&DWP(8,"ecx")); | ||
| 685 | &pxor ($Zhi,&QWP(0,"ecx")); | ||
| 686 | &lea ("ecx",&DWP(16,"ecx")); # inp+=16 | ||
| 687 | #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi | ||
| 688 | &mov (&DWP(528+8,"esp"),"ebx"); | ||
| 689 | &movq (&QWP(528+0,"esp"),$Zhi); | ||
| 690 | &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp | ||
| 691 | |||
| 692 | &xor ($nlo,$nlo); | ||
| 693 | &rol ($dat,8); | ||
| 694 | &mov (&LB($nlo),&LB($dat)); | ||
| 695 | &mov ($nhi[1],$nlo); | ||
| 696 | &and (&LB($nlo),0x0f); | ||
| 697 | &shr ($nhi[1],4); | ||
| 698 | &pxor ($red[0],$red[0]); | ||
| 699 | &rol ($dat,8); # next byte | ||
| 700 | &pxor ($red[1],$red[1]); | ||
| 701 | &pxor ($red[2],$red[2]); | ||
| 702 | |||
| 703 | # Just like in "May" verson modulo-schedule for critical path in | ||
| 704 | # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor' | ||
| 705 | # is scheduled so late that rem_8bit[] has to be shifted *right* | ||
| 706 | # by 16, which is why last argument to pinsrw is 2, which | ||
| 707 | # corresponds to <<32=<<48>>16... | ||
| 708 | for ($j=11,$i=0;$i<15;$i++) { | ||
| 709 | |||
| 710 | if ($i>0) { | ||
| 711 | &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] | ||
| 712 | &rol ($dat,8); # next byte | ||
| 713 | &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); | ||
| 714 | |||
| 715 | &pxor ($Zlo,$tmp); | ||
| 716 | &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); | ||
| 717 | &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) | ||
| 718 | } else { | ||
| 719 | &movq ($Zlo,&QWP(16,"esp",$nlo,8)); | ||
| 720 | &movq ($Zhi,&QWP(16+128,"esp",$nlo,8)); | ||
| 721 | } | ||
| 722 | |||
| 723 | &mov (&LB($nlo),&LB($dat)); | ||
| 724 | &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0); | ||
| 725 | |||
| 726 | &movd ($rem[0],$Zlo); | ||
| 727 | &movz ($rem[1],&LB($rem[1])) if ($i>0); | ||
| 728 | &psrlq ($Zlo,8); # Z>>=8 | ||
| 729 | |||
| 730 | &movq ($tmp,$Zhi); | ||
| 731 | &mov ($nhi[0],$nlo); | ||
| 732 | &psrlq ($Zhi,8); | ||
| 733 | |||
| 734 | &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4 | ||
| 735 | &and (&LB($nlo),0x0f); | ||
| 736 | &psllq ($tmp,56); | ||
| 737 | |||
| 738 | &pxor ($Zhi,$red[1]) if ($i>1); | ||
| 739 | &shr ($nhi[0],4); | ||
| 740 | &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0); | ||
| 741 | |||
| 742 | unshift (@red,pop(@red)); # "rotate" registers | ||
| 743 | unshift (@rem,pop(@rem)); | ||
| 744 | unshift (@nhi,pop(@nhi)); | ||
| 745 | } | ||
| 746 | |||
| 747 | &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] | ||
| 748 | &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); | ||
| 749 | &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) | ||
| 750 | |||
| 751 | &pxor ($Zlo,$tmp); | ||
| 752 | &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); | ||
| 753 | &movz ($rem[1],&LB($rem[1])); | ||
| 754 | |||
| 755 | &pxor ($red[2],$red[2]); # clear 2nd word | ||
| 756 | &psllq ($red[1],4); | ||
| 757 | |||
| 758 | &movd ($rem[0],$Zlo); | ||
| 759 | &psrlq ($Zlo,4); # Z>>=4 | ||
| 760 | |||
| 761 | &movq ($tmp,$Zhi); | ||
| 762 | &psrlq ($Zhi,4); | ||
| 763 | &shl ($rem[0],4); # rem<<4 | ||
| 764 | |||
| 765 | &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi] | ||
| 766 | &psllq ($tmp,60); | ||
| 767 | &movz ($rem[0],&LB($rem[0])); | ||
| 768 | |||
| 769 | &pxor ($Zlo,$tmp); | ||
| 770 | &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8)); | ||
| 771 | |||
| 772 | &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2); | ||
| 773 | &pxor ($Zhi,$red[1]); | ||
| 774 | |||
| 775 | &movd ($dat,$Zlo); | ||
| 776 | &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48 | ||
| 777 | |||
| 778 | &psllq ($red[0],12); # correct by <<16>>4 | ||
| 779 | &pxor ($Zhi,$red[0]); | ||
| 780 | &psrlq ($Zlo,32); | ||
| 781 | &pxor ($Zhi,$red[2]); | ||
| 782 | |||
| 783 | &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp | ||
| 784 | &movd ("ebx",$Zlo); | ||
| 785 | &movq ($tmp,$Zhi); # 01234567 | ||
| 786 | &psllw ($Zhi,8); # 1.3.5.7. | ||
| 787 | &psrlw ($tmp,8); # .0.2.4.6 | ||
| 788 | &por ($Zhi,$tmp); # 10325476 | ||
| 789 | &bswap ($dat); | ||
| 790 | &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 | ||
| 791 | &bswap ("ebx"); | ||
| 792 | |||
| 793 | &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? | ||
| 794 | &jne (&label("outer")); | ||
| 795 | } | ||
| 796 | |||
| 797 | &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi | ||
| 798 | &mov (&DWP(12,"eax"),"edx"); | ||
| 799 | &mov (&DWP(8,"eax"),"ebx"); | ||
| 800 | &movq (&QWP(0,"eax"),$Zhi); | ||
| 801 | |||
| 802 | &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp | ||
| 803 | &emms (); | ||
| 804 | } | ||
| 805 | &function_end("gcm_ghash_4bit_mmx"); | ||
| 806 | }} | ||
| 807 | |||
| 808 | if ($sse2) {{ | ||
| 809 | ###################################################################### | ||
| 810 | # PCLMULQDQ version. | ||
| 811 | |||
| 812 | $Xip="eax"; | ||
| 813 | $Htbl="edx"; | ||
| 814 | $const="ecx"; | ||
| 815 | $inp="esi"; | ||
| 816 | $len="ebx"; | ||
| 817 | |||
| 818 | ($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; | ||
| 819 | ($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); | ||
| 820 | ($Xn,$Xhn)=("xmm6","xmm7"); | ||
| 821 | |||
| 822 | &static_label("bswap"); | ||
| 823 | |||
| 824 | sub clmul64x64_T2 { # minimal "register" pressure | ||
| 825 | my ($Xhi,$Xi,$Hkey)=@_; | ||
| 826 | |||
| 827 | &movdqa ($Xhi,$Xi); # | ||
| 828 | &pshufd ($T1,$Xi,0b01001110); | ||
| 829 | &pshufd ($T2,$Hkey,0b01001110); | ||
| 830 | &pxor ($T1,$Xi); # | ||
| 831 | &pxor ($T2,$Hkey); | ||
| 832 | |||
| 833 | &pclmulqdq ($Xi,$Hkey,0x00); ####### | ||
| 834 | &pclmulqdq ($Xhi,$Hkey,0x11); ####### | ||
| 835 | &pclmulqdq ($T1,$T2,0x00); ####### | ||
| 836 | &xorps ($T1,$Xi); # | ||
| 837 | &xorps ($T1,$Xhi); # | ||
| 838 | |||
| 839 | &movdqa ($T2,$T1); # | ||
| 840 | &psrldq ($T1,8); | ||
| 841 | &pslldq ($T2,8); # | ||
| 842 | &pxor ($Xhi,$T1); | ||
| 843 | &pxor ($Xi,$T2); # | ||
| 844 | } | ||
| 845 | |||
| 846 | sub clmul64x64_T3 { | ||
| 847 | # Even though this subroutine offers visually better ILP, it | ||
| 848 | # was empirically found to be a tad slower than above version. | ||
| 849 | # At least in gcm_ghash_clmul context. But it's just as well, | ||
| 850 | # because loop modulo-scheduling is possible only thanks to | ||
| 851 | # minimized "register" pressure... | ||
| 852 | my ($Xhi,$Xi,$Hkey)=@_; | ||
| 853 | |||
| 854 | &movdqa ($T1,$Xi); # | ||
| 855 | &movdqa ($Xhi,$Xi); | ||
| 856 | &pclmulqdq ($Xi,$Hkey,0x00); ####### | ||
| 857 | &pclmulqdq ($Xhi,$Hkey,0x11); ####### | ||
| 858 | &pshufd ($T2,$T1,0b01001110); # | ||
| 859 | &pshufd ($T3,$Hkey,0b01001110); | ||
| 860 | &pxor ($T2,$T1); # | ||
| 861 | &pxor ($T3,$Hkey); | ||
| 862 | &pclmulqdq ($T2,$T3,0x00); ####### | ||
| 863 | &pxor ($T2,$Xi); # | ||
| 864 | &pxor ($T2,$Xhi); # | ||
| 865 | |||
| 866 | &movdqa ($T3,$T2); # | ||
| 867 | &psrldq ($T2,8); | ||
| 868 | &pslldq ($T3,8); # | ||
| 869 | &pxor ($Xhi,$T2); | ||
| 870 | &pxor ($Xi,$T3); # | ||
| 871 | } | ||
| 872 | |||
| 873 | if (1) { # Algorithm 9 with <<1 twist. | ||
| 874 | # Reduction is shorter and uses only two | ||
| 875 | # temporary registers, which makes it better | ||
| 876 | # candidate for interleaving with 64x64 | ||
| 877 | # multiplication. Pre-modulo-scheduled loop | ||
| 878 | # was found to be ~20% faster than Algorithm 5 | ||
| 879 | # below. Algorithm 9 was therefore chosen for | ||
| 880 | # further optimization... | ||
| 881 | |||
| 882 | sub reduction_alg9 { # 17/13 times faster than Intel version | ||
| 883 | my ($Xhi,$Xi) = @_; | ||
| 884 | |||
| 885 | # 1st phase | ||
| 886 | &movdqa ($T1,$Xi) # | ||
| 887 | &psllq ($Xi,1); | ||
| 888 | &pxor ($Xi,$T1); # | ||
| 889 | &psllq ($Xi,5); # | ||
| 890 | &pxor ($Xi,$T1); # | ||
| 891 | &psllq ($Xi,57); # | ||
| 892 | &movdqa ($T2,$Xi); # | ||
| 893 | &pslldq ($Xi,8); | ||
| 894 | &psrldq ($T2,8); # | ||
| 895 | &pxor ($Xi,$T1); | ||
| 896 | &pxor ($Xhi,$T2); # | ||
| 897 | |||
| 898 | # 2nd phase | ||
| 899 | &movdqa ($T2,$Xi); | ||
| 900 | &psrlq ($Xi,5); | ||
| 901 | &pxor ($Xi,$T2); # | ||
| 902 | &psrlq ($Xi,1); # | ||
| 903 | &pxor ($Xi,$T2); # | ||
| 904 | &pxor ($T2,$Xhi); | ||
| 905 | &psrlq ($Xi,1); # | ||
| 906 | &pxor ($Xi,$T2); # | ||
| 907 | } | ||
| 908 | |||
| 909 | &function_begin_B("gcm_init_clmul"); | ||
| 910 | &mov ($Htbl,&wparam(0)); | ||
| 911 | &mov ($Xip,&wparam(1)); | ||
| 912 | |||
| 913 | &call (&label("pic")); | ||
| 914 | &set_label("pic"); | ||
| 915 | &blindpop ($const); | ||
| 916 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
| 917 | |||
| 918 | &movdqu ($Hkey,&QWP(0,$Xip)); | ||
| 919 | &pshufd ($Hkey,$Hkey,0b01001110);# dword swap | ||
| 920 | |||
| 921 | # <<1 twist | ||
| 922 | &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword | ||
| 923 | &movdqa ($T1,$Hkey); | ||
| 924 | &psllq ($Hkey,1); | ||
| 925 | &pxor ($T3,$T3); # | ||
| 926 | &psrlq ($T1,63); | ||
| 927 | &pcmpgtd ($T3,$T2); # broadcast carry bit | ||
| 928 | &pslldq ($T1,8); | ||
| 929 | &por ($Hkey,$T1); # H<<=1 | ||
| 930 | |||
| 931 | # magic reduction | ||
| 932 | &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial | ||
| 933 | &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial | ||
| 934 | |||
| 935 | # calculate H^2 | ||
| 936 | &movdqa ($Xi,$Hkey); | ||
| 937 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); | ||
| 938 | &reduction_alg9 ($Xhi,$Xi); | ||
| 939 | |||
| 940 | &movdqu (&QWP(0,$Htbl),$Hkey); # save H | ||
| 941 | &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 | ||
| 942 | |||
| 943 | &ret (); | ||
| 944 | &function_end_B("gcm_init_clmul"); | ||
| 945 | |||
| 946 | &function_begin_B("gcm_gmult_clmul"); | ||
| 947 | &mov ($Xip,&wparam(0)); | ||
| 948 | &mov ($Htbl,&wparam(1)); | ||
| 949 | |||
| 950 | &call (&label("pic")); | ||
| 951 | &set_label("pic"); | ||
| 952 | &blindpop ($const); | ||
| 953 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
| 954 | |||
| 955 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
| 956 | &movdqa ($T3,&QWP(0,$const)); | ||
| 957 | &movups ($Hkey,&QWP(0,$Htbl)); | ||
| 958 | &pshufb ($Xi,$T3); | ||
| 959 | |||
| 960 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); | ||
| 961 | &reduction_alg9 ($Xhi,$Xi); | ||
| 962 | |||
| 963 | &pshufb ($Xi,$T3); | ||
| 964 | &movdqu (&QWP(0,$Xip),$Xi); | ||
| 965 | |||
| 966 | &ret (); | ||
| 967 | &function_end_B("gcm_gmult_clmul"); | ||
| 968 | |||
| 969 | &function_begin("gcm_ghash_clmul"); | ||
| 970 | &mov ($Xip,&wparam(0)); | ||
| 971 | &mov ($Htbl,&wparam(1)); | ||
| 972 | &mov ($inp,&wparam(2)); | ||
| 973 | &mov ($len,&wparam(3)); | ||
| 974 | |||
| 975 | &call (&label("pic")); | ||
| 976 | &set_label("pic"); | ||
| 977 | &blindpop ($const); | ||
| 978 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
| 979 | |||
| 980 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
| 981 | &movdqa ($T3,&QWP(0,$const)); | ||
| 982 | &movdqu ($Hkey,&QWP(0,$Htbl)); | ||
| 983 | &pshufb ($Xi,$T3); | ||
| 984 | |||
| 985 | &sub ($len,0x10); | ||
| 986 | &jz (&label("odd_tail")); | ||
| 987 | |||
| 988 | ####### | ||
| 989 | # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = | ||
| 990 | # [(H*Ii+1) + (H*Xi+1)] mod P = | ||
| 991 | # [(H*Ii+1) + H^2*(Ii+Xi)] mod P | ||
| 992 | # | ||
| 993 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
| 994 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
| 995 | &pshufb ($T1,$T3); | ||
| 996 | &pshufb ($Xn,$T3); | ||
| 997 | &pxor ($Xi,$T1); # Ii+Xi | ||
| 998 | |||
| 999 | &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
| 1000 | &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
| 1001 | |||
| 1002 | &lea ($inp,&DWP(32,$inp)); # i+=2 | ||
| 1003 | &sub ($len,0x20); | ||
| 1004 | &jbe (&label("even_tail")); | ||
| 1005 | |||
| 1006 | &set_label("mod_loop"); | ||
| 1007 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
| 1008 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
| 1009 | &movups ($Hkey,&QWP(0,$Htbl)); # load H | ||
| 1010 | |||
| 1011 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
| 1012 | &pxor ($Xhi,$Xhn); | ||
| 1013 | |||
| 1014 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
| 1015 | &pshufb ($T1,$T3); | ||
| 1016 | &pshufb ($Xn,$T3); | ||
| 1017 | |||
| 1018 | &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 | ||
| 1019 | &movdqa ($Xhn,$Xn); | ||
| 1020 | &pxor ($Xhi,$T1); # "Ii+Xi", consume early | ||
| 1021 | |||
| 1022 | &movdqa ($T1,$Xi) #&reduction_alg9($Xhi,$Xi); 1st phase | ||
| 1023 | &psllq ($Xi,1); | ||
| 1024 | &pxor ($Xi,$T1); # | ||
| 1025 | &psllq ($Xi,5); # | ||
| 1026 | &pxor ($Xi,$T1); # | ||
| 1027 | &pclmulqdq ($Xn,$Hkey,0x00); ####### | ||
| 1028 | &psllq ($Xi,57); # | ||
| 1029 | &movdqa ($T2,$Xi); # | ||
| 1030 | &pslldq ($Xi,8); | ||
| 1031 | &psrldq ($T2,8); # | ||
| 1032 | &pxor ($Xi,$T1); | ||
| 1033 | &pshufd ($T1,$T3,0b01001110); | ||
| 1034 | &pxor ($Xhi,$T2); # | ||
| 1035 | &pxor ($T1,$T3); | ||
| 1036 | &pshufd ($T3,$Hkey,0b01001110); | ||
| 1037 | &pxor ($T3,$Hkey); # | ||
| 1038 | |||
| 1039 | &pclmulqdq ($Xhn,$Hkey,0x11); ####### | ||
| 1040 | &movdqa ($T2,$Xi); # 2nd phase | ||
| 1041 | &psrlq ($Xi,5); | ||
| 1042 | &pxor ($Xi,$T2); # | ||
| 1043 | &psrlq ($Xi,1); # | ||
| 1044 | &pxor ($Xi,$T2); # | ||
| 1045 | &pxor ($T2,$Xhi); | ||
| 1046 | &psrlq ($Xi,1); # | ||
| 1047 | &pxor ($Xi,$T2); # | ||
| 1048 | |||
| 1049 | &pclmulqdq ($T1,$T3,0x00); ####### | ||
| 1050 | &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
| 1051 | &xorps ($T1,$Xn); # | ||
| 1052 | &xorps ($T1,$Xhn); # | ||
| 1053 | |||
| 1054 | &movdqa ($T3,$T1); # | ||
| 1055 | &psrldq ($T1,8); | ||
| 1056 | &pslldq ($T3,8); # | ||
| 1057 | &pxor ($Xhn,$T1); | ||
| 1058 | &pxor ($Xn,$T3); # | ||
| 1059 | &movdqa ($T3,&QWP(0,$const)); | ||
| 1060 | |||
| 1061 | &lea ($inp,&DWP(32,$inp)); | ||
| 1062 | &sub ($len,0x20); | ||
| 1063 | &ja (&label("mod_loop")); | ||
| 1064 | |||
| 1065 | &set_label("even_tail"); | ||
| 1066 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
| 1067 | |||
| 1068 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
| 1069 | &pxor ($Xhi,$Xhn); | ||
| 1070 | |||
| 1071 | &reduction_alg9 ($Xhi,$Xi); | ||
| 1072 | |||
| 1073 | &test ($len,$len); | ||
| 1074 | &jnz (&label("done")); | ||
| 1075 | |||
| 1076 | &movups ($Hkey,&QWP(0,$Htbl)); # load H | ||
| 1077 | &set_label("odd_tail"); | ||
| 1078 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
| 1079 | &pshufb ($T1,$T3); | ||
| 1080 | &pxor ($Xi,$T1); # Ii+Xi | ||
| 1081 | |||
| 1082 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) | ||
| 1083 | &reduction_alg9 ($Xhi,$Xi); | ||
| 1084 | |||
| 1085 | &set_label("done"); | ||
| 1086 | &pshufb ($Xi,$T3); | ||
| 1087 | &movdqu (&QWP(0,$Xip),$Xi); | ||
| 1088 | &function_end("gcm_ghash_clmul"); | ||
| 1089 | |||
| 1090 | } else { # Algorith 5. Kept for reference purposes. | ||
| 1091 | |||
| 1092 | sub reduction_alg5 { # 19/16 times faster than Intel version | ||
| 1093 | my ($Xhi,$Xi)=@_; | ||
| 1094 | |||
| 1095 | # <<1 | ||
| 1096 | &movdqa ($T1,$Xi); # | ||
| 1097 | &movdqa ($T2,$Xhi); | ||
| 1098 | &pslld ($Xi,1); | ||
| 1099 | &pslld ($Xhi,1); # | ||
| 1100 | &psrld ($T1,31); | ||
| 1101 | &psrld ($T2,31); # | ||
| 1102 | &movdqa ($T3,$T1); | ||
| 1103 | &pslldq ($T1,4); | ||
| 1104 | &psrldq ($T3,12); # | ||
| 1105 | &pslldq ($T2,4); | ||
| 1106 | &por ($Xhi,$T3); # | ||
| 1107 | &por ($Xi,$T1); | ||
| 1108 | &por ($Xhi,$T2); # | ||
| 1109 | |||
| 1110 | # 1st phase | ||
| 1111 | &movdqa ($T1,$Xi); | ||
| 1112 | &movdqa ($T2,$Xi); | ||
| 1113 | &movdqa ($T3,$Xi); # | ||
| 1114 | &pslld ($T1,31); | ||
| 1115 | &pslld ($T2,30); | ||
| 1116 | &pslld ($Xi,25); # | ||
| 1117 | &pxor ($T1,$T2); | ||
| 1118 | &pxor ($T1,$Xi); # | ||
| 1119 | &movdqa ($T2,$T1); # | ||
| 1120 | &pslldq ($T1,12); | ||
| 1121 | &psrldq ($T2,4); # | ||
| 1122 | &pxor ($T3,$T1); | ||
| 1123 | |||
| 1124 | # 2nd phase | ||
| 1125 | &pxor ($Xhi,$T3); # | ||
| 1126 | &movdqa ($Xi,$T3); | ||
| 1127 | &movdqa ($T1,$T3); | ||
| 1128 | &psrld ($Xi,1); # | ||
| 1129 | &psrld ($T1,2); | ||
| 1130 | &psrld ($T3,7); # | ||
| 1131 | &pxor ($Xi,$T1); | ||
| 1132 | &pxor ($Xhi,$T2); | ||
| 1133 | &pxor ($Xi,$T3); # | ||
| 1134 | &pxor ($Xi,$Xhi); # | ||
| 1135 | } | ||
| 1136 | |||
| 1137 | &function_begin_B("gcm_init_clmul"); | ||
| 1138 | &mov ($Htbl,&wparam(0)); | ||
| 1139 | &mov ($Xip,&wparam(1)); | ||
| 1140 | |||
| 1141 | &call (&label("pic")); | ||
| 1142 | &set_label("pic"); | ||
| 1143 | &blindpop ($const); | ||
| 1144 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
| 1145 | |||
| 1146 | &movdqu ($Hkey,&QWP(0,$Xip)); | ||
| 1147 | &pshufd ($Hkey,$Hkey,0b01001110);# dword swap | ||
| 1148 | |||
| 1149 | # calculate H^2 | ||
| 1150 | &movdqa ($Xi,$Hkey); | ||
| 1151 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); | ||
| 1152 | &reduction_alg5 ($Xhi,$Xi); | ||
| 1153 | |||
| 1154 | &movdqu (&QWP(0,$Htbl),$Hkey); # save H | ||
| 1155 | &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 | ||
| 1156 | |||
| 1157 | &ret (); | ||
| 1158 | &function_end_B("gcm_init_clmul"); | ||
| 1159 | |||
| 1160 | &function_begin_B("gcm_gmult_clmul"); | ||
| 1161 | &mov ($Xip,&wparam(0)); | ||
| 1162 | &mov ($Htbl,&wparam(1)); | ||
| 1163 | |||
| 1164 | &call (&label("pic")); | ||
| 1165 | &set_label("pic"); | ||
| 1166 | &blindpop ($const); | ||
| 1167 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
| 1168 | |||
| 1169 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
| 1170 | &movdqa ($Xn,&QWP(0,$const)); | ||
| 1171 | &movdqu ($Hkey,&QWP(0,$Htbl)); | ||
| 1172 | &pshufb ($Xi,$Xn); | ||
| 1173 | |||
| 1174 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); | ||
| 1175 | &reduction_alg5 ($Xhi,$Xi); | ||
| 1176 | |||
| 1177 | &pshufb ($Xi,$Xn); | ||
| 1178 | &movdqu (&QWP(0,$Xip),$Xi); | ||
| 1179 | |||
| 1180 | &ret (); | ||
| 1181 | &function_end_B("gcm_gmult_clmul"); | ||
| 1182 | |||
| 1183 | &function_begin("gcm_ghash_clmul"); | ||
| 1184 | &mov ($Xip,&wparam(0)); | ||
| 1185 | &mov ($Htbl,&wparam(1)); | ||
| 1186 | &mov ($inp,&wparam(2)); | ||
| 1187 | &mov ($len,&wparam(3)); | ||
| 1188 | |||
| 1189 | &call (&label("pic")); | ||
| 1190 | &set_label("pic"); | ||
| 1191 | &blindpop ($const); | ||
| 1192 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
| 1193 | |||
| 1194 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
| 1195 | &movdqa ($T3,&QWP(0,$const)); | ||
| 1196 | &movdqu ($Hkey,&QWP(0,$Htbl)); | ||
| 1197 | &pshufb ($Xi,$T3); | ||
| 1198 | |||
| 1199 | &sub ($len,0x10); | ||
| 1200 | &jz (&label("odd_tail")); | ||
| 1201 | |||
| 1202 | ####### | ||
| 1203 | # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = | ||
| 1204 | # [(H*Ii+1) + (H*Xi+1)] mod P = | ||
| 1205 | # [(H*Ii+1) + H^2*(Ii+Xi)] mod P | ||
| 1206 | # | ||
| 1207 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
| 1208 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
| 1209 | &pshufb ($T1,$T3); | ||
| 1210 | &pshufb ($Xn,$T3); | ||
| 1211 | &pxor ($Xi,$T1); # Ii+Xi | ||
| 1212 | |||
| 1213 | &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
| 1214 | &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
| 1215 | |||
| 1216 | &sub ($len,0x20); | ||
| 1217 | &lea ($inp,&DWP(32,$inp)); # i+=2 | ||
| 1218 | &jbe (&label("even_tail")); | ||
| 1219 | |||
| 1220 | &set_label("mod_loop"); | ||
| 1221 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
| 1222 | &movdqu ($Hkey,&QWP(0,$Htbl)); # load H | ||
| 1223 | |||
| 1224 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
| 1225 | &pxor ($Xhi,$Xhn); | ||
| 1226 | |||
| 1227 | &reduction_alg5 ($Xhi,$Xi); | ||
| 1228 | |||
| 1229 | ####### | ||
| 1230 | &movdqa ($T3,&QWP(0,$const)); | ||
| 1231 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
| 1232 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
| 1233 | &pshufb ($T1,$T3); | ||
| 1234 | &pshufb ($Xn,$T3); | ||
| 1235 | &pxor ($Xi,$T1); # Ii+Xi | ||
| 1236 | |||
| 1237 | &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
| 1238 | &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
| 1239 | |||
| 1240 | &sub ($len,0x20); | ||
| 1241 | &lea ($inp,&DWP(32,$inp)); | ||
| 1242 | &ja (&label("mod_loop")); | ||
| 1243 | |||
| 1244 | &set_label("even_tail"); | ||
| 1245 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
| 1246 | |||
| 1247 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
| 1248 | &pxor ($Xhi,$Xhn); | ||
| 1249 | |||
| 1250 | &reduction_alg5 ($Xhi,$Xi); | ||
| 1251 | |||
| 1252 | &movdqa ($T3,&QWP(0,$const)); | ||
| 1253 | &test ($len,$len); | ||
| 1254 | &jnz (&label("done")); | ||
| 1255 | |||
| 1256 | &movdqu ($Hkey,&QWP(0,$Htbl)); # load H | ||
| 1257 | &set_label("odd_tail"); | ||
| 1258 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
| 1259 | &pshufb ($T1,$T3); | ||
| 1260 | &pxor ($Xi,$T1); # Ii+Xi | ||
| 1261 | |||
| 1262 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) | ||
| 1263 | &reduction_alg5 ($Xhi,$Xi); | ||
| 1264 | |||
| 1265 | &movdqa ($T3,&QWP(0,$const)); | ||
| 1266 | &set_label("done"); | ||
| 1267 | &pshufb ($Xi,$T3); | ||
| 1268 | &movdqu (&QWP(0,$Xip),$Xi); | ||
| 1269 | &function_end("gcm_ghash_clmul"); | ||
| 1270 | |||
| 1271 | } | ||
| 1272 | |||
| 1273 | &set_label("bswap",64); | ||
| 1274 | &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | ||
| 1275 | &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial | ||
| 1276 | }} # $sse2 | ||
| 1277 | |||
| 1278 | &set_label("rem_4bit",64); | ||
| 1279 | &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); | ||
| 1280 | &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); | ||
| 1281 | &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); | ||
| 1282 | &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); | ||
| 1283 | &set_label("rem_8bit",64); | ||
| 1284 | &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E); | ||
| 1285 | &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); | ||
| 1286 | &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E); | ||
| 1287 | &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E); | ||
| 1288 | &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E); | ||
| 1289 | &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E); | ||
| 1290 | &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E); | ||
| 1291 | &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E); | ||
| 1292 | &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE); | ||
| 1293 | &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE); | ||
| 1294 | &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE); | ||
| 1295 | &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE); | ||
| 1296 | &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E); | ||
| 1297 | &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E); | ||
| 1298 | &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE); | ||
| 1299 | &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE); | ||
| 1300 | &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E); | ||
| 1301 | &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E); | ||
| 1302 | &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E); | ||
| 1303 | &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E); | ||
| 1304 | &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E); | ||
| 1305 | &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E); | ||
| 1306 | &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E); | ||
| 1307 | &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E); | ||
| 1308 | &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE); | ||
| 1309 | &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE); | ||
| 1310 | &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE); | ||
| 1311 | &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE); | ||
| 1312 | &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E); | ||
| 1313 | &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E); | ||
| 1314 | &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE); | ||
| 1315 | &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); | ||
| 1316 | }}} # !$x86only | ||
| 1317 | |||
| 1318 | &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 1319 | &asm_finish(); | ||
| 1320 | |||
| 1321 | # A question was risen about choice of vanilla MMX. Or rather why wasn't | ||
| 1322 | # SSE2 chosen instead? In addition to the fact that MMX runs on legacy | ||
| 1323 | # CPUs such as PIII, "4-bit" MMX version was observed to provide better | ||
| 1324 | # performance than *corresponding* SSE2 one even on contemporary CPUs. | ||
| 1325 | # SSE2 results were provided by Peter-Michael Hager. He maintains SSE2 | ||
| 1326 | # implementation featuring full range of lookup-table sizes, but with | ||
| 1327 | # per-invocation lookup table setup. Latter means that table size is | ||
| 1328 | # chosen depending on how much data is to be hashed in every given call, | ||
| 1329 | # more data - larger table. Best reported result for Core2 is ~4 cycles | ||
| 1330 | # per processed byte out of 64KB block. This number accounts even for | ||
| 1331 | # 64KB table setup overhead. As discussed in gcm128.c we choose to be | ||
| 1332 | # more conservative in respect to lookup table sizes, but how do the | ||
| 1333 | # results compare? Minimalistic "256B" MMX version delivers ~11 cycles | ||
| 1334 | # on same platform. As also discussed in gcm128.c, next in line "8-bit | ||
| 1335 | # Shoup's" or "4KB" method should deliver twice the performance of | ||
| 1336 | # "256B" one, in other words not worse than ~6 cycles per byte. It | ||
| 1337 | # should be also be noted that in SSE2 case improvement can be "super- | ||
| 1338 | # linear," i.e. more than twice, mostly because >>8 maps to single | ||
| 1339 | # instruction on SSE2 register. This is unlike "4-bit" case when >>4 | ||
| 1340 | # maps to same amount of instructions in both MMX and SSE2 cases. | ||
| 1341 | # Bottom line is that switch to SSE2 is considered to be justifiable | ||
| 1342 | # only in case we choose to implement "8-bit" method... | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl new file mode 100644 index 0000000000..a5ae180882 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl | |||
| @@ -0,0 +1,805 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # March, June 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that | ||
| 14 | # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH | ||
| 15 | # function features so called "528B" variant utilizing additional | ||
| 16 | # 256+16 bytes of per-key storage [+512 bytes shared table]. | ||
| 17 | # Performance results are for this streamed GHASH subroutine and are | ||
| 18 | # expressed in cycles per processed byte, less is better: | ||
| 19 | # | ||
| 20 | # gcc 3.4.x(*) assembler | ||
| 21 | # | ||
| 22 | # P4 28.6 14.0 +100% | ||
| 23 | # Opteron 19.3 7.7 +150% | ||
| 24 | # Core2 17.8 8.1(**) +120% | ||
| 25 | # | ||
| 26 | # (*) comparison is not completely fair, because C results are | ||
| 27 | # for vanilla "256B" implementation, while assembler results | ||
| 28 | # are for "528B";-) | ||
| 29 | # (**) it's mystery [to me] why Core2 result is not same as for | ||
| 30 | # Opteron; | ||
| 31 | |||
| 32 | # May 2010 | ||
| 33 | # | ||
| 34 | # Add PCLMULQDQ version performing at 2.02 cycles per processed byte. | ||
| 35 | # See ghash-x86.pl for background information and details about coding | ||
| 36 | # techniques. | ||
| 37 | # | ||
| 38 | # Special thanks to David Woodhouse <dwmw2@infradead.org> for | ||
| 39 | # providing access to a Westmere-based system on behalf of Intel | ||
| 40 | # Open Source Technology Centre. | ||
| 41 | |||
| 42 | $flavour = shift; | ||
| 43 | $output = shift; | ||
| 44 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 45 | |||
| 46 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 47 | |||
| 48 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 49 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 50 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 51 | die "can't locate x86_64-xlate.pl"; | ||
| 52 | |||
| 53 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 54 | |||
| 55 | # common register layout | ||
| 56 | $nlo="%rax"; | ||
| 57 | $nhi="%rbx"; | ||
| 58 | $Zlo="%r8"; | ||
| 59 | $Zhi="%r9"; | ||
| 60 | $tmp="%r10"; | ||
| 61 | $rem_4bit = "%r11"; | ||
| 62 | |||
| 63 | $Xi="%rdi"; | ||
| 64 | $Htbl="%rsi"; | ||
| 65 | |||
| 66 | # per-function register layout | ||
| 67 | $cnt="%rcx"; | ||
| 68 | $rem="%rdx"; | ||
| 69 | |||
| 70 | sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or | ||
| 71 | $r =~ s/%[er]([sd]i)/%\1l/ or | ||
| 72 | $r =~ s/%[er](bp)/%\1l/ or | ||
| 73 | $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } | ||
| 74 | |||
| 75 | sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm | ||
| 76 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; | ||
| 77 | my $arg = pop; | ||
| 78 | $arg = "\$$arg" if ($arg*1 eq $arg); | ||
| 79 | $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; | ||
| 80 | } | ||
| 81 | |||
| 82 | { my $N; | ||
| 83 | sub loop() { | ||
| 84 | my $inp = shift; | ||
| 85 | |||
| 86 | $N++; | ||
| 87 | $code.=<<___; | ||
| 88 | xor $nlo,$nlo | ||
| 89 | xor $nhi,$nhi | ||
| 90 | mov `&LB("$Zlo")`,`&LB("$nlo")` | ||
| 91 | mov `&LB("$Zlo")`,`&LB("$nhi")` | ||
| 92 | shl \$4,`&LB("$nlo")` | ||
| 93 | mov \$14,$cnt | ||
| 94 | mov 8($Htbl,$nlo),$Zlo | ||
| 95 | mov ($Htbl,$nlo),$Zhi | ||
| 96 | and \$0xf0,`&LB("$nhi")` | ||
| 97 | mov $Zlo,$rem | ||
| 98 | jmp .Loop$N | ||
| 99 | |||
| 100 | .align 16 | ||
| 101 | .Loop$N: | ||
| 102 | shr \$4,$Zlo | ||
| 103 | and \$0xf,$rem | ||
| 104 | mov $Zhi,$tmp | ||
| 105 | mov ($inp,$cnt),`&LB("$nlo")` | ||
| 106 | shr \$4,$Zhi | ||
| 107 | xor 8($Htbl,$nhi),$Zlo | ||
| 108 | shl \$60,$tmp | ||
| 109 | xor ($Htbl,$nhi),$Zhi | ||
| 110 | mov `&LB("$nlo")`,`&LB("$nhi")` | ||
| 111 | xor ($rem_4bit,$rem,8),$Zhi | ||
| 112 | mov $Zlo,$rem | ||
| 113 | shl \$4,`&LB("$nlo")` | ||
| 114 | xor $tmp,$Zlo | ||
| 115 | dec $cnt | ||
| 116 | js .Lbreak$N | ||
| 117 | |||
| 118 | shr \$4,$Zlo | ||
| 119 | and \$0xf,$rem | ||
| 120 | mov $Zhi,$tmp | ||
| 121 | shr \$4,$Zhi | ||
| 122 | xor 8($Htbl,$nlo),$Zlo | ||
| 123 | shl \$60,$tmp | ||
| 124 | xor ($Htbl,$nlo),$Zhi | ||
| 125 | and \$0xf0,`&LB("$nhi")` | ||
| 126 | xor ($rem_4bit,$rem,8),$Zhi | ||
| 127 | mov $Zlo,$rem | ||
| 128 | xor $tmp,$Zlo | ||
| 129 | jmp .Loop$N | ||
| 130 | |||
| 131 | .align 16 | ||
| 132 | .Lbreak$N: | ||
| 133 | shr \$4,$Zlo | ||
| 134 | and \$0xf,$rem | ||
| 135 | mov $Zhi,$tmp | ||
| 136 | shr \$4,$Zhi | ||
| 137 | xor 8($Htbl,$nlo),$Zlo | ||
| 138 | shl \$60,$tmp | ||
| 139 | xor ($Htbl,$nlo),$Zhi | ||
| 140 | and \$0xf0,`&LB("$nhi")` | ||
| 141 | xor ($rem_4bit,$rem,8),$Zhi | ||
| 142 | mov $Zlo,$rem | ||
| 143 | xor $tmp,$Zlo | ||
| 144 | |||
| 145 | shr \$4,$Zlo | ||
| 146 | and \$0xf,$rem | ||
| 147 | mov $Zhi,$tmp | ||
| 148 | shr \$4,$Zhi | ||
| 149 | xor 8($Htbl,$nhi),$Zlo | ||
| 150 | shl \$60,$tmp | ||
| 151 | xor ($Htbl,$nhi),$Zhi | ||
| 152 | xor $tmp,$Zlo | ||
| 153 | xor ($rem_4bit,$rem,8),$Zhi | ||
| 154 | |||
| 155 | bswap $Zlo | ||
| 156 | bswap $Zhi | ||
| 157 | ___ | ||
| 158 | }} | ||
| 159 | |||
| 160 | $code=<<___; | ||
| 161 | .text | ||
| 162 | |||
| 163 | .globl gcm_gmult_4bit | ||
| 164 | .type gcm_gmult_4bit,\@function,2 | ||
| 165 | .align 16 | ||
| 166 | gcm_gmult_4bit: | ||
| 167 | push %rbx | ||
| 168 | push %rbp # %rbp and %r12 are pushed exclusively in | ||
| 169 | push %r12 # order to reuse Win64 exception handler... | ||
| 170 | .Lgmult_prologue: | ||
| 171 | |||
| 172 | movzb 15($Xi),$Zlo | ||
| 173 | lea .Lrem_4bit(%rip),$rem_4bit | ||
| 174 | ___ | ||
| 175 | &loop ($Xi); | ||
| 176 | $code.=<<___; | ||
| 177 | mov $Zlo,8($Xi) | ||
| 178 | mov $Zhi,($Xi) | ||
| 179 | |||
| 180 | mov 16(%rsp),%rbx | ||
| 181 | lea 24(%rsp),%rsp | ||
| 182 | .Lgmult_epilogue: | ||
| 183 | ret | ||
| 184 | .size gcm_gmult_4bit,.-gcm_gmult_4bit | ||
| 185 | ___ | ||
| 186 | |||
| 187 | # per-function register layout | ||
| 188 | $inp="%rdx"; | ||
| 189 | $len="%rcx"; | ||
| 190 | $rem_8bit=$rem_4bit; | ||
| 191 | |||
| 192 | $code.=<<___; | ||
| 193 | .globl gcm_ghash_4bit | ||
| 194 | .type gcm_ghash_4bit,\@function,4 | ||
| 195 | .align 16 | ||
| 196 | gcm_ghash_4bit: | ||
| 197 | push %rbx | ||
| 198 | push %rbp | ||
| 199 | push %r12 | ||
| 200 | push %r13 | ||
| 201 | push %r14 | ||
| 202 | push %r15 | ||
| 203 | sub \$280,%rsp | ||
| 204 | .Lghash_prologue: | ||
| 205 | mov $inp,%r14 # reassign couple of args | ||
| 206 | mov $len,%r15 | ||
| 207 | ___ | ||
| 208 | { my $inp="%r14"; | ||
| 209 | my $dat="%edx"; | ||
| 210 | my $len="%r15"; | ||
| 211 | my @nhi=("%ebx","%ecx"); | ||
| 212 | my @rem=("%r12","%r13"); | ||
| 213 | my $Hshr4="%rbp"; | ||
| 214 | |||
| 215 | &sub ($Htbl,-128); # size optimization | ||
| 216 | &lea ($Hshr4,"16+128(%rsp)"); | ||
| 217 | { my @lo =($nlo,$nhi); | ||
| 218 | my @hi =($Zlo,$Zhi); | ||
| 219 | |||
| 220 | &xor ($dat,$dat); | ||
| 221 | for ($i=0,$j=-2;$i<18;$i++,$j++) { | ||
| 222 | &mov ("$j(%rsp)",&LB($dat)) if ($i>1); | ||
| 223 | &or ($lo[0],$tmp) if ($i>1); | ||
| 224 | &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17); | ||
| 225 | &shr ($lo[1],4) if ($i>0 && $i<17); | ||
| 226 | &mov ($tmp,$hi[1]) if ($i>0 && $i<17); | ||
| 227 | &shr ($hi[1],4) if ($i>0 && $i<17); | ||
| 228 | &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1); | ||
| 229 | &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16); | ||
| 230 | &shl (&LB($dat),4) if ($i>0 && $i<17); | ||
| 231 | &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1); | ||
| 232 | &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16); | ||
| 233 | &shl ($tmp,60) if ($i>0 && $i<17); | ||
| 234 | |||
| 235 | push (@lo,shift(@lo)); | ||
| 236 | push (@hi,shift(@hi)); | ||
| 237 | } | ||
| 238 | } | ||
| 239 | &add ($Htbl,-128); | ||
| 240 | &mov ($Zlo,"8($Xi)"); | ||
| 241 | &mov ($Zhi,"0($Xi)"); | ||
| 242 | &add ($len,$inp); # pointer to the end of data | ||
| 243 | &lea ($rem_8bit,".Lrem_8bit(%rip)"); | ||
| 244 | &jmp (".Louter_loop"); | ||
| 245 | |||
| 246 | $code.=".align 16\n.Louter_loop:\n"; | ||
| 247 | &xor ($Zhi,"($inp)"); | ||
| 248 | &mov ("%rdx","8($inp)"); | ||
| 249 | &lea ($inp,"16($inp)"); | ||
| 250 | &xor ("%rdx",$Zlo); | ||
| 251 | &mov ("($Xi)",$Zhi); | ||
| 252 | &mov ("8($Xi)","%rdx"); | ||
| 253 | &shr ("%rdx",32); | ||
| 254 | |||
| 255 | &xor ($nlo,$nlo); | ||
| 256 | &rol ($dat,8); | ||
| 257 | &mov (&LB($nlo),&LB($dat)); | ||
| 258 | &movz ($nhi[0],&LB($dat)); | ||
| 259 | &shl (&LB($nlo),4); | ||
| 260 | &shr ($nhi[0],4); | ||
| 261 | |||
| 262 | for ($j=11,$i=0;$i<15;$i++) { | ||
| 263 | &rol ($dat,8); | ||
| 264 | &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0); | ||
| 265 | &xor ($Zhi,"($Htbl,$nlo)") if ($i>0); | ||
| 266 | &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0); | ||
| 267 | &mov ($Zhi,"($Htbl,$nlo)") if ($i==0); | ||
| 268 | |||
| 269 | &mov (&LB($nlo),&LB($dat)); | ||
| 270 | &xor ($Zlo,$tmp) if ($i>0); | ||
| 271 | &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0); | ||
| 272 | |||
| 273 | &movz ($nhi[1],&LB($dat)); | ||
| 274 | &shl (&LB($nlo),4); | ||
| 275 | &movzb ($rem[0],"(%rsp,$nhi[0])"); | ||
| 276 | |||
| 277 | &shr ($nhi[1],4) if ($i<14); | ||
| 278 | &and ($nhi[1],0xf0) if ($i==14); | ||
| 279 | &shl ($rem[1],48) if ($i>0); | ||
| 280 | &xor ($rem[0],$Zlo); | ||
| 281 | |||
| 282 | &mov ($tmp,$Zhi); | ||
| 283 | &xor ($Zhi,$rem[1]) if ($i>0); | ||
| 284 | &shr ($Zlo,8); | ||
| 285 | |||
| 286 | &movz ($rem[0],&LB($rem[0])); | ||
| 287 | &mov ($dat,"$j($Xi)") if (--$j%4==0); | ||
| 288 | &shr ($Zhi,8); | ||
| 289 | |||
| 290 | &xor ($Zlo,"-128($Hshr4,$nhi[0],8)"); | ||
| 291 | &shl ($tmp,56); | ||
| 292 | &xor ($Zhi,"($Hshr4,$nhi[0],8)"); | ||
| 293 | |||
| 294 | unshift (@nhi,pop(@nhi)); # "rotate" registers | ||
| 295 | unshift (@rem,pop(@rem)); | ||
| 296 | } | ||
| 297 | &movzw ($rem[1],"($rem_8bit,$rem[1],2)"); | ||
| 298 | &xor ($Zlo,"8($Htbl,$nlo)"); | ||
| 299 | &xor ($Zhi,"($Htbl,$nlo)"); | ||
| 300 | |||
| 301 | &shl ($rem[1],48); | ||
| 302 | &xor ($Zlo,$tmp); | ||
| 303 | |||
| 304 | &xor ($Zhi,$rem[1]); | ||
| 305 | &movz ($rem[0],&LB($Zlo)); | ||
| 306 | &shr ($Zlo,4); | ||
| 307 | |||
| 308 | &mov ($tmp,$Zhi); | ||
| 309 | &shl (&LB($rem[0]),4); | ||
| 310 | &shr ($Zhi,4); | ||
| 311 | |||
| 312 | &xor ($Zlo,"8($Htbl,$nhi[0])"); | ||
| 313 | &movzw ($rem[0],"($rem_8bit,$rem[0],2)"); | ||
| 314 | &shl ($tmp,60); | ||
| 315 | |||
| 316 | &xor ($Zhi,"($Htbl,$nhi[0])"); | ||
| 317 | &xor ($Zlo,$tmp); | ||
| 318 | &shl ($rem[0],48); | ||
| 319 | |||
| 320 | &bswap ($Zlo); | ||
| 321 | &xor ($Zhi,$rem[0]); | ||
| 322 | |||
| 323 | &bswap ($Zhi); | ||
| 324 | &cmp ($inp,$len); | ||
| 325 | &jb (".Louter_loop"); | ||
| 326 | } | ||
| 327 | $code.=<<___; | ||
| 328 | mov $Zlo,8($Xi) | ||
| 329 | mov $Zhi,($Xi) | ||
| 330 | |||
| 331 | lea 280(%rsp),%rsi | ||
| 332 | mov 0(%rsi),%r15 | ||
| 333 | mov 8(%rsi),%r14 | ||
| 334 | mov 16(%rsi),%r13 | ||
| 335 | mov 24(%rsi),%r12 | ||
| 336 | mov 32(%rsi),%rbp | ||
| 337 | mov 40(%rsi),%rbx | ||
| 338 | lea 48(%rsi),%rsp | ||
| 339 | .Lghash_epilogue: | ||
| 340 | ret | ||
| 341 | .size gcm_ghash_4bit,.-gcm_ghash_4bit | ||
| 342 | ___ | ||
| 343 | |||
| 344 | ###################################################################### | ||
| 345 | # PCLMULQDQ version. | ||
| 346 | |||
| 347 | @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order | ||
| 348 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | ||
| 349 | |||
| 350 | ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; | ||
| 351 | ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); | ||
| 352 | |||
| 353 | sub clmul64x64_T2 { # minimal register pressure | ||
| 354 | my ($Xhi,$Xi,$Hkey,$modulo)=@_; | ||
| 355 | |||
| 356 | $code.=<<___ if (!defined($modulo)); | ||
| 357 | movdqa $Xi,$Xhi # | ||
| 358 | pshufd \$0b01001110,$Xi,$T1 | ||
| 359 | pshufd \$0b01001110,$Hkey,$T2 | ||
| 360 | pxor $Xi,$T1 # | ||
| 361 | pxor $Hkey,$T2 | ||
| 362 | ___ | ||
| 363 | $code.=<<___; | ||
| 364 | pclmulqdq \$0x00,$Hkey,$Xi ####### | ||
| 365 | pclmulqdq \$0x11,$Hkey,$Xhi ####### | ||
| 366 | pclmulqdq \$0x00,$T2,$T1 ####### | ||
| 367 | pxor $Xi,$T1 # | ||
| 368 | pxor $Xhi,$T1 # | ||
| 369 | |||
| 370 | movdqa $T1,$T2 # | ||
| 371 | psrldq \$8,$T1 | ||
| 372 | pslldq \$8,$T2 # | ||
| 373 | pxor $T1,$Xhi | ||
| 374 | pxor $T2,$Xi # | ||
| 375 | ___ | ||
| 376 | } | ||
| 377 | |||
| 378 | sub reduction_alg9 { # 17/13 times faster than Intel version | ||
| 379 | my ($Xhi,$Xi) = @_; | ||
| 380 | |||
| 381 | $code.=<<___; | ||
| 382 | # 1st phase | ||
| 383 | movdqa $Xi,$T1 # | ||
| 384 | psllq \$1,$Xi | ||
| 385 | pxor $T1,$Xi # | ||
| 386 | psllq \$5,$Xi # | ||
| 387 | pxor $T1,$Xi # | ||
| 388 | psllq \$57,$Xi # | ||
| 389 | movdqa $Xi,$T2 # | ||
| 390 | pslldq \$8,$Xi | ||
| 391 | psrldq \$8,$T2 # | ||
| 392 | pxor $T1,$Xi | ||
| 393 | pxor $T2,$Xhi # | ||
| 394 | |||
| 395 | # 2nd phase | ||
| 396 | movdqa $Xi,$T2 | ||
| 397 | psrlq \$5,$Xi | ||
| 398 | pxor $T2,$Xi # | ||
| 399 | psrlq \$1,$Xi # | ||
| 400 | pxor $T2,$Xi # | ||
| 401 | pxor $Xhi,$T2 | ||
| 402 | psrlq \$1,$Xi # | ||
| 403 | pxor $T2,$Xi # | ||
| 404 | ___ | ||
| 405 | } | ||
| 406 | |||
| 407 | { my ($Htbl,$Xip)=@_4args; | ||
| 408 | |||
| 409 | $code.=<<___; | ||
| 410 | .globl gcm_init_clmul | ||
| 411 | .type gcm_init_clmul,\@abi-omnipotent | ||
| 412 | .align 16 | ||
| 413 | gcm_init_clmul: | ||
| 414 | movdqu ($Xip),$Hkey | ||
| 415 | pshufd \$0b01001110,$Hkey,$Hkey # dword swap | ||
| 416 | |||
| 417 | # <<1 twist | ||
| 418 | pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword | ||
| 419 | movdqa $Hkey,$T1 | ||
| 420 | psllq \$1,$Hkey | ||
| 421 | pxor $T3,$T3 # | ||
| 422 | psrlq \$63,$T1 | ||
| 423 | pcmpgtd $T2,$T3 # broadcast carry bit | ||
| 424 | pslldq \$8,$T1 | ||
| 425 | por $T1,$Hkey # H<<=1 | ||
| 426 | |||
| 427 | # magic reduction | ||
| 428 | pand .L0x1c2_polynomial(%rip),$T3 | ||
| 429 | pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial | ||
| 430 | |||
| 431 | # calculate H^2 | ||
| 432 | movdqa $Hkey,$Xi | ||
| 433 | ___ | ||
| 434 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); | ||
| 435 | &reduction_alg9 ($Xhi,$Xi); | ||
| 436 | $code.=<<___; | ||
| 437 | movdqu $Hkey,($Htbl) # save H | ||
| 438 | movdqu $Xi,16($Htbl) # save H^2 | ||
| 439 | ret | ||
| 440 | .size gcm_init_clmul,.-gcm_init_clmul | ||
| 441 | ___ | ||
| 442 | } | ||
| 443 | |||
| 444 | { my ($Xip,$Htbl)=@_4args; | ||
| 445 | |||
| 446 | $code.=<<___; | ||
| 447 | .globl gcm_gmult_clmul | ||
| 448 | .type gcm_gmult_clmul,\@abi-omnipotent | ||
| 449 | .align 16 | ||
| 450 | gcm_gmult_clmul: | ||
| 451 | movdqu ($Xip),$Xi | ||
| 452 | movdqa .Lbswap_mask(%rip),$T3 | ||
| 453 | movdqu ($Htbl),$Hkey | ||
| 454 | pshufb $T3,$Xi | ||
| 455 | ___ | ||
| 456 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); | ||
| 457 | &reduction_alg9 ($Xhi,$Xi); | ||
| 458 | $code.=<<___; | ||
| 459 | pshufb $T3,$Xi | ||
| 460 | movdqu $Xi,($Xip) | ||
| 461 | ret | ||
| 462 | .size gcm_gmult_clmul,.-gcm_gmult_clmul | ||
| 463 | ___ | ||
| 464 | } | ||
| 465 | |||
| 466 | { my ($Xip,$Htbl,$inp,$len)=@_4args; | ||
| 467 | my $Xn="%xmm6"; | ||
| 468 | my $Xhn="%xmm7"; | ||
| 469 | my $Hkey2="%xmm8"; | ||
| 470 | my $T1n="%xmm9"; | ||
| 471 | my $T2n="%xmm10"; | ||
| 472 | |||
| 473 | $code.=<<___; | ||
| 474 | .globl gcm_ghash_clmul | ||
| 475 | .type gcm_ghash_clmul,\@abi-omnipotent | ||
| 476 | .align 16 | ||
| 477 | gcm_ghash_clmul: | ||
| 478 | ___ | ||
| 479 | $code.=<<___ if ($win64); | ||
| 480 | .LSEH_begin_gcm_ghash_clmul: | ||
| 481 | # I can't trust assembler to use specific encoding:-( | ||
| 482 | .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp | ||
| 483 | .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) | ||
| 484 | .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) | ||
| 485 | .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp) | ||
| 486 | .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp) | ||
| 487 | .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp) | ||
| 488 | ___ | ||
| 489 | $code.=<<___; | ||
| 490 | movdqa .Lbswap_mask(%rip),$T3 | ||
| 491 | |||
| 492 | movdqu ($Xip),$Xi | ||
| 493 | movdqu ($Htbl),$Hkey | ||
| 494 | pshufb $T3,$Xi | ||
| 495 | |||
| 496 | sub \$0x10,$len | ||
| 497 | jz .Lodd_tail | ||
| 498 | |||
| 499 | movdqu 16($Htbl),$Hkey2 | ||
| 500 | ####### | ||
| 501 | # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = | ||
| 502 | # [(H*Ii+1) + (H*Xi+1)] mod P = | ||
| 503 | # [(H*Ii+1) + H^2*(Ii+Xi)] mod P | ||
| 504 | # | ||
| 505 | movdqu ($inp),$T1 # Ii | ||
| 506 | movdqu 16($inp),$Xn # Ii+1 | ||
| 507 | pshufb $T3,$T1 | ||
| 508 | pshufb $T3,$Xn | ||
| 509 | pxor $T1,$Xi # Ii+Xi | ||
| 510 | ___ | ||
| 511 | &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
| 512 | $code.=<<___; | ||
| 513 | movdqa $Xi,$Xhi # | ||
| 514 | pshufd \$0b01001110,$Xi,$T1 | ||
| 515 | pshufd \$0b01001110,$Hkey2,$T2 | ||
| 516 | pxor $Xi,$T1 # | ||
| 517 | pxor $Hkey2,$T2 | ||
| 518 | |||
| 519 | lea 32($inp),$inp # i+=2 | ||
| 520 | sub \$0x20,$len | ||
| 521 | jbe .Leven_tail | ||
| 522 | |||
| 523 | .Lmod_loop: | ||
| 524 | ___ | ||
| 525 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) | ||
| 526 | $code.=<<___; | ||
| 527 | movdqu ($inp),$T1 # Ii | ||
| 528 | pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) | ||
| 529 | pxor $Xhn,$Xhi | ||
| 530 | |||
| 531 | movdqu 16($inp),$Xn # Ii+1 | ||
| 532 | pshufb $T3,$T1 | ||
| 533 | pshufb $T3,$Xn | ||
| 534 | |||
| 535 | movdqa $Xn,$Xhn # | ||
| 536 | pshufd \$0b01001110,$Xn,$T1n | ||
| 537 | pshufd \$0b01001110,$Hkey,$T2n | ||
| 538 | pxor $Xn,$T1n # | ||
| 539 | pxor $Hkey,$T2n | ||
| 540 | pxor $T1,$Xhi # "Ii+Xi", consume early | ||
| 541 | |||
| 542 | movdqa $Xi,$T1 # 1st phase | ||
| 543 | psllq \$1,$Xi | ||
| 544 | pxor $T1,$Xi # | ||
| 545 | psllq \$5,$Xi # | ||
| 546 | pxor $T1,$Xi # | ||
| 547 | pclmulqdq \$0x00,$Hkey,$Xn ####### | ||
| 548 | psllq \$57,$Xi # | ||
| 549 | movdqa $Xi,$T2 # | ||
| 550 | pslldq \$8,$Xi | ||
| 551 | psrldq \$8,$T2 # | ||
| 552 | pxor $T1,$Xi | ||
| 553 | pxor $T2,$Xhi # | ||
| 554 | |||
| 555 | pclmulqdq \$0x11,$Hkey,$Xhn ####### | ||
| 556 | movdqa $Xi,$T2 # 2nd phase | ||
| 557 | psrlq \$5,$Xi | ||
| 558 | pxor $T2,$Xi # | ||
| 559 | psrlq \$1,$Xi # | ||
| 560 | pxor $T2,$Xi # | ||
| 561 | pxor $Xhi,$T2 | ||
| 562 | psrlq \$1,$Xi # | ||
| 563 | pxor $T2,$Xi # | ||
| 564 | |||
| 565 | pclmulqdq \$0x00,$T2n,$T1n ####### | ||
| 566 | movdqa $Xi,$Xhi # | ||
| 567 | pshufd \$0b01001110,$Xi,$T1 | ||
| 568 | pshufd \$0b01001110,$Hkey2,$T2 | ||
| 569 | pxor $Xi,$T1 # | ||
| 570 | pxor $Hkey2,$T2 | ||
| 571 | |||
| 572 | pxor $Xn,$T1n # | ||
| 573 | pxor $Xhn,$T1n # | ||
| 574 | movdqa $T1n,$T2n # | ||
| 575 | psrldq \$8,$T1n | ||
| 576 | pslldq \$8,$T2n # | ||
| 577 | pxor $T1n,$Xhn | ||
| 578 | pxor $T2n,$Xn # | ||
| 579 | |||
| 580 | lea 32($inp),$inp | ||
| 581 | sub \$0x20,$len | ||
| 582 | ja .Lmod_loop | ||
| 583 | |||
| 584 | .Leven_tail: | ||
| 585 | ___ | ||
| 586 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) | ||
| 587 | $code.=<<___; | ||
| 588 | pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) | ||
| 589 | pxor $Xhn,$Xhi | ||
| 590 | ___ | ||
| 591 | &reduction_alg9 ($Xhi,$Xi); | ||
| 592 | $code.=<<___; | ||
| 593 | test $len,$len | ||
| 594 | jnz .Ldone | ||
| 595 | |||
| 596 | .Lodd_tail: | ||
| 597 | movdqu ($inp),$T1 # Ii | ||
| 598 | pshufb $T3,$T1 | ||
| 599 | pxor $T1,$Xi # Ii+Xi | ||
| 600 | ___ | ||
| 601 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) | ||
| 602 | &reduction_alg9 ($Xhi,$Xi); | ||
| 603 | $code.=<<___; | ||
| 604 | .Ldone: | ||
| 605 | pshufb $T3,$Xi | ||
| 606 | movdqu $Xi,($Xip) | ||
| 607 | ___ | ||
| 608 | $code.=<<___ if ($win64); | ||
| 609 | movaps (%rsp),%xmm6 | ||
| 610 | movaps 0x10(%rsp),%xmm7 | ||
| 611 | movaps 0x20(%rsp),%xmm8 | ||
| 612 | movaps 0x30(%rsp),%xmm9 | ||
| 613 | movaps 0x40(%rsp),%xmm10 | ||
| 614 | add \$0x58,%rsp | ||
| 615 | ___ | ||
| 616 | $code.=<<___; | ||
| 617 | ret | ||
| 618 | .LSEH_end_gcm_ghash_clmul: | ||
| 619 | .size gcm_ghash_clmul,.-gcm_ghash_clmul | ||
| 620 | ___ | ||
| 621 | } | ||
| 622 | |||
| 623 | $code.=<<___; | ||
| 624 | .align 64 | ||
| 625 | .Lbswap_mask: | ||
| 626 | .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 | ||
| 627 | .L0x1c2_polynomial: | ||
| 628 | .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 | ||
| 629 | .align 64 | ||
| 630 | .type .Lrem_4bit,\@object | ||
| 631 | .Lrem_4bit: | ||
| 632 | .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16` | ||
| 633 | .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16` | ||
| 634 | .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16` | ||
| 635 | .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16` | ||
| 636 | .type .Lrem_8bit,\@object | ||
| 637 | .Lrem_8bit: | ||
| 638 | .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E | ||
| 639 | .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E | ||
| 640 | .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E | ||
| 641 | .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E | ||
| 642 | .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E | ||
| 643 | .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E | ||
| 644 | .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E | ||
| 645 | .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E | ||
| 646 | .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE | ||
| 647 | .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE | ||
| 648 | .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE | ||
| 649 | .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE | ||
| 650 | .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E | ||
| 651 | .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E | ||
| 652 | .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE | ||
| 653 | .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE | ||
| 654 | .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E | ||
| 655 | .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E | ||
| 656 | .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E | ||
| 657 | .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E | ||
| 658 | .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E | ||
| 659 | .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E | ||
| 660 | .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E | ||
| 661 | .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E | ||
| 662 | .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE | ||
| 663 | .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE | ||
| 664 | .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE | ||
| 665 | .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE | ||
| 666 | .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E | ||
| 667 | .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E | ||
| 668 | .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE | ||
| 669 | .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE | ||
| 670 | |||
| 671 | .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 672 | .align 64 | ||
| 673 | ___ | ||
| 674 | |||
| 675 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 676 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 677 | if ($win64) { | ||
| 678 | $rec="%rcx"; | ||
| 679 | $frame="%rdx"; | ||
| 680 | $context="%r8"; | ||
| 681 | $disp="%r9"; | ||
| 682 | |||
| 683 | $code.=<<___; | ||
| 684 | .extern __imp_RtlVirtualUnwind | ||
| 685 | .type se_handler,\@abi-omnipotent | ||
| 686 | .align 16 | ||
| 687 | se_handler: | ||
| 688 | push %rsi | ||
| 689 | push %rdi | ||
| 690 | push %rbx | ||
| 691 | push %rbp | ||
| 692 | push %r12 | ||
| 693 | push %r13 | ||
| 694 | push %r14 | ||
| 695 | push %r15 | ||
| 696 | pushfq | ||
| 697 | sub \$64,%rsp | ||
| 698 | |||
| 699 | mov 120($context),%rax # pull context->Rax | ||
| 700 | mov 248($context),%rbx # pull context->Rip | ||
| 701 | |||
| 702 | mov 8($disp),%rsi # disp->ImageBase | ||
| 703 | mov 56($disp),%r11 # disp->HandlerData | ||
| 704 | |||
| 705 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 706 | lea (%rsi,%r10),%r10 # prologue label | ||
| 707 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 708 | jb .Lin_prologue | ||
| 709 | |||
| 710 | mov 152($context),%rax # pull context->Rsp | ||
| 711 | |||
| 712 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 713 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 714 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 715 | jae .Lin_prologue | ||
| 716 | |||
| 717 | lea 24(%rax),%rax # adjust "rsp" | ||
| 718 | |||
| 719 | mov -8(%rax),%rbx | ||
| 720 | mov -16(%rax),%rbp | ||
| 721 | mov -24(%rax),%r12 | ||
| 722 | mov %rbx,144($context) # restore context->Rbx | ||
| 723 | mov %rbp,160($context) # restore context->Rbp | ||
| 724 | mov %r12,216($context) # restore context->R12 | ||
| 725 | |||
| 726 | .Lin_prologue: | ||
| 727 | mov 8(%rax),%rdi | ||
| 728 | mov 16(%rax),%rsi | ||
| 729 | mov %rax,152($context) # restore context->Rsp | ||
| 730 | mov %rsi,168($context) # restore context->Rsi | ||
| 731 | mov %rdi,176($context) # restore context->Rdi | ||
| 732 | |||
| 733 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 734 | mov $context,%rsi # context | ||
| 735 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
| 736 | .long 0xa548f3fc # cld; rep movsq | ||
| 737 | |||
| 738 | mov $disp,%rsi | ||
| 739 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 740 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 741 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 742 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 743 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 744 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 745 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 746 | mov %r10,32(%rsp) # arg5 | ||
| 747 | mov %r11,40(%rsp) # arg6 | ||
| 748 | mov %r12,48(%rsp) # arg7 | ||
| 749 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 750 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 751 | |||
| 752 | mov \$1,%eax # ExceptionContinueSearch | ||
| 753 | add \$64,%rsp | ||
| 754 | popfq | ||
| 755 | pop %r15 | ||
| 756 | pop %r14 | ||
| 757 | pop %r13 | ||
| 758 | pop %r12 | ||
| 759 | pop %rbp | ||
| 760 | pop %rbx | ||
| 761 | pop %rdi | ||
| 762 | pop %rsi | ||
| 763 | ret | ||
| 764 | .size se_handler,.-se_handler | ||
| 765 | |||
| 766 | .section .pdata | ||
| 767 | .align 4 | ||
| 768 | .rva .LSEH_begin_gcm_gmult_4bit | ||
| 769 | .rva .LSEH_end_gcm_gmult_4bit | ||
| 770 | .rva .LSEH_info_gcm_gmult_4bit | ||
| 771 | |||
| 772 | .rva .LSEH_begin_gcm_ghash_4bit | ||
| 773 | .rva .LSEH_end_gcm_ghash_4bit | ||
| 774 | .rva .LSEH_info_gcm_ghash_4bit | ||
| 775 | |||
| 776 | .rva .LSEH_begin_gcm_ghash_clmul | ||
| 777 | .rva .LSEH_end_gcm_ghash_clmul | ||
| 778 | .rva .LSEH_info_gcm_ghash_clmul | ||
| 779 | |||
| 780 | .section .xdata | ||
| 781 | .align 8 | ||
| 782 | .LSEH_info_gcm_gmult_4bit: | ||
| 783 | .byte 9,0,0,0 | ||
| 784 | .rva se_handler | ||
| 785 | .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData | ||
| 786 | .LSEH_info_gcm_ghash_4bit: | ||
| 787 | .byte 9,0,0,0 | ||
| 788 | .rva se_handler | ||
| 789 | .rva .Lghash_prologue,.Lghash_epilogue # HandlerData | ||
| 790 | .LSEH_info_gcm_ghash_clmul: | ||
| 791 | .byte 0x01,0x1f,0x0b,0x00 | ||
| 792 | .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 | ||
| 793 | .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 | ||
| 794 | .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 | ||
| 795 | .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 | ||
| 796 | .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 | ||
| 797 | .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58 | ||
| 798 | ___ | ||
| 799 | } | ||
| 800 | |||
| 801 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 802 | |||
| 803 | print $code; | ||
| 804 | |||
| 805 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/cbc128.c b/src/lib/libcrypto/modes/cbc128.c index 8f8bd563b9..3d3782cbe1 100644 --- a/src/lib/libcrypto/modes/cbc128.c +++ b/src/lib/libcrypto/modes/cbc128.c | |||
| @@ -48,7 +48,8 @@ | |||
| 48 | * | 48 | * |
| 49 | */ | 49 | */ |
| 50 | 50 | ||
| 51 | #include "modes.h" | 51 | #include <openssl/crypto.h> |
| 52 | #include "modes_lcl.h" | ||
| 52 | #include <string.h> | 53 | #include <string.h> |
| 53 | 54 | ||
| 54 | #ifndef MODES_DEBUG | 55 | #ifndef MODES_DEBUG |
| @@ -58,12 +59,7 @@ | |||
| 58 | #endif | 59 | #endif |
| 59 | #include <assert.h> | 60 | #include <assert.h> |
| 60 | 61 | ||
| 61 | #define STRICT_ALIGNMENT 1 | 62 | #ifndef STRICT_ALIGNMENT |
| 62 | #if defined(__i386) || defined(__i386__) || \ | ||
| 63 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 64 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 65 | defined(__s390__) || defined(__s390x__) | ||
| 66 | # undef STRICT_ALIGNMENT | ||
| 67 | # define STRICT_ALIGNMENT 0 | 63 | # define STRICT_ALIGNMENT 0 |
| 68 | #endif | 64 | #endif |
| 69 | 65 | ||
diff --git a/src/lib/libcrypto/modes/ccm128.c b/src/lib/libcrypto/modes/ccm128.c new file mode 100644 index 0000000000..c9b35e5b35 --- /dev/null +++ b/src/lib/libcrypto/modes/ccm128.c | |||
| @@ -0,0 +1,441 @@ | |||
| 1 | /* ==================================================================== | ||
| 2 | * Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
| 3 | * | ||
| 4 | * Redistribution and use in source and binary forms, with or without | ||
| 5 | * modification, are permitted provided that the following conditions | ||
| 6 | * are met: | ||
| 7 | * | ||
| 8 | * 1. Redistributions of source code must retain the above copyright | ||
| 9 | * notice, this list of conditions and the following disclaimer. | ||
| 10 | * | ||
| 11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer in | ||
| 13 | * the documentation and/or other materials provided with the | ||
| 14 | * distribution. | ||
| 15 | * | ||
| 16 | * 3. All advertising materials mentioning features or use of this | ||
| 17 | * software must display the following acknowledgment: | ||
| 18 | * "This product includes software developed by the OpenSSL Project | ||
| 19 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 20 | * | ||
| 21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 22 | * endorse or promote products derived from this software without | ||
| 23 | * prior written permission. For written permission, please contact | ||
| 24 | * openssl-core@openssl.org. | ||
| 25 | * | ||
| 26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 27 | * nor may "OpenSSL" appear in their names without prior written | ||
| 28 | * permission of the OpenSSL Project. | ||
| 29 | * | ||
| 30 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 31 | * acknowledgment: | ||
| 32 | * "This product includes software developed by the OpenSSL Project | ||
| 33 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 34 | * | ||
| 35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 47 | * ==================================================================== | ||
| 48 | */ | ||
| 49 | |||
| 50 | #include <openssl/crypto.h> | ||
| 51 | #include "modes_lcl.h" | ||
| 52 | #include <string.h> | ||
| 53 | |||
| 54 | #ifndef MODES_DEBUG | ||
| 55 | # ifndef NDEBUG | ||
| 56 | # define NDEBUG | ||
| 57 | # endif | ||
| 58 | #endif | ||
| 59 | #include <assert.h> | ||
| 60 | |||
| 61 | /* First you setup M and L parameters and pass the key schedule. | ||
| 62 | * This is called once per session setup... */ | ||
| 63 | void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, | ||
| 64 | unsigned int M,unsigned int L,void *key,block128_f block) | ||
| 65 | { | ||
| 66 | memset(ctx->nonce.c,0,sizeof(ctx->nonce.c)); | ||
| 67 | ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3; | ||
| 68 | ctx->blocks = 0; | ||
| 69 | ctx->block = block; | ||
| 70 | ctx->key = key; | ||
| 71 | } | ||
| 72 | |||
| 73 | /* !!! Following interfaces are to be called *once* per packet !!! */ | ||
| 74 | |||
| 75 | /* Then you setup per-message nonce and pass the length of the message */ | ||
| 76 | int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, | ||
| 77 | const unsigned char *nonce,size_t nlen,size_t mlen) | ||
| 78 | { | ||
| 79 | unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */ | ||
| 80 | |||
| 81 | if (nlen<(14-L)) return -1; /* nonce is too short */ | ||
| 82 | |||
| 83 | if (sizeof(mlen)==8 && L>=3) { | ||
| 84 | ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8))); | ||
| 85 | ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8))); | ||
| 86 | ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8))); | ||
| 87 | ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8))); | ||
| 88 | } | ||
| 89 | else | ||
| 90 | *(u32*)(&ctx->nonce.c[8]) = 0; | ||
| 91 | |||
| 92 | ctx->nonce.c[12] = (u8)(mlen>>24); | ||
| 93 | ctx->nonce.c[13] = (u8)(mlen>>16); | ||
| 94 | ctx->nonce.c[14] = (u8)(mlen>>8); | ||
| 95 | ctx->nonce.c[15] = (u8)mlen; | ||
| 96 | |||
| 97 | ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */ | ||
| 98 | memcpy(&ctx->nonce.c[1],nonce,14-L); | ||
| 99 | |||
| 100 | return 0; | ||
| 101 | } | ||
| 102 | |||
| 103 | /* Then you pass additional authentication data, this is optional */ | ||
| 104 | void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, | ||
| 105 | const unsigned char *aad,size_t alen) | ||
| 106 | { unsigned int i; | ||
| 107 | block128_f block = ctx->block; | ||
| 108 | |||
| 109 | if (alen==0) return; | ||
| 110 | |||
| 111 | ctx->nonce.c[0] |= 0x40; /* set Adata flag */ | ||
| 112 | (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key), | ||
| 113 | ctx->blocks++; | ||
| 114 | |||
| 115 | if (alen<(0x10000-0x100)) { | ||
| 116 | ctx->cmac.c[0] ^= (u8)(alen>>8); | ||
| 117 | ctx->cmac.c[1] ^= (u8)alen; | ||
| 118 | i=2; | ||
| 119 | } | ||
| 120 | else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) { | ||
| 121 | ctx->cmac.c[0] ^= 0xFF; | ||
| 122 | ctx->cmac.c[1] ^= 0xFF; | ||
| 123 | ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8))); | ||
| 124 | ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8))); | ||
| 125 | ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8))); | ||
| 126 | ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8))); | ||
| 127 | ctx->cmac.c[6] ^= (u8)(alen>>24); | ||
| 128 | ctx->cmac.c[7] ^= (u8)(alen>>16); | ||
| 129 | ctx->cmac.c[8] ^= (u8)(alen>>8); | ||
| 130 | ctx->cmac.c[9] ^= (u8)alen; | ||
| 131 | i=10; | ||
| 132 | } | ||
| 133 | else { | ||
| 134 | ctx->cmac.c[0] ^= 0xFF; | ||
| 135 | ctx->cmac.c[1] ^= 0xFE; | ||
| 136 | ctx->cmac.c[2] ^= (u8)(alen>>24); | ||
| 137 | ctx->cmac.c[3] ^= (u8)(alen>>16); | ||
| 138 | ctx->cmac.c[4] ^= (u8)(alen>>8); | ||
| 139 | ctx->cmac.c[5] ^= (u8)alen; | ||
| 140 | i=6; | ||
| 141 | } | ||
| 142 | |||
| 143 | do { | ||
| 144 | for(;i<16 && alen;++i,++aad,--alen) | ||
| 145 | ctx->cmac.c[i] ^= *aad; | ||
| 146 | (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key), | ||
| 147 | ctx->blocks++; | ||
| 148 | i=0; | ||
| 149 | } while (alen); | ||
| 150 | } | ||
| 151 | |||
| 152 | /* Finally you encrypt or decrypt the message */ | ||
| 153 | |||
| 154 | /* counter part of nonce may not be larger than L*8 bits, | ||
| 155 | * L is not larger than 8, therefore 64-bit counter... */ | ||
| 156 | static void ctr64_inc(unsigned char *counter) { | ||
| 157 | unsigned int n=8; | ||
| 158 | u8 c; | ||
| 159 | |||
| 160 | counter += 8; | ||
| 161 | do { | ||
| 162 | --n; | ||
| 163 | c = counter[n]; | ||
| 164 | ++c; | ||
| 165 | counter[n] = c; | ||
| 166 | if (c) return; | ||
| 167 | } while (n); | ||
| 168 | } | ||
| 169 | |||
| 170 | int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, | ||
| 171 | const unsigned char *inp, unsigned char *out, | ||
| 172 | size_t len) | ||
| 173 | { | ||
| 174 | size_t n; | ||
| 175 | unsigned int i,L; | ||
| 176 | unsigned char flags0 = ctx->nonce.c[0]; | ||
| 177 | block128_f block = ctx->block; | ||
| 178 | void * key = ctx->key; | ||
| 179 | union { u64 u[2]; u8 c[16]; } scratch; | ||
| 180 | |||
| 181 | if (!(flags0&0x40)) | ||
| 182 | (*block)(ctx->nonce.c,ctx->cmac.c,key), | ||
| 183 | ctx->blocks++; | ||
| 184 | |||
| 185 | ctx->nonce.c[0] = L = flags0&7; | ||
| 186 | for (n=0,i=15-L;i<15;++i) { | ||
| 187 | n |= ctx->nonce.c[i]; | ||
| 188 | ctx->nonce.c[i]=0; | ||
| 189 | n <<= 8; | ||
| 190 | } | ||
| 191 | n |= ctx->nonce.c[15]; /* reconstructed length */ | ||
| 192 | ctx->nonce.c[15]=1; | ||
| 193 | |||
| 194 | if (n!=len) return -1; /* length mismatch */ | ||
| 195 | |||
| 196 | ctx->blocks += ((len+15)>>3)|1; | ||
| 197 | if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */ | ||
| 198 | |||
| 199 | while (len>=16) { | ||
| 200 | #if defined(STRICT_ALIGNMENT) | ||
| 201 | union { u64 u[2]; u8 c[16]; } temp; | ||
| 202 | |||
| 203 | memcpy (temp.c,inp,16); | ||
| 204 | ctx->cmac.u[0] ^= temp.u[0]; | ||
| 205 | ctx->cmac.u[1] ^= temp.u[1]; | ||
| 206 | #else | ||
| 207 | ctx->cmac.u[0] ^= ((u64*)inp)[0]; | ||
| 208 | ctx->cmac.u[1] ^= ((u64*)inp)[1]; | ||
| 209 | #endif | ||
| 210 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
| 211 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 212 | ctr64_inc(ctx->nonce.c); | ||
| 213 | #if defined(STRICT_ALIGNMENT) | ||
| 214 | temp.u[0] ^= scratch.u[0]; | ||
| 215 | temp.u[1] ^= scratch.u[1]; | ||
| 216 | memcpy(out,temp.c,16); | ||
| 217 | #else | ||
| 218 | ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]; | ||
| 219 | ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]; | ||
| 220 | #endif | ||
| 221 | inp += 16; | ||
| 222 | out += 16; | ||
| 223 | len -= 16; | ||
| 224 | } | ||
| 225 | |||
| 226 | if (len) { | ||
| 227 | for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i]; | ||
| 228 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
| 229 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 230 | for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i]; | ||
| 231 | } | ||
| 232 | |||
| 233 | for (i=15-L;i<16;++i) | ||
| 234 | ctx->nonce.c[i]=0; | ||
| 235 | |||
| 236 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 237 | ctx->cmac.u[0] ^= scratch.u[0]; | ||
| 238 | ctx->cmac.u[1] ^= scratch.u[1]; | ||
| 239 | |||
| 240 | ctx->nonce.c[0] = flags0; | ||
| 241 | |||
| 242 | return 0; | ||
| 243 | } | ||
| 244 | |||
| 245 | int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, | ||
| 246 | const unsigned char *inp, unsigned char *out, | ||
| 247 | size_t len) | ||
| 248 | { | ||
| 249 | size_t n; | ||
| 250 | unsigned int i,L; | ||
| 251 | unsigned char flags0 = ctx->nonce.c[0]; | ||
| 252 | block128_f block = ctx->block; | ||
| 253 | void * key = ctx->key; | ||
| 254 | union { u64 u[2]; u8 c[16]; } scratch; | ||
| 255 | |||
| 256 | if (!(flags0&0x40)) | ||
| 257 | (*block)(ctx->nonce.c,ctx->cmac.c,key); | ||
| 258 | |||
| 259 | ctx->nonce.c[0] = L = flags0&7; | ||
| 260 | for (n=0,i=15-L;i<15;++i) { | ||
| 261 | n |= ctx->nonce.c[i]; | ||
| 262 | ctx->nonce.c[i]=0; | ||
| 263 | n <<= 8; | ||
| 264 | } | ||
| 265 | n |= ctx->nonce.c[15]; /* reconstructed length */ | ||
| 266 | ctx->nonce.c[15]=1; | ||
| 267 | |||
| 268 | if (n!=len) return -1; | ||
| 269 | |||
| 270 | while (len>=16) { | ||
| 271 | #if defined(STRICT_ALIGNMENT) | ||
| 272 | union { u64 u[2]; u8 c[16]; } temp; | ||
| 273 | #endif | ||
| 274 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 275 | ctr64_inc(ctx->nonce.c); | ||
| 276 | #if defined(STRICT_ALIGNMENT) | ||
| 277 | memcpy (temp.c,inp,16); | ||
| 278 | ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]); | ||
| 279 | ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]); | ||
| 280 | memcpy (out,scratch.c,16); | ||
| 281 | #else | ||
| 282 | ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]); | ||
| 283 | ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]); | ||
| 284 | #endif | ||
| 285 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
| 286 | |||
| 287 | inp += 16; | ||
| 288 | out += 16; | ||
| 289 | len -= 16; | ||
| 290 | } | ||
| 291 | |||
| 292 | if (len) { | ||
| 293 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 294 | for (i=0; i<len; ++i) | ||
| 295 | ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); | ||
| 296 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
| 297 | } | ||
| 298 | |||
| 299 | for (i=15-L;i<16;++i) | ||
| 300 | ctx->nonce.c[i]=0; | ||
| 301 | |||
| 302 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 303 | ctx->cmac.u[0] ^= scratch.u[0]; | ||
| 304 | ctx->cmac.u[1] ^= scratch.u[1]; | ||
| 305 | |||
| 306 | ctx->nonce.c[0] = flags0; | ||
| 307 | |||
| 308 | return 0; | ||
| 309 | } | ||
| 310 | |||
| 311 | static void ctr64_add (unsigned char *counter,size_t inc) | ||
| 312 | { size_t n=8, val=0; | ||
| 313 | |||
| 314 | counter += 8; | ||
| 315 | do { | ||
| 316 | --n; | ||
| 317 | val += counter[n] + (inc&0xff); | ||
| 318 | counter[n] = (unsigned char)val; | ||
| 319 | val >>= 8; /* carry bit */ | ||
| 320 | inc >>= 8; | ||
| 321 | } while(n && (inc || val)); | ||
| 322 | } | ||
| 323 | |||
| 324 | int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, | ||
| 325 | const unsigned char *inp, unsigned char *out, | ||
| 326 | size_t len,ccm128_f stream) | ||
| 327 | { | ||
| 328 | size_t n; | ||
| 329 | unsigned int i,L; | ||
| 330 | unsigned char flags0 = ctx->nonce.c[0]; | ||
| 331 | block128_f block = ctx->block; | ||
| 332 | void * key = ctx->key; | ||
| 333 | union { u64 u[2]; u8 c[16]; } scratch; | ||
| 334 | |||
| 335 | if (!(flags0&0x40)) | ||
| 336 | (*block)(ctx->nonce.c,ctx->cmac.c,key), | ||
| 337 | ctx->blocks++; | ||
| 338 | |||
| 339 | ctx->nonce.c[0] = L = flags0&7; | ||
| 340 | for (n=0,i=15-L;i<15;++i) { | ||
| 341 | n |= ctx->nonce.c[i]; | ||
| 342 | ctx->nonce.c[i]=0; | ||
| 343 | n <<= 8; | ||
| 344 | } | ||
| 345 | n |= ctx->nonce.c[15]; /* reconstructed length */ | ||
| 346 | ctx->nonce.c[15]=1; | ||
| 347 | |||
| 348 | if (n!=len) return -1; /* length mismatch */ | ||
| 349 | |||
| 350 | ctx->blocks += ((len+15)>>3)|1; | ||
| 351 | if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */ | ||
| 352 | |||
| 353 | if ((n=len/16)) { | ||
| 354 | (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c); | ||
| 355 | n *= 16; | ||
| 356 | inp += n; | ||
| 357 | out += n; | ||
| 358 | len -= n; | ||
| 359 | if (len) ctr64_add(ctx->nonce.c,n/16); | ||
| 360 | } | ||
| 361 | |||
| 362 | if (len) { | ||
| 363 | for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i]; | ||
| 364 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
| 365 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 366 | for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i]; | ||
| 367 | } | ||
| 368 | |||
| 369 | for (i=15-L;i<16;++i) | ||
| 370 | ctx->nonce.c[i]=0; | ||
| 371 | |||
| 372 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 373 | ctx->cmac.u[0] ^= scratch.u[0]; | ||
| 374 | ctx->cmac.u[1] ^= scratch.u[1]; | ||
| 375 | |||
| 376 | ctx->nonce.c[0] = flags0; | ||
| 377 | |||
| 378 | return 0; | ||
| 379 | } | ||
| 380 | |||
| 381 | int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, | ||
| 382 | const unsigned char *inp, unsigned char *out, | ||
| 383 | size_t len,ccm128_f stream) | ||
| 384 | { | ||
| 385 | size_t n; | ||
| 386 | unsigned int i,L; | ||
| 387 | unsigned char flags0 = ctx->nonce.c[0]; | ||
| 388 | block128_f block = ctx->block; | ||
| 389 | void * key = ctx->key; | ||
| 390 | union { u64 u[2]; u8 c[16]; } scratch; | ||
| 391 | |||
| 392 | if (!(flags0&0x40)) | ||
| 393 | (*block)(ctx->nonce.c,ctx->cmac.c,key); | ||
| 394 | |||
| 395 | ctx->nonce.c[0] = L = flags0&7; | ||
| 396 | for (n=0,i=15-L;i<15;++i) { | ||
| 397 | n |= ctx->nonce.c[i]; | ||
| 398 | ctx->nonce.c[i]=0; | ||
| 399 | n <<= 8; | ||
| 400 | } | ||
| 401 | n |= ctx->nonce.c[15]; /* reconstructed length */ | ||
| 402 | ctx->nonce.c[15]=1; | ||
| 403 | |||
| 404 | if (n!=len) return -1; | ||
| 405 | |||
| 406 | if ((n=len/16)) { | ||
| 407 | (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c); | ||
| 408 | n *= 16; | ||
| 409 | inp += n; | ||
| 410 | out += n; | ||
| 411 | len -= n; | ||
| 412 | if (len) ctr64_add(ctx->nonce.c,n/16); | ||
| 413 | } | ||
| 414 | |||
| 415 | if (len) { | ||
| 416 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 417 | for (i=0; i<len; ++i) | ||
| 418 | ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); | ||
| 419 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
| 420 | } | ||
| 421 | |||
| 422 | for (i=15-L;i<16;++i) | ||
| 423 | ctx->nonce.c[i]=0; | ||
| 424 | |||
| 425 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 426 | ctx->cmac.u[0] ^= scratch.u[0]; | ||
| 427 | ctx->cmac.u[1] ^= scratch.u[1]; | ||
| 428 | |||
| 429 | ctx->nonce.c[0] = flags0; | ||
| 430 | |||
| 431 | return 0; | ||
| 432 | } | ||
| 433 | |||
| 434 | size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len) | ||
| 435 | { unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */ | ||
| 436 | |||
| 437 | M *= 2; M += 2; | ||
| 438 | if (len<M) return 0; | ||
| 439 | memcpy(tag,ctx->cmac.c,M); | ||
| 440 | return M; | ||
| 441 | } | ||
diff --git a/src/lib/libcrypto/modes/cfb128.c b/src/lib/libcrypto/modes/cfb128.c index e5938c6137..4e6f5d35e1 100644 --- a/src/lib/libcrypto/modes/cfb128.c +++ b/src/lib/libcrypto/modes/cfb128.c | |||
| @@ -48,7 +48,8 @@ | |||
| 48 | * | 48 | * |
| 49 | */ | 49 | */ |
| 50 | 50 | ||
| 51 | #include "modes.h" | 51 | #include <openssl/crypto.h> |
| 52 | #include "modes_lcl.h" | ||
| 52 | #include <string.h> | 53 | #include <string.h> |
| 53 | 54 | ||
| 54 | #ifndef MODES_DEBUG | 55 | #ifndef MODES_DEBUG |
| @@ -58,14 +59,6 @@ | |||
| 58 | #endif | 59 | #endif |
| 59 | #include <assert.h> | 60 | #include <assert.h> |
| 60 | 61 | ||
| 61 | #define STRICT_ALIGNMENT | ||
| 62 | #if defined(__i386) || defined(__i386__) || \ | ||
| 63 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 64 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 65 | defined(__s390__) || defined(__s390x__) | ||
| 66 | # undef STRICT_ALIGNMENT | ||
| 67 | #endif | ||
| 68 | |||
| 69 | /* The input and output encrypted as though 128bit cfb mode is being | 62 | /* The input and output encrypted as though 128bit cfb mode is being |
| 70 | * used. The extra state information to record how much of the | 63 | * used. The extra state information to record how much of the |
| 71 | * 128bit block we have used is contained in *num; | 64 | * 128bit block we have used is contained in *num; |
diff --git a/src/lib/libcrypto/modes/ctr128.c b/src/lib/libcrypto/modes/ctr128.c index 932037f551..ee642c5863 100644 --- a/src/lib/libcrypto/modes/ctr128.c +++ b/src/lib/libcrypto/modes/ctr128.c | |||
| @@ -48,7 +48,8 @@ | |||
| 48 | * | 48 | * |
| 49 | */ | 49 | */ |
| 50 | 50 | ||
| 51 | #include "modes.h" | 51 | #include <openssl/crypto.h> |
| 52 | #include "modes_lcl.h" | ||
| 52 | #include <string.h> | 53 | #include <string.h> |
| 53 | 54 | ||
| 54 | #ifndef MODES_DEBUG | 55 | #ifndef MODES_DEBUG |
| @@ -58,17 +59,6 @@ | |||
| 58 | #endif | 59 | #endif |
| 59 | #include <assert.h> | 60 | #include <assert.h> |
| 60 | 61 | ||
| 61 | typedef unsigned int u32; | ||
| 62 | typedef unsigned char u8; | ||
| 63 | |||
| 64 | #define STRICT_ALIGNMENT | ||
| 65 | #if defined(__i386) || defined(__i386__) || \ | ||
| 66 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 67 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 68 | defined(__s390__) || defined(__s390x__) | ||
| 69 | # undef STRICT_ALIGNMENT | ||
| 70 | #endif | ||
| 71 | |||
| 72 | /* NOTE: the IV/counter CTR mode is big-endian. The code itself | 62 | /* NOTE: the IV/counter CTR mode is big-endian. The code itself |
| 73 | * is endian-neutral. */ | 63 | * is endian-neutral. */ |
| 74 | 64 | ||
| @@ -182,3 +172,81 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, | |||
| 182 | 172 | ||
| 183 | *num=n; | 173 | *num=n; |
| 184 | } | 174 | } |
| 175 | |||
| 176 | /* increment upper 96 bits of 128-bit counter by 1 */ | ||
| 177 | static void ctr96_inc(unsigned char *counter) { | ||
| 178 | u32 n=12; | ||
| 179 | u8 c; | ||
| 180 | |||
| 181 | do { | ||
| 182 | --n; | ||
| 183 | c = counter[n]; | ||
| 184 | ++c; | ||
| 185 | counter[n] = c; | ||
| 186 | if (c) return; | ||
| 187 | } while (n); | ||
| 188 | } | ||
| 189 | |||
| 190 | void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, | ||
| 191 | size_t len, const void *key, | ||
| 192 | unsigned char ivec[16], unsigned char ecount_buf[16], | ||
| 193 | unsigned int *num, ctr128_f func) | ||
| 194 | { | ||
| 195 | unsigned int n,ctr32; | ||
| 196 | |||
| 197 | assert(in && out && key && ecount_buf && num); | ||
| 198 | assert(*num < 16); | ||
| 199 | |||
| 200 | n = *num; | ||
| 201 | |||
| 202 | while (n && len) { | ||
| 203 | *(out++) = *(in++) ^ ecount_buf[n]; | ||
| 204 | --len; | ||
| 205 | n = (n+1) % 16; | ||
| 206 | } | ||
| 207 | |||
| 208 | ctr32 = GETU32(ivec+12); | ||
| 209 | while (len>=16) { | ||
| 210 | size_t blocks = len/16; | ||
| 211 | /* | ||
| 212 | * 1<<28 is just a not-so-small yet not-so-large number... | ||
| 213 | * Below condition is practically never met, but it has to | ||
| 214 | * be checked for code correctness. | ||
| 215 | */ | ||
| 216 | if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28)) | ||
| 217 | blocks = (1U<<28); | ||
| 218 | /* | ||
| 219 | * As (*func) operates on 32-bit counter, caller | ||
| 220 | * has to handle overflow. 'if' below detects the | ||
| 221 | * overflow, which is then handled by limiting the | ||
| 222 | * amount of blocks to the exact overflow point... | ||
| 223 | */ | ||
| 224 | ctr32 += (u32)blocks; | ||
| 225 | if (ctr32 < blocks) { | ||
| 226 | blocks -= ctr32; | ||
| 227 | ctr32 = 0; | ||
| 228 | } | ||
| 229 | (*func)(in,out,blocks,key,ivec); | ||
| 230 | /* (*ctr) does not update ivec, caller does: */ | ||
| 231 | PUTU32(ivec+12,ctr32); | ||
| 232 | /* ... overflow was detected, propogate carry. */ | ||
| 233 | if (ctr32 == 0) ctr96_inc(ivec); | ||
| 234 | blocks *= 16; | ||
| 235 | len -= blocks; | ||
| 236 | out += blocks; | ||
| 237 | in += blocks; | ||
| 238 | } | ||
| 239 | if (len) { | ||
| 240 | memset(ecount_buf,0,16); | ||
| 241 | (*func)(ecount_buf,ecount_buf,1,key,ivec); | ||
| 242 | ++ctr32; | ||
| 243 | PUTU32(ivec+12,ctr32); | ||
| 244 | if (ctr32 == 0) ctr96_inc(ivec); | ||
| 245 | while (len--) { | ||
| 246 | out[n] = in[n] ^ ecount_buf[n]; | ||
| 247 | ++n; | ||
| 248 | } | ||
| 249 | } | ||
| 250 | |||
| 251 | *num=n; | ||
| 252 | } | ||
diff --git a/src/lib/libcrypto/modes/cts128.c b/src/lib/libcrypto/modes/cts128.c index e0430f9fdc..c0e1f3696c 100644 --- a/src/lib/libcrypto/modes/cts128.c +++ b/src/lib/libcrypto/modes/cts128.c | |||
| @@ -5,7 +5,8 @@ | |||
| 5 | * forms are granted according to the OpenSSL license. | 5 | * forms are granted according to the OpenSSL license. |
| 6 | */ | 6 | */ |
| 7 | 7 | ||
| 8 | #include "modes.h" | 8 | #include <openssl/crypto.h> |
| 9 | #include "modes_lcl.h" | ||
| 9 | #include <string.h> | 10 | #include <string.h> |
| 10 | 11 | ||
| 11 | #ifndef MODES_DEBUG | 12 | #ifndef MODES_DEBUG |
| @@ -23,8 +24,9 @@ | |||
| 23 | * deviates from mentioned RFCs. Most notably it allows input to be | 24 | * deviates from mentioned RFCs. Most notably it allows input to be |
| 24 | * of block length and it doesn't flip the order of the last two | 25 | * of block length and it doesn't flip the order of the last two |
| 25 | * blocks. CTS is being discussed even in ECB context, but it's not | 26 | * blocks. CTS is being discussed even in ECB context, but it's not |
| 26 | * adopted for any known application. This implementation complies | 27 | * adopted for any known application. This implementation provides |
| 27 | * with mentioned RFCs and [as such] extends CBC mode. | 28 | * two interfaces: one compliant with above mentioned RFCs and one |
| 29 | * compliant with the NIST proposal, both extending CBC mode. | ||
| 28 | */ | 30 | */ |
| 29 | 31 | ||
| 30 | size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, | 32 | size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, |
| @@ -54,6 +56,34 @@ size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, | |||
| 54 | return len+residue; | 56 | return len+residue; |
| 55 | } | 57 | } |
| 56 | 58 | ||
| 59 | size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out, | ||
| 60 | size_t len, const void *key, | ||
| 61 | unsigned char ivec[16], block128_f block) | ||
| 62 | { size_t residue, n; | ||
| 63 | |||
| 64 | assert (in && out && key && ivec); | ||
| 65 | |||
| 66 | if (len < 16) return 0; | ||
| 67 | |||
| 68 | residue=len%16; | ||
| 69 | |||
| 70 | len -= residue; | ||
| 71 | |||
| 72 | CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block); | ||
| 73 | |||
| 74 | if (residue==0) return len; | ||
| 75 | |||
| 76 | in += len; | ||
| 77 | out += len; | ||
| 78 | |||
| 79 | for (n=0; n<residue; ++n) | ||
| 80 | ivec[n] ^= in[n]; | ||
| 81 | (*block)(ivec,ivec,key); | ||
| 82 | memcpy(out-16+residue,ivec,16); | ||
| 83 | |||
| 84 | return len+residue; | ||
| 85 | } | ||
| 86 | |||
| 57 | size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, | 87 | size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, |
| 58 | size_t len, const void *key, | 88 | size_t len, const void *key, |
| 59 | unsigned char ivec[16], cbc128_f cbc) | 89 | unsigned char ivec[16], cbc128_f cbc) |
| @@ -90,6 +120,41 @@ size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, | |||
| 90 | return len+residue; | 120 | return len+residue; |
| 91 | } | 121 | } |
| 92 | 122 | ||
| 123 | size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out, | ||
| 124 | size_t len, const void *key, | ||
| 125 | unsigned char ivec[16], cbc128_f cbc) | ||
| 126 | { size_t residue; | ||
| 127 | union { size_t align; unsigned char c[16]; } tmp; | ||
| 128 | |||
| 129 | assert (in && out && key && ivec); | ||
| 130 | |||
| 131 | if (len < 16) return 0; | ||
| 132 | |||
| 133 | residue=len%16; | ||
| 134 | |||
| 135 | len -= residue; | ||
| 136 | |||
| 137 | (*cbc)(in,out,len,key,ivec,1); | ||
| 138 | |||
| 139 | if (residue==0) return len; | ||
| 140 | |||
| 141 | in += len; | ||
| 142 | out += len; | ||
| 143 | |||
| 144 | #if defined(CBC_HANDLES_TRUNCATED_IO) | ||
| 145 | (*cbc)(in,out-16+residue,residue,key,ivec,1); | ||
| 146 | #else | ||
| 147 | { | ||
| 148 | size_t n; | ||
| 149 | for (n=0; n<16; n+=sizeof(size_t)) | ||
| 150 | *(size_t *)(tmp.c+n) = 0; | ||
| 151 | memcpy(tmp.c,in,residue); | ||
| 152 | } | ||
| 153 | (*cbc)(tmp.c,out-16+residue,16,key,ivec,1); | ||
| 154 | #endif | ||
| 155 | return len+residue; | ||
| 156 | } | ||
| 157 | |||
| 93 | size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, | 158 | size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, |
| 94 | size_t len, const void *key, | 159 | size_t len, const void *key, |
| 95 | unsigned char ivec[16], block128_f block) | 160 | unsigned char ivec[16], block128_f block) |
| @@ -125,7 +190,51 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, | |||
| 125 | for(residue+=16; n<residue; ++n) | 190 | for(residue+=16; n<residue; ++n) |
| 126 | out[n] = tmp.c[n] ^ in[n]; | 191 | out[n] = tmp.c[n] ^ in[n]; |
| 127 | 192 | ||
| 128 | return len+residue-16; | 193 | return 16+len+residue; |
| 194 | } | ||
| 195 | |||
| 196 | size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out, | ||
| 197 | size_t len, const void *key, | ||
| 198 | unsigned char ivec[16], block128_f block) | ||
| 199 | { size_t residue, n; | ||
| 200 | union { size_t align; unsigned char c[32]; } tmp; | ||
| 201 | |||
| 202 | assert (in && out && key && ivec); | ||
| 203 | |||
| 204 | if (len<16) return 0; | ||
| 205 | |||
| 206 | residue=len%16; | ||
| 207 | |||
| 208 | if (residue==0) { | ||
| 209 | CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block); | ||
| 210 | return len; | ||
| 211 | } | ||
| 212 | |||
| 213 | len -= 16+residue; | ||
| 214 | |||
| 215 | if (len) { | ||
| 216 | CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block); | ||
| 217 | in += len; | ||
| 218 | out += len; | ||
| 219 | } | ||
| 220 | |||
| 221 | (*block)(in+residue,tmp.c+16,key); | ||
| 222 | |||
| 223 | for (n=0; n<16; n+=sizeof(size_t)) | ||
| 224 | *(size_t *)(tmp.c+n) = *(size_t *)(tmp.c+16+n); | ||
| 225 | memcpy(tmp.c,in,residue); | ||
| 226 | (*block)(tmp.c,tmp.c,key); | ||
| 227 | |||
| 228 | for(n=0; n<16; ++n) { | ||
| 229 | unsigned char c = in[n]; | ||
| 230 | out[n] = tmp.c[n] ^ ivec[n]; | ||
| 231 | ivec[n] = in[n+residue]; | ||
| 232 | tmp.c[n] = c; | ||
| 233 | } | ||
| 234 | for(residue+=16; n<residue; ++n) | ||
| 235 | out[n] = tmp.c[n] ^ tmp.c[n-16]; | ||
| 236 | |||
| 237 | return 16+len+residue; | ||
| 129 | } | 238 | } |
| 130 | 239 | ||
| 131 | size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, | 240 | size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, |
| @@ -160,7 +269,47 @@ size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, | |||
| 160 | (*cbc)(tmp.c,tmp.c,32,key,ivec,0); | 269 | (*cbc)(tmp.c,tmp.c,32,key,ivec,0); |
| 161 | memcpy(out,tmp.c,16+residue); | 270 | memcpy(out,tmp.c,16+residue); |
| 162 | #endif | 271 | #endif |
| 163 | return len+residue; | 272 | return 16+len+residue; |
| 273 | } | ||
| 274 | |||
| 275 | size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out, | ||
| 276 | size_t len, const void *key, | ||
| 277 | unsigned char ivec[16], cbc128_f cbc) | ||
| 278 | { size_t residue, n; | ||
| 279 | union { size_t align; unsigned char c[32]; } tmp; | ||
| 280 | |||
| 281 | assert (in && out && key && ivec); | ||
| 282 | |||
| 283 | if (len<16) return 0; | ||
| 284 | |||
| 285 | residue=len%16; | ||
| 286 | |||
| 287 | if (residue==0) { | ||
| 288 | (*cbc)(in,out,len,key,ivec,0); | ||
| 289 | return len; | ||
| 290 | } | ||
| 291 | |||
| 292 | len -= 16+residue; | ||
| 293 | |||
| 294 | if (len) { | ||
| 295 | (*cbc)(in,out,len,key,ivec,0); | ||
| 296 | in += len; | ||
| 297 | out += len; | ||
| 298 | } | ||
| 299 | |||
| 300 | for (n=16; n<32; n+=sizeof(size_t)) | ||
| 301 | *(size_t *)(tmp.c+n) = 0; | ||
| 302 | /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */ | ||
| 303 | (*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0); | ||
| 304 | |||
| 305 | memcpy(tmp.c,in,residue); | ||
| 306 | #if defined(CBC_HANDLES_TRUNCATED_IO) | ||
| 307 | (*cbc)(tmp.c,out,16+residue,key,ivec,0); | ||
| 308 | #else | ||
| 309 | (*cbc)(tmp.c,tmp.c,32,key,ivec,0); | ||
| 310 | memcpy(out,tmp.c,16+residue); | ||
| 311 | #endif | ||
| 312 | return 16+len+residue; | ||
| 164 | } | 313 | } |
| 165 | 314 | ||
| 166 | #if defined(SELFTEST) | 315 | #if defined(SELFTEST) |
| @@ -200,9 +349,8 @@ static const unsigned char vector_64[64] = | |||
| 200 | static AES_KEY encks, decks; | 349 | static AES_KEY encks, decks; |
| 201 | 350 | ||
| 202 | void test_vector(const unsigned char *vector,size_t len) | 351 | void test_vector(const unsigned char *vector,size_t len) |
| 203 | { unsigned char cleartext[64]; | 352 | { unsigned char iv[sizeof(test_iv)]; |
| 204 | unsigned char iv[sizeof(test_iv)]; | 353 | unsigned char cleartext[64],ciphertext[64]; |
| 205 | unsigned char ciphertext[64]; | ||
| 206 | size_t tail; | 354 | size_t tail; |
| 207 | 355 | ||
| 208 | printf("vector_%d\n",len); fflush(stdout); | 356 | printf("vector_%d\n",len); fflush(stdout); |
| @@ -243,7 +391,57 @@ void test_vector(const unsigned char *vector,size_t len) | |||
| 243 | fprintf(stderr,"iv_%d mismatch\n",len), exit(4); | 391 | fprintf(stderr,"iv_%d mismatch\n",len), exit(4); |
| 244 | } | 392 | } |
| 245 | 393 | ||
| 246 | main() | 394 | void test_nistvector(const unsigned char *vector,size_t len) |
| 395 | { unsigned char iv[sizeof(test_iv)]; | ||
| 396 | unsigned char cleartext[64],ciphertext[64],nistvector[64]; | ||
| 397 | size_t tail; | ||
| 398 | |||
| 399 | printf("nistvector_%d\n",len); fflush(stdout); | ||
| 400 | |||
| 401 | if ((tail=len%16) == 0) tail = 16; | ||
| 402 | |||
| 403 | len -= 16 + tail; | ||
| 404 | memcpy(nistvector,vector,len); | ||
| 405 | /* flip two last blocks */ | ||
| 406 | memcpy(nistvector+len,vector+len+16,tail); | ||
| 407 | memcpy(nistvector+len+tail,vector+len,16); | ||
| 408 | len += 16 + tail; | ||
| 409 | tail = 16; | ||
| 410 | |||
| 411 | /* test block-based encryption */ | ||
| 412 | memcpy(iv,test_iv,sizeof(test_iv)); | ||
| 413 | CRYPTO_nistcts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt); | ||
| 414 | if (memcmp(ciphertext,nistvector,len)) | ||
| 415 | fprintf(stderr,"output_%d mismatch\n",len), exit(1); | ||
| 416 | if (memcmp(iv,nistvector+len-tail,sizeof(iv))) | ||
| 417 | fprintf(stderr,"iv_%d mismatch\n",len), exit(1); | ||
| 418 | |||
| 419 | /* test block-based decryption */ | ||
| 420 | memcpy(iv,test_iv,sizeof(test_iv)); | ||
| 421 | CRYPTO_nistcts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt); | ||
| 422 | if (memcmp(cleartext,test_input,len)) | ||
| 423 | fprintf(stderr,"input_%d mismatch\n",len), exit(2); | ||
| 424 | if (memcmp(iv,nistvector+len-tail,sizeof(iv))) | ||
| 425 | fprintf(stderr,"iv_%d mismatch\n",len), exit(2); | ||
| 426 | |||
| 427 | /* test streamed encryption */ | ||
| 428 | memcpy(iv,test_iv,sizeof(test_iv)); | ||
| 429 | CRYPTO_nistcts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt); | ||
| 430 | if (memcmp(ciphertext,nistvector,len)) | ||
| 431 | fprintf(stderr,"output_%d mismatch\n",len), exit(3); | ||
| 432 | if (memcmp(iv,nistvector+len-tail,sizeof(iv))) | ||
| 433 | fprintf(stderr,"iv_%d mismatch\n",len), exit(3); | ||
| 434 | |||
| 435 | /* test streamed decryption */ | ||
| 436 | memcpy(iv,test_iv,sizeof(test_iv)); | ||
| 437 | CRYPTO_nistcts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt); | ||
| 438 | if (memcmp(cleartext,test_input,len)) | ||
| 439 | fprintf(stderr,"input_%d mismatch\n",len), exit(4); | ||
| 440 | if (memcmp(iv,nistvector+len-tail,sizeof(iv))) | ||
| 441 | fprintf(stderr,"iv_%d mismatch\n",len), exit(4); | ||
| 442 | } | ||
| 443 | |||
| 444 | int main() | ||
| 247 | { | 445 | { |
| 248 | AES_set_encrypt_key(test_key,128,&encks); | 446 | AES_set_encrypt_key(test_key,128,&encks); |
| 249 | AES_set_decrypt_key(test_key,128,&decks); | 447 | AES_set_decrypt_key(test_key,128,&decks); |
| @@ -254,6 +452,14 @@ main() | |||
| 254 | test_vector(vector_47,sizeof(vector_47)); | 452 | test_vector(vector_47,sizeof(vector_47)); |
| 255 | test_vector(vector_48,sizeof(vector_48)); | 453 | test_vector(vector_48,sizeof(vector_48)); |
| 256 | test_vector(vector_64,sizeof(vector_64)); | 454 | test_vector(vector_64,sizeof(vector_64)); |
| 257 | exit(0); | 455 | |
| 456 | test_nistvector(vector_17,sizeof(vector_17)); | ||
| 457 | test_nistvector(vector_31,sizeof(vector_31)); | ||
| 458 | test_nistvector(vector_32,sizeof(vector_32)); | ||
| 459 | test_nistvector(vector_47,sizeof(vector_47)); | ||
| 460 | test_nistvector(vector_48,sizeof(vector_48)); | ||
| 461 | test_nistvector(vector_64,sizeof(vector_64)); | ||
| 462 | |||
| 463 | return 0; | ||
| 258 | } | 464 | } |
| 259 | #endif | 465 | #endif |
diff --git a/src/lib/libcrypto/modes/gcm128.c b/src/lib/libcrypto/modes/gcm128.c new file mode 100644 index 0000000000..7d6d034970 --- /dev/null +++ b/src/lib/libcrypto/modes/gcm128.c | |||
| @@ -0,0 +1,1757 @@ | |||
| 1 | /* ==================================================================== | ||
| 2 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
| 3 | * | ||
| 4 | * Redistribution and use in source and binary forms, with or without | ||
| 5 | * modification, are permitted provided that the following conditions | ||
| 6 | * are met: | ||
| 7 | * | ||
| 8 | * 1. Redistributions of source code must retain the above copyright | ||
| 9 | * notice, this list of conditions and the following disclaimer. | ||
| 10 | * | ||
| 11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer in | ||
| 13 | * the documentation and/or other materials provided with the | ||
| 14 | * distribution. | ||
| 15 | * | ||
| 16 | * 3. All advertising materials mentioning features or use of this | ||
| 17 | * software must display the following acknowledgment: | ||
| 18 | * "This product includes software developed by the OpenSSL Project | ||
| 19 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 20 | * | ||
| 21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 22 | * endorse or promote products derived from this software without | ||
| 23 | * prior written permission. For written permission, please contact | ||
| 24 | * openssl-core@openssl.org. | ||
| 25 | * | ||
| 26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 27 | * nor may "OpenSSL" appear in their names without prior written | ||
| 28 | * permission of the OpenSSL Project. | ||
| 29 | * | ||
| 30 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 31 | * acknowledgment: | ||
| 32 | * "This product includes software developed by the OpenSSL Project | ||
| 33 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 34 | * | ||
| 35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 47 | * ==================================================================== | ||
| 48 | */ | ||
| 49 | |||
| 50 | #define OPENSSL_FIPSAPI | ||
| 51 | |||
| 52 | #include <openssl/crypto.h> | ||
| 53 | #include "modes_lcl.h" | ||
| 54 | #include <string.h> | ||
| 55 | |||
| 56 | #ifndef MODES_DEBUG | ||
| 57 | # ifndef NDEBUG | ||
| 58 | # define NDEBUG | ||
| 59 | # endif | ||
| 60 | #endif | ||
| 61 | #include <assert.h> | ||
| 62 | |||
| 63 | #if defined(BSWAP4) && defined(STRICT_ALIGNMENT) | ||
| 64 | /* redefine, because alignment is ensured */ | ||
| 65 | #undef GETU32 | ||
| 66 | #define GETU32(p) BSWAP4(*(const u32 *)(p)) | ||
| 67 | #undef PUTU32 | ||
| 68 | #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) | ||
| 69 | #endif | ||
| 70 | |||
/* Place a 16-bit reduction constant in the top 16 bits of a size_t so
 * the same reduction tables serve both 32- and 64-bit builds. */
#define PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
/* Divide V by x in GF(2^128): shift the 128-bit value right one bit
 * and, if a 1 bit fell off the low end, fold the GCM field polynomial
 * (0xE1 followed by 120 zero bits) back into the top. The branch on
 * sizeof(size_t) is resolved at compile time. */
#define REDUCE1BIT(V)	do { \
	if (sizeof(size_t)==8) { \
		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
		V.lo  = (V.hi<<63)|(V.lo>>1); \
		V.hi  = (V.hi>>1 )^T; \
	} \
	else { \
		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
		V.lo  = (V.hi<<63)|(V.lo>>1); \
		V.hi  = (V.hi>>1 )^((u64)T<<32); \
	} \
} while(0)
| 84 | |||
| 85 | /* | ||
| 86 | * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should | ||
| 87 | * never be set to 8. 8 is effectively reserved for testing purposes. | ||
| 88 | * TABLE_BITS>1 are lookup-table-driven implementations referred to as | ||
| 89 | * "Shoup's" in GCM specification. In other words OpenSSL does not cover | ||
| 90 | * whole spectrum of possible table driven implementations. Why? In | ||
| 91 | * non-"Shoup's" case memory access pattern is segmented in such manner, | ||
| 92 | * that it's trivial to see that cache timing information can reveal | ||
| 93 | * fair portion of intermediate hash value. Given that ciphertext is | ||
| 94 | * always available to attacker, it's possible for him to attempt to | ||
| 95 | * deduce secret parameter H and if successful, tamper with messages | ||
| 96 | * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's | ||
| 97 | * not as trivial, but there is no reason to believe that it's resistant | ||
| 98 | * to cache-timing attack. And the thing about "8-bit" implementation is | ||
| 99 | * that it consumes 16 (sixteen) times more memory, 4KB per individual | ||
 * key + 1KB shared. Well, on the pros side it should be twice as fast as
| 101 | * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version | ||
| 102 | * was observed to run ~75% faster, closer to 100% for commercial | ||
| 103 | * compilers... Yet "4-bit" procedure is preferred, because it's | ||
| 104 | * believed to provide better security-performance balance and adequate | ||
| 105 | * all-round performance. "All-round" refers to things like: | ||
| 106 | * | ||
| 107 | * - shorter setup time effectively improves overall timing for | ||
| 108 | * handling short messages; | ||
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example, on Windows a large enough free
 *   results in VM working-set trimming, meaning that a subsequent
 *   malloc would immediately incur working-set expansion);
| 113 | * - larger table has larger cache footprint, which can affect | ||
| 114 | * performance of other code paths (not necessarily even from same | ||
| 115 | * thread in Hyper-Threading world); | ||
| 116 | * | ||
| 117 | * Value of 1 is not appropriate for performance reasons. | ||
| 118 | */ | ||
| 119 | #if TABLE_BITS==8 | ||
| 120 | |||
| 121 | static void gcm_init_8bit(u128 Htable[256], u64 H[2]) | ||
| 122 | { | ||
| 123 | int i, j; | ||
| 124 | u128 V; | ||
| 125 | |||
| 126 | Htable[0].hi = 0; | ||
| 127 | Htable[0].lo = 0; | ||
| 128 | V.hi = H[0]; | ||
| 129 | V.lo = H[1]; | ||
| 130 | |||
| 131 | for (Htable[128]=V, i=64; i>0; i>>=1) { | ||
| 132 | REDUCE1BIT(V); | ||
| 133 | Htable[i] = V; | ||
| 134 | } | ||
| 135 | |||
| 136 | for (i=2; i<256; i<<=1) { | ||
| 137 | u128 *Hi = Htable+i, H0 = *Hi; | ||
| 138 | for (j=1; j<i; ++j) { | ||
| 139 | Hi[j].hi = H0.hi^Htable[j].hi; | ||
| 140 | Hi[j].lo = H0.lo^Htable[j].lo; | ||
| 141 | } | ||
| 142 | } | ||
| 143 | } | ||
| 144 | |||
/*
 * "8-bit Shoup" GHASH multiplication: Xi = Xi * H in GF(2^128),
 * consuming one byte of Xi per loop iteration via the 256-entry
 * Htable built by gcm_init_8bit(). rem_8bit folds the eight bits
 * shifted out at the low end back into the top of Z (reduction by
 * the GCM polynomial); its entries are pre-shifted with PACK() so a
 * plain size_t XOR works on 64-bit builds.
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi+15;	/* walk Xi from last byte to first */
	size_t rem, n = *xi;
	const union { long one; char little; } is_endian = {1};
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		/* accumulate the contribution of the current byte */
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi==xi)	break;	/* all 16 bytes consumed */

		n = *(--xi);

		/* shift Z right by one byte and reduce: the low byte that
		 * falls off selects the pre-computed reduction constant */
		rem  = (size_t)Z.lo&0xff;
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_8bit[rem];
		else	/* 32-bit size_t: PACK() left the constant 32 bits low */
			Z.hi ^= (u64)rem_8bit[rem]<<32;
	}

	/* write Z back to Xi in big-endian (wire) byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
| 252 | #define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable) | ||
| 253 | |||
| 254 | #elif TABLE_BITS==4 | ||
| 255 | |||
/*
 * Populate the 16-entry lookup table used by the "4-bit Shoup" GHASH
 * routines: on return Htable[i] = i*H in GF(2^128) for i = 0..15.
 * H is supplied in host byte order (see CRYPTO_gcm128_init). The
 * small-footprint build derives entries in loops; the default build
 * unrolls the same computation.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int  i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	/* power-of-two entries by repeated halving: 8=H, 4=H/x, 2, 1 */
	for (Htable[8]=V, i=4; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	/* remaining entries are XOR combinations of the powers */
	for (i=2; i<16; i<<=1) {
		u128 *Hi = Htable+i;
		int   j;
		for (V=*Hi, j=1; j<i; ++j) {
			Hi[j].hi = V.hi^Htable[j].hi;
			Hi[j].lo = V.lo^Htable[j].lo;
		}
	}
#else
	/* unrolled variant of exactly the same table construction */
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
	V=Htable[4];
	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
	V=Htable[8];
	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
	int j;
	const union { long one; char little; } is_endian = {1};

	if (is_endian.little)
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
	else	/* big-endian: additionally swap 32-bit halves */
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo<<32|V.lo>>32;
			Htable[j].lo = V.hi<<32|V.hi>>32;
		}
	}
#endif
}
| 327 | |||
#ifndef GHASH_ASM
/* Reduction table for the 4-bit path: entry i is the polynomial
 * contribution of the four bits shifted out at the low end of Z,
 * pre-shifted into the top 16 bits of a size_t via PACK(). */
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
| 334 | |||
/*
 * "4-bit Shoup" GHASH multiplication: Xi = Xi * H in GF(2^128),
 * consuming one nibble of Xi per half-iteration using the 16-entry
 * Htable; rem_4bit folds the bits shifted out at the low end back
 * into the top (reduction by the GCM polynomial).
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;	/* byte index into Xi, processed last-to-first */
	size_t rem, nlo, nhi;
	const union { long one; char little; } is_endian = {1};

	/* split the last byte into low and high nibbles */
	nlo  = ((const u8 *)Xi)[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		/* shift Z right 4 bits, reduce, then fold in the high nibble */
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else	/* 32-bit size_t: constant sits 32 bits lower */
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;	/* all 16 bytes done */

		nlo  = ((const u8 *)Xi)[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		/* same shift/reduce step for the low nibble */
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	/* write Z back to Xi in big-endian (wire) byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
| 397 | |||
| 398 | #if !defined(OPENSSL_SMALL_FOOTPRINT) | ||
| 399 | /* | ||
| 400 | * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for | ||
| 401 | * details... Compiler-generated code doesn't seem to give any | ||
| 402 | * performance improvement, at least not on x86[_64]. It's here | ||
| 403 | * mostly as reference and a placeholder for possible future | ||
| 404 | * non-trivial optimization[s]... | ||
| 405 | */ | ||
/*
 * Streamed 4-bit GHASH: absorbs 'len' bytes (a multiple of 16) from
 * 'inp' into the running hash Xi, i.e. Xi = (Xi ^ block) * H per
 * 16-byte block. The input XOR is fused into the nibble extraction,
 * avoiding a separate pass over Xi. NOTE: the two #if branches share
 * the store-back epilogue and the closing "while (inp+=16, len-=16)";
 * neither branch is complete on its own.
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {	/* once per 16-byte input block */
	cnt  = 15;
	/* fold the input byte into Xi before nibble extraction */
	nlo  = ((const u8 *)Xi)[15];
	nlo ^= inp[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		/* shift Z right 4 bits and reduce via rem_4bit */
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;	/* 16 bytes consumed */

		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
    static const unsigned short rem_8bit[256] = {
	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
	Z.hi = Htable[cnt].hi;
	Z.lo = Htable[cnt].lo;
	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
	Hshr4[cnt].hi = (Z.hi>>4);
	Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {	/* once per 16-byte input block */
	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem  = (size_t)Z.lo&0xff;

		/* shift a whole byte per iteration, reduce via rem_8bit */
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);

		Z.hi ^= Hshr4[nhi].hi;
		Z.lo ^= Hshr4[nhi].lo;
		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
	}

	/* byte 0 needs only a 4-bit final shift */
	nlo  = ((const u8 *)Xi)[0];
	nlo ^= inp[0];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi ^= Htable[nlo].hi;
	Z.lo ^= Htable[nlo].lo;

	rem  = (size_t)Z.lo&0xf;

	Z.lo = (Z.hi<<60)|(Z.lo>>4);
	Z.hi = (Z.hi>>4);

	Z.hi ^= Htable[nhi].hi;
	Z.lo ^= Htable[nhi].lo;
	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

	/* shared epilogue: store Z back to Xi in big-endian order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
    } while (inp+=16, len-=16);
}
| 568 | #endif | ||
| 569 | #else | ||
| 570 | void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]); | ||
| 571 | void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); | ||
| 572 | #endif | ||
| 573 | |||
/* Multiply the running hash (ctx->Xi here) by H using the 4-bit table. */
#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
/* Absorb 'len' bytes (a multiple of 16) into the hash in one call. */
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
 * trashing effect. In other words idea is to hash data while it's
 * still in L1 cache after encryption pass... */
#define GHASH_CHUNK       (3*1024)
#endif
| 582 | |||
| 583 | #else /* TABLE_BITS */ | ||
| 584 | |||
/*
 * Table-free GHASH multiplication: Xi = Xi * H in GF(2^128), one bit
 * at a time (TABLE_BITS==1 build). For each set bit of Xi, scanned
 * MSB-first, the current multiple V of H is accumulated into Z; V is
 * halved (REDUCE1BIT) after every bit. The bit mask M is derived
 * branch-free from the sign of the shifted word.
 */
static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
	u128 V,Z = { 0,0 };
	long X;
	int  i,j;
	const long *xi = (const long *)Xi;
	const union { long one; char little; } is_endian = {1};

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	/* process Xi one native word at a time, converting each word
	 * from big-endian (wire) order to host order first */
	for (j=0; j<16/sizeof(long); ++j) {
		if (is_endian.little) {
			if (sizeof(long)==8) {
#ifdef BSWAP8
				X = (long)(BSWAP8(xi[j]));
#else
				const u8 *p = (const u8 *)(xi+j);
				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
			}
			else {
				const u8 *p = (const u8 *)(xi+j);
				X = (long)GETU32(p);
			}
		}
		else
			X = xi[j];

		/* MSB-first over the word; M is all-ones when the top
		 * bit of X is set, all-zeros otherwise */
		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
			u64 M = (u64)(X>>(8*sizeof(long)-1));
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;

			REDUCE1BIT(V);
		}
	}

	/* write Z back to Xi in big-endian (wire) byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
| 641 | #define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u) | ||
| 642 | |||
| 643 | #endif | ||
| 644 | |||
/*
 * Assembler back-ends for the 4-bit GHASH path. When one may be
 * available at run time, GCM_FUNCREF_4BIT is defined and GCM_MUL/GHASH
 * are redirected (below) through function pointers that
 * CRYPTO_gcm128_init selects per CPU capabilities.
 */
#if	TABLE_BITS==4 && defined(GHASH_ASM)
# if	!defined(I386_ONLY) && \
	(defined(__i386)	|| defined(__i386__)	|| \
	 defined(__x86_64)	|| defined(__x86_64__)	|| \
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

/* PCLMULQDQ carry-less multiplication implementation */
void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

# if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
#  define GHASH_ASM_X86
/* 32-bit x86: MMX variant (preferred) and plain-x86 fallback */
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
/* NEON implementation, selected at run time via OPENSSL_armcap_P */
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
/* Route GCM_MUL/GHASH through per-context function pointers; callers
 * must declare local gcm_gmult_p/gcm_ghash_p copies (see users). */
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef  GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif
| 685 | |||
/*
 * Initialize a GCM context: zero all state, record the block cipher
 * and key, derive the hash subkey H = E(K, 0^128), convert it to host
 * byte order, and build the multiplication table / select the fastest
 * available GHASH implementation for this CPU.
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
	const union { long one; char little; } is_endian = {1};

	memset(ctx,0,sizeof(*ctx));
	ctx->block = block;
	ctx->key   = key;

	/* H = E(K, 0^128); ctx->H.c is all-zero after the memset */
	(*block)(ctx->H.c,ctx->H.c,key);

	if (is_endian.little) {
		/* H is stored in host byte order */
#ifdef BSWAP8
		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
		u8 *p = ctx->H.c;
		u64 hi,lo;
		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
		ctx->H.u[0] = hi;
		ctx->H.u[1] = lo;
#endif
	}

#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif	TABLE_BITS==4
# if	defined(GHASH_ASM_X86_OR_64)
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	/* prefer PCLMULQDQ when the CPU supports it */
	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
		gcm_init_clmul(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if	defined(GHASH_ASM_X86)			/* x86 only */
#   if	defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
#   else
	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif	defined(GHASH_ASM_ARM)
	/* NEON when available, generic 4-bit table otherwise */
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
| 755 | |||
/*
 * Start a new GCM message: reset per-message state and derive the
 * pre-counter block Y0 from the IV. A 96-bit IV is used directly as
 * Y0 = IV || 0^31 || 1 (no hashing); any other length is GHASHed
 * together with its 64-bit length, per the GCM specification. Leaves
 * EK0 = E(K, Y0) in the context (needed for the final tag) and Yi set
 * to the first counter block.
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	/* reset all per-message state */
	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;		/* residual AAD bytes */
	ctx->mres = 0;		/* residual message bytes */

	if (len==12) {
		/* recommended 96-bit IV: used verbatim, counter starts at 1 */
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		/* GHASH the IV in 16-byte blocks ... */
		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		/* ... a trailing partial block is implicitly zero-padded ... */
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		/* ... then fold in the IV length in bits (big-endian) */
		len0 <<= 3;
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		/* the low 32 bits of Y0 seed the counter */
		if (is_endian.little)
			ctr = GETU32(ctx->Yi.c+12);
		else
			ctr = ctx->Yi.d[3];
	}

	/* EK0 = E(K, Y0), retained for the final authentication tag */
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	/* Yi now holds the first counter block used for en/decryption */
	if (is_endian.little)
		PUTU32(ctx->Yi.c+12,ctr);
	else
		ctx->Yi.d[3] = ctr;
}
| 825 | |||
/*
 * Absorb additional authenticated data (AAD) into the hash. May be
 * called repeatedly, but only before any message data is processed.
 * Partial blocks are buffered via ctx->ares and completed on the next
 * call (or finalized by the first encrypt/decrypt call).
 *
 * Returns 0 on success, -2 if message processing has already started,
 * -1 if the accumulated AAD exceeds the GCM limit (2^64 bits, i.e.
 * 2^61 bytes) or the length counter would wrap.
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;	/* message data already seen */

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	/* top up a previously buffered partial block first */
	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);	/* block completed */
		else {
			ctx->ares = n;		/* still partial; wait for more */
			return 0;
		}
	}

#ifdef GHASH
	/* bulk path: hash all whole 16-byte blocks in one call */
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	/* buffer any trailing partial block in Xi; n records how much */
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
| 882 | |||
/*
 * GCM encryption: CTR-mode encrypt `len` bytes from `in` to `out`
 * (in/out may alias) while folding the produced ciphertext into the
 * GHASH accumulator Xi.  Returns -1 if the total message length would
 * exceed the 2^36-32 byte GCM limit, 0 on success.  A trailing partial
 * block is carried across calls in ctx->mres; the block cipher is
 * invoked through ctx->block, the counter lives in Yi.c[12..15].
 */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	/* Local function-pointer copies consumed by GCM_MUL/GHASH macros. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	/* Enforce the per-message plaintext limit and catch wrap-around. */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* Load the 32-bit invocation counter from the last word of Yi. */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		/* Finish a partial keystream block left from a prior call. */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* Word-sized loads below require aligned in/out; otherwise
		 * fall through to the byte-wise loop after the do/while. */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* Bulk path: encrypt a whole chunk, then GHASH it at once. */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			/* XOR keystream into data one machine word at a time. */
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    /* Hash the ciphertext just written. */
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		/* Remaining whole blocks (< GHASH_CHUNK). */
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		/* No bulk GHASH available: hash block-by-block. */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			/* Encrypt and fold into Xi in the same word loop. */
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(ctx->Xi.c+i) ^=
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* Trailing partial block: generate one more keystream block
		 * and remember how much of it was consumed (n). */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* Portable byte-wise fallback (small footprint / unaligned data). */
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
| 1032 | |||
/*
 * GCM decryption: fold the incoming ciphertext into GHASH (Xi), then
 * CTR-decrypt it into `out`.  Mirror image of CRYPTO_gcm128_encrypt —
 * the only difference is that GHASH is computed over `in` (ciphertext)
 * BEFORE it is overwritten, which matters when in == out.  Returns -1
 * on length overflow, 0 on success; partial block carried in ctx->mres.
 */
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	/* Local function-pointer copies consumed by GCM_MUL/GHASH macros. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	/* Enforce the per-message length limit and catch wrap-around. */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* Load the 32-bit invocation counter from the last word of Yi. */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		/* Finish a partial keystream block left from a prior call;
		 * note ciphertext byte is saved before being overwritten. */
		if (n) {
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* Word-sized loads below require aligned in/out. */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* Bulk path: hash the ciphertext chunk first, then decrypt. */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		/* Remaining whole blocks (< GHASH_CHUNK). */
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		/* No bulk GHASH: hash block-by-block, saving each ciphertext
		 * word in c before out (possibly aliasing in) is written. */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t)) {
				size_t c = *(size_t *)(in+i);
				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
				*(size_t *)(ctx->Xi.c+i) ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* Trailing partial block. */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* Portable byte-wise fallback (small footprint / unaligned data). */
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
| 1185 | |||
/*
 * GCM encryption using a caller-supplied bulk CTR routine (`stream`),
 * which encrypts N whole blocks per call with a 32-bit big-endian
 * counter in Yi.c[12..15].  GHASH is applied to the ciphertext after
 * each bulk step.  Same length limit, return values and mres carry-over
 * semantics as CRYPTO_gcm128_encrypt.
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	/* Local function-pointer copies consumed by GCM_MUL/GHASH macros. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	/* Enforce the per-message length limit and catch wrap-around. */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* Finish a partial keystream block left from a prior call. */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* Chunked bulk path: stream-encrypt, bump counter, hash output. */
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* Remaining whole blocks. */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	/* Trailing partial block: one single-block encryption of Yi. */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
| 1284 | |||
/*
 * GCM decryption using a caller-supplied bulk CTR routine (`stream`).
 * Mirror of CRYPTO_gcm128_encrypt_ctr32: the ciphertext is folded into
 * GHASH BEFORE the stream call so that in == out aliasing is safe.
 * Same length limit, return values and mres semantics as the other
 * encrypt/decrypt entry points.
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	/* Local function-pointer copies consumed by GCM_MUL/GHASH macros. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	/* Enforce the per-message length limit and catch wrap-around. */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* Finish a partial keystream block left from a prior call. */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* Chunked bulk path: hash the ciphertext, then stream-decrypt it. */
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* Remaining whole blocks. */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		/* Block-by-block GHASH over the ciphertext, then rewind
		 * in/j so the stream call sees the original range. */
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		j = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	/* Trailing partial block: one single-block encryption of Yi. */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
| 1390 | |||
| 1391 | int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, | ||
| 1392 | size_t len) | ||
| 1393 | { | ||
| 1394 | const union { long one; char little; } is_endian = {1}; | ||
| 1395 | u64 alen = ctx->len.u[0]<<3; | ||
| 1396 | u64 clen = ctx->len.u[1]<<3; | ||
| 1397 | #ifdef GCM_FUNCREF_4BIT | ||
| 1398 | void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; | ||
| 1399 | #endif | ||
| 1400 | |||
| 1401 | if (ctx->mres) | ||
| 1402 | GCM_MUL(ctx,Xi); | ||
| 1403 | |||
| 1404 | if (is_endian.little) { | ||
| 1405 | #ifdef BSWAP8 | ||
| 1406 | alen = BSWAP8(alen); | ||
| 1407 | clen = BSWAP8(clen); | ||
| 1408 | #else | ||
| 1409 | u8 *p = ctx->len.c; | ||
| 1410 | |||
| 1411 | ctx->len.u[0] = alen; | ||
| 1412 | ctx->len.u[1] = clen; | ||
| 1413 | |||
| 1414 | alen = (u64)GETU32(p) <<32|GETU32(p+4); | ||
| 1415 | clen = (u64)GETU32(p+8)<<32|GETU32(p+12); | ||
| 1416 | #endif | ||
| 1417 | } | ||
| 1418 | |||
| 1419 | ctx->Xi.u[0] ^= alen; | ||
| 1420 | ctx->Xi.u[1] ^= clen; | ||
| 1421 | GCM_MUL(ctx,Xi); | ||
| 1422 | |||
| 1423 | ctx->Xi.u[0] ^= ctx->EK0.u[0]; | ||
| 1424 | ctx->Xi.u[1] ^= ctx->EK0.u[1]; | ||
| 1425 | |||
| 1426 | if (tag && len<=sizeof(ctx->Xi)) | ||
| 1427 | return memcmp(ctx->Xi.c,tag,len); | ||
| 1428 | else | ||
| 1429 | return -1; | ||
| 1430 | } | ||
| 1431 | |||
| 1432 | void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) | ||
| 1433 | { | ||
| 1434 | CRYPTO_gcm128_finish(ctx, NULL, 0); | ||
| 1435 | memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c)); | ||
| 1436 | } | ||
| 1437 | |||
| 1438 | GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block) | ||
| 1439 | { | ||
| 1440 | GCM128_CONTEXT *ret; | ||
| 1441 | |||
| 1442 | if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT)))) | ||
| 1443 | CRYPTO_gcm128_init(ret,key,block); | ||
| 1444 | |||
| 1445 | return ret; | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx) | ||
| 1449 | { | ||
| 1450 | if (ctx) { | ||
| 1451 | OPENSSL_cleanse(ctx,sizeof(*ctx)); | ||
| 1452 | OPENSSL_free(ctx); | ||
| 1453 | } | ||
| 1454 | } | ||
| 1455 | |||
#if defined(SELFTEST)
#include <stdio.h>
#include <openssl/aes.h>

/*
 * Known-answer test vectors exercised by TEST_CASE/main below.
 * For each case n: K=key, P=plaintext, A=AAD, IV=nonce, C=expected
 * ciphertext, T=expected tag.  NULL P/A/C mean "absent"; unsized
 * arrays like K1[16] rely on C zero-initialization (all-zero data).
 * Cases 1-6 use 128-bit keys, 7-12 use 192-bit, 13-18 use 256-bit;
 * presumably the standard AES-GCM validation set — verify against
 * the GCM specification's appendix if touching these.
 */

/* Test Case 1 */
static const u8	K1[16],
		*P1=NULL,
		*A1=NULL,
		IV1[12],
		*C1=NULL,
		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};

/* Test Case 2 */
#define K2 K1
#define A2 A1
#define IV2 IV1
static const u8	P2[16],
		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};

/* Test Case 3 */
#define A3 A2
static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};

/* Test Case 4 */
#define K4 K3
#define IV4 IV3
static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};

/* Test Case 5 */
#define K5 K4
#define P5 P4
#define A5 A4
static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};

/* Test Case 6 */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};

/* Test Case 7 */
static const u8 K7[24],
		*P7=NULL,
		*A7=NULL,
		IV7[12],
		*C7=NULL,
		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8 */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8	P8[16],
		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};

/* Test Case 9 */
#define A9 A8
static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};

/* Test Case 10 */
#define K10 K9
#define IV10 IV9
static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};

/* Test Case 11 */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};

/* Test Case 12 */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};

/* Test Case 13 */
static const u8	K13[32],
		*P13=NULL,
		*A13=NULL,
		IV13[12],
		*C13=NULL,
		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/* Test Case 14 */
#define K14 K13
#define A14 A13
static const u8	P14[16],
		IV14[12],
		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};

/* Test Case 15 */
#define A15 A14
static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};

/* Test Case 16 */
#define K16 K15
#define IV16 IV15
static const u8	P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};

/* Test Case 17 */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};

/* Test Case 18 */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8	IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
			0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
			0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
			0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
		T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
| 1672 | #define TEST_CASE(n) do { \ | ||
| 1673 | u8 out[sizeof(P##n)]; \ | ||
| 1674 | AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \ | ||
| 1675 | CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \ | ||
| 1676 | CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \ | ||
| 1677 | memset(out,0,sizeof(out)); \ | ||
| 1678 | if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \ | ||
| 1679 | if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \ | ||
| 1680 | if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \ | ||
| 1681 | (C##n && memcmp(out,C##n,sizeof(out)))) \ | ||
| 1682 | ret++, printf ("encrypt test#%d failed.\n",n); \ | ||
| 1683 | CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \ | ||
| 1684 | memset(out,0,sizeof(out)); \ | ||
| 1685 | if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \ | ||
| 1686 | if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \ | ||
| 1687 | if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \ | ||
| 1688 | (P##n && memcmp(out,P##n,sizeof(out)))) \ | ||
| 1689 | ret++, printf ("decrypt test#%d failed.\n",n); \ | ||
| 1690 | } while(0) | ||
| 1691 | |||
| 1692 | int main() | ||
| 1693 | { | ||
| 1694 | GCM128_CONTEXT ctx; | ||
| 1695 | AES_KEY key; | ||
| 1696 | int ret=0; | ||
| 1697 | |||
| 1698 | TEST_CASE(1); | ||
| 1699 | TEST_CASE(2); | ||
| 1700 | TEST_CASE(3); | ||
| 1701 | TEST_CASE(4); | ||
| 1702 | TEST_CASE(5); | ||
| 1703 | TEST_CASE(6); | ||
| 1704 | TEST_CASE(7); | ||
| 1705 | TEST_CASE(8); | ||
| 1706 | TEST_CASE(9); | ||
| 1707 | TEST_CASE(10); | ||
| 1708 | TEST_CASE(11); | ||
| 1709 | TEST_CASE(12); | ||
| 1710 | TEST_CASE(13); | ||
| 1711 | TEST_CASE(14); | ||
| 1712 | TEST_CASE(15); | ||
| 1713 | TEST_CASE(16); | ||
| 1714 | TEST_CASE(17); | ||
| 1715 | TEST_CASE(18); | ||
| 1716 | |||
| 1717 | #ifdef OPENSSL_CPUID_OBJ | ||
| 1718 | { | ||
| 1719 | size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc(); | ||
| 1720 | union { u64 u; u8 c[1024]; } buf; | ||
| 1721 | int i; | ||
| 1722 | |||
| 1723 | AES_set_encrypt_key(K1,sizeof(K1)*8,&key); | ||
| 1724 | CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); | ||
| 1725 | CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1)); | ||
| 1726 | |||
| 1727 | CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf)); | ||
| 1728 | start = OPENSSL_rdtsc(); | ||
| 1729 | CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf)); | ||
| 1730 | gcm_t = OPENSSL_rdtsc() - start; | ||
| 1731 | |||
| 1732 | CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf), | ||
| 1733 | &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres, | ||
| 1734 | (block128_f)AES_encrypt); | ||
| 1735 | start = OPENSSL_rdtsc(); | ||
| 1736 | CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf), | ||
| 1737 | &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres, | ||
| 1738 | (block128_f)AES_encrypt); | ||
| 1739 | ctr_t = OPENSSL_rdtsc() - start; | ||
| 1740 | |||
| 1741 | printf("%.2f-%.2f=%.2f\n", | ||
| 1742 | gcm_t/(double)sizeof(buf), | ||
| 1743 | ctr_t/(double)sizeof(buf), | ||
| 1744 | (gcm_t-ctr_t)/(double)sizeof(buf)); | ||
| 1745 | #ifdef GHASH | ||
| 1746 | GHASH(&ctx,buf.c,sizeof(buf)); | ||
| 1747 | start = OPENSSL_rdtsc(); | ||
| 1748 | for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf)); | ||
| 1749 | gcm_t = OPENSSL_rdtsc() - start; | ||
| 1750 | printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i); | ||
| 1751 | #endif | ||
| 1752 | } | ||
| 1753 | #endif | ||
| 1754 | |||
| 1755 | return ret; | ||
| 1756 | } | ||
| 1757 | #endif | ||
diff --git a/src/lib/libcrypto/modes/modes.h b/src/lib/libcrypto/modes/modes.h index af8d97d795..f18215bb2b 100644 --- a/src/lib/libcrypto/modes/modes.h +++ b/src/lib/libcrypto/modes/modes.h | |||
| @@ -15,6 +15,14 @@ typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out, | |||
| 15 | size_t len, const void *key, | 15 | size_t len, const void *key, |
| 16 | unsigned char ivec[16], int enc); | 16 | unsigned char ivec[16], int enc); |
| 17 | 17 | ||
| 18 | typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out, | ||
| 19 | size_t blocks, const void *key, | ||
| 20 | const unsigned char ivec[16]); | ||
| 21 | |||
| 22 | typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out, | ||
| 23 | size_t blocks, const void *key, | ||
| 24 | const unsigned char ivec[16],unsigned char cmac[16]); | ||
| 25 | |||
| 18 | void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out, | 26 | void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out, |
| 19 | size_t len, const void *key, | 27 | size_t len, const void *key, |
| 20 | unsigned char ivec[16], block128_f block); | 28 | unsigned char ivec[16], block128_f block); |
| @@ -27,6 +35,11 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, | |||
| 27 | unsigned char ivec[16], unsigned char ecount_buf[16], | 35 | unsigned char ivec[16], unsigned char ecount_buf[16], |
| 28 | unsigned int *num, block128_f block); | 36 | unsigned int *num, block128_f block); |
| 29 | 37 | ||
| 38 | void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, | ||
| 39 | size_t len, const void *key, | ||
| 40 | unsigned char ivec[16], unsigned char ecount_buf[16], | ||
| 41 | unsigned int *num, ctr128_f ctr); | ||
| 42 | |||
| 30 | void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out, | 43 | void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out, |
| 31 | size_t len, const void *key, | 44 | size_t len, const void *key, |
| 32 | unsigned char ivec[16], int *num, | 45 | unsigned char ivec[16], int *num, |
| @@ -57,3 +70,66 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, | |||
| 57 | size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, | 70 | size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, |
| 58 | size_t len, const void *key, | 71 | size_t len, const void *key, |
| 59 | unsigned char ivec[16], cbc128_f cbc); | 72 | unsigned char ivec[16], cbc128_f cbc); |
| 73 | |||
| 74 | size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out, | ||
| 75 | size_t len, const void *key, | ||
| 76 | unsigned char ivec[16], block128_f block); | ||
| 77 | size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out, | ||
| 78 | size_t len, const void *key, | ||
| 79 | unsigned char ivec[16], cbc128_f cbc); | ||
| 80 | size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out, | ||
| 81 | size_t len, const void *key, | ||
| 82 | unsigned char ivec[16], block128_f block); | ||
| 83 | size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out, | ||
| 84 | size_t len, const void *key, | ||
| 85 | unsigned char ivec[16], cbc128_f cbc); | ||
| 86 | |||
| 87 | typedef struct gcm128_context GCM128_CONTEXT; | ||
| 88 | |||
| 89 | GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block); | ||
| 90 | void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block); | ||
| 91 | void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, | ||
| 92 | size_t len); | ||
| 93 | int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, | ||
| 94 | size_t len); | ||
| 95 | int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, | ||
| 96 | const unsigned char *in, unsigned char *out, | ||
| 97 | size_t len); | ||
| 98 | int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, | ||
| 99 | const unsigned char *in, unsigned char *out, | ||
| 100 | size_t len); | ||
| 101 | int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, | ||
| 102 | const unsigned char *in, unsigned char *out, | ||
| 103 | size_t len, ctr128_f stream); | ||
| 104 | int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, | ||
| 105 | const unsigned char *in, unsigned char *out, | ||
| 106 | size_t len, ctr128_f stream); | ||
| 107 | int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, | ||
| 108 | size_t len); | ||
| 109 | void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len); | ||
| 110 | void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx); | ||
| 111 | |||
| 112 | typedef struct ccm128_context CCM128_CONTEXT; | ||
| 113 | |||
| 114 | void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, | ||
| 115 | unsigned int M, unsigned int L, void *key,block128_f block); | ||
| 116 | int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, | ||
| 117 | const unsigned char *nonce, size_t nlen, size_t mlen); | ||
| 118 | void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, | ||
| 119 | const unsigned char *aad, size_t alen); | ||
| 120 | int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, | ||
| 121 | const unsigned char *inp, unsigned char *out, size_t len); | ||
| 122 | int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, | ||
| 123 | const unsigned char *inp, unsigned char *out, size_t len); | ||
| 124 | int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, | ||
| 125 | const unsigned char *inp, unsigned char *out, size_t len, | ||
| 126 | ccm128_f stream); | ||
| 127 | int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, | ||
| 128 | const unsigned char *inp, unsigned char *out, size_t len, | ||
| 129 | ccm128_f stream); | ||
| 130 | size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len); | ||
| 131 | |||
| 132 | typedef struct xts128_context XTS128_CONTEXT; | ||
| 133 | |||
| 134 | int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16], | ||
| 135 | const unsigned char *inp, unsigned char *out, size_t len, int enc); | ||
diff --git a/src/lib/libcrypto/modes/modes_lcl.h b/src/lib/libcrypto/modes/modes_lcl.h new file mode 100644 index 0000000000..b6dc3c336f --- /dev/null +++ b/src/lib/libcrypto/modes/modes_lcl.h | |||
| @@ -0,0 +1,131 @@ | |||
| 1 | /* ==================================================================== | ||
| 2 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
| 3 | * | ||
| 4 | * Redistribution and use is governed by OpenSSL license. | ||
| 5 | * ==================================================================== | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <openssl/modes.h> | ||
| 9 | |||
| 10 | |||
| 11 | #if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) | ||
| 12 | typedef __int64 i64; | ||
| 13 | typedef unsigned __int64 u64; | ||
| 14 | #define U64(C) C##UI64 | ||
| 15 | #elif defined(__arch64__) | ||
| 16 | typedef long i64; | ||
| 17 | typedef unsigned long u64; | ||
| 18 | #define U64(C) C##UL | ||
| 19 | #else | ||
| 20 | typedef long long i64; | ||
| 21 | typedef unsigned long long u64; | ||
| 22 | #define U64(C) C##ULL | ||
| 23 | #endif | ||
| 24 | |||
| 25 | typedef unsigned int u32; | ||
| 26 | typedef unsigned char u8; | ||
| 27 | |||
| 28 | #define STRICT_ALIGNMENT 1 | ||
| 29 | #if defined(__i386) || defined(__i386__) || \ | ||
| 30 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 31 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 32 | defined(__s390__) || defined(__s390x__) || \ | ||
| 33 | ( (defined(__arm__) || defined(__arm)) && \ | ||
| 34 | (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ | ||
| 35 | defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)) ) | ||
| 36 | # undef STRICT_ALIGNMENT | ||
| 37 | #endif | ||
| 38 | |||
| 39 | #if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) | ||
| 40 | #if defined(__GNUC__) && __GNUC__>=2 | ||
| 41 | # if defined(__x86_64) || defined(__x86_64__) | ||
| 42 | # define BSWAP8(x) ({ u64 ret=(x); \ | ||
| 43 | asm ("bswapq %0" \ | ||
| 44 | : "+r"(ret)); ret; }) | ||
| 45 | # define BSWAP4(x) ({ u32 ret=(x); \ | ||
| 46 | asm ("bswapl %0" \ | ||
| 47 | : "+r"(ret)); ret; }) | ||
| 48 | # elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY) | ||
| 49 | # define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \ | ||
| 50 | asm ("bswapl %0; bswapl %1" \ | ||
| 51 | : "+r"(hi),"+r"(lo)); \ | ||
| 52 | (u64)hi<<32|lo; }) | ||
| 53 | # define BSWAP4(x) ({ u32 ret=(x); \ | ||
| 54 | asm ("bswapl %0" \ | ||
| 55 | : "+r"(ret)); ret; }) | ||
| 56 | # elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT) | ||
| 57 | # define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \ | ||
| 58 | asm ("rev %0,%0; rev %1,%1" \ | ||
| 59 | : "+r"(hi),"+r"(lo)); \ | ||
| 60 | (u64)hi<<32|lo; }) | ||
| 61 | # define BSWAP4(x) ({ u32 ret; \ | ||
| 62 | asm ("rev %0,%1" \ | ||
| 63 | : "=r"(ret) : "r"((u32)(x))); \ | ||
| 64 | ret; }) | ||
| 65 | # endif | ||
| 66 | #elif defined(_MSC_VER) | ||
| 67 | # if _MSC_VER>=1300 | ||
| 68 | # pragma intrinsic(_byteswap_uint64,_byteswap_ulong) | ||
| 69 | # define BSWAP8(x) _byteswap_uint64((u64)(x)) | ||
| 70 | # define BSWAP4(x) _byteswap_ulong((u32)(x)) | ||
| 71 | # elif defined(_M_IX86) | ||
| 72 | __inline u32 _bswap4(u32 val) { | ||
| 73 | _asm mov eax,val | ||
| 74 | _asm bswap eax | ||
| 75 | } | ||
| 76 | # define BSWAP4(x) _bswap4(x) | ||
| 77 | # endif | ||
| 78 | #endif | ||
| 79 | #endif | ||
| 80 | |||
| 81 | #if defined(BSWAP4) && !defined(STRICT_ALIGNMENT) | ||
| 82 | #define GETU32(p) BSWAP4(*(const u32 *)(p)) | ||
| 83 | #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) | ||
| 84 | #else | ||
| 85 | #define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) | ||
| 86 | #define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) | ||
| 87 | #endif | ||
| 88 | |||
| 89 | /* GCM definitions */ | ||
| 90 | |||
| 91 | typedef struct { u64 hi,lo; } u128; | ||
| 92 | |||
| 93 | #ifdef TABLE_BITS | ||
| 94 | #undef TABLE_BITS | ||
| 95 | #endif | ||
| 96 | /* | ||
| 97 | * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should | ||
| 98 | * never be set to 8 [or 1]. For further information see gcm128.c. | ||
| 99 | */ | ||
| 100 | #define TABLE_BITS 4 | ||
| 101 | |||
| 102 | struct gcm128_context { | ||
| 103 | /* Following 6 names follow names in GCM specification */ | ||
| 104 | union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,len, | ||
| 105 | Xi,H; | ||
| 106 | /* Relative position of Xi, H and pre-computed Htable is used | ||
| 107 | * in some assembler modules, i.e. don't change the order! */ | ||
| 108 | #if TABLE_BITS==8 | ||
| 109 | u128 Htable[256]; | ||
| 110 | #else | ||
| 111 | u128 Htable[16]; | ||
| 112 | void (*gmult)(u64 Xi[2],const u128 Htable[16]); | ||
| 113 | void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); | ||
| 114 | #endif | ||
| 115 | unsigned int mres, ares; | ||
| 116 | block128_f block; | ||
| 117 | void *key; | ||
| 118 | }; | ||
| 119 | |||
| 120 | struct xts128_context { | ||
| 121 | void *key1, *key2; | ||
| 122 | block128_f block1,block2; | ||
| 123 | }; | ||
| 124 | |||
| 125 | struct ccm128_context { | ||
| 126 | union { u64 u[2]; u8 c[16]; } nonce, cmac; | ||
| 127 | u64 blocks; | ||
| 128 | block128_f block; | ||
| 129 | void *key; | ||
| 130 | }; | ||
| 131 | |||
diff --git a/src/lib/libcrypto/modes/ofb128.c b/src/lib/libcrypto/modes/ofb128.c index c732e2ec58..01c01702c4 100644 --- a/src/lib/libcrypto/modes/ofb128.c +++ b/src/lib/libcrypto/modes/ofb128.c | |||
| @@ -48,7 +48,8 @@ | |||
| 48 | * | 48 | * |
| 49 | */ | 49 | */ |
| 50 | 50 | ||
| 51 | #include "modes.h" | 51 | #include <openssl/crypto.h> |
| 52 | #include "modes_lcl.h" | ||
| 52 | #include <string.h> | 53 | #include <string.h> |
| 53 | 54 | ||
| 54 | #ifndef MODES_DEBUG | 55 | #ifndef MODES_DEBUG |
| @@ -58,14 +59,6 @@ | |||
| 58 | #endif | 59 | #endif |
| 59 | #include <assert.h> | 60 | #include <assert.h> |
| 60 | 61 | ||
| 61 | #define STRICT_ALIGNMENT | ||
| 62 | #if defined(__i386) || defined(__i386__) || \ | ||
| 63 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 64 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 65 | defined(__s390__) || defined(__s390x__) | ||
| 66 | # undef STRICT_ALIGNMENT | ||
| 67 | #endif | ||
| 68 | |||
| 69 | /* The input and output encrypted as though 128bit ofb mode is being | 62 | /* The input and output encrypted as though 128bit ofb mode is being |
| 70 | * used. The extra state information to record how much of the | 63 | * used. The extra state information to record how much of the |
| 71 | * 128bit block we have used is contained in *num; | 64 | * 128bit block we have used is contained in *num; |
diff --git a/src/lib/libcrypto/modes/xts128.c b/src/lib/libcrypto/modes/xts128.c new file mode 100644 index 0000000000..9cf27a25e9 --- /dev/null +++ b/src/lib/libcrypto/modes/xts128.c | |||
| @@ -0,0 +1,187 @@ | |||
| 1 | /* ==================================================================== | ||
| 2 | * Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
| 3 | * | ||
| 4 | * Redistribution and use in source and binary forms, with or without | ||
| 5 | * modification, are permitted provided that the following conditions | ||
| 6 | * are met: | ||
| 7 | * | ||
| 8 | * 1. Redistributions of source code must retain the above copyright | ||
| 9 | * notice, this list of conditions and the following disclaimer. | ||
| 10 | * | ||
| 11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer in | ||
| 13 | * the documentation and/or other materials provided with the | ||
| 14 | * distribution. | ||
| 15 | * | ||
| 16 | * 3. All advertising materials mentioning features or use of this | ||
| 17 | * software must display the following acknowledgment: | ||
| 18 | * "This product includes software developed by the OpenSSL Project | ||
| 19 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 20 | * | ||
| 21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 22 | * endorse or promote products derived from this software without | ||
| 23 | * prior written permission. For written permission, please contact | ||
| 24 | * openssl-core@openssl.org. | ||
| 25 | * | ||
| 26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 27 | * nor may "OpenSSL" appear in their names without prior written | ||
| 28 | * permission of the OpenSSL Project. | ||
| 29 | * | ||
| 30 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 31 | * acknowledgment: | ||
| 32 | * "This product includes software developed by the OpenSSL Project | ||
| 33 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 34 | * | ||
| 35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 47 | * ==================================================================== | ||
| 48 | */ | ||
| 49 | |||
| 50 | #include <openssl/crypto.h> | ||
| 51 | #include "modes_lcl.h" | ||
| 52 | #include <string.h> | ||
| 53 | |||
| 54 | #ifndef MODES_DEBUG | ||
| 55 | # ifndef NDEBUG | ||
| 56 | # define NDEBUG | ||
| 57 | # endif | ||
| 58 | #endif | ||
| 59 | #include <assert.h> | ||
| 60 | |||
| 61 | int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16], | ||
| 62 | const unsigned char *inp, unsigned char *out, | ||
| 63 | size_t len, int enc) | ||
| 64 | { | ||
| 65 | const union { long one; char little; } is_endian = {1}; | ||
| 66 | union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch; | ||
| 67 | unsigned int i; | ||
| 68 | |||
| 69 | if (len<16) return -1; | ||
| 70 | |||
| 71 | memcpy(tweak.c, iv, 16); | ||
| 72 | |||
| 73 | (*ctx->block2)(tweak.c,tweak.c,ctx->key2); | ||
| 74 | |||
| 75 | if (!enc && (len%16)) len-=16; | ||
| 76 | |||
| 77 | while (len>=16) { | ||
| 78 | #if defined(STRICT_ALIGNMENT) | ||
| 79 | memcpy(scratch.c,inp,16); | ||
| 80 | scratch.u[0] ^= tweak.u[0]; | ||
| 81 | scratch.u[1] ^= tweak.u[1]; | ||
| 82 | #else | ||
| 83 | scratch.u[0] = ((u64*)inp)[0]^tweak.u[0]; | ||
| 84 | scratch.u[1] = ((u64*)inp)[1]^tweak.u[1]; | ||
| 85 | #endif | ||
| 86 | (*ctx->block1)(scratch.c,scratch.c,ctx->key1); | ||
| 87 | #if defined(STRICT_ALIGNMENT) | ||
| 88 | scratch.u[0] ^= tweak.u[0]; | ||
| 89 | scratch.u[1] ^= tweak.u[1]; | ||
| 90 | memcpy(out,scratch.c,16); | ||
| 91 | #else | ||
| 92 | ((u64*)out)[0] = scratch.u[0]^=tweak.u[0]; | ||
| 93 | ((u64*)out)[1] = scratch.u[1]^=tweak.u[1]; | ||
| 94 | #endif | ||
| 95 | inp += 16; | ||
| 96 | out += 16; | ||
| 97 | len -= 16; | ||
| 98 | |||
| 99 | if (len==0) return 0; | ||
| 100 | |||
| 101 | if (is_endian.little) { | ||
| 102 | unsigned int carry,res; | ||
| 103 | |||
| 104 | res = 0x87&(((int)tweak.d[3])>>31); | ||
| 105 | carry = (unsigned int)(tweak.u[0]>>63); | ||
| 106 | tweak.u[0] = (tweak.u[0]<<1)^res; | ||
| 107 | tweak.u[1] = (tweak.u[1]<<1)|carry; | ||
| 108 | } | ||
| 109 | else { | ||
| 110 | size_t c; | ||
| 111 | |||
| 112 | for (c=0,i=0;i<16;++i) { | ||
| 113 | /*+ substitutes for |, because c is 1 bit */ | ||
| 114 | c += ((size_t)tweak.c[i])<<1; | ||
| 115 | tweak.c[i] = (u8)c; | ||
| 116 | c = c>>8; | ||
| 117 | } | ||
| 118 | tweak.c[0] ^= (u8)(0x87&(0-c)); | ||
| 119 | } | ||
| 120 | } | ||
| 121 | if (enc) { | ||
| 122 | for (i=0;i<len;++i) { | ||
| 123 | u8 c = inp[i]; | ||
| 124 | out[i] = scratch.c[i]; | ||
| 125 | scratch.c[i] = c; | ||
| 126 | } | ||
| 127 | scratch.u[0] ^= tweak.u[0]; | ||
| 128 | scratch.u[1] ^= tweak.u[1]; | ||
| 129 | (*ctx->block1)(scratch.c,scratch.c,ctx->key1); | ||
| 130 | scratch.u[0] ^= tweak.u[0]; | ||
| 131 | scratch.u[1] ^= tweak.u[1]; | ||
| 132 | memcpy(out-16,scratch.c,16); | ||
| 133 | } | ||
| 134 | else { | ||
| 135 | union { u64 u[2]; u8 c[16]; } tweak1; | ||
| 136 | |||
| 137 | if (is_endian.little) { | ||
| 138 | unsigned int carry,res; | ||
| 139 | |||
| 140 | res = 0x87&(((int)tweak.d[3])>>31); | ||
| 141 | carry = (unsigned int)(tweak.u[0]>>63); | ||
| 142 | tweak1.u[0] = (tweak.u[0]<<1)^res; | ||
| 143 | tweak1.u[1] = (tweak.u[1]<<1)|carry; | ||
| 144 | } | ||
| 145 | else { | ||
| 146 | size_t c; | ||
| 147 | |||
| 148 | for (c=0,i=0;i<16;++i) { | ||
| 149 | /*+ substitutes for |, because c is 1 bit */ | ||
| 150 | c += ((size_t)tweak.c[i])<<1; | ||
| 151 | tweak1.c[i] = (u8)c; | ||
| 152 | c = c>>8; | ||
| 153 | } | ||
| 154 | tweak1.c[0] ^= (u8)(0x87&(0-c)); | ||
| 155 | } | ||
| 156 | #if defined(STRICT_ALIGNMENT) | ||
| 157 | memcpy(scratch.c,inp,16); | ||
| 158 | scratch.u[0] ^= tweak1.u[0]; | ||
| 159 | scratch.u[1] ^= tweak1.u[1]; | ||
| 160 | #else | ||
| 161 | scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0]; | ||
| 162 | scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1]; | ||
| 163 | #endif | ||
| 164 | (*ctx->block1)(scratch.c,scratch.c,ctx->key1); | ||
| 165 | scratch.u[0] ^= tweak1.u[0]; | ||
| 166 | scratch.u[1] ^= tweak1.u[1]; | ||
| 167 | |||
| 168 | for (i=0;i<len;++i) { | ||
| 169 | u8 c = inp[16+i]; | ||
| 170 | out[16+i] = scratch.c[i]; | ||
| 171 | scratch.c[i] = c; | ||
| 172 | } | ||
| 173 | scratch.u[0] ^= tweak.u[0]; | ||
| 174 | scratch.u[1] ^= tweak.u[1]; | ||
| 175 | (*ctx->block1)(scratch.c,scratch.c,ctx->key1); | ||
| 176 | #if defined(STRICT_ALIGNMENT) | ||
| 177 | scratch.u[0] ^= tweak.u[0]; | ||
| 178 | scratch.u[1] ^= tweak.u[1]; | ||
| 179 | memcpy (out,scratch.c,16); | ||
| 180 | #else | ||
| 181 | ((u64*)out)[0] = scratch.u[0]^tweak.u[0]; | ||
| 182 | ((u64*)out)[1] = scratch.u[1]^tweak.u[1]; | ||
| 183 | #endif | ||
| 184 | } | ||
| 185 | |||
| 186 | return 0; | ||
| 187 | } | ||
diff --git a/src/lib/libcrypto/objects/obj_xref.c b/src/lib/libcrypto/objects/obj_xref.c index 152eca5c67..9f744bcede 100644 --- a/src/lib/libcrypto/objects/obj_xref.c +++ b/src/lib/libcrypto/objects/obj_xref.c | |||
| @@ -110,8 +110,10 @@ int OBJ_find_sigid_algs(int signid, int *pdig_nid, int *ppkey_nid) | |||
| 110 | #endif | 110 | #endif |
| 111 | if (rv == NULL) | 111 | if (rv == NULL) |
| 112 | return 0; | 112 | return 0; |
| 113 | *pdig_nid = rv->hash_id; | 113 | if (pdig_nid) |
| 114 | *ppkey_nid = rv->pkey_id; | 114 | *pdig_nid = rv->hash_id; |
| 115 | if (ppkey_nid) | ||
| 116 | *ppkey_nid = rv->pkey_id; | ||
| 115 | return 1; | 117 | return 1; |
| 116 | } | 118 | } |
| 117 | 119 | ||
| @@ -144,7 +146,8 @@ int OBJ_find_sigid_by_algs(int *psignid, int dig_nid, int pkey_nid) | |||
| 144 | #endif | 146 | #endif |
| 145 | if (rv == NULL) | 147 | if (rv == NULL) |
| 146 | return 0; | 148 | return 0; |
| 147 | *psignid = (*rv)->sign_id; | 149 | if (psignid) |
| 150 | *psignid = (*rv)->sign_id; | ||
| 148 | return 1; | 151 | return 1; |
| 149 | } | 152 | } |
| 150 | 153 | ||
diff --git a/src/lib/libcrypto/objects/obj_xref.h b/src/lib/libcrypto/objects/obj_xref.h index d5b9b8e198..e23938c296 100644 --- a/src/lib/libcrypto/objects/obj_xref.h +++ b/src/lib/libcrypto/objects/obj_xref.h | |||
| @@ -38,10 +38,12 @@ static const nid_triple sigoid_srt[] = | |||
| 38 | {NID_id_GostR3411_94_with_GostR3410_94, NID_id_GostR3411_94, NID_id_GostR3410_94}, | 38 | {NID_id_GostR3411_94_with_GostR3410_94, NID_id_GostR3411_94, NID_id_GostR3410_94}, |
| 39 | {NID_id_GostR3411_94_with_GostR3410_94_cc, NID_id_GostR3411_94, NID_id_GostR3410_94_cc}, | 39 | {NID_id_GostR3411_94_with_GostR3410_94_cc, NID_id_GostR3411_94, NID_id_GostR3410_94_cc}, |
| 40 | {NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94, NID_id_GostR3410_2001_cc}, | 40 | {NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94, NID_id_GostR3410_2001_cc}, |
| 41 | {NID_rsassaPss, NID_undef, NID_rsaEncryption}, | ||
| 41 | }; | 42 | }; |
| 42 | 43 | ||
| 43 | static const nid_triple * const sigoid_srt_xref[] = | 44 | static const nid_triple * const sigoid_srt_xref[] = |
| 44 | { | 45 | { |
| 46 | &sigoid_srt[29], | ||
| 45 | &sigoid_srt[17], | 47 | &sigoid_srt[17], |
| 46 | &sigoid_srt[18], | 48 | &sigoid_srt[18], |
| 47 | &sigoid_srt[0], | 49 | &sigoid_srt[0], |
diff --git a/src/lib/libcrypto/objects/obj_xref.txt b/src/lib/libcrypto/objects/obj_xref.txt index e45b3d34b9..cb917182ee 100644 --- a/src/lib/libcrypto/objects/obj_xref.txt +++ b/src/lib/libcrypto/objects/obj_xref.txt | |||
| @@ -13,6 +13,10 @@ sha512WithRSAEncryption sha512 rsaEncryption | |||
| 13 | sha224WithRSAEncryption sha224 rsaEncryption | 13 | sha224WithRSAEncryption sha224 rsaEncryption |
| 14 | mdc2WithRSA mdc2 rsaEncryption | 14 | mdc2WithRSA mdc2 rsaEncryption |
| 15 | ripemd160WithRSA ripemd160 rsaEncryption | 15 | ripemd160WithRSA ripemd160 rsaEncryption |
| 16 | # For PSS the digest algorithm can vary and depends on the included | ||
| 17 | # AlgorithmIdentifier. The digest "undef" indicates the public key | ||
| 18 | # method should handle this explicitly. | ||
| 19 | rsassaPss undef rsaEncryption | ||
| 16 | 20 | ||
| 17 | # Alternative deprecated OIDs. By using the older "rsa" OID this | 21 | # Alternative deprecated OIDs. By using the older "rsa" OID this |
| 18 | # type will be recognized by not normally used. | 22 | # type will be recognized by not normally used. |
diff --git a/src/lib/libcrypto/pariscid.pl b/src/lib/libcrypto/pariscid.pl new file mode 100644 index 0000000000..477ec9b87d --- /dev/null +++ b/src/lib/libcrypto/pariscid.pl | |||
| @@ -0,0 +1,224 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | $flavour = shift; | ||
| 4 | $output = shift; | ||
| 5 | open STDOUT,">$output"; | ||
| 6 | |||
| 7 | if ($flavour =~ /64/) { | ||
| 8 | $LEVEL ="2.0W"; | ||
| 9 | $SIZE_T =8; | ||
| 10 | $ST ="std"; | ||
| 11 | } else { | ||
| 12 | $LEVEL ="1.1"; | ||
| 13 | $SIZE_T =4; | ||
| 14 | $ST ="stw"; | ||
| 15 | } | ||
| 16 | |||
| 17 | $rp="%r2"; | ||
| 18 | $sp="%r30"; | ||
| 19 | $rv="%r28"; | ||
| 20 | |||
| 21 | $code=<<___; | ||
| 22 | .LEVEL $LEVEL | ||
| 23 | .SPACE \$TEXT\$ | ||
| 24 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 25 | |||
| 26 | .EXPORT OPENSSL_cpuid_setup,ENTRY | ||
| 27 | .ALIGN 8 | ||
| 28 | OPENSSL_cpuid_setup | ||
| 29 | .PROC | ||
| 30 | .CALLINFO NO_CALLS | ||
| 31 | .ENTRY | ||
| 32 | bv ($rp) | ||
| 33 | .EXIT | ||
| 34 | nop | ||
| 35 | .PROCEND | ||
| 36 | |||
| 37 | .EXPORT OPENSSL_rdtsc,ENTRY | ||
| 38 | .ALIGN 8 | ||
| 39 | OPENSSL_rdtsc | ||
| 40 | .PROC | ||
| 41 | .CALLINFO NO_CALLS | ||
| 42 | .ENTRY | ||
| 43 | mfctl %cr16,$rv | ||
| 44 | bv ($rp) | ||
| 45 | .EXIT | ||
| 46 | nop | ||
| 47 | .PROCEND | ||
| 48 | |||
| 49 | .EXPORT OPENSSL_wipe_cpu,ENTRY | ||
| 50 | .ALIGN 8 | ||
| 51 | OPENSSL_wipe_cpu | ||
| 52 | .PROC | ||
| 53 | .CALLINFO NO_CALLS | ||
| 54 | .ENTRY | ||
| 55 | xor %r0,%r0,%r1 | ||
| 56 | fcpy,dbl %fr0,%fr4 | ||
| 57 | xor %r0,%r0,%r19 | ||
| 58 | fcpy,dbl %fr0,%fr5 | ||
| 59 | xor %r0,%r0,%r20 | ||
| 60 | fcpy,dbl %fr0,%fr6 | ||
| 61 | xor %r0,%r0,%r21 | ||
| 62 | fcpy,dbl %fr0,%fr7 | ||
| 63 | xor %r0,%r0,%r22 | ||
| 64 | fcpy,dbl %fr0,%fr8 | ||
| 65 | xor %r0,%r0,%r23 | ||
| 66 | fcpy,dbl %fr0,%fr9 | ||
| 67 | xor %r0,%r0,%r24 | ||
| 68 | fcpy,dbl %fr0,%fr10 | ||
| 69 | xor %r0,%r0,%r25 | ||
| 70 | fcpy,dbl %fr0,%fr11 | ||
| 71 | xor %r0,%r0,%r26 | ||
| 72 | fcpy,dbl %fr0,%fr22 | ||
| 73 | xor %r0,%r0,%r29 | ||
| 74 | fcpy,dbl %fr0,%fr23 | ||
| 75 | xor %r0,%r0,%r31 | ||
| 76 | fcpy,dbl %fr0,%fr24 | ||
| 77 | fcpy,dbl %fr0,%fr25 | ||
| 78 | fcpy,dbl %fr0,%fr26 | ||
| 79 | fcpy,dbl %fr0,%fr27 | ||
| 80 | fcpy,dbl %fr0,%fr28 | ||
| 81 | fcpy,dbl %fr0,%fr29 | ||
| 82 | fcpy,dbl %fr0,%fr30 | ||
| 83 | fcpy,dbl %fr0,%fr31 | ||
| 84 | bv ($rp) | ||
| 85 | .EXIT | ||
| 86 | ldo 0($sp),$rv | ||
| 87 | .PROCEND | ||
| 88 | ___ | ||
| 89 | { | ||
| 90 | my $inp="%r26"; | ||
| 91 | my $len="%r25"; | ||
| 92 | |||
| 93 | $code.=<<___; | ||
| 94 | .EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR | ||
| 95 | .ALIGN 8 | ||
| 96 | OPENSSL_cleanse | ||
| 97 | .PROC | ||
| 98 | .CALLINFO NO_CALLS | ||
| 99 | .ENTRY | ||
| 100 | cmpib,*= 0,$len,Ldone | ||
| 101 | nop | ||
| 102 | cmpib,*>>= 15,$len,Little | ||
| 103 | ldi $SIZE_T-1,%r1 | ||
| 104 | |||
| 105 | Lalign | ||
| 106 | and,*<> $inp,%r1,%r28 | ||
| 107 | b,n Laligned | ||
| 108 | stb %r0,0($inp) | ||
| 109 | ldo -1($len),$len | ||
| 110 | b Lalign | ||
| 111 | ldo 1($inp),$inp | ||
| 112 | |||
| 113 | Laligned | ||
| 114 | andcm $len,%r1,%r28 | ||
| 115 | Lot | ||
| 116 | $ST %r0,0($inp) | ||
| 117 | addib,*<> -$SIZE_T,%r28,Lot | ||
| 118 | ldo $SIZE_T($inp),$inp | ||
| 119 | |||
| 120 | and,*<> $len,%r1,$len | ||
| 121 | b,n Ldone | ||
| 122 | Little | ||
| 123 | stb %r0,0($inp) | ||
| 124 | addib,*<> -1,$len,Little | ||
| 125 | ldo 1($inp),$inp | ||
| 126 | Ldone | ||
| 127 | bv ($rp) | ||
| 128 | .EXIT | ||
| 129 | nop | ||
| 130 | .PROCEND | ||
| 131 | ___ | ||
| 132 | } | ||
| 133 | { | ||
| 134 | my ($out,$cnt,$max)=("%r26","%r25","%r24"); | ||
| 135 | my ($tick,$lasttick)=("%r23","%r22"); | ||
| 136 | my ($diff,$lastdiff)=("%r21","%r20"); | ||
| 137 | |||
| 138 | $code.=<<___; | ||
| 139 | .EXPORT OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR | ||
| 140 | .ALIGN 8 | ||
| 141 | OPENSSL_instrument_bus | ||
| 142 | .PROC | ||
| 143 | .CALLINFO NO_CALLS | ||
| 144 | .ENTRY | ||
| 145 | copy $cnt,$rv | ||
| 146 | mfctl %cr16,$tick | ||
| 147 | copy $tick,$lasttick | ||
| 148 | ldi 0,$diff | ||
| 149 | |||
| 150 | fdc 0($out) | ||
| 151 | ldw 0($out),$tick | ||
| 152 | add $diff,$tick,$tick | ||
| 153 | stw $tick,0($out) | ||
| 154 | Loop | ||
| 155 | mfctl %cr16,$tick | ||
| 156 | sub $tick,$lasttick,$diff | ||
| 157 | copy $tick,$lasttick | ||
| 158 | |||
| 159 | fdc 0($out) | ||
| 160 | ldw 0($out),$tick | ||
| 161 | add $diff,$tick,$tick | ||
| 162 | stw $tick,0($out) | ||
| 163 | |||
| 164 | addib,<> -1,$cnt,Loop | ||
| 165 | addi 4,$out,$out | ||
| 166 | |||
| 167 | bv ($rp) | ||
| 168 | .EXIT | ||
| 169 | sub $rv,$cnt,$rv | ||
| 170 | .PROCEND | ||
| 171 | |||
| 172 | .EXPORT OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR | ||
| 173 | .ALIGN 8 | ||
| 174 | OPENSSL_instrument_bus2 | ||
| 175 | .PROC | ||
| 176 | .CALLINFO NO_CALLS | ||
| 177 | .ENTRY | ||
| 178 | copy $cnt,$rv | ||
| 179 | sub %r0,$cnt,$cnt | ||
| 180 | |||
| 181 | mfctl %cr16,$tick | ||
| 182 | copy $tick,$lasttick | ||
| 183 | ldi 0,$diff | ||
| 184 | |||
| 185 | fdc 0($out) | ||
| 186 | ldw 0($out),$tick | ||
| 187 | add $diff,$tick,$tick | ||
| 188 | stw $tick,0($out) | ||
| 189 | |||
| 190 | mfctl %cr16,$tick | ||
| 191 | sub $tick,$lasttick,$diff | ||
| 192 | copy $tick,$lasttick | ||
| 193 | Loop2 | ||
| 194 | copy $diff,$lastdiff | ||
| 195 | fdc 0($out) | ||
| 196 | ldw 0($out),$tick | ||
| 197 | add $diff,$tick,$tick | ||
| 198 | stw $tick,0($out) | ||
| 199 | |||
| 200 | addib,= -1,$max,Ldone2 | ||
| 201 | nop | ||
| 202 | |||
| 203 | mfctl %cr16,$tick | ||
| 204 | sub $tick,$lasttick,$diff | ||
| 205 | copy $tick,$lasttick | ||
| 206 | cmpclr,<> $lastdiff,$diff,$tick | ||
| 207 | ldi 1,$tick | ||
| 208 | |||
| 209 | ldi 1,%r1 | ||
| 210 | xor %r1,$tick,$tick | ||
| 211 | addb,<> $tick,$cnt,Loop2 | ||
| 212 | shladd,l $tick,2,$out,$out | ||
| 213 | Ldone2 | ||
| 214 | bv ($rp) | ||
| 215 | .EXIT | ||
| 216 | add $rv,$cnt,$rv | ||
| 217 | .PROCEND | ||
| 218 | ___ | ||
| 219 | } | ||
| 220 | $code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); | ||
| 221 | $code =~ s/,\*/,/gm if ($SIZE_T==4); | ||
| 222 | print $code; | ||
| 223 | close STDOUT; | ||
| 224 | |||
diff --git a/src/lib/libcrypto/pem/pvkfmt.c b/src/lib/libcrypto/pem/pvkfmt.c index 5f130c4528..b1bf71a5da 100644 --- a/src/lib/libcrypto/pem/pvkfmt.c +++ b/src/lib/libcrypto/pem/pvkfmt.c | |||
| @@ -709,13 +709,16 @@ static int derive_pvk_key(unsigned char *key, | |||
| 709 | const unsigned char *pass, int passlen) | 709 | const unsigned char *pass, int passlen) |
| 710 | { | 710 | { |
| 711 | EVP_MD_CTX mctx; | 711 | EVP_MD_CTX mctx; |
| 712 | int rv = 1; | ||
| 712 | EVP_MD_CTX_init(&mctx); | 713 | EVP_MD_CTX_init(&mctx); |
| 713 | EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL); | 714 | if (!EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL) |
| 714 | EVP_DigestUpdate(&mctx, salt, saltlen); | 715 | || !EVP_DigestUpdate(&mctx, salt, saltlen) |
| 715 | EVP_DigestUpdate(&mctx, pass, passlen); | 716 | || !EVP_DigestUpdate(&mctx, pass, passlen) |
| 716 | EVP_DigestFinal_ex(&mctx, key, NULL); | 717 | || !EVP_DigestFinal_ex(&mctx, key, NULL)) |
| 718 | rv = 0; | ||
| 719 | |||
| 717 | EVP_MD_CTX_cleanup(&mctx); | 720 | EVP_MD_CTX_cleanup(&mctx); |
| 718 | return 1; | 721 | return rv; |
| 719 | } | 722 | } |
| 720 | 723 | ||
| 721 | 724 | ||
| @@ -727,11 +730,12 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in, | |||
| 727 | const unsigned char *p = *in; | 730 | const unsigned char *p = *in; |
| 728 | unsigned int magic; | 731 | unsigned int magic; |
| 729 | unsigned char *enctmp = NULL, *q; | 732 | unsigned char *enctmp = NULL, *q; |
| 733 | EVP_CIPHER_CTX cctx; | ||
| 734 | EVP_CIPHER_CTX_init(&cctx); | ||
| 730 | if (saltlen) | 735 | if (saltlen) |
| 731 | { | 736 | { |
| 732 | char psbuf[PEM_BUFSIZE]; | 737 | char psbuf[PEM_BUFSIZE]; |
| 733 | unsigned char keybuf[20]; | 738 | unsigned char keybuf[20]; |
| 734 | EVP_CIPHER_CTX cctx; | ||
| 735 | int enctmplen, inlen; | 739 | int enctmplen, inlen; |
| 736 | if (cb) | 740 | if (cb) |
| 737 | inlen=cb(psbuf,PEM_BUFSIZE,0,u); | 741 | inlen=cb(psbuf,PEM_BUFSIZE,0,u); |
| @@ -757,37 +761,41 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in, | |||
| 757 | p += 8; | 761 | p += 8; |
| 758 | inlen = keylen - 8; | 762 | inlen = keylen - 8; |
| 759 | q = enctmp + 8; | 763 | q = enctmp + 8; |
| 760 | EVP_CIPHER_CTX_init(&cctx); | 764 | if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL)) |
| 761 | EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL); | 765 | goto err; |
| 762 | EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen); | 766 | if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen)) |
| 763 | EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen); | 767 | goto err; |
| 768 | if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen)) | ||
| 769 | goto err; | ||
| 764 | magic = read_ledword((const unsigned char **)&q); | 770 | magic = read_ledword((const unsigned char **)&q); |
| 765 | if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) | 771 | if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) |
| 766 | { | 772 | { |
| 767 | q = enctmp + 8; | 773 | q = enctmp + 8; |
| 768 | memset(keybuf + 5, 0, 11); | 774 | memset(keybuf + 5, 0, 11); |
| 769 | EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, | 775 | if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, |
| 770 | NULL); | 776 | NULL)) |
| 777 | goto err; | ||
| 771 | OPENSSL_cleanse(keybuf, 20); | 778 | OPENSSL_cleanse(keybuf, 20); |
| 772 | EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen); | 779 | if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen)) |
| 773 | EVP_DecryptFinal_ex(&cctx, q + enctmplen, | 780 | goto err; |
| 774 | &enctmplen); | 781 | if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, |
| 782 | &enctmplen)) | ||
| 783 | goto err; | ||
| 775 | magic = read_ledword((const unsigned char **)&q); | 784 | magic = read_ledword((const unsigned char **)&q); |
| 776 | if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) | 785 | if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) |
| 777 | { | 786 | { |
| 778 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
| 779 | PEMerr(PEM_F_DO_PVK_BODY, PEM_R_BAD_DECRYPT); | 787 | PEMerr(PEM_F_DO_PVK_BODY, PEM_R_BAD_DECRYPT); |
| 780 | goto err; | 788 | goto err; |
| 781 | } | 789 | } |
| 782 | } | 790 | } |
| 783 | else | 791 | else |
| 784 | OPENSSL_cleanse(keybuf, 20); | 792 | OPENSSL_cleanse(keybuf, 20); |
| 785 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
| 786 | p = enctmp; | 793 | p = enctmp; |
| 787 | } | 794 | } |
| 788 | 795 | ||
| 789 | ret = b2i_PrivateKey(&p, keylen); | 796 | ret = b2i_PrivateKey(&p, keylen); |
| 790 | err: | 797 | err: |
| 798 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
| 791 | if (enctmp && saltlen) | 799 | if (enctmp && saltlen) |
| 792 | OPENSSL_free(enctmp); | 800 | OPENSSL_free(enctmp); |
| 793 | return ret; | 801 | return ret; |
| @@ -841,6 +849,8 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, | |||
| 841 | { | 849 | { |
| 842 | int outlen = 24, pklen; | 850 | int outlen = 24, pklen; |
| 843 | unsigned char *p, *salt = NULL; | 851 | unsigned char *p, *salt = NULL; |
| 852 | EVP_CIPHER_CTX cctx; | ||
| 853 | EVP_CIPHER_CTX_init(&cctx); | ||
| 844 | if (enclevel) | 854 | if (enclevel) |
| 845 | outlen += PVK_SALTLEN; | 855 | outlen += PVK_SALTLEN; |
| 846 | pklen = do_i2b(NULL, pk, 0); | 856 | pklen = do_i2b(NULL, pk, 0); |
| @@ -885,7 +895,6 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, | |||
| 885 | { | 895 | { |
| 886 | char psbuf[PEM_BUFSIZE]; | 896 | char psbuf[PEM_BUFSIZE]; |
| 887 | unsigned char keybuf[20]; | 897 | unsigned char keybuf[20]; |
| 888 | EVP_CIPHER_CTX cctx; | ||
| 889 | int enctmplen, inlen; | 898 | int enctmplen, inlen; |
| 890 | if (cb) | 899 | if (cb) |
| 891 | inlen=cb(psbuf,PEM_BUFSIZE,1,u); | 900 | inlen=cb(psbuf,PEM_BUFSIZE,1,u); |
| @@ -902,16 +911,19 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, | |||
| 902 | if (enclevel == 1) | 911 | if (enclevel == 1) |
| 903 | memset(keybuf + 5, 0, 11); | 912 | memset(keybuf + 5, 0, 11); |
| 904 | p = salt + PVK_SALTLEN + 8; | 913 | p = salt + PVK_SALTLEN + 8; |
| 905 | EVP_CIPHER_CTX_init(&cctx); | 914 | if (!EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL)) |
| 906 | EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL); | 915 | goto error; |
| 907 | OPENSSL_cleanse(keybuf, 20); | 916 | OPENSSL_cleanse(keybuf, 20); |
| 908 | EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8); | 917 | if (!EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8)) |
| 909 | EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen); | 918 | goto error; |
| 910 | EVP_CIPHER_CTX_cleanup(&cctx); | 919 | if (!EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen)) |
| 920 | goto error; | ||
| 911 | } | 921 | } |
| 922 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
| 912 | return outlen; | 923 | return outlen; |
| 913 | 924 | ||
| 914 | error: | 925 | error: |
| 926 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
| 915 | return -1; | 927 | return -1; |
| 916 | } | 928 | } |
| 917 | 929 | ||
diff --git a/src/lib/libcrypto/perlasm/ppc-xlate.pl b/src/lib/libcrypto/perlasm/ppc-xlate.pl index 4579671c97..a3edd982b6 100755 --- a/src/lib/libcrypto/perlasm/ppc-xlate.pl +++ b/src/lib/libcrypto/perlasm/ppc-xlate.pl | |||
| @@ -31,10 +31,9 @@ my $globl = sub { | |||
| 31 | $ret .= ".type $name,\@function"; | 31 | $ret .= ".type $name,\@function"; |
| 32 | last; | 32 | last; |
| 33 | }; | 33 | }; |
| 34 | /linux.*64/ && do { $ret .= ".globl .$name\n"; | 34 | /linux.*64/ && do { $ret .= ".globl $name\n"; |
| 35 | $ret .= ".type .$name,\@function\n"; | 35 | $ret .= ".type $name,\@function\n"; |
| 36 | $ret .= ".section \".opd\",\"aw\"\n"; | 36 | $ret .= ".section \".opd\",\"aw\"\n"; |
| 37 | $ret .= ".globl $name\n"; | ||
| 38 | $ret .= ".align 3\n"; | 37 | $ret .= ".align 3\n"; |
| 39 | $ret .= "$name:\n"; | 38 | $ret .= "$name:\n"; |
| 40 | $ret .= ".quad .$name,.TOC.\@tocbase,0\n"; | 39 | $ret .= ".quad .$name,.TOC.\@tocbase,0\n"; |
| @@ -62,6 +61,14 @@ my $machine = sub { | |||
| 62 | } | 61 | } |
| 63 | ".machine $arch"; | 62 | ".machine $arch"; |
| 64 | }; | 63 | }; |
| 64 | my $size = sub { | ||
| 65 | if ($flavour =~ /linux.*32/) | ||
| 66 | { shift; | ||
| 67 | ".size " . join(",",@_); | ||
| 68 | } | ||
| 69 | else | ||
| 70 | { ""; } | ||
| 71 | }; | ||
| 65 | my $asciz = sub { | 72 | my $asciz = sub { |
| 66 | shift; | 73 | shift; |
| 67 | my $line = join(",",@_); | 74 | my $line = join(",",@_); |
diff --git a/src/lib/libcrypto/ppccap.c b/src/lib/libcrypto/ppccap.c new file mode 100644 index 0000000000..ab89ccaa12 --- /dev/null +++ b/src/lib/libcrypto/ppccap.c | |||
| @@ -0,0 +1,115 @@ | |||
| 1 | #include <stdio.h> | ||
| 2 | #include <stdlib.h> | ||
| 3 | #include <string.h> | ||
| 4 | #include <setjmp.h> | ||
| 5 | #include <signal.h> | ||
| 6 | #include <crypto.h> | ||
| 7 | #include <openssl/bn.h> | ||
| 8 | |||
| 9 | #define PPC_FPU64 (1<<0) | ||
| 10 | #define PPC_ALTIVEC (1<<1) | ||
| 11 | |||
| 12 | static int OPENSSL_ppccap_P = 0; | ||
| 13 | |||
| 14 | static sigset_t all_masked; | ||
| 15 | |||
| 16 | #ifdef OPENSSL_BN_ASM_MONT | ||
| 17 | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num) | ||
| 18 | { | ||
| 19 | int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); | ||
| 20 | int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); | ||
| 21 | |||
| 22 | if (sizeof(size_t)==4) | ||
| 23 | { | ||
| 24 | #if (defined(__APPLE__) && defined(__MACH__)) | ||
| 25 | if (num>=8 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64)) | ||
| 26 | return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); | ||
| 27 | #else | ||
| 28 | /* boundary of 32 was experimentally determined on | ||
| 29 | Linux 2.6.22, might have to be adjusted on AIX... */ | ||
| 30 | if (num>=32 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64)) | ||
| 31 | { | ||
| 32 | sigset_t oset; | ||
| 33 | int ret; | ||
| 34 | |||
| 35 | sigprocmask(SIG_SETMASK,&all_masked,&oset); | ||
| 36 | ret=bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); | ||
| 37 | sigprocmask(SIG_SETMASK,&oset,NULL); | ||
| 38 | |||
| 39 | return ret; | ||
| 40 | } | ||
| 41 | #endif | ||
| 42 | } | ||
| 43 | else if ((OPENSSL_ppccap_P&PPC_FPU64)) | ||
| 44 | /* this is a "must" on POWER6, but run-time detection | ||
| 45 | * is not implemented yet... */ | ||
| 46 | return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); | ||
| 47 | |||
| 48 | return bn_mul_mont_int(rp,ap,bp,np,n0,num); | ||
| 49 | } | ||
| 50 | #endif | ||
| 51 | |||
| 52 | static sigjmp_buf ill_jmp; | ||
| 53 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } | ||
| 54 | |||
| 55 | void OPENSSL_ppc64_probe(void); | ||
| 56 | |||
| 57 | void OPENSSL_cpuid_setup(void) | ||
| 58 | { | ||
| 59 | char *e; | ||
| 60 | struct sigaction ill_oact,ill_act; | ||
| 61 | sigset_t oset; | ||
| 62 | static int trigger=0; | ||
| 63 | |||
| 64 | if (trigger) return; | ||
| 65 | trigger=1; | ||
| 66 | |||
| 67 | sigfillset(&all_masked); | ||
| 68 | sigdelset(&all_masked,SIGILL); | ||
| 69 | sigdelset(&all_masked,SIGTRAP); | ||
| 70 | #ifdef SIGEMT | ||
| 71 | sigdelset(&all_masked,SIGEMT); | ||
| 72 | #endif | ||
| 73 | sigdelset(&all_masked,SIGFPE); | ||
| 74 | sigdelset(&all_masked,SIGBUS); | ||
| 75 | sigdelset(&all_masked,SIGSEGV); | ||
| 76 | |||
| 77 | if ((e=getenv("OPENSSL_ppccap"))) | ||
| 78 | { | ||
| 79 | OPENSSL_ppccap_P=strtoul(e,NULL,0); | ||
| 80 | return; | ||
| 81 | } | ||
| 82 | |||
| 83 | OPENSSL_ppccap_P = 0; | ||
| 84 | |||
| 85 | memset(&ill_act,0,sizeof(ill_act)); | ||
| 86 | ill_act.sa_handler = ill_handler; | ||
| 87 | ill_act.sa_mask = all_masked; | ||
| 88 | |||
| 89 | sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset); | ||
| 90 | sigaction(SIGILL,&ill_act,&ill_oact); | ||
| 91 | |||
| 92 | if (sizeof(size_t)==4) | ||
| 93 | { | ||
| 94 | if (sigsetjmp(ill_jmp,1) == 0) | ||
| 95 | { | ||
| 96 | OPENSSL_ppc64_probe(); | ||
| 97 | OPENSSL_ppccap_P |= PPC_FPU64; | ||
| 98 | } | ||
| 99 | } | ||
| 100 | else | ||
| 101 | { | ||
| 102 | /* | ||
| 103 | * Wanted code detecting POWER6 CPU and setting PPC_FPU64 | ||
| 104 | */ | ||
| 105 | } | ||
| 106 | |||
| 107 | if (sigsetjmp(ill_jmp,1) == 0) | ||
| 108 | { | ||
| 109 | OPENSSL_altivec_probe(); | ||
| 110 | OPENSSL_ppccap_P |= PPC_ALTIVEC; | ||
| 111 | } | ||
| 112 | |||
| 113 | sigaction (SIGILL,&ill_oact,NULL); | ||
| 114 | sigprocmask(SIG_SETMASK,&oset,NULL); | ||
| 115 | } | ||
diff --git a/src/lib/libcrypto/ppccpuid.pl b/src/lib/libcrypto/ppccpuid.pl index 369e1d0df9..4ba736a1d1 100755 --- a/src/lib/libcrypto/ppccpuid.pl +++ b/src/lib/libcrypto/ppccpuid.pl | |||
| @@ -23,36 +23,67 @@ $code=<<___; | |||
| 23 | .machine "any" | 23 | .machine "any" |
| 24 | .text | 24 | .text |
| 25 | 25 | ||
| 26 | .globl .OPENSSL_cpuid_setup | 26 | .globl .OPENSSL_ppc64_probe |
| 27 | .align 4 | 27 | .align 4 |
| 28 | .OPENSSL_cpuid_setup: | 28 | .OPENSSL_ppc64_probe: |
| 29 | fcfid f1,f1 | ||
| 30 | extrdi r0,r0,32,0 | ||
| 29 | blr | 31 | blr |
| 32 | .long 0 | ||
| 33 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 34 | |||
| 35 | .globl .OPENSSL_altivec_probe | ||
| 36 | .align 4 | ||
| 37 | .OPENSSL_altivec_probe: | ||
| 38 | .long 0x10000484 # vor v0,v0,v0 | ||
| 39 | blr | ||
| 40 | .long 0 | ||
| 41 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 30 | 42 | ||
| 31 | .globl .OPENSSL_wipe_cpu | 43 | .globl .OPENSSL_wipe_cpu |
| 32 | .align 4 | 44 | .align 4 |
| 33 | .OPENSSL_wipe_cpu: | 45 | .OPENSSL_wipe_cpu: |
| 34 | xor r0,r0,r0 | 46 | xor r0,r0,r0 |
| 47 | fmr f0,f31 | ||
| 48 | fmr f1,f31 | ||
| 49 | fmr f2,f31 | ||
| 35 | mr r3,r1 | 50 | mr r3,r1 |
| 51 | fmr f3,f31 | ||
| 36 | xor r4,r4,r4 | 52 | xor r4,r4,r4 |
| 53 | fmr f4,f31 | ||
| 37 | xor r5,r5,r5 | 54 | xor r5,r5,r5 |
| 55 | fmr f5,f31 | ||
| 38 | xor r6,r6,r6 | 56 | xor r6,r6,r6 |
| 57 | fmr f6,f31 | ||
| 39 | xor r7,r7,r7 | 58 | xor r7,r7,r7 |
| 59 | fmr f7,f31 | ||
| 40 | xor r8,r8,r8 | 60 | xor r8,r8,r8 |
| 61 | fmr f8,f31 | ||
| 41 | xor r9,r9,r9 | 62 | xor r9,r9,r9 |
| 63 | fmr f9,f31 | ||
| 42 | xor r10,r10,r10 | 64 | xor r10,r10,r10 |
| 65 | fmr f10,f31 | ||
| 43 | xor r11,r11,r11 | 66 | xor r11,r11,r11 |
| 67 | fmr f11,f31 | ||
| 44 | xor r12,r12,r12 | 68 | xor r12,r12,r12 |
| 69 | fmr f12,f31 | ||
| 70 | fmr f13,f31 | ||
| 45 | blr | 71 | blr |
| 72 | .long 0 | ||
| 73 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 46 | 74 | ||
| 47 | .globl .OPENSSL_atomic_add | 75 | .globl .OPENSSL_atomic_add |
| 48 | .align 4 | 76 | .align 4 |
| 49 | .OPENSSL_atomic_add: | 77 | .OPENSSL_atomic_add: |
| 50 | Loop: lwarx r5,0,r3 | 78 | Ladd: lwarx r5,0,r3 |
| 51 | add r0,r4,r5 | 79 | add r0,r4,r5 |
| 52 | stwcx. r0,0,r3 | 80 | stwcx. r0,0,r3 |
| 53 | bne- Loop | 81 | bne- Ladd |
| 54 | $SIGNX r3,r0 | 82 | $SIGNX r3,r0 |
| 55 | blr | 83 | blr |
| 84 | .long 0 | ||
| 85 | .byte 0,12,0x14,0,0,0,2,0 | ||
| 86 | .long 0 | ||
| 56 | 87 | ||
| 57 | .globl .OPENSSL_rdtsc | 88 | .globl .OPENSSL_rdtsc |
| 58 | .align 4 | 89 | .align 4 |
| @@ -60,6 +91,8 @@ Loop: lwarx r5,0,r3 | |||
| 60 | mftb r3 | 91 | mftb r3 |
| 61 | mftbu r4 | 92 | mftbu r4 |
| 62 | blr | 93 | blr |
| 94 | .long 0 | ||
| 95 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 63 | 96 | ||
| 64 | .globl .OPENSSL_cleanse | 97 | .globl .OPENSSL_cleanse |
| 65 | .align 4 | 98 | .align 4 |
| @@ -72,7 +105,7 @@ Loop: lwarx r5,0,r3 | |||
| 72 | Little: mtctr r4 | 105 | Little: mtctr r4 |
| 73 | stb r0,0(r3) | 106 | stb r0,0(r3) |
| 74 | addi r3,r3,1 | 107 | addi r3,r3,1 |
| 75 | bdnz- \$-8 | 108 | bdnz \$-8 |
| 76 | blr | 109 | blr |
| 77 | Lot: andi. r5,r3,3 | 110 | Lot: andi. r5,r3,3 |
| 78 | beq Laligned | 111 | beq Laligned |
| @@ -85,10 +118,13 @@ Laligned: | |||
| 85 | mtctr r5 | 118 | mtctr r5 |
| 86 | stw r0,0(r3) | 119 | stw r0,0(r3) |
| 87 | addi r3,r3,4 | 120 | addi r3,r3,4 |
| 88 | bdnz- \$-8 | 121 | bdnz \$-8 |
| 89 | andi. r4,r4,3 | 122 | andi. r4,r4,3 |
| 90 | bne Little | 123 | bne Little |
| 91 | blr | 124 | blr |
| 125 | .long 0 | ||
| 126 | .byte 0,12,0x14,0,0,0,2,0 | ||
| 127 | .long 0 | ||
| 92 | ___ | 128 | ___ |
| 93 | 129 | ||
| 94 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 130 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl new file mode 100644 index 0000000000..7f684092d4 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl | |||
| @@ -0,0 +1,631 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # June 2011 | ||
| 11 | # | ||
| 12 | # This is RC4+MD5 "stitch" implementation. The idea, as spelled in | ||
| 13 | # http://download.intel.com/design/intarch/papers/323686.pdf, is that | ||
| 14 | # since both algorithms exhibit instruction-level parallelism, ILP, | ||
| 15 | # below theoretical maximum, interleaving them would allow to utilize | ||
| 16 | # processor resources better and achieve better performance. RC4 | ||
| 17 | # instruction sequence is virtually identical to rc4-x86_64.pl, which | ||
| 18 | # is heavily based on submission by Maxim Perminov, Maxim Locktyukhin | ||
| 19 | # and Jim Guilford of Intel. MD5 is fresh implementation aiming to | ||
| 20 | # minimize register usage, which was used as "main thread" with RC4 | ||
| 21 | # weaved into it, one RC4 round per one MD5 round. In addition to the | ||
| 22 | # stiched subroutine the script can generate standalone replacement | ||
| 23 | # md5_block_asm_data_order and RC4. Below are performance numbers in | ||
| 24 | # cycles per processed byte, less is better, for these the standalone | ||
| 25 | # subroutines, sum of them, and stitched one: | ||
| 26 | # | ||
| 27 | # RC4 MD5 RC4+MD5 stitch gain | ||
| 28 | # Opteron 6.5(*) 5.4 11.9 7.0 +70%(*) | ||
| 29 | # Core2 6.5 5.8 12.3 7.7 +60% | ||
| 30 | # Westmere 4.3 5.2 9.5 7.0 +36% | ||
| 31 | # Sandy Bridge 4.2 5.5 9.7 6.8 +43% | ||
| 32 | # Atom 9.3 6.5 15.8 11.1 +42% | ||
| 33 | # | ||
| 34 | # (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement | ||
| 35 | # is +53%... | ||
| 36 | |||
| 37 | my ($rc4,$md5)=(1,1); # what to generate? | ||
| 38 | my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(), | ||
| 39 | # but its result is discarded. Idea here is | ||
| 40 | # to be able to use 'openssl speed rc4' for | ||
| 41 | # benchmarking the stitched subroutine... | ||
| 42 | |||
| 43 | my $flavour = shift; | ||
| 44 | my $output = shift; | ||
| 45 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 46 | |||
| 47 | my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 48 | |||
| 49 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate; | ||
| 50 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 51 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 52 | die "can't locate x86_64-xlate.pl"; | ||
| 53 | |||
| 54 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 55 | |||
| 56 | my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs); | ||
| 57 | |||
| 58 | if ($rc4 && !$md5) { | ||
| 59 | ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx"); | ||
| 60 | $func="RC4"; $nargs=4; | ||
| 61 | } elsif ($md5 && !$rc4) { | ||
| 62 | ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx"); | ||
| 63 | $func="md5_block_asm_data_order"; $nargs=3; | ||
| 64 | } else { | ||
| 65 | ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | ||
| 66 | $func="rc4_md5_enc"; $nargs=6; | ||
| 67 | # void rc4_md5_enc( | ||
| 68 | # RC4_KEY *key, # | ||
| 69 | # const void *in0, # RC4 input | ||
| 70 | # void *out, # RC4 output | ||
| 71 | # MD5_CTX *ctx, # | ||
| 72 | # const void *inp, # MD5 input | ||
| 73 | # size_t len); # number of 64-byte blocks | ||
| 74 | } | ||
| 75 | |||
| 76 | my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee, | ||
| 77 | 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501, | ||
| 78 | 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be, | ||
| 79 | 0x6b901122,0xfd987193,0xa679438e,0x49b40821, | ||
| 80 | |||
| 81 | 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa, | ||
| 82 | 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8, | ||
| 83 | 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed, | ||
| 84 | 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a, | ||
| 85 | |||
| 86 | 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c, | ||
| 87 | 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70, | ||
| 88 | 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05, | ||
| 89 | 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665, | ||
| 90 | |||
| 91 | 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039, | ||
| 92 | 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1, | ||
| 93 | 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1, | ||
| 94 | 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 ); | ||
| 95 | |||
| 96 | my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers | ||
| 97 | my $tmp="%r12d"; | ||
| 98 | |||
| 99 | my @XX=("%rbp","%rsi"); # RC4 registers | ||
| 100 | my @TX=("%rax","%rbx"); | ||
| 101 | my $YY="%rcx"; | ||
| 102 | my $TY="%rdx"; | ||
| 103 | |||
| 104 | my $MOD=32; # 16, 32 or 64 | ||
| 105 | |||
| 106 | $code.=<<___; | ||
| 107 | .text | ||
| 108 | .align 16 | ||
| 109 | |||
| 110 | .globl $func | ||
| 111 | .type $func,\@function,$nargs | ||
| 112 | $func: | ||
| 113 | cmp \$0,$len | ||
| 114 | je .Labort | ||
| 115 | push %rbx | ||
| 116 | push %rbp | ||
| 117 | push %r12 | ||
| 118 | push %r13 | ||
| 119 | push %r14 | ||
| 120 | push %r15 | ||
| 121 | sub \$40,%rsp | ||
| 122 | .Lbody: | ||
| 123 | ___ | ||
| 124 | if ($rc4) { | ||
| 125 | $code.=<<___; | ||
| 126 | $D#md5# mov $ctx,%r11 # reassign arguments | ||
| 127 | mov $len,%r12 | ||
| 128 | mov $in0,%r13 | ||
| 129 | mov $out,%r14 | ||
| 130 | $D#md5# mov $inp,%r15 | ||
| 131 | ___ | ||
| 132 | $ctx="%r11" if ($md5); # reassign arguments | ||
| 133 | $len="%r12"; | ||
| 134 | $in0="%r13"; | ||
| 135 | $out="%r14"; | ||
| 136 | $inp="%r15" if ($md5); | ||
| 137 | $inp=$in0 if (!$md5); | ||
| 138 | $code.=<<___; | ||
| 139 | xor $XX[0],$XX[0] | ||
| 140 | xor $YY,$YY | ||
| 141 | |||
| 142 | lea 8($dat),$dat | ||
| 143 | mov -8($dat),$XX[0]#b | ||
| 144 | mov -4($dat),$YY#b | ||
| 145 | |||
| 146 | inc $XX[0]#b | ||
| 147 | sub $in0,$out | ||
| 148 | movl ($dat,$XX[0],4),$TX[0]#d | ||
| 149 | ___ | ||
| 150 | $code.=<<___ if (!$md5); | ||
| 151 | xor $TX[1],$TX[1] | ||
| 152 | test \$-128,$len | ||
| 153 | jz .Loop1 | ||
| 154 | sub $XX[0],$TX[1] | ||
| 155 | and \$`$MOD-1`,$TX[1] | ||
| 156 | jz .Loop${MOD}_is_hot | ||
| 157 | sub $TX[1],$len | ||
| 158 | .Loop${MOD}_warmup: | ||
| 159 | add $TX[0]#b,$YY#b | ||
| 160 | movl ($dat,$YY,4),$TY#d | ||
| 161 | movl $TX[0]#d,($dat,$YY,4) | ||
| 162 | movl $TY#d,($dat,$XX[0],4) | ||
| 163 | add $TY#b,$TX[0]#b | ||
| 164 | inc $XX[0]#b | ||
| 165 | movl ($dat,$TX[0],4),$TY#d | ||
| 166 | movl ($dat,$XX[0],4),$TX[0]#d | ||
| 167 | xorb ($in0),$TY#b | ||
| 168 | movb $TY#b,($out,$in0) | ||
| 169 | lea 1($in0),$in0 | ||
| 170 | dec $TX[1] | ||
| 171 | jnz .Loop${MOD}_warmup | ||
| 172 | |||
| 173 | mov $YY,$TX[1] | ||
| 174 | xor $YY,$YY | ||
| 175 | mov $TX[1]#b,$YY#b | ||
| 176 | |||
| 177 | .Loop${MOD}_is_hot: | ||
| 178 | mov $len,32(%rsp) # save original $len | ||
| 179 | shr \$6,$len # number of 64-byte blocks | ||
| 180 | ___ | ||
| 181 | if ($D && !$md5) { # stitch in dummy MD5 | ||
| 182 | $md5=1; | ||
| 183 | $ctx="%r11"; | ||
| 184 | $inp="%r15"; | ||
| 185 | $code.=<<___; | ||
| 186 | mov %rsp,$ctx | ||
| 187 | mov $in0,$inp | ||
| 188 | ___ | ||
| 189 | } | ||
| 190 | } | ||
| 191 | $code.=<<___; | ||
| 192 | #rc4# add $TX[0]#b,$YY#b | ||
| 193 | #rc4# lea ($dat,$XX[0],4),$XX[1] | ||
| 194 | shl \$6,$len | ||
| 195 | add $inp,$len # pointer to the end of input | ||
| 196 | mov $len,16(%rsp) | ||
| 197 | |||
| 198 | #md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX | ||
| 199 | #md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX | ||
| 200 | #md5# mov 1*4($ctx),$V[1] | ||
| 201 | #md5# mov 2*4($ctx),$V[2] | ||
| 202 | #md5# mov 3*4($ctx),$V[3] | ||
| 203 | jmp .Loop | ||
| 204 | |||
| 205 | .align 16 | ||
| 206 | .Loop: | ||
| 207 | #md5# mov $V[0],0*4(%rsp) # put aside current hash value | ||
| 208 | #md5# mov $V[1],1*4(%rsp) | ||
| 209 | #md5# mov $V[2],2*4(%rsp) | ||
| 210 | #md5# mov $V[3],$tmp # forward reference | ||
| 211 | #md5# mov $V[3],3*4(%rsp) | ||
| 212 | ___ | ||
| 213 | |||
| 214 | sub R0 { | ||
| 215 | my ($i,$a,$b,$c,$d)=@_; | ||
| 216 | my @rot0=(7,12,17,22); | ||
| 217 | my $j=$i%16; | ||
| 218 | my $k=$i%$MOD; | ||
| 219 | my $xmm="%xmm".($j&1); | ||
| 220 | $code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15); | ||
| 221 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 222 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
| 223 | $code.=<<___; | ||
| 224 | #rc4# movl ($dat,$YY,4),$TY#d | ||
| 225 | #md5# xor $c,$tmp | ||
| 226 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
| 227 | #md5# and $b,$tmp | ||
| 228 | #md5# add 4*`$j`($inp),$a | ||
| 229 | #rc4# add $TY#b,$TX[0]#b | ||
| 230 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
| 231 | #md5# add \$$K[$i],$a | ||
| 232 | #md5# xor $d,$tmp | ||
| 233 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
| 234 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
| 235 | #md5# add $tmp,$a | ||
| 236 | #rc4# add $TX[1]#b,$YY#b | ||
| 237 | #md5# rol \$$rot0[$j%4],$a | ||
| 238 | #md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference | ||
| 239 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
| 240 | #md5# add $b,$a | ||
| 241 | ___ | ||
| 242 | $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 243 | mov $YY,$XX[1] | ||
| 244 | xor $YY,$YY # keyword to partial register | ||
| 245 | mov $XX[1]#b,$YY#b | ||
| 246 | lea ($dat,$XX[0],4),$XX[1] | ||
| 247 | ___ | ||
| 248 | $code.=<<___ if ($rc4 && $j==15); | ||
| 249 | psllq \$8,%xmm1 | ||
| 250 | pxor %xmm0,%xmm2 | ||
| 251 | pxor %xmm1,%xmm2 | ||
| 252 | ___ | ||
| 253 | } | ||
| 254 | sub R1 { | ||
| 255 | my ($i,$a,$b,$c,$d)=@_; | ||
| 256 | my @rot1=(5,9,14,20); | ||
| 257 | my $j=$i%16; | ||
| 258 | my $k=$i%$MOD; | ||
| 259 | my $xmm="%xmm".($j&1); | ||
| 260 | $code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15); | ||
| 261 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 262 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
| 263 | $code.=<<___; | ||
| 264 | #rc4# movl ($dat,$YY,4),$TY#d | ||
| 265 | #md5# xor $b,$tmp | ||
| 266 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
| 267 | #md5# and $d,$tmp | ||
| 268 | #md5# add 4*`((1+5*$j)%16)`($inp),$a | ||
| 269 | #rc4# add $TY#b,$TX[0]#b | ||
| 270 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
| 271 | #md5# add \$$K[$i],$a | ||
| 272 | #md5# xor $c,$tmp | ||
| 273 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
| 274 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
| 275 | #md5# add $tmp,$a | ||
| 276 | #rc4# add $TX[1]#b,$YY#b | ||
| 277 | #md5# rol \$$rot1[$j%4],$a | ||
| 278 | #md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference | ||
| 279 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
| 280 | #md5# add $b,$a | ||
| 281 | ___ | ||
| 282 | $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 283 | mov $YY,$XX[1] | ||
| 284 | xor $YY,$YY # keyword to partial register | ||
| 285 | mov $XX[1]#b,$YY#b | ||
| 286 | lea ($dat,$XX[0],4),$XX[1] | ||
| 287 | ___ | ||
| 288 | $code.=<<___ if ($rc4 && $j==15); | ||
| 289 | psllq \$8,%xmm1 | ||
| 290 | pxor %xmm0,%xmm3 | ||
| 291 | pxor %xmm1,%xmm3 | ||
| 292 | ___ | ||
| 293 | } | ||
| 294 | sub R2 { | ||
| 295 | my ($i,$a,$b,$c,$d)=@_; | ||
| 296 | my @rot2=(4,11,16,23); | ||
| 297 | my $j=$i%16; | ||
| 298 | my $k=$i%$MOD; | ||
| 299 | my $xmm="%xmm".($j&1); | ||
| 300 | $code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15); | ||
| 301 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 302 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
| 303 | $code.=<<___; | ||
| 304 | #rc4# movl ($dat,$YY,4),$TY#d | ||
| 305 | #md5# xor $c,$tmp | ||
| 306 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
| 307 | #md5# xor $b,$tmp | ||
| 308 | #md5# add 4*`((5+3*$j)%16)`($inp),$a | ||
| 309 | #rc4# add $TY#b,$TX[0]#b | ||
| 310 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
| 311 | #md5# add \$$K[$i],$a | ||
| 312 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
| 313 | #md5# add $tmp,$a | ||
| 314 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
| 315 | #rc4# add $TX[1]#b,$YY#b | ||
| 316 | #md5# rol \$$rot2[$j%4],$a | ||
| 317 | #md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference | ||
| 318 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
| 319 | #md5# add $b,$a | ||
| 320 | ___ | ||
| 321 | $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 322 | mov $YY,$XX[1] | ||
| 323 | xor $YY,$YY # keyword to partial register | ||
| 324 | mov $XX[1]#b,$YY#b | ||
| 325 | lea ($dat,$XX[0],4),$XX[1] | ||
| 326 | ___ | ||
| 327 | $code.=<<___ if ($rc4 && $j==15); | ||
| 328 | psllq \$8,%xmm1 | ||
| 329 | pxor %xmm0,%xmm4 | ||
| 330 | pxor %xmm1,%xmm4 | ||
| 331 | ___ | ||
| 332 | } | ||
| 333 | sub R3 { | ||
| 334 | my ($i,$a,$b,$c,$d)=@_; | ||
| 335 | my @rot3=(6,10,15,21); | ||
| 336 | my $j=$i%16; | ||
| 337 | my $k=$i%$MOD; | ||
| 338 | my $xmm="%xmm".($j&1); | ||
| 339 | $code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15); | ||
| 340 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 341 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
| 342 | $code.=<<___; | ||
| 343 | #rc4# movl ($dat,$YY,4),$TY#d | ||
| 344 | #md5# xor $d,$tmp | ||
| 345 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
| 346 | #md5# or $b,$tmp | ||
| 347 | #md5# add 4*`((7*$j)%16)`($inp),$a | ||
| 348 | #rc4# add $TY#b,$TX[0]#b | ||
| 349 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
| 350 | #md5# add \$$K[$i],$a | ||
| 351 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
| 352 | #md5# xor $c,$tmp | ||
| 353 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
| 354 | #md5# add $tmp,$a | ||
| 355 | #rc4# add $TX[1]#b,$YY#b | ||
| 356 | #md5# rol \$$rot3[$j%4],$a | ||
| 357 | #md5# mov \$-1,$tmp # forward reference | ||
| 358 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
| 359 | #md5# add $b,$a | ||
| 360 | ___ | ||
| 361 | $code.=<<___ if ($rc4 && $j==15); | ||
| 362 | mov $XX[0],$XX[1] | ||
| 363 | xor $XX[0],$XX[0] # keyword to partial register | ||
| 364 | mov $XX[1]#b,$XX[0]#b | ||
| 365 | mov $YY,$XX[1] | ||
| 366 | xor $YY,$YY # keyword to partial register | ||
| 367 | mov $XX[1]#b,$YY#b | ||
| 368 | lea ($dat,$XX[0],4),$XX[1] | ||
| 369 | psllq \$8,%xmm1 | ||
| 370 | pxor %xmm0,%xmm5 | ||
| 371 | pxor %xmm1,%xmm5 | ||
| 372 | ___ | ||
| 373 | } | ||
| 374 | |||
| 375 | my $i=0; | ||
| 376 | for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
| 377 | for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
| 378 | for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
| 379 | for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
| 380 | |||
| 381 | $code.=<<___; | ||
| 382 | #md5# add 0*4(%rsp),$V[0] # accumulate hash value | ||
| 383 | #md5# add 1*4(%rsp),$V[1] | ||
| 384 | #md5# add 2*4(%rsp),$V[2] | ||
| 385 | #md5# add 3*4(%rsp),$V[3] | ||
| 386 | |||
| 387 | #rc4# movdqu %xmm2,($out,$in0) # write RC4 output | ||
| 388 | #rc4# movdqu %xmm3,16($out,$in0) | ||
| 389 | #rc4# movdqu %xmm4,32($out,$in0) | ||
| 390 | #rc4# movdqu %xmm5,48($out,$in0) | ||
| 391 | #md5# lea 64($inp),$inp | ||
| 392 | #rc4# lea 64($in0),$in0 | ||
| 393 | cmp 16(%rsp),$inp # are we done? | ||
| 394 | jb .Loop | ||
| 395 | |||
| 396 | #md5# mov 24(%rsp),$len # restore pointer to MD5_CTX | ||
| 397 | #rc4# sub $TX[0]#b,$YY#b # correct $YY | ||
| 398 | #md5# mov $V[0],0*4($len) # write MD5_CTX | ||
| 399 | #md5# mov $V[1],1*4($len) | ||
| 400 | #md5# mov $V[2],2*4($len) | ||
| 401 | #md5# mov $V[3],3*4($len) | ||
| 402 | ___ | ||
| 403 | $code.=<<___ if ($rc4 && (!$md5 || $D)); | ||
| 404 | mov 32(%rsp),$len # restore original $len | ||
| 405 | and \$63,$len # remaining bytes | ||
| 406 | jnz .Loop1 | ||
| 407 | jmp .Ldone | ||
| 408 | |||
| 409 | .align 16 | ||
| 410 | .Loop1: | ||
| 411 | add $TX[0]#b,$YY#b | ||
| 412 | movl ($dat,$YY,4),$TY#d | ||
| 413 | movl $TX[0]#d,($dat,$YY,4) | ||
| 414 | movl $TY#d,($dat,$XX[0],4) | ||
| 415 | add $TY#b,$TX[0]#b | ||
| 416 | inc $XX[0]#b | ||
| 417 | movl ($dat,$TX[0],4),$TY#d | ||
| 418 | movl ($dat,$XX[0],4),$TX[0]#d | ||
| 419 | xorb ($in0),$TY#b | ||
| 420 | movb $TY#b,($out,$in0) | ||
| 421 | lea 1($in0),$in0 | ||
| 422 | dec $len | ||
| 423 | jnz .Loop1 | ||
| 424 | |||
| 425 | .Ldone: | ||
| 426 | ___ | ||
| 427 | $code.=<<___; | ||
| 428 | #rc4# sub \$1,$XX[0]#b | ||
| 429 | #rc4# movl $XX[0]#d,-8($dat) | ||
| 430 | #rc4# movl $YY#d,-4($dat) | ||
| 431 | |||
| 432 | mov 40(%rsp),%r15 | ||
| 433 | mov 48(%rsp),%r14 | ||
| 434 | mov 56(%rsp),%r13 | ||
| 435 | mov 64(%rsp),%r12 | ||
| 436 | mov 72(%rsp),%rbp | ||
| 437 | mov 80(%rsp),%rbx | ||
| 438 | lea 88(%rsp),%rsp | ||
| 439 | .Lepilogue: | ||
| 440 | .Labort: | ||
| 441 | ret | ||
| 442 | .size $func,.-$func | ||
| 443 | ___ | ||
| 444 | |||
| 445 | if ($rc4 && $D) { # sole purpose of this section is to provide | ||
| 446 | # option to use the generated module as drop-in | ||
| 447 | # replacement for rc4-x86_64.pl for debugging | ||
| 448 | # and testing purposes... | ||
| 449 | my ($idx,$ido)=("%r8","%r9"); | ||
| 450 | my ($dat,$len,$inp)=("%rdi","%rsi","%rdx"); | ||
| 451 | |||
| 452 | $code.=<<___; | ||
| 453 | .globl RC4_set_key | ||
| 454 | .type RC4_set_key,\@function,3 | ||
| 455 | .align 16 | ||
| 456 | RC4_set_key: | ||
| 457 | lea 8($dat),$dat | ||
| 458 | lea ($inp,$len),$inp | ||
| 459 | neg $len | ||
| 460 | mov $len,%rcx | ||
| 461 | xor %eax,%eax | ||
| 462 | xor $ido,$ido | ||
| 463 | xor %r10,%r10 | ||
| 464 | xor %r11,%r11 | ||
| 465 | jmp .Lw1stloop | ||
| 466 | |||
| 467 | .align 16 | ||
| 468 | .Lw1stloop: | ||
| 469 | mov %eax,($dat,%rax,4) | ||
| 470 | add \$1,%al | ||
| 471 | jnc .Lw1stloop | ||
| 472 | |||
| 473 | xor $ido,$ido | ||
| 474 | xor $idx,$idx | ||
| 475 | .align 16 | ||
| 476 | .Lw2ndloop: | ||
| 477 | mov ($dat,$ido,4),%r10d | ||
| 478 | add ($inp,$len,1),$idx#b | ||
| 479 | add %r10b,$idx#b | ||
| 480 | add \$1,$len | ||
| 481 | mov ($dat,$idx,4),%r11d | ||
| 482 | cmovz %rcx,$len | ||
| 483 | mov %r10d,($dat,$idx,4) | ||
| 484 | mov %r11d,($dat,$ido,4) | ||
| 485 | add \$1,$ido#b | ||
| 486 | jnc .Lw2ndloop | ||
| 487 | |||
| 488 | xor %eax,%eax | ||
| 489 | mov %eax,-8($dat) | ||
| 490 | mov %eax,-4($dat) | ||
| 491 | ret | ||
| 492 | .size RC4_set_key,.-RC4_set_key | ||
| 493 | |||
| 494 | .globl RC4_options | ||
| 495 | .type RC4_options,\@abi-omnipotent | ||
| 496 | .align 16 | ||
| 497 | RC4_options: | ||
| 498 | lea .Lopts(%rip),%rax | ||
| 499 | ret | ||
| 500 | .align 64 | ||
| 501 | .Lopts: | ||
| 502 | .asciz "rc4(64x,int)" | ||
| 503 | .align 64 | ||
| 504 | .size RC4_options,.-RC4_options | ||
| 505 | ___ | ||
| 506 | } | ||
| 507 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 508 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 509 | if ($win64) { | ||
| 510 | my $rec="%rcx"; | ||
| 511 | my $frame="%rdx"; | ||
| 512 | my $context="%r8"; | ||
| 513 | my $disp="%r9"; | ||
| 514 | |||
| 515 | $code.=<<___; | ||
| 516 | .extern __imp_RtlVirtualUnwind | ||
| 517 | .type se_handler,\@abi-omnipotent | ||
| 518 | .align 16 | ||
| 519 | se_handler: | ||
| 520 | push %rsi | ||
| 521 | push %rdi | ||
| 522 | push %rbx | ||
| 523 | push %rbp | ||
| 524 | push %r12 | ||
| 525 | push %r13 | ||
| 526 | push %r14 | ||
| 527 | push %r15 | ||
| 528 | pushfq | ||
| 529 | sub \$64,%rsp | ||
| 530 | |||
| 531 | mov 120($context),%rax # pull context->Rax | ||
| 532 | mov 248($context),%rbx # pull context->Rip | ||
| 533 | |||
| 534 | lea .Lbody(%rip),%r10 | ||
| 535 | cmp %r10,%rbx # context->Rip<.Lbody | ||
| 536 | jb .Lin_prologue | ||
| 537 | |||
| 538 | mov 152($context),%rax # pull context->Rsp | ||
| 539 | |||
| 540 | lea .Lepilogue(%rip),%r10 | ||
| 541 | cmp %r10,%rbx # context->Rip>=.Lepilogue | ||
| 542 | jae .Lin_prologue | ||
| 543 | |||
| 544 | mov 40(%rax),%r15 | ||
| 545 | mov 48(%rax),%r14 | ||
| 546 | mov 56(%rax),%r13 | ||
| 547 | mov 64(%rax),%r12 | ||
| 548 | mov 72(%rax),%rbp | ||
| 549 | mov 80(%rax),%rbx | ||
| 550 | lea 88(%rax),%rax | ||
| 551 | |||
| 552 | mov %rbx,144($context) # restore context->Rbx | ||
| 553 | mov %rbp,160($context) # restore context->Rbp | ||
| 554 | mov %r12,216($context) # restore context->R12 | ||
| 555 | mov %r13,224($context) # restore context->R12 | ||
| 556 | mov %r14,232($context) # restore context->R14 | ||
| 557 | mov %r15,240($context) # restore context->R15 | ||
| 558 | |||
| 559 | .Lin_prologue: | ||
| 560 | mov 8(%rax),%rdi | ||
| 561 | mov 16(%rax),%rsi | ||
| 562 | mov %rax,152($context) # restore context->Rsp | ||
| 563 | mov %rsi,168($context) # restore context->Rsi | ||
| 564 | mov %rdi,176($context) # restore context->Rdi | ||
| 565 | |||
| 566 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 567 | mov $context,%rsi # context | ||
| 568 | mov \$154,%ecx # sizeof(CONTEXT) | ||
| 569 | .long 0xa548f3fc # cld; rep movsq | ||
| 570 | |||
| 571 | mov $disp,%rsi | ||
| 572 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 573 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 574 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 575 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 576 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 577 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 578 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 579 | mov %r10,32(%rsp) # arg5 | ||
| 580 | mov %r11,40(%rsp) # arg6 | ||
| 581 | mov %r12,48(%rsp) # arg7 | ||
| 582 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 583 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 584 | |||
| 585 | mov \$1,%eax # ExceptionContinueSearch | ||
| 586 | add \$64,%rsp | ||
| 587 | popfq | ||
| 588 | pop %r15 | ||
| 589 | pop %r14 | ||
| 590 | pop %r13 | ||
| 591 | pop %r12 | ||
| 592 | pop %rbp | ||
| 593 | pop %rbx | ||
| 594 | pop %rdi | ||
| 595 | pop %rsi | ||
| 596 | ret | ||
| 597 | .size se_handler,.-se_handler | ||
| 598 | |||
| 599 | .section .pdata | ||
| 600 | .align 4 | ||
| 601 | .rva .LSEH_begin_$func | ||
| 602 | .rva .LSEH_end_$func | ||
| 603 | .rva .LSEH_info_$func | ||
| 604 | |||
| 605 | .section .xdata | ||
| 606 | .align 8 | ||
| 607 | .LSEH_info_$func: | ||
| 608 | .byte 9,0,0,0 | ||
| 609 | .rva se_handler | ||
| 610 | ___ | ||
| 611 | } | ||
| 612 | |||
| 613 | sub reg_part { | ||
| 614 | my ($reg,$conv)=@_; | ||
| 615 | if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } | ||
| 616 | elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } | ||
| 617 | elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; } | ||
| 618 | elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } | ||
| 619 | return $reg; | ||
| 620 | } | ||
| 621 | |||
| 622 | $code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; | ||
| 623 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 624 | $code =~ s/pinsrw\s+\$0,/movd /gm; | ||
| 625 | |||
| 626 | $code =~ s/#md5#//gm if ($md5); | ||
| 627 | $code =~ s/#rc4#//gm if ($rc4); | ||
| 628 | |||
| 629 | print $code; | ||
| 630 | |||
| 631 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/rc4/asm/rc4-parisc.pl b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl new file mode 100644 index 0000000000..9165067080 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl | |||
| @@ -0,0 +1,313 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # RC4 for PA-RISC. | ||
| 11 | |||
| 12 | # June 2009. | ||
| 13 | # | ||
| 14 | # Performance is 33% better than gcc 3.2 generated code on PA-7100LC. | ||
| 15 | # For reference, [4x] unrolled loop is >40% faster than folded one. | ||
| 16 | # It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement | ||
| 17 | # is believed to be not sufficient to justify the effort... | ||
| 18 | # | ||
| 19 | # Special thanks to polarhome.com for providing HP-UX account. | ||
| 20 | |||
| 21 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 22 | |||
| 23 | $flavour = shift; | ||
| 24 | $output = shift; | ||
| 25 | open STDOUT,">$output"; | ||
| 26 | |||
| 27 | if ($flavour =~ /64/) { | ||
| 28 | $LEVEL ="2.0W"; | ||
| 29 | $SIZE_T =8; | ||
| 30 | $FRAME_MARKER =80; | ||
| 31 | $SAVED_RP =16; | ||
| 32 | $PUSH ="std"; | ||
| 33 | $PUSHMA ="std,ma"; | ||
| 34 | $POP ="ldd"; | ||
| 35 | $POPMB ="ldd,mb"; | ||
| 36 | } else { | ||
| 37 | $LEVEL ="1.0"; | ||
| 38 | $SIZE_T =4; | ||
| 39 | $FRAME_MARKER =48; | ||
| 40 | $SAVED_RP =20; | ||
| 41 | $PUSH ="stw"; | ||
| 42 | $PUSHMA ="stwm"; | ||
| 43 | $POP ="ldw"; | ||
| 44 | $POPMB ="ldwm"; | ||
| 45 | } | ||
| 46 | |||
| 47 | $FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker | ||
| 48 | # [+ argument transfer] | ||
| 49 | $SZ=1; # defaults to RC4_CHAR | ||
| 50 | if (open CONF,"<${dir}../../opensslconf.h") { | ||
| 51 | while(<CONF>) { | ||
| 52 | if (m/#\s*define\s+RC4_INT\s+(.*)/) { | ||
| 53 | $SZ = ($1=~/char$/) ? 1 : 4; | ||
| 54 | last; | ||
| 55 | } | ||
| 56 | } | ||
| 57 | close CONF; | ||
| 58 | } | ||
| 59 | |||
| 60 | if ($SZ==1) { # RC4_CHAR | ||
| 61 | $LD="ldb"; | ||
| 62 | $LDX="ldbx"; | ||
| 63 | $MKX="addl"; | ||
| 64 | $ST="stb"; | ||
| 65 | } else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC) | ||
| 66 | $LD="ldw"; | ||
| 67 | $LDX="ldwx,s"; | ||
| 68 | $MKX="sh2addl"; | ||
| 69 | $ST="stw"; | ||
| 70 | } | ||
| 71 | |||
| 72 | $key="%r26"; | ||
| 73 | $len="%r25"; | ||
| 74 | $inp="%r24"; | ||
| 75 | $out="%r23"; | ||
| 76 | |||
| 77 | @XX=("%r19","%r20"); | ||
| 78 | @TX=("%r21","%r22"); | ||
| 79 | $YY="%r28"; | ||
| 80 | $TY="%r29"; | ||
| 81 | |||
| 82 | $acc="%r1"; | ||
| 83 | $ix="%r2"; | ||
| 84 | $iy="%r3"; | ||
| 85 | $dat0="%r4"; | ||
| 86 | $dat1="%r5"; | ||
| 87 | $rem="%r6"; | ||
| 88 | $mask="%r31"; | ||
| 89 | |||
| 90 | sub unrolledloopbody { | ||
| 91 | for ($i=0;$i<4;$i++) { | ||
| 92 | $code.=<<___; | ||
| 93 | ldo 1($XX[0]),$XX[1] | ||
| 94 | `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)` | ||
| 95 | and $mask,$XX[1],$XX[1] | ||
| 96 | $LDX $YY($key),$TY | ||
| 97 | $MKX $YY,$key,$ix | ||
| 98 | $LDX $XX[1]($key),$TX[1] | ||
| 99 | $MKX $XX[0],$key,$iy | ||
| 100 | $ST $TX[0],0($ix) | ||
| 101 | comclr,<> $XX[1],$YY,%r0 ; conditional | ||
| 102 | copy $TX[0],$TX[1] ; move | ||
| 103 | `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)` | ||
| 104 | $ST $TY,0($iy) | ||
| 105 | addl $TX[0],$TY,$TY | ||
| 106 | addl $TX[1],$YY,$YY | ||
| 107 | and $mask,$TY,$TY | ||
| 108 | and $mask,$YY,$YY | ||
| 109 | ___ | ||
| 110 | push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers | ||
| 111 | } } | ||
| 112 | |||
| 113 | sub foldedloop { | ||
| 114 | my ($label,$count)=@_; | ||
| 115 | $code.=<<___; | ||
| 116 | $label | ||
| 117 | $MKX $YY,$key,$iy | ||
| 118 | $LDX $YY($key),$TY | ||
| 119 | $MKX $XX[0],$key,$ix | ||
| 120 | $ST $TX[0],0($iy) | ||
| 121 | ldo 1($XX[0]),$XX[0] | ||
| 122 | $ST $TY,0($ix) | ||
| 123 | addl $TX[0],$TY,$TY | ||
| 124 | ldbx $inp($out),$dat1 | ||
| 125 | and $mask,$TY,$TY | ||
| 126 | and $mask,$XX[0],$XX[0] | ||
| 127 | $LDX $TY($key),$acc | ||
| 128 | $LDX $XX[0]($key),$TX[0] | ||
| 129 | ldo 1($out),$out | ||
| 130 | xor $dat1,$acc,$acc | ||
| 131 | addl $TX[0],$YY,$YY | ||
| 132 | stb $acc,-1($out) | ||
| 133 | addib,<> -1,$count,$label ; $count is always small | ||
| 134 | and $mask,$YY,$YY | ||
| 135 | ___ | ||
| 136 | } | ||
| 137 | |||
| 138 | $code=<<___; | ||
| 139 | .LEVEL $LEVEL | ||
| 140 | .SPACE \$TEXT\$ | ||
| 141 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 142 | |||
| 143 | .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR | ||
| 144 | RC4 | ||
| 145 | .PROC | ||
| 146 | .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6 | ||
| 147 | .ENTRY | ||
| 148 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 149 | $PUSHMA %r3,$FRAME(%sp) | ||
| 150 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 151 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 152 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 153 | |||
| 154 | cmpib,*= 0,$len,L\$abort | ||
| 155 | sub $inp,$out,$inp ; distance between $inp and $out | ||
| 156 | |||
| 157 | $LD `0*$SZ`($key),$XX[0] | ||
| 158 | $LD `1*$SZ`($key),$YY | ||
| 159 | ldo `2*$SZ`($key),$key | ||
| 160 | |||
| 161 | ldi 0xff,$mask | ||
| 162 | ldi 3,$dat0 | ||
| 163 | |||
| 164 | ldo 1($XX[0]),$XX[0] ; warm up loop | ||
| 165 | and $mask,$XX[0],$XX[0] | ||
| 166 | $LDX $XX[0]($key),$TX[0] | ||
| 167 | addl $TX[0],$YY,$YY | ||
| 168 | cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother? | ||
| 169 | and $mask,$YY,$YY | ||
| 170 | |||
| 171 | and,<> $out,$dat0,$rem ; is $out aligned? | ||
| 172 | b L\$alignedout | ||
| 173 | subi 4,$rem,$rem | ||
| 174 | sub $len,$rem,$len | ||
| 175 | ___ | ||
| 176 | &foldedloop("L\$alignout",$rem); # process till $out is aligned | ||
| 177 | |||
| 178 | $code.=<<___; | ||
| 179 | L\$alignedout ; $len is at least 4 here | ||
| 180 | and,<> $inp,$dat0,$acc ; is $inp aligned? | ||
| 181 | b L\$oop4 | ||
| 182 | sub $inp,$acc,$rem ; align $inp | ||
| 183 | |||
| 184 | sh3addl $acc,%r0,$acc | ||
| 185 | subi 32,$acc,$acc | ||
| 186 | mtctl $acc,%cr11 ; load %sar with vshd align factor | ||
| 187 | ldwx $rem($out),$dat0 | ||
| 188 | ldo 4($rem),$rem | ||
| 189 | L\$oop4misalignedinp | ||
| 190 | ___ | ||
| 191 | &unrolledloopbody(); | ||
| 192 | $code.=<<___; | ||
| 193 | $LDX $TY($key),$ix | ||
| 194 | ldwx $rem($out),$dat1 | ||
| 195 | ldo -4($len),$len | ||
| 196 | or $ix,$acc,$acc ; last piece, no need to dep | ||
| 197 | vshd $dat0,$dat1,$iy ; align data | ||
| 198 | copy $dat1,$dat0 | ||
| 199 | xor $iy,$acc,$acc | ||
| 200 | stw $acc,0($out) | ||
| 201 | cmpib,*<< 3,$len,L\$oop4misalignedinp | ||
| 202 | ldo 4($out),$out | ||
| 203 | cmpib,*= 0,$len,L\$done | ||
| 204 | nop | ||
| 205 | b L\$oop1 | ||
| 206 | nop | ||
| 207 | |||
| 208 | .ALIGN 8 | ||
| 209 | L\$oop4 | ||
| 210 | ___ | ||
| 211 | &unrolledloopbody(); | ||
| 212 | $code.=<<___; | ||
| 213 | $LDX $TY($key),$ix | ||
| 214 | ldwx $inp($out),$dat0 | ||
| 215 | ldo -4($len),$len | ||
| 216 | or $ix,$acc,$acc ; last piece, no need to dep | ||
| 217 | xor $dat0,$acc,$acc | ||
| 218 | stw $acc,0($out) | ||
| 219 | cmpib,*<< 3,$len,L\$oop4 | ||
| 220 | ldo 4($out),$out | ||
| 221 | cmpib,*= 0,$len,L\$done | ||
| 222 | nop | ||
| 223 | ___ | ||
| 224 | &foldedloop("L\$oop1",$len); | ||
| 225 | $code.=<<___; | ||
| 226 | L\$done | ||
| 227 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 | ||
| 228 | ldo -1($XX[0]),$XX[0] ; chill out loop | ||
| 229 | sub $YY,$TX[0],$YY | ||
| 230 | and $mask,$XX[0],$XX[0] | ||
| 231 | and $mask,$YY,$YY | ||
| 232 | $ST $XX[0],`-2*$SZ`($key) | ||
| 233 | $ST $YY,`-1*$SZ`($key) | ||
| 234 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 235 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 236 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 237 | L\$abort | ||
| 238 | bv (%r2) | ||
| 239 | .EXIT | ||
| 240 | $POPMB -$FRAME(%sp),%r3 | ||
| 241 | .PROCEND | ||
| 242 | ___ | ||
| 243 | |||
| 244 | $code.=<<___; | ||
| 245 | |||
| 246 | .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
| 247 | .ALIGN 8 | ||
| 248 | private_RC4_set_key | ||
| 249 | .PROC | ||
| 250 | .CALLINFO NO_CALLS | ||
| 251 | .ENTRY | ||
| 252 | $ST %r0,`0*$SZ`($key) | ||
| 253 | $ST %r0,`1*$SZ`($key) | ||
| 254 | ldo `2*$SZ`($key),$key | ||
| 255 | copy %r0,@XX[0] | ||
| 256 | L\$1st | ||
| 257 | $ST @XX[0],0($key) | ||
| 258 | ldo 1(@XX[0]),@XX[0] | ||
| 259 | bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256 | ||
| 260 | ldo $SZ($key),$key | ||
| 261 | |||
| 262 | ldo `-256*$SZ`($key),$key ; rewind $key | ||
| 263 | addl $len,$inp,$inp ; $inp to point at the end | ||
| 264 | sub %r0,$len,%r23 ; inverse index | ||
| 265 | copy %r0,@XX[0] | ||
| 266 | copy %r0,@XX[1] | ||
| 267 | ldi 0xff,$mask | ||
| 268 | |||
| 269 | L\$2nd | ||
| 270 | $LDX @XX[0]($key),@TX[0] | ||
| 271 | ldbx %r23($inp),@TX[1] | ||
| 272 | addi,nuv 1,%r23,%r23 ; increment and conditional | ||
| 273 | sub %r0,$len,%r23 ; inverse index | ||
| 274 | addl @TX[0],@XX[1],@XX[1] | ||
| 275 | addl @TX[1],@XX[1],@XX[1] | ||
| 276 | and $mask,@XX[1],@XX[1] | ||
| 277 | $MKX @XX[0],$key,$TY | ||
| 278 | $LDX @XX[1]($key),@TX[1] | ||
| 279 | $MKX @XX[1],$key,$YY | ||
| 280 | ldo 1(@XX[0]),@XX[0] | ||
| 281 | $ST @TX[0],0($YY) | ||
| 282 | bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256 | ||
| 283 | $ST @TX[1],0($TY) | ||
| 284 | |||
| 285 | bv,n (%r2) | ||
| 286 | .EXIT | ||
| 287 | nop | ||
| 288 | .PROCEND | ||
| 289 | |||
| 290 | .EXPORT RC4_options,ENTRY | ||
| 291 | .ALIGN 8 | ||
| 292 | RC4_options | ||
| 293 | .PROC | ||
| 294 | .CALLINFO NO_CALLS | ||
| 295 | .ENTRY | ||
| 296 | blr %r0,%r28 | ||
| 297 | ldi 3,%r1 | ||
| 298 | L\$pic | ||
| 299 | andcm %r28,%r1,%r28 | ||
| 300 | bv (%r2) | ||
| 301 | .EXIT | ||
| 302 | ldo L\$opts-L\$pic(%r28),%r28 | ||
| 303 | .PROCEND | ||
| 304 | .ALIGN 8 | ||
| 305 | L\$opts | ||
| 306 | .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)" | ||
| 307 | .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 308 | ___ | ||
| 309 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 310 | $code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); | ||
| 311 | |||
| 312 | print $code; | ||
| 313 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl index 96681fa05e..7528ece13c 100644 --- a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl +++ b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl | |||
| @@ -13,6 +13,29 @@ | |||
| 13 | # "cluster" Address Generation Interlocks, so that one pipeline stall | 13 | # "cluster" Address Generation Interlocks, so that one pipeline stall |
| 14 | # resolves several dependencies. | 14 | # resolves several dependencies. |
| 15 | 15 | ||
| 16 | # November 2010. | ||
| 17 | # | ||
| 18 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 19 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 20 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 21 | # application context. The feature is not specific to any particular | ||
| 22 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 23 | # remains z/Architecture specific. On z990 it was measured to perform | ||
| 24 | # 50% better than code generated by gcc 4.3. | ||
| 25 | |||
| 26 | $flavour = shift; | ||
| 27 | |||
| 28 | if ($flavour =~ /3[12]/) { | ||
| 29 | $SIZE_T=4; | ||
| 30 | $g=""; | ||
| 31 | } else { | ||
| 32 | $SIZE_T=8; | ||
| 33 | $g="g"; | ||
| 34 | } | ||
| 35 | |||
| 36 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 37 | open STDOUT,">$output"; | ||
| 38 | |||
| 16 | $rp="%r14"; | 39 | $rp="%r14"; |
| 17 | $sp="%r15"; | 40 | $sp="%r15"; |
| 18 | $code=<<___; | 41 | $code=<<___; |
| @@ -39,7 +62,12 @@ $code.=<<___; | |||
| 39 | .type RC4,\@function | 62 | .type RC4,\@function |
| 40 | .align 64 | 63 | .align 64 |
| 41 | RC4: | 64 | RC4: |
| 42 | stmg %r6,%r11,48($sp) | 65 | stm${g} %r6,%r11,6*$SIZE_T($sp) |
| 66 | ___ | ||
| 67 | $code.=<<___ if ($flavour =~ /3[12]/); | ||
| 68 | llgfr $len,$len | ||
| 69 | ___ | ||
| 70 | $code.=<<___; | ||
| 43 | llgc $XX[0],0($key) | 71 | llgc $XX[0],0($key) |
| 44 | llgc $YY,1($key) | 72 | llgc $YY,1($key) |
| 45 | la $XX[0],1($XX[0]) | 73 | la $XX[0],1($XX[0]) |
| @@ -90,7 +118,7 @@ $code.=<<___; | |||
| 90 | xgr $acc,$TX[1] | 118 | xgr $acc,$TX[1] |
| 91 | stg $acc,0($out) | 119 | stg $acc,0($out) |
| 92 | la $out,8($out) | 120 | la $out,8($out) |
| 93 | brct $cnt,.Loop8 | 121 | brctg $cnt,.Loop8 |
| 94 | 122 | ||
| 95 | .Lshort: | 123 | .Lshort: |
| 96 | lghi $acc,7 | 124 | lghi $acc,7 |
| @@ -122,7 +150,7 @@ $code.=<<___; | |||
| 122 | ahi $XX[0],-1 | 150 | ahi $XX[0],-1 |
| 123 | stc $XX[0],0($key) | 151 | stc $XX[0],0($key) |
| 124 | stc $YY,1($key) | 152 | stc $YY,1($key) |
| 125 | lmg %r6,%r11,48($sp) | 153 | lm${g} %r6,%r11,6*$SIZE_T($sp) |
| 126 | br $rp | 154 | br $rp |
| 127 | .size RC4,.-RC4 | 155 | .size RC4,.-RC4 |
| 128 | .string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 156 | .string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
| @@ -143,11 +171,11 @@ $ikey="%r7"; | |||
| 143 | $iinp="%r8"; | 171 | $iinp="%r8"; |
| 144 | 172 | ||
| 145 | $code.=<<___; | 173 | $code.=<<___; |
| 146 | .globl RC4_set_key | 174 | .globl private_RC4_set_key |
| 147 | .type RC4_set_key,\@function | 175 | .type private_RC4_set_key,\@function |
| 148 | .align 64 | 176 | .align 64 |
| 149 | RC4_set_key: | 177 | private_RC4_set_key: |
| 150 | stmg %r6,%r8,48($sp) | 178 | stm${g} %r6,%r8,6*$SIZE_T($sp) |
| 151 | lhi $cnt,256 | 179 | lhi $cnt,256 |
| 152 | la $idx,0(%r0) | 180 | la $idx,0(%r0) |
| 153 | sth $idx,0($key) | 181 | sth $idx,0($key) |
| @@ -180,9 +208,9 @@ RC4_set_key: | |||
| 180 | la $iinp,0(%r0) | 208 | la $iinp,0(%r0) |
| 181 | j .L2ndloop | 209 | j .L2ndloop |
| 182 | .Ldone: | 210 | .Ldone: |
| 183 | lmg %r6,%r8,48($sp) | 211 | lm${g} %r6,%r8,6*$SIZE_T($sp) |
| 184 | br $rp | 212 | br $rp |
| 185 | .size RC4_set_key,.-RC4_set_key | 213 | .size private_RC4_set_key,.-private_RC4_set_key |
| 186 | 214 | ||
| 187 | ___ | 215 | ___ |
| 188 | } | 216 | } |
| @@ -203,3 +231,4 @@ RC4_options: | |||
| 203 | ___ | 231 | ___ |
| 204 | 232 | ||
| 205 | print $code; | 233 | print $code; |
| 234 | close STDOUT; # force flush | ||
diff --git a/src/lib/libcrypto/rsa/rsa_ameth.c b/src/lib/libcrypto/rsa/rsa_ameth.c index 8c3209885e..2460910ab2 100644 --- a/src/lib/libcrypto/rsa/rsa_ameth.c +++ b/src/lib/libcrypto/rsa/rsa_ameth.c | |||
| @@ -265,6 +265,147 @@ static int rsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent, | |||
| 265 | return do_rsa_print(bp, pkey->pkey.rsa, indent, 1); | 265 | return do_rsa_print(bp, pkey->pkey.rsa, indent, 1); |
| 266 | } | 266 | } |
| 267 | 267 | ||
| 268 | static RSA_PSS_PARAMS *rsa_pss_decode(const X509_ALGOR *alg, | ||
| 269 | X509_ALGOR **pmaskHash) | ||
| 270 | { | ||
| 271 | const unsigned char *p; | ||
| 272 | int plen; | ||
| 273 | RSA_PSS_PARAMS *pss; | ||
| 274 | |||
| 275 | *pmaskHash = NULL; | ||
| 276 | |||
| 277 | if (!alg->parameter || alg->parameter->type != V_ASN1_SEQUENCE) | ||
| 278 | return NULL; | ||
| 279 | p = alg->parameter->value.sequence->data; | ||
| 280 | plen = alg->parameter->value.sequence->length; | ||
| 281 | pss = d2i_RSA_PSS_PARAMS(NULL, &p, plen); | ||
| 282 | |||
| 283 | if (!pss) | ||
| 284 | return NULL; | ||
| 285 | |||
| 286 | if (pss->maskGenAlgorithm) | ||
| 287 | { | ||
| 288 | ASN1_TYPE *param = pss->maskGenAlgorithm->parameter; | ||
| 289 | if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) == NID_mgf1 | ||
| 290 | && param->type == V_ASN1_SEQUENCE) | ||
| 291 | { | ||
| 292 | p = param->value.sequence->data; | ||
| 293 | plen = param->value.sequence->length; | ||
| 294 | *pmaskHash = d2i_X509_ALGOR(NULL, &p, plen); | ||
| 295 | } | ||
| 296 | } | ||
| 297 | |||
| 298 | return pss; | ||
| 299 | } | ||
| 300 | |||
| 301 | static int rsa_pss_param_print(BIO *bp, RSA_PSS_PARAMS *pss, | ||
| 302 | X509_ALGOR *maskHash, int indent) | ||
| 303 | { | ||
| 304 | int rv = 0; | ||
| 305 | if (!pss) | ||
| 306 | { | ||
| 307 | if (BIO_puts(bp, " (INVALID PSS PARAMETERS)\n") <= 0) | ||
| 308 | return 0; | ||
| 309 | return 1; | ||
| 310 | } | ||
| 311 | if (BIO_puts(bp, "\n") <= 0) | ||
| 312 | goto err; | ||
| 313 | if (!BIO_indent(bp, indent, 128)) | ||
| 314 | goto err; | ||
| 315 | if (BIO_puts(bp, "Hash Algorithm: ") <= 0) | ||
| 316 | goto err; | ||
| 317 | |||
| 318 | if (pss->hashAlgorithm) | ||
| 319 | { | ||
| 320 | if (i2a_ASN1_OBJECT(bp, pss->hashAlgorithm->algorithm) <= 0) | ||
| 321 | goto err; | ||
| 322 | } | ||
| 323 | else if (BIO_puts(bp, "sha1 (default)") <= 0) | ||
| 324 | goto err; | ||
| 325 | |||
| 326 | if (BIO_puts(bp, "\n") <= 0) | ||
| 327 | goto err; | ||
| 328 | |||
| 329 | if (!BIO_indent(bp, indent, 128)) | ||
| 330 | goto err; | ||
| 331 | |||
| 332 | if (BIO_puts(bp, "Mask Algorithm: ") <= 0) | ||
| 333 | goto err; | ||
| 334 | if (pss->maskGenAlgorithm) | ||
| 335 | { | ||
| 336 | if (i2a_ASN1_OBJECT(bp, pss->maskGenAlgorithm->algorithm) <= 0) | ||
| 337 | goto err; | ||
| 338 | if (BIO_puts(bp, " with ") <= 0) | ||
| 339 | goto err; | ||
| 340 | if (maskHash) | ||
| 341 | { | ||
| 342 | if (i2a_ASN1_OBJECT(bp, maskHash->algorithm) <= 0) | ||
| 343 | goto err; | ||
| 344 | } | ||
| 345 | else if (BIO_puts(bp, "INVALID") <= 0) | ||
| 346 | goto err; | ||
| 347 | } | ||
| 348 | else if (BIO_puts(bp, "mgf1 with sha1 (default)") <= 0) | ||
| 349 | goto err; | ||
| 350 | BIO_puts(bp, "\n"); | ||
| 351 | |||
| 352 | if (!BIO_indent(bp, indent, 128)) | ||
| 353 | goto err; | ||
| 354 | if (BIO_puts(bp, "Salt Length: ") <= 0) | ||
| 355 | goto err; | ||
| 356 | if (pss->saltLength) | ||
| 357 | { | ||
| 358 | if (i2a_ASN1_INTEGER(bp, pss->saltLength) <= 0) | ||
| 359 | goto err; | ||
| 360 | } | ||
| 361 | else if (BIO_puts(bp, "20 (default)") <= 0) | ||
| 362 | goto err; | ||
| 363 | BIO_puts(bp, "\n"); | ||
| 364 | |||
| 365 | if (!BIO_indent(bp, indent, 128)) | ||
| 366 | goto err; | ||
| 367 | if (BIO_puts(bp, "Trailer Field: ") <= 0) | ||
| 368 | goto err; | ||
| 369 | if (pss->trailerField) | ||
| 370 | { | ||
| 371 | if (i2a_ASN1_INTEGER(bp, pss->trailerField) <= 0) | ||
| 372 | goto err; | ||
| 373 | } | ||
| 374 | else if (BIO_puts(bp, "0xbc (default)") <= 0) | ||
| 375 | goto err; | ||
| 376 | BIO_puts(bp, "\n"); | ||
| 377 | |||
| 378 | rv = 1; | ||
| 379 | |||
| 380 | err: | ||
| 381 | return rv; | ||
| 382 | |||
| 383 | } | ||
| 384 | |||
| 385 | static int rsa_sig_print(BIO *bp, const X509_ALGOR *sigalg, | ||
| 386 | const ASN1_STRING *sig, | ||
| 387 | int indent, ASN1_PCTX *pctx) | ||
| 388 | { | ||
| 389 | if (OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss) | ||
| 390 | { | ||
| 391 | int rv; | ||
| 392 | RSA_PSS_PARAMS *pss; | ||
| 393 | X509_ALGOR *maskHash; | ||
| 394 | pss = rsa_pss_decode(sigalg, &maskHash); | ||
| 395 | rv = rsa_pss_param_print(bp, pss, maskHash, indent); | ||
| 396 | if (pss) | ||
| 397 | RSA_PSS_PARAMS_free(pss); | ||
| 398 | if (maskHash) | ||
| 399 | X509_ALGOR_free(maskHash); | ||
| 400 | if (!rv) | ||
| 401 | return 0; | ||
| 402 | } | ||
| 403 | else if (!sig && BIO_puts(bp, "\n") <= 0) | ||
| 404 | return 0; | ||
| 405 | if (sig) | ||
| 406 | return X509_signature_dump(bp, sig, indent); | ||
| 407 | return 1; | ||
| 408 | } | ||
| 268 | 409 | ||
| 269 | static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) | 410 | static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) |
| 270 | { | 411 | { |
| @@ -310,6 +451,211 @@ static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) | |||
| 310 | 451 | ||
| 311 | } | 452 | } |
| 312 | 453 | ||
| 454 | /* Customised RSA item verification routine. This is called | ||
| 455 | * when a signature is encountered requiring special handling. We | ||
| 456 | * currently only handle PSS. | ||
| 457 | */ | ||
| 458 | |||
| 459 | |||
| 460 | static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, | ||
| 461 | X509_ALGOR *sigalg, ASN1_BIT_STRING *sig, | ||
| 462 | EVP_PKEY *pkey) | ||
| 463 | { | ||
| 464 | int rv = -1; | ||
| 465 | int saltlen; | ||
| 466 | const EVP_MD *mgf1md = NULL, *md = NULL; | ||
| 467 | RSA_PSS_PARAMS *pss; | ||
| 468 | X509_ALGOR *maskHash; | ||
| 469 | EVP_PKEY_CTX *pkctx; | ||
| 470 | /* Sanity check: make sure it is PSS */ | ||
| 471 | if (OBJ_obj2nid(sigalg->algorithm) != NID_rsassaPss) | ||
| 472 | { | ||
| 473 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_SIGNATURE_TYPE); | ||
| 474 | return -1; | ||
| 475 | } | ||
| 476 | /* Decode PSS parameters */ | ||
| 477 | pss = rsa_pss_decode(sigalg, &maskHash); | ||
| 478 | |||
| 479 | if (pss == NULL) | ||
| 480 | { | ||
| 481 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_PSS_PARAMETERS); | ||
| 482 | goto err; | ||
| 483 | } | ||
| 484 | /* Check mask and lookup mask hash algorithm */ | ||
| 485 | if (pss->maskGenAlgorithm) | ||
| 486 | { | ||
| 487 | if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) != NID_mgf1) | ||
| 488 | { | ||
| 489 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_ALGORITHM); | ||
| 490 | goto err; | ||
| 491 | } | ||
| 492 | if (!maskHash) | ||
| 493 | { | ||
| 494 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_PARAMETER); | ||
| 495 | goto err; | ||
| 496 | } | ||
| 497 | mgf1md = EVP_get_digestbyobj(maskHash->algorithm); | ||
| 498 | if (mgf1md == NULL) | ||
| 499 | { | ||
| 500 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_MASK_DIGEST); | ||
| 501 | goto err; | ||
| 502 | } | ||
| 503 | } | ||
| 504 | else | ||
| 505 | mgf1md = EVP_sha1(); | ||
| 506 | |||
| 507 | if (pss->hashAlgorithm) | ||
| 508 | { | ||
| 509 | md = EVP_get_digestbyobj(pss->hashAlgorithm->algorithm); | ||
| 510 | if (md == NULL) | ||
| 511 | { | ||
| 512 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_PSS_DIGEST); | ||
| 513 | goto err; | ||
| 514 | } | ||
| 515 | } | ||
| 516 | else | ||
| 517 | md = EVP_sha1(); | ||
| 518 | |||
| 519 | if (pss->saltLength) | ||
| 520 | { | ||
| 521 | saltlen = ASN1_INTEGER_get(pss->saltLength); | ||
| 522 | |||
| 523 | /* Could perform more salt length sanity checks but the main | ||
| 524 | * RSA routines will trap other invalid values anyway. | ||
| 525 | */ | ||
| 526 | if (saltlen < 0) | ||
| 527 | { | ||
| 528 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_SALT_LENGTH); | ||
| 529 | goto err; | ||
| 530 | } | ||
| 531 | } | ||
| 532 | else | ||
| 533 | saltlen = 20; | ||
| 534 | |||
| 535 | /* low-level routines support only trailer field 0xbc (value 1) | ||
| 536 | * and PKCS#1 says we should reject any other value anyway. | ||
| 537 | */ | ||
| 538 | if (pss->trailerField && ASN1_INTEGER_get(pss->trailerField) != 1) | ||
| 539 | { | ||
| 540 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_TRAILER); | ||
| 541 | goto err; | ||
| 542 | } | ||
| 543 | |||
| 544 | /* We have all parameters now set up context */ | ||
| 545 | |||
| 546 | if (!EVP_DigestVerifyInit(ctx, &pkctx, md, NULL, pkey)) | ||
| 547 | goto err; | ||
| 548 | |||
| 549 | if (EVP_PKEY_CTX_set_rsa_padding(pkctx, RSA_PKCS1_PSS_PADDING) <= 0) | ||
| 550 | goto err; | ||
| 551 | |||
| 552 | if (EVP_PKEY_CTX_set_rsa_pss_saltlen(pkctx, saltlen) <= 0) | ||
| 553 | goto err; | ||
| 554 | |||
| 555 | if (EVP_PKEY_CTX_set_rsa_mgf1_md(pkctx, mgf1md) <= 0) | ||
| 556 | goto err; | ||
| 557 | /* Carry on */ | ||
| 558 | rv = 2; | ||
| 559 | |||
| 560 | err: | ||
| 561 | RSA_PSS_PARAMS_free(pss); | ||
| 562 | if (maskHash) | ||
| 563 | X509_ALGOR_free(maskHash); | ||
| 564 | return rv; | ||
| 565 | } | ||
| 566 | |||
| 567 | static int rsa_item_sign(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, | ||
| 568 | X509_ALGOR *alg1, X509_ALGOR *alg2, | ||
| 569 | ASN1_BIT_STRING *sig) | ||
| 570 | { | ||
| 571 | int pad_mode; | ||
| 572 | EVP_PKEY_CTX *pkctx = ctx->pctx; | ||
| 573 | if (EVP_PKEY_CTX_get_rsa_padding(pkctx, &pad_mode) <= 0) | ||
| 574 | return 0; | ||
| 575 | if (pad_mode == RSA_PKCS1_PADDING) | ||
| 576 | return 2; | ||
| 577 | if (pad_mode == RSA_PKCS1_PSS_PADDING) | ||
| 578 | { | ||
| 579 | const EVP_MD *sigmd, *mgf1md; | ||
| 580 | RSA_PSS_PARAMS *pss = NULL; | ||
| 581 | X509_ALGOR *mgf1alg = NULL; | ||
| 582 | ASN1_STRING *os1 = NULL, *os2 = NULL; | ||
| 583 | EVP_PKEY *pk = EVP_PKEY_CTX_get0_pkey(pkctx); | ||
| 584 | int saltlen, rv = 0; | ||
| 585 | sigmd = EVP_MD_CTX_md(ctx); | ||
| 586 | if (EVP_PKEY_CTX_get_rsa_mgf1_md(pkctx, &mgf1md) <= 0) | ||
| 587 | goto err; | ||
| 588 | if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pkctx, &saltlen)) | ||
| 589 | goto err; | ||
| 590 | if (saltlen == -1) | ||
| 591 | saltlen = EVP_MD_size(sigmd); | ||
| 592 | else if (saltlen == -2) | ||
| 593 | { | ||
| 594 | saltlen = EVP_PKEY_size(pk) - EVP_MD_size(sigmd) - 2; | ||
| 595 | if (((EVP_PKEY_bits(pk) - 1) & 0x7) == 0) | ||
| 596 | saltlen--; | ||
| 597 | } | ||
| 598 | pss = RSA_PSS_PARAMS_new(); | ||
| 599 | if (!pss) | ||
| 600 | goto err; | ||
| 601 | if (saltlen != 20) | ||
| 602 | { | ||
| 603 | pss->saltLength = ASN1_INTEGER_new(); | ||
| 604 | if (!pss->saltLength) | ||
| 605 | goto err; | ||
| 606 | if (!ASN1_INTEGER_set(pss->saltLength, saltlen)) | ||
| 607 | goto err; | ||
| 608 | } | ||
| 609 | if (EVP_MD_type(sigmd) != NID_sha1) | ||
| 610 | { | ||
| 611 | pss->hashAlgorithm = X509_ALGOR_new(); | ||
| 612 | if (!pss->hashAlgorithm) | ||
| 613 | goto err; | ||
| 614 | X509_ALGOR_set_md(pss->hashAlgorithm, sigmd); | ||
| 615 | } | ||
| 616 | if (EVP_MD_type(mgf1md) != NID_sha1) | ||
| 617 | { | ||
| 618 | ASN1_STRING *stmp = NULL; | ||
| 619 | /* need to embed algorithm ID inside another */ | ||
| 620 | mgf1alg = X509_ALGOR_new(); | ||
| 621 | X509_ALGOR_set_md(mgf1alg, mgf1md); | ||
| 622 | if (!ASN1_item_pack(mgf1alg, ASN1_ITEM_rptr(X509_ALGOR), | ||
| 623 | &stmp)) | ||
| 624 | goto err; | ||
| 625 | pss->maskGenAlgorithm = X509_ALGOR_new(); | ||
| 626 | if (!pss->maskGenAlgorithm) | ||
| 627 | goto err; | ||
| 628 | X509_ALGOR_set0(pss->maskGenAlgorithm, | ||
| 629 | OBJ_nid2obj(NID_mgf1), | ||
| 630 | V_ASN1_SEQUENCE, stmp); | ||
| 631 | } | ||
| 632 | /* Finally create string with pss parameter encoding. */ | ||
| 633 | if (!ASN1_item_pack(pss, ASN1_ITEM_rptr(RSA_PSS_PARAMS), &os1)) | ||
| 634 | goto err; | ||
| 635 | if (alg2) | ||
| 636 | { | ||
| 637 | os2 = ASN1_STRING_dup(os1); | ||
| 638 | if (!os2) | ||
| 639 | goto err; | ||
| 640 | X509_ALGOR_set0(alg2, OBJ_nid2obj(NID_rsassaPss), | ||
| 641 | V_ASN1_SEQUENCE, os2); | ||
| 642 | } | ||
| 643 | X509_ALGOR_set0(alg1, OBJ_nid2obj(NID_rsassaPss), | ||
| 644 | V_ASN1_SEQUENCE, os1); | ||
| 645 | os1 = os2 = NULL; | ||
| 646 | rv = 3; | ||
| 647 | err: | ||
| 648 | if (mgf1alg) | ||
| 649 | X509_ALGOR_free(mgf1alg); | ||
| 650 | if (pss) | ||
| 651 | RSA_PSS_PARAMS_free(pss); | ||
| 652 | if (os1) | ||
| 653 | ASN1_STRING_free(os1); | ||
| 654 | return rv; | ||
| 655 | |||
| 656 | } | ||
| 657 | return 2; | ||
| 658 | } | ||
| 313 | 659 | ||
| 314 | const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = | 660 | const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = |
| 315 | { | 661 | { |
| @@ -335,10 +681,13 @@ const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = | |||
| 335 | 681 | ||
| 336 | 0,0,0,0,0,0, | 682 | 0,0,0,0,0,0, |
| 337 | 683 | ||
| 684 | rsa_sig_print, | ||
| 338 | int_rsa_free, | 685 | int_rsa_free, |
| 339 | rsa_pkey_ctrl, | 686 | rsa_pkey_ctrl, |
| 340 | old_rsa_priv_decode, | 687 | old_rsa_priv_decode, |
| 341 | old_rsa_priv_encode | 688 | old_rsa_priv_encode, |
| 689 | rsa_item_verify, | ||
| 690 | rsa_item_sign | ||
| 342 | }, | 691 | }, |
| 343 | 692 | ||
| 344 | { | 693 | { |
diff --git a/src/lib/libcrypto/rsa/rsa_crpt.c b/src/lib/libcrypto/rsa/rsa_crpt.c new file mode 100644 index 0000000000..d3e44785dc --- /dev/null +++ b/src/lib/libcrypto/rsa/rsa_crpt.c | |||
| @@ -0,0 +1,257 @@ | |||
| 1 | /* crypto/rsa/rsa_lib.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | #include <stdio.h> | ||
| 60 | #include <openssl/crypto.h> | ||
| 61 | #include "cryptlib.h" | ||
| 62 | #include <openssl/lhash.h> | ||
| 63 | #include <openssl/bn.h> | ||
| 64 | #include <openssl/rsa.h> | ||
| 65 | #include <openssl/rand.h> | ||
| 66 | #ifndef OPENSSL_NO_ENGINE | ||
| 67 | #include <openssl/engine.h> | ||
| 68 | #endif | ||
| 69 | |||
| 70 | int RSA_size(const RSA *r) | ||
| 71 | { | ||
| 72 | return(BN_num_bytes(r->n)); | ||
| 73 | } | ||
| 74 | |||
| 75 | int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to, | ||
| 76 | RSA *rsa, int padding) | ||
| 77 | { | ||
| 78 | #ifdef OPENSSL_FIPS | ||
| 79 | if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) | ||
| 80 | && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) | ||
| 81 | { | ||
| 82 | RSAerr(RSA_F_RSA_PUBLIC_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD); | ||
| 83 | return -1; | ||
| 84 | } | ||
| 85 | #endif | ||
| 86 | return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding)); | ||
| 87 | } | ||
| 88 | |||
| 89 | int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to, | ||
| 90 | RSA *rsa, int padding) | ||
| 91 | { | ||
| 92 | #ifdef OPENSSL_FIPS | ||
| 93 | if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) | ||
| 94 | && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) | ||
| 95 | { | ||
| 96 | RSAerr(RSA_F_RSA_PRIVATE_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD); | ||
| 97 | return -1; | ||
| 98 | } | ||
| 99 | #endif | ||
| 100 | return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding)); | ||
| 101 | } | ||
| 102 | |||
| 103 | int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to, | ||
| 104 | RSA *rsa, int padding) | ||
| 105 | { | ||
| 106 | #ifdef OPENSSL_FIPS | ||
| 107 | if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) | ||
| 108 | && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) | ||
| 109 | { | ||
| 110 | RSAerr(RSA_F_RSA_PRIVATE_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD); | ||
| 111 | return -1; | ||
| 112 | } | ||
| 113 | #endif | ||
| 114 | return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding)); | ||
| 115 | } | ||
| 116 | |||
| 117 | int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to, | ||
| 118 | RSA *rsa, int padding) | ||
| 119 | { | ||
| 120 | #ifdef OPENSSL_FIPS | ||
| 121 | if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) | ||
| 122 | && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) | ||
| 123 | { | ||
| 124 | RSAerr(RSA_F_RSA_PUBLIC_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD); | ||
| 125 | return -1; | ||
| 126 | } | ||
| 127 | #endif | ||
| 128 | return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding)); | ||
| 129 | } | ||
| 130 | |||
| 131 | int RSA_flags(const RSA *r) | ||
| 132 | { | ||
| 133 | return((r == NULL)?0:r->meth->flags); | ||
| 134 | } | ||
| 135 | |||
| 136 | void RSA_blinding_off(RSA *rsa) | ||
| 137 | { | ||
| 138 | if (rsa->blinding != NULL) | ||
| 139 | { | ||
| 140 | BN_BLINDING_free(rsa->blinding); | ||
| 141 | rsa->blinding=NULL; | ||
| 142 | } | ||
| 143 | rsa->flags &= ~RSA_FLAG_BLINDING; | ||
| 144 | rsa->flags |= RSA_FLAG_NO_BLINDING; | ||
| 145 | } | ||
| 146 | |||
| 147 | int RSA_blinding_on(RSA *rsa, BN_CTX *ctx) | ||
| 148 | { | ||
| 149 | int ret=0; | ||
| 150 | |||
| 151 | if (rsa->blinding != NULL) | ||
| 152 | RSA_blinding_off(rsa); | ||
| 153 | |||
| 154 | rsa->blinding = RSA_setup_blinding(rsa, ctx); | ||
| 155 | if (rsa->blinding == NULL) | ||
| 156 | goto err; | ||
| 157 | |||
| 158 | rsa->flags |= RSA_FLAG_BLINDING; | ||
| 159 | rsa->flags &= ~RSA_FLAG_NO_BLINDING; | ||
| 160 | ret=1; | ||
| 161 | err: | ||
| 162 | return(ret); | ||
| 163 | } | ||
| 164 | |||
| 165 | static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p, | ||
| 166 | const BIGNUM *q, BN_CTX *ctx) | ||
| 167 | { | ||
| 168 | BIGNUM *ret = NULL, *r0, *r1, *r2; | ||
| 169 | |||
| 170 | if (d == NULL || p == NULL || q == NULL) | ||
| 171 | return NULL; | ||
| 172 | |||
| 173 | BN_CTX_start(ctx); | ||
| 174 | r0 = BN_CTX_get(ctx); | ||
| 175 | r1 = BN_CTX_get(ctx); | ||
| 176 | r2 = BN_CTX_get(ctx); | ||
| 177 | if (r2 == NULL) | ||
| 178 | goto err; | ||
| 179 | |||
| 180 | if (!BN_sub(r1, p, BN_value_one())) goto err; | ||
| 181 | if (!BN_sub(r2, q, BN_value_one())) goto err; | ||
| 182 | if (!BN_mul(r0, r1, r2, ctx)) goto err; | ||
| 183 | |||
| 184 | ret = BN_mod_inverse(NULL, d, r0, ctx); | ||
| 185 | err: | ||
| 186 | BN_CTX_end(ctx); | ||
| 187 | return ret; | ||
| 188 | } | ||
| 189 | |||
| 190 | BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx) | ||
| 191 | { | ||
| 192 | BIGNUM local_n; | ||
| 193 | BIGNUM *e,*n; | ||
| 194 | BN_CTX *ctx; | ||
| 195 | BN_BLINDING *ret = NULL; | ||
| 196 | |||
| 197 | if (in_ctx == NULL) | ||
| 198 | { | ||
| 199 | if ((ctx = BN_CTX_new()) == NULL) return 0; | ||
| 200 | } | ||
| 201 | else | ||
| 202 | ctx = in_ctx; | ||
| 203 | |||
| 204 | BN_CTX_start(ctx); | ||
| 205 | e = BN_CTX_get(ctx); | ||
| 206 | if (e == NULL) | ||
| 207 | { | ||
| 208 | RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE); | ||
| 209 | goto err; | ||
| 210 | } | ||
| 211 | |||
| 212 | if (rsa->e == NULL) | ||
| 213 | { | ||
| 214 | e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx); | ||
| 215 | if (e == NULL) | ||
| 216 | { | ||
| 217 | RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT); | ||
| 218 | goto err; | ||
| 219 | } | ||
| 220 | } | ||
| 221 | else | ||
| 222 | e = rsa->e; | ||
| 223 | |||
| 224 | |||
| 225 | if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL) | ||
| 226 | { | ||
| 227 | /* if PRNG is not properly seeded, resort to secret | ||
| 228 | * exponent as unpredictable seed */ | ||
| 229 | RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0); | ||
| 230 | } | ||
| 231 | |||
| 232 | if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) | ||
| 233 | { | ||
| 234 | /* Set BN_FLG_CONSTTIME flag */ | ||
| 235 | n = &local_n; | ||
| 236 | BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME); | ||
| 237 | } | ||
| 238 | else | ||
| 239 | n = rsa->n; | ||
| 240 | |||
| 241 | ret = BN_BLINDING_create_param(NULL, e, n, ctx, | ||
| 242 | rsa->meth->bn_mod_exp, rsa->_method_mod_n); | ||
| 243 | if (ret == NULL) | ||
| 244 | { | ||
| 245 | RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB); | ||
| 246 | goto err; | ||
| 247 | } | ||
| 248 | CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret)); | ||
| 249 | err: | ||
| 250 | BN_CTX_end(ctx); | ||
| 251 | if (in_ctx == NULL) | ||
| 252 | BN_CTX_free(ctx); | ||
| 253 | if(rsa->e == NULL) | ||
| 254 | BN_free(e); | ||
| 255 | |||
| 256 | return ret; | ||
| 257 | } | ||
diff --git a/src/lib/libcrypto/rsa/rsa_pmeth.c b/src/lib/libcrypto/rsa/rsa_pmeth.c index c6892ecd09..5b2ecf56ad 100644 --- a/src/lib/libcrypto/rsa/rsa_pmeth.c +++ b/src/lib/libcrypto/rsa/rsa_pmeth.c | |||
| @@ -63,6 +63,12 @@ | |||
| 63 | #include <openssl/rsa.h> | 63 | #include <openssl/rsa.h> |
| 64 | #include <openssl/bn.h> | 64 | #include <openssl/bn.h> |
| 65 | #include <openssl/evp.h> | 65 | #include <openssl/evp.h> |
| 66 | #ifndef OPENSSL_NO_CMS | ||
| 67 | #include <openssl/cms.h> | ||
| 68 | #endif | ||
| 69 | #ifdef OPENSSL_FIPS | ||
| 70 | #include <openssl/fips.h> | ||
| 71 | #endif | ||
| 66 | #include "evp_locl.h" | 72 | #include "evp_locl.h" |
| 67 | #include "rsa_locl.h" | 73 | #include "rsa_locl.h" |
| 68 | 74 | ||
| @@ -79,6 +85,8 @@ typedef struct | |||
| 79 | int pad_mode; | 85 | int pad_mode; |
| 80 | /* message digest */ | 86 | /* message digest */ |
| 81 | const EVP_MD *md; | 87 | const EVP_MD *md; |
| 88 | /* message digest for MGF1 */ | ||
| 89 | const EVP_MD *mgf1md; | ||
| 82 | /* PSS/OAEP salt length */ | 90 | /* PSS/OAEP salt length */ |
| 83 | int saltlen; | 91 | int saltlen; |
| 84 | /* Temp buffer */ | 92 | /* Temp buffer */ |
| @@ -95,6 +103,7 @@ static int pkey_rsa_init(EVP_PKEY_CTX *ctx) | |||
| 95 | rctx->pub_exp = NULL; | 103 | rctx->pub_exp = NULL; |
| 96 | rctx->pad_mode = RSA_PKCS1_PADDING; | 104 | rctx->pad_mode = RSA_PKCS1_PADDING; |
| 97 | rctx->md = NULL; | 105 | rctx->md = NULL; |
| 106 | rctx->mgf1md = NULL; | ||
| 98 | rctx->tbuf = NULL; | 107 | rctx->tbuf = NULL; |
| 99 | 108 | ||
| 100 | rctx->saltlen = -2; | 109 | rctx->saltlen = -2; |
| @@ -147,6 +156,31 @@ static void pkey_rsa_cleanup(EVP_PKEY_CTX *ctx) | |||
| 147 | OPENSSL_free(rctx); | 156 | OPENSSL_free(rctx); |
| 148 | } | 157 | } |
| 149 | } | 158 | } |
| 159 | #ifdef OPENSSL_FIPS | ||
| 160 | /* FIP checker. Return value indicates status of context parameters: | ||
| 161 | * 1 : redirect to FIPS. | ||
| 162 | * 0 : don't redirect to FIPS. | ||
| 163 | * -1 : illegal operation in FIPS mode. | ||
| 164 | */ | ||
| 165 | |||
| 166 | static int pkey_fips_check_ctx(EVP_PKEY_CTX *ctx) | ||
| 167 | { | ||
| 168 | RSA_PKEY_CTX *rctx = ctx->data; | ||
| 169 | RSA *rsa = ctx->pkey->pkey.rsa; | ||
| 170 | int rv = -1; | ||
| 171 | if (!FIPS_mode()) | ||
| 172 | return 0; | ||
| 173 | if (rsa->flags & RSA_FLAG_NON_FIPS_ALLOW) | ||
| 174 | rv = 0; | ||
| 175 | if (!(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) && rv) | ||
| 176 | return -1; | ||
| 177 | if (rctx->md && !(rctx->md->flags & EVP_MD_FLAG_FIPS)) | ||
| 178 | return rv; | ||
| 179 | if (rctx->mgf1md && !(rctx->mgf1md->flags & EVP_MD_FLAG_FIPS)) | ||
| 180 | return rv; | ||
| 181 | return 1; | ||
| 182 | } | ||
| 183 | #endif | ||
| 150 | 184 | ||
| 151 | static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | 185 | static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, |
| 152 | const unsigned char *tbs, size_t tbslen) | 186 | const unsigned char *tbs, size_t tbslen) |
| @@ -155,6 +189,15 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | |||
| 155 | RSA_PKEY_CTX *rctx = ctx->data; | 189 | RSA_PKEY_CTX *rctx = ctx->data; |
| 156 | RSA *rsa = ctx->pkey->pkey.rsa; | 190 | RSA *rsa = ctx->pkey->pkey.rsa; |
| 157 | 191 | ||
| 192 | #ifdef OPENSSL_FIPS | ||
| 193 | ret = pkey_fips_check_ctx(ctx); | ||
| 194 | if (ret < 0) | ||
| 195 | { | ||
| 196 | RSAerr(RSA_F_PKEY_RSA_SIGN, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE); | ||
| 197 | return -1; | ||
| 198 | } | ||
| 199 | #endif | ||
| 200 | |||
| 158 | if (rctx->md) | 201 | if (rctx->md) |
| 159 | { | 202 | { |
| 160 | if (tbslen != (size_t)EVP_MD_size(rctx->md)) | 203 | if (tbslen != (size_t)EVP_MD_size(rctx->md)) |
| @@ -163,7 +206,36 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | |||
| 163 | RSA_R_INVALID_DIGEST_LENGTH); | 206 | RSA_R_INVALID_DIGEST_LENGTH); |
| 164 | return -1; | 207 | return -1; |
| 165 | } | 208 | } |
| 166 | if (rctx->pad_mode == RSA_X931_PADDING) | 209 | #ifdef OPENSSL_FIPS |
| 210 | if (ret > 0) | ||
| 211 | { | ||
| 212 | unsigned int slen; | ||
| 213 | ret = FIPS_rsa_sign_digest(rsa, tbs, tbslen, rctx->md, | ||
| 214 | rctx->pad_mode, | ||
| 215 | rctx->saltlen, | ||
| 216 | rctx->mgf1md, | ||
| 217 | sig, &slen); | ||
| 218 | if (ret > 0) | ||
| 219 | *siglen = slen; | ||
| 220 | else | ||
| 221 | *siglen = 0; | ||
| 222 | return ret; | ||
| 223 | } | ||
| 224 | #endif | ||
| 225 | |||
| 226 | if (EVP_MD_type(rctx->md) == NID_mdc2) | ||
| 227 | { | ||
| 228 | unsigned int sltmp; | ||
| 229 | if (rctx->pad_mode != RSA_PKCS1_PADDING) | ||
| 230 | return -1; | ||
| 231 | ret = RSA_sign_ASN1_OCTET_STRING(NID_mdc2, | ||
| 232 | tbs, tbslen, sig, &sltmp, rsa); | ||
| 233 | |||
| 234 | if (ret <= 0) | ||
| 235 | return ret; | ||
| 236 | ret = sltmp; | ||
| 237 | } | ||
| 238 | else if (rctx->pad_mode == RSA_X931_PADDING) | ||
| 167 | { | 239 | { |
| 168 | if (!setup_tbuf(rctx, ctx)) | 240 | if (!setup_tbuf(rctx, ctx)) |
| 169 | return -1; | 241 | return -1; |
| @@ -186,8 +258,10 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | |||
| 186 | { | 258 | { |
| 187 | if (!setup_tbuf(rctx, ctx)) | 259 | if (!setup_tbuf(rctx, ctx)) |
| 188 | return -1; | 260 | return -1; |
| 189 | if (!RSA_padding_add_PKCS1_PSS(rsa, rctx->tbuf, tbs, | 261 | if (!RSA_padding_add_PKCS1_PSS_mgf1(rsa, |
| 190 | rctx->md, rctx->saltlen)) | 262 | rctx->tbuf, tbs, |
| 263 | rctx->md, rctx->mgf1md, | ||
| 264 | rctx->saltlen)) | ||
| 191 | return -1; | 265 | return -1; |
| 192 | ret = RSA_private_encrypt(RSA_size(rsa), rctx->tbuf, | 266 | ret = RSA_private_encrypt(RSA_size(rsa), rctx->tbuf, |
| 193 | sig, rsa, RSA_NO_PADDING); | 267 | sig, rsa, RSA_NO_PADDING); |
| @@ -269,8 +343,30 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx, | |||
| 269 | RSA_PKEY_CTX *rctx = ctx->data; | 343 | RSA_PKEY_CTX *rctx = ctx->data; |
| 270 | RSA *rsa = ctx->pkey->pkey.rsa; | 344 | RSA *rsa = ctx->pkey->pkey.rsa; |
| 271 | size_t rslen; | 345 | size_t rslen; |
| 346 | #ifdef OPENSSL_FIPS | ||
| 347 | int rv; | ||
| 348 | rv = pkey_fips_check_ctx(ctx); | ||
| 349 | if (rv < 0) | ||
| 350 | { | ||
| 351 | RSAerr(RSA_F_PKEY_RSA_VERIFY, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE); | ||
| 352 | return -1; | ||
| 353 | } | ||
| 354 | #endif | ||
| 272 | if (rctx->md) | 355 | if (rctx->md) |
| 273 | { | 356 | { |
| 357 | #ifdef OPENSSL_FIPS | ||
| 358 | if (rv > 0) | ||
| 359 | { | ||
| 360 | return FIPS_rsa_verify_digest(rsa, | ||
| 361 | tbs, tbslen, | ||
| 362 | rctx->md, | ||
| 363 | rctx->pad_mode, | ||
| 364 | rctx->saltlen, | ||
| 365 | rctx->mgf1md, | ||
| 366 | sig, siglen); | ||
| 367 | |||
| 368 | } | ||
| 369 | #endif | ||
| 274 | if (rctx->pad_mode == RSA_PKCS1_PADDING) | 370 | if (rctx->pad_mode == RSA_PKCS1_PADDING) |
| 275 | return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen, | 371 | return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen, |
| 276 | sig, siglen, rsa); | 372 | sig, siglen, rsa); |
| @@ -289,7 +385,8 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx, | |||
| 289 | rsa, RSA_NO_PADDING); | 385 | rsa, RSA_NO_PADDING); |
| 290 | if (ret <= 0) | 386 | if (ret <= 0) |
| 291 | return 0; | 387 | return 0; |
| 292 | ret = RSA_verify_PKCS1_PSS(rsa, tbs, rctx->md, | 388 | ret = RSA_verify_PKCS1_PSS_mgf1(rsa, tbs, |
| 389 | rctx->md, rctx->mgf1md, | ||
| 293 | rctx->tbuf, rctx->saltlen); | 390 | rctx->tbuf, rctx->saltlen); |
| 294 | if (ret <= 0) | 391 | if (ret <= 0) |
| 295 | return 0; | 392 | return 0; |
| @@ -403,15 +500,25 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
| 403 | RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE); | 500 | RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE); |
| 404 | return -2; | 501 | return -2; |
| 405 | 502 | ||
| 503 | case EVP_PKEY_CTRL_GET_RSA_PADDING: | ||
| 504 | *(int *)p2 = rctx->pad_mode; | ||
| 505 | return 1; | ||
| 506 | |||
| 406 | case EVP_PKEY_CTRL_RSA_PSS_SALTLEN: | 507 | case EVP_PKEY_CTRL_RSA_PSS_SALTLEN: |
| 407 | if (p1 < -2) | 508 | case EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN: |
| 408 | return -2; | ||
| 409 | if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) | 509 | if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) |
| 410 | { | 510 | { |
| 411 | RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PSS_SALTLEN); | 511 | RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PSS_SALTLEN); |
| 412 | return -2; | 512 | return -2; |
| 413 | } | 513 | } |
| 414 | rctx->saltlen = p1; | 514 | if (type == EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN) |
| 515 | *(int *)p2 = rctx->saltlen; | ||
| 516 | else | ||
| 517 | { | ||
| 518 | if (p1 < -2) | ||
| 519 | return -2; | ||
| 520 | rctx->saltlen = p1; | ||
| 521 | } | ||
| 415 | return 1; | 522 | return 1; |
| 416 | 523 | ||
| 417 | case EVP_PKEY_CTRL_RSA_KEYGEN_BITS: | 524 | case EVP_PKEY_CTRL_RSA_KEYGEN_BITS: |
| @@ -435,16 +542,45 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
| 435 | rctx->md = p2; | 542 | rctx->md = p2; |
| 436 | return 1; | 543 | return 1; |
| 437 | 544 | ||
| 545 | case EVP_PKEY_CTRL_RSA_MGF1_MD: | ||
| 546 | case EVP_PKEY_CTRL_GET_RSA_MGF1_MD: | ||
| 547 | if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) | ||
| 548 | { | ||
| 549 | RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_MGF1_MD); | ||
| 550 | return -2; | ||
| 551 | } | ||
| 552 | if (type == EVP_PKEY_CTRL_GET_RSA_MGF1_MD) | ||
| 553 | { | ||
| 554 | if (rctx->mgf1md) | ||
| 555 | *(const EVP_MD **)p2 = rctx->mgf1md; | ||
| 556 | else | ||
| 557 | *(const EVP_MD **)p2 = rctx->md; | ||
| 558 | } | ||
| 559 | else | ||
| 560 | rctx->mgf1md = p2; | ||
| 561 | return 1; | ||
| 562 | |||
| 438 | case EVP_PKEY_CTRL_DIGESTINIT: | 563 | case EVP_PKEY_CTRL_DIGESTINIT: |
| 439 | case EVP_PKEY_CTRL_PKCS7_ENCRYPT: | 564 | case EVP_PKEY_CTRL_PKCS7_ENCRYPT: |
| 440 | case EVP_PKEY_CTRL_PKCS7_DECRYPT: | 565 | case EVP_PKEY_CTRL_PKCS7_DECRYPT: |
| 441 | case EVP_PKEY_CTRL_PKCS7_SIGN: | 566 | case EVP_PKEY_CTRL_PKCS7_SIGN: |
| 567 | return 1; | ||
| 442 | #ifndef OPENSSL_NO_CMS | 568 | #ifndef OPENSSL_NO_CMS |
| 443 | case EVP_PKEY_CTRL_CMS_ENCRYPT: | ||
| 444 | case EVP_PKEY_CTRL_CMS_DECRYPT: | 569 | case EVP_PKEY_CTRL_CMS_DECRYPT: |
| 570 | { | ||
| 571 | X509_ALGOR *alg = NULL; | ||
| 572 | ASN1_OBJECT *encalg = NULL; | ||
| 573 | if (p2) | ||
| 574 | CMS_RecipientInfo_ktri_get0_algs(p2, NULL, NULL, &alg); | ||
| 575 | if (alg) | ||
| 576 | X509_ALGOR_get0(&encalg, NULL, NULL, alg); | ||
| 577 | if (encalg && OBJ_obj2nid(encalg) == NID_rsaesOaep) | ||
| 578 | rctx->pad_mode = RSA_PKCS1_OAEP_PADDING; | ||
| 579 | } | ||
| 580 | case EVP_PKEY_CTRL_CMS_ENCRYPT: | ||
| 445 | case EVP_PKEY_CTRL_CMS_SIGN: | 581 | case EVP_PKEY_CTRL_CMS_SIGN: |
| 446 | #endif | ||
| 447 | return 1; | 582 | return 1; |
| 583 | #endif | ||
| 448 | case EVP_PKEY_CTRL_PEER_KEY: | 584 | case EVP_PKEY_CTRL_PEER_KEY: |
| 449 | RSAerr(RSA_F_PKEY_RSA_CTRL, | 585 | RSAerr(RSA_F_PKEY_RSA_CTRL, |
| 450 | RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); | 586 | RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); |
diff --git a/src/lib/libcrypto/rsa/rsa_pss.c b/src/lib/libcrypto/rsa/rsa_pss.c index ac211e2ffe..5f9f533d0c 100644 --- a/src/lib/libcrypto/rsa/rsa_pss.c +++ b/src/lib/libcrypto/rsa/rsa_pss.c | |||
| @@ -73,6 +73,13 @@ static const unsigned char zeroes[] = {0,0,0,0,0,0,0,0}; | |||
| 73 | int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | 73 | int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, |
| 74 | const EVP_MD *Hash, const unsigned char *EM, int sLen) | 74 | const EVP_MD *Hash, const unsigned char *EM, int sLen) |
| 75 | { | 75 | { |
| 76 | return RSA_verify_PKCS1_PSS_mgf1(rsa, mHash, Hash, NULL, EM, sLen); | ||
| 77 | } | ||
| 78 | |||
| 79 | int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash, | ||
| 80 | const EVP_MD *Hash, const EVP_MD *mgf1Hash, | ||
| 81 | const unsigned char *EM, int sLen) | ||
| 82 | { | ||
| 76 | int i; | 83 | int i; |
| 77 | int ret = 0; | 84 | int ret = 0; |
| 78 | int hLen, maskedDBLen, MSBits, emLen; | 85 | int hLen, maskedDBLen, MSBits, emLen; |
| @@ -80,6 +87,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 80 | unsigned char *DB = NULL; | 87 | unsigned char *DB = NULL; |
| 81 | EVP_MD_CTX ctx; | 88 | EVP_MD_CTX ctx; |
| 82 | unsigned char H_[EVP_MAX_MD_SIZE]; | 89 | unsigned char H_[EVP_MAX_MD_SIZE]; |
| 90 | EVP_MD_CTX_init(&ctx); | ||
| 91 | |||
| 92 | if (mgf1Hash == NULL) | ||
| 93 | mgf1Hash = Hash; | ||
| 83 | 94 | ||
| 84 | hLen = EVP_MD_size(Hash); | 95 | hLen = EVP_MD_size(Hash); |
| 85 | if (hLen < 0) | 96 | if (hLen < 0) |
| @@ -94,7 +105,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 94 | else if (sLen == -2) sLen = -2; | 105 | else if (sLen == -2) sLen = -2; |
| 95 | else if (sLen < -2) | 106 | else if (sLen < -2) |
| 96 | { | 107 | { |
| 97 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); | 108 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); |
| 98 | goto err; | 109 | goto err; |
| 99 | } | 110 | } |
| 100 | 111 | ||
| @@ -102,7 +113,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 102 | emLen = RSA_size(rsa); | 113 | emLen = RSA_size(rsa); |
| 103 | if (EM[0] & (0xFF << MSBits)) | 114 | if (EM[0] & (0xFF << MSBits)) |
| 104 | { | 115 | { |
| 105 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_FIRST_OCTET_INVALID); | 116 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_FIRST_OCTET_INVALID); |
| 106 | goto err; | 117 | goto err; |
| 107 | } | 118 | } |
| 108 | if (MSBits == 0) | 119 | if (MSBits == 0) |
| @@ -112,12 +123,12 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 112 | } | 123 | } |
| 113 | if (emLen < (hLen + sLen + 2)) /* sLen can be small negative */ | 124 | if (emLen < (hLen + sLen + 2)) /* sLen can be small negative */ |
| 114 | { | 125 | { |
| 115 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_DATA_TOO_LARGE); | 126 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_DATA_TOO_LARGE); |
| 116 | goto err; | 127 | goto err; |
| 117 | } | 128 | } |
| 118 | if (EM[emLen - 1] != 0xbc) | 129 | if (EM[emLen - 1] != 0xbc) |
| 119 | { | 130 | { |
| 120 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_LAST_OCTET_INVALID); | 131 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_LAST_OCTET_INVALID); |
| 121 | goto err; | 132 | goto err; |
| 122 | } | 133 | } |
| 123 | maskedDBLen = emLen - hLen - 1; | 134 | maskedDBLen = emLen - hLen - 1; |
| @@ -125,10 +136,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 125 | DB = OPENSSL_malloc(maskedDBLen); | 136 | DB = OPENSSL_malloc(maskedDBLen); |
| 126 | if (!DB) | 137 | if (!DB) |
| 127 | { | 138 | { |
| 128 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, ERR_R_MALLOC_FAILURE); | 139 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, ERR_R_MALLOC_FAILURE); |
| 129 | goto err; | 140 | goto err; |
| 130 | } | 141 | } |
| 131 | if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, Hash) < 0) | 142 | if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, mgf1Hash) < 0) |
| 132 | goto err; | 143 | goto err; |
| 133 | for (i = 0; i < maskedDBLen; i++) | 144 | for (i = 0; i < maskedDBLen; i++) |
| 134 | DB[i] ^= EM[i]; | 145 | DB[i] ^= EM[i]; |
| @@ -137,25 +148,28 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 137 | for (i = 0; DB[i] == 0 && i < (maskedDBLen-1); i++) ; | 148 | for (i = 0; DB[i] == 0 && i < (maskedDBLen-1); i++) ; |
| 138 | if (DB[i++] != 0x1) | 149 | if (DB[i++] != 0x1) |
| 139 | { | 150 | { |
| 140 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_RECOVERY_FAILED); | 151 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_RECOVERY_FAILED); |
| 141 | goto err; | 152 | goto err; |
| 142 | } | 153 | } |
| 143 | if (sLen >= 0 && (maskedDBLen - i) != sLen) | 154 | if (sLen >= 0 && (maskedDBLen - i) != sLen) |
| 144 | { | 155 | { |
| 145 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); | 156 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); |
| 146 | goto err; | 157 | goto err; |
| 147 | } | 158 | } |
| 148 | EVP_MD_CTX_init(&ctx); | 159 | if (!EVP_DigestInit_ex(&ctx, Hash, NULL) |
| 149 | EVP_DigestInit_ex(&ctx, Hash, NULL); | 160 | || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes) |
| 150 | EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes); | 161 | || !EVP_DigestUpdate(&ctx, mHash, hLen)) |
| 151 | EVP_DigestUpdate(&ctx, mHash, hLen); | 162 | goto err; |
| 152 | if (maskedDBLen - i) | 163 | if (maskedDBLen - i) |
| 153 | EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i); | 164 | { |
| 154 | EVP_DigestFinal(&ctx, H_, NULL); | 165 | if (!EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i)) |
| 155 | EVP_MD_CTX_cleanup(&ctx); | 166 | goto err; |
| 167 | } | ||
| 168 | if (!EVP_DigestFinal_ex(&ctx, H_, NULL)) | ||
| 169 | goto err; | ||
| 156 | if (memcmp(H_, H, hLen)) | 170 | if (memcmp(H_, H, hLen)) |
| 157 | { | 171 | { |
| 158 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_BAD_SIGNATURE); | 172 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_BAD_SIGNATURE); |
| 159 | ret = 0; | 173 | ret = 0; |
| 160 | } | 174 | } |
| 161 | else | 175 | else |
| @@ -164,6 +178,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 164 | err: | 178 | err: |
| 165 | if (DB) | 179 | if (DB) |
| 166 | OPENSSL_free(DB); | 180 | OPENSSL_free(DB); |
| 181 | EVP_MD_CTX_cleanup(&ctx); | ||
| 167 | 182 | ||
| 168 | return ret; | 183 | return ret; |
| 169 | 184 | ||
| @@ -173,12 +188,22 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
| 173 | const unsigned char *mHash, | 188 | const unsigned char *mHash, |
| 174 | const EVP_MD *Hash, int sLen) | 189 | const EVP_MD *Hash, int sLen) |
| 175 | { | 190 | { |
| 191 | return RSA_padding_add_PKCS1_PSS_mgf1(rsa, EM, mHash, Hash, NULL, sLen); | ||
| 192 | } | ||
| 193 | |||
| 194 | int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM, | ||
| 195 | const unsigned char *mHash, | ||
| 196 | const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen) | ||
| 197 | { | ||
| 176 | int i; | 198 | int i; |
| 177 | int ret = 0; | 199 | int ret = 0; |
| 178 | int hLen, maskedDBLen, MSBits, emLen; | 200 | int hLen, maskedDBLen, MSBits, emLen; |
| 179 | unsigned char *H, *salt = NULL, *p; | 201 | unsigned char *H, *salt = NULL, *p; |
| 180 | EVP_MD_CTX ctx; | 202 | EVP_MD_CTX ctx; |
| 181 | 203 | ||
| 204 | if (mgf1Hash == NULL) | ||
| 205 | mgf1Hash = Hash; | ||
| 206 | |||
| 182 | hLen = EVP_MD_size(Hash); | 207 | hLen = EVP_MD_size(Hash); |
| 183 | if (hLen < 0) | 208 | if (hLen < 0) |
| 184 | goto err; | 209 | goto err; |
| @@ -192,7 +217,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
| 192 | else if (sLen == -2) sLen = -2; | 217 | else if (sLen == -2) sLen = -2; |
| 193 | else if (sLen < -2) | 218 | else if (sLen < -2) |
| 194 | { | 219 | { |
| 195 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); | 220 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); |
| 196 | goto err; | 221 | goto err; |
| 197 | } | 222 | } |
| 198 | 223 | ||
| @@ -209,8 +234,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
| 209 | } | 234 | } |
| 210 | else if (emLen < (hLen + sLen + 2)) | 235 | else if (emLen < (hLen + sLen + 2)) |
| 211 | { | 236 | { |
| 212 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, | 237 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); |
| 213 | RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); | ||
| 214 | goto err; | 238 | goto err; |
| 215 | } | 239 | } |
| 216 | if (sLen > 0) | 240 | if (sLen > 0) |
| @@ -218,8 +242,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
| 218 | salt = OPENSSL_malloc(sLen); | 242 | salt = OPENSSL_malloc(sLen); |
| 219 | if (!salt) | 243 | if (!salt) |
| 220 | { | 244 | { |
| 221 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, | 245 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,ERR_R_MALLOC_FAILURE); |
| 222 | ERR_R_MALLOC_FAILURE); | ||
| 223 | goto err; | 246 | goto err; |
| 224 | } | 247 | } |
| 225 | if (RAND_bytes(salt, sLen) <= 0) | 248 | if (RAND_bytes(salt, sLen) <= 0) |
| @@ -228,16 +251,18 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
| 228 | maskedDBLen = emLen - hLen - 1; | 251 | maskedDBLen = emLen - hLen - 1; |
| 229 | H = EM + maskedDBLen; | 252 | H = EM + maskedDBLen; |
| 230 | EVP_MD_CTX_init(&ctx); | 253 | EVP_MD_CTX_init(&ctx); |
| 231 | EVP_DigestInit_ex(&ctx, Hash, NULL); | 254 | if (!EVP_DigestInit_ex(&ctx, Hash, NULL) |
| 232 | EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes); | 255 | || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes) |
| 233 | EVP_DigestUpdate(&ctx, mHash, hLen); | 256 | || !EVP_DigestUpdate(&ctx, mHash, hLen)) |
| 234 | if (sLen) | 257 | goto err; |
| 235 | EVP_DigestUpdate(&ctx, salt, sLen); | 258 | if (sLen && !EVP_DigestUpdate(&ctx, salt, sLen)) |
| 236 | EVP_DigestFinal(&ctx, H, NULL); | 259 | goto err; |
| 260 | if (!EVP_DigestFinal_ex(&ctx, H, NULL)) | ||
| 261 | goto err; | ||
| 237 | EVP_MD_CTX_cleanup(&ctx); | 262 | EVP_MD_CTX_cleanup(&ctx); |
| 238 | 263 | ||
| 239 | /* Generate dbMask in place then perform XOR on it */ | 264 | /* Generate dbMask in place then perform XOR on it */ |
| 240 | if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, Hash)) | 265 | if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, mgf1Hash)) |
| 241 | goto err; | 266 | goto err; |
| 242 | 267 | ||
| 243 | p = EM; | 268 | p = EM; |
diff --git a/src/lib/libcrypto/s390xcap.c b/src/lib/libcrypto/s390xcap.c index ffbe0235f9..f2e94ef47e 100644 --- a/src/lib/libcrypto/s390xcap.c +++ b/src/lib/libcrypto/s390xcap.c | |||
| @@ -4,7 +4,7 @@ | |||
| 4 | #include <setjmp.h> | 4 | #include <setjmp.h> |
| 5 | #include <signal.h> | 5 | #include <signal.h> |
| 6 | 6 | ||
| 7 | extern unsigned long OPENSSL_s390xcap_P; | 7 | extern unsigned long OPENSSL_s390xcap_P[]; |
| 8 | 8 | ||
| 9 | static sigjmp_buf ill_jmp; | 9 | static sigjmp_buf ill_jmp; |
| 10 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } | 10 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } |
| @@ -16,7 +16,9 @@ void OPENSSL_cpuid_setup(void) | |||
| 16 | sigset_t oset; | 16 | sigset_t oset; |
| 17 | struct sigaction ill_act,oact; | 17 | struct sigaction ill_act,oact; |
| 18 | 18 | ||
| 19 | if (OPENSSL_s390xcap_P) return; | 19 | if (OPENSSL_s390xcap_P[0]) return; |
| 20 | |||
| 21 | OPENSSL_s390xcap_P[0] = 1UL<<(8*sizeof(unsigned long)-1); | ||
| 20 | 22 | ||
| 21 | memset(&ill_act,0,sizeof(ill_act)); | 23 | memset(&ill_act,0,sizeof(ill_act)); |
| 22 | ill_act.sa_handler = ill_handler; | 24 | ill_act.sa_handler = ill_handler; |
| @@ -27,10 +29,8 @@ void OPENSSL_cpuid_setup(void) | |||
| 27 | sigaction (SIGILL,&ill_act,&oact); | 29 | sigaction (SIGILL,&ill_act,&oact); |
| 28 | 30 | ||
| 29 | /* protection against missing store-facility-list-extended */ | 31 | /* protection against missing store-facility-list-extended */ |
| 30 | if (sigsetjmp(ill_jmp,0) == 0) | 32 | if (sigsetjmp(ill_jmp,1) == 0) |
| 31 | OPENSSL_s390xcap_P = OPENSSL_s390x_facilities(); | 33 | OPENSSL_s390x_facilities(); |
| 32 | else | ||
| 33 | OPENSSL_s390xcap_P = 1UL<<63; | ||
| 34 | 34 | ||
| 35 | sigaction (SIGILL,&oact,NULL); | 35 | sigaction (SIGILL,&oact,NULL); |
| 36 | sigprocmask(SIG_SETMASK,&oset,NULL); | 36 | sigprocmask(SIG_SETMASK,&oset,NULL); |
diff --git a/src/lib/libcrypto/s390xcpuid.S b/src/lib/libcrypto/s390xcpuid.S index b053c6a281..06815347e6 100644 --- a/src/lib/libcrypto/s390xcpuid.S +++ b/src/lib/libcrypto/s390xcpuid.S | |||
| @@ -5,10 +5,14 @@ | |||
| 5 | .align 16 | 5 | .align 16 |
| 6 | OPENSSL_s390x_facilities: | 6 | OPENSSL_s390x_facilities: |
| 7 | lghi %r0,0 | 7 | lghi %r0,0 |
| 8 | .long 0xb2b0f010 # stfle 16(%r15) | 8 | larl %r2,OPENSSL_s390xcap_P |
| 9 | lg %r2,16(%r15) | 9 | stg %r0,8(%r2) |
| 10 | larl %r1,OPENSSL_s390xcap_P | 10 | .long 0xb2b02000 # stfle 0(%r2) |
| 11 | stg %r2,0(%r1) | 11 | brc 8,.Ldone |
| 12 | lghi %r0,1 | ||
| 13 | .long 0xb2b02000 # stfle 0(%r2) | ||
| 14 | .Ldone: | ||
| 15 | lg %r2,0(%r2) | ||
| 12 | br %r14 | 16 | br %r14 |
| 13 | .size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities | 17 | .size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities |
| 14 | 18 | ||
| @@ -58,6 +62,9 @@ OPENSSL_wipe_cpu: | |||
| 58 | .type OPENSSL_cleanse,@function | 62 | .type OPENSSL_cleanse,@function |
| 59 | .align 16 | 63 | .align 16 |
| 60 | OPENSSL_cleanse: | 64 | OPENSSL_cleanse: |
| 65 | #if !defined(__s390x__) && !defined(__s390x) | ||
| 66 | llgfr %r3,%r3 | ||
| 67 | #endif | ||
| 61 | lghi %r4,15 | 68 | lghi %r4,15 |
| 62 | lghi %r0,0 | 69 | lghi %r0,0 |
| 63 | clgr %r3,%r4 | 70 | clgr %r3,%r4 |
| @@ -89,4 +96,4 @@ OPENSSL_cleanse: | |||
| 89 | .section .init | 96 | .section .init |
| 90 | brasl %r14,OPENSSL_cpuid_setup | 97 | brasl %r14,OPENSSL_cpuid_setup |
| 91 | 98 | ||
| 92 | .comm OPENSSL_s390xcap_P,8,8 | 99 | .comm OPENSSL_s390xcap_P,16,8 |
diff --git a/src/lib/libcrypto/sha/asm/sha1-alpha.pl b/src/lib/libcrypto/sha/asm/sha1-alpha.pl new file mode 100644 index 0000000000..6c4b9251fd --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-alpha.pl | |||
| @@ -0,0 +1,322 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # SHA1 block procedure for Alpha. | ||
| 11 | |||
| 12 | # On 21264 performance is 33% better than code generated by vendor | ||
| 13 | # compiler, and 75% better than GCC [3.4], and in absolute terms is | ||
| 14 | # 8.7 cycles per processed byte. Implementation features vectorized | ||
| 15 | # byte swap, but not Xupdate. | ||
| 16 | |||
| 17 | @X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", | ||
| 18 | "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15"); | ||
| 19 | $ctx="a0"; # $16 | ||
| 20 | $inp="a1"; | ||
| 21 | $num="a2"; | ||
| 22 | $A="a3"; | ||
| 23 | $B="a4"; # 20 | ||
| 24 | $C="a5"; | ||
| 25 | $D="t8"; | ||
| 26 | $E="t9"; @V=($A,$B,$C,$D,$E); | ||
| 27 | $t0="t10"; # 24 | ||
| 28 | $t1="t11"; | ||
| 29 | $t2="ra"; | ||
| 30 | $t3="t12"; | ||
| 31 | $K="AT"; # 28 | ||
| 32 | |||
| 33 | sub BODY_00_19 { | ||
| 34 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 35 | my $j=$i+1; | ||
| 36 | $code.=<<___ if ($i==0); | ||
| 37 | ldq_u @X[0],0+0($inp) | ||
| 38 | ldq_u @X[1],0+7($inp) | ||
| 39 | ___ | ||
| 40 | $code.=<<___ if (!($i&1) && $i<14); | ||
| 41 | ldq_u @X[$i+2],($i+2)*4+0($inp) | ||
| 42 | ldq_u @X[$i+3],($i+2)*4+7($inp) | ||
| 43 | ___ | ||
| 44 | $code.=<<___ if (!($i&1) && $i<15); | ||
| 45 | extql @X[$i],$inp,@X[$i] | ||
| 46 | extqh @X[$i+1],$inp,@X[$i+1] | ||
| 47 | |||
| 48 | or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched | ||
| 49 | |||
| 50 | srl @X[$i],24,$t0 # vectorized byte swap | ||
| 51 | srl @X[$i],8,$t2 | ||
| 52 | |||
| 53 | sll @X[$i],8,$t3 | ||
| 54 | sll @X[$i],24,@X[$i] | ||
| 55 | zapnot $t0,0x11,$t0 | ||
| 56 | zapnot $t2,0x22,$t2 | ||
| 57 | |||
| 58 | zapnot @X[$i],0x88,@X[$i] | ||
| 59 | or $t0,$t2,$t0 | ||
| 60 | zapnot $t3,0x44,$t3 | ||
| 61 | sll $a,5,$t1 | ||
| 62 | |||
| 63 | or @X[$i],$t0,@X[$i] | ||
| 64 | addl $K,$e,$e | ||
| 65 | and $b,$c,$t2 | ||
| 66 | zapnot $a,0xf,$a | ||
| 67 | |||
| 68 | or @X[$i],$t3,@X[$i] | ||
| 69 | srl $a,27,$t0 | ||
| 70 | bic $d,$b,$t3 | ||
| 71 | sll $b,30,$b | ||
| 72 | |||
| 73 | extll @X[$i],4,@X[$i+1] # extract upper half | ||
| 74 | or $t2,$t3,$t2 | ||
| 75 | addl @X[$i],$e,$e | ||
| 76 | |||
| 77 | addl $t1,$e,$e | ||
| 78 | srl $b,32,$t3 | ||
| 79 | zapnot @X[$i],0xf,@X[$i] | ||
| 80 | |||
| 81 | addl $t0,$e,$e | ||
| 82 | addl $t2,$e,$e | ||
| 83 | or $t3,$b,$b | ||
| 84 | ___ | ||
| 85 | $code.=<<___ if (($i&1) && $i<15); | ||
| 86 | sll $a,5,$t1 | ||
| 87 | addl $K,$e,$e | ||
| 88 | and $b,$c,$t2 | ||
| 89 | zapnot $a,0xf,$a | ||
| 90 | |||
| 91 | srl $a,27,$t0 | ||
| 92 | addl @X[$i%16],$e,$e | ||
| 93 | bic $d,$b,$t3 | ||
| 94 | sll $b,30,$b | ||
| 95 | |||
| 96 | or $t2,$t3,$t2 | ||
| 97 | addl $t1,$e,$e | ||
| 98 | srl $b,32,$t3 | ||
| 99 | zapnot @X[$i],0xf,@X[$i] | ||
| 100 | |||
| 101 | addl $t0,$e,$e | ||
| 102 | addl $t2,$e,$e | ||
| 103 | or $t3,$b,$b | ||
| 104 | ___ | ||
| 105 | $code.=<<___ if ($i>=15); # with forward Xupdate | ||
| 106 | sll $a,5,$t1 | ||
| 107 | addl $K,$e,$e | ||
| 108 | and $b,$c,$t2 | ||
| 109 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
| 110 | |||
| 111 | zapnot $a,0xf,$a | ||
| 112 | addl @X[$i%16],$e,$e | ||
| 113 | bic $d,$b,$t3 | ||
| 114 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
| 115 | |||
| 116 | srl $a,27,$t0 | ||
| 117 | addl $t1,$e,$e | ||
| 118 | or $t2,$t3,$t2 | ||
| 119 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
| 120 | |||
| 121 | sll $b,30,$b | ||
| 122 | addl $t0,$e,$e | ||
| 123 | srl @X[$j%16],31,$t1 | ||
| 124 | |||
| 125 | addl $t2,$e,$e | ||
| 126 | srl $b,32,$t3 | ||
| 127 | addl @X[$j%16],@X[$j%16],@X[$j%16] | ||
| 128 | |||
| 129 | or $t3,$b,$b | ||
| 130 | zapnot @X[$i%16],0xf,@X[$i%16] | ||
| 131 | or $t1,@X[$j%16],@X[$j%16] | ||
| 132 | ___ | ||
| 133 | } | ||
| 134 | |||
| 135 | sub BODY_20_39 { | ||
| 136 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 137 | my $j=$i+1; | ||
| 138 | $code.=<<___ if ($i<79); # with forward Xupdate | ||
| 139 | sll $a,5,$t1 | ||
| 140 | addl $K,$e,$e | ||
| 141 | zapnot $a,0xf,$a | ||
| 142 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
| 143 | |||
| 144 | sll $b,30,$t3 | ||
| 145 | addl $t1,$e,$e | ||
| 146 | xor $b,$c,$t2 | ||
| 147 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
| 148 | |||
| 149 | srl $b,2,$b | ||
| 150 | addl @X[$i%16],$e,$e | ||
| 151 | xor $d,$t2,$t2 | ||
| 152 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
| 153 | |||
| 154 | srl @X[$j%16],31,$t1 | ||
| 155 | addl $t2,$e,$e | ||
| 156 | srl $a,27,$t0 | ||
| 157 | addl @X[$j%16],@X[$j%16],@X[$j%16] | ||
| 158 | |||
| 159 | or $t3,$b,$b | ||
| 160 | addl $t0,$e,$e | ||
| 161 | or $t1,@X[$j%16],@X[$j%16] | ||
| 162 | ___ | ||
| 163 | $code.=<<___ if ($i<77); | ||
| 164 | zapnot @X[$i%16],0xf,@X[$i%16] | ||
| 165 | ___ | ||
| 166 | $code.=<<___ if ($i==79); # with context fetch | ||
| 167 | sll $a,5,$t1 | ||
| 168 | addl $K,$e,$e | ||
| 169 | zapnot $a,0xf,$a | ||
| 170 | ldl @X[0],0($ctx) | ||
| 171 | |||
| 172 | sll $b,30,$t3 | ||
| 173 | addl $t1,$e,$e | ||
| 174 | xor $b,$c,$t2 | ||
| 175 | ldl @X[1],4($ctx) | ||
| 176 | |||
| 177 | srl $b,2,$b | ||
| 178 | addl @X[$i%16],$e,$e | ||
| 179 | xor $d,$t2,$t2 | ||
| 180 | ldl @X[2],8($ctx) | ||
| 181 | |||
| 182 | srl $a,27,$t0 | ||
| 183 | addl $t2,$e,$e | ||
| 184 | ldl @X[3],12($ctx) | ||
| 185 | |||
| 186 | or $t3,$b,$b | ||
| 187 | addl $t0,$e,$e | ||
| 188 | ldl @X[4],16($ctx) | ||
| 189 | ___ | ||
| 190 | } | ||
| 191 | |||
| 192 | sub BODY_40_59 { | ||
| 193 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 194 | my $j=$i+1; | ||
| 195 | $code.=<<___; # with forward Xupdate | ||
| 196 | sll $a,5,$t1 | ||
| 197 | addl $K,$e,$e | ||
| 198 | zapnot $a,0xf,$a | ||
| 199 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
| 200 | |||
| 201 | srl $a,27,$t0 | ||
| 202 | and $b,$c,$t2 | ||
| 203 | and $b,$d,$t3 | ||
| 204 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
| 205 | |||
| 206 | sll $b,30,$b | ||
| 207 | addl $t1,$e,$e | ||
| 208 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
| 209 | |||
| 210 | srl @X[$j%16],31,$t1 | ||
| 211 | addl $t0,$e,$e | ||
| 212 | or $t2,$t3,$t2 | ||
| 213 | and $c,$d,$t3 | ||
| 214 | |||
| 215 | or $t2,$t3,$t2 | ||
| 216 | srl $b,32,$t3 | ||
| 217 | addl @X[$i%16],$e,$e | ||
| 218 | addl @X[$j%16],@X[$j%16],@X[$j%16] | ||
| 219 | |||
| 220 | or $t3,$b,$b | ||
| 221 | addl $t2,$e,$e | ||
| 222 | or $t1,@X[$j%16],@X[$j%16] | ||
| 223 | zapnot @X[$i%16],0xf,@X[$i%16] | ||
| 224 | ___ | ||
| 225 | } | ||
| 226 | |||
| 227 | $code=<<___; | ||
| 228 | #ifdef __linux__ | ||
| 229 | #include <asm/regdef.h> | ||
| 230 | #else | ||
| 231 | #include <asm.h> | ||
| 232 | #include <regdef.h> | ||
| 233 | #endif | ||
| 234 | |||
| 235 | .text | ||
| 236 | |||
| 237 | .set noat | ||
| 238 | .set noreorder | ||
| 239 | .globl sha1_block_data_order | ||
| 240 | .align 5 | ||
| 241 | .ent sha1_block_data_order | ||
| 242 | sha1_block_data_order: | ||
| 243 | lda sp,-64(sp) | ||
| 244 | stq ra,0(sp) | ||
| 245 | stq s0,8(sp) | ||
| 246 | stq s1,16(sp) | ||
| 247 | stq s2,24(sp) | ||
| 248 | stq s3,32(sp) | ||
| 249 | stq s4,40(sp) | ||
| 250 | stq s5,48(sp) | ||
| 251 | stq fp,56(sp) | ||
| 252 | .mask 0x0400fe00,-64 | ||
| 253 | .frame sp,64,ra | ||
| 254 | .prologue 0 | ||
| 255 | |||
| 256 | ldl $A,0($ctx) | ||
| 257 | ldl $B,4($ctx) | ||
| 258 | sll $num,6,$num | ||
| 259 | ldl $C,8($ctx) | ||
| 260 | ldl $D,12($ctx) | ||
| 261 | ldl $E,16($ctx) | ||
| 262 | addq $inp,$num,$num | ||
| 263 | |||
| 264 | .Lloop: | ||
| 265 | .set noreorder | ||
| 266 | ldah $K,23170(zero) | ||
| 267 | zapnot $B,0xf,$B | ||
| 268 | lda $K,31129($K) # K_00_19 | ||
| 269 | ___ | ||
| 270 | for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } | ||
| 271 | |||
| 272 | $code.=<<___; | ||
| 273 | ldah $K,28378(zero) | ||
| 274 | lda $K,-5215($K) # K_20_39 | ||
| 275 | ___ | ||
| 276 | for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 277 | |||
| 278 | $code.=<<___; | ||
| 279 | ldah $K,-28900(zero) | ||
| 280 | lda $K,-17188($K) # K_40_59 | ||
| 281 | ___ | ||
| 282 | for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | ||
| 283 | |||
| 284 | $code.=<<___; | ||
| 285 | ldah $K,-13725(zero) | ||
| 286 | lda $K,-15914($K) # K_60_79 | ||
| 287 | ___ | ||
| 288 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 289 | |||
| 290 | $code.=<<___; | ||
| 291 | addl @X[0],$A,$A | ||
| 292 | addl @X[1],$B,$B | ||
| 293 | addl @X[2],$C,$C | ||
| 294 | addl @X[3],$D,$D | ||
| 295 | addl @X[4],$E,$E | ||
| 296 | stl $A,0($ctx) | ||
| 297 | stl $B,4($ctx) | ||
| 298 | addq $inp,64,$inp | ||
| 299 | stl $C,8($ctx) | ||
| 300 | stl $D,12($ctx) | ||
| 301 | stl $E,16($ctx) | ||
| 302 | cmpult $inp,$num,$t1 | ||
| 303 | bne $t1,.Lloop | ||
| 304 | |||
| 305 | .set noreorder | ||
| 306 | ldq ra,0(sp) | ||
| 307 | ldq s0,8(sp) | ||
| 308 | ldq s1,16(sp) | ||
| 309 | ldq s2,24(sp) | ||
| 310 | ldq s3,32(sp) | ||
| 311 | ldq s4,40(sp) | ||
| 312 | ldq s5,48(sp) | ||
| 313 | ldq fp,56(sp) | ||
| 314 | lda sp,64(sp) | ||
| 315 | ret (ra) | ||
| 316 | .end sha1_block_data_order | ||
| 317 | .ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 318 | .align 2 | ||
| 319 | ___ | ||
| 320 | $output=shift and open STDOUT,">$output"; | ||
| 321 | print $code; | ||
| 322 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl index 6e65fe3e01..fe8207f77f 100644 --- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl +++ b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl | |||
| @@ -47,6 +47,10 @@ | |||
| 47 | # Cortex A8 core and in absolute terms ~870 cycles per input block | 47 | # Cortex A8 core and in absolute terms ~870 cycles per input block |
| 48 | # [or 13.6 cycles per byte]. | 48 | # [or 13.6 cycles per byte]. |
| 49 | 49 | ||
| 50 | # February 2011. | ||
| 51 | # | ||
| 52 | # Profiler-assisted and platform-specific optimization resulted in 10% | ||
| 53 | # improvement on Cortex A8 core and 12.2 cycles per byte. | ||
| 50 | 54 | ||
| 51 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 55 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| 52 | open STDOUT,">$output"; | 56 | open STDOUT,">$output"; |
| @@ -76,31 +80,41 @@ $code.=<<___; | |||
| 76 | add $e,$K,$e,ror#2 @ E+=K_xx_xx | 80 | add $e,$K,$e,ror#2 @ E+=K_xx_xx |
| 77 | ldr $t3,[$Xi,#2*4] | 81 | ldr $t3,[$Xi,#2*4] |
| 78 | eor $t0,$t0,$t1 | 82 | eor $t0,$t0,$t1 |
| 79 | eor $t2,$t2,$t3 | 83 | eor $t2,$t2,$t3 @ 1 cycle stall |
| 80 | eor $t1,$c,$d @ F_xx_xx | 84 | eor $t1,$c,$d @ F_xx_xx |
| 81 | mov $t0,$t0,ror#31 | 85 | mov $t0,$t0,ror#31 |
| 82 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) | 86 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) |
| 83 | eor $t0,$t0,$t2,ror#31 | 87 | eor $t0,$t0,$t2,ror#31 |
| 88 | str $t0,[$Xi,#-4]! | ||
| 84 | $opt1 @ F_xx_xx | 89 | $opt1 @ F_xx_xx |
| 85 | $opt2 @ F_xx_xx | 90 | $opt2 @ F_xx_xx |
| 86 | add $e,$e,$t0 @ E+=X[i] | 91 | add $e,$e,$t0 @ E+=X[i] |
| 87 | str $t0,[$Xi,#-4]! | ||
| 88 | ___ | 92 | ___ |
| 89 | } | 93 | } |
| 90 | 94 | ||
| 91 | sub BODY_00_15 { | 95 | sub BODY_00_15 { |
| 92 | my ($a,$b,$c,$d,$e)=@_; | 96 | my ($a,$b,$c,$d,$e)=@_; |
| 93 | $code.=<<___; | 97 | $code.=<<___; |
| 94 | ldrb $t0,[$inp],#4 | 98 | #if __ARM_ARCH__<7 |
| 95 | ldrb $t1,[$inp,#-1] | 99 | ldrb $t1,[$inp,#2] |
| 96 | ldrb $t2,[$inp,#-2] | 100 | ldrb $t0,[$inp,#3] |
| 101 | ldrb $t2,[$inp,#1] | ||
| 97 | add $e,$K,$e,ror#2 @ E+=K_00_19 | 102 | add $e,$K,$e,ror#2 @ E+=K_00_19 |
| 98 | ldrb $t3,[$inp,#-3] | 103 | ldrb $t3,[$inp],#4 |
| 104 | orr $t0,$t0,$t1,lsl#8 | ||
| 105 | eor $t1,$c,$d @ F_xx_xx | ||
| 106 | orr $t0,$t0,$t2,lsl#16 | ||
| 99 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) | 107 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) |
| 100 | orr $t0,$t1,$t0,lsl#24 | 108 | orr $t0,$t0,$t3,lsl#24 |
| 109 | #else | ||
| 110 | ldr $t0,[$inp],#4 @ handles unaligned | ||
| 111 | add $e,$K,$e,ror#2 @ E+=K_00_19 | ||
| 101 | eor $t1,$c,$d @ F_xx_xx | 112 | eor $t1,$c,$d @ F_xx_xx |
| 102 | orr $t0,$t0,$t2,lsl#8 | 113 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) |
| 103 | orr $t0,$t0,$t3,lsl#16 | 114 | #ifdef __ARMEL__ |
| 115 | rev $t0,$t0 @ byte swap | ||
| 116 | #endif | ||
| 117 | #endif | ||
| 104 | and $t1,$b,$t1,ror#2 | 118 | and $t1,$b,$t1,ror#2 |
| 105 | add $e,$e,$t0 @ E+=X[i] | 119 | add $e,$e,$t0 @ E+=X[i] |
| 106 | eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) | 120 | eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) |
| @@ -136,6 +150,8 @@ ___ | |||
| 136 | } | 150 | } |
| 137 | 151 | ||
| 138 | $code=<<___; | 152 | $code=<<___; |
| 153 | #include "arm_arch.h" | ||
| 154 | |||
| 139 | .text | 155 | .text |
| 140 | 156 | ||
| 141 | .global sha1_block_data_order | 157 | .global sha1_block_data_order |
| @@ -209,10 +225,14 @@ $code.=<<___; | |||
| 209 | teq $inp,$len | 225 | teq $inp,$len |
| 210 | bne .Lloop @ [+18], total 1307 | 226 | bne .Lloop @ [+18], total 1307 |
| 211 | 227 | ||
| 228 | #if __ARM_ARCH__>=5 | ||
| 229 | ldmia sp!,{r4-r12,pc} | ||
| 230 | #else | ||
| 212 | ldmia sp!,{r4-r12,lr} | 231 | ldmia sp!,{r4-r12,lr} |
| 213 | tst lr,#1 | 232 | tst lr,#1 |
| 214 | moveq pc,lr @ be binary compatible with V4, yet | 233 | moveq pc,lr @ be binary compatible with V4, yet |
| 215 | bx lr @ interoperable with Thumb ISA:-) | 234 | bx lr @ interoperable with Thumb ISA:-) |
| 235 | #endif | ||
| 216 | .align 2 | 236 | .align 2 |
| 217 | .LK_00_19: .word 0x5a827999 | 237 | .LK_00_19: .word 0x5a827999 |
| 218 | .LK_20_39: .word 0x6ed9eba1 | 238 | .LK_20_39: .word 0x6ed9eba1 |
diff --git a/src/lib/libcrypto/sha/asm/sha1-ia64.pl b/src/lib/libcrypto/sha/asm/sha1-ia64.pl index 51c4f47ecb..db28f0805a 100644 --- a/src/lib/libcrypto/sha/asm/sha1-ia64.pl +++ b/src/lib/libcrypto/sha/asm/sha1-ia64.pl | |||
| @@ -15,7 +15,7 @@ | |||
| 15 | # is >50% better than HP C and >2x better than gcc. | 15 | # is >50% better than HP C and >2x better than gcc. |
| 16 | 16 | ||
| 17 | $code=<<___; | 17 | $code=<<___; |
| 18 | .ident \"sha1-ia64.s, version 1.2\" | 18 | .ident \"sha1-ia64.s, version 1.3\" |
| 19 | .ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" | 19 | .ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" |
| 20 | .explicit | 20 | .explicit |
| 21 | 21 | ||
| @@ -26,14 +26,10 @@ if ($^O eq "hpux") { | |||
| 26 | $ADDP="addp4"; | 26 | $ADDP="addp4"; |
| 27 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } | 27 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } |
| 28 | } else { $ADDP="add"; } | 28 | } else { $ADDP="add"; } |
| 29 | for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); | ||
| 30 | $big_endian=0 if (/\-DL_ENDIAN/); } | ||
| 31 | if (!defined($big_endian)) | ||
| 32 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
| 33 | 29 | ||
| 34 | #$human=1; | 30 | #$human=1; |
| 35 | if ($human) { # useful for visual code auditing... | 31 | if ($human) { # useful for visual code auditing... |
| 36 | ($A,$B,$C,$D,$E,$T) = ("A","B","C","D","E","T"); | 32 | ($A,$B,$C,$D,$E) = ("A","B","C","D","E"); |
| 37 | ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4"); | 33 | ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4"); |
| 38 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = | 34 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = |
| 39 | ( "K_00_19","K_20_39","K_40_59","K_60_79" ); | 35 | ( "K_00_19","K_20_39","K_40_59","K_60_79" ); |
| @@ -41,47 +37,50 @@ if ($human) { # useful for visual code auditing... | |||
| 41 | "X8", "X9","X10","X11","X12","X13","X14","X15" ); | 37 | "X8", "X9","X10","X11","X12","X13","X14","X15" ); |
| 42 | } | 38 | } |
| 43 | else { | 39 | else { |
| 44 | ($A,$B,$C,$D,$E,$T) = ("loc0","loc1","loc2","loc3","loc4","loc5"); | 40 | ($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4"); |
| 45 | ($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10"); | 41 | ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9"); |
| 46 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = | 42 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = |
| 47 | ( "r14", "r15", "loc11", "loc12" ); | 43 | ( "r14", "r15", "loc10", "loc11" ); |
| 48 | @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", | 44 | @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", |
| 49 | "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" ); | 45 | "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" ); |
| 50 | } | 46 | } |
| 51 | 47 | ||
| 52 | sub BODY_00_15 { | 48 | sub BODY_00_15 { |
| 53 | local *code=shift; | 49 | local *code=shift; |
| 54 | local ($i,$a,$b,$c,$d,$e,$f)=@_; | 50 | my ($i,$a,$b,$c,$d,$e)=@_; |
| 51 | my $j=$i+1; | ||
| 52 | my $Xn=@X[$j%16]; | ||
| 55 | 53 | ||
| 56 | $code.=<<___ if ($i==0); | 54 | $code.=<<___ if ($i==0); |
| 57 | { .mmi; ld1 $X[$i&0xf]=[inp],2 // MSB | 55 | { .mmi; ld1 $X[$i]=[inp],2 // MSB |
| 58 | ld1 tmp2=[tmp3],2 };; | 56 | ld1 tmp2=[tmp3],2 };; |
| 59 | { .mmi; ld1 tmp0=[inp],2 | 57 | { .mmi; ld1 tmp0=[inp],2 |
| 60 | ld1 tmp4=[tmp3],2 // LSB | 58 | ld1 tmp4=[tmp3],2 // LSB |
| 61 | dep $X[$i&0xf]=$X[$i&0xf],tmp2,8,8 };; | 59 | dep $X[$i]=$X[$i],tmp2,8,8 };; |
| 62 | ___ | 60 | ___ |
| 63 | if ($i<15) { | 61 | if ($i<15) { |
| 64 | $code.=<<___; | 62 | $code.=<<___; |
| 65 | { .mmi; ld1 $X[($i+1)&0xf]=[inp],2 // +1 | 63 | { .mmi; ld1 $Xn=[inp],2 // forward Xload |
| 64 | nop.m 0x0 | ||
| 66 | dep tmp1=tmp0,tmp4,8,8 };; | 65 | dep tmp1=tmp0,tmp4,8,8 };; |
| 67 | { .mmi; ld1 tmp2=[tmp3],2 // +1 | 66 | { .mmi; ld1 tmp2=[tmp3],2 // forward Xload |
| 68 | and tmp4=$c,$b | 67 | and tmp4=$c,$b |
| 69 | dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; | 68 | dep $X[$i]=$X[$i],tmp1,16,16} //;; |
| 70 | { .mmi; andcm tmp1=$d,$b | 69 | { .mmi; add $e=$e,$K_00_19 // e+=K_00_19 |
| 71 | add tmp0=$e,$K_00_19 | 70 | andcm tmp1=$d,$b |
| 72 | dep.z tmp5=$a,5,27 };; // a<<5 | 71 | dep.z tmp5=$a,5,27 };; // a<<5 |
| 73 | { .mmi; or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) | 72 | { .mmi; add $e=$e,$X[$i] // e+=Xload |
| 74 | add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 | 73 | or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) |
| 75 | extr.u tmp1=$a,27,5 };; // a>>27 | 74 | extr.u tmp1=$a,27,5 };; // a>>27 |
| 76 | { .mmi; ld1 tmp0=[inp],2 // +1 | 75 | { .mmi; ld1 tmp0=[inp],2 // forward Xload |
| 77 | add $f=$f,tmp4 // f+=F_00_19(b,c,d) | 76 | add $e=$e,tmp4 // e+=F_00_19(b,c,d) |
| 78 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 77 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
| 79 | { .mmi; ld1 tmp4=[tmp3],2 // +1 | 78 | { .mmi; ld1 tmp4=[tmp3],2 // forward Xload |
| 80 | or tmp5=tmp1,tmp5 // ROTATE(a,5) | 79 | or tmp5=tmp1,tmp5 // ROTATE(a,5) |
| 81 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 80 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
| 82 | { .mii; add $f=$f,tmp5 // f+=ROTATE(a,5) | 81 | { .mii; add $e=$e,tmp5 // e+=ROTATE(a,5) |
| 83 | dep $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8 // +1 | 82 | dep $Xn=$Xn,tmp2,8,8 // forward Xload |
| 84 | mux2 $X[$i&0xf]=$X[$i&0xf],0x44 } //;; | 83 | mux2 $X[$i]=$X[$i],0x44 } //;; |
| 85 | 84 | ||
| 86 | ___ | 85 | ___ |
| 87 | } | 86 | } |
| @@ -89,24 +88,24 @@ else { | |||
| 89 | $code.=<<___; | 88 | $code.=<<___; |
| 90 | { .mii; and tmp3=$c,$b | 89 | { .mii; and tmp3=$c,$b |
| 91 | dep tmp1=tmp0,tmp4,8,8;; | 90 | dep tmp1=tmp0,tmp4,8,8;; |
| 92 | dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; | 91 | dep $X[$i]=$X[$i],tmp1,16,16} //;; |
| 93 | { .mmi; andcm tmp1=$d,$b | 92 | { .mmi; add $e=$e,$K_00_19 // e+=K_00_19 |
| 94 | add tmp0=$e,$K_00_19 | 93 | andcm tmp1=$d,$b |
| 95 | dep.z tmp5=$a,5,27 };; // a<<5 | 94 | dep.z tmp5=$a,5,27 };; // a<<5 |
| 96 | { .mmi; or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) | 95 | { .mmi; add $e=$e,$X[$i] // e+=Xupdate |
| 97 | add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 | 96 | or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) |
| 98 | extr.u tmp1=$a,27,5 } // a>>27 | 97 | extr.u tmp1=$a,27,5 } // a>>27 |
| 99 | { .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 | 98 | { .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate |
| 100 | xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 | 99 | xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate |
| 101 | nop.i 0 };; | 100 | nop.i 0 };; |
| 102 | { .mmi; add $f=$f,tmp4 // f+=F_00_19(b,c,d) | 101 | { .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d) |
| 103 | xor tmp2=tmp2,tmp3 // +1 | 102 | xor $Xn=$Xn,tmp3 // forward Xupdate |
| 104 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 103 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
| 105 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 104 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) |
| 106 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 105 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
| 107 | { .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) | 106 | { .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) |
| 108 | shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) | 107 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) |
| 109 | mux2 $X[$i&0xf]=$X[$i&0xf],0x44 };; | 108 | mux2 $X[$i]=$X[$i],0x44 };; |
| 110 | 109 | ||
| 111 | ___ | 110 | ___ |
| 112 | } | 111 | } |
| @@ -114,27 +113,28 @@ ___ | |||
| 114 | 113 | ||
| 115 | sub BODY_16_19 { | 114 | sub BODY_16_19 { |
| 116 | local *code=shift; | 115 | local *code=shift; |
| 117 | local ($i,$a,$b,$c,$d,$e,$f)=@_; | 116 | my ($i,$a,$b,$c,$d,$e)=@_; |
| 117 | my $j=$i+1; | ||
| 118 | my $Xn=@X[$j%16]; | ||
| 118 | 119 | ||
| 119 | $code.=<<___; | 120 | $code.=<<___; |
| 120 | { .mmi; mov $X[$i&0xf]=$f // Xupdate | 121 | { .mib; add $e=$e,$K_00_19 // e+=K_00_19 |
| 121 | and tmp0=$c,$b | ||
| 122 | dep.z tmp5=$a,5,27 } // a<<5 | 122 | dep.z tmp5=$a,5,27 } // a<<5 |
| 123 | { .mmi; andcm tmp1=$d,$b | 123 | { .mib; andcm tmp1=$d,$b |
| 124 | add tmp4=$e,$K_00_19 };; | 124 | and tmp0=$c,$b };; |
| 125 | { .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) | 125 | { .mmi; add $e=$e,$X[$i%16] // e+=Xupdate |
| 126 | add $f=$f,tmp4 // f+=e+K_00_19 | 126 | or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) |
| 127 | extr.u tmp1=$a,27,5 } // a>>27 | 127 | extr.u tmp1=$a,27,5 } // a>>27 |
| 128 | { .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 | 128 | { .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate |
| 129 | xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 | 129 | xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate |
| 130 | nop.i 0 };; | 130 | nop.i 0 };; |
| 131 | { .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d) | 131 | { .mmi; add $e=$e,tmp0 // f+=F_00_19(b,c,d) |
| 132 | xor tmp2=tmp2,tmp3 // +1 | 132 | xor $Xn=$Xn,tmp3 // forward Xupdate |
| 133 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 133 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
| 134 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 134 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) |
| 135 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 135 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
| 136 | { .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) | 136 | { .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) |
| 137 | shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) | 137 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) |
| 138 | nop.i 0 };; | 138 | nop.i 0 };; |
| 139 | 139 | ||
| 140 | ___ | 140 | ___ |
| @@ -142,49 +142,47 @@ ___ | |||
| 142 | 142 | ||
| 143 | sub BODY_20_39 { | 143 | sub BODY_20_39 { |
| 144 | local *code=shift; | 144 | local *code=shift; |
| 145 | local ($i,$a,$b,$c,$d,$e,$f,$Konst)=@_; | 145 | my ($i,$a,$b,$c,$d,$e,$Konst)=@_; |
| 146 | $Konst = $K_20_39 if (!defined($Konst)); | 146 | $Konst = $K_20_39 if (!defined($Konst)); |
| 147 | my $j=$i+1; | ||
| 148 | my $Xn=@X[$j%16]; | ||
| 147 | 149 | ||
| 148 | if ($i<79) { | 150 | if ($i<79) { |
| 149 | $code.=<<___; | 151 | $code.=<<___; |
| 150 | { .mib; mov $X[$i&0xf]=$f // Xupdate | 152 | { .mib; add $e=$e,$Konst // e+=K_XX_XX |
| 151 | dep.z tmp5=$a,5,27 } // a<<5 | 153 | dep.z tmp5=$a,5,27 } // a<<5 |
| 152 | { .mib; xor tmp0=$c,$b | 154 | { .mib; xor tmp0=$c,$b |
| 153 | add tmp4=$e,$Konst };; | 155 | xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate |
| 154 | { .mmi; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d | 156 | { .mib; add $e=$e,$X[$i%16] // e+=Xupdate |
| 155 | add $f=$f,tmp4 // f+=e+K_20_39 | ||
| 156 | extr.u tmp1=$a,27,5 } // a>>27 | 157 | extr.u tmp1=$a,27,5 } // a>>27 |
| 157 | { .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 | 158 | { .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d |
| 158 | xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 | 159 | xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate |
| 159 | nop.i 0 };; | 160 | { .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) |
| 160 | { .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) | 161 | xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate |
| 161 | xor tmp2=tmp2,tmp3 // +1 | ||
| 162 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 162 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
| 163 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 163 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) |
| 164 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 164 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
| 165 | { .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) | 165 | { .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) |
| 166 | shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) | 166 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) |
| 167 | nop.i 0 };; | 167 | nop.i 0 };; |
| 168 | 168 | ||
| 169 | ___ | 169 | ___ |
| 170 | } | 170 | } |
| 171 | else { | 171 | else { |
| 172 | $code.=<<___; | 172 | $code.=<<___; |
| 173 | { .mib; mov $X[$i&0xf]=$f // Xupdate | 173 | { .mib; add $e=$e,$Konst // e+=K_60_79 |
| 174 | dep.z tmp5=$a,5,27 } // a<<5 | 174 | dep.z tmp5=$a,5,27 } // a<<5 |
| 175 | { .mib; xor tmp0=$c,$b | 175 | { .mib; xor tmp0=$c,$b |
| 176 | add tmp4=$e,$Konst };; | ||
| 177 | { .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d | ||
| 178 | extr.u tmp1=$a,27,5 } // a>>27 | ||
| 179 | { .mib; add $f=$f,tmp4 // f+=e+K_20_39 | ||
| 180 | add $h1=$h1,$a };; // wrap up | 176 | add $h1=$h1,$a };; // wrap up |
| 181 | { .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) | 177 | { .mib; add $e=$e,$X[$i%16] // e+=Xupdate |
| 182 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) ;;? | 178 | extr.u tmp1=$a,27,5 } // a>>27 |
| 183 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 179 | { .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d |
| 184 | add $h3=$h3,$c };; // wrap up | 180 | add $h3=$h3,$c };; // wrap up |
| 185 | { .mib; add tmp3=1,inp // used in unaligned codepath | 181 | { .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) |
| 186 | add $f=$f,tmp1 } // f+=ROTATE(a,5) | 182 | or tmp1=tmp1,tmp5 // ROTATE(a,5) |
| 187 | { .mib; add $h2=$h2,$b // wrap up | 183 | shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;? |
| 184 | { .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5) | ||
| 185 | add tmp3=1,inp // used in unaligned codepath | ||
| 188 | add $h4=$h4,$d };; // wrap up | 186 | add $h4=$h4,$d };; // wrap up |
| 189 | 187 | ||
| 190 | ___ | 188 | ___ |
| @@ -193,29 +191,29 @@ ___ | |||
| 193 | 191 | ||
| 194 | sub BODY_40_59 { | 192 | sub BODY_40_59 { |
| 195 | local *code=shift; | 193 | local *code=shift; |
| 196 | local ($i,$a,$b,$c,$d,$e,$f)=@_; | 194 | my ($i,$a,$b,$c,$d,$e)=@_; |
| 195 | my $j=$i+1; | ||
| 196 | my $Xn=@X[$j%16]; | ||
| 197 | 197 | ||
| 198 | $code.=<<___; | 198 | $code.=<<___; |
| 199 | { .mmi; mov $X[$i&0xf]=$f // Xupdate | 199 | { .mib; add $e=$e,$K_40_59 // e+=K_40_59 |
| 200 | and tmp0=$c,$b | ||
| 201 | dep.z tmp5=$a,5,27 } // a<<5 | 200 | dep.z tmp5=$a,5,27 } // a<<5 |
| 202 | { .mmi; and tmp1=$d,$b | 201 | { .mib; and tmp1=$c,$d |
| 203 | add tmp4=$e,$K_40_59 };; | 202 | xor tmp0=$c,$d };; |
| 204 | { .mmi; or tmp0=tmp0,tmp1 // (b&c)|(b&d) | 203 | { .mmi; add $e=$e,$X[$i%16] // e+=Xupdate |
| 205 | add $f=$f,tmp4 // f+=e+K_40_59 | 204 | add tmp5=tmp5,tmp1 // a<<5+(c&d) |
| 206 | extr.u tmp1=$a,27,5 } // a>>27 | 205 | extr.u tmp1=$a,27,5 } // a>>27 |
| 207 | { .mmi; and tmp4=$c,$d | 206 | { .mmi; and tmp0=tmp0,$b |
| 208 | xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 | 207 | xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate |
| 209 | xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 | 208 | xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate |
| 210 | };; | 209 | { .mmi; add $e=$e,tmp0 // e+=b&(c^d) |
| 211 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 210 | add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d) |
| 212 | xor tmp2=tmp2,tmp3 // +1 | ||
| 213 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 211 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
| 214 | { .mmi; or tmp0=tmp0,tmp4 // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d) | 212 | { .mmi; xor $Xn=$Xn,tmp3 |
| 215 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 213 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
| 216 | { .mii; add $f=$f,tmp0 // f+=F_40_59(b,c,d) | 214 | { .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d) |
| 217 | shrp $e=tmp2,tmp2,31;; // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) | 215 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) |
| 218 | add $f=$f,tmp1 };; // f+=ROTATE(a,5) | 216 | nop.i 0x0 };; |
| 219 | 217 | ||
| 220 | ___ | 218 | ___ |
| 221 | } | 219 | } |
| @@ -237,7 +235,7 @@ inp=r33; // in1 | |||
| 237 | .align 32 | 235 | .align 32 |
| 238 | sha1_block_data_order: | 236 | sha1_block_data_order: |
| 239 | .prologue | 237 | .prologue |
| 240 | { .mmi; alloc tmp1=ar.pfs,3,15,0,0 | 238 | { .mmi; alloc tmp1=ar.pfs,3,14,0,0 |
| 241 | $ADDP tmp0=4,ctx | 239 | $ADDP tmp0=4,ctx |
| 242 | .save ar.lc,r3 | 240 | .save ar.lc,r3 |
| 243 | mov r3=ar.lc } | 241 | mov r3=ar.lc } |
| @@ -245,8 +243,8 @@ sha1_block_data_order: | |||
| 245 | $ADDP inp=0,inp | 243 | $ADDP inp=0,inp |
| 246 | mov r2=pr };; | 244 | mov r2=pr };; |
| 247 | tmp4=in2; | 245 | tmp4=in2; |
| 248 | tmp5=loc13; | 246 | tmp5=loc12; |
| 249 | tmp6=loc14; | 247 | tmp6=loc13; |
| 250 | .body | 248 | .body |
| 251 | { .mlx; ld4 $h0=[ctx],8 | 249 | { .mlx; ld4 $h0=[ctx],8 |
| 252 | movl $K_00_19=0x5a827999 } | 250 | movl $K_00_19=0x5a827999 } |
| @@ -273,7 +271,7 @@ tmp6=loc14; | |||
| 273 | 271 | ||
| 274 | ___ | 272 | ___ |
| 275 | 273 | ||
| 276 | { my $i,@V=($A,$B,$C,$D,$E,$T); | 274 | { my $i,@V=($A,$B,$C,$D,$E); |
| 277 | 275 | ||
| 278 | for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); } | 276 | for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); } |
| 279 | for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); } | 277 | for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); } |
| @@ -281,12 +279,12 @@ ___ | |||
| 281 | for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); } | 279 | for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); } |
| 282 | for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); } | 280 | for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); } |
| 283 | 281 | ||
| 284 | (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check | 282 | (($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check |
| 285 | } | 283 | } |
| 286 | 284 | ||
| 287 | $code.=<<___; | 285 | $code.=<<___; |
| 288 | { .mmb; add $h0=$h0,$E | 286 | { .mmb; add $h0=$h0,$A |
| 289 | nop.m 0 | 287 | add $h2=$h2,$C |
| 290 | br.ctop.dptk.many .Ldtop };; | 288 | br.ctop.dptk.many .Ldtop };; |
| 291 | .Ldend: | 289 | .Ldend: |
| 292 | { .mmi; add tmp0=4,ctx | 290 | { .mmi; add tmp0=4,ctx |
diff --git a/src/lib/libcrypto/sha/asm/sha1-mips.pl b/src/lib/libcrypto/sha/asm/sha1-mips.pl new file mode 100644 index 0000000000..f1a702f38f --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-mips.pl | |||
| @@ -0,0 +1,354 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # SHA1 block procedure for MIPS. | ||
| 11 | |||
| 12 | # Performance improvement is 30% on unaligned input. The "secret" is | ||
| 13 | # to deploy lwl/lwr pair to load unaligned input. One could have | ||
| 14 | # vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32- | ||
| 15 | # compatible subroutine. There is room for minor optimization on | ||
| 16 | # little-endian platforms... | ||
| 17 | |||
| 18 | ###################################################################### | ||
| 19 | # There is a number of MIPS ABI in use, O32 and N32/64 are most | ||
| 20 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
| 21 | # one picks the latter, it's possible to arrange code in ABI neutral | ||
| 22 | # manner. Therefore let's stick to NUBI register layout: | ||
| 23 | # | ||
| 24 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
| 25 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 26 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
| 27 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
| 28 | # | ||
| 29 | # The return value is placed in $a0. Following coding rules facilitate | ||
| 30 | # interoperability: | ||
| 31 | # | ||
| 32 | # - never ever touch $tp, "thread pointer", former $gp; | ||
| 33 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
| 34 | # old code]; | ||
| 35 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
| 36 | # | ||
| 37 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
| 38 | # | ||
| 39 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
| 40 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 41 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
| 42 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
| 43 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
| 44 | # | ||
| 45 | $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
| 46 | |||
| 47 | if ($flavour =~ /64|n32/i) { | ||
| 48 | $PTR_ADD="dadd"; # incidentally works even on n32 | ||
| 49 | $PTR_SUB="dsub"; # incidentally works even on n32 | ||
| 50 | $REG_S="sd"; | ||
| 51 | $REG_L="ld"; | ||
| 52 | $PTR_SLL="dsll"; # incidentally works even on n32 | ||
| 53 | $SZREG=8; | ||
| 54 | } else { | ||
| 55 | $PTR_ADD="add"; | ||
| 56 | $PTR_SUB="sub"; | ||
| 57 | $REG_S="sw"; | ||
| 58 | $REG_L="lw"; | ||
| 59 | $PTR_SLL="sll"; | ||
| 60 | $SZREG=4; | ||
| 61 | } | ||
| 62 | # | ||
| 63 | # <appro@openssl.org> | ||
| 64 | # | ||
| 65 | ###################################################################### | ||
| 66 | |||
| 67 | $big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; | ||
| 68 | |||
| 69 | for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } | ||
| 70 | open STDOUT,">$output"; | ||
| 71 | |||
| 72 | if (!defined($big_endian)) | ||
| 73 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
| 74 | |||
| 75 | # offsets of the Most and Least Significant Bytes | ||
| 76 | $MSB=$big_endian?0:3; | ||
| 77 | $LSB=3&~$MSB; | ||
| 78 | |||
| 79 | @X=map("\$$_",(8..23)); # a4-a7,s0-s11 | ||
| 80 | |||
| 81 | $ctx=$a0; | ||
| 82 | $inp=$a1; | ||
| 83 | $num=$a2; | ||
| 84 | $A="\$1"; | ||
| 85 | $B="\$2"; | ||
| 86 | $C="\$3"; | ||
| 87 | $D="\$7"; | ||
| 88 | $E="\$24"; @V=($A,$B,$C,$D,$E); | ||
| 89 | $t0="\$25"; | ||
| 90 | $t1=$num; # $num is offloaded to stack | ||
| 91 | $t2="\$30"; # fp | ||
| 92 | $K="\$31"; # ra | ||
| 93 | |||
| 94 | sub BODY_00_14 { | ||
| 95 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 96 | my $j=$i+1; | ||
| 97 | $code.=<<___ if (!$big_endian); | ||
| 98 | srl $t0,@X[$i],24 # byte swap($i) | ||
| 99 | srl $t1,@X[$i],8 | ||
| 100 | andi $t2,@X[$i],0xFF00 | ||
| 101 | sll @X[$i],@X[$i],24 | ||
| 102 | andi $t1,0xFF00 | ||
| 103 | sll $t2,$t2,8 | ||
| 104 | or @X[$i],$t0 | ||
| 105 | or $t1,$t2 | ||
| 106 | or @X[$i],$t1 | ||
| 107 | ___ | ||
| 108 | $code.=<<___; | ||
| 109 | lwl @X[$j],$j*4+$MSB($inp) | ||
| 110 | sll $t0,$a,5 # $i | ||
| 111 | addu $e,$K | ||
| 112 | lwr @X[$j],$j*4+$LSB($inp) | ||
| 113 | srl $t1,$a,27 | ||
| 114 | addu $e,$t0 | ||
| 115 | xor $t0,$c,$d | ||
| 116 | addu $e,$t1 | ||
| 117 | sll $t2,$b,30 | ||
| 118 | and $t0,$b | ||
| 119 | srl $b,$b,2 | ||
| 120 | xor $t0,$d | ||
| 121 | addu $e,@X[$i] | ||
| 122 | or $b,$t2 | ||
| 123 | addu $e,$t0 | ||
| 124 | ___ | ||
| 125 | } | ||
| 126 | |||
| 127 | sub BODY_15_19 { | ||
| 128 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 129 | my $j=$i+1; | ||
| 130 | |||
| 131 | $code.=<<___ if (!$big_endian && $i==15); | ||
| 132 | srl $t0,@X[$i],24 # byte swap($i) | ||
| 133 | srl $t1,@X[$i],8 | ||
| 134 | andi $t2,@X[$i],0xFF00 | ||
| 135 | sll @X[$i],@X[$i],24 | ||
| 136 | andi $t1,0xFF00 | ||
| 137 | sll $t2,$t2,8 | ||
| 138 | or @X[$i],$t0 | ||
| 139 | or @X[$i],$t1 | ||
| 140 | or @X[$i],$t2 | ||
| 141 | ___ | ||
| 142 | $code.=<<___; | ||
| 143 | xor @X[$j%16],@X[($j+2)%16] | ||
| 144 | sll $t0,$a,5 # $i | ||
| 145 | addu $e,$K | ||
| 146 | srl $t1,$a,27 | ||
| 147 | addu $e,$t0 | ||
| 148 | xor @X[$j%16],@X[($j+8)%16] | ||
| 149 | xor $t0,$c,$d | ||
| 150 | addu $e,$t1 | ||
| 151 | xor @X[$j%16],@X[($j+13)%16] | ||
| 152 | sll $t2,$b,30 | ||
| 153 | and $t0,$b | ||
| 154 | srl $t1,@X[$j%16],31 | ||
| 155 | addu @X[$j%16],@X[$j%16] | ||
| 156 | srl $b,$b,2 | ||
| 157 | xor $t0,$d | ||
| 158 | or @X[$j%16],$t1 | ||
| 159 | addu $e,@X[$i%16] | ||
| 160 | or $b,$t2 | ||
| 161 | addu $e,$t0 | ||
| 162 | ___ | ||
| 163 | } | ||
| 164 | |||
| 165 | sub BODY_20_39 { | ||
| 166 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 167 | my $j=$i+1; | ||
| 168 | $code.=<<___ if ($i<79); | ||
| 169 | xor @X[$j%16],@X[($j+2)%16] | ||
| 170 | sll $t0,$a,5 # $i | ||
| 171 | addu $e,$K | ||
| 172 | srl $t1,$a,27 | ||
| 173 | addu $e,$t0 | ||
| 174 | xor @X[$j%16],@X[($j+8)%16] | ||
| 175 | xor $t0,$c,$d | ||
| 176 | addu $e,$t1 | ||
| 177 | xor @X[$j%16],@X[($j+13)%16] | ||
| 178 | sll $t2,$b,30 | ||
| 179 | xor $t0,$b | ||
| 180 | srl $t1,@X[$j%16],31 | ||
| 181 | addu @X[$j%16],@X[$j%16] | ||
| 182 | srl $b,$b,2 | ||
| 183 | addu $e,@X[$i%16] | ||
| 184 | or @X[$j%16],$t1 | ||
| 185 | or $b,$t2 | ||
| 186 | addu $e,$t0 | ||
| 187 | ___ | ||
| 188 | $code.=<<___ if ($i==79); | ||
| 189 | lw @X[0],0($ctx) | ||
| 190 | sll $t0,$a,5 # $i | ||
| 191 | addu $e,$K | ||
| 192 | lw @X[1],4($ctx) | ||
| 193 | srl $t1,$a,27 | ||
| 194 | addu $e,$t0 | ||
| 195 | lw @X[2],8($ctx) | ||
| 196 | xor $t0,$c,$d | ||
| 197 | addu $e,$t1 | ||
| 198 | lw @X[3],12($ctx) | ||
| 199 | sll $t2,$b,30 | ||
| 200 | xor $t0,$b | ||
| 201 | lw @X[4],16($ctx) | ||
| 202 | srl $b,$b,2 | ||
| 203 | addu $e,@X[$i%16] | ||
| 204 | or $b,$t2 | ||
| 205 | addu $e,$t0 | ||
| 206 | ___ | ||
| 207 | } | ||
| 208 | |||
| 209 | sub BODY_40_59 { | ||
| 210 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 211 | my $j=$i+1; | ||
| 212 | $code.=<<___ if ($i<79); | ||
| 213 | xor @X[$j%16],@X[($j+2)%16] | ||
| 214 | sll $t0,$a,5 # $i | ||
| 215 | addu $e,$K | ||
| 216 | srl $t1,$a,27 | ||
| 217 | addu $e,$t0 | ||
| 218 | xor @X[$j%16],@X[($j+8)%16] | ||
| 219 | and $t0,$c,$d | ||
| 220 | addu $e,$t1 | ||
| 221 | xor @X[$j%16],@X[($j+13)%16] | ||
| 222 | sll $t2,$b,30 | ||
| 223 | addu $e,$t0 | ||
| 224 | srl $t1,@X[$j%16],31 | ||
| 225 | xor $t0,$c,$d | ||
| 226 | addu @X[$j%16],@X[$j%16] | ||
| 227 | and $t0,$b | ||
| 228 | srl $b,$b,2 | ||
| 229 | or @X[$j%16],$t1 | ||
| 230 | addu $e,@X[$i%16] | ||
| 231 | or $b,$t2 | ||
| 232 | addu $e,$t0 | ||
| 233 | ___ | ||
| 234 | } | ||
| 235 | |||
| 236 | $FRAMESIZE=16; # large enough to accomodate NUBI saved registers | ||
| 237 | $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; | ||
| 238 | |||
| 239 | $code=<<___; | ||
| 240 | #ifdef OPENSSL_FIPSCANISTER | ||
| 241 | # include <openssl/fipssyms.h> | ||
| 242 | #endif | ||
| 243 | |||
| 244 | .text | ||
| 245 | |||
| 246 | .set noat | ||
| 247 | .set noreorder | ||
| 248 | .align 5 | ||
| 249 | .globl sha1_block_data_order | ||
| 250 | .ent sha1_block_data_order | ||
| 251 | sha1_block_data_order: | ||
| 252 | .frame $sp,$FRAMESIZE*$SZREG,$ra | ||
| 253 | .mask $SAVED_REGS_MASK,-$SZREG | ||
| 254 | .set noreorder | ||
| 255 | $PTR_SUB $sp,$FRAMESIZE*$SZREG | ||
| 256 | $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp) | ||
| 257 | $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp) | ||
| 258 | $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp) | ||
| 259 | $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp) | ||
| 260 | $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp) | ||
| 261 | $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp) | ||
| 262 | $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp) | ||
| 263 | $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp) | ||
| 264 | $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp) | ||
| 265 | $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp) | ||
| 266 | ___ | ||
| 267 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
| 268 | $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp) | ||
| 269 | $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp) | ||
| 270 | $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp) | ||
| 271 | $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp) | ||
| 272 | $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp) | ||
| 273 | ___ | ||
| 274 | $code.=<<___; | ||
| 275 | $PTR_SLL $num,6 | ||
| 276 | $PTR_ADD $num,$inp | ||
| 277 | $REG_S $num,0($sp) | ||
| 278 | lw $A,0($ctx) | ||
| 279 | lw $B,4($ctx) | ||
| 280 | lw $C,8($ctx) | ||
| 281 | lw $D,12($ctx) | ||
| 282 | b .Loop | ||
| 283 | lw $E,16($ctx) | ||
| 284 | .align 4 | ||
| 285 | .Loop: | ||
| 286 | .set reorder | ||
| 287 | lwl @X[0],$MSB($inp) | ||
| 288 | lui $K,0x5a82 | ||
| 289 | lwr @X[0],$LSB($inp) | ||
| 290 | ori $K,0x7999 # K_00_19 | ||
| 291 | ___ | ||
| 292 | for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); } | ||
| 293 | for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); } | ||
| 294 | $code.=<<___; | ||
| 295 | lui $K,0x6ed9 | ||
| 296 | ori $K,0xeba1 # K_20_39 | ||
| 297 | ___ | ||
| 298 | for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 299 | $code.=<<___; | ||
| 300 | lui $K,0x8f1b | ||
| 301 | ori $K,0xbcdc # K_40_59 | ||
| 302 | ___ | ||
| 303 | for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | ||
| 304 | $code.=<<___; | ||
| 305 | lui $K,0xca62 | ||
| 306 | ori $K,0xc1d6 # K_60_79 | ||
| 307 | ___ | ||
| 308 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 309 | $code.=<<___; | ||
| 310 | $PTR_ADD $inp,64 | ||
| 311 | $REG_L $num,0($sp) | ||
| 312 | |||
| 313 | addu $A,$X[0] | ||
| 314 | addu $B,$X[1] | ||
| 315 | sw $A,0($ctx) | ||
| 316 | addu $C,$X[2] | ||
| 317 | addu $D,$X[3] | ||
| 318 | sw $B,4($ctx) | ||
| 319 | addu $E,$X[4] | ||
| 320 | sw $C,8($ctx) | ||
| 321 | sw $D,12($ctx) | ||
| 322 | sw $E,16($ctx) | ||
| 323 | .set noreorder | ||
| 324 | bne $inp,$num,.Loop | ||
| 325 | nop | ||
| 326 | |||
| 327 | .set noreorder | ||
| 328 | $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp) | ||
| 329 | $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp) | ||
| 330 | $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp) | ||
| 331 | $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp) | ||
| 332 | $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp) | ||
| 333 | $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp) | ||
| 334 | $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp) | ||
| 335 | $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp) | ||
| 336 | $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp) | ||
| 337 | $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp) | ||
| 338 | ___ | ||
| 339 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 340 | $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp) | ||
| 341 | $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp) | ||
| 342 | $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp) | ||
| 343 | $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp) | ||
| 344 | $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp) | ||
| 345 | ___ | ||
| 346 | $code.=<<___; | ||
| 347 | jr $ra | ||
| 348 | $PTR_ADD $sp,$FRAMESIZE*$SZREG | ||
| 349 | .end sha1_block_data_order | ||
| 350 | .rdata | ||
| 351 | .asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 352 | ___ | ||
| 353 | print $code; | ||
| 354 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha1-parisc.pl b/src/lib/libcrypto/sha/asm/sha1-parisc.pl new file mode 100644 index 0000000000..6d7bf495b2 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-parisc.pl | |||
| @@ -0,0 +1,259 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # SHA1 block procedure for PA-RISC. | ||
| 11 | |||
| 12 | # June 2009. | ||
| 13 | # | ||
| 14 | # On PA-7100LC performance is >30% better than gcc 3.2 generated code | ||
| 15 | # for aligned input and >50% better for unaligned. Compared to vendor | ||
| 16 | # compiler on PA-8600 it's almost 60% faster in 64-bit build and just | ||
| 17 | # few percent faster in 32-bit one (this for aligned input, data for | ||
| 18 | # unaligned input is not available). | ||
| 19 | # | ||
| 20 | # Special thanks to polarhome.com for providing HP-UX account. | ||
| 21 | |||
| 22 | $flavour = shift; | ||
| 23 | $output = shift; | ||
| 24 | open STDOUT,">$output"; | ||
| 25 | |||
| 26 | if ($flavour =~ /64/) { | ||
| 27 | $LEVEL ="2.0W"; | ||
| 28 | $SIZE_T =8; | ||
| 29 | $FRAME_MARKER =80; | ||
| 30 | $SAVED_RP =16; | ||
| 31 | $PUSH ="std"; | ||
| 32 | $PUSHMA ="std,ma"; | ||
| 33 | $POP ="ldd"; | ||
| 34 | $POPMB ="ldd,mb"; | ||
| 35 | } else { | ||
| 36 | $LEVEL ="1.0"; | ||
| 37 | $SIZE_T =4; | ||
| 38 | $FRAME_MARKER =48; | ||
| 39 | $SAVED_RP =20; | ||
| 40 | $PUSH ="stw"; | ||
| 41 | $PUSHMA ="stwm"; | ||
| 42 | $POP ="ldw"; | ||
| 43 | $POPMB ="ldwm"; | ||
| 44 | } | ||
| 45 | |||
| 46 | $FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker | ||
| 47 | # [+ argument transfer] | ||
| 48 | $ctx="%r26"; # arg0 | ||
| 49 | $inp="%r25"; # arg1 | ||
| 50 | $num="%r24"; # arg2 | ||
| 51 | |||
| 52 | $t0="%r28"; | ||
| 53 | $t1="%r29"; | ||
| 54 | $K="%r31"; | ||
| 55 | |||
| 56 | @X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", | ||
| 57 | "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0); | ||
| 58 | |||
| 59 | @V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23"); | ||
| 60 | |||
| 61 | sub BODY_00_19 { | ||
| 62 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 63 | my $j=$i+1; | ||
| 64 | $code.=<<___ if ($i<15); | ||
| 65 | addl $K,$e,$e ; $i | ||
| 66 | shd $a,$a,27,$t1 | ||
| 67 | addl @X[$i],$e,$e | ||
| 68 | and $c,$b,$t0 | ||
| 69 | addl $t1,$e,$e | ||
| 70 | andcm $d,$b,$t1 | ||
| 71 | shd $b,$b,2,$b | ||
| 72 | or $t1,$t0,$t0 | ||
| 73 | addl $t0,$e,$e | ||
| 74 | ___ | ||
| 75 | $code.=<<___ if ($i>=15); # with forward Xupdate | ||
| 76 | addl $K,$e,$e ; $i | ||
| 77 | shd $a,$a,27,$t1 | ||
| 78 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
| 79 | addl @X[$i%16],$e,$e | ||
| 80 | and $c,$b,$t0 | ||
| 81 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
| 82 | addl $t1,$e,$e | ||
| 83 | andcm $d,$b,$t1 | ||
| 84 | shd $b,$b,2,$b | ||
| 85 | or $t1,$t0,$t0 | ||
| 86 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
| 87 | add $t0,$e,$e | ||
| 88 | shd @X[$j%16],@X[$j%16],31,@X[$j%16] | ||
| 89 | ___ | ||
| 90 | } | ||
| 91 | |||
| 92 | sub BODY_20_39 { | ||
| 93 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 94 | my $j=$i+1; | ||
| 95 | $code.=<<___ if ($i<79); | ||
| 96 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i | ||
| 97 | addl $K,$e,$e | ||
| 98 | shd $a,$a,27,$t1 | ||
| 99 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
| 100 | addl @X[$i%16],$e,$e | ||
| 101 | xor $b,$c,$t0 | ||
| 102 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
| 103 | addl $t1,$e,$e | ||
| 104 | shd $b,$b,2,$b | ||
| 105 | xor $d,$t0,$t0 | ||
| 106 | shd @X[$j%16],@X[$j%16],31,@X[$j%16] | ||
| 107 | addl $t0,$e,$e | ||
| 108 | ___ | ||
| 109 | $code.=<<___ if ($i==79); # with context load | ||
| 110 | ldw 0($ctx),@X[0] ; $i | ||
| 111 | addl $K,$e,$e | ||
| 112 | shd $a,$a,27,$t1 | ||
| 113 | ldw 4($ctx),@X[1] | ||
| 114 | addl @X[$i%16],$e,$e | ||
| 115 | xor $b,$c,$t0 | ||
| 116 | ldw 8($ctx),@X[2] | ||
| 117 | addl $t1,$e,$e | ||
| 118 | shd $b,$b,2,$b | ||
| 119 | xor $d,$t0,$t0 | ||
| 120 | ldw 12($ctx),@X[3] | ||
| 121 | addl $t0,$e,$e | ||
| 122 | ldw 16($ctx),@X[4] | ||
| 123 | ___ | ||
| 124 | } | ||
| 125 | |||
| 126 | sub BODY_40_59 { | ||
| 127 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 128 | my $j=$i+1; | ||
| 129 | $code.=<<___; | ||
| 130 | shd $a,$a,27,$t1 ; $i | ||
| 131 | addl $K,$e,$e | ||
| 132 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
| 133 | xor $d,$c,$t0 | ||
| 134 | addl @X[$i%16],$e,$e | ||
| 135 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
| 136 | and $b,$t0,$t0 | ||
| 137 | addl $t1,$e,$e | ||
| 138 | shd $b,$b,2,$b | ||
| 139 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
| 140 | addl $t0,$e,$e | ||
| 141 | and $d,$c,$t1 | ||
| 142 | shd @X[$j%16],@X[$j%16],31,@X[$j%16] | ||
| 143 | addl $t1,$e,$e | ||
| 144 | ___ | ||
| 145 | } | ||
| 146 | |||
| 147 | $code=<<___; | ||
| 148 | .LEVEL $LEVEL | ||
| 149 | .SPACE \$TEXT\$ | ||
| 150 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 151 | |||
| 152 | .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
| 153 | sha1_block_data_order | ||
| 154 | .PROC | ||
| 155 | .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16 | ||
| 156 | .ENTRY | ||
| 157 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 158 | $PUSHMA %r3,$FRAME(%sp) | ||
| 159 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 160 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 161 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 162 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 163 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 164 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 165 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 166 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
| 167 | $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) | ||
| 168 | $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) | ||
| 169 | $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) | ||
| 170 | $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) | ||
| 171 | $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) | ||
| 172 | |||
| 173 | ldw 0($ctx),$A | ||
| 174 | ldw 4($ctx),$B | ||
| 175 | ldw 8($ctx),$C | ||
| 176 | ldw 12($ctx),$D | ||
| 177 | ldw 16($ctx),$E | ||
| 178 | |||
| 179 | extru $inp,31,2,$t0 ; t0=inp&3; | ||
| 180 | sh3addl $t0,%r0,$t0 ; t0*=8; | ||
| 181 | subi 32,$t0,$t0 ; t0=32-t0; | ||
| 182 | mtctl $t0,%cr11 ; %sar=t0; | ||
| 183 | |||
| 184 | L\$oop | ||
| 185 | ldi 3,$t0 | ||
| 186 | andcm $inp,$t0,$t0 ; 64-bit neutral | ||
| 187 | ___ | ||
| 188 | for ($i=0;$i<15;$i++) { # load input block | ||
| 189 | $code.="\tldw `4*$i`($t0),@X[$i]\n"; } | ||
| 190 | $code.=<<___; | ||
| 191 | cmpb,*= $inp,$t0,L\$aligned | ||
| 192 | ldw 60($t0),@X[15] | ||
| 193 | ldw 64($t0),@X[16] | ||
| 194 | ___ | ||
| 195 | for ($i=0;$i<16;$i++) { # align input | ||
| 196 | $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; } | ||
| 197 | $code.=<<___; | ||
| 198 | L\$aligned | ||
| 199 | ldil L'0x5a827000,$K ; K_00_19 | ||
| 200 | ldo 0x999($K),$K | ||
| 201 | ___ | ||
| 202 | for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } | ||
| 203 | $code.=<<___; | ||
| 204 | ldil L'0x6ed9e000,$K ; K_20_39 | ||
| 205 | ldo 0xba1($K),$K | ||
| 206 | ___ | ||
| 207 | |||
| 208 | for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 209 | $code.=<<___; | ||
| 210 | ldil L'0x8f1bb000,$K ; K_40_59 | ||
| 211 | ldo 0xcdc($K),$K | ||
| 212 | ___ | ||
| 213 | |||
| 214 | for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | ||
| 215 | $code.=<<___; | ||
| 216 | ldil L'0xca62c000,$K ; K_60_79 | ||
| 217 | ldo 0x1d6($K),$K | ||
| 218 | ___ | ||
| 219 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 220 | |||
| 221 | $code.=<<___; | ||
| 222 | addl @X[0],$A,$A | ||
| 223 | addl @X[1],$B,$B | ||
| 224 | addl @X[2],$C,$C | ||
| 225 | addl @X[3],$D,$D | ||
| 226 | addl @X[4],$E,$E | ||
| 227 | stw $A,0($ctx) | ||
| 228 | stw $B,4($ctx) | ||
| 229 | stw $C,8($ctx) | ||
| 230 | stw $D,12($ctx) | ||
| 231 | stw $E,16($ctx) | ||
| 232 | addib,*<> -1,$num,L\$oop | ||
| 233 | ldo 64($inp),$inp | ||
| 234 | |||
| 235 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 236 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 237 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 238 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 239 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 240 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 241 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 242 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 243 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
| 244 | $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 | ||
| 245 | $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 | ||
| 246 | $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 | ||
| 247 | $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 | ||
| 248 | $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 | ||
| 249 | bv (%r2) | ||
| 250 | .EXIT | ||
| 251 | $POPMB -$FRAME(%sp),%r3 | ||
| 252 | .PROCEND | ||
| 253 | .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 254 | ___ | ||
| 255 | |||
| 256 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 257 | $code =~ s/,\*/,/gm if ($SIZE_T==4); | ||
| 258 | print $code; | ||
| 259 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl index dcd0fcdfcf..2140dd2f8d 100755 --- a/src/lib/libcrypto/sha/asm/sha1-ppc.pl +++ b/src/lib/libcrypto/sha/asm/sha1-ppc.pl | |||
| @@ -24,12 +24,14 @@ $flavour = shift; | |||
| 24 | 24 | ||
| 25 | if ($flavour =~ /64/) { | 25 | if ($flavour =~ /64/) { |
| 26 | $SIZE_T =8; | 26 | $SIZE_T =8; |
| 27 | $LRSAVE =2*$SIZE_T; | ||
| 27 | $UCMP ="cmpld"; | 28 | $UCMP ="cmpld"; |
| 28 | $STU ="stdu"; | 29 | $STU ="stdu"; |
| 29 | $POP ="ld"; | 30 | $POP ="ld"; |
| 30 | $PUSH ="std"; | 31 | $PUSH ="std"; |
| 31 | } elsif ($flavour =~ /32/) { | 32 | } elsif ($flavour =~ /32/) { |
| 32 | $SIZE_T =4; | 33 | $SIZE_T =4; |
| 34 | $LRSAVE =$SIZE_T; | ||
| 33 | $UCMP ="cmplw"; | 35 | $UCMP ="cmplw"; |
| 34 | $STU ="stwu"; | 36 | $STU ="stwu"; |
| 35 | $POP ="lwz"; | 37 | $POP ="lwz"; |
| @@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl"; | |||
| 43 | 45 | ||
| 44 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; | 46 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; |
| 45 | 47 | ||
| 46 | $FRAME=24*$SIZE_T; | 48 | $FRAME=24*$SIZE_T+64; |
| 49 | $LOCALS=6*$SIZE_T; | ||
| 47 | 50 | ||
| 48 | $K ="r0"; | 51 | $K ="r0"; |
| 49 | $sp ="r1"; | 52 | $sp ="r1"; |
| @@ -162,9 +165,8 @@ $code=<<___; | |||
| 162 | .globl .sha1_block_data_order | 165 | .globl .sha1_block_data_order |
| 163 | .align 4 | 166 | .align 4 |
| 164 | .sha1_block_data_order: | 167 | .sha1_block_data_order: |
| 168 | $STU $sp,-$FRAME($sp) | ||
| 165 | mflr r0 | 169 | mflr r0 |
| 166 | $STU $sp,`-($FRAME+64)`($sp) | ||
| 167 | $PUSH r0,`$FRAME-$SIZE_T*18`($sp) | ||
| 168 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) | 170 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) |
| 169 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) | 171 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) |
| 170 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) | 172 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) |
| @@ -182,6 +184,7 @@ $code=<<___; | |||
| 182 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | 184 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
| 183 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | 185 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
| 184 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | 186 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
| 187 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | ||
| 185 | lwz $A,0($ctx) | 188 | lwz $A,0($ctx) |
| 186 | lwz $B,4($ctx) | 189 | lwz $B,4($ctx) |
| 187 | lwz $C,8($ctx) | 190 | lwz $C,8($ctx) |
| @@ -192,37 +195,14 @@ $code=<<___; | |||
| 192 | Laligned: | 195 | Laligned: |
| 193 | mtctr $num | 196 | mtctr $num |
| 194 | bl Lsha1_block_private | 197 | bl Lsha1_block_private |
| 195 | Ldone: | 198 | b Ldone |
| 196 | $POP r0,`$FRAME-$SIZE_T*18`($sp) | ||
| 197 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | ||
| 198 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | ||
| 199 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | ||
| 200 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | ||
| 201 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | ||
| 202 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | ||
| 203 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | ||
| 204 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | ||
| 205 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | ||
| 206 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | ||
| 207 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | ||
| 208 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | ||
| 209 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | ||
| 210 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | ||
| 211 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | ||
| 212 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | ||
| 213 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | ||
| 214 | mtlr r0 | ||
| 215 | addi $sp,$sp,`$FRAME+64` | ||
| 216 | blr | ||
| 217 | ___ | ||
| 218 | 199 | ||
| 219 | # PowerPC specification allows an implementation to be ill-behaved | 200 | ; PowerPC specification allows an implementation to be ill-behaved |
| 220 | # upon unaligned access which crosses page boundary. "Better safe | 201 | ; upon unaligned access which crosses page boundary. "Better safe |
| 221 | # than sorry" principle makes me treat it specially. But I don't | 202 | ; than sorry" principle makes me treat it specially. But I don't |
| 222 | # look for particular offending word, but rather for 64-byte input | 203 | ; look for particular offending word, but rather for 64-byte input |
| 223 | # block which crosses the boundary. Once found that block is aligned | 204 | ; block which crosses the boundary. Once found that block is aligned |
| 224 | # and hashed separately... | 205 | ; and hashed separately... |
| 225 | $code.=<<___; | ||
| 226 | .align 4 | 206 | .align 4 |
| 227 | Lunaligned: | 207 | Lunaligned: |
| 228 | subfic $t1,$inp,4096 | 208 | subfic $t1,$inp,4096 |
| @@ -237,7 +217,7 @@ Lunaligned: | |||
| 237 | Lcross_page: | 217 | Lcross_page: |
| 238 | li $t1,16 | 218 | li $t1,16 |
| 239 | mtctr $t1 | 219 | mtctr $t1 |
| 240 | addi r20,$sp,$FRAME ; spot below the frame | 220 | addi r20,$sp,$LOCALS ; spot within the frame |
| 241 | Lmemcpy: | 221 | Lmemcpy: |
| 242 | lbz r16,0($inp) | 222 | lbz r16,0($inp) |
| 243 | lbz r17,1($inp) | 223 | lbz r17,1($inp) |
| @@ -251,15 +231,40 @@ Lmemcpy: | |||
| 251 | addi r20,r20,4 | 231 | addi r20,r20,4 |
| 252 | bdnz Lmemcpy | 232 | bdnz Lmemcpy |
| 253 | 233 | ||
| 254 | $PUSH $inp,`$FRAME-$SIZE_T*19`($sp) | 234 | $PUSH $inp,`$FRAME-$SIZE_T*18`($sp) |
| 255 | li $t1,1 | 235 | li $t1,1 |
| 256 | addi $inp,$sp,$FRAME | 236 | addi $inp,$sp,$LOCALS |
| 257 | mtctr $t1 | 237 | mtctr $t1 |
| 258 | bl Lsha1_block_private | 238 | bl Lsha1_block_private |
| 259 | $POP $inp,`$FRAME-$SIZE_T*19`($sp) | 239 | $POP $inp,`$FRAME-$SIZE_T*18`($sp) |
| 260 | addic. $num,$num,-1 | 240 | addic. $num,$num,-1 |
| 261 | bne- Lunaligned | 241 | bne- Lunaligned |
| 262 | b Ldone | 242 | |
| 243 | Ldone: | ||
| 244 | $POP r0,`$FRAME+$LRSAVE`($sp) | ||
| 245 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | ||
| 246 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | ||
| 247 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | ||
| 248 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | ||
| 249 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | ||
| 250 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | ||
| 251 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | ||
| 252 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | ||
| 253 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | ||
| 254 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | ||
| 255 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | ||
| 256 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | ||
| 257 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | ||
| 258 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | ||
| 259 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | ||
| 260 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | ||
| 261 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | ||
| 262 | mtlr r0 | ||
| 263 | addi $sp,$sp,$FRAME | ||
| 264 | blr | ||
| 265 | .long 0 | ||
| 266 | .byte 0,12,4,1,0x80,18,3,0 | ||
| 267 | .long 0 | ||
| 263 | ___ | 268 | ___ |
| 264 | 269 | ||
| 265 | # This is private block function, which uses tailored calling | 270 | # This is private block function, which uses tailored calling |
| @@ -309,6 +314,8 @@ $code.=<<___; | |||
| 309 | addi $inp,$inp,`16*4` | 314 | addi $inp,$inp,`16*4` |
| 310 | bdnz- Lsha1_block_private | 315 | bdnz- Lsha1_block_private |
| 311 | blr | 316 | blr |
| 317 | .long 0 | ||
| 318 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 312 | ___ | 319 | ___ |
| 313 | $code.=<<___; | 320 | $code.=<<___; |
| 314 | .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" | 321 | .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" |
diff --git a/src/lib/libcrypto/sha/asm/sha1-s390x.pl b/src/lib/libcrypto/sha/asm/sha1-s390x.pl index 4b17848287..9193dda45e 100644 --- a/src/lib/libcrypto/sha/asm/sha1-s390x.pl +++ b/src/lib/libcrypto/sha/asm/sha1-s390x.pl | |||
| @@ -21,9 +21,28 @@ | |||
| 21 | # instructions to favour dual-issue z10 pipeline. On z10 hardware is | 21 | # instructions to favour dual-issue z10 pipeline. On z10 hardware is |
| 22 | # "only" ~2.3x faster than software. | 22 | # "only" ~2.3x faster than software. |
| 23 | 23 | ||
| 24 | # November 2010. | ||
| 25 | # | ||
| 26 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 27 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 28 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 29 | # application context. The feature is not specific to any particular | ||
| 30 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 31 | # remains z/Architecture specific. | ||
| 32 | |||
| 24 | $kimdfunc=1; # magic function code for kimd instruction | 33 | $kimdfunc=1; # magic function code for kimd instruction |
| 25 | 34 | ||
| 26 | $output=shift; | 35 | $flavour = shift; |
| 36 | |||
| 37 | if ($flavour =~ /3[12]/) { | ||
| 38 | $SIZE_T=4; | ||
| 39 | $g=""; | ||
| 40 | } else { | ||
| 41 | $SIZE_T=8; | ||
| 42 | $g="g"; | ||
| 43 | } | ||
| 44 | |||
| 45 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 27 | open STDOUT,">$output"; | 46 | open STDOUT,">$output"; |
| 28 | 47 | ||
| 29 | $K_00_39="%r0"; $K=$K_00_39; | 48 | $K_00_39="%r0"; $K=$K_00_39; |
| @@ -42,13 +61,14 @@ $t1="%r11"; | |||
| 42 | @X=("%r12","%r13","%r14"); | 61 | @X=("%r12","%r13","%r14"); |
| 43 | $sp="%r15"; | 62 | $sp="%r15"; |
| 44 | 63 | ||
| 45 | $frame=160+16*4; | 64 | $stdframe=16*$SIZE_T+4*8; |
| 65 | $frame=$stdframe+16*4; | ||
| 46 | 66 | ||
| 47 | sub Xupdate { | 67 | sub Xupdate { |
| 48 | my $i=shift; | 68 | my $i=shift; |
| 49 | 69 | ||
| 50 | $code.=<<___ if ($i==15); | 70 | $code.=<<___ if ($i==15); |
| 51 | lg $prefetch,160($sp) ### Xupdate(16) warm-up | 71 | lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up |
| 52 | lr $X[0],$X[2] | 72 | lr $X[0],$X[2] |
| 53 | ___ | 73 | ___ |
| 54 | return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle | 74 | return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle |
| @@ -58,8 +78,8 @@ $code.=<<___ if ($i<16); | |||
| 58 | ___ | 78 | ___ |
| 59 | $code.=<<___ if ($i>=16); | 79 | $code.=<<___ if ($i>=16); |
| 60 | xgr $X[0],$prefetch ### Xupdate($i) | 80 | xgr $X[0],$prefetch ### Xupdate($i) |
| 61 | lg $prefetch,`160+4*(($i+2)%16)`($sp) | 81 | lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp) |
| 62 | xg $X[0],`160+4*(($i+8)%16)`($sp) | 82 | xg $X[0],`$stdframe+4*(($i+8)%16)`($sp) |
| 63 | xgr $X[0],$prefetch | 83 | xgr $X[0],$prefetch |
| 64 | rll $X[0],$X[0],1 | 84 | rll $X[0],$X[0],1 |
| 65 | rllg $X[1],$X[0],32 | 85 | rllg $X[1],$X[0],32 |
| @@ -68,7 +88,7 @@ $code.=<<___ if ($i>=16); | |||
| 68 | lr $X[2],$X[1] # feedback | 88 | lr $X[2],$X[1] # feedback |
| 69 | ___ | 89 | ___ |
| 70 | $code.=<<___ if ($i<=70); | 90 | $code.=<<___ if ($i<=70); |
| 71 | stg $X[0],`160+4*($i%16)`($sp) | 91 | stg $X[0],`$stdframe+4*($i%16)`($sp) |
| 72 | ___ | 92 | ___ |
| 73 | unshift(@X,pop(@X)); | 93 | unshift(@X,pop(@X)); |
| 74 | } | 94 | } |
| @@ -148,9 +168,9 @@ $code.=<<___ if ($kimdfunc); | |||
| 148 | tmhl %r0,0x4000 # check for message-security assist | 168 | tmhl %r0,0x4000 # check for message-security assist |
| 149 | jz .Lsoftware | 169 | jz .Lsoftware |
| 150 | lghi %r0,0 | 170 | lghi %r0,0 |
| 151 | la %r1,16($sp) | 171 | la %r1,`2*$SIZE_T`($sp) |
| 152 | .long 0xb93e0002 # kimd %r0,%r2 | 172 | .long 0xb93e0002 # kimd %r0,%r2 |
| 153 | lg %r0,16($sp) | 173 | lg %r0,`2*$SIZE_T`($sp) |
| 154 | tmhh %r0,`0x8000>>$kimdfunc` | 174 | tmhh %r0,`0x8000>>$kimdfunc` |
| 155 | jz .Lsoftware | 175 | jz .Lsoftware |
| 156 | lghi %r0,$kimdfunc | 176 | lghi %r0,$kimdfunc |
| @@ -165,11 +185,11 @@ $code.=<<___ if ($kimdfunc); | |||
| 165 | ___ | 185 | ___ |
| 166 | $code.=<<___; | 186 | $code.=<<___; |
| 167 | lghi %r1,-$frame | 187 | lghi %r1,-$frame |
| 168 | stg $ctx,16($sp) | 188 | st${g} $ctx,`2*$SIZE_T`($sp) |
| 169 | stmg %r6,%r15,48($sp) | 189 | stm${g} %r6,%r15,`6*$SIZE_T`($sp) |
| 170 | lgr %r0,$sp | 190 | lgr %r0,$sp |
| 171 | la $sp,0(%r1,$sp) | 191 | la $sp,0(%r1,$sp) |
| 172 | stg %r0,0($sp) | 192 | st${g} %r0,0($sp) |
| 173 | 193 | ||
| 174 | larl $t0,Ktable | 194 | larl $t0,Ktable |
| 175 | llgf $A,0($ctx) | 195 | llgf $A,0($ctx) |
| @@ -199,7 +219,7 @@ ___ | |||
| 199 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | 219 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } |
| 200 | $code.=<<___; | 220 | $code.=<<___; |
| 201 | 221 | ||
| 202 | lg $ctx,`$frame+16`($sp) | 222 | l${g} $ctx,`$frame+2*$SIZE_T`($sp) |
| 203 | la $inp,64($inp) | 223 | la $inp,64($inp) |
| 204 | al $A,0($ctx) | 224 | al $A,0($ctx) |
| 205 | al $B,4($ctx) | 225 | al $B,4($ctx) |
| @@ -211,13 +231,13 @@ $code.=<<___; | |||
| 211 | st $C,8($ctx) | 231 | st $C,8($ctx) |
| 212 | st $D,12($ctx) | 232 | st $D,12($ctx) |
| 213 | st $E,16($ctx) | 233 | st $E,16($ctx) |
| 214 | brct $len,.Lloop | 234 | brct${g} $len,.Lloop |
| 215 | 235 | ||
| 216 | lmg %r6,%r15,`$frame+48`($sp) | 236 | lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) |
| 217 | br %r14 | 237 | br %r14 |
| 218 | .size sha1_block_data_order,.-sha1_block_data_order | 238 | .size sha1_block_data_order,.-sha1_block_data_order |
| 219 | .string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 239 | .string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
| 220 | .comm OPENSSL_s390xcap_P,8,8 | 240 | .comm OPENSSL_s390xcap_P,16,8 |
| 221 | ___ | 241 | ___ |
| 222 | 242 | ||
| 223 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 243 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl index 4edc5ea9ad..f27c1e3fb0 100755 --- a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl +++ b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl | |||
| @@ -16,7 +16,7 @@ | |||
| 16 | # There was suggestion to mechanically translate 32-bit code, but I | 16 | # There was suggestion to mechanically translate 32-bit code, but I |
| 17 | # dismissed it, reasoning that x86_64 offers enough register bank | 17 | # dismissed it, reasoning that x86_64 offers enough register bank |
| 18 | # capacity to fully utilize SHA-1 parallelism. Therefore this fresh | 18 | # capacity to fully utilize SHA-1 parallelism. Therefore this fresh |
| 19 | # implementation:-) However! While 64-bit code does performs better | 19 | # implementation:-) However! While 64-bit code does perform better |
| 20 | # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, | 20 | # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, |
| 21 | # x86_64 does offer larger *addressable* bank, but out-of-order core | 21 | # x86_64 does offer larger *addressable* bank, but out-of-order core |
| 22 | # reaches for even more registers through dynamic aliasing, and EM64T | 22 | # reaches for even more registers through dynamic aliasing, and EM64T |
| @@ -29,6 +29,38 @@ | |||
| 29 | # Xeon P4 +65% +0% 9.9 | 29 | # Xeon P4 +65% +0% 9.9 |
| 30 | # Core2 +60% +10% 7.0 | 30 | # Core2 +60% +10% 7.0 |
| 31 | 31 | ||
| 32 | # August 2009. | ||
| 33 | # | ||
| 34 | # The code was revised to minimize code size and to maximize | ||
| 35 | # "distance" between instructions producing input to 'lea' | ||
| 36 | # instruction and the 'lea' instruction itself, which is essential | ||
| 37 | # for Intel Atom core. | ||
| 38 | |||
| 39 | # October 2010. | ||
| 40 | # | ||
| 41 | # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it | ||
| 42 | # is to offload message schedule denoted by Wt in NIST specification, | ||
| 43 | # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module | ||
| 44 | # for background and implementation details. The only difference from | ||
| 45 | # 32-bit code is that 64-bit code doesn't have to spill @X[] elements | ||
| 46 | # to free temporary registers. | ||
| 47 | |||
| 48 | # April 2011. | ||
| 49 | # | ||
| 50 | # Add AVX code path. See sha1-586.pl for further information. | ||
| 51 | |||
| 52 | ###################################################################### | ||
| 53 | # Current performance is summarized in following table. Numbers are | ||
| 54 | # CPU clock cycles spent to process single byte (less is better). | ||
| 55 | # | ||
| 56 | # x86_64 SSSE3 AVX | ||
| 57 | # P4 9.8 - | ||
| 58 | # Opteron 6.6 - | ||
| 59 | # Core2 6.7 6.1/+10% - | ||
| 60 | # Atom 11.0 9.7/+13% - | ||
| 61 | # Westmere 7.1 5.6/+27% - | ||
| 62 | # Sandy Bridge 7.9 6.3/+25% 5.2/+51% | ||
| 63 | |||
| 32 | $flavour = shift; | 64 | $flavour = shift; |
| 33 | $output = shift; | 65 | $output = shift; |
| 34 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | 66 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| @@ -40,6 +72,16 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |||
| 40 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | 72 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| 41 | die "can't locate x86_64-xlate.pl"; | 73 | die "can't locate x86_64-xlate.pl"; |
| 42 | 74 | ||
| 75 | $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` | ||
| 76 | =~ /GNU assembler version ([2-9]\.[0-9]+)/ && | ||
| 77 | $1>=2.19); | ||
| 78 | $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | ||
| 79 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && | ||
| 80 | $1>=2.09); | ||
| 81 | $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | ||
| 82 | `ml64 2>&1` =~ /Version ([0-9]+)\./ && | ||
| 83 | $1>=10); | ||
| 84 | |||
| 43 | open STDOUT,"| $^X $xlate $flavour $output"; | 85 | open STDOUT,"| $^X $xlate $flavour $output"; |
| 44 | 86 | ||
| 45 | $ctx="%rdi"; # 1st arg | 87 | $ctx="%rdi"; # 1st arg |
| @@ -51,196 +93,994 @@ $ctx="%r8"; | |||
| 51 | $inp="%r9"; | 93 | $inp="%r9"; |
| 52 | $num="%r10"; | 94 | $num="%r10"; |
| 53 | 95 | ||
| 54 | $xi="%eax"; | 96 | $t0="%eax"; |
| 55 | $t0="%ebx"; | 97 | $t1="%ebx"; |
| 56 | $t1="%ecx"; | 98 | $t2="%ecx"; |
| 57 | $A="%edx"; | 99 | @xi=("%edx","%ebp"); |
| 58 | $B="%esi"; | 100 | $A="%esi"; |
| 59 | $C="%edi"; | 101 | $B="%edi"; |
| 60 | $D="%ebp"; | 102 | $C="%r11d"; |
| 61 | $E="%r11d"; | 103 | $D="%r12d"; |
| 62 | $T="%r12d"; | 104 | $E="%r13d"; |
| 63 | |||
| 64 | @V=($A,$B,$C,$D,$E,$T); | ||
| 65 | 105 | ||
| 66 | sub PROLOGUE { | 106 | @V=($A,$B,$C,$D,$E); |
| 67 | my $func=shift; | ||
| 68 | $code.=<<___; | ||
| 69 | .globl $func | ||
| 70 | .type $func,\@function,3 | ||
| 71 | .align 16 | ||
| 72 | $func: | ||
| 73 | push %rbx | ||
| 74 | push %rbp | ||
| 75 | push %r12 | ||
| 76 | mov %rsp,%r11 | ||
| 77 | mov %rdi,$ctx # reassigned argument | ||
| 78 | sub \$`8+16*4`,%rsp | ||
| 79 | mov %rsi,$inp # reassigned argument | ||
| 80 | and \$-64,%rsp | ||
| 81 | mov %rdx,$num # reassigned argument | ||
| 82 | mov %r11,`16*4`(%rsp) | ||
| 83 | .Lprologue: | ||
| 84 | |||
| 85 | mov 0($ctx),$A | ||
| 86 | mov 4($ctx),$B | ||
| 87 | mov 8($ctx),$C | ||
| 88 | mov 12($ctx),$D | ||
| 89 | mov 16($ctx),$E | ||
| 90 | ___ | ||
| 91 | } | ||
| 92 | |||
| 93 | sub EPILOGUE { | ||
| 94 | my $func=shift; | ||
| 95 | $code.=<<___; | ||
| 96 | mov `16*4`(%rsp),%rsi | ||
| 97 | mov (%rsi),%r12 | ||
| 98 | mov 8(%rsi),%rbp | ||
| 99 | mov 16(%rsi),%rbx | ||
| 100 | lea 24(%rsi),%rsp | ||
| 101 | .Lepilogue: | ||
| 102 | ret | ||
| 103 | .size $func,.-$func | ||
| 104 | ___ | ||
| 105 | } | ||
| 106 | 107 | ||
| 107 | sub BODY_00_19 { | 108 | sub BODY_00_19 { |
| 108 | my ($i,$a,$b,$c,$d,$e,$f,$host)=@_; | 109 | my ($i,$a,$b,$c,$d,$e)=@_; |
| 109 | my $j=$i+1; | 110 | my $j=$i+1; |
| 110 | $code.=<<___ if ($i==0); | 111 | $code.=<<___ if ($i==0); |
| 111 | mov `4*$i`($inp),$xi | 112 | mov `4*$i`($inp),$xi[0] |
| 112 | `"bswap $xi" if(!defined($host))` | 113 | bswap $xi[0] |
| 113 | mov $xi,`4*$i`(%rsp) | 114 | mov $xi[0],`4*$i`(%rsp) |
| 114 | ___ | 115 | ___ |
| 115 | $code.=<<___ if ($i<15); | 116 | $code.=<<___ if ($i<15); |
| 116 | lea 0x5a827999($xi,$e),$f | ||
| 117 | mov $c,$t0 | 117 | mov $c,$t0 |
| 118 | mov `4*$j`($inp),$xi | 118 | mov `4*$j`($inp),$xi[1] |
| 119 | mov $a,$e | 119 | mov $a,$t2 |
| 120 | xor $d,$t0 | 120 | xor $d,$t0 |
| 121 | `"bswap $xi" if(!defined($host))` | 121 | bswap $xi[1] |
| 122 | rol \$5,$e | 122 | rol \$5,$t2 |
| 123 | lea 0x5a827999($xi[0],$e),$e | ||
| 123 | and $b,$t0 | 124 | and $b,$t0 |
| 124 | mov $xi,`4*$j`(%rsp) | 125 | mov $xi[1],`4*$j`(%rsp) |
| 125 | add $e,$f | 126 | add $t2,$e |
| 126 | xor $d,$t0 | 127 | xor $d,$t0 |
| 127 | rol \$30,$b | 128 | rol \$30,$b |
| 128 | add $t0,$f | 129 | add $t0,$e |
| 129 | ___ | 130 | ___ |
| 130 | $code.=<<___ if ($i>=15); | 131 | $code.=<<___ if ($i>=15); |
| 131 | lea 0x5a827999($xi,$e),$f | 132 | mov `4*($j%16)`(%rsp),$xi[1] |
| 132 | mov `4*($j%16)`(%rsp),$xi | ||
| 133 | mov $c,$t0 | 133 | mov $c,$t0 |
| 134 | mov $a,$e | 134 | mov $a,$t2 |
| 135 | xor `4*(($j+2)%16)`(%rsp),$xi | 135 | xor `4*(($j+2)%16)`(%rsp),$xi[1] |
| 136 | xor $d,$t0 | 136 | xor $d,$t0 |
| 137 | rol \$5,$e | 137 | rol \$5,$t2 |
| 138 | xor `4*(($j+8)%16)`(%rsp),$xi | 138 | xor `4*(($j+8)%16)`(%rsp),$xi[1] |
| 139 | and $b,$t0 | 139 | and $b,$t0 |
| 140 | add $e,$f | 140 | lea 0x5a827999($xi[0],$e),$e |
| 141 | xor `4*(($j+13)%16)`(%rsp),$xi | 141 | xor `4*(($j+13)%16)`(%rsp),$xi[1] |
| 142 | xor $d,$t0 | 142 | xor $d,$t0 |
| 143 | rol \$1,$xi[1] | ||
| 144 | add $t2,$e | ||
| 143 | rol \$30,$b | 145 | rol \$30,$b |
| 144 | add $t0,$f | 146 | mov $xi[1],`4*($j%16)`(%rsp) |
| 145 | rol \$1,$xi | 147 | add $t0,$e |
| 146 | mov $xi,`4*($j%16)`(%rsp) | ||
| 147 | ___ | 148 | ___ |
| 149 | unshift(@xi,pop(@xi)); | ||
| 148 | } | 150 | } |
| 149 | 151 | ||
| 150 | sub BODY_20_39 { | 152 | sub BODY_20_39 { |
| 151 | my ($i,$a,$b,$c,$d,$e,$f)=@_; | 153 | my ($i,$a,$b,$c,$d,$e)=@_; |
| 152 | my $j=$i+1; | 154 | my $j=$i+1; |
| 153 | my $K=($i<40)?0x6ed9eba1:0xca62c1d6; | 155 | my $K=($i<40)?0x6ed9eba1:0xca62c1d6; |
| 154 | $code.=<<___ if ($i<79); | 156 | $code.=<<___ if ($i<79); |
| 155 | lea $K($xi,$e),$f | 157 | mov `4*($j%16)`(%rsp),$xi[1] |
| 156 | mov `4*($j%16)`(%rsp),$xi | ||
| 157 | mov $c,$t0 | 158 | mov $c,$t0 |
| 158 | mov $a,$e | 159 | mov $a,$t2 |
| 159 | xor `4*(($j+2)%16)`(%rsp),$xi | 160 | xor `4*(($j+2)%16)`(%rsp),$xi[1] |
| 160 | xor $b,$t0 | 161 | xor $b,$t0 |
| 161 | rol \$5,$e | 162 | rol \$5,$t2 |
| 162 | xor `4*(($j+8)%16)`(%rsp),$xi | 163 | lea $K($xi[0],$e),$e |
| 164 | xor `4*(($j+8)%16)`(%rsp),$xi[1] | ||
| 163 | xor $d,$t0 | 165 | xor $d,$t0 |
| 164 | add $e,$f | 166 | add $t2,$e |
| 165 | xor `4*(($j+13)%16)`(%rsp),$xi | 167 | xor `4*(($j+13)%16)`(%rsp),$xi[1] |
| 166 | rol \$30,$b | 168 | rol \$30,$b |
| 167 | add $t0,$f | 169 | add $t0,$e |
| 168 | rol \$1,$xi | 170 | rol \$1,$xi[1] |
| 169 | ___ | 171 | ___ |
| 170 | $code.=<<___ if ($i<76); | 172 | $code.=<<___ if ($i<76); |
| 171 | mov $xi,`4*($j%16)`(%rsp) | 173 | mov $xi[1],`4*($j%16)`(%rsp) |
| 172 | ___ | 174 | ___ |
| 173 | $code.=<<___ if ($i==79); | 175 | $code.=<<___ if ($i==79); |
| 174 | lea $K($xi,$e),$f | ||
| 175 | mov $c,$t0 | 176 | mov $c,$t0 |
| 176 | mov $a,$e | 177 | mov $a,$t2 |
| 177 | xor $b,$t0 | 178 | xor $b,$t0 |
| 178 | rol \$5,$e | 179 | lea $K($xi[0],$e),$e |
| 180 | rol \$5,$t2 | ||
| 179 | xor $d,$t0 | 181 | xor $d,$t0 |
| 180 | add $e,$f | 182 | add $t2,$e |
| 181 | rol \$30,$b | 183 | rol \$30,$b |
| 182 | add $t0,$f | 184 | add $t0,$e |
| 183 | ___ | 185 | ___ |
| 186 | unshift(@xi,pop(@xi)); | ||
| 184 | } | 187 | } |
| 185 | 188 | ||
| 186 | sub BODY_40_59 { | 189 | sub BODY_40_59 { |
| 187 | my ($i,$a,$b,$c,$d,$e,$f)=@_; | 190 | my ($i,$a,$b,$c,$d,$e)=@_; |
| 188 | my $j=$i+1; | 191 | my $j=$i+1; |
| 189 | $code.=<<___; | 192 | $code.=<<___; |
| 190 | lea 0x8f1bbcdc($xi,$e),$f | 193 | mov `4*($j%16)`(%rsp),$xi[1] |
| 191 | mov `4*($j%16)`(%rsp),$xi | 194 | mov $c,$t0 |
| 192 | mov $b,$t0 | 195 | mov $c,$t1 |
| 193 | mov $b,$t1 | 196 | xor `4*(($j+2)%16)`(%rsp),$xi[1] |
| 194 | xor `4*(($j+2)%16)`(%rsp),$xi | 197 | and $d,$t0 |
| 195 | mov $a,$e | 198 | mov $a,$t2 |
| 196 | and $c,$t0 | 199 | xor `4*(($j+8)%16)`(%rsp),$xi[1] |
| 197 | xor `4*(($j+8)%16)`(%rsp),$xi | 200 | xor $d,$t1 |
| 198 | or $c,$t1 | 201 | lea 0x8f1bbcdc($xi[0],$e),$e |
| 199 | rol \$5,$e | 202 | rol \$5,$t2 |
| 200 | xor `4*(($j+13)%16)`(%rsp),$xi | 203 | xor `4*(($j+13)%16)`(%rsp),$xi[1] |
| 201 | and $d,$t1 | 204 | add $t0,$e |
| 202 | add $e,$f | 205 | and $b,$t1 |
| 203 | rol \$1,$xi | 206 | rol \$1,$xi[1] |
| 204 | or $t1,$t0 | 207 | add $t1,$e |
| 205 | rol \$30,$b | 208 | rol \$30,$b |
| 206 | mov $xi,`4*($j%16)`(%rsp) | 209 | mov $xi[1],`4*($j%16)`(%rsp) |
| 207 | add $t0,$f | 210 | add $t2,$e |
| 208 | ___ | 211 | ___ |
| 212 | unshift(@xi,pop(@xi)); | ||
| 209 | } | 213 | } |
| 210 | 214 | ||
| 211 | $code=".text\n"; | 215 | $code.=<<___; |
| 216 | .text | ||
| 217 | .extern OPENSSL_ia32cap_P | ||
| 212 | 218 | ||
| 213 | &PROLOGUE("sha1_block_data_order"); | 219 | .globl sha1_block_data_order |
| 214 | $code.=".align 4\n.Lloop:\n"; | 220 | .type sha1_block_data_order,\@function,3 |
| 221 | .align 16 | ||
| 222 | sha1_block_data_order: | ||
| 223 | mov OPENSSL_ia32cap_P+0(%rip),%r9d | ||
| 224 | mov OPENSSL_ia32cap_P+4(%rip),%r8d | ||
| 225 | test \$`1<<9`,%r8d # check SSSE3 bit | ||
| 226 | jz .Lialu | ||
| 227 | ___ | ||
| 228 | $code.=<<___ if ($avx); | ||
| 229 | and \$`1<<28`,%r8d # mask AVX bit | ||
| 230 | and \$`1<<30`,%r9d # mask "Intel CPU" bit | ||
| 231 | or %r9d,%r8d | ||
| 232 | cmp \$`1<<28|1<<30`,%r8d | ||
| 233 | je _avx_shortcut | ||
| 234 | ___ | ||
| 235 | $code.=<<___; | ||
| 236 | jmp _ssse3_shortcut | ||
| 237 | |||
| 238 | .align 16 | ||
| 239 | .Lialu: | ||
| 240 | push %rbx | ||
| 241 | push %rbp | ||
| 242 | push %r12 | ||
| 243 | push %r13 | ||
| 244 | mov %rsp,%r11 | ||
| 245 | mov %rdi,$ctx # reassigned argument | ||
| 246 | sub \$`8+16*4`,%rsp | ||
| 247 | mov %rsi,$inp # reassigned argument | ||
| 248 | and \$-64,%rsp | ||
| 249 | mov %rdx,$num # reassigned argument | ||
| 250 | mov %r11,`16*4`(%rsp) | ||
| 251 | .Lprologue: | ||
| 252 | |||
| 253 | mov 0($ctx),$A | ||
| 254 | mov 4($ctx),$B | ||
| 255 | mov 8($ctx),$C | ||
| 256 | mov 12($ctx),$D | ||
| 257 | mov 16($ctx),$E | ||
| 258 | jmp .Lloop | ||
| 259 | |||
| 260 | .align 16 | ||
| 261 | .Lloop: | ||
| 262 | ___ | ||
| 215 | for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } | 263 | for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } |
| 216 | for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | 264 | for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } |
| 217 | for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | 265 | for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } |
| 218 | for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | 266 | for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } |
| 219 | $code.=<<___; | 267 | $code.=<<___; |
| 220 | add 0($ctx),$E | 268 | add 0($ctx),$A |
| 221 | add 4($ctx),$T | 269 | add 4($ctx),$B |
| 222 | add 8($ctx),$A | 270 | add 8($ctx),$C |
| 223 | add 12($ctx),$B | 271 | add 12($ctx),$D |
| 224 | add 16($ctx),$C | 272 | add 16($ctx),$E |
| 225 | mov $E,0($ctx) | 273 | mov $A,0($ctx) |
| 226 | mov $T,4($ctx) | 274 | mov $B,4($ctx) |
| 227 | mov $A,8($ctx) | 275 | mov $C,8($ctx) |
| 228 | mov $B,12($ctx) | 276 | mov $D,12($ctx) |
| 229 | mov $C,16($ctx) | 277 | mov $E,16($ctx) |
| 230 | 278 | ||
| 231 | xchg $E,$A # mov $E,$A | ||
| 232 | xchg $T,$B # mov $T,$B | ||
| 233 | xchg $E,$C # mov $A,$C | ||
| 234 | xchg $T,$D # mov $B,$D | ||
| 235 | # mov $C,$E | ||
| 236 | lea `16*4`($inp),$inp | ||
| 237 | sub \$1,$num | 279 | sub \$1,$num |
| 280 | lea `16*4`($inp),$inp | ||
| 238 | jnz .Lloop | 281 | jnz .Lloop |
| 282 | |||
| 283 | mov `16*4`(%rsp),%rsi | ||
| 284 | mov (%rsi),%r13 | ||
| 285 | mov 8(%rsi),%r12 | ||
| 286 | mov 16(%rsi),%rbp | ||
| 287 | mov 24(%rsi),%rbx | ||
| 288 | lea 32(%rsi),%rsp | ||
| 289 | .Lepilogue: | ||
| 290 | ret | ||
| 291 | .size sha1_block_data_order,.-sha1_block_data_order | ||
| 239 | ___ | 292 | ___ |
| 240 | &EPILOGUE("sha1_block_data_order"); | 293 | {{{ |
| 294 | my $Xi=4; | ||
| 295 | my @X=map("%xmm$_",(4..7,0..3)); | ||
| 296 | my @Tx=map("%xmm$_",(8..10)); | ||
| 297 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | ||
| 298 | my @T=("%esi","%edi"); | ||
| 299 | my $j=0; | ||
| 300 | my $K_XX_XX="%r11"; | ||
| 301 | |||
| 302 | my $_rol=sub { &rol(@_) }; | ||
| 303 | my $_ror=sub { &ror(@_) }; | ||
| 304 | |||
| 305 | $code.=<<___; | ||
| 306 | .type sha1_block_data_order_ssse3,\@function,3 | ||
| 307 | .align 16 | ||
| 308 | sha1_block_data_order_ssse3: | ||
| 309 | _ssse3_shortcut: | ||
| 310 | push %rbx | ||
| 311 | push %rbp | ||
| 312 | push %r12 | ||
| 313 | lea `-64-($win64?5*16:0)`(%rsp),%rsp | ||
| 314 | ___ | ||
| 315 | $code.=<<___ if ($win64); | ||
| 316 | movaps %xmm6,64+0(%rsp) | ||
| 317 | movaps %xmm7,64+16(%rsp) | ||
| 318 | movaps %xmm8,64+32(%rsp) | ||
| 319 | movaps %xmm9,64+48(%rsp) | ||
| 320 | movaps %xmm10,64+64(%rsp) | ||
| 321 | .Lprologue_ssse3: | ||
| 322 | ___ | ||
| 323 | $code.=<<___; | ||
| 324 | mov %rdi,$ctx # reassigned argument | ||
| 325 | mov %rsi,$inp # reassigned argument | ||
| 326 | mov %rdx,$num # reassigned argument | ||
| 327 | |||
| 328 | shl \$6,$num | ||
| 329 | add $inp,$num | ||
| 330 | lea K_XX_XX(%rip),$K_XX_XX | ||
| 331 | |||
| 332 | mov 0($ctx),$A # load context | ||
| 333 | mov 4($ctx),$B | ||
| 334 | mov 8($ctx),$C | ||
| 335 | mov 12($ctx),$D | ||
| 336 | mov $B,@T[0] # magic seed | ||
| 337 | mov 16($ctx),$E | ||
| 338 | |||
| 339 | movdqa 64($K_XX_XX),@X[2] # pbswap mask | ||
| 340 | movdqa 0($K_XX_XX),@Tx[1] # K_00_19 | ||
| 341 | movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | ||
| 342 | movdqu 16($inp),@X[-3&7] | ||
| 343 | movdqu 32($inp),@X[-2&7] | ||
| 344 | movdqu 48($inp),@X[-1&7] | ||
| 345 | pshufb @X[2],@X[-4&7] # byte swap | ||
| 346 | add \$64,$inp | ||
| 347 | pshufb @X[2],@X[-3&7] | ||
| 348 | pshufb @X[2],@X[-2&7] | ||
| 349 | pshufb @X[2],@X[-1&7] | ||
| 350 | paddd @Tx[1],@X[-4&7] # add K_00_19 | ||
| 351 | paddd @Tx[1],@X[-3&7] | ||
| 352 | paddd @Tx[1],@X[-2&7] | ||
| 353 | movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU | ||
| 354 | psubd @Tx[1],@X[-4&7] # restore X[] | ||
| 355 | movdqa @X[-3&7],16(%rsp) | ||
| 356 | psubd @Tx[1],@X[-3&7] | ||
| 357 | movdqa @X[-2&7],32(%rsp) | ||
| 358 | psubd @Tx[1],@X[-2&7] | ||
| 359 | jmp .Loop_ssse3 | ||
| 360 | ___ | ||
| 361 | |||
| 362 | sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm | ||
| 363 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; | ||
| 364 | my $arg = pop; | ||
| 365 | $arg = "\$$arg" if ($arg*1 eq $arg); | ||
| 366 | $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; | ||
| 367 | } | ||
| 368 | |||
| 369 | sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 | ||
| 370 | { use integer; | ||
| 371 | my $body = shift; | ||
| 372 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | ||
| 373 | my ($a,$b,$c,$d,$e); | ||
| 374 | |||
| 375 | &movdqa (@X[0],@X[-3&7]); | ||
| 376 | eval(shift(@insns)); | ||
| 377 | eval(shift(@insns)); | ||
| 378 | &movdqa (@Tx[0],@X[-1&7]); | ||
| 379 | &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" | ||
| 380 | eval(shift(@insns)); | ||
| 381 | eval(shift(@insns)); | ||
| 382 | |||
| 383 | &paddd (@Tx[1],@X[-1&7]); | ||
| 384 | eval(shift(@insns)); | ||
| 385 | eval(shift(@insns)); | ||
| 386 | &psrldq (@Tx[0],4); # "X[-3]", 3 dwords | ||
| 387 | eval(shift(@insns)); | ||
| 388 | eval(shift(@insns)); | ||
| 389 | &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | ||
| 390 | eval(shift(@insns)); | ||
| 391 | eval(shift(@insns)); | ||
| 392 | |||
| 393 | &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | ||
| 394 | eval(shift(@insns)); | ||
| 395 | eval(shift(@insns)); | ||
| 396 | eval(shift(@insns)); | ||
| 397 | eval(shift(@insns)); | ||
| 398 | |||
| 399 | &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | ||
| 400 | eval(shift(@insns)); | ||
| 401 | eval(shift(@insns)); | ||
| 402 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 403 | eval(shift(@insns)); | ||
| 404 | eval(shift(@insns)); | ||
| 405 | |||
| 406 | &movdqa (@Tx[2],@X[0]); | ||
| 407 | &movdqa (@Tx[0],@X[0]); | ||
| 408 | eval(shift(@insns)); | ||
| 409 | eval(shift(@insns)); | ||
| 410 | eval(shift(@insns)); | ||
| 411 | eval(shift(@insns)); | ||
| 412 | |||
| 413 | &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword | ||
| 414 | &paddd (@X[0],@X[0]); | ||
| 415 | eval(shift(@insns)); | ||
| 416 | eval(shift(@insns)); | ||
| 417 | eval(shift(@insns)); | ||
| 418 | eval(shift(@insns)); | ||
| 419 | |||
| 420 | &psrld (@Tx[0],31); | ||
| 421 | eval(shift(@insns)); | ||
| 422 | eval(shift(@insns)); | ||
| 423 | &movdqa (@Tx[1],@Tx[2]); | ||
| 424 | eval(shift(@insns)); | ||
| 425 | eval(shift(@insns)); | ||
| 426 | |||
| 427 | &psrld (@Tx[2],30); | ||
| 428 | &por (@X[0],@Tx[0]); # "X[0]"<<<=1 | ||
| 429 | eval(shift(@insns)); | ||
| 430 | eval(shift(@insns)); | ||
| 431 | eval(shift(@insns)); | ||
| 432 | eval(shift(@insns)); | ||
| 433 | |||
| 434 | &pslld (@Tx[1],2); | ||
| 435 | &pxor (@X[0],@Tx[2]); | ||
| 436 | eval(shift(@insns)); | ||
| 437 | eval(shift(@insns)); | ||
| 438 | &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX | ||
| 439 | eval(shift(@insns)); | ||
| 440 | eval(shift(@insns)); | ||
| 441 | |||
| 442 | &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 | ||
| 443 | |||
| 444 | foreach (@insns) { eval; } # remaining instructions [if any] | ||
| 445 | |||
| 446 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 447 | push(@Tx,shift(@Tx)); | ||
| 448 | } | ||
| 449 | |||
| 450 | sub Xupdate_ssse3_32_79() | ||
| 451 | { use integer; | ||
| 452 | my $body = shift; | ||
| 453 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | ||
| 454 | my ($a,$b,$c,$d,$e); | ||
| 455 | |||
| 456 | &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); | ||
| 457 | eval(shift(@insns)); # body_20_39 | ||
| 458 | &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | ||
| 459 | &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" | ||
| 460 | eval(shift(@insns)); | ||
| 461 | eval(shift(@insns)); | ||
| 462 | eval(shift(@insns)); # rol | ||
| 463 | |||
| 464 | &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | ||
| 465 | eval(shift(@insns)); | ||
| 466 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | ||
| 467 | if ($Xi%5) { | ||
| 468 | &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | ||
| 469 | } else { # ... or load next one | ||
| 470 | &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | ||
| 471 | } | ||
| 472 | &paddd (@Tx[1],@X[-1&7]); | ||
| 473 | eval(shift(@insns)); # ror | ||
| 474 | eval(shift(@insns)); | ||
| 475 | |||
| 476 | &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" | ||
| 477 | eval(shift(@insns)); # body_20_39 | ||
| 478 | eval(shift(@insns)); | ||
| 479 | eval(shift(@insns)); | ||
| 480 | eval(shift(@insns)); # rol | ||
| 481 | |||
| 482 | &movdqa (@Tx[0],@X[0]); | ||
| 483 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 484 | eval(shift(@insns)); | ||
| 485 | eval(shift(@insns)); | ||
| 486 | eval(shift(@insns)); # ror | ||
| 487 | eval(shift(@insns)); | ||
| 488 | |||
| 489 | &pslld (@X[0],2); | ||
| 490 | eval(shift(@insns)); # body_20_39 | ||
| 491 | eval(shift(@insns)); | ||
| 492 | &psrld (@Tx[0],30); | ||
| 493 | eval(shift(@insns)); | ||
| 494 | eval(shift(@insns)); # rol | ||
| 495 | eval(shift(@insns)); | ||
| 496 | eval(shift(@insns)); | ||
| 497 | eval(shift(@insns)); # ror | ||
| 498 | eval(shift(@insns)); | ||
| 499 | |||
| 500 | &por (@X[0],@Tx[0]); # "X[0]"<<<=2 | ||
| 501 | eval(shift(@insns)); # body_20_39 | ||
| 502 | eval(shift(@insns)); | ||
| 503 | &movdqa (@Tx[1],@X[0]) if ($Xi<19); | ||
| 504 | eval(shift(@insns)); | ||
| 505 | eval(shift(@insns)); # rol | ||
| 506 | eval(shift(@insns)); | ||
| 507 | eval(shift(@insns)); | ||
| 508 | eval(shift(@insns)); # rol | ||
| 509 | eval(shift(@insns)); | ||
| 510 | |||
| 511 | foreach (@insns) { eval; } # remaining instructions | ||
| 512 | |||
| 513 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 514 | push(@Tx,shift(@Tx)); | ||
| 515 | } | ||
| 516 | |||
| 517 | sub Xuplast_ssse3_80() | ||
| 518 | { use integer; | ||
| 519 | my $body = shift; | ||
| 520 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 521 | my ($a,$b,$c,$d,$e); | ||
| 522 | |||
| 523 | eval(shift(@insns)); | ||
| 524 | &paddd (@Tx[1],@X[-1&7]); | ||
| 525 | eval(shift(@insns)); | ||
| 526 | eval(shift(@insns)); | ||
| 527 | eval(shift(@insns)); | ||
| 528 | eval(shift(@insns)); | ||
| 529 | |||
| 530 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU | ||
| 531 | |||
| 532 | foreach (@insns) { eval; } # remaining instructions | ||
| 533 | |||
| 534 | &cmp ($inp,$num); | ||
| 535 | &je (".Ldone_ssse3"); | ||
| 536 | |||
| 537 | unshift(@Tx,pop(@Tx)); | ||
| 538 | |||
| 539 | &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask | ||
| 540 | &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 | ||
| 541 | &movdqu (@X[-4&7],"0($inp)"); # load input | ||
| 542 | &movdqu (@X[-3&7],"16($inp)"); | ||
| 543 | &movdqu (@X[-2&7],"32($inp)"); | ||
| 544 | &movdqu (@X[-1&7],"48($inp)"); | ||
| 545 | &pshufb (@X[-4&7],@X[2]); # byte swap | ||
| 546 | &add ($inp,64); | ||
| 547 | |||
| 548 | $Xi=0; | ||
| 549 | } | ||
| 550 | |||
| 551 | sub Xloop_ssse3() | ||
| 552 | { use integer; | ||
| 553 | my $body = shift; | ||
| 554 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 555 | my ($a,$b,$c,$d,$e); | ||
| 556 | |||
| 557 | eval(shift(@insns)); | ||
| 558 | eval(shift(@insns)); | ||
| 559 | &pshufb (@X[($Xi-3)&7],@X[2]); | ||
| 560 | eval(shift(@insns)); | ||
| 561 | eval(shift(@insns)); | ||
| 562 | &paddd (@X[($Xi-4)&7],@Tx[1]); | ||
| 563 | eval(shift(@insns)); | ||
| 564 | eval(shift(@insns)); | ||
| 565 | eval(shift(@insns)); | ||
| 566 | eval(shift(@insns)); | ||
| 567 | &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU | ||
| 568 | eval(shift(@insns)); | ||
| 569 | eval(shift(@insns)); | ||
| 570 | &psubd (@X[($Xi-4)&7],@Tx[1]); | ||
| 571 | |||
| 572 | foreach (@insns) { eval; } | ||
| 573 | $Xi++; | ||
| 574 | } | ||
| 575 | |||
| 576 | sub Xtail_ssse3() | ||
| 577 | { use integer; | ||
| 578 | my $body = shift; | ||
| 579 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 580 | my ($a,$b,$c,$d,$e); | ||
| 581 | |||
| 582 | foreach (@insns) { eval; } | ||
| 583 | } | ||
| 584 | |||
| 585 | sub body_00_19 () { | ||
| 586 | ( | ||
| 587 | '($a,$b,$c,$d,$e)=@V;'. | ||
| 588 | '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer | ||
| 589 | '&xor ($c,$d);', | ||
| 590 | '&mov (@T[1],$a);', # $b in next round | ||
| 591 | '&$_rol ($a,5);', | ||
| 592 | '&and (@T[0],$c);', # ($b&($c^$d)) | ||
| 593 | '&xor ($c,$d);', # restore $c | ||
| 594 | '&xor (@T[0],$d);', | ||
| 595 | '&add ($e,$a);', | ||
| 596 | '&$_ror ($b,$j?7:2);', # $b>>>2 | ||
| 597 | '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
| 598 | ); | ||
| 599 | } | ||
| 600 | |||
| 601 | sub body_20_39 () { | ||
| 602 | ( | ||
| 603 | '($a,$b,$c,$d,$e)=@V;'. | ||
| 604 | '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | ||
| 605 | '&xor (@T[0],$d);', # ($b^$d) | ||
| 606 | '&mov (@T[1],$a);', # $b in next round | ||
| 607 | '&$_rol ($a,5);', | ||
| 608 | '&xor (@T[0],$c);', # ($b^$d^$c) | ||
| 609 | '&add ($e,$a);', | ||
| 610 | '&$_ror ($b,7);', # $b>>>2 | ||
| 611 | '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
| 612 | ); | ||
| 613 | } | ||
| 614 | |||
| 615 | sub body_40_59 () { | ||
| 616 | ( | ||
| 617 | '($a,$b,$c,$d,$e)=@V;'. | ||
| 618 | '&mov (@T[1],$c);', | ||
| 619 | '&xor ($c,$d);', | ||
| 620 | '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | ||
| 621 | '&and (@T[1],$d);', | ||
| 622 | '&and (@T[0],$c);', # ($b&($c^$d)) | ||
| 623 | '&$_ror ($b,7);', # $b>>>2 | ||
| 624 | '&add ($e,@T[1]);', | ||
| 625 | '&mov (@T[1],$a);', # $b in next round | ||
| 626 | '&$_rol ($a,5);', | ||
| 627 | '&add ($e,@T[0]);', | ||
| 628 | '&xor ($c,$d);', # restore $c | ||
| 629 | '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
| 630 | ); | ||
| 631 | } | ||
| 241 | $code.=<<___; | 632 | $code.=<<___; |
| 242 | .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 243 | .align 16 | 633 | .align 16 |
| 634 | .Loop_ssse3: | ||
| 635 | ___ | ||
| 636 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 637 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 638 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 639 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 640 | &Xupdate_ssse3_32_79(\&body_00_19); | ||
| 641 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 642 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 643 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 644 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 645 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 646 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 647 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 648 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 649 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 650 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 651 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 652 | &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" | ||
| 653 | |||
| 654 | $saved_j=$j; @saved_V=@V; | ||
| 655 | |||
| 656 | &Xloop_ssse3(\&body_20_39); | ||
| 657 | &Xloop_ssse3(\&body_20_39); | ||
| 658 | &Xloop_ssse3(\&body_20_39); | ||
| 659 | |||
| 660 | $code.=<<___; | ||
| 661 | add 0($ctx),$A # update context | ||
| 662 | add 4($ctx),@T[0] | ||
| 663 | add 8($ctx),$C | ||
| 664 | add 12($ctx),$D | ||
| 665 | mov $A,0($ctx) | ||
| 666 | add 16($ctx),$E | ||
| 667 | mov @T[0],4($ctx) | ||
| 668 | mov @T[0],$B # magic seed | ||
| 669 | mov $C,8($ctx) | ||
| 670 | mov $D,12($ctx) | ||
| 671 | mov $E,16($ctx) | ||
| 672 | jmp .Loop_ssse3 | ||
| 673 | |||
| 674 | .align 16 | ||
| 675 | .Ldone_ssse3: | ||
| 676 | ___ | ||
| 677 | $j=$saved_j; @V=@saved_V; | ||
| 678 | |||
| 679 | &Xtail_ssse3(\&body_20_39); | ||
| 680 | &Xtail_ssse3(\&body_20_39); | ||
| 681 | &Xtail_ssse3(\&body_20_39); | ||
| 682 | |||
| 683 | $code.=<<___; | ||
| 684 | add 0($ctx),$A # update context | ||
| 685 | add 4($ctx),@T[0] | ||
| 686 | add 8($ctx),$C | ||
| 687 | mov $A,0($ctx) | ||
| 688 | add 12($ctx),$D | ||
| 689 | mov @T[0],4($ctx) | ||
| 690 | add 16($ctx),$E | ||
| 691 | mov $C,8($ctx) | ||
| 692 | mov $D,12($ctx) | ||
| 693 | mov $E,16($ctx) | ||
| 694 | ___ | ||
| 695 | $code.=<<___ if ($win64); | ||
| 696 | movaps 64+0(%rsp),%xmm6 | ||
| 697 | movaps 64+16(%rsp),%xmm7 | ||
| 698 | movaps 64+32(%rsp),%xmm8 | ||
| 699 | movaps 64+48(%rsp),%xmm9 | ||
| 700 | movaps 64+64(%rsp),%xmm10 | ||
| 701 | ___ | ||
| 702 | $code.=<<___; | ||
| 703 | lea `64+($win64?5*16:0)`(%rsp),%rsi | ||
| 704 | mov 0(%rsi),%r12 | ||
| 705 | mov 8(%rsi),%rbp | ||
| 706 | mov 16(%rsi),%rbx | ||
| 707 | lea 24(%rsi),%rsp | ||
| 708 | .Lepilogue_ssse3: | ||
| 709 | ret | ||
| 710 | .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 | ||
| 711 | ___ | ||
| 712 | |||
| 713 | if ($avx) { | ||
| 714 | my $Xi=4; | ||
| 715 | my @X=map("%xmm$_",(4..7,0..3)); | ||
| 716 | my @Tx=map("%xmm$_",(8..10)); | ||
| 717 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | ||
| 718 | my @T=("%esi","%edi"); | ||
| 719 | my $j=0; | ||
| 720 | my $K_XX_XX="%r11"; | ||
| 721 | |||
| 722 | my $_rol=sub { &shld(@_[0],@_) }; | ||
| 723 | my $_ror=sub { &shrd(@_[0],@_) }; | ||
| 724 | |||
| 725 | $code.=<<___; | ||
| 726 | .type sha1_block_data_order_avx,\@function,3 | ||
| 727 | .align 16 | ||
| 728 | sha1_block_data_order_avx: | ||
| 729 | _avx_shortcut: | ||
| 730 | push %rbx | ||
| 731 | push %rbp | ||
| 732 | push %r12 | ||
| 733 | lea `-64-($win64?5*16:0)`(%rsp),%rsp | ||
| 734 | ___ | ||
| 735 | $code.=<<___ if ($win64); | ||
| 736 | movaps %xmm6,64+0(%rsp) | ||
| 737 | movaps %xmm7,64+16(%rsp) | ||
| 738 | movaps %xmm8,64+32(%rsp) | ||
| 739 | movaps %xmm9,64+48(%rsp) | ||
| 740 | movaps %xmm10,64+64(%rsp) | ||
| 741 | .Lprologue_avx: | ||
| 742 | ___ | ||
| 743 | $code.=<<___; | ||
| 744 | mov %rdi,$ctx # reassigned argument | ||
| 745 | mov %rsi,$inp # reassigned argument | ||
| 746 | mov %rdx,$num # reassigned argument | ||
| 747 | vzeroall | ||
| 748 | |||
| 749 | shl \$6,$num | ||
| 750 | add $inp,$num | ||
| 751 | lea K_XX_XX(%rip),$K_XX_XX | ||
| 752 | |||
| 753 | mov 0($ctx),$A # load context | ||
| 754 | mov 4($ctx),$B | ||
| 755 | mov 8($ctx),$C | ||
| 756 | mov 12($ctx),$D | ||
| 757 | mov $B,@T[0] # magic seed | ||
| 758 | mov 16($ctx),$E | ||
| 759 | |||
| 760 | vmovdqa 64($K_XX_XX),@X[2] # pbswap mask | ||
| 761 | vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 | ||
| 762 | vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | ||
| 763 | vmovdqu 16($inp),@X[-3&7] | ||
| 764 | vmovdqu 32($inp),@X[-2&7] | ||
| 765 | vmovdqu 48($inp),@X[-1&7] | ||
| 766 | vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap | ||
| 767 | add \$64,$inp | ||
| 768 | vpshufb @X[2],@X[-3&7],@X[-3&7] | ||
| 769 | vpshufb @X[2],@X[-2&7],@X[-2&7] | ||
| 770 | vpshufb @X[2],@X[-1&7],@X[-1&7] | ||
| 771 | vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 | ||
| 772 | vpaddd @Tx[1],@X[-3&7],@X[1] | ||
| 773 | vpaddd @Tx[1],@X[-2&7],@X[2] | ||
| 774 | vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU | ||
| 775 | vmovdqa @X[1],16(%rsp) | ||
| 776 | vmovdqa @X[2],32(%rsp) | ||
| 777 | jmp .Loop_avx | ||
| 778 | ___ | ||
| 779 | |||
| 780 | sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 | ||
| 781 | { use integer; | ||
| 782 | my $body = shift; | ||
| 783 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | ||
| 784 | my ($a,$b,$c,$d,$e); | ||
| 785 | |||
| 786 | eval(shift(@insns)); | ||
| 787 | eval(shift(@insns)); | ||
| 788 | &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" | ||
| 789 | eval(shift(@insns)); | ||
| 790 | eval(shift(@insns)); | ||
| 791 | |||
| 792 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
| 793 | eval(shift(@insns)); | ||
| 794 | eval(shift(@insns)); | ||
| 795 | &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords | ||
| 796 | eval(shift(@insns)); | ||
| 797 | eval(shift(@insns)); | ||
| 798 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | ||
| 799 | eval(shift(@insns)); | ||
| 800 | eval(shift(@insns)); | ||
| 801 | |||
| 802 | &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | ||
| 803 | eval(shift(@insns)); | ||
| 804 | eval(shift(@insns)); | ||
| 805 | eval(shift(@insns)); | ||
| 806 | eval(shift(@insns)); | ||
| 807 | |||
| 808 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | ||
| 809 | eval(shift(@insns)); | ||
| 810 | eval(shift(@insns)); | ||
| 811 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 812 | eval(shift(@insns)); | ||
| 813 | eval(shift(@insns)); | ||
| 814 | |||
| 815 | &vpsrld (@Tx[0],@X[0],31); | ||
| 816 | eval(shift(@insns)); | ||
| 817 | eval(shift(@insns)); | ||
| 818 | eval(shift(@insns)); | ||
| 819 | eval(shift(@insns)); | ||
| 820 | |||
| 821 | &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword | ||
| 822 | &vpaddd (@X[0],@X[0],@X[0]); | ||
| 823 | eval(shift(@insns)); | ||
| 824 | eval(shift(@insns)); | ||
| 825 | eval(shift(@insns)); | ||
| 826 | eval(shift(@insns)); | ||
| 827 | |||
| 828 | &vpsrld (@Tx[1],@Tx[2],30); | ||
| 829 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 | ||
| 830 | eval(shift(@insns)); | ||
| 831 | eval(shift(@insns)); | ||
| 832 | eval(shift(@insns)); | ||
| 833 | eval(shift(@insns)); | ||
| 834 | |||
| 835 | &vpslld (@Tx[2],@Tx[2],2); | ||
| 836 | &vpxor (@X[0],@X[0],@Tx[1]); | ||
| 837 | eval(shift(@insns)); | ||
| 838 | eval(shift(@insns)); | ||
| 839 | eval(shift(@insns)); | ||
| 840 | eval(shift(@insns)); | ||
| 841 | |||
| 842 | &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 | ||
| 843 | eval(shift(@insns)); | ||
| 844 | eval(shift(@insns)); | ||
| 845 | &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX | ||
| 846 | eval(shift(@insns)); | ||
| 847 | eval(shift(@insns)); | ||
| 848 | |||
| 849 | |||
| 850 | foreach (@insns) { eval; } # remaining instructions [if any] | ||
| 851 | |||
| 852 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 853 | push(@Tx,shift(@Tx)); | ||
| 854 | } | ||
| 855 | |||
| 856 | sub Xupdate_avx_32_79() | ||
| 857 | { use integer; | ||
| 858 | my $body = shift; | ||
| 859 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | ||
| 860 | my ($a,$b,$c,$d,$e); | ||
| 861 | |||
| 862 | &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" | ||
| 863 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | ||
| 864 | eval(shift(@insns)); # body_20_39 | ||
| 865 | eval(shift(@insns)); | ||
| 866 | eval(shift(@insns)); | ||
| 867 | eval(shift(@insns)); # rol | ||
| 868 | |||
| 869 | &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | ||
| 870 | eval(shift(@insns)); | ||
| 871 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | ||
| 872 | if ($Xi%5) { | ||
| 873 | &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | ||
| 874 | } else { # ... or load next one | ||
| 875 | &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | ||
| 876 | } | ||
| 877 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
| 878 | eval(shift(@insns)); # ror | ||
| 879 | eval(shift(@insns)); | ||
| 880 | |||
| 881 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" | ||
| 882 | eval(shift(@insns)); # body_20_39 | ||
| 883 | eval(shift(@insns)); | ||
| 884 | eval(shift(@insns)); | ||
| 885 | eval(shift(@insns)); # rol | ||
| 886 | |||
| 887 | &vpsrld (@Tx[0],@X[0],30); | ||
| 888 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 889 | eval(shift(@insns)); | ||
| 890 | eval(shift(@insns)); | ||
| 891 | eval(shift(@insns)); # ror | ||
| 892 | eval(shift(@insns)); | ||
| 893 | |||
| 894 | &vpslld (@X[0],@X[0],2); | ||
| 895 | eval(shift(@insns)); # body_20_39 | ||
| 896 | eval(shift(@insns)); | ||
| 897 | eval(shift(@insns)); | ||
| 898 | eval(shift(@insns)); # rol | ||
| 899 | eval(shift(@insns)); | ||
| 900 | eval(shift(@insns)); | ||
| 901 | eval(shift(@insns)); # ror | ||
| 902 | eval(shift(@insns)); | ||
| 903 | |||
| 904 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 | ||
| 905 | eval(shift(@insns)); # body_20_39 | ||
| 906 | eval(shift(@insns)); | ||
| 907 | &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); | ||
| 908 | eval(shift(@insns)); | ||
| 909 | eval(shift(@insns)); # rol | ||
| 910 | eval(shift(@insns)); | ||
| 911 | eval(shift(@insns)); | ||
| 912 | eval(shift(@insns)); # rol | ||
| 913 | eval(shift(@insns)); | ||
| 914 | |||
| 915 | foreach (@insns) { eval; } # remaining instructions | ||
| 916 | |||
| 917 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 918 | push(@Tx,shift(@Tx)); | ||
| 919 | } | ||
| 920 | |||
| 921 | sub Xuplast_avx_80() | ||
| 922 | { use integer; | ||
| 923 | my $body = shift; | ||
| 924 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 925 | my ($a,$b,$c,$d,$e); | ||
| 926 | |||
| 927 | eval(shift(@insns)); | ||
| 928 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
| 929 | eval(shift(@insns)); | ||
| 930 | eval(shift(@insns)); | ||
| 931 | eval(shift(@insns)); | ||
| 932 | eval(shift(@insns)); | ||
| 933 | |||
| 934 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU | ||
| 935 | |||
| 936 | foreach (@insns) { eval; } # remaining instructions | ||
| 937 | |||
| 938 | &cmp ($inp,$num); | ||
| 939 | &je (".Ldone_avx"); | ||
| 940 | |||
| 941 | unshift(@Tx,pop(@Tx)); | ||
| 942 | |||
| 943 | &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask | ||
| 944 | &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 | ||
| 945 | &vmovdqu(@X[-4&7],"0($inp)"); # load input | ||
| 946 | &vmovdqu(@X[-3&7],"16($inp)"); | ||
| 947 | &vmovdqu(@X[-2&7],"32($inp)"); | ||
| 948 | &vmovdqu(@X[-1&7],"48($inp)"); | ||
| 949 | &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap | ||
| 950 | &add ($inp,64); | ||
| 951 | |||
| 952 | $Xi=0; | ||
| 953 | } | ||
| 954 | |||
| 955 | sub Xloop_avx() | ||
| 956 | { use integer; | ||
| 957 | my $body = shift; | ||
| 958 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 959 | my ($a,$b,$c,$d,$e); | ||
| 960 | |||
| 961 | eval(shift(@insns)); | ||
| 962 | eval(shift(@insns)); | ||
| 963 | &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); | ||
| 964 | eval(shift(@insns)); | ||
| 965 | eval(shift(@insns)); | ||
| 966 | &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); | ||
| 967 | eval(shift(@insns)); | ||
| 968 | eval(shift(@insns)); | ||
| 969 | eval(shift(@insns)); | ||
| 970 | eval(shift(@insns)); | ||
| 971 | &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU | ||
| 972 | eval(shift(@insns)); | ||
| 973 | eval(shift(@insns)); | ||
| 974 | |||
| 975 | foreach (@insns) { eval; } | ||
| 976 | $Xi++; | ||
| 977 | } | ||
| 978 | |||
| 979 | sub Xtail_avx() | ||
| 980 | { use integer; | ||
| 981 | my $body = shift; | ||
| 982 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 983 | my ($a,$b,$c,$d,$e); | ||
| 984 | |||
| 985 | foreach (@insns) { eval; } | ||
| 986 | } | ||
| 987 | |||
| 988 | $code.=<<___; | ||
| 989 | .align 16 | ||
| 990 | .Loop_avx: | ||
| 991 | ___ | ||
| 992 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 993 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 994 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 995 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 996 | &Xupdate_avx_32_79(\&body_00_19); | ||
| 997 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 998 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 999 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 1000 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 1001 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 1002 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 1003 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 1004 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 1005 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 1006 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 1007 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 1008 | &Xuplast_avx_80(\&body_20_39); # can jump to "done" | ||
| 1009 | |||
| 1010 | $saved_j=$j; @saved_V=@V; | ||
| 1011 | |||
| 1012 | &Xloop_avx(\&body_20_39); | ||
| 1013 | &Xloop_avx(\&body_20_39); | ||
| 1014 | &Xloop_avx(\&body_20_39); | ||
| 1015 | |||
| 1016 | $code.=<<___; | ||
| 1017 | add 0($ctx),$A # update context | ||
| 1018 | add 4($ctx),@T[0] | ||
| 1019 | add 8($ctx),$C | ||
| 1020 | add 12($ctx),$D | ||
| 1021 | mov $A,0($ctx) | ||
| 1022 | add 16($ctx),$E | ||
| 1023 | mov @T[0],4($ctx) | ||
| 1024 | mov @T[0],$B # magic seed | ||
| 1025 | mov $C,8($ctx) | ||
| 1026 | mov $D,12($ctx) | ||
| 1027 | mov $E,16($ctx) | ||
| 1028 | jmp .Loop_avx | ||
| 1029 | |||
| 1030 | .align 16 | ||
| 1031 | .Ldone_avx: | ||
| 1032 | ___ | ||
| 1033 | $j=$saved_j; @V=@saved_V; | ||
| 1034 | |||
| 1035 | &Xtail_avx(\&body_20_39); | ||
| 1036 | &Xtail_avx(\&body_20_39); | ||
| 1037 | &Xtail_avx(\&body_20_39); | ||
| 1038 | |||
| 1039 | $code.=<<___; | ||
| 1040 | vzeroall | ||
| 1041 | |||
| 1042 | add 0($ctx),$A # update context | ||
| 1043 | add 4($ctx),@T[0] | ||
| 1044 | add 8($ctx),$C | ||
| 1045 | mov $A,0($ctx) | ||
| 1046 | add 12($ctx),$D | ||
| 1047 | mov @T[0],4($ctx) | ||
| 1048 | add 16($ctx),$E | ||
| 1049 | mov $C,8($ctx) | ||
| 1050 | mov $D,12($ctx) | ||
| 1051 | mov $E,16($ctx) | ||
| 1052 | ___ | ||
| 1053 | $code.=<<___ if ($win64); | ||
| 1054 | movaps 64+0(%rsp),%xmm6 | ||
| 1055 | movaps 64+16(%rsp),%xmm7 | ||
| 1056 | movaps 64+32(%rsp),%xmm8 | ||
| 1057 | movaps 64+48(%rsp),%xmm9 | ||
| 1058 | movaps 64+64(%rsp),%xmm10 | ||
| 1059 | ___ | ||
| 1060 | $code.=<<___; | ||
| 1061 | lea `64+($win64?5*16:0)`(%rsp),%rsi | ||
| 1062 | mov 0(%rsi),%r12 | ||
| 1063 | mov 8(%rsi),%rbp | ||
| 1064 | mov 16(%rsi),%rbx | ||
| 1065 | lea 24(%rsi),%rsp | ||
| 1066 | .Lepilogue_avx: | ||
| 1067 | ret | ||
| 1068 | .size sha1_block_data_order_avx,.-sha1_block_data_order_avx | ||
| 1069 | ___ | ||
| 1070 | } | ||
| 1071 | $code.=<<___; | ||
| 1072 | .align 64 | ||
| 1073 | K_XX_XX: | ||
| 1074 | .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 | ||
| 1075 | .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 | ||
| 1076 | .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 | ||
| 1077 | .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 | ||
| 1078 | .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask | ||
| 1079 | ___ | ||
| 1080 | }}} | ||
| 1081 | $code.=<<___; | ||
| 1082 | .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 1083 | .align 64 | ||
| 244 | ___ | 1084 | ___ |
| 245 | 1085 | ||
| 246 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | 1086 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
| @@ -272,25 +1112,75 @@ se_handler: | |||
| 272 | 1112 | ||
| 273 | lea .Lprologue(%rip),%r10 | 1113 | lea .Lprologue(%rip),%r10 |
| 274 | cmp %r10,%rbx # context->Rip<.Lprologue | 1114 | cmp %r10,%rbx # context->Rip<.Lprologue |
| 275 | jb .Lin_prologue | 1115 | jb .Lcommon_seh_tail |
| 276 | 1116 | ||
| 277 | mov 152($context),%rax # pull context->Rsp | 1117 | mov 152($context),%rax # pull context->Rsp |
| 278 | 1118 | ||
| 279 | lea .Lepilogue(%rip),%r10 | 1119 | lea .Lepilogue(%rip),%r10 |
| 280 | cmp %r10,%rbx # context->Rip>=.Lepilogue | 1120 | cmp %r10,%rbx # context->Rip>=.Lepilogue |
| 281 | jae .Lin_prologue | 1121 | jae .Lcommon_seh_tail |
| 282 | 1122 | ||
| 283 | mov `16*4`(%rax),%rax # pull saved stack pointer | 1123 | mov `16*4`(%rax),%rax # pull saved stack pointer |
| 284 | lea 24(%rax),%rax | 1124 | lea 32(%rax),%rax |
| 285 | 1125 | ||
| 286 | mov -8(%rax),%rbx | 1126 | mov -8(%rax),%rbx |
| 287 | mov -16(%rax),%rbp | 1127 | mov -16(%rax),%rbp |
| 288 | mov -24(%rax),%r12 | 1128 | mov -24(%rax),%r12 |
| 1129 | mov -32(%rax),%r13 | ||
| 289 | mov %rbx,144($context) # restore context->Rbx | 1130 | mov %rbx,144($context) # restore context->Rbx |
| 290 | mov %rbp,160($context) # restore context->Rbp | 1131 | mov %rbp,160($context) # restore context->Rbp |
| 291 | mov %r12,216($context) # restore context->R12 | 1132 | mov %r12,216($context) # restore context->R12 |
| 1133 | mov %r13,224($context) # restore context->R13 | ||
| 1134 | |||
| 1135 | jmp .Lcommon_seh_tail | ||
| 1136 | .size se_handler,.-se_handler | ||
| 292 | 1137 | ||
| 293 | .Lin_prologue: | 1138 | .type ssse3_handler,\@abi-omnipotent |
| 1139 | .align 16 | ||
| 1140 | ssse3_handler: | ||
| 1141 | push %rsi | ||
| 1142 | push %rdi | ||
| 1143 | push %rbx | ||
| 1144 | push %rbp | ||
| 1145 | push %r12 | ||
| 1146 | push %r13 | ||
| 1147 | push %r14 | ||
| 1148 | push %r15 | ||
| 1149 | pushfq | ||
| 1150 | sub \$64,%rsp | ||
| 1151 | |||
| 1152 | mov 120($context),%rax # pull context->Rax | ||
| 1153 | mov 248($context),%rbx # pull context->Rip | ||
| 1154 | |||
| 1155 | mov 8($disp),%rsi # disp->ImageBase | ||
| 1156 | mov 56($disp),%r11 # disp->HandlerData | ||
| 1157 | |||
| 1158 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 1159 | lea (%rsi,%r10),%r10 # prologue label | ||
| 1160 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 1161 | jb .Lcommon_seh_tail | ||
| 1162 | |||
| 1163 | mov 152($context),%rax # pull context->Rsp | ||
| 1164 | |||
| 1165 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 1166 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 1167 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 1168 | jae .Lcommon_seh_tail | ||
| 1169 | |||
| 1170 | lea 64(%rax),%rsi | ||
| 1171 | lea 512($context),%rdi # &context.Xmm6 | ||
| 1172 | mov \$10,%ecx | ||
| 1173 | .long 0xa548f3fc # cld; rep movsq | ||
| 1174 | lea `24+64+5*16`(%rax),%rax # adjust stack pointer | ||
| 1175 | |||
| 1176 | mov -8(%rax),%rbx | ||
| 1177 | mov -16(%rax),%rbp | ||
| 1178 | mov -24(%rax),%r12 | ||
| 1179 | mov %rbx,144($context) # restore context->Rbx | ||
| 1180 | mov %rbp,160($context) # restore context->Rbp | ||
| 1181 | mov %r12,216($context) # restore cotnext->R12 | ||
| 1182 | |||
| 1183 | .Lcommon_seh_tail: | ||
| 294 | mov 8(%rax),%rdi | 1184 | mov 8(%rax),%rdi |
| 295 | mov 16(%rax),%rsi | 1185 | mov 16(%rax),%rsi |
| 296 | mov %rax,152($context) # restore context->Rsp | 1186 | mov %rax,152($context) # restore context->Rsp |
| @@ -328,19 +1218,38 @@ se_handler: | |||
| 328 | pop %rdi | 1218 | pop %rdi |
| 329 | pop %rsi | 1219 | pop %rsi |
| 330 | ret | 1220 | ret |
| 331 | .size se_handler,.-se_handler | 1221 | .size ssse3_handler,.-ssse3_handler |
| 332 | 1222 | ||
| 333 | .section .pdata | 1223 | .section .pdata |
| 334 | .align 4 | 1224 | .align 4 |
| 335 | .rva .LSEH_begin_sha1_block_data_order | 1225 | .rva .LSEH_begin_sha1_block_data_order |
| 336 | .rva .LSEH_end_sha1_block_data_order | 1226 | .rva .LSEH_end_sha1_block_data_order |
| 337 | .rva .LSEH_info_sha1_block_data_order | 1227 | .rva .LSEH_info_sha1_block_data_order |
| 338 | 1228 | .rva .LSEH_begin_sha1_block_data_order_ssse3 | |
| 1229 | .rva .LSEH_end_sha1_block_data_order_ssse3 | ||
| 1230 | .rva .LSEH_info_sha1_block_data_order_ssse3 | ||
| 1231 | ___ | ||
| 1232 | $code.=<<___ if ($avx); | ||
| 1233 | .rva .LSEH_begin_sha1_block_data_order_avx | ||
| 1234 | .rva .LSEH_end_sha1_block_data_order_avx | ||
| 1235 | .rva .LSEH_info_sha1_block_data_order_avx | ||
| 1236 | ___ | ||
| 1237 | $code.=<<___; | ||
| 339 | .section .xdata | 1238 | .section .xdata |
| 340 | .align 8 | 1239 | .align 8 |
| 341 | .LSEH_info_sha1_block_data_order: | 1240 | .LSEH_info_sha1_block_data_order: |
| 342 | .byte 9,0,0,0 | 1241 | .byte 9,0,0,0 |
| 343 | .rva se_handler | 1242 | .rva se_handler |
| 1243 | .LSEH_info_sha1_block_data_order_ssse3: | ||
| 1244 | .byte 9,0,0,0 | ||
| 1245 | .rva ssse3_handler | ||
| 1246 | .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] | ||
| 1247 | ___ | ||
| 1248 | $code.=<<___ if ($avx); | ||
| 1249 | .LSEH_info_sha1_block_data_order_avx: | ||
| 1250 | .byte 9,0,0,0 | ||
| 1251 | .rva ssse3_handler | ||
| 1252 | .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] | ||
| 344 | ___ | 1253 | ___ |
| 345 | } | 1254 | } |
| 346 | 1255 | ||
diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl index ecc8b69c75..928ec53123 100644 --- a/src/lib/libcrypto/sha/asm/sha256-586.pl +++ b/src/lib/libcrypto/sha/asm/sha256-586.pl | |||
| @@ -14,8 +14,8 @@ | |||
| 14 | # Pentium PIII P4 AMD K8 Core2 | 14 | # Pentium PIII P4 AMD K8 Core2 |
| 15 | # gcc 46 36 41 27 26 | 15 | # gcc 46 36 41 27 26 |
| 16 | # icc 57 33 38 25 23 | 16 | # icc 57 33 38 25 23 |
| 17 | # x86 asm 40 30 35 20 20 | 17 | # x86 asm 40 30 33 20 18 |
| 18 | # x86_64 asm(*) - - 21 15.8 16.5 | 18 | # x86_64 asm(*) - - 21 16 16 |
| 19 | # | 19 | # |
| 20 | # (*) x86_64 assembler performance is presented for reference | 20 | # (*) x86_64 assembler performance is presented for reference |
| 21 | # purposes. | 21 | # purposes. |
| @@ -48,20 +48,19 @@ sub BODY_00_15() { | |||
| 48 | my $in_16_63=shift; | 48 | my $in_16_63=shift; |
| 49 | 49 | ||
| 50 | &mov ("ecx",$E); | 50 | &mov ("ecx",$E); |
| 51 | &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_63); # T += X[-7] | 51 | &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) |
| 52 | &ror ("ecx",6); | 52 | &ror ("ecx",25-11); |
| 53 | &mov ("edi",$E); | ||
| 54 | &ror ("edi",11); | ||
| 55 | &mov ("esi",$Foff); | 53 | &mov ("esi",$Foff); |
| 56 | &xor ("ecx","edi"); | 54 | &xor ("ecx",$E); |
| 57 | &ror ("edi",25-11); | 55 | &ror ("ecx",11-6); |
| 58 | &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0] | 56 | &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0] |
| 59 | &xor ("ecx","edi"); # Sigma1(e) | 57 | &xor ("ecx",$E); |
| 58 | &ror ("ecx",6); # Sigma1(e) | ||
| 60 | &mov ("edi",$Goff); | 59 | &mov ("edi",$Goff); |
| 61 | &add ($T,"ecx"); # T += Sigma1(e) | 60 | &add ($T,"ecx"); # T += Sigma1(e) |
| 62 | &mov ($Eoff,$E); # modulo-scheduled | ||
| 63 | 61 | ||
| 64 | &xor ("esi","edi"); | 62 | &xor ("esi","edi"); |
| 63 | &mov ($Eoff,$E); # modulo-scheduled | ||
| 65 | &mov ("ecx",$A); | 64 | &mov ("ecx",$A); |
| 66 | &and ("esi",$E); | 65 | &and ("esi",$E); |
| 67 | &mov ($E,$Doff); # e becomes d, which is e in next iteration | 66 | &mov ($E,$Doff); # e becomes d, which is e in next iteration |
| @@ -69,14 +68,14 @@ sub BODY_00_15() { | |||
| 69 | &mov ("edi",$A); | 68 | &mov ("edi",$A); |
| 70 | &add ($T,"esi"); # T += Ch(e,f,g) | 69 | &add ($T,"esi"); # T += Ch(e,f,g) |
| 71 | 70 | ||
| 72 | &ror ("ecx",2); | 71 | &ror ("ecx",22-13); |
| 73 | &add ($T,$Hoff); # T += h | 72 | &add ($T,$Hoff); # T += h |
| 74 | &ror ("edi",13); | 73 | &xor ("ecx",$A); |
| 74 | &ror ("ecx",13-2); | ||
| 75 | &mov ("esi",$Boff); | 75 | &mov ("esi",$Boff); |
| 76 | &xor ("ecx","edi"); | 76 | &xor ("ecx",$A); |
| 77 | &ror ("edi",22-13); | 77 | &ror ("ecx",2); # Sigma0(a) |
| 78 | &add ($E,$T); # d += T | 78 | &add ($E,$T); # d += T |
| 79 | &xor ("ecx","edi"); # Sigma0(a) | ||
| 80 | &mov ("edi",$Coff); | 79 | &mov ("edi",$Coff); |
| 81 | 80 | ||
| 82 | &add ($T,"ecx"); # T += Sigma0(a) | 81 | &add ($T,"ecx"); # T += Sigma0(a) |
| @@ -168,23 +167,22 @@ sub BODY_00_15() { | |||
| 168 | &set_label("16_63",16); | 167 | &set_label("16_63",16); |
| 169 | &mov ("esi",$T); | 168 | &mov ("esi",$T); |
| 170 | &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); | 169 | &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); |
| 171 | &shr ($T,3); | ||
| 172 | &ror ("esi",7); | ||
| 173 | &xor ($T,"esi"); | ||
| 174 | &ror ("esi",18-7); | 170 | &ror ("esi",18-7); |
| 175 | &mov ("edi","ecx"); | 171 | &mov ("edi","ecx"); |
| 176 | &xor ($T,"esi"); # T = sigma0(X[-15]) | 172 | &xor ("esi",$T); |
| 173 | &ror ("esi",7); | ||
| 174 | &shr ($T,3); | ||
| 177 | 175 | ||
| 178 | &shr ("ecx",10); | ||
| 179 | &mov ("esi",&DWP(4*(8+15+16),"esp")); | ||
| 180 | &ror ("edi",17); | ||
| 181 | &xor ("ecx","edi"); | ||
| 182 | &ror ("edi",19-17); | 176 | &ror ("edi",19-17); |
| 183 | &add ($T,"esi"); # T += X[-16] | 177 | &xor ($T,"esi"); # T = sigma0(X[-15]) |
| 184 | &xor ("edi","ecx") # sigma1(X[-2]) | 178 | &xor ("edi","ecx"); |
| 179 | &ror ("edi",17); | ||
| 180 | &shr ("ecx",10); | ||
| 181 | &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16] | ||
| 182 | &xor ("edi","ecx"); # sigma1(X[-2]) | ||
| 185 | 183 | ||
| 186 | &add ($T,"edi"); # T += sigma1(X[-2]) | 184 | &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7] |
| 187 | # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1) | 185 | # &add ($T,"edi"); # T += sigma1(X[-2]) |
| 188 | # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] | 186 | # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] |
| 189 | 187 | ||
| 190 | &BODY_00_15(1); | 188 | &BODY_00_15(1); |
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl index 492cb62bc0..9c84e8d93c 100644 --- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl +++ b/src/lib/libcrypto/sha/asm/sha256-armv4.pl | |||
| @@ -18,11 +18,16 @@ | |||
| 18 | # Rescheduling for dual-issue pipeline resulted in 22% improvement on | 18 | # Rescheduling for dual-issue pipeline resulted in 22% improvement on |
| 19 | # Cortex A8 core and ~20 cycles per processed byte. | 19 | # Cortex A8 core and ~20 cycles per processed byte. |
| 20 | 20 | ||
| 21 | # February 2011. | ||
| 22 | # | ||
| 23 | # Profiler-assisted and platform-specific optimization resulted in 16% | ||
| 24 | # improvement on Cortex A8 core and ~17 cycles per processed byte. | ||
| 25 | |||
| 21 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 26 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| 22 | open STDOUT,">$output"; | 27 | open STDOUT,">$output"; |
| 23 | 28 | ||
| 24 | $ctx="r0"; $t0="r0"; | 29 | $ctx="r0"; $t0="r0"; |
| 25 | $inp="r1"; | 30 | $inp="r1"; $t3="r1"; |
| 26 | $len="r2"; $t1="r2"; | 31 | $len="r2"; $t1="r2"; |
| 27 | $T1="r3"; | 32 | $T1="r3"; |
| 28 | $A="r4"; | 33 | $A="r4"; |
| @@ -46,6 +51,9 @@ sub BODY_00_15 { | |||
| 46 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 51 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
| 47 | 52 | ||
| 48 | $code.=<<___ if ($i<16); | 53 | $code.=<<___ if ($i<16); |
| 54 | #if __ARM_ARCH__>=7 | ||
| 55 | ldr $T1,[$inp],#4 | ||
| 56 | #else | ||
| 49 | ldrb $T1,[$inp,#3] @ $i | 57 | ldrb $T1,[$inp,#3] @ $i |
| 50 | ldrb $t2,[$inp,#2] | 58 | ldrb $t2,[$inp,#2] |
| 51 | ldrb $t1,[$inp,#1] | 59 | ldrb $t1,[$inp,#1] |
| @@ -53,16 +61,24 @@ $code.=<<___ if ($i<16); | |||
| 53 | orr $T1,$T1,$t2,lsl#8 | 61 | orr $T1,$T1,$t2,lsl#8 |
| 54 | orr $T1,$T1,$t1,lsl#16 | 62 | orr $T1,$T1,$t1,lsl#16 |
| 55 | orr $T1,$T1,$t0,lsl#24 | 63 | orr $T1,$T1,$t0,lsl#24 |
| 56 | `"str $inp,[sp,#17*4]" if ($i==15)` | 64 | #endif |
| 57 | ___ | 65 | ___ |
| 58 | $code.=<<___; | 66 | $code.=<<___; |
| 59 | ldr $t2,[$Ktbl],#4 @ *K256++ | ||
| 60 | mov $t0,$e,ror#$Sigma1[0] | 67 | mov $t0,$e,ror#$Sigma1[0] |
| 61 | str $T1,[sp,#`$i%16`*4] | 68 | ldr $t2,[$Ktbl],#4 @ *K256++ |
| 62 | eor $t0,$t0,$e,ror#$Sigma1[1] | 69 | eor $t0,$t0,$e,ror#$Sigma1[1] |
| 63 | eor $t1,$f,$g | 70 | eor $t1,$f,$g |
| 71 | #if $i>=16 | ||
| 72 | add $T1,$T1,$t3 @ from BODY_16_xx | ||
| 73 | #elif __ARM_ARCH__>=7 && defined(__ARMEL__) | ||
| 74 | rev $T1,$T1 | ||
| 75 | #endif | ||
| 76 | #if $i==15 | ||
| 77 | str $inp,[sp,#17*4] @ leave room for $t3 | ||
| 78 | #endif | ||
| 64 | eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) | 79 | eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) |
| 65 | and $t1,$t1,$e | 80 | and $t1,$t1,$e |
| 81 | str $T1,[sp,#`$i%16`*4] | ||
| 66 | add $T1,$T1,$t0 | 82 | add $T1,$T1,$t0 |
| 67 | eor $t1,$t1,$g @ Ch(e,f,g) | 83 | eor $t1,$t1,$g @ Ch(e,f,g) |
| 68 | add $T1,$T1,$h | 84 | add $T1,$T1,$h |
| @@ -71,6 +87,9 @@ $code.=<<___; | |||
| 71 | eor $h,$h,$a,ror#$Sigma0[1] | 87 | eor $h,$h,$a,ror#$Sigma0[1] |
| 72 | add $T1,$T1,$t2 | 88 | add $T1,$T1,$t2 |
| 73 | eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) | 89 | eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) |
| 90 | #if $i>=15 | ||
| 91 | ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx | ||
| 92 | #endif | ||
| 74 | orr $t0,$a,$b | 93 | orr $t0,$a,$b |
| 75 | and $t1,$a,$b | 94 | and $t1,$a,$b |
| 76 | and $t0,$t0,$c | 95 | and $t0,$t0,$c |
| @@ -85,24 +104,26 @@ sub BODY_16_XX { | |||
| 85 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 104 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
| 86 | 105 | ||
| 87 | $code.=<<___; | 106 | $code.=<<___; |
| 88 | ldr $t1,[sp,#`($i+1)%16`*4] @ $i | 107 | @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i |
| 89 | ldr $t2,[sp,#`($i+14)%16`*4] | 108 | ldr $t2,[sp,#`($i+14)%16`*4] |
| 109 | mov $t0,$t3,ror#$sigma0[0] | ||
| 90 | ldr $T1,[sp,#`($i+0)%16`*4] | 110 | ldr $T1,[sp,#`($i+0)%16`*4] |
| 91 | mov $t0,$t1,ror#$sigma0[0] | 111 | eor $t0,$t0,$t3,ror#$sigma0[1] |
| 92 | ldr $inp,[sp,#`($i+9)%16`*4] | 112 | ldr $t1,[sp,#`($i+9)%16`*4] |
| 93 | eor $t0,$t0,$t1,ror#$sigma0[1] | 113 | eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1]) |
| 94 | eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) | 114 | mov $t3,$t2,ror#$sigma1[0] |
| 95 | mov $t1,$t2,ror#$sigma1[0] | ||
| 96 | add $T1,$T1,$t0 | 115 | add $T1,$T1,$t0 |
| 97 | eor $t1,$t1,$t2,ror#$sigma1[1] | 116 | eor $t3,$t3,$t2,ror#$sigma1[1] |
| 98 | add $T1,$T1,$inp | ||
| 99 | eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) | ||
| 100 | add $T1,$T1,$t1 | 117 | add $T1,$T1,$t1 |
| 118 | eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) | ||
| 119 | @ add $T1,$T1,$t3 | ||
| 101 | ___ | 120 | ___ |
| 102 | &BODY_00_15(@_); | 121 | &BODY_00_15(@_); |
| 103 | } | 122 | } |
| 104 | 123 | ||
| 105 | $code=<<___; | 124 | $code=<<___; |
| 125 | #include "arm_arch.h" | ||
| 126 | |||
| 106 | .text | 127 | .text |
| 107 | .code 32 | 128 | .code 32 |
| 108 | 129 | ||
| @@ -132,7 +153,7 @@ K256: | |||
| 132 | sha256_block_data_order: | 153 | sha256_block_data_order: |
| 133 | sub r3,pc,#8 @ sha256_block_data_order | 154 | sub r3,pc,#8 @ sha256_block_data_order |
| 134 | add $len,$inp,$len,lsl#6 @ len to point at the end of inp | 155 | add $len,$inp,$len,lsl#6 @ len to point at the end of inp |
| 135 | stmdb sp!,{$ctx,$inp,$len,r4-r12,lr} | 156 | stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} |
| 136 | ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} | 157 | ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} |
| 137 | sub $Ktbl,r3,#256 @ K256 | 158 | sub $Ktbl,r3,#256 @ K256 |
| 138 | sub sp,sp,#16*4 @ alloca(X[16]) | 159 | sub sp,sp,#16*4 @ alloca(X[16]) |
| @@ -171,10 +192,14 @@ $code.=<<___; | |||
| 171 | bne .Loop | 192 | bne .Loop |
| 172 | 193 | ||
| 173 | add sp,sp,#`16+3`*4 @ destroy frame | 194 | add sp,sp,#`16+3`*4 @ destroy frame |
| 174 | ldmia sp!,{r4-r12,lr} | 195 | #if __ARM_ARCH__>=5 |
| 196 | ldmia sp!,{r4-r11,pc} | ||
| 197 | #else | ||
| 198 | ldmia sp!,{r4-r11,lr} | ||
| 175 | tst lr,#1 | 199 | tst lr,#1 |
| 176 | moveq pc,lr @ be binary compatible with V4, yet | 200 | moveq pc,lr @ be binary compatible with V4, yet |
| 177 | bx lr @ interoperable with Thumb ISA:-) | 201 | bx lr @ interoperable with Thumb ISA:-) |
| 202 | #endif | ||
| 178 | .size sha256_block_data_order,.-sha256_block_data_order | 203 | .size sha256_block_data_order,.-sha256_block_data_order |
| 179 | .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | 204 | .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" |
| 180 | .align 2 | 205 | .align 2 |
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl index 3a35861ac6..7faf37b147 100644 --- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl +++ b/src/lib/libcrypto/sha/asm/sha512-armv4.pl | |||
| @@ -18,22 +18,33 @@ | |||
| 18 | # Rescheduling for dual-issue pipeline resulted in 6% improvement on | 18 | # Rescheduling for dual-issue pipeline resulted in 6% improvement on |
| 19 | # Cortex A8 core and ~40 cycles per processed byte. | 19 | # Cortex A8 core and ~40 cycles per processed byte. |
| 20 | 20 | ||
| 21 | # February 2011. | ||
| 22 | # | ||
| 23 | # Profiler-assisted and platform-specific optimization resulted in 7% | ||
| 24 | # improvement on Coxtex A8 core and ~38 cycles per byte. | ||
| 25 | |||
| 26 | # March 2011. | ||
| 27 | # | ||
| 28 | # Add NEON implementation. On Cortex A8 it was measured to process | ||
| 29 | # one byte in 25.5 cycles or 47% faster than integer-only code. | ||
| 30 | |||
| 21 | # Byte order [in]dependence. ========================================= | 31 | # Byte order [in]dependence. ========================================= |
| 22 | # | 32 | # |
| 23 | # Caller is expected to maintain specific *dword* order in h[0-7], | 33 | # Originally caller was expected to maintain specific *dword* order in |
| 24 | # namely with most significant dword at *lower* address, which is | 34 | # h[0-7], namely with most significant dword at *lower* address, which |
| 25 | # reflected in below two parameters. *Byte* order within these dwords | 35 | # was reflected in below two parameters as 0 and 4. Now caller is |
| 26 | # in turn is whatever *native* byte order on current platform. | 36 | # expected to maintain native byte order for whole 64-bit values. |
| 27 | $hi=0; | 37 | $hi="HI"; |
| 28 | $lo=4; | 38 | $lo="LO"; |
| 29 | # ==================================================================== | 39 | # ==================================================================== |
| 30 | 40 | ||
| 31 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 41 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| 32 | open STDOUT,">$output"; | 42 | open STDOUT,">$output"; |
| 33 | 43 | ||
| 34 | $ctx="r0"; | 44 | $ctx="r0"; # parameter block |
| 35 | $inp="r1"; | 45 | $inp="r1"; |
| 36 | $len="r2"; | 46 | $len="r2"; |
| 47 | |||
| 37 | $Tlo="r3"; | 48 | $Tlo="r3"; |
| 38 | $Thi="r4"; | 49 | $Thi="r4"; |
| 39 | $Alo="r5"; | 50 | $Alo="r5"; |
| @@ -61,15 +72,17 @@ $Xoff=8*8; | |||
| 61 | sub BODY_00_15() { | 72 | sub BODY_00_15() { |
| 62 | my $magic = shift; | 73 | my $magic = shift; |
| 63 | $code.=<<___; | 74 | $code.=<<___; |
| 64 | ldr $t2,[sp,#$Hoff+0] @ h.lo | ||
| 65 | ldr $t3,[sp,#$Hoff+4] @ h.hi | ||
| 66 | @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) | 75 | @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) |
| 67 | @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 | 76 | @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 |
| 68 | @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 | 77 | @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 |
| 69 | mov $t0,$Elo,lsr#14 | 78 | mov $t0,$Elo,lsr#14 |
| 79 | str $Tlo,[sp,#$Xoff+0] | ||
| 70 | mov $t1,$Ehi,lsr#14 | 80 | mov $t1,$Ehi,lsr#14 |
| 81 | str $Thi,[sp,#$Xoff+4] | ||
| 71 | eor $t0,$t0,$Ehi,lsl#18 | 82 | eor $t0,$t0,$Ehi,lsl#18 |
| 83 | ldr $t2,[sp,#$Hoff+0] @ h.lo | ||
| 72 | eor $t1,$t1,$Elo,lsl#18 | 84 | eor $t1,$t1,$Elo,lsl#18 |
| 85 | ldr $t3,[sp,#$Hoff+4] @ h.hi | ||
| 73 | eor $t0,$t0,$Elo,lsr#18 | 86 | eor $t0,$t0,$Elo,lsr#18 |
| 74 | eor $t1,$t1,$Ehi,lsr#18 | 87 | eor $t1,$t1,$Ehi,lsr#18 |
| 75 | eor $t0,$t0,$Ehi,lsl#14 | 88 | eor $t0,$t0,$Ehi,lsl#14 |
| @@ -96,25 +109,24 @@ $code.=<<___; | |||
| 96 | and $t1,$t1,$Ehi | 109 | and $t1,$t1,$Ehi |
| 97 | str $Ahi,[sp,#$Aoff+4] | 110 | str $Ahi,[sp,#$Aoff+4] |
| 98 | eor $t0,$t0,$t2 | 111 | eor $t0,$t0,$t2 |
| 99 | ldr $t2,[$Ktbl,#4] @ K[i].lo | 112 | ldr $t2,[$Ktbl,#$lo] @ K[i].lo |
| 100 | eor $t1,$t1,$t3 @ Ch(e,f,g) | 113 | eor $t1,$t1,$t3 @ Ch(e,f,g) |
| 101 | ldr $t3,[$Ktbl,#0] @ K[i].hi | 114 | ldr $t3,[$Ktbl,#$hi] @ K[i].hi |
| 102 | 115 | ||
| 103 | adds $Tlo,$Tlo,$t0 | 116 | adds $Tlo,$Tlo,$t0 |
| 104 | ldr $Elo,[sp,#$Doff+0] @ d.lo | 117 | ldr $Elo,[sp,#$Doff+0] @ d.lo |
| 105 | adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) | 118 | adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) |
| 106 | ldr $Ehi,[sp,#$Doff+4] @ d.hi | 119 | ldr $Ehi,[sp,#$Doff+4] @ d.hi |
| 107 | adds $Tlo,$Tlo,$t2 | 120 | adds $Tlo,$Tlo,$t2 |
| 121 | and $t0,$t2,#0xff | ||
| 108 | adc $Thi,$Thi,$t3 @ T += K[i] | 122 | adc $Thi,$Thi,$t3 @ T += K[i] |
| 109 | adds $Elo,$Elo,$Tlo | 123 | adds $Elo,$Elo,$Tlo |
| 124 | ldr $t2,[sp,#$Boff+0] @ b.lo | ||
| 110 | adc $Ehi,$Ehi,$Thi @ d += T | 125 | adc $Ehi,$Ehi,$Thi @ d += T |
| 111 | |||
| 112 | and $t0,$t2,#0xff | ||
| 113 | teq $t0,#$magic | 126 | teq $t0,#$magic |
| 114 | orreq $Ktbl,$Ktbl,#1 | ||
| 115 | 127 | ||
| 116 | ldr $t2,[sp,#$Boff+0] @ b.lo | ||
| 117 | ldr $t3,[sp,#$Coff+0] @ c.lo | 128 | ldr $t3,[sp,#$Coff+0] @ c.lo |
| 129 | orreq $Ktbl,$Ktbl,#1 | ||
| 118 | @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) | 130 | @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) |
| 119 | @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 | 131 | @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 |
| 120 | @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 | 132 | @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 |
| @@ -131,80 +143,100 @@ $code.=<<___; | |||
| 131 | eor $t0,$t0,$Alo,lsl#25 | 143 | eor $t0,$t0,$Alo,lsl#25 |
| 132 | eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) | 144 | eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) |
| 133 | adds $Tlo,$Tlo,$t0 | 145 | adds $Tlo,$Tlo,$t0 |
| 146 | and $t0,$Alo,$t2 | ||
| 134 | adc $Thi,$Thi,$t1 @ T += Sigma0(a) | 147 | adc $Thi,$Thi,$t1 @ T += Sigma0(a) |
| 135 | 148 | ||
| 136 | and $t0,$Alo,$t2 | ||
| 137 | orr $Alo,$Alo,$t2 | ||
| 138 | ldr $t1,[sp,#$Boff+4] @ b.hi | 149 | ldr $t1,[sp,#$Boff+4] @ b.hi |
| 150 | orr $Alo,$Alo,$t2 | ||
| 139 | ldr $t2,[sp,#$Coff+4] @ c.hi | 151 | ldr $t2,[sp,#$Coff+4] @ c.hi |
| 140 | and $Alo,$Alo,$t3 | 152 | and $Alo,$Alo,$t3 |
| 141 | orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo | ||
| 142 | and $t3,$Ahi,$t1 | 153 | and $t3,$Ahi,$t1 |
| 143 | orr $Ahi,$Ahi,$t1 | 154 | orr $Ahi,$Ahi,$t1 |
| 155 | orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo | ||
| 144 | and $Ahi,$Ahi,$t2 | 156 | and $Ahi,$Ahi,$t2 |
| 145 | orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi | ||
| 146 | adds $Alo,$Alo,$Tlo | 157 | adds $Alo,$Alo,$Tlo |
| 147 | adc $Ahi,$Ahi,$Thi @ h += T | 158 | orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi |
| 148 | |||
| 149 | sub sp,sp,#8 | 159 | sub sp,sp,#8 |
| 160 | adc $Ahi,$Ahi,$Thi @ h += T | ||
| 161 | tst $Ktbl,#1 | ||
| 150 | add $Ktbl,$Ktbl,#8 | 162 | add $Ktbl,$Ktbl,#8 |
| 151 | ___ | 163 | ___ |
| 152 | } | 164 | } |
| 153 | $code=<<___; | 165 | $code=<<___; |
| 166 | #include "arm_arch.h" | ||
| 167 | #ifdef __ARMEL__ | ||
| 168 | # define LO 0 | ||
| 169 | # define HI 4 | ||
| 170 | # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 | ||
| 171 | #else | ||
| 172 | # define HI 0 | ||
| 173 | # define LO 4 | ||
| 174 | # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 | ||
| 175 | #endif | ||
| 176 | |||
| 154 | .text | 177 | .text |
| 155 | .code 32 | 178 | .code 32 |
| 156 | .type K512,%object | 179 | .type K512,%object |
| 157 | .align 5 | 180 | .align 5 |
| 158 | K512: | 181 | K512: |
| 159 | .word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd | 182 | WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) |
| 160 | .word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc | 183 | WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) |
| 161 | .word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 | 184 | WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) |
| 162 | .word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 | 185 | WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) |
| 163 | .word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe | 186 | WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) |
| 164 | .word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 | 187 | WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) |
| 165 | .word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 | 188 | WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) |
| 166 | .word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 | 189 | WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) |
| 167 | .word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 | 190 | WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) |
| 168 | .word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 | 191 | WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) |
| 169 | .word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 | 192 | WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) |
| 170 | .word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 | 193 | WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) |
| 171 | .word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 | 194 | WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) |
| 172 | .word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 | 195 | WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) |
| 173 | .word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 | 196 | WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) |
| 174 | .word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 | 197 | WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) |
| 175 | .word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 | 198 | WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) |
| 176 | .word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df | 199 | WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) |
| 177 | .word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 | 200 | WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) |
| 178 | .word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b | 201 | WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) |
| 179 | .word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 | 202 | WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) |
| 180 | .word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 | 203 | WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) |
| 181 | .word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 | 204 | WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) |
| 182 | .word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 | 205 | WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) |
| 183 | .word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 | 206 | WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) |
| 184 | .word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 | 207 | WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) |
| 185 | .word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb | 208 | WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) |
| 186 | .word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 | 209 | WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) |
| 187 | .word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 | 210 | WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) |
| 188 | .word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec | 211 | WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) |
| 189 | .word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 | 212 | WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) |
| 190 | .word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b | 213 | WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) |
| 191 | .word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 | 214 | WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) |
| 192 | .word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 | 215 | WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) |
| 193 | .word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 | 216 | WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) |
| 194 | .word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b | 217 | WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) |
| 195 | .word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 | 218 | WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) |
| 196 | .word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c | 219 | WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) |
| 197 | .word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a | 220 | WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) |
| 198 | .word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 | 221 | WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) |
| 199 | .size K512,.-K512 | 222 | .size K512,.-K512 |
| 223 | .LOPENSSL_armcap: | ||
| 224 | .word OPENSSL_armcap_P-sha512_block_data_order | ||
| 225 | .skip 32-4 | ||
| 200 | 226 | ||
| 201 | .global sha512_block_data_order | 227 | .global sha512_block_data_order |
| 202 | .type sha512_block_data_order,%function | 228 | .type sha512_block_data_order,%function |
| 203 | sha512_block_data_order: | 229 | sha512_block_data_order: |
| 204 | sub r3,pc,#8 @ sha512_block_data_order | 230 | sub r3,pc,#8 @ sha512_block_data_order |
| 205 | add $len,$inp,$len,lsl#7 @ len to point at the end of inp | 231 | add $len,$inp,$len,lsl#7 @ len to point at the end of inp |
| 232 | #if __ARM_ARCH__>=7 | ||
| 233 | ldr r12,.LOPENSSL_armcap | ||
| 234 | ldr r12,[r3,r12] @ OPENSSL_armcap_P | ||
| 235 | tst r12,#1 | ||
| 236 | bne .LNEON | ||
| 237 | #endif | ||
| 206 | stmdb sp!,{r4-r12,lr} | 238 | stmdb sp!,{r4-r12,lr} |
| 207 | sub $Ktbl,r3,#640 @ K512 | 239 | sub $Ktbl,r3,#672 @ K512 |
| 208 | sub sp,sp,#9*8 | 240 | sub sp,sp,#9*8 |
| 209 | 241 | ||
| 210 | ldr $Elo,[$ctx,#$Eoff+$lo] | 242 | ldr $Elo,[$ctx,#$Eoff+$lo] |
| @@ -238,6 +270,7 @@ sha512_block_data_order: | |||
| 238 | str $Thi,[sp,#$Foff+4] | 270 | str $Thi,[sp,#$Foff+4] |
| 239 | 271 | ||
| 240 | .L00_15: | 272 | .L00_15: |
| 273 | #if __ARM_ARCH__<7 | ||
| 241 | ldrb $Tlo,[$inp,#7] | 274 | ldrb $Tlo,[$inp,#7] |
| 242 | ldrb $t0, [$inp,#6] | 275 | ldrb $t0, [$inp,#6] |
| 243 | ldrb $t1, [$inp,#5] | 276 | ldrb $t1, [$inp,#5] |
| @@ -252,26 +285,30 @@ sha512_block_data_order: | |||
| 252 | orr $Thi,$Thi,$t3,lsl#8 | 285 | orr $Thi,$Thi,$t3,lsl#8 |
| 253 | orr $Thi,$Thi,$t0,lsl#16 | 286 | orr $Thi,$Thi,$t0,lsl#16 |
| 254 | orr $Thi,$Thi,$t1,lsl#24 | 287 | orr $Thi,$Thi,$t1,lsl#24 |
| 255 | str $Tlo,[sp,#$Xoff+0] | 288 | #else |
| 256 | str $Thi,[sp,#$Xoff+4] | 289 | ldr $Tlo,[$inp,#4] |
| 290 | ldr $Thi,[$inp],#8 | ||
| 291 | #ifdef __ARMEL__ | ||
| 292 | rev $Tlo,$Tlo | ||
| 293 | rev $Thi,$Thi | ||
| 294 | #endif | ||
| 295 | #endif | ||
| 257 | ___ | 296 | ___ |
| 258 | &BODY_00_15(0x94); | 297 | &BODY_00_15(0x94); |
| 259 | $code.=<<___; | 298 | $code.=<<___; |
| 260 | tst $Ktbl,#1 | 299 | tst $Ktbl,#1 |
| 261 | beq .L00_15 | 300 | beq .L00_15 |
| 262 | bic $Ktbl,$Ktbl,#1 | ||
| 263 | |||
| 264 | .L16_79: | ||
| 265 | ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] | 301 | ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] |
| 266 | ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] | 302 | ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] |
| 267 | ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] | 303 | bic $Ktbl,$Ktbl,#1 |
| 268 | ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] | 304 | .L16_79: |
| 269 | |||
| 270 | @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) | 305 | @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) |
| 271 | @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 | 306 | @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 |
| 272 | @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 | 307 | @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 |
| 273 | mov $Tlo,$t0,lsr#1 | 308 | mov $Tlo,$t0,lsr#1 |
| 309 | ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] | ||
| 274 | mov $Thi,$t1,lsr#1 | 310 | mov $Thi,$t1,lsr#1 |
| 311 | ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] | ||
| 275 | eor $Tlo,$Tlo,$t1,lsl#31 | 312 | eor $Tlo,$Tlo,$t1,lsl#31 |
| 276 | eor $Thi,$Thi,$t0,lsl#31 | 313 | eor $Thi,$Thi,$t0,lsl#31 |
| 277 | eor $Tlo,$Tlo,$t0,lsr#8 | 314 | eor $Tlo,$Tlo,$t0,lsr#8 |
| @@ -295,25 +332,24 @@ $code.=<<___; | |||
| 295 | eor $t1,$t1,$t3,lsl#3 | 332 | eor $t1,$t1,$t3,lsl#3 |
| 296 | eor $t0,$t0,$t2,lsr#6 | 333 | eor $t0,$t0,$t2,lsr#6 |
| 297 | eor $t1,$t1,$t3,lsr#6 | 334 | eor $t1,$t1,$t3,lsr#6 |
| 335 | ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] | ||
| 298 | eor $t0,$t0,$t3,lsl#26 | 336 | eor $t0,$t0,$t3,lsl#26 |
| 299 | 337 | ||
| 300 | ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] | ||
| 301 | ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] | 338 | ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] |
| 302 | adds $Tlo,$Tlo,$t0 | 339 | adds $Tlo,$Tlo,$t0 |
| 340 | ldr $t0,[sp,#`$Xoff+8*16`+0] | ||
| 303 | adc $Thi,$Thi,$t1 | 341 | adc $Thi,$Thi,$t1 |
| 304 | 342 | ||
| 305 | ldr $t0,[sp,#`$Xoff+8*16`+0] | ||
| 306 | ldr $t1,[sp,#`$Xoff+8*16`+4] | 343 | ldr $t1,[sp,#`$Xoff+8*16`+4] |
| 307 | adds $Tlo,$Tlo,$t2 | 344 | adds $Tlo,$Tlo,$t2 |
| 308 | adc $Thi,$Thi,$t3 | 345 | adc $Thi,$Thi,$t3 |
| 309 | adds $Tlo,$Tlo,$t0 | 346 | adds $Tlo,$Tlo,$t0 |
| 310 | adc $Thi,$Thi,$t1 | 347 | adc $Thi,$Thi,$t1 |
| 311 | str $Tlo,[sp,#$Xoff+0] | ||
| 312 | str $Thi,[sp,#$Xoff+4] | ||
| 313 | ___ | 348 | ___ |
| 314 | &BODY_00_15(0x17); | 349 | &BODY_00_15(0x17); |
| 315 | $code.=<<___; | 350 | $code.=<<___; |
| 316 | tst $Ktbl,#1 | 351 | ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] |
| 352 | ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] | ||
| 317 | beq .L16_79 | 353 | beq .L16_79 |
| 318 | bic $Ktbl,$Ktbl,#1 | 354 | bic $Ktbl,$Ktbl,#1 |
| 319 | 355 | ||
| @@ -324,12 +360,12 @@ $code.=<<___; | |||
| 324 | ldr $t2, [$ctx,#$Boff+$lo] | 360 | ldr $t2, [$ctx,#$Boff+$lo] |
| 325 | ldr $t3, [$ctx,#$Boff+$hi] | 361 | ldr $t3, [$ctx,#$Boff+$hi] |
| 326 | adds $t0,$Alo,$t0 | 362 | adds $t0,$Alo,$t0 |
| 327 | adc $t1,$Ahi,$t1 | ||
| 328 | adds $t2,$Tlo,$t2 | ||
| 329 | adc $t3,$Thi,$t3 | ||
| 330 | str $t0, [$ctx,#$Aoff+$lo] | 363 | str $t0, [$ctx,#$Aoff+$lo] |
| 364 | adc $t1,$Ahi,$t1 | ||
| 331 | str $t1, [$ctx,#$Aoff+$hi] | 365 | str $t1, [$ctx,#$Aoff+$hi] |
| 366 | adds $t2,$Tlo,$t2 | ||
| 332 | str $t2, [$ctx,#$Boff+$lo] | 367 | str $t2, [$ctx,#$Boff+$lo] |
| 368 | adc $t3,$Thi,$t3 | ||
| 333 | str $t3, [$ctx,#$Boff+$hi] | 369 | str $t3, [$ctx,#$Boff+$hi] |
| 334 | 370 | ||
| 335 | ldr $Alo,[sp,#$Coff+0] | 371 | ldr $Alo,[sp,#$Coff+0] |
| @@ -341,12 +377,12 @@ $code.=<<___; | |||
| 341 | ldr $t2, [$ctx,#$Doff+$lo] | 377 | ldr $t2, [$ctx,#$Doff+$lo] |
| 342 | ldr $t3, [$ctx,#$Doff+$hi] | 378 | ldr $t3, [$ctx,#$Doff+$hi] |
| 343 | adds $t0,$Alo,$t0 | 379 | adds $t0,$Alo,$t0 |
| 344 | adc $t1,$Ahi,$t1 | ||
| 345 | adds $t2,$Tlo,$t2 | ||
| 346 | adc $t3,$Thi,$t3 | ||
| 347 | str $t0, [$ctx,#$Coff+$lo] | 380 | str $t0, [$ctx,#$Coff+$lo] |
| 381 | adc $t1,$Ahi,$t1 | ||
| 348 | str $t1, [$ctx,#$Coff+$hi] | 382 | str $t1, [$ctx,#$Coff+$hi] |
| 383 | adds $t2,$Tlo,$t2 | ||
| 349 | str $t2, [$ctx,#$Doff+$lo] | 384 | str $t2, [$ctx,#$Doff+$lo] |
| 385 | adc $t3,$Thi,$t3 | ||
| 350 | str $t3, [$ctx,#$Doff+$hi] | 386 | str $t3, [$ctx,#$Doff+$hi] |
| 351 | 387 | ||
| 352 | ldr $Tlo,[sp,#$Foff+0] | 388 | ldr $Tlo,[sp,#$Foff+0] |
| @@ -356,12 +392,12 @@ $code.=<<___; | |||
| 356 | ldr $t2, [$ctx,#$Foff+$lo] | 392 | ldr $t2, [$ctx,#$Foff+$lo] |
| 357 | ldr $t3, [$ctx,#$Foff+$hi] | 393 | ldr $t3, [$ctx,#$Foff+$hi] |
| 358 | adds $Elo,$Elo,$t0 | 394 | adds $Elo,$Elo,$t0 |
| 359 | adc $Ehi,$Ehi,$t1 | ||
| 360 | adds $t2,$Tlo,$t2 | ||
| 361 | adc $t3,$Thi,$t3 | ||
| 362 | str $Elo,[$ctx,#$Eoff+$lo] | 395 | str $Elo,[$ctx,#$Eoff+$lo] |
| 396 | adc $Ehi,$Ehi,$t1 | ||
| 363 | str $Ehi,[$ctx,#$Eoff+$hi] | 397 | str $Ehi,[$ctx,#$Eoff+$hi] |
| 398 | adds $t2,$Tlo,$t2 | ||
| 364 | str $t2, [$ctx,#$Foff+$lo] | 399 | str $t2, [$ctx,#$Foff+$lo] |
| 400 | adc $t3,$Thi,$t3 | ||
| 365 | str $t3, [$ctx,#$Foff+$hi] | 401 | str $t3, [$ctx,#$Foff+$hi] |
| 366 | 402 | ||
| 367 | ldr $Alo,[sp,#$Goff+0] | 403 | ldr $Alo,[sp,#$Goff+0] |
| @@ -373,12 +409,12 @@ $code.=<<___; | |||
| 373 | ldr $t2, [$ctx,#$Hoff+$lo] | 409 | ldr $t2, [$ctx,#$Hoff+$lo] |
| 374 | ldr $t3, [$ctx,#$Hoff+$hi] | 410 | ldr $t3, [$ctx,#$Hoff+$hi] |
| 375 | adds $t0,$Alo,$t0 | 411 | adds $t0,$Alo,$t0 |
| 376 | adc $t1,$Ahi,$t1 | ||
| 377 | adds $t2,$Tlo,$t2 | ||
| 378 | adc $t3,$Thi,$t3 | ||
| 379 | str $t0, [$ctx,#$Goff+$lo] | 412 | str $t0, [$ctx,#$Goff+$lo] |
| 413 | adc $t1,$Ahi,$t1 | ||
| 380 | str $t1, [$ctx,#$Goff+$hi] | 414 | str $t1, [$ctx,#$Goff+$hi] |
| 415 | adds $t2,$Tlo,$t2 | ||
| 381 | str $t2, [$ctx,#$Hoff+$lo] | 416 | str $t2, [$ctx,#$Hoff+$lo] |
| 417 | adc $t3,$Thi,$t3 | ||
| 382 | str $t3, [$ctx,#$Hoff+$hi] | 418 | str $t3, [$ctx,#$Hoff+$hi] |
| 383 | 419 | ||
| 384 | add sp,sp,#640 | 420 | add sp,sp,#640 |
| @@ -388,13 +424,156 @@ $code.=<<___; | |||
| 388 | bne .Loop | 424 | bne .Loop |
| 389 | 425 | ||
| 390 | add sp,sp,#8*9 @ destroy frame | 426 | add sp,sp,#8*9 @ destroy frame |
| 427 | #if __ARM_ARCH__>=5 | ||
| 428 | ldmia sp!,{r4-r12,pc} | ||
| 429 | #else | ||
| 391 | ldmia sp!,{r4-r12,lr} | 430 | ldmia sp!,{r4-r12,lr} |
| 392 | tst lr,#1 | 431 | tst lr,#1 |
| 393 | moveq pc,lr @ be binary compatible with V4, yet | 432 | moveq pc,lr @ be binary compatible with V4, yet |
| 394 | bx lr @ interoperable with Thumb ISA:-) | 433 | bx lr @ interoperable with Thumb ISA:-) |
| 395 | .size sha512_block_data_order,.-sha512_block_data_order | 434 | #endif |
| 396 | .asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | 435 | ___ |
| 436 | |||
| 437 | { | ||
| 438 | my @Sigma0=(28,34,39); | ||
| 439 | my @Sigma1=(14,18,41); | ||
| 440 | my @sigma0=(1, 8, 7); | ||
| 441 | my @sigma1=(19,61,6); | ||
| 442 | |||
| 443 | my $Ktbl="r3"; | ||
| 444 | my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch | ||
| 445 | |||
| 446 | my @X=map("d$_",(0..15)); | ||
| 447 | my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); | ||
| 448 | |||
| 449 | sub NEON_00_15() { | ||
| 450 | my $i=shift; | ||
| 451 | my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; | ||
| 452 | my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps | ||
| 453 | |||
| 454 | $code.=<<___ if ($i<16 || $i&1); | ||
| 455 | vshr.u64 $t0,$e,#@Sigma1[0] @ $i | ||
| 456 | #if $i<16 | ||
| 457 | vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned | ||
| 458 | #endif | ||
| 459 | vshr.u64 $t1,$e,#@Sigma1[1] | ||
| 460 | vshr.u64 $t2,$e,#@Sigma1[2] | ||
| 461 | ___ | ||
| 462 | $code.=<<___; | ||
| 463 | vld1.64 {$K},[$Ktbl,:64]! @ K[i++] | ||
| 464 | vsli.64 $t0,$e,#`64-@Sigma1[0]` | ||
| 465 | vsli.64 $t1,$e,#`64-@Sigma1[1]` | ||
| 466 | vsli.64 $t2,$e,#`64-@Sigma1[2]` | ||
| 467 | #if $i<16 && defined(__ARMEL__) | ||
| 468 | vrev64.8 @X[$i],@X[$i] | ||
| 469 | #endif | ||
| 470 | vadd.i64 $T1,$K,$h | ||
| 471 | veor $Ch,$f,$g | ||
| 472 | veor $t0,$t1 | ||
| 473 | vand $Ch,$e | ||
| 474 | veor $t0,$t2 @ Sigma1(e) | ||
| 475 | veor $Ch,$g @ Ch(e,f,g) | ||
| 476 | vadd.i64 $T1,$t0 | ||
| 477 | vshr.u64 $t0,$a,#@Sigma0[0] | ||
| 478 | vadd.i64 $T1,$Ch | ||
| 479 | vshr.u64 $t1,$a,#@Sigma0[1] | ||
| 480 | vshr.u64 $t2,$a,#@Sigma0[2] | ||
| 481 | vsli.64 $t0,$a,#`64-@Sigma0[0]` | ||
| 482 | vsli.64 $t1,$a,#`64-@Sigma0[1]` | ||
| 483 | vsli.64 $t2,$a,#`64-@Sigma0[2]` | ||
| 484 | vadd.i64 $T1,@X[$i%16] | ||
| 485 | vorr $Maj,$a,$c | ||
| 486 | vand $Ch,$a,$c | ||
| 487 | veor $h,$t0,$t1 | ||
| 488 | vand $Maj,$b | ||
| 489 | veor $h,$t2 @ Sigma0(a) | ||
| 490 | vorr $Maj,$Ch @ Maj(a,b,c) | ||
| 491 | vadd.i64 $h,$T1 | ||
| 492 | vadd.i64 $d,$T1 | ||
| 493 | vadd.i64 $h,$Maj | ||
| 494 | ___ | ||
| 495 | } | ||
| 496 | |||
| 497 | sub NEON_16_79() { | ||
| 498 | my $i=shift; | ||
| 499 | |||
| 500 | if ($i&1) { &NEON_00_15($i,@_); return; } | ||
| 501 | |||
| 502 | # 2x-vectorized, therefore runs every 2nd round | ||
| 503 | my @X=map("q$_",(0..7)); # view @X as 128-bit vector | ||
| 504 | my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps | ||
| 505 | my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 | ||
| 506 | my $e=@_[4]; # $e from NEON_00_15 | ||
| 507 | $i /= 2; | ||
| 508 | $code.=<<___; | ||
| 509 | vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] | ||
| 510 | vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] | ||
| 511 | vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] | ||
| 512 | vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` | ||
| 513 | vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] | ||
| 514 | vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` | ||
| 515 | veor $s1,$t0 | ||
| 516 | vshr.u64 $t0,$s0,#@sigma0[0] | ||
| 517 | veor $s1,$t1 @ sigma1(X[i+14]) | ||
| 518 | vshr.u64 $t1,$s0,#@sigma0[1] | ||
| 519 | vadd.i64 @X[$i%8],$s1 | ||
| 520 | vshr.u64 $s1,$s0,#@sigma0[2] | ||
| 521 | vsli.64 $t0,$s0,#`64-@sigma0[0]` | ||
| 522 | vsli.64 $t1,$s0,#`64-@sigma0[1]` | ||
| 523 | vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] | ||
| 524 | veor $s1,$t0 | ||
| 525 | vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 | ||
| 526 | vadd.i64 @X[$i%8],$s0 | ||
| 527 | vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 | ||
| 528 | veor $s1,$t1 @ sigma0(X[i+1]) | ||
| 529 | vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 | ||
| 530 | vadd.i64 @X[$i%8],$s1 | ||
| 531 | ___ | ||
| 532 | &NEON_00_15(2*$i,@_); | ||
| 533 | } | ||
| 534 | |||
| 535 | $code.=<<___; | ||
| 536 | #if __ARM_ARCH__>=7 | ||
| 537 | .fpu neon | ||
| 538 | |||
| 539 | .align 4 | ||
| 540 | .LNEON: | ||
| 541 | dmb @ errata #451034 on early Cortex A8 | ||
| 542 | vstmdb sp!,{d8-d15} @ ABI specification says so | ||
| 543 | sub $Ktbl,r3,#672 @ K512 | ||
| 544 | vldmia $ctx,{$A-$H} @ load context | ||
| 545 | .Loop_neon: | ||
| 546 | ___ | ||
| 547 | for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } | ||
| 548 | $code.=<<___; | ||
| 549 | mov $cnt,#4 | ||
| 550 | .L16_79_neon: | ||
| 551 | subs $cnt,#1 | ||
| 552 | ___ | ||
| 553 | for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } | ||
| 554 | $code.=<<___; | ||
| 555 | bne .L16_79_neon | ||
| 556 | |||
| 557 | vldmia $ctx,{d24-d31} @ load context to temp | ||
| 558 | vadd.i64 q8,q12 @ vectorized accumulate | ||
| 559 | vadd.i64 q9,q13 | ||
| 560 | vadd.i64 q10,q14 | ||
| 561 | vadd.i64 q11,q15 | ||
| 562 | vstmia $ctx,{$A-$H} @ save context | ||
| 563 | teq $inp,$len | ||
| 564 | sub $Ktbl,#640 @ rewind K512 | ||
| 565 | bne .Loop_neon | ||
| 566 | |||
| 567 | vldmia sp!,{d8-d15} @ epilogue | ||
| 568 | bx lr | ||
| 569 | #endif | ||
| 570 | ___ | ||
| 571 | } | ||
| 572 | $code.=<<___; | ||
| 573 | .size sha512_block_data_order,.-sha512_block_data_order | ||
| 574 | .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 397 | .align 2 | 575 | .align 2 |
| 576 | .comm OPENSSL_armcap_P,4,4 | ||
| 398 | ___ | 577 | ___ |
| 399 | 578 | ||
| 400 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 579 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/sha/asm/sha512-mips.pl b/src/lib/libcrypto/sha/asm/sha512-mips.pl new file mode 100644 index 0000000000..ba5b250890 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha512-mips.pl | |||
| @@ -0,0 +1,455 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # SHA2 block procedures for MIPS. | ||
| 11 | |||
| 12 | # October 2010. | ||
| 13 | # | ||
| 14 | # SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc- | ||
| 15 | # generated code in o32 build and ~55% in n32/64 build. SHA512 [which | ||
| 16 | # for now can only be compiled for MIPS64 ISA] improvement is modest | ||
| 17 | # ~17%, but it comes for free, because it's same instruction sequence. | ||
| 18 | # Improvement coefficients are for aligned input. | ||
| 19 | |||
| 20 | ###################################################################### | ||
| 21 | # There is a number of MIPS ABI in use, O32 and N32/64 are most | ||
| 22 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
| 23 | # one picks the latter, it's possible to arrange code in ABI neutral | ||
| 24 | # manner. Therefore let's stick to NUBI register layout: | ||
| 25 | # | ||
| 26 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
| 27 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 28 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
| 29 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
| 30 | # | ||
| 31 | # The return value is placed in $a0. Following coding rules facilitate | ||
| 32 | # interoperability: | ||
| 33 | # | ||
| 34 | # - never ever touch $tp, "thread pointer", former $gp [o32 can be | ||
| 35 | # excluded from the rule, because it's specified volatile]; | ||
| 36 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
| 37 | # old code]; | ||
| 38 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
| 39 | # | ||
| 40 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
| 41 | # | ||
| 42 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
| 43 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 44 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
| 45 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
| 46 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
| 47 | # | ||
| 48 | $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
| 49 | |||
| 50 | if ($flavour =~ /64|n32/i) { | ||
| 51 | $PTR_ADD="dadd"; # incidentally works even on n32 | ||
| 52 | $PTR_SUB="dsub"; # incidentally works even on n32 | ||
| 53 | $REG_S="sd"; | ||
| 54 | $REG_L="ld"; | ||
| 55 | $PTR_SLL="dsll"; # incidentally works even on n32 | ||
| 56 | $SZREG=8; | ||
| 57 | } else { | ||
| 58 | $PTR_ADD="add"; | ||
| 59 | $PTR_SUB="sub"; | ||
| 60 | $REG_S="sw"; | ||
| 61 | $REG_L="lw"; | ||
| 62 | $PTR_SLL="sll"; | ||
| 63 | $SZREG=4; | ||
| 64 | } | ||
| 65 | $pf = ($flavour =~ /nubi/i) ? $t0 : $t2; | ||
| 66 | # | ||
| 67 | # <appro@openssl.org> | ||
| 68 | # | ||
| 69 | ###################################################################### | ||
| 70 | |||
| 71 | $big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; | ||
| 72 | |||
| 73 | for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } | ||
| 74 | open STDOUT,">$output"; | ||
| 75 | |||
| 76 | if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } | ||
| 77 | |||
| 78 | if ($output =~ /512/) { | ||
| 79 | $label="512"; | ||
| 80 | $SZ=8; | ||
| 81 | $LD="ld"; # load from memory | ||
| 82 | $ST="sd"; # store to memory | ||
| 83 | $SLL="dsll"; # shift left logical | ||
| 84 | $SRL="dsrl"; # shift right logical | ||
| 85 | $ADDU="daddu"; | ||
| 86 | @Sigma0=(28,34,39); | ||
| 87 | @Sigma1=(14,18,41); | ||
| 88 | @sigma0=( 7, 1, 8); # right shift first | ||
| 89 | @sigma1=( 6,19,61); # right shift first | ||
| 90 | $lastK=0x817; | ||
| 91 | $rounds=80; | ||
| 92 | } else { | ||
| 93 | $label="256"; | ||
| 94 | $SZ=4; | ||
| 95 | $LD="lw"; # load from memory | ||
| 96 | $ST="sw"; # store to memory | ||
| 97 | $SLL="sll"; # shift left logical | ||
| 98 | $SRL="srl"; # shift right logical | ||
| 99 | $ADDU="addu"; | ||
| 100 | @Sigma0=( 2,13,22); | ||
| 101 | @Sigma1=( 6,11,25); | ||
| 102 | @sigma0=( 3, 7,18); # right shift first | ||
| 103 | @sigma1=(10,17,19); # right shift first | ||
| 104 | $lastK=0x8f2; | ||
| 105 | $rounds=64; | ||
| 106 | } | ||
| 107 | |||
| 108 | $MSB = $big_endian ? 0 : ($SZ-1); | ||
| 109 | $LSB = ($SZ-1)&~$MSB; | ||
| 110 | |||
| 111 | @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31)); | ||
| 112 | @X=map("\$$_",(8..23)); | ||
| 113 | |||
| 114 | $ctx=$a0; | ||
| 115 | $inp=$a1; | ||
| 116 | $len=$a2; $Ktbl=$len; | ||
| 117 | |||
| 118 | sub BODY_00_15 { | ||
| 119 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; | ||
| 120 | my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]); | ||
| 121 | |||
| 122 | $code.=<<___ if ($i<15); | ||
| 123 | ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp) | ||
| 124 | ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp) | ||
| 125 | ___ | ||
| 126 | $code.=<<___ if (!$big_endian && $i<16 && $SZ==4); | ||
| 127 | srl $tmp0,@X[0],24 # byte swap($i) | ||
| 128 | srl $tmp1,@X[0],8 | ||
| 129 | andi $tmp2,@X[0],0xFF00 | ||
| 130 | sll @X[0],@X[0],24 | ||
| 131 | andi $tmp1,0xFF00 | ||
| 132 | sll $tmp2,$tmp2,8 | ||
| 133 | or @X[0],$tmp0 | ||
| 134 | or $tmp1,$tmp2 | ||
| 135 | or @X[0],$tmp1 | ||
| 136 | ___ | ||
| 137 | $code.=<<___ if (!$big_endian && $i<16 && $SZ==8); | ||
| 138 | ori $tmp0,$zero,0xFF | ||
| 139 | dsll $tmp2,$tmp0,32 | ||
| 140 | or $tmp0,$tmp2 # 0x000000FF000000FF | ||
| 141 | and $tmp1,@X[0],$tmp0 # byte swap($i) | ||
| 142 | dsrl $tmp2,@X[0],24 | ||
| 143 | dsll $tmp1,24 | ||
| 144 | and $tmp2,$tmp0 | ||
| 145 | dsll $tmp0,8 # 0x0000FF000000FF00 | ||
| 146 | or $tmp1,$tmp2 | ||
| 147 | and $tmp2,@X[0],$tmp0 | ||
| 148 | dsrl @X[0],8 | ||
| 149 | dsll $tmp2,8 | ||
| 150 | and @X[0],$tmp0 | ||
| 151 | or $tmp1,$tmp2 | ||
| 152 | or @X[0],$tmp1 | ||
| 153 | dsrl $tmp1,@X[0],32 | ||
| 154 | dsll @X[0],32 | ||
| 155 | or @X[0],$tmp1 | ||
| 156 | ___ | ||
| 157 | $code.=<<___; | ||
| 158 | $ADDU $T1,$X[0],$h # $i | ||
| 159 | $SRL $h,$e,@Sigma1[0] | ||
| 160 | xor $tmp2,$f,$g | ||
| 161 | $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]` | ||
| 162 | and $tmp2,$e | ||
| 163 | $SRL $tmp0,$e,@Sigma1[1] | ||
| 164 | xor $h,$tmp1 | ||
| 165 | $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]` | ||
| 166 | xor $h,$tmp0 | ||
| 167 | $SRL $tmp0,$e,@Sigma1[2] | ||
| 168 | xor $h,$tmp1 | ||
| 169 | $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]` | ||
| 170 | xor $h,$tmp0 | ||
| 171 | xor $tmp2,$g # Ch(e,f,g) | ||
| 172 | xor $tmp0,$tmp1,$h # Sigma1(e) | ||
| 173 | |||
| 174 | $SRL $h,$a,@Sigma0[0] | ||
| 175 | $ADDU $T1,$tmp2 | ||
| 176 | $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i] | ||
| 177 | $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]` | ||
| 178 | $ADDU $T1,$tmp0 | ||
| 179 | $SRL $tmp0,$a,@Sigma0[1] | ||
| 180 | xor $h,$tmp1 | ||
| 181 | $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]` | ||
| 182 | xor $h,$tmp0 | ||
| 183 | $SRL $tmp0,$a,@Sigma0[2] | ||
| 184 | xor $h,$tmp1 | ||
| 185 | $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]` | ||
| 186 | xor $h,$tmp0 | ||
| 187 | $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer | ||
| 188 | xor $h,$tmp1 # Sigma0(a) | ||
| 189 | |||
| 190 | or $tmp0,$a,$b | ||
| 191 | and $tmp1,$a,$b | ||
| 192 | and $tmp0,$c | ||
| 193 | or $tmp1,$tmp0 # Maj(a,b,c) | ||
| 194 | $ADDU $T1,$tmp2 # +=K[$i] | ||
| 195 | $ADDU $h,$tmp1 | ||
| 196 | |||
| 197 | $ADDU $d,$T1 | ||
| 198 | $ADDU $h,$T1 | ||
| 199 | ___ | ||
| 200 | $code.=<<___ if ($i>=13); | ||
| 201 | $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer | ||
| 202 | ___ | ||
| 203 | } | ||
| 204 | |||
| 205 | sub BODY_16_XX { | ||
| 206 | my $i=@_[0]; | ||
| 207 | my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]); | ||
| 208 | |||
| 209 | $code.=<<___; | ||
| 210 | $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i) | ||
| 211 | $ADDU @X[0],@X[9] # +=X[i+9] | ||
| 212 | $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]` | ||
| 213 | $SRL $tmp0,@X[1],@sigma0[1] | ||
| 214 | xor $tmp2,$tmp1 | ||
| 215 | $SLL $tmp1,`@sigma0[2]-@sigma0[1]` | ||
| 216 | xor $tmp2,$tmp0 | ||
| 217 | $SRL $tmp0,@X[1],@sigma0[2] | ||
| 218 | xor $tmp2,$tmp1 | ||
| 219 | |||
| 220 | $SRL $tmp3,@X[14],@sigma1[0] | ||
| 221 | xor $tmp2,$tmp0 # sigma0(X[i+1]) | ||
| 222 | $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]` | ||
| 223 | $ADDU @X[0],$tmp2 | ||
| 224 | $SRL $tmp0,@X[14],@sigma1[1] | ||
| 225 | xor $tmp3,$tmp1 | ||
| 226 | $SLL $tmp1,`@sigma1[2]-@sigma1[1]` | ||
| 227 | xor $tmp3,$tmp0 | ||
| 228 | $SRL $tmp0,@X[14],@sigma1[2] | ||
| 229 | xor $tmp3,$tmp1 | ||
| 230 | |||
| 231 | xor $tmp3,$tmp0 # sigma1(X[i+14]) | ||
| 232 | $ADDU @X[0],$tmp3 | ||
| 233 | ___ | ||
| 234 | &BODY_00_15(@_); | ||
| 235 | } | ||
| 236 | |||
| 237 | $FRAMESIZE=16*$SZ+16*$SZREG; | ||
| 238 | $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; | ||
| 239 | |||
| 240 | $code.=<<___; | ||
| 241 | #ifdef OPENSSL_FIPSCANISTER | ||
| 242 | # include <openssl/fipssyms.h> | ||
| 243 | #endif | ||
| 244 | |||
| 245 | .text | ||
| 246 | .set noat | ||
| 247 | #if !defined(__vxworks) || defined(__pic__) | ||
| 248 | .option pic2 | ||
| 249 | #endif | ||
| 250 | |||
| 251 | .align 5 | ||
| 252 | .globl sha${label}_block_data_order | ||
| 253 | .ent sha${label}_block_data_order | ||
| 254 | sha${label}_block_data_order: | ||
| 255 | .frame $sp,$FRAMESIZE,$ra | ||
| 256 | .mask $SAVED_REGS_MASK,-$SZREG | ||
| 257 | .set noreorder | ||
| 258 | ___ | ||
| 259 | $code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification | ||
| 260 | .cpload $pf | ||
| 261 | ___ | ||
| 262 | $code.=<<___; | ||
| 263 | $PTR_SUB $sp,$FRAMESIZE | ||
| 264 | $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 265 | $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 266 | $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) | ||
| 267 | $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) | ||
| 268 | $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) | ||
| 269 | $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) | ||
| 270 | $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) | ||
| 271 | $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) | ||
| 272 | $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) | ||
| 273 | $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) | ||
| 274 | ___ | ||
| 275 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
| 276 | $REG_S $s3,$FRAMESIZE-11*$SZREG($sp) | ||
| 277 | $REG_S $s2,$FRAMESIZE-12*$SZREG($sp) | ||
| 278 | $REG_S $s1,$FRAMESIZE-13*$SZREG($sp) | ||
| 279 | $REG_S $s0,$FRAMESIZE-14*$SZREG($sp) | ||
| 280 | $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 281 | ___ | ||
| 282 | $code.=<<___; | ||
| 283 | $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)` | ||
| 284 | ___ | ||
| 285 | $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification | ||
| 286 | .cplocal $Ktbl | ||
| 287 | .cpsetup $pf,$zero,sha${label}_block_data_order | ||
| 288 | ___ | ||
| 289 | $code.=<<___; | ||
| 290 | .set reorder | ||
| 291 | la $Ktbl,K${label} # PIC-ified 'load address' | ||
| 292 | |||
| 293 | $LD $A,0*$SZ($ctx) # load context | ||
| 294 | $LD $B,1*$SZ($ctx) | ||
| 295 | $LD $C,2*$SZ($ctx) | ||
| 296 | $LD $D,3*$SZ($ctx) | ||
| 297 | $LD $E,4*$SZ($ctx) | ||
| 298 | $LD $F,5*$SZ($ctx) | ||
| 299 | $LD $G,6*$SZ($ctx) | ||
| 300 | $LD $H,7*$SZ($ctx) | ||
| 301 | |||
| 302 | $PTR_ADD @X[15],$inp # pointer to the end of input | ||
| 303 | $REG_S @X[15],16*$SZ($sp) | ||
| 304 | b .Loop | ||
| 305 | |||
| 306 | .align 5 | ||
| 307 | .Loop: | ||
| 308 | ${LD}l @X[0],$MSB($inp) | ||
| 309 | ${LD}r @X[0],$LSB($inp) | ||
| 310 | ___ | ||
| 311 | for ($i=0;$i<16;$i++) | ||
| 312 | { &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } | ||
| 313 | $code.=<<___; | ||
| 314 | b .L16_xx | ||
| 315 | .align 4 | ||
| 316 | .L16_xx: | ||
| 317 | ___ | ||
| 318 | for (;$i<32;$i++) | ||
| 319 | { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } | ||
| 320 | $code.=<<___; | ||
| 321 | and @X[6],0xfff | ||
| 322 | li @X[7],$lastK | ||
| 323 | .set noreorder | ||
| 324 | bne @X[6],@X[7],.L16_xx | ||
| 325 | $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16 | ||
| 326 | |||
| 327 | $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input | ||
| 328 | $LD @X[0],0*$SZ($ctx) | ||
| 329 | $LD @X[1],1*$SZ($ctx) | ||
| 330 | $LD @X[2],2*$SZ($ctx) | ||
| 331 | $PTR_ADD $inp,16*$SZ | ||
| 332 | $LD @X[3],3*$SZ($ctx) | ||
| 333 | $ADDU $A,@X[0] | ||
| 334 | $LD @X[4],4*$SZ($ctx) | ||
| 335 | $ADDU $B,@X[1] | ||
| 336 | $LD @X[5],5*$SZ($ctx) | ||
| 337 | $ADDU $C,@X[2] | ||
| 338 | $LD @X[6],6*$SZ($ctx) | ||
| 339 | $ADDU $D,@X[3] | ||
| 340 | $LD @X[7],7*$SZ($ctx) | ||
| 341 | $ADDU $E,@X[4] | ||
| 342 | $ST $A,0*$SZ($ctx) | ||
| 343 | $ADDU $F,@X[5] | ||
| 344 | $ST $B,1*$SZ($ctx) | ||
| 345 | $ADDU $G,@X[6] | ||
| 346 | $ST $C,2*$SZ($ctx) | ||
| 347 | $ADDU $H,@X[7] | ||
| 348 | $ST $D,3*$SZ($ctx) | ||
| 349 | $ST $E,4*$SZ($ctx) | ||
| 350 | $ST $F,5*$SZ($ctx) | ||
| 351 | $ST $G,6*$SZ($ctx) | ||
| 352 | $ST $H,7*$SZ($ctx) | ||
| 353 | |||
| 354 | bnel $inp,@X[15],.Loop | ||
| 355 | $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl | ||
| 356 | |||
| 357 | $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 358 | $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 359 | $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) | ||
| 360 | $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) | ||
| 361 | $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) | ||
| 362 | $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) | ||
| 363 | $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) | ||
| 364 | $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) | ||
| 365 | $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) | ||
| 366 | $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) | ||
| 367 | ___ | ||
| 368 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 369 | $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) | ||
| 370 | $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) | ||
| 371 | $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) | ||
| 372 | $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) | ||
| 373 | $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 374 | ___ | ||
| 375 | $code.=<<___; | ||
| 376 | jr $ra | ||
| 377 | $PTR_ADD $sp,$FRAMESIZE | ||
| 378 | .end sha${label}_block_data_order | ||
| 379 | |||
| 380 | .rdata | ||
| 381 | .align 5 | ||
| 382 | K${label}: | ||
| 383 | ___ | ||
| 384 | if ($SZ==4) { | ||
| 385 | $code.=<<___; | ||
| 386 | .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 | ||
| 387 | .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 | ||
| 388 | .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 | ||
| 389 | .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 | ||
| 390 | .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc | ||
| 391 | .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da | ||
| 392 | .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 | ||
| 393 | .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 | ||
| 394 | .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 | ||
| 395 | .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 | ||
| 396 | .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 | ||
| 397 | .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 | ||
| 398 | .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 | ||
| 399 | .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 | ||
| 400 | .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 | ||
| 401 | .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 | ||
| 402 | ___ | ||
| 403 | } else { | ||
| 404 | $code.=<<___; | ||
| 405 | .dword 0x428a2f98d728ae22, 0x7137449123ef65cd | ||
| 406 | .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc | ||
| 407 | .dword 0x3956c25bf348b538, 0x59f111f1b605d019 | ||
| 408 | .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 | ||
| 409 | .dword 0xd807aa98a3030242, 0x12835b0145706fbe | ||
| 410 | .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 | ||
| 411 | .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 | ||
| 412 | .dword 0x9bdc06a725c71235, 0xc19bf174cf692694 | ||
| 413 | .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 | ||
| 414 | .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 | ||
| 415 | .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 | ||
| 416 | .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 | ||
| 417 | .dword 0x983e5152ee66dfab, 0xa831c66d2db43210 | ||
| 418 | .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4 | ||
| 419 | .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725 | ||
| 420 | .dword 0x06ca6351e003826f, 0x142929670a0e6e70 | ||
| 421 | .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926 | ||
| 422 | .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df | ||
| 423 | .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8 | ||
| 424 | .dword 0x81c2c92e47edaee6, 0x92722c851482353b | ||
| 425 | .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001 | ||
| 426 | .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30 | ||
| 427 | .dword 0xd192e819d6ef5218, 0xd69906245565a910 | ||
| 428 | .dword 0xf40e35855771202a, 0x106aa07032bbd1b8 | ||
| 429 | .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 | ||
| 430 | .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 | ||
| 431 | .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb | ||
| 432 | .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 | ||
| 433 | .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60 | ||
| 434 | .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec | ||
| 435 | .dword 0x90befffa23631e28, 0xa4506cebde82bde9 | ||
| 436 | .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b | ||
| 437 | .dword 0xca273eceea26619c, 0xd186b8c721c0c207 | ||
| 438 | .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 | ||
| 439 | .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6 | ||
| 440 | .dword 0x113f9804bef90dae, 0x1b710b35131c471b | ||
| 441 | .dword 0x28db77f523047d84, 0x32caab7b40c72493 | ||
| 442 | .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c | ||
| 443 | .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a | ||
| 444 | .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 | ||
| 445 | ___ | ||
| 446 | } | ||
| 447 | $code.=<<___; | ||
| 448 | .asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 449 | .align 5 | ||
| 450 | |||
| 451 | ___ | ||
| 452 | |||
| 453 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 454 | print $code; | ||
| 455 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha512-parisc.pl b/src/lib/libcrypto/sha/asm/sha512-parisc.pl new file mode 100755 index 0000000000..e24ee58ae9 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha512-parisc.pl | |||
| @@ -0,0 +1,791 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # SHA256/512 block procedure for PA-RISC. | ||
| 11 | |||
| 12 | # June 2009. | ||
| 13 | # | ||
| 14 | # SHA256 performance is >75% better than gcc 3.2 generated code on | ||
| 15 | # PA-7100LC. Compared to code generated by vendor compiler this | ||
| 16 | # implementation is almost 70% faster in 64-bit build, but delivers | ||
| 17 | # virtually same performance in 32-bit build on PA-8600. | ||
| 18 | # | ||
| 19 | # SHA512 performance is >2.9x better than gcc 3.2 generated code on | ||
| 20 | # PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the | ||
| 21 | # code is executed on PA-RISC 2.0 processor and switches to 64-bit | ||
| 22 | # code path delivering adequate peformance even in "blended" 32-bit | ||
| 23 | # build. Though 64-bit code is not any faster than code generated by | ||
| 24 | # vendor compiler on PA-8600... | ||
| 25 | # | ||
| 26 | # Special thanks to polarhome.com for providing HP-UX account. | ||
| 27 | |||
| 28 | $flavour = shift; | ||
| 29 | $output = shift; | ||
| 30 | open STDOUT,">$output"; | ||
| 31 | |||
| 32 | if ($flavour =~ /64/) { | ||
| 33 | $LEVEL ="2.0W"; | ||
| 34 | $SIZE_T =8; | ||
| 35 | $FRAME_MARKER =80; | ||
| 36 | $SAVED_RP =16; | ||
| 37 | $PUSH ="std"; | ||
| 38 | $PUSHMA ="std,ma"; | ||
| 39 | $POP ="ldd"; | ||
| 40 | $POPMB ="ldd,mb"; | ||
| 41 | } else { | ||
| 42 | $LEVEL ="1.0"; | ||
| 43 | $SIZE_T =4; | ||
| 44 | $FRAME_MARKER =48; | ||
| 45 | $SAVED_RP =20; | ||
| 46 | $PUSH ="stw"; | ||
| 47 | $PUSHMA ="stwm"; | ||
| 48 | $POP ="ldw"; | ||
| 49 | $POPMB ="ldwm"; | ||
| 50 | } | ||
| 51 | |||
| 52 | if ($output =~ /512/) { | ||
| 53 | $func="sha512_block_data_order"; | ||
| 54 | $SZ=8; | ||
| 55 | @Sigma0=(28,34,39); | ||
| 56 | @Sigma1=(14,18,41); | ||
| 57 | @sigma0=(1, 8, 7); | ||
| 58 | @sigma1=(19,61, 6); | ||
| 59 | $rounds=80; | ||
| 60 | $LAST10BITS=0x017; | ||
| 61 | $LD="ldd"; | ||
| 62 | $LDM="ldd,ma"; | ||
| 63 | $ST="std"; | ||
| 64 | } else { | ||
| 65 | $func="sha256_block_data_order"; | ||
| 66 | $SZ=4; | ||
| 67 | @Sigma0=( 2,13,22); | ||
| 68 | @Sigma1=( 6,11,25); | ||
| 69 | @sigma0=( 7,18, 3); | ||
| 70 | @sigma1=(17,19,10); | ||
| 71 | $rounds=64; | ||
| 72 | $LAST10BITS=0x0f2; | ||
| 73 | $LD="ldw"; | ||
| 74 | $LDM="ldwm"; | ||
| 75 | $ST="stw"; | ||
| 76 | } | ||
| 77 | |||
| 78 | $FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker | ||
| 79 | # [+ argument transfer] | ||
| 80 | $XOFF=16*$SZ+32; # local variables | ||
| 81 | $FRAME+=$XOFF; | ||
| 82 | $XOFF+=$FRAME_MARKER; # distance between %sp and local variables | ||
| 83 | |||
| 84 | $ctx="%r26"; # zapped by $a0 | ||
| 85 | $inp="%r25"; # zapped by $a1 | ||
| 86 | $num="%r24"; # zapped by $t0 | ||
| 87 | |||
| 88 | $a0 ="%r26"; | ||
| 89 | $a1 ="%r25"; | ||
| 90 | $t0 ="%r24"; | ||
| 91 | $t1 ="%r29"; | ||
| 92 | $Tbl="%r31"; | ||
| 93 | |||
| 94 | @V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28"); | ||
| 95 | |||
| 96 | @X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", | ||
| 97 | "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp); | ||
| 98 | |||
| 99 | sub ROUND_00_15 { | ||
| 100 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; | ||
| 101 | $code.=<<___; | ||
| 102 | _ror $e,$Sigma1[0],$a0 | ||
| 103 | and $f,$e,$t0 | ||
| 104 | _ror $e,$Sigma1[1],$a1 | ||
| 105 | addl $t1,$h,$h | ||
| 106 | andcm $g,$e,$t1 | ||
| 107 | xor $a1,$a0,$a0 | ||
| 108 | _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1 | ||
| 109 | or $t0,$t1,$t1 ; Ch(e,f,g) | ||
| 110 | addl @X[$i%16],$h,$h | ||
| 111 | xor $a0,$a1,$a1 ; Sigma1(e) | ||
| 112 | addl $t1,$h,$h | ||
| 113 | _ror $a,$Sigma0[0],$a0 | ||
| 114 | addl $a1,$h,$h | ||
| 115 | |||
| 116 | _ror $a,$Sigma0[1],$a1 | ||
| 117 | and $a,$b,$t0 | ||
| 118 | and $a,$c,$t1 | ||
| 119 | xor $a1,$a0,$a0 | ||
| 120 | _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1 | ||
| 121 | xor $t1,$t0,$t0 | ||
| 122 | and $b,$c,$t1 | ||
| 123 | xor $a0,$a1,$a1 ; Sigma0(a) | ||
| 124 | addl $h,$d,$d | ||
| 125 | xor $t1,$t0,$t0 ; Maj(a,b,c) | ||
| 126 | `"$LDM $SZ($Tbl),$t1" if ($i<15)` | ||
| 127 | addl $a1,$h,$h | ||
| 128 | addl $t0,$h,$h | ||
| 129 | |||
| 130 | ___ | ||
| 131 | } | ||
| 132 | |||
| 133 | sub ROUND_16_xx { | ||
| 134 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; | ||
| 135 | $i-=16; | ||
| 136 | $code.=<<___; | ||
| 137 | _ror @X[($i+1)%16],$sigma0[0],$a0 | ||
| 138 | _ror @X[($i+1)%16],$sigma0[1],$a1 | ||
| 139 | addl @X[($i+9)%16],@X[$i],@X[$i] | ||
| 140 | _ror @X[($i+14)%16],$sigma1[0],$t0 | ||
| 141 | _ror @X[($i+14)%16],$sigma1[1],$t1 | ||
| 142 | xor $a1,$a0,$a0 | ||
| 143 | _shr @X[($i+1)%16],$sigma0[2],$a1 | ||
| 144 | xor $t1,$t0,$t0 | ||
| 145 | _shr @X[($i+14)%16],$sigma1[2],$t1 | ||
| 146 | xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f]) | ||
| 147 | xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f]) | ||
| 148 | $LDM $SZ($Tbl),$t1 | ||
| 149 | addl $a0,@X[$i],@X[$i] | ||
| 150 | addl $t0,@X[$i],@X[$i] | ||
| 151 | ___ | ||
| 152 | $code.=<<___ if ($i==15); | ||
| 153 | extru $t1,31,10,$a1 | ||
| 154 | comiclr,<> $LAST10BITS,$a1,%r0 | ||
| 155 | ldo 1($Tbl),$Tbl ; signal end of $Tbl | ||
| 156 | ___ | ||
| 157 | &ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h); | ||
| 158 | } | ||
| 159 | |||
| 160 | $code=<<___; | ||
| 161 | .LEVEL $LEVEL | ||
| 162 | .SPACE \$TEXT\$ | ||
| 163 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 164 | |||
| 165 | .ALIGN 64 | ||
| 166 | L\$table | ||
| 167 | ___ | ||
| 168 | $code.=<<___ if ($SZ==8); | ||
| 169 | .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd | ||
| 170 | .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc | ||
| 171 | .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019 | ||
| 172 | .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118 | ||
| 173 | .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe | ||
| 174 | .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2 | ||
| 175 | .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1 | ||
| 176 | .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694 | ||
| 177 | .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3 | ||
| 178 | .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65 | ||
| 179 | .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483 | ||
| 180 | .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5 | ||
| 181 | .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210 | ||
| 182 | .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4 | ||
| 183 | .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725 | ||
| 184 | .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70 | ||
| 185 | .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926 | ||
| 186 | .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df | ||
| 187 | .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8 | ||
| 188 | .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b | ||
| 189 | .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001 | ||
| 190 | .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30 | ||
| 191 | .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910 | ||
| 192 | .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8 | ||
| 193 | .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53 | ||
| 194 | .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8 | ||
| 195 | .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb | ||
| 196 | .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3 | ||
| 197 | .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60 | ||
| 198 | .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec | ||
| 199 | .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9 | ||
| 200 | .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b | ||
| 201 | .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207 | ||
| 202 | .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178 | ||
| 203 | .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6 | ||
| 204 | .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b | ||
| 205 | .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493 | ||
| 206 | .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c | ||
| 207 | .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a | ||
| 208 | .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817 | ||
| 209 | ___ | ||
| 210 | $code.=<<___ if ($SZ==4); | ||
| 211 | .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
| 212 | .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
| 213 | .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
| 214 | .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
| 215 | .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
| 216 | .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
| 217 | .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
| 218 | .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
| 219 | .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
| 220 | .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
| 221 | .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
| 222 | .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
| 223 | .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
| 224 | .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
| 225 | .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
| 226 | .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
| 227 | ___ | ||
| 228 | $code.=<<___; | ||
| 229 | |||
| 230 | .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
| 231 | .ALIGN 64 | ||
| 232 | $func | ||
| 233 | .PROC | ||
| 234 | .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 | ||
| 235 | .ENTRY | ||
| 236 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 237 | $PUSHMA %r3,$FRAME(%sp) | ||
| 238 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 239 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 240 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 241 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 242 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 243 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 244 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 245 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
| 246 | $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) | ||
| 247 | $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) | ||
| 248 | $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) | ||
| 249 | $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) | ||
| 250 | $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) | ||
| 251 | $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) | ||
| 252 | $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) | ||
| 253 | |||
| 254 | _shl $num,`log(16*$SZ)/log(2)`,$num | ||
| 255 | addl $inp,$num,$num ; $num to point at the end of $inp | ||
| 256 | |||
| 257 | $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments | ||
| 258 | $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) | ||
| 259 | $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp) | ||
| 260 | |||
| 261 | blr %r0,$Tbl | ||
| 262 | ldi 3,$t1 | ||
| 263 | L\$pic | ||
| 264 | andcm $Tbl,$t1,$Tbl ; wipe privilege level | ||
| 265 | ldo L\$table-L\$pic($Tbl),$Tbl | ||
| 266 | ___ | ||
| 267 | $code.=<<___ if ($SZ==8 && $SIZE_T==4); | ||
| 268 | ldi 31,$t1 | ||
| 269 | mtctl $t1,%cr11 | ||
| 270 | extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0 | ||
| 271 | b L\$parisc1 | ||
| 272 | nop | ||
| 273 | ___ | ||
| 274 | $code.=<<___; | ||
| 275 | $LD `0*$SZ`($ctx),$A ; load context | ||
| 276 | $LD `1*$SZ`($ctx),$B | ||
| 277 | $LD `2*$SZ`($ctx),$C | ||
| 278 | $LD `3*$SZ`($ctx),$D | ||
| 279 | $LD `4*$SZ`($ctx),$E | ||
| 280 | $LD `5*$SZ`($ctx),$F | ||
| 281 | $LD `6*$SZ`($ctx),$G | ||
| 282 | $LD `7*$SZ`($ctx),$H | ||
| 283 | |||
| 284 | extru $inp,31,`log($SZ)/log(2)`,$t0 | ||
| 285 | sh3addl $t0,%r0,$t0 | ||
| 286 | subi `8*$SZ`,$t0,$t0 | ||
| 287 | mtctl $t0,%cr11 ; load %sar with align factor | ||
| 288 | |||
| 289 | L\$oop | ||
| 290 | ldi `$SZ-1`,$t0 | ||
| 291 | $LDM $SZ($Tbl),$t1 | ||
| 292 | andcm $inp,$t0,$t0 ; align $inp | ||
| 293 | ___ | ||
| 294 | for ($i=0;$i<15;$i++) { # load input block | ||
| 295 | $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; } | ||
| 296 | $code.=<<___; | ||
| 297 | cmpb,*= $inp,$t0,L\$aligned | ||
| 298 | $LD `$SZ*15`($t0),@X[15] | ||
| 299 | $LD `$SZ*16`($t0),@X[16] | ||
| 300 | ___ | ||
| 301 | for ($i=0;$i<16;$i++) { # align data | ||
| 302 | $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; } | ||
| 303 | $code.=<<___; | ||
| 304 | L\$aligned | ||
| 305 | nop ; otherwise /usr/ccs/bin/as is confused by below .WORD | ||
| 306 | ___ | ||
| 307 | |||
| 308 | for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } | ||
| 309 | $code.=<<___; | ||
| 310 | L\$rounds | ||
| 311 | nop ; otherwise /usr/ccs/bin/as is confused by below .WORD | ||
| 312 | ___ | ||
| 313 | for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); } | ||
| 314 | $code.=<<___; | ||
| 315 | bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled? | ||
| 316 | nop | ||
| 317 | |||
| 318 | $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments | ||
| 319 | $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp | ||
| 320 | $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num | ||
| 321 | ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl | ||
| 322 | |||
| 323 | $LD `0*$SZ`($ctx),@X[0] ; load context | ||
| 324 | $LD `1*$SZ`($ctx),@X[1] | ||
| 325 | $LD `2*$SZ`($ctx),@X[2] | ||
| 326 | $LD `3*$SZ`($ctx),@X[3] | ||
| 327 | $LD `4*$SZ`($ctx),@X[4] | ||
| 328 | $LD `5*$SZ`($ctx),@X[5] | ||
| 329 | addl @X[0],$A,$A | ||
| 330 | $LD `6*$SZ`($ctx),@X[6] | ||
| 331 | addl @X[1],$B,$B | ||
| 332 | $LD `7*$SZ`($ctx),@X[7] | ||
| 333 | ldo `16*$SZ`($inp),$inp ; advance $inp | ||
| 334 | |||
| 335 | $ST $A,`0*$SZ`($ctx) ; save context | ||
| 336 | addl @X[2],$C,$C | ||
| 337 | $ST $B,`1*$SZ`($ctx) | ||
| 338 | addl @X[3],$D,$D | ||
| 339 | $ST $C,`2*$SZ`($ctx) | ||
| 340 | addl @X[4],$E,$E | ||
| 341 | $ST $D,`3*$SZ`($ctx) | ||
| 342 | addl @X[5],$F,$F | ||
| 343 | $ST $E,`4*$SZ`($ctx) | ||
| 344 | addl @X[6],$G,$G | ||
| 345 | $ST $F,`5*$SZ`($ctx) | ||
| 346 | addl @X[7],$H,$H | ||
| 347 | $ST $G,`6*$SZ`($ctx) | ||
| 348 | $ST $H,`7*$SZ`($ctx) | ||
| 349 | |||
| 350 | cmpb,*<>,n $inp,$num,L\$oop | ||
| 351 | $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp | ||
| 352 | ___ | ||
| 353 | if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0 | ||
| 354 | {{ | ||
| 355 | $code.=<<___; | ||
| 356 | b L\$done | ||
| 357 | nop | ||
| 358 | |||
| 359 | .ALIGN 64 | ||
| 360 | L\$parisc1 | ||
| 361 | ___ | ||
| 362 | |||
| 363 | @V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo, | ||
| 364 | $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) = | ||
| 365 | ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", | ||
| 366 | "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16"); | ||
| 367 | $a0 ="%r17"; | ||
| 368 | $a1 ="%r18"; | ||
| 369 | $a2 ="%r19"; | ||
| 370 | $a3 ="%r20"; | ||
| 371 | $t0 ="%r21"; | ||
| 372 | $t1 ="%r22"; | ||
| 373 | $t2 ="%r28"; | ||
| 374 | $t3 ="%r29"; | ||
| 375 | $Tbl="%r31"; | ||
| 376 | |||
| 377 | @X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx | ||
| 378 | |||
| 379 | sub ROUND_00_15_pa1 { | ||
| 380 | my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo, | ||
| 381 | $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_; | ||
| 382 | my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; | ||
| 383 | |||
| 384 | $code.=<<___ if (!$flag); | ||
| 385 | ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi | ||
| 386 | ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] | ||
| 387 | ___ | ||
| 388 | $code.=<<___; | ||
| 389 | shd $ehi,$elo,$Sigma1[0],$t0 | ||
| 390 | add $Xlo,$hlo,$hlo | ||
| 391 | shd $elo,$ehi,$Sigma1[0],$t1 | ||
| 392 | addc $Xhi,$hhi,$hhi ; h += X[i] | ||
| 393 | shd $ehi,$elo,$Sigma1[1],$t2 | ||
| 394 | ldwm 8($Tbl),$Xhi | ||
| 395 | shd $elo,$ehi,$Sigma1[1],$t3 | ||
| 396 | ldw -4($Tbl),$Xlo ; load K[i] | ||
| 397 | xor $t2,$t0,$t0 | ||
| 398 | xor $t3,$t1,$t1 | ||
| 399 | and $flo,$elo,$a0 | ||
| 400 | and $fhi,$ehi,$a1 | ||
| 401 | shd $ehi,$elo,$Sigma1[2],$t2 | ||
| 402 | andcm $glo,$elo,$a2 | ||
| 403 | shd $elo,$ehi,$Sigma1[2],$t3 | ||
| 404 | andcm $ghi,$ehi,$a3 | ||
| 405 | xor $t2,$t0,$t0 | ||
| 406 | xor $t3,$t1,$t1 ; Sigma1(e) | ||
| 407 | add $Xlo,$hlo,$hlo | ||
| 408 | xor $a2,$a0,$a0 | ||
| 409 | addc $Xhi,$hhi,$hhi ; h += K[i] | ||
| 410 | xor $a3,$a1,$a1 ; Ch(e,f,g) | ||
| 411 | |||
| 412 | add $t0,$hlo,$hlo | ||
| 413 | shd $ahi,$alo,$Sigma0[0],$t0 | ||
| 414 | addc $t1,$hhi,$hhi ; h += Sigma1(e) | ||
| 415 | shd $alo,$ahi,$Sigma0[0],$t1 | ||
| 416 | add $a0,$hlo,$hlo | ||
| 417 | shd $ahi,$alo,$Sigma0[1],$t2 | ||
| 418 | addc $a1,$hhi,$hhi ; h += Ch(e,f,g) | ||
| 419 | shd $alo,$ahi,$Sigma0[1],$t3 | ||
| 420 | |||
| 421 | xor $t2,$t0,$t0 | ||
| 422 | xor $t3,$t1,$t1 | ||
| 423 | shd $ahi,$alo,$Sigma0[2],$t2 | ||
| 424 | and $alo,$blo,$a0 | ||
| 425 | shd $alo,$ahi,$Sigma0[2],$t3 | ||
| 426 | and $ahi,$bhi,$a1 | ||
| 427 | xor $t2,$t0,$t0 | ||
| 428 | xor $t3,$t1,$t1 ; Sigma0(a) | ||
| 429 | |||
| 430 | and $alo,$clo,$a2 | ||
| 431 | and $ahi,$chi,$a3 | ||
| 432 | xor $a2,$a0,$a0 | ||
| 433 | add $hlo,$dlo,$dlo | ||
| 434 | xor $a3,$a1,$a1 | ||
| 435 | addc $hhi,$dhi,$dhi ; d += h | ||
| 436 | and $blo,$clo,$a2 | ||
| 437 | add $t0,$hlo,$hlo | ||
| 438 | and $bhi,$chi,$a3 | ||
| 439 | addc $t1,$hhi,$hhi ; h += Sigma0(a) | ||
| 440 | xor $a2,$a0,$a0 | ||
| 441 | add $a0,$hlo,$hlo | ||
| 442 | xor $a3,$a1,$a1 ; Maj(a,b,c) | ||
| 443 | addc $a1,$hhi,$hhi ; h += Maj(a,b,c) | ||
| 444 | |||
| 445 | ___ | ||
| 446 | $code.=<<___ if ($i==15 && $flag); | ||
| 447 | extru $Xlo,31,10,$Xlo | ||
| 448 | comiclr,= $LAST10BITS,$Xlo,%r0 | ||
| 449 | b L\$rounds_pa1 | ||
| 450 | nop | ||
| 451 | ___ | ||
| 452 | push(@X,shift(@X)); push(@X,shift(@X)); | ||
| 453 | } | ||
| 454 | |||
| 455 | sub ROUND_16_xx_pa1 { | ||
| 456 | my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; | ||
| 457 | my ($i)=shift; | ||
| 458 | $i-=16; | ||
| 459 | $code.=<<___; | ||
| 460 | ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi | ||
| 461 | ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] | ||
| 462 | ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1 | ||
| 463 | ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9] | ||
| 464 | ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3 | ||
| 465 | ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14] | ||
| 466 | shd $Xnhi,$Xnlo,$sigma0[0],$t0 | ||
| 467 | shd $Xnlo,$Xnhi,$sigma0[0],$t1 | ||
| 468 | add $a0,$Xlo,$Xlo | ||
| 469 | shd $Xnhi,$Xnlo,$sigma0[1],$t2 | ||
| 470 | addc $a1,$Xhi,$Xhi | ||
| 471 | shd $Xnlo,$Xnhi,$sigma0[1],$t3 | ||
| 472 | xor $t2,$t0,$t0 | ||
| 473 | shd $Xnhi,$Xnlo,$sigma0[2],$t2 | ||
| 474 | xor $t3,$t1,$t1 | ||
| 475 | extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3 | ||
| 476 | xor $t2,$t0,$t0 | ||
| 477 | shd $a3,$a2,$sigma1[0],$a0 | ||
| 478 | xor $t3,$t1,$t1 ; sigma0(X[i+1)&0x0f]) | ||
| 479 | shd $a2,$a3,$sigma1[0],$a1 | ||
| 480 | add $t0,$Xlo,$Xlo | ||
| 481 | shd $a3,$a2,$sigma1[1],$t2 | ||
| 482 | addc $t1,$Xhi,$Xhi | ||
| 483 | shd $a2,$a3,$sigma1[1],$t3 | ||
| 484 | xor $t2,$a0,$a0 | ||
| 485 | shd $a3,$a2,$sigma1[2],$t2 | ||
| 486 | xor $t3,$a1,$a1 | ||
| 487 | extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3 | ||
| 488 | xor $t2,$a0,$a0 | ||
| 489 | xor $t3,$a1,$a1 ; sigma0(X[i+14)&0x0f]) | ||
| 490 | add $a0,$Xlo,$Xlo | ||
| 491 | addc $a1,$Xhi,$Xhi | ||
| 492 | |||
| 493 | stw $Xhi,`-$XOFF+8*($i%16)`(%sp) | ||
| 494 | stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp) | ||
| 495 | ___ | ||
| 496 | &ROUND_00_15_pa1($i,@_,1); | ||
| 497 | } | ||
| 498 | $code.=<<___; | ||
| 499 | ldw `0*4`($ctx),$Ahi ; load context | ||
| 500 | ldw `1*4`($ctx),$Alo | ||
| 501 | ldw `2*4`($ctx),$Bhi | ||
| 502 | ldw `3*4`($ctx),$Blo | ||
| 503 | ldw `4*4`($ctx),$Chi | ||
| 504 | ldw `5*4`($ctx),$Clo | ||
| 505 | ldw `6*4`($ctx),$Dhi | ||
| 506 | ldw `7*4`($ctx),$Dlo | ||
| 507 | ldw `8*4`($ctx),$Ehi | ||
| 508 | ldw `9*4`($ctx),$Elo | ||
| 509 | ldw `10*4`($ctx),$Fhi | ||
| 510 | ldw `11*4`($ctx),$Flo | ||
| 511 | ldw `12*4`($ctx),$Ghi | ||
| 512 | ldw `13*4`($ctx),$Glo | ||
| 513 | ldw `14*4`($ctx),$Hhi | ||
| 514 | ldw `15*4`($ctx),$Hlo | ||
| 515 | |||
| 516 | extru $inp,31,2,$t0 | ||
| 517 | sh3addl $t0,%r0,$t0 | ||
| 518 | subi 32,$t0,$t0 | ||
| 519 | mtctl $t0,%cr11 ; load %sar with align factor | ||
| 520 | |||
| 521 | L\$oop_pa1 | ||
| 522 | extru $inp,31,2,$a3 | ||
| 523 | comib,= 0,$a3,L\$aligned_pa1 | ||
| 524 | sub $inp,$a3,$inp | ||
| 525 | |||
| 526 | ldw `0*4`($inp),$X[0] | ||
| 527 | ldw `1*4`($inp),$X[1] | ||
| 528 | ldw `2*4`($inp),$t2 | ||
| 529 | ldw `3*4`($inp),$t3 | ||
| 530 | ldw `4*4`($inp),$a0 | ||
| 531 | ldw `5*4`($inp),$a1 | ||
| 532 | ldw `6*4`($inp),$a2 | ||
| 533 | ldw `7*4`($inp),$a3 | ||
| 534 | vshd $X[0],$X[1],$X[0] | ||
| 535 | vshd $X[1],$t2,$X[1] | ||
| 536 | stw $X[0],`-$XOFF+0*4`(%sp) | ||
| 537 | ldw `8*4`($inp),$t0 | ||
| 538 | vshd $t2,$t3,$t2 | ||
| 539 | stw $X[1],`-$XOFF+1*4`(%sp) | ||
| 540 | ldw `9*4`($inp),$t1 | ||
| 541 | vshd $t3,$a0,$t3 | ||
| 542 | ___ | ||
| 543 | { | ||
| 544 | my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); | ||
| 545 | for ($i=2;$i<=(128/4-8);$i++) { | ||
| 546 | $code.=<<___; | ||
| 547 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
| 548 | ldw `(8+$i)*4`($inp),$t[0] | ||
| 549 | vshd $t[1],$t[2],$t[1] | ||
| 550 | ___ | ||
| 551 | push(@t,shift(@t)); | ||
| 552 | } | ||
| 553 | for (;$i<(128/4-1);$i++) { | ||
| 554 | $code.=<<___; | ||
| 555 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
| 556 | vshd $t[1],$t[2],$t[1] | ||
| 557 | ___ | ||
| 558 | push(@t,shift(@t)); | ||
| 559 | } | ||
| 560 | $code.=<<___; | ||
| 561 | b L\$collected_pa1 | ||
| 562 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
| 563 | |||
| 564 | ___ | ||
| 565 | } | ||
| 566 | $code.=<<___; | ||
| 567 | L\$aligned_pa1 | ||
| 568 | ldw `0*4`($inp),$X[0] | ||
| 569 | ldw `1*4`($inp),$X[1] | ||
| 570 | ldw `2*4`($inp),$t2 | ||
| 571 | ldw `3*4`($inp),$t3 | ||
| 572 | ldw `4*4`($inp),$a0 | ||
| 573 | ldw `5*4`($inp),$a1 | ||
| 574 | ldw `6*4`($inp),$a2 | ||
| 575 | ldw `7*4`($inp),$a3 | ||
| 576 | stw $X[0],`-$XOFF+0*4`(%sp) | ||
| 577 | ldw `8*4`($inp),$t0 | ||
| 578 | stw $X[1],`-$XOFF+1*4`(%sp) | ||
| 579 | ldw `9*4`($inp),$t1 | ||
| 580 | ___ | ||
| 581 | { | ||
| 582 | my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); | ||
| 583 | for ($i=2;$i<(128/4-8);$i++) { | ||
| 584 | $code.=<<___; | ||
| 585 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
| 586 | ldw `(8+$i)*4`($inp),$t[0] | ||
| 587 | ___ | ||
| 588 | push(@t,shift(@t)); | ||
| 589 | } | ||
| 590 | for (;$i<128/4;$i++) { | ||
| 591 | $code.=<<___; | ||
| 592 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
| 593 | ___ | ||
| 594 | push(@t,shift(@t)); | ||
| 595 | } | ||
| 596 | $code.="L\$collected_pa1\n"; | ||
| 597 | } | ||
| 598 | |||
| 599 | for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } | ||
| 600 | $code.="L\$rounds_pa1\n"; | ||
| 601 | for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } | ||
| 602 | |||
| 603 | $code.=<<___; | ||
| 604 | $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments | ||
| 605 | $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp | ||
| 606 | $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num | ||
| 607 | ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl | ||
| 608 | |||
| 609 | ldw `0*4`($ctx),$t1 ; update context | ||
| 610 | ldw `1*4`($ctx),$t0 | ||
| 611 | ldw `2*4`($ctx),$t3 | ||
| 612 | ldw `3*4`($ctx),$t2 | ||
| 613 | ldw `4*4`($ctx),$a1 | ||
| 614 | ldw `5*4`($ctx),$a0 | ||
| 615 | ldw `6*4`($ctx),$a3 | ||
| 616 | add $t0,$Alo,$Alo | ||
| 617 | ldw `7*4`($ctx),$a2 | ||
| 618 | addc $t1,$Ahi,$Ahi | ||
| 619 | ldw `8*4`($ctx),$t1 | ||
| 620 | add $t2,$Blo,$Blo | ||
| 621 | ldw `9*4`($ctx),$t0 | ||
| 622 | addc $t3,$Bhi,$Bhi | ||
| 623 | ldw `10*4`($ctx),$t3 | ||
| 624 | add $a0,$Clo,$Clo | ||
| 625 | ldw `11*4`($ctx),$t2 | ||
| 626 | addc $a1,$Chi,$Chi | ||
| 627 | ldw `12*4`($ctx),$a1 | ||
| 628 | add $a2,$Dlo,$Dlo | ||
| 629 | ldw `13*4`($ctx),$a0 | ||
| 630 | addc $a3,$Dhi,$Dhi | ||
| 631 | ldw `14*4`($ctx),$a3 | ||
| 632 | add $t0,$Elo,$Elo | ||
| 633 | ldw `15*4`($ctx),$a2 | ||
| 634 | addc $t1,$Ehi,$Ehi | ||
| 635 | stw $Ahi,`0*4`($ctx) | ||
| 636 | add $t2,$Flo,$Flo | ||
| 637 | stw $Alo,`1*4`($ctx) | ||
| 638 | addc $t3,$Fhi,$Fhi | ||
| 639 | stw $Bhi,`2*4`($ctx) | ||
| 640 | add $a0,$Glo,$Glo | ||
| 641 | stw $Blo,`3*4`($ctx) | ||
| 642 | addc $a1,$Ghi,$Ghi | ||
| 643 | stw $Chi,`4*4`($ctx) | ||
| 644 | add $a2,$Hlo,$Hlo | ||
| 645 | stw $Clo,`5*4`($ctx) | ||
| 646 | addc $a3,$Hhi,$Hhi | ||
| 647 | stw $Dhi,`6*4`($ctx) | ||
| 648 | ldo `16*$SZ`($inp),$inp ; advance $inp | ||
| 649 | stw $Dlo,`7*4`($ctx) | ||
| 650 | stw $Ehi,`8*4`($ctx) | ||
| 651 | stw $Elo,`9*4`($ctx) | ||
| 652 | stw $Fhi,`10*4`($ctx) | ||
| 653 | stw $Flo,`11*4`($ctx) | ||
| 654 | stw $Ghi,`12*4`($ctx) | ||
| 655 | stw $Glo,`13*4`($ctx) | ||
| 656 | stw $Hhi,`14*4`($ctx) | ||
| 657 | comb,= $inp,$num,L\$done | ||
| 658 | stw $Hlo,`15*4`($ctx) | ||
| 659 | b L\$oop_pa1 | ||
| 660 | $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp | ||
| 661 | L\$done | ||
| 662 | ___ | ||
| 663 | }} | ||
| 664 | $code.=<<___; | ||
| 665 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 666 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 667 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 668 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 669 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 670 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 671 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 672 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 673 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
| 674 | $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 | ||
| 675 | $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 | ||
| 676 | $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 | ||
| 677 | $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 | ||
| 678 | $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 | ||
| 679 | $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 | ||
| 680 | $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 | ||
| 681 | bv (%r2) | ||
| 682 | .EXIT | ||
| 683 | $POPMB -$FRAME(%sp),%r3 | ||
| 684 | .PROCEND | ||
| 685 | .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 686 | ___ | ||
| 687 | |||
| 688 | # Explicitly encode PA-RISC 2.0 instructions used in this module, so | ||
| 689 | # that it can be compiled with .LEVEL 1.0. It should be noted that I | ||
| 690 | # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 | ||
| 691 | # directive... | ||
| 692 | |||
| 693 | my $ldd = sub { | ||
| 694 | my ($mod,$args) = @_; | ||
| 695 | my $orig = "ldd$mod\t$args"; | ||
| 696 | |||
| 697 | if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices | ||
| 698 | { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1); | ||
| 699 | $opcode|=(1<<3) if ($mod =~ /^,m/); | ||
| 700 | $opcode|=(1<<2) if ($mod =~ /^,mb/); | ||
| 701 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 702 | } | ||
| 703 | else { "\t".$orig; } | ||
| 704 | }; | ||
| 705 | |||
| 706 | my $std = sub { | ||
| 707 | my ($mod,$args) = @_; | ||
| 708 | my $orig = "std$mod\t$args"; | ||
| 709 | |||
| 710 | if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices | ||
| 711 | { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); | ||
| 712 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 713 | } | ||
| 714 | else { "\t".$orig; } | ||
| 715 | }; | ||
| 716 | |||
| 717 | my $extrd = sub { | ||
| 718 | my ($mod,$args) = @_; | ||
| 719 | my $orig = "extrd$mod\t$args"; | ||
| 720 | |||
| 721 | # I only have ",u" completer, it's implicitly encoded... | ||
| 722 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 | ||
| 723 | { my $opcode=(0x36<<26)|($1<<21)|($4<<16); | ||
| 724 | my $len=32-$3; | ||
| 725 | $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos | ||
| 726 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
| 727 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 728 | } | ||
| 729 | elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 | ||
| 730 | { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); | ||
| 731 | my $len=32-$2; | ||
| 732 | $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len | ||
| 733 | $opcode |= (1<<13) if ($mod =~ /,\**=/); | ||
| 734 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 735 | } | ||
| 736 | else { "\t".$orig; } | ||
| 737 | }; | ||
| 738 | |||
| 739 | my $shrpd = sub { | ||
| 740 | my ($mod,$args) = @_; | ||
| 741 | my $orig = "shrpd$mod\t$args"; | ||
| 742 | |||
| 743 | if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 | ||
| 744 | { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; | ||
| 745 | my $cpos=63-$3; | ||
| 746 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa | ||
| 747 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 748 | } | ||
| 749 | elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 | ||
| 750 | { sprintf "\t.WORD\t0x%08x\t; %s", | ||
| 751 | (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; | ||
| 752 | } | ||
| 753 | else { "\t".$orig; } | ||
| 754 | }; | ||
| 755 | |||
| 756 | sub assemble { | ||
| 757 | my ($mnemonic,$mod,$args)=@_; | ||
| 758 | my $opcode = eval("\$$mnemonic"); | ||
| 759 | |||
| 760 | ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; | ||
| 761 | } | ||
| 762 | |||
| 763 | foreach (split("\n",$code)) { | ||
| 764 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 765 | |||
| 766 | s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/ | ||
| 767 | $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32 | ||
| 768 | : sprintf("shd\t%$1,%$2,%d",$3)/e or | ||
| 769 | # translate made up instructons: _ror, _shr, _align, _shl | ||
| 770 | s/_ror(\s+)(%r[0-9]+),/ | ||
| 771 | ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or | ||
| 772 | |||
| 773 | s/_shr(\s+%r[0-9]+),([0-9]+),/ | ||
| 774 | $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2) | ||
| 775 | : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or | ||
| 776 | |||
| 777 | s/_align(\s+%r[0-9]+,%r[0-9]+),/ | ||
| 778 | ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or | ||
| 779 | |||
| 780 | s/_shl(\s+%r[0-9]+),([0-9]+),/ | ||
| 781 | $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2) | ||
| 782 | : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e; | ||
| 783 | |||
| 784 | s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4); | ||
| 785 | |||
| 786 | s/cmpb,\*/comb,/ if ($SIZE_T==4); | ||
| 787 | |||
| 788 | print $_,"\n"; | ||
| 789 | } | ||
| 790 | |||
| 791 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl index 768a6a6fad..6b44a68e59 100755 --- a/src/lib/libcrypto/sha/asm/sha512-ppc.pl +++ b/src/lib/libcrypto/sha/asm/sha512-ppc.pl | |||
| @@ -40,6 +40,7 @@ $output =shift; | |||
| 40 | 40 | ||
| 41 | if ($flavour =~ /64/) { | 41 | if ($flavour =~ /64/) { |
| 42 | $SIZE_T=8; | 42 | $SIZE_T=8; |
| 43 | $LRSAVE=2*$SIZE_T; | ||
| 43 | $STU="stdu"; | 44 | $STU="stdu"; |
| 44 | $UCMP="cmpld"; | 45 | $UCMP="cmpld"; |
| 45 | $SHL="sldi"; | 46 | $SHL="sldi"; |
| @@ -47,6 +48,7 @@ if ($flavour =~ /64/) { | |||
| 47 | $PUSH="std"; | 48 | $PUSH="std"; |
| 48 | } elsif ($flavour =~ /32/) { | 49 | } elsif ($flavour =~ /32/) { |
| 49 | $SIZE_T=4; | 50 | $SIZE_T=4; |
| 51 | $LRSAVE=$SIZE_T; | ||
| 50 | $STU="stwu"; | 52 | $STU="stwu"; |
| 51 | $UCMP="cmplw"; | 53 | $UCMP="cmplw"; |
| 52 | $SHL="slwi"; | 54 | $SHL="slwi"; |
| @@ -87,7 +89,8 @@ if ($output =~ /512/) { | |||
| 87 | $SHR="srwi"; | 89 | $SHR="srwi"; |
| 88 | } | 90 | } |
| 89 | 91 | ||
| 90 | $FRAME=32*$SIZE_T; | 92 | $FRAME=32*$SIZE_T+16*$SZ; |
| 93 | $LOCALS=6*$SIZE_T; | ||
| 91 | 94 | ||
| 92 | $sp ="r1"; | 95 | $sp ="r1"; |
| 93 | $toc="r2"; | 96 | $toc="r2"; |
| @@ -179,13 +182,12 @@ $code=<<___; | |||
| 179 | .globl $func | 182 | .globl $func |
| 180 | .align 6 | 183 | .align 6 |
| 181 | $func: | 184 | $func: |
| 185 | $STU $sp,-$FRAME($sp) | ||
| 182 | mflr r0 | 186 | mflr r0 |
| 183 | $STU $sp,`-($FRAME+16*$SZ)`($sp) | ||
| 184 | $SHL $num,$num,`log(16*$SZ)/log(2)` | 187 | $SHL $num,$num,`log(16*$SZ)/log(2)` |
| 185 | 188 | ||
| 186 | $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp) | 189 | $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp) |
| 187 | 190 | ||
| 188 | $PUSH r0,`$FRAME-$SIZE_T*21`($sp) | ||
| 189 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) | 191 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) |
| 190 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) | 192 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) |
| 191 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | 193 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
| @@ -206,6 +208,7 @@ $func: | |||
| 206 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | 208 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
| 207 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | 209 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
| 208 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | 210 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
| 211 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | ||
| 209 | 212 | ||
| 210 | $LD $A,`0*$SZ`($ctx) | 213 | $LD $A,`0*$SZ`($ctx) |
| 211 | mr $inp,r4 ; incarnate $inp | 214 | mr $inp,r4 ; incarnate $inp |
| @@ -217,7 +220,7 @@ $func: | |||
| 217 | $LD $G,`6*$SZ`($ctx) | 220 | $LD $G,`6*$SZ`($ctx) |
| 218 | $LD $H,`7*$SZ`($ctx) | 221 | $LD $H,`7*$SZ`($ctx) |
| 219 | 222 | ||
| 220 | b LPICmeup | 223 | bl LPICmeup |
| 221 | LPICedup: | 224 | LPICedup: |
| 222 | andi. r0,$inp,3 | 225 | andi. r0,$inp,3 |
| 223 | bne Lunaligned | 226 | bne Lunaligned |
| @@ -226,40 +229,14 @@ Laligned: | |||
| 226 | $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer | 229 | $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer |
| 227 | $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer | 230 | $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer |
| 228 | bl Lsha2_block_private | 231 | bl Lsha2_block_private |
| 229 | Ldone: | 232 | b Ldone |
| 230 | $POP r0,`$FRAME-$SIZE_T*21`($sp) | ||
| 231 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) | ||
| 232 | $POP r13,`$FRAME-$SIZE_T*19`($sp) | ||
| 233 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | ||
| 234 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | ||
| 235 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | ||
| 236 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | ||
| 237 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | ||
| 238 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | ||
| 239 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | ||
| 240 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | ||
| 241 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | ||
| 242 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | ||
| 243 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | ||
| 244 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | ||
| 245 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | ||
| 246 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | ||
| 247 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | ||
| 248 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | ||
| 249 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | ||
| 250 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | ||
| 251 | mtlr r0 | ||
| 252 | addi $sp,$sp,`$FRAME+16*$SZ` | ||
| 253 | blr | ||
| 254 | ___ | ||
| 255 | 233 | ||
| 256 | # PowerPC specification allows an implementation to be ill-behaved | 234 | ; PowerPC specification allows an implementation to be ill-behaved |
| 257 | # upon unaligned access which crosses page boundary. "Better safe | 235 | ; upon unaligned access which crosses page boundary. "Better safe |
| 258 | # than sorry" principle makes me treat it specially. But I don't | 236 | ; than sorry" principle makes me treat it specially. But I don't |
| 259 | # look for particular offending word, but rather for the input | 237 | ; look for particular offending word, but rather for the input |
| 260 | # block which crosses the boundary. Once found that block is aligned | 238 | ; block which crosses the boundary. Once found that block is aligned |
| 261 | # and hashed separately... | 239 | ; and hashed separately... |
| 262 | $code.=<<___; | ||
| 263 | .align 4 | 240 | .align 4 |
| 264 | Lunaligned: | 241 | Lunaligned: |
| 265 | subfic $t1,$inp,4096 | 242 | subfic $t1,$inp,4096 |
| @@ -278,7 +255,7 @@ Lunaligned: | |||
| 278 | Lcross_page: | 255 | Lcross_page: |
| 279 | li $t1,`16*$SZ/4` | 256 | li $t1,`16*$SZ/4` |
| 280 | mtctr $t1 | 257 | mtctr $t1 |
| 281 | addi r20,$sp,$FRAME ; aligned spot below the frame | 258 | addi r20,$sp,$LOCALS ; aligned spot below the frame |
| 282 | Lmemcpy: | 259 | Lmemcpy: |
| 283 | lbz r16,0($inp) | 260 | lbz r16,0($inp) |
| 284 | lbz r17,1($inp) | 261 | lbz r17,1($inp) |
| @@ -293,8 +270,8 @@ Lmemcpy: | |||
| 293 | bdnz Lmemcpy | 270 | bdnz Lmemcpy |
| 294 | 271 | ||
| 295 | $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp | 272 | $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp |
| 296 | addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer | 273 | addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer |
| 297 | addi $inp,$sp,$FRAME ; fictitious inp pointer | 274 | addi $inp,$sp,$LOCALS ; fictitious inp pointer |
| 298 | $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num | 275 | $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num |
| 299 | $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer | 276 | $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer |
| 300 | $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer | 277 | $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer |
| @@ -303,10 +280,36 @@ Lmemcpy: | |||
| 303 | $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num | 280 | $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num |
| 304 | addic. $num,$num,`-16*$SZ` ; num-- | 281 | addic. $num,$num,`-16*$SZ` ; num-- |
| 305 | bne- Lunaligned | 282 | bne- Lunaligned |
| 306 | b Ldone | ||
| 307 | ___ | ||
| 308 | 283 | ||
| 309 | $code.=<<___; | 284 | Ldone: |
| 285 | $POP r0,`$FRAME+$LRSAVE`($sp) | ||
| 286 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) | ||
| 287 | $POP r13,`$FRAME-$SIZE_T*19`($sp) | ||
| 288 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | ||
| 289 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | ||
| 290 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | ||
| 291 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | ||
| 292 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | ||
| 293 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | ||
| 294 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | ||
| 295 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | ||
| 296 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | ||
| 297 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | ||
| 298 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | ||
| 299 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | ||
| 300 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | ||
| 301 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | ||
| 302 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | ||
| 303 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | ||
| 304 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | ||
| 305 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | ||
| 306 | mtlr r0 | ||
| 307 | addi $sp,$sp,$FRAME | ||
| 308 | blr | ||
| 309 | .long 0 | ||
| 310 | .byte 0,12,4,1,0x80,18,3,0 | ||
| 311 | .long 0 | ||
| 312 | |||
| 310 | .align 4 | 313 | .align 4 |
| 311 | Lsha2_block_private: | 314 | Lsha2_block_private: |
| 312 | ___ | 315 | ___ |
| @@ -372,6 +375,8 @@ $code.=<<___; | |||
| 372 | $ST $H,`7*$SZ`($ctx) | 375 | $ST $H,`7*$SZ`($ctx) |
| 373 | bne Lsha2_block_private | 376 | bne Lsha2_block_private |
| 374 | blr | 377 | blr |
| 378 | .long 0 | ||
| 379 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 375 | ___ | 380 | ___ |
| 376 | 381 | ||
| 377 | # Ugly hack here, because PPC assembler syntax seem to vary too | 382 | # Ugly hack here, because PPC assembler syntax seem to vary too |
| @@ -379,22 +384,15 @@ ___ | |||
| 379 | $code.=<<___; | 384 | $code.=<<___; |
| 380 | .align 6 | 385 | .align 6 |
| 381 | LPICmeup: | 386 | LPICmeup: |
| 382 | bl LPIC | 387 | mflr r0 |
| 383 | addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop | 388 | bcl 20,31,\$+4 |
| 384 | b LPICedup | 389 | mflr $Tbl ; vvvvvv "distance" between . and 1st data entry |
| 385 | nop | 390 | addi $Tbl,$Tbl,`64-8` |
| 386 | nop | 391 | mtlr r0 |
| 387 | nop | ||
| 388 | nop | ||
| 389 | nop | ||
| 390 | LPIC: mflr $Tbl | ||
| 391 | blr | 392 | blr |
| 392 | nop | 393 | .long 0 |
| 393 | nop | 394 | .byte 0,12,0x14,0,0,0,0,0 |
| 394 | nop | 395 | .space `64-9*4` |
| 395 | nop | ||
| 396 | nop | ||
| 397 | nop | ||
| 398 | ___ | 396 | ___ |
| 399 | $code.=<<___ if ($SZ==8); | 397 | $code.=<<___ if ($SZ==8); |
| 400 | .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd | 398 | .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd |
diff --git a/src/lib/libcrypto/sha/asm/sha512-s390x.pl b/src/lib/libcrypto/sha/asm/sha512-s390x.pl index e7ef2d5a9f..079a3fc78a 100644 --- a/src/lib/libcrypto/sha/asm/sha512-s390x.pl +++ b/src/lib/libcrypto/sha/asm/sha512-s390x.pl | |||
| @@ -26,6 +26,26 @@ | |||
| 26 | # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster | 26 | # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster |
| 27 | # than software. | 27 | # than software. |
| 28 | 28 | ||
| 29 | # November 2010. | ||
| 30 | # | ||
| 31 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 32 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 33 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 34 | # application context. The feature is not specific to any particular | ||
| 35 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 36 | # remains z/Architecture specific. On z900 SHA256 was measured to | ||
| 37 | # perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3. | ||
| 38 | |||
| 39 | $flavour = shift; | ||
| 40 | |||
| 41 | if ($flavour =~ /3[12]/) { | ||
| 42 | $SIZE_T=4; | ||
| 43 | $g=""; | ||
| 44 | } else { | ||
| 45 | $SIZE_T=8; | ||
| 46 | $g="g"; | ||
| 47 | } | ||
| 48 | |||
| 29 | $t0="%r0"; | 49 | $t0="%r0"; |
| 30 | $t1="%r1"; | 50 | $t1="%r1"; |
| 31 | $ctx="%r2"; $t2="%r2"; | 51 | $ctx="%r2"; $t2="%r2"; |
| @@ -44,7 +64,7 @@ $tbl="%r13"; | |||
| 44 | $T1="%r14"; | 64 | $T1="%r14"; |
| 45 | $sp="%r15"; | 65 | $sp="%r15"; |
| 46 | 66 | ||
| 47 | $output=shift; | 67 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| 48 | open STDOUT,">$output"; | 68 | open STDOUT,">$output"; |
| 49 | 69 | ||
| 50 | if ($output =~ /512/) { | 70 | if ($output =~ /512/) { |
| @@ -78,7 +98,8 @@ if ($output =~ /512/) { | |||
| 78 | } | 98 | } |
| 79 | $Func="sha${label}_block_data_order"; | 99 | $Func="sha${label}_block_data_order"; |
| 80 | $Table="K${label}"; | 100 | $Table="K${label}"; |
| 81 | $frame=160+16*$SZ; | 101 | $stdframe=16*$SIZE_T+4*8; |
| 102 | $frame=$stdframe+16*$SZ; | ||
| 82 | 103 | ||
| 83 | sub BODY_00_15 { | 104 | sub BODY_00_15 { |
| 84 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 105 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
| @@ -93,9 +114,9 @@ $code.=<<___; | |||
| 93 | xgr $t0,$t1 | 114 | xgr $t0,$t1 |
| 94 | $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]` | 115 | $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]` |
| 95 | xgr $t2,$g | 116 | xgr $t2,$g |
| 96 | $ST $T1,`160+$SZ*($i%16)`($sp) | 117 | $ST $T1,`$stdframe+$SZ*($i%16)`($sp) |
| 97 | xgr $t0,$t1 # Sigma1(e) | 118 | xgr $t0,$t1 # Sigma1(e) |
| 98 | la $T1,0($T1,$h) # T1+=h | 119 | algr $T1,$h # T1+=h |
| 99 | ngr $t2,$e | 120 | ngr $t2,$e |
| 100 | lgr $t1,$a | 121 | lgr $t1,$a |
| 101 | algr $T1,$t0 # T1+=Sigma1(e) | 122 | algr $T1,$t0 # T1+=Sigma1(e) |
| @@ -113,7 +134,7 @@ $code.=<<___; | |||
| 113 | ngr $t2,$b | 134 | ngr $t2,$b |
| 114 | algr $h,$T1 # h+=T1 | 135 | algr $h,$T1 # h+=T1 |
| 115 | ogr $t2,$t1 # Maj(a,b,c) | 136 | ogr $t2,$t1 # Maj(a,b,c) |
| 116 | la $d,0($d,$T1) # d+=T1 | 137 | algr $d,$T1 # d+=T1 |
| 117 | algr $h,$t2 # h+=Maj(a,b,c) | 138 | algr $h,$t2 # h+=Maj(a,b,c) |
| 118 | ___ | 139 | ___ |
| 119 | } | 140 | } |
| @@ -122,19 +143,19 @@ sub BODY_16_XX { | |||
| 122 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 143 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
| 123 | 144 | ||
| 124 | $code.=<<___; | 145 | $code.=<<___; |
| 125 | $LD $T1,`160+$SZ*(($i+1)%16)`($sp) ### $i | 146 | $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i |
| 126 | $LD $t1,`160+$SZ*(($i+14)%16)`($sp) | 147 | $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp) |
| 127 | $ROT $t0,$T1,$sigma0[0] | 148 | $ROT $t0,$T1,$sigma0[0] |
| 128 | $SHR $T1,$sigma0[2] | 149 | $SHR $T1,$sigma0[2] |
| 129 | $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]` | 150 | $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]` |
| 130 | xgr $T1,$t0 | 151 | xgr $T1,$t0 |
| 131 | $ROT $t0,$t1,$sigma1[0] | 152 | $ROT $t0,$t1,$sigma1[0] |
| 132 | xgr $T1,$t2 # sigma0(X[i+1]) | 153 | xgr $T1,$t2 # sigma0(X[i+1]) |
| 133 | $SHR $t1,$sigma1[2] | 154 | $SHR $t1,$sigma1[2] |
| 134 | $ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i] | 155 | $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i] |
| 135 | xgr $t1,$t0 | 156 | xgr $t1,$t0 |
| 136 | $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]` | 157 | $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]` |
| 137 | $ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9] | 158 | $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9] |
| 138 | xgr $t1,$t0 # sigma1(X[i+14]) | 159 | xgr $t1,$t0 # sigma1(X[i+14]) |
| 139 | algr $T1,$t1 # +=sigma1(X[i+14]) | 160 | algr $T1,$t1 # +=sigma1(X[i+14]) |
| 140 | ___ | 161 | ___ |
| @@ -212,6 +233,7 @@ $code.=<<___; | |||
| 212 | .globl $Func | 233 | .globl $Func |
| 213 | .type $Func,\@function | 234 | .type $Func,\@function |
| 214 | $Func: | 235 | $Func: |
| 236 | sllg $len,$len,`log(16*$SZ)/log(2)` | ||
| 215 | ___ | 237 | ___ |
| 216 | $code.=<<___ if ($kimdfunc); | 238 | $code.=<<___ if ($kimdfunc); |
| 217 | larl %r1,OPENSSL_s390xcap_P | 239 | larl %r1,OPENSSL_s390xcap_P |
| @@ -219,15 +241,15 @@ $code.=<<___ if ($kimdfunc); | |||
| 219 | tmhl %r0,0x4000 # check for message-security assist | 241 | tmhl %r0,0x4000 # check for message-security assist |
| 220 | jz .Lsoftware | 242 | jz .Lsoftware |
| 221 | lghi %r0,0 | 243 | lghi %r0,0 |
| 222 | la %r1,16($sp) | 244 | la %r1,`2*$SIZE_T`($sp) |
| 223 | .long 0xb93e0002 # kimd %r0,%r2 | 245 | .long 0xb93e0002 # kimd %r0,%r2 |
| 224 | lg %r0,16($sp) | 246 | lg %r0,`2*$SIZE_T`($sp) |
| 225 | tmhh %r0,`0x8000>>$kimdfunc` | 247 | tmhh %r0,`0x8000>>$kimdfunc` |
| 226 | jz .Lsoftware | 248 | jz .Lsoftware |
| 227 | lghi %r0,$kimdfunc | 249 | lghi %r0,$kimdfunc |
| 228 | lgr %r1,$ctx | 250 | lgr %r1,$ctx |
| 229 | lgr %r2,$inp | 251 | lgr %r2,$inp |
| 230 | sllg %r3,$len,`log(16*$SZ)/log(2)` | 252 | lgr %r3,$len |
| 231 | .long 0xb93e0002 # kimd %r0,%r2 | 253 | .long 0xb93e0002 # kimd %r0,%r2 |
| 232 | brc 1,.-4 # pay attention to "partial completion" | 254 | brc 1,.-4 # pay attention to "partial completion" |
| 233 | br %r14 | 255 | br %r14 |
| @@ -235,13 +257,12 @@ $code.=<<___ if ($kimdfunc); | |||
| 235 | .Lsoftware: | 257 | .Lsoftware: |
| 236 | ___ | 258 | ___ |
| 237 | $code.=<<___; | 259 | $code.=<<___; |
| 238 | sllg $len,$len,`log(16*$SZ)/log(2)` | ||
| 239 | lghi %r1,-$frame | 260 | lghi %r1,-$frame |
| 240 | agr $len,$inp | 261 | la $len,0($len,$inp) |
| 241 | stmg $ctx,%r15,16($sp) | 262 | stm${g} $ctx,%r15,`2*$SIZE_T`($sp) |
| 242 | lgr %r0,$sp | 263 | lgr %r0,$sp |
| 243 | la $sp,0(%r1,$sp) | 264 | la $sp,0(%r1,$sp) |
| 244 | stg %r0,0($sp) | 265 | st${g} %r0,0($sp) |
| 245 | 266 | ||
| 246 | larl $tbl,$Table | 267 | larl $tbl,$Table |
| 247 | $LD $A,`0*$SZ`($ctx) | 268 | $LD $A,`0*$SZ`($ctx) |
| @@ -265,7 +286,7 @@ $code.=<<___; | |||
| 265 | clgr $len,$t0 | 286 | clgr $len,$t0 |
| 266 | jne .Lrounds_16_xx | 287 | jne .Lrounds_16_xx |
| 267 | 288 | ||
| 268 | lg $ctx,`$frame+16`($sp) | 289 | l${g} $ctx,`$frame+2*$SIZE_T`($sp) |
| 269 | la $inp,`16*$SZ`($inp) | 290 | la $inp,`16*$SZ`($inp) |
| 270 | $ADD $A,`0*$SZ`($ctx) | 291 | $ADD $A,`0*$SZ`($ctx) |
| 271 | $ADD $B,`1*$SZ`($ctx) | 292 | $ADD $B,`1*$SZ`($ctx) |
| @@ -283,14 +304,14 @@ $code.=<<___; | |||
| 283 | $ST $F,`5*$SZ`($ctx) | 304 | $ST $F,`5*$SZ`($ctx) |
| 284 | $ST $G,`6*$SZ`($ctx) | 305 | $ST $G,`6*$SZ`($ctx) |
| 285 | $ST $H,`7*$SZ`($ctx) | 306 | $ST $H,`7*$SZ`($ctx) |
| 286 | clg $inp,`$frame+32`($sp) | 307 | cl${g} $inp,`$frame+4*$SIZE_T`($sp) |
| 287 | jne .Lloop | 308 | jne .Lloop |
| 288 | 309 | ||
| 289 | lmg %r6,%r15,`$frame+48`($sp) | 310 | lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) |
| 290 | br %r14 | 311 | br %r14 |
| 291 | .size $Func,.-$Func | 312 | .size $Func,.-$Func |
| 292 | .string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 313 | .string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
| 293 | .comm OPENSSL_s390xcap_P,8,8 | 314 | .comm OPENSSL_s390xcap_P,16,8 |
| 294 | ___ | 315 | ___ |
| 295 | 316 | ||
| 296 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 317 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl index ec5d78135e..585740789e 100644 --- a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl +++ b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl | |||
| @@ -305,9 +305,9 @@ $code.=<<___; | |||
| 305 | srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9] | 305 | srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9] |
| 306 | xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) | 306 | xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) |
| 307 | srl @X[($i/2)%8],0,$tmp0 | 307 | srl @X[($i/2)%8],0,$tmp0 |
| 308 | add $tmp2,$tmp1,$tmp1 | ||
| 308 | add $xi,$T1,$T1 ! +=X[i] | 309 | add $xi,$T1,$T1 ! +=X[i] |
| 309 | xor $tmp0,@X[($i/2)%8],@X[($i/2)%8] | 310 | xor $tmp0,@X[($i/2)%8],@X[($i/2)%8] |
| 310 | add $tmp2,$T1,$T1 | ||
| 311 | add $tmp1,$T1,$T1 | 311 | add $tmp1,$T1,$T1 |
| 312 | 312 | ||
| 313 | srl $T1,0,$T1 | 313 | srl $T1,0,$T1 |
| @@ -318,9 +318,9 @@ ___ | |||
| 318 | $code.=<<___; | 318 | $code.=<<___; |
| 319 | srlx @X[($i/2)%8],32,$tmp1 ! X[i] | 319 | srlx @X[($i/2)%8],32,$tmp1 ! X[i] |
| 320 | xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) | 320 | xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) |
| 321 | srl @X[($i/2)%8],0,@X[($i/2)%8] | ||
| 322 | add $xi,$T1,$T1 ! +=X[i+9] | 321 | add $xi,$T1,$T1 ! +=X[i+9] |
| 323 | add $tmp2,$T1,$T1 | 322 | add $tmp2,$tmp1,$tmp1 |
| 323 | srl @X[($i/2)%8],0,@X[($i/2)%8] | ||
| 324 | add $tmp1,$T1,$T1 | 324 | add $tmp1,$T1,$T1 |
| 325 | 325 | ||
| 326 | sllx $T1,32,$tmp0 | 326 | sllx $T1,32,$tmp0 |
diff --git a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl index e6643f8cf6..f611a2d898 100755 --- a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl +++ b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl | |||
| @@ -95,50 +95,44 @@ sub ROUND_00_15() | |||
| 95 | { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 95 | { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
| 96 | 96 | ||
| 97 | $code.=<<___; | 97 | $code.=<<___; |
| 98 | mov $e,$a0 | 98 | ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 |
| 99 | mov $e,$a1 | ||
| 100 | mov $f,$a2 | 99 | mov $f,$a2 |
| 100 | mov $T1,`$SZ*($i&0xf)`(%rsp) | ||
| 101 | 101 | ||
| 102 | ror \$$Sigma1[0],$a0 | 102 | ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 |
| 103 | ror \$$Sigma1[1],$a1 | 103 | xor $e,$a0 |
| 104 | xor $g,$a2 # f^g | 104 | xor $g,$a2 # f^g |
| 105 | 105 | ||
| 106 | xor $a1,$a0 | 106 | ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 |
| 107 | ror \$`$Sigma1[2]-$Sigma1[1]`,$a1 | 107 | add $h,$T1 # T1+=h |
| 108 | xor $a,$a1 | ||
| 109 | |||
| 110 | add ($Tbl,$round,$SZ),$T1 # T1+=K[round] | ||
| 108 | and $e,$a2 # (f^g)&e | 111 | and $e,$a2 # (f^g)&e |
| 109 | mov $T1,`$SZ*($i&0xf)`(%rsp) | 112 | mov $b,$h |
| 110 | 113 | ||
| 111 | xor $a1,$a0 # Sigma1(e) | 114 | ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 |
| 115 | xor $e,$a0 | ||
| 112 | xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g | 116 | xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g |
| 113 | add $h,$T1 # T1+=h | ||
| 114 | |||
| 115 | mov $a,$h | ||
| 116 | add $a0,$T1 # T1+=Sigma1(e) | ||
| 117 | 117 | ||
| 118 | xor $c,$h # b^c | ||
| 119 | xor $a,$a1 | ||
| 118 | add $a2,$T1 # T1+=Ch(e,f,g) | 120 | add $a2,$T1 # T1+=Ch(e,f,g) |
| 119 | mov $a,$a0 | 121 | mov $b,$a2 |
| 120 | mov $a,$a1 | ||
| 121 | 122 | ||
| 122 | ror \$$Sigma0[0],$h | 123 | ror \$$Sigma1[0],$a0 # Sigma1(e) |
| 123 | ror \$$Sigma0[1],$a0 | 124 | and $a,$h # h=(b^c)&a |
| 124 | mov $a,$a2 | 125 | and $c,$a2 # b&c |
| 125 | add ($Tbl,$round,$SZ),$T1 # T1+=K[round] | ||
| 126 | 126 | ||
| 127 | xor $a0,$h | 127 | ror \$$Sigma0[0],$a1 # Sigma0(a) |
| 128 | ror \$`$Sigma0[2]-$Sigma0[1]`,$a0 | 128 | add $a0,$T1 # T1+=Sigma1(e) |
| 129 | or $c,$a1 # a|c | 129 | add $a2,$h # h+=b&c (completes +=Maj(a,b,c) |
| 130 | 130 | ||
| 131 | xor $a0,$h # h=Sigma0(a) | ||
| 132 | and $c,$a2 # a&c | ||
| 133 | add $T1,$d # d+=T1 | 131 | add $T1,$d # d+=T1 |
| 134 | |||
| 135 | and $b,$a1 # (a|c)&b | ||
| 136 | add $T1,$h # h+=T1 | 132 | add $T1,$h # h+=T1 |
| 137 | |||
| 138 | or $a2,$a1 # Maj(a,b,c)=((a|c)&b)|(a&c) | ||
| 139 | lea 1($round),$round # round++ | 133 | lea 1($round),$round # round++ |
| 134 | add $a1,$h # h+=Sigma0(a) | ||
| 140 | 135 | ||
| 141 | add $a1,$h # h+=Maj(a,b,c) | ||
| 142 | ___ | 136 | ___ |
| 143 | } | 137 | } |
| 144 | 138 | ||
| @@ -147,32 +141,30 @@ sub ROUND_16_XX() | |||
| 147 | 141 | ||
| 148 | $code.=<<___; | 142 | $code.=<<___; |
| 149 | mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 | 143 | mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 |
| 150 | mov `$SZ*(($i+14)&0xf)`(%rsp),$T1 | 144 | mov `$SZ*(($i+14)&0xf)`(%rsp),$a1 |
| 151 | 145 | mov $a0,$T1 | |
| 152 | mov $a0,$a2 | 146 | mov $a1,$a2 |
| 153 | 147 | ||
| 148 | ror \$`$sigma0[1]-$sigma0[0]`,$T1 | ||
| 149 | xor $a0,$T1 | ||
| 154 | shr \$$sigma0[2],$a0 | 150 | shr \$$sigma0[2],$a0 |
| 155 | ror \$$sigma0[0],$a2 | ||
| 156 | |||
| 157 | xor $a2,$a0 | ||
| 158 | ror \$`$sigma0[1]-$sigma0[0]`,$a2 | ||
| 159 | 151 | ||
| 160 | xor $a2,$a0 # sigma0(X[(i+1)&0xf]) | 152 | ror \$$sigma0[0],$T1 |
| 161 | mov $T1,$a1 | 153 | xor $T1,$a0 # sigma0(X[(i+1)&0xf]) |
| 154 | mov `$SZ*(($i+9)&0xf)`(%rsp),$T1 | ||
| 162 | 155 | ||
| 163 | shr \$$sigma1[2],$T1 | 156 | ror \$`$sigma1[1]-$sigma1[0]`,$a2 |
| 164 | ror \$$sigma1[0],$a1 | 157 | xor $a1,$a2 |
| 165 | 158 | shr \$$sigma1[2],$a1 | |
| 166 | xor $a1,$T1 | ||
| 167 | ror \$`$sigma1[1]-$sigma1[0]`,$a1 | ||
| 168 | |||
| 169 | xor $a1,$T1 # sigma1(X[(i+14)&0xf]) | ||
| 170 | 159 | ||
| 160 | ror \$$sigma1[0],$a2 | ||
| 171 | add $a0,$T1 | 161 | add $a0,$T1 |
| 172 | 162 | xor $a2,$a1 # sigma1(X[(i+14)&0xf]) | |
| 173 | add `$SZ*(($i+9)&0xf)`(%rsp),$T1 | ||
| 174 | 163 | ||
| 175 | add `$SZ*($i&0xf)`(%rsp),$T1 | 164 | add `$SZ*($i&0xf)`(%rsp),$T1 |
| 165 | mov $e,$a0 | ||
| 166 | add $a1,$T1 | ||
| 167 | mov $a,$a1 | ||
| 176 | ___ | 168 | ___ |
| 177 | &ROUND_00_15(@_); | 169 | &ROUND_00_15(@_); |
| 178 | } | 170 | } |
| @@ -219,6 +211,8 @@ $func: | |||
| 219 | ___ | 211 | ___ |
| 220 | for($i=0;$i<16;$i++) { | 212 | for($i=0;$i<16;$i++) { |
| 221 | $code.=" mov $SZ*$i($inp),$T1\n"; | 213 | $code.=" mov $SZ*$i($inp),$T1\n"; |
| 214 | $code.=" mov @ROT[4],$a0\n"; | ||
| 215 | $code.=" mov @ROT[0],$a1\n"; | ||
| 222 | $code.=" bswap $T1\n"; | 216 | $code.=" bswap $T1\n"; |
| 223 | &ROUND_00_15($i,@ROT); | 217 | &ROUND_00_15($i,@ROT); |
| 224 | unshift(@ROT,pop(@ROT)); | 218 | unshift(@ROT,pop(@ROT)); |
diff --git a/src/lib/libcrypto/sha/sha256.c b/src/lib/libcrypto/sha/sha256.c index 8952d87673..f88d3d6dad 100644 --- a/src/lib/libcrypto/sha/sha256.c +++ b/src/lib/libcrypto/sha/sha256.c | |||
| @@ -16,7 +16,7 @@ | |||
| 16 | 16 | ||
| 17 | const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT; | 17 | const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT; |
| 18 | 18 | ||
| 19 | int SHA224_Init (SHA256_CTX *c) | 19 | fips_md_init_ctx(SHA224, SHA256) |
| 20 | { | 20 | { |
| 21 | memset (c,0,sizeof(*c)); | 21 | memset (c,0,sizeof(*c)); |
| 22 | c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL; | 22 | c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL; |
| @@ -27,7 +27,7 @@ int SHA224_Init (SHA256_CTX *c) | |||
| 27 | return 1; | 27 | return 1; |
| 28 | } | 28 | } |
| 29 | 29 | ||
| 30 | int SHA256_Init (SHA256_CTX *c) | 30 | fips_md_init(SHA256) |
| 31 | { | 31 | { |
| 32 | memset (c,0,sizeof(*c)); | 32 | memset (c,0,sizeof(*c)); |
| 33 | c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL; | 33 | c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL; |
diff --git a/src/lib/libcrypto/sha/sha512.c b/src/lib/libcrypto/sha/sha512.c index cbc0e58c48..50dd7dc744 100644 --- a/src/lib/libcrypto/sha/sha512.c +++ b/src/lib/libcrypto/sha/sha512.c | |||
| @@ -59,21 +59,8 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT; | |||
| 59 | #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA | 59 | #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA |
| 60 | #endif | 60 | #endif |
| 61 | 61 | ||
| 62 | int SHA384_Init (SHA512_CTX *c) | 62 | fips_md_init_ctx(SHA384, SHA512) |
| 63 | { | 63 | { |
| 64 | #if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) | ||
| 65 | /* maintain dword order required by assembler module */ | ||
| 66 | unsigned int *h = (unsigned int *)c->h; | ||
| 67 | |||
| 68 | h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8; | ||
| 69 | h[2] = 0x629a292a; h[3] = 0x367cd507; | ||
| 70 | h[4] = 0x9159015a; h[5] = 0x3070dd17; | ||
| 71 | h[6] = 0x152fecd8; h[7] = 0xf70e5939; | ||
| 72 | h[8] = 0x67332667; h[9] = 0xffc00b31; | ||
| 73 | h[10] = 0x8eb44a87; h[11] = 0x68581511; | ||
| 74 | h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7; | ||
| 75 | h[14] = 0x47b5481d; h[15] = 0xbefa4fa4; | ||
| 76 | #else | ||
| 77 | c->h[0]=U64(0xcbbb9d5dc1059ed8); | 64 | c->h[0]=U64(0xcbbb9d5dc1059ed8); |
| 78 | c->h[1]=U64(0x629a292a367cd507); | 65 | c->h[1]=U64(0x629a292a367cd507); |
| 79 | c->h[2]=U64(0x9159015a3070dd17); | 66 | c->h[2]=U64(0x9159015a3070dd17); |
| @@ -82,27 +69,14 @@ int SHA384_Init (SHA512_CTX *c) | |||
| 82 | c->h[5]=U64(0x8eb44a8768581511); | 69 | c->h[5]=U64(0x8eb44a8768581511); |
| 83 | c->h[6]=U64(0xdb0c2e0d64f98fa7); | 70 | c->h[6]=U64(0xdb0c2e0d64f98fa7); |
| 84 | c->h[7]=U64(0x47b5481dbefa4fa4); | 71 | c->h[7]=U64(0x47b5481dbefa4fa4); |
| 85 | #endif | 72 | |
| 86 | c->Nl=0; c->Nh=0; | 73 | c->Nl=0; c->Nh=0; |
| 87 | c->num=0; c->md_len=SHA384_DIGEST_LENGTH; | 74 | c->num=0; c->md_len=SHA384_DIGEST_LENGTH; |
| 88 | return 1; | 75 | return 1; |
| 89 | } | 76 | } |
| 90 | 77 | ||
| 91 | int SHA512_Init (SHA512_CTX *c) | 78 | fips_md_init(SHA512) |
| 92 | { | 79 | { |
| 93 | #if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) | ||
| 94 | /* maintain dword order required by assembler module */ | ||
| 95 | unsigned int *h = (unsigned int *)c->h; | ||
| 96 | |||
| 97 | h[0] = 0x6a09e667; h[1] = 0xf3bcc908; | ||
| 98 | h[2] = 0xbb67ae85; h[3] = 0x84caa73b; | ||
| 99 | h[4] = 0x3c6ef372; h[5] = 0xfe94f82b; | ||
| 100 | h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1; | ||
| 101 | h[8] = 0x510e527f; h[9] = 0xade682d1; | ||
| 102 | h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f; | ||
| 103 | h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b; | ||
| 104 | h[14] = 0x5be0cd19; h[15] = 0x137e2179; | ||
| 105 | #else | ||
| 106 | c->h[0]=U64(0x6a09e667f3bcc908); | 80 | c->h[0]=U64(0x6a09e667f3bcc908); |
| 107 | c->h[1]=U64(0xbb67ae8584caa73b); | 81 | c->h[1]=U64(0xbb67ae8584caa73b); |
| 108 | c->h[2]=U64(0x3c6ef372fe94f82b); | 82 | c->h[2]=U64(0x3c6ef372fe94f82b); |
| @@ -111,7 +85,7 @@ int SHA512_Init (SHA512_CTX *c) | |||
| 111 | c->h[5]=U64(0x9b05688c2b3e6c1f); | 85 | c->h[5]=U64(0x9b05688c2b3e6c1f); |
| 112 | c->h[6]=U64(0x1f83d9abfb41bd6b); | 86 | c->h[6]=U64(0x1f83d9abfb41bd6b); |
| 113 | c->h[7]=U64(0x5be0cd19137e2179); | 87 | c->h[7]=U64(0x5be0cd19137e2179); |
| 114 | #endif | 88 | |
| 115 | c->Nl=0; c->Nh=0; | 89 | c->Nl=0; c->Nh=0; |
| 116 | c->num=0; c->md_len=SHA512_DIGEST_LENGTH; | 90 | c->num=0; c->md_len=SHA512_DIGEST_LENGTH; |
| 117 | return 1; | 91 | return 1; |
| @@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c) | |||
| 160 | 134 | ||
| 161 | if (md==0) return 0; | 135 | if (md==0) return 0; |
| 162 | 136 | ||
| 163 | #if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) | ||
| 164 | /* recall assembler dword order... */ | ||
| 165 | n = c->md_len; | ||
| 166 | if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH) | ||
| 167 | { | ||
| 168 | unsigned int *h = (unsigned int *)c->h, t; | ||
| 169 | |||
| 170 | for (n/=4;n;n--) | ||
| 171 | { | ||
| 172 | t = *(h++); | ||
| 173 | *(md++) = (unsigned char)(t>>24); | ||
| 174 | *(md++) = (unsigned char)(t>>16); | ||
| 175 | *(md++) = (unsigned char)(t>>8); | ||
| 176 | *(md++) = (unsigned char)(t); | ||
| 177 | } | ||
| 178 | } | ||
| 179 | else return 0; | ||
| 180 | #else | ||
| 181 | switch (c->md_len) | 137 | switch (c->md_len) |
| 182 | { | 138 | { |
| 183 | /* Let compiler decide if it's appropriate to unroll... */ | 139 | /* Let compiler decide if it's appropriate to unroll... */ |
| @@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c) | |||
| 214 | /* ... as well as make sure md_len is not abused. */ | 170 | /* ... as well as make sure md_len is not abused. */ |
| 215 | default: return 0; | 171 | default: return 0; |
| 216 | } | 172 | } |
| 217 | #endif | 173 | |
| 218 | return 1; | 174 | return 1; |
| 219 | } | 175 | } |
| 220 | 176 | ||
diff --git a/src/lib/libcrypto/sparcv9cap.c b/src/lib/libcrypto/sparcv9cap.c index ed195ab402..43b3ac6f81 100644 --- a/src/lib/libcrypto/sparcv9cap.c +++ b/src/lib/libcrypto/sparcv9cap.c | |||
| @@ -19,7 +19,8 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U | |||
| 19 | int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); | 19 | int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); |
| 20 | int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); | 20 | int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); |
| 21 | 21 | ||
| 22 | if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == | 22 | if (num>=8 && !(num&1) && |
| 23 | (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == | ||
| 23 | (SPARCV9_PREFER_FPU|SPARCV9_VIS1)) | 24 | (SPARCV9_PREFER_FPU|SPARCV9_VIS1)) |
| 24 | return bn_mul_mont_fpu(rp,ap,bp,np,n0,num); | 25 | return bn_mul_mont_fpu(rp,ap,bp,np,n0,num); |
| 25 | else | 26 | else |
| @@ -169,7 +170,6 @@ void OPENSSL_cpuid_setup(void) | |||
| 169 | char *e; | 170 | char *e; |
| 170 | struct sigaction common_act,ill_oact,bus_oact; | 171 | struct sigaction common_act,ill_oact,bus_oact; |
| 171 | sigset_t all_masked,oset; | 172 | sigset_t all_masked,oset; |
| 172 | int sig; | ||
| 173 | static int trigger=0; | 173 | static int trigger=0; |
| 174 | 174 | ||
| 175 | if (trigger) return; | 175 | if (trigger) return; |
diff --git a/src/lib/libcrypto/ts/ts.h b/src/lib/libcrypto/ts/ts.h index 190e8a1bf2..c2448e3c3b 100644 --- a/src/lib/libcrypto/ts/ts.h +++ b/src/lib/libcrypto/ts/ts.h | |||
| @@ -86,9 +86,6 @@ | |||
| 86 | #include <openssl/dh.h> | 86 | #include <openssl/dh.h> |
| 87 | #endif | 87 | #endif |
| 88 | 88 | ||
| 89 | #include <openssl/evp.h> | ||
| 90 | |||
| 91 | |||
| 92 | #ifdef __cplusplus | 89 | #ifdef __cplusplus |
| 93 | extern "C" { | 90 | extern "C" { |
| 94 | #endif | 91 | #endif |
diff --git a/src/lib/libcrypto/whrlpool/whrlpool.h b/src/lib/libcrypto/whrlpool/whrlpool.h index 03c91da115..9e01f5b076 100644 --- a/src/lib/libcrypto/whrlpool/whrlpool.h +++ b/src/lib/libcrypto/whrlpool/whrlpool.h | |||
| @@ -24,6 +24,9 @@ typedef struct { | |||
| 24 | } WHIRLPOOL_CTX; | 24 | } WHIRLPOOL_CTX; |
| 25 | 25 | ||
| 26 | #ifndef OPENSSL_NO_WHIRLPOOL | 26 | #ifndef OPENSSL_NO_WHIRLPOOL |
| 27 | #ifdef OPENSSL_FIPS | ||
| 28 | int private_WHIRLPOOL_Init(WHIRLPOOL_CTX *c); | ||
| 29 | #endif | ||
| 27 | int WHIRLPOOL_Init (WHIRLPOOL_CTX *c); | 30 | int WHIRLPOOL_Init (WHIRLPOOL_CTX *c); |
| 28 | int WHIRLPOOL_Update (WHIRLPOOL_CTX *c,const void *inp,size_t bytes); | 31 | int WHIRLPOOL_Update (WHIRLPOOL_CTX *c,const void *inp,size_t bytes); |
| 29 | void WHIRLPOOL_BitUpdate(WHIRLPOOL_CTX *c,const void *inp,size_t bits); | 32 | void WHIRLPOOL_BitUpdate(WHIRLPOOL_CTX *c,const void *inp,size_t bits); |
diff --git a/src/lib/libcrypto/whrlpool/wp_block.c b/src/lib/libcrypto/whrlpool/wp_block.c index 221f6cc59f..824ed1827c 100644 --- a/src/lib/libcrypto/whrlpool/wp_block.c +++ b/src/lib/libcrypto/whrlpool/wp_block.c | |||
| @@ -68,9 +68,9 @@ typedef unsigned long long u64; | |||
| 68 | CPUs this is actually faster! */ | 68 | CPUs this is actually faster! */ |
| 69 | # endif | 69 | # endif |
| 70 | # define GO_FOR_MMX(ctx,inp,num) do { \ | 70 | # define GO_FOR_MMX(ctx,inp,num) do { \ |
| 71 | extern unsigned long OPENSSL_ia32cap_P; \ | 71 | extern unsigned int OPENSSL_ia32cap_P[]; \ |
| 72 | void whirlpool_block_mmx(void *,const void *,size_t); \ | 72 | void whirlpool_block_mmx(void *,const void *,size_t); \ |
| 73 | if (!(OPENSSL_ia32cap_P & (1<<23))) break; \ | 73 | if (!(OPENSSL_ia32cap_P[0] & (1<<23))) break; \ |
| 74 | whirlpool_block_mmx(ctx->H.c,inp,num); return; \ | 74 | whirlpool_block_mmx(ctx->H.c,inp,num); return; \ |
| 75 | } while (0) | 75 | } while (0) |
| 76 | # endif | 76 | # endif |
diff --git a/src/lib/libcrypto/whrlpool/wp_dgst.c b/src/lib/libcrypto/whrlpool/wp_dgst.c index ee5c5c1bf3..7e28bef51d 100644 --- a/src/lib/libcrypto/whrlpool/wp_dgst.c +++ b/src/lib/libcrypto/whrlpool/wp_dgst.c | |||
| @@ -52,9 +52,10 @@ | |||
| 52 | */ | 52 | */ |
| 53 | 53 | ||
| 54 | #include "wp_locl.h" | 54 | #include "wp_locl.h" |
| 55 | #include <openssl/crypto.h> | ||
| 55 | #include <string.h> | 56 | #include <string.h> |
| 56 | 57 | ||
| 57 | int WHIRLPOOL_Init (WHIRLPOOL_CTX *c) | 58 | fips_md_init(WHIRLPOOL) |
| 58 | { | 59 | { |
| 59 | memset (c,0,sizeof(*c)); | 60 | memset (c,0,sizeof(*c)); |
| 60 | return(1); | 61 | return(1); |
diff --git a/src/lib/libcrypto/x86cpuid.pl b/src/lib/libcrypto/x86cpuid.pl index a7464af19b..39fd8f2293 100644 --- a/src/lib/libcrypto/x86cpuid.pl +++ b/src/lib/libcrypto/x86cpuid.pl | |||
| @@ -19,9 +19,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
| 19 | &pushf (); | 19 | &pushf (); |
| 20 | &pop ("eax"); | 20 | &pop ("eax"); |
| 21 | &xor ("ecx","eax"); | 21 | &xor ("ecx","eax"); |
| 22 | &bt ("ecx",21); | ||
| 23 | &jnc (&label("done")); | ||
| 24 | &xor ("eax","eax"); | 22 | &xor ("eax","eax"); |
| 23 | &bt ("ecx",21); | ||
| 24 | &jnc (&label("nocpuid")); | ||
| 25 | &cpuid (); | 25 | &cpuid (); |
| 26 | &mov ("edi","eax"); # max value for standard query level | 26 | &mov ("edi","eax"); # max value for standard query level |
| 27 | 27 | ||
| @@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
| 51 | # AMD specific | 51 | # AMD specific |
| 52 | &mov ("eax",0x80000000); | 52 | &mov ("eax",0x80000000); |
| 53 | &cpuid (); | 53 | &cpuid (); |
| 54 | &cmp ("eax",0x80000008); | 54 | &cmp ("eax",0x80000001); |
| 55 | &jb (&label("intel")); | ||
| 56 | &mov ("esi","eax"); | ||
| 57 | &mov ("eax",0x80000001); | ||
| 58 | &cpuid (); | ||
| 59 | &or ("ebp","ecx"); | ||
| 60 | &and ("ebp",1<<11|1); # isolate XOP bit | ||
| 61 | &cmp ("esi",0x80000008); | ||
| 55 | &jb (&label("intel")); | 62 | &jb (&label("intel")); |
| 56 | 63 | ||
| 57 | &mov ("eax",0x80000008); | 64 | &mov ("eax",0x80000008); |
| @@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
| 62 | &mov ("eax",1); | 69 | &mov ("eax",1); |
| 63 | &cpuid (); | 70 | &cpuid (); |
| 64 | &bt ("edx",28); | 71 | &bt ("edx",28); |
| 65 | &jnc (&label("done")); | 72 | &jnc (&label("generic")); |
| 66 | &shr ("ebx",16); | 73 | &shr ("ebx",16); |
| 67 | &and ("ebx",0xff); | 74 | &and ("ebx",0xff); |
| 68 | &cmp ("ebx","esi"); | 75 | &cmp ("ebx","esi"); |
| 69 | &ja (&label("done")); | 76 | &ja (&label("generic")); |
| 70 | &and ("edx",0xefffffff); # clear hyper-threading bit | 77 | &and ("edx",0xefffffff); # clear hyper-threading bit |
| 71 | &jmp (&label("done")); | 78 | &jmp (&label("generic")); |
| 72 | 79 | ||
| 73 | &set_label("intel"); | 80 | &set_label("intel"); |
| 74 | &cmp ("edi",4); | 81 | &cmp ("edi",4); |
| @@ -85,27 +92,51 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
| 85 | &set_label("nocacheinfo"); | 92 | &set_label("nocacheinfo"); |
| 86 | &mov ("eax",1); | 93 | &mov ("eax",1); |
| 87 | &cpuid (); | 94 | &cpuid (); |
| 95 | &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0 | ||
| 88 | &cmp ("ebp",0); | 96 | &cmp ("ebp",0); |
| 89 | &jne (&label("notP4")); | 97 | &jne (&label("notintel")); |
| 98 | &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs | ||
| 90 | &and (&HB("eax"),15); # familiy ID | 99 | &and (&HB("eax"),15); # familiy ID |
| 91 | &cmp (&HB("eax"),15); # P4? | 100 | &cmp (&HB("eax"),15); # P4? |
| 92 | &jne (&label("notP4")); | 101 | &jne (&label("notintel")); |
| 93 | &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR | 102 | &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR |
| 94 | &set_label("notP4"); | 103 | &set_label("notintel"); |
| 95 | &bt ("edx",28); # test hyper-threading bit | 104 | &bt ("edx",28); # test hyper-threading bit |
| 96 | &jnc (&label("done")); | 105 | &jnc (&label("generic")); |
| 97 | &and ("edx",0xefffffff); | 106 | &and ("edx",0xefffffff); |
| 98 | &cmp ("edi",0); | 107 | &cmp ("edi",0); |
| 99 | &je (&label("done")); | 108 | &je (&label("generic")); |
| 100 | 109 | ||
| 101 | &or ("edx",0x10000000); | 110 | &or ("edx",0x10000000); |
| 102 | &shr ("ebx",16); | 111 | &shr ("ebx",16); |
| 103 | &cmp (&LB("ebx"),1); | 112 | &cmp (&LB("ebx"),1); |
| 104 | &ja (&label("done")); | 113 | &ja (&label("generic")); |
| 105 | &and ("edx",0xefffffff); # clear hyper-threading bit if not | 114 | &and ("edx",0xefffffff); # clear hyper-threading bit if not |
| 115 | |||
| 116 | &set_label("generic"); | ||
| 117 | &and ("ebp",1<<11); # isolate AMD XOP flag | ||
| 118 | &and ("ecx",0xfffff7ff); # force 11th bit to 0 | ||
| 119 | &mov ("esi","edx"); | ||
| 120 | &or ("ebp","ecx"); # merge AMD XOP flag | ||
| 121 | |||
| 122 | &bt ("ecx",27); # check OSXSAVE bit | ||
| 123 | &jnc (&label("clear_avx")); | ||
| 124 | &xor ("ecx","ecx"); | ||
| 125 | &data_byte(0x0f,0x01,0xd0); # xgetbv | ||
| 126 | &and ("eax",6); | ||
| 127 | &cmp ("eax",6); | ||
| 128 | &je (&label("done")); | ||
| 129 | &cmp ("eax",2); | ||
| 130 | &je (&label("clear_avx")); | ||
| 131 | &set_label("clear_xmm"); | ||
| 132 | &and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits | ||
| 133 | &and ("esi",0xfeffffff); # clear FXSR | ||
| 134 | &set_label("clear_avx"); | ||
| 135 | &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits | ||
| 106 | &set_label("done"); | 136 | &set_label("done"); |
| 107 | &mov ("eax","edx"); | 137 | &mov ("eax","esi"); |
| 108 | &mov ("edx","ecx"); | 138 | &mov ("edx","ebp"); |
| 139 | &set_label("nocpuid"); | ||
| 109 | &function_end("OPENSSL_ia32_cpuid"); | 140 | &function_end("OPENSSL_ia32_cpuid"); |
| 110 | 141 | ||
| 111 | &external_label("OPENSSL_ia32cap_P"); | 142 | &external_label("OPENSSL_ia32cap_P"); |
| @@ -199,8 +230,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
| 199 | &bt (&DWP(0,"ecx"),1); | 230 | &bt (&DWP(0,"ecx"),1); |
| 200 | &jnc (&label("no_x87")); | 231 | &jnc (&label("no_x87")); |
| 201 | if ($sse2) { | 232 | if ($sse2) { |
| 202 | &bt (&DWP(0,"ecx"),26); | 233 | &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits |
| 203 | &jnc (&label("no_sse2")); | 234 | &cmp ("ecx",1<<26|1<<24); |
| 235 | &jne (&label("no_sse2")); | ||
| 204 | &pxor ("xmm0","xmm0"); | 236 | &pxor ("xmm0","xmm0"); |
| 205 | &pxor ("xmm1","xmm1"); | 237 | &pxor ("xmm1","xmm1"); |
| 206 | &pxor ("xmm2","xmm2"); | 238 | &pxor ("xmm2","xmm2"); |
| @@ -307,6 +339,18 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
| 307 | &ret (); | 339 | &ret (); |
| 308 | &function_end_B("OPENSSL_cleanse"); | 340 | &function_end_B("OPENSSL_cleanse"); |
| 309 | 341 | ||
| 342 | &function_begin_B("OPENSSL_ia32_rdrand"); | ||
| 343 | &mov ("ecx",8); | ||
| 344 | &set_label("loop"); | ||
| 345 | &rdrand ("eax"); | ||
| 346 | &jc (&label("break")); | ||
| 347 | &loop (&label("loop")); | ||
| 348 | &set_label("break"); | ||
| 349 | &cmp ("eax",0); | ||
| 350 | &cmove ("eax","ecx"); | ||
| 351 | &ret (); | ||
| 352 | &function_end_B("OPENSSL_ia32_rdrand"); | ||
| 353 | |||
| 310 | &initseg("OPENSSL_cpuid_setup"); | 354 | &initseg("OPENSSL_cpuid_setup"); |
| 311 | 355 | ||
| 312 | &asm_finish(); | 356 | &asm_finish(); |
diff --git a/src/lib/libssl/d1_both.c b/src/lib/libssl/d1_both.c index 9f898d6997..de8bab873f 100644 --- a/src/lib/libssl/d1_both.c +++ b/src/lib/libssl/d1_both.c | |||
| @@ -227,14 +227,14 @@ int dtls1_do_write(SSL *s, int type) | |||
| 227 | unsigned int len, frag_off, mac_size, blocksize; | 227 | unsigned int len, frag_off, mac_size, blocksize; |
| 228 | 228 | ||
| 229 | /* AHA! Figure out the MTU, and stick to the right size */ | 229 | /* AHA! Figure out the MTU, and stick to the right size */ |
| 230 | if ( ! (SSL_get_options(s) & SSL_OP_NO_QUERY_MTU)) | 230 | if (s->d1->mtu < dtls1_min_mtu() && !(SSL_get_options(s) & SSL_OP_NO_QUERY_MTU)) |
| 231 | { | 231 | { |
| 232 | s->d1->mtu = | 232 | s->d1->mtu = |
| 233 | BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_QUERY_MTU, 0, NULL); | 233 | BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_QUERY_MTU, 0, NULL); |
| 234 | 234 | ||
| 235 | /* I've seen the kernel return bogus numbers when it doesn't know | 235 | /* I've seen the kernel return bogus numbers when it doesn't know |
| 236 | * (initial write), so just make sure we have a reasonable number */ | 236 | * (initial write), so just make sure we have a reasonable number */ |
| 237 | if ( s->d1->mtu < dtls1_min_mtu()) | 237 | if (s->d1->mtu < dtls1_min_mtu()) |
| 238 | { | 238 | { |
| 239 | s->d1->mtu = 0; | 239 | s->d1->mtu = 0; |
| 240 | s->d1->mtu = dtls1_guess_mtu(s->d1->mtu); | 240 | s->d1->mtu = dtls1_guess_mtu(s->d1->mtu); |
| @@ -1084,7 +1084,11 @@ int dtls1_read_failed(SSL *s, int code) | |||
| 1084 | return code; | 1084 | return code; |
| 1085 | } | 1085 | } |
| 1086 | 1086 | ||
| 1087 | if ( ! SSL_in_init(s)) /* done, no need to send a retransmit */ | 1087 | #ifndef OPENSSL_NO_HEARTBEATS |
| 1088 | if (!SSL_in_init(s) && !s->tlsext_hb_pending) /* done, no need to send a retransmit */ | ||
| 1089 | #else | ||
| 1090 | if (!SSL_in_init(s)) /* done, no need to send a retransmit */ | ||
| 1091 | #endif | ||
| 1088 | { | 1092 | { |
| 1089 | BIO_set_flags(SSL_get_rbio(s), BIO_FLAGS_READ); | 1093 | BIO_set_flags(SSL_get_rbio(s), BIO_FLAGS_READ); |
| 1090 | return code; | 1094 | return code; |
| @@ -1417,3 +1421,171 @@ dtls1_get_ccs_header(unsigned char *data, struct ccs_header_st *ccs_hdr) | |||
| 1417 | 1421 | ||
| 1418 | ccs_hdr->type = *(data++); | 1422 | ccs_hdr->type = *(data++); |
| 1419 | } | 1423 | } |
| 1424 | |||
| 1425 | int dtls1_shutdown(SSL *s) | ||
| 1426 | { | ||
| 1427 | int ret; | ||
| 1428 | #ifndef OPENSSL_NO_SCTP | ||
| 1429 | if (BIO_dgram_is_sctp(SSL_get_wbio(s)) && | ||
| 1430 | !(s->shutdown & SSL_SENT_SHUTDOWN)) | ||
| 1431 | { | ||
| 1432 | ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s)); | ||
| 1433 | if (ret < 0) return -1; | ||
| 1434 | |||
| 1435 | if (ret == 0) | ||
| 1436 | BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 1, NULL); | ||
| 1437 | } | ||
| 1438 | #endif | ||
| 1439 | ret = ssl3_shutdown(s); | ||
| 1440 | #ifndef OPENSSL_NO_SCTP | ||
| 1441 | BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 0, NULL); | ||
| 1442 | #endif | ||
| 1443 | return ret; | ||
| 1444 | } | ||
| 1445 | |||
| 1446 | #ifndef OPENSSL_NO_HEARTBEATS | ||
| 1447 | int | ||
| 1448 | dtls1_process_heartbeat(SSL *s) | ||
| 1449 | { | ||
| 1450 | unsigned char *p = &s->s3->rrec.data[0], *pl; | ||
| 1451 | unsigned short hbtype; | ||
| 1452 | unsigned int payload; | ||
| 1453 | unsigned int padding = 16; /* Use minimum padding */ | ||
| 1454 | |||
| 1455 | /* Read type and payload length first */ | ||
| 1456 | hbtype = *p++; | ||
| 1457 | n2s(p, payload); | ||
| 1458 | pl = p; | ||
| 1459 | |||
| 1460 | if (s->msg_callback) | ||
| 1461 | s->msg_callback(0, s->version, TLS1_RT_HEARTBEAT, | ||
| 1462 | &s->s3->rrec.data[0], s->s3->rrec.length, | ||
| 1463 | s, s->msg_callback_arg); | ||
| 1464 | |||
| 1465 | if (hbtype == TLS1_HB_REQUEST) | ||
| 1466 | { | ||
| 1467 | unsigned char *buffer, *bp; | ||
| 1468 | int r; | ||
| 1469 | |||
| 1470 | /* Allocate memory for the response, size is 1 byte | ||
| 1471 | * message type, plus 2 bytes payload length, plus | ||
| 1472 | * payload, plus padding | ||
| 1473 | */ | ||
| 1474 | buffer = OPENSSL_malloc(1 + 2 + payload + padding); | ||
| 1475 | bp = buffer; | ||
| 1476 | |||
| 1477 | /* Enter response type, length and copy payload */ | ||
| 1478 | *bp++ = TLS1_HB_RESPONSE; | ||
| 1479 | s2n(payload, bp); | ||
| 1480 | memcpy(bp, pl, payload); | ||
| 1481 | bp += payload; | ||
| 1482 | /* Random padding */ | ||
| 1483 | RAND_pseudo_bytes(bp, padding); | ||
| 1484 | |||
| 1485 | r = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buffer, 3 + payload + padding); | ||
| 1486 | |||
| 1487 | if (r >= 0 && s->msg_callback) | ||
| 1488 | s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT, | ||
| 1489 | buffer, 3 + payload + padding, | ||
| 1490 | s, s->msg_callback_arg); | ||
| 1491 | |||
| 1492 | OPENSSL_free(buffer); | ||
| 1493 | |||
| 1494 | if (r < 0) | ||
| 1495 | return r; | ||
| 1496 | } | ||
| 1497 | else if (hbtype == TLS1_HB_RESPONSE) | ||
| 1498 | { | ||
| 1499 | unsigned int seq; | ||
| 1500 | |||
| 1501 | /* We only send sequence numbers (2 bytes unsigned int), | ||
| 1502 | * and 16 random bytes, so we just try to read the | ||
| 1503 | * sequence number */ | ||
| 1504 | n2s(pl, seq); | ||
| 1505 | |||
| 1506 | if (payload == 18 && seq == s->tlsext_hb_seq) | ||
| 1507 | { | ||
| 1508 | dtls1_stop_timer(s); | ||
| 1509 | s->tlsext_hb_seq++; | ||
| 1510 | s->tlsext_hb_pending = 0; | ||
| 1511 | } | ||
| 1512 | } | ||
| 1513 | |||
| 1514 | return 0; | ||
| 1515 | } | ||
| 1516 | |||
| 1517 | int | ||
| 1518 | dtls1_heartbeat(SSL *s) | ||
| 1519 | { | ||
| 1520 | unsigned char *buf, *p; | ||
| 1521 | int ret; | ||
| 1522 | unsigned int payload = 18; /* Sequence number + random bytes */ | ||
| 1523 | unsigned int padding = 16; /* Use minimum padding */ | ||
| 1524 | |||
| 1525 | /* Only send if peer supports and accepts HB requests... */ | ||
| 1526 | if (!(s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED) || | ||
| 1527 | s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_SEND_REQUESTS) | ||
| 1528 | { | ||
| 1529 | SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT); | ||
| 1530 | return -1; | ||
| 1531 | } | ||
| 1532 | |||
| 1533 | /* ...and there is none in flight yet... */ | ||
| 1534 | if (s->tlsext_hb_pending) | ||
| 1535 | { | ||
| 1536 | SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PENDING); | ||
| 1537 | return -1; | ||
| 1538 | } | ||
| 1539 | |||
| 1540 | /* ...and no handshake in progress. */ | ||
| 1541 | if (SSL_in_init(s) || s->in_handshake) | ||
| 1542 | { | ||
| 1543 | SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_UNEXPECTED_MESSAGE); | ||
| 1544 | return -1; | ||
| 1545 | } | ||
| 1546 | |||
| 1547 | /* Check if padding is too long, payload and padding | ||
| 1548 | * must not exceed 2^14 - 3 = 16381 bytes in total. | ||
| 1549 | */ | ||
| 1550 | OPENSSL_assert(payload + padding <= 16381); | ||
| 1551 | |||
| 1552 | /* Create HeartBeat message, we just use a sequence number | ||
| 1553 | * as payload to distuingish different messages and add | ||
| 1554 | * some random stuff. | ||
| 1555 | * - Message Type, 1 byte | ||
| 1556 | * - Payload Length, 2 bytes (unsigned int) | ||
| 1557 | * - Payload, the sequence number (2 bytes uint) | ||
| 1558 | * - Payload, random bytes (16 bytes uint) | ||
| 1559 | * - Padding | ||
| 1560 | */ | ||
| 1561 | buf = OPENSSL_malloc(1 + 2 + payload + padding); | ||
| 1562 | p = buf; | ||
| 1563 | /* Message Type */ | ||
| 1564 | *p++ = TLS1_HB_REQUEST; | ||
| 1565 | /* Payload length (18 bytes here) */ | ||
| 1566 | s2n(payload, p); | ||
| 1567 | /* Sequence number */ | ||
| 1568 | s2n(s->tlsext_hb_seq, p); | ||
| 1569 | /* 16 random bytes */ | ||
| 1570 | RAND_pseudo_bytes(p, 16); | ||
| 1571 | p += 16; | ||
| 1572 | /* Random padding */ | ||
| 1573 | RAND_pseudo_bytes(p, padding); | ||
| 1574 | |||
| 1575 | ret = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buf, 3 + payload + padding); | ||
| 1576 | if (ret >= 0) | ||
| 1577 | { | ||
| 1578 | if (s->msg_callback) | ||
| 1579 | s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT, | ||
| 1580 | buf, 3 + payload + padding, | ||
| 1581 | s, s->msg_callback_arg); | ||
| 1582 | |||
| 1583 | dtls1_start_timer(s); | ||
| 1584 | s->tlsext_hb_pending = 1; | ||
| 1585 | } | ||
| 1586 | |||
| 1587 | OPENSSL_free(buf); | ||
| 1588 | |||
| 1589 | return ret; | ||
| 1590 | } | ||
| 1591 | #endif | ||
diff --git a/src/lib/libssl/d1_enc.c b/src/lib/libssl/d1_enc.c index becbab91c2..07a5e97ce5 100644 --- a/src/lib/libssl/d1_enc.c +++ b/src/lib/libssl/d1_enc.c | |||
| @@ -260,7 +260,7 @@ int dtls1_enc(SSL *s, int send) | |||
| 260 | } | 260 | } |
| 261 | /* TLS 1.0 does not bound the number of padding bytes by the block size. | 261 | /* TLS 1.0 does not bound the number of padding bytes by the block size. |
| 262 | * All of them must have value 'padding_length'. */ | 262 | * All of them must have value 'padding_length'. */ |
| 263 | if (i > (int)rec->length) | 263 | if (i + bs > (int)rec->length) |
| 264 | { | 264 | { |
| 265 | /* Incorrect padding. SSLerr() and ssl3_alert are done | 265 | /* Incorrect padding. SSLerr() and ssl3_alert are done |
| 266 | * by caller: we don't want to reveal whether this is | 266 | * by caller: we don't want to reveal whether this is |
diff --git a/src/lib/libssl/d1_lib.c b/src/lib/libssl/d1_lib.c index c3b77c889b..f61f718183 100644 --- a/src/lib/libssl/d1_lib.c +++ b/src/lib/libssl/d1_lib.c | |||
| @@ -82,6 +82,7 @@ SSL3_ENC_METHOD DTLSv1_enc_data={ | |||
| 82 | TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE, | 82 | TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE, |
| 83 | TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE, | 83 | TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE, |
| 84 | tls1_alert_code, | 84 | tls1_alert_code, |
| 85 | tls1_export_keying_material, | ||
| 85 | }; | 86 | }; |
| 86 | 87 | ||
| 87 | long dtls1_default_timeout(void) | 88 | long dtls1_default_timeout(void) |
| @@ -291,6 +292,15 @@ const SSL_CIPHER *dtls1_get_cipher(unsigned int u) | |||
| 291 | 292 | ||
| 292 | void dtls1_start_timer(SSL *s) | 293 | void dtls1_start_timer(SSL *s) |
| 293 | { | 294 | { |
| 295 | #ifndef OPENSSL_NO_SCTP | ||
| 296 | /* Disable timer for SCTP */ | ||
| 297 | if (BIO_dgram_is_sctp(SSL_get_wbio(s))) | ||
| 298 | { | ||
| 299 | memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); | ||
| 300 | return; | ||
| 301 | } | ||
| 302 | #endif | ||
| 303 | |||
| 294 | /* If timer is not set, initialize duration with 1 second */ | 304 | /* If timer is not set, initialize duration with 1 second */ |
| 295 | if (s->d1->next_timeout.tv_sec == 0 && s->d1->next_timeout.tv_usec == 0) | 305 | if (s->d1->next_timeout.tv_sec == 0 && s->d1->next_timeout.tv_usec == 0) |
| 296 | { | 306 | { |
| @@ -381,6 +391,7 @@ void dtls1_double_timeout(SSL *s) | |||
| 381 | void dtls1_stop_timer(SSL *s) | 391 | void dtls1_stop_timer(SSL *s) |
| 382 | { | 392 | { |
| 383 | /* Reset everything */ | 393 | /* Reset everything */ |
| 394 | memset(&(s->d1->timeout), 0, sizeof(struct dtls1_timeout_st)); | ||
| 384 | memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); | 395 | memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); |
| 385 | s->d1->timeout_duration = 1; | 396 | s->d1->timeout_duration = 1; |
| 386 | BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT, 0, &(s->d1->next_timeout)); | 397 | BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT, 0, &(s->d1->next_timeout)); |
| @@ -388,10 +399,28 @@ void dtls1_stop_timer(SSL *s) | |||
| 388 | dtls1_clear_record_buffer(s); | 399 | dtls1_clear_record_buffer(s); |
| 389 | } | 400 | } |
| 390 | 401 | ||
| 391 | int dtls1_handle_timeout(SSL *s) | 402 | int dtls1_check_timeout_num(SSL *s) |
| 392 | { | 403 | { |
| 393 | DTLS1_STATE *state; | 404 | s->d1->timeout.num_alerts++; |
| 405 | |||
| 406 | /* Reduce MTU after 2 unsuccessful retransmissions */ | ||
| 407 | if (s->d1->timeout.num_alerts > 2) | ||
| 408 | { | ||
| 409 | s->d1->mtu = BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_GET_FALLBACK_MTU, 0, NULL); | ||
| 410 | } | ||
| 394 | 411 | ||
| 412 | if (s->d1->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT) | ||
| 413 | { | ||
| 414 | /* fail the connection, enough alerts have been sent */ | ||
| 415 | SSLerr(SSL_F_DTLS1_CHECK_TIMEOUT_NUM,SSL_R_READ_TIMEOUT_EXPIRED); | ||
| 416 | return -1; | ||
| 417 | } | ||
| 418 | |||
| 419 | return 0; | ||
| 420 | } | ||
| 421 | |||
| 422 | int dtls1_handle_timeout(SSL *s) | ||
| 423 | { | ||
| 395 | /* if no timer is expired, don't do anything */ | 424 | /* if no timer is expired, don't do anything */ |
| 396 | if (!dtls1_is_timer_expired(s)) | 425 | if (!dtls1_is_timer_expired(s)) |
| 397 | { | 426 | { |
| @@ -399,20 +428,23 @@ int dtls1_handle_timeout(SSL *s) | |||
| 399 | } | 428 | } |
| 400 | 429 | ||
| 401 | dtls1_double_timeout(s); | 430 | dtls1_double_timeout(s); |
| 402 | state = s->d1; | 431 | |
| 403 | state->timeout.num_alerts++; | 432 | if (dtls1_check_timeout_num(s) < 0) |
| 404 | if ( state->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT) | ||
| 405 | { | ||
| 406 | /* fail the connection, enough alerts have been sent */ | ||
| 407 | SSLerr(SSL_F_DTLS1_HANDLE_TIMEOUT,SSL_R_READ_TIMEOUT_EXPIRED); | ||
| 408 | return -1; | 433 | return -1; |
| 434 | |||
| 435 | s->d1->timeout.read_timeouts++; | ||
| 436 | if (s->d1->timeout.read_timeouts > DTLS1_TMO_READ_COUNT) | ||
| 437 | { | ||
| 438 | s->d1->timeout.read_timeouts = 1; | ||
| 409 | } | 439 | } |
| 410 | 440 | ||
| 411 | state->timeout.read_timeouts++; | 441 | #ifndef OPENSSL_NO_HEARTBEATS |
| 412 | if ( state->timeout.read_timeouts > DTLS1_TMO_READ_COUNT) | 442 | if (s->tlsext_hb_pending) |
| 413 | { | 443 | { |
| 414 | state->timeout.read_timeouts = 1; | 444 | s->tlsext_hb_pending = 0; |
| 445 | return dtls1_heartbeat(s); | ||
| 415 | } | 446 | } |
| 447 | #endif | ||
| 416 | 448 | ||
| 417 | dtls1_start_timer(s); | 449 | dtls1_start_timer(s); |
| 418 | return dtls1_retransmit_buffered_messages(s); | 450 | return dtls1_retransmit_buffered_messages(s); |
diff --git a/src/lib/libssl/d1_srtp.c b/src/lib/libssl/d1_srtp.c new file mode 100644 index 0000000000..928935bd8b --- /dev/null +++ b/src/lib/libssl/d1_srtp.c | |||
| @@ -0,0 +1,493 @@ | |||
| 1 | /* ssl/t1_lib.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | /* ==================================================================== | ||
| 59 | * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. | ||
| 60 | * | ||
| 61 | * Redistribution and use in source and binary forms, with or without | ||
| 62 | * modification, are permitted provided that the following conditions | ||
| 63 | * are met: | ||
| 64 | * | ||
| 65 | * 1. Redistributions of source code must retain the above copyright | ||
| 66 | * notice, this list of conditions and the following disclaimer. | ||
| 67 | * | ||
| 68 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 69 | * notice, this list of conditions and the following disclaimer in | ||
| 70 | * the documentation and/or other materials provided with the | ||
| 71 | * distribution. | ||
| 72 | * | ||
| 73 | * 3. All advertising materials mentioning features or use of this | ||
| 74 | * software must display the following acknowledgment: | ||
| 75 | * "This product includes software developed by the OpenSSL Project | ||
| 76 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 77 | * | ||
| 78 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 79 | * endorse or promote products derived from this software without | ||
| 80 | * prior written permission. For written permission, please contact | ||
| 81 | * openssl-core@openssl.org. | ||
| 82 | * | ||
| 83 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 84 | * nor may "OpenSSL" appear in their names without prior written | ||
| 85 | * permission of the OpenSSL Project. | ||
| 86 | * | ||
| 87 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 88 | * acknowledgment: | ||
| 89 | * "This product includes software developed by the OpenSSL Project | ||
| 90 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 91 | * | ||
| 92 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 93 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 94 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 95 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 96 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 97 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 98 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 99 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 100 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 101 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 102 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 103 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 104 | * ==================================================================== | ||
| 105 | * | ||
| 106 | * This product includes cryptographic software written by Eric Young | ||
| 107 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
| 108 | * Hudson (tjh@cryptsoft.com). | ||
| 109 | * | ||
| 110 | */ | ||
| 111 | /* | ||
| 112 | DTLS code by Eric Rescorla <ekr@rtfm.com> | ||
| 113 | |||
| 114 | Copyright (C) 2006, Network Resonance, Inc. | ||
| 115 | Copyright (C) 2011, RTFM, Inc. | ||
| 116 | */ | ||
| 117 | |||
| 118 | #ifndef OPENSSL_NO_SRTP | ||
| 119 | |||
| 120 | #include <stdio.h> | ||
| 121 | #include <openssl/objects.h> | ||
| 122 | #include "ssl_locl.h" | ||
| 123 | #include "srtp.h" | ||
| 124 | |||
| 125 | |||
| 126 | static SRTP_PROTECTION_PROFILE srtp_known_profiles[]= | ||
| 127 | { | ||
| 128 | { | ||
| 129 | "SRTP_AES128_CM_SHA1_80", | ||
| 130 | SRTP_AES128_CM_SHA1_80, | ||
| 131 | }, | ||
| 132 | { | ||
| 133 | "SRTP_AES128_CM_SHA1_32", | ||
| 134 | SRTP_AES128_CM_SHA1_32, | ||
| 135 | }, | ||
| 136 | #if 0 | ||
| 137 | { | ||
| 138 | "SRTP_NULL_SHA1_80", | ||
| 139 | SRTP_NULL_SHA1_80, | ||
| 140 | }, | ||
| 141 | { | ||
| 142 | "SRTP_NULL_SHA1_32", | ||
| 143 | SRTP_NULL_SHA1_32, | ||
| 144 | }, | ||
| 145 | #endif | ||
| 146 | {0} | ||
| 147 | }; | ||
| 148 | |||
| 149 | static int find_profile_by_name(char *profile_name, | ||
| 150 | SRTP_PROTECTION_PROFILE **pptr,unsigned len) | ||
| 151 | { | ||
| 152 | SRTP_PROTECTION_PROFILE *p; | ||
| 153 | |||
| 154 | p=srtp_known_profiles; | ||
| 155 | while(p->name) | ||
| 156 | { | ||
| 157 | if((len == strlen(p->name)) && !strncmp(p->name,profile_name, | ||
| 158 | len)) | ||
| 159 | { | ||
| 160 | *pptr=p; | ||
| 161 | return 0; | ||
| 162 | } | ||
| 163 | |||
| 164 | p++; | ||
| 165 | } | ||
| 166 | |||
| 167 | return 1; | ||
| 168 | } | ||
| 169 | |||
| 170 | static int find_profile_by_num(unsigned profile_num, | ||
| 171 | SRTP_PROTECTION_PROFILE **pptr) | ||
| 172 | { | ||
| 173 | SRTP_PROTECTION_PROFILE *p; | ||
| 174 | |||
| 175 | p=srtp_known_profiles; | ||
| 176 | while(p->name) | ||
| 177 | { | ||
| 178 | if(p->id == profile_num) | ||
| 179 | { | ||
| 180 | *pptr=p; | ||
| 181 | return 0; | ||
| 182 | } | ||
| 183 | p++; | ||
| 184 | } | ||
| 185 | |||
| 186 | return 1; | ||
| 187 | } | ||
| 188 | |||
| 189 | static int ssl_ctx_make_profiles(const char *profiles_string,STACK_OF(SRTP_PROTECTION_PROFILE) **out) | ||
| 190 | { | ||
| 191 | STACK_OF(SRTP_PROTECTION_PROFILE) *profiles; | ||
| 192 | |||
| 193 | char *col; | ||
| 194 | char *ptr=(char *)profiles_string; | ||
| 195 | |||
| 196 | SRTP_PROTECTION_PROFILE *p; | ||
| 197 | |||
| 198 | if(!(profiles=sk_SRTP_PROTECTION_PROFILE_new_null())) | ||
| 199 | { | ||
| 200 | SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES, SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES); | ||
| 201 | return 1; | ||
| 202 | } | ||
| 203 | |||
| 204 | do | ||
| 205 | { | ||
| 206 | col=strchr(ptr,':'); | ||
| 207 | |||
| 208 | if(!find_profile_by_name(ptr,&p, | ||
| 209 | col ? col-ptr : (int)strlen(ptr))) | ||
| 210 | { | ||
| 211 | sk_SRTP_PROTECTION_PROFILE_push(profiles,p); | ||
| 212 | } | ||
| 213 | else | ||
| 214 | { | ||
| 215 | SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES,SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE); | ||
| 216 | return 1; | ||
| 217 | } | ||
| 218 | |||
| 219 | if(col) ptr=col+1; | ||
| 220 | } while (col); | ||
| 221 | |||
| 222 | *out=profiles; | ||
| 223 | |||
| 224 | return 0; | ||
| 225 | } | ||
| 226 | |||
| 227 | int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx,const char *profiles) | ||
| 228 | { | ||
| 229 | return ssl_ctx_make_profiles(profiles,&ctx->srtp_profiles); | ||
| 230 | } | ||
| 231 | |||
| 232 | int SSL_set_tlsext_use_srtp(SSL *s,const char *profiles) | ||
| 233 | { | ||
| 234 | return ssl_ctx_make_profiles(profiles,&s->srtp_profiles); | ||
| 235 | } | ||
| 236 | |||
| 237 | |||
| 238 | STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *s) | ||
| 239 | { | ||
| 240 | if(s != NULL) | ||
| 241 | { | ||
| 242 | if(s->srtp_profiles != NULL) | ||
| 243 | { | ||
| 244 | return s->srtp_profiles; | ||
| 245 | } | ||
| 246 | else if((s->ctx != NULL) && | ||
| 247 | (s->ctx->srtp_profiles != NULL)) | ||
| 248 | { | ||
| 249 | return s->ctx->srtp_profiles; | ||
| 250 | } | ||
| 251 | } | ||
| 252 | |||
| 253 | return NULL; | ||
| 254 | } | ||
| 255 | |||
| 256 | SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s) | ||
| 257 | { | ||
| 258 | return s->srtp_profile; | ||
| 259 | } | ||
| 260 | |||
| 261 | /* Note: this function returns 0 length if there are no | ||
| 262 | profiles specified */ | ||
| 263 | int ssl_add_clienthello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen) | ||
| 264 | { | ||
| 265 | int ct=0; | ||
| 266 | int i; | ||
| 267 | STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0; | ||
| 268 | SRTP_PROTECTION_PROFILE *prof; | ||
| 269 | |||
| 270 | clnt=SSL_get_srtp_profiles(s); | ||
| 271 | ct=sk_SRTP_PROTECTION_PROFILE_num(clnt); /* -1 if clnt == 0 */ | ||
| 272 | |||
| 273 | if(p) | ||
| 274 | { | ||
| 275 | if(ct==0) | ||
| 276 | { | ||
| 277 | SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST); | ||
| 278 | return 1; | ||
| 279 | } | ||
| 280 | |||
| 281 | if((2 + ct*2 + 1) > maxlen) | ||
| 282 | { | ||
| 283 | SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG); | ||
| 284 | return 1; | ||
| 285 | } | ||
| 286 | |||
| 287 | /* Add the length */ | ||
| 288 | s2n(ct * 2, p); | ||
| 289 | for(i=0;i<ct;i++) | ||
| 290 | { | ||
| 291 | prof=sk_SRTP_PROTECTION_PROFILE_value(clnt,i); | ||
| 292 | s2n(prof->id,p); | ||
| 293 | } | ||
| 294 | |||
| 295 | /* Add an empty use_mki value */ | ||
| 296 | *p++ = 0; | ||
| 297 | } | ||
| 298 | |||
| 299 | *len=2 + ct*2 + 1; | ||
| 300 | |||
| 301 | return 0; | ||
| 302 | } | ||
| 303 | |||
| 304 | |||
| 305 | int ssl_parse_clienthello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al) | ||
| 306 | { | ||
| 307 | SRTP_PROTECTION_PROFILE *cprof,*sprof; | ||
| 308 | STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0,*srvr; | ||
| 309 | int ct; | ||
| 310 | int mki_len; | ||
| 311 | int i,j; | ||
| 312 | int id; | ||
| 313 | int ret; | ||
| 314 | |||
| 315 | /* Length value + the MKI length */ | ||
| 316 | if(len < 3) | ||
| 317 | { | ||
| 318 | SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
| 319 | *al=SSL_AD_DECODE_ERROR; | ||
| 320 | return 1; | ||
| 321 | } | ||
| 322 | |||
| 323 | /* Pull off the length of the cipher suite list */ | ||
| 324 | n2s(d, ct); | ||
| 325 | len -= 2; | ||
| 326 | |||
| 327 | /* Check that it is even */ | ||
| 328 | if(ct%2) | ||
| 329 | { | ||
| 330 | SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
| 331 | *al=SSL_AD_DECODE_ERROR; | ||
| 332 | return 1; | ||
| 333 | } | ||
| 334 | |||
| 335 | /* Check that lengths are consistent */ | ||
| 336 | if(len < (ct + 1)) | ||
| 337 | { | ||
| 338 | SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
| 339 | *al=SSL_AD_DECODE_ERROR; | ||
| 340 | return 1; | ||
| 341 | } | ||
| 342 | |||
| 343 | |||
| 344 | clnt=sk_SRTP_PROTECTION_PROFILE_new_null(); | ||
| 345 | |||
| 346 | while(ct) | ||
| 347 | { | ||
| 348 | n2s(d,id); | ||
| 349 | ct-=2; | ||
| 350 | len-=2; | ||
| 351 | |||
| 352 | if(!find_profile_by_num(id,&cprof)) | ||
| 353 | { | ||
| 354 | sk_SRTP_PROTECTION_PROFILE_push(clnt,cprof); | ||
| 355 | } | ||
| 356 | else | ||
| 357 | { | ||
| 358 | ; /* Ignore */ | ||
| 359 | } | ||
| 360 | } | ||
| 361 | |||
| 362 | /* Now extract the MKI value as a sanity check, but discard it for now */ | ||
| 363 | mki_len = *d; | ||
| 364 | d++; len--; | ||
| 365 | |||
| 366 | if (mki_len != len) | ||
| 367 | { | ||
| 368 | SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE); | ||
| 369 | *al=SSL_AD_DECODE_ERROR; | ||
| 370 | return 1; | ||
| 371 | } | ||
| 372 | |||
| 373 | srvr=SSL_get_srtp_profiles(s); | ||
| 374 | |||
| 375 | /* Pick our most preferred profile. If no profiles have been | ||
| 376 | configured then the outer loop doesn't run | ||
| 377 | (sk_SRTP_PROTECTION_PROFILE_num() = -1) | ||
| 378 | and so we just return without doing anything */ | ||
| 379 | for(i=0;i<sk_SRTP_PROTECTION_PROFILE_num(srvr);i++) | ||
| 380 | { | ||
| 381 | sprof=sk_SRTP_PROTECTION_PROFILE_value(srvr,i); | ||
| 382 | |||
| 383 | for(j=0;j<sk_SRTP_PROTECTION_PROFILE_num(clnt);j++) | ||
| 384 | { | ||
| 385 | cprof=sk_SRTP_PROTECTION_PROFILE_value(clnt,j); | ||
| 386 | |||
| 387 | if(cprof->id==sprof->id) | ||
| 388 | { | ||
| 389 | s->srtp_profile=sprof; | ||
| 390 | *al=0; | ||
| 391 | ret=0; | ||
| 392 | goto done; | ||
| 393 | } | ||
| 394 | } | ||
| 395 | } | ||
| 396 | |||
| 397 | ret=0; | ||
| 398 | |||
| 399 | done: | ||
| 400 | if(clnt) sk_SRTP_PROTECTION_PROFILE_free(clnt); | ||
| 401 | |||
| 402 | return ret; | ||
| 403 | } | ||
| 404 | |||
| 405 | int ssl_add_serverhello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen) | ||
| 406 | { | ||
| 407 | if(p) | ||
| 408 | { | ||
| 409 | if(maxlen < 5) | ||
| 410 | { | ||
| 411 | SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG); | ||
| 412 | return 1; | ||
| 413 | } | ||
| 414 | |||
| 415 | if(s->srtp_profile==0) | ||
| 416 | { | ||
| 417 | SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_USE_SRTP_NOT_NEGOTIATED); | ||
| 418 | return 1; | ||
| 419 | } | ||
| 420 | s2n(2, p); | ||
| 421 | s2n(s->srtp_profile->id,p); | ||
| 422 | *p++ = 0; | ||
| 423 | } | ||
| 424 | *len=5; | ||
| 425 | |||
| 426 | return 0; | ||
| 427 | } | ||
| 428 | |||
| 429 | |||
| 430 | int ssl_parse_serverhello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al) | ||
| 431 | { | ||
| 432 | unsigned id; | ||
| 433 | int i; | ||
| 434 | int ct; | ||
| 435 | |||
| 436 | STACK_OF(SRTP_PROTECTION_PROFILE) *clnt; | ||
| 437 | SRTP_PROTECTION_PROFILE *prof; | ||
| 438 | |||
| 439 | if(len!=5) | ||
| 440 | { | ||
| 441 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
| 442 | *al=SSL_AD_DECODE_ERROR; | ||
| 443 | return 1; | ||
| 444 | } | ||
| 445 | |||
| 446 | n2s(d, ct); | ||
| 447 | if(ct!=2) | ||
| 448 | { | ||
| 449 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
| 450 | *al=SSL_AD_DECODE_ERROR; | ||
| 451 | return 1; | ||
| 452 | } | ||
| 453 | |||
| 454 | n2s(d,id); | ||
| 455 | if (*d) /* Must be no MKI, since we never offer one */ | ||
| 456 | { | ||
| 457 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE); | ||
| 458 | *al=SSL_AD_ILLEGAL_PARAMETER; | ||
| 459 | return 1; | ||
| 460 | } | ||
| 461 | |||
| 462 | clnt=SSL_get_srtp_profiles(s); | ||
| 463 | |||
| 464 | /* Throw an error if the server gave us an unsolicited extension */ | ||
| 465 | if (clnt == NULL) | ||
| 466 | { | ||
| 467 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_NO_SRTP_PROFILES); | ||
| 468 | *al=SSL_AD_DECODE_ERROR; | ||
| 469 | return 1; | ||
| 470 | } | ||
| 471 | |||
| 472 | /* Check to see if the server gave us something we support | ||
| 473 | (and presumably offered) | ||
| 474 | */ | ||
| 475 | for(i=0;i<sk_SRTP_PROTECTION_PROFILE_num(clnt);i++) | ||
| 476 | { | ||
| 477 | prof=sk_SRTP_PROTECTION_PROFILE_value(clnt,i); | ||
| 478 | |||
| 479 | if(prof->id == id) | ||
| 480 | { | ||
| 481 | s->srtp_profile=prof; | ||
| 482 | *al=0; | ||
| 483 | return 0; | ||
| 484 | } | ||
| 485 | } | ||
| 486 | |||
| 487 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
| 488 | *al=SSL_AD_DECODE_ERROR; | ||
| 489 | return 1; | ||
| 490 | } | ||
| 491 | |||
| 492 | |||
| 493 | #endif | ||
diff --git a/src/lib/libssl/srtp.h b/src/lib/libssl/srtp.h new file mode 100644 index 0000000000..c0cf33ef28 --- /dev/null +++ b/src/lib/libssl/srtp.h | |||
| @@ -0,0 +1,145 @@ | |||
| 1 | /* ssl/tls1.h */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | /* ==================================================================== | ||
| 59 | * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. | ||
| 60 | * | ||
| 61 | * Redistribution and use in source and binary forms, with or without | ||
| 62 | * modification, are permitted provided that the following conditions | ||
| 63 | * are met: | ||
| 64 | * | ||
| 65 | * 1. Redistributions of source code must retain the above copyright | ||
| 66 | * notice, this list of conditions and the following disclaimer. | ||
| 67 | * | ||
| 68 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 69 | * notice, this list of conditions and the following disclaimer in | ||
| 70 | * the documentation and/or other materials provided with the | ||
| 71 | * distribution. | ||
| 72 | * | ||
| 73 | * 3. All advertising materials mentioning features or use of this | ||
| 74 | * software must display the following acknowledgment: | ||
| 75 | * "This product includes software developed by the OpenSSL Project | ||
| 76 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 77 | * | ||
| 78 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 79 | * endorse or promote products derived from this software without | ||
| 80 | * prior written permission. For written permission, please contact | ||
| 81 | * openssl-core@openssl.org. | ||
| 82 | * | ||
| 83 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 84 | * nor may "OpenSSL" appear in their names without prior written | ||
| 85 | * permission of the OpenSSL Project. | ||
| 86 | * | ||
| 87 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 88 | * acknowledgment: | ||
| 89 | * "This product includes software developed by the OpenSSL Project | ||
| 90 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 91 | * | ||
| 92 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 93 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 94 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 95 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 96 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 97 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 98 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 99 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 100 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 101 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 102 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 103 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 104 | * ==================================================================== | ||
| 105 | * | ||
| 106 | * This product includes cryptographic software written by Eric Young | ||
| 107 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
| 108 | * Hudson (tjh@cryptsoft.com). | ||
| 109 | * | ||
| 110 | */ | ||
| 111 | /* | ||
| 112 | DTLS code by Eric Rescorla <ekr@rtfm.com> | ||
| 113 | |||
| 114 | Copyright (C) 2006, Network Resonance, Inc. | ||
| 115 | Copyright (C) 2011, RTFM, Inc. | ||
| 116 | */ | ||
| 117 | |||
| 118 | #ifndef HEADER_D1_SRTP_H | ||
| 119 | #define HEADER_D1_SRTP_H | ||
| 120 | |||
| 121 | #ifdef __cplusplus | ||
| 122 | extern "C" { | ||
| 123 | #endif | ||
| 124 | |||
| 125 | |||
| 126 | #define SRTP_AES128_CM_SHA1_80 0x0001 | ||
| 127 | #define SRTP_AES128_CM_SHA1_32 0x0002 | ||
| 128 | #define SRTP_AES128_F8_SHA1_80 0x0003 | ||
| 129 | #define SRTP_AES128_F8_SHA1_32 0x0004 | ||
| 130 | #define SRTP_NULL_SHA1_80 0x0005 | ||
| 131 | #define SRTP_NULL_SHA1_32 0x0006 | ||
| 132 | |||
| 133 | int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx, const char *profiles); | ||
| 134 | int SSL_set_tlsext_use_srtp(SSL *ctx, const char *profiles); | ||
| 135 | SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s); | ||
| 136 | |||
| 137 | STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *ssl); | ||
| 138 | SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s); | ||
| 139 | |||
| 140 | #ifdef __cplusplus | ||
| 141 | } | ||
| 142 | #endif | ||
| 143 | |||
| 144 | #endif | ||
| 145 | |||
diff --git a/src/lib/libssl/test/P1ss.cnf b/src/lib/libssl/test/P1ss.cnf index 876a0d35f8..326cce2ba8 100644 --- a/src/lib/libssl/test/P1ss.cnf +++ b/src/lib/libssl/test/P1ss.cnf | |||
| @@ -7,7 +7,7 @@ RANDFILE = ./.rnd | |||
| 7 | 7 | ||
| 8 | #################################################################### | 8 | #################################################################### |
| 9 | [ req ] | 9 | [ req ] |
| 10 | default_bits = 512 | 10 | default_bits = 1024 |
| 11 | default_keyfile = keySS.pem | 11 | default_keyfile = keySS.pem |
| 12 | distinguished_name = req_distinguished_name | 12 | distinguished_name = req_distinguished_name |
| 13 | encrypt_rsa_key = no | 13 | encrypt_rsa_key = no |
diff --git a/src/lib/libssl/test/P2ss.cnf b/src/lib/libssl/test/P2ss.cnf index 373a87e7c2..8b502321b8 100644 --- a/src/lib/libssl/test/P2ss.cnf +++ b/src/lib/libssl/test/P2ss.cnf | |||
| @@ -7,7 +7,7 @@ RANDFILE = ./.rnd | |||
| 7 | 7 | ||
| 8 | #################################################################### | 8 | #################################################################### |
| 9 | [ req ] | 9 | [ req ] |
| 10 | default_bits = 512 | 10 | default_bits = 1024 |
| 11 | default_keyfile = keySS.pem | 11 | default_keyfile = keySS.pem |
| 12 | distinguished_name = req_distinguished_name | 12 | distinguished_name = req_distinguished_name |
| 13 | encrypt_rsa_key = no | 13 | encrypt_rsa_key = no |
diff --git a/src/lib/libssl/test/pkits-test.pl b/src/lib/libssl/test/pkits-test.pl index 69dffa16f9..5c6b89fcdb 100644 --- a/src/lib/libssl/test/pkits-test.pl +++ b/src/lib/libssl/test/pkits-test.pl | |||
| @@ -784,6 +784,15 @@ my $ossl = "ossl/apps/openssl"; | |||
| 784 | 784 | ||
| 785 | my $ossl_cmd = "$ossl_path cms -verify -verify_retcode "; | 785 | my $ossl_cmd = "$ossl_path cms -verify -verify_retcode "; |
| 786 | $ossl_cmd .= "-CAfile pkitsta.pem -crl_check_all -x509_strict "; | 786 | $ossl_cmd .= "-CAfile pkitsta.pem -crl_check_all -x509_strict "; |
| 787 | |||
| 788 | # Check for expiry of trust anchor | ||
| 789 | system "$ossl_path x509 -inform DER -in $pkitsta -checkend 0"; | ||
| 790 | if ($? == 256) | ||
| 791 | { | ||
| 792 | print STDERR "WARNING: using older expired data\n"; | ||
| 793 | $ossl_cmd .= "-attime 1291940972 "; | ||
| 794 | } | ||
| 795 | |||
| 787 | $ossl_cmd .= "-policy_check -extended_crl -use_deltas -out /dev/null 2>&1 "; | 796 | $ossl_cmd .= "-policy_check -extended_crl -use_deltas -out /dev/null 2>&1 "; |
| 788 | 797 | ||
| 789 | system "$ossl_path x509 -inform DER -in $pkitsta -out pkitsta.pem"; | 798 | system "$ossl_path x509 -inform DER -in $pkitsta -out pkitsta.pem"; |
diff --git a/src/lib/libssl/test/test.cnf b/src/lib/libssl/test/test.cnf index faad3914a8..10834442a1 100644 --- a/src/lib/libssl/test/test.cnf +++ b/src/lib/libssl/test/test.cnf | |||
| @@ -56,7 +56,7 @@ emailAddress = optional | |||
| 56 | 56 | ||
| 57 | #################################################################### | 57 | #################################################################### |
| 58 | [ req ] | 58 | [ req ] |
| 59 | default_bits = 512 | 59 | default_bits = 1024 |
| 60 | default_keyfile = testkey.pem | 60 | default_keyfile = testkey.pem |
| 61 | distinguished_name = req_distinguished_name | 61 | distinguished_name = req_distinguished_name |
| 62 | encrypt_rsa_key = no | 62 | encrypt_rsa_key = no |
