path: root/src/lib/libcrypto/sha
Diffstat (limited to 'src/lib/libcrypto/sha')
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-armv4-large.pl   234
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-ppc.pl           319
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-s390x.pl         226
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-sparcv9.pl       283
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl      600
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-thumb.pl         259
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-x86_64.pl        125
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-586.pl         251
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-armv4.pl       181
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-586.pl         644
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-armv4.pl       399
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-ppc.pl         462
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-s390x.pl       301
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-sparcv9.pl     593
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-x86_64.pl      140
-rw-r--r--  src/lib/libcrypto/sha/sha256.c                   32
-rw-r--r--  src/lib/libcrypto/sha/sha512.c                  184
17 files changed, 5145 insertions, 88 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
new file mode 100644
index 0000000000..88861af641
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
@@ -0,0 +1,234 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block procedure for ARMv4.
#
# January 2007.

# Size/performance trade-off
# ====================================================================
# impl		size in bytes	comp cycles[*]	measured performance
# ====================================================================
# thumb		304		3212		4420
# armv4-small	392/+29%	1958/+64%	2250/+96%
# armv4-compact	740/+89%	1552/+26%	1840/+22%
# armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
# full unroll	~5100/+260%	~1260/+4%	~1300/+5%
# ====================================================================
# thumb		= same as 'small', but in Thumb instructions[**] and
#		  with recurring code in two private functions;
# small		= detached Xload/update, loops are folded;
# compact	= detached Xload/update, 5x unroll;
# large		= interleaved Xload/update, 5x unroll;
# full unroll	= interleaved Xload/update, full unroll, estimated[!];
#
# [*]	Manually counted instructions in "grand" loop body. Measured
#	performance is affected by prologue and epilogue overhead,
#	i-cache availability, branch penalties, etc.
# [**]	While each Thumb instruction is half the size, the Thumb ISA
#	is not as diverse as the ARM one: e.g., there are only two
#	arithmetic instructions with 3 arguments, no [fixed] rotate,
#	and addressing modes are limited. As a result it takes more
#	instructions to do the same job in Thumb, so the code is never
#	half as large and is always slower.
# [***]	which is also ~35% better than compiler-generated code.

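# For reference, each SHA-1 round computes (standard FIPS 180-1
# notation, not specific to this file):
#
#	E += ROL(A,5) + F(B,C,D) + K + X[i];	B = ROL(B,30)
#
# Instead of rotating B explicitly, the code below keeps the affected
# working variables pre-rotated left by 2 bits (see the "mov ...,ror#30"
# trio at .Lloop and the ",ror#2" operands throughout), so ROL(B,30)
# is folded into the barrel-shifter operand of other instructions; the
# state is un-rotated again when it is added back to the context at
# .L_done.
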
$output=shift;
open STDOUT,">$output";

$ctx="r0";
$inp="r1";
$len="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
$Xi="r14";
@V=($a,$b,$c,$d,$e);

# One can optimize this for aligned access on big-endian architecture,
# but code's endian neutrality makes it too pretty:-)
sub Xload {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
	ldrb	$t0,[$inp],#4
	ldrb	$t1,[$inp,#-3]
	ldrb	$t2,[$inp,#-2]
	ldrb	$t3,[$inp,#-1]
	add	$e,$K,$e,ror#2		@ E+=K_00_19
	orr	$t0,$t1,$t0,lsl#8
	add	$e,$e,$a,ror#27		@ E+=ROR(A,27)
	orr	$t0,$t2,$t0,lsl#8
	eor	$t1,$c,$d		@ F_xx_xx
	orr	$t0,$t3,$t0,lsl#8
	add	$e,$e,$t0		@ E+=X[i]
	str	$t0,[$Xi,#-4]!
___
}
sub Xupdate {
my ($a,$b,$c,$d,$e,$flag)=@_;
$code.=<<___;
	ldr	$t0,[$Xi,#15*4]
	ldr	$t1,[$Xi,#13*4]
	ldr	$t2,[$Xi,#7*4]
	ldr	$t3,[$Xi,#2*4]
	add	$e,$K,$e,ror#2		@ E+=K_xx_xx
	eor	$t0,$t0,$t1
	eor	$t0,$t0,$t2
	eor	$t0,$t0,$t3
	add	$e,$e,$a,ror#27		@ E+=ROR(A,27)
___
$code.=<<___ if (!defined($flag));
	eor	$t1,$c,$d		@ F_xx_xx, but not in 40_59
___
$code.=<<___;
	mov	$t0,$t0,ror#31
	add	$e,$e,$t0		@ E+=X[i]
	str	$t0,[$Xi,#-4]!
___
}

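# Xupdate above implements the standard SHA-1 message schedule,
#
#	X[i] = ROL(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16], 1)
#
# with ROL-by-1 written as "ror#31". X[] is kept on the stack, one
# word pushed per round via the pre-decremented pointer $Xi (80 words
# per block, released again at .L_done), so the four ldr offsets above
# pick up X[i-16], X[i-14], X[i-8] and X[i-3] respectively.
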
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
	&Xload(@_);
$code.=<<___;
	and	$t1,$b,$t1,ror#2
	eor	$t1,$t1,$d,ror#2	@ F_00_19(B,C,D)
	add	$e,$e,$t1		@ E+=F_00_19(B,C,D)
___
}

sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_);
$code.=<<___;
	and	$t1,$b,$t1,ror#2
	eor	$t1,$t1,$d,ror#2	@ F_00_19(B,C,D)
	add	$e,$e,$t1		@ E+=F_00_19(B,C,D)
___
}

sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_);
$code.=<<___;
	eor	$t1,$b,$t1,ror#2	@ F_20_39(B,C,D)
	add	$e,$e,$t1		@ E+=F_20_39(B,C,D)
___
}

sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,1);
$code.=<<___;
	and	$t1,$b,$c,ror#2
	orr	$t2,$b,$c,ror#2
	and	$t2,$t2,$d,ror#2
	orr	$t1,$t1,$t2		@ F_40_59(B,C,D)
	add	$e,$e,$t1		@ E+=F_40_59(B,C,D)
___
}

$code=<<___;
.text

.global	sha1_block_data_order
.type	sha1_block_data_order,%function

.align	2
sha1_block_data_order:
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	ldmia	$ctx,{$a,$b,$c,$d,$e}
.Lloop:
	ldr	$K,.LK_00_19
	mov	$Xi,sp
	sub	sp,sp,#15*4
	mov	$c,$c,ror#30
	mov	$d,$d,ror#30
	mov	$e,$e,ror#30		@ [6]
.L_00_15:
___
for($i=0;$i<5;$i++) {
	&BODY_00_15(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
___
	&BODY_00_15(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
$code.=<<___;

	ldr	$K,.LK_20_39		@ [+15+16*4]
	sub	sp,sp,#25*4
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
for($i=0;$i<5;$i++) {
	&BODY_20_39(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp			@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	$K,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
___
for($i=0;$i<5;$i++) {
	&BODY_40_59(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	ldr	$K,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	$ctx,{$K,$t0,$t1,$t2,$t3}
	add	$a,$K,$a
	add	$b,$t0,$b
	add	$c,$t1,$c,ror#2
	add	$d,$t2,$d,ror#2
	add	$e,$t3,$e,ror#2
	stmia	$ctx,{$a,$b,$c,$d,$e}
	teq	$inp,$len
	bne	.Lloop			@ [+18], total 1307

	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
.size	sha1_block_data_order,.-sha1_block_data_order
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
new file mode 100755
index 0000000000..dcd0fcdfcf
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
@@ -0,0 +1,319 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise it is a straightforward
# implementation with the X vector in the register bank. The module is
# big-endian [which is no big deal, as there are no little-endian
# targets left around].
#
# (*) this means that this module is inappropriate for PPC403? Does
#     anybody know if pre-POWER3 can sustain unaligned load?

# 			-m64	-m32
# ----------------------------------
# PPC970,gcc-4.0.0	+76%	+59%
# Power6,xlc-7		+68%	+33%

$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";

$FRAME=24*$SIZE_T;

$K  ="r0";
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$t0 ="r15";
$t1 ="r6";

$A  ="r7";
$B  ="r8";
$C  ="r9";
$D  ="r10";
$E  ="r11";
$T  ="r12";

@V=($A,$B,$C,$D,$E,$T);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
    "r24","r25","r26","r27","r28","r29","r30","r31");

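# For reference, the three body subroutines below implement the
# standard SHA-1 round functions (FIPS 180-1):
#
#	F_00_19(b,c,d) = (b & c) | (~b & d)		-> and/andc/or
#	F_20_39(b,c,d) = b ^ c ^ d			-> xor/xor
#	F_40_59(b,c,d) = (b & c) | ((b | c) & d)	-> and/or/and/or
#
# each interleaved with the xor/rotlwi chain that schedules the next
# message word X[j].
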
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	lwz	@X[$i],`$i*4`($inp)
___
$code.=<<___ if ($i<15);
	lwz	@X[$j],`$j*4`($inp)
	add	$f,$K,$e
	rotlwi	$e,$a,5
	add	$f,$f,@X[$i]
	and	$t0,$c,$b
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	add	$f,$f,$t0
___
$code.=<<___ if ($i>=15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$c,$b
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	rotlwi	$b,$b,30
	xor	$t0,$t0,$d
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	lwz	r16,0($ctx)
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	lwz	r17,4($ctx)
	add	$f,$f,$e
	rotlwi	$b,$b,30
	lwz	r18,8($ctx)
	xor	$t0,$t0,$d
	lwz	r19,12($ctx)
	add	$f,$f,$t0
	lwz	r20,16($ctx)
___
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	or	$t1,$b,$c
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	and	$t1,$t1,$d
	or	$t0,$t0,$t1
	rotlwi	@X[$j%16],@X[$j%16],1
	add	$f,$f,$t0
___
}

$code=<<___;
.machine	"any"
.text

.globl	.sha1_block_data_order
.align	4
.sha1_block_data_order:
	mflr	r0
	$STU	$sp,`-($FRAME+64)`($sp)
	$PUSH	r0,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
Ldone:
	$POP	r0,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,`$FRAME+64`
	blr
___

# The PowerPC specification allows an implementation to be ill-behaved
# upon an unaligned access which crosses a page boundary. The "better
# safe than sorry" principle makes me treat such input specially. But I
# don't look for the particular offending word; rather, I look for the
# 64-byte input block which crosses the boundary. Once found, that
# block is copied to an aligned location and hashed separately...
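# In C-like pseudocode, the dispatch below amounts to (a sketch for
# orientation only; "hash" stands for the Lsha1_block_private call):
#
#	blocks = ((4096 - (inp & 4095)) & 4095) >> 6;
#	if (blocks != 0) {
#		if (num <= blocks) goto aligned;	/* no crossing */
#		hash(inp, blocks); num -= blocks;	/* up to boundary */
#	}
#	memcpy(frame, inp, 64); hash(frame, 1);		/* straddler */
#	if (--num) goto unaligned; else goto done;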
$code.=<<___;
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subfc	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$FRAME	; spot below the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*19`($sp)
	li	$t1,1
	addi	$inp,$sp,$FRAME
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*19`($sp)
	addic.	$num,$num,-1
	bne-	Lunaligned
	b	Ldone
___

# This is the private block function, which uses a tailored calling
# convention: upon entry the SHA_CTX is pre-loaded into the given
# registers and the counter register contains the number of chunks to
# digest...
$code.=<<___;
.align	4
Lsha1_block_private:
___
$code.=<<___;	# load K_00_19
	lis	$K,0x5a82
	ori	$K,$K,0x7999
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_20_39
	lis	$K,0x6ed9
	ori	$K,$K,0xeba1
___
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_40_59
	lis	$K,0x8f1b
	ori	$K,$K,0xbcdc
___
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_60_79
	lis	$K,0xca62
	ori	$K,$K,0xc1d6
___
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	r16,r16,$E
	add	r17,r17,$T
	add	r18,r18,$A
	add	r19,r19,$B
	add	r20,r20,$C
	stw	r16,0($ctx)
	mr	$A,r16
	stw	r17,4($ctx)
	mr	$B,r17
	stw	r18,8($ctx)
	mr	$C,r18
	stw	r19,12($ctx)
	mr	$D,r19
	stw	r20,16($ctx)
	mr	$E,r20
	addi	$inp,$inp,`16*4`
	bdnz-	Lsha1_block_private
	blr
___
$code.=<<___;
.asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-s390x.pl b/src/lib/libcrypto/sha/asm/sha1-s390x.pl
new file mode 100644
index 0000000000..4b17848287
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-s390x.pl
@@ -0,0 +1,226 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA1 block procedure for s390x.

# April 2007.
#
# Performance is >30% better than gcc 3.3 generated code. But the real
# twist is that SHA1 hardware support is detected and utilized. In
# that case performance can reach a further >4.5x for larger chunks.

# January 2009.
#
# Optimize Xupdate for the number of memory references and reschedule
# instructions to favour the dual-issue z10 pipeline. On z10, hardware
# is "only" ~2.3x faster than software.

$kimdfunc=1;	# magic function code for kimd instruction

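# [kimd is the z/Architecture "compute intermediate message digest"
# instruction. Function code 0 is the query function, whose parameter
# block reports which functions are installed; function code 1 selects
# SHA-1. The generated code below queries for SHA-1 support first and
# falls back to the software path if it is absent.]
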
$output=shift;
open STDOUT,">$output";

$K_00_39="%r0"; $K=$K_00_39;
$K_40_79="%r1";
$ctx="%r2";	$prefetch="%r2";
$inp="%r3";
$len="%r4";

$A="%r5";
$B="%r6";
$C="%r7";
$D="%r8";
$E="%r9";	@V=($A,$B,$C,$D,$E);
$t0="%r10";
$t1="%r11";
@X=("%r12","%r13","%r14");
$sp="%r15";

$frame=160+16*4;

sub Xupdate {
my $i=shift;

$code.=<<___ if ($i==15);
	lg	$prefetch,160($sp)	### Xupdate(16) warm-up
	lr	$X[0],$X[2]
___
return if ($i&1);	# Xupdate is vectorized and executed every 2nd round
$code.=<<___ if ($i<16);
	lg	$X[0],`$i*4`($inp)	### Xload($i)
	rllg	$X[1],$X[0],32
___
$code.=<<___ if ($i>=16);
	xgr	$X[0],$prefetch		### Xupdate($i)
	lg	$prefetch,`160+4*(($i+2)%16)`($sp)
	xg	$X[0],`160+4*(($i+8)%16)`($sp)
	xgr	$X[0],$prefetch
	rll	$X[0],$X[0],1
	rllg	$X[1],$X[0],32
	rll	$X[1],$X[1],1
	rllg	$X[0],$X[1],32
	lr	$X[2],$X[1]		# feedback
___
$code.=<<___ if ($i<=70);
	stg	$X[0],`160+4*($i%16)`($sp)
___
unshift(@X,pop(@X));
}

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];

	&Xupdate($i);
$code.=<<___;
	alr	$e,$K		### $i
	rll	$t1,$a,5
	lr	$t0,$d
	xr	$t0,$c
	alr	$e,$t1
	nr	$t0,$b
	alr	$e,$xi
	xr	$t0,$d
	rll	$b,$b,30
	alr	$e,$t0
___
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];

	&Xupdate($i);
$code.=<<___;
	alr	$e,$K		### $i
	rll	$t1,$a,5
	lr	$t0,$b
	alr	$e,$t1
	xr	$t0,$c
	alr	$e,$xi
	xr	$t0,$d
	rll	$b,$b,30
	alr	$e,$t0
___
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];

	&Xupdate($i);
$code.=<<___;
	alr	$e,$K		### $i
	rll	$t1,$a,5
	lr	$t0,$b
	alr	$e,$t1
	or	$t0,$c
	lr	$t1,$b
	nr	$t0,$d
	nr	$t1,$c
	alr	$e,$xi
	or	$t0,$t1
	rll	$b,$b,30
	alr	$e,$t0
___
}

$code.=<<___;
.text
.align	64
.type	Ktable,\@object
Ktable: .long	0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
	.skip	48	#.long	0,0,0,0,0,0,0,0,0,0,0,0
.size	Ktable,.-Ktable
.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function
sha1_block_data_order:
___
$code.=<<___ if ($kimdfunc);
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,0(%r1)
	tmhl	%r0,0x4000	# check for message-security assist
	jz	.Lsoftware
	lghi	%r0,0
	la	%r1,16($sp)
	.long	0xb93e0002	# kimd %r0,%r2
	lg	%r0,16($sp)
	tmhh	%r0,`0x8000>>$kimdfunc`
	jz	.Lsoftware
	lghi	%r0,$kimdfunc
	lgr	%r1,$ctx
	lgr	%r2,$inp
	sllg	%r3,$len,6
	.long	0xb93e0002	# kimd %r0,%r2
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	16
.Lsoftware:
___
$code.=<<___;
	lghi	%r1,-$frame
	stg	$ctx,16($sp)
	stmg	%r6,%r15,48($sp)
	lgr	%r0,$sp
	la	$sp,0(%r1,$sp)
	stg	%r0,0($sp)

	larl	$t0,Ktable
	llgf	$A,0($ctx)
	llgf	$B,4($ctx)
	llgf	$C,8($ctx)
	llgf	$D,12($ctx)
	llgf	$E,16($ctx)

	lg	$K_00_39,0($t0)
	lg	$K_40_79,8($t0)

.Lloop:
	rllg	$K_00_39,$K_00_39,32
___
for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	rllg	$K_00_39,$K_00_39,32
___
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; $K=$K_40_79;
	rllg	$K_40_79,$K_40_79,32
___
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	rllg	$K_40_79,$K_40_79,32
___
for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;

	lg	$ctx,`$frame+16`($sp)
	la	$inp,64($inp)
	al	$A,0($ctx)
	al	$B,4($ctx)
	al	$C,8($ctx)
	al	$D,12($ctx)
	al	$E,16($ctx)
	st	$A,0($ctx)
	st	$B,4($ctx)
	st	$C,8($ctx)
	st	$D,12($ctx)
	st	$E,16($ctx)
	brct	$len,.Lloop

	lmg	%r6,%r15,`$frame+48`($sp)
	br	%r14
.size	sha1_block_data_order,.-sha1_block_data_order
.string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm	OPENSSL_s390xcap_P,8,8
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
new file mode 100644
index 0000000000..8306fc88cc
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
@@ -0,0 +1,283 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Performance improvement is not really impressive on pre-T1 CPUs: +8%
# over Sun C and +25% over gcc [3.3]. On T1, a.k.a. Niagara, however,
# it turned out to be 40% faster than the 64-bit code generated by Sun
# C 5.8 and >2x faster than the 64-bit code generated by gcc 3.4. And
# there is a gimmick: the X[16] vector is packed into 8 64-bit
# registers, so as a result nothing is spilled on the stack. In
# addition, input data is loaded in a compact instruction sequence,
# thus minimizing the window in which the code is subject to an
# [inter-thread] cache-thrashing hazard. The goal is to ensure
# scalability on UltraSPARC T1, or rather to avoid decay when the
# number of active threads exceeds the number of physical cores.

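# [The packing works because SHA-1 operates on 32-bit words: each
# 64-bit register of @X holds a pair of adjacent X[] words. The $rot1m
# mask set up below (0x0000000100000001) lets both halves be rotated
# left by 1 in parallel: "add X,X,X" shifts both lanes (leaking one
# bit across the lane boundary), "srlx X,31" recovers the two bits
# that must wrap, and the and/andn pair confines each wrapped bit to
# its own 32-bit lane before the final or.]
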
$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

$output=shift;
open STDOUT,">$output";

@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$rot1m="%g2";
$tmp64="%g3";
$Xi="%g4";
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);
$K_00_19="%l5";
$K_20_39="%l6";
$K_40_59="%l7";
$K_60_79="%g5";
@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);

$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?@X[($i/2)%8]:$Xi;

$code.=<<___;
	sll	$a,5,$tmp0	!! $i
	add	@K[$i/20],$e,$e
	srl	$a,27,$tmp1
	add	$tmp0,$e,$e
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$xi,$e,$e
___
if ($i&1 && $i<15) {
	$code.=
	"	srlx	@X[(($i+1)/2)%8],32,$Xi\n";
}
$code.=<<___;
	add	$tmp1,$e,$e
___
}

sub Xupdate {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i/2;

if ($i&1) {
$code.=<<___;
	sll	$a,5,$tmp0	!! $i
	add	@K[$i/20],$e,$e
	srl	$a,27,$tmp1
___
} else {
$code.=<<___;
	sllx	@X[($j+6)%8],32,$Xi	! Xupdate($i)
	xor	@X[($j+1)%8],@X[$j%8],@X[$j%8]
	srlx	@X[($j+7)%8],32,$tmp1
	xor	@X[($j+4)%8],@X[$j%8],@X[$j%8]
	sll	$a,5,$tmp0	!! $i
	or	$tmp1,$Xi,$Xi
	add	@K[$i/20],$e,$e	!!
	xor	$Xi,@X[$j%8],@X[$j%8]
	srlx	@X[$j%8],31,$Xi
	add	@X[$j%8],@X[$j%8],@X[$j%8]
	and	$Xi,$rot1m,$Xi
	andn	@X[$j%8],$rot1m,@X[$j%8]
	srl	$a,27,$tmp1	!!
	or	$Xi,@X[$j%8],@X[$j%8]
___
}
}

sub BODY_16_19 {
my ($i,$a,$b,$c,$d,$e)=@_;

	&Xupdate(@_);
	if ($i&1) {
		$xi=@X[($i/2)%8];
	} else {
		$xi=$Xi;
		$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
	}
$code.=<<___;
	add	$tmp0,$e,$e	!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	add	$xi,$e,$e
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
	&Xupdate(@_);
	if ($i&1) {
		$xi=@X[($i/2)%8];
	} else {
		$xi=$Xi;
		$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
	}
$code.=<<___;
	add	$tmp0,$e,$e	!!
	xor	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	xor	$d,$tmp0,$tmp1
	srl	$b,2,$b
	add	$tmp1,$e,$e
	or	$tmp2,$b,$b
	add	$xi,$e,$e
___
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
	&Xupdate(@_);
	if ($i&1) {
		$xi=@X[($i/2)%8];
	} else {
		$xi=$Xi;
		$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
	}
$code.=<<___;
	add	$tmp0,$e,$e	!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	or	$c,$b,$tmp1
	srl	$b,2,$b
	and	$d,$tmp1,$tmp1
	add	$xi,$e,$e
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
}

$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	32
.globl	sha1_block_data_order
sha1_block_data_order:
	save	%sp,-$frame,%sp
	sllx	$len,6,$len
	add	$inp,$len,$len

	or	%g0,1,$rot1m
	sllx	$rot1m,32,$rot1m
	or	$rot1m,1,$rot1m

	ld	[$ctx+0],$A
	ld	[$ctx+4],$B
	ld	[$ctx+8],$C
	ld	[$ctx+12],$D
	ld	[$ctx+16],$E
	andn	$inp,7,$tmp0

	sethi	%hi(0x5a827999),$K_00_19
	or	$K_00_19,%lo(0x5a827999),$K_00_19
	sethi	%hi(0x6ed9eba1),$K_20_39
	or	$K_20_39,%lo(0x6ed9eba1),$K_20_39
	sethi	%hi(0x8f1bbcdc),$K_40_59
	or	$K_40_59,%lo(0x8f1bbcdc),$K_40_59
	sethi	%hi(0xca62c1d6),$K_60_79
	or	$K_60_79,%lo(0xca62c1d6),$K_60_79

.Lloop:
	ldx	[$tmp0+0],@X[0]
	ldx	[$tmp0+16],@X[2]
	ldx	[$tmp0+32],@X[4]
	ldx	[$tmp0+48],@X[6]
	and	$inp,7,$tmp1
	ldx	[$tmp0+8],@X[1]
	sll	$tmp1,3,$tmp1
	ldx	[$tmp0+24],@X[3]
	subcc	%g0,$tmp1,$tmp2	! should be 64-$tmp1, but -$tmp1 works too
	ldx	[$tmp0+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$tmp0+56],@X[7]

	sllx	@X[0],$tmp1,@X[0]
	ldx	[$tmp0+64],$tmp64
___
for($i=0;$i<7;$i++)
{   $code.=<<___;
	srlx	@X[$i+1],$tmp2,$Xi
	sllx	@X[$i+1],$tmp1,@X[$i+1]
	or	$Xi,@X[$i],@X[$i]
___
}
$code.=<<___;
	srlx	$tmp64,$tmp2,$tmp64
	or	$tmp64,@X[7],@X[7]
.Laligned:
	srlx	@X[0],32,$Xi
___
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++)	{ &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;

	ld	[$ctx+0],@X[0]
	ld	[$ctx+4],@X[1]
	ld	[$ctx+8],@X[2]
	ld	[$ctx+12],@X[3]
	add	$inp,64,$inp
	ld	[$ctx+16],@X[4]
	cmp	$inp,$len

	add	$A,@X[0],$A
	st	$A,[$ctx+0]
	add	$B,@X[1],$B
	st	$B,[$ctx+4]
	add	$C,@X[2],$C
	st	$C,[$ctx+8]
	add	$D,@X[3],$D
	st	$D,[$ctx+12]
	add	$E,@X[4],$E
	st	$E,[$ctx+16]

	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	andn	$inp,7,$tmp0

	ret
	restore
.type	sha1_block_data_order,#function
.size	sha1_block_data_order,(.-sha1_block_data_order)
.asciz	"SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
new file mode 100644
index 0000000000..15eb854bad
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
@@ -0,0 +1,600 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2009
#
# Provided that UltraSPARC VIS instructions are pipe-lined(*) and
# pairable(*) with IALU ones, offloading Xupdate to the UltraSPARC
# Graphics Unit makes it possible to achieve higher instruction-level
# parallelism, ILP, and thus higher performance. It should be
# explicitly noted that ILP is the key word here, which means this
# code is unsuitable for cores like UltraSPARC-Tx. The idea is not
# really novel; Sun has had a VIS-powered implementation for a while.
# Unlike Sun's implementation, this one can process multiple unaligned
# input blocks, and as such works as a drop-in replacement for OpenSSL
# sha1_block_data_order. The performance improvement was measured to
# be 40% over the pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12%
# on UltraSPARC-III. See below for discussion...
#
# The module is of no direct interest to OpenSSL, because it doesn't
# provide better performance on contemporary SPARCv9 CPUs,
# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
# absolutely must score on UltraSPARC-I-IV can simply replace
# crypto/sha/asm/sha1-sparcv9.pl with this module.
#
# (*)	"Pipe-lined" means that even if it takes several cycles to
#	complete, the next instruction using the same functional unit
#	[but not depending on the result of the current instruction]
#	can start execution without having to wait for the unit.
#	"Pairable" means that two [or more] independent instructions
#	can be issued at the very same time.

$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

$output=shift;
open STDOUT,">$output";

$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";
$tmp3="%g5";

$base="%g1";
$align="%g4";
$Xfer="%o5";
$nXfer=$tmp3;
$Xi="%o7";

$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);

$Actx="%o0";
$Bctx="%o1";
$Cctx="%o2";
$Dctx="%o3";
$Ectx="%o4";

$fmul="%f32";
$VK_00_19="%f34";
$VK_20_39="%f36";
$VK_40_59="%f38";
$VK_60_79="%f40";
@VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
@X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
    "%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");

# This is the reference 2x-parallelized VIS-powered Xupdate procedure.
# It covers even the K_NN_MM addition...
sub Xupdate {
my ($i)=@_;
my $K=@VK[($i+16)/20];
my $j=($i+16)%16;

# [ provided that GSR.alignaddr_offset is 5, $fmul contains the
# 0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded to the
# chosen registers... ]
$code.=<<___;
	fxors		@X[($j+13)%16],@X[$j],@X[$j]	!-1/-1/-1:X[0]^=X[13]
	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
	fxor		%f18,@X[$j],@X[$j]		! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
	faligndata	@X[$j],@X[$j],%f18		! 3/ 7/ 5:Tmp=X[0,1]>>>24
	fpadd32		@X[$j],@X[$j],@X[$j]		! 4/ 8/ 6:X[0,1]<<=1
	fmul8ulx16	%f18,$fmul,%f18			! 5/10/ 7:Tmp>>=7, Tmp&=1
	![fxors		%f15,%f2,%f2]
	for		%f18,@X[$j],@X[$j]		! 8/14/10:X[0,1]|=Tmp
	![fxors		%f0,%f3,%f3]			!10/17/12:X[0] dependency
	fpadd32		$K,@X[$j],%f20
	std		%f20,[$Xfer+`4*$j`]
___
# The numbers delimited with slashes are the earliest possible dispatch
# cycles for a given instruction, assuming 1-cycle latency for simple
# VIS instructions (such as on UltraSPARC-I&II), 3-cycle latency (such
# as on UltraSPARC-III&IV) and 2-cycle latency(*), respectively. Being
# 2x-parallelized, the procedure is "worth" 5, 8.5 or 6 ticks per SHA1
# round. As [long as] FPU/VIS instructions are perfectly pairable with
# IALU ones, the round timing is defined by the maximum of the VIS and
# IALU timings. The latter varies from round to round and averages out
# at 6.25 ticks. This means that USI&II should operate at the IALU
# rate, while USIII&IV at the VIS rate - which explains why the
# performance improvement varies among processors, given that the pure
# IALU sha1-sparcv9.pl module exhibits virtually uniform performance
# of ~9.3 cycles per SHA1 round. The timings mentioned above are
# theoretical lower limits. Real-life performance was measured to be
# 6.6 cycles per SHA1 round on USIIi and 8.3 on USIII. The latter is
# lower than the half-round VIS timing, because there are 16
# Xupdate-free rounds, which "push down" the average theoretical
# timing to 8 cycles...

# (*)	SPARC64-V[II] was originally believed to have 2-cycle VIS
#	latency. Well, it might have, but it doesn't have a dedicated
#	VIS unit. Instead, VIS instructions are executed by other
#	functional units - the ones used here, by the IALU. This
#	doesn't improve effective ILP...
}
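# [Net effect of the VIS sequence above, per 32-bit lane: Tmp =
# X >>> 24 (faligndata with alignaddr_offset 5), X <<= 1 (fpadd32),
# Tmp = (Tmp >> 7) & 1 (fmul8ulx16 with the 0x100 multiplier), X |=
# Tmp - i.e. X = (X << 1) | (X >>> 31), the SHA-1 rotate-by-one,
# computed on two message words at once; a second fpadd32 then adds
# the round constant before the std to the Xfer area.]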

# The reference Xupdate procedure is then "strained" over *pairs* of
# BODY_NN_MM and kind of modulo-scheduled with respect to X[n]^=X[n+13]
# and the K_NN_MM addition. It "runs" 15 rounds ahead, which leaves
# plenty of room to amortize the read-after-write hazard, as well as
# to fetch and align input for the next spin. The VIS instructions are
# scheduled for a latency of 2 cycles, because there are not enough
# IALU instructions to schedule for a latency of 3, while scheduling
# for 1 would give no gain on USI&II anyway.

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16;	# ahead reference
my $l=($j+16-2)%16;	# behind reference
my $K=@VK[($j+16-2)/20];

$j=($j+16)%16;

$code.=<<___ if (!($i&1));
	sll		$a,5,$tmp0			!! $i
	and		$c,$b,$tmp3
	ld		[$Xfer+`4*($i%16)`],$Xi
	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
	sll		$b,30,$tmp2
	add		$tmp1,$e,$e
	andn		$d,$b,$tmp1
	add		$Xi,$e,$e
	fxor		%f18,@X[$j],@X[$j]		! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
	srl		$b,2,$b
	or		$tmp1,$tmp3,$tmp1
	or		$tmp2,$b,$b
	add		$tmp1,$e,$e
	faligndata	@X[$j],@X[$j],%f18		! 3/ 7/ 5:Tmp=X[0,1]>>>24
___
$code.=<<___ if ($i&1);
	sll		$a,5,$tmp0			!! $i
	and		$c,$b,$tmp3
	ld		[$Xfer+`4*($i%16)`],$Xi
	fpadd32		@X[$j],@X[$j],@X[$j]		! 4/ 8/ 6:X[0,1]<<=1
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fmul8ulx16	%f18,$fmul,%f18			! 5/10/ 7:Tmp>>=7, Tmp&=1
	sll		$b,30,$tmp2
	add		$tmp1,$e,$e
	fpadd32		$K,@X[$l],%f20			!
	andn		$d,$b,$tmp1
	add		$Xi,$e,$e
	fxors		@X[($k+13)%16],@X[$k],@X[$k]	!-1/-1/-1:X[0]^=X[13]
	srl		$b,2,$b
	or		$tmp1,$tmp3,$tmp1
	fxor		%f18,@X[$j],@X[$j]		! 8/14/10:X[0,1]|=Tmp
	or		$tmp2,$b,$b
	add		$tmp1,$e,$e
___
$code.=<<___ if ($i&1 && $i>=2);
	std		%f20,[$Xfer+`4*$l`]		!
___
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16;	# ahead reference
my $l=($j+16-2)%16;	# behind reference
my $K=@VK[($j+16-2)/20];

$j=($j+16)%16;

$code.=<<___ if (!($i&1) && $i<64);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	fxor		%f18,@X[$j],@X[$j]		! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
	faligndata	@X[$j],@X[$j],%f18		! 3/ 7/ 5:Tmp=X[0,1]>>>24
___
$code.=<<___ if ($i&1 && $i<64);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	fpadd32		@X[$j],@X[$j],@X[$j]		! 4/ 8/ 6:X[0,1]<<=1
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fmul8ulx16	%f18,$fmul,%f18			! 5/10/ 7:Tmp>>=7, Tmp&=1
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	fpadd32		$K,@X[$l],%f20			!
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	fxors		@X[($k+13)%16],@X[$k],@X[$k]	!-1/-1/-1:X[0]^=X[13]
	srl		$b,2,$b
	add		$tmp1,$e,$e
	fxor		%f18,@X[$j],@X[$j]		! 8/14/10:X[0,1]|=Tmp
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
	std		%f20,[$Xfer+`4*$l`]		!
___
$code.=<<___ if ($i==64);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	fpadd32		$K,@X[$l],%f20
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	std		%f20,[$Xfer+`4*$l`]
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
___
$code.=<<___ if ($i>64);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
___
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16;	# ahead reference
my $l=($j+16-2)%16;	# behind reference
my $K=@VK[($j+16-2)/20];

$j=($j+16)%16;

$code.=<<___ if (!($i&1));
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
	and		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	or		$c,$b,$tmp1
	fxor		%f18,@X[$j],@X[$j]		! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
	srl		$b,2,$b
	and		$d,$tmp1,$tmp1
	add		$Xi,$e,$e
	or		$tmp1,$tmp0,$tmp1
	faligndata	@X[$j],@X[$j],%f18		! 3/ 7/ 5:Tmp=X[0,1]>>>24
	or		$tmp2,$b,$b
	add		$tmp1,$e,$e
	fpadd32		@X[$j],@X[$j],@X[$j]		! 4/ 8/ 6:X[0,1]<<=1
___
$code.=<<___ if ($i&1);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fmul8ulx16	%f18,$fmul,%f18			! 5/10/ 7:Tmp>>=7, Tmp&=1
	and		$c,$b,$tmp0
	add		$tmp1,$e,$e
	fpadd32		$K,@X[$l],%f20			!
	sll		$b,30,$tmp2
	or		$c,$b,$tmp1
	fxors		@X[($k+13)%16],@X[$k],@X[$k]	!-1/-1/-1:X[0]^=X[13]
	srl		$b,2,$b
	and		$d,$tmp1,$tmp1
	fxor		%f18,@X[$j],@X[$j]		! 8/14/10:X[0,1]|=Tmp
	add		$Xi,$e,$e
	or		$tmp1,$tmp0,$tmp1
	or		$tmp2,$b,$b
	add		$tmp1,$e,$e
	std		%f20,[$Xfer+`4*$l`]		!
___
}

# If there is more data to process, the data for the next iteration
# is pre-fetched in the last ten rounds...
sub BODY_70_79 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $m=($i%8)*2;

$j=($j+16)%16;

$code.=<<___ if ($i==70);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	ldd		[$inp+64],@X[0]
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e

	and		$inp,-64,$nXfer
	inc		64,$inp
	and		$nXfer,255,$nXfer
	alignaddr	%g0,$align,%g0
	add		$base,$nXfer,$nXfer
___
$code.=<<___ if ($i==71);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
___
$code.=<<___ if ($i>=72);
	faligndata	@X[$m],@X[$m+2],@X[$m]
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	fpadd32		$VK_00_19,@X[$m],%f20
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
___
$code.=<<___ if ($i<77);
	ldd		[$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
___
$code.=<<___ if ($i==77);	# redundant if $inp was aligned
	add		$align,63,$tmp0
	and		$tmp0,-8,$tmp0
	ldd		[$inp+$tmp0],@X[16]
___
$code.=<<___ if ($i>=72);
	std		%f20,[$nXfer+`4*$m`]
___
}

$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
vis_const:
.long	0x5a827999,0x5a827999	! K_00_19
.long	0x6ed9eba1,0x6ed9eba1	! K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc	! K_40_59
.long	0xca62c1d6,0xca62c1d6	! K_60_79
.long	0x00000100,0x00000100
.align	64
.type	vis_const,#object
.size	vis_const,(.-vis_const)

.globl	sha1_block_data_order
sha1_block_data_order:
	save	%sp,-$frame,%sp
	add	%fp,$bias-256,$base

1:	call	.+8
	add	%o7,vis_const-1b,$tmp0

	ldd	[$tmp0+0],$VK_00_19
	ldd	[$tmp0+8],$VK_20_39
	ldd	[$tmp0+16],$VK_40_59
	ldd	[$tmp0+24],$VK_60_79
	ldd	[$tmp0+32],$fmul

	ld	[$ctx+0],$Actx
	and	$base,-256,$base
	ld	[$ctx+4],$Bctx
	sub	$base,$bias+$frame,%sp
	ld	[$ctx+8],$Cctx
	and	$inp,7,$align
	ld	[$ctx+12],$Dctx
	and	$inp,-8,$inp
	ld	[$ctx+16],$Ectx

	! X[16] is maintained in FP register bank
	alignaddr	%g0,$align,%g0
	ldd		[$inp+0],@X[0]
	sub		$inp,-64,$Xfer
	ldd		[$inp+8],@X[2]
	and		$Xfer,-64,$Xfer
	ldd		[$inp+16],@X[4]
	and		$Xfer,255,$Xfer
	ldd		[$inp+24],@X[6]
	add		$base,$Xfer,$Xfer
	ldd		[$inp+32],@X[8]
	ldd		[$inp+40],@X[10]
	ldd		[$inp+48],@X[12]
	brz,pt		$align,.Laligned
	ldd		[$inp+56],@X[14]

	ldd		[$inp+64],@X[16]
	faligndata	@X[0],@X[2],@X[0]
	faligndata	@X[2],@X[4],@X[2]
	faligndata	@X[4],@X[6],@X[4]
	faligndata	@X[6],@X[8],@X[6]
	faligndata	@X[8],@X[10],@X[8]
	faligndata	@X[10],@X[12],@X[10]
	faligndata	@X[12],@X[14],@X[12]
	faligndata	@X[14],@X[16],@X[14]

.Laligned:
	mov		5,$tmp0
	dec		1,$len
	alignaddr	%g0,$tmp0,%g0
	fpadd32		$VK_00_19,@X[0],%f16
	fpadd32		$VK_00_19,@X[2],%f18
	fpadd32		$VK_00_19,@X[4],%f20
	fpadd32		$VK_00_19,@X[6],%f22
	fpadd32		$VK_00_19,@X[8],%f24
	fpadd32		$VK_00_19,@X[10],%f26
	fpadd32		$VK_00_19,@X[12],%f28
	fpadd32		$VK_00_19,@X[14],%f30
	std		%f16,[$Xfer+0]
	mov		$Actx,$A
	std		%f18,[$Xfer+8]
	mov		$Bctx,$B
	std		%f20,[$Xfer+16]
	mov		$Cctx,$C
	std		%f22,[$Xfer+24]
	mov		$Dctx,$D
	std		%f24,[$Xfer+32]
	mov		$Ectx,$E
	std		%f26,[$Xfer+40]
	fxors		@X[13],@X[0],@X[0]
	std		%f28,[$Xfer+48]
	ba		.Loop
	std		%f30,[$Xfer+56]
.align	32
.Loop:
___
for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<70;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	tst		$len
	bz,pn		`$bits==32?"%icc":"%xcc"`,.Ltail
	nop
___
for (;$i<80;$i++)	{ &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add		$A,$Actx,$Actx
	add		$B,$Bctx,$Bctx
	add		$C,$Cctx,$Cctx
	add		$D,$Dctx,$Dctx
	add		$E,$Ectx,$Ectx
	mov		5,$tmp0
	fxors		@X[13],@X[0],@X[0]
	mov		$Actx,$A
	mov		$Bctx,$B
	mov		$Cctx,$C
	mov		$Dctx,$D
	mov		$Ectx,$E
	alignaddr	%g0,$tmp0,%g0
	dec		1,$len
	ba		.Loop
	mov		$nXfer,$Xfer

.align	32
.Ltail:
___
for($i=70;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	$A,$Actx,$Actx
	add	$B,$Bctx,$Bctx
	add	$C,$Cctx,$Cctx
	add	$D,$Dctx,$Dctx
	add	$E,$Ectx,$Ectx

	st	$Actx,[$ctx+0]
	st	$Bctx,[$ctx+4]
	st	$Cctx,[$ctx+8]
	st	$Dctx,[$ctx+12]
	st	$Ectx,[$ctx+16]

	ret
	restore
.type	sha1_block_data_order,#function
.size	sha1_block_data_order,(.-sha1_block_data_order)
.asciz	"SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>"
___

# The purpose of these subroutines is to encode VIS instructions
# explicitly, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g.
# -xarch=v9 vs. -xarch=v9a. The idea is to preserve the option of
# producing a "universal" binary and let the programmer detect at
# run time whether the current CPU is VIS-capable.
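# [Encoding note: the .word values produced below follow the SPARC
# IMPDEP1 format used by VIS - op=2, rd in bits 29..25, op3=0x36,
# rs1 in bits 18..14, the opf code in bits 13..5 and rs2 in bits 4..0,
# hence 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2; unalignaddr uses the
# same format with alignaddr's opf (0x018) pre-folded into 0x81b00300.]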
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf =	(	"fmul8ulx16"	=> 0x037,
			"faligndata"	=> 0x048,
			"fpadd32"	=> 0x052,
			"fxor"		=> 0x06c,
			"fxors"		=> 0x06d	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }
    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	   /gem;
$code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	   /gem;
print $code;
close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-thumb.pl b/src/lib/libcrypto/sha/asm/sha1-thumb.pl
new file mode 100644
index 0000000000..7c9ea9b029
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-thumb.pl
@@ -0,0 +1,259 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block for Thumb.
#
# January 2007.
#
# The code is of no direct interest to OpenSSL because of its low
# performance. Its purpose is to establish a _size_ benchmark - a
# pretty useless one, I must say, because ARMv4 code that is 30% or
# 88 bytes larger [available on demand] is almost _twice_ as fast. It
# should also be noted that in-lining .Lcommon and .Lrotate improves
# performance by over 40%, while the code grows by only 10% or 32
# bytes. But once again, the goal was to establish a _size_ benchmark,
# not performance.

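# [Thumb-1 background for the layout below: most ALU operations are
# 2-address, there is no rotate-by-immediate (only by register), and
# the "high" registers r8-r12 are reachable pretty much only via
# mov/add/cmp. That is why the recurring "E += K + ROR(A,27) + X[i]"
# step and the B/C/D/E shuffle are factored out into the .Lcommon and
# .Lrotate subroutines (inlined when $inline is set), and why ctx, inp
# and len are parked in high registers for the duration.]
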
$output=shift;
open STDOUT,">$output";

$inline=0;
#$cheat_on_binutils=1;

$t0="r0";
$t1="r1";
$t2="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8";	# "upper" registers can be used in add/sub and mov insns
$ctx="r9";
$inp="r10";
$len="r11";
$Xi="r12";

sub common {
<<___;
	sub	$t0,#4
	ldr	$t1,[$t0]
	add	$e,$K			@ E+=K_xx_xx
	lsl	$t2,$a,#5
	add	$t2,$e
	lsr	$e,$a,#27
	add	$t2,$e			@ E+=ROR(A,27)
	add	$t2,$t1			@ E+=X[i]
___
}
sub rotate {
<<___;
	mov	$e,$d			@ E=D
	mov	$d,$c			@ D=C
	lsl	$c,$b,#30
	lsr	$b,$b,#2
	orr	$c,$b			@ C=ROR(B,2)
	mov	$b,$a			@ B=A
	add	$a,$t2,$t1		@ A=E+F_xx_xx(B,C,D)
___
}

sub BODY_00_19 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
	mov	$t1,$c
	eor	$t1,$d
	and	$t1,$b
	eor	$t1,$d			@ F_00_19(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}

sub BODY_20_39 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
	mov	$t1,$b
	eor	$t1,$c
	eor	$t1,$d			@ F_20_39(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}

sub BODY_40_59 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
	mov	$t1,$b
	and	$t1,$c
	mov	$e,$b
	orr	$e,$c
	and	$e,$d
	orr	$t1,$e			@ F_40_59(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}

$code=<<___;
.text
.code	16

.global	sha1_block_data_order
.type	sha1_block_data_order,%function

.align	2
sha1_block_data_order:
___
if ($cheat_on_binutils) {
$code.=<<___;
.code	32
	add	r3,pc,#1
	bx	r3			@ switch to Thumb ISA
.code	16
___
}
$code.=<<___;
	push	{r4-r7}
	mov	r3,r8
	mov	r4,r9
	mov	r5,r10
	mov	r6,r11
	mov	r7,r12
	push	{r3-r7,lr}
	lsl	r2,#6
	mov	$ctx,r0			@ save context
	mov	$inp,r1			@ save inp
	mov	$len,r2			@ save len
	add	$len,$inp		@ $len to point at inp end

.Lloop:
	mov	$Xi,sp
	mov	$t2,sp
	sub	$t2,#16*4		@ [3]
.LXload:
	ldrb	$a,[$t1,#0]		@ $t1 is r1 and holds inp
	ldrb	$b,[$t1,#1]
	ldrb	$c,[$t1,#2]
	ldrb	$d,[$t1,#3]
	lsl	$a,#24
	lsl	$b,#16
	lsl	$c,#8
	orr	$a,$b
	orr	$a,$c
	orr	$a,$d
	add	$t1,#4
	push	{$a}
	cmp	sp,$t2
	bne	.LXload			@ [+14*16]

	mov	$inp,$t1		@ update $inp
	sub	$t2,#32*4
	sub	$t2,#32*4
	mov	$e,#31			@ [+4]
.LXupdate:
	ldr	$a,[sp,#15*4]
	ldr	$b,[sp,#13*4]
	ldr	$c,[sp,#7*4]
	ldr	$d,[sp,#2*4]
	eor	$a,$b
	eor	$a,$c
	eor	$a,$d
	ror	$a,$e
	push	{$a}
	cmp	sp,$t2
	bne	.LXupdate		@ [+(11+1)*64]

	ldmia	$t0!,{$a,$b,$c,$d,$e}	@ $t0 is r0 and holds ctx
	mov	$t0,$Xi

	ldr	$t2,.LK_00_19
	mov	$t1,$t0
	sub	$t1,#20*4
	mov	$Xi,$t1
	mov	$K,$t2			@ [+7+4]
.L_00_19:
___
	&BODY_00_19();
$code.=<<___;
	cmp	$Xi,$t0
	bne	.L_00_19		@ [+(2+9+4+2+8+2)*20]

	ldr	$t2,.LK_20_39
	mov	$t1,$t0
	sub	$t1,#20*4
	mov	$Xi,$t1
	mov	$K,$t2			@ [+5]
.L_20_39_or_60_79:
___
	&BODY_20_39();
$code.=<<___;
	cmp	$Xi,$t0
	bne	.L_20_39_or_60_79	@ [+(2+9+3+2+8+2)*20*2]
	cmp	sp,$t0
	beq	.Ldone			@ [+2]

	ldr	$t2,.LK_40_59
	mov	$t1,$t0
	sub	$t1,#20*4
	mov	$Xi,$t1
	mov	$K,$t2			@ [+5]
.L_40_59:
___
	&BODY_40_59();
$code.=<<___;
	cmp	$Xi,$t0
	bne	.L_40_59		@ [+(2+9+6+2+8+2)*20]

	ldr	$t2,.LK_60_79
	mov	$Xi,sp
	mov	$K,$t2
	b	.L_20_39_or_60_79	@ [+4]
.Ldone:
	mov	$t0,$ctx
	ldr	$t1,[$t0,#0]
	ldr	$t2,[$t0,#4]
	add	$a,$t1
	ldr	$t1,[$t0,#8]
	add	$b,$t2
	ldr	$t2,[$t0,#12]
	add	$c,$t1
	ldr	$t1,[$t0,#16]
	add	$d,$t2
	add	$e,$t1
	stmia	$t0!,{$a,$b,$c,$d,$e}	@ [+20]

	add	sp,#80*4		@ deallocate stack frame
	mov	$t0,$ctx		@ restore ctx
	mov	$t1,$inp		@ restore inp
	cmp	$t1,$len
	beq	.Lexit
	b	.Lloop			@ [+6] total 3212 cycles
.Lexit:
	pop	{r2-r7}
	mov	r8,r2
	mov	r9,r3
	mov	r10,r4
	mov	r11,r5
	mov	r12,r6
	mov	lr,r7
	pop	{r4-r7}
	bx	lr
.align	2
___
$code.=".Lcommon:\n".&common()."\tmov pc,lr\n" if (!$inline);
$code.=".Lrotate:\n".&rotate()."\tmov pc,lr\n" if (!$inline);
$code.=<<___;
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
.size	sha1_block_data_order,.-sha1_block_data_order
.asciz	"SHA1 block transform for Thumb, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
index f7ed67a726..4edc5ea9ad 100755
--- a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
@@ -29,14 +29,18 @@
 # Xeon P4	+65%		+0%		9.9
 # Core2	+60%		+10%		7.0
 
-$output=shift;
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-open STDOUT,"| $^X $xlate $output";
+open STDOUT,"| $^X $xlate $flavour $output";
 
 $ctx="%rdi";	# 1st arg
 $inp="%rsi";	# 2nd arg
@@ -69,13 +73,14 @@ $func:
 	push	%rbx
 	push	%rbp
 	push	%r12
-	mov	%rsp,%rax
+	mov	%rsp,%r11
 	mov	%rdi,$ctx	# reassigned argument
 	sub	\$`8+16*4`,%rsp
 	mov	%rsi,$inp	# reassigned argument
 	and	\$-64,%rsp
 	mov	%rdx,$num	# reassigned argument
-	mov	%rax,`16*4`(%rsp)
+	mov	%r11,`16*4`(%rsp)
+.Lprologue:
 
 	mov	0($ctx),$A
 	mov	4($ctx),$B
@@ -88,10 +93,12 @@ ___
 sub EPILOGUE {
 my $func=shift;
 $code.=<<___;
-	mov	`16*4`(%rsp),%rsp
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
+	mov	`16*4`(%rsp),%rsi
+	mov	(%rsi),%r12
+	mov	8(%rsi),%rbp
+	mov	16(%rsi),%rbx
+	lea	24(%rsi),%rsp
+.Lepilogue:
 	ret
 .size	$func,.-$func
 ___
@@ -233,7 +240,109 @@ ___
 &EPILOGUE("sha1_block_data_order");
 $code.=<<___;
 .asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lprologue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lprologue
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
+	jae	.Lin_prologue
+
+	mov	`16*4`(%rax),%rax	# pull saved stack pointer
+	lea	24(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_sha1_block_data_order
+	.rva	.LSEH_end_sha1_block_data_order
+	.rva	.LSEH_info_sha1_block_data_order
+
+.section	.xdata
+.align	8
+.LSEH_info_sha1_block_data_order:
+	.byte	9,0,0,0
+	.rva	se_handler
 ___
+}
 
 ####################################################################
 
diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl
new file mode 100644
index 0000000000..ecc8b69c75
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha256-586.pl
@@ -0,0 +1,251 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA256 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 46 36 41 27 26
16# icc 57 33 38 25 23
17# x86 asm 40 30 35 20 20
18# x86_64 asm(*) - - 21 15.8 16.5
19#
20# (*) x86_64 assembler performance is presented for reference
21# purposes.
22#
23# Performance improvement over compiler generated code varies from
24# 10% to 40% [see above]. Not very impressive on some µ-archs, but
25# it's 5 times smaller and optimizes the amount of writes.
26
27$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
28push(@INC,"${dir}","${dir}../../perlasm");
29require "x86asm.pl";
30
31&asm_init($ARGV[0],"sha256-586.pl",$ARGV[$#ARGV] eq "386");
32
33$A="eax";
34$E="edx";
35$T="ebx";
36$Aoff=&DWP(0,"esp");
37$Boff=&DWP(4,"esp");
38$Coff=&DWP(8,"esp");
39$Doff=&DWP(12,"esp");
40$Eoff=&DWP(16,"esp");
41$Foff=&DWP(20,"esp");
42$Goff=&DWP(24,"esp");
43$Hoff=&DWP(28,"esp");
44$Xoff=&DWP(32,"esp");
45$K256="ebp";
46
47sub BODY_00_15() {
48 my $in_16_63=shift;
49
50 &mov ("ecx",$E);
51 &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_63); # T += X[-7]
52 &ror ("ecx",6);
53 &mov ("edi",$E);
54 &ror ("edi",11);
55 &mov ("esi",$Foff);
56 &xor ("ecx","edi");
57 &ror ("edi",25-11);
58 &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0]
59 &xor ("ecx","edi"); # Sigma1(e)
60 &mov ("edi",$Goff);
61 &add ($T,"ecx"); # T += Sigma1(e)
62 &mov ($Eoff,$E); # modulo-scheduled
63
64 &xor ("esi","edi");
65 &mov ("ecx",$A);
66 &and ("esi",$E);
67 &mov ($E,$Doff); # e becomes d, which is e in next iteration
68 &xor ("esi","edi"); # Ch(e,f,g)
69 &mov ("edi",$A);
70 &add ($T,"esi"); # T += Ch(e,f,g)
71
72 &ror ("ecx",2);
73 &add ($T,$Hoff); # T += h
74 &ror ("edi",13);
75 &mov ("esi",$Boff);
76 &xor ("ecx","edi");
77 &ror ("edi",22-13);
78 &add ($E,$T); # d += T
79 &xor ("ecx","edi"); # Sigma0(a)
80 &mov ("edi",$Coff);
81
82 &add ($T,"ecx"); # T += Sigma0(a)
83 &mov ($Aoff,$A); # modulo-scheduled
84
85 &mov ("ecx",$A);
86 &sub ("esp",4);
87 &or ($A,"esi"); # a becomes h, which is a in next iteration
88 &and ("ecx","esi");
89 &and ($A,"edi");
90 &mov ("esi",&DWP(0,$K256));
91 &or ($A,"ecx"); # h=Maj(a,b,c)
92
93 &add ($K256,4);
94 &add ($A,$T); # h += T
95 &mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_63); # preload T
96 &add ($E,"esi"); # d += K256[i]
97 &add ($A,"esi"); # h += K256[i]
98}
99
100&function_begin("sha256_block_data_order");
101 &mov ("esi",wparam(0)); # ctx
102 &mov ("edi",wparam(1)); # inp
103 &mov ("eax",wparam(2)); # num
104 &mov ("ebx","esp"); # saved sp
105
106 &call (&label("pic_point")); # make it PIC!
107&set_label("pic_point");
108 &blindpop($K256);
109 &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
110
111 &sub ("esp",16);
112 &and ("esp",-64);
113
114 &shl ("eax",6);
115 &add ("eax","edi");
116 &mov (&DWP(0,"esp"),"esi"); # ctx
117 &mov (&DWP(4,"esp"),"edi"); # inp
118 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
119 &mov (&DWP(12,"esp"),"ebx"); # saved sp
120
121&set_label("loop",16);
122 # copy input block to stack reversing byte and dword order
123 for($i=0;$i<4;$i++) {
124 &mov ("eax",&DWP($i*16+0,"edi"));
125 &mov ("ebx",&DWP($i*16+4,"edi"));
126 &mov ("ecx",&DWP($i*16+8,"edi"));
127 &mov ("edx",&DWP($i*16+12,"edi"));
128 &bswap ("eax");
129 &bswap ("ebx");
130 &bswap ("ecx");
131 &bswap ("edx");
132 &push ("eax");
133 &push ("ebx");
134 &push ("ecx");
135 &push ("edx");
136 }
137 &add ("edi",64);
138 &sub ("esp",4*8); # place for A,B,C,D,E,F,G,H
139 &mov (&DWP(4*(8+16)+4,"esp"),"edi");
140
141 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
142 &mov ($A,&DWP(0,"esi"));
143 &mov ("ebx",&DWP(4,"esi"));
144 &mov ("ecx",&DWP(8,"esi"));
145 &mov ("edi",&DWP(12,"esi"));
146 # &mov ($Aoff,$A);
147 &mov ($Boff,"ebx");
148 &mov ($Coff,"ecx");
149 &mov ($Doff,"edi");
150 &mov ($E,&DWP(16,"esi"));
151 &mov ("ebx",&DWP(20,"esi"));
152 &mov ("ecx",&DWP(24,"esi"));
153 &mov ("edi",&DWP(28,"esi"));
154 # &mov ($Eoff,$E);
155 &mov ($Foff,"ebx");
156 &mov ($Goff,"ecx");
157 &mov ($Hoff,"edi");
158
159&set_label("00_15",16);
160 &mov ($T,&DWP(4*(8+15),"esp"));
161
162 &BODY_00_15();
163
164 &cmp ("esi",0xc19bf174);
165 &jne (&label("00_15"));
166
167 &mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1)
168&set_label("16_63",16);
169 &mov ("esi",$T);
170 &mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
171 &shr ($T,3);
172 &ror ("esi",7);
173 &xor ($T,"esi");
174 &ror ("esi",18-7);
175 &mov ("edi","ecx");
176 &xor ($T,"esi"); # T = sigma0(X[-15])
177
178 &shr ("ecx",10);
179 &mov ("esi",&DWP(4*(8+15+16),"esp"));
180 &ror ("edi",17);
181 &xor ("ecx","edi");
182 &ror ("edi",19-17);
183 &add ($T,"esi"); # T += X[-16]
184 &xor ("edi","ecx"); # sigma1(X[-2])
185
186 &add ($T,"edi"); # T += sigma1(X[-2])
187 # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1)
188 # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
189
190 &BODY_00_15(1);
191
192 &cmp ("esi",0xc67178f2);
193 &jne (&label("16_63"));
194
195 &mov ("esi",&DWP(4*(8+16+64)+0,"esp"));#ctx
196 # &mov ($A,$Aoff);
197 &mov ("ebx",$Boff);
198 &mov ("ecx",$Coff);
199 &mov ("edi",$Doff);
200 &add ($A,&DWP(0,"esi"));
201 &add ("ebx",&DWP(4,"esi"));
202 &add ("ecx",&DWP(8,"esi"));
203 &add ("edi",&DWP(12,"esi"));
204 &mov (&DWP(0,"esi"),$A);
205 &mov (&DWP(4,"esi"),"ebx");
206 &mov (&DWP(8,"esi"),"ecx");
207 &mov (&DWP(12,"esi"),"edi");
208 # &mov ($E,$Eoff);
209 &mov ("eax",$Foff);
210 &mov ("ebx",$Goff);
211 &mov ("ecx",$Hoff);
212 &mov ("edi",&DWP(4*(8+16+64)+4,"esp"));#inp
213 &add ($E,&DWP(16,"esi"));
214 &add ("eax",&DWP(20,"esi"));
215 &add ("ebx",&DWP(24,"esi"));
216 &add ("ecx",&DWP(28,"esi"));
217 &mov (&DWP(16,"esi"),$E);
218 &mov (&DWP(20,"esi"),"eax");
219 &mov (&DWP(24,"esi"),"ebx");
220 &mov (&DWP(28,"esi"),"ecx");
221
222 &add ("esp",4*(8+16+64)); # destroy frame
223 &sub ($K256,4*64); # rewind K
224
225 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
226 &jb (&label("loop"));
227
228 &mov ("esp",&DWP(12,"esp")); # restore sp
229&function_end_A();
230
231&set_label("K256",64); # Yes! I keep it in the code segment!
232 &data_word(0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5);
233 &data_word(0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5);
234 &data_word(0xd807aa98,0x12835b01,0x243185be,0x550c7dc3);
235 &data_word(0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174);
236 &data_word(0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc);
237 &data_word(0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da);
238 &data_word(0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7);
239 &data_word(0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967);
240 &data_word(0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13);
241 &data_word(0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85);
242 &data_word(0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3);
243 &data_word(0xd192e819,0xd6990624,0xf40e3585,0x106aa070);
244 &data_word(0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5);
245 &data_word(0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3);
246 &data_word(0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208);
247 &data_word(0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2);
248&function_end_B("sha256_block_data_order");
249&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
250
251&asm_finish();
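For reference, the register-and-stack choreography in BODY_00_15 above is the standard SHA-256 round. The plain-Perl sketch below performs the same update on ordinary variables; it folds K256[i] into T up front, whereas the assembly adds K256[i] to d and h separately at the end, with the same net result. The loop exits also work off the constants: esi holds the K256 word just consumed and is compared against 0xc19bf174 and 0xc67178f2, the 16th and 64th entries of the table.

# Plain-Perl reference for one SHA-256 round (a sketch, not part of the
# generator).  @S holds the working variables a..h as plain integers.
sub rotr32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }

sub sha256_round {
	my ($W, $K, @S) = @_;			# $W = X[i], $K = K256[i]
	my ($a,$b,$c,$d,$e,$f,$g,$h) = @S;
	my $Sigma1 = rotr32($e,6) ^ rotr32($e,11) ^ rotr32($e,25);
	my $Ch     = (($f ^ $g) & $e) ^ $g;	# same (f^g)&e^g form as above
	my $Sigma0 = rotr32($a,2) ^ rotr32($a,13) ^ rotr32($a,22);
	my $Maj    = (($a | $b) & $c) | ($a & $b);
	my $T = ($h + $Sigma1 + $Ch + $K + $W) & 0xffffffff;
	return (($T + $Sigma0 + $Maj) & 0xffffffff,	# new a
		$a, $b, $c,
		($d + $T) & 0xffffffff,			# new e
		$e, $f, $g);
}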
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
new file mode 100644
index 0000000000..48d846deec
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
@@ -0,0 +1,181 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 block procedure for ARMv4. May 2007.
11
12# Performance is ~2x better than gcc 3.4 generated code and in
13# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
14# per byte.
15
16$output=shift;
17open STDOUT,">$output";
18
19$ctx="r0"; $t0="r0";
20$inp="r1";
21$len="r2"; $t1="r2";
22$T1="r3";
23$A="r4";
24$B="r5";
25$C="r6";
26$D="r7";
27$E="r8";
28$F="r9";
29$G="r10";
30$H="r11";
31@V=($A,$B,$C,$D,$E,$F,$G,$H);
32$t2="r12";
33$Ktbl="r14";
34
35@Sigma0=( 2,13,22);
36@Sigma1=( 6,11,25);
37@sigma0=( 7,18, 3);
38@sigma1=(17,19,10);
39
40sub BODY_00_15 {
41my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
42
43$code.=<<___ if ($i<16);
44 ldrb $T1,[$inp,#3] @ $i
45 ldrb $t2,[$inp,#2]
46 ldrb $t1,[$inp,#1]
47 ldrb $t0,[$inp],#4
48 orr $T1,$T1,$t2,lsl#8
49 orr $T1,$T1,$t1,lsl#16
50 orr $T1,$T1,$t0,lsl#24
51 `"str $inp,[sp,#17*4]" if ($i==15)`
52___
53$code.=<<___;
54 ldr $t2,[$Ktbl],#4 @ *K256++
55 str $T1,[sp,#`$i%16`*4]
56 mov $t0,$e,ror#$Sigma1[0]
57 eor $t0,$t0,$e,ror#$Sigma1[1]
58 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
59 add $T1,$T1,$t0
60 eor $t1,$f,$g
61 and $t1,$t1,$e
62 eor $t1,$t1,$g @ Ch(e,f,g)
63 add $T1,$T1,$t1
64 add $T1,$T1,$h
65 add $T1,$T1,$t2
66 mov $h,$a,ror#$Sigma0[0]
67 eor $h,$h,$a,ror#$Sigma0[1]
68 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
69 orr $t0,$a,$b
70 and $t0,$t0,$c
71 and $t1,$a,$b
72 orr $t0,$t0,$t1 @ Maj(a,b,c)
73 add $h,$h,$t0
74 add $d,$d,$T1
75 add $h,$h,$T1
76___
77}
78
79sub BODY_16_XX {
80my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
81
82$code.=<<___;
83 ldr $t1,[sp,#`($i+1)%16`*4] @ $i
84 ldr $t2,[sp,#`($i+14)%16`*4]
85 ldr $T1,[sp,#`($i+0)%16`*4]
86 ldr $inp,[sp,#`($i+9)%16`*4]
87 mov $t0,$t1,ror#$sigma0[0]
88 eor $t0,$t0,$t1,ror#$sigma0[1]
89 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
90 mov $t1,$t2,ror#$sigma1[0]
91 eor $t1,$t1,$t2,ror#$sigma1[1]
92 eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
93 add $T1,$T1,$t0
94 add $T1,$T1,$t1
95 add $T1,$T1,$inp
96___
97 &BODY_00_15(@_);
98}
99
100$code=<<___;
101.text
102.code 32
103
104.type K256,%object
105.align 5
106K256:
107.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
108.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
109.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
110.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
111.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
112.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
113.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
114.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
115.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
116.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
117.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
118.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
119.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
120.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
121.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
122.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
123.size K256,.-K256
124
125.global sha256_block_data_order
126.type sha256_block_data_order,%function
127sha256_block_data_order:
128 sub r3,pc,#8 @ sha256_block_data_order
129 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
130 stmdb sp!,{$ctx,$inp,$len,r4-r12,lr}
131 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
132 sub $Ktbl,r3,#256 @ K256
133 sub sp,sp,#16*4 @ alloca(X[16])
134.Loop:
135___
136for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
137$code.=".Lrounds_16_xx:\n";
138for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
139$code.=<<___;
140 and $t2,$t2,#0xff
141 cmp $t2,#0xf2
142 bne .Lrounds_16_xx
143
144 ldr $T1,[sp,#16*4] @ pull ctx
145 ldr $t0,[$T1,#0]
146 ldr $t1,[$T1,#4]
147 ldr $t2,[$T1,#8]
148 add $A,$A,$t0
149 ldr $t0,[$T1,#12]
150 add $B,$B,$t1
151 ldr $t1,[$T1,#16]
152 add $C,$C,$t2
153 ldr $t2,[$T1,#20]
154 add $D,$D,$t0
155 ldr $t0,[$T1,#24]
156 add $E,$E,$t1
157 ldr $t1,[$T1,#28]
158 add $F,$F,$t2
159 ldr $inp,[sp,#17*4] @ pull inp
160 ldr $t2,[sp,#18*4] @ pull inp+len
161 add $G,$G,$t0
162 add $H,$H,$t1
163 stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
164 cmp $inp,$t2
165 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
166 bne .Loop
167
168 add sp,sp,#`16+3`*4 @ destroy frame
169 ldmia sp!,{r4-r12,lr}
170 tst lr,#1
171 moveq pc,lr @ be binary compatible with V4, yet
172 bx lr @ interoperable with Thumb ISA:-)
173.size sha256_block_data_order,.-sha256_block_data_order
174.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
175.align 2
176___
177
178$code =~ s/\`([^\`]*)\`/eval $1/gem;
179$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
180print $code;
181close STDOUT; # enforce flush
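BODY_16_XX above extends the message schedule in place and then falls through to BODY_00_15. A plain-Perl model of the update it performs on the 16-word circular window kept on the stack (sigma0 = ROTR 7 ^ ROTR 18 ^ SHR 3 and sigma1 = ROTR 17 ^ ROTR 19 ^ SHR 10, matching the @sigma0/@sigma1 tables):

# Sketch of the schedule update, not part of the generator:
#   X[i%16] += sigma0(X[(i+1)%16]) + sigma1(X[(i+14)%16]) + X[(i+9)%16]
sub rotr32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }

sub schedule_update {
	my ($X, $i) = @_;			# $X: ref to the 16-word window
	my $x1  = $X->[($i+1)  % 16];
	my $x14 = $X->[($i+14) % 16];
	my $s0 = rotr32($x1,7)   ^ rotr32($x1,18)  ^ ($x1  >> 3);
	my $s1 = rotr32($x14,17) ^ rotr32($x14,19) ^ ($x14 >> 10);
	$X->[$i % 16] = ($X->[$i % 16] + $s0 + $s1 + $X->[($i+9) % 16])
			& 0xffffffff;
}

Note the exit test after the unrolled rounds: instead of keeping a counter, the code compares the low byte of the K256 word just consumed against 0xf2, the low byte of the final constant 0xc67178f2.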
diff --git a/src/lib/libcrypto/sha/asm/sha512-586.pl b/src/lib/libcrypto/sha/asm/sha512-586.pl
new file mode 100644
index 0000000000..5b9f3337ad
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-586.pl
@@ -0,0 +1,644 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA512 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 100 75 116 54 66
16# icc 97 77 95 55 57
17# x86 asm 61 56 82 36 40
18# SSE2 asm - - 38 24 20
19# x86_64 asm(*) - - 30 10.0 10.5
20#
21# (*) x86_64 assembler performance is presented for reference
22# purposes.
23#
24# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
25# performance improvement over compiler generated code reaches ~60%,
26# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
27# to 50%, but it's less important as they are expected to execute SSE2
28# code-path, which is commonly ~2-3x faster [than compiler generated
29# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
30# though it does not use 128-bit operations. The latter means that
31# SSE2-aware kernel is no longer required to execute the code. Another
32# difference is that new code optimizes amount of writes, but at the
33# cost of increased data cache "footprint" by 1/2KB.
34
35$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
36push(@INC,"${dir}","${dir}../../perlasm");
37require "x86asm.pl";
38
39&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
40
41$sse2=0;
42for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
43
44&external_label("OPENSSL_ia32cap_P") if ($sse2);
45
46$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
47$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
48$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
49$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
50$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
51$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
52$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
53$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
54$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");
55$K512="ebp";
56
57$Asse2=&QWP(0,"esp");
58$Bsse2=&QWP(8,"esp");
59$Csse2=&QWP(16,"esp");
60$Dsse2=&QWP(24,"esp");
61$Esse2=&QWP(32,"esp");
62$Fsse2=&QWP(40,"esp");
63$Gsse2=&QWP(48,"esp");
64$Hsse2=&QWP(56,"esp");
65
66$A="mm0"; # B-D and
67$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
68 # mm5-mm7, but it's done on an on-demand basis...
69
70sub BODY_00_15_sse2 {
71 my $prefetch=shift;
72
73 &movq ("mm5",$Fsse2); # load f
74 &movq ("mm6",$Gsse2); # load g
75 &movq ("mm7",$Hsse2); # load h
76
77 &movq ("mm1",$E); # %mm1 is sliding right
78 &movq ("mm2",$E); # %mm2 is sliding left
79 &psrlq ("mm1",14);
80 &movq ($Esse2,$E); # modulo-scheduled save e
81 &psllq ("mm2",23);
82 &movq ("mm3","mm1"); # %mm3 is T1
83 &psrlq ("mm1",4);
84 &pxor ("mm3","mm2");
85 &psllq ("mm2",23);
86 &pxor ("mm3","mm1");
87 &psrlq ("mm1",23);
88 &pxor ("mm3","mm2");
89 &psllq ("mm2",4);
90 &pxor ("mm3","mm1");
91 &paddq ("mm7",QWP(0,$K512)); # h+=K512[i]
92 &pxor ("mm3","mm2"); # T1=Sigma1_512(e)
93
94 &pxor ("mm5","mm6"); # f^=g
95 &movq ("mm1",$Bsse2); # load b
96 &pand ("mm5",$E); # f&=e
97 &movq ("mm2",$Csse2); # load c
98 &pxor ("mm5","mm6"); # f^=g
99 &movq ($E,$Dsse2); # e = load d
100 &paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
101 &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
102 &paddq ("mm3","mm7"); # T1+=h
103
104 &movq ("mm5",$A); # %mm5 is sliding right
105 &movq ("mm6",$A); # %mm6 is sliding left
106 &paddq ("mm3",&QWP(8*9,"esp")); # T1+=X[0]
107 &psrlq ("mm5",28);
108 &paddq ($E,"mm3"); # e += T1
109 &psllq ("mm6",25);
110 &movq ("mm7","mm5"); # %mm7 is T2
111 &psrlq ("mm5",6);
112 &pxor ("mm7","mm6");
113 &psllq ("mm6",5);
114 &pxor ("mm7","mm5");
115 &psrlq ("mm5",5);
116 &pxor ("mm7","mm6");
117 &psllq ("mm6",6);
118 &pxor ("mm7","mm5");
119 &sub ("esp",8);
120 &pxor ("mm7","mm6"); # T2=Sigma0_512(a)
121
122 &movq ("mm5",$A); # %mm5=a
123 &por ($A,"mm2"); # a=a|c
124 &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
125 &pand ("mm5","mm2"); # %mm5=a&c
126 &pand ($A,"mm1"); # a=(a|c)&b
127 &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch);
128 &por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
129 &paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
130 &movq ($A,"mm3"); # a=T1
131
132 &mov (&LB("edx"),&BP(0,$K512));
133 &paddq ($A,"mm7"); # a+=T2
134 &add ($K512,8);
135}
136
137sub BODY_00_15_x86 {
138 #define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
139 # LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
140 # HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
141 &mov ("ecx",$Elo);
142 &mov ("edx",$Ehi);
143 &mov ("esi","ecx");
144
145 &shr ("ecx",9); # lo>>9
146 &mov ("edi","edx");
147 &shr ("edx",9); # hi>>9
148 &mov ("ebx","ecx");
149 &shl ("esi",14); # lo<<14
150 &mov ("eax","edx");
151 &shl ("edi",14); # hi<<14
152 &xor ("ebx","esi");
153
154 &shr ("ecx",14-9); # lo>>14
155 &xor ("eax","edi");
156 &shr ("edx",14-9); # hi>>14
157 &xor ("eax","ecx");
158 &shl ("esi",18-14); # lo<<18
159 &xor ("ebx","edx");
160 &shl ("edi",18-14); # hi<<18
161 &xor ("ebx","esi");
162
163 &shr ("ecx",18-14); # lo>>18
164 &xor ("eax","edi");
165 &shr ("edx",18-14); # hi>>18
166 &xor ("eax","ecx");
167 &shl ("esi",23-18); # lo<<23
168 &xor ("ebx","edx");
169 &shl ("edi",23-18); # hi<<23
170 &xor ("eax","esi");
171 &xor ("ebx","edi"); # T1 = Sigma1(e)
172
173 &mov ("ecx",$Flo);
174 &mov ("edx",$Fhi);
175 &mov ("esi",$Glo);
176 &mov ("edi",$Ghi);
177 &add ("eax",$Hlo);
178 &adc ("ebx",$Hhi); # T1 += h
179 &xor ("ecx","esi");
180 &xor ("edx","edi");
181 &and ("ecx",$Elo);
182 &and ("edx",$Ehi);
183 &add ("eax",&DWP(8*(9+15)+0,"esp"));
184 &adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0]
185 &xor ("ecx","esi");
186 &xor ("edx","edi"); # Ch(e,f,g) = (f^g)&e)^g
187
188 &mov ("esi",&DWP(0,$K512));
189 &mov ("edi",&DWP(4,$K512)); # K[i]
190 &add ("eax","ecx");
191 &adc ("ebx","edx"); # T1 += Ch(e,f,g)
192 &mov ("ecx",$Dlo);
193 &mov ("edx",$Dhi);
194 &add ("eax","esi");
195 &adc ("ebx","edi"); # T1 += K[i]
196 &mov ($Tlo,"eax");
197 &mov ($Thi,"ebx"); # put T1 away
198 &add ("eax","ecx");
199 &adc ("ebx","edx"); # d += T1
200
201 #define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
202 # LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
203 # HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
204 &mov ("ecx",$Alo);
205 &mov ("edx",$Ahi);
206 &mov ($Dlo,"eax");
207 &mov ($Dhi,"ebx");
208 &mov ("esi","ecx");
209
210 &shr ("ecx",2); # lo>>2
211 &mov ("edi","edx");
212 &shr ("edx",2); # hi>>2
213 &mov ("ebx","ecx");
214 &shl ("esi",4); # lo<<4
215 &mov ("eax","edx");
216 &shl ("edi",4); # hi<<4
217 &xor ("ebx","esi");
218
219 &shr ("ecx",7-2); # lo>>7
220 &xor ("eax","edi");
221 &shr ("edx",7-2); # hi>>7
222 &xor ("ebx","ecx");
223 &shl ("esi",25-4); # lo<<25
224 &xor ("eax","edx");
225 &shl ("edi",25-4); # hi<<25
226 &xor ("eax","esi");
227
228 &shr ("ecx",28-7); # lo>>28
229 &xor ("ebx","edi");
230 &shr ("edx",28-7); # hi>>28
231 &xor ("eax","ecx");
232 &shl ("esi",30-25); # lo<<30
233 &xor ("ebx","edx");
234 &shl ("edi",30-25); # hi<<30
235 &xor ("eax","esi");
236 &xor ("ebx","edi"); # Sigma0(a)
237
238 &mov ("ecx",$Alo);
239 &mov ("edx",$Ahi);
240 &mov ("esi",$Blo);
241 &mov ("edi",$Bhi);
242 &add ("eax",$Tlo);
243 &adc ("ebx",$Thi); # T1 = Sigma0(a)+T1
244 &or ("ecx","esi");
245 &or ("edx","edi");
246 &and ("ecx",$Clo);
247 &and ("edx",$Chi);
248 &and ("esi",$Alo);
249 &and ("edi",$Ahi);
250 &or ("ecx","esi");
251 &or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b)
252
253 &add ("eax","ecx");
254 &adc ("ebx","edx"); # T1 += Maj(a,b,c)
255 &mov ($Tlo,"eax");
256 &mov ($Thi,"ebx");
257
258 &mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K
259 &sub ("esp",8);
260 &lea ($K512,&DWP(8,$K512)); # K++
261}
262
263
264&function_begin("sha512_block_data_order");
265 &mov ("esi",wparam(0)); # ctx
266 &mov ("edi",wparam(1)); # inp
267 &mov ("eax",wparam(2)); # num
268 &mov ("ebx","esp"); # saved sp
269
270 &call (&label("pic_point")); # make it PIC!
271&set_label("pic_point");
272 &blindpop($K512);
273 &lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
274
275 &sub ("esp",16);
276 &and ("esp",-64);
277
278 &shl ("eax",7);
279 &add ("eax","edi");
280 &mov (&DWP(0,"esp"),"esi"); # ctx
281 &mov (&DWP(4,"esp"),"edi"); # inp
282 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
283 &mov (&DWP(12,"esp"),"ebx"); # saved sp
284
285if ($sse2) {
286 &picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
287 &bt (&DWP(0,"edx"),26);
288 &jnc (&label("loop_x86"));
289
290 # load ctx->h[0-7]
291 &movq ($A,&QWP(0,"esi"));
292 &movq ("mm1",&QWP(8,"esi"));
293 &movq ("mm2",&QWP(16,"esi"));
294 &movq ("mm3",&QWP(24,"esi"));
295 &movq ($E,&QWP(32,"esi"));
296 &movq ("mm5",&QWP(40,"esi"));
297 &movq ("mm6",&QWP(48,"esi"));
298 &movq ("mm7",&QWP(56,"esi"));
299 &sub ("esp",8*10);
300
301&set_label("loop_sse2",16);
302 # &movq ($Asse2,$A);
303 &movq ($Bsse2,"mm1");
304 &movq ($Csse2,"mm2");
305 &movq ($Dsse2,"mm3");
306 # &movq ($Esse2,$E);
307 &movq ($Fsse2,"mm5");
308 &movq ($Gsse2,"mm6");
309 &movq ($Hsse2,"mm7");
310
311 &mov ("ecx",&DWP(0,"edi"));
312 &mov ("edx",&DWP(4,"edi"));
313 &add ("edi",8);
314 &bswap ("ecx");
315 &bswap ("edx");
316 &mov (&DWP(8*9+4,"esp"),"ecx");
317 &mov (&DWP(8*9+0,"esp"),"edx");
318
319&set_label("00_14_sse2",16);
320 &mov ("eax",&DWP(0,"edi"));
321 &mov ("ebx",&DWP(4,"edi"));
322 &add ("edi",8);
323 &bswap ("eax");
324 &bswap ("ebx");
325 &mov (&DWP(8*8+4,"esp"),"eax");
326 &mov (&DWP(8*8+0,"esp"),"ebx");
327
328 &BODY_00_15_sse2();
329
330 &cmp (&LB("edx"),0x35);
331 &jne (&label("00_14_sse2"));
332
333 &BODY_00_15_sse2(1);
334
335&set_label("16_79_sse2",16);
336 #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15
337 #&movq ("mm6",&QWP(8*(9+16-14),"esp"));
338 &movq ("mm1","mm2");
339
340 &psrlq ("mm2",1);
341 &movq ("mm7","mm6");
342 &psrlq ("mm6",6);
343 &movq ("mm3","mm2");
344
345 &psrlq ("mm2",7-1);
346 &movq ("mm5","mm6");
347 &psrlq ("mm6",19-6);
348 &pxor ("mm3","mm2");
349
350 &psrlq ("mm2",8-7);
351 &pxor ("mm5","mm6");
352 &psrlq ("mm6",61-19);
353 &pxor ("mm3","mm2");
354
355 &movq ("mm2",&QWP(8*(9+16),"esp"));
356
357 &psllq ("mm1",56);
358 &pxor ("mm5","mm6");
359 &psllq ("mm7",3);
360 &pxor ("mm3","mm1");
361
362 &paddq ("mm2",&QWP(8*(9+16-9),"esp"));
363
364 &psllq ("mm1",63-56);
365 &pxor ("mm5","mm7");
366 &psllq ("mm7",45-3);
367 &pxor ("mm3","mm1");
368 &pxor ("mm5","mm7");
369
370 &paddq ("mm3","mm5");
371 &paddq ("mm3","mm2");
372 &movq (&QWP(8*9,"esp"),"mm3");
373
374 &BODY_00_15_sse2(1);
375
376 &cmp (&LB("edx"),0x17);
377 &jne (&label("16_79_sse2"));
378
379 # &movq ($A,$Asse2);
380 &movq ("mm1",$Bsse2);
381 &movq ("mm2",$Csse2);
382 &movq ("mm3",$Dsse2);
383 # &movq ($E,$Esse2);
384 &movq ("mm5",$Fsse2);
385 &movq ("mm6",$Gsse2);
386 &movq ("mm7",$Hsse2);
387
388 &paddq ($A,&QWP(0,"esi"));
389 &paddq ("mm1",&QWP(8,"esi"));
390 &paddq ("mm2",&QWP(16,"esi"));
391 &paddq ("mm3",&QWP(24,"esi"));
392 &paddq ($E,&QWP(32,"esi"));
393 &paddq ("mm5",&QWP(40,"esi"));
394 &paddq ("mm6",&QWP(48,"esi"));
395 &paddq ("mm7",&QWP(56,"esi"));
396
397 &movq (&QWP(0,"esi"),$A);
398 &movq (&QWP(8,"esi"),"mm1");
399 &movq (&QWP(16,"esi"),"mm2");
400 &movq (&QWP(24,"esi"),"mm3");
401 &movq (&QWP(32,"esi"),$E);
402 &movq (&QWP(40,"esi"),"mm5");
403 &movq (&QWP(48,"esi"),"mm6");
404 &movq (&QWP(56,"esi"),"mm7");
405
406 &add ("esp",8*80); # destroy frame
407 &sub ($K512,8*80); # rewind K
408
409 &cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
410 &jb (&label("loop_sse2"));
411
412 &emms ();
413 &mov ("esp",&DWP(8*10+12,"esp")); # restore sp
414&function_end_A();
415}
416&set_label("loop_x86",16);
417 # copy input block to stack reversing byte and qword order
418 for ($i=0;$i<8;$i++) {
419 &mov ("eax",&DWP($i*16+0,"edi"));
420 &mov ("ebx",&DWP($i*16+4,"edi"));
421 &mov ("ecx",&DWP($i*16+8,"edi"));
422 &mov ("edx",&DWP($i*16+12,"edi"));
423 &bswap ("eax");
424 &bswap ("ebx");
425 &bswap ("ecx");
426 &bswap ("edx");
427 &push ("eax");
428 &push ("ebx");
429 &push ("ecx");
430 &push ("edx");
431 }
432 &add ("edi",128);
433 &sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
434 &mov (&DWP(8*(9+16)+4,"esp"),"edi");
435
436 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
437 &lea ("edi",&DWP(8,"esp"));
438 &mov ("ecx",16);
439 &data_word(0xA5F3F689); # rep movsd
440
441&set_label("00_15_x86",16);
442 &BODY_00_15_x86();
443
444 &cmp (&LB("edx"),0x94);
445 &jne (&label("00_15_x86"));
446
447&set_label("16_79_x86",16);
448 #define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
449 # LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
450 # HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
451 &mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
452 &mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
453 &mov ("esi","ecx");
454
455 &shr ("ecx",1); # lo>>1
456 &mov ("edi","edx");
457 &shr ("edx",1); # hi>>1
458 &mov ("eax","ecx");
459 &shl ("esi",24); # lo<<24
460 &mov ("ebx","edx");
461 &shl ("edi",24); # hi<<24
462 &xor ("ebx","esi");
463
464 &shr ("ecx",7-1); # lo>>7
465 &xor ("eax","edi");
466 &shr ("edx",7-1); # hi>>7
467 &xor ("eax","ecx");
468 &shl ("esi",31-24); # lo<<31
469 &xor ("ebx","edx");
470 &shl ("edi",25-24); # hi<<25
471 &xor ("ebx","esi");
472
473 &shr ("ecx",8-7); # lo>>8
474 &xor ("eax","edi");
475 &shr ("edx",8-7); # hi>>8
476 &xor ("eax","ecx");
477 &shl ("edi",31-25); # hi<<31
478 &xor ("ebx","edx");
479 &xor ("eax","edi"); # T1 = sigma0(X[-15])
480
481 &mov (&DWP(0,"esp"),"eax");
482 &mov (&DWP(4,"esp"),"ebx"); # put T1 away
483
484 #define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
485 # LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
486 # HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
487 &mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
488 &mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
489 &mov ("esi","ecx");
490
491 &shr ("ecx",6); # lo>>6
492 &mov ("edi","edx");
493 &shr ("edx",6); # hi>>6
494 &mov ("eax","ecx");
495 &shl ("esi",3); # lo<<3
496 &mov ("ebx","edx");
497 &shl ("edi",3); # hi<<3
498 &xor ("eax","esi");
499
500 &shr ("ecx",19-6); # lo>>19
501 &xor ("ebx","edi");
502 &shr ("edx",19-6); # hi>>19
503 &xor ("eax","ecx");
504 &shl ("esi",13-3); # lo<<13
505 &xor ("ebx","edx");
506 &shl ("edi",13-3); # hi<<13
507 &xor ("ebx","esi");
508
509 &shr ("ecx",29-19); # lo>>29
510 &xor ("eax","edi");
511 &shr ("edx",29-19); # hi>>29
512 &xor ("ebx","ecx");
513 &shl ("edi",26-13); # hi<<26
514 &xor ("eax","edx");
515 &xor ("eax","edi"); # sigma1(X[-2])
516
517 &mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
518 &mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
519 &add ("eax",&DWP(0,"esp"));
520 &adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
521 &mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
522 &mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
523 &add ("eax","ecx");
524 &adc ("ebx","edx"); # T1 += X[-16]
525 &add ("eax","esi");
526 &adc ("ebx","edi"); # T1 += X[-7]
527 &mov (&DWP(8*(9+15)+0,"esp"),"eax");
528 &mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
529
530 &BODY_00_15_x86();
531
532 &cmp (&LB("edx"),0x17);
533 &jne (&label("16_79_x86"));
534
535 &mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
536 &mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
537 for($i=0;$i<4;$i++) {
538 &mov ("eax",&DWP($i*16+0,"esi"));
539 &mov ("ebx",&DWP($i*16+4,"esi"));
540 &mov ("ecx",&DWP($i*16+8,"esi"));
541 &mov ("edx",&DWP($i*16+12,"esi"));
542 &add ("eax",&DWP(8+($i*16)+0,"esp"));
543 &adc ("ebx",&DWP(8+($i*16)+4,"esp"));
544 &mov (&DWP($i*16+0,"esi"),"eax");
545 &mov (&DWP($i*16+4,"esi"),"ebx");
546 &add ("ecx",&DWP(8+($i*16)+8,"esp"));
547 &adc ("edx",&DWP(8+($i*16)+12,"esp"));
548 &mov (&DWP($i*16+8,"esi"),"ecx");
549 &mov (&DWP($i*16+12,"esi"),"edx");
550 }
551 &add ("esp",8*(9+16+80)); # destroy frame
552 &sub ($K512,8*80); # rewind K
553
554 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
555 &jb (&label("loop_x86"));
556
557 &mov ("esp",&DWP(12,"esp")); # restore sp
558&function_end_A();
559
560&set_label("K512",64); # Yes! I keep it in the code segment!
561 &data_word(0xd728ae22,0x428a2f98); # u64
562 &data_word(0x23ef65cd,0x71374491); # u64
563 &data_word(0xec4d3b2f,0xb5c0fbcf); # u64
564 &data_word(0x8189dbbc,0xe9b5dba5); # u64
565 &data_word(0xf348b538,0x3956c25b); # u64
566 &data_word(0xb605d019,0x59f111f1); # u64
567 &data_word(0xaf194f9b,0x923f82a4); # u64
568 &data_word(0xda6d8118,0xab1c5ed5); # u64
569 &data_word(0xa3030242,0xd807aa98); # u64
570 &data_word(0x45706fbe,0x12835b01); # u64
571 &data_word(0x4ee4b28c,0x243185be); # u64
572 &data_word(0xd5ffb4e2,0x550c7dc3); # u64
573 &data_word(0xf27b896f,0x72be5d74); # u64
574 &data_word(0x3b1696b1,0x80deb1fe); # u64
575 &data_word(0x25c71235,0x9bdc06a7); # u64
576 &data_word(0xcf692694,0xc19bf174); # u64
577 &data_word(0x9ef14ad2,0xe49b69c1); # u64
578 &data_word(0x384f25e3,0xefbe4786); # u64
579 &data_word(0x8b8cd5b5,0x0fc19dc6); # u64
580 &data_word(0x77ac9c65,0x240ca1cc); # u64
581 &data_word(0x592b0275,0x2de92c6f); # u64
582 &data_word(0x6ea6e483,0x4a7484aa); # u64
583 &data_word(0xbd41fbd4,0x5cb0a9dc); # u64
584 &data_word(0x831153b5,0x76f988da); # u64
585 &data_word(0xee66dfab,0x983e5152); # u64
586 &data_word(0x2db43210,0xa831c66d); # u64
587 &data_word(0x98fb213f,0xb00327c8); # u64
588 &data_word(0xbeef0ee4,0xbf597fc7); # u64
589 &data_word(0x3da88fc2,0xc6e00bf3); # u64
590 &data_word(0x930aa725,0xd5a79147); # u64
591 &data_word(0xe003826f,0x06ca6351); # u64
592 &data_word(0x0a0e6e70,0x14292967); # u64
593 &data_word(0x46d22ffc,0x27b70a85); # u64
594 &data_word(0x5c26c926,0x2e1b2138); # u64
595 &data_word(0x5ac42aed,0x4d2c6dfc); # u64
596 &data_word(0x9d95b3df,0x53380d13); # u64
597 &data_word(0x8baf63de,0x650a7354); # u64
598 &data_word(0x3c77b2a8,0x766a0abb); # u64
599 &data_word(0x47edaee6,0x81c2c92e); # u64
600 &data_word(0x1482353b,0x92722c85); # u64
601 &data_word(0x4cf10364,0xa2bfe8a1); # u64
602 &data_word(0xbc423001,0xa81a664b); # u64
603 &data_word(0xd0f89791,0xc24b8b70); # u64
604 &data_word(0x0654be30,0xc76c51a3); # u64
605 &data_word(0xd6ef5218,0xd192e819); # u64
606 &data_word(0x5565a910,0xd6990624); # u64
607 &data_word(0x5771202a,0xf40e3585); # u64
608 &data_word(0x32bbd1b8,0x106aa070); # u64
609 &data_word(0xb8d2d0c8,0x19a4c116); # u64
610 &data_word(0x5141ab53,0x1e376c08); # u64
611 &data_word(0xdf8eeb99,0x2748774c); # u64
612 &data_word(0xe19b48a8,0x34b0bcb5); # u64
613 &data_word(0xc5c95a63,0x391c0cb3); # u64
614 &data_word(0xe3418acb,0x4ed8aa4a); # u64
615 &data_word(0x7763e373,0x5b9cca4f); # u64
616 &data_word(0xd6b2b8a3,0x682e6ff3); # u64
617 &data_word(0x5defb2fc,0x748f82ee); # u64
618 &data_word(0x43172f60,0x78a5636f); # u64
619 &data_word(0xa1f0ab72,0x84c87814); # u64
620 &data_word(0x1a6439ec,0x8cc70208); # u64
621 &data_word(0x23631e28,0x90befffa); # u64
622 &data_word(0xde82bde9,0xa4506ceb); # u64
623 &data_word(0xb2c67915,0xbef9a3f7); # u64
624 &data_word(0xe372532b,0xc67178f2); # u64
625 &data_word(0xea26619c,0xca273ece); # u64
626 &data_word(0x21c0c207,0xd186b8c7); # u64
627 &data_word(0xcde0eb1e,0xeada7dd6); # u64
628 &data_word(0xee6ed178,0xf57d4f7f); # u64
629 &data_word(0x72176fba,0x06f067aa); # u64
630 &data_word(0xa2c898a6,0x0a637dc5); # u64
631 &data_word(0xbef90dae,0x113f9804); # u64
632 &data_word(0x131c471b,0x1b710b35); # u64
633 &data_word(0x23047d84,0x28db77f5); # u64
634 &data_word(0x40c72493,0x32caab7b); # u64
635 &data_word(0x15c9bebc,0x3c9ebe0a); # u64
636 &data_word(0x9c100d4c,0x431d67c4); # u64
637 &data_word(0xcb3e42b6,0x4cc5d4be); # u64
638 &data_word(0xfc657e2a,0x597f299c); # u64
639 &data_word(0x3ad6faec,0x5fcb6fab); # u64
640 &data_word(0x4a475817,0x6c44198c); # u64
641&function_end_B("sha512_block_data_order");
642&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
643
644&asm_finish();
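Every Sigma and sigma in the code paths above synthesizes a 64-bit rotate from 32-bit shifts of the two halves, exactly as the LO/HI comment pairs spell out. A minimal plain-Perl sketch of the decomposition, assuming the 64-bit value is kept as separate 32-bit lo/hi words as in the code (and a 64-bit perl, so the intermediate shifts are well-defined):

# Sketch only: ROTR64 by n (0 < n < 64) on a value split into halves.
sub rotr64_split {
	my ($lo, $hi, $n) = @_;
	if ($n < 32) {
		return ((($lo >> $n) | ($hi << (32-$n))) & 0xffffffff,
			(($hi >> $n) | ($lo << (32-$n))) & 0xffffffff);
	}
	# n >= 32: rotate by n-32 with the halves swapped
	return rotr64_split($hi, $lo, $n-32);
}

Rotations by 32 or more reduce to a smaller rotation with the halves swapped, which is why ROTR 41 appears in the Sigma1 comments as hi>>9^lo<<23 on the LO line and lo>>9^hi<<23 on the HI line.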
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
new file mode 100644
index 0000000000..4fbb94a914
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
@@ -0,0 +1,399 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA512 block procedure for ARMv4. September 2007.
11
12# This code is ~4.5 (four and a half) times faster than code generated
13# by gcc 3.4 and it spends ~72 clock cycles per byte.
14
15# Byte order [in]dependence. =========================================
16#
17# Caller is expected to maintain specific *dword* order in h[0-7],
18# namely with most significant dword at *lower* address, which is
19# reflected in the two parameters below. *Byte* order within these dwords
20# in turn is whatever the *native* byte order is on the current platform.
21$hi=0;
22$lo=4;
23# ====================================================================
24
25$output=shift;
26open STDOUT,">$output";
27
28$ctx="r0";
29$inp="r1";
30$len="r2";
31$Tlo="r3";
32$Thi="r4";
33$Alo="r5";
34$Ahi="r6";
35$Elo="r7";
36$Ehi="r8";
37$t0="r9";
38$t1="r10";
39$t2="r11";
40$t3="r12";
41############ r13 is stack pointer
42$Ktbl="r14";
43############ r15 is program counter
44
45$Aoff=8*0;
46$Boff=8*1;
47$Coff=8*2;
48$Doff=8*3;
49$Eoff=8*4;
50$Foff=8*5;
51$Goff=8*6;
52$Hoff=8*7;
53$Xoff=8*8;
54
55sub BODY_00_15() {
56my $magic = shift;
57$code.=<<___;
58 ldr $t2,[sp,#$Hoff+0] @ h.lo
59 ldr $t3,[sp,#$Hoff+4] @ h.hi
60 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
61 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
62 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
63 mov $t0,$Elo,lsr#14
64 mov $t1,$Ehi,lsr#14
65 eor $t0,$t0,$Ehi,lsl#18
66 eor $t1,$t1,$Elo,lsl#18
67 eor $t0,$t0,$Elo,lsr#18
68 eor $t1,$t1,$Ehi,lsr#18
69 eor $t0,$t0,$Ehi,lsl#14
70 eor $t1,$t1,$Elo,lsl#14
71 eor $t0,$t0,$Ehi,lsr#9
72 eor $t1,$t1,$Elo,lsr#9
73 eor $t0,$t0,$Elo,lsl#23
74 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
75 adds $Tlo,$Tlo,$t0
76 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
77 adds $Tlo,$Tlo,$t2
78 adc $Thi,$Thi,$t3 @ T += h
79
80 ldr $t0,[sp,#$Foff+0] @ f.lo
81 ldr $t1,[sp,#$Foff+4] @ f.hi
82 ldr $t2,[sp,#$Goff+0] @ g.lo
83 ldr $t3,[sp,#$Goff+4] @ g.hi
84 str $Elo,[sp,#$Eoff+0]
85 str $Ehi,[sp,#$Eoff+4]
86 str $Alo,[sp,#$Aoff+0]
87 str $Ahi,[sp,#$Aoff+4]
88
89 eor $t0,$t0,$t2
90 eor $t1,$t1,$t3
91 and $t0,$t0,$Elo
92 and $t1,$t1,$Ehi
93 eor $t0,$t0,$t2
94 eor $t1,$t1,$t3 @ Ch(e,f,g)
95
96 ldr $t2,[$Ktbl,#4] @ K[i].lo
97 ldr $t3,[$Ktbl,#0] @ K[i].hi
98 ldr $Elo,[sp,#$Doff+0] @ d.lo
99 ldr $Ehi,[sp,#$Doff+4] @ d.hi
100
101 adds $Tlo,$Tlo,$t0
102 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
103 adds $Tlo,$Tlo,$t2
104 adc $Thi,$Thi,$t3 @ T += K[i]
105 adds $Elo,$Elo,$Tlo
106 adc $Ehi,$Ehi,$Thi @ d += T
107
108 and $t0,$t2,#0xff
109 teq $t0,#$magic
110 orreq $Ktbl,$Ktbl,#1
111
112 ldr $t2,[sp,#$Boff+0] @ b.lo
113 ldr $t3,[sp,#$Coff+0] @ c.lo
114 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
115 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
116 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
117 mov $t0,$Alo,lsr#28
118 mov $t1,$Ahi,lsr#28
119 eor $t0,$t0,$Ahi,lsl#4
120 eor $t1,$t1,$Alo,lsl#4
121 eor $t0,$t0,$Ahi,lsr#2
122 eor $t1,$t1,$Alo,lsr#2
123 eor $t0,$t0,$Alo,lsl#30
124 eor $t1,$t1,$Ahi,lsl#30
125 eor $t0,$t0,$Ahi,lsr#7
126 eor $t1,$t1,$Alo,lsr#7
127 eor $t0,$t0,$Alo,lsl#25
128 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
129 adds $Tlo,$Tlo,$t0
130 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
131
132 and $t0,$Alo,$t2
133 orr $Alo,$Alo,$t2
134 ldr $t1,[sp,#$Boff+4] @ b.hi
135 ldr $t2,[sp,#$Coff+4] @ c.hi
136 and $Alo,$Alo,$t3
137 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
138 and $t3,$Ahi,$t1
139 orr $Ahi,$Ahi,$t1
140 and $Ahi,$Ahi,$t2
141 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
142 adds $Alo,$Alo,$Tlo
143 adc $Ahi,$Ahi,$Thi @ h += T
144
145 sub sp,sp,#8
146 add $Ktbl,$Ktbl,#8
147___
148}
149$code=<<___;
150.text
151.code 32
152.type K512,%object
153.align 5
154K512:
155.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
156.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
157.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
158.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
159.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
160.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
161.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
162.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
163.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
164.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
165.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
166.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
167.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
168.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
169.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
170.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
171.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
172.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
173.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
174.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
175.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
176.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
177.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
178.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
179.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
180.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
181.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
182.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
183.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
184.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
185.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
186.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
187.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
188.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
189.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
190.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
191.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
192.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
193.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
194.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
195.size K512,.-K512
196
197.global sha512_block_data_order
198.type sha512_block_data_order,%function
199sha512_block_data_order:
200 sub r3,pc,#8 @ sha512_block_data_order
201 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
202 stmdb sp!,{r4-r12,lr}
203 sub $Ktbl,r3,#640 @ K512
204 sub sp,sp,#9*8
205
206 ldr $Elo,[$ctx,#$Eoff+$lo]
207 ldr $Ehi,[$ctx,#$Eoff+$hi]
208 ldr $t0, [$ctx,#$Goff+$lo]
209 ldr $t1, [$ctx,#$Goff+$hi]
210 ldr $t2, [$ctx,#$Hoff+$lo]
211 ldr $t3, [$ctx,#$Hoff+$hi]
212.Loop:
213 str $t0, [sp,#$Goff+0]
214 str $t1, [sp,#$Goff+4]
215 str $t2, [sp,#$Hoff+0]
216 str $t3, [sp,#$Hoff+4]
217 ldr $Alo,[$ctx,#$Aoff+$lo]
218 ldr $Ahi,[$ctx,#$Aoff+$hi]
219 ldr $Tlo,[$ctx,#$Boff+$lo]
220 ldr $Thi,[$ctx,#$Boff+$hi]
221 ldr $t0, [$ctx,#$Coff+$lo]
222 ldr $t1, [$ctx,#$Coff+$hi]
223 ldr $t2, [$ctx,#$Doff+$lo]
224 ldr $t3, [$ctx,#$Doff+$hi]
225 str $Tlo,[sp,#$Boff+0]
226 str $Thi,[sp,#$Boff+4]
227 str $t0, [sp,#$Coff+0]
228 str $t1, [sp,#$Coff+4]
229 str $t2, [sp,#$Doff+0]
230 str $t3, [sp,#$Doff+4]
231 ldr $Tlo,[$ctx,#$Foff+$lo]
232 ldr $Thi,[$ctx,#$Foff+$hi]
233 str $Tlo,[sp,#$Foff+0]
234 str $Thi,[sp,#$Foff+4]
235
236.L00_15:
237 ldrb $Tlo,[$inp,#7]
238 ldrb $t0, [$inp,#6]
239 ldrb $t1, [$inp,#5]
240 ldrb $t2, [$inp,#4]
241 ldrb $Thi,[$inp,#3]
242 ldrb $t3, [$inp,#2]
243 orr $Tlo,$Tlo,$t0,lsl#8
244 ldrb $t0, [$inp,#1]
245 orr $Tlo,$Tlo,$t1,lsl#16
246 ldrb $t1, [$inp],#8
247 orr $Tlo,$Tlo,$t2,lsl#24
248 orr $Thi,$Thi,$t3,lsl#8
249 orr $Thi,$Thi,$t0,lsl#16
250 orr $Thi,$Thi,$t1,lsl#24
251 str $Tlo,[sp,#$Xoff+0]
252 str $Thi,[sp,#$Xoff+4]
253___
254 &BODY_00_15(0x94);
255$code.=<<___;
256 tst $Ktbl,#1
257 beq .L00_15
258 bic $Ktbl,$Ktbl,#1
259
260.L16_79:
261 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
262 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
263 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
264 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
265
266 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
267 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
268 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
269 mov $Tlo,$t0,lsr#1
270 mov $Thi,$t1,lsr#1
271 eor $Tlo,$Tlo,$t1,lsl#31
272 eor $Thi,$Thi,$t0,lsl#31
273 eor $Tlo,$Tlo,$t0,lsr#8
274 eor $Thi,$Thi,$t1,lsr#8
275 eor $Tlo,$Tlo,$t1,lsl#24
276 eor $Thi,$Thi,$t0,lsl#24
277 eor $Tlo,$Tlo,$t0,lsr#7
278 eor $Thi,$Thi,$t1,lsr#7
279 eor $Tlo,$Tlo,$t1,lsl#25
280
281 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
282 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
283 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
284 mov $t0,$t2,lsr#19
285 mov $t1,$t3,lsr#19
286 eor $t0,$t0,$t3,lsl#13
287 eor $t1,$t1,$t2,lsl#13
288 eor $t0,$t0,$t3,lsr#29
289 eor $t1,$t1,$t2,lsr#29
290 eor $t0,$t0,$t2,lsl#3
291 eor $t1,$t1,$t3,lsl#3
292 eor $t0,$t0,$t2,lsr#6
293 eor $t1,$t1,$t3,lsr#6
294 eor $t0,$t0,$t3,lsl#26
295
296 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
297 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
298 adds $Tlo,$Tlo,$t0
299 adc $Thi,$Thi,$t1
300
301 ldr $t0,[sp,#`$Xoff+8*16`+0]
302 ldr $t1,[sp,#`$Xoff+8*16`+4]
303 adds $Tlo,$Tlo,$t2
304 adc $Thi,$Thi,$t3
305 adds $Tlo,$Tlo,$t0
306 adc $Thi,$Thi,$t1
307 str $Tlo,[sp,#$Xoff+0]
308 str $Thi,[sp,#$Xoff+4]
309___
310 &BODY_00_15(0x17);
311$code.=<<___;
312 tst $Ktbl,#1
313 beq .L16_79
314 bic $Ktbl,$Ktbl,#1
315
316 ldr $Tlo,[sp,#$Boff+0]
317 ldr $Thi,[sp,#$Boff+4]
318 ldr $t0, [$ctx,#$Aoff+$lo]
319 ldr $t1, [$ctx,#$Aoff+$hi]
320 ldr $t2, [$ctx,#$Boff+$lo]
321 ldr $t3, [$ctx,#$Boff+$hi]
322 adds $t0,$Alo,$t0
323 adc $t1,$Ahi,$t1
324 adds $t2,$Tlo,$t2
325 adc $t3,$Thi,$t3
326 str $t0, [$ctx,#$Aoff+$lo]
327 str $t1, [$ctx,#$Aoff+$hi]
328 str $t2, [$ctx,#$Boff+$lo]
329 str $t3, [$ctx,#$Boff+$hi]
330
331 ldr $Alo,[sp,#$Coff+0]
332 ldr $Ahi,[sp,#$Coff+4]
333 ldr $Tlo,[sp,#$Doff+0]
334 ldr $Thi,[sp,#$Doff+4]
335 ldr $t0, [$ctx,#$Coff+$lo]
336 ldr $t1, [$ctx,#$Coff+$hi]
337 ldr $t2, [$ctx,#$Doff+$lo]
338 ldr $t3, [$ctx,#$Doff+$hi]
339 adds $t0,$Alo,$t0
340 adc $t1,$Ahi,$t1
341 adds $t2,$Tlo,$t2
342 adc $t3,$Thi,$t3
343 str $t0, [$ctx,#$Coff+$lo]
344 str $t1, [$ctx,#$Coff+$hi]
345 str $t2, [$ctx,#$Doff+$lo]
346 str $t3, [$ctx,#$Doff+$hi]
347
348 ldr $Tlo,[sp,#$Foff+0]
349 ldr $Thi,[sp,#$Foff+4]
350 ldr $t0, [$ctx,#$Eoff+$lo]
351 ldr $t1, [$ctx,#$Eoff+$hi]
352 ldr $t2, [$ctx,#$Foff+$lo]
353 ldr $t3, [$ctx,#$Foff+$hi]
354 adds $Elo,$Elo,$t0
355 adc $Ehi,$Ehi,$t1
356 adds $t2,$Tlo,$t2
357 adc $t3,$Thi,$t3
358 str $Elo,[$ctx,#$Eoff+$lo]
359 str $Ehi,[$ctx,#$Eoff+$hi]
360 str $t2, [$ctx,#$Foff+$lo]
361 str $t3, [$ctx,#$Foff+$hi]
362
363 ldr $Alo,[sp,#$Goff+0]
364 ldr $Ahi,[sp,#$Goff+4]
365 ldr $Tlo,[sp,#$Hoff+0]
366 ldr $Thi,[sp,#$Hoff+4]
367 ldr $t0, [$ctx,#$Goff+$lo]
368 ldr $t1, [$ctx,#$Goff+$hi]
369 ldr $t2, [$ctx,#$Hoff+$lo]
370 ldr $t3, [$ctx,#$Hoff+$hi]
371 adds $t0,$Alo,$t0
372 adc $t1,$Ahi,$t1
373 adds $t2,$Tlo,$t2
374 adc $t3,$Thi,$t3
375 str $t0, [$ctx,#$Goff+$lo]
376 str $t1, [$ctx,#$Goff+$hi]
377 str $t2, [$ctx,#$Hoff+$lo]
378 str $t3, [$ctx,#$Hoff+$hi]
379
380 add sp,sp,#640
381 sub $Ktbl,$Ktbl,#640
382
383 teq $inp,$len
384 bne .Loop
385
386 add sp,sp,#8*9 @ destroy frame
387 ldmia sp!,{r4-r12,lr}
388 tst lr,#1
389 moveq pc,lr @ be binary compatible with V4, yet
390 bx lr @ interoperable with Thumb ISA:-)
391.size sha512_block_data_order,.-sha512_block_data_order
392.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
393.align 2
394___
395
396$code =~ s/\`([^\`]*)\`/eval $1/gem;
397$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
398print $code;
399close STDOUT; # enforce flush
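Since ARMv4 has no 64-bit registers, every 64-bit addition above is an adds/adc pair: adds produces the low word and sets the carry flag, and adc folds that carry into the high word. A plain-Perl model of one such paired addition:

# Sketch only: 64-bit addition on 32-bit halves, as done by adds/adc.
sub add64_split {
	my ($alo, $ahi, $blo, $bhi) = @_;
	my $sum   = $alo + $blo;		# adds: low words
	my $carry = $sum > 0xffffffff ? 1 : 0;	# the carry flag
	return ($sum & 0xffffffff,
		($ahi + $bhi + $carry) & 0xffffffff);	# adc: high words
}

Loop control is just as register-starved: BODY_00_15 compares the low byte of K[i] against a magic byte (0x94 for the first 16 rounds, 0x17 for the rest; the low bytes of 0xcf692694 and 0x4a475817) and, on match, sets bit 0 of $Ktbl as an exit flag for the tst/beq/bic sequence after each loop.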
diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
new file mode 100755
index 0000000000..768a6a6fad
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
@@ -0,0 +1,462 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input, except on page boundaries
11# (see below for details). Otherwise it is a straightforward implementation
12# with the X vector in the register bank. The module is big-endian [which is
13# not a big deal, as there are no little-endian targets left around].
14
15# sha256 | sha512
16# -m64 -m32 | -m64 -m32
17# --------------------------------------+-----------------------
18# PPC970,gcc-4.0.0 +50% +38% | +40% +410%(*)
19# Power6,xlc-7 +150% +90% | +100% +430%(*)
20#
21# (*) 64-bit code in 32-bit application context, which actually is
22# on the TODO list. It should be noted that for safe deployment in
23# a 32-bit *multi-threaded* context asynchronous signals should be
24# blocked upon entry to the SHA512 block routine. This is because the
25# 32-bit signaling procedure invalidates the upper halves of GPRs.
26# The context switch procedure preserves them, but signaling does not:-(
27
28# The second version is truly multi-thread safe. The trouble with the
29# original version was that it used the thread-local-storage pointer
30# register. It scrupulously preserved it, but the problem would arise the
31# moment an asynchronous signal was delivered and the signal handler
32# dereferenced the TLS pointer. While this never happens in the openssl
33# application or test suite, we have to respect the scenario and not
34# use the TLS pointer register. The alternative would be to require the
35# caller to block signals prior to calling this routine. For the record,
36# in 32-bit context R2 serves as the TLS pointer; in 64-bit context, R13.
37
38$flavour=shift;
39$output =shift;
40
41if ($flavour =~ /64/) {
42 $SIZE_T=8;
43 $STU="stdu";
44 $UCMP="cmpld";
45 $SHL="sldi";
46 $POP="ld";
47 $PUSH="std";
48} elsif ($flavour =~ /32/) {
49 $SIZE_T=4;
50 $STU="stwu";
51 $UCMP="cmplw";
52 $SHL="slwi";
53 $POP="lwz";
54 $PUSH="stw";
55} else { die "nonsense $flavour"; }
56
57$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
58( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
59( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
60die "can't locate ppc-xlate.pl";
61
62open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
63
64if ($output =~ /512/) {
65 $func="sha512_block_data_order";
66 $SZ=8;
67 @Sigma0=(28,34,39);
68 @Sigma1=(14,18,41);
69 @sigma0=(1, 8, 7);
70 @sigma1=(19,61, 6);
71 $rounds=80;
72 $LD="ld";
73 $ST="std";
74 $ROR="rotrdi";
75 $SHR="srdi";
76} else {
77 $func="sha256_block_data_order";
78 $SZ=4;
79 @Sigma0=( 2,13,22);
80 @Sigma1=( 6,11,25);
81 @sigma0=( 7,18, 3);
82 @sigma1=(17,19,10);
83 $rounds=64;
84 $LD="lwz";
85 $ST="stw";
86 $ROR="rotrwi";
87 $SHR="srwi";
88}
89
90$FRAME=32*$SIZE_T;
91
92$sp ="r1";
93$toc="r2";
94$ctx="r3"; # zapped by $a0
95$inp="r4"; # zapped by $a1
96$num="r5"; # zapped by $t0
97
98$T ="r0";
99$a0 ="r3";
100$a1 ="r4";
101$t0 ="r5";
102$t1 ="r6";
103$Tbl="r7";
104
105$A ="r8";
106$B ="r9";
107$C ="r10";
108$D ="r11";
109$E ="r12";
110$F ="r13"; $F="r2" if ($SIZE_T==8);# reassigned to exempt TLS pointer
111$G ="r14";
112$H ="r15";
113
114@V=($A,$B,$C,$D,$E,$F,$G,$H);
115@X=("r16","r17","r18","r19","r20","r21","r22","r23",
116 "r24","r25","r26","r27","r28","r29","r30","r31");
117
118$inp="r31"; # reassigned $inp! aliases with @X[15]
119
120sub ROUND_00_15 {
121my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
122$code.=<<___;
123 $LD $T,`$i*$SZ`($Tbl)
124 $ROR $a0,$e,$Sigma1[0]
125 $ROR $a1,$e,$Sigma1[1]
126 and $t0,$f,$e
127 andc $t1,$g,$e
128 add $T,$T,$h
129 xor $a0,$a0,$a1
130 $ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
131 or $t0,$t0,$t1 ; Ch(e,f,g)
132 add $T,$T,@X[$i]
133 xor $a0,$a0,$a1 ; Sigma1(e)
134 add $T,$T,$t0
135 add $T,$T,$a0
136
137 $ROR $a0,$a,$Sigma0[0]
138 $ROR $a1,$a,$Sigma0[1]
139 and $t0,$a,$b
140 and $t1,$a,$c
141 xor $a0,$a0,$a1
142 $ROR $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
143 xor $t0,$t0,$t1
144 and $t1,$b,$c
145 xor $a0,$a0,$a1 ; Sigma0(a)
146 add $d,$d,$T
147 xor $t0,$t0,$t1 ; Maj(a,b,c)
148 add $h,$T,$a0
149 add $h,$h,$t0
150
151___
152}
153
154sub ROUND_16_xx {
155my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
156$i-=16;
157$code.=<<___;
158 $ROR $a0,@X[($i+1)%16],$sigma0[0]
159 $ROR $a1,@X[($i+1)%16],$sigma0[1]
160 $ROR $t0,@X[($i+14)%16],$sigma1[0]
161 $ROR $t1,@X[($i+14)%16],$sigma1[1]
162 xor $a0,$a0,$a1
163 $SHR $a1,@X[($i+1)%16],$sigma0[2]
164 xor $t0,$t0,$t1
165 $SHR $t1,@X[($i+14)%16],$sigma1[2]
166 add @X[$i],@X[$i],@X[($i+9)%16]
167 xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f])
168 xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f])
169 add @X[$i],@X[$i],$a0
170 add @X[$i],@X[$i],$t0
171___
172&ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h);
173}
174
175$code=<<___;
176.machine "any"
177.text
178
179.globl $func
180.align 6
181$func:
182 mflr r0
183 $STU $sp,`-($FRAME+16*$SZ)`($sp)
184 $SHL $num,$num,`log(16*$SZ)/log(2)`
185
186 $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
187
188 $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
189 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
190 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
191 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
192 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
193 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
194 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
195 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
196 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
197 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
198 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
199 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
200 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
201 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
202 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
203 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
204 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
205 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
206 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
207 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
208 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
209
210 $LD $A,`0*$SZ`($ctx)
211 mr $inp,r4 ; incarnate $inp
212 $LD $B,`1*$SZ`($ctx)
213 $LD $C,`2*$SZ`($ctx)
214 $LD $D,`3*$SZ`($ctx)
215 $LD $E,`4*$SZ`($ctx)
216 $LD $F,`5*$SZ`($ctx)
217 $LD $G,`6*$SZ`($ctx)
218 $LD $H,`7*$SZ`($ctx)
219
220 b LPICmeup
221LPICedup:
222 andi. r0,$inp,3
223 bne Lunaligned
224Laligned:
225 add $num,$inp,$num
226 $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
227 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
228 bl Lsha2_block_private
229Ldone:
230 $POP r0,`$FRAME-$SIZE_T*21`($sp)
231 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
232 $POP r13,`$FRAME-$SIZE_T*19`($sp)
233 $POP r14,`$FRAME-$SIZE_T*18`($sp)
234 $POP r15,`$FRAME-$SIZE_T*17`($sp)
235 $POP r16,`$FRAME-$SIZE_T*16`($sp)
236 $POP r17,`$FRAME-$SIZE_T*15`($sp)
237 $POP r18,`$FRAME-$SIZE_T*14`($sp)
238 $POP r19,`$FRAME-$SIZE_T*13`($sp)
239 $POP r20,`$FRAME-$SIZE_T*12`($sp)
240 $POP r21,`$FRAME-$SIZE_T*11`($sp)
241 $POP r22,`$FRAME-$SIZE_T*10`($sp)
242 $POP r23,`$FRAME-$SIZE_T*9`($sp)
243 $POP r24,`$FRAME-$SIZE_T*8`($sp)
244 $POP r25,`$FRAME-$SIZE_T*7`($sp)
245 $POP r26,`$FRAME-$SIZE_T*6`($sp)
246 $POP r27,`$FRAME-$SIZE_T*5`($sp)
247 $POP r28,`$FRAME-$SIZE_T*4`($sp)
248 $POP r29,`$FRAME-$SIZE_T*3`($sp)
249 $POP r30,`$FRAME-$SIZE_T*2`($sp)
250 $POP r31,`$FRAME-$SIZE_T*1`($sp)
251 mtlr r0
252 addi $sp,$sp,`$FRAME+16*$SZ`
253 blr
254___
255
256# The PowerPC specification allows an implementation to be ill-behaved
257# upon an unaligned access which crosses a page boundary. The "better
258# safe than sorry" principle makes me treat it specially. But I don't
259# look for the particular offending word; rather, I look for the input
260# block which crosses the boundary. Once found, that block is copied to
261# an aligned spot and hashed separately...
262$code.=<<___;
263.align 4
264Lunaligned:
265 subfic $t1,$inp,4096
266 andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
267 beq Lcross_page
268 $UCMP $num,$t1
269 ble- Laligned ; didn't cross the page boundary
270 subfc $num,$t1,$num
271 add $t1,$inp,$t1
272 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
273 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
274 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
275 bl Lsha2_block_private
276 ; $inp equals to the intermediate end pointer here
277 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
278Lcross_page:
279 li $t1,`16*$SZ/4`
280 mtctr $t1
281 addi r20,$sp,$FRAME ; aligned spot below the frame
282Lmemcpy:
283 lbz r16,0($inp)
284 lbz r17,1($inp)
285 lbz r18,2($inp)
286 lbz r19,3($inp)
287 addi $inp,$inp,4
288 stb r16,0(r20)
289 stb r17,1(r20)
290 stb r18,2(r20)
291 stb r19,3(r20)
292 addi r20,r20,4
293 bdnz Lmemcpy
294
295 $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
296 addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer
297 addi $inp,$sp,$FRAME ; fictitious inp pointer
298 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
299 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
300 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
301 bl Lsha2_block_private
302 $POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
303 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
304 addic. $num,$num,`-16*$SZ` ; num--
305 bne- Lunaligned
306 b Ldone
307___
308
309$code.=<<___;
310.align 4
311Lsha2_block_private:
312___
313for($i=0;$i<16;$i++) {
314$code.=<<___ if ($SZ==4);
315 lwz @X[$i],`$i*$SZ`($inp)
316___
317# 64-bit loads are split to 2x32-bit ones, as CPU can't handle
318# unaligned 64-bit loads, only 32-bit ones...
319$code.=<<___ if ($SZ==8);
320 lwz $t0,`$i*$SZ`($inp)
321 lwz @X[$i],`$i*$SZ+4`($inp)
322 insrdi @X[$i],$t0,32,0
323___
324 &ROUND_00_15($i,@V);
325 unshift(@V,pop(@V));
326}
327$code.=<<___;
328 li $T,`$rounds/16-1`
329 mtctr $T
330.align 4
331Lrounds:
332 addi $Tbl,$Tbl,`16*$SZ`
333___
334for(;$i<32;$i++) {
335 &ROUND_16_xx($i,@V);
336 unshift(@V,pop(@V));
337}
338$code.=<<___;
339 bdnz- Lrounds
340
341 $POP $ctx,`$FRAME-$SIZE_T*22`($sp)
342 $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
343 $POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
344 subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
345
346 $LD r16,`0*$SZ`($ctx)
347 $LD r17,`1*$SZ`($ctx)
348 $LD r18,`2*$SZ`($ctx)
349 $LD r19,`3*$SZ`($ctx)
350 $LD r20,`4*$SZ`($ctx)
351 $LD r21,`5*$SZ`($ctx)
352 $LD r22,`6*$SZ`($ctx)
353 addi $inp,$inp,`16*$SZ` ; advance inp
354 $LD r23,`7*$SZ`($ctx)
355 add $A,$A,r16
356 add $B,$B,r17
357 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
358 add $C,$C,r18
359 $ST $A,`0*$SZ`($ctx)
360 add $D,$D,r19
361 $ST $B,`1*$SZ`($ctx)
362 add $E,$E,r20
363 $ST $C,`2*$SZ`($ctx)
364 add $F,$F,r21
365 $ST $D,`3*$SZ`($ctx)
366 add $G,$G,r22
367 $ST $E,`4*$SZ`($ctx)
368 add $H,$H,r23
369 $ST $F,`5*$SZ`($ctx)
370 $ST $G,`6*$SZ`($ctx)
371 $UCMP $inp,$num
372 $ST $H,`7*$SZ`($ctx)
373 bne Lsha2_block_private
374 blr
375___
376
377# Ugly hack here, because PPC assembler syntax seems to vary too
378# much from platform to platform...
379$code.=<<___;
380.align 6
381LPICmeup:
382 bl LPIC
383 addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop
384 b LPICedup
385 nop
386 nop
387 nop
388 nop
389 nop
390LPIC: mflr $Tbl
391 blr
392 nop
393 nop
394 nop
395 nop
396 nop
397 nop
398___
399$code.=<<___ if ($SZ==8);
400 .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
401 .long 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
402 .long 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
403 .long 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
404 .long 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
405 .long 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
406 .long 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
407 .long 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
408 .long 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
409 .long 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
410 .long 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
411 .long 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
412 .long 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
413 .long 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
414 .long 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
415 .long 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
416 .long 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
417 .long 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
418 .long 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
419 .long 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
420 .long 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
421 .long 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
422 .long 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
423 .long 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
424 .long 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
425 .long 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
426 .long 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
427 .long 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
428 .long 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
429 .long 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
430 .long 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
431 .long 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
432 .long 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
433 .long 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
434 .long 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
435 .long 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
436 .long 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
437 .long 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
438 .long 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
439 .long 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
440___
441$code.=<<___ if ($SZ==4);
442 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
443 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
444 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
445 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
446 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
447 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
448 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
449 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
450 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
451 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
452 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
453 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
454 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
455 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
456 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
457 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
458___
459
460$code =~ s/\`([^\`]*)\`/eval $1/gem;
461print $code;
462close STDOUT;
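
Like the other CRYPTOGAMS modules in this batch, the PPC module keeps every constant expression in backticks while $code is assembled, then folds them all in the final substitution just above. A minimal standalone sketch of that idiom (the instruction and registers below are illustrative only, not taken from the module):

    #!/usr/bin/env perl
    # Sketch of the backtick-eval idiom: offsets stay symbolic inside `...`
    # while the assembler text is built, then one pass folds them to numbers.
    my ($i,$SZ) = (3,8);                 # e.g. word 3 of a SHA512 block
    my $code = <<___;
    	lwz r0,`$i*$SZ`(r4)
    ___
    $code =~ s/\`([^\`]*)\`/eval $1/gem; # same pass as the modules use
    print $code;                         # emits "	lwz r0,24(r4)"
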
diff --git a/src/lib/libcrypto/sha/asm/sha512-s390x.pl b/src/lib/libcrypto/sha/asm/sha512-s390x.pl
new file mode 100644
index 0000000000..e7ef2d5a9f
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-s390x.pl
@@ -0,0 +1,301 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedures for s390x.
11
12# April 2007.
13#
14# sha256_block_data_order is reportedly >3 times faster than gcc 3.3
15# generated code (must be a bug in the compiler, as the improvement is
16# "pathologically" high, in particular in comparison to other SHA
17# modules). But the real twist is that it detects whether hardware support
18# for SHA256 is available and, if so, utilizes it. Performance can then
19# reach >6.5x that of the assembler version for larger chunks.
20#
21# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
22
23# January 2009.
24#
25# Add support for hardware SHA512 and reschedule instructions to
26# favour the dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
27# than software.
28
29$t0="%r0";
30$t1="%r1";
31$ctx="%r2"; $t2="%r2";
32$inp="%r3";
33$len="%r4"; # used as index in inner loop
34
35$A="%r5";
36$B="%r6";
37$C="%r7";
38$D="%r8";
39$E="%r9";
40$F="%r10";
41$G="%r11";
42$H="%r12"; @V=($A,$B,$C,$D,$E,$F,$G,$H);
43$tbl="%r13";
44$T1="%r14";
45$sp="%r15";
46
47$output=shift;
48open STDOUT,">$output";
49
50if ($output =~ /512/) {
51 $label="512";
52 $SZ=8;
53 $LD="lg"; # load from memory
54 $ST="stg"; # store to memory
55 $ADD="alg"; # add with memory operand
56 $ROT="rllg"; # rotate left
57	$SHR="srlg";	# logical right shift [see the fix-up at the end]
58 @Sigma0=(25,30,36);
59 @Sigma1=(23,46,50);
60 @sigma0=(56,63, 7);
61 @sigma1=( 3,45, 6);
62 $rounds=80;
63 $kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
64} else {
65 $label="256";
66 $SZ=4;
67 $LD="llgf"; # load from memory
68 $ST="st"; # store to memory
69 $ADD="al"; # add with memory operand
70 $ROT="rll"; # rotate left
71 $SHR="srl"; # logical right shift
72 @Sigma0=(10,19,30);
73 @Sigma1=( 7,21,26);
74 @sigma0=(14,25, 3);
75 @sigma1=(13,15,10);
76 $rounds=64;
77 $kimdfunc=2; # magic function code for kimd instruction
78}
79$Func="sha${label}_block_data_order";
80$Table="K${label}";
81$frame=160+16*$SZ;
82
83sub BODY_00_15 {
84my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
85
86$code.=<<___ if ($i<16);
87 $LD $T1,`$i*$SZ`($inp) ### $i
88___
89$code.=<<___;
90 $ROT $t0,$e,$Sigma1[0]
91 $ROT $t1,$e,$Sigma1[1]
92 lgr $t2,$f
93 xgr $t0,$t1
94 $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
95 xgr $t2,$g
96 $ST $T1,`160+$SZ*($i%16)`($sp)
97 xgr $t0,$t1 # Sigma1(e)
98 la $T1,0($T1,$h) # T1+=h
99 ngr $t2,$e
100 lgr $t1,$a
101 algr $T1,$t0 # T1+=Sigma1(e)
102 $ROT $h,$a,$Sigma0[0]
103 xgr $t2,$g # Ch(e,f,g)
104 $ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
105 $ROT $t0,$a,$Sigma0[1]
106 algr $T1,$t2 # T1+=Ch(e,f,g)
107 ogr $t1,$b
108 xgr $h,$t0
109 lgr $t2,$a
110 ngr $t1,$c
111 $ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
112 xgr $h,$t0 # h=Sigma0(a)
113 ngr $t2,$b
114 algr $h,$T1 # h+=T1
115 ogr $t2,$t1 # Maj(a,b,c)
116 la $d,0($d,$T1) # d+=T1
117 algr $h,$t2 # h+=Maj(a,b,c)
118___
119}
120
121sub BODY_16_XX {
122my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
123
124$code.=<<___;
125 $LD $T1,`160+$SZ*(($i+1)%16)`($sp) ### $i
126 $LD $t1,`160+$SZ*(($i+14)%16)`($sp)
127 $ROT $t0,$T1,$sigma0[0]
128 $SHR $T1,$sigma0[2]
129 $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
130 xgr $T1,$t0
131 $ROT $t0,$t1,$sigma1[0]
132 xgr $T1,$t2 # sigma0(X[i+1])
133 $SHR $t1,$sigma1[2]
134 $ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
135 xgr $t1,$t0
136 $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
137 $ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
138 xgr $t1,$t0 # sigma1(X[i+14])
139 algr $T1,$t1 # +=sigma1(X[i+14])
140___
141 &BODY_00_15(@_);
142}
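
BODY_16_XX above is the standard FIPS 180-2 message-schedule recurrence, W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16], kept in a 16-entry circular buffer on the stack; the (i+1), (i+9) and (i+14) offsets are the mod-16 images of i-15, i-7 and i-2. A plain Perl rendering of the 32-bit case follows (the schedule words are dummy samples; note the module's @sigma arrays hold left-rotate amounts, i.e. word size minus the usual right rotations, since rll/rllg rotate left):

    #!/usr/bin/env perl
    # The recurrence BODY_16_XX computes in registers, written out plainly
    # for the 32-bit case; rotations are the FIPS 180-2 sigma values.
    sub ror32  { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }
    sub sigma0 { my $x = shift; ror32($x, 7) ^ ror32($x,18) ^ ($x >>  3) }
    sub sigma1 { my $x = shift; ror32($x,17) ^ ror32($x,19) ^ ($x >> 10) }
    my @W = map { $_ * 0x01010101 } (1..16);   # dummy first 16 words
    for my $i (16..19) {
        $W[$i%16] = ($W[$i%16] + sigma0($W[($i+1)%16])
                     + $W[($i+9)%16] + sigma1($W[($i+14)%16])) & 0xffffffff;
        printf "W[%2d] = %08x\n", $i, $W[$i%16];
    }
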
143
144$code.=<<___;
145.text
146.align 64
147.type $Table,\@object
148$Table:
149___
150$code.=<<___ if ($SZ==4);
151 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
152 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
153 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
154 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
155 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
156 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
157 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
158 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
159 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
160 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
161 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
162 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
163 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
164 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
165 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
166 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
167___
168$code.=<<___ if ($SZ==8);
169 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
170 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
171 .quad 0x3956c25bf348b538,0x59f111f1b605d019
172 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
173 .quad 0xd807aa98a3030242,0x12835b0145706fbe
174 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
175 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
176 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
177 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
178 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
179 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
180 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
181 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
182 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
183 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
184 .quad 0x06ca6351e003826f,0x142929670a0e6e70
185 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
186 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
187 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
188 .quad 0x81c2c92e47edaee6,0x92722c851482353b
189 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
190 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
191 .quad 0xd192e819d6ef5218,0xd69906245565a910
192 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
193 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
194 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
195 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
196 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
197 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
198 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
199 .quad 0x90befffa23631e28,0xa4506cebde82bde9
200 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
201 .quad 0xca273eceea26619c,0xd186b8c721c0c207
202 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
203 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
204 .quad 0x113f9804bef90dae,0x1b710b35131c471b
205 .quad 0x28db77f523047d84,0x32caab7b40c72493
206 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
207 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
208 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
209___
210$code.=<<___;
211.size $Table,.-$Table
212.globl $Func
213.type $Func,\@function
214$Func:
215___
216$code.=<<___ if ($kimdfunc);
217 larl %r1,OPENSSL_s390xcap_P
218 lg %r0,0(%r1)
219 tmhl %r0,0x4000 # check for message-security assist
220 jz .Lsoftware
221 lghi %r0,0
222 la %r1,16($sp)
223 .long 0xb93e0002 # kimd %r0,%r2
224 lg %r0,16($sp)
225 tmhh %r0,`0x8000>>$kimdfunc`
226 jz .Lsoftware
227 lghi %r0,$kimdfunc
228 lgr %r1,$ctx
229 lgr %r2,$inp
230 sllg %r3,$len,`log(16*$SZ)/log(2)`
231 .long 0xb93e0002 # kimd %r0,%r2
232 brc 1,.-4 # pay attention to "partial completion"
233 br %r14
234.align 16
235.Lsoftware:
236___
237$code.=<<___;
238 sllg $len,$len,`log(16*$SZ)/log(2)`
239 lghi %r1,-$frame
240 agr $len,$inp
241 stmg $ctx,%r15,16($sp)
242 lgr %r0,$sp
243 la $sp,0(%r1,$sp)
244 stg %r0,0($sp)
245
246 larl $tbl,$Table
247 $LD $A,`0*$SZ`($ctx)
248 $LD $B,`1*$SZ`($ctx)
249 $LD $C,`2*$SZ`($ctx)
250 $LD $D,`3*$SZ`($ctx)
251 $LD $E,`4*$SZ`($ctx)
252 $LD $F,`5*$SZ`($ctx)
253 $LD $G,`6*$SZ`($ctx)
254 $LD $H,`7*$SZ`($ctx)
255
256.Lloop:
257 lghi $len,0
258___
259for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
260$code.=".Lrounds_16_xx:\n";
261for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
262$code.=<<___;
263 aghi $len,`16*$SZ`
264 lghi $t0,`($rounds-16)*$SZ`
265 clgr $len,$t0
266 jne .Lrounds_16_xx
267
268 lg $ctx,`$frame+16`($sp)
269 la $inp,`16*$SZ`($inp)
270 $ADD $A,`0*$SZ`($ctx)
271 $ADD $B,`1*$SZ`($ctx)
272 $ADD $C,`2*$SZ`($ctx)
273 $ADD $D,`3*$SZ`($ctx)
274 $ADD $E,`4*$SZ`($ctx)
275 $ADD $F,`5*$SZ`($ctx)
276 $ADD $G,`6*$SZ`($ctx)
277 $ADD $H,`7*$SZ`($ctx)
278 $ST $A,`0*$SZ`($ctx)
279 $ST $B,`1*$SZ`($ctx)
280 $ST $C,`2*$SZ`($ctx)
281 $ST $D,`3*$SZ`($ctx)
282 $ST $E,`4*$SZ`($ctx)
283 $ST $F,`5*$SZ`($ctx)
284 $ST $G,`6*$SZ`($ctx)
285 $ST $H,`7*$SZ`($ctx)
286 clg $inp,`$frame+32`($sp)
287 jne .Lloop
288
289 lmg %r6,%r15,`$frame+48`($sp)
290 br %r14
291.size $Func,.-$Func
292.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
293.comm OPENSSL_s390xcap_P,8,8
294___
295
296$code =~ s/\`([^\`]*)\`/eval $1/gem;
297# unlike the 32-bit shift, the 64-bit one takes three arguments
298$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
299
300print $code;
301close STDOUT;
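
The substitution just before the print rewrites every two-operand srlg into the three-operand form the assembler expects, duplicating the register as both destination and source. Standalone, the rewrite looks like this (the sample instruction is illustrative):

    #!/usr/bin/env perl
    # The srlg fix-up from sha512-s390x.pl: the 64-bit shift takes three
    # operands, so the register is doubled up as destination and source.
    my $code = "	srlg	%r1,7\n";            # two-operand form, as generated
    $code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
    print $code;                             # emits "	srlg	%r1,%r1,7"
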
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
new file mode 100644
index 0000000000..54241aab50
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
@@ -0,0 +1,593 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 performance improvement over compiler generated code varies
11# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
12# build]. Just like in the SHA1 module, I aim to ensure scalability on
13# UltraSPARC T1 by packing X[16] into 8 64-bit registers.
14
15# SHA512 on pre-T1 UltraSPARC.
16#
17# Performance is >75% better than 64-bit code generated by Sun C and
18# over 2x better than 32-bit code. X[16] resides on the stack, but
19# access to it is scheduled for L2 latency and staged through the 32
20# least significant bits of %l0-%l7. The latter is done to achieve 32-/64-bit
21# ABI duality. Nevertheless it's ~40% faster than SHA256, which is pretty
22# good [optimal coefficient is 50%].
23#
24# SHA512 on UltraSPARC T1.
25#
26# It's not any faster than 64-bit code generated by Sun C 5.8. This is
27# because the 64-bit code generator has the advantage of using 64-bit
28# loads(*) to access X[16], which I consciously traded for 32-/64-bit
29# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
30# code by 60%, not to mention that it doesn't suffer from severe decay
31# when running 4x as many threads as physical cores, and that it leaves
32# gcc [3.4] behind by over a 4x factor! Compared to SHA256, single-thread
33# performance is only 10% better, but overall throughput at the maximum
34# number of threads for a given CPU exceeds that of SHA256
35# by 30% [again, optimal coefficient is 50%].
36#
37# (*)	Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
38#	in-order, i.e. a load instruction has to complete before the next
39#	instruction in the given thread is executed, even if the latter is
40#	not dependent on the load result! This means that on T1 two 32-bit
41# loads are always slower than one 64-bit load. Once again this
42# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
43# 2x32-bit loads can be as fast as 1x64-bit ones.
44
45$bits=32;
46for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
47if ($bits==64) { $bias=2047; $frame=192; }
48else { $bias=0; $frame=112; }
49
50$output=shift;
51open STDOUT,">$output";
52
53if ($output =~ /512/) {
54 $label="512";
55 $SZ=8;
56 $LD="ldx"; # load from memory
57 $ST="stx"; # store to memory
58 $SLL="sllx"; # shift left logical
59 $SRL="srlx"; # shift right logical
60 @Sigma0=(28,34,39);
61 @Sigma1=(14,18,41);
62 @sigma0=( 7, 1, 8); # right shift first
63 @sigma1=( 6,19,61); # right shift first
64 $lastK=0x817;
65 $rounds=80;
66 $align=4;
67
68 $locals=16*$SZ; # X[16]
69
70 $A="%o0";
71 $B="%o1";
72 $C="%o2";
73 $D="%o3";
74 $E="%o4";
75 $F="%o5";
76 $G="%g1";
77 $H="%o7";
78 @V=($A,$B,$C,$D,$E,$F,$G,$H);
79} else {
80 $label="256";
81 $SZ=4;
82 $LD="ld"; # load from memory
83 $ST="st"; # store to memory
84 $SLL="sll"; # shift left logical
85 $SRL="srl"; # shift right logical
86 @Sigma0=( 2,13,22);
87 @Sigma1=( 6,11,25);
88 @sigma0=( 3, 7,18); # right shift first
89 @sigma1=(10,17,19); # right shift first
90 $lastK=0x8f2;
91 $rounds=64;
92 $align=8;
93
94 $locals=0; # X[16] is register resident
95 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
96
97 $A="%l0";
98 $B="%l1";
99 $C="%l2";
100 $D="%l3";
101 $E="%l4";
102 $F="%l5";
103 $G="%l6";
104 $H="%l7";
105 @V=($A,$B,$C,$D,$E,$F,$G,$H);
106}
107$T1="%g2";
108$tmp0="%g3";
109$tmp1="%g4";
110$tmp2="%g5";
111
112$ctx="%i0";
113$inp="%i1";
114$len="%i2";
115$Ktbl="%i3";
116$tmp31="%i4";
117$tmp32="%i5";
118
119########### SHA256
120$Xload = sub {
121my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
122
123 if ($i==0) {
124$code.=<<___;
125 ldx [$inp+0],@X[0]
126 ldx [$inp+16],@X[2]
127 ldx [$inp+32],@X[4]
128 ldx [$inp+48],@X[6]
129 ldx [$inp+8],@X[1]
130 ldx [$inp+24],@X[3]
131 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
132 ldx [$inp+40],@X[5]
133 bz,pt %icc,.Laligned
134 ldx [$inp+56],@X[7]
135
136 sllx @X[0],$tmp31,@X[0]
137 ldx [$inp+64],$T1
138___
139for($j=0;$j<7;$j++)
140{ $code.=<<___;
141 srlx @X[$j+1],$tmp32,$tmp1
142 sllx @X[$j+1],$tmp31,@X[$j+1]
143 or $tmp1,@X[$j],@X[$j]
144___
145}
146$code.=<<___;
147 srlx $T1,$tmp32,$T1
148 or $T1,@X[7],@X[7]
149.Laligned:
150___
151 }
152
153 if ($i&1) {
154 $code.="\tadd @X[$i/2],$h,$T1\n";
155 } else {
156 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
157 }
158} if ($SZ==4);
159
160########### SHA512
161$Xload = sub {
162my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
163my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
164
165$code.=<<___ if ($i==0);
166 ld [$inp+0],%l0
167 ld [$inp+4],%l1
168 ld [$inp+8],%l2
169 ld [$inp+12],%l3
170 ld [$inp+16],%l4
171 ld [$inp+20],%l5
172 ld [$inp+24],%l6
173 ld [$inp+28],%l7
174___
175$code.=<<___ if ($i<15);
176 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
177 add $tmp31,32,$tmp0
178 sllx @pair[0],$tmp0,$tmp1
179 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
180 srlx @pair[2],$tmp32,@pair[1]
181 or $tmp1,$tmp2,$tmp2
182 or @pair[1],$tmp2,$tmp2
183 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
184 add $h,$tmp2,$T1
185 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
186___
187$code.=<<___ if ($i==12);
188 brnz,a $tmp31,.+8
189 ld [$inp+128],%l0
190___
191$code.=<<___ if ($i==15);
192 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
193 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
194 add $tmp31,32,$tmp0
195 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
196 sllx @pair[0],$tmp0,$tmp1
197 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
198 srlx @pair[2],$tmp32,@pair[1]
199 or $tmp1,$tmp2,$tmp2
200 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
201 or @pair[1],$tmp2,$tmp2
202 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
203 add $h,$tmp2,$T1
204 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
205 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
206 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
207 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
208___
209} if ($SZ==8);
210
211########### common
212sub BODY_00_15 {
213my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
214
215 if ($i<16) {
216 &$Xload(@_);
217 } else {
218 $code.="\tadd $h,$T1,$T1\n";
219 }
220
221$code.=<<___;
222 $SRL $e,@Sigma1[0],$h !! $i
223 xor $f,$g,$tmp2
224 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
225 and $e,$tmp2,$tmp2
226 $SRL $e,@Sigma1[1],$tmp0
227 xor $tmp1,$h,$h
228 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
229 xor $tmp0,$h,$h
230 $SRL $e,@Sigma1[2],$tmp0
231 xor $tmp1,$h,$h
232 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
233 xor $tmp0,$h,$h
234 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
235 xor $tmp1,$h,$tmp0 ! Sigma1(e)
236
237 $SRL $a,@Sigma0[0],$h
238 add $tmp2,$T1,$T1
239 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
240 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
241 add $tmp0,$T1,$T1
242 $SRL $a,@Sigma0[1],$tmp0
243 xor $tmp1,$h,$h
244 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
245 xor $tmp0,$h,$h
246 $SRL $a,@Sigma0[2],$tmp0
247 xor $tmp1,$h,$h
248 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
249 xor $tmp0,$h,$h
250 xor $tmp1,$h,$h ! Sigma0(a)
251
252 or $a,$b,$tmp0
253 and $a,$b,$tmp1
254 and $c,$tmp0,$tmp0
255 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
256 add $tmp2,$T1,$T1 ! +=K[$i]
257 add $tmp1,$h,$h
258
259 add $T1,$d,$d
260 add $T1,$h,$h
261___
262}
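
The round body above avoids a third temporary by using two standard identities: Maj(a,b,c) = (a & b) | (c & (a | b)) — the or/and/and/or sequence — and Ch(e,f,g) = ((f ^ g) & e) ^ g, which costs two XORs and one AND. A quick Perl check that both agree with the textbook definitions:

    #!/usr/bin/env perl
    # Verifies the Maj and Ch identities the round body relies on, over all
    # single-bit inputs (bitwise ops make this carry over to full words).
    for my $x (0,1) { for my $y (0,1) { for my $z (0,1) {
        my $maj = ($x & $y) ^ ($x & $z) ^ ($y & $z);         # textbook Maj
        die "Maj mismatch" if $maj != (($x & $y) | ($z & ($x | $y)));
        my $ch  = (($x & $y) ^ (~$x & $z)) & 1;              # textbook Ch
        die "Ch mismatch"  if $ch  != ((($y ^ $z) & $x) ^ $z);
    }}}
    print "Maj and Ch identities hold\n";
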
263
264########### SHA256
265$BODY_16_XX = sub {
266my $i=@_[0];
267my $xi;
268
269 if ($i&1) {
270 $xi=$tmp32;
271 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
272 } else {
273 $xi=@X[(($i+1)/2)%8];
274 }
275$code.=<<___;
276 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
277 sll $xi,`32-@sigma0[2]`,$tmp1
278 srl $xi,@sigma0[1],$tmp0
279 xor $tmp1,$T1,$T1
280 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
281 xor $tmp0,$T1,$T1
282 srl $xi,@sigma0[2],$tmp0
283 xor $tmp1,$T1,$T1
284___
285 if ($i&1) {
286 $xi=@X[(($i+14)/2)%8];
287 } else {
288 $xi=$tmp32;
289 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
290 }
291$code.=<<___;
292 srl $xi,@sigma1[0],$tmp2
293 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
294 sll $xi,`32-@sigma1[2]`,$tmp1
295 srl $xi,@sigma1[1],$tmp0
296 xor $tmp1,$tmp2,$tmp2
297 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
298 xor $tmp0,$tmp2,$tmp2
299 srl $xi,@sigma1[2],$tmp0
300 xor $tmp1,$tmp2,$tmp2
301___
302 if ($i&1) {
303 $xi=@X[($i/2)%8];
304$code.=<<___;
305 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
306 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
307 srl @X[($i/2)%8],0,$tmp0
308 add $xi,$T1,$T1 ! +=X[i]
309 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
310 add $tmp2,$T1,$T1
311 add $tmp1,$T1,$T1
312
313 srl $T1,0,$T1
314 or $T1,@X[($i/2)%8],@X[($i/2)%8]
315___
316 } else {
317 $xi=@X[(($i+9)/2)%8];
318$code.=<<___;
319 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
320 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
321 srl @X[($i/2)%8],0,@X[($i/2)%8]
322 add $xi,$T1,$T1 ! +=X[i+9]
323 add $tmp2,$T1,$T1
324 add $tmp1,$T1,$T1
325
326 sllx $T1,32,$tmp0
327 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
328___
329 }
330 &BODY_00_15(@_);
331} if ($SZ==4);
332
333########### SHA512
334$BODY_16_XX = sub {
335my $i=@_[0];
336my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
337
338$code.=<<___;
339 sllx %l2,32,$tmp0 !! Xupdate($i)
340 or %l3,$tmp0,$tmp0
341
342 srlx $tmp0,@sigma0[0],$T1
343 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
344 sllx $tmp0,`64-@sigma0[2]`,$tmp1
345 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
346 srlx $tmp0,@sigma0[1],$tmp0
347 xor $tmp1,$T1,$T1
348 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
349 xor $tmp0,$T1,$T1
350 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
351 xor $tmp1,$T1,$T1
352 sllx %l6,32,$tmp2
353 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
354 or %l7,$tmp2,$tmp2
355
356 srlx $tmp2,@sigma1[0],$tmp1
357 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
358 sllx $tmp2,`64-@sigma1[2]`,$tmp0
359 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
360 srlx $tmp2,@sigma1[1],$tmp2
361 xor $tmp0,$tmp1,$tmp1
362 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
363 xor $tmp2,$tmp1,$tmp1
364 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
365 xor $tmp0,$tmp1,$tmp1
366 sllx %l4,32,$tmp0
367 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
368 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
369 or %l5,$tmp0,$tmp0
370 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
371
372 sllx %l0,32,$tmp2
373 add $tmp1,$T1,$T1
374 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
375 or %l1,$tmp2,$tmp2
376 add $tmp0,$T1,$T1 ! +=X[$i+9]
377 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
378 add $tmp2,$T1,$T1 ! +=X[$i]
379 $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
380___
381 &BODY_00_15(@_);
382} if ($SZ==8);
383
384$code.=<<___ if ($bits==64);
385.register %g2,#scratch
386.register %g3,#scratch
387___
388$code.=<<___;
389.section ".text",#alloc,#execinstr
390
391.align 64
392K${label}:
393.type K${label},#object
394___
395if ($SZ==4) {
396$code.=<<___;
397 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
398 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
399 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
400 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
401 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
402 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
403 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
404 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
405 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
406 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
407 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
408 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
409 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
410 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
411 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
412 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
413___
414} else {
415$code.=<<___;
416 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
417 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
418 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
419 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
420 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
421 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
422 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
423 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
424 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
425 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
426 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
427 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
428 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
429 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
430 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
431 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
432 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
433 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
434 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
435 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
436 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
437 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
438 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
439 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
440 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
441 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
442 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
443 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
444 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
445 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
446 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
447 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
448 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
449 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
450 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
451 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
452 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
453 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
454 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
455 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
456___
457}
458$code.=<<___;
459.size K${label},.-K${label}
460.globl sha${label}_block_data_order
461sha${label}_block_data_order:
462 save %sp,`-$frame-$locals`,%sp
463 and $inp,`$align-1`,$tmp31
464 sllx $len,`log(16*$SZ)/log(2)`,$len
465 andn $inp,`$align-1`,$inp
466 sll $tmp31,3,$tmp31
467 add $inp,$len,$len
468___
469$code.=<<___ if ($SZ==8); # SHA512
470 mov 32,$tmp32
471 sub $tmp32,$tmp31,$tmp32
472___
473$code.=<<___;
474.Lpic: call .+8
475 add %o7,K${label}-.Lpic,$Ktbl
476
477 $LD [$ctx+`0*$SZ`],$A
478 $LD [$ctx+`1*$SZ`],$B
479 $LD [$ctx+`2*$SZ`],$C
480 $LD [$ctx+`3*$SZ`],$D
481 $LD [$ctx+`4*$SZ`],$E
482 $LD [$ctx+`5*$SZ`],$F
483 $LD [$ctx+`6*$SZ`],$G
484 $LD [$ctx+`7*$SZ`],$H
485
486.Lloop:
487___
488for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
489$code.=".L16_xx:\n";
490for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
491$code.=<<___;
492 and $tmp2,0xfff,$tmp2
493 cmp $tmp2,$lastK
494 bne .L16_xx
495 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
496
497___
498$code.=<<___ if ($SZ==4); # SHA256
499 $LD [$ctx+`0*$SZ`],@X[0]
500 $LD [$ctx+`1*$SZ`],@X[1]
501 $LD [$ctx+`2*$SZ`],@X[2]
502 $LD [$ctx+`3*$SZ`],@X[3]
503 $LD [$ctx+`4*$SZ`],@X[4]
504 $LD [$ctx+`5*$SZ`],@X[5]
505 $LD [$ctx+`6*$SZ`],@X[6]
506 $LD [$ctx+`7*$SZ`],@X[7]
507
508 add $A,@X[0],$A
509 $ST $A,[$ctx+`0*$SZ`]
510 add $B,@X[1],$B
511 $ST $B,[$ctx+`1*$SZ`]
512 add $C,@X[2],$C
513 $ST $C,[$ctx+`2*$SZ`]
514 add $D,@X[3],$D
515 $ST $D,[$ctx+`3*$SZ`]
516 add $E,@X[4],$E
517 $ST $E,[$ctx+`4*$SZ`]
518 add $F,@X[5],$F
519 $ST $F,[$ctx+`5*$SZ`]
520 add $G,@X[6],$G
521 $ST $G,[$ctx+`6*$SZ`]
522 add $H,@X[7],$H
523 $ST $H,[$ctx+`7*$SZ`]
524___
525$code.=<<___ if ($SZ==8); # SHA512
526 ld [$ctx+`0*$SZ+0`],%l0
527 ld [$ctx+`0*$SZ+4`],%l1
528 ld [$ctx+`1*$SZ+0`],%l2
529 ld [$ctx+`1*$SZ+4`],%l3
530 ld [$ctx+`2*$SZ+0`],%l4
531 ld [$ctx+`2*$SZ+4`],%l5
532 ld [$ctx+`3*$SZ+0`],%l6
533
534 sllx %l0,32,$tmp0
535 ld [$ctx+`3*$SZ+4`],%l7
536 sllx %l2,32,$tmp1
537 or %l1,$tmp0,$tmp0
538 or %l3,$tmp1,$tmp1
539 add $tmp0,$A,$A
540 add $tmp1,$B,$B
541 $ST $A,[$ctx+`0*$SZ`]
542 sllx %l4,32,$tmp2
543 $ST $B,[$ctx+`1*$SZ`]
544 sllx %l6,32,$T1
545 or %l5,$tmp2,$tmp2
546 or %l7,$T1,$T1
547 add $tmp2,$C,$C
548 $ST $C,[$ctx+`2*$SZ`]
549 add $T1,$D,$D
550 $ST $D,[$ctx+`3*$SZ`]
551
552 ld [$ctx+`4*$SZ+0`],%l0
553 ld [$ctx+`4*$SZ+4`],%l1
554 ld [$ctx+`5*$SZ+0`],%l2
555 ld [$ctx+`5*$SZ+4`],%l3
556 ld [$ctx+`6*$SZ+0`],%l4
557 ld [$ctx+`6*$SZ+4`],%l5
558 ld [$ctx+`7*$SZ+0`],%l6
559
560 sllx %l0,32,$tmp0
561 ld [$ctx+`7*$SZ+4`],%l7
562 sllx %l2,32,$tmp1
563 or %l1,$tmp0,$tmp0
564 or %l3,$tmp1,$tmp1
565 add $tmp0,$E,$E
566 add $tmp1,$F,$F
567 $ST $E,[$ctx+`4*$SZ`]
568 sllx %l4,32,$tmp2
569 $ST $F,[$ctx+`5*$SZ`]
570 sllx %l6,32,$T1
571 or %l5,$tmp2,$tmp2
572 or %l7,$T1,$T1
573 add $tmp2,$G,$G
574 $ST $G,[$ctx+`6*$SZ`]
575 add $T1,$H,$H
576 $ST $H,[$ctx+`7*$SZ`]
577___
578$code.=<<___;
579 add $inp,`16*$SZ`,$inp ! advance inp
580 cmp $inp,$len
581 bne `$bits==64?"%xcc":"%icc"`,.Lloop
582 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
583
584 ret
585 restore
586.type sha${label}_block_data_order,#function
587.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
588.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
589___
590
591$code =~ s/\`([^\`]*)\`/eval $1/gem;
592print $code;
593close STDOUT;
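
As in the s390x module, the prologue converts the block count in $len into a byte length by shifting left by log2 of the block size; the shift amount is computed once at generation time via log(16*$SZ)/log(2) and folded by the backtick pass. The arithmetic, spelled out:

    #!/usr/bin/env perl
    # The generation-time arithmetic behind `log(16*$SZ)/log(2)`: a block
    # is 16 words of $SZ bytes, so the shift is 6 for SHA256, 7 for SHA512.
    for my $SZ (4, 8) {
        my $shift = log(16*$SZ)/log(2);
        printf "SZ=%d: block=%3d bytes, shift=%g\n", $SZ, 16*$SZ, $shift;
    }
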
diff --git a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
index b6252d31ec..e6643f8cf6 100755
--- a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
@@ -40,14 +40,18 @@
40# sha256_block:-( This is presumably because 64-bit shifts/rotates 40# sha256_block:-( This is presumably because 64-bit shifts/rotates
41# apparently are not atomic instructions, but implemented in microcode. 41# apparently are not atomic instructions, but implemented in microcode.
42 42
43$output=shift; 43$flavour = shift;
44$output = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
44 48
45$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
46( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
47( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 51( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
48die "can't locate x86_64-xlate.pl"; 52die "can't locate x86_64-xlate.pl";
49 53
50open STDOUT,"| $^X $xlate $output"; 54open STDOUT,"| $^X $xlate $flavour $output";
51 55
52if ($output =~ /512/) { 56if ($output =~ /512/) {
53 $func="sha512_block_data_order"; 57 $func="sha512_block_data_order";
@@ -186,7 +190,7 @@ $func:
186 push %r13 190 push %r13
187 push %r14 191 push %r14
188 push %r15 192 push %r15
189 mov %rsp,%rbp # copy %rsp 193 mov %rsp,%r11 # copy %rsp
190 shl \$4,%rdx # num*16 194 shl \$4,%rdx # num*16
191 sub \$$framesz,%rsp 195 sub \$$framesz,%rsp
192 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 196 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -194,10 +198,10 @@ $func:
194 mov $ctx,$_ctx # save ctx, 1st arg 198 mov $ctx,$_ctx # save ctx, 1st arg
195 	mov	$inp,$_inp	# save inp, 2nd arg	199 	mov	$inp,$_inp	# save inp, 2nd arg
196 mov %rdx,$_end # save end pointer, "3rd" arg 200 mov %rdx,$_end # save end pointer, "3rd" arg
197 mov %rbp,$_rsp # save copy of %rsp 201 mov %r11,$_rsp # save copy of %rsp
202.Lprologue:
198 203
199 .picmeup $Tbl 204 lea $TABLE(%rip),$Tbl
200 lea $TABLE-.($Tbl),$Tbl
201 205
202 mov $SZ*0($ctx),$A 206 mov $SZ*0($ctx),$A
203 mov $SZ*1($ctx),$B 207 mov $SZ*1($ctx),$B
@@ -257,14 +261,15 @@ $code.=<<___;
257 mov $H,$SZ*7($ctx) 261 mov $H,$SZ*7($ctx)
258 jb .Lloop 262 jb .Lloop
259 263
260 mov $_rsp,%rsp 264 mov $_rsp,%rsi
261 pop %r15 265 mov (%rsi),%r15
262 pop %r14 266 mov 8(%rsi),%r14
263 pop %r13 267 mov 16(%rsi),%r13
264 pop %r12 268 mov 24(%rsi),%r12
265 pop %rbp 269 mov 32(%rsi),%rbp
266 pop %rbx 270 mov 40(%rsi),%rbx
267 271 lea 48(%rsi),%rsp
272.Lepilogue:
268 ret 273 ret
269.size $func,.-$func 274.size $func,.-$func
270___ 275___
@@ -339,6 +344,113 @@ $TABLE:
339___ 344___
340} 345}
341 346
347# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
348# CONTEXT *context,DISPATCHER_CONTEXT *disp)
349if ($win64) {
350$rec="%rcx";
351$frame="%rdx";
352$context="%r8";
353$disp="%r9";
354
355$code.=<<___;
356.extern __imp_RtlVirtualUnwind
357.type se_handler,\@abi-omnipotent
358.align 16
359se_handler:
360 push %rsi
361 push %rdi
362 push %rbx
363 push %rbp
364 push %r12
365 push %r13
366 push %r14
367 push %r15
368 pushfq
369 sub \$64,%rsp
370
371 mov 120($context),%rax # pull context->Rax
372 mov 248($context),%rbx # pull context->Rip
373
374 lea .Lprologue(%rip),%r10
375 cmp %r10,%rbx # context->Rip<.Lprologue
376 jb .Lin_prologue
377
378 mov 152($context),%rax # pull context->Rsp
379
380 lea .Lepilogue(%rip),%r10
381 cmp %r10,%rbx # context->Rip>=.Lepilogue
382 jae .Lin_prologue
383
384 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
385 lea 48(%rax),%rax
386
387 mov -8(%rax),%rbx
388 mov -16(%rax),%rbp
389 mov -24(%rax),%r12
390 mov -32(%rax),%r13
391 mov -40(%rax),%r14
392 mov -48(%rax),%r15
393 mov %rbx,144($context) # restore context->Rbx
394 mov %rbp,160($context) # restore context->Rbp
395 mov %r12,216($context) # restore context->R12
396 mov %r13,224($context) # restore context->R13
397 mov %r14,232($context) # restore context->R14
398 mov %r15,240($context) # restore context->R15
399
400.Lin_prologue:
401 mov 8(%rax),%rdi
402 mov 16(%rax),%rsi
403 mov %rax,152($context) # restore context->Rsp
404 mov %rsi,168($context) # restore context->Rsi
405 mov %rdi,176($context) # restore context->Rdi
406
407 mov 40($disp),%rdi # disp->ContextRecord
408 mov $context,%rsi # context
409 mov \$154,%ecx # sizeof(CONTEXT)
410 .long 0xa548f3fc # cld; rep movsq
411
412 mov $disp,%rsi
413 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
414 mov 8(%rsi),%rdx # arg2, disp->ImageBase
415 mov 0(%rsi),%r8 # arg3, disp->ControlPc
416 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
417 mov 40(%rsi),%r10 # disp->ContextRecord
418 lea 56(%rsi),%r11 # &disp->HandlerData
419 lea 24(%rsi),%r12 # &disp->EstablisherFrame
420 mov %r10,32(%rsp) # arg5
421 mov %r11,40(%rsp) # arg6
422 mov %r12,48(%rsp) # arg7
423 mov %rcx,56(%rsp) # arg8, (NULL)
424 call *__imp_RtlVirtualUnwind(%rip)
425
426 mov \$1,%eax # ExceptionContinueSearch
427 add \$64,%rsp
428 popfq
429 pop %r15
430 pop %r14
431 pop %r13
432 pop %r12
433 pop %rbp
434 pop %rbx
435 pop %rdi
436 pop %rsi
437 ret
438.size se_handler,.-se_handler
439
440.section .pdata
441.align 4
442 .rva .LSEH_begin_$func
443 .rva .LSEH_end_$func
444 .rva .LSEH_info_$func
445
446.section .xdata
447.align 8
448.LSEH_info_$func:
449 .byte 9,0,0,0
450 .rva se_handler
451___
452}
453
342$code =~ s/\`([^\`]*)\`/eval $1/gem; 454$code =~ s/\`([^\`]*)\`/eval $1/gem;
343print $code; 455print $code;
344close STDOUT; 456close STDOUT;
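
The new calling convention lets build scripts pass an assembler flavour (e.g. "elf", "macosx", "mingw64", "nasm" — example strings from the perlasm family) ahead of the output file, while a lone dotted argument is still treated as the output for compatibility. The parsing, as a standalone sketch:

    #!/usr/bin/env perl
    # Sketch of the flavour/output handling introduced above: one argument
    # containing a dot is the output file; otherwise arg 1 is the flavour.
    my $flavour = shift;
    my $output  = shift;
    if (defined $flavour && $flavour =~ /\./) { $output = $flavour; undef $flavour; }
    my $win64 = 0;
    $win64 = 1 if ((defined $flavour && $flavour =~ /[nm]asm|mingw64/)
                   || (defined $output && $output =~ /\.asm$/));
    printf "flavour=%s output=%s win64=%d\n",
           (defined $flavour ? $flavour : "(none)"),
           (defined $output  ? $output  : "(none)"), $win64;
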
diff --git a/src/lib/libcrypto/sha/sha256.c b/src/lib/libcrypto/sha/sha256.c
index 3256a83e98..8952d87673 100644
--- a/src/lib/libcrypto/sha/sha256.c
+++ b/src/lib/libcrypto/sha/sha256.c
@@ -12,39 +12,29 @@
12 12
13#include <openssl/crypto.h> 13#include <openssl/crypto.h>
14#include <openssl/sha.h> 14#include <openssl/sha.h>
15#ifdef OPENSSL_FIPS
16#include <openssl/fips.h>
17#endif
18
19#include <openssl/opensslv.h> 15#include <openssl/opensslv.h>
20 16
21const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT; 17const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT;
22 18
23int SHA224_Init (SHA256_CTX *c) 19int SHA224_Init (SHA256_CTX *c)
24 { 20 {
25#ifdef OPENSSL_FIPS 21 memset (c,0,sizeof(*c));
26 FIPS_selftest_check();
27#endif
28 c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL; 22 c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL;
29 c->h[2]=0x3070dd17UL; c->h[3]=0xf70e5939UL; 23 c->h[2]=0x3070dd17UL; c->h[3]=0xf70e5939UL;
30 c->h[4]=0xffc00b31UL; c->h[5]=0x68581511UL; 24 c->h[4]=0xffc00b31UL; c->h[5]=0x68581511UL;
31 c->h[6]=0x64f98fa7UL; c->h[7]=0xbefa4fa4UL; 25 c->h[6]=0x64f98fa7UL; c->h[7]=0xbefa4fa4UL;
32 c->Nl=0; c->Nh=0; 26 c->md_len=SHA224_DIGEST_LENGTH;
33 c->num=0; c->md_len=SHA224_DIGEST_LENGTH;
34 return 1; 27 return 1;
35 } 28 }
36 29
37int SHA256_Init (SHA256_CTX *c) 30int SHA256_Init (SHA256_CTX *c)
38 { 31 {
39#ifdef OPENSSL_FIPS 32 memset (c,0,sizeof(*c));
40 FIPS_selftest_check();
41#endif
42 c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL; 33 c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL;
43 c->h[2]=0x3c6ef372UL; c->h[3]=0xa54ff53aUL; 34 c->h[2]=0x3c6ef372UL; c->h[3]=0xa54ff53aUL;
44 c->h[4]=0x510e527fUL; c->h[5]=0x9b05688cUL; 35 c->h[4]=0x510e527fUL; c->h[5]=0x9b05688cUL;
45 c->h[6]=0x1f83d9abUL; c->h[7]=0x5be0cd19UL; 36 c->h[6]=0x1f83d9abUL; c->h[7]=0x5be0cd19UL;
46 c->Nl=0; c->Nh=0; 37 c->md_len=SHA256_DIGEST_LENGTH;
47 c->num=0; c->md_len=SHA256_DIGEST_LENGTH;
48 return 1; 38 return 1;
49 } 39 }
50 40
@@ -94,21 +84,21 @@ int SHA224_Final (unsigned char *md, SHA256_CTX *c)
94 */ 84 */
95#define HASH_MAKE_STRING(c,s) do { \ 85#define HASH_MAKE_STRING(c,s) do { \
96 unsigned long ll; \ 86 unsigned long ll; \
97 unsigned int xn; \ 87 unsigned int nn; \
98 switch ((c)->md_len) \ 88 switch ((c)->md_len) \
99 { case SHA224_DIGEST_LENGTH: \ 89 { case SHA224_DIGEST_LENGTH: \
100 for (xn=0;xn<SHA224_DIGEST_LENGTH/4;xn++) \ 90 for (nn=0;nn<SHA224_DIGEST_LENGTH/4;nn++) \
101 { ll=(c)->h[xn]; HOST_l2c(ll,(s)); } \ 91 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
102 break; \ 92 break; \
103 case SHA256_DIGEST_LENGTH: \ 93 case SHA256_DIGEST_LENGTH: \
104 for (xn=0;xn<SHA256_DIGEST_LENGTH/4;xn++) \ 94 for (nn=0;nn<SHA256_DIGEST_LENGTH/4;nn++) \
105 { ll=(c)->h[xn]; HOST_l2c(ll,(s)); } \ 95 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
106 break; \ 96 break; \
107 default: \ 97 default: \
108 if ((c)->md_len > SHA256_DIGEST_LENGTH) \ 98 if ((c)->md_len > SHA256_DIGEST_LENGTH) \
109 return 0; \ 99 return 0; \
110 for (xn=0;xn<(c)->md_len/4;xn++) \ 100 for (nn=0;nn<(c)->md_len/4;nn++) \
111 { ll=(c)->h[xn]; HOST_l2c(ll,(s)); } \ 101 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
112 break; \ 102 break; \
113 } \ 103 } \
114 } while (0) 104 } while (0)
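
HASH_MAKE_STRING emits each 32-bit state word most-significant byte first via HOST_l2c, which is what gives the digest its FIPS-mandated big-endian byte order regardless of host endianness. The same serialization in a few lines of Perl (the first two SHA256 IV words stand in for live state):

    #!/usr/bin/env perl
    # What the HOST_l2c loop does: each 32-bit word goes out big-endian.
    my @h = (0x6a09e667, 0xbb67ae85);          # first two SHA256 IV words
    my $md = pack("N*", @h);                    # "N" = 32-bit big-endian
    printf "%02x", $_ for unpack("C*", $md);    # prints 6a09e667bb67ae85
    print "\n";
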
diff --git a/src/lib/libcrypto/sha/sha512.c b/src/lib/libcrypto/sha/sha512.c
index f5ed468b85..cbc0e58c48 100644
--- a/src/lib/libcrypto/sha/sha512.c
+++ b/src/lib/libcrypto/sha/sha512.c
@@ -5,10 +5,6 @@
5 * ==================================================================== 5 * ====================================================================
6 */ 6 */
7#include <openssl/opensslconf.h> 7#include <openssl/opensslconf.h>
8#ifdef OPENSSL_FIPS
9#include <openssl/fips.h>
10#endif
11
12#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA512) 8#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA512)
13/* 9/*
14 * IMPLEMENTATION NOTES. 10 * IMPLEMENTATION NOTES.
@@ -65,9 +61,19 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
65 61
66int SHA384_Init (SHA512_CTX *c) 62int SHA384_Init (SHA512_CTX *c)
67 { 63 {
68#ifdef OPENSSL_FIPS 64#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
69 FIPS_selftest_check(); 65 /* maintain dword order required by assembler module */
70#endif 66 unsigned int *h = (unsigned int *)c->h;
67
68 h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8;
69 h[2] = 0x629a292a; h[3] = 0x367cd507;
70 h[4] = 0x9159015a; h[5] = 0x3070dd17;
71 h[6] = 0x152fecd8; h[7] = 0xf70e5939;
72 h[8] = 0x67332667; h[9] = 0xffc00b31;
73 h[10] = 0x8eb44a87; h[11] = 0x68581511;
74 h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7;
75 h[14] = 0x47b5481d; h[15] = 0xbefa4fa4;
76#else
71 c->h[0]=U64(0xcbbb9d5dc1059ed8); 77 c->h[0]=U64(0xcbbb9d5dc1059ed8);
72 c->h[1]=U64(0x629a292a367cd507); 78 c->h[1]=U64(0x629a292a367cd507);
73 c->h[2]=U64(0x9159015a3070dd17); 79 c->h[2]=U64(0x9159015a3070dd17);
@@ -76,6 +82,7 @@ int SHA384_Init (SHA512_CTX *c)
76 c->h[5]=U64(0x8eb44a8768581511); 82 c->h[5]=U64(0x8eb44a8768581511);
77 c->h[6]=U64(0xdb0c2e0d64f98fa7); 83 c->h[6]=U64(0xdb0c2e0d64f98fa7);
78 c->h[7]=U64(0x47b5481dbefa4fa4); 84 c->h[7]=U64(0x47b5481dbefa4fa4);
85#endif
79 c->Nl=0; c->Nh=0; 86 c->Nl=0; c->Nh=0;
80 c->num=0; c->md_len=SHA384_DIGEST_LENGTH; 87 c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
81 return 1; 88 return 1;
@@ -83,9 +90,19 @@ int SHA384_Init (SHA512_CTX *c)
83 90
84int SHA512_Init (SHA512_CTX *c) 91int SHA512_Init (SHA512_CTX *c)
85 { 92 {
86#ifdef OPENSSL_FIPS 93#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
87 FIPS_selftest_check(); 94 /* maintain dword order required by assembler module */
88#endif 95 unsigned int *h = (unsigned int *)c->h;
96
97 h[0] = 0x6a09e667; h[1] = 0xf3bcc908;
98 h[2] = 0xbb67ae85; h[3] = 0x84caa73b;
99 h[4] = 0x3c6ef372; h[5] = 0xfe94f82b;
100 h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1;
101 h[8] = 0x510e527f; h[9] = 0xade682d1;
102 h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f;
103 h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b;
104 h[14] = 0x5be0cd19; h[15] = 0x137e2179;
105#else
89 c->h[0]=U64(0x6a09e667f3bcc908); 106 c->h[0]=U64(0x6a09e667f3bcc908);
90 c->h[1]=U64(0xbb67ae8584caa73b); 107 c->h[1]=U64(0xbb67ae8584caa73b);
91 c->h[2]=U64(0x3c6ef372fe94f82b); 108 c->h[2]=U64(0x3c6ef372fe94f82b);
@@ -94,6 +111,7 @@ int SHA512_Init (SHA512_CTX *c)
94 c->h[5]=U64(0x9b05688c2b3e6c1f); 111 c->h[5]=U64(0x9b05688c2b3e6c1f);
95 c->h[6]=U64(0x1f83d9abfb41bd6b); 112 c->h[6]=U64(0x1f83d9abfb41bd6b);
96 c->h[7]=U64(0x5be0cd19137e2179); 113 c->h[7]=U64(0x5be0cd19137e2179);
114#endif
97 c->Nl=0; c->Nh=0; 115 c->Nl=0; c->Nh=0;
98 c->num=0; c->md_len=SHA512_DIGEST_LENGTH; 116 c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
99 return 1; 117 return 1;
@@ -142,6 +160,24 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
142 160
143 if (md==0) return 0; 161 if (md==0) return 0;
144 162
163#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
164 /* recall assembler dword order... */
165 n = c->md_len;
166 if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH)
167 {
168 unsigned int *h = (unsigned int *)c->h, t;
169
170 for (n/=4;n;n--)
171 {
172 t = *(h++);
173 *(md++) = (unsigned char)(t>>24);
174 *(md++) = (unsigned char)(t>>16);
175 *(md++) = (unsigned char)(t>>8);
176 *(md++) = (unsigned char)(t);
177 }
178 }
179 else return 0;
180#else
145 switch (c->md_len) 181 switch (c->md_len)
146 { 182 {
147 /* Let compiler decide if it's appropriate to unroll... */ 183 /* Let compiler decide if it's appropriate to unroll... */
@@ -178,7 +214,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
178 /* ... as well as make sure md_len is not abused. */ 214 /* ... as well as make sure md_len is not abused. */
179 default: return 0; 215 default: return 0;
180 } 216 }
181 217#endif
182 return 1; 218 return 1;
183 } 219 }
184 220
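
The ARM path above works because the assembler module keeps each 64-bit state word as two 32-bit halves with the most significant half first; emitting the halves big-endian in sequence is then byte-for-byte identical to emitting the whole 64-bit word big-endian. Illustrated in Perl with the first SHA384 IV word:

    #!/usr/bin/env perl
    # Two 32-bit halves, most significant first, written big-endian each,
    # reproduce the big-endian 64-bit word exactly.
    my @h = (0xcbbb9d5d, 0xc1059ed8);   # halves of U64(0xcbbb9d5dc1059ed8)
    printf "%02x", $_ for unpack("C*", pack("N2", @h));
    print "\n";                         # prints cbbb9d5dc1059ed8
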
@@ -204,7 +240,7 @@ int SHA512_Update (SHA512_CTX *c, const void *_data, size_t len)
204 240
205 if (len < n) 241 if (len < n)
206 { 242 {
207 memcpy (p+c->num,data,len), c->num += len; 243 memcpy (p+c->num,data,len), c->num += (unsigned int)len;
208 return 1; 244 return 1;
209 } 245 }
210 else { 246 else {
@@ -314,7 +350,7 @@ static const SHA_LONG64 K512[80] = {
314#ifndef PEDANTIC 350#ifndef PEDANTIC
315# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) 351# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
316# if defined(__x86_64) || defined(__x86_64__) 352# if defined(__x86_64) || defined(__x86_64__)
317# define ROTR(a,n) ({ unsigned long ret; \ 353# define ROTR(a,n) ({ SHA_LONG64 ret; \
318 asm ("rorq %1,%0" \ 354 asm ("rorq %1,%0" \
319 : "=r"(ret) \ 355 : "=r"(ret) \
320 : "J"(n),"0"(a) \ 356 : "J"(n),"0"(a) \
@@ -337,20 +373,21 @@ static const SHA_LONG64 K512[80] = {
337 ((SHA_LONG64)hi)<<32|lo; }) 373 ((SHA_LONG64)hi)<<32|lo; })
338# else 374# else
339# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\ 375# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
340 unsigned int hi=p[0],lo=p[1]; \ 376 unsigned int hi=p[0],lo=p[1]; \
341 asm ("bswapl %0; bswapl %1;" \ 377 asm ("bswapl %0; bswapl %1;" \
342 : "=r"(lo),"=r"(hi) \ 378 : "=r"(lo),"=r"(hi) \
343 : "0"(lo),"1"(hi)); \ 379 : "0"(lo),"1"(hi)); \
344 ((SHA_LONG64)hi)<<32|lo; }) 380 ((SHA_LONG64)hi)<<32|lo; })
345# endif 381# endif
346# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64) 382# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
347# define ROTR(a,n) ({ unsigned long ret; \ 383# define ROTR(a,n) ({ SHA_LONG64 ret; \
348 asm ("rotrdi %0,%1,%2" \ 384 asm ("rotrdi %0,%1,%2" \
349 : "=r"(ret) \ 385 : "=r"(ret) \
350 : "r"(a),"K"(n)); ret; }) 386 : "r"(a),"K"(n)); ret; })
351# endif 387# endif
352# elif defined(_MSC_VER) 388# elif defined(_MSC_VER)
353# if defined(_WIN64) /* applies to both IA-64 and AMD64 */ 389# if defined(_WIN64) /* applies to both IA-64 and AMD64 */
390# pragma intrinsic(_rotr64)
354# define ROTR(a,n) _rotr64((a),n) 391# define ROTR(a,n) _rotr64((a),n)
355# endif 392# endif
356# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) 393# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
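
The two ROTR return-type fixes above matter on LLP64 targets (e.g. Win64, now reachable via the mingw64 flavour), where `unsigned long` is only 32 bits and would truncate the rotated value. The operation itself, for reference (needs a 64-bit perl):

    #!/usr/bin/env perl
    # 64-bit rotate right, the operation the ROTR macro wraps.
    sub ROTR {
        my ($a,$n) = @_;
        (($a >> $n) | ($a << (64-$n))) & 0xffffffffffffffff;
    }
    printf "%016x\n", ROTR(0x0123456789abcdef, 8);   # ef0123456789abcd
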
@@ -398,15 +435,66 @@ static const SHA_LONG64 K512[80] = {
398#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) 435#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
399#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) 436#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
400 437
401#if defined(OPENSSL_IA32_SSE2) && !defined(OPENSSL_NO_ASM) && !defined(I386_ONLY) 438
402#define GO_FOR_SSE2(ctx,in,num) do { \ 439#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
403#define GO_FOR_SSE2(ctx,in,num) do { \ 441 * This code should give better results on a 32-bit CPU with less than
404 void sha512_block_sse2(void *,const void *,size_t); \ 442 * ~24 registers, both size- and performance-wise...
405 sha512_block_sse2(ctx->h,in,num); return; \ 442 * ~24 registers, both size and performance wise...
406 } while (0) 443 */
444static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
445 {
446 const SHA_LONG64 *W=in;
447 SHA_LONG64 A,E,T;
448 SHA_LONG64 X[9+80],*F;
449 int i;
450
451 while (num--) {
452
453 F = X+80;
454 A = ctx->h[0]; F[1] = ctx->h[1];
455 F[2] = ctx->h[2]; F[3] = ctx->h[3];
456 E = ctx->h[4]; F[5] = ctx->h[5];
457 F[6] = ctx->h[6]; F[7] = ctx->h[7];
458
459 for (i=0;i<16;i++,F--)
460 {
461#ifdef B_ENDIAN
462 T = W[i];
463#else
464 T = PULL64(W[i]);
407#endif 465#endif
466 F[0] = A;
467 F[4] = E;
468 F[8] = T;
469 T += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
470 E = F[3] + T;
471 A = T + Sigma0(A) + Maj(A,F[1],F[2]);
472 }
473
474 for (;i<80;i++,F--)
475 {
476 T = sigma0(F[8+16-1]);
477 T += sigma1(F[8+16-14]);
478 T += F[8+16] + F[8+16-9];
479
480 F[0] = A;
481 F[4] = E;
482 F[8] = T;
483 T += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
484 E = F[3] + T;
485 A = T + Sigma0(A) + Maj(A,F[1],F[2]);
486 }
408 487
409#ifdef OPENSSL_SMALL_FOOTPRINT 488 ctx->h[0] += A; ctx->h[1] += F[1];
489 ctx->h[2] += F[2]; ctx->h[3] += F[3];
490 ctx->h[4] += E; ctx->h[5] += F[5];
491 ctx->h[6] += F[6]; ctx->h[7] += F[7];
492
493 W+=SHA_LBLOCK;
494 }
495 }
496
497#elif defined(OPENSSL_SMALL_FOOTPRINT)
410 498
411static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num) 499static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
412 { 500 {
@@ -415,10 +503,6 @@ static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num
415 SHA_LONG64 X[16]; 503 SHA_LONG64 X[16];
416 int i; 504 int i;
417 505
418#ifdef GO_FOR_SSE2
419 GO_FOR_SSE2(ctx,in,num);
420#endif
421
422 while (num--) { 506 while (num--) {
423 507
424 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3]; 508 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
@@ -463,11 +547,11 @@ static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num
463 h = Sigma0(a) + Maj(a,b,c); \ 547 h = Sigma0(a) + Maj(a,b,c); \
464 d += T1; h += T1; } while (0) 548 d += T1; h += T1; } while (0)
465 549
466#define ROUND_16_80(i,a,b,c,d,e,f,g,h,X) do { \ 550#define ROUND_16_80(i,j,a,b,c,d,e,f,g,h,X) do { \
467 s0 = X[(i+1)&0x0f]; s0 = sigma0(s0); \ 551 s0 = X[(j+1)&0x0f]; s0 = sigma0(s0); \
468 s1 = X[(i+14)&0x0f]; s1 = sigma1(s1); \ 552 s1 = X[(j+14)&0x0f]; s1 = sigma1(s1); \
469 T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \ 553 T1 = X[(j)&0x0f] += s0 + s1 + X[(j+9)&0x0f]; \
470 ROUND_00_15(i,a,b,c,d,e,f,g,h); } while (0) 554 ROUND_00_15(i+j,a,b,c,d,e,f,g,h); } while (0)
471 555
472static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num) 556static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
473 { 557 {
@@ -476,10 +560,6 @@ static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num
476 SHA_LONG64 X[16]; 560 SHA_LONG64 X[16];
477 int i; 561 int i;
478 562
479#ifdef GO_FOR_SSE2
480 GO_FOR_SSE2(ctx,in,num);
481#endif
482
483 while (num--) { 563 while (num--) {
484 564
485 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3]; 565 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
@@ -521,16 +601,24 @@ static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num
521 T1 = X[15] = PULL64(W[15]); ROUND_00_15(15,b,c,d,e,f,g,h,a); 601 T1 = X[15] = PULL64(W[15]); ROUND_00_15(15,b,c,d,e,f,g,h,a);
522#endif 602#endif
523 603
524 for (i=16;i<80;i+=8) 604 for (i=16;i<80;i+=16)
525 { 605 {
526 ROUND_16_80(i+0,a,b,c,d,e,f,g,h,X); 606 ROUND_16_80(i, 0,a,b,c,d,e,f,g,h,X);
527 ROUND_16_80(i+1,h,a,b,c,d,e,f,g,X); 607 ROUND_16_80(i, 1,h,a,b,c,d,e,f,g,X);
528 ROUND_16_80(i+2,g,h,a,b,c,d,e,f,X); 608 ROUND_16_80(i, 2,g,h,a,b,c,d,e,f,X);
529 ROUND_16_80(i+3,f,g,h,a,b,c,d,e,X); 609 ROUND_16_80(i, 3,f,g,h,a,b,c,d,e,X);
530 ROUND_16_80(i+4,e,f,g,h,a,b,c,d,X); 610 ROUND_16_80(i, 4,e,f,g,h,a,b,c,d,X);
531 ROUND_16_80(i+5,d,e,f,g,h,a,b,c,X); 611 ROUND_16_80(i, 5,d,e,f,g,h,a,b,c,X);
532 ROUND_16_80(i+6,c,d,e,f,g,h,a,b,X); 612 ROUND_16_80(i, 6,c,d,e,f,g,h,a,b,X);
533 ROUND_16_80(i+7,b,c,d,e,f,g,h,a,X); 613 ROUND_16_80(i, 7,b,c,d,e,f,g,h,a,X);
614 ROUND_16_80(i, 8,a,b,c,d,e,f,g,h,X);
615 ROUND_16_80(i, 9,h,a,b,c,d,e,f,g,X);
616 ROUND_16_80(i,10,g,h,a,b,c,d,e,f,X);
617 ROUND_16_80(i,11,f,g,h,a,b,c,d,e,X);
618 ROUND_16_80(i,12,e,f,g,h,a,b,c,d,X);
619 ROUND_16_80(i,13,d,e,f,g,h,a,b,c,X);
620 ROUND_16_80(i,14,c,d,e,f,g,h,a,b,X);
621 ROUND_16_80(i,15,b,c,d,e,f,g,h,a,X);
534 } 622 }
535 623
536 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d; 624 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
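
Splitting the round index into a runtime i and a literal j is what makes the 16-way unroll pay off: every X[(j+c)&0x0f] subscript becomes a compile-time constant, so the circular X[] buffer is addressed at fixed offsets and i survives only as the K512 index. A quick Perl print-out of the folded index pattern:

    #!/usr/bin/env perl
    # With literal j in 0..15, the X[] subscripts in ROUND_16_80 fold to
    # constants; only the K512 index i+j remains a runtime value.
    for my $j (0..15) {
        printf "j=%2d: X[%2d] += sigma0(X[%2d]) + sigma1(X[%2d]) + X[%2d]\n",
               $j, $j&0x0f, ($j+1)&0x0f, ($j+14)&0x0f, ($j+9)&0x0f;
    }
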
@@ -544,4 +632,10 @@ static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num
544 632
545#endif /* SHA512_ASM */ 633#endif /* SHA512_ASM */
546 634
547#endif /* OPENSSL_NO_SHA512 */ 635#else /* !OPENSSL_NO_SHA512 */
636
637#if defined(PEDANTIC) || defined(__DECC) || defined(OPENSSL_SYS_MACOSX)
638static void *dummy=&dummy;
639#endif
640
641#endif /* !OPENSSL_NO_SHA512 */