import of OpenSSL 0.9.8k

author: djm <> 2009-04-06 06:30:10 +0000
committer: djm <> 2009-04-06 06:30:10 +0000
commit: 2b6e09b39ef1d803b50ee024a06d1c250fde442d (patch)
tree: f116109c359f26a2b149bbc752be39c16099bae1 /src/lib/libcrypto/bn/asm
parent: a0fdc9ec41594852f67ec77dfad9cb06bacc4186 (diff)
download: openbsd-2b6e09b39ef1d803b50ee024a06d1c250fde442d.tar.gz
openbsd-2b6e09b39ef1d803b50ee024a06d1c250fde442d.tar.bz2
openbsd-2b6e09b39ef1d803b50ee024a06d1c250fde442d.zip
10 files changed, 4982 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/asm/alpha-mont.pl b/src/lib/libcrypto/bn/asm/alpha-mont.pl
new file mode 100644
index 0000000000..7a2cc3173b
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/alpha-mont.pl
@@ -0,0 +1,317 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# On 21264 RSA sign performance improves by 70/35/20/15 percent for
+# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
+# instructed to '-tune host' code with in-line assembler. Other
+# benchmarks improve by 15-20%. To anchor it to something else, the
+# code provides approximately the same performance per GHz as AMD64.
+# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
+# difference.
+# int bn_mul_mont(
+$rp="a0";       # BN_ULONG *rp,
+$ap="a1";       # const BN_ULONG *ap,
+$bp="a2";       # const BN_ULONG *bp,
+$np="a3";       # const BN_ULONG *np,
+$n0="a4";       # const BN_ULONG *n0,
+$num="a5";      # int num);
+$lo0="t0";      # low word of the ap[]*bp[i] running sum
+$hi0="t1";      # carry word of the ap[]*bp[i] chain
+$lo1="t2";      # low word of the np[]*m1 running sum
+$hi1="t3";      # carry word of the np[]*m1 chain
+$aj="t4";       # &ap[j] before the load, ap[j] after it
+$bi="t5";       # &bp[i] before the load, bp[i] after it
+$nj="t6";       # &np[j] before the load, np[j] after it
+$tp="t7";       # cursor into the scratch vector tp[] on the stack
+$alo="t8";      # ap[j]*bp[i], low half
+$ahi="t9";      # ap[j]*bp[i], high half
+$nlo="t10";     # np[j]*m1, low half
+$nhi="t11";     # np[j]*m1, high half
+$tj="t12";      # tp[j]; also borrowed for loop conditions
+$i="s3";        # outer loop index (s3 is saved in the prologue)
+$j="s4";        # inner loop index (s4 is saved in the prologue)
+$m1="s5";       # per-pass multiplier, computed as lo0*n0 (s5 is saved too)
+$code=<<___;
+#include <asm.h>
+#include <regdef.h>
+.text
+.set    noat
+.set    noreorder
+.globl  bn_mul_mont
+.align  5
+.ent    bn_mul_mont
+bn_mul_mont:
+        lda     sp,-40(sp)      # fixed 40-byte frame: ra, s3, s4, s5, fp
+        stq     ra,0(sp)
+        stq     s3,8(sp)
+        stq     s4,16(sp)
+        stq     s5,24(sp)
+        stq     fp,32(sp)
+        mov     sp,fp           # fp keeps the frame base, sp is moved again below
+        .mask   0x0400f000,-40
+        .frame  fp,40,ra
+        .prologue 0
+        .align  4
+        .set    reorder
+        sextl   $num,$num       # num is an int, sign-extend it to 64 bits
+        mov     0,v0            # return value 0 for the early exit
+        cmplt   $num,4,AT
+        bne     AT,.Lexit       # num < 4 is not handled by this routine
+        ldq     $hi0,0($ap)     # ap[0]
+        s8addq  $num,16,AT      # AT = 8*num+16 bytes of scratch for tp[]
+        ldq     $aj,8($ap)      # ap[1]
+        subq    sp,AT,sp        # carve tp[] out of the stack
+        ldq     $bi,0($bp)      # bp[0]
+        mov     -4096,AT
+        ldq     $n0,0($n0)      # replace the n0 pointer by the word it points at
+        and     sp,AT,sp        # round sp down to a 4096-byte boundary
+        mulq    $hi0,$bi,$lo0   # ap[0]*bp[0], low half
+        ldq     $hi1,0($np)     # np[0]
+        umulh   $hi0,$bi,$hi0   # ap[0]*bp[0], high half
+        ldq     $nj,8($np)      # np[1]
+        mulq    $lo0,$n0,$m1    # m1 = low word * n0
+        mulq    $hi1,$m1,$lo1   # np[0]*m1, low half
+        umulh   $hi1,$m1,$hi1   # np[0]*m1, high half
+        addq    $lo1,$lo0,$lo1
+        cmpult  $lo1,$lo0,AT    # AT = carry out of the addition above
+        addq    $hi1,AT,$hi1
+        mulq    $aj,$bi,$alo    # ap[1]*bp[0], low half
+        mov     2,$j            # j = 2, words 0 and 1 are already in flight
+        umulh   $aj,$bi,$ahi    # ap[1]*bp[0], high half
+        mov     sp,$tp          # tp cursor starts at the base of the scratch area
+        mulq    $nj,$m1,$nlo    # np[1]*m1, low half
+        s8addq  $j,$ap,$aj      # address of ap[j]
+        umulh   $nj,$m1,$nhi    # np[1]*m1, high half
+        s8addq  $j,$np,$nj      # address of np[j]
+.align  4
+.L1st:
+        .set    noreorder
+        ldq     $aj,($aj)       # ap[j]
+        addl    $j,1,$j         # j++
+        ldq     $nj,($nj)       # np[j]
+        lda     $tp,8($tp)      # tp++
+        addq    $alo,$hi0,$lo0  # previous ap product + carry word
+        mulq    $aj,$bi,$alo    # start ap[j]*bp[0], low half
+        cmpult  $lo0,$hi0,AT    # AT = carry out
+        addq    $nlo,$hi1,$lo1  # previous np product + carry word
+        mulq    $nj,$m1,$nlo    # start np[j]*m1, low half
+        addq    $ahi,AT,$hi0    # new carry word of the ap chain
+        cmpult  $lo1,$hi1,v0    # v0 is only a scratch carry flag here
+        cmplt   $j,$num,$tj     # loop condition, kept in the tp[j] register
+        umulh   $aj,$bi,$ahi    # ap[j]*bp[0], high half
+        addq    $nhi,v0,$hi1    # new carry word of the np chain
+        addq    $lo1,$lo0,$lo1  # combine both chains
+        s8addq  $j,$ap,$aj      # address of the next ap word
+        umulh   $nj,$m1,$nhi    # np[j]*m1, high half
+        cmpult  $lo1,$lo0,v0
+        addq    $hi1,v0,$hi1
+        s8addq  $j,$np,$nj      # address of the next np word
+        stq     $lo1,-8($tp)    # store the finished word one slot behind the cursor
+        nop
+        unop
+        bne     $tj,.L1st
+        .set    reorder
+        addq    $alo,$hi0,$lo0  # drain the last products still in flight
+        addq    $nlo,$hi1,$lo1
+        cmpult  $lo0,$hi0,AT
+        cmpult  $lo1,$hi1,v0
+        addq    $ahi,AT,$hi0
+        addq    $nhi,v0,$hi1
+        addq    $lo1,$lo0,$lo1
+        cmpult  $lo1,$lo0,v0
+        addq    $hi1,v0,$hi1
+        stq     $lo1,0($tp)     # tp[num-2]
+        addq    $hi1,$hi0,$hi1  # sum of both carry words
+        cmpult  $hi1,$hi0,AT    # AT = topmost carry bit
+        stq     $hi1,8($tp)     # tp[num-1]
+        stq     AT,16($tp)      # tp[num] holds the topmost carry bit
+        mov     1,$i            # i = 1, bp[0] has been processed
+.align  4
+.Louter:
+        s8addq  $i,$bp,$bi      # address of bp[i]
+        ldq     $hi0,($ap)      # ap[0]
+        ldq     $aj,8($ap)      # ap[1]
+        ldq     $bi,($bi)       # bp[i]
+        ldq     $hi1,($np)      # np[0]
+        ldq     $nj,8($np)      # np[1]
+        ldq     $tj,(sp)        # tp[0]
+        mulq    $hi0,$bi,$lo0   # ap[0]*bp[i], low half
+        umulh   $hi0,$bi,$hi0   # ap[0]*bp[i], high half
+        addq    $lo0,$tj,$lo0   # += tp[0]
+        cmpult  $lo0,$tj,AT
+        addq    $hi0,AT,$hi0
+        mulq    $lo0,$n0,$m1    # m1 = low word * n0
+        mulq    $hi1,$m1,$lo1   # np[0]*m1, low half
+        umulh   $hi1,$m1,$hi1   # np[0]*m1, high half
+        addq    $lo1,$lo0,$lo1
+        cmpult  $lo1,$lo0,AT
+        mov     2,$j            # j = 2
+        addq    $hi1,AT,$hi1
+        mulq    $aj,$bi,$alo    # ap[1]*bp[i], low half
+        mov     sp,$tp          # rewind the tp cursor
+        umulh   $aj,$bi,$ahi    # ap[1]*bp[i], high half
+        mulq    $nj,$m1,$nlo    # np[1]*m1, low half
+        s8addq  $j,$ap,$aj      # address of ap[j]
+        umulh   $nj,$m1,$nhi    # np[1]*m1, high half
+.align  4
+.Linner:                        # NOTE(review): L0/L1/U0/U1 tags look like 21264 issue-slot notes, confirm
+        .set    noreorder
+        ldq     $tj,8($tp)      #L0
+        nop                     #U1
+        ldq     $aj,($aj)       #L1
+        s8addq  $j,$np,$nj      #U0
+        ldq     $nj,($nj)       #L0
+        nop                     #U1
+        addq    $alo,$hi0,$lo0  #L1
+        lda     $tp,8($tp)      # tp++
+        mulq    $aj,$bi,$alo    #U1
+        cmpult  $lo0,$hi0,AT    #L0
+        addq    $nlo,$hi1,$lo1  #L1
+        addl    $j,1,$j         # j++
+        mulq    $nj,$m1,$nlo    #U1
+        addq    $ahi,AT,$hi0    #L0
+        addq    $lo0,$tj,$lo0   #L1
+        cmpult  $lo1,$hi1,v0    #U0
+        umulh   $aj,$bi,$ahi    #U1
+        cmpult  $lo0,$tj,AT     #L0
+        addq    $lo1,$lo0,$lo1  #L1
+        addq    $nhi,v0,$hi1    #U0
+        umulh   $nj,$m1,$nhi    #U1
+        s8addq  $j,$ap,$aj      #L0
+        cmpult  $lo1,$lo0,v0    #L1
+        cmplt   $j,$num,$tj     #U0     # borrow $tj
+        addq    $hi0,AT,$hi0    #L0
+        addq    $hi1,v0,$hi1    #U1
+        stq     $lo1,-8($tp)    #L1
+        bne     $tj,.Linner     #U0
+        .set    reorder
+        ldq     $tj,8($tp)      # old tp[num-1]
+        addq    $alo,$hi0,$lo0  # drain the last products still in flight
+        addq    $nlo,$hi1,$lo1
+        cmpult  $lo0,$hi0,AT
+        cmpult  $lo1,$hi1,v0
+        addq    $ahi,AT,$hi0
+        addq    $nhi,v0,$hi1
+        addq    $lo0,$tj,$lo0
+        cmpult  $lo0,$tj,AT
+        addq    $hi0,AT,$hi0
+        ldq     $tj,16($tp)     # old topmost word tp[num]
+        addq    $lo1,$lo0,$j    # the j register is free here and serves as a temporary
+        cmpult  $j,$lo0,v0
+        addq    $hi1,v0,$hi1
+        addq    $hi1,$hi0,$lo1
+        stq     $j,($tp)        # new tp[num-2]
+        cmpult  $lo1,$hi0,$hi1  # carry of the carry-word sum
+        addq    $lo1,$tj,$lo1   # add the previous topmost word
+        cmpult  $lo1,$tj,AT
+        addl    $i,1,$i         # i++
+        addq    $hi1,AT,$hi1    # new topmost carry
+        stq     $lo1,8($tp)     # new tp[num-1]
+        cmplt   $i,$num,$tj     # borrow $tj
+        stq     $hi1,16($tp)    # new tp[num]
+        bne     $tj,.Louter
+        s8addq  $num,sp,$tj     # &tp[num]
+        mov     $rp,$bp         # put rp aside
+        mov     sp,$tp          # rewind the tp cursor
+        mov     sp,$ap
+        mov     0,$hi0          # clear borrow bit
+.align  4
+.Lsub:  ldq     $lo0,($tp)      # tp[i]
+        ldq     $lo1,($np)      # np[i]
+        lda     $tp,8($tp)
+        lda     $np,8($np)
+        subq    $lo0,$lo1,$lo1  # tp[i]-np[i]
+        cmpult  $lo0,$lo1,AT    # AT = 1 when that subtraction wrapped
+        subq    $lo1,$hi0,$lo0  # minus the borrow carried in
+        cmpult  $lo1,$lo0,$hi0  # 1 when subtracting the borrow wrapped
+        or      $hi0,AT,$hi0    # borrow for the next word
+        stq     $lo0,($rp)      # rp[i] = difference
+        cmpult  $tp,$tj,v0      # more words while the cursor is below &tp[num]
+        lda     $rp,8($rp)
+        bne     v0,.Lsub
+        subq    $hi1,$hi0,$hi0  # handle upmost overflow bit
+        mov     sp,$tp          # rewind the tp cursor for the copy loop
+        mov     $bp,$rp         # restore rp
+        and     sp,$hi0,$ap     # the result above acts as a select mask (expected 0 or all ones)
+        bic     $bp,$hi0,$bp
+        bis     $bp,$ap,$ap     # ap=borrow?tp:rp
+.align  4
+.Lcopy: ldq     $aj,($ap)       # copy or in-place refresh
+        lda     $tp,8($tp)
+        lda     $rp,8($rp)
+        lda     $ap,8($ap)
+        stq     zero,-8($tp)    # zap tp
+        cmpult  $tp,$tj,AT      # more words while the cursor is below &tp[num]
+        stq     $aj,-8($rp)     # rp[i] = selected word
+        bne     AT,.Lcopy
+        mov     1,v0            # return 1
+.Lexit:
+        .set    noreorder
+        mov     fp,sp           # drop the scratch area, back to the fixed frame
+        /*ldq   ra,0(sp)*/      # reload left disabled; no visible instruction writes ra
+        ldq     s3,8(sp)
+        ldq     s4,16(sp)
+        ldq     s5,24(sp)
+        ldq     fp,32(sp)
+        lda     sp,40(sp)
+        ret     (ra)
+.end    bn_mul_mont
+.rdata
+.asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+___
+print $code;    # emit the generated assembly on stdout
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl
new file mode 100644
index 0000000000..05d5dc1a48
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/armv4-mont.pl
@@ -0,0 +1,200 @@
+#!/usr/bin/env perl
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+# January 2007.
+# Montgomery multiplication for ARMv4.
+#
+# Performance improvement naturally varies among CPU implementations
+# and compilers. The code was observed to provide +65-35% improvement
+# [depending on key length, less for longer keys] on ARM920T, and
+# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
+# base and compiler generated code with in-lined umull and even umlal
+# instructions. The latter means that this code didn't really have an 
+# "advantage" of utilizing some "secret" instruction.
+#
+# The code is interoperable with Thumb ISA and is rather compact, less
+# than 1/2KB. Windows CE port would be trivial, as it's exclusively
+# about decorations, ABI and instruction syntax are identical.
+$num="r0";      # r0 is rp on entry (pushed at once), then num, finally &tp[num-1]
+$ap="r1";       # ap cursor, post-incremented in the loops and rewound in .Louter
+$bp="r2"; $bi="r2"; $rp="r2";   # r2 is shared: bp on entry, bp[i] in the loops, rp at the end
+$np="r3";       # np cursor, handled like ap
+$tp="r4";       # tp cursor; between passes it carries the bp cursor instead
+$aj="r5";       # ap[j]
+$nj="r6";       # np[j]
+$tj="r7";       # tp[j]
+$n0="r8";       # pointer to n0, then its value, then the per-pass multiplier
+###########     # r9 is reserved by ELF as platform specific, e.g. TLS pointer
+$alo="r10";     # sl, gcc uses it to keep @GOT
+$ahi="r11";     # fp
+$nlo="r12";     # ip
+###########     # r13 is stack pointer
+$nhi="r14";     # lr
+###########     # r15 is program counter
+#### argument block layout relative to &tp[num-1], a.k.a. $num
+$_rp="$num,#12*4";      # saved r0; 12 words = tp[num-1], tp[num] and the 10 saved registers
+# ap permanently resides in r1
+$_bp="$num,#13*4";      # saved r2
+# np permanently resides in r3
+$_n0="$num,#14*4";      # first stack argument (n0 pointer, later the n0 value)
+$_num="$num,#15*4";     $_bpend=$_num;  # second stack argument; the slot is reused for the bp end marker
+$code=<<___;
+.text
+.global bn_mul_mont
+.type   bn_mul_mont,%function
+.align  2
+bn_mul_mont:
+        stmdb   sp!,{r0,r2}             @ sp points at argument block
+        ldr     $num,[sp,#3*4]          @ load num
+        cmp     $num,#2                 @ fewer than 2 words is refused
+        movlt   r0,#0                   @ return 0 in that case
+        addlt   sp,sp,#2*4              @ drop the two words pushed above
+        blt     .Labrt
+        stmdb   sp!,{r4-r12,lr}         @ save 10 registers
+        mov     $num,$num,lsl#2         @ rescale $num for byte count
+        sub     sp,sp,$num              @ alloca(4*num)
+        sub     sp,sp,#4                @ +extra dword
+        sub     $num,$num,#4            @ "num=num-1"
+        add     $tp,$bp,$num            @ &bp[num-1]
+        add     $num,sp,$num            @ $num to point at &tp[num-1]
+        ldr     $n0,[$_n0]              @ &n0
+        ldr     $bi,[$bp]               @ bp[0]
+        ldr     $aj,[$ap],#4            @ ap[0],ap++
+        ldr     $nj,[$np],#4            @ np[0],np++
+        ldr     $n0,[$n0]               @ *n0
+        str     $tp,[$_bpend]           @ save &bp[num-1], the outer loop end marker
+        umull   $alo,$ahi,$aj,$bi       @ ap[0]*bp[0]
+        str     $n0,[$_n0]              @ save n0 value
+        mul     $n0,$alo,$n0            @ "tp[0]"*n0
+        mov     $nlo,#0                 @ clear the high word that umlal accumulates into
+        umlal   $alo,$nlo,$nj,$n0       @ np[0]*n0+"tp[0]"
+        mov     $tp,sp                  @ tp cursor starts at the base of the scratch area
+.L1st:
+        ldr     $aj,[$ap],#4            @ ap[j],ap++
+        mov     $alo,$ahi               @ previous high word becomes the carry-in
+        mov     $ahi,#0
+        umlal   $alo,$ahi,$aj,$bi       @ ap[j]*bp[0]
+        ldr     $nj,[$np],#4            @ np[j],np++
+        mov     $nhi,#0
+        umlal   $nlo,$nhi,$nj,$n0       @ np[j]*n0
+        adds    $nlo,$nlo,$alo          @ combine both chains, carry goes to C
+        str     $nlo,[$tp],#4           @ tp[j-1]=,tp++
+        adc     $nlo,$nhi,#0            @ high word plus C is the next carry-in
+        cmp     $tp,$num                @ done once the cursor reaches &tp[num-1]
+        bne     .L1st
+        adds    $nlo,$nlo,$ahi          @ fold in the last high word
+        mov     $nhi,#0
+        adc     $nhi,$nhi,#0            @ topmost carry bit
+        ldr     $tp,[$_bp]              @ restore bp
+        str     $nlo,[$num]             @ tp[num-1]=
+        ldr     $n0,[$_n0]              @ restore n0
+        str     $nhi,[$num,#4]          @ tp[num]=
+.Louter:
+        sub     $tj,$num,sp             @ "original" $num-1 value
+        sub     $ap,$ap,$tj             @ "rewind" ap to &ap[1]
+        sub     $np,$np,$tj             @ "rewind" np to &np[1]
+        ldr     $bi,[$tp,#4]!           @ *(++bp)
+        ldr     $aj,[$ap,#-4]           @ ap[0]
+        ldr     $nj,[$np,#-4]           @ np[0]
+        ldr     $alo,[sp]               @ tp[0]
+        ldr     $tj,[sp,#4]             @ tp[1]
+        mov     $ahi,#0
+        umlal   $alo,$ahi,$aj,$bi       @ ap[0]*bp[i]+tp[0]
+        str     $tp,[$_bp]              @ save bp
+        mul     $n0,$alo,$n0            @ per-pass multiplier: low word times n0
+        mov     $nlo,#0
+        umlal   $alo,$nlo,$nj,$n0       @ np[0]*n0+"tp[0]"
+        mov     $tp,sp                  @ rewind the tp cursor
+.Linner:
+        ldr     $aj,[$ap],#4            @ ap[j],ap++
+        adds    $alo,$ahi,$tj           @ +=tp[j]
+        mov     $ahi,#0
+        umlal   $alo,$ahi,$aj,$bi       @ ap[j]*bp[i]
+        ldr     $nj,[$np],#4            @ np[j],np++
+        mov     $nhi,#0
+        umlal   $nlo,$nhi,$nj,$n0       @ np[j]*n0
+        ldr     $tj,[$tp,#8]            @ tp[j+1]
+        adc     $ahi,$ahi,#0            @ C still comes from the adds above, nothing in between sets flags
+        adds    $nlo,$nlo,$alo          @ combine both chains
+        str     $nlo,[$tp],#4           @ tp[j-1]=,tp++
+        adc     $nlo,$nhi,#0            @ high word plus C is the next carry-in
+        cmp     $tp,$num                @ done once the cursor reaches &tp[num-1]
+        bne     .Linner
+        adds    $nlo,$nlo,$ahi          @ fold in the last high word
+        mov     $nhi,#0
+        adc     $nhi,$nhi,#0
+        adds    $nlo,$nlo,$tj           @ add the previous topmost word
+        adc     $nhi,$nhi,#0            @ new topmost carry
+        ldr     $tp,[$_bp]              @ restore bp
+        ldr     $tj,[$_bpend]           @ restore &bp[num-1], the end marker
+        str     $nlo,[$num]             @ tp[num-1]=
+        ldr     $n0,[$_n0]              @ restore n0
+        str     $nhi,[$num,#4]          @ tp[num]=
+        cmp     $tp,$tj                 @ last word of bp consumed?
+        bne     .Louter
+        ldr     $rp,[$_rp]              @ pull rp
+        add     $num,$num,#4            @ $num to point at &tp[num]
+        sub     $aj,$num,sp             @ "original" num value
+        mov     $tp,sp                  @ "rewind" $tp
+        mov     $ap,$tp                 @ "borrow" $ap
+        sub     $np,$np,$aj             @ "rewind" $np to &np[0]
+        subs    $tj,$tj,$tj             @ "clear" carry flag (sets C=1, which sbcs reads as no borrow)
+.Lsub:  ldr     $tj,[$tp],#4            @ tp[j]
+        ldr     $nj,[$np],#4            @ np[j]
+        sbcs    $tj,$tj,$nj             @ tp[j]-np[j]
+        str     $tj,[$rp],#4            @ rp[j]=
+        teq     $tp,$num                @ preserve carry
+        bne     .Lsub
+        sbcs    $nhi,$nhi,#0            @ upmost carry
+        mov     $tp,sp                  @ "rewind" $tp
+        sub     $rp,$rp,$aj             @ "rewind" $rp
+        and     $ap,$tp,$nhi            @ the result above acts as a select mask (expected 0 or all ones)
+        bic     $np,$rp,$nhi
+        orr     $ap,$ap,$np             @ ap=borrow?tp:rp
+.Lcopy: ldr     $tj,[$ap],#4            @ copy or in-place refresh
+        str     sp,[$tp],#4             @ zap tp (the word is overwritten with the sp value, not zero)
+        str     $tj,[$rp],#4            @ rp[j]=
+        cmp     $tp,$num                @ done once the cursor reaches &tp[num]
+        bne     .Lcopy
+        add     sp,$num,#4              @ skip over tp[num+1]
+        ldmia   sp!,{r4-r12,lr}         @ restore registers
+        add     sp,sp,#2*4              @ skip over {r0,r2}
+        mov     r0,#1                   @ return 1
+.Labrt: tst     lr,#1                   @ NOTE(review): bit 0 of lr presumably flags a Thumb caller, confirm
+        moveq   pc,lr                   @ be binary compatible with V4, yet
+        bx      lr                      @ interoperable with Thumb ISA:-)
+.size   bn_mul_mont,.-bn_mul_mont
+.asciz  "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+___
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
+print $code;    # emit the generated assembly on stdout
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl
new file mode 100644
index 0000000000..7849eae959
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/ppc-mont.pl
@@ -0,0 +1,323 @@
+#!/usr/bin/env perl
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+# April 2006
+# "Teaser" Montgomery multiplication module for PowerPC. It's possible
+# to gain a bit more by modulo-scheduling outer loop, then dedicated
+# squaring procedure should give further 20% and code can be adapted
+# for 32-bit application running on 64-bit CPU. As for the latter,
+# it won't be able to achieve "native" 64-bit performance, because in
+# 32-bit application context every addc instruction will have to be
+# expanded as addc, twice right shift by 32 and finally adde, etc.
+# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
+# for 64-bit application running on PPC970/G5 is:
+#
+# 512-bit       +65%    
+# 1024-bit      +35%
+# 2048-bit      +18%
+# 4096-bit      +4%
+$flavour = shift;       # first command-line argument; must contain 32 or 64
+if ($flavour =~ /32/) {
+        $BITS=  32;
+        $BNSZ=  $BITS/8;        # bytes per bignum word
+        $SIZE_T=4;              # bytes per saved-register slot
+        $RZONE= 224;            # added to $FRAME when sizing the stack allocation; presumably the ABI red zone - confirm
+        $FRAME= $SIZE_T*16;     # fixed frame of 16 slots; slots 4..15 hold r14-r25
+        $LD=    "lwz";          # load
+        $LDU=   "lwzu";         # load and update
+        $LDX=   "lwzx";         # load indexed
+        $ST=    "stw";          # store
+        $STU=   "stwu";         # store and update
+        $STX=   "stwx";         # store indexed
+        $STUX=  "stwux";        # store indexed and update
+        $UMULL= "mullw";        # unsigned multiply low
+        $UMULH= "mulhwu";       # unsigned multiply high
+        $UCMP=  "cmplw";        # unsigned compare
+        $SHRI=  "srwi";         # unsigned shift right by immediate
+        $PUSH=  $ST;            # register save uses the native-width store
+        $POP=   $LD;            # register restore uses the native-width load
+} elsif ($flavour =~ /64/) {
+        $BITS=  64;
+        $BNSZ=  $BITS/8;        # bytes per bignum word
+        $SIZE_T=8;              # bytes per saved-register slot
+        $RZONE= 288;            # added to $FRAME when sizing the stack allocation; presumably the ABI red zone - confirm
+        $FRAME= $SIZE_T*16;     # fixed frame of 16 slots; slots 4..15 hold r14-r25
+        # same as above, but 64-bit mnemonics...
+        $LD=    "ld";           # load
+        $LDU=   "ldu";          # load and update
+        $LDX=   "ldx";          # load indexed
+        $ST=    "std";          # store
+        $STU=   "stdu";         # store and update
+        $STX=   "stdx";         # store indexed
+        $STUX=  "stdux";        # store indexed and update
+        $UMULL= "mulld";        # unsigned multiply low
+        $UMULH= "mulhdu";       # unsigned multiply high
+        $UCMP=  "cmpld";        # unsigned compare
+        $SHRI=  "srdi";         # unsigned shift right by immediate
+        $PUSH=  $ST;            # register save uses the native-width store
+        $POP=   $LD;            # register restore uses the native-width load
+} else { die "nonsense $flavour"; }
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;  # directory part of this script's own path
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or        # translator next to this script ...
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or   # ... or in the shared perlasm directory
+die "can't locate ppc-xlate.pl";
+open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";     # pipe output through the translator; parenthesised open + "or" so a failed open is reported ("||" bound to the always-true command string)
+$sp="r1";       # stack pointer
+$toc="r2";
+$rp="r3";       $ovf="r3";      # r3 carries rp on entry; afterwards it holds the overflow bit and the return value
+$ap="r4";
+$bp="r5";
+$np="r6";
+$n0="r7";       # pointer to n0 on entry, replaced by the word it points at
+$num="r8";
+$rp="r9";       # $rp is reassigned
+$aj="r10";      # ap[j]
+$nj="r11";      # np[j]
+$tj="r12";      # tp[j]; also used as a scratch register
+# non-volatile registers
+$i="r14";       # outer loop byte offset into bp
+$j="r15";       # inner loop byte offset into ap/np
+$tp="r16";      # cursor into the scratch vector tp[]
+$m0="r17";      # bp[i]
+$m1="r18";      # per-pass multiplier, tp[0]*n0
+$lo0="r19";     # low word of the ap[]*bp[i] chain
+$hi0="r20";     # carry word of the ap[]*bp[i] chain
+$lo1="r21";     # low word of the np[]*m1 chain
+$hi1="r22";     # carry word of the np[]*m1 chain
+$alo="r23";     # ap[j]*bp[i], low half
+$ahi="r24";     # ap[j]*bp[i], high half
+$nlo="r25";     # np[j]*m1, low half
+#
+$nhi="r0";      # np[j]*m1, high half
+$code=<<___;
+.machine "any"
+.text
+.globl  .bn_mul_mont
+.align  4
+.bn_mul_mont:
+        cmpwi   $num,4
+        mr      $rp,r3          ; $rp is reassigned
+        li      r3,0            ; return value 0 for the early exit
+        bltlr                   ; num below 4 is refused
+        slwi    $num,$num,`log($BNSZ)/log(2)`
+        li      $tj,-4096       ; alignment mask
+        addi    $ovf,$num,`$FRAME+$RZONE`
+        subf    $ovf,$ovf,$sp   ; $sp-$ovf
+        and     $ovf,$ovf,$tj   ; minimize TLB usage
+        subf    $ovf,$sp,$ovf   ; $ovf-$sp
+        srwi    $num,$num,`log($BNSZ)/log(2)`
+        $STUX   $sp,$sp,$ovf    ; allocate: store the old stack pointer at the new top and update it
+        $PUSH   r14,`4*$SIZE_T`($sp)
+        $PUSH   r15,`5*$SIZE_T`($sp)
+        $PUSH   r16,`6*$SIZE_T`($sp)
+        $PUSH   r17,`7*$SIZE_T`($sp)
+        $PUSH   r18,`8*$SIZE_T`($sp)
+        $PUSH   r19,`9*$SIZE_T`($sp)
+        $PUSH   r20,`10*$SIZE_T`($sp)
+        $PUSH   r21,`11*$SIZE_T`($sp)
+        $PUSH   r22,`12*$SIZE_T`($sp)
+        $PUSH   r23,`13*$SIZE_T`($sp)
+        $PUSH   r24,`14*$SIZE_T`($sp)
+        $PUSH   r25,`15*$SIZE_T`($sp)
+        $LD     $n0,0($n0)      ; pull n0[0] value
+        addi    $num,$num,-2    ; adjust $num for counter register
+        $LD     $m0,0($bp)      ; m0=bp[0]
+        $LD     $aj,0($ap)      ; ap[0]
+        addi    $tp,$sp,$FRAME  ; tp[] starts right above the fixed frame
+        $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[0]
+        $UMULH  $hi0,$aj,$m0
+        $LD     $aj,$BNSZ($ap)  ; ap[1]
+        $LD     $nj,0($np)      ; np[0]
+        $UMULL  $m1,$lo0,$n0    ; "tp[0]"*n0
+        $UMULL  $alo,$aj,$m0    ; ap[1]*bp[0]
+        $UMULH  $ahi,$aj,$m0
+        $UMULL  $lo1,$nj,$m1    ; np[0]*m1
+        $UMULH  $hi1,$nj,$m1
+        $LD     $nj,$BNSZ($np)  ; np[1]
+        addc    $lo1,$lo1,$lo0
+        addze   $hi1,$hi1
+        $UMULL  $nlo,$nj,$m1    ; np[1]*m1
+        $UMULH  $nhi,$nj,$m1
+        mtctr   $num            ; CTR = num-2 inner iterations
+        li      $j,`2*$BNSZ`
+.align  4
+L1st:
+        $LDX    $aj,$ap,$j      ; ap[j]
+        addc    $lo0,$alo,$hi0  ; previous ap product + carry word
+        $LDX    $nj,$np,$j      ; np[j]
+        addze   $hi0,$ahi       ; new carry word of the ap chain
+        $UMULL  $alo,$aj,$m0    ; ap[j]*bp[0]
+        addc    $lo1,$nlo,$hi1  ; previous np product + carry word
+        $UMULH  $ahi,$aj,$m0
+        addze   $hi1,$nhi       ; new carry word of the np chain
+        $UMULL  $nlo,$nj,$m1    ; np[j]*m1
+        addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[0]
+        $UMULH  $nhi,$nj,$m1
+        addze   $hi1,$hi1
+        $ST     $lo1,0($tp)     ; tp[j-1]
+        addi    $j,$j,$BNSZ     ; j++
+        addi    $tp,$tp,$BNSZ   ; tp++
+        bdnz-   L1st
+;L1st
+        addc    $lo0,$alo,$hi0  ; drain the last products still in flight
+        addze   $hi0,$ahi
+        addc    $lo1,$nlo,$hi1
+        addze   $hi1,$nhi
+        addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[0]
+        addze   $hi1,$hi1
+        $ST     $lo1,0($tp)     ; tp[j-1]
+        li      $ovf,0
+        addc    $hi1,$hi1,$hi0  ; sum of both carry words
+        addze   $ovf,$ovf       ; upmost overflow bit
+        $ST     $hi1,$BNSZ($tp) ; topmost word of tp
+        li      $i,$BNSZ        ; i starts at word 1, bp[0] has been processed
+.align  4
+Louter:
+        $LDX    $m0,$bp,$i      ; m0=bp[i]
+        $LD     $aj,0($ap)      ; ap[0]
+        addi    $tp,$sp,$FRAME  ; rewind the tp cursor
+        $LD     $tj,$FRAME($sp) ; tp[0]
+        $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[i]
+        $UMULH  $hi0,$aj,$m0
+        $LD     $aj,$BNSZ($ap)  ; ap[1]
+        $LD     $nj,0($np)      ; np[0]
+        addc    $lo0,$lo0,$tj   ; ap[0]*bp[i]+tp[0]
+        $UMULL  $alo,$aj,$m0    ; ap[j]*bp[i]
+        addze   $hi0,$hi0
+        $UMULL  $m1,$lo0,$n0    ; tp[0]*n0
+        $UMULH  $ahi,$aj,$m0
+        $UMULL  $lo1,$nj,$m1    ; np[0]*m1
+        $UMULH  $hi1,$nj,$m1
+        $LD     $nj,$BNSZ($np)  ; np[1]
+        addc    $lo1,$lo1,$lo0
+        $UMULL  $nlo,$nj,$m1    ; np[1]*m1
+        addze   $hi1,$hi1
+        $UMULH  $nhi,$nj,$m1
+        mtctr   $num            ; CTR = num-2 inner iterations
+        li      $j,`2*$BNSZ`
+.align  4
+Linner:
+        $LDX    $aj,$ap,$j      ; ap[j]
+        addc    $lo0,$alo,$hi0
+        $LD     $tj,$BNSZ($tp)  ; tp[j]
+        addze   $hi0,$ahi
+        $LDX    $nj,$np,$j      ; np[j]
+        addc    $lo1,$nlo,$hi1
+        $UMULL  $alo,$aj,$m0    ; ap[j]*bp[i]
+        addze   $hi1,$nhi
+        $UMULH  $ahi,$aj,$m0
+        addc    $lo0,$lo0,$tj   ; ap[j]*bp[i]+tp[j]
+        $UMULL  $nlo,$nj,$m1    ; np[j]*m1
+        addze   $hi0,$hi0
+        $UMULH  $nhi,$nj,$m1
+        addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[i]+tp[j]
+        addi    $j,$j,$BNSZ     ; j++
+        addze   $hi1,$hi1
+        $ST     $lo1,0($tp)     ; tp[j-1]
+        addi    $tp,$tp,$BNSZ   ; tp++
+        bdnz-   Linner
+;Linner
+        $LD     $tj,$BNSZ($tp)  ; tp[j]
+        addc    $lo0,$alo,$hi0
+        addze   $hi0,$ahi
+        addc    $lo0,$lo0,$tj   ; ap[j]*bp[i]+tp[j]
+        addze   $hi0,$hi0
+        addc    $lo1,$nlo,$hi1
+        addze   $hi1,$nhi
+        addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[i]+tp[j]
+        addze   $hi1,$hi1
+        $ST     $lo1,0($tp)     ; tp[j-1]
+        addic   $ovf,$ovf,-1    ; move upmost overflow to XER[CA]
+        li      $ovf,0
+        adde    $hi1,$hi1,$hi0  ; sum of both carry words plus the previous overflow
+        addze   $ovf,$ovf       ; new upmost overflow bit
+        $ST     $hi1,$BNSZ($tp) ; topmost word of tp
+;
+        slwi    $tj,$num,`log($BNSZ)/log(2)`
+        $UCMP   $i,$tj          ; compare the current offset with (num-2) words in bytes
+        addi    $i,$i,$BNSZ     ; i++
+        ble-    Louter          ; branch uses the compare above, so the last pass is word num-1
+        addi    $num,$num,2     ; restore $num
+        subfc   $j,$j,$j        ; j=0 and "clear" XER[CA]
+        addi    $tp,$sp,$FRAME  ; rewind the tp cursor
+        mtctr   $num            ; CTR = num words
+.align  4
+Lsub:   $LDX    $tj,$tp,$j      ; tp[j]
+        $LDX    $nj,$np,$j      ; np[j]
+        subfe   $aj,$nj,$tj     ; tp[j]-np[j]
+        $STX    $aj,$rp,$j      ; rp[j] = difference
+        addi    $j,$j,$BNSZ     ; j++
+        bdnz-   Lsub
+        li      $j,0
+        mtctr   $num            ; CTR = num words for the copy loop
+        subfe   $ovf,$j,$ovf    ; handle upmost overflow bit
+        and     $ap,$tp,$ovf    ; the result above acts as a select mask (expected 0 or all ones)
+        andc    $np,$rp,$ovf
+        or      $ap,$ap,$np     ; ap=borrow?tp:rp
+.align  4
+Lcopy:                          ; copy or in-place refresh
+        $LDX    $tj,$ap,$j      ; word from the selected source
+        $STX    $tj,$rp,$j      ; rp[j]=
+        $STX    $j,$tp,$j       ; zap at once (the word is overwritten with the offset j)
+        addi    $j,$j,$BNSZ     ; j++
+        bdnz-   Lcopy
+        $POP    r14,`4*$SIZE_T`($sp)
+        $POP    r15,`5*$SIZE_T`($sp)
+        $POP    r16,`6*$SIZE_T`($sp)
+        $POP    r17,`7*$SIZE_T`($sp)
+        $POP    r18,`8*$SIZE_T`($sp)
+        $POP    r19,`9*$SIZE_T`($sp)
+        $POP    r20,`10*$SIZE_T`($sp)
+        $POP    r21,`11*$SIZE_T`($sp)
+        $POP    r22,`12*$SIZE_T`($sp)
+        $POP    r23,`13*$SIZE_T`($sp)
+        $POP    r24,`14*$SIZE_T`($sp)
+        $POP    r25,`15*$SIZE_T`($sp)
+        $POP    $sp,0($sp)      ; reload the stack pointer saved by the allocating store
+        li      r3,1            ; return 1
+        blr
+        .long   0
+.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+___
+$code =~ s/\`([^\`]*)\`/eval $1/gem;    # replace each backtick-quoted Perl expression by its value
+print $code;    # send the result to the translator pipe opened on STDOUT
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc64-mont.pl b/src/lib/libcrypto/bn/asm/ppc64-mont.pl
new file mode 100644
index 0000000000..3449b35855
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/ppc64-mont.pl
@@ -0,0 +1,918 @@
+#!/usr/bin/env perl
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+# December 2007
+# The reason for undertaking this effort is basically the following. Even though
+# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
+# performance was observed to be less than impressive, essentially as
+# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
+# Well, it's not surprising that IBM had to make some sacrifices to
+# boost the clock frequency that much, but no overall improvement?
+# Having observed how much difference did switching to FPU make on
+# UltraSPARC, playing same stunt on Power 6 appeared appropriate...
+# Unfortunately the resulting performance improvement is not as
+# impressive, ~30%, and in absolute terms is still very far from what
+# one would expect from 4.7GHz CPU. There is a chance that I'm doing
+# something wrong, but in the lack of assembler level micro-profiling
+# data or at least decent platform guide I can't tell... Or better
+# results might be achieved with VMX... Anyway, this module provides
+# *worse* performance on other PowerPC implementations, ~40-15% slower
+# on PPC970 depending on key length and ~40% slower on Power 5 for all
+# key lengths. As it's obviously inappropriate as "best all-round"
+# alternative, it has to be complemented with run-time CPU family
+# detection. Oh! It should also be noted that, unlike on other PowerPC
+# implementations, the IALU ppc-mont.pl module performs *suboptimally* on
+# >=1024-bit key lengths on Power 6. It should also be noted that
+# *everything* said so far applies to 64-bit builds! As far as 32-bit
+# application executed on 64-bit CPU goes, this module is likely to
+# become preferred choice, because it's easy to adapt it for such
+# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
+# February 2008
+# Micro-profiling assisted optimization results in ~15% improvement
+# over original ppc64-mont.pl version, or overall ~50% improvement
+# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
+# Power 6 CPU, this module is 5-150% faster depending on key length,
+# [hereafter] more for longer keys. But if compared to ppc-mont.pl
+# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
+# in absolute terms, but it's apparently the way Power 6 is...
+# The first command-line argument names the target flavour.  It selects
+# the pointer size, the $RZONE reserve, the exported symbol name and the
+# load/store mnemonics used for the frame.  A flavour containing "32" is
+# tested before one containing "64".
+$flavour = shift;
+if ($flavour =~ /32/) {
+        # 32-bit parameters are filled in, but generation is refused below
+        ($SIZE_T,$RZONE)   = (4,224);
+        $fname             = "bn_mul_mont_ppc64";
+        ($STUX,$PUSH,$POP) = ("stwux","stw","lwz");   # $STUX: store indexed and update
+        $FRAME             = $SIZE_T*12+8*12;
+        die "not implemented yet";
+} elsif ($flavour =~ /64/) {
+        # same as above, but 64-bit mnemonics...
+        ($SIZE_T,$RZONE)   = (8,288);
+        $fname             = "bn_mul_mont";
+        ($STUX,$PUSH,$POP) = ("stdux","std","ld");    # $STUX: store indexed and update
+        $FRAME             = $SIZE_T*12+8*12;
+} else {
+        die "nonsense $flavour";
+}
+# Locate ppc-xlate.pl relative to this script: first next to it, then in
+# ../../perlasm/.  $dir keeps its trailing path separator and is made
+# explicitly empty when the script was invoked without a directory part
+# (previously it silently relied on $1 being undefined in that case).
+$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+# Route everything printed to STDOUT through the translator; the next
+# command-line argument is appended to the translator's command line.
+# Parentheses plus low-precedence "or" make the die guard the open() call
+# itself; with "||" it was attached to the always-true command string and
+# a failed open went unreported.
+open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";
+# Round the fixed part of the frame up to a multiple of 64 bytes and
+# size the gpr<->fpr transfer zone (16 doublewords).
+$FRAME += 63;
+$FRAME &= ~63;
+$TRANSFER = 16*8;
+# General-purpose register roles.  r3 is used for the overflow word
+# ($ovf); the result pointer lives in r9 instead, the generated code
+# copies r3 there on entry.
+($carry,$sp,$toc,$ovf) = ("r0","r1","r2","r3");
+($ap,$bp,$np,$n0,$num) = map("r$_",4..8);
+($rp,$tp,$j,$i)        = map("r$_",9..12);
+# non-volatile registers
+$nap_d = "r14";         # interleaved ap and np in double format
+$a0    = "r15";         # ap[0]
+($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7) = map("r$_",16..23);  # temporary registers
+# PPC offers enough register bank capacity to unroll inner loops twice
+#
+#     ..A3A2A1A0
+#           dcba
+#    -----------
+#            A0a
+#           A0b
+#          A0c
+#         A0d
+#          A1a
+#         A1b
+#        A1c
+#       A1d
+#        A2a
+#       A2b
+#      A2c
+#     A2d
+#      A3a
+#     A3b
+#    A3c
+#   A3d
+#    ..a
+#   ..b
+#
+# Floating-point register roles, as used by the generated code:
+#   ba..bd      the four 16-bit limbs of the current bp[i]
+#   na..nd      the four 16-bit limbs of the current Montgomery factor
+#   dota,dotb   topmost partial sums, carried into the next pair
+#   A0..A3      a[j],a[j+1] as 32-bit halves; N0..N3 likewise for n[]
+#   T0a..T3b    the eight partial-sum accumulators
+($ba,$bb,$bc,$bd) = map("f$_",0..3);
+($na,$nb,$nc,$nd) = map("f$_",4..7);
+($dota,$dotb)     = ("f8","f9");
+($A0,$A1,$A2,$A3) = map("f$_",10..13);
+($N0,$N1,$N2,$N3) = map("f$_",14..17);
+($T0a,$T0b,$T1a,$T1b,$T2a,$T2b,$T3a,$T3b) = map("f$_",18..25);
+# sp----------->+-------------------------------+
+#               | saved sp                      |
+#               +-------------------------------+
+#               |                               |
+#               +-------------------------------+
+#               | 10 saved gpr, r14-r23         |
+#               .                               .
+#               .                               .
+#   +12*size_t  +-------------------------------+
+#               | 12 saved fpr, f14-f25         |
+#               .                               .
+#               .                               .
+#   +12*8       +-------------------------------+
+#               | padding to 64 byte boundary   |
+#               .                               .
+#   +X          +-------------------------------+
+#               | 16 gpr<->fpr transfer zone    |
+#               .                               .
+#               .                               .
+#   +16*8       +-------------------------------+
+#               | __int64 tmp[-1]               |
+#               +-------------------------------+
+#               | __int64 tmp[num]              |
+#               .                               .
+#               .                               .
+#               .                               .
+#   +(num+1)*8  +-------------------------------+
+#               | padding to 64 byte boundary   |
+#               .                               .
+#   +X          +-------------------------------+
+#               | double nap_d[4*num]           |
+#               .                               .
+#               .                               .
+#               .                               .
+#               +-------------------------------+
+$code=<<___;
+.machine "any"
+.text
+.globl  .$fname
+.align  5
+.$fname:
+        ; Entry checks: return 0 ("not handled") in r3 when num is below 4
+        ; or odd. Otherwise num is turned into a byte count and a frame is
+        ; carved out below sp that holds the register save area, the
+        ; transfer zone, tp[] and nap_d[].
+        cmpwi   $num,4
+        mr      $rp,r3          ; $rp is reassigned
+        li      r3,0            ; possible "not handled" return code
+        bltlr-
+        andi.   r0,$num,1       ; $num has to be even
+        bnelr-
+        slwi    $num,$num,3     ; num*=8
+        li      $i,-4096
+        slwi    $tp,$num,2      ; place for {an}p_{lh}[num], i.e. 4*num
+        add     $tp,$tp,$num    ; place for tp[num+1]
+        addi    $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
+        subf    $tp,$tp,$sp     ; $sp-$tp
+        and     $tp,$tp,$i      ; minimize TLB usage
+        subf    $tp,$sp,$tp     ; $tp-$sp
+        $STUX   $sp,$sp,$tp     ; alloca
+        ; save the non-volatile GPRs r14-r23 and FPRs f14-f25 in the new frame
+        $PUSH   r14,`2*$SIZE_T`($sp)
+        $PUSH   r15,`3*$SIZE_T`($sp)
+        $PUSH   r16,`4*$SIZE_T`($sp)
+        $PUSH   r17,`5*$SIZE_T`($sp)
+        $PUSH   r18,`6*$SIZE_T`($sp)
+        $PUSH   r19,`7*$SIZE_T`($sp)
+        $PUSH   r20,`8*$SIZE_T`($sp)
+        $PUSH   r21,`9*$SIZE_T`($sp)
+        $PUSH   r22,`10*$SIZE_T`($sp)
+        $PUSH   r23,`11*$SIZE_T`($sp)
+        stfd    f14,`12*$SIZE_T+0`($sp)
+        stfd    f15,`12*$SIZE_T+8`($sp)
+        stfd    f16,`12*$SIZE_T+16`($sp)
+        stfd    f17,`12*$SIZE_T+24`($sp)
+        stfd    f18,`12*$SIZE_T+32`($sp)
+        stfd    f19,`12*$SIZE_T+40`($sp)
+        stfd    f20,`12*$SIZE_T+48`($sp)
+        stfd    f21,`12*$SIZE_T+56`($sp)
+        stfd    f22,`12*$SIZE_T+64`($sp)
+        stfd    f23,`12*$SIZE_T+72`($sp)
+        stfd    f24,`12*$SIZE_T+80`($sp)
+        stfd    f25,`12*$SIZE_T+88`($sp)
+        ; First pass (i=0): fetch ap[0], n0 and bp[0], form the Montgomery
+        ; factor (ap[0]*bp[0])*n0, point nap_d at a 64-byte aligned area
+        ; past tp[] and set the loop counter to num/2-1 (the first pair of
+        ; words is handled here, ahead of the L1st loop).
+        ld      $a0,0($ap)      ; pull ap[0] value
+        ld      $n0,0($n0)      ; pull n0[0] value
+        ld      $t3,0($bp)      ; bp[0]
+        addi    $tp,$sp,`$FRAME+$TRANSFER+8+64`
+        li      $i,-64
+        add     $nap_d,$tp,$num
+        and     $nap_d,$nap_d,$i        ; align to 64 bytes
+        mulld   $t7,$a0,$t3     ; ap[0]*bp[0]
+        ; nap_d is off by 1, because it's used with stfdu/lfdu
+        addi    $nap_d,$nap_d,-8
+        srwi    $j,$num,`3+1`   ; counter register, num/2
+        mulld   $t7,$t7,$n0     ; tp[0]*n0
+        addi    $j,$j,-1
+        addi    $tp,$sp,`$FRAME+$TRANSFER-8`
+        li      $carry,0
+        mtctr   $j
+        ; transfer bp[0] to FPU as 4x16-bit values
+        extrdi  $t0,$t3,16,48
+        extrdi  $t1,$t3,16,32
+        extrdi  $t2,$t3,16,16
+        extrdi  $t3,$t3,16,0
+        std     $t0,`$FRAME+0`($sp)
+        std     $t1,`$FRAME+8`($sp)
+        std     $t2,`$FRAME+16`($sp)
+        std     $t3,`$FRAME+24`($sp)
+        ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
+        extrdi  $t4,$t7,16,48
+        extrdi  $t5,$t7,16,32
+        extrdi  $t6,$t7,16,16
+        extrdi  $t7,$t7,16,0
+        std     $t4,`$FRAME+32`($sp)
+        std     $t5,`$FRAME+40`($sp)
+        std     $t6,`$FRAME+48`($sp)
+        std     $t7,`$FRAME+56`($sp)
+        ; each 64-bit word is fetched as two 32-bit halves, offset 4 first
+        ; (NOTE(review): this presumes big-endian word layout - confirm)
+        lwz     $t0,4($ap)              ; load a[j] as 32-bit word pair
+        lwz     $t1,0($ap)
+        lwz     $t2,12($ap)             ; load a[j+1] as 32-bit word pair
+        lwz     $t3,8($ap)
+        lwz     $t4,4($np)              ; load n[j] as 32-bit word pair
+        lwz     $t5,0($np)
+        lwz     $t6,12($np)             ; load n[j+1] as 32-bit word pair
+        lwz     $t7,8($np)
+        lfd     $ba,`$FRAME+0`($sp)
+        lfd     $bb,`$FRAME+8`($sp)
+        lfd     $bc,`$FRAME+16`($sp)
+        lfd     $bd,`$FRAME+24`($sp)
+        lfd     $na,`$FRAME+32`($sp)
+        lfd     $nb,`$FRAME+40`($sp)
+        lfd     $nc,`$FRAME+48`($sp)
+        lfd     $nd,`$FRAME+56`($sp)
+        std     $t0,`$FRAME+64`($sp)
+        std     $t1,`$FRAME+72`($sp)
+        std     $t2,`$FRAME+80`($sp)
+        std     $t3,`$FRAME+88`($sp)
+        std     $t4,`$FRAME+96`($sp)
+        std     $t5,`$FRAME+104`($sp)
+        std     $t6,`$FRAME+112`($sp)
+        std     $t7,`$FRAME+120`($sp)
+        ; the values went through the transfer zone as 64-bit integers;
+        ; fcfid turns them into doubles
+        fcfid   $ba,$ba
+        fcfid   $bb,$bb
+        fcfid   $bc,$bc
+        fcfid   $bd,$bd
+        fcfid   $na,$na
+        fcfid   $nb,$nb
+        fcfid   $nc,$nc
+        fcfid   $nd,$nd
+        lfd     $A0,`$FRAME+64`($sp)
+        lfd     $A1,`$FRAME+72`($sp)
+        lfd     $A2,`$FRAME+80`($sp)
+        lfd     $A3,`$FRAME+88`($sp)
+        lfd     $N0,`$FRAME+96`($sp)
+        lfd     $N1,`$FRAME+104`($sp)
+        lfd     $N2,`$FRAME+112`($sp)
+        lfd     $N3,`$FRAME+120`($sp)
+        fcfid   $A0,$A0
+        fcfid   $A1,$A1
+        fcfid   $A2,$A2
+        fcfid   $A3,$A3
+        fcfid   $N0,$N0
+        fcfid   $N1,$N1
+        fcfid   $N2,$N2
+        fcfid   $N3,$N3
+        addi    $ap,$ap,16
+        addi    $np,$np,16
+        ; partial products of the 32-bit halves with the 16-bit limbs,
+        ; accumulated per column; the doubles are also saved to nap_d so
+        ; later passes can reload them without converting again
+        fmul    $T1a,$A1,$ba
+        fmul    $T1b,$A1,$bb
+        stfd    $A0,8($nap_d)           ; save a[j] in double format
+        stfd    $A1,16($nap_d)
+        fmul    $T2a,$A2,$ba
+        fmul    $T2b,$A2,$bb
+        stfd    $A2,24($nap_d)          ; save a[j+1] in double format
+        stfd    $A3,32($nap_d)
+        fmul    $T3a,$A3,$ba
+        fmul    $T3b,$A3,$bb
+        stfd    $N0,40($nap_d)          ; save n[j] in double format
+        stfd    $N1,48($nap_d)
+        fmul    $T0a,$A0,$ba
+        fmul    $T0b,$A0,$bb
+        stfd    $N2,56($nap_d)          ; save n[j+1] in double format
+        stfdu   $N3,64($nap_d)
+        fmadd   $T1a,$A0,$bc,$T1a
+        fmadd   $T1b,$A0,$bd,$T1b
+        fmadd   $T2a,$A1,$bc,$T2a
+        fmadd   $T2b,$A1,$bd,$T2b
+        fmadd   $T3a,$A2,$bc,$T3a
+        fmadd   $T3b,$A2,$bd,$T3b
+        fmul    $dota,$A3,$bc
+        fmul    $dotb,$A3,$bd
+        fmadd   $T1a,$N1,$na,$T1a
+        fmadd   $T1b,$N1,$nb,$T1b
+        fmadd   $T2a,$N2,$na,$T2a
+        fmadd   $T2b,$N2,$nb,$T2b
+        fmadd   $T3a,$N3,$na,$T3a
+        fmadd   $T3b,$N3,$nb,$T3b
+        fmadd   $T0a,$N0,$na,$T0a
+        fmadd   $T0b,$N0,$nb,$T0b
+        fmadd   $T1a,$N0,$nc,$T1a
+        fmadd   $T1b,$N0,$nd,$T1b
+        fmadd   $T2a,$N1,$nc,$T2a
+        fmadd   $T2b,$N1,$nd,$T2b
+        fmadd   $T3a,$N2,$nc,$T3a
+        fmadd   $T3b,$N2,$nd,$T3b
+        fmadd   $dota,$N3,$nc,$dota
+        fmadd   $dotb,$N3,$nd,$dotb
+        ; convert the eight column sums back to 64-bit integers and park
+        ; them in the transfer zone for the integer side of L1st
+        fctid   $T0a,$T0a
+        fctid   $T0b,$T0b
+        fctid   $T1a,$T1a
+        fctid   $T1b,$T1b
+        fctid   $T2a,$T2a
+        fctid   $T2b,$T2b
+        fctid   $T3a,$T3a
+        fctid   $T3b,$T3b
+        stfd    $T0a,`$FRAME+0`($sp)
+        stfd    $T0b,`$FRAME+8`($sp)
+        stfd    $T1a,`$FRAME+16`($sp)
+        stfd    $T1b,`$FRAME+24`($sp)
+        stfd    $T2a,`$FRAME+32`($sp)
+        stfd    $T2b,`$FRAME+40`($sp)
+        stfd    $T3a,`$FRAME+48`($sp)
+        stfd    $T3b,`$FRAME+56`($sp)
+.align  5
+L1st:
+        ; One iteration per further pair a[j],a[j+1] / n[j],n[j+1]: the
+        ; pair is converted to doubles (and saved to nap_d) and its column
+        ; sums are formed. Meanwhile the instructions indented by one
+        ; extra column fold the PREVIOUS iteration's eight sums, 16 bits
+        ; at a time with carry, into two 64-bit words that go to tp[].
+        lwz     $t0,4($ap)              ; load a[j] as 32-bit word pair
+        lwz     $t1,0($ap)
+        lwz     $t2,12($ap)             ; load a[j+1] as 32-bit word pair
+        lwz     $t3,8($ap)
+        lwz     $t4,4($np)              ; load n[j] as 32-bit word pair
+        lwz     $t5,0($np)
+        lwz     $t6,12($np)             ; load n[j+1] as 32-bit word pair
+        lwz     $t7,8($np)
+        std     $t0,`$FRAME+64`($sp)
+        std     $t1,`$FRAME+72`($sp)
+        std     $t2,`$FRAME+80`($sp)
+        std     $t3,`$FRAME+88`($sp)
+        std     $t4,`$FRAME+96`($sp)
+        std     $t5,`$FRAME+104`($sp)
+        std     $t6,`$FRAME+112`($sp)
+        std     $t7,`$FRAME+120`($sp)
+        ; pick up the previous iteration's column sums as integers
+        ld      $t0,`$FRAME+0`($sp)
+        ld      $t1,`$FRAME+8`($sp)
+        ld      $t2,`$FRAME+16`($sp)
+        ld      $t3,`$FRAME+24`($sp)
+        ld      $t4,`$FRAME+32`($sp)
+        ld      $t5,`$FRAME+40`($sp)
+        ld      $t6,`$FRAME+48`($sp)
+        ld      $t7,`$FRAME+56`($sp)
+        lfd     $A0,`$FRAME+64`($sp)
+        lfd     $A1,`$FRAME+72`($sp)
+        lfd     $A2,`$FRAME+80`($sp)
+        lfd     $A3,`$FRAME+88`($sp)
+        lfd     $N0,`$FRAME+96`($sp)
+        lfd     $N1,`$FRAME+104`($sp)
+        lfd     $N2,`$FRAME+112`($sp)
+        lfd     $N3,`$FRAME+120`($sp)
+        fcfid   $A0,$A0
+        fcfid   $A1,$A1
+        fcfid   $A2,$A2
+        fcfid   $A3,$A3
+        fcfid   $N0,$N0
+        fcfid   $N1,$N1
+        fcfid   $N2,$N2
+        fcfid   $N3,$N3
+        addi    $ap,$ap,16
+        addi    $np,$np,16
+        fmul    $T1a,$A1,$ba
+        fmul    $T1b,$A1,$bb
+        fmul    $T2a,$A2,$ba
+        fmul    $T2b,$A2,$bb
+        stfd    $A0,8($nap_d)           ; save a[j] in double format
+        stfd    $A1,16($nap_d)
+        fmul    $T3a,$A3,$ba
+        fmul    $T3b,$A3,$bb
+        ; the two lowest columns start from dota/dotb left by the previous pair
+        fmadd   $T0a,$A0,$ba,$dota
+        fmadd   $T0b,$A0,$bb,$dotb
+        stfd    $A2,24($nap_d)          ; save a[j+1] in double format
+        stfd    $A3,32($nap_d)
+        fmadd   $T1a,$A0,$bc,$T1a
+        fmadd   $T1b,$A0,$bd,$T1b
+        fmadd   $T2a,$A1,$bc,$T2a
+        fmadd   $T2b,$A1,$bd,$T2b
+        stfd    $N0,40($nap_d)          ; save n[j] in double format
+        stfd    $N1,48($nap_d)
+        fmadd   $T3a,$A2,$bc,$T3a
+        fmadd   $T3b,$A2,$bd,$T3b
+         add    $t0,$t0,$carry          ; can not overflow
+        fmul    $dota,$A3,$bc
+        fmul    $dotb,$A3,$bd
+        stfd    $N2,56($nap_d)          ; save n[j+1] in double format
+        stfdu   $N3,64($nap_d)
+         srdi   $carry,$t0,16
+         add    $t1,$t1,$carry
+         srdi   $carry,$t1,16
+        fmadd   $T1a,$N1,$na,$T1a
+        fmadd   $T1b,$N1,$nb,$T1b
+         insrdi $t0,$t1,16,32
+        fmadd   $T2a,$N2,$na,$T2a
+        fmadd   $T2b,$N2,$nb,$T2b
+         add    $t2,$t2,$carry
+        fmadd   $T3a,$N3,$na,$T3a
+        fmadd   $T3b,$N3,$nb,$T3b
+         srdi   $carry,$t2,16
+        fmadd   $T0a,$N0,$na,$T0a
+        fmadd   $T0b,$N0,$nb,$T0b
+         insrdi $t0,$t2,16,16
+         add    $t3,$t3,$carry
+         srdi   $carry,$t3,16
+        fmadd   $T1a,$N0,$nc,$T1a
+        fmadd   $T1b,$N0,$nd,$T1b
+         insrdi $t0,$t3,16,0            ; 0..63 bits
+        fmadd   $T2a,$N1,$nc,$T2a
+        fmadd   $T2b,$N1,$nd,$T2b
+         add    $t4,$t4,$carry
+        fmadd   $T3a,$N2,$nc,$T3a
+        fmadd   $T3b,$N2,$nd,$T3b
+         srdi   $carry,$t4,16
+        fmadd   $dota,$N3,$nc,$dota
+        fmadd   $dotb,$N3,$nd,$dotb
+         add    $t5,$t5,$carry
+         srdi   $carry,$t5,16
+         insrdi $t4,$t5,16,32
+        fctid   $T0a,$T0a
+        fctid   $T0b,$T0b
+         add    $t6,$t6,$carry
+        fctid   $T1a,$T1a
+        fctid   $T1b,$T1b
+         srdi   $carry,$t6,16
+        fctid   $T2a,$T2a
+        fctid   $T2b,$T2b
+         insrdi $t4,$t6,16,16
+        fctid   $T3a,$T3a
+        fctid   $T3b,$T3b
+         add    $t7,$t7,$carry
+         insrdi $t4,$t7,16,0            ; 64..127 bits
+         srdi   $carry,$t7,16           ; upper 33 bits
+        stfd    $T0a,`$FRAME+0`($sp)
+        stfd    $T0b,`$FRAME+8`($sp)
+        stfd    $T1a,`$FRAME+16`($sp)
+        stfd    $T1b,`$FRAME+24`($sp)
+        stfd    $T2a,`$FRAME+32`($sp)
+        stfd    $T2b,`$FRAME+40`($sp)
+        stfd    $T3a,`$FRAME+48`($sp)
+        stfd    $T3b,`$FRAME+56`($sp)
+         std    $t0,8($tp)              ; tp[j-1]
+         stdu   $t4,16($tp)             ; tp[j]
+        bdnz-   L1st
+        ; L1st tail: drain the column sums of the last pair, then fold
+        ; dota/dotb into tp[num-1]; whatever lies above bit 63 of that
+        ; word is kept in ovf for the next pass.
+        fctid   $dota,$dota
+        fctid   $dotb,$dotb
+        ld      $t0,`$FRAME+0`($sp)
+        ld      $t1,`$FRAME+8`($sp)
+        ld      $t2,`$FRAME+16`($sp)
+        ld      $t3,`$FRAME+24`($sp)
+        ld      $t4,`$FRAME+32`($sp)
+        ld      $t5,`$FRAME+40`($sp)
+        ld      $t6,`$FRAME+48`($sp)
+        ld      $t7,`$FRAME+56`($sp)
+        stfd    $dota,`$FRAME+64`($sp)
+        stfd    $dotb,`$FRAME+72`($sp)
+        add     $t0,$t0,$carry          ; can not overflow
+        srdi    $carry,$t0,16
+        add     $t1,$t1,$carry
+        srdi    $carry,$t1,16
+        insrdi  $t0,$t1,16,32
+        add     $t2,$t2,$carry
+        srdi    $carry,$t2,16
+        insrdi  $t0,$t2,16,16
+        add     $t3,$t3,$carry
+        srdi    $carry,$t3,16
+        insrdi  $t0,$t3,16,0            ; 0..63 bits
+        add     $t4,$t4,$carry
+        srdi    $carry,$t4,16
+        add     $t5,$t5,$carry
+        srdi    $carry,$t5,16
+        insrdi  $t4,$t5,16,32
+        add     $t6,$t6,$carry
+        srdi    $carry,$t6,16
+        insrdi  $t4,$t6,16,16
+        add     $t7,$t7,$carry
+        insrdi  $t4,$t7,16,0            ; 64..127 bits
+        srdi    $carry,$t7,16           ; upper 33 bits
+        ld      $t6,`$FRAME+64`($sp)
+        ld      $t7,`$FRAME+72`($sp)
+        std     $t0,8($tp)              ; tp[j-1]
+        stdu    $t4,16($tp)             ; tp[j]
+        add     $t6,$t6,$carry          ; can not overflow
+        srdi    $carry,$t6,16
+        add     $t7,$t7,$carry
+        insrdi  $t6,$t7,48,0
+        srdi    $ovf,$t7,48
+        std     $t6,8($tp)              ; tp[num-1]
+        ; nap_d advanced 64 bytes per pair, 4*num bytes in total
+        slwi    $t7,$num,2
+        subf    $nap_d,$t7,$nap_d       ; rewind pointer
+        li      $i,8                    ; i=1
+.align  5
+Louter:
+        ; One pass per remaining word bp[i]; i is a byte offset that
+        ; starts at 8 and is compared against num (also in bytes) at the
+        ; bottom of the pass. The first pair of each pass is computed here
+        ; from the doubles saved in nap_d; Linner handles the rest.
+        ldx     $t3,$bp,$i      ; bp[i]
+        ld      $t6,`$FRAME+$TRANSFER+8`($sp)   ; tp[0]
+        mulld   $t7,$a0,$t3     ; ap[0]*bp[i]
+        addi    $tp,$sp,`$FRAME+$TRANSFER`
+        add     $t7,$t7,$t6     ; ap[0]*bp[i]+tp[0]
+        li      $carry,0
+        mulld   $t7,$t7,$n0     ; tp[0]*n0
+        mtctr   $j
+        ; transfer bp[i] to FPU as 4x16-bit values
+        extrdi  $t0,$t3,16,48
+        extrdi  $t1,$t3,16,32
+        extrdi  $t2,$t3,16,16
+        extrdi  $t3,$t3,16,0
+        std     $t0,`$FRAME+0`($sp)
+        std     $t1,`$FRAME+8`($sp)
+        std     $t2,`$FRAME+16`($sp)
+        std     $t3,`$FRAME+24`($sp)
+        ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
+        extrdi  $t4,$t7,16,48
+        extrdi  $t5,$t7,16,32
+        extrdi  $t6,$t7,16,16
+        extrdi  $t7,$t7,16,0
+        std     $t4,`$FRAME+32`($sp)
+        std     $t5,`$FRAME+40`($sp)
+        std     $t6,`$FRAME+48`($sp)
+        std     $t7,`$FRAME+56`($sp)
+        lfd     $A0,8($nap_d)           ; load a[j] in double format
+        lfd     $A1,16($nap_d)
+        lfd     $A2,24($nap_d)          ; load a[j+1] in double format
+        lfd     $A3,32($nap_d)
+        lfd     $N0,40($nap_d)          ; load n[j] in double format
+        lfd     $N1,48($nap_d)
+        lfd     $N2,56($nap_d)          ; load n[j+1] in double format
+        lfdu    $N3,64($nap_d)
+        lfd     $ba,`$FRAME+0`($sp)
+        lfd     $bb,`$FRAME+8`($sp)
+        lfd     $bc,`$FRAME+16`($sp)
+        lfd     $bd,`$FRAME+24`($sp)
+        lfd     $na,`$FRAME+32`($sp)
+        lfd     $nb,`$FRAME+40`($sp)
+        lfd     $nc,`$FRAME+48`($sp)
+        lfd     $nd,`$FRAME+56`($sp)
+        fcfid   $ba,$ba
+        fcfid   $bb,$bb
+        fcfid   $bc,$bc
+        fcfid   $bd,$bd
+        fcfid   $na,$na
+        fcfid   $nb,$nb
+        fcfid   $nc,$nc
+        fcfid   $nd,$nd
+        fmul    $T1a,$A1,$ba
+        fmul    $T1b,$A1,$bb
+        fmul    $T2a,$A2,$ba
+        fmul    $T2b,$A2,$bb
+        fmul    $T3a,$A3,$ba
+        fmul    $T3b,$A3,$bb
+        fmul    $T0a,$A0,$ba
+        fmul    $T0b,$A0,$bb
+        fmadd   $T1a,$A0,$bc,$T1a
+        fmadd   $T1b,$A0,$bd,$T1b
+        fmadd   $T2a,$A1,$bc,$T2a
+        fmadd   $T2b,$A1,$bd,$T2b
+        fmadd   $T3a,$A2,$bc,$T3a
+        fmadd   $T3b,$A2,$bd,$T3b
+        fmul    $dota,$A3,$bc
+        fmul    $dotb,$A3,$bd
+        fmadd   $T1a,$N1,$na,$T1a
+        fmadd   $T1b,$N1,$nb,$T1b
+        ; the extra-indented loads below fetch the NEXT pair of a[] values
+        ; (nap_d was already advanced by lfdu) for the first Linner iteration
+         lfd    $A0,8($nap_d)           ; load a[j] in double format
+         lfd    $A1,16($nap_d)
+        fmadd   $T2a,$N2,$na,$T2a
+        fmadd   $T2b,$N2,$nb,$T2b
+         lfd    $A2,24($nap_d)          ; load a[j+1] in double format
+         lfd    $A3,32($nap_d)
+        fmadd   $T3a,$N3,$na,$T3a
+        fmadd   $T3b,$N3,$nb,$T3b
+        fmadd   $T0a,$N0,$na,$T0a
+        fmadd   $T0b,$N0,$nb,$T0b
+        fmadd   $T1a,$N0,$nc,$T1a
+        fmadd   $T1b,$N0,$nd,$T1b
+        fmadd   $T2a,$N1,$nc,$T2a
+        fmadd   $T2b,$N1,$nd,$T2b
+        fmadd   $T3a,$N2,$nc,$T3a
+        fmadd   $T3b,$N2,$nd,$T3b
+        fmadd   $dota,$N3,$nc,$dota
+        fmadd   $dotb,$N3,$nd,$dotb
+        fctid   $T0a,$T0a
+        fctid   $T0b,$T0b
+        fctid   $T1a,$T1a
+        fctid   $T1b,$T1b
+        fctid   $T2a,$T2a
+        fctid   $T2b,$T2b
+        fctid   $T3a,$T3a
+        fctid   $T3b,$T3b
+        stfd    $T0a,`$FRAME+0`($sp)
+        stfd    $T0b,`$FRAME+8`($sp)
+        stfd    $T1a,`$FRAME+16`($sp)
+        stfd    $T1b,`$FRAME+24`($sp)
+        stfd    $T2a,`$FRAME+32`($sp)
+        stfd    $T2b,`$FRAME+40`($sp)
+        stfd    $T3a,`$FRAME+48`($sp)
+        stfd    $T3b,`$FRAME+56`($sp)
+.align  5
+Linner:
+        ; Same column arithmetic as L1st, except that a[] and n[] come
+        ; from nap_d already in double format. On the integer side (extra
+        ; indented) the previous iteration's sums are assembled into two
+        ; 64-bit words, added to the old tp[j],tp[j+1] and stored one
+        ; word lower, at tp[j-1],tp[j].
+        fmul    $T1a,$A1,$ba
+        fmul    $T1b,$A1,$bb
+        fmul    $T2a,$A2,$ba
+        fmul    $T2b,$A2,$bb
+        lfd     $N0,40($nap_d)          ; load n[j] in double format
+        lfd     $N1,48($nap_d)
+        fmul    $T3a,$A3,$ba
+        fmul    $T3b,$A3,$bb
+        fmadd   $T0a,$A0,$ba,$dota
+        fmadd   $T0b,$A0,$bb,$dotb
+        lfd     $N2,56($nap_d)          ; load n[j+1] in double format
+        lfdu    $N3,64($nap_d)
+        fmadd   $T1a,$A0,$bc,$T1a
+        fmadd   $T1b,$A0,$bd,$T1b
+        fmadd   $T2a,$A1,$bc,$T2a
+        fmadd   $T2b,$A1,$bd,$T2b
+         lfd    $A0,8($nap_d)           ; load a[j] in double format
+         lfd    $A1,16($nap_d)
+        fmadd   $T3a,$A2,$bc,$T3a
+        fmadd   $T3b,$A2,$bd,$T3b
+        fmul    $dota,$A3,$bc
+        fmul    $dotb,$A3,$bd
+         lfd    $A2,24($nap_d)          ; load a[j+1] in double format
+         lfd    $A3,32($nap_d)
+        fmadd   $T1a,$N1,$na,$T1a
+        fmadd   $T1b,$N1,$nb,$T1b
+         ld     $t0,`$FRAME+0`($sp)
+         ld     $t1,`$FRAME+8`($sp)
+        fmadd   $T2a,$N2,$na,$T2a
+        fmadd   $T2b,$N2,$nb,$T2b
+         ld     $t2,`$FRAME+16`($sp)
+         ld     $t3,`$FRAME+24`($sp)
+        fmadd   $T3a,$N3,$na,$T3a
+        fmadd   $T3b,$N3,$nb,$T3b
+         add    $t0,$t0,$carry          ; can not overflow
+         ld     $t4,`$FRAME+32`($sp)
+         ld     $t5,`$FRAME+40`($sp)
+        fmadd   $T0a,$N0,$na,$T0a
+        fmadd   $T0b,$N0,$nb,$T0b
+         srdi   $carry,$t0,16
+         add    $t1,$t1,$carry
+         srdi   $carry,$t1,16
+         ld     $t6,`$FRAME+48`($sp)
+         ld     $t7,`$FRAME+56`($sp)
+        fmadd   $T1a,$N0,$nc,$T1a
+        fmadd   $T1b,$N0,$nd,$T1b
+         insrdi $t0,$t1,16,32
+         ld     $t1,8($tp)              ; tp[j]
+        fmadd   $T2a,$N1,$nc,$T2a
+        fmadd   $T2b,$N1,$nd,$T2b
+         add    $t2,$t2,$carry
+        fmadd   $T3a,$N2,$nc,$T3a
+        fmadd   $T3b,$N2,$nd,$T3b
+         srdi   $carry,$t2,16
+         insrdi $t0,$t2,16,16
+        fmadd   $dota,$N3,$nc,$dota
+        fmadd   $dotb,$N3,$nd,$dotb
+         add    $t3,$t3,$carry
+         ldu    $t2,16($tp)             ; tp[j+1]
+         srdi   $carry,$t3,16
+         insrdi $t0,$t3,16,0            ; 0..63 bits
+         add    $t4,$t4,$carry
+        fctid   $T0a,$T0a
+        fctid   $T0b,$T0b
+         srdi   $carry,$t4,16
+        fctid   $T1a,$T1a
+        fctid   $T1b,$T1b
+         add    $t5,$t5,$carry
+        fctid   $T2a,$T2a
+        fctid   $T2b,$T2b
+         srdi   $carry,$t5,16
+         insrdi $t4,$t5,16,32
+        fctid   $T3a,$T3a
+        fctid   $T3b,$T3b
+         add    $t6,$t6,$carry
+         srdi   $carry,$t6,16
+         insrdi $t4,$t6,16,16
+        stfd    $T0a,`$FRAME+0`($sp)
+        stfd    $T0b,`$FRAME+8`($sp)
+         add    $t7,$t7,$carry
+         addc   $t3,$t0,$t1
+        stfd    $T1a,`$FRAME+16`($sp)
+        stfd    $T1b,`$FRAME+24`($sp)
+         insrdi $t4,$t7,16,0            ; 64..127 bits
+         srdi   $carry,$t7,16           ; upper 33 bits
+        stfd    $T2a,`$FRAME+32`($sp)
+        stfd    $T2b,`$FRAME+40`($sp)
+         adde   $t5,$t4,$t2
+        stfd    $T3a,`$FRAME+48`($sp)
+        stfd    $T3b,`$FRAME+56`($sp)
+         addze  $carry,$carry
+         std    $t3,-16($tp)            ; tp[j-1]
+         std    $t5,-8($tp)             ; tp[j]
+        bdnz-   Linner
+        ; Linner tail: drain the column sums of the last pair and add them
+        ; to tp[j],tp[j+1]; then fold dota/dotb together with the overflow
+        ; left by the previous pass into tp[num-1] and a fresh ovf, and
+        ; move on to the next bp[i].
+        fctid   $dota,$dota
+        fctid   $dotb,$dotb
+        ld      $t0,`$FRAME+0`($sp)
+        ld      $t1,`$FRAME+8`($sp)
+        ld      $t2,`$FRAME+16`($sp)
+        ld      $t3,`$FRAME+24`($sp)
+        ld      $t4,`$FRAME+32`($sp)
+        ld      $t5,`$FRAME+40`($sp)
+        ld      $t6,`$FRAME+48`($sp)
+        ld      $t7,`$FRAME+56`($sp)
+        stfd    $dota,`$FRAME+64`($sp)
+        stfd    $dotb,`$FRAME+72`($sp)
+        add     $t0,$t0,$carry          ; can not overflow
+        srdi    $carry,$t0,16
+        add     $t1,$t1,$carry
+        srdi    $carry,$t1,16
+        insrdi  $t0,$t1,16,32
+        add     $t2,$t2,$carry
+        ld      $t1,8($tp)              ; tp[j]
+        srdi    $carry,$t2,16
+        insrdi  $t0,$t2,16,16
+        add     $t3,$t3,$carry
+        ldu     $t2,16($tp)             ; tp[j+1]
+        srdi    $carry,$t3,16
+        insrdi  $t0,$t3,16,0            ; 0..63 bits
+        add     $t4,$t4,$carry
+        srdi    $carry,$t4,16
+        add     $t5,$t5,$carry
+        srdi    $carry,$t5,16
+        insrdi  $t4,$t5,16,32
+        add     $t6,$t6,$carry
+        srdi    $carry,$t6,16
+        insrdi  $t4,$t6,16,16
+        add     $t7,$t7,$carry
+        insrdi  $t4,$t7,16,0            ; 64..127 bits
+        srdi    $carry,$t7,16           ; upper 33 bits
+        ld      $t6,`$FRAME+64`($sp)
+        ld      $t7,`$FRAME+72`($sp)
+        addc    $t3,$t0,$t1
+        adde    $t5,$t4,$t2
+        addze   $carry,$carry
+        std     $t3,-16($tp)            ; tp[j-1]
+        std     $t5,-8($tp)             ; tp[j]
+        add     $carry,$carry,$ovf      ; consume upmost overflow
+        add     $t6,$t6,$carry          ; can not overflow
+        srdi    $carry,$t6,16
+        add     $t7,$t7,$carry
+        insrdi  $t6,$t7,48,0
+        srdi    $ovf,$t7,48
+        std     $t6,0($tp)              ; tp[num-1]
+        slwi    $t7,$num,2
+        addi    $i,$i,8
+        subf    $nap_d,$t7,$nap_d       ; rewind pointer
+        cmpw    $i,$num
+        blt-    Louter
+        ; Final reduction, step 1: rp[] = tp[] - np[], two words per
+        ; iteration, with the borrow chained through XER[CA] by subfe.
+        ; The second word of each pair is addressed through base pointers
+        ; pre-offset by 8 bytes.
+        subf    $np,$num,$np    ; rewind np
+        addi    $j,$j,1         ; restore counter
+        subfc   $i,$i,$i        ; j=0 and "clear" XER[CA]
+        addi    $tp,$sp,`$FRAME+$TRANSFER+8`
+        addi    $t4,$sp,`$FRAME+$TRANSFER+16`
+        addi    $t5,$np,8
+        addi    $t6,$rp,8
+        mtctr   $j
+.align  4
+Lsub:   ldx     $t0,$tp,$i
+        ldx     $t1,$np,$i
+        ldx     $t2,$t4,$i
+        ldx     $t3,$t5,$i
+        subfe   $t0,$t1,$t0     ; tp[j]-np[j]
+        subfe   $t2,$t3,$t2     ; tp[j+1]-np[j+1]
+        stdx    $t0,$rp,$i
+        stdx    $t2,$t6,$i
+        addi    $i,$i,16
+        bdnz-   Lsub
+        ; Final reduction, step 2: subtract the borrow left in XER[CA]
+        ; from ovf. The result is used as a mask to pick the copy source:
+        ; tp (the unsubtracted value) where the mask is set, rp (which
+        ; already holds tp-np) where it is clear.
+        ; NOTE(review): the source ADDRESS therefore depends on the
+        ; borrow - confirm whether that data-dependent access pattern is
+        ; acceptable for the intended use.
+        li      $i,0
+        subfe   $ovf,$i,$ovf    ; handle upmost overflow bit
+        and     $ap,$tp,$ovf
+        andc    $np,$rp,$ovf
+        or      $ap,$ap,$np     ; ap=borrow?tp:rp
+        addi    $t7,$ap,8
+        mtctr   $j
+.align  4
+Lcopy:                          ; copy or in-place refresh
+        ; per iteration: two result words are copied to rp, 64 bytes of
+        ; nap_d and two words of tp are overwritten with the zero in i...
+        ; NOTE(review): i is advanced by 16 below, so from the second
+        ; iteration on the value stored is the byte offset, not zero -
+        ; confirm that scrubbing with a non-secret nonzero value is intended
+        ldx     $t0,$ap,$i
+        ldx     $t1,$t7,$i
+        std     $i,8($nap_d)    ; zap nap_d
+        std     $i,16($nap_d)
+        std     $i,24($nap_d)
+        std     $i,32($nap_d)
+        std     $i,40($nap_d)
+        std     $i,48($nap_d)
+        std     $i,56($nap_d)
+        stdu    $i,64($nap_d)
+        stdx    $t0,$rp,$i
+        stdx    $t1,$t6,$i
+        stdx    $i,$tp,$i       ; zap tp at once
+        stdx    $i,$t4,$i
+        addi    $i,$i,16
+        bdnz-   Lcopy
+        ; Epilogue: reload the non-volatile GPRs and FPRs saved on entry,
+        ; restore sp from the back chain word stored by the alloca, and
+        ; return 1 in r3.
+        $POP    r14,`2*$SIZE_T`($sp)
+        $POP    r15,`3*$SIZE_T`($sp)
+        $POP    r16,`4*$SIZE_T`($sp)
+        $POP    r17,`5*$SIZE_T`($sp)
+        $POP    r18,`6*$SIZE_T`($sp)
+        $POP    r19,`7*$SIZE_T`($sp)
+        $POP    r20,`8*$SIZE_T`($sp)
+        $POP    r21,`9*$SIZE_T`($sp)
+        $POP    r22,`10*$SIZE_T`($sp)
+        $POP    r23,`11*$SIZE_T`($sp)
+        lfd     f14,`12*$SIZE_T+0`($sp)
+        lfd     f15,`12*$SIZE_T+8`($sp)
+        lfd     f16,`12*$SIZE_T+16`($sp)
+        lfd     f17,`12*$SIZE_T+24`($sp)
+        lfd     f18,`12*$SIZE_T+32`($sp)
+        lfd     f19,`12*$SIZE_T+40`($sp)
+        lfd     f20,`12*$SIZE_T+48`($sp)
+        lfd     f21,`12*$SIZE_T+56`($sp)
+        lfd     f22,`12*$SIZE_T+64`($sp)
+        lfd     f23,`12*$SIZE_T+72`($sp)
+        lfd     f24,`12*$SIZE_T+80`($sp)
+        lfd     f25,`12*$SIZE_T+88`($sp)
+        $POP    $sp,0($sp)
+        li      r3,1    ; signal "handled"
+        blr
+        .long   0
+.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+___
+# Evaluate every backtick-quoted expression embedded in the template
+# (frame offsets and the like), then emit the result on STDOUT, which at
+# this point is the pipe into the translator.
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+# For a piped handle close() also returns false when the child exits with
+# a non-zero status, so a failing translator must not pass silently.
+close STDOUT or die "error closing STDOUT: $! (status $?)";
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl
new file mode 100644
index 0000000000..d23251033b
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/s390x-mont.pl
@@ -0,0 +1,225 @@
+#!/usr/bin/env perl
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+# April 2007.
+#
+# Performance improvement over vanilla C code varies from 85% to 45%
+# depending on key length and benchmark. Unfortunately in this context
+# these are not very impressive results [for code that utilizes "wide"
+# 64x64=128-bit multiplication, which is not commonly available to C
+# programmers], at least hand-coded bn_asm.c replacement is known to
+# provide 30-40% better results for longest keys. Well, on a second
+# thought it's not very surprising, because z-CPUs are single-issue
+# and _strictly_ in-order execution, while bn_mul_mont is more or less
+# dependent on CPU ability to pipe-line instructions and have several
+# of them "in-flight" at the same time. I mean while other methods,
+# for example Karatsuba, aim to minimize amount of multiplications at
+# the cost of other operations increase, bn_mul_mont aims to neatly
+# "overlap" multiplications and the other operations [and on most
+# platforms even minimize the amount of the other operations, in
+# particular references to memory]. But it's possible to improve this
+# module performance by implementing dedicated squaring code-path and
+# possibly by unrolling loops...
+# January 2009.
+#
+# Reschedule to minimize/avoid Address Generation Interlock hazard,
+# make inner loops counter-based.
+# Register allocation for the s390x code generated below.
+# $mn0 holds the per-pass multiplier m1 (tp[0]*n0); $num is the word count,
+# which the generated code pulls from the stack rather than from a register.
+$mn0="%r0";
+$num="%r1";
+# int bn_mul_mont(
+$rp="%r2";              # BN_ULONG *rp,
+$ap="%r3";              # const BN_ULONG *ap,
+$bp="%r4";              # const BN_ULONG *bp,
+$np="%r5";              # const BN_ULONG *np,
+$n0="%r6";              # const BN_ULONG *n0,
+#$num="160(%r15)"       # int num);
+# $bi shares %r2 with $rp: the generated code stores rp to the stack before
+# the first bp[i] is loaded and reloads it once the multiplication is done.
+$bi="%r2";      # zaps rp
+# $j is the byte offset used to index ap[], np[] and tp[] inside the loops.
+$j="%r7";
+# $ahi:$alo and $nhi:$nlo are the register pairs written by mlgr for the
+# ap[j]*bp[i] and np[j]*m1 products; $AHI and $NHI carry the running high
+# words from one j to the next.
+$ahi="%r8";
+$alo="%r9";
+$nhi="%r10";
+$nlo="%r11";
+$AHI="%r12";
+$NHI="%r13";
+# $count is the brct loop counter. It aliases %r14, which the generated code
+# saves with stmg on entry and reloads with lmg before the final "br %r14".
+$count="%r14";
+$sp="%r15";
+# bn_mul_mont for s390x. Outline of the code emitted by the heredoc below:
+#  - num is loaded as a 32-bit int from 164($sp) and scaled to bytes; the
+#    routine returns 0 in %r2 when num*8 < 16 or num*8 > 128.
+#  - rp is stored at 16($sp) and %r3-%r15 at 24($sp) relative to the stack
+#    pointer on entry; the slot written for $bp holds &bp[num] because $bp
+#    was advanced beforehand, and .Louter uses it as the end marker.
+#  - num*8+160+8 bytes are taken off the stack; tp[] starts at 160($sp) and
+#    has one extra word for the topmost carry. The old stack pointer is
+#    written to 0($sp) as back chain.
+#  - .L1st handles bp[0]; .Louter/.Linner accumulate the remaining bp[i]
+#    into tp[].
+#  - .Lsub stores tp-np to rp; the borrow is turned into a mask that selects
+#    tp or rp as the source for .Lcopy, which also overwrites tp[] with the
+#    loop offset as it goes.
+#  - %r6-%r15 are reloaded from the entry-time save slots and 1 is returned
+#    in %r2.
+$code.=<<___;
+.text
+.globl  bn_mul_mont
+.type   bn_mul_mont,\@function
+bn_mul_mont:
+        lgf     $num,164($sp)   # pull $num
+        sla     $num,3          # $num to enumerate bytes
+        la      $bp,0($num,$bp)
+        stg     %r2,16($sp)
+        cghi    $num,16         #
+        lghi    %r2,0           #
+        blr     %r14            # if($num<16) return 0;
+        cghi    $num,128        #
+        bhr     %r14            # if($num>128) return 0;
+        stmg    %r3,%r15,24($sp)
+        lghi    $rp,-160-8      # leave room for carry bit
+        lcgr    $j,$num         # -$num
+        lgr     %r0,$sp
+        la      $rp,0($rp,$sp)
+        la      $sp,0($j,$rp)   # alloca
+        stg     %r0,0($sp)      # back chain
+        sra     $num,3          # restore $num
+        la      $bp,0($j,$bp)   # restore $bp
+        ahi     $num,-1         # adjust $num for inner loop
+        lg      $n0,0($n0)      # pull n0
+        lg      $bi,0($bp)
+        lg      $alo,0($ap)
+        mlgr    $ahi,$bi        # ap[0]*bp[0]
+        lgr     $AHI,$ahi
+        lgr     $mn0,$alo       # "tp[0]"*n0
+        msgr    $mn0,$n0
+        lg      $nlo,0($np)     #
+        mlgr    $nhi,$mn0       # np[0]*m1
+        algr    $nlo,$alo       # +="tp[0]"
+        lghi    $NHI,0
+        alcgr   $NHI,$nhi
+        la      $j,8(%r0)       # j=1
+        lr      $count,$num
+.align  16
+.L1st:
+        lg      $alo,0($j,$ap)
+        mlgr    $ahi,$bi        # ap[j]*bp[0]
+        algr    $alo,$AHI
+        lghi    $AHI,0
+        alcgr   $AHI,$ahi
+        lg      $nlo,0($j,$np)
+        mlgr    $nhi,$mn0       # np[j]*m1
+        algr    $nlo,$NHI
+        lghi    $NHI,0
+        alcgr   $nhi,$NHI       # +="tp[j]"
+        algr    $nlo,$alo
+        alcgr   $NHI,$nhi
+        stg     $nlo,160-8($j,$sp)      # tp[j-1]=
+        la      $j,8($j)        # j++
+        brct    $count,.L1st
+        algr    $NHI,$AHI
+        lghi    $AHI,0
+        alcgr   $AHI,$AHI       # upmost overflow bit
+        stg     $NHI,160-8($j,$sp)
+        stg     $AHI,160($j,$sp)
+        la      $bp,8($bp)      # bp++
+.Louter:
+        lg      $bi,0($bp)      # bp[i]
+        lg      $alo,0($ap)
+        mlgr    $ahi,$bi        # ap[0]*bp[i]
+        alg     $alo,160($sp)   # +=tp[0]
+        lghi    $AHI,0
+        alcgr   $AHI,$ahi
+        lgr     $mn0,$alo
+        msgr    $mn0,$n0        # tp[0]*n0
+        lg      $nlo,0($np)     # np[0]
+        mlgr    $nhi,$mn0       # np[0]*m1
+        algr    $nlo,$alo       # +="tp[0]"
+        lghi    $NHI,0
+        alcgr   $NHI,$nhi
+        la      $j,8(%r0)       # j=1
+        lr      $count,$num
+.align  16
+.Linner:
+        lg      $alo,0($j,$ap)
+        mlgr    $ahi,$bi        # ap[j]*bp[i]
+        algr    $alo,$AHI
+        lghi    $AHI,0
+        alcgr   $ahi,$AHI
+        alg     $alo,160($j,$sp)# +=tp[j]
+        alcgr   $AHI,$ahi
+        lg      $nlo,0($j,$np)
+        mlgr    $nhi,$mn0       # np[j]*m1
+        algr    $nlo,$NHI
+        lghi    $NHI,0
+        alcgr   $nhi,$NHI
+        algr    $nlo,$alo       # +="tp[j]"
+        alcgr   $NHI,$nhi
+        stg     $nlo,160-8($j,$sp)      # tp[j-1]=
+        la      $j,8($j)        # j++
+        brct    $count,.Linner
+        algr    $NHI,$AHI
+        lghi    $AHI,0
+        alcgr   $AHI,$AHI
+        alg     $NHI,160($j,$sp)# accumulate previous upmost overflow bit
+        lghi    $ahi,0
+        alcgr   $AHI,$ahi       # new upmost overflow bit
+        stg     $NHI,160-8($j,$sp)
+        stg     $AHI,160($j,$sp)
+        la      $bp,8($bp)      # bp++
+        clg     $bp,160+8+32($j,$sp)    # compare to &bp[num]
+        jne     .Louter
+        lg      $rp,160+8+16($j,$sp)    # reincarnate rp
+        la      $ap,160($sp)
+        ahi     $num,1          # restore $num, incidentally clears "borrow"
+        la      $j,0(%r0)
+        lr      $count,$num
+.Lsub:  lg      $alo,0($j,$ap)
+        slbg    $alo,0($j,$np)
+        stg     $alo,0($j,$rp)
+        la      $j,8($j)
+        brct    $count,.Lsub
+        lghi    $ahi,0
+        slbgr   $AHI,$ahi       # handle upmost carry
+        ngr     $ap,$AHI
+        lghi    $np,-1
+        xgr     $np,$AHI
+        ngr     $np,$rp
+        ogr     $ap,$np         # ap=borrow?tp:rp
+        la      $j,0(%r0)
+        lgr     $count,$num
+.Lcopy: lg      $alo,0($j,$ap)  # copy or in-place refresh
+        stg     $j,160($j,$sp)  # zap tp
+        stg     $alo,0($j,$rp)
+        la      $j,8($j)
+        brct    $count,.Lcopy
+        la      %r1,160+8+48($j,$sp)
+        lmg     %r6,%r15,0(%r1)
+        lghi    %r2,1           # signal "processed"
+        br      %r14
+.size   bn_mul_mont,.-bn_mul_mont
+.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/s390x.S b/src/lib/libcrypto/bn/asm/s390x.S
new file mode 100755
index 0000000000..8f45f5d513
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/s390x.S
@@ -0,0 +1,678 @@
+.ident "s390x.S, version 1.0"
+// ====================================================================
+// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+// project.
+//
+// Rights for redistribution and usage in source and binary forms are
+// granted according to the OpenSSL license. Warranty of any kind is
+// disclaimed.
+// ====================================================================
+.text
+#define zero    %r0
+// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
+//
+// rp[i] += ap[i]*w + carry for i in [0,len); the final carry word is
+// returned in %r2. Returns 0 without touching memory when len<=0.
+// In:    %r2 = rp, %r3 = ap, %r4 = len (32-bit int, sign-extended by ltgfr),
+//        %r5 = w
+// Uses:  %r0 = constant zero, %r1 = rp, %r2 = byte offset i,
+//        %r6:%r7 and %r8:%r9 = 128-bit products (high:low), %r10 = len/4
+// Saves: %r6-%r10 at 48(%r15), reloaded before returning.
+// The main loop is unrolled four times and alternates the carry word
+// between %r8 and %r6; .Loop1_madd handles the remaining len%4 words.
+.globl  bn_mul_add_words
+.type   bn_mul_add_words,@function
+.align  4
+bn_mul_add_words:
+        lghi    zero,0          // zero = 0
+        la      %r1,0(%r2)      // put rp aside
+        lghi    %r2,0           // i=0;
+        ltgfr   %r4,%r4
+        bler    %r14            // if (len<=0) return 0;
+        stmg    %r6,%r10,48(%r15)
+        lghi    %r8,0           // carry = 0
+        srag    %r10,%r4,2      // cnt=len/4
+        jz      .Loop1_madd
+.Loop4_madd:
+        lg      %r7,0(%r2,%r3)  // ap[i]
+        mlgr    %r6,%r5         // *=w
+        algr    %r7,%r8         // +=carry
+        alcgr   %r6,zero
+        alg     %r7,0(%r2,%r1)  // +=rp[i]
+        alcgr   %r6,zero
+        stg     %r7,0(%r2,%r1)  // rp[i]=
+        lg      %r9,8(%r2,%r3)
+        mlgr    %r8,%r5
+        algr    %r9,%r6
+        alcgr   %r8,zero
+        alg     %r9,8(%r2,%r1)
+        alcgr   %r8,zero
+        stg     %r9,8(%r2,%r1)
+        lg      %r7,16(%r2,%r3)
+        mlgr    %r6,%r5
+        algr    %r7,%r8
+        alcgr   %r6,zero
+        alg     %r7,16(%r2,%r1)
+        alcgr   %r6,zero
+        stg     %r7,16(%r2,%r1)
+        lg      %r9,24(%r2,%r3)
+        mlgr    %r8,%r5
+        algr    %r9,%r6
+        alcgr   %r8,zero
+        alg     %r9,24(%r2,%r1)
+        alcgr   %r8,zero
+        stg     %r9,24(%r2,%r1)
+        la      %r2,32(%r2)     // i+=4
+        brct    %r10,.Loop4_madd
+        lghi    %r10,3
+        nr      %r4,%r10        // cnt=len%4
+        jz      .Lend_madd
+.Loop1_madd:
+        lg      %r7,0(%r2,%r3)  // ap[i]
+        mlgr    %r6,%r5         // *=w
+        algr    %r7,%r8         // +=carry
+        alcgr   %r6,zero
+        alg     %r7,0(%r2,%r1)  // +=rp[i]
+        alcgr   %r6,zero
+        stg     %r7,0(%r2,%r1)  // rp[i]=
+        lgr     %r8,%r6
+        la      %r2,8(%r2)      // i++
+        brct    %r4,.Loop1_madd
+.Lend_madd:
+        lgr     %r2,%r8         // return carry
+        lmg     %r6,%r10,48(%r15)
+        br      %r14
+.size   bn_mul_add_words,.-bn_mul_add_words
+// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
+//
+// rp[i] = low word of (ap[i]*w + carry) for i in [0,len); the high word
+// becomes the carry for the next element and the final carry is returned
+// in %r2. Returns 0 without touching memory when len<=0.
+// In:    %r2 = rp, %r3 = ap, %r4 = len (32-bit int, sign-extended by ltgfr),
+//        %r5 = w
+// Uses:  %r0 = constant zero, %r1 = rp, %r2 = byte offset i,
+//        %r6:%r7 and %r8:%r9 = 128-bit products (high:low), %r10 = len/4
+// Saves: %r6-%r10 at 48(%r15), reloaded before returning.
+.globl  bn_mul_words
+.type   bn_mul_words,@function
+.align  4
+bn_mul_words:
+        lghi    zero,0          // zero = 0
+        la      %r1,0(%r2)      // put rp aside
+        lghi    %r2,0           // i=0;
+        ltgfr   %r4,%r4
+        bler    %r14            // if (len<=0) return 0;
+        stmg    %r6,%r10,48(%r15)
+        lghi    %r8,0           // carry = 0
+        srag    %r10,%r4,2      // cnt=len/4
+        jz      .Loop1_mul
+.Loop4_mul:
+        lg      %r7,0(%r2,%r3)  // ap[i]
+        mlgr    %r6,%r5         // *=w
+        algr    %r7,%r8         // +=carry
+        alcgr   %r6,zero
+        stg     %r7,0(%r2,%r1)  // rp[i]=
+        lg      %r9,8(%r2,%r3)
+        mlgr    %r8,%r5
+        algr    %r9,%r6
+        alcgr   %r8,zero
+        stg     %r9,8(%r2,%r1)
+        lg      %r7,16(%r2,%r3)
+        mlgr    %r6,%r5
+        algr    %r7,%r8
+        alcgr   %r6,zero
+        stg     %r7,16(%r2,%r1)
+        lg      %r9,24(%r2,%r3)
+        mlgr    %r8,%r5
+        algr    %r9,%r6
+        alcgr   %r8,zero
+        stg     %r9,24(%r2,%r1)
+        la      %r2,32(%r2)     // i+=4
+        brct    %r10,.Loop4_mul
+        lghi    %r10,3
+        nr      %r4,%r10        // cnt=len%4
+        jz      .Lend_mul
+.Loop1_mul:
+        lg      %r7,0(%r2,%r3)  // ap[i]
+        mlgr    %r6,%r5         // *=w
+        algr    %r7,%r8         // +=carry
+        alcgr   %r6,zero
+        stg     %r7,0(%r2,%r1)  // rp[i]=
+        lgr     %r8,%r6
+        la      %r2,8(%r2)      // i++
+        brct    %r4,.Loop1_mul
+.Lend_mul:
+        lgr     %r2,%r8         // return carry
+        lmg     %r6,%r10,48(%r15)
+        br      %r14
+.size   bn_mul_words,.-bn_mul_words
+// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4)
+//
+// For each i in [0,len): rp[2*i] = low word and rp[2*i+1] = high word of
+// ap[i]*ap[i]. Nothing is written when len<=0.
+// In:    %r2 = rp, %r3 = ap, %r4 = len (32-bit int, sign-extended by ltgfr)
+// Uses:  %r1 = loop count, %r6:%r7 = 128-bit square (high:low); %r2 and %r3
+//        are advanced in place instead of using an index register.
+// Saves: %r6-%r7 at 48(%r15), reloaded before returning.
+.globl  bn_sqr_words
+.type   bn_sqr_words,@function
+.align  4
+bn_sqr_words:
+        ltgfr   %r4,%r4
+        bler    %r14
+        stmg    %r6,%r7,48(%r15)
+        srag    %r1,%r4,2       // cnt=len/4
+        jz      .Loop1_sqr
+.Loop4_sqr:
+        lg      %r7,0(%r3)
+        mlgr    %r6,%r7
+        stg     %r7,0(%r2)
+        stg     %r6,8(%r2)
+        lg      %r7,8(%r3)
+        mlgr    %r6,%r7
+        stg     %r7,16(%r2)
+        stg     %r6,24(%r2)
+        lg      %r7,16(%r3)
+        mlgr    %r6,%r7
+        stg     %r7,32(%r2)
+        stg     %r6,40(%r2)
+        lg      %r7,24(%r3)
+        mlgr    %r6,%r7
+        stg     %r7,48(%r2)
+        stg     %r6,56(%r2)
+        la      %r3,32(%r3)
+        la      %r2,64(%r2)
+        brct    %r1,.Loop4_sqr
+        lghi    %r1,3
+        nr      %r4,%r1         // cnt=len%4
+        jz      .Lend_sqr
+.Loop1_sqr:
+        lg      %r7,0(%r3)
+        mlgr    %r6,%r7
+        stg     %r7,0(%r2)
+        stg     %r6,8(%r2)
+        la      %r3,8(%r3)
+        la      %r2,16(%r2)
+        brct    %r4,.Loop1_sqr
+.Lend_sqr:
+        lmg     %r6,%r7,48(%r15)
+        br      %r14
+.size   bn_sqr_words,.-bn_sqr_words
+// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
+//
+// Divides the 128-bit value h:l (%r2:%r3) by d (%r4) with a single dlgr and
+// returns the quotient, which dlgr leaves in the odd register %r3.
+// NOTE(review): no guard against d == 0 or a quotient wider than 64 bits is
+// visible here; presumably callers guarantee h < d - confirm.
+.globl  bn_div_words
+.type   bn_div_words,@function
+.align  4
+bn_div_words:
+        dlgr    %r2,%r4
+        lgr     %r2,%r3         // return quotient
+        br      %r14
+.size   bn_div_words,.-bn_div_words
+// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
+//
+// rp[i] = ap[i] + bp[i] + carry for i in [0,len); returns the final carry
+// (0 or 1) in %r2, or 0 without touching memory when len<=0.
+// In:    %r2 = rp, %r3 = ap, %r4 = bp, %r5 = len (32-bit int)
+// Uses:  %r0 = scratch word, %r1 = rp, %r2 = byte offset i, %r5 = len/4,
+//        %r6 = len%4 (saved at 48(%r15) and reloaded on exit)
+// The carry is kept in the condition code from one alcg to the next, so
+// the loop bookkeeping between them uses only lg, stg, la and brct.
+.globl  bn_add_words
+.type   bn_add_words,@function
+.align  4
+bn_add_words:
+        la      %r1,0(%r2)      // put rp aside
+        lghi    %r2,0           // i=0
+        ltgfr   %r5,%r5
+        bler    %r14            // if (len<=0) return 0;
+        stg     %r6,48(%r15)
+        lghi    %r6,3
+        nr      %r6,%r5         // len%4
+        sra     %r5,2           // len/4, use sra because it sets condition code
+        jz      .Loop1_add      // carry is incidentally cleared if branch taken
+        algr    %r2,%r2         // clear carry
+.Loop4_add:
+        lg      %r0,0(%r2,%r3)
+        alcg    %r0,0(%r2,%r4)
+        stg     %r0,0(%r2,%r1)
+        lg      %r0,8(%r2,%r3)
+        alcg    %r0,8(%r2,%r4)
+        stg     %r0,8(%r2,%r1)
+        lg      %r0,16(%r2,%r3)
+        alcg    %r0,16(%r2,%r4)
+        stg     %r0,16(%r2,%r1)
+        lg      %r0,24(%r2,%r3)
+        alcg    %r0,24(%r2,%r4)
+        stg     %r0,24(%r2,%r1)
+        la      %r2,32(%r2)     // i+=4
+        brct    %r5,.Loop4_add
+        la      %r6,1(%r6)      // see if len%4 is zero ...
+        brct    %r6,.Loop1_add  // without touching condition code:-)
+.Lexit_add:
+        lghi    %r2,0
+        alcgr   %r2,%r2         // %r2 = 0+0+carry
+        lg      %r6,48(%r15)
+        br      %r14
+.Loop1_add:
+        lg      %r0,0(%r2,%r3)
+        alcg    %r0,0(%r2,%r4)
+        stg     %r0,0(%r2,%r1)
+        la      %r2,8(%r2)      // i++
+        brct    %r6,.Loop1_add
+        j       .Lexit_add
+.size   bn_add_words,.-bn_add_words
+// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
+//
+// rp[i] = ap[i] - bp[i] - borrow for i in [0,len); returns the final borrow
+// (0 or 1) in %r2, or 0 without touching memory when len<=0.
+// In:    %r2 = rp, %r3 = ap, %r4 = bp, %r5 = len (32-bit int)
+// Uses:  %r0 = scratch word, %r1 = rp, %r2 = byte offset i, %r5 = len/4,
+//        %r6 = len%4 (saved at 48(%r15) and reloaded on exit)
+// The borrow is kept in the condition code from one slbg to the next, so
+// the loop bookkeeping between them uses only lg, stg, la and brct.
+.globl  bn_sub_words
+.type   bn_sub_words,@function
+.align  4
+bn_sub_words:
+        la      %r1,0(%r2)      // put rp aside
+        lghi    %r2,0           // i=0
+        ltgfr   %r5,%r5
+        bler    %r14            // if (len<=0) return 0;
+        stg     %r6,48(%r15)
+        lghi    %r6,3
+        nr      %r6,%r5         // len%4
+        sra     %r5,2           // len/4, use sra because it sets condition code
+        jnz     .Loop4_sub      // borrow is incidentally cleared if branch taken
+        slgr    %r2,%r2         // clear borrow
+.Loop1_sub:
+        lg      %r0,0(%r2,%r3)
+        slbg    %r0,0(%r2,%r4)
+        stg     %r0,0(%r2,%r1)
+        la      %r2,8(%r2)      // i++
+        brct    %r6,.Loop1_sub
+        j       .Lexit_sub
+.Loop4_sub:
+        lg      %r0,0(%r2,%r3)
+        slbg    %r0,0(%r2,%r4)
+        stg     %r0,0(%r2,%r1)
+        lg      %r0,8(%r2,%r3)
+        slbg    %r0,8(%r2,%r4)
+        stg     %r0,8(%r2,%r1)
+        lg      %r0,16(%r2,%r3)
+        slbg    %r0,16(%r2,%r4)
+        stg     %r0,16(%r2,%r1)
+        lg      %r0,24(%r2,%r3)
+        slbg    %r0,24(%r2,%r4)
+        stg     %r0,24(%r2,%r1)
+        la      %r2,32(%r2)     // i+=4
+        brct    %r5,.Loop4_sub
+        la      %r6,1(%r6)      // see if len%4 is zero ...
+        brct    %r6,.Loop1_sub  // without touching condition code:-)
+.Lexit_sub:
+        lghi    %r2,0
+        slbgr   %r2,%r2         // %r2 = 0-0-borrow, i.e. 0 or -1
+        lcgr    %r2,%r2         // negate to get 0 or 1
+        lg      %r6,48(%r15)
+        br      %r14
+.size   bn_sub_words,.-bn_sub_words
+// Three-word column accumulator shared by the comba routines. The callers
+// rotate the roles of c1/c2/c3 from one result column to the next.
+#define c1      %r1
+#define c2      %r5
+#define c3      %r8
+// mul_add_c(ai,bi,c1,c2,c3): add the 128-bit product a[ai]*b[bi] to the
+// accumulator, where c1 is the low word, c2 the middle word and c3 takes
+// the carry out of c2. a is addressed through %r3 and b through %r4.
+// Clobbers %r6:%r7 (product, high:low) and expects zero (%r0) to hold 0.
+#define mul_add_c(ai,bi,c1,c2,c3)       \
+        lg      %r7,ai*8(%r3);          \
+        mlg     %r6,bi*8(%r4);          \
+        algr    c1,%r7;                 \
+        alcgr   c2,%r6;                 \
+        alcgr   c3,zero
+// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+//
+// r[0..15] = a[0..7] * b[0..7], computed one result column at a time: all
+// products a[i]*b[j] with i+j == k are summed into the three-word
+// accumulator, the low word is stored as r[k], that register is cleared and
+// the accumulator roles rotate for column k+1.
+// In:    %r2 = r, %r3 = a, %r4 = b
+// Uses:  %r0 = constant zero, %r1/%r5/%r8 = c1/c2/c3, %r6:%r7 = product
+// Saves: %r6-%r8 at 48(%r15), reloaded before returning.
+.globl  bn_mul_comba8
+.type   bn_mul_comba8,@function
+.align  4
+bn_mul_comba8:
+        stmg    %r6,%r8,48(%r15)
+        lghi    c1,0
+        lghi    c2,0
+        lghi    c3,0
+        lghi    zero,0
+        mul_add_c(0,0,c1,c2,c3);
+        stg     c1,0*8(%r2)
+        lghi    c1,0
+        mul_add_c(0,1,c2,c3,c1);
+        mul_add_c(1,0,c2,c3,c1);
+        stg     c2,1*8(%r2)
+        lghi    c2,0
+        mul_add_c(2,0,c3,c1,c2);
+        mul_add_c(1,1,c3,c1,c2);
+        mul_add_c(0,2,c3,c1,c2);
+        stg     c3,2*8(%r2)
+        lghi    c3,0
+        mul_add_c(0,3,c1,c2,c3);
+        mul_add_c(1,2,c1,c2,c3);
+        mul_add_c(2,1,c1,c2,c3);
+        mul_add_c(3,0,c1,c2,c3);
+        stg     c1,3*8(%r2)
+        lghi    c1,0
+        mul_add_c(4,0,c2,c3,c1);
+        mul_add_c(3,1,c2,c3,c1);
+        mul_add_c(2,2,c2,c3,c1);
+        mul_add_c(1,3,c2,c3,c1);
+        mul_add_c(0,4,c2,c3,c1);
+        stg     c2,4*8(%r2)
+        lghi    c2,0
+        mul_add_c(0,5,c3,c1,c2);
+        mul_add_c(1,4,c3,c1,c2);
+        mul_add_c(2,3,c3,c1,c2);
+        mul_add_c(3,2,c3,c1,c2);
+        mul_add_c(4,1,c3,c1,c2);
+        mul_add_c(5,0,c3,c1,c2);
+        stg     c3,5*8(%r2)
+        lghi    c3,0
+        mul_add_c(6,0,c1,c2,c3);
+        mul_add_c(5,1,c1,c2,c3);
+        mul_add_c(4,2,c1,c2,c3);
+        mul_add_c(3,3,c1,c2,c3);
+        mul_add_c(2,4,c1,c2,c3);
+        mul_add_c(1,5,c1,c2,c3);
+        mul_add_c(0,6,c1,c2,c3);
+        stg     c1,6*8(%r2)
+        lghi    c1,0
+        mul_add_c(0,7,c2,c3,c1);
+        mul_add_c(1,6,c2,c3,c1);
+        mul_add_c(2,5,c2,c3,c1);
+        mul_add_c(3,4,c2,c3,c1);
+        mul_add_c(4,3,c2,c3,c1);
+        mul_add_c(5,2,c2,c3,c1);
+        mul_add_c(6,1,c2,c3,c1);
+        mul_add_c(7,0,c2,c3,c1);
+        stg     c2,7*8(%r2)
+        lghi    c2,0
+        mul_add_c(7,1,c3,c1,c2);
+        mul_add_c(6,2,c3,c1,c2);
+        mul_add_c(5,3,c3,c1,c2);
+        mul_add_c(4,4,c3,c1,c2);
+        mul_add_c(3,5,c3,c1,c2);
+        mul_add_c(2,6,c3,c1,c2);
+        mul_add_c(1,7,c3,c1,c2);
+        stg     c3,8*8(%r2)
+        lghi    c3,0
+        mul_add_c(2,7,c1,c2,c3);
+        mul_add_c(3,6,c1,c2,c3);
+        mul_add_c(4,5,c1,c2,c3);
+        mul_add_c(5,4,c1,c2,c3);
+        mul_add_c(6,3,c1,c2,c3);
+        mul_add_c(7,2,c1,c2,c3);
+        stg     c1,9*8(%r2)
+        lghi    c1,0
+        mul_add_c(7,3,c2,c3,c1);
+        mul_add_c(6,4,c2,c3,c1);
+        mul_add_c(5,5,c2,c3,c1);
+        mul_add_c(4,6,c2,c3,c1);
+        mul_add_c(3,7,c2,c3,c1);
+        stg     c2,10*8(%r2)
+        lghi    c2,0
+        mul_add_c(4,7,c3,c1,c2);
+        mul_add_c(5,6,c3,c1,c2);
+        mul_add_c(6,5,c3,c1,c2);
+        mul_add_c(7,4,c3,c1,c2);
+        stg     c3,11*8(%r2)
+        lghi    c3,0
+        mul_add_c(7,5,c1,c2,c3);
+        mul_add_c(6,6,c1,c2,c3);
+        mul_add_c(5,7,c1,c2,c3);
+        stg     c1,12*8(%r2)
+        lghi    c1,0
+        mul_add_c(6,7,c2,c3,c1);
+        mul_add_c(7,6,c2,c3,c1);
+        stg     c2,13*8(%r2)
+        lghi    c2,0
+        mul_add_c(7,7,c3,c1,c2);
+        stg     c3,14*8(%r2)
+        stg     c1,15*8(%r2)
+        lmg     %r6,%r8,48(%r15)
+        br      %r14
+.size   bn_mul_comba8,.-bn_mul_comba8
+// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+//
+// r[0..7] = a[0..3] * b[0..3], computed one result column at a time: all
+// products a[i]*b[j] with i+j == k are summed into the three-word
+// accumulator, the low word is stored as r[k], that register is cleared and
+// the accumulator roles rotate for column k+1.
+// In:    %r2 = r, %r3 = a, %r4 = b
+// Uses:  %r0 = constant zero, %r1/%r5/%r8 = c1/c2/c3, %r6:%r7 = product
+// Saves: %r6-%r8 at 48(%r15), reloaded before returning.
+.globl  bn_mul_comba4
+.type   bn_mul_comba4,@function
+.align  4
+bn_mul_comba4:
+        stmg    %r6,%r8,48(%r15)
+        lghi    c1,0
+        lghi    c2,0
+        lghi    c3,0
+        lghi    zero,0
+        mul_add_c(0,0,c1,c2,c3);
+        stg     c1,0*8(%r2)     // r[0] goes to the result, not to a[0]
+        lghi    c1,0
+        mul_add_c(0,1,c2,c3,c1);
+        mul_add_c(1,0,c2,c3,c1);
+        stg     c2,1*8(%r2)
+        lghi    c2,0
+        mul_add_c(2,0,c3,c1,c2);
+        mul_add_c(1,1,c3,c1,c2);
+        mul_add_c(0,2,c3,c1,c2);
+        stg     c3,2*8(%r2)
+        lghi    c3,0
+        mul_add_c(0,3,c1,c2,c3);
+        mul_add_c(1,2,c1,c2,c3);
+        mul_add_c(2,1,c1,c2,c3);
+        mul_add_c(3,0,c1,c2,c3);
+        stg     c1,3*8(%r2)
+        lghi    c1,0
+        mul_add_c(3,1,c2,c3,c1);
+        mul_add_c(2,2,c2,c3,c1);
+        mul_add_c(1,3,c2,c3,c1);
+        stg     c2,4*8(%r2)
+        lghi    c2,0
+        mul_add_c(2,3,c3,c1,c2);
+        mul_add_c(3,2,c3,c1,c2);
+        stg     c3,5*8(%r2)
+        lghi    c3,0
+        mul_add_c(3,3,c1,c2,c3);
+        stg     c1,6*8(%r2)
+        stg     c2,7*8(%r2)
+        lmg     %r6,%r8,48(%r15)        // restore callee-saved %r6-%r8
+        br      %r14
+.size   bn_mul_comba4,.-bn_mul_comba4
+// sqr_add_c(ai,c1,c2,c3): add the 128-bit square a[ai]*a[ai] to the
+// accumulator (c1 = low word, c2 = middle word, c3 takes the carry out of
+// c2). a is addressed through %r3. Clobbers %r6:%r7 (product, high:low) and
+// expects zero (%r0) to hold 0.
+#define sqr_add_c(ai,c1,c2,c3)          \
+        lg      %r7,ai*8(%r3);          \
+        mlgr    %r6,%r7;                \
+        algr    c1,%r7;                 \
+        alcgr   c2,%r6;                 \
+        alcgr   c3,zero
+// sqr_add_c2(ai,aj,c1,c2,c3): add 2*a[ai]*a[aj] to the accumulator by
+// computing the product once and adding it twice, so the doubling never
+// needs a shift across the word boundary. Same clobbers and zero-register
+// requirement as sqr_add_c.
+#define sqr_add_c2(ai,aj,c1,c2,c3)      \
+        lg      %r7,ai*8(%r3);          \
+        mlg     %r6,aj*8(%r3);          \
+        algr    c1,%r7;                 \
+        alcgr   c2,%r6;                 \
+        alcgr   c3,zero;                \
+        algr    c1,%r7;                 \
+        alcgr   c2,%r6;                 \
+        alcgr   c3,zero
+// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
+//
+// r[0..15] = a[0..7] squared, computed one result column at a time. For
+// column k the diagonal term a[k/2]^2 (even k only) is added once with
+// sqr_add_c and every off-diagonal pair a[i]*a[j], i+j == k, i > j, is added
+// twice with sqr_add_c2. The low accumulator word is stored as r[k], that
+// register is cleared and the accumulator roles rotate for column k+1.
+// In:    %r2 = r, %r3 = a
+// Uses:  %r0 = constant zero, %r1/%r5/%r8 = c1/c2/c3, %r6:%r7 = product
+// Saves: %r6-%r8 at 48(%r15), reloaded before returning.
+.globl  bn_sqr_comba8
+.type   bn_sqr_comba8,@function
+.align  4
+bn_sqr_comba8:
+        stmg    %r6,%r8,48(%r15)
+        lghi    c1,0
+        lghi    c2,0
+        lghi    c3,0
+        lghi    zero,0
+        sqr_add_c(0,c1,c2,c3);
+        stg     c1,0*8(%r2)
+        lghi    c1,0
+        sqr_add_c2(1,0,c2,c3,c1);
+        stg     c2,1*8(%r2)
+        lghi    c2,0
+        sqr_add_c(1,c3,c1,c2);
+        sqr_add_c2(2,0,c3,c1,c2);
+        stg     c3,2*8(%r2)
+        lghi    c3,0
+        sqr_add_c2(3,0,c1,c2,c3);
+        sqr_add_c2(2,1,c1,c2,c3);
+        stg     c1,3*8(%r2)
+        lghi    c1,0
+        sqr_add_c(2,c2,c3,c1);
+        sqr_add_c2(3,1,c2,c3,c1);
+        sqr_add_c2(4,0,c2,c3,c1);
+        stg     c2,4*8(%r2)
+        lghi    c2,0
+        sqr_add_c2(5,0,c3,c1,c2);
+        sqr_add_c2(4,1,c3,c1,c2);
+        sqr_add_c2(3,2,c3,c1,c2);
+        stg     c3,5*8(%r2)
+        lghi    c3,0
+        sqr_add_c(3,c1,c2,c3);
+        sqr_add_c2(4,2,c1,c2,c3);
+        sqr_add_c2(5,1,c1,c2,c3);
+        sqr_add_c2(6,0,c1,c2,c3);
+        stg     c1,6*8(%r2)
+        lghi    c1,0
+        sqr_add_c2(7,0,c2,c3,c1);
+        sqr_add_c2(6,1,c2,c3,c1);
+        sqr_add_c2(5,2,c2,c3,c1);
+        sqr_add_c2(4,3,c2,c3,c1);
+        stg     c2,7*8(%r2)
+        lghi    c2,0
+        sqr_add_c(4,c3,c1,c2);
+        sqr_add_c2(5,3,c3,c1,c2);
+        sqr_add_c2(6,2,c3,c1,c2);
+        sqr_add_c2(7,1,c3,c1,c2);
+        stg     c3,8*8(%r2)
+        lghi    c3,0
+        sqr_add_c2(7,2,c1,c2,c3);
+        sqr_add_c2(6,3,c1,c2,c3);
+        sqr_add_c2(5,4,c1,c2,c3);
+        stg     c1,9*8(%r2)
+        lghi    c1,0
+        sqr_add_c(5,c2,c3,c1);
+        sqr_add_c2(6,4,c2,c3,c1);
+        sqr_add_c2(7,3,c2,c3,c1);
+        stg     c2,10*8(%r2)
+        lghi    c2,0
+        sqr_add_c2(7,4,c3,c1,c2);
+        sqr_add_c2(6,5,c3,c1,c2);
+        stg     c3,11*8(%r2)
+        lghi    c3,0
+        sqr_add_c(6,c1,c2,c3);
+        sqr_add_c2(7,5,c1,c2,c3);
+        stg     c1,12*8(%r2)
+        lghi    c1,0
+        sqr_add_c2(7,6,c2,c3,c1);
+        stg     c2,13*8(%r2)
+        lghi    c2,0
+        sqr_add_c(7,c3,c1,c2);
+        stg     c3,14*8(%r2)
+        stg     c1,15*8(%r2)
+        lmg     %r6,%r8,48(%r15)
+        br      %r14
+.size   bn_sqr_comba8,.-bn_sqr_comba8
+// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
+//
+// r[0..7] = a[0..3] squared, computed one result column at a time. For
+// column k the diagonal term a[k/2]^2 (even k only) is added once with
+// sqr_add_c and every off-diagonal pair a[i]*a[j], i+j == k, i > j, is added
+// twice with sqr_add_c2. The low accumulator word is stored as r[k], that
+// register is cleared and the accumulator roles rotate for column k+1.
+// In:    %r2 = r, %r3 = a
+// Uses:  %r0 = constant zero, %r1/%r5/%r8 = c1/c2/c3, %r6:%r7 = product
+// Saves: %r6-%r8 at 48(%r15), reloaded before returning.
+.globl bn_sqr_comba4
+.type   bn_sqr_comba4,@function
+.align  4
+bn_sqr_comba4:
+        stmg    %r6,%r8,48(%r15)
+        lghi    c1,0
+        lghi    c2,0
+        lghi    c3,0
+        lghi    zero,0
+        sqr_add_c(0,c1,c2,c3);
+        stg     c1,0*8(%r2)
+        lghi    c1,0
+        sqr_add_c2(1,0,c2,c3,c1);
+        stg     c2,1*8(%r2)
+        lghi    c2,0
+        sqr_add_c(1,c3,c1,c2);
+        sqr_add_c2(2,0,c3,c1,c2);
+        stg     c3,2*8(%r2)
+        lghi    c3,0
+        sqr_add_c2(3,0,c1,c2,c3);
+        sqr_add_c2(2,1,c1,c2,c3);
+        stg     c1,3*8(%r2)
+        lghi    c1,0
+        sqr_add_c(2,c2,c3,c1);
+        sqr_add_c2(3,1,c2,c3,c1);
+        stg     c2,4*8(%r2)
+        lghi    c2,0
+        sqr_add_c2(3,2,c3,c1,c2);
+        stg     c3,5*8(%r2)
+        lghi    c3,0
+        sqr_add_c(3,c1,c2,c3);
+        stg     c1,6*8(%r2)
+        stg     c2,7*8(%r2)
+        lmg     %r6,%r8,48(%r15)
+        br      %r14
+.size   bn_sqr_comba4,.-bn_sqr_comba4
diff --git a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl
new file mode 100644
index 0000000000..b8fb1e8a25
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl
@@ -0,0 +1,606 @@
+#!/usr/bin/env perl
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+# December 2005
+#
+# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
+# for undertaking this effort are multiple. First of all, UltraSPARC is not
+# the whole SPARCv9 universe and other VIS-free implementations deserve
+# optimized code as much. Secondly, newly introduced UltraSPARC T1,
+# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
+# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
+# several integrated RSA/DSA accelerator circuits accessible through
+# kernel driver [only(*)], but having decent user-land software
+# implementation is important too. Finally, reasons like desire to
+# experiment with dedicated squaring procedure. Yes, this module
+# implements one, because it was easiest to draft it in SPARCv9
+# instructions...
+# (*)   Engine accessing the driver in question is on my TODO list.
+#       For reference, accelerator is estimated to give 6 to 10 times
+#       improvement on single-threaded RSA sign. It should be noted
+#       that 6-10x improvement coefficient does not actually mean
+#       something extraordinary in terms of absolute [single-threaded]
+#       performance, as SPARCv9 instruction set is by all means least
+#       suitable for high performance crypto among other 64 bit
+#       platforms. 6-10x factor simply places T1 in same performance
+#       domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
+#       appear impressive at all, but it's the sign operation which is
+#       far more critical/interesting.
+# You might notice that inner loops are modulo-scheduled:-) This has
+# essentially negligible impact on UltraSPARC performance, it's
+# Fujitsu SPARC64 V users who should notice and hopefully appreciate
+# the advantage... Currently this module surpasses sparcv9a-mont.pl
+# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
+# module still has hidden potential [see TODO list there], which is
+# estimated to be larger than 20%...
+# int bn_mul_mont(
+$rp="%i0";      # BN_ULONG *rp,
+$ap="%i1";      # const BN_ULONG *ap,
+$bp="%i2";      # const BN_ULONG *bp,
+$np="%i3";      # const BN_ULONG *np,
+$n0="%i4";      # const BN_ULONG *n0,
+$num="%i5";     # int num);
+# Select the ABI from the command line: -m64 or -xarch=v9 anywhere in
+# the arguments picks the 64-bit variant, otherwise 32-bit is assumed.
+$bits=32;
+for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+# $bias is added to %sp wherever the code needs the real stack address;
+# $frame is the number of bytes the "save" instruction reserves.
+if ($bits==64)  { $bias=2047; $frame=192; }
+else            { $bias=0;    $frame=128; }
+$car0="%o0";    # running carry of the ap[j]*bp[i] sums
+$car1="%o1";    # running carry of the np[j]*mul1 sums
+$car2="%o2";    # 1 bit
+$acc0="%o3";    # scratch for the ap-side partial sum
+$acc1="%o4";    # scratch for the np-side partial product
+$mask="%g1";    # 32 bits, what a waste...
+$tmp0="%g4";    # scratch
+$tmp1="%g5";    # scratch
+$i="%l0";       # outer loop byte offset (also a temporary in .Lsub)
+$j="%l1";       # inner loop byte offset into ap[]/np[]
+$mul0="%l2";    # current bp[i] word (ap[i] in the squaring path)
+$mul1="%l3";    # low 32 bits of t[0]*n0 for the current pass
+$tp="%l4";      # cursor into the temporary vector kept on the stack
+$apj="%l5";     # ap[j] loaded ahead of use
+$npj="%l6";     # np[j] loaded ahead of use
+$tpj="%l7";     # tp[j] loaded ahead of use
+$fname="bn_mul_mont_int";       # name of the emitted global symbol
+# Generic multiplication path. Layout of the template below:
+#   entry    - if num (still in %o5) is below 4 the routine returns 0
+#              right away; otherwise it branches to .Lenter with the
+#              upper part of the 32-bit mask loaded in the delay slot.
+#   .Lenter  - opens the frame, turns num into a byte count (num*4),
+#              loads the n0 word, and moves %sp down by at least num
+#              bytes, rounding the new top down to a multiple of 1024;
+#              the temporary vector tp[] starts at %sp+$bias+$frame.
+#              When ap and bp are the same pointer, control goes to
+#              .Lbn_sqr_mont instead of continuing here.
+#   .L1st    - first pass over ap[]/np[] using bp[0]; each iteration
+#              stores one 32-bit word to tp[] and keeps the carries in
+#              $car0/$car1.  The topmost carry bit ends up in $car2.
+#   .Louter/.Linner - one pass per remaining bp[i], which additionally
+#              adds in the previous contents of tp[].
+#   .Ltail   - .Lsub stores tp[]-np[] (with borrow) into rp[]; the
+#              borrow is then subtracted from $car2 and the result is
+#              used as a bit mask choosing tp[] or rp[] as the source
+#              of the .Lcopy loop, which writes rp[] and zeroes tp[].
+#              The function then returns 1.
+# Instructions tagged !prologue!/!epilogue! belong to the
+# modulo-scheduled loop bodies; the instruction following each branch
+# occupies its delay slot.
+$code=<<___;
+.section        ".text",#alloc,#execinstr
+.global $fname
+.align  32
+$fname:
+        cmp     %o5,4                   ! 128 bits minimum
+        bge,pt  %icc,.Lenter
+        sethi   %hi(0xffffffff),$mask
+        retl
+        clr     %o0
+.align  32
+.Lenter:
+        save    %sp,-$frame,%sp
+        sll     $num,2,$num             ! num*=4
+        or      $mask,%lo(0xffffffff),$mask
+        ld      [$n0],$n0
+        cmp     $ap,$bp
+        and     $num,$mask,$num
+        ld      [$bp],$mul0             ! bp[0]
+        nop
+        add     %sp,$bias,%o7           ! real top of stack
+        ld      [$ap],$car0             ! ap[0] ! redundant in squaring context
+        sub     %o7,$num,%o7
+        ld      [$ap+4],$apj            ! ap[1]
+        and     %o7,-1024,%o7
+        ld      [$np],$car1             ! np[0]
+        sub     %o7,$bias,%sp           ! alloca
+        ld      [$np+4],$npj            ! np[1]
+        be,pt   `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
+        mov     12,$j
+        mulx    $car0,$mul0,$car0       ! ap[0]*bp[0]
+        mulx    $apj,$mul0,$tmp0        !prologue! ap[1]*bp[0]
+        and     $car0,$mask,$acc0
+        add     %sp,$bias+$frame,$tp
+        ld      [$ap+8],$apj            !prologue!
+        mulx    $n0,$acc0,$mul1         ! "t[0]"*n0
+        and     $mul1,$mask,$mul1
+        mulx    $car1,$mul1,$car1       ! np[0]*"t[0]"*n0
+        mulx    $npj,$mul1,$acc1        !prologue! np[1]*"t[0]"*n0
+        srlx    $car0,32,$car0
+        add     $acc0,$car1,$car1
+        ld      [$np+8],$npj            !prologue!
+        srlx    $car1,32,$car1
+        mov     $tmp0,$acc0             !prologue!
+.L1st:
+        mulx    $apj,$mul0,$tmp0
+        mulx    $npj,$mul1,$tmp1
+        add     $acc0,$car0,$car0
+        ld      [$ap+$j],$apj           ! ap[j]
+        and     $car0,$mask,$acc0
+        add     $acc1,$car1,$car1
+        ld      [$np+$j],$npj           ! np[j]
+        srlx    $car0,32,$car0
+        add     $acc0,$car1,$car1
+        add     $j,4,$j                 ! j++
+        mov     $tmp0,$acc0
+        st      $car1,[$tp]
+        cmp     $j,$num
+        mov     $tmp1,$acc1
+        srlx    $car1,32,$car1
+        bl      %icc,.L1st
+        add     $tp,4,$tp               ! tp++
+!.L1st
+        mulx    $apj,$mul0,$tmp0        !epilogue!
+        mulx    $npj,$mul1,$tmp1
+        add     $acc0,$car0,$car0
+        and     $car0,$mask,$acc0
+        add     $acc1,$car1,$car1
+        srlx    $car0,32,$car0
+        add     $acc0,$car1,$car1
+        st      $car1,[$tp]
+        srlx    $car1,32,$car1
+        add     $tmp0,$car0,$car0
+        and     $car0,$mask,$acc0
+        add     $tmp1,$car1,$car1
+        srlx    $car0,32,$car0
+        add     $acc0,$car1,$car1
+        st      $car1,[$tp+4]
+        srlx    $car1,32,$car1
+        add     $car0,$car1,$car1
+        st      $car1,[$tp+8]
+        srlx    $car1,32,$car2
+        mov     4,$i                    ! i++
+        ld      [$bp+4],$mul0           ! bp[1]
+.Louter:
+        add     %sp,$bias+$frame,$tp
+        ld      [$ap],$car0             ! ap[0]
+        ld      [$ap+4],$apj            ! ap[1]
+        ld      [$np],$car1             ! np[0]
+        ld      [$np+4],$npj            ! np[1]
+        ld      [$tp],$tmp1             ! tp[0]
+        ld      [$tp+4],$tpj            ! tp[1]
+        mov     12,$j
+        mulx    $car0,$mul0,$car0
+        mulx    $apj,$mul0,$tmp0        !prologue!
+        add     $tmp1,$car0,$car0
+        ld      [$ap+8],$apj            !prologue!
+        and     $car0,$mask,$acc0
+        mulx    $n0,$acc0,$mul1
+        and     $mul1,$mask,$mul1
+        mulx    $car1,$mul1,$car1
+        mulx    $npj,$mul1,$acc1        !prologue!
+        srlx    $car0,32,$car0
+        add     $acc0,$car1,$car1
+        ld      [$np+8],$npj            !prologue!
+        srlx    $car1,32,$car1
+        mov     $tmp0,$acc0             !prologue!
+.Linner:
+        mulx    $apj,$mul0,$tmp0
+        mulx    $npj,$mul1,$tmp1
+        add     $tpj,$car0,$car0
+        ld      [$ap+$j],$apj           ! ap[j]
+        add     $acc0,$car0,$car0
+        add     $acc1,$car1,$car1
+        ld      [$np+$j],$npj           ! np[j]
+        and     $car0,$mask,$acc0
+        ld      [$tp+8],$tpj            ! tp[j]
+        srlx    $car0,32,$car0
+        add     $acc0,$car1,$car1
+        add     $j,4,$j                 ! j++
+        mov     $tmp0,$acc0
+        st      $car1,[$tp]             ! tp[j-1]
+        srlx    $car1,32,$car1
+        mov     $tmp1,$acc1
+        cmp     $j,$num
+        bl      %icc,.Linner
+        add     $tp,4,$tp               ! tp++
+!.Linner
+        mulx    $apj,$mul0,$tmp0        !epilogue!
+        mulx    $npj,$mul1,$tmp1
+        add     $tpj,$car0,$car0
+        add     $acc0,$car0,$car0
+        ld      [$tp+8],$tpj            ! tp[j]
+        and     $car0,$mask,$acc0
+        add     $acc1,$car1,$car1
+        srlx    $car0,32,$car0
+        add     $acc0,$car1,$car1
+        st      $car1,[$tp]             ! tp[j-1]
+        srlx    $car1,32,$car1
+        add     $tpj,$car0,$car0
+        add     $tmp0,$car0,$car0
+        and     $car0,$mask,$acc0
+        add     $tmp1,$car1,$car1
+        add     $acc0,$car1,$car1
+        st      $car1,[$tp+4]           ! tp[j-1]
+        srlx    $car0,32,$car0
+        add     $i,4,$i                 ! i++
+        srlx    $car1,32,$car1
+        add     $car0,$car1,$car1
+        cmp     $i,$num
+        add     $car2,$car1,$car1
+        st      $car1,[$tp+8]
+        srlx    $car1,32,$car2
+        bl,a    %icc,.Louter
+        ld      [$bp+$i],$mul0          ! bp[i]
+!.Louter
+        add     $tp,12,$tp
+.Ltail:
+        add     $np,$num,$np
+        add     $rp,$num,$rp
+        mov     $tp,$ap
+        sub     %g0,$num,%o7            ! k=-num
+        ba      .Lsub
+        subcc   %g0,%g0,%g0             ! clear %icc.c
+.align  16
+.Lsub:
+        ld      [$tp+%o7],%o0
+        ld      [$np+%o7],%o1
+        subccc  %o0,%o1,%o1             ! tp[j]-np[j]
+        add     $rp,%o7,$i
+        add     %o7,4,%o7
+        brnz    %o7,.Lsub
+        st      %o1,[$i]
+        subc    $car2,0,$car2           ! handle upmost overflow bit
+        and     $tp,$car2,$ap
+        andn    $rp,$car2,$np
+        or      $ap,$np,$ap
+        sub     %g0,$num,%o7
+.Lcopy:
+        ld      [$ap+%o7],%o0           ! copy or in-place refresh
+        st      %g0,[$tp+%o7]           ! zap tp
+        st      %o0,[$rp+%o7]
+        add     %o7,4,%o7
+        brnz    %o7,.Lcopy
+        nop
+        mov     1,%i0
+        ret
+        restore
+___
+########
+######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
+######## code without following dedicated squaring procedure.
+########
+# Squaring path, entered from .Lenter when ap and bp are the same
+# pointer.  On entry $mul0 holds ap[0] (it was loaded through bp),
+# $apj ap[1], $car1 np[0], $npj np[1], and $j is 12 (set in the delay
+# slot of the branch that gets here).
+# Each ap[j]*ap[i] cross product is accumulated once and the running
+# sum is doubled 32 bits at a time (add acc0,acc0,acc0); $sbit carries
+# the bit shifted out of one word into the next.  The high half of
+# every ap[i]*ap[i] square is shifted right by one beforehand, with its
+# low bit parked in $sbit, so the later doubling restores it.
+#   .Lsqr_1st   - pass for ap[0]; writes the initial tp[].
+#   .Lsqr_2nd   - pass for ap[1].
+#   .Lsqr_outer - passes for ap[2] onwards.  .Lsqr_inner1 handles the
+#                 leading words, where only the np[j]*mul1 product is
+#                 added to tp[]; .Lsqr_inner2 handles the words where
+#                 the ap[j]*ap[i] product is added as well.
+#   .Lsqr_last  - final pass (np products plus the last square), then
+#                 a jump to .Ltail with tp advanced by 8 in the delay
+#                 slot.
+# The trailer after .Lsqr_last closes the function symbol (.type/.size)
+# and embeds an identification string.
+$sbit="%i2";            # re-use $bp!
+$code.=<<___;
+.align  32
+.Lbn_sqr_mont:
+        mulx    $mul0,$mul0,$car0               ! ap[0]*ap[0]
+        mulx    $apj,$mul0,$tmp0                !prologue!
+        and     $car0,$mask,$acc0
+        add     %sp,$bias+$frame,$tp
+        ld      [$ap+8],$apj                    !prologue!
+        mulx    $n0,$acc0,$mul1                 ! "t[0]"*n0
+        srlx    $car0,32,$car0
+        and     $mul1,$mask,$mul1
+        mulx    $car1,$mul1,$car1               ! np[0]*"t[0]"*n0
+        mulx    $npj,$mul1,$acc1                !prologue!
+        and     $car0,1,$sbit
+        ld      [$np+8],$npj                    !prologue!
+        srlx    $car0,1,$car0
+        add     $acc0,$car1,$car1
+        srlx    $car1,32,$car1
+        mov     $tmp0,$acc0                     !prologue!
+.Lsqr_1st:
+        mulx    $apj,$mul0,$tmp0
+        mulx    $npj,$mul1,$tmp1
+        add     $acc0,$car0,$car0               ! ap[j]*a0+c0
+        add     $acc1,$car1,$car1
+        ld      [$ap+$j],$apj                   ! ap[j]
+        and     $car0,$mask,$acc0
+        ld      [$np+$j],$npj                   ! np[j]
+        srlx    $car0,32,$car0
+        add     $acc0,$acc0,$acc0
+        or      $sbit,$acc0,$acc0
+        mov     $tmp1,$acc1
+        srlx    $acc0,32,$sbit
+        add     $j,4,$j                         ! j++
+        and     $acc0,$mask,$acc0
+        cmp     $j,$num
+        add     $acc0,$car1,$car1
+        st      $car1,[$tp]
+        mov     $tmp0,$acc0
+        srlx    $car1,32,$car1
+        bl      %icc,.Lsqr_1st
+        add     $tp,4,$tp                       ! tp++
+!.Lsqr_1st
+        mulx    $apj,$mul0,$tmp0                ! epilogue
+        mulx    $npj,$mul1,$tmp1
+        add     $acc0,$car0,$car0               ! ap[j]*a0+c0
+        add     $acc1,$car1,$car1
+        and     $car0,$mask,$acc0
+        srlx    $car0,32,$car0
+        add     $acc0,$acc0,$acc0
+        or      $sbit,$acc0,$acc0
+        srlx    $acc0,32,$sbit
+        and     $acc0,$mask,$acc0
+        add     $acc0,$car1,$car1
+        st      $car1,[$tp]
+        srlx    $car1,32,$car1
+        add     $tmp0,$car0,$car0               ! ap[j]*a0+c0
+        add     $tmp1,$car1,$car1
+        and     $car0,$mask,$acc0
+        srlx    $car0,32,$car0
+        add     $acc0,$acc0,$acc0
+        or      $sbit,$acc0,$acc0
+        srlx    $acc0,32,$sbit
+        and     $acc0,$mask,$acc0
+        add     $acc0,$car1,$car1
+        st      $car1,[$tp+4]
+        srlx    $car1,32,$car1
+        add     $car0,$car0,$car0
+        or      $sbit,$car0,$car0
+        add     $car0,$car1,$car1
+        st      $car1,[$tp+8]
+        srlx    $car1,32,$car2
+        ld      [%sp+$bias+$frame],$tmp0        ! tp[0]
+        ld      [%sp+$bias+$frame+4],$tmp1      ! tp[1]
+        ld      [%sp+$bias+$frame+8],$tpj       ! tp[2]
+        ld      [$ap+4],$mul0                   ! ap[1]
+        ld      [$ap+8],$apj                    ! ap[2]
+        ld      [$np],$car1                     ! np[0]
+        ld      [$np+4],$npj                    ! np[1]
+        mulx    $n0,$tmp0,$mul1
+        mulx    $mul0,$mul0,$car0
+        and     $mul1,$mask,$mul1
+        mulx    $car1,$mul1,$car1
+        mulx    $npj,$mul1,$acc1
+        add     $tmp0,$car1,$car1
+        and     $car0,$mask,$acc0
+        ld      [$np+8],$npj                    ! np[2]
+        srlx    $car1,32,$car1
+        add     $tmp1,$car1,$car1
+        srlx    $car0,32,$car0
+        add     $acc0,$car1,$car1
+        and     $car0,1,$sbit
+        add     $acc1,$car1,$car1
+        srlx    $car0,1,$car0
+        mov     12,$j
+        st      $car1,[%sp+$bias+$frame]        ! tp[0]=
+        srlx    $car1,32,$car1
+        add     %sp,$bias+$frame+4,$tp
+.Lsqr_2nd:
+        mulx    $apj,$mul0,$acc0
+        mulx    $npj,$mul1,$acc1
+        add     $acc0,$car0,$car0
+        add     $tpj,$car1,$car1
+        ld      [$ap+$j],$apj                   ! ap[j]
+        and     $car0,$mask,$acc0
+        ld      [$np+$j],$npj                   ! np[j]
+        srlx    $car0,32,$car0
+        add     $acc1,$car1,$car1
+        ld      [$tp+8],$tpj                    ! tp[j]
+        add     $acc0,$acc0,$acc0
+        add     $j,4,$j                         ! j++
+        or      $sbit,$acc0,$acc0
+        srlx    $acc0,32,$sbit
+        and     $acc0,$mask,$acc0
+        cmp     $j,$num
+        add     $acc0,$car1,$car1
+        st      $car1,[$tp]                     ! tp[j-1]
+        srlx    $car1,32,$car1
+        bl      %icc,.Lsqr_2nd
+        add     $tp,4,$tp                       ! tp++
+!.Lsqr_2nd
+        mulx    $apj,$mul0,$acc0
+        mulx    $npj,$mul1,$acc1
+        add     $acc0,$car0,$car0
+        add     $tpj,$car1,$car1
+        and     $car0,$mask,$acc0
+        srlx    $car0,32,$car0
+        add     $acc1,$car1,$car1
+        add     $acc0,$acc0,$acc0
+        or      $sbit,$acc0,$acc0
+        srlx    $acc0,32,$sbit
+        and     $acc0,$mask,$acc0
+        add     $acc0,$car1,$car1
+        st      $car1,[$tp]                     ! tp[j-1]
+        srlx    $car1,32,$car1
+        add     $car0,$car0,$car0
+        or      $sbit,$car0,$car0
+        add     $car0,$car1,$car1
+        add     $car2,$car1,$car1
+        st      $car1,[$tp+4]
+        srlx    $car1,32,$car2
+        ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
+        ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
+        ld      [$ap+8],$mul0                   ! ap[2]
+        ld      [$np],$car1                     ! np[0]
+        ld      [$np+4],$npj                    ! np[1]
+        mulx    $n0,$tmp1,$mul1
+        and     $mul1,$mask,$mul1
+        mov     8,$i
+        mulx    $mul0,$mul0,$car0
+        mulx    $car1,$mul1,$car1
+        and     $car0,$mask,$acc0
+        add     $tmp1,$car1,$car1
+        srlx    $car0,32,$car0
+        add     %sp,$bias+$frame,$tp
+        srlx    $car1,32,$car1
+        and     $car0,1,$sbit
+        srlx    $car0,1,$car0
+        mov     4,$j
+.Lsqr_outer:
+.Lsqr_inner1:
+        mulx    $npj,$mul1,$acc1
+        add     $tpj,$car1,$car1
+        add     $j,4,$j
+        ld      [$tp+8],$tpj
+        cmp     $j,$i
+        add     $acc1,$car1,$car1
+        ld      [$np+$j],$npj
+        st      $car1,[$tp]
+        srlx    $car1,32,$car1
+        bl      %icc,.Lsqr_inner1
+        add     $tp,4,$tp
+!.Lsqr_inner1
+        add     $j,4,$j
+        ld      [$ap+$j],$apj                   ! ap[j]
+        mulx    $npj,$mul1,$acc1
+        add     $tpj,$car1,$car1
+        ld      [$np+$j],$npj                   ! np[j]
+        add     $acc0,$car1,$car1
+        ld      [$tp+8],$tpj                    ! tp[j]
+        add     $acc1,$car1,$car1
+        st      $car1,[$tp]
+        srlx    $car1,32,$car1
+        add     $j,4,$j
+        cmp     $j,$num
+        be,pn   %icc,.Lsqr_no_inner2
+        add     $tp,4,$tp
+.Lsqr_inner2:
+        mulx    $apj,$mul0,$acc0
+        mulx    $npj,$mul1,$acc1
+        add     $tpj,$car1,$car1
+        add     $acc0,$car0,$car0
+        ld      [$ap+$j],$apj                   ! ap[j]
+        and     $car0,$mask,$acc0
+        ld      [$np+$j],$npj                   ! np[j]
+        srlx    $car0,32,$car0
+        add     $acc0,$acc0,$acc0
+        ld      [$tp+8],$tpj                    ! tp[j]
+        or      $sbit,$acc0,$acc0
+        add     $j,4,$j                         ! j++
+        srlx    $acc0,32,$sbit
+        and     $acc0,$mask,$acc0
+        cmp     $j,$num
+        add     $acc0,$car1,$car1
+        add     $acc1,$car1,$car1
+        st      $car1,[$tp]                     ! tp[j-1]
+        srlx    $car1,32,$car1
+        bl      %icc,.Lsqr_inner2
+        add     $tp,4,$tp                       ! tp++
+.Lsqr_no_inner2:
+        mulx    $apj,$mul0,$acc0
+        mulx    $npj,$mul1,$acc1
+        add     $tpj,$car1,$car1
+        add     $acc0,$car0,$car0
+        and     $car0,$mask,$acc0
+        srlx    $car0,32,$car0
+        add     $acc0,$acc0,$acc0
+        or      $sbit,$acc0,$acc0
+        srlx    $acc0,32,$sbit
+        and     $acc0,$mask,$acc0
+        add     $acc0,$car1,$car1
+        add     $acc1,$car1,$car1
+        st      $car1,[$tp]                     ! tp[j-1]
+        srlx    $car1,32,$car1
+        add     $car0,$car0,$car0
+        or      $sbit,$car0,$car0
+        add     $car0,$car1,$car1
+        add     $car2,$car1,$car1
+        st      $car1,[$tp+4]
+        srlx    $car1,32,$car2
+        add     $i,4,$i                         ! i++
+        ld      [%sp+$bias+$frame],$tmp1        ! tp[0]
+        ld      [%sp+$bias+$frame+4],$tpj       ! tp[1]
+        ld      [$ap+$i],$mul0                  ! ap[j]
+        ld      [$np],$car1                     ! np[0]
+        ld      [$np+4],$npj                    ! np[1]
+        mulx    $n0,$tmp1,$mul1
+        and     $mul1,$mask,$mul1
+        add     $i,4,$tmp0
+        mulx    $mul0,$mul0,$car0
+        mulx    $car1,$mul1,$car1
+        and     $car0,$mask,$acc0
+        add     $tmp1,$car1,$car1
+        srlx    $car0,32,$car0
+        add     %sp,$bias+$frame,$tp
+        srlx    $car1,32,$car1
+        and     $car0,1,$sbit
+        srlx    $car0,1,$car0
+        cmp     $tmp0,$num                      ! i<num-1
+        bl      %icc,.Lsqr_outer
+        mov     4,$j
+.Lsqr_last:
+        mulx    $npj,$mul1,$acc1
+        add     $tpj,$car1,$car1
+        add     $j,4,$j
+        ld      [$tp+8],$tpj
+        cmp     $j,$i
+        add     $acc1,$car1,$car1
+        ld      [$np+$j],$npj
+        st      $car1,[$tp]
+        srlx    $car1,32,$car1
+        bl      %icc,.Lsqr_last
+        add     $tp,4,$tp
+!.Lsqr_last
+        mulx    $npj,$mul1,$acc1
+        add     $tpj,$car1,$car1
+        add     $acc0,$car1,$car1
+        add     $acc1,$car1,$car1
+        st      $car1,[$tp]
+        srlx    $car1,32,$car1
+        add     $car0,$car0,$car0               ! recover $car0
+        or      $sbit,$car0,$car0
+        add     $car0,$car1,$car1
+        add     $car2,$car1,$car1
+        st      $car1,[$tp+4]
+        srlx    $car1,32,$car2
+        ba      .Ltail
+        add     $tp,8,$tp
+.type   $fname,#function
+.size   $fname,(.-$fname)
+.asciz  "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align  32
+___
+# Replace every back-quoted expression embedded in the template (such
+# as the %icc/%xcc selection in .Lenter) with the result of evaluating
+# it, then write the finished assembly to standard output.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl
new file mode 100755
index 0000000000..a14205f2f0
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl
@@ -0,0 +1,882 @@
+#!/usr/bin/env perl
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+# October 2005
+#
+# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
+# Because unlike integer multiplier, which simply stalls whole CPU,
+# FPU is fully pipelined and can effectively emit 48 bit partial
+# product every cycle. Why not blended SPARC v9? One can argue that
+# making this module dependent on UltraSPARC VIS extension limits its
+# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
+# implementations from compatibility matrix. But the rest, whole Sun
+# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
+# VIS extension instructions used in this module. This is considered
+# good enough to not care about HAL SPARC64 users [if any] who have
+# integer-only pure SPARCv9 module to "fall down" to.
+# USI&II cores currently exhibit uniform 2x improvement [over pre-
+# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
+# performance improves a few percent for shorter keys and worsens a few
+# percent for longer keys. This is because USIII integer multiplier
+# is >3x faster than USI&II one, which is harder to match [but see
+# TODO list below]. It should also be noted that SPARC64 V features
+# out-of-order execution, which *might* mean that integer multiplier
+# is pipelined, which in turn *might* be impossible to match... On
+# additional note, SPARC64 V implements FP Multiply-Add instruction,
+# which is perfectly usable in this context... In other words, as far
+# as Fujitsu SPARC64 V goes, talk to the author:-)
+# The implementation implies following "non-natural" limitations on
+# input arguments:
+# - num may not be less than 4;
+# - num has to be even;
+# Failure to meet either condition has no fatal effects, simply
+# doesn't give any performance gain.
+# TODO:
+# - modulo-schedule inner loop for better performance (on in-order
+#   execution core such as UltraSPARC this shall result in further
+#   noticeable(!) improvement);
+# - dedicated squaring procedure[?];
+######################################################################
+# November 2006
+#
+# Modulo-scheduled inner loops allow to interleave floating point and
+# integer instructions and minimize Read-After-Write penalties. This
+# results in *further* 20-50% performance improvement [depending on
+# key length, more for longer keys] on USI&II cores and 30-80% - on
+# USIII&IV.
+$fname="bn_mul_mont_fpu";       # name of the emitted global symbol
+# Select the ABI from the command line: -m64 or -xarch=v9 anywhere in
+# the arguments picks the 64-bit variant, otherwise 32-bit is assumed.
+$bits=32;
+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+# $bias is added to %sp wherever the code needs the real stack address;
+# $frame is the fixed part of what the "save" instruction reserves.
+if ($bits==64) {
+        $bias=2047;
+        $frame=192;
+} else {
+        $bias=0;
+        $frame=128;     # 96 rounded up to largest known cache-line
+}
+# Extra scratch bytes reserved by "save" on top of $frame; the code
+# addresses them as [%sp+$bias+$frame+N], using N=0..24 for integer/FP
+# transfers and N=48 for the saved %asi.
+$locals=64;
+# In order to provide for 32-/64-bit ABI duality, I keep integers wider
+# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
+# exclusively for pointers, indexes and other small values...
+# int bn_mul_mont(
+$rp="%i0";      # BN_ULONG *rp,
+$ap="%i1";      # const BN_ULONG *ap,
+$bp="%i2";      # const BN_ULONG *bp,
+$np="%i3";      # const BN_ULONG *np,
+$n0="%i4";      # const BN_ULONG *n0,
+$num="%i5";     # int num);
+$tp="%l0";      # t[num]
+$ap_l="%l1";    # a[num],n[num] are smashed to 32-bit words and saved
+$ap_h="%l2";    # to these four vectors as double-precision FP values.
+$np_l="%l3";    # This way a bunch of fxtods are eliminated in second
+$np_h="%l4";    # loop and L1-cache aliasing is minimized...
+$i="%l5";
+$j="%l6";
+$mask="%l7";    # 16-bit mask, 0xffff
+$n0="%g4";      # reassigned(!) to "64-bit" register
+$carry="%i4";   # %i4 reused(!) for a carry bit
+# FP register naming chart
+#
+#     ..HILO
+#       dcba
+#   --------
+#        LOa
+#       LOb
+#      LOc
+#     LOd
+#      HIa
+#     HIb
+#    HIc
+#   HId
+#    ..a
+#   ..b
+$ba="%f0";    $bb="%f2";    $bc="%f4";    $bd="%f6";
+$na="%f8";    $nb="%f10";   $nc="%f12";   $nd="%f14";
+$alo="%f16";  $alo_="%f17"; $ahi="%f18";  $ahi_="%f19";
+$nlo="%f20";  $nlo_="%f21"; $nhi="%f22";  $nhi_="%f23";
+$dota="%f24"; $dotb="%f26";
+$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
+$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
+$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
+$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
+$ASI_FL16_P=0xD2;       # magic ASI value to engage 16-bit FP load
+$code=<<___;
+.section        ".text",#alloc,#execinstr
+.global $fname
+.align  32
+$fname:
+        save    %sp,-$frame-$locals,%sp
+        cmp     $num,4
+        bl,a,pn %icc,.Lret
+        clr     %i0
+        andcc   $num,1,%g0              ! $num has to be even...
+        bnz,a,pn %icc,.Lret
+        clr     %i0                     ! signal "unsupported input value"
+        srl     $num,1,$num
+        sethi   %hi(0xffff),$mask
+        ld      [%i4+0],$n0             ! $n0 reassigned, remember?
+        or      $mask,%lo(0xffff),$mask
+        ld      [%i4+4],%o0
+        sllx    %o0,32,%o0
+        or      %o0,$n0,$n0             ! $n0=n0[1].n0[0]
+        sll     $num,3,$num             ! num*=8
+        add     %sp,$bias,%o0           ! real top of stack
+        sll     $num,2,%o1
+        add     %o1,$num,%o1            ! %o1=num*5
+        sub     %o0,%o1,%o0
+        and     %o0,-2048,%o0           ! optimize TLB utilization
+        sub     %o0,$bias,%sp           ! alloca(5*num*8)
+        rd      %asi,%o7                ! save %asi
+        add     %sp,$bias+$frame+$locals,$tp
+        add     $tp,$num,$ap_l
+        add     $ap_l,$num,$ap_l        ! [an]p_[lh] point at the vectors' ends !
+        add     $ap_l,$num,$ap_h
+        add     $ap_h,$num,$np_l
+        add     $np_l,$num,$np_h
+        wr      %g0,$ASI_FL16_P,%asi    ! setup %asi for 16-bit FP loads
+        add     $rp,$num,$rp            ! readjust input pointers to point
+        add     $ap,$num,$ap            ! at the ends too...
+        add     $bp,$num,$bp
+        add     $np,$num,$np
+        stx     %o7,[%sp+$bias+$frame+48]       ! save %asi
+        sub     %g0,$num,$i             ! i=-num
+        sub     %g0,$num,$j             ! j=-num
+        add     $ap,$j,%o3
+        add     $bp,$i,%o4
+        ld      [%o3+4],%g1             ! ap[0]
+        ld      [%o3+0],%o0
+        ld      [%o4+4],%g5             ! bp[0]
+        sllx    %g1,32,%g1
+        ld      [%o4+0],%o1
+        sllx    %g5,32,%g5
+        or      %g1,%o0,%o0
+        or      %g5,%o1,%o1
+        add     $np,$j,%o5
+        mulx    %o1,%o0,%o0             ! ap[0]*bp[0]
+        mulx    $n0,%o0,%o0             ! ap[0]*bp[0]*n0
+        stx     %o0,[%sp+$bias+$frame+0]
+        ld      [%o3+0],$alo_   ! load a[j] as pair of 32-bit words
+        fzeros  $alo
+        ld      [%o3+4],$ahi_
+        fzeros  $ahi
+        ld      [%o5+0],$nlo_   ! load n[j] as pair of 32-bit words
+        fzeros  $nlo
+        ld      [%o5+4],$nhi_
+        fzeros  $nhi
+        ! transfer b[i] to FPU as 4x16-bit values
+        ldda    [%o4+2]%asi,$ba
+        fxtod   $alo,$alo
+        ldda    [%o4+0]%asi,$bb
+        fxtod   $ahi,$ahi
+        ldda    [%o4+6]%asi,$bc
+        fxtod   $nlo,$nlo
+        ldda    [%o4+4]%asi,$bd
+        fxtod   $nhi,$nhi
+        ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
+        ldda    [%sp+$bias+$frame+6]%asi,$na
+        fxtod   $ba,$ba
+        ldda    [%sp+$bias+$frame+4]%asi,$nb
+        fxtod   $bb,$bb
+        ldda    [%sp+$bias+$frame+2]%asi,$nc
+        fxtod   $bc,$bc
+        ldda    [%sp+$bias+$frame+0]%asi,$nd
+        fxtod   $bd,$bd
+        std     $alo,[$ap_l+$j]         ! save smashed ap[j] in double format
+        fxtod   $na,$na
+        std     $ahi,[$ap_h+$j]
+        fxtod   $nb,$nb
+        std     $nlo,[$np_l+$j]         ! save smashed np[j] in double format
+        fxtod   $nc,$nc
+        std     $nhi,[$np_h+$j]
+        fxtod   $nd,$nd
+                fmuld   $alo,$ba,$aloa
+                fmuld   $nlo,$na,$nloa
+                fmuld   $alo,$bb,$alob
+                fmuld   $nlo,$nb,$nlob
+                fmuld   $alo,$bc,$aloc
+        faddd   $aloa,$nloa,$nloa
+                fmuld   $nlo,$nc,$nloc
+                fmuld   $alo,$bd,$alod
+        faddd   $alob,$nlob,$nlob
+                fmuld   $nlo,$nd,$nlod
+                fmuld   $ahi,$ba,$ahia
+        faddd   $aloc,$nloc,$nloc
+                fmuld   $nhi,$na,$nhia
+                fmuld   $ahi,$bb,$ahib
+        faddd   $alod,$nlod,$nlod
+                fmuld   $nhi,$nb,$nhib
+                fmuld   $ahi,$bc,$ahic
+        faddd   $ahia,$nhia,$nhia
+                fmuld   $nhi,$nc,$nhic
+                fmuld   $ahi,$bd,$ahid
+        faddd   $ahib,$nhib,$nhib
+                fmuld   $nhi,$nd,$nhid
+        faddd   $ahic,$nhic,$dota       ! $nhic
+        faddd   $ahid,$nhid,$dotb       ! $nhid
+        faddd   $nloc,$nhia,$nloc
+        faddd   $nlod,$nhib,$nlod
+        fdtox   $nloa,$nloa
+        fdtox   $nlob,$nlob
+        fdtox   $nloc,$nloc
+        fdtox   $nlod,$nlod
+        std     $nloa,[%sp+$bias+$frame+0]
+        add     $j,8,$j
+        std     $nlob,[%sp+$bias+$frame+8]
+        add     $ap,$j,%o4
+        std     $nloc,[%sp+$bias+$frame+16]
+        add     $np,$j,%o5
+        std     $nlod,[%sp+$bias+$frame+24]
+        ld      [%o4+0],$alo_   ! load a[j] as pair of 32-bit words
+        fzeros  $alo
+        ld      [%o4+4],$ahi_
+        fzeros  $ahi
+        ld      [%o5+0],$nlo_   ! load n[j] as pair of 32-bit words
+        fzeros  $nlo
+        ld      [%o5+4],$nhi_
+        fzeros  $nhi
+        fxtod   $alo,$alo
+        fxtod   $ahi,$ahi
+        fxtod   $nlo,$nlo
+        fxtod   $nhi,$nhi
+        ldx     [%sp+$bias+$frame+0],%o0
+                fmuld   $alo,$ba,$aloa
+        ldx     [%sp+$bias+$frame+8],%o1
+                fmuld   $nlo,$na,$nloa
+        ldx     [%sp+$bias+$frame+16],%o2
+                fmuld   $alo,$bb,$alob
+        ldx     [%sp+$bias+$frame+24],%o3
+                fmuld   $nlo,$nb,$nlob
+        srlx    %o0,16,%o7
+        std     $alo,[$ap_l+$j]         ! save smashed ap[j] in double format
+                fmuld   $alo,$bc,$aloc
+        add     %o7,%o1,%o1
+        std     $ahi,[$ap_h+$j]
+                faddd   $aloa,$nloa,$nloa
+                fmuld   $nlo,$nc,$nloc
+        srlx    %o1,16,%o7
+        std     $nlo,[$np_l+$j]         ! save smashed np[j] in double format
+                fmuld   $alo,$bd,$alod
+        add     %o7,%o2,%o2
+        std     $nhi,[$np_h+$j]
+                faddd   $alob,$nlob,$nlob
+                fmuld   $nlo,$nd,$nlod
+        srlx    %o2,16,%o7
+                fmuld   $ahi,$ba,$ahia
+        add     %o7,%o3,%o3             ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+                faddd   $aloc,$nloc,$nloc
+                fmuld   $nhi,$na,$nhia
+        !and    %o0,$mask,%o0
+        !and    %o1,$mask,%o1
+        !and    %o2,$mask,%o2
+        !sllx   %o1,16,%o1
+        !sllx   %o2,32,%o2
+        !sllx   %o3,48,%o7
+        !or     %o1,%o0,%o0
+        !or     %o2,%o0,%o0
+        !or     %o7,%o0,%o0             ! 64-bit result
+        srlx    %o3,16,%g1              ! 34-bit carry
+                fmuld   $ahi,$bb,$ahib
+        faddd   $alod,$nlod,$nlod
+                fmuld   $nhi,$nb,$nhib
+                fmuld   $ahi,$bc,$ahic
+        faddd   $ahia,$nhia,$nhia
+                fmuld   $nhi,$nc,$nhic
+                fmuld   $ahi,$bd,$ahid
+        faddd   $ahib,$nhib,$nhib
+                fmuld   $nhi,$nd,$nhid
+        faddd   $dota,$nloa,$nloa
+        faddd   $dotb,$nlob,$nlob
+        faddd   $ahic,$nhic,$dota       ! $nhic
+        faddd   $ahid,$nhid,$dotb       ! $nhid
+        faddd   $nloc,$nhia,$nloc
+        faddd   $nlod,$nhib,$nlod
+        fdtox   $nloa,$nloa
+        fdtox   $nlob,$nlob
+        fdtox   $nloc,$nloc
+        fdtox   $nlod,$nlod
+        std     $nloa,[%sp+$bias+$frame+0]
+        std     $nlob,[%sp+$bias+$frame+8]
+        addcc   $j,8,$j
+        std     $nloc,[%sp+$bias+$frame+16]
+        bz,pn   %icc,.L1stskip
+        std     $nlod,[%sp+$bias+$frame+24]
+.align  32                      ! incidentally already aligned !
+.L1st:
+        add     $ap,$j,%o4
+        add     $np,$j,%o5
+        ld      [%o4+0],$alo_   ! load a[j] as pair of 32-bit words
+        fzeros  $alo
+        ld      [%o4+4],$ahi_
+        fzeros  $ahi
+        ld      [%o5+0],$nlo_   ! load n[j] as pair of 32-bit words
+        fzeros  $nlo
+        ld      [%o5+4],$nhi_
+        fzeros  $nhi
+        fxtod   $alo,$alo
+        fxtod   $ahi,$ahi
+        fxtod   $nlo,$nlo
+        fxtod   $nhi,$nhi
+        ldx     [%sp+$bias+$frame+0],%o0
+                fmuld   $alo,$ba,$aloa
+        ldx     [%sp+$bias+$frame+8],%o1
+                fmuld   $nlo,$na,$nloa
+        ldx     [%sp+$bias+$frame+16],%o2
+                fmuld   $alo,$bb,$alob
+        ldx     [%sp+$bias+$frame+24],%o3
+                fmuld   $nlo,$nb,$nlob
+        srlx    %o0,16,%o7
+        std     $alo,[$ap_l+$j]         ! save smashed ap[j] in double format
+                fmuld   $alo,$bc,$aloc
+        add     %o7,%o1,%o1
+        std     $ahi,[$ap_h+$j]
+                faddd   $aloa,$nloa,$nloa
+                fmuld   $nlo,$nc,$nloc
+        srlx    %o1,16,%o7
+        std     $nlo,[$np_l+$j]         ! save smashed np[j] in double format
+                fmuld   $alo,$bd,$alod
+        add     %o7,%o2,%o2
+        std     $nhi,[$np_h+$j]
+                faddd   $alob,$nlob,$nlob
+                fmuld   $nlo,$nd,$nlod
+        srlx    %o2,16,%o7
+                fmuld   $ahi,$ba,$ahia
+        add     %o7,%o3,%o3             ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+        and     %o0,$mask,%o0
+                faddd   $aloc,$nloc,$nloc
+                fmuld   $nhi,$na,$nhia
+        and     %o1,$mask,%o1
+        and     %o2,$mask,%o2
+                fmuld   $ahi,$bb,$ahib
+        sllx    %o1,16,%o1
+                faddd   $alod,$nlod,$nlod
+                fmuld   $nhi,$nb,$nhib
+        sllx    %o2,32,%o2
+                fmuld   $ahi,$bc,$ahic
+        sllx    %o3,48,%o7
+        or      %o1,%o0,%o0
+                faddd   $ahia,$nhia,$nhia
+                fmuld   $nhi,$nc,$nhic
+        or      %o2,%o0,%o0
+                fmuld   $ahi,$bd,$ahid
+        or      %o7,%o0,%o0             ! 64-bit result
+                faddd   $ahib,$nhib,$nhib
+                fmuld   $nhi,$nd,$nhid
+        addcc   %g1,%o0,%o0
+                faddd   $dota,$nloa,$nloa
+        srlx    %o3,16,%g1              ! 34-bit carry
+                faddd   $dotb,$nlob,$nlob
+        bcs,a   %xcc,.+8
+        add     %g1,1,%g1
+        stx     %o0,[$tp]               ! tp[j-1]=
+        faddd   $ahic,$nhic,$dota       ! $nhic
+        faddd   $ahid,$nhid,$dotb       ! $nhid
+        faddd   $nloc,$nhia,$nloc
+        faddd   $nlod,$nhib,$nlod
+        fdtox   $nloa,$nloa
+        fdtox   $nlob,$nlob
+        fdtox   $nloc,$nloc
+        fdtox   $nlod,$nlod
+        std     $nloa,[%sp+$bias+$frame+0]
+        std     $nlob,[%sp+$bias+$frame+8]
+        std     $nloc,[%sp+$bias+$frame+16]
+        std     $nlod,[%sp+$bias+$frame+24]
+        addcc   $j,8,$j
+        bnz,pt  %icc,.L1st
+        add     $tp,8,$tp
+.L1stskip:
+        fdtox   $dota,$dota
+        fdtox   $dotb,$dotb
+        ldx     [%sp+$bias+$frame+0],%o0
+        ldx     [%sp+$bias+$frame+8],%o1
+        ldx     [%sp+$bias+$frame+16],%o2
+        ldx     [%sp+$bias+$frame+24],%o3
+        srlx    %o0,16,%o7
+        std     $dota,[%sp+$bias+$frame+32]
+        add     %o7,%o1,%o1
+        std     $dotb,[%sp+$bias+$frame+40]
+        srlx    %o1,16,%o7
+        add     %o7,%o2,%o2
+        srlx    %o2,16,%o7
+        add     %o7,%o3,%o3             ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+        and     %o0,$mask,%o0
+        and     %o1,$mask,%o1
+        and     %o2,$mask,%o2
+        sllx    %o1,16,%o1
+        sllx    %o2,32,%o2
+        sllx    %o3,48,%o7
+        or      %o1,%o0,%o0
+        or      %o2,%o0,%o0
+        or      %o7,%o0,%o0             ! 64-bit result
+        ldx     [%sp+$bias+$frame+32],%o4
+        addcc   %g1,%o0,%o0
+        ldx     [%sp+$bias+$frame+40],%o5
+        srlx    %o3,16,%g1              ! 34-bit carry
+        bcs,a   %xcc,.+8
+        add     %g1,1,%g1
+        stx     %o0,[$tp]               ! tp[j-1]=
+        add     $tp,8,$tp
+        srlx    %o4,16,%o7
+        add     %o7,%o5,%o5
+        and     %o4,$mask,%o4
+        sllx    %o5,16,%o7
+        or      %o7,%o4,%o4
+        addcc   %g1,%o4,%o4
+        srlx    %o5,48,%g1
+        bcs,a   %xcc,.+8
+        add     %g1,1,%g1
+        mov     %g1,$carry
+        stx     %o4,[$tp]               ! tp[num-1]=
+        ba      .Louter
+        add     $i,8,$i
+.align  32
+.Louter:
+        sub     %g0,$num,$j             ! j=-num
+        add     %sp,$bias+$frame+$locals,$tp    ! tp points at tp[0] again
+        add     $ap,$j,%o3              ! %o3 -> ap[0]
+        add     $bp,$i,%o4              ! %o4 -> bp[i]
+        ld      [%o3+4],%g1             ! ap[0], word at +4 (shifted into bits 63..32 below)
+        ld      [%o3+0],%o0             ! ap[0], word at +0
+        ld      [%o4+4],%g5             ! bp[i], word at +4 (shifted into bits 63..32 below)
+        sllx    %g1,32,%g1
+        ld      [%o4+0],%o1             ! bp[i], word at +0
+        sllx    %g5,32,%g5
+        or      %g1,%o0,%o0             ! %o0 = 64-bit ap[0]
+        or      %g5,%o1,%o1             ! %o1 = 64-bit bp[i]
+        ldx     [$tp],%o2               ! tp[0]
+        mulx    %o1,%o0,%o0             ! low 64 bits of ap[0]*bp[i]
+        addcc   %o2,%o0,%o0             ! ... + tp[0]
+        mulx    $n0,%o0,%o0             ! (ap[0]*bp[i]+t[0])*n0
+        stx     %o0,[%sp+$bias+$frame+0]        ! spill so it can be reloaded as 4x16-bit pieces
+        ! transfer b[i] to FPU as 4x16-bit values
+        ldda    [%o4+2]%asi,$ba
+        ldda    [%o4+0]%asi,$bb
+        ldda    [%o4+6]%asi,$bc
+        ldda    [%o4+4]%asi,$bd
+        ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
+        ldda    [%sp+$bias+$frame+6]%asi,$na
+        fxtod   $ba,$ba
+        ldda    [%sp+$bias+$frame+4]%asi,$nb
+        fxtod   $bb,$bb
+        ldda    [%sp+$bias+$frame+2]%asi,$nc
+        fxtod   $bc,$bc
+        ldda    [%sp+$bias+$frame+0]%asi,$nd
+        fxtod   $bd,$bd
+        ldd     [$ap_l+$j],$alo         ! load a[j] in double format
+        fxtod   $na,$na
+        ldd     [$ap_h+$j],$ahi
+        fxtod   $nb,$nb
+        ldd     [$np_l+$j],$nlo         ! load n[j] in double format
+        fxtod   $nc,$nc
+        ldd     [$np_h+$j],$nhi
+        fxtod   $nd,$nd
+                fmuld   $alo,$ba,$aloa
+                fmuld   $nlo,$na,$nloa
+                fmuld   $alo,$bb,$alob
+                fmuld   $nlo,$nb,$nlob
+                fmuld   $alo,$bc,$aloc
+        faddd   $aloa,$nloa,$nloa
+                fmuld   $nlo,$nc,$nloc
+                fmuld   $alo,$bd,$alod
+        faddd   $alob,$nlob,$nlob
+                fmuld   $nlo,$nd,$nlod
+                fmuld   $ahi,$ba,$ahia
+        faddd   $aloc,$nloc,$nloc
+                fmuld   $nhi,$na,$nhia
+                fmuld   $ahi,$bb,$ahib
+        faddd   $alod,$nlod,$nlod
+                fmuld   $nhi,$nb,$nhib
+                fmuld   $ahi,$bc,$ahic
+        faddd   $ahia,$nhia,$nhia
+                fmuld   $nhi,$nc,$nhic
+                fmuld   $ahi,$bd,$ahid
+        faddd   $ahib,$nhib,$nhib
+                fmuld   $nhi,$nd,$nhid
+        faddd   $ahic,$nhic,$dota       ! $nhic
+        faddd   $ahid,$nhid,$dotb       ! $nhid
+        faddd   $nloc,$nhia,$nloc
+        faddd   $nlod,$nhib,$nlod
+        fdtox   $nloa,$nloa
+        fdtox   $nlob,$nlob
+        fdtox   $nloc,$nloc
+        fdtox   $nlod,$nlod
+        std     $nloa,[%sp+$bias+$frame+0]
+        std     $nlob,[%sp+$bias+$frame+8]
+        std     $nloc,[%sp+$bias+$frame+16]
+        add     $j,8,$j
+        std     $nlod,[%sp+$bias+$frame+24]
+        ldd     [$ap_l+$j],$alo         ! load a[j] in double format
+        ldd     [$ap_h+$j],$ahi
+        ldd     [$np_l+$j],$nlo         ! load n[j] in double format
+        ldd     [$np_h+$j],$nhi
+                fmuld   $alo,$ba,$aloa
+                fmuld   $nlo,$na,$nloa
+                fmuld   $alo,$bb,$alob
+                fmuld   $nlo,$nb,$nlob
+                fmuld   $alo,$bc,$aloc
+        ldx     [%sp+$bias+$frame+0],%o0
+                faddd   $aloa,$nloa,$nloa
+                fmuld   $nlo,$nc,$nloc
+        ldx     [%sp+$bias+$frame+8],%o1
+                fmuld   $alo,$bd,$alod
+        ldx     [%sp+$bias+$frame+16],%o2
+                faddd   $alob,$nlob,$nlob
+                fmuld   $nlo,$nd,$nlod
+        ldx     [%sp+$bias+$frame+24],%o3
+                fmuld   $ahi,$ba,$ahia
+        srlx    %o0,16,%o7
+                faddd   $aloc,$nloc,$nloc
+                fmuld   $nhi,$na,$nhia
+        add     %o7,%o1,%o1
+                fmuld   $ahi,$bb,$ahib
+        srlx    %o1,16,%o7
+                faddd   $alod,$nlod,$nlod
+                fmuld   $nhi,$nb,$nhib
+        add     %o7,%o2,%o2
+                fmuld   $ahi,$bc,$ahic
+        srlx    %o2,16,%o7
+                faddd   $ahia,$nhia,$nhia
+                fmuld   $nhi,$nc,$nhic
+        add     %o7,%o3,%o3             ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+        ! why?
+        and     %o0,$mask,%o0
+                fmuld   $ahi,$bd,$ahid
+        and     %o1,$mask,%o1
+        and     %o2,$mask,%o2
+                faddd   $ahib,$nhib,$nhib
+                fmuld   $nhi,$nd,$nhid
+        sllx    %o1,16,%o1
+                faddd   $dota,$nloa,$nloa
+        sllx    %o2,32,%o2
+                faddd   $dotb,$nlob,$nlob
+        sllx    %o3,48,%o7
+        or      %o1,%o0,%o0
+                faddd   $ahic,$nhic,$dota       ! $nhic
+        or      %o2,%o0,%o0
+                faddd   $ahid,$nhid,$dotb       ! $nhid
+        or      %o7,%o0,%o0             ! 64-bit result
+        ldx     [$tp],%o7
+                faddd   $nloc,$nhia,$nloc
+        addcc   %o7,%o0,%o0
+        ! end-of-why?
+                faddd   $nlod,$nhib,$nlod
+        srlx    %o3,16,%g1              ! 34-bit carry
+                fdtox   $nloa,$nloa
+        bcs,a   %xcc,.+8
+        add     %g1,1,%g1
+        fdtox   $nlob,$nlob
+        fdtox   $nloc,$nloc
+        fdtox   $nlod,$nlod
+        std     $nloa,[%sp+$bias+$frame+0]
+        std     $nlob,[%sp+$bias+$frame+8]
+        addcc   $j,8,$j
+        std     $nloc,[%sp+$bias+$frame+16]
+        bz,pn   %icc,.Linnerskip
+        std     $nlod,[%sp+$bias+$frame+24]
+        ba      .Linner
+        nop
+.align  32
+.Linner:
+        ldd     [$ap_l+$j],$alo         ! load a[j] in double format
+        ldd     [$ap_h+$j],$ahi
+        ldd     [$np_l+$j],$nlo         ! load n[j] in double format
+        ldd     [$np_h+$j],$nhi
+                fmuld   $alo,$ba,$aloa
+                fmuld   $nlo,$na,$nloa
+                fmuld   $alo,$bb,$alob
+                fmuld   $nlo,$nb,$nlob
+                fmuld   $alo,$bc,$aloc
+        ldx     [%sp+$bias+$frame+0],%o0
+                faddd   $aloa,$nloa,$nloa
+                fmuld   $nlo,$nc,$nloc
+        ldx     [%sp+$bias+$frame+8],%o1
+                fmuld   $alo,$bd,$alod
+        ldx     [%sp+$bias+$frame+16],%o2
+                faddd   $alob,$nlob,$nlob
+                fmuld   $nlo,$nd,$nlod
+        ldx     [%sp+$bias+$frame+24],%o3
+                fmuld   $ahi,$ba,$ahia
+        srlx    %o0,16,%o7
+                faddd   $aloc,$nloc,$nloc
+                fmuld   $nhi,$na,$nhia
+        add     %o7,%o1,%o1
+                fmuld   $ahi,$bb,$ahib
+        srlx    %o1,16,%o7
+                faddd   $alod,$nlod,$nlod
+                fmuld   $nhi,$nb,$nhib
+        add     %o7,%o2,%o2
+                fmuld   $ahi,$bc,$ahic
+        srlx    %o2,16,%o7
+                faddd   $ahia,$nhia,$nhia
+                fmuld   $nhi,$nc,$nhic
+        add     %o7,%o3,%o3             ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+        and     %o0,$mask,%o0
+                fmuld   $ahi,$bd,$ahid
+        and     %o1,$mask,%o1
+        and     %o2,$mask,%o2
+                faddd   $ahib,$nhib,$nhib
+                fmuld   $nhi,$nd,$nhid
+        sllx    %o1,16,%o1
+                faddd   $dota,$nloa,$nloa
+        sllx    %o2,32,%o2
+                faddd   $dotb,$nlob,$nlob
+        sllx    %o3,48,%o7
+        or      %o1,%o0,%o0
+                faddd   $ahic,$nhic,$dota       ! $nhic
+        or      %o2,%o0,%o0
+                faddd   $ahid,$nhid,$dotb       ! $nhid
+        or      %o7,%o0,%o0             ! 64-bit result
+                faddd   $nloc,$nhia,$nloc
+        addcc   %g1,%o0,%o0
+        ldx     [$tp+8],%o7             ! tp[j]
+                faddd   $nlod,$nhib,$nlod
+        srlx    %o3,16,%g1              ! 34-bit carry
+                fdtox   $nloa,$nloa
+        bcs,a   %xcc,.+8
+        add     %g1,1,%g1
+                fdtox   $nlob,$nlob
+        addcc   %o7,%o0,%o0
+                fdtox   $nloc,$nloc
+        bcs,a   %xcc,.+8
+        add     %g1,1,%g1
+        stx     %o0,[$tp]               ! tp[j-1]
+                fdtox   $nlod,$nlod
+        std     $nloa,[%sp+$bias+$frame+0]
+        std     $nlob,[%sp+$bias+$frame+8]
+        std     $nloc,[%sp+$bias+$frame+16]
+        addcc   $j,8,$j
+        std     $nlod,[%sp+$bias+$frame+24]
+        bnz,pt  %icc,.Linner
+        add     $tp,8,$tp
+.Linnerskip:
+        fdtox   $dota,$dota
+        fdtox   $dotb,$dotb
+        ldx     [%sp+$bias+$frame+0],%o0
+        ldx     [%sp+$bias+$frame+8],%o1
+        ldx     [%sp+$bias+$frame+16],%o2
+        ldx     [%sp+$bias+$frame+24],%o3
+        srlx    %o0,16,%o7
+        std     $dota,[%sp+$bias+$frame+32]
+        add     %o7,%o1,%o1
+        std     $dotb,[%sp+$bias+$frame+40]
+        srlx    %o1,16,%o7
+        add     %o7,%o2,%o2
+        srlx    %o2,16,%o7
+        add     %o7,%o3,%o3             ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+        and     %o0,$mask,%o0
+        and     %o1,$mask,%o1
+        and     %o2,$mask,%o2
+        sllx    %o1,16,%o1
+        sllx    %o2,32,%o2
+        sllx    %o3,48,%o7
+        or      %o1,%o0,%o0
+        or      %o2,%o0,%o0
+        ldx     [%sp+$bias+$frame+32],%o4
+        or      %o7,%o0,%o0             ! 64-bit result
+        ldx     [%sp+$bias+$frame+40],%o5
+        addcc   %g1,%o0,%o0
+        ldx     [$tp+8],%o7             ! tp[j]
+        srlx    %o3,16,%g1              ! 34-bit carry
+        bcs,a   %xcc,.+8
+        add     %g1,1,%g1
+        addcc   %o7,%o0,%o0
+        bcs,a   %xcc,.+8
+        add     %g1,1,%g1
+        stx     %o0,[$tp]               ! tp[j-1]
+        add     $tp,8,$tp
+        srlx    %o4,16,%o7
+        add     %o7,%o5,%o5
+        and     %o4,$mask,%o4
+        sllx    %o5,16,%o7
+        or      %o7,%o4,%o4
+        addcc   %g1,%o4,%o4
+        srlx    %o5,48,%g1
+        bcs,a   %xcc,.+8
+        add     %g1,1,%g1
+        addcc   $carry,%o4,%o4
+        stx     %o4,[$tp]               ! tp[num-1]
+        mov     %g1,$carry
+        bcs,a   %xcc,.+8
+        add     $carry,1,$carry
+        addcc   $i,8,$i
+        bnz     %icc,.Louter
+        nop
+        add     $tp,8,$tp               ! adjust tp to point at the end
+        orn     %g0,%g0,%g4
+        sub     %g0,$num,%o7            ! n=-num
+        ba      .Lsub
+        subcc   %g0,%g0,%g0             ! clear %icc.c
+.align  32
+.Lsub:
+        ldx     [$tp+%o7],%o0
+        add     $np,%o7,%g1
+        ld      [%g1+0],%o2
+        ld      [%g1+4],%o3
+        srlx    %o0,32,%o1
+        subccc  %o0,%o2,%o2
+        add     $rp,%o7,%g1
+        subccc  %o1,%o3,%o3
+        st      %o2,[%g1+0]
+        add     %o7,8,%o7
+        brnz,pt %o7,.Lsub
+        st      %o3,[%g1+4]
+        subc    $carry,0,%g4
+        sub     %g0,$num,%o7            ! n=-num
+        ba      .Lcopy
+        nop
+.align  32
+.Lcopy:
+        ldx     [$tp+%o7],%o0
+        add     $rp,%o7,%g1
+        ld      [%g1+0],%o2
+        ld      [%g1+4],%o3
+        stx     %g0,[$tp+%o7]
+        and     %o0,%g4,%o0
+        srlx    %o0,32,%o1
+        andn    %o2,%g4,%o2
+        andn    %o3,%g4,%o3
+        or      %o2,%o0,%o0
+        or      %o3,%o1,%o1
+        st      %o0,[%g1+0]
+        add     %o7,8,%o7
+        brnz,pt %o7,.Lcopy
+        st      %o1,[%g1+4]
+        sub     %g0,$num,%o7            ! n=-num
+.Lzap:
+        stx     %g0,[$ap_l+%o7]
+        stx     %g0,[$ap_h+%o7]
+        stx     %g0,[$np_l+%o7]
+        stx     %g0,[$np_h+%o7]
+        add     %o7,8,%o7
+        brnz,pt %o7,.Lzap
+        nop
+        ldx     [%sp+$bias+$frame+48],%o7
+        wr      %g0,%o7,%asi            ! restore %asi
+        mov     1,%i0
+.Lret:
+        ret
+        restore
+.type   $fname,#function
+.size   $fname,(.-$fname)
+.asciz  "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
+.align  32
+___
+# Replace every back-quoted span in the generated text with the result
+# of evaluating its contents as a Perl expression.
+$code =~ s{\`([^\`]*)\`}{eval($1)}gem;
+# The rewrite below allows the output to be assembled without requesting
+# VIS extensions on the command line (e.g. -xarch=v9 instead of
+# -xarch=v9a). This is acceptable because VIS capability is detected at
+# run-time and this routine is never called on a CPU that cannot execute
+# it. Note that fzeros is not the only VIS dependency: the other one is
+# implicit, being merely a numerical value loaded into the %asi
+# register, which the assembler cannot recognize as VIS-specific.
+sub fzeros_word {
+    my ($freg) = @_;            # register number taken from "%fN"
+    return sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($freg<<25),$freg);
+}
+$code =~ s{fzeros\s+%f([0-9]+)}{fzeros_word($1)}gem;
+print $code;
+# flush
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/via-mont.pl b/src/lib/libcrypto/bn/asm/via-mont.pl
new file mode 100644
index 0000000000..c046a514c8
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/via-mont.pl
@@ -0,0 +1,242 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Wrapper around 'rep montmul', VIA-specific instruction accessing
+# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
+# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
+#
+# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
+# different software configurations on 1.5GHz VIA Esther processor.
+# Lines marked with "software integer" denote performance of hand-
+# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
+# refers to hand-coded SSE2 Montgomery multiplication procedure found
+# in OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
+# Padlock SDK 2.0.1 available for download from VIA, which naturally
+# utilizes the magic 'repz montmul' instruction. And finally "hardware
+# this" refers to *this* implementation which also uses 'repz montmul'.
+#
+#                   sign    verify    sign/s verify/s
+# rsa  512 bits 0.001720s 0.000140s    581.4   7149.7   software integer
+# rsa  512 bits 0.000690s 0.000086s   1450.3  11606.0   software SSE2
+# rsa  512 bits 0.006136s 0.000201s    163.0   4974.5   hardware VIA SDK
+# rsa  512 bits 0.000712s 0.000050s   1404.9  19858.5   hardware this
+#
+# rsa 1024 bits 0.008518s 0.000413s    117.4   2420.8   software integer
+# rsa 1024 bits 0.004275s 0.000277s    233.9   3609.7   software SSE2
+# rsa 1024 bits 0.012136s 0.000260s     82.4   3844.5   hardware VIA SDK
+# rsa 1024 bits 0.002522s 0.000116s    396.5   8650.9   hardware this
+#
+# rsa 2048 bits 0.050101s 0.001371s     20.0    729.6   software integer
+# rsa 2048 bits 0.030273s 0.001008s     33.0    991.9   software SSE2
+# rsa 2048 bits 0.030833s 0.000976s     32.4   1025.1   hardware VIA SDK
+# rsa 2048 bits 0.011879s 0.000342s     84.2   2921.7   hardware this
+#
+# rsa 4096 bits 0.327097s 0.004859s      3.1    205.8   software integer
+# rsa 4096 bits 0.229318s 0.003859s      4.4    259.2   software SSE2
+# rsa 4096 bits 0.233953s 0.003274s      4.3    305.4   hardware VIA SDK
+# rsa 4096 bits 0.070493s 0.001166s     14.2    857.6   hardware this
+#
+# dsa  512 bits 0.001342s 0.001651s    745.2    605.7   software integer
+# dsa  512 bits 0.000844s 0.000987s   1185.3   1013.1   software SSE2
+# dsa  512 bits 0.001902s 0.002247s    525.6    444.9   hardware VIA SDK
+# dsa  512 bits 0.000458s 0.000524s   2182.2   1909.1   hardware this
+#
+# dsa 1024 bits 0.003964s 0.004926s    252.3    203.0   software integer
+# dsa 1024 bits 0.002686s 0.003166s    372.3    315.8   software SSE2
+# dsa 1024 bits 0.002397s 0.002823s    417.1    354.3   hardware VIA SDK
+# dsa 1024 bits 0.000978s 0.001170s   1022.2    855.0   hardware this
+#
+# dsa 2048 bits 0.013280s 0.016518s     75.3     60.5   software integer
+# dsa 2048 bits 0.009911s 0.011522s    100.9     86.8   software SSE2
+# dsa 2048 bits 0.009542s 0.011763s    104.8     85.0   hardware VIA SDK
+# dsa 2048 bits 0.002884s 0.003352s    346.8    298.3   hardware this
+#
+# To give you some other reference point here is output for 2.4GHz P4
+# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
+# SSE2" in above terms.
+#
+# rsa  512 bits 0.000407s 0.000047s   2454.2  21137.0
+# rsa 1024 bits 0.002426s 0.000141s    412.1   7100.0
+# rsa 2048 bits 0.015046s 0.000491s     66.5   2034.9
+# rsa 4096 bits 0.109770s 0.002379s      9.1    420.3
+# dsa  512 bits 0.000438s 0.000525s   2281.1   1904.1
+# dsa 1024 bits 0.001346s 0.001595s    742.7    627.0
+# dsa 2048 bits 0.004745s 0.005582s    210.7    179.1
+#
+# Conclusions: 
+# - VIA SDK leaves a *lot* of room for improvement (which this
+#   implementation successfully fills:-);
+# - 'rep montmul' gives up to >3x performance improvement depending on
+#   key length;
+# - in terms of absolute performance it delivers approximately as much
+#   as modern out-of-order 32-bit cores [again, for longer keys].
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;        # $dir = this script's path up to and including the last / or \
+push(@INC,"${dir}","${dir}../../perlasm");     # search the script's directory and ../../perlasm for modules
+require "x86asm.pl";                           # presumably supplies &asm_init, &DWP, &function_begin etc. used below -- confirm
+&asm_init($ARGV[0],"via-mont.pl");             # first command-line argument is handed to the helper library
+# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
+$func="bn_mul_mont_padlock";                   # name of the emitted function
+$pad=16*1;      # amount of reserved bytes on top of every vector
+# stack layout
+$mZeroPrime=&DWP(0,"esp");              # these are specified by VIA
+$A=&DWP(4,"esp");                       # pointer to the padded copy of ap
+$B=&DWP(8,"esp");                       # pointer to the padded copy of bp
+$T=&DWP(12,"esp");                      # pointer to tp
+$M=&DWP(16,"esp");                      # pointer to the padded copy of np
+$scratch=&DWP(20,"esp");                # pointer to the 32 byte scratch area
+$rp=&DWP(24,"esp");                     # these are mine
+$sp=&DWP(28,"esp");                     # caller's esp, saved across the alloca
+# &DWP(32,"esp")                        # 32 byte scratch area
+# &DWP(64+(4*$num+$pad)*0,"esp")        # padded tp[num]
+# &DWP(64+(4*$num+$pad)*1,"esp")        # padded copy of ap[num]
+# &DWP(64+(4*$num+$pad)*2,"esp")        # padded copy of bp[num]
+# &DWP(64+(4*$num+$pad)*3,"esp")        # padded copy of np[num]
+# Note that SDK suggests to unconditionally allocate 2K per vector. This
+# has quite an impact on performance. It naturally depends on key length,
+# but to give an example 1024 bit private RSA key operations suffer >30%
+# penalty. I allocate only as much as actually required...
+&function_begin($func);                 # leaves eax=1 after a 'rep montmul' run, eax=0 when num is rejected below
+        &xor    ("eax","eax");          # eax=0: value left in eax on the early "leave" path
+        &mov    ("ecx",&wparam(5));     # num
+        # meet VIA's limitations for num [note that the specification
+        # expresses them in bits, while we work with amount of 32-bit words]
+        &test   ("ecx",3);
+        &jnz    (&label("leave"));      # num % 4 != 0
+        &cmp    ("ecx",8);
+        &jb     (&label("leave"));      # num < 8
+        &cmp    ("ecx",1024);
+        &ja     (&label("leave"));      # num > 1024
+        &pushf  ();                     # save flags; matching popf sits just before "leave"
+        &cld    ();                     # make the rep stosl/movsl below advance upwards
+        &mov    ("edi",&wparam(0));     # rp
+        &mov    ("eax",&wparam(1));     # ap
+        &mov    ("ebx",&wparam(2));     # bp
+        &mov    ("edx",&wparam(3));     # np
+        &mov    ("esi",&wparam(4));     # n0
+        &mov    ("esi",&DWP(0,"esi"));  # *n0
+        &lea    ("ecx",&DWP($pad,"","ecx",4));  # ecx becomes vector size in bytes
+        &lea    ("ebp",&DWP(64,"","ecx",4));    # allocate 4 vectors + 64 bytes
+        &neg    ("ebp");
+        &add    ("ebp","esp");          # ebp = esp - frame size
+        &and    ("ebp",-64);            # align to cache-line
+        &xchg   ("ebp","esp");          # alloca: esp = new frame, ebp = caller's esp
+        &mov    ($rp,"edi");            # save rp
+        &mov    ($sp,"ebp");            # save esp
+        &mov    ($mZeroPrime,"esi");    # store *n0 in the first parameter slot
+        &lea    ("esi",&DWP(64,"esp")); # tp
+        &mov    ($T,"esi");
+        &lea    ("edi",&DWP(32,"esp")); # scratch area
+        &mov    ($scratch,"edi");
+        &mov    ("esi","eax");          # esi = ap, source of the first memcpy
+        &lea    ("ebp",&DWP(-$pad,"ecx"));
+        &shr    ("ebp",2);              # restore original num value in ebp
+        &xor    ("eax","eax");          # eax=0: fill value for every rep stosl below
+        &mov    ("ecx","ebp");
+        &lea    ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
+        &data_byte(0xf3,0xab);          # rep stosl, bzero
+        &mov    ("ecx","ebp");
+        &lea    ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
+        &mov    ($A,"edi");
+        &data_byte(0xf3,0xa5);          # rep movsl, memcpy
+        &mov    ("ecx",$pad/4);
+        &data_byte(0xf3,0xab);          # rep stosl, bzero pad
+        # edi points at the end of padded ap copy...
+        &mov    ("ecx","ebp");
+        &mov    ("esi","ebx");          # esi = bp
+        &mov    ($B,"edi");
+        &data_byte(0xf3,0xa5);          # rep movsl, memcpy
+        &mov    ("ecx",$pad/4);
+        &data_byte(0xf3,0xab);          # rep stosl, bzero pad
+        # edi points at the end of padded bp copy...
+        &mov    ("ecx","ebp");
+        &mov    ("esi","edx");          # esi = np
+        &mov    ($M,"edi");
+        &data_byte(0xf3,0xa5);          # rep movsl, memcpy
+        &mov    ("ecx",$pad/4);
+        &data_byte(0xf3,0xab);          # rep stosl, bzero pad
+        # edi points at the end of padded np copy...
+        # let magic happen...
+        &mov    ("ecx","ebp");
+        &mov    ("esi","esp");          # esi = base of the frame, where the VIA-specified slots live
+        &shl    ("ecx",5);              # convert word counter to bit counter
+        &align  (4);
+        &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
+        &mov    ("ecx","ebp");          # ecx = num, trip count for the "sub" loop
+        &lea    ("esi",&DWP(64,"esp"));         # tp
+        # edi still points at the end of padded np copy...
+        &neg    ("ebp");
+        &lea    ("ebp",&DWP(-$pad,"edi","ebp",4));      # so just "rewind"
+        &mov    ("edi",$rp);                    # restore rp
+        &xor    ("edx","edx");                  # i=0 and clear CF
+&set_label("sub",8);
+        &mov    ("eax",&DWP(0,"esi","edx",4));
+        &sbb    ("eax",&DWP(0,"ebp","edx",4));  # subtract np[i] and the borrow from the previous word
+        &mov    (&DWP(0,"edi","edx",4),"eax");  # rp[i]=tp[i]-np[i]
+        &lea    ("edx",&DWP(1,"edx"));          # i++
+        &loop   (&label("sub"));                # doesn't affect CF!
+        &mov    ("eax",&DWP(0,"esi","edx",4));  # upmost overflow bit
+        &sbb    ("eax",0);                      # eax = overflow word minus the final borrow, used as a mask
+        &and    ("esi","eax");                  # keep the tp pointer bits selected by the mask
+        &not    ("eax");
+        &mov    ("ebp","edi");
+        &and    ("ebp","eax");                  # keep the rp pointer bits selected by the inverted mask
+        &or     ("esi","ebp");                  # tp=carry?tp:rp
+        &mov    ("ecx","edx");                  # num
+        &xor    ("edx","edx");                  # i=0
+&set_label("copy",8);
+        &mov    ("eax",&DWP(0,"esi","edx",4));  # fetch word i of the selected source
+        &mov    (&DWP(64,"esp","edx",4),"ecx"); # zap tp (overwritten with the loop counter in ecx)
+        &mov    (&DWP(0,"edi","edx",4),"eax");  # rp[i] = selected word
+        &lea    ("edx",&DWP(1,"edx"));          # i++
+        &loop   (&label("copy"));
+        &mov    ("ebp",$sp);            # reload caller's esp before the frame is wiped
+        &xor    ("eax","eax");          # eax=0: fill value for the wipes below
+        &mov    ("ecx",64/4);
+        &mov    ("edi","esp");          # zap frame including scratch area
+        &data_byte(0xf3,0xab);          # rep stosl, bzero
+        # zap copies of ap, bp and np
+        &lea    ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
+        &lea    ("ecx",&DWP(3*$pad/4,"edx","edx",2));# 3*(num+$pad/4) words, edx still holds num
+        &data_byte(0xf3,0xab);          # rep stosl, bzero
+        &mov    ("esp","ebp");          # release the frame
+        &inc    ("eax");                # signal "done"
+        &popf   ();                     # restore flags saved by pushf
+&set_label("leave");                    # early exits land here with eax=0 and nothing pushed
+&function_end($func);
+&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
+&asm_finish();
diff --git a/src/lib/libcrypto/bn/asm/x86-mont.pl b/src/lib/libcrypto/bn/asm/x86-mont.pl
new file mode 100755
index 0000000000..5cd3cd2ed5
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86-mont.pl
@@ -0,0 +1,591 @@
+#!/usr/bin/env perl
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+# October 2005
+#
+# This is a "teaser" code, as it can be improved in several ways...
+# First of all non-SSE2 path should be implemented (yes, for now it
+# performs Montgomery multiplication/convolution only on SSE2-capable
+# CPUs such as P4, others fall down to original code). Then inner loop
+# can be unrolled and modulo-scheduled to improve ILP and possibly
+# moved to 128-bit XMM register bank (though it would require input
+# rearrangement and/or increase bus bandwidth utilization). Dedicated
+# squaring procedure should give further performance improvement...
+# Yet, for being draft, the code improves rsa512 *sign* benchmark by
+# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
+# December 2006
+#
+# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
+# Integer-only code [being equipped with dedicated squaring procedure]
+# gives ~40% on rsa512 sign benchmark...
+# Work out the directory this script was started from, so that the perlasm
+# helper can be required either from alongside it or from ../../perlasm.
+($dir) = ($0 =~ m/(.*[\/\\])[^\/\\]+$/);
+push(@INC, "${dir}", "${dir}../../perlasm");
+require "x86asm.pl";
+&asm_init($ARGV[0], $0);
+# The SSE2 code path is generated only when -DOPENSSL_IA32_SSE2 appears
+# among the command-line arguments.
+$sse2 = (grep { /-DOPENSSL_IA32_SSE2/ } @ARGV) ? 1 : 0;
+if ($sse2) { &external_label("OPENSSL_ia32cap_P"); }
+&function_begin("bn_mul_mont");
+# Register roles.  Note that $ap/$tp share esi and $rp/$bp share edi:
+# these are overlapping variables!
+($i,  $j)   = ("edx", "ecx");
+($ap, $tp)  = ("esi", "esi");
+($rp, $bp)  = ("edi", "edi");
+($np, $num) = ("ebp", "ebx");
+# Stack top layout: eight consecutive 32-bit slots addressed from esp.
+my $slot = sub { return &DWP(4*$_[0], "esp"); };
+$_num   = $slot->(0);
+$_rp    = $slot->(1);
+$_ap    = $slot->(2);
+$_bp    = $slot->(3);
+$_np    = $slot->(4);
+$_n0    = $slot->(5);
+$_n0q   = &QWP(4*5, "esp");             # same slot as $_n0, as a 64-bit operand
+$_sp    = $slot->(6);
+$_bpend = $slot->(7);
+$frame  = 32;                           # size of above frame rounded up to 16n
+        # ---- prologue: check num, carve out the stack frame plus the tp
+        # ---- vector, and stash the arguments in the frame
+        &xor    ("eax","eax");          # eax=0 is what the early exit below returns
+        &mov    ("edi",&wparam(5));     # int num
+        &cmp    ("edi",4);
+        &jl     (&label("just_leave")); # num<4: leave with eax still 0
+        &lea    ("esi",&wparam(0));     # put aside pointer to argument block
+        # NOTE(review): this is lea, so edx receives the address of the ap
+        # argument slot rather than the value of ap; the original comment
+        # read "load ap" -- confirm which was intended.
+        &lea    ("edx",&wparam(1));
+        &mov    ("ebp","esp");          # saved stack pointer!
+        &add    ("edi",2);              # extra two words on top of tp
+        &neg    ("edi");                # negate so the scaled index subtracts
+        &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
+        &neg    ("edi");                # back to num+2
+        # minimize cache contention by arranging 2K window between stack
+        # pointer and ap argument [np is also position sensitive vector,
+        # but it's assumed to be near ap, as it's allocated at ~same
+        # time].
+        &mov    ("eax","esp");
+        &sub    ("eax","edx");
+        &and    ("eax",2047);
+        &sub    ("esp","eax");          # this aligns sp and ap modulo 2048
+        &xor    ("edx","esp");
+        &and    ("edx",2048);
+        &xor    ("edx",2048);
+        &sub    ("esp","edx");          # this splits them apart modulo 4096
+        &and    ("esp",-64);            # align to cache line
+        ################################# load argument block...
+        &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+        &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+        &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+        &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+        &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+        #&mov   ("edi",&DWP(5*4,"esi"));# int num
+        &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
+        &mov    ($_rp,"eax");           # ... save a copy of argument block
+        &mov    ($_ap,"ebx");
+        &mov    ($_bp,"ecx");
+        &mov    ($_np,"edx");
+        &mov    ($_n0,"esi");           # the value n0[0], not the pointer
+        # edi still holds num+2 here, hence the -3 displacement
+        &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling
+        #&mov   ($_num,$num);           # redundant as $num is not reused
+        &mov    ($_sp,"ebp");           # saved stack pointer!
+# SSE2 path.  Generated only when $sse2 is set.  At run time it is taken only
+# if bit 26 of the word at OPENSSL_ia32cap_P is set; otherwise control goes
+# to the non_sse2 label that closes this block.  From here on $num holds the
+# caller's num minus one (see the prologue).
+if($sse2) {
+$acc0="mm0";    # mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+        &picmeup("eax","OPENSSL_ia32cap_P");
+        &bt     (&DWP(0,"eax"),26);     # CF = capability bit 26
+        &jnc    (&label("non_sse2"));   # bit clear: use the code after this block
+        &mov    ("eax",-1);
+        &movd   ($mask,"eax");          # mask 32 lower bits
+        &mov    ($ap,$_ap);             # load input pointers
+        &mov    ($bp,$_bp);
+        &mov    ($np,$_np);
+        &xor    ($i,$i);                # i=0
+        &xor    ($j,$j);                # j=0
+        # ---- first pass (i=0): tp = (ap*bp[0] + m1*np) >> 32
+        &movd   ($mul0,&DWP(0,$bp));            # bp[0]
+        &movd   ($mul1,&DWP(0,$ap));            # ap[0]
+        &movd   ($car1,&DWP(0,$np));            # np[0]
+        &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
+        &movq   ($car0,$mul1);
+        &movq   ($acc0,$mul1);                  # I wish movd worked for
+        &pand   ($acc0,$mask);                  # inter-register transfers
+        &pmuludq($mul1,$_n0q);                  # *=n0
+        &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
+        &paddq  ($car1,$acc0);
+        &movd   ($acc1,&DWP(4,$np));            # np[1]
+        &movd   ($acc0,&DWP(4,$ap));            # ap[1]
+        &psrlq  ($car0,32);
+        &psrlq  ($car1,32);
+        &inc    ($j);                           # j++
+&set_label("1st",16);
+        &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
+        &pmuludq($acc1,$mul1);                  # np[j]*m1
+        &paddq  ($car0,$acc0);                  # +=c0
+        &paddq  ($car1,$acc1);                  # +=c1
+        &movq   ($acc0,$car0);
+        &pand   ($acc0,$mask);
+        &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
+        &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
+        &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
+        &psrlq  ($car0,32);
+        &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
+        &psrlq  ($car1,32);
+        &lea    ($j,&DWP(1,$j));
+        &cmp    ($j,$num);
+        &jl     (&label("1st"));
+        # last word of the first pass, peeled out of the loop
+        &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
+        &pmuludq($acc1,$mul1);                  # np[num-1]*m1
+        &paddq  ($car0,$acc0);                  # +=c0
+        &paddq  ($car1,$acc1);                  # +=c1
+        &movq   ($acc0,$car0);
+        &pand   ($acc0,$mask);
+        &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
+        &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
+        &psrlq  ($car0,32);
+        &psrlq  ($car1,32);
+        &paddq  ($car1,$car0);
+        &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
+        &inc    ($i);                           # i++
+        # ---- remaining passes: tp = (tp + ap*bp[i] + m1*np) >> 32
+&set_label("outer");
+        &xor    ($j,$j);                        # j=0
+        &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
+        &movd   ($mul1,&DWP(0,$ap));            # ap[0]
+        &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
+        &movd   ($car1,&DWP(0,$np));            # np[0]
+        &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
+        &paddq  ($mul1,$temp);                  # +=tp[0]
+        &movq   ($acc0,$mul1);
+        &movq   ($car0,$mul1);
+        &pand   ($acc0,$mask);
+        &pmuludq($mul1,$_n0q);                  # *=n0
+        &pmuludq($car1,$mul1);
+        &paddq  ($car1,$acc0);
+        &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
+        &movd   ($acc1,&DWP(4,$np));            # np[1]
+        &movd   ($acc0,&DWP(4,$ap));            # ap[1]
+        &psrlq  ($car0,32);
+        &psrlq  ($car1,32);
+        &paddq  ($car0,$temp);                  # +=tp[1]
+        &inc    ($j);                           # j++
+        &dec    ($num);                         # $num doubles as the inner down-counter
+&set_label("inner");
+        &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
+        &pmuludq($acc1,$mul1);                  # np[j]*m1
+        &paddq  ($car0,$acc0);                  # +=c0
+        &paddq  ($car1,$acc1);                  # +=c1
+        &movq   ($acc0,$car0);
+        &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
+        &pand   ($acc0,$mask);
+        &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
+        &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
+        &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
+        &psrlq  ($car0,32);
+        &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
+        &psrlq  ($car1,32);
+        &paddq  ($car0,$temp);                  # +=tp[j+1]
+        &dec    ($num);                         # sets ZF for the jnz below
+        &lea    ($j,&DWP(1,$j));                # j++ (lea leaves flags alone)
+        &jnz    (&label("inner"));
+        &mov    ($num,$j);                      # counter ran to zero; $j equals the old $num
+        &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
+        &pmuludq($acc1,$mul1);                  # np[num-1]*m1
+        &paddq  ($car0,$acc0);                  # +=c0
+        &paddq  ($car1,$acc1);                  # +=c1
+        &movq   ($acc0,$car0);
+        &pand   ($acc0,$mask);
+        &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
+        &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
+        &psrlq  ($car0,32);
+        &psrlq  ($car1,32);
+        &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
+        &paddq  ($car1,$car0);
+        &paddq  ($car1,$temp);
+        &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
+        &lea    ($i,&DWP(1,$i));                # i++
+        &cmp    ($i,$num);
+        &jle    (&label("outer"));
+        &emms   ();                             # done with mmx bank
+        &jmp    (&label("common_tail"));
+&set_label("non_sse2",16);
+}
+# Integer-only path.  The if (0) arm is a disabled bail-out (restore esp,
+# return 0) that is never generated; the else arm is the code that is
+# actually emitted.  $num still holds the caller's num minus one.
+if (0) {
+        &mov    ("esp",$_sp);
+        &xor    ("eax","eax");  # signal "not fast enough [yet]"
+        &jmp    (&label("just_leave"));
+        # While the below code provides competitive performance for
+        # all key lengths on modern Intel cores, it's still more
+        # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
+        # means compared to the original integer-only assembler.
+        # 512-bit RSA sign is better by ~40%, but that's about all
+        # one can say about all CPUs...
+} else {
+$inp="esi";     # integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+        # Choose between general multiplication and the dedicated squaring
+        # code: the jz below is taken only when ap==bp AND num is even.
+        &mov    ($inp,$_ap);
+        &lea    ($carry,&DWP(1,$num));                  # $num+1, i.e. the caller's num
+        &mov    ($word,$_bp);
+        &xor    ($j,$j);                                # j=0
+        &mov    ("edx",$inp);
+        &and    ($carry,1);                             # see if num is even
+        &sub    ("edx",$word);                          # see if ap==bp
+        &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
+        &or     ($carry,"edx");                         # ZF=1 iff both tests gave zero
+        &mov    ($word,&DWP(0,$word));                  # bp[0]
+        &jz     (&label("bn_sqr_mont"));
+        &mov    ($_bpend,"eax");                        # end marker for the bp walk
+        &mov    ("eax",&DWP(0,$inp));
+        &xor    ("edx","edx");
+        # ---- first pass: tp = ap*bp[0]
+&set_label("mull",16);
+        &mov    ($carry,"edx");
+        &mul    ($word);                                # ap[j]*bp[0]
+        &add    ($carry,"eax");
+        &lea    ($j,&DWP(1,$j));
+        &adc    ("edx",0);
+        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
+        &cmp    ($j,$num);
+        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
+        &jl     (&label("mull"));
+        # last word of the pass, interleaved (extra-indented lines) with
+        # setting up the reduction: m = n0*tp[0], $inp = np
+        &mov    ($carry,"edx");
+        &mul    ($word);                                # ap[num-1]*bp[0]
+         &mov   ($word,$_n0);
+        &add    ("eax",$carry);
+         &mov   ($inp,$_np);
+        &adc    ("edx",0);
+         &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
+        &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
+        &xor    ($j,$j);
+        &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
+        &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
+        &mov    ("eax",&DWP(0,$inp));                   # np[0]
+        &mul    ($word);                                # np[0]*m
+        &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+        &mov    ("eax",&DWP(4,$inp));                   # np[1]
+        &adc    ("edx",0);
+        &inc    ($j);
+        &jmp    (&label("2ndmadd"));
+        # ---- multiply-accumulate pass: tp += ap*bp[i]
+&set_label("1stmadd",16);
+        &mov    ($carry,"edx");
+        &mul    ($word);                                # ap[j]*bp[i]
+        &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
+        &lea    ($j,&DWP(1,$j));
+        &adc    ("edx",0);
+        &add    ($carry,"eax");
+        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
+        &adc    ("edx",0);
+        &cmp    ($j,$num);
+        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
+        &jl     (&label("1stmadd"));
+        &mov    ($carry,"edx");
+        &mul    ($word);                                # ap[num-1]*bp[i]
+        &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
+         &mov   ($word,$_n0);
+        &adc    ("edx",0);
+         &mov   ($inp,$_np);
+        &add    ($carry,"eax");
+        &adc    ("edx",0);
+         &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
+        &xor    ($j,$j);
+        &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
+        &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
+        &adc    ($j,0);
+         &mov   ("eax",&DWP(0,$inp));                   # np[0]
+        &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
+        &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
+        &mul    ($word);                                # np[0]*m
+        &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+        &mov    ("eax",&DWP(4,$inp));                   # np[1]
+        &adc    ("edx",0);
+        &mov    ($j,1);
+        # ---- reduction pass: tp = (tp + m*np) shifted down by one word
+&set_label("2ndmadd",16);
+        &mov    ($carry,"edx");
+        &mul    ($word);                                # np[j]*m
+        &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
+        &lea    ($j,&DWP(1,$j));
+        &adc    ("edx",0);
+        &add    ($carry,"eax");
+        &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
+        &adc    ("edx",0);
+        &cmp    ($j,$num);
+        &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
+        &jl     (&label("2ndmadd"));
+        &mov    ($carry,"edx");
+        &mul    ($word);                                # np[j]*m
+        &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
+        &adc    ("edx",0);
+        &add    ($carry,"eax");
+        &adc    ("edx",0);
+        &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
+        &xor    ("eax","eax");
+         &mov   ($j,$_bp);                              # &bp[i]
+        &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
+        &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
+         &lea   ($j,&DWP(4,$j));                        # step to the next bp word
+        &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
+         &cmp   ($j,$_bpend);
+        &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
+        &je     (&label("common_tail"));                # all of bp consumed
+        &mov    ($word,&DWP(0,$j));                     # bp[i+1]
+        &mov    ($inp,$_ap);
+        &mov    ($_bp,$j);                              # &bp[++i]
+        &xor    ($j,$j);
+        &xor    ("edx","edx");
+        &mov    ("eax",&DWP(0,$inp));
+        &jmp    (&label("1stmadd"));
+        # ---- dedicated squaring path (reached only when ap==bp and num is
+        # even).  $sbit shares ebx with $num, so the count is parked in the
+        # $_num slot, and the $_bp slot is reused to hold the index i.
+&set_label("bn_sqr_mont",16);
+$sbit=$num;
+        &mov    ($_num,$num);
+        &mov    ($_bp,$j);                              # i=0
+        &mov    ("eax",$word);                          # ap[0]
+        &mul    ($word);                                # ap[0]*ap[0]
+        &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
+        # The loop below doubles what it accumulates, so the high word of
+        # ap[0]^2 is halved first and its low bit is carried in $sbit.
+        &mov    ($sbit,"edx");
+        &shr    ("edx",1);
+        &and    ($sbit,1);
+        &inc    ($j);
+&set_label("sqr",16);
+        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
+        &mov    ($carry,"edx");
+        &mul    ($word);                                # ap[j]*ap[0]
+        &add    ("eax",$carry);
+        &lea    ($j,&DWP(1,$j));
+        &adc    ("edx",0);
+        &lea    ($carry,&DWP(0,$sbit,"eax",2));         # 2*eax + bit shifted out last time
+        &shr    ("eax",31);                             # bit shifted out this time
+        &cmp    ($j,$_num);
+        &mov    ($sbit,"eax");
+        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
+        &jl     (&label("sqr"));
+        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
+        &mov    ($carry,"edx");
+        &mul    ($word);                                # ap[num-1]*ap[0]
+        &add    ("eax",$carry);
+         &mov   ($word,$_n0);
+        &adc    ("edx",0);
+         &mov   ($inp,$_np);
+        &lea    ($carry,&DWP(0,$sbit,"eax",2));
+         &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
+        &shr    ("eax",31);
+        &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
+        &lea    ($carry,&DWP(0,"eax","edx",2));
+         &mov   ("eax",&DWP(0,$inp));                   # np[0]
+        &shr    ("edx",31);
+        &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
+        &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
+        &mul    ($word);                                # np[0]*m
+        &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+        &mov    ($num,$j);                              # ebx is $num again
+        &adc    ("edx",0);
+        &mov    ("eax",&DWP(4,$inp));                   # np[1]
+        &mov    ($j,1);
+        # ---- reduction pass for the squaring path: same job as 2ndmadd,
+        # but handling two words per iteration (j advances by 2)
+&set_label("3rdmadd",16);
+        &mov    ($carry,"edx");
+        &mul    ($word);                                # np[j]*m
+        &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
+        &adc    ("edx",0);
+        &add    ($carry,"eax");
+        &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
+        &adc    ("edx",0);
+        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
+        &mov    ($carry,"edx");
+        &mul    ($word);                                # np[j+1]*m
+        &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
+        &lea    ($j,&DWP(2,$j));
+        &adc    ("edx",0);
+        &add    ($carry,"eax");
+        &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
+        &adc    ("edx",0);
+        &cmp    ($j,$num);
+        &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
+        &jl     (&label("3rdmadd"));
+        &mov    ($carry,"edx");
+        &mul    ($word);                                # np[j]*m
+        &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
+        &adc    ("edx",0);
+        &add    ($carry,"eax");
+        &adc    ("edx",0);
+        &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
+        &mov    ($j,$_bp);                              # i
+        &xor    ("eax","eax");
+        &mov    ($inp,$_ap);
+        &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
+        &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
+        &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
+        &cmp    ($j,$num);
+        &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
+        &je     (&label("common_tail"));                # i reached $num: done
+        # ---- next squaring pass: add ap[i]^2 plus the doubled cross
+        # products ap[j]*ap[i] for j>i into tp
+        &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
+        &lea    ($j,&DWP(1,$j));
+        &mov    ("eax",$word);
+        &mov    ($_bp,$j);                              # ++i
+        &mul    ($word);                                # ap[i]*ap[i]
+        &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
+        &adc    ("edx",0);
+        &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
+        &xor    ($carry,$carry);
+        &cmp    ($j,$num);
+        &lea    ($j,&DWP(1,$j));                        # lea keeps the flags of the cmp above
+        &je     (&label("sqrlast"));                    # i is the last index: no cross products
+        &mov    ($sbit,"edx");                          # zaps $num
+        &shr    ("edx",1);
+        &and    ($sbit,1);
+&set_label("sqradd",16);
+        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
+        &mov    ($carry,"edx");
+        &mul    ($word);                                # ap[j]*ap[i]
+        &add    ("eax",$carry);
+        &lea    ($carry,&DWP(0,"eax","eax"));           # 2*eax
+        &adc    ("edx",0);
+        &shr    ("eax",31);                             # bit lost by the doubling
+        &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
+        &lea    ($j,&DWP(1,$j));
+        &adc    ("eax",0);
+        &add    ($carry,$sbit);
+        &adc    ("eax",0);
+        &cmp    ($j,$_num);                             # count comes from memory: ebx is $sbit here
+        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
+        &mov    ($sbit,"eax");
+        &jle    (&label("sqradd"));
+        &mov    ($carry,"edx");
+        &lea    ("edx",&DWP(0,$sbit,"edx",2));
+        &shr    ($carry,31);
+&set_label("sqrlast");
+        &mov    ($word,$_n0);
+        &mov    ($inp,$_np);
+        &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
+        &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
+        &mov    ("eax",&DWP(0,$inp));                   # np[0]
+        &adc    ($carry,0);
+        &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
+        &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
+        &mul    ($word);                                # np[0]*m
+        &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+        &lea    ($num,&DWP(-1,$j));                     # rebuild $num from j (ebx was $sbit)
+        &adc    ("edx",0);
+        &mov    ($j,1);
+        &mov    ("eax",&DWP(4,$inp));                   # np[1]
+        &jmp    (&label("3rdmadd"));
+}
+# Shared epilogue for all paths: write tp-np to rp, then, depending on the
+# final borrow, either keep that result or copy tp over it; overwrite the
+# temporary vector, restore esp and return 1.
+&set_label("common_tail",16);
+        &mov    ($np,$_np);                     # load modulus pointer
+        &mov    ($rp,$_rp);                     # load result pointer
+        &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
+        &mov    ("eax",&DWP(0,$tp));            # tp[0]
+        &mov    ($j,$num);                      # j=num-1
+        &xor    ($i,$i);                        # i=0 and clear CF!
+        # The borrow lives in CF across iterations, so the loop body may only
+        # use instructions that leave CF alone between one sbb and the next.
+&set_label("sub",16);
+        &sbb    ("eax",&DWP(0,$np,$i,4));
+        &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
+        &dec    ($j);                           # doesn't affect CF!
+        &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
+        &lea    ($i,&DWP(1,$i));                # i++
+        &jge    (&label("sub"));                # tests the sign flags left by dec
+        # eax now holds the word just above the subtracted range
+        &sbb    ("eax",0);                      # handle upmost overflow bit
+        # eax is used as an AND mask: $tp = ($tp & eax) | ($rp & ~eax)
+        &and    ($tp,"eax");
+        &not    ("eax");
+        &mov    ($np,$rp);
+        &and    ($np,"eax");
+        &or     ($tp,$np);                      # tp=carry?tp:rp
+&set_label("copy",16);                          # copy or in-place refresh
+        &mov    ("eax",&DWP(0,$tp,$num,4));
+        &mov    (&DWP(0,$rp,$num,4),"eax");     # rp[i]=tp[i]
+        # $j is the leftover counter of the sub loop, so the scratch words
+        # are overwritten with that value rather than with zero
+        &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
+        &dec    ($num);
+        &jge    (&label("copy"));
+        &mov    ("esp",$_sp);           # pull saved stack pointer
+        &mov    ("eax",1);              # return 1 (the early exit returns 0)
+&set_label("just_leave");
+&function_end("bn_mul_mont");
+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+&asm_finish();
author	djm <>	2009-04-06 06:30:10 +0000
committer	djm <>	2009-04-06 06:30:10 +0000
commit	2b6e09b39ef1d803b50ee024a06d1c250fde442d (patch)
tree	f116109c359f26a2b149bbc752be39c16099bae1 /src/lib/libcrypto/bn/asm
parent	a0fdc9ec41594852f67ec77dfad9cb06bacc4186 (diff)
download	openbsd-2b6e09b39ef1d803b50ee024a06d1c250fde442d.tar.gz openbsd-2b6e09b39ef1d803b50ee024a06d1c250fde442d.tar.bz2 openbsd-2b6e09b39ef1d803b50ee024a06d1c250fde442d.zip