diff options
| author | jsing <> | 2016-09-04 14:31:29 +0000 |
|---|---|---|
| committer | jsing <> | 2016-09-04 14:31:29 +0000 |
| commit | e38c58272a121e2bc9a785ec4001bbc802d68f66 (patch) | |
| tree | 492fd2a4355d8592de425463d194374bdc85aa0a /src/lib/libcrypto/bn | |
| parent | a9cbed3be03a99c87e2b07b16b511e65a90bf800 (diff) | |
| download | openbsd-e38c58272a121e2bc9a785ec4001bbc802d68f66.tar.gz openbsd-e38c58272a121e2bc9a785ec4001bbc802d68f66.tar.bz2 openbsd-e38c58272a121e2bc9a785ec4001bbc802d68f66.zip | |
Less S390.
ok deraadt@
Diffstat (limited to 'src/lib/libcrypto/bn')
| -rw-r--r-- | src/lib/libcrypto/bn/asm/s390x-gf2m.pl | 221 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/s390x-mont.pl | 277 | ||||
| -rwxr-xr-x | src/lib/libcrypto/bn/asm/s390x.S | 678 |
3 files changed, 0 insertions, 1176 deletions
diff --git a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl deleted file mode 100644 index cd9f13eca2..0000000000 --- a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl +++ /dev/null | |||
| @@ -1,221 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
| 13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
| 14 | # the time being... gcc 4.3 appeared to generate poor code, therefore | ||
| 15 | # the effort. And indeed, the module delivers 55%-90%(*) improvement | ||
| 16 | # on heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit | ||
| 17 | # key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196. | ||
| 18 | # This is for 64-bit build. In 32-bit "highgprs" case improvement is | ||
| 19 | # even higher, for example on z990 it was measured 80%-150%. ECDSA | ||
| 20 | # sign is modest 9%-12% faster. Keep in mind that these coefficients | ||
| 21 | # are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is | ||
| 22 | # burnt in it... | ||
| 23 | # | ||
| 24 | # (*) gcc 4.1 was observed to deliver better results than gcc 4.3, | ||
| 25 | # so that improvement coefficients can vary from one specific | ||
| 26 | # setup to another. | ||
| 27 | |||
| 28 | $flavour = shift; # first argument: target flavour; anything matching 31/32 selects the 32-bit layout | ||
| 29 | |||
| 30 | if ($flavour =~ /3[12]/) { | ||
| 31 | $SIZE_T=4; # size in bytes of one register save slot | ||
| 32 | $g=""; # mnemonic suffix: stm/st/lm | ||
| 33 | } else { | ||
| 34 | $SIZE_T=8; | ||
| 35 | $g="g"; # mnemonic suffix: stmg/stg/lmg | ||
| 36 | } | ||
| 37 | |||
| 38 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} # skip arguments until one looks like a file name | ||
| 39 | if ($output) { open(STDOUT,">",$output) || die "can't open $output: $!"; } # no file name given: keep the inherited STDOUT | ||
| 40 | |||
| 41 | $stdframe=16*$SIZE_T+4*8; # 16 register-sized slots plus 4 8-byte slots; tab[] is stored at this offset from $sp | ||
| 42 | |||
| 43 | $rp="%r2"; # result pointer | ||
| 44 | $a1="%r3"; # NOTE: rebound to %r6 by the map() assignment below | ||
| 45 | $a0="%r4"; | ||
| 46 | $b1="%r5"; | ||
| 47 | $b0="%r6"; | ||
| 48 | |||
| 49 | $ra="%r14"; # return address register used by bras/br | ||
| 50 | $sp="%r15"; # stack pointer | ||
| 51 | |||
| 52 | @T=("%r0","%r1"); # pair of temporaries, swapped after every unrolled step | ||
| 53 | @i=("%r12","%r13"); # pair of table-offset registers, swapped after every unrolled step | ||
| 54 | |||
| 55 | ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11)); # %r6..%r11 for _mul_1x1; overrides the earlier $a1 | ||
| 56 | ($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8; # %r3,%r4,%r5; $a shares %r3 with $lo, $mask shares %r9 with $a8 | ||
| 57 | |||
| 58 | $code.=<<___; | ||
| 59 | .text | ||
| 60 | |||
| 61 | .type _mul_1x1,\@function | ||
| 62 | .align 16 | ||
| 63 | _mul_1x1: | ||
| 64 | lgr $a1,$a | ||
| 65 | sllg $a2,$a,1 | ||
| 66 | sllg $a4,$a,2 | ||
| 67 | sllg $a8,$a,3 | ||
| 68 | |||
| 69 | srag $lo,$a1,63 # broadcast 63rd bit | ||
| 70 | nihh $a1,0x1fff | ||
| 71 | srag @i[0],$a2,63 # broadcast 62nd bit | ||
| 72 | nihh $a2,0x3fff | ||
| 73 | srag @i[1],$a4,63 # broadcast 61st bit | ||
| 74 | nihh $a4,0x7fff | ||
| 75 | ngr $lo,$b | ||
| 76 | ngr @i[0],$b | ||
| 77 | ngr @i[1],$b | ||
| 78 | |||
| 79 | lghi @T[0],0 | ||
| 80 | lgr $a12,$a1 | ||
| 81 | stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0 | ||
| 82 | xgr $a12,$a2 | ||
| 83 | stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1 | ||
| 84 | lgr $a48,$a4 | ||
| 85 | stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2 | ||
| 86 | xgr $a48,$a8 | ||
| 87 | stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2 | ||
| 88 | xgr $a1,$a4 | ||
| 89 | |||
| 90 | stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4 | ||
| 91 | xgr $a2,$a4 | ||
| 92 | stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4 | ||
| 93 | xgr $a12,$a4 | ||
| 94 | stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4 | ||
| 95 | xgr $a1,$a48 | ||
| 96 | stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4 | ||
| 97 | xgr $a2,$a48 | ||
| 98 | |||
| 99 | stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8 | ||
| 100 | xgr $a12,$a48 | ||
| 101 | stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8 | ||
| 102 | xgr $a1,$a4 | ||
| 103 | stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8 | ||
| 104 | xgr $a2,$a4 | ||
| 105 | stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8 | ||
| 106 | |||
| 107 | xgr $a12,$a4 | ||
| 108 | stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8 | ||
| 109 | srlg $hi,$lo,1 | ||
| 110 | stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8 | ||
| 111 | sllg $lo,$lo,63 | ||
| 112 | stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8 | ||
| 113 | srlg @T[0],@i[0],2 | ||
| 114 | stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8 | ||
| 115 | |||
| 116 | lghi $mask,`0xf<<3` | ||
| 117 | sllg $a1,@i[0],62 | ||
| 118 | sllg @i[0],$b,3 | ||
| 119 | srlg @T[1],@i[1],3 | ||
| 120 | ngr @i[0],$mask | ||
| 121 | sllg $a2,@i[1],61 | ||
| 122 | srlg @i[1],$b,4-3 | ||
| 123 | xgr $hi,@T[0] | ||
| 124 | ngr @i[1],$mask | ||
| 125 | xgr $lo,$a1 | ||
| 126 | xgr $hi,@T[1] | ||
| 127 | xgr $lo,$a2 | ||
| 128 | |||
| 129 | xg $lo,$stdframe(@i[0],$sp) | ||
| 130 | srlg @i[0],$b,8-3 | ||
| 131 | ngr @i[0],$mask | ||
| 132 | ___ | ||
| 133 | for($n=1;$n<14;$n++) { # unrolled at generation time: one copy of the body per value of $n | ||
| 134 | $code.=<<___; | ||
| 135 | lg @T[1],$stdframe(@i[1],$sp) | ||
| 136 | srlg @i[1],$b,`($n+2)*4`-3 | ||
| 137 | sllg @T[0],@T[1],`$n*4` | ||
| 138 | ngr @i[1],$mask | ||
| 139 | srlg @T[1],@T[1],`64-$n*4` | ||
| 140 | xgr $lo,@T[0] | ||
| 141 | xgr $hi,@T[1] | ||
| 142 | ___ | ||
| 143 | push(@i,shift(@i)); push(@T,shift(@T)); # both arrays hold two registers, so this swaps them for the next copy | ||
| 144 | } | ||
| 145 | $code.=<<___; | ||
| 146 | lg @T[1],$stdframe(@i[1],$sp) | ||
| 147 | sllg @T[0],@T[1],`$n*4` | ||
| 148 | srlg @T[1],@T[1],`64-$n*4` | ||
| 149 | xgr $lo,@T[0] | ||
| 150 | xgr $hi,@T[1] | ||
| 151 | |||
| 152 | lg @T[0],$stdframe(@i[0],$sp) | ||
| 153 | sllg @T[1],@T[0],`($n+1)*4` | ||
| 154 | srlg @T[0],@T[0],`64-($n+1)*4` | ||
| 155 | xgr $lo,@T[1] | ||
| 156 | xgr $hi,@T[0] | ||
| 157 | |||
| 158 | br $ra | ||
| 159 | .size _mul_1x1,.-_mul_1x1 | ||
| 160 | |||
| 161 | .globl bn_GF2m_mul_2x2 | ||
| 162 | .type bn_GF2m_mul_2x2,\@function | ||
| 163 | .align 16 | ||
| 164 | bn_GF2m_mul_2x2: | ||
| 165 | stm${g} %r3,%r15,3*$SIZE_T($sp) | ||
| 166 | |||
| 167 | lghi %r1,-$stdframe-128 | ||
| 168 | la %r0,0($sp) | ||
| 169 | la $sp,0(%r1,$sp) # alloca | ||
| 170 | st${g} %r0,0($sp) # back chain | ||
| 171 | ___ | ||
| 172 | if ($SIZE_T==8) { # 64-bit flavour: three _mul_1x1 calls (a1*b1, a0*b0, (a0+a1)*(b0+b1)) combined by XOR | ||
| 173 | my @r=map("%r$_",(6..9)); # %r6..%r9: registers used to reload the four result words from $rp | ||
| 174 | $code.=<<___; | ||
| 175 | bras $ra,_mul_1x1 # a1·b1 | ||
| 176 | stmg $lo,$hi,16($rp) | ||
| 177 | |||
| 178 | lg $a,`$stdframe+128+4*$SIZE_T`($sp) | ||
| 179 | lg $b,`$stdframe+128+6*$SIZE_T`($sp) | ||
| 180 | bras $ra,_mul_1x1 # a0·b0 | ||
| 181 | stmg $lo,$hi,0($rp) | ||
| 182 | |||
| 183 | lg $a,`$stdframe+128+3*$SIZE_T`($sp) | ||
| 184 | lg $b,`$stdframe+128+5*$SIZE_T`($sp) | ||
| 185 | xg $a,`$stdframe+128+4*$SIZE_T`($sp) | ||
| 186 | xg $b,`$stdframe+128+6*$SIZE_T`($sp) | ||
| 187 | bras $ra,_mul_1x1 # (a0+a1)·(b0+b1) | ||
| 188 | lmg @r[0],@r[3],0($rp) | ||
| 189 | |||
| 190 | xgr $lo,$hi | ||
| 191 | xgr $hi,@r[1] | ||
| 192 | xgr $lo,@r[0] | ||
| 193 | xgr $hi,@r[2] | ||
| 194 | xgr $lo,@r[3] | ||
| 195 | xgr $hi,@r[3] | ||
| 196 | xgr $lo,$hi | ||
| 197 | stg $hi,16($rp) | ||
| 198 | stg $lo,8($rp) | ||
| 199 | ___ | ||
| 200 | } else { # 32-bit flavour: merge each pair of 32-bit halves into one 64-bit register and call _mul_1x1 once | ||
| 201 | $code.=<<___; | ||
| 202 | sllg %r3,%r3,32 | ||
| 203 | sllg %r5,%r5,32 | ||
| 204 | or %r3,%r4 | ||
| 205 | or %r5,%r6 | ||
| 206 | bras $ra,_mul_1x1 | ||
| 207 | rllg $lo,$lo,32 | ||
| 208 | rllg $hi,$hi,32 | ||
| 209 | stmg $lo,$hi,0($rp) | ||
| 210 | ___ | ||
| 211 | } | ||
| 212 | $code.=<<___; | ||
| 213 | lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp) | ||
| 214 | br $ra | ||
| 215 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
| 216 | .string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 217 | ___ | ||
| 218 | |||
| 219 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace every `expr` in the template with its evaluated value | ||
| 220 | print $code; | ||
| 221 | close STDOUT or die "error closing STDOUT: $!"; # buffered write errors only surface at close | ||
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl deleted file mode 100644 index 9fd64e81ee..0000000000 --- a/src/lib/libcrypto/bn/asm/s390x-mont.pl +++ /dev/null | |||
| @@ -1,277 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # April 2007. | ||
| 11 | # | ||
| 12 | # Performance improvement over vanilla C code varies from 85% to 45% | ||
| 13 | # depending on key length and benchmark. Unfortunately in this context | ||
| 14 | # these are not very impressive results [for code that utilizes "wide" | ||
| 15 | # 64x64=128-bit multiplication, which is not commonly available to C | ||
| 16 | # programmers], at least hand-coded bn_asm.c replacement is known to | ||
| 17 | # provide 30-40% better results for longest keys. Well, on a second | ||
| 18 | # thought it's not very surprising, because z-CPUs are single-issue | ||
| 19 | # and _strictly_ in-order execution, while bn_mul_mont is more or less | ||
| 20 | # dependent on CPU ability to pipe-line instructions and have several | ||
| 21 | # of them "in-flight" at the same time. I mean while other methods, | ||
| 22 | # for example Karatsuba, aim to minimize amount of multiplications at | ||
| 23 | # the cost of other operations increase, bn_mul_mont aim to neatly | ||
| 24 | # "overlap" multiplications and the other operations [and on most | ||
| 25 | # platforms even minimize the amount of the other operations, in | ||
| 26 | # particular references to memory]. But it's possible to improve this | ||
| 27 | # module performance by implementing dedicated squaring code-path and | ||
| 28 | # possibly by unrolling loops... | ||
| 29 | |||
| 30 | # January 2009. | ||
| 31 | # | ||
| 32 | # Reschedule to minimize/avoid Address Generation Interlock hazard, | ||
| 33 | # make inner loops counter-based. | ||
| 34 | |||
| 35 | # November 2010. | ||
| 36 | # | ||
| 37 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 38 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 39 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 40 | # application context. The feature is not specific to any particular | ||
| 41 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 42 | # remains z/Architecture specific. Compatibility with 32-bit BN_ULONG | ||
| 43 | # is achieved by swapping words after 64-bit loads, follow _dswap-s. | ||
| 44 | # On z990 it was measured to perform 2.6-2.2 times better than | ||
| 45 | # compiler-generated code, less for longer keys... | ||
| 46 | |||
| 47 | $flavour = shift; # first argument: target flavour; anything matching 31/32 selects the 32-bit layout | ||
| 48 | |||
| 49 | if ($flavour =~ /3[12]/) { | ||
| 50 | $SIZE_T=4; # size in bytes of one register save slot | ||
| 51 | $g=""; # mnemonic suffix: stm/st/lm/l/cl | ||
| 52 | } else { | ||
| 53 | $SIZE_T=8; | ||
| 54 | $g="g"; # mnemonic suffix: stmg/stg/lmg/lg/clg | ||
| 55 | } | ||
| 56 | |||
| 57 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} # skip arguments until one looks like a file name | ||
| 58 | if ($output) { open(STDOUT,">",$output) || die "can't open $output: $!"; } # no file name given: keep the inherited STDOUT | ||
| 59 | |||
| 60 | $stdframe=16*$SIZE_T+4*8; # 16 register-sized slots plus 4 8-byte slots; tp[] starts at this offset from $sp | ||
| 61 | |||
| 62 | $mn0="%r0"; # tp[0]*n0, the multiplier applied to np[] in each pass | ||
| 63 | $num="%r1"; # num argument, pulled from the caller's stack frame | ||
| 64 | |||
| 65 | # int bn_mul_mont( | ||
| 66 | $rp="%r2"; # BN_ULONG *rp, | ||
| 67 | $ap="%r3"; # const BN_ULONG *ap, | ||
| 68 | $bp="%r4"; # const BN_ULONG *bp, | ||
| 69 | $np="%r5"; # const BN_ULONG *np, | ||
| 70 | $n0="%r6"; # const BN_ULONG *n0, | ||
| 71 | #$num="160(%r15)" # int num); | ||
| 72 | |||
| 73 | $bi="%r2"; # zaps rp | ||
| 74 | $j="%r7"; # byte offset of the current word in the inner loops | ||
| 75 | |||
| 76 | $ahi="%r8"; # high word of ap[j]*bp[i] | ||
| 77 | $alo="%r9"; # low word of ap[j]*bp[i] | ||
| 78 | $nhi="%r10"; # high word of np[j]*m1 | ||
| 79 | $nlo="%r11"; # low word of np[j]*m1 | ||
| 80 | $AHI="%r12"; # carry word of the ap[]*bp[i] chain | ||
| 81 | $NHI="%r13"; # carry word of the np[]*m1 chain | ||
| 82 | $count="%r14"; # brct loop counter | ||
| 83 | $sp="%r15"; # stack pointer | ||
| 84 | |||
| 85 | $code.=<<___; | ||
| 86 | .text | ||
| 87 | .globl bn_mul_mont | ||
| 88 | .type bn_mul_mont,\@function | ||
| 89 | bn_mul_mont: | ||
| 90 | lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num | ||
| 91 | sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes | ||
| 92 | la $bp,0($num,$bp) | ||
| 93 | |||
| 94 | st${g} %r2,2*$SIZE_T($sp) | ||
| 95 | |||
| 96 | cghi $num,16 # | ||
| 97 | lghi %r2,0 # | ||
| 98 | blr %r14 # if($num<16) return 0; | ||
| 99 | ___ | ||
| 100 | $code.=<<___ if ($flavour =~ /3[12]/); # 31/32-bit flavours only: emit the odd-word-count rejection | ||
| 101 | tmll $num,4 | ||
| 102 | bnzr %r14 # if ($num&1) return 0; | ||
| 103 | ___ | ||
| 104 | $code.=<<___ if ($flavour !~ /3[12]/); # other flavours only: emit the upper bound check on $num (in bytes at this point) | ||
| 105 | cghi $num,96 # | ||
| 106 | bhr %r14 # if($num>96) return 0; | ||
| 107 | ___ | ||
| 108 | $code.=<<___; | ||
| 109 | stm${g} %r3,%r15,3*$SIZE_T($sp) | ||
| 110 | |||
| 111 | lghi $rp,-$stdframe-8 # leave room for carry bit | ||
| 112 | lcgr $j,$num # -$num | ||
| 113 | lgr %r0,$sp | ||
| 114 | la $rp,0($rp,$sp) | ||
| 115 | la $sp,0($j,$rp) # alloca | ||
| 116 | st${g} %r0,0($sp) # back chain | ||
| 117 | |||
| 118 | sra $num,3 # restore $num | ||
| 119 | la $bp,0($j,$bp) # restore $bp | ||
| 120 | ahi $num,-1 # adjust $num for inner loop | ||
| 121 | lg $n0,0($n0) # pull n0 | ||
| 122 | _dswap $n0 | ||
| 123 | |||
| 124 | lg $bi,0($bp) | ||
| 125 | _dswap $bi | ||
| 126 | lg $alo,0($ap) | ||
| 127 | _dswap $alo | ||
| 128 | mlgr $ahi,$bi # ap[0]*bp[0] | ||
| 129 | lgr $AHI,$ahi | ||
| 130 | |||
| 131 | lgr $mn0,$alo # "tp[0]"*n0 | ||
| 132 | msgr $mn0,$n0 | ||
| 133 | |||
| 134 | lg $nlo,0($np) # | ||
| 135 | _dswap $nlo | ||
| 136 | mlgr $nhi,$mn0 # np[0]*m1 | ||
| 137 | algr $nlo,$alo # +="tp[0]" | ||
| 138 | lghi $NHI,0 | ||
| 139 | alcgr $NHI,$nhi | ||
| 140 | |||
| 141 | la $j,8(%r0) # j=1 | ||
| 142 | lr $count,$num | ||
| 143 | |||
| 144 | .align 16 | ||
| 145 | .L1st: | ||
| 146 | lg $alo,0($j,$ap) | ||
| 147 | _dswap $alo | ||
| 148 | mlgr $ahi,$bi # ap[j]*bp[0] | ||
| 149 | algr $alo,$AHI | ||
| 150 | lghi $AHI,0 | ||
| 151 | alcgr $AHI,$ahi | ||
| 152 | |||
| 153 | lg $nlo,0($j,$np) | ||
| 154 | _dswap $nlo | ||
| 155 | mlgr $nhi,$mn0 # np[j]*m1 | ||
| 156 | algr $nlo,$NHI | ||
| 157 | lghi $NHI,0 | ||
| 158 | alcgr $nhi,$NHI # +="tp[j]" | ||
| 159 | algr $nlo,$alo | ||
| 160 | alcgr $NHI,$nhi | ||
| 161 | |||
| 162 | stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= | ||
| 163 | la $j,8($j) # j++ | ||
| 164 | brct $count,.L1st | ||
| 165 | |||
| 166 | algr $NHI,$AHI | ||
| 167 | lghi $AHI,0 | ||
| 168 | alcgr $AHI,$AHI # upmost overflow bit | ||
| 169 | stg $NHI,$stdframe-8($j,$sp) | ||
| 170 | stg $AHI,$stdframe($j,$sp) | ||
| 171 | la $bp,8($bp) # bp++ | ||
| 172 | |||
| 173 | .Louter: | ||
| 174 | lg $bi,0($bp) # bp[i] | ||
| 175 | _dswap $bi | ||
| 176 | lg $alo,0($ap) | ||
| 177 | _dswap $alo | ||
| 178 | mlgr $ahi,$bi # ap[0]*bp[i] | ||
| 179 | alg $alo,$stdframe($sp) # +=tp[0] | ||
| 180 | lghi $AHI,0 | ||
| 181 | alcgr $AHI,$ahi | ||
| 182 | |||
| 183 | lgr $mn0,$alo | ||
| 184 | msgr $mn0,$n0 # tp[0]*n0 | ||
| 185 | |||
| 186 | lg $nlo,0($np) # np[0] | ||
| 187 | _dswap $nlo | ||
| 188 | mlgr $nhi,$mn0 # np[0]*m1 | ||
| 189 | algr $nlo,$alo # +="tp[0]" | ||
| 190 | lghi $NHI,0 | ||
| 191 | alcgr $NHI,$nhi | ||
| 192 | |||
| 193 | la $j,8(%r0) # j=1 | ||
| 194 | lr $count,$num | ||
| 195 | |||
| 196 | .align 16 | ||
| 197 | .Linner: | ||
| 198 | lg $alo,0($j,$ap) | ||
| 199 | _dswap $alo | ||
| 200 | mlgr $ahi,$bi # ap[j]*bp[i] | ||
| 201 | algr $alo,$AHI | ||
| 202 | lghi $AHI,0 | ||
| 203 | alcgr $ahi,$AHI | ||
| 204 | alg $alo,$stdframe($j,$sp)# +=tp[j] | ||
| 205 | alcgr $AHI,$ahi | ||
| 206 | |||
| 207 | lg $nlo,0($j,$np) | ||
| 208 | _dswap $nlo | ||
| 209 | mlgr $nhi,$mn0 # np[j]*m1 | ||
| 210 | algr $nlo,$NHI | ||
| 211 | lghi $NHI,0 | ||
| 212 | alcgr $nhi,$NHI | ||
| 213 | algr $nlo,$alo # +="tp[j]" | ||
| 214 | alcgr $NHI,$nhi | ||
| 215 | |||
| 216 | stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= | ||
| 217 | la $j,8($j) # j++ | ||
| 218 | brct $count,.Linner | ||
| 219 | |||
| 220 | algr $NHI,$AHI | ||
| 221 | lghi $AHI,0 | ||
| 222 | alcgr $AHI,$AHI | ||
| 223 | alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit | ||
| 224 | lghi $ahi,0 | ||
| 225 | alcgr $AHI,$ahi # new upmost overflow bit | ||
| 226 | stg $NHI,$stdframe-8($j,$sp) | ||
| 227 | stg $AHI,$stdframe($j,$sp) | ||
| 228 | |||
| 229 | la $bp,8($bp) # bp++ | ||
| 230 | cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num] | ||
| 231 | jne .Louter | ||
| 232 | |||
| 233 | l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp | ||
| 234 | la $ap,$stdframe($sp) | ||
| 235 | ahi $num,1 # restore $num, incidentally clears "borrow" | ||
| 236 | |||
| 237 | la $j,0(%r0) | ||
| 238 | lr $count,$num | ||
| 239 | .Lsub: lg $alo,0($j,$ap) | ||
| 240 | lg $nlo,0($j,$np) | ||
| 241 | _dswap $nlo | ||
| 242 | slbgr $alo,$nlo | ||
| 243 | stg $alo,0($j,$rp) | ||
| 244 | la $j,8($j) | ||
| 245 | brct $count,.Lsub | ||
| 246 | lghi $ahi,0 | ||
| 247 | slbgr $AHI,$ahi # handle upmost carry | ||
| 248 | |||
| 249 | ngr $ap,$AHI | ||
| 250 | lghi $np,-1 | ||
| 251 | xgr $np,$AHI | ||
| 252 | ngr $np,$rp | ||
| 253 | ogr $ap,$np # ap=borrow?tp:rp | ||
| 254 | |||
| 255 | la $j,0(%r0) | ||
| 256 | lgr $count,$num | ||
| 257 | .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh | ||
| 258 | _dswap $alo | ||
| 259 | stg $j,$stdframe($j,$sp) # zap tp | ||
| 260 | stg $alo,0($j,$rp) | ||
| 261 | la $j,8($j) | ||
| 262 | brct $count,.Lcopy | ||
| 263 | |||
| 264 | la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp) | ||
| 265 | lm${g} %r6,%r15,0(%r1) | ||
| 266 | lghi %r2,1 # signal "processed" | ||
| 267 | br %r14 | ||
| 268 | .size bn_mul_mont,.-bn_mul_mont | ||
| 269 | .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 270 | ___ | ||
| 271 | |||
| 272 | foreach (split("\n",$code)) { # post-process and print the template one line at a time | ||
| 273 | s/\`([^\`]*)\`/eval $1/ge; # replace every `expr` with its evaluated value | ||
| 274 | s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e; # _dswap becomes a 32-bit rotate when $SIZE_T==4, otherwise it is removed | ||
| 275 | print $_,"\n"; | ||
| 276 | } | ||
| 277 | close STDOUT or die "error closing STDOUT: $!"; # buffered write errors only surface at close | ||
diff --git a/src/lib/libcrypto/bn/asm/s390x.S b/src/lib/libcrypto/bn/asm/s390x.S deleted file mode 100755 index 43fcb79bc0..0000000000 --- a/src/lib/libcrypto/bn/asm/s390x.S +++ /dev/null | |||
| @@ -1,678 +0,0 @@ | |||
| 1 | .ident "s390x.S, version 1.1" | ||
| 2 | // ==================================================================== | ||
| 3 | // Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 4 | // project. | ||
| 5 | // | ||
| 6 | // Rights for redistribution and usage in source and binary forms are | ||
| 7 | // granted according to the OpenSSL license. Warranty of any kind is | ||
| 8 | // disclaimed. | ||
| 9 | // ==================================================================== | ||
| 10 | |||
| 11 | .text | ||
| 12 | |||
| 13 | #define zero %r0 | ||
| 14 | |||
| 15 | // BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); -- rp[i]+=ap[i]*w for i<len, returns the final carry word | ||
| 16 | .globl bn_mul_add_words | ||
| 17 | .type bn_mul_add_words,@function | ||
| 18 | .align 4 | ||
| 19 | bn_mul_add_words: | ||
| 20 | lghi zero,0 // zero = 0 | ||
| 21 | la %r1,0(%r2) // put rp aside | ||
| 22 | lghi %r2,0 // i=0; | ||
| 23 | ltgfr %r4,%r4 // sign-extend len to 64 bits and test it | ||
| 24 | bler %r14 // if (len<=0) return 0; | ||
| 25 | |||
| 26 | stmg %r6,%r10,48(%r15) // save %r6-%r10; reloaded from the same slots before returning | ||
| 27 | lghi %r10,3 // mask for len%4 | ||
| 28 | lghi %r8,0 // carry = 0 | ||
| 29 | nr %r10,%r4 // len%4 | ||
| 30 | sra %r4,2 // cnt=len/4 | ||
| 31 | jz .Loop1_madd // carry is incidentally cleared if branch taken | ||
| 32 | algr zero,zero // clear carry | ||
| 33 | |||
| 34 | .Loop4_madd: | ||
| 35 | lg %r7,0(%r2,%r3) // ap[i] | ||
| 36 | mlgr %r6,%r5 // *=w | ||
| 37 | alcgr %r7,%r8 // +=carry | ||
| 38 | alcgr %r6,zero | ||
| 39 | alg %r7,0(%r2,%r1) // +=rp[i] | ||
| 40 | stg %r7,0(%r2,%r1) // rp[i]= | ||
| 41 | |||
| 42 | lg %r9,8(%r2,%r3) // ap[i+1]; product goes to %r8:%r9, previous high word %r6 is the carry in | ||
| 43 | mlgr %r8,%r5 | ||
| 44 | alcgr %r9,%r6 | ||
| 45 | alcgr %r8,zero | ||
| 46 | alg %r9,8(%r2,%r1) | ||
| 47 | stg %r9,8(%r2,%r1) | ||
| 48 | |||
| 49 | lg %r7,16(%r2,%r3) // ap[i+2] | ||
| 50 | mlgr %r6,%r5 | ||
| 51 | alcgr %r7,%r8 | ||
| 52 | alcgr %r6,zero | ||
| 53 | alg %r7,16(%r2,%r1) | ||
| 54 | stg %r7,16(%r2,%r1) | ||
| 55 | |||
| 56 | lg %r9,24(%r2,%r3) // ap[i+3]; leaves the running carry word in %r8 | ||
| 57 | mlgr %r8,%r5 | ||
| 58 | alcgr %r9,%r6 | ||
| 59 | alcgr %r8,zero | ||
| 60 | alg %r9,24(%r2,%r1) | ||
| 61 | stg %r9,24(%r2,%r1) | ||
| 62 | |||
| 63 | la %r2,32(%r2) // i+=4 | ||
| 64 | brct %r4,.Loop4_madd | ||
| 65 | |||
| 66 | la %r10,1(%r10) // see if len%4 is zero ... | ||
| 67 | brct %r10,.Loop1_madd // without touching condition code:-) | ||
| 68 | |||
| 69 | .Lend_madd: | ||
| 70 | alcgr %r8,zero // collect carry bit | ||
| 71 | lgr %r2,%r8 // return value = carry word | ||
| 72 | lmg %r6,%r10,48(%r15) // restore %r6-%r10 | ||
| 73 | br %r14 | ||
| 74 | |||
| 75 | .Loop1_madd: | ||
| 76 | lg %r7,0(%r2,%r3) // ap[i] | ||
| 77 | mlgr %r6,%r5 // *=w | ||
| 78 | alcgr %r7,%r8 // +=carry | ||
| 79 | alcgr %r6,zero | ||
| 80 | alg %r7,0(%r2,%r1) // +=rp[i] | ||
| 81 | stg %r7,0(%r2,%r1) // rp[i]= | ||
| 82 | |||
| 83 | lgr %r8,%r6 // carry = high word of the product | ||
| 84 | la %r2,8(%r2) // i++ | ||
| 85 | brct %r10,.Loop1_madd | ||
| 86 | |||
| 87 | j .Lend_madd | ||
| 88 | .size bn_mul_add_words,.-bn_mul_add_words | ||
| 89 | |||
| 90 | // BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); -- rp[i]=ap[i]*w+carry for i<len, returns the final carry word | ||
| 91 | .globl bn_mul_words | ||
| 92 | .type bn_mul_words,@function | ||
| 93 | .align 4 | ||
| 94 | bn_mul_words: | ||
| 95 | lghi zero,0 // zero = 0 | ||
| 96 | la %r1,0(%r2) // put rp aside | ||
| 97 | lghi %r2,0 // i=0; | ||
| 98 | ltgfr %r4,%r4 // sign-extend len to 64 bits and test it | ||
| 99 | bler %r14 // if (len<=0) return 0; | ||
| 100 | |||
| 101 | stmg %r6,%r10,48(%r15) // save %r6-%r10; reloaded from the same slots before returning | ||
| 102 | lghi %r10,3 // mask for len%4 | ||
| 103 | lghi %r8,0 // carry = 0 | ||
| 104 | nr %r10,%r4 // len%4 | ||
| 105 | sra %r4,2 // cnt=len/4 | ||
| 106 | jz .Loop1_mul // carry is incidentally cleared if branch taken | ||
| 107 | algr zero,zero // clear carry | ||
| 108 | |||
| 109 | .Loop4_mul: | ||
| 110 | lg %r7,0(%r2,%r3) // ap[i] | ||
| 111 | mlgr %r6,%r5 // *=w | ||
| 112 | alcgr %r7,%r8 // +=carry | ||
| 113 | stg %r7,0(%r2,%r1) // rp[i]= | ||
| 114 | |||
| 115 | lg %r9,8(%r2,%r3) // ap[i+1]; product goes to %r8:%r9, previous high word %r6 is the carry in | ||
| 116 | mlgr %r8,%r5 | ||
| 117 | alcgr %r9,%r6 | ||
| 118 | stg %r9,8(%r2,%r1) | ||
| 119 | |||
| 120 | lg %r7,16(%r2,%r3) // ap[i+2] | ||
| 121 | mlgr %r6,%r5 | ||
| 122 | alcgr %r7,%r8 | ||
| 123 | stg %r7,16(%r2,%r1) | ||
| 124 | |||
| 125 | lg %r9,24(%r2,%r3) // ap[i+3]; leaves the running carry word in %r8 | ||
| 126 | mlgr %r8,%r5 | ||
| 127 | alcgr %r9,%r6 | ||
| 128 | stg %r9,24(%r2,%r1) | ||
| 129 | |||
| 130 | la %r2,32(%r2) // i+=4 | ||
| 131 | brct %r4,.Loop4_mul | ||
| 132 | |||
| 133 | la %r10,1(%r10) // see if len%4 is zero ... | ||
| 134 | brct %r10,.Loop1_mul // without touching condition code:-) | ||
| 135 | |||
| 136 | .Lend_mul: | ||
| 137 | alcgr %r8,zero // collect carry bit | ||
| 138 | lgr %r2,%r8 // return value = carry word | ||
| 139 | lmg %r6,%r10,48(%r15) // restore %r6-%r10 | ||
| 140 | br %r14 | ||
| 141 | |||
| 142 | .Loop1_mul: | ||
| 143 | lg %r7,0(%r2,%r3) // ap[i] | ||
| 144 | mlgr %r6,%r5 // *=w | ||
| 145 | alcgr %r7,%r8 // +=carry | ||
| 146 | stg %r7,0(%r2,%r1) // rp[i]= | ||
| 147 | |||
| 148 | lgr %r8,%r6 // carry = high word of the product | ||
| 149 | la %r2,8(%r2) // i++ | ||
| 150 | brct %r10,.Loop1_mul | ||
| 151 | |||
| 152 | j .Lend_mul | ||
| 153 | .size bn_mul_words,.-bn_mul_words | ||
| 154 | |||
| 155 | // void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4) -- stores the 128-bit square of each word at (%r3) as a low/high pair at (%r2) | ||
| 156 | .globl bn_sqr_words | ||
| 157 | .type bn_sqr_words,@function | ||
| 158 | .align 4 | ||
| 159 | bn_sqr_words: | ||
| 160 | ltgfr %r4,%r4 // sign-extend len to 64 bits and test it | ||
| 161 | bler %r14 // if (len<=0) return; | ||
| 162 | |||
| 163 | stmg %r6,%r7,48(%r15) // save %r6-%r7; reloaded from the same slots before returning | ||
| 164 | srag %r1,%r4,2 // cnt=len/4 | ||
| 165 | jz .Loop1_sqr | ||
| 166 | |||
| 167 | .Loop4_sqr: | ||
| 168 | lg %r7,0(%r3) // load source word | ||
| 169 | mlgr %r6,%r7 // %r6:%r7 = word*word | ||
| 170 | stg %r7,0(%r2) // low half | ||
| 171 | stg %r6,8(%r2) // high half | ||
| 172 | |||
| 173 | lg %r7,8(%r3) | ||
| 174 | mlgr %r6,%r7 | ||
| 175 | stg %r7,16(%r2) | ||
| 176 | stg %r6,24(%r2) | ||
| 177 | |||
| 178 | lg %r7,16(%r3) | ||
| 179 | mlgr %r6,%r7 | ||
| 180 | stg %r7,32(%r2) | ||
| 181 | stg %r6,40(%r2) | ||
| 182 | |||
| 183 | lg %r7,24(%r3) | ||
| 184 | mlgr %r6,%r7 | ||
| 185 | stg %r7,48(%r2) | ||
| 186 | stg %r6,56(%r2) | ||
| 187 | |||
| 188 | la %r3,32(%r3) // advance source by 4 words | ||
| 189 | la %r2,64(%r2) // advance destination by 8 words | ||
| 190 | brct %r1,.Loop4_sqr | ||
| 191 | |||
| 192 | lghi %r1,3 | ||
| 193 | nr %r4,%r1 // cnt=len%4 | ||
| 194 | jz .Lend_sqr | ||
| 195 | |||
| 196 | .Loop1_sqr: | ||
| 197 | lg %r7,0(%r3) | ||
| 198 | mlgr %r6,%r7 | ||
| 199 | stg %r7,0(%r2) | ||
| 200 | stg %r6,8(%r2) | ||
| 201 | |||
| 202 | la %r3,8(%r3) // advance source by 1 word | ||
| 203 | la %r2,16(%r2) // advance destination by 2 words | ||
| 204 | brct %r4,.Loop1_sqr | ||
| 205 | |||
| 206 | .Lend_sqr: | ||
| 207 | lmg %r6,%r7,48(%r15) // restore %r6-%r7 | ||
| 208 | br %r14 | ||
| 209 | .size bn_sqr_words,.-bn_sqr_words | ||
| 210 | |||
| 211 | // BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d); -- h arrives in %r2, l in %r3, d in %r4 | ||
| 212 | .globl bn_div_words | ||
| 213 | .type bn_div_words,@function | ||
| 214 | .align 4 | ||
| 215 | bn_div_words: | ||
| 216 | dlgr %r2,%r4 // divide 128-bit %r2:%r3 by %r4: remainder -> %r2, quotient -> %r3 | ||
| 217 | lgr %r2,%r3 // return the quotient | ||
| 218 | br %r14 | ||
| 219 | .size bn_div_words,.-bn_div_words | ||
| 220 | |||
| 221 | // BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); -- word-wise (%r3)+(%r4) with carry stored to (%r2), returns the final carry | ||
| 222 | .globl bn_add_words | ||
| 223 | .type bn_add_words,@function | ||
| 224 | .align 4 | ||
| 225 | bn_add_words: | ||
| 226 | la %r1,0(%r2) // put rp aside | ||
| 227 | lghi %r2,0 // i=0 | ||
| 228 | ltgfr %r5,%r5 // sign-extend len to 64 bits and test it | ||
| 229 | bler %r14 // if (len<=0) return 0; | ||
| 230 | |||
| 231 | stg %r6,48(%r15) // save %r6; reloaded from the same slot before returning | ||
| 232 | lghi %r6,3 // mask for len%4 | ||
| 233 | nr %r6,%r5 // len%4 | ||
| 234 | sra %r5,2 // len/4, use sra because it sets condition code | ||
| 235 | jz .Loop1_add // carry is incidentally cleared if branch taken | ||
| 236 | algr %r2,%r2 // clear carry | ||
| 237 | |||
| 238 | .Loop4_add: | ||
| 239 | lg %r0,0(%r2,%r3) // four add-with-carry steps; the carry travels in the condition code | ||
| 240 | alcg %r0,0(%r2,%r4) | ||
| 241 | stg %r0,0(%r2,%r1) | ||
| 242 | lg %r0,8(%r2,%r3) | ||
| 243 | alcg %r0,8(%r2,%r4) | ||
| 244 | stg %r0,8(%r2,%r1) | ||
| 245 | lg %r0,16(%r2,%r3) | ||
| 246 | alcg %r0,16(%r2,%r4) | ||
| 247 | stg %r0,16(%r2,%r1) | ||
| 248 | lg %r0,24(%r2,%r3) | ||
| 249 | alcg %r0,24(%r2,%r4) | ||
| 250 | stg %r0,24(%r2,%r1) | ||
| 251 | |||
| 252 | la %r2,32(%r2) // i+=4 | ||
| 253 | brct %r5,.Loop4_add | ||
| 254 | |||
| 255 | la %r6,1(%r6) // see if len%4 is zero ... | ||
| 256 | brct %r6,.Loop1_add // without touching condition code:-) | ||
| 257 | |||
| 258 | .Lexit_add: | ||
| 259 | lghi %r2,0 | ||
| 260 | alcgr %r2,%r2 // %r2 = 0+0+carry: return the carry bit | ||
| 261 | lg %r6,48(%r15) // restore %r6 | ||
| 262 | br %r14 | ||
| 263 | |||
| 264 | .Loop1_add: | ||
| 265 | lg %r0,0(%r2,%r3) | ||
| 266 | alcg %r0,0(%r2,%r4) | ||
| 267 | stg %r0,0(%r2,%r1) | ||
| 268 | |||
| 269 | la %r2,8(%r2) // i++ | ||
| 270 | brct %r6,.Loop1_add | ||
| 271 | |||
| 272 | j .Lexit_add | ||
| 273 | .size bn_add_words,.-bn_add_words | ||
| 274 | |||
| 275 | // BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); -- word-wise (%r3)-(%r4) with borrow stored to (%r2), returns the final borrow | ||
| 276 | .globl bn_sub_words | ||
| 277 | .type bn_sub_words,@function | ||
| 278 | .align 4 | ||
| 279 | bn_sub_words: | ||
| 280 | la %r1,0(%r2) // put rp aside | ||
| 281 | lghi %r2,0 // i=0 | ||
| 282 | ltgfr %r5,%r5 // sign-extend len to 64 bits and test it | ||
| 283 | bler %r14 // if (len<=0) return 0; | ||
| 284 | |||
| 285 | stg %r6,48(%r15) // save %r6; reloaded from the same slot before returning | ||
| 286 | lghi %r6,3 // mask for len%4 | ||
| 287 | nr %r6,%r5 // len%4 | ||
| 288 | sra %r5,2 // len/4, use sra because it sets condition code | ||
| 289 | jnz .Loop4_sub // borrow is incidentally cleared if branch taken | ||
| 290 | slgr %r2,%r2 // clear borrow | ||
| 291 | |||
| 292 | .Loop1_sub: | ||
| 293 | lg %r0,0(%r2,%r3) | ||
| 294 | slbg %r0,0(%r2,%r4) | ||
| 295 | stg %r0,0(%r2,%r1) | ||
| 296 | |||
| 297 | la %r2,8(%r2) // i++ | ||
| 298 | brct %r6,.Loop1_sub | ||
| 299 | j .Lexit_sub | ||
| 300 | |||
| 301 | .Loop4_sub: | ||
| 302 | lg %r0,0(%r2,%r3) // four subtract-with-borrow steps; the borrow travels in the condition code | ||
| 303 | slbg %r0,0(%r2,%r4) | ||
| 304 | stg %r0,0(%r2,%r1) | ||
| 305 | lg %r0,8(%r2,%r3) | ||
| 306 | slbg %r0,8(%r2,%r4) | ||
| 307 | stg %r0,8(%r2,%r1) | ||
| 308 | lg %r0,16(%r2,%r3) | ||
| 309 | slbg %r0,16(%r2,%r4) | ||
| 310 | stg %r0,16(%r2,%r1) | ||
| 311 | lg %r0,24(%r2,%r3) | ||
| 312 | slbg %r0,24(%r2,%r4) | ||
| 313 | stg %r0,24(%r2,%r1) | ||
| 314 | |||
| 315 | la %r2,32(%r2) // i+=4 | ||
| 316 | brct %r5,.Loop4_sub | ||
| 317 | |||
| 318 | la %r6,1(%r6) // see if len%4 is zero ... | ||
| 319 | brct %r6,.Loop1_sub // without touching condition code:-) | ||
| 320 | |||
| 321 | .Lexit_sub: | ||
| 322 | lghi %r2,0 | ||
| 323 | slbgr %r2,%r2 // %r2 = 0-0-borrow, i.e. 0 or -1 | ||
| 324 | lcgr %r2,%r2 // negate so the return value is 0 or 1 | ||
| 325 | lg %r6,48(%r15) // restore %r6 | ||
| 326 | br %r14 | ||
| 327 | .size bn_sub_words,.-bn_sub_words | ||
| 328 | |||
| 329 | #define c1 %r1 | ||
| 330 | #define c2 %r5 | ||
| 331 | #define c3 %r8 | ||
| 332 | |||
| 333 | #define mul_add_c(ai,bi,c1,c2,c3) \ | ||
| 334 | lg %r7,ai*8(%r3); \ | ||
| 335 | mlg %r6,bi*8(%r4); \ | ||
| 336 | algr c1,%r7; \ | ||
| 337 | alcgr c2,%r6; \ | ||
| 338 | alcgr c3,zero | ||
| 339 | |||
| 340 | // void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); -- 8x8-word product computed one output column at a time; mul_add_c adds a[ai]*b[bi] into the three-word accumulator | ||
| 341 | .globl bn_mul_comba8 | ||
| 342 | .type bn_mul_comba8,@function | ||
| 343 | .align 4 | ||
| 344 | bn_mul_comba8: | ||
| 345 | stmg %r6,%r8,48(%r15) // save %r6-%r8; reloaded from the same slots before returning | ||
| 346 | |||
| 347 | lghi c1,0 | ||
| 348 | lghi c2,0 | ||
| 349 | lghi c3,0 | ||
| 350 | lghi zero,0 // constant 0 used by mul_add_c to fold the carry into the top accumulator word | ||
| 351 | |||
| 352 | mul_add_c(0,0,c1,c2,c3); | ||
| 353 | stg c1,0*8(%r2) // r[0] | ||
| 354 | lghi c1,0 | ||
| 355 | |||
| 356 | mul_add_c(0,1,c2,c3,c1); | ||
| 357 | mul_add_c(1,0,c2,c3,c1); | ||
| 358 | stg c2,1*8(%r2) // r[1] | ||
| 359 | lghi c2,0 | ||
| 360 | |||
| 361 | mul_add_c(2,0,c3,c1,c2); | ||
| 362 | mul_add_c(1,1,c3,c1,c2); | ||
| 363 | mul_add_c(0,2,c3,c1,c2); | ||
| 364 | stg c3,2*8(%r2) // r[2] | ||
| 365 | lghi c3,0 | ||
| 366 | |||
| 367 | mul_add_c(0,3,c1,c2,c3); | ||
| 368 | mul_add_c(1,2,c1,c2,c3); | ||
| 369 | mul_add_c(2,1,c1,c2,c3); | ||
| 370 | mul_add_c(3,0,c1,c2,c3); | ||
| 371 | stg c1,3*8(%r2) // r[3] | ||
| 372 | lghi c1,0 | ||
| 373 | |||
| 374 | mul_add_c(4,0,c2,c3,c1); | ||
| 375 | mul_add_c(3,1,c2,c3,c1); | ||
| 376 | mul_add_c(2,2,c2,c3,c1); | ||
| 377 | mul_add_c(1,3,c2,c3,c1); | ||
| 378 | mul_add_c(0,4,c2,c3,c1); | ||
| 379 | stg c2,4*8(%r2) // r[4] | ||
| 380 | lghi c2,0 | ||
| 381 | |||
| 382 | mul_add_c(0,5,c3,c1,c2); | ||
| 383 | mul_add_c(1,4,c3,c1,c2); | ||
| 384 | mul_add_c(2,3,c3,c1,c2); | ||
| 385 | mul_add_c(3,2,c3,c1,c2); | ||
| 386 | mul_add_c(4,1,c3,c1,c2); | ||
| 387 | mul_add_c(5,0,c3,c1,c2); | ||
| 388 | stg c3,5*8(%r2) // r[5] | ||
| 389 | lghi c3,0 | ||
| 390 | |||
| 391 | mul_add_c(6,0,c1,c2,c3); | ||
| 392 | mul_add_c(5,1,c1,c2,c3); | ||
| 393 | mul_add_c(4,2,c1,c2,c3); | ||
| 394 | mul_add_c(3,3,c1,c2,c3); | ||
| 395 | mul_add_c(2,4,c1,c2,c3); | ||
| 396 | mul_add_c(1,5,c1,c2,c3); | ||
| 397 | mul_add_c(0,6,c1,c2,c3); | ||
| 398 | stg c1,6*8(%r2) // r[6] | ||
| 399 | lghi c1,0 | ||
| 400 | |||
| 401 | mul_add_c(0,7,c2,c3,c1); | ||
| 402 | mul_add_c(1,6,c2,c3,c1); | ||
| 403 | mul_add_c(2,5,c2,c3,c1); | ||
| 404 | mul_add_c(3,4,c2,c3,c1); | ||
| 405 | mul_add_c(4,3,c2,c3,c1); | ||
| 406 | mul_add_c(5,2,c2,c3,c1); | ||
| 407 | mul_add_c(6,1,c2,c3,c1); | ||
| 408 | mul_add_c(7,0,c2,c3,c1); | ||
| 409 | stg c2,7*8(%r2) // r[7] | ||
| 410 | lghi c2,0 | ||
| 411 | |||
| 412 | mul_add_c(7,1,c3,c1,c2); | ||
| 413 | mul_add_c(6,2,c3,c1,c2); | ||
| 414 | mul_add_c(5,3,c3,c1,c2); | ||
| 415 | mul_add_c(4,4,c3,c1,c2); | ||
| 416 | mul_add_c(3,5,c3,c1,c2); | ||
| 417 | mul_add_c(2,6,c3,c1,c2); | ||
| 418 | mul_add_c(1,7,c3,c1,c2); | ||
| 419 | stg c3,8*8(%r2) // r[8] | ||
| 420 | lghi c3,0 | ||
| 421 | |||
| 422 | mul_add_c(2,7,c1,c2,c3); | ||
| 423 | mul_add_c(3,6,c1,c2,c3); | ||
| 424 | mul_add_c(4,5,c1,c2,c3); | ||
| 425 | mul_add_c(5,4,c1,c2,c3); | ||
| 426 | mul_add_c(6,3,c1,c2,c3); | ||
| 427 | mul_add_c(7,2,c1,c2,c3); | ||
| 428 | stg c1,9*8(%r2) // r[9] | ||
| 429 | lghi c1,0 | ||
| 430 | |||
| 431 | mul_add_c(7,3,c2,c3,c1); | ||
| 432 | mul_add_c(6,4,c2,c3,c1); | ||
| 433 | mul_add_c(5,5,c2,c3,c1); | ||
| 434 | mul_add_c(4,6,c2,c3,c1); | ||
| 435 | mul_add_c(3,7,c2,c3,c1); | ||
| 436 | stg c2,10*8(%r2) // r[10] | ||
| 437 | lghi c2,0 | ||
| 438 | |||
| 439 | mul_add_c(4,7,c3,c1,c2); | ||
| 440 | mul_add_c(5,6,c3,c1,c2); | ||
| 441 | mul_add_c(6,5,c3,c1,c2); | ||
| 442 | mul_add_c(7,4,c3,c1,c2); | ||
| 443 | stg c3,11*8(%r2) // r[11] | ||
| 444 | lghi c3,0 | ||
| 445 | |||
| 446 | mul_add_c(7,5,c1,c2,c3); | ||
| 447 | mul_add_c(6,6,c1,c2,c3); | ||
| 448 | mul_add_c(5,7,c1,c2,c3); | ||
| 449 | stg c1,12*8(%r2) // r[12] | ||
| 450 | lghi c1,0 | ||
| 451 | |||
| 452 | |||
| 453 | mul_add_c(6,7,c2,c3,c1); | ||
| 454 | mul_add_c(7,6,c2,c3,c1); | ||
| 455 | stg c2,13*8(%r2) // r[13] | ||
| 456 | lghi c2,0 | ||
| 457 | |||
| 458 | mul_add_c(7,7,c3,c1,c2); | ||
| 459 | stg c3,14*8(%r2) // r[14] | ||
| 460 | stg c1,15*8(%r2) // r[15] | ||
| 461 | |||
| 462 | lmg %r6,%r8,48(%r15) // restore %r6-%r8 | ||
| 463 | br %r14 | ||
| 464 | .size bn_mul_comba8,.-bn_mul_comba8 | ||
| 465 | |||
| 466 | // void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); -- fully unrolled 4x4-word multiply of the operand arrays passed in %r3 and %r4; the 8 result words are stored at (%r2), one column (all mul_add_c(i,j) terms with i+j == k) per result word k. | ||
| 467 | .globl bn_mul_comba4 | ||
| 468 | .type bn_mul_comba4,@function | ||
| 469 | .align 4 | ||
| 470 | bn_mul_comba4: | ||
| 471 | stmg %r6,%r8,48(%r15) // spill %r6-%r8 at 48(%r15); reloaded by the lmg before returning | ||
| 472 | |||
| 473 | lghi c1,0 // clear the three accumulator words (c1..c3 and zero are register aliases defined earlier in the file) | ||
| 474 | lghi c2,0 | ||
| 475 | lghi c3,0 | ||
| 476 | lghi zero,0 // constant 0 operand | ||
| 477 | |||
| 478 | mul_add_c(0,0,c1,c2,c3); // column 0. mul_add_c is defined earlier in the file; presumably it adds one 128-bit word product into the accumulator named by its last three arguments, as sqr_add_c does for squares -- confirm there | ||
| 479 | stg c1,0*8(%r2) // r[0]: must go to the result array at (%r2) like every other column; storing through %r3 would overwrite word 0 of an input operand and leave r[0] unwritten | ||
| 480 | lghi c1,0 // c1 is free again: it becomes the top word of the next column (roles rotate c1->c2->c3) | ||
| 481 | |||
| 482 | mul_add_c(0,1,c2,c3,c1); // column 1 | ||
| 483 | mul_add_c(1,0,c2,c3,c1); | ||
| 484 | stg c2,1*8(%r2) // r[1] | ||
| 485 | lghi c2,0 | ||
| 486 | |||
| 487 | mul_add_c(2,0,c3,c1,c2); // column 2 | ||
| 488 | mul_add_c(1,1,c3,c1,c2); | ||
| 489 | mul_add_c(0,2,c3,c1,c2); | ||
| 490 | stg c3,2*8(%r2) // r[2] | ||
| 491 | lghi c3,0 | ||
| 492 | |||
| 493 | mul_add_c(0,3,c1,c2,c3); // column 3 | ||
| 494 | mul_add_c(1,2,c1,c2,c3); | ||
| 495 | mul_add_c(2,1,c1,c2,c3); | ||
| 496 | mul_add_c(3,0,c1,c2,c3); | ||
| 497 | stg c1,3*8(%r2) // r[3] | ||
| 498 | lghi c1,0 | ||
| 499 | |||
| 500 | mul_add_c(3,1,c2,c3,c1); // column 4 | ||
| 501 | mul_add_c(2,2,c2,c3,c1); | ||
| 502 | mul_add_c(1,3,c2,c3,c1); | ||
| 503 | stg c2,4*8(%r2) // r[4] | ||
| 504 | lghi c2,0 | ||
| 505 | |||
| 506 | mul_add_c(2,3,c3,c1,c2); // column 5 | ||
| 507 | mul_add_c(3,2,c3,c1,c2); | ||
| 508 | stg c3,5*8(%r2) // r[5] | ||
| 509 | lghi c3,0 | ||
| 510 | |||
| 511 | mul_add_c(3,3,c1,c2,c3); // column 6 (last) | ||
| 512 | stg c1,6*8(%r2) // r[6] | ||
| 513 | stg c2,7*8(%r2) // r[7] = word carried out of the last column | ||
| 514 | |||
| 515 | lmg %r6,%r8,48(%r15) // reload the %r6-%r8 values spilled on entry (a second stmg here would leave them clobbered for the caller) | ||
| 516 | br %r14 | ||
| 517 | .size bn_mul_comba4,.-bn_mul_comba4 | ||
| 518 | |||
| 519 | #define sqr_add_c(ai,c1,c2,c3) /* (c3:c2:c1) += a[ai]^2, with a addressed through %r3. Overwrites %r6 and %r7. The parameters c1,c2,c3 shadow the file-level aliases of the same names, so each call site picks which register is the low, middle and top accumulator word. */ \ | ||
| 520 | lg %r7,ai*8(%r3); /* %r7 = a[ai] */ \ | ||
| 521 | mlgr %r6,%r7; /* %r6:%r7 = %r7 * %r7, 128-bit product (high:low) */ \ | ||
| 522 | algr c1,%r7; /* low accumulator word += low product word, carry out */ \ | ||
| 523 | alcgr c2,%r6; /* middle word += high product word + carry */ \ | ||
| 524 | alcgr c3,zero /* top word += carry only (zero is expected to hold 0) */ | ||
| 525 | |||
| 526 | #define sqr_add_c2(ai,aj,c1,c2,c3) /* (c3:c2:c1) += 2 * a[ai] * a[aj], with a addressed through %r3. Overwrites %r6 and %r7. Used for the off-diagonal terms of a square, which occur twice. */ \ | ||
| 527 | lg %r7,ai*8(%r3); /* %r7 = a[ai] */ \ | ||
| 528 | mlg %r6,aj*8(%r3); /* %r6:%r7 = a[ai] * a[aj], 128-bit product (high:low) */ \ | ||
| 529 | algr c1,%r7; /* first addition of the product: low word, carry out */ \ | ||
| 530 | alcgr c2,%r6; /* high word + carry */ \ | ||
| 531 | alcgr c3,zero; /* carry into the top word */ \ | ||
| 532 | algr c1,%r7; /* second addition of the same product; presumably added twice instead of doubling %r6:%r7 so that no bit is lost when 2*product exceeds 128 bits */ \ | ||
| 533 | alcgr c2,%r6; \ | ||
| 534 | alcgr c3,zero | ||
| 535 | |||
| 536 | // void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3); -- fully unrolled square of the 8 words at (%r3); the 16 result words are stored at (%r2), one column (all a[i]*a[j] with i+j == k) per result word k. | ||
| 537 | .globl bn_sqr_comba8 | ||
| 538 | .type bn_sqr_comba8,@function | ||
| 539 | .align 4 | ||
| 540 | bn_sqr_comba8: | ||
| 541 | stmg %r6,%r8,48(%r15) // spill %r6-%r8 at 48(%r15); reloaded by the lmg before returning | ||
| 542 | |||
| 543 | lghi c1,0 // clear the three accumulator words (c1..c3 and zero are register aliases defined earlier in the file) | ||
| 544 | lghi c2,0 | ||
| 545 | lghi c3,0 | ||
| 546 | lghi zero,0 // constant 0, used by the macros to fold the last carry into the top word | ||
| 547 | |||
| 548 | sqr_add_c(0,c1,c2,c3); // column 0: a[0]^2 | ||
| 549 | stg c1,0*8(%r2) // r[0] = lowest accumulator word | ||
| 550 | lghi c1,0 // c1 is free again: it becomes the top word of the next column (roles rotate c1->c2->c3) | ||
| 551 | |||
| 552 | sqr_add_c2(1,0,c2,c3,c1); // column 1: 2*a[1]*a[0] | ||
| 553 | stg c2,1*8(%r2) // r[1] | ||
| 554 | lghi c2,0 | ||
| 555 | |||
| 556 | sqr_add_c(1,c3,c1,c2); // column 2: a[1]^2 + 2*a[2]*a[0] | ||
| 557 | sqr_add_c2(2,0,c3,c1,c2); | ||
| 558 | stg c3,2*8(%r2) // r[2] | ||
| 559 | lghi c3,0 | ||
| 560 | |||
| 561 | sqr_add_c2(3,0,c1,c2,c3); // column 3: off-diagonal pairs only (odd column, no square term) | ||
| 562 | sqr_add_c2(2,1,c1,c2,c3); | ||
| 563 | stg c1,3*8(%r2) // r[3] | ||
| 564 | lghi c1,0 | ||
| 565 | |||
| 566 | sqr_add_c(2,c2,c3,c1); // column 4: a[2]^2 plus pairs | ||
| 567 | sqr_add_c2(3,1,c2,c3,c1); | ||
| 568 | sqr_add_c2(4,0,c2,c3,c1); | ||
| 569 | stg c2,4*8(%r2) // r[4] | ||
| 570 | lghi c2,0 | ||
| 571 | |||
| 572 | sqr_add_c2(5,0,c3,c1,c2); // column 5 | ||
| 573 | sqr_add_c2(4,1,c3,c1,c2); | ||
| 574 | sqr_add_c2(3,2,c3,c1,c2); | ||
| 575 | stg c3,5*8(%r2) // r[5] | ||
| 576 | lghi c3,0 | ||
| 577 | |||
| 578 | sqr_add_c(3,c1,c2,c3); // column 6: a[3]^2 plus pairs | ||
| 579 | sqr_add_c2(4,2,c1,c2,c3); | ||
| 580 | sqr_add_c2(5,1,c1,c2,c3); | ||
| 581 | sqr_add_c2(6,0,c1,c2,c3); | ||
| 582 | stg c1,6*8(%r2) // r[6] | ||
| 583 | lghi c1,0 | ||
| 584 | |||
| 585 | sqr_add_c2(7,0,c2,c3,c1); // column 7: the widest column, four pairs | ||
| 586 | sqr_add_c2(6,1,c2,c3,c1); | ||
| 587 | sqr_add_c2(5,2,c2,c3,c1); | ||
| 588 | sqr_add_c2(4,3,c2,c3,c1); | ||
| 589 | stg c2,7*8(%r2) // r[7] | ||
| 590 | lghi c2,0 | ||
| 591 | |||
| 592 | sqr_add_c(4,c3,c1,c2); // column 8: a[4]^2 plus pairs; from here on the columns shrink | ||
| 593 | sqr_add_c2(5,3,c3,c1,c2); | ||
| 594 | sqr_add_c2(6,2,c3,c1,c2); | ||
| 595 | sqr_add_c2(7,1,c3,c1,c2); | ||
| 596 | stg c3,8*8(%r2) // r[8] | ||
| 597 | lghi c3,0 | ||
| 598 | |||
| 599 | sqr_add_c2(7,2,c1,c2,c3); // column 9 | ||
| 600 | sqr_add_c2(6,3,c1,c2,c3); | ||
| 601 | sqr_add_c2(5,4,c1,c2,c3); | ||
| 602 | stg c1,9*8(%r2) // r[9] | ||
| 603 | lghi c1,0 | ||
| 604 | |||
| 605 | sqr_add_c(5,c2,c3,c1); // column 10: a[5]^2 plus pairs | ||
| 606 | sqr_add_c2(6,4,c2,c3,c1); | ||
| 607 | sqr_add_c2(7,3,c2,c3,c1); | ||
| 608 | stg c2,10*8(%r2) // r[10] | ||
| 609 | lghi c2,0 | ||
| 610 | |||
| 611 | sqr_add_c2(7,4,c3,c1,c2); // column 11 | ||
| 612 | sqr_add_c2(6,5,c3,c1,c2); | ||
| 613 | stg c3,11*8(%r2) // r[11] | ||
| 614 | lghi c3,0 | ||
| 615 | |||
| 616 | sqr_add_c(6,c1,c2,c3); // column 12: a[6]^2 + 2*a[7]*a[5] | ||
| 617 | sqr_add_c2(7,5,c1,c2,c3); | ||
| 618 | stg c1,12*8(%r2) // r[12] | ||
| 619 | lghi c1,0 | ||
| 620 | |||
| 621 | sqr_add_c2(7,6,c2,c3,c1); // column 13: 2*a[7]*a[6] | ||
| 622 | stg c2,13*8(%r2) // r[13] | ||
| 623 | lghi c2,0 | ||
| 624 | |||
| 625 | sqr_add_c(7,c3,c1,c2); // column 14 (last): a[7]^2 | ||
| 626 | stg c3,14*8(%r2) // r[14] | ||
| 627 | stg c1,15*8(%r2) // r[15] = word carried out of the last column | ||
| 628 | |||
| 629 | lmg %r6,%r8,48(%r15) // reload the %r6-%r8 values spilled on entry | ||
| 630 | br %r14 | ||
| 631 | .size bn_sqr_comba8,.-bn_sqr_comba8 | ||
| 632 | |||
| 633 | // void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3); -- fully unrolled square of the 4 words at (%r3); the 8 result words are stored at (%r2), one column (all a[i]*a[j] with i+j == k) per result word k. | ||
| 634 | .globl bn_sqr_comba4 | ||
| 635 | .type bn_sqr_comba4,@function | ||
| 636 | .align 4 | ||
| 637 | bn_sqr_comba4: | ||
| 638 | stmg %r6,%r8,48(%r15) // spill %r6-%r8 at 48(%r15); reloaded by the lmg before returning | ||
| 639 | |||
| 640 | lghi c1,0 // clear the three accumulator words (c1..c3 and zero are register aliases defined earlier in the file) | ||
| 641 | lghi c2,0 | ||
| 642 | lghi c3,0 | ||
| 643 | lghi zero,0 // constant 0, used by the macros to fold the last carry into the top word | ||
| 644 | |||
| 645 | sqr_add_c(0,c1,c2,c3); // column 0: a[0]^2 | ||
| 646 | stg c1,0*8(%r2) // r[0] = lowest accumulator word | ||
| 647 | lghi c1,0 // c1 is free again: it becomes the top word of the next column (roles rotate c1->c2->c3) | ||
| 648 | |||
| 649 | sqr_add_c2(1,0,c2,c3,c1); // column 1: 2*a[1]*a[0] | ||
| 650 | stg c2,1*8(%r2) // r[1] | ||
| 651 | lghi c2,0 | ||
| 652 | |||
| 653 | sqr_add_c(1,c3,c1,c2); // column 2: a[1]^2 + 2*a[2]*a[0] | ||
| 654 | sqr_add_c2(2,0,c3,c1,c2); | ||
| 655 | stg c3,2*8(%r2) // r[2] | ||
| 656 | lghi c3,0 | ||
| 657 | |||
| 658 | sqr_add_c2(3,0,c1,c2,c3); // column 3: 2*a[3]*a[0] + 2*a[2]*a[1] | ||
| 659 | sqr_add_c2(2,1,c1,c2,c3); | ||
| 660 | stg c1,3*8(%r2) // r[3] | ||
| 661 | lghi c1,0 | ||
| 662 | |||
| 663 | sqr_add_c(2,c2,c3,c1); // column 4: a[2]^2 + 2*a[3]*a[1] | ||
| 664 | sqr_add_c2(3,1,c2,c3,c1); | ||
| 665 | stg c2,4*8(%r2) // r[4] | ||
| 666 | lghi c2,0 | ||
| 667 | |||
| 668 | sqr_add_c2(3,2,c3,c1,c2); // column 5: 2*a[3]*a[2] | ||
| 669 | stg c3,5*8(%r2) // r[5] | ||
| 670 | lghi c3,0 | ||
| 671 | |||
| 672 | sqr_add_c(3,c1,c2,c3); // column 6 (last): a[3]^2 | ||
| 673 | stg c1,6*8(%r2) // r[6] | ||
| 674 | stg c2,7*8(%r2) // r[7] = word carried out of the last column | ||
| 675 | |||
| 676 | lmg %r6,%r8,48(%r15) // reload the %r6-%r8 values spilled on entry | ||
| 677 | br %r14 | ||
| 678 | .size bn_sqr_comba4,.-bn_sqr_comba4 | ||
