diff options
| author | cvs2svn <admin@example.com> | 2015-03-08 16:48:48 +0000 |
|---|---|---|
| committer | cvs2svn <admin@example.com> | 2015-03-08 16:48:48 +0000 |
| commit | da1a9ad3a4a867ba6569c05e6fca66d7f296c553 (patch) | |
| tree | 44872802e872bdfd60730fa9cf01d9d5751251c1 /src/lib/libcrypto/bn/asm/parisc-mont.pl | |
| parent | 973703db67a8e73d70e63afa8f2cde19da09144d (diff) | |
| download | openbsd-OPENBSD_5_7_BASE.tar.gz openbsd-OPENBSD_5_7_BASE.tar.bz2 openbsd-OPENBSD_5_7_BASE.zip | |
This commit was manufactured by cvs2git to create tag 'OPENBSD_5_7_BASE'.OPENBSD_5_7_BASE
Diffstat (limited to 'src/lib/libcrypto/bn/asm/parisc-mont.pl')
| -rw-r--r-- | src/lib/libcrypto/bn/asm/parisc-mont.pl | 993 |
1 files changed, 0 insertions, 993 deletions
diff --git a/src/lib/libcrypto/bn/asm/parisc-mont.pl b/src/lib/libcrypto/bn/asm/parisc-mont.pl deleted file mode 100644 index fcfdee1f1f..0000000000 --- a/src/lib/libcrypto/bn/asm/parisc-mont.pl +++ /dev/null | |||
| @@ -1,993 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # On PA-7100LC this module performs ~90-50% better, less for longer | ||
| 11 | # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means | ||
| 12 | # that compiler utilized xmpyu instruction to perform 32x32=64-bit | ||
| 13 | # multiplication, which in turn means that "baseline" performance was | ||
| 14 | # optimal in respect to instruction set capabilities. Fair comparison | ||
| 15 | # with vendor compiler is problematic, because OpenSSL doesn't define | ||
| 16 | # BN_LLONG [presumably] for historical reasons, which drives compiler | ||
| 17 | # toward 4 times 16x16=32-bit multiplications [plus complementary | ||
| 18 | # shifts and additions] instead. This means that you should observe | ||
| 19 | # several times improvement over code generated by vendor compiler | ||
| 20 | # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual | ||
| 21 | # improvement coefficient was never collected on PA-7100LC, or any | ||
| 22 | # other 1.1 CPU, because I don't have access to such machine with | ||
| 23 | # vendor compiler. But to give you a taste, PA-RISC 1.1 code path | ||
| 24 | # reportedly outperformed code generated by cc +DA1.1 +O3 by factor | ||
| 25 | # of ~5x on PA-8600. | ||
| 26 | # | ||
| 27 | # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is | ||
| 28 | # reportedly ~2x faster than vendor compiler generated code [according | ||
| 29 | # to comment in pa-risc2[W].s]. Here comes a catch. Execution core of | ||
| 30 | # this implementation is actually 32-bit one, in the sense that it | ||
| 31 | # operates on 32-bit values. But pa-risc2[W].s operates on arrays of | ||
| 32 | # 64-bit BN_LONGs... How do they interoperate then? No problem. This | ||
| 33 | # module picks halves of 64-bit values in reverse order and pretends | ||
| 34 | # they were 32-bit BN_LONGs. But can 32-bit core compete with "pure" | ||
| 35 | # 64-bit code such as pa-risc2[W].s then? Well, the thing is that | ||
| 36 | # 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do, | ||
| 37 | # i.e. there is no "wider" multiplication like on most other 64-bit | ||
| 38 | # platforms. This means that even being effectively 32-bit, this | ||
| 39 | # implementation performs "64-bit" computational task in same amount | ||
| 40 | # of arithmetic operations, most notably multiplications. It requires | ||
| 41 | # more memory references, most notably to tp[num], but this doesn't | ||
| 42 | # seem to exhaust memory port capacity. And indeed, dedicated PA-RISC | ||
| 43 | # 2.0 code path provides virtually same performance as pa-risc2[W].s: | ||
| 44 | # it's ~10% better for shortest key length and ~10% worse for longest | ||
| 45 | # one. | ||
| 46 | # | ||
| 47 | # In case it wasn't clear: the module has two distinct code paths, | ||
| 48 | # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit | ||
| 49 | # additions and 64-bit integer loads, not to mention specific | ||
| 50 | # instruction scheduling. In 64-bit build naturally only 2.0 code path | ||
| 51 | # is assembled. In 32-bit application context both code paths are | ||
| 52 | # assembled, PA-RISC 2.0 CPU is detected at run-time and proper path | ||
| 53 | # is taken automatically. Also, in 32-bit build the module imposes | ||
| 54 | # couple of limitations: vector lengths have to be even and vector | ||
| 55 | # addresses have to be 64-bit aligned. Normally neither is a problem: | ||
| 56 | # most common key lengths are even and vectors are commonly malloc-ed, | ||
| 57 | # which ensures alignment. | ||
| 58 | # | ||
| 59 | # Special thanks to polarhome.com for providing HP-UX account on | ||
| 60 | # PA-RISC 1.1 machine, and to correspondent who chose to remain | ||
| 61 | # anonymous for testing the code on PA-RISC 2.0 machine. | ||
| 62 | |||
| 63 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 64 | |||
| 65 | $flavour = shift; | ||
| 66 | $output = shift; | ||
| 67 | |||
| 68 | open STDOUT,">$output"; | ||
| 69 | |||
| 70 | if ($flavour =~ /64/) { | ||
| 71 | $LEVEL ="2.0W"; | ||
| 72 | $SIZE_T =8; | ||
| 73 | $FRAME_MARKER =80; | ||
| 74 | $SAVED_RP =16; | ||
| 75 | $PUSH ="std"; | ||
| 76 | $PUSHMA ="std,ma"; | ||
| 77 | $POP ="ldd"; | ||
| 78 | $POPMB ="ldd,mb"; | ||
| 79 | $BN_SZ =$SIZE_T; | ||
| 80 | } else { | ||
| 81 | $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0"; | ||
| 82 | $SIZE_T =4; | ||
| 83 | $FRAME_MARKER =48; | ||
| 84 | $SAVED_RP =20; | ||
| 85 | $PUSH ="stw"; | ||
| 86 | $PUSHMA ="stwm"; | ||
| 87 | $POP ="ldw"; | ||
| 88 | $POPMB ="ldwm"; | ||
| 89 | $BN_SZ =$SIZE_T; | ||
| 90 | } | ||
| 91 | |||
| 92 | $FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker | ||
| 93 | # [+ argument transfer] | ||
| 94 | $LOCALS=$FRAME-$FRAME_MARKER; | ||
| 95 | $FRAME+=32; # local variables | ||
| 96 | |||
| 97 | $tp="%r31"; | ||
| 98 | $ti1="%r29"; | ||
| 99 | $ti0="%r28"; | ||
| 100 | |||
| 101 | $rp="%r26"; | ||
| 102 | $ap="%r25"; | ||
| 103 | $bp="%r24"; | ||
| 104 | $np="%r23"; | ||
| 105 | $n0="%r22"; # passed through stack in 32-bit | ||
| 106 | $num="%r21"; # passed through stack in 32-bit | ||
| 107 | $idx="%r20"; | ||
| 108 | $arrsz="%r19"; | ||
| 109 | |||
| 110 | $nm1="%r7"; | ||
| 111 | $nm0="%r6"; | ||
| 112 | $ab1="%r5"; | ||
| 113 | $ab0="%r4"; | ||
| 114 | |||
| 115 | $fp="%r3"; | ||
| 116 | $hi1="%r2"; | ||
| 117 | $hi0="%r1"; | ||
| 118 | |||
| 119 | $xfer=$n0; # accommodates [-16..15] offset in fld[dw]s | ||
| 120 | |||
| 121 | $fm0="%fr4"; $fti=$fm0; | ||
| 122 | $fbi="%fr5L"; | ||
| 123 | $fn0="%fr5R"; | ||
| 124 | $fai="%fr6"; $fab0="%fr7"; $fab1="%fr8"; | ||
| 125 | $fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11"; | ||
| 126 | |||
| 127 | $code=<<___; | ||
| 128 | .LEVEL $LEVEL | ||
| 129 | #if 0 | ||
| 130 | .SPACE \$TEXT\$ | ||
| 131 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 132 | #else | ||
| 133 | .text | ||
| 134 | #endif | ||
| 135 | |||
| 136 | .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR | ||
| 137 | .ALIGN 64 | ||
| 138 | bn_mul_mont | ||
| 139 | .PROC | ||
| 140 | .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6 | ||
| 141 | .ENTRY | ||
| 142 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 143 | $PUSHMA %r3,$FRAME(%sp) | ||
| 144 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 145 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 146 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 147 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 148 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 149 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 150 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 151 | ldo -$FRAME(%sp),$fp | ||
| 152 | ___ | ||
| 153 | $code.=<<___ if ($SIZE_T==4); | ||
| 154 | ldw `-$FRAME_MARKER-4`($fp),$n0 | ||
| 155 | ldw `-$FRAME_MARKER-8`($fp),$num | ||
| 156 | nop | ||
| 157 | nop ; alignment | ||
| 158 | ___ | ||
| 159 | $code.=<<___ if ($BN_SZ==4); | ||
| 160 | comiclr,<= 6,$num,%r0 ; are vectors long enough? | ||
| 161 | b L\$abort | ||
| 162 | ldi 0,%r28 ; signal "unhandled" | ||
| 163 | add,ev %r0,$num,$num ; is $num even? | ||
| 164 | b L\$abort | ||
| 165 | nop | ||
| 166 | or $ap,$np,$ti1 | ||
| 167 | extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? | ||
| 168 | b L\$abort | ||
| 169 | nop | ||
| 170 | nop ; alignment | ||
| 171 | nop | ||
| 172 | |||
| 173 | fldws 0($n0),${fn0} | ||
| 174 | fldws,ma 4($bp),${fbi} ; bp[0] | ||
| 175 | ___ | ||
| 176 | $code.=<<___ if ($BN_SZ==8); | ||
| 177 | comib,> 3,$num,L\$abort ; are vectors long enough? | ||
| 178 | ldi 0,%r28 ; signal "unhandled" | ||
| 179 | addl $num,$num,$num ; I operate on 32-bit values | ||
| 180 | |||
| 181 | fldws 4($n0),${fn0} ; only low part of n0 | ||
| 182 | fldws 4($bp),${fbi} ; bp[0] in flipped word order | ||
| 183 | ___ | ||
| 184 | $code.=<<___; | ||
| 185 | fldds 0($ap),${fai} ; ap[0,1] | ||
| 186 | fldds 0($np),${fni} ; np[0,1] | ||
| 187 | |||
| 188 | sh2addl $num,%r0,$arrsz | ||
| 189 | ldi 31,$hi0 | ||
| 190 | ldo 36($arrsz),$hi1 ; space for tp[num+1] | ||
| 191 | andcm $hi1,$hi0,$hi1 ; align | ||
| 192 | addl $hi1,%sp,%sp | ||
| 193 | $PUSH $fp,-$SIZE_T(%sp) | ||
| 194 | |||
| 195 | ldo `$LOCALS+16`($fp),$xfer | ||
| 196 | ldo `$LOCALS+32+4`($fp),$tp | ||
| 197 | |||
| 198 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] | ||
| 199 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] | ||
| 200 | xmpyu ${fn0},${fab0}R,${fm0} | ||
| 201 | |||
| 202 | addl $arrsz,$ap,$ap ; point at the end | ||
| 203 | addl $arrsz,$np,$np | ||
| 204 | subi 0,$arrsz,$idx ; j=0 | ||
| 205 | ldo 8($idx),$idx ; j++++ | ||
| 206 | |||
| 207 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m | ||
| 208 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m | ||
| 209 | fstds ${fab0},-16($xfer) | ||
| 210 | fstds ${fnm0},-8($xfer) | ||
| 211 | fstds ${fab1},0($xfer) | ||
| 212 | fstds ${fnm1},8($xfer) | ||
| 213 | flddx $idx($ap),${fai} ; ap[2,3] | ||
| 214 | flddx $idx($np),${fni} ; np[2,3] | ||
| 215 | ___ | ||
| 216 | $code.=<<___ if ($BN_SZ==4); | ||
| 217 | #ifndef __OpenBSD__ | ||
| 218 | mtctl $hi0,%cr11 ; $hi0 still holds 31 | ||
| 219 | extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 | ||
| 220 | b L\$parisc11 | ||
| 221 | nop | ||
| 222 | ___ | ||
| 223 | $code.=<<___; # PA-RISC 2.0 code-path | ||
| 224 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
| 225 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 226 | ldd -16($xfer),$ab0 | ||
| 227 | fstds ${fab0},-16($xfer) | ||
| 228 | |||
| 229 | extrd,u $ab0,31,32,$hi0 | ||
| 230 | extrd,u $ab0,63,32,$ab0 | ||
| 231 | ldd -8($xfer),$nm0 | ||
| 232 | fstds ${fnm0},-8($xfer) | ||
| 233 | ldo 8($idx),$idx ; j++++ | ||
| 234 | addl $ab0,$nm0,$nm0 ; low part is discarded | ||
| 235 | extrd,u $nm0,31,32,$hi1 | ||
| 236 | |||
| 237 | L\$1st | ||
| 238 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] | ||
| 239 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
| 240 | ldd 0($xfer),$ab1 | ||
| 241 | fstds ${fab1},0($xfer) | ||
| 242 | addl $hi0,$ab1,$ab1 | ||
| 243 | extrd,u $ab1,31,32,$hi0 | ||
| 244 | ldd 8($xfer),$nm1 | ||
| 245 | fstds ${fnm1},8($xfer) | ||
| 246 | extrd,u $ab1,63,32,$ab1 | ||
| 247 | addl $hi1,$nm1,$nm1 | ||
| 248 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
| 249 | flddx $idx($np),${fni} ; np[j,j+1] | ||
| 250 | addl $ab1,$nm1,$nm1 | ||
| 251 | extrd,u $nm1,31,32,$hi1 | ||
| 252 | |||
| 253 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
| 254 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 255 | ldd -16($xfer),$ab0 | ||
| 256 | fstds ${fab0},-16($xfer) | ||
| 257 | addl $hi0,$ab0,$ab0 | ||
| 258 | extrd,u $ab0,31,32,$hi0 | ||
| 259 | ldd -8($xfer),$nm0 | ||
| 260 | fstds ${fnm0},-8($xfer) | ||
| 261 | extrd,u $ab0,63,32,$ab0 | ||
| 262 | addl $hi1,$nm0,$nm0 | ||
| 263 | stw $nm1,-4($tp) ; tp[j-1] | ||
| 264 | addl $ab0,$nm0,$nm0 | ||
| 265 | stw,ma $nm0,8($tp) ; tp[j-1] | ||
| 266 | addib,<> 8,$idx,L\$1st ; j++++ | ||
| 267 | extrd,u $nm0,31,32,$hi1 | ||
| 268 | |||
| 269 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] | ||
| 270 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
| 271 | ldd 0($xfer),$ab1 | ||
| 272 | fstds ${fab1},0($xfer) | ||
| 273 | addl $hi0,$ab1,$ab1 | ||
| 274 | extrd,u $ab1,31,32,$hi0 | ||
| 275 | ldd 8($xfer),$nm1 | ||
| 276 | fstds ${fnm1},8($xfer) | ||
| 277 | extrd,u $ab1,63,32,$ab1 | ||
| 278 | addl $hi1,$nm1,$nm1 | ||
| 279 | ldd -16($xfer),$ab0 | ||
| 280 | addl $ab1,$nm1,$nm1 | ||
| 281 | ldd -8($xfer),$nm0 | ||
| 282 | extrd,u $nm1,31,32,$hi1 | ||
| 283 | |||
| 284 | addl $hi0,$ab0,$ab0 | ||
| 285 | extrd,u $ab0,31,32,$hi0 | ||
| 286 | stw $nm1,-4($tp) ; tp[j-1] | ||
| 287 | extrd,u $ab0,63,32,$ab0 | ||
| 288 | addl $hi1,$nm0,$nm0 | ||
| 289 | ldd 0($xfer),$ab1 | ||
| 290 | addl $ab0,$nm0,$nm0 | ||
| 291 | ldd,mb 8($xfer),$nm1 | ||
| 292 | extrd,u $nm0,31,32,$hi1 | ||
| 293 | stw,ma $nm0,8($tp) ; tp[j-1] | ||
| 294 | |||
| 295 | ldo -1($num),$num ; i-- | ||
| 296 | subi 0,$arrsz,$idx ; j=0 | ||
| 297 | ___ | ||
| 298 | $code.=<<___ if ($BN_SZ==4); | ||
| 299 | fldws,ma 4($bp),${fbi} ; bp[1] | ||
| 300 | ___ | ||
| 301 | $code.=<<___ if ($BN_SZ==8); | ||
| 302 | fldws 0($bp),${fbi} ; bp[1] in flipped word order | ||
| 303 | ___ | ||
| 304 | $code.=<<___; | ||
| 305 | flddx $idx($ap),${fai} ; ap[0,1] | ||
| 306 | flddx $idx($np),${fni} ; np[0,1] | ||
| 307 | fldws 8($xfer),${fti}R ; tp[0] | ||
| 308 | addl $hi0,$ab1,$ab1 | ||
| 309 | extrd,u $ab1,31,32,$hi0 | ||
| 310 | extrd,u $ab1,63,32,$ab1 | ||
| 311 | ldo 8($idx),$idx ; j++++ | ||
| 312 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] | ||
| 313 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] | ||
| 314 | addl $hi1,$nm1,$nm1 | ||
| 315 | addl $ab1,$nm1,$nm1 | ||
| 316 | extrd,u $nm1,31,32,$hi1 | ||
| 317 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
| 318 | stw $nm1,-4($tp) ; tp[j-1] | ||
| 319 | |||
| 320 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
| 321 | fcpy,sgl %fr0,${fab0}L | ||
| 322 | addl $hi1,$hi0,$hi0 | ||
| 323 | extrd,u $hi0,31,32,$hi1 | ||
| 324 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
| 325 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
| 326 | stw $hi0,0($tp) | ||
| 327 | stw $hi1,4($tp) | ||
| 328 | |||
| 329 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
| 330 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
| 331 | xmpyu ${fn0},${fab0}R,${fm0} | ||
| 332 | ldo `$LOCALS+32+4`($fp),$tp | ||
| 333 | L\$outer | ||
| 334 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m | ||
| 335 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m | ||
| 336 | fstds ${fab0},-16($xfer) ; 33-bit value | ||
| 337 | fstds ${fnm0},-8($xfer) | ||
| 338 | flddx $idx($ap),${fai} ; ap[2] | ||
| 339 | flddx $idx($np),${fni} ; np[2] | ||
| 340 | ldo 8($idx),$idx ; j++++ | ||
| 341 | ldd -16($xfer),$ab0 ; 33-bit value | ||
| 342 | ldd -8($xfer),$nm0 | ||
| 343 | ldw 0($xfer),$hi0 ; high part | ||
| 344 | |||
| 345 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
| 346 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 347 | extrd,u $ab0,31,32,$ti0 ; carry bit | ||
| 348 | extrd,u $ab0,63,32,$ab0 | ||
| 349 | fstds ${fab1},0($xfer) | ||
| 350 | addl $ti0,$hi0,$hi0 ; account carry bit | ||
| 351 | fstds ${fnm1},8($xfer) | ||
| 352 | addl $ab0,$nm0,$nm0 ; low part is discarded | ||
| 353 | ldw 0($tp),$ti1 ; tp[1] | ||
| 354 | extrd,u $nm0,31,32,$hi1 | ||
| 355 | fstds ${fab0},-16($xfer) | ||
| 356 | fstds ${fnm0},-8($xfer) | ||
| 357 | |||
| 358 | L\$inner | ||
| 359 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] | ||
| 360 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
| 361 | ldd 0($xfer),$ab1 | ||
| 362 | fstds ${fab1},0($xfer) | ||
| 363 | addl $hi0,$ti1,$ti1 | ||
| 364 | addl $ti1,$ab1,$ab1 | ||
| 365 | ldd 8($xfer),$nm1 | ||
| 366 | fstds ${fnm1},8($xfer) | ||
| 367 | extrd,u $ab1,31,32,$hi0 | ||
| 368 | extrd,u $ab1,63,32,$ab1 | ||
| 369 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
| 370 | flddx $idx($np),${fni} ; np[j,j+1] | ||
| 371 | addl $hi1,$nm1,$nm1 | ||
| 372 | addl $ab1,$nm1,$nm1 | ||
| 373 | ldw 4($tp),$ti0 ; tp[j] | ||
| 374 | stw $nm1,-4($tp) ; tp[j-1] | ||
| 375 | |||
| 376 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
| 377 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 378 | ldd -16($xfer),$ab0 | ||
| 379 | fstds ${fab0},-16($xfer) | ||
| 380 | addl $hi0,$ti0,$ti0 | ||
| 381 | addl $ti0,$ab0,$ab0 | ||
| 382 | ldd -8($xfer),$nm0 | ||
| 383 | fstds ${fnm0},-8($xfer) | ||
| 384 | extrd,u $ab0,31,32,$hi0 | ||
| 385 | extrd,u $nm1,31,32,$hi1 | ||
| 386 | ldw 8($tp),$ti1 ; tp[j] | ||
| 387 | extrd,u $ab0,63,32,$ab0 | ||
| 388 | addl $hi1,$nm0,$nm0 | ||
| 389 | addl $ab0,$nm0,$nm0 | ||
| 390 | stw,ma $nm0,8($tp) ; tp[j-1] | ||
| 391 | addib,<> 8,$idx,L\$inner ; j++++ | ||
| 392 | extrd,u $nm0,31,32,$hi1 | ||
| 393 | |||
| 394 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] | ||
| 395 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
| 396 | ldd 0($xfer),$ab1 | ||
| 397 | fstds ${fab1},0($xfer) | ||
| 398 | addl $hi0,$ti1,$ti1 | ||
| 399 | addl $ti1,$ab1,$ab1 | ||
| 400 | ldd 8($xfer),$nm1 | ||
| 401 | fstds ${fnm1},8($xfer) | ||
| 402 | extrd,u $ab1,31,32,$hi0 | ||
| 403 | extrd,u $ab1,63,32,$ab1 | ||
| 404 | ldw 4($tp),$ti0 ; tp[j] | ||
| 405 | addl $hi1,$nm1,$nm1 | ||
| 406 | addl $ab1,$nm1,$nm1 | ||
| 407 | ldd -16($xfer),$ab0 | ||
| 408 | ldd -8($xfer),$nm0 | ||
| 409 | extrd,u $nm1,31,32,$hi1 | ||
| 410 | |||
| 411 | addl $hi0,$ab0,$ab0 | ||
| 412 | addl $ti0,$ab0,$ab0 | ||
| 413 | stw $nm1,-4($tp) ; tp[j-1] | ||
| 414 | extrd,u $ab0,31,32,$hi0 | ||
| 415 | ldw 8($tp),$ti1 ; tp[j] | ||
| 416 | extrd,u $ab0,63,32,$ab0 | ||
| 417 | addl $hi1,$nm0,$nm0 | ||
| 418 | ldd 0($xfer),$ab1 | ||
| 419 | addl $ab0,$nm0,$nm0 | ||
| 420 | ldd,mb 8($xfer),$nm1 | ||
| 421 | extrd,u $nm0,31,32,$hi1 | ||
| 422 | stw,ma $nm0,8($tp) ; tp[j-1] | ||
| 423 | |||
| 424 | addib,= -1,$num,L\$outerdone ; i-- | ||
| 425 | subi 0,$arrsz,$idx ; j=0 | ||
| 426 | ___ | ||
| 427 | $code.=<<___ if ($BN_SZ==4); | ||
| 428 | fldws,ma 4($bp),${fbi} ; bp[i] | ||
| 429 | ___ | ||
| 430 | $code.=<<___ if ($BN_SZ==8); | ||
| 431 | ldi 12,$ti0 ; bp[i] in flipped word order | ||
| 432 | addl,ev %r0,$num,$num | ||
| 433 | ldi -4,$ti0 | ||
| 434 | addl $ti0,$bp,$bp | ||
| 435 | fldws 0($bp),${fbi} | ||
| 436 | ___ | ||
| 437 | $code.=<<___; | ||
| 438 | flddx $idx($ap),${fai} ; ap[0] | ||
| 439 | addl $hi0,$ab1,$ab1 | ||
| 440 | flddx $idx($np),${fni} ; np[0] | ||
| 441 | fldws 8($xfer),${fti}R ; tp[0] | ||
| 442 | addl $ti1,$ab1,$ab1 | ||
| 443 | extrd,u $ab1,31,32,$hi0 | ||
| 444 | extrd,u $ab1,63,32,$ab1 | ||
| 445 | |||
| 446 | ldo 8($idx),$idx ; j++++ | ||
| 447 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] | ||
| 448 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] | ||
| 449 | ldw 4($tp),$ti0 ; tp[j] | ||
| 450 | |||
| 451 | addl $hi1,$nm1,$nm1 | ||
| 452 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
| 453 | addl $ab1,$nm1,$nm1 | ||
| 454 | extrd,u $nm1,31,32,$hi1 | ||
| 455 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
| 456 | fcpy,sgl %fr0,${fab0}L | ||
| 457 | stw $nm1,-4($tp) ; tp[j-1] | ||
| 458 | |||
| 459 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
| 460 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
| 461 | addl $hi1,$hi0,$hi0 | ||
| 462 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
| 463 | addl $ti0,$hi0,$hi0 | ||
| 464 | extrd,u $hi0,31,32,$hi1 | ||
| 465 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
| 466 | stw $hi0,0($tp) | ||
| 467 | stw $hi1,4($tp) | ||
| 468 | xmpyu ${fn0},${fab0}R,${fm0} | ||
| 469 | |||
| 470 | b L\$outer | ||
| 471 | ldo `$LOCALS+32+4`($fp),$tp | ||
| 472 | |||
| 473 | L\$outerdone | ||
| 474 | addl $hi0,$ab1,$ab1 | ||
| 475 | addl $ti1,$ab1,$ab1 | ||
| 476 | extrd,u $ab1,31,32,$hi0 | ||
| 477 | extrd,u $ab1,63,32,$ab1 | ||
| 478 | |||
| 479 | ldw 4($tp),$ti0 ; tp[j] | ||
| 480 | |||
| 481 | addl $hi1,$nm1,$nm1 | ||
| 482 | addl $ab1,$nm1,$nm1 | ||
| 483 | extrd,u $nm1,31,32,$hi1 | ||
| 484 | stw $nm1,-4($tp) ; tp[j-1] | ||
| 485 | |||
| 486 | addl $hi1,$hi0,$hi0 | ||
| 487 | addl $ti0,$hi0,$hi0 | ||
| 488 | extrd,u $hi0,31,32,$hi1 | ||
| 489 | stw $hi0,0($tp) | ||
| 490 | stw $hi1,4($tp) | ||
| 491 | |||
| 492 | ldo `$LOCALS+32`($fp),$tp | ||
| 493 | sub %r0,%r0,%r0 ; clear borrow | ||
| 494 | ___ | ||
| 495 | $code.=<<___ if ($BN_SZ==4); | ||
| 496 | ldws,ma 4($tp),$ti0 | ||
| 497 | extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? | ||
| 498 | b L\$sub_pa11 | ||
| 499 | addl $tp,$arrsz,$tp | ||
| 500 | L\$sub | ||
| 501 | ldwx $idx($np),$hi0 | ||
| 502 | subb $ti0,$hi0,$hi1 | ||
| 503 | ldwx $idx($tp),$ti0 | ||
| 504 | addib,<> 4,$idx,L\$sub | ||
| 505 | stws,ma $hi1,4($rp) | ||
| 506 | |||
| 507 | subb $ti0,%r0,$hi1 | ||
| 508 | ldo -4($tp),$tp | ||
| 509 | ___ | ||
| 510 | $code.=<<___ if ($BN_SZ==8); | ||
| 511 | ldd,ma 8($tp),$ti0 | ||
| 512 | L\$sub | ||
| 513 | ldd $idx($np),$hi0 | ||
| 514 | shrpd $ti0,$ti0,32,$ti0 ; flip word order | ||
| 515 | std $ti0,-8($tp) ; save flipped value | ||
| 516 | sub,db $ti0,$hi0,$hi1 | ||
| 517 | ldd,ma 8($tp),$ti0 | ||
| 518 | addib,<> 8,$idx,L\$sub | ||
| 519 | std,ma $hi1,8($rp) | ||
| 520 | |||
| 521 | extrd,u $ti0,31,32,$ti0 ; carry in flipped word order | ||
| 522 | sub,db $ti0,%r0,$hi1 | ||
| 523 | ldo -8($tp),$tp | ||
| 524 | ___ | ||
| 525 | $code.=<<___; | ||
| 526 | and $tp,$hi1,$ap | ||
| 527 | andcm $rp,$hi1,$bp | ||
| 528 | or $ap,$bp,$np | ||
| 529 | |||
| 530 | sub $rp,$arrsz,$rp ; rewind rp | ||
| 531 | subi 0,$arrsz,$idx | ||
| 532 | ldo `$LOCALS+32`($fp),$tp | ||
| 533 | L\$copy | ||
| 534 | ldd $idx($np),$hi0 | ||
| 535 | std,ma %r0,8($tp) | ||
| 536 | addib,<> 8,$idx,.-8 ; L\$copy | ||
| 537 | std,ma $hi0,8($rp) | ||
| 538 | ___ | ||
| 539 | |||
| 540 | if ($BN_SZ==4) { # PA-RISC 1.1 code-path | ||
| 541 | $ablo=$ab0; | ||
| 542 | $abhi=$ab1; | ||
| 543 | $nmlo0=$nm0; | ||
| 544 | $nmhi0=$nm1; | ||
| 545 | $nmlo1="%r9"; | ||
| 546 | $nmhi1="%r8"; | ||
| 547 | |||
| 548 | $code.=<<___; | ||
| 549 | b L\$done | ||
| 550 | nop | ||
| 551 | |||
| 552 | .ALIGN 8 | ||
| 553 | L\$parisc11 | ||
| 554 | #endif | ||
| 555 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
| 556 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 557 | ldw -12($xfer),$ablo | ||
| 558 | ldw -16($xfer),$hi0 | ||
| 559 | ldw -4($xfer),$nmlo0 | ||
| 560 | ldw -8($xfer),$nmhi0 | ||
| 561 | fstds ${fab0},-16($xfer) | ||
| 562 | fstds ${fnm0},-8($xfer) | ||
| 563 | |||
| 564 | ldo 8($idx),$idx ; j++++ | ||
| 565 | add $ablo,$nmlo0,$nmlo0 ; discarded | ||
| 566 | addc %r0,$nmhi0,$hi1 | ||
| 567 | ldw 4($xfer),$ablo | ||
| 568 | ldw 0($xfer),$abhi | ||
| 569 | nop | ||
| 570 | |||
| 571 | L\$1st_pa11 | ||
| 572 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] | ||
| 573 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
| 574 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
| 575 | flddx $idx($np),${fni} ; np[j,j+1] | ||
| 576 | add $hi0,$ablo,$ablo | ||
| 577 | ldw 12($xfer),$nmlo1 | ||
| 578 | addc %r0,$abhi,$hi0 | ||
| 579 | ldw 8($xfer),$nmhi1 | ||
| 580 | add $ablo,$nmlo1,$nmlo1 | ||
| 581 | fstds ${fab1},0($xfer) | ||
| 582 | addc %r0,$nmhi1,$nmhi1 | ||
| 583 | fstds ${fnm1},8($xfer) | ||
| 584 | add $hi1,$nmlo1,$nmlo1 | ||
| 585 | ldw -12($xfer),$ablo | ||
| 586 | addc %r0,$nmhi1,$hi1 | ||
| 587 | ldw -16($xfer),$abhi | ||
| 588 | |||
| 589 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
| 590 | ldw -4($xfer),$nmlo0 | ||
| 591 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 592 | ldw -8($xfer),$nmhi0 | ||
| 593 | add $hi0,$ablo,$ablo | ||
| 594 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 595 | addc %r0,$abhi,$hi0 | ||
| 596 | fstds ${fab0},-16($xfer) | ||
| 597 | add $ablo,$nmlo0,$nmlo0 | ||
| 598 | fstds ${fnm0},-8($xfer) | ||
| 599 | addc %r0,$nmhi0,$nmhi0 | ||
| 600 | ldw 0($xfer),$abhi | ||
| 601 | add $hi1,$nmlo0,$nmlo0 | ||
| 602 | ldw 4($xfer),$ablo | ||
| 603 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
| 604 | addib,<> 8,$idx,L\$1st_pa11 ; j++++ | ||
| 605 | addc %r0,$nmhi0,$hi1 | ||
| 606 | |||
| 607 | ldw 8($xfer),$nmhi1 | ||
| 608 | ldw 12($xfer),$nmlo1 | ||
| 609 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] | ||
| 610 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
| 611 | add $hi0,$ablo,$ablo | ||
| 612 | fstds ${fab1},0($xfer) | ||
| 613 | addc %r0,$abhi,$hi0 | ||
| 614 | fstds ${fnm1},8($xfer) | ||
| 615 | add $ablo,$nmlo1,$nmlo1 | ||
| 616 | ldw -16($xfer),$abhi | ||
| 617 | addc %r0,$nmhi1,$nmhi1 | ||
| 618 | ldw -12($xfer),$ablo | ||
| 619 | add $hi1,$nmlo1,$nmlo1 | ||
| 620 | ldw -8($xfer),$nmhi0 | ||
| 621 | addc %r0,$nmhi1,$hi1 | ||
| 622 | ldw -4($xfer),$nmlo0 | ||
| 623 | |||
| 624 | add $hi0,$ablo,$ablo | ||
| 625 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 626 | addc %r0,$abhi,$hi0 | ||
| 627 | ldw 0($xfer),$abhi | ||
| 628 | add $ablo,$nmlo0,$nmlo0 | ||
| 629 | ldw 4($xfer),$ablo | ||
| 630 | addc %r0,$nmhi0,$nmhi0 | ||
| 631 | ldws,mb 8($xfer),$nmhi1 | ||
| 632 | add $hi1,$nmlo0,$nmlo0 | ||
| 633 | ldw 4($xfer),$nmlo1 | ||
| 634 | addc %r0,$nmhi0,$hi1 | ||
| 635 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
| 636 | |||
| 637 | ldo -1($num),$num ; i-- | ||
| 638 | subi 0,$arrsz,$idx ; j=0 | ||
| 639 | |||
| 640 | fldws,ma 4($bp),${fbi} ; bp[1] | ||
| 641 | flddx $idx($ap),${fai} ; ap[0,1] | ||
| 642 | flddx $idx($np),${fni} ; np[0,1] | ||
| 643 | fldws 8($xfer),${fti}R ; tp[0] | ||
| 644 | add $hi0,$ablo,$ablo | ||
| 645 | addc %r0,$abhi,$hi0 | ||
| 646 | ldo 8($idx),$idx ; j++++ | ||
| 647 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] | ||
| 648 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] | ||
| 649 | add $hi1,$nmlo1,$nmlo1 | ||
| 650 | addc %r0,$nmhi1,$nmhi1 | ||
| 651 | add $ablo,$nmlo1,$nmlo1 | ||
| 652 | addc %r0,$nmhi1,$hi1 | ||
| 653 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
| 654 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 655 | |||
| 656 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
| 657 | fcpy,sgl %fr0,${fab0}L | ||
| 658 | add $hi1,$hi0,$hi0 | ||
| 659 | addc %r0,%r0,$hi1 | ||
| 660 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
| 661 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
| 662 | stw $hi0,0($tp) | ||
| 663 | stw $hi1,4($tp) | ||
| 664 | |||
| 665 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
| 666 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
| 667 | xmpyu ${fn0},${fab0}R,${fm0} | ||
| 668 | ldo `$LOCALS+32+4`($fp),$tp | ||
| 669 | L\$outer_pa11 | ||
| 670 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m | ||
| 671 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m | ||
| 672 | fstds ${fab0},-16($xfer) ; 33-bit value | ||
| 673 | fstds ${fnm0},-8($xfer) | ||
| 674 | flddx $idx($ap),${fai} ; ap[2,3] | ||
| 675 | flddx $idx($np),${fni} ; np[2,3] | ||
| 676 | ldw -16($xfer),$abhi ; carry bit actually | ||
| 677 | ldo 8($idx),$idx ; j++++ | ||
| 678 | ldw -12($xfer),$ablo | ||
| 679 | ldw -8($xfer),$nmhi0 | ||
| 680 | ldw -4($xfer),$nmlo0 | ||
| 681 | ldw 0($xfer),$hi0 ; high part | ||
| 682 | |||
| 683 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
| 684 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 685 | fstds ${fab1},0($xfer) | ||
| 686 | addl $abhi,$hi0,$hi0 ; account carry bit | ||
| 687 | fstds ${fnm1},8($xfer) | ||
| 688 | add $ablo,$nmlo0,$nmlo0 ; discarded | ||
| 689 | ldw 0($tp),$ti1 ; tp[1] | ||
| 690 | addc %r0,$nmhi0,$hi1 | ||
| 691 | fstds ${fab0},-16($xfer) | ||
| 692 | fstds ${fnm0},-8($xfer) | ||
| 693 | ldw 4($xfer),$ablo | ||
| 694 | ldw 0($xfer),$abhi | ||
| 695 | |||
| 696 | L\$inner_pa11 | ||
| 697 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] | ||
| 698 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
| 699 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
| 700 | flddx $idx($np),${fni} ; np[j,j+1] | ||
| 701 | add $hi0,$ablo,$ablo | ||
| 702 | ldw 4($tp),$ti0 ; tp[j] | ||
| 703 | addc %r0,$abhi,$abhi | ||
| 704 | ldw 12($xfer),$nmlo1 | ||
| 705 | add $ti1,$ablo,$ablo | ||
| 706 | ldw 8($xfer),$nmhi1 | ||
| 707 | addc %r0,$abhi,$hi0 | ||
| 708 | fstds ${fab1},0($xfer) | ||
| 709 | add $ablo,$nmlo1,$nmlo1 | ||
| 710 | fstds ${fnm1},8($xfer) | ||
| 711 | addc %r0,$nmhi1,$nmhi1 | ||
| 712 | ldw -12($xfer),$ablo | ||
| 713 | add $hi1,$nmlo1,$nmlo1 | ||
| 714 | ldw -16($xfer),$abhi | ||
| 715 | addc %r0,$nmhi1,$hi1 | ||
| 716 | |||
| 717 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
| 718 | ldw 8($tp),$ti1 ; tp[j] | ||
| 719 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 720 | ldw -4($xfer),$nmlo0 | ||
| 721 | add $hi0,$ablo,$ablo | ||
| 722 | ldw -8($xfer),$nmhi0 | ||
| 723 | addc %r0,$abhi,$abhi | ||
| 724 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 725 | add $ti0,$ablo,$ablo | ||
| 726 | fstds ${fab0},-16($xfer) | ||
| 727 | addc %r0,$abhi,$hi0 | ||
| 728 | fstds ${fnm0},-8($xfer) | ||
| 729 | add $ablo,$nmlo0,$nmlo0 | ||
| 730 | ldw 4($xfer),$ablo | ||
| 731 | addc %r0,$nmhi0,$nmhi0 | ||
| 732 | ldw 0($xfer),$abhi | ||
| 733 | add $hi1,$nmlo0,$nmlo0 | ||
| 734 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
| 735 | addib,<> 8,$idx,L\$inner_pa11 ; j++++ | ||
| 736 | addc %r0,$nmhi0,$hi1 | ||
| 737 | |||
| 738 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] | ||
| 739 | ldw 12($xfer),$nmlo1 | ||
| 740 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
| 741 | ldw 8($xfer),$nmhi1 | ||
| 742 | add $hi0,$ablo,$ablo | ||
| 743 | ldw 4($tp),$ti0 ; tp[j] | ||
| 744 | addc %r0,$abhi,$abhi | ||
| 745 | fstds ${fab1},0($xfer) | ||
| 746 | add $ti1,$ablo,$ablo | ||
| 747 | fstds ${fnm1},8($xfer) | ||
| 748 | addc %r0,$abhi,$hi0 | ||
| 749 | ldw -16($xfer),$abhi | ||
| 750 | add $ablo,$nmlo1,$nmlo1 | ||
| 751 | ldw -12($xfer),$ablo | ||
| 752 | addc %r0,$nmhi1,$nmhi1 | ||
| 753 | ldw -8($xfer),$nmhi0 | ||
| 754 | add $hi1,$nmlo1,$nmlo1 | ||
| 755 | ldw -4($xfer),$nmlo0 | ||
| 756 | addc %r0,$nmhi1,$hi1 | ||
| 757 | |||
| 758 | add $hi0,$ablo,$ablo | ||
| 759 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 760 | addc %r0,$abhi,$abhi | ||
| 761 | add $ti0,$ablo,$ablo | ||
| 762 | ldw 8($tp),$ti1 ; tp[j] | ||
| 763 | addc %r0,$abhi,$hi0 | ||
| 764 | ldw 0($xfer),$abhi | ||
| 765 | add $ablo,$nmlo0,$nmlo0 | ||
| 766 | ldw 4($xfer),$ablo | ||
| 767 | addc %r0,$nmhi0,$nmhi0 | ||
| 768 | ldws,mb 8($xfer),$nmhi1 | ||
| 769 | add $hi1,$nmlo0,$nmlo0 | ||
| 770 | ldw 4($xfer),$nmlo1 | ||
| 771 | addc %r0,$nmhi0,$hi1 | ||
| 772 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
| 773 | |||
| 774 | addib,= -1,$num,L\$outerdone_pa11; i-- | ||
| 775 | subi 0,$arrsz,$idx ; j=0 | ||
| 776 | |||
| 777 | fldws,ma 4($bp),${fbi} ; bp[i] | ||
| 778 | flddx $idx($ap),${fai} ; ap[0] | ||
| 779 | add $hi0,$ablo,$ablo | ||
| 780 | addc %r0,$abhi,$abhi | ||
| 781 | flddx $idx($np),${fni} ; np[0] | ||
| 782 | fldws 8($xfer),${fti}R ; tp[0] | ||
| 783 | add $ti1,$ablo,$ablo | ||
| 784 | addc %r0,$abhi,$hi0 | ||
| 785 | |||
| 786 | ldo 8($idx),$idx ; j++++ | ||
| 787 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] | ||
| 788 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] | ||
| 789 | ldw 4($tp),$ti0 ; tp[j] | ||
| 790 | |||
| 791 | add $hi1,$nmlo1,$nmlo1 | ||
| 792 | addc %r0,$nmhi1,$nmhi1 | ||
| 793 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
| 794 | add $ablo,$nmlo1,$nmlo1 | ||
| 795 | addc %r0,$nmhi1,$hi1 | ||
| 796 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
| 797 | fcpy,sgl %fr0,${fab0}L | ||
| 798 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 799 | |||
| 800 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
| 801 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
| 802 | add $hi1,$hi0,$hi0 | ||
| 803 | addc %r0,%r0,$hi1 | ||
| 804 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
| 805 | add $ti0,$hi0,$hi0 | ||
| 806 | addc %r0,$hi1,$hi1 | ||
| 807 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
| 808 | stw $hi0,0($tp) | ||
| 809 | stw $hi1,4($tp) | ||
| 810 | xmpyu ${fn0},${fab0}R,${fm0} | ||
| 811 | |||
| 812 | b L\$outer_pa11 | ||
| 813 | ldo `$LOCALS+32+4`($fp),$tp | ||
| 814 | |||
| 815 | L\$outerdone_pa11 | ||
| 816 | add $hi0,$ablo,$ablo | ||
| 817 | addc %r0,$abhi,$abhi | ||
| 818 | add $ti1,$ablo,$ablo | ||
| 819 | addc %r0,$abhi,$hi0 | ||
| 820 | |||
| 821 | ldw 4($tp),$ti0 ; tp[j] | ||
| 822 | |||
| 823 | add $hi1,$nmlo1,$nmlo1 | ||
| 824 | addc %r0,$nmhi1,$nmhi1 | ||
| 825 | add $ablo,$nmlo1,$nmlo1 | ||
| 826 | addc %r0,$nmhi1,$hi1 | ||
| 827 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 828 | |||
| 829 | add $hi1,$hi0,$hi0 | ||
| 830 | addc %r0,%r0,$hi1 | ||
| 831 | add $ti0,$hi0,$hi0 | ||
| 832 | addc %r0,$hi1,$hi1 | ||
| 833 | stw $hi0,0($tp) | ||
| 834 | stw $hi1,4($tp) | ||
| 835 | |||
| 836 | ldo `$LOCALS+32+4`($fp),$tp | ||
| 837 | sub %r0,%r0,%r0 ; clear borrow | ||
| 838 | ldw -4($tp),$ti0 | ||
| 839 | addl $tp,$arrsz,$tp | ||
| 840 | L\$sub_pa11 | ||
| 841 | ldwx $idx($np),$hi0 | ||
| 842 | subb $ti0,$hi0,$hi1 | ||
| 843 | ldwx $idx($tp),$ti0 | ||
| 844 | addib,<> 4,$idx,L\$sub_pa11 | ||
| 845 | stws,ma $hi1,4($rp) | ||
| 846 | |||
| 847 | subb $ti0,%r0,$hi1 | ||
| 848 | ldo -4($tp),$tp | ||
| 849 | and $tp,$hi1,$ap | ||
| 850 | andcm $rp,$hi1,$bp | ||
| 851 | or $ap,$bp,$np | ||
| 852 | |||
| 853 | sub $rp,$arrsz,$rp ; rewind rp | ||
| 854 | subi 0,$arrsz,$idx | ||
| 855 | ldo `$LOCALS+32`($fp),$tp | ||
| 856 | L\$copy_pa11 | ||
| 857 | ldwx $idx($np),$hi0 | ||
| 858 | stws,ma %r0,4($tp) | ||
| 859 | addib,<> 4,$idx,L\$copy_pa11 | ||
| 860 | stws,ma $hi0,4($rp) | ||
| 861 | |||
| 862 | nop ; alignment | ||
| 863 | L\$done | ||
| 864 | ___ | ||
| 865 | } | ||
| 866 | |||
# Append the shared tail of the generated routine to $code.  The template
# below loads 1 into %r28 (its own comment calls this signalling "handled"),
# recomputes %sp from $fp+$FRAME, reloads %r2 and %r4-%r10 via $POP, places
# the L$abort label in front of the "bv (%r2)" return, reloads %r3 via
# $POPMB, closes the procedure with .PROCEND and finally emits an
# identification string into .data.
# The back-quoted expressions are not evaluated here; the output loop at the
# end of the file substitutes them with s/`...`/eval/ge.
| 867 | $code.=<<___; | ||
| 868 | ldi 1,%r28 ; signal "handled" | ||
| 869 | ldo $FRAME($fp),%sp ; destroy tp[num+1] | ||
| 870 | |||
| 871 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 872 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 873 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 874 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 875 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 876 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 877 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 878 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 879 | L\$abort | ||
| 880 | bv (%r2) | ||
| 881 | .EXIT | ||
| 882 | $POPMB -$FRAME(%sp),%r3 | ||
| 883 | .PROCEND | ||
| 884 | |||
| 885 | .data | ||
| 886 | .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 887 | ___ | ||
| 888 | |||
| 889 | # Explicitly encode PA-RISC 2.0 instructions used in this module, so | ||
| 890 | # that it can be compiled with .LEVEL 1.0. It should be noted that I | ||
| 891 | # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 | ||
| 892 | # directive... | ||
| 893 | |||
# Hand-assemble the PA-RISC 2.0 "ldd" instruction into a raw .WORD so that
# the generated file can be built with .LEVEL 1.0.
#
#   $mod  - completer text that followed the mnemonic ("" or e.g. ",mb")
#   $args - operand text, either "%rX(%rB),%rT" or "disp(%rB),%rT"
#
# Returns one line of assembler text: a .WORD directive carrying the original
# instruction as a trailing comment, or the instruction unchanged when it
# cannot be encoded here.
my $ldd = sub {
    my ($mod,$args) = @_;
    my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {    # format 4
        my $opcode = (0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {   # format 5
        my ($disp,$base,$target) = ($1,$2,$3);

        # The offset field built below holds only 5 bits (the low four bits
        # plus bit 4 of the displacement).  Anything outside -16..15 used to
        # be truncated silently into a different address; hand such operands
        # to the assembler untouched instead.
        if ($disp >= -16 && $disp <= 15) {
            my $opcode = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
            $opcode |= (($disp & 0xF)<<17)|(($disp & 0x10)<<12);   # encode offset
            $opcode |= (1<<5)  if ($mod =~ /^,m/);     # any ",m*" completer
            $opcode |= (1<<13) if ($mod =~ /^,mb/);    # additionally for ",mb"
            return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
        }
    }

    return "\t".$orig;
};
| 911 | |||
# Hand-assemble the PA-RISC 2.0 "std" instruction into a raw .WORD so that
# the generated file can be built with .LEVEL 1.0.
#
#   $mod  - completer text that followed the mnemonic ("" or e.g. ",ma")
#   $args - operand text of the form "%rS,disp(%rB)"
#
# Returns one line of assembler text: a .WORD directive carrying the original
# instruction as a trailing comment, or the instruction unchanged when it
# cannot be encoded here.
my $std = sub {
    my ($mod,$args) = @_;
    my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {   # format 6
        my ($source,$disp,$base) = ($1,$2,$3);

        # The offset field built below holds only 5 bits (the low four bits
        # plus bit 4 of the displacement).  Anything outside -16..15 used to
        # be truncated silently into a different address; hand such operands
        # to the assembler untouched instead.
        if ($disp >= -16 && $disp <= 15) {
            my $opcode = (0x03<<26)|($base<<21)|($source<<16)|(1<<12)|(0xB<<6);
            $opcode |= (($disp & 0xF)<<1)|(($disp & 0x10)>>4);     # encode offset
            $opcode |= (1<<5)  if ($mod =~ /^,m/);     # any ",m*" completer
            $opcode |= (1<<13) if ($mod =~ /^,mb/);    # additionally for ",mb"
            return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
        }
    }

    return "\t".$orig;
};
| 925 | |||
# Hand-assemble the PA-RISC 2.0 "extrd" instruction into a raw .WORD.
#
#   $mod  - completer text that followed the mnemonic (e.g. ",u" or ",u,*=")
#   $args - operand text, "%rS,pos,len,%rT" or "%rS,%sar,len,%rT"
#
# Returns a .WORD line with the original instruction as trailing comment, or
# the instruction unchanged when the operands match neither form.
my $extrd = sub {
    my ($mod,$args) = @_;
    my $orig = "extrd$mod\t$args";

    # Only the ",u" completer is ever used; it is implied by the encoding.
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {   # format 15
        my ($src,$pos,$width,$dst) = ($1,$2,$3,$4);
        my $len  = 32 - $width;
        my $word = (0x36<<26) | ($src<<21) | ($dst<<16);

        $word |= (($pos & 0x20)<<6) | (($pos & 0x1f)<<5);       # encode pos
        $word |= (($len & 0x20)<<7) | ($len & 0x1f);            # encode len
        return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
    }

    if ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {       # format 12
        my ($src,$width,$dst) = ($1,$2,$3);
        my $len  = 32 - $width;
        my $word = (0x34<<26) | ($src<<21) | ($dst<<16) | (2<<11) | (1<<9);

        $word |= (($len & 0x20)<<3) | ($len & 0x1f);            # encode len
        if ($mod =~ /,\**=/) {
            $word |= (1<<13);
        }
        return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
    }

    return "\t".$orig;
};
| 947 | |||
# Hand-assemble the PA-RISC 2.0 "shrpd" instruction into a raw .WORD.
#
#   $mod  - completer text that followed the mnemonic (only echoed back)
#   $args - operand text of the form "%rA,%rB,sa,%rT"
#
# Returns a .WORD line with the original instruction as trailing comment, or
# the instruction unchanged when the operands do not match that form.
my $shrpd = sub {
    my ($mod,$args) = @_;
    my $orig = "shrpd$mod\t$args";

    unless ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {   # format 14
        return "\t".$orig;
    }

    my ($first,$second,$shift,$dst) = ($1,$2,$3,$4);
    my $cpos = 63 - $shift;
    my $word = (0x34<<26) | ($second<<21) | ($first<<16) | (1<<10) | $dst;

    $word |= (($cpos & 0x20)<<6) | (($cpos & 0x1f)<<5);               # encode sa
    return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
};
| 960 | |||
# Hand-assemble "sub,db" with three register operands into a raw .WORD.
# Every other form of "sub" is passed through as text.
#
#   $mod  - completer text that followed the mnemonic; must be exactly ",db"
#           for the instruction to be encoded
#   $args - operand text of the form "%rA,%rB,%rT"
#
# Returns one line of assembler text.
my $sub = sub {
    my ($mod,$args) = @_;
    my $orig = "sub$mod\t$args";

    return "\t".$orig
        unless $mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/;

    my $word = (0x02<<26) | ($2<<21) | ($1<<16) | $3;
    $word |= (1<<10) | (1<<8) | (1<<5);         # e1, e2 and d bits
    return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
};
| 974 | |||
# Route one parsed instruction to its hand-written encoder, if there is one.
#
#   $mnemonic - instruction name
#   $mod      - completer text glued to the mnemonic
#   $args     - operand text
#
# A variable named after the mnemonic is looked up by name; when it holds a
# code reference that encoder is called with ($mod,$args), otherwise the
# instruction is rebuilt as plain text.
sub assemble {
    my ($mnemonic,$mod,$args) = @_;
    my $encoder = eval("\$$mnemonic");

    if (ref($encoder) eq 'CODE') {
        return $encoder->($mod,$args);
    }
    return "\t$mnemonic$mod\t$args";
}
| 981 | |||
# Post-process the accumulated template one line at a time and print it.
foreach (split("\n",$code)) {
    # evaluate the Perl expressions embedded between back-quotes
    s/\`([^\`]*)\`/eval $1/ge;
    # flip word order in 64-bit mode...
    s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
    # assemble 2.0 instructions in 32-bit mode...
    s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);

    # rewrite "bv" as "bve" when $SIZE_T is 8
    s/\bbv\b/bve/gm if ($SIZE_T==8);

    print $_,"\n";
}
# Output is buffered, so a failed write (full disk, closed pipe) may only be
# reported here.  Die instead of exiting successfully with a truncated file.
close STDOUT or die "error closing STDOUT: $!";
