diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/ia64-mont.pl')
| -rw-r--r-- | src/lib/libcrypto/bn/asm/ia64-mont.pl | 851 |
1 files changed, 0 insertions, 851 deletions
diff --git a/src/lib/libcrypto/bn/asm/ia64-mont.pl b/src/lib/libcrypto/bn/asm/ia64-mont.pl deleted file mode 100644 index e258658428..0000000000 --- a/src/lib/libcrypto/bn/asm/ia64-mont.pl +++ /dev/null | |||
| @@ -1,851 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # January 2010 | ||
| 11 | # | ||
| 12 | # "Teaser" Montgomery multiplication module for IA-64. There are | ||
| 13 | # several possibilities for improvement: | ||
| 14 | # | ||
| 15 | # - modulo-scheduling outer loop would eliminate quite a number of | ||
| 16 | # stalls after ldf8, xma and getf.sig outside inner loop and | ||
| 17 | # improve shorter key performance; | ||
| 18 | # - shorter vector support [with input vectors being fetched only | ||
| 19 | # once] should be added; | ||
| 20 | # - 2x unroll with help of n0[1] would make the code scalable on | ||
| 21 | # "wider" IA-64, "wider" than Itanium 2 that is, which is not of | ||
| 22 | # acute interest, because upcoming Tukwila's individual cores are | ||
| 23 | # reportedly based on Itanium 2 design; | ||
| 24 | # - dedicated squaring procedure(?); | ||
| 25 | # | ||
| 26 | # January 2010 | ||
| 27 | # | ||
| 28 | # Shorter vector support is implemented by zero-padding ap and np | ||
| 29 | # vectors up to 8 elements, or 512 bits. This means that 256-bit | ||
| 30 | # inputs will be processed only 2 times faster than 512-bit inputs, | ||
| 31 | # not 4 [as one would expect, because algorithm complexity is n^2]. | ||
| 32 | # The reason for padding is that inputs shorter than 512 bits won't | ||
| 33 | # be processed faster anyway, because minimal critical path of the | ||
| 34 | # core loop happens to match 512-bit timing. Either way, it resulted | ||
| 35 | # in a >100% improvement of the 512-bit RSA sign benchmark and a 50% | |
| 36 | # improvement of the 1024-bit one [compared to the original version of *this* module]. | |
| 37 | # | ||
| 38 | # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with* | ||
| 39 | # this module is: | ||
| 40 | # sign verify sign/s verify/s | ||
| 41 | # rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4 | ||
| 42 | # rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0 | ||
| 43 | # rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0 | ||
| 44 | # rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6 | ||
| 45 | # dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0 | ||
| 46 | # dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4 | ||
| 47 | # dsa 2048 bits 0.001453s 0.001703s 688.1 587.4 | ||
| 48 | # | ||
| 49 | # ... and *without* (but still with ia64.S): | ||
| 50 | # | ||
| 51 | # rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5 | ||
| 52 | # rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3 | ||
| 53 | # rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9 | ||
| 54 | # rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9 | ||
| 55 | # dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6 | ||
| 56 | # dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2 | ||
| 57 | # dsa 2048 bits 0.001894s 0.002179s 528.1 458.9 | ||
| 58 | # | ||
| 59 | # As can be seen, RSA sign performance improves by 130-30% (the gain | |
| 60 | # shrinks for longer keys), while verify improves by 74-13%. | |
| 61 | # DSA performance improves by 115-30%. | |
| 62 | |||
| 63 | if ($^O eq "hpux") { # HP-UX: the pointer-add mnemonic depends on the flags in @ARGV | |
| 64 | $ADDP="addp4"; # default to addp4 for pointer arithmetic | |
| 65 | for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); } # "+DD64" or "-mlp64" given: plain add (alternation, not a character class) | |
| 66 | } else { $ADDP="add"; } # every other OS uses plain add | |
| 67 | |||
| 68 | $code=<<___; | ||
| 69 | .explicit | ||
| 70 | .text | ||
| 71 | |||
| 72 | // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap, | |
| 73 | // const BN_ULONG *bp,const BN_ULONG *np, | |
| 74 | // const BN_ULONG *n0p,int num); | |
| 75 | .align 64 | |
| 76 | .global bn_mul_mont# | |
| 77 | .proc bn_mul_mont# | |
| 78 | bn_mul_mont: // dispatcher: picks an implementation from num, or returns 0 | |
| 79 | .prologue | |
| 80 | .body | |
| 81 | { .mmi; cmp4.le p6,p7=2,r37;; // p6 = (num >= 2), p7 = (num < 2); r37 = num (sixth argument per the prototype above) | |
| 82 | (p6) cmp4.lt.unc p8,p9=8,r37 // under p6: p8 = (num > 8), p9 = (num <= 8); .unc clears both when p6 is false | |
| 83 | mov ret0=r0 };; // preset return value 0 (the worker procedures return 1 to signal "handled") | |
| 84 | { .bbb; | |
| 85 | (p9) br.cond.dptk.many bn_mul_mont_8 // 2 <= num <= 8 | |
| 86 | (p8) br.cond.dpnt.many bn_mul_mont_general // num > 8 | |
| 87 | (p7) br.ret.spnt.many b0 };; // num < 2: return with ret0 = 0 | |
| 88 | .endp bn_mul_mont# | |
| 89 | |||
| 90 | prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11; | ||
| 91 | |||
| 92 | rptr=r8; aptr=r9; bptr=r14; nptr=r15; | ||
| 93 | tptr=r16; // &tp[0] | ||
| 94 | tp_1=r17; // &tp[-1] | ||
| 95 | num=r18; len=r19; lc=r20; | ||
| 96 | topbit=r21; // carry bit from tmp[num] | ||
| 97 | |||
| 98 | n0=f6; | ||
| 99 | m0=f7; | ||
| 100 | bi=f8; | ||
| 101 | |||
| 102 | .align 64 | |
| 103 | .local bn_mul_mont_general# | |
| 104 | .proc bn_mul_mont_general# | |
| 105 | bn_mul_mont_general: // generic path, branched to from bn_mul_mont when num > 8 | |
| 106 | .prologue | |
| 107 | { .mmi; .save ar.pfs,prevfs | |
| 108 | alloc prevfs=ar.pfs,6,2,0,8 // 6 inputs, 2 locals, 0 outputs, 8 rotating registers | |
| 109 | $ADDP aptr=0,in1 // aptr = ap ($ADDP expands to add or addp4, chosen by the Perl prelude) | |
| 110 | .save ar.lc,prevlc | |
| 111 | mov prevlc=ar.lc } // keep ar.lc; restored in the epilogue | |
| 112 | { .mmi; .vframe prevsp | |
| 113 | mov prevsp=sp // keep the incoming sp; sp is lowered later for the scratch vector | |
| 114 | $ADDP bptr=0,in2 // bptr = bp | |
| 115 | .save pr,prevpr | |
| 116 | mov prevpr=pr };; // keep predicates; restored in the epilogue | |
| 117 | |||
| 118 | .body | ||
| 119 | .rotf alo[6],nlo[4],ahi[8],nhi[6] | ||
| 120 | .rotr a[3],n[3],t[2] | ||
| 121 | |||
| 122 | { .mmi; ldf8 bi=[bptr],8 // (*bp++) | ||
| 123 | ldf8 alo[4]=[aptr],16 // ap[0] | ||
| 124 | $ADDP r30=8,in1 };; | ||
| 125 | { .mmi; ldf8 alo[3]=[r30],16 // ap[1] | ||
| 126 | ldf8 alo[2]=[aptr],16 // ap[2] | ||
| 127 | $ADDP in4=0,in4 };; | ||
| 128 | { .mmi; ldf8 alo[1]=[r30] // ap[3] | ||
| 129 | ldf8 n0=[in4] // n0 | ||
| 130 | $ADDP rptr=0,in0 } | ||
| 131 | { .mmi; $ADDP nptr=0,in3 | ||
| 132 | mov r31=16 | ||
| 133 | zxt4 num=in5 };; | ||
| 134 | { .mmi; ldf8 nlo[2]=[nptr],8 // np[0] | ||
| 135 | shladd len=num,3,r0 | ||
| 136 | shladd r31=num,3,r31 };; | ||
| 137 | { .mmi; ldf8 nlo[1]=[nptr],8 // np[1] | ||
| 138 | add lc=-5,num | ||
| 139 | sub r31=sp,r31 };; | ||
| 140 | { .mfb; and sp=-16,r31 // alloca | ||
| 141 | xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0] | ||
| 142 | nop.b 0 } | ||
| 143 | { .mfb; nop.m 0 | ||
| 144 | xmpy.lu alo[4]=alo[4],bi | ||
| 145 | brp.loop.imp .L1st_ctop,.L1st_cend-16 | ||
| 146 | };; | ||
| 147 | { .mfi; nop.m 0 | ||
| 148 | xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0] | ||
| 149 | add tp_1=8,sp } | ||
| 150 | { .mfi; nop.m 0 | ||
| 151 | xma.lu alo[3]=alo[3],bi,ahi[2] | ||
| 152 | mov pr.rot=0x20001f<<16 | ||
| 153 | // ------^----- (p40) at first (p23) | ||
| 154 | // ----------^^ p[16:20]=1 | ||
| 155 | };; | ||
| 156 | { .mfi; nop.m 0 | ||
| 157 | xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0 | ||
| 158 | mov ar.lc=lc } | ||
| 159 | { .mfi; nop.m 0 | ||
| 160 | fcvt.fxu.s1 nhi[1]=f0 | ||
| 161 | mov ar.ec=8 };; | ||
| 162 | |||
| 163 | .align 32 | ||
| 164 | .L1st_ctop: | ||
| 165 | .pred.rel "mutex",p40,p42 | ||
| 166 | { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) | ||
| 167 | (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] | ||
| 168 | (p40) add n[2]=n[2],a[2] } // (p23) } | ||
| 169 | { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16) | ||
| 170 | (p18) xma.lu alo[2]=alo[2],bi,ahi[1] | ||
| 171 | (p42) add n[2]=n[2],a[2],1 };; // (p23) | ||
| 172 | { .mfi; (p21) getf.sig a[0]=alo[5] | ||
| 173 | (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] | ||
| 174 | (p42) cmp.leu p41,p39=n[2],a[2] } // (p23) | ||
| 175 | { .mfi; (p23) st8 [tp_1]=n[2],8 | ||
| 176 | (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] | ||
| 177 | (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) | ||
| 178 | { .mmb; (p21) getf.sig n[0]=nlo[3] | ||
| 179 | (p16) nop.m 0 | ||
| 180 | br.ctop.sptk .L1st_ctop };; | ||
| 181 | .L1st_cend: | ||
| 182 | |||
| 183 | { .mmi; getf.sig a[0]=ahi[6] // (p24) | ||
| 184 | getf.sig n[0]=nhi[4] | ||
| 185 | add num=-1,num };; // num-- | ||
| 186 | { .mmi; .pred.rel "mutex",p40,p42 | ||
| 187 | (p40) add n[0]=n[0],a[0] | ||
| 188 | (p42) add n[0]=n[0],a[0],1 | ||
| 189 | sub aptr=aptr,len };; // rewind | ||
| 190 | { .mmi; .pred.rel "mutex",p40,p42 | ||
| 191 | (p40) cmp.ltu p41,p39=n[0],a[0] | ||
| 192 | (p42) cmp.leu p41,p39=n[0],a[0] | ||
| 193 | sub nptr=nptr,len };; | ||
| 194 | { .mmi; .pred.rel "mutex",p39,p41 | ||
| 195 | (p39) add topbit=r0,r0 | ||
| 196 | (p41) add topbit=r0,r0,1 | ||
| 197 | nop.i 0 } | ||
| 198 | { .mmi; st8 [tp_1]=n[0] | ||
| 199 | add tptr=16,sp | ||
| 200 | add tp_1=8,sp };; | ||
| 201 | |||
| 202 | .Louter: | ||
| 203 | { .mmi; ldf8 bi=[bptr],8 // (*bp++) | ||
| 204 | ldf8 ahi[3]=[tptr] // tp[0] | ||
| 205 | add r30=8,aptr };; | ||
| 206 | { .mmi; ldf8 alo[4]=[aptr],16 // ap[0] | ||
| 207 | ldf8 alo[3]=[r30],16 // ap[1] | ||
| 208 | add r31=8,nptr };; | ||
| 209 | { .mfb; ldf8 alo[2]=[aptr],16 // ap[2] | ||
| 210 | xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0] | ||
| 211 | brp.loop.imp .Linner_ctop,.Linner_cend-16 | ||
| 212 | } | ||
| 213 | { .mfb; ldf8 alo[1]=[r30] // ap[3] | ||
| 214 | xma.lu alo[4]=alo[4],bi,ahi[3] | ||
| 215 | clrrrb.pr };; | ||
| 216 | { .mfi; ldf8 nlo[2]=[nptr],16 // np[0] | ||
| 217 | xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i] | ||
| 218 | nop.i 0 } | ||
| 219 | { .mfi; ldf8 nlo[1]=[r31] // np[1] | ||
| 220 | xma.lu alo[3]=alo[3],bi,ahi[2] | ||
| 221 | mov pr.rot=0x20101f<<16 | ||
| 222 | // ------^----- (p40) at first (p23) | ||
| 223 | // --------^--- (p30) at first (p22) | ||
| 224 | // ----------^^ p[16:20]=1 | ||
| 225 | };; | ||
| 226 | { .mfi; st8 [tptr]=r0 // tp[0] is already accounted | ||
| 227 | xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0 | ||
| 228 | mov ar.lc=lc } | ||
| 229 | { .mfi; | ||
| 230 | fcvt.fxu.s1 nhi[1]=f0 | ||
| 231 | mov ar.ec=8 };; | ||
| 232 | |||
| 233 | // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in | ||
| 234 | // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7 | ||
| 235 | // in latter case accounts for two-tick pipeline stall, which means | ||
| 236 | // that its performance would be ~20% lower than optimal one. No | ||
| 237 | // attempt was made to address this, because original Itanium is | ||
| 238 | // hardly represented out in the wild... | ||
| 239 | .align 32 | ||
| 240 | .Linner_ctop: | ||
| 241 | .pred.rel "mutex",p40,p42 | ||
| 242 | .pred.rel "mutex",p30,p32 | ||
| 243 | { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) | ||
| 244 | (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] | ||
| 245 | (p40) add n[2]=n[2],a[2] } // (p23) | ||
| 246 | { .mfi; (p16) nop.m 0 | ||
| 247 | (p18) xma.lu alo[2]=alo[2],bi,ahi[1] | ||
| 248 | (p42) add n[2]=n[2],a[2],1 };; // (p23) | ||
| 249 | { .mfi; (p21) getf.sig a[0]=alo[5] | ||
| 250 | (p16) nop.f 0 | ||
| 251 | (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) | ||
| 252 | { .mfi; (p21) ld8 t[0]=[tptr],8 | ||
| 253 | (p16) nop.f 0 | ||
| 254 | (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23) | ||
| 255 | { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++) | ||
| 256 | (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] | ||
| 257 | (p30) add a[1]=a[1],t[1] } // (p22) | ||
| 258 | { .mfi; (p16) nop.m 0 | ||
| 259 | (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] | ||
| 260 | (p32) add a[1]=a[1],t[1],1 };; // (p22) | ||
| 261 | { .mmi; (p21) getf.sig n[0]=nlo[3] | ||
| 262 | (p16) nop.m 0 | ||
| 263 | (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22) | ||
| 264 | { .mmb; (p23) st8 [tp_1]=n[2],8 | ||
| 265 | (p32) cmp.leu p31,p29=a[1],t[1] // (p22) | ||
| 266 | br.ctop.sptk .Linner_ctop };; | ||
| 267 | .Linner_cend: | ||
| 268 | |||
| 269 | { .mmi; getf.sig a[0]=ahi[6] // (p24) | |
| 270 | getf.sig n[0]=nhi[4] // move the final ahi/nhi values from FP to general registers | |
| 271 | nop.i 0 };; | |
| 272 | |||
| 273 | { .mmi; .pred.rel "mutex",p31,p33 | |
| 274 | (p31) add a[0]=a[0],topbit // a[0] += topbit | |
| 275 | (p33) add a[0]=a[0],topbit,1 // a[0] += topbit + 1 | |
| 276 | mov topbit=r0 };; // clear topbit; the adds in this group still read the old value | |
| 277 | { .mfi; .pred.rel "mutex",p31,p33 | |
| 278 | (p31) cmp.ltu p32,p30=a[0],topbit // NOTE(review): topbit was zeroed in the previous group, so this compares a[0] against 0 - confirm intended | |
| 279 | (p33) cmp.leu p32,p30=a[0],topbit // p32 set here only when a[0] == 0 (same caveat) | |
| 280 | } | |
| 281 | { .mfi; .pred.rel "mutex",p40,p42 | |
| 282 | (p40) add n[0]=n[0],a[0] // n[0] += a[0] | |
| 283 | (p42) add n[0]=n[0],a[0],1 // n[0] += a[0] + 1 | |
| 284 | };; | |
| 285 | { .mmi; .pred.rel "mutex",p44,p46 // NOTE(review): names p44/p46 but the compares below are predicated on p40/p42 - confirm | |
| 286 | (p40) cmp.ltu p41,p39=n[0],a[0] // p41 = carry out of n[0]+a[0] | |
| 287 | (p42) cmp.leu p41,p39=n[0],a[0] | |
| 288 | (p32) add topbit=r0,r0,1 } // topbit = 1 if the a[0] update carried | |
| 289 | |||
| 290 | { .mmi; st8 [tp_1]=n[0],8 // store the last word of this pass | |
| 291 | cmp4.ne p6,p0=1,num // p6 = (num != 1): another outer iteration is needed | |
| 292 | sub aptr=aptr,len };; // rewind | |
| 293 | { .mmi; sub nptr=nptr,len // rewind | |
| 294 | (p41) add topbit=r0,r0,1 // topbit = 1 if n[0]+a[0] carried | |
| 295 | add tptr=16,sp } // tptr = &tp[0] | |
| 296 | { .mmb; add tp_1=8,sp // tp_1 = &tp[-1] | |
| 297 | add num=-1,num // num-- | |
| 298 | (p6) br.cond.sptk.many .Louter };; | |
| 299 | |||
| 300 | { .mbb; add lc=4,lc // lc was set to num-5 for the multiply loops; now num-1 | |
| 301 | brp.loop.imp .Lsub_ctop,.Lsub_cend-16 | |
| 302 | clrrrb.pr };; // reset the predicate rotation base | |
| 303 | { .mii; nop.m 0 | |
| 304 | mov pr.rot=0x10001<<16 // bits 16 and 32: p16 and p32 set, other rotating predicates clear | |
| 305 | // ------^---- (p33) at first (p17) | |
| 306 | mov ar.lc=lc } | |
| 307 | { .mii; nop.m 0 | |
| 308 | mov ar.ec=3 // three stages: load (p16), subtract (p17), store (p18) | |
| 309 | nop.i 0 };; | |
| 310 | |||
| 311 | .Lsub_ctop: // rp[] = tp[] - np[] with borrow, one word per iteration | |
| 312 | .pred.rel "mutex",p33,p35 | |
| 313 | { .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++) | |
| 314 | (p16) nop.f 0 | |
| 315 | (p33) sub n[1]=t[1],n[1] } // (p17) t-n | |
| 316 | { .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++) | |
| 317 | (p16) nop.f 0 | |
| 318 | (p35) sub n[1]=t[1],n[1],1 };; // (p17) t-n-1 | |
| 319 | { .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r | |
| 320 | (p33) cmp.gtu p34,p32=n[1],t[1] // (p17) p34 = difference > t | |
| 321 | (p18) nop.b 0 } | |
| 322 | { .mib; (p18) nop.m 0 | |
| 323 | (p35) cmp.geu p34,p32=n[1],t[1] // (p17) p34 = difference >= t | |
| 324 | br.ctop.sptk .Lsub_ctop };; | |
| 325 | .Lsub_cend: | |
| 326 | |||
| 327 | { .mmb; .pred.rel "mutex",p34,p36 | |
| 328 | (p34) sub topbit=topbit,r0 // (p19) topbit unchanged | |
| 329 | (p36) sub topbit=topbit,r0,1 // topbit -= 1; presumably topbit is now 0 or all-ones - TODO confirm | |
| 330 | brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16 | |
| 331 | } | |
| 332 | { .mmb; sub rptr=rptr,len // rewind | |
| 333 | sub tptr=tptr,len // rewind | |
| 334 | clrrrb.pr };; | |
| 335 | { .mmi; and aptr=tptr,topbit // aptr = tptr & topbit | |
| 336 | andcm bptr=rptr,topbit // bptr = rptr & ~topbit | |
| 337 | mov pr.rot=1<<16 };; // only p16 set | |
| 338 | { .mii; or nptr=aptr,bptr // nptr = tptr if topbit is all-ones, rptr if it is 0 (select without a branch) | |
| 339 | mov ar.lc=lc | |
| 340 | mov ar.ec=3 };; | |
| 341 | |||
| 342 | .Lcopy_ctop: | |
| 343 | { .mmb; (p16) ld8 n[0]=[nptr],8 // read from the selected source | |
| 344 | (p18) st8 [tptr]=r0,8 // overwrite the tp[] word with zero | |
| 345 | (p16) nop.b 0 } | |
| 346 | { .mmb; (p16) nop.m 0 | |
| 347 | (p18) st8 [rptr]=n[2],8 // write the selected word to rp[] | |
| 348 | br.ctop.sptk .Lcopy_ctop };; | |
| 349 | .Lcopy_cend: | |
| 350 | |||
| 351 | { .mmi; mov ret0=1 // signal "handled" | |
| 352 | rum 1<<5 // clear um.mfh | |
| 353 | mov ar.lc=prevlc } // restore the loop counter saved in the prologue | |
| 354 | { .mib; .restore sp | |
| 355 | mov sp=prevsp // release the stack scratch area | |
| 356 | mov pr=prevpr,0x1ffff // restore the predicates saved in the prologue | |
| 357 | br.ret.sptk.many b0 };; | |
| 358 | .endp bn_mul_mont_general# | |
| 359 | |||
| 360 | a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23; | ||
| 361 | n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31; | ||
| 362 | t0=r15; | ||
| 363 | |||
| 364 | ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15; | ||
| 365 | ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23; | ||
| 366 | |||
| 367 | .align 64 | |
| 368 | .skip 48 // aligns loop body | |
| 369 | .local bn_mul_mont_8# | |
| 370 | .proc bn_mul_mont_8# | |
| 371 | bn_mul_mont_8: // path for 2 <= num <= 8, branched to from bn_mul_mont | |
| 372 | .prologue | |
| 373 | { .mmi; .save ar.pfs,prevfs | |
| 374 | alloc prevfs=ar.pfs,6,2,0,8 // 6 inputs, 2 locals, 0 outputs, 8 rotating registers | |
| 375 | .vframe prevsp | |
| 376 | mov prevsp=sp // keep the incoming sp | |
| 377 | .save ar.lc,prevlc | |
| 378 | mov prevlc=ar.lc } // keep ar.lc; restored at .Ldone | |
| 379 | { .mmi; add r17=-6*16,sp // r17 = prevsp-6*16: slot for f17 | |
| 380 | add sp=-7*16,sp // sp = prevsp-7*16: slot for f16 | |
| 381 | .save pr,prevpr | |
| 382 | mov prevpr=pr };; // keep predicates; restored at .Ldone | |
| 383 | |||
| 384 | { .mmi; .save.gf 0,0x10 | |
| 385 | stf.spill [sp]=f16,-16 // f16 -> [prevsp-7*16]; sp ends at prevsp-8*16 | |
| 386 | .save.gf 0,0x20 | |
| 387 | stf.spill [r17]=f17,32 // f17 -> [prevsp-6*16] | |
| 388 | add r16=-5*16,prevsp};; // r16 = prevsp-5*16: slot for f18 | |
| 389 | { .mmi; .save.gf 0,0x40 | |
| 390 | stf.spill [r16]=f18,32 // f18 -> [prevsp-5*16] | |
| 391 | .save.gf 0,0x80 | |
| 392 | stf.spill [r17]=f19,32 // f19 -> [prevsp-4*16] | |
| 393 | $ADDP aptr=0,in1 };; // aptr = ap | |
| 394 | { .mmi; .save.gf 0,0x100 | |
| 395 | stf.spill [r16]=f20,32 // f20 -> [prevsp-3*16] | |
| 396 | .save.gf 0,0x200 | |
| 397 | stf.spill [r17]=f21,32 // f21 -> [prevsp-2*16] | |
| 398 | $ADDP r29=8,in1 };; // r29 = &ap[1] | |
| 399 | { .mmi; .save.gf 0,0x400 | |
| 400 | stf.spill [r16]=f22 // f22 -> [prevsp-1*16] | |
| 401 | .save.gf 0,0x800 | |
| 402 | stf.spill [r17]=f23 // f23 -> [prevsp] | |
| 403 | $ADDP rptr=0,in0 };; // rptr = rp | |
| 404 | |||
| 405 | .body | ||
| 406 | .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10] | ||
| 407 | .rotr t[8] | ||
| 408 | |||
| 409 | // load input vectors padding them to 8 elements | ||
| 410 | { .mmi; ldf8 ai0=[aptr],16 // ap[0] | ||
| 411 | ldf8 ai1=[r29],16 // ap[1] | ||
| 412 | $ADDP bptr=0,in2 } | ||
| 413 | { .mmi; $ADDP r30=8,in2 | ||
| 414 | $ADDP nptr=0,in3 | ||
| 415 | $ADDP r31=8,in3 };; | ||
| 416 | { .mmi; ldf8 bj[7]=[bptr],16 // bp[0] | ||
| 417 | ldf8 bj[6]=[r30],16 // bp[1] | ||
| 418 | cmp4.le p4,p5=3,in5 } | ||
| 419 | { .mmi; ldf8 ni0=[nptr],16 // np[0] | ||
| 420 | ldf8 ni1=[r31],16 // np[1] | ||
| 421 | cmp4.le p6,p7=4,in5 };; | ||
| 422 | |||
| 423 | { .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2] | ||
| 424 | (p5)fcvt.fxu ai2=f0 | ||
| 425 | cmp4.le p8,p9=5,in5 } | ||
| 426 | { .mfi; (p6)ldf8 ai3=[r29],16 // ap[3] | ||
| 427 | (p7)fcvt.fxu ai3=f0 | ||
| 428 | cmp4.le p10,p11=6,in5 } | ||
| 429 | { .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2] | ||
| 430 | (p5)fcvt.fxu bj[5]=f0 | ||
| 431 | cmp4.le p12,p13=7,in5 } | ||
| 432 | { .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3] | ||
| 433 | (p7)fcvt.fxu bj[4]=f0 | ||
| 434 | cmp4.le p14,p15=8,in5 } | ||
| 435 | { .mfi; (p4)ldf8 ni2=[nptr],16 // np[2] | ||
| 436 | (p5)fcvt.fxu ni2=f0 | ||
| 437 | addp4 r28=-1,in5 } | ||
| 438 | { .mfi; (p6)ldf8 ni3=[r31],16 // np[3] | ||
| 439 | (p7)fcvt.fxu ni3=f0 | ||
| 440 | $ADDP in4=0,in4 };; | ||
| 441 | |||
| 442 | { .mfi; ldf8 n0=[in4] | ||
| 443 | fcvt.fxu tf[1]=f0 | ||
| 444 | nop.i 0 } | ||
| 445 | |||
| 446 | { .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4] | ||
| 447 | (p9)fcvt.fxu ai4=f0 | ||
| 448 | mov t[0]=r0 } | ||
| 449 | { .mfi; (p10)ldf8 ai5=[r29],16 // ap[5] | ||
| 450 | (p11)fcvt.fxu ai5=f0 | ||
| 451 | mov t[1]=r0 } | ||
| 452 | { .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4] | ||
| 453 | (p9)fcvt.fxu bj[3]=f0 | ||
| 454 | mov t[2]=r0 } | ||
| 455 | { .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5] | ||
| 456 | (p11)fcvt.fxu bj[2]=f0 | ||
| 457 | mov t[3]=r0 } | ||
| 458 | { .mfi; (p8)ldf8 ni4=[nptr],16 // np[4] | ||
| 459 | (p9)fcvt.fxu ni4=f0 | ||
| 460 | mov t[4]=r0 } | ||
| 461 | { .mfi; (p10)ldf8 ni5=[r31],16 // np[5] | ||
| 462 | (p11)fcvt.fxu ni5=f0 | ||
| 463 | mov t[5]=r0 };; | ||
| 464 | |||
| 465 | { .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6] | ||
| 466 | (p13)fcvt.fxu ai6=f0 | ||
| 467 | mov t[6]=r0 } | ||
| 468 | { .mfi; (p14)ldf8 ai7=[r29],16 // ap[7] | ||
| 469 | (p15)fcvt.fxu ai7=f0 | ||
| 470 | mov t[7]=r0 } | ||
| 471 | { .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6] | ||
| 472 | (p13)fcvt.fxu bj[1]=f0 | ||
| 473 | mov ar.lc=r28 } | ||
| 474 | { .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7] | ||
| 475 | (p15)fcvt.fxu bj[0]=f0 | ||
| 476 | mov ar.ec=1 } | ||
| 477 | { .mfi; (p12)ldf8 ni6=[nptr],16 // np[6] | ||
| 478 | (p13)fcvt.fxu ni6=f0 | ||
| 479 | mov pr.rot=1<<16 } | ||
| 480 | { .mfb; (p14)ldf8 ni7=[r31],16 // np[7] | ||
| 481 | (p15)fcvt.fxu ni7=f0 | ||
| 482 | brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16 | ||
| 483 | };; | ||
| 484 | |||
| 485 | // The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt | ||
| 486 | // to measure with help of Interval Time Counter indicated that the | ||
| 487 | // factor is a tad higher: 33 or 34, if not 35. Exact measurement and | ||
| 488 | // addressing the issue is problematic, because I don't have access | ||
| 489 | // to platform-specific instruction-level profiler. On Itanium it | ||
| 490 | // should run in 56*n ticks, because of higher xma latency... | ||
| 491 | .Louter_8_ctop: | ||
| 492 | .pred.rel "mutex",p40,p42 | ||
| 493 | .pred.rel "mutex",p48,p50 | ||
| 494 | { .mfi; (p16) nop.m 0 // 0: | ||
| 495 | (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0] | ||
| 496 | (p40) add a3=a3,n3 } // (p17) a3+=n3 | ||
| 497 | { .mfi; (p42) add a3=a3,n3,1 | ||
| 498 | (p16) xma.lu alo[0]=ai0,bj[7],tf[1] | ||
| 499 | (p16) nop.i 0 };; | ||
| 500 | { .mii; (p17) getf.sig a7=alo[8] // 1: | ||
| 501 | (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 | ||
| 502 | (p50) add t[6]=t[6],a3,1 };; | ||
| 503 | { .mfi; (p17) getf.sig a8=ahi[8] // 2: | ||
| 504 | (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 | ||
| 505 | (p40) cmp.ltu p43,p41=a3,n3 } | ||
| 506 | { .mfi; (p42) cmp.leu p43,p41=a3,n3 | ||
| 507 | (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] | ||
| 508 | (p16) nop.i 0 };; | ||
| 509 | { .mii; (p17) getf.sig n5=nlo[6] // 3: | ||
| 510 | (p48) cmp.ltu p51,p49=t[6],a3 | ||
| 511 | (p50) cmp.leu p51,p49=t[6],a3 };; | ||
| 512 | .pred.rel "mutex",p41,p43 | ||
| 513 | .pred.rel "mutex",p49,p51 | ||
| 514 | { .mfi; (p16) nop.m 0 // 4: | ||
| 515 | (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i] | ||
| 516 | (p41) add a4=a4,n4 } // (p17) a4+=n4 | ||
| 517 | { .mfi; (p43) add a4=a4,n4,1 | ||
| 518 | (p16) xma.lu alo[1]=ai1,bj[7],ahi[0] | ||
| 519 | (p16) nop.i 0 };; | ||
| 520 | { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 | ||
| 521 | (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0 | ||
| 522 | (p51) add t[5]=t[5],a4,1 };; | ||
| 523 | { .mfi; (p16) nop.m 0 // 6: | ||
| 524 | (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 | ||
| 525 | (p41) cmp.ltu p42,p40=a4,n4 } | ||
| 526 | { .mfi; (p43) cmp.leu p42,p40=a4,n4 | ||
| 527 | (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] | ||
| 528 | (p16) nop.i 0 };; | ||
| 529 | { .mii; (p17) getf.sig n6=nlo[7] // 7: | ||
| 530 | (p49) cmp.ltu p50,p48=t[5],a4 | ||
| 531 | (p51) cmp.leu p50,p48=t[5],a4 };; | ||
| 532 | .pred.rel "mutex",p40,p42 | ||
| 533 | .pred.rel "mutex",p48,p50 | ||
| 534 | { .mfi; (p16) nop.m 0 // 8: | ||
| 535 | (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i] | ||
| 536 | (p40) add a5=a5,n5 } // (p17) a5+=n5 | ||
| 537 | { .mfi; (p42) add a5=a5,n5,1 | ||
| 538 | (p16) xma.lu alo[2]=ai2,bj[7],ahi[1] | ||
| 539 | (p16) nop.i 0 };; | ||
| 540 | { .mii; (p16) getf.sig a1=alo[1] // 9: | ||
| 541 | (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 | ||
| 542 | (p50) add t[4]=t[4],a5,1 };; | ||
| 543 | { .mfi; (p16) nop.m 0 // 10: | ||
| 544 | (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0 | ||
| 545 | (p40) cmp.ltu p43,p41=a5,n5 } | ||
| 546 | { .mfi; (p42) cmp.leu p43,p41=a5,n5 | ||
| 547 | (p16) xma.lu nlo[0]=ni0,mj[0],alo[0] | ||
| 548 | (p16) nop.i 0 };; | ||
| 549 | { .mii; (p17) getf.sig n7=nlo[8] // 11: | ||
| 550 | (p48) cmp.ltu p51,p49=t[4],a5 | ||
| 551 | (p50) cmp.leu p51,p49=t[4],a5 };; | ||
| 552 | .pred.rel "mutex",p41,p43 | ||
| 553 | .pred.rel "mutex",p49,p51 | ||
| 554 | { .mfi; (p17) getf.sig n8=nhi[8] // 12: | ||
| 555 | (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i] | ||
| 556 | (p41) add a6=a6,n6 } // (p17) a6+=n6 | ||
| 557 | { .mfi; (p43) add a6=a6,n6,1 | ||
| 558 | (p16) xma.lu alo[3]=ai3,bj[7],ahi[2] | ||
| 559 | (p16) nop.i 0 };; | ||
| 560 | { .mii; (p16) getf.sig a2=alo[2] // 13: | ||
| 561 | (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 | ||
| 562 | (p51) add t[3]=t[3],a6,1 };; | ||
| 563 | { .mfi; (p16) nop.m 0 // 14: | ||
| 564 | (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0 | ||
| 565 | (p41) cmp.ltu p42,p40=a6,n6 } | ||
| 566 | { .mfi; (p43) cmp.leu p42,p40=a6,n6 | ||
| 567 | (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0] | ||
| 568 | (p16) nop.i 0 };; | ||
| 569 | { .mii; (p16) nop.m 0 // 15: | ||
| 570 | (p49) cmp.ltu p50,p48=t[3],a6 | ||
| 571 | (p51) cmp.leu p50,p48=t[3],a6 };; | ||
| 572 | .pred.rel "mutex",p40,p42 | ||
| 573 | .pred.rel "mutex",p48,p50 | ||
| 574 | { .mfi; (p16) nop.m 0 // 16: | ||
| 575 | (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i] | ||
| 576 | (p40) add a7=a7,n7 } // (p17) a7+=n7 | ||
| 577 | { .mfi; (p42) add a7=a7,n7,1 | ||
| 578 | (p16) xma.lu alo[4]=ai4,bj[7],ahi[3] | ||
| 579 | (p16) nop.i 0 };; | ||
| 580 | { .mii; (p16) getf.sig a3=alo[3] // 17: | ||
| 581 | (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 | ||
| 582 | (p50) add t[2]=t[2],a7,1 };; | ||
| 583 | { .mfi; (p16) nop.m 0 // 18: | ||
| 584 | (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0 | ||
| 585 | (p40) cmp.ltu p43,p41=a7,n7 } | ||
| 586 | { .mfi; (p42) cmp.leu p43,p41=a7,n7 | ||
| 587 | (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1] | ||
| 588 | (p16) nop.i 0 };; | ||
| 589 | { .mii; (p16) getf.sig n1=nlo[1] // 19: | ||
| 590 | (p48) cmp.ltu p51,p49=t[2],a7 | ||
| 591 | (p50) cmp.leu p51,p49=t[2],a7 };; | ||
| 592 | .pred.rel "mutex",p41,p43 | ||
| 593 | .pred.rel "mutex",p49,p51 | ||
| 594 | { .mfi; (p16) nop.m 0 // 20: | ||
| 595 | (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i] | ||
| 596 | (p41) add a8=a8,n8 } // (p17) a8+=n8 | ||
| 597 | { .mfi; (p43) add a8=a8,n8,1 | ||
| 598 | (p16) xma.lu alo[5]=ai5,bj[7],ahi[4] | ||
| 599 | (p16) nop.i 0 };; | ||
| 600 | { .mii; (p16) getf.sig a4=alo[4] // 21: | ||
| 601 | (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 | ||
| 602 | (p51) add t[1]=t[1],a8,1 };; | ||
| 603 | { .mfi; (p16) nop.m 0 // 22: | ||
| 604 | (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0 | ||
| 605 | (p41) cmp.ltu p42,p40=a8,n8 } | ||
| 606 | { .mfi; (p43) cmp.leu p42,p40=a8,n8 | ||
| 607 | (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2] | ||
| 608 | (p16) nop.i 0 };; | ||
| 609 | { .mii; (p16) getf.sig n2=nlo[2] // 23: | ||
| 610 | (p49) cmp.ltu p50,p48=t[1],a8 | ||
| 611 | (p51) cmp.leu p50,p48=t[1],a8 };; | ||
| 612 | { .mfi; (p16) nop.m 0 // 24: | ||
| 613 | (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i] | ||
| 614 | (p16) add a1=a1,n1 } // (p16) a1+=n1 | ||
| 615 | { .mfi; (p16) nop.m 0 | ||
| 616 | (p16) xma.lu alo[6]=ai6,bj[7],ahi[5] | ||
| 617 | (p17) mov t[0]=r0 };; | ||
| 618 | { .mii; (p16) getf.sig a5=alo[5] // 25: | ||
| 619 | (p16) add t0=t[7],a1 // (p16) t[7]+=a1 | ||
| 620 | (p42) add t[0]=t[0],r0,1 };; | ||
| 621 | { .mfi; (p16) setf.sig tf[0]=t0 // 26: | ||
| 622 | (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0 | ||
| 623 | (p50) add t[0]=t[0],r0,1 } | ||
| 624 | { .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1 | ||
| 625 | (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3] | ||
| 626 | (p16) nop.i 0 };; | ||
| 627 | { .mii; (p16) getf.sig n3=nlo[3] // 27: | ||
| 628 | (p16) cmp.ltu.unc p50,p48=t0,a1 | ||
| 629 | (p16) nop.i 0 };; | ||
| 630 | .pred.rel "mutex",p40,p42 | ||
| 631 | .pred.rel "mutex",p48,p50 | ||
| 632 | { .mfi; (p16) nop.m 0 // 28: | ||
| 633 | (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i] | ||
| 634 | (p40) add a2=a2,n2 } // (p16) a2+=n2 | ||
| 635 | { .mfi; (p42) add a2=a2,n2,1 | ||
| 636 | (p16) xma.lu alo[7]=ai7,bj[7],ahi[6] | ||
| 637 | (p16) nop.i 0 };; | ||
| 638 | { .mii; (p16) getf.sig a6=alo[6] // 29: | ||
| 639 | (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2 | ||
| 640 | (p50) add t[6]=t[6],a2,1 };; | ||
| 641 | { .mfi; (p16) nop.m 0 // 30: | ||
| 642 | (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0 | ||
| 643 | (p40) cmp.ltu p41,p39=a2,n2 } | ||
| 644 | { .mfi; (p42) cmp.leu p41,p39=a2,n2 | ||
| 645 | (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4] | ||
| 646 | (p16) nop.i 0 };; | ||
| 647 | { .mfi; (p16) getf.sig n4=nlo[4] // 31: | ||
| 648 | (p16) nop.f 0 | ||
| 649 | (p48) cmp.ltu p49,p47=t[6],a2 } | ||
| 650 | { .mfb; (p50) cmp.leu p49,p47=t[6],a2 | ||
| 651 | (p16) nop.f 0 | ||
| 652 | br.ctop.sptk.many .Louter_8_ctop };; | ||
| 653 | .Louter_8_cend: | ||
| 654 | |||
| 655 | // above loop has to execute one more time, without (p16), which is | ||
| 656 | // replaced with merged move of np[8] to GPR bank | ||
| 657 | .pred.rel "mutex",p40,p42 | ||
| 658 | .pred.rel "mutex",p48,p50 | ||
| 659 | { .mmi; (p0) getf.sig n1=ni0 // 0: | ||
| 660 | (p40) add a3=a3,n3 // (p17) a3+=n3 | ||
| 661 | (p42) add a3=a3,n3,1 };; | ||
| 662 | { .mii; (p17) getf.sig a7=alo[8] // 1: | ||
| 663 | (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 | ||
| 664 | (p50) add t[6]=t[6],a3,1 };; | ||
| 665 | { .mfi; (p17) getf.sig a8=ahi[8] // 2: | ||
| 666 | (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 | ||
| 667 | (p40) cmp.ltu p43,p41=a3,n3 } | ||
| 668 | { .mfi; (p42) cmp.leu p43,p41=a3,n3 | ||
| 669 | (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] | ||
| 670 | (p0) nop.i 0 };; | ||
| 671 | { .mii; (p17) getf.sig n5=nlo[6] // 3: | ||
| 672 | (p48) cmp.ltu p51,p49=t[6],a3 | ||
| 673 | (p50) cmp.leu p51,p49=t[6],a3 };; | ||
| 674 | .pred.rel "mutex",p41,p43 | ||
| 675 | .pred.rel "mutex",p49,p51 | ||
| 676 | { .mmi; (p0) getf.sig n2=ni1 // 4: | ||
| 677 | (p41) add a4=a4,n4 // (p17) a4+=n4 | ||
| 678 | (p43) add a4=a4,n4,1 };; | ||
| 679 | { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 | ||
| 680 | (p0) nop.f 0 | ||
| 681 | (p51) add t[5]=t[5],a4,1 };; | ||
| 682 | { .mfi; (p0) getf.sig n3=ni2 // 6: | ||
| 683 | (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 | ||
| 684 | (p41) cmp.ltu p42,p40=a4,n4 } | ||
| 685 | { .mfi; (p43) cmp.leu p42,p40=a4,n4 | ||
| 686 | (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] | ||
| 687 | (p0) nop.i 0 };; | ||
| 688 | { .mii; (p17) getf.sig n6=nlo[7] // 7: | ||
| 689 | (p49) cmp.ltu p50,p48=t[5],a4 | ||
| 690 | (p51) cmp.leu p50,p48=t[5],a4 };; | ||
| 691 | .pred.rel "mutex",p40,p42 | ||
| 692 | .pred.rel "mutex",p48,p50 | ||
| 693 | { .mii; (p0) getf.sig n4=ni3 // 8: | ||
| 694 | (p40) add a5=a5,n5 // (p17) a5+=n5 | ||
| 695 | (p42) add a5=a5,n5,1 };; | ||
| 696 | { .mii; (p0) nop.m 0 // 9: | ||
| 697 | (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 | ||
| 698 | (p50) add t[4]=t[4],a5,1 };; | ||
| 699 | { .mii; (p0) nop.m 0 // 10: | ||
| 700 | (p40) cmp.ltu p43,p41=a5,n5 | ||
| 701 | (p42) cmp.leu p43,p41=a5,n5 };; | ||
| 702 | { .mii; (p17) getf.sig n7=nlo[8] // 11: | ||
| 703 | (p48) cmp.ltu p51,p49=t[4],a5 | ||
| 704 | (p50) cmp.leu p51,p49=t[4],a5 };; | ||
| 705 | .pred.rel "mutex",p41,p43 | ||
| 706 | .pred.rel "mutex",p49,p51 | ||
| 707 | { .mii; (p17) getf.sig n8=nhi[8] // 12: | ||
| 708 | (p41) add a6=a6,n6 // (p17) a6+=n6 | ||
| 709 | (p43) add a6=a6,n6,1 };; | ||
| 710 | { .mii; (p0) getf.sig n5=ni4 // 13: | ||
| 711 | (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 | ||
| 712 | (p51) add t[3]=t[3],a6,1 };; | ||
| 713 | { .mii; (p0) nop.m 0 // 14: | ||
| 714 | (p41) cmp.ltu p42,p40=a6,n6 | ||
| 715 | (p43) cmp.leu p42,p40=a6,n6 };; | ||
| 716 | { .mii; (p0) getf.sig n6=ni5 // 15: | ||
| 717 | (p49) cmp.ltu p50,p48=t[3],a6 | ||
| 718 | (p51) cmp.leu p50,p48=t[3],a6 };; | ||
| 719 | .pred.rel "mutex",p40,p42 | ||
| 720 | .pred.rel "mutex",p48,p50 | ||
| 721 | { .mii; (p0) nop.m 0 // 16: | ||
| 722 | (p40) add a7=a7,n7 // (p17) a7+=n7 | ||
| 723 | (p42) add a7=a7,n7,1 };; | ||
| 724 | { .mii; (p0) nop.m 0 // 17: | ||
| 725 | (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 | ||
| 726 | (p50) add t[2]=t[2],a7,1 };; | ||
| 727 | { .mii; (p0) nop.m 0 // 18: | ||
| 728 | (p40) cmp.ltu p43,p41=a7,n7 | ||
| 729 | (p42) cmp.leu p43,p41=a7,n7 };; | ||
| 730 | { .mii; (p0) getf.sig n7=ni6 // 19: | ||
| 731 | (p48) cmp.ltu p51,p49=t[2],a7 | ||
| 732 | (p50) cmp.leu p51,p49=t[2],a7 };; | ||
| 733 | .pred.rel "mutex",p41,p43 | ||
| 734 | .pred.rel "mutex",p49,p51 | ||
| 735 | { .mii; (p0) nop.m 0 // 20: | ||
| 736 | (p41) add a8=a8,n8 // (p17) a8+=n8 | ||
| 737 | (p43) add a8=a8,n8,1 };; | ||
| 738 | { .mmi; (p0) nop.m 0 // 21: | ||
| 739 | (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 | ||
| 740 | (p51) add t[1]=t[1],a8,1 } | ||
| 741 | { .mmi; (p17) mov t[0]=r0 | ||
| 742 | (p41) cmp.ltu p42,p40=a8,n8 | ||
| 743 | (p43) cmp.leu p42,p40=a8,n8 };; | ||
| 744 | { .mmi; (p0) getf.sig n8=ni7 // 22: | ||
| 745 | (p49) cmp.ltu p50,p48=t[1],a8 | ||
| 746 | (p51) cmp.leu p50,p48=t[1],a8 } | ||
| 747 | { .mmi; (p42) add t[0]=t[0],r0,1 | ||
| 748 | (p0) add r16=-7*16,prevsp | ||
| 749 | (p0) add r17=-6*16,prevsp };; | ||
| 750 | |||
| 751 | // subtract np[8] from carrybit|tmp[8] | |
| 752 | // carrybit|tmp[8] layout upon exit from above loop is: | |
| 753 | // t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant) | |
| 754 | { .mmi; (p50)add t[0]=t[0],r0,1 // t[0] += 1 when p50 is set | |
| 755 | add r18=-5*16,prevsp // r18 = prevsp-5*16: f18 spill slot, used at .Ldone | |
| 756 | sub n1=t0,n1 };; // least significant word: n1 = t0 - n1 (n1 was loaded from ni0) | |
| 757 | { .mmi; cmp.gtu p34,p32=n1,t0;; // p34 = borrow (difference > minuend), p32 = no borrow | |
| 758 | .pred.rel "mutex",p32,p34 | |
| 759 | (p32)sub n2=t[7],n2 // the same subtract/compare pattern repeats per word below, | |
| 760 | (p34)sub n2=t[7],n2,1 };; // alternating between the p32/p34 and p33/p35 predicate pairs | |
| 761 | { .mii; (p32)cmp.gtu p35,p33=n2,t[7] | |
| 762 | (p34)cmp.geu p35,p33=n2,t[7];; | |
| 763 | .pred.rel "mutex",p33,p35 | |
| 764 | (p33)sub n3=t[6],n3 } | |
| 765 | { .mmi; (p35)sub n3=t[6],n3,1;; | |
| 766 | (p33)cmp.gtu p34,p32=n3,t[6] | |
| 767 | (p35)cmp.geu p34,p32=n3,t[6] };; | |
| 768 | .pred.rel "mutex",p32,p34 | |
| 769 | { .mii; (p32)sub n4=t[5],n4 | |
| 770 | (p34)sub n4=t[5],n4,1;; | |
| 771 | (p32)cmp.gtu p35,p33=n4,t[5] } | |
| 772 | { .mmi; (p34)cmp.geu p35,p33=n4,t[5];; | |
| 773 | .pred.rel "mutex",p33,p35 | |
| 774 | (p33)sub n5=t[4],n5 | |
| 775 | (p35)sub n5=t[4],n5,1 };; | |
| 776 | { .mii; (p33)cmp.gtu p34,p32=n5,t[4] | |
| 777 | (p35)cmp.geu p34,p32=n5,t[4];; | |
| 778 | .pred.rel "mutex",p32,p34 | |
| 779 | (p32)sub n6=t[3],n6 } | |
| 780 | { .mmi; (p34)sub n6=t[3],n6,1;; | |
| 781 | (p32)cmp.gtu p35,p33=n6,t[3] | |
| 782 | (p34)cmp.geu p35,p33=n6,t[3] };; | |
| 783 | .pred.rel "mutex",p33,p35 | |
| 784 | { .mii; (p33)sub n7=t[2],n7 | |
| 785 | (p35)sub n7=t[2],n7,1;; | |
| 786 | (p33)cmp.gtu p34,p32=n7,t[2] } | |
| 787 | { .mmi; (p35)cmp.geu p34,p32=n7,t[2];; | |
| 788 | .pred.rel "mutex",p32,p34 | |
| 789 | (p32)sub n8=t[1],n8 | |
| 790 | (p34)sub n8=t[1],n8,1 };; | |
| 791 | { .mii; (p32)cmp.gtu p35,p33=n8,t[1] | |
| 792 | (p34)cmp.geu p35,p33=n8,t[1];; | |
| 793 | .pred.rel "mutex",p33,p35 | |
| 794 | (p33)sub a8=t[0],r0 } // last step: subtract only the borrow from the carry word t[0] | |
| 795 | { .mmi; (p35)sub a8=t[0],r0,1;; | |
| 796 | (p33)cmp.gtu p34,p32=a8,t[0] // p34 = final borrow, p32 = no final borrow | |
| 797 | (p35)cmp.geu p34,p32=a8,t[0] };; | |
| 798 | |||
| 799 | // save the result, either tmp[num] or tmp[num]-np[num] | |
| 800 | .pred.rel "mutex",p32,p34 | |
| 801 | { .mmi; (p32)st8 [rptr]=n1,8 // p32 (no final borrow): store the difference | |
| 802 | (p34)st8 [rptr]=t0,8 // p34 (final borrow): store tmp unchanged | |
| 803 | add r19=-4*16,prevsp};; // r19 = prevsp-4*16: f19 spill slot, used at .Ldone | |
| 804 | { .mmb; (p32)st8 [rptr]=n2,8 | |
| 805 | (p34)st8 [rptr]=t[7],8 | |
| 806 | (p5)br.cond.dpnt.few .Ldone };; // p5 was set to (num < 3) while loading the inputs | |
| 807 | { .mmb; (p32)st8 [rptr]=n3,8 | |
| 808 | (p34)st8 [rptr]=t[6],8 | |
| 809 | (p7)br.cond.dpnt.few .Ldone };; // p7: num < 4 | |
| 810 | { .mmb; (p32)st8 [rptr]=n4,8 | |
| 811 | (p34)st8 [rptr]=t[5],8 | |
| 812 | (p9)br.cond.dpnt.few .Ldone };; // p9: num < 5 | |
| 813 | { .mmb; (p32)st8 [rptr]=n5,8 | |
| 814 | (p34)st8 [rptr]=t[4],8 | |
| 815 | (p11)br.cond.dpnt.few .Ldone };; // p11: num < 6 | |
| 816 | { .mmb; (p32)st8 [rptr]=n6,8 | |
| 817 | (p34)st8 [rptr]=t[3],8 | |
| 818 | (p13)br.cond.dpnt.few .Ldone };; // p13: num < 7 | |
| 819 | { .mmb; (p32)st8 [rptr]=n7,8 | |
| 820 | (p34)st8 [rptr]=t[2],8 | |
| 821 | (p15)br.cond.dpnt.few .Ldone };; // p15: num < 8 | |
| 822 | { .mmb; (p32)st8 [rptr]=n8,8 | |
| 823 | (p34)st8 [rptr]=t[1],8 | |
| 824 | nop.b 0 };; | |
| 825 | .Ldone: // epilogue | |
| 826 | { .mmi; ldf.fill f16=[r16],64 // f16 <- [prevsp-7*16]; r16 then points at the f20 slot | |
| 827 | ldf.fill f17=[r17],64 // f17 <- [prevsp-6*16]; r17 then points at the f21 slot | |
| 828 | nop.i 0 } | |
| 829 | { .mmi; ldf.fill f18=[r18],64 // f18 <- [prevsp-5*16]; r18 then points at the f22 slot | |
| 830 | ldf.fill f19=[r19],64 // f19 <- [prevsp-4*16]; r19 then points at the f23 slot | |
| 831 | mov pr=prevpr,0x1ffff };; // restore the predicates saved in the prologue | |
| 832 | { .mmi; ldf.fill f20=[r16] // f20 <- [prevsp-3*16] | |
| 833 | ldf.fill f21=[r17] // f21 <- [prevsp-2*16] | |
| 834 | mov ar.lc=prevlc } // restore the loop counter saved in the prologue | |
| 835 | { .mmi; ldf.fill f22=[r18] // f22 <- [prevsp-1*16] | |
| 836 | ldf.fill f23=[r19] // f23 <- [prevsp] | |
| 837 | mov ret0=1 } // signal "handled" | |
| 838 | { .mib; rum 1<<5 // clear um.mfh | |
| 839 | .restore sp | |
| 840 | mov sp=prevsp // drop the spill area | |
| 841 | br.ret.sptk.many b0 };; | |
| 842 | .endp bn_mul_mont_8# | |
| 843 | |||
| 844 | .type copyright#,\@object | ||
| 845 | copyright: | ||
| 846 | stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 847 | ___ | ||
| 848 | |||
| 849 | $output=shift and (open(STDOUT,">",$output) or die "can't open $output: $!"); # optional output file; three-argument open, fail loudly | |
| 850 | print $code; # emit the generated assembly | |
| 851 | close STDOUT or die "error closing STDOUT: $!"; # surface buffered write errors instead of leaving a truncated file | |
