diff options
| author | markus <> | 2003-05-12 02:18:40 +0000 |
|---|---|---|
| committer | markus <> | 2003-05-12 02:18:40 +0000 |
| commit | d4fcd82bb7f6d603bd61e19a81ba97337b89dfca (patch) | |
| tree | d52e3a0f1f08f65ad283027e560e17ed0d720462 /src/lib/libcrypto/bn/asm | |
| parent | 582bbd139cd2afd58d10dc051c5b0b989b441074 (diff) | |
| download | openbsd-d4fcd82bb7f6d603bd61e19a81ba97337b89dfca.tar.gz openbsd-d4fcd82bb7f6d603bd61e19a81ba97337b89dfca.tar.bz2 openbsd-d4fcd82bb7f6d603bd61e19a81ba97337b89dfca.zip | |
merge 0.9.7b with local changes; crank majors for libssl/libcrypto
Diffstat (limited to 'src/lib/libcrypto/bn/asm')
| -rw-r--r-- | src/lib/libcrypto/bn/asm/ia64.S | 235 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/pa-risc2.s | 36 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/vms.mar | 254 |
3 files changed, 330 insertions, 195 deletions
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S index ae56066310..7dfda85566 100644 --- a/src/lib/libcrypto/bn/asm/ia64.S +++ b/src/lib/libcrypto/bn/asm/ia64.S | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | .explicit | 1 | .explicit |
| 2 | .text | 2 | .text |
| 3 | .ident "ia64.S, Version 1.1" | 3 | .ident "ia64.S, Version 2.0" |
| 4 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | 4 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" |
| 5 | 5 | ||
| 6 | // | 6 | // |
| @@ -13,6 +13,35 @@ | |||
| 13 | // disclaimed. | 13 | // disclaimed. |
| 14 | // ==================================================================== | 14 | // ==================================================================== |
| 15 | // | 15 | // |
| 16 | // Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is | ||
| 17 | // different from Itanium to this module viewpoint. Most notably, is it | ||
| 18 | // "wider" than Itanium? Can you experience loop scalability as | ||
| 19 | // discussed in commentary sections? Not really:-( Itanium2 has 6 | ||
| 20 | // integer ALU ports, i.e. it's 2 ports wider, but it's not enough to | ||
| 21 | // spin twice as fast, as I need 8 IALU ports. Amount of floating point | ||
| 22 | // ports is the same, i.e. 2, while I need 4. In other words, to this | ||
| 23 | // module Itanium2 remains effectively as "wide" as Itanium. Yet it's | ||
| 24 | // essentially different in respect to this module, and a re-tune was | ||
| 25 | // required. Well, because some instruction latencies have changed. Most | ||
| 26 | // noticeably those intensively used: | ||
| 27 | // | ||
| 28 | // Itanium Itanium2 | ||
| 29 | // ldf8 9 6 L2 hit | ||
| 30 | // ld8 2 1 L1 hit | ||
| 31 | // getf 2 5 | ||
| 32 | // xma[->getf] 7[+1] 4[+0] | ||
| 33 | // add[->st8] 1[+1] 1[+0] | ||
| 34 | // | ||
| 35 | // What does it mean? You might ratiocinate that the original code | ||
| 36 | // should run just faster... Because sum of latencies is smaller... | ||
| 37 | // Wrong! Note that getf latency increased. This means that if a loop is | ||
| 38 | // scheduled for lower latency (and they are), then it will suffer from | ||
| 39 | // stall condition and the code will therefore turn anti-scalable, e.g. | ||
| 40 | // original bn_mul_words spun at 5*n or 2.5 times slower than expected | ||
| 41 | // on Itanium2! What to do? Reschedule loops for Itanium2? But then | ||
| 42 | // Itanium would exhibit anti-scalability. So I've chosen to reschedule | ||
| 43 | // for worst latency for every instruction aiming for best *all-round* | ||
| 44 | // performance. | ||
| 16 | 45 | ||
| 17 | // Q. How much faster does it get? | 46 | // Q. How much faster does it get? |
| 18 | // A. Here is the output from 'openssl speed rsa dsa' for vanilla | 47 | // A. Here is the output from 'openssl speed rsa dsa' for vanilla |
| @@ -149,12 +178,27 @@ bn_add_words: | |||
| 149 | brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 | 178 | brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 |
| 150 | } | 179 | } |
| 151 | .body | 180 | .body |
| 152 | { .mib; mov r14=r32 // rp | 181 | { .mib; |
| 182 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 183 | addp4 r14=0,r32 // rp | ||
| 184 | #else | ||
| 185 | mov r14=r32 // rp | ||
| 186 | #endif | ||
| 153 | mov r9=pr };; | 187 | mov r9=pr };; |
| 154 | { .mii; mov r15=r33 // ap | 188 | { .mii; |
| 189 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 190 | addp4 r15=0,r33 // ap | ||
| 191 | #else | ||
| 192 | mov r15=r33 // ap | ||
| 193 | #endif | ||
| 155 | mov ar.lc=r10 | 194 | mov ar.lc=r10 |
| 156 | mov ar.ec=6 } | 195 | mov ar.ec=6 } |
| 157 | { .mib; mov r16=r34 // bp | 196 | { .mib; |
| 197 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 198 | addp4 r16=0,r34 // bp | ||
| 199 | #else | ||
| 200 | mov r16=r34 // bp | ||
| 201 | #endif | ||
| 158 | mov pr.rot=1<<16 };; | 202 | mov pr.rot=1<<16 };; |
| 159 | 203 | ||
| 160 | .L_bn_add_words_ctop: | 204 | .L_bn_add_words_ctop: |
| @@ -174,7 +218,7 @@ bn_add_words: | |||
| 174 | 218 | ||
| 175 | { .mii; | 219 | { .mii; |
| 176 | (p59) add r8=1,r8 // return value | 220 | (p59) add r8=1,r8 // return value |
| 177 | mov pr=r9,-1 | 221 | mov pr=r9,0x1ffff |
| 178 | mov ar.lc=r3 } | 222 | mov ar.lc=r3 } |
| 179 | { .mbb; nop.b 0x0 | 223 | { .mbb; nop.b 0x0 |
| 180 | br.ret.sptk.many b0 };; | 224 | br.ret.sptk.many b0 };; |
| @@ -202,12 +246,27 @@ bn_sub_words: | |||
| 202 | brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 | 246 | brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 |
| 203 | } | 247 | } |
| 204 | .body | 248 | .body |
| 205 | { .mib; mov r14=r32 // rp | 249 | { .mib; |
| 250 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 251 | addp4 r14=0,r32 // rp | ||
| 252 | #else | ||
| 253 | mov r14=r32 // rp | ||
| 254 | #endif | ||
| 206 | mov r9=pr };; | 255 | mov r9=pr };; |
| 207 | { .mii; mov r15=r33 // ap | 256 | { .mii; |
| 257 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 258 | addp4 r15=0,r33 // ap | ||
| 259 | #else | ||
| 260 | mov r15=r33 // ap | ||
| 261 | #endif | ||
| 208 | mov ar.lc=r10 | 262 | mov ar.lc=r10 |
| 209 | mov ar.ec=6 } | 263 | mov ar.ec=6 } |
| 210 | { .mib; mov r16=r34 // bp | 264 | { .mib; |
| 265 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 266 | addp4 r16=0,r34 // bp | ||
| 267 | #else | ||
| 268 | mov r16=r34 // bp | ||
| 269 | #endif | ||
| 211 | mov pr.rot=1<<16 };; | 270 | mov pr.rot=1<<16 };; |
| 212 | 271 | ||
| 213 | .L_bn_sub_words_ctop: | 272 | .L_bn_sub_words_ctop: |
| @@ -227,7 +286,7 @@ bn_sub_words: | |||
| 227 | 286 | ||
| 228 | { .mii; | 287 | { .mii; |
| 229 | (p59) add r8=1,r8 // return value | 288 | (p59) add r8=1,r8 // return value |
| 230 | mov pr=r9,-1 | 289 | mov pr=r9,0x1ffff |
| 231 | mov ar.lc=r3 } | 290 | mov ar.lc=r3 } |
| 232 | { .mbb; nop.b 0x0 | 291 | { .mbb; nop.b 0x0 |
| 233 | br.ret.sptk.many b0 };; | 292 | br.ret.sptk.many b0 };; |
| @@ -253,7 +312,7 @@ bn_mul_words: | |||
| 253 | #ifdef XMA_TEMPTATION | 312 | #ifdef XMA_TEMPTATION |
| 254 | { .mfi; alloc r2=ar.pfs,4,0,0,0 };; | 313 | { .mfi; alloc r2=ar.pfs,4,0,0,0 };; |
| 255 | #else | 314 | #else |
| 256 | { .mfi; alloc r2=ar.pfs,4,4,0,8 };; | 315 | { .mfi; alloc r2=ar.pfs,4,12,0,16 };; |
| 257 | #endif | 316 | #endif |
| 258 | { .mib; mov r8=r0 // return value | 317 | { .mib; mov r8=r0 // return value |
| 259 | cmp4.le p6,p0=r34,r0 | 318 | cmp4.le p6,p0=r34,r0 |
| @@ -266,24 +325,30 @@ bn_mul_words: | |||
| 266 | 325 | ||
| 267 | .body | 326 | .body |
| 268 | { .mib; setf.sig f8=r35 // w | 327 | { .mib; setf.sig f8=r35 // w |
| 269 | mov pr.rot=0x400001<<16 | 328 | mov pr.rot=0x800001<<16 |
| 270 | // ------^----- serves as (p48) at first (p26) | 329 | // ------^----- serves as (p50) at first (p27) |
| 271 | brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16 | 330 | brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16 |
| 272 | } | 331 | } |
| 273 | 332 | ||
| 274 | #ifndef XMA_TEMPTATION | 333 | #ifndef XMA_TEMPTATION |
| 275 | 334 | ||
| 276 | { .mii; mov r14=r32 // rp | 335 | { .mii; |
| 277 | mov r15=r33 // ap | 336 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
| 337 | addp4 r14=0,r32 // rp | ||
| 338 | addp4 r15=0,r33 // ap | ||
| 339 | #else | ||
| 340 | mov r14=r32 // rp | ||
| 341 | mov r15=r33 // ap | ||
| 342 | #endif | ||
| 278 | mov ar.lc=r10 } | 343 | mov ar.lc=r10 } |
| 279 | { .mii; mov r39=0 // serves as r33 at first (p26) | 344 | { .mii; mov r40=0 // serves as r35 at first (p27) |
| 280 | mov ar.ec=12 };; | 345 | mov ar.ec=13 };; |
| 281 | 346 | ||
| 282 | // This loop spins in 2*(n+11) ticks. It's scheduled for data in L2 | 347 | // This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium |
| 283 | // cache (i.e. 9 ticks away) as floating point load/store instructions | 348 | // L2 cache (i.e. 9 ticks away) as floating point load/store instructions |
| 284 | // bypass L1 cache and L2 latency is actually best-case scenario for | 349 | // bypass L1 cache and L2 latency is actually best-case scenario for |
| 285 | // ldf8. The loop is not scalable and shall run in 2*(n+11) even on | 350 | // ldf8. The loop is not scalable and shall run in 2*(n+12) even on |
| 286 | // "wider" IA-64 implementations. It's a trade-off here. n+22 loop | 351 | // "wider" IA-64 implementations. It's a trade-off here. n+24 loop |
| 287 | // would give us ~5% in *overall* performance improvement on "wider" | 352 | // would give us ~5% in *overall* performance improvement on "wider" |
| 288 | // IA-64, but would hurt Itanium for about same because of longer | 353 | // IA-64, but would hurt Itanium for about same because of longer |
| 289 | // epilogue. As it's a matter of few percents in either case I've | 354 | // epilogue. As it's a matter of few percents in either case I've |
| @@ -291,25 +356,25 @@ bn_mul_words: | |||
| 291 | // this very instruction sequence in bn_mul_add_words loop which in | 356 | // this very instruction sequence in bn_mul_add_words loop which in |
| 292 | // turn is scalable). | 357 | // turn is scalable). |
| 293 | .L_bn_mul_words_ctop: | 358 | .L_bn_mul_words_ctop: |
| 294 | { .mfi; (p25) getf.sig r36=f49 // low | 359 | { .mfi; (p25) getf.sig r36=f52 // low |
| 295 | (p21) xmpy.lu f45=f37,f8 | 360 | (p21) xmpy.lu f48=f37,f8 |
| 296 | (p27) cmp.ltu p52,p48=r39,r38 } | 361 | (p28) cmp.ltu p54,p50=r41,r39 } |
| 297 | { .mfi; (p16) ldf8 f32=[r15],8 | 362 | { .mfi; (p16) ldf8 f32=[r15],8 |
| 298 | (p21) xmpy.hu f38=f37,f8 | 363 | (p21) xmpy.hu f40=f37,f8 |
| 299 | (p0) nop.i 0x0 };; | 364 | (p0) nop.i 0x0 };; |
| 300 | { .mii; (p26) getf.sig r32=f43 // high | 365 | { .mii; (p25) getf.sig r32=f44 // high |
| 301 | .pred.rel "mutex",p48,p52 | 366 | .pred.rel "mutex",p50,p54 |
| 302 | (p48) add r38=r37,r33 // (p26) | 367 | (p50) add r40=r38,r35 // (p27) |
| 303 | (p52) add r38=r37,r33,1 } // (p26) | 368 | (p54) add r40=r38,r35,1 } // (p27) |
| 304 | { .mfb; (p27) st8 [r14]=r39,8 | 369 | { .mfb; (p28) st8 [r14]=r41,8 |
| 305 | (p0) nop.f 0x0 | 370 | (p0) nop.f 0x0 |
| 306 | br.ctop.sptk .L_bn_mul_words_ctop };; | 371 | br.ctop.sptk .L_bn_mul_words_ctop };; |
| 307 | .L_bn_mul_words_cend: | 372 | .L_bn_mul_words_cend: |
| 308 | 373 | ||
| 309 | { .mii; nop.m 0x0 | 374 | { .mii; nop.m 0x0 |
| 310 | .pred.rel "mutex",p49,p53 | 375 | .pred.rel "mutex",p51,p55 |
| 311 | (p49) add r8=r34,r0 | 376 | (p51) add r8=r36,r0 |
| 312 | (p53) add r8=r34,r0,1 } | 377 | (p55) add r8=r36,r0,1 } |
| 313 | { .mfb; nop.m 0x0 | 378 | { .mfb; nop.m 0x0 |
| 314 | nop.f 0x0 | 379 | nop.f 0x0 |
| 315 | nop.b 0x0 } | 380 | nop.b 0x0 } |
| @@ -344,7 +409,7 @@ bn_mul_words: | |||
| 344 | #endif // XMA_TEMPTATION | 409 | #endif // XMA_TEMPTATION |
| 345 | 410 | ||
| 346 | { .mii; nop.m 0x0 | 411 | { .mii; nop.m 0x0 |
| 347 | mov pr=r9,-1 | 412 | mov pr=r9,0x1ffff |
| 348 | mov ar.lc=r3 } | 413 | mov ar.lc=r3 } |
| 349 | { .mfb; rum 1<<5 // clear um.mfh | 414 | { .mfb; rum 1<<5 // clear um.mfh |
| 350 | nop.f 0x0 | 415 | nop.f 0x0 |
| @@ -376,59 +441,69 @@ bn_mul_add_words: | |||
| 376 | 441 | ||
| 377 | .body | 442 | .body |
| 378 | { .mib; setf.sig f8=r35 // w | 443 | { .mib; setf.sig f8=r35 // w |
| 379 | mov pr.rot=0x400001<<16 | 444 | mov pr.rot=0x800001<<16 |
| 380 | // ------^----- serves as (p48) at first (p26) | 445 | // ------^----- serves as (p50) at first (p27) |
| 381 | brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 | 446 | brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 |
| 382 | } | 447 | } |
| 383 | { .mii; mov r14=r32 // rp | 448 | { .mii; |
| 384 | mov r15=r33 // ap | 449 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
| 450 | addp4 r14=0,r32 // rp | ||
| 451 | addp4 r15=0,r33 // ap | ||
| 452 | #else | ||
| 453 | mov r14=r32 // rp | ||
| 454 | mov r15=r33 // ap | ||
| 455 | #endif | ||
| 385 | mov ar.lc=r10 } | 456 | mov ar.lc=r10 } |
| 386 | { .mii; mov r39=0 // serves as r33 at first (p26) | 457 | { .mii; mov r40=0 // serves as r35 at first (p27) |
| 387 | mov r18=r32 // rp copy | 458 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
| 388 | mov ar.ec=14 };; | 459 | addp4 r18=0,r32 // rp copy |
| 460 | #else | ||
| 461 | mov r18=r32 // rp copy | ||
| 462 | #endif | ||
| 463 | mov ar.ec=15 };; | ||
| 389 | 464 | ||
| 390 | // This loop spins in 3*(n+13) ticks on Itanium and should spin in | 465 | // This loop spins in 3*(n+14) ticks on Itanium and should spin in |
| 391 | // 2*(n+13) on "wider" IA-64 implementations (to be verified with new | 466 | // 2*(n+14) on "wider" IA-64 implementations (to be verified with new |
| 392 | // µ-architecture manuals as they become available). As usual it's | 467 | // µ-architecture manuals as they become available). As usual it's |
| 393 | // possible to compress the epilogue, down to 10 in this case, at the | 468 | // possible to compress the epilogue, down to 10 in this case, at the |
| 394 | // cost of scalability. Compressed (and therefore non-scalable) loop | 469 | // cost of scalability. Compressed (and therefore non-scalable) loop |
| 395 | // running at 3*(n+10) would buy you ~10% on Itanium but take ~35% | 470 | // running at 3*(n+11) would buy you ~10% on Itanium but take ~35% |
| 396 | // from "wider" IA-64 so let it be scalable! Special attention was | 471 | // from "wider" IA-64 so let it be scalable! Special attention was |
| 397 | // paid for having the loop body split at 64-byte boundary. ld8 is | 472 | // paid for having the loop body split at 64-byte boundary. ld8 is |
| 398 | // scheduled for L1 cache as the data is more than likely there. | 473 | // scheduled for L1 cache as the data is more than likely there. |
| 399 | // Indeed, bn_mul_words has put it there a moment ago:-) | 474 | // Indeed, bn_mul_words has put it there a moment ago:-) |
| 400 | .L_bn_mul_add_words_ctop: | 475 | .L_bn_mul_add_words_ctop: |
| 401 | { .mfi; (p25) getf.sig r36=f49 // low | 476 | { .mfi; (p25) getf.sig r36=f52 // low |
| 402 | (p21) xmpy.lu f45=f37,f8 | 477 | (p21) xmpy.lu f48=f37,f8 |
| 403 | (p27) cmp.ltu p52,p48=r39,r38 } | 478 | (p28) cmp.ltu p54,p50=r41,r39 } |
| 404 | { .mfi; (p16) ldf8 f32=[r15],8 | 479 | { .mfi; (p16) ldf8 f32=[r15],8 |
| 405 | (p21) xmpy.hu f38=f37,f8 | 480 | (p21) xmpy.hu f40=f37,f8 |
| 406 | (p27) add r43=r43,r39 };; | 481 | (p28) add r45=r45,r41 };; |
| 407 | { .mii; (p26) getf.sig r32=f43 // high | 482 | { .mii; (p25) getf.sig r32=f44 // high |
| 408 | .pred.rel "mutex",p48,p52 | 483 | .pred.rel "mutex",p50,p54 |
| 409 | (p48) add r38=r37,r33 // (p26) | 484 | (p50) add r40=r38,r35 // (p27) |
| 410 | (p52) add r38=r37,r33,1 } // (p26) | 485 | (p54) add r40=r38,r35,1 } // (p27) |
| 411 | { .mfb; (p27) cmp.ltu.unc p56,p0=r43,r39 | 486 | { .mfb; (p28) cmp.ltu.unc p60,p0=r45,r41 |
| 412 | (p0) nop.f 0x0 | 487 | (p0) nop.f 0x0 |
| 413 | (p0) nop.b 0x0 } | 488 | (p0) nop.b 0x0 } |
| 414 | { .mii; (p26) ld8 r42=[r18],8 | 489 | { .mii; (p27) ld8 r44=[r18],8 |
| 415 | (p58) cmp.eq.or p57,p0=-1,r44 | 490 | (p62) cmp.eq.or p61,p0=-1,r46 |
| 416 | (p58) add r44=1,r44 } | 491 | (p62) add r46=1,r46 } |
| 417 | { .mfb; (p29) st8 [r14]=r45,8 | 492 | { .mfb; (p30) st8 [r14]=r47,8 |
| 418 | (p0) nop.f 0x0 | 493 | (p0) nop.f 0x0 |
| 419 | br.ctop.sptk .L_bn_mul_add_words_ctop};; | 494 | br.ctop.sptk .L_bn_mul_add_words_ctop};; |
| 420 | .L_bn_mul_add_words_cend: | 495 | .L_bn_mul_add_words_cend: |
| 421 | 496 | ||
| 422 | { .mii; nop.m 0x0 | 497 | { .mii; nop.m 0x0 |
| 423 | .pred.rel "mutex",p51,p55 | 498 | .pred.rel "mutex",p53,p57 |
| 424 | (p51) add r8=r36,r0 | 499 | (p53) add r8=r38,r0 |
| 425 | (p55) add r8=r36,r0,1 } | 500 | (p57) add r8=r38,r0,1 } |
| 426 | { .mfb; nop.m 0x0 | 501 | { .mfb; nop.m 0x0 |
| 427 | nop.f 0x0 | 502 | nop.f 0x0 |
| 428 | nop.b 0x0 };; | 503 | nop.b 0x0 };; |
| 429 | { .mii; | 504 | { .mii; |
| 430 | (p59) add r8=1,r8 | 505 | (p63) add r8=1,r8 |
| 431 | mov pr=r9,-1 | 506 | mov pr=r9,0x1ffff |
| 432 | mov ar.lc=r3 } | 507 | mov ar.lc=r3 } |
| 433 | { .mfb; rum 1<<5 // clear um.mfh | 508 | { .mfb; rum 1<<5 // clear um.mfh |
| 434 | nop.f 0x0 | 509 | nop.f 0x0 |
| @@ -461,6 +536,10 @@ bn_sqr_words: | |||
| 461 | mov r9=pr };; | 536 | mov r9=pr };; |
| 462 | 537 | ||
| 463 | .body | 538 | .body |
| 539 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 540 | { .mii; addp4 r32=0,r32 | ||
| 541 | addp4 r33=0,r33 };; | ||
| 542 | #endif | ||
| 464 | { .mib; | 543 | { .mib; |
| 465 | mov pr.rot=1<<16 | 544 | mov pr.rot=1<<16 |
| 466 | brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 | 545 | brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 |
| @@ -492,7 +571,7 @@ bn_sqr_words: | |||
| 492 | .L_bn_sqr_words_cend: | 571 | .L_bn_sqr_words_cend: |
| 493 | 572 | ||
| 494 | { .mii; nop.m 0x0 | 573 | { .mii; nop.m 0x0 |
| 495 | mov pr=r9,-1 | 574 | mov pr=r9,0x1ffff |
| 496 | mov ar.lc=r3 } | 575 | mov ar.lc=r3 } |
| 497 | { .mfb; rum 1<<5 // clear um.mfh | 576 | { .mfb; rum 1<<5 // clear um.mfh |
| 498 | nop.f 0x0 | 577 | nop.f 0x0 |
| @@ -526,7 +605,14 @@ bn_sqr_comba8: | |||
| 526 | .prologue | 605 | .prologue |
| 527 | .fframe 0 | 606 | .fframe 0 |
| 528 | .save ar.pfs,r2 | 607 | .save ar.pfs,r2 |
| 608 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 529 | { .mii; alloc r2=ar.pfs,2,1,0,0 | 609 | { .mii; alloc r2=ar.pfs,2,1,0,0 |
| 610 | addp4 r33=0,r33 | ||
| 611 | addp4 r32=0,r32 };; | ||
| 612 | { .mii; | ||
| 613 | #else | ||
| 614 | { .mii; alloc r2=ar.pfs,2,1,0,0 | ||
| 615 | #endif | ||
| 530 | mov r34=r33 | 616 | mov r34=r33 |
| 531 | add r14=8,r33 };; | 617 | add r14=8,r33 };; |
| 532 | .body | 618 | .body |
| @@ -587,7 +673,14 @@ bn_mul_comba8: | |||
| 587 | .prologue | 673 | .prologue |
| 588 | .fframe 0 | 674 | .fframe 0 |
| 589 | .save ar.pfs,r2 | 675 | .save ar.pfs,r2 |
| 676 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 590 | { .mii; alloc r2=ar.pfs,3,0,0,0 | 677 | { .mii; alloc r2=ar.pfs,3,0,0,0 |
| 678 | addp4 r33=0,r33 | ||
| 679 | addp4 r34=0,r34 };; | ||
| 680 | { .mii; addp4 r32=0,r32 | ||
| 681 | #else | ||
| 682 | { .mii; alloc r2=ar.pfs,3,0,0,0 | ||
| 683 | #endif | ||
| 591 | add r14=8,r33 | 684 | add r14=8,r33 |
| 592 | add r17=8,r34 } | 685 | add r17=8,r34 } |
| 593 | .body | 686 | .body |
| @@ -1138,7 +1231,14 @@ bn_sqr_comba4: | |||
| 1138 | .prologue | 1231 | .prologue |
| 1139 | .fframe 0 | 1232 | .fframe 0 |
| 1140 | .save ar.pfs,r2 | 1233 | .save ar.pfs,r2 |
| 1234 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 1235 | { .mii; alloc r2=ar.pfs,2,1,0,0 | ||
| 1236 | addp4 r32=0,r32 | ||
| 1237 | addp4 r33=0,r33 };; | ||
| 1238 | { .mii; | ||
| 1239 | #else | ||
| 1141 | { .mii; alloc r2=ar.pfs,2,1,0,0 | 1240 | { .mii; alloc r2=ar.pfs,2,1,0,0 |
| 1241 | #endif | ||
| 1142 | mov r34=r33 | 1242 | mov r34=r33 |
| 1143 | add r14=8,r33 };; | 1243 | add r14=8,r33 };; |
| 1144 | .body | 1244 | .body |
| @@ -1164,7 +1264,14 @@ bn_mul_comba4: | |||
| 1164 | .prologue | 1264 | .prologue |
| 1165 | .fframe 0 | 1265 | .fframe 0 |
| 1166 | .save ar.pfs,r2 | 1266 | .save ar.pfs,r2 |
| 1267 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 1268 | { .mii; alloc r2=ar.pfs,3,0,0,0 | ||
| 1269 | addp4 r33=0,r33 | ||
| 1270 | addp4 r34=0,r34 };; | ||
| 1271 | { .mii; addp4 r32=0,r32 | ||
| 1272 | #else | ||
| 1167 | { .mii; alloc r2=ar.pfs,3,0,0,0 | 1273 | { .mii; alloc r2=ar.pfs,3,0,0,0 |
| 1274 | #endif | ||
| 1168 | add r14=8,r33 | 1275 | add r14=8,r33 |
| 1169 | add r17=8,r34 } | 1276 | add r17=8,r34 } |
| 1170 | .body | 1277 | .body |
| @@ -1464,7 +1571,7 @@ bn_div_words: | |||
| 1464 | or r8=r8,r33 | 1571 | or r8=r8,r33 |
| 1465 | mov ar.pfs=r2 };; | 1572 | mov ar.pfs=r2 };; |
| 1466 | { .mii; shr.u r9=H,I // remainder if anybody wants it | 1573 | { .mii; shr.u r9=H,I // remainder if anybody wants it |
| 1467 | mov pr=r10,-1 } | 1574 | mov pr=r10,0x1ffff } |
| 1468 | { .mfb; br.ret.sptk.many b0 };; | 1575 | { .mfb; br.ret.sptk.many b0 };; |
| 1469 | 1576 | ||
| 1470 | // Unsigned 64 by 32 (well, by 64 for the moment) bit integer division | 1577 | // Unsigned 64 by 32 (well, by 64 for the moment) bit integer division |
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2.s b/src/lib/libcrypto/bn/asm/pa-risc2.s index af9730d062..f3b16290eb 100644 --- a/src/lib/libcrypto/bn/asm/pa-risc2.s +++ b/src/lib/libcrypto/bn/asm/pa-risc2.s | |||
| @@ -747,8 +747,8 @@ bn_div_words | |||
| 747 | .PROC | 747 | .PROC |
| 748 | .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN | 748 | .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN |
| 749 | .IMPORT BN_num_bits_word,CODE | 749 | .IMPORT BN_num_bits_word,CODE |
| 750 | .IMPORT __iob,DATA | 750 | ;--- not PIC .IMPORT __iob,DATA |
| 751 | .IMPORT fprintf,CODE | 751 | ;--- not PIC .IMPORT fprintf,CODE |
| 752 | .IMPORT abort,CODE | 752 | .IMPORT abort,CODE |
| 753 | .IMPORT $$div2U,MILLICODE | 753 | .IMPORT $$div2U,MILLICODE |
| 754 | .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE | 754 | .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE |
| @@ -844,12 +844,12 @@ $0006001A | |||
| 844 | MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 | 844 | MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 |
| 845 | EXTRD,U %r3,63,32,%r7 ;offset 0xa1c | 845 | EXTRD,U %r3,63,32,%r7 ;offset 0xa1c |
| 846 | $D2 | 846 | $D2 |
| 847 | ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 | 847 | ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 |
| 848 | LDIL LR'C$7,%r21 ;offset 0xa24 | 848 | ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24 |
| 849 | LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 | 849 | ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 |
| 850 | .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; | 850 | ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; |
| 851 | B,L fprintf,%r2 ;offset 0xa2c | 851 | ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c |
| 852 | LDO RR'C$7(%r21),%r25 ;offset 0xa30 | 852 | ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30 |
| 853 | .CALL ; | 853 | .CALL ; |
| 854 | B,L abort,%r2 ;offset 0xa34 | 854 | B,L abort,%r2 ;offset 0xa34 |
| 855 | NOP ;offset 0xa38 | 855 | NOP ;offset 0xa38 |
| @@ -1605,14 +1605,14 @@ bn_mul_comba4 | |||
| 1605 | .PROCEND | 1605 | .PROCEND |
| 1606 | 1606 | ||
| 1607 | 1607 | ||
| 1608 | .SPACE $TEXT$ | 1608 | ;--- not PIC .SPACE $TEXT$ |
| 1609 | .SUBSPA $CODE$ | 1609 | ;--- not PIC .SUBSPA $CODE$ |
| 1610 | .SPACE $PRIVATE$,SORT=16 | 1610 | ;--- not PIC .SPACE $PRIVATE$,SORT=16 |
| 1611 | .IMPORT $global$,DATA | 1611 | ;--- not PIC .IMPORT $global$,DATA |
| 1612 | .SPACE $TEXT$ | 1612 | ;--- not PIC .SPACE $TEXT$ |
| 1613 | .SUBSPA $CODE$ | 1613 | ;--- not PIC .SUBSPA $CODE$ |
| 1614 | .SUBSPA $LIT$,ACCESS=0x2c | 1614 | ;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c |
| 1615 | C$7 | 1615 | ;--- not PIC C$7 |
| 1616 | .ALIGN 8 | 1616 | ;--- not PIC .ALIGN 8 |
| 1617 | .STRINGZ "Division would overflow (%d)\n" | 1617 | ;--- not PIC .STRINGZ "Division would overflow (%d)\n" |
| 1618 | .END | 1618 | .END |
diff --git a/src/lib/libcrypto/bn/asm/vms.mar b/src/lib/libcrypto/bn/asm/vms.mar index 465f2774b6..aefab15cdb 100644 --- a/src/lib/libcrypto/bn/asm/vms.mar +++ b/src/lib/libcrypto/bn/asm/vms.mar | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | .title vax_bn_mul_add_word unsigned multiply & add, 32*32+32+32=>64 | 1 | .title vax_bn_mul_add_words unsigned multiply & add, 32*32+32+32=>64 |
| 2 | ; | 2 | ; |
| 3 | ; w.j.m. 15-jan-1999 | 3 | ; w.j.m. 15-jan-1999 |
| 4 | ; | 4 | ; |
| @@ -59,7 +59,7 @@ w=16 ;(AP) w by value (input) | |||
| 59 | movl r6,r0 ; return c | 59 | movl r6,r0 ; return c |
| 60 | ret | 60 | ret |
| 61 | 61 | ||
| 62 | .title vax_bn_mul_word unsigned multiply & add, 32*32+32=>64 | 62 | .title vax_bn_mul_words unsigned multiply & add, 32*32+32=>64 |
| 63 | ; | 63 | ; |
| 64 | ; w.j.m. 15-jan-1999 | 64 | ; w.j.m. 15-jan-1999 |
| 65 | ; | 65 | ; |
| @@ -172,147 +172,175 @@ n=12 ;(AP) n by value (input) | |||
| 172 | ; } | 172 | ; } |
| 173 | ; | 173 | ; |
| 174 | ; Using EDIV would be very easy, if it didn't do signed calculations. | 174 | ; Using EDIV would be very easy, if it didn't do signed calculations. |
| 175 | ; Therefore, som extra things have to happen around it. The way to | 175 | ; Any time any of the input numbers are signed, there are problems, |
| 176 | ; handle that is to shift all operands right one step (basically dividing | 176 | ; usually with integer overflow, at which point it returns useless |
| 177 | ; them by 2) and handle the different cases depending on what the lowest | 177 | ; data (the quotient gets the value of l, and the remainder becomes 0). |
| 178 | ; bit of each operand was. | ||
| 179 | ; | 178 | ; |
| 180 | ; To start with, let's define the following: | 179 | ; If it was just for the dividend, it would be very easy, just divide |
| 180 | ; it by 2 (unsigned), do the division, multiply the resulting quotient | ||
| 181 | ; and remainder by 2, add the bit that was dropped when dividing by 2 | ||
| 182 | ; to the remainder, and do some adjustment so the remainder doesn't | ||
| 183 | ; end up larger than the divisor. For some cases when the divisor is | ||
| 184 | ; negative (from EDIV's point of view, i.e. when the highest bit is set), | ||
| 185 | ; dividing the dividend by 2 isn't enough, and since some operations | ||
| 186 | ; might generate integer overflows even when the dividend is divided by | ||
| 187 | ; 4 (when the high part of the shifted down dividend ends up being exactly | ||
| 188 | ; half of the divisor, the result is the quotient 0x80000000, which is | ||
| 189 | ; negative...) it needs to be divided by 8. Furthermore, the divisor needs | ||
| 190 | ; to be divided by 2 (unsigned) as well, to avoid more problems with the sign. | ||
| 191 | ; In this case, a little extra fiddling with the remainder is required. | ||
| 181 | ; | 192 | ; |
| 182 | ; a' = l & 1 | 193 | ; So, the simplest way to handle this is always to divide the dividend |
| 183 | ; a2 = <h,l> >> 1 # UNSIGNED shift! | 194 | ; by 8, and to divide the divisor by 2 if its highest bit is set. |
| 184 | ; b' = d & 1 | 195 | ; After EDIV has been used, the quotient gets multiplied by 8 if the |
| 185 | ; b2 = d >> 1 # UNSIGNED shift! | 196 | ; original divisor was positive, otherwise 4. The remainder, oddly |
| 197 | ; enough, is *always* multiplied by 8. | ||
| 198 | ; NOTE: in the case mentioned above, where the high part of the shifted | ||
| 199 | ; down dividend ends up being exactly half the shifted down divisor, we | ||
| 200 | ; end up with a 33 bit quotient. That's no problem however, it usually | ||
| 201 | ; means we have ended up with a too large remainder as well, and the | ||
| 202 | ; problem is fixed by the last part of the algorithm (next paragraph). | ||
| 186 | ; | 203 | ; |
| 187 | ; Now, use EDIV to calculate a quotient and a remainder: | 204 | ; The routine ends with comparing the resulting remainder with the |
| 205 | ; original divisor and if the remainder is larger, subtract the | ||
| 206 | ; original divisor from it, and increase the quotient by 1. This is | ||
| 207 | ; done until the remainder is smaller than the divisor. | ||
| 188 | ; | 208 | ; |
| 189 | ; q'' = a2/b2 | 209 | ; The complete algorithm looks like this: |
| 190 | ; r'' = a2 - q''*b2 | ||
| 191 | ; | 210 | ; |
| 192 | ; If b' is 0, the quotient is already correct, we just need to adjust the | 211 | ; d' = d |
| 193 | ; remainder: | 212 | ; l' = l & 7 |
| 213 | ; [h,l] = [h,l] >> 3 | ||
| 214 | ; [q,r] = floor([h,l] / d) # This is the EDIV operation | ||
| 215 | ; if (q < 0) q = -q # I doubt this is necessary any more | ||
| 194 | ; | 216 | ; |
| 195 | ; if (b' == 0) | 217 | ; r' = r >> 29 |
| 196 | ; { | 218 | ; if (d' >= 0) |
| 197 | ; r = 2*r'' + a' | 219 | ; q' = q >> 29 |
| 198 | ; q = q'' | 220 | ; q = q << 3 |
| 199 | ; } | 221 | ; else |
| 200 | ; | 222 | ; q' = q >> 30 |
| 201 | ; If b' is 1, we need to do other adjustements. The first thought is the | 223 | ; q = q << 2 |
| 202 | ; following (note that r' will not always have the right value, but an | 224 | ; r = (r << 3) + l' |
| 203 | ; adjustement follows further down): | ||
| 204 | ; | ||
| 205 | ; if (b' == 1) | ||
| 206 | ; { | ||
| 207 | ; q' = q'' | ||
| 208 | ; r' = a - q'*b | ||
| 209 | ; | ||
| 210 | ; However, one can note the folowing relationship: | ||
| 211 | ; | ||
| 212 | ; r'' = a2 - q''*b2 | ||
| 213 | ; => 2*r'' = 2*a2 - 2*q''*b2 | ||
| 214 | ; = { a = 2*a2 + a', b = 2*b2 + b' = 2*b2 + 1, | ||
| 215 | ; q' = q'' } | ||
| 216 | ; = a - a' - q'*(b - 1) | ||
| 217 | ; = a - q'*b - a' + q' | ||
| 218 | ; = r' - a' + q' | ||
| 219 | ; => r' = 2*r'' - q' + a' | ||
| 220 | ; | 225 | ; |
| 221 | ; This enables us to use r'' instead of discarding and calculating another | 226 | ; if (d' < 0) |
| 222 | ; modulo: | ||
| 223 | ; | ||
| 224 | ; if (b' == 1) | ||
| 225 | ; { | 227 | ; { |
| 226 | ; q' = q'' | 228 | ; [r',r] = [r',r] - q |
| 227 | ; r' = (r'' << 1) - q' + a' | 229 | ; while ([r',r] < 0) |
| 228 | ; | ||
| 229 | ; Now, all we have to do is adjust r', because it might be < 0: | ||
| 230 | ; | ||
| 231 | ; while (r' < 0) | ||
| 232 | ; { | 230 | ; { |
| 233 | ; r' = r' + b | 231 | ; [r',r] = [r',r] + d |
| 234 | ; q' = q' - 1 | 232 | ; [q',q] = [q',q] - 1 |
| 235 | ; } | 233 | ; } |
| 236 | ; } | 234 | ; } |
| 237 | ; | 235 | ; |
| 238 | ; return q' | 236 | ; while ([r',r] >= d') |
| 237 | ; { | ||
| 238 | ; [r',r] = [r',r] - d' | ||
| 239 | ; [q',q] = [q',q] + 1 | ||
| 240 | ; } | ||
| 241 | ; | ||
| 242 | ; return q | ||
| 239 | 243 | ||
| 240 | h=4 ;(AP) h by value (input) | 244 | h=4 ;(AP) h by value (input) |
| 241 | l=8 ;(AP) l by value (input) | 245 | l=8 ;(AP) l by value (input) |
| 242 | d=12 ;(AP) d by value (input) | 246 | d=12 ;(AP) d by value (input) |
| 243 | 247 | ||
| 244 | ;aprim=r5 | 248 | ;r2 = l, q |
| 245 | ;a2=r6 | 249 | ;r3 = h, r |
| 246 | ;a20=r6 | 250 | ;r4 = d |
| 247 | ;a21=r7 | 251 | ;r5 = l' |
| 248 | ;bprim=r8 | 252 | ;r6 = r' |
| 249 | ;b2=r9 | 253 | ;r7 = d' |
| 250 | ;qprim=r10 ; initially used as q'' | 254 | ;r8 = q' |
| 251 | ;rprim=r11 ; initially used as r'' | ||
| 252 | |||
| 253 | 255 | ||
| 254 | .psect code,nowrt | 256 | .psect code,nowrt |
| 255 | 257 | ||
| 256 | .entry bn_div_words,^m<r2,r3,r4,r5,r6,r7,r8,r9,r10,r11> | 258 | .entry bn_div_words,^m<r2,r3,r4,r5,r6,r7,r8> |
| 257 | movl l(ap),r2 | 259 | movl l(ap),r2 |
| 258 | movl h(ap),r3 | 260 | movl h(ap),r3 |
| 259 | movl d(ap),r4 | 261 | movl d(ap),r4 |
| 260 | 262 | ||
| 261 | movl #0,r5 | 263 | bicl3 #^XFFFFFFF8,r2,r5 ; l' = l & 7 |
| 262 | movl #0,r8 | 264 | bicl3 #^X00000007,r2,r2 |
| 263 | movl #0,r0 | ||
| 264 | ; movl #0,r1 | ||
| 265 | 265 | ||
| 266 | rotl #-1,r2,r6 ; a20 = l >> 1 (almost) | 266 | bicl3 #^XFFFFFFF8,r3,r6 |
| 267 | rotl #-1,r3,r7 ; a21 = h >> 1 (almost) | 267 | bicl3 #^X00000007,r3,r3 |
| 268 | rotl #-1,r4,r9 ; b2 = d >> 1 (almost) | 268 | |
| 269 | addl r6,r2 | ||
| 269 | 270 | ||
| 270 | tstl r6 | 271 | rotl #-3,r2,r2 ; l = l >> 3 |
| 271 | bgeq 1$ | 272 | rotl #-3,r3,r3 ; h = h >> 3 |
| 272 | xorl2 #^X80000000,r6 ; fixup a20 so highest bit is 0 | 273 | |
| 273 | incl r5 ; a' = 1 | 274 | movl r4,r7 ; d' = d |
| 274 | 1$: | 275 | |
| 275 | tstl r7 | 276 | movl #0,r6 ; r' = 0 |
| 276 | bgeq 2$ | 277 | movl #0,r8 ; q' = 0 |
| 277 | xorl2 #^X80000000,r6 ; fixup a20 so highest bit is 1, | 278 | |
| 278 | ; since that's what was lowest in a21 | 279 | tstl r4 |
| 279 | xorl2 #^X80000000,r7 ; fixup a21 so highest bit is 1 | ||
| 280 | 2$: | ||
| 281 | tstl r9 | ||
| 282 | beql 666$ ; Uh-oh, the divisor is 0... | 280 | beql 666$ ; Uh-oh, the divisor is 0... |
| 283 | bgtr 3$ | 281 | bgtr 1$ |
| 284 | xorl2 #^X80000000,r9 ; fixup b2 so highest bit is 0 | 282 | rotl #-1,r4,r4 ; If d is negative, shift it right. |
| 285 | incl r8 ; b' = 1 | 283 | bicl2 #^X80000000,r4 ; Since d is then a large number, the |
| 286 | 3$: | 284 | ; lowest bit is insignificant |
| 287 | tstl r9 | 285 | ; (contradict that, and I'll fix the problem!) |
| 288 | bneq 4$ ; if b2 is 0, we know that b' is 1 | 286 | 1$: |
| 289 | tstl r3 | 287 | ediv r4,r2,r2,r3 ; Do the actual division |
| 290 | bneq 666$ ; if higher half isn't 0, we overflow | 288 | |
| 291 | movl r2,r10 ; otherwise, we have our result | 289 | tstl r2 |
| 292 | brb 42$ ; This is a success, really. | 290 | bgeq 3$ |
| 293 | 4$: | 291 | mnegl r2,r2 ; if q < 0, negate it |
| 294 | ediv r9,r6,r10,r11 | 292 | 3$: |
| 295 | 293 | tstl r7 | |
| 296 | tstl r8 | 294 | blss 4$ |
| 297 | bneq 5$ ; If b' != 0, go to the other part | 295 | rotl #3,r2,r2 ; q = q << 3 |
| 298 | ; addl3 r11,r11,r1 | 296 | bicl3 #^XFFFFFFF8,r2,r8 ; q' gets the high bits from q |
| 299 | ; addl2 r5,r1 | 297 | bicl3 #^X00000007,r2,r2 |
| 300 | brb 42$ | 298 | bsb 41$ |
| 301 | 5$: | 299 | 4$: ; else |
| 302 | ashl #1,r11,r11 | 300 | rotl #2,r2,r2 ; q = q << 2 |
| 303 | subl2 r10,r11 | 301 | bicl3 #^XFFFFFFFC,r2,r8 ; q' gets the high bits from q |
| 304 | addl2 r5,r11 | 302 | bicl3 #^X00000003,r2,r2 |
| 305 | bgeq 7$ | 303 | 41$: |
| 306 | 6$: | 304 | rotl #3,r3,r3 ; r = r << 3 |
| 307 | decl r10 | 305 | bicl3 #^XFFFFFFF8,r3,r6 ; r' gets the high bits from r |
| 308 | addl2 r4,r11 | 306 | bicl3 #^X00000007,r3,r3 |
| 309 | blss 6$ | 307 | addl r5,r3 ; r = r + l' |
| 310 | 7$: | 308 | |
| 311 | ; movl r11,r1 | 309 | tstl r7 |
| 310 | bgeq 5$ | ||
| 311 | bitl #1,r7 | ||
| 312 | beql 5$ ; if d' < 0 && d' & 1 | ||
| 313 | subl r2,r3 ; [r',r] = [r',r] - [q',q] | ||
| 314 | sbwc r8,r6 | ||
| 315 | 45$: | ||
| 316 | bgeq 5$ ; while r < 0 | ||
| 317 | decl r2 ; [q',q] = [q',q] - 1 | ||
| 318 | sbwc #0,r8 | ||
| 319 | addl r7,r3 ; [r',r] = [r',r] + d' | ||
| 320 | adwc #0,r6 | ||
| 321 | brb 45$ | ||
| 322 | |||
| 323 | ; The return points are placed in the middle to keep a short distance from | ||
| 324 | ; all the branch points | ||
| 312 | 42$: | 325 | 42$: |
| 313 | movl r10,r0 | 326 | ; movl r3,r1 |
| 327 | movl r2,r0 | ||
| 328 | ret | ||
| 314 | 666$: | 329 | 666$: |
| 330 | movl #^XFFFFFFFF,r0 | ||
| 315 | ret | 331 | ret |
| 332 | |||
| 333 | 5$: | ||
| 334 | tstl r6 | ||
| 335 | bneq 6$ | ||
| 336 | cmpl r3,r7 | ||
| 337 | blssu 42$ ; while [r',r] >= d' | ||
| 338 | 6$: | ||
| 339 | subl r7,r3 ; [r',r] = [r',r] - d' | ||
| 340 | sbwc #0,r6 | ||
| 341 | incl r2 ; [q',q] = [q',q] + 1 | ||
| 342 | adwc #0,r8 | ||
| 343 | brb 5$ | ||
| 316 | 344 | ||
| 317 | .title vax_bn_add_words unsigned add of two arrays | 345 | .title vax_bn_add_words unsigned add of two arrays |
| 318 | ; | 346 | ; |
