summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/asm
diff options
context:
space:
mode:
authormarkus <>2003-05-12 02:18:40 +0000
committermarkus <>2003-05-12 02:18:40 +0000
commitd4fcd82bb7f6d603bd61e19a81ba97337b89dfca (patch)
treed52e3a0f1f08f65ad283027e560e17ed0d720462 /src/lib/libcrypto/bn/asm
parent582bbd139cd2afd58d10dc051c5b0b989b441074 (diff)
downloadopenbsd-d4fcd82bb7f6d603bd61e19a81ba97337b89dfca.tar.gz
openbsd-d4fcd82bb7f6d603bd61e19a81ba97337b89dfca.tar.bz2
openbsd-d4fcd82bb7f6d603bd61e19a81ba97337b89dfca.zip
merge 0.9.7b with local changes; crank majors for libssl/libcrypto
Diffstat (limited to 'src/lib/libcrypto/bn/asm')
-rw-r--r--src/lib/libcrypto/bn/asm/ia64.S235
-rw-r--r--src/lib/libcrypto/bn/asm/pa-risc2.s36
-rw-r--r--src/lib/libcrypto/bn/asm/vms.mar254
3 files changed, 330 insertions, 195 deletions
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S
index ae56066310..7dfda85566 100644
--- a/src/lib/libcrypto/bn/asm/ia64.S
+++ b/src/lib/libcrypto/bn/asm/ia64.S
@@ -1,6 +1,6 @@
1.explicit 1.explicit
2.text 2.text
3.ident "ia64.S, Version 1.1" 3.ident "ia64.S, Version 2.0"
4.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" 4.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5 5
6// 6//
@@ -13,6 +13,35 @@
13// disclaimed. 13// disclaimed.
14// ==================================================================== 14// ====================================================================
15// 15//
16// Version 2.x is Itanium2 re-tune. Few words about how Itanum2 is
17// different from Itanium to this module viewpoint. Most notably, is it
18// "wider" than Itanium? Can you experience loop scalability as
19// discussed in commentary sections? Not really:-( Itanium2 has 6
20// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
21// spin twice as fast, as I need 8 IALU ports. Amount of floating point
22// ports is the same, i.e. 2, while I need 4. In other words, to this
23// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
24// essentially different in respect to this module, and a re-tune was
25// required. Well, because some intruction latencies has changed. Most
26// noticeably those intensively used:
27//
28// Itanium Itanium2
29// ldf8 9 6 L2 hit
30// ld8 2 1 L1 hit
31// getf 2 5
32// xma[->getf] 7[+1] 4[+0]
33// add[->st8] 1[+1] 1[+0]
34//
35// What does it mean? You might ratiocinate that the original code
36// should run just faster... Because sum of latencies is smaller...
37// Wrong! Note that getf latency increased. This means that if a loop is
38// scheduled for lower latency (and they are), then it will suffer from
39// stall condition and the code will therefore turn anti-scalable, e.g.
40// original bn_mul_words spun at 5*n or 2.5 times slower than expected
41// on Itanium2! What to do? Reschedule loops for Itanium2? But then
42// Itanium would exhibit anti-scalability. So I've chosen to reschedule
43// for worst latency for every instruction aiming for best *all-round*
44// performance.
16 45
17// Q. How much faster does it get? 46// Q. How much faster does it get?
18// A. Here is the output from 'openssl speed rsa dsa' for vanilla 47// A. Here is the output from 'openssl speed rsa dsa' for vanilla
@@ -149,12 +178,27 @@ bn_add_words:
149 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 178 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
150 } 179 }
151 .body 180 .body
152{ .mib; mov r14=r32 // rp 181{ .mib;
182#if defined(_HPUX_SOURCE) && defined(_ILP32)
183 addp4 r14=0,r32 // rp
184#else
185 mov r14=r32 // rp
186#endif
153 mov r9=pr };; 187 mov r9=pr };;
154{ .mii; mov r15=r33 // ap 188{ .mii;
189#if defined(_HPUX_SOURCE) && defined(_ILP32)
190 addp4 r15=0,r33 // ap
191#else
192 mov r15=r33 // ap
193#endif
155 mov ar.lc=r10 194 mov ar.lc=r10
156 mov ar.ec=6 } 195 mov ar.ec=6 }
157{ .mib; mov r16=r34 // bp 196{ .mib;
197#if defined(_HPUX_SOURCE) && defined(_ILP32)
198 addp4 r16=0,r34 // bp
199#else
200 mov r16=r34 // bp
201#endif
158 mov pr.rot=1<<16 };; 202 mov pr.rot=1<<16 };;
159 203
160.L_bn_add_words_ctop: 204.L_bn_add_words_ctop:
@@ -174,7 +218,7 @@ bn_add_words:
174 218
175{ .mii; 219{ .mii;
176(p59) add r8=1,r8 // return value 220(p59) add r8=1,r8 // return value
177 mov pr=r9,-1 221 mov pr=r9,0x1ffff
178 mov ar.lc=r3 } 222 mov ar.lc=r3 }
179{ .mbb; nop.b 0x0 223{ .mbb; nop.b 0x0
180 br.ret.sptk.many b0 };; 224 br.ret.sptk.many b0 };;
@@ -202,12 +246,27 @@ bn_sub_words:
202 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 246 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
203 } 247 }
204 .body 248 .body
205{ .mib; mov r14=r32 // rp 249{ .mib;
250#if defined(_HPUX_SOURCE) && defined(_ILP32)
251 addp4 r14=0,r32 // rp
252#else
253 mov r14=r32 // rp
254#endif
206 mov r9=pr };; 255 mov r9=pr };;
207{ .mii; mov r15=r33 // ap 256{ .mii;
257#if defined(_HPUX_SOURCE) && defined(_ILP32)
258 addp4 r15=0,r33 // ap
259#else
260 mov r15=r33 // ap
261#endif
208 mov ar.lc=r10 262 mov ar.lc=r10
209 mov ar.ec=6 } 263 mov ar.ec=6 }
210{ .mib; mov r16=r34 // bp 264{ .mib;
265#if defined(_HPUX_SOURCE) && defined(_ILP32)
266 addp4 r16=0,r34 // bp
267#else
268 mov r16=r34 // bp
269#endif
211 mov pr.rot=1<<16 };; 270 mov pr.rot=1<<16 };;
212 271
213.L_bn_sub_words_ctop: 272.L_bn_sub_words_ctop:
@@ -227,7 +286,7 @@ bn_sub_words:
227 286
228{ .mii; 287{ .mii;
229(p59) add r8=1,r8 // return value 288(p59) add r8=1,r8 // return value
230 mov pr=r9,-1 289 mov pr=r9,0x1ffff
231 mov ar.lc=r3 } 290 mov ar.lc=r3 }
232{ .mbb; nop.b 0x0 291{ .mbb; nop.b 0x0
233 br.ret.sptk.many b0 };; 292 br.ret.sptk.many b0 };;
@@ -253,7 +312,7 @@ bn_mul_words:
253#ifdef XMA_TEMPTATION 312#ifdef XMA_TEMPTATION
254{ .mfi; alloc r2=ar.pfs,4,0,0,0 };; 313{ .mfi; alloc r2=ar.pfs,4,0,0,0 };;
255#else 314#else
256{ .mfi; alloc r2=ar.pfs,4,4,0,8 };; 315{ .mfi; alloc r2=ar.pfs,4,12,0,16 };;
257#endif 316#endif
258{ .mib; mov r8=r0 // return value 317{ .mib; mov r8=r0 // return value
259 cmp4.le p6,p0=r34,r0 318 cmp4.le p6,p0=r34,r0
@@ -266,24 +325,30 @@ bn_mul_words:
266 325
267 .body 326 .body
268{ .mib; setf.sig f8=r35 // w 327{ .mib; setf.sig f8=r35 // w
269 mov pr.rot=0x400001<<16 328 mov pr.rot=0x800001<<16
270 // ------^----- serves as (p48) at first (p26) 329 // ------^----- serves as (p50) at first (p27)
271 brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16 330 brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
272 } 331 }
273 332
274#ifndef XMA_TEMPTATION 333#ifndef XMA_TEMPTATION
275 334
276{ .mii; mov r14=r32 // rp 335{ .mii;
277 mov r15=r33 // ap 336#if defined(_HPUX_SOURCE) && defined(_ILP32)
337 addp4 r14=0,r32 // rp
338 addp4 r15=0,r33 // ap
339#else
340 mov r14=r32 // rp
341 mov r15=r33 // ap
342#endif
278 mov ar.lc=r10 } 343 mov ar.lc=r10 }
279{ .mii; mov r39=0 // serves as r33 at first (p26) 344{ .mii; mov r40=0 // serves as r35 at first (p27)
280 mov ar.ec=12 };; 345 mov ar.ec=13 };;
281 346
282// This loop spins in 2*(n+11) ticks. It's scheduled for data in L2 347// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
283// cache (i.e. 9 ticks away) as floating point load/store instructions 348// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
284// bypass L1 cache and L2 latency is actually best-case scenario for 349// bypass L1 cache and L2 latency is actually best-case scenario for
285// ldf8. The loop is not scalable and shall run in 2*(n+11) even on 350// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
286// "wider" IA-64 implementations. It's a trade-off here. n+22 loop 351// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
287// would give us ~5% in *overall* performance improvement on "wider" 352// would give us ~5% in *overall* performance improvement on "wider"
288// IA-64, but would hurt Itanium for about same because of longer 353// IA-64, but would hurt Itanium for about same because of longer
289// epilogue. As it's a matter of few percents in either case I've 354// epilogue. As it's a matter of few percents in either case I've
@@ -291,25 +356,25 @@ bn_mul_words:
291// this very instruction sequence in bn_mul_add_words loop which in 356// this very instruction sequence in bn_mul_add_words loop which in
292// turn is scalable). 357// turn is scalable).
293.L_bn_mul_words_ctop: 358.L_bn_mul_words_ctop:
294{ .mfi; (p25) getf.sig r36=f49 // low 359{ .mfi; (p25) getf.sig r36=f52 // low
295 (p21) xmpy.lu f45=f37,f8 360 (p21) xmpy.lu f48=f37,f8
296 (p27) cmp.ltu p52,p48=r39,r38 } 361 (p28) cmp.ltu p54,p50=r41,r39 }
297{ .mfi; (p16) ldf8 f32=[r15],8 362{ .mfi; (p16) ldf8 f32=[r15],8
298 (p21) xmpy.hu f38=f37,f8 363 (p21) xmpy.hu f40=f37,f8
299 (p0) nop.i 0x0 };; 364 (p0) nop.i 0x0 };;
300{ .mii; (p26) getf.sig r32=f43 // high 365{ .mii; (p25) getf.sig r32=f44 // high
301 .pred.rel "mutex",p48,p52 366 .pred.rel "mutex",p50,p54
302 (p48) add r38=r37,r33 // (p26) 367 (p50) add r40=r38,r35 // (p27)
303 (p52) add r38=r37,r33,1 } // (p26) 368 (p54) add r40=r38,r35,1 } // (p27)
304{ .mfb; (p27) st8 [r14]=r39,8 369{ .mfb; (p28) st8 [r14]=r41,8
305 (p0) nop.f 0x0 370 (p0) nop.f 0x0
306 br.ctop.sptk .L_bn_mul_words_ctop };; 371 br.ctop.sptk .L_bn_mul_words_ctop };;
307.L_bn_mul_words_cend: 372.L_bn_mul_words_cend:
308 373
309{ .mii; nop.m 0x0 374{ .mii; nop.m 0x0
310.pred.rel "mutex",p49,p53 375.pred.rel "mutex",p51,p55
311(p49) add r8=r34,r0 376(p51) add r8=r36,r0
312(p53) add r8=r34,r0,1 } 377(p55) add r8=r36,r0,1 }
313{ .mfb; nop.m 0x0 378{ .mfb; nop.m 0x0
314 nop.f 0x0 379 nop.f 0x0
315 nop.b 0x0 } 380 nop.b 0x0 }
@@ -344,7 +409,7 @@ bn_mul_words:
344#endif // XMA_TEMPTATION 409#endif // XMA_TEMPTATION
345 410
346{ .mii; nop.m 0x0 411{ .mii; nop.m 0x0
347 mov pr=r9,-1 412 mov pr=r9,0x1ffff
348 mov ar.lc=r3 } 413 mov ar.lc=r3 }
349{ .mfb; rum 1<<5 // clear um.mfh 414{ .mfb; rum 1<<5 // clear um.mfh
350 nop.f 0x0 415 nop.f 0x0
@@ -376,59 +441,69 @@ bn_mul_add_words:
376 441
377 .body 442 .body
378{ .mib; setf.sig f8=r35 // w 443{ .mib; setf.sig f8=r35 // w
379 mov pr.rot=0x400001<<16 444 mov pr.rot=0x800001<<16
380 // ------^----- serves as (p48) at first (p26) 445 // ------^----- serves as (p50) at first (p27)
381 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 446 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
382 } 447 }
383{ .mii; mov r14=r32 // rp 448{ .mii;
384 mov r15=r33 // ap 449#if defined(_HPUX_SOURCE) && defined(_ILP32)
450 addp4 r14=0,r32 // rp
451 addp4 r15=0,r33 // ap
452#else
453 mov r14=r32 // rp
454 mov r15=r33 // ap
455#endif
385 mov ar.lc=r10 } 456 mov ar.lc=r10 }
386{ .mii; mov r39=0 // serves as r33 at first (p26) 457{ .mii; mov r40=0 // serves as r35 at first (p27)
387 mov r18=r32 // rp copy 458#if defined(_HPUX_SOURCE) && defined(_ILP32)
388 mov ar.ec=14 };; 459 addp4 r18=0,r32 // rp copy
460#else
461 mov r18=r32 // rp copy
462#endif
463 mov ar.ec=15 };;
389 464
390// This loop spins in 3*(n+13) ticks on Itanium and should spin in 465// This loop spins in 3*(n+14) ticks on Itanium and should spin in
391// 2*(n+13) on "wider" IA-64 implementations (to be verified with new 466// 2*(n+14) on "wider" IA-64 implementations (to be verified with new
392// µ-architecture manuals as they become available). As usual it's 467// µ-architecture manuals as they become available). As usual it's
393// possible to compress the epilogue, down to 10 in this case, at the 468// possible to compress the epilogue, down to 10 in this case, at the
394// cost of scalability. Compressed (and therefore non-scalable) loop 469// cost of scalability. Compressed (and therefore non-scalable) loop
395// running at 3*(n+10) would buy you ~10% on Itanium but take ~35% 470// running at 3*(n+11) would buy you ~10% on Itanium but take ~35%
396// from "wider" IA-64 so let it be scalable! Special attention was 471// from "wider" IA-64 so let it be scalable! Special attention was
397// paid for having the loop body split at 64-byte boundary. ld8 is 472// paid for having the loop body split at 64-byte boundary. ld8 is
398// scheduled for L1 cache as the data is more than likely there. 473// scheduled for L1 cache as the data is more than likely there.
399// Indeed, bn_mul_words has put it there a moment ago:-) 474// Indeed, bn_mul_words has put it there a moment ago:-)
400.L_bn_mul_add_words_ctop: 475.L_bn_mul_add_words_ctop:
401{ .mfi; (p25) getf.sig r36=f49 // low 476{ .mfi; (p25) getf.sig r36=f52 // low
402 (p21) xmpy.lu f45=f37,f8 477 (p21) xmpy.lu f48=f37,f8
403 (p27) cmp.ltu p52,p48=r39,r38 } 478 (p28) cmp.ltu p54,p50=r41,r39 }
404{ .mfi; (p16) ldf8 f32=[r15],8 479{ .mfi; (p16) ldf8 f32=[r15],8
405 (p21) xmpy.hu f38=f37,f8 480 (p21) xmpy.hu f40=f37,f8
406 (p27) add r43=r43,r39 };; 481 (p28) add r45=r45,r41 };;
407{ .mii; (p26) getf.sig r32=f43 // high 482{ .mii; (p25) getf.sig r32=f44 // high
408 .pred.rel "mutex",p48,p52 483 .pred.rel "mutex",p50,p54
409 (p48) add r38=r37,r33 // (p26) 484 (p50) add r40=r38,r35 // (p27)
410 (p52) add r38=r37,r33,1 } // (p26) 485 (p54) add r40=r38,r35,1 } // (p27)
411{ .mfb; (p27) cmp.ltu.unc p56,p0=r43,r39 486{ .mfb; (p28) cmp.ltu.unc p60,p0=r45,r41
412 (p0) nop.f 0x0 487 (p0) nop.f 0x0
413 (p0) nop.b 0x0 } 488 (p0) nop.b 0x0 }
414{ .mii; (p26) ld8 r42=[r18],8 489{ .mii; (p27) ld8 r44=[r18],8
415 (p58) cmp.eq.or p57,p0=-1,r44 490 (p62) cmp.eq.or p61,p0=-1,r46
416 (p58) add r44=1,r44 } 491 (p62) add r46=1,r46 }
417{ .mfb; (p29) st8 [r14]=r45,8 492{ .mfb; (p30) st8 [r14]=r47,8
418 (p0) nop.f 0x0 493 (p0) nop.f 0x0
419 br.ctop.sptk .L_bn_mul_add_words_ctop};; 494 br.ctop.sptk .L_bn_mul_add_words_ctop};;
420.L_bn_mul_add_words_cend: 495.L_bn_mul_add_words_cend:
421 496
422{ .mii; nop.m 0x0 497{ .mii; nop.m 0x0
423.pred.rel "mutex",p51,p55 498.pred.rel "mutex",p53,p57
424(p51) add r8=r36,r0 499(p53) add r8=r38,r0
425(p55) add r8=r36,r0,1 } 500(p57) add r8=r38,r0,1 }
426{ .mfb; nop.m 0x0 501{ .mfb; nop.m 0x0
427 nop.f 0x0 502 nop.f 0x0
428 nop.b 0x0 };; 503 nop.b 0x0 };;
429{ .mii; 504{ .mii;
430(p59) add r8=1,r8 505(p63) add r8=1,r8
431 mov pr=r9,-1 506 mov pr=r9,0x1ffff
432 mov ar.lc=r3 } 507 mov ar.lc=r3 }
433{ .mfb; rum 1<<5 // clear um.mfh 508{ .mfb; rum 1<<5 // clear um.mfh
434 nop.f 0x0 509 nop.f 0x0
@@ -461,6 +536,10 @@ bn_sqr_words:
461 mov r9=pr };; 536 mov r9=pr };;
462 537
463 .body 538 .body
539#if defined(_HPUX_SOURCE) && defined(_ILP32)
540{ .mii; addp4 r32=0,r32
541 addp4 r33=0,r33 };;
542#endif
464{ .mib; 543{ .mib;
465 mov pr.rot=1<<16 544 mov pr.rot=1<<16
466 brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 545 brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
@@ -492,7 +571,7 @@ bn_sqr_words:
492.L_bn_sqr_words_cend: 571.L_bn_sqr_words_cend:
493 572
494{ .mii; nop.m 0x0 573{ .mii; nop.m 0x0
495 mov pr=r9,-1 574 mov pr=r9,0x1ffff
496 mov ar.lc=r3 } 575 mov ar.lc=r3 }
497{ .mfb; rum 1<<5 // clear um.mfh 576{ .mfb; rum 1<<5 // clear um.mfh
498 nop.f 0x0 577 nop.f 0x0
@@ -526,7 +605,14 @@ bn_sqr_comba8:
526 .prologue 605 .prologue
527 .fframe 0 606 .fframe 0
528 .save ar.pfs,r2 607 .save ar.pfs,r2
608#if defined(_HPUX_SOURCE) && defined(_ILP32)
529{ .mii; alloc r2=ar.pfs,2,1,0,0 609{ .mii; alloc r2=ar.pfs,2,1,0,0
610 addp4 r33=0,r33
611 addp4 r32=0,r32 };;
612{ .mii;
613#else
614{ .mii; alloc r2=ar.pfs,2,1,0,0
615#endif
530 mov r34=r33 616 mov r34=r33
531 add r14=8,r33 };; 617 add r14=8,r33 };;
532 .body 618 .body
@@ -587,7 +673,14 @@ bn_mul_comba8:
587 .prologue 673 .prologue
588 .fframe 0 674 .fframe 0
589 .save ar.pfs,r2 675 .save ar.pfs,r2
676#if defined(_HPUX_SOURCE) && defined(_ILP32)
590{ .mii; alloc r2=ar.pfs,3,0,0,0 677{ .mii; alloc r2=ar.pfs,3,0,0,0
678 addp4 r33=0,r33
679 addp4 r34=0,r34 };;
680{ .mii; addp4 r32=0,r32
681#else
682{ .mii; alloc r2=ar.pfs,3,0,0,0
683#endif
591 add r14=8,r33 684 add r14=8,r33
592 add r17=8,r34 } 685 add r17=8,r34 }
593 .body 686 .body
@@ -1138,7 +1231,14 @@ bn_sqr_comba4:
1138 .prologue 1231 .prologue
1139 .fframe 0 1232 .fframe 0
1140 .save ar.pfs,r2 1233 .save ar.pfs,r2
1234#if defined(_HPUX_SOURCE) && defined(_ILP32)
1235{ .mii; alloc r2=ar.pfs,2,1,0,0
1236 addp4 r32=0,r32
1237 addp4 r33=0,r33 };;
1238{ .mii;
1239#else
1141{ .mii; alloc r2=ar.pfs,2,1,0,0 1240{ .mii; alloc r2=ar.pfs,2,1,0,0
1241#endif
1142 mov r34=r33 1242 mov r34=r33
1143 add r14=8,r33 };; 1243 add r14=8,r33 };;
1144 .body 1244 .body
@@ -1164,7 +1264,14 @@ bn_mul_comba4:
1164 .prologue 1264 .prologue
1165 .fframe 0 1265 .fframe 0
1166 .save ar.pfs,r2 1266 .save ar.pfs,r2
1267#if defined(_HPUX_SOURCE) && defined(_ILP32)
1268{ .mii; alloc r2=ar.pfs,3,0,0,0
1269 addp4 r33=0,r33
1270 addp4 r34=0,r34 };;
1271{ .mii; addp4 r32=0,r32
1272#else
1167{ .mii; alloc r2=ar.pfs,3,0,0,0 1273{ .mii; alloc r2=ar.pfs,3,0,0,0
1274#endif
1168 add r14=8,r33 1275 add r14=8,r33
1169 add r17=8,r34 } 1276 add r17=8,r34 }
1170 .body 1277 .body
@@ -1464,7 +1571,7 @@ bn_div_words:
1464 or r8=r8,r33 1571 or r8=r8,r33
1465 mov ar.pfs=r2 };; 1572 mov ar.pfs=r2 };;
1466{ .mii; shr.u r9=H,I // remainder if anybody wants it 1573{ .mii; shr.u r9=H,I // remainder if anybody wants it
1467 mov pr=r10,-1 } 1574 mov pr=r10,0x1ffff }
1468{ .mfb; br.ret.sptk.many b0 };; 1575{ .mfb; br.ret.sptk.many b0 };;
1469 1576
1470// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division 1577// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2.s b/src/lib/libcrypto/bn/asm/pa-risc2.s
index af9730d062..f3b16290eb 100644
--- a/src/lib/libcrypto/bn/asm/pa-risc2.s
+++ b/src/lib/libcrypto/bn/asm/pa-risc2.s
@@ -747,8 +747,8 @@ bn_div_words
747 .PROC 747 .PROC
748 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN 748 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
749 .IMPORT BN_num_bits_word,CODE 749 .IMPORT BN_num_bits_word,CODE
750 .IMPORT __iob,DATA 750 ;--- not PIC .IMPORT __iob,DATA
751 .IMPORT fprintf,CODE 751 ;--- not PIC .IMPORT fprintf,CODE
752 .IMPORT abort,CODE 752 .IMPORT abort,CODE
753 .IMPORT $$div2U,MILLICODE 753 .IMPORT $$div2U,MILLICODE
754 .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE 754 .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
@@ -844,12 +844,12 @@ $0006001A
844 MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 844 MOVIB,TR 2,%r8,$0006001C ;offset 0xa18
845 EXTRD,U %r3,63,32,%r7 ;offset 0xa1c 845 EXTRD,U %r3,63,32,%r7 ;offset 0xa1c
846$D2 846$D2
847 ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 847 ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20
848 LDIL LR'C$7,%r21 ;offset 0xa24 848 ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24
849 LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 849 ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28
850 .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; 850 ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28;
851 B,L fprintf,%r2 ;offset 0xa2c 851 ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c
852 LDO RR'C$7(%r21),%r25 ;offset 0xa30 852 ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30
853 .CALL ; 853 .CALL ;
854 B,L abort,%r2 ;offset 0xa34 854 B,L abort,%r2 ;offset 0xa34
855 NOP ;offset 0xa38 855 NOP ;offset 0xa38
@@ -1605,14 +1605,14 @@ bn_mul_comba4
1605 .PROCEND 1605 .PROCEND
1606 1606
1607 1607
1608 .SPACE $TEXT$ 1608;--- not PIC .SPACE $TEXT$
1609 .SUBSPA $CODE$ 1609;--- not PIC .SUBSPA $CODE$
1610 .SPACE $PRIVATE$,SORT=16 1610;--- not PIC .SPACE $PRIVATE$,SORT=16
1611 .IMPORT $global$,DATA 1611;--- not PIC .IMPORT $global$,DATA
1612 .SPACE $TEXT$ 1612;--- not PIC .SPACE $TEXT$
1613 .SUBSPA $CODE$ 1613;--- not PIC .SUBSPA $CODE$
1614 .SUBSPA $LIT$,ACCESS=0x2c 1614;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c
1615C$7 1615;--- not PIC C$7
1616 .ALIGN 8 1616;--- not PIC .ALIGN 8
1617 .STRINGZ "Division would overflow (%d)\n" 1617;--- not PIC .STRINGZ "Division would overflow (%d)\n"
1618 .END 1618 .END
diff --git a/src/lib/libcrypto/bn/asm/vms.mar b/src/lib/libcrypto/bn/asm/vms.mar
index 465f2774b6..aefab15cdb 100644
--- a/src/lib/libcrypto/bn/asm/vms.mar
+++ b/src/lib/libcrypto/bn/asm/vms.mar
@@ -1,4 +1,4 @@
1 .title vax_bn_mul_add_word unsigned multiply & add, 32*32+32+32=>64 1 .title vax_bn_mul_add_words unsigned multiply & add, 32*32+32+32=>64
2; 2;
3; w.j.m. 15-jan-1999 3; w.j.m. 15-jan-1999
4; 4;
@@ -59,7 +59,7 @@ w=16 ;(AP) w by value (input)
59 movl r6,r0 ; return c 59 movl r6,r0 ; return c
60 ret 60 ret
61 61
62 .title vax_bn_mul_word unsigned multiply & add, 32*32+32=>64 62 .title vax_bn_mul_words unsigned multiply & add, 32*32+32=>64
63; 63;
64; w.j.m. 15-jan-1999 64; w.j.m. 15-jan-1999
65; 65;
@@ -172,147 +172,175 @@ n=12 ;(AP) n by value (input)
172; } 172; }
173; 173;
174; Using EDIV would be very easy, if it didn't do signed calculations. 174; Using EDIV would be very easy, if it didn't do signed calculations.
175; Therefore, som extra things have to happen around it. The way to 175; Any time any of the input numbers are signed, there are problems,
176; handle that is to shift all operands right one step (basically dividing 176; usually with integer overflow, at which point it returns useless
177; them by 2) and handle the different cases depending on what the lowest 177; data (the quotient gets the value of l, and the remainder becomes 0).
178; bit of each operand was.
179; 178;
180; To start with, let's define the following: 179; If it was just for the dividend, it would be very easy, just divide
180; it by 2 (unsigned), do the division, multiply the resulting quotient
181; and remainder by 2, add the bit that was dropped when dividing by 2
182; to the remainder, and do some adjustment so the remainder doesn't
183; end up larger than the divisor. For some cases when the divisor is
184; negative (from EDIV's point of view, i.e. when the highest bit is set),
185; dividing the dividend by 2 isn't enough, and since some operations
186; might generate integer overflows even when the dividend is divided by
187; 4 (when the high part of the shifted down dividend ends up being exactly
188; half of the divisor, the result is the quotient 0x80000000, which is
189; negative...) it needs to be divided by 8. Furthermore, the divisor needs
190; to be divided by 2 (unsigned) as well, to avoid more problems with the sign.
191; In this case, a little extra fiddling with the remainder is required.
181; 192;
182; a' = l & 1 193; So, the simplest way to handle this is always to divide the dividend
183; a2 = <h,l> >> 1 # UNSIGNED shift! 194; by 8, and to divide the divisor by 2 if it's highest bit is set.
184; b' = d & 1 195; After EDIV has been used, the quotient gets multiplied by 8 if the
185; b2 = d >> 1 # UNSIGNED shift! 196; original divisor was positive, otherwise 4. The remainder, oddly
197; enough, is *always* multiplied by 8.
198; NOTE: in the case mentioned above, where the high part of the shifted
199; down dividend ends up being exactly half the shifted down divisor, we
200; end up with a 33 bit quotient. That's no problem however, it usually
201; means we have ended up with a too large remainder as well, and the
202; problem is fixed by the last part of the algorithm (next paragraph).
186; 203;
187; Now, use EDIV to calculate a quotient and a remainder: 204; The routine ends with comparing the resulting remainder with the
205; original divisor and if the remainder is larger, subtract the
206; original divisor from it, and increase the quotient by 1. This is
207; done until the remainder is smaller than the divisor.
188; 208;
189; q'' = a2/b2 209; The complete algorithm looks like this:
190; r'' = a2 - q''*b2
191; 210;
192; If b' is 0, the quotient is already correct, we just need to adjust the 211; d' = d
193; remainder: 212; l' = l & 7
213; [h,l] = [h,l] >> 3
214; [q,r] = floor([h,l] / d) # This is the EDIV operation
215; if (q < 0) q = -q # I doubt this is necessary any more
194; 216;
195; if (b' == 0) 217; r' = r >> 29
196; { 218; if (d' >= 0)
197; r = 2*r'' + a' 219; q' = q >> 29
198; q = q'' 220; q = q << 3
199; } 221; else
200; 222; q' = q >> 30
201; If b' is 1, we need to do other adjustements. The first thought is the 223; q = q << 2
202; following (note that r' will not always have the right value, but an 224; r = (r << 3) + l'
203; adjustement follows further down):
204;
205; if (b' == 1)
206; {
207; q' = q''
208; r' = a - q'*b
209;
210; However, one can note the folowing relationship:
211;
212; r'' = a2 - q''*b2
213; => 2*r'' = 2*a2 - 2*q''*b2
214; = { a = 2*a2 + a', b = 2*b2 + b' = 2*b2 + 1,
215; q' = q'' }
216; = a - a' - q'*(b - 1)
217; = a - q'*b - a' + q'
218; = r' - a' + q'
219; => r' = 2*r'' - q' + a'
220; 225;
221; This enables us to use r'' instead of discarding and calculating another 226; if (d' < 0)
222; modulo:
223;
224; if (b' == 1)
225; { 227; {
226; q' = q'' 228; [r',r] = [r',r] - q
227; r' = (r'' << 1) - q' + a' 229; while ([r',r] < 0)
228;
229; Now, all we have to do is adjust r', because it might be < 0:
230;
231; while (r' < 0)
232; { 230; {
233; r' = r' + b 231; [r',r] = [r',r] + d
234; q' = q' - 1 232; [q',q] = [q',q] - 1
235; } 233; }
236; } 234; }
237; 235;
238; return q' 236; while ([r',r] >= d')
237; {
238; [r',r] = [r',r] - d'
239; [q',q] = [q',q] + 1
240; }
241;
242; return q
239 243
240h=4 ;(AP) h by value (input) 244h=4 ;(AP) h by value (input)
241l=8 ;(AP) l by value (input) 245l=8 ;(AP) l by value (input)
242d=12 ;(AP) d by value (input) 246d=12 ;(AP) d by value (input)
243 247
244;aprim=r5 248;r2 = l, q
245;a2=r6 249;r3 = h, r
246;a20=r6 250;r4 = d
247;a21=r7 251;r5 = l'
248;bprim=r8 252;r6 = r'
249;b2=r9 253;r7 = d'
250;qprim=r10 ; initially used as q'' 254;r8 = q'
251;rprim=r11 ; initially used as r''
252
253 255
254 .psect code,nowrt 256 .psect code,nowrt
255 257
256.entry bn_div_words,^m<r2,r3,r4,r5,r6,r7,r8,r9,r10,r11> 258.entry bn_div_words,^m<r2,r3,r4,r5,r6,r7,r8>
257 movl l(ap),r2 259 movl l(ap),r2
258 movl h(ap),r3 260 movl h(ap),r3
259 movl d(ap),r4 261 movl d(ap),r4
260 262
261 movl #0,r5 263 bicl3 #^XFFFFFFF8,r2,r5 ; l' = l & 7
262 movl #0,r8 264 bicl3 #^X00000007,r2,r2
263 movl #0,r0
264; movl #0,r1
265 265
266 rotl #-1,r2,r6 ; a20 = l >> 1 (almost) 266 bicl3 #^XFFFFFFF8,r3,r6
267 rotl #-1,r3,r7 ; a21 = h >> 1 (almost) 267 bicl3 #^X00000007,r3,r3
268 rotl #-1,r4,r9 ; b2 = d >> 1 (almost) 268
269 addl r6,r2
269 270
270 tstl r6 271 rotl #-3,r2,r2 ; l = l >> 3
271 bgeq 1$ 272 rotl #-3,r3,r3 ; h = h >> 3
272 xorl2 #^X80000000,r6 ; fixup a20 so highest bit is 0 273
273 incl r5 ; a' = 1 274 movl r4,r7 ; d' = d
2741$: 275
275 tstl r7 276 movl #0,r6 ; r' = 0
276 bgeq 2$ 277 movl #0,r8 ; q' = 0
277 xorl2 #^X80000000,r6 ; fixup a20 so highest bit is 1, 278
278 ; since that's what was lowest in a21 279 tstl r4
279 xorl2 #^X80000000,r7 ; fixup a21 so highest bit is 1
2802$:
281 tstl r9
282 beql 666$ ; Uh-oh, the divisor is 0... 280 beql 666$ ; Uh-oh, the divisor is 0...
283 bgtr 3$ 281 bgtr 1$
284 xorl2 #^X80000000,r9 ; fixup b2 so highest bit is 0 282 rotl #-1,r4,r4 ; If d is negative, shift it right.
285 incl r8 ; b' = 1 283 bicl2 #^X80000000,r4 ; Since d is then a large number, the
2863$: 284 ; lowest bit is insignificant
287 tstl r9 285 ; (contradict that, and I'll fix the problem!)
288 bneq 4$ ; if b2 is 0, we know that b' is 1 2861$:
289 tstl r3 287 ediv r4,r2,r2,r3 ; Do the actual division
290 bneq 666$ ; if higher half isn't 0, we overflow 288
291 movl r2,r10 ; otherwise, we have our result 289 tstl r2
292 brb 42$ ; This is a success, really. 290 bgeq 3$
2934$: 291 mnegl r2,r2 ; if q < 0, negate it
294 ediv r9,r6,r10,r11 2923$:
295 293 tstl r7
296 tstl r8 294 blss 4$
297 bneq 5$ ; If b' != 0, go to the other part 295 rotl #3,r2,r2 ; q = q << 3
298; addl3 r11,r11,r1 296 bicl3 #^XFFFFFFF8,r2,r8 ; q' gets the high bits from q
299; addl2 r5,r1 297 bicl3 #^X00000007,r2,r2
300 brb 42$ 298 bsb 41$
3015$: 2994$: ; else
302 ashl #1,r11,r11 300 rotl #2,r2,r2 ; q = q << 2
303 subl2 r10,r11 301 bicl3 #^XFFFFFFFC,r2,r8 ; q' gets the high bits from q
304 addl2 r5,r11 302 bicl3 #^X00000003,r2,r2
305 bgeq 7$ 30341$:
3066$: 304 rotl #3,r3,r3 ; r = r << 3
307 decl r10 305 bicl3 #^XFFFFFFF8,r3,r6 ; r' gets the high bits from r
308 addl2 r4,r11 306 bicl3 #^X00000007,r3,r3
309 blss 6$ 307 addl r5,r3 ; r = r + l'
3107$: 308
311; movl r11,r1 309 tstl r7
310 bgeq 5$
311 bitl #1,r7
312 beql 5$ ; if d' < 0 && d' & 1
313 subl r2,r3 ; [r',r] = [r',r] - [q',q]
314 sbwc r8,r6
31545$:
316 bgeq 5$ ; while r < 0
317 decl r2 ; [q',q] = [q',q] - 1
318 sbwc #0,r8
319 addl r7,r3 ; [r',r] = [r',r] + d'
320 adwc #0,r6
321 brb 45$
322
323; The return points are placed in the middle to keep a short distance from
324; all the branch points
31242$: 32542$:
313 movl r10,r0 326; movl r3,r1
327 movl r2,r0
328 ret
314666$: 329666$:
330 movl #^XFFFFFFFF,r0
315 ret 331 ret
332
3335$:
334 tstl r6
335 bneq 6$
336 cmpl r3,r7
337 blssu 42$ ; while [r',r] >= d'
3386$:
339 subl r7,r3 ; [r',r] = [r',r] - d'
340 sbwc #0,r6
341 incl r2 ; [q',q] = [q',q] + 1
342 adwc #0,r8
343 brb 5$
316 344
317 .title vax_bn_add_words unsigned add of two arrays 345 .title vax_bn_add_words unsigned add of two arrays
318; 346;