diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/ia64.S')
-rw-r--r-- | src/lib/libcrypto/bn/asm/ia64.S | 217 |
1 files changed, 131 insertions, 86 deletions
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S index 7b82b820e6..7dfda85566 100644 --- a/src/lib/libcrypto/bn/asm/ia64.S +++ b/src/lib/libcrypto/bn/asm/ia64.S | |||
@@ -1,6 +1,6 @@ | |||
1 | .explicit | 1 | .explicit |
2 | .text | 2 | .text |
3 | .ident "ia64.S, Version 2.1" | 3 | .ident "ia64.S, Version 2.0" |
4 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | 4 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" |
5 | 5 | ||
6 | // | 6 | // |
@@ -35,7 +35,7 @@ | |||
35 | // What does it mean? You might ratiocinate that the original code | 35 | // What does it mean? You might ratiocinate that the original code |
36 | // should run just faster... Because sum of latencies is smaller... | 36 | // should run just faster... Because sum of latencies is smaller... |
37 | // Wrong! Note that getf latency increased. This means that if a loop is | 37 | // Wrong! Note that getf latency increased. This means that if a loop is |
38 | // scheduled for lower latency (as they were), then it will suffer from | 38 | // scheduled for lower latency (and they are), then it will suffer from |
39 | // stall condition and the code will therefore turn anti-scalable, e.g. | 39 | // stall condition and the code will therefore turn anti-scalable, e.g. |
40 | // original bn_mul_words spun at 5*n or 2.5 times slower than expected | 40 | // original bn_mul_words spun at 5*n or 2.5 times slower than expected |
41 | // on Itanium2! What to do? Reschedule loops for Itanium2? But then | 41 | // on Itanium2! What to do? Reschedule loops for Itanium2? But then |
@@ -145,12 +145,6 @@ | |||
145 | // -Drum=nop.m in command line. | 145 | // -Drum=nop.m in command line. |
146 | // | 146 | // |
147 | 147 | ||
148 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | ||
149 | #define ADDP addp4 | ||
150 | #else | ||
151 | #define ADDP add | ||
152 | #endif | ||
153 | |||
154 | #if 1 | 148 | #if 1 |
155 | // | 149 | // |
156 | // bn_[add|sub]_words routines. | 150 | // bn_[add|sub]_words routines. |
@@ -184,12 +178,27 @@ bn_add_words: | |||
184 | brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 | 178 | brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 |
185 | } | 179 | } |
186 | .body | 180 | .body |
187 | { .mib; ADDP r14=0,r32 // rp | 181 | { .mib; |
182 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
183 | addp4 r14=0,r32 // rp | ||
184 | #else | ||
185 | mov r14=r32 // rp | ||
186 | #endif | ||
188 | mov r9=pr };; | 187 | mov r9=pr };; |
189 | { .mii; ADDP r15=0,r33 // ap | 188 | { .mii; |
189 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
190 | addp4 r15=0,r33 // ap | ||
191 | #else | ||
192 | mov r15=r33 // ap | ||
193 | #endif | ||
190 | mov ar.lc=r10 | 194 | mov ar.lc=r10 |
191 | mov ar.ec=6 } | 195 | mov ar.ec=6 } |
192 | { .mib; ADDP r16=0,r34 // bp | 196 | { .mib; |
197 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
198 | addp4 r16=0,r34 // bp | ||
199 | #else | ||
200 | mov r16=r34 // bp | ||
201 | #endif | ||
193 | mov pr.rot=1<<16 };; | 202 | mov pr.rot=1<<16 };; |
194 | 203 | ||
195 | .L_bn_add_words_ctop: | 204 | .L_bn_add_words_ctop: |
@@ -237,12 +246,27 @@ bn_sub_words: | |||
237 | brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 | 246 | brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 |
238 | } | 247 | } |
239 | .body | 248 | .body |
240 | { .mib; ADDP r14=0,r32 // rp | 249 | { .mib; |
250 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
251 | addp4 r14=0,r32 // rp | ||
252 | #else | ||
253 | mov r14=r32 // rp | ||
254 | #endif | ||
241 | mov r9=pr };; | 255 | mov r9=pr };; |
242 | { .mii; ADDP r15=0,r33 // ap | 256 | { .mii; |
257 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
258 | addp4 r15=0,r33 // ap | ||
259 | #else | ||
260 | mov r15=r33 // ap | ||
261 | #endif | ||
243 | mov ar.lc=r10 | 262 | mov ar.lc=r10 |
244 | mov ar.ec=6 } | 263 | mov ar.ec=6 } |
245 | { .mib; ADDP r16=0,r34 // bp | 264 | { .mib; |
265 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
266 | addp4 r16=0,r34 // bp | ||
267 | #else | ||
268 | mov r16=r34 // bp | ||
269 | #endif | ||
246 | mov pr.rot=1<<16 };; | 270 | mov pr.rot=1<<16 };; |
247 | 271 | ||
248 | .L_bn_sub_words_ctop: | 272 | .L_bn_sub_words_ctop: |
@@ -308,10 +332,16 @@ bn_mul_words: | |||
308 | 332 | ||
309 | #ifndef XMA_TEMPTATION | 333 | #ifndef XMA_TEMPTATION |
310 | 334 | ||
311 | { .mmi; ADDP r14=0,r32 // rp | 335 | { .mii; |
312 | ADDP r15=0,r33 // ap | 336 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
337 | addp4 r14=0,r32 // rp | ||
338 | addp4 r15=0,r33 // ap | ||
339 | #else | ||
340 | mov r14=r32 // rp | ||
341 | mov r15=r33 // ap | ||
342 | #endif | ||
313 | mov ar.lc=r10 } | 343 | mov ar.lc=r10 } |
314 | { .mmi; mov r40=0 // serves as r35 at first (p27) | 344 | { .mii; mov r40=0 // serves as r35 at first (p27) |
315 | mov ar.ec=13 };; | 345 | mov ar.ec=13 };; |
316 | 346 | ||
317 | // This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium | 347 | // This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium |
@@ -394,64 +424,89 @@ bn_mul_words: | |||
394 | .global bn_mul_add_words# | 424 | .global bn_mul_add_words# |
395 | .proc bn_mul_add_words# | 425 | .proc bn_mul_add_words# |
396 | .align 64 | 426 | .align 64 |
397 | .skip 48 // makes the loop body aligned at 64-byte boundary | 427 | //.skip 0 // makes the loop split at 64-byte boundary |
398 | bn_mul_add_words: | 428 | bn_mul_add_words: |
399 | .prologue | 429 | .prologue |
400 | .fframe 0 | 430 | .fframe 0 |
401 | .save ar.pfs,r2 | 431 | .save ar.pfs,r2 |
402 | .save ar.lc,r3 | 432 | { .mii; alloc r2=ar.pfs,4,12,0,16 |
403 | .save pr,r9 | 433 | cmp4.le p6,p0=r34,r0 };; |
404 | { .mmi; alloc r2=ar.pfs,4,4,0,8 | 434 | { .mfb; mov r8=r0 // return value |
405 | cmp4.le p6,p0=r34,r0 | ||
406 | mov r3=ar.lc };; | ||
407 | { .mib; mov r8=r0 // return value | ||
408 | sub r10=r34,r0,1 | ||
409 | (p6) br.ret.spnt.many b0 };; | 435 | (p6) br.ret.spnt.many b0 };; |
410 | 436 | ||
437 | .save ar.lc,r3 | ||
438 | { .mii; sub r10=r34,r0,1 | ||
439 | mov r3=ar.lc | ||
440 | mov r9=pr };; | ||
441 | |||
411 | .body | 442 | .body |
412 | { .mib; setf.sig f8=r35 // w | 443 | { .mib; setf.sig f8=r35 // w |
413 | mov r9=pr | 444 | mov pr.rot=0x800001<<16 |
445 | // ------^----- serves as (p50) at first (p27) | ||
414 | brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 | 446 | brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 |
415 | } | 447 | } |
416 | { .mmi; ADDP r14=0,r32 // rp | 448 | { .mii; |
417 | ADDP r15=0,r33 // ap | 449 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
450 | addp4 r14=0,r32 // rp | ||
451 | addp4 r15=0,r33 // ap | ||
452 | #else | ||
453 | mov r14=r32 // rp | ||
454 | mov r15=r33 // ap | ||
455 | #endif | ||
418 | mov ar.lc=r10 } | 456 | mov ar.lc=r10 } |
419 | { .mii; ADDP r16=0,r32 // rp copy | 457 | { .mii; mov r40=0 // serves as r35 at first (p27) |
420 | mov pr.rot=0x2001<<16 | 458 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
421 | // ------^----- serves as (p40) at first (p27) | 459 | addp4 r18=0,r32 // rp copy |
422 | mov ar.ec=11 };; | 460 | #else |
423 | 461 | mov r18=r32 // rp copy | |
424 | // This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on | 462 | #endif |
425 | // Itanium 2. Yes, unlike previous versions it scales:-) Previous | 463 | mov ar.ec=15 };; |
426 | // version was performing *all* additions in IALU and was starving | 464 | |
427 | // for those even on Itanium 2. In this version one addition is | 465 | // This loop spins in 3*(n+14) ticks on Itanium and should spin in |
428 | // moved to FPU and is folded with multiplication. This is at cost | 466 | // 2*(n+14) on "wider" IA-64 implementations (to be verified with new |
429 | // of propagating the result from previous call to this subroutine | 467 | // 2*(n+14) on "wider" IA-64 implementations (to be verified with new |
430 | // to L2 cache... In other words negligible even for shorter keys. | 468 | // possible to compress the epilogue, down to 10 in this case, at the |
431 | // *Overall* performance improvement [over previous version] varies | 469 | // cost of scalability. Compressed (and therefore non-scalable) loop |
432 | // from 11 to 22 percent depending on key length. | 470 | // running at 3*(n+11) would buy you ~10% on Itanium but take ~35% |
471 | // from "wider" IA-64 so let it be scalable! Special attention was | ||
472 | // paid for having the loop body split at 64-byte boundary. ld8 is | ||
473 | // scheduled for L1 cache as the data is more than likely there. | ||
474 | // Indeed, bn_mul_words has put it there a moment ago:-) | ||
433 | .L_bn_mul_add_words_ctop: | 475 | .L_bn_mul_add_words_ctop: |
434 | .pred.rel "mutex",p40,p42 | 476 | { .mfi; (p25) getf.sig r36=f52 // low |
435 | { .mfi; (p23) getf.sig r36=f45 // low | 477 | (p21) xmpy.lu f48=f37,f8 |
436 | (p20) xma.lu f42=f36,f8,f50 // low | 478 | (p28) cmp.ltu p54,p50=r41,r39 } |
437 | (p40) add r39=r39,r35 } // (p27) | 479 | { .mfi; (p16) ldf8 f32=[r15],8 |
438 | { .mfi; (p16) ldf8 f32=[r15],8 // *(ap++) | 480 | (p21) xmpy.hu f40=f37,f8 |
439 | (p20) xma.hu f36=f36,f8,f50 // high | 481 | (p28) add r45=r45,r41 };; |
440 | (p42) add r39=r39,r35,1 };; // (p27) | 482 | { .mii; (p25) getf.sig r32=f44 // high |
441 | { .mmi; (p24) getf.sig r32=f40 // high | 483 | .pred.rel "mutex",p50,p54 |
442 | (p16) ldf8 f46=[r16],8 // *(rp1++) | 484 | (p50) add r40=r38,r35 // (p27) |
443 | (p40) cmp.ltu p41,p39=r39,r35 } // (p27) | 485 | (p54) add r40=r38,r35,1 } // (p27) |
444 | { .mib; (p26) st8 [r14]=r39,8 // *(rp2++) | 486 | { .mfb; (p28) cmp.ltu.unc p60,p0=r45,r41 |
445 | (p42) cmp.leu p41,p39=r39,r35 // (p27) | 487 | (p0) nop.f 0x0 |
488 | (p0) nop.b 0x0 } | ||
489 | { .mii; (p27) ld8 r44=[r18],8 | ||
490 | (p62) cmp.eq.or p61,p0=-1,r46 | ||
491 | (p62) add r46=1,r46 } | ||
492 | { .mfb; (p30) st8 [r14]=r47,8 | ||
493 | (p0) nop.f 0x0 | ||
446 | br.ctop.sptk .L_bn_mul_add_words_ctop};; | 494 | br.ctop.sptk .L_bn_mul_add_words_ctop};; |
447 | .L_bn_mul_add_words_cend: | 495 | .L_bn_mul_add_words_cend: |
448 | 496 | ||
449 | { .mmi; .pred.rel "mutex",p40,p42 | 497 | { .mii; nop.m 0x0 |
450 | (p40) add r8=r35,r0 | 498 | .pred.rel "mutex",p53,p57 |
451 | (p42) add r8=r35,r0,1 | 499 | (p53) add r8=r38,r0 |
452 | mov pr=r9,0x1ffff } | 500 | (p57) add r8=r38,r0,1 } |
453 | { .mib; rum 1<<5 // clear um.mfh | 501 | { .mfb; nop.m 0x0 |
454 | mov ar.lc=r3 | 502 | nop.f 0x0 |
503 | nop.b 0x0 };; | ||
504 | { .mii; | ||
505 | (p63) add r8=1,r8 | ||
506 | mov pr=r9,0x1ffff | ||
507 | mov ar.lc=r3 } | ||
508 | { .mfb; rum 1<<5 // clear um.mfh | ||
509 | nop.f 0x0 | ||
455 | br.ret.sptk.many b0 };; | 510 | br.ret.sptk.many b0 };; |
456 | .endp bn_mul_add_words# | 511 | .endp bn_mul_add_words# |
457 | #endif | 512 | #endif |
@@ -472,8 +527,7 @@ bn_sqr_words: | |||
472 | sxt4 r34=r34 };; | 527 | sxt4 r34=r34 };; |
473 | { .mii; cmp.le p6,p0=r34,r0 | 528 | { .mii; cmp.le p6,p0=r34,r0 |
474 | mov r8=r0 } // return value | 529 | mov r8=r0 } // return value |
475 | { .mfb; ADDP r32=0,r32 | 530 | { .mfb; nop.f 0x0 |
476 | nop.f 0x0 | ||
477 | (p6) br.ret.spnt.many b0 };; | 531 | (p6) br.ret.spnt.many b0 };; |
478 | 532 | ||
479 | .save ar.lc,r3 | 533 | .save ar.lc,r3 |
@@ -482,7 +536,11 @@ bn_sqr_words: | |||
482 | mov r9=pr };; | 536 | mov r9=pr };; |
483 | 537 | ||
484 | .body | 538 | .body |
485 | { .mib; ADDP r33=0,r33 | 539 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
540 | { .mii; addp4 r32=0,r32 | ||
541 | addp4 r33=0,r33 };; | ||
542 | #endif | ||
543 | { .mib; | ||
486 | mov pr.rot=1<<16 | 544 | mov pr.rot=1<<16 |
487 | brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 | 545 | brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 |
488 | } | 546 | } |
@@ -547,7 +605,7 @@ bn_sqr_comba8: | |||
547 | .prologue | 605 | .prologue |
548 | .fframe 0 | 606 | .fframe 0 |
549 | .save ar.pfs,r2 | 607 | .save ar.pfs,r2 |
550 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | 608 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
551 | { .mii; alloc r2=ar.pfs,2,1,0,0 | 609 | { .mii; alloc r2=ar.pfs,2,1,0,0 |
552 | addp4 r33=0,r33 | 610 | addp4 r33=0,r33 |
553 | addp4 r32=0,r32 };; | 611 | addp4 r32=0,r32 };; |
@@ -573,10 +631,6 @@ bn_sqr_comba8: | |||
573 | // clause in Itanium µ-architecture manual? Comments are welcomed and | 631 | // clause in Itanium µ-architecture manual? Comments are welcomed and |
574 | // highly appreciated. | 632 | // highly appreciated. |
575 | // | 633 | // |
576 | // On Itanium 2 it takes ~190 ticks. This is because of stalls on | ||
577 | // result from getf.sig. I do nothing about it at this point for | ||
578 | // reasons depicted below. | ||
579 | // | ||
580 | // However! It should be noted that even 160 ticks is darn good result | 634 | // However! It should be noted that even 160 ticks is darn good result |
581 | // as it's over 10 (yes, ten, spelled as t-e-n) times faster than the | 635 | // as it's over 10 (yes, ten, spelled as t-e-n) times faster than the |
582 | // C version (compiled with gcc with inline assembler). I really | 636 | // C version (compiled with gcc with inline assembler). I really |
@@ -619,7 +673,7 @@ bn_mul_comba8: | |||
619 | .prologue | 673 | .prologue |
620 | .fframe 0 | 674 | .fframe 0 |
621 | .save ar.pfs,r2 | 675 | .save ar.pfs,r2 |
622 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | 676 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
623 | { .mii; alloc r2=ar.pfs,3,0,0,0 | 677 | { .mii; alloc r2=ar.pfs,3,0,0,0 |
624 | addp4 r33=0,r33 | 678 | addp4 r33=0,r33 |
625 | addp4 r34=0,r34 };; | 679 | addp4 r34=0,r34 };; |
@@ -1177,7 +1231,7 @@ bn_sqr_comba4: | |||
1177 | .prologue | 1231 | .prologue |
1178 | .fframe 0 | 1232 | .fframe 0 |
1179 | .save ar.pfs,r2 | 1233 | .save ar.pfs,r2 |
1180 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | 1234 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
1181 | { .mii; alloc r2=ar.pfs,2,1,0,0 | 1235 | { .mii; alloc r2=ar.pfs,2,1,0,0 |
1182 | addp4 r32=0,r32 | 1236 | addp4 r32=0,r32 |
1183 | addp4 r33=0,r33 };; | 1237 | addp4 r33=0,r33 };; |
@@ -1210,7 +1264,7 @@ bn_mul_comba4: | |||
1210 | .prologue | 1264 | .prologue |
1211 | .fframe 0 | 1265 | .fframe 0 |
1212 | .save ar.pfs,r2 | 1266 | .save ar.pfs,r2 |
1213 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | 1267 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
1214 | { .mii; alloc r2=ar.pfs,3,0,0,0 | 1268 | { .mii; alloc r2=ar.pfs,3,0,0,0 |
1215 | addp4 r33=0,r33 | 1269 | addp4 r33=0,r33 |
1216 | addp4 r34=0,r34 };; | 1270 | addp4 r34=0,r34 };; |
@@ -1394,8 +1448,8 @@ bn_mul_comba4: | |||
1394 | #define I r21 | 1448 | #define I r21 |
1395 | 1449 | ||
1396 | #if 0 | 1450 | #if 0 |
1397 | // Some preprocessors (most notably HP-UX) appear to be allergic to | 1451 | // Some preprocessors (most notably HP-UX) appear to be allergic to |
1398 | // macros enclosed in parentheses [as these three were]. | 1452 | // macros enclosed in parentheses as these three will be. |
1399 | #define cont p16 | 1453 | #define cont p16 |
1400 | #define break p0 // p20 | 1454 | #define break p0 // p20 |
1401 | #define equ p24 | 1455 | #define equ p24 |
@@ -1527,18 +1581,9 @@ bn_div_words: | |||
1527 | // output: f8 = (int)(a/b) | 1581 | // output: f8 = (int)(a/b) |
1528 | // clobbered: f8,f9,f10,f11,pred | 1582 | // clobbered: f8,f9,f10,f11,pred |
1529 | pred=p15 | 1583 | pred=p15 |
1530 | // One can argue that this snippet is copyrighted to Intel | 1584 | // This procedure is essentially Intel code and therefore is |
1531 | // Corporation, as it's essentially identical to one of those | 1585 | // copyrighted to Intel Corporation (I suppose...). It's slightly |
1532 | // found in "Divide, Square Root and Remainder" section at | 1586 | // modified for specific needs. |
1533 | // http://www.intel.com/software/products/opensource/libraries/num.htm. | ||
1534 | // Yes, I admit that the referred code was used as template, | ||
1535 | // but after I realized that there hardly is any other instruction | ||
1536 | // sequence which would perform this operation. I mean I figure that | ||
1537 | // any independent attempt to implement high-performance division | ||
1538 | // will result in code virtually identical to the Intel code. It | ||
1539 | // should be noted though that below division kernel is 1 cycle | ||
1540 | // faster than Intel one (note commented splits:-), not to mention | ||
1541 | // original prologue (rather lack of one) and epilogue. | ||
1542 | .align 32 | 1587 | .align 32 |
1543 | .skip 16 | 1588 | .skip 16 |
1544 | .L_udiv64_32_b6: | 1589 | .L_udiv64_32_b6: |