diff options
| author | djm <> | 2005-04-29 05:39:33 +0000 |
|---|---|---|
| committer | djm <> | 2005-04-29 05:39:33 +0000 |
| commit | 68edd00d9258df93b1366c71ac124e0cadf7bc08 (patch) | |
| tree | 3ce4ae2a9747bbc11aed1f95f9bbea92c41f8683 /src/lib/libcrypto/bn/asm | |
| parent | f396ed0f5ce0af56bfde2e75e15cf1f52924c779 (diff) | |
| download | openbsd-68edd00d9258df93b1366c71ac124e0cadf7bc08.tar.gz openbsd-68edd00d9258df93b1366c71ac124e0cadf7bc08.tar.bz2 openbsd-68edd00d9258df93b1366c71ac124e0cadf7bc08.zip | |
resolve conflicts
Diffstat (limited to 'src/lib/libcrypto/bn/asm')
| -rw-r--r-- | src/lib/libcrypto/bn/asm/ia64.S | 217 |
1 files changed, 86 insertions, 131 deletions
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S index 7dfda85566..7b82b820e6 100644 --- a/src/lib/libcrypto/bn/asm/ia64.S +++ b/src/lib/libcrypto/bn/asm/ia64.S | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | .explicit | 1 | .explicit |
| 2 | .text | 2 | .text |
| 3 | .ident "ia64.S, Version 2.0" | 3 | .ident "ia64.S, Version 2.1" |
| 4 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | 4 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" |
| 5 | 5 | ||
| 6 | // | 6 | // |
| @@ -35,7 +35,7 @@ | |||
| 35 | // What does it mean? You might ratiocinate that the original code | 35 | // What does it mean? You might ratiocinate that the original code |
| 36 | // should run just faster... Because sum of latencies is smaller... | 36 | // should run just faster... Because sum of latencies is smaller... |
| 37 | // Wrong! Note that getf latency increased. This means that if a loop is | 37 | // Wrong! Note that getf latency increased. This means that if a loop is |
| 38 | // scheduled for lower latency (and they are), then it will suffer from | 38 | // scheduled for lower latency (as they were), then it will suffer from |
| 39 | // stall condition and the code will therefore turn anti-scalable, e.g. | 39 | // stall condition and the code will therefore turn anti-scalable, e.g. |
| 40 | // original bn_mul_words spun at 5*n or 2.5 times slower than expected | 40 | // original bn_mul_words spun at 5*n or 2.5 times slower than expected |
| 41 | // on Itanium2! What to do? Reschedule loops for Itanium2? But then | 41 | // on Itanium2! What to do? Reschedule loops for Itanium2? But then |
| @@ -145,6 +145,12 @@ | |||
| 145 | // -Drum=nop.m in command line. | 145 | // -Drum=nop.m in command line. |
| 146 | // | 146 | // |
| 147 | 147 | ||
| 148 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | ||
| 149 | #define ADDP addp4 | ||
| 150 | #else | ||
| 151 | #define ADDP add | ||
| 152 | #endif | ||
| 153 | |||
| 148 | #if 1 | 154 | #if 1 |
| 149 | // | 155 | // |
| 150 | // bn_[add|sub]_words routines. | 156 | // bn_[add|sub]_words routines. |
| @@ -178,27 +184,12 @@ bn_add_words: | |||
| 178 | brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 | 184 | brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 |
| 179 | } | 185 | } |
| 180 | .body | 186 | .body |
| 181 | { .mib; | 187 | { .mib; ADDP r14=0,r32 // rp |
| 182 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 183 | addp4 r14=0,r32 // rp | ||
| 184 | #else | ||
| 185 | mov r14=r32 // rp | ||
| 186 | #endif | ||
| 187 | mov r9=pr };; | 188 | mov r9=pr };; |
| 188 | { .mii; | 189 | { .mii; ADDP r15=0,r33 // ap |
| 189 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 190 | addp4 r15=0,r33 // ap | ||
| 191 | #else | ||
| 192 | mov r15=r33 // ap | ||
| 193 | #endif | ||
| 194 | mov ar.lc=r10 | 190 | mov ar.lc=r10 |
| 195 | mov ar.ec=6 } | 191 | mov ar.ec=6 } |
| 196 | { .mib; | 192 | { .mib; ADDP r16=0,r34 // bp |
| 197 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 198 | addp4 r16=0,r34 // bp | ||
| 199 | #else | ||
| 200 | mov r16=r34 // bp | ||
| 201 | #endif | ||
| 202 | mov pr.rot=1<<16 };; | 193 | mov pr.rot=1<<16 };; |
| 203 | 194 | ||
| 204 | .L_bn_add_words_ctop: | 195 | .L_bn_add_words_ctop: |
| @@ -246,27 +237,12 @@ bn_sub_words: | |||
| 246 | brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 | 237 | brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 |
| 247 | } | 238 | } |
| 248 | .body | 239 | .body |
| 249 | { .mib; | 240 | { .mib; ADDP r14=0,r32 // rp |
| 250 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 251 | addp4 r14=0,r32 // rp | ||
| 252 | #else | ||
| 253 | mov r14=r32 // rp | ||
| 254 | #endif | ||
| 255 | mov r9=pr };; | 241 | mov r9=pr };; |
| 256 | { .mii; | 242 | { .mii; ADDP r15=0,r33 // ap |
| 257 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 258 | addp4 r15=0,r33 // ap | ||
| 259 | #else | ||
| 260 | mov r15=r33 // ap | ||
| 261 | #endif | ||
| 262 | mov ar.lc=r10 | 243 | mov ar.lc=r10 |
| 263 | mov ar.ec=6 } | 244 | mov ar.ec=6 } |
| 264 | { .mib; | 245 | { .mib; ADDP r16=0,r34 // bp |
| 265 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 266 | addp4 r16=0,r34 // bp | ||
| 267 | #else | ||
| 268 | mov r16=r34 // bp | ||
| 269 | #endif | ||
| 270 | mov pr.rot=1<<16 };; | 246 | mov pr.rot=1<<16 };; |
| 271 | 247 | ||
| 272 | .L_bn_sub_words_ctop: | 248 | .L_bn_sub_words_ctop: |
| @@ -332,16 +308,10 @@ bn_mul_words: | |||
| 332 | 308 | ||
| 333 | #ifndef XMA_TEMPTATION | 309 | #ifndef XMA_TEMPTATION |
| 334 | 310 | ||
| 335 | { .mii; | 311 | { .mmi; ADDP r14=0,r32 // rp |
| 336 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | 312 | ADDP r15=0,r33 // ap |
| 337 | addp4 r14=0,r32 // rp | ||
| 338 | addp4 r15=0,r33 // ap | ||
| 339 | #else | ||
| 340 | mov r14=r32 // rp | ||
| 341 | mov r15=r33 // ap | ||
| 342 | #endif | ||
| 343 | mov ar.lc=r10 } | 313 | mov ar.lc=r10 } |
| 344 | { .mii; mov r40=0 // serves as r35 at first (p27) | 314 | { .mmi; mov r40=0 // serves as r35 at first (p27) |
| 345 | mov ar.ec=13 };; | 315 | mov ar.ec=13 };; |
| 346 | 316 | ||
| 347 | // This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium | 317 | // This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium |
| @@ -424,89 +394,64 @@ bn_mul_words: | |||
| 424 | .global bn_mul_add_words# | 394 | .global bn_mul_add_words# |
| 425 | .proc bn_mul_add_words# | 395 | .proc bn_mul_add_words# |
| 426 | .align 64 | 396 | .align 64 |
| 427 | //.skip 0 // makes the loop split at 64-byte boundary | 397 | .skip 48 // makes the loop body aligned at 64-byte boundary |
| 428 | bn_mul_add_words: | 398 | bn_mul_add_words: |
| 429 | .prologue | 399 | .prologue |
| 430 | .fframe 0 | 400 | .fframe 0 |
| 431 | .save ar.pfs,r2 | 401 | .save ar.pfs,r2 |
| 432 | { .mii; alloc r2=ar.pfs,4,12,0,16 | ||
| 433 | cmp4.le p6,p0=r34,r0 };; | ||
| 434 | { .mfb; mov r8=r0 // return value | ||
| 435 | (p6) br.ret.spnt.many b0 };; | ||
| 436 | |||
| 437 | .save ar.lc,r3 | 402 | .save ar.lc,r3 |
| 438 | { .mii; sub r10=r34,r0,1 | 403 | .save pr,r9 |
| 439 | mov r3=ar.lc | 404 | { .mmi; alloc r2=ar.pfs,4,4,0,8 |
| 440 | mov r9=pr };; | 405 | cmp4.le p6,p0=r34,r0 |
| 406 | mov r3=ar.lc };; | ||
| 407 | { .mib; mov r8=r0 // return value | ||
| 408 | sub r10=r34,r0,1 | ||
| 409 | (p6) br.ret.spnt.many b0 };; | ||
| 441 | 410 | ||
| 442 | .body | 411 | .body |
| 443 | { .mib; setf.sig f8=r35 // w | 412 | { .mib; setf.sig f8=r35 // w |
| 444 | mov pr.rot=0x800001<<16 | 413 | mov r9=pr |
| 445 | // ------^----- serves as (p50) at first (p27) | ||
| 446 | brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 | 414 | brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 |
| 447 | } | 415 | } |
| 448 | { .mii; | 416 | { .mmi; ADDP r14=0,r32 // rp |
| 449 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | 417 | ADDP r15=0,r33 // ap |
| 450 | addp4 r14=0,r32 // rp | ||
| 451 | addp4 r15=0,r33 // ap | ||
| 452 | #else | ||
| 453 | mov r14=r32 // rp | ||
| 454 | mov r15=r33 // ap | ||
| 455 | #endif | ||
| 456 | mov ar.lc=r10 } | 418 | mov ar.lc=r10 } |
| 457 | { .mii; mov r40=0 // serves as r35 at first (p27) | 419 | { .mii; ADDP r16=0,r32 // rp copy |
| 458 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | 420 | mov pr.rot=0x2001<<16 |
| 459 | addp4 r18=0,r32 // rp copy | 421 | // ------^----- serves as (p40) at first (p27) |
| 460 | #else | 422 | mov ar.ec=11 };; |
| 461 | mov r18=r32 // rp copy | 423 | |
| 462 | #endif | 424 | // This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on |
| 463 | mov ar.ec=15 };; | 425 | // Itanium 2. Yes, unlike previous versions it scales:-) Previous |
| 464 | 426 | // version was performing *all* additions in IALU and was starving | |
| 465 | // This loop spins in 3*(n+14) ticks on Itanium and should spin in | 427 | // for those even on Itanium 2. In this version one addition is |
| 466 | // 2*(n+14) on "wider" IA-64 implementations (to be verified with new | 428 | // moved to FPU and is folded with multiplication. This is at cost |
| 467 | // µ-architecture manuals as they become available). As usual it's | 429 | // of propagating the result from previous call to this subroutine |
| 468 | // possible to compress the epilogue, down to 10 in this case, at the | 430 | // to L2 cache... In other words negligible even for shorter keys. |
| 469 | // cost of scalability. Compressed (and therefore non-scalable) loop | 431 | // *Overall* performance improvement [over previous version] varies |
| 470 | // running at 3*(n+11) would buy you ~10% on Itanium but take ~35% | 432 | // from 11 to 22 percent depending on key length. |
| 471 | // from "wider" IA-64 so let it be scalable! Special attention was | ||
| 472 | // paid for having the loop body split at 64-byte boundary. ld8 is | ||
| 473 | // scheduled for L1 cache as the data is more than likely there. | ||
| 474 | // Indeed, bn_mul_words has put it there a moment ago:-) | ||
| 475 | .L_bn_mul_add_words_ctop: | 433 | .L_bn_mul_add_words_ctop: |
| 476 | { .mfi; (p25) getf.sig r36=f52 // low | 434 | .pred.rel "mutex",p40,p42 |
| 477 | (p21) xmpy.lu f48=f37,f8 | 435 | { .mfi; (p23) getf.sig r36=f45 // low |
| 478 | (p28) cmp.ltu p54,p50=r41,r39 } | 436 | (p20) xma.lu f42=f36,f8,f50 // low |
| 479 | { .mfi; (p16) ldf8 f32=[r15],8 | 437 | (p40) add r39=r39,r35 } // (p27) |
| 480 | (p21) xmpy.hu f40=f37,f8 | 438 | { .mfi; (p16) ldf8 f32=[r15],8 // *(ap++) |
| 481 | (p28) add r45=r45,r41 };; | 439 | (p20) xma.hu f36=f36,f8,f50 // high |
| 482 | { .mii; (p25) getf.sig r32=f44 // high | 440 | (p42) add r39=r39,r35,1 };; // (p27) |
| 483 | .pred.rel "mutex",p50,p54 | 441 | { .mmi; (p24) getf.sig r32=f40 // high |
| 484 | (p50) add r40=r38,r35 // (p27) | 442 | (p16) ldf8 f46=[r16],8 // *(rp1++) |
| 485 | (p54) add r40=r38,r35,1 } // (p27) | 443 | (p40) cmp.ltu p41,p39=r39,r35 } // (p27) |
| 486 | { .mfb; (p28) cmp.ltu.unc p60,p0=r45,r41 | 444 | { .mib; (p26) st8 [r14]=r39,8 // *(rp2++) |
| 487 | (p0) nop.f 0x0 | 445 | (p42) cmp.leu p41,p39=r39,r35 // (p27) |
| 488 | (p0) nop.b 0x0 } | ||
| 489 | { .mii; (p27) ld8 r44=[r18],8 | ||
| 490 | (p62) cmp.eq.or p61,p0=-1,r46 | ||
| 491 | (p62) add r46=1,r46 } | ||
| 492 | { .mfb; (p30) st8 [r14]=r47,8 | ||
| 493 | (p0) nop.f 0x0 | ||
| 494 | br.ctop.sptk .L_bn_mul_add_words_ctop};; | 446 | br.ctop.sptk .L_bn_mul_add_words_ctop};; |
| 495 | .L_bn_mul_add_words_cend: | 447 | .L_bn_mul_add_words_cend: |
| 496 | 448 | ||
| 497 | { .mii; nop.m 0x0 | 449 | { .mmi; .pred.rel "mutex",p40,p42 |
| 498 | .pred.rel "mutex",p53,p57 | 450 | (p40) add r8=r35,r0 |
| 499 | (p53) add r8=r38,r0 | 451 | (p42) add r8=r35,r0,1 |
| 500 | (p57) add r8=r38,r0,1 } | 452 | mov pr=r9,0x1ffff } |
| 501 | { .mfb; nop.m 0x0 | 453 | { .mib; rum 1<<5 // clear um.mfh |
| 502 | nop.f 0x0 | 454 | mov ar.lc=r3 |
| 503 | nop.b 0x0 };; | ||
| 504 | { .mii; | ||
| 505 | (p63) add r8=1,r8 | ||
| 506 | mov pr=r9,0x1ffff | ||
| 507 | mov ar.lc=r3 } | ||
| 508 | { .mfb; rum 1<<5 // clear um.mfh | ||
| 509 | nop.f 0x0 | ||
| 510 | br.ret.sptk.many b0 };; | 455 | br.ret.sptk.many b0 };; |
| 511 | .endp bn_mul_add_words# | 456 | .endp bn_mul_add_words# |
| 512 | #endif | 457 | #endif |
| @@ -527,7 +472,8 @@ bn_sqr_words: | |||
| 527 | sxt4 r34=r34 };; | 472 | sxt4 r34=r34 };; |
| 528 | { .mii; cmp.le p6,p0=r34,r0 | 473 | { .mii; cmp.le p6,p0=r34,r0 |
| 529 | mov r8=r0 } // return value | 474 | mov r8=r0 } // return value |
| 530 | { .mfb; nop.f 0x0 | 475 | { .mfb; ADDP r32=0,r32 |
| 476 | nop.f 0x0 | ||
| 531 | (p6) br.ret.spnt.many b0 };; | 477 | (p6) br.ret.spnt.many b0 };; |
| 532 | 478 | ||
| 533 | .save ar.lc,r3 | 479 | .save ar.lc,r3 |
| @@ -536,11 +482,7 @@ bn_sqr_words: | |||
| 536 | mov r9=pr };; | 482 | mov r9=pr };; |
| 537 | 483 | ||
| 538 | .body | 484 | .body |
| 539 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | 485 | { .mib; ADDP r33=0,r33 |
| 540 | { .mii; addp4 r32=0,r32 | ||
| 541 | addp4 r33=0,r33 };; | ||
| 542 | #endif | ||
| 543 | { .mib; | ||
| 544 | mov pr.rot=1<<16 | 486 | mov pr.rot=1<<16 |
| 545 | brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 | 487 | brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 |
| 546 | } | 488 | } |
| @@ -605,7 +547,7 @@ bn_sqr_comba8: | |||
| 605 | .prologue | 547 | .prologue |
| 606 | .fframe 0 | 548 | .fframe 0 |
| 607 | .save ar.pfs,r2 | 549 | .save ar.pfs,r2 |
| 608 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | 550 | #if defined(_HPUX_SOURCE) && !defined(_LP64) |
| 609 | { .mii; alloc r2=ar.pfs,2,1,0,0 | 551 | { .mii; alloc r2=ar.pfs,2,1,0,0 |
| 610 | addp4 r33=0,r33 | 552 | addp4 r33=0,r33 |
| 611 | addp4 r32=0,r32 };; | 553 | addp4 r32=0,r32 };; |
| @@ -631,6 +573,10 @@ bn_sqr_comba8: | |||
| 631 | // clause in Itanium µ-architecture manual? Comments are welcomed and | 573 | // clause in Itanium µ-architecture manual? Comments are welcomed and |
| 632 | // highly appreciated. | 574 | // highly appreciated. |
| 633 | // | 575 | // |
| 576 | // On Itanium 2 it takes ~190 ticks. This is because of stalls on | ||
| 577 | // result from getf.sig. I do nothing about it at this point for | ||
| 578 | // reasons depicted below. | ||
| 579 | // | ||
| 634 | // However! It should be noted that even 160 ticks is darn good result | 580 | // However! It should be noted that even 160 ticks is darn good result |
| 635 | // as it's over 10 (yes, ten, spelled as t-e-n) times faster than the | 581 | // as it's over 10 (yes, ten, spelled as t-e-n) times faster than the |
| 636 | // C version (compiled with gcc with inline assembler). I really | 582 | // C version (compiled with gcc with inline assembler). I really |
| @@ -673,7 +619,7 @@ bn_mul_comba8: | |||
| 673 | .prologue | 619 | .prologue |
| 674 | .fframe 0 | 620 | .fframe 0 |
| 675 | .save ar.pfs,r2 | 621 | .save ar.pfs,r2 |
| 676 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | 622 | #if defined(_HPUX_SOURCE) && !defined(_LP64) |
| 677 | { .mii; alloc r2=ar.pfs,3,0,0,0 | 623 | { .mii; alloc r2=ar.pfs,3,0,0,0 |
| 678 | addp4 r33=0,r33 | 624 | addp4 r33=0,r33 |
| 679 | addp4 r34=0,r34 };; | 625 | addp4 r34=0,r34 };; |
| @@ -1231,7 +1177,7 @@ bn_sqr_comba4: | |||
| 1231 | .prologue | 1177 | .prologue |
| 1232 | .fframe 0 | 1178 | .fframe 0 |
| 1233 | .save ar.pfs,r2 | 1179 | .save ar.pfs,r2 |
| 1234 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | 1180 | #if defined(_HPUX_SOURCE) && !defined(_LP64) |
| 1235 | { .mii; alloc r2=ar.pfs,2,1,0,0 | 1181 | { .mii; alloc r2=ar.pfs,2,1,0,0 |
| 1236 | addp4 r32=0,r32 | 1182 | addp4 r32=0,r32 |
| 1237 | addp4 r33=0,r33 };; | 1183 | addp4 r33=0,r33 };; |
| @@ -1264,7 +1210,7 @@ bn_mul_comba4: | |||
| 1264 | .prologue | 1210 | .prologue |
| 1265 | .fframe 0 | 1211 | .fframe 0 |
| 1266 | .save ar.pfs,r2 | 1212 | .save ar.pfs,r2 |
| 1267 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | 1213 | #if defined(_HPUX_SOURCE) && !defined(_LP64) |
| 1268 | { .mii; alloc r2=ar.pfs,3,0,0,0 | 1214 | { .mii; alloc r2=ar.pfs,3,0,0,0 |
| 1269 | addp4 r33=0,r33 | 1215 | addp4 r33=0,r33 |
| 1270 | addp4 r34=0,r34 };; | 1216 | addp4 r34=0,r34 };; |
| @@ -1448,8 +1394,8 @@ bn_mul_comba4: | |||
| 1448 | #define I r21 | 1394 | #define I r21 |
| 1449 | 1395 | ||
| 1450 | #if 0 | 1396 | #if 0 |
| 1451 | // Some preprocessors (most notably HP-UX) apper to be allergic to | 1397 | // Some preprocessors (most notably HP-UX) appear to be allergic to |
| 1452 | // macros enclosed to parenthesis as these three will be. | 1398 | // macros enclosed in parentheses [as these three were]. |
| 1453 | #define cont p16 | 1399 | #define cont p16 |
| 1454 | #define break p0 // p20 | 1400 | #define break p0 // p20 |
| 1455 | #define equ p24 | 1401 | #define equ p24 |
| @@ -1581,9 +1527,18 @@ bn_div_words: | |||
| 1581 | // output: f8 = (int)(a/b) | 1527 | // output: f8 = (int)(a/b) |
| 1582 | // clobbered: f8,f9,f10,f11,pred | 1528 | // clobbered: f8,f9,f10,f11,pred |
| 1583 | pred=p15 | 1529 | pred=p15 |
| 1584 | // This procedure is essentially Intel code and therefore is | 1530 | // One can argue that this snippet is copyrighted to Intel |
| 1585 | // copyrighted to Intel Corporation (I suppose...). It's sligtly | 1531 | // Corporation, as it's essentially identical to one of those |
| 1586 | // modified for specific needs. | 1532 | // found in "Divide, Square Root and Remainder" section at |
| 1533 | // http://www.intel.com/software/products/opensource/libraries/num.htm. | ||
| 1534 | // Yes, I admit that the referred code was used as template, | ||
| 1535 | // but after I realized that there hardly is any other instruction | ||
| 1536 | // sequence which would perform this operation. I mean I figure that | ||
| 1537 | // any independent attempt to implement high-performance division | ||
| 1538 | // will result in code virtually identical to the Intel code. It | ||
| 1539 | // should be noted though that below division kernel is 1 cycle | ||
| 1540 | // faster than Intel one (note commented splits:-), not to mention | ||
| 1541 | // original prologue (rather lack of one) and epilogue. | ||
| 1587 | .align 32 | 1542 | .align 32 |
| 1588 | .skip 16 | 1543 | .skip 16 |
| 1589 | .L_udiv64_32_b6: | 1544 | .L_udiv64_32_b6: |
