diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2024-07-12 11:08:08 +0200 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2024-07-12 11:10:24 +0200 |
commit | 999e290ef64cbd49a9e0a0f6d3cfaf26414c1c3e (patch) | |
tree | 5a312096ffcc40026ad0bc3dd99aa966fcd6e9c3 | |
parent | d745852f136bac4646e50a4f03565273e687b28b (diff) | |
download | busybox-w32-999e290ef64cbd49a9e0a0f6d3cfaf26414c1c3e.tar.gz busybox-w32-999e290ef64cbd49a9e0a0f6d3cfaf26414c1c3e.tar.bz2 busybox-w32-999e290ef64cbd49a9e0a0f6d3cfaf26414c1c3e.zip |
tls: P256: improve x86_64 multiplication asm code
gcc is being rather silly. It uses suboptimal registers,
and does not realize that i and j are never negative,
thus uses even _more_ registers for temporaries
to sign-extend i/j to 64-bit offsets.
function old new delta
sp_256_mont_mul_8 155 132 -23
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | networking/tls_sp_c32.c | 58 |
1 files changed, 36 insertions, 22 deletions
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c index 9ab996f3b..e493c436a 100644 --- a/networking/tls_sp_c32.c +++ b/networking/tls_sp_c32.c | |||
@@ -411,10 +411,10 @@ static void sp_256_sub_8_p256_mod(sp_digit* r) | |||
411 | "\n subl $0xffffffff, (%0)" | 411 | "\n subl $0xffffffff, (%0)" |
412 | "\n sbbl $0xffffffff, 1*4(%0)" | 412 | "\n sbbl $0xffffffff, 1*4(%0)" |
413 | "\n sbbl $0xffffffff, 2*4(%0)" | 413 | "\n sbbl $0xffffffff, 2*4(%0)" |
414 | "\n sbbl $0, 3*4(%0)" | 414 | "\n sbbl $0x00000000, 3*4(%0)" |
415 | "\n sbbl $0, 4*4(%0)" | 415 | "\n sbbl $0x00000000, 4*4(%0)" |
416 | "\n sbbl $0, 5*4(%0)" | 416 | "\n sbbl $0x00000000, 5*4(%0)" |
417 | "\n sbbl $1, 6*4(%0)" | 417 | "\n sbbl $0x00000001, 6*4(%0)" |
418 | "\n sbbl $0xffffffff, 7*4(%0)" | 418 | "\n sbbl $0xffffffff, 7*4(%0)" |
419 | "\n" | 419 | "\n" |
420 | : "=r" (r) | 420 | : "=r" (r) |
@@ -433,10 +433,10 @@ static void sp_256_sub_8_p256_mod(sp_digit* r) | |||
433 | uint64_t ooff; | 433 | uint64_t ooff; |
434 | asm volatile ( | 434 | asm volatile ( |
435 | "\n subq $0xffffffffffffffff, (%0)" | 435 | "\n subq $0xffffffffffffffff, (%0)" |
436 | "\n sbbq %1, 1*8(%0)" | 436 | "\n sbbq %1, 1*8(%0)" // %1 = 00000000ffffffff |
437 | "\n sbbq $0, 2*8(%0)" | 437 | "\n sbbq $0x0000000000000000, 2*8(%0)" |
438 | "\n movq 3*8(%0), %2" | 438 | "\n movq 3*8(%0), %2" |
439 | "\n sbbq $0, %2" // subtract carry | 439 | "\n sbbq $0x0, %2" // subtract carry |
440 | "\n addq %1, %2" // adding 00000000ffffffff (in %1) | 440 | "\n addq %1, %2" // adding 00000000ffffffff (in %1) |
441 | "\n" // is the same as subtracting ffffffff00000001 | 441 | "\n" // is the same as subtracting ffffffff00000001 |
442 | "\n movq %2, 3*8(%0)" | 442 | "\n movq %2, 3*8(%0)" |
@@ -452,9 +452,9 @@ static void sp_256_sub_8_p256_mod(sp_digit* r) | |||
452 | "\n orl $0xffffffff, %%eax" // %1 (rax) = 00000000ffffffff | 452 | "\n orl $0xffffffff, %%eax" // %1 (rax) = 00000000ffffffff |
453 | "\n subq $0xffffffffffffffff, (%0)" | 453 | "\n subq $0xffffffffffffffff, (%0)" |
454 | "\n sbbq %1, 1*8(%0)" | 454 | "\n sbbq %1, 1*8(%0)" |
455 | "\n sbbq $0, 2*8(%0)" | 455 | "\n sbbq $0x0000000000000000, 2*8(%0)" |
456 | "\n movq 3*8(%0), %2" | 456 | "\n movq 3*8(%0), %2" |
457 | "\n sbbq $0, %2" // subtract carry | 457 | "\n sbbq $0x0, %2" // subtract carry |
458 | "\n addq %1, %2" // adding 00000000ffffffff (in %1) | 458 | "\n addq %1, %2" // adding 00000000ffffffff (in %1) |
459 | "\n" // is the same as subtracting ffffffff00000001 | 459 | "\n" // is the same as subtracting ffffffff00000001 |
460 | "\n movq %2, 3*8(%0)" | 460 | "\n movq %2, 3*8(%0)" |
@@ -495,15 +495,23 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) | |||
495 | //////////////////////// | 495 | //////////////////////// |
496 | // uint64_t m = ((uint64_t)a[i]) * b[j]; | 496 | // uint64_t m = ((uint64_t)a[i]) * b[j]; |
497 | // acc_hi:acch:accl += m; | 497 | // acc_hi:acch:accl += m; |
498 | long eax_clobbered; | ||
498 | asm volatile ( | 499 | asm volatile ( |
499 | // a[i] is already loaded in %%eax | 500 | // a[i] is already loaded in %%eax |
500 | "\n mull %7" | 501 | "\n mull %8" |
501 | "\n addl %%eax, %0" | 502 | "\n addl %%eax, %0" |
502 | "\n adcl %%edx, %1" | 503 | "\n adcl %%edx, %1" |
503 | "\n adcl $0, %2" | 504 | "\n adcl $0x0, %2" |
504 | : "=rm" (accl), "=rm" (acch), "=rm" (acc_hi) | 505 | : "=rm" (accl), "=rm" (acch), "=rm" (acc_hi), "=a" (eax_clobbered) |
505 | : "0" (accl), "1" (acch), "2" (acc_hi), "a" (a[i]), "m" (b[j]) | 506 | : "0" (accl), "1" (acch), "2" (acc_hi), "3" (a[i]), "m" (b[j]) |
506 | : "cc", "dx" | 507 | : "cc", "dx" |
508 | // What is "eax_clobbered"? gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html: | ||
509 | // "Do not modify the contents of input-only operands (except for inputs tied | ||
510 | // to outputs). The compiler assumes that on exit from the asm statement these | ||
511 | // operands contain the same values as they had before executing the statement. | ||
512 | // It is not possible to use clobbers to inform the compiler that the values | ||
513 | // in these inputs are changing. One common work-around is to tie the changing | ||
514 | // input variable to an output variable that never gets used." | ||
507 | ); | 515 | ); |
508 | //////////////////////// | 516 | //////////////////////// |
509 | j--; | 517 | j--; |
@@ -519,15 +527,20 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) | |||
519 | const uint64_t* bb = (const void*)b; | 527 | const uint64_t* bb = (const void*)b; |
520 | uint64_t* rr = (void*)r; | 528 | uint64_t* rr = (void*)r; |
521 | int k; | 529 | int k; |
522 | uint64_t accl; | 530 | register uint64_t accl asm("r8"); |
523 | uint64_t acch; | 531 | register uint64_t acch asm("r9"); |
532 | /* ^^^ ask gcc to not use rax/rdx/input arg regs for accumulator variables */ | ||
533 | /* (or else it may generate lots of silly mov's and even xchg's!) */ | ||
524 | 534 | ||
525 | acch = accl = 0; | 535 | acch = accl = 0; |
526 | for (k = 0; k < 7; k++) { | 536 | for (k = 0; k < 7; k++) { |
527 | int i, j; | 537 | unsigned i, j; |
528 | uint64_t acc_hi; | 538 | /* ^^^^^ not signed "int", |
539 | * or gcc can use a temp register to sign-extend i,j for aa[i],bb[j] */ | ||
540 | register uint64_t acc_hi asm("r10"); | ||
541 | /* ^^^ ask gcc to not use rax/rdx/input arg regs for accumulators */ | ||
529 | i = k - 3; | 542 | i = k - 3; |
530 | if (i < 0) | 543 | if ((int)i < 0) |
531 | i = 0; | 544 | i = 0; |
532 | j = k - i; | 545 | j = k - i; |
533 | acc_hi = 0; | 546 | acc_hi = 0; |
@@ -535,14 +548,15 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) | |||
535 | //////////////////////// | 548 | //////////////////////// |
536 | // uint128_t m = ((uint128_t)a[i]) * b[j]; | 549 | // uint128_t m = ((uint128_t)a[i]) * b[j]; |
537 | // acc_hi:acch:accl += m; | 550 | // acc_hi:acch:accl += m; |
551 | long rax_clobbered; | ||
538 | asm volatile ( | 552 | asm volatile ( |
539 | // aa[i] is already loaded in %%rax | 553 | // aa[i] is already loaded in %%rax |
540 | "\n mulq %7" | 554 | "\n mulq %8" |
541 | "\n addq %%rax, %0" | 555 | "\n addq %%rax, %0" |
542 | "\n adcq %%rdx, %1" | 556 | "\n adcq %%rdx, %1" |
543 | "\n adcq $0, %2" | 557 | "\n adcq $0x0, %2" |
544 | : "=rm" (accl), "=rm" (acch), "=rm" (acc_hi) | 558 | : "=rm" (accl), "=rm" (acch), "=rm" (acc_hi), "=a" (rax_clobbered) |
545 | : "0" (accl), "1" (acch), "2" (acc_hi), "a" (aa[i]), "m" (bb[j]) | 559 | : "0" (accl), "1" (acch), "2" (acc_hi), "3" (aa[i]), "m" (bb[j]) |
546 | : "cc", "dx" | 560 | : "cc", "dx" |
547 | ); | 561 | ); |
548 | //////////////////////// | 562 | //////////////////////// |