author     Denys Vlasenko <vda.linux@googlemail.com>  2021-10-06 00:19:30 +0200
committer  Denys Vlasenko <vda.linux@googlemail.com>  2021-10-06 00:24:03 +0200
commit     2430fcfd8de47f786aca1185ae0500fa36c6a548 (patch)
tree       7602d8ecdc0c38f272e03dffde9b20c949d1fd9c
parent     bbd723ebec33aa14746dde88b982b160977938b6 (diff)
download   busybox-w32-2430fcfd8de47f786aca1185ae0500fa36c6a548.tar.gz
           busybox-w32-2430fcfd8de47f786aca1185ae0500fa36c6a548.tar.bz2
           busybox-w32-2430fcfd8de47f786aca1185ae0500fa36c6a548.zip
tls: optimize sp_256_mont_reduce_8 in P256
The code size decrease is small, but we eliminate ALL multiplies!
function                 old     new   delta
sp_256_mont_reduce_8     268     262      -6
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
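
A minimal standalone C sketch (not from the commit) of the identity the unrolled code below relies on: every p256_mod digit is ffffffff, 00000001 or 00000000, so b * digit reduces to (b << 32) - b, to b, or to 0, and no hardware multiply is needed. The test digit is arbitrary.

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t b = 0xdeadbeef; /* arbitrary 32-bit digit */
		/* b * ffffffff = b * 100000000 - b = (b << 32) - b */
		uint64_t by_mul   = (uint64_t)b * 0xffffffff;
		uint64_t by_shift = ((uint64_t)b << 32) - b;
		printf("mul=%016llx shift=%016llx %s\n",
			(unsigned long long)by_mul,
			(unsigned long long)by_shift,
			by_mul == by_shift ? "equal" : "DIFFER");
		/* digits 00000001 and 00000000 give b and 0 trivially */
		return 0;
	}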
-rw-r--r--  networking/tls_sp_c32.c  146
1 file changed, 125 insertions(+), 21 deletions(-)
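
The reworked reduction no longer special-cases the last loop iteration: when sp_256_mul_add_8 reports a carry out of word i+8, the new inc_next_word loops ripple the increment through the upper words, and a carry that falls off word 15 is counted in word16th. A minimal sketch of that ripple pattern (function name and test values are hypothetical, not from the commit):

	#include <stdio.h>
	#include <stdint.h>

	/* Ripple a carry into the words above `start`; return 1 if the
	 * carry ran off the top of the array (analogous to word16th++). */
	static int propagate_carry(uint32_t *w, int start, int nwords)
	{
		int j = start;
		while (++j < nwords) {
			if (++w[j] != 0) /* carry absorbed in this word */
				return 0;
		}
		return 1; /* carry fell off the top */
	}

	int main(void)
	{
		uint32_t a[4] = { 0, 0xffffffff, 0xffffffff, 5 };
		/* a carry out of word 0 ripples through two all-ones words */
		int off_top = propagate_carry(a, 0, 4);
		printf("a = %08x %08x %08x %08x, off_top = %d\n",
			a[3], a[2], a[1], a[0], off_top);
		/* expect: a = 00000006 00000000 00000000 00000000, off_top = 0 */
		return 0;
	}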
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index e1c4cdd54..0773a2d47 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -488,19 +488,118 @@ static void sp_256_mont_shift_8(sp_digit* r, const sp_digit* a)
 }
 
 /* Mul a by scalar b and add into r. (r += a * b) */
-static int sp_256_mul_add_8(sp_digit* r, const sp_digit* a, sp_digit b)
+static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 {
+//	const sp_digit* a = p256_mod;
+//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
+	sp_digit b = r[0];
 	uint64_t t = 0;
-	int i;
 
-	for (i = 0; i < 8; i++) {
-		uint32_t t_hi;
-		uint64_t m = ((uint64_t)b * a[i]) + r[i];
+//	for (i = 0; i < 8; i++) {
+//		uint32_t t_hi;
+//		uint64_t m = ((uint64_t)b * a[i]) + r[i];
+//		t += m;
+//		t_hi = (t < m);
+//		r[i] = (sp_digit)t;
+//		t = (t >> 32) | ((uint64_t)t_hi << 32);
+//	}
+//	r[8] += (sp_digit)t;
+
+	// Unroll, then optimize the above loop:
+	//uint32_t t_hi;
+	uint64_t m;
+
+	//m = ((uint64_t)b * a[0]) + r[0];
+	// Since b is r[0] and a[0] is ffffffff, the above optimizes to:
+	// m = r[0] * ffffffff + r[0] = (r[0] * 100000000 - r[0]) + r[0] = r[0] << 32;
+	//t += m;
+	// t = (uint64_t)r[0] << 32;
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[0] = (sp_digit)t;
+	r[0] = 0;
+	//t = (t >> 32) | ((uint64_t)t_hi << 32);
+	// t = b;
+
+	//m = ((uint64_t)b * a[1]) + r[1];
+	// Since a[1] is ffffffff, the above optimizes to:
+	// m = b * ffffffff + r[1] = (b * 100000000 - b) + r[1] = (b << 32) - b + r[1];
+	//t += m;
+	// t = b + (b << 32) - b + r[1] = (b << 32) + r[1];
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[1] = (sp_digit)t;
+	// r[1] = r[1];
+	//t = (t >> 32) | ((uint64_t)t_hi << 32);
+	// t = b;
+
+	//m = ((uint64_t)b * a[2]) + r[2];
+	// Since a[2] is ffffffff, the above optimizes to:
+	// m = b * ffffffff + r[2] = (b * 100000000 - b) + r[2] = (b << 32) - b + r[2];
+	//t += m;
+	// t = b + (b << 32) - b + r[2] = (b << 32) + r[2]
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[2] = (sp_digit)t;
+	// r[2] = r[2];
+	//t = (t >> 32) | ((uint64_t)t_hi << 32);
+	// t = b;
+
+	//m = ((uint64_t)b * a[3]) + r[3];
+	// Since a[3] is 00000000, the above optimizes to:
+	// m = b * 0 + r[3] = r[3];
+	//t += m;
+	// t += r[3];
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[3] = (sp_digit)t;
+	r[3] = r[3] + b;
+	//t = (t >> 32) | ((uint64_t)t_hi << 32);
+	t = (r[3] < b);
+
+	//m = ((uint64_t)b * a[4]) + r[4];
+	// Since a[4] is 00000000, the above optimizes to:
+	// m = b * 0 + r[4] = r[4];
+	//t += m;
+	t += r[4];
+	//t_hi = (t < m);
+	// t_hi = 0;
+	r[4] = (sp_digit)t;
+	//t = (t >> 32) | ((uint64_t)t_hi << 32);
+	t = (t >> 32);
+
+	//m = ((uint64_t)b * a[5]) + r[5];
+	// Since a[5] is 00000000, the above optimizes to:
+	// m = b * 0 + r[5] = r[5];
+	//t += m;
+	t += r[5];
+	//t_hi = (t < m);
+	// t_hi = 0;
+	r[5] = (sp_digit)t;
+	//t = (t >> 32) | ((uint64_t)t_hi << 32);
+	t = (t >> 32);
+
+	//m = ((uint64_t)b * a[6]) + r[6];
+	// Since a[6] is 00000001, the above optimizes to:
+	m = (uint64_t)b + r[6]; // 33 bits at most
 	t += m;
-		t_hi = (t < m);
-		r[i] = (sp_digit)t;
-		t = (t >> 32) | ((uint64_t)t_hi << 32);
-	}
+	//t_hi = (t < m);
+	// t_hi = 0; //32bit_value + 33bit_value can't overflow 64 bits
+	r[6] = (sp_digit)t;
+	//t = (t >> 32) | ((uint64_t)t_hi << 32);
+	t = (t >> 32);
+
+	//m = ((uint64_t)b * a[7]) + r[7];
+	// Since a[7] is ffffffff, the above optimizes to:
+	// m = b * ffffffff + r[7] = (b * 100000000 - b) + r[7]
+	m = ((uint64_t)b << 32) - b + r[7];
+	t += m;
+	//t_hi = (t < m);
+	// t_hi in fact is always 0 here
+	r[7] = (sp_digit)t;
+	//t = (t >> 32) | ((uint64_t)t_hi << 32);
+	t = (t >> 32);
+
 	r[8] += (sp_digit)t;
 	return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
 }
@@ -517,28 +616,33 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/
 	sp_digit mp = p256_mp_mod;
 
 	int i;
-	sp_digit mu;
+//	sp_digit mu;
 
 	if (mp != 1) {
-		int too_wide;
-		for (i = 0; i < 7; i++) {
-			mu = (sp_digit)(a[i] * mp);
-			if (sp_256_mul_add_8(a+i, m, mu))
-				(a+i)[9]++;
+		sp_digit word16th = 0;
+		for (i = 0; i < 8; i++) {
+//			mu = (sp_digit)(a[i] * mp);
+			if (sp_256_mul_add_8(a+i /*, m, mu*/)) {
+				int j = i + 8;
+ inc_next_word0:
+				if (++j > 15) { /* a[16] array has no more words? */
+					word16th++;
+					continue;
+				}
+				if (++a[j] == 0) /* did this overflow too? */
+					goto inc_next_word0;
+			}
 		}
-		mu = (sp_digit)(a[7] * mp);
-		too_wide = sp_256_mul_add_8(a+7, m, mu);
 		sp_256_mont_shift_8(a, a);
-		if (too_wide)
+		if (word16th != 0)
 			sp_256_sub_8(a, a, m);
 		sp_256_norm_8(a);
 	}
 	else { /* Same code for explicit mp == 1 (which is always the case for P256) */
 		sp_digit word16th = 0;
 		for (i = 0; i < 8; i++) {
-			mu = a[i];
-			//m = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
-			if (sp_256_mul_add_8(a+i, m, mu)) {
+//			mu = a[i];
+			if (sp_256_mul_add_8(a+i /*, m, mu*/)) {
 				int j = i + 8;
  inc_next_word:
 				if (++j > 15) { /* a[16] array has no more words? */