author     Denys Vlasenko <vda.linux@googlemail.com>   2021-10-06 10:15:29 +0200
committer  Denys Vlasenko <vda.linux@googlemail.com>   2021-10-06 10:15:29 +0200
commit     00f2cceb6aa194aadcbe70675a0f0a0660aea233 (patch)
tree       6d570d98453b6a54d655e37a3e2064dfa1fe357a
parent     c78428461513afed5e3bf272bcbf17964cbd61a3 (diff)
download   busybox-w32-00f2cceb6aa194aadcbe70675a0f0a0660aea233.tar.gz
           busybox-w32-00f2cceb6aa194aadcbe70675a0f0a0660aea233.tar.bz2
           busybox-w32-00f2cceb6aa194aadcbe70675a0f0a0660aea233.zip
tls: P256: shrink sp_256_mul_add_8 a bit more
function                                             old     new   delta
sp_256_mont_reduce_8                                 257     245     -12

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--   networking/tls_sp_c32.c   38
1 file changed, 24 insertions, 14 deletions
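For context (not part of the commit message): sp_256_mul_add_8 adds r[0] * p256_mod to r one 32-bit word at a time, and the middle words of p256_mod (a[3], a[4], a[5]) are zero, so those steps only ever have to propagate a 0-or-1 carry. The hunks below replace the 64-bit accumulator t with such a flag (t32) for exactly those words. A minimal standalone sketch of why the two forms agree, using hypothetical helper names (generic_step, carry_step) rather than anything from tls_sp_c32.c:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* One iteration of the generic (commented-out) loop:
 * t = b*a[i] + r[i] + carry_in; low 32 bits go back into r[i],
 * high 32 bits become the carry into the next word.
 * Here carry_in is assumed to be 0 or 1, which is the case the
 * patch exploits once a[i] is a zero word of p256_mod. */
static uint32_t generic_step(uint32_t *r_i, uint32_t a_i, uint32_t b, uint32_t carry_in)
{
	uint64_t t = (uint64_t)b * a_i + *r_i + carry_in;
	*r_i = (uint32_t)t;
	return (uint32_t)(t >> 32);
}

/* Specialized step for a_i == 0: only a 0-or-1 carry can come in,
 * so a conditional increment is enough (the idea behind t32). */
static uint32_t carry_step(uint32_t *r_i, uint32_t carry)
{
	if (carry != 0) {
		(*r_i)++;
		carry = (*r_i == 0); /* wrapped around to 0 -> carry out */
	}
	return carry;
}

int main(void)
{
	static const uint32_t vals[] = { 0, 1, 0x7fffffff, 0xfffffffe, 0xffffffff };
	unsigned i, carry;

	for (i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
		for (carry = 0; carry <= 1; carry++) {
			uint32_t r1 = vals[i], r2 = vals[i];
			uint32_t c1 = generic_step(&r1, /*a_i:*/ 0, /*b:*/ 0xdeadbeef, carry);
			uint32_t c2 = carry_step(&r2, carry);
			assert(r1 == r2 && c1 == c2);
		}
	}
	printf("zero-word steps: conditional increment matches the 64-bit form\n");
	return 0;
}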
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 1ab6106a7..6fca2aad8 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -569,8 +569,10 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 //	const sp_digit* a = p256_mod;
 //a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
 	sp_digit b = r[0];
-	uint64_t t = 0;
 
+	uint64_t t;
+
+//	t = 0;
 //	for (i = 0; i < 8; i++) {
 //		uint32_t t_hi;
 //		uint64_t m = ((uint64_t)b * a[i]) + r[i];
@@ -584,12 +586,13 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 	// Unroll, then optimize the above loop:
 	//uint32_t t_hi;
 	uint64_t m;
+	uint32_t t32;
 
 	//m = ((uint64_t)b * a[0]) + r[0];
 	// Since b is r[0] and a[0] is ffffffff, the above optimizes to:
 	// m = r[0] * ffffffff + r[0] = (r[0] * 100000000 - r[0]) + r[0] = r[0] << 32;
 	//t += m;
-	// t = (uint64_t)r[0] << 32;
+	// t = r[0] << 32 = b << 32;
 	//t_hi = (t < m);
 	// t_hi = 0;
 	//r[0] = (sp_digit)t;
@@ -625,42 +628,49 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 	// Since a[3] is 00000000, the above optimizes to:
 	// m = b * 0 + r[3] = r[3];
 	//t += m;
-	// t += r[3];
+	// t = b + r[3];
 	//t_hi = (t < m);
 	// t_hi = 0;
 	//r[3] = (sp_digit)t;
 	r[3] = r[3] + b;
 	//t = (t >> 32) | ((uint64_t)t_hi << 32);
-	t = (r[3] < b);
+	t32 = (r[3] < b); // 0 or 1
 
 	//m = ((uint64_t)b * a[4]) + r[4];
 	// Since a[4] is 00000000, the above optimizes to:
 	// m = b * 0 + r[4] = r[4];
 	//t += m;
-	t += r[4];
+	// t = t32 + r[4];
 	//t_hi = (t < m);
 	// t_hi = 0;
-	r[4] = (sp_digit)t;
+	//r[4] = (sp_digit)t;
 	//t = (t >> 32) | ((uint64_t)t_hi << 32);
-	t = (t >> 32);
+	if (t32 != 0) {
+		r[4]++;
+		t32 = (r[4] == 0); // 0 or 1
 
 	//m = ((uint64_t)b * a[5]) + r[5];
 	// Since a[5] is 00000000, the above optimizes to:
 	// m = b * 0 + r[5] = r[5];
 	//t += m;
-	t += r[5];
+	// t = t32 + r[5]; (t32 is 0 or 1)
 	//t_hi = (t < m);
 	// t_hi = 0;
-	r[5] = (sp_digit)t;
+	//r[5] = (sp_digit)t;
 	//t = (t >> 32) | ((uint64_t)t_hi << 32);
-	t = (t >> 32);
+		if (t32 != 0) {
+			r[5]++;
+			t32 = (r[5] == 0); // 0 or 1
+		}
+	}
 
 	//m = ((uint64_t)b * a[6]) + r[6];
 	// Since a[6] is 00000001, the above optimizes to:
-	m = (uint64_t)b + r[6]; // 33 bits at most
-	t += m;
+	// m = (uint64_t)b + r[6]; // 33 bits at most
+	//t += m;
+	t = t32 + (uint64_t)b + r[6];
 	//t_hi = (t < m);
-	// t_hi = 0; //32bit_value + 33bit_value can't overflow 64 bits
+	// t_hi = 0;
 	r[6] = (sp_digit)t;
 	//t = (t >> 32) | ((uint64_t)t_hi << 32);
 	t = (t >> 32);
@@ -671,7 +681,7 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 	m = ((uint64_t)b << 32) - b + r[7];
 	t += m;
 	//t_hi = (t < m);
-	// t_hi in fact is always 0 here
+	// t_hi in fact is always 0 here (256bit * 32bit can't have more than 32 bits of overflow)
 	r[7] = (sp_digit)t;
 	//t = (t >> 32) | ((uint64_t)t_hi << 32);
 	t = (t >> 32);
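As a side note (not part of the commit): the "optimizes to" comments for the ffffffff words of p256_mod rely on b * ffffffff = (b << 32) - b. A throwaway check of those identities, independent of the busybox code, might look like this:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	static const uint32_t v[] = { 0, 1, 2, 0x12345678, 0x7fffffff, 0xfffffffe, 0xffffffff };
	unsigned i, j;

	for (i = 0; i < sizeof(v) / sizeof(v[0]); i++) {
		uint32_t b = v[i];
		/* a[0] is ffffffff and b is r[0], so b*ffffffff + r[0] = b << 32 */
		assert((uint64_t)b * 0xffffffff + b == (uint64_t)b << 32);
		for (j = 0; j < sizeof(v) / sizeof(v[0]); j++) {
			uint32_t r7 = v[j];
			/* a[7] is ffffffff: b*ffffffff + r[7] = (b << 32) - b + r[7] */
			assert((uint64_t)b * 0xffffffff + r7 == ((uint64_t)b << 32) - b + r7);
		}
	}
	printf("shift/subtract identities hold for sampled values\n");
	return 0;
}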