aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2021-10-06 00:19:30 +0200
committerDenys Vlasenko <vda.linux@googlemail.com>2021-10-06 00:24:03 +0200
commit2430fcfd8de47f786aca1185ae0500fa36c6a548 (patch)
tree7602d8ecdc0c38f272e03dffde9b20c949d1fd9c
parentbbd723ebec33aa14746dde88b982b160977938b6 (diff)
downloadbusybox-w32-2430fcfd8de47f786aca1185ae0500fa36c6a548.tar.gz
busybox-w32-2430fcfd8de47f786aca1185ae0500fa36c6a548.tar.bz2
busybox-w32-2430fcfd8de47f786aca1185ae0500fa36c6a548.zip
tls: optimize sp_256_mont_reduce_8 in P256
The code size decrease is small, but we eliminate ALL multiplies! function old new delta sp_256_mont_reduce_8 268 262 -6 Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--networking/tls_sp_c32.c146
1 files changed, 125 insertions, 21 deletions
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index e1c4cdd54..0773a2d47 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -488,19 +488,118 @@ static void sp_256_mont_shift_8(sp_digit* r, const sp_digit* a)
488} 488}
489 489
490/* Mul a by scalar b and add into r. (r += a * b) */ 490/* Mul a by scalar b and add into r. (r += a * b) */
491static int sp_256_mul_add_8(sp_digit* r, const sp_digit* a, sp_digit b) 491static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
492{ 492{
493// const sp_digit* a = p256_mod;
494//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
495 sp_digit b = r[0];
493 uint64_t t = 0; 496 uint64_t t = 0;
494 int i;
495 497
496 for (i = 0; i < 8; i++) { 498// for (i = 0; i < 8; i++) {
497 uint32_t t_hi; 499// uint32_t t_hi;
498 uint64_t m = ((uint64_t)b * a[i]) + r[i]; 500// uint64_t m = ((uint64_t)b * a[i]) + r[i];
501// t += m;
502// t_hi = (t < m);
503// r[i] = (sp_digit)t;
504// t = (t >> 32) | ((uint64_t)t_hi << 32);
505// }
506// r[8] += (sp_digit)t;
507
508 // Unroll, then optimize the above loop:
509 //uint32_t t_hi;
510 uint64_t m;
511
512 //m = ((uint64_t)b * a[0]) + r[0];
513 // Since b is r[0] and a[0] is ffffffff, the above optimizes to:
514 // m = r[0] * ffffffff + r[0] = (r[0] * 100000000 - r[0]) + r[0] = r[0] << 32;
515 //t += m;
516 // t = (uint64_t)r[0] << 32;
517 //t_hi = (t < m);
518 // t_hi = 0;
519 //r[0] = (sp_digit)t;
520 r[0] = 0;
521 //t = (t >> 32) | ((uint64_t)t_hi << 32);
522 // t = b;
523
524 //m = ((uint64_t)b * a[1]) + r[1];
525 // Since a[1] is ffffffff, the above optimizes to:
526 // m = b * ffffffff + r[1] = (b * 100000000 - b) + r[1] = (b << 32) - b + r[1];
527 //t += m;
528 // t = b + (b << 32) - b + r[1] = (b << 32) + r[1];
529 //t_hi = (t < m);
530 // t_hi = 0;
531 //r[1] = (sp_digit)t;
532 // r[1] = r[1];
533 //t = (t >> 32) | ((uint64_t)t_hi << 32);
534 // t = b;
535
536 //m = ((uint64_t)b * a[2]) + r[2];
537 // Since a[2] is ffffffff, the above optimizes to:
538 // m = b * ffffffff + r[2] = (b * 100000000 - b) + r[2] = (b << 32) - b + r[2];
539 //t += m;
540 // t = b + (b << 32) - b + r[2] = (b << 32) + r[2]
541 //t_hi = (t < m);
542 // t_hi = 0;
543 //r[2] = (sp_digit)t;
544 // r[2] = r[2];
545 //t = (t >> 32) | ((uint64_t)t_hi << 32);
546 // t = b;
547
548 //m = ((uint64_t)b * a[3]) + r[3];
549 // Since a[3] is 00000000, the above optimizes to:
550 // m = b * 0 + r[3] = r[3];
551 //t += m;
552 // t += r[3];
553 //t_hi = (t < m);
554 // t_hi = 0;
555 //r[3] = (sp_digit)t;
556 r[3] = r[3] + b;
557 //t = (t >> 32) | ((uint64_t)t_hi << 32);
558 t = (r[3] < b);
559
560 //m = ((uint64_t)b * a[4]) + r[4];
561 // Since a[4] is 00000000, the above optimizes to:
562 // m = b * 0 + r[4] = r[4];
563 //t += m;
564 t += r[4];
565 //t_hi = (t < m);
566 // t_hi = 0;
567 r[4] = (sp_digit)t;
568 //t = (t >> 32) | ((uint64_t)t_hi << 32);
569 t = (t >> 32);
570
571 //m = ((uint64_t)b * a[5]) + r[5];
572 // Since a[5] is 00000000, the above optimizes to:
573 // m = b * 0 + r[5] = r[5];
574 //t += m;
575 t += r[5];
576 //t_hi = (t < m);
577 // t_hi = 0;
578 r[5] = (sp_digit)t;
579 //t = (t >> 32) | ((uint64_t)t_hi << 32);
580 t = (t >> 32);
581
582 //m = ((uint64_t)b * a[6]) + r[6];
583 // Since a[6] is 00000001, the above optimizes to:
584 m = (uint64_t)b + r[6]; // 33 bits at most
499 t += m; 585 t += m;
500 t_hi = (t < m); 586 //t_hi = (t < m);
501 r[i] = (sp_digit)t; 587 // t_hi = 0; //32bit_value + 33bit_value can't overflow 64 bits
502 t = (t >> 32) | ((uint64_t)t_hi << 32); 588 r[6] = (sp_digit)t;
503 } 589 //t = (t >> 32) | ((uint64_t)t_hi << 32);
590 t = (t >> 32);
591
592 //m = ((uint64_t)b * a[7]) + r[7];
593 // Since a[7] is ffffffff, the above optimizes to:
594 // m = b * ffffffff + r[7] = (b * 100000000 - b) + r[7]
595 m = ((uint64_t)b << 32) - b + r[7];
596 t += m;
597 //t_hi = (t < m);
598 // t_hi in fact is always 0 here
599 r[7] = (sp_digit)t;
600 //t = (t >> 32) | ((uint64_t)t_hi << 32);
601 t = (t >> 32);
602
504 r[8] += (sp_digit)t; 603 r[8] += (sp_digit)t;
505 return (r[8] < (sp_digit)t); /* 1 if addition overflowed */ 604 return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
506} 605}
@@ -517,28 +616,33 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/
517 sp_digit mp = p256_mp_mod; 616 sp_digit mp = p256_mp_mod;
518 617
519 int i; 618 int i;
520 sp_digit mu; 619// sp_digit mu;
521 620
522 if (mp != 1) { 621 if (mp != 1) {
523 int too_wide; 622 sp_digit word16th = 0;
524 for (i = 0; i < 7; i++) { 623 for (i = 0; i < 8; i++) {
525 mu = (sp_digit)(a[i] * mp); 624// mu = (sp_digit)(a[i] * mp);
526 if (sp_256_mul_add_8(a+i, m, mu)) 625 if (sp_256_mul_add_8(a+i /*, m, mu*/)) {
527 (a+i)[9]++; 626 int j = i + 8;
627 inc_next_word0:
628 if (++j > 15) { /* a[16] array has no more words? */
629 word16th++;
630 continue;
631 }
632 if (++a[j] == 0) /* did this overflow too? */
633 goto inc_next_word0;
634 }
528 } 635 }
529 mu = (sp_digit)(a[7] * mp);
530 too_wide = sp_256_mul_add_8(a+7, m, mu);
531 sp_256_mont_shift_8(a, a); 636 sp_256_mont_shift_8(a, a);
532 if (too_wide) 637 if (word16th != 0)
533 sp_256_sub_8(a, a, m); 638 sp_256_sub_8(a, a, m);
534 sp_256_norm_8(a); 639 sp_256_norm_8(a);
535 } 640 }
536 else { /* Same code for explicit mp == 1 (which is always the case for P256) */ 641 else { /* Same code for explicit mp == 1 (which is always the case for P256) */
537 sp_digit word16th = 0; 642 sp_digit word16th = 0;
538 for (i = 0; i < 8; i++) { 643 for (i = 0; i < 8; i++) {
539 mu = a[i]; 644// mu = a[i];
540//m = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff 645 if (sp_256_mul_add_8(a+i /*, m, mu*/)) {
541 if (sp_256_mul_add_8(a+i, m, mu)) {
542 int j = i + 8; 646 int j = i + 8;
543 inc_next_word: 647 inc_next_word:
544 if (++j > 15) { /* a[16] array has no more words? */ 648 if (++j > 15) { /* a[16] array has no more words? */