author		Denys Vlasenko <vda.linux@googlemail.com>	2021-11-28 15:38:51 +0100
committer	Denys Vlasenko <vda.linux@googlemail.com>	2021-11-28 15:44:08 +0100
commit		90b0d3304455ad432c49f38e0419ac7820a625f7 (patch)
tree		797bc349ec45f6960adea1cac27916ccfe2b6796
parent		832626227ea3798403159080532f763a37273a91 (diff)
tls: P256: add 64-bit montgomery reduce (disabled), small optimization in 32-bit code
function                                             old     new   delta
sp_512to256_mont_reduce_8                            191     185      -6

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--	networking/tls_sp_c32.c	177
1 file changed, 159 insertions(+), 18 deletions(-)
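
The 64-bit path added below relies on the 64-bit little-endian words of p256_mod being ffffffff00000001 0000000000000000 00000000ffffffff ffffffffffffffff, so each multiply-by-modulus-word collapses into shifts and adds. A minimal standalone sketch (not part of the commit; the file name is made up, and it assumes GCC/Clang unsigned __int128 on a 64-bit host) that spot-checks those word identities:

/* p256_identities.c - hypothetical standalone check, not from the commit.
 * Build: cc -O2 p256_identities.c && ./a.out
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 uint128_t; /* assumes GCC/Clang on a 64-bit target */

int main(void)
{
	uint64_t b = 0x0123456789abcdefULL; /* stand-in for r[0] */
	int i;

	/* step b with a 64-bit LCG just to vary the test value */
	for (i = 0; i < 64; i++, b = b * 6364136223846793005ULL + 1) {
		/* a[0] = ffffffffffffffff:  b*a[0] + b == b << 64 */
		assert((uint128_t)b * 0xffffffffffffffffULL + b == (uint128_t)b << 64);

		/* a[1] = 00000000ffffffff:  b*a[1] == (b << 32) - b */
		assert((uint128_t)b * 0xffffffffULL == ((uint128_t)b << 32) - b);

		/* a[2] = 0000000000000000: contributes nothing */

		/* a[3] = ffffffff00000001:  b*a[3] == b + (((b << 32) - b) << 32) */
		assert((uint128_t)b * 0xffffffff00000001ULL
		       == b + ((((uint128_t)b << 32) - b) << 32));
	}
	printf("p256_mod word identities hold\n");
	return 0;
}
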
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index eb6cc2431..b1c410037 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -705,36 +705,174 @@ static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit*
 	}
 }
 
-/* Shift the result in the high 256 bits down to the bottom.
- */
+/* Shift the result in the high 256 bits down to the bottom. */
 static void sp_512to256_mont_shift_8(sp_digit* r, sp_digit* a)
 {
 	memcpy(r, a + 8, sizeof(*r) * 8);
 }
 
+// Disabled for now. Seems to work, but ugly and 40 bytes larger on x86-64.
+#if 0 //UNALIGNED_LE_64BIT
+/* 64-bit little-endian optimized version.
+ * See generic 32-bit version below for explanation.
+ * The benefit of this version is: even though r[3] calculation is atrocious,
+ * we call sp_256_mul_add_4() four times, not 8.
+ */
+static int sp_256_mul_add_4(uint64_t *r /*, const uint64_t* a, uint64_t b*/)
+{
+	uint64_t b = r[0];
+
+# if 0
+	const uint64_t* a = (const void*)p256_mod;
+//a[3..0] = ffffffff00000001 0000000000000000 00000000ffffffff ffffffffffffffff
+	uint128_t t;
+	int i;
+	t = 0;
+	for (i = 0; i < 4; i++) {
+		uint32_t t_hi;
+		uint128_t m = ((uint128_t)b * a[i]) + r[i];
+		t += m;
+		t_hi = (t < m);
+		r[i] = (uint64_t)t;
+		t = (t >> 64) | ((uint128_t)t_hi << 64);
+	}
+	r[4] += (uint64_t)t;
+	return (r[4] < (uint64_t)t); /* 1 if addition overflowed */
+# else
+	// Unroll, then optimize the above loop:
+	//uint32_t t_hi;
+	//uint128_t m;
+	uint64_t t64, t64u;
+
+	//m = ((uint128_t)b * a[0]) + r[0];
+	// Since b is r[0] and a[0] is ffffffffffffffff, the above optimizes to:
+	// m = r[0] * ffffffffffffffff + r[0] = (r[0] << 64 - r[0]) + r[0] = r[0] << 64;
+	//t += m;
+	// t = r[0] << 64 = b << 64;
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[0] = (uint64_t)t;
+//	r[0] = 0;
+//the store can be eliminated since caller won't look at lower 256 bits of the result
+	//t = (t >> 64) | ((uint128_t)t_hi << 64);
+	// t = b;
+
+	//m = ((uint128_t)b * a[1]) + r[1];
+	// Since a[1] is 00000000ffffffff, the above optimizes to:
+	// m = b * ffffffff + r[1] = (b * 100000000 - b) + r[1] = (b << 32) - b + r[1];
+	//t += m;
+	// t = b + (b << 32) - b + r[1] = (b << 32) + r[1];
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[1] = (uint64_t)t;
+	r[1] += (b << 32);
+	//t = (t >> 64) | ((uint128_t)t_hi << 64);
+	t64 = (r[1] < (b << 32));
+	t64 += (b >> 32);
+
+	//m = ((uint128_t)b * a[2]) + r[2];
+	// Since a[2] is 0000000000000000, the above optimizes to:
+	// m = b * 0 + r[2] = r[2];
+	//t += m;
+	// t = t64 + r[2];
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[2] = (uint64_t)t;
+	r[2] += t64;
+	//t = (t >> 64) | ((uint128_t)t_hi << 64);
+	t64 = (r[2] < t64);
+
+	//m = ((uint128_t)b * a[3]) + r[3];
+	// Since a[3] is ffffffff00000001, the above optimizes to:
+	// m = b * ffffffff00000001 + r[3];
+	// m = b + b*ffffffff00000000 + r[3]
+	// m = b + (b*ffffffff << 32) + r[3]
+	// m = b + (((b<<32) - b) << 32) + r[3]
+	//t += m;
+	// t = t64 + (uint128_t)b + ((((uint128_t)b << 32) - b) << 32) + r[3];
+	t64 += b;
+	t64u = (t64 < b);
+	t64 += r[3];
+	t64u += (t64 < r[3]);
+	{
+		uint64_t lo,hi;
+		//lo = (((b << 32) - b) << 32
+		//hi = (((uint128_t)b << 32) - b) >> 32
+		//but without uint128_t:
+		hi = (b << 32) - b; /* form lower 32 bits of "hi" part 1 */
+		b = (b >> 32) - (/*borrowed above?*/(b << 32) < b); /* upper 32 bits of "hi" are in b */
+		lo = hi << 32; /* (use "hi" value to calculate "lo",... */
+		t64 += lo; /* ...consume... */
+		t64u += (t64 < lo); /* ..."lo") */
+		hi >>= 32; /* form lower 32 bits of "hi" part 2 */
+		hi |= (b << 32); /* combine lower and upper */
+		t64u += hi; /* consume "hi" */
+	}
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[3] = (uint64_t)t;
+	r[3] = t64;
+	//t = (t >> 64) | ((uint128_t)t_hi << 64);
+	// t = t64u;
+
+	r[4] += t64u;
+	return (r[4] < t64u); /* 1 if addition overflowed */
+# endif
+}
+
+static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* aa/*, const sp_digit* m, sp_digit mp*/)
+{
+//	const sp_digit* m = p256_mod;
+	int i;
+	uint64_t *a = (void*)aa;
+
+	sp_digit carry = 0;
+	for (i = 0; i < 4; i++) {
+//		mu = a[i];
+		if (sp_256_mul_add_4(a+i /*, m, mu*/)) {
+			int j = i + 4;
+ inc_next_word:
+			if (++j > 7) { /* a[8] array has no more words? */
+				carry++;
+				continue;
+			}
+			if (++a[j] == 0) /* did this overflow too? */
+				goto inc_next_word;
+		}
+	}
+	sp_512to256_mont_shift_8(r, aa);
+	if (carry != 0)
+		sp_256_sub_8_p256_mod(r);
+	sp_256_norm_8(r);
+}
+
+#else /* Generic 32-bit version */
+
 /* Mul a by scalar b and add into r. (r += a * b)
  * a = p256_mod
  * b = r[0]
  */
 static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 {
-//	const sp_digit* a = p256_mod;
-//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
 	sp_digit b = r[0];
-
 	uint64_t t;
 
-//	t = 0;
-//	for (i = 0; i < 8; i++) {
-//		uint32_t t_hi;
-//		uint64_t m = ((uint64_t)b * a[i]) + r[i];
-//		t += m;
-//		t_hi = (t < m);
-//		r[i] = (sp_digit)t;
-//		t = (t >> 32) | ((uint64_t)t_hi << 32);
-//	}
-//	r[8] += (sp_digit)t;
-
+# if 0
+	const sp_digit* a = p256_mod;
+//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
+	int i;
+	t = 0;
+	for (i = 0; i < 8; i++) {
+		uint32_t t_hi;
+		uint64_t m = ((uint64_t)b * a[i]) + r[i];
+		t += m;
+		t_hi = (t < m);
+		r[i] = (sp_digit)t;
+		t = (t >> 32) | ((uint64_t)t_hi << 32);
+	}
+	r[8] += (sp_digit)t;
+	return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
+# else
 	// Unroll, then optimize the above loop:
 	//uint32_t t_hi;
 	uint64_t m;
@@ -748,7 +886,8 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 	//t_hi = (t < m);
 	// t_hi = 0;
 	//r[0] = (sp_digit)t;
-	r[0] = 0;
+//	r[0] = 0;
+//the store can be eliminated since caller won't look at lower 256 bits of the result
 	//t = (t >> 32) | ((uint64_t)t_hi << 32);
 	// t = b;
 
@@ -840,6 +979,7 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 
 	r[8] += (sp_digit)t;
 	return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
+# endif
 }
 
 /* Reduce the number back to 256 bits using Montgomery reduction.
@@ -861,7 +1001,7 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
  * Then a multiple of modulus is added to make T divisible by B^2.
  * [In our case, it is (p256_mp_mod * a[1]) << 32.]
  * And so on. Eventually T is divisible by R, and after division by R
- * the algorithm is in the same place as the usual Montgomery reduction was.
+ * the algorithm is in the same place as the usual Montgomery reduction.
  *
  * TODO: Can conditionally use 64-bit (if bit-little-endian arch) logic?
  */
@@ -914,6 +1054,7 @@ static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* a/*, const sp_digit
 		sp_256_norm_8(r);
 	}
 }
+#endif
 
 /* Multiply two Montogmery form numbers mod the modulus (prime).
  * (r = a * b mod m)
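
The new 64-bit sp_256_mul_add_4() goes in under #if 0 ("Seems to work"), so one way to gain confidence before enabling it is to run the unrolled "# else" branch against the plain "# if 0" loop on random inputs. A minimal standalone harness along those lines is sketched below; it is not part of the commit, the file and helper names are made up, and it assumes GCC/Clang unsigned __int128 on a 64-bit host.

/* mul_add_4_check.c - hypothetical harness, not part of the commit.
 * Build: cc -O2 mul_add_4_check.c && ./a.out
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef unsigned __int128 uint128_t; /* assumes GCC/Clang on a 64-bit target */

/* p256_mod as 64-bit little-endian words (a[3..0] in the patch comment). */
static const uint64_t p256_mod_64[4] = {
	0xffffffffffffffffULL, 0x00000000ffffffffULL,
	0x0000000000000000ULL, 0xffffffff00000001ULL
};

/* Reference: the plain "# if 0" loop (computes r[0..4] += p256_mod * r[0]). */
static int mul_add_4_ref(uint64_t *r)
{
	uint64_t b = r[0];
	uint128_t t = 0;
	int i;

	for (i = 0; i < 4; i++) {
		uint32_t t_hi;
		uint128_t m = ((uint128_t)b * p256_mod_64[i]) + r[i];
		t += m;
		t_hi = (t < m);
		r[i] = (uint64_t)t;
		t = (t >> 64) | ((uint128_t)t_hi << 64);
	}
	r[4] += (uint64_t)t;
	return (r[4] < (uint64_t)t); /* 1 if addition overflowed */
}

/* The unrolled "# else" branch from the patch (it deliberately skips the r[0] store). */
static int mul_add_4_unrolled(uint64_t *r)
{
	uint64_t b = r[0];
	uint64_t t64, t64u;

	r[1] += (b << 32);
	t64 = (r[1] < (b << 32));
	t64 += (b >> 32);

	r[2] += t64;
	t64 = (r[2] < t64);

	t64 += b;
	t64u = (t64 < b);
	t64 += r[3];
	t64u += (t64 < r[3]);
	{
		uint64_t lo, hi;
		hi = (b << 32) - b;
		b = (b >> 32) - ((b << 32) < b);
		lo = hi << 32;
		t64 += lo;
		t64u += (t64 < lo);
		hi >>= 32;
		hi |= (b << 32);
		t64u += hi;
	}
	r[3] = t64;

	r[4] += t64u;
	return (r[4] < t64u);
}

static uint64_t rnd64(void)
{
	static uint64_t s = 0x9e3779b97f4a7c15ULL; /* fixed seed, xorshift64 */
	s ^= s << 13;
	s ^= s >> 7;
	s ^= s << 17;
	return s;
}

int main(void)
{
	int n, i, c;

	for (n = 0; n < 1000000; n++) {
		uint64_t x[5], y[5];
		for (i = 0; i < 5; i++)
			x[i] = y[i] = rnd64();
		c = mul_add_4_ref(x);
		/* r[0] is "don't care" in the unrolled version: compare r[1..4] and the carry */
		assert(mul_add_4_unrolled(y) == c);
		assert(memcmp(x + 1, y + 1, 4 * sizeof(x[0])) == 0);
	}
	printf("unrolled sp_256_mul_add_4 matches the reference loop\n");
	return 0;
}

Only r[1..4] and the returned carry are compared, since the unrolled branch intentionally leaves r[0] stale while the reference loop writes it; the caller discards the low 256 bits anyway.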