author		Denys Vlasenko <vda.linux@googlemail.com>	2021-11-28 15:38:51 +0100
committer	Denys Vlasenko <vda.linux@googlemail.com>	2021-11-28 15:44:08 +0100
commit		90b0d3304455ad432c49f38e0419ac7820a625f7 (patch)
tree		797bc349ec45f6960adea1cac27916ccfe2b6796
parent		832626227ea3798403159080532f763a37273a91 (diff)
tls: P256: add 64-bit montgomery reduce (disabled), small optimization in 32-bit code
function                                             old     new   delta
sp_512to256_mont_reduce_8                            191     185      -6

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--	networking/tls_sp_c32.c	177
1 file changed, 159 insertions(+), 18 deletions(-)
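
The 64-bit path added below relies on the 64-bit little-endian words of p256_mod being ffffffff00000001 0000000000000000 00000000ffffffff ffffffffffffffff, so each multiply-by-modulus-word collapses into shifts and adds. A minimal standalone sketch (not part of the commit; the file name is made up, and it assumes GCC/Clang unsigned __int128 on a 64-bit host) that spot-checks those word identities:

/* p256_identities.c - hypothetical standalone check, not from the commit.
 * Build: cc -O2 p256_identities.c && ./a.out
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 uint128_t; /* assumes GCC/Clang on a 64-bit target */

int main(void)
{
	uint64_t b = 0x0123456789abcdefULL; /* stand-in for r[0] */
	int i;

	/* step b with a 64-bit LCG just to vary the test value */
	for (i = 0; i < 64; i++, b = b * 6364136223846793005ULL + 1) {
		/* a[0] = ffffffffffffffff:  b*a[0] + b == b << 64 */
		assert((uint128_t)b * 0xffffffffffffffffULL + b == (uint128_t)b << 64);

		/* a[1] = 00000000ffffffff:  b*a[1] == (b << 32) - b */
		assert((uint128_t)b * 0xffffffffULL == ((uint128_t)b << 32) - b);

		/* a[2] = 0000000000000000: contributes nothing */

		/* a[3] = ffffffff00000001:  b*a[3] == b + (((b << 32) - b) << 32) */
		assert((uint128_t)b * 0xffffffff00000001ULL
		       == b + ((((uint128_t)b << 32) - b) << 32));
	}
	printf("p256_mod word identities hold\n");
	return 0;
}
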
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index eb6cc2431..b1c410037 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -705,36 +705,174 @@ static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit*
 	}
 }
 
-/* Shift the result in the high 256 bits down to the bottom.
- */
+/* Shift the result in the high 256 bits down to the bottom. */
 static void sp_512to256_mont_shift_8(sp_digit* r, sp_digit* a)
 {
 	memcpy(r, a + 8, sizeof(*r) * 8);
 }
 
+// Disabled for now. Seems to work, but ugly and 40 bytes larger on x86-64.
+#if 0 //UNALIGNED_LE_64BIT
+/* 64-bit little-endian optimized version.
+ * See generic 32-bit version below for explanation.
+ * The benefit of this version is: even though r[3] calculation is atrocious,
+ * we call sp_256_mul_add_4() four times, not 8.
+ */
+static int sp_256_mul_add_4(uint64_t *r /*, const uint64_t* a, uint64_t b*/)
+{
+	uint64_t b = r[0];
+
+# if 0
+	const uint64_t* a = (const void*)p256_mod;
+//a[3..0] = ffffffff00000001 0000000000000000 00000000ffffffff ffffffffffffffff
+	uint128_t t;
+	int i;
+	t = 0;
+	for (i = 0; i < 4; i++) {
+		uint32_t t_hi;
+		uint128_t m = ((uint128_t)b * a[i]) + r[i];
+		t += m;
+		t_hi = (t < m);
+		r[i] = (uint64_t)t;
+		t = (t >> 64) | ((uint128_t)t_hi << 64);
+	}
+	r[4] += (uint64_t)t;
+	return (r[4] < (uint64_t)t); /* 1 if addition overflowed */
+# else
+	// Unroll, then optimize the above loop:
+	//uint32_t t_hi;
+	//uint128_t m;
+	uint64_t t64, t64u;
+
+	//m = ((uint128_t)b * a[0]) + r[0];
+	// Since b is r[0] and a[0] is ffffffffffffffff, the above optimizes to:
+	// m = r[0] * ffffffffffffffff + r[0] = (r[0] << 64 - r[0]) + r[0] = r[0] << 64;
+	//t += m;
+	// t = r[0] << 64 = b << 64;
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[0] = (uint64_t)t;
+//	r[0] = 0;
+//the store can be eliminated since caller won't look at lower 256 bits of the result
+	//t = (t >> 64) | ((uint128_t)t_hi << 64);
+	// t = b;
+
+	//m = ((uint128_t)b * a[1]) + r[1];
+	// Since a[1] is 00000000ffffffff, the above optimizes to:
+	// m = b * ffffffff + r[1] = (b * 100000000 - b) + r[1] = (b << 32) - b + r[1];
+	//t += m;
+	// t = b + (b << 32) - b + r[1] = (b << 32) + r[1];
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[1] = (uint64_t)t;
+	r[1] += (b << 32);
+	//t = (t >> 64) | ((uint128_t)t_hi << 64);
+	t64 = (r[1] < (b << 32));
+	t64 += (b >> 32);
+
+	//m = ((uint128_t)b * a[2]) + r[2];
+	// Since a[2] is 0000000000000000, the above optimizes to:
+	// m = b * 0 + r[2] = r[2];
+	//t += m;
+	// t = t64 + r[2];
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[2] = (uint64_t)t;
+	r[2] += t64;
+	//t = (t >> 64) | ((uint128_t)t_hi << 64);
+	t64 = (r[2] < t64);
+
+	//m = ((uint128_t)b * a[3]) + r[3];
+	// Since a[3] is ffffffff00000001, the above optimizes to:
+	// m = b * ffffffff00000001 + r[3];
+	// m = b + b*ffffffff00000000 + r[3]
+	// m = b + (b*ffffffff << 32) + r[3]
+	// m = b + (((b<<32) - b) << 32) + r[3]
+	//t += m;
+	// t = t64 + (uint128_t)b + ((((uint128_t)b << 32) - b) << 32) + r[3];
+	t64 += b;
+	t64u = (t64 < b);
+	t64 += r[3];
+	t64u += (t64 < r[3]);
+	{
+		uint64_t lo,hi;
+		//lo = (((b << 32) - b) << 32
+		//hi = (((uint128_t)b << 32) - b) >> 32
+		//but without uint128_t:
+		hi = (b << 32) - b; /* form lower 32 bits of "hi" part 1 */
+		b = (b >> 32) - (/*borrowed above?*/(b << 32) < b); /* upper 32 bits of "hi" are in b */
+		lo = hi << 32; /* (use "hi" value to calculate "lo",... */
+		t64 += lo; /* ...consume... */
+		t64u += (t64 < lo); /* ..."lo") */
+		hi >>= 32; /* form lower 32 bits of "hi" part 2 */
+		hi |= (b << 32); /* combine lower and upper */
+		t64u += hi; /* consume "hi" */
+	}
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[3] = (uint64_t)t;
+	r[3] = t64;
+	//t = (t >> 64) | ((uint128_t)t_hi << 64);
+	// t = t64u;
+
+	r[4] += t64u;
+	return (r[4] < t64u); /* 1 if addition overflowed */
+# endif
+}
+
+static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* aa/*, const sp_digit* m, sp_digit mp*/)
+{
+//	const sp_digit* m = p256_mod;
+	int i;
+	uint64_t *a = (void*)aa;
+
+	sp_digit carry = 0;
+	for (i = 0; i < 4; i++) {
+//		mu = a[i];
+		if (sp_256_mul_add_4(a+i /*, m, mu*/)) {
+			int j = i + 4;
+ inc_next_word:
+			if (++j > 7) { /* a[8] array has no more words? */
+				carry++;
+				continue;
+			}
+			if (++a[j] == 0) /* did this overflow too? */
+				goto inc_next_word;
+		}
+	}
+	sp_512to256_mont_shift_8(r, aa);
+	if (carry != 0)
+		sp_256_sub_8_p256_mod(r);
+	sp_256_norm_8(r);
+}
+
+#else /* Generic 32-bit version */
+
 /* Mul a by scalar b and add into r. (r += a * b)
  * a = p256_mod
  * b = r[0]
  */
 static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 {
-//	const sp_digit* a = p256_mod;
-//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
 	sp_digit b = r[0];
-
 	uint64_t t;
 
-//	t = 0;
-//	for (i = 0; i < 8; i++) {
-//		uint32_t t_hi;
-//		uint64_t m = ((uint64_t)b * a[i]) + r[i];
-//		t += m;
-//		t_hi = (t < m);
-//		r[i] = (sp_digit)t;
-//		t = (t >> 32) | ((uint64_t)t_hi << 32);
-//	}
-//	r[8] += (sp_digit)t;
-
+# if 0
+	const sp_digit* a = p256_mod;
+//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
+	int i;
+	t = 0;
+	for (i = 0; i < 8; i++) {
+		uint32_t t_hi;
+		uint64_t m = ((uint64_t)b * a[i]) + r[i];
+		t += m;
+		t_hi = (t < m);
+		r[i] = (sp_digit)t;
+		t = (t >> 32) | ((uint64_t)t_hi << 32);
+	}
+	r[8] += (sp_digit)t;
+	return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
+# else
 	// Unroll, then optimize the above loop:
 	//uint32_t t_hi;
 	uint64_t m;
@@ -748,7 +886,8 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 	//t_hi = (t < m);
 	// t_hi = 0;
 	//r[0] = (sp_digit)t;
-	r[0] = 0;
+//	r[0] = 0;
+//the store can be eliminated since caller won't look at lower 256 bits of the result
 	//t = (t >> 32) | ((uint64_t)t_hi << 32);
 	// t = b;
 
@@ -840,6 +979,7 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 
 	r[8] += (sp_digit)t;
 	return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
+# endif
 }
 
 /* Reduce the number back to 256 bits using Montgomery reduction.
@@ -861,7 +1001,7 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
  * Then a multiple of modulus is added to make T divisible by B^2.
  * [In our case, it is (p256_mp_mod * a[1]) << 32.]
  * And so on. Eventually T is divisible by R, and after division by R
- * the algorithm is in the same place as the usual Montgomery reduction was.
+ * the algorithm is in the same place as the usual Montgomery reduction.
  *
  * TODO: Can conditionally use 64-bit (if bit-little-endian arch) logic?
  */
@@ -914,6 +1054,7 @@ static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* a/*, const sp_digit
 		sp_256_norm_8(r);
 	}
 }
+#endif
 
 /* Multiply two Montogmery form numbers mod the modulus (prime).
  * (r = a * b mod m)
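
The new 64-bit sp_256_mul_add_4() goes in under #if 0 ("Seems to work"), so one way to gain confidence before enabling it is to run the unrolled "# else" branch against the plain "# if 0" loop on random inputs. A minimal standalone harness along those lines is sketched below; it is not part of the commit, the file and helper names are made up, and it assumes GCC/Clang unsigned __int128 on a 64-bit host.

/* mul_add_4_check.c - hypothetical harness, not part of the commit.
 * Build: cc -O2 mul_add_4_check.c && ./a.out
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef unsigned __int128 uint128_t; /* assumes GCC/Clang on a 64-bit target */

/* p256_mod as 64-bit little-endian words (a[3..0] in the patch comment). */
static const uint64_t p256_mod_64[4] = {
	0xffffffffffffffffULL, 0x00000000ffffffffULL,
	0x0000000000000000ULL, 0xffffffff00000001ULL
};

/* Reference: the plain "# if 0" loop (computes r[0..4] += p256_mod * r[0]). */
static int mul_add_4_ref(uint64_t *r)
{
	uint64_t b = r[0];
	uint128_t t = 0;
	int i;

	for (i = 0; i < 4; i++) {
		uint32_t t_hi;
		uint128_t m = ((uint128_t)b * p256_mod_64[i]) + r[i];
		t += m;
		t_hi = (t < m);
		r[i] = (uint64_t)t;
		t = (t >> 64) | ((uint128_t)t_hi << 64);
	}
	r[4] += (uint64_t)t;
	return (r[4] < (uint64_t)t); /* 1 if addition overflowed */
}

/* The unrolled "# else" branch from the patch (it deliberately skips the r[0] store). */
static int mul_add_4_unrolled(uint64_t *r)
{
	uint64_t b = r[0];
	uint64_t t64, t64u;

	r[1] += (b << 32);
	t64 = (r[1] < (b << 32));
	t64 += (b >> 32);

	r[2] += t64;
	t64 = (r[2] < t64);

	t64 += b;
	t64u = (t64 < b);
	t64 += r[3];
	t64u += (t64 < r[3]);
	{
		uint64_t lo, hi;
		hi = (b << 32) - b;
		b = (b >> 32) - ((b << 32) < b);
		lo = hi << 32;
		t64 += lo;
		t64u += (t64 < lo);
		hi >>= 32;
		hi |= (b << 32);
		t64u += hi;
	}
	r[3] = t64;

	r[4] += t64u;
	return (r[4] < t64u);
}

static uint64_t rnd64(void)
{
	static uint64_t s = 0x9e3779b97f4a7c15ULL; /* fixed seed, xorshift64 */
	s ^= s << 13;
	s ^= s >> 7;
	s ^= s << 17;
	return s;
}

int main(void)
{
	int n, i, c;

	for (n = 0; n < 1000000; n++) {
		uint64_t x[5], y[5];
		for (i = 0; i < 5; i++)
			x[i] = y[i] = rnd64();
		c = mul_add_4_ref(x);
		/* r[0] is "don't care" in the unrolled version: compare r[1..4] and the carry */
		assert(mul_add_4_unrolled(y) == c);
		assert(memcmp(x + 1, y + 1, 4 * sizeof(x[0])) == 0);
	}
	printf("unrolled sp_256_mul_add_4 matches the reference loop\n");
	return 0;
}

Only r[1..4] and the returned carry are compared, since the unrolled branch intentionally leaves r[0] stale while the reference loop writes it; the caller discards the low 256 bits anyway.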