author    Denys Vlasenko <vda.linux@googlemail.com>  2021-11-28 21:40:23 +0100
committer Denys Vlasenko <vda.linux@googlemail.com>  2021-11-28 21:43:51 +0100
commit    8514b4166d7a9d7720006d852ae67f43baed8ef1 (patch)
tree      03fa38db9b168ca3265a8c9bb60fc8fb642c8773
parent    90b0d3304455ad432c49f38e0419ac7820a625f7 (diff)
tls: P256: enable 64-bit version of montgomery reduction
After more testing, (1) I'm more sure it is indeed correct, and
(2) it is a significant speedup - we do a lot of those multiplications.

function                                             old     new   delta
sp_512to256_mont_reduce_8                            191     223     +32

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
 networking/tls_sp_c32.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index b1c410037..cb166e413 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -711,12 +711,13 @@ static void sp_512to256_mont_shift_8(sp_digit* r, sp_digit* a)
 	memcpy(r, a + 8, sizeof(*r) * 8);
 }
 
-// Disabled for now. Seems to work, but ugly and 40 bytes larger on x86-64.
-#if 0 //UNALIGNED_LE_64BIT
+#if UNALIGNED_LE_64BIT
 /* 64-bit little-endian optimized version.
  * See generic 32-bit version below for explanation.
  * The benefit of this version is: even though r[3] calculation is atrocious,
  * we call sp_256_mul_add_4() four times, not 8.
+ * Measured run time improvement of curve_P256_compute_pubkey_and_premaster()
+ * call on x86-64: from ~1500us to ~900us. Code size +32 bytes.
  */
 static int sp_256_mul_add_4(uint64_t *r /*, const uint64_t* a, uint64_t b*/)
 {
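
A note on the "four times, not 8" comment above: each reduction pass consumes one limb of the multiplier, so 64-bit limbs cover the 256-bit low half in 4 passes instead of 8, and on x86-64 each wide product is a single mul instruction. The snippet below is my own illustration, not code from the patch, and it assumes the GCC/Clang "unsigned __int128" extension: it checks that one 64x64->128 multiply equals the sum of the four shifted 32x32->64 partial products that 32-bit limbs would require for the same pair of words.

/* Illustration only (not from busybox): one 64-bit widening multiply
 * does the work of four 32-bit partial products. */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

int main(void)
{
	uint64_t m = 0x8877665544332211ull, b = 0x123456789abcdef0ull;
	uint64_t ml = (uint32_t)m, mh = m >> 32;  /* 32-bit halves of m */
	uint64_t bl = (uint32_t)b, bh = b >> 32;  /* 32-bit halves of b */

	/* four 32x32->64 partial products, positioned and summed in 128 bits */
	unsigned __int128 parts =
		((unsigned __int128)(mh * bh) << 64) +
		((unsigned __int128)(mh * bl) << 32) +
		((unsigned __int128)(ml * bh) << 32) +
		(unsigned __int128)(ml * bl);

	/* one 64x64->128 product (a single mul instruction on x86-64) */
	unsigned __int128 whole = (unsigned __int128)m * b;

	assert(parts == whole);
	printf("64x64 product matches the four 32x32 partial products\n");
	return 0;
}
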
@@ -794,18 +795,18 @@ static int sp_256_mul_add_4(uint64_t *r /*, const uint64_t* a, uint64_t b*/)
 	t64u = (t64 < b);
 	t64 += r[3];
 	t64u += (t64 < r[3]);
-	{
-		uint64_t lo,hi;
+	{ // add ((((uint128_t)b << 32) - b) << 32):
+		uint64_t lo, hi;
 		//lo = (((b << 32) - b) << 32
 		//hi = (((uint128_t)b << 32) - b) >> 32
 		//but without uint128_t:
-		hi = (b << 32) - b; /* form lower 32 bits of "hi" part 1 */
+		hi = (b << 32) - b; /* make lower 32 bits of "hi", part 1 */
 		b = (b >> 32) - (/*borrowed above?*/(b << 32) < b); /* upper 32 bits of "hi" are in b */
 		lo = hi << 32; /* (use "hi" value to calculate "lo",... */
 		t64 += lo;          /* ...consume... */
 		t64u += (t64 < lo); /* ..."lo") */
-		hi >>= 32; /* form lower 32 bits of "hi" part 2 */
-		hi |= (b << 32); /* combine lower and upper */
+		hi >>= 32; /* make lower 32 bits of "hi", part 2 */
+		hi |= (b << 32); /* combine lower and upper 32 bits */
 		t64u += hi; /* consume "hi" */
 	}
 	//t_hi = (t < m);
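
For readers puzzled by the lo/hi gymnastics in the block above: it adds b * 0xFFFFFFFF00000000, i.e. ((((uint128_t)b << 32) - b) << 32), to the t64/t64u pair without using a 128-bit type. The standalone sketch below is my own illustration, not code from the patch, and it assumes the GCC/Clang "unsigned __int128" extension only to cross-check the 64-bit-only computation.

/* Illustration only (not from busybox): split b * 0xFFFFFFFF00000000
 * into 64-bit lo/hi words using nothing wider than uint64_t. */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

static void mul_by_ffffffff00000000(uint64_t b, uint64_t *lo, uint64_t *hi)
{
	/* m = b * (2^32 - 1) as a 128-bit value split into m_lo/m_hi */
	uint64_t m_lo = (b << 32) - b;
	uint64_t m_hi = (b >> 32) - ((b << 32) < b); /* subtract the borrow, if any */
	/* the wanted value is m << 32 */
	*lo = m_lo << 32;
	*hi = (m_lo >> 32) | (m_hi << 32);
}

int main(void)
{
	uint64_t tests[] = { 0, 1, 2, 0xFFFFFFFF, 0x100000000ull,
			     0x123456789abcdef0ull, UINT64_MAX };
	for (unsigned i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
		uint64_t b = tests[i], lo, hi;
		unsigned __int128 ref = (unsigned __int128)b * 0xFFFFFFFF00000000ull;
		mul_by_ffffffff00000000(b, &lo, &hi);
		assert(lo == (uint64_t)ref);
		assert(hi == (uint64_t)(ref >> 64));
	}
	printf("all lo/hi pairs match the 128-bit reference\n");
	return 0;
}

The borrow test (b << 32) < b works because the truncated low word of b*2^32 can only end up smaller than b when the subtraction of b wrapped below zero in the low 64 bits.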