diff options
-rw-r--r-- | networking/tls_sp_c32.c | 15 |
1 files changed, 8 insertions, 7 deletions
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index b1c410037..cb166e413 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -711,12 +711,13 @@ static void sp_512to256_mont_shift_8(sp_digit* r, sp_digit* a) | |||
711 | memcpy(r, a + 8, sizeof(*r) * 8); | 711 | memcpy(r, a + 8, sizeof(*r) * 8); |
712 | } | 712 | } |
713 | 713 | ||
714 | // Disabled for now. Seems to work, but ugly and 40 bytes larger on x86-64. | 714 | #if UNALIGNED_LE_64BIT |
715 | #if 0 //UNALIGNED_LE_64BIT | ||
716 | /* 64-bit little-endian optimized version. | 715 | /* 64-bit little-endian optimized version. |
717 | * See generic 32-bit version below for explanation. | 716 | * See generic 32-bit version below for explanation. |
718 | * The benefit of this version is: even though r[3] calculation is atrocious, | 717 | * The benefit of this version is: even though r[3] calculation is atrocious, |
719 | * we call sp_256_mul_add_4() four times, not 8. | 718 | * we call sp_256_mul_add_4() four times, not 8. |
719 | * Measured run time improvement of curve_P256_compute_pubkey_and_premaster() | ||
720 | * call on x86-64: from ~1500us to ~900us. Code size +32 bytes. | ||
720 | */ | 721 | */ |
721 | static int sp_256_mul_add_4(uint64_t *r /*, const uint64_t* a, uint64_t b*/) | 722 | static int sp_256_mul_add_4(uint64_t *r /*, const uint64_t* a, uint64_t b*/) |
722 | { | 723 | { |
@@ -794,18 +795,18 @@ static int sp_256_mul_add_4(uint64_t *r /*, const uint64_t* a, uint64_t b*/) | |||
794 | t64u = (t64 < b); | 795 | t64u = (t64 < b); |
795 | t64 += r[3]; | 796 | t64 += r[3]; |
796 | t64u += (t64 < r[3]); | 797 | t64u += (t64 < r[3]); |
797 | { | 798 | { // add ((((uint128_t)b << 32) - b) << 32): |
798 | uint64_t lo,hi; | 799 | uint64_t lo, hi; |
799 | //lo = (((b << 32) - b) << 32 | 800 | //lo = (((b << 32) - b) << 32 |
800 | //hi = (((uint128_t)b << 32) - b) >> 32 | 801 | //hi = (((uint128_t)b << 32) - b) >> 32 |
801 | //but without uint128_t: | 802 | //but without uint128_t: |
802 | hi = (b << 32) - b; /* form lower 32 bits of "hi" part 1 */ | 803 | hi = (b << 32) - b; /* make lower 32 bits of "hi", part 1 */ |
803 | b = (b >> 32) - (/*borrowed above?*/(b << 32) < b); /* upper 32 bits of "hi" are in b */ | 804 | b = (b >> 32) - (/*borrowed above?*/(b << 32) < b); /* upper 32 bits of "hi" are in b */ |
804 | lo = hi << 32; /* (use "hi" value to calculate "lo",... */ | 805 | lo = hi << 32; /* (use "hi" value to calculate "lo",... */ |
805 | t64 += lo; /* ...consume... */ | 806 | t64 += lo; /* ...consume... */ |
806 | t64u += (t64 < lo); /* ..."lo") */ | 807 | t64u += (t64 < lo); /* ..."lo") */ |
807 | hi >>= 32; /* form lower 32 bits of "hi" part 2 */ | 808 | hi >>= 32; /* make lower 32 bits of "hi", part 2 */ |
808 | hi |= (b << 32); /* combine lower and upper */ | 809 | hi |= (b << 32); /* combine lower and upper 32 bits */ |
809 | t64u += hi; /* consume "hi" */ | 810 | t64u += hi; /* consume "hi" */ |
810 | } | 811 | } |
811 | //t_hi = (t < m); | 812 | //t_hi = (t < m); |