| author | Denys Vlasenko <vda.linux@googlemail.com> | 2021-11-28 15:38:51 +0100 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2021-11-28 15:44:08 +0100 |
| commit | 90b0d3304455ad432c49f38e0419ac7820a625f7 (patch) | |
| tree | 797bc349ec45f6960adea1cac27916ccfe2b6796 | |
| parent | 832626227ea3798403159080532f763a37273a91 (diff) | |
| download | busybox-w32-90b0d3304455ad432c49f38e0419ac7820a625f7.tar.gz, busybox-w32-90b0d3304455ad432c49f38e0419ac7820a625f7.tar.bz2, busybox-w32-90b0d3304455ad432c49f38e0419ac7820a625f7.zip | |
tls: P256: add 64-bit montgomery reduce (disabled), small optimization in 32-bit code
function                                             old     new   delta
sp_512to256_mont_reduce_8                            191     185      -6
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | networking/tls_sp_c32.c | 177
1 file changed, 159 insertions(+), 18 deletions(-)
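The unrolled code added below relies on the special shape of the four 64-bit words of p256_mod (ffffffff00000001 0000000000000000 00000000ffffffff ffffffffffffffff): each product b * a[i] collapses into a few shifts and subtractions, which is what the step-by-step comments in the patch derive. A minimal standalone check of those identities (not part of the commit; it assumes a compiler with unsigned __int128, e.g. GCC or Clang on a 64-bit target):

```c
/* Standalone check (not from the patch) of the identities used by the
 * unrolled multiply-accumulate: for the P-256 modulus words
 *   a[0] = ffffffffffffffff, a[1] = 00000000ffffffff,
 *   a[2] = 0000000000000000, a[3] = ffffffff00000001
 * the products b*a[i] need only shifts and subtractions.
 */
#include <assert.h>
#include <stdint.h>

typedef unsigned __int128 uint128_t;

int main(void)
{
	uint64_t b = 0x123456789abcdef0;  /* arbitrary test value */

	/* b * a[0]: b * (2^64 - 1) = (b << 64) - b */
	assert((uint128_t)b * 0xffffffffffffffffULL
	       == ((uint128_t)b << 64) - b);

	/* b * a[1]: b * (2^32 - 1) = (b << 32) - b */
	assert((uint128_t)b * 0xffffffffULL
	       == ((uint128_t)b << 32) - b);

	/* b * a[3]: b * (2^64 - 2^32 + 1) = b + (((b << 32) - b) << 32) */
	assert((uint128_t)b * 0xffffffff00000001ULL
	       == b + ((((uint128_t)b << 32) - b) << 32));

	return 0;
}
```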
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index eb6cc2431..b1c410037 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -705,36 +705,174 @@ static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit*
 	}
 }
 
-/* Shift the result in the high 256 bits down to the bottom.
- */
+/* Shift the result in the high 256 bits down to the bottom. */
 static void sp_512to256_mont_shift_8(sp_digit* r, sp_digit* a)
 {
 	memcpy(r, a + 8, sizeof(*r) * 8);
 }
 
+// Disabled for now. Seems to work, but ugly and 40 bytes larger on x86-64.
+#if 0 //UNALIGNED_LE_64BIT
+/* 64-bit little-endian optimized version.
+ * See generic 32-bit version below for explanation.
+ * The benefit of this version is: even though r[3] calculation is atrocious,
+ * we call sp_256_mul_add_4() four times, not 8.
+ */
+static int sp_256_mul_add_4(uint64_t *r /*, const uint64_t* a, uint64_t b*/)
+{
+	uint64_t b = r[0];
+
+# if 0
+	const uint64_t* a = (const void*)p256_mod;
+//a[3..0] = ffffffff00000001 0000000000000000 00000000ffffffff ffffffffffffffff
+	uint128_t t;
+	int i;
+	t = 0;
+	for (i = 0; i < 4; i++) {
+		uint32_t t_hi;
+		uint128_t m = ((uint128_t)b * a[i]) + r[i];
+		t += m;
+		t_hi = (t < m);
+		r[i] = (uint64_t)t;
+		t = (t >> 64) | ((uint128_t)t_hi << 64);
+	}
+	r[4] += (uint64_t)t;
+	return (r[4] < (uint64_t)t); /* 1 if addition overflowed */
+# else
+	// Unroll, then optimize the above loop:
+	//uint32_t t_hi;
+	//uint128_t m;
+	uint64_t t64, t64u;
+
+	//m = ((uint128_t)b * a[0]) + r[0];
+	// Since b is r[0] and a[0] is ffffffffffffffff, the above optimizes to:
+	// m = r[0] * ffffffffffffffff + r[0] = (r[0] << 64 - r[0]) + r[0] = r[0] << 64;
+	//t += m;
+	// t = r[0] << 64 = b << 64;
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[0] = (uint64_t)t;
+	// r[0] = 0;
+	//the store can be eliminated since caller won't look at lower 256 bits of the result
+	//t = (t >> 64) | ((uint128_t)t_hi << 64);
+	// t = b;
+
+	//m = ((uint128_t)b * a[1]) + r[1];
+	// Since a[1] is 00000000ffffffff, the above optimizes to:
+	// m = b * ffffffff + r[1] = (b * 100000000 - b) + r[1] = (b << 32) - b + r[1];
+	//t += m;
+	// t = b + (b << 32) - b + r[1] = (b << 32) + r[1];
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[1] = (uint64_t)t;
+	r[1] += (b << 32);
+	//t = (t >> 64) | ((uint128_t)t_hi << 64);
+	t64 = (r[1] < (b << 32));
+	t64 += (b >> 32);
+
+	//m = ((uint128_t)b * a[2]) + r[2];
+	// Since a[2] is 0000000000000000, the above optimizes to:
+	// m = b * 0 + r[2] = r[2];
+	//t += m;
+	// t = t64 + r[2];
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[2] = (uint64_t)t;
+	r[2] += t64;
+	//t = (t >> 64) | ((uint128_t)t_hi << 64);
+	t64 = (r[2] < t64);
+
+	//m = ((uint128_t)b * a[3]) + r[3];
+	// Since a[3] is ffffffff00000001, the above optimizes to:
+	// m = b * ffffffff00000001 + r[3];
+	// m = b + b*ffffffff00000000 + r[3]
+	// m = b + (b*ffffffff << 32) + r[3]
+	// m = b + (((b<<32) - b) << 32) + r[3]
+	//t += m;
+	// t = t64 + (uint128_t)b + ((((uint128_t)b << 32) - b) << 32) + r[3];
+	t64 += b;
+	t64u = (t64 < b);
+	t64 += r[3];
+	t64u += (t64 < r[3]);
+	{
+		uint64_t lo, hi;
+		//lo = (((b << 32) - b) << 32)
+		//hi = (((uint128_t)b << 32) - b) >> 32
+		//but without uint128_t:
+		hi = (b << 32) - b; /* form lower 32 bits of "hi" part 1 */
+		b = (b >> 32) - (/*borrowed above?*/(b << 32) < b); /* upper 32 bits of "hi" are in b */
+		lo = hi << 32; /* (use "hi" value to calculate "lo",... */
+		t64 += lo; /* ...consume... */
+		t64u += (t64 < lo); /* ..."lo") */
+		hi >>= 32; /* form lower 32 bits of "hi" part 2 */
+		hi |= (b << 32); /* combine lower and upper */
+		t64u += hi; /* consume "hi" */
+	}
+	//t_hi = (t < m);
+	// t_hi = 0;
+	//r[3] = (uint64_t)t;
+	r[3] = t64;
+	//t = (t >> 64) | ((uint128_t)t_hi << 64);
+	// t = t64u;
+
+	r[4] += t64u;
+	return (r[4] < t64u); /* 1 if addition overflowed */
+# endif
+}
+
+static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* aa/*, const sp_digit* m, sp_digit mp*/)
+{
+//	const sp_digit* m = p256_mod;
+	int i;
+	uint64_t *a = (void*)aa;
+
+	sp_digit carry = 0;
+	for (i = 0; i < 4; i++) {
+//		mu = a[i];
+		if (sp_256_mul_add_4(a+i /*, m, mu*/)) {
+			int j = i + 4;
+ inc_next_word:
+			if (++j > 7) { /* a[8] array has no more words? */
+				carry++;
+				continue;
+			}
+			if (++a[j] == 0) /* did this overflow too? */
+				goto inc_next_word;
+		}
+	}
+	sp_512to256_mont_shift_8(r, aa);
+	if (carry != 0)
+		sp_256_sub_8_p256_mod(r);
+	sp_256_norm_8(r);
+}
+
+#else /* Generic 32-bit version */
+
 /* Mul a by scalar b and add into r. (r += a * b)
  * a = p256_mod
  * b = r[0]
  */
 static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 {
-//	const sp_digit* a = p256_mod;
-//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
 	sp_digit b = r[0];
-
 	uint64_t t;
 
-//	t = 0;
-//	for (i = 0; i < 8; i++) {
-//		uint32_t t_hi;
-//		uint64_t m = ((uint64_t)b * a[i]) + r[i];
-//		t += m;
-//		t_hi = (t < m);
-//		r[i] = (sp_digit)t;
-//		t = (t >> 32) | ((uint64_t)t_hi << 32);
-//	}
-//	r[8] += (sp_digit)t;
-
+# if 0
+	const sp_digit* a = p256_mod;
+//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
+	int i;
+	t = 0;
+	for (i = 0; i < 8; i++) {
+		uint32_t t_hi;
+		uint64_t m = ((uint64_t)b * a[i]) + r[i];
+		t += m;
+		t_hi = (t < m);
+		r[i] = (sp_digit)t;
+		t = (t >> 32) | ((uint64_t)t_hi << 32);
+	}
+	r[8] += (sp_digit)t;
+	return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
+# else
 	// Unroll, then optimize the above loop:
 	//uint32_t t_hi;
 	uint64_t m;
@@ -748,7 +886,8 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 	//t_hi = (t < m);
 	// t_hi = 0;
 	//r[0] = (sp_digit)t;
-	r[0] = 0;
+	// r[0] = 0;
+	//the store can be eliminated since caller won't look at lower 256 bits of the result
 	//t = (t >> 32) | ((uint64_t)t_hi << 32);
 	// t = b;
 
@@ -840,6 +979,7 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 
 	r[8] += (sp_digit)t;
 	return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
+# endif
 }
 
 /* Reduce the number back to 256 bits using Montgomery reduction.
@@ -861,7 +1001,7 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
  * Then a multiple of modulus is added to make T divisible by B^2.
  * [In our case, it is (p256_mp_mod * a[1]) << 32.]
  * And so on. Eventually T is divisible by R, and after division by R
- * the algorithm is in the same place as the usual Montgomery reduction was.
+ * the algorithm is in the same place as the usual Montgomery reduction.
  *
  * TODO: Can conditionally use 64-bit (if bit-little-endian arch) logic?
  */
@@ -914,6 +1054,7 @@ static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* a/*, const sp_digit
 		sp_256_norm_8(r);
 	}
 }
+#endif
 
 /* Multiply two Montogmery form numbers mod the modulus (prime).
  * (r = a * b mod m)
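
A note on why the multiplier for each reduction step is simply the number's own low word (b = r[0] in sp_256_mul_add_4/8, and the commented-out "mu = a[i]" in the reduce loop): the lowest 32-bit word of p256_mod is ffffffff, i.e. -1 mod 2^32, so the Montgomery factor is 1 and adding r[0] * p256_mod clears the bottom word, exactly as the Montgomery-reduction comment in the hunk above describes. A tiny standalone check (not from the patch), assuming 32-bit digits:

```c
/* Standalone check (not from the patch): adding r0 * p256_mod to a number
 * whose low 32-bit word is r0 clears that word, because the low word of
 * p256_mod is 0xffffffff == -1 (mod 2^32), so r0*0xffffffff + r0 == r0 << 32. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t r0 = 0xdeadbeef;                       /* arbitrary low word */
	uint64_t sum = (uint64_t)r0 * 0xffffffffu + r0; /* low word + r0 * (low word of modulus) */

	assert((uint32_t)sum == 0);   /* low word becomes zero...          */
	assert((sum >> 32) == r0);    /* ...and the carry out is exactly r0 */
	return 0;
}
```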
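The goto in the new (disabled) sp_512to256_mont_reduce_8 only ripples a +1 carry through the remaining high words of a[]. The same logic written as a plain loop, as an illustrative standalone sketch (the helper name and signature are made up here, not taken from the patch):

```c
#include <stdint.h>

/* Illustrative rewrite (not from the patch) of the "inc_next_word" carry
 * ripple in the disabled 64-bit sp_512to256_mont_reduce_8: after the
 * multiply-accumulate at position i overflows out of a[i+4], add 1 to the
 * next higher word, and keep going while the increment itself wraps to 0.
 * A carry that falls off the end of a[0..7] is counted in *top. */
void ripple_carry(uint64_t a[8], int i, uint32_t *top)
{
	int j = i + 4;
	for (;;) {
		j++;
		if (j > 7) {          /* no more words in a[] */
			(*top)++;
			return;
		}
		if (++a[j] != 0)      /* increment did not wrap: carry absorbed */
			return;
	}
}

int main(void)
{
	uint64_t a[8] = { 0, 0, 0, 0, 0, ~0ull, 5, 7 };
	uint32_t top = 0;

	/* pretend the multiply-accumulate at i = 0 overflowed out of a[4] */
	ripple_carry(a, 0, &top);
	/* a[5] wrapped to 0 and the carry was absorbed by a[6] */
	return !(a[5] == 0 && a[6] == 6 && top == 0);
}
```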