author     Denys Vlasenko <vda.linux@googlemail.com>  2021-11-27 15:47:26 +0100
committer  Denys Vlasenko <vda.linux@googlemail.com>  2021-11-27 15:47:26 +0100
commit     4415f7bc06f1ee382bcbaabd86c3d7aca0b46d93 (patch)
tree       9fb82bbbb4dca3f9ad86ef8f831e54333db4666b
parent     bbda85c74b7a53d8b2bb46f3b44d8f0932a6e95d (diff)
download   busybox-w32-4415f7bc06f1ee382bcbaabd86c3d7aca0b46d93.tar.gz
           busybox-w32-4415f7bc06f1ee382bcbaabd86c3d7aca0b46d93.tar.bz2
           busybox-w32-4415f7bc06f1ee382bcbaabd86c3d7aca0b46d93.zip
tls: P256: explain which functions use double-wide arrays, no code changes
function                                             old     new   delta
sp_512to256_mont_reduce_8                              -     243    +243
sp_256to512z_mont_mul_8                                -     150    +150
sp_256to512z_mont_sqr_8                                -       7      +7
sp_256_mont_sqr_8                                      7       -      -7
sp_256_mont_mul_8                                    150       -    -150
sp_256_mont_reduce_8                                 243       -    -243
------------------------------------------------------------------------------
(add/remove: 3/3 grow/shrink: 0/0 up/down: 400/-400)           Total: 0 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--  networking/tls_sp_c32.c  211
1 file changed, 58 insertions(+), 153 deletions(-)
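The renaming encodes operand widths in the function names. A sketch of the convention, with array sizes added purely for illustration (the real code passes plain sp_digit pointers):

	typedef uint32_t sp_digit; /* 8 digits = 256 bits, 16 digits = 512 bits */

	/* "256to512": 256-bit inputs, full 512-bit product */
	void sp_256to512_mul_8(sp_digit r[16], const sp_digit a[8], const sp_digit b[8]);

	/* "512to256": consumes a 512-bit value in place, leaves a 256-bit result */
	void sp_512to256_mont_reduce_8(sp_digit a[16]);

	/* "256to512z": needs a [16] result buffer, but the high half is only
	 * scratch and ends up zeroed, so the value itself stays 256-bit */
	void sp_256to512z_mont_mul_8(sp_digit r[16], const sp_digit a[8], const sp_digit b[8]);
	void sp_256to512z_mont_sqr_8(sp_digit r[16], const sp_digit a[8]);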
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 3b0473036..74ded2cda 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -455,8 +455,10 @@ static void sp_256_sub_8_p256_mod(sp_digit* r)
 }
 #endif
 
-/* Multiply a and b into r. (r = a * b) */
-static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
+/* Multiply a and b into r. (r = a * b)
+ * r should be [16] array (512 bits).
+ */
+static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 {
 #if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
 	sp_digit rr[15]; /* in case r coincides with a or b */
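Why r must be a [16] array: an 8-digit by 8-digit schoolbook multiply produces a 16-digit product. A minimal standalone sketch of the generic (non-assembly) path, with illustrative names rather than the busybox implementation:

	#include <stdint.h>
	#include <string.h>

	typedef uint32_t sp_digit;

	static void mul_256to512_sketch(sp_digit r[16],
			const sp_digit a[8], const sp_digit b[8])
	{
		sp_digit rr[16] = { 0 }; /* local result: r may alias a or b */
		int i, j;

		for (i = 0; i < 8; i++) {
			uint64_t acc = 0;
			for (j = 0; j < 8; j++) {
				/* 32x32->64 multiply; the sum cannot overflow:
				 * (2^32-1)^2 + 2*(2^32-1) == 2^64-1 */
				acc += (uint64_t)a[i] * b[j] + rr[i + j];
				rr[i + j] = (sp_digit)acc;
				acc >>= 32;
			}
			rr[i + 8] = (sp_digit)acc; /* carry out of this row */
		}
		memcpy(r, rr, sizeof(rr));
	}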
@@ -704,9 +706,11 @@ static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit*
 	}
 }
 
-/* Shift the result in the high 256 bits down to the bottom. */
+/* Shift the result in the high 256 bits down to the bottom.
+ * High half is cleared to zeros.
+ */
 #if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
-static void sp_256_mont_shift_8(sp_digit* rr)
+static void sp_512to256_mont_shift_8(sp_digit* rr)
 {
 	uint64_t *r = (void*)rr;
 	int i;
@@ -717,7 +721,7 @@ static void sp_256_mont_shift_8(sp_digit* rr)
 	}
 }
 #else
-static void sp_256_mont_shift_8(sp_digit* r)
+static void sp_512to256_mont_shift_8(sp_digit* r)
 {
 	int i;
 
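Both variants above do the same job: after Montgomery reduction the result sits in digits 8..15 of the double-wide array, so it is moved down and the high half is zeroed for reuse as scratch. The generic variant is equivalent to this sketch (hypothetical helper name):

	static void mont_shift_sketch(sp_digit r[16])
	{
		int i;

		for (i = 0; i < 8; i++) {
			r[i] = r[i + 8]; /* move the high 256 bits down */
			r[i + 8] = 0;    /* clear the now-scratch high half */
		}
	}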
@@ -728,7 +732,10 @@ static void sp_256_mont_shift_8(sp_digit* r)
 }
 #endif
 
-/* Mul a by scalar b and add into r. (r += a * b) */
+/* Mul a by scalar b and add into r. (r += a * b)
+ * a = p256_mod
+ * b = r[0]
+ */
 static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 {
 	// const sp_digit* a = p256_mod;
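With the parameters hardcoded as the new comment documents (a = p256_mod, b = r[0]), the function computes r += p256_mod * r[0] across nine digits and reports the final carry. A sketch under those assumptions, not the unrolled busybox code:

	static int mul_add_sketch(sp_digit r[9], const sp_digit mod[8])
	{
		sp_digit b = r[0]; /* mu = r[0] * mp, and mp == 1 for P-256 */
		uint64_t acc = 0;
		int i;

		for (i = 0; i < 8; i++) {
			acc += (uint64_t)mod[i] * b + r[i];
			r[i] = (sp_digit)acc;
			acc >>= 32;
		}
		acc += r[8];
		r[8] = (sp_digit)acc;
		return (int)(acc >> 32); /* carry out, propagated by the caller */
	}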
@@ -857,11 +864,11 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 
 /* Reduce the number back to 256 bits using Montgomery reduction.
  *
- * a   A single precision number to reduce in place.
+ * a   Double-wide number to reduce in place.
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
+static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
 {
 	// const sp_digit* m = p256_mod;
 	sp_digit mp = p256_mp_mod;
@@ -884,7 +891,7 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/
 					goto inc_next_word0;
 			}
 		}
-		sp_256_mont_shift_8(a);
+		sp_512to256_mont_shift_8(a);
 		if (word16th != 0)
 			sp_256_sub_8_p256_mod(a);
 		sp_256_norm_8(a);
@@ -892,7 +899,7 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/
 	else { /* Same code for explicit mp == 1 (which is always the case for P256) */
 		sp_digit word16th = 0;
 		for (i = 0; i < 8; i++) {
-			/*mu = a[i];*/
+			// mu = a[i];
 			if (sp_256_mul_add_8(a+i /*, m, mu*/)) {
 				int j = i + 8;
  inc_next_word:
@@ -904,148 +911,46 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/
 					goto inc_next_word;
 			}
 		}
-		sp_256_mont_shift_8(a);
+		sp_512to256_mont_shift_8(a);
 		if (word16th != 0)
 			sp_256_sub_8_p256_mod(a);
 		sp_256_norm_8(a);
 	}
 }
-#if 0
-//TODO: arm32 asm (also adapt for x86?)
-static void sp_256_mont_reduce_8(sp_digit* a, sp_digit* m, sp_digit mp)
-{
-	sp_digit ca = 0;
-
-	asm volatile (
-	# i = 0
-	mov	r12, #0
-	ldr	r10, [%[a], #0]
-	ldr	r14, [%[a], #4]
-1:
-	# mu = a[i] * mp
-	mul	r8, %[mp], r10
-	# a[i+0] += m[0] * mu
-	ldr	r7, [%[m], #0]
-	ldr	r9, [%[a], #0]
-	umull	r6, r7, r8, r7
-	adds	r10, r10, r6
-	adc	r5, r7, #0
-	# a[i+1] += m[1] * mu
-	ldr	r7, [%[m], #4]
-	ldr	r9, [%[a], #4]
-	umull	r6, r7, r8, r7
-	adds	r10, r14, r6
-	adc	r4, r7, #0
-	adds	r10, r10, r5
-	adc	r4, r4, #0
-	# a[i+2] += m[2] * mu
-	ldr	r7, [%[m], #8]
-	ldr	r14, [%[a], #8]
-	umull	r6, r7, r8, r7
-	adds	r14, r14, r6
-	adc	r5, r7, #0
-	adds	r14, r14, r4
-	adc	r5, r5, #0
-	# a[i+3] += m[3] * mu
-	ldr	r7, [%[m], #12]
-	ldr	r9, [%[a], #12]
-	umull	r6, r7, r8, r7
-	adds	r9, r9, r6
-	adc	r4, r7, #0
-	adds	r9, r9, r5
-	str	r9, [%[a], #12]
-	adc	r4, r4, #0
-	# a[i+4] += m[4] * mu
-	ldr	r7, [%[m], #16]
-	ldr	r9, [%[a], #16]
-	umull	r6, r7, r8, r7
-	adds	r9, r9, r6
-	adc	r5, r7, #0
-	adds	r9, r9, r4
-	str	r9, [%[a], #16]
-	adc	r5, r5, #0
-	# a[i+5] += m[5] * mu
-	ldr	r7, [%[m], #20]
-	ldr	r9, [%[a], #20]
-	umull	r6, r7, r8, r7
-	adds	r9, r9, r6
-	adc	r4, r7, #0
-	adds	r9, r9, r5
-	str	r9, [%[a], #20]
-	adc	r4, r4, #0
-	# a[i+6] += m[6] * mu
-	ldr	r7, [%[m], #24]
-	ldr	r9, [%[a], #24]
-	umull	r6, r7, r8, r7
-	adds	r9, r9, r6
-	adc	r5, r7, #0
-	adds	r9, r9, r4
-	str	r9, [%[a], #24]
-	adc	r5, r5, #0
-	# a[i+7] += m[7] * mu
-	ldr	r7, [%[m], #28]
-	ldr	r9, [%[a], #28]
-	umull	r6, r7, r8, r7
-	adds	r5, r5, r6
-	adcs	r7, r7, %[ca]
-	mov	%[ca], #0
-	adc	%[ca], %[ca], %[ca]
-	adds	r9, r9, r5
-	str	r9, [%[a], #28]
-	ldr	r9, [%[a], #32]
-	adcs	r9, r9, r7
-	str	r9, [%[a], #32]
-	adc	%[ca], %[ca], #0
-	# i += 1
-	add	%[a], %[a], #4
-	add	r12, r12, #4
-	cmp	r12, #32
-	blt	1b
-
-	str	r10, [%[a], #0]
-	str	r14, [%[a], #4]
-	: [ca] "+r" (ca), [a] "+r" (a)
-	: [m] "r" (m), [mp] "r" (mp)
-	: "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14"
-	);
-
-	memcpy(a, a + 8, 32);
-	if (ca)
-		a -= m;
-}
-#endif
 
 /* Multiply two Montogmery form numbers mod the modulus (prime).
  * (r = a * b mod m)
  *
  * r   Result of multiplication.
+ *     Should be [16] array (512 bits), but high half is cleared to zeros (used as scratch pad).
  * a   First number to multiply in Montogmery form.
  * b   Second number to multiply in Montogmery form.
  * m   Modulus (prime).
  * mp  Montogmery mulitplier.
  */
-static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
+static void sp_256to512z_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
 		/*, const sp_digit* m, sp_digit mp*/)
 {
 	//const sp_digit* m = p256_mod;
 	//sp_digit mp = p256_mp_mod;
-	sp_256_mul_8(r, a, b);
-	sp_256_mont_reduce_8(r /*, m, mp*/);
+	sp_256to512_mul_8(r, a, b);
+	sp_512to256_mont_reduce_8(r /*, m, mp*/);
 }
 
 /* Square the Montgomery form number. (r = a * a mod m)
  *
  * r   Result of squaring.
+ *     Should be [16] array (512 bits), but high half is cleared to zeros (used as scratch pad).
  * a   Number to square in Montogmery form.
  * m   Modulus (prime).
  * mp  Montogmery mulitplier.
  */
-static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a
+static void sp_256to512z_mont_sqr_8(sp_digit* r, const sp_digit* a
 		/*, const sp_digit* m, sp_digit mp*/)
 {
 	//const sp_digit* m = p256_mod;
 	//sp_digit mp = p256_mp_mod;
-	sp_256_mont_mul_8(r, a, a /*, m, mp*/);
+	sp_256to512z_mont_mul_8(r, a, a /*, m, mp*/);
 }
 
 /* Invert the number, in Montgomery form, modulo the modulus (prime) of the
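Taken together, sp_512to256_mont_reduce_8 is word-by-word Montgomery reduction simplified by mp == 1: adding p256_mod * a[i] at offset i zeroes digit a[i], because the lowest modulus digit is 0xffffffff and (1 + 0xffffffff) == 0 mod 2^32. A sketch of the overall flow, reusing the helpers sketched earlier (illustrative, not busybox's exact control flow):

	static void mont_reduce_sketch(sp_digit a[16], const sp_digit mod[8])
	{
		sp_digit carry = 0; /* "word16th" in the real code */
		int i;

		for (i = 0; i < 8; i++) {
			/* a[i..i+8] += mod * a[i]; digit a[i] becomes zero */
			if (mul_add_sketch(a + i, mod)) {
				int j = i + 9; /* propagate the carry upward */
				while (j < 16 && ++a[j] == 0)
					j++;
				if (j == 16)
					carry = 1;
			}
		}
		mont_shift_sketch(a); /* result to a[0..7], high half zeroed */
		if (carry) {
			/* one conditional subtraction of the modulus */
			uint64_t borrow = 0;
			for (i = 0; i < 8; i++) {
				borrow = (uint64_t)a[i] - mod[i] - (borrow >> 63);
				a[i] = (sp_digit)borrow;
			}
		}
	}

The renamed sp_256to512z_mont_mul_8 and sp_256to512z_mont_sqr_8 are this reduction composed with the double-wide multiply, which is why their result buffers inherit the [16] requirement.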
@@ -1068,15 +973,15 @@ static const uint32_t p256_mod_2[8] = {
 #endif
 static void sp_256_mont_inv_8(sp_digit* r, sp_digit* a)
 {
-	sp_digit t[2*8]; //can be just [8]?
+	sp_digit t[2*8];
 	int i;
 
 	memcpy(t, a, sizeof(sp_digit) * 8);
 	for (i = 254; i >= 0; i--) {
-		sp_256_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/);
+		sp_256to512z_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/);
 		/*if (p256_mod_2[i / 32] & ((sp_digit)1 << (i % 32)))*/
 		if (i >= 224 || i == 192 || (i <= 95 && i != 1))
-			sp_256_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/);
+			sp_256to512z_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/);
 	}
 	memcpy(r, t, sizeof(sp_digit) * 8);
 }
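The hardcoded condition replaces a lookup in the p256_mod_2 table: it enumerates exactly the set bits of p - 2 for p = 2^256 - 2^224 + 2^192 + 2^96 - 1, the exponent of the Fermat inversion a^(p-2) mod p that this loop computes by square-and-multiply. A standalone check of that claim (illustrative):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* p256_mod - 2, most significant word first */
		static const uint32_t p_minus_2[8] = {
			0xffffffff, 0x00000001, 0x00000000, 0x00000000,
			0x00000000, 0xffffffff, 0xffffffff, 0xfffffffd,
		};
		int i;

		for (i = 255; i >= 0; i--) {
			int bit = (p_minus_2[7 - i / 32] >> (i % 32)) & 1;
			int cond = (i >= 224 || i == 192 || (i <= 95 && i != 1));
			if (bit != cond)
				printf("mismatch at bit %d\n", i);
		}
		printf("checked all 256 bits\n");
		return 0;
	}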
@@ -1152,22 +1057,22 @@ static void sp_256_map_8(sp_point* r, sp_point* p)
 
 	sp_256_mont_inv_8(t1, p->z);
 
-	sp_256_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);
 
 	/* x /= z^2 */
-	sp_256_mont_mul_8(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/);
 	memset(r->x + 8, 0, sizeof(r->x) / 2);
-	sp_256_mont_reduce_8(r->x /*, p256_mod, p256_mp_mod*/);
+	sp_512to256_mont_reduce_8(r->x /*, p256_mod, p256_mp_mod*/);
 	/* Reduce x to less than modulus */
 	if (sp_256_cmp_8(r->x, p256_mod) >= 0)
 		sp_256_sub_8_p256_mod(r->x);
 	sp_256_norm_8(r->x);
 
 	/* y /= z^3 */
-	sp_256_mont_mul_8(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/);
 	memset(r->y + 8, 0, sizeof(r->y) / 2);
-	sp_256_mont_reduce_8(r->y /*, p256_mod, p256_mp_mod*/);
+	sp_512to256_mont_reduce_8(r->y /*, p256_mod, p256_mp_mod*/);
 	/* Reduce y to less than modulus */
 	if (sp_256_cmp_8(r->y, p256_mod) >= 0)
 		sp_256_sub_8_p256_mod(r->y);
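Worth noting in this hunk: the extra sp_512to256_mont_reduce_8 after each multiply is not redundant. Montgomery-reducing a value whose high half is zero multiplies it by R^-1 = 2^-256 mod p, which is precisely how a number is taken back out of Montgomery form; the memset supplies the zero high half the double-wide reduce expects. As a sketch, reusing mont_reduce_sketch from above:

	/* leave Montgomery form: x_norm = x_mont * R^-1 mod p */
	static void from_mont_sketch(sp_digit x[16], const sp_digit mod[8])
	{
		memset(x + 8, 0, 8 * sizeof(sp_digit)); /* high half = 0 */
		mont_reduce_sketch(x, mod);             /* multiply by 2^-256 mod p */
	}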
@@ -1202,9 +1107,9 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	}
 
 	/* T1 = Z * Z */
-	sp_256_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
 	/* Z = Y * Z */
-	sp_256_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
 	/* Z = 2Z */
 	sp_256_mont_dbl_8(r->z, r->z /*, p256_mod*/);
 	/* T2 = X - T1 */
@@ -1212,21 +1117,21 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	/* T1 = X + T1 */
 	sp_256_mont_add_8(t1, r->x, t1 /*, p256_mod*/);
 	/* T2 = T1 * T2 */
-	sp_256_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
 	/* T1 = 3T2 */
 	sp_256_mont_tpl_8(t1, t2 /*, p256_mod*/);
 	/* Y = 2Y */
 	sp_256_mont_dbl_8(r->y, r->y /*, p256_mod*/);
 	/* Y = Y * Y */
-	sp_256_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
 	/* T2 = Y * Y */
-	sp_256_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
 	/* T2 = T2/2 */
 	sp_256_div2_8(t2, t2, p256_mod);
 	/* Y = Y * X */
-	sp_256_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
 	/* X = T1 * T1 */
-	sp_256_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
 	/* X = X - Y */
 	sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/);
 	/* X = X - Y */
@@ -1234,7 +1139,7 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	/* Y = Y - X */
 	sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
 	/* Y = Y * T1 */
-	sp_256_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
 	/* Y = Y - T2 */
 	sp_256_mont_sub_8(r->y, r->y, t2 /*, p256_mod*/);
 	dump_512("y2 %s\n", r->y);
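For reference, the sequence spanning the last three hunks is the standard Jacobian doubling for curves with a = -3 such as P-256, which is why computing 3*(X - Z^2)*(X + Z^2) suffices for the tangent slope. A sketch of the algebra (annotations mine, not text from the commit):

	M  = 3*(X - Z^2)*(X + Z^2)  /* equals 3X^2 + a*Z^4 when a = -3 */
	S  = 4*X*Y^2                /* the code's Y*X after Y has become 4Y^2 */
	Z3 = 2*Y*Z
	X3 = M^2 - 2*S
	Y3 = M*(S - X3) - 8*Y^4     /* 8Y^4 is T2 after the halving */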
@@ -1279,36 +1184,36 @@ static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point*
 	}
 
 	/* U1 = X1*Z2^2 */
-	sp_256_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/);
 	/* U2 = X2*Z1^2 */
-	sp_256_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
 	/* S1 = Y1*Z2^3 */
-	sp_256_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/);
 	/* S2 = Y2*Z1^3 */
-	sp_256_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
 	/* H = U2 - U1 */
 	sp_256_mont_sub_8(t2, t2, t1 /*, p256_mod*/);
 	/* R = S2 - S1 */
 	sp_256_mont_sub_8(t4, t4, t3 /*, p256_mod*/);
 	/* Z3 = H*Z1*Z2 */
-	sp_256_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/);
 	/* X3 = R^2 - H^3 - 2*U1*H^2 */
-	sp_256_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
 	sp_256_mont_sub_8(r->x, r->x, t5 /*, p256_mod*/);
 	sp_256_mont_dbl_8(t1, r->y /*, p256_mod*/);
 	sp_256_mont_sub_8(r->x, r->x, t1 /*, p256_mod*/);
 	/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
 	sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
-	sp_256_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
 	sp_256_mont_sub_8(r->y, r->y, t5 /*, p256_mod*/);
 }
 
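One consequence that runs through all of these hunks: every buffer handed to a 256to512z function, including the point coordinates above, must be double-wide even though only the low 8 digits carry the value. A sketch of the layout this implies for sp_point, inferred from the memset(r->x + 8, ...) calls earlier (an assumption, not code quoted from the commit):

	typedef struct sp_point {
		sp_digit x[2 * 8]; /* low 8 digits: the value; high 8: scratch */
		sp_digit y[2 * 8];
		sp_digit z[2 * 8];
		/* other fields (e.g. an infinity flag) omitted from this sketch */
	} sp_point;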