author     Denys Vlasenko <vda.linux@googlemail.com>  2021-11-27 15:47:26 +0100
committer  Denys Vlasenko <vda.linux@googlemail.com>  2021-11-27 15:47:26 +0100
commit     4415f7bc06f1ee382bcbaabd86c3d7aca0b46d93 (patch)
tree       9fb82bbbb4dca3f9ad86ef8f831e54333db4666b
parent     bbda85c74b7a53d8b2bb46f3b44d8f0932a6e95d (diff)
download   busybox-w32-4415f7bc06f1ee382bcbaabd86c3d7aca0b46d93.tar.gz
           busybox-w32-4415f7bc06f1ee382bcbaabd86c3d7aca0b46d93.tar.bz2
           busybox-w32-4415f7bc06f1ee382bcbaabd86c3d7aca0b46d93.zip
tls: P256: explain which functions use double-wide arrays, no code changes
function                                             old     new   delta
sp_512to256_mont_reduce_8                              -     243    +243
sp_256to512z_mont_mul_8                                -     150    +150
sp_256to512z_mont_sqr_8                                -       7      +7
sp_256_mont_sqr_8                                      7       -      -7
sp_256_mont_mul_8                                    150       -    -150
sp_256_mont_reduce_8                                 243       -    -243
------------------------------------------------------------------------------
(add/remove: 3/3 grow/shrink: 0/0 up/down: 400/-400)           Total: 0 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--  networking/tls_sp_c32.c | 211
1 file changed, 58 insertions, 153 deletions
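
Editorial note (not part of the commit): the new "256to512" / "512to256" name prefixes encode the fact that multiplying two 256-bit values held as 8 x 32-bit digits needs a 512-bit (16-digit) destination, which Montgomery reduction then folds back down to 256 bits; the added comments in the diff state that the high half is cleared to zeros afterwards. The standalone sketch below illustrates only the double-wide-destination part; digit_t and mul_256to512 are invented names for illustration, not code from the patch.

/*
 * Minimal sketch, assuming 32-bit digits: a 256x256-bit schoolbook multiply
 * always needs a 16-digit result buffer, hence the patch's comment
 * "r should be [16] array (512 bits)".
 */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t digit_t;   /* plays the role of sp_digit on 32-bit builds */

/* r[16] = a[8] * b[8], with 64-bit partial products and carries */
static void mul_256to512(digit_t r[16], const digit_t a[8], const digit_t b[8])
{
	int i, j;

	for (i = 0; i < 16; i++)
		r[i] = 0;
	for (i = 0; i < 8; i++) {
		uint64_t carry = 0;
		for (j = 0; j < 8; j++) {
			uint64_t t = (uint64_t)a[i] * b[j] + r[i + j] + carry;
			r[i + j] = (digit_t)t;
			carry = t >> 32;
		}
		r[i + 8] = (digit_t)carry;
	}
	/* In the patch, sp_512to256_mont_reduce_8() would now reduce r back to
	 * 8 digits in place; the added comments say the high half ends up zeroed. */
}

int main(void)
{
	digit_t a[8] = {
		0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
		0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
	};
	digit_t r[16];

	mul_256to512(r, a, a);  /* (2^256 - 1)^2 occupies nearly all 512 bits */
	printf("r[15]=%08x r[8]=%08x r[0]=%08x\n", r[15], r[8], r[0]);
	return 0;
}

This is why callers in the diff (e.g. sp_256_mont_inv_8) keep a sp_digit t[2*8] temporary even though only the low 8 digits hold the final value.
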
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 3b0473036..74ded2cda 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -455,8 +455,10 @@ static void sp_256_sub_8_p256_mod(sp_digit* r)
 }
 #endif
 
-/* Multiply a and b into r. (r = a * b) */
-static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
+/* Multiply a and b into r. (r = a * b)
+ * r should be [16] array (512 bits).
+ */
+static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 {
 #if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
 	sp_digit rr[15]; /* in case r coincides with a or b */
@@ -704,9 +706,11 @@ static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit*
 	}
 }
 
-/* Shift the result in the high 256 bits down to the bottom. */
+/* Shift the result in the high 256 bits down to the bottom.
+ * High half is cleared to zeros.
+ */
 #if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
-static void sp_256_mont_shift_8(sp_digit* rr)
+static void sp_512to256_mont_shift_8(sp_digit* rr)
 {
 	uint64_t *r = (void*)rr;
 	int i;
@@ -717,7 +721,7 @@ static void sp_256_mont_shift_8(sp_digit* rr)
 	}
 }
 #else
-static void sp_256_mont_shift_8(sp_digit* r)
+static void sp_512to256_mont_shift_8(sp_digit* r)
 {
 	int i;
 
@@ -728,7 +732,10 @@ static void sp_256_mont_shift_8(sp_digit* r)
728} 732}
729#endif 733#endif
730 734
731/* Mul a by scalar b and add into r. (r += a * b) */ 735/* Mul a by scalar b and add into r. (r += a * b)
736 * a = p256_mod
737 * b = r[0]
738 */
732static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/) 739static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
733{ 740{
734// const sp_digit* a = p256_mod; 741// const sp_digit* a = p256_mod;
@@ -857,11 +864,11 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 
 /* Reduce the number back to 256 bits using Montgomery reduction.
  *
- * a A single precision number to reduce in place.
+ * a Double-wide number to reduce in place.
  * m The single precision number representing the modulus.
  * mp The digit representing the negative inverse of m mod 2^n.
  */
-static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
+static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
 {
 //	const sp_digit* m = p256_mod;
 	sp_digit mp = p256_mp_mod;
@@ -884,7 +891,7 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/
 				goto inc_next_word0;
 			}
 		}
-		sp_256_mont_shift_8(a);
+		sp_512to256_mont_shift_8(a);
 		if (word16th != 0)
 			sp_256_sub_8_p256_mod(a);
 		sp_256_norm_8(a);
@@ -892,7 +899,7 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/
 	else { /* Same code for explicit mp == 1 (which is always the case for P256) */
 		sp_digit word16th = 0;
 		for (i = 0; i < 8; i++) {
-			/*mu = a[i];*/
+//			mu = a[i];
 			if (sp_256_mul_add_8(a+i /*, m, mu*/)) {
 				int j = i + 8;
  inc_next_word:
@@ -904,148 +911,46 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/
 				goto inc_next_word;
 			}
 		}
-		sp_256_mont_shift_8(a);
+		sp_512to256_mont_shift_8(a);
 		if (word16th != 0)
 			sp_256_sub_8_p256_mod(a);
 		sp_256_norm_8(a);
 	}
 }
-#if 0
-//TODO: arm32 asm (also adapt for x86?)
-static void sp_256_mont_reduce_8(sp_digit* a, sp_digit* m, sp_digit mp)
-{
-	sp_digit ca = 0;
-
-	asm volatile (
-	# i = 0
-	mov r12, #0
-	ldr r10, [%[a], #0]
-	ldr r14, [%[a], #4]
-1:
-	# mu = a[i] * mp
-	mul r8, %[mp], r10
-	# a[i+0] += m[0] * mu
-	ldr r7, [%[m], #0]
-	ldr r9, [%[a], #0]
-	umull r6, r7, r8, r7
-	adds r10, r10, r6
-	adc r5, r7, #0
-	# a[i+1] += m[1] * mu
-	ldr r7, [%[m], #4]
-	ldr r9, [%[a], #4]
-	umull r6, r7, r8, r7
-	adds r10, r14, r6
-	adc r4, r7, #0
-	adds r10, r10, r5
-	adc r4, r4, #0
-	# a[i+2] += m[2] * mu
-	ldr r7, [%[m], #8]
-	ldr r14, [%[a], #8]
-	umull r6, r7, r8, r7
-	adds r14, r14, r6
-	adc r5, r7, #0
-	adds r14, r14, r4
-	adc r5, r5, #0
-	# a[i+3] += m[3] * mu
-	ldr r7, [%[m], #12]
-	ldr r9, [%[a], #12]
-	umull r6, r7, r8, r7
-	adds r9, r9, r6
-	adc r4, r7, #0
-	adds r9, r9, r5
-	str r9, [%[a], #12]
-	adc r4, r4, #0
-	# a[i+4] += m[4] * mu
-	ldr r7, [%[m], #16]
-	ldr r9, [%[a], #16]
-	umull r6, r7, r8, r7
-	adds r9, r9, r6
-	adc r5, r7, #0
-	adds r9, r9, r4
-	str r9, [%[a], #16]
-	adc r5, r5, #0
-	# a[i+5] += m[5] * mu
-	ldr r7, [%[m], #20]
-	ldr r9, [%[a], #20]
-	umull r6, r7, r8, r7
-	adds r9, r9, r6
-	adc r4, r7, #0
-	adds r9, r9, r5
-	str r9, [%[a], #20]
-	adc r4, r4, #0
-	# a[i+6] += m[6] * mu
-	ldr r7, [%[m], #24]
-	ldr r9, [%[a], #24]
-	umull r6, r7, r8, r7
-	adds r9, r9, r6
-	adc r5, r7, #0
-	adds r9, r9, r4
-	str r9, [%[a], #24]
-	adc r5, r5, #0
-	# a[i+7] += m[7] * mu
-	ldr r7, [%[m], #28]
-	ldr r9, [%[a], #28]
-	umull r6, r7, r8, r7
-	adds r5, r5, r6
-	adcs r7, r7, %[ca]
-	mov %[ca], #0
-	adc %[ca], %[ca], %[ca]
-	adds r9, r9, r5
-	str r9, [%[a], #28]
-	ldr r9, [%[a], #32]
-	adcs r9, r9, r7
-	str r9, [%[a], #32]
-	adc %[ca], %[ca], #0
-	# i += 1
-	add %[a], %[a], #4
-	add r12, r12, #4
-	cmp r12, #32
-	blt 1b
-
-	str r10, [%[a], #0]
-	str r14, [%[a], #4]
-	: [ca] "+r" (ca), [a] "+r" (a)
-	: [m] "r" (m), [mp] "r" (mp)
-	: "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14"
-	);
-
-	memcpy(a, a + 8, 32);
-	if (ca)
-		a -= m;
-}
-#endif
 
 /* Multiply two Montogmery form numbers mod the modulus (prime).
  * (r = a * b mod m)
  *
  * r Result of multiplication.
+ * Should be [16] array (512 bits), but high half is cleared to zeros (used as scratch pad).
  * a First number to multiply in Montogmery form.
  * b Second number to multiply in Montogmery form.
  * m Modulus (prime).
  * mp Montogmery mulitplier.
  */
-static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
+static void sp_256to512z_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
 		/*, const sp_digit* m, sp_digit mp*/)
 {
 	//const sp_digit* m = p256_mod;
 	//sp_digit mp = p256_mp_mod;
-	sp_256_mul_8(r, a, b);
-	sp_256_mont_reduce_8(r /*, m, mp*/);
+	sp_256to512_mul_8(r, a, b);
+	sp_512to256_mont_reduce_8(r /*, m, mp*/);
 }
 
 /* Square the Montgomery form number. (r = a * a mod m)
  *
  * r Result of squaring.
+ * Should be [16] array (512 bits), but high half is cleared to zeros (used as scratch pad).
  * a Number to square in Montogmery form.
  * m Modulus (prime).
  * mp Montogmery mulitplier.
  */
-static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a
+static void sp_256to512z_mont_sqr_8(sp_digit* r, const sp_digit* a
 		/*, const sp_digit* m, sp_digit mp*/)
 {
 	//const sp_digit* m = p256_mod;
 	//sp_digit mp = p256_mp_mod;
-	sp_256_mont_mul_8(r, a, a /*, m, mp*/);
+	sp_256to512z_mont_mul_8(r, a, a /*, m, mp*/);
 }
 
 /* Invert the number, in Montgomery form, modulo the modulus (prime) of the
@@ -1068,15 +973,15 @@ static const uint32_t p256_mod_2[8] = {
 #endif
 static void sp_256_mont_inv_8(sp_digit* r, sp_digit* a)
 {
-	sp_digit t[2*8]; //can be just [8]?
+	sp_digit t[2*8];
 	int i;
 
 	memcpy(t, a, sizeof(sp_digit) * 8);
 	for (i = 254; i >= 0; i--) {
-		sp_256_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/);
+		sp_256to512z_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/);
 		/*if (p256_mod_2[i / 32] & ((sp_digit)1 << (i % 32)))*/
 		if (i >= 224 || i == 192 || (i <= 95 && i != 1))
-			sp_256_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/);
+			sp_256to512z_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/);
 	}
 	memcpy(r, t, sizeof(sp_digit) * 8);
 }
@@ -1152,22 +1057,22 @@ static void sp_256_map_8(sp_point* r, sp_point* p)
 
 	sp_256_mont_inv_8(t1, p->z);
 
-	sp_256_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);
 
 	/* x /= z^2 */
-	sp_256_mont_mul_8(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/);
 	memset(r->x + 8, 0, sizeof(r->x) / 2);
-	sp_256_mont_reduce_8(r->x /*, p256_mod, p256_mp_mod*/);
+	sp_512to256_mont_reduce_8(r->x /*, p256_mod, p256_mp_mod*/);
 	/* Reduce x to less than modulus */
 	if (sp_256_cmp_8(r->x, p256_mod) >= 0)
 		sp_256_sub_8_p256_mod(r->x);
 	sp_256_norm_8(r->x);
 
 	/* y /= z^3 */
-	sp_256_mont_mul_8(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/);
 	memset(r->y + 8, 0, sizeof(r->y) / 2);
-	sp_256_mont_reduce_8(r->y /*, p256_mod, p256_mp_mod*/);
+	sp_512to256_mont_reduce_8(r->y /*, p256_mod, p256_mp_mod*/);
 	/* Reduce y to less than modulus */
 	if (sp_256_cmp_8(r->y, p256_mod) >= 0)
 		sp_256_sub_8_p256_mod(r->y);
@@ -1202,9 +1107,9 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	}
 
 	/* T1 = Z * Z */
-	sp_256_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
 	/* Z = Y * Z */
-	sp_256_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
 	/* Z = 2Z */
 	sp_256_mont_dbl_8(r->z, r->z /*, p256_mod*/);
 	/* T2 = X - T1 */
@@ -1212,21 +1117,21 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	/* T1 = X + T1 */
 	sp_256_mont_add_8(t1, r->x, t1 /*, p256_mod*/);
 	/* T2 = T1 * T2 */
-	sp_256_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
 	/* T1 = 3T2 */
 	sp_256_mont_tpl_8(t1, t2 /*, p256_mod*/);
 	/* Y = 2Y */
 	sp_256_mont_dbl_8(r->y, r->y /*, p256_mod*/);
 	/* Y = Y * Y */
-	sp_256_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
 	/* T2 = Y * Y */
-	sp_256_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
 	/* T2 = T2/2 */
 	sp_256_div2_8(t2, t2, p256_mod);
 	/* Y = Y * X */
-	sp_256_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
 	/* X = T1 * T1 */
-	sp_256_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
 	/* X = X - Y */
 	sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/);
 	/* X = X - Y */
@@ -1234,7 +1139,7 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	/* Y = Y - X */
 	sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
 	/* Y = Y * T1 */
-	sp_256_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
 	/* Y = Y - T2 */
 	sp_256_mont_sub_8(r->y, r->y, t2 /*, p256_mod*/);
 	dump_512("y2 %s\n", r->y);
@@ -1279,36 +1184,36 @@ static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point*
 	}
 
 	/* U1 = X1*Z2^2 */
-	sp_256_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/);
 	/* U2 = X2*Z1^2 */
-	sp_256_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
 	/* S1 = Y1*Z2^3 */
-	sp_256_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/);
 	/* S2 = Y2*Z1^3 */
-	sp_256_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
 	/* H = U2 - U1 */
 	sp_256_mont_sub_8(t2, t2, t1 /*, p256_mod*/);
 	/* R = S2 - S1 */
 	sp_256_mont_sub_8(t4, t4, t3 /*, p256_mod*/);
 	/* Z3 = H*Z1*Z2 */
-	sp_256_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/);
 	/* X3 = R^2 - H^3 - 2*U1*H^2 */
-	sp_256_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
 	sp_256_mont_sub_8(r->x, r->x, t5 /*, p256_mod*/);
 	sp_256_mont_dbl_8(t1, r->y /*, p256_mod*/);
 	sp_256_mont_sub_8(r->x, r->x, t1 /*, p256_mod*/);
 	/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
 	sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
-	sp_256_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/);
-	sp_256_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/);
+	sp_256to512z_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
 	sp_256_mont_sub_8(r->y, r->y, t5 /*, p256_mod*/);
 }
 