 networking/tls_sp_c32.c | 178 ++++++++++++++++----------------------
 1 file changed, 72 insertions(+), 106 deletions(-)
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 3291b553c..3452b08b9 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -49,9 +49,9 @@ typedef int32_t signed_sp_digit;
  */
 
 typedef struct sp_point {
-	sp_digit x[2 * 8];
-	sp_digit y[2 * 8];
-	sp_digit z[2 * 8];
+	sp_digit x[8];
+	sp_digit y[8];
+	sp_digit z[8];
 	int infinity;
 } sp_point;
 
@@ -456,12 +456,11 @@ static void sp_256_sub_8_p256_mod(sp_digit* r)
 #endif
 
 /* Multiply a and b into r. (r = a * b)
- * r should be [16] array (512 bits).
+ * r should be [16] array (512 bits), and must not coincide with a or b.
  */
 static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 {
 #if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
-	sp_digit rr[15]; /* in case r coincides with a or b */
 	int k;
 	uint32_t accl;
 	uint32_t acch;
@@ -493,16 +492,15 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 			j--;
 			i++;
 		} while (i != 8 && i <= k);
-		rr[k] = accl;
+		r[k] = accl;
 		accl = acch;
 		acch = acc_hi;
 	}
 	r[15] = accl;
-	memcpy(r, rr, sizeof(rr));
 #elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
 	const uint64_t* aa = (const void*)a;
 	const uint64_t* bb = (const void*)b;
-	uint64_t rr[8];
+	uint64_t* rr = (void*)r;
 	int k;
 	uint64_t accl;
 	uint64_t acch;
@@ -539,11 +537,8 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 		acch = acc_hi;
 	}
 	rr[7] = accl;
-	memcpy(r, rr, sizeof(rr));
 #elif 0
 	//TODO: arm assembly (untested)
-	sp_digit tmp[16];
-
 	asm volatile (
 "\n		mov	r5, #0"
 "\n		mov	r6, #0"
@@ -575,12 +570,10 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 "\n		cmp	r5, #56"
 "\n		ble	1b"
 "\n		str	r6, [%[r], r5]"
-		: [r] "r" (tmp), [a] "r" (a), [b] "r" (b)
+		: [r] "r" (r), [a] "r" (a), [b] "r" (b)
 		: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14"
 	);
-	memcpy(r, tmp, sizeof(tmp));
 #else
-	sp_digit rr[15]; /* in case r coincides with a or b */
 	int i, j, k;
 	uint64_t acc;
 
@@ -600,11 +593,10 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 			j--;
 			i++;
 		} while (i != 8 && i <= k);
-		rr[k] = acc;
+		r[k] = acc;
 		acc = (acc >> 32) | ((uint64_t)acc_hi << 32);
 	}
 	r[15] = acc;
-	memcpy(r, rr, sizeof(rr));
 #endif
 }
 
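The new "must not coincide with a or b" precondition exists because, with the copy-out removed, each finished result word is stored straight into r[] while a[] and b[] are still being read. A minimal portable sketch of the same idea (illustrative only, not the busybox code; the function name is made up):

	#include <stdint.h>
	#include <string.h>

	/* Schoolbook multiply of two 8x32-bit-limb numbers into a 16-limb result.
	 * Writing r[] while a[]/b[] are still being read is only safe because the
	 * caller guarantees r does not overlap a or b. */
	static void mul_8x8_sketch(uint32_t r[16], const uint32_t a[8], const uint32_t b[8])
	{
		int i, j;

		memset(r, 0, 16 * sizeof(r[0]));
		for (i = 0; i < 8; i++) {
			uint64_t carry = 0;
			for (j = 0; j < 8; j++) {
				uint64_t t = (uint64_t)a[i] * b[j] + r[i + j] + carry;
				r[i + j] = (uint32_t)t;
				carry = t >> 32;
			}
			r[i + 8] = (uint32_t)carry; /* top word of this partial row */
		}
	}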
@@ -709,30 +701,11 @@ static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit*
 }
 
 /* Shift the result in the high 256 bits down to the bottom.
- * High half is cleared to zeros.
  */
-#if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
-static void sp_512to256_mont_shift_8(sp_digit* rr)
+static void sp_512to256_mont_shift_8(sp_digit* r, sp_digit* a)
 {
-	uint64_t *r = (void*)rr;
-	int i;
-
-	for (i = 0; i < 4; i++) {
-		r[i] = r[i+4];
-		r[i+4] = 0;
-	}
+	memcpy(r, a + 8, sizeof(*r) * 8);
 }
-#else
-static void sp_512to256_mont_shift_8(sp_digit* r)
-{
-	int i;
-
-	for (i = 0; i < 8; i++) {
-		r[i] = r[i+8];
-		r[i+8] = 0;
-	}
-}
-#endif
 
 /* Mul a by scalar b and add into r. (r += a * b)
  * a = p256_mod
@@ -868,11 +841,12 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
  * Note: the result is NOT guaranteed to be less than p256_mod!
  * (it is only guaranteed to fit into 256 bits).
  *
- * a   Double-wide number to reduce in place.
+ * r   Result.
+ * a   Double-wide number to reduce. Clobbered.
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
+static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
 {
 //	const sp_digit* m = p256_mod;
 	sp_digit mp = p256_mp_mod;
@@ -895,10 +869,10 @@ static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit
 					goto inc_next_word0;
 			}
 		}
-		sp_512to256_mont_shift_8(a);
+		sp_512to256_mont_shift_8(r, a);
 		if (word16th != 0)
-			sp_256_sub_8_p256_mod(a);
-		sp_256_norm_8(a);
+			sp_256_sub_8_p256_mod(r);
+		sp_256_norm_8(r);
 	}
 	else { /* Same code for explicit mp == 1 (which is always the case for P256) */
 		sp_digit word16th = 0;
@@ -915,10 +889,10 @@ static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit
 					goto inc_next_word;
 			}
 		}
-		sp_512to256_mont_shift_8(a);
+		sp_512to256_mont_shift_8(r, a);
 		if (word16th != 0)
-			sp_256_sub_8_p256_mod(a);
-		sp_256_norm_8(a);
+			sp_256_sub_8_p256_mod(r);
+		sp_256_norm_8(r);
 	}
 }
 
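A side note on the "explicit mp == 1" fast path kept above: the lowest 32-bit word of p256_mod is 0xffffffff, i.e. m is congruent to -1 mod 2^32, so the negative inverse mp = -m^(-1) mod 2^32 is exactly 1. A small standalone check of that claim (sketch, not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t m0 = 0xffffffff; /* lowest limb of p256_mod */
		uint32_t inv = 1;
		int i;

		/* Newton iteration for the inverse of an odd m0 modulo 2^32 */
		for (i = 0; i < 5; i++)
			inv *= 2 - m0 * inv;
		printf("mp = %u\n", 0u - inv); /* prints 1 */
		return 0;
	}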
@@ -926,35 +900,34 @@ static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit
  * (r = a * b mod m)
  *
  * r   Result of multiplication.
- *     Should be [16] array (512 bits), but high half is cleared to zeros (used as scratch pad).
  * a   First number to multiply in Montogmery form.
  * b   Second number to multiply in Montogmery form.
  * m   Modulus (prime).
  * mp  Montogmery mulitplier.
  */
-static void sp_256to512z_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
+static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
 		/*, const sp_digit* m, sp_digit mp*/)
 {
 	//const sp_digit* m = p256_mod;
 	//sp_digit mp = p256_mp_mod;
-	sp_256to512_mul_8(r, a, b);
-	sp_512to256_mont_reduce_8(r /*, m, mp*/);
+	sp_digit t[2 * 8];
+	sp_256to512_mul_8(t, a, b);
+	sp_512to256_mont_reduce_8(r, t /*, m, mp*/);
 }
 
 /* Square the Montgomery form number. (r = a * a mod m)
  *
  * r   Result of squaring.
- *     Should be [16] array (512 bits), but high half is cleared to zeros (used as scratch pad).
  * a   Number to square in Montogmery form.
  * m   Modulus (prime).
  * mp  Montogmery mulitplier.
  */
-static void sp_256to512z_mont_sqr_8(sp_digit* r, const sp_digit* a
+static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a
 		/*, const sp_digit* m, sp_digit mp*/)
 {
 	//const sp_digit* m = p256_mod;
 	//sp_digit mp = p256_mp_mod;
-	sp_256to512z_mont_mul_8(r, a, a /*, m, mp*/);
+	sp_256_mont_mul_8(r, a, a /*, m, mp*/);
 }
 
 /* Invert the number, in Montgomery form, modulo the modulus (prime) of the
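With this change the double-wide scratch buffer becomes an internal detail of the multiplier, so callers only ever deal with 256-bit operands. Roughly, the new calling convention looks like this (sketch only; with R = 2^256, a Montgomery multiply returns a*b*R^-1 mod p256_mod):

	sp_digit x[8], y[8], xy[8]; /* 256-bit operands, already in Montgomery form */

	sp_256_mont_mul_8(xy, x, y /*, p256_mod, p256_mp_mod*/); /* xy = x*y*R^-1 mod p */
	sp_256_mont_sqr_8(xy, xy /*, p256_mod, p256_mp_mod*/);   /* xy = xy^2*R^-1 mod p */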
@@ -964,11 +937,8 @@ static void sp_256to512z_mont_sqr_8(sp_digit* r, const sp_digit* a
  * a  Number to invert.
  */
 #if 0
-/* Mod-2 for the P256 curve. */
-static const uint32_t p256_mod_2[8] = {
-	0xfffffffd,0xffffffff,0xffffffff,0x00000000,
-	0x00000000,0x00000000,0x00000001,0xffffffff,
-};
+//p256_mod - 2:
+//ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff - 2
 //Bit pattern:
 //2 2 2 2 2 2 2 1...1
 //5 5 4 3 2 1 0 9...0 9...1
@@ -977,15 +947,15 @@ static const uint32_t p256_mod_2[8] = {
 #endif
 static void sp_256_mont_inv_8(sp_digit* r, sp_digit* a)
 {
-	sp_digit t[2*8];
+	sp_digit t[8];
 	int i;
 
 	memcpy(t, a, sizeof(sp_digit) * 8);
 	for (i = 254; i >= 0; i--) {
-		sp_256to512z_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/);
+		sp_256_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/);
 		/*if (p256_mod_2[i / 32] & ((sp_digit)1 << (i % 32)))*/
 		if (i >= 224 || i == 192 || (i <= 95 && i != 1))
-			sp_256to512z_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/);
+			sp_256_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/);
 	}
 	memcpy(r, t, sizeof(sp_digit) * 8);
 }
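The hard-coded condition in the loop above is just the bit pattern of p256_mod - 2 (Fermat inversion computes a^(p-2)). A quick standalone cross-check against the table that this patch removes (sketch, not part of the file):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		/* p256_mod - 2 as eight little-endian 32-bit words (the removed p256_mod_2[] table) */
		static const uint32_t p256_mod_2[8] = {
			0xfffffffd,0xffffffff,0xffffffff,0x00000000,
			0x00000000,0x00000000,0x00000001,0xffffffff,
		};
		int i;

		for (i = 254; i >= 0; i--) {
			int bit = (p256_mod_2[i / 32] >> (i % 32)) & 1;
			int cond = (i >= 224 || i == 192 || (i <= 95 && i != 1));
			assert(bit == cond);
		}
		return 0;
	}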
@@ -1056,25 +1026,28 @@ static void sp_256_mod_mul_norm_8(sp_digit* r, const sp_digit* a)
  */
 static void sp_256_map_8(sp_point* r, sp_point* p)
 {
-	sp_digit t1[2*8];
-	sp_digit t2[2*8];
+	sp_digit t1[8];
+	sp_digit t2[8];
+	sp_digit rr[2 * 8];
 
 	sp_256_mont_inv_8(t1, p->z);
 
-	sp_256to512z_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);
 
 	/* x /= z^2 */
-	sp_256to512z_mont_mul_8(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/);
-	sp_512to256_mont_reduce_8(r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(rr, p->x, t2 /*, p256_mod, p256_mp_mod*/);
+	memset(rr + 8, 0, sizeof(rr) / 2);
+	sp_512to256_mont_reduce_8(r->x, rr /*, p256_mod, p256_mp_mod*/);
 	/* Reduce x to less than modulus */
 	if (sp_256_cmp_8(r->x, p256_mod) >= 0)
 		sp_256_sub_8_p256_mod(r->x);
 	sp_256_norm_8(r->x);
 
 	/* y /= z^3 */
-	sp_256to512z_mont_mul_8(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/);
-	sp_512to256_mont_reduce_8(r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(rr, p->y, t1 /*, p256_mod, p256_mp_mod*/);
+	memset(rr + 8, 0, sizeof(rr) / 2);
+	sp_512to256_mont_reduce_8(r->y, rr /*, p256_mod, p256_mp_mod*/);
 	/* Reduce y to less than modulus */
 	if (sp_256_cmp_8(r->y, p256_mod) >= 0)
 		sp_256_sub_8_p256_mod(r->y);
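The added memset may look odd at first glance; it works because sp_512to256_mont_reduce_8() maps a 512-bit input t to t*R^-1 mod p256_mod (R = 2^256), so feeding it a Montgomery value x*R zero-extended to 512 bits returns plain x. A sketch of that idiom in isolation (illustrative names, using the patched helpers):

	sp_digit mont_val[8]; /* x*R mod p, i.e. a value in Montgomery form */
	sp_digit plain[8];
	sp_digit t[2 * 8];

	memcpy(t, mont_val, sizeof(mont_val));
	memset(t + 8, 0, sizeof(mont_val)); /* zero-extend to 512 bits */
	sp_512to256_mont_reduce_8(plain, t /*, p256_mod, p256_mp_mod*/);
	/* plain == x, except the reducer only guarantees a 256-bit result,
	 * so callers still normalize with sp_256_cmp_8/sp_256_sub_8_p256_mod
	 * exactly as sp_256_map_8() does above */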
@@ -1091,8 +1064,8 @@ static void sp_256_map_8(sp_point* r, sp_point* p)
  */
 static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 {
-	sp_digit t1[2*8];
-	sp_digit t2[2*8];
+	sp_digit t1[8];
+	sp_digit t2[8];
 
 	/* Put point to double into result */
 	if (r != p)
@@ -1101,17 +1074,10 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	if (r->infinity)
 		return;
 
-	if (SP_DEBUG) {
-		/* unused part of t2, may result in spurios
-		 * differences in debug output. Clear it.
-		 */
-		memset(t2, 0, sizeof(t2));
-	}
-
 	/* T1 = Z * Z */
-	sp_256to512z_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
 	/* Z = Y * Z */
-	sp_256to512z_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
 	/* Z = 2Z */
 	sp_256_mont_dbl_8(r->z, r->z /*, p256_mod*/);
 	/* T2 = X - T1 */
@@ -1119,21 +1085,21 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	/* T1 = X + T1 */
 	sp_256_mont_add_8(t1, r->x, t1 /*, p256_mod*/);
 	/* T2 = T1 * T2 */
-	sp_256to512z_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
 	/* T1 = 3T2 */
 	sp_256_mont_tpl_8(t1, t2 /*, p256_mod*/);
 	/* Y = 2Y */
 	sp_256_mont_dbl_8(r->y, r->y /*, p256_mod*/);
 	/* Y = Y * Y */
-	sp_256to512z_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
 	/* T2 = Y * Y */
-	sp_256to512z_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
 	/* T2 = T2/2 */
 	sp_256_div2_8(t2 /*, p256_mod*/);
 	/* Y = Y * X */
-	sp_256to512z_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
 	/* X = T1 * T1 */
-	sp_256to512z_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
 	/* X = X - Y */
 	sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/);
 	/* X = X - Y */
@@ -1141,7 +1107,7 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	/* Y = Y - X */
 	sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
 	/* Y = Y * T1 */
-	sp_256to512z_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
 	/* Y = Y - T2 */
 	sp_256_mont_sub_8(r->y, r->y, t2 /*, p256_mod*/);
 	dump_512("y2 %s\n", r->y);
@@ -1155,11 +1121,11 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
  */
 static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point* q)
 {
-	sp_digit t1[2*8];
-	sp_digit t2[2*8];
-	sp_digit t3[2*8];
-	sp_digit t4[2*8];
-	sp_digit t5[2*8];
+	sp_digit t1[8];
+	sp_digit t2[8];
+	sp_digit t3[8];
+	sp_digit t4[8];
+	sp_digit t5[8];
 
 	/* Ensure only the first point is the same as the result. */
 	if (q == r) {
@@ -1186,36 +1152,36 @@ static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point*
 	}
 
 	/* U1 = X1*Z2^2 */
-	sp_256to512z_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/);
 	/* U2 = X2*Z1^2 */
-	sp_256to512z_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
 	/* S1 = Y1*Z2^3 */
-	sp_256to512z_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/);
 	/* S2 = Y2*Z1^3 */
-	sp_256to512z_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
 	/* H = U2 - U1 */
 	sp_256_mont_sub_8(t2, t2, t1 /*, p256_mod*/);
 	/* R = S2 - S1 */
 	sp_256_mont_sub_8(t4, t4, t3 /*, p256_mod*/);
 	/* Z3 = H*Z1*Z2 */
-	sp_256to512z_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/);
 	/* X3 = R^2 - H^3 - 2*U1*H^2 */
-	sp_256to512z_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
 	sp_256_mont_sub_8(r->x, r->x, t5 /*, p256_mod*/);
 	sp_256_mont_dbl_8(t1, r->y /*, p256_mod*/);
 	sp_256_mont_sub_8(r->x, r->x, t1 /*, p256_mod*/);
 	/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
 	sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
-	sp_256to512z_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
 	sp_256_mont_sub_8(r->y, r->y, t5 /*, p256_mod*/);
 }
 