author		Denys Vlasenko <vda.linux@googlemail.com>	2021-11-27 19:15:43 +0100
committer	Denys Vlasenko <vda.linux@googlemail.com>	2021-11-27 19:27:03 +0100
commit		f92ae1dc4bc00e352e683b826609efa5e1e22708 (patch)
tree		3fa47fb70a6d8d3ea8194920be94301e48e139c2
parent		9c671fe3dd2e46a28c02d266130f56a1a6296791 (diff)
download	busybox-w32-f92ae1dc4bc00e352e683b826609efa5e1e22708.tar.gz
		busybox-w32-f92ae1dc4bc00e352e683b826609efa5e1e22708.tar.bz2
		busybox-w32-f92ae1dc4bc00e352e683b826609efa5e1e22708.zip
tls: P256: change logic so that we don't need double-wide vectors everywhere

Change sp_256to512z_mont_{mul,sqr}_8 to not require/zero the upper 256 bits.
There is only one place where we actually used that (and that's why there
used to be a zeroing memset of the top half!). Fix up that place.

As a bonus, the 256x256->512 multiply no longer needs to care about the
"r overlaps a or b" case. This shrinks the sp_point structure as well,
not just the temporaries.

function                                             old     new   delta
sp_256to512z_mont_mul_8                              150       -    -150
sp_256_mont_mul_8                                      -     147    +147
sp_256to512z_mont_sqr_8                                7       -      -7
sp_256_mont_sqr_8                                      -       7      +7
sp_256_ecc_mulmod_8                                  494     543     +49
sp_512to256_mont_reduce_8                            243     249      +6
sp_256_point_from_bin2x32                             73      70      -3
sp_256_proj_point_dbl_8                              353     345      -8
sp_256_proj_point_add_8                              544     499     -45
------------------------------------------------------------------------------
(add/remove: 2/2 grow/shrink: 2/3 up/down: 209/-213)           Total: -4 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--	networking/tls_sp_c32.c	178
1 file changed, 72 insertions(+), 106 deletions(-)
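In outline: the old sp_256to512z_* helpers required the result to be a [16]
(512-bit) array, used its high half as scratch, and zeroed it on return. The
replacements keep the 512-bit intermediate in a local temporary and return a
plain 256-bit result, which also lets r alias a or b. A minimal sketch of the
new multiply, lifted from the patch below (the commented-out m/mp parameters
elided; comments added here):

	/* Montgomery multiply: r = a * b * R^-1 mod p256_mod, where R = 2^256 */
	static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
	{
		sp_digit t[2 * 8];               /* 512-bit scratch, formerly the high half of r */
		sp_256to512_mul_8(t, a, b);      /* t = a * b (full 512-bit product) */
		sp_512to256_mont_reduce_8(r, t); /* r = t * R^-1 mod p (fits in 256 bits) */
	}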
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 3291b553c..3452b08b9 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -49,9 +49,9 @@ typedef int32_t signed_sp_digit;
  */
 
 typedef struct sp_point {
-	sp_digit x[2 * 8];
-	sp_digit y[2 * 8];
-	sp_digit z[2 * 8];
+	sp_digit x[8];
+	sp_digit y[8];
+	sp_digit z[8];
 	int infinity;
 } sp_point;
 
@@ -456,12 +456,11 @@ static void sp_256_sub_8_p256_mod(sp_digit* r)
 #endif
 
 /* Multiply a and b into r. (r = a * b)
- * r should be [16] array (512 bits).
+ * r should be [16] array (512 bits), and must not coincide with a or b.
  */
 static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 {
 #if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
-	sp_digit rr[15]; /* in case r coincides with a or b */
 	int k;
 	uint32_t accl;
 	uint32_t acch;
@@ -493,16 +492,15 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 			j--;
 			i++;
 		} while (i != 8 && i <= k);
-		rr[k] = accl;
+		r[k] = accl;
 		accl = acch;
 		acch = acc_hi;
 	}
 	r[15] = accl;
-	memcpy(r, rr, sizeof(rr));
 #elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
 	const uint64_t* aa = (const void*)a;
 	const uint64_t* bb = (const void*)b;
-	uint64_t rr[8];
+	const uint64_t* rr = (const void*)r;
 	int k;
 	uint64_t accl;
 	uint64_t acch;
@@ -539,11 +537,8 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 		acch = acc_hi;
 	}
 	rr[7] = accl;
-	memcpy(r, rr, sizeof(rr));
 #elif 0
 	//TODO: arm assembly (untested)
-	sp_digit tmp[16];
-
 	asm volatile (
 "\n		mov	r5, #0"
 "\n		mov	r6, #0"
@@ -575,12 +570,10 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 "\n		cmp	r5, #56"
 "\n		ble	1b"
 "\n		str	r6, [%[r], r5]"
-		: [r] "r" (tmp), [a] "r" (a), [b] "r" (b)
+		: [r] "r" (r), [a] "r" (a), [b] "r" (b)
 		: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14"
 	);
-	memcpy(r, tmp, sizeof(tmp));
 #else
-	sp_digit rr[15]; /* in case r coincides with a or b */
 	int i, j, k;
 	uint64_t acc;
 
@@ -600,11 +593,10 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 			j--;
 			i++;
 		} while (i != 8 && i <= k);
-		rr[k] = acc;
+		r[k] = acc;
 		acc = (acc >> 32) | ((uint64_t)acc_hi << 32);
 	}
 	r[15] = acc;
-	memcpy(r, rr, sizeof(rr));
 #endif
 }
 
@@ -709,30 +701,11 @@ static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit*
 }
 
 /* Shift the result in the high 256 bits down to the bottom.
- * High half is cleared to zeros.
  */
-#if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
-static void sp_512to256_mont_shift_8(sp_digit* rr)
+static void sp_512to256_mont_shift_8(sp_digit* r, sp_digit* a)
 {
-	uint64_t *r = (void*)rr;
-	int i;
-
-	for (i = 0; i < 4; i++) {
-		r[i] = r[i+4];
-		r[i+4] = 0;
-	}
+	memcpy(r, a + 8, sizeof(*r) * 8);
 }
-#else
-static void sp_512to256_mont_shift_8(sp_digit* r)
-{
-	int i;
-
-	for (i = 0; i < 8; i++) {
-		r[i] = r[i+8];
-		r[i+8] = 0;
-	}
-}
-#endif
 
 /* Mul a by scalar b and add into r. (r += a * b)
  * a = p256_mod
@@ -868,11 +841,12 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
  * Note: the result is NOT guaranteed to be less than p256_mod!
  * (it is only guaranteed to fit into 256 bits).
  *
- * a   Double-wide number to reduce in place.
+ * r   Result.
+ * a   Double-wide number to reduce. Clobbered.
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
+static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
 {
 //	const sp_digit* m = p256_mod;
 	sp_digit mp = p256_mp_mod;
@@ -895,10 +869,10 @@ static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit
 					goto inc_next_word0;
 			}
 		}
-		sp_512to256_mont_shift_8(a);
+		sp_512to256_mont_shift_8(r, a);
 		if (word16th != 0)
-			sp_256_sub_8_p256_mod(a);
-		sp_256_norm_8(a);
+			sp_256_sub_8_p256_mod(r);
+		sp_256_norm_8(r);
 	}
 	else { /* Same code for explicit mp == 1 (which is always the case for P256) */
 		sp_digit word16th = 0;
@@ -915,10 +889,10 @@ static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit
 					goto inc_next_word;
 			}
 		}
-		sp_512to256_mont_shift_8(a);
+		sp_512to256_mont_shift_8(r, a);
 		if (word16th != 0)
-			sp_256_sub_8_p256_mod(a);
-		sp_256_norm_8(a);
+			sp_256_sub_8_p256_mod(r);
+		sp_256_norm_8(r);
 	}
 }
 
@@ -926,35 +900,34 @@ static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit
  * (r = a * b mod m)
  *
  * r   Result of multiplication.
- *     Should be [16] array (512 bits), but high half is cleared to zeros (used as scratch pad).
  * a   First number to multiply in Montogmery form.
  * b   Second number to multiply in Montogmery form.
  * m   Modulus (prime).
  * mp  Montogmery mulitplier.
  */
-static void sp_256to512z_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
+static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
 		/*, const sp_digit* m, sp_digit mp*/)
 {
 	//const sp_digit* m = p256_mod;
 	//sp_digit mp = p256_mp_mod;
-	sp_256to512_mul_8(r, a, b);
-	sp_512to256_mont_reduce_8(r /*, m, mp*/);
+	sp_digit t[2 * 8];
+	sp_256to512_mul_8(t, a, b);
+	sp_512to256_mont_reduce_8(r, t /*, m, mp*/);
 }
 
 /* Square the Montgomery form number. (r = a * a mod m)
  *
  * r   Result of squaring.
- *     Should be [16] array (512 bits), but high half is cleared to zeros (used as scratch pad).
  * a   Number to square in Montogmery form.
  * m   Modulus (prime).
  * mp  Montogmery mulitplier.
  */
-static void sp_256to512z_mont_sqr_8(sp_digit* r, const sp_digit* a
+static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a
 		/*, const sp_digit* m, sp_digit mp*/)
 {
 	//const sp_digit* m = p256_mod;
 	//sp_digit mp = p256_mp_mod;
-	sp_256to512z_mont_mul_8(r, a, a /*, m, mp*/);
+	sp_256_mont_mul_8(r, a, a /*, m, mp*/);
 }
 
@@ -964,11 +937,8 @@ static void sp_256to512z_mont_sqr_8(sp_digit* r, const sp_digit* a
  * a  Number to invert.
  */
 #if 0
-/* Mod-2 for the P256 curve. */
-static const uint32_t p256_mod_2[8] = {
-	0xfffffffd,0xffffffff,0xffffffff,0x00000000,
-	0x00000000,0x00000000,0x00000001,0xffffffff,
-};
+//p256_mod - 2:
+//ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff - 2
 //Bit pattern:
 //2 2 2 2 2 2 2 1...1
 //5 5 4 3 2 1 0 9...0 9...1
@@ -977,15 +947,15 @@ static const uint32_t p256_mod_2[8] = {
 #endif
 static void sp_256_mont_inv_8(sp_digit* r, sp_digit* a)
 {
-	sp_digit t[2*8];
+	sp_digit t[8];
 	int i;
 
 	memcpy(t, a, sizeof(sp_digit) * 8);
 	for (i = 254; i >= 0; i--) {
-		sp_256to512z_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/);
+		sp_256_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/);
 		/*if (p256_mod_2[i / 32] & ((sp_digit)1 << (i % 32)))*/
 		if (i >= 224 || i == 192 || (i <= 95 && i != 1))
-			sp_256to512z_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/);
+			sp_256_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/);
 	}
 	memcpy(r, t, sizeof(sp_digit) * 8);
 }
@@ -1056,25 +1026,28 @@ static void sp_256_mod_mul_norm_8(sp_digit* r, const sp_digit* a)
  */
 static void sp_256_map_8(sp_point* r, sp_point* p)
 {
-	sp_digit t1[2*8];
-	sp_digit t2[2*8];
+	sp_digit t1[8];
+	sp_digit t2[8];
+	sp_digit rr[2 * 8];
 
 	sp_256_mont_inv_8(t1, p->z);
 
-	sp_256to512z_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);
 
 	/* x /= z^2 */
-	sp_256to512z_mont_mul_8(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/);
-	sp_512to256_mont_reduce_8(r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(rr, p->x, t2 /*, p256_mod, p256_mp_mod*/);
+	memset(rr + 8, 0, sizeof(rr) / 2);
+	sp_512to256_mont_reduce_8(r->x, rr /*, p256_mod, p256_mp_mod*/);
 	/* Reduce x to less than modulus */
 	if (sp_256_cmp_8(r->x, p256_mod) >= 0)
 		sp_256_sub_8_p256_mod(r->x);
 	sp_256_norm_8(r->x);
 
 	/* y /= z^3 */
-	sp_256to512z_mont_mul_8(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/);
-	sp_512to256_mont_reduce_8(r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(rr, p->y, t1 /*, p256_mod, p256_mp_mod*/);
+	memset(rr + 8, 0, sizeof(rr) / 2);
+	sp_512to256_mont_reduce_8(r->y, rr /*, p256_mod, p256_mp_mod*/);
 	/* Reduce y to less than modulus */
 	if (sp_256_cmp_8(r->y, p256_mod) >= 0)
 		sp_256_sub_8_p256_mod(r->y);
@@ -1091,8 +1064,8 @@ static void sp_256_map_8(sp_point* r, sp_point* p)
  */
 static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 {
-	sp_digit t1[2*8];
-	sp_digit t2[2*8];
+	sp_digit t1[8];
+	sp_digit t2[8];
 
 	/* Put point to double into result */
 	if (r != p)
@@ -1101,17 +1074,10 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	if (r->infinity)
 		return;
 
-	if (SP_DEBUG) {
-		/* unused part of t2, may result in spurios
-		 * differences in debug output. Clear it.
-		 */
-		memset(t2, 0, sizeof(t2));
-	}
-
 	/* T1 = Z * Z */
-	sp_256to512z_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
 	/* Z = Y * Z */
-	sp_256to512z_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
 	/* Z = 2Z */
 	sp_256_mont_dbl_8(r->z, r->z /*, p256_mod*/);
 	/* T2 = X - T1 */
@@ -1119,21 +1085,21 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	/* T1 = X + T1 */
 	sp_256_mont_add_8(t1, r->x, t1 /*, p256_mod*/);
 	/* T2 = T1 * T2 */
-	sp_256to512z_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
 	/* T1 = 3T2 */
 	sp_256_mont_tpl_8(t1, t2 /*, p256_mod*/);
 	/* Y = 2Y */
 	sp_256_mont_dbl_8(r->y, r->y /*, p256_mod*/);
 	/* Y = Y * Y */
-	sp_256to512z_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
 	/* T2 = Y * Y */
-	sp_256to512z_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
 	/* T2 = T2/2 */
 	sp_256_div2_8(t2 /*, p256_mod*/);
 	/* Y = Y * X */
-	sp_256to512z_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
 	/* X = T1 * T1 */
-	sp_256to512z_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
 	/* X = X - Y */
 	sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/);
 	/* X = X - Y */
@@ -1141,7 +1107,7 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	/* Y = Y - X */
 	sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
 	/* Y = Y * T1 */
-	sp_256to512z_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
 	/* Y = Y - T2 */
 	sp_256_mont_sub_8(r->y, r->y, t2 /*, p256_mod*/);
 	dump_512("y2 %s\n", r->y);
@@ -1155,11 +1121,11 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
  */
 static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point* q)
 {
-	sp_digit t1[2*8];
-	sp_digit t2[2*8];
-	sp_digit t3[2*8];
-	sp_digit t4[2*8];
-	sp_digit t5[2*8];
+	sp_digit t1[8];
+	sp_digit t2[8];
+	sp_digit t3[8];
+	sp_digit t4[8];
+	sp_digit t5[8];
 
 	/* Ensure only the first point is the same as the result. */
 	if (q == r) {
@@ -1186,36 +1152,36 @@ static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point*
 	}
 
 	/* U1 = X1*Z2^2 */
-	sp_256to512z_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/);
 	/* U2 = X2*Z1^2 */
-	sp_256to512z_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
 	/* S1 = Y1*Z2^3 */
-	sp_256to512z_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/);
 	/* S2 = Y2*Z1^3 */
-	sp_256to512z_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
 	/* H = U2 - U1 */
 	sp_256_mont_sub_8(t2, t2, t1 /*, p256_mod*/);
 	/* R = S2 - S1 */
 	sp_256_mont_sub_8(t4, t4, t3 /*, p256_mod*/);
 	/* Z3 = H*Z1*Z2 */
-	sp_256to512z_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/);
 	/* X3 = R^2 - H^3 - 2*U1*H^2 */
-	sp_256to512z_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
 	sp_256_mont_sub_8(r->x, r->x, t5 /*, p256_mod*/);
 	sp_256_mont_dbl_8(t1, r->y /*, p256_mod*/);
 	sp_256_mont_sub_8(r->x, r->x, t1 /*, p256_mod*/);
 	/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
 	sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
-	sp_256to512z_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
 	sp_256_mont_sub_8(r->y, r->y, t5 /*, p256_mod*/);
 }
 
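For reference, the one caller that relied on the zeroed top half is sp_256_map_8,
fixed up in the hunks above. With R = 2^256, Montgomery form stores a value x as
x*R mod p, and sp_512to256_mont_reduce_8(r, a) computes a*R^-1 mod p for a
512-bit a; so a Montgomery-form value, widened to 512 bits with an explicitly
zeroed top half and reduced once more, drops its factor of R. The conversion
step now reads (condensed from the patch above; comments added here):

	sp_digit rr[2 * 8];
	/* rr = (x / z^2) * R mod p -- 256-bit result, still in Montgomery form */
	sp_256_mont_mul_8(rr, p->x, t2 /*, p256_mod, p256_mp_mod*/);
	/* widen to 512 bits; the helpers no longer zero this for us */
	memset(rr + 8, 0, sizeof(rr) / 2);
	/* one more R^-1 factor strips Montgomery form: r->x = x / z^2 mod p */
	sp_512to256_mont_reduce_8(r->x, rr /*, p256_mod, p256_mp_mod*/);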