author      Denys Vlasenko <vda.linux@googlemail.com>    2021-11-27 15:47:26 +0100
committer   Denys Vlasenko <vda.linux@googlemail.com>    2021-11-27 15:47:26 +0100
commit      4415f7bc06f1ee382bcbaabd86c3d7aca0b46d93 (patch)
tree        9fb82bbbb4dca3f9ad86ef8f831e54333db4666b
parent      bbda85c74b7a53d8b2bb46f3b44d8f0932a6e95d (diff)
download    busybox-w32-4415f7bc06f1ee382bcbaabd86c3d7aca0b46d93.tar.gz
            busybox-w32-4415f7bc06f1ee382bcbaabd86c3d7aca0b46d93.tar.bz2
            busybox-w32-4415f7bc06f1ee382bcbaabd86c3d7aca0b46d93.zip

tls: P256: explain which functions use double-wide arrays, no code changes

function                                             old     new   delta
sp_512to256_mont_reduce_8                              -     243    +243
sp_256to512z_mont_mul_8                                -     150    +150
sp_256to512z_mont_sqr_8                                -       7      +7
sp_256_mont_sqr_8                                      7       -      -7
sp_256_mont_mul_8                                    150       -    -150
sp_256_mont_reduce_8                                 243       -    -243
------------------------------------------------------------------------------
(add/remove: 3/3 grow/shrink: 0/0 up/down: 400/-400)           Total: 0 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
| -rw-r--r-- | networking/tls_sp_c32.c | 211 |
1 file changed, 58 insertions, 153 deletions
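To make the renaming concrete: a 256-bit field element is stored as eight 32-bit sp_digit limbs, and a full 256×256-bit product needs sixteen limbs (512 bits) before it is reduced back to 256 bits. Below is a minimal portable sketch of that widening step, assuming the sp_digit == uint32_t limb layout; the helper name mul_256to512 and the schoolbook loops are illustrative only, not the optimized (partly assembly) code in tls_sp_c32.c.

```c
#include <stdint.h>
#include <string.h>

typedef uint32_t sp_digit;   /* one 32-bit limb, as in tls_sp_c32.c */

/* Schoolbook 256x256 -> 512-bit multiply: the kind of operation the
 * renamed sp_256to512_mul_8() performs.  r must have room for 16 limbs. */
static void mul_256to512(sp_digit r[16], const sp_digit a[8], const sp_digit b[8])
{
	int i, j;

	memset(r, 0, 16 * sizeof(sp_digit));
	for (i = 0; i < 8; i++) {
		uint64_t carry = 0;
		for (j = 0; j < 8; j++) {
			uint64_t t = (uint64_t)a[i] * b[j] + r[i + j] + carry;
			r[i + j] = (sp_digit)t;
			carry = t >> 32;
		}
		r[i + 8] = (sp_digit)carry;  /* top limb of this partial row */
	}
	/* The Montgomery routines then shrink this 512-bit value back to
	 * 256 bits (sp_512to256_mont_reduce_8 in the patch). */
}
```

With that picture, the new names read as input-width "to" output-width: sp_256to512_mul_8 widens, sp_512to256_mont_reduce_8 narrows, and the "z" in sp_256to512z_mont_mul_8 / sp_256to512z_mont_sqr_8 apparently marks that the caller must supply a 16-limb result buffer whose high half is only scratch and comes back zeroed, per the new comments in the diff.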
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 3b0473036..74ded2cda 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
| @@ -455,8 +455,10 @@ static void sp_256_sub_8_p256_mod(sp_digit* r) | |||
| 455 | } | 455 | } |
| 456 | #endif | 456 | #endif |
| 457 | 457 | ||
| 458 | /* Multiply a and b into r. (r = a * b) */ | 458 | /* Multiply a and b into r. (r = a * b) |
| 459 | static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) | 459 | * r should be [16] array (512 bits). |
| 460 | */ | ||
| 461 | static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) | ||
| 460 | { | 462 | { |
| 461 | #if ALLOW_ASM && defined(__GNUC__) && defined(__i386__) | 463 | #if ALLOW_ASM && defined(__GNUC__) && defined(__i386__) |
| 462 | sp_digit rr[15]; /* in case r coincides with a or b */ | 464 | sp_digit rr[15]; /* in case r coincides with a or b */ |
| @@ -704,9 +706,11 @@ static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit* | |||
| 704 | } | 706 | } |
| 705 | } | 707 | } |
| 706 | 708 | ||
| 707 | /* Shift the result in the high 256 bits down to the bottom. */ | 709 | /* Shift the result in the high 256 bits down to the bottom. |
| 710 | * High half is cleared to zeros. | ||
| 711 | */ | ||
| 708 | #if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff | 712 | #if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff |
| 709 | static void sp_256_mont_shift_8(sp_digit* rr) | 713 | static void sp_512to256_mont_shift_8(sp_digit* rr) |
| 710 | { | 714 | { |
| 711 | uint64_t *r = (void*)rr; | 715 | uint64_t *r = (void*)rr; |
| 712 | int i; | 716 | int i; |
| @@ -717,7 +721,7 @@ static void sp_256_mont_shift_8(sp_digit* rr) | |||
| 717 | } | 721 | } |
| 718 | } | 722 | } |
| 719 | #else | 723 | #else |
| 720 | static void sp_256_mont_shift_8(sp_digit* r) | 724 | static void sp_512to256_mont_shift_8(sp_digit* r) |
| 721 | { | 725 | { |
| 722 | int i; | 726 | int i; |
| 723 | 727 | ||
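The renamed sp_512to256_mont_shift_8() is the narrowing step: once reduction has zeroed the low 256 bits, the high eight limbs are moved down and the vacated high half is cleared, exactly as the new comment says. A rough portable equivalent of the generic (non-64-bit) branch, with an illustrative name only:

```c
#include <stdint.h>

typedef uint32_t sp_digit;

/* Move limbs 8..15 (bits 256..511) down to limbs 0..7 and zero the
 * vacated high half -- the behaviour described by the new comment. */
static void shift_512to256(sp_digit r[16])
{
	int i;
	for (i = 0; i < 8; i++) {
		r[i] = r[i + 8];
		r[i + 8] = 0;
	}
}
```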
| @@ -728,7 +732,10 @@ static void sp_256_mont_shift_8(sp_digit* r) | |||
| 728 | } | 732 | } |
| 729 | #endif | 733 | #endif |
| 730 | 734 | ||
| 731 | /* Mul a by scalar b and add into r. (r += a * b) */ | 735 | /* Mul a by scalar b and add into r. (r += a * b) |
| 736 | * a = p256_mod | ||
| 737 | * b = r[0] | ||
| 738 | */ | ||
| 732 | static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/) | 739 | static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/) |
| 733 | { | 740 | { |
| 734 | // const sp_digit* a = p256_mod; | 741 | // const sp_digit* a = p256_mod; |
| @@ -857,11 +864,11 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/) | |||
| 857 | 864 | ||
| 858 | /* Reduce the number back to 256 bits using Montgomery reduction. | 865 | /* Reduce the number back to 256 bits using Montgomery reduction. |
| 859 | * | 866 | * |
| 860 | * a A single precision number to reduce in place. | 867 | * a Double-wide number to reduce in place. |
| 861 | * m The single precision number representing the modulus. | 868 | * m The single precision number representing the modulus. |
| 862 | * mp The digit representing the negative inverse of m mod 2^n. | 869 | * mp The digit representing the negative inverse of m mod 2^n. |
| 863 | */ | 870 | */ |
| 864 | static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/) | 871 | static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/) |
| 865 | { | 872 | { |
| 866 | // const sp_digit* m = p256_mod; | 873 | // const sp_digit* m = p256_mod; |
| 867 | sp_digit mp = p256_mp_mod; | 874 | sp_digit mp = p256_mp_mod; |
| @@ -884,7 +891,7 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/ | |||
| 884 | goto inc_next_word0; | 891 | goto inc_next_word0; |
| 885 | } | 892 | } |
| 886 | } | 893 | } |
| 887 | sp_256_mont_shift_8(a); | 894 | sp_512to256_mont_shift_8(a); |
| 888 | if (word16th != 0) | 895 | if (word16th != 0) |
| 889 | sp_256_sub_8_p256_mod(a); | 896 | sp_256_sub_8_p256_mod(a); |
| 890 | sp_256_norm_8(a); | 897 | sp_256_norm_8(a); |
| @@ -892,7 +899,7 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/ | |||
| 892 | else { /* Same code for explicit mp == 1 (which is always the case for P256) */ | 899 | else { /* Same code for explicit mp == 1 (which is always the case for P256) */ |
| 893 | sp_digit word16th = 0; | 900 | sp_digit word16th = 0; |
| 894 | for (i = 0; i < 8; i++) { | 901 | for (i = 0; i < 8; i++) { |
| 895 | /*mu = a[i];*/ | 902 | // mu = a[i]; |
| 896 | if (sp_256_mul_add_8(a+i /*, m, mu*/)) { | 903 | if (sp_256_mul_add_8(a+i /*, m, mu*/)) { |
| 897 | int j = i + 8; | 904 | int j = i + 8; |
| 898 | inc_next_word: | 905 | inc_next_word: |
| @@ -904,148 +911,46 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/ | |||
| 904 | goto inc_next_word; | 911 | goto inc_next_word; |
| 905 | } | 912 | } |
| 906 | } | 913 | } |
| 907 | sp_256_mont_shift_8(a); | 914 | sp_512to256_mont_shift_8(a); |
| 908 | if (word16th != 0) | 915 | if (word16th != 0) |
| 909 | sp_256_sub_8_p256_mod(a); | 916 | sp_256_sub_8_p256_mod(a); |
| 910 | sp_256_norm_8(a); | 917 | sp_256_norm_8(a); |
| 911 | } | 918 | } |
| 912 | } | 919 | } |
| 913 | #if 0 | ||
| 914 | //TODO: arm32 asm (also adapt for x86?) | ||
| 915 | static void sp_256_mont_reduce_8(sp_digit* a, sp_digit* m, sp_digit mp) | ||
| 916 | { | ||
| 917 | sp_digit ca = 0; | ||
| 918 | |||
| 919 | asm volatile ( | ||
| 920 | # i = 0 | ||
| 921 | mov r12, #0 | ||
| 922 | ldr r10, [%[a], #0] | ||
| 923 | ldr r14, [%[a], #4] | ||
| 924 | 1: | ||
| 925 | # mu = a[i] * mp | ||
| 926 | mul r8, %[mp], r10 | ||
| 927 | # a[i+0] += m[0] * mu | ||
| 928 | ldr r7, [%[m], #0] | ||
| 929 | ldr r9, [%[a], #0] | ||
| 930 | umull r6, r7, r8, r7 | ||
| 931 | adds r10, r10, r6 | ||
| 932 | adc r5, r7, #0 | ||
| 933 | # a[i+1] += m[1] * mu | ||
| 934 | ldr r7, [%[m], #4] | ||
| 935 | ldr r9, [%[a], #4] | ||
| 936 | umull r6, r7, r8, r7 | ||
| 937 | adds r10, r14, r6 | ||
| 938 | adc r4, r7, #0 | ||
| 939 | adds r10, r10, r5 | ||
| 940 | adc r4, r4, #0 | ||
| 941 | # a[i+2] += m[2] * mu | ||
| 942 | ldr r7, [%[m], #8] | ||
| 943 | ldr r14, [%[a], #8] | ||
| 944 | umull r6, r7, r8, r7 | ||
| 945 | adds r14, r14, r6 | ||
| 946 | adc r5, r7, #0 | ||
| 947 | adds r14, r14, r4 | ||
| 948 | adc r5, r5, #0 | ||
| 949 | # a[i+3] += m[3] * mu | ||
| 950 | ldr r7, [%[m], #12] | ||
| 951 | ldr r9, [%[a], #12] | ||
| 952 | umull r6, r7, r8, r7 | ||
| 953 | adds r9, r9, r6 | ||
| 954 | adc r4, r7, #0 | ||
| 955 | adds r9, r9, r5 | ||
| 956 | str r9, [%[a], #12] | ||
| 957 | adc r4, r4, #0 | ||
| 958 | # a[i+4] += m[4] * mu | ||
| 959 | ldr r7, [%[m], #16] | ||
| 960 | ldr r9, [%[a], #16] | ||
| 961 | umull r6, r7, r8, r7 | ||
| 962 | adds r9, r9, r6 | ||
| 963 | adc r5, r7, #0 | ||
| 964 | adds r9, r9, r4 | ||
| 965 | str r9, [%[a], #16] | ||
| 966 | adc r5, r5, #0 | ||
| 967 | # a[i+5] += m[5] * mu | ||
| 968 | ldr r7, [%[m], #20] | ||
| 969 | ldr r9, [%[a], #20] | ||
| 970 | umull r6, r7, r8, r7 | ||
| 971 | adds r9, r9, r6 | ||
| 972 | adc r4, r7, #0 | ||
| 973 | adds r9, r9, r5 | ||
| 974 | str r9, [%[a], #20] | ||
| 975 | adc r4, r4, #0 | ||
| 976 | # a[i+6] += m[6] * mu | ||
| 977 | ldr r7, [%[m], #24] | ||
| 978 | ldr r9, [%[a], #24] | ||
| 979 | umull r6, r7, r8, r7 | ||
| 980 | adds r9, r9, r6 | ||
| 981 | adc r5, r7, #0 | ||
| 982 | adds r9, r9, r4 | ||
| 983 | str r9, [%[a], #24] | ||
| 984 | adc r5, r5, #0 | ||
| 985 | # a[i+7] += m[7] * mu | ||
| 986 | ldr r7, [%[m], #28] | ||
| 987 | ldr r9, [%[a], #28] | ||
| 988 | umull r6, r7, r8, r7 | ||
| 989 | adds r5, r5, r6 | ||
| 990 | adcs r7, r7, %[ca] | ||
| 991 | mov %[ca], #0 | ||
| 992 | adc %[ca], %[ca], %[ca] | ||
| 993 | adds r9, r9, r5 | ||
| 994 | str r9, [%[a], #28] | ||
| 995 | ldr r9, [%[a], #32] | ||
| 996 | adcs r9, r9, r7 | ||
| 997 | str r9, [%[a], #32] | ||
| 998 | adc %[ca], %[ca], #0 | ||
| 999 | # i += 1 | ||
| 1000 | add %[a], %[a], #4 | ||
| 1001 | add r12, r12, #4 | ||
| 1002 | cmp r12, #32 | ||
| 1003 | blt 1b | ||
| 1004 | |||
| 1005 | str r10, [%[a], #0] | ||
| 1006 | str r14, [%[a], #4] | ||
| 1007 | : [ca] "+r" (ca), [a] "+r" (a) | ||
| 1008 | : [m] "r" (m), [mp] "r" (mp) | ||
| 1009 | : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14" | ||
| 1010 | ); | ||
| 1011 | |||
| 1012 | memcpy(a, a + 8, 32); | ||
| 1013 | if (ca) | ||
| 1014 | a -= m; | ||
| 1015 | } | ||
| 1016 | #endif | ||
| 1017 | 920 | ||
| 1018 | /* Multiply two Montogmery form numbers mod the modulus (prime). | 921 | /* Multiply two Montogmery form numbers mod the modulus (prime). |
| 1019 | * (r = a * b mod m) | 922 | * (r = a * b mod m) |
| 1020 | * | 923 | * |
| 1021 | * r Result of multiplication. | 924 | * r Result of multiplication. |
| 925 | * Should be [16] array (512 bits), but high half is cleared to zeros (used as scratch pad). | ||
| 1022 | * a First number to multiply in Montogmery form. | 926 | * a First number to multiply in Montogmery form. |
| 1023 | * b Second number to multiply in Montogmery form. | 927 | * b Second number to multiply in Montogmery form. |
| 1024 | * m Modulus (prime). | 928 | * m Modulus (prime). |
| 1025 | * mp Montogmery mulitplier. | 929 | * mp Montogmery mulitplier. |
| 1026 | */ | 930 | */ |
| 1027 | static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b | 931 | static void sp_256to512z_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b |
| 1028 | /*, const sp_digit* m, sp_digit mp*/) | 932 | /*, const sp_digit* m, sp_digit mp*/) |
| 1029 | { | 933 | { |
| 1030 | //const sp_digit* m = p256_mod; | 934 | //const sp_digit* m = p256_mod; |
| 1031 | //sp_digit mp = p256_mp_mod; | 935 | //sp_digit mp = p256_mp_mod; |
| 1032 | sp_256_mul_8(r, a, b); | 936 | sp_256to512_mul_8(r, a, b); |
| 1033 | sp_256_mont_reduce_8(r /*, m, mp*/); | 937 | sp_512to256_mont_reduce_8(r /*, m, mp*/); |
| 1034 | } | 938 | } |
| 1035 | 939 | ||
| 1036 | /* Square the Montgomery form number. (r = a * a mod m) | 940 | /* Square the Montgomery form number. (r = a * a mod m) |
| 1037 | * | 941 | * |
| 1038 | * r Result of squaring. | 942 | * r Result of squaring. |
| 943 | * Should be [16] array (512 bits), but high half is cleared to zeros (used as scratch pad). | ||
| 1039 | * a Number to square in Montogmery form. | 944 | * a Number to square in Montogmery form. |
| 1040 | * m Modulus (prime). | 945 | * m Modulus (prime). |
| 1041 | * mp Montogmery mulitplier. | 946 | * mp Montogmery mulitplier. |
| 1042 | */ | 947 | */ |
| 1043 | static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a | 948 | static void sp_256to512z_mont_sqr_8(sp_digit* r, const sp_digit* a |
| 1044 | /*, const sp_digit* m, sp_digit mp*/) | 949 | /*, const sp_digit* m, sp_digit mp*/) |
| 1045 | { | 950 | { |
| 1046 | //const sp_digit* m = p256_mod; | 951 | //const sp_digit* m = p256_mod; |
| 1047 | //sp_digit mp = p256_mp_mod; | 952 | //sp_digit mp = p256_mp_mod; |
| 1048 | sp_256_mont_mul_8(r, a, a /*, m, mp*/); | 953 | sp_256to512z_mont_mul_8(r, a, a /*, m, mp*/); |
| 1049 | } | 954 | } |
| 1050 | 955 | ||
| 1051 | /* Invert the number, in Montgomery form, modulo the modulus (prime) of the | 956 | /* Invert the number, in Montgomery form, modulo the modulus (prime) of the |
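The core of sp_512to256_mont_reduce_8() is the loop above: for each of the eight low limbs it calls sp_256_mul_add_8() to add a multiple of p256_mod that forces that limb to zero, then the surviving high half is shifted down. A minimal sketch of the same idea, assuming the sp_digit == uint32_t limb layout and the P-256 property that p256_mp_mod == 1 (so mu is simply a[i]); names and structure here are illustrative, not the patch's code:

```c
#include <stdint.h>

typedef uint32_t sp_digit;

/* One reduction pass per low limb: a += (a[i] * m) << (32*i), which
 * zeroes limb i because m[0] == 0xffffffff for P-256 and mp == 1. */
static void mont_reduce_512to256(sp_digit a[16], const sp_digit m[8])
{
	int i, j;

	for (i = 0; i < 8; i++) {
		sp_digit mu = a[i];          /* mu = a[i] * p256_mp_mod, with mp == 1 */
		uint64_t carry = 0;

		for (j = 0; j < 8; j++) {
			uint64_t t = (uint64_t)mu * m[j] + a[i + j] + carry;
			a[i + j] = (sp_digit)t;
			carry = t >> 32;
		}
		/* propagate the leftover carry into the upper limbs; a carry
		 * falling off limb 15 is what the real code tracks as word16th */
		for (j = i + 8; carry != 0 && j < 16; j++) {
			uint64_t t = (uint64_t)a[j] + carry;
			a[j] = (sp_digit)t;
			carry = t >> 32;
		}
	}
	/* Limbs 0..7 are now zero.  The real code then shifts the high half
	 * down (sp_512to256_mont_shift_8) and, if needed, subtracts p256_mod
	 * once (sp_256_sub_8_p256_mod) and normalizes (sp_256_norm_8). */
}
```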
| @@ -1068,15 +973,15 @@ static const uint32_t p256_mod_2[8] = { | |||
| 1068 | #endif | 973 | #endif |
| 1069 | static void sp_256_mont_inv_8(sp_digit* r, sp_digit* a) | 974 | static void sp_256_mont_inv_8(sp_digit* r, sp_digit* a) |
| 1070 | { | 975 | { |
| 1071 | sp_digit t[2*8]; //can be just [8]? | 976 | sp_digit t[2*8]; |
| 1072 | int i; | 977 | int i; |
| 1073 | 978 | ||
| 1074 | memcpy(t, a, sizeof(sp_digit) * 8); | 979 | memcpy(t, a, sizeof(sp_digit) * 8); |
| 1075 | for (i = 254; i >= 0; i--) { | 980 | for (i = 254; i >= 0; i--) { |
| 1076 | sp_256_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/); | 981 | sp_256to512z_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/); |
| 1077 | /*if (p256_mod_2[i / 32] & ((sp_digit)1 << (i % 32)))*/ | 982 | /*if (p256_mod_2[i / 32] & ((sp_digit)1 << (i % 32)))*/ |
| 1078 | if (i >= 224 || i == 192 || (i <= 95 && i != 1)) | 983 | if (i >= 224 || i == 192 || (i <= 95 && i != 1)) |
| 1079 | sp_256_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/); | 984 | sp_256to512z_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/); |
| 1080 | } | 985 | } |
| 1081 | memcpy(r, t, sizeof(sp_digit) * 8); | 986 | memcpy(r, t, sizeof(sp_digit) * 8); |
| 1082 | } | 987 | } |
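The inversion above is Fermat's little theorem in Montgomery form: it performs 255 squarings and multiplies by a at every set bit of the exponent p − 2. The bit test in the code, i >= 224 || i == 192 || (i <= 95 && i != 1), is exactly the set-bit pattern of p256_mod − 2. In formula form (my rendering, not from the patch):

```latex
a^{-1} \equiv a^{p-2} \pmod{p},
\qquad p - 2 = 2^{256} - 2^{224} + 2^{192} + 2^{96} - 3
```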
| @@ -1152,22 +1057,22 @@ static void sp_256_map_8(sp_point* r, sp_point* p) | |||
| 1152 | 1057 | ||
| 1153 | sp_256_mont_inv_8(t1, p->z); | 1058 | sp_256_mont_inv_8(t1, p->z); |
| 1154 | 1059 | ||
| 1155 | sp_256_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/); | 1060 | sp_256to512z_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/); |
| 1156 | sp_256_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/); | 1061 | sp_256to512z_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/); |
| 1157 | 1062 | ||
| 1158 | /* x /= z^2 */ | 1063 | /* x /= z^2 */ |
| 1159 | sp_256_mont_mul_8(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/); | 1064 | sp_256to512z_mont_mul_8(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/); |
| 1160 | memset(r->x + 8, 0, sizeof(r->x) / 2); | 1065 | memset(r->x + 8, 0, sizeof(r->x) / 2); |
| 1161 | sp_256_mont_reduce_8(r->x /*, p256_mod, p256_mp_mod*/); | 1066 | sp_512to256_mont_reduce_8(r->x /*, p256_mod, p256_mp_mod*/); |
| 1162 | /* Reduce x to less than modulus */ | 1067 | /* Reduce x to less than modulus */ |
| 1163 | if (sp_256_cmp_8(r->x, p256_mod) >= 0) | 1068 | if (sp_256_cmp_8(r->x, p256_mod) >= 0) |
| 1164 | sp_256_sub_8_p256_mod(r->x); | 1069 | sp_256_sub_8_p256_mod(r->x); |
| 1165 | sp_256_norm_8(r->x); | 1070 | sp_256_norm_8(r->x); |
| 1166 | 1071 | ||
| 1167 | /* y /= z^3 */ | 1072 | /* y /= z^3 */ |
| 1168 | sp_256_mont_mul_8(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/); | 1073 | sp_256to512z_mont_mul_8(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/); |
| 1169 | memset(r->y + 8, 0, sizeof(r->y) / 2); | 1074 | memset(r->y + 8, 0, sizeof(r->y) / 2); |
| 1170 | sp_256_mont_reduce_8(r->y /*, p256_mod, p256_mp_mod*/); | 1075 | sp_512to256_mont_reduce_8(r->y /*, p256_mod, p256_mp_mod*/); |
| 1171 | /* Reduce y to less than modulus */ | 1076 | /* Reduce y to less than modulus */ |
| 1172 | if (sp_256_cmp_8(r->y, p256_mod) >= 0) | 1077 | if (sp_256_cmp_8(r->y, p256_mod) >= 0) |
| 1173 | sp_256_sub_8_p256_mod(r->y); | 1078 | sp_256_sub_8_p256_mod(r->y); |
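For context, sp_256_map_8() maps a Jacobian-coordinate point back to affine coordinates (standard relation below, not spelled out in the patch). The memset of the high eight limbs before each sp_512to256_mont_reduce_8() call is what lets the plain double-wide reduction be reused here, and that extra reduction also strips the Montgomery factor from the result.

```latex
x = X \cdot Z^{-2}, \qquad y = Y \cdot Z^{-3}
```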
| @@ -1202,9 +1107,9 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p) | |||
| 1202 | } | 1107 | } |
| 1203 | 1108 | ||
| 1204 | /* T1 = Z * Z */ | 1109 | /* T1 = Z * Z */ |
| 1205 | sp_256_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/); | 1110 | sp_256to512z_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/); |
| 1206 | /* Z = Y * Z */ | 1111 | /* Z = Y * Z */ |
| 1207 | sp_256_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/); | 1112 | sp_256to512z_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/); |
| 1208 | /* Z = 2Z */ | 1113 | /* Z = 2Z */ |
| 1209 | sp_256_mont_dbl_8(r->z, r->z /*, p256_mod*/); | 1114 | sp_256_mont_dbl_8(r->z, r->z /*, p256_mod*/); |
| 1210 | /* T2 = X - T1 */ | 1115 | /* T2 = X - T1 */ |
| @@ -1212,21 +1117,21 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p) | |||
| 1212 | /* T1 = X + T1 */ | 1117 | /* T1 = X + T1 */ |
| 1213 | sp_256_mont_add_8(t1, r->x, t1 /*, p256_mod*/); | 1118 | sp_256_mont_add_8(t1, r->x, t1 /*, p256_mod*/); |
| 1214 | /* T2 = T1 * T2 */ | 1119 | /* T2 = T1 * T2 */ |
| 1215 | sp_256_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/); | 1120 | sp_256to512z_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/); |
| 1216 | /* T1 = 3T2 */ | 1121 | /* T1 = 3T2 */ |
| 1217 | sp_256_mont_tpl_8(t1, t2 /*, p256_mod*/); | 1122 | sp_256_mont_tpl_8(t1, t2 /*, p256_mod*/); |
| 1218 | /* Y = 2Y */ | 1123 | /* Y = 2Y */ |
| 1219 | sp_256_mont_dbl_8(r->y, r->y /*, p256_mod*/); | 1124 | sp_256_mont_dbl_8(r->y, r->y /*, p256_mod*/); |
| 1220 | /* Y = Y * Y */ | 1125 | /* Y = Y * Y */ |
| 1221 | sp_256_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/); | 1126 | sp_256to512z_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/); |
| 1222 | /* T2 = Y * Y */ | 1127 | /* T2 = Y * Y */ |
| 1223 | sp_256_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/); | 1128 | sp_256to512z_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/); |
| 1224 | /* T2 = T2/2 */ | 1129 | /* T2 = T2/2 */ |
| 1225 | sp_256_div2_8(t2, t2, p256_mod); | 1130 | sp_256_div2_8(t2, t2, p256_mod); |
| 1226 | /* Y = Y * X */ | 1131 | /* Y = Y * X */ |
| 1227 | sp_256_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/); | 1132 | sp_256to512z_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/); |
| 1228 | /* X = T1 * T1 */ | 1133 | /* X = T1 * T1 */ |
| 1229 | sp_256_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/); | 1134 | sp_256to512z_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/); |
| 1230 | /* X = X - Y */ | 1135 | /* X = X - Y */ |
| 1231 | sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/); | 1136 | sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/); |
| 1232 | /* X = X - Y */ | 1137 | /* X = X - Y */ |
| @@ -1234,7 +1139,7 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p) | |||
| 1234 | /* Y = Y - X */ | 1139 | /* Y = Y - X */ |
| 1235 | sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/); | 1140 | sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/); |
| 1236 | /* Y = Y * T1 */ | 1141 | /* Y = Y * T1 */ |
| 1237 | sp_256_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/); | 1142 | sp_256to512z_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/); |
| 1238 | /* Y = Y - T2 */ | 1143 | /* Y = Y - T2 */ |
| 1239 | sp_256_mont_sub_8(r->y, r->y, t2 /*, p256_mod*/); | 1144 | sp_256_mont_sub_8(r->y, r->y, t2 /*, p256_mod*/); |
| 1240 | dump_512("y2 %s\n", r->y); | 1145 | dump_512("y2 %s\n", r->y); |
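Written out, the doubling sequence above is the standard Jacobian doubling for P-256, where the curve parameter a = −3 lets 3X² + aZ⁴ factor into the product the code computes. My transcription from the step comments:

```latex
M = 3\,(X - Z^2)(X + Z^2), \qquad S = 4\,X Y^2,
\qquad Z_3 = 2\,Y Z, \qquad X_3 = M^2 - 2S, \qquad Y_3 = M\,(S - X_3) - 8\,Y^4
```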
| @@ -1279,36 +1184,36 @@ static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point* | |||
| 1279 | } | 1184 | } |
| 1280 | 1185 | ||
| 1281 | /* U1 = X1*Z2^2 */ | 1186 | /* U1 = X1*Z2^2 */ |
| 1282 | sp_256_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/); | 1187 | sp_256to512z_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/); |
| 1283 | sp_256_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/); | 1188 | sp_256to512z_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/); |
| 1284 | sp_256_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/); | 1189 | sp_256to512z_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/); |
| 1285 | /* U2 = X2*Z1^2 */ | 1190 | /* U2 = X2*Z1^2 */ |
| 1286 | sp_256_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/); | 1191 | sp_256to512z_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/); |
| 1287 | sp_256_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/); | 1192 | sp_256to512z_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/); |
| 1288 | sp_256_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/); | 1193 | sp_256to512z_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/); |
| 1289 | /* S1 = Y1*Z2^3 */ | 1194 | /* S1 = Y1*Z2^3 */ |
| 1290 | sp_256_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/); | 1195 | sp_256to512z_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/); |
| 1291 | /* S2 = Y2*Z1^3 */ | 1196 | /* S2 = Y2*Z1^3 */ |
| 1292 | sp_256_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/); | 1197 | sp_256to512z_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/); |
| 1293 | /* H = U2 - U1 */ | 1198 | /* H = U2 - U1 */ |
| 1294 | sp_256_mont_sub_8(t2, t2, t1 /*, p256_mod*/); | 1199 | sp_256_mont_sub_8(t2, t2, t1 /*, p256_mod*/); |
| 1295 | /* R = S2 - S1 */ | 1200 | /* R = S2 - S1 */ |
| 1296 | sp_256_mont_sub_8(t4, t4, t3 /*, p256_mod*/); | 1201 | sp_256_mont_sub_8(t4, t4, t3 /*, p256_mod*/); |
| 1297 | /* Z3 = H*Z1*Z2 */ | 1202 | /* Z3 = H*Z1*Z2 */ |
| 1298 | sp_256_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/); | 1203 | sp_256to512z_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/); |
| 1299 | sp_256_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/); | 1204 | sp_256to512z_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/); |
| 1300 | /* X3 = R^2 - H^3 - 2*U1*H^2 */ | 1205 | /* X3 = R^2 - H^3 - 2*U1*H^2 */ |
| 1301 | sp_256_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/); | 1206 | sp_256to512z_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/); |
| 1302 | sp_256_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/); | 1207 | sp_256to512z_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/); |
| 1303 | sp_256_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/); | 1208 | sp_256to512z_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/); |
| 1304 | sp_256_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/); | 1209 | sp_256to512z_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/); |
| 1305 | sp_256_mont_sub_8(r->x, r->x, t5 /*, p256_mod*/); | 1210 | sp_256_mont_sub_8(r->x, r->x, t5 /*, p256_mod*/); |
| 1306 | sp_256_mont_dbl_8(t1, r->y /*, p256_mod*/); | 1211 | sp_256_mont_dbl_8(t1, r->y /*, p256_mod*/); |
| 1307 | sp_256_mont_sub_8(r->x, r->x, t1 /*, p256_mod*/); | 1212 | sp_256_mont_sub_8(r->x, r->x, t1 /*, p256_mod*/); |
| 1308 | /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ | 1213 | /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ |
| 1309 | sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/); | 1214 | sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/); |
| 1310 | sp_256_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/); | 1215 | sp_256to512z_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/); |
| 1311 | sp_256_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/); | 1216 | sp_256to512z_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/); |
| 1312 | sp_256_mont_sub_8(r->y, r->y, t5 /*, p256_mod*/); | 1217 | sp_256_mont_sub_8(r->y, r->y, t5 /*, p256_mod*/); |
| 1313 | } | 1218 | } |
| 1314 | 1219 | ||
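And the addition sequence in the last hunk follows the textbook Jacobian point addition, matching its step comments:

```latex
U_1 = X_1 Z_2^2,\quad U_2 = X_2 Z_1^2,\quad S_1 = Y_1 Z_2^3,\quad S_2 = Y_2 Z_1^3,
\qquad H = U_2 - U_1,\quad R = S_2 - S_1,
\qquad Z_3 = H\,Z_1 Z_2,\quad X_3 = R^2 - H^3 - 2\,U_1 H^2,\quad Y_3 = R\,(U_1 H^2 - X_3) - S_1 H^3
```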
