author     Denys Vlasenko <vda.linux@googlemail.com>  2021-10-05 20:00:50 +0200
committer  Denys Vlasenko <vda.linux@googlemail.com>  2021-10-05 20:01:38 +0200
commit     3b411ebbfc749f9f12b0eb739cb5ba3ec052197e (patch)
tree       da81f1546b78d25f4ac63e612e76cb27aa4c2db3
parent     55578f2fb7c05357fb0b1ce84b616ba8ffd6d907 (diff)
tls: replace "26-bit" P256 code with 32-bit one.
function old new delta
sp_256_ecc_mulmod_8 - 1171 +1171
sp_256_mod_mul_norm_8 - 834 +834
sp_256_proj_point_dbl_8 - 374 +374
sp_256_mont_reduce_8 - 268 +268
sp_256_mont_mul_8 - 151 +151
sp_256_sub_8 - 76 +76
sp_256_add_8 - 76 +76
sp_256_cmp_8 - 38 +38
static.sp_256_mont_dbl_8 - 31 +31
static.sp_256_mont_sub_8 - 29 +29
sp_256_to_bin_8 - 28 +28
sp_256_point_from_bin2x32 50 73 +23
sp_256_mont_sqr_8 - 7 +7
sp_256_mont_sqr_10 7 - -7
p256_mod 40 32 -8
curve_P256_compute_pubkey_and_premaster 186 167 -19
sp_256_sub_10 22 - -22
sp_256_add_10 22 - -22
sp_256_cmp_10 24 - -24
sp_256_norm_10 31 - -31
static.sp_256_mont_sub_10 49 - -49
static.sp_256_mont_dbl_10 52 - -52
static.sp_256_mul_add_10 82 - -82
sp_256_from_bin_10 119 - -119
sp_256_to_bin_10 120 - -120
sp_256_mont_reduce_10 178 - -178
sp_256_mont_mul_10 214 - -214
sp_256_proj_point_dbl_10 451 - -451
sp_256_ecc_mulmod_10 1216 - -1216
sp_256_mod_mul_norm_10 1305 - -1305
------------------------------------------------------------------------------
(add/remove: 12/15 grow/shrink: 1/2 up/down: 3106/-3919) Total: -813 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--  networking/tls.c          15
-rw-r--r--  networking/tls_sp_c32.c 1071
2 files changed, 588 insertions, 498 deletions
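
The heart of the change is the limb representation: the old code kept a 256-bit field element in ten 26-bit limbs, the new code uses eight full 32-bit limbs, so normalization becomes a no-op (sp_256_norm_8 below) and add/sub map directly onto the machine carry chain. Both encodings of the P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1, side by side, as a standalone sketch (not part of the patch; helper names are made up):

#include <stdint.h>

/* old: ten 26-bit limbs (top limb 22 bits), little-endian limb order */
static const uint32_t p256_mod_26[10] = {
	0x3ffffff,0x3ffffff,0x3ffffff,0x003ffff,0x0000000,
	0x0000000,0x0000000,0x0000400,0x3ff0000,0x03fffff,
};
/* new: eight 32-bit limbs, little-endian limb order */
static const uint32_t p256_mod_32[8] = {
	0xffffffff,0xffffffff,0xffffffff,0x00000000,
	0x00000000,0x00000000,0x00000001,0xffffffff,
};

/* bit i of the represented value, in each encoding */
static int bit26(const uint32_t *a, int i) { return (a[i / 26] >> (i % 26)) & 1; }
static int bit32(const uint32_t *a, int i) { return (a[i / 32] >> (i % 32)) & 1; }
/* for (i = 0; i < 256; i++): bit26(p256_mod_26, i) == bit32(p256_mod_32, i) */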
diff --git a/networking/tls.c b/networking/tls.c
index 4f0e2b6eb..675ef4b3a 100644
--- a/networking/tls.c
+++ b/networking/tls.c
@@ -2334,7 +2334,6 @@ void FAST_FUNC tls_run_copy_loop(tls_state_t *tls, unsigned flags) | |||
2334 | // e.g. at the very beginning of wget_main() | 2334 | // e.g. at the very beginning of wget_main() |
2335 | // | 2335 | // |
2336 | { | 2336 | { |
2337 | //kbuild:lib-$(CONFIG_TLS) += tls_sp_c32_new.o | ||
2338 | uint8_t ecc_pub_key32[2 * 32]; | 2337 | uint8_t ecc_pub_key32[2 * 32]; |
2339 | uint8_t pubkey2x32[2 * 32]; | 2338 | uint8_t pubkey2x32[2 * 32]; |
2340 | uint8_t premaster32[32]; | 2339 | uint8_t premaster32[32]; |
@@ -2345,14 +2344,14 @@ void FAST_FUNC tls_run_copy_loop(tls_state_t *tls, unsigned flags) | |||
2345 | // memset(ecc_pub_key32, 0x00, sizeof(ecc_pub_key32)); | 2344 | // memset(ecc_pub_key32, 0x00, sizeof(ecc_pub_key32)); |
2346 | // ecc_pub_key32[18] = 0xab; | 2345 | // ecc_pub_key32[18] = 0xab; |
2347 | //Random key: | 2346 | //Random key: |
2348 | tls_get_random(ecc_pub_key32, sizeof(ecc_pub_key32)); | 2347 | // tls_get_random(ecc_pub_key32, sizeof(ecc_pub_key32)); |
2349 | //Biased random (almost all zeros or almost all ones): | 2348 | //Biased random (almost all zeros or almost all ones): |
2350 | // srand(time(NULL) ^ getpid()); | 2349 | srand(time(NULL) ^ getpid()); |
2351 | // if (rand() & 1) | 2350 | if (rand() & 1) |
2352 | // memset(ecc_pub_key32, 0x00, sizeof(ecc_pub_key32)); | 2351 | memset(ecc_pub_key32, 0x00, sizeof(ecc_pub_key32)); |
2353 | // else | 2352 | else |
2354 | // memset(ecc_pub_key32, 0xff, sizeof(ecc_pub_key32)); | 2353 | memset(ecc_pub_key32, 0xff, sizeof(ecc_pub_key32)); |
2355 | // ecc_pub_key32[rand() & 0x3f] = rand(); | 2354 | ecc_pub_key32[rand() & 0x3f] = rand(); |
2356 | 2355 | ||
2357 | xmove_fd(xopen("p256.OLD", O_WRONLY | O_CREAT | O_TRUNC), 2); | 2356 | xmove_fd(xopen("p256.OLD", O_WRONLY | O_CREAT | O_TRUNC), 2); |
2358 | curve_P256_compute_pubkey_and_premaster( | 2357 | curve_P256_compute_pubkey_and_premaster( |
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index bba22dee3..b99951890 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -9,6 +9,8 @@ | |||
9 | #define FIXED_SECRET 0 | 9 | #define FIXED_SECRET 0 |
10 | #define FIXED_PEER_PUBKEY 0 | 10 | #define FIXED_PEER_PUBKEY 0 |
11 | 11 | ||
12 | #define ALLOW_ASM 1 | ||
13 | |||
12 | #if SP_DEBUG | 14 | #if SP_DEBUG |
13 | # define dbg(...) fprintf(stderr, __VA_ARGS__) | 15 | # define dbg(...) fprintf(stderr, __VA_ARGS__) |
14 | static void dump_hex(const char *fmt, const void *vp, int len) | 16 | static void dump_hex(const char *fmt, const void *vp, int len) |
@@ -24,7 +26,8 @@ static void dump_hex(const char *fmt, const void *vp, int len) | |||
24 | # define dump_hex(...) ((void)0) | 26 | # define dump_hex(...) ((void)0) |
25 | #endif | 27 | #endif |
26 | 28 | ||
27 | typedef int32_t sp_digit; | 29 | typedef uint32_t sp_digit; |
30 | typedef int32_t signed_sp_digit; | ||
28 | 31 | ||
29 | /* The code below is taken from parts of | 32 | /* The code below is taken from parts of |
30 | * wolfssl-3.15.3/wolfcrypt/src/sp_c32.c | 33 | * wolfssl-3.15.3/wolfcrypt/src/sp_c32.c |
@@ -32,53 +35,23 @@ typedef int32_t sp_digit; | |||
32 | * Header comment is kept intact: | 35 | * Header comment is kept intact: |
33 | */ | 36 | */ |
34 | 37 | ||
35 | /* sp.c | ||
36 | * | ||
37 | * Copyright (C) 2006-2018 wolfSSL Inc. | ||
38 | * | ||
39 | * This file is part of wolfSSL. | ||
40 | * | ||
41 | * wolfSSL is free software; you can redistribute it and/or modify | ||
42 | * it under the terms of the GNU General Public License as published by | ||
43 | * the Free Software Foundation; either version 2 of the License, or | ||
44 | * (at your option) any later version. | ||
45 | * | ||
46 | * wolfSSL is distributed in the hope that it will be useful, | ||
47 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
48 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
49 | * GNU General Public License for more details. | ||
50 | * | ||
51 | * You should have received a copy of the GNU General Public License | ||
52 | * along with this program; if not, write to the Free Software | ||
53 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA | ||
54 | */ | ||
55 | |||
56 | /* Implementation by Sean Parkinson. */ | ||
57 | |||
58 | typedef struct sp_point { | 38 | typedef struct sp_point { |
59 | sp_digit x[2 * 10]; | 39 | sp_digit x[2 * 8]; |
60 | sp_digit y[2 * 10]; | 40 | sp_digit y[2 * 8]; |
61 | sp_digit z[2 * 10]; | 41 | sp_digit z[2 * 8]; |
62 | int infinity; | 42 | int infinity; |
63 | } sp_point; | 43 | } sp_point; |
64 | 44 | ||
65 | /* The modulus (prime) of the curve P256. */ | 45 | /* The modulus (prime) of the curve P256. */ |
66 | static const sp_digit p256_mod[10] = { | 46 | static const sp_digit p256_mod[8] = { |
67 | 0x3ffffff,0x3ffffff,0x3ffffff,0x003ffff,0x0000000, | 47 | 0xffffffff,0xffffffff,0xffffffff,0x00000000, |
68 | 0x0000000,0x0000000,0x0000400,0x3ff0000,0x03fffff, | 48 | 0x00000000,0x00000000,0x00000001,0xffffffff, |
69 | }; | 49 | }; |
70 | 50 | ||
71 | #define p256_mp_mod ((sp_digit)0x000001) | 51 | #define p256_mp_mod ((sp_digit)0x000001) |
72 | 52 | ||
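
p256_mp_mod is the digit -p^-1 mod 2^32 used by Montgomery reduction (the "negative inverse of m" in the comment further down); it is 1 for P-256 because the low limb of p is 0xffffffff, i.e. p == -1 (mod 2^32). A one-line compile-time check, my own sanity check rather than part of the patch:

/* Montgomery reduction needs p * mp == -1 (mod 2^32); with the low limb
 * of p being 0xffffffff and mp == 1 this holds trivially. */
_Static_assert(0xffffffffu * 1u == 0xffffffffu, "p * p256_mp_mod == -1 mod 2^32");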
73 | /* Normalize the values in each word to 26 bits. */ | 53 | /* Normalize the values in each word to 32 bits - NOP */ |
74 | static void sp_256_norm_10(sp_digit* a) | 54 | #define sp_256_norm_8(a) ((void)0) |
75 | { | ||
76 | int i; | ||
77 | for (i = 0; i < 9; i++) { | ||
78 | a[i+1] += a[i] >> 26; | ||
79 | a[i] &= 0x3ffffff; | ||
80 | } | ||
81 | } | ||
82 | 55 | ||
83 | /* Write r as big endian to byte array. | 56 | /* Write r as big endian to byte array. |
84 | * Fixed length number of bytes written: 32 | 57 | * Fixed length number of bytes written: 32 |
@@ -86,31 +59,17 @@ static void sp_256_norm_10(sp_digit* a) | |||
86 | * r A single precision integer. | 59 | * r A single precision integer. |
87 | * a Byte array. | 60 | * a Byte array. |
88 | */ | 61 | */ |
89 | static void sp_256_to_bin_10(sp_digit* r, uint8_t* a) | 62 | static void sp_256_to_bin_8(const sp_digit* r, uint8_t* a) |
90 | { | 63 | { |
91 | int i, j, s = 0, b; | 64 | int i; |
92 | 65 | ||
93 | sp_256_norm_10(r); | 66 | sp_256_norm_8(r); |
94 | 67 | ||
95 | j = 256 / 8 - 1; | 68 | r += 8; |
96 | a[j] = 0; | 69 | for (i = 0; i < 8; i++) { |
97 | for (i = 0; i < 10 && j >= 0; i++) { | 70 | r--; |
98 | b = 0; | 71 | move_to_unaligned32(a, SWAP_BE32(*r)); |
99 | a[j--] |= r[i] << s; | 72 | a += 4; |
100 | b += 8 - s; | ||
101 | if (j < 0) | ||
102 | break; | ||
103 | while (b < 26) { | ||
104 | a[j--] = r[i] >> b; | ||
105 | b += 8; | ||
106 | if (j < 0) | ||
107 | break; | ||
108 | } | ||
109 | s = 8 - (b - 26); | ||
110 | if (j >= 0) | ||
111 | a[j] = 0; | ||
112 | if (s != 0) | ||
113 | j++; | ||
114 | } | 73 | } |
115 | } | 74 | } |
116 | 75 | ||
@@ -120,67 +79,32 @@ static void sp_256_to_bin_10(sp_digit* r, uint8_t* a) | |||
120 | * a Byte array. | 79 | * a Byte array. |
121 | * n Number of bytes in array to read. | 80 | * n Number of bytes in array to read. |
122 | */ | 81 | */ |
123 | static void sp_256_from_bin_10(sp_digit* r, const uint8_t* a) | 82 | static void sp_256_from_bin_8(sp_digit* r, const uint8_t* a) |
124 | { | 83 | { |
125 | int i, j = 0, s = 0; | 84 | int i; |
126 | 85 | ||
127 | r[0] = 0; | 86 | r += 8; |
128 | for (i = 32 - 1; i >= 0; i--) { | 87 | for (i = 0; i < 8; i++) { |
129 | r[j] |= ((sp_digit)a[i]) << s; | 88 | sp_digit v; |
130 | if (s >= 18) { | 89 | move_from_unaligned32(v, a); |
131 | r[j] &= 0x3ffffff; | 90 | *--r = SWAP_BE32(v); |
132 | s = 26 - s; | 91 | a += 4; |
133 | r[++j] = a[i] >> s; | ||
134 | s = 8 - s; | ||
135 | } | ||
136 | else | ||
137 | s += 8; | ||
138 | } | 92 | } |
139 | } | 93 | } |
140 | 94 | ||
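
move_to_unaligned32/move_from_unaligned32 and SWAP_BE32 are busybox helpers; the byte<->limb mapping they implement here is simply 32 big-endian bytes to/from eight little-endian 32-bit limbs. A portable sketch of the same two conversions (hypothetical names, not from the patch):

#include <stdint.h>

/* 32 big-endian bytes -> 8 little-endian 32-bit limbs (cf. sp_256_from_bin_8) */
static void bin_to_limbs(uint32_t r[8], const uint8_t a[32])
{
	int i;
	for (i = 0; i < 8; i++) {
		const uint8_t *p = a + 4 * (7 - i);   /* most significant word first */
		r[i] = ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16)
		     | ((uint32_t)p[2] << 8) | p[3];
	}
}
/* 8 little-endian 32-bit limbs -> 32 big-endian bytes (cf. sp_256_to_bin_8) */
static void limbs_to_bin(const uint32_t r[8], uint8_t a[32])
{
	int i;
	for (i = 0; i < 8; i++) {
		uint32_t v = r[7 - i];                /* most significant word first */
		a[4*i + 0] = v >> 24;
		a[4*i + 1] = v >> 16;
		a[4*i + 2] = v >> 8;
		a[4*i + 3] = v;
	}
}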
141 | #if SP_DEBUG | 95 | #if SP_DEBUG |
142 | static void dump_256(const char *fmt, const sp_digit* cr) | 96 | static void dump_256(const char *fmt, const sp_digit* r) |
143 | { | 97 | { |
144 | sp_digit* r = (sp_digit*)cr; | ||
145 | uint8_t b32[32]; | 98 | uint8_t b32[32]; |
146 | sp_256_to_bin_10(r, b32); | 99 | sp_256_to_bin_8(r, b32); |
147 | dump_hex(fmt, b32, 32); | 100 | dump_hex(fmt, b32, 32); |
148 | } | 101 | } |
149 | static void dump_512(const char *fmt, const sp_digit* cr) | 102 | static void dump_512(const char *fmt, const sp_digit* r) |
150 | { | 103 | { |
151 | sp_digit* r = (sp_digit*)cr; | 104 | uint8_t b64[64]; |
152 | uint8_t a[64]; | 105 | sp_256_to_bin_8(r, b64 + 32); |
153 | int i, j, s, b; | 106 | sp_256_to_bin_8(r+8, b64); |
154 | 107 | dump_hex(fmt, b64, 64); | |
155 | /* sp_512_norm_10: */ | ||
156 | for (i = 0; i < 19; i++) { | ||
157 | r[i+1] += r[i] >> 26; | ||
158 | r[i] &= 0x3ffffff; | ||
159 | } | ||
160 | /* sp_512_to_bin_10: */ | ||
161 | s = 0; | ||
162 | j = 512 / 8 - 1; | ||
163 | a[j] = 0; | ||
164 | for (i = 0; i < 20 && j >= 0; i++) { | ||
165 | b = 0; | ||
166 | a[j--] |= r[i] << s; | ||
167 | b += 8 - s; | ||
168 | if (j < 0) | ||
169 | break; | ||
170 | while (b < 26) { | ||
171 | a[j--] = r[i] >> b; | ||
172 | b += 8; | ||
173 | if (j < 0) | ||
174 | break; | ||
175 | } | ||
176 | s = 8 - (b - 26); | ||
177 | if (j >= 0) | ||
178 | a[j] = 0; | ||
179 | if (s != 0) | ||
180 | j++; | ||
181 | } | ||
182 | |||
183 | dump_hex(fmt, a, 64); | ||
184 | } | 108 | } |
185 | #else | 109 | #else |
186 | # define dump_256(...) ((void)0) | 110 | # define dump_256(...) ((void)0) |
@@ -192,8 +116,8 @@ static void sp_256_point_from_bin2x32(sp_point* p, const uint8_t *bin2x32) | |||
192 | { | 116 | { |
193 | memset(p, 0, sizeof(*p)); | 117 | memset(p, 0, sizeof(*p)); |
194 | /*p->infinity = 0;*/ | 118 | /*p->infinity = 0;*/ |
195 | sp_256_from_bin_10(p->x, bin2x32); | 119 | sp_256_from_bin_8(p->x, bin2x32); |
196 | sp_256_from_bin_10(p->y, bin2x32 + 32); | 120 | sp_256_from_bin_8(p->y, bin2x32 + 32); |
197 | p->z[0] = 1; /* p->z = 1 */ | 121 | p->z[0] = 1; /* p->z = 1 */ |
198 | } | 122 | } |
199 | 123 | ||
@@ -202,170 +126,303 @@ static void sp_256_point_from_bin2x32(sp_point* p, const uint8_t *bin2x32) | |||
202 | * return -ve, 0 or +ve if a is less than, equal to or greater than b | 126 | * return -ve, 0 or +ve if a is less than, equal to or greater than b |
203 | * respectively. | 127 | * respectively. |
204 | */ | 128 | */ |
205 | static sp_digit sp_256_cmp_10(const sp_digit* a, const sp_digit* b) | 129 | static signed_sp_digit sp_256_cmp_8(const sp_digit* a, const sp_digit* b) |
206 | { | 130 | { |
207 | sp_digit r; | ||
208 | int i; | 131 | int i; |
209 | for (i = 9; i >= 0; i--) { | 132 | for (i = 7; i >= 0; i--) { |
210 | r = a[i] - b[i]; | 133 | /* signed_sp_digit r = a[i] - b[i]; |
211 | if (r != 0) | 134 | * if (r != 0) |
212 | break; | 135 | * return r; |
136 | * does not work: think about a[i]=0, b[i]=0xffffffff | ||
137 | */ | ||
138 | if (a[i] == b[i]) | ||
139 | continue; | ||
140 | return (a[i] > b[i]) * 2 - 1; | ||
213 | } | 141 | } |
214 | return r; | 142 | return 0; |
215 | } | 143 | } |
216 | 144 | ||
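
The comment in the new sp_256_cmp_8 deserves a concrete example: with 32-bit unsigned limbs, the difference a[i] - b[i] no longer fits the signed return convention the 26-bit code relied on. A minimal illustration (not from the patch):

#include <stdint.h>

/* Failing case for the naive "return a[i] - b[i]": with unsigned limbs,
 * a = 0x00000000, b = 0xffffffff gives a - b == 1, which as a signed
 * return value claims "a > b" although a < b.  The branchless form used
 * above returns the sign explicitly instead: */
static int cmp_limb(uint32_t a, uint32_t b)
{
	if (a == b)
		return 0;
	return (a > b) * 2 - 1;   /* +1 if a > b, -1 if a < b */
}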
217 | /* Compare two numbers to determine if they are equal. | 145 | /* Compare two numbers to determine if they are equal. |
218 | * | 146 | * |
219 | * return 1 when equal and 0 otherwise. | 147 | * return 1 when equal and 0 otherwise. |
220 | */ | 148 | */ |
221 | static int sp_256_cmp_equal_10(const sp_digit* a, const sp_digit* b) | 149 | static int sp_256_cmp_equal_8(const sp_digit* a, const sp_digit* b) |
222 | { | 150 | { |
223 | return sp_256_cmp_10(a, b) == 0; | 151 | return sp_256_cmp_8(a, b) == 0; |
224 | } | 152 | } |
225 | 153 | ||
226 | /* Add b to a into r. (r = a + b) */ | 154 | /* Add b to a into r. (r = a + b). Return !0 on overflow */ |
227 | static void sp_256_add_10(sp_digit* r, const sp_digit* a, const sp_digit* b) | 155 | static int sp_256_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b) |
228 | { | 156 | { |
157 | #if ALLOW_ASM && defined(__GNUC__) && defined(__i386__) | ||
158 | sp_digit reg; | ||
159 | asm volatile ( | ||
160 | "\n movl (%0), %3" | ||
161 | "\n addl (%1), %3" | ||
162 | "\n movl %3, (%2)" | ||
163 | "\n" | ||
164 | "\n movl 1*4(%0), %3" | ||
165 | "\n adcl 1*4(%1), %3" | ||
166 | "\n movl %3, 1*4(%2)" | ||
167 | "\n" | ||
168 | "\n movl 2*4(%0), %3" | ||
169 | "\n adcl 2*4(%1), %3" | ||
170 | "\n movl %3, 2*4(%2)" | ||
171 | "\n" | ||
172 | "\n movl 3*4(%0), %3" | ||
173 | "\n adcl 3*4(%1), %3" | ||
174 | "\n movl %3, 3*4(%2)" | ||
175 | "\n" | ||
176 | "\n movl 4*4(%0), %3" | ||
177 | "\n adcl 4*4(%1), %3" | ||
178 | "\n movl %3, 4*4(%2)" | ||
179 | "\n" | ||
180 | "\n movl 5*4(%0), %3" | ||
181 | "\n adcl 5*4(%1), %3" | ||
182 | "\n movl %3, 5*4(%2)" | ||
183 | "\n" | ||
184 | "\n movl 6*4(%0), %3" | ||
185 | "\n adcl 6*4(%1), %3" | ||
186 | "\n movl %3, 6*4(%2)" | ||
187 | "\n" | ||
188 | "\n movl 7*4(%0), %3" | ||
189 | "\n adcl 7*4(%1), %3" | ||
190 | "\n movl %3, 7*4(%2)" | ||
191 | "\n" | ||
192 | "\n sbbl %3, %3" | ||
193 | "\n" | ||
194 | : "=r" (a), "=r" (b), "=r" (r), "=r" (reg) | ||
195 | : "0" (a), "1" (b), "2" (r) | ||
196 | : "memory" | ||
197 | ); | ||
198 | return reg; | ||
199 | #else | ||
229 | int i; | 200 | int i; |
230 | for (i = 0; i < 10; i++) | 201 | sp_digit carry; |
231 | r[i] = a[i] + b[i]; | 202 | |
203 | carry = 0; | ||
204 | for (i = 0; i < 8; i++) { | ||
205 | sp_digit w, v; | ||
206 | w = b[i] + carry; | ||
207 | v = a[i]; | ||
208 | if (w != 0) { | ||
209 | v = a[i] + w; | ||
210 | carry = (v < a[i]); | ||
211 | /* hope compiler detects above as "carry flag set" */ | ||
212 | } | ||
213 | /* else: b + carry == 0, two cases: | ||
214 | * b:ffffffff, carry:1 | ||
215 | * b:00000000, carry:0 | ||
216 | * in either case, r[i] = a[i] and carry remains unchanged | ||
217 | */ | ||
218 | r[i] = v; | ||
219 | } | ||
220 | return carry; | ||
221 | #endif | ||
232 | } | 222 | } |
233 | 223 | ||
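
On targets without the i386 asm, the portable fallback derives the carry from an unsigned wrap-around compare. One step of it in isolation, including the b + carry == 0 corner case the comment describes (sketch only, hypothetical name):

#include <stdint.h>

/* One limb of a multi-word add.  If b + carry_in wraps to 0 (either
 * b == 0xffffffff with carry_in == 1, or b == 0 with carry_in == 0),
 * the result is just 'a' and the carry passes through unchanged. */
static uint32_t add_limb(uint32_t a, uint32_t b, uint32_t *carry)
{
	uint32_t w = b + *carry;
	uint32_t v = a;
	if (w != 0) {
		v = a + w;
		*carry = (v < a);   /* unsigned wrap == carry out */
	}
	return v;
}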
234 | /* Sub b from a into r. (r = a - b) */ | 224 | /* Sub b from a into r. (r = a - b). Return !0 on underflow */ |
235 | static void sp_256_sub_10(sp_digit* r, const sp_digit* a, const sp_digit* b) | 225 | static int sp_256_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b) |
236 | { | 226 | { |
227 | #if ALLOW_ASM && defined(__GNUC__) && defined(__i386__) | ||
228 | sp_digit reg; | ||
229 | asm volatile ( | ||
230 | "\n movl (%0), %3" | ||
231 | "\n subl (%1), %3" | ||
232 | "\n movl %3, (%2)" | ||
233 | "\n" | ||
234 | "\n movl 1*4(%0), %3" | ||
235 | "\n sbbl 1*4(%1), %3" | ||
236 | "\n movl %3, 1*4(%2)" | ||
237 | "\n" | ||
238 | "\n movl 2*4(%0), %3" | ||
239 | "\n sbbl 2*4(%1), %3" | ||
240 | "\n movl %3, 2*4(%2)" | ||
241 | "\n" | ||
242 | "\n movl 3*4(%0), %3" | ||
243 | "\n sbbl 3*4(%1), %3" | ||
244 | "\n movl %3, 3*4(%2)" | ||
245 | "\n" | ||
246 | "\n movl 4*4(%0), %3" | ||
247 | "\n sbbl 4*4(%1), %3" | ||
248 | "\n movl %3, 4*4(%2)" | ||
249 | "\n" | ||
250 | "\n movl 5*4(%0), %3" | ||
251 | "\n sbbl 5*4(%1), %3" | ||
252 | "\n movl %3, 5*4(%2)" | ||
253 | "\n" | ||
254 | "\n movl 6*4(%0), %3" | ||
255 | "\n sbbl 6*4(%1), %3" | ||
256 | "\n movl %3, 6*4(%2)" | ||
257 | "\n" | ||
258 | "\n movl 7*4(%0), %3" | ||
259 | "\n sbbl 7*4(%1), %3" | ||
260 | "\n movl %3, 7*4(%2)" | ||
261 | "\n" | ||
262 | "\n sbbl %3, %3" | ||
263 | "\n" | ||
264 | : "=r" (a), "=r" (b), "=r" (r), "=r" (reg) | ||
265 | : "0" (a), "1" (b), "2" (r) | ||
266 | : "memory" | ||
267 | ); | ||
268 | return reg; | ||
269 | #else | ||
237 | int i; | 270 | int i; |
238 | for (i = 0; i < 10; i++) | 271 | sp_digit borrow; |
239 | r[i] = a[i] - b[i]; | 272 | |
273 | borrow = 0; | ||
274 | for (i = 0; i < 8; i++) { | ||
275 | sp_digit w, v; | ||
276 | w = b[i] + borrow; | ||
277 | v = a[i]; | ||
278 | if (w != 0) { | ||
279 | v = a[i] - w; | ||
280 | borrow = (v > a[i]); | ||
281 | /* hope compiler detects above as "carry flag set" */ | ||
282 | } | ||
283 | /* else: b + borrow == 0, two cases: | ||
284 | * b:ffffffff, borrow:1 | ||
285 | * b:00000000, borrow:0 | ||
286 | * in either case, r[i] = a[i] and borrow remains unchanged | ||
287 | */ | ||
288 | r[i] = v; | ||
289 | } | ||
290 | return borrow; | ||
291 | #endif | ||
240 | } | 292 | } |
241 | 293 | ||
242 | /* Multiply a and b into r. (r = a * b) */ | 294 | /* Multiply a and b into r. (r = a * b) */ |
243 | static void sp_256_mul_10(sp_digit* r, const sp_digit* a, const sp_digit* b) | 295 | static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) |
244 | { | 296 | { |
297 | sp_digit rr[15]; /* in case r coincides with a or b */ | ||
245 | int i, j, k; | 298 | int i, j, k; |
246 | int64_t c; | 299 | uint64_t acc; |
247 | 300 | ||
248 | c = ((int64_t)a[9]) * b[9]; | 301 | acc = 0; |
249 | r[19] = (sp_digit)(c >> 26); | 302 | for (k = 0; k < 15; k++) { |
250 | c = (c & 0x3ffffff) << 26; | 303 | uint32_t acc_hi; |
251 | for (k = 17; k >= 0; k--) { | 304 | i = k - 7; |
252 | for (i = 9; i >= 0; i--) { | 305 | if (i < 0) |
253 | j = k - i; | 306 | i = 0; |
254 | if (j >= 10) | 307 | j = k - i; |
255 | break; | 308 | acc_hi = 0; |
256 | if (j < 0) | 309 | while (i != 8 && i <= k) { |
257 | continue; | 310 | uint64_t m = ((uint64_t)a[i]) * b[j]; |
258 | c += ((int64_t)a[i]) * b[j]; | 311 | acc += m; |
312 | if (acc < m) | ||
313 | acc_hi++; | ||
314 | j--; | ||
315 | i++; | ||
259 | } | 316 | } |
260 | r[k + 2] += c >> 52; | 317 | rr[k] = acc; |
261 | r[k + 1] = (c >> 26) & 0x3ffffff; | 318 | acc = (acc >> 32) | ((uint64_t)acc_hi << 32); |
262 | c = (c & 0x3ffffff) << 26; | ||
263 | } | 319 | } |
264 | r[0] = (sp_digit)(c >> 26); | 320 | r[15] = acc; |
321 | memcpy(r, rr, sizeof(rr)); | ||
265 | } | 322 | } |
266 | 323 | ||
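
The new sp_256_mul_8 accumulates each product column in a 64-bit accumulator and counts its overflows by hand in acc_hi. Where unsigned __int128 is available (GCC/Clang targets), a wider accumulator makes a convenient reference to test the column logic against; a test-only sketch under that assumption:

#include <stdint.h>

/* Reference 256x256 -> 512 bit schoolbook multiply, 128-bit accumulator. */
static void mul_8_ref(uint32_t r[16], const uint32_t a[8], const uint32_t b[8])
{
	unsigned __int128 acc = 0;
	int i, j, k;
	for (k = 0; k < 15; k++) {
		for (i = 0; i < 8; i++) {
			j = k - i;
			if (j < 0 || j > 7)
				continue;
			acc += (uint64_t)a[i] * b[j];
		}
		r[k] = (uint32_t)acc;   /* low 32 bits of column k */
		acc >>= 32;             /* carry into column k+1 */
	}
	r[15] = (uint32_t)acc;
}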
267 | /* Shift number right one bit. Bottom bit is lost. */ | 324 | /* Shift number right one bit. Bottom bit is lost. */ |
268 | static void sp_256_rshift1_10(sp_digit* r, sp_digit* a) | 325 | static void sp_256_rshift1_8(sp_digit* r, sp_digit* a, sp_digit carry) |
269 | { | 326 | { |
270 | int i; | 327 | int i; |
271 | for (i = 0; i < 9; i++) | 328 | |
272 | r[i] = ((a[i] >> 1) | (a[i + 1] << 25)) & 0x3ffffff; | 329 | carry = (!!carry << 31); |
273 | r[9] = a[9] >> 1; | 330 | for (i = 7; i >= 0; i--) { |
331 | sp_digit c = a[i] << 31; | ||
332 | r[i] = (a[i] >> 1) | carry; | ||
333 | carry = c; | ||
334 | } | ||
274 | } | 335 | } |
275 | 336 | ||
276 | /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) */ | 337 | /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) */ |
277 | static void sp_256_div2_10(sp_digit* r, const sp_digit* a, const sp_digit* m) | 338 | static void sp_256_div2_8(sp_digit* r, const sp_digit* a, const sp_digit* m) |
278 | { | 339 | { |
340 | int carry = 0; | ||
279 | if (a[0] & 1) | 341 | if (a[0] & 1) |
280 | sp_256_add_10(r, a, m); | 342 | carry = sp_256_add_8(r, a, m); |
281 | sp_256_norm_10(r); | 343 | sp_256_norm_8(r); |
282 | sp_256_rshift1_10(r, r); | 344 | sp_256_rshift1_8(r, r, carry); |
283 | } | 345 | } |
284 | 346 | ||
285 | /* Add two Montgomery form numbers (r = a + b % m) */ | 347 | /* Add two Montgomery form numbers (r = a + b % m) */ |
286 | static void sp_256_mont_add_10(sp_digit* r, const sp_digit* a, const sp_digit* b, | 348 | static void sp_256_mont_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b, |
287 | const sp_digit* m) | 349 | const sp_digit* m) |
288 | { | 350 | { |
289 | sp_256_add_10(r, a, b); | 351 | int carry = sp_256_add_8(r, a, b); |
290 | sp_256_norm_10(r); | 352 | sp_256_norm_8(r); |
291 | if ((r[9] >> 22) > 0) { | 353 | if (carry) { |
292 | sp_256_sub_10(r, r, m); | 354 | sp_256_sub_8(r, r, m); |
293 | sp_256_norm_10(r); | 355 | sp_256_norm_8(r); |
294 | } | 356 | } |
295 | } | 357 | } |
296 | 358 | ||
297 | /* Subtract two Montgomery form numbers (r = a - b % m) */ | 359 | /* Subtract two Montgomery form numbers (r = a - b % m) */ |
298 | static void sp_256_mont_sub_10(sp_digit* r, const sp_digit* a, const sp_digit* b, | 360 | static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b, |
299 | const sp_digit* m) | 361 | const sp_digit* m) |
300 | { | 362 | { |
301 | sp_256_sub_10(r, a, b); | 363 | int borrow; |
302 | sp_256_norm_10(r); | 364 | borrow = sp_256_sub_8(r, a, b); |
303 | if (r[9] >> 22) { | 365 | sp_256_norm_8(r); |
304 | sp_256_add_10(r, r, m); | 366 | if (borrow) { |
305 | sp_256_norm_10(r); | 367 | sp_256_add_8(r, r, m); |
306 | r[9] &= 0x03fffff; /* truncate to 22 bits */ | 368 | sp_256_norm_8(r); |
307 | } | 369 | } |
308 | } | 370 | } |
309 | 371 | ||
310 | /* Double a Montgomery form number (r = a + a % m) */ | 372 | /* Double a Montgomery form number (r = a + a % m) */ |
311 | static void sp_256_mont_dbl_10(sp_digit* r, const sp_digit* a, const sp_digit* m) | 373 | static void sp_256_mont_dbl_8(sp_digit* r, const sp_digit* a, const sp_digit* m) |
312 | { | 374 | { |
313 | sp_256_add_10(r, a, a); | 375 | int carry = sp_256_add_8(r, a, a); |
314 | sp_256_norm_10(r); | 376 | sp_256_norm_8(r); |
315 | if ((r[9] >> 22) > 0) | 377 | if (carry) |
316 | sp_256_sub_10(r, r, m); | 378 | sp_256_sub_8(r, r, m); |
317 | sp_256_norm_10(r); | 379 | sp_256_norm_8(r); |
318 | } | 380 | } |
319 | 381 | ||
320 | /* Triple a Montgomery form number (r = a + a + a % m) */ | 382 | /* Triple a Montgomery form number (r = a + a + a % m) */ |
321 | static void sp_256_mont_tpl_10(sp_digit* r, const sp_digit* a, const sp_digit* m) | 383 | static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a, const sp_digit* m) |
322 | { | 384 | { |
323 | sp_256_add_10(r, a, a); | 385 | int carry = sp_256_add_8(r, a, a); |
324 | sp_256_norm_10(r); | 386 | sp_256_norm_8(r); |
325 | if ((r[9] >> 22) > 0) { | 387 | if (carry) { |
326 | sp_256_sub_10(r, r, m); | 388 | sp_256_sub_8(r, r, m); |
327 | sp_256_norm_10(r); | 389 | sp_256_norm_8(r); |
328 | } | 390 | } |
329 | sp_256_add_10(r, r, a); | 391 | carry = sp_256_add_8(r, r, a); |
330 | sp_256_norm_10(r); | 392 | sp_256_norm_8(r); |
331 | if ((r[9] >> 22) > 0) { | 393 | if (carry) { |
332 | sp_256_sub_10(r, r, m); | 394 | sp_256_sub_8(r, r, m); |
333 | sp_256_norm_10(r); | 395 | sp_256_norm_8(r); |
334 | } | 396 | } |
335 | r[9] &= 0x03fffff; /* truncate to 22 bits */ | ||
336 | } | 397 | } |
337 | 398 | ||
338 | /* Shift the result in the high 256 bits down to the bottom. */ | 399 | /* Shift the result in the high 256 bits down to the bottom. */ |
339 | static void sp_256_mont_shift_10(sp_digit* r, const sp_digit* a) | 400 | static void sp_256_mont_shift_8(sp_digit* r, const sp_digit* a) |
340 | { | 401 | { |
341 | int i; | 402 | int i; |
342 | sp_digit n, s; | 403 | |
343 | 404 | for (i = 0; i < 8; i++) { | |
344 | s = a[10]; | 405 | r[i] = a[i+8]; |
345 | n = a[9] >> 22; | 406 | r[i+8] = 0; |
346 | for (i = 0; i < 9; i++) { | ||
347 | n += (s & 0x3ffffff) << 4; | ||
348 | r[i] = n & 0x3ffffff; | ||
349 | n >>= 26; | ||
350 | s = a[11 + i] + (s >> 26); | ||
351 | } | 407 | } |
352 | n += s << 4; | ||
353 | r[9] = n; | ||
354 | memset(&r[10], 0, sizeof(*r) * 10); | ||
355 | } | 408 | } |
356 | 409 | ||
357 | /* Mul a by scalar b and add into r. (r += a * b) */ | 410 | /* Mul a by scalar b and add into r. (r += a * b) */ |
358 | static void sp_256_mul_add_10(sp_digit* r, const sp_digit* a, sp_digit b) | 411 | static int sp_256_mul_add_8(sp_digit* r, const sp_digit* a, sp_digit b) |
359 | { | 412 | { |
360 | int64_t t = 0; | 413 | uint64_t t = 0; |
361 | int i; | 414 | int i; |
362 | 415 | ||
363 | for (i = 0; i < 10; i++) { | 416 | for (i = 0; i < 8; i++) { |
364 | t += ((int64_t)b * a[i]) + r[i]; | 417 | uint32_t t_hi; |
365 | r[i] = t & 0x3ffffff; | 418 | uint64_t m = ((uint64_t)b * a[i]) + r[i]; |
366 | t >>= 26; | 419 | t += m; |
420 | t_hi = (t < m); | ||
421 | r[i] = (sp_digit)t; | ||
422 | t = (t >> 32) | ((uint64_t)t_hi << 32); | ||
367 | } | 423 | } |
368 | r[10] += t; | 424 | r[8] += (sp_digit)t; |
425 | return (r[8] < (sp_digit)t); /* 1 if addition overflowed */ | ||
369 | } | 426 | } |
370 | 427 | ||
371 | /* Reduce the number back to 256 bits using Montgomery reduction. | 428 | /* Reduce the number back to 256 bits using Montgomery reduction. |
@@ -374,7 +431,7 @@ static void sp_256_mul_add_10(sp_digit* r, const sp_digit* a, sp_digit b) | |||
374 | * m The single precision number representing the modulus. | 431 | * m The single precision number representing the modulus. |
375 | * mp The digit representing the negative inverse of m mod 2^n. | 432 | * mp The digit representing the negative inverse of m mod 2^n. |
376 | */ | 433 | */ |
377 | static void sp_256_mont_reduce_10(sp_digit* a /*, const sp_digit* m, sp_digit mp*/) | 434 | static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/) |
378 | { | 435 | { |
379 | const sp_digit* m = p256_mod; | 436 | const sp_digit* m = p256_mod; |
380 | sp_digit mp = p256_mp_mod; | 437 | sp_digit mp = p256_mp_mod; |
@@ -383,33 +440,144 @@ static void sp_256_mont_reduce_10(sp_digit* a /*, const sp_digit* m, sp_digit mp | |||
383 | sp_digit mu; | 440 | sp_digit mu; |
384 | 441 | ||
385 | if (mp != 1) { | 442 | if (mp != 1) { |
386 | for (i = 0; i < 9; i++) { | 443 | int too_wide; |
387 | mu = (a[i] * mp) & 0x3ffffff; | 444 | for (i = 0; i < 7; i++) { |
388 | sp_256_mul_add_10(a+i, m, mu); | 445 | mu = (sp_digit)(a[i] * mp); |
389 | a[i+1] += a[i] >> 26; | 446 | if (sp_256_mul_add_8(a+i, m, mu)) |
447 | (a+i)[9]++; | ||
390 | } | 448 | } |
391 | mu = (a[i] * mp) & 0x03fffff; | 449 | mu = (sp_digit)(a[7] * mp); |
392 | sp_256_mul_add_10(a+i, m, mu); | 450 | too_wide = sp_256_mul_add_8(a+7, m, mu); |
393 | a[i+1] += a[i] >> 26; | 451 | sp_256_mont_shift_8(a, a); |
394 | a[i] &= 0x3ffffff; | 452 | if (too_wide) |
453 | sp_256_sub_8(a, a, m); | ||
454 | sp_256_norm_8(a); | ||
395 | } | 455 | } |
396 | else { /* Same code for explicit mp == 1 (which is always the case for P256) */ | 456 | else { /* Same code for explicit mp == 1 (which is always the case for P256) */ |
397 | for (i = 0; i < 9; i++) { | 457 | sp_digit word16th = 0; |
398 | mu = a[i] & 0x3ffffff; | 458 | for (i = 0; i < 8; i++) { |
399 | sp_256_mul_add_10(a+i, m, mu); | 459 | mu = a[i]; |
400 | a[i+1] += a[i] >> 26; | 460 | //m = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff |
461 | if (sp_256_mul_add_8(a+i, m, mu)) { | ||
462 | int j = i + 8; | ||
463 | inc_next_word: | ||
464 | if (++j > 15) { /* a[16] array has no more words? */ | ||
465 | word16th++; | ||
466 | continue; | ||
467 | } | ||
468 | if (++a[j] == 0) /* did this overflow too? */ | ||
469 | goto inc_next_word; | ||
470 | } | ||
401 | } | 471 | } |
402 | mu = a[i] & 0x03fffff; | 472 | sp_256_mont_shift_8(a, a); |
403 | sp_256_mul_add_10(a+i, m, mu); | 473 | if (word16th != 0) |
404 | a[i+1] += a[i] >> 26; | 474 | sp_256_sub_8(a, a, m); |
405 | a[i] &= 0x3ffffff; | 475 | sp_256_norm_8(a); |
406 | } | 476 | } |
407 | |||
408 | sp_256_mont_shift_10(a, a); | ||
409 | if ((a[9] >> 22) > 0) | ||
410 | sp_256_sub_10(a, a, m); | ||
411 | sp_256_norm_10(a); | ||
412 | } | 477 | } |
478 | #if 0 | ||
479 | //TODO: arm32 asm (also adapt for x86?) | ||
480 | static void sp_256_mont_reduce_8(sp_digit* a, sp_digit* m, sp_digit mp) | ||
481 | { | ||
482 | sp_digit ca = 0; | ||
483 | |||
484 | asm volatile ( | ||
485 | # i = 0 | ||
486 | mov r12, #0 # i = 0 | ||
487 | ldr r10, [%[a], #0] # r10 = a[0] | ||
488 | ldr r14, [%[a], #4] # r14 = a[1] | ||
489 | 1: | ||
490 | # mu = a[i] * mp # | ||
491 | mul r8, %[mp], r10 # mu = a[i] * mp | ||
492 | # a[i+0] += m[0] * mu # | ||
493 | ldr r7, [%[m], #0] # a[i+0] += m[0] * mu | ||
494 | ldr r9, [%[a], #0] # | ||
495 | umull r6, r7, r8, r7 # r7:r6 = mu * m[0] | ||
496 | adds r10, r10, r6 # r5:r10 += r7:r6 | ||
497 | adc r5, r7, #0 # | ||
498 | # a[i+1] += m[1] * mu # | ||
499 | ldr r7, [%[m], #4] # a[i+1] += m[1] * mu | ||
500 | ldr r9, [%[a], #4] # | ||
501 | umull r6, r7, r8, r7 # r7:r6 = mu * m[1] | ||
502 | adds r10, r14, r6 # r4:r10 = r7:r14 + r7:r6 | ||
503 | adc r4, r7, #0 # | ||
504 | adds r10, r10, r5 # r4:r10 += r5 | ||
505 | adc r4, r4, #0 # | ||
506 | # a[i+2] += m[2] * mu # | ||
507 | ldr r7, [%[m], #8] # a[i+2] += m[2] * mu | ||
508 | ldr r14, [%[a], #8] # | ||
509 | umull r6, r7, r8, r7 # | ||
510 | adds r14, r14, r6 # | ||
511 | adc r5, r7, #0 # | ||
512 | adds r14, r14, r4 # | ||
513 | adc r5, r5, #0 # | ||
514 | # a[i+3] += m[3] * mu # | ||
515 | ldr r7, [%[m], #12] # a[i+3] += m[3] * mu | ||
516 | ldr r9, [%[a], #12] # | ||
517 | umull r6, r7, r8, r7 # | ||
518 | adds r9, r9, r6 # | ||
519 | adc r4, r7, #0 # | ||
520 | adds r9, r9, r5 # | ||
521 | str r9, [%[a], #12] # a[3] = r9 | ||
522 | adc r4, r4, #0 # | ||
523 | # a[i+4] += m[4] * mu # | ||
524 | ldr r7, [%[m], #16] # a[i+4] += m[4] * mu | ||
525 | ldr r9, [%[a], #16] # | ||
526 | umull r6, r7, r8, r7 # | ||
527 | adds r9, r9, r6 # | ||
528 | adc r5, r7, #0 # | ||
529 | adds r9, r9, r4 # | ||
530 | str r9, [%[a], #16] # a[4] = r9 | ||
531 | adc r5, r5, #0 # | ||
532 | # a[i+5] += m[5] * mu # | ||
533 | ldr r7, [%[m], #20] # a[i+5] += m[5] * mu | ||
534 | ldr r9, [%[a], #20] # | ||
535 | umull r6, r7, r8, r7 # | ||
536 | adds r9, r9, r6 # | ||
537 | adc r4, r7, #0 # | ||
538 | adds r9, r9, r5 # | ||
539 | str r9, [%[a], #20] # a[5] = r9 | ||
540 | adc r4, r4, #0 # | ||
541 | # a[i+6] += m[6] * mu # | ||
542 | ldr r7, [%[m], #24] # a[i+6] += m[6] * mu | ||
543 | ldr r9, [%[a], #24] # | ||
544 | umull r6, r7, r8, r7 # | ||
545 | adds r9, r9, r6 # | ||
546 | adc r5, r7, #0 # | ||
547 | adds r9, r9, r4 # | ||
548 | str r9, [%[a], #24] # a[6] = r9 | ||
549 | adc r5, r5, #0 # | ||
550 | # a[i+7] += m[7] * mu # | ||
551 | ldr r7, [%[m], #28] # a[i+7] += m[7] * mu | ||
552 | ldr r9, [%[a], #28] # | ||
553 | umull r6, r7, r8, r7 # | ||
554 | adds r5, r5, r6 # | ||
555 | adcs r7, r7, %[ca] # | ||
556 | mov %[ca], #0 # | ||
557 | adc %[ca], %[ca], %[ca] # ca = CF | ||
558 | adds r9, r9, r5 # | ||
559 | str r9, [%[a], #28] # a[7] = r9 | ||
560 | ldr r9, [%[a], #32] # r9 = a[8] | ||
561 | adcs r9, r9, r7 # | ||
562 | str r9, [%[a], #32] # a[8] = r9 | ||
563 | adc %[ca], %[ca], #0 # ca += CF | ||
564 | # i += 1 # i++ | ||
565 | add %[a], %[a], #4 # a++ | ||
566 | add r12, r12, #4 # i += 4 | ||
567 | cmp r12, #32 # if (i < 32) | ||
568 | blt 1b # goto 1 | ||
569 | |||
570 | str r10, [%[a], #0] # a[0] = r10 | ||
571 | str r14, [%[a], #4] # a[1] = r14 | ||
572 | : [ca] "+r" (ca), [a] "+r" (a) | ||
573 | : [m] "r" (m), [mp] "r" (mp) | ||
574 | : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" | ||
575 | ); | ||
576 | |||
577 | if (ca) | ||
578 | a -= m; | ||
579 | } | ||
580 | #endif | ||
413 | 581 | ||
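
In the mp == 1 path above, a carry out of sp_256_mul_add_8 has to ripple upward through the 16-word buffer, starting at the word just above the limb that overflowed, and may fall off the end entirely (the word16th counter). The inc_next_word logic written as a standalone helper, for clarity (sketch, hypothetical name):

#include <stdint.h>

/* Add 1 to a[j] and ripple the carry upward through a 16-word buffer.
 * Returns 1 if the carry propagates past a[15] (the "word16th" case). */
static int ripple_carry_16(uint32_t *a, int j)
{
	while (j <= 15) {
		if (++a[j] != 0)
			return 0;   /* carry absorbed */
		j++;
	}
	return 1;                   /* carry fell off the top */
}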
414 | /* Multiply two Montogmery form numbers mod the modulus (prime). | 582 | /* Multiply two Montogmery form numbers mod the modulus (prime). |
415 | * (r = a * b mod m) | 583 | * (r = a * b mod m) |
@@ -420,14 +588,13 @@ static void sp_256_mont_reduce_10(sp_digit* a /*, const sp_digit* m, sp_digit mp | |||
420 | * m Modulus (prime). | 588 | * m Modulus (prime). |
421 | * mp Montogmery mulitplier. | 589 | * mp Montogmery mulitplier. |
422 | */ | 590 | */ |
423 | static void sp_256_mont_mul_10(sp_digit* r, const sp_digit* a, const sp_digit* b | 591 | static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b |
424 | /*, const sp_digit* m, sp_digit mp*/) | 592 | /*, const sp_digit* m, sp_digit mp*/) |
425 | { | 593 | { |
426 | //const sp_digit* m = p256_mod; | 594 | //const sp_digit* m = p256_mod; |
427 | //sp_digit mp = p256_mp_mod; | 595 | //sp_digit mp = p256_mp_mod; |
428 | 596 | sp_256_mul_8(r, a, b); | |
429 | sp_256_mul_10(r, a, b); | 597 | sp_256_mont_reduce_8(r /*, m, mp*/); |
430 | sp_256_mont_reduce_10(r /*, m, mp*/); | ||
431 | } | 598 | } |
432 | 599 | ||
433 | /* Square the Montgomery form number. (r = a * a mod m) | 600 | /* Square the Montgomery form number. (r = a * a mod m) |
@@ -437,13 +604,12 @@ static void sp_256_mont_mul_10(sp_digit* r, const sp_digit* a, const sp_digit* b | |||
437 | * m Modulus (prime). | 604 | * m Modulus (prime). |
438 | * mp Montogmery mulitplier. | 605 | * mp Montogmery mulitplier. |
439 | */ | 606 | */ |
440 | static void sp_256_mont_sqr_10(sp_digit* r, const sp_digit* a | 607 | static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a |
441 | /*, const sp_digit* m, sp_digit mp*/) | 608 | /*, const sp_digit* m, sp_digit mp*/) |
442 | { | 609 | { |
443 | //const sp_digit* m = p256_mod; | 610 | //const sp_digit* m = p256_mod; |
444 | //sp_digit mp = p256_mp_mod; | 611 | //sp_digit mp = p256_mp_mod; |
445 | 612 | sp_256_mont_mul_8(r, a, a /*, m, mp*/); | |
446 | sp_256_mont_mul_10(r, a, a /*, m, mp*/); | ||
447 | } | 613 | } |
448 | 614 | ||
449 | /* Invert the number, in Montgomery form, modulo the modulus (prime) of the | 615 | /* Invert the number, in Montgomery form, modulo the modulus (prime) of the |
@@ -464,19 +630,19 @@ static const uint32_t p256_mod_2[8] = { | |||
464 | //543210987654321098765432109876543210987654321098765432109876543210...09876543210...09876543210 | 630 | //543210987654321098765432109876543210987654321098765432109876543210...09876543210...09876543210 |
465 | //111111111111111111111111111111110000000000000000000000000000000100...00000111111...11111111101 | 631 | //111111111111111111111111111111110000000000000000000000000000000100...00000111111...11111111101 |
466 | #endif | 632 | #endif |
467 | static void sp_256_mont_inv_10(sp_digit* r, sp_digit* a) | 633 | static void sp_256_mont_inv_8(sp_digit* r, sp_digit* a) |
468 | { | 634 | { |
469 | sp_digit t[2*10]; //can be just [10]? | 635 | sp_digit t[2*8]; //can be just [8]? |
470 | int i; | 636 | int i; |
471 | 637 | ||
472 | memcpy(t, a, sizeof(sp_digit) * 10); | 638 | memcpy(t, a, sizeof(sp_digit) * 8); |
473 | for (i = 254; i >= 0; i--) { | 639 | for (i = 254; i >= 0; i--) { |
474 | sp_256_mont_sqr_10(t, t /*, p256_mod, p256_mp_mod*/); | 640 | sp_256_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/); |
475 | /*if (p256_mod_2[i / 32] & ((sp_digit)1 << (i % 32)))*/ | 641 | /*if (p256_mod_2[i / 32] & ((sp_digit)1 << (i % 32)))*/ |
476 | if (i >= 224 || i == 192 || (i <= 95 && i != 1)) | 642 | if (i >= 224 || i == 192 || (i <= 95 && i != 1)) |
477 | sp_256_mont_mul_10(t, t, a /*, p256_mod, p256_mp_mod*/); | 643 | sp_256_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/); |
478 | } | 644 | } |
479 | memcpy(r, t, sizeof(sp_digit) * 10); | 645 | memcpy(r, t, sizeof(sp_digit) * 8); |
480 | } | 646 | } |
481 | 647 | ||
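
The hard-coded condition i >= 224 || i == 192 || (i <= 95 && i != 1) is the bit pattern of the exponent p - 2 (Fermat inversion), matching the p256_mod_2 table and the binary comment kept above. A throwaway check of that claim; the p - 2 limbs below are my own restatement, not taken from the patch:

#include <stdint.h>

/* p - 2 for P-256, little-endian 32-bit limbs */
static const uint32_t p256_minus_2[8] = {
	0xfffffffd, 0xffffffff, 0xffffffff, 0x00000000,
	0x00000000, 0x00000000, 0x00000001, 0xffffffff,
};

static int inv_condition_matches_exponent(void)
{
	int i;
	for (i = 254; i >= 0; i--) {
		int bit  = (p256_minus_2[i / 32] >> (i % 32)) & 1;
		int cond = (i >= 224 || i == 192 || (i <= 95 && i != 1));
		if (bit != cond)
			return 0;
	}
	return 1;   /* expected: always 1 */
}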
482 | /* Multiply a number by Montogmery normalizer mod modulus (prime). | 648 | /* Multiply a number by Montogmery normalizer mod modulus (prime). |
@@ -484,93 +650,29 @@ static void sp_256_mont_inv_10(sp_digit* r, sp_digit* a) | |||
484 | * r The resulting Montgomery form number. | 650 | * r The resulting Montgomery form number. |
485 | * a The number to convert. | 651 | * a The number to convert. |
486 | */ | 652 | */ |
487 | static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a) | 653 | static void sp_256_mod_mul_norm_8(sp_digit* r, const sp_digit* a) |
488 | { | 654 | { |
489 | int64_t t[8]; | 655 | int64_t t[8]; |
490 | int64_t o; | 656 | int32_t o; |
491 | uint32_t a32; | ||
492 | 657 | ||
658 | #define A(n) ((uint64_t)a[n]) | ||
493 | /* 1 1 0 -1 -1 -1 -1 0 */ | 659 | /* 1 1 0 -1 -1 -1 -1 0 */ |
660 | t[0] = 0 + A(0) + A(1) - A(3) - A(4) - A(5) - A(6); | ||
494 | /* 0 1 1 0 -1 -1 -1 -1 */ | 661 | /* 0 1 1 0 -1 -1 -1 -1 */ |
662 | t[1] = 0 + A(1) + A(2) - A(4) - A(5) - A(6) - A(7); | ||
495 | /* 0 0 1 1 0 -1 -1 -1 */ | 663 | /* 0 0 1 1 0 -1 -1 -1 */ |
664 | t[2] = 0 + A(2) + A(3) - A(5) - A(6) - A(7); | ||
496 | /* -1 -1 0 2 2 1 0 -1 */ | 665 | /* -1 -1 0 2 2 1 0 -1 */ |
666 | t[3] = 0 - A(0) - A(1) + 2 * A(3) + 2 * A(4) + A(5) - A(7); | ||
497 | /* 0 -1 -1 0 2 2 1 0 */ | 667 | /* 0 -1 -1 0 2 2 1 0 */ |
668 | t[4] = 0 - A(1) - A(2) + 2 * A(4) + 2 * A(5) + A(6); | ||
498 | /* 0 0 -1 -1 0 2 2 1 */ | 669 | /* 0 0 -1 -1 0 2 2 1 */ |
670 | t[5] = 0 - A(2) - A(3) + 2 * A(5) + 2 * A(6) + A(7); | ||
499 | /* -1 -1 0 0 0 1 3 2 */ | 671 | /* -1 -1 0 0 0 1 3 2 */ |
672 | t[6] = 0 - A(0) - A(1) + A(5) + 3 * A(6) + 2 * A(7); | ||
500 | /* 1 0 -1 -1 -1 -1 0 3 */ | 673 | /* 1 0 -1 -1 -1 -1 0 3 */ |
501 | // t[] should be calculated from "a" (converted from 26-bit to 32-bit vector a32[8]) | 674 | t[7] = 0 + A(0) - A(2) - A(3) - A(4) - A(5) + 3 * A(7); |
502 | // according to the above matrix: | 675 | #undef A |
503 | //t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6] ; | ||
504 | //t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7] ; | ||
505 | //t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7] ; | ||
506 | //t[3] = 0 - a32[0] - a32[1] + 2*a32[3] + 2*a32[4] + a32[5] - a32[7] ; | ||
507 | //t[4] = 0 - a32[1] - a32[2] + 2*a32[4] + 2*a32[5] + a32[6] ; | ||
508 | //t[5] = 0 - a32[2] - a32[3] + 2*a32[5] + 2*a32[6] + a32[7] ; | ||
509 | //t[6] = 0 - a32[0] - a32[1] + a32[5] + 3*a32[6] + 2*a32[7]; | ||
510 | //t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3*a32[7]; | ||
511 | // We can do it "piecemeal" after each a32[i] is known, no need to store entire a32[8] vector: | ||
512 | |||
513 | #define A32 (int64_t)a32 | ||
514 | a32 = a[0] | (a[1] << 26); | ||
515 | t[0] = 0 + A32; | ||
516 | t[3] = 0 - A32; | ||
517 | t[6] = 0 - A32; | ||
518 | t[7] = 0 + A32; | ||
519 | |||
520 | a32 = (a[1] >> 6) | (a[2] << 20); | ||
521 | t[0] += A32 ; | ||
522 | t[1] = 0 + A32; | ||
523 | t[3] -= A32 ; | ||
524 | t[4] = 0 - A32; | ||
525 | t[6] -= A32 ; | ||
526 | |||
527 | a32 = (a[2] >> 12) | (a[3] << 14); | ||
528 | t[1] += A32 ; | ||
529 | t[2] = 0 + A32; | ||
530 | t[4] -= A32 ; | ||
531 | t[5] = 0 - A32; | ||
532 | t[7] -= A32 ; | ||
533 | |||
534 | a32 = (a[3] >> 18) | (a[4] << 8); | ||
535 | t[0] -= A32 ; | ||
536 | t[2] += A32 ; | ||
537 | t[3] += 2*A32; | ||
538 | t[5] -= A32 ; | ||
539 | t[7] -= A32 ; | ||
540 | |||
541 | a32 = (a[4] >> 24) | (a[5] << 2) | (a[6] << 28); | ||
542 | t[0] -= A32 ; | ||
543 | t[1] -= A32 ; | ||
544 | t[3] += 2*A32; | ||
545 | t[4] += 2*A32; | ||
546 | t[7] -= A32 ; | ||
547 | |||
548 | a32 = (a[6] >> 4) | (a[7] << 22); | ||
549 | t[0] -= A32 ; | ||
550 | t[1] -= A32 ; | ||
551 | t[2] -= A32 ; | ||
552 | t[3] += A32 ; | ||
553 | t[4] += 2*A32; | ||
554 | t[5] += 2*A32; | ||
555 | t[6] += A32 ; | ||
556 | t[7] -= A32 ; | ||
557 | |||
558 | a32 = (a[7] >> 10) | (a[8] << 16); | ||
559 | t[0] -= A32 ; | ||
560 | t[1] -= A32 ; | ||
561 | t[2] -= A32 ; | ||
562 | t[4] += A32 ; | ||
563 | t[5] += 2*A32; | ||
564 | t[6] += 3*A32; | ||
565 | |||
566 | a32 = (a[8] >> 16) | (a[9] << 10); | ||
567 | t[1] -= A32 ; | ||
568 | t[2] -= A32 ; | ||
569 | t[3] -= A32 ; | ||
570 | t[5] += A32 ; | ||
571 | t[6] += 2*A32; | ||
572 | t[7] += 3*A32; | ||
573 | #undef A32 | ||
574 | 676 | ||
575 | t[1] += t[0] >> 32; t[0] &= 0xffffffff; | 677 | t[1] += t[0] >> 32; t[0] &= 0xffffffff; |
576 | t[2] += t[1] >> 32; t[1] &= 0xffffffff; | 678 | t[2] += t[1] >> 32; t[1] &= 0xffffffff; |
@@ -579,29 +681,27 @@ static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a) | |||
579 | t[5] += t[4] >> 32; t[4] &= 0xffffffff; | 681 | t[5] += t[4] >> 32; t[4] &= 0xffffffff; |
580 | t[6] += t[5] >> 32; t[5] &= 0xffffffff; | 682 | t[6] += t[5] >> 32; t[5] &= 0xffffffff; |
581 | t[7] += t[6] >> 32; t[6] &= 0xffffffff; | 683 | t[7] += t[6] >> 32; t[6] &= 0xffffffff; |
582 | o = t[7] >> 32; t[7] &= 0xffffffff; | 684 | o = t[7] >> 32; //t[7] &= 0xffffffff; |
583 | t[0] += o; | 685 | t[0] += o; |
584 | t[3] -= o; | 686 | t[3] -= o; |
585 | t[6] -= o; | 687 | t[6] -= o; |
586 | t[7] += o; | 688 | t[7] += o; |
587 | t[1] += t[0] >> 32; //t[0] &= 0xffffffff; | 689 | r[0] = (sp_digit)t[0]; |
588 | t[2] += t[1] >> 32; //t[1] &= 0xffffffff; | 690 | t[1] += t[0] >> 32; |
589 | t[3] += t[2] >> 32; //t[2] &= 0xffffffff; | 691 | r[1] = (sp_digit)t[1]; |
590 | t[4] += t[3] >> 32; //t[3] &= 0xffffffff; | 692 | t[2] += t[1] >> 32; |
591 | t[5] += t[4] >> 32; //t[4] &= 0xffffffff; | 693 | r[2] = (sp_digit)t[2]; |
592 | t[6] += t[5] >> 32; //t[5] &= 0xffffffff; | 694 | t[3] += t[2] >> 32; |
593 | t[7] += t[6] >> 32; //t[6] &= 0xffffffff; - (uint32_t)t[i] casts below accomplish masking | 695 | r[3] = (sp_digit)t[3]; |
594 | 696 | t[4] += t[3] >> 32; | |
595 | r[0] = 0x3ffffff & ((sp_digit)((uint32_t)t[0])); | 697 | r[4] = (sp_digit)t[4]; |
596 | r[1] = 0x3ffffff & ((sp_digit)((uint32_t)t[0] >> 26) | ((sp_digit)t[1] << 6)); | 698 | t[5] += t[4] >> 32; |
597 | r[2] = 0x3ffffff & ((sp_digit)((uint32_t)t[1] >> 20) | ((sp_digit)t[2] << 12)); | 699 | r[5] = (sp_digit)t[5]; |
598 | r[3] = 0x3ffffff & ((sp_digit)((uint32_t)t[2] >> 14) | ((sp_digit)t[3] << 18)); | 700 | t[6] += t[5] >> 32; |
599 | r[4] = 0x3ffffff & ((sp_digit)((uint32_t)t[3] >> 8) | ((sp_digit)t[4] << 24)); | 701 | r[6] = (sp_digit)t[6]; |
600 | r[5] = 0x3ffffff & ((sp_digit)((uint32_t)t[4] >> 2)); | 702 | // t[7] += t[6] >> 32; |
601 | r[6] = 0x3ffffff & ((sp_digit)((uint32_t)t[4] >> 28) | ((sp_digit)t[5] << 4)); | 703 | // r[7] = (sp_digit)t[7]; |
602 | r[7] = 0x3ffffff & ((sp_digit)((uint32_t)t[5] >> 22) | ((sp_digit)t[6] << 10)); | 704 | r[7] = (sp_digit)t[7] + (sp_digit)(t[6] >> 32); |
603 | r[8] = 0x3ffffff & ((sp_digit)((uint32_t)t[6] >> 16) | ((sp_digit)t[7] << 16)); | ||
604 | r[9] = ((sp_digit)((uint32_t)t[7] >> 10)); | ||
605 | } | 705 | } |
606 | 706 | ||
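
sp_256_mod_mul_norm_8 converts a into Montgomery form, i.e. computes a * 2^256 mod p through the fixed coefficient matrix spelled out in the comments. A quick spot check is a = 1, which should produce 2^256 mod p = 2^224 - 2^192 - 2^96 + 1; the expected value below is computed by hand, not taken from the patch:

#include <stdint.h>

/* Montgomery representation of 1 for P-256, little-endian 32-bit limbs:
 * 2^256 mod p = 2^224 - 2^192 - 2^96 + 1 */
static const uint32_t one_mont[8] = {
	0x00000001, 0x00000000, 0x00000000, 0xffffffff,
	0xffffffff, 0xffffffff, 0xfffffffe, 0x00000000,
};
/* uint32_t one[8] = { 1 }, r[8];
 * sp_256_mod_mul_norm_8(r, one);
 * memcmp(r, one_mont, sizeof(one_mont)) should be 0. */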
607 | /* Map the Montgomery form projective co-ordinate point to an affine point. | 707 | /* Map the Montgomery form projective co-ordinate point to an affine point. |
@@ -609,33 +709,33 @@ static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a) | |||
609 | * r Resulting affine co-ordinate point. | 709 | * r Resulting affine co-ordinate point. |
610 | * p Montgomery form projective co-ordinate point. | 710 | * p Montgomery form projective co-ordinate point. |
611 | */ | 711 | */ |
612 | static void sp_256_map_10(sp_point* r, sp_point* p) | 712 | static void sp_256_map_8(sp_point* r, sp_point* p) |
613 | { | 713 | { |
614 | sp_digit t1[2*10]; | 714 | sp_digit t1[2*8]; |
615 | sp_digit t2[2*10]; | 715 | sp_digit t2[2*8]; |
616 | 716 | ||
617 | sp_256_mont_inv_10(t1, p->z); | 717 | sp_256_mont_inv_8(t1, p->z); |
618 | 718 | ||
619 | sp_256_mont_sqr_10(t2, t1 /*, p256_mod, p256_mp_mod*/); | 719 | sp_256_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/); |
620 | sp_256_mont_mul_10(t1, t2, t1 /*, p256_mod, p256_mp_mod*/); | 720 | sp_256_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/); |
621 | 721 | ||
622 | /* x /= z^2 */ | 722 | /* x /= z^2 */ |
623 | sp_256_mont_mul_10(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/); | 723 | sp_256_mont_mul_8(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/); |
624 | memset(r->x + 10, 0, sizeof(r->x) / 2); | 724 | memset(r->x + 8, 0, sizeof(r->x) / 2); |
625 | sp_256_mont_reduce_10(r->x /*, p256_mod, p256_mp_mod*/); | 725 | sp_256_mont_reduce_8(r->x /*, p256_mod, p256_mp_mod*/); |
626 | /* Reduce x to less than modulus */ | 726 | /* Reduce x to less than modulus */ |
627 | if (sp_256_cmp_10(r->x, p256_mod) >= 0) | 727 | if (sp_256_cmp_8(r->x, p256_mod) >= 0) |
628 | sp_256_sub_10(r->x, r->x, p256_mod); | 728 | sp_256_sub_8(r->x, r->x, p256_mod); |
629 | sp_256_norm_10(r->x); | 729 | sp_256_norm_8(r->x); |
630 | 730 | ||
631 | /* y /= z^3 */ | 731 | /* y /= z^3 */ |
632 | sp_256_mont_mul_10(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/); | 732 | sp_256_mont_mul_8(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/); |
633 | memset(r->y + 10, 0, sizeof(r->y) / 2); | 733 | memset(r->y + 8, 0, sizeof(r->y) / 2); |
634 | sp_256_mont_reduce_10(r->y /*, p256_mod, p256_mp_mod*/); | 734 | sp_256_mont_reduce_8(r->y /*, p256_mod, p256_mp_mod*/); |
635 | /* Reduce y to less than modulus */ | 735 | /* Reduce y to less than modulus */ |
636 | if (sp_256_cmp_10(r->y, p256_mod) >= 0) | 736 | if (sp_256_cmp_8(r->y, p256_mod) >= 0) |
637 | sp_256_sub_10(r->y, r->y, p256_mod); | 737 | sp_256_sub_8(r->y, r->y, p256_mod); |
638 | sp_256_norm_10(r->y); | 738 | sp_256_norm_8(r->y); |
639 | 739 | ||
640 | memset(r->z, 0, sizeof(r->z)); | 740 | memset(r->z, 0, sizeof(r->z)); |
641 | r->z[0] = 1; | 741 | r->z[0] = 1; |
@@ -646,16 +746,16 @@ static void sp_256_map_10(sp_point* r, sp_point* p) | |||
646 | * r Result of doubling point. | 746 | * r Result of doubling point. |
647 | * p Point to double. | 747 | * p Point to double. |
648 | */ | 748 | */ |
649 | static void sp_256_proj_point_dbl_10(sp_point* r, sp_point* p) | 749 | static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p) |
650 | { | 750 | { |
651 | sp_digit t1[2*10]; | 751 | sp_digit t1[2*8]; |
652 | sp_digit t2[2*10]; | 752 | sp_digit t2[2*8]; |
653 | 753 | ||
654 | /* Put point to double into result */ | 754 | /* Put point to double into result */ |
655 | if (r != p) | 755 | if (r != p) |
656 | *r = *p; /* struct copy */ | 756 | *r = *p; /* struct copy */ |
657 | 757 | ||
658 | if (r->infinity) /* If infinity, don't double */ | 758 | if (r->infinity) |
659 | return; | 759 | return; |
660 | 760 | ||
661 | if (SP_DEBUG) { | 761 | if (SP_DEBUG) { |
@@ -666,41 +766,42 @@ static void sp_256_proj_point_dbl_10(sp_point* r, sp_point* p) | |||
666 | } | 766 | } |
667 | 767 | ||
668 | /* T1 = Z * Z */ | 768 | /* T1 = Z * Z */ |
669 | sp_256_mont_sqr_10(t1, r->z /*, p256_mod, p256_mp_mod*/); | 769 | sp_256_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/); |
670 | /* Z = Y * Z */ | 770 | /* Z = Y * Z */ |
671 | sp_256_mont_mul_10(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/); | 771 | sp_256_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/); |
672 | /* Z = 2Z */ | 772 | /* Z = 2Z */ |
673 | sp_256_mont_dbl_10(r->z, r->z, p256_mod); | 773 | sp_256_mont_dbl_8(r->z, r->z, p256_mod); |
674 | /* T2 = X - T1 */ | 774 | /* T2 = X - T1 */ |
675 | sp_256_mont_sub_10(t2, r->x, t1, p256_mod); | 775 | sp_256_mont_sub_8(t2, r->x, t1, p256_mod); |
676 | /* T1 = X + T1 */ | 776 | /* T1 = X + T1 */ |
677 | sp_256_mont_add_10(t1, r->x, t1, p256_mod); | 777 | sp_256_mont_add_8(t1, r->x, t1, p256_mod); |
678 | /* T2 = T1 * T2 */ | 778 | /* T2 = T1 * T2 */ |
679 | sp_256_mont_mul_10(t2, t1, t2 /*, p256_mod, p256_mp_mod*/); | 779 | sp_256_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/); |
680 | /* T1 = 3T2 */ | 780 | /* T1 = 3T2 */ |
681 | sp_256_mont_tpl_10(t1, t2, p256_mod); | 781 | sp_256_mont_tpl_8(t1, t2, p256_mod); |
682 | /* Y = 2Y */ | 782 | /* Y = 2Y */ |
683 | sp_256_mont_dbl_10(r->y, r->y, p256_mod); | 783 | sp_256_mont_dbl_8(r->y, r->y, p256_mod); |
684 | /* Y = Y * Y */ | 784 | /* Y = Y * Y */ |
685 | sp_256_mont_sqr_10(r->y, r->y /*, p256_mod, p256_mp_mod*/); | 785 | sp_256_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/); |
686 | /* T2 = Y * Y */ | 786 | /* T2 = Y * Y */ |
687 | sp_256_mont_sqr_10(t2, r->y /*, p256_mod, p256_mp_mod*/); | 787 | sp_256_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/); |
688 | /* T2 = T2/2 */ | 788 | /* T2 = T2/2 */ |
689 | sp_256_div2_10(t2, t2, p256_mod); | 789 | sp_256_div2_8(t2, t2, p256_mod); |
690 | /* Y = Y * X */ | 790 | /* Y = Y * X */ |
691 | sp_256_mont_mul_10(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/); | 791 | sp_256_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/); |
692 | /* X = T1 * T1 */ | 792 | /* X = T1 * T1 */ |
693 | sp_256_mont_mul_10(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/); | 793 | sp_256_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/); |
694 | /* X = X - Y */ | 794 | /* X = X - Y */ |
695 | sp_256_mont_sub_10(r->x, r->x, r->y, p256_mod); | 795 | sp_256_mont_sub_8(r->x, r->x, r->y, p256_mod); |
696 | /* X = X - Y */ | 796 | /* X = X - Y */ |
697 | sp_256_mont_sub_10(r->x, r->x, r->y, p256_mod); | 797 | sp_256_mont_sub_8(r->x, r->x, r->y, p256_mod); |
698 | /* Y = Y - X */ | 798 | /* Y = Y - X */ |
699 | sp_256_mont_sub_10(r->y, r->y, r->x, p256_mod); | 799 | sp_256_mont_sub_8(r->y, r->y, r->x, p256_mod); |
700 | /* Y = Y * T1 */ | 800 | /* Y = Y * T1 */ |
701 | sp_256_mont_mul_10(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/); | 801 | sp_256_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/); |
702 | /* Y = Y - T2 */ | 802 | /* Y = Y - T2 */ |
703 | sp_256_mont_sub_10(r->y, r->y, t2, p256_mod); | 803 | sp_256_mont_sub_8(r->y, r->y, t2, p256_mod); |
804 | dump_512("y2 %s\n", r->y); | ||
704 | } | 805 | } |
705 | 806 | ||
706 | /* Add two Montgomery form projective points. | 807 | /* Add two Montgomery form projective points. |
@@ -709,13 +810,13 @@ static void sp_256_proj_point_dbl_10(sp_point* r, sp_point* p) | |||
709 | * p Frist point to add. | 810 | * p Frist point to add. |
710 | * q Second point to add. | 811 | * q Second point to add. |
711 | */ | 812 | */ |
712 | static void sp_256_proj_point_add_10(sp_point* r, sp_point* p, sp_point* q) | 813 | static void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point* q) |
713 | { | 814 | { |
714 | sp_digit t1[2*10]; | 815 | sp_digit t1[2*8]; |
715 | sp_digit t2[2*10]; | 816 | sp_digit t2[2*8]; |
716 | sp_digit t3[2*10]; | 817 | sp_digit t3[2*8]; |
717 | sp_digit t4[2*10]; | 818 | sp_digit t4[2*8]; |
718 | sp_digit t5[2*10]; | 819 | sp_digit t5[2*8]; |
719 | 820 | ||
720 | /* Ensure only the first point is the same as the result. */ | 821 | /* Ensure only the first point is the same as the result. */ |
721 | if (q == r) { | 822 | if (q == r) { |
@@ -725,13 +826,13 @@ static void sp_256_proj_point_add_10(sp_point* r, sp_point* p, sp_point* q) | |||
725 | } | 826 | } |
726 | 827 | ||
727 | /* Check double */ | 828 | /* Check double */ |
728 | sp_256_sub_10(t1, p256_mod, q->y); | 829 | sp_256_sub_8(t1, p256_mod, q->y); |
729 | sp_256_norm_10(t1); | 830 | sp_256_norm_8(t1); |
730 | if (sp_256_cmp_equal_10(p->x, q->x) | 831 | if (sp_256_cmp_equal_8(p->x, q->x) |
731 | && sp_256_cmp_equal_10(p->z, q->z) | 832 | && sp_256_cmp_equal_8(p->z, q->z) |
732 | && (sp_256_cmp_equal_10(p->y, q->y) || sp_256_cmp_equal_10(p->y, t1)) | 833 | && (sp_256_cmp_equal_8(p->y, q->y) || sp_256_cmp_equal_8(p->y, t1)) |
733 | ) { | 834 | ) { |
734 | sp_256_proj_point_dbl_10(r, p); | 835 | sp_256_proj_point_dbl_8(r, p); |
735 | } | 836 | } |
736 | else { | 837 | else { |
737 | sp_point tp; | 838 | sp_point tp; |
@@ -746,37 +847,37 @@ static void sp_256_proj_point_add_10(sp_point* r, sp_point* p, sp_point* q) | |||
746 | *r = p->infinity ? *q : *p; /* struct copy */ | 847 | *r = p->infinity ? *q : *p; /* struct copy */ |
747 | 848 | ||
748 | /* U1 = X1*Z2^2 */ | 849 | /* U1 = X1*Z2^2 */ |
749 | sp_256_mont_sqr_10(t1, q->z /*, p256_mod, p256_mp_mod*/); | 850 | sp_256_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/); |
750 | sp_256_mont_mul_10(t3, t1, q->z /*, p256_mod, p256_mp_mod*/); | 851 | sp_256_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/); |
751 | sp_256_mont_mul_10(t1, t1, v->x /*, p256_mod, p256_mp_mod*/); | 852 | sp_256_mont_mul_8(t1, t1, v->x /*, p256_mod, p256_mp_mod*/); |
752 | /* U2 = X2*Z1^2 */ | 853 | /* U2 = X2*Z1^2 */ |
753 | sp_256_mont_sqr_10(t2, v->z /*, p256_mod, p256_mp_mod*/); | 854 | sp_256_mont_sqr_8(t2, v->z /*, p256_mod, p256_mp_mod*/); |
754 | sp_256_mont_mul_10(t4, t2, v->z /*, p256_mod, p256_mp_mod*/); | 855 | sp_256_mont_mul_8(t4, t2, v->z /*, p256_mod, p256_mp_mod*/); |
755 | sp_256_mont_mul_10(t2, t2, q->x /*, p256_mod, p256_mp_mod*/); | 856 | sp_256_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/); |
756 | /* S1 = Y1*Z2^3 */ | 857 | /* S1 = Y1*Z2^3 */ |
757 | sp_256_mont_mul_10(t3, t3, v->y /*, p256_mod, p256_mp_mod*/); | 858 | sp_256_mont_mul_8(t3, t3, v->y /*, p256_mod, p256_mp_mod*/); |
758 | /* S2 = Y2*Z1^3 */ | 859 | /* S2 = Y2*Z1^3 */ |
759 | sp_256_mont_mul_10(t4, t4, q->y /*, p256_mod, p256_mp_mod*/); | 860 | sp_256_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/); |
760 | /* H = U2 - U1 */ | 861 | /* H = U2 - U1 */ |
761 | sp_256_mont_sub_10(t2, t2, t1, p256_mod); | 862 | sp_256_mont_sub_8(t2, t2, t1, p256_mod); |
762 | /* R = S2 - S1 */ | 863 | /* R = S2 - S1 */ |
763 | sp_256_mont_sub_10(t4, t4, t3, p256_mod); | 864 | sp_256_mont_sub_8(t4, t4, t3, p256_mod); |
764 | /* Z3 = H*Z1*Z2 */ | 865 | /* Z3 = H*Z1*Z2 */ |
765 | sp_256_mont_mul_10(v->z, v->z, q->z /*, p256_mod, p256_mp_mod*/); | 866 | sp_256_mont_mul_8(v->z, v->z, q->z /*, p256_mod, p256_mp_mod*/); |
766 | sp_256_mont_mul_10(v->z, v->z, t2 /*, p256_mod, p256_mp_mod*/); | 867 | sp_256_mont_mul_8(v->z, v->z, t2 /*, p256_mod, p256_mp_mod*/); |
767 | /* X3 = R^2 - H^3 - 2*U1*H^2 */ | 868 | /* X3 = R^2 - H^3 - 2*U1*H^2 */ |
768 | sp_256_mont_sqr_10(v->x, t4 /*, p256_mod, p256_mp_mod*/); | 869 | sp_256_mont_sqr_8(v->x, t4 /*, p256_mod, p256_mp_mod*/); |
769 | sp_256_mont_sqr_10(t5, t2 /*, p256_mod, p256_mp_mod*/); | 870 | sp_256_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/); |
770 | sp_256_mont_mul_10(v->y, t1, t5 /*, p256_mod, p256_mp_mod*/); | 871 | sp_256_mont_mul_8(v->y, t1, t5 /*, p256_mod, p256_mp_mod*/); |
771 | sp_256_mont_mul_10(t5, t5, t2 /*, p256_mod, p256_mp_mod*/); | 872 | sp_256_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/); |
772 | sp_256_mont_sub_10(v->x, v->x, t5, p256_mod); | 873 | sp_256_mont_sub_8(v->x, v->x, t5, p256_mod); |
773 | sp_256_mont_dbl_10(t1, v->y, p256_mod); | 874 | sp_256_mont_dbl_8(t1, v->y, p256_mod); |
774 | sp_256_mont_sub_10(v->x, v->x, t1, p256_mod); | 875 | sp_256_mont_sub_8(v->x, v->x, t1, p256_mod); |
775 | /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ | 876 | /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ |
776 | sp_256_mont_sub_10(v->y, v->y, v->x, p256_mod); | 877 | sp_256_mont_sub_8(v->y, v->y, v->x, p256_mod); |
777 | sp_256_mont_mul_10(v->y, v->y, t4 /*, p256_mod, p256_mp_mod*/); | 878 | sp_256_mont_mul_8(v->y, v->y, t4 /*, p256_mod, p256_mp_mod*/); |
778 | sp_256_mont_mul_10(t5, t5, t3 /*, p256_mod, p256_mp_mod*/); | 879 | sp_256_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/); |
779 | sp_256_mont_sub_10(v->y, v->y, t5, p256_mod); | 880 | sp_256_mont_sub_8(v->y, v->y, t5, p256_mod); |
780 | } | 881 | } |
781 | } | 882 | } |
782 | 883 | ||
@@ -788,12 +889,11 @@ static void sp_256_proj_point_add_10(sp_point* r, sp_point* p, sp_point* q) | |||
788 | * k Scalar to multiply by. | 889 | * k Scalar to multiply by. |
789 | * map Indicates whether to convert result to affine. | 890 | * map Indicates whether to convert result to affine. |
790 | */ | 891 | */ |
791 | static void sp_256_ecc_mulmod_10(sp_point* r, const sp_point* g, const sp_digit* k /*, int map*/) | 892 | static void sp_256_ecc_mulmod_8(sp_point* r, const sp_point* g, const sp_digit* k /*, int map*/) |
792 | { | 893 | { |
793 | enum { map = 1 }; /* we always convert result to affine coordinates */ | 894 | enum { map = 1 }; /* we always convert result to affine coordinates */ |
794 | sp_point t[3]; | 895 | sp_point t[3]; |
795 | sp_digit n; | 896 | sp_digit n = n; /* for compiler */ |
796 | int i; | ||
797 | int c, y; | 897 | int c, y; |
798 | 898 | ||
799 | memset(t, 0, sizeof(t)); | 899 | memset(t, 0, sizeof(t)); |
@@ -801,36 +901,44 @@ static void sp_256_ecc_mulmod_10(sp_point* r, const sp_point* g, const sp_digit* | |||
801 | /* t[0] = {0, 0, 1} * norm */ | 901 | /* t[0] = {0, 0, 1} * norm */ |
802 | t[0].infinity = 1; | 902 | t[0].infinity = 1; |
803 | /* t[1] = {g->x, g->y, g->z} * norm */ | 903 | /* t[1] = {g->x, g->y, g->z} * norm */ |
804 | sp_256_mod_mul_norm_10(t[1].x, g->x); | 904 | sp_256_mod_mul_norm_8(t[1].x, g->x); |
805 | sp_256_mod_mul_norm_10(t[1].y, g->y); | 905 | sp_256_mod_mul_norm_8(t[1].y, g->y); |
806 | sp_256_mod_mul_norm_10(t[1].z, g->z); | 906 | sp_256_mod_mul_norm_8(t[1].z, g->z); |
807 | dump_512("t[1].x %s\n", t[1].x); | ||
808 | dump_512("t[1].y %s\n", t[1].y); | ||
809 | dump_512("t[1].z %s\n", t[1].z); | ||
810 | |||
811 | i = 9; | ||
812 | c = 22; | ||
813 | n = k[i--] << (26 - c); | ||
814 | for (; ; c--) { | ||
815 | if (c == 0) { | ||
816 | if (i == -1) | ||
817 | break; | ||
818 | 907 | ||
819 | n = k[i--]; | 908 | /* For every bit, starting from most significant... */ |
820 | c = 26; | 909 | k += 7; |
910 | c = 256; | ||
911 | for (;;) { | ||
912 | if ((c & 0x1f) == 0) { | ||
913 | if (c == 0) | ||
914 | break; | ||
915 | n = *k--; | ||
821 | } | 916 | } |
822 | 917 | ||
823 | y = (n >> 25) & 1; | 918 | y = (n >> 31); |
824 | n <<= 1; | 919 | dbg("y:%d t[%d] = t[0]+t[1]\n", y, y^1); |
825 | 920 | sp_256_proj_point_add_8(&t[y^1], &t[0], &t[1]); | |
826 | sp_256_proj_point_add_10(&t[y^1], &t[0], &t[1]); | 921 | dump_512("t[0].x %s\n", t[0].x); |
922 | dump_512("t[0].y %s\n", t[0].y); | ||
923 | dump_512("t[0].z %s\n", t[0].z); | ||
924 | dump_512("t[1].x %s\n", t[1].x); | ||
925 | dump_512("t[1].y %s\n", t[1].y); | ||
926 | dump_512("t[1].z %s\n", t[1].z); | ||
927 | dbg("t[2] = t[%d]\n", y); | ||
827 | memcpy(&t[2], &t[y], sizeof(sp_point)); | 928 | memcpy(&t[2], &t[y], sizeof(sp_point)); |
828 | sp_256_proj_point_dbl_10(&t[2], &t[2]); | 929 | dbg("t[2] *= 2\n"); |
930 | sp_256_proj_point_dbl_8(&t[2], &t[2]); | ||
931 | dump_512("t[2].x %s\n", t[2].x); | ||
932 | dump_512("t[2].y %s\n", t[2].y); | ||
933 | dump_512("t[2].z %s\n", t[2].z); | ||
829 | memcpy(&t[y], &t[2], sizeof(sp_point)); | 934 | memcpy(&t[y], &t[2], sizeof(sp_point)); |
935 | |||
936 | n <<= 1; | ||
937 | c--; | ||
830 | } | 938 | } |
831 | 939 | ||
832 | if (map) | 940 | if (map) |
833 | sp_256_map_10(r, &t[0]); | 941 | sp_256_map_8(r, &t[0]); |
834 | else | 942 | else |
835 | memcpy(r, &t[0], sizeof(sp_point)); | 943 | memcpy(r, &t[0], sizeof(sp_point)); |
836 | 944 | ||
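The new sp_256_ecc_mulmod_8() walks the 256 scalar bits most-significant-first out of eight little-endian 32-bit words: c counts the bits still to visit, and n is refilled from the next lower word whenever c crosses a multiple of 32. A minimal stand-alone sketch of just that traversal (a hypothetical test program, not part of the patch; the value of k[] is arbitrary):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* Example scalar: top bit of the highest word and lowest bit of
		 * the lowest word are set, so the printout starts and ends with 1 */
		uint32_t k[8] = { 0x00000001, 0, 0, 0, 0, 0, 0, 0x80000000 };
		const uint32_t *kp = k + 7; /* most significant word first */
		uint32_t n = 0;
		int c = 256;                /* bits still to visit */

		for (;;) {
			if ((c & 0x1f) == 0) {  /* word boundary: refill n */
				if (c == 0)
					break;
				n = *kp--;
			}
			putchar('0' + (n >> 31)); /* y = current bit, the MSB of n */
			n <<= 1;
			c--;
		}
		putchar('\n');
		return 0;
	}

Each extracted bit y then drives the same add-then-double sequence on t[0], t[1], t[2], so the chain of field operations per iteration is identical whichever way the bit falls; only the array indices depend on y.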
@@ -844,7 +952,7 @@ static void sp_256_ecc_mulmod_10(sp_point* r, const sp_point* g, const sp_digit* | |||
844 | * k Scalar to multiply by. | 952 | * k Scalar to multiply by. |
845 | * map Indicates whether to convert result to affine. | 953 | * map Indicates whether to convert result to affine. |
846 | */ | 954 | */ |
847 | static void sp_256_ecc_mulmod_base_10(sp_point* r, sp_digit* k /*, int map*/) | 955 | static void sp_256_ecc_mulmod_base_8(sp_point* r, sp_digit* k /*, int map*/) |
848 | { | 956 | { |
849 | /* Since this function is called only once, save space: | 957 | /* Since this function is called only once, save space: |
850 | * don't have "static const sp_point p256_base = {...}", | 958 | * don't have "static const sp_point p256_base = {...}", |
@@ -861,7 +969,7 @@ static void sp_256_ecc_mulmod_base_10(sp_point* r, sp_digit* k /*, int map*/) | |||
861 | 969 | ||
862 | sp_256_point_from_bin2x32(&p256_base, p256_base_bin); | 970 | sp_256_point_from_bin2x32(&p256_base, p256_base_bin); |
863 | 971 | ||
864 | sp_256_ecc_mulmod_10(r, &p256_base, k /*, map*/); | 972 | sp_256_ecc_mulmod_8(r, &p256_base, k /*, map*/); |
865 | } | 973 | } |
866 | 974 | ||
867 | /* Multiply the point by the scalar and serialize the X ordinate. | 975 | /* Multiply the point by the scalar and serialize the X ordinate. |
@@ -871,7 +979,7 @@ static void sp_256_ecc_mulmod_base_10(sp_point* r, sp_digit* k /*, int map*/) | |||
871 | * pub2x32 Point to multiply. | 979 | * pub2x32 Point to multiply. |
872 | * out32 Buffer to hold X ordinate. | 980 | * out32 Buffer to hold X ordinate. |
873 | */ | 981 | */ |
874 | static void sp_ecc_secret_gen_256(const sp_digit priv[10], const uint8_t *pub2x32, uint8_t* out32) | 982 | static void sp_ecc_secret_gen_256(const sp_digit priv[8], const uint8_t *pub2x32, uint8_t* out32) |
875 | { | 983 | { |
876 | sp_point point[1]; | 984 | sp_point point[1]; |
877 | 985 | ||
@@ -885,66 +993,48 @@ static void sp_ecc_secret_gen_256(const sp_digit priv[10], const uint8_t *pub2x3 | |||
885 | dump_512("point->x %s\n", point->x); | 993 | dump_512("point->x %s\n", point->x); |
886 | dump_512("point->y %s\n", point->y); | 994 | dump_512("point->y %s\n", point->y); |
887 | 995 | ||
888 | sp_256_ecc_mulmod_10(point, point, priv); | 996 | sp_256_ecc_mulmod_8(point, point, priv); |
889 | 997 | ||
890 | sp_256_to_bin_10(point->x, out32); | 998 | sp_256_to_bin_8(point->x, out32); |
891 | dump_hex("out32: %s\n", out32, 32); | 999 | dump_hex("out32: %s\n", out32, 32); |
892 | } | 1000 | } |
893 | 1001 | ||
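For context, the out32 produced here is the ECDH shared X ordinate: with our scalar a (priv) and the peer's point B = b*G (pub2x32), both ends arrive at the same value because

	X( a * B ) = X( a*b * G ) = X( b * A ),   where A = a*G is our public point.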
894 | /* Generates a scalar that is in the range 1..order-1. */ | 1002 | /* Generates a random scalar in [1..order-1] range. */ |
895 | #define SIMPLIFY 1 | 1003 | static void sp_256_ecc_gen_k_8(sp_digit k[8]) |
896 | /* Add 1 to a. (a = a + 1) */ | ||
897 | static void sp_256_add_one_10(sp_digit* a) | ||
898 | { | ||
899 | a[0]++; | ||
900 | sp_256_norm_10(a); | ||
901 | } | ||
902 | static void sp_256_ecc_gen_k_10(sp_digit k[10]) | ||
903 | { | 1004 | { |
904 | #if !SIMPLIFY | 1005 | /* Since 32-bit words are "dense", no need to use |
905 | /* The order of the curve P256 minus 2. */ | 1006 | * sp_256_from_bin_8(k, buf) to convert random stream |
906 | static const sp_digit p256_order2[10] = { | 1007 | * to sp_digit array - just store random bits there directly. |
907 | 0x063254f,0x272b0bf,0x1e84f3b,0x2b69c5e,0x3bce6fa, | 1008 | */ |
908 | 0x3ffffff,0x3ffffff,0x00003ff,0x3ff0000,0x03fffff, | 1009 | tls_get_random(k, 8 * sizeof(k[0])); |
909 | }; | ||
910 | #endif | ||
911 | uint8_t buf[32]; | ||
912 | |||
913 | for (;;) { | ||
914 | tls_get_random(buf, sizeof(buf)); | ||
915 | #if FIXED_SECRET | 1010 | #if FIXED_SECRET |
916 | memset(buf, 0x77, sizeof(buf)); | 1011 | memset(k, 0x77, 8 * sizeof(k[0])); |
917 | #endif | 1012 | #endif |
918 | sp_256_from_bin_10(k, buf); | 1013 | |
919 | #if !SIMPLIFY | 1014 | // If scalar is too large, try again (pseudo-code) |
920 | if (sp_256_cmp_10(k, p256_order2) < 0) | 1015 | // if (k >= 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551 - 1) // order of P256 |
921 | break; | 1016 | // goto pick_another_random; |
922 | #else | 1017 | // k++; // ensure non-zero |
923 | /* non-loopy version (and not needing p256_order2[]): | 1018 | /* Simpler alternative, at the cost of not choosing some valid |
924 | * if most-significant word seems that k can be larger | 1019 | * random values, and slightly non-uniform distribution */ |
925 | * than p256_order2, fix it up: | 1020 | if (k[0] == 0) |
926 | */ | 1021 | k[0] = 1; |
927 | if (k[9] >= 0x03fffff) | 1022 | if (k[7] >= 0xffffffff) |
928 | k[9] = 0x03ffffe; | 1023 | k[7] = 0xfffffffe; |
929 | break; | ||
930 | #endif | ||
931 | } | ||
932 | sp_256_add_one_10(k); | ||
933 | #undef SIMPLIFY | ||
934 | } | 1024 | } |
935 | 1025 | ||
936 | /* Makes a random EC key pair. */ | 1026 | /* Makes a random EC key pair. */ |
937 | static void sp_ecc_make_key_256(sp_digit privkey[10], uint8_t *pubkey) | 1027 | static void sp_ecc_make_key_256(sp_digit privkey[8], uint8_t *pubkey) |
938 | { | 1028 | { |
939 | sp_point point[1]; | 1029 | sp_point point[1]; |
940 | 1030 | ||
941 | sp_256_ecc_gen_k_10(privkey); | 1031 | sp_256_ecc_gen_k_8(privkey); |
942 | dump_256("privkey %s\n", privkey); | 1032 | dump_256("privkey %s\n", privkey); |
943 | sp_256_ecc_mulmod_base_10(point, privkey); | 1033 | sp_256_ecc_mulmod_base_8(point, privkey); |
944 | dump_512("point->x %s\n", point->x); | 1034 | dump_512("point->x %s\n", point->x); |
945 | dump_512("point->y %s\n", point->y); | 1035 | dump_512("point->y %s\n", point->y); |
946 | sp_256_to_bin_10(point->x, pubkey); | 1036 | sp_256_to_bin_8(point->x, pubkey); |
947 | sp_256_to_bin_10(point->y, pubkey + 32); | 1037 | sp_256_to_bin_8(point->y, pubkey + 32); |
948 | 1038 | ||
949 | memset(point, 0, sizeof(point)); //paranoia | 1039 | memset(point, 0, sizeof(point)); //paranoia |
950 | } | 1040 | } |
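The pseudo-code comment in sp_256_ecc_gen_k_8() above describes the rejection-sampling scheme that the removed 26-bit code (sp_256_ecc_gen_k_10() with its p256_order2[] table) actually used. A hedged sketch of the same idea on 32-bit limbs, assuming sp_256_cmp_8() keeps the negative/zero/positive contract of the old sp_256_cmp_10() and reusing the file's tls_get_random(); the function name and the order-minus-2 constant are written out here for illustration and are not part of the patch:

	static void sp_256_ecc_gen_k_8_rejection(sp_digit k[8])
	{
		/* Group order of P256 minus 2, as eight little-endian 32-bit limbs
		 * (counterpart of the p256_order2[] table dropped with the 26-bit code) */
		static const sp_digit p256_order_minus_2[8] = {
			0xfc63254f, 0xf3b9cac2, 0xa7179e84, 0xbce6faad,
			0xffffffff, 0xffffffff, 0x00000000, 0xffffffff,
		};
		int i;

		do {
			tls_get_random(k, 8 * sizeof(k[0]));
			/* keep only candidates below order-2 ... */
		} while (sp_256_cmp_8(k, p256_order_minus_2) >= 0);

		/* ... then k+1 is uniformly distributed in [1 .. order-2] */
		for (i = 0; i < 8; i++)
			if (++k[i] != 0)
				break; /* stop once there is no carry into the next limb */
	}

The code as committed trades that loop for the two clamps on k[0] and k[7], which is smaller and never loops, at the cost of the slight non-uniformity its comment mentions.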
@@ -953,8 +1043,9 @@ void FAST_FUNC curve_P256_compute_pubkey_and_premaster( | |||
953 | uint8_t *pubkey2x32, uint8_t *premaster32, | 1043 | uint8_t *pubkey2x32, uint8_t *premaster32, |
954 | const uint8_t *peerkey2x32) | 1044 | const uint8_t *peerkey2x32) |
955 | { | 1045 | { |
956 | sp_digit privkey[10]; | 1046 | sp_digit privkey[8]; |
957 | 1047 | ||
1048 | dump_hex("peerkey2x32: %s\n", peerkey2x32, 64); | ||
958 | sp_ecc_make_key_256(privkey, pubkey2x32); | 1049 | sp_ecc_make_key_256(privkey, pubkey2x32); |
959 | dump_hex("pubkey: %s\n", pubkey2x32, 32); | 1050 | dump_hex("pubkey: %s\n", pubkey2x32, 32); |
960 | dump_hex(" %s\n", pubkey2x32 + 32, 32); | 1051 | dump_hex(" %s\n", pubkey2x32 + 32, 32); |