 networking/tls_sp_c32.c | 178 ++++++++++++++++----------------------
 1 file changed, 72 insertions(+), 106 deletions(-)
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 3291b553c..3452b08b9 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -49,9 +49,9 @@ typedef int32_t signed_sp_digit;
  */
 
 typedef struct sp_point {
-	sp_digit x[2 * 8];
-	sp_digit y[2 * 8];
-	sp_digit z[2 * 8];
+	sp_digit x[8];
+	sp_digit y[8];
+	sp_digit z[8];
 	int infinity;
 } sp_point;
 
@@ -456,12 +456,11 @@ static void sp_256_sub_8_p256_mod(sp_digit* r)
 #endif
 
 /* Multiply a and b into r. (r = a * b)
- * r should be [16] array (512 bits).
+ * r should be [16] array (512 bits), and must not coincide with a or b.
  */
 static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 {
 #if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
-	sp_digit rr[15]; /* in case r coincides with a or b */
 	int k;
 	uint32_t accl;
 	uint32_t acch;
@@ -493,16 +492,15 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 			j--;
 			i++;
 		} while (i != 8 && i <= k);
-		rr[k] = accl;
+		r[k] = accl;
 		accl = acch;
 		acch = acc_hi;
 	}
 	r[15] = accl;
-	memcpy(r, rr, sizeof(rr));
 #elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
 	const uint64_t* aa = (const void*)a;
 	const uint64_t* bb = (const void*)b;
-	uint64_t rr[8];
+	uint64_t* rr = (void*)r;
 	int k;
 	uint64_t accl;
 	uint64_t acch;
@@ -539,11 +537,8 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 		acch = acc_hi;
 	}
 	rr[7] = accl;
-	memcpy(r, rr, sizeof(rr));
 #elif 0
 	//TODO: arm assembly (untested)
-	sp_digit tmp[16];
-
 	asm volatile (
 "\n		mov	r5, #0"
 "\n		mov	r6, #0"
@@ -575,12 +570,10 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 "\n		cmp	r5, #56"
 "\n		ble	1b"
 "\n		str	r6, [%[r], r5]"
-		: [r] "r" (tmp), [a] "r" (a), [b] "r" (b)
+		: [r] "r" (r), [a] "r" (a), [b] "r" (b)
 		: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14"
 	);
-	memcpy(r, tmp, sizeof(tmp));
 #else
-	sp_digit rr[15]; /* in case r coincides with a or b */
 	int i, j, k;
 	uint64_t acc;
 
@@ -600,11 +593,10 @@ static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 			j--;
 			i++;
 		} while (i != 8 && i <= k);
-		rr[k] = acc;
+		r[k] = acc;
 		acc = (acc >> 32) | ((uint64_t)acc_hi << 32);
 	}
 	r[15] = acc;
-	memcpy(r, rr, sizeof(rr));
 #endif
 }
 
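The new "must not coincide with a or b" precondition exists because, with the copy-out removed, each finished result word is stored straight into r[] while a[] and b[] are still being read. A minimal portable sketch of the same idea (illustrative only, not the busybox code; the function name is made up):

	#include <stdint.h>
	#include <string.h>

	/* Schoolbook multiply of two 8x32-bit-limb numbers into a 16-limb result.
	 * Writing r[] while a[]/b[] are still being read is only safe because the
	 * caller guarantees r does not overlap a or b. */
	static void mul_8x8_sketch(uint32_t r[16], const uint32_t a[8], const uint32_t b[8])
	{
		int i, j;

		memset(r, 0, 16 * sizeof(r[0]));
		for (i = 0; i < 8; i++) {
			uint64_t carry = 0;
			for (j = 0; j < 8; j++) {
				uint64_t t = (uint64_t)a[i] * b[j] + r[i + j] + carry;
				r[i + j] = (uint32_t)t;
				carry = t >> 32;
			}
			r[i + 8] = (uint32_t)carry; /* top word of this partial row */
		}
	}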
@@ -709,30 +701,11 @@ static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit*
 }
 
 /* Shift the result in the high 256 bits down to the bottom.
- * High half is cleared to zeros.
  */
-#if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
-static void sp_512to256_mont_shift_8(sp_digit* rr)
+static void sp_512to256_mont_shift_8(sp_digit* r, sp_digit* a)
 {
-	uint64_t *r = (void*)rr;
-	int i;
-
-	for (i = 0; i < 4; i++) {
-		r[i] = r[i+4];
-		r[i+4] = 0;
-	}
+	memcpy(r, a + 8, sizeof(*r) * 8);
 }
-#else
-static void sp_512to256_mont_shift_8(sp_digit* r)
-{
-	int i;
-
-	for (i = 0; i < 8; i++) {
-		r[i] = r[i+8];
-		r[i+8] = 0;
-	}
-}
-#endif
 
 /* Mul a by scalar b and add into r. (r += a * b)
  * a = p256_mod
@@ -868,11 +841,12 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
  * Note: the result is NOT guaranteed to be less than p256_mod!
  * (it is only guaranteed to fit into 256 bits).
  *
- * a   Double-wide number to reduce in place.
+ * r   Result.
+ * a   Double-wide number to reduce. Clobbered.
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
+static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
 {
 //	const sp_digit* m = p256_mod;
 	sp_digit mp = p256_mp_mod;
@@ -895,10 +869,10 @@ static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit
 					goto inc_next_word0;
 			}
 		}
-		sp_512to256_mont_shift_8(a);
+		sp_512to256_mont_shift_8(r, a);
 		if (word16th != 0)
-			sp_256_sub_8_p256_mod(a);
-		sp_256_norm_8(a);
+			sp_256_sub_8_p256_mod(r);
+		sp_256_norm_8(r);
 	}
 	else { /* Same code for explicit mp == 1 (which is always the case for P256) */
 		sp_digit word16th = 0;
@@ -915,10 +889,10 @@ static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit
 					goto inc_next_word;
 			}
 		}
-		sp_512to256_mont_shift_8(a);
+		sp_512to256_mont_shift_8(r, a);
 		if (word16th != 0)
-			sp_256_sub_8_p256_mod(a);
-		sp_256_norm_8(a);
+			sp_256_sub_8_p256_mod(r);
+		sp_256_norm_8(r);
 	}
 }
 
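A side note on the "explicit mp == 1" fast path kept above: the lowest 32-bit word of p256_mod is 0xffffffff, i.e. m is congruent to -1 mod 2^32, so the negative inverse mp = -m^(-1) mod 2^32 is exactly 1. A small standalone check of that claim (sketch, not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t m0 = 0xffffffff; /* lowest limb of p256_mod */
		uint32_t inv = 1;
		int i;

		/* Newton iteration for the inverse of an odd m0 modulo 2^32 */
		for (i = 0; i < 5; i++)
			inv *= 2 - m0 * inv;
		printf("mp = %u\n", 0u - inv); /* prints 1 */
		return 0;
	}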
@@ -926,35 +900,34 @@ static void sp_512to256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit
  * (r = a * b mod m)
  *
  * r   Result of multiplication.
- *     Should be [16] array (512 bits), but high half is cleared to zeros (used as scratch pad).
  * a   First number to multiply in Montogmery form.
  * b   Second number to multiply in Montogmery form.
  * m   Modulus (prime).
  * mp  Montogmery mulitplier.
  */
-static void sp_256to512z_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
+static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
 		/*, const sp_digit* m, sp_digit mp*/)
 {
 	//const sp_digit* m = p256_mod;
 	//sp_digit mp = p256_mp_mod;
-	sp_256to512_mul_8(r, a, b);
-	sp_512to256_mont_reduce_8(r /*, m, mp*/);
+	sp_digit t[2 * 8];
+	sp_256to512_mul_8(t, a, b);
+	sp_512to256_mont_reduce_8(r, t /*, m, mp*/);
 }
 
 /* Square the Montgomery form number. (r = a * a mod m)
  *
  * r   Result of squaring.
- *     Should be [16] array (512 bits), but high half is cleared to zeros (used as scratch pad).
  * a   Number to square in Montogmery form.
  * m   Modulus (prime).
  * mp  Montogmery mulitplier.
  */
-static void sp_256to512z_mont_sqr_8(sp_digit* r, const sp_digit* a
+static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a
 		/*, const sp_digit* m, sp_digit mp*/)
 {
 	//const sp_digit* m = p256_mod;
 	//sp_digit mp = p256_mp_mod;
-	sp_256to512z_mont_mul_8(r, a, a /*, m, mp*/);
+	sp_256_mont_mul_8(r, a, a /*, m, mp*/);
 }
 
 /* Invert the number, in Montgomery form, modulo the modulus (prime) of the
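With this change the double-wide scratch buffer becomes an internal detail of the multiplier, so callers only ever deal with 256-bit operands. Roughly, the new calling convention looks like this (sketch only; with R = 2^256, a Montgomery multiply returns a*b*R^-1 mod p256_mod):

	sp_digit x[8], y[8], xy[8]; /* 256-bit operands, already in Montgomery form */

	sp_256_mont_mul_8(xy, x, y /*, p256_mod, p256_mp_mod*/); /* xy = x*y*R^-1 mod p */
	sp_256_mont_sqr_8(xy, xy /*, p256_mod, p256_mp_mod*/);   /* xy = xy^2*R^-1 mod p */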
@@ -964,11 +937,8 @@ static void sp_256to512z_mont_sqr_8(sp_digit* r, const sp_digit* a
  * a  Number to invert.
  */
 #if 0
-/* Mod-2 for the P256 curve. */
-static const uint32_t p256_mod_2[8] = {
-	0xfffffffd,0xffffffff,0xffffffff,0x00000000,
-	0x00000000,0x00000000,0x00000001,0xffffffff,
-};
+//p256_mod - 2:
+//ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff - 2
 //Bit pattern:
 //2 2 2 2 2 2 2 1...1
 //5 5 4 3 2 1 0 9...0 9...1
@@ -977,15 +947,15 @@ static const uint32_t p256_mod_2[8] = {
 #endif
 static void sp_256_mont_inv_8(sp_digit* r, sp_digit* a)
 {
-	sp_digit t[2*8];
+	sp_digit t[8];
 	int i;
 
 	memcpy(t, a, sizeof(sp_digit) * 8);
 	for (i = 254; i >= 0; i--) {
-		sp_256to512z_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/);
+		sp_256_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/);
 		/*if (p256_mod_2[i / 32] & ((sp_digit)1 << (i % 32)))*/
 		if (i >= 224 || i == 192 || (i <= 95 && i != 1))
-			sp_256to512z_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/);
+			sp_256_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/);
 	}
 	memcpy(r, t, sizeof(sp_digit) * 8);
 }
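The hard-coded condition in the loop above is just the bit pattern of p256_mod - 2 (Fermat inversion computes a^(p-2)). A quick standalone cross-check against the table that this patch removes (sketch, not part of the file):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		/* p256_mod - 2 as eight little-endian 32-bit words (the removed p256_mod_2[] table) */
		static const uint32_t p256_mod_2[8] = {
			0xfffffffd,0xffffffff,0xffffffff,0x00000000,
			0x00000000,0x00000000,0x00000001,0xffffffff,
		};
		int i;

		for (i = 254; i >= 0; i--) {
			int bit = (p256_mod_2[i / 32] >> (i % 32)) & 1;
			int cond = (i >= 224 || i == 192 || (i <= 95 && i != 1));
			assert(bit == cond);
		}
		return 0;
	}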
@@ -1056,25 +1026,28 @@ static void sp_256_mod_mul_norm_8(sp_digit* r, const sp_digit* a)
  */
 static void sp_256_map_8(sp_point* r, sp_point* p)
 {
-	sp_digit t1[2*8];
-	sp_digit t2[2*8];
+	sp_digit t1[8];
+	sp_digit t2[8];
+	sp_digit rr[2 * 8];
 
 	sp_256_mont_inv_8(t1, p->z);
 
-	sp_256to512z_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);
 
 	/* x /= z^2 */
-	sp_256to512z_mont_mul_8(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/);
-	sp_512to256_mont_reduce_8(r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(rr, p->x, t2 /*, p256_mod, p256_mp_mod*/);
+	memset(rr + 8, 0, sizeof(rr) / 2);
+	sp_512to256_mont_reduce_8(r->x, rr /*, p256_mod, p256_mp_mod*/);
 	/* Reduce x to less than modulus */
 	if (sp_256_cmp_8(r->x, p256_mod) >= 0)
 		sp_256_sub_8_p256_mod(r->x);
 	sp_256_norm_8(r->x);
 
 	/* y /= z^3 */
-	sp_256to512z_mont_mul_8(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/);
-	sp_512to256_mont_reduce_8(r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(rr, p->y, t1 /*, p256_mod, p256_mp_mod*/);
+	memset(rr + 8, 0, sizeof(rr) / 2);
+	sp_512to256_mont_reduce_8(r->y, rr /*, p256_mod, p256_mp_mod*/);
 	/* Reduce y to less than modulus */
 	if (sp_256_cmp_8(r->y, p256_mod) >= 0)
 		sp_256_sub_8_p256_mod(r->y);
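The added memset may look odd at first glance; it works because sp_512to256_mont_reduce_8() maps a 512-bit input t to t*R^-1 mod p256_mod (R = 2^256), so feeding it a Montgomery value x*R zero-extended to 512 bits returns plain x. A sketch of that idiom in isolation (illustrative names, using the patched helpers):

	sp_digit mont_val[8]; /* x*R mod p, i.e. a value in Montgomery form */
	sp_digit plain[8];
	sp_digit t[2 * 8];

	memcpy(t, mont_val, sizeof(mont_val));
	memset(t + 8, 0, sizeof(mont_val)); /* zero-extend to 512 bits */
	sp_512to256_mont_reduce_8(plain, t /*, p256_mod, p256_mp_mod*/);
	/* plain == x, except the reducer only guarantees a 256-bit result,
	 * so callers still normalize with sp_256_cmp_8/sp_256_sub_8_p256_mod
	 * exactly as sp_256_map_8() does above */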
@@ -1091,8 +1064,8 @@ static void sp_256_map_8(sp_point* r, sp_point* p)
  */
 static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 {
-	sp_digit t1[2*8];
-	sp_digit t2[2*8];
+	sp_digit t1[8];
+	sp_digit t2[8];
 
 	/* Put point to double into result */
 	if (r != p)
@@ -1101,17 +1074,10 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	if (r->infinity)
 		return;
 
-	if (SP_DEBUG) {
-		/* unused part of t2, may result in spurios
-		 * differences in debug output. Clear it.
-		 */
-		memset(t2, 0, sizeof(t2));
-	}
-
 	/* T1 = Z * Z */
-	sp_256to512z_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
 	/* Z = Y * Z */
-	sp_256to512z_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
 	/* Z = 2Z */
 	sp_256_mont_dbl_8(r->z, r->z /*, p256_mod*/);
 	/* T2 = X - T1 */
@@ -1119,21 +1085,21 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	/* T1 = X + T1 */
 	sp_256_mont_add_8(t1, r->x, t1 /*, p256_mod*/);
 	/* T2 = T1 * T2 */
-	sp_256to512z_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
 	/* T1 = 3T2 */
 	sp_256_mont_tpl_8(t1, t2 /*, p256_mod*/);
 	/* Y = 2Y */
 	sp_256_mont_dbl_8(r->y, r->y /*, p256_mod*/);
 	/* Y = Y * Y */
-	sp_256to512z_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
 	/* T2 = Y * Y */
-	sp_256to512z_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
 	/* T2 = T2/2 */
 	sp_256_div2_8(t2 /*, p256_mod*/);
 	/* Y = Y * X */
-	sp_256to512z_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
 	/* X = T1 * T1 */
-	sp_256to512z_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
 	/* X = X - Y */
 	sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/);
 	/* X = X - Y */
@@ -1141,7 +1107,7 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 	/* Y = Y - X */
 	sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
 	/* Y = Y * T1 */
-	sp_256to512z_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
 	/* Y = Y - T2 */
 	sp_256_mont_sub_8(r->y, r->y, t2 /*, p256_mod*/);
 	dump_512("y2 %s\n", r->y);
@@ -1155,11 +1121,11 @@ static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
  */
 static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point* q)
 {
-	sp_digit t1[2*8];
-	sp_digit t2[2*8];
-	sp_digit t3[2*8];
-	sp_digit t4[2*8];
-	sp_digit t5[2*8];
+	sp_digit t1[8];
+	sp_digit t2[8];
+	sp_digit t3[8];
+	sp_digit t4[8];
+	sp_digit t5[8];
 
 	/* Ensure only the first point is the same as the result. */
 	if (q == r) {
@@ -1186,36 +1152,36 @@ static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point*
 	}
 
 	/* U1 = X1*Z2^2 */
-	sp_256to512z_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/);
 	/* U2 = X2*Z1^2 */
-	sp_256to512z_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
 	/* S1 = Y1*Z2^3 */
-	sp_256to512z_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/);
 	/* S2 = Y2*Z1^3 */
-	sp_256to512z_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
 	/* H = U2 - U1 */
 	sp_256_mont_sub_8(t2, t2, t1 /*, p256_mod*/);
 	/* R = S2 - S1 */
 	sp_256_mont_sub_8(t4, t4, t3 /*, p256_mod*/);
 	/* Z3 = H*Z1*Z2 */
-	sp_256to512z_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/);
 	/* X3 = R^2 - H^3 - 2*U1*H^2 */
-	sp_256to512z_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
 	sp_256_mont_sub_8(r->x, r->x, t5 /*, p256_mod*/);
 	sp_256_mont_dbl_8(t1, r->y /*, p256_mod*/);
 	sp_256_mont_sub_8(r->x, r->x, t1 /*, p256_mod*/);
 	/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
 	sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
-	sp_256to512z_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/);
-	sp_256to512z_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/);
+	sp_256_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
 	sp_256_mont_sub_8(r->y, r->y, t5 /*, p256_mod*/);
 }
 