| author | Denys Vlasenko <vda.linux@googlemail.com> | 2021-11-27 11:28:11 +0100 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2021-11-27 11:28:11 +0100 |
| commit | 4bc9da10718df7ed9e992b1ddd2e80d53d894177 (patch) | |
| tree | 3225ca484904b0f3d85a40cbbad02636b27f7aa7 | |
| parent | 15f7d618ea7f8c3a0277c98309268b709e20d77c (diff) | |
tls: P256: 64-bit optimizations
```
function                                             old     new   delta
sp_256_proj_point_dbl_8                              421     428      +7
sp_256_point_from_bin2x32                             78      84      +6
sp_256_cmp_8                                          38      42      +4
sp_256_to_bin_8                                       28      31      +3
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 4/0 up/down: 20/0)             Total: 20 bytes
```
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
| -rw-r--r-- | include/platform.h | 2 |
|---|---|---|
| -rw-r--r-- | networking/tls_sp_c32.c | 114 |

2 files changed, 101 insertions, 15 deletions
```diff
diff --git a/include/platform.h b/include/platform.h
index 9e1fb047d..ad27bb31a 100644
--- a/include/platform.h
+++ b/include/platform.h
@@ -239,6 +239,7 @@ typedef uint64_t bb__aliased_uint64_t FIX_ALIASING;
 # define move_from_unaligned_long(v, longp) ((v) = *(bb__aliased_long*)(longp))
 # define move_from_unaligned16(v, u16p) ((v) = *(bb__aliased_uint16_t*)(u16p))
 # define move_from_unaligned32(v, u32p) ((v) = *(bb__aliased_uint32_t*)(u32p))
+# define move_from_unaligned64(v, u64p) ((v) = *(bb__aliased_uint64_t*)(u64p))
 # define move_to_unaligned16(u16p, v) (*(bb__aliased_uint16_t*)(u16p) = (v))
 # define move_to_unaligned32(u32p, v) (*(bb__aliased_uint32_t*)(u32p) = (v))
 # define move_to_unaligned64(u64p, v) (*(bb__aliased_uint64_t*)(u64p) = (v))
@@ -250,6 +251,7 @@ typedef uint64_t bb__aliased_uint64_t FIX_ALIASING;
 # define move_from_unaligned_long(v, longp) (memcpy(&(v), (longp), sizeof(long)))
 # define move_from_unaligned16(v, u16p) (memcpy(&(v), (u16p), 2))
 # define move_from_unaligned32(v, u32p) (memcpy(&(v), (u32p), 4))
+# define move_from_unaligned64(v, u64p) (memcpy(&(v), (u64p), 8))
 # define move_to_unaligned16(u16p, v) do { \
 	uint16_t __t = (v); \
 	memcpy((u16p), &__t, 2); \
```
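The platform.h hunks round out the existing unaligned-access helpers with a 64-bit load, move_from_unaligned64 (the 64-bit store already existed), in both the direct-dereference and the memcpy fallback flavors. As a rough standalone sketch (not busybox code; only the macro bodies are taken from the hunk above), the memcpy form lets a caller read and write a 64-bit value at any byte offset without caring about alignment:

```c
/* Standalone sketch (not part of the commit): the memcpy-based fallback
 * form of the unaligned 64-bit helpers, mirroring the hunk above. The
 * direct-dereference variant is used instead when the target is known to
 * tolerate unaligned access (BB_UNALIGNED_MEMACCESS_OK). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define move_from_unaligned64(v, u64p) (memcpy(&(v), (u64p), 8))
#define move_to_unaligned64(u64p, v) do { \
	uint64_t __t = (v); \
	memcpy((u64p), &__t, 8); \
} while (0)

int main(void)
{
	uint8_t buf[16] = { 0 };
	uint64_t v;

	move_to_unaligned64(buf + 3, 0x1122334455667788ULL); /* deliberately misaligned */
	move_from_unaligned64(v, buf + 3);
	printf("%016llx\n", (unsigned long long)v); /* prints 1122334455667788 */
	return 0;
}
```

The remaining hunks are all in networking/tls_sp_c32.c.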
```diff
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 4d4ecdd74..d09f7e881 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -29,6 +29,20 @@ static void dump_hex(const char *fmt, const void *vp, int len)
 typedef uint32_t sp_digit;
 typedef int32_t signed_sp_digit;
 
+/* 64-bit optimizations:
+ * if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff,
+ * then loads and stores can be done in 64-bit chunks.
+ *
+ * A narrower case is when arch is also little-endian (such as x86_64),
+ * then "LSW first", uint32[8] and uint64[4] representations are equivalent,
+ * and arithmetic can be done in 64 bits too.
+ */
+#if defined(__GNUC__) && defined(__x86_64__)
+# define UNALIGNED_LE_64BIT 1
+#else
+# define UNALIGNED_LE_64BIT 0
+#endif
+
 /* The code below is taken from parts of
  * wolfssl-3.15.3/wolfcrypt/src/sp_c32.c
  * and heavily modified.
```
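The new comment block sets out two tiers of optimization: any target with BB_UNALIGNED_MEMACCESS_OK and 64-bit longs can do loads and stores in 8-byte chunks, and a little-endian one (UNALIGNED_LE_64BIT, currently enabled only for GCC on x86_64) can additionally treat the eight 32-bit limbs as four 64-bit limbs for arithmetic. A standalone check of that layout claim (my illustration, not commit code):

```c
/* Standalone check (not from the commit): on a little-endian machine the
 * "LSW first" uint32[8] and uint64[4] views of one 256-bit number occupy
 * identical bytes, so a uint64_t* view of the limb array can be used for
 * arithmetic directly. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint32_t limbs32[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }; /* least significant word first */
	uint64_t limbs64[4];

	memcpy(limbs64, limbs32, sizeof(limbs64));
	/* Holds on little-endian targets such as x86_64; fails on big-endian ones */
	printf("%s\n", limbs64[0] == (((uint64_t)limbs32[1] << 32) | limbs32[0])
			? "LE layout matches" : "layouts differ");
	return 0;
}
```

The first users of the 8-byte load/store tier are the byte-array conversion helpers below.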
```diff
@@ -58,6 +72,22 @@ static const sp_digit p256_mod[8] = {
  * r A single precision integer.
  * a Byte array.
  */
+#if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
+static void sp_256_to_bin_8(const sp_digit* rr, uint8_t* a)
+{
+	int i;
+	const uint64_t* r = (void*)rr;
+
+	sp_256_norm_8(rr);
+
+	r += 4;
+	for (i = 0; i < 4; i++) {
+		r--;
+		move_to_unaligned64(a, SWAP_BE64(*r));
+		a += 8;
+	}
+}
+#else
 static void sp_256_to_bin_8(const sp_digit* r, uint8_t* a)
 {
 	int i;
@@ -71,6 +101,7 @@ static void sp_256_to_bin_8(const sp_digit* r, uint8_t* a)
 		a += 4;
 	}
 }
+#endif
 
 /* Read big endian unsigned byte array into r.
  *
@@ -78,6 +109,21 @@ static void sp_256_to_bin_8(const sp_digit* r, uint8_t* a)
  * a Byte array.
  * n Number of bytes in array to read.
  */
+#if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
+static void sp_256_from_bin_8(sp_digit* rr, const uint8_t* a)
+{
+	int i;
+	uint64_t* r = (void*)rr;
+
+	r += 4;
+	for (i = 0; i < 4; i++) {
+		uint64_t v;
+		move_from_unaligned64(v, a);
+		*--r = SWAP_BE64(v);
+		a += 8;
+	}
+}
+#else
 static void sp_256_from_bin_8(sp_digit* r, const uint8_t* a)
 {
 	int i;
@@ -90,6 +136,7 @@ static void sp_256_from_bin_8(sp_digit* r, const uint8_t* a)
 		a += 4;
 	}
 }
+#endif
 
 #if SP_DEBUG
 static void dump_256(const char *fmt, const sp_digit* r)
```
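The external format stays a big-endian 32-byte array; the new variants simply walk it as four 8-byte chunks instead of eight 4-byte ones, swapping each chunk with SWAP_BE64. A condensed standalone sketch of the same round trip, assuming a little-endian host and substituting __builtin_bswap64 plus memcpy for the busybox SWAP_BE64 and unaligned-move macros:

```c
/* Standalone sketch (assumptions: little-endian host, GCC/Clang for
 * __builtin_bswap64). The real code uses SWAP_BE64 and the unaligned-move
 * macros, and keeps a 32-bit fallback for other configurations. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 256-bit value as eight 32-bit limbs, least significant limb first */
static void to_bin_256(const uint32_t *rr, uint8_t *a)
{
	const uint64_t *r = (const void*)rr; /* cast mirrors the commit's 64-bit view */
	int i;

	r += 4;
	for (i = 0; i < 4; i++) {
		uint64_t v = __builtin_bswap64(*--r); /* host LE -> big-endian bytes */
		memcpy(a, &v, 8);
		a += 8;
	}
}

static void from_bin_256(uint32_t *rr, const uint8_t *a)
{
	uint64_t *r = (void*)rr;
	int i;

	r += 4;
	for (i = 0; i < 4; i++) {
		uint64_t v;
		memcpy(&v, a, 8);
		*--r = __builtin_bswap64(v); /* big-endian bytes -> host LE */
		a += 8;
	}
}

int main(void)
{
	uint32_t n[8] = { 0x11223344, 0x55667788, 3, 4, 5, 6, 7, 0xdeadbeef };
	uint32_t back[8];
	uint8_t bytes[32];

	to_bin_256(n, bytes);
	from_bin_256(back, bytes);
	printf("round trip %s\n", memcmp(n, back, sizeof(n)) == 0 ? "ok" : "failed");
	return 0;
}
```

The comparison helper below uses the stricter UNALIGNED_LE_64BIT tier, doing the limb comparisons directly on 64-bit words.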
```diff
@@ -125,6 +172,20 @@ static void sp_256_point_from_bin2x32(sp_point* p, const uint8_t *bin2x32)
  * return -ve, 0 or +ve if a is less than, equal to or greater than b
  * respectively.
  */
+#if UNALIGNED_LE_64BIT
+static signed_sp_digit sp_256_cmp_8(const sp_digit* aa, const sp_digit* bb)
+{
+	const uint64_t* a = (void*)aa;
+	const uint64_t* b = (void*)bb;
+	int i;
+	for (i = 3; i >= 0; i--) {
+		if (a[i] == b[i])
+			continue;
+		return (a[i] > b[i]) * 2 - 1;
+	}
+	return 0;
+}
+#else
 static signed_sp_digit sp_256_cmp_8(const sp_digit* a, const sp_digit* b)
 {
 	int i;
@@ -140,6 +201,7 @@ static signed_sp_digit sp_256_cmp_8(const sp_digit* a, const sp_digit* b)
 	}
 	return 0;
 }
+#endif
 
 /* Compare two numbers to determine if they are equal.
  *
```
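In the 64-bit comparison, the relational expression `a[i] > b[i]` evaluates to 1 or 0, so `* 2 - 1` turns it into the required +1 or -1 return value without a separate three-way branch. A tiny standalone demonstration of that mapping (illustration only):

```c
/* Standalone illustration of the sign trick used above: a relational
 * expression evaluates to 1 or 0, so "flag * 2 - 1" yields +1 when a > b
 * and -1 when a < b. */
#include <stdint.h>
#include <stdio.h>

static int cmp64(uint64_t a, uint64_t b)
{
	if (a == b)
		return 0;
	return (a > b) * 2 - 1; /* 1*2-1 = +1, 0*2-1 = -1 */
}

int main(void)
{
	printf("%d %d %d\n", cmp64(5, 3), cmp64(3, 5), cmp64(7, 7)); /* 1 -1 0 */
	return 0;
}
```

The next hunks drop the per-function comments about the x86_64 64-bit view, which are now covered by the comment added at the top of the file, and rework the one-bit right shift.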
```diff
@@ -196,8 +258,6 @@ static int sp_256_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 	);
 	return reg;
 #elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
-	/* x86_64 has no alignment restrictions, and is little-endian,
-	 * so 64-bit and 32-bit representations are identical */
 	uint64_t reg;
 	asm volatile (
 "\n		movq	(%0), %3"
@@ -294,8 +354,6 @@ static int sp_256_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 	);
 	return reg;
 #elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
-	/* x86_64 has no alignment restrictions, and is little-endian,
-	 * so 64-bit and 32-bit representations are identical */
 	uint64_t reg;
 	asm volatile (
 "\n		movq	(%0), %3"
@@ -440,8 +498,6 @@ static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 	r[15] = accl;
 	memcpy(r, rr, sizeof(rr));
 #elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
-	/* x86_64 has no alignment restrictions, and is little-endian,
-	 * so 64-bit and 32-bit representations are identical */
 	const uint64_t* aa = (const void*)a;
 	const uint64_t* bb = (const void*)b;
 	uint64_t rr[8];
@@ -551,17 +607,32 @@ static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 }
 
 /* Shift number right one bit. Bottom bit is lost. */
-static void sp_256_rshift1_8(sp_digit* r, sp_digit* a, sp_digit carry)
+#if UNALIGNED_LE_64BIT
+static void sp_256_rshift1_8(sp_digit* rr, uint64_t carry)
+{
+	uint64_t *r = (void*)rr;
+	int i;
+
+	carry = (((uint64_t)!!carry) << 63);
+	for (i = 3; i >= 0; i--) {
+		uint64_t c = r[i] << 63;
+		r[i] = (r[i] >> 1) | carry;
+		carry = c;
+	}
+}
+#else
+static void sp_256_rshift1_8(sp_digit* r, sp_digit carry)
 {
 	int i;
 
-	carry = (!!carry << 31);
+	carry = (((sp_digit)!!carry) << 31);
 	for (i = 7; i >= 0; i--) {
-		sp_digit c = a[i] << 31;
-		r[i] = (a[i] >> 1) | carry;
+		sp_digit c = r[i] << 31;
+		r[i] = (r[i] >> 1) | carry;
 		carry = c;
 	}
 }
+#endif
 
 /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) */
 static void sp_256_div2_8(sp_digit* r, const sp_digit* a, const sp_digit* m)
@@ -570,7 +641,7 @@ static void sp_256_div2_8(sp_digit* r, const sp_digit* a, const sp_digit* m)
 	if (a[0] & 1)
 		carry = sp_256_add_8(r, a, m);
 	sp_256_norm_8(r);
-	sp_256_rshift1_8(r, r, carry);
+	sp_256_rshift1_8(r, carry);
 }
 
 /* Add two Montgomery form numbers (r = a + b % m) */
```
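sp_256_rshift1_8 now shifts its argument in place, which is why sp_256_div2_8 passes one argument less, and the 64-bit variant needs only four iterations, carrying the bit shifted out of each limb into the next lower one. The carry parameter exists because sp_256_div2_8 may first add the modulus to make the value even, and that addition can overflow into a 257th bit that the shift has to bring back in at the top. A small-number standalone illustration of the halving idea (not commit code):

```c
/* Standalone illustration (small numbers, not commit code) of the idea
 * behind sp_256_div2_8: to halve a modulo an odd prime m, make a even
 * first by adding m when a is odd, then shift right; in the fixed-width
 * 256-bit case the addition may carry, and that carry is the bit shifted
 * back in at the top. */
#include <stdio.h>

static unsigned div2_mod(unsigned a, unsigned m) /* assumes a < m, m odd */
{
	if (a & 1)
		a += m;   /* a is now even */
	return a >> 1;    /* (a / 2) mod m */
}

int main(void)
{
	unsigned m = 23; /* small odd modulus standing in for p256 */
	unsigned h = div2_mod(7, m);
	printf("7/2 mod 23 = %u, check: %u\n", h, (h * 2) % m); /* 15, check: 7 */
	return 0;
}
```

The final hunks give sp_256_mont_shift_8 the same in-place, 64-bit treatment and adjust its two call sites in sp_256_mont_reduce_8.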
```diff
@@ -634,15 +705,28 @@ static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit* m*/)
 }
 
 /* Shift the result in the high 256 bits down to the bottom. */
-static void sp_256_mont_shift_8(sp_digit* r, const sp_digit* a)
+#if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
+static void sp_256_mont_shift_8(sp_digit* rr)
+{
+	uint64_t *r = (void*)rr;
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		r[i] = r[i+4];
+		r[i+4] = 0;
+	}
+}
+#else
+static void sp_256_mont_shift_8(sp_digit* r)
 {
 	int i;
 
 	for (i = 0; i < 8; i++) {
-		r[i] = a[i+8];
+		r[i] = r[i+8];
 		r[i+8] = 0;
 	}
 }
+#endif
 
 /* Mul a by scalar b and add into r. (r += a * b) */
 static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
@@ -800,7 +884,7 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
 				goto inc_next_word0;
 		}
 	}
-	sp_256_mont_shift_8(a, a);
+	sp_256_mont_shift_8(a);
 	if (word16th != 0)
 		sp_256_sub_8_p256_mod(a);
 	sp_256_norm_8(a);
@@ -820,7 +904,7 @@ static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
 				goto inc_next_word;
 		}
 	}
-	sp_256_mont_shift_8(a, a);
+	sp_256_mont_shift_8(a);
 	if (word16th != 0)
 		sp_256_sub_8_p256_mod(a);
 	sp_256_norm_8(a);
```
