diff options
| author | Denys Vlasenko <vda.linux@googlemail.com> | 2021-04-26 23:07:32 +0200 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2021-04-26 23:09:20 +0200 |
| commit | d728a30c211c2df6adccd64c6e2fc23387b341f2 (patch) | |
| tree | 36264c52731ebb18a92863aa2a7158f978bda7cc | |
| parent | 9a40be433de31b8a7fea20b7ebce3dafbedaf504 (diff) | |
| download | busybox-w32-d728a30c211c2df6adccd64c6e2fc23387b341f2.tar.gz busybox-w32-d728a30c211c2df6adccd64c6e2fc23387b341f2.tar.bz2 busybox-w32-d728a30c211c2df6adccd64c6e2fc23387b341f2.zip | |
tls: add a patch with optimization which _should_ give better code
...but does not.
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
| -rw-r--r-- | networking/tls_sp_c32.patch | 142 |
1 files changed, 142 insertions, 0 deletions
diff --git a/networking/tls_sp_c32.patch b/networking/tls_sp_c32.patch new file mode 100644 index 000000000..7559586c9 --- /dev/null +++ b/networking/tls_sp_c32.patch | |||
| @@ -0,0 +1,142 @@ | |||
| 1 | Somehow, gcc 6+ performs this optimization as well as or better than the | ||
| 2 | hand-written optimized code below (gcc seems to eliminate the a32[] array and | ||
| 3 | use 32-bit registers/memory for the "lower halves" of the a32[i] elements). | ||
| 4 | |||
| 5 | But could there be architectures where gcc does not optimize this as well? | ||
| 6 | |||
| 7 | diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c | ||
| 8 | index 72a3be537..e8a011ad1 100644 | ||
| 9 | --- a/networking/tls_sp_c32.c | ||
| 10 | +++ b/networking/tls_sp_c32.c | ||
| 11 | @@ -228,51 +228,96 @@ static void sp_256_rshift1_10(sp_digit* r, sp_digit* a) | ||
| 12 | static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a) | ||
| 13 | { | ||
| 14 | int64_t t[8]; | ||
| 15 | - int64_t a32[8]; | ||
| 16 | + uint32_t a32; | ||
| 17 | int64_t o; | ||
| 18 | |||
| 19 | - a32[0] = a[0]; | ||
| 20 | - a32[0] |= a[1] << 26; | ||
| 21 | - a32[0] &= 0xffffffff; | ||
| 22 | - a32[1] = (sp_digit)(a[1] >> 6); | ||
| 23 | - a32[1] |= a[2] << 20; | ||
| 24 | - a32[1] &= 0xffffffff; | ||
| 25 | - a32[2] = (sp_digit)(a[2] >> 12); | ||
| 26 | - a32[2] |= a[3] << 14; | ||
| 27 | - a32[2] &= 0xffffffff; | ||
| 28 | - a32[3] = (sp_digit)(a[3] >> 18); | ||
| 29 | - a32[3] |= a[4] << 8; | ||
| 30 | - a32[3] &= 0xffffffff; | ||
| 31 | - a32[4] = (sp_digit)(a[4] >> 24); | ||
| 32 | - a32[4] |= a[5] << 2; | ||
| 33 | - a32[4] |= a[6] << 28; | ||
| 34 | - a32[4] &= 0xffffffff; | ||
| 35 | - a32[5] = (sp_digit)(a[6] >> 4); | ||
| 36 | - a32[5] |= a[7] << 22; | ||
| 37 | - a32[5] &= 0xffffffff; | ||
| 38 | - a32[6] = (sp_digit)(a[7] >> 10); | ||
| 39 | - a32[6] |= a[8] << 16; | ||
| 40 | - a32[6] &= 0xffffffff; | ||
| 41 | - a32[7] = (sp_digit)(a[8] >> 16); | ||
| 42 | - a32[7] |= a[9] << 10; | ||
| 43 | - a32[7] &= 0xffffffff; | ||
| 44 | - | ||
| 45 | /* 1 1 0 -1 -1 -1 -1 0 */ | ||
| 46 | - t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6]; | ||
| 47 | /* 0 1 1 0 -1 -1 -1 -1 */ | ||
| 48 | - t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7]; | ||
| 49 | /* 0 0 1 1 0 -1 -1 -1 */ | ||
| 50 | - t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7]; | ||
| 51 | /* -1 -1 0 2 2 1 0 -1 */ | ||
| 52 | - t[3] = 0 - a32[0] - a32[1] + 2 * a32[3] + 2 * a32[4] + a32[5] - a32[7]; | ||
| 53 | /* 0 -1 -1 0 2 2 1 0 */ | ||
| 54 | - t[4] = 0 - a32[1] - a32[2] + 2 * a32[4] + 2 * a32[5] + a32[6]; | ||
| 55 | /* 0 0 -1 -1 0 2 2 1 */ | ||
| 56 | - t[5] = 0 - a32[2] - a32[3] + 2 * a32[5] + 2 * a32[6] + a32[7]; | ||
| 57 | /* -1 -1 0 0 0 1 3 2 */ | ||
| 58 | - t[6] = 0 - a32[0] - a32[1] + a32[5] + 3 * a32[6] + 2 * a32[7]; | ||
| 59 | /* 1 0 -1 -1 -1 -1 0 3 */ | ||
| 60 | - t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3 * a32[7]; | ||
| 61 | + //t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6] ; | ||
| 62 | + //t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7] ; | ||
| 63 | + //t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7] ; | ||
| 64 | + //t[3] = 0 - a32[0] - a32[1] + 2*a32[3] + 2*a32[4] + a32[5] - a32[7] ; | ||
| 65 | + //t[4] = 0 - a32[1] - a32[2] + 2*a32[4] + 2*a32[5] + a32[6] ; | ||
| 66 | + //t[5] = 0 - a32[2] - a32[3] + 2*a32[5] + 2*a32[6] + a32[7] ; | ||
| 67 | + //t[6] = 0 - a32[0] - a32[1] + a32[5] + 3*a32[6] + 2*a32[7]; | ||
| 68 | + //t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3*a32[7]; | ||
| 69 | + | ||
| 70 | +#define A32 (int64_t)a32 | ||
| 71 | + a32 = a[0]; | ||
| 72 | + a32 |= a[1] << 26; | ||
| 73 | + t[0] = 0 + A32; | ||
| 74 | + t[3] = 0 - A32; | ||
| 75 | + t[6] = 0 - A32; | ||
| 76 | + t[7] = 0 + A32; | ||
| 77 | + | ||
| 78 | + a32 = (sp_digit)(a[1] >> 6); | ||
| 79 | + a32 |= a[2] << 20; | ||
| 80 | + t[0] += A32 ; | ||
| 81 | + t[1] = 0 + A32; | ||
| 82 | + t[3] -= A32 ; | ||
| 83 | + t[4] = 0 - A32; | ||
| 84 | + t[6] -= A32 ; | ||
| 85 | + | ||
| 86 | + a32 = (sp_digit)(a[2] >> 12); | ||
| 87 | + a32 |= a[3] << 14; | ||
| 88 | + t[1] += A32 ; | ||
| 89 | + t[2] = 0 + A32; | ||
| 90 | + t[4] -= A32 ; | ||
| 91 | + t[5] = 0 - A32; | ||
| 92 | + t[7] -= A32 ; | ||
| 93 | + | ||
| 94 | + a32 = (sp_digit)(a[3] >> 18); | ||
| 95 | + a32 |= a[4] << 8; | ||
| 96 | + t[0] -= A32 ; | ||
| 97 | + t[2] += A32 ; | ||
| 98 | + t[3] += 2*A32; | ||
| 99 | + t[5] -= A32 ; | ||
| 100 | + t[7] -= A32 ; | ||
| 101 | + | ||
| 102 | + a32 = (sp_digit)(a[4] >> 24); | ||
| 103 | + a32 |= a[5] << 2; | ||
| 104 | + a32 |= a[6] << 28; | ||
| 105 | + t[0] -= A32 ; | ||
| 106 | + t[1] -= A32 ; | ||
| 107 | + t[3] += 2*A32; | ||
| 108 | + t[4] += 2*A32; | ||
| 109 | + t[7] -= A32 ; | ||
| 110 | + | ||
| 111 | + a32 = (sp_digit)(a[6] >> 4); | ||
| 112 | + a32 |= a[7] << 22; | ||
| 113 | + t[0] -= A32 ; | ||
| 114 | + t[1] -= A32 ; | ||
| 115 | + t[2] -= A32 ; | ||
| 116 | + t[3] += A32 ; | ||
| 117 | + t[4] += 2*A32; | ||
| 118 | + t[5] += 2*A32; | ||
| 119 | + t[6] += A32 ; | ||
| 120 | + t[7] -= A32 ; | ||
| 121 | + | ||
| 122 | + a32 = (sp_digit)(a[7] >> 10); | ||
| 123 | + a32 |= a[8] << 16; | ||
| 124 | + t[0] -= A32 ; | ||
| 125 | + t[1] -= A32 ; | ||
| 126 | + t[2] -= A32 ; | ||
| 127 | + t[4] += A32 ; | ||
| 128 | + t[5] += 2*A32; | ||
| 129 | + t[6] += 3*A32; | ||
| 130 | + | ||
| 131 | + a32 = (sp_digit)(a[8] >> 16); | ||
| 132 | + a32 |= a[9] << 10; | ||
| 133 | + t[1] -= A32 ; | ||
| 134 | + t[2] -= A32 ; | ||
| 135 | + t[3] -= A32 ; | ||
| 136 | + t[5] += A32 ; | ||
| 137 | + t[6] += 2*A32; | ||
| 138 | + t[7] += 3*A32; | ||
| 139 | +#undef A32 | ||
| 140 | |||
| 141 | t[1] += t[0] >> 32; t[0] &= 0xffffffff; | ||
| 142 | t[2] += t[1] >> 32; t[1] &= 0xffffffff; | ||
