diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2021-04-26 23:07:32 +0200 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2021-04-26 23:09:20 +0200 |
commit | d728a30c211c2df6adccd64c6e2fc23387b341f2 (patch) | |
tree | 36264c52731ebb18a92863aa2a7158f978bda7cc | |
parent | 9a40be433de31b8a7fea20b7ebce3dafbedaf504 (diff) | |
download | busybox-w32-d728a30c211c2df6adccd64c6e2fc23387b341f2.tar.gz busybox-w32-d728a30c211c2df6adccd64c6e2fc23387b341f2.tar.bz2 busybox-w32-d728a30c211c2df6adccd64c6e2fc23387b341f2.zip |
tls: add a patch with optimization which _should_ give better code
...but does not.
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | networking/tls_sp_c32.patch | 142 |
1 files changed, 142 insertions, 0 deletions
diff --git a/networking/tls_sp_c32.patch b/networking/tls_sp_c32.patch new file mode 100644 index 000000000..7559586c9 --- /dev/null +++ b/networking/tls_sp_c32.patch | |||
@@ -0,0 +1,142 @@ | |||
1 | Somehow, gcc 6+ performs this optimization as well as or better than the | ||
2 | hand-written optimized code below (gcc seems to eliminate the a32[] array and uses 32-bit | ||
3 | registers/memory for the "lower halves" of the a32[i] elements). | ||
4 | |||
5 | But might there be architectures where gcc does not optimize this as well? | ||
6 | |||
7 | diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c | ||
8 | index 72a3be537..e8a011ad1 100644 | ||
9 | --- a/networking/tls_sp_c32.c | ||
10 | +++ b/networking/tls_sp_c32.c | ||
11 | @@ -228,51 +228,96 @@ static void sp_256_rshift1_10(sp_digit* r, sp_digit* a) | ||
12 | static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a) | ||
13 | { | ||
14 | int64_t t[8]; | ||
15 | - int64_t a32[8]; | ||
16 | + uint32_t a32; | ||
17 | int64_t o; | ||
18 | |||
19 | - a32[0] = a[0]; | ||
20 | - a32[0] |= a[1] << 26; | ||
21 | - a32[0] &= 0xffffffff; | ||
22 | - a32[1] = (sp_digit)(a[1] >> 6); | ||
23 | - a32[1] |= a[2] << 20; | ||
24 | - a32[1] &= 0xffffffff; | ||
25 | - a32[2] = (sp_digit)(a[2] >> 12); | ||
26 | - a32[2] |= a[3] << 14; | ||
27 | - a32[2] &= 0xffffffff; | ||
28 | - a32[3] = (sp_digit)(a[3] >> 18); | ||
29 | - a32[3] |= a[4] << 8; | ||
30 | - a32[3] &= 0xffffffff; | ||
31 | - a32[4] = (sp_digit)(a[4] >> 24); | ||
32 | - a32[4] |= a[5] << 2; | ||
33 | - a32[4] |= a[6] << 28; | ||
34 | - a32[4] &= 0xffffffff; | ||
35 | - a32[5] = (sp_digit)(a[6] >> 4); | ||
36 | - a32[5] |= a[7] << 22; | ||
37 | - a32[5] &= 0xffffffff; | ||
38 | - a32[6] = (sp_digit)(a[7] >> 10); | ||
39 | - a32[6] |= a[8] << 16; | ||
40 | - a32[6] &= 0xffffffff; | ||
41 | - a32[7] = (sp_digit)(a[8] >> 16); | ||
42 | - a32[7] |= a[9] << 10; | ||
43 | - a32[7] &= 0xffffffff; | ||
44 | - | ||
45 | /* 1 1 0 -1 -1 -1 -1 0 */ | ||
46 | - t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6]; | ||
47 | /* 0 1 1 0 -1 -1 -1 -1 */ | ||
48 | - t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7]; | ||
49 | /* 0 0 1 1 0 -1 -1 -1 */ | ||
50 | - t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7]; | ||
51 | /* -1 -1 0 2 2 1 0 -1 */ | ||
52 | - t[3] = 0 - a32[0] - a32[1] + 2 * a32[3] + 2 * a32[4] + a32[5] - a32[7]; | ||
53 | /* 0 -1 -1 0 2 2 1 0 */ | ||
54 | - t[4] = 0 - a32[1] - a32[2] + 2 * a32[4] + 2 * a32[5] + a32[6]; | ||
55 | /* 0 0 -1 -1 0 2 2 1 */ | ||
56 | - t[5] = 0 - a32[2] - a32[3] + 2 * a32[5] + 2 * a32[6] + a32[7]; | ||
57 | /* -1 -1 0 0 0 1 3 2 */ | ||
58 | - t[6] = 0 - a32[0] - a32[1] + a32[5] + 3 * a32[6] + 2 * a32[7]; | ||
59 | /* 1 0 -1 -1 -1 -1 0 3 */ | ||
60 | - t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3 * a32[7]; | ||
61 | + //t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6] ; | ||
62 | + //t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7] ; | ||
63 | + //t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7] ; | ||
64 | + //t[3] = 0 - a32[0] - a32[1] + 2*a32[3] + 2*a32[4] + a32[5] - a32[7] ; | ||
65 | + //t[4] = 0 - a32[1] - a32[2] + 2*a32[4] + 2*a32[5] + a32[6] ; | ||
66 | + //t[5] = 0 - a32[2] - a32[3] + 2*a32[5] + 2*a32[6] + a32[7] ; | ||
67 | + //t[6] = 0 - a32[0] - a32[1] + a32[5] + 3*a32[6] + 2*a32[7]; | ||
68 | + //t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3*a32[7]; | ||
69 | + | ||
70 | +#define A32 (int64_t)a32 | ||
71 | + a32 = a[0]; | ||
72 | + a32 |= a[1] << 26; | ||
73 | + t[0] = 0 + A32; | ||
74 | + t[3] = 0 - A32; | ||
75 | + t[6] = 0 - A32; | ||
76 | + t[7] = 0 + A32; | ||
77 | + | ||
78 | + a32 = (sp_digit)(a[1] >> 6); | ||
79 | + a32 |= a[2] << 20; | ||
80 | + t[0] += A32 ; | ||
81 | + t[1] = 0 + A32; | ||
82 | + t[3] -= A32 ; | ||
83 | + t[4] = 0 - A32; | ||
84 | + t[6] -= A32 ; | ||
85 | + | ||
86 | + a32 = (sp_digit)(a[2] >> 12); | ||
87 | + a32 |= a[3] << 14; | ||
88 | + t[1] += A32 ; | ||
89 | + t[2] = 0 + A32; | ||
90 | + t[4] -= A32 ; | ||
91 | + t[5] = 0 - A32; | ||
92 | + t[7] -= A32 ; | ||
93 | + | ||
94 | + a32 = (sp_digit)(a[3] >> 18); | ||
95 | + a32 |= a[4] << 8; | ||
96 | + t[0] -= A32 ; | ||
97 | + t[2] += A32 ; | ||
98 | + t[3] += 2*A32; | ||
99 | + t[5] -= A32 ; | ||
100 | + t[7] -= A32 ; | ||
101 | + | ||
102 | + a32 = (sp_digit)(a[4] >> 24); | ||
103 | + a32 |= a[5] << 2; | ||
104 | + a32 |= a[6] << 28; | ||
105 | + t[0] -= A32 ; | ||
106 | + t[1] -= A32 ; | ||
107 | + t[3] += 2*A32; | ||
108 | + t[4] += 2*A32; | ||
109 | + t[7] -= A32 ; | ||
110 | + | ||
111 | + a32 = (sp_digit)(a[6] >> 4); | ||
112 | + a32 |= a[7] << 22; | ||
113 | + t[0] -= A32 ; | ||
114 | + t[1] -= A32 ; | ||
115 | + t[2] -= A32 ; | ||
116 | + t[3] += A32 ; | ||
117 | + t[4] += 2*A32; | ||
118 | + t[5] += 2*A32; | ||
119 | + t[6] += A32 ; | ||
120 | + t[7] -= A32 ; | ||
121 | + | ||
122 | + a32 = (sp_digit)(a[7] >> 10); | ||
123 | + a32 |= a[8] << 16; | ||
124 | + t[0] -= A32 ; | ||
125 | + t[1] -= A32 ; | ||
126 | + t[2] -= A32 ; | ||
127 | + t[4] += A32 ; | ||
128 | + t[5] += 2*A32; | ||
129 | + t[6] += 3*A32; | ||
130 | + | ||
131 | + a32 = (sp_digit)(a[8] >> 16); | ||
132 | + a32 |= a[9] << 10; | ||
133 | + t[1] -= A32 ; | ||
134 | + t[2] -= A32 ; | ||
135 | + t[3] -= A32 ; | ||
136 | + t[5] += A32 ; | ||
137 | + t[6] += 2*A32; | ||
138 | + t[7] += 3*A32; | ||
139 | +#undef A32 | ||
140 | |||
141 | t[1] += t[0] >> 32; t[0] &= 0xffffffff; | ||
142 | t[2] += t[1] >> 32; t[1] &= 0xffffffff; | ||