aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Denys Vlasenko <vda.linux@googlemail.com> 2021-04-26 23:07:32 +0200
committer Denys Vlasenko <vda.linux@googlemail.com> 2021-04-26 23:09:20 +0200
commit d728a30c211c2df6adccd64c6e2fc23387b341f2 (patch)
tree 36264c52731ebb18a92863aa2a7158f978bda7cc
parent 9a40be433de31b8a7fea20b7ebce3dafbedaf504 (diff)
download busybox-w32-d728a30c211c2df6adccd64c6e2fc23387b341f2.tar.gz
busybox-w32-d728a30c211c2df6adccd64c6e2fc23387b341f2.tar.bz2
busybox-w32-d728a30c211c2df6adccd64c6e2fc23387b341f2.zip
tls: add a patch with optimization which _should_ give better code
...but does not.
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- networking/tls_sp_c32.patch 142
1 files changed, 142 insertions, 0 deletions
diff --git a/networking/tls_sp_c32.patch b/networking/tls_sp_c32.patch
new file mode 100644
index 000000000..7559586c9
--- /dev/null
+++ b/networking/tls_sp_c32.patch
@@ -0,0 +1,142 @@
1Somehow, gcc 6+ performs this optimization as well as or better than the
2hand-written optimized code below (gcc seems to eliminate the a32[] array and
3uses 32-bit registers/memory for the "lower halves" of the a32[i] elements).
4
5But could there be architectures where gcc is not this good?
6
7diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
8index 72a3be537..e8a011ad1 100644
9--- a/networking/tls_sp_c32.c
10+++ b/networking/tls_sp_c32.c
11@@ -228,51 +228,96 @@ static void sp_256_rshift1_10(sp_digit* r, sp_digit* a)
12 static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a)
13 {
14 int64_t t[8];
15- int64_t a32[8];
16+ uint32_t a32;
17 int64_t o;
18
19- a32[0] = a[0];
20- a32[0] |= a[1] << 26;
21- a32[0] &= 0xffffffff;
22- a32[1] = (sp_digit)(a[1] >> 6);
23- a32[1] |= a[2] << 20;
24- a32[1] &= 0xffffffff;
25- a32[2] = (sp_digit)(a[2] >> 12);
26- a32[2] |= a[3] << 14;
27- a32[2] &= 0xffffffff;
28- a32[3] = (sp_digit)(a[3] >> 18);
29- a32[3] |= a[4] << 8;
30- a32[3] &= 0xffffffff;
31- a32[4] = (sp_digit)(a[4] >> 24);
32- a32[4] |= a[5] << 2;
33- a32[4] |= a[6] << 28;
34- a32[4] &= 0xffffffff;
35- a32[5] = (sp_digit)(a[6] >> 4);
36- a32[5] |= a[7] << 22;
37- a32[5] &= 0xffffffff;
38- a32[6] = (sp_digit)(a[7] >> 10);
39- a32[6] |= a[8] << 16;
40- a32[6] &= 0xffffffff;
41- a32[7] = (sp_digit)(a[8] >> 16);
42- a32[7] |= a[9] << 10;
43- a32[7] &= 0xffffffff;
44-
45 /* 1 1 0 -1 -1 -1 -1 0 */
46- t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6];
47 /* 0 1 1 0 -1 -1 -1 -1 */
48- t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7];
49 /* 0 0 1 1 0 -1 -1 -1 */
50- t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7];
51 /* -1 -1 0 2 2 1 0 -1 */
52- t[3] = 0 - a32[0] - a32[1] + 2 * a32[3] + 2 * a32[4] + a32[5] - a32[7];
53 /* 0 -1 -1 0 2 2 1 0 */
54- t[4] = 0 - a32[1] - a32[2] + 2 * a32[4] + 2 * a32[5] + a32[6];
55 /* 0 0 -1 -1 0 2 2 1 */
56- t[5] = 0 - a32[2] - a32[3] + 2 * a32[5] + 2 * a32[6] + a32[7];
57 /* -1 -1 0 0 0 1 3 2 */
58- t[6] = 0 - a32[0] - a32[1] + a32[5] + 3 * a32[6] + 2 * a32[7];
59 /* 1 0 -1 -1 -1 -1 0 3 */
60- t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3 * a32[7];
61+ //t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6] ;
62+ //t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7] ;
63+ //t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7] ;
64+ //t[3] = 0 - a32[0] - a32[1] + 2*a32[3] + 2*a32[4] + a32[5] - a32[7] ;
65+ //t[4] = 0 - a32[1] - a32[2] + 2*a32[4] + 2*a32[5] + a32[6] ;
66+ //t[5] = 0 - a32[2] - a32[3] + 2*a32[5] + 2*a32[6] + a32[7] ;
67+ //t[6] = 0 - a32[0] - a32[1] + a32[5] + 3*a32[6] + 2*a32[7];
68+ //t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3*a32[7];
69+
70+#define A32 (int64_t)a32
71+ a32 = a[0];
72+ a32 |= a[1] << 26;
73+ t[0] = 0 + A32;
74+ t[3] = 0 - A32;
75+ t[6] = 0 - A32;
76+ t[7] = 0 + A32;
77+
78+ a32 = (sp_digit)(a[1] >> 6);
79+ a32 |= a[2] << 20;
80+ t[0] += A32 ;
81+ t[1] = 0 + A32;
82+ t[3] -= A32 ;
83+ t[4] = 0 - A32;
84+ t[6] -= A32 ;
85+
86+ a32 = (sp_digit)(a[2] >> 12);
87+ a32 |= a[3] << 14;
88+ t[1] += A32 ;
89+ t[2] = 0 + A32;
90+ t[4] -= A32 ;
91+ t[5] = 0 - A32;
92+ t[7] -= A32 ;
93+
94+ a32 = (sp_digit)(a[3] >> 18);
95+ a32 |= a[4] << 8;
96+ t[0] -= A32 ;
97+ t[2] += A32 ;
98+ t[3] += 2*A32;
99+ t[5] -= A32 ;
100+ t[7] -= A32 ;
101+
102+ a32 = (sp_digit)(a[4] >> 24);
103+ a32 |= a[5] << 2;
104+ a32 |= a[6] << 28;
105+ t[0] -= A32 ;
106+ t[1] -= A32 ;
107+ t[3] += 2*A32;
108+ t[4] += 2*A32;
109+ t[7] -= A32 ;
110+
111+ a32 = (sp_digit)(a[6] >> 4);
112+ a32 |= a[7] << 22;
113+ t[0] -= A32 ;
114+ t[1] -= A32 ;
115+ t[2] -= A32 ;
116+ t[3] += A32 ;
117+ t[4] += 2*A32;
118+ t[5] += 2*A32;
119+ t[6] += A32 ;
120+ t[7] -= A32 ;
121+
122+ a32 = (sp_digit)(a[7] >> 10);
123+ a32 |= a[8] << 16;
124+ t[0] -= A32 ;
125+ t[1] -= A32 ;
126+ t[2] -= A32 ;
127+ t[4] += A32 ;
128+ t[5] += 2*A32;
129+ t[6] += 3*A32;
130+
131+ a32 = (sp_digit)(a[8] >> 16);
132+ a32 |= a[9] << 10;
133+ t[1] -= A32 ;
134+ t[2] -= A32 ;
135+ t[3] -= A32 ;
136+ t[5] += A32 ;
137+ t[6] += 2*A32;
138+ t[7] += 3*A32;
139+#undef A32
140
141 t[1] += t[0] >> 32; t[0] &= 0xffffffff;
142 t[2] += t[1] >> 32; t[1] &= 0xffffffff;