summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/asm/x86_64-gcc.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/bn/asm/x86_64-gcc.c')
-rw-r--r--src/lib/libcrypto/bn/asm/x86_64-gcc.c554
1 files changed, 0 insertions, 554 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
deleted file mode 100644
index bd068cfb51..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ /dev/null
@@ -1,554 +0,0 @@
1/* $OpenBSD: x86_64-gcc.c,v 1.6 2015/09/12 09:04:12 miod Exp $ */
2#include "../bn_lcl.h"
3/*
4 * x86_64 BIGNUM accelerator version 0.1, December 2002.
5 *
6 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * project.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
11 * disclaimed.
12 *
13 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
14 * versions, like 1.0...
15 * A. Well, that's because this code is basically a quick-n-dirty
16 * proof-of-concept hack. As you can see it's implemented with
17 * inline assembler, which means that you're bound to GCC and that
18 * there might be enough room for further improvement.
19 *
20 * Q. Why inline assembler?
21 * A. x86_64 features its own ABI, which I'm not familiar with. This is
22 * why I decided to let the compiler take care of the subroutine
23 * prologue/epilogue as well as register allocation. For reference,
24 * Win64 implements a different ABI for AMD64 than Linux does.
25 *
26 * Q. How much faster does it get?
27 * A. 'apps/openssl speed rsa dsa' output with no-asm:
28 *
29 * sign verify sign/s verify/s
30 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
31 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
32 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
33 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
34 * sign verify sign/s verify/s
35 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
36 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
37 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
38 *
39 * 'apps/openssl speed rsa dsa' output with this module:
40 *
41 * sign verify sign/s verify/s
42 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
43 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
44 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
45 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
46 * sign verify sign/s verify/s
47 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
48 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
49 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
50 *
51 * For reference, the IA-32 assembler implementation performs
52 * very much like 64-bit code compiled with no-asm on the same
53 * machine.
54 */
55
56#define BN_ULONG unsigned long
57
58#undef mul
59#undef mul_add
60#undef sqr
61
/*
 * mul_add(r, a, word, carry)
 *
 * Computes r += a * word + carry and leaves the high word of that sum in
 * carry.  r and a must be memory lvalues; carry must be a modifiable
 * lvalue.
 *
 * "m"(a), "+m"(r) is the way to favor DirectPath micro-code;
 * "g"(zero) lets the compiler decide where it wants to keep the value
 * of zero.  The zero is cast to BN_ULONG so that, should the compiler
 * place it in a register or in memory instead of using an immediate,
 * the operand has the 64-bit width adcq expects.
 *
 * __asm__ is spelled out (rather than asm) so the macro also builds in
 * strict ISO C modes, and the temporaries carry a bn_ prefix so that an
 * argument named "high" or "low" is not captured by the macro.
 */
#define mul_add(r,a,word,carry) do {			\
	BN_ULONG bn_hi_, bn_lo_;			\
	__asm__ ("mulq %3"				\
		: "=a"(bn_lo_),"=d"(bn_hi_)		\
		: "a"(word),"m"(a)			\
		: "cc");				\
	__asm__ ("addq %2,%0; adcq %3,%1"		\
		: "+r"(carry),"+d"(bn_hi_)		\
		: "a"(bn_lo_),"g"((BN_ULONG)0)		\
		: "cc");				\
	__asm__ ("addq %2,%0; adcq %3,%1"		\
		: "+m"(r),"+d"(bn_hi_)			\
		: "r"(carry),"g"((BN_ULONG)0)		\
		: "cc");				\
	carry = bn_hi_;					\
} while (0)
83
/*
 * mul(r, a, word, carry)
 *
 * Computes the two-word value a * word + carry, stores its low word in r
 * and leaves its high word in carry.
 *
 * The multiplicand uses the "rm" constraint: mulq accepts a register or
 * a memory operand but has no immediate form, so a constraint that also
 * admits immediates could produce an instruction that does not assemble
 * when the argument is a constant expression.  The zero fed to adcq is
 * cast to BN_ULONG so that it is 64 bits wide wherever the compiler
 * chooses to keep it.
 *
 * __asm__ keeps the macro usable in strict ISO C modes; the bn_-prefixed
 * temporaries cannot capture arguments named "high" or "low".
 */
#define mul(r,a,word,carry) do {			\
	BN_ULONG bn_hi_, bn_lo_;			\
	__asm__ ("mulq %3"				\
		: "=a"(bn_lo_),"=d"(bn_hi_)		\
		: "a"(word),"rm"(a)			\
		: "cc");				\
	__asm__ ("addq %2,%0; adcq %3,%1"		\
		: "+r"(carry),"+d"(bn_hi_)		\
		: "a"(bn_lo_),"g"((BN_ULONG)0)		\
		: "cc");				\
	(r) = carry;					\
	carry = bn_hi_;					\
} while (0)
96
/*
 * sqr(r0, r1, a)
 *
 * Squares a: the low word of a * a is stored in r0 and the high word
 * in r1.
 *
 * The body is wrapped in do { } while (0) and carries no trailing
 * semicolon, so "sqr(...);" is exactly one statement and can be used as
 * the body of an if/else.  __asm__ keeps the macro usable in strict
 * ISO C modes.
 */
#define sqr(r0,r1,a) do {				\
	__asm__ ("mulq %2"				\
		: "=a"(r0),"=d"(r1)			\
		: "a"(a)				\
		: "cc");				\
} while (0)
102
103BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
104 {
105 BN_ULONG c1=0;
106
107 if (num <= 0) return(c1);
108
109 while (num&~3)
110 {
111 mul_add(rp[0],ap[0],w,c1);
112 mul_add(rp[1],ap[1],w,c1);
113 mul_add(rp[2],ap[2],w,c1);
114 mul_add(rp[3],ap[3],w,c1);
115 ap+=4; rp+=4; num-=4;
116 }
117 if (num)
118 {
119 mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
120 mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
121 mul_add(rp[2],ap[2],w,c1); return c1;
122 }
123
124 return(c1);
125 }
126
127BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
128 {
129 BN_ULONG c1=0;
130
131 if (num <= 0) return(c1);
132
133 while (num&~3)
134 {
135 mul(rp[0],ap[0],w,c1);
136 mul(rp[1],ap[1],w,c1);
137 mul(rp[2],ap[2],w,c1);
138 mul(rp[3],ap[3],w,c1);
139 ap+=4; rp+=4; num-=4;
140 }
141 if (num)
142 {
143 mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
144 mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
145 mul(rp[2],ap[2],w,c1);
146 }
147 return(c1);
148 }
149
150void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
151 {
152 if (n <= 0) return;
153
154 while (n&~3)
155 {
156 sqr(r[0],r[1],a[0]);
157 sqr(r[2],r[3],a[1]);
158 sqr(r[4],r[5],a[2]);
159 sqr(r[6],r[7],a[3]);
160 a+=4; r+=8; n-=4;
161 }
162 if (n)
163 {
164 sqr(r[0],r[1],a[0]); if (--n == 0) return;
165 sqr(r[2],r[3],a[1]); if (--n == 0) return;
166 sqr(r[4],r[5],a[2]);
167 }
168 }
169
170BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
171{ BN_ULONG ret,waste;
172
173 asm ("divq %4"
174 : "=a"(ret),"=d"(waste)
175 : "a"(l),"d"(h),"g"(d)
176 : "cc");
177
178 return ret;
179}
180
181BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
182{ BN_ULONG ret=0,i=0;
183
184 if (n <= 0) return 0;
185
186 asm (
187 " subq %2,%2 \n"
188 ".p2align 4 \n"
189 "1: movq (%4,%2,8),%0 \n"
190 " adcq (%5,%2,8),%0 \n"
191 " movq %0,(%3,%2,8) \n"
192 " leaq 1(%2),%2 \n"
193 " loop 1b \n"
194 " sbbq %0,%0 \n"
195 : "=&a"(ret),"+c"(n),"=&r"(i)
196 : "r"(rp),"r"(ap),"r"(bp)
197 : "cc"
198 );
199
200 return ret&1;
201}
202
203BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
204{ BN_ULONG ret=0,i=0;
205
206 if (n <= 0) return 0;
207
208 asm (
209 " subq %2,%2 \n"
210 ".p2align 4 \n"
211 "1: movq (%4,%2,8),%0 \n"
212 " sbbq (%5,%2,8),%0 \n"
213 " movq %0,(%3,%2,8) \n"
214 " leaq 1(%2),%2 \n"
215 " loop 1b \n"
216 " sbbq %0,%0 \n"
217 : "=&a"(ret),"+c"(n),"=&r"(i)
218 : "r"(rp),"r"(ap),"r"(bp)
219 : "cc"
220 );
221
222 return ret&1;
223}
224
225/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
226/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
227/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
228/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
229
230/*
231 * Keep in mind that carrying into high part of multiplication result
232 * can not overflow, because it cannot be all-ones.
233 */
234#if 0
235/* original macros are kept for reference purposes */
/* c += a*b, portable form: carries are detected by unsigned wrap-around. */
#define mul_add_c(a,b,c0,c1,c2) do {			\
	BN_ULONG fa = (a), fb = (b);			\
	BN_ULONG pl, ph;				\
	BN_UMULT_LOHI(pl,ph,fa,fb);			\
	c0 += pl;					\
	ph += (c0 < pl);				\
	c1 += ph;					\
	c2 += (c1 < ph);				\
} while(0)

/* c += 2*a*b, portable form: the product is added in twice. */
#define mul_add_c2(a,b,c0,c1,c2) do {			\
	BN_ULONG fa = (a), fb = (b);			\
	BN_ULONG pl, ph, first;				\
	BN_UMULT_LOHI(pl,ph,fa,fb);			\
	c0 += pl;					\
	first = ph + (c0 < pl);				\
	c1 += first;					\
	c2 += (c1 < first);				\
	c0 += pl;					\
	ph += (c0 < pl);				\
	c1 += ph;					\
	c2 += (c1 < ph);				\
} while(0)

/* c += a[i]^2, portable form. */
#define sqr_add_c(a,i,c0,c1,c2) do {			\
	BN_ULONG fa = (a)[i];				\
	BN_ULONG pl, ph;				\
	BN_UMULT_LOHI(pl,ph,fa,fa);			\
	c0 += pl;					\
	ph += (c0 < pl);				\
	c1 += ph;					\
	c2 += (c1 < ph);				\
} while(0)
261#else
/*
 * mul_add_c(a, b, c0, c1, c2): (c2,c1,c0) += a * b.
 *
 * b must be a memory lvalue ("m" constraint); c0, c1 and c2 must be
 * modifiable lvalues.  The two-word product is added to c0/c1 and the
 * resulting carry is folded into c2 by the final adcq of zero; that
 * zero is cast to BN_ULONG so it is 64 bits wide wherever the compiler
 * chooses to keep it.
 *
 * __asm__ keeps the macro usable in strict ISO C modes, and the
 * bn_-prefixed temporaries cannot capture accumulators that happen to
 * be named "t1" or "t2".
 */
#define mul_add_c(a,b,c0,c1,c2) do {			\
	BN_ULONG bn_lo_, bn_hi_;			\
	__asm__ ("mulq %3"				\
		: "=a"(bn_lo_),"=d"(bn_hi_)		\
		: "a"(a),"m"(b)				\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(bn_lo_),"r"(bn_hi_),"g"((BN_ULONG)0)	\
		: "cc");				\
} while (0)
273
/*
 * sqr_add_c(a, i, c0, c1, c2): (c2,c1,c0) += a[i] * a[i].
 *
 * a is parenthesised before indexing so that pointer expressions such
 * as "p + 1" may be passed.  The zero fed to the final adcq is cast to
 * BN_ULONG so it is 64 bits wide wherever the compiler chooses to keep
 * it.
 *
 * __asm__ keeps the macro usable in strict ISO C modes, and the
 * bn_-prefixed temporaries cannot capture accumulators that happen to
 * be named "t1" or "t2".
 */
#define sqr_add_c(a,i,c0,c1,c2) do {			\
	BN_ULONG bn_lo_, bn_hi_;			\
	__asm__ ("mulq %2"				\
		: "=a"(bn_lo_),"=d"(bn_hi_)		\
		: "a"((a)[i])				\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(bn_lo_),"r"(bn_hi_),"g"((BN_ULONG)0)	\
		: "cc");				\
} while (0)
285
/*
 * mul_add_c2(a, b, c0, c1, c2): (c2,c1,c0) += 2 * a * b.
 *
 * The product is computed once and then added into the three-word
 * accumulator twice, each time with full carry propagation into c2.
 * b must be a memory lvalue ("m" constraint); c0, c1 and c2 must be
 * modifiable lvalues.  The zero fed to the final adcq of each addition
 * is cast to BN_ULONG so it is 64 bits wide wherever the compiler
 * chooses to keep it.
 *
 * __asm__ keeps the macro usable in strict ISO C modes, and the
 * bn_-prefixed temporaries cannot capture accumulators that happen to
 * be named "t1" or "t2".
 */
#define mul_add_c2(a,b,c0,c1,c2) do {			\
	BN_ULONG bn_lo_, bn_hi_;			\
	__asm__ ("mulq %3"				\
		: "=a"(bn_lo_),"=d"(bn_hi_)		\
		: "a"(a),"m"(b)				\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(bn_lo_),"r"(bn_hi_),"g"((BN_ULONG)0)	\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(bn_lo_),"r"(bn_hi_),"g"((BN_ULONG)0)	\
		: "cc");				\
} while (0)
301#endif
302
/* sqr_add_c2(a, i, j, c0, c1, c2): (c2,c1,c0) += 2 * a[i] * a[j]. */
#define sqr_add_c2(a,i,j,c0,c1,c2) mul_add_c2((a)[i],(a)[j],c0,c1,c2)
305
306void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
307 {
308 BN_ULONG c1,c2,c3;
309
310 c1=0;
311 c2=0;
312 c3=0;
313 mul_add_c(a[0],b[0],c1,c2,c3);
314 r[0]=c1;
315 c1=0;
316 mul_add_c(a[0],b[1],c2,c3,c1);
317 mul_add_c(a[1],b[0],c2,c3,c1);
318 r[1]=c2;
319 c2=0;
320 mul_add_c(a[2],b[0],c3,c1,c2);
321 mul_add_c(a[1],b[1],c3,c1,c2);
322 mul_add_c(a[0],b[2],c3,c1,c2);
323 r[2]=c3;
324 c3=0;
325 mul_add_c(a[0],b[3],c1,c2,c3);
326 mul_add_c(a[1],b[2],c1,c2,c3);
327 mul_add_c(a[2],b[1],c1,c2,c3);
328 mul_add_c(a[3],b[0],c1,c2,c3);
329 r[3]=c1;
330 c1=0;
331 mul_add_c(a[4],b[0],c2,c3,c1);
332 mul_add_c(a[3],b[1],c2,c3,c1);
333 mul_add_c(a[2],b[2],c2,c3,c1);
334 mul_add_c(a[1],b[3],c2,c3,c1);
335 mul_add_c(a[0],b[4],c2,c3,c1);
336 r[4]=c2;
337 c2=0;
338 mul_add_c(a[0],b[5],c3,c1,c2);
339 mul_add_c(a[1],b[4],c3,c1,c2);
340 mul_add_c(a[2],b[3],c3,c1,c2);
341 mul_add_c(a[3],b[2],c3,c1,c2);
342 mul_add_c(a[4],b[1],c3,c1,c2);
343 mul_add_c(a[5],b[0],c3,c1,c2);
344 r[5]=c3;
345 c3=0;
346 mul_add_c(a[6],b[0],c1,c2,c3);
347 mul_add_c(a[5],b[1],c1,c2,c3);
348 mul_add_c(a[4],b[2],c1,c2,c3);
349 mul_add_c(a[3],b[3],c1,c2,c3);
350 mul_add_c(a[2],b[4],c1,c2,c3);
351 mul_add_c(a[1],b[5],c1,c2,c3);
352 mul_add_c(a[0],b[6],c1,c2,c3);
353 r[6]=c1;
354 c1=0;
355 mul_add_c(a[0],b[7],c2,c3,c1);
356 mul_add_c(a[1],b[6],c2,c3,c1);
357 mul_add_c(a[2],b[5],c2,c3,c1);
358 mul_add_c(a[3],b[4],c2,c3,c1);
359 mul_add_c(a[4],b[3],c2,c3,c1);
360 mul_add_c(a[5],b[2],c2,c3,c1);
361 mul_add_c(a[6],b[1],c2,c3,c1);
362 mul_add_c(a[7],b[0],c2,c3,c1);
363 r[7]=c2;
364 c2=0;
365 mul_add_c(a[7],b[1],c3,c1,c2);
366 mul_add_c(a[6],b[2],c3,c1,c2);
367 mul_add_c(a[5],b[3],c3,c1,c2);
368 mul_add_c(a[4],b[4],c3,c1,c2);
369 mul_add_c(a[3],b[5],c3,c1,c2);
370 mul_add_c(a[2],b[6],c3,c1,c2);
371 mul_add_c(a[1],b[7],c3,c1,c2);
372 r[8]=c3;
373 c3=0;
374 mul_add_c(a[2],b[7],c1,c2,c3);
375 mul_add_c(a[3],b[6],c1,c2,c3);
376 mul_add_c(a[4],b[5],c1,c2,c3);
377 mul_add_c(a[5],b[4],c1,c2,c3);
378 mul_add_c(a[6],b[3],c1,c2,c3);
379 mul_add_c(a[7],b[2],c1,c2,c3);
380 r[9]=c1;
381 c1=0;
382 mul_add_c(a[7],b[3],c2,c3,c1);
383 mul_add_c(a[6],b[4],c2,c3,c1);
384 mul_add_c(a[5],b[5],c2,c3,c1);
385 mul_add_c(a[4],b[6],c2,c3,c1);
386 mul_add_c(a[3],b[7],c2,c3,c1);
387 r[10]=c2;
388 c2=0;
389 mul_add_c(a[4],b[7],c3,c1,c2);
390 mul_add_c(a[5],b[6],c3,c1,c2);
391 mul_add_c(a[6],b[5],c3,c1,c2);
392 mul_add_c(a[7],b[4],c3,c1,c2);
393 r[11]=c3;
394 c3=0;
395 mul_add_c(a[7],b[5],c1,c2,c3);
396 mul_add_c(a[6],b[6],c1,c2,c3);
397 mul_add_c(a[5],b[7],c1,c2,c3);
398 r[12]=c1;
399 c1=0;
400 mul_add_c(a[6],b[7],c2,c3,c1);
401 mul_add_c(a[7],b[6],c2,c3,c1);
402 r[13]=c2;
403 c2=0;
404 mul_add_c(a[7],b[7],c3,c1,c2);
405 r[14]=c3;
406 r[15]=c1;
407 }
408
409void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
410 {
411 BN_ULONG c1,c2,c3;
412
413 c1=0;
414 c2=0;
415 c3=0;
416 mul_add_c(a[0],b[0],c1,c2,c3);
417 r[0]=c1;
418 c1=0;
419 mul_add_c(a[0],b[1],c2,c3,c1);
420 mul_add_c(a[1],b[0],c2,c3,c1);
421 r[1]=c2;
422 c2=0;
423 mul_add_c(a[2],b[0],c3,c1,c2);
424 mul_add_c(a[1],b[1],c3,c1,c2);
425 mul_add_c(a[0],b[2],c3,c1,c2);
426 r[2]=c3;
427 c3=0;
428 mul_add_c(a[0],b[3],c1,c2,c3);
429 mul_add_c(a[1],b[2],c1,c2,c3);
430 mul_add_c(a[2],b[1],c1,c2,c3);
431 mul_add_c(a[3],b[0],c1,c2,c3);
432 r[3]=c1;
433 c1=0;
434 mul_add_c(a[3],b[1],c2,c3,c1);
435 mul_add_c(a[2],b[2],c2,c3,c1);
436 mul_add_c(a[1],b[3],c2,c3,c1);
437 r[4]=c2;
438 c2=0;
439 mul_add_c(a[2],b[3],c3,c1,c2);
440 mul_add_c(a[3],b[2],c3,c1,c2);
441 r[5]=c3;
442 c3=0;
443 mul_add_c(a[3],b[3],c1,c2,c3);
444 r[6]=c1;
445 r[7]=c2;
446 }
447
448void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
449 {
450 BN_ULONG c1,c2,c3;
451
452 c1=0;
453 c2=0;
454 c3=0;
455 sqr_add_c(a,0,c1,c2,c3);
456 r[0]=c1;
457 c1=0;
458 sqr_add_c2(a,1,0,c2,c3,c1);
459 r[1]=c2;
460 c2=0;
461 sqr_add_c(a,1,c3,c1,c2);
462 sqr_add_c2(a,2,0,c3,c1,c2);
463 r[2]=c3;
464 c3=0;
465 sqr_add_c2(a,3,0,c1,c2,c3);
466 sqr_add_c2(a,2,1,c1,c2,c3);
467 r[3]=c1;
468 c1=0;
469 sqr_add_c(a,2,c2,c3,c1);
470 sqr_add_c2(a,3,1,c2,c3,c1);
471 sqr_add_c2(a,4,0,c2,c3,c1);
472 r[4]=c2;
473 c2=0;
474 sqr_add_c2(a,5,0,c3,c1,c2);
475 sqr_add_c2(a,4,1,c3,c1,c2);
476 sqr_add_c2(a,3,2,c3,c1,c2);
477 r[5]=c3;
478 c3=0;
479 sqr_add_c(a,3,c1,c2,c3);
480 sqr_add_c2(a,4,2,c1,c2,c3);
481 sqr_add_c2(a,5,1,c1,c2,c3);
482 sqr_add_c2(a,6,0,c1,c2,c3);
483 r[6]=c1;
484 c1=0;
485 sqr_add_c2(a,7,0,c2,c3,c1);
486 sqr_add_c2(a,6,1,c2,c3,c1);
487 sqr_add_c2(a,5,2,c2,c3,c1);
488 sqr_add_c2(a,4,3,c2,c3,c1);
489 r[7]=c2;
490 c2=0;
491 sqr_add_c(a,4,c3,c1,c2);
492 sqr_add_c2(a,5,3,c3,c1,c2);
493 sqr_add_c2(a,6,2,c3,c1,c2);
494 sqr_add_c2(a,7,1,c3,c1,c2);
495 r[8]=c3;
496 c3=0;
497 sqr_add_c2(a,7,2,c1,c2,c3);
498 sqr_add_c2(a,6,3,c1,c2,c3);
499 sqr_add_c2(a,5,4,c1,c2,c3);
500 r[9]=c1;
501 c1=0;
502 sqr_add_c(a,5,c2,c3,c1);
503 sqr_add_c2(a,6,4,c2,c3,c1);
504 sqr_add_c2(a,7,3,c2,c3,c1);
505 r[10]=c2;
506 c2=0;
507 sqr_add_c2(a,7,4,c3,c1,c2);
508 sqr_add_c2(a,6,5,c3,c1,c2);
509 r[11]=c3;
510 c3=0;
511 sqr_add_c(a,6,c1,c2,c3);
512 sqr_add_c2(a,7,5,c1,c2,c3);
513 r[12]=c1;
514 c1=0;
515 sqr_add_c2(a,7,6,c2,c3,c1);
516 r[13]=c2;
517 c2=0;
518 sqr_add_c(a,7,c3,c1,c2);
519 r[14]=c3;
520 r[15]=c1;
521 }
522
523void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
524 {
525 BN_ULONG c1,c2,c3;
526
527 c1=0;
528 c2=0;
529 c3=0;
530 sqr_add_c(a,0,c1,c2,c3);
531 r[0]=c1;
532 c1=0;
533 sqr_add_c2(a,1,0,c2,c3,c1);
534 r[1]=c2;
535 c2=0;
536 sqr_add_c(a,1,c3,c1,c2);
537 sqr_add_c2(a,2,0,c3,c1,c2);
538 r[2]=c3;
539 c3=0;
540 sqr_add_c2(a,3,0,c1,c2,c3);
541 sqr_add_c2(a,2,1,c1,c2,c3);
542 r[3]=c1;
543 c1=0;
544 sqr_add_c(a,2,c2,c3,c1);
545 sqr_add_c2(a,3,1,c2,c3,c1);
546 r[4]=c2;
547 c2=0;
548 sqr_add_c2(a,3,2,c3,c1,c2);
549 r[5]=c3;
550 c3=0;
551 sqr_add_c(a,3,c1,c2,c3);
552 r[6]=c1;
553 r[7]=c2;
554 }